Fixed the file type detection
kishori82 committed Nov 5, 2014
1 parent 22559e5 commit c13e405
Showing 16 changed files with 651 additions and 470 deletions.
104 changes: 39 additions & 65 deletions MetaPathways.py
@@ -95,7 +95,7 @@ def createParser():
help="print lots of information on the stdout [default]")

parser.add_option("-b", "--block-mode",
action="store_true", dest="block_mode", default=False,
action="store_true", dest="block_mode", default=True,
help="processes the samples by blocking the stages before and after functional search [default off]")

parser.add_option("-P", "--print-only",
@@ -123,51 +123,25 @@ def valid_arguments(opts, args):
else:
return False

def remove_unspecified_samples(input_output_list, sample_subset, format, globalerrorlogger = None):
def derive_sample_name(filename):
basename = path.basename(filename)

shortname = re.sub('[.]gbk$','',basename, re.IGNORECASE)
shortname = re.sub('[.](fasta|fas|fna|faa|fa)$','',shortname, re.IGNORECASE)
return shortname



def remove_unspecified_samples(input_output_list, sample_subset, globalerrorlogger = None):
""" keep only the samples that are specified before processing """
shortened_names = {}

for input_file in input_output_list.keys():
shortname = None
if format in ['gbk-unannotated', 'gbk-annotated']:
shortname = re.sub('[.]gbk$','',input_file, re.IGNORECASE)
elif format =='fasta':
shortname = re.sub('[.](fasta|fas|fna|faa|fa)$','',input_file, re.IGNORECASE)

if shortname==None:
continue

shortname = re.sub(r'[.]','_',shortname)
shortened_names[shortname] = input_file

shortened_subset_names = []
for sample_in_subset in sample_subset:
shortname = None
if format in ['gbk-unannotated', 'gbk-annotated']:
shortname = re.sub('[.]gbk$','',sample_in_subset, re.IGNORECASE)
elif format =='fasta':
shortname = re.sub('[.](fasta|fas|fna|faa|fa)$','',sample_in_subset, re.IGNORECASE)

if shortname==None:
continue

if check_for_error_in_input_file_name(shortname, globalerrorlogger=globalerrorlogger):
shortened_subset_names.append(shortname)

samples_to_keep = {}

for keep_sample in shortened_subset_names:
sampleMatchPAT = re.compile(r'' + keep_sample + '$')
for sample in shortened_names:
result = sampleMatchPAT.search(sample, re.IGNORECASE)
if result:
samples_to_keep[shortened_names[sample]]= True
break


input_sample_list = input_output_list.keys()
for sample in input_sample_list:
if not sample in samples_to_keep:
del input_output_list[sample]
for sample_name in input_sample_list:
if not derive_sample_name(sample_name) in sample_subset:
del input_output_list[sample_name]
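
The rewrite above collapses the per-format regex handling into a single helper. A minimal sketch of the resulting behaviour follows; the helper body is copied from the hunk above, while the file names and the sample subset are hypothetical and only there for illustration.

# Illustrative sketch, not part of this commit: how derive_sample_name()
# feeds the simplified remove_unspecified_samples() loop.  The file names
# and the sample subset below are hypothetical.
import re
from os import path

def derive_sample_name(filename):
    basename = path.basename(filename)
    shortname = re.sub('[.]gbk$', '', basename, re.IGNORECASE)
    shortname = re.sub('[.](fasta|fas|fna|faa|fa)$', '', shortname, re.IGNORECASE)
    return shortname

input_output_list = {'/data/lagoon_A.fasta': '/out/lagoon_A',
                     '/data/lagoon_B.gbk':   '/out/lagoon_B'}
sample_subset = ['lagoon_A']

for sample_name in list(input_output_list.keys()):
    if not derive_sample_name(sample_name) in sample_subset:
        del input_output_list[sample_name]

print input_output_list.keys()   # only '/data/lagoon_A.fasta' survives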



def check_for_error_in_input_file_name(shortname, globalerrorlogger=None):
@@ -203,7 +177,7 @@ def check_for_error_in_input_file_name(shortname, globalerrorlogger=None):
return False


def create_an_input_output_pair(input_file, output_dir, format, globalerrorlogger=None):
def create_an_input_output_pair(input_file, output_dir, globalerrorlogger=None):
""" creates an input output pair if input is just an input file """

input_output = {}
@@ -212,12 +186,9 @@ def create_an_input_output_pair(input_file, output_dir, format, globalerrorlogge
return input_output

shortname = None
if format in ['gbk-unannotated', 'gbk-annotated']:
shortname = re.sub('[.]gbk$','',input_file, re.IGNORECASE)
elif format =='fasta':
shortname = re.sub('[.](fasta|fas|fna|faa|fa)$','',input_file, re.IGNORECASE)
else:
shortname = re.sub('[.]gff$','',input_file, re.IGNORECASE)
shortname = re.sub('[.]gbk$','',input_file, re.IGNORECASE)
shortname = re.sub('[.](fasta|fas|fna|faa|fa)$','',input_file, re.IGNORECASE)
# shortname = re.sub('[.]gff$','',input_file, re.IGNORECASE)

shortname = re.sub(r'.*' + PATHDELIM ,'',shortname)
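
One detail worth keeping in mind when reading the extension-stripping lines above: re.sub's fourth positional argument is count, not flags, so re.IGNORECASE (numerically 2) in that position only caps the number of substitutions and the match itself stays case-sensitive; note also that the second substitution starts again from input_file rather than from shortname, so for a .gbk input the first substitution's result is discarded. A short comparison with a made-up file name, using the (?i) inline flag for the case-insensitive form:

# Illustrative comparison, not part of this commit: passing re.IGNORECASE as
# the fourth positional argument of re.sub sets count=2 instead of a flag.
import re

name = 'sample1.FASTA'                                   # hypothetical input
print re.sub('[.](fasta|fas|fna|faa|fa)$', '', name, re.IGNORECASE)
# -> 'sample1.FASTA'  (pattern is lower-case, match stays case-sensitive)
print re.sub('(?i)[.](fasta|fas|fna|faa|fa)$', '', name)
# -> 'sample1'        (inline flag makes the match case-insensitive)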

@@ -228,23 +199,25 @@ def create_input_output_pairs(input_dir, output_dir, format, globalerrorlogge
return input_output


def create_input_output_pairs(input_dir, output_dir, format, globalerrorlogger=None):
def create_input_output_pairs(input_dir, output_dir, globalerrorlogger=None):
""" creates a list of input output pairs if input is an input dir """
fileslist = listdir(input_dir)
gbkPatt = re.compile('[.]gbk$',re.IGNORECASE)

gbkPatt = re.compile('[.]gbk$',re.IGNORECASE)
fastaPatt = re.compile('[.](fasta|fas|fna|faa|fa)$',re.IGNORECASE)
gffPatt = re.compile('[.]gff$',re.IGNORECASE)

input_files = {}
for input_file in fileslist:

shortname = None
if format in ['gbk-unannotated', 'gbk-annotated']:
result = gbkPatt.search(input_file)
if result:
shortname = re.sub('[.]gbk$','',input_file, re.IGNORECASE)
result = None

elif format in [ 'fasta' ]:
result = gbkPatt.search(input_file)
if result:
shortname = re.sub('[.]gbk$','',input_file, re.IGNORECASE)

if result==None:
result = fastaPatt.search(input_file)
if result:
shortname = re.sub('[.](fasta|fas|fna|faa|fa)$','',input_file, re.IGNORECASE)
@@ -258,7 +231,7 @@ def create_input_output_pairs(input_dir, output_dir, format, globalerrorlogger=N

paired_input = {}
for key, value in input_files.iteritems():
paired_input[input_dir + PATHDELIM + key] = path.abspath(output_dir) + PATHDELIM + value
paired_input[input_dir + PATHDELIM + key] = path.abspath(output_dir) + PATHDELIM + value

return paired_input
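
A compact sketch of the directory-scanning logic above with the format argument gone: every file matching one of the recognised extension patterns is paired with an output path, and anything else is skipped. The directory names and files below are hypothetical, and PATHDELIM is assumed to be a POSIX-style separator.

# Illustrative sketch, not part of this commit: format-independent pairing of
# input files with output directories.  File names are hypothetical.
import re
from os import path

PATHDELIM = '/'                       # assumption: POSIX-style separator
input_dir, output_dir = 'samples', 'processed'
fileslist = ['mat01.fasta', 'mat02.gbk', 'notes.txt']

gbkPatt = re.compile('[.]gbk$', re.IGNORECASE)
fastaPatt = re.compile('[.](fasta|fas|fna|faa|fa)$', re.IGNORECASE)

paired_input = {}
for input_file in fileslist:
    if gbkPatt.search(input_file):
        shortname = re.sub('[.]gbk$', '', input_file)
    elif fastaPatt.search(input_file):
        shortname = re.sub('[.](fasta|fas|fna|faa|fa)$', '', input_file)
    else:
        continue                      # unrecognised files are skipped
    paired_input[input_dir + PATHDELIM + input_file] = \
        path.abspath(output_dir) + PATHDELIM + shortname

print paired_input
# e.g. {'samples/mat01.fasta': '/abs/cwd/processed/mat01',
#       'samples/mat02.gbk':   '/abs/cwd/processed/mat02'}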

@@ -353,7 +326,6 @@ def main(argv):
command_line_params['verbose']= opts.verbose

params=parse_metapaths_parameters(parameter_f)
format = params['INPUT']['format']

""" load the sample inputs it expects either a fasta
file or a directory containing fasta and yaml file pairs
@@ -364,11 +336,11 @@ def main(argv):
input_output_list = {}
if path.isfile(input_fp):
""" check if it is a file """
input_output_list = create_an_input_output_pair(input_fp, output_dir, format, globalerrorlogger = globalerrorlogger)
input_output_list = create_an_input_output_pair(input_fp, output_dir, globalerrorlogger=globalerrorlogger)
else:
if path.exists(input_fp):
""" check if dir exists """
input_output_list = create_input_output_pairs(input_fp, output_dir, format, globalerrorlogger=globalerrorlogger)
input_output_list = create_input_output_pairs(input_fp, output_dir, globalerrorlogger=globalerrorlogger)
else:
""" must be an error """
eprintf("ERROR\tNo valid input sample file or directory containing samples exists .!")
@@ -377,15 +349,16 @@ def main(argv):

""" these are the subset of sample to process if specified
in case of an empty subset process all the sample """
if sample_subset:
remove_unspecified_samples(input_output_list, sample_subset, format, globalerrorlogger = globalerrorlogger)


if sample_subset:
remove_unspecified_samples(input_output_list, sample_subset, globalerrorlogger = globalerrorlogger)

# add check the config parameters
sorted_input_output_list = sorted(input_output_list.keys())
print sample_subset
print sorted_input_output_list


filetypes = check_file_types(sorted_input_output_list)

config_settings = read_pipeline_configuration(config_file, globalerrorlogger)

@@ -396,7 +369,6 @@ def main(argv):
exit_process("ERROR\tFailed to pass the test for required scripts and inputs before run\n")



samplesData = {}
# PART1 before the blast

@@ -415,6 +387,8 @@ def main(argv):
s.setParameter('algorithm', algorithm)
s.setParameter('ncbi_params_file', ncbi_sequin_params)
s.setParameter('ncbi_sequin_sbt', ncbi_sequin_sbt)
s.setParameter('FILE_TYPE', filetypes[input_file][0])
s.setParameter('SEQ_TYPE', filetypes[input_file][1])
s.clearJobs()

if run_type=='overwrite' and path.exists(sample_output_dir):
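
check_file_types() itself is not shown in this diff, but the call in main() and the two setParameter lines above imply that it maps every input file to a (file type, sequence type) pair which is then attached to the sample. A hedged sketch of that contract follows; the detection rules and the labels are assumptions for illustration, not the pipeline's actual implementation.

# Hedged sketch, not the real check_file_types(): one plausible shape of the
# mapping implied by filetypes[input_file][0] and filetypes[input_file][1].
import re

def check_file_types(sorted_input_output_list):
    filetypes = {}
    for input_file in sorted_input_output_list:
        if re.search('(?i)[.]gbk$', input_file):
            filetypes[input_file] = ('GENBANK-UNANNOT', 'AMINO')    # assumed labels
        elif re.search('(?i)[.]faa$', input_file):
            filetypes[input_file] = ('FASTA', 'AMINO')              # assumed labels
        elif re.search('(?i)[.](fasta|fas|fna|fa)$', input_file):
            filetypes[input_file] = ('FASTA', 'NUCL')               # assumed labels
        else:
            filetypes[input_file] = ('UNKNOWN', 'UNKNOWN')
    return filetypes

# main() then stores the pair on each sample:
#   s.setParameter('FILE_TYPE', filetypes[input_file][0])
#   s.setParameter('SEQ_TYPE',  filetypes[input_file][1])

Given the new stage-list keys in jobscreator.py (NUCL-FASTA, AMINO-FASTA, AMINO-GENBANK-UNANNOT, AMINO-GENBANK-ANNOT), s.getType() presumably combines these two parameters into one such key, though that derivation is not part of this diff.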
1 change: 1 addition & 0 deletions config/template_config.txt
@@ -32,6 +32,7 @@ RPKM_EXECUTABLE 'rpkm'
GBK_TO_FNA_FAA_GFF 'libs/python_scripts/MetaPathways_parse_genbank.py'
GFF_TO_FNA_FAA_GFF 'libs/python_scripts/MetaPathways_input_gff.py'
PREPROCESS_INPUT 'libs/python_scripts/MetaPathways_filter_input.py'
PREPROCESS_AMINOS 'libs/python_scripts/MetaPathways_preprocess_amino_input.py'
ORF_PREDICTION 'libs/python_scripts/MetaPathways_orf_prediction.py'
ORF_TO_AMINO 'libs/python_scripts/MetaPathways_create_amino_sequences.py'
COMPUTE_REFSCORES 'libs/python_scripts/MetaPathways_refscore.py'
33 changes: 17 additions & 16 deletions config/template_param.txt
@@ -19,17 +19,18 @@ orf_prediction:translation_table 11
annotation:algorithm LAST
# e.g. blast or last

#annotation:dbs COG_2013-12-27,kegg-pep-2011-06-18,refseq-nr-2014-01-18,metacyc-v4-2011-07-03,seed-2014-01-30
annotation:dbs metacyc-v4-2011-07-03,MDM_SAG_proteins,COG_2013-12-27
#annotation:dbs metacyc-v4-2011-07-03,CAZY_2014_09_04,COG_2013-12-27
#annotation:dbs metacyc-v4-2011-07-03,refseq-nr-2014-01-18,CAZY_2014_09_04,MDM_SAG_proteins,COG_2013-12-27,kegg-pep-2011-06-18,seed-2014-01-30
annotation:dbs metacyc-v4-2011-07-03,COG_2013-12-27
# e.g. annotation:dbs cog,kegg,refseq,metacyc
annotation:min_bsr 0.4
annotation:min_bsr 0.36
annotation:max_evalue 0.000001
annotation:min_score 20
annotation:min_length 60
annotation:max_hits 5

# rRNA annotation parameters LSURef_115_tax_silva
rRNA:refdbs LSURef_115_tax_silva
rRNA:refdbs GREENGENES_gg16S-2012-11-06,LSURef_115_tax_silva,SSURef_NR99_115_tax_silva

rRNA:max_evalue 0.000001
rRNA:min_identity 20
@@ -42,18 +43,18 @@ ptools_settings:taxonomic_pruning no

# pipeline execution flags
# e.g. yes, skip, yes
metapaths_steps:PREPROCESS_INPUT skip
metapaths_steps:ORF_PREDICTION skip
metapaths_steps:ORF_TO_AMINO skip
metapaths_steps:FILTER_AMINOS skip
metapaths_steps:COMPUTE_REFSCORES skip
metapaths_steps:FUNC_SEARCH skip
metapaths_steps:PARSE_FUNC_SEARCH yes
metapaths_steps:SCAN_rRNA skip
metapaths_steps:SCAN_tRNA skip
metapaths_steps:ANNOTATE_ORFS skip
metapaths_steps:PATHOLOGIC_INPUT skip
metapaths_steps:GENBANK_FILE skip
metapaths_steps:PREPROCESS_INPUT redo
metapaths_steps:ORF_PREDICTION redo
metapaths_steps:ORF_TO_AMINO redo
metapaths_steps:FILTER_AMINOS redo
metapaths_steps:COMPUTE_REFSCORES redo
metapaths_steps:FUNC_SEARCH redo
metapaths_steps:PARSE_FUNC_SEARCH redo
metapaths_steps:SCAN_rRNA redo
metapaths_steps:SCAN_tRNA redo
metapaths_steps:ANNOTATE_ORFS redo
metapaths_steps:PATHOLOGIC_INPUT redo
metapaths_steps:GENBANK_FILE redo
metapaths_steps:CREATE_ANNOT_REPORTS redo
metapaths_steps:MLTREEMAP_CALCULATION skip
metapaths_steps:BUILD_PGDB skip
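
The parameter file above uses one "section:key value" setting per line, and MetaPathways.py reads it through parse_metapaths_parameters() into nested lookups such as params['INPUT']['format']. A minimal sketch of a reader with that shape — an assumption about the real parser, shown only to make the file layout concrete:

# Hedged sketch, not the real parse_metapaths_parameters(): reading
# "section:key value" lines into a nested dict, as suggested by the
# params['INPUT']['format'] lookup in MetaPathways.py.
def parse_param_lines(lines):
    params = {}
    for line in lines:
        line = line.split('#')[0].strip()          # drop comments and blanks
        if not line or ':' not in line:
            continue
        parts = line.split(None, 1)                # "section:key", "value"
        section, _, key = parts[0].partition(':')
        params.setdefault(section, {})[key] = parts[1] if len(parts) > 1 else ''
    return params

example = ['annotation:min_bsr 0.36',
           'metapaths_steps:PREPROCESS_INPUT redo']
print parse_param_lines(example)['metapaths_steps']['PREPROCESS_INPUT']   # 'redo'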
36 changes: 19 additions & 17 deletions libs/python_modules/pipeline/jobscreator.py
@@ -46,7 +46,7 @@ def addJobs(self, s, block_mode=False):
contextCreator = ContextCreator(self.params, self.configs)

contextBlock = []
for stageList in contextCreator.getStageLists():
for stageList in contextCreator.getStageLists(s.getType()):
if block_mode ==True:
contextBlock = []
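
addJobs() now keys the stage lists on the sample's detected type rather than on a fixed format, and the block_mode branch (now the command-line default) resets the context block at each stage group. A simplified sketch of how that grouping could play out — the remainder of addJobs() is collapsed in this diff, so the block handling below is an assumption guided by the -b option's help text:

# Hedged sketch, simplified from the visible part of addJobs(): with
# block_mode each stage group becomes its own block; without it every stage
# ends up in a single block.  Stage names stand in for the real contexts.
def collect_blocks(stage_lists, block_mode):
    blocks = []
    contextBlock = []
    for stageList in stage_lists:
        if block_mode == True:
            contextBlock = []
        for stage in stageList:
            contextBlock.append(stage)
        if block_mode == True:
            blocks.append(contextBlock)
    if not block_mode:
        blocks.append(contextBlock)
    return blocks

nucl_stages = [['PREPROCESS_INPUT', 'ORF_PREDICTION'], ['FUNC_SEARCH']]
print collect_blocks(nucl_stages, True)    # [['PREPROCESS_INPUT', 'ORF_PREDICTION'], ['FUNC_SEARCH']]
print collect_blocks(nucl_stages, False)   # [['PREPROCESS_INPUT', 'ORF_PREDICTION', 'FUNC_SEARCH']]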

@@ -464,7 +464,7 @@ def create_blastp_against_refdb_cmd(self, s):
pyScript = self.configs.METAPATHWAYS_PATH + PATHDELIM + self.configs.FUNC_SEARCH
searchExec = self.configs.METAPATHWAYS_PATH + PATHDELIM + self.configs.EXECUTABLES_DIR +\
PATHDELIM + self.configs.LAST_EXECUTABLE
cmd= "%s --algorithm %s --last_executable %s --last_o %s --last_f 0 --last_db %s --last_query %s" \
cmd= "%s --algorithm %s --last_executable %s --last_o %s --last_f 2 --last_db %s --last_query %s" \
%(pyScript, s.algorithm , searchExec, blastoutput, refDbFullName, input_filtered_faa)

context.message = self._Message("LASTING AMINO SEQS AGAINST " + db)
@@ -814,13 +814,12 @@ def create_report_files_cmd(self, s):
basencbi = self.configs.REFDBS + PATHDELIM + 'ncbi_tree'
context.inputs = {
'input_annot_gff':input_annot_gff,
'KO_classification':basefun + PATHDELIM + 'KO_classification.txt',
'COG_categories':basefun + PATHDELIM + 'COG_categories.txt',
'SEED_subsystems':basefun + PATHDELIM + 'SEED_subsystems.txt',
'CAZY_hierarchy':basefun + PATHDELIM + 'CAZY_hierarchy.txt',
'ncbi_taxonomy_tree': basencbi + PATHDELIM + 'NCBI_TAXONOMY_TREE.TXT',
'ncbi_megan_map': basencbi + PATHDELIM + 'ncbi.map'

'KO_classification':basefun + PATHDELIM + 'KO_classification.txt',
'COG_categories':basefun + PATHDELIM + 'COG_categories.txt',
'SEED_subsystems':basefun + PATHDELIM + 'SEED_subsystems.txt',
'CAZY_hierarchy':basefun + PATHDELIM + 'CAZY_hierarchy.txt',
'ncbi_taxonomy_tree': basencbi + PATHDELIM + 'NCBI_TAXONOMY_TREE.TXT',
'ncbi_megan_map': basencbi + PATHDELIM + 'ncbi.map'
}
context.outputs = {
'output_results_annotation_table_dir':s.output_results_annotation_table_dir,
@@ -1120,22 +1119,23 @@ def create_rpkm_cmd(self, s):
def __init__(self, params, configs):
self.params = Singleton(Params)(params)
self.configs = Singleton(Configs)(configs)
self.format = params['INPUT']['format']
#self.format = params['INPUT']['format']
self.initFactoryList()
pass

def getContexts(self, s, stage):
stageList = {}

for stageBlock in self.stageList[self.format]:
for stageBlock in self.stageList[s.getType()]:
for _stage in stageBlock:
stageList[_stage] = True

if stage in stageList:
return self.factory[stage](s)

def getStageLists(self):
return self.stageList[self.format]
def getStageLists(self, type):
print 'type' , type
return self.stageList[type]


def initFactoryList(self):
Expand All @@ -1159,7 +1159,7 @@ def initFactoryList(self):
self.factory['MLTREEMAP_CALCULATION'] = self.create_mltreemap_cmd
self.factory['COMPUTE_RPKM'] = self.create_rpkm_cmd

self.stageList['fasta-amino'] = [
self.stageList['AMINO-FASTA'] = [
['PREPROCESS_AMINOS',
'FILTER_AMINOS',
'COMPUTE_REFSCORES' ],
@@ -1173,7 +1173,7 @@ def initFactoryList(self):
'BUILD_PGDB' ]
]

self.stageList['fasta'] = [
self.stageList['NUCL-FASTA'] = [
['PREPROCESS_INPUT',
'ORF_PREDICTION',
'ORF_TO_AMINO',
@@ -1192,11 +1192,13 @@ def initFactoryList(self):
'COMPUTE_RPKM']
]

self.stageList['gbk-unannotated'] = [
self.stageList['AMINO-GENBANK-UNANNOT'] = [
[ 'GBK_TO_FNA_FAA_GFF',
'FILTER_AMINOS',
'COMPUTE_REFSCORES' ],

[ 'FUNC_SEARCH' ],

[ 'PARSE_FUNC_SEARCH',
'SCAN_rRNA',
'SCAN_tRNA',
Expand All @@ -1208,7 +1210,7 @@ def initFactoryList(self):
'BUILD_PGDB']
]

self.stageList['gbk-annotated'] = [
self.stageList['AMINO-GENBANK-ANNOT'] = [
'GBK_TO_FNA_FAA_GFF_ANNOT',
'FILTER_AMINOS',
'SCAN_rRNA',
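
With these keys the pipeline for a sample is chosen from its detected type instead of the parameter file's format field; the old keys ('fasta', 'fasta-amino', 'gbk-unannotated', 'gbk-annotated') are gone, so a type that is not detected would raise a KeyError in getStageLists(). A small lookup sketch follows, with the stage lists abbreviated; getType() returning exactly these strings is an assumption.

# Hedged sketch, not from the source: selecting a sample's stage blocks by its
# detected type.  The stage lists are abbreviated to their first block.
stageList = {
    'NUCL-FASTA':            [['PREPROCESS_INPUT', 'ORF_PREDICTION', 'ORF_TO_AMINO']],
    'AMINO-FASTA':           [['PREPROCESS_AMINOS', 'FILTER_AMINOS', 'COMPUTE_REFSCORES']],
    'AMINO-GENBANK-UNANNOT': [['GBK_TO_FNA_FAA_GFF', 'FILTER_AMINOS', 'COMPUTE_REFSCORES']],
}

def getStageLists(sample_type):
    return stageList[sample_type]          # KeyError for undetected types

for block in getStageLists('AMINO-FASTA'):
    print block                            # ['PREPROCESS_AMINOS', 'FILTER_AMINOS', 'COMPUTE_REFSCORES']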