Fixed the file type detection
kishori82 committed Nov 5, 2014
1 parent 22559e5 commit c13e405
Showing 16 changed files with 651 additions and 470 deletions.
104 changes: 39 additions & 65 deletions MetaPathways.py
@@ -95,7 +95,7 @@ def createParser():
help="print lots of information on the stdout [default]")

parser.add_option("-b", "--block-mode",
action="store_true", dest="block_mode", default=False,
action="store_true", dest="block_mode", default=True,
help="processes the samples by blocking the stages before and after functional search [default off]")

parser.add_option("-P", "--print-only",
@@ -123,51 +123,25 @@ def valid_arguments(opts, args):
else:
return False

def remove_unspecified_samples(input_output_list, sample_subset, format, globalerrorlogger = None):
def derive_sample_name(filename):
basename = path.basename(filename)

shortname = re.sub('[.]gbk$','',basename, re.IGNORECASE)
shortname = re.sub('[.](fasta|fas|fna|faa|fa)$','',shortname, re.IGNORECASE)
return shortname



def remove_unspecified_samples(input_output_list, sample_subset, globalerrorlogger = None):
""" keep only the samples that are specified before processing """
shortened_names = {}

for input_file in input_output_list.keys():
shortname = None
if format in ['gbk-unannotated', 'gbk-annotated']:
shortname = re.sub('[.]gbk$','',input_file, re.IGNORECASE)
elif format =='fasta':
shortname = re.sub('[.](fasta|fas|fna|faa|fa)$','',input_file, re.IGNORECASE)

if shortname==None:
continue

shortname = re.sub(r'[.]','_',shortname)
shortened_names[shortname] = input_file

shortened_subset_names = []
for sample_in_subset in sample_subset:
shortname = None
if format in ['gbk-unannotated', 'gbk-annotated']:
shortname = re.sub('[.]gbk$','',sample_in_subset, re.IGNORECASE)
elif format =='fasta':
shortname = re.sub('[.](fasta|fas|fna|faa|fa)$','',sample_in_subset, re.IGNORECASE)

if shortname==None:
continue

if check_for_error_in_input_file_name(shortname, globalerrorlogger=globalerrorlogger):
shortened_subset_names.append(shortname)

samples_to_keep = {}

for keep_sample in shortened_subset_names:
sampleMatchPAT = re.compile(r'' + keep_sample + '$')
for sample in shortened_names:
result = sampleMatchPAT.search(sample, re.IGNORECASE)
if result:
samples_to_keep[shortened_names[sample]]= True
break


input_sample_list = input_output_list.keys()
for sample in input_sample_list:
if not sample in samples_to_keep:
del input_output_list[sample]
for sample_name in input_sample_list:
if not derive_sample_name(sample_name) in sample_subset:
del input_output_list[sample_name]
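
The rewrite above collapses the per-format regex handling into a single helper. A minimal sketch of the resulting behaviour follows; the helper body is copied from the hunk above, while the file names and the sample subset are hypothetical and only there for illustration.

# Illustrative sketch, not part of this commit: how derive_sample_name()
# feeds the simplified remove_unspecified_samples() loop.  The file names
# and the sample subset below are hypothetical.
import re
from os import path

def derive_sample_name(filename):
    basename = path.basename(filename)
    shortname = re.sub('[.]gbk$', '', basename, re.IGNORECASE)
    shortname = re.sub('[.](fasta|fas|fna|faa|fa)$', '', shortname, re.IGNORECASE)
    return shortname

input_output_list = {'/data/lagoon_A.fasta': '/out/lagoon_A',
                     '/data/lagoon_B.gbk':   '/out/lagoon_B'}
sample_subset = ['lagoon_A']

for sample_name in list(input_output_list.keys()):
    if not derive_sample_name(sample_name) in sample_subset:
        del input_output_list[sample_name]

print input_output_list.keys()   # only '/data/lagoon_A.fasta' survives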



def check_for_error_in_input_file_name(shortname, globalerrorlogger=None):
@@ -203,7 +177,7 @@ def check_for_error_in_input_file_name(shortname, globalerrorlogger=None):
return False


def create_an_input_output_pair(input_file, output_dir, format, globalerrorlogger=None):
def create_an_input_output_pair(input_file, output_dir, globalerrorlogger=None):
""" creates an input output pair if input is just an input file """

input_output = {}
@@ -212,12 +186,9 @@ def create_an_input_output_pair(input_file, output_dir, format, globalerrorlogge
return input_output

shortname = None
if format in ['gbk-unannotated', 'gbk-annotated']:
shortname = re.sub('[.]gbk$','',input_file, re.IGNORECASE)
elif format =='fasta':
shortname = re.sub('[.](fasta|fas|fna|faa|fa)$','',input_file, re.IGNORECASE)
else:
shortname = re.sub('[.]gff$','',input_file, re.IGNORECASE)
shortname = re.sub('[.]gbk$','',input_file, re.IGNORECASE)
shortname = re.sub('[.](fasta|fas|fna|faa|fa)$','',input_file, re.IGNORECASE)
# shortname = re.sub('[.]gff$','',input_file, re.IGNORECASE)

shortname = re.sub(r'.*' + PATHDELIM ,'',shortname)
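
One detail worth keeping in mind when reading the extension-stripping lines above: re.sub's fourth positional argument is count, not flags, so re.IGNORECASE (numerically 2) in that position only caps the number of substitutions and the match itself stays case-sensitive; note also that the second substitution starts again from input_file rather than from shortname, so for a .gbk input the first substitution's result is discarded. A short comparison with a made-up file name, using the (?i) inline flag for the case-insensitive form:

# Illustrative comparison, not part of this commit: passing re.IGNORECASE as
# the fourth positional argument of re.sub sets count=2 instead of a flag.
import re

name = 'sample1.FASTA'                                   # hypothetical input
print re.sub('[.](fasta|fas|fna|faa|fa)$', '', name, re.IGNORECASE)
# -> 'sample1.FASTA'  (pattern is lower-case, match stays case-sensitive)
print re.sub('(?i)[.](fasta|fas|fna|faa|fa)$', '', name)
# -> 'sample1'        (inline flag makes the match case-insensitive)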

@@ -228,23 +199,25 @@ def create_input_output_pairs(input_dir, output_dir, format, globalerrorlogge
return input_output


def create_input_output_pairs(input_dir, output_dir, format, globalerrorlogger=None):
def create_input_output_pairs(input_dir, output_dir, globalerrorlogger=None):
""" creates a list of input output pairs if input is an input dir """
fileslist = listdir(input_dir)
gbkPatt = re.compile('[.]gbk$',re.IGNORECASE)

gbkPatt = re.compile('[.]gbk$',re.IGNORECASE)
fastaPatt = re.compile('[.](fasta|fas|fna|faa|fa)$',re.IGNORECASE)
gffPatt = re.compile('[.]gff$',re.IGNORECASE)

input_files = {}
for input_file in fileslist:

shortname = None
if format in ['gbk-unannotated', 'gbk-annotated']:
result = gbkPatt.search(input_file)
if result:
shortname = re.sub('[.]gbk$','',input_file, re.IGNORECASE)
result = None

elif format in [ 'fasta' ]:
result = gbkPatt.search(input_file)
if result:
shortname = re.sub('[.]gbk$','',input_file, re.IGNORECASE)

if result==None:
result = fastaPatt.search(input_file)
if result:
shortname = re.sub('[.](fasta|fas|fna|faa|fa)$','',input_file, re.IGNORECASE)
@@ -258,7 +231,7 @@ def create_input_output_pairs(input_dir, output_dir, format, globalerrorlogger=N

paired_input = {}
for key, value in input_files.iteritems():
paired_input[input_dir + PATHDELIM + key] = path.abspath(output_dir) + PATHDELIM + value
paired_input[input_dir + PATHDELIM + key] = path.abspath(output_dir) + PATHDELIM + value

return paired_input
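
A compact sketch of the directory-scanning logic above with the format argument gone: every file matching one of the recognised extension patterns is paired with an output path, and anything else is skipped. The directory names and files below are hypothetical, and PATHDELIM is assumed to be a POSIX-style separator.

# Illustrative sketch, not part of this commit: format-independent pairing of
# input files with output directories.  File names are hypothetical.
import re
from os import path

PATHDELIM = '/'                       # assumption: POSIX-style separator
input_dir, output_dir = 'samples', 'processed'
fileslist = ['mat01.fasta', 'mat02.gbk', 'notes.txt']

gbkPatt = re.compile('[.]gbk$', re.IGNORECASE)
fastaPatt = re.compile('[.](fasta|fas|fna|faa|fa)$', re.IGNORECASE)

paired_input = {}
for input_file in fileslist:
    if gbkPatt.search(input_file):
        shortname = re.sub('[.]gbk$', '', input_file)
    elif fastaPatt.search(input_file):
        shortname = re.sub('[.](fasta|fas|fna|faa|fa)$', '', input_file)
    else:
        continue                      # unrecognised files are skipped
    paired_input[input_dir + PATHDELIM + input_file] = \
        path.abspath(output_dir) + PATHDELIM + shortname

print paired_input
# e.g. {'samples/mat01.fasta': '/abs/cwd/processed/mat01',
#       'samples/mat02.gbk':   '/abs/cwd/processed/mat02'}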

@@ -353,7 +326,6 @@ def main(argv):
command_line_params['verbose']= opts.verbose

params=parse_metapaths_parameters(parameter_f)
format = params['INPUT']['format']

""" load the sample inputs it expects either a fasta
file or a directory containing fasta and yaml file pairs
@@ -364,11 +336,11 @@ def main(argv):
input_output_list = {}
if path.isfile(input_fp):
""" check if it is a file """
input_output_list = create_an_input_output_pair(input_fp, output_dir, format, globalerrorlogger = globalerrorlogger)
input_output_list = create_an_input_output_pair(input_fp, output_dir, globalerrorlogger=globalerrorlogger)
else:
if path.exists(input_fp):
""" check if dir exists """
input_output_list = create_input_output_pairs(input_fp, output_dir, format, globalerrorlogger=globalerrorlogger)
input_output_list = create_input_output_pairs(input_fp, output_dir, globalerrorlogger=globalerrorlogger)
else:
""" must be an error """
eprintf("ERROR\tNo valid input sample file or directory containing samples exists .!")
@@ -377,15 +349,16 @@ def main(argv):

""" these are the subset of sample to process if specified
in case of an empty subset process all the sample """
if sample_subset:
remove_unspecified_samples(input_output_list, sample_subset, format, globalerrorlogger = globalerrorlogger)


if sample_subset:
remove_unspecified_samples(input_output_list, sample_subset, globalerrorlogger = globalerrorlogger)

# add check the config parameters
sorted_input_output_list = sorted(input_output_list.keys())
print sample_subset
print sorted_input_output_list


filetypes = check_file_types(sorted_input_output_list)

config_settings = read_pipeline_configuration(config_file, globalerrorlogger)

@@ -396,7 +369,6 @@ def main(argv):
exit_process("ERROR\tFailed to pass the test for required scripts and inputs before run\n")



samplesData = {}
# PART1 before the blast

@@ -415,6 +387,8 @@ def main(argv):
s.setParameter('algorithm', algorithm)
s.setParameter('ncbi_params_file', ncbi_sequin_params)
s.setParameter('ncbi_sequin_sbt', ncbi_sequin_sbt)
s.setParameter('FILE_TYPE', filetypes[input_file][0])
s.setParameter('SEQ_TYPE', filetypes[input_file][1])
s.clearJobs()

if run_type=='overwrite' and path.exists(sample_output_dir):
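
check_file_types() itself is not shown in this diff, but the call in main() and the two setParameter lines above imply that it maps every input file to a (file type, sequence type) pair which is then attached to the sample. A hedged sketch of that contract follows; the detection rules and the labels are assumptions for illustration, not the pipeline's actual implementation.

# Hedged sketch, not the real check_file_types(): one plausible shape of the
# mapping implied by filetypes[input_file][0] and filetypes[input_file][1].
import re

def check_file_types(sorted_input_output_list):
    filetypes = {}
    for input_file in sorted_input_output_list:
        if re.search('(?i)[.]gbk$', input_file):
            filetypes[input_file] = ('GENBANK-UNANNOT', 'AMINO')    # assumed labels
        elif re.search('(?i)[.]faa$', input_file):
            filetypes[input_file] = ('FASTA', 'AMINO')              # assumed labels
        elif re.search('(?i)[.](fasta|fas|fna|fa)$', input_file):
            filetypes[input_file] = ('FASTA', 'NUCL')               # assumed labels
        else:
            filetypes[input_file] = ('UNKNOWN', 'UNKNOWN')
    return filetypes

# main() then stores the pair on each sample:
#   s.setParameter('FILE_TYPE', filetypes[input_file][0])
#   s.setParameter('SEQ_TYPE',  filetypes[input_file][1])

Given the new stage-list keys in jobscreator.py (NUCL-FASTA, AMINO-FASTA, AMINO-GENBANK-UNANNOT, AMINO-GENBANK-ANNOT), s.getType() presumably combines these two parameters into one such key, though that derivation is not part of this diff.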
1 change: 1 addition & 0 deletions config/template_config.txt
@@ -32,6 +32,7 @@ RPKM_EXECUTABLE 'rpkm'
GBK_TO_FNA_FAA_GFF 'libs/python_scripts/MetaPathways_parse_genbank.py'
GFF_TO_FNA_FAA_GFF 'libs/python_scripts/MetaPathways_input_gff.py'
PREPROCESS_INPUT 'libs/python_scripts/MetaPathways_filter_input.py'
PREPROCESS_AMINOS 'libs/python_scripts/MetaPathways_preprocess_amino_input.py'
ORF_PREDICTION 'libs/python_scripts/MetaPathways_orf_prediction.py'
ORF_TO_AMINO 'libs/python_scripts/MetaPathways_create_amino_sequences.py'
COMPUTE_REFSCORES 'libs/python_scripts/MetaPathways_refscore.py'
33 changes: 17 additions & 16 deletions config/template_param.txt
@@ -19,17 +19,18 @@ orf_prediction:translation_table 11
annotation:algorithm LAST
# e.g. blast or last

#annotation:dbs COG_2013-12-27,kegg-pep-2011-06-18,refseq-nr-2014-01-18,metacyc-v4-2011-07-03,seed-2014-01-30
annotation:dbs metacyc-v4-2011-07-03,MDM_SAG_proteins,COG_2013-12-27
#annotation:dbs metacyc-v4-2011-07-03,CAZY_2014_09_04,COG_2013-12-27
#annotation:dbs metacyc-v4-2011-07-03,refseq-nr-2014-01-18,CAZY_2014_09_04,MDM_SAG_proteins,COG_2013-12-27,kegg-pep-2011-06-18,seed-2014-01-30
annotation:dbs metacyc-v4-2011-07-03,COG_2013-12-27
# e.g. annotation:dbs cog,kegg,refseq,metacyc
annotation:min_bsr 0.4
annotation:min_bsr 0.36
annotation:max_evalue 0.000001
annotation:min_score 20
annotation:min_length 60
annotation:max_hits 5

# rRNA annotation parameters LSURef_115_tax_silva
rRNA:refdbs LSURef_115_tax_silva
rRNA:refdbs GREENGENES_gg16S-2012-11-06,LSURef_115_tax_silva,SSURef_NR99_115_tax_silva

rRNA:max_evalue 0.000001
rRNA:min_identity 20
@@ -42,18 +43,18 @@ ptools_settings:taxonomic_pruning no

# pipeline execution flags
# e.g. yes, skip, yes
metapaths_steps:PREPROCESS_INPUT skip
metapaths_steps:ORF_PREDICTION skip
metapaths_steps:ORF_TO_AMINO skip
metapaths_steps:FILTER_AMINOS skip
metapaths_steps:COMPUTE_REFSCORES skip
metapaths_steps:FUNC_SEARCH skip
metapaths_steps:PARSE_FUNC_SEARCH yes
metapaths_steps:SCAN_rRNA skip
metapaths_steps:SCAN_tRNA skip
metapaths_steps:ANNOTATE_ORFS skip
metapaths_steps:PATHOLOGIC_INPUT skip
metapaths_steps:GENBANK_FILE skip
metapaths_steps:PREPROCESS_INPUT redo
metapaths_steps:ORF_PREDICTION redo
metapaths_steps:ORF_TO_AMINO redo
metapaths_steps:FILTER_AMINOS redo
metapaths_steps:COMPUTE_REFSCORES redo
metapaths_steps:FUNC_SEARCH redo
metapaths_steps:PARSE_FUNC_SEARCH redo
metapaths_steps:SCAN_rRNA redo
metapaths_steps:SCAN_tRNA redo
metapaths_steps:ANNOTATE_ORFS redo
metapaths_steps:PATHOLOGIC_INPUT redo
metapaths_steps:GENBANK_FILE redo
metapaths_steps:CREATE_ANNOT_REPORTS redo
metapaths_steps:MLTREEMAP_CALCULATION skip
metapaths_steps:BUILD_PGDB skip
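
The parameter file above uses one "section:key value" setting per line, and MetaPathways.py reads it through parse_metapaths_parameters() into nested lookups such as params['INPUT']['format']. A minimal sketch of a reader with that shape — an assumption about the real parser, shown only to make the file layout concrete:

# Hedged sketch, not the real parse_metapaths_parameters(): reading
# "section:key value" lines into a nested dict, as suggested by the
# params['INPUT']['format'] lookup in MetaPathways.py.
def parse_param_lines(lines):
    params = {}
    for line in lines:
        line = line.split('#')[0].strip()          # drop comments and blanks
        if not line or ':' not in line:
            continue
        parts = line.split(None, 1)                # "section:key", "value"
        section, _, key = parts[0].partition(':')
        params.setdefault(section, {})[key] = parts[1] if len(parts) > 1 else ''
    return params

example = ['annotation:min_bsr 0.36',
           'metapaths_steps:PREPROCESS_INPUT redo']
print parse_param_lines(example)['metapaths_steps']['PREPROCESS_INPUT']   # 'redo'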
36 changes: 19 additions & 17 deletions libs/python_modules/pipeline/jobscreator.py
@@ -46,7 +46,7 @@ def addJobs(self, s, block_mode=False):
contextCreator = ContextCreator(self.params, self.configs)

contextBlock = []
for stageList in contextCreator.getStageLists():
for stageList in contextCreator.getStageLists(s.getType()):
if block_mode ==True:
contextBlock = []
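
addJobs() now keys the stage lists on the sample's detected type rather than on a fixed format, and the block_mode branch (now the command-line default) resets the context block at each stage group. A simplified sketch of how that grouping could play out — the remainder of addJobs() is collapsed in this diff, so the block handling below is an assumption guided by the -b option's help text:

# Hedged sketch, simplified from the visible part of addJobs(): with
# block_mode each stage group becomes its own block; without it every stage
# ends up in a single block.  Stage names stand in for the real contexts.
def collect_blocks(stage_lists, block_mode):
    blocks = []
    contextBlock = []
    for stageList in stage_lists:
        if block_mode == True:
            contextBlock = []
        for stage in stageList:
            contextBlock.append(stage)
        if block_mode == True:
            blocks.append(contextBlock)
    if not block_mode:
        blocks.append(contextBlock)
    return blocks

nucl_stages = [['PREPROCESS_INPUT', 'ORF_PREDICTION'], ['FUNC_SEARCH']]
print collect_blocks(nucl_stages, True)    # [['PREPROCESS_INPUT', 'ORF_PREDICTION'], ['FUNC_SEARCH']]
print collect_blocks(nucl_stages, False)   # [['PREPROCESS_INPUT', 'ORF_PREDICTION', 'FUNC_SEARCH']]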

@@ -464,7 +464,7 @@ def create_blastp_against_refdb_cmd(self, s):
pyScript = self.configs.METAPATHWAYS_PATH + PATHDELIM + self.configs.FUNC_SEARCH
searchExec = self.configs.METAPATHWAYS_PATH + PATHDELIM + self.configs.EXECUTABLES_DIR +\
PATHDELIM + self.configs.LAST_EXECUTABLE
cmd= "%s --algorithm %s --last_executable %s --last_o %s --last_f 0 --last_db %s --last_query %s" \
cmd= "%s --algorithm %s --last_executable %s --last_o %s --last_f 2 --last_db %s --last_query %s" \
%(pyScript, s.algorithm , searchExec, blastoutput, refDbFullName, input_filtered_faa)

context.message = self._Message("LASTING AMINO SEQS AGAINST " + db)
@@ -814,13 +814,12 @@ def create_report_files_cmd(self, s):
basencbi = self.configs.REFDBS + PATHDELIM + 'ncbi_tree'
context.inputs = {
'input_annot_gff':input_annot_gff,
'KO_classification':basefun + PATHDELIM + 'KO_classification.txt',
'COG_categories':basefun + PATHDELIM + 'COG_categories.txt',
'SEED_subsystems':basefun + PATHDELIM + 'SEED_subsystems.txt',
'CAZY_hierarchy':basefun + PATHDELIM + 'CAZY_hierarchy.txt',
'ncbi_taxonomy_tree': basencbi + PATHDELIM + 'NCBI_TAXONOMY_TREE.TXT',
'ncbi_megan_map': basencbi + PATHDELIM + 'ncbi.map'

'KO_classification':basefun + PATHDELIM + 'KO_classification.txt',
'COG_categories':basefun + PATHDELIM + 'COG_categories.txt',
'SEED_subsystems':basefun + PATHDELIM + 'SEED_subsystems.txt',
'CAZY_hierarchy':basefun + PATHDELIM + 'CAZY_hierarchy.txt',
'ncbi_taxonomy_tree': basencbi + PATHDELIM + 'NCBI_TAXONOMY_TREE.TXT',
'ncbi_megan_map': basencbi + PATHDELIM + 'ncbi.map'
}
context.outputs = {
'output_results_annotation_table_dir':s.output_results_annotation_table_dir,
@@ -1120,22 +1119,23 @@ def create_rpkm_cmd(self, s):
def __init__(self, params, configs):
self.params = Singleton(Params)(params)
self.configs = Singleton(Configs)(configs)
self.format = params['INPUT']['format']
#self.format = params['INPUT']['format']
self.initFactoryList()
pass

def getContexts(self, s, stage):
stageList = {}

for stageBlock in self.stageList[self.format]:
for stageBlock in self.stageList[s.getType()]:
for _stage in stageBlock:
stageList[_stage] = True

if stage in stageList:
return self.factory[stage](s)

def getStageLists(self):
return self.stageList[self.format]
def getStageLists(self, type):
print 'type' , type
return self.stageList[type]


def initFactoryList(self):
Expand All @@ -1159,7 +1159,7 @@ def initFactoryList(self):
self.factory['MLTREEMAP_CALCULATION'] = self.create_mltreemap_cmd
self.factory['COMPUTE_RPKM'] = self.create_rpkm_cmd

self.stageList['fasta-amino'] = [
self.stageList['AMINO-FASTA'] = [
['PREPROCESS_AMINOS',
'FILTER_AMINOS',
'COMPUTE_REFSCORES' ],
@@ -1173,7 +1173,7 @@ def initFactoryList(self):
'BUILD_PGDB' ]
]

self.stageList['fasta'] = [
self.stageList['NUCL-FASTA'] = [
['PREPROCESS_INPUT',
'ORF_PREDICTION',
'ORF_TO_AMINO',
@@ -1192,11 +1192,13 @@ def initFactoryList(self):
'COMPUTE_RPKM']
]

self.stageList['gbk-unannotated'] = [
self.stageList['AMINO-GENBANK-UNANNOT'] = [
[ 'GBK_TO_FNA_FAA_GFF',
'FILTER_AMINOS',
'COMPUTE_REFSCORES' ],

[ 'FUNC_SEARCH' ],

[ 'PARSE_FUNC_SEARCH',
'SCAN_rRNA',
'SCAN_tRNA',
Expand All @@ -1208,7 +1210,7 @@ def initFactoryList(self):
'BUILD_PGDB']
]

self.stageList['gbk-annotated'] = [
self.stageList['AMINO-GENBANK-ANNOT'] = [
'GBK_TO_FNA_FAA_GFF_ANNOT',
'FILTER_AMINOS',
'SCAN_rRNA',
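
With these keys the pipeline for a sample is chosen from its detected type instead of the parameter file's format field; the old keys ('fasta', 'fasta-amino', 'gbk-unannotated', 'gbk-annotated') are gone, so a type that is not detected would raise a KeyError in getStageLists(). A small lookup sketch follows, with the stage lists abbreviated; getType() returning exactly these strings is an assumption.

# Hedged sketch, not from the source: selecting a sample's stage blocks by its
# detected type.  The stage lists are abbreviated to their first block.
stageList = {
    'NUCL-FASTA':            [['PREPROCESS_INPUT', 'ORF_PREDICTION', 'ORF_TO_AMINO']],
    'AMINO-FASTA':           [['PREPROCESS_AMINOS', 'FILTER_AMINOS', 'COMPUTE_REFSCORES']],
    'AMINO-GENBANK-UNANNOT': [['GBK_TO_FNA_FAA_GFF', 'FILTER_AMINOS', 'COMPUTE_REFSCORES']],
}

def getStageLists(sample_type):
    return stageList[sample_type]          # KeyError for undetected types

for block in getStageLists('AMINO-FASTA'):
    print block                            # ['PREPROCESS_AMINOS', 'FILTER_AMINOS', 'COMPUTE_REFSCORES']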