Skip to content

Commit

Permalink
sped up the report generation
Browse files Browse the repository at this point in the history
  • Loading branch information
kishori82 committed Mar 12, 2014
1 parent 55af473 commit 72d3824
Show file tree
Hide file tree
Showing 7 changed files with 132 additions and 62 deletions.
15 changes: 12 additions & 3 deletions MetaPathways.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@
print """ Could not load some user defined module functions"""
print """ Make sure your typed \"source MetaPathwaysrc\""""
print """ """
print traceback.print_exc(10)
#print traceback.print_exc(10)
sys.exit(3)


Expand Down Expand Up @@ -110,8 +110,13 @@ def create_an_input_output_pair(input_file, output_dir):
shortname = re.sub('[.](fasta|fas|fna|faa|gbk|gff|fa)$','',input_file, re.I)
shortname = re.sub(r'.*' + PATHDELIM ,'',shortname)
shortname = re.sub(r'[.]','_',shortname)

if re.search(r'.(fasta|fas|fna|faa|gbk|gff|fa)$',input_file, re.I):
input_output[input_file] = path.abspath(output_dir) + PATHDELIM + shortname
if len(shortname)>1:
input_output[input_file] = path.abspath(output_dir) + PATHDELIM + shortname
else:
print "WARNING : sample with one character name " + shortname + "(i.e., file \"" + input_file + "\" will be ignored"
print " because prodigal creates some problem with such files"

return input_output

Expand All @@ -125,7 +130,11 @@ def create_input_output_pairs(input_dir, output_dir):
shortname = re.sub('.(fasta|fas|fna|faa|gbk|gff|fa)$','',file,re.I)
shortname = re.sub(r'[.]','_',shortname)
if re.search('.(fasta|fas|fna|faa|gff|gbk|fa)$',file, re.I):
input_files[file] = shortname
if len(shortname)>1:
input_files[file] = shortname
else:
print "WARNING : sample with one character name " + shortname + "(i.e., file \"" + file + "\" will be ignored"
print " because prodigal creates some problem with such files"

paired_input = {}
for key, value in input_files.iteritems():
Expand Down
59 changes: 59 additions & 0 deletions README
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@

# Description of the data for the MetaPathways 2.0 data product

1. Sample_Name/preprocessed/ # the preprocessed folder
Sample_Name.fasta : the preprocessed fasta sequence after filtering the input sequences based
on the cutoffs on length, removal of ambiguous base pairs
Sample_Name.mapping.txt : the original names of the input sequences are mapped to produce
uniform names in the format Sample_Name_x, where
x is the contig number


2. Sample_Name/orf_prediction/: this folder contains the output related to the prodigal
Sample_Name.fna : Open Reading Frames (ORFs) detected by prodigal
Sample_Name.faa : the translated ORFs
Sample_Name.gff : a gff file containing the ORF information by prodigal
Sample_Name.qced.faa : these are QCed amino acid sequences from file Sample_Name.faa after removing
sequences that are too short. The names of the ORFs are in the format
Sample_Name_x_y, where
x is the contig number and y is the ORF number in contig Sample_Name_x
Sample_Name.unannot.gff : These are created from the Sample_Name.gff file after removing the filtered sequences

3. Sample_Name/blast_results/: this folder contains the BLAST or LAST results
Sample_Name.dbname.LASTout : tabular format of the LAST results of the Sample_Name.qced.faa file against the
database dbname
Sample_Name.dbname.BLASTout : tabular format of the BLAST results of the Sample_Name.qced.faa file against the
database dbname
Sample_Name.dbname.BLASTout.parsed.txt : tabular format of the parsed BLAST results of the Sample_Name.qced.faa file against the
database dbname
Sample_Name.dbname.LASTout.parsed.txt : tabular format of the parsed LAST results of the Sample_Name.qced.faa file against the
database dbname

4. Sample_Name/genbank/: this folder contains the annotated and genbank files
Sample_Name.annot.gff : The annotated file created from Sample_Name.unannot.gff (folder 2 above) by combining
the annotations created from the parsed LAST/BLAST files
Sample_Name.gbk : The genbank file created by putting the annotations into it

5. Sample_Name/mltreemap_calculations/: folder for the MLTreeMap result. Currently, it is empty


6. Sample_Name/ptools/: this is the input to the pathway tools to create the PGDB

7. Sample_Name/results/: this is the results folder
annotation_table : folder that has the annotation information tables
megan : folder to drop the megan input file
mltreemap : mltreemap summary results
pgdb : the zipped PGDB file that needs to be unzipped in the ptools-local/pgdb/user/ folder
Sample_Name.pathways.txt : provides the list of pathways in the column format
PWY_NAME PWY_COMMON_NAME NUM_REACTIONS NUM_COVERED_REACTIONS ORF_COUNT
note that at the end of each line (after the column ORF_COUNT) all the ORFs names are appended
rRNA : the rRNA scan stats file
sequin : not used currently
tRNA : the tRNA scan stats file

8. Sample_Name/run_statistics/: this folder has information about the run stats
Sample_Name.amino.stats : the amino acid stats file before and after filtering the translated ORFs
Sample_Name.contig.lengths.txt : the contig length distribution file
Sample_Name.nuc.stats : the nucleotide sequence stats file before and after filtering the translated ORFs
Sample_Name.orf.lengths.txt : the ORF length stats file
run_parameters.txt : this file stores the parameters that were used for the run
47 changes: 24 additions & 23 deletions libs/python_modules/metapaths.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,7 @@ def format_db(formatdb_executable, seqType, raw_sequence_file, formatted_db, al

if algorithm=='LAST':
# dirname = os.path.dirname(raw_sequence_file)
cmd='%s -s 5G -p -c %s %s' %(formatdb_executable, formatted_db, raw_sequence_file)
cmd='%s -s 4G -p -c %s %s' %(formatdb_executable, formatted_db, raw_sequence_file)

result= getstatusoutput(cmd)
if result[0]==0:
Expand Down Expand Up @@ -375,9 +375,9 @@ def create_annotate_genebank_cmd(sample_name, mapping_txt, input_unannotated_gff

for refdb in refdbs:
if algorithm == "LAST":
cmd = cmd + " -b " + blast_results_dir + PATHDELIM + sample_name + "." + refdb+ ".lastout.parsed.txt -d " + refdb + " -w 1 "
cmd = cmd + " -b " + blast_results_dir + PATHDELIM + sample_name + "." + refdb+ "." + algorithm + "out.parsed.txt -d " + refdb + " -w 1 "
else:
cmd = cmd + " -b " + blast_results_dir + PATHDELIM + sample_name + "." + refdb+ ".blastout.parsed.txt -d " + refdb + " -w 1 "
cmd = cmd + " -b " + blast_results_dir + PATHDELIM + sample_name + "." + refdb+ "." + algorithm + "out.parsed.txt -d " + refdb + " -w 1 "

cmd = cmd + " -m " + mapping_txt
return cmd
Expand Down Expand Up @@ -406,9 +406,9 @@ def create_report_files_cmd(dbs, input_dir, input_annotated_gff, sample_name,
db_argument_string += ' -d ' + dbname
db_argument_string += ' -b ' + input_dir + sample_name +'.' + dbname
if algorithm == "LAST":
db_argument_string += '.lastout.parsed.txt'
db_argument_string += '.' + algorithm + 'out.parsed.txt'
else:
db_argument_string += '.blastout.parsed.txt'
db_argument_string += '.' + algorithm + 'out.parsed.txt'
basefun = config_settings['REFDBS'] + PATHDELIM + 'functional_categories' + PATHDELIM
basencbi = config_settings['REFDBS'] + PATHDELIM + 'ncbi_tree' + PATHDELIM
# construct command
Expand Down Expand Up @@ -978,8 +978,8 @@ def run_metapathways_before_BLAST(input_fp, output_dir, command_handler, command
removeFileOnRedo(command_Status, output_gff)
enable_flag = shouldRunStep(run_type, output_fna) or shouldRunStep(run_type, output_faa) or shouldRunStep(run_type, output_gff)

commands.append(["\n1. Converting genbank file to fna, faa and gff files ....", genbank_to_fna_faa_gff_cmd,\
'PREPROCESS_FASTA', command_Status, enable_flag])
message = "\n" + '*** ' + sample_name + ' ***\n' +"\n1. Converting genbank file to fna, faa and gff files ...."
commands.append([message, genbank_to_fna_faa_gff_cmd, 'PREPROCESS_FASTA', command_Status, enable_flag])

#################################
if config_params['INPUT']['format'] in ['fasta']:
Expand All @@ -1003,8 +1003,8 @@ def run_metapathways_before_BLAST(input_fp, output_dir, command_handler, command
# Niels - changed from shouldRunStep1() to shouldRunStep()
enable_flag = shouldRunStep(run_type, output_fas) or shouldRunStep(run_type, nuc_stats_file ) or shouldRunStep(run_type, contig_lengths_file)


commands.append( ["\n1. Running Quality Check ....", quality_check_cmd, 'PREPROCESS_FASTA', command_Status, enable_flag])
message = "\n" + '*** ' + sample_name + ' ***\n' +"\n1. Running Quality Check ...."
commands.append( [message, quality_check_cmd, 'PREPROCESS_FASTA', command_Status, enable_flag])

#################################

Expand All @@ -1020,7 +1020,8 @@ def run_metapathways_before_BLAST(input_fp, output_dir, command_handler, command
enable_flag = shouldRunStep(run_type, output_gff)

#add command
commands.append(["\n2. Running ORF Prediction ....", orf_prediction_cmd, 'ORF_PREDICTION',command_Status, enable_flag])
message = "\n" + '*** ' + sample_name + ' ***\n' +"\n2. Running ORF Prediction ...."
commands.append([message, orf_prediction_cmd, 'ORF_PREDICTION',command_Status, enable_flag])
#################################


Expand Down Expand Up @@ -1072,7 +1073,6 @@ def run_metapathways_before_BLAST(input_fp, output_dir, command_handler, command
#algorithm= get_parameter( config_params,'annotation','algorithm')
refscores_compute_cmd = create_refscores_compute_cmd(input_faa, output_refscores, config_settings, algorithm)
command_Status= get_parameter( config_params,'metapaths_steps','COMPUTE_REFSCORE')
print os.path.exists(input_faa)
# if not hasInput(input_faa):
# command_Status = "missing"
# print "Missing " + input_faa
Expand Down Expand Up @@ -1135,7 +1135,8 @@ def run_metapathways_at_BLAST(input_fp, output_dir, command_handler, command_lin
for db in dbs:
dbname = get_refdb_name(db);
blastoutput = blast_results_dir + PATHDELIM + sample_name + "." + dbname # append "algorithout"
blastoutput += "."+algorithm+ "out"
blastoutput += "."+algorithm + "out"
blastoutputtmp = blastoutput+ ".tmp"

removeFileOnRedo(command_Status, blastoutput)
enable_flag=shouldRunStep(run_type, blastoutput)
Expand Down Expand Up @@ -1204,11 +1205,11 @@ def run_metapathways_after_BLAST(input_fp, output_dir, command_handler, command_
output_db_blast_parsed = blast_results_dir + PATHDELIM + sample_name + "." + dbname

if algorithm == 'BLAST':
output_db_blast_parsed += ".blastout.parsed.txt"
input_db_blastout += ".blastout"
output_db_blast_parsed += "." + algorithm + "out.parsed.txt"
input_db_blastout += "." + algorithm + "out"
if algorithm == 'LAST':
output_db_blast_parsed += ".lastout.parsed.txt"
input_db_blastout += ".lastout"
output_db_blast_parsed += "." + algorithm + "out.parsed.txt"
input_db_blastout += "." + algorithm + "out"

command_Status= get_parameter( config_params,'metapaths_steps','PARSE_BLAST')
removeFileOnRedo(command_Status, output_db_blast_parsed)
Expand All @@ -1230,14 +1231,14 @@ def run_metapathways_after_BLAST(input_fp, output_dir, command_handler, command_
input_fasta = preprocessed_dir + PATHDELIM + sample_name + ".fasta"
db_type = 'taxonomic'
refdbstring = get_parameter(config_params, 'rRNA', 'refdbs', default=None)
refdbnames= [ x.strip() for x in refdbstring.split(',') ]
refdbnames= [ x.strip() for x in refdbstring.split(',') if len(x.strip()) ]


message = "\n" + '*** ' + sample_name + ' ***\n' +"\n8. Scan for rRNA sequences in reference database - "
for refdbname in refdbnames:
# print '----------'
# print refdbname
rRNA_blastout= blast_results_dir + PATHDELIM + sample_name + ".rRNA." + get_refdb_name(refdbname) + ".blastout"
rRNA_blastout= blast_results_dir + PATHDELIM + sample_name + ".rRNA." + get_refdb_name(refdbname) + ".BLASTout"

command_Status= get_parameter( config_params,'metapaths_steps','SCAN_rRNA')
scan_rRNA_seqs_cmd = create_scan_rRNA_seqs_cmd(input_fasta,rRNA_blastout, refdbname, config_settings, config_params, command_Status, db_type)
Expand All @@ -1260,7 +1261,7 @@ def run_metapathways_after_BLAST(input_fp, output_dir, command_handler, command_
#print refdbnames
for refdbname in refdbnames:
rRNA_stat_results= output_results_rRNA_dir + sample_name + "." + get_refdb_name(refdbname) + ".rRNA.stats.txt"
rRNA_blastout= blast_results_dir + PATHDELIM + sample_name + ".rRNA." + get_refdb_name(refdbname) + ".blastout"
rRNA_blastout= blast_results_dir + PATHDELIM + sample_name + ".rRNA." + get_refdb_name(refdbname) + ".BLASTout"

rRNA_stats_cmd = create_rRNA_scan_statistics(rRNA_blastout, refdbname, bscore_cutoff, eval_cutoff, identity_cutoff, config_settings, rRNA_stat_results)

Expand Down Expand Up @@ -1387,8 +1388,8 @@ def run_metapathways_after_BLAST(input_fp, output_dir, command_handler, command_

genbank_annotation_table_cmd = create_genbank_ptinput_sequin_cmd(input_annot_gff, input_nucleotide_fasta, input_amino_acid_fasta, outputs, config_settings, ncbi_sequin_params, ncbi_sequin_sbt)
#add command

commands.append( ["\n" + '*** ' + sample_name + ' ***\n' +"\n12. Create genbank/sequin/ptools input command ....", genbank_annotation_table_cmd,'GENBANK_FILE', command_Status, enable_flag])
message = "\n" + '*** ' + sample_name + ' ***\n' +"\n12. Create reports/genbank/tools command ...."
commands.append( [message, genbank_annotation_table_cmd,'GENBANK_FILE', command_Status, enable_flag])
#################################


Expand All @@ -1399,12 +1400,12 @@ def run_metapathways_after_BLAST(input_fp, output_dir, command_handler, command_
command_Status= get_parameter(config_params,'metapaths_steps','CREATE_REPORT_FILES')
removeFileOnRedo(command_Status, output_annot_table)
enable_flag=shouldRunStep(run_type, output_annot_table)
message = "\n" + '*** ' + sample_name + ' ***\n' +"\n13. Creating KEGG and COG hits table and LCA based taxonomy table "
message = "\n" + '*** ' + sample_name + ' ***\n' +"\n13. Creating KEGG and COG hits table and LCA based taxonomy table...."
input_dir = blast_results_dir + PATHDELIM # D+ sample_name + "." + dbname + ".blastout.parsed"
create_report_cmd = create_report_files_cmd(dbs, input_dir, input_annotated_gff, sample_name,
output_results_annotation_table_dir, config_settings,
algorithm)
commands.append( [message + " ....", create_report_cmd, 'CREATE_REPORT_FILES', command_Status, enable_flag])
commands.append( [message, create_report_cmd, 'CREATE_REPORT_FILES', command_Status, enable_flag])

#################################

Expand Down
5 changes: 1 addition & 4 deletions libs/python_scripts/MetaPathways_create_amino_sequences.py
Original file line number Diff line number Diff line change
Expand Up @@ -196,16 +196,12 @@ def process_gff_file(gff_file_name, output_amino_file_name, output_nuc_file_name
gfffile.close()

contig_dict={}
count = 0
for line in gff_lines:
line = line.strip()
if gff_beg_pattern.search(line):
continue
insert_orf_into_dict(line, contig_dict)

#if count > 100:
# break
#count = count + 1
create_the_gene_IDs(contig_dict, nucleotide_seq_dict)
create_sequnces(output_amino_file_name, output_nuc_file_name, contig_dict, nucleotide_seq_dict)
write_gff_file(output_gff_file_name, contig_dict)
Expand Down Expand Up @@ -432,6 +428,7 @@ def process_sequence_file(sequence_file_name, seq_dictionary):




def main(argv):
# Parse options (many!)
# TODO: Create option groups
Expand Down
3 changes: 2 additions & 1 deletion libs/python_scripts/MetaPathways_create_reports_fast.py
Original file line number Diff line number Diff line change
Expand Up @@ -699,6 +699,8 @@ def merge_sorted_parsed_files(dbname, filenames, outputfilename, orfRanks):

try:
for i in range(len(filenames)):
print 'handles ' + str(len(readerhandles))
print filenames
readerhandles.append(BlastOutputTsvParser(dbname, filenames[i]) )
except OSError:
print "ERROR: Cannot read sequence file : " + filenames[i]
Expand All @@ -708,7 +710,6 @@ def merge_sorted_parsed_files(dbname, filenames, outputfilename, orfRanks):
outputfile = open(outputfilename, 'w')
fieldmapHeaderLine = readerhandles[0].getHeaderLine()
fprintf(outputfile, "%s\n",fieldmapHeaderLine)

except OSError:
print "ERROR: Cannot create sequence file : " + outputfilename
sys.exit(1)
Expand Down
31 changes: 17 additions & 14 deletions libs/python_scripts/MetaPathways_parse_blast.py
Original file line number Diff line number Diff line change
Expand Up @@ -141,14 +141,18 @@ def create_query_dictionary(blastoutputfile, query_dictionary, algorithm ):
pass

def create_dictionary(databasemapfile, annot_map, query_dictionary):
# print "query size " + str(len(query_dictionary))
if not query_dictionary:
print "WARNING : empty query dictionary in parse B/LAST"
return

seq_beg_pattern = re.compile(">")
dbmapfile = open( databasemapfile,'r')

for line in dbmapfile:
if seq_beg_pattern.search(line):
words = line.rstrip().split()
name = words[0].replace('>','',1)

if not name in query_dictionary:
continue
words.pop(0)
Expand Down Expand Up @@ -229,19 +233,18 @@ def permuteForLAST(self, words):
sys.exit(0)

def refillBuffer(self):
i = 0
self.lines = []
line = self.blastoutputfile.readline()
while line and i < self.Size:
line=self.blastoutputfile.readline()
if self.commentPATTERN.match(line):
continue
self.lines.append(line)
if not line:
break
i += 1

self.size = len(self.lines)
i = 0
self.lines = []
line = self.blastoutputfile.readline()
while line and i < self.Size:
line=self.blastoutputfile.readline()
if self.commentPATTERN.match(line):
continue
self.lines.append(line)
if not line:
break
i += 1
self.size = len(self.lines)

def next(self):
if self.i % self.Size ==0:
Expand Down
Loading

0 comments on commit 72d3824

Please sign in to comment.