Skip to content

Commit

Permalink
sped up the report generation
Browse files Browse the repository at this point in the history
  • Loading branch information
kishori82 committed Mar 12, 2014
1 parent 55af473 commit 72d3824
Show file tree
Hide file tree
Showing 7 changed files with 132 additions and 62 deletions.
15 changes: 12 additions & 3 deletions MetaPathways.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@
print """ Could not load some user defined module functions"""
print """ Make sure your typed \"source MetaPathwaysrc\""""
print """ """
print traceback.print_exc(10)
#print traceback.print_exc(10)
sys.exit(3)


Expand Down Expand Up @@ -110,8 +110,13 @@ def create_an_input_output_pair(input_file, output_dir):
shortname = re.sub('[.](fasta|fas|fna|faa|gbk|gff|fa)$','',input_file, re.I)
shortname = re.sub(r'.*' + PATHDELIM ,'',shortname)
shortname = re.sub(r'[.]','_',shortname)

if re.search(r'.(fasta|fas|fna|faa|gbk|gff|fa)$',input_file, re.I):
input_output[input_file] = path.abspath(output_dir) + PATHDELIM + shortname
if len(shortname)>1:
input_output[input_file] = path.abspath(output_dir) + PATHDELIM + shortname
else:
print "WARNING : sample with one character name " + shortname + "(i.e., file \"" + input_file + "\" will be ignored"
print " because prodigal creates some problem with such files"

return input_output

Expand All @@ -125,7 +130,11 @@ def create_input_output_pairs(input_dir, output_dir):
shortname = re.sub('.(fasta|fas|fna|faa|gbk|gff|fa)$','',file,re.I)
shortname = re.sub(r'[.]','_',shortname)
if re.search('.(fasta|fas|fna|faa|gff|gbk|fa)$',file, re.I):
input_files[file] = shortname
if len(shortname)>1:
input_files[file] = shortname
else:
print "WARNING : sample with one character name " + shortname + "(i.e., file \"" + file + "\" will be ignored"
print " because prodigal creates some problem with such files"

paired_input = {}
for key, value in input_files.iteritems():
Expand Down
59 changes: 59 additions & 0 deletions README
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@

# Description of the data for the MetaPathways 2.0 data product

1. Sample_Name/preprocessed/ # the preprocessed folder
Sample_Name.fasta : the preprocessed fasta sequence after filtering the input sequences based
on the cutoffs on length, removal of ambiguous base pairs
Sample_Name.mapping.txt : the original names of the input sequences are mapped to produce
uniform names in the format Sample_Name_x, where
x is the contig number


2. Sample_Name/orf_prediction/: this folder contains the output related to the prodigal
Sample_Name.fna : Open Reading Frames (ORFs) detected by prodigal
Sample_Name.faa : the translated ORFs
Sample_Name.gff : a gff file containing the ORF information by prodigal
Sample_Name.qced.faa : these are QCed amino acid sequences from file Sample_Name.faa after removing
sequences that are too short. The names of the ORFs are in the format
Sample_Name_x_y, where
x is the contig number and y is the ORF number in contig Sample_Name_x
Sample_Name.unannot.gff : These are created from the Sample_Name.gff file after removing the filtered sequences

3. Sample_Name/blast_results/: this folder contains the BLAST or LAST results
Sample_Name.dbname.LASTout : tabular format of the LAST results of the Sample_Name.qced.faa file against the
database dbname
Sample_Name.dbname.BLASTout : tabular format of the BLAST results of the Sample_Name.qced.faa file against the
database dbname
Sample_Name.dbname.BLASTout.parsed.txt : tabular format of the parsed BLAST results of the Sample_Name.qced.faa file against the
database dbname
Sample_Name.dbname.LASTout.parsed.txt : tabular format of the parsed LAST results of the Sample_Name.qced.faa file against the
database dbname

4. Sample_Name/genbank/: this folder contains the annotated and genbank files
Sample_Name.annot.gff : The annotated file created from Sample_Name.unannot.gff (folder 2 above) by combining
the annotations created from the parsed LAST/BLAST files
Sample_Name.gbk : The genbank file created by putting the annotations into it

5. Sample_Name/mltreemap_calculations/: folder for the MLTreeMap result. Currently, it is empty


6. Sample_Name/ptools/: this is the input to the pathway tools to create the PGDB

7. Sample_Name/results/: this is the results folder
annotation_table : folder that has the annotation information tables
megan : folder to drop the megan input file
mltreemap : mltreemap summary results
pgdb : the zipped PGDB file that needs to be unzipped in the ptools-local/pgdb/user/ folder
Sample_Name.pathways.txt : provides the list of pathways in the column format
PWY_NAME PWY_COMMON_NAME NUM_REACTIONS NUM_COVERED_REACTIONS ORF_COUNT
note that at the end of each line (after the column ORF_COUNT) all the ORFs names are appended
rRNA : the rRNA scan stats file
sequin : not used currently
tRNA : the tRNA scan stats file

8. Sample_Name/run_statistics/: this folder has information about the run stats
Sample_Name.amino.stats : the amino acid stats file before and after filtering the translated ORFs
Sample_Name.contig.lengths.txt : the contig length distribution file
Sample_Name.nuc.stats : the nucleotide sequence stats file before and after filtering the translated ORFs
Sample_Name.orf.lengths.txt : the ORF length stats file
run_parameters.txt : this file stores the parameters that were used for the run
47 changes: 24 additions & 23 deletions libs/python_modules/metapaths.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,7 @@ def format_db(formatdb_executable, seqType, raw_sequence_file, formatted_db, al

if algorithm=='LAST':
# dirname = os.path.dirname(raw_sequence_file)
cmd='%s -s 5G -p -c %s %s' %(formatdb_executable, formatted_db, raw_sequence_file)
cmd='%s -s 4G -p -c %s %s' %(formatdb_executable, formatted_db, raw_sequence_file)

result= getstatusoutput(cmd)
if result[0]==0:
Expand Down Expand Up @@ -375,9 +375,9 @@ def create_annotate_genebank_cmd(sample_name, mapping_txt, input_unannotated_gff

for refdb in refdbs:
if algorithm == "LAST":
cmd = cmd + " -b " + blast_results_dir + PATHDELIM + sample_name + "." + refdb+ ".lastout.parsed.txt -d " + refdb + " -w 1 "
cmd = cmd + " -b " + blast_results_dir + PATHDELIM + sample_name + "." + refdb+ "." + algorithm + "out.parsed.txt -d " + refdb + " -w 1 "
else:
cmd = cmd + " -b " + blast_results_dir + PATHDELIM + sample_name + "." + refdb+ ".blastout.parsed.txt -d " + refdb + " -w 1 "
cmd = cmd + " -b " + blast_results_dir + PATHDELIM + sample_name + "." + refdb+ "." + algorithm + "out.parsed.txt -d " + refdb + " -w 1 "

cmd = cmd + " -m " + mapping_txt
return cmd
Expand Down Expand Up @@ -406,9 +406,9 @@ def create_report_files_cmd(dbs, input_dir, input_annotated_gff, sample_name,
db_argument_string += ' -d ' + dbname
db_argument_string += ' -b ' + input_dir + sample_name +'.' + dbname
if algorithm == "LAST":
db_argument_string += '.lastout.parsed.txt'
db_argument_string += '.' + algorithm + 'out.parsed.txt'
else:
db_argument_string += '.blastout.parsed.txt'
db_argument_string += '.' + algorithm + 'out.parsed.txt'
basefun = config_settings['REFDBS'] + PATHDELIM + 'functional_categories' + PATHDELIM
basencbi = config_settings['REFDBS'] + PATHDELIM + 'ncbi_tree' + PATHDELIM
# construct command
Expand Down Expand Up @@ -978,8 +978,8 @@ def run_metapathways_before_BLAST(input_fp, output_dir, command_handler, command
removeFileOnRedo(command_Status, output_gff)
enable_flag = shouldRunStep(run_type, output_fna) or shouldRunStep(run_type, output_faa) or shouldRunStep(run_type, output_gff)

commands.append(["\n1. Converting genbank file to fna, faa and gff files ....", genbank_to_fna_faa_gff_cmd,\
'PREPROCESS_FASTA', command_Status, enable_flag])
message = "\n" + '*** ' + sample_name + ' ***\n' +"\n1. Converting genbank file to fna, faa and gff files ...."
commands.append([message, genbank_to_fna_faa_gff_cmd, 'PREPROCESS_FASTA', command_Status, enable_flag])

#################################
if config_params['INPUT']['format'] in ['fasta']:
Expand All @@ -1003,8 +1003,8 @@ def run_metapathways_before_BLAST(input_fp, output_dir, command_handler, command
# Niels - changed from shouldRunStep1() to shouldRunStep()
enable_flag = shouldRunStep(run_type, output_fas) or shouldRunStep(run_type, nuc_stats_file ) or shouldRunStep(run_type, contig_lengths_file)


commands.append( ["\n1. Running Quality Check ....", quality_check_cmd, 'PREPROCESS_FASTA', command_Status, enable_flag])
message = "\n" + '*** ' + sample_name + ' ***\n' +"\n1. Running Quality Check ...."
commands.append( [message, quality_check_cmd, 'PREPROCESS_FASTA', command_Status, enable_flag])

#################################

Expand All @@ -1020,7 +1020,8 @@ def run_metapathways_before_BLAST(input_fp, output_dir, command_handler, command
enable_flag = shouldRunStep(run_type, output_gff)

#add command
commands.append(["\n2. Running ORF Prediction ....", orf_prediction_cmd, 'ORF_PREDICTION',command_Status, enable_flag])
message = "\n" + '*** ' + sample_name + ' ***\n' +"\n2. Running ORF Prediction ...."
commands.append([message, orf_prediction_cmd, 'ORF_PREDICTION',command_Status, enable_flag])
#################################


Expand Down Expand Up @@ -1072,7 +1073,6 @@ def run_metapathways_before_BLAST(input_fp, output_dir, command_handler, command
#algorithm= get_parameter( config_params,'annotation','algorithm')
refscores_compute_cmd = create_refscores_compute_cmd(input_faa, output_refscores, config_settings, algorithm)
command_Status= get_parameter( config_params,'metapaths_steps','COMPUTE_REFSCORE')
print os.path.exists(input_faa)
# if not hasInput(input_faa):
# command_Status = "missing"
# print "Missing " + input_faa
Expand Down Expand Up @@ -1135,7 +1135,8 @@ def run_metapathways_at_BLAST(input_fp, output_dir, command_handler, command_lin
for db in dbs:
dbname = get_refdb_name(db);
blastoutput = blast_results_dir + PATHDELIM + sample_name + "." + dbname # append "algorithout"
blastoutput += "."+algorithm+ "out"
blastoutput += "."+algorithm + "out"
blastoutputtmp = blastoutput+ ".tmp"

removeFileOnRedo(command_Status, blastoutput)
enable_flag=shouldRunStep(run_type, blastoutput)
Expand Down Expand Up @@ -1204,11 +1205,11 @@ def run_metapathways_after_BLAST(input_fp, output_dir, command_handler, command_
output_db_blast_parsed = blast_results_dir + PATHDELIM + sample_name + "." + dbname

if algorithm == 'BLAST':
output_db_blast_parsed += ".blastout.parsed.txt"
input_db_blastout += ".blastout"
output_db_blast_parsed += "." + algorithm + "out.parsed.txt"
input_db_blastout += "." + algorithm + "out"
if algorithm == 'LAST':
output_db_blast_parsed += ".lastout.parsed.txt"
input_db_blastout += ".lastout"
output_db_blast_parsed += "." + algorithm + "out.parsed.txt"
input_db_blastout += "." + algorithm + "out"

command_Status= get_parameter( config_params,'metapaths_steps','PARSE_BLAST')
removeFileOnRedo(command_Status, output_db_blast_parsed)
Expand All @@ -1230,14 +1231,14 @@ def run_metapathways_after_BLAST(input_fp, output_dir, command_handler, command_
input_fasta = preprocessed_dir + PATHDELIM + sample_name + ".fasta"
db_type = 'taxonomic'
refdbstring = get_parameter(config_params, 'rRNA', 'refdbs', default=None)
refdbnames= [ x.strip() for x in refdbstring.split(',') ]
refdbnames= [ x.strip() for x in refdbstring.split(',') if len(x.strip()) ]


message = "\n" + '*** ' + sample_name + ' ***\n' +"\n8. Scan for rRNA sequences in reference database - "
for refdbname in refdbnames:
# print '----------'
# print refdbname
rRNA_blastout= blast_results_dir + PATHDELIM + sample_name + ".rRNA." + get_refdb_name(refdbname) + ".blastout"
rRNA_blastout= blast_results_dir + PATHDELIM + sample_name + ".rRNA." + get_refdb_name(refdbname) + ".BLASTout"

command_Status= get_parameter( config_params,'metapaths_steps','SCAN_rRNA')
scan_rRNA_seqs_cmd = create_scan_rRNA_seqs_cmd(input_fasta,rRNA_blastout, refdbname, config_settings, config_params, command_Status, db_type)
Expand All @@ -1260,7 +1261,7 @@ def run_metapathways_after_BLAST(input_fp, output_dir, command_handler, command_
#print refdbnames
for refdbname in refdbnames:
rRNA_stat_results= output_results_rRNA_dir + sample_name + "." + get_refdb_name(refdbname) + ".rRNA.stats.txt"
rRNA_blastout= blast_results_dir + PATHDELIM + sample_name + ".rRNA." + get_refdb_name(refdbname) + ".blastout"
rRNA_blastout= blast_results_dir + PATHDELIM + sample_name + ".rRNA." + get_refdb_name(refdbname) + ".BLASTout"

rRNA_stats_cmd = create_rRNA_scan_statistics(rRNA_blastout, refdbname, bscore_cutoff, eval_cutoff, identity_cutoff, config_settings, rRNA_stat_results)

Expand Down Expand Up @@ -1387,8 +1388,8 @@ def run_metapathways_after_BLAST(input_fp, output_dir, command_handler, command_

genbank_annotation_table_cmd = create_genbank_ptinput_sequin_cmd(input_annot_gff, input_nucleotide_fasta, input_amino_acid_fasta, outputs, config_settings, ncbi_sequin_params, ncbi_sequin_sbt)
#add command

commands.append( ["\n" + '*** ' + sample_name + ' ***\n' +"\n12. Create genbank/sequin/ptools input command ....", genbank_annotation_table_cmd,'GENBANK_FILE', command_Status, enable_flag])
message = "\n" + '*** ' + sample_name + ' ***\n' +"\n12. Create reports/genbank/tools command ...."
commands.append( [message, genbank_annotation_table_cmd,'GENBANK_FILE', command_Status, enable_flag])
#################################


Expand All @@ -1399,12 +1400,12 @@ def run_metapathways_after_BLAST(input_fp, output_dir, command_handler, command_
command_Status= get_parameter(config_params,'metapaths_steps','CREATE_REPORT_FILES')
removeFileOnRedo(command_Status, output_annot_table)
enable_flag=shouldRunStep(run_type, output_annot_table)
message = "\n" + '*** ' + sample_name + ' ***\n' +"\n13. Creating KEGG and COG hits table and LCA based taxonomy table "
message = "\n" + '*** ' + sample_name + ' ***\n' +"\n13. Creating KEGG and COG hits table and LCA based taxonomy table...."
input_dir = blast_results_dir + PATHDELIM # D+ sample_name + "." + dbname + ".blastout.parsed"
create_report_cmd = create_report_files_cmd(dbs, input_dir, input_annotated_gff, sample_name,
output_results_annotation_table_dir, config_settings,
algorithm)
commands.append( [message + " ....", create_report_cmd, 'CREATE_REPORT_FILES', command_Status, enable_flag])
commands.append( [message, create_report_cmd, 'CREATE_REPORT_FILES', command_Status, enable_flag])

#################################

Expand Down
5 changes: 1 addition & 4 deletions libs/python_scripts/MetaPathways_create_amino_sequences.py
Original file line number Diff line number Diff line change
Expand Up @@ -196,16 +196,12 @@ def process_gff_file(gff_file_name, output_amino_file_name, output_nuc_file_name
gfffile.close()

contig_dict={}
count = 0
for line in gff_lines:
line = line.strip()
if gff_beg_pattern.search(line):
continue
insert_orf_into_dict(line, contig_dict)

#if count > 100:
# break
#count = count + 1
create_the_gene_IDs(contig_dict, nucleotide_seq_dict)
create_sequnces(output_amino_file_name, output_nuc_file_name, contig_dict, nucleotide_seq_dict)
write_gff_file(output_gff_file_name, contig_dict)
Expand Down Expand Up @@ -432,6 +428,7 @@ def process_sequence_file(sequence_file_name, seq_dictionary):




def main(argv):
# Parse options (many!)
# TODO: Create option groups
Expand Down
3 changes: 2 additions & 1 deletion libs/python_scripts/MetaPathways_create_reports_fast.py
Original file line number Diff line number Diff line change
Expand Up @@ -699,6 +699,8 @@ def merge_sorted_parsed_files(dbname, filenames, outputfilename, orfRanks):

try:
for i in range(len(filenames)):
print 'handles ' + str(len(readerhandles))
print filenames
readerhandles.append(BlastOutputTsvParser(dbname, filenames[i]) )
except OSError:
print "ERROR: Cannot read sequence file : " + filenames[i]
Expand All @@ -708,7 +710,6 @@ def merge_sorted_parsed_files(dbname, filenames, outputfilename, orfRanks):
outputfile = open(outputfilename, 'w')
fieldmapHeaderLine = readerhandles[0].getHeaderLine()
fprintf(outputfile, "%s\n",fieldmapHeaderLine)

except OSError:
print "ERROR: Cannot create sequence file : " + outputfilename
sys.exit(1)
Expand Down
31 changes: 17 additions & 14 deletions libs/python_scripts/MetaPathways_parse_blast.py
Original file line number Diff line number Diff line change
Expand Up @@ -141,14 +141,18 @@ def create_query_dictionary(blastoutputfile, query_dictionary, algorithm ):
pass

def create_dictionary(databasemapfile, annot_map, query_dictionary):
# print "query size " + str(len(query_dictionary))
if not query_dictionary:
print "WARNING : empty query dictionary in parse B/LAST"
return

seq_beg_pattern = re.compile(">")
dbmapfile = open( databasemapfile,'r')

for line in dbmapfile:
if seq_beg_pattern.search(line):
words = line.rstrip().split()
name = words[0].replace('>','',1)

if not name in query_dictionary:
continue
words.pop(0)
Expand Down Expand Up @@ -229,19 +233,18 @@ def permuteForLAST(self, words):
sys.exit(0)

def refillBuffer(self):
i = 0
self.lines = []
line = self.blastoutputfile.readline()
while line and i < self.Size:
line=self.blastoutputfile.readline()
if self.commentPATTERN.match(line):
continue
self.lines.append(line)
if not line:
break
i += 1

self.size = len(self.lines)
i = 0
self.lines = []
line = self.blastoutputfile.readline()
while line and i < self.Size:
line=self.blastoutputfile.readline()
if self.commentPATTERN.match(line):
continue
self.lines.append(line)
if not line:
break
i += 1
self.size = len(self.lines)

def next(self):
if self.i % self.Size ==0:
Expand Down
Loading

0 comments on commit 72d3824

Please sign in to comment.