Skip to content

Commit

Permalink
Update sort_contigs to use shuffleBank if available. Will run faster …
Browse files Browse the repository at this point in the history
…on large datasets since it will only stream the bank once instead of constantly seeking
  • Loading branch information
skoren committed Aug 18, 2013
1 parent a32c70b commit 00300bb
Show file tree
Hide file tree
Showing 3 changed files with 38 additions and 12 deletions.
39 changes: 31 additions & 8 deletions Utilities/python/sort_contigs.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import os
from utils import *
_settings = Settings()
def sort_contigs(ocf,cf,rcf,ck,orf_fasta,orf_protein,orf_mapf,out_dir,amos_bnk,amos_dir):
def sort_contigs(ocf,cf,rcf,ck,orf_fasta,orf_protein,orf_mapf,out_dir,amos_bnk,amos_dir, lowmem):
contigs_by_class = { }
reads_by_class = { }
orf_to_src = { }
Expand Down Expand Up @@ -111,6 +111,7 @@ def sort_contigs(ocf,cf,rcf,ck,orf_fasta,orf_protein,orf_mapf,out_dir,amos_bnk,a
read_class_file.close()

# output contigs
fofn = open(out_dir + os.sep + "contig.fofn", 'w')
for key in contigs_by_class:
if key not in id_class:
continue
Expand All @@ -124,9 +125,15 @@ def sort_contigs(ocf,cf,rcf,ck,orf_fasta,orf_protein,orf_mapf,out_dir,amos_bnk,a
f = open(path + class_name + ".eid", 'w')
f.write("\n".join(contigs_by_class[key]) + "\n")
f.close()
run_process(_settings,"%s/bank2fasta -b %s -eid -E '%s%s%s.eid' > '%s%s%s.ctg.fasta'"%(amos_dir,amos_bnk,path,os.sep,class_name,path,os.sep,class_name),"Classify")

if (lowmem or not os.path.exists("%s/shuffleBank"%(amos_dir))):
run_process(_settings,"%s/bank2fasta -b %s -eid -E '%s%s%s.eid' > '%s%s%s.ctg.fasta'"%(amos_dir,amos_bnk,path,os.sep,class_name,path,os.sep,class_name),"Classify")
else:
fofn.write(path + class_name + ".eid" + "\n")
fofn.close()

# output reads
fofn = open(out_dir + os.sep + "read.fofn", 'w')
for key in reads_by_class:
if key not in id_class:
continue
Expand All @@ -140,11 +147,16 @@ def sort_contigs(ocf,cf,rcf,ck,orf_fasta,orf_protein,orf_mapf,out_dir,amos_bnk,a
f = open(path + class_name + ".read.eid", 'w')
f.write("\n".join(reads_by_class[key]) + "\n")
f.close()
run_process(_settings,"%s/dumpreads -e -E '%s%s%s.read.eid' %s > '%s%s%s.read.fasta'"%(amos_dir,path,os.sep,class_name,amos_bnk,path,os.sep,class_name),"Classify")
run_process(_settings,"%s/dumpreads -q -e -E '%s%s%s.read.eid' %s > '%s%s%s.read.qual'"%(amos_dir,path,os.sep,class_name,amos_bnk,path,os.sep,class_name),"Classify")
run_process(_settings,"%s/dumpreads -f -e -E '%s%s%s.read.eid' %s > '%s%s%s.read.fastq'"%(amos_dir,path,os.sep,class_name,amos_bnk,path,os.sep,class_name),"Classify")

if (lowmem or not os.path.exists("%s/shuffleBank"%(amos_dir))):
run_process(_settings,"%s/dumpreads -e -E '%s%s%s.read.eid' %s > '%s%s%s.read.fasta'"%(amos_dir,path,os.sep,class_name,amos_bnk,path,os.sep,class_name),"Classify")
run_process(_settings,"%s/dumpreads -q -e -E '%s%s%s.read.eid' %s > '%s%s%s.read.qual'"%(amos_dir,path,os.sep,class_name,amos_bnk,path,os.sep,class_name),"Classify")
run_process(_settings,"%s/dumpreads -f -e -E '%s%s%s.read.eid' %s > '%s%s%s.read.fastq'"%(amos_dir,path,os.sep,class_name,amos_bnk,path,os.sep,class_name),"Classify")
else:
fofn.write(path + class_name + ".read.eid" + "\n")

# finally output the orfs
fofn = open(out_dir + os.sep + "orf.fofn", 'w')
for key in orf_by_class:
if key not in id_class:
continue
Expand All @@ -158,6 +170,17 @@ def sort_contigs(ocf,cf,rcf,ck,orf_fasta,orf_protein,orf_mapf,out_dir,amos_bnk,a
f = open(path + class_name + ".orf.eid", "w")
f.write("\n".join(orf_by_class[key]) + "\n")
f.close()

run_process(_settings,"%s/dumpreads -e -E '%s%s%s.orf.eid' %s > '%s%s%s.orf.fna'"%(amos_dir,path,os.sep,class_name,orf_fasta,path,os.sep,class_name),"Classify")
run_process(_settings,"%s/dumpreads -e -E '%s%s%s.orf.eid' %s > '%s%s%s.orf.faa'"%(amos_dir,path,os.sep,class_name,orf_protein,path,os.sep,class_name),"Classify")

if (lowmem or not os.path.exists("%s/shuffleBank"%(amos_dir))):
run_process(_settings,"%s/dumpreads -e -E '%s%s%s.orf.eid' %s > '%s%s%s.orf.fna.fasta'"%(amos_dir,path,os.sep,class_name,orf_fasta,path,os.sep,class_name),"Classify")
run_process(_settings,"%s/dumpreads -e -E '%s%s%s.orf.eid' %s > '%s%s%s.orf.faa.fasta'"%(amos_dir,path,os.sep,class_name,orf_protein,path,os.sep,class_name),"Classify")
else:
fofn.write(path + class_name + ".orf.eid" + "\n")
fofn.close()

if (not lowmem and os.path.exists("%s/shuffleBank"%(amos_dir))):
run_process(_settings, "%s/shuffleBank -c -e -b %s -p ctg -eid -E %s%scontig.fofn"%(amos_dir, amos_bnk, out_dir, os.sep), "Classify")
run_process(_settings, "%s/shuffleBank -r -e -b %s -eid -E %s%sread.fofn"%(amos_dir, amos_bnk, out_dir, os.sep), "Classify")
run_process(_settings, "%s/shuffleBank -r -f -e -b %s -eid -E %s%sread.fofn"%(amos_dir, amos_bnk, out_dir, os.sep), "Classify")
run_process(_settings, "%s/shuffleBank -r -e -b %s -p fna -eid -E %s%sorf.fofn"%(amos_dir, orf_fasta, out_dir, os.sep), "Classify")
run_process(_settings, "%s/shuffleBank -r -e -b %s -p faa -eid -E %s%sorf.fofn"%(amos_dir, orf_protein, out_dir, os.sep), "Classify")
9 changes: 6 additions & 3 deletions src/classify.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,16 +12,19 @@
_readlibs = []
_skipsteps = []
_cls = None
_lowmem = False
_settings = Settings()

def init(reads, skipsteps, cls):
def init(reads, skipsteps, cls, low):
global _readlibs
global _skipsteps
global _cls
global _lowmem

_readlibs = reads
_skipsteps = skipsteps
_cls = cls
_lowmem = low

@follows(Propagate)
@posttask(touch_file("%s/Logs/classify.ok"%(_settings.rundir)))
Expand All @@ -34,7 +37,7 @@ def Classify(input,output):

if _cls == "FCP" or _cls == "fcp" or _cls == "phylosift" or _cls == "PhyloSift" or _cls == "Phylosift":
#run_process(_settings, "python %s/python/sort_contigs.py %s/Propagate/in/%s.clusters %s/Propagate/out/%s.clusters %s/Propagate/out/%s.reads.clusters %s/tax_key.tab %s/FindORFS/out/%s.fna.bnk %s/FindORFS/out/%s.faa.bnk %s/FindORFS/out/%s.gene.map %s/Classify/out %s/Scaffold/in/%s.bnk %s"%(_settings.METAMOS_UTILS, _settings.rundir, _settings.PREFIX, _settings.rundir, _settings.PREFIX, _settings.rundir, _settings.PREFIX, _settings.DB_DIR, _settings.rundir, _settings.PREFIX, _settings.rundir, _settings.PREFIX, _settings.rundir, _settings.PREFIX, _settings.rundir, _settings.rundir, _settings.PREFIX,_settings.AMOS),"Classify")
sort_contigs("%s/Propagate/in/%s.clusters"%(_settings.rundir,_settings.PREFIX),"%s/Propagate/out/%s.clusters"%(_settings.rundir,_settings.PREFIX),"%s/Propagate/out/%s.reads.clusters"%(_settings.rundir,_settings.PREFIX),"%s/tax_key.tab"%(_settings.DB_DIR),"%s/FindORFS/out/%s.fna.bnk"%(_settings.rundir,_settings.PREFIX),"%s/FindORFS/out/%s.faa.bnk"%(_settings.rundir,_settings.PREFIX),"%s/FindORFS/out/%s.gene.map"%(_settings.rundir,_settings.PREFIX),"%s/Classify/out"%(_settings.rundir),"%s/Scaffold/in/%s.bnk"%(_settings.rundir,_settings.PREFIX),"%s"%(_settings.AMOS))
sort_contigs("%s/Propagate/in/%s.clusters"%(_settings.rundir,_settings.PREFIX),"%s/Propagate/out/%s.clusters"%(_settings.rundir,_settings.PREFIX),"%s/Propagate/out/%s.reads.clusters"%(_settings.rundir,_settings.PREFIX),"%s/tax_key.tab"%(_settings.DB_DIR),"%s/FindORFS/out/%s.fna.bnk"%(_settings.rundir,_settings.PREFIX),"%s/FindORFS/out/%s.faa.bnk"%(_settings.rundir,_settings.PREFIX),"%s/FindORFS/out/%s.gene.map"%(_settings.rundir,_settings.PREFIX),"%s/Classify/out"%(_settings.rundir),"%s/Scaffold/in/%s.bnk"%(_settings.rundir,_settings.PREFIX),"%s"%(_settings.AMOS), _lowmem)
else:
#run_process(_settings, "python %s/python/sort_contigs.py %s/Propagate/in/%s.clusters %s/Propagate/out/%s.clusters %s/Propagate/out/%s.reads.clusters %s/class_key.tab %s/FindORFS/out/%s.fna.bnk %s/FindORFS/out/%s.faa.bnk %s/FindORFS/out/%s.gene.map %s/Classify/out %s/Scaffold/in/%s.bnk %s"%(_settings.METAMOS_UTILS, _settings.rundir, _settings.PREFIX, _settings.rundir, _settings.PREFIX, _settings.rundir, _settings.PREFIX, _settings.DB_DIR, _settings.rundir, _settings.PREFIX, _settings.rundir, _settings.PREFIX, _settings.rundir, _settings.PREFIX, _settings.rundir, _settings.rundir, _settings.PREFIX,_settings.AMOS),"Classify")
sort_contigs("%s/Propagate/in/%s.clusters"%(_settings.rundir,_settings.PREFIX),"%s/Propagate/out/%s.clusters"%(_settings.rundir,_settings.PREFIX),"%s/Propagate/out/%s.reads.clusters"%(_settings.rundir,_settings.PREFIX),"%s/class_key.tab"%(_settings.DB_DIR),"%s/FindORFS/out/%s.fna.bnk"%(_settings.rundir,_settings.PREFIX),"%s/FindORFS/out/%s.faa.bnk"%(_settings.rundir,_settings.PREFIX),"%s/FindORFS/out/%s.gene.map"%(_settings.rundir,_settings.PREFIX),"%s/Classify/out"%(_settings.rundir),"%s/Scaffold/in/%s.bnk"%(_settings.rundir,_settings.PREFIX),"%s"%(_settings.AMOS))
sort_contigs("%s/Propagate/in/%s.clusters"%(_settings.rundir,_settings.PREFIX),"%s/Propagate/out/%s.clusters"%(_settings.rundir,_settings.PREFIX),"%s/Propagate/out/%s.reads.clusters"%(_settings.rundir,_settings.PREFIX),"%s/class_key.tab"%(_settings.DB_DIR),"%s/FindORFS/out/%s.fna.bnk"%(_settings.rundir,_settings.PREFIX),"%s/FindORFS/out/%s.faa.bnk"%(_settings.rundir,_settings.PREFIX),"%s/FindORFS/out/%s.gene.map"%(_settings.rundir,_settings.PREFIX),"%s/Classify/out"%(_settings.rundir),"%s/Scaffold/in/%s.bnk"%(_settings.rundir,_settings.PREFIX),"%s"%(_settings.AMOS), _lowmem)
2 changes: 1 addition & 1 deletion src/runPipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -763,7 +763,7 @@ def printConfiguration(fileName=None):
scaffold.init(readlibs, skipsteps, retainBank, selected_programs["assemble"])
findscforfs.init(readlibs, skipsteps, selected_programs["findorfs"])
propagate.init(readlibs, skipsteps, selected_programs["classify"])
classify.init(readlibs, skipsteps, selected_programs["classify"])
classify.init(readlibs, skipsteps, selected_programs["classify"], lowmem)
postprocess.init(readlibs, skipsteps, selected_programs["classify"])
generic.init(skipsteps, readlibs)

Expand Down

0 comments on commit 00300bb

Please sign in to comment.