Skip to content

Commit

Permalink
Add coverage stat and bogus unitig filters.
Browse files Browse the repository at this point in the history
Probably quite a few tweaks to the pipeline.
  • Loading branch information
brianwalenz committed Aug 14, 2015
1 parent dbaaaa2 commit d3e4d49
Show file tree
Hide file tree
Showing 13 changed files with 442 additions and 394 deletions.
2 changes: 2 additions & 0 deletions src/main.mk
Original file line number Diff line number Diff line change
Expand Up @@ -160,6 +160,8 @@ SUBMAKEFILES := stores/gatekeeperCreate.mk \
stores/ovStoreDump.mk \
stores/tgStoreDump.mk \
stores/tgStoreLoad.mk \
stores/tgStoreFilter.mk \
stores/tgStoreCoverageStat.mk \
stores/tgTigDisplay.mk \
\
meryl/libleaff.mk \
Expand Down
5 changes: 3 additions & 2 deletions src/pipelines/ca3g.pl
Original file line number Diff line number Diff line change
Expand Up @@ -374,9 +374,10 @@ ($$$)
consensusCheck($wrk, $asm, 2);
consensusCheck($wrk, $asm, 3);

#consensusLoad($wrk, $asm);
consensusLoad($wrk, $asm);
consensusFilter($wrk, $asm);

#outputGraph($wrk, $asm);
outputGraph($wrk, $asm);
outputLayout($wrk, $asm);
outputSequence($wrk, $asm);
}
Expand Down
149 changes: 139 additions & 10 deletions src/pipelines/ca3g/Consensus.pm
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ package ca3g::Consensus;
require Exporter;

@ISA = qw(Exporter);
@EXPORT = qw(consensusConfigure consensusCheck);
@EXPORT = qw(consensusConfigure consensusCheck consensusLoad consensusFilter);

use strict;

Expand All @@ -12,6 +12,7 @@ use File::Path qw(make_path remove_tree);
use ca3g::Defaults;
use ca3g::Execution;
use ca3g::Gatekeeper;
use ca3g::Unitig;



Expand Down Expand Up @@ -179,6 +180,7 @@ sub consensusConfigure ($$) {
my $path = "$wrk/5-consensus";

goto stopBefore if (skipStage($wrk, $asm, "consensusConfigure") == 1);
goto stopAfter if (-e "$wrk/$asm.tigStore/seqDB.v002.tig");

make_path("$path") if (! -d "$path");

Expand All @@ -195,12 +197,8 @@ sub consensusConfigure ($$) {
}
}

# How many partitions? There should be a classier way...

my $jobs = computeNumberOfConsensusJobs($wrk, $asm);

# Run consensus

if (getGlobal("cnsConsensus") eq "utgcns") {
utgcns($wrk, $asm, $jobs);

Expand All @@ -220,7 +218,7 @@ sub consensusConfigure ($$) {
allDone:
emitStage($wrk, $asm, "consensusConfigure");
stopBefore:
stopBefore("consensus", $cmd);
stopBefore("consensusConfigure", $cmd);
}


Expand All @@ -235,10 +233,9 @@ sub consensusCheck ($$$) {
my $attempt = shift @_;
my $path = "$wrk/5-consensus";

return if (skipStage($wrk, $asm, "consensusCheck", $attempt) == 1);
return if (-e "$path/cnsjob.files");

# How many partitions? There should be a classier way...
goto stopAfter if (skipStage($wrk, $asm, "consensusCheck", $attempt) == 1);
goto stopAfter if (-e "$path/cnsjob.files");
goto stopAfter if (-e "$wrk/$asm.tigStore/seqDB.v002.tig");

my $jobs = computeNumberOfConsensusJobs($wrk, $asm);

Expand Down Expand Up @@ -301,4 +298,136 @@ sub consensusCheck ($$$) {
emitStage($wrk, $asm, "consensusCheck", $attempt);

submitOrRunParallelJob($wrk, $asm, "cns", $path, "consensus", @failedJobs);

allDone:
emitStage($wrk, $asm, "consensusCheck");
stopAfter:
stopAfter("consensusCheck");
}




sub consensusLoad ($$) {
my $wrk = shift @_;
my $asm = shift @_;
my $bin = getBinDirectory();
my $cmd;
my $path = "$wrk/5-consensus";

goto stopAfter if (skipStage($wrk, $asm, "consensusLoad") == 1);
goto allDone if (-e "$wrk/$asm.tigStore/seqDB.v002.tig");

# Expects to have a cnsjob.files list of output files from the consensusCheck() function.

caExit("can't find '$path/cnsjob.files' for loading tigs into store: $!", undef) if (! -e "$path/cnsjob.files");

# Now just load them.

$cmd = "$bin/tgStoreLoad \\\n";
$cmd .= " -G $wrk/$asm.gkpStore \\\n";
$cmd .= " -T $wrk/$asm.tigStore 2 \\\n";
$cmd .= " -L $path/cnsjob.files \\\n";
$cmd .= "> $path/cnsjobs.files.tigStoreLoad.err 2>&1";

if (runCommand($path, $cmd)) {
caExit("failed to load unitig consensus into tigStore", "$path/cnsjobs.files.tigStoreLoad.err");
}

# Remvoe consensus outputs

if (-e "$path/cnsjob.files") {
print STDERR "-- Purging consensus output after loading to tigStore.\n";

my $Ncns = 0;
my $Nfastq = 0;
my $Nlayout = 0;
my $Nlog = 0;

open(F, "< $path/cnsjob.files") or caExit("can't open '$path/cnsjob.files' for reading: $!\n", undef);
while (<F>) {
chomp;
if (m/^(.*)\/0*(\d+).cns$/) {
my $ID6 = substr("00000" . $2, -6);
my $ID4 = substr("000" . $2, -4);
my $ID0 = $2;

if (-e "$1/$ID4.cns") {
$Ncns++;
unlink "$1/$ID4.cns";
}
if (-e "$1/$ID4.fastq") {
$Nfastq++;
unlink "$1/$ID4.fastq";
}
if (-e "$1/$ID4.layout") {
$Nlayout++;
unlink "$1/$ID4.layout";
}
if (-e "$1/consensus.$ID6.out") {
$Nlog++;
unlink "$1/consensus.$ID6.out";
}
if (-e "$1/consensus.$ID0.out") {
$Nlog++;
unlink "$1/consensus.$ID0.out";
}

} else {
caExit("unknown consensus job name '$_'\n", undef);
}
}
close(F);

print STDERR "-- Purged $Ncns .cns outputs.\n" if ($Ncns > 0);
print STDERR "-- Purged $Nfastq .fastq outputs.\n" if ($Nfastq > 0);
print STDERR "-- Purged $Nlayout .layout outputs.\n" if ($Nlayout > 0);
print STDERR "-- Purged $Nlog .err log outputs.\n" if ($Nlog > 0);
}

allDone:
emitStage($wrk, $asm, "consensusLoad");
stopAfter:
reportUnitigSizes($wrk, $asm, 2, "after consenss generation");
stopAfter("consensusLoad");
}




sub consensusFilter ($$) {
my $wrk = shift @_;
my $asm = shift @_;
my $bin = getBinDirectory();
my $cmd;
my $path = "$wrk/5-consensus";

goto stopAfter if (skipStage($wrk, $asm, "consensusFilter") == 1);

my $msrs = getGlobal("maxSingleReadSpan");
my $lca = getGlobal("lowCoverageAllowed"); # checkParameters() ensures that this and
my $lcd = getGlobal("lowCoverageDepth"); # this are both set or both unset
my $mru = getGlobal("minReadsUnique");
my $mul = getGlobal("minUniqueLength");
my $mrl = getGlobal("maxRepeatLength");

$cmd = "$bin/tgStoreFilter \\\n";
$cmd .= " -G $wrk/$asm.gkpStore \\\n";
$cmd .= " -T $wrk/$asm.tigStore 2 \\\n";
$cmd .= " -o $wrk/$asm.tigStore.filter \\\n";
$cmd .= " -span $msrs \\\n" if (defined($msrs));;
$cmd .= " -lowcov $lca $lcd \\\n" if (defined($lca) && defined($lcd));;
$cmd .= " -reads $mru \\\n" if (defined($mru));
$cmd .= " -long $mul \\\n" if (defined($mul));
$cmd .= " -short $mrl \\\n" if (defined($mrl));
$cmd .= "> $wrk/$asm.tigStore.filter.err 2>&1";

if (runCommand($path, $cmd)) {
caExit("failed to filter unitigs", "$wrk/$asm.tigStore.filter.err");
}

allDone:
emitStage($wrk, $asm, "consensusFilter");
stopAfter:
stopAfter("consensusFilter");
}
4 changes: 3 additions & 1 deletion src/pipelines/ca3g/CorrectReads.pm
Original file line number Diff line number Diff line change
Expand Up @@ -776,7 +776,9 @@ sub dumpCorrectedReads ($$) {
close(O);
close(F);

print STDERR "dumpCorrectedReads()-- wrote $reads corrected reads from $files files into '$wrk/$asm.correctedReads.fastq'.\n";
print STDERR "--\n";
print STDERR "-- wrote $reads corrected reads from $files files into '$wrk/$asm.correctedReads.fastq'.\n";
print STDERR "--\n";

emitStage($WRK, $asm, "cor-dumpCorrectedReads");
}
37 changes: 24 additions & 13 deletions src/pipelines/ca3g/Defaults.pm
Original file line number Diff line number Diff line change
Expand Up @@ -523,6 +523,13 @@ sub checkParameters ($) {
}
}


if ((!defined("lowCoverageAllowed") && defined("lowCoverageDepth")) ||
( defined("lowCoverageAllowed") && !defined("lowCoverageDepth"))) {
caExit("invalid 'lowCoverageAllowed' and 'lowCoverageDepth' specified; both must be set", undef);
}


#if ((getGlobal("cleanup") ne "none") &&
# (getGlobal("cleanup") ne "light") &&
# (getGlobal("cleanup") ne "heavy") &&
Expand Down Expand Up @@ -1128,12 +1135,6 @@ sub setDefaults () {
$global{"genomeSize"} = undef;
$synops{"genomeSize"} = "An estimate of the size of the genome";

$global{"utgBubblePopping"} = 1;
$synops{"utgBubblePopping"} = "Smooth polymorphic regions";

$global{"utgRecalibrateGAR"} = 1;
$synops{"utgRecalibrateGAR"} = "Use an experimental algorithm to decide unique/repeat";

$global{"batOptions"} = undef;
$synops{"batOptions"} = "Advanced options to bogart";

Expand All @@ -1143,21 +1144,28 @@ sub setDefaults () {
$global{"batThreads"} = undef;
$synops{"batThreads"} = "Number of threads to use in the Merge/Split/Join phase; default is whatever OpenMP wants";

##### Unitig Filtering Options



##### Unitig Repeat/Unique Options (formerly in scaffolder)

$global{"maxSingleReadSpan"} = undef;
$global{"maxSingleReadSpan"} = undef; # 1.0 (default as in the binary)
$synops{"maxSingleReadSpan"} = "Unitigs with a single read spanning more than this fraction of the unitig are never labeled unique";

$global{"lowCoverageDepth"} = undef;
$synops{"lowCoverageDepth"} = "See lowCoverageAllowed";
$global{"lowCoverageAllowed"} = undef; # 1.0
$synops{"lowCoverageAllowed"} = "Unitigs with more than fraction lowCoverageAllowed bases at depth at most lowCoverageDepth bases are never labeled unique";

$global{"lowCoverageAllowed"} = undef;
$synops{"lowCoverageAllowed"} = "Unitigs with more than this fraction lowCoverageDepth bases are never labeled unique";
$global{"lowCoverageDepth"} = undef; # 2
$synops{"lowCoverageDepth"} = "Unitigs with more than fraction lowCoverageAllowed bases at depth at most lowCoverageDepth bases are never labeled unique";

$global{"minReadsUnique"} = undef;
$global{"minReadsUnique"} = undef; # 2
$synops{"minReadsUnique"} = "Unitigs with fewer reads that this are never labeled unique";

$global{"maxRepeatLength"} = undef;
$global{"minUniqueLength"} = undef; # 1000
$synops{"minUniqueLength"} = "Unitigs shorter than this are always labeled non-unique";

$global{"maxRepeatLength"} = undef; # max_int
$synops{"maxRepeatLength"} = "Unitigs longer than this are always labeled unique";

##### Consensus Options
Expand Down Expand Up @@ -1209,6 +1217,9 @@ sub setDefaults () {
$global{"falconSense"} = "/work/software/falcon/install/fc_env/bin/fc_consensus.py" if (-e "/work/software/falcon/install/fc_env/bin/fc_consensus.py");
$synops{"falconSense"} = "Path to fc_consensus.py or falcon_sense.bin";




##### Ugly, command line options passed to printHelp()

$global{"help"} = "";
Expand Down
4 changes: 3 additions & 1 deletion src/pipelines/ca3g/Execution.pm
Original file line number Diff line number Diff line change
Expand Up @@ -279,9 +279,11 @@ sub lookupStageLabel ($) {

$ckp{'consensusConfigure'} = $index++;
$ckp{'consensusCheck'} = $index++; # + attempt
$ckp{'consensusLoad'} = $index++;
$ckp{'consensusFilter'} = $index++;

$ckp{'outputLabel'} = $index++;
$ckp{'outputLayout'} = $index++;
$ckp{'outputGraph'} = $index++;
$ckp{'outputSequence'} = $index++;

caFailure("invalid checkpoint label '$label'", undef) if (!defined($ckp{$label}));
Expand Down
46 changes: 43 additions & 3 deletions src/pipelines/ca3g/Output.pm
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ package ca3g::Output;
require Exporter;

@ISA = qw(Exporter);
@EXPORT = qw(outputLayout outputSequence);
@EXPORT = qw(outputLayout outputGraph outputSequence);

use strict;

Expand Down Expand Up @@ -35,8 +35,6 @@ sub outputLayout ($$) {
my $prefix = $1 if (m/^(.*).cns/);

if (-e "$prefix.layout") {
print STDERR "Copying layouts from $prefix.layout\n";

open(L, "< $prefix.layout") or caExit("can't open '$prefix.layout' for reading: $!", undef);
while (<L>) {
next if (m/^cns\s/);
Expand All @@ -63,9 +61,47 @@ sub outputLayout ($$) {
allDone:
emitStage($wrk, $asm, "outputLayout");
stopAfter:

print STDERR "--\n";
print STDERR "-- wrote unitig layouts to '$wrk/$asm.layout'.\n";
print STDERR "--\n";
}




sub outputGraph ($$) {
my $wrk = shift @_;
my $asm = shift @_;
my $bin = getBinDirectory();
my $cmd;

goto stopAfter if (skipStage($wrk, $asm, "outputGraph") == 1);
goto allDone if (-e "$wrk/$asm.graph");

#
# Stuff here.
#

stopBefore:
#stopBefore("outputSequence", $cmd);

if (runCommand($wrk, $cmd)) {
caExit("failed to output consensus", "$wrk/$asm.graph.err");
}

allDone:
emitStage($wrk, $asm, "outputGraph");
stopAfter:

print STDERR "--\n";
print STDERR "-- wrote unitig graph to (nowhere, yet).\n";
print STDERR "--\n";
}




sub outputSequence ($$) {
my $wrk = shift @_;
my $asm = shift @_;
Expand Down Expand Up @@ -115,4 +151,8 @@ sub outputSequence ($$) {
allDone:
emitStage($wrk, $asm, "outputSequence");
stopAfter:

print STDERR "--\n";
print STDERR "-- wrote unitig sequence to $wrk/$asm.consensus.f''.\n";
print STDERR "--\n";
}
Loading

0 comments on commit d3e4d49

Please sign in to comment.