Skip to content

Commit

Permalink
Simplify the logic that decides which top-level stages run and that u…
Browse files Browse the repository at this point in the history
…pdates seqStore.
  • Loading branch information
brianwalenz committed Aug 26, 2020
1 parent d4d3779 commit 6d27414
Show file tree
Hide file tree
Showing 7 changed files with 479 additions and 550 deletions.
441 changes: 243 additions & 198 deletions src/pipelines/canu.pl

Large diffs are not rendered by default.

118 changes: 37 additions & 81 deletions src/pipelines/canu/CorrectReads.pm
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ sub getCorCov ($$) {
my $typ = shift @_;
my $cov = getGlobal("corMaxEvidenceCoverage$typ");

my $exp = getExpectedCoverage("cor", $asm);
my $exp = getExpectedCoverage($asm, "cor");
my $des = getGlobal("corOutCoverage");

if (!defined($cov)) {
Expand Down Expand Up @@ -95,7 +95,7 @@ sub setupCorrectionParameters ($) {
# Set the minimum coverage for a corrected read based on coverage in input reads.

if (!defined(getGlobal("corMinCoverage"))) {
my $cov = getExpectedCoverage("cor", $asm);
my $cov = getExpectedCoverage($asm, "cor");

setGlobal("corMinCoverage", 4);
setGlobal("corMinCoverage", 4) if ($cov < 60);
Expand Down Expand Up @@ -639,24 +639,26 @@ sub loadCorrectedReads ($) {

unlink("$base/$asm.loadCorrectedReads.err");

# Save updated stores.
# Summarize and save updated stores.

generateReadLengthHistogram("obt", $asm);
stashSeqStore($asm);

stashFile("$base/$asm.corStore/seqDB.v002.dat");
stashFile("$base/$asm.corStore/seqDB.v002.tig");

# Report reads.

addToReport("obtSeqStore", generateReadLengthHistogram("obt", $asm));

# Now that all outputs are (re)written, cleanup the job outputs.
# (unless there are no corrected reads, then leave things alone for debugging)

my $Ncns = 0;
my $Nerr = 0;
my $Nlog = 0;

if (getGlobal("saveReadCorrections") != 1) {
if (getNumberOfReadsInStore($asm, "obt") == 0) {
print STDERR "--\n";
print STDERR "-- No corrected reads generated; correctReads output saved.\n";
}
elsif (getGlobal("saveReadCorrections") != 1) {
print STDERR "--\n";
print STDERR "-- Purging correctReads output after loading into stores.\n";

Expand All @@ -669,23 +671,11 @@ sub loadCorrectedReads ($) {
my $ID4 = substr("000" . $2, -4);
my $ID0 = $2;

if (-e "correction/$1/results/$ID4.cns") {
$Ncns++;
unlink "correction/$1/results/$ID4.cns";
}
if (-e "correction/$1/results/$ID4.err") {
$Nlog++;
unlink "correction/$1/results/$ID4.err";
}

if (-e "correction/$1/correctReads.$ID6.out") {
$Nlog++;
unlink "correction/$1/correctReads.$ID6.out";
}
if (-e "correction/$1/correctReads.$ID0.out") {
$Nlog++;
unlink "correction/$1/correctReads.$ID0.out";
}
if (-e "correction/$1/results/$ID4.cns") { $Ncns++; unlink "correction/$1/results/$ID4.cns"; }
if (-e "correction/$1/results/$ID4.err") { $Nlog++; unlink "correction/$1/results/$ID4.err"; }

if (-e "correction/$1/correctReads.$ID6.out") { $Nlog++; unlink "correction/$1/correctReads.$ID6.out"; }
if (-e "correction/$1/correctReads.$ID0.out") { $Nlog++; unlink "correction/$1/correctReads.$ID0.out"; }

} else {
caExit("unknown correctReads job name '$_'\n", undef);
Expand All @@ -696,19 +686,24 @@ sub loadCorrectedReads ($) {
print STDERR "-- Purged $Ncns .cns outputs.\n" if ($Ncns > 0);
print STDERR "-- Purged $Nerr .err outputs.\n" if ($Nerr > 0);
print STDERR "-- Purged $Nlog .out job log outputs.\n" if ($Nlog > 0);
} else {
}
else {
print STDERR "--\n";
print STDERR "-- Purging correctReads output disabled by saveReadCorrections=true.\n" if (getGlobal("saveReadCorrections") == 1);
}

# And purge the usually massive overlap store.

if (getGlobal("saveOverlaps") eq "0") {
if (getNumberOfReadsInStore($asm, "obt") > 0) {
print STDERR "--\n";
print STDERR "-- No corrected reads generated, overlaps used for correction saved.\n";
}
elsif (getGlobal("saveOverlaps") eq "0") {
print STDERR "--\n";
print STDERR "-- Purging overlaps used for correction.\n";

remove_tree("correction/$asm.ovlStore")
} else {
}
else {
print STDERR "--\n";
print STDERR "-- Overlaps used for correction saved.\n";
}
Expand All @@ -729,65 +724,26 @@ sub dumpCorrectedReads ($) {
my $cmd;

goto allDone if (fileExists("$asm.correctedReads.fasta.gz"));
goto allDone if (fileExists("$asm.correctedReads.fastq.gz"));
goto allDone if (getGlobal("saveReads") == 0);

# We need to skip this entire function if corrected reads were not computed.
# Otherwise, we incorrectly declare that no corrected reads were generated
# and halt the assembly.
#
# In cloud mode, we dump reads right after loading them, and to load them,
# the 2-correction directory must exist, so we use that to decide if correction
# was attempted.

return if (! -d "correction/2-correction");

# If no corrected reads exist, don't bother trying to dump them.
$cmd = "$bin/sqStoreDumpFASTQ \\\n";
$cmd .= " -corrected \\\n";
$cmd .= " -S ./$asm.seqStore \\\n";
$cmd .= " -o ./$asm.correctedReads.gz \\\n";
$cmd .= " -fasta \\\n";
$cmd .= " -nolibname \\\n";
$cmd .= "> $asm.correctedReads.fasta.err 2>&1";

if (getNumberOfReadsInStore($asm, "obt") > 0) {
$cmd = "$bin/sqStoreDumpFASTQ \\\n";
$cmd .= " -corrected \\\n";
$cmd .= " -S ./$asm.seqStore \\\n";
$cmd .= " -o ./$asm.correctedReads.gz \\\n";
$cmd .= " -fasta \\\n";
$cmd .= " -nolibname \\\n";
$cmd .= "> $asm.correctedReads.fasta.err 2>&1";

if (runCommand(".", $cmd)) {
caExit("failed to output corrected reads", "./$asm.correctedReads.fasta.err");
}

unlink "./$asm.correctedReads.fasta.err";

stashFile("$asm.correctedReads.fasta.gz");
if (runCommand(".", $cmd)) {
caExit("failed to output corrected reads", "./$asm.correctedReads.fasta.err");
}

# If the corrected reads file exists, report so.
# Otherwise, report no corrected reads, and generate fake outputs so we terminate.

my $out;

$out = "$asm.correctedReads.fasta.gz" if (fileExists("$asm.correctedReads.fasta.gz"));
$out = "$asm.correctedReads.fastq.gz" if (fileExists("$asm.correctedReads.fastq.gz"));

if (defined($out)) {
print STDERR "--\n";
print STDERR "-- Corrected reads saved in '$out'.\n";
} else {
print STDERR "--\n";
print STDERR "-- Yikes! No corrected reads generated!\n";
print STDERR "-- Can't proceed!\n";
print STDERR "--\n";
print STDERR "-- Generating empty outputs.\n";
unlink "./$asm.correctedReads.fasta.err";

runCommandSilently(".", "gzip -1vc < /dev/null > $asm.correctedReads.gz 2> /dev/null", 0) if (! -e "$asm.correctedReads.gz");
runCommandSilently(".", "gzip -1vc < /dev/null > $asm.trimmedReads.gz 2> /dev/null", 0) if (! -e "$asm.trimmedReads.gz");
stashFile("$asm.correctedReads.fasta.gz");

stashFile("$asm.correctedReads.fasta.gz");
stashFile("$asm.trimmedReads.fasta.gz");

generateOutputs($asm);
}
print STDERR "--\n";
print STDERR "-- Corrected reads saved in '$asm.correctedReads.fasta.gz'.\n";

finishStage:
generateReport($asm);
Expand Down
4 changes: 2 additions & 2 deletions src/pipelines/canu/Meryl.pm
Original file line number Diff line number Diff line change
Expand Up @@ -389,7 +389,7 @@ sub merylConfigure ($$) {

my $mem = getGlobal("merylMemory");
my $thr = getGlobal("merylThreads");
my $cov = getExpectedCoverage($tag, $asm);
my $cov = getExpectedCoverage($asm, $tag);

open(F, "> $path/meryl-configure.sh");
print F "#!" . getGlobal("shell") . "\n";
Expand Down Expand Up @@ -639,7 +639,7 @@ sub merylConfigure ($$) {
}

if (getGlobal("${tag}Overlapper") eq "mhap") {
$mthresh = int(5.0 * getExpectedCoverage($tag, $asm));
$mthresh = int(5.0 * getExpectedCoverage($asm, $tag));
$mdistinct = undef;
$mwordfreq = getGlobal("${tag}MhapFilterThreshold");
}
Expand Down
83 changes: 24 additions & 59 deletions src/pipelines/canu/OverlapBasedTrimming.pm
Original file line number Diff line number Diff line change
Expand Up @@ -209,22 +209,24 @@ sub loadTrimmedReads ($) {

unlink("$path/$asm.loadTrimmedReads.err");

# Save results.
# Summarize and save results.

generateReadLengthHistogram("utg", $asm);
stashSeqStore($asm);

# Report reads.
# If no trimmed reads were generated, stop now so we
# preserve the stores.

addToReport("utgSeqStore", generateReadLengthHistogram("utg", $asm));

#stashFile("./$asm.trimmedReads.fasta.gz");

if (getGlobal("saveOverlaps") eq "0") {
if (getNumberOfReadsInStore($asm, "utg") > 0) {
print STDERR "--\n";
print STDERR "-- No trimmed reads generated, overlaps used for trimming saved.\n";
}
elsif (getGlobal("saveOverlaps") eq "0") {
print STDERR "--\n";
print STDERR "-- Purging overlaps used for trimming.\n";

remove_tree("trimming/$asm.ovlStore")
} else {
}
else {
print STDERR "--\n";
print STDERR "-- Overlaps used for trimming saved.\n";
}
Expand All @@ -244,63 +246,26 @@ sub dumpTrimmedReads ($) {
my $cmd;

goto allDone if (fileExists("$asm.trimmedReads.fasta.gz"));
goto allDone if (fileExists("$asm.trimmedReads.fastq.gz"));
goto allDone if (getGlobal("saveReads") == 0);

# We need to skip this entire function if trimmed reads were not computed.
# Otherwise, we incorrectly declare that all reads were trimmed out
# and halt the assembly.
#
# In cloud mode, we dump reads right after loading them, and to load them,
# the 3-overlapbasedtrimming directory must exist, so we use that to decide if trimming
# was attempted.

return if (! -d "trimming/3-overlapbasedtrimming");

# If no trimmed reads exist, don't bother trying to dump them.

if (getNumberOfReadsInStore($asm, "utg") > 0) {
$cmd = "$bin/sqStoreDumpFASTQ \\\n";
$cmd .= " -trimmed \\\n";
$cmd .= " -S ./$asm.seqStore \\\n";
$cmd .= " -o ./$asm.trimmedReads.gz \\\n";
$cmd .= " -fasta \\\n";
$cmd .= " -trimmed -normal -nolibname \\\n";
$cmd .= "> ./$asm.trimmedReads.fasta.err 2>&1";

if (runCommand(".", $cmd)) {
caExit("failed to output trimmed reads", "./$asm.trimmedReads.fasta.err");
}

unlink "./$asm.trimmedReads.fasta.err";
$cmd = "$bin/sqStoreDumpFASTQ \\\n";
$cmd .= " -trimmed \\\n";
$cmd .= " -S ./$asm.seqStore \\\n";
$cmd .= " -o ./$asm.trimmedReads.gz \\\n";
$cmd .= " -fasta \\\n";
$cmd .= " -trimmed -normal -nolibname \\\n";
$cmd .= "> ./$asm.trimmedReads.fasta.err 2>&1";

stashFile("$asm.trimmedReads.fasta.gz");
if (runCommand(".", $cmd)) {
caExit("failed to output trimmed reads", "./$asm.trimmedReads.fasta.err");
}

# If the trimmed reads file exists, report so.
# Otherwise, report no trimmed reads, and generate fake outputs so we terminate.
unlink "./$asm.trimmedReads.fasta.err";

my $out;
stashFile("$asm.trimmedReads.fasta.gz");

$out = "$asm.trimmedReads.fasta.gz" if (fileExists("$asm.trimmedReads.fasta.gz"));
$out = "$asm.trimmedReads.fastq.gz" if (fileExists("$asm.trimmedReads.fastq.gz"));

if (defined($out)) {
print STDERR "--\n";
print STDERR "-- Trimmed reads saved in '$out'.\n";
} else {
print STDERR "--\n";
print STDERR "-- Yikes! No trimmed reads generated!\n";
print STDERR "-- Can't proceed!\n";
print STDERR "--\n";
print STDERR "-- Generating empty outputs.\n";

runCommandSilently(".", "gzip -1vc < /dev/null > $asm.trimmedReads.gz 2> /dev/null", 0) if (! -e "$asm.trimmedReads.gz");

stashFile("$asm.trimmedReads.fasta.gz");

generateOutputs($asm);
}
print STDERR "--\n";
print STDERR "-- Trimmed reads saved in '$asm.trimmedReads.fasta.gz'.\n";

finishStage:
generateReport($asm);
Expand Down
2 changes: 1 addition & 1 deletion src/pipelines/canu/OverlapMhap.pm
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,7 @@ sub mhapConfigure ($$$) {
my ($numHashes, $minNumMatches, $threshold, $ordSketch, $ordSketchMer);

if (!defined(getGlobal("${tag}MhapSensitivity"))) {
my $cov = getExpectedCoverage($tag, $asm);
my $cov = getExpectedCoverage($asm, $tag);

setGlobal("${tag}MhapSensitivity", "low"); # Yup, super inefficient. The code is
setGlobal("${tag}MhapSensitivity", "normal") if ($cov < 60); # compact and clear and runs once.
Expand Down
2 changes: 1 addition & 1 deletion src/pipelines/canu/OverlapStore.pm
Original file line number Diff line number Diff line change
Expand Up @@ -934,7 +934,7 @@ sub createOverlapStore ($$) {

if ($tag eq "utg") {
$cmd = "$bin/ovStoreStats \\\n";
$cmd .= " -C " . getExpectedCoverage("utg", $asm). " \\\n";
$cmd .= " -C " . getExpectedCoverage($asm, "utg"). " \\\n";
$cmd .= " -S ../$asm.seqStore \\\n";
$cmd .= " -O ./$asm.ovlStore \\\n";
$cmd .= " -o ./$asm.ovlStore \\\n";
Expand Down
Loading

0 comments on commit 6d27414

Please sign in to comment.