Skip to content

Commit

Permalink
Auto-increase overlap minimum size for bogart, temporary while bogart…
Browse files Browse the repository at this point in the history
… splitting algorithm is improved
  • Loading branch information
skoren committed Feb 29, 2016
1 parent 81a7505 commit 0bcc712
Showing 1 changed file with 54 additions and 10 deletions.
64 changes: 54 additions & 10 deletions src/pipelines/canu/Unitig.pm
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ require Exporter;
use strict;

use File::Path qw(make_path remove_tree);
use POSIX qw(ceil);

use canu::Defaults;
use canu::Execution;
Expand All @@ -49,6 +50,47 @@ use canu::Gatekeeper;
use canu::HTML;
use canu::Meryl;

sub roundoff($$) {
my $num = shift;
my $base = shift || 1;

return int($num/$base + 0.5)*$base;
}

sub estimateOverlapSize($$) {
my $wrk = shift @_;
my $asm = shift @_;
my $bin = getBinDirectory();
my $minread = getGlobal("minReadLength");
my $ovl = getGlobal("minOverlapLength");
my $gs = getGlobal("genomeSize");

# if the user manually set a threshold, use theirs, don't change it
if ($ovl != 500) {
return $ovl;
}

# for larger genomes bogart works better when ignoring short (repetitive) overlaps
# this is a temoporary fix while bogart splitting is improved
# we don't do it for all genomes to avoid losing overlaps for assembly of small genomes
open(F, "$bin/gatekeeperDumpMetaData -stats -G $wrk/$asm.gkpStore | ") or caFailure("failed to read gatekeeper stats fromfrom '$wrk/$asm.gkpStore'", undef);
while (<F>) {
my ($junk1, $library, $junk2, $reads, $junk3, $junk4, $bases, $junk5, $average, $junk6, $min, $junk7, $max) = split '\s+', $_;
if ($library == 0) {
my $cov = ceil($bases / $gs);

if ($gs >= 100000000) {
$ovl = roundoff(($average - $minread + $ovl) * 0.65, 500);
print STDERR "-- Average read length $average (found $reads reads with $bases bases). Minimum overlap set to $ovl\n";
}
last;
}
}
close(F);
return $ovl;
}



sub reportUnitigSizes ($$$$) {
my $wrk = shift @_;
Expand Down Expand Up @@ -150,6 +192,7 @@ sub unitig ($$) {
my $perPart = int(getNumberOfReadsInStore($wrk, $asm) / getGlobal("cnsPartitions"));
my $minPart = getGlobal("cnsPartitionMin");
my $genomeCoverage = int(1.3 * getGenomeCoverage($wrk, $asm, getGlobal("utgOvlMerSize")));
my $overlapLength = estimateOverlapSize($wrk, $asm);

$perPart = ($perPart < $minPart) ? ($perPart) : ($minPart);

Expand All @@ -173,16 +216,17 @@ sub unitig ($$) {
print F " -T $wrk/$asm.tigStore.WORKING \\\n";
print F " -o $wrk/4-unitigger/$asm \\\n";
print F " -B $perPart \\\n";
print F " -gs " . getGlobal("genomeSize") . " \\\n";
print F " -eg " . getGlobal("utgGraphErrorRate") . " \\\n";
print F " -eb " . getGlobal("utgBubbleErrorRate") . " \\\n";
print F " -em " . getGlobal("utgMergeErrorRate") . " \\\n";
print F " -er " . getGlobal("utgRepeatErrorRate") . " \\\n";
print F " -threads " . getGlobal("batThreads") . " \\\n" if (defined(getGlobal("batThreads")));
print F " -M " . getGlobal("batMemory") . " \\\n" if (defined(getGlobal("batMemory")));
print F " " . getGlobal("batOptions") . " \\\n" if (defined(getGlobal("batOptions")));
print F " -unassembled " . getGlobal("contigFilter") . " \\\n" if (defined(getGlobal("contigFilter")));
print F " -repeatdetect 6 " . $genomeCoverage . " 15" . " \\\n" if (defined($genomeCoverage));
print F " -gs " . getGlobal("genomeSize") . " \\\n";
print F " -eg " . getGlobal("utgGraphErrorRate") . " \\\n";
print F " -eb " . getGlobal("utgBubbleErrorRate") . " \\\n";
print F " -em " . getGlobal("utgMergeErrorRate") . " \\\n";
print F " -er " . getGlobal("utgRepeatErrorRate") . " \\\n";
print F " -threads " . getGlobal("batThreads") . " \\\n" if (defined(getGlobal("batThreads")));
print F " -M " . getGlobal("batMemory") . " \\\n" if (defined(getGlobal("batMemory")));
print F " " . getGlobal("batOptions") . " \\\n" if (defined(getGlobal("batOptions")));
print F " -unassembled " . getGlobal("contigFilter") . " \\\n" if (defined(getGlobal("contigFilter")));
print F " -repeatdetect 6 " . $genomeCoverage . " 15" . " \\\n" if (defined($genomeCoverage));
print F " -el " . $overlapLength . " \\\n";
print F " > $wrk/4-unitigger/unitigger.err 2>&1 \\\n";
print F "&& \\\n";
print F "mv $wrk/$asm.tigStore.WORKING $wrk/$asm.tigStore.FINISHED\n";
Expand Down

0 comments on commit 0bcc712

Please sign in to comment.