Skip to content

Commit

Permalink
Add read length histograms to sqStoreDumpMetaData; support reporting …
Browse files Browse the repository at this point in the history
…metadata for only specific read types.
  • Loading branch information
brianwalenz committed Jul 15, 2019
1 parent c8f2c4b commit 730f95e
Show file tree
Hide file tree
Showing 5 changed files with 528 additions and 317 deletions.
21 changes: 15 additions & 6 deletions src/pipelines/canu/OverlapErrorAdjustment.pm
Original file line number Diff line number Diff line change
Expand Up @@ -74,22 +74,31 @@ sub loadReadLengths ($$$) {
print STDERR "--\n";
print STDERR "-- Loading read lengths.\n";

open(F, "$bin/sqStoreDumpMetaData -S ./$asm.seqStore -reads 2> /dev/null |");
# Dump corrected read lengths and trimming. Load the length into
# an array for later use.

open(F, "$bin/sqStoreDumpMetaData -S ./$asm.seqStore -corrected -reads 2> /dev/null |");
while (<F>) {
s/^\s+//;
s/\s+$//;

next if (m/^readID/); # Header line
next if (m/^------/); # ------ ----
next if (m/^----/); # ------ ----

my @v = split '\s+', $_;
my $l = 0;

if ($v[5] =~ m/CC--/) { # Corrected available, but no trimmed.
$l = $v[2];
}

next if ($v[8] eq "-");
next if ($v[8] eq "ignored");
if ($v[5] =~ m/CCTT/) { # Corrected and trimmed.
$l = $v[4] - $v[3];
}

vec($$rlVec, $v[0], 32) = $v[10] - $v[9];
vec($$rlVec, $v[0], 32) = $l;
$rlCnt += 1;
$rlSum += $v[10] - $v[9];
$rlSum += $l;
}
close(F);

Expand Down
108 changes: 43 additions & 65 deletions src/pipelines/canu/SequenceStore.pm
Original file line number Diff line number Diff line change
Expand Up @@ -331,32 +331,27 @@ sub generateReadLengthHistogram ($$) {
my @rl;
my @hi;

# Load the read lengths, find min and max lengths.
# Generate a lovely PNG histogram.

open(F, "$bin/sqStoreDumpMetaData -S ./$asm.seqStore -reads |") or caExit("can't dump meta data from './$asm.seqStore': $!", undef);
while (<F>) {
next if (m/readID/); # Skip the header lines.
next if (m/------/);

s/^\s+//;
s/\s+$//;

my @v = split '\s+', $_;
my $l = 0;

$l = $v[2] if (($tag eq "cor") || ($tag eq "hap"));
$l = $v[8] if (($tag eq "obt"));
$l = $v[10]-$v[9] if (($tag eq "utg") && ($v[10] ne "-") && ($v[9] ne "-"));
if (! -e "./$asm.seqStore/readlengths-$tag.dat") {
my $cmd;

push @rl, $l if (($l ne "-") && ($l > 0));
}
close(F);
$cmd = "$bin/sqStoreDumpMetaData \\\n";
$cmd .= " -S ./$asm.seqStore \\\n";
$cmd .= " -raw \\\n" if ($tag eq "cor");
$cmd .= " -corrected \\\n" if ($tag eq "obt");
$cmd .= " -corrected -trimmed \\\n" if ($tag eq "utg");
$cmd .= " -histogram " . getGlobal("genomeSize") . " \\\n";
$cmd .= " -lengths \\\n";
$cmd .= "> ./$asm.seqStore/readlengths-$tag.dat \\\n";
$cmd .= "2> ./$asm.seqStore/readlengths-$tag.err \n";

@rl = sort { $a <=> $b } @rl;
if (runCommand(".", $cmd) > 0) {
caExit("sqStoreDumpMetaData failed", "./$asm.seqStore/readlengths-$tag.err");
}

# Generate PNG histograms if there are any reads.
unlink "./$asm.seqStore/readlengths-$tag.err";

if ($reads > 0) {
my $gnuplot = getGlobal("gnuplot");
my $format = getGlobal("gnuplotImageFormat");

Expand All @@ -375,12 +370,6 @@ sub generateReadLengthHistogram ($$) {
print F "plot [] './$asm.seqStore/readlengths-$tag.dat' using (bin(\$1,binwidth)):(1.0) smooth freq with boxes title ''\n";
close(F);

open(F, "> ./$asm.seqStore/readlengths-$tag.dat") or caExit("can't open './$asm.seqStore/readlengths-$tag.dat' for writing: $!", undef);
foreach my $rl (@rl) {
print F "$rl\n";
}
close(F);

if (runCommandSilently(".", "$gnuplot < /dev/null ./$asm.seqStore/readlengths-$tag.gp > /dev/null 2>&1", 0)) {
print STDERR "--\n";
print STDERR "-- WARNING: gnuplot failed.\n";
Expand All @@ -390,54 +379,43 @@ sub generateReadLengthHistogram ($$) {
}
}

# Generate the ASCII histogram.

$hist = "--\n";
$hist .= "-- In sequence store './$asm.seqStore':\n";
$hist .= "-- Found $reads reads.\n";
$hist .= "-- Found $bases bases ($coverage times coverage).\n";

if ($reads > 0) {
my $minLen = $rl[ 0];
my $maxLen = $rl[-1];
my $scale = 0;
my $bucketSize = 0;

# Buckets of size 1000 are easy to interpret, but sometimes not ideal.

$bucketSize = 10000;
$bucketSize = 5000 if ($maxLen - $minLen < 1000000);
$bucketSize = 1000 if ($maxLen - $minLen < 100000);
$bucketSize = 100 if ($maxLen - $minLen < 10000);
# Generate a lovely ASCII histogram.

# Generate the histogram (int truncates)

my $mBgn = int($minLen / $bucketSize);
my $mEnd = int($maxLen / $bucketSize);

foreach my $rl (@rl) {
my $b = int($rl / $bucketSize);
if (! -e "./$asm.seqStore/readlengths-$tag.txt") {
my $cmd;

$hi[$b]++;
}
$cmd = "$bin/sqStoreDumpMetaData \\\n";
$cmd .= " -S ./$asm.seqStore \\\n";
$cmd .= " -raw \\\n" if ($tag eq "cor");
$cmd .= " -corrected \\\n" if ($tag eq "obt");
$cmd .= " -corrected -trimmed \\\n" if ($tag eq "utg");
$cmd .= " -histogram " . getGlobal("genomeSize") . " \\\n";
$cmd .= "> ./$asm.seqStore/readlengths-$tag.txt \\\n";
$cmd .= "2> ./$asm.seqStore/readlengths-$tag.err \n";

for (my $ii=$mBgn; $ii<=$mEnd; $ii++) { # Scale the *'s so that the longest has 70 of 'em
$scale = $hi[$ii] / 70 if ($scale < $hi[$ii] / 70);
if (runCommand(".", $cmd) > 0) {
caExit("sqStoreDumpMetaData failed", "./$asm.seqStore/readlengths-$tag.err");
}

# Draw the histogram.
unlink "./$asm.seqStore/readlengths-$tag.err";
}

$hist .= "--\n";
$hist .= "-- Read length histogram (one '*' equals " . int(100 * $scale) / 100 . " reads):\n";
# Read the ASCII histogram, append to report.

for (my $ii=$mBgn; $ii<=$mEnd; $ii++) {
my $s = $ii * $bucketSize;
my $e = $ii * $bucketSize + $bucketSize - 1;
$hist = "--\n";
$hist .= "-- In sequence store './$asm.seqStore':\n";
$hist .= "-- Found $reads reads.\n";
$hist .= "-- Found $bases bases ($coverage times coverage).\n";

$hi[$ii] += 0; # Otherwise, cells with no count print as null.
if (-e "./$asm.seqStore/readlengths-$tag.txt") {
#$hist .= "--\n";
#$hist .= "-- Read length histogram (one '*' equals " . int(100 * $scale) / 100 . " reads):\n";

$hist .= sprintf("-- %6d %6d %6d %s\n", $s, $e, $hi[$ii], "*" x int($hi[$ii] / $scale));
open(F, "< ./$asm.seqStore/readlengths-$tag.txt") or caExit("can't open './$asm.seqStore/readlengths-$tag.txt' for reading: $!", undef);
while (<F>) {
$hist .= "-- $_";
}
close(F);
}

# Abort if the read coverage is too low.
Expand Down
Loading

0 comments on commit 730f95e

Please sign in to comment.