From 730f95ea4bdea7de87aab0cb22698c6ba519b7ef Mon Sep 17 00:00:00 2001 From: "Brian P. Walenz" Date: Sun, 14 Jul 2019 23:15:33 -0400 Subject: [PATCH] Add read length histograms to sqStoreDumpMetaData; support reporting metadata for only specific read types. --- src/pipelines/canu/OverlapErrorAdjustment.pm | 21 +- src/pipelines/canu/SequenceStore.pm | 108 ++--- src/sequence/sequence-summarize.C | 408 ++++++++++--------- src/stores/sqStoreDumpMetaData.C | 304 ++++++++++++-- src/stores/sqStoreDumpMetaData.mk | 4 +- 5 files changed, 528 insertions(+), 317 deletions(-) diff --git a/src/pipelines/canu/OverlapErrorAdjustment.pm b/src/pipelines/canu/OverlapErrorAdjustment.pm index 25e3f825f..a3f9baf3d 100644 --- a/src/pipelines/canu/OverlapErrorAdjustment.pm +++ b/src/pipelines/canu/OverlapErrorAdjustment.pm @@ -74,22 +74,31 @@ sub loadReadLengths ($$$) { print STDERR "--\n"; print STDERR "-- Loading read lengths.\n"; - open(F, "$bin/sqStoreDumpMetaData -S ./$asm.seqStore -reads 2> /dev/null |"); + # Dump corrected read lengths and trimming. Load the length into + # an array for later use. + + open(F, "$bin/sqStoreDumpMetaData -S ./$asm.seqStore -corrected -reads 2> /dev/null |"); while () { s/^\s+//; s/\s+$//; next if (m/^readID/); # Header line - next if (m/^------/); # ------ ---- + next if (m/^----/); # ------ ---- my @v = split '\s+', $_; + my $l = 0; + + if ($v[5] =~ m/CC--/) { # Corrected available, but no trimmed. + $l = $v[2]; + } - next if ($v[8] eq "-"); - next if ($v[8] eq "ignored"); + if ($v[5] =~ m/CCTT/) { # Corrected and trimmed. + $l = $v[4] - $v[3]; + } - vec($$rlVec, $v[0], 32) = $v[10] - $v[9]; + vec($$rlVec, $v[0], 32) = $l; $rlCnt += 1; - $rlSum += $v[10] - $v[9]; + $rlSum += $l; } close(F); diff --git a/src/pipelines/canu/SequenceStore.pm b/src/pipelines/canu/SequenceStore.pm index 57f30d92c..9857d43b9 100644 --- a/src/pipelines/canu/SequenceStore.pm +++ b/src/pipelines/canu/SequenceStore.pm @@ -331,32 +331,27 @@ sub generateReadLengthHistogram ($$) { my @rl; my @hi; - # Load the read lengths, find min and max lengths. + # Generate a lovely PNG histogram. - open(F, "$bin/sqStoreDumpMetaData -S ./$asm.seqStore -reads |") or caExit("can't dump meta data from './$asm.seqStore': $!", undef); - while () { - next if (m/readID/); # Skip the header lines. - next if (m/------/); - - s/^\s+//; - s/\s+$//; - - my @v = split '\s+', $_; - my $l = 0; - - $l = $v[2] if (($tag eq "cor") || ($tag eq "hap")); - $l = $v[8] if (($tag eq "obt")); - $l = $v[10]-$v[9] if (($tag eq "utg") && ($v[10] ne "-") && ($v[9] ne "-")); + if (! -e "./$asm.seqStore/readlengths-$tag.dat") { + my $cmd; - push @rl, $l if (($l ne "-") && ($l > 0)); - } - close(F); + $cmd = "$bin/sqStoreDumpMetaData \\\n"; + $cmd .= " -S ./$asm.seqStore \\\n"; + $cmd .= " -raw \\\n" if ($tag eq "cor"); + $cmd .= " -corrected \\\n" if ($tag eq "obt"); + $cmd .= " -corrected -trimmed \\\n" if ($tag eq "utg"); + $cmd .= " -histogram " . getGlobal("genomeSize") . " \\\n"; + $cmd .= " -lengths \\\n"; + $cmd .= "> ./$asm.seqStore/readlengths-$tag.dat \\\n"; + $cmd .= "2> ./$asm.seqStore/readlengths-$tag.err \n"; - @rl = sort { $a <=> $b } @rl; + if (runCommand(".", $cmd) > 0) { + caExit("sqStoreDumpMetaData failed", "./$asm.seqStore/readlengths-$tag.err"); + } - # Generate PNG histograms if there are any reads. + unlink "./$asm.seqStore/readlengths-$tag.err"; - if ($reads > 0) { my $gnuplot = getGlobal("gnuplot"); my $format = getGlobal("gnuplotImageFormat"); @@ -375,12 +370,6 @@ sub generateReadLengthHistogram ($$) { print F "plot [] './$asm.seqStore/readlengths-$tag.dat' using (bin(\$1,binwidth)):(1.0) smooth freq with boxes title ''\n"; close(F); - open(F, "> ./$asm.seqStore/readlengths-$tag.dat") or caExit("can't open './$asm.seqStore/readlengths-$tag.dat' for writing: $!", undef); - foreach my $rl (@rl) { - print F "$rl\n"; - } - close(F); - if (runCommandSilently(".", "$gnuplot < /dev/null ./$asm.seqStore/readlengths-$tag.gp > /dev/null 2>&1", 0)) { print STDERR "--\n"; print STDERR "-- WARNING: gnuplot failed.\n"; @@ -390,54 +379,43 @@ sub generateReadLengthHistogram ($$) { } } - # Generate the ASCII histogram. - - $hist = "--\n"; - $hist .= "-- In sequence store './$asm.seqStore':\n"; - $hist .= "-- Found $reads reads.\n"; - $hist .= "-- Found $bases bases ($coverage times coverage).\n"; - - if ($reads > 0) { - my $minLen = $rl[ 0]; - my $maxLen = $rl[-1]; - my $scale = 0; - my $bucketSize = 0; - - # Buckets of size 1000 are easy to interpret, but sometimes not ideal. - - $bucketSize = 10000; - $bucketSize = 5000 if ($maxLen - $minLen < 1000000); - $bucketSize = 1000 if ($maxLen - $minLen < 100000); - $bucketSize = 100 if ($maxLen - $minLen < 10000); + # Generate a lovely ASCII histogram. - # Generate the histogram (int truncates) - - my $mBgn = int($minLen / $bucketSize); - my $mEnd = int($maxLen / $bucketSize); - - foreach my $rl (@rl) { - my $b = int($rl / $bucketSize); + if (! -e "./$asm.seqStore/readlengths-$tag.txt") { + my $cmd; - $hi[$b]++; - } + $cmd = "$bin/sqStoreDumpMetaData \\\n"; + $cmd .= " -S ./$asm.seqStore \\\n"; + $cmd .= " -raw \\\n" if ($tag eq "cor"); + $cmd .= " -corrected \\\n" if ($tag eq "obt"); + $cmd .= " -corrected -trimmed \\\n" if ($tag eq "utg"); + $cmd .= " -histogram " . getGlobal("genomeSize") . " \\\n"; + $cmd .= "> ./$asm.seqStore/readlengths-$tag.txt \\\n"; + $cmd .= "2> ./$asm.seqStore/readlengths-$tag.err \n"; - for (my $ii=$mBgn; $ii<=$mEnd; $ii++) { # Scale the *'s so that the longest has 70 of 'em - $scale = $hi[$ii] / 70 if ($scale < $hi[$ii] / 70); + if (runCommand(".", $cmd) > 0) { + caExit("sqStoreDumpMetaData failed", "./$asm.seqStore/readlengths-$tag.err"); } - # Draw the histogram. + unlink "./$asm.seqStore/readlengths-$tag.err"; + } - $hist .= "--\n"; - $hist .= "-- Read length histogram (one '*' equals " . int(100 * $scale) / 100 . " reads):\n"; + # Read the ASCII histogram, append to report. - for (my $ii=$mBgn; $ii<=$mEnd; $ii++) { - my $s = $ii * $bucketSize; - my $e = $ii * $bucketSize + $bucketSize - 1; + $hist = "--\n"; + $hist .= "-- In sequence store './$asm.seqStore':\n"; + $hist .= "-- Found $reads reads.\n"; + $hist .= "-- Found $bases bases ($coverage times coverage).\n"; - $hi[$ii] += 0; # Otherwise, cells with no count print as null. + if (-e "./$asm.seqStore/readlengths-$tag.txt") { + #$hist .= "--\n"; + #$hist .= "-- Read length histogram (one '*' equals " . int(100 * $scale) / 100 . " reads):\n"; - $hist .= sprintf("-- %6d %6d %6d %s\n", $s, $e, $hi[$ii], "*" x int($hi[$ii] / $scale)); + open(F, "< ./$asm.seqStore/readlengths-$tag.txt") or caExit("can't open './$asm.seqStore/readlengths-$tag.txt' for reading: $!", undef); + while () { + $hist .= "-- $_"; } + close(F); } # Abort if the read coverage is too low. diff --git a/src/sequence/sequence-summarize.C b/src/sequence/sequence-summarize.C index 35732510f..338d6c9ab 100644 --- a/src/sequence/sequence-summarize.C +++ b/src/sequence/sequence-summarize.C @@ -90,173 +90,58 @@ doSummarize_loadSequence(dnaSeqFile *sf, void -doSummarize(vector &inputs, - summarizeParameters &sumPar) { - - vector lengths; - - uint64 nSeqs = 0; - uint64 nBases = 0; - - uint32 mer = 0; - - uint64 mn[4] = {0}; - uint64 dn[4*4] = {0}; - uint64 tn[4*4*4] = {0}; - - double nmn = 0; - double ndn = 0; - double ntn = 0; - - uint32 nameMax = 0; - char *name = NULL; - uint64 seqMax = 0; - char *seq = NULL; - uint8 *qlt = NULL; - uint64 seqLen = 0; - - for (uint32 ff=0; ff lengths) { + uint64 bgn = 0; + uint64 end = 0; - while (doSummarize_loadSequence(sf, sumPar.asSequences, name, nameMax, seq, qlt, seqMax, seqLen) == true) { - uint64 pos = 0; - uint64 bgn = 0; + sort(lengths.begin(), lengths.end(), less()); - if (pos < seqLen) { - mer = ((mer << 2) | ((seq[pos++] >> 1) & 0x03)) & 0x3f; - mn[mer & 0x03]++; - } + while (bgn < lengths.size()) { + end = bgn + 1; - if (pos < seqLen) { - mer = ((mer << 2) | ((seq[pos++] >> 1) & 0x03)) & 0x3f; - mn[mer & 0x03]++; - dn[mer & 0x0f]++; - } + while ((end < lengths.size()) && + (lengths[bgn] == lengths[end])) + end++; - while (pos < seqLen) { - mer = ((mer << 2) | ((seq[pos++] >> 1) & 0x03)) & 0x3f; - mn[mer & 0x03]++; - dn[mer & 0x0f]++; - tn[mer & 0x3f]++; - } - - nmn += (seqLen-0); - ndn += (seqLen < 2) ? 0 : (seqLen-1); - ntn += (seqLen < 3) ? 0 : (seqLen-2); - - // If we're NOT splitting on N, add one sequence of the given length. - - if (sumPar.breakAtN == false) { - nSeqs += 1; - nBases += seqLen; - - lengths.push_back(seqLen); - continue; - } + fprintf(stdout, "%lu\t%lu\n", lengths[bgn], end - bgn); - // But if we ARE splitting on N, add multiple sequences. - - pos = 0; - bgn = 0; - - while (pos < seqLen) { - - // Skip any N's. - while ((pos < seqLen) && ((seq[pos] == 'n') || - (seq[pos] == 'N'))) - pos++; - - // Remember our start position. - bgn = pos; - - // Move ahead until the end of sequence or an N. - while ((pos < seqLen) && ((seq[pos] != 'n') && - (seq[pos] != 'N'))) - pos++; - - // If a sequence, increment stuff. - if (pos - bgn > 0) { - nSeqs += 1; - nBases += pos - bgn; - - lengths.push_back(pos - bgn); - } - } - } - - // All done! - - delete sf; + bgn = end; } +} - delete [] name; - delete [] seq; - delete [] qlt; - - // Finalize. - - sort(lengths.begin(), lengths.end(), greater()); - - if (sumPar.genomeSize == 0) - sumPar.genomeSize = nBases; - - // If only a simple histogram of lengths is requested, dump and done. - - if (sumPar.asSimple == true) { - uint64 bgn = 0; - uint64 end = 0; - - reverse(lengths.begin(), lengths.end()); - - while (bgn < lengths.size()) { - end = bgn + 1; - - while ((end < lengths.size()) && - (lengths[bgn] == lengths[end])) - end++; - - fprintf(stdout, "%lu\t%lu\n", lengths[bgn], end - bgn); - bgn = end; - } - return; - } +void +doSummarize_dumpLengths(vector lengths) { - // If only the read lengths are requested, dump and done. + sort(lengths.begin(), lengths.end(), less()); - if (sumPar.asLength == true) { - reverse(lengths.begin(), lengths.end()); + for (uint64 ii=0; ii lengths, + uint64 genomeSize, + bool limitTo1x) { - uint64 lSum = 0; // Sum of the lengths we've encountered so far + sort(lengths.begin(), lengths.end(), greater()); - uint32 nStep = 10; // Step of each N report. - uint32 nVal = nStep; // Index of the threshold we're next printing. - uint64 nThr = sumPar.genomeSize * nVal / 100; // Threshold lenth; if sum is bigger, emit and move to the next threshold + uint32 nCols = 63; // Magic number to make the histogram the same width as the trinucleotide list + uint32 nRows = 0; // Height of the histogram; dynamically set. + uint32 nRowsMin = 50; // Nothing really magic, just fits on the screen. - //////////////////////////////////////// + uint64 lSum = 0; // Sum of the lengths we've encountered so far - uint32 nCols = 63; // Magic number to make the histogram the same width as the trinucleotide list - uint32 nRows = 0; // Height of the histogram; dynamically set. - uint32 nRowsMin = 50; // Nothing really magic, just fits on the screen. + uint32 nStep = 10; // Step of each N report. + uint32 nVal = nStep; // Index of the threshold we're next printing. + uint64 nThr = genomeSize * nVal / 100; // Threshold lenth; if sum is bigger, emit and move to the next threshold // Count the number of lines we expect to get in the NG table. - for (uint32 ii=0; ii= nThr) { @@ -267,11 +152,11 @@ doSummarize(vector &inputs, else if (nVal < 20000) nVal += nStep * 100; else nVal += nStep * 1000; - nThr = sumPar.genomeSize * nVal / 100; + nThr = genomeSize * nVal / 100; } } - uint64 minLength = lengths[nSeqs-1]; + uint64 minLength = lengths[lengths.size()-1]; uint64 maxLength = lengths[0]; if (nRows < nRowsMin) // If there are too few lines, make it some minimum value, @@ -288,10 +173,10 @@ doSummarize(vector &inputs, if (nRows > nRowsMin) nRows = nRowsMin; - lSum = 0; // Reset for actually generating the length histogram. + lSum = 0; // Reset for actually generating the length histogram. nStep = 10; nVal = nStep; - nThr = sumPar.genomeSize * nVal / 100; + nThr = genomeSize * nVal / 100; // Generate the length histogram. @@ -300,7 +185,7 @@ doSummarize(vector &inputs, for (uint32 rr=0; rr &inputs, histPlot[rr][28 + nn] = 0; } - // If exact...... - - - - // Output N table, with length histogram appended at the end of each line. uint32 hp = 0; fprintf(stdout, "\n"); - fprintf(stdout, "G=%-12" F_U64P " sum of || length num\n", sumPar.genomeSize); + fprintf(stdout, "G=%-12" F_U64P " sum of || length num\n", genomeSize); fprintf(stdout, "NG length index lengths || range seqs\n"); fprintf(stdout, "----- ------------ --------- ------------ || ------------------- -------\n"); // Write lines if we're showing all data, or if we're below 1x coverage. - for (uint32 ii=0; ii= nThr) { - if ((sumPar.limitTo1x == false) || + if ((limitTo1x == false) || (nVal <= 100)) { if (hp <= nRows) fprintf(stdout, "%05" F_U32P " %12" F_U64P " %9" F_U32P " %12" F_U64P " || %s\n", @@ -371,13 +251,13 @@ doSummarize(vector &inputs, else if (nVal < 20000) nVal += nStep * 100; else nVal += nStep * 1000; - nThr = sumPar.genomeSize * nVal / 100; + nThr = genomeSize * nVal / 100; } } // If we're displaying exactly 1x, write empty lines to get to there. - if (sumPar.limitTo1x == true) { + if (limitTo1x == true) { while (nVal <= 100) { if (hp <= nRows) fprintf(stdout, "%05" F_U32P " %12s %9s %12s || %s\n", @@ -393,52 +273,18 @@ doSummarize(vector &inputs, // Now the final summary line. - if (sumPar.genomeSize == 0) - fprintf(stdout, "%07.3fx %9" F_U64P " %12" F_U64P " || %s\n", 0.0, nSeqs, lSum, histPlot[hp++]); // Occurs if only empty sequences in the input! + if (genomeSize == 0) + fprintf(stdout, "%07.3fx %9" F_U64P " %12" F_U64P " || %s\n", 0.0, lengths.size(), lSum, histPlot[hp++]); // Occurs if only empty sequences in the input! else if (hp <= nRows) - fprintf(stdout, "%07.3fx %9" F_U64P " %12" F_U64P " || %s\n", (double)lSum / sumPar.genomeSize, nSeqs, lSum, histPlot[hp++]); + fprintf(stdout, "%07.3fx %9" F_U64P " %12" F_U64P " || %s\n", (double)lSum / genomeSize, lengths.size(), lSum, histPlot[hp++]); else - fprintf(stdout, "%07.3fx %9" F_U64P " %12" F_U64P " ||\n", (double)lSum / sumPar.genomeSize, nSeqs, lSum); - - //fprintf(stdout, " || %s\n", histPlot[hp++]); - //fprintf(stdout, " genome-size %12" F_U64P " || %s\n", sumPar.genomeSize, histPlot[hp++]); + fprintf(stdout, "%07.3fx %9" F_U64P " %12" F_U64P " ||\n", (double)lSum / genomeSize, lengths.size(), lSum); while (hp <= nRows) fprintf(stdout, " || %s\n", histPlot[hp++]); fprintf(stdout, "\n"); - // Report. To get alphabetic ordering correct, hardcoded. - -#define FMT "%12" F_U64P " %6.4f" -#define GC "%05.02f%%" - - if (nmn == 0) nmn = 1; // Avoid divide by zero. - if (ndn == 0) ndn = 1; - if (ntn == 0) ntn = 1; - - double gc = 100.0 * (mn[0x01] + mn[0x03]) / (mn[0x00] + mn[0x01] + mn[0x03] + mn[0x02]); - - fprintf(stdout, "--------------------- --------------------- ----------------------------------------------------------------------------------------------\n"); - fprintf(stdout, " mononucleotide dinucleotide trinucleotide\n"); - fprintf(stdout, "--------------------- --------------------- ----------------------------------------------------------------------------------------------\n"); - fprintf(stdout, "" FMT " A" FMT " AA" FMT " AAA " FMT " AAC " FMT " AAG " FMT " AAT\n", mn[0x00], mn[0x00] / nmn, dn[0x00], dn[0x00] / ndn, tn[0x00], tn[0x00] / ntn, tn[0x01], tn[0x01] / ntn, tn[0x03], tn[0x03] / ntn, tn[0x02], tn[0x02] / ntn); - fprintf(stdout, "" FMT " C" FMT " AC" FMT " ACA " FMT " ACC " FMT " ACG " FMT " ACT\n", mn[0x01], mn[0x01] / nmn, dn[0x01], dn[0x01] / ndn, tn[0x04], tn[0x04] / ntn, tn[0x05], tn[0x05] / ntn, tn[0x07], tn[0x07] / ntn, tn[0x06], tn[0x06] / ntn); - fprintf(stdout, "" FMT " G" FMT " AG" FMT " AGA " FMT " AGC " FMT " AGG " FMT " AGT\n", mn[0x03], mn[0x03] / nmn, dn[0x03], dn[0x03] / ndn, tn[0x0c], tn[0x0c] / ntn, tn[0x0d], tn[0x0d] / ntn, tn[0x0f], tn[0x0f] / ntn, tn[0x0e], tn[0x0e] / ntn); - fprintf(stdout, "" FMT " T" FMT " AT" FMT " ATA " FMT " ATC " FMT " ATG " FMT " ATT\n", mn[0x02], mn[0x02] / nmn, dn[0x02], dn[0x02] / ndn, tn[0x08], tn[0x08] / ntn, tn[0x09], tn[0x09] / ntn, tn[0x0b], tn[0x0b] / ntn, tn[0x0a], tn[0x0a] / ntn); - fprintf(stdout, " " FMT " CA" FMT " CAA " FMT " CAC " FMT " CAG " FMT " CAT\n", dn[0x04], dn[0x04] / ndn, tn[0x10], tn[0x10] / ntn, tn[0x11], tn[0x11] / ntn, tn[0x13], tn[0x13] / ntn, tn[0x12], tn[0x12] / ntn); - fprintf(stdout, " --GC-- --AT-- " FMT " CC" FMT " CCA " FMT " CCC " FMT " CCG " FMT " CCT\n", dn[0x05], dn[0x05] / ndn, tn[0x14], tn[0x14] / ntn, tn[0x15], tn[0x15] / ntn, tn[0x17], tn[0x17] / ntn, tn[0x16], tn[0x16] / ntn); - fprintf(stdout, " " GC " " GC " " FMT " CG" FMT " CGA " FMT " CGC " FMT " CGG " FMT " CGT\n", gc, 100 - gc, dn[0x07], dn[0x07] / ndn, tn[0x1c], tn[0x1c] / ntn, tn[0x1d], tn[0x1d] / ntn, tn[0x1f], tn[0x1f] / ntn, tn[0x1e], tn[0x1e] / ntn); - fprintf(stdout, " " FMT " CT" FMT " CTA " FMT " CTC " FMT " CTG " FMT " CTT\n", dn[0x06], dn[0x06] / ndn, tn[0x18], tn[0x18] / ntn, tn[0x19], tn[0x19] / ntn, tn[0x1b], tn[0x1b] / ntn, tn[0x1a], tn[0x1a] / ntn); - fprintf(stdout, " " FMT " GA" FMT " GAA " FMT " GAC " FMT " GAG " FMT " GAT\n", dn[0x0c], dn[0x0c] / ndn, tn[0x30], tn[0x30] / ntn, tn[0x31], tn[0x31] / ntn, tn[0x33], tn[0x33] / ntn, tn[0x32], tn[0x32] / ntn); - fprintf(stdout, " " FMT " GC" FMT " GCA " FMT " GCC " FMT " GCG " FMT " GCT\n", dn[0x0d], dn[0x0d] / ndn, tn[0x34], tn[0x34] / ntn, tn[0x35], tn[0x35] / ntn, tn[0x37], tn[0x37] / ntn, tn[0x36], tn[0x36] / ntn); - fprintf(stdout, " " FMT " GG" FMT " GGA " FMT " GGC " FMT " GGG " FMT " GGT\n", dn[0x0f], dn[0x0f] / ndn, tn[0x3c], tn[0x3c] / ntn, tn[0x3d], tn[0x3d] / ntn, tn[0x3f], tn[0x3f] / ntn, tn[0x3e], tn[0x3e] / ntn); - fprintf(stdout, " " FMT " GT" FMT " GTA " FMT " GTC " FMT " GTG " FMT " GTT\n", dn[0x0e], dn[0x0e] / ndn, tn[0x38], tn[0x38] / ntn, tn[0x39], tn[0x39] / ntn, tn[0x3b], tn[0x3b] / ntn, tn[0x3a], tn[0x3a] / ntn); - fprintf(stdout, " " FMT " TA" FMT " TAA " FMT " TAC " FMT " TAG " FMT " TAT\n", dn[0x08], dn[0x08] / ndn, tn[0x20], tn[0x20] / ntn, tn[0x21], tn[0x21] / ntn, tn[0x23], tn[0x23] / ntn, tn[0x22], tn[0x22] / ntn); - fprintf(stdout, " " FMT " TC" FMT " TCA " FMT " TCC " FMT " TCG " FMT " TCT\n", dn[0x09], dn[0x09] / ndn, tn[0x24], tn[0x24] / ntn, tn[0x25], tn[0x25] / ntn, tn[0x27], tn[0x27] / ntn, tn[0x26], tn[0x26] / ntn); - fprintf(stdout, " " FMT " TG" FMT " TGA " FMT " TGC " FMT " TGG " FMT " TGT\n", dn[0x0b], dn[0x0b] / ndn, tn[0x2c], tn[0x2c] / ntn, tn[0x2d], tn[0x2d] / ntn, tn[0x2f], tn[0x2f] / ntn, tn[0x2e], tn[0x2e] / ntn); - fprintf(stdout, " " FMT " TT" FMT " TTA " FMT " TTC " FMT " TTG " FMT " TTT\n", dn[0x0a], dn[0x0a] / ndn, tn[0x28], tn[0x28] / ntn, tn[0x29], tn[0x29] / ntn, tn[0x2b], tn[0x2b] / ntn, tn[0x2a], tn[0x2a] / ntn); - fprintf(stdout, "\n"); // Cleanup. @@ -447,6 +293,170 @@ doSummarize(vector &inputs, delete [] histPlot; delete [] nSeqPerLen; +} + + + +void +doSummarize(vector &inputs, + summarizeParameters &sumPar) { + + vector lengths; + + uint64 nSeqs = 0; + uint64 nBases = 0; + + uint32 mer = 0; + + uint64 mn[4] = {0}; + uint64 dn[4*4] = {0}; + uint64 tn[4*4*4] = {0}; + + double nmn = 0; + double ndn = 0; + double ntn = 0; + + uint32 nameMax = 0; + char *name = NULL; + uint64 seqMax = 0; + char *seq = NULL; + uint8 *qlt = NULL; + uint64 seqLen = 0; + + for (uint32 ff=0; ff> 1) & 0x03)) & 0x3f; + mn[mer & 0x03]++; + } + + if (pos < seqLen) { + mer = ((mer << 2) | ((seq[pos++] >> 1) & 0x03)) & 0x3f; + mn[mer & 0x03]++; + dn[mer & 0x0f]++; + } + + while (pos < seqLen) { + mer = ((mer << 2) | ((seq[pos++] >> 1) & 0x03)) & 0x3f; + mn[mer & 0x03]++; + dn[mer & 0x0f]++; + tn[mer & 0x3f]++; + } + + nmn += (seqLen-0); + ndn += (seqLen < 2) ? 0 : (seqLen-1); + ntn += (seqLen < 3) ? 0 : (seqLen-2); + // If we're NOT splitting on N, add one sequence of the given length. + + if (sumPar.breakAtN == false) { + nSeqs += 1; + nBases += seqLen; + + lengths.push_back(seqLen); + continue; + } + + // But if we ARE splitting on N, add multiple sequences. + + pos = 0; + bgn = 0; + + while (pos < seqLen) { + + // Skip any N's. + while ((pos < seqLen) && ((seq[pos] == 'n') || + (seq[pos] == 'N'))) + pos++; + + // Remember our start position. + bgn = pos; + + // Move ahead until the end of sequence or an N. + while ((pos < seqLen) && ((seq[pos] != 'n') && + (seq[pos] != 'N'))) + pos++; + + // If a sequence, increment stuff. + if (pos - bgn > 0) { + nSeqs += 1; + nBases += pos - bgn; + + lengths.push_back(pos - bgn); + } + } + } + + // All done! + + delete sf; + } + + delete [] name; + delete [] seq; + delete [] qlt; + + if (sumPar.genomeSize == 0) // If no genome size supplied, set it to the sum of lengths. + sumPar.genomeSize = nBases; + + // If only a simple histogram of lengths is requested, dump and done. + + if (sumPar.asSimple == true) { + doSummarize_lengthHistogramSimple(lengths); + } + + // If only the read lengths are requested, dump and done. + + else if (sumPar.asLength == true) { + doSummarize_dumpLengths(lengths); + } + + // Otherwise, generate a fancy histogram plot. + // And finish with the mono-, di- and tri-nucleotide frequencies. + +#define FMT "%12" F_U64P " %6.4f" +#define GC "%05.02f%%" + + else { + doSummarize_lengthHistogram(lengths, sumPar.genomeSize, sumPar.limitTo1x); + + if (nmn == 0) nmn = 1; // Avoid divide by zero. + if (ndn == 0) ndn = 1; + if (ntn == 0) ntn = 1; + + double gc = 100.0 * (mn[0x01] + mn[0x03]) / (mn[0x00] + mn[0x01] + mn[0x03] + mn[0x02]); + + fprintf(stdout, "--------------------- --------------------- ----------------------------------------------------------------------------------------------\n"); + fprintf(stdout, " mononucleotide dinucleotide trinucleotide\n"); + fprintf(stdout, "--------------------- --------------------- ----------------------------------------------------------------------------------------------\n"); + fprintf(stdout, "" FMT " A" FMT " AA" FMT " AAA " FMT " AAC " FMT " AAG " FMT " AAT\n", mn[0x00], mn[0x00] / nmn, dn[0x00], dn[0x00] / ndn, tn[0x00], tn[0x00] / ntn, tn[0x01], tn[0x01] / ntn, tn[0x03], tn[0x03] / ntn, tn[0x02], tn[0x02] / ntn); + fprintf(stdout, "" FMT " C" FMT " AC" FMT " ACA " FMT " ACC " FMT " ACG " FMT " ACT\n", mn[0x01], mn[0x01] / nmn, dn[0x01], dn[0x01] / ndn, tn[0x04], tn[0x04] / ntn, tn[0x05], tn[0x05] / ntn, tn[0x07], tn[0x07] / ntn, tn[0x06], tn[0x06] / ntn); + fprintf(stdout, "" FMT " G" FMT " AG" FMT " AGA " FMT " AGC " FMT " AGG " FMT " AGT\n", mn[0x03], mn[0x03] / nmn, dn[0x03], dn[0x03] / ndn, tn[0x0c], tn[0x0c] / ntn, tn[0x0d], tn[0x0d] / ntn, tn[0x0f], tn[0x0f] / ntn, tn[0x0e], tn[0x0e] / ntn); + fprintf(stdout, "" FMT " T" FMT " AT" FMT " ATA " FMT " ATC " FMT " ATG " FMT " ATT\n", mn[0x02], mn[0x02] / nmn, dn[0x02], dn[0x02] / ndn, tn[0x08], tn[0x08] / ntn, tn[0x09], tn[0x09] / ntn, tn[0x0b], tn[0x0b] / ntn, tn[0x0a], tn[0x0a] / ntn); + fprintf(stdout, " " FMT " CA" FMT " CAA " FMT " CAC " FMT " CAG " FMT " CAT\n", dn[0x04], dn[0x04] / ndn, tn[0x10], tn[0x10] / ntn, tn[0x11], tn[0x11] / ntn, tn[0x13], tn[0x13] / ntn, tn[0x12], tn[0x12] / ntn); + fprintf(stdout, " --GC-- --AT-- " FMT " CC" FMT " CCA " FMT " CCC " FMT " CCG " FMT " CCT\n", dn[0x05], dn[0x05] / ndn, tn[0x14], tn[0x14] / ntn, tn[0x15], tn[0x15] / ntn, tn[0x17], tn[0x17] / ntn, tn[0x16], tn[0x16] / ntn); + fprintf(stdout, " " GC " " GC " " FMT " CG" FMT " CGA " FMT " CGC " FMT " CGG " FMT " CGT\n", gc, 100 - gc, dn[0x07], dn[0x07] / ndn, tn[0x1c], tn[0x1c] / ntn, tn[0x1d], tn[0x1d] / ntn, tn[0x1f], tn[0x1f] / ntn, tn[0x1e], tn[0x1e] / ntn); + fprintf(stdout, " " FMT " CT" FMT " CTA " FMT " CTC " FMT " CTG " FMT " CTT\n", dn[0x06], dn[0x06] / ndn, tn[0x18], tn[0x18] / ntn, tn[0x19], tn[0x19] / ntn, tn[0x1b], tn[0x1b] / ntn, tn[0x1a], tn[0x1a] / ntn); + fprintf(stdout, " " FMT " GA" FMT " GAA " FMT " GAC " FMT " GAG " FMT " GAT\n", dn[0x0c], dn[0x0c] / ndn, tn[0x30], tn[0x30] / ntn, tn[0x31], tn[0x31] / ntn, tn[0x33], tn[0x33] / ntn, tn[0x32], tn[0x32] / ntn); + fprintf(stdout, " " FMT " GC" FMT " GCA " FMT " GCC " FMT " GCG " FMT " GCT\n", dn[0x0d], dn[0x0d] / ndn, tn[0x34], tn[0x34] / ntn, tn[0x35], tn[0x35] / ntn, tn[0x37], tn[0x37] / ntn, tn[0x36], tn[0x36] / ntn); + fprintf(stdout, " " FMT " GG" FMT " GGA " FMT " GGC " FMT " GGG " FMT " GGT\n", dn[0x0f], dn[0x0f] / ndn, tn[0x3c], tn[0x3c] / ntn, tn[0x3d], tn[0x3d] / ntn, tn[0x3f], tn[0x3f] / ntn, tn[0x3e], tn[0x3e] / ntn); + fprintf(stdout, " " FMT " GT" FMT " GTA " FMT " GTC " FMT " GTG " FMT " GTT\n", dn[0x0e], dn[0x0e] / ndn, tn[0x38], tn[0x38] / ntn, tn[0x39], tn[0x39] / ntn, tn[0x3b], tn[0x3b] / ntn, tn[0x3a], tn[0x3a] / ntn); + fprintf(stdout, " " FMT " TA" FMT " TAA " FMT " TAC " FMT " TAG " FMT " TAT\n", dn[0x08], dn[0x08] / ndn, tn[0x20], tn[0x20] / ntn, tn[0x21], tn[0x21] / ntn, tn[0x23], tn[0x23] / ntn, tn[0x22], tn[0x22] / ntn); + fprintf(stdout, " " FMT " TC" FMT " TCA " FMT " TCC " FMT " TCG " FMT " TCT\n", dn[0x09], dn[0x09] / ndn, tn[0x24], tn[0x24] / ntn, tn[0x25], tn[0x25] / ntn, tn[0x27], tn[0x27] / ntn, tn[0x26], tn[0x26] / ntn); + fprintf(stdout, " " FMT " TG" FMT " TGA " FMT " TGC " FMT " TGG " FMT " TGT\n", dn[0x0b], dn[0x0b] / ndn, tn[0x2c], tn[0x2c] / ntn, tn[0x2d], tn[0x2d] / ntn, tn[0x2f], tn[0x2f] / ntn, tn[0x2e], tn[0x2e] / ntn); + fprintf(stdout, " " FMT " TT" FMT " TTA " FMT " TTC " FMT " TTG " FMT " TTT\n", dn[0x0a], dn[0x0a] / ndn, tn[0x28], tn[0x28] / ntn, tn[0x29], tn[0x29] / ntn, tn[0x2b], tn[0x2b] / ntn, tn[0x2a], tn[0x2a] / ntn); + fprintf(stdout, "\n"); + } } diff --git a/src/stores/sqStoreDumpMetaData.C b/src/stores/sqStoreDumpMetaData.C index ca29c2524..195a19425 100644 --- a/src/stores/sqStoreDumpMetaData.C +++ b/src/stores/sqStoreDumpMetaData.C @@ -65,60 +65,217 @@ dumpLibs(sqStore *seq, uint32 bgnID, uint32 endID) { void +dumpReads_printHeader(sqRead_which w) { + char h1[1024] = {0}; + char h2[1024] = {0}; + char h3[1024] = {0}; + + if ((w == sqRead_unset) || (((w & sqRead_raw) != 0) && ((w & sqRead_compressed) == 0))) { + strcat(h1, " --------NORMAL RAW READS--------"); + strcat(h2, " seqLen clearBgn clearEnd"); + strcat(h3, " ---------- ---------- ----------"); + } + + if ((w == sqRead_unset) || (((w & sqRead_raw) != 0) && ((w & sqRead_compressed) != 0))) { + strcat(h1, " ------COMPRESSED RAW READS------"); + strcat(h2, " seqLen clearBgn clearEnd"); + strcat(h3, " ---------- ---------- ----------"); + } + + if ((w == sqRead_unset) || (((w & sqRead_corrected) != 0) && ((w & sqRead_compressed) == 0))) { + strcat(h1, " -----NORMAL CORRECTED READS-----"); + strcat(h2, " seqLen clearBgn clearEnd"); + strcat(h3, " ---------- ---------- ----------"); + } + + if ((w == sqRead_unset) || (((w & sqRead_corrected) != 0) && ((w & sqRead_compressed) != 0))) { + strcat(h1, " ---COMPRESSED CORRECTED READS---"); + strcat(h2, " seqLen clearBgn clearEnd"); + strcat(h3, " ---------- ---------- ----------"); + } + + fprintf(stdout, " %s \n", h1); + fprintf(stdout, " readID libraryID%s flags blob position\n", h2); + fprintf(stdout, "---------- ----------%s -------- ---- ------------\n", h3); +} + + + +bool dumpReads_setClearString(sqStore *seqs, uint32 rid, char *len, char *bgn, char *end, sqRead_which w) { if (seqs->sqStore_isValidRead(rid, w) == false) - memcpy(len, " -", sizeof(char) * 10); + memcpy(len, " -", sizeof(char) * 11); else if (seqs->sqStore_isIgnoredRead(rid, w) == true) - memcpy(len, " ignored", sizeof(char) * 10); + memcpy(len, " ignored", sizeof(char) * 11); else - snprintf(len, 11, "%10" F_U32P, seqs->sqStore_getReadLength(rid, w)); + snprintf(len, 12, " %10" F_U32P, seqs->sqStore_getReadLength(rid, w)); assert((w & sqRead_trimmed) == sqRead_unset); // Otherwise, length above is trimmed length! if (seqs->sqStore_isTrimmedRead(rid, w) == true) { - snprintf(bgn, 11, "%10" F_U32P, seqs->sqStore_getClearBgn(rid, w)); - snprintf(end, 11, "%10" F_U32P, seqs->sqStore_getClearEnd(rid, w)); + snprintf(bgn, 12, " %10" F_U32P, seqs->sqStore_getClearBgn(rid, w)); + snprintf(end, 12, " %10" F_U32P, seqs->sqStore_getClearEnd(rid, w)); } else { - memcpy(bgn, " -", sizeof(char) * 10); - memcpy(end, " -", sizeof(char) * 10); + memcpy(bgn, " -", sizeof(char) * 11); + memcpy(end, " -", sizeof(char) * 11); } - len[10] = 0; - bgn[10] = 0; - end[10] = 0; + len[11] = 0; + bgn[11] = 0; + end[11] = 0; + + return(seqs->sqStore_isValidRead(rid, w)); } void -dumpReads(sqStore *seqs, uint32 bgnID, uint32 endID) { +dumpReads_setFlagsString(sqStore *seqs, uint32 rid, char *flags) { + bool rv = seqs->sqStore_isValidRead (rid, sqRead_raw); + bool rt = seqs->sqStore_isTrimmedRead(rid, sqRead_raw); + bool cv = seqs->sqStore_isValidRead (rid, sqRead_corrected); + bool ct = seqs->sqStore_isTrimmedRead(rid, sqRead_corrected); + + flags[0] = 'r'; // Default to non-valid raw and corrected reads. + flags[1] = '-'; + flags[2] = '-'; + flags[3] = '-'; + + flags[4] = 'c'; + flags[5] = '-'; + flags[6] = '-'; + flags[7] = '-'; + + if (rv) { + flags[0] = 'R'; + flags[1] = (seqs->sqStore_isIgnoredRead(rid, sqRead_raw)) ? 'I' : 'R'; + + if (rt) { + flags[2] = 'T'; + flags[3] = (seqs->sqStore_isIgnoredRead(rid, sqRead_raw | sqRead_trimmed)) ? 'I' : 'T'; + } + } + + if (cv) { + flags[4] = 'C'; + flags[5] = (seqs->sqStore_isIgnoredRead(rid, sqRead_corrected)) ? 'I' : 'C'; + + if (ct) { + flags[6] = 'T'; + flags[7] = (seqs->sqStore_isIgnoredRead(rid, sqRead_corrected | sqRead_trimmed)) ? 'I' : 'T'; + } + } + + flags[8] = 0; +} + + + +void +dumpReads(sqStore *seqs, uint32 bgnID, uint32 endID, sqRead_which w) { + char l1[1024] = {0}; + char s1len[16] = {0}, s1bgn[16] = {0}, s1end[16] = {0}; char s2len[16] = {0}, s2bgn[16] = {0}, s2end[16] = {0}; char s3len[16] = {0}, s3bgn[16] = {0}, s3end[16] = {0}; char s4len[16] = {0}, s4bgn[16] = {0}, s4end[16] = {0}; + char flags[16] = {0}; - fprintf(stdout, " --------NORMAL RAW READS-------- ------COMPRESSED RAW READS------ -----NORMAL CORRECTED READS----- ---COMPRESSED CORRECTED READS--- \n"); - fprintf(stdout, " readID libraryID seqLen clearBgn clearEnd seqLen clearBgn clearEnd seqLen clearBgn clearEnd seqLen clearBgn clearEnd blobFile blobPos\n"); - fprintf(stdout, "---------- ---------- ---------- ---------- ---------- ---------- ---------- ---------- ---------- ---------- ---------- ---------- ---------- ---------- -------- ----------\n"); + dumpReads_printHeader(w); for (uint32 rid=bgnID; rid<=endID; rid++) { + bool active = false; + dumpReads_setClearString(seqs, rid, s1len, s1bgn, s1end, sqRead_raw); dumpReads_setClearString(seqs, rid, s2len, s2bgn, s2end, sqRead_raw | sqRead_compressed); dumpReads_setClearString(seqs, rid, s3len, s3bgn, s3end, sqRead_corrected); dumpReads_setClearString(seqs, rid, s4len, s4bgn, s4end, sqRead_corrected | sqRead_compressed); + dumpReads_setFlagsString(seqs, rid, flags); + + l1[0] = 0; + + if ((w == sqRead_unset) || (((w & sqRead_raw) != 0) && ((w & sqRead_compressed) == 0))) { + active |= seqs->sqStore_isValidRead(rid, sqRead_raw); + strcat(l1, s1len); + strcat(l1, s1bgn); + strcat(l1, s1end); + } + + if ((w == sqRead_unset) || (((w & sqRead_raw) != 0) && ((w & sqRead_compressed) != 0))) { + active |= seqs->sqStore_isValidRead(rid, sqRead_raw); + strcat(l1, s2len); + strcat(l1, s2bgn); + strcat(l1, s2end); + } + + if ((w == sqRead_unset) || (((w & sqRead_corrected) != 0) && ((w & sqRead_compressed) == 0))) { + active |= seqs->sqStore_isValidRead(rid, sqRead_corrected); + strcat(l1, s3len); + strcat(l1, s3bgn); + strcat(l1, s3end); + } - fprintf(stdout, "%10" F_U32P " %10" F_U32P " %s %s %s %s %s %s %s %s %s %s %s %s %8" F_U64P " %10" F_U64P "\n", - rid, - seqs->sqStore_getLibraryIDForRead(rid), - s1len, s1bgn, s1end, - s2len, s2bgn, s2end, - s3len, s3bgn, s3end, - s4len, s4bgn, s4end, - seqs->sqStore_getMeta(rid)->sqRead_mSegm(), - seqs->sqStore_getMeta(rid)->sqRead_mByte()); + if ((w == sqRead_unset) || (((w & sqRead_corrected) != 0) && ((w & sqRead_compressed) != 0))) { + active |= seqs->sqStore_isValidRead(rid, sqRead_corrected); + strcat(l1, s4len); + strcat(l1, s4bgn); + strcat(l1, s4end); + } + + if (active) + fprintf(stdout, "%10" F_U32P " %10" F_U32P "%s %s %4lu %12lu\n", + rid, + seqs->sqStore_getLibraryIDForRead(rid), + l1, + flags, + seqs->sqStore_getMeta(rid)->sqRead_mSegm(), + seqs->sqStore_getMeta(rid)->sqRead_mByte()); + } +} + + +void +doSummarize_lengthHistogram(vector lengths, + uint64 genomeSize, + bool limitTo1x); + + +void +dumpHistogram(sqStore *seqs, uint32 bgnID, uint32 endID, sqRead_which w, uint64 genomeSize, bool wantLengths) { + vector lengths; + + uint64 nSeqs = 0; + uint64 nBases = 0; + + // Build a vector of sequence lengths, pass that to 'sequence' to + // generate a pretty picture. + + for (uint32 rid=bgnID; rid<=endID; rid++) { + if (seqs->sqStore_isValidRead(rid, w) == false) // Skip invalid reads. + continue; + + if (seqs->sqStore_isIgnoredRead(rid, w) == true) // Skip ignored reads. + continue; + + if (seqs->sqStore_isTrimmedRead(rid, w) == false) // Skip untrimmed reads, + if (w & sqRead_trimmed) // if we want trimmed reads. + continue; + + lengths.push_back(seqs->sqStore_getReadLength(rid, w)); + } + + if (wantLengths == false) { + doSummarize_lengthHistogram(lengths, genomeSize, false); + } + + else { + sort(lengths.begin(), lengths.end(), less()); + + for (uint64 ii=0; iisqStore_lastReadID(); uint32 numLibs = seqStore->sqStore_lastLibraryID(); @@ -243,11 +455,13 @@ main(int argc, char **argv) { dumpLibs(seqStore, bgnID, endID); if (wantReads) - dumpReads(seqStore, bgnID, endID); + dumpReads(seqStore, bgnID, endID, which); if (wantStats) dumpStats(seqStore, bgnID, endID); + if (wantHistogram) + dumpHistogram(seqStore, bgnID, endID, which, genomeSize, wantLengths); delete seqStore; diff --git a/src/stores/sqStoreDumpMetaData.mk b/src/stores/sqStoreDumpMetaData.mk index adb7d0230..42e4bf636 100644 --- a/src/stores/sqStoreDumpMetaData.mk +++ b/src/stores/sqStoreDumpMetaData.mk @@ -9,9 +9,9 @@ ifeq "$(strip ${TARGET_DIR})" "" endif TARGET := sqStoreDumpMetaData -SOURCES := sqStoreDumpMetaData.C +SOURCES := sqStoreDumpMetaData.C ../sequence/sequence-summarize.C -SRC_INCDIRS := .. ../utility +SRC_INCDIRS := .. ../utility ../sequence TGT_LDFLAGS := -L${TARGET_DIR}/lib TGT_LDLIBS := -lcanu