From 730f95ea4bdea7de87aab0cb22698c6ba519b7ef Mon Sep 17 00:00:00 2001
From: "Brian P. Walenz" <thebri@gmail.com>
Date: Sun, 14 Jul 2019 23:15:33 -0400
Subject: [PATCH] Add read length histograms to sqStoreDumpMetaData; support
 reporting metadata for only specific read types.

---
 src/pipelines/canu/OverlapErrorAdjustment.pm |  21 +-
 src/pipelines/canu/SequenceStore.pm          | 108 ++---
 src/sequence/sequence-summarize.C            | 408 ++++++++++---------
 src/stores/sqStoreDumpMetaData.C             | 304 ++++++++++++--
 src/stores/sqStoreDumpMetaData.mk            |   4 +-
 5 files changed, 528 insertions(+), 317 deletions(-)
diff --git a/src/pipelines/canu/OverlapErrorAdjustment.pm b/src/pipelines/canu/OverlapErrorAdjustment.pm
index 25e3f825f..a3f9baf3d 100644
--- a/src/pipelines/canu/OverlapErrorAdjustment.pm
+++ b/src/pipelines/canu/OverlapErrorAdjustment.pm
@@ -74,22 +74,31 @@ sub loadReadLengths ($$$) {
     print STDERR "--\n";
     print STDERR "-- Loading read lengths.\n";
 
-    open(F, "$bin/sqStoreDumpMetaData -S ./$asm.seqStore -reads 2> /dev/null |");
+    #  Dump corrected read lengths and trimming.  Load the length into
+    #  an array for later use.
+
+    open(F, "$bin/sqStoreDumpMetaData -S ./$asm.seqStore -corrected -reads 2> /dev/null |");
     while (<F>) {
         s/^\s+//;
         s/\s+$//;
 
         next if (m/^readID/);   #  Header line
-        next if (m/^------/);   #  ------ ----
+        next if (m/^----/);     #  ------ ----
 
         my @v = split '\s+', $_;
+        my $l = 0;
+
+        if ($v[5] =~ m/CC--/) {   #  Corrected available, but no trimmed.
+            $l = $v[2];
+        }
 
-        next  if ($v[8] eq "-");
-        next  if ($v[8] eq "ignored");
+        if ($v[5] =~ m/CCTT/) {   #  Corrected and trimmed.
+            $l = $v[4] - $v[3];
+        }
 
-        vec($$rlVec, $v[0], 32) = $v[10] - $v[9];
+        vec($$rlVec, $v[0], 32) = $l;
         $rlCnt                 += 1;
-        $rlSum                 += $v[10] - $v[9];
+        $rlSum                 += $l;
     }
     close(F);
 
diff --git a/src/pipelines/canu/SequenceStore.pm b/src/pipelines/canu/SequenceStore.pm
index 57f30d92c..9857d43b9 100644
--- a/src/pipelines/canu/SequenceStore.pm
+++ b/src/pipelines/canu/SequenceStore.pm
@@ -331,32 +331,27 @@ sub generateReadLengthHistogram ($$) {
     my @rl;
     my @hi;
 
-    #  Load the read lengths, find min and max lengths.
+    #  Generate a lovely PNG histogram.
 
-    open(F, "$bin/sqStoreDumpMetaData -S ./$asm.seqStore -reads |") or caExit("can't dump meta data from './$asm.seqStore': $!", undef);
-    while (<F>) {
-        next  if (m/readID/);             #  Skip the header lines.
-        next  if (m/------/);
-
-        s/^\s+//;
-        s/\s+$//;
-
-        my @v = split '\s+', $_;
-        my $l = 0;
-
-        $l = $v[2]          if (($tag eq "cor") || ($tag eq "hap"));
-        $l = $v[8]          if (($tag eq "obt"));
-        $l = $v[10]-$v[9]   if (($tag eq "utg") && ($v[10] ne "-") && ($v[9] ne "-"));
+    if (! -e "./$asm.seqStore/readlengths-$tag.dat") {
+        my $cmd;
 
-        push @rl, $l        if (($l ne "-") && ($l > 0));
-    }
-    close(F);
+        $cmd  = "$bin/sqStoreDumpMetaData \\\n";
+        $cmd .= "  -S ./$asm.seqStore \\\n";
+        $cmd .= "  -raw \\\n"                  if ($tag eq "cor");
+        $cmd .= "  -corrected \\\n"            if ($tag eq "obt");
+        $cmd .= "  -corrected -trimmed \\\n"   if ($tag eq "utg");
+        $cmd .= "  -histogram " . getGlobal("genomeSize") . " \\\n";
+        $cmd .= "  -lengths \\\n";
+        $cmd .= "> ./$asm.seqStore/readlengths-$tag.dat \\\n";
+        $cmd .= "2> ./$asm.seqStore/readlengths-$tag.err \n";
 
-    @rl = sort { $a <=> $b } @rl;
+        if (runCommand(".", $cmd) > 0) {
+            caExit("sqStoreDumpMetaData failed", "./$asm.seqStore/readlengths-$tag.err");
+        }
 
-    #  Generate PNG histograms if there are any reads.
+        unlink "./$asm.seqStore/readlengths-$tag.err";
 
-    if ($reads > 0) {
         my $gnuplot = getGlobal("gnuplot");
         my $format  = getGlobal("gnuplotImageFormat");
 
@@ -375,12 +370,6 @@ sub generateReadLengthHistogram ($$) {
             print F "plot [] './$asm.seqStore/readlengths-$tag.dat' using (bin(\$1,binwidth)):(1.0) smooth freq with boxes title ''\n";
             close(F);
 
-            open(F, "> ./$asm.seqStore/readlengths-$tag.dat") or caExit("can't open './$asm.seqStore/readlengths-$tag.dat' for writing: $!", undef);
-            foreach my $rl (@rl) {
-                print F "$rl\n";
-            }
-            close(F);
-
             if (runCommandSilently(".", "$gnuplot < /dev/null ./$asm.seqStore/readlengths-$tag.gp > /dev/null 2>&1", 0)) {
                 print STDERR "--\n";
                 print STDERR "-- WARNING: gnuplot failed.\n";
@@ -390,54 +379,43 @@ sub generateReadLengthHistogram ($$) {
         }
     }
 
-    #  Generate the ASCII histogram.
-
-    $hist  = "--\n";
-    $hist .= "-- In sequence store './$asm.seqStore':\n";
-    $hist .= "--   Found $reads reads.\n";
-    $hist .= "--   Found $bases bases ($coverage times coverage).\n";
-
-    if ($reads > 0) {
-        my $minLen     = $rl[ 0];
-        my $maxLen     = $rl[-1];
-        my $scale      = 0;
-        my $bucketSize = 0;
-
-        #  Buckets of size 1000 are easy to interpret, but sometimes not ideal.
-
-        $bucketSize = 10000;
-        $bucketSize = 5000    if ($maxLen - $minLen < 1000000);
-        $bucketSize = 1000    if ($maxLen - $minLen < 100000);
-        $bucketSize = 100     if ($maxLen - $minLen < 10000);
+    #  Generate a lovely ASCII histogram.
 
-        #  Generate the histogram (int truncates)
-
-        my $mBgn = int($minLen / $bucketSize);
-        my $mEnd = int($maxLen / $bucketSize);
-
-        foreach my $rl (@rl) {
-            my $b = int($rl / $bucketSize);
+    if (! -e "./$asm.seqStore/readlengths-$tag.txt") {
+        my $cmd;
 
-            $hi[$b]++;
-        }
+        $cmd  = "$bin/sqStoreDumpMetaData \\\n";
+        $cmd .= "  -S ./$asm.seqStore \\\n";
+        $cmd .= "  -raw \\\n"                  if ($tag eq "cor");
+        $cmd .= "  -corrected \\\n"            if ($tag eq "obt");
+        $cmd .= "  -corrected -trimmed \\\n"   if ($tag eq "utg");
+        $cmd .= "  -histogram " . getGlobal("genomeSize") . " \\\n";
+        $cmd .= "> ./$asm.seqStore/readlengths-$tag.txt \\\n";
+        $cmd .= "2> ./$asm.seqStore/readlengths-$tag.err \n";
 
-        for (my $ii=$mBgn; $ii<=$mEnd; $ii++) {                           #  Scale the *'s so that the longest has 70 of 'em
-            $scale = $hi[$ii] / 70   if ($scale < $hi[$ii] / 70);
+        if (runCommand(".", $cmd) > 0) {
+            caExit("sqStoreDumpMetaData failed", "./$asm.seqStore/readlengths-$tag.err");
         }
 
-        #  Draw the histogram.
+        unlink "./$asm.seqStore/readlengths-$tag.err";
+    }
 
-        $hist .= "--\n";
-        $hist .= "--   Read length histogram (one '*' equals " . int(100 * $scale) / 100 . " reads):\n";
+    #  Read the ASCII histogram, append to report.
 
-        for (my $ii=$mBgn; $ii<=$mEnd; $ii++) {
-            my $s = $ii * $bucketSize;
-            my $e = $ii * $bucketSize + $bucketSize - 1;
+    $hist  = "--\n";
+    $hist .= "-- In sequence store './$asm.seqStore':\n";
+    $hist .= "--   Found $reads reads.\n";
+    $hist .= "--   Found $bases bases ($coverage times coverage).\n";
 
-            $hi[$ii] += 0;  #  Otherwise, cells with no count print as null.
+    if (-e "./$asm.seqStore/readlengths-$tag.txt") {
+        #$hist .= "--\n";
+        #$hist .= "--   Read length histogram (one '*' equals " . int(100 * $scale) / 100 . " reads):\n";
 
-            $hist .= sprintf("--   %6d %6d %6d %s\n", $s, $e, $hi[$ii], "*" x int($hi[$ii] / $scale));
+        open(F, "< ./$asm.seqStore/readlengths-$tag.txt") or caExit("can't open './$asm.seqStore/readlengths-$tag.txt' for reading: $!", undef);
+        while (<F>) {
+            $hist .= "--    $_";
         }
+        close(F);
     }
 
     #  Abort if the read coverage is too low.
diff --git a/src/sequence/sequence-summarize.C b/src/sequence/sequence-summarize.C
index 35732510f..338d6c9ab 100644
--- a/src/sequence/sequence-summarize.C
+++ b/src/sequence/sequence-summarize.C
@@ -90,173 +90,58 @@ doSummarize_loadSequence(dnaSeqFile  *sf,
 
 
 void
-doSummarize(vector<char *>       &inputs,
-            summarizeParameters  &sumPar) {
-
-  vector<uint64>  lengths;
-
-  uint64          nSeqs  = 0;
-  uint64          nBases = 0;
-
-  uint32          mer = 0;
-
-  uint64          mn[4]     = {0};
-  uint64          dn[4*4]   = {0};
-  uint64          tn[4*4*4] = {0};
-
-  double          nmn = 0;
-  double          ndn = 0;
-  double          ntn = 0;
-
-  uint32          nameMax = 0;
-  char           *name    = NULL;
-  uint64          seqMax  = 0;
-  char           *seq     = NULL;
-  uint8          *qlt     = NULL;
-  uint64          seqLen  = 0;
-
-  for (uint32 ff=0; ff<inputs.size(); ff++) {
-    dnaSeqFile  *sf = new dnaSeqFile(inputs[ff]);
-
-    //  If sequence,
-    //    Count mono-, di- and tri-nucleotides.
-    //    Count number of mono-, di- and tri-nucleotides.
-    //    Count number of sequences and total bases.
-    //    Save the lengths of sequences.
+doSummarize_lengthHistogramSimple(vector<uint64> lengths) {
+  uint64  bgn = 0;
+  uint64  end = 0;
 
-    while (doSummarize_loadSequence(sf, sumPar.asSequences, name, nameMax, seq, qlt, seqMax, seqLen) == true) {
-      uint64  pos = 0;
-      uint64  bgn = 0;
+  sort(lengths.begin(), lengths.end(), less<uint64>());
 
-      if (pos < seqLen) {
-        mer = ((mer << 2) | ((seq[pos++] >> 1) & 0x03)) & 0x3f;
-        mn[mer & 0x03]++;
-      }
+  while (bgn < lengths.size()) {
+    end = bgn + 1;
 
-      if (pos < seqLen) {
-        mer = ((mer << 2) | ((seq[pos++] >> 1) & 0x03)) & 0x3f;
-        mn[mer & 0x03]++;
-        dn[mer & 0x0f]++;
-      }
+    while ((end < lengths.size()) &&
+           (lengths[bgn] == lengths[end]))
+      end++;
 
-      while (pos < seqLen) {
-        mer = ((mer << 2) | ((seq[pos++] >> 1) & 0x03)) & 0x3f;
-        mn[mer & 0x03]++;
-        dn[mer & 0x0f]++;
-        tn[mer & 0x3f]++;
-      }
-
-      nmn +=                    (seqLen-0);
-      ndn += (seqLen < 2) ? 0 : (seqLen-1);
-      ntn += (seqLen < 3) ? 0 : (seqLen-2);
-
-      //  If we're NOT splitting on N, add one sequence of the given length.
-
-      if (sumPar.breakAtN == false) {
-        nSeqs  += 1;
-        nBases += seqLen;
-
-        lengths.push_back(seqLen);
-        continue;
-      }
+    fprintf(stdout, "%lu\t%lu\n", lengths[bgn], end - bgn);
 
-      //  But if we ARE splitting on N, add multiple sequences.
-
-      pos = 0;
-      bgn = 0;
-
-      while (pos < seqLen) {
-
-        //  Skip any N's.
-        while ((pos < seqLen) && ((seq[pos] == 'n') ||
-                                  (seq[pos] == 'N')))
-          pos++;
-
-        //  Remember our start position.
-        bgn = pos;
-
-        //  Move ahead until the end of sequence or an N.
-        while ((pos < seqLen) && ((seq[pos] != 'n') &&
-                                  (seq[pos] != 'N')))
-          pos++;
-
-        //  If a sequence, increment stuff.
-        if (pos - bgn > 0) {
-          nSeqs  += 1;
-          nBases += pos - bgn;
-
-          lengths.push_back(pos - bgn);
-        }
-      }
-    }
-
-    //  All done!
-
-    delete sf;
+    bgn = end;
   }
+}
 
-  delete [] name;
-  delete [] seq;
-  delete [] qlt;
-
-  //  Finalize.
-
-  sort(lengths.begin(), lengths.end(), greater<uint64>());
-
-  if (sumPar.genomeSize == 0)
-    sumPar.genomeSize = nBases;
-
-  //  If only a simple histogram of lengths is requested, dump and done.
-
-  if (sumPar.asSimple == true) {
-    uint64  bgn = 0;
-    uint64  end = 0;
-
-    reverse(lengths.begin(), lengths.end());
-
-    while (bgn < lengths.size()) {
-      end = bgn + 1;
-
-      while ((end < lengths.size()) &&
-             (lengths[bgn] == lengths[end]))
-        end++;
-
-      fprintf(stdout, "%lu\t%lu\n", lengths[bgn], end - bgn);
 
-      bgn = end;
-    }
 
-    return;
-  }
+void
+doSummarize_dumpLengths(vector<uint64> lengths) {
 
-  //  If only the read lengths are requested, dump and done.
+  sort(lengths.begin(), lengths.end(), less<uint64>());
 
-  if (sumPar.asLength == true) {
-    reverse(lengths.begin(), lengths.end());
+  for (uint64 ii=0; ii<lengths.size(); ii++)
+    fprintf(stdout, "%lu\n", lengths[ii]);
+}
 
-    for (uint64 ii=0; ii<lengths.size(); ii++)
-      fprintf(stdout, "%lu\n", lengths[ii]);
 
-    return;
-  }
 
-  //
+void
+doSummarize_lengthHistogram(vector<uint64> lengths,
+                            uint64         genomeSize,
+                            bool           limitTo1x) {
 
-  uint64   lSum  = 0;                                 //  Sum of the lengths we've encountered so far
+  sort(lengths.begin(), lengths.end(), greater<uint64>());
 
-  uint32   nStep = 10;                                //  Step of each N report.
-  uint32   nVal  = nStep;                             //  Index of the threshold we're next printing.
-  uint64   nThr  = sumPar.genomeSize * nVal / 100;    //  Threshold lenth; if sum is bigger, emit and move to the next threshold
+  uint32   nCols    = 63;                     //  Magic number to make the histogram the same width as the trinucleotide list
+  uint32   nRows    = 0;                      //  Height of the histogram; dynamically set.
+  uint32   nRowsMin = 50;                     //  Nothing really magic, just fits on the screen.
 
-  ////////////////////////////////////////
+  uint64   lSum  = 0;                         //  Sum of the lengths we've encountered so far
 
-  uint32   nCols    = 63;   //  Magic number to make the histogram the same width as the trinucleotide list
-  uint32   nRows    = 0;    //  Height of the histogram; dynamically set.
-  uint32   nRowsMin = 50;   //  Nothing really magic, just fits on the screen.
+  uint32   nStep = 10;                        //  Step of each N report.
+  uint32   nVal  = nStep;                     //  Index of the threshold we're next printing.
+  uint64   nThr  = genomeSize * nVal / 100;   //  Threshold lenth; if sum is bigger, emit and move to the next threshold
 
   //  Count the number of lines we expect to get in the NG table.
 
-  for (uint32 ii=0; ii<nSeqs; ii++) {
+  for (uint32 ii=0; ii<lengths.size(); ii++) {
     lSum += lengths[ii];
 
     while (lSum >= nThr) {
@@ -267,11 +152,11 @@ doSummarize(vector<char *>       &inputs,
       else if (nVal  < 20000)  nVal += nStep * 100;
       else                     nVal += nStep * 1000;
 
-      nThr  = sumPar.genomeSize * nVal / 100;
+      nThr  = genomeSize * nVal / 100;
     }
   }
 
-  uint64   minLength = lengths[nSeqs-1];
+  uint64   minLength = lengths[lengths.size()-1];
   uint64   maxLength = lengths[0];
 
   if (nRows < nRowsMin)                    //  If there are too few lines, make it some minimum value,
@@ -288,10 +173,10 @@ doSummarize(vector<char *>       &inputs,
   if (nRows > nRowsMin)
     nRows = nRowsMin;
 
-  lSum  = 0;                               //  Reset for actually generating the length histogram.
+  lSum  = 0;                                            //  Reset for actually generating the length histogram.
   nStep = 10;
   nVal  = nStep;
-  nThr  = sumPar.genomeSize * nVal / 100;
+  nThr  = genomeSize * nVal / 100;
 
   //  Generate the length histogram.
 
@@ -300,7 +185,7 @@ doSummarize(vector<char *>       &inputs,
   for (uint32 rr=0; rr<nRows+1; rr++)                   //  Clear the histogram.
     nSeqPerLen[rr] = 0;
 
-  for (uint32 ii=0; ii<nSeqs; ii++) {                   //  Count number of sequences per size range.
+  for (uint32 ii=0; ii<lengths.size(); ii++) {          //  Count number of sequences per size range.
     uint32 r = (lengths[ii] - minLength) / bucketSize;
 
     nSeqPerLen[r]++;
@@ -335,27 +220,22 @@ doSummarize(vector<char *>       &inputs,
     histPlot[rr][28 + nn]   = 0;
   }
 
-  //  If exact......
-
-
-
-
   //  Output N table, with length histogram appended at the end of each line.
 
   uint32  hp = 0;
 
   fprintf(stdout, "\n");
-  fprintf(stdout, "G=%-12" F_U64P "                     sum of  ||               length     num\n", sumPar.genomeSize);
+  fprintf(stdout, "G=%-12" F_U64P "                     sum of  ||               length     num\n", genomeSize);
   fprintf(stdout,   "NG         length     index       lengths  ||                range    seqs\n");
   fprintf(stdout,   "----- ------------ --------- ------------  ||  ------------------- -------\n");
 
   //  Write lines if we're showing all data, or if we're below 1x coverage.
 
-  for (uint32 ii=0; ii<nSeqs; ii++) {
+  for (uint32 ii=0; ii<lengths.size(); ii++) {
     lSum += lengths[ii];
 
     while (lSum >= nThr) {
-      if ((sumPar.limitTo1x == false) ||
+      if ((limitTo1x == false) ||
           (nVal <= 100)) {
         if (hp <= nRows)
           fprintf(stdout, "%05"    F_U32P " %12" F_U64P " %9" F_U32P " %12" F_U64P "  ||  %s\n",
@@ -371,13 +251,13 @@ doSummarize(vector<char *>       &inputs,
       else if (nVal < 20000)   nVal += nStep * 100;
       else                     nVal += nStep * 1000;
 
-      nThr  = sumPar.genomeSize * nVal / 100;
+      nThr  = genomeSize * nVal / 100;
     }
   }
 
   //  If we're displaying exactly 1x, write empty lines to get to there.
 
-  if (sumPar.limitTo1x == true) {
+  if (limitTo1x == true) {
     while (nVal <= 100) {
       if (hp <= nRows)
         fprintf(stdout, "%05"    F_U32P " %12s %9s %12s  ||  %s\n",
@@ -393,52 +273,18 @@ doSummarize(vector<char *>       &inputs,
 
   //  Now the final summary line.
 
-  if (sumPar.genomeSize == 0)
-    fprintf(stdout, "%07.3fx           %9" F_U64P " %12" F_U64P "  ||  %s\n", 0.0, nSeqs, lSum, histPlot[hp++]);   //  Occurs if only empty sequences in the input!
+  if (genomeSize == 0)
+    fprintf(stdout, "%07.3fx           %9" F_U64P " %12" F_U64P "  ||  %s\n", 0.0, lengths.size(), lSum, histPlot[hp++]);   //  Occurs if only empty sequences in the input!
   else if (hp <= nRows)
-    fprintf(stdout, "%07.3fx           %9" F_U64P " %12" F_U64P "  ||  %s\n", (double)lSum / sumPar.genomeSize, nSeqs, lSum, histPlot[hp++]);
+    fprintf(stdout, "%07.3fx           %9" F_U64P " %12" F_U64P "  ||  %s\n", (double)lSum / genomeSize, lengths.size(), lSum, histPlot[hp++]);
   else
-    fprintf(stdout, "%07.3fx           %9" F_U64P " %12" F_U64P "  ||\n",     (double)lSum / sumPar.genomeSize, nSeqs, lSum);
-
-  //fprintf(stdout,    "                                           ||  %s\n", histPlot[hp++]);
-  //fprintf(stdout,   "                 genome-size %12" F_U64P "  ||  %s\n", sumPar.genomeSize, histPlot[hp++]);
+    fprintf(stdout, "%07.3fx           %9" F_U64P " %12" F_U64P "  ||\n",     (double)lSum / genomeSize, lengths.size(), lSum);
 
   while (hp <= nRows)
     fprintf(stdout, "                                           ||  %s\n", histPlot[hp++]);
 
   fprintf(stdout, "\n");
 
-  //  Report.  To get alphabetic ordering correct, hardcoded.
-
-#define FMT "%12" F_U64P " %6.4f"
-#define GC "%05.02f%%"
-
-  if (nmn == 0)  nmn = 1;   //  Avoid divide by zero.
-  if (ndn == 0)  ndn = 1;
-  if (ntn == 0)  ntn = 1;
-
-  double gc = 100.0 * (mn[0x01] + mn[0x03]) / (mn[0x00] + mn[0x01] + mn[0x03] + mn[0x02]);
-
-  fprintf(stdout, "--------------------- --------------------- ----------------------------------------------------------------------------------------------\n");
-  fprintf(stdout, "       mononucleotide          dinucleotide                                                                                  trinucleotide\n");
-  fprintf(stdout, "--------------------- --------------------- ----------------------------------------------------------------------------------------------\n");
-  fprintf(stdout, ""             FMT " A" FMT " AA" FMT " AAA " FMT " AAC " FMT " AAG " FMT " AAT\n", mn[0x00], mn[0x00] / nmn, dn[0x00], dn[0x00] / ndn, tn[0x00], tn[0x00] / ntn, tn[0x01], tn[0x01] / ntn, tn[0x03], tn[0x03] / ntn, tn[0x02], tn[0x02] / ntn);
-  fprintf(stdout, ""             FMT " C" FMT " AC" FMT " ACA " FMT " ACC " FMT " ACG " FMT " ACT\n", mn[0x01], mn[0x01] / nmn, dn[0x01], dn[0x01] / ndn, tn[0x04], tn[0x04] / ntn, tn[0x05], tn[0x05] / ntn, tn[0x07], tn[0x07] / ntn, tn[0x06], tn[0x06] / ntn);
-  fprintf(stdout, ""             FMT " G" FMT " AG" FMT " AGA " FMT " AGC " FMT " AGG " FMT " AGT\n", mn[0x03], mn[0x03] / nmn, dn[0x03], dn[0x03] / ndn, tn[0x0c], tn[0x0c] / ntn, tn[0x0d], tn[0x0d] / ntn, tn[0x0f], tn[0x0f] / ntn, tn[0x0e], tn[0x0e] / ntn);
-  fprintf(stdout, ""             FMT " T" FMT " AT" FMT " ATA " FMT " ATC " FMT " ATG " FMT " ATT\n", mn[0x02], mn[0x02] / nmn, dn[0x02], dn[0x02] / ndn, tn[0x08], tn[0x08] / ntn, tn[0x09], tn[0x09] / ntn, tn[0x0b], tn[0x0b] / ntn, tn[0x0a], tn[0x0a] / ntn);
-  fprintf(stdout, "                     " FMT " CA" FMT " CAA " FMT " CAC " FMT " CAG " FMT " CAT\n",                           dn[0x04], dn[0x04] / ndn, tn[0x10], tn[0x10] / ntn, tn[0x11], tn[0x11] / ntn, tn[0x13], tn[0x13] / ntn, tn[0x12], tn[0x12] / ntn);
-  fprintf(stdout, "      --GC--  --AT-- " FMT " CC" FMT " CCA " FMT " CCC " FMT " CCG " FMT " CCT\n",                           dn[0x05], dn[0x05] / ndn, tn[0x14], tn[0x14] / ntn, tn[0x15], tn[0x15] / ntn, tn[0x17], tn[0x17] / ntn, tn[0x16], tn[0x16] / ntn);
-  fprintf(stdout, "      " GC "  " GC " " FMT " CG" FMT " CGA " FMT " CGC " FMT " CGG " FMT " CGT\n", gc, 100 - gc,             dn[0x07], dn[0x07] / ndn, tn[0x1c], tn[0x1c] / ntn, tn[0x1d], tn[0x1d] / ntn, tn[0x1f], tn[0x1f] / ntn, tn[0x1e], tn[0x1e] / ntn);
-  fprintf(stdout, "                     " FMT " CT" FMT " CTA " FMT " CTC " FMT " CTG " FMT " CTT\n",                           dn[0x06], dn[0x06] / ndn, tn[0x18], tn[0x18] / ntn, tn[0x19], tn[0x19] / ntn, tn[0x1b], tn[0x1b] / ntn, tn[0x1a], tn[0x1a] / ntn);
-  fprintf(stdout, "                     " FMT " GA" FMT " GAA " FMT " GAC " FMT " GAG " FMT " GAT\n",                           dn[0x0c], dn[0x0c] / ndn, tn[0x30], tn[0x30] / ntn, tn[0x31], tn[0x31] / ntn, tn[0x33], tn[0x33] / ntn, tn[0x32], tn[0x32] / ntn);
-  fprintf(stdout, "                     " FMT " GC" FMT " GCA " FMT " GCC " FMT " GCG " FMT " GCT\n",                           dn[0x0d], dn[0x0d] / ndn, tn[0x34], tn[0x34] / ntn, tn[0x35], tn[0x35] / ntn, tn[0x37], tn[0x37] / ntn, tn[0x36], tn[0x36] / ntn);
-  fprintf(stdout, "                     " FMT " GG" FMT " GGA " FMT " GGC " FMT " GGG " FMT " GGT\n",                           dn[0x0f], dn[0x0f] / ndn, tn[0x3c], tn[0x3c] / ntn, tn[0x3d], tn[0x3d] / ntn, tn[0x3f], tn[0x3f] / ntn, tn[0x3e], tn[0x3e] / ntn);
-  fprintf(stdout, "                     " FMT " GT" FMT " GTA " FMT " GTC " FMT " GTG " FMT " GTT\n",                           dn[0x0e], dn[0x0e] / ndn, tn[0x38], tn[0x38] / ntn, tn[0x39], tn[0x39] / ntn, tn[0x3b], tn[0x3b] / ntn, tn[0x3a], tn[0x3a] / ntn);
-  fprintf(stdout, "                     " FMT " TA" FMT " TAA " FMT " TAC " FMT " TAG " FMT " TAT\n",                           dn[0x08], dn[0x08] / ndn, tn[0x20], tn[0x20] / ntn, tn[0x21], tn[0x21] / ntn, tn[0x23], tn[0x23] / ntn, tn[0x22], tn[0x22] / ntn);
-  fprintf(stdout, "                     " FMT " TC" FMT " TCA " FMT " TCC " FMT " TCG " FMT " TCT\n",                           dn[0x09], dn[0x09] / ndn, tn[0x24], tn[0x24] / ntn, tn[0x25], tn[0x25] / ntn, tn[0x27], tn[0x27] / ntn, tn[0x26], tn[0x26] / ntn);
-  fprintf(stdout, "                     " FMT " TG" FMT " TGA " FMT " TGC " FMT " TGG " FMT " TGT\n",                           dn[0x0b], dn[0x0b] / ndn, tn[0x2c], tn[0x2c] / ntn, tn[0x2d], tn[0x2d] / ntn, tn[0x2f], tn[0x2f] / ntn, tn[0x2e], tn[0x2e] / ntn);
-  fprintf(stdout, "                     " FMT " TT" FMT " TTA " FMT " TTC " FMT " TTG " FMT " TTT\n",                           dn[0x0a], dn[0x0a] / ndn, tn[0x28], tn[0x28] / ntn, tn[0x29], tn[0x29] / ntn, tn[0x2b], tn[0x2b] / ntn, tn[0x2a], tn[0x2a] / ntn);
-  fprintf(stdout, "\n");
 
   //  Cleanup.
 
@@ -447,6 +293,170 @@ doSummarize(vector<char *>       &inputs,
 
   delete [] histPlot;
   delete [] nSeqPerLen;
+}
+
+
+
+void
+doSummarize(vector<char *>       &inputs,
+            summarizeParameters  &sumPar) {
+
+  vector<uint64>  lengths;
+
+  uint64          nSeqs  = 0;
+  uint64          nBases = 0;
+
+  uint32          mer = 0;
+
+  uint64          mn[4]     = {0};
+  uint64          dn[4*4]   = {0};
+  uint64          tn[4*4*4] = {0};
+
+  double          nmn = 0;
+  double          ndn = 0;
+  double          ntn = 0;
+
+  uint32          nameMax = 0;
+  char           *name    = NULL;
+  uint64          seqMax  = 0;
+  char           *seq     = NULL;
+  uint8          *qlt     = NULL;
+  uint64          seqLen  = 0;
+
+  for (uint32 ff=0; ff<inputs.size(); ff++) {
+    dnaSeqFile  *sf = new dnaSeqFile(inputs[ff]);
+
+    //  If sequence,
+    //    Count mono-, di- and tri-nucleotides.
+    //    Count number of mono-, di- and tri-nucleotides.
+    //    Count number of sequences and total bases.
+    //    Save the lengths of sequences.
+
+    while (doSummarize_loadSequence(sf, sumPar.asSequences, name, nameMax, seq, qlt, seqMax, seqLen) == true) {
+      uint64  pos = 0;
+      uint64  bgn = 0;
+
+      if (pos < seqLen) {
+        mer = ((mer << 2) | ((seq[pos++] >> 1) & 0x03)) & 0x3f;
+        mn[mer & 0x03]++;
+      }
+
+      if (pos < seqLen) {
+        mer = ((mer << 2) | ((seq[pos++] >> 1) & 0x03)) & 0x3f;
+        mn[mer & 0x03]++;
+        dn[mer & 0x0f]++;
+      }
+
+      while (pos < seqLen) {
+        mer = ((mer << 2) | ((seq[pos++] >> 1) & 0x03)) & 0x3f;
+        mn[mer & 0x03]++;
+        dn[mer & 0x0f]++;
+        tn[mer & 0x3f]++;
+      }
+
+      nmn +=                    (seqLen-0);
+      ndn += (seqLen < 2) ? 0 : (seqLen-1);
+      ntn += (seqLen < 3) ? 0 : (seqLen-2);
 
+      //  If we're NOT splitting on N, add one sequence of the given length.
+
+      if (sumPar.breakAtN == false) {
+        nSeqs  += 1;
+        nBases += seqLen;
+
+        lengths.push_back(seqLen);
+        continue;
+      }
+
+      //  But if we ARE splitting on N, add multiple sequences.
+
+      pos = 0;
+      bgn = 0;
+
+      while (pos < seqLen) {
+
+        //  Skip any N's.
+        while ((pos < seqLen) && ((seq[pos] == 'n') ||
+                                  (seq[pos] == 'N')))
+          pos++;
+
+        //  Remember our start position.
+        bgn = pos;
+
+        //  Move ahead until the end of sequence or an N.
+        while ((pos < seqLen) && ((seq[pos] != 'n') &&
+                                  (seq[pos] != 'N')))
+          pos++;
+
+        //  If a sequence, increment stuff.
+        if (pos - bgn > 0) {
+          nSeqs  += 1;
+          nBases += pos - bgn;
+
+          lengths.push_back(pos - bgn);
+        }
+      }
+    }
+
+    //  All done!
+
+    delete sf;
+  }
+
+  delete [] name;
+  delete [] seq;
+  delete [] qlt;
+
+  if (sumPar.genomeSize == 0)      //  If no genome size supplied, set it to the sum of lengths.
+    sumPar.genomeSize = nBases;
+
+  //  If only a simple histogram of lengths is requested, dump and done.
+
+  if (sumPar.asSimple == true) {
+    doSummarize_lengthHistogramSimple(lengths);
+  }
+
+  //  If only the read lengths are requested, dump and done.
+
+  else if (sumPar.asLength == true) {
+    doSummarize_dumpLengths(lengths);
+  }
+
+  //  Otherwise, generate a fancy histogram plot.
+  //  And finish with the mono-, di- and tri-nucleotide frequencies.
+
+#define FMT "%12" F_U64P " %6.4f"
+#define GC "%05.02f%%"
+
+  else {
+    doSummarize_lengthHistogram(lengths, sumPar.genomeSize, sumPar.limitTo1x);
+
+    if (nmn == 0)  nmn = 1;   //  Avoid divide by zero.
+    if (ndn == 0)  ndn = 1;
+    if (ntn == 0)  ntn = 1;
+
+    double gc = 100.0 * (mn[0x01] + mn[0x03]) / (mn[0x00] + mn[0x01] + mn[0x03] + mn[0x02]);
+
+    fprintf(stdout, "--------------------- --------------------- ----------------------------------------------------------------------------------------------\n");
+    fprintf(stdout, "       mononucleotide          dinucleotide                                                                                  trinucleotide\n");
+    fprintf(stdout, "--------------------- --------------------- ----------------------------------------------------------------------------------------------\n");
+    fprintf(stdout, ""             FMT " A" FMT " AA" FMT " AAA " FMT " AAC " FMT " AAG " FMT " AAT\n", mn[0x00], mn[0x00] / nmn, dn[0x00], dn[0x00] / ndn, tn[0x00], tn[0x00] / ntn, tn[0x01], tn[0x01] / ntn, tn[0x03], tn[0x03] / ntn, tn[0x02], tn[0x02] / ntn);
+    fprintf(stdout, ""             FMT " C" FMT " AC" FMT " ACA " FMT " ACC " FMT " ACG " FMT " ACT\n", mn[0x01], mn[0x01] / nmn, dn[0x01], dn[0x01] / ndn, tn[0x04], tn[0x04] / ntn, tn[0x05], tn[0x05] / ntn, tn[0x07], tn[0x07] / ntn, tn[0x06], tn[0x06] / ntn);
+    fprintf(stdout, ""             FMT " G" FMT " AG" FMT " AGA " FMT " AGC " FMT " AGG " FMT " AGT\n", mn[0x03], mn[0x03] / nmn, dn[0x03], dn[0x03] / ndn, tn[0x0c], tn[0x0c] / ntn, tn[0x0d], tn[0x0d] / ntn, tn[0x0f], tn[0x0f] / ntn, tn[0x0e], tn[0x0e] / ntn);
+    fprintf(stdout, ""             FMT " T" FMT " AT" FMT " ATA " FMT " ATC " FMT " ATG " FMT " ATT\n", mn[0x02], mn[0x02] / nmn, dn[0x02], dn[0x02] / ndn, tn[0x08], tn[0x08] / ntn, tn[0x09], tn[0x09] / ntn, tn[0x0b], tn[0x0b] / ntn, tn[0x0a], tn[0x0a] / ntn);
+    fprintf(stdout, "                     " FMT " CA" FMT " CAA " FMT " CAC " FMT " CAG " FMT " CAT\n",                           dn[0x04], dn[0x04] / ndn, tn[0x10], tn[0x10] / ntn, tn[0x11], tn[0x11] / ntn, tn[0x13], tn[0x13] / ntn, tn[0x12], tn[0x12] / ntn);
+    fprintf(stdout, "      --GC--  --AT-- " FMT " CC" FMT " CCA " FMT " CCC " FMT " CCG " FMT " CCT\n",                           dn[0x05], dn[0x05] / ndn, tn[0x14], tn[0x14] / ntn, tn[0x15], tn[0x15] / ntn, tn[0x17], tn[0x17] / ntn, tn[0x16], tn[0x16] / ntn);
+    fprintf(stdout, "      " GC "  " GC " " FMT " CG" FMT " CGA " FMT " CGC " FMT " CGG " FMT " CGT\n", gc, 100 - gc,             dn[0x07], dn[0x07] / ndn, tn[0x1c], tn[0x1c] / ntn, tn[0x1d], tn[0x1d] / ntn, tn[0x1f], tn[0x1f] / ntn, tn[0x1e], tn[0x1e] / ntn);
+    fprintf(stdout, "                     " FMT " CT" FMT " CTA " FMT " CTC " FMT " CTG " FMT " CTT\n",                           dn[0x06], dn[0x06] / ndn, tn[0x18], tn[0x18] / ntn, tn[0x19], tn[0x19] / ntn, tn[0x1b], tn[0x1b] / ntn, tn[0x1a], tn[0x1a] / ntn);
+    fprintf(stdout, "                     " FMT " GA" FMT " GAA " FMT " GAC " FMT " GAG " FMT " GAT\n",                           dn[0x0c], dn[0x0c] / ndn, tn[0x30], tn[0x30] / ntn, tn[0x31], tn[0x31] / ntn, tn[0x33], tn[0x33] / ntn, tn[0x32], tn[0x32] / ntn);
+    fprintf(stdout, "                     " FMT " GC" FMT " GCA " FMT " GCC " FMT " GCG " FMT " GCT\n",                           dn[0x0d], dn[0x0d] / ndn, tn[0x34], tn[0x34] / ntn, tn[0x35], tn[0x35] / ntn, tn[0x37], tn[0x37] / ntn, tn[0x36], tn[0x36] / ntn);
+    fprintf(stdout, "                     " FMT " GG" FMT " GGA " FMT " GGC " FMT " GGG " FMT " GGT\n",                           dn[0x0f], dn[0x0f] / ndn, tn[0x3c], tn[0x3c] / ntn, tn[0x3d], tn[0x3d] / ntn, tn[0x3f], tn[0x3f] / ntn, tn[0x3e], tn[0x3e] / ntn);
+    fprintf(stdout, "                     " FMT " GT" FMT " GTA " FMT " GTC " FMT " GTG " FMT " GTT\n",                           dn[0x0e], dn[0x0e] / ndn, tn[0x38], tn[0x38] / ntn, tn[0x39], tn[0x39] / ntn, tn[0x3b], tn[0x3b] / ntn, tn[0x3a], tn[0x3a] / ntn);
+    fprintf(stdout, "                     " FMT " TA" FMT " TAA " FMT " TAC " FMT " TAG " FMT " TAT\n",                           dn[0x08], dn[0x08] / ndn, tn[0x20], tn[0x20] / ntn, tn[0x21], tn[0x21] / ntn, tn[0x23], tn[0x23] / ntn, tn[0x22], tn[0x22] / ntn);
+    fprintf(stdout, "                     " FMT " TC" FMT " TCA " FMT " TCC " FMT " TCG " FMT " TCT\n",                           dn[0x09], dn[0x09] / ndn, tn[0x24], tn[0x24] / ntn, tn[0x25], tn[0x25] / ntn, tn[0x27], tn[0x27] / ntn, tn[0x26], tn[0x26] / ntn);
+    fprintf(stdout, "                     " FMT " TG" FMT " TGA " FMT " TGC " FMT " TGG " FMT " TGT\n",                           dn[0x0b], dn[0x0b] / ndn, tn[0x2c], tn[0x2c] / ntn, tn[0x2d], tn[0x2d] / ntn, tn[0x2f], tn[0x2f] / ntn, tn[0x2e], tn[0x2e] / ntn);
+    fprintf(stdout, "                     " FMT " TT" FMT " TTA " FMT " TTC " FMT " TTG " FMT " TTT\n",                           dn[0x0a], dn[0x0a] / ndn, tn[0x28], tn[0x28] / ntn, tn[0x29], tn[0x29] / ntn, tn[0x2b], tn[0x2b] / ntn, tn[0x2a], tn[0x2a] / ntn);
+    fprintf(stdout, "\n");
+  }
 }
 
diff --git a/src/stores/sqStoreDumpMetaData.C b/src/stores/sqStoreDumpMetaData.C
index ca29c2524..195a19425 100644
--- a/src/stores/sqStoreDumpMetaData.C
+++ b/src/stores/sqStoreDumpMetaData.C
@@ -65,60 +65,217 @@ dumpLibs(sqStore *seq, uint32 bgnID, uint32 endID) {
 
 
 void
+dumpReads_printHeader(sqRead_which w) {
+  char  h1[1024] = {0};
+  char  h2[1024] = {0};
+  char  h3[1024] = {0};
+
+  if ((w == sqRead_unset) || (((w & sqRead_raw) != 0) && ((w & sqRead_compressed) == 0))) {
+    strcat(h1, " --------NORMAL RAW READS--------");
+    strcat(h2, "     seqLen   clearBgn   clearEnd");
+    strcat(h3, " ---------- ---------- ----------");
+  }
+
+  if ((w == sqRead_unset) || (((w & sqRead_raw) != 0) && ((w & sqRead_compressed) != 0))) {
+    strcat(h1, " ------COMPRESSED RAW READS------");
+    strcat(h2, "     seqLen   clearBgn   clearEnd");
+    strcat(h3, " ---------- ---------- ----------");
+  }
+
+  if ((w == sqRead_unset) || (((w & sqRead_corrected) != 0) && ((w & sqRead_compressed) == 0))) {
+    strcat(h1, " -----NORMAL CORRECTED READS-----");
+    strcat(h2, "     seqLen   clearBgn   clearEnd");
+    strcat(h3, " ---------- ---------- ----------");
+  }
+
+  if ((w == sqRead_unset) || (((w & sqRead_corrected) != 0) && ((w & sqRead_compressed) != 0))) {
+    strcat(h1, " ---COMPRESSED CORRECTED READS---");
+    strcat(h2, "     seqLen   clearBgn   clearEnd");
+    strcat(h3, " ---------- ---------- ----------");
+  }
+
+  fprintf(stdout, "                     %s \n", h1);
+  fprintf(stdout, "    readID  libraryID%s     flags blob     position\n", h2);
+  fprintf(stdout, "---------- ----------%s  -------- ---- ------------\n", h3);
+}
+
+
+
+bool
 dumpReads_setClearString(sqStore *seqs, uint32 rid, char *len, char *bgn, char *end, sqRead_which w) {
 
   if (seqs->sqStore_isValidRead(rid, w) == false)
-    memcpy(len, "         -", sizeof(char) * 10);
+    memcpy(len, "          -", sizeof(char) * 11);
 
   else if (seqs->sqStore_isIgnoredRead(rid, w) == true)
-    memcpy(len, "   ignored", sizeof(char) * 10);
+    memcpy(len, "    ignored", sizeof(char) * 11);
 
   else
-    snprintf(len, 11, "%10" F_U32P, seqs->sqStore_getReadLength(rid, w));
+    snprintf(len, 12, " %10" F_U32P, seqs->sqStore_getReadLength(rid, w));
 
   assert((w & sqRead_trimmed) == sqRead_unset);   //  Otherwise, length above is trimmed length!
 
   if (seqs->sqStore_isTrimmedRead(rid, w) == true) {
-    snprintf(bgn, 11, "%10" F_U32P, seqs->sqStore_getClearBgn(rid, w));
-    snprintf(end, 11, "%10" F_U32P, seqs->sqStore_getClearEnd(rid, w));
+    snprintf(bgn, 12, " %10" F_U32P, seqs->sqStore_getClearBgn(rid, w));
+    snprintf(end, 12, " %10" F_U32P, seqs->sqStore_getClearEnd(rid, w));
   } else {
-    memcpy(bgn, "         -", sizeof(char) * 10);
-    memcpy(end, "         -", sizeof(char) * 10);
+    memcpy(bgn, "          -", sizeof(char) * 11);
+    memcpy(end, "          -", sizeof(char) * 11);
   }
 
-  len[10] = 0;
-  bgn[10] = 0;
-  end[10] = 0;
+  len[11] = 0;
+  bgn[11] = 0;
+  end[11] = 0;
+
+  return(seqs->sqStore_isValidRead(rid, w));
 }
 
 
 
 void
-dumpReads(sqStore *seqs, uint32 bgnID, uint32 endID) {
+dumpReads_setFlagsString(sqStore *seqs, uint32 rid, char *flags) {
+  bool   rv = seqs->sqStore_isValidRead  (rid, sqRead_raw);
+  bool   rt = seqs->sqStore_isTrimmedRead(rid, sqRead_raw);
+  bool   cv = seqs->sqStore_isValidRead  (rid, sqRead_corrected);
+  bool   ct = seqs->sqStore_isTrimmedRead(rid, sqRead_corrected);
+
+  flags[0] = 'r';   //  Default to non-valid raw and corrected reads.
+  flags[1] = '-';
+  flags[2] = '-';
+  flags[3] = '-';
+
+  flags[4] = 'c';
+  flags[5] = '-';
+  flags[6] = '-';
+  flags[7] = '-';
+
+  if (rv) {
+    flags[0] = 'R';
+    flags[1] = (seqs->sqStore_isIgnoredRead(rid, sqRead_raw)) ? 'I' : 'R';
+
+    if (rt) {
+      flags[2] = 'T';
+      flags[3] = (seqs->sqStore_isIgnoredRead(rid, sqRead_raw | sqRead_trimmed)) ? 'I' : 'T';
+    }
+  }
+
+  if (cv) {
+    flags[4] = 'C';
+    flags[5] = (seqs->sqStore_isIgnoredRead(rid, sqRead_corrected)) ? 'I' : 'C';
+
+    if (ct) {
+      flags[6] = 'T';
+      flags[7] = (seqs->sqStore_isIgnoredRead(rid, sqRead_corrected | sqRead_trimmed)) ? 'I' : 'T';
+    }
+  }
+
+  flags[8] = 0;
+}
+
+
+
+void
+dumpReads(sqStore *seqs, uint32 bgnID, uint32 endID, sqRead_which w) {
+  char   l1[1024] = {0};
+  
   char   s1len[16] = {0}, s1bgn[16] = {0}, s1end[16] = {0};
   char   s2len[16] = {0}, s2bgn[16] = {0}, s2end[16] = {0};
   char   s3len[16] = {0}, s3bgn[16] = {0}, s3end[16] = {0};
   char   s4len[16] = {0}, s4bgn[16] = {0}, s4end[16] = {0};
+  char   flags[16] = {0};
 
-  fprintf(stdout, "                      --------NORMAL RAW READS-------- ------COMPRESSED RAW READS------ -----NORMAL CORRECTED READS----- ---COMPRESSED CORRECTED READS--- \n");
-  fprintf(stdout, "    readID  libraryID     seqLen   clearBgn   clearEnd     seqLen   clearBgn   clearEnd     seqLen   clearBgn   clearEnd     seqLen   clearBgn   clearEnd blobFile    blobPos\n");
-  fprintf(stdout, "---------- ---------- ---------- ---------- ---------- ---------- ---------- ---------- ---------- ---------- ---------- ---------- ---------- ---------- -------- ----------\n");
+  dumpReads_printHeader(w);
 
   for (uint32 rid=bgnID; rid<=endID; rid++) {
+    bool  active = false;
+
     dumpReads_setClearString(seqs, rid, s1len, s1bgn, s1end, sqRead_raw);
     dumpReads_setClearString(seqs, rid, s2len, s2bgn, s2end, sqRead_raw       | sqRead_compressed);
     dumpReads_setClearString(seqs, rid, s3len, s3bgn, s3end, sqRead_corrected);
     dumpReads_setClearString(seqs, rid, s4len, s4bgn, s4end, sqRead_corrected | sqRead_compressed);
+    dumpReads_setFlagsString(seqs, rid, flags);
+
+    l1[0] = 0;
+
+    if ((w == sqRead_unset) || (((w & sqRead_raw) != 0) && ((w & sqRead_compressed) == 0))) {
+      active |= seqs->sqStore_isValidRead(rid, sqRead_raw);
+      strcat(l1, s1len);
+      strcat(l1, s1bgn);
+      strcat(l1, s1end);
+    }
+
+    if ((w == sqRead_unset) || (((w & sqRead_raw) != 0) && ((w & sqRead_compressed) != 0))) {
+      active |= seqs->sqStore_isValidRead(rid, sqRead_raw);
+      strcat(l1, s2len);
+      strcat(l1, s2bgn);
+      strcat(l1, s2end);
+    }
+
+    if ((w == sqRead_unset) || (((w & sqRead_corrected) != 0) && ((w & sqRead_compressed) == 0))) {
+      active |= seqs->sqStore_isValidRead(rid, sqRead_corrected);
+      strcat(l1, s3len);
+      strcat(l1, s3bgn);
+      strcat(l1, s3end);
+    }
 
-    fprintf(stdout, "%10" F_U32P " %10" F_U32P " %s %s %s %s %s %s %s %s %s %s %s %s %8" F_U64P " %10" F_U64P "\n",
-            rid,
-            seqs->sqStore_getLibraryIDForRead(rid),
-            s1len, s1bgn, s1end,
-            s2len, s2bgn, s2end,
-            s3len, s3bgn, s3end,
-            s4len, s4bgn, s4end,
-            seqs->sqStore_getMeta(rid)->sqRead_mSegm(),
-            seqs->sqStore_getMeta(rid)->sqRead_mByte());
+    if ((w == sqRead_unset) || (((w & sqRead_corrected) != 0) && ((w & sqRead_compressed) != 0))) {
+      active |= seqs->sqStore_isValidRead(rid, sqRead_corrected);
+      strcat(l1, s4len);
+      strcat(l1, s4bgn);
+      strcat(l1, s4end);
+    }
+
+    if (active)
+      fprintf(stdout, "%10" F_U32P " %10" F_U32P "%s  %s %4lu %12lu\n",
+              rid,
+              seqs->sqStore_getLibraryIDForRead(rid),
+              l1,
+              flags,
+              seqs->sqStore_getMeta(rid)->sqRead_mSegm(),
+              seqs->sqStore_getMeta(rid)->sqRead_mByte());
+  }
+}
+
+
+void
+doSummarize_lengthHistogram(vector<uint64> lengths,
+                            uint64         genomeSize,
+                            bool           limitTo1x);
+
+
+void
+dumpHistogram(sqStore *seqs, uint32 bgnID, uint32 endID, sqRead_which w, uint64 genomeSize, bool wantLengths) {
+  vector<uint64>  lengths;
+
+  uint64          nSeqs  = 0;
+  uint64          nBases = 0;
+
+  //  Build a vector of sequence lengths, pass that to 'sequence' to
+  //  generate a pretty picture.
+
+  for (uint32 rid=bgnID; rid<=endID; rid++) {
+    if (seqs->sqStore_isValidRead(rid, w) == false)     //  Skip invalid reads.
+      continue;
+
+    if (seqs->sqStore_isIgnoredRead(rid, w) == true)    //  Skip ignored reads.
+      continue;
+
+    if (seqs->sqStore_isTrimmedRead(rid, w) == false)   //  Skip untrimmed reads,
+      if (w & sqRead_trimmed)                           //  if we want trimmed reads.
+        continue;
+
+    lengths.push_back(seqs->sqStore_getReadLength(rid, w));
+  }
+
+  if (wantLengths == false) {
+    doSummarize_lengthHistogram(lengths, genomeSize, false);
+  }
+
+  else {
+    sort(lengths.begin(), lengths.end(), less<uint64>());
+
+    for (uint64 ii=0; ii<lengths.size(); ii++)
+      fprintf(stdout, "%lu\n", lengths[ii]);
   }
 }
 
@@ -149,13 +306,19 @@ int
 main(int argc, char **argv) {
   char            *seqStoreName      = NULL;
 
+  sqRead_which     which             = sqRead_unset;
+
   bool             wantLibs          = false;
   bool             wantReads         = true;
-  bool             wantStats         = false;  //  Useful only for reads
+  bool             wantStats         = false;
+  bool             wantHistogram     = false;
+  bool             wantLengths       = false;
 
   uint32           bgnID             = 1;
   uint32           endID             = UINT32_MAX;
 
+  uint64           genomeSize        = 0;
+
   argc = AS_configure(argc, argv);
 
   int arg = 1;
@@ -163,32 +326,72 @@ main(int argc, char **argv) {
   while (arg < argc) {
     if        (strcmp(argv[arg], "-S") == 0) {
       seqStoreName = argv[++arg];
+    }
+
+    else if (strcmp(argv[arg], "-libs") == 0) {
+      wantLibs      = true;
+      wantReads     = false;
+      wantStats     = false;
+      wantHistogram = false;
+      wantLengths   = false;
+    }
+
+    else if (strcmp(argv[arg], "-reads") == 0) {
+      wantLibs      = false;
+      wantReads     = true;
+      wantStats     = false;
+      wantHistogram = false;
+      wantLengths   = false;
+    }
+
+    else if (strcmp(argv[arg], "-stats") == 0) {
+      wantLibs      = false;
+      wantReads     = false;
+      wantStats     = true;
+      wantHistogram = false;
+      wantLengths   = false;
+    }
 
-    } else if (strcmp(argv[arg], "-libs") == 0) {
-      wantLibs  = true;
-      wantReads = false;
-      wantStats = false;
+    else if (strcmp(argv[arg], "-histogram") == 0) {
+      wantLibs      = false;
+      wantReads     = false;
+      wantStats     = false;
+      wantHistogram = true;
+      wantLengths   = false;
+      genomeSize    = strtouint64(argv[++arg]);
+    }
 
-    } else if (strcmp(argv[arg], "-reads") == 0) {
-      wantLibs  = false;
-      wantReads = true;
-      wantStats = false;
+    else if (strcmp(argv[arg], "-lengths") == 0) {
+      wantLibs      = false;
+      wantReads     = false;
+      wantStats     = false;
+      wantHistogram = true;
+      wantLengths   = true;
+    }
 
-    } else if (strcmp(argv[arg], "-stats") == 0) {
-      wantLibs  = false;
-      wantReads = false;
-      wantStats = true;
+    else if (strcmp(argv[arg], "-raw") == 0) {
+      which |=  sqRead_raw;
+      which &= ~sqRead_corrected;
+    }
 
-    } else if (strcmp(argv[arg], "-b") == 0) {
-      bgnID = atoi(argv[++arg]);
+    else if (strcmp(argv[arg], "-corrected") == 0) {
+      which &= ~sqRead_raw;
+      which |=  sqRead_corrected;
+    }
 
-    } else if (strcmp(argv[arg], "-e") == 0) {
-      endID  = atoi(argv[++arg]);
+    else if (strcmp(argv[arg], "-trimmed") == 0) {
+      which |=  sqRead_trimmed;
+    }
 
-    } else if (strcmp(argv[arg], "-r") == 0) {
+    else if (strcmp(argv[arg], "-compressed") == 0) {
+      which |=  sqRead_compressed;
+    }
+
+    else if (strcmp(argv[arg], "-r") == 0) {
       decodeRange(argv[++arg], bgnID, endID);
+    }
 
-    } else {
+    else {
       err++;
       fprintf(stderr, "ERROR: unknown option '%s'\n", argv[arg]);
     }
@@ -201,17 +404,24 @@ main(int argc, char **argv) {
   if (err) {
     fprintf(stderr, "usage: %s -S seqStore [p] [...]\n", argv[0]);
     fprintf(stderr, "\n");
+    fprintf(stderr, "INPUTS:\n");
     fprintf(stderr, "  -S seqStore      dump reads from 'seqStore'\n");
     fprintf(stderr, "\n");
+    fprintf(stderr, "OUTPUT FORMAT:\n");
     fprintf(stderr, "  -libs            dump information about libraries\n");
     fprintf(stderr, "  -reads           dump information about reads\n");
     fprintf(stderr, "\n");
     fprintf(stderr, "  -stats           dump summary statistics on reads\n");
     fprintf(stderr, "\n");
-    fprintf(stderr, "  -b id            output starting at read/library 'id'     DEPRECATED\n");
-    fprintf(stderr, "  -e id            output stopping after read/library 'id'  DEPRECATED");
+    fprintf(stderr, "  -histogram G     dump a length histogram using genome size G\n");
+    fprintf(stderr, "  -lengths         dump just the (sorted) read lengths\n");
     fprintf(stderr, "\n");
+    fprintf(stderr, "READ SELECTION:\n");
     fprintf(stderr, "  -r bgn[-end]     output reads/libraies from `bgn` to `end`, inclusive\n");
+    fprintf(stderr, "  -raw             restrict to 'raw' reads\n");
+    fprintf(stderr, "  -corrected       restrict to 'corrected' reads\n");
+    fprintf(stderr, "  -trimmed         restrict to 'trimmed' reads\n");
+    fprintf(stderr, "  -compressed      restrict to 'compressed' reads\n");
     fprintf(stderr, "\n");
 
     if (seqStoreName == NULL)
@@ -220,6 +430,8 @@ main(int argc, char **argv) {
     exit(1);
   }
 
+  sqRead_setDefaultVersion(which);
+
   sqStore    *seqStore  = new sqStore(seqStoreName, sqStore_readOnly);
   uint32      numReads  = seqStore->sqStore_lastReadID();
   uint32      numLibs   = seqStore->sqStore_lastLibraryID();
@@ -243,11 +455,13 @@ main(int argc, char **argv) {
     dumpLibs(seqStore, bgnID, endID);
 
   if (wantReads)
-    dumpReads(seqStore, bgnID, endID);
+    dumpReads(seqStore, bgnID, endID, which);
 
   if (wantStats)
     dumpStats(seqStore, bgnID, endID);
 
+  if (wantHistogram)
+    dumpHistogram(seqStore, bgnID, endID, which, genomeSize, wantLengths);
 
   delete seqStore;
 
diff --git a/src/stores/sqStoreDumpMetaData.mk b/src/stores/sqStoreDumpMetaData.mk
index adb7d0230..42e4bf636 100644
--- a/src/stores/sqStoreDumpMetaData.mk
+++ b/src/stores/sqStoreDumpMetaData.mk
@@ -9,9 +9,9 @@ ifeq "$(strip ${TARGET_DIR})" ""
 endif
 
 TARGET   := sqStoreDumpMetaData
-SOURCES  := sqStoreDumpMetaData.C
+SOURCES  := sqStoreDumpMetaData.C ../sequence/sequence-summarize.C
 
-SRC_INCDIRS := .. ../utility
+SRC_INCDIRS := .. ../utility ../sequence
 
 TGT_LDFLAGS := -L${TARGET_DIR}/lib
 TGT_LDLIBS  := -lcanu