Skip to content

Commit

Permalink
Implement a histogram that supports kmers that occur more than 32 mil…
Browse files Browse the repository at this point in the history
…lion times. Issue marbl#1182.
  • Loading branch information
brianwalenz committed Feb 12, 2019
1 parent c538153 commit dc859c7
Show file tree
Hide file tree
Showing 5 changed files with 107 additions and 63 deletions.
36 changes: 15 additions & 21 deletions src/meryl/merylOp-histogram.C
Original file line number Diff line number Diff line change
Expand Up @@ -43,15 +43,12 @@ merylOperation::reportHistogram(void) {

// Now just dump it.

for (uint32 frequency=1; frequency<stats->numFrequencies(); frequency++) {
uint64 nkmers = stats->numKmersAtFrequency(frequency);
for (uint32 ii=0; ii<stats->histogramLength(); ii++) {
uint32 value = stats->histogramValue(ii);
uint64 occur = stats->histogramOccurrences(ii);

if (nkmers > 0)
fprintf(stdout, F_U32 "\t" F_U64 "\n", frequency, nkmers);
fprintf(stdout, F_U32 "\t" F_U64 "\n", value, occur);
}

//for (uint32 ii=0; ii<stats->_hbigLen; ii++) {
//}
}


Expand Down Expand Up @@ -89,24 +86,21 @@ merylOperation::reportStatistics(void) {
fprintf(stdout, "frequency kmers distinct total (1e-6)\n");
fprintf(stdout, "--------- ------------ ------------ ------------ ------------\n");

for (uint32 frequency=1; frequency<stats->numFrequencies(); frequency++) {
uint64 nkmers = stats->numKmersAtFrequency(frequency);
for (uint32 ii=0; ii<stats->histogramLength(); ii++) {
uint32 value = stats->histogramValue(ii);
uint64 occur = stats->histogramOccurrences(ii);

sDistinct += nkmers;
sTotal += nkmers * frequency;
sDistinct += occur;
sTotal += occur * value;

if (nkmers > 0)
fprintf(stdout, "%9" F_U32P " %12" F_U64P " %12.4f %12.4f %12.6f\n",
frequency,
nkmers,
(double)sDistinct / stats->numDistinct(),
(double)sTotal / stats->numTotal(),
(double)frequency / stats->numTotal() * 1000000.0);
fprintf(stdout, "%9" F_U32P " %12" F_U64P " %12.4f %12.4f %12.6f\n",
value,
occur,
(double)sDistinct / stats->numDistinct(),
(double)sTotal / stats->numTotal(),
(double)value / stats->numTotal() * 1000000.0);
}

assert(sDistinct == stats->numDistinct());
assert(sTotal == stats->numTotal());

//for (uint32 ii=0; ii<stats->_hbigLen; ii++) {
//}
}
6 changes: 3 additions & 3 deletions src/meryl/merylOp-nextMer.C
Original file line number Diff line number Diff line change
Expand Up @@ -101,11 +101,11 @@ merylOperation::initializeThreshold(void) {
uint64 nKmers = 0;
uint64 nKmersTarget = _fracDist * stats->numDistinct();

for (uint32 ii=0; ii<stats->numFrequencies(); ii++) {
nKmers += stats->numKmersAtFrequency(ii);
for (uint32 ii=0; ii<stats->histogramLength(); ii++) {
nKmers += stats->histogramOccurrences(ii);

if (nKmers >= nKmersTarget) {
_threshold = ii;
_threshold = stats->histogramValue(ii);
break;
}
}
Expand Down
11 changes: 7 additions & 4 deletions src/utility/kmers-exact.C
Original file line number Diff line number Diff line change
Expand Up @@ -61,8 +61,11 @@ kmerCountExactLookup::initialize(kmerCountFileReader *input_,
if (minValue_ == 0)
minValue_ = 1;

if (maxValue_ == UINT32_MAX)
maxValue_ = input_->stats()->maxFrequency();
if (maxValue_ == UINT32_MAX) {
uint32 nV = input_->stats()->histogramLength();

maxValue_ = input_->stats()->histogramValue(nV - 1);
}

// Now initialize filtering!

Expand Down Expand Up @@ -93,8 +96,8 @@ kmerCountExactLookup::initialize(kmerCountFileReader *input_,

// Scan the histogram to count the number of kmers in range.

for (uint32 ii=_minValue; ii<=_maxValue; ii++)
_nSuffix += input_->stats()->numKmersAtFrequency(ii);
for (uint32 ii=0; ii<input_->stats()->histogramLength(); ii++)
_nSuffix += input_->stats()->histogramOccurrences(ii);

_prePtrBits = countNumberOfBits64(_nSuffix); // Width of an entry in the prefix table.
_prePtrBits = 64;
Expand Down
80 changes: 65 additions & 15 deletions src/utility/kmers-statistics.C
Original file line number Diff line number Diff line change
Expand Up @@ -40,17 +40,16 @@ kmerCountStatistics::kmerCountStatistics() {
for (uint64 ii=0; ii<_histMax; ii++)
_hist[ii] = 0;

_hbigLen = 0;
_hbigMax = 0;
_hbigCount = NULL;
_hbigNumber = NULL;
_histLen = 0;
_histVs = NULL;
_histOs = NULL;
}


kmerCountStatistics::~kmerCountStatistics() {
delete [] _hist;
delete [] _hbigCount;
delete [] _hbigNumber;
delete [] _histVs;
delete [] _histOs;
}


Expand All @@ -64,7 +63,14 @@ kmerCountStatistics::clear(void) {
for (uint64 ii=0; ii<_histMax; ii++)
_hist[ii] = 0;

_hbigLen = 0;
_histBig.clear();

delete [] _histVs;
delete [] _histOs;

_histLen = 0;
_histVs = NULL;
_histOs = NULL;
}


Expand All @@ -86,11 +92,18 @@ kmerCountStatistics::dump(stuffedBits *bits) {
histLast++;

bits->setBinary(32, histLast); // Out of order relative to struct to keep
bits->setBinary(32, _hbigLen); // the arrays below word-aligned.
bits->setBinary(32, _histBig.size()); // the arrays below word-aligned.

bits->setBinary(64, histLast, _hist); // Easy one, just dump an array.

bits->setBinary(64, histLast, _hist);
bits->setBinary(64, _hbigLen, _hbigCount);
bits->setBinary(64, _hbigLen, _hbigNumber);
// The histBig map is a little trickier. The original format wrote two arrays,
// _hbigCount then _hbigNumber. But those were always empty and never used.
// We're therefore free to change the order.

for (map<uint64,uint64>::iterator it=_histBig.begin(); it != _histBig.end(); it++) {
bits->setBinary(64, it->first); // Value
bits->setBinary(64, it->second); // Number of occurrences
}
}


Expand All @@ -109,19 +122,56 @@ kmerCountStatistics::dump(FILE *outFile) {
void
kmerCountStatistics::load(stuffedBits *bits) {
uint32 histLast;
uint32 histBigLen;

_numUnique = bits->getBinary(64);
_numDistinct = bits->getBinary(64);
_numTotal = bits->getBinary(64);

histLast = bits->getBinary(32);
_hbigLen = bits->getBinary(32);
histBigLen = bits->getBinary(32);

assert(_hist != NULL);
delete [] _hist;

_hist = new uint64 [histLast + 1];
_hist = bits->getBinary(64, histLast, _hist);
_hbigCount = bits->getBinary(64, _hbigLen);
_hbigNumber = bits->getBinary(64, _hbigLen);

// Count how many non-zero histogram values there are.

_histLen = histBigLen;

for (uint32 ii=0; ii<histLast; ii++)
if (_hist[ii] > 0)
_histLen++;

// Allocate space for them.

_histVs = new uint32 [_histLen];
_histOs = new uint64 [_histLen];

// Set the ones we've loaded already.

_histLen = 0;

for (uint32 ii=0; ii<histLast; ii++)
if (_hist[ii] > 0) {
_histVs[_histLen] = ii;
_histOs[_histLen] = _hist[ii];
_histLen++;
}

// Load the rest from disk.

for (uint32 ii=0; ii<histBigLen; ii++) {
_histVs[_histLen] = bits->getBinary(64);
_histOs[_histLen] = bits->getBinary(64);
_histLen++;
}

// Delete _hist to indicate we cannot accept new values.

delete [] _hist;
_hist = NULL;
}


Expand Down
37 changes: 17 additions & 20 deletions src/utility/kmers.H
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,11 @@
#include "files.H"
#include "bits.H"

#include <map>

using namespace std;


// merSize 1 NOT supported. Fails _leftShift.

#undef SHOW_LOAD
Expand Down Expand Up @@ -269,10 +274,10 @@ public:
_numDistinct += 1;
_numTotal += count;

if (count < _histMax) {
if (count < _histMax)
_hist[count]++;
return;
}
else
_histBig[count]++;
};

void clear(void);
Expand All @@ -287,31 +292,23 @@ public:
uint64 numDistinct(void) { return(_numDistinct); };
uint64 numTotal(void) { return(_numTotal); };

uint32 numFrequencies(void) { return(_histMax); };
uint64 numKmersAtFrequency(uint32 ii) { return(_hist[ii]); };
double wordFrequencyAtFrequency(uint32 ii) { return(_hist[ii] * 1000000.0 / _numTotal); };

uint32 maxFrequency(void) {
uint32 maxV = _histMax - 1;

while ((maxV > 0) && (numKmersAtFrequency(maxV) == 0))
maxV--;

return(maxV);
};
uint32 histogramLength(void) { return(_histLen); };
uint32 histogramValue(uint32 i) { return(_histVs[i]); };
uint64 histogramOccurrences(uint32 i) { return(_histOs[i]); };

private:
uint64 _numUnique;
uint64 _numDistinct;
uint64 _numTotal;

uint32 _histMax; // Max count that can be stored in _hist.
uint32 _histMax; // Max value that can be stored in _hist.
uint64 *_hist;
map<uint64, uint64> _histBig; // Values bigger than _histMax; <value,occurrances>

uint64 _histLen; // If loaded from disk, this is the unpacked histogram.
uint32 *_histVs;
uint64 *_histOs;

uint32 _hbigLen; // Counts bigger than _histMax are stored
uint32 _hbigMax; // as unsorted arrays.
uint64 *_hbigCount;
uint64 *_hbigNumber;
};


Expand Down

0 comments on commit dc859c7

Please sign in to comment.