Skip to content

Commit

Permalink
Add initial (and poor) version of an overlap statistics generator, co…
Browse files Browse the repository at this point in the history
…mmitted

because the next version is going to be a major rewrite.
  • Loading branch information
brianwalenz committed Oct 27, 2015
1 parent 73534e0 commit 2b8806e
Show file tree
Hide file tree
Showing 5 changed files with 995 additions and 12 deletions.
339 changes: 327 additions & 12 deletions src/AS_UTL/stddev.H
Original file line number Diff line number Diff line change
Expand Up @@ -29,42 +29,45 @@
#include "AS_global.H"

#include <vector>
#include <algorithm>

using namespace std;


// Note: Does not work well with unsigned types. The 'smallest' compute can underflow.


template<typename TT>
void
computeStdDev(vector<TT> dist, TT &mean, TT &median, TT &mode, double &stddev) {
computeStdDev(vector<TT> dist, double &mean, double &stddev, bool isSorted=false) {
mean = 0;
median = 0;
mode = 0;
stddev = 0;

if (dist.size() == 0)
return;

// Sort the values. Lets us approximate the stddev for filtering out outliers,
// and makes finding the mode simple.
// Sort the values. Lets us approximate the stddev for filtering out outliers.

sort(dist.begin(), dist.end());
if (isSorted == false)
sort(dist.begin(), dist.end());

// Approximate the stddev to filter out outliers. This is done by assuming we're normally
// distributed, finding the values that would represent 1 standard deviation (about 68.27% of the
// data), and using that to find the 5 std.dev. limits.

median = dist[dist.size() * 1 / 2];
TT median = dist[1 * dist.size() / 2];

TT oneThird = dist[dist.size() * 1 / 3];
TT twoThird = dist[dist.size() * 2 / 3];
TT oneThird = dist[1 * dist.size() / 3];
TT twoThird = dist[2 * dist.size() / 3];

TT approxStd = max(median - oneThird, twoThird - median);

TT biggest = median + approxStd * 5;
TT smallest = median - approxStd * 5;

// Now, compute the number of samples within our bounts. And find the mean, too.
fprintf(stderr, "computeStdDev %d %d %d %d %d %d\n", median, oneThird, twoThird, approxStd, biggest, smallest);

// Now, compute the number of samples within our bounds. And find the mean, too.

size_t numSamples = 0;

Expand All @@ -89,9 +92,23 @@ computeStdDev(vector<TT> dist, TT &mean, TT &median, TT &mode, double &stddev) {

if (numSamples > 1)
stddev = sqrt(stddev / (numSamples - 1));
};



// Compute the mode. Once the values are sorted, we just need to scan the list and remember the
// most common value.
//
template<typename TT>
void
computeMode(vector<TT> dist, TT &mode, bool isSorted=false) {
mode = 0;

if (dist.size() == 0)
return;

// And the mode. The values are sorted, so we just need to scan the list and remember the
// most common value.
if (isSorted == false)
sort(dist.begin(), dist.end());

uint32 modeCnt = 0;
TT modeVal = 0;
Expand Down Expand Up @@ -119,6 +136,42 @@ computeStdDev(vector<TT> dist, TT &mean, TT &median, TT &mode, double &stddev) {
}

mode = modeVal;
}



// Comptue the median and median absolute deviation. Sort the values to find the median, then
// build a new vector of |median - x| and find the median of that.
//
template<typename TT>
void
computeMedianAbsoluteDeviation(vector<TT> dist, TT &median, TT &mad, bool isSorted=false) {
median = 0;
mad = 0;

if (dist.size() == 0)
return;

if (isSorted == false)
sort(dist.begin(), dist.end());

// Technically, if there are an even number of values, the median should be the average of the two
// in the middle.

median = dist[ dist.size()/2 ];

vector<TT> m;

for (uint64 ii=0; ii<dist.size(); ii++) {
if (dist[ii] < median)
m.push_back(median - dist[ii]);
else
m.push_back(dist[ii] - median);
}

sort(m.begin(), m.end());

mad = m[ m.size()/2 ];
};


Expand All @@ -134,4 +187,266 @@ computeExponentialMovingAverage(TT alpha, TT ema, TT value) {










template<typename TT>
class genericStatistics {
public:
genericStatistics() {

_finalized = false;

_mean = 0.0;
_stddev = 0.0;

_mode = 0;

_median = 0;
_mad = 0;

};
~genericStatistics() {
};

void add(TT data) {
_finalized = false;
_data.push_back(data);
};


uint64 numberOfObjects(void) {
finalizeData();
return(_data.size());
}

double mean(void) {
finalizeData();
return(_mean);
}
double stddev(void) {
finalizeData();
return(_stddev);
};

TT median(void) {
finalizeData();
return(_median);
};
TT mad(void) { // Median Absolute Deviation
finalizeData();
return(_mad);
};

vector<uint64> &histogram(void) { // Returns pointer to private histogram data
finalizeData();
return(&_histogram);
};

vector<uint64> &Nstatistics(void) { // Returns pointer to private N data
finalizeData();
return(&_Nstatistics);
};

void finalizeData(void) {
if (_finalized == true)
return;

computeStdDev(_data, _mean, _stddev); // Filters out outliers
computeMode(_data, _mode); // Mo filtering
computeMedianAbsoluteDeviation(_data, _median, _mad); // No filtering

_finalized = true;
};


private:
bool _finalized;

vector<TT> _data;

double _mean;
double _stddev;

TT _mode;

TT _median;
TT _mad;

vector<uint64> _histogram;
vector<uint64> _Nstatistics;
};








class histogramStatistics {
public:
histogramStatistics() {
_histogramAlloc = 1024 * 1024;
_histogramMax = 0;
_histogram = new uint64 [_histogramAlloc];

memset(_histogram, 0, sizeof(uint64) * _histogramAlloc);

_finalized = false;

clearStatistics();
};
~histogramStatistics() {
delete [] _histogram;
};

void add(uint64 data) {
if (_histogramAlloc < data)
resizeArray(_histogram, _histogramMax+1, _histogramAlloc, _histogramAlloc * 2, resizeArray_copyData | resizeArray_clearNew);

if (_histogramMax < data)
_histogramMax = data;

_histogram[data]++;
_finalized = false;
};


uint64 numberOfObjects(void) { finalizeData(); return(_numObjs); };

double mean(void) { finalizeData(); return(_mean); };
double stddev(void) { finalizeData(); return(_stddev); };

uint64 median(void) { finalizeData(); return(_median); };
uint64 mad(void) { finalizeData(); return(_mad); };

#if 0
vector<uint64> &histogram(void) { // Returns pointer to private histogram data
finalizeData();
return(&_histogram);
};

vector<uint64> &Nstatistics(void) { // Returns pointer to private N data
finalizeData();
return(&_Nstatistics);
};
#endif

void clearStatistics(void) {
_numObjs = 0;

_mean = 0.0;
_stddev = 0.0;

_mode = 0;

_median = 0;
_mad = 0;
};

void finalizeData(void) {

if (_finalized == true)
return;

clearStatistics();

// Compute number of objects

for (uint64 ii=0; ii <= _histogramMax; ii++)
_numObjs += _histogram[ii];

// Compute mean and stddev

for (uint64 ii=0; ii <= _histogramMax; ii++)
_mean += ii * _histogram[ii];

_mean /= _numObjs;

for (uint64 ii=0; ii <= _histogramMax; ii++)
_stddev += _histogram[ii] * (ii - _mean) * (ii - _mean);

if (_numObjs > 1)
_stddev = sqrt(_stddev / (_numObjs - 1));

// Compute mode

for (uint64 ii=0; ii <= _histogramMax; ii++)
if (_histogram[ii] > _histogram[_mode])
_mode = ii;

// Compute median and mad

for (uint64 ii=0; ii <= _histogramMax; ii++)
if (_median < _numObjs / 2) {
_median += _histogram[ii];
} else {
_median = ii;
break;
}

uint64 *maddata = new uint64 [_histogramAlloc];

memset(maddata, 0, sizeof(uint64) * _histogramAlloc);

for (uint64 ii=0; ii <= _histogramMax; ii++) {
uint64 mad = (ii < _median) ? (_median - ii) : (ii - _median);
maddata[mad] += _histogram[ii];
}

for (uint64 ii=0; ii <= _histogramMax; ii++)
if (_mad < _numObjs / 2) {
_mad += maddata[ii];
} else {
_mad = ii;
break;
}

// And, done.

_finalized = true;
};

uint64 histogram(uint64 ii) {
return(_histogram[ii]);
};
uint64 histogramMax(void) {
return(_histogramMax);
};

void writeHistogram(FILE *F, char *label) {
fprintf(F, "#%s\tquantity\n", label);

for (uint64 ii=0; ii <= _histogramMax; ii++)
fprintf(F, F_U64"\t"F_U64"\n", ii, _histogram[ii]);
};


private:
bool _finalized;

uint64 _histogramAlloc; // Maximum allocated value
uint64 _histogramMax; // Maximum valid value
uint64 *_histogram;

uint64 _numObjs;

double _mean;
double _stddev;

uint64 _mode;

uint64 _median;
uint64 _mad;
};





#endif // STDDEV_H
1 change: 1 addition & 0 deletions src/main.mk
Original file line number Diff line number Diff line change
Expand Up @@ -147,6 +147,7 @@ SUBMAKEFILES := stores/gatekeeperCreate.mk \
stores/ovStoreSorter.mk \
stores/ovStoreIndexer.mk \
stores/ovStoreDump.mk \
stores/ovStoreStats.mk \
stores/tgStoreDump.mk \
stores/tgStoreLoad.mk \
stores/tgStoreFilter.mk \
Expand Down
Loading

0 comments on commit 2b8806e

Please sign in to comment.