Skip to content

Commit

Permalink
Store breadcrumbs to quickly estimate overlap score in the ovlStore. …
Browse files Browse the repository at this point in the history
…Change overlap scores to

16-bit values.  Use estimates instead of actual values when settting score threshold for correction.
  • Loading branch information
brianwalenz committed Sep 8, 2017
1 parent 1baf7ad commit e17ab5b
Show file tree
Hide file tree
Showing 9 changed files with 541 additions and 133 deletions.
51 changes: 39 additions & 12 deletions src/correction/computeGlobalScore.C
Original file line number Diff line number Diff line change
Expand Up @@ -34,12 +34,12 @@
using namespace std;


uint64
uint16
globalScore::compute(uint32 ovlLen,
ovOverlap *ovl,
uint32 expectedCoverage,
uint32 thresholdsLen,
uint64 *thresholds) {
uint16 *thresholds) {

// Build a list of all the overlap scores. Ignore
// overlaps that are too bad/good or too short/long.
Expand All @@ -55,7 +55,7 @@ globalScore::compute(uint32 ovlLen,
(ovlLength < minOvlLength) || (maxOvlLength < ovlLength))
continue;

hist[histLen++] = ovl[oo].correctionScore();
hist[histLen++] = ovl[oo].overlapScore();
}

// Sort, reversely.
Expand All @@ -67,7 +67,7 @@ globalScore::compute(uint32 ovlLen,

// Figure out our threshold score. Any overlap with score below this should be filtered.

uint64 threshold = (expectedCoverage < histLen) ? hist[expectedCoverage] : 0;
uint16 threshold = (expectedCoverage < histLen) ? hist[expectedCoverage] : 0;

for (uint32 ii=0; ii<thresholdsLen; ii++) {
if (ii * 10 < histLen)
Expand All @@ -85,7 +85,7 @@ globalScore::compute(uint32 ovlLen,

for (uint32 oo=0; oo<ovlLen; oo++) {
uint64 ovlLength = ovl[oo].a_len();
uint64 ovlScore = ovl[oo].correctionScore();
uint16 ovlScore = ovl[oo].overlapScore();

bool isC = false;
bool isD = false;
Expand Down Expand Up @@ -132,19 +132,46 @@ globalScore::compute(uint32 ovlLen,

double fractionFiltered = (double)belowCutoffLocal / histLen;

if (fractionFiltered < 0.00) stats->reads00OlapsFiltered++;
if (fractionFiltered < 0.50) stats->reads50OlapsFiltered++;
if (fractionFiltered < 0.80) stats->reads80OlapsFiltered++;
if (fractionFiltered < 0.95) stats->reads95OlapsFiltered++;
if (fractionFiltered < 1.00) stats->reads99OlapsFiltered++;
if (fractionFiltered <= 0.00) stats->reads00OlapsFiltered++;
if (fractionFiltered <= 0.50) stats->reads50OlapsFiltered++;
if (fractionFiltered <= 0.80) stats->reads80OlapsFiltered++;
if (fractionFiltered <= 0.95) stats->reads95OlapsFiltered++;
if (fractionFiltered <= 1.00) stats->reads99OlapsFiltered++;

if (logFile)
if (histLen <= expectedCoverage)
fprintf(logFile, "%9u - %6u overlaps - %6u scored - %6u filtered - %4u saved (no filtering)\n",
ovl[0].a_iid, ovlLen, histLen, 0, histLen);
else
fprintf(logFile, "%9u - %6u overlaps - %6u scored - %6u filtered - %4u saved (length * erate cutoff %.2f)\n",
ovl[0].a_iid, ovlLen, histLen, belowCutoffLocal, histLen - belowCutoffLocal, threshold / 100.0);
fprintf(logFile, "%9u - %6u overlaps - %6u scored - %6u filtered - %4u saved (threshold %u)\n",
ovl[0].a_iid, ovlLen, histLen, belowCutoffLocal, histLen - belowCutoffLocal, threshold);

return(threshold);
}



void
globalScore::estimate(uint32 ovlLen,
uint32 expectedCoverage) {
double fractionFiltered = 0;

stats->totalOverlaps += ovlLen;

if (ovlLen < expectedCoverage) {
stats->retained += ovlLen;

} else {
fractionFiltered = (double)(ovlLen - expectedCoverage) / ovlLen;

stats->retained += expectedCoverage;
stats->belowCutoff += ovlLen - expectedCoverage;
}

if (fractionFiltered <= 0.00) stats->reads00OlapsFiltered++;
if (fractionFiltered <= 0.50) stats->reads50OlapsFiltered++;
if (fractionFiltered <= 0.80) stats->reads80OlapsFiltered++;
if (fractionFiltered <= 0.95) stats->reads95OlapsFiltered++;
if (fractionFiltered <= 1.00) stats->reads99OlapsFiltered++;
}

7 changes: 5 additions & 2 deletions src/correction/computeGlobalScore.H
Original file line number Diff line number Diff line change
Expand Up @@ -96,11 +96,14 @@ public:
delete stats;
};

uint64 compute(uint32 ovlLen,
uint16 compute(uint32 ovlLen,
ovOverlap *ovl,
uint32 expectedCoverage,
uint32 thresholdsLen,
uint64 *thresholds);
uint16 *thresholds);

void estimate(uint32 ovlLen,
uint32 expectedCoverage);

uint64 totalOverlaps(void) { return(stats->totalOverlaps); };
uint64 lowErate(void) { return(stats->lowErate); };
Expand Down
137 changes: 96 additions & 41 deletions src/correction/filterCorrectionOverlaps.C
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,26 @@

using namespace std;




FILE *
openOutput(char *fileName, bool doOpen) {
errno = 0;

if ((doOpen == false) || (fileName == NULL))
return(NULL);

FILE *F = fopen(fileName, "w");

if (errno)
fprintf(stderr, "ERROR: failed to open '%s' for writing: %s\n", fileName, strerror(errno)), exit(1);

return(F);
}



int
main(int argc, char **argv) {
char *gkpStoreName = NULL;
Expand All @@ -52,6 +72,10 @@ main(int argc, char **argv) {
char logFileName[FILENAME_MAX];
char statsFileName[FILENAME_MAX];

bool doEstimate = true;
bool doExact = false;
bool doCompare = false;

bool noLog = false;
bool noStats = false;

Expand All @@ -63,8 +87,6 @@ main(int argc, char **argv) {
double maxErate = 1.0;
double minErate = 1.0;

bool legacyScore = false;

argc = AS_configure(argc, argv);

int32 arg = 1;
Expand All @@ -80,6 +102,20 @@ main(int argc, char **argv) {
scoreFileName = argv[++arg];


} else if (strcmp(argv[arg], "-estimate") == 0) {
doEstimate = true;
doExact = false;

} else if (strcmp(argv[arg], "-exact") == 0) {
doEstimate = false;
doExact = true;

} else if (strcmp(argv[arg], "-compare") == 0) {
doEstimate = true;
doExact = true;
doCompare = true;


} else if (strcmp(argv[arg], "-c") == 0) {
expectedCoverage = atoi(argv[++arg]);

Expand All @@ -97,9 +133,6 @@ main(int argc, char **argv) {
} else if (strcmp(argv[arg], "-nostats") == 0) {
noStats = true;

} else if (strcmp(argv[arg], "-legacy") == 0) {
legacyScore = true;

} else {
fprintf(stderr, "ERROR: invalid arg '%s'\n", argv[arg]);
err++;
Expand All @@ -126,6 +159,11 @@ main(int argc, char **argv) {
fprintf(stderr, " per-read logging to 'scoreFile.log' (see -nolog)\n");
fprintf(stderr, " summary statistics to 'scoreFile.stats' (see -nostats)\n");
fprintf(stderr, "\n");
fprintf(stderr, " -estimate estimate the cutoff from precomputed scores\n");
fprintf(stderr, " -exact compute an exact cutoff by reading all overlaps\n");
fprintf(stderr, "\n");
fprintf(stderr, " -compare output a comparison of estimated vs exact scores\n");
fprintf(stderr, "\n");
fprintf(stderr, " -c coverage retain at most this many overlaps per read\n");
fprintf(stderr, "\n");
fprintf(stderr, " -l length filter overlaps shorter than this length\n");
Expand All @@ -134,6 +172,8 @@ main(int argc, char **argv) {
fprintf(stderr, " example: -e 0.05-0.20 filter overlaps below 5%% error\n");
fprintf(stderr, " or above 20%% error\n");
fprintf(stderr, "\n");
fprintf(stderr, " Length and Fraction Error filtering NOT SUPPORTED with -estimate.\n");
fprintf(stderr, "\n");
fprintf(stderr, " -nolog don't create 'scoreFile.log'\n");
fprintf(stderr, " -nostats don't create 'scoreFile.stats'\n");

Expand All @@ -156,55 +196,68 @@ main(int argc, char **argv) {
minErate = 0.0;
}

uint32 maxEvalue = AS_OVS_encodeEvalue(maxErate);
uint32 minEvalue = AS_OVS_encodeEvalue(minErate);;
uint32 maxEvalue = AS_OVS_encodeEvalue(maxErate);
uint32 minEvalue = AS_OVS_encodeEvalue(minErate);;

gkStore *gkpStore = gkStore::gkStore_open(gkpStoreName);
gkStore *gkpStore = gkStore::gkStore_open(gkpStoreName);

ovStore *inpStore = new ovStore(ovlStoreName, gkpStore);
ovStore *ovlStore = new ovStore(ovlStoreName, gkpStore);
ovStoreHistogram *ovlHisto = ovlStore->getHistogram();

uint64 *scores = new uint64 [gkpStore->gkStore_getNumReads() + 1];
uint32 *numOlaps = ovlStore->numOverlapsPerRead();

uint32 ovlLen = 0;
uint32 ovlMax = 131072;
ovOverlap *ovl = (doExact == true) ? ovOverlap::allocateOverlaps(gkpStore, ovlMax) : NULL;

snprintf(logFileName, FILENAME_MAX, "%s.log", scoreFileName);
uint16 *scores = new uint16 [gkpStore->gkStore_getNumReads() + 1];
uint16 scoreExact = 0;
uint16 scoreEstim = 0;

snprintf(logFileName, FILENAME_MAX, "%s.log", scoreFileName);
snprintf(statsFileName, FILENAME_MAX, "%s.stats", scoreFileName);

errno = 0;
FILE *scoreFile = (scoreFileName == NULL) ? NULL : fopen(scoreFileName, "w");
if (errno)
fprintf(stderr, "ERROR: failed to open '%s' for writing: %s\n", scoreFileName, strerror(errno)), exit(1);
FILE *scoreFile = openOutput(scoreFileName, true);
FILE *logFile = openOutput(logFileName, (noLog == false));

globalScore *gs = new globalScore(minOvlLength, maxOvlLength, minErate, maxErate, logFile, (noStats == false));

errno = 0;
FILE *logFile = (noLog == true) ? NULL : fopen(logFileName, "w");
if (errno)
fprintf(stderr, "ERROR: failed to open '%s' for writing: %s\n", logFileName, strerror(errno)), exit(1);
uint64 readsNoOlaps = 0;

if (doCompare) {
fprintf(stdout, " readID exact estim\n");
//fprintf(stdout, "-------- ------ ------\n");
}

uint32 ovlLen = 0;
uint32 ovlMax = 131072;
ovOverlap *ovl = ovOverlap::allocateOverlaps(gkpStore, ovlMax);
for (uint32 id=0; id <= gkpStore->gkStore_getNumReads(); id++) {
scores[id] = UINT16_MAX;

uint64 readsNoOlaps = 0;
if (numOlaps[id] == 0) {
readsNoOlaps++;
continue;
}

globalScore *gs = new globalScore(minOvlLength, maxOvlLength, minErate, maxErate, logFile, (noStats == false));
if (doEstimate == true) {
scores[id] = scoreEstim = ovlHisto->overlapScoreEstimate(id, expectedCoverage);

for (uint32 id=1; id <= gkpStore->gkStore_getNumReads(); id++) {
scores[id] = UINT64_MAX;
gs->estimate(numOlaps[id], expectedCoverage); // Just for stats collection
}

inpStore->readOverlaps(id, ovl, ovlLen, ovlMax);
if (doExact == true) {
ovlStore->readOverlaps(id, ovl, ovlLen, ovlMax);
assert(ovlLen == numOlaps[id]);
assert(ovl[0].a_iid == id);

if ((ovlLen == 0) ||
(ovl[0].a_iid != id)) {
readsNoOlaps++;
continue;
scores[id] = scoreExact = gs->compute(ovlLen, ovl, expectedCoverage, 0, NULL);
}

scores[id] = gs->compute(ovlLen, ovl, expectedCoverage, 0, NULL);
if (doCompare) {
fprintf(stdout, "%8u %6u %6u\n", id, scoreExact, scoreEstim);
}
}

if (scoreFile) {
AS_UTL_safeWrite(scoreFile, scores, "scores", sizeof(uint64), gkpStore->gkStore_getNumReads() + 1);
AS_UTL_safeWrite(scoreFile, scores, "scores", sizeof(uint16), gkpStore->gkStore_getNumReads() + 1);
fclose(scoreFile);
}

Expand All @@ -214,7 +267,9 @@ main(int argc, char **argv) {
delete [] scores;

delete [] ovl;
delete inpStore;
delete [] numOlaps;
delete ovlHisto;
delete ovlStore;

gkpStore->gkStore_close();

Expand Down Expand Up @@ -244,27 +299,27 @@ main(int argc, char **argv) {
fprintf(statsFile, "%12" F_U64P " (< %u bases long)\n", gs->tooShort(), minOvlLength);
fprintf(statsFile, "%12" F_U64P " (> %u bases long)\n", gs->tooLong(), maxOvlLength);
fprintf(statsFile, "\n");
fprintf(statsFile, "FILTERED:\n");
fprintf(statsFile, "FILTERED:%s\n", (doEstimate == true) ? " (estimated)" : "");
fprintf(statsFile, "\n");
fprintf(statsFile, "%12" F_U64P " (too many overlaps, discard these shortest ones)\n", gs->belowCutoff());
fprintf(statsFile, "\n");
fprintf(statsFile, "EVIDENCE:\n");
fprintf(statsFile, "EVIDENCE:%s\n", (doEstimate == true) ? " (estimated)" : "");
fprintf(statsFile, "\n");
fprintf(statsFile, "%12" F_U64P " (longest overlaps)\n", gs->retained());
fprintf(statsFile, "\n");
fprintf(statsFile, "TOTAL:\n");
fprintf(statsFile, "\n");
fprintf(statsFile, "%12" F_U64P " (all overlaps)\n", gs->totalOverlaps());
fprintf(statsFile, "\n");
fprintf(statsFile, "READS:\n");
fprintf(statsFile, "READS:%s\n", (doEstimate == true) ? " (estimated)" : "");
fprintf(statsFile, "-----\n");
fprintf(statsFile, "\n");
fprintf(statsFile, "%12" F_U64P " (no overlaps)\n", readsNoOlaps);
fprintf(statsFile, "%12" F_U64P " (no overlaps filtered)\n", gs->reads00OlapsFiltered());
fprintf(statsFile, "%12" F_U64P " (< 50%% overlaps filtered)\n", gs->reads50OlapsFiltered());
fprintf(statsFile, "%12" F_U64P " (< 80%% overlaps filtered)\n", gs->reads80OlapsFiltered());
fprintf(statsFile, "%12" F_U64P " (< 95%% overlaps filtered)\n", gs->reads95OlapsFiltered());
fprintf(statsFile, "%12" F_U64P " (< 100%% overlaps filtered)\n", gs->reads99OlapsFiltered());
fprintf(statsFile, "%12" F_U64P " (<= 50%% overlaps filtered)\n", gs->reads50OlapsFiltered());
fprintf(statsFile, "%12" F_U64P " (<= 80%% overlaps filtered)\n", gs->reads80OlapsFiltered());
fprintf(statsFile, "%12" F_U64P " (<= 95%% overlaps filtered)\n", gs->reads95OlapsFiltered());
fprintf(statsFile, "%12" F_U64P " (<= 100%% overlaps filtered)\n", gs->reads99OlapsFiltered());
fprintf(statsFile, "\n");
fclose(statsFile);

Expand Down
Loading

0 comments on commit e17ab5b

Please sign in to comment.