Skip to content

Commit

Permalink
Let overlap store building scale in memory to avoid hitting system file limits on large stores
Browse files Browse the repository at this point in the history
  • Loading branch information
skoren committed Feb 29, 2016
1 parent 3151c35 commit 81a7505
Show file tree
Hide file tree
Showing 4 changed files with 37 additions and 24 deletions.
5 changes: 2 additions & 3 deletions src/pipelines/canu/Configure.pm
Original file line number Diff line number Diff line change
Expand Up @@ -554,13 +554,13 @@ sub configureAssembler () {

} elsif (getGlobal("genomeSize") < adjustGenomeSize("1g")) {
setGlobalIfUndef("cnsMemory", "16-48"); setGlobalIfUndef("cnsThreads", "2-8");
setGlobalIfUndef("corMemory", "10-16"); setGlobalIfUndef("corThreads", "2-8");
setGlobalIfUndef("corMemory", "10-20"); setGlobalIfUndef("corThreads", "2-4");
setGlobalIfUndef("cnsPartitions", "64"); setGlobalIfUndef("cnsPartitionMin", "20000");
setGlobalIfUndef("corPartitions", "256"); setGlobalIfUndef("corPartitionMin", "15000");

} else {
setGlobalIfUndef("cnsMemory", "16-64"); setGlobalIfUndef("cnsThreads", "2-8");
setGlobalIfUndef("corMemory", "10-16"); setGlobalIfUndef("corThreads", "2-8");
setGlobalIfUndef("corMemory", "10-32"); setGlobalIfUndef("corThreads", "2-4");
setGlobalIfUndef("cnsPartitions", "256"); setGlobalIfUndef("cnsPartitionMin", "25000");
setGlobalIfUndef("corPartitions", "512"); setGlobalIfUndef("corPartitionMin", "25000");
}
Expand Down Expand Up @@ -639,7 +639,6 @@ sub configureAssembler () {
($err, $all) = getAllowedResources("", "oea", $err, $all);
($err, $all) = getAllowedResources("", "cns", $err, $all);
($err, $all) = getAllowedResources("", "ovb", $err, $all);
($err, $all) = getAllowedResources("", "ovs", $err, $all);
($err, $all) = getAllowedResources("cor", "ovl", $err, $all);
($err, $all) = getAllowedResources("obt", "ovl", $err, $all);
($err, $all) = getAllowedResources("utg", "ovl", $err, $all);
Expand Down
2 changes: 1 addition & 1 deletion src/pipelines/canu/Defaults.pm
Original file line number Diff line number Diff line change
Expand Up @@ -823,7 +823,7 @@ sub setDefaults () {

##### Overlap Store

$global{"ovsMethod"} = "sequential";
$global{"ovsMethod"} = undef;
$synops{"ovsMethod"} = "Use the 'sequential' or 'parallel' algorithm for constructing an overlap store; default 'sequential'";

##### Mers
Expand Down
14 changes: 9 additions & 5 deletions src/pipelines/canu/OverlapStore.pm
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ require Exporter;

use strict;

use POSIX qw(ceil);
use canu::Defaults;
use canu::Execution;
use canu::HTML;
Expand Down Expand Up @@ -117,12 +118,14 @@ sub getNumOlapsAndSlices ($$) {

my $numOlaps = 0;
my $numSlices = 0;
my $memLimit = 0;

open(F, "< $wrk/$asm.ovlStore.BUILDING/config.err") or caExit("can't open '$wrk/$asm.ovlStore.BUILDING/config.err' for reading: $!\n", undef);
while (<F>) {
if (m/Will sort (\d+.\d+) million overlaps per bucket, using (\d+) buckets./) {
if (m/Will sort (\d+.\d+) million overlaps per bucket, using (\d+) buckets (\d+.\d+) GB per bucket./) {
$numOlaps = $1;
$numSlices = $2;
$memLimit = ceil($3);
}
}
close(F);
Expand All @@ -131,7 +134,7 @@ sub getNumOlapsAndSlices ($$) {
caExit("Failed to find any overlaps ($numOlaps) or slices ($numSlices).\n", undef);
}

return($numOlaps, $numSlices);
return($numOlaps, $numSlices, $memLimit);
}


Expand Down Expand Up @@ -176,9 +179,10 @@ sub overlapStoreConfigure ($$$$) {
}
}

# Parse the output to find the number of jobs we need to sort.

my ($numOlaps, $numSlices) = getNumOlapsAndSlices($wrk, $asm);
# Parse the output to find the number of jobs we need to sort and the memory
# ovs store memory is left as a range (e.g. 4-16) so building can scale itself to (hopefully) fit both into memory and into max system open files
my ($numOlaps, $numSlices, $memLimit) = getNumOlapsAndSlices($wrk, $asm);
setGlobal("ovsMemory", $memLimit);

# Parallel jobs for bucketizing. This should really be part of overlap computation itself.

Expand Down
40 changes: 25 additions & 15 deletions src/stores/ovStoreBuild.C
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
*/

#include "AS_global.H"
#include "AS_UTL_decodeRange.H"

#include "gkStore.H"
#include "ovStore.H"
Expand All @@ -60,17 +61,19 @@ static
uint32 *
computeIIDperBucket(uint32 fileLimit,
uint64 memoryLimit,
uint64 maxMemoryLimit,
uint32 maxIID,
vector<char *> &fileList) {
uint32 *iidToBucket = new uint32 [maxIID];
uint32 maxFiles = sysconf(_SC_OPEN_MAX) - 16;

// If we're reading from stdin, not much we can do but divide the IIDs equally per file. Note
// that the IIDs must be consecutive; the obvious, simple and clean division of 'mod' won't work.

if (fileList[0][0] == '-') {
if (memoryLimit > 0) {
memoryLimit = 0;
fileLimit = sysconf(_SC_OPEN_MAX) - 16;
fileLimit = maxFiles;

fprintf(stderr, "WARNING: memory limit (-M) specified, but can't be used with inputs from stdin; using %d files instead.\n", fileLimit);
} else {
Expand Down Expand Up @@ -166,12 +169,16 @@ computeIIDperBucket(uint32 fileLimit,

// If a memory limit, distribute the overlaps to files no larger than the limit.
if (memoryLimit > 0) {
olapsPerBucketMax = (memoryLimit - MEMORY_OVERHEAD) / ovOverlapSortSize;
fprintf(stderr, "Will sort using "F_U64" files; "F_U64" (%.2f million) overlaps per bucket; %.2f GB memory per bucket\n",
numOverlaps / olapsPerBucketMax + 1,
olapsPerBucketMax,
olapsPerBucketMax / 1000000.0,
olapsPerBucketMax * GBperOlap);
// iterate until we can fit the files into file system limits, give up if we hit our max limit
do {
olapsPerBucketMax = (memoryLimit - MEMORY_OVERHEAD) / ovOverlapSortSize;
fprintf(stderr, "Will sort using "F_U64" files; "F_U64" (%.2f million) overlaps per bucket; %.2f GB memory per bucket\n",
numOverlaps / olapsPerBucketMax + 1,
olapsPerBucketMax,
olapsPerBucketMax / 1000000.0,
olapsPerBucketMax * GBperOlap);
memoryLimit += 1024 * 1024 * 1024;
} while (memoryLimit <= maxMemoryLimit && ( numOverlaps / olapsPerBucketMax + 1) > maxFiles / 2);
}

// Given the limit on each bucket, count the number of buckets needed, then reset the limit on
Expand Down Expand Up @@ -214,8 +221,8 @@ computeIIDperBucket(uint32 fileLimit,
fprintf(stderr, " bucket %3d has "F_U64" olaps.\n", bucket, olaps);
}

fprintf(stderr, "Will sort %.3f million overlaps per bucket, using %u buckets.\n",
olapsPerBucketMax / 1000000.0, iidToBucket[maxIID-1]);
fprintf(stderr, "Will sort %.3f million overlaps per bucket, using %u buckets %.2f GB per bucket.\n",
olapsPerBucketMax / 1000000.0, iidToBucket[maxIID-1], olapsPerBucketMax * GBperOlap);

delete [] overlapsPerRead;

Expand Down Expand Up @@ -253,10 +260,11 @@ writeToDumpFile(ovOverlap *overlap,

int
main(int argc, char **argv) {
char *ovlName = NULL;
char *gkpName = NULL;
uint32 fileLimit = 0;
uint64 memoryLimit = (uint64)4 * 1024 * 1024 * 1024;
char *ovlName = NULL;
char *gkpName = NULL;
uint32 fileLimit = 0;
uint64 memoryLimit = (uint64)4 * 1024 * 1024 * 1024;
uint64 maxMemoryLimit = memoryLimit;

double maxError = 1.0;
uint32 minOverlap = 0;
Expand Down Expand Up @@ -285,7 +293,9 @@ main(int argc, char **argv) {

} else if (strcmp(argv[arg], "-M") == 0) {
fileLimit = 0;
memoryLimit = (uint64)ceil(atof(argv[++arg]) * 1024.0 * 1024.0 * 1024.0);
AS_UTL_decodeRange(argv[++arg], memoryLimit, maxMemoryLimit);
memoryLimit = (uint64)ceil(memoryLimit) * 1024.0 * 1024.0 * 1024.0;
maxMemoryLimit = (uint64)ceil(maxMemoryLimit) * 1024.0 * 1024.0 * 1024.0;

} else if (strcmp(argv[arg], "-e") == 0) {
maxError = atof(argv[++arg]);
Expand Down Expand Up @@ -405,7 +415,7 @@ main(int argc, char **argv) {

gkStore *gkp = gkStore::gkStore_open(gkpName);
uint64 maxIID = gkp->gkStore_getNumReads() + 1;
uint32 *iidToBucket = computeIIDperBucket(fileLimit, memoryLimit, maxIID, fileList);
uint32 *iidToBucket = computeIIDperBucket(fileLimit, memoryLimit, maxMemoryLimit, maxIID, fileList);

uint32 maxFiles = sysconf(_SC_OPEN_MAX);

Expand Down

0 comments on commit 81a7505

Please sign in to comment.