forked from marbl/canu
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add utility to convert a list of reads to contig layouts.
- Loading branch information
1 parent
550e9ec
commit 790f3cb
Showing
4 changed files
with
391 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,31 @@ | ||
#!/usr/bin/env perl | ||
|
||
use strict; | ||
|
||
my $tigid; | ||
my $readlist; | ||
|
||
while (<STDIN>) { | ||
if (m/^tig\s(\d+)/) { | ||
$tigid = $1; | ||
} | ||
|
||
if (m/^read\s+(\d+)\s+anchor.*position\s+(\d+)\s+(\d+)\s/) { | ||
my $bgn = $2; | ||
my $end = $3; | ||
my $rid = $1 . (($bgn < $end) ? "+" : "-"); | ||
|
||
if (defined($readlist)) { | ||
$readlist .= ",$rid"; | ||
} else { | ||
$readlist = $rid; | ||
} | ||
} | ||
|
||
if (m/^tigend/) { | ||
print "tig$tigid $readlist\n"; | ||
|
||
undef $tigid; | ||
undef $readlist; | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,327 @@ | ||
|
||
/****************************************************************************** | ||
* | ||
* This file is part of canu, a software program that assembles whole-genome | ||
* sequencing reads into contigs. | ||
* | ||
* This software is based on: | ||
* 'Celera Assembler' r4587 (http://wgs-assembler.sourceforge.net) | ||
* the 'kmer package' r1994 (http://kmer.sourceforge.net) | ||
* | ||
* Except as indicated otherwise, this is a 'United States Government Work', | ||
* and is released in the public domain. | ||
* | ||
* File 'README.licenses' in the root directory of this distribution | ||
* contains full conditions and disclaimers. | ||
*/ | ||
|
||
#include "AS_BAT_ReadInfo.H" | ||
#include "AS_BAT_OverlapCache.H" | ||
|
||
#include "AS_BAT_Logging.H" | ||
|
||
#include "AS_BAT_Unitig.H" | ||
|
||
#include "AS_BAT_Instrumentation.H" | ||
#include "AS_BAT_PlaceContains.H" | ||
|
||
#include "AS_BAT_Outputs.H" | ||
|
||
#include "AS_BAT_TigGraph.H" | ||
|
||
|
||
|
||
ReadInfo *RI = 0L; | ||
OverlapCache *OC = 0L; | ||
BestOverlapGraph *OG = 0L; | ||
|
||
|
||
|
||
void | ||
importTigsFromReadList(char const *prefix, | ||
TigVector &tigs, | ||
char const *readListPath) { | ||
uint32 lineLen = 0; | ||
uint32 lineMax = 0; | ||
char *line = nullptr; | ||
splitToWords words; | ||
|
||
FILE *listFile = AS_UTL_openInputFile(readListPath); | ||
|
||
while (AS_UTL_readLine(line, lineLen, lineMax, listFile) == true) { | ||
for (uint32 ii=0; ii<lineLen; ii++) // Change commas to spaces | ||
if (line[ii] == ',') // so we can split out words | ||
line[ii] = ' '; // easily. | ||
|
||
words.split(line); | ||
|
||
// Make a new tig. | ||
|
||
Unitig *tig = tigs.newUnitig(); | ||
|
||
// Add all the reads to it. | ||
|
||
for (uint32 rr=1; rr<words.numWords(); rr++) { | ||
uint32 fi = 0; | ||
int32 fbgn = 0; | ||
int32 fend = 0; | ||
int32 flen = 0; | ||
bool ffwd = true; | ||
|
||
// Iterate over every letter in the name. On the first digit, extract | ||
// the read ID. And set 'fwd' based on the last =+' or '-'. | ||
|
||
for (uint32 pp=0; words[rr][pp] != 0; pp++) { | ||
if ((fi == 0) && (isdigit(words[rr][pp]))) { | ||
fi = strtouint32(&words[rr][pp]); | ||
flen = RI->readLength(fi); | ||
} | ||
|
||
if (words[rr][pp] == '-') | ||
ffwd = false; | ||
|
||
if (words[rr][pp] == '+') | ||
ffwd = true; | ||
} | ||
|
||
if (fi == 0) | ||
writeLog("failed to parse read ID from '%s'\n", words[rr]); | ||
if (flen == 0) | ||
writeLog("reference to non-existent read ID %u from '%s'\n", fi, words[rr]); | ||
assert(fi != 0); | ||
assert(flen != 0); | ||
|
||
// If this is the first read, we can immediately add it. | ||
|
||
if (rr == 1) { | ||
fbgn = 0; | ||
fend = flen; | ||
|
||
if (ffwd == true) { | ||
writeLog("place initial read %8u len %5u --> at %7u-%-7u\n", fi, flen, fbgn, fend); | ||
tig->addRead(ufNode(fi, fbgn, fend)); | ||
} else { | ||
writeLog("place initial read %8u len %5u <-- at %7u-%-7u\n", fi, flen, fend, fbgn); | ||
tig->addRead(ufNode(fi, fend, fbgn)); | ||
} | ||
|
||
continue; | ||
} | ||
|
||
// Find the overlap to the last read in the tig. | ||
|
||
uint32 pi = tig->ufpath.back().ident; | ||
bool pfwd = tig->ufpath.back().isForward(); | ||
ufNode &pnode = tig->ufpath.back(); | ||
|
||
uint32 povlLen = 0; | ||
BAToverlap *povl = OC->getOverlaps(pi, povlLen); | ||
|
||
bool wantFlip = (pfwd != ffwd); | ||
bool placed = false; | ||
|
||
for (uint32 poo=0; poo<povlLen; poo++) { // Iterator 'poo' for consistency with 'povl'. Not because it's funny. You'll have to trust me on that. | ||
if (povl[poo].b_iid != fi) // Overlap not to the | ||
continue; // read we want to place. | ||
|
||
if (povl[poo].flipped != wantFlip) // Overlap flippedness doesn't | ||
continue; // agree with what we need. | ||
|
||
// Must be the overlap we want. | ||
|
||
placed = true; | ||
|
||
// Compute the position of the read. Some nuance to point out here. | ||
// If pnode is forward, then .bgn is the min coord, and a_hang _should_ be positive. | ||
// If pnode is reverse, then .bgn is the max coord, and a_hang _should_ be negative. | ||
|
||
fbgn = pnode.position.bgn + povl[poo].a_hang; | ||
fend = pnode.position.bgn + povl[poo].a_hang + flen; | ||
|
||
if (pfwd == true) assert(povl[poo].a_hang >= 0); | ||
if (pfwd == false) assert(povl[poo].a_hang <= 0); | ||
|
||
// However, testing with bogart layouts occasionally resulted in the | ||
// next read being placed before the previous read, probably due to | ||
// the 'optimization' baloney. | ||
|
||
if (fbgn < pnode.position.min()) { | ||
fbgn = pnode.position.min(); | ||
fend = pnode.position.min() + flen; | ||
} | ||
|
||
// Now just compute the position of the new read and plop it in the | ||
// tig. | ||
|
||
if (ffwd == true) { | ||
writeLog("place read %8u len %5u --> at %7u-%-7u\n", fi, flen, fbgn, fend); | ||
tig->addRead(ufNode(fi, fbgn, fend)); | ||
} else { | ||
writeLog("place read %8u len %5u <-- at %7u-%-7u\n", fi, flen, fend, fbgn); | ||
tig->addRead(ufNode(fi, fend, fbgn)); | ||
} | ||
} | ||
|
||
if (placed == false) { | ||
writeLog("Failed to place read '%s'\n", words[rr]); | ||
flushLog(); | ||
assert(0); | ||
} | ||
} | ||
} | ||
|
||
delete [] line; | ||
|
||
AS_UTL_closeFile(listFile, readListPath); | ||
} | ||
|
||
|
||
|
||
int | ||
main (int argc, char **argv) { | ||
char const *seqStorePath = NULL; | ||
char const *ovlStorePath = NULL; | ||
char const *readListPath = NULL; | ||
char const *prefix = NULL; | ||
|
||
double erateGraph = 0.075; | ||
double deviationGraph = 6.0; | ||
double erateMax = 0.100; | ||
|
||
uint32 minReadLen = 0; | ||
uint32 maxReadLen = UINT32_MAX; | ||
|
||
uint64 ovlCacheMemory = UINT64_MAX; | ||
uint32 minOverlapLen = 500; | ||
|
||
uint64 genomeSize = 0; | ||
|
||
argc = AS_configure(argc, argv); | ||
|
||
vector<char const *> err; | ||
int arg = 1; | ||
while (arg < argc) { | ||
if (strcmp(argv[arg], "-S") == 0) { | ||
seqStorePath = argv[++arg]; | ||
} | ||
|
||
else if (strcmp(argv[arg], "-O") == 0) { | ||
ovlStorePath = argv[++arg]; | ||
} | ||
|
||
else if (strcmp(argv[arg], "-R") == 0) { | ||
readListPath = argv[++arg]; | ||
} | ||
|
||
else if (strcmp(argv[arg], "-o") == 0) { | ||
prefix = argv[++arg]; | ||
} | ||
|
||
else if (strcmp(argv[arg], "-gs") == 0) { | ||
genomeSize = strtouint64(argv[++arg]); | ||
} | ||
|
||
else { | ||
char *s = new char [1024]; | ||
snprintf(s, 1024, "Unknown option '%s'.\n", argv[arg]); | ||
err.push_back(s); | ||
} | ||
|
||
arg++; | ||
} | ||
|
||
if (genomeSize == 0) err.push_back("Genome size (-gs option) must be supplied\n"); | ||
if (seqStorePath == NULL) err.push_back("No sequence store (-S option) supplied.\n"); | ||
if (ovlStorePath == NULL) err.push_back("No overlap store (-O option) supplied.\n"); | ||
if (readListPath == NULL) err.push_back("No list of reads (-R option) supplied.\n"); | ||
if (prefix == NULL) err.push_back("No output prefix name (-o option) supplied.\n"); | ||
|
||
if (err.size() > 0) { | ||
fprintf(stderr, "usage: %s -S seqPath -O ovlPath -R readListPath -o outPrefix ...\n", argv[0]); | ||
fprintf(stderr, "\n"); | ||
fprintf(stderr, "Mandatory Parameters:\n"); | ||
fprintf(stderr, "\n"); | ||
fprintf(stderr, " -S seqPath Mandatory path to an existing seqStore.\n"); | ||
fprintf(stderr, " -O ovlPath Mandatory path to an existing ovlStore.\n"); | ||
fprintf(stderr, " -R readListPath Mandatory path to an existing ovlStore.\n"); | ||
fprintf(stderr, " -o outPrefix Mandatory prefix for the output files.\n"); | ||
fprintf(stderr, "\n"); | ||
|
||
for (uint32 ii=0; ii<err.size(); ii++) | ||
if (err[ii]) | ||
fputs(err[ii], stderr); | ||
|
||
exit(1); | ||
} | ||
|
||
// | ||
|
||
writeStatus("\n"); | ||
writeStatus("==> LOADING READ AND OVERLAP INFORMATION.\n"); | ||
writeStatus("\n"); | ||
|
||
setLogFile(prefix, "loadInformation"); | ||
|
||
RI = new ReadInfo(seqStorePath, prefix, minReadLen, maxReadLen); | ||
OC = new OverlapCache(ovlStorePath, prefix, max(erateMax, erateGraph), minOverlapLen, ovlCacheMemory, genomeSize); | ||
|
||
// | ||
|
||
writeStatus("\n"); | ||
writeStatus("==> READING LAYOUTS AND BUILDING TIGS.\n"); | ||
writeStatus("\n"); | ||
|
||
setLogFile(prefix, "layoutTigs"); | ||
|
||
TigVector contigs(RI->numReads()); // Both initial greedy tigs and final contigs | ||
|
||
importTigsFromReadList(prefix, contigs, readListPath); | ||
|
||
setLogFile(prefix, "layoutTigsOpt"); | ||
contigs.optimizePositions(prefix, "layoutTigsOpt"); | ||
reportTigs(contigs, prefix, "layoutTigsOpt", genomeSize); | ||
|
||
// | ||
|
||
writeStatus("\n"); | ||
writeStatus("==> INSERTING CONTAINED READS.\n"); | ||
writeStatus("\n"); | ||
|
||
setLogFile(prefix, "placeContains"); | ||
|
||
set<uint32> placedReads; | ||
|
||
placeUnplacedUsingAllOverlaps(contigs, | ||
deviationGraph, | ||
erateGraph, | ||
prefix, | ||
placedReads); | ||
|
||
setLogFile(prefix, "placeContainsOpt"); | ||
contigs.optimizePositions(prefix, "placeContainsOpt"); | ||
reportTigs(contigs, prefix, "placeContainsOpt", genomeSize); | ||
|
||
// | ||
|
||
writeStatus("\n"); | ||
writeStatus("==> GENERATE OUTPUTS.\n"); | ||
writeStatus("\n"); | ||
|
||
setLogFile(prefix, "generateOutputs"); | ||
|
||
reportTigs(contigs, prefix, "final", genomeSize); | ||
|
||
//setParentAndHang(contigs); | ||
writeTigsToStore(contigs, prefix, "ctg", true); | ||
|
||
// | ||
|
||
delete OC; | ||
delete RI; | ||
|
||
writeStatus("\n"); | ||
writeStatus("Bye.\n"); | ||
|
||
return(0); | ||
} | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,32 @@ | ||
# If 'make' isn't run from the root directory, we need to set these to | ||
# point to the upper level build directory. | ||
ifeq "$(strip ${BUILD_DIR})" "" | ||
BUILD_DIR := ../$(OSTYPE)-$(MACHINETYPE)/obj | ||
endif | ||
ifeq "$(strip ${TARGET_DIR})" "" | ||
TARGET_DIR := ../$(OSTYPE)-$(MACHINETYPE) | ||
endif | ||
|
||
TARGET := layoutReads | ||
SOURCES := layoutReads.C \ | ||
AS_BAT_Instrumentation.C \ | ||
AS_BAT_Logging.C \ | ||
AS_BAT_OptimizePositions.C \ | ||
AS_BAT_Outputs.C \ | ||
AS_BAT_OverlapCache.C \ | ||
AS_BAT_PlaceContains.C \ | ||
AS_BAT_PlaceReadUsingOverlaps.C \ | ||
AS_BAT_ReadInfo.C \ | ||
AS_BAT_SetParentAndHang.C \ | ||
AS_BAT_TigVector.C \ | ||
AS_BAT_Unitig.C \ | ||
AS_BAT_Unitig_AddRead.C \ | ||
AS_BAT_Unitig_PlaceReadUsingEdges.C | ||
|
||
SRC_INCDIRS := .. ../utility/src/utility ../stores | ||
|
||
TGT_LDFLAGS := -L${TARGET_DIR}/lib | ||
TGT_LDLIBS := -lcanu | ||
TGT_PREREQS := libcanu.a | ||
|
||
SUBMAKEFILES := |
Oops, something went wrong.