Skip to content

Commit

Permalink
Add kmerIterator interface to return something for every base; use th…
Browse files Browse the repository at this point in the history
…at to report N bases in meryl-lookup output.
  • Loading branch information
brianwalenz committed Feb 12, 2020
1 parent e1d9ff4 commit 4724187
Show file tree
Hide file tree
Showing 2 changed files with 108 additions and 21 deletions.
43 changes: 28 additions & 15 deletions src/meryl/meryl-lookup.C
Original file line number Diff line number Diff line change
Expand Up @@ -78,21 +78,31 @@ dumpExistence(dnaSeqFile *sfile,
for (uint32 seqId=0; sfile->loadSequence(seq); seqId++) {
kmerIterator kiter(seq.bases(), seq.length());

while (kiter.nextMer()) {
for (uint32 dd=0; dd<klookup.size(); dd++) {
uint64 fValue = 0;
uint64 rValue = 0;
bool fExists = klookup[dd]->exists(kiter.fmer(), fValue);
bool rExists = klookup[dd]->exists(kiter.rmer(), rValue);

fprintf(ofile->file(), "%s\t%u\t%lu\t%c\t%s\t%lu\t%s\t%lu\t%s\n",
while (kiter.nextBase()) {
if (kiter.isValid() == false) {
fprintf(ofile->file(), "%s\t%u\t%lu\t%c\n",
seq.name(),
seqId,
kiter.position(),
(fExists || rExists) ? 'T' : 'F',
kiter.fmer().toString(fString), fValue,
kiter.rmer().toString(rString), rValue,
klabel[dd]);
kiter.isACGTbgn() ? 'n' : 'N');
}

else {
for (uint32 dd=0; dd<klookup.size(); dd++) {
uint64 fValue = 0;
uint64 rValue = 0;
bool fExists = klookup[dd]->exists(kiter.fmer(), fValue);
bool rExists = klookup[dd]->exists(kiter.rmer(), rValue);

fprintf(ofile->file(), "%s\t%u\t%lu\t%c\t%s\t%lu\t%s\t%lu\t%s\n",
seq.name(),
seqId,
kiter.position(),
(fExists || rExists) ? 'T' : 'F',
kiter.fmer().toString(fString), fValue,
kiter.rmer().toString(rString), rValue,
labels[dd]);
}
}
}
}
Expand Down Expand Up @@ -228,7 +238,7 @@ main(int argc, char **argv) {
while ((arg + 1 < argc) && (argv[arg + 1][0] != '-'))
inputDBname.push_back(argv[++arg]);

} else if (strcmp(argv[arg], "-label") == 0) {
} else if (strcmp(argv[arg], "-labels") == 0) {
while ((arg + 1 < argc) && (argv[arg + 1][0] != '-'))
inputDBlabel.push_back(argv[++arg]);

Expand Down Expand Up @@ -281,12 +291,15 @@ main(int argc, char **argv) {
if (err.size() > 0) {
fprintf(stderr, "usage: %s <report-type> \\\n", argv[0]);
fprintf(stderr, " -sequence <input1.fasta> [<input2.fasta>] \\\n");
fprintf(stderr, " -output <output1> [<output2>]\n");
fprintf(stderr, " -mers <input1.meryl> [<input2.meryl>] [...] \\\n");
fprintf(stderr, " -output <output1> [<output2>]\n");
fprintf(stderr, " -labels <input1name> [<input2name>] [...]\n");
fprintf(stderr, " Query the kmers in meryl database(s) <input.meryl> with the sequences\n");
fprintf(stderr, " in <input.fasta>.\n");
fprintf(stderr, "\n");
fprintf(stderr, " Multiple databases and multiple input files are supported.\n");
fprintf(stderr, " Multiple databases are supported.\n");
fprintf(stderr, "\n");
fprintf(stderr, " Up to two inptu sequences are supported (only for -include / -exclude).\n");
fprintf(stderr, "\n");
fprintf(stderr, " Input files can be FASTA or FASTQ; uncompressed, gz, bz2 or xz compressed\n");
fprintf(stderr, "\n");
Expand Down
86 changes: 80 additions & 6 deletions src/utility/kmers.H
Original file line number Diff line number Diff line change
Expand Up @@ -201,6 +201,7 @@ public:
};

void reset(void) {
_kmerSize = _fmer.merSize();
_kmerLoad = 0;
_kmerValid = _fmer.merSize() - 1;
};
Expand All @@ -211,16 +212,16 @@ public:
_bufferPos = 0;
};

// The primary interface. Iterate over all valid mers, silently skipping over
// any invalid ones.
//
bool nextMer(void) {
nextMer_anotherBase:
if (_bufferPos >= _bufferLen) // No more sequence, and not a valid kmer.
return(false);

if ((_buffer[_bufferPos] != 'A') && (_buffer[_bufferPos] != 'a') &&
(_buffer[_bufferPos] != 'C') && (_buffer[_bufferPos] != 'c') &&
(_buffer[_bufferPos] != 'G') && (_buffer[_bufferPos] != 'g') &&
(_buffer[_bufferPos] != 'T') && (_buffer[_bufferPos] != 't')) {
_kmerLoad = 0; // Not a valis base. Clear the current
if (isACGT(_bufferPos) == false) {
_kmerLoad = 0; // Not a valid base. Clear the current
_bufferPos++; // kmer and move to the next base.
goto nextMer_anotherBase;
}
Expand All @@ -238,11 +239,84 @@ public:
return(true); // Valid kmer!
};

// Alternate interface. Iterate over all bases. Use isValid() to test if the kmer
// ending at this base is valid. Use isValidBgn() and isValidEnd() to decide if the
// base at the start/end of the kmer is ACGT.
//
bool isValid(void) {
return(_kmerLoad == _kmerSize);
}

bool isACGT(uint64 pos) {
return((_buffer[pos] == 'A') || (_buffer[pos] == 'a') ||
(_buffer[pos] == 'C') || (_buffer[pos] == 'c') ||
(_buffer[pos] == 'G') || (_buffer[pos] == 'g') ||
(_buffer[pos] == 'T') || (_buffer[pos] == 't'));
}

bool isACGTbgn(void) {
return(isACGT(_bufferPos - _kmerSize));
}

bool isACGTend(void) {
return(isACGT(_bufferPos - 1));
}

char getBaseBgn(void) {return(_buffer[_bufferPos - _kmerSize]);}
char getBaseEnd(void) {return(_buffer[_bufferPos - 1]);}

bool nextBase(void) {

// Preload the first kmerSize-1 bases.

while ((_bufferPos < _kmerSize - 1) &&
(_bufferPos < _bufferLen)) {
if (isACGT(_bufferPos) == false) { // Not a valid base, reset the counter.
_kmerLoad = 0;
} else {
_fmer.addR(_buffer[_bufferPos]); // A valid base, so push it onto
_rmer.addL(_buffer[_bufferPos]); // the kmer.

_kmerLoad++;
}

_bufferPos++;
}

// Stop if we're out of sequence.

if (_bufferPos >= _bufferLen) // No more sequence, stop.
return(false);

// Load another base.

if (isACGT(_bufferPos) == false) { // Not a valid base, reset the counter.
_kmerLoad = 0;
}

else {
_fmer.addR(_buffer[_bufferPos]); // A valid base, so push it onto
_rmer.addL(_buffer[_bufferPos]); // the kmer.

if (_kmerLoad < _kmerSize) // Increment the loaded size,
_kmerLoad++; // if not full already.
}

_bufferPos++;

return(true); // A base was consumed.
}


kmerTiny fmer(void) { return(_fmer); };
kmerTiny rmer(void) { return(_rmer); };
uint64 position(void) { return(_bufferPos - _fmer.merSize()); };
uint64 position(void) { return(_bufferPos - _kmerSize); };

uint64 bgnPosition(void) { return(_bufferPos - _kmerSize); };
uint64 endPosition(void) { return(_bufferPos); };

private:
uint32 _kmerSize;
uint32 _kmerLoad;
uint32 _kmerValid;

Expand Down

0 comments on commit 4724187

Please sign in to comment.