Skip to content

Commit

Permalink
Add shift-register emit(), fix printing of state, command line options.
Browse files Browse the repository at this point in the history
  • Loading branch information
brianwalenz committed Feb 13, 2020
1 parent d9130cc commit 01f6a6e
Show file tree
Hide file tree
Showing 7 changed files with 240 additions and 71 deletions.
156 changes: 156 additions & 0 deletions src/sequence/sequence-shiftregister-emit-fast.C
Original file line number Diff line number Diff line change
@@ -0,0 +1,156 @@

/******************************************************************************
*
* This file is part of canu, a software program that assembles whole-genome
* sequencing reads into contigs.
*
* This software is based on:
* 'Celera Assembler' (http://wgs-assembler.sourceforge.net)
* the 'kmer package' (http://kmer.sourceforge.net)
* both originally distributed by Applera Corporation under the GNU General
* Public License, version 2.
*
* Canu branched from Celera Assembler at its revision 4587.
* Canu branched from the kmer project at its revision 1994.
*
* This file is derived from:
*
* src/sequence/sequence.C
*
* Modifications by:
*
* Brian P. Walenz beginning on 2018-JUL-21
* are a 'United States Government Work', and
* are released in the public domain
*
* File 'README.licenses' in the root directory of this distribution contains
* full conditions and disclaimers for each license.
*/

#include "sequence/sequence.H"
#include "sequence/sequence-shiftregister-gf4.H"

#include "bits.H"


void
emitShiftRegisterFast(shiftRegisterParameters &srPar) {
// Allocate space for the loop detector, set local variables.

uint64 sr = srPar.getEncodedSR(); // The shift register state
uint64 cyclen = srPar.getCycleLen(); //
uint64 cycmax = srPar.getCycleMax(); // The length of the maximum cycle
uint64 sv = srPar.getEncodedSVmin(); // The tap vector
uint64 svmax = srPar.getEncodedSVmax(); // The first vector we don't want to examine
uint64 svmask = srPar.getEncodedSVmask();
uint64 gf4widemult[4];

// If no length supplied, set it so we'll emit every possible kmer (except
// the all-zero kmer that we manually insert).

if (srPar.length == 0)
srPar.length = cycmax + srPar.order - 1 - 1;

// Log.

fprintf(stderr, "Emitting %lu bases, plus an extra %u.\n", srPar.length - srPar.order - 1 - 1, srPar.order - 1);
fprintf(stderr, " sr %0*lo\n", srPar.order, expandTo3(sr));
fprintf(stderr, " cyclen %lu\n", cyclen);
fprintf(stderr, " cycmax %lu\n", cycmax);
fprintf(stderr, " sv %0*lo\n", srPar.order, expandTo3(sv));
fprintf(stderr, " svmax %0*lo\n", srPar.order, expandTo3(svmax));
fprintf(stderr, " svmask %0*lo\n", srPar.order, expandTo3(svmask));
fprintf(stderr, "\n");

// Write a header.

fprintf(stdout, ">bases\n");

// Precompute the result of gf4mult[sv[kk]][out] used in the addition.
// There are four possible 'out' values, and sv is fixed for each cycle.

for (uint64 out=0; out<4; out++) {
uint64 mult = 0;

mult |= gf4mult[(sv >> 42) & 0x03][out]; mult <<= 2; // 22
mult |= gf4mult[(sv >> 40) & 0x03][out]; mult <<= 2; // 21
mult |= gf4mult[(sv >> 38) & 0x03][out]; mult <<= 2; // 20
mult |= gf4mult[(sv >> 36) & 0x03][out]; mult <<= 2; // 19
mult |= gf4mult[(sv >> 34) & 0x03][out]; mult <<= 2; // 18
mult |= gf4mult[(sv >> 32) & 0x03][out]; mult <<= 2; // 17
mult |= gf4mult[(sv >> 30) & 0x03][out]; mult <<= 2; // 16
mult |= gf4mult[(sv >> 28) & 0x03][out]; mult <<= 2; // 15
mult |= gf4mult[(sv >> 26) & 0x03][out]; mult <<= 2; // 14
mult |= gf4mult[(sv >> 24) & 0x03][out]; mult <<= 2; // 13
mult |= gf4mult[(sv >> 22) & 0x03][out]; mult <<= 2; // 12
mult |= gf4mult[(sv >> 20) & 0x03][out]; mult <<= 2; // 11
mult |= gf4mult[(sv >> 18) & 0x03][out]; mult <<= 2; // 10
mult |= gf4mult[(sv >> 16) & 0x03][out]; mult <<= 2; // 9
mult |= gf4mult[(sv >> 14) & 0x03][out]; mult <<= 2; // 8
mult |= gf4mult[(sv >> 12) & 0x03][out]; mult <<= 2; // 7
mult |= gf4mult[(sv >> 10) & 0x03][out]; mult <<= 2; // 6
mult |= gf4mult[(sv >> 8) & 0x03][out]; mult <<= 2; // 5
mult |= gf4mult[(sv >> 6) & 0x03][out]; mult <<= 2; // 4
mult |= gf4mult[(sv >> 4) & 0x03][out]; mult <<= 2; // 3
mult |= gf4mult[(sv >> 2) & 0x03][out]; mult <<= 2; // 2
mult |= gf4mult[(sv >> 0) & 0x03][out]; // 1

gf4widemult[out] = mult & svmask;
}

// Loop until we have emitted the requested number of bases.

uint32 bufferMax = 100;
uint32 bufferLen = 0;
char *buffer = new char [bufferMax];

bool emitExtra = true;
uint32 emitCount = 1;

for (cyclen=0; cyclen < srPar.length; cyclen++) {
uint64 out = sr & 0x03; // Save the output value
uint64 mul = gf4widemult[out]; // Compute the multiplier

// Emit the base.

buffer[bufferLen++] = srPar.numberToBase(out);

if (bufferLen == bufferMax) {
writeToFile(buffer, "bases", sizeof(char), bufferLen, stdout);

bufferLen = 0;
}

// If this is the first time we've seen order-1 zero's in a row, output
// an extra one to make the all-zero kmer.

if (out == 0)
emitCount++;
else
emitCount = 1;

if ((emitExtra == true) && (emitCount == srPar.order)) {
emitExtra = false;
buffer[bufferLen++] = srPar.numberToBase(0);
}

// Log.

#ifdef DEBUG
fprintf(stderr, "cycle %8lu out %02lx sr %0*lo\n", cyclen, out, srPar.order, expandTo3(sr >> 2));
fprintf(stderr, " add %0*lo\n", srPar.order, expandTo3(mul));
fprintf(stderr, " final %0*lo\n", srPar.order, expandTo3((sr >> 2) ^ mul));
#endif

// And do the work.

sr = (sr >> 2) ^ mul;
}

writeToFile(buffer, "bases", sizeof(char), bufferLen, stdout);
fprintf(stdout, "\n");

delete [] buffer;
}


31 changes: 15 additions & 16 deletions src/sequence/sequence-shiftregister-search-fast.C
Original file line number Diff line number Diff line change
Expand Up @@ -47,13 +47,13 @@ searchShiftRegisterFast(shiftRegisterParameters &srPar) {

// Log.

fprintf(stderr, "Finding cycles for length %u bases.\n", srPar.len);
fprintf(stderr, " sr %022lo\n", expandTo3(sr));
fprintf(stderr, " cyclen %lu\n", cyclen);
fprintf(stderr, " cycmax %lu\n", cycmax);
fprintf(stderr, " sv %022lo\n", expandTo3(sv));
fprintf(stderr, " svmax %022lo\n", expandTo3(svmax));
fprintf(stderr, " svmask %022lo\n", expandTo3(svmask));
fprintf(stderr, "Finding cycles for length %u bases.\n", srPar.order);
fprintf(stderr, " sr %0*lo\n", srPar.order, expandTo3(sr));
fprintf(stderr, " cyclen %lu\n", cyclen);
fprintf(stderr, " cycmax %lu\n", cycmax);
fprintf(stderr, " sv %0*lo\n", srPar.order, expandTo3(sv));
fprintf(stderr, " svmax %0*lo\n", srPar.order, expandTo3(svmax));
fprintf(stderr, " svmask %0*lo\n", srPar.order, expandTo3(svmask));
fprintf(stderr, "\n");

// We can precompute the result of gf4mult[sv[kk]][out] used in the addition.
Expand Down Expand Up @@ -99,7 +99,7 @@ searchShiftRegisterFast(shiftRegisterParameters &srPar) {
mult &= svmask;

#ifdef DEBUG
fprintf(stderr, "widemult[%022lo][%02x] %022lo\n", expandTo3(sv), out, expandTo3(mult));
fprintf(stderr, "widemult[%0*lo][%02x] %0*lo\n", srPar.order, expandTo3(sv), out, srPar.order, expandTo3(mult));
#endif

gf4widemult[out] = mult;
Expand All @@ -116,29 +116,28 @@ searchShiftRegisterFast(shiftRegisterParameters &srPar) {

detect->clear(); // Reset the cycle detector.

for (cyclen=0; (detect->flipBit(sr) == false); cyclen++) {
for (cyclen=1; (detect->flipBit(sr) == false); cyclen++) {
uint64 out = sr & 0x03; // Save the output value
uint64 mul = gf4widemult[out]; // Compute the multiplier

#ifdef DEBUG
fprintf(stderr, "cycle %8lu out %02lx sr %022lo\n", cyclen, out, expandTo3(sr >> 2));
fprintf(stderr, " add %022lo\n", expandTo3(mul));
fprintf(stderr, " final %022lo\n", expandTo3((sr >> 2) ^ mul));
fprintf(stderr, "cycle %8lu out %02lx sr %0*lo\n", cyclen, out, srPar.order, expandTo3(sr >> 2));
fprintf(stderr, " add %0*lo\n", srPar.order, expandTo3(mul));
fprintf(stderr, " final %0*lo\n", srPar.order, expandTo3((sr >> 2) ^ mul));
#endif

sr = (sr >> 2) ^ mul;
}

// Report the cycle if it's sufficiently large.

#if 1
if (cyclen + 1 >= 1.00 * cycmax)
fprintf(stdout, "%12lu/%12lu %7.3f%% for vector %022lo\n",
if (cyclen + 1 >= srPar.report * cycmax)
fprintf(stdout, "%14lu/%14lu %7.3f%% for vector %0*lo\n",
cyclen,
cycmax,
100.0 * cyclen / cycmax,
srPar.order,
expandTo3(sv));
#endif

// And move the next SV.

Expand Down
46 changes: 23 additions & 23 deletions src/sequence/sequence-shiftregister-search-slow.C
Original file line number Diff line number Diff line change
Expand Up @@ -122,28 +122,28 @@ searchShiftRegisterSlow(shiftRegisterParameters &srPar) {
// Make the tap vector a monic polynomial, else we're guaranteed to
// never find a maximal size cycle.

cycmax = ((uint64)1) << (2 * srPar.len);
cycmax = ((uint64)1) << (2 * srPar.order);
SR[0] = 1;
sr[0] = 1;
sv[srPar.len-1] = 1;
sv[srPar.order-1] = 1;

for (uint32 ii=0; ii<srPar.len; ii++)
for (uint32 ii=0; ii<srPar.order; ii++)
svmax[ii] = 3;

// If we're given intiial values, use those.

if (srPar.sr[0] != 0) {
for (uint32 ii=0; ii<srPar.len; ii++)
for (uint32 ii=0; ii<srPar.order; ii++)
SR[ii] = sr[ii] = srPar.sr[ii] - '0';
}

if (srPar.svmin[0] != 0) {
for (uint32 ii=0; ii<srPar.len; ii++)
for (uint32 ii=0; ii<srPar.order; ii++)
sv[ii] = srPar.svmin[ii] - '0';
}

if (srPar.svmax[0] != 0) {
for (uint32 ii=0; ii<srPar.len; ii++)
for (uint32 ii=0; ii<srPar.order; ii++)
svmax[ii] = srPar.svmax[ii] - '0';
}

Expand All @@ -153,27 +153,27 @@ searchShiftRegisterSlow(shiftRegisterParameters &srPar) {

// Log.

fprintf(stderr, "Finding cycles for length %u bases (slow method).\n", srPar.len);
fprintf(stderr, " sr %s\n", printSV(srPar.len, sr));
fprintf(stderr, "Finding cycles for length %u bases (slow method).\n", srPar.order);
fprintf(stderr, " sr %s\n", printSV(srPar.order, sr));
fprintf(stderr, " cyclen %lu\n", cyclen);
fprintf(stderr, " cycmax %lu\n", cycmax);
fprintf(stderr, " sv %s\n", printSV(srPar.len, sv));
fprintf(stderr, " svmax %s\n", printSV(srPar.len, svmax));
fprintf(stderr, " sv %s\n", printSV(srPar.order, sv));
fprintf(stderr, " svmax %s\n", printSV(srPar.order, svmax));
fprintf(stderr, "\n");

//

bitArray *detect = new bitArray(cycmax);
uint64 kmer = 0;

while (lessThanEqual(srPar.len, sv, svmax) == true) {
//fprintf(stderr, "SV %s\r", printSV(srPar.len, sv));
while (lessThanEqual(srPar.order, sv, svmax) == true) {
//fprintf(stderr, "SV %s\r", printSV(srPar.order, sv));

// Reset the shift register, rebuild the kmer.

kmer = 0;

for (uint32 ii=0; ii<srPar.len; ii++) {
for (uint32 ii=0; ii<srPar.order; ii++) {
sr[ii] = SR[ii];

kmer <<= 2;
Expand All @@ -184,42 +184,42 @@ searchShiftRegisterSlow(shiftRegisterParameters &srPar) {

detect->clear();

for (cyclen=0; (detect->flipBit(kmer) == false); cyclen++) {
for (cyclen=1; (detect->flipBit(kmer) == false); cyclen++) {
uint32 out = sr[0];

#ifdef DEBUG
fprintf(stderr, "cycle %8lu out %c sr %s\n", cyclen, out + '0', printSV(srPar.len, sr + 1));
fprintf(stderr, " add %s\n", printMult(srPar.len, sv, out));
fprintf(stderr, "cycle %8lu out %c sr %s\n", cyclen, out + '0', printSV(srPar.order, sr + 1));
fprintf(stderr, " add %s\n", printMult(srPar.order, sv, out));
#endif

// Shift, add in the taps, and rebuild the kmer

kmer = 0;

for (uint32 kk=0; kk<srPar.len; kk++) {
for (uint32 kk=0; kk<srPar.order; kk++) {
sr[kk] = sr[kk+1] ^ gf4mult[sv[kk]][out];

kmer <<= 2;
kmer |= sr[kk];
}

#ifdef DEBUG
fprintf(stderr, " final %s\n", printSV(srPar.len, sr));
fprintf(stderr, " final %s\n", printSV(srPar.order, sr));
#endif
}

// report the cycle.
if (cyclen + 1 >= 1.00 * cycmax)
fprintf(stdout, "%12lu/%12lu %7.3f%% for vector %s\n",
if (cyclen + 1 >= srPar.report * cycmax)
fprintf(stdout, "%14lu/%14lu %7.3f%% for vector %s\n",
cyclen,
cycmax,
100.0 * cyclen / cycmax,
printSV(srPar.len, sv));
printSV(srPar.order, sv));

incrementSV(srPar.len, sv);
incrementSV(srPar.order, sv);
}

fprintf(stderr, "SV %s\n", printSV(srPar.len, sv));
fprintf(stderr, "SV %s\n", printSV(srPar.order, sv));

delete detect;
}
Loading

0 comments on commit 01f6a6e

Please sign in to comment.