Skip to content

Commit

Permalink
Allow matches to N (and n) ambiguity bases. Based on Martinsos/edlib@9…
Browse files Browse the repository at this point in the history
  • Loading branch information
brianwalenz committed Jan 10, 2019
1 parent dfacdcb commit 3a3c336
Showing 1 changed file with 72 additions and 21 deletions.
93 changes: 72 additions & 21 deletions src/overlapInCore/libedlib/edlib.C
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,37 @@ struct Block {
Block(Word P, Word M, int score) :P(P), M(M), score(score) {}
};


// A memory efficient definition of equality, that
// allows A=a=n=N, C=c=n=N, etc.
//
class EqualityDefinition {
public:
EqualityDefinition() {
n = 255;
N = 255;
};

void setn(unsigned char n_) { // Set the value of an encoded 'n'.
n = n_; // If not set, defaults to 255 which
}; // doesn't match any valid letter.

void setN(unsigned char N_) {
N = N_;
};

bool areEqual(unsigned char a, unsigned char b) const {
return((a == b) ||
(a == n) || (b == n) ||
(a == N) || (b == N));
};

private:
unsigned char n;
unsigned char N;
};


static int myersCalcEditDistanceSemiGlobal(const Word* Peq, int W, int maxNumBlocks,
const unsigned char* query, int queryLength,
const unsigned char* target, int targetLength,
Expand All @@ -119,13 +150,13 @@ static int myersCalcEditDistanceNW(const Word* Peq, int W, int maxNumBlocks,
static int obtainAlignment(
const unsigned char* query, const unsigned char* rQuery, int queryLength,
const unsigned char* target, const unsigned char* rTarget, int targetLength,
int alphabetLength, int bestScore,
const EqualityDefinition& equalityDefinition, int alphabetLength, int bestScore,
unsigned char** alignment, int* alignmentLength);

static int obtainAlignmentHirschberg(
const unsigned char* query, const unsigned char* rQuery, int queryLength,
const unsigned char* target, const unsigned char* rTarget, int targetLength,
int alphabetLength, int bestScore,
const EqualityDefinition& equalityDefinition, int alphabetLength, int bestScore,
unsigned char** alignment, int* alignmentLength);

static int obtainAlignmentTraceback(int queryLength, int targetLength,
Expand All @@ -135,14 +166,16 @@ static int obtainAlignmentTraceback(int queryLength, int targetLength,
static int transformSequences(const char* queryOriginal, int queryLength,
const char* targetOriginal, int targetLength,
unsigned char** queryTransformed,
unsigned char** targetTransformed);
unsigned char** targetTransformed,
EqualityDefinition& equalityDefinitio);

static inline int ceilDiv(int x, int y);

static inline unsigned char* createReverseCopy(const unsigned char* seq, int length);

static inline Word* buildPeq(int alphabetLength, const unsigned char* query,
int queryLength);
int queryLength,
const EqualityDefinition& equalityDefinition);



Expand All @@ -165,8 +198,12 @@ EdlibAlignResult edlibAlign(const char* const queryOriginal, const int queryLeng

/*------------ TRANSFORM SEQUENCES AND RECOGNIZE ALPHABET -----------*/
unsigned char* query, * target;
int alphabetLength = transformSequences(queryOriginal, queryLength, targetOriginal, targetLength,
&query, &target);
EqualityDefinition equalityDefinition;

int alphabetLength = transformSequences(queryOriginal, queryLength,
targetOriginal, targetLength,
&query, &target, equalityDefinition);

result.alphabetLength = alphabetLength;
/*-------------------------------------------------------*/

Expand All @@ -175,7 +212,7 @@ EdlibAlignResult edlibAlign(const char* const queryOriginal, const int queryLeng
int maxNumBlocks = ceilDiv(queryLength, WORD_SIZE); // bmax in Myers
int W = maxNumBlocks * WORD_SIZE - queryLength; // number of redundant cells in last level blocks

Word* Peq = buildPeq(alphabetLength, query, queryLength);
Word* Peq = buildPeq(alphabetLength, query, queryLength, equalityDefinition);
/*-------------------------------------------------------*/


Expand Down Expand Up @@ -219,7 +256,7 @@ EdlibAlignResult edlibAlign(const char* const queryOriginal, const int queryLeng
if (config.mode == EDLIB_MODE_HW) { // If HW, I need to calculate start locations.
const unsigned char* rTarget = createReverseCopy(target, targetLength);
const unsigned char* rQuery = createReverseCopy(query, queryLength);
Word* rPeq = buildPeq(alphabetLength, rQuery, queryLength); // Peq for reversed query
Word* rPeq = buildPeq(alphabetLength, rQuery, queryLength, equalityDefinition);
for (int i = 0; i < result.numLocations; i++) {
int endLocation = result.endLocations[i];
if (endLocation == -1) {
Expand Down Expand Up @@ -271,7 +308,7 @@ EdlibAlignResult edlibAlign(const char* const queryOriginal, const int queryLeng
const unsigned char* rQuery = createReverseCopy(query, queryLength);
obtainAlignment(query, rQuery, queryLength,
alnTarget, rAlnTarget, alnTargetLength,
alphabetLength, result.editDistance,
equalityDefinition, alphabetLength, result.editDistance,
&(result.alignment), &(result.alignmentLength));
delete[] rAlnTarget;
delete[] rQuery;
Expand Down Expand Up @@ -369,8 +406,10 @@ void edlibAlignmentToStrings(const unsigned char* alignment, int alignmentLength
* Bit i of Peq[s * maxNumBlocks + b] is 1 if i-th symbol from block b of query equals symbol s, otherwise it is 0.
* NOTICE: free returned array with delete[]!
*/
static inline Word* buildPeq(const int alphabetLength, const unsigned char* const query,
const int queryLength) {
static inline Word* buildPeq(const int alphabetLength,
const unsigned char* const query,
const int queryLength,
const EqualityDefinition& equalityDefinition) {
int maxNumBlocks = ceilDiv(queryLength, WORD_SIZE);
// table of dimensions alphabetLength+1 x maxNumBlocks. Last symbol is wildcard.
Word* Peq = new Word[(alphabetLength + 1) * maxNumBlocks];
Expand All @@ -383,7 +422,7 @@ static inline Word* buildPeq(const int alphabetLength, const unsigned char* cons
for (int r = (b+1) * WORD_SIZE - 1; r >= b * WORD_SIZE; r--) {
Peq[symbol * maxNumBlocks + b] <<= 1;
// NOTE: We pretend like query is padded at the end with W wildcard symbols
if (r >= queryLength || query[r] == symbol)
if (r >= queryLength || equalityDefinition.areEqual(query[r], symbol)) // areEqual
Peq[symbol * maxNumBlocks + b] += 1;
}
} else { // Last symbol is wildcard, so it is all 1s
Expand Down Expand Up @@ -1174,6 +1213,7 @@ static int obtainAlignmentTraceback(const int queryLength, const int targetLengt
* @param [in] target
* @param [in] rTarget Reversed target.
* @param [in] targetLength
* @param [in] equalityDefinition
* @param [in] alphabetLength
* @param [in] bestScore Best(optimal) score.
* @param [out] alignment Sequence of edit operations that make target equal to query.
Expand All @@ -1183,7 +1223,7 @@ static int obtainAlignmentTraceback(const int queryLength, const int targetLengt
static int obtainAlignment(
const unsigned char* const query, const unsigned char* const rQuery, const int queryLength,
const unsigned char* const target, const unsigned char* const rTarget, const int targetLength,
const int alphabetLength, const int bestScore,
const EqualityDefinition& equalityDefinition, const int alphabetLength, const int bestScore,
unsigned char** const alignment, int* const alignmentLength) {

// Handle special case when one of sequences has length of 0.
Expand Down Expand Up @@ -1212,7 +1252,7 @@ static int obtainAlignment(
if (alignmentDataSize < 1024 * 1024) {
int score_, endLocation_; // Used only to call function.
AlignmentData* alignData = NULL;
Word* Peq = buildPeq(alphabetLength, query, queryLength);
Word* Peq = buildPeq(alphabetLength, query, queryLength, equalityDefinition);
myersCalcEditDistanceNW(Peq, W, maxNumBlocks,
query, queryLength,
target, targetLength,
Expand All @@ -1229,7 +1269,7 @@ static int obtainAlignment(
} else {
statusCode = obtainAlignmentHirschberg(query, rQuery, queryLength,
target, rTarget, targetLength,
alphabetLength, bestScore,
equalityDefinition, alphabetLength, bestScore,
alignment, alignmentLength);
}
return statusCode;
Expand All @@ -1254,14 +1294,14 @@ static int obtainAlignment(
static int obtainAlignmentHirschberg(
const unsigned char* const query, const unsigned char* const rQuery, const int queryLength,
const unsigned char* const target, const unsigned char* const rTarget, const int targetLength,
const int alphabetLength, const int bestScore,
const EqualityDefinition& equalityDefinition, const int alphabetLength, const int bestScore,
unsigned char** const alignment, int* const alignmentLength) {

const int maxNumBlocks = ceilDiv(queryLength, WORD_SIZE);
const int W = maxNumBlocks * WORD_SIZE - queryLength;

Word* Peq = buildPeq(alphabetLength, query, queryLength);
Word* rPeq = buildPeq(alphabetLength, rQuery, queryLength);
Word* Peq = buildPeq(alphabetLength, query, queryLength, equalityDefinition);
Word* rPeq = buildPeq(alphabetLength, rQuery, queryLength, equalityDefinition);

// Used only to call functions.
int score_, endLocation_;
Expand Down Expand Up @@ -1400,11 +1440,13 @@ static int obtainAlignmentHirschberg(
unsigned char* ulAlignment = NULL; int ulAlignmentLength;
int ulStatusCode = obtainAlignment(query, rQuery + lrHeight, ulHeight,
target, rTarget + lrWidth, ulWidth,
alphabetLength, leftScore, &ulAlignment, &ulAlignmentLength);
equalityDefinition, alphabetLength, leftScore,
&ulAlignment, &ulAlignmentLength);
unsigned char* lrAlignment = NULL; int lrAlignmentLength;
int lrStatusCode = obtainAlignment(query + ulHeight, rQuery, lrHeight,
target + ulWidth, rTarget, lrWidth,
alphabetLength, rightScore, &lrAlignment, &lrAlignmentLength);
equalityDefinition, alphabetLength, rightScore,
&lrAlignment, &lrAlignmentLength);
if (ulStatusCode == EDLIB_STATUS_ERROR || lrStatusCode == EDLIB_STATUS_ERROR) {
delete[] ulAlignment;
delete[] lrAlignment;
Expand Down Expand Up @@ -1443,7 +1485,8 @@ static int obtainAlignmentHirschberg(
static int transformSequences(const char* const queryOriginal, const int queryLength,
const char* const targetOriginal, const int targetLength,
unsigned char** const queryTransformed,
unsigned char** const targetTransformed) {
unsigned char** const targetTransformed,
EqualityDefinition &equalityDefinition) {
// Alphabet is constructed from letters that are present in sequences.
// Each letter is assigned an ordinal number, starting from 0 up to alphabetLength - 1,
// and new query and target are created in which letters are replaced with their ordinal numbers.
Expand Down Expand Up @@ -1476,6 +1519,14 @@ static int transformSequences(const char* const queryOriginal, const int queryLe
(*targetTransformed)[i] = letterIdx[c];
}

if (inAlphabet['n']) {
equalityDefinition.setn(letterIdx['n']);
}

if (inAlphabet['N']) {
equalityDefinition.setN(letterIdx['N']);
}

return alphabetLength;
}

Expand Down

0 comments on commit 3a3c336

Please sign in to comment.