From e2044dbe30bf8c3b7e1d6de015a29e153c3bd5c6 Mon Sep 17 00:00:00 2001 From: "Brian P. Walenz" Date: Mon, 21 Aug 2017 19:13:20 -0400 Subject: [PATCH] Add refactored falcon consensus. --- addCopyrights.dat | 10 + src/correction/falconConsensus-alignTag.C | 236 +++++++++++++++ src/correction/falconConsensus-alignTag.H | 154 ++++++++++ src/correction/falconConsensus-msa.H | 295 ++++++++++++++++++ src/correction/falconConsensus.C | 345 ++++++++++++++++++++++ src/correction/falconConsensus.H | 172 +++++++++++ src/main.mk | 2 + 7 files changed, 1214 insertions(+) create mode 100644 src/correction/falconConsensus-alignTag.C create mode 100644 src/correction/falconConsensus-alignTag.H create mode 100644 src/correction/falconConsensus-msa.H create mode 100644 src/correction/falconConsensus.C create mode 100644 src/correction/falconConsensus.H diff --git a/addCopyrights.dat b/addCopyrights.dat index 995869773..55d6bc6ed 100644 --- a/addCopyrights.dat +++ b/addCopyrights.dat @@ -12856,3 +12856,13 @@ A src/bogart/AS_BAT_Unitig.C nihh20161121Brian P. Walenz A addCopyrights-BuildData.pl nihh20161121Brian P. Walenz A addCopyrights.dat nihh20161121Brian P. Walenz A addCopyrights.pl nihh20161121Brian P. Walenz +D src/correction/falconConsensus-alignTag.C src/falcon_sense/libfalcon/falcon.C +D src/correction/falconConsensus-alignTag.H src/falcon_sense/libfalcon/falcon.C +D src/correction/falconConsensus-msa.H src/falcon_sense/libfalcon/falcon.C +D src/correction/falconConsensus.C src/falcon_sense/libfalcon/falcon.C +D src/correction/falconConsensus.H src/falcon_sense/libfalcon/falcon.C +A src/correction/falconConsensus-alignTag.C nihh20170821Brian P. Walenz +A src/correction/falconConsensus-alignTag.H nihh20170821Brian P. Walenz +A src/correction/falconConsensus-msa.H nihh20170821Brian P. Walenz +A src/correction/falconConsensus.C nihh20170821Brian P. Walenz +A src/correction/falconConsensus.H nihh20170821Brian P. Walenz diff --git a/src/correction/falconConsensus-alignTag.C b/src/correction/falconConsensus-alignTag.C new file mode 100644 index 000000000..d7c6863c6 --- /dev/null +++ b/src/correction/falconConsensus-alignTag.C @@ -0,0 +1,236 @@ + +/****************************************************************************** + * + * This file is part of canu, a software program that assembles whole-genome + * sequencing reads into contigs. + * + * This software is based on: + * 'Celera Assembler' (http://wgs-assembler.sourceforge.net) + * the 'kmer package' (http://kmer.sourceforge.net) + * both originally distributed by Applera Corporation under the GNU General + * Public License, version 2. + * + * Canu branched from Celera Assembler at its revision 4587. + * Canu branched from the kmer project at its revision 1994. + * + * This file is derived from: + * + * src/falcon_sense/libfalcon/falcon.C + * + * Modifications by: + * + * Brian P. Walenz beginning on 2017-AUG-21 + * are a 'United States Government Work', and + * are released in the public domain + * + * File 'README.licenses' in the root directory of this distribution contains + * full conditions and disclaimers for each license. + */ + +/* + * https://github.com/PacificBiosciences/FALCON/blob/master/src/c/falcon.c + * + * Copyright (c) 2011-2014, Pacific Biosciences of California, Inc. + * + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted (subject to the limitations in the + * disclaimer below) provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials provided + * with the distribution. + * + * * Neither the name of Pacific Biosciences nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE + * GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC + * BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF + * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include "falconConsensus-alignTag.H" +#include "edlib.H" + +static +alignTagList * +getAlignTags(char *Qalign, int32 Qbgn, int32 Qlen, int32 Qid, // read + char *Talign, int32 Tbgn, int32 Tlen, // template + int32 alignLen) { + int32 i = Qbgn - 1; + int32 j = Tbgn - 1; + int32 jj = 0; // Number of non-gap bases in Q aligned to a gap in T + int32 p_j = -1; + int32 p_jj = 0; + char p_q_base = '.'; + + alignTagList *tags = new alignTagList(alignLen); + + for (int32 k=0; k < alignLen; k++) { + if (Qalign[k] != '-') { + i++; + jj++; + } + + if (Talign[k] != '-') { + j++; + jj = 0; + } + + assert(i >= 0); + assert(i < Qlen); + assert(j >= 0); + assert(j < Tlen); + +#ifdef DEBUG + fprintf(stderr, "t %d %d %d %c %c\n", q_id, j, jj, Talign[k], Qalign[k]); +#endif + + if ((jj >= uint16MAX) || + (p_jj >= uint16MAX)) + continue; + + //tags->addTag(alignTag(Qid, j, p_j, jj, p_jj, Qalign[k], p_q_base)); + tags->setTag(Qid, j, p_j, jj, p_jj, Qalign[k], p_q_base); + + p_j = j; + p_jj = jj; + p_q_base = Qalign[k]; + } + + // sentinal at the end + + //tags->addTag(alignTag()); + + return(tags); +} + + + +alignTagList ** +alignReadsToTemplate(falconInput *evidence, + uint32 evidenceLen, + double minIdentity) { + + double maxDifference = 1.0 - minIdentity; + alignTagList **tagList = new alignTagList * [evidenceLen]; + + // I don't remember where this was causing problems, but reads longer than the template were. So truncate them. + + for (uint32 j=0; j evidence[0].readLength) { + evidence[j].readLength = evidence[0].readLength; + evidence[j].read[evidence[j].readLength] = 0; + } + + // Set everything to an empty list. Makes aborting the algnment loop much easier. + + for (uint32 j=0; j= maxDifference) { + edlibFreeAlignResult(align); + continue; + } + + int32 rBgn = 0; + int32 rEnd = evidence[j].readLength; + + int32 tBgn = align.startLocations[0]; + int32 tEnd = align.endLocations[0] + 1; // Edlib returns position of last base aligned + +#ifdef DEBUG + fprintf(stderr, "Found alignment for seq %d from %d - %d to %d - %d the dist %d length %d\n", + j, rBgn, rEnd, tBgn, tEnd, align.editDistance, align.alignmentLength); +#endif + + // convert edlib to expected + char *tAln = new char [align.alignmentLength + 1]; + char *rAln = new char [align.alignmentLength + 1]; + + edlibAlignmentToStrings(align.alignment, + align.alignmentLength, + tBgn, tEnd, + rBgn, rEnd, + evidence[0].read, evidence[j].read, + tAln, rAln); + + // strip leading/trailing gaps on template + + uint32 fBase = 0; // First non-gap in the alignment + uint32 lBase = align.alignmentLength; // Last base in the alignment (actually, first gap in the gaps at the end, but that was too long for a variable name) + + while ((fBase < align.alignmentLength) && (tAln[fBase] == '-')) + fBase++; + + while ((lBase > fBase) && (tAln[lBase-1] == '-')) + lBase--; + + rBgn += fBase; + rEnd -= align.alignmentLength - lBase; + + assert(rBgn >= 0); assert(rEnd <= evidence[j].readLength); + assert(tBgn >= 0); assert(tEnd <= evidence[0].readLength); + + rAln[lBase] = 0; // Truncate the alignments before the gaps. + tAln[lBase] = 0; + +#ifdef DEBUG + fprintf(stderr, "Final positions to be %d %d for str %d and %d %d for str %d adjst %d %d %d\n", arange.s1, arange.e1, evidence[j].readLength, arange.s2, arange.e2, evidence[0].readLength, fBase, lBase, lBase-fBase); + fprintf(stderr, "Tgt string is %s %d\n", tAln+fBase, strlen(tAln+fBase)); + fprintf(stderr, "Qry string is %s %d\n", rAln+fBase, strlen(rAln+fBase)); +#endif + + //delete tagList[j]; + + tagList[j] = getAlignTags(rAln + fBase, rBgn, evidence[j].readLength, j, + tAln + fBase, tBgn, evidence[0].readLength, + lBase - fBase); + + delete [] tAln; + delete [] rAln; + + edlibFreeAlignResult(align); + } + + return(tagList); +} diff --git a/src/correction/falconConsensus-alignTag.H b/src/correction/falconConsensus-alignTag.H new file mode 100644 index 000000000..569fcedc9 --- /dev/null +++ b/src/correction/falconConsensus-alignTag.H @@ -0,0 +1,154 @@ + +/****************************************************************************** + * + * This file is part of canu, a software program that assembles whole-genome + * sequencing reads into contigs. + * + * This software is based on: + * 'Celera Assembler' (http://wgs-assembler.sourceforge.net) + * the 'kmer package' (http://kmer.sourceforge.net) + * both originally distributed by Applera Corporation under the GNU General + * Public License, version 2. + * + * Canu branched from Celera Assembler at its revision 4587. + * Canu branched from the kmer project at its revision 1994. + * + * This file is derived from: + * + * src/falcon_sense/libfalcon/falcon.C + * + * Modifications by: + * + * Brian P. Walenz beginning on 2017-AUG-21 + * are a 'United States Government Work', and + * are released in the public domain + * + * File 'README.licenses' in the root directory of this distribution contains + * full conditions and disclaimers for each license. + */ + +/* + * https://github.com/PacificBiosciences/FALCON/blob/master/src/c/falcon.c + * + * Copyright (c) 2011-2014, Pacific Biosciences of California, Inc. + * + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted (subject to the limitations in the + * disclaimer below) provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials provided + * with the distribution. + * + * * Neither the name of Pacific Biosciences nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE + * GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC + * BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF + * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include "falconConsensus.H" + +#ifndef FALCONCONSENSUS_ALIGNTAG_H +#define FALCONCONSENSUS_ALIGNTAG_H + +class alignTag { // Completely initialized in get_align_tags() +public: + alignTag() { + q_id = INT32_MAX; + t_pos = INT32_MAX; + p_t_pos = INT32_MAX; + delta = UINT16_MAX; + p_delta = UINT16_MAX; + q_base = '.'; + p_q_base = '.'; + }; + alignTag(uint32 id, int32 tp, int32 ptp, uint16 d, uint16 pd, char qb, char pqb) { + q_id = id; + t_pos = tp; + p_t_pos = ptp; + delta = d; + p_delta = pd; + q_base = qb; + p_q_base = pqb; + }; + + + uint32 q_id; + + int32 t_pos; + int32 p_t_pos; // the tag position of the previous base + + uint16 delta; + uint16 p_delta; // the tag delta of the previous base + + char q_base; + char p_q_base; // the previous base +}; + + + +// A list of aligned tags for a single evidence read. +// +class alignTagList { +public: + alignTagList(uint32 l) { + tagsLen = 0; + tags = (l == 0) ? NULL : new alignTag [l + 1]; + }; + + ~alignTagList() { + delete [] tags; + }; + + alignTag *operator[](int32 i) { return(&tags[i]); }; + int32 numberOfTags(void) { return(tagsLen); }; + + void setTag(uint32 id, int32 tp, int32 ptp, uint16 d, uint16 pd, char qb, char pqb) { + tags[tagsLen].q_id = id; + tags[tagsLen].t_pos = tp; + tags[tagsLen].p_t_pos = ptp; + tags[tagsLen].delta = d; + tags[tagsLen].p_delta = pd; + tags[tagsLen].q_base = qb; + tags[tagsLen].p_q_base = pqb; + + tagsLen++; + }; + + void addTag(alignTag tag) { + tags[tagsLen++] = tag; + }; + +private: + int32 tagsLen; + alignTag *tags; +}; + + + +alignTagList ** +alignReadsToTemplate(falconInput *evidence, + uint32 evidenceLen, + double minIdentity); + +#endif // FALCONCONSENSUS_ALIGNTAG_H diff --git a/src/correction/falconConsensus-msa.H b/src/correction/falconConsensus-msa.H new file mode 100644 index 000000000..70899c4b2 --- /dev/null +++ b/src/correction/falconConsensus-msa.H @@ -0,0 +1,295 @@ + +/****************************************************************************** + * + * This file is part of canu, a software program that assembles whole-genome + * sequencing reads into contigs. + * + * This software is based on: + * 'Celera Assembler' (http://wgs-assembler.sourceforge.net) + * the 'kmer package' (http://kmer.sourceforge.net) + * both originally distributed by Applera Corporation under the GNU General + * Public License, version 2. + * + * Canu branched from Celera Assembler at its revision 4587. + * Canu branched from the kmer project at its revision 1994. + * + * This file is derived from: + * + * src/falcon_sense/libfalcon/falcon.C + * + * Modifications by: + * + * Brian P. Walenz beginning on 2017-AUG-21 + * are a 'United States Government Work', and + * are released in the public domain + * + * File 'README.licenses' in the root directory of this distribution contains + * full conditions and disclaimers for each license. + */ + +/* + * https://github.com/PacificBiosciences/FALCON/blob/master/src/c/falcon.c + * + * Copyright (c) 2011-2014, Pacific Biosciences of California, Inc. + * + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted (subject to the limitations in the + * disclaimer below) provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials provided + * with the distribution. + * + * * Neither the name of Pacific Biosciences nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE + * GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC + * BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF + * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include "falconConsensus.H" + +#ifndef FALCONCONSENSUS_MSA_H +#define FALCONCONSENSUS_MSA_H + +class align_tag_col_t { +public: + align_tag_col_t() { + size = 8; + n_link = 0; + p_t_pos = new int32 [size]; //memset(p_t_pos, 0, sizeof(int32) * size); + p_delta = new uint16 [size]; //memset(p_delta, 0, sizeof(uint16) * size); + p_q_base = new char [size]; //memset(p_q_base, 0, sizeof(char) * size); + link_count = new uint16 [size]; //memset(link_count, 0, sizeof(uint16) * size); + count = 0; + best_p_t_pos = -1; + best_p_delta = -1; + best_p_q_base = -1; + score = 0; + }; + + ~align_tag_col_t() { + delete [] p_t_pos; + delete [] p_delta; + delete [] p_q_base; + delete [] link_count; + }; + + void clean(void) { + n_link = 0; + //memset(p_t_pos, 0, sizeof(int32) * size); + //memset(p_delta, 0, sizeof(uint16) * size); + //memset(p_q_base, 0, sizeof(char) * size); + //memset(link_count, 0, sizeof(uint16) * size); + count = 0; + best_p_t_pos = -1; + best_p_delta = -1; + best_p_q_base = -1; + score = 0; + }; + + void addEntry(alignTag *tag) { + + if (n_link >= size) { + int32 ns; // Need to keep resetting this back to the real number of elements in each array + + ns = size; resizeArray(p_t_pos, n_link, ns, size + 16); + ns = size; resizeArray(p_delta, n_link, ns, size + 16); + ns = size; resizeArray(p_q_base, n_link, ns, size + 16); + ns = size; resizeArray(link_count, n_link, ns, size + 16); + + size = size + 16; + } + + p_t_pos [n_link] = tag->p_t_pos; + p_delta [n_link] = tag->p_delta; + p_q_base [n_link] = tag->p_q_base; + link_count[n_link] = 1; + + n_link++; + }; + + double score; + + int32 *p_t_pos; // the tag position of the previous base + uint16 *p_delta; // the tag delta of the previous base + char *p_q_base; // the previous base + uint16 *link_count; + + int32 best_p_t_pos; + + uint16 best_p_delta; + uint16 best_p_q_base; // encoded base + uint16 count; + uint16 size; // Number of items allocated in the arrays + uint16 n_link; // Number of items used in the arrays +}; + + + +class msa_base_group_t { +public: + msa_base_group_t() { + //base = new align_tag_col_t [5]; + }; + + ~msa_base_group_t() { + //delete [] base; + }; + + void clean(void) { + base[0].clean(); // A + base[1].clean(); // C + base[2].clean(); // G + base[3].clean(); // T + base[4].clean(); // - and everything else + } + + align_tag_col_t base[5]; +}; + + + +class msa_delta_group_t { +public: + msa_delta_group_t() { + deltaAlloc = 8; // Allocate pointers for 8 new items + deltaLen = 0; + delta = new msa_base_group_t * [deltaAlloc]; + + deltaPtrsLen = 1; + deltaPtrs[0] = new msa_base_group_t [deltaAlloc]; // Allocate a block of items + + for (uint32 ii=1; ii<128; ii++) // Clear the other pointers. + deltaPtrs[ii] = 0; + + for (uint32 ii=0; iiclean(); + + deltaLen = 0; + } + + + uint16 deltaAlloc; // Size of delta array + uint16 deltaLen; // Number of delta positions actually used + msa_base_group_t **delta; // Just pointers into deltaPtrs; + + uint32 deltaPtrsLen; // Next available pointer + msa_base_group_t *deltaPtrs[128]; // Holds the actual delta storage +}; + + + +// msa_delta_group_t **msa_array +class msa_vector_t { +public: + msa_vector_t(uint32 templateLen) { + delta_groupsLen = templateLen; + delta_groupsMax = templateLen; + delta_groups = new msa_delta_group_t [templateLen]; + //msa_delta_group_t **msa_array = new msa_delta_group_t * [templateLen]; + }; + + ~msa_vector_t() { + delete [] delta_groups; + }; + + void clean(void) { + for (uint32 i=0; inumberOfTags(); j++) { + alignTag *tag = (*tags[i])[j]; + + if (tag->delta == 0) { + t_pos = tag->t_pos; + coverage[ t_pos ] ++; + } + +#ifdef DEBUG + fprintf(stderr, "Processing position %d in sequence %d (in msa it is column %d with cov %d) with delta %d and current size is %d\n", j, i, t_pos, coverage[t_pos], tag->delta, msa[t_pos]->size); +#endif + + // Assume t_pos was set on earlier iteration. + // (Otherwise, use its initial value, which might be an error. ~cd) + + assert(tag->delta < uint16MAX); + + msa[t_pos]->increaseDeltaGroup(tag->delta); + + uint32 base = 4; + + switch (tag->q_base) { + case 'A': base = 0; break; + case 'C': base = 1; break; + case 'G': base = 2; break; + case 'T': base = 3; break; + case '-': base = 4; break; + default : base = 4; break; + } + + if (j > 0) assert(tag->p_t_pos >= 0); + + + // Update the column + + assert(tag->delta < msa[t_pos]->deltaLen); + align_tag_col_t &col = msa[t_pos]->delta[tag->delta]->base[base]; + + bool updated = false; + + col.count += 1; + + // Search for a matching column. If found, add one. If not found, make a new entry. + + for (int32 kk=0; kkp_t_pos == col.p_t_pos[kk]) && + (tag->p_delta == col.p_delta[kk]) && + (tag->p_q_base == col.p_q_base[kk])) { + col.link_count[kk]++; + updated = true; + break; + } + } + + if (updated == false) + col.addEntry(tag); + +#ifdef DEBUG + fprintf(stderr, "Updating column from seq %d at position %d in column %d base pos %d base %d to be %c and length is %d\n", i, j, t_pos, base, tag->p_t_pos, tag->p_q_base, msa[t_pos]->deltaLen); +#endif + } + + delete tags[i]; + tags[i] = NULL; + } + + // Done with the tags. + + delete [] tags; + tags = NULL; + + // propogate score throught the alignment links, setup backtracking information + + align_tag_col_t *g_best_aln_col = NULL; + uint32 g_best_ck = 0; + int32 g_best_t_pos = 0; + double g_best_score = DBL_MIN; + + // Over every template base, + // And every delta position, + // And every base at that position + // Search links to previous columns, remember the highest scoring one, + // Then remember the highest scoring link for each + + for (uint32 i=0; ideltaLen; j++) { + for (uint32 kk=0; kk<5; kk++) { + align_tag_col_t *aln_col = msa[i]->delta[j]->base + kk; + + aln_col->score = DBL_MIN; + + uint32 best_i = UINT32_MAX; + uint32 best_j = UINT32_MAX; + uint32 best_b = UINT32_MAX; + uint32 best_ck = UINT32_MAX; + double best_score = DBL_MIN; + + //fprintf(stderr, "Processing consensus template %d which as %d delta and on base %d i pulled up col %d with %d links and best %d %d %d\n", + // i, j, kk, aln_col, aln_col->n_link, aln_col->best_p_t_pos, aln_col->best_p_delta, aln_col->best_p_q_base); + + // Search links to previous columns, remember the highest scoring one. + + for (uint32 ck=0; ckn_link; ck++) { + int32 pi = aln_col->p_t_pos[ck]; + int32 pj = aln_col->p_delta[ck]; + int32 pkk = 4; + + switch (aln_col->p_q_base[ck]) { + case 'A': pkk = 0; break; + case 'C': pkk = 1; break; + case 'G': pkk = 2; break; + case 'T': pkk = 3; break; + case '-': pkk = 4; break; + default : pkk = 4; break; + } + + // Score is just our link weight, possibly with the previous column's score, and penalizing for coverage. + double score = aln_col->link_count[ck] - coverage[i] * 0.5; + + if ((aln_col->p_t_pos[ck] != -1) && + (pj <= msa[pi]->deltaLen)) + score += msa[pi]->delta[pj]->base[pkk].score; + + // Save best score. + + if (score > best_score) { + best_i = aln_col->best_p_t_pos = pi; + best_j = aln_col->best_p_delta = pj; + best_b = aln_col->best_p_q_base = pkk; + best_ck = ck; + best_score = score; + } + } // Over all links + + aln_col->score = best_score; + + if (best_score > g_best_score) { + g_best_score = best_score; + g_best_aln_col = aln_col; + g_best_ck = best_ck; + g_best_t_pos = i; + } + } + } + } + + assert(g_best_score > DBL_MIN); + + // Reconstruct the sequences. + + falconData *fd = new falconData(templateLen * 2 + 1); + +#ifdef TRACK_POSITIONS + consensus->originalPos.reserve(templateLen * 2 + 1); // This is an over-generous pre-allocation +#endif + + uint32 index = 0; + uint32 ck = g_best_ck; + uint32 i = g_best_t_pos; + uint32 j = 0; + + char bb = '$'; + + while (1) { +#ifdef TRACK_POSITIONS + int originalI = i; +#endif + + // Original version had bb outside, initialized to '$', with a comment that 'on bad input, bb + // will keep previous value, possibly '$'.' + + //char bb = '-'; + + switch (ck) { + case 0: bb = (coverage[i] <= minAllowedCoverage) ? 'a' : 'A'; break; + case 1: bb = (coverage[i] <= minAllowedCoverage) ? 'c' : 'C'; break; + case 2: bb = (coverage[i] <= minAllowedCoverage) ? 'g' : 'G'; break; + case 3: bb = (coverage[i] <= minAllowedCoverage) ? 't' : 'T'; break; + case 4: bb = '-'; break; + } + + i = g_best_aln_col->best_p_t_pos; + j = g_best_aln_col->best_p_delta; + ck = g_best_aln_col->best_p_q_base; + + double sco = g_best_aln_col->score; + + if ((i == -1) || (index >= templateLen * 2)) + break; + + g_best_aln_col = msa[i]->delta[j]->base + ck; // Move to the next previous column + + if (bb != '-') { + fd->seq[index] = bb; + fd->eqv[index] = (int) sco - (int) g_best_aln_col->score; + //fprintf(stderr, "C %d %d %c %lf %d %d\n", i, index, bb, g_best_aln_col->score, coverage[i], fd->eqv[index] ); + index++; + +#ifdef TRACK_POSITIONS + consensus->originalPos.push_back(originalI); +#endif + } + } + + fd->seq[index] = 0; + + // reverse the sequence + +#ifdef TRACK_POSITIONS + std::reverse(consensus->originalPos.begin(), consensus->originalPos.end()); +#endif + + reverse(fd->seq, fd->seq+index); + reverse(fd->eqv, fd->eqv+index); + +#if 0 + for (int32 i=0; iseq[i] = fd->seq[i] ^ fd->seq[index-i-1]; + fd->seq[index-i-1] = fd->seq[i] ^ fd->seq[index-i-1]; + fd->seq[i] = fd->seq[i] ^ fd->seq[index-i-1]; + + fd->eqv[i] = fd->eqv[i] ^ fd->eqv[index-i-1]; + fd->eqv[index-i-1] = fd->eqv[i] ^ fd->eqv[index-i-1]; + fd->eqv[i] = fd->eqv[i] ^ fd->eqv[index-i-1]; + } +#endif + + msa.clean(); + + free(coverage); + + return(fd); +} + + + +falconData * +generateConsensus(falconInput *evidence, + uint32 evidenceLen, + uint32 minAllowedCoverage, + double minIdentity, + uint32 UNUSED(minOutputLength)) { + + return(getConsensus(evidenceLen, + alignReadsToTemplate(evidence, evidenceLen, minIdentity), + evidence[0].readLength, + minAllowedCoverage)); +} diff --git a/src/correction/falconConsensus.H b/src/correction/falconConsensus.H new file mode 100644 index 000000000..d11c30157 --- /dev/null +++ b/src/correction/falconConsensus.H @@ -0,0 +1,172 @@ + +/****************************************************************************** + * + * This file is part of canu, a software program that assembles whole-genome + * sequencing reads into contigs. + * + * This software is based on: + * 'Celera Assembler' (http://wgs-assembler.sourceforge.net) + * the 'kmer package' (http://kmer.sourceforge.net) + * both originally distributed by Applera Corporation under the GNU General + * Public License, version 2. + * + * Canu branched from Celera Assembler at its revision 4587. + * Canu branched from the kmer project at its revision 1994. + * + * This file is derived from: + * + * src/falcon_sense/libfalcon/falcon.C + * + * Modifications by: + * + * Brian P. Walenz beginning on 2017-AUG-21 + * are a 'United States Government Work', and + * are released in the public domain + * + * File 'README.licenses' in the root directory of this distribution contains + * full conditions and disclaimers for each license. + */ + +/* + * https://github.com/PacificBiosciences/FALCON/blob/master/src/c/falcon.c + * + * Copyright (c) 2011-2014, Pacific Biosciences of California, Inc. + * + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted (subject to the limitations in the + * disclaimer below) provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials provided + * with the distribution. + * + * * Neither the name of Pacific Biosciences nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE + * GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC + * BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF + * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include "AS_global.H" + +#include +#include +#include +#include + +#include +#include + +using namespace std; + +#ifndef FALCONCONSENSUS_H +#define FALCONCONSENSUS_H + +class falconInput { +public: + falconInput() { + read = NULL; + readLength = 0; + + ident = 0; + + placedBgn = 0; + placedEnd = 0; + + alignedBgn = 0; + alignedEnd = 0; + }; + + void addInput(uint32 ident_, + char *read_, + uint32 readLen_, + uint32 bgn_, + uint32 end_) { + + read = new char [readLen_ + 1]; + readLength = readLen_; + + memcpy(read, read_, readLen_); + + ident = ident_; + + placedBgn = bgn_; + placedEnd = end_; + + alignedBgn = 0; + alignedEnd = 0; + }; + + ~falconInput() { + delete [] read; + }; + + char *read; + int32 readLength; + + uint32 ident; + + int32 placedBgn; + int32 placedEnd; + + int32 alignedBgn; + int32 alignedEnd; +}; + + + +class falconData { +public: + falconData() { + seq = NULL; + eqv = NULL; + }; + + falconData(int32 len) { + maxLen = len; + seq = new char [len]; + eqv = new int32 [len]; + }; + + ~falconData() { + delete [] seq; + delete [] eqv; + }; + + int32 maxLen; + + char *seq; + int32 *eqv; + vector originalPos; // For tracking original read positions in the corrected read +}; + + + +falconData * +generateConsensus(falconInput *evidence, + uint32 evidenceLen, + uint32 minAllowedCoverage, + double minIdentity, + uint32 minOutputLength); + + +#endif // FALCONCONSENSUS_H diff --git a/src/main.mk b/src/main.mk index 8f7218ee2..2a76c702b 100644 --- a/src/main.mk +++ b/src/main.mk @@ -48,6 +48,8 @@ SOURCES := AS_global.C \ AS_UTL/kMer.C \ \ falcon_sense/libfalcon/falcon.C \ + correction/falconConsensus.C \ + correction/falconConsensus-alignTag.C \ \ stores/gkStore.C \ stores/gkStoreEncode.C \