From fd757f3dddd4614594d91b6c85174767b201b380 Mon Sep 17 00:00:00 2001 From: Shifu Chen Date: Sun, 24 Mar 2019 15:12:55 +0800 Subject: [PATCH 01/19] enable correction for merging mode --- src/options.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/options.cpp b/src/options.cpp index ad431f9..1c6c8fe 100644 --- a/src/options.cpp +++ b/src/options.cpp @@ -68,6 +68,9 @@ bool Options::validate() { cerr << "Merging mode. Ignore argument --out2 = " << out2 << endl; out2 = ""; } + // enable correction if it's not enabled + if(!correction.enabled) + correction.enabled = true; } // if output to STDOUT, then... From 2c5bdadc28605728bf3c292568dc9a23cd5722d4 Mon Sep 17 00:00:00 2001 From: Shifu Chen Date: Fri, 29 Mar 2019 17:23:19 +0800 Subject: [PATCH 02/19] fix the issue that trim_front may affect adapter trimming https://github.com/OpenGene/fastp/issues/146 --- src/adaptertrimmer.cpp | 23 +++++++++++++++-------- src/adaptertrimmer.h | 2 +- src/filter.cpp | 9 +++++++-- src/filter.h | 2 +- src/peprocessor.cpp | 8 +++++--- src/seprocessor.cpp | 3 ++- 6 files changed, 31 insertions(+), 16 deletions(-) diff --git a/src/adaptertrimmer.cpp b/src/adaptertrimmer.cpp index e8c7516..de51118 100644 --- a/src/adaptertrimmer.cpp +++ b/src/adaptertrimmer.cpp @@ -12,25 +12,32 @@ bool AdapterTrimmer::trimByOverlapAnalysis(Read* r1, Read* r2, FilterResult* fr) return trimByOverlapAnalysis(r1, r2, fr, ov); } -bool AdapterTrimmer::trimByOverlapAnalysis(Read* r1, Read* r2, FilterResult* fr, OverlapResult ov) { +bool AdapterTrimmer::trimByOverlapAnalysis(Read* r1, Read* r2, FilterResult* fr, OverlapResult ov, int frontTrimmed1, int frontTrimmed2) { int ol = ov.overlap_len; if(ov.diff<=5 && ov.overlapped && ov.offset < 0 && ol > r1->length()/3) { - string adapter1 = r1->mSeq.mStr.substr(ol, r1->length() - ol); - string adapter2 = r2->mSeq.mStr.substr(ol, r2->length() - ol); + + //5' ......frontTrimmed1......|------------------------------------------|----- 3' + //3' -----|-------------------------------------------|......frontTrimmed2..... 5' + + int len1 = min(r1->length(), ol + frontTrimmed2); + int len2 = min(r2->length(), ol + frontTrimmed1); + string adapter1 = r1->mSeq.mStr.substr(len1, r1->length() - len1); + string adapter2 = r2->mSeq.mStr.substr(len2, r2->length() - len2); if(_DEBUG) { cerr << adapter1 << endl; cerr << adapter2 << endl; + cerr << "frontTrimmed2: " << frontTrimmed1 << endl; + cerr << "frontTrimmed2: " << frontTrimmed2 << endl; cerr << "overlap:" << ov.offset << "," << ov.overlap_len << ", " << ov.diff << endl; r1->print(); r2->reverseComplement()->print(); cerr <mSeq.mStr = r1->mSeq.mStr.substr(0, ol); - r1->mQuality = r1->mQuality.substr(0, ol); - r2->mSeq.mStr = r2->mSeq.mStr.substr(0, ol); - r2->mQuality = r2->mQuality.substr(0, ol); + r1->mSeq.mStr = r1->mSeq.mStr.substr(0, len1); + r1->mQuality = r1->mQuality.substr(0, len1); + r2->mSeq.mStr = r2->mSeq.mStr.substr(0, len2); + r2->mQuality = r2->mQuality.substr(0, len2); fr->addAdapterTrimmed(adapter1, adapter2); return true; diff --git a/src/adaptertrimmer.h b/src/adaptertrimmer.h index 974918e..f68481c 100644 --- a/src/adaptertrimmer.h +++ b/src/adaptertrimmer.h @@ -16,7 +16,7 @@ class AdapterTrimmer{ ~AdapterTrimmer(); static bool trimByOverlapAnalysis(Read* r1, Read* r2, FilterResult* fr); - static bool trimByOverlapAnalysis(Read* r1, Read* r2, FilterResult* fr, OverlapResult ov); + static bool trimByOverlapAnalysis(Read* r1, Read* r2, FilterResult* fr, OverlapResult ov, int frontTrimmed1 = 0, int frontTrimmed2 = 0); static bool trimBySequence(Read* r1, FilterResult* fr, string& adapter, bool isR2 = false); static bool test(); diff --git a/src/filter.cpp b/src/filter.cpp index 448430a..cf6f842 100644 --- a/src/filter.cpp +++ b/src/filter.cpp @@ -75,7 +75,8 @@ bool Filter::passLowComplexityFilter(Read* r) { return false; } -Read* Filter::trimAndCut(Read* r, int front, int tail) { +Read* Filter::trimAndCut(Read* r, int front, int tail, int& frontTrimmed) { + frontTrimmed = 0; // return the same read for speed if no change needed if(front == 0 && tail == 0 && !mOptions->qualityCut.enabledFront && !mOptions->qualityCut.enabledTail && !mOptions->qualityCut.enabledRight) return r; @@ -91,6 +92,7 @@ Read* Filter::trimAndCut(Read* r, int front, int tail) { } else if(!mOptions->qualityCut.enabledFront && !mOptions->qualityCut.enabledTail && !mOptions->qualityCut.enabledRight){ r->mSeq.mStr = r->mSeq.mStr.substr(front, rlen); r->mQuality = r->mQuality.substr(front, rlen); + frontTrimmed = front; return r; } @@ -205,6 +207,8 @@ Read* Filter::trimAndCut(Read* r, int front, int tail) { r->mSeq.mStr = r->mSeq.mStr.substr(front, rlen); r->mQuality = r->mQuality.substr(front, rlen); + frontTrimmed = front; + return r; } @@ -257,7 +261,8 @@ bool Filter::test() { opt.qualityCut.windowSizeTail = 4; opt.qualityCut.qualityTail = 20; Filter filter(&opt); - Read* ret = filter.trimAndCut(&r, 0, 1); + int frontTrimmed = 0; + Read* ret = filter.trimAndCut(&r, 0, 1, frontTrimmed); ret->print(); return ret->mSeq.mStr == "CCCCCCCCCCCCCCCCCCCCCCCCCCCC" diff --git a/src/filter.h b/src/filter.h index 5284414..2c1fcd0 100644 --- a/src/filter.h +++ b/src/filter.h @@ -16,7 +16,7 @@ class Filter{ ~Filter(); int passFilter(Read* r); bool passLowComplexityFilter(Read* r); - Read* trimAndCut(Read* r, int front, int tail); + Read* trimAndCut(Read* r, int front, int tail, int& frontTrimmed); bool filterByIndex(Read* r); bool filterByIndex(Read* r1, Read* r2); static bool test(); diff --git a/src/peprocessor.cpp b/src/peprocessor.cpp index 0188c2e..aa839e3 100644 --- a/src/peprocessor.cpp +++ b/src/peprocessor.cpp @@ -274,8 +274,10 @@ bool PairEndProcessor::processPairEnd(ReadPairPack* pack, ThreadConfig* config){ mUmiProcessor->process(or1, or2); // trim in head and tail, and apply quality cut in sliding window - Read* r1 = mFilter->trimAndCut(or1, mOptions->trim.front1, mOptions->trim.tail1); - Read* r2 = mFilter->trimAndCut(or2, mOptions->trim.front2, mOptions->trim.tail2); + int frontTrimmed1 = 0; + int frontTrimmed2 = 0; + Read* r1 = mFilter->trimAndCut(or1, mOptions->trim.front1, mOptions->trim.tail1, frontTrimmed1); + Read* r2 = mFilter->trimAndCut(or2, mOptions->trim.front2, mOptions->trim.tail2, frontTrimmed2); if(r1 != NULL && r2!=NULL) { if(mOptions->polyGTrim.enabled) @@ -293,7 +295,7 @@ bool PairEndProcessor::processPairEnd(ReadPairPack* pack, ThreadConfig* config){ BaseCorrector::correctByOverlapAnalysis(r1, r2, config->getFilterResult(), ov); } if(mOptions->adapter.enabled) { - bool trimmed = AdapterTrimmer::trimByOverlapAnalysis(r1, r2, config->getFilterResult(), ov); + bool trimmed = AdapterTrimmer::trimByOverlapAnalysis(r1, r2, config->getFilterResult(), ov, frontTrimmed1, frontTrimmed2); if(!trimmed){ if(mOptions->adapter.hasSeqR1) AdapterTrimmer::trimBySequence(r1, config->getFilterResult(), mOptions->adapter.sequence, false); diff --git a/src/seprocessor.cpp b/src/seprocessor.cpp index d192eb5..127a293 100644 --- a/src/seprocessor.cpp +++ b/src/seprocessor.cpp @@ -201,8 +201,9 @@ bool SingleEndProcessor::processSingleEnd(ReadPack* pack, ThreadConfig* config){ if(mOptions->umi.enabled) mUmiProcessor->process(or1); + int frontTrimmed = 0; // trim in head and tail, and apply quality cut in sliding window - Read* r1 = mFilter->trimAndCut(or1, mOptions->trim.front1, mOptions->trim.tail1); + Read* r1 = mFilter->trimAndCut(or1, mOptions->trim.front1, mOptions->trim.tail1, frontTrimmed); if(r1 != NULL) { if(mOptions->polyGTrim.enabled) From 8c756ae99d1d50b58b5568f0ea63dce05183b931 Mon Sep 17 00:00:00 2001 From: Shifu Chen Date: Fri, 29 Mar 2019 17:41:46 +0800 Subject: [PATCH 03/19] revise the insert size estimation a little when trim front is enabled --- src/peprocessor.cpp | 10 +++++----- src/peprocessor.h | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/peprocessor.cpp b/src/peprocessor.cpp index aa839e3..7ec79df 100644 --- a/src/peprocessor.cpp +++ b/src/peprocessor.cpp @@ -288,7 +288,7 @@ bool PairEndProcessor::processPairEnd(ReadPairPack* pack, ThreadConfig* config){ OverlapResult ov = OverlapAnalysis::analyze(r1, r2, mOptions->overlapDiffLimit, mOptions->overlapRequire); // we only use thread 0 to evaluae ISIZE if(config->getThreadId() == 0) { - statInsertSize(r1, r2, ov); + statInsertSize(r1, r2, ov, frontTrimmed1, frontTrimmed2); isizeEvaluated = true; } if(mOptions->correction.enabled) { @@ -307,7 +307,7 @@ bool PairEndProcessor::processPairEnd(ReadPairPack* pack, ThreadConfig* config){ if(config->getThreadId() == 0 && !isizeEvaluated && r1 != NULL && r2!=NULL) { OverlapResult ov = OverlapAnalysis::analyze(r1, r2, mOptions->overlapDiffLimit, mOptions->overlapRequire); - statInsertSize(r1, r2, ov); + statInsertSize(r1, r2, ov, frontTrimmed1, frontTrimmed2); isizeEvaluated = true; } @@ -435,13 +435,13 @@ bool PairEndProcessor::processPairEnd(ReadPairPack* pack, ThreadConfig* config){ return true; } -void PairEndProcessor::statInsertSize(Read* r1, Read* r2, OverlapResult& ov) { +void PairEndProcessor::statInsertSize(Read* r1, Read* r2, OverlapResult& ov, int frontTrimmed1, int frontTrimmed2) { int isize = mOptions->insertSizeMax; if(ov.overlapped) { if(ov.offset > 0) - isize = r1->length() + r2->length() - ov.overlap_len; + isize = r1->length() + r2->length() - ov.overlap_len + frontTrimmed1 + frontTrimmed2; else - isize = ov.overlap_len; + isize = ov.overlap_len + frontTrimmed1 + frontTrimmed2; } if(isize > mOptions->insertSizeMax) diff --git a/src/peprocessor.h b/src/peprocessor.h index 26fd98d..1d64e4d 100644 --- a/src/peprocessor.h +++ b/src/peprocessor.h @@ -57,7 +57,7 @@ class PairEndProcessor{ void initConfig(ThreadConfig* config); void initOutput(); void closeOutput(); - void statInsertSize(Read* r1, Read* r2, OverlapResult& ov); + void statInsertSize(Read* r1, Read* r2, OverlapResult& ov, int frontTrimmed1 = 0, int frontTrimmed2 = 0); int getPeakInsertSize(); void writeTask(WriterThread* config); From 390f6af6efd945d248214742ca8d3b4c953280d9 Mon Sep 17 00:00:00 2001 From: Shifu Chen Date: Thu, 4 Apr 2019 22:38:32 +0800 Subject: [PATCH 04/19] support outputing unpaired reads --- src/common.h | 2 +- src/main.cpp | 7 ++++++ src/options.cpp | 30 ++++++++++++++++++++++ src/options.h | 6 ++++- src/peprocessor.cpp | 61 +++++++++++++++++++++++++++++++++++++++++++++ src/peprocessor.h | 2 ++ 6 files changed, 106 insertions(+), 2 deletions(-) diff --git a/src/common.h b/src/common.h index 867d720..3575309 100644 --- a/src/common.h +++ b/src/common.h @@ -1,7 +1,7 @@ #ifndef COMMON_H #define COMMON_H -#define FASTP_VER "0.19.8" +#define FASTP_VER "0.19.9" #define _DEBUG false diff --git a/src/main.cpp b/src/main.cpp index 1b2cc02..6ed42b0 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -35,6 +35,8 @@ int main(int argc, char* argv[]){ cmd.add("out1", 'o', "read1 output file name", false, ""); cmd.add("in2", 'I', "read2 input file name", false, ""); cmd.add("out2", 'O', "read2 output file name", false, ""); + cmd.add("unpaired1", 0, "for PE input, if read1 passed QC but read2 not, it will be written to unpaired1. Default is to discard it.", false, ""); + cmd.add("unpaired2", 0, "for PE input, if read2 passed QC but read1 not, it will be written to unpaired2. Default is same as --unpaired1. If --unpaired2 is same as --umpaired1, both unpaired reads will be written this same file.", false, ""); cmd.add("merge", 'm', "for paired-end input, merge each pair of reads into a single read if they are overlapped. Disabled by default."); cmd.add("discard_unmerged", 0, "in the merging mode, discard the pairs of reads if they cannot be merged successfully. Disabled by default."); cmd.add("phred64", '6', "indicate the input is using phred64 scoring (it'll be converted to phred33, so the output will still be phred33)"); @@ -151,6 +153,11 @@ int main(int argc, char* argv[]){ opt.in2 = cmd.get("in2"); opt.out1 = cmd.get("out1"); opt.out2 = cmd.get("out2"); + opt.unpaired1 = cmd.get("unpaired1"); + opt.unpaired2 = cmd.get("unpaired2"); + // write to the same file + if(opt.unpaired2.empty()) + opt.unpaired2 = opt.unpaired1; opt.compression = cmd.get("compression"); opt.readsToProcess = cmd.get("reads_to_process"); opt.phred64 = cmd.exist("phred64"); diff --git a/src/options.cpp b/src/options.cpp index 1c6c8fe..9ffc496 100644 --- a/src/options.cpp +++ b/src/options.cpp @@ -126,6 +126,36 @@ bool Options::validate() { error_exit(out2 + " already exists and you have set to not rewrite output files by --dont_overwrite"); } } + if(!isPaired()) { + if(!unpaired1.empty()) { + cerr << "Not paired-end mode. Ignore argument --unpaired1 = " << unpaired1 << endl; + unpaired1 = ""; + } + if(!unpaired2.empty()) { + cerr << "Not paired-end mode. Ignore argument --unpaired2 = " << unpaired2 << endl; + unpaired2 = ""; + } + } + if(split.enabled) { + if(!unpaired1.empty()) { + cerr << "Outputing unpaired reads is not supported in splitting mode. Ignore argument --unpaired1 = " << unpaired1 << endl; + unpaired1 = ""; + } + if(!unpaired2.empty()) { + cerr << "Outputing unpaired reads is not supported in splitting mode. Ignore argument --unpaired2 = " << unpaired2 << endl; + unpaired2 = ""; + } + } + if(!unpaired1.empty()) { + if(dontOverwrite && file_exists(unpaired1)) { + error_exit(unpaired1 + " already exists and you have set to not rewrite output files by --dont_overwrite"); + } + } + if(!unpaired2.empty()) { + if(dontOverwrite && file_exists(unpaired2)) { + error_exit(unpaired2 + " already exists and you have set to not rewrite output files by --dont_overwrite"); + } + } if(dontOverwrite) { if(file_exists(jsonFile)) { diff --git a/src/options.h b/src/options.h index 913caa8..407e55e 100644 --- a/src/options.h +++ b/src/options.h @@ -289,8 +289,12 @@ class Options{ string in2; // file name of read1 output string out1; - // file name of read1 output + // file name of read2 output string out2; + // file name of unpaired read1 output + string unpaired1; + // file name of unpaired read2 output + string unpaired2; // json file string jsonFile; // html file diff --git a/src/peprocessor.cpp b/src/peprocessor.cpp index 7ec79df..a55dfd7 100644 --- a/src/peprocessor.cpp +++ b/src/peprocessor.cpp @@ -28,6 +28,8 @@ PairEndProcessor::PairEndProcessor(Options* opt){ memset(mInsertSizeHist, 0, sizeof(long)*isizeBufLen); mLeftWriter = NULL; mRightWriter = NULL; + mUnpairedLeftWriter = NULL; + mUnpairedRightWriter = NULL; mDuplicate = NULL; if(mOptions->duplicate.enabled) { @@ -44,6 +46,12 @@ PairEndProcessor::~PairEndProcessor() { } void PairEndProcessor::initOutput() { + if(!mOptions->unpaired1.empty()) + mUnpairedLeftWriter = new WriterThread(mOptions, mOptions->unpaired1); + + if(!mOptions->unpaired2.empty() && mOptions->unpaired2 != mOptions->unpaired1) + mUnpairedRightWriter = new WriterThread(mOptions, mOptions->unpaired2); + if(mOptions->out1.empty()) return; @@ -61,6 +69,14 @@ void PairEndProcessor::closeOutput() { delete mRightWriter; mRightWriter = NULL; } + if(mUnpairedLeftWriter) { + delete mUnpairedLeftWriter; + mLeftWriter = NULL; + } + if(mUnpairedRightWriter) { + delete mUnpairedRightWriter; + mRightWriter = NULL; + } } void PairEndProcessor::initConfig(ThreadConfig* config) { @@ -94,10 +110,16 @@ bool PairEndProcessor::process(){ std::thread* leftWriterThread = NULL; std::thread* rightWriterThread = NULL; + std::thread* unpairedLeftWriterThread = NULL; + std::thread* unpairedRightWriterThread = NULL; if(mLeftWriter) leftWriterThread = new std::thread(std::bind(&PairEndProcessor::writeTask, this, mLeftWriter)); if(mRightWriter) rightWriterThread = new std::thread(std::bind(&PairEndProcessor::writeTask, this, mRightWriter)); + if(mUnpairedLeftWriter) + unpairedLeftWriterThread = new std::thread(std::bind(&PairEndProcessor::writeTask, this, mUnpairedLeftWriter)); + if(mUnpairedRightWriter) + unpairedRightWriterThread = new std::thread(std::bind(&PairEndProcessor::writeTask, this, mUnpairedRightWriter)); producer.join(); for(int t=0; tthread; t++){ @@ -109,6 +131,10 @@ bool PairEndProcessor::process(){ leftWriterThread->join(); if(rightWriterThread) rightWriterThread->join(); + if(unpairedLeftWriterThread) + unpairedLeftWriterThread->join(); + if(unpairedRightWriterThread) + unpairedRightWriterThread->join(); } if(mOptions->verbose) @@ -220,6 +246,10 @@ bool PairEndProcessor::process(){ delete leftWriterThread; if(rightWriterThread) delete rightWriterThread; + if(unpairedLeftWriterThread) + delete unpairedLeftWriterThread; + if(unpairedRightWriterThread) + delete unpairedRightWriterThread; if(!mOptions->split.enabled) closeOutput(); @@ -242,6 +272,8 @@ int PairEndProcessor::getPeakInsertSize() { bool PairEndProcessor::processPairEnd(ReadPairPack* pack, ThreadConfig* config){ string outstr1; string outstr2; + string unpairedOut1; + string unpairedOut2; string singleOutput; int readPassed = 0; int mergedCount = 0; @@ -376,6 +408,14 @@ bool PairEndProcessor::processPairEnd(ReadPairPack* pack, ThreadConfig* config){ config->getPostStats2()->statRead(r2); readPassed++; + } else if( r1 != NULL && result1 == PASS_FILTER) { + if(mUnpairedLeftWriter) { + unpairedOut1 += r1->toString(); + } + } else if( r2 != NULL && result2 == PASS_FILTER) { + if(mUnpairedLeftWriter || mUnpairedRightWriter) { + unpairedOut2 += r2->toString(); + } } } @@ -416,7 +456,24 @@ bool PairEndProcessor::processPairEnd(ReadPairPack* pack, ThreadConfig* config){ memcpy(ldata, singleOutput.c_str(), singleOutput.size()); mLeftWriter->input(ldata, singleOutput.size()); } + // output unpaired reads + if(mUnpairedLeftWriter && mUnpairedRightWriter) { + // write PE + char* unpairedData1 = new char[unpairedOut1.size()]; + memcpy(unpairedData1, unpairedOut1.c_str(), unpairedOut1.size()); + mUnpairedLeftWriter->input(unpairedData1, unpairedOut1.size()); + + char* unpairedData2 = new char[unpairedOut2.size()]; + memcpy(unpairedData2, unpairedOut2.c_str(), unpairedOut2.size()); + mUnpairedRightWriter->input(unpairedData2, unpairedOut2.size()); + } else if(mUnpairedLeftWriter) { + char* unpairedData = new char[unpairedOut1.size() + unpairedOut2.size() ]; + memcpy(unpairedData, unpairedOut1.c_str(), unpairedOut1.size()); + memcpy(unpairedData + unpairedOut1.size(), unpairedOut2.c_str(), unpairedOut2.size()); + mUnpairedLeftWriter->input(unpairedData, unpairedOut1.size() + unpairedOut2.size()); + } } + if(!mOptions->split.enabled) mOutputMtx.unlock(); @@ -653,6 +710,10 @@ void PairEndProcessor::consumerTask(ThreadConfig* config) mLeftWriter->setInputCompleted(); if(mRightWriter) mRightWriter->setInputCompleted(); + if(mUnpairedLeftWriter) + mUnpairedLeftWriter->setInputCompleted(); + if(mUnpairedRightWriter) + mUnpairedRightWriter->setInputCompleted(); } if(mOptions->verbose) { diff --git a/src/peprocessor.h b/src/peprocessor.h index 1d64e4d..1468ba2 100644 --- a/src/peprocessor.h +++ b/src/peprocessor.h @@ -77,6 +77,8 @@ class PairEndProcessor{ long* mInsertSizeHist; WriterThread* mLeftWriter; WriterThread* mRightWriter; + WriterThread* mUnpairedLeftWriter; + WriterThread* mUnpairedRightWriter; Duplicate* mDuplicate; }; From 0b5dd3e06cbc3fbe126f9da4ca5bc400b186014b Mon Sep 17 00:00:00 2001 From: Shifu Chen Date: Sat, 6 Apr 2019 23:01:55 +0800 Subject: [PATCH 05/19] revise merging and support outputing unmerged reads --- src/htmlreporter.cpp | 7 ++- src/jsonreporter.cpp | 13 ++++-- src/main.cpp | 17 +++++-- src/options.cpp | 64 ++++++++++++++++---------- src/options.h | 5 +- src/peprocessor.cpp | 106 ++++++++++++++++++++++++++++++------------- src/peprocessor.h | 1 + 7 files changed, 145 insertions(+), 68 deletions(-) diff --git a/src/htmlreporter.cpp b/src/htmlreporter.cpp index 9c35576..820e26a 100644 --- a/src/htmlreporter.cpp +++ b/src/htmlreporter.cpp @@ -351,8 +351,11 @@ void HtmlReporter::report(FilterResult* result, Stats* preStats1, Stats* postSta ofs << "\n"; ofs << "
\n"; - if(postStats1) { - postStats1 -> reportHtml(ofs, "After filtering", "read1"); + if(postStats1) { + string name = "read1"; + if(mOptions->merge.enabled) + name = "merged"; + postStats1 -> reportHtml(ofs, "After filtering", name); } if(postStats2 && !mOptions->merge.enabled) { diff --git a/src/jsonreporter.cpp b/src/jsonreporter.cpp index 9e2377c..829acc7 100644 --- a/src/jsonreporter.cpp +++ b/src/jsonreporter.cpp @@ -150,16 +150,19 @@ void JsonReporter::report(FilterResult* result, Stats* preStats1, Stats* postSta preStats1 -> reportJson(ofs, "\t"); } - if(postStats1) { - ofs << "\t" << "\"read1_after_filtering\": " ; - postStats1 -> reportJson(ofs, "\t"); - } - if(preStats2) { ofs << "\t" << "\"read2_before_filtering\": " ; preStats2 -> reportJson(ofs, "\t"); } + if(postStats1) { + string name = "read1_after_filtering"; + if(mOptions->merge.enabled) + name = "merged_and_filtered"; + ofs << "\t" << "\"" << name << "\": " ; + postStats1 -> reportJson(ofs, "\t"); + } + if(postStats2 && !mOptions->merge.enabled) { ofs << "\t" << "\"read2_after_filtering\": " ; postStats2 -> reportJson(ofs, "\t"); diff --git a/src/main.cpp b/src/main.cpp index 6ed42b0..31401a4 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -36,13 +36,14 @@ int main(int argc, char* argv[]){ cmd.add("in2", 'I', "read2 input file name", false, ""); cmd.add("out2", 'O', "read2 output file name", false, ""); cmd.add("unpaired1", 0, "for PE input, if read1 passed QC but read2 not, it will be written to unpaired1. Default is to discard it.", false, ""); - cmd.add("unpaired2", 0, "for PE input, if read2 passed QC but read1 not, it will be written to unpaired2. Default is same as --unpaired1. If --unpaired2 is same as --umpaired1, both unpaired reads will be written this same file.", false, ""); - cmd.add("merge", 'm', "for paired-end input, merge each pair of reads into a single read if they are overlapped. Disabled by default."); - cmd.add("discard_unmerged", 0, "in the merging mode, discard the pairs of reads if they cannot be merged successfully. Disabled by default."); + cmd.add("unpaired2", 0, "for PE input, if read2 passed QC but read1 not, it will be written to unpaired2. If --unpaired2 is same as --umpaired1 (default mode), both unpaired reads will be written to this same file.", false, ""); + cmd.add("merge", 'm', "for paired-end input, merge each pair of reads into a single read if they are overlapped. The merged reads will be written to the file given by --merged_out, the unmerged reads will be written to the files specified by --out1 and --out2. The merging mode is disabled by default."); + cmd.add("merged_out", 0, "in the merging mode, specify the file name to store merged output, or specify --stdout to stream the merged output", false, ""); + cmd.add("include_unmerged", 0, "in the merging mode, write the unmerged or unpaired reads to the file specified by --merge. Disabled by default."); cmd.add("phred64", '6', "indicate the input is using phred64 scoring (it'll be converted to phred33, so the output will still be phred33)"); cmd.add("compression", 'z', "compression level for gzip output (1 ~ 9). 1 is fastest, 9 is smallest, default is 4.", false, 4); cmd.add("stdin", 0, "input from STDIN. If the STDIN is interleaved paired-end FASTQ, please also add --interleaved_in."); - cmd.add("stdout", 0, "stream passing-filters reads to STDOUT. This option will result in interleaved FASTQ output for paired-end input. Disabled by default."); + cmd.add("stdout", 0, "stream passing-filters reads to STDOUT. This option will result in interleaved FASTQ output for paired-end output. Disabled by default."); cmd.add("interleaved_in", 0, "indicate that is an interleaved FASTQ which contains both read1 and read2. Disabled by default."); cmd.add("reads_to_process", 0, "specify how many reads/pairs to be processed. Default 0 means process all reads.", false, 0); cmd.add("dont_overwrite", 0, "don't overwrite existing files. Overwritting is allowed by default."); @@ -138,6 +139,7 @@ int main(int argc, char* argv[]){ cmd.add("cut_by_quality5", 0, "DEPRECATED, use --cut_front instead."); cmd.add("cut_by_quality3", 0, "DEPRECATED, use --cut_tail instead."); cmd.add("cut_by_quality_aggressive", 0, "DEPRECATED, use --cut_right instead."); + cmd.add("discard_unmerged", 0, "DEPRECATED, no effect now, see the introduction for merging."); cmd.parse_check(argc, argv); @@ -146,6 +148,10 @@ int main(int argc, char* argv[]){ return 0; } + if(cmd.exist("discard_unmerged")) { + cerr << "DEPRECATED: --discard_unmerged has no effect now, see the introduction for merging." << endl; + } + Options opt; // I/O @@ -169,7 +175,8 @@ int main(int argc, char* argv[]){ // merge PE opt.merge.enabled = cmd.exist("merge"); - opt.merge.discardUnmerged = cmd.exist("discard_unmerged"); + opt.merge.out = cmd.get("merged_out"); + opt.merge.includeUnmerged = cmd.exist("include_unmerged"); // adapter cutting opt.adapter.enabled = !cmd.exist("disable_adapter_trimming"); diff --git a/src/options.cpp b/src/options.cpp index 9ffc496..da2cdcd 100644 --- a/src/options.cpp +++ b/src/options.cpp @@ -64,32 +64,50 @@ bool Options::validate() { if(in2.empty() && !interleavedInput) { error_exit("read2 input should be specified by --in2 for merging mode"); } - if(!out2.empty()) { - cerr << "Merging mode. Ignore argument --out2 = " << out2 << endl; - out2 = ""; - } // enable correction if it's not enabled if(!correction.enabled) correction.enabled = true; + if(merge.out.empty() && !outputToSTDOUT && !out1.empty() && out2.empty()) { + cerr << "You specified --out1, but haven't specified --merged_out in merging mode. Using --out1 to store the merging output to be compatible with fastp 0.19.8" << endl << endl; + merge.out = out1; + out1 = ""; + } + if(merge.includeUnmerged) { + if(!out1.empty()) { + cerr << "You specified --include_unmerged in merging mode. Ignoring argument --out1 = " << out1 << endl; + out1 = ""; + } + if(!out2.empty()) { + cerr << "You specified --include_unmerged in merging mode. Ignoring argument --out2 = " << out2 << endl; + out2 = ""; + } + if(!unpaired1.empty()) { + cerr << "You specified --include_unmerged in merging mode. Ignoring argument --unpaired1 = " << unpaired1 << endl; + unpaired1 = ""; + } + if(!unpaired2.empty()) { + cerr << "You specified --include_unmerged in merging mode. Ignoring argument --unpaired1 = " << unpaired2 << endl; + unpaired2 = ""; + } + } + if(merge.out.empty() && !outputToSTDOUT) { + error_exit("In merging mode, you should either specify --merged_out or enable --stdout"); + } } // if output to STDOUT, then... if(outputToSTDOUT) { - cerr << "Streaming uncompressed output to STDOUT..." << endl; - if(!in1.empty() && !in2.empty()) - cerr << "Enable interleaved output mode for paired-end input." << endl; - if(!out1.empty()) { - cerr << "Ignore argument --out1 = " << out1 << endl; - out1 = ""; - } - if(!out2.empty()) { - cerr << "Ignore argument --out2 = " << out2 << endl; - out2 = ""; - } if(split.enabled) { - cerr << "Ignore split mode" << endl; - split.enabled = false; + error_exit("splitting mode cannot work with stdout mode"); } + cerr << "Streaming uncompressed "; + if(merge.enabled) + cerr << "merged"; + else if(isPaired()) + cerr << "interleaved"; + cerr << " reads to STDOUT..." << endl; + if(isPaired() && !merge.enabled) + cerr << "Enable interleaved output mode for paired-end input." << endl; cerr << endl; } @@ -98,7 +116,7 @@ bool Options::validate() { } if(!in2.empty() || interleavedInput) { - if(!out1.empty() && out2.empty() && !merge.enabled) { + if(!out1.empty() && out2.empty()) { error_exit("paired-end input, read1 output should be specified together with read2 output (--out2 needed) "); } if(out1.empty() && !out2.empty()) { @@ -128,21 +146,21 @@ bool Options::validate() { } if(!isPaired()) { if(!unpaired1.empty()) { - cerr << "Not paired-end mode. Ignore argument --unpaired1 = " << unpaired1 << endl; + cerr << "Not paired-end mode. Ignoring argument --unpaired1 = " << unpaired1 << endl; unpaired1 = ""; } if(!unpaired2.empty()) { - cerr << "Not paired-end mode. Ignore argument --unpaired2 = " << unpaired2 << endl; + cerr << "Not paired-end mode. Ignoring argument --unpaired2 = " << unpaired2 << endl; unpaired2 = ""; } } if(split.enabled) { if(!unpaired1.empty()) { - cerr << "Outputing unpaired reads is not supported in splitting mode. Ignore argument --unpaired1 = " << unpaired1 << endl; + cerr << "Outputing unpaired reads is not supported in splitting mode. Ignoring argument --unpaired1 = " << unpaired1 << endl; unpaired1 = ""; } if(!unpaired2.empty()) { - cerr << "Outputing unpaired reads is not supported in splitting mode. Ignore argument --unpaired2 = " << unpaired2 << endl; + cerr << "Outputing unpaired reads is not supported in splitting mode. Ignoring argument --unpaired2 = " << unpaired2 << endl; unpaired2 = ""; } } @@ -273,7 +291,7 @@ bool Options::validate() { } if(correction.enabled && !isPaired()) { - cerr << "WARNING: base correction is only appliable for paired end data, ignored -c/--correction" << endl; + cerr << "WARNING: base correction is only appliable for paired end data, ignoring -c/--correction" << endl; correction.enabled = false; } diff --git a/src/options.h b/src/options.h index 407e55e..bb6a799 100644 --- a/src/options.h +++ b/src/options.h @@ -21,11 +21,12 @@ class MergeOptions { public: MergeOptions() { enabled = false; - discardUnmerged = false; + includeUnmerged = false; } public: bool enabled; - bool discardUnmerged; + bool includeUnmerged; + string out; }; class DuplicationOptions { diff --git a/src/peprocessor.cpp b/src/peprocessor.cpp index a55dfd7..9a51a21 100644 --- a/src/peprocessor.cpp +++ b/src/peprocessor.cpp @@ -30,6 +30,7 @@ PairEndProcessor::PairEndProcessor(Options* opt){ mRightWriter = NULL; mUnpairedLeftWriter = NULL; mUnpairedRightWriter = NULL; + mMergedWriter = NULL; mDuplicate = NULL; if(mOptions->duplicate.enabled) { @@ -52,6 +53,11 @@ void PairEndProcessor::initOutput() { if(!mOptions->unpaired2.empty() && mOptions->unpaired2 != mOptions->unpaired1) mUnpairedRightWriter = new WriterThread(mOptions, mOptions->unpaired2); + if(mOptions->merge.enabled) { + if(!mOptions->merge.out.empty()) + mMergedWriter = new WriterThread(mOptions, mOptions->merge.out); + } + if(mOptions->out1.empty()) return; @@ -69,6 +75,10 @@ void PairEndProcessor::closeOutput() { delete mRightWriter; mRightWriter = NULL; } + if(mMergedWriter) { + delete mMergedWriter; + mMergedWriter = NULL; + } if(mUnpairedLeftWriter) { delete mUnpairedLeftWriter; mLeftWriter = NULL; @@ -112,6 +122,7 @@ bool PairEndProcessor::process(){ std::thread* rightWriterThread = NULL; std::thread* unpairedLeftWriterThread = NULL; std::thread* unpairedRightWriterThread = NULL; + std::thread* mergedWriterThread = NULL; if(mLeftWriter) leftWriterThread = new std::thread(std::bind(&PairEndProcessor::writeTask, this, mLeftWriter)); if(mRightWriter) @@ -120,6 +131,8 @@ bool PairEndProcessor::process(){ unpairedLeftWriterThread = new std::thread(std::bind(&PairEndProcessor::writeTask, this, mUnpairedLeftWriter)); if(mUnpairedRightWriter) unpairedRightWriterThread = new std::thread(std::bind(&PairEndProcessor::writeTask, this, mUnpairedRightWriter)); + if(mMergedWriter) + mergedWriterThread = new std::thread(std::bind(&PairEndProcessor::writeTask, this, mMergedWriter)); producer.join(); for(int t=0; tthread; t++){ @@ -135,6 +148,8 @@ bool PairEndProcessor::process(){ unpairedLeftWriterThread->join(); if(unpairedRightWriterThread) unpairedRightWriterThread->join(); + if(mergedWriterThread) + mergedWriterThread->join(); } if(mOptions->verbose) @@ -162,15 +177,18 @@ bool PairEndProcessor::process(){ cerr << "Read1 before filtering:"<print(); cerr << endl; - cerr << "Read1 after filtering:"<print(); - cerr << endl; cerr << "Read2 before filtering:"<print(); + cerr << endl; if(!mOptions->merge.enabled) { + cerr << "Read1 after filtering:"<print(); cerr << endl; cerr << "Read2 aftering filtering:"<print(); + } else { + cerr << "Merged and filtered:"<print(); } cerr << endl; @@ -250,6 +268,8 @@ bool PairEndProcessor::process(){ delete unpairedLeftWriterThread; if(unpairedRightWriterThread) delete unpairedRightWriterThread; + if(mergedWriterThread) + delete mergedWriterThread; if(!mOptions->split.enabled) closeOutput(); @@ -275,6 +295,7 @@ bool PairEndProcessor::processPairEnd(ReadPairPack* pack, ThreadConfig* config){ string unpairedOut1; string unpairedOut2; string singleOutput; + string mergedOutput; int readPassed = 0; int mergedCount = 0; for(int p=0;pcount;p++){ @@ -357,6 +378,7 @@ bool PairEndProcessor::processPairEnd(ReadPairPack* pack, ThreadConfig* config){ Read* merged = NULL; // merging mode + bool mergeProcessed = false; if(mOptions->merge.enabled && r1 && r2) { OverlapResult ov = OverlapAnalysis::analyze(r1, r2, mOptions->overlapDiffLimit, mOptions->overlapRequire); if(ov.overlapped) { @@ -364,30 +386,34 @@ bool PairEndProcessor::processPairEnd(ReadPairPack* pack, ThreadConfig* config){ int result = mFilter->passFilter(merged); config->addFilterResult(result, 2); if(result == PASS_FILTER) { - singleOutput += merged->toString(); + mergedOutput += merged->toString(); config->getPostStats1()->statRead(merged); readPassed++; mergedCount++; } delete merged; - } else if(!mOptions->merge.discardUnmerged){ + mergeProcessed = true; + } else if(mOptions->merge.includeUnmerged){ int result1 = mFilter->passFilter(r1); config->addFilterResult(result1, 1); if(result1 == PASS_FILTER) { - singleOutput += r1->toString(); + mergedOutput += r1->toString(); config->getPostStats1()->statRead(r1); } int result2 = mFilter->passFilter(r2); config->addFilterResult(result2, 1); if(result2 == PASS_FILTER) { - singleOutput += r2->toString(); + mergedOutput += r2->toString(); config->getPostStats1()->statRead(r2); } if(result1 == PASS_FILTER && result2 == PASS_FILTER ) readPassed++; + mergeProcessed = true; } - } else { + } + + if(!mergeProcessed) { int result1 = mFilter->passFilter(r1); int result2 = mFilter->passFilter(r2); @@ -396,7 +422,7 @@ bool PairEndProcessor::processPairEnd(ReadPairPack* pack, ThreadConfig* config){ if( r1 != NULL && result1 == PASS_FILTER && r2 != NULL && result2 == PASS_FILTER ) { - if(mOptions->outputToSTDOUT) { + if(mOptions->outputToSTDOUT && !mOptions->merge.enabled) { singleOutput += r1->toString() + r2->toString(); } else { outstr1 += r1->toString(); @@ -404,8 +430,10 @@ bool PairEndProcessor::processPairEnd(ReadPairPack* pack, ThreadConfig* config){ } // stats the read after filtering - config->getPostStats1()->statRead(r1); - config->getPostStats2()->statRead(r2); + if(!mOptions->merge.enabled) { + config->getPostStats1()->statRead(r1); + config->getPostStats2()->statRead(r2); + } readPassed++; } else if( r1 != NULL && result1 == PASS_FILTER) { @@ -428,35 +456,49 @@ bool PairEndProcessor::processPairEnd(ReadPairPack* pack, ThreadConfig* config){ delete r2; } // if splitting output, then no lock is need since different threads write different files - if(!mOptions->split.enabled) + if(!mOptions->split.enabled) mOutputMtx.lock(); if(mOptions->outputToSTDOUT) { // STDOUT output - fwrite(singleOutput.c_str(), 1, singleOutput.length(), stdout); + // if it's merging mode, write the merged reads to STDOUT + // otherwise write interleaved single output + if(mOptions->merge.enabled) + fwrite(mergedOutput.c_str(), 1, mergedOutput.length(), stdout); + else + fwrite(singleOutput.c_str(), 1, singleOutput.length(), stdout); } else if(mOptions->split.enabled) { // split output by each worker thread if(!mOptions->out1.empty()) config->getWriter1()->writeString(outstr1); if(!mOptions->out2.empty()) config->getWriter2()->writeString(outstr2); - } else { - // normal output by left/right writer thread - if(mRightWriter && mLeftWriter) { - // write PE - char* ldata = new char[outstr1.size()]; - memcpy(ldata, outstr1.c_str(), outstr1.size()); - mLeftWriter->input(ldata, outstr1.size()); - - char* rdata = new char[outstr2.size()]; - memcpy(rdata, outstr2.c_str(), outstr2.size()); - mRightWriter->input(rdata, outstr2.size()); - } else if(mLeftWriter) { - // write singleOutput - char* ldata = new char[singleOutput.size()]; - memcpy(ldata, singleOutput.c_str(), singleOutput.size()); - mLeftWriter->input(ldata, singleOutput.size()); - } - // output unpaired reads + } + + if(mMergedWriter && !mergedOutput.empty()) { + // write merged data + char* ldata = new char[mergedOutput.size()]; + memcpy(ldata, mergedOutput.c_str(), mergedOutput.size()); + mMergedWriter->input(ldata, mergedOutput.size()); + } + + // normal output by left/right writer thread + if(mRightWriter && mLeftWriter && (!outstr1.empty() || !outstr2.empty())) { + // write PE + char* ldata = new char[outstr1.size()]; + memcpy(ldata, outstr1.c_str(), outstr1.size()); + mLeftWriter->input(ldata, outstr1.size()); + + char* rdata = new char[outstr2.size()]; + memcpy(rdata, outstr2.c_str(), outstr2.size()); + mRightWriter->input(rdata, outstr2.size()); + } else if(mLeftWriter && !singleOutput.empty()) { + // write singleOutput + char* ldata = new char[singleOutput.size()]; + memcpy(ldata, singleOutput.c_str(), singleOutput.size()); + mLeftWriter->input(ldata, singleOutput.size()); + } + // output unpaired reads + if (!unpairedOut1.empty() || !unpairedOut2.empty()) { if(mUnpairedLeftWriter && mUnpairedRightWriter) { // write PE char* unpairedData1 = new char[unpairedOut1.size()]; @@ -714,6 +756,8 @@ void PairEndProcessor::consumerTask(ThreadConfig* config) mUnpairedLeftWriter->setInputCompleted(); if(mUnpairedRightWriter) mUnpairedRightWriter->setInputCompleted(); + if(mMergedWriter) + mMergedWriter->setInputCompleted(); } if(mOptions->verbose) { diff --git a/src/peprocessor.h b/src/peprocessor.h index 1468ba2..29e89ee 100644 --- a/src/peprocessor.h +++ b/src/peprocessor.h @@ -79,6 +79,7 @@ class PairEndProcessor{ WriterThread* mRightWriter; WriterThread* mUnpairedLeftWriter; WriterThread* mUnpairedRightWriter; + WriterThread* mMergedWriter; Duplicate* mDuplicate; }; From 969fc3382a63bb96e61c9fc51bbabb2fbde4b5c2 Mon Sep 17 00:00:00 2001 From: Shifu Chen Date: Sun, 7 Apr 2019 07:22:37 +0800 Subject: [PATCH 06/19] update README for merging introduction --- README.md | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 3b45c76..7ade784 100644 --- a/README.md +++ b/README.md @@ -252,14 +252,18 @@ By default, fastp uses 1/20 reads for sequence counting, and you can change this `fastp` not only gives the counts of overrepresented sequence, but also gives the information that how they distribute over cycles. A figure is provided for each detected overrepresented sequence, from which you can know where this sequence is mostly found. # merge paired-end reads -For paired-end (PE) input, fastp supports stiching them by specifying the `-m/--merge` option. In this `merging` mode, the output will be a single file. +For paired-end (PE) input, fastp supports stiching them by specifying the `-m/--merge` option. In this `merging` mode: + +* `--merged_out` shouuld be given to specify the file to store merged reads, otherwise you should enable `--stdout` to stream the merged reads to STDOUT. The merged reads are also filtered. +* `--out1` and `--out2` will be the reads that cannot be merged successfully, but both pass all the filters. +* `--unpaired1` will be the reads that cannot be merged, `read1` passes filters but `read2` doesn't. +* `--unpaired2` will be the reads that cannot be merged, `read2` passes filters but `read1` doesn't. +* `--include_unmerged` can be enabled to make reads of `--out1`, `--out2`, `--unpaired1` and `--unpaired2` redirected to `--merged_out`. So you will get a single output file. This option is disabled by default. In the output file, a tag like `merged_xxx_yyy`will be added to each read name to indicate that how many base pairs are from read1 and from read2, respectively. For example, ` @NB551106:9:H5Y5GBGX2:1:22306:18653:13119 1:N:0:GATCAG merged_150_15` means that 150bp are from read1, and 15bp are from read2. `fastp` prefers the bases in read1 since they usually have higher quality than read2. -For the pairs of reads that cannot be merged successfully, they will be both included in the output by default. But you can specify the `--discard_unmerged` option to discard the unmerged reads. - Same as the [base correction feature](#base-correction-for-pe-data), this function is also based on overlapping detection, which has adjustable parameters `overlap_len_require (default 30)` and `overlap_diff_limit (default 5)`. # all options @@ -270,9 +274,12 @@ options: -i, --in1 read1 input file name (string) -o, --out1 read1 output file name (string [=]) -I, --in2 read2 input file name (string [=]) - -O, --out2 read2 output file name (string [=]) - -m, --merge for paired-end input, merge each pair of reads into a single read if they are overlapped. Disabled by default. - --discard_unmerged in the merging mode, discard the pairs of reads if they cannot be merged successfully. Disabled by default. + -O, --out2 read2 output file name (string [=]) + --unpaired1 for PE input, if read1 passed QC but read2 not, it will be written to unpaired1. Default is to discard it. (string [=]) + --unpaired2 for PE input, if read2 passed QC but read1 not, it will be written to unpaired2. If --unpaired2 is same as --umpaired1 (default mode), both unpaired reads will be written to this same file. (string [=]) + -m, --merge for paired-end input, merge each pair of reads into a single read if they are overlapped. The merged reads will be written to the file given by --merged_out, the unmerged reads will be written to the files specified by --out1 and --out2. The merging mode is disabled by default. + --merged_out in the merging mode, specify the file name to store merged output, or specify --stdout to stream the merged output (string [=]) + --include_unmerged in the merging mode, write the unmerged or unpaired reads to the file specified by --merge. Disabled by default. -6, --phred64 indicate the input is using phred64 scoring (it'll be converted to phred33, so the output will still be phred33) -z, --compression compression level for gzip output (1 ~ 9). 1 is fastest, 9 is smallest, default is 4. (int [=4]) --stdin input from STDIN. If the STDIN is interleaved paired-end FASTQ, please also add --interleaved_in. From fcd97313f5db4e3dccba52fc8ee74c631203e8fd Mon Sep 17 00:00:00 2001 From: Shifu Chen Date: Sun, 7 Apr 2019 15:27:08 +0800 Subject: [PATCH 07/19] support outputing failed reads --- src/common.h | 11 +++++++++++ src/main.cpp | 2 ++ src/options.cpp | 7 ++++++- src/options.h | 2 ++ src/peprocessor.cpp | 45 +++++++++++++++++++++++++++++++++++++++++--- src/peprocessor.h | 1 + src/read.cpp | 4 ++++ src/read.h | 1 + src/seprocessor.cpp | 36 +++++++++++++++++++++++++++++------ src/seprocessor.h | 1 + testdata/R1.fq.gz | Bin 319 -> 0 bytes testdata/R2.fq.gz | Bin 345 -> 0 bytes 12 files changed, 100 insertions(+), 10 deletions(-) delete mode 100644 testdata/R1.fq.gz delete mode 100644 testdata/R2.fq.gz diff --git a/src/common.h b/src/common.h index 3575309..22b2e4b 100644 --- a/src/common.h +++ b/src/common.h @@ -52,5 +52,16 @@ static const int FAIL_COMPLEXITY = 24; // how many types in total we support static const int FILTER_RESULT_TYPES = 32; +const static char* FAILED_TYPES[FILTER_RESULT_TYPES] = { + "passed", "", "", "", + "failed_polyx_filter", "", "", "", + "failed_bad_overlap", "", "", "", + "failed_too_many_n_bases", "", "", "", + "failed_too_short", "failed_too_long", "", "", + "failed_quality_filter", "", "", "", + "failed_low_complexity", "", "", "", + "", "", "", "" +}; + #endif /* COMMON_H */ diff --git a/src/main.cpp b/src/main.cpp index 31401a4..72fca96 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -37,6 +37,7 @@ int main(int argc, char* argv[]){ cmd.add("out2", 'O', "read2 output file name", false, ""); cmd.add("unpaired1", 0, "for PE input, if read1 passed QC but read2 not, it will be written to unpaired1. Default is to discard it.", false, ""); cmd.add("unpaired2", 0, "for PE input, if read2 passed QC but read1 not, it will be written to unpaired2. If --unpaired2 is same as --umpaired1 (default mode), both unpaired reads will be written to this same file.", false, ""); + cmd.add("failed_out", 0, "specify the file to store reads that cannot pass the filters.", false, ""); cmd.add("merge", 'm', "for paired-end input, merge each pair of reads into a single read if they are overlapped. The merged reads will be written to the file given by --merged_out, the unmerged reads will be written to the files specified by --out1 and --out2. The merging mode is disabled by default."); cmd.add("merged_out", 0, "in the merging mode, specify the file name to store merged output, or specify --stdout to stream the merged output", false, ""); cmd.add("include_unmerged", 0, "in the merging mode, write the unmerged or unpaired reads to the file specified by --merge. Disabled by default."); @@ -161,6 +162,7 @@ int main(int argc, char* argv[]){ opt.out2 = cmd.get("out2"); opt.unpaired1 = cmd.get("unpaired1"); opt.unpaired2 = cmd.get("unpaired2"); + opt.failedOut = cmd.get("failed_out"); // write to the same file if(opt.unpaired2.empty()) opt.unpaired2 = opt.unpaired1; diff --git a/src/options.cpp b/src/options.cpp index da2cdcd..73bf2d6 100644 --- a/src/options.cpp +++ b/src/options.cpp @@ -68,7 +68,7 @@ bool Options::validate() { if(!correction.enabled) correction.enabled = true; if(merge.out.empty() && !outputToSTDOUT && !out1.empty() && out2.empty()) { - cerr << "You specified --out1, but haven't specified --merged_out in merging mode. Using --out1 to store the merging output to be compatible with fastp 0.19.8" << endl << endl; + cerr << "You specified --out1, but haven't specified --merged_out in merging mode. Using --out1 to store the merged reads to be compatible with fastp 0.19.8" << endl << endl; merge.out = out1; out1 = ""; } @@ -174,6 +174,11 @@ bool Options::validate() { error_exit(unpaired2 + " already exists and you have set to not rewrite output files by --dont_overwrite"); } } + if(!failedOut.empty()) { + if(dontOverwrite && file_exists(failedOut)) { + error_exit(failedOut + " already exists and you have set to not rewrite output files by --dont_overwrite"); + } + } if(dontOverwrite) { if(file_exists(jsonFile)) { diff --git a/src/options.h b/src/options.h index bb6a799..35d64a5 100644 --- a/src/options.h +++ b/src/options.h @@ -296,6 +296,8 @@ class Options{ string unpaired1; // file name of unpaired read2 output string unpaired2; + // file name of failed reads output + string failedOut; // json file string jsonFile; // html file diff --git a/src/peprocessor.cpp b/src/peprocessor.cpp index 9a51a21..86f5cc4 100644 --- a/src/peprocessor.cpp +++ b/src/peprocessor.cpp @@ -31,6 +31,7 @@ PairEndProcessor::PairEndProcessor(Options* opt){ mUnpairedLeftWriter = NULL; mUnpairedRightWriter = NULL; mMergedWriter = NULL; + mFailedWriter = NULL; mDuplicate = NULL; if(mOptions->duplicate.enabled) { @@ -58,6 +59,9 @@ void PairEndProcessor::initOutput() { mMergedWriter = new WriterThread(mOptions, mOptions->merge.out); } + if(!mOptions->failedOut.empty()) + mFailedWriter = new WriterThread(mOptions, mOptions->failedOut); + if(mOptions->out1.empty()) return; @@ -79,6 +83,10 @@ void PairEndProcessor::closeOutput() { delete mMergedWriter; mMergedWriter = NULL; } + if(mFailedWriter) { + delete mFailedWriter; + mFailedWriter = NULL; + } if(mUnpairedLeftWriter) { delete mUnpairedLeftWriter; mLeftWriter = NULL; @@ -123,6 +131,7 @@ bool PairEndProcessor::process(){ std::thread* unpairedLeftWriterThread = NULL; std::thread* unpairedRightWriterThread = NULL; std::thread* mergedWriterThread = NULL; + std::thread* failedWriterThread = NULL; if(mLeftWriter) leftWriterThread = new std::thread(std::bind(&PairEndProcessor::writeTask, this, mLeftWriter)); if(mRightWriter) @@ -133,6 +142,8 @@ bool PairEndProcessor::process(){ unpairedRightWriterThread = new std::thread(std::bind(&PairEndProcessor::writeTask, this, mUnpairedRightWriter)); if(mMergedWriter) mergedWriterThread = new std::thread(std::bind(&PairEndProcessor::writeTask, this, mMergedWriter)); + if(mFailedWriter) + failedWriterThread = new std::thread(std::bind(&PairEndProcessor::writeTask, this, mFailedWriter)); producer.join(); for(int t=0; tthread; t++){ @@ -150,6 +161,8 @@ bool PairEndProcessor::process(){ unpairedRightWriterThread->join(); if(mergedWriterThread) mergedWriterThread->join(); + if(failedWriterThread) + failedWriterThread->join(); } if(mOptions->verbose) @@ -270,6 +283,8 @@ bool PairEndProcessor::process(){ delete unpairedRightWriterThread; if(mergedWriterThread) delete mergedWriterThread; + if(failedWriterThread) + delete failedWriterThread; if(!mOptions->split.enabled) closeOutput(); @@ -296,6 +311,7 @@ bool PairEndProcessor::processPairEnd(ReadPairPack* pack, ThreadConfig* config){ string unpairedOut2; string singleOutput; string mergedOutput; + string failedOut; int readPassed = 0; int mergedCount = 0; for(int p=0;pcount;p++){ @@ -439,10 +455,24 @@ bool PairEndProcessor::processPairEnd(ReadPairPack* pack, ThreadConfig* config){ } else if( r1 != NULL && result1 == PASS_FILTER) { if(mUnpairedLeftWriter) { unpairedOut1 += r1->toString(); + if(mFailedWriter) + failedOut += or2->toStringWithTag(FAILED_TYPES[result2]); + } else { + if(mFailedWriter) { + failedOut += or1->toStringWithTag("paired_read_is_failing"); + failedOut += or2->toStringWithTag(FAILED_TYPES[result2]); + } } } else if( r2 != NULL && result2 == PASS_FILTER) { if(mUnpairedLeftWriter || mUnpairedRightWriter) { unpairedOut2 += r2->toString(); + if(mFailedWriter) + failedOut += or1->toStringWithTag(FAILED_TYPES[result1]); + } else { + if(mFailedWriter) { + failedOut += or1->toStringWithTag(FAILED_TYPES[result1]); + failedOut += or2->toStringWithTag("paired_read_is_failing"); + } } } } @@ -476,9 +506,16 @@ bool PairEndProcessor::processPairEnd(ReadPairPack* pack, ThreadConfig* config){ if(mMergedWriter && !mergedOutput.empty()) { // write merged data - char* ldata = new char[mergedOutput.size()]; - memcpy(ldata, mergedOutput.c_str(), mergedOutput.size()); - mMergedWriter->input(ldata, mergedOutput.size()); + char* mdata = new char[mergedOutput.size()]; + memcpy(mdata, mergedOutput.c_str(), mergedOutput.size()); + mMergedWriter->input(mdata, mergedOutput.size()); + } + + if(mFailedWriter && !failedOut.empty()) { + // write failed data + char* fdata = new char[failedOut.size()]; + memcpy(fdata, failedOut.c_str(), failedOut.size()); + mFailedWriter->input(fdata, failedOut.size()); } // normal output by left/right writer thread @@ -758,6 +795,8 @@ void PairEndProcessor::consumerTask(ThreadConfig* config) mUnpairedRightWriter->setInputCompleted(); if(mMergedWriter) mMergedWriter->setInputCompleted(); + if(mFailedWriter) + mFailedWriter->setInputCompleted(); } if(mOptions->verbose) { diff --git a/src/peprocessor.h b/src/peprocessor.h index 29e89ee..b646989 100644 --- a/src/peprocessor.h +++ b/src/peprocessor.h @@ -80,6 +80,7 @@ class PairEndProcessor{ WriterThread* mUnpairedLeftWriter; WriterThread* mUnpairedRightWriter; WriterThread* mMergedWriter; + WriterThread* mFailedWriter; Duplicate* mDuplicate; }; diff --git a/src/read.cpp b/src/read.cpp index 597b4f1..407644c 100644 --- a/src/read.cpp +++ b/src/read.cpp @@ -131,6 +131,10 @@ string Read::toString() { return mName + "\n" + mSeq.mStr + "\n" + mStrand + "\n" + mQuality + "\n"; } +string Read::toStringWithTag(string tag) { + return mName + " " + tag + "\n" + mSeq.mStr + "\n" + mStrand + "\n" + mQuality + "\n"; +} + bool Read::test(){ Read r("@NS500713:64:HFKJJBGXY:1:11101:20469:1097 1:N:0:TATAGCCT+GGTCCCGA", "CTCTTGGACTCTAACACTGTTTTTTCTTATGAAAACACAGGAGTGATGACTAGTTGAGTGCATTCTTATGAGACTCATAGTCATTCTATGATGTAGTTTTCCTTAGGAGGACATTTTTTACATGAAATTATTAACCTAAATAGAGTTGATC", diff --git a/src/read.h b/src/read.h index 8c2efe9..0ce9093 100644 --- a/src/read.h +++ b/src/read.h @@ -27,6 +27,7 @@ class Read{ int lowQualCount(int qual=20); int length(); string toString(); + string toStringWithTag(string tag); void resize(int len); void convertPhred64To33(); void trimFront(int len); diff --git a/src/seprocessor.cpp b/src/seprocessor.cpp index 127a293..a9a6f0b 100644 --- a/src/seprocessor.cpp +++ b/src/seprocessor.cpp @@ -20,6 +20,7 @@ SingleEndProcessor::SingleEndProcessor(Options* opt){ mZipFile = NULL; mUmiProcessor = new UmiProcessor(opt); mLeftWriter = NULL; + mFailedWriter = NULL; mDuplicate = NULL; if(mOptions->duplicate.enabled) { @@ -36,6 +37,8 @@ SingleEndProcessor::~SingleEndProcessor() { } void SingleEndProcessor::initOutput() { + if(!mOptions->failedOut.empty()) + mFailedWriter = new WriterThread(mOptions, mOptions->failedOut); if(mOptions->out1.empty()) return; mLeftWriter = new WriterThread(mOptions, mOptions->out1); @@ -46,6 +49,10 @@ void SingleEndProcessor::closeOutput() { delete mLeftWriter; mLeftWriter = NULL; } + if(mFailedWriter) { + delete mFailedWriter; + mFailedWriter = NULL; + } } void SingleEndProcessor::initConfig(ThreadConfig* config) { @@ -78,8 +85,11 @@ bool SingleEndProcessor::process(){ } std::thread* leftWriterThread = NULL; + std::thread* failedWriterThread = NULL; if(mLeftWriter) leftWriterThread = new std::thread(std::bind(&SingleEndProcessor::writeTask, this, mLeftWriter)); + if(mFailedWriter) + failedWriterThread = new std::thread(std::bind(&SingleEndProcessor::writeTask, this, mFailedWriter)); producer.join(); for(int t=0; tthread; t++){ @@ -89,6 +99,8 @@ bool SingleEndProcessor::process(){ if(!mOptions->split.enabled) { if(leftWriterThread) leftWriterThread->join(); + if(failedWriterThread) + failedWriterThread->join(); } if(mOptions->verbose) @@ -169,6 +181,8 @@ bool SingleEndProcessor::process(){ if(leftWriterThread) delete leftWriterThread; + if(failedWriterThread) + delete failedWriterThread; if(!mOptions->split.enabled) closeOutput(); @@ -178,6 +192,7 @@ bool SingleEndProcessor::process(){ bool SingleEndProcessor::processSingleEnd(ReadPack* pack, ThreadConfig* config){ string outstr; + string failedOut; int readPassed = 0; for(int p=0;pcount;p++){ @@ -234,6 +249,8 @@ bool SingleEndProcessor::processSingleEnd(ReadPack* pack, ThreadConfig* config){ // stats the read after filtering config->getPostStats1()->statRead(r1); readPassed++; + } else if(mFailedWriter) { + failedOut += or1->toStringWithTag(FAILED_TYPES[result]); } delete or1; @@ -251,12 +268,17 @@ bool SingleEndProcessor::processSingleEnd(ReadPack* pack, ThreadConfig* config){ if(!mOptions->out1.empty()) config->getWriter1()->writeString(outstr); } - else { - if(mLeftWriter) { - char* ldata = new char[outstr.size()]; - memcpy(ldata, outstr.c_str(), outstr.size()); - mLeftWriter->input(ldata, outstr.size()); - } + + if(mLeftWriter) { + char* ldata = new char[outstr.size()]; + memcpy(ldata, outstr.c_str(), outstr.size()); + mLeftWriter->input(ldata, outstr.size()); + } + if(mFailedWriter && !failedOut.empty()) { + // write failed data + char* fdata = new char[failedOut.size()]; + memcpy(fdata, failedOut.c_str(), failedOut.size()); + mFailedWriter->input(fdata, failedOut.size()); } if(!mOptions->split.enabled) mOutputMtx.unlock(); @@ -469,6 +491,8 @@ void SingleEndProcessor::consumerTask(ThreadConfig* config) if(mFinishedThreads == mOptions->thread) { if(mLeftWriter) mLeftWriter->setInputCompleted(); + if(mFailedWriter) + mFailedWriter->setInputCompleted(); } if(mOptions->verbose) { diff --git a/src/seprocessor.h b/src/seprocessor.h index a6f0cac..b60c59f 100644 --- a/src/seprocessor.h +++ b/src/seprocessor.h @@ -68,6 +68,7 @@ class SingleEndProcessor{ ofstream* mOutStream; UmiProcessor* mUmiProcessor; WriterThread* mLeftWriter; + WriterThread* mFailedWriter; Duplicate* mDuplicate; }; diff --git a/testdata/R1.fq.gz b/testdata/R1.fq.gz deleted file mode 100644 index 8a59d73ce1a35cb888cea0328d82ab20fbf41471..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 319 zcmV-F0l@wriwFpJFZo#j15zWJWxD&PLE|qQVe91kyu5VWa>-x7;n8<=UgMDiEFLlv?93 zCF;-Q%Er|gYFe+|meU`SZ;8-D&2>%QZ_zz4`LMST4eM+P^P*Si2 z3K0VnT}iQ{l*ODgLQPFTw2gYz=~3o<_zp1d()?HQDcbJ-&v4uBZGVd_VR2uz2dH|+ RTOvpie*?;jj#K6W005v?n$!RQ diff --git a/testdata/R2.fq.gz b/testdata/R2.fq.gz deleted file mode 100644 index a5c38479eb00d8ea50882432632930e8fa608fa0..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 345 zcmV-f0jB;RiwFpJFZo#j15z?BW^n+8k+E*WFbqU@e?)gAaNWYupuvZ`C`FVx(CvyS-i>XJ_aP=SqG7rLz!5ks< z Date: Mon, 8 Apr 2019 10:47:48 +0800 Subject: [PATCH 08/19] check file names to avoid different outputs having same file name --- src/options.cpp | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/src/options.cpp b/src/options.cpp index 73bf2d6..5c88b1d 100644 --- a/src/options.cpp +++ b/src/options.cpp @@ -93,6 +93,22 @@ bool Options::validate() { if(merge.out.empty() && !outputToSTDOUT) { error_exit("In merging mode, you should either specify --merged_out or enable --stdout"); } + if(!merge.out.empty()) { + if(merge.out == out1) + error_exit("--merged_out and --out1 shouldn't have same file name"); + if(merge.out == out2) + error_exit("--merged_out and --out2 shouldn't have same file name"); + if(merge.out == unpaired1) + error_exit("--merged_out and --unpaired1 shouldn't have same file name"); + if(merge.out == unpaired2) + error_exit("--merged_out and --unpaired2 shouldn't have same file name"); + } + } else { + // not in merging mode + if(!merge.out.empty()) { + cerr << "You haven't enabled merging mode (-m/--merge), ignoring argument --merged_out = " << merge.out << endl; + merge.out = ""; + } } // if output to STDOUT, then... @@ -168,16 +184,34 @@ bool Options::validate() { if(dontOverwrite && file_exists(unpaired1)) { error_exit(unpaired1 + " already exists and you have set to not rewrite output files by --dont_overwrite"); } + if(unpaired1 == out1) + error_exit("--unpaired1 and --out1 shouldn't have same file name"); + if(unpaired1 == out2) + error_exit("--unpaired1 and --out2 shouldn't have same file name"); } if(!unpaired2.empty()) { if(dontOverwrite && file_exists(unpaired2)) { error_exit(unpaired2 + " already exists and you have set to not rewrite output files by --dont_overwrite"); } + if(unpaired2 == out1) + error_exit("--unpaired2 and --out1 shouldn't have same file name"); + if(unpaired2 == out2) + error_exit("--unpaired2 and --out2 shouldn't have same file name"); } if(!failedOut.empty()) { if(dontOverwrite && file_exists(failedOut)) { error_exit(failedOut + " already exists and you have set to not rewrite output files by --dont_overwrite"); } + if(failedOut == out1) + error_exit("--failed_out and --out1 shouldn't have same file name"); + if(failedOut == out2) + error_exit("--failed_out and --out2 shouldn't have same file name"); + if(failedOut == unpaired1) + error_exit("--failed_out and --unpaired1 shouldn't have same file name"); + if(failedOut == unpaired2) + error_exit("--failed_out and --unpaired2 shouldn't have same file name"); + if(failedOut == merge.out) + error_exit("--failed_out and --merged_out shouldn't have same file name"); } if(dontOverwrite) { From ae89bcd3d6b5b2ab1f55d49ce42b75ace1cd27aa Mon Sep 17 00:00:00 2001 From: Shifu Chen Date: Mon, 8 Apr 2019 10:59:44 +0800 Subject: [PATCH 09/19] update README for unpaired or failed output --- README.md | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/README.md b/README.md index 7ade784..b1276d8 100644 --- a/README.md +++ b/README.md @@ -101,12 +101,21 @@ sudo make install ## input from STDIN * specify `--stdin` if you want to read the STDIN for processing. * if the STDIN is an interleaved paired-end stream, specify `--interleaved_in` to indicate that. +## store the unpaired reads for PE data +* you can specify `--unpaired1` to store the reads that read1 passes filters but its paired read2 doesn't, as well as `--unpaired2` for unpaired read2. +* `--unpaired1` and `--unpaired2` can be the same, so the unpaired read1/read2 will be written to the same single file. +## store the reads that fail the filters +* give `--failed_out` to specify the file name to store the failed reads. +* if one read failed and is written to `--failed_out`, its `failure reason` will be appended to its read name. For example, `failed_quality_filter`, `failed_too_short` etc. +* for PE data, if unpaired reads are not stored (by giving --unpaired1 or --unpaired2), the failed pair of reads will be put together. If one read passes the filters but its pair doesn't, the `failure reason` will be `paired_read_is_failing`. ## process only part of the data If you don't want to process all the data, you can specify `--reads_to_process` to limit the reads to be processed. This is useful if you want to have a fast preview of the data quality, or you want to create a subset of the filtered data. ## do not overwrite exiting files You can enable the option `--dont_overwrite` to protect the existing files not to be overwritten by `fastp`. In this case, `fastp` will report an error and quit if it finds any of the output files (read1, read2, json report, html report) already exists before. ## split the output to multiple files for parallel processing See [output splitting](#output-splitting) +## merge PE reads +See [merge paired-end reads](#merge-paired-end-reads) # filtering Multiple filters have been implemented. @@ -277,6 +286,7 @@ options: -O, --out2 read2 output file name (string [=]) --unpaired1 for PE input, if read1 passed QC but read2 not, it will be written to unpaired1. Default is to discard it. (string [=]) --unpaired2 for PE input, if read2 passed QC but read1 not, it will be written to unpaired2. If --unpaired2 is same as --umpaired1 (default mode), both unpaired reads will be written to this same file. (string [=]) + --failed_out specify the file to store reads that cannot pass the filters. (string [=]) -m, --merge for paired-end input, merge each pair of reads into a single read if they are overlapped. The merged reads will be written to the file given by --merged_out, the unmerged reads will be written to the files specified by --out1 and --out2. The merging mode is disabled by default. --merged_out in the merging mode, specify the file name to store merged output, or specify --stdout to stream the merged output (string [=]) --include_unmerged in the merging mode, write the unmerged or unpaired reads to the file specified by --merge. Disabled by default. From 345538ade329effb5c8e6be6656139a4b0012f2f Mon Sep 17 00:00:00 2001 From: Shifu Chen Date: Mon, 8 Apr 2019 11:13:36 +0800 Subject: [PATCH 10/19] Update README.md --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index b1276d8..fbc1d63 100644 --- a/README.md +++ b/README.md @@ -269,6 +269,8 @@ For paired-end (PE) input, fastp supports stiching them by specifying the `-m/-- * `--unpaired2` will be the reads that cannot be merged, `read2` passes filters but `read1` doesn't. * `--include_unmerged` can be enabled to make reads of `--out1`, `--out2`, `--unpaired1` and `--unpaired2` redirected to `--merged_out`. So you will get a single output file. This option is disabled by default. +`--failed_out` can still be given to store the reads (either merged or unmerged) failed to passing filters. + In the output file, a tag like `merged_xxx_yyy`will be added to each read name to indicate that how many base pairs are from read1 and from read2, respectively. For example, ` @NB551106:9:H5Y5GBGX2:1:22306:18653:13119 1:N:0:GATCAG merged_150_15` means that 150bp are from read1, and 15bp are from read2. `fastp` prefers the bases in read1 since they usually have higher quality than read2. From 473f5fd6872e84c44466916efdca86a22536d049 Mon Sep 17 00:00:00 2001 From: Shifu Chen Date: Tue, 9 Apr 2019 07:56:45 +0800 Subject: [PATCH 11/19] update adapter detection with more builtin adapters --- src/common.h | 2 +- src/evaluator.cpp | 2 +- src/knownadapters.h | 236 +++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 235 insertions(+), 5 deletions(-) diff --git a/src/common.h b/src/common.h index 22b2e4b..a43528f 100644 --- a/src/common.h +++ b/src/common.h @@ -1,7 +1,7 @@ #ifndef COMMON_H #define COMMON_H -#define FASTP_VER "0.19.9" +#define FASTP_VER "0.19.10" #define _DEBUG false diff --git a/src/evaluator.cpp b/src/evaluator.cpp index 1064f57..86af97a 100644 --- a/src/evaluator.cpp +++ b/src/evaluator.cpp @@ -617,7 +617,7 @@ string Evaluator::getAdapterWithSeed(int seed, Read** loadedReads, long records, string matchedAdapter = matchKnownAdapter(adapter); if(!matchedAdapter.empty()) { map knownAdapters = getKnownAdapter(); - cerr << knownAdapters[matchedAdapter] << ": " << matchedAdapter << endl; + cerr << knownAdapters[matchedAdapter] << endl << matchedAdapter << endl; return matchedAdapter; } else { if(reachedLeaf) { diff --git a/src/knownadapters.h b/src/knownadapters.h index b7def58..ecbffd8 100644 --- a/src/knownadapters.h +++ b/src/knownadapters.h @@ -1,6 +1,8 @@ #ifndef KNOWN_ADAPTERS_H #define KNOWN_ADAPTERS_H +// some adapter sequences are from https://github.com/stephenturner/adapters/blob/master/adapters_combined_256_unique.fasta + #include #include #include @@ -9,9 +11,237 @@ inline map getKnownAdapter() { map knownAdapters; - knownAdapters["AGATCGGAAGAGCACACGTCTGAACTCCAGTCA"] = "Illumina TruSeq Adapter Read 1"; - knownAdapters["AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT"] = "Illumina TruSeq Adapter Read 2"; - knownAdapters["GATCGTCGGACTGTAGAACTCTGAACGTGTAGA"] = "Illumina Small RNA Adapter Read 2"; + knownAdapters["AGATCGGAAGAGCACACGTCTGAACTCCAGTCA"] = ">Illumina TruSeq Adapter Read 1"; + knownAdapters["AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT"] = ">Illumina TruSeq Adapter Read 2"; + knownAdapters["GATCGTCGGACTGTAGAACTCTGAACGTGTAGA"] = ">Illumina Small RNA Adapter Read 2"; + knownAdapters["AATGATACGGCGACCACCGACAGGTTCAGAGTTCTACAGTCCGA"] = ">Illumina DpnII expression PCR Primer 2 | >Illumina NlaIII expression PCR Primer 2 | >Illumina Small RNA PCR Primer 2 | >Illumina DpnII Gex PCR Primer 2 | >Illumina NlaIII Gex PCR Primer 2"; + knownAdapters["AATGATACGGCGACCACCGAGATCTACACGTTCAGAGTTCTACAGTCCGA"] = ">Illumina RNA PCR Primer"; + knownAdapters["AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCT"] = ">TruSeq_Universal_Adapter | >PrefixPE/1 | >PCR_Primer1 | >Illumina Single End PCR Primer 1 | >Illumina Paried End PCR Primer 1 | >Illumina Multiplexing PCR Primer 1.01 | >TruSeq Universal Adapter | >TruSeq_Universal_Adapter | >PrefixPE/1 | >PCR_Primer1"; + knownAdapters["AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCTAGATCGGAAGAGCGGTTCAGCAGGAATGCCGAGACCGATCTCGTATGCCGTCTTCTGCTTG"] = ">pcr_dimer"; + knownAdapters["AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCTCAAGCAGAAGACGGCATACGAGCTCTTCCGATCT"] = ">PCR_Primers"; + knownAdapters["ACACTCTTTCCCTACACGACGCTCTTCCGATCT"] = ">Illumina Single End Sequencing Primer | >Illumina Paired End Adapter 1 | >Illumina Paried End Sequencing Primer 1 | >Illumina Multiplexing Adapter 2 | >Illumina Multiplexing Read1 Sequencing Primer"; + knownAdapters["AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC"] = ">PE2_rc | >TruSeq3_IndexedAdapter | >PE2_rc | >TruSeq3_IndexedAdapter"; + knownAdapters["AGATCGGAAGAGCACACGTCTGAACTCCAGTCACATCACGATCTCGTATGCCGTCTTCTGCTTG"] = ">Reverse_adapter"; + knownAdapters["AGATCGGAAGAGCGGTTCAGCAGGAATGCCGAG"] = ">TruSeq2_PE_r"; + knownAdapters["AGATCGGAAGAGCGGTTCAGCAGGAATGCCGAGACCGATCTCGTATGCCGTCTTCTGCTTG"] = ">PCR_Primer2_rc"; + knownAdapters["AGATCGGAAGAGCGGTTCAGCAGGAATGCCGAGACCGATCTCGTATGCCGTCTTCTGCTTGAAA"] = ">PhiX_read1_adapter"; + knownAdapters["AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTA"] = ">PE1_rc | >TruSeq3_UniversalAdapter | >PE1_rc | >TruSeq3_UniversalAdapter"; + knownAdapters["AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTAGATCTCGGTGGTCGCCGTATCATT"] = ">PCR_Primer1_rc"; + knownAdapters["AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTAGATCTCGGTGGTCGCCGTATCATTAAAAAA"] = ">PhiX_read2_adapter"; + knownAdapters["AGATCGGAAGAGCTCGTATGCCGTCTTCTGCTTG"] = ">TruSeq2_SE"; + knownAdapters["CAAGCAGAAGACGGCATACGAGATAAAATGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA"] = ">RNA PCR Primer, Index 35"; + knownAdapters["CAAGCAGAAGACGGCATACGAGATAAGCTAGTGACTGGAGTTC"] = ">Illumina PCR Primer Index 10"; + knownAdapters["CAAGCAGAAGACGGCATACGAGATAAGCTAGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA"] = ">RNA PCR Primer, Index 10"; + knownAdapters["CAAGCAGAAGACGGCATACGAGATACATCGGTGACTGGAGTTC"] = ">Illumina PCR Primer Index 2"; + knownAdapters["CAAGCAGAAGACGGCATACGAGATACATCGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA"] = ">RNA PCR Primer, Index 2"; + knownAdapters["CAAGCAGAAGACGGCATACGAGATAGCTAGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA"] = ">RNA PCR Primer, Index 38"; + knownAdapters["CAAGCAGAAGACGGCATACGAGATAGGAATGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA"] = ">RNA PCR Primer, Index 27"; + knownAdapters["CAAGCAGAAGACGGCATACGAGATATCAGTGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA"] = ">RNA PCR Primer, Index 25"; + knownAdapters["CAAGCAGAAGACGGCATACGAGATATCGTGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA"] = ">RNA PCR Primer, Index 31"; + knownAdapters["CAAGCAGAAGACGGCATACGAGATATTATAGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA"] = ">RNA PCR Primer, Index 44"; + knownAdapters["CAAGCAGAAGACGGCATACGAGATATTCCGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA"] = ">RNA PCR Primer, Index 37"; + knownAdapters["CAAGCAGAAGACGGCATACGAGATATTGGCGTGACTGGAGTTC"] = ">Illumina PCR Primer Index 6"; + knownAdapters["CAAGCAGAAGACGGCATACGAGATATTGGCGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA"] = ">RNA PCR Primer, Index 6"; + knownAdapters["CAAGCAGAAGACGGCATACGAGATCACTGTGTGACTGGAGTTC"] = ">Illumina PCR Primer Index 5"; + knownAdapters["CAAGCAGAAGACGGCATACGAGATCACTGTGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA"] = ">RNA PCR Primer, Index 5"; + knownAdapters["CAAGCAGAAGACGGCATACGAGATCCACTCGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA"] = ">RNA PCR Primer, Index 23"; + knownAdapters["CAAGCAGAAGACGGCATACGAGATCCGGTGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA"] = ">RNA PCR Primer, Index 30"; + knownAdapters["CAAGCAGAAGACGGCATACGAGATCGAAACGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA"] = ">RNA PCR Primer, Index 21"; + knownAdapters["CAAGCAGAAGACGGCATACGAGATCGATTAGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA"] = ">RNA PCR Primer, Index 42"; + knownAdapters["CAAGCAGAAGACGGCATACGAGATCGCCTGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA"] = ">RNA PCR Primer, Index 33"; + knownAdapters["CAAGCAGAAGACGGCATACGAGATCGGTCTCGGCATTCCTGCTGAACCGCTCTTCCGATCT"] = ">PrefixPE/2 | >PCR_Primer2 | >Illumina Paired End PCR Primer 2 | >PrefixPE/2 | >PCR_Primer2"; + knownAdapters["CAAGCAGAAGACGGCATACGAGATCGTACGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA"] = ">RNA PCR Primer, Index 22"; + knownAdapters["CAAGCAGAAGACGGCATACGAGATCGTGATGTGACTGGAGTTC"] = ">Illumina PCR Primer Index 1"; + knownAdapters["CAAGCAGAAGACGGCATACGAGATCGTGATGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA"] = ">RNA PCR Primer, Index 1"; + knownAdapters["CAAGCAGAAGACGGCATACGAGATCTCTACGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA"] = ">RNA PCR Primer, Index 17"; + knownAdapters["CAAGCAGAAGACGGCATACGAGATCTGATCGTGACTGGAGTTC"] = ">Illumina PCR Primer Index 9"; + knownAdapters["CAAGCAGAAGACGGCATACGAGATCTGATCGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA"] = ">RNA PCR Primer, Index 9"; + knownAdapters["CAAGCAGAAGACGGCATACGAGATCTTCGAGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA"] = ">RNA PCR Primer, Index 47"; + knownAdapters["CAAGCAGAAGACGGCATACGAGATCTTTTGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA"] = ">RNA PCR Primer, Index 28"; + knownAdapters["CAAGCAGAAGACGGCATACGAGATGAATGAGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA"] = ">RNA PCR Primer, Index 45"; + knownAdapters["CAAGCAGAAGACGGCATACGAGATGATCTGGTGACTGGAGTTC"] = ">Illumina PCR Primer Index 7"; + knownAdapters["CAAGCAGAAGACGGCATACGAGATGATCTGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA"] = ">RNA PCR Primer, Index 7"; + knownAdapters["CAAGCAGAAGACGGCATACGAGATGCCATGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA"] = ">RNA PCR Primer, Index 34"; + knownAdapters["CAAGCAGAAGACGGCATACGAGATGCCTAAGTGACTGGAGTTC"] = ">Illumina PCR Primer Index 3"; + knownAdapters["CAAGCAGAAGACGGCATACGAGATGCCTAAGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA"] = ">RNA PCR Primer, Index 3"; + knownAdapters["CAAGCAGAAGACGGCATACGAGATGCGGACGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA"] = ">RNA PCR Primer, Index 18"; + knownAdapters["CAAGCAGAAGACGGCATACGAGATGCTACCGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA"] = ">RNA PCR Primer, Index 24"; + knownAdapters["CAAGCAGAAGACGGCATACGAGATGCTCATGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA"] = ">RNA PCR Primer, Index 26"; + knownAdapters["CAAGCAGAAGACGGCATACGAGATGCTGTAGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA"] = ">RNA PCR Primer, Index 43"; + knownAdapters["CAAGCAGAAGACGGCATACGAGATGGAACTGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA"] = ">RNA PCR Primer, Index 14"; + knownAdapters["CAAGCAGAAGACGGCATACGAGATGGACGGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA"] = ">RNA PCR Primer, Index 16"; + knownAdapters["CAAGCAGAAGACGGCATACGAGATGGCCACGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA"] = ">RNA PCR Primer, Index 20"; + knownAdapters["CAAGCAGAAGACGGCATACGAGATGTAGCCGTGACTGGAGTTC"] = ">Illumina PCR Primer Index 11"; + knownAdapters["CAAGCAGAAGACGGCATACGAGATGTAGCCGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA"] = ">RNA PCR Primer, Index 11"; + knownAdapters["CAAGCAGAAGACGGCATACGAGATGTATAGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA"] = ">RNA PCR Primer, Index 39"; + knownAdapters["CAAGCAGAAGACGGCATACGAGATGTCGTCGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA"] = ">RNA PCR Primer, Index 41"; + knownAdapters["CAAGCAGAAGACGGCATACGAGATTACAAGGTGACTGGAGTTC"] = ">Illumina PCR Primer Index 12"; + knownAdapters["CAAGCAGAAGACGGCATACGAGATTACAAGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA"] = ">RNA PCR Primer, Index 12"; + knownAdapters["CAAGCAGAAGACGGCATACGAGATTAGTTGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA"] = ">RNA PCR Primer, Index 29"; + knownAdapters["CAAGCAGAAGACGGCATACGAGATTCAAGTGTGACTGGAGTTC"] = ">Illumina PCR Primer Index 8"; + knownAdapters["CAAGCAGAAGACGGCATACGAGATTCAAGTGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA"] = ">RNA PCR Primer, Index 8"; + knownAdapters["CAAGCAGAAGACGGCATACGAGATTCGGGAGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA"] = ">RNA PCR Primer, Index 46"; + knownAdapters["CAAGCAGAAGACGGCATACGAGATTCTGAGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA"] = ">RNA PCR Primer, Index 40"; + knownAdapters["CAAGCAGAAGACGGCATACGAGATTGACATGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA"] = ">RNA PCR Primer, Index 15"; + knownAdapters["CAAGCAGAAGACGGCATACGAGATTGAGTGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA"] = ">RNA PCR Primer, Index 32"; + knownAdapters["CAAGCAGAAGACGGCATACGAGATTGCCGAGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA"] = ">RNA PCR Primer, Index 48"; + knownAdapters["CAAGCAGAAGACGGCATACGAGATTGGTCAGTGACTGGAGTTC"] = ">Illumina PCR Primer Index 4"; + knownAdapters["CAAGCAGAAGACGGCATACGAGATTGGTCAGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA"] = ">RNA PCR Primer, Index 4"; + knownAdapters["CAAGCAGAAGACGGCATACGAGATTGTTGGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA"] = ">RNA PCR Primer, Index 36"; + knownAdapters["CAAGCAGAAGACGGCATACGAGATTTGACTGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA"] = ">RNA PCR Primer, Index 13"; + knownAdapters["CAAGCAGAAGACGGCATACGAGATTTTCACGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA"] = ">RNA PCR Primer, Index 19"; + knownAdapters["CAAGCAGAAGACGGCATACGAGCTCTTCCGATCT"] = ">Illumina Single End Adapter 2 | >Illumina Single End PCR Primer 2"; + knownAdapters["CCACTACGCCTCCGCTTTCCTCTCTATGGGCAGTCGGTGAT"] = ">ABI Solid3 Adapter B"; + knownAdapters["CCGACAGGTTCAGAGTTCTACAGTCCGACATG"] = ">Illumina NlaIII expression Sequencing Primer | >Illumina NlaIII Gex Sequencing Primer"; + knownAdapters["CCGAGCCCACGAGACAAGAGGCAATCTCGTATGCCGTCTTCTGCTTG"] = ">I7_Primer_Nextera_XT_and_Nextera_Enrichment_N711 | >I7_Primer_Nextera_XT_Index_Kit_v2_N711 | >I7_Primer_Nextera_XT_and_Nextera_Enrichment_N711 | >I7_Primer_Nextera_XT_Index_Kit_v2_N711"; + knownAdapters["CCGAGCCCACGAGACACTCGCTAATCTCGTATGCCGTCTTCTGCTTG"] = ">I7_Primer_Nextera_XT_Index_Kit_v2_N716"; + knownAdapters["CCGAGCCCACGAGACACTGAGCGATCTCGTATGCCGTCTTCTGCTTG"] = ">I7_Primer_Nextera_XT_Index_Kit_v2_N724"; + knownAdapters["CCGAGCCCACGAGACAGGCAGAAATCTCGTATGCCGTCTTCTGCTTG"] = ">I7_Primer_Nextera_XT_and_Nextera_Enrichment_N703 | >I7_Primer_Nextera_XT_Index_Kit_v2_N703 | >I7_Primer_Nextera_XT_and_Nextera_Enrichment_N703 | >I7_Primer_Nextera_XT_Index_Kit_v2_N703"; + knownAdapters["CCGAGCCCACGAGACATCTCAGGATCTCGTATGCCGTCTTCTGCTTG"] = ">I7_Primer_Nextera_XT_Index_Kit_v2_N715"; + knownAdapters["CCGAGCCCACGAGACATGCGCAGATCTCGTATGCCGTCTTCTGCTTG"] = ">I7_Primer_Nextera_XT_Index_Kit_v2_N722"; + knownAdapters["CCGAGCCCACGAGACCAGAGAGGATCTCGTATGCCGTCTTCTGCTTG"] = ">I7_Primer_Nextera_XT_and_Nextera_Enrichment_N708"; + knownAdapters["CCGAGCCCACGAGACCCTAAGACATCTCGTATGCCGTCTTCTGCTTG"] = ">I7_Primer_Nextera_XT_Index_Kit_v2_N726"; + knownAdapters["CCGAGCCCACGAGACCGAGGCTGATCTCGTATGCCGTCTTCTGCTTG"] = ">I7_Primer_Nextera_XT_and_Nextera_Enrichment_N710 | >I7_Primer_Nextera_XT_Index_Kit_v2_N710 | >I7_Primer_Nextera_XT_and_Nextera_Enrichment_N710 | >I7_Primer_Nextera_XT_Index_Kit_v2_N710"; + knownAdapters["CCGAGCCCACGAGACCGATCAGTATCTCGTATGCCGTCTTCTGCTTG"] = ">I7_Primer_Nextera_XT_Index_Kit_v2_N727"; + knownAdapters["CCGAGCCCACGAGACCGGAGCCTATCTCGTATGCCGTCTTCTGCTTG"] = ">I7_Primer_Nextera_XT_Index_Kit_v2_N720"; + knownAdapters["CCGAGCCCACGAGACCGTACTAGATCTCGTATGCCGTCTTCTGCTTG"] = ">I7_Primer_Nextera_XT_and_Nextera_Enrichment_N702 | >I7_Primer_Nextera_XT_Index_Kit_v2_N702 | >I7_Primer_Nextera_XT_and_Nextera_Enrichment_N702 | >I7_Primer_Nextera_XT_Index_Kit_v2_N702"; + knownAdapters["CCGAGCCCACGAGACCTCTCTACATCTCGTATGCCGTCTTCTGCTTG"] = ">I7_Primer_Nextera_XT_and_Nextera_Enrichment_N707 | >I7_Primer_Nextera_XT_Index_Kit_v2_N707 | >I7_Primer_Nextera_XT_and_Nextera_Enrichment_N707 | >I7_Primer_Nextera_XT_Index_Kit_v2_N707"; + knownAdapters["CCGAGCCCACGAGACGCGTAGTAATCTCGTATGCCGTCTTCTGCTTG"] = ">I7_Primer_Nextera_XT_Index_Kit_v2_N719"; + knownAdapters["CCGAGCCCACGAGACGCTACGCTATCTCGTATGCCGTCTTCTGCTTG"] = ">I7_Primer_Nextera_XT_and_Nextera_Enrichment_N709"; + knownAdapters["CCGAGCCCACGAGACGCTCATGAATCTCGTATGCCGTCTTCTGCTTG"] = ">I7_Primer_Nextera_XT_Index_Kit_v2_N714"; + knownAdapters["CCGAGCCCACGAGACGGACTCCTATCTCGTATGCCGTCTTCTGCTTG"] = ">I7_Primer_Nextera_XT_and_Nextera_Enrichment_N705 | >I7_Primer_Nextera_XT_Index_Kit_v2_N705 | >I7_Primer_Nextera_XT_and_Nextera_Enrichment_N705 | >I7_Primer_Nextera_XT_Index_Kit_v2_N705"; + knownAdapters["CCGAGCCCACGAGACGGAGCTACATCTCGTATGCCGTCTTCTGCTTG"] = ">I7_Primer_Nextera_XT_Index_Kit_v2_N718"; + knownAdapters["CCGAGCCCACGAGACGTAGAGGAATCTCGTATGCCGTCTTCTGCTTG"] = ">I7_Primer_Nextera_XT_and_Nextera_Enrichment_N712 | >I7_Primer_Nextera_XT_Index_Kit_v2_N712 | >I7_Primer_Nextera_XT_and_Nextera_Enrichment_N712 | >I7_Primer_Nextera_XT_Index_Kit_v2_N712"; + knownAdapters["CCGAGCCCACGAGACTAAGGCGAATCTCGTATGCCGTCTTCTGCTTG"] = ">I7_Primer_Nextera_XT_and_Nextera_Enrichment_N701 | >I7_Primer_Nextera_XT_Index_Kit_v2_N701 | >I7_Primer_Nextera_XT_and_Nextera_Enrichment_N701 | >I7_Primer_Nextera_XT_Index_Kit_v2_N701"; + knownAdapters["CCGAGCCCACGAGACTACGCTGCATCTCGTATGCCGTCTTCTGCTTG"] = ">I7_Primer_Nextera_XT_Index_Kit_v2_N721"; + knownAdapters["CCGAGCCCACGAGACTAGCGCTCATCTCGTATGCCGTCTTCTGCTTG"] = ">I7_Primer_Nextera_XT_Index_Kit_v2_N723"; + knownAdapters["CCGAGCCCACGAGACTAGGCATGATCTCGTATGCCGTCTTCTGCTTG"] = ">I7_Primer_Nextera_XT_and_Nextera_Enrichment_N706 | >I7_Primer_Nextera_XT_Index_Kit_v2_N706 | >I7_Primer_Nextera_XT_and_Nextera_Enrichment_N706 | >I7_Primer_Nextera_XT_Index_Kit_v2_N706"; + knownAdapters["CCGAGCCCACGAGACTCCTGAGCATCTCGTATGCCGTCTTCTGCTTG"] = ">I7_Primer_Nextera_XT_and_Nextera_Enrichment_N704 | >I7_Primer_Nextera_XT_Index_Kit_v2_N704 | >I7_Primer_Nextera_XT_and_Nextera_Enrichment_N704 | >I7_Primer_Nextera_XT_Index_Kit_v2_N704"; + knownAdapters["CCGAGCCCACGAGACTCGACGTCATCTCGTATGCCGTCTTCTGCTTG"] = ">I7_Primer_Nextera_XT_Index_Kit_v2_N729"; + knownAdapters["CCGAGCCCACGAGACTGCAGCTAATCTCGTATGCCGTCTTCTGCTTG"] = ">I7_Primer_Nextera_XT_Index_Kit_v2_N728"; + knownAdapters["CGACAGGTTCAGAGTTCTACAGTCCGACGATC"] = ">Illumina DpnII expression Sequencing Primer | >Illumina Small RNA Sequencing Primer | >Illumina DpnII Gex Sequencing Primer"; + knownAdapters["CGGTCTCGGCATTCCTGCTGAACCGCTCTTCCGATCT"] = ">Illumina Paired End Sequencing Primer 2"; + knownAdapters["CTAATACGACTCACTATAGGGCAAGCAGTGGTATCAACGCAGAGT"] = ">Clontech Universal Primer Mix Long"; + knownAdapters["CTGAGCGGGCTGGCAAGGCAGACCGATCTCGTATGCCGTCTTCTGCTTG"] = ">I7_Adapter_Nextera_No_Barcode"; + knownAdapters["CTGATGGCGCGAGGGAGGCGTGTAGATCTCGGTGGTCGCCGTATCATT"] = ">I5_Adapter_Nextera"; + knownAdapters["CTGCCCCGGGTTCCTCATTCTCTCAGCAGCATG"] = ">ABI Solid3 Adapter A"; + knownAdapters["CTGTCTCTTATACACATCTCCGAGCCCACGAGAC"] = ">I7_Nextera_Transposase_1 | >Trans2_rc | >I7_Nextera_Transposase_1 | >Trans2_rc"; + knownAdapters["CTGTCTCTTATACACATCTCTGAGCGGGCTGGCAAGGC"] = ">I7_Nextera_Transposase_2"; + knownAdapters["CTGTCTCTTATACACATCTCTGATGGCGCGAGGGAGGC"] = ">I5_Nextera_Transposase_2"; + knownAdapters["CTGTCTCTTATACACATCTGACGCTGCCGACGA"] = ">I5_Nextera_Transposase_1 | >Trans1_rc | >I5_Nextera_Transposase_1 | >Trans1_rc"; + knownAdapters["GACGCTGCCGACGAACTCTAGGGTGTAGATCTCGGTGGTCGCCGTATCATT"] = ">I5_Primer_Nextera_XT_Index_Kit_v2_S516"; + knownAdapters["GACGCTGCCGACGAAGAGGATAGTGTAGATCTCGGTGGTCGCCGTATCATT"] = ">I5_Primer_Nextera_XT_and_Nextera_Enrichment_[N/S/E]503 | >I5_Primer_Nextera_XT_Index_Kit_v2_S503 | >I5_Primer_Nextera_XT_and_Nextera_Enrichment_[N/S/E]503 | >I5_Primer_Nextera_XT_Index_Kit_v2_S503"; + knownAdapters["GACGCTGCCGACGAAGCTAGAAGTGTAGATCTCGGTGGTCGCCGTATCATT"] = ">I5_Primer_Nextera_XT_Index_Kit_v2_S515"; + knownAdapters["GACGCTGCCGACGAAGGCTTAGGTGTAGATCTCGGTGGTCGCCGTATCATT"] = ">I5_Primer_Nextera_XT_and_Nextera_Enrichment_[N/S/E]508 | >I5_Primer_Nextera_XT_Index_Kit_v2_S508 | >I5_Primer_Nextera_XT_and_Nextera_Enrichment_[N/S/E]508 | >I5_Primer_Nextera_XT_Index_Kit_v2_S508"; + knownAdapters["GACGCTGCCGACGAATAGAGAGGTGTAGATCTCGGTGGTCGCCGTATCATT"] = ">I5_Primer_Nextera_XT_and_Nextera_Enrichment_[N/S/E]502 | >I5_Primer_Nextera_XT_Index_Kit_v2_S502 | >I5_Primer_Nextera_XT_and_Nextera_Enrichment_[N/S/E]502 | >I5_Primer_Nextera_XT_Index_Kit_v2_S502"; + knownAdapters["GACGCTGCCGACGAATAGCCTTGTGTAGATCTCGGTGGTCGCCGTATCATT"] = ">I5_Primer_Nextera_XT_Index_Kit_v2_S520"; + knownAdapters["GACGCTGCCGACGAATTAGACGGTGTAGATCTCGGTGGTCGCCGTATCATT"] = ">I5_Primer_Nextera_XT_Index_Kit_v2_S510"; + knownAdapters["GACGCTGCCGACGACGGAGAGAGTGTAGATCTCGGTGGTCGCCGTATCATT"] = ">I5_Primer_Nextera_XT_Index_Kit_v2_S511"; + knownAdapters["GACGCTGCCGACGACTAGTCGAGTGTAGATCTCGGTGGTCGCCGTATCATT"] = ">I5_Primer_Nextera_XT_Index_Kit_v2_S513"; + knownAdapters["GACGCTGCCGACGACTCCTTACGTGTAGATCTCGGTGGTCGCCGTATCATT"] = ">I5_Primer_Nextera_XT_and_Nextera_Enrichment_[N/S/E]505 | >I5_Primer_Nextera_XT_Index_Kit_v2_S505 | >I5_Primer_Nextera_XT_and_Nextera_Enrichment_[N/S/E]505 | >I5_Primer_Nextera_XT_Index_Kit_v2_S505"; + knownAdapters["GACGCTGCCGACGACTTAATAGGTGTAGATCTCGGTGGTCGCCGTATCATT"] = ">I5_Primer_Nextera_XT_Index_Kit_v2_S518"; + knownAdapters["GACGCTGCCGACGAGCGATCTAGTGTAGATCTCGGTGGTCGCCGTATCATT"] = ">I5_Primer_Nextera_XT_and_Nextera_Enrichment_[N/S/E]501"; + knownAdapters["GACGCTGCCGACGATAAGGCTCGTGTAGATCTCGGTGGTCGCCGTATCATT"] = ">I5_Primer_Nextera_XT_Index_Kit_v2_S521"; + knownAdapters["GACGCTGCCGACGATACTCCTTGTGTAGATCTCGGTGGTCGCCGTATCATT"] = ">I5_Primer_Nextera_XT_and_Nextera_Enrichment_[N/S/E]507 | >I5_Primer_Nextera_XT_Index_Kit_v2_S507 | >I5_Primer_Nextera_XT_and_Nextera_Enrichment_[N/S/E]507 | >I5_Primer_Nextera_XT_Index_Kit_v2_S507"; + knownAdapters["GACGCTGCCGACGATATGCAGTGTGTAGATCTCGGTGGTCGCCGTATCATT"] = ">I5_Primer_Nextera_XT_and_Nextera_Enrichment_[N/S/E]506 | >I5_Primer_Nextera_XT_Index_Kit_v2_S506 | >I5_Primer_Nextera_XT_and_Nextera_Enrichment_[N/S/E]506 | >I5_Primer_Nextera_XT_Index_Kit_v2_S506"; + knownAdapters["GACGCTGCCGACGATCGCATAAGTGTAGATCTCGGTGGTCGCCGTATCATT"] = ">I5_Primer_Nextera_XT_Index_Kit_v2_S522"; + knownAdapters["GACGCTGCCGACGATCTACTCTGTGTAGATCTCGGTGGTCGCCGTATCATT"] = ">I5_Primer_Nextera_XT_and_Nextera_Enrichment_[N/S/E]504"; + knownAdapters["GACGCTGCCGACGATCTTACGCGTGTAGATCTCGGTGGTCGCCGTATCATT"] = ">I5_Primer_Nextera_XT_and_Nextera_Enrichment_[N/S/E]517 | >I5_Primer_Nextera_XT_Index_Kit_v2_S517 | >I5_Primer_Nextera_XT_and_Nextera_Enrichment_[N/S/E]517 | >I5_Primer_Nextera_XT_Index_Kit_v2_S517"; + knownAdapters["GATCGGAAGAGCACACGTCTGAACTCCAGTCAC"] = ">Nextera_LMP_Read1_External_Adapter | >Illumina Multiplexing Index Sequencing Primer"; + knownAdapters["GATCGGAAGAGCACACGTCTGAACTCCAGTCACACAGTGATCTCGTATGCCGTCTTCTGCTTG"] = ">TruSeq_Adapter_Index_5 | >TruSeq Adapter, Index 5"; + knownAdapters["GATCGGAAGAGCACACGTCTGAACTCCAGTCACACTGATATATCTCGTATGCCGTCTTCTGCTTG"] = ">TruSeq_Adapter_Index_25"; + knownAdapters["GATCGGAAGAGCACACGTCTGAACTCCAGTCACACTGATATCTCGTATGCCGTCTTCTGCTTG"] = ">TruSeq Adapter, Index 25"; + knownAdapters["GATCGGAAGAGCACACGTCTGAACTCCAGTCACACTTGAATCTCGTATGCCGTCTTCTGCTTG"] = ">TruSeq_Adapter_Index_8 | >TruSeq Adapter, Index 8"; + knownAdapters["GATCGGAAGAGCACACGTCTGAACTCCAGTCACAGTCAACAATCTCGTATGCCGTCTTCTGCTTG"] = ">TruSeq_Adapter_Index_13"; + knownAdapters["GATCGGAAGAGCACACGTCTGAACTCCAGTCACAGTCAACTCTCGTATGCCGTCTTCTGCTTG"] = ">TruSeq Adapter, Index 13"; + knownAdapters["GATCGGAAGAGCACACGTCTGAACTCCAGTCACAGTTCCGTATCTCGTATGCCGTCTTCTGCTTG"] = ">TruSeq_Adapter_Index_14"; + knownAdapters["GATCGGAAGAGCACACGTCTGAACTCCAGTCACAGTTCCGTCTCGTATGCCGTCTTCTGCTTG"] = ">TruSeq Adapter, Index 14"; + knownAdapters["GATCGGAAGAGCACACGTCTGAACTCCAGTCACATCACGATCTCGTATGCCGTCTTCTGCTTG"] = ">TruSeq_Adapter_Index_1_6 | >TruSeq Adapter, Index 1"; + knownAdapters["GATCGGAAGAGCACACGTCTGAACTCCAGTCACATGTCAGAATCTCGTATGCCGTCTTCTGCTTG"] = ">TruSeq_Adapter_Index_15"; + knownAdapters["GATCGGAAGAGCACACGTCTGAACTCCAGTCACATGTCAGTCTCGTATGCCGTCTTCTGCTTG"] = ">TruSeq Adapter, Index 15"; + knownAdapters["GATCGGAAGAGCACACGTCTGAACTCCAGTCACATTCCTTTATCTCGTATGCCGTCTTCTGCTTG"] = ">TruSeq_Adapter_Index_27"; + knownAdapters["GATCGGAAGAGCACACGTCTGAACTCCAGTCACATTCCTTTCTCGTATGCCGTCTTCTGCTTG"] = ">TruSeq Adapter, Index 27"; + knownAdapters["GATCGGAAGAGCACACGTCTGAACTCCAGTCACCAGATCATCTCGTATGCCGTCTTCTGCTTG"] = ">TruSeq_Adapter_Index_7 | >TruSeq Adapter, Index 7"; + knownAdapters["GATCGGAAGAGCACACGTCTGAACTCCAGTCACCCACTCTTCTCGTATGCCGTCTTCTGCTTG"] = ">TruSeq Adapter, Index 23"; + knownAdapters["GATCGGAAGAGCACACGTCTGAACTCCAGTCACCCGTCCCGATCTCGTATGCCGTCTTCTGCTTG"] = ">TruSeq_Adapter_Index_16"; + knownAdapters["GATCGGAAGAGCACACGTCTGAACTCCAGTCACCCGTCCCTCTCGTATGCCGTCTTCTGCTTG"] = ">TruSeq Adapter, Index 16"; + knownAdapters["GATCGGAAGAGCACACGTCTGAACTCCAGTCACCGATGTATCTCGTATGCCGTCTTCTGCTTG"] = ">TruSeq_Adapter_Index_2 | >TruSeq Adapter, Index 2"; + knownAdapters["GATCGGAAGAGCACACGTCTGAACTCCAGTCACCGTACGTAATCTCGTATGCCGTCTTCTGCTTG"] = ">TruSeq_Adapter_Index_22"; + knownAdapters["GATCGGAAGAGCACACGTCTGAACTCCAGTCACCGTACGTTCTCGTATGCCGTCTTCTGCTTG"] = ">TruSeq Adapter, Index 22"; + knownAdapters["GATCGGAAGAGCACACGTCTGAACTCCAGTCACCTTGTAATCTCGTATGCCGTCTTCTGCTTG"] = ">TruSeq_Adapter_Index_12 | >TruSeq Adapter, Index 12"; + knownAdapters["GATCGGAAGAGCACACGTCTGAACTCCAGTCACGAGTGGATATCTCGTATGCCGTCTTCTGCTTG"] = ">TruSeq_Adapter_Index_23"; + knownAdapters["GATCGGAAGAGCACACGTCTGAACTCCAGTCACGATCAGATCTCGTATGCCGTCTTCTGCTTG"] = ">TruSeq_Adapter_Index_9 | >TruSeq Adapter, Index 9"; + knownAdapters["GATCGGAAGAGCACACGTCTGAACTCCAGTCACGCCAATATCTCGTATGCCGTCTTCTGCTTG"] = ">TruSeq_Adapter_Index_6 | >TruSeq Adapter, Index 6"; + knownAdapters["GATCGGAAGAGCACACGTCTGAACTCCAGTCACGGCTACATCTCGTATGCCGTCTTCTGCTTG"] = ">TruSeq_Adapter_Index_11 | >TruSeq Adapter, Index 11"; + knownAdapters["GATCGGAAGAGCACACGTCTGAACTCCAGTCACGTCCGCACATCTCGTATGCCGTCTTCTGCTTG"] = ">TruSeq_Adapter_Index_18_7"; + knownAdapters["GATCGGAAGAGCACACGTCTGAACTCCAGTCACGTCCGCATCTCGTATGCCGTCTTCTGCTTG"] = ">TruSeq Adapter, Index 18"; + knownAdapters["GATCGGAAGAGCACACGTCTGAACTCCAGTCACGTGAAACGATCTCGTATGCCGTCTTCTGCTTG"] = ">TruSeq_Adapter_Index_19"; + knownAdapters["GATCGGAAGAGCACACGTCTGAACTCCAGTCACGTGAAACTCTCGTATGCCGTCTTCTGCTTG"] = ">TruSeq Adapter, Index 19"; + knownAdapters["GATCGGAAGAGCACACGTCTGAACTCCAGTCACGTGGCCTTATCTCGTATGCCGTCTTCTGCTTG"] = ">TruSeq_Adapter_Index_20"; + knownAdapters["GATCGGAAGAGCACACGTCTGAACTCCAGTCACGTGGCCTTCTCGTATGCCGTCTTCTGCTTG"] = ">TruSeq Adapter, Index 20"; + knownAdapters["GATCGGAAGAGCACACGTCTGAACTCCAGTCACGTTTCGGAATCTCGTATGCCGTCTTCTGCTTG"] = ">TruSeq_Adapter_Index_21"; + knownAdapters["GATCGGAAGAGCACACGTCTGAACTCCAGTCACGTTTCGGTCTCGTATGCCGTCTTCTGCTTG"] = ">TruSeq Adapter, Index 21"; + knownAdapters["GATCGGAAGAGCACACGTCTGAACTCCAGTCACTAGCTTATCTCGTATGCCGTCTTCTGCTTG"] = ">TruSeq_Adapter_Index_10 | >TruSeq Adapter, Index 10"; + knownAdapters["GATCGGAAGAGCACACGTCTGAACTCCAGTCACTGACCAATCTCGTATGCCGTCTTCTGCTTG"] = ">TruSeq_Adapter_Index_4 | >TruSeq Adapter, Index 4"; + knownAdapters["GATCGGAAGAGCACACGTCTGAACTCCAGTCACTTAGGCATCTCGTATGCCGTCTTCTGCTTG"] = ">TruSeq_Adapter_Index_3 | >TruSeq Adapter, Index 3"; + knownAdapters["GATCGGAAGAGCGGTTCAGCAGGAATGCCGAG"] = ">Illumina Paired End Adapter 2"; + knownAdapters["GATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT"] = ">Nextera_LMP_Read2_External_Adapter"; + knownAdapters["GATCGGAAGAGCTCGTATGCCGTCTTCTGCTTG"] = ">Illumina Single End Adapter 1"; + knownAdapters["GTCTCGTGGGCTCGGAGATGTGTATAAGAGACAG"] = ">Trans2"; + knownAdapters["GTGACTGGAGTTCAGACGTGTGCTCTTCCGATCT"] = ">PrefixPE/2 | >PE2 | >Illumina Multiplexing PCR Primer 2.01 | >Illumina Multiplexing Read2 Sequencing Primer | >PrefixPE/2 | >PE2"; + knownAdapters["TACACTCTTTCCCTACACGACGCTCTTCCGATCT"] = ">PrefixPE/1 | >PE1 | >PrefixPE/1 | >PE1"; + knownAdapters["TCGGACTGTAGAACTCTGAACGTGTAGATCTCGGTGGTCGCCGTATCATT"] = ">RNA_PCR_Primer_(RP1)_part_#_15013198"; + knownAdapters["TCGTCGGCAGCGTCAGATGTGTATAAGAGACAG"] = ">Trans1"; + knownAdapters["TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACACAGTGATCTCGTATGCCGTCTTCTGCTTG"] = ">RNA_PCR_Primer_Index_5_(RPI5)"; + knownAdapters["TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACACTGATATCTCGTATGCCGTCTTCTGCTTG"] = ">RNA_PCR_Primer_Index_25_(RPI25)"; + knownAdapters["TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACACTTGAATCTCGTATGCCGTCTTCTGCTTG"] = ">RNA_PCR_Primer_Index_8_(RPI8)"; + knownAdapters["TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACAGTCAAATCTCGTATGCCGTCTTCTGCTTG"] = ">RNA_PCR_Primer_Index_13_(RPI13)"; + knownAdapters["TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACAGTTCCATCTCGTATGCCGTCTTCTGCTTG"] = ">RNA_PCR_Primer_Index_14_(RPI14)"; + knownAdapters["TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACATCACGATCTCGTATGCCGTCTTCTGCTTG"] = ">RNA_PCR_Primer_Index_1_(RPI1)_2,9"; + knownAdapters["TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACATGAGCATCTCGTATGCCGTCTTCTGCTTG"] = ">RNA_PCR_Primer_Index_26_(RPI26)"; + knownAdapters["TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACATGTCAATCTCGTATGCCGTCTTCTGCTTG"] = ">RNA_PCR_Primer_Index_15_(RPI15)"; + knownAdapters["TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACATTCCTATCTCGTATGCCGTCTTCTGCTTG"] = ">RNA_PCR_Primer_Index_27_(RPI27)"; + knownAdapters["TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACCAAAAGATCTCGTATGCCGTCTTCTGCTTG"] = ">RNA_PCR_Primer_Index_28_(RPI28)"; + knownAdapters["TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACCAACTAATCTCGTATGCCGTCTTCTGCTTG"] = ">RNA_PCR_Primer_Index_29_(RPI29)"; + knownAdapters["TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACCACCGGATCTCGTATGCCGTCTTCTGCTTG"] = ">RNA_PCR_Primer_Index_30_(RPI30)"; + knownAdapters["TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACCACGATATCTCGTATGCCGTCTTCTGCTTG"] = ">RNA_PCR_Primer_Index_31_(RPI31)"; + knownAdapters["TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACCACTCAATCTCGTATGCCGTCTTCTGCTTG"] = ">RNA_PCR_Primer_Index_32_(RPI32)"; + knownAdapters["TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACCAGATCATCTCGTATGCCGTCTTCTGCTTG"] = ">RNA_PCR_Primer_Index_7_(RPI7)"; + knownAdapters["TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACCAGGCGATCTCGTATGCCGTCTTCTGCTTG"] = ">RNA_PCR_Primer_Index_33_(RPI33)"; + knownAdapters["TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACCATGGCATCTCGTATGCCGTCTTCTGCTTG"] = ">RNA_PCR_Primer_Index_34_(RPI34)"; + knownAdapters["TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACCATTTTATCTCGTATGCCGTCTTCTGCTTG"] = ">RNA_PCR_Primer_Index_35_(RPI35)"; + knownAdapters["TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACCCAACAATCTCGTATGCCGTCTTCTGCTTG"] = ">RNA_PCR_Primer_Index_36_(RPI36)"; + knownAdapters["TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACCCGTCCATCTCGTATGCCGTCTTCTGCTTG"] = ">RNA_PCR_Primer_Index_16_(RPI16)"; + knownAdapters["TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACCGATGTATCTCGTATGCCGTCTTCTGCTTG"] = ">RNA_PCR_Primer_Index_2_(RPI2)"; + knownAdapters["TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACCGGAATATCTCGTATGCCGTCTTCTGCTTG"] = ">RNA_PCR_Primer_Index_37_(RPI37)"; + knownAdapters["TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACCGTACGATCTCGTATGCCGTCTTCTGCTTG"] = ">RNA_PCR_Primer_Index_22_(RPI22)"; + knownAdapters["TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACCTAGCTATCTCGTATGCCGTCTTCTGCTTG"] = ">RNA_PCR_Primer_Index_38_(RPI38)"; + knownAdapters["TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACCTATACATCTCGTATGCCGTCTTCTGCTTG"] = ">RNA_PCR_Primer_Index_39_(RPI39)"; + knownAdapters["TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACCTCAGAATCTCGTATGCCGTCTTCTGCTTG"] = ">RNA_PCR_Primer_Index_40_(RPI40)"; + knownAdapters["TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACCTTGTAATCTCGTATGCCGTCTTCTGCTTG"] = ">RNA_PCR_Primer_Index_12_(RPI12)"; + knownAdapters["TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGACGACATCTCGTATGCCGTCTTCTGCTTG"] = ">RNA_PCR_Primer_Index_41_(RPI41)"; + knownAdapters["TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGAGTGGATCTCGTATGCCGTCTTCTGCTTG"] = ">RNA_PCR_Primer_Index_23_(RPI23)"; + knownAdapters["TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGATCAGATCTCGTATGCCGTCTTCTGCTTG"] = ">RNA_PCR_Primer_Index_9_(RPI9)"; + knownAdapters["TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCCAATATCTCGTATGCCGTCTTCTGCTTG"] = ">RNA_PCR_Primer_Index_6_(RPI6)"; + knownAdapters["TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGGCTACATCTCGTATGCCGTCTTCTGCTTG"] = ">RNA_PCR_Primer_Index_11_(RPI11)"; + knownAdapters["TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGGTAGCATCTCGTATGCCGTCTTCTGCTTG"] = ">RNA_PCR_Primer_Index_24_(RPI24)"; + knownAdapters["TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGTAGAGATCTCGTATGCCGTCTTCTGCTTG"] = ">RNA_PCR_Primer_Index_17_(RPI17)"; + knownAdapters["TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGTCCGCATCTCGTATGCCGTCTTCTGCTTG"] = ">RNA_PCR_Primer_Index_18_(RPI18)"; + knownAdapters["TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGTGAAAATCTCGTATGCCGTCTTCTGCTTG"] = ">RNA_PCR_Primer_Index_19_(RPI19)"; + knownAdapters["TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGTGGCCATCTCGTATGCCGTCTTCTGCTTG"] = ">RNA_PCR_Primer_Index_20_(RPI20)"; + knownAdapters["TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGTTTCGATCTCGTATGCCGTCTTCTGCTTG"] = ">RNA_PCR_Primer_Index_21_(RPI21)"; + knownAdapters["TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACTAATCGATCTCGTATGCCGTCTTCTGCTTG"] = ">RNA_PCR_Primer_Index_42_(RPI42)"; + knownAdapters["TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACTACAGCATCTCGTATGCCGTCTTCTGCTTG"] = ">RNA_PCR_Primer_Index_43_(RPI43)"; + knownAdapters["TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACTAGCTTATCTCGTATGCCGTCTTCTGCTTG"] = ">RNA_PCR_Primer_Index_10_(RPI10)"; + knownAdapters["TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACTATAATATCTCGTATGCCGTCTTCTGCTTG"] = ">RNA_PCR_Primer_Index_44_(RPI44)"; + knownAdapters["TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACTCATTCATCTCGTATGCCGTCTTCTGCTTG"] = ">RNA_PCR_Primer_Index_45_(RPI45)"; + knownAdapters["TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACTCCCGAATCTCGTATGCCGTCTTCTGCTTG"] = ">RNA_PCR_Primer_Index_46_(RPI46)"; + knownAdapters["TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACTCGAAGATCTCGTATGCCGTCTTCTGCTTG"] = ">RNA_PCR_Primer_Index_47_(RPI47)"; + knownAdapters["TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACTCGGCAATCTCGTATGCCGTCTTCTGCTTG"] = ">RNA_PCR_Primer_Index_48_(RPI48)"; + knownAdapters["TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACTGACCAATCTCGTATGCCGTCTTCTGCTTG"] = ">RNA_PCR_Primer_Index_4_(RPI4)"; + knownAdapters["TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACTTAGGCATCTCGTATGCCGTCTTCTGCTTG"] = ">RNA_PCR_Primer_Index_3_(RPI3)"; + knownAdapters["TTTTTTTTTTAATGATACGGCGACCACCGAGATCTACAC"] = ">FlowCell1"; + knownAdapters["TTTTTTTTTTCAAGCAGAAGACGGCATACGA"] = ">FlowCell2"; return knownAdapters; } From 5b79bbd421fcd6d839c8e9c3f4205d46b005c21c Mon Sep 17 00:00:00 2001 From: Shifu Chen Date: Tue, 9 Apr 2019 15:20:23 +0800 Subject: [PATCH 12/19] support multi adapters provided in a FASTA file https://github.com/OpenGene/fastp/issues/58 --- README.md | 1 + src/adaptertrimmer.cpp | 47 +++++++++++++++++++++++++++++++++++++++--- src/adaptertrimmer.h | 3 ++- src/filterresult.cpp | 5 +++-- src/filterresult.h | 2 +- src/main.cpp | 5 +++++ src/options.cpp | 25 ++++++++++++++++++++++ src/options.h | 4 ++++ src/peprocessor.cpp | 10 +++++++-- src/seprocessor.cpp | 10 +++++++-- src/util.h | 17 ++++++++++----- 11 files changed, 113 insertions(+), 16 deletions(-) diff --git a/README.md b/README.md index fbc1d63..efa5b90 100644 --- a/README.md +++ b/README.md @@ -304,6 +304,7 @@ options: -A, --disable_adapter_trimming adapter trimming is enabled by default. If this option is specified, adapter trimming is disabled -a, --adapter_sequence the adapter for read1. For SE data, if not specified, the adapter will be auto-detected. For PE data, this is used if R1/R2 are found not overlapped. (string [=auto]) --adapter_sequence_r2 the adapter for read2 (PE data only). This is used if R1/R2 are found not overlapped. If not specified, it will be the same as (string [=]) + --adapter_fasta specify a FASTA file to trim both read1 and read2 (if PE) by all the sequences in this FASTA file (string [=]) --detect_adapter_for_pe by default, the adapter sequence auto-detection is enabled for SE data only, turn on this option to enable it for PE data. # global trimming options diff --git a/src/adaptertrimmer.cpp b/src/adaptertrimmer.cpp index de51118..a55993c 100644 --- a/src/adaptertrimmer.cpp +++ b/src/adaptertrimmer.cpp @@ -45,8 +45,31 @@ bool AdapterTrimmer::trimByOverlapAnalysis(Read* r1, Read* r2, FilterResult* fr, return false; } -bool AdapterTrimmer::trimBySequence(Read* r, FilterResult* fr, string& adapterseq, bool isR2) { - const int matchReq = 4; +bool AdapterTrimmer::trimByMultiSequences(Read* r, FilterResult* fr, vector& adapterList, bool isR2, bool incTrimmedCounter) { + int matchReq = 4; + if(adapterList.size() > 16) + matchReq = 5; + if(adapterList.size() > 256) + matchReq = 6; + bool trimmed = false; + + string originalSeq = r->mSeq.mStr; + for(int i=0; ilength(), originalSeq.length() - r->length()); + if(fr) + fr->addAdapterTrimmed(adapter, isR2, incTrimmedCounter); + else + cerr << adapter << endl; + } + + return trimmed; +} + +bool AdapterTrimmer::trimBySequence(Read* r, FilterResult* fr, string& adapterseq, bool isR2, int matchReq) { const int allowOneMismatchForEach = 8; int rlen = r->length(); @@ -119,5 +142,23 @@ bool AdapterTrimmer::test() { "///EEEEEEEEEEEEEEEEEEEEEEEEEE////EEEEEEEEEEEEE////E////E"); string adapter = "TTTTCCACGGGGATACTACTG"; bool trimmed = AdapterTrimmer::trimBySequence(&r, NULL, adapter); - return r.mSeq.mStr == "TTTTAACCCCCCCCCCCCCCCCCCCCCCCCCCCCAATTTTAAAA"; + if (r.mSeq.mStr != "TTTTAACCCCCCCCCCCCCCCCCCCCCCCCCCCCAATTTTAAAA") + return false; + + Read read("@name", + "TTTTAACCCCCCCCCCCCCCCCCCCCCCCCCCCCAATTTTAAAATTTTCCCCGGGGAAATTTCCCGGGAAATTTCCCGGGATCGATCGATCGATCGAATTCC", + "+", + "///EEEEEEEEEEEEEEEEEEEEEEEEEE////EEEEEEEEEEEEE////E////EEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEE"); + vector adapterList; + adapterList.push_back("GCTAGCTAGCTAGCTA"); + adapterList.push_back("AAATTTCCCGGGAAATTTCCCGGG"); + adapterList.push_back("ATCGATCGATCGATCG"); + adapterList.push_back("AATTCCGGAATTCCGG"); + trimmed = AdapterTrimmer::trimByMultiSequences(&read, NULL, adapterList); + if (read.mSeq.mStr != "TTTTAACCCCCCCCCCCCCCCCCCCCCCCCCCCCAATTTTAAAATTTTCCCCGGGG") { + cerr << read.mSeq.mStr << endl; + return false; + } + + return true; } \ No newline at end of file diff --git a/src/adaptertrimmer.h b/src/adaptertrimmer.h index f68481c..c0d2905 100644 --- a/src/adaptertrimmer.h +++ b/src/adaptertrimmer.h @@ -17,7 +17,8 @@ class AdapterTrimmer{ static bool trimByOverlapAnalysis(Read* r1, Read* r2, FilterResult* fr); static bool trimByOverlapAnalysis(Read* r1, Read* r2, FilterResult* fr, OverlapResult ov, int frontTrimmed1 = 0, int frontTrimmed2 = 0); - static bool trimBySequence(Read* r1, FilterResult* fr, string& adapter, bool isR2 = false); + static bool trimBySequence(Read* r1, FilterResult* fr, string& adapter, bool isR2 = false, int matchReq = 4); + static bool trimByMultiSequences(Read* r1, FilterResult* fr, vector& adapterList, bool isR2 = false, bool incTrimmedCounter = true); static bool test(); diff --git a/src/filterresult.cpp b/src/filterresult.cpp index 2f860d9..88dad73 100644 --- a/src/filterresult.cpp +++ b/src/filterresult.cpp @@ -104,10 +104,11 @@ long FilterResult::getCorrectionNum(char from, char to) { return mCorrectionMatrix[f*8 + t]; } -void FilterResult::addAdapterTrimmed(string adapter, bool isR2) { +void FilterResult::addAdapterTrimmed(string adapter, bool isR2, bool incTrimmedCounter ) { if(adapter.empty()) return; - mTrimmedAdapterRead++; + if(incTrimmedCounter) + mTrimmedAdapterRead++; mTrimmedAdapterBases += adapter.length(); if(!isR2) { if(mAdapter1.count(adapter) >0 ) diff --git a/src/filterresult.h b/src/filterresult.h index 9c566f1..a330d23 100644 --- a/src/filterresult.h +++ b/src/filterresult.h @@ -33,7 +33,7 @@ class FilterResult{ static FilterResult* merge(vector& list); void print(); // for single end - void addAdapterTrimmed(string adapter, bool isR2 = false); + void addAdapterTrimmed(string adapter, bool isR2 = false, bool incTrimmedCounter = true); // for paired end void addAdapterTrimmed(string adapter1, string adapter2); // a part of JSON report diff --git a/src/main.cpp b/src/main.cpp index 72fca96..1d33006 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -54,6 +54,7 @@ int main(int argc, char* argv[]){ cmd.add("disable_adapter_trimming", 'A', "adapter trimming is enabled by default. If this option is specified, adapter trimming is disabled"); cmd.add("adapter_sequence", 'a', "the adapter for read1. For SE data, if not specified, the adapter will be auto-detected. For PE data, this is used if R1/R2 are found not overlapped.", false, "auto"); cmd.add("adapter_sequence_r2", 0, "the adapter for read2 (PE data only). This is used if R1/R2 are found not overlapped. If not specified, it will be the same as ", false, "auto"); + cmd.add("adapter_fasta", 0, "specify a FASTA file to trim both read1 and read2 (if PE) by all the sequences in this FASTA file", false, ""); cmd.add("detect_adapter_for_pe", 0, "by default, the auto-detection for adapter is for SE data input only, turn on this option to enable it for PE data."); // trimming @@ -185,9 +186,13 @@ int main(int argc, char* argv[]){ opt.adapter.detectAdapterForPE = cmd.exist("detect_adapter_for_pe"); opt.adapter.sequence = cmd.get("adapter_sequence"); opt.adapter.sequenceR2 = cmd.get("adapter_sequence_r2"); + opt.adapter.fastaFile = cmd.get("adapter_fasta"); if(opt.adapter.sequenceR2=="auto" && !opt.adapter.detectAdapterForPE && opt.adapter.sequence != "auto") { opt.adapter.sequenceR2 = opt.adapter.sequence; } + if(!opt.adapter.fastaFile.empty()) { + opt.loadFastaAdapters(); + } // trimming opt.trim.front1 = cmd.get("trim_front1"); diff --git a/src/options.cpp b/src/options.cpp index 5c88b1d..81d9afa 100644 --- a/src/options.cpp +++ b/src/options.cpp @@ -3,6 +3,7 @@ #include #include #include +#include "fastareader.h" Options::Options(){ in1 = ""; @@ -41,6 +42,30 @@ bool Options::adapterCuttingEnabled() { return false; } +void Options::loadFastaAdapters() { + if(adapter.fastaFile.empty()) { + adapter.hasFasta = false; + return; + } + + check_file_valid(adapter.fastaFile); + + FastaReader reader(adapter.fastaFile); + reader.readAll(); + + map contigs = reader.contigs(); + map::iterator iter; + for(iter = contigs.begin(); iter != contigs.end(); iter++) { + adapter.seqsInFasta.push_back(iter->second); + } + + if(adapter.seqsInFasta.size() > 0) { + adapter.hasFasta = true; + } else + adapter.hasFasta = false; + +} + bool Options::validate() { if(in1.empty()) { if(!in2.empty()) diff --git a/src/options.h b/src/options.h index 35d64a5..57e6240 100644 --- a/src/options.h +++ b/src/options.h @@ -204,8 +204,11 @@ class AdapterOptions { string sequenceR2; string detectedAdapter1; string detectedAdapter2; + vector seqsInFasta; + string fastaFile; bool hasSeqR1; bool hasSeqR2; + bool hasFasta; bool detectAdapterForPE; }; @@ -282,6 +285,7 @@ class Options{ void initIndexFiltering(string blacklistFile1, string blacklistFile2, int threshold = 0); vector makeListFromFileByLine(string filename); bool shallDetectAdapter(bool isR2 = false); + void loadFastaAdapters(); public: // file name of read1 input diff --git a/src/peprocessor.cpp b/src/peprocessor.cpp index 86f5cc4..e9320c1 100644 --- a/src/peprocessor.cpp +++ b/src/peprocessor.cpp @@ -365,11 +365,17 @@ bool PairEndProcessor::processPairEnd(ReadPairPack* pack, ThreadConfig* config){ } if(mOptions->adapter.enabled) { bool trimmed = AdapterTrimmer::trimByOverlapAnalysis(r1, r2, config->getFilterResult(), ov, frontTrimmed1, frontTrimmed2); + bool trimmed1 = trimmed; + bool trimmed2 = trimmed; if(!trimmed){ if(mOptions->adapter.hasSeqR1) - AdapterTrimmer::trimBySequence(r1, config->getFilterResult(), mOptions->adapter.sequence, false); + trimmed1 = AdapterTrimmer::trimBySequence(r1, config->getFilterResult(), mOptions->adapter.sequence, false); if(mOptions->adapter.hasSeqR2) - AdapterTrimmer::trimBySequence(r2, config->getFilterResult(), mOptions->adapter.sequenceR2, true); + trimmed2 = AdapterTrimmer::trimBySequence(r2, config->getFilterResult(), mOptions->adapter.sequenceR2, true); + } + if(mOptions->adapter.hasFasta) { + AdapterTrimmer::trimByMultiSequences(r1, config->getFilterResult(), mOptions->adapter.seqsInFasta, false, !trimmed1); + AdapterTrimmer::trimByMultiSequences(r2, config->getFilterResult(), mOptions->adapter.seqsInFasta, true, !trimmed2); } } } diff --git a/src/seprocessor.cpp b/src/seprocessor.cpp index a9a6f0b..c8ee1cd 100644 --- a/src/seprocessor.cpp +++ b/src/seprocessor.cpp @@ -225,8 +225,14 @@ bool SingleEndProcessor::processSingleEnd(ReadPack* pack, ThreadConfig* config){ PolyX::trimPolyG(r1, config->getFilterResult(), mOptions->polyGTrim.minLen); } - if(r1 != NULL && mOptions->adapter.enabled && mOptions->adapter.hasSeqR1){ - AdapterTrimmer::trimBySequence(r1, config->getFilterResult(), mOptions->adapter.sequence); + if(r1 != NULL && mOptions->adapter.enabled){ + bool trimmed = false; + if(mOptions->adapter.hasSeqR1) + trimmed = AdapterTrimmer::trimBySequence(r1, config->getFilterResult(), mOptions->adapter.sequence, false); + bool incTrimmedCounter = !trimmed; + if(mOptions->adapter.hasFasta) { + AdapterTrimmer::trimByMultiSequences(r1, config->getFilterResult(), mOptions->adapter.seqsInFasta, false, incTrimmedCounter); + } } if(r1 != NULL) { diff --git a/src/util.h b/src/util.h index fb46c2a..abd7e7b 100644 --- a/src/util.h +++ b/src/util.h @@ -213,15 +213,22 @@ inline string str_keep_alpha(const string& s) // Remove invalid sequence characters from a string -inline string str_keep_valid_sequence(const string& s) +inline void str_keep_valid_sequence( string& s, bool forceUpperCase = false) { - string new_str; + size_t total = 0; + const char case_gap = 'a' - 'A'; for( size_t it =0; it < s.size(); it++) { - if( isalpha(s[it]) || s[it] == '-' || s[it] == '*' ) { - new_str += s[it]; + char c = s[it]; + if(forceUpperCase && c>='a' && c<='z') { + c -= case_gap; + } + if( isalpha(c) || c == '-' || c == '*' ) { + s[total] = c; + total ++; } } - return new_str; + + s.resize(total); } inline int find_with_right_pos(const string& str, const string& pattern, int start=0) { From 002b1951fad5fb79f6b885329c6fe919d91ac634 Mon Sep 17 00:00:00 2001 From: Shifu Chen Date: Tue, 9 Apr 2019 16:49:30 +0800 Subject: [PATCH 13/19] add src/fastareader --- src/fastareader.cpp | 121 ++++++++++++++++++++++++++++++++++++++++++++ src/fastareader.h | 66 ++++++++++++++++++++++++ 2 files changed, 187 insertions(+) create mode 100644 src/fastareader.cpp create mode 100644 src/fastareader.h diff --git a/src/fastareader.cpp b/src/fastareader.cpp new file mode 100644 index 0000000..032c697 --- /dev/null +++ b/src/fastareader.cpp @@ -0,0 +1,121 @@ + +#include "fastareader.h" +#include "util.h" +#include + +FastaReader::FastaReader(string faFile, bool forceUpperCase) +{ + // Set locale and disable stdio synchronization to improve iostream performance + // http://www.drdobbs.com/the-standard-librarian-iostreams-and-std/184401305 + // http://stackoverflow.com/questions/5166263/how-to-get-iostream-to-perform-better + setlocale(LC_ALL,"C"); + ios_base::sync_with_stdio(false); + + mFastaFile = faFile; + mForceUpperCase = forceUpperCase; + if (is_directory(mFastaFile)) { + string error_msg = "There is a problem with the provided fasta file: \'"; + error_msg.append(mFastaFile); + error_msg.append("\' is a directory NOT a file...\n"); + throw invalid_argument(error_msg); + } + mFastaFileStream.open( mFastaFile.c_str(),ios::in); + // verify that the file can be read + if (!mFastaFileStream.is_open()) { + string msg = "There is a problem with the provided fasta file: could NOT read "; + msg.append(mFastaFile.c_str()); + msg.append("...\n"); + throw invalid_argument(msg); + } + + char c; + // seek to first contig + while (mFastaFileStream.get(c) && c != '>') { + if (mFastaFileStream.eof()) { + break; + } + } +} + +FastaReader::~FastaReader() +{ + if (mFastaFileStream.is_open()) { + mFastaFileStream.close(); + } +} + +void FastaReader::readNext() +{ + mCurrentID = ""; + mCurrentDescription = ""; + mCurrentSequence = ""; + bool foundHeader = false; + + char c; + stringstream ssSeq; + stringstream ssHeader; + while(true){ + mFastaFileStream.get(c); + if(c == '>' || mFastaFileStream.eof()) + break; + else { + if (foundHeader){ + if(mForceUpperCase && c>='a' && c<='z') { + c -= ('a' - 'A'); + } + ssSeq << c; + } + else + ssHeader << c; + } + + string line = ""; + getline(mFastaFileStream,line,'\n'); + + + if(foundHeader == false) { + ssHeader << line; + foundHeader = true; + } + else { + str_keep_valid_sequence(line, mForceUpperCase); + ssSeq << line; + } + } + mCurrentSequence = ssSeq.str(); + string header = ssHeader.str(); + + int space = header.find(" "); + mCurrentID = header.substr(0, space); +} + +bool FastaReader::hasNext() { + return !mFastaFileStream.eof(); +} + +void FastaReader::readAll() { + while(!mFastaFileStream.eof()){ + readNext(); + mAllContigs[mCurrentID] = mCurrentSequence; + } +} + +bool FastaReader::test(){ + FastaReader reader("testdata/tinyref.fa"); + reader.readAll(); + + string contig1 = "GATCACAGGTCTATCACCCTATTAATTGGTATTTTCGTCTGGGGGGTGTGGAGCCGGAGCACCCTATGTCGCAGT"; + string contig2 = "GTCTGCACAGCCGCTTTCCACACAGAACCCCCCCCTCCCCCCGCTTCTGGCAAACCCCAAAAACAAAGAACCCTA"; + + if(reader.mAllContigs.count("contig1") == 0 || reader.mAllContigs.count("contig2") == 0 ) + return false; + + if(reader.mAllContigs["contig1"] != contig1 || reader.mAllContigs["contig2"] != contig2 ) + return false; + + return true; + +} + + + diff --git a/src/fastareader.h b/src/fastareader.h new file mode 100644 index 0000000..97ac496 --- /dev/null +++ b/src/fastareader.h @@ -0,0 +1,66 @@ +#ifndef FASTA_READER_H +#define FASTA_READER_H + +// includes +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace std; + +class FastaReader +{ +public: + FastaReader(string fastaFile, bool forceUpperCase = true); + ~FastaReader(); + bool hasNext(); + void readNext(); + void readAll(); + + inline string currentID() + { + return mCurrentID; + } + + inline string currentDescription() + { + return mCurrentDescription; + } + + inline string currentSequence() + { + return mCurrentSequence; + } + + inline map& contigs() { + return mAllContigs; + } + + static bool test(); + + +public: + string mCurrentSequence; + string mCurrentID ; + string mCurrentDescription; + map mAllContigs; + +private: + bool readLine(); + bool endOfLine(char c); + void setFastaSequenceIdDescription(); + +private: + string mFastaFile; + ifstream mFastaFileStream; + bool mForceUpperCase; +}; + + +#endif + From 1bccf9030d29acf496348ed50c0b4a856a3981f6 Mon Sep 17 00:00:00 2001 From: Shifu Chen Date: Tue, 9 Apr 2019 17:28:16 +0800 Subject: [PATCH 14/19] Update README.md --- README.md | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index efa5b90..56e4353 100644 --- a/README.md +++ b/README.md @@ -30,7 +30,7 @@ A tool designed to provide fast all-in-one preprocessing for FastQ files. This t 1. filter out bad reads (too low quality, too short, or too many N...) 2. cut low quality bases for per read in its 5' and 3' by evaluating the mean quality from a sliding window (like Trimmomatic but faster). 3. trim all reads in front and tail -4. cut adapters. Adapter sequences can be automatically detected,which means you don't have to input the adapter sequences to trim them. +4. cut adapters. Adapter sequences can be automatically detected, which means you don't have to input the adapter sequences to trim them. 5. correct mismatched base pairs in overlapped regions of paired end reads, if one base is with high quality while the other is with ultra low quality 6. trim polyG in 3' ends, which is commonly seen in NovaSeq/NextSeq data. Trim polyX in 3' ends to remove unwanted polyX tailing (i.e. polyA tailing for mRNA-Seq data) 7. preprocess unique molecular identifier (UMI) enabled data, shift UMI to sequence name. @@ -152,6 +152,20 @@ Adapter trimming is enabled by default, but you can disable it by `-A` or `--dis * The most widely used adapter is the Illumina TruSeq adapters. If your data is from the TruSeq library, you can add `--adapter_sequence=AGATCGGAAGAGCACACGTCTGAACTCCAGTCA --adapter_sequence_r2=AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT` to your command lines, or enable auto detection for PE data by specifing `detect_adapter_for_pe`. * `fastp` contains some built-in known adapter sequences for better auto-detection. If you want to make some adapters to be a part of the built-in adapters, please file an issue. +You can also specify `--adapter_fasta` to give a FASTA file to tell `fastp` to trim multiple adapters in this FASTA file. Here is a sample of such adapter FASTA file: +``` +>Illumina TruSeq Adapter Read 1 +AGATCGGAAGAGCACACGTCTGAACTCCAGTCA +>Illumina TruSeq Adapter Read 2 +AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT +>polyA +AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA +``` + +The adapter sequence in this file should be at least 10bp long. And you can give whatever you want to trim, rather than regular sequencing adapters (i.e. polyA). + +`fastp` first trims the auto-detected adapter or the adapter sequences given by `--adapter_sequence | --adapter_sequence_r2`, then trims the adapters given by `--adapter_fasta` one by one. + The sequence distribution of trimmed adapters can be found at the HTML/JSON reports. # per read cutting by quality score From eb3c9628e2f4f2c7a5dba30c7b06ca1d6d146951 Mon Sep 17 00:00:00 2001 From: Shifu Chen Date: Tue, 9 Apr 2019 20:30:19 +0800 Subject: [PATCH 15/19] fix a bug in fastareader for FASTA with space in name --- README.md | 2 +- src/fastareader.cpp | 3 +-- src/options.cpp | 7 ++++++- 3 files changed, 8 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 56e4353..1ba366d 100644 --- a/README.md +++ b/README.md @@ -162,7 +162,7 @@ AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA ``` -The adapter sequence in this file should be at least 10bp long. And you can give whatever you want to trim, rather than regular sequencing adapters (i.e. polyA). +The adapter sequence in this file should be at least 6bp long, otherwise it will be skipped. And you can give whatever you want to trim, rather than regular sequencing adapters (i.e. polyA). `fastp` first trims the auto-detected adapter or the adapter sequences given by `--adapter_sequence | --adapter_sequence_r2`, then trims the adapters given by `--adapter_fasta` one by one. diff --git a/src/fastareader.cpp b/src/fastareader.cpp index 032c697..36795bc 100644 --- a/src/fastareader.cpp +++ b/src/fastareader.cpp @@ -85,8 +85,7 @@ void FastaReader::readNext() mCurrentSequence = ssSeq.str(); string header = ssHeader.str(); - int space = header.find(" "); - mCurrentID = header.substr(0, space); + mCurrentID = header; } bool FastaReader::hasNext() { diff --git a/src/options.cpp b/src/options.cpp index 81d9afa..fc7633f 100644 --- a/src/options.cpp +++ b/src/options.cpp @@ -56,7 +56,12 @@ void Options::loadFastaAdapters() { map contigs = reader.contigs(); map::iterator iter; for(iter = contigs.begin(); iter != contigs.end(); iter++) { - adapter.seqsInFasta.push_back(iter->second); + if(iter->second.length()>=6) { + adapter.seqsInFasta.push_back(iter->second); + } + else { + cerr << "skip too short adapter sequence in " << adapter.fastaFile << " (6bp required): " << iter->second << endl; + } } if(adapter.seqsInFasta.size() > 0) { From a540c613f65a7fc7700d9717eb1c8d081047eeef Mon Sep 17 00:00:00 2001 From: Shifu Chen Date: Sat, 13 Apr 2019 22:14:34 +0800 Subject: [PATCH 16/19] support filter by average quality score --average_qual https://github.com/OpenGene/fastp/issues/152 --- README.md | 5 +++++ src/common.h | 2 +- src/filter.cpp | 5 +++++ src/main.cpp | 2 ++ src/options.cpp | 3 +++ src/options.h | 2 ++ 6 files changed, 18 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 1ba366d..699fce8 100644 --- a/README.md +++ b/README.md @@ -126,6 +126,9 @@ To filter reads by its percentage of unqualified bases, two options should be pr * `-q, --qualified_quality_phred`       the quality value that a base is qualified. Default 15 means phred quality >=Q15 is qualified. * `-u, --unqualified_percent_limit`   how many percents of bases are allowed to be unqualified (0~100). Default 40 means 40% +You can also filter reads by its average quality score +* `-e, --average_qual` if one read's average quality score =Q15 is qualified. (int [=15]) -u, --unqualified_percent_limit how many percents of bases are allowed to be unqualified (0~100). Default 40 means 40% (int [=40]) -n, --n_base_limit if one read's number of N base is >n_base_limit, then this read/pair is discarded. Default is 5 (int [=5]) + -e, --average_qual if one read's average quality score length(); int lowQualNum = 0; int nBaseNum = 0; + int totalQual = 0; // need to recalculate lowQualNum and nBaseNum if the corresponding filters are enabled if(mOptions->qualfilter.enabled || mOptions->lengthFilter.enabled) { @@ -29,6 +30,8 @@ int Filter::passFilter(Read* r) { char base = seqstr[i]; char qual = qualstr[i]; + totalQual += qual - 33; + if(qual < mOptions->qualfilter.qualifiedQual) lowQualNum ++; @@ -40,6 +43,8 @@ int Filter::passFilter(Read* r) { if(mOptions->qualfilter.enabled) { if(lowQualNum > (mOptions->qualfilter.unqualifiedPercentLimit * rlen / 100.0) ) return FAIL_QUALITY; + else if(mOptions->qualfilter.avgQualReq > 0 && (totalQual / rlen)qualfilter.avgQualReq) + return FAIL_QUALITY; else if(nBaseNum > mOptions->qualfilter.nBaseLimit ) return FAIL_N_BASE; } diff --git a/src/main.cpp b/src/main.cpp index 1d33006..85d40f3 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -93,6 +93,7 @@ int main(int argc, char* argv[]){ cmd.add("qualified_quality_phred", 'q', "the quality value that a base is qualified. Default 15 means phred quality >=Q15 is qualified.", false, 15); cmd.add("unqualified_percent_limit", 'u', "how many percents of bases are allowed to be unqualified (0~100). Default 40 means 40%", false, 40); cmd.add("n_base_limit", 'n', "if one read's number of N base is >n_base_limit, then this read/pair is discarded. Default is 5", false, 5); + cmd.add("average_qual", 'e', "if one read's average quality score ("qualified_quality_phred")); opt.qualfilter.unqualifiedPercentLimit = cmd.get("unqualified_percent_limit"); + opt.qualfilter.avgQualReq = cmd.get("average_qual"); opt.qualfilter.nBaseLimit = cmd.get("n_base_limit"); // length filtering diff --git a/src/options.cpp b/src/options.cpp index fc7633f..4fe929c 100644 --- a/src/options.cpp +++ b/src/options.cpp @@ -281,6 +281,9 @@ bool Options::validate() { if(qualfilter.qualifiedQual - 33 < 0 || qualfilter.qualifiedQual - 33 > 93) error_exit("qualitified phred (--qualified_quality_phred) should be 0 ~ 93, suggest 10 ~ 20"); + if(qualfilter.avgQualReq < 0 || qualfilter.avgQualReq > 93) + error_exit("average quality score requirement (--average_qual) should be 0 ~ 93, suggest 20 ~ 30"); + if(qualfilter.unqualifiedPercentLimit < 0 || qualfilter.unqualifiedPercentLimit > 100) error_exit("unqualified percent limit (--unqualified_percent_limit) should be 0 ~ 100, suggest 20 ~ 60"); diff --git a/src/options.h b/src/options.h index 57e6240..4ba2f65 100644 --- a/src/options.h +++ b/src/options.h @@ -255,6 +255,8 @@ class QualityFilteringOptions { int unqualifiedPercentLimit; // if n_base_number > nBaseLimit, then discard this read int nBaseLimit; + // if average qual score < avgQualReq, then discard this read + int avgQualReq; }; class ReadLengthFilteringOptions { From ee6bed1f7b393eca18ca38345d79f25b3e685a5c Mon Sep 17 00:00:00 2001 From: Shifu Chen Date: Tue, 16 Apr 2019 18:03:34 +0800 Subject: [PATCH 17/19] add option overlap_diff_limit_percent --- README.md | 11 ++++++----- src/adaptertrimmer.cpp | 4 ++-- src/adaptertrimmer.h | 2 +- src/basecorrector.cpp | 6 +++--- src/basecorrector.h | 2 +- src/main.cpp | 6 ++++-- src/options.cpp | 4 ++++ src/options.h | 1 + src/overlapanalysis.cpp | 10 ++++++---- src/overlapanalysis.h | 4 ++-- src/peprocessor.cpp | 6 +++--- 11 files changed, 33 insertions(+), 23 deletions(-) diff --git a/README.md b/README.md index 699fce8..92ff2cd 100644 --- a/README.md +++ b/README.md @@ -189,7 +189,7 @@ If you don't set window size and mean quality threshold for these function respe # base correction for PE data `fastp` perform `overlap analysis` for PE data, which try to find an overlap of each pair of reads. If an proper overlap is found, it can correct mismatched base pairs in overlapped regions of paired end reads, if one base is with high quality while the other is with ultra low quality. If a base is corrected, the quality of its paired base will be assigned to it so that they will share the same quality.   -This function is not enabled by default, specify `-c` or `--correction` to enable it. This function is based on overlapping detection, which has adjustable parameters `overlap_len_require (default 30)` and `overlap_diff_limit (default 5)`. +This function is not enabled by default, specify `-c` or `--correction` to enable it. This function is based on overlapping detection, which has adjustable parameters `overlap_len_require (default 30)`, `overlap_diff_limit (default 5)` and `overlap_diff_limit_percent (default 20%)`. # global trimming `fastp` supports global trimming, which means trim all reads in the front or the tail. This function is useful since sometimes you want to drop some cycles of a sequencing run. @@ -292,7 +292,7 @@ In the output file, a tag like `merged_xxx_yyy`will be added to each read name t @NB551106:9:H5Y5GBGX2:1:22306:18653:13119 1:N:0:GATCAG merged_150_15` means that 150bp are from read1, and 15bp are from read2. `fastp` prefers the bases in read1 since they usually have higher quality than read2. -Same as the [base correction feature](#base-correction-for-pe-data), this function is also based on overlapping detection, which has adjustable parameters `overlap_len_require (default 30)` and `overlap_diff_limit (default 5)`. +Same as the [base correction feature](#base-correction-for-pe-data), this function is also based on overlapping detection, which has adjustable parameters `overlap_len_require (default 30)`, `overlap_diff_limit (default 5)` and `overlap_diff_limit_percent (default 20%)`. # all options ```shell @@ -378,9 +378,10 @@ options: # base correction by overlap analysis options -c, --correction enable base correction in overlapped regions (only for PE data), default is disabled - --overlap_len_require the minimum length of the overlapped region for overlap analysis based adapter trimming and correction. 30 by default. (int [=30]) - --overlap_diff_limit the maximum difference of the overlapped region for overlap analysis based adapter trimming and correction. 5 by default. (int [=5]) - + --overlap_len_require the minimum length to detect overlapped region of PE reads. This will affect overlap analysis based PE merge, adapter trimming and correction. 30 by default. (int [=30]) + --overlap_diff_limit the maximum maximum number of mismatched bases to detect overlapped region of PE reads. This will affect overlap analysis based PE merge, adapter trimming and correction. 5 by default. (int [=5]) + --overlap_diff_percent_limit the maximum percentage of mismatched bases to detect overlapped region of PE reads. This will affect overlap analysis based PE merge, adapter trimming and correction. Default 20 means 20%. (int [=20]) + # UMI processing -U, --umi enable unique molecular identifier (UMI) preprocessing --umi_loc specify the location of UMI, can be (index1/index2/read1/read2/per_index/per_read, default is none (string [=]) diff --git a/src/adaptertrimmer.cpp b/src/adaptertrimmer.cpp index a55993c..b771639 100644 --- a/src/adaptertrimmer.cpp +++ b/src/adaptertrimmer.cpp @@ -7,8 +7,8 @@ AdapterTrimmer::AdapterTrimmer(){ AdapterTrimmer::~AdapterTrimmer(){ } -bool AdapterTrimmer::trimByOverlapAnalysis(Read* r1, Read* r2, FilterResult* fr) { - OverlapResult ov = OverlapAnalysis::analyze(r1, r2); +bool AdapterTrimmer::trimByOverlapAnalysis(Read* r1, Read* r2, FilterResult* fr, int diffLimit, int overlapRequire, double diffPercentLimit) { + OverlapResult ov = OverlapAnalysis::analyze(r1, r2, diffLimit, overlapRequire, diffPercentLimit); return trimByOverlapAnalysis(r1, r2, fr, ov); } diff --git a/src/adaptertrimmer.h b/src/adaptertrimmer.h index c0d2905..0ab8904 100644 --- a/src/adaptertrimmer.h +++ b/src/adaptertrimmer.h @@ -15,7 +15,7 @@ class AdapterTrimmer{ AdapterTrimmer(); ~AdapterTrimmer(); - static bool trimByOverlapAnalysis(Read* r1, Read* r2, FilterResult* fr); + static bool trimByOverlapAnalysis(Read* r1, Read* r2, FilterResult* fr, int diffLimit, int overlapRequire, double diffPercentLimit); static bool trimByOverlapAnalysis(Read* r1, Read* r2, FilterResult* fr, OverlapResult ov, int frontTrimmed1 = 0, int frontTrimmed2 = 0); static bool trimBySequence(Read* r1, FilterResult* fr, string& adapter, bool isR2 = false, int matchReq = 4); static bool trimByMultiSequences(Read* r1, FilterResult* fr, vector& adapterList, bool isR2 = false, bool incTrimmedCounter = true); diff --git a/src/basecorrector.cpp b/src/basecorrector.cpp index e70ebf4..baa3919 100644 --- a/src/basecorrector.cpp +++ b/src/basecorrector.cpp @@ -8,8 +8,8 @@ BaseCorrector::BaseCorrector(){ BaseCorrector::~BaseCorrector(){ } -int BaseCorrector::correctByOverlapAnalysis(Read* r1, Read* r2, FilterResult* fr) { - OverlapResult ov = OverlapAnalysis::analyze(r1, r2); +int BaseCorrector::correctByOverlapAnalysis(Read* r1, Read* r2, FilterResult* fr, int diffLimit, int overlapRequire, double diffPercentLimit) { + OverlapResult ov = OverlapAnalysis::analyze(r1, r2, diffLimit, overlapRequire, diffPercentLimit); return correctByOverlapAnalysis(r1, r2, fr, ov); } @@ -92,7 +92,7 @@ bool BaseCorrector::test() { "+", "EEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEE/EEEEEEEEEEEEE"); - correctByOverlapAnalysis(&r1, &r2, NULL); + correctByOverlapAnalysis(&r1, &r2, NULL, 5, 30, 0.2); if(r1.mSeq.mStr != "TTTTAACCCCCCCCCCCCCCCCCCCCCCCCCCCCAATTTTAAAATTTTCCCCGGGG") return false; diff --git a/src/basecorrector.h b/src/basecorrector.h index 344e14e..bda0e5e 100644 --- a/src/basecorrector.h +++ b/src/basecorrector.h @@ -15,7 +15,7 @@ class BaseCorrector{ BaseCorrector(); ~BaseCorrector(); - static int correctByOverlapAnalysis(Read* r1, Read* r2, FilterResult* fr); + static int correctByOverlapAnalysis(Read* r1, Read* r2, FilterResult* fr, int diffLimit, int overlapRequire, double diffPercentLimit); static int correctByOverlapAnalysis(Read* r1, Read* r2, FilterResult* fr, OverlapResult ov); static bool test(); }; diff --git a/src/main.cpp b/src/main.cpp index 85d40f3..4f368b9 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -111,8 +111,9 @@ int main(int argc, char* argv[]){ // base correction in overlapped regions of paired end data cmd.add("correction", 'c', "enable base correction in overlapped regions (only for PE data), default is disabled"); - cmd.add("overlap_len_require", 0, "the minimum length of the overlapped region for overlap analysis based adapter trimming and correction. 30 by default.", false, 30); - cmd.add("overlap_diff_limit", 0, "the maximum difference of the overlapped region for overlap analysis based adapter trimming and correction. 5 by default.", false, 5); + cmd.add("overlap_len_require", 0, "the minimum length to detect overlapped region of PE reads. This will affect overlap analysis based PE merge, adapter trimming and correction. 30 by default.", false, 30); + cmd.add("overlap_diff_limit", 0, "the maximum maximum number of mismatched bases to detect overlapped region of PE reads. This will affect overlap analysis based PE merge, adapter trimming and correction. 5 by default.", false, 5); + cmd.add("overlap_diff_percent_limit", 0, "the maximum percentage of mismatched bases to detect overlapped region of PE reads. This will affect overlap analysis based PE merge, adapter trimming and correction. Default 20 means 20%.", false, 20); // umi cmd.add("umi", 'U', "enable unique molecular identifier (UMI) preprocessing"); @@ -312,6 +313,7 @@ int main(int argc, char* argv[]){ opt.correction.enabled = cmd.exist("correction"); opt.overlapRequire = cmd.get("overlap_len_require"); opt.overlapDiffLimit = cmd.get("overlap_diff_limit"); + opt.overlapDiffPercentLimit = cmd.get("overlap_diff_percent_limit"); // threading opt.thread = cmd.get("thread"); diff --git a/src/options.cpp b/src/options.cpp index 4fe929c..b541134 100644 --- a/src/options.cpp +++ b/src/options.cpp @@ -22,6 +22,7 @@ Options::Options(){ insertSizeMax = 512; overlapRequire = 30; overlapDiffLimit = 5; + overlapDiffPercentLimit = 20; verbose = false; seqLen1 = 151; seqLen2 = 151; @@ -293,6 +294,9 @@ bool Options::validate() { if(lengthFilter.requiredLength < 0 ) error_exit("length requirement (--length_required) should be >0, suggest 15 ~ 100"); + if(overlapDiffPercentLimit < 0 || overlapDiffPercentLimit > 100) + error_exit("the maximum percentage of mismatched bases to detect overlapped region (--overlap_diff_percent_limit) should be 0 ~ 100, suggest 20 ~ 60"); + if(split.enabled ) { if(split.digits < 0 || split.digits > 10) error_exit("you have enabled splitting output to multiple files, the digits number of file name prefix (--split_prefix_digits) should be 0 ~ 10."); diff --git a/src/options.h b/src/options.h index 4ba2f65..10671fa 100644 --- a/src/options.h +++ b/src/options.h @@ -363,6 +363,7 @@ class Options{ // overlap analysis threshold int overlapRequire; int overlapDiffLimit; + int overlapDiffPercentLimit; // output debug information bool verbose; // merge options diff --git a/src/overlapanalysis.cpp b/src/overlapanalysis.cpp index 8b145ce..fa3c819 100644 --- a/src/overlapanalysis.cpp +++ b/src/overlapanalysis.cpp @@ -7,12 +7,12 @@ OverlapAnalysis::OverlapAnalysis(){ OverlapAnalysis::~OverlapAnalysis(){ } -OverlapResult OverlapAnalysis::analyze(Read* r1, Read* r2, int overlapDiffLimit, int overlapRequire) { - return analyze(r1->mSeq, r2->mSeq, overlapDiffLimit, overlapRequire); +OverlapResult OverlapAnalysis::analyze(Read* r1, Read* r2, int overlapDiffLimit, int overlapRequire, double diffPercentLimit) { + return analyze(r1->mSeq, r2->mSeq, overlapDiffLimit, overlapRequire, diffPercentLimit); } // ported from the python code of AfterQC -OverlapResult OverlapAnalysis::analyze(Sequence& r1, Sequence& r2, int overlapDiffLimit, int overlapRequire) { +OverlapResult OverlapAnalysis::analyze(Sequence& r1, Sequence& r2, int diffLimit, int overlapRequire, double diffPercentLimit) { Sequence rcr2 = ~r2; int len1 = r1.length(); int len2 = rcr2.length(); @@ -31,6 +31,7 @@ OverlapResult OverlapAnalysis::analyze(Sequence& r1, Sequence& r2, int overlapDi while (offset < len1-overlapRequire) { // the overlap length of r1 & r2 when r2 is move right for offset overlap_len = min(len1 - offset, len2); + int overlapDiffLimit = min(diffLimit, (int)(overlap_len * diffPercentLimit)); diff = 0; int i = 0; @@ -64,6 +65,7 @@ OverlapResult OverlapAnalysis::analyze(Sequence& r1, Sequence& r2, int overlapDi while (offset > -(len2-overlapRequire)){ // the overlap length of r1 & r2 when r2 is move right for offset overlap_len = min(len1, len2- abs(offset)); + int overlapDiffLimit = min(diffLimit, (int)(overlap_len * diffPercentLimit)); diff = 0; int i = 0; @@ -131,7 +133,7 @@ bool OverlapAnalysis::test(){ string qual1("FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF"); string qual2("#########################################################################################"); - OverlapResult ov = OverlapAnalysis::analyze(r1, r2); + OverlapResult ov = OverlapAnalysis::analyze(r1, r2, 5, 30, 0.2); Read read1("name1", r1, "+", qual1); Read read2("name2", r2, "+", qual2); diff --git a/src/overlapanalysis.h b/src/overlapanalysis.h index a9be767..bb1c50f 100644 --- a/src/overlapanalysis.h +++ b/src/overlapanalysis.h @@ -25,8 +25,8 @@ class OverlapAnalysis{ OverlapAnalysis(); ~OverlapAnalysis(); - static OverlapResult analyze(Sequence& r1, Sequence& r2, int overlapDiffLimit = 5, int overlapRequire=30); - static OverlapResult analyze(Read* r1, Read* r2, int overlapDiffLimit = 5, int overlapRequire=30); + static OverlapResult analyze(Sequence& r1, Sequence& r2, int diffLimit, int overlapRequire, double diffPercentLimit); + static OverlapResult analyze(Read* r1, Read* r2, int diffLimit, int overlapRequire, double diffPercentLimit); static Read* merge(Read* r1, Read* r2, OverlapResult ov); public: diff --git a/src/peprocessor.cpp b/src/peprocessor.cpp index e9320c1..53a008f 100644 --- a/src/peprocessor.cpp +++ b/src/peprocessor.cpp @@ -354,7 +354,7 @@ bool PairEndProcessor::processPairEnd(ReadPairPack* pack, ThreadConfig* config){ } bool isizeEvaluated = false; if(r1 != NULL && r2!=NULL && (mOptions->adapter.enabled || mOptions->correction.enabled)){ - OverlapResult ov = OverlapAnalysis::analyze(r1, r2, mOptions->overlapDiffLimit, mOptions->overlapRequire); + OverlapResult ov = OverlapAnalysis::analyze(r1, r2, mOptions->overlapDiffLimit, mOptions->overlapRequire, mOptions->overlapDiffPercentLimit/100.0); // we only use thread 0 to evaluae ISIZE if(config->getThreadId() == 0) { statInsertSize(r1, r2, ov, frontTrimmed1, frontTrimmed2); @@ -381,7 +381,7 @@ bool PairEndProcessor::processPairEnd(ReadPairPack* pack, ThreadConfig* config){ } if(config->getThreadId() == 0 && !isizeEvaluated && r1 != NULL && r2!=NULL) { - OverlapResult ov = OverlapAnalysis::analyze(r1, r2, mOptions->overlapDiffLimit, mOptions->overlapRequire); + OverlapResult ov = OverlapAnalysis::analyze(r1, r2, mOptions->overlapDiffLimit, mOptions->overlapRequire, mOptions->overlapDiffPercentLimit/100.0); statInsertSize(r1, r2, ov, frontTrimmed1, frontTrimmed2); isizeEvaluated = true; } @@ -402,7 +402,7 @@ bool PairEndProcessor::processPairEnd(ReadPairPack* pack, ThreadConfig* config){ // merging mode bool mergeProcessed = false; if(mOptions->merge.enabled && r1 && r2) { - OverlapResult ov = OverlapAnalysis::analyze(r1, r2, mOptions->overlapDiffLimit, mOptions->overlapRequire); + OverlapResult ov = OverlapAnalysis::analyze(r1, r2, mOptions->overlapDiffLimit, mOptions->overlapRequire, mOptions->overlapDiffPercentLimit/100.0); if(ov.overlapped) { merged = OverlapAnalysis::merge(r1, r2, ov); int result = mFilter->passFilter(merged); From d38e1b38836840f7526e97423c55eb1dc18cd5ca Mon Sep 17 00:00:00 2001 From: Shifu Chen Date: Tue, 16 Apr 2019 18:05:22 +0800 Subject: [PATCH 18/19] fixed a tpyo --- README.md | 2 +- src/main.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 92ff2cd..e6cd895 100644 --- a/README.md +++ b/README.md @@ -379,7 +379,7 @@ options: # base correction by overlap analysis options -c, --correction enable base correction in overlapped regions (only for PE data), default is disabled --overlap_len_require the minimum length to detect overlapped region of PE reads. This will affect overlap analysis based PE merge, adapter trimming and correction. 30 by default. (int [=30]) - --overlap_diff_limit the maximum maximum number of mismatched bases to detect overlapped region of PE reads. This will affect overlap analysis based PE merge, adapter trimming and correction. 5 by default. (int [=5]) + --overlap_diff_limit the maximum number of mismatched bases to detect overlapped region of PE reads. This will affect overlap analysis based PE merge, adapter trimming and correction. 5 by default. (int [=5]) --overlap_diff_percent_limit the maximum percentage of mismatched bases to detect overlapped region of PE reads. This will affect overlap analysis based PE merge, adapter trimming and correction. Default 20 means 20%. (int [=20]) # UMI processing diff --git a/src/main.cpp b/src/main.cpp index 4f368b9..b9e2e2b 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -112,7 +112,7 @@ int main(int argc, char* argv[]){ // base correction in overlapped regions of paired end data cmd.add("correction", 'c', "enable base correction in overlapped regions (only for PE data), default is disabled"); cmd.add("overlap_len_require", 0, "the minimum length to detect overlapped region of PE reads. This will affect overlap analysis based PE merge, adapter trimming and correction. 30 by default.", false, 30); - cmd.add("overlap_diff_limit", 0, "the maximum maximum number of mismatched bases to detect overlapped region of PE reads. This will affect overlap analysis based PE merge, adapter trimming and correction. 5 by default.", false, 5); + cmd.add("overlap_diff_limit", 0, "the maximum number of mismatched bases to detect overlapped region of PE reads. This will affect overlap analysis based PE merge, adapter trimming and correction. 5 by default.", false, 5); cmd.add("overlap_diff_percent_limit", 0, "the maximum percentage of mismatched bases to detect overlapped region of PE reads. This will affect overlap analysis based PE merge, adapter trimming and correction. Default 20 means 20%.", false, 20); // umi From ccdbddc64134ee8005e69d2d99f1c7ed6de887b2 Mon Sep 17 00:00:00 2001 From: Shifu Chen Date: Tue, 16 Apr 2019 18:27:47 +0800 Subject: [PATCH 19/19] revise overlap detection for its limit comparison --- src/overlapanalysis.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/overlapanalysis.cpp b/src/overlapanalysis.cpp index fa3c819..c1d05da 100644 --- a/src/overlapanalysis.cpp +++ b/src/overlapanalysis.cpp @@ -38,12 +38,12 @@ OverlapResult OverlapAnalysis::analyze(Sequence& r1, Sequence& r2, int diffLimit for (i=0; i= overlapDiffLimit && i < complete_compare_require) + if (diff > overlapDiffLimit && i < complete_compare_require) break; } } - if (diff < overlapDiffLimit || (diff >= overlapDiffLimit && i>complete_compare_require)){ + if (diff <= overlapDiffLimit || (diff > overlapDiffLimit && i>complete_compare_require)){ OverlapResult ov; ov.overlapped = true; ov.offset = offset; @@ -72,12 +72,12 @@ OverlapResult OverlapAnalysis::analyze(Sequence& r1, Sequence& r2, int diffLimit for (i=0; i= overlapDiffLimit && i < complete_compare_require) + if (diff > overlapDiffLimit && i < complete_compare_require) break; } } - if (diff < overlapDiffLimit || (diff >= overlapDiffLimit && i>complete_compare_require)){ + if (diff <= overlapDiffLimit || (diff > overlapDiffLimit && i>complete_compare_require)){ OverlapResult ov; ov.overlapped = true; ov.offset = offset; @@ -133,7 +133,7 @@ bool OverlapAnalysis::test(){ string qual1("FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF"); string qual2("#########################################################################################"); - OverlapResult ov = OverlapAnalysis::analyze(r1, r2, 5, 30, 0.2); + OverlapResult ov = OverlapAnalysis::analyze(r1, r2, 2, 30, 0.2); Read read1("name1", r1, "+", qual1); Read read2("name2", r2, "+", qual2);