From 94c08a174a8d0acc40b8f85bbab094dd4ed07efa Mon Sep 17 00:00:00 2001 From: Shifu Chen Date: Fri, 19 Jun 2020 13:57:02 +0800 Subject: [PATCH] support outputing overlapped regions to get the cleanest data --- README.md | 1 + src/common.h | 2 +- src/main.cpp | 2 ++ src/options.cpp | 10 ++++++++++ src/options.h | 2 ++ src/peprocessor.cpp | 34 ++++++++++++++++++++++++++++++++++ src/peprocessor.h | 1 + 7 files changed, 51 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 5d17f2c..78f1c5a 100644 --- a/README.md +++ b/README.md @@ -306,6 +306,7 @@ options: --unpaired1 for PE input, if read1 passed QC but read2 not, it will be written to unpaired1. Default is to discard it. (string [=]) --unpaired2 for PE input, if read2 passed QC but read1 not, it will be written to unpaired2. If --unpaired2 is same as --unpaired1 (default mode), both unpaired reads will be written to this same file. (string [=]) --failed_out specify the file to store reads that cannot pass the filters. (string [=]) + --overlapped_out for each read pair, output the overlapped region if it has no any mismatched base. (string [=]) -m, --merge for paired-end input, merge each pair of reads into a single read if they are overlapped. The merged reads will be written to the file given by --merged_out, the unmerged reads will be written to the files specified by --out1 and --out2. The merging mode is disabled by default. --merged_out in the merging mode, specify the file name to store merged output, or specify --stdout to stream the merged output (string [=]) --include_unmerged in the merging mode, write the unmerged or unpaired reads to the file specified by --merge. Disabled by default. diff --git a/src/common.h b/src/common.h index 896ad51..cd95e9b 100644 --- a/src/common.h +++ b/src/common.h @@ -1,7 +1,7 @@ #ifndef COMMON_H #define COMMON_H -#define FASTP_VER "0.20.1" +#define FASTP_VER "0.21" #define _DEBUG false diff --git a/src/main.cpp b/src/main.cpp index e15536c..d07c803 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -37,6 +37,7 @@ int main(int argc, char* argv[]){ cmd.add("out2", 'O', "read2 output file name", false, ""); cmd.add("unpaired1", 0, "for PE input, if read1 passed QC but read2 not, it will be written to unpaired1. Default is to discard it.", false, ""); cmd.add("unpaired2", 0, "for PE input, if read2 passed QC but read1 not, it will be written to unpaired2. If --unpaired2 is same as --unpaired1 (default mode), both unpaired reads will be written to this same file.", false, ""); + cmd.add("overlapped_out", 0, "for each read pair, output the overlapped region if it has no any mismatched base.", false, ""); cmd.add("failed_out", 0, "specify the file to store reads that cannot pass the filters.", false, ""); cmd.add("merge", 'm', "for paired-end input, merge each pair of reads into a single read if they are overlapped. The merged reads will be written to the file given by --merged_out, the unmerged reads will be written to the files specified by --out1 and --out2. The merging mode is disabled by default."); cmd.add("merged_out", 0, "in the merging mode, specify the file name to store merged output, or specify --stdout to stream the merged output", false, ""); @@ -167,6 +168,7 @@ int main(int argc, char* argv[]){ opt.unpaired1 = cmd.get("unpaired1"); opt.unpaired2 = cmd.get("unpaired2"); opt.failedOut = cmd.get("failed_out"); + opt.overlappedOut = cmd.get("overlapped_out"); // write to the same file if(opt.unpaired2.empty()) opt.unpaired2 = opt.unpaired1; diff --git a/src/options.cpp b/src/options.cpp index 4c97438..2c71cb8 100644 --- a/src/options.cpp +++ b/src/options.cpp @@ -196,6 +196,12 @@ bool Options::validate() { error_exit(out2 + " already exists and you have set to not rewrite output files by --dont_overwrite"); } } + if(!overlappedOut.empty()) { + //check_file_writable(out2); + if(dontOverwrite && file_exists(overlappedOut)) { + error_exit(overlappedOut + " already exists and you have set to not rewrite output files by --dont_overwrite"); + } + } if(!isPaired()) { if(!unpaired1.empty()) { cerr << "Not paired-end mode. Ignoring argument --unpaired1 = " << unpaired1 << endl; @@ -205,6 +211,10 @@ bool Options::validate() { cerr << "Not paired-end mode. Ignoring argument --unpaired2 = " << unpaired2 << endl; unpaired2 = ""; } + if(!overlappedOut.empty()) { + cerr << "Not paired-end mode. Ignoring argument --overlapped_out = " << overlappedOut << endl; + overlappedOut = ""; + } } if(split.enabled) { if(!unpaired1.empty()) { diff --git a/src/options.h b/src/options.h index 77f6726..b4f3029 100644 --- a/src/options.h +++ b/src/options.h @@ -306,6 +306,8 @@ class Options{ // file name of failed reads output string failedOut; // json file + string overlappedOut; + // json file string jsonFile; // html file string htmlFile; diff --git a/src/peprocessor.cpp b/src/peprocessor.cpp index 44443ea..a8db0ec 100644 --- a/src/peprocessor.cpp +++ b/src/peprocessor.cpp @@ -32,6 +32,7 @@ PairEndProcessor::PairEndProcessor(Options* opt){ mUnpairedRightWriter = NULL; mMergedWriter = NULL; mFailedWriter = NULL; + mOverlappedWriter = NULL; mDuplicate = NULL; if(mOptions->duplicate.enabled) { @@ -62,6 +63,9 @@ void PairEndProcessor::initOutput() { if(!mOptions->failedOut.empty()) mFailedWriter = new WriterThread(mOptions, mOptions->failedOut); + if(!mOptions->overlappedOut.empty()) + mOverlappedWriter = new WriterThread(mOptions, mOptions->overlappedOut); + if(mOptions->out1.empty()) return; @@ -87,6 +91,10 @@ void PairEndProcessor::closeOutput() { delete mFailedWriter; mFailedWriter = NULL; } + if(mOverlappedWriter) { + delete mOverlappedWriter; + mOverlappedWriter = NULL; + } if(mUnpairedLeftWriter) { delete mUnpairedLeftWriter; mLeftWriter = NULL; @@ -132,6 +140,7 @@ bool PairEndProcessor::process(){ std::thread* unpairedRightWriterThread = NULL; std::thread* mergedWriterThread = NULL; std::thread* failedWriterThread = NULL; + std::thread* overlappedWriterThread = NULL; if(mLeftWriter) leftWriterThread = new std::thread(std::bind(&PairEndProcessor::writeTask, this, mLeftWriter)); if(mRightWriter) @@ -144,6 +153,8 @@ bool PairEndProcessor::process(){ mergedWriterThread = new std::thread(std::bind(&PairEndProcessor::writeTask, this, mMergedWriter)); if(mFailedWriter) failedWriterThread = new std::thread(std::bind(&PairEndProcessor::writeTask, this, mFailedWriter)); + if(mOverlappedWriter) + overlappedWriterThread = new std::thread(std::bind(&PairEndProcessor::writeTask, this, mOverlappedWriter)); producer.join(); for(int t=0; tthread; t++){ @@ -163,6 +174,8 @@ bool PairEndProcessor::process(){ mergedWriterThread->join(); if(failedWriterThread) failedWriterThread->join(); + if(overlappedWriterThread) + overlappedWriterThread->join(); } if(mOptions->verbose) @@ -285,6 +298,8 @@ bool PairEndProcessor::process(){ delete mergedWriterThread; if(failedWriterThread) delete failedWriterThread; + if(overlappedWriterThread) + delete overlappedWriterThread; if(!mOptions->split.enabled) closeOutput(); @@ -312,6 +327,7 @@ bool PairEndProcessor::processPairEnd(ReadPairPack* pack, ThreadConfig* config){ string singleOutput; string mergedOutput; string failedOut; + string overlappedOut; int readPassed = 0; int mergedCount = 0; for(int p=0;pcount;p++){ @@ -385,6 +401,15 @@ bool PairEndProcessor::processPairEnd(ReadPairPack* pack, ThreadConfig* config){ } } + if(r1 != NULL && r2!=NULL && mOverlappedWriter) { + OverlapResult ov = OverlapAnalysis::analyze(r1, r2, mOptions->overlapDiffLimit, mOptions->overlapRequire, 0); + if(ov.overlapped) { + Read* overlappedRead = new Read(r1->mName, r1->mSeq.mStr.substr(ov.offset, ov.overlap_len), r1->mStrand, r1->mQuality.substr(ov.offset, ov.overlap_len)); + overlappedOut += overlappedRead->toString(); + delete overlappedRead; + } + } + if(config->getThreadId() == 0 && !isizeEvaluated && r1 != NULL && r2!=NULL) { OverlapResult ov = OverlapAnalysis::analyze(r1, r2, mOptions->overlapDiffLimit, mOptions->overlapRequire, mOptions->overlapDiffPercentLimit/100.0); statInsertSize(r1, r2, ov, frontTrimmed1, frontTrimmed2); @@ -529,6 +554,13 @@ bool PairEndProcessor::processPairEnd(ReadPairPack* pack, ThreadConfig* config){ mFailedWriter->input(fdata, failedOut.size()); } + if(mOverlappedWriter && !overlappedOut.empty()) { + // write failed data + char* odata = new char[overlappedOut.size()]; + memcpy(odata, overlappedOut.c_str(), overlappedOut.size()); + mOverlappedWriter->input(odata, overlappedOut.size()); + } + // normal output by left/right writer thread if(mRightWriter && mLeftWriter && (!outstr1.empty() || !outstr2.empty())) { // write PE @@ -808,6 +840,8 @@ void PairEndProcessor::consumerTask(ThreadConfig* config) mMergedWriter->setInputCompleted(); if(mFailedWriter) mFailedWriter->setInputCompleted(); + if(mOverlappedWriter) + mOverlappedWriter->setInputCompleted(); } if(mOptions->verbose) { diff --git a/src/peprocessor.h b/src/peprocessor.h index b646989..4d01887 100644 --- a/src/peprocessor.h +++ b/src/peprocessor.h @@ -81,6 +81,7 @@ class PairEndProcessor{ WriterThread* mUnpairedRightWriter; WriterThread* mMergedWriter; WriterThread* mFailedWriter; + WriterThread* mOverlappedWriter; Duplicate* mDuplicate; };