Skip to content

Commit

Permalink
support outputting overlapped regions to get the cleanest data
Browse files Browse the repository at this point in the history
  • Loading branch information
sfchen committed Jun 19, 2020
1 parent 8f0ecea commit 94c08a1
Show file tree
Hide file tree
Showing 7 changed files with 51 additions and 1 deletion.
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -306,6 +306,7 @@ options:
--unpaired1 for PE input, if read1 passed QC but read2 not, it will be written to unpaired1. Default is to discard it. (string [=])
--unpaired2 for PE input, if read2 passed QC but read1 not, it will be written to unpaired2. If --unpaired2 is same as --unpaired1 (default mode), both unpaired reads will be written to this same file. (string [=])
--failed_out specify the file to store reads that cannot pass the filters. (string [=])
--overlapped_out for each read pair, output the overlapped region if it has no mismatched bases. (string [=])
-m, --merge for paired-end input, merge each pair of reads into a single read if they are overlapped. The merged reads will be written to the file given by --merged_out, the unmerged reads will be written to the files specified by --out1 and --out2. The merging mode is disabled by default.
--merged_out in the merging mode, specify the file name to store merged output, or specify --stdout to stream the merged output (string [=])
--include_unmerged in the merging mode, write the unmerged or unpaired reads to the file specified by --merged_out. Disabled by default.
Expand Down
2 changes: 1 addition & 1 deletion src/common.h
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#ifndef COMMON_H
#define COMMON_H

#define FASTP_VER "0.20.1"
#define FASTP_VER "0.21"

#define _DEBUG false

Expand Down
2 changes: 2 additions & 0 deletions src/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ int main(int argc, char* argv[]){
cmd.add<string>("out2", 'O', "read2 output file name", false, "");
cmd.add<string>("unpaired1", 0, "for PE input, if read1 passed QC but read2 not, it will be written to unpaired1. Default is to discard it.", false, "");
cmd.add<string>("unpaired2", 0, "for PE input, if read2 passed QC but read1 not, it will be written to unpaired2. If --unpaired2 is same as --unpaired1 (default mode), both unpaired reads will be written to this same file.", false, "");
cmd.add<string>("overlapped_out", 0, "for each read pair, output the overlapped region if it has no any mismatched base.", false, "");
cmd.add<string>("failed_out", 0, "specify the file to store reads that cannot pass the filters.", false, "");
cmd.add("merge", 'm', "for paired-end input, merge each pair of reads into a single read if they are overlapped. The merged reads will be written to the file given by --merged_out, the unmerged reads will be written to the files specified by --out1 and --out2. The merging mode is disabled by default.");
cmd.add<string>("merged_out", 0, "in the merging mode, specify the file name to store merged output, or specify --stdout to stream the merged output", false, "");
Expand Down Expand Up @@ -167,6 +168,7 @@ int main(int argc, char* argv[]){
opt.unpaired1 = cmd.get<string>("unpaired1");
opt.unpaired2 = cmd.get<string>("unpaired2");
opt.failedOut = cmd.get<string>("failed_out");
opt.overlappedOut = cmd.get<string>("overlapped_out");
// write to the same file
if(opt.unpaired2.empty())
opt.unpaired2 = opt.unpaired1;
Expand Down
10 changes: 10 additions & 0 deletions src/options.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -196,6 +196,12 @@ bool Options::validate() {
error_exit(out2 + " already exists and you have set to not rewrite output files by --dont_overwrite");
}
}
if(!overlappedOut.empty()) {
//check_file_writable(out2);
if(dontOverwrite && file_exists(overlappedOut)) {
error_exit(overlappedOut + " already exists and you have set to not rewrite output files by --dont_overwrite");
}
}
if(!isPaired()) {
if(!unpaired1.empty()) {
cerr << "Not paired-end mode. Ignoring argument --unpaired1 = " << unpaired1 << endl;
Expand All @@ -205,6 +211,10 @@ bool Options::validate() {
cerr << "Not paired-end mode. Ignoring argument --unpaired2 = " << unpaired2 << endl;
unpaired2 = "";
}
if(!overlappedOut.empty()) {
cerr << "Not paired-end mode. Ignoring argument --overlapped_out = " << overlappedOut << endl;
overlappedOut = "";
}
}
if(split.enabled) {
if(!unpaired1.empty()) {
Expand Down
2 changes: 2 additions & 0 deletions src/options.h
Original file line number Diff line number Diff line change
Expand Up @@ -306,6 +306,8 @@ class Options{
// file name of failed reads output
string failedOut;
// file name of overlapped regions output
string overlappedOut;
// json file
string jsonFile;
// html file
string htmlFile;
Expand Down
34 changes: 34 additions & 0 deletions src/peprocessor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ PairEndProcessor::PairEndProcessor(Options* opt){
mUnpairedRightWriter = NULL;
mMergedWriter = NULL;
mFailedWriter = NULL;
mOverlappedWriter = NULL;

mDuplicate = NULL;
if(mOptions->duplicate.enabled) {
Expand Down Expand Up @@ -62,6 +63,9 @@ void PairEndProcessor::initOutput() {
if(!mOptions->failedOut.empty())
mFailedWriter = new WriterThread(mOptions, mOptions->failedOut);

if(!mOptions->overlappedOut.empty())
mOverlappedWriter = new WriterThread(mOptions, mOptions->overlappedOut);

if(mOptions->out1.empty())
return;

Expand All @@ -87,6 +91,10 @@ void PairEndProcessor::closeOutput() {
delete mFailedWriter;
mFailedWriter = NULL;
}
if(mOverlappedWriter) {
delete mOverlappedWriter;
mOverlappedWriter = NULL;
}
if(mUnpairedLeftWriter) {
delete mUnpairedLeftWriter;
mLeftWriter = NULL;
Expand Down Expand Up @@ -132,6 +140,7 @@ bool PairEndProcessor::process(){
std::thread* unpairedRightWriterThread = NULL;
std::thread* mergedWriterThread = NULL;
std::thread* failedWriterThread = NULL;
std::thread* overlappedWriterThread = NULL;
if(mLeftWriter)
leftWriterThread = new std::thread(std::bind(&PairEndProcessor::writeTask, this, mLeftWriter));
if(mRightWriter)
Expand All @@ -144,6 +153,8 @@ bool PairEndProcessor::process(){
mergedWriterThread = new std::thread(std::bind(&PairEndProcessor::writeTask, this, mMergedWriter));
if(mFailedWriter)
failedWriterThread = new std::thread(std::bind(&PairEndProcessor::writeTask, this, mFailedWriter));
if(mOverlappedWriter)
overlappedWriterThread = new std::thread(std::bind(&PairEndProcessor::writeTask, this, mOverlappedWriter));

producer.join();
for(int t=0; t<mOptions->thread; t++){
Expand All @@ -163,6 +174,8 @@ bool PairEndProcessor::process(){
mergedWriterThread->join();
if(failedWriterThread)
failedWriterThread->join();
if(overlappedWriterThread)
overlappedWriterThread->join();
}

if(mOptions->verbose)
Expand Down Expand Up @@ -285,6 +298,8 @@ bool PairEndProcessor::process(){
delete mergedWriterThread;
if(failedWriterThread)
delete failedWriterThread;
if(overlappedWriterThread)
delete overlappedWriterThread;

if(!mOptions->split.enabled)
closeOutput();
Expand Down Expand Up @@ -312,6 +327,7 @@ bool PairEndProcessor::processPairEnd(ReadPairPack* pack, ThreadConfig* config){
string singleOutput;
string mergedOutput;
string failedOut;
string overlappedOut;
int readPassed = 0;
int mergedCount = 0;
for(int p=0;p<pack->count;p++){
Expand Down Expand Up @@ -385,6 +401,15 @@ bool PairEndProcessor::processPairEnd(ReadPairPack* pack, ThreadConfig* config){
}
}

if(r1 != NULL && r2!=NULL && mOverlappedWriter) {
OverlapResult ov = OverlapAnalysis::analyze(r1, r2, mOptions->overlapDiffLimit, mOptions->overlapRequire, 0);
if(ov.overlapped) {
Read* overlappedRead = new Read(r1->mName, r1->mSeq.mStr.substr(ov.offset, ov.overlap_len), r1->mStrand, r1->mQuality.substr(ov.offset, ov.overlap_len));
overlappedOut += overlappedRead->toString();
delete overlappedRead;
}
}

if(config->getThreadId() == 0 && !isizeEvaluated && r1 != NULL && r2!=NULL) {
OverlapResult ov = OverlapAnalysis::analyze(r1, r2, mOptions->overlapDiffLimit, mOptions->overlapRequire, mOptions->overlapDiffPercentLimit/100.0);
statInsertSize(r1, r2, ov, frontTrimmed1, frontTrimmed2);
Expand Down Expand Up @@ -529,6 +554,13 @@ bool PairEndProcessor::processPairEnd(ReadPairPack* pack, ThreadConfig* config){
mFailedWriter->input(fdata, failedOut.size());
}

if(mOverlappedWriter && !overlappedOut.empty()) {
// write overlapped data
char* odata = new char[overlappedOut.size()];
memcpy(odata, overlappedOut.c_str(), overlappedOut.size());
mOverlappedWriter->input(odata, overlappedOut.size());
}

// normal output by left/right writer thread
if(mRightWriter && mLeftWriter && (!outstr1.empty() || !outstr2.empty())) {
// write PE
Expand Down Expand Up @@ -808,6 +840,8 @@ void PairEndProcessor::consumerTask(ThreadConfig* config)
mMergedWriter->setInputCompleted();
if(mFailedWriter)
mFailedWriter->setInputCompleted();
if(mOverlappedWriter)
mOverlappedWriter->setInputCompleted();
}

if(mOptions->verbose) {
Expand Down
1 change: 1 addition & 0 deletions src/peprocessor.h
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,7 @@ class PairEndProcessor{
WriterThread* mUnpairedRightWriter;
WriterThread* mMergedWriter;
WriterThread* mFailedWriter;
WriterThread* mOverlappedWriter;
Duplicate* mDuplicate;
};

Expand Down

0 comments on commit 94c08a1

Please sign in to comment.