From c4cf486ae036f554288a6b5f850fb342dfc8c946 Mon Sep 17 00:00:00 2001 From: "Brian P. Walenz" Date: Tue, 18 Jun 2013 01:34:59 +0000 Subject: [PATCH] Add option -RL to filter short reads. WILL NOT WORK WITH CGW. --- src/AS_BAT/AS_BAT_Datatypes.H | 2 +- src/AS_BAT/AS_BAT_FragmentInfo.C | 21 +++++++++++++++++---- src/AS_BAT/bogart.C | 10 +++++++++- src/AS_BAT/petey.C | 2 +- src/AS_BAT/rewriteCache.C | 2 +- 5 files changed, 29 insertions(+), 8 deletions(-) diff --git a/src/AS_BAT/AS_BAT_Datatypes.H b/src/AS_BAT/AS_BAT_Datatypes.H index cb97943b1..26bbb3022 100644 --- a/src/AS_BAT/AS_BAT_Datatypes.H +++ b/src/AS_BAT/AS_BAT_Datatypes.H @@ -247,7 +247,7 @@ public: class FragmentInfo { public: - FragmentInfo(gkStore *gkpStore, const char *prefix); + FragmentInfo(gkStore *gkpStore, const char *prefix, uint32 minReadLen); ~FragmentInfo(); uint64 memoryUsage(void) { diff --git a/src/AS_BAT/AS_BAT_FragmentInfo.C b/src/AS_BAT/AS_BAT_FragmentInfo.C index 27343fa72..9263e79d7 100644 --- a/src/AS_BAT/AS_BAT_FragmentInfo.C +++ b/src/AS_BAT/AS_BAT_FragmentInfo.C @@ -27,13 +27,19 @@ const uint64 fiMagicNumber = 0x6f666e4967617266llu; // 'fragInfo' until it g const uint64 fiVersionNumber = 1; -FragmentInfo::FragmentInfo(gkStore *gkpStore, const char *prefix) { +FragmentInfo::FragmentInfo(gkStore *gkpStore, + const char *prefix, + uint32 minReadLen) { if (load(prefix)) return; writeLog("FragmentInfo()-- Loading fragment information\n"); + if (minReadLen > 0) + writeLog("FragmentInfo()-- Reads shorter than "F_U32" bases are forced to be singleton.\n", + minReadLen); + gkStream *fs = new gkStream(gkpStore, 0, 0, GKFRAGMENT_INF); gkFragment fr; @@ -71,11 +77,16 @@ FragmentInfo::FragmentInfo(gkStore *gkpStore, const char *prefix) { } uint32 numDeleted = 0; + uint32 numSkipped = 0; uint32 numLoaded = 0; while(fs->next(&fr)) { if (fr.gkFragment_getIsDeleted()) { numDeleted++; + + } else if (fr.gkFragment_getClearRegionLength() < minReadLen) { + numSkipped++; + } else { uint32 iid = fr.gkFragment_getReadIID(); uint32 lib = fr.gkFragment_getLibraryIID(); @@ -92,8 +103,9 @@ FragmentInfo::FragmentInfo(gkStore *gkpStore, const char *prefix) { numLoaded++; } - if (((numDeleted + numLoaded) % 10000000) == 0) - writeLog("FragmentInfo()-- Loading fragment information deleted:%9d active:%9d\n", numDeleted, numLoaded); + if (((numDeleted + numSkipped + numLoaded) % 10000000) == 0) + writeLog("FragmentInfo()-- Loading fragment information deleted:%9d skipped:%9d active:%9d\n", + numDeleted, numSkipped, numLoaded); } for (uint32 i=0; i<_numLibraries + 1; i++) @@ -121,7 +133,8 @@ FragmentInfo::FragmentInfo(gkStore *gkpStore, const char *prefix) { if (numBroken > 0) writeLog("FragmentInfo()-- WARNING! Removed "F_U32" mate relationships.\n", numBroken); - writeLog("FragmentInfo()-- Loaded %d alive fragments, skipped %d dead fragments.\n", numLoaded, numDeleted); + writeLog("FragmentInfo()-- Loaded %d alive fragments, skipped %d short and %d dead fragments.\n", + numLoaded, numSkipped, numDeleted); delete fs; diff --git a/src/AS_BAT/bogart.C b/src/AS_BAT/bogart.C index b64cbad29..ef757c7ce 100644 --- a/src/AS_BAT/bogart.C +++ b/src/AS_BAT/bogart.C @@ -81,6 +81,8 @@ main (int argc, char * argv []) { bool enableReconstructRepeats = false; bool enablePromoteToSingleton = true; + uint32 minReadLen = 0; + argc = AS_configure(argc, argv); int err = 0; @@ -129,6 +131,9 @@ main (int argc, char * argv []) { } else if (strcmp(argv[arg], "-DP") == 0) { enablePromoteToSingleton = false; + } else if (strcmp(argv[arg], "-RL") == 0) { + minReadLen = atoi(argv[++arg]); + } else if (strcmp(argv[arg], "-threads") == 0) { numThreads = atoi(argv[++arg]); @@ -250,6 +255,8 @@ main (int argc, char * argv []) { fprintf(stderr, " -E Shatter repeats, extend unique unitigs\n"); fprintf(stderr, " -DP When -R or -E, don't promote shattered leftovers to unitigs.\n"); fprintf(stderr, " This WILL cause CGW to fail; diagnostic only.\n"); + fprintf(stderr, " -RL len Force reads below 'len' bases to be singletons.\n"); + fprintf(stderr, " This WILL cause CGW to fail; diagnostic only.\n"); fprintf(stderr, "\n"); fprintf(stderr, " -threads N Use N compute threads during repeat detection. EXPERIMENTAL!\n"); fprintf(stderr, " 0 - use OpenMP default\n"); @@ -257,6 +264,7 @@ main (int argc, char * argv []) { fprintf(stderr, "\n"); fprintf(stderr, "Overlap Selection - an overlap will be considered for use in a unitig if either of\n"); fprintf(stderr, " the following conditions hold:\n"); + fprintf(stderr, "\n"); fprintf(stderr, " When constructing the Best Overlap Graph and Promiscuous Unitigs ('g'raph):\n"); fprintf(stderr, " -eg 0.020 no more than 0.020 fraction (2.0%%) error\n"); fprintf(stderr, " -Eg 2 no more than 2 errors (useful with short reads)\n"); @@ -342,7 +350,7 @@ main (int argc, char * argv []) { setLogFile(output_prefix, NULL); - FI = new FragmentInfo(gkpStore, output_prefix); + FI = new FragmentInfo(gkpStore, output_prefix, minReadLen); // Initialize where we've been to nowhere Unitig::resetFragUnitigMap(FI->numFragments()); diff --git a/src/AS_BAT/petey.C b/src/AS_BAT/petey.C index 3312eb90f..99047397a 100644 --- a/src/AS_BAT/petey.C +++ b/src/AS_BAT/petey.C @@ -221,7 +221,7 @@ main (int argc, char * argv []) { OverlapStore *ovlStoreUniq = AS_OVS_openOverlapStore(ovlStoreUniqPath); OverlapStore *ovlStoreRept = ovlStoreReptPath ? AS_OVS_openOverlapStore(ovlStoreReptPath) : NULL; - FI = new FragmentInfo(gkpStore, output_prefix); + FI = new FragmentInfo(gkpStore, output_prefix, 0); OC = new OverlapCache(ovlStoreUniq, ovlStoreRept, output_prefix, 6.0, 1.5, 0, 0, false, false); OG = new BestOverlapGraph(erate, elimit, output_prefix, false, false); CG = new ChunkGraph(output_prefix); diff --git a/src/AS_BAT/rewriteCache.C b/src/AS_BAT/rewriteCache.C index e2f6b0300..b2383af1e 100644 --- a/src/AS_BAT/rewriteCache.C +++ b/src/AS_BAT/rewriteCache.C @@ -25,7 +25,7 @@ main(int argc, char **argv) { gkStore *gkpStore = new gkStore("salmon.gkpStore", FALSE, FALSE); - FragmentInfo *FI = new FragmentInfo(gkpStore, "salmon"); + FragmentInfo *FI = new FragmentInfo(gkpStore, "salmon", 0); sprintf(name, "salmon.ovlCache");