Bug fixes. Use CompactSequence.

TeweiLuo · Oct 2, 2014 · 41cd2b3 · 41cd2b3
1 parent 1e37703
commit 41cd2b3
Show file tree

Hide file tree

Showing 18 changed files with 421 additions and 473 deletions.
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,30 @@
+# Compiled Object files
+*.slo
+*.lo
+*.o
+*.obj
+
+# Compiled Dynamic libraries
+*.so
+*.dylib
+*.dll
+
+# Compiled Static libraries
+*.lai
+*.la
+*.a
+*.lib
+
+# Executables
+*.exe
+*.out
+*.app
+
+# ctags
+*.tags
+*.tags_sorted_by_file
+
+# Other library
+
+# Sublime Text file
+sftp-config.json
diff --git a/ChangeLog.md b/ChangeLog.md
@@ -0,0 +1,14 @@
+### 0.1.1 beta / 2014-10-02
+
+Enhancements:
+
+* Add change log
+* More detailed README for input format
+* Use `CompactSequence` in `UnitigGraphs`
+* Remove unused parallel sorting code
+
+Bug Fixes:
+
+* Fixed wrong computation of `word_per_read` in `cx1_functions.cpp`
+* Fixed crash in `FastxReader` if the file is empty
+* Fixed floating point error in `assembly_algorithms.cpp`
diff --git a/Makefile b/Makefile
@@ -21,7 +21,7 @@
 #
 # Makefile usage
 #
-# make <target> [use_gpu=<0|1>] [sm=<XXX,...>] [abi=<0|1>] [open64=<0|1>] [verbose=<0|1>] [keep=<0|1>]
+# make <target> [use_gpu=<0|1>] [disablempopcnt=<0|1>] [sm=<XXX,...>] [abi=<0|1>] [open64=<0|1>] [verbose=<0|1>] [keep=<0|1>]
 #
 #-------------------------------------------------------------------------------
 
@@ -147,7 +147,10 @@ DEPS =   ./Makefile \
 # CC = /nas1/dhli/gcc/4.8.3/rtf/bin/g++
 CC = g++
 CUDALIBFLAG = -L/usr/local/cuda/lib64/ -lcuda -lcudart
-CFLAGS = -O3 -Wall -funroll-loops -march=native -fomit-frame-pointer -maccumulate-outgoing-args -fprefetch-loop-arrays -lm -static-libgcc -mpopcnt -fopenmp -g -std=c++0x
+CFLAGS = -O3 -Wall -funroll-loops -march=core2 -fomit-frame-pointer -maccumulate-outgoing-args -fprefetch-loop-arrays -lm -static-libgcc -fopenmp -g -std=c++0x
+ifneq ($(disablempopcnt), 1)
+	CFLAGS += -mpopcnt
+endif
 DEPS = Makefile
 BIN_DIR = ./bin/
 
@@ -176,8 +179,8 @@ endif
 $(BIN_DIR)sdbg_builder_cpu: sdbg_builder.cpp .cx1_functions_cpu.o lv2_cpu_sort.h $(DEPS)
 	$(CC) $(CFLAGS) sdbg_builder.cpp .cx1_functions_cpu.o options_description.o -o $(BIN_DIR)sdbg_builder_cpu -D DISABLE_GPU -lz
 
-$(BIN_DIR)assembler: assembler.cpp succinct_dbg.o rank_and_select.o assembly_algorithms.o branch_group.o options_description.o unitig_graph.o $(DEPS)
-	$(CC) $(CFLAGS) assembler.cpp rank_and_select.o succinct_dbg.o assembly_algorithms.o branch_group.o options_description.o unitig_graph.o -o $(BIN_DIR)assembler
+$(BIN_DIR)assembler: assembler.cpp succinct_dbg.o rank_and_select.o assembly_algorithms.o branch_group.o options_description.o unitig_graph.o compact_sequence.o $(DEPS)
+	$(CC) $(CFLAGS) assembler.cpp rank_and_select.o succinct_dbg.o assembly_algorithms.o branch_group.o options_description.o unitig_graph.o compact_sequence.o -o $(BIN_DIR)assembler
 
 iterate_edges_all: $(BIN_DIR)iterate_edges_k61 $(BIN_DIR)iterate_edges_k92 $(BIN_DIR)iterate_edges_k124
 
@@ -190,8 +193,8 @@ $(BIN_DIR)iterate_edges_k92: iterate_edges.cpp iterate_edges.h options_descripti
 $(BIN_DIR)iterate_edges_k124: iterate_edges.cpp iterate_edges.h options_description.o $(DEPS)
 	$(CC) $(CFLAGS) -lz iterate_edges.cpp options_description.o -o $(BIN_DIR)iterate_edges_k124 -D KMER_NUM_UINT64=4
 
-$(BIN_DIR)query_sdbg: query_sdbg.cpp succinct_dbg.o rank_and_select.o assembly_algorithms.o branch_group.o unitig_graph.o $(DEPS)
-	$(CC) $(CFLAGS) query_sdbg.cpp rank_and_select.o succinct_dbg.o assembly_algorithms.o branch_group.o unitig_graph.o -o $(BIN_DIR)query_sdbg
+$(BIN_DIR)query_sdbg: query_sdbg.cpp succinct_dbg.o rank_and_select.o assembly_algorithms.o branch_group.o unitig_graph.o compact_sequence.o $(DEPS)
+	$(CC) $(CFLAGS) query_sdbg.cpp rank_and_select.o succinct_dbg.o assembly_algorithms.o branch_group.o unitig_graph.o compact_sequence.o -o $(BIN_DIR)query_sdbg
 
 $(BIN_DIR)rank_and_select_sample: rank_and_select_sample.cpp rank_and_select.o $(DEPS)
 	$(CC) $(CFLAGS) rank_and_select.o rank_and_select_sample.cpp -o $(BIN_DIR)rank_and_select_sample

diff --git a/README.md b/README.md
@@ -1,44 +1,80 @@
 MEGAHIT
 =========
 
-MEGAHIT is a single node assembler for large and complex metagenomics assembly, such as soil NGS reads. It makes used of succinct *de Bruijn* graph to achieve low memory usage, that can fit the whole assembly graph within a single node. However, its goal is not to make memory usage as low as possible, because it leverages all available memory (assigned by `-m` option) to build succinct *de Bruijn* graphs.
+MEGAHIT is a single node assembler for large and complex metagenomics NGS reads, such as soil. It makes use of a succinct *de Bruijn* graph to achieve low memory usage; however, its goal is not to make memory usage as low as possible. Instead, it leverages all available memory (assigned by the `-m` option) to build succinct *de Bruijn* graphs. CPU-only and GPU-accelerated versions of MEGAHIT are provided. The GPU-accelerated version of MEGAHIT has been tested on NVIDIA GTX680 (4G memory) and Tesla K40c (12G memory).
 
 Quick Start
 ----------------
 Use CPU-only version of MEGAHIT
 
 ```
-make
-python megahit -h # show the helping manual
-python megahit [options] --cpu-only -m <memory_to_use> -l <max_read_len> {-r <reads.fa> | --input_cmd <command>}
+% make
+% python ./megahit [options] --cpu-only -m <memory_to_use> -l <max_read_len> {-r <reads.fa> | --input_cmd <command>}
 ```
 
-Use CUDA version of MEGAHIT, with NVCC version 5.5 or higher.
+Use the GPU-accelerated version of MEGAHIT, with a CUDA-enabled GPU and NVCC version 5.5 or higher.
 
 ```
-make use_gpu=1
-python megahit -h # show the helping manual
-python megahit [options] -m <memory_to_use> -l <max_read_len> {-r <reads.fa> | --input_cmd <command>}
+% make use_gpu=1
+% python ./megahit [options] -m <memory_to_use> -l <max_read_len> {-r <reads.fa> | --input_cmd <command>}
 ```
 
-We recommend to set `-m` as large as possible, but remember to leave some space for your server. For example, for a 64G server, use `-m 60000000000`, which is about 56GB. Typically, 56GB memory is quite enough for human guts samples containing 15-30G base-pairs.
+To show the usage message, type the command
+
+```
+% python ./megahit -h # show the help message
+```
+
+
+Memory control
+----------------
+We recommend setting `-m` as large as possible, but remember to leave some memory for the rest of your server. For example, for a server with 64GB free memory, you may try `-m 60000000000`, which is about 56GB. Typically, 56GB of memory is quite enough for human gut samples containing 15-30G base-pairs.
+
+Input files
+--------------
+
+MEGAHIT accepts **one** fasta or fastq file as input. The input file can be gzip'ed. Alternatively, you can use the option `--input-cmd` to input reads. The argument following `--input-cmd` should be a command that outputs all reads to `STDOUT` in fasta or fastq format. A mix of fasta and fastq is NOT supported. Some correct and wrong examples are given below.
+
+###Correct examples
+* Input from one fastq file named *reads.fastq*:
+```
+-r reads.fastq
+```
+* Input from two fasta files named *sample_1.fa* and *sample_2.fa*:
+```
+--input-cmd "cat sample_[12].fa"
+```
+* Input from all gzip'ed fastq files in current directory:
+```
+--input-cmd "zcat *.fastq.gz"
+```
+* Assuming fastq-dump is installed, input from an SRA file *xxx.sra*:
+```
+--input-cmd "fastq-dump -Z --fasta xxx.sra"
+```
+
+###Wrong examples
+* Mixed fastq and fasta files to the input:
+```
+--input-cmd "cat *.fa *.fq"
+```
 
 Options
 ------------------------
 ###Choose *k*
-MEGAHIT uses iterative *k*-mer strategy. Minimum *k*, maximum *k* and the step for iteration can be set by options `--k-min`, `--k-max` and `--k-step`. *k* must be odd numbers while the step must be an even number.
+MEGAHIT uses a multiple *k*-mer strategy. The minimum *k*, maximum *k* and the step for iteration can be set by the options `--k-min`, `--k-max` and `--k-step` respectively. *k* must be an odd number while the step must be an even number.
 
 ###Filter (*k_min*+1)-mer
-(*k_min*+1)-mer with multiplicity lower than *d* (default 2, assigned by `--min-count` option) will be discarded. You should be cautious to set *d* less than 2. This will lead to a much larger and noisy graph. We recommend use the default value 2.
+(*k_min*+1)-mers with multiplicity lower than *d* (default 2, assigned by the `--min-count` option) will be discarded. You should be cautious about setting *d* less than 2, which will lead to a much larger and noisier graph. We recommend using the default value 2.
 
 ###Mercy *k*-mer
-This is specially designed for metagenomics assembly. You can disuse it by adding `--no-mercy` option.
+This is specially designed for metagenomics assembly. You can disable this strategy by adding the `--no-mercy` option.
 
 License
-------------------------
-
+-----------------------
 ```
-MEGAHIT
+  MEGAHIT
+  
   Copyright (C) 2014 The University of Hong Kong
 
   This program is free software: you can redistribute it and/or modify
@@ -53,4 +89,4 @@ MEGAHIT
 
   You should have received a copy of the GNU General Public License
   along with this program.  If not, see <http://www.gnu.org/licenses/>.
-```
+```
diff --git a/assembly_algorithms.cpp b/assembly_algorithms.cpp
@@ -300,12 +300,17 @@ void PrintStat(long long genome_size) {
     // total length
     int64_t total_length = 0;
     int64_t total_contigs = 0;
+    int64_t average_length = 0;
     for (auto it = histogram.begin(); it != histogram.end(); ++it) {
         total_length += it->first * it->second;
         total_contigs += it->second;
     }
     if (genome_size == 0) { genome_size = total_length; }
 
+    if (total_contigs > 0) {
+        average_length = total_length / total_contigs;
+    }
+
     // N50
     int64_t n50 = -1;
     int64_t acc_length = 0;
@@ -317,7 +322,7 @@ void PrintStat(long long genome_size) {
         }
     }
 
-    printf("Total length: %ld, N50: %ld, Mean: %ld, number of contigs: %ld\n", total_length, n50, total_length / total_contigs, total_contigs);
+    printf("Total length: %ld, N50: %ld, Mean: %ld, number of contigs: %ld\n", total_length, n50, average_length, total_contigs);
     printf("Maximum length: %ld\n", histogram.size() > 0 ? histogram.rbegin()->first : 0);
 }
 

diff --git a/bit_operation.h b/bit_operation.h
@@ -22,6 +22,8 @@
  * @author Yu Peng ([email protected])
  * @version 1.0.0
  * @date 2011-08-02
+ * @modified by Dinghua Li
+ * @date 2014-10-02
  */
 
 #ifndef __BASIC_BIT_OPERATION_H_
@@ -55,11 +57,31 @@ inline void ReverseComplement(uint64_t &value)
     value = ~value;
 }
 
-inline void ReverseComplement(uint8_t &value)
+// inline void ReverseComplement(uint8_t &value)
+// {
+//     value = ((value & kSwap8Mask4) << 4) | ((value & ~kSwap8Mask4) >> 4);
+//     value = ((value & kSwap8Mask2) << 2) | ((value & ~kSwap8Mask2) >> 2);
+//     value = ~value;
+// }
+
+// inline void Reverse(uint8_t &value)
+// {
+//     value = ((value & kSwap8Mask4) << 4) | ((value & ~kSwap8Mask4) >> 4);
+//     value = ((value & kSwap8Mask2) << 2) | ((value & ~kSwap8Mask2) >> 2);
+// }
+
+inline uint8_t ReverseComplement(uint8_t value)
 {
     value = ((value & kSwap8Mask4) << 4) | ((value & ~kSwap8Mask4) >> 4);
     value = ((value & kSwap8Mask2) << 2) | ((value & ~kSwap8Mask2) >> 2);
-    value = ~value;
+    return ~value;
+}
+
+inline uint8_t Reverse(uint8_t value)
+{
+    value = ((value & kSwap8Mask4) << 4) | ((value & ~kSwap8Mask4) >> 4);
+    value = ((value & kSwap8Mask2) << 2) | ((value & ~kSwap8Mask2) >> 2);
+    return value;
 }
 
 inline int BitCount(uint8_t x)