Skip to content

Commit

Permalink
Bug fixes. Use CompactSequence.
Browse files Browse the repository at this point in the history
  • Loading branch information
voutcn committed Oct 2, 2014
1 parent 1e37703 commit 41cd2b3
Show file tree
Hide file tree
Showing 18 changed files with 421 additions and 473 deletions.
30 changes: 30 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# Compiled Object files
*.slo
*.lo
*.o
*.obj

# Compiled Dynamic libraries
*.so
*.dylib
*.dll

# Compiled Static libraries
*.lai
*.la
*.a
*.lib

# Executables
*.exe
*.out
*.app

# ctags
*.tags
*.tags_sorted_by_file

# Other library

# Sublime Text file
sftp-config.json
14 changes: 14 additions & 0 deletions ChangeLog.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
### 0.1.1 beta / 2014-10-02

Enhancements:

* Add change log
* More detailed README for input format
* Use `CompactSequence` in `UnitigGraphs`
* Remove unused parallel sorting code

Bug Fixes:

* Fixed wrong computation of `word_per_read` in `cx1_functions.cpp`
* Fixed crash in `FastxReader` if the file is empty
* Fixed floating point exception (integer division by zero when there are no contigs) in `assembly_algorithms.cpp`
15 changes: 9 additions & 6 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
#
# Makefile usage
#
# make <target> [use_gpu=<0|1>] [sm=<XXX,...>] [abi=<0|1>] [open64=<0|1>] [verbose=<0|1>] [keep=<0|1>]
# make <target> [use_gpu=<0|1>] [disablempopcnt=<0|1>] [sm=<XXX,...>] [abi=<0|1>] [open64=<0|1>] [verbose=<0|1>] [keep=<0|1>]
#
#-------------------------------------------------------------------------------

Expand Down Expand Up @@ -147,7 +147,10 @@ DEPS = ./Makefile \
# CC = /nas1/dhli/gcc/4.8.3/rtf/bin/g++
CC = g++
CUDALIBFLAG = -L/usr/local/cuda/lib64/ -lcuda -lcudart
CFLAGS = -O3 -Wall -funroll-loops -march=native -fomit-frame-pointer -maccumulate-outgoing-args -fprefetch-loop-arrays -lm -static-libgcc -mpopcnt -fopenmp -g -std=c++0x
CFLAGS = -O3 -Wall -funroll-loops -march=core2 -fomit-frame-pointer -maccumulate-outgoing-args -fprefetch-loop-arrays -lm -static-libgcc -fopenmp -g -std=c++0x
ifneq ($(disablempopcnt), 1)
CFLAGS += -mpopcnt
endif
DEPS = Makefile
BIN_DIR = ./bin/

Expand Down Expand Up @@ -176,8 +179,8 @@ endif
$(BIN_DIR)sdbg_builder_cpu: sdbg_builder.cpp .cx1_functions_cpu.o lv2_cpu_sort.h $(DEPS)
$(CC) $(CFLAGS) sdbg_builder.cpp .cx1_functions_cpu.o options_description.o -o $(BIN_DIR)sdbg_builder_cpu -D DISABLE_GPU -lz

$(BIN_DIR)assembler: assembler.cpp succinct_dbg.o rank_and_select.o assembly_algorithms.o branch_group.o options_description.o unitig_graph.o $(DEPS)
$(CC) $(CFLAGS) assembler.cpp rank_and_select.o succinct_dbg.o assembly_algorithms.o branch_group.o options_description.o unitig_graph.o -o $(BIN_DIR)assembler
$(BIN_DIR)assembler: assembler.cpp succinct_dbg.o rank_and_select.o assembly_algorithms.o branch_group.o options_description.o unitig_graph.o compact_sequence.o $(DEPS)
$(CC) $(CFLAGS) assembler.cpp rank_and_select.o succinct_dbg.o assembly_algorithms.o branch_group.o options_description.o unitig_graph.o compact_sequence.o -o $(BIN_DIR)assembler

iterate_edges_all: $(BIN_DIR)iterate_edges_k61 $(BIN_DIR)iterate_edges_k92 $(BIN_DIR)iterate_edges_k124

Expand All @@ -190,8 +193,8 @@ $(BIN_DIR)iterate_edges_k92: iterate_edges.cpp iterate_edges.h options_descripti
$(BIN_DIR)iterate_edges_k124: iterate_edges.cpp iterate_edges.h options_description.o $(DEPS)
$(CC) $(CFLAGS) -lz iterate_edges.cpp options_description.o -o $(BIN_DIR)iterate_edges_k124 -D KMER_NUM_UINT64=4

$(BIN_DIR)query_sdbg: query_sdbg.cpp succinct_dbg.o rank_and_select.o assembly_algorithms.o branch_group.o unitig_graph.o $(DEPS)
$(CC) $(CFLAGS) query_sdbg.cpp rank_and_select.o succinct_dbg.o assembly_algorithms.o branch_group.o unitig_graph.o -o $(BIN_DIR)query_sdbg
$(BIN_DIR)query_sdbg: query_sdbg.cpp succinct_dbg.o rank_and_select.o assembly_algorithms.o branch_group.o unitig_graph.o compact_sequence.o $(DEPS)
$(CC) $(CFLAGS) query_sdbg.cpp rank_and_select.o succinct_dbg.o assembly_algorithms.o branch_group.o unitig_graph.o compact_sequence.o -o $(BIN_DIR)query_sdbg

$(BIN_DIR)rank_and_select_sample: rank_and_select_sample.cpp rank_and_select.o $(DEPS)
$(CC) $(CFLAGS) rank_and_select.o rank_and_select_sample.cpp -o $(BIN_DIR)rank_and_select_sample
Expand Down
68 changes: 52 additions & 16 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,44 +1,80 @@
MEGAHIT
=========

MEGAHIT is a single node assembler for large and complex metagenomics assembly, such as soil NGS reads. It makes used of succinct *de Bruijn* graph to achieve low memory usage, that can fit the whole assembly graph within a single node. However, its goal is not to make memory usage as low as possible, because it leverages all available memory (assigned by `-m` option) to build succinct *de Bruijn* graphs.
MEGAHIT is a single-node assembler for large and complex metagenomics NGS reads, such as soil samples. It makes use of succinct *de Bruijn* graphs to achieve low memory usage; however, its goal is not to make memory usage as low as possible. Instead, it leverages all available memory (assigned by the `-m` option) to build succinct *de Bruijn* graphs. Both CPU-only and GPU-accelerated versions of MEGAHIT are provided. The GPU-accelerated version of MEGAHIT has been tested on NVIDIA GTX680 (4G memory) and Tesla K40c (12G memory).

Quick Start
----------------
Use CPU-only version of MEGAHIT

```
make
python megahit -h # show the helping manual
python megahit [options] --cpu-only -m <memory_to_use> -l <max_read_len> {-r <reads.fa> | --input_cmd <command>}
% make
% python ./megahit [options] --cpu-only -m <memory_to_use> -l <max_read_len> {-r <reads.fa> | --input-cmd <command>}
```

Use CUDA version of MEGAHIT, with NVCC version 5.5 or higher.
Use the GPU-accelerated version of MEGAHIT, which requires a CUDA-enabled GPU and NVCC version 5.5 or higher.

```
make use_gpu=1
python megahit -h # show the helping manual
python megahit [options] -m <memory_to_use> -l <max_read_len> {-r <reads.fa> | --input_cmd <command>}
% make use_gpu=1
% python ./megahit [options] -m <memory_to_use> -l <max_read_len> {-r <reads.fa> | --input-cmd <command>}
```

We recommend to set `-m` as large as possible, but remember to leave some space for your server. For example, for a 64G server, use `-m 60000000000`, which is about 56GB. Typically, 56GB memory is quite enough for human guts samples containing 15-30G base-pairs.
To show the usage message, type the command

```
% python ./megahit -h # show the help message
```


Memory control
----------------
We recommend setting `-m` as large as possible, but remember to leave some memory for the rest of your server. For example, for a server with 64GB of free memory, you may try `-m 60000000000`, which is about 56GB. Typically, 56GB of memory is quite enough for human gut samples containing 15-30G base-pairs.

Input files
--------------

MEGAHIT accepts **one** fasta or fastq file as input. The input file can be gzip'ed. Alternatively, you can use the option `--input-cmd` to input reads. The argument following `--input-cmd` should be a command that outputs all reads to `STDOUT` in fasta or fastq format. A mix of fasta and fastq is NOT supported. Some correct and wrong examples are given below.

###Correct examples
* Input from one fastq file named *reads.fastq*:
```
-r reads.fastq
```
* Input from two fasta files named *sample_1.fa* and *sample_2.fa*:
```
--input-cmd "cat sample_[12].fa"
```
* Input from all gzip'ed fastq files in current directory:
```
--input-cmd "zcat *.fastq.gz"
```
* Assume fastq-dump is installed, input from a sra file *xxx.sra*:
```
--input-cmd "fastq-dump -Z --fasta xxx.sra"
```

###Wrong examples
* Mixed fastq and fasta files to the input:
```
--input-cmd "cat *.fa *.fq"
```

Options
------------------------
###Choose *k*
MEGAHIT uses iterative *k*-mer strategy. Minimum *k*, maximum *k* and the step for iteration can be set by options `--k-min`, `--k-max` and `--k-step`. *k* must be odd numbers while the step must be an even number.
MEGAHIT uses a multiple *k*-mer strategy. The minimum *k*, maximum *k* and the step for iteration can be set by the options `--k-min`, `--k-max` and `--k-step` respectively. Each *k* must be an odd number, while the step must be an even number.

###Filter (*k_min*+1)-mer
(*k_min*+1)-mer with multiplicity lower than *d* (default 2, assigned by `--min-count` option) will be discarded. You should be cautious to set *d* less than 2. This will lead to a much larger and noisy graph. We recommend use the default value 2.
(*k_min*+1)-mers with multiplicity lower than *d* (default 2, assigned by the `--min-count` option) will be discarded. Be cautious about setting *d* to less than 2, which will lead to a much larger and noisier graph. We recommend using the default value 2.

###Mercy *k*-mer
This is specially designed for metagenomics assembly. You can disuse it by adding `--no-mercy` option.
This is specially designed for metagenomics assembly. You can disable this strategy by adding the `--no-mercy` option.

License
------------------------

-----------------------
```
MEGAHIT
MEGAHIT
Copyright (C) 2014 The University of Hong Kong
This program is free software: you can redistribute it and/or modify
Expand All @@ -53,4 +89,4 @@ MEGAHIT
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
```
```
7 changes: 6 additions & 1 deletion assembly_algorithms.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -300,12 +300,17 @@ void PrintStat(long long genome_size) {
// total length
int64_t total_length = 0;
int64_t total_contigs = 0;
int64_t average_length = 0;
for (auto it = histogram.begin(); it != histogram.end(); ++it) {
total_length += it->first * it->second;
total_contigs += it->second;
}
if (genome_size == 0) { genome_size = total_length; }

if (total_contigs > 0) {
average_length = total_length / total_contigs;
}

// N50
int64_t n50 = -1;
int64_t acc_length = 0;
Expand All @@ -317,7 +322,7 @@ void PrintStat(long long genome_size) {
}
}

printf("Total length: %ld, N50: %ld, Mean: %ld, number of contigs: %ld\n", total_length, n50, total_length / total_contigs, total_contigs);
printf("Total length: %ld, N50: %ld, Mean: %ld, number of contigs: %ld\n", total_length, n50, average_length, total_contigs);
printf("Maximum length: %ld\n", histogram.size() > 0 ? histogram.rbegin()->first : 0);
}

Expand Down
26 changes: 24 additions & 2 deletions bit_operation.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@
* @author Yu Peng ([email protected])
* @version 1.0.0
* @date 2011-08-02
* @modified by Dinghua Li
* @date 2014-10-02
*/

#ifndef __BASIC_BIT_OPERATION_H_
Expand Down Expand Up @@ -55,11 +57,31 @@ inline void ReverseComplement(uint64_t &value)
value = ~value;
}

inline void ReverseComplement(uint8_t &value)
// inline void ReverseComplement(uint8_t &value)
// {
// value = ((value & kSwap8Mask4) << 4) | ((value & ~kSwap8Mask4) >> 4);
// value = ((value & kSwap8Mask2) << 2) | ((value & ~kSwap8Mask2) >> 2);
// value = ~value;
// }

// inline void Reverse(uint8_t &value)
// {
// value = ((value & kSwap8Mask4) << 4) | ((value & ~kSwap8Mask4) >> 4);
// value = ((value & kSwap8Mask2) << 2) | ((value & ~kSwap8Mask2) >> 2);
// }

/**
 * Returns the reverse complement of one byte.
 *
 * The byte goes through the same two mask-and-shift swap steps that
 * Reverse(uint8_t) applies, and the result is then bitwise-inverted
 * exactly once.
 *
 * NOTE(review): kSwap8Mask4 / kSwap8Mask2 are defined outside this view;
 * presumably they select the low nibble and the low 2 bits of each nibble,
 * so that the four 2-bit groups of the byte end up in reverse order --
 * confirm against their definitions.
 *
 * @param value byte to transform (passed by value; the caller's copy is
 *              not modified)
 * @return the swapped and bitwise-inverted byte
 */
inline uint8_t ReverseComplement(uint8_t value)
{
    value = ((value & kSwap8Mask4) << 4) | ((value & ~kSwap8Mask4) >> 4);
    value = ((value & kSwap8Mask2) << 2) | ((value & ~kSwap8Mask2) >> 2);
    // Invert only here. An additional `value = ~value;` before this return
    // would cancel the complement and make the function behave like Reverse().
    return ~value;
}

/**
 * Returns the byte after two mask-and-shift swap steps, without inverting
 * any bits.
 *
 * Step 1 exchanges the bits selected by kSwap8Mask4 with the remaining bits
 * using a shift distance of 4; step 2 does the same with kSwap8Mask2 and a
 * shift distance of 2.
 *
 * NOTE(review): the masks are defined outside this view; presumably the net
 * effect is to reverse the order of the four 2-bit groups in the byte --
 * confirm against their definitions.
 *
 * @param value byte to transform (passed by value)
 * @return the swapped byte
 */
inline uint8_t Reverse(uint8_t value)
{
    // Step 1: swap with a shift distance of 4.
    const uint8_t after_swap4 =
        ((value & kSwap8Mask4) << 4) | ((value & ~kSwap8Mask4) >> 4);

    // Step 2: swap with a shift distance of 2.
    const uint8_t after_swap2 =
        ((after_swap4 & kSwap8Mask2) << 2) | ((after_swap4 & ~kSwap8Mask2) >> 2);

    return after_swap2;
}

inline int BitCount(uint8_t x)
Expand Down
Loading

0 comments on commit 41cd2b3

Please sign in to comment.