Skip to content

Commit

Permalink
have canonicalization on by default
Browse files: browse the repository at this point in the history
  • Loading branch information
bingmann committed Nov 6, 2019
1 parent 3d5fced commit 27994a6
Show file tree
Hide file tree
Showing 5 changed files with 16 additions and 17 deletions.
1 change: 0 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,6 @@ To construct a compact COBS index from these seven example documents run
src/cobs compact-construct tests/data/fasta/ example.cobs_compact
```
Check `--help` for many options.
Maybe the most important is `--canonicalize` to enable k-mer DNA canonicalization.

## Query an Index

Expand Down
2 changes: 1 addition & 1 deletion cobs/construction/classic_index.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ struct ClassicIndexParameters {
//! length of terms / k-mers
unsigned term_size = 31;
//! canonicalization flag for base pairs
uint8_t canonicalize = 0;
uint8_t canonicalize = 1;
//! number of hash functions, provided by user
unsigned num_hashes = 1;
//! false positive rate, provided by user
Expand Down
2 changes: 1 addition & 1 deletion cobs/construction/compact_index.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ struct CompactIndexParameters {
//! length of terms / k-mers
unsigned term_size = 31;
//! canonicalization flag for base pairs
uint8_t canonicalize = 0;
uint8_t canonicalize = 1;
//! number of hash functions, provided by user
unsigned num_hashes = 1;
//! false positive rate, provided by user
Expand Down
4 changes: 2 additions & 2 deletions python/module.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -218,7 +218,7 @@ PYBIND11_MODULE(cobs_index, m) {
"length of terms / k-mers, default 31")
.def_readwrite(
"canonicalize", &ClassicIndexParameters::canonicalize,
"canonicalization flag for base pairs, default false")
"canonicalization flag for base pairs, default true")
.def_readwrite(
"num_hashes", &ClassicIndexParameters::num_hashes,
"number of hash functions, provided by user, default 1")
Expand Down Expand Up @@ -299,7 +299,7 @@ Construct a COBS Classic Index from a pre-populated DocumentList object.
"length of terms / k-mers, default 31")
.def_readwrite(
"canonicalize", &CompactIndexParameters::canonicalize,
"canonicalization flag for base pairs, default false")
"canonicalization flag for base pairs, default true")
.def_readwrite(
"num_hashes", &CompactIndexParameters::num_hashes,
"number of hash functions, provided by user, default 1")
Expand Down
24 changes: 12 additions & 12 deletions src/cobs.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -125,10 +125,10 @@ int doc_dump(int argc, char** argv) {
'k', "term-size", term_size,
"term size (k-mer size), default: 31");

bool canonicalize = false;
bool no_canonicalize = false;
cp.add_flag(
'c', "canonicalize", canonicalize,
"canonicalize DNA k-mers, default: false");
"no-canonicalize", no_canonicalize,
"don't canonicalize DNA k-mers, default: false");

std::string file_type = "any";
cp.add_string(
Expand All @@ -149,7 +149,7 @@ int doc_dump(int argc, char** argv) {
filelist[i].process_terms(
term_size,
[&](const cobs::string_view& t) {
if (canonicalize) {
if (!no_canonicalize) {
auto kmer = cobs::canonicalize_kmer(
t.data(), kmer_buffer.data(), term_size);
std::cout << std::string(kmer, term_size) << '\n';
Expand Down Expand Up @@ -208,10 +208,10 @@ int classic_construct(int argc, char** argv) {
"term size (k-mer size), default: "
+ std::to_string(index_params.term_size));

bool canonicalize = false;
bool no_canonicalize = false;
cp.add_flag(
'c', "canonicalize", canonicalize,
"canonicalize DNA k-mers, default: false");
"no-canonicalize", no_canonicalize,
"don't canonicalize DNA k-mers, default: false");

cp.add_flag(
'C', "clobber", index_params.clobber,
Expand Down Expand Up @@ -240,7 +240,7 @@ int classic_construct(int argc, char** argv) {
cp.print_result(std::cerr);

// bool to uint8_t
index_params.canonicalize = canonicalize;
index_params.canonicalize = !no_canonicalize;

// read file list
cobs::DocumentList filelist(input, StringToFileType(file_type));
Expand Down Expand Up @@ -347,10 +347,10 @@ int compact_construct(int argc, char** argv) {
"the page size of the compact the index, "
"default: sqrt(#documents)");

bool canonicalize = false;
bool no_canonicalize = false;
cp.add_flag(
'c', "canonicalize", canonicalize,
"canonicalize DNA k-mers, default: false");
"no-canonicalize", no_canonicalize,
"don't canonicalize DNA k-mers, default: false");

cp.add_flag(
'C', "clobber", index_params.clobber,
Expand Down Expand Up @@ -379,7 +379,7 @@ int compact_construct(int argc, char** argv) {
cp.print_result(std::cerr);

// bool to uint8_t
index_params.canonicalize = canonicalize;
index_params.canonicalize = !no_canonicalize;

// read file list
cobs::DocumentList filelist(input, StringToFileType(file_type));
Expand Down

0 comments on commit 27994a6

Please sign in to comment.