-
Notifications
You must be signed in to change notification settings - Fork 15
/
Copy pathclassic_index_construction.cpp
154 lines (133 loc) · 5.36 KB
/
classic_index_construction.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
/*******************************************************************************
* tests/classic_index_construction.cpp
*
* Copyright (c) 2018 Florian Gauger
*
* All rights reserved. Published under the MIT License in the LICENSE file.
******************************************************************************/
#include "test_util.hpp"
#include <cobs/query/classic_index/mmap_search_file.hpp>
#include <cobs/util/calc_signature_size.hpp>
#include <cobs/util/file.hpp>
#include <cobs/util/fs.hpp>
#include <gtest/gtest.h>
// Shorthand for cobs::fs, used for all path handling in this file.
namespace fs = cobs::fs;
// Root of the on-disk scratch area used by the tests in this file; the test
// fixture removes this directory before and after every test.
static fs::path base_dir = "data/classic_index_construction";
// Directory the generated input documents are written to.
static fs::path input_dir = base_dir / "input";
// Directory holding the individual classic indices built by the combine test.
static fs::path index_dir = base_dir / "index";
// Output path handed to classic_construct() and classic_combine().
static fs::path index_file = base_dir / "index.cobs_classic";
// Temporary path handed to classic_construct().
static fs::path tmp_path = base_dir / "tmp";
class classic_index_construction : public ::testing::Test
{
protected:
void SetUp() final {
cobs::error_code ec;
fs::remove_all(base_dir, ec);
}
void TearDown() final {
cobs::error_code ec;
fs::remove_all(base_dir, ec);
}
};
// Builds a classic index from generated documents, reads the index file back
// and verifies (a) the header fields and the stored file names and (b) that no
// document has noticeably more set bits than the expected average.
TEST_F(classic_index_construction, deserialization) {
    // parameters shared by the construction and by the expectations below, so
    // both sides cannot drift apart
    const size_t num_documents = 33;
    const unsigned num_hashes = 3;
    const double false_positive_rate = 0.1;

    // generate
    std::string query = cobs::random_sequence(10000, 1);
    auto documents = generate_documents_all(query, num_documents);
    generate_test_case(documents, input_dir.string());

    // get file names, sorted so they can be compared index by index with the
    // names stored in the index header
    std::vector<fs::path> paths;
    std::copy_if(fs::recursive_directory_iterator(input_dir),
                 fs::recursive_directory_iterator(),
                 std::back_inserter(paths),
                 [](const auto& p) {
                     return cobs::file_has_header<cobs::KMerBufferHeader>(p);
                 });
    std::sort(paths.begin(), paths.end());

    // construct classic index
    cobs::ClassicIndexParameters index_params;
    index_params.num_hashes = num_hashes;
    index_params.false_positive_rate = false_positive_rate;
    cobs::classic_construct(
        cobs::DocumentList(input_dir), index_file, tmp_path, index_params);

    // read classic index and check header fields
    std::vector<uint8_t> data;
    cobs::ClassicIndexHeader h;
    h.read_file(index_file, data);
    ASSERT_EQ(h.file_names_.size(), num_documents);
    ASSERT_EQ(h.num_hashes_, num_hashes);
    ASSERT_EQ(h.file_names_.size(), paths.size());
    for (size_t i = 0; i < h.file_names_.size(); i++) {
        ASSERT_EQ(h.file_names_[i], cobs::base_name(paths[i]));
    }

    // check ratio of zeros/ones
    const size_t row_size = h.row_size();
    const size_t num_files = h.file_names_.size();
    // the loops below index data[j * row_size + k]; fail cleanly instead of
    // reading past the end if the buffer is shorter than that
    ASSERT_GE(data.size(), h.signature_size_ * row_size);

    // one counter per document: bit o of byte k in a row is counted for
    // document k * 8 + o
    std::vector<size_t> num_ones(num_files, 0);
    for (size_t j = 0; j < h.signature_size_; j++) {
        const uint8_t* row = data.data() + j * row_size;
        for (size_t k = 0; k < row_size; k++) {
            const uint8_t d = row[k];
            for (size_t o = 0; o < 8; o++) {
                const size_t file_names_index = k * 8 + o;
                // remaining bits of the last byte are padding
                if (file_names_index >= num_files)
                    break;
                num_ones[file_names_index] += (d >> o) & 1u;
            }
        }
    }

    const double set_bit_ratio = cobs::calc_average_set_bit_ratio(
        h.signature_size_, num_hashes, false_positive_rate);
    const double num_ones_average = set_bit_ratio * h.signature_size_;
    // allow 1% slack above the expected average
    for (size_t i = 0; i < num_files; i++) {
        ASSERT_LE(num_ones[i], num_ones_average * 1.01)
            << "too many set bits for document " << h.file_names_[i];
    }
}
// Builds several small classic indices, combines them with classic_combine()
// and verifies that every k-mer of every generated document is reported for
// its own document when querying the combined index.
TEST_F(classic_index_construction, combine) {
    using cobs::pad_index;
    fs::create_directories(index_dir);

    // generate individual sets of documents and construct one index per set
    const size_t num_sets = 10;
    using DocumentSet = std::vector<cobs::KMerBuffer<31> >;
    std::vector<DocumentSet> doc_sets;
    doc_sets.reserve(num_sets);
    for (size_t i = 0; i < num_sets; ++i) {
        const std::string set_index = pad_index(i);

        std::string query = cobs::random_sequence(10000, /* seed */ i + 1);
        auto documents = generate_documents_all(
            query, /* num_documents */ 3, /* num_terms */ 100);
        generate_test_case(
            documents, /* prefix */ "set_" + set_index + "_",
            input_dir / set_index);
        doc_sets.emplace_back(std::move(documents));

        // construct classic index
        cobs::ClassicIndexParameters index_params;
        index_params.num_hashes = 3;
        index_params.false_positive_rate = 0.1;
        cobs::classic_construct(
            cobs::DocumentList(input_dir / set_index),
            index_dir / (set_index + ".cobs_classic"),
            tmp_path, index_params);
    }

    // combine the individual indices; the path of the combined index is
    // received through result_file
    fs::path result_file;
    cobs::classic_combine(
        index_dir, index_file, result_file,
        /* mem_bytes */ 128 * 1024 * 1024, /* num_threads */ 4,
        /* keep_temporary */ false);
    // the search below opens result_file, so an empty path is already a failure
    ASSERT_FALSE(result_file.empty());

    // check result by querying for document terms
    cobs::ClassicSearch s_base(
        std::make_shared<cobs::ClassicIndexMMapSearchFile>(result_file));
    std::vector<cobs::SearchResult> result;
    for (size_t ds = 0; ds < doc_sets.size(); ++ds) {
        DocumentSet& doc_set = doc_sets[ds];
        for (size_t d = 0; d < doc_set.size(); ++d) {
            auto& document = doc_set[d];
            // expected document name; identical for all k-mers of this
            // document, so it is built once per document
            const std::string doc_match =
                "set_" + pad_index(ds) + "_document_" + pad_index(d);
            for (size_t i = 0; i < document.num_kmers(); ++i) {
                std::string kmer = document[i].string();
                LOG0 << kmer;
                s_base.search(kmer, result);

                // the k-mer must be reported with a positive score for the
                // document it was taken from
                bool found = false;
                for (const auto& r : result) {
                    if (r.doc_name == doc_match && r.score > 0) {
                        found = true;
                        break;
                    }
                }
                ASSERT_TRUE(found)
                    << "k-mer " << kmer << " not reported for " << doc_match;
            }
        }
    }
}
/******************************************************************************/