Skip to content

Commit

Permalink
more fixes for 0 row files
Browse files Browse the repository at this point in the history
  • Loading branch information
Damian Eads committed Sep 14, 2016
1 parent ef7e6e3 commit c33c386
Show file tree
Hide file tree
Showing 3 changed files with 31 additions and 8 deletions.
16 changes: 15 additions & 1 deletion src/csv/colbased_loader.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -234,7 +234,7 @@ namespace ParaText {
Returns the number of columns parsed by this loader.
*/
size_t get_num_columns() const {
return column_chunks_.size() == 0 ? 0 : column_chunks_[0].size();
return column_chunks_.size() == 0 ? header_parser_.get_num_columns() : column_chunks_[0].size();
}

/*
Expand Down Expand Up @@ -584,6 +584,20 @@ namespace ParaText {
thread_exception = workers[i]->get_exception();
}
}
#if 0
if (threads.size() == 0) {
column_chunks_.emplace_back();
for (size_t col = 0; col < column_infos_.size(); col++) {
auto fit = forced_semantics_.find(column_infos_[col].name);
if (fit == forced_semantics_.end()) {
column_chunks_.back().push_back(std::make_shared<ColBasedChunk>(column_infos_[col].name, params.max_level_name_length, params.max_levels, Semantics::UNKNOWN));
}
else {
column_chunks_.back().push_back(std::make_shared<ColBasedChunk>(column_infos_[col].name, params.max_level_name_length, params.max_levels, fit->second));
}
}
}
#endif
// We're now outside the parallel region.
if (thread_exception) {
std::rethrow_exception(thread_exception);
Expand Down
17 changes: 13 additions & 4 deletions src/generic/chunker.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -119,8 +119,15 @@ namespace ParaText {
#endif
for (size_t worker_id = 0; worker_id < maximum_chunks_; worker_id++) {
long end_of_chunk = std::min(lastpos_, start_of_chunk + (long)chunk_size);
if (end_of_chunk < start_of_chunk) {
start_of_chunk = lastpos_ + 1;
end_of_chunk = lastpos_ + 1;
start_of_chunk_.push_back(start_of_chunk);
end_of_chunk_.push_back(end_of_chunk);
break;
}
#ifdef PARALOAD_DEBUG
std::cout << "start_of_chunk: " << start_of_chunk << " end_of_chunk: " << end_of_chunk << std::endl;
std::cerr << ">>> start_of_chunk: " << start_of_chunk << " end_of_chunk: " << end_of_chunk << std::endl;
#endif
in_.clear();
in_.seekg(end_of_chunk - 1, std::ios_base::beg);
Expand Down Expand Up @@ -200,12 +207,12 @@ namespace ParaText {
std::vector<std::thread> threads;
std::vector<std::shared_ptr<QuoteNewlineAdjustmentWorker> > workers;
std::exception_ptr thread_exception;
for (size_t worker_id = 0; worker_id < maximum_chunks_; worker_id++) {
for (size_t worker_id = 0; worker_id < start_of_chunk_.size(); worker_id++) {
workers.push_back(std::make_shared<QuoteNewlineAdjustmentWorker>(start_of_chunk_[worker_id],
end_of_chunk_[worker_id]));
threads.emplace_back(&QuoteNewlineAdjustmentWorker::parse, workers.back(), filename_);
}
for (size_t thread_id = 0; thread_id < maximum_chunks_; thread_id++) {
for (size_t thread_id = 0; thread_id < threads.size(); thread_id++) {
threads[thread_id].join();
if (!thread_exception) {
thread_exception = workers[thread_id]->get_exception();
Expand All @@ -218,7 +225,9 @@ namespace ParaText {
size_t quotes_so_far = 0;
size_t cur_wid = 0;
size_t next_wid = 1;
quotes_so_far += workers[cur_wid]->get_num_quotes();
if (cur_wid < workers.size()) {
quotes_so_far += workers[cur_wid]->get_num_quotes();
}
while (cur_wid < workers.size()) {
if (end_of_chunk_[cur_wid] < -1 || start_of_chunk_[cur_wid] < -1) {
start_of_chunk_[cur_wid] = -1;
Expand Down
6 changes: 3 additions & 3 deletions tests/test_paratext.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ def test_basic_3x1x(self):
assert_dictframe_almost_equal(actual, expected)


def test_basic_3x1x(self):
def test_basic_3x0x(self):
filedata = u"""A,B,C
"""
with generate_tempfile(filedata) as fn:
Expand All @@ -102,7 +102,7 @@ def run_case(self, num_rows, num_cats, num_floats, num_ints, num_threads):
assert_dictframe_almost_equal(actual, expected)

def test_mixed_frame(self):
for num_rows in [1, 2, 3, 5, 10, 100, 1000]:
for num_rows in [0, 1, 2, 3, 5, 10, 100, 1000]:
for num_cats in [1, 3, 5]:
for num_floats in [1, 3, 5]:
for num_ints in [0, 1, 5, 10, 50]:
Expand Down Expand Up @@ -131,7 +131,7 @@ def test_hell_frame(self):
for (frame_encoding, out_encoding) in formatting:
for include_null in [False, True]:
for allow_quoted_newlines in [False, True]:
for num_rows in [1,2,3,4,10,100,1000,3000]:
for num_rows in [0, 1,2,3,4,10,100,1000,3000]:
for num_cols in [1,2,3,4,5,10,20,30]:
for num_threads in [1,2,3,4,5,8,10,20,30]:
yield self.do_hell_frame, dos, frame_encoding, out_encoding, include_null, allow_quoted_newlines, num_rows, num_cols, num_threads

0 comments on commit c33c386

Please sign in to comment.