Skip to content

Commit

Permalink
improvements to escape characters
Browse files Browse the repository at this point in the history
  • Loading branch information
Damian Eads committed Sep 15, 2016
1 parent 6d210a9 commit 9a08026
Show file tree
Hide file tree
Showing 5 changed files with 228 additions and 60 deletions.
30 changes: 26 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -224,16 +224,38 @@ Most CSV loading functions in ParaText have the following parameters:
* `block_size`: The number of bytes to read at a time in each worker
thread. The default is unlimited.

Escape Characters
-----------------

ParaText supports backslash escape characters:

* `\t`: tab

* `\n`: newline

* `\r`: carriage return

* `\v`: vertical tab

* `\0`: null terminator (0x00)

* `\b`: backspace

* `\xnn`: an 8-bit character represented as a 2-digit hexadecimal number.

* `\unnnn`: a Unicode code point represented as a 4-digit hexadecimal number.

* `\Unnnnnnnn`: a Unicode code point represented as an 8-digit hexadecimal number.


Other Notes
-----------

ParaText is a work-in-progress. There are a few unimplemented features
that may prevent it from working on all CSV files. We note them below.

1. ParaText does not yet support escape characters or comments.

2. There is no way to supply type hints (e.g. `uint64` or `float`) of a
1. There is no way to supply type hints (e.g. `uint64` or `float`) of a
column. Only the interpretation of a column (numeric, categorical, or
text) can be forced.

3. DateTime support will be added in a future release.
2. DateTime support will be added in a future release.
38 changes: 38 additions & 0 deletions python/paratext/testing.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,32 @@ def generate_hell_frame(num_rows, num_columns, include_null=False, fmt='arbitrar
return pandas.DataFrame(frame)

def save_frame(filename, frame, allow_quoted_newlines=True, out_format='arbitrary', dos=False):
    """
    Saves a dictframe/DataFrame of sequences of the same size to a CSV file.

    The file is opened in binary mode ("wb") and the actual serialization is
    delegated to ``write_frame``. The file is closed even if writing fails.

    Parameters
    ----------
    filename : str, unicode
        The name of the file to write.
    frame : DataFrame, mapping, dict
        This object must be DataFrame-like (ie implement .keys() and __getattr__).
    allow_quoted_newlines : bool
        Whether to allow newlines to be unescaped in a quoted string. If True, if newlines
        are encountered, they will be escaped with two ASCII characters.
    out_format : str
        The encoding to use. Valid options include:
        - `utf-8`: UTF-8 data
        - `arbitrary`: arbitrary bytes (values 0x00-0xFF)
        - `printable_ascii`: values 0x20-0xFF. 0x0A is included if `allow_quoted_newlines`=True
        - `ascii`: values 0x00-0x7F
        If any values are outside of this range, they are backslash-escaped.
    dos : bool
        Whether to add a carriage return before a newline as done in Windows and DOS.
    """
    # NOTE(review): the allow_quoted_newlines text above reads as self-contradictory
    # (escaping presumably happens when the flag is False) -- confirm against write_frame.
    # A context manager guarantees the handle is released if write_frame raises.
    with open(filename, 'wb') as f:
        write_frame(f, frame, allow_quoted_newlines, out_format=out_format, dos=dos)
Expand Down Expand Up @@ -162,6 +188,14 @@ def write_frame(stream, frame, allow_quoted_newlines=True, out_format='arbitrary

@contextmanager
def generate_tempfile(filedata):
"""
A context manager that generates a temporary file object that will be deleted
when the context goes out of scope. The mode of the file is "wb".
Parameters
----------
filedata : The data of the file to write as a bytes object.
"""
f = NamedTemporaryFile(delete=False, mode="wb", prefix="paratext-tests")
f.write(filedata)
name = f.name
Expand All @@ -171,6 +205,10 @@ def generate_tempfile(filedata):

@contextmanager
def generate_tempfilename():
"""
A context manager that generates a temporary filename that will be deleted
when the context goes out of scope.
"""
f = NamedTemporaryFile(delete=False, prefix="paratext-tests")
name = f.name
f.close()
Expand Down
74 changes: 65 additions & 9 deletions src/generic/chunker.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -110,10 +110,17 @@ namespace ParaText {
}

private:
long get_num_trailing_escapes(long start_of_chunk, long end_of_chunk) {
std::pair<long, char> get_num_trailing_escapes(long start_of_chunk, long end_of_chunk) {
long num_trailing_escapes = 0;
long k = end_of_chunk;
for (; k >= start_of_chunk; k++) {
char successor = 0;
if (end_of_chunk < lastpos_) {
in_.clear();
in_.seekg(end_of_chunk + 1, std::ios_base::beg);
in_.read(&successor, 1);
}

for (; k >= start_of_chunk; k--) {
in_.clear();
in_.seekg(k, std::ios_base::beg);
char buf;
Expand All @@ -124,7 +131,7 @@ namespace ParaText {
}
num_trailing_escapes++;
}
return num_trailing_escapes;
return std::make_pair(num_trailing_escapes, successor);
}

void compute_offsets(bool allow_quoted_newlines = true) {
Expand All @@ -149,13 +156,54 @@ namespace ParaText {
if (worker_id == maximum_chunks_ - 1) {
end_of_chunk = lastpos_;
}
long trailing_escapes = get_num_trailing_escapes(start_of_chunk, end_of_chunk);
long trailing_escapes;
char trailing_successor;
std::tie(trailing_escapes, trailing_successor) = get_num_trailing_escapes(start_of_chunk, end_of_chunk);
if (trailing_escapes % 2 == 1) {
if (end_of_chunk == lastpos_) {
throw std::logic_error("file ends with a trailing escape");
long extra = 0;
switch (trailing_successor) {
case 'x': /* \xYY */
extra = 3;
break;
case 'u': /* \uXXXX */
extra = 5;
break;
case 'U': /* \UXXXXXXXX */
extra = 9;
break;
case 'n':
case '0':
case 'r':
case 'v':
case 't':
case 'b':
case '\\':
case '\"':
case '\'':
case '{':
case '}':
case ' ':
case ',':
case ')':
case '(':
extra = 1;
break;
default:
{
std::ostringstream ostr;
ostr << "invalid escape character: \\" << ostr;
}
}
if (end_of_chunk + extra > lastpos_) {
std::ostringstream ostr;
ostr << "file ends with a trailing escape sequence \\" << trailing_successor;
throw std::logic_error(ostr.str());
}
else {
end_of_chunk++;
#ifdef PARALOAD_DEBUG
std::cerr << "cover escape: " << end_of_chunk << std::endl;
#endif
}
}
start_of_chunk_.push_back(start_of_chunk);
Expand Down Expand Up @@ -236,6 +284,11 @@ namespace ParaText {
thread_exception = workers[thread_id]->get_exception();
}
}
for (size_t chunk_id = 0; chunk_id < workers.size(); chunk_id++) {
#ifdef PARALOAD_DEBUG
std::cerr << "quotes>>> wid=" << chunk_id << " start_of_chunk: " << start_of_chunk_[chunk_id] << " end_of_chunk: " << end_of_chunk_[chunk_id] << " num_quotes: " << workers[chunk_id]->get_num_quotes() << std::endl;
#endif
}
// We're now outside the parallel region.
if (thread_exception) {
std::rethrow_exception(thread_exception);
Expand All @@ -254,10 +307,13 @@ namespace ParaText {
std::cerr << "negative chunk cur_wid=" << cur_wid << " next_wid=" << next_wid << std::endl;
#endif
cur_wid++;
next_wid = cur_wid + 1;
continue;
/* if (next_wid < workers.size()) {
quotes_so_far += workers[next_wid]->get_num_quotes();
next_wid++;
}
continue;*/
}
if (quotes_so_far % 2 == 0) {
else if (quotes_so_far % 2 == 0) {
if (next_wid < workers.size()) {
quotes_so_far += workers[next_wid]->get_num_quotes();
#ifdef PARALOAD_DEBUG
Expand Down
26 changes: 25 additions & 1 deletion src/generic/quote_adjustment_worker.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,9 @@ class QuoteNewlineAdjustmentWorker {
}
else if (buf[i] == '\"') {
num_quotes_++;
#ifdef PARATEXT_DEBUG_QUOTE
std::cerr << "[Q1:" << (current + i) << ":" << num_quotes_ << ":" << escape_count;
#endif
in_quote = false;
i++;
break;
Expand All @@ -103,6 +106,9 @@ class QuoteNewlineAdjustmentWorker {
}
else if (buf[i] == '\"') {
num_quotes_++;
#ifdef PARATEXT_DEBUG_QUOTE
std::cerr << "[Q2:" << (current + i) << ":" << num_quotes_ << ":" << escape_count;
#endif
in_quote = true;
i++;
break;
Expand All @@ -111,7 +117,7 @@ class QuoteNewlineAdjustmentWorker {
first_unquoted_newline_ = current + i;
i++;
break;
}
}
}
}
}
Expand All @@ -126,6 +132,9 @@ class QuoteNewlineAdjustmentWorker {
}
else if (buf[i] == '\"') {
num_quotes_++;
#ifdef PARATEXT_DEBUG_QUOTE
std::cerr << "[Q3:" << (current + i) << ":" << num_quotes_ << ":" << escape_count;
#endif
in_quote = false;
i++;
break;
Expand All @@ -142,6 +151,9 @@ class QuoteNewlineAdjustmentWorker {
}
else if (buf[i] == '\"') {
num_quotes_++;
#ifdef PARATEXT_DEBUG_QUOTE
std::cerr << "[Q4:" << (current + i) << ":" << num_quotes_ << ":" << escape_count;
#endif
in_quote = true;
i++;
break;
Expand All @@ -165,6 +177,9 @@ class QuoteNewlineAdjustmentWorker {
}
else if (buf[i] == '\"') {
num_quotes_++;
#ifdef PARATEXT_DEBUG_QUOTE
std::cerr << "[Q5:" << (current + i) << ":" << num_quotes_ << ":" << escape_count;
#endif
in_quote = false;
i++;
break;
Expand All @@ -186,6 +201,9 @@ class QuoteNewlineAdjustmentWorker {
}
else if (buf[i] == '\"') {
num_quotes_++;
#ifdef PARATEXT_DEBUG_QUOTE
std::cerr << "[Q6:" << (current + i) << ":" << num_quotes_ << ":" << escape_count;
#endif
in_quote = true;
i++;
break;
Expand All @@ -208,6 +226,9 @@ class QuoteNewlineAdjustmentWorker {
}
else if (buf[i] == '\"') {
num_quotes_++;
#ifdef PARATEXT_DEBUG_QUOTE
std::cerr << "[Q7:" << (current + i) << ":" << num_quotes_ << ":" << escape_count;
#endif
in_quote = false;
i++;
break;
Expand All @@ -224,6 +245,9 @@ class QuoteNewlineAdjustmentWorker {
}
else if (buf[i] == '\"') {
num_quotes_++;
#ifdef PARATEXT_DEBUG_QUOTE
std::cerr << "[Q8:" << (current + i) << ":" << num_quotes_ << ":" << escape_count;
#endif
in_quote = true;
i++;
break;
Expand Down
Loading

0 comments on commit 9a08026

Please sign in to comment.