Skip to content

Commit

Permalink
ARROW-8111: [C++] User-defined timestamp parser option to CSV, new Ti…
Browse files Browse the repository at this point in the history
…mestampParser interface, and strptime-compatible impl

This builds on the work from #6631 while adding unit tests and additional benchmarks.

I also renamed arrow/util/parsing.h to arrow/util/value_parsing.h to make it slightly more discoverable.

Closes #7088 from wesm/ARROW-8111

Lead-authored-by: Wes McKinney <[email protected]>
Co-authored-by: Antoine Pitrou <[email protected]>
Co-authored-by: Alexey Prutskov <[email protected]>
Co-authored-by: Artem Alekseev <[email protected]>
Signed-off-by: Antoine Pitrou <[email protected]>
  • Loading branch information
4 people committed May 5, 2020
1 parent 4116516 commit 46a17cb
Show file tree
Hide file tree
Showing 30 changed files with 825 additions and 268 deletions.
25 changes: 25 additions & 0 deletions LICENSE.txt
Original file line number Diff line number Diff line change
Expand Up @@ -1986,3 +1986,28 @@ SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE.

--------------------------------------------------------------------------------

The file cpp/src/arrow/vendored/musl/strptime.c has the following license

Copyright © 2005-2020 Rich Felker, et al.

Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:

The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
3 changes: 2 additions & 1 deletion cpp/src/arrow/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -179,7 +179,6 @@ set(ARROW_SRCS
util/logging.cc
util/key_value_metadata.cc
util/memory.cc
util/parsing.cc
util/string.cc
util/string_builder.cc
util/task_group.cc
Expand All @@ -188,6 +187,7 @@ set(ARROW_SRCS
util/trie.cc
util/uri.cc
util/utf8.cc
util/value_parsing.cc
vendored/base64.cpp
vendored/datetime/tz.cpp
vendored/double-conversion/bignum.cc
Expand All @@ -200,6 +200,7 @@ set(ARROW_SRCS
vendored/double-conversion/strtod.cc)

set(ARROW_C_SRCS
vendored/musl/strptime.c
vendored/uriparser/UriCommon.c
vendored/uriparser/UriCompare.c
vendored/uriparser/UriEscape.c
Expand Down
2 changes: 1 addition & 1 deletion cpp/src/arrow/c/bridge.cc
Original file line number Diff line number Diff line change
Expand Up @@ -37,8 +37,8 @@
#include "arrow/util/key_value_metadata.h"
#include "arrow/util/logging.h"
#include "arrow/util/macros.h"
#include "arrow/util/parsing.h"
#include "arrow/util/string_view.h"
#include "arrow/util/value_parsing.h"
#include "arrow/visitor_inline.h"

namespace arrow {
Expand Down
2 changes: 1 addition & 1 deletion cpp/src/arrow/compute/kernels/cast.cc
Original file line number Diff line number Diff line change
Expand Up @@ -37,9 +37,9 @@
#include "arrow/util/formatting.h"
#include "arrow/util/logging.h"
#include "arrow/util/macros.h"
#include "arrow/util/parsing.h" // IWYU pragma: keep
#include "arrow/util/time.h"
#include "arrow/util/utf8.h"
#include "arrow/util/value_parsing.h" // IWYU pragma: keep
#include "arrow/visitor_inline.h"

#include "arrow/compute/context.h"
Expand Down
88 changes: 78 additions & 10 deletions cpp/src/arrow/csv/converter.cc
Original file line number Diff line number Diff line change
Expand Up @@ -30,14 +30,16 @@
#include "arrow/status.h"
#include "arrow/type.h"
#include "arrow/type_traits.h"
#include "arrow/util/checked_cast.h"
#include "arrow/util/decimal.h"
#include "arrow/util/parsing.h" // IWYU pragma: keep
#include "arrow/util/trie.h"
#include "arrow/util/utf8.h"
#include "arrow/util/value_parsing.h" // IWYU pragma: keep

namespace arrow {
namespace csv {

using internal::checked_cast;
using internal::StringConverter;
using internal::Trie;
using internal::TrieBuilder;
Expand Down Expand Up @@ -381,32 +383,98 @@ class NumericConverter : public ConcreteConverter {
/////////////////////////////////////////////////////////////////////////
// Concrete Converter for timestamps

namespace {

struct InlineISO8601 {
TimeUnit::type unit;

explicit InlineISO8601(TimeUnit::type unit) : unit(unit) {}

bool operator()(const char* s, size_t length, int64_t* out) const {
return internal::ParseTimestampISO8601(s, length, unit, out);
}
};

struct SingleTimestampParser {
const TimestampParser& parser;
TimeUnit::type unit;

SingleTimestampParser(const TimestampParser& parser, TimeUnit::type unit)
: parser(parser), unit(unit) {}

bool operator()(const char* s, size_t length, int64_t* out) const {
return this->parser(s, length, this->unit, out);
}
};

struct MultipleTimestampParsers {
std::vector<const TimestampParser*> parsers;
TimeUnit::type unit;

MultipleTimestampParsers(const std::vector<std::shared_ptr<TimestampParser>>& parsers,
TimeUnit::type unit)
: unit(unit) {
for (const auto& parser : parsers) {
this->parsers.push_back(parser.get());
}
}

bool operator()(const char* s, size_t length, int64_t* out) const {
for (const auto& parser : this->parsers) {
if (parser->operator()(s, length, this->unit, out)) {
return true;
}
}
return false;
}
};

} // namespace

class TimestampConverter : public ConcreteConverter {
public:
using ConcreteConverter::ConcreteConverter;

Result<std::shared_ptr<Array>> Convert(const BlockParser& parser,
int32_t col_index) override {
template <typename ConvertValue>
Status ConvertValuesWith(const BlockParser& parser, int32_t col_index,
const ConvertValue& converter, TimestampBuilder* builder) {
using value_type = TimestampType::c_type;

TimestampBuilder builder(type_, pool_);
StringConverter<TimestampType> converter(type_);

auto visit = [&](const uint8_t* data, uint32_t size, bool quoted) -> Status {
value_type value = 0;
if (IsNull(data, size, quoted)) {
builder.UnsafeAppendNull();
builder->UnsafeAppendNull();
return Status::OK();
}

if (ARROW_PREDICT_FALSE(
!converter(reinterpret_cast<const char*>(data), size, &value))) {
return GenericConversionError(type_, data, size);
}
builder.UnsafeAppend(value);
builder->UnsafeAppend(value);
return Status::OK();
};
return parser.VisitColumn(col_index, visit);
}

Result<std::shared_ptr<Array>> Convert(const BlockParser& parser,
int32_t col_index) override {
TimestampBuilder builder(type_, pool_);
RETURN_NOT_OK(builder.Resize(parser.num_rows()));
RETURN_NOT_OK(parser.VisitColumn(col_index, visit));

TimeUnit::type unit = checked_cast<const TimestampType&>(*type_).unit();
if (options_.timestamp_parsers.size() == 0) {
// Default to ISO-8601
InlineISO8601 converter(unit);
RETURN_NOT_OK(ConvertValuesWith(parser, col_index, converter, &builder));
} else if (options_.timestamp_parsers.size() == 1) {
// Single user-supplied converter
SingleTimestampParser converter(*options_.timestamp_parsers[0], unit);
RETURN_NOT_OK(ConvertValuesWith(parser, col_index, converter, &builder));
} else {
// Multiple converters, must iterate for each value
MultipleTimestampParsers converter(options_.timestamp_parsers, unit);
RETURN_NOT_OK(ConvertValuesWith(parser, col_index, converter, &builder));
}

std::shared_ptr<Array> res;
RETURN_NOT_OK(builder.Finish(&res));
Expand Down
62 changes: 43 additions & 19 deletions cpp/src/arrow/csv/converter_benchmark.cc
Original file line number Diff line number Diff line change
Expand Up @@ -20,18 +20,21 @@
#include <sstream>
#include <string>

#include "arrow/buffer.h"
#include "arrow/csv/converter.h"
#include "arrow/csv/options.h"
#include "arrow/csv/parser.h"
#include "arrow/csv/reader.h"
#include "arrow/csv/test_common.h"
#include "arrow/io/memory.h"
#include "arrow/testing/gtest_util.h"
#include "arrow/util/value_parsing.h"

namespace arrow {
namespace csv {

static std::shared_ptr<BlockParser> BuildInt64Data(int32_t num_rows) {
const std::vector<std::string> base_rows = {"123\n", "4\n", "-317005557\n",
"\n", "N/A\n", "0\n"};
static std::shared_ptr<BlockParser> BuildFromExamples(
const std::vector<std::string>& base_rows, int32_t num_rows) {
std::vector<std::string> rows;
for (int32_t i = 0; i < num_rows; ++i) {
rows.push_back(base_rows[i % base_rows.size()]);
Expand All @@ -42,36 +45,40 @@ static std::shared_ptr<BlockParser> BuildInt64Data(int32_t num_rows) {
return result;
}

static std::shared_ptr<BlockParser> BuildInt64Data(int32_t num_rows) {
const std::vector<std::string> base_rows = {"123\n", "4\n", "-317005557\n",
"\n", "N/A\n", "0\n"};
return BuildFromExamples(base_rows, num_rows);
}

static std::shared_ptr<BlockParser> BuildFloatData(int32_t num_rows) {
const std::vector<std::string> base_rows = {"0\n", "123.456\n", "-3170.55766\n", "\n",
"N/A\n"};
std::vector<std::string> rows;
for (int32_t i = 0; i < num_rows; ++i) {
rows.push_back(base_rows[i % base_rows.size()]);
}

std::shared_ptr<BlockParser> result;
MakeCSVParser(rows, &result);
return result;
return BuildFromExamples(base_rows, num_rows);
}

static std::shared_ptr<BlockParser> BuildDecimal128Data(int32_t num_rows) {
const std::vector<std::string> base_rows = {"0\n", "123.456\n", "-3170.55766\n",
"\n", "N/A\n", "1233456789.123456789"};
std::vector<std::string> rows;
for (int32_t i = 0; i < num_rows; ++i) {
rows.push_back(base_rows[i % base_rows.size()]);
}

std::shared_ptr<BlockParser> result;
MakeCSVParser(rows, &result);
return result;
return BuildFromExamples(base_rows, num_rows);
}

static std::shared_ptr<BlockParser> BuildStringData(int32_t num_rows) {
return BuildDecimal128Data(num_rows);
}

static std::shared_ptr<BlockParser> BuildISO8601Data(int32_t num_rows) {
const std::vector<std::string> base_rows = {
"1917-10-17\n", "2018-09-13\n", "1941-06-22 04:00\n", "1945-05-09 09:45:38\n"};
return BuildFromExamples(base_rows, num_rows);
}

static std::shared_ptr<BlockParser> BuildStrptimeData(int32_t num_rows) {
const std::vector<std::string> base_rows = {"10/17/1917\n", "9/13/2018\n",
"9/5/1945\n"};
return BuildFromExamples(base_rows, num_rows);
}

static void BenchmarkConversion(benchmark::State& state, // NOLINT non-const reference
BlockParser& parser,
const std::shared_ptr<DataType>& type,
Expand Down Expand Up @@ -119,10 +126,27 @@ static void StringConversion(benchmark::State& state) { // NOLINT non-const ref
BenchmarkConversion(state, *parser, utf8(), options);
}

static void TimestampConversionDefault(
benchmark::State& state) { // NOLINT non-const reference
auto parser = BuildISO8601Data(num_rows);
auto options = ConvertOptions::Defaults();
BenchmarkConversion(state, *parser, timestamp(TimeUnit::MILLI), options);
}

static void TimestampConversionStrptime(
benchmark::State& state) { // NOLINT non-const reference
auto parser = BuildStrptimeData(num_rows);
auto options = ConvertOptions::Defaults();
options.timestamp_parsers.push_back(TimestampParser::MakeStrptime("%m/%d/%Y"));
BenchmarkConversion(state, *parser, timestamp(TimeUnit::MILLI), options);
}

BENCHMARK(Int64Conversion);
BENCHMARK(FloatConversion);
BENCHMARK(Decimal128Conversion);
BENCHMARK(StringConversion);
BENCHMARK(TimestampConversionDefault);
BENCHMARK(TimestampConversionStrptime);

} // namespace csv
} // namespace arrow
16 changes: 16 additions & 0 deletions cpp/src/arrow/csv/converter_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
#include "arrow/util/checked_cast.h"
#include "arrow/util/decimal.h"
#include "arrow/util/logging.h"
#include "arrow/util/value_parsing.h"

namespace arrow {
namespace csv {
Expand Down Expand Up @@ -374,6 +375,21 @@ TEST(TimestampConversion, CustomNulls) {
{{true}, {false}, {false}}, options);
}

TEST(TimestampConversion, UserDefinedParsers) {
auto options = ConvertOptions::Defaults();
auto type = timestamp(TimeUnit::MILLI);

// Test a single parser
options.timestamp_parsers = {TimestampParser::MakeStrptime("%m/%d/%Y")};
AssertConversion<TimestampType, int64_t>(type, {"01/02/1970,01/03/1970\n"},
{{86400000}, {172800000}}, options);

// Test multiple parsers
options.timestamp_parsers.push_back(TimestampParser::MakeISO8601());
AssertConversion<TimestampType, int64_t>(type, {"01/02/1970,1970-01-03\n"},
{{86400000}, {172800000}}, options);
}

Decimal128 Dec128(util::string_view value) {
Decimal128 dec;
int32_t scale = 0;
Expand Down
8 changes: 8 additions & 0 deletions cpp/src/arrow/csv/options.h
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
namespace arrow {

class DataType;
class TimestampParser;

namespace csv {

Expand Down Expand Up @@ -100,6 +101,13 @@ struct ARROW_EXPORT ConvertOptions {
/// This option is ignored if `include_columns` is empty.
bool include_missing_columns = false;

/// User-defined timestamp parsers, using the virtual parser interface in
/// arrow/util/value_parsing.h. More than one parser can be specified, and
/// the CSV conversion logic will try parsing values starting from the
/// beginning of this vector. If no parsers are specified, we use the default
/// built-in ISO-8601 parser
std::vector<std::shared_ptr<TimestampParser>> timestamp_parsers;

/// Create conversion options with default values, including conventional
/// values for `null_values`, `true_values` and `false_values`
static ConvertOptions Defaults();
Expand Down
2 changes: 1 addition & 1 deletion cpp/src/arrow/filesystem/hdfs.cc
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
#include "arrow/io/hdfs_internal.h"
#include "arrow/util/checked_cast.h"
#include "arrow/util/logging.h"
#include "arrow/util/parsing.h"
#include "arrow/util/value_parsing.h"
#include "arrow/util/windows_fixup.h"

namespace arrow {
Expand Down
2 changes: 1 addition & 1 deletion cpp/src/arrow/ipc/json_internal.cc
Original file line number Diff line number Diff line change
Expand Up @@ -41,8 +41,8 @@
#include "arrow/util/formatting.h"
#include "arrow/util/key_value_metadata.h"
#include "arrow/util/logging.h"
#include "arrow/util/parsing.h"
#include "arrow/util/string.h"
#include "arrow/util/value_parsing.h"
#include "arrow/visitor_inline.h"

namespace arrow {
Expand Down
2 changes: 1 addition & 1 deletion cpp/src/arrow/ipc/json_simple.cc
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,8 @@
#include "arrow/type_traits.h"
#include "arrow/util/checked_cast.h"
#include "arrow/util/decimal.h"
#include "arrow/util/parsing.h"
#include "arrow/util/string_view.h"
#include "arrow/util/value_parsing.h"

#include "arrow/json/rapidjson_defs.h"

Expand Down
Loading

0 comments on commit 46a17cb

Please sign in to comment.