Skip to content

Commit

Permalink
ARROW-3800: [C++] Vendor a string_view backport
Browse files Browse the repository at this point in the history
Vendor the `std::string_view` backport from https://github.com/martinmoene/string-view-lite

Author: Antoine Pitrou <[email protected]>

Closes apache#2974 from pitrou/ARROW-3800-string-view-backport and squashes the following commits:

4353414 <Antoine Pitrou> ARROW-3800:  Vendor a string_view backport
  • Loading branch information
pitrou authored and wesm committed Nov 15, 2018
1 parent 3e84f99 commit 948e0fb
Show file tree
Hide file tree
Showing 19 changed files with 1,505 additions and 166 deletions.
28 changes: 28 additions & 0 deletions LICENSE.txt
Original file line number Diff line number Diff line change
Expand Up @@ -769,3 +769,31 @@ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

--------------------------------------------------------------------------------

The file cpp/src/arrow/util/string_view/string_view.hpp has the following license

Boost Software License - Version 1.0 - August 17th, 2003

Permission is hereby granted, free of charge, to any person or organization
obtaining a copy of the software and accompanying documentation covered by
this license (the "Software") to use, reproduce, display, distribute,
execute, and transmit the Software, and to prepare derivative works of the
Software, and to permit third-parties to whom the Software is furnished to
do so, all subject to the following:

The copyright notices in the Software and this entire statement, including
the above license grant, this restriction and the following disclaimer,
must be included in all copies of the Software, in whole or in part, and
all derivative works of the Software, unless such copies or derivative
works are solely in the form of machine-executable object code generated by
a source language processor.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE.
1 change: 1 addition & 0 deletions cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -348,6 +348,7 @@ if (UNIX)
(item MATCHES "xxhash.h") OR
(item MATCHES "xxhash.cc") OR
(item MATCHES "config.h") OR
(item MATCHES "util/string_view/") OR
(item MATCHES "util/variant") OR
(item MATCHES "zmalloc.h") OR
(item MATCHES "gandiva/precompiled/date.h") OR
Expand Down
1 change: 1 addition & 0 deletions cpp/build-support/clang_format_exclusions.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
*pyarrow_lib.h
*python/config.h
*python/platform.h
*util/string_view/*
*util/variant.h
*util/variant/*
*thirdparty/ae/*
Expand Down
3 changes: 2 additions & 1 deletion cpp/build-support/lint_cpp_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,9 +69,10 @@ def lint_file(path):


EXCLUSIONS = [
'arrow/util/macros.h',
'arrow/python/iterators.h',
'arrow/util/macros.h',
'arrow/util/parallel.h',
'arrow/util/string_view/string_view.hpp',
'gandiva/cache.h',
'gandiva/jni',
'gandiva/precompiled/date.h',
Expand Down
10 changes: 4 additions & 6 deletions cpp/src/arrow/array-test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1386,9 +1386,8 @@ TEST_F(TestBinaryArray, TestGetValue) {
if (valid_bytes_[i] == 0) {
ASSERT_TRUE(strings_->IsNull(i));
} else {
int32_t len = -1;
const uint8_t* bytes = strings_->GetValue(i, &len);
ASSERT_EQ(0, std::memcmp(expected_[i].data(), bytes, len));
ASSERT_FALSE(strings_->IsNull(i));
ASSERT_EQ(strings_->GetString(i), expected_[i]);
}
}
}
Expand All @@ -1398,9 +1397,8 @@ TEST_F(TestBinaryArray, TestNullValuesInitialized) {
if (valid_bytes_[i] == 0) {
ASSERT_TRUE(strings_->IsNull(i));
} else {
int32_t len = -1;
const uint8_t* bytes = strings_->GetValue(i, &len);
ASSERT_EQ(0, std::memcmp(expected_[i].data(), bytes, len));
ASSERT_FALSE(strings_->IsNull(i));
ASSERT_EQ(strings_->GetString(i), expected_[i]);
}
}
TestInitialized(*strings_);
Expand Down
39 changes: 22 additions & 17 deletions cpp/src/arrow/array.h
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
#include "arrow/util/bit-util.h"
#include "arrow/util/checked_cast.h"
#include "arrow/util/macros.h"
#include "arrow/util/string_view.h"
#include "arrow/util/visibility.h"

namespace arrow {
Expand Down Expand Up @@ -488,27 +489,33 @@ class ARROW_EXPORT BinaryArray : public FlatArray {
const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
int64_t null_count = 0, int64_t offset = 0);

// Return the pointer to the given elements bytes
// TODO(emkornfield) introduce a StringPiece or something similar to capture zero-copy
// pointer + offset
/// \brief Return a pointer to the given element's bytes
///
/// \param i the value index
/// \param[out] out_length set to the number of bytes in the selected value
/// \return raw_data_ advanced to the first byte of the selected value
// XXX should GetValue(int64_t i) return a string_view?
const uint8_t* GetValue(int64_t i, int32_t* out_length) const {
  // Shift the logical index by the array's base offset
  const int64_t slot = i + data_->offset;

  const int32_t begin = raw_value_offsets_[slot];
  const int32_t end = raw_value_offsets_[slot + 1];
  *out_length = end - begin;
  return raw_data_ + begin;
}

/// \brief Get binary value as a string_view
///
/// The view is built directly over raw_data_; no bytes are copied.
///
/// \param i the value index
/// \return the view over the selected value
util::string_view GetView(int64_t i) const {
  // Shift the logical index by the array's base offset
  const int64_t slot = i + data_->offset;
  const int32_t begin = raw_value_offsets_[slot];
  const int32_t nbytes = raw_value_offsets_[slot + 1] - begin;
  const char* start = reinterpret_cast<const char*>(raw_data_ + begin);
  return util::string_view(start, nbytes);
}

/// \brief Get binary value as a std::string
///
/// \param i the value index
/// \return the value copied into a std::string
std::string GetString(int64_t i) const {
int32_t length = 0;
const uint8_t* bytes = GetValue(i, &length);
return std::string(reinterpret_cast<const char*>(bytes), static_cast<size_t>(length));
}
std::string GetString(int64_t i) const { return std::string(GetView(i)); }

/// Note that this buffer does not account for any slice offset
std::shared_ptr<Buffer> value_offsets() const { return data_->buffers[1]; }
Expand Down Expand Up @@ -555,14 +562,6 @@ class ARROW_EXPORT StringArray : public BinaryArray {
const std::shared_ptr<Buffer>& data,
const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
int64_t null_count = 0, int64_t offset = 0);

// Construct a std::string
// TODO: std::bad_alloc possibility
std::string GetString(int64_t i) const {
int32_t nchars;
const uint8_t* str = GetValue(i, &nchars);
return std::string(reinterpret_cast<const char*>(str), nchars);
}
};

// ----------------------------------------------------------------------
Expand All @@ -583,6 +582,12 @@ class ARROW_EXPORT FixedSizeBinaryArray : public PrimitiveArray {
const uint8_t* GetValue(int64_t i) const;
const uint8_t* Value(int64_t i) const { return GetValue(i); }

/// \brief Get fixed-size binary value as a string_view
///
/// \param i the value index
/// \return a view of byte_width() bytes starting at GetValue(i)
util::string_view GetView(int64_t i) const {
  const char* start = reinterpret_cast<const char*>(GetValue(i));
  return util::string_view(start, byte_width());
}

std::string GetString(int64_t i) const { return std::string(GetView(i)); }

int32_t byte_width() const { return byte_width_; }

const uint8_t* raw_values() const { return raw_values_ + data_->offset * byte_width_; }
Expand Down
73 changes: 43 additions & 30 deletions cpp/src/arrow/builder.cc
Original file line number Diff line number Diff line change
Expand Up @@ -760,7 +760,6 @@ Status BooleanBuilder::AppendValues(const std::vector<bool>& values) {
// DictionaryBuilder

using internal::DictionaryScalar;
using internal::WrappedBinary;

namespace {

Expand Down Expand Up @@ -809,32 +808,28 @@ struct DictionaryHashHelper<T, enable_if_binary<T>> {
using Scalar = typename DictionaryScalar<T>::type;

static Scalar GetDictionaryValue(const Builder& builder, int64_t index) {
int32_t v_length;
const uint8_t* v_ptr = builder.GetValue(index, &v_length);
return WrappedBinary(v_ptr, v_length);
return builder.GetView(index);
}

static int64_t HashValue(const Scalar& value, int byte_width) {
return HashUtil::Hash<SSE4_FLAG>(value.ptr_, value.length_, 0);
return HashUtil::Hash<SSE4_FLAG>(value.data(), static_cast<int32_t>(value.length()),
0);
}

static bool SlotDifferent(const Builder& builder, int64_t index, const Scalar& value) {
int32_t other_length;
const uint8_t* other_ptr = builder.GetValue(index, &other_length);
return value.length_ != other_length ||
memcmp(value.ptr_, other_ptr, other_length) != 0;
const Scalar other = GetDictionaryValue(builder, index);
return value.length() != other.length() ||
memcmp(value.data(), other.data(), other.length()) != 0;
}

static Status AppendValue(Builder& builder, const Scalar& value) {
return builder.Append(value.ptr_, value.length_);
return builder.Append(value);
}

static Status AppendArray(Builder& builder, const Array& in_array) {
const auto& array = checked_cast<const BinaryArray&>(in_array);
for (uint64_t index = 0, limit = array.length(); index < limit; ++index) {
int32_t length;
const uint8_t* ptr = array.GetValue(index, &length);
RETURN_NOT_OK(builder.Append(ptr, length));
RETURN_NOT_OK(builder.Append(array.GetView(index)));
}
return Status::OK();
}
Expand Down Expand Up @@ -1033,12 +1028,12 @@ Status DictionaryBuilder<FixedSizeBinaryType>::AppendArray(const Array& array) {
return Status::Invalid("Cannot append FixedSizeBinary array with non-matching type");
}

const auto& numeric_array = checked_cast<const FixedSizeBinaryArray&>(array);
const auto& typed_array = checked_cast<const FixedSizeBinaryArray&>(array);
for (int64_t i = 0; i < array.length(); i++) {
if (array.IsNull(i)) {
RETURN_NOT_OK(AppendNull());
} else {
RETURN_NOT_OK(Append(numeric_array.Value(i)));
RETURN_NOT_OK(Append(typed_array.GetValue(i)));
}
}
return Status::OK();
Expand Down Expand Up @@ -1087,21 +1082,20 @@ Status DictionaryBuilder<NullType>::FinishInternal(std::shared_ptr<ArrayData>* o
// StringType and BinaryType specializations
//

#define BINARY_DICTIONARY_SPECIALIZATIONS(Type) \
\
template <> \
Status DictionaryBuilder<Type>::AppendArray(const Array& array) { \
const BinaryArray& binary_array = checked_cast<const BinaryArray&>(array); \
WrappedBinary value(nullptr, 0); \
for (int64_t i = 0; i < array.length(); i++) { \
if (array.IsNull(i)) { \
RETURN_NOT_OK(AppendNull()); \
} else { \
value.ptr_ = binary_array.GetValue(i, &value.length_); \
RETURN_NOT_OK(Append(value)); \
} \
} \
return Status::OK(); \
#define BINARY_DICTIONARY_SPECIALIZATIONS(Type) \
\
template <> \
Status DictionaryBuilder<Type>::AppendArray(const Array& array) { \
using ArrayType = typename TypeTraits<Type>::ArrayType; \
const ArrayType& binary_array = checked_cast<const ArrayType&>(array); \
for (int64_t i = 0; i < array.length(); i++) { \
if (array.IsNull(i)) { \
RETURN_NOT_OK(AppendNull()); \
} else { \
RETURN_NOT_OK(Append(binary_array.GetView(i))); \
} \
} \
return Status::OK(); \
}

BINARY_DICTIONARY_SPECIALIZATIONS(StringType);
Expand Down Expand Up @@ -1314,6 +1308,19 @@ const uint8_t* BinaryBuilder::GetValue(int64_t i, int32_t* out_length) const {
return value_data_builder_.data() + offset;
}

/// \brief Get a view over the value at index i in the builder
///
/// \param i the value index
/// \return a string_view over the bytes held by the value data builder
util::string_view BinaryBuilder::GetView(int64_t i) const {
  const int32_t* raw_offsets = offsets_builder_.data();
  const int32_t start = raw_offsets[i];
  // For the last slot (i == length_ - 1) the end is the current length of the
  // value data; for every other slot it is the next recorded offset.
  const bool is_last = (i == (length_ - 1));
  const int32_t stop = is_last ? static_cast<int32_t>(value_data_builder_.length())
                               : raw_offsets[i + 1];
  const int32_t value_length = stop - start;
  const char* bytes = reinterpret_cast<const char*>(value_data_builder_.data() + start);
  return util::string_view(bytes, value_length);
}

StringBuilder::StringBuilder(MemoryPool* pool) : BinaryBuilder(utf8(), pool) {}

Status StringBuilder::AppendValues(const std::vector<std::string>& values,
Expand Down Expand Up @@ -1455,6 +1462,12 @@ const uint8_t* FixedSizeBinaryBuilder::GetValue(int64_t i) const {
return data_ptr + i * byte_width_;
}

/// \brief Get a view over the value at index i in the builder
///
/// \param i the value index
/// \return a string_view of byte_width_ bytes starting at byte i * byte_width_
util::string_view FixedSizeBinaryBuilder::GetView(int64_t i) const {
  const uint8_t* base = byte_builder_.data();
  const uint8_t* slot = base + i * byte_width_;
  return util::string_view(reinterpret_cast<const char*>(slot), byte_width_);
}

// ----------------------------------------------------------------------
// Struct

Expand Down
Loading

0 comments on commit 948e0fb

Please sign in to comment.