Skip to content

Commit

Permalink
ARROW-2411: [C++] Add StringBuilder::Append(const char **values)
Browse files Browse the repository at this point in the history
This implementation uses not NULL-terminated C strings and the length of values instead of NULL-terminated C strings. Because other builder uses the interface such as `PrimitiveBuilder::Append()`.

Author: Kouhei Sutou <[email protected]>
Author: Antoine Pitrou <[email protected]>

Closes apache#1852 from kou/cpp-string-builder-append-cstrings and squashes the following commits:

df00afc <Antoine Pitrou> Nit: fix docstring wording
ed8203a <Kouhei Sutou>  Process NULL char * as a null value
6d4669b <Kouhei Sutou>  Fix expected value
d697e49 <Kouhei Sutou>  Format
001ea51 <Kouhei Sutou>  Process nullptr char * as a NULL value
0a11f7b <Kouhei Sutou>  Use "nul-terminated char *" instead of "C string"
f06fd76 <Kouhei Sutou>  Add StringBuilder::Append(const char **values)
  • Loading branch information
kou authored and pitrou committed Apr 10, 2018
1 parent 6633cc9 commit 91ec792
Show file tree
Hide file tree
Showing 3 changed files with 136 additions and 0 deletions.
66 changes: 66 additions & 0 deletions cpp/src/arrow/array-test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1022,6 +1022,72 @@ TEST_F(TestStringBuilder, TestAppendVector) {
}
}

TEST_F(TestStringBuilder, TestAppendCStringsWithValidBytes) {
const char* strings[] = {nullptr, "aaa", nullptr, "ignored", ""};
vector<uint8_t> valid_bytes = {1, 1, 1, 0, 1};

int N = static_cast<int>(sizeof(strings) / sizeof(strings[0]));
int reps = 1000;

for (int j = 0; j < reps; ++j) {
ASSERT_OK(builder_->Append(strings, N, valid_bytes.data()));
}
Done();

ASSERT_EQ(reps * N, result_->length());
ASSERT_EQ(reps * 3, result_->null_count());
ASSERT_EQ(reps * 3, result_->value_data()->size());

int32_t length;
int32_t pos = 0;
for (int i = 0; i < N * reps; ++i) {
auto string = strings[i % N];
if (string && valid_bytes[i % N]) {
ASSERT_FALSE(result_->IsNull(i));
result_->GetValue(i, &length);
ASSERT_EQ(pos, result_->value_offset(i));
ASSERT_EQ(static_cast<int32_t>(strlen(string)), length);
ASSERT_EQ(strings[i % N], result_->GetString(i));

pos += length;
} else {
ASSERT_TRUE(result_->IsNull(i));
}
}
}

TEST_F(TestStringBuilder, TestAppendCStringsWithoutValidBytes) {
const char* strings[] = {"", "bb", "a", nullptr, "ccc"};

int N = static_cast<int>(sizeof(strings) / sizeof(strings[0]));
int reps = 1000;

for (int j = 0; j < reps; ++j) {
ASSERT_OK(builder_->Append(strings, N));
}
Done();

ASSERT_EQ(reps * N, result_->length());
ASSERT_EQ(reps, result_->null_count());
ASSERT_EQ(reps * 6, result_->value_data()->size());

int32_t length;
int32_t pos = 0;
for (int i = 0; i < N * reps; ++i) {
if (strings[i % N]) {
ASSERT_FALSE(result_->IsNull(i));
result_->GetValue(i, &length);
ASSERT_EQ(pos, result_->value_offset(i));
ASSERT_EQ(static_cast<int32_t>(strlen(strings[i % N])), length);
ASSERT_EQ(strings[i % N], result_->GetString(i));

pos += length;
} else {
ASSERT_TRUE(result_->IsNull(i));
}
}
}

TEST_F(TestStringBuilder, TestZeroLength) {
// All buffers are null
Done();
Expand Down
58 changes: 58 additions & 0 deletions cpp/src/arrow/builder.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1413,6 +1413,64 @@ Status StringBuilder::Append(const std::vector<std::string>& values,
return Status::OK();
}

Status StringBuilder::Append(const char** values, int64_t length,
const uint8_t* valid_bytes) {
std::size_t total_length = 0;
std::vector<std::size_t> value_lengths(length);
bool have_null_value = false;
for (int64_t i = 0; i < length; ++i) {
if (values[i]) {
auto value_length = strlen(values[i]);
value_lengths[i] = value_length;
total_length += value_length;
} else {
have_null_value = true;
}
}
RETURN_NOT_OK(Reserve(length));
RETURN_NOT_OK(value_data_builder_.Reserve(total_length));
RETURN_NOT_OK(offsets_builder_.Reserve(length));

if (valid_bytes) {
int64_t valid_bytes_offset = 0;
for (int64_t i = 0; i < length; ++i) {
RETURN_NOT_OK(AppendNextOffset());
if (valid_bytes[i]) {
if (values[i]) {
RETURN_NOT_OK(value_data_builder_.Append(
reinterpret_cast<const uint8_t*>(values[i]), value_lengths[i]));
} else {
UnsafeAppendToBitmap(valid_bytes + valid_bytes_offset, i - valid_bytes_offset);
UnsafeAppendToBitmap(false);
valid_bytes_offset = i + 1;
}
}
}
UnsafeAppendToBitmap(valid_bytes + valid_bytes_offset, length - valid_bytes_offset);
} else {
if (have_null_value) {
std::vector<uint8_t> valid_vector(length, 0);
for (int64_t i = 0; i < length; ++i) {
RETURN_NOT_OK(AppendNextOffset());
if (values[i]) {
RETURN_NOT_OK(value_data_builder_.Append(
reinterpret_cast<const uint8_t*>(values[i]), value_lengths[i]));
valid_vector[i] = 1;
}
}
UnsafeAppendToBitmap(valid_vector.data(), length);
} else {
for (int64_t i = 0; i < length; ++i) {
RETURN_NOT_OK(AppendNextOffset());
RETURN_NOT_OK(value_data_builder_.Append(
reinterpret_cast<const uint8_t*>(values[i]), value_lengths[i]));
}
UnsafeAppendToBitmap(nullptr, length);
}
}
return Status::OK();
}

// ----------------------------------------------------------------------
// Fixed width binary

Expand Down
12 changes: 12 additions & 0 deletions cpp/src/arrow/builder.h
Original file line number Diff line number Diff line change
Expand Up @@ -720,6 +720,18 @@ class ARROW_EXPORT StringBuilder : public BinaryBuilder {

Status Append(const std::vector<std::string>& values,
const uint8_t* valid_bytes = NULLPTR);

/// \brief Append a sequence of nul-terminated strings in one shot.
/// If one of the values is NULL, it is processed as a null
/// value even if the corresponding valid_bytes entry is 1.
///
/// \param[in] values a contiguous C array of nul-terminated char *
/// \param[in] length the number of values to append
/// \param[in] valid_bytes an optional sequence of bytes where non-zero
/// indicates a valid (non-null) value
/// \return Status
Status Append(const char** values, int64_t length,
const uint8_t* valid_bytes = NULLPTR);
};

// ----------------------------------------------------------------------
Expand Down

0 comments on commit 91ec792

Please sign in to comment.