Skip to content

Commit

Permalink
ARROW-1785: [Format/C++/Java] Remove VectorLayout from serialized sch…
Browse files Browse the repository at this point in the history
…emas

What's here so far removes this code from the Flatbuffers schema and the C++ implementation. This logic is a little bit entangled on the Java side, I need some help from @icexelloss or @BryanCutler or @julienledem or someone else to handle the Java refactoring. It might be easiest to preserve the ArrowVectorType/TypeLayout/VectorLayout objects for now but simply remove the Flatbuffers dependency (we do need the names of the vectors in the JSON files used for integration testing)

cc @trxcllnt since we may want to roll in the JS changes in this patch

Author: Li Jin <[email protected]>
Author: Wes McKinney <[email protected]>
Author: Li Jin <[email protected]>

Closes apache#1297 from wesm/ARROW-1785 and squashes the following commits:

c1e7ea9 [Li Jin] Fix comment
43ff4e3 [Li Jin] Remove Json annotation
95e8736 [Li Jin] (Refactor) Move TypeLayout and VectorLayout from ipc.message to top level. Rename VectorLayout to BufferLayout.
31cbd48 [Li Jin] Fix TypeLayout.java
890a08d [Li Jin] Integration test passing
2024db5 [Wes McKinney] Remove VectorLayout from Flatbuffers, C++ implementation
  • Loading branch information
icexelloss authored and wesm committed Dec 4, 2017
1 parent fe6f60c commit 611a4b9
Show file tree
Hide file tree
Showing 20 changed files with 145 additions and 534 deletions.
39 changes: 0 additions & 39 deletions cpp/src/arrow/ipc/json-internal.cc
Original file line number Diff line number Diff line change
Expand Up @@ -43,22 +43,6 @@ namespace ipc {
namespace json {
namespace internal {

static std::string GetBufferTypeName(BufferType type) {
switch (type) {
case BufferType::DATA:
return "DATA";
case BufferType::OFFSET:
return "OFFSET";
case BufferType::TYPE:
return "TYPE";
case BufferType::VALIDITY:
return "VALIDITY";
default:
break;
}
return "UNKNOWN";
}

static std::string GetFloatingPrecisionName(FloatingPoint::Precision precision) {
switch (precision) {
case FloatingPoint::HALF:
Expand Down Expand Up @@ -174,12 +158,9 @@ class SchemaWriter {
RETURN_NOT_OK(WriteDictionaryMetadata(dict_type));

const DataType& dictionary_type = *dict_type.dictionary()->type();
const DataType& index_type = *dict_type.index_type();
RETURN_NOT_OK(WriteChildren(dictionary_type.children()));
WriteBufferLayout(index_type.GetBufferLayout());
} else {
RETURN_NOT_OK(WriteChildren(type.children()));
WriteBufferLayout(type.GetBufferLayout());
}

writer_->EndObject();
Expand Down Expand Up @@ -301,26 +282,6 @@ class SchemaWriter {
return Status::OK();
}

void WriteBufferLayout(const std::vector<BufferDescr>& buffer_layout) {
writer_->Key("typeLayout");
writer_->StartObject();
writer_->Key("vectors");
writer_->StartArray();

for (const BufferDescr& buffer : buffer_layout) {
writer_->StartObject();
writer_->Key("type");
writer_->String(GetBufferTypeName(buffer.type()));

writer_->Key("typeBitWidth");
writer_->Int(buffer.bit_width());

writer_->EndObject();
}
writer_->EndArray();
writer_->EndObject();
}

Status WriteChildren(const std::vector<std::shared_ptr<Field>>& children) {
writer_->Key("children");
writer_->StartArray();
Expand Down
34 changes: 2 additions & 32 deletions cpp/src/arrow/ipc/metadata-internal.cc
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,6 @@ using DictionaryOffset = flatbuffers::Offset<flatbuf::DictionaryEncoding>;
using FieldOffset = flatbuffers::Offset<flatbuf::Field>;
using KeyValueOffset = flatbuffers::Offset<flatbuf::KeyValue>;
using RecordBatchOffset = flatbuffers::Offset<flatbuf::RecordBatch>;
using VectorLayoutOffset = flatbuffers::Offset<arrow::flatbuf::VectorLayout>;
using Offset = flatbuffers::Offset<void>;
using FBString = flatbuffers::Offset<flatbuffers::String>;

Expand Down Expand Up @@ -341,34 +340,8 @@ static Status TypeFromFlatbuffer(flatbuf::Type type, const void* type_data,
// TODO(wesm): Convert this to visitor pattern
static Status TypeToFlatbuffer(FBB& fbb, const DataType& type,
std::vector<FieldOffset>* children,
std::vector<VectorLayoutOffset>* layout,
flatbuf::Type* out_type, DictionaryMemo* dictionary_memo,
Offset* offset) {
std::vector<BufferDescr> buffer_layout = type.GetBufferLayout();
for (const BufferDescr& descr : buffer_layout) {
flatbuf::VectorType vector_type;
switch (descr.type()) {
case BufferType::OFFSET:
vector_type = flatbuf::VectorType_OFFSET;
break;
case BufferType::DATA:
vector_type = flatbuf::VectorType_DATA;
break;
case BufferType::VALIDITY:
vector_type = flatbuf::VectorType_VALIDITY;
break;
case BufferType::TYPE:
vector_type = flatbuf::VectorType_TYPE;
break;
default:
vector_type = flatbuf::VectorType_DATA;
break;
}
auto offset = flatbuf::CreateVectorLayout(
fbb, static_cast<int16_t>(descr.bit_width()), vector_type);
layout->push_back(offset);
}

const DataType* value_type = &type;

if (type.id() == Type::DICTIONARY) {
Expand Down Expand Up @@ -543,14 +516,11 @@ static Status FieldToFlatbuffer(FBB& fbb, const Field& field,

flatbuf::Type type_enum;
Offset type_offset;
Offset type_layout;
std::vector<FieldOffset> children;
std::vector<VectorLayoutOffset> layout;

RETURN_NOT_OK(TypeToFlatbuffer(fbb, *field.type(), &children, &layout, &type_enum,
RETURN_NOT_OK(TypeToFlatbuffer(fbb, *field.type(), &children, &type_enum,
dictionary_memo, &type_offset));
auto fb_children = fbb.CreateVector(children);
auto fb_layout = fbb.CreateVector(layout);

DictionaryOffset dictionary = 0;
if (field.type()->id() == Type::DICTIONARY) {
Expand All @@ -560,7 +530,7 @@ static Status FieldToFlatbuffer(FBB& fbb, const Field& field,

// TODO: produce the list of VectorTypes
*offset = flatbuf::CreateField(fbb, fb_name, field.nullable(), type_enum, type_offset,
dictionary, fb_children, fb_layout);
dictionary, fb_children);

return Status::OK();
}
Expand Down
37 changes: 0 additions & 37 deletions cpp/src/arrow/type.cc
Original file line number Diff line number Diff line change
Expand Up @@ -474,43 +474,6 @@ std::shared_ptr<DataType> decimal(int32_t precision, int32_t scale) {
return std::make_shared<Decimal128Type>(precision, scale);
}

static const BufferDescr kValidityBuffer(BufferType::VALIDITY, 1);
static const BufferDescr kOffsetBuffer(BufferType::OFFSET, 32);
static const BufferDescr kTypeBuffer(BufferType::TYPE, 32);
static const BufferDescr kBooleanBuffer(BufferType::DATA, 1);
static const BufferDescr kValues64(BufferType::DATA, 64);
static const BufferDescr kValues32(BufferType::DATA, 32);
static const BufferDescr kValues16(BufferType::DATA, 16);
static const BufferDescr kValues8(BufferType::DATA, 8);

std::vector<BufferDescr> FixedWidthType::GetBufferLayout() const {
return {kValidityBuffer, BufferDescr(BufferType::DATA, bit_width())};
}

std::vector<BufferDescr> NullType::GetBufferLayout() const { return {}; }

std::vector<BufferDescr> BinaryType::GetBufferLayout() const {
return {kValidityBuffer, kOffsetBuffer, kValues8};
}

std::vector<BufferDescr> FixedSizeBinaryType::GetBufferLayout() const {
return {kValidityBuffer, BufferDescr(BufferType::DATA, bit_width())};
}

std::vector<BufferDescr> ListType::GetBufferLayout() const {
return {kValidityBuffer, kOffsetBuffer};
}

std::vector<BufferDescr> StructType::GetBufferLayout() const { return {kValidityBuffer}; }

std::vector<BufferDescr> UnionType::GetBufferLayout() const {
if (mode_ == UnionMode::SPARSE) {
return {kValidityBuffer, kTypeBuffer};
} else {
return {kValidityBuffer, kTypeBuffer, kOffsetBuffer};
}
}

std::string Decimal128Type::ToString() const {
std::stringstream s;
s << "decimal(" << precision_ << ", " << scale_ << ")";
Expand Down
30 changes: 0 additions & 30 deletions cpp/src/arrow/type.h
Original file line number Diff line number Diff line change
Expand Up @@ -133,20 +133,6 @@ struct Type {
};
};

enum class BufferType : char { DATA, OFFSET, TYPE, VALIDITY };

class BufferDescr {
public:
BufferDescr(BufferType type, int bit_width) : type_(type), bit_width_(bit_width) {}

BufferType type() const { return type_; }
int bit_width() const { return bit_width_; }

private:
BufferType type_;
int bit_width_;
};

class ARROW_EXPORT DataType {
public:
explicit DataType(Type::type id) : id_(id) {}
Expand Down Expand Up @@ -176,8 +162,6 @@ class ARROW_EXPORT DataType {
/// \since 0.7.0
virtual std::string name() const = 0;

virtual std::vector<BufferDescr> GetBufferLayout() const = 0;

Type::type id() const { return id_; }

protected:
Expand All @@ -201,8 +185,6 @@ class ARROW_EXPORT FixedWidthType : public DataType {
using DataType::DataType;

virtual int bit_width() const = 0;

std::vector<BufferDescr> GetBufferLayout() const override;
};

class ARROW_EXPORT PrimitiveCType : public FixedWidthType {
Expand Down Expand Up @@ -319,8 +301,6 @@ class ARROW_EXPORT NullType : public DataType, public NoExtraMeta {
std::string ToString() const override;

std::string name() const override { return "null"; }

std::vector<BufferDescr> GetBufferLayout() const override;
};

class ARROW_EXPORT BooleanType : public FixedWidthType, public NoExtraMeta {
Expand Down Expand Up @@ -425,8 +405,6 @@ class ARROW_EXPORT ListType : public NestedType {
std::string ToString() const override;

std::string name() const override { return "list"; }

std::vector<BufferDescr> GetBufferLayout() const override;
};

// BinaryType type is represents lists of 1-byte values.
Expand All @@ -440,8 +418,6 @@ class ARROW_EXPORT BinaryType : public DataType, public NoExtraMeta {
std::string ToString() const override;
std::string name() const override { return "binary"; }

std::vector<BufferDescr> GetBufferLayout() const override;

protected:
// Allow subclasses to change the logical type.
explicit BinaryType(Type::type logical_type) : DataType(logical_type) {}
Expand All @@ -461,8 +437,6 @@ class ARROW_EXPORT FixedSizeBinaryType : public FixedWidthType, public Parametri
std::string ToString() const override;
std::string name() const override { return "fixed_size_binary"; }

std::vector<BufferDescr> GetBufferLayout() const override;

int32_t byte_width() const { return byte_width_; }
int bit_width() const override;

Expand Down Expand Up @@ -494,8 +468,6 @@ class ARROW_EXPORT StructType : public NestedType {
Status Accept(TypeVisitor* visitor) const override;
std::string ToString() const override;
std::string name() const override { return "struct"; }

std::vector<BufferDescr> GetBufferLayout() const override;
};

class ARROW_EXPORT DecimalType : public FixedSizeBinaryType {
Expand Down Expand Up @@ -541,8 +513,6 @@ class ARROW_EXPORT UnionType : public NestedType {
std::string name() const override { return "union"; }
Status Accept(TypeVisitor* visitor) const override;

std::vector<BufferDescr> GetBufferLayout() const override;

const std::vector<uint8_t>& type_codes() const { return type_codes_; }

UnionMode::type mode() const { return mode_; }
Expand Down
31 changes: 1 addition & 30 deletions format/Schema.fbs
Original file line number Diff line number Diff line change
Expand Up @@ -211,32 +211,6 @@ union Type {
Map
}

/// ----------------------------------------------------------------------
/// The possible types of a vector

enum VectorType: short {
/// used in List type, Dense Union and variable length primitive types (String, Binary)
OFFSET,
/// actual data, either wixed width primitive types in slots or variable width delimited by an OFFSET vector
DATA,
/// Bit vector indicating if each value is null
VALIDITY,
/// Type vector used in Union type
TYPE
}

/// ----------------------------------------------------------------------
/// represents the physical layout of a buffer
/// buffers have fixed width slots of a given type

table VectorLayout {
/// the width of a slot in the buffer (typically 1, 8, 16, 32 or 64)
bit_width: short;
/// the purpose of the vector
type: VectorType;
}


/// ----------------------------------------------------------------------
/// user defined key value pairs to add custom metadata to arrow
/// key namespacing is the responsibility of the user
Expand Down Expand Up @@ -285,10 +259,7 @@ table Field {

// children apply only to Nested data types like Struct, List and Union
children: [Field];
/// layout of buffers produced for this type (as derived from the Type)
/// does not include children
/// each recordbatch will return instances of those Buffers.
layout: [ VectorLayout ];

// User-defined metadata
custom_metadata: [ KeyValue ];
}
Expand Down
28 changes: 6 additions & 22 deletions integration/data/simple.json
Original file line number Diff line number Diff line change
Expand Up @@ -4,36 +4,20 @@
{
"name": "foo",
"type": {"name": "int", "isSigned": true, "bitWidth": 32},
"nullable": true, "children": [],
"typeLayout": {
"vectors": [
{"type": "VALIDITY", "typeBitWidth": 1},
{"type": "DATA", "typeBitWidth": 32}
]
}
"nullable": true,
"children": []
},
{
"name": "bar",
"type": {"name": "floatingpoint", "precision": "DOUBLE"},
"nullable": true, "children": [],
"typeLayout": {
"vectors": [
{"type": "VALIDITY", "typeBitWidth": 1},
{"type": "DATA", "typeBitWidth": 64}
]
}
"nullable": true,
"children": []
},
{
"name": "baz",
"type": {"name": "utf8"},
"nullable": true, "children": [],
"typeLayout": {
"vectors": [
{"type": "VALIDITY", "typeBitWidth": 1},
{"type": "OFFSET", "typeBitWidth": 32},
{"type": "DATA", "typeBitWidth": 8}
]
}
"nullable": true,
"children": []
}
]
},
Expand Down
Loading

0 comments on commit 611a4b9

Please sign in to comment.