From 6926d11e6c15f31600ef1cc5550008a6fc0952a2 Mon Sep 17 00:00:00 2001 From: Naveen Michaud-Agrawal Date: Thu, 1 Feb 2018 19:33:39 -0500 Subject: [PATCH 1/3] Handle validity bitmap in array data --- packages/perspective/src/cpp/main.cpp | 281 ++++++++++++--------- packages/perspective/src/js/perspective.js | 19 +- 2 files changed, 168 insertions(+), 132 deletions(-) diff --git a/packages/perspective/src/cpp/main.cpp b/packages/perspective/src/cpp/main.cpp index 942ac2fe4e..4f2ec9d357 100644 --- a/packages/perspective/src/cpp/main.cpp +++ b/packages/perspective/src/cpp/main.cpp @@ -143,24 +143,82 @@ _get_aggspecs(val j_aggs) */ -void -vecFromTypedArray(const val &typedArray, void* data, t_int32 length) { - val memory = val::module_property("buffer"); - val memoryView = typedArray["constructor"].new_(memory, reinterpret_cast(data), length); - memoryView.call("set", typedArray.call("slice", 0, length)); -} +namespace arrow { + + void + vecFromTypedArray(const val &typedArray, void* data, t_int32 length) { + val memory = val::module_property("buffer"); + val memoryView = typedArray["constructor"].new_(memory, reinterpret_cast(data), length); + memoryView.call("set", typedArray.call("slice", 0, length)); + } + + void + fill_col_valid(val dcol, t_col_sptr col) + { + //dcol should be the Uint8Array containing the null bitmap + t_uindex nrows = col->size(); + // arrow packs bools into a bitmap + for (auto i = 0; i < nrows; ++i) + { + t_uint8 elem = dcol[i / 8].as(); + t_bool v = elem & (1 << (i % 8)); + col->set_valid(i, v); + } + } + + template + void + fill_col_dict(t_uint32 nrows, val dcol, val vkeys, t_col_sptr col) + { + t_int32 ksize = vkeys["length"].as(); + std::vector keys; + keys.reserve(ksize); + keys.resize(ksize); + vecFromTypedArray(vkeys, keys.data(), ksize); + + val values = dcol["data"]["values"]; + val vdata = values["data"]; + t_int32 vsize = vdata["length"].as(); + std::vector data; + data.reserve(vsize); + data.resize(vsize); + vecFromTypedArray(vdata, data.data(), vsize); + + val voffsets = values["offsets"]; + t_int32 osize = voffsets["length"].as(); + std::vector offsets; + offsets.reserve(osize); + offsets.resize(osize); + vecFromTypedArray(voffsets, offsets.data(), osize); + + t_str elem; + + for (t_int32 i = 0; i < nrows; ++i) { + T idx = keys[i]; + + if (idx == -1) { + col->clear(i); + } else { + t_int32 bidx = offsets[idx]; + std::size_t s = offsets[idx+1] - bidx; + elem.assign(reinterpret_cast(data.data())+bidx, s); + col->set_nth(i, elem); + } + } + } +} template void -_fill_col(val dcol, t_col_sptr col) +_fill_col(val dcol, t_col_sptr col, t_bool is_arrow) { t_uindex nrows = col->size(); - if (!dcol["buffer"].isUndefined()) { + if (is_arrow) { + val data = dcol["data"]; t_lstore* lstore = col->_get_data_lstore(); - vecFromTypedArray(dcol, lstore->get_ptr(0), nrows); - col->valid_raw_fill(true); + arrow::vecFromTypedArray(data, lstore->get_ptr(0), nrows); } else { for (auto i = 0; i < nrows; ++i) { @@ -172,31 +230,31 @@ _fill_col(val dcol, t_col_sptr col) template<> void -_fill_col(val dcol, t_col_sptr col) +_fill_col(val dcol, t_col_sptr col, t_bool is_arrow) { t_uindex nrows = col->size(); - if (dcol["constructor"]["name"].as() == "Int32Array") { + if (is_arrow) { + val data = dcol["data"]; t_lstore* lstore = col->_get_data_lstore(); // arrow packs 64 bit into two 32 bit ints - vecFromTypedArray(dcol, lstore->get_ptr(0), nrows * 2); - col->valid_raw_fill(true); + arrow::vecFromTypedArray(data, lstore->get_ptr(0), nrows * 2); } else { - throw std::logic_error("Unreachable"); + throw std::logic_error("Unreachable - can't have DTYPE_INT64 column from non-arrow data"); } } template<> void -_fill_col(val dcol, t_col_sptr col) +_fill_col(val dcol, t_col_sptr col, t_bool is_arrow) { t_uindex nrows = col->size(); - if (dcol["constructor"]["name"].as() == "Uint32Array") { + if (is_arrow) { + val data = dcol["data"]; t_lstore* lstore = col->_get_data_lstore(); // arrow packs 64 bit into two 32 bit ints - vecFromTypedArray(dcol, lstore->get_ptr(0), nrows*2); - col->valid_raw_fill(true); + arrow::vecFromTypedArray(data, lstore->get_ptr(0), nrows*2); } else { for (auto i = 0; i < nrows; ++i) { @@ -208,15 +266,16 @@ _fill_col(val dcol, t_col_sptr col) template<> void -_fill_col(val dcol, t_col_sptr col) +_fill_col(val dcol, t_col_sptr col, t_bool is_arrow) { t_uindex nrows = col->size(); - if (dcol["constructor"]["name"].as() == "Uint8Array") { + if (is_arrow) { // arrow packs bools into a bitmap + val data = dcol["data"]; for (auto i = 0; i < nrows; ++i) { - t_uint8 elem = dcol[i / 8].as(); + t_uint8 elem = data[i / 8].as(); t_bool v = elem & (1 << (i % 8)); col->set_nth(i, v); } @@ -230,99 +289,59 @@ _fill_col(val dcol, t_col_sptr col) } -template -void -_fill_col_dict(t_uint32 nrows, val dcol, val vkeys, t_col_sptr col) -{ - t_int32 ksize = vkeys["length"].as(); - std::vector keys; - keys.reserve(ksize); - keys.resize(ksize); - vecFromTypedArray(vkeys, keys.data(), ksize); - - val values = dcol["data"]["values"]; - val vdata = values["data"]; - t_int32 vsize = vdata["length"].as(); - std::vector data; - data.reserve(vsize); - data.resize(vsize); - vecFromTypedArray(vdata, data.data(), vsize); - - val voffsets = values["offsets"]; - t_int32 osize = voffsets["length"].as(); - std::vector offsets; - offsets.reserve(osize); - offsets.resize(osize); - vecFromTypedArray(voffsets, offsets.data(), osize); - - t_str elem; - - for (t_int32 i = 0; i < nrows; ++i) { - T idx = keys[i]; - - if (idx == -1) { - col->clear(i); - } else { - t_int32 bidx = offsets[idx]; - std::size_t s = offsets[idx+1] - bidx; - elem.assign(reinterpret_cast(data.data())+bidx, s); - col->set_nth(i, elem); - } - } -} - template<> void -_fill_col(val dcol, t_col_sptr col) +_fill_col(val dcol, t_col_sptr col, t_bool is_arrow) { t_uindex nrows = col->size(); - if (dcol["constructor"]["name"].as() == "DictionaryVector") { - val vkeys = dcol["keys"]["data"]; - - auto width = vkeys["constructor"]["BYTES_PER_ELEMENT"].as(); - switch (width) { - case 1: - _fill_col_dict(nrows, dcol, vkeys, col); - break; - case 2: - _fill_col_dict(nrows, dcol, vkeys, col); - break; - case 4: - _fill_col_dict(nrows, dcol, vkeys, col); - break; - default: - break; - } - - } else if (dcol["constructor"]["name"].as() == "Utf8Vector") { - val values = dcol["values"]; - - val vdata = values["data"]; - t_int32 vsize = vdata["length"].as(); - std::vector data; - data.reserve(vsize); - data.resize(vsize); - vecFromTypedArray(vdata, data.data(), vsize); - - val voffsets = values["offsets"]; - t_int32 osize = voffsets["length"].as(); - std::vector offsets; - offsets.reserve(osize); - offsets.resize(osize); - vecFromTypedArray(voffsets, offsets.data(), osize); - - t_str elem; - - for (t_int32 i = 0; i < nrows; ++i) { - t_int32 bidx = offsets[i]; - std::size_t es = offsets[i+1] - bidx; - if (es > 0) { - elem.assign(reinterpret_cast(data.data())+bidx, es); - col->set_nth(i, elem); - } else { - col->clear(i); + if (is_arrow) { + if (dcol["constructor"]["name"].as() == "DictionaryVector") { + val vkeys = dcol["keys"]["data"]; + + auto width = vkeys["constructor"]["BYTES_PER_ELEMENT"].as(); + switch (width) { + case 1: + arrow::fill_col_dict(nrows, dcol, vkeys, col); + break; + case 2: + arrow::fill_col_dict(nrows, dcol, vkeys, col); + break; + case 4: + arrow::fill_col_dict(nrows, dcol, vkeys, col); + break; + default: + break; + } + } else if (dcol["constructor"]["name"].as() == "Utf8Vector") { + val values = dcol["values"]; + + val vdata = values["data"]; + t_int32 vsize = vdata["length"].as(); + std::vector data; + data.reserve(vsize); + data.resize(vsize); + arrow::vecFromTypedArray(vdata, data.data(), vsize); + + val voffsets = values["offsets"]; + t_int32 osize = voffsets["length"].as(); + std::vector offsets; + offsets.reserve(osize); + offsets.resize(osize); + arrow::vecFromTypedArray(voffsets, offsets.data(), osize); + + t_str elem; + + for (t_int32 i = 0; i < nrows; ++i) { + t_int32 bidx = offsets[i]; + std::size_t es = offsets[i+1] - bidx; + if (es > 0) { + elem.assign(reinterpret_cast(data.data())+bidx, es); + col->set_nth(i, elem); + } else { + col->clear(i); + } } } } else { @@ -350,7 +369,8 @@ _fill_data(t_table_sptr tbl, t_svec ocolnames, val j_data, std::vector odt, - t_uint32 offset) + t_uint32 offset, + t_bool is_arrow) { std::vector data_cols = vecFromJSArray(j_data); for (auto cidx = 0; cidx < ocolnames.size(); ++cidx) @@ -364,52 +384,68 @@ _fill_data(t_table_sptr tbl, { case DTYPE_INT8: { - _fill_col(dcol, col); + _fill_col(dcol, col, is_arrow); } break; case DTYPE_INT16: { - _fill_col(dcol, col); + _fill_col(dcol, col, is_arrow); } break; case DTYPE_INT32: { - _fill_col(dcol, col); + _fill_col(dcol, col, is_arrow); } break; case DTYPE_INT64: { - _fill_col(dcol, col); + _fill_col(dcol, col, is_arrow); } break; case DTYPE_BOOL: { - _fill_col(dcol, col); + _fill_col(dcol, col, is_arrow); } break; case DTYPE_FLOAT32: { - _fill_col(dcol, col); + _fill_col(dcol, col, is_arrow); } break; case DTYPE_FLOAT64: { - _fill_col(dcol, col); + _fill_col(dcol, col, is_arrow); } break; case DTYPE_TIME: { - _fill_col(dcol, col); + _fill_col(dcol, col, is_arrow); } break; case DTYPE_STR: { - _fill_col(dcol, col); + _fill_col(dcol, col, is_arrow); } break; default: break; } + if (is_arrow) { + // Fill validity bitmap + t_uint32 null_count = dcol["nullCount"].as(); + + if (null_count == 0) { + col->valid_raw_fill(true); + } else { + val validity = dcol; + if (dcol["constructor"]["name"].as() == "Utf8Vector") { + validity = dcol["values"]["validity"]["data"]; + } else { + validity = dcol["validity"]["data"]; + } + arrow::fill_col_valid(validity, col); + } + } } } @@ -439,7 +475,8 @@ make_table( val j_data, t_uint32 offset, t_str index, - t_dtype tindex + t_dtype tindex, + t_bool is_arrow ) { // Create the input and port schemas t_svec colnames = vecFromJSArray(j_colnames); @@ -451,7 +488,7 @@ make_table( tbl->init(); tbl->extend(size); - _fill_data(tbl, colnames, j_data, dtypes, offset); + _fill_data(tbl, colnames, j_data, dtypes, offset, is_arrow); // Set up pkey and op columns auto op_col = tbl->add_column("psp_op", DTYPE_UINT8, false); @@ -700,7 +737,7 @@ scalar_to_val(const t_tscalvec& scalars, t_uint32 idx) } case DTYPE_NONE: { - return val::undefined(); + return val::null(); } case DTYPE_STR: default: diff --git a/packages/perspective/src/js/perspective.js b/packages/perspective/src/js/perspective.js index 1656600cf9..cf8ad890f9 100644 --- a/packages/perspective/src/js/perspective.js +++ b/packages/perspective/src/js/perspective.js @@ -261,6 +261,7 @@ function parse_data(data, names, types) { return { row_count: row_count, + is_arrow: false, names: names, types: types, cdata: cdata @@ -323,19 +324,13 @@ function load_arrow_buffer(data, names, types) { continue; break; } - switch (column.type) { - case 'Utf8': - cdata.push(column); - break; - default: - cdata.push(column.slice()); - break; - } + cdata.push(column); names.push(column.name); } return { row_count: arrow.length, + is_arrow: true, names: names, types: types, cdata: cdata @@ -979,7 +974,9 @@ table.prototype.update = function (data) { let tbl; try { - tbl = __MODULE__.make_table(pdata.row_count || 0, pdata.names, pdata.types, pdata.cdata, this.gnode.get_table().size(), this.index || "", this.tindex); + tbl = __MODULE__.make_table(pdata.row_count || 0, + pdata.names, pdata.types, pdata.cdata, this.gnode.get_table().size(), this.index || "", this.tindex, + pdata.is_arrow); __MODULE__.fill(this.pool, this.gnode, tbl); this.initialized = true; } catch (e) { @@ -1215,7 +1212,9 @@ const perspective = { pool = new __MODULE__.t_pool({_update_callback: function() {} } ); // Fill t_table with data - tbl = __MODULE__.make_table(pdata.row_count || 0, pdata.names, pdata.types, pdata.cdata, 0, options.index, tindex); + tbl = __MODULE__.make_table(pdata.row_count || 0, + pdata.names, pdata.types, pdata.cdata, 0, options.index, tindex, + pdata.is_arrow); gnode = __MODULE__.make_gnode(tbl); pool.register_gnode(gnode); From 83751ee426fe506c934e599aa1f9f04cab5916bc Mon Sep 17 00:00:00 2001 From: Naveen Michaud-Agrawal Date: Thu, 1 Feb 2018 22:46:37 -0500 Subject: [PATCH 2/3] Add arrow test for null data --- packages/perspective/test/arrow/test-null.arrow | Bin 0 -> 4994 bytes packages/perspective/test/js/constructors.js | 6 +++--- packages/perspective/test/js/pivots.js | 8 ++++---- 3 files changed, 7 insertions(+), 7 deletions(-) create mode 100644 packages/perspective/test/arrow/test-null.arrow diff --git a/packages/perspective/test/arrow/test-null.arrow b/packages/perspective/test/arrow/test-null.arrow new file mode 100644 index 0000000000000000000000000000000000000000..d4bc34af2d52872eeb2480334362cab91797e54d GIT binary patch literal 4994 zcmeHLF^C&S6rI!R&O(UfIGk{p5X5qYF^6+L3vsxyOj3kkh%pWcI7Zo%R`v#I_mEa5 z_E|V(iWDi+1ddcLQlxN^@<>vIR4zhrnKFSCA=On;-kX^(uV(jDyU>W)+5i5Zd9#21 z{N2&Dn>XM8@RCxhb4ICkv^DjLT2rg4qVI88@ieYf6*{HL&{Wa>UFJR1sKhI{QpbKj zTisK%hkg`y!_V5Cq(2_SBmYYM_MPpzZ^X9Ng$}C4Lq84%AyyGGR=4lK*?>5jFcp=SR0gJ}+BpS|f zXmk<9ev?^E*gT zhpQ8$;oW2ybpq{%c|I(X>p*Ot@t`fe^j2!D&kI%mowD7B%^0a;ULkWtt57#kcwt*q|hx`WDbw@^7V=39=%IkNpn^Q#r& zEZu=FK`T&vi=#FfllmA^u_#2Z{R*V z<|B=pxxjrY!UMc_itQ6?OD_5a$wS>g#r8LATNm5QyXsqOy9Lbkdj-C<{#a>ApJqe- zZu)q9#_YU*)ObnHqX#>#$=TVO7=PF`@#@fK`4~A|J;o2$na_eZn3iyPib`{q8W!5T zYHONal{`fk)~TG8>TGFs7;3z#Hqk>~#n*vlmxv~$J+1I@S)`RvA%=*aEj8tAp^nSQ zu_YXW{Z5xQ*P(dvT(8eRm*;7&76;H}e*S&nod2rTvUx3YFY}XD=3Tcof}gjtv^O06 zM~?n;NB@PDW&9&Y|HR6iE6We2k%{?95#0Jyj+8J>=HeIBMsc z3jsTR`O{mv{qwJeZr@qIqT8e9Rkn=L8g?X;5nm%rZ+wbdxH2Bd#bMsH!E?hpq(#sc z-X7psz#_3;*Ag$w<3R4@QS_DGd@nDL e1NysbOXua~ae(h>{Jrtg9|w-`UcILOd;1@&e#UJ8 literal 0 HcmV?d00001 diff --git a/packages/perspective/test/js/constructors.js b/packages/perspective/test/js/constructors.js index f9efd208e8..38e816c265 100644 --- a/packages/perspective/test/js/constructors.js +++ b/packages/perspective/test/js/constructors.js @@ -45,14 +45,14 @@ var meta_3 = { 'z': "boolean" }; -import arrow_buffer from "../arrow/test.arrow"; +import arrow from "../arrow/test-null.arrow"; var arrow_result = [ {"f32": 1.5, "f64": 1.5, "i64": 1, "i32": 1, "i16": 1, "i8": 1, "bool": true, "char": "a", "dict": "a", "datetime": +(new Date("2018-01-25"))}, {"f32": 2.5, "f64": 2.5, "i64": 2, "i32": 2, "i16": 2, "i8": 2, "bool": false, "char": "b", "dict": "b", "datetime": +(new Date("2018-01-26"))}, {"f32": 3.5, "f64": 3.5, "i64": 3, "i32": 3, "i16": 3, "i8": 3, "bool": true, "char": "c", "dict": "c", "datetime": +(new Date("2018-01-27"))}, {"f32": 4.5, "f64": 4.5, "i64": 4, "i32": 4, "i16": 4, "i8": 4, "bool": false, "char": "d", "dict": "d", "datetime": +(new Date("2018-01-28"))}, - {"f32": 5.5, "f64": 5.5, "i64": 5, "i32": 5, "i16": 5, "i8": 5, "bool": true, "char": "d", "dict": "d", "datetime": +(new Date("2018-01-29"))} + {"f32": null, "f64": null, "i64": null, "i32": null, "i16": null, "i8": null, "bool": null, "char": null, "dict": null, "datetime": null} ]; var dt = new Date(); @@ -107,7 +107,7 @@ module.exports = (perspective) => { }); it("Arrow constructor", async function () { - var table = perspective.table(arrow_buffer); + var table = perspective.table(arrow); var view = table.view(); let result = await view.to_json(); expect(arrow_result).toEqual(result); diff --git a/packages/perspective/test/js/pivots.js b/packages/perspective/test/js/pivots.js index d3c36abcc9..d892d2c3a0 100644 --- a/packages/perspective/test/js/pivots.js +++ b/packages/perspective/test/js/pivots.js @@ -188,10 +188,10 @@ module.exports = (perspective) => { }); var answer = [ {"__ROW_PATH__":[],"a,x":1,"a,y":1,"a,z":1,"b,x":1,"b,y":1,"b,z":1,"c,x":1,"c,y":1,"c,z":1,"d,x":1,"d,y":1,"d,z":1}, - {"__ROW_PATH__":[1],"a,x":1,"a,y":1,"a,z":1,"b,x":undefined,"b,y":undefined,"b,z":undefined,"c,x":undefined,"c,y":undefined,"c,z":undefined,"d,x":undefined,"d,y":undefined,"d,z":undefined}, - {"__ROW_PATH__":[2],"a,x":undefined,"a,y":undefined,"a,z":undefined,"b,x":1,"b,y":1,"b,z":1,"c,x":undefined,"c,y":undefined,"c,z":undefined,"d,x":undefined,"d,y":undefined,"d,z":undefined}, - {"__ROW_PATH__":[3],"a,x":undefined,"a,y":undefined,"a,z":undefined,"b,x":undefined,"b,y":undefined,"b,z":undefined,"c,x":1,"c,y":1,"c,z":1,"d,x":undefined,"d,y":undefined,"d,z":undefined}, - {"__ROW_PATH__":[4],"a,x":undefined,"a,y":undefined,"a,z":undefined,"b,x":undefined,"b,y":undefined,"b,z":undefined,"c,x":undefined,"c,y":undefined,"c,z":undefined,"d,x":1,"d,y":1,"d,z":1} + {"__ROW_PATH__":[1],"a,x":1,"a,y":1,"a,z":1,"b,x":null,"b,y":null,"b,z":null,"c,x":null,"c,y":null,"c,z":null,"d,x":null,"d,y":null,"d,z":null}, + {"__ROW_PATH__":[2],"a,x":null,"a,y":null,"a,z":null,"b,x":1,"b,y":1,"b,z":1,"c,x":null,"c,y":null,"c,z":null,"d,x":null,"d,y":null,"d,z":null}, + {"__ROW_PATH__":[3],"a,x":null,"a,y":null,"a,z":null,"b,x":null,"b,y":null,"b,z":null,"c,x":1,"c,y":1,"c,z":1,"d,x":null,"d,y":null,"d,z":null}, + {"__ROW_PATH__":[4],"a,x":null,"a,y":null,"a,z":null,"b,x":null,"b,y":null,"b,z":null,"c,x":null,"c,y":null,"c,z":null,"d,x":1,"d,y":1,"d,z":1} ]; let result2 = await view.to_json(); expect(answer).toEqual(result2); From 67efb48abb3d2db84c52b7f7ce72e8be8ae1f174 Mon Sep 17 00:00:00 2001 From: Naveen Michaud-Agrawal Date: Thu, 1 Feb 2018 23:14:35 -0500 Subject: [PATCH 3/3] Use column api to get access to underlying memory --- packages/perspective/src/cpp/main.cpp | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/packages/perspective/src/cpp/main.cpp b/packages/perspective/src/cpp/main.cpp index 4f2ec9d357..862f8772bd 100644 --- a/packages/perspective/src/cpp/main.cpp +++ b/packages/perspective/src/cpp/main.cpp @@ -217,8 +217,7 @@ _fill_col(val dcol, t_col_sptr col, t_bool is_arrow) if (is_arrow) { val data = dcol["data"]; - t_lstore* lstore = col->_get_data_lstore(); - arrow::vecFromTypedArray(data, lstore->get_ptr(0), nrows); + arrow::vecFromTypedArray(data, col->get_nth(0), nrows); } else { for (auto i = 0; i < nrows; ++i) { @@ -236,9 +235,8 @@ _fill_col(val dcol, t_col_sptr col, t_bool is_arrow) if (is_arrow) { val data = dcol["data"]; - t_lstore* lstore = col->_get_data_lstore(); // arrow packs 64 bit into two 32 bit ints - arrow::vecFromTypedArray(data, lstore->get_ptr(0), nrows * 2); + arrow::vecFromTypedArray(data, col->get_nth(0), nrows * 2); } else { throw std::logic_error("Unreachable - can't have DTYPE_INT64 column from non-arrow data"); } @@ -252,9 +250,8 @@ _fill_col(val dcol, t_col_sptr col, t_bool is_arrow) if (is_arrow) { val data = dcol["data"]; - t_lstore* lstore = col->_get_data_lstore(); // arrow packs 64 bit into two 32 bit ints - arrow::vecFromTypedArray(data, lstore->get_ptr(0), nrows*2); + arrow::vecFromTypedArray(data, col->get_nth(0), nrows*2); } else { for (auto i = 0; i < nrows; ++i) {