Skip to content

Commit

Permalink
ARROW-6232: [C++] Rename Argsort kernel to SortToIndices
Browse files Browse the repository at this point in the history
"Argsort" is NumPy specific name. Other languages/libraries use
different name:

  * R: order
    * https://cran.r-project.org/doc/manuals/r-release/fullrefman.pdf#Rfn.order

  * MATLAB: sort
    * https://mathworks.com/help/matlab/ref/sort.html
    * "sort" returns sorted array and indices to sort array

  * Julia: sortperm
    * https://pkg.julialang.org/docs/julia/THl1k/1.1.1/base/sort.html#Base.sortperm

It's better that we use general name because Arrow C++ isn't a NumPy
compatible library.

"SortToIndices" means "sort that returns indices array".

Closes apache#5080 from kou/cpp-argsort-to-sort-indicies and squashes the following commits:

2c26473 <Sutou Kouhei>  Rename Argsort kernel to SortToIndices

Authored-by: Sutou Kouhei <[email protected]>
Signed-off-by: Sutou Kouhei <[email protected]>
  • Loading branch information
kou committed Aug 23, 2019
1 parent dfe34c7 commit af64b7b
Show file tree
Hide file tree
Showing 6 changed files with 96 additions and 90 deletions.
2 changes: 1 addition & 1 deletion cpp/src/arrow/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -186,11 +186,11 @@ if(ARROW_COMPUTE)
compute/kernels/hash.cc
compute/kernels/filter.cc
compute/kernels/mean.cc
compute/kernels/sort_to_indices.cc
compute/kernels/sum.cc
compute/kernels/take.cc
compute/kernels/isin.cc
compute/kernels/util_internal.cc
compute/kernels/argsort.cc
compute/operations/cast.cc
compute/operations/literal.cc)
endif()
Expand Down
4 changes: 2 additions & 2 deletions cpp/src/arrow/compute/kernels/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -20,10 +20,10 @@ arrow_install_all_headers("arrow/compute/kernels")
add_arrow_test(boolean_test PREFIX "arrow-compute")
add_arrow_test(cast_test PREFIX "arrow-compute")
add_arrow_test(hash_test PREFIX "arrow-compute")
add_arrow_test(argsort_test PREFIX "arrow-compute")
add_arrow_test(isin_test PREFIX "arrow-compute")
add_arrow_test(sort_to_indices_test PREFIX "arrow-compute")
add_arrow_test(util_internal_test PREFIX "arrow-compute")
add_arrow_benchmark(argsort_benchmark PREFIX "arrow-compute")
add_arrow_benchmark(sort_to_indices_benchmark PREFIX "arrow-compute")

# Aggregates
add_arrow_test(aggregate_test PREFIX "arrow-compute")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,46 +15,49 @@
// specific language governing permissions and limitations
// under the License.

#include "arrow/compute/kernels/argsort.h"
#include "arrow/compute/kernels/sort_to_indices.h"

#include <algorithm>
#include <numeric>
#include <vector>

#include "arrow/builder.h"
#include "arrow/compute/context.h"
#include "arrow/compute/expression.h"
#include "arrow/compute/logical_type.h"
#include "arrow/type_traits.h"

namespace arrow {

class Array;

namespace compute {

/// \brief UnaryKernel implementing Argsort operation
class ARROW_EXPORT ArgsortKernel : public UnaryKernel {
/// \brief UnaryKernel implementing SortToIndices operation
class ARROW_EXPORT SortToIndicesKernel : public UnaryKernel {
protected:
std::shared_ptr<DataType> type_;

public:
/// \brief UnaryKernel interface
///
/// delegates to subclasses via Argsort()
/// delegates to subclasses via SortToIndices()
Status Call(FunctionContext* ctx, const Datum& values, Datum* offsets) override = 0;

/// \brief output type of this kernel
std::shared_ptr<DataType> out_type() const override { return uint64(); }

/// \brief single-array implementation
virtual Status Argsort(FunctionContext* ctx, const std::shared_ptr<Array>& values,
std::shared_ptr<Array>* offsets) = 0;
virtual Status SortToIndices(FunctionContext* ctx, const std::shared_ptr<Array>& values,
std::shared_ptr<Array>* offsets) = 0;

/// \brief factory for ArgsortKernel
/// \brief factory for SortToIndicesKernel
///
/// \param[in] value_type constructed ArgsortKernel will support sorting
/// \param[in] value_type constructed SortToIndicesKernel will support sorting
/// values of this type
/// \param[out] out created kernel
static Status Make(const std::shared_ptr<DataType>& value_type,
std::unique_ptr<ArgsortKernel>* out);
std::unique_ptr<SortToIndicesKernel>* out);
};

template <typename ArrayType>
Expand All @@ -68,24 +71,24 @@ bool CompareViews(const ArrayType& array, uint64_t lhs, uint64_t rhs) {
}

template <typename ArrowType, typename Comparator>
class ArgsortKernelImpl : public ArgsortKernel {
class SortToIndicesKernelImpl : public SortToIndicesKernel {
using ArrayType = typename TypeTraits<ArrowType>::ArrayType;

public:
explicit ArgsortKernelImpl(Comparator compare) : compare_(compare) {}
explicit SortToIndicesKernelImpl(Comparator compare) : compare_(compare) {}

Status Argsort(FunctionContext* ctx, const std::shared_ptr<Array>& values,
std::shared_ptr<Array>* offsets) {
return ArgsortImpl(ctx, std::static_pointer_cast<ArrayType>(values), offsets);
Status SortToIndices(FunctionContext* ctx, const std::shared_ptr<Array>& values,
std::shared_ptr<Array>* offsets) {
return SortToIndicesImpl(ctx, std::static_pointer_cast<ArrayType>(values), offsets);
}

Status Call(FunctionContext* ctx, const Datum& values, Datum* offsets) {
if (!values.is_array()) {
return Status::Invalid("ArgsortKernel expects array values");
return Status::Invalid("SortToIndicesKernel expects array values");
}
auto values_array = values.make_array();
std::shared_ptr<Array> offsets_array;
RETURN_NOT_OK(this->Argsort(ctx, values_array, &offsets_array));
RETURN_NOT_OK(this->SortToIndices(ctx, values_array, &offsets_array));
*offsets = offsets_array;
return Status::OK();
}
Expand All @@ -95,8 +98,8 @@ class ArgsortKernelImpl : public ArgsortKernel {
private:
Comparator compare_;

Status ArgsortImpl(FunctionContext* ctx, const std::shared_ptr<ArrayType>& values,
std::shared_ptr<Array>* offsets) {
Status SortToIndicesImpl(FunctionContext* ctx, const std::shared_ptr<ArrayType>& values,
std::shared_ptr<Array>* offsets) {
std::shared_ptr<Buffer> indices_buf;
int64_t buf_size = values->length() * sizeof(uint64_t);
RETURN_NOT_OK(AllocateBuffer(ctx->memory_pool(), buf_size, &indices_buf));
Expand All @@ -121,49 +124,50 @@ class ArgsortKernelImpl : public ArgsortKernel {
};

template <typename ArrowType, typename Comparator>
ArgsortKernelImpl<ArrowType, Comparator>* MakeArgsortKernelImpl(Comparator comparator) {
return new ArgsortKernelImpl<ArrowType, Comparator>(comparator);
SortToIndicesKernelImpl<ArrowType, Comparator>* MakeSortToIndicesKernelImpl(
Comparator comparator) {
return new SortToIndicesKernelImpl<ArrowType, Comparator>(comparator);
}

Status ArgsortKernel::Make(const std::shared_ptr<DataType>& value_type,
std::unique_ptr<ArgsortKernel>* out) {
ArgsortKernel* kernel;
Status SortToIndicesKernel::Make(const std::shared_ptr<DataType>& value_type,
std::unique_ptr<SortToIndicesKernel>* out) {
SortToIndicesKernel* kernel;
switch (value_type->id()) {
case Type::UINT8:
kernel = MakeArgsortKernelImpl<UInt8Type>(CompareValues<UInt8Array>);
kernel = MakeSortToIndicesKernelImpl<UInt8Type>(CompareValues<UInt8Array>);
break;
case Type::INT8:
kernel = MakeArgsortKernelImpl<Int8Type>(CompareValues<Int8Array>);
kernel = MakeSortToIndicesKernelImpl<Int8Type>(CompareValues<Int8Array>);
break;
case Type::UINT16:
kernel = MakeArgsortKernelImpl<UInt16Type>(CompareValues<UInt16Array>);
kernel = MakeSortToIndicesKernelImpl<UInt16Type>(CompareValues<UInt16Array>);
break;
case Type::INT16:
kernel = MakeArgsortKernelImpl<Int16Type>(CompareValues<Int16Array>);
kernel = MakeSortToIndicesKernelImpl<Int16Type>(CompareValues<Int16Array>);
break;
case Type::UINT32:
kernel = MakeArgsortKernelImpl<UInt32Type>(CompareValues<UInt32Array>);
kernel = MakeSortToIndicesKernelImpl<UInt32Type>(CompareValues<UInt32Array>);
break;
case Type::INT32:
kernel = MakeArgsortKernelImpl<Int32Type>(CompareValues<Int32Array>);
kernel = MakeSortToIndicesKernelImpl<Int32Type>(CompareValues<Int32Array>);
break;
case Type::UINT64:
kernel = MakeArgsortKernelImpl<UInt64Type>(CompareValues<UInt64Array>);
kernel = MakeSortToIndicesKernelImpl<UInt64Type>(CompareValues<UInt64Array>);
break;
case Type::INT64:
kernel = MakeArgsortKernelImpl<Int64Type>(CompareValues<Int64Array>);
kernel = MakeSortToIndicesKernelImpl<Int64Type>(CompareValues<Int64Array>);
break;
case Type::FLOAT:
kernel = MakeArgsortKernelImpl<FloatType>(CompareValues<FloatArray>);
kernel = MakeSortToIndicesKernelImpl<FloatType>(CompareValues<FloatArray>);
break;
case Type::DOUBLE:
kernel = MakeArgsortKernelImpl<DoubleType>(CompareValues<DoubleArray>);
kernel = MakeSortToIndicesKernelImpl<DoubleType>(CompareValues<DoubleArray>);
break;
case Type::BINARY:
kernel = MakeArgsortKernelImpl<BinaryType>(CompareViews<BinaryArray>);
kernel = MakeSortToIndicesKernelImpl<BinaryType>(CompareViews<BinaryArray>);
break;
case Type::STRING:
kernel = MakeArgsortKernelImpl<StringType>(CompareViews<StringArray>);
kernel = MakeSortToIndicesKernelImpl<StringType>(CompareViews<StringArray>);
break;
default:
return Status::NotImplemented("Sorting of ", *value_type, " arrays");
Expand All @@ -172,16 +176,16 @@ Status ArgsortKernel::Make(const std::shared_ptr<DataType>& value_type,
return Status::OK();
}

Status Argsort(FunctionContext* ctx, const Datum& values, Datum* offsets) {
std::unique_ptr<ArgsortKernel> kernel;
RETURN_NOT_OK(ArgsortKernel::Make(values.type(), &kernel));
Status SortToIndices(FunctionContext* ctx, const Datum& values, Datum* offsets) {
std::unique_ptr<SortToIndicesKernel> kernel;
RETURN_NOT_OK(SortToIndicesKernel::Make(values.type(), &kernel));
return kernel->Call(ctx, values, offsets);
}

Status Argsort(FunctionContext* ctx, const Array& values,
std::shared_ptr<Array>* offsets) {
Status SortToIndices(FunctionContext* ctx, const Array& values,
std::shared_ptr<Array>* offsets) {
Datum offsets_datum;
RETURN_NOT_OK(Argsort(ctx, Datum(values.data()), &offsets_datum));
RETURN_NOT_OK(SortToIndices(ctx, Datum(values.data()), &offsets_datum));
*offsets = offsets_datum.make_array();
return Status::OK();
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -44,8 +44,8 @@ class FunctionContext;
/// \param[in] values array to sort
/// \param[out] offsets indices that would sort an array
ARROW_EXPORT
Status Argsort(FunctionContext* ctx, const Array& values,
std::shared_ptr<Array>* offsets);
Status SortToIndices(FunctionContext* ctx, const Array& values,
std::shared_ptr<Array>* offsets);

} // namespace compute
} // namespace arrow
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@

#include "benchmark/benchmark.h"

#include "arrow/compute/kernels/argsort.h"
#include "arrow/compute/kernels/sort_to_indices.h"

#include "arrow/compute/benchmark_util.h"
#include "arrow/compute/test_util.h"
Expand All @@ -28,28 +28,28 @@ namespace arrow {
namespace compute {
constexpr auto kSeed = 0x0ff1ce;

static void ArgsortBenchmark(benchmark::State& state,
const std::shared_ptr<Array>& values) {
static void SortToIndicesBenchmark(benchmark::State& state,
const std::shared_ptr<Array>& values) {
FunctionContext ctx;
for (auto _ : state) {
std::shared_ptr<Array> out;
ABORT_NOT_OK(Argsort(&ctx, *values, &out));
ABORT_NOT_OK(SortToIndices(&ctx, *values, &out));
benchmark::DoNotOptimize(out);
}
}

static void ArgsortInt64(benchmark::State& state) {
static void SortToIndicesInt64(benchmark::State& state) {
RegressionArgs args(state);

const int64_t array_size = args.size / sizeof(int64_t);
auto rand = random::RandomArrayGenerator(kSeed);

auto values = rand.Int64(array_size, -100, 100, args.null_proportion);

ArgsortBenchmark(state, values);
SortToIndicesBenchmark(state, values);
}

BENCHMARK(ArgsortInt64)
BENCHMARK(SortToIndicesInt64)
->Apply(RegressionSetArgs)
->Args({1 << 20, 1})
->Args({1 << 23, 1})
Expand Down
Loading

0 comments on commit af64b7b

Please sign in to comment.