Skip to content

Commit

Permalink
apacheGH-37017: [C++] Guard unexpected uses of BMI2 instructions (apa…
Browse files Browse the repository at this point in the history
…che#37610)

### Rationale for this change

Some functions introduced with Acero only check for AVX2 availability, but they actually invoke BMI2 instructions.
This can have two negative consequences:
* compiling BMI2 intrinsics may fail because BMI2 was not explicitly enabled on the compiler (apachegh-37017)
* some rare CPUs (Via CPUs perhaps) may support AVX2 but not BMI2; other CPUs by AMD have a very inefficient implementation of some BMI2 instructions

### What changes are included in this PR?

1. Ensure that the suitable compiler flag is passed when compiling code with BMI2 intrinsics
2. Make sure the CPU supports BMI2 adequately before invoking functions featuring BMI2 instructions

### Are these changes tested?

Yes, assuming CI covers enough diversity of target platforms.

### Are there any user-facing changes?

No, but performance might change (positively or negatively) depending on the CPU and platform.

* Closes: apache#37017

Authored-by: Antoine Pitrou <[email protected]>
Signed-off-by: Antoine Pitrou <[email protected]>
  • Loading branch information
pitrou authored Sep 8, 2023
1 parent a8a604c commit 50015f0
Show file tree
Hide file tree
Showing 8 changed files with 49 additions and 23 deletions.
10 changes: 8 additions & 2 deletions cpp/cmake_modules/SetupCxxFlags.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -49,13 +49,16 @@ if(ARROW_CPU_FLAG STREQUAL "x86")
if(MSVC)
set(ARROW_SSE4_2_FLAG "")
set(ARROW_AVX2_FLAG "/arch:AVX2")
# MSVC has no specific flag for BMI2, it seems to be enabled with AVX2
set(ARROW_BMI2_FLAG "/arch:AVX2")
set(ARROW_AVX512_FLAG "/arch:AVX512")
set(CXX_SUPPORTS_SSE4_2 TRUE)
else()
set(ARROW_SSE4_2_FLAG "-msse4.2")
set(ARROW_AVX2_FLAG "-march=haswell")
set(ARROW_BMI2_FLAG "-mbmi2")
# skylake-avx512 consists of AVX512F,AVX512BW,AVX512VL,AVX512CD,AVX512DQ
set(ARROW_AVX512_FLAG "-march=skylake-avx512 -mbmi2")
set(ARROW_AVX512_FLAG "-march=skylake-avx512")
# Append the avx2/avx512 subset option also, fix issue ARROW-9877 for homebrew-cpp
set(ARROW_AVX2_FLAG "${ARROW_AVX2_FLAG} -mavx2")
set(ARROW_AVX512_FLAG
Expand Down Expand Up @@ -95,13 +98,16 @@ if(ARROW_CPU_FLAG STREQUAL "x86")
set(ARROW_HAVE_RUNTIME_SSE4_2 ON)
add_definitions(-DARROW_HAVE_RUNTIME_SSE4_2)
endif()
# Note: for now we assume that AVX2 support should also enable BMI2 support,
# at least at compile-time (more care may be required for runtime dispatch).
if(CXX_SUPPORTS_AVX2 AND ARROW_RUNTIME_SIMD_LEVEL MATCHES "^(AVX2|AVX512|MAX)$")
set(ARROW_HAVE_RUNTIME_AVX2 ON)
set(ARROW_HAVE_RUNTIME_BMI2 ON)
add_definitions(-DARROW_HAVE_RUNTIME_AVX2 -DARROW_HAVE_RUNTIME_BMI2)
endif()
if(CXX_SUPPORTS_AVX512 AND ARROW_RUNTIME_SIMD_LEVEL MATCHES "^(AVX512|MAX)$")
set(ARROW_HAVE_RUNTIME_AVX512 ON)
add_definitions(-DARROW_HAVE_RUNTIME_AVX512 -DARROW_HAVE_RUNTIME_BMI2)
add_definitions(-DARROW_HAVE_RUNTIME_AVX512)
endif()
if(ARROW_SIMD_LEVEL STREQUAL "DEFAULT")
set(ARROW_SIMD_LEVEL "SSE4_2")
Expand Down
13 changes: 11 additions & 2 deletions cpp/src/arrow/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,15 @@ macro(append_runtime_avx2_src SRC)
endif()
endmacro()

macro(append_runtime_avx2_bmi2_src SRC)
if(ARROW_HAVE_RUNTIME_AVX2 AND ARROW_HAVE_RUNTIME_BMI2)
list(APPEND ARROW_SRCS ${SRC})
set_source_files_properties(${SRC} PROPERTIES SKIP_PRECOMPILE_HEADERS ON)
set_source_files_properties(${SRC} PROPERTIES COMPILE_FLAGS
"${ARROW_AVX2_FLAG} ${ARROW_BMI2_FLAG}")
endif()
endmacro()

macro(append_runtime_avx512_src SRC)
if(ARROW_HAVE_RUNTIME_AVX512)
list(APPEND ARROW_SRCS ${SRC})
Expand Down Expand Up @@ -433,10 +442,10 @@ list(APPEND
compute/util.cc)

append_runtime_avx2_src(compute/key_hash_avx2.cc)
append_runtime_avx2_src(compute/key_map_avx2.cc)
append_runtime_avx2_bmi2_src(compute/key_map_avx2.cc)
append_runtime_avx2_src(compute/row/compare_internal_avx2.cc)
append_runtime_avx2_src(compute/row/encode_internal_avx2.cc)
append_runtime_avx2_src(compute/util_avx2.cc)
append_runtime_avx2_bmi2_src(compute/util_avx2.cc)

if(ARROW_COMPUTE)
# Include the remaining kernels
Expand Down
10 changes: 6 additions & 4 deletions cpp/src/arrow/compute/key_map.cc
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
namespace arrow {

using bit_util::CountLeadingZeros;
using internal::CpuInfo;

namespace compute {

Expand Down Expand Up @@ -133,9 +134,10 @@ void SwissTable::extract_group_ids(const int num_keys, const uint16_t* optional_

// Optimistically use simplified lookup involving only a start block to find
// a single group id candidate for every input.
#if defined(ARROW_HAVE_RUNTIME_AVX2)
#if defined(ARROW_HAVE_RUNTIME_AVX2) && defined(ARROW_HAVE_RUNTIME_BMI2)
int num_group_id_bytes = num_group_id_bits / 8;
if ((hardware_flags_ & arrow::internal::CpuInfo::AVX2) && !optional_selection) {
if ((hardware_flags_ & CpuInfo::AVX2) && CpuInfo::GetInstance()->HasEfficientBmi2() &&
!optional_selection) {
num_processed = extract_group_ids_avx2(num_keys, hashes, local_slots, out_group_ids,
sizeof(uint64_t), 8 + 8 * num_group_id_bytes,
num_group_id_bytes);
Expand Down Expand Up @@ -301,8 +303,8 @@ void SwissTable::early_filter(const int num_keys, const uint32_t* hashes,
// Optimistically use simplified lookup involving only a start block to find
// a single group id candidate for every input.
int num_processed = 0;
#if defined(ARROW_HAVE_RUNTIME_AVX2)
if (hardware_flags_ & arrow::internal::CpuInfo::AVX2) {
#if defined(ARROW_HAVE_RUNTIME_AVX2) && defined(ARROW_HAVE_RUNTIME_BMI2)
if ((hardware_flags_ & CpuInfo::AVX2) && CpuInfo::GetInstance()->HasEfficientBmi2()) {
if (log_blocks_ <= 4) {
num_processed = early_filter_imp_avx2_x32(num_keys, hashes, out_match_bitvector,
out_local_slots);
Expand Down
3 changes: 2 additions & 1 deletion cpp/src/arrow/compute/key_map.h
Original file line number Diff line number Diff line change
Expand Up @@ -163,7 +163,8 @@ class ARROW_EXPORT SwissTable {
//
void early_filter_imp(const int num_keys, const uint32_t* hashes,
uint8_t* out_match_bitvector, uint8_t* out_local_slots) const;
#if defined(ARROW_HAVE_RUNTIME_AVX2)
#if defined(ARROW_HAVE_RUNTIME_AVX2) && defined(ARROW_HAVE_RUNTIME_BMI2)
// The functions below use BMI2 instructions, be careful before calling!
int early_filter_imp_avx2_x8(const int num_hashes, const uint32_t* hashes,
uint8_t* out_match_bitvector,
uint8_t* out_local_slots) const;
Expand Down
19 changes: 10 additions & 9 deletions cpp/src/arrow/compute/util.cc
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
namespace arrow {

using bit_util::CountTrailingZeros;
using internal::CpuInfo;

namespace util {

Expand Down Expand Up @@ -118,8 +119,8 @@ void bits_to_indexes_internal(int64_t hardware_flags, const int num_bits,
// 64 bits at a time
constexpr int unroll = 64;
int tail = num_bits % unroll;
#if defined(ARROW_HAVE_RUNTIME_AVX2)
if (hardware_flags & arrow::internal::CpuInfo::AVX2) {
#if defined(ARROW_HAVE_RUNTIME_AVX2) && defined(ARROW_HAVE_RUNTIME_BMI2)
if ((hardware_flags & CpuInfo::AVX2) && CpuInfo::GetInstance()->HasEfficientBmi2()) {
if (filter_input_indexes) {
avx2::bits_filter_indexes_avx2(bit_to_search, num_bits - tail, bits, input_indexes,
num_indexes, indexes);
Expand All @@ -141,7 +142,7 @@ void bits_to_indexes_internal(int64_t hardware_flags, const int num_bits,
bits_to_indexes_helper(word, i * 64 + base_index, num_indexes, indexes);
}
}
#if defined(ARROW_HAVE_RUNTIME_AVX2)
#if defined(ARROW_HAVE_RUNTIME_AVX2) && defined(ARROW_HAVE_RUNTIME_BMI2)
}
#endif
// Optionally process the last partial word with masking out bits outside range
Expand Down Expand Up @@ -253,8 +254,8 @@ void bits_to_bytes(int64_t hardware_flags, const int num_bits, const uint8_t* bi
}

int num_processed = 0;
#if defined(ARROW_HAVE_RUNTIME_AVX2)
if (hardware_flags & arrow::internal::CpuInfo::AVX2) {
#if defined(ARROW_HAVE_RUNTIME_AVX2) && defined(ARROW_HAVE_RUNTIME_BMI2)
if ((hardware_flags & CpuInfo::AVX2) && CpuInfo::GetInstance()->HasEfficientBmi2()) {
// The function call below processes whole 32 bit chunks together.
num_processed = num_bits - (num_bits % 32);
avx2::bits_to_bytes_avx2(num_processed, bits, bytes);
Expand Down Expand Up @@ -309,8 +310,8 @@ void bytes_to_bits(int64_t hardware_flags, const int num_bits, const uint8_t* by
}

int num_processed = 0;
#if defined(ARROW_HAVE_RUNTIME_AVX2)
if (hardware_flags & arrow::internal::CpuInfo::AVX2) {
#if defined(ARROW_HAVE_RUNTIME_AVX2) && defined(ARROW_HAVE_RUNTIME_BMI2)
if ((hardware_flags & CpuInfo::AVX2) && CpuInfo::GetInstance()->HasEfficientBmi2()) {
// The function call below processes whole 32 bit chunks together.
num_processed = num_bits - (num_bits % 32);
avx2::bytes_to_bits_avx2(num_processed, bytes, bits);
Expand Down Expand Up @@ -339,8 +340,8 @@ void bytes_to_bits(int64_t hardware_flags, const int num_bits, const uint8_t* by

bool are_all_bytes_zero(int64_t hardware_flags, const uint8_t* bytes,
uint32_t num_bytes) {
#if defined(ARROW_HAVE_RUNTIME_AVX2)
if (hardware_flags & arrow::internal::CpuInfo::AVX2) {
#if defined(ARROW_HAVE_RUNTIME_AVX2) && defined(ARROW_HAVE_RUNTIME_BMI2)
if ((hardware_flags & CpuInfo::AVX2) && CpuInfo::GetInstance()->HasEfficientBmi2()) {
return avx2::are_all_bytes_zero_avx2(bytes, num_bytes);
}
#endif
Expand Down
3 changes: 2 additions & 1 deletion cpp/src/arrow/compute/util.h
Original file line number Diff line number Diff line change
Expand Up @@ -168,7 +168,8 @@ ARROW_EXPORT void bytes_to_bits(int64_t hardware_flags, const int num_bits,
ARROW_EXPORT bool are_all_bytes_zero(int64_t hardware_flags, const uint8_t* bytes,
uint32_t num_bytes);

#if defined(ARROW_HAVE_RUNTIME_AVX2)
#if defined(ARROW_HAVE_RUNTIME_AVX2) && defined(ARROW_HAVE_RUNTIME_BMI2)
// The functions below use BMI2 instructions, be careful before calling!

namespace avx2 {
ARROW_EXPORT void bits_filter_indexes_avx2(int bit_to_search, const int num_bits,
Expand Down
12 changes: 8 additions & 4 deletions cpp/src/parquet/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -212,10 +212,14 @@ if(ARROW_HAVE_RUNTIME_AVX2)
#
# TODO: Use COMPILE_OPTIONS instead of COMPILE_FLAGS when we require
# CMake 3.11 or later.
set(BMI2_FLAGS "${AVX2_FLAGS} -DARROW_HAVE_BMI2 -mbmi2")
set_source_files_properties(level_conversion_bmi2.cc
PROPERTIES SKIP_PRECOMPILE_HEADERS ON COMPILE_FLAGS
"${BMI2_FLAGS}")
if(ARROW_HAVE_RUNTIME_BMI2)
# Need to pass ARROW_HAVE_BMI2 for level_conversion_inc.h to compile
# the BMI2 path.
set(BMI2_FLAGS "${AVX2_FLAGS} ${ARROW_BMI2_FLAG} -DARROW_HAVE_BMI2")
set_source_files_properties(level_conversion_bmi2.cc
PROPERTIES SKIP_PRECOMPILE_HEADERS ON COMPILE_FLAGS
"${BMI2_FLAGS}")
endif()
endif()

if(PARQUET_REQUIRE_ENCRYPTION)
Expand Down
2 changes: 2 additions & 0 deletions cpp/src/parquet/level_conversion_inc.h
Original file line number Diff line number Diff line change
Expand Up @@ -29,9 +29,11 @@
#include "arrow/util/simd.h"
#include "parquet/exception.h"
#include "parquet/level_comparison.h"

#ifndef PARQUET_IMPL_NAMESPACE
#error "PARQUET_IMPL_NAMESPACE must be defined"
#endif

namespace parquet::internal::PARQUET_IMPL_NAMESPACE {

// clang-format off
Expand Down

0 comments on commit 50015f0

Please sign in to comment.