From 863dc86201d5eb9103591b3072138e678c8fbb86 Mon Sep 17 00:00:00 2001 From: Kim Walisch Date: Sat, 22 Jun 2024 18:20:30 +0200 Subject: [PATCH] Move x86 CPUID code from cpuid.hpp to cpuid.cpp (#150) --- CMakeLists.txt | 37 +- ChangeLog | 8 + cmake/auto_vectorization.cmake | 4 +- cmake/libatomic.cmake | 5 +- cmake/multiarch_avx512_vbmi2.cmake | 14 +- cmake/multiarch_x86_popcnt.cmake | 53 +++ include/primesieve.h | 4 +- include/primesieve.hpp | 4 +- include/primesieve/CpuInfo.hpp | 3 +- include/primesieve/Erat.hpp | 4 +- include/primesieve/PrimeGenerator.hpp | 20 +- .../primesieve/cpu_supports_avx512_vbmi2.hpp | 66 +--- include/primesieve/cpu_supports_popcnt.hpp | 39 +- include/primesieve/ctz.hpp | 160 ++++++++ include/primesieve/intrinsics.hpp | 345 ------------------ include/primesieve/popcnt.hpp | 214 +++++++++++ scripts/build_clang_multiarch_win_x64.bat | 2 +- scripts/build_mingw64_x64.sh | 2 +- src/CpuInfo.cpp | 33 +- src/PrimeGenerator.cpp | 14 +- src/SievingPrimes.cpp | 3 +- src/app/main.cpp | 19 +- src/popcount.cpp | 2 +- src/x86/cpuid.cpp | 125 +++++++ test/CMakeLists.txt | 2 +- test/cpuid.cpp | 53 +-- 26 files changed, 667 insertions(+), 568 deletions(-) create mode 100644 cmake/multiarch_x86_popcnt.cmake create mode 100644 include/primesieve/ctz.hpp delete mode 100644 include/primesieve/intrinsics.hpp create mode 100644 include/primesieve/popcnt.hpp create mode 100644 src/x86/cpuid.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 84bc037c7..232cce299 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,7 +1,7 @@ cmake_minimum_required(VERSION 3.4...3.27) project(primesieve CXX) -set(PRIMESIEVE_VERSION "12.3") -set(PRIMESIEVE_SOVERSION "12.3.0") +set(PRIMESIEVE_VERSION "12.4") +set(PRIMESIEVE_SOVERSION "12.4.0") # Build options ###################################################### @@ -47,7 +47,7 @@ if(NOT isMultiConfig AND NOT CMAKE_BUILD_TYPE) endif() if(CMAKE_BUILD_TYPE STREQUAL "Debug") - set(ENABLE_ASSERT "ENABLE_ASSERT") + list(APPEND PRIMESIEVE_COMPILE_DEFINITIONS "ENABLE_ASSERT") endif() # primesieve binary source files ##################################### @@ -82,6 +82,17 @@ set(LIB_SRC src/api-c.cpp src/RiemannR.cpp src/SievingPrimes.cpp) +# Check if compiler supports CPU multiarch ########################### + +if(WITH_MULTIARCH) + include("${PROJECT_SOURCE_DIR}/cmake/multiarch_x86_popcnt.cmake") + include("${PROJECT_SOURCE_DIR}/cmake/multiarch_avx512_vbmi2.cmake") + + if(multiarch_x86_popcnt OR multiarch_avx512_vbmi2) + set(LIB_SRC ${LIB_SRC} src/x86/cpuid.cpp) + endif() +endif() + # Required includes ################################################## include(GNUInstallDirs) @@ -107,12 +118,6 @@ if(WITH_AUTO_VECTORIZATION) include("${PROJECT_SOURCE_DIR}/cmake/auto_vectorization.cmake") endif() -# Check if compiler supports x64 multiarch ########################### - -if(WITH_MULTIARCH) - include("${PROJECT_SOURCE_DIR}/cmake/multiarch_avx512_vbmi2.cmake") -endif() - # libprimesieve (shared library) ##################################### find_package(Threads REQUIRED QUIET) @@ -120,13 +125,13 @@ find_package(Threads REQUIRED QUIET) if(BUILD_SHARED_LIBS) add_library(libprimesieve SHARED ${LIB_SRC}) set_target_properties(libprimesieve PROPERTIES OUTPUT_NAME primesieve) - target_link_libraries(libprimesieve PRIVATE Threads::Threads ${LIBATOMIC}) + target_link_libraries(libprimesieve PRIVATE Threads::Threads ${PRIMESIEVE_LINK_LIBRARIES}) string(REPLACE "." ";" SOVERSION_LIST ${PRIMESIEVE_SOVERSION}) list(GET SOVERSION_LIST 0 PRIMESIEVE_SOVERSION_MAJOR) set_target_properties(libprimesieve PROPERTIES SOVERSION ${PRIMESIEVE_SOVERSION_MAJOR}) set_target_properties(libprimesieve PROPERTIES VERSION ${PRIMESIEVE_SOVERSION}) - target_compile_options(libprimesieve PRIVATE ${FTREE_VECTORIZE_FLAG} ${FVECT_COST_MODEL_FLAG}) - target_compile_definitions(libprimesieve PRIVATE "${ENABLE_ASSERT}" "${ENABLE_MULTIARCH_AVX512}") + target_compile_options(libprimesieve PRIVATE ${PRIMESIEVE_COMPILE_OPTIONS}) + target_compile_definitions(libprimesieve PRIVATE ${PRIMESIEVE_COMPILE_DEFINITIONS}) if(WIN32_MSVC_COMPATIBLE) # On Windows the shared library will be named primesieve.dll @@ -162,9 +167,9 @@ endif() if(BUILD_STATIC_LIBS) add_library(libprimesieve-static STATIC ${LIB_SRC}) set_target_properties(libprimesieve-static PROPERTIES OUTPUT_NAME primesieve) - target_link_libraries(libprimesieve-static PRIVATE Threads::Threads ${LIBATOMIC}) - target_compile_options(libprimesieve-static PRIVATE ${FTREE_VECTORIZE_FLAG} ${FVECT_COST_MODEL_FLAG}) - target_compile_definitions(libprimesieve-static PRIVATE "${ENABLE_ASSERT}" "${ENABLE_MULTIARCH_AVX512}") + target_link_libraries(libprimesieve-static PRIVATE Threads::Threads ${PRIMESIEVE_LINK_LIBRARIES}) + target_compile_options(libprimesieve-static PRIVATE ${PRIMESIEVE_COMPILE_OPTIONS}) + target_compile_definitions(libprimesieve-static PRIVATE ${PRIMESIEVE_COMPILE_DEFINITIONS}) if(WITH_MSVC_CRT_STATIC) set_target_properties(libprimesieve-static PROPERTIES MSVC_RUNTIME_LIBRARY "MultiThreaded") @@ -219,7 +224,7 @@ endif() if(BUILD_PRIMESIEVE) add_executable(primesieve ${BIN_SRC}) target_link_libraries(primesieve primesieve::primesieve Threads::Threads) - target_compile_definitions(primesieve PRIVATE "${ENABLE_ASSERT}") + target_compile_definitions(primesieve PRIVATE ${PRIMESIEVE_COMPILE_DEFINITIONS}) target_compile_features(primesieve PRIVATE cxx_auto_type) install(TARGETS primesieve DESTINATION ${CMAKE_INSTALL_BINDIR}) diff --git a/ChangeLog b/ChangeLog index 4a07fa953..ded64abb2 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,11 @@ +Changes in version 12.4, 22/06/2024 +=================================== + +* Move x86 CPUID code from cpuid.hpp to src/x86/cpuid.cpp. +* multiarch_x86_popcnt.cmake: Detect x86 POPCNT support. +* CMakeLists.txt: Use CMake list for all compile time definitions. +* CMakeLists.txt: Use CMake list for all link libraries. + Changes in version 12.3, 15/04/2024 =================================== diff --git a/cmake/auto_vectorization.cmake b/cmake/auto_vectorization.cmake index 84897cc45..4a243f70c 100644 --- a/cmake/auto_vectorization.cmake +++ b/cmake/auto_vectorization.cmake @@ -16,7 +16,7 @@ check_cxx_compiler_flag(-ftree-vectorize ftree_vectorize) cmake_pop_check_state() if(ftree_vectorize) - set(FTREE_VECTORIZE_FLAG "-ftree-vectorize") + list(APPEND PRIMESIEVE_COMPILE_OPTIONS "-ftree-vectorize") cmake_push_check_state() set(CMAKE_REQUIRED_FLAGS -Werror) @@ -24,6 +24,6 @@ if(ftree_vectorize) cmake_pop_check_state() if(fvect_cost_model) - set(FVECT_COST_MODEL_FLAG "-fvect-cost-model=dynamic") + list(APPEND PRIMESIEVE_COMPILE_OPTIONS "-fvect-cost-model=dynamic") endif() endif() diff --git a/cmake/libatomic.cmake b/cmake/libatomic.cmake index 333926f2b..acd7ca993 100644 --- a/cmake/libatomic.cmake +++ b/cmake/libatomic.cmake @@ -57,7 +57,10 @@ if(NOT atomic64) }" atomic64_with_libatomic) - if (NOT atomic64_with_libatomic) + if(atomic64_with_libatomic) + list(APPEND PRIMESIEVE_LINK_LIBRARIES "${LIBATOMIC}") + else() + set(LIBATOMIC "") message(FATAL_ERROR "Failed to compile std::atomic, libatomic likely not found!") endif() endif() diff --git a/cmake/multiarch_avx512_vbmi2.cmake b/cmake/multiarch_avx512_vbmi2.cmake index f56d5e933..c0b7c0e96 100644 --- a/cmake/multiarch_avx512_vbmi2.cmake +++ b/cmake/multiarch_avx512_vbmi2.cmake @@ -7,7 +7,7 @@ include(CheckCXXSourceCompiles) include(CMakePushCheckState) cmake_push_check_state() -set(CMAKE_REQUIRED_INCLUDES "${PROJECT_SOURCE_DIR}/include") +set(CMAKE_REQUIRED_INCLUDES "${PROJECT_SOURCE_DIR}") check_cxx_source_compiles(" // GCC/Clang function multiversioning for AVX512 is not needed if @@ -20,19 +20,19 @@ check_cxx_source_compiles(" Error: AVX512VBMI2 multiarch not needed! #endif - #include + #include #include #include class PrimeGenerator { public: __attribute__ ((target (\"avx512f,avx512vbmi,avx512vbmi2\"))) - void fillNextPrimes_avx512(uint64_t* primes64); + void fillNextPrimes_avx512_vbmi2(uint64_t* primes64); void fillNextPrimes_default(uint64_t* primes64); void fillNextPrimes(uint64_t* primes64) { - if (cpu_supports_avx512_vbmi2) - fillNextPrimes_avx512(primes64); + if (primesieve::has_cpuid_avx512_vbmi2()) + fillNextPrimes_avx512_vbmi2(primes64); else fillNextPrimes_default(primes64); } @@ -44,7 +44,7 @@ check_cxx_source_compiles(" } __attribute__ ((target (\"avx512f,avx512vbmi,avx512vbmi2\"))) - void PrimeGenerator::fillNextPrimes_avx512(uint64_t* primes64) + void PrimeGenerator::fillNextPrimes_avx512_vbmi2(uint64_t* primes64) { __m512i bytes_0_to_7 = _mm512_setr_epi64(0, 1, 2, 3, 4, 5, 6, 7); __m512i base = _mm512_set1_epi64(123); @@ -64,7 +64,7 @@ check_cxx_source_compiles(" " multiarch_avx512_vbmi2) if(multiarch_avx512_vbmi2) - set(ENABLE_MULTIARCH_AVX512 "ENABLE_MULTIARCH_AVX512") + list(APPEND PRIMESIEVE_COMPILE_DEFINITIONS "ENABLE_MULTIARCH_AVX512_VBMI2") endif() cmake_pop_check_state() diff --git a/cmake/multiarch_x86_popcnt.cmake b/cmake/multiarch_x86_popcnt.cmake new file mode 100644 index 000000000..707a8e470 --- /dev/null +++ b/cmake/multiarch_x86_popcnt.cmake @@ -0,0 +1,53 @@ +# On x86 CPUs we need to enable the use of cpuid.cpp. +# If cpuid.cpp compiles we assume it is a x86 CPU. + +include(CheckCXXSourceCompiles) +include(CMakePushCheckState) + +cmake_push_check_state() +set(CMAKE_REQUIRED_INCLUDES "${PROJECT_SOURCE_DIR}") + +check_cxx_source_compiles(" + // Enable CPUID for POPCNT on x86 and x86-64 CPUs. + // This is required because not all x86 and x86-64 CPUs + // support the POPCNT instruction. + #if !(defined(__x86_64__) || \ + defined(__i386__) || \ + defined(_M_X64) || \ + defined(_M_IX86)) + Error: x86 POPCNT multiarch not needed! + #endif + + // Both GCC and Clang (even Clang on Windows) define the __POPCNT__ + // macro if the user compiles with -mpopcnt. The __POPCNT__ + // macro is even defined if the user compiles with other flags + // such as -mavx or -march=native. + #if defined(__POPCNT__) + Error: x86 POPCNT multiarch not needed! + + // The MSVC compiler does not support a POPCNT macro, but if the user + // compiles with e.g. /arch:AVX or /arch:AVX512 then MSVC defines + // the __AVX__ macro and POPCNT is also supported. + #elif defined(_MSC_VER) && defined(__AVX__) + Error: x86 POPCNT multiarch not needed! + #endif + + #include + #include + + int main() + { + if (primesieve::has_cpuid_popcnt()) + std::cout << \"CPU supports POPCNT!\" << std::endl; + else + std::cout << \"CPU does not support POPCNT!\" << std::endl; + + return 0; + } +" multiarch_x86_popcnt) + +if(multiarch_x86_popcnt) + list(APPEND PRIMESIEVE_COMPILE_DEFINITIONS "ENABLE_MULTIARCH_x86_POPCNT") +endif() + +cmake_pop_check_state() diff --git a/include/primesieve.h b/include/primesieve.h index d67cac87a..28bf8336c 100644 --- a/include/primesieve.h +++ b/include/primesieve.h @@ -15,9 +15,9 @@ #ifndef PRIMESIEVE_H #define PRIMESIEVE_H -#define PRIMESIEVE_VERSION "12.3" +#define PRIMESIEVE_VERSION "12.4" #define PRIMESIEVE_VERSION_MAJOR 12 -#define PRIMESIEVE_VERSION_MINOR 3 +#define PRIMESIEVE_VERSION_MINOR 4 #include diff --git a/include/primesieve.hpp b/include/primesieve.hpp index fd3d35f9b..c0295245b 100644 --- a/include/primesieve.hpp +++ b/include/primesieve.hpp @@ -13,9 +13,9 @@ #ifndef PRIMESIEVE_HPP #define PRIMESIEVE_HPP -#define PRIMESIEVE_VERSION "12.3" +#define PRIMESIEVE_VERSION "12.4" #define PRIMESIEVE_VERSION_MAJOR 12 -#define PRIMESIEVE_VERSION_MINOR 3 +#define PRIMESIEVE_VERSION_MINOR 4 #include #include diff --git a/include/primesieve/CpuInfo.hpp b/include/primesieve/CpuInfo.hpp index 426cc6323..5ccd1835a 100644 --- a/include/primesieve/CpuInfo.hpp +++ b/include/primesieve/CpuInfo.hpp @@ -1,7 +1,7 @@ /// /// @file CpuInfo.hpp /// -/// Copyright (C) 2023 Kim Walisch, +/// Copyright (C) 2024 Kim Walisch, /// /// This file is distributed under the BSD License. See the COPYING /// file in the top level directory. @@ -22,7 +22,6 @@ class CpuInfo public: CpuInfo(); bool hasCpuName() const; - bool hasAVX512() const; bool hasLogicalCpuCores() const; bool hasL1Cache() const; bool hasL2Cache() const; diff --git a/include/primesieve/Erat.hpp b/include/primesieve/Erat.hpp index bfd952863..32c0f221d 100644 --- a/include/primesieve/Erat.hpp +++ b/include/primesieve/Erat.hpp @@ -1,7 +1,7 @@ /// /// @file Erat.hpp /// -/// Copyright (C) 2023 Kim Walisch, +/// Copyright (C) 2024 Kim Walisch, /// /// This file is distributed under the BSD License. See the COPYING /// file in the top level directory. @@ -15,8 +15,8 @@ #include "EratMedium.hpp" #include "EratBig.hpp" #include "macros.hpp" -#include "intrinsics.hpp" #include "Vector.hpp" +#include "ctz.hpp" #include diff --git a/include/primesieve/PrimeGenerator.hpp b/include/primesieve/PrimeGenerator.hpp index 59f7567d9..aa0cecdc6 100644 --- a/include/primesieve/PrimeGenerator.hpp +++ b/include/primesieve/PrimeGenerator.hpp @@ -27,9 +27,9 @@ defined(__AVX512VBMI__) && \ defined(__AVX512VBMI2__) && \ __has_include() - #define ENABLE_AVX512 + #define ENABLE_AVX512_VBMI2 -#elif defined(ENABLE_MULTIARCH_AVX512) && \ +#elif defined(ENABLE_MULTIARCH_AVX512_VBMI2) && \ __has_include() #include "cpu_supports_avx512_vbmi2.hpp" #define ENABLE_DEFAULT @@ -50,11 +50,11 @@ class PrimeGenerator : public Erat ALWAYS_INLINE void fillNextPrimes(Vector& primes, std::size_t* size) { - #if defined(ENABLE_AVX512) - fillNextPrimes_avx512(primes, size); - #elif defined(ENABLE_MULTIARCH_AVX512) + #if defined(ENABLE_AVX512_VBMI2) + fillNextPrimes_avx512_vbmi2(primes, size); + #elif defined(ENABLE_MULTIARCH_AVX512_VBMI2) if (cpu_supports_avx512_vbmi2) - fillNextPrimes_avx512(primes, size); + fillNextPrimes_avx512_vbmi2(primes, size); else fillNextPrimes_default(primes, size); #else @@ -68,13 +68,13 @@ class PrimeGenerator : public Erat void fillNextPrimes_default(Vector& primes, std::size_t* size); #endif -#if defined(ENABLE_AVX512) || \ - defined(ENABLE_MULTIARCH_AVX512) +#if defined(ENABLE_AVX512_VBMI2) || \ + defined(ENABLE_MULTIARCH_AVX512_VBMI2) - #if defined(ENABLE_MULTIARCH_AVX512) + #if defined(ENABLE_MULTIARCH_AVX512_VBMI2) __attribute__ ((target ("avx512f,avx512vbmi,avx512vbmi2"))) #endif - void fillNextPrimes_avx512(Vector& primes, std::size_t* size); + void fillNextPrimes_avx512_vbmi2(Vector& primes, std::size_t* size); #endif diff --git a/include/primesieve/cpu_supports_avx512_vbmi2.hpp b/include/primesieve/cpu_supports_avx512_vbmi2.hpp index 5ab575eb0..0e59e47ce 100644 --- a/include/primesieve/cpu_supports_avx512_vbmi2.hpp +++ b/include/primesieve/cpu_supports_avx512_vbmi2.hpp @@ -11,74 +11,16 @@ #ifndef CPU_SUPPORTS_AVX512_VBMI2_HPP #define CPU_SUPPORTS_AVX512_VBMI2_HPP -#include "cpuid.hpp" +namespace primesieve { -#if defined(_MSC_VER) - #include -#endif - -// %ebx bit flags -#define bit_AVX512F (1 << 16) - -// %ecx bit flags -#define bit_AVX512VBMI (1 << 1) -#define bit_AVX512VBMI2 (1 << 6) +bool has_cpuid_avx512_vbmi2(); -// xgetbv bit flags -#define XSTATE_SSE (1 << 1) -#define XSTATE_YMM (1 << 2) -#define XSTATE_ZMM (7 << 5) +} // namespace namespace { -// Get Value of Extended Control Register -inline int get_xcr0() -{ - int xcr0; - -#if defined(_MSC_VER) - xcr0 = (int) _xgetbv(0); -#else - __asm__ ("xgetbv" : "=a" (xcr0) : "c" (0) : "%edx" ); -#endif - - return xcr0; -} - -inline bool run_cpuid_avx512_vbmi2() -{ - int abcd[4]; - - run_cpuid(1, 0, abcd); - - int osxsave_mask = (1 << 27); - - // Ensure OS supports extended processor state management - if ((abcd[2] & osxsave_mask) != osxsave_mask) - return false; - - int ymm_mask = XSTATE_SSE | XSTATE_YMM; - int zmm_mask = XSTATE_SSE | XSTATE_YMM | XSTATE_ZMM; - - int xcr0 = get_xcr0(); - - // Check AVX OS support - if ((xcr0 & ymm_mask) != ymm_mask) - return false; - - // Check AVX512 OS support - if ((xcr0 & zmm_mask) != zmm_mask) - return false; - - run_cpuid(7, 0, abcd); - - // PrimeGenerator::fillNextPrimes() requires AVX512F, AVX512VBMI & AVX512VBMI2 - return ((abcd[1] & bit_AVX512F) == bit_AVX512F && - (abcd[2] & (bit_AVX512VBMI | bit_AVX512VBMI2)) == (bit_AVX512VBMI | bit_AVX512VBMI2)); -} - /// Initialized at startup -bool cpu_supports_avx512_vbmi2 = run_cpuid_avx512_vbmi2(); +bool cpu_supports_avx512_vbmi2 = primesieve::has_cpuid_avx512_vbmi2(); } // namespace diff --git a/include/primesieve/cpu_supports_popcnt.hpp b/include/primesieve/cpu_supports_popcnt.hpp index 212c0f3b2..ded65c3ed 100644 --- a/include/primesieve/cpu_supports_popcnt.hpp +++ b/include/primesieve/cpu_supports_popcnt.hpp @@ -11,48 +11,17 @@ #ifndef CPU_SUPPORTS_POPCNT_HPP #define CPU_SUPPORTS_POPCNT_HPP -// Enable CPUID on x86 and x86-64 CPUs -#if defined(__x86_64__) || \ - defined(__i386__) || \ - defined(_M_X64) || \ - defined(_M_IX86) - -// Both GCC and Clang (even Clang on Windows) define the __POPCNT__ -// macro if the user compiles with -mpopcnt. The __POPCNT__ -// macro is even defined if the user compiles with other flags -// such as -mavx or -march=native. -#if defined(__POPCNT__) - #define HAS_POPCNT -// The MSVC compiler does not support a POPCNT macro, but if the user -// compiles with e.g. /arch:AVX or /arch:AVX512 then MSVC defines -// the __AVX__ macro and POPCNT is also supported. -#elif defined(_MSC_VER) && defined(__AVX__) - #define HAS_POPCNT -#endif +namespace primesieve { -#if !defined(HAS_POPCNT) +bool has_cpuid_popcnt(); -#include "cpuid.hpp" -#define ENABLE_CPUID_POPCNT +} // namespace namespace { -inline bool run_cpuid_supports_popcnt() -{ - int abcd[4]; - run_cpuid(1, 0, abcd); - - // %ecx POPCNT bit flag - int bit_POPCNT = 1 << 23; - return (abcd[2] & bit_POPCNT) == bit_POPCNT; -} - /// Initialized at startup -bool cpu_supports_popcnt = run_cpuid_supports_popcnt(); +bool cpu_supports_popcnt = primesieve::has_cpuid_popcnt(); } // namespace -#endif // !defined(HAS_POPCNT) -#endif // CPUID - #endif diff --git a/include/primesieve/ctz.hpp b/include/primesieve/ctz.hpp new file mode 100644 index 000000000..c45eedcab --- /dev/null +++ b/include/primesieve/ctz.hpp @@ -0,0 +1,160 @@ +/// +/// @file ctz.hpp +/// @brief Count the number of trailing zeros. +/// +/// Copyright (C) 2024 Kim Walisch, +/// +/// This file is distributed under the BSD License. See the COPYING +/// file in the top level directory. +/// + +#ifndef CTZ_HPP +#define CTZ_HPP + +#include "macros.hpp" +#include + +// GCC/Clang & MSVC +#if defined(__x86_64__) || \ + defined(_M_X64) + #define IS_X64 +#endif + +// On x64 CPUs: +// GCC & Clang enable TZCNT with -mbmi. +// MSVC enables TZCNT with /arch:AVX2 or later. +#if defined(__BMI__) || \ + (defined(_MSC_VER) && defined(__AVX2__)) + #define HAS_TZCNT +#endif + +#if (defined(__GNUC__) || \ + defined(__clang__)) && \ + defined(__x86_64__) + +#define HAS_CTZ64 +#define CTZ64_SUPPORTS_ZERO + +namespace { + +inline uint64_t ctz64(uint64_t x) +{ +#if defined(HAS_TZCNT) + // No undefined behavior, TZCNT(0) = 64 + __asm__("tzcnt %1, %0" : "=r"(x) : "r"(x)); + return x; +#else + // REP BSF uses the TZCNT instruction on x64 CPUs with the BMI1 + // instruction set (>= 2013) and the BSF instruction on older x64 + // CPUs. BSF(0) is undefined behavior, it leaves the destination + // register unmodified. Fortunately, it is possible to avoid this + // undefined behavior by always setting the destination register + // to the same value before executing BSF(0). This works on all + // AMD & Intel CPUs since the i586 (from 1993), the Linux kernel + // also relies on this behavior, see this Linux commit: + // https://github.com/torvalds/linux/commit/ca3d30cc02f780f68771087040ce935add6ba2b7 + // + // The constraint "0" for input operand 1 says that it must occupy + // the same location as output operand 0. Hence the assembly below + // uses the same input & output register. This ensures that + // BSF(0) = 0, hence there is no undefined behavior. However, you + // cannot rely on ctz64(0) = 0 since TZCNT(0) = 64. + __asm__("rep bsf %1, %0" : "=r"(x) : "0"(x)); + ASSERT(x <= 64); + return x; +#endif +} + +} // namespace + +#elif (defined(__GNUC__) || \ + defined(__clang__)) && \ + defined(__aarch64__) + +#define HAS_CTZ64 +#define CTZ64_SUPPORTS_ZERO + +namespace { + +inline uint64_t ctz64(uint64_t x) +{ + // No undefined behavior, CTZ(0) = 64. + // ARM64 has no CTZ instruction, we have to emulate it. + __asm__("rbit %0, %1 \n\t" + "clz %0, %0 \n\t" + : "=r" (x) + : "r" (x)); + + return x; +} + +} // namespace + +#elif defined(_MSC_VER) && \ + defined(HAS_TZCNT) && \ + __has_include() + +#include +#define HAS_CTZ64 +#define CTZ64_SUPPORTS_ZERO + +// This allows us to generate the TZCNT instruction for MSVC +// without C++20 support, hence without std::countr_zero(x). +// No undefined behavior, _tzcnt_u64(0) = 64. +#define ctz64(x) _tzcnt_u64(x) + +#elif __cplusplus >= 202002L && \ + __has_include() && \ + (!defined(IS_X64) || defined(HAS_TZCNT)) + +#include + +// No undefined behavior, std::countr_zero(0) = 64 +#define CTZ64_SUPPORTS_ZERO +#define HAS_CTZ64 + +namespace { + +inline int ctz64(uint64_t x) +{ + // In 2022 std::countr_zero(x) generates good assembly for + // most compilers & CPU architectures, except for: + // 1) GCC & Clang on x64 without __BMI__. + // 2) MSVC on x64 without __AVX2__. + // Hence on x64 CPUs we only use std::countr_zero(x) if + // the compiler generates the TZCNT instruction. + return std::countr_zero(x); +} + +} // namespace + +#elif defined(__GNUC__) || \ + __has_builtin(__builtin_ctzl) + +#define HAS_CTZ64 + +namespace { + +inline int ctz64(uint64_t x) +{ + // __builtin_ctz(0) is undefined behavior, + // we don't define CTZ64_SUPPORTS_ZERO. + ASSERT(x != 0); + +#if __cplusplus >= 201703L + if constexpr(sizeof(int) >= sizeof(uint64_t)) + return __builtin_ctz(x); + else if constexpr(sizeof(long) >= sizeof(uint64_t)) + return __builtin_ctzl(x); + else if constexpr(sizeof(long long) >= sizeof(uint64_t)) + return __builtin_ctzll(x); +#else + return __builtin_ctzll(x); +#endif +} + +} // namespace + +#endif + +#endif // CTZ_HPP diff --git a/include/primesieve/intrinsics.hpp b/include/primesieve/intrinsics.hpp deleted file mode 100644 index 3ce0725d1..000000000 --- a/include/primesieve/intrinsics.hpp +++ /dev/null @@ -1,345 +0,0 @@ -/// -/// @file intrinsics.hpp -/// @brief Wrappers for compiler intrinsics. -/// -/// Copyright (C) 2024 Kim Walisch, -/// -/// This file is distributed under the BSD License. See the COPYING -/// file in the top level directory. -/// - -#ifndef INTRINSICS_HPP -#define INTRINSICS_HPP - -#include "cpu_supports_popcnt.hpp" -#include "macros.hpp" - -#include - -namespace { - -/// This uses fewer arithmetic operations than any other known -/// implementation on machines with fast multiplication. -/// It uses 12 arithmetic operations, one of which is a multiply. -/// http://en.wikipedia.org/wiki/Hamming_weight#Efficient_implementation -/// -inline uint64_t popcnt64_bitwise(uint64_t x) -{ - uint64_t m1 = 0x5555555555555555ll; - uint64_t m2 = 0x3333333333333333ll; - uint64_t m4 = 0x0F0F0F0F0F0F0F0Fll; - uint64_t h01 = 0x0101010101010101ll; - - x -= (x >> 1) & m1; - x = (x & m2) + ((x >> 2) & m2); - x = (x + (x >> 4)) & m4; - - return (x * h01) >> 56; -} - -} // namespace - -// GCC & Clang -#if defined(__GNUC__) || \ - __has_builtin(__builtin_popcountl) - -// CPUID is only enabled on x86 and x86-64 CPUs -// if the user compiles without -mpopcnt. -#if defined(ENABLE_CPUID_POPCNT) -#if defined(__x86_64__) - -namespace { - -inline uint64_t popcnt64(uint64_t x) -{ - // On my AMD EPYC 7642 CPU using GCC 12 this runtime - // check incurs an overall overhead of about 1%. - if_likely(cpu_supports_popcnt) - { - __asm__("popcnt %1, %0" : "=r"(x) : "r"(x)); - return x; - } - else - { - // On x86 and x64 CPUs when using the GCC compiler - // __builtin_popcount*(x) is slow (not inlined function call) - // when compiling without -mpopcnt. Therefore we avoid - // using __builtin_popcount*(x) here. - return popcnt64_bitwise(x); - } -} - -} // namespace - -#elif defined(__i386__) - -namespace { - -inline uint64_t popcnt64(uint64_t x) -{ - if_likely(cpu_supports_popcnt) - { - uint32_t x0 = uint32_t(x); - uint32_t x1 = uint32_t(x >> 32); - __asm__("popcnt %1, %0" : "=r"(x0) : "r"(x0)); - __asm__("popcnt %1, %0" : "=r"(x1) : "r"(x1)); - return x0 + x1; - } - else - { - // On x86 and x64 CPUs when using the GCC compiler - // __builtin_popcount*(x) is slow (not inlined function call) - // when compiling without -mpopcnt. Therefore we avoid - // using __builtin_popcount*(x) here. - return popcnt64_bitwise(x); - } -} - -} // namespace - -#endif // i386 - -#else // GCC & Clang (no CPUID, not x86) - -namespace { - -inline int popcnt64(uint64_t x) -{ -#if __cplusplus >= 201703L - if constexpr(sizeof(int) >= sizeof(uint64_t)) - return __builtin_popcount(x); - else if constexpr(sizeof(long) >= sizeof(uint64_t)) - return __builtin_popcountl(x); - else if constexpr(sizeof(long long) >= sizeof(uint64_t)) - return __builtin_popcountll(x); -#else - return __builtin_popcountll(x); -#endif -} - -} // namespace - -#endif // GCC & Clang - -#elif defined(_MSC_VER) && \ - defined(_M_X64) && \ - __has_include() - -#include - -namespace { - -inline uint64_t popcnt64(uint64_t x) -{ -#if defined(HAS_POPCNT) - return __popcnt64(x); -#elif defined(ENABLE_CPUID_POPCNT) - if_likely(cpu_supports_popcnt) - return __popcnt64(x); - else - return popcnt64_bitwise(x); -#else - return popcnt64_bitwise(x); -#endif -} - -} // namespace - -#elif defined(_MSC_VER) && \ - defined(_M_IX86) && \ - __has_include() - -#include - -namespace { - -inline uint64_t popcnt64(uint64_t x) -{ -#if defined(HAS_POPCNT) - return __popcnt(uint32_t(x)) + - __popcnt(uint32_t(x >> 32)); -#elif defined(ENABLE_CPUID_POPCNT) - if_likely(cpu_supports_popcnt) - return __popcnt(uint32_t(x)) + - __popcnt(uint32_t(x >> 32)); - else - return popcnt64_bitwise(x); -#else - return popcnt64_bitwise(x); -#endif -} - -} // namespace - -#elif #if __cplusplus >= 202002L && \ - __has_include() - -#include - -namespace { - -/// We only use the C++ standard library as a fallback if there -/// are no compiler intrinsics available for POPCNT. -/// Compiler intrinsics often generate faster assembly. -inline int popcnt64(uint64_t x) -{ - return std::popcount(x); -} - -} // namespace - -#else - -namespace { - -/// Portable (but slow) popcount algorithm -inline uint64_t popcnt64(uint64_t x) -{ - return popcnt64_bitwise(x); -} - -} // namespace - -#endif // popcnt64() - -// GCC/Clang & MSVC -#if defined(__x86_64__) || \ - defined(_M_X64) - #define IS_X64 -#endif - -// On x64 CPUs: -// GCC & Clang enable TZCNT with -mbmi. -// MSVC enables TZCNT with /arch:AVX2 or later. -#if defined(__BMI__) || \ - (defined(_MSC_VER) && defined(__AVX2__)) - #define HAS_TZCNT -#endif - -// In 2022 std::countr_zero(x) generates good assembly for -// most compilers & CPU architectures, except for: -// 1) GCC & Clang on x64 without __BMI__. -// 2) MSVC on x64 without __AVX2__. -// Hence on x64 CPUs we only use std::countr_zero(x) if -// the compiler generates the TZCNT instruction. -#if defined(HAS_CPP20_BIT_HEADER) && \ - (defined(HAS_TZCNT) || !defined(IS_X64)) - -#define HAS_CTZ64 -#define CTZ64_SUPPORTS_ZERO - -namespace { - -inline int ctz64(uint64_t x) -{ - // No undefined behavior, std::countr_zero(0) = 64 - return std::countr_zero(x); -} - -} // namespace - -#elif (defined(__GNUC__) || \ - defined(__clang__)) && \ - defined(__x86_64__) - -#define HAS_CTZ64 -#define CTZ64_SUPPORTS_ZERO - -namespace { - -inline uint64_t ctz64(uint64_t x) -{ -#if defined(HAS_TZCNT) - // No undefined behavior, TZCNT(0) = 64 - __asm__("tzcnt %1, %0" : "=r"(x) : "r"(x)); - return x; -#else - // REP BSF uses the TZCNT instruction on x64 CPUs with the BMI1 - // instruction set (>= 2013) and the BSF instruction on older x64 - // CPUs. BSF(0) is undefined behavior, it leaves the destination - // register unmodified. Fortunately, it is possible to avoid this - // undefined behavior by always setting the destination register - // to the same value before executing BSF(0). This works on all - // AMD & Intel CPUs since the i586 (from 1993), the Linux kernel - // also relies on this behavior, see this Linux commit: - // https://github.com/torvalds/linux/commit/ca3d30cc02f780f68771087040ce935add6ba2b7 - // - // The constraint "0" for input operand 1 says that it must occupy - // the same location as output operand 0. Hence the assembly below - // uses the same input & output register. This ensures that - // BSF(0) = 0, hence there is no undefined behavior. However, you - // cannot rely on ctz64(0) = 0 since TZCNT(0) = 64. - __asm__("rep bsf %1, %0" : "=r"(x) : "0"(x)); - ASSERT(x <= 64); - return x; -#endif -} - -} // namespace - -#elif (defined(__GNUC__) || \ - defined(__clang__)) && \ - defined(__aarch64__) - -#define HAS_CTZ64 -#define CTZ64_SUPPORTS_ZERO - -namespace { - -inline uint64_t ctz64(uint64_t x) -{ - // No undefined behavior, CTZ(0) = 64. - // ARM64 has no CTZ instruction, we have to emulate it. - __asm__("rbit %0, %1 \n\t" - "clz %0, %0 \n\t" - : "=r" (x) - : "r" (x)); - - return x; -} - -} // namespace - -#elif defined(_MSC_VER) && \ - defined(HAS_TZCNT) && \ - __has_include() - -#include -#define HAS_CTZ64 -#define CTZ64_SUPPORTS_ZERO - -// This allows us to generate the TZCNT instruction for MSVC -// without C++20 support, hence without std::countr_zero(x). -// No undefined behavior, _tzcnt_u64(0) = 64. -#define ctz64(x) _tzcnt_u64(x) - -#elif defined(__GNUC__) || \ - __has_builtin(__builtin_ctzl) - -#define HAS_CTZ64 - -namespace { - -inline int ctz64(uint64_t x) -{ - // __builtin_ctz(0) is undefined behavior, - // we don't define CTZ64_SUPPORTS_ZERO. - ASSERT(x != 0); - -#if __cplusplus >= 201703L - if constexpr(sizeof(int) >= sizeof(uint64_t)) - return __builtin_ctz(x); - else if constexpr(sizeof(long) >= sizeof(uint64_t)) - return __builtin_ctzl(x); - else if constexpr(sizeof(long long) >= sizeof(uint64_t)) - return __builtin_ctzll(x); -#else - return __builtin_ctzll(x); -#endif -} - -} // namespace - -#endif - -#endif // INTRINSICS_HPP diff --git a/include/primesieve/popcnt.hpp b/include/primesieve/popcnt.hpp new file mode 100644 index 000000000..c81b9f389 --- /dev/null +++ b/include/primesieve/popcnt.hpp @@ -0,0 +1,214 @@ +/// +/// @file popcnt.hpp +/// @brief Functions to count the number of 1 bits inside +/// a 64-bit variable. +/// +/// Copyright (C) 2024 Kim Walisch, +/// +/// This file is distributed under the BSD License. See the COPYING +/// file in the top level directory. +/// + +#ifndef POPCNT_HPP +#define POPCNT_HPP + +#include "macros.hpp" +#include + +#if defined(ENABLE_MULTIARCH_x86_POPCNT) + #include "cpu_supports_popcnt.hpp" +#endif + +namespace { + +/// This uses fewer arithmetic operations than any other known +/// implementation on machines with fast multiplication. +/// It uses 12 arithmetic operations, one of which is a multiply. +/// http://en.wikipedia.org/wiki/Hamming_weight#Efficient_implementation +/// +inline uint64_t popcnt64_bitwise(uint64_t x) +{ + uint64_t m1 = 0x5555555555555555ll; + uint64_t m2 = 0x3333333333333333ll; + uint64_t m4 = 0x0F0F0F0F0F0F0F0Fll; + uint64_t h01 = 0x0101010101010101ll; + + x -= (x >> 1) & m1; + x = (x & m2) + ((x >> 2) & m2); + x = (x + (x >> 4)) & m4; + + return (x * h01) >> 56; +} + +} // namespace + +// GCC & Clang +#if defined(__GNUC__) || \ + __has_builtin(__builtin_popcountl) + +// CPUID is only enabled on x86 and x86-64 CPUs +// if the user compiles without -mpopcnt. +#if defined(ENABLE_MULTIARCH_x86_POPCNT) +#if defined(__x86_64__) + +namespace { + +ALWAYS_INLINE uint64_t popcnt64(uint64_t x) +{ + // On my AMD EPYC 7642 CPU using GCC 12 this runtime + // check incurs an overall overhead of about 1%. + if_likely(cpu_supports_popcnt) + { + __asm__("popcnt %1, %0" : "=r"(x) : "r"(x)); + return x; + } + else + { + // On x86 and x64 CPUs when using the GCC compiler + // __builtin_popcount*(x) is slow (not inlined function call) + // when compiling without -mpopcnt. Therefore we avoid + // using __builtin_popcount*(x) here. + return popcnt64_bitwise(x); + } +} + +} // namespace + +#elif defined(__i386__) + +namespace { + +ALWAYS_INLINE uint64_t popcnt64(uint64_t x) +{ + if_likely(cpu_supports_popcnt) + { + uint32_t x0 = uint32_t(x); + uint32_t x1 = uint32_t(x >> 32); + __asm__("popcnt %1, %0" : "=r"(x0) : "r"(x0)); + __asm__("popcnt %1, %0" : "=r"(x1) : "r"(x1)); + return x0 + x1; + } + else + { + // On x86 and x64 CPUs when using the GCC compiler + // __builtin_popcount*(x) is slow (not inlined function call) + // when compiling without -mpopcnt. Therefore we avoid + // using __builtin_popcount*(x) here. + return popcnt64_bitwise(x); + } +} + +} // namespace + +#endif // i386 + +#else // GCC & Clang (no CPUID, not x86) + +namespace { + +ALWAYS_INLINE uint64_t popcnt64(uint64_t x) +{ +#if __cplusplus >= 201703L + if constexpr(sizeof(int) >= sizeof(uint64_t)) + return (uint64_t) __builtin_popcount(x); + else if constexpr(sizeof(long) >= sizeof(uint64_t)) + return (uint64_t) __builtin_popcountl(x); + else if constexpr(sizeof(long long) >= sizeof(uint64_t)) + return (uint64_t) __builtin_popcountll(x); +#else + return (uint64_t) __builtin_popcountll(x); +#endif +} + +} // namespace + +#endif // GCC & Clang + +#elif defined(_MSC_VER) && \ + defined(_M_X64) && \ + __has_include() + +#include + +namespace { + +ALWAYS_INLINE uint64_t popcnt64(uint64_t x) +{ +#if defined(__POPCNT__) || \ + defined(__AVX__) + return __popcnt64(x); + +#elif defined(ENABLE_MULTIARCH_x86_POPCNT) + if_likely(cpu_supports_popcnt) + return __popcnt64(x); + else + return popcnt64_bitwise(x); + +#else + return popcnt64_bitwise(x); +#endif +} + +} // namespace + +#elif defined(_MSC_VER) && \ + defined(_M_IX86) && \ + __has_include() + +#include + +namespace { + +ALWAYS_INLINE uint64_t popcnt64(uint64_t x) +{ +#if defined(__POPCNT__) || \ + defined(__AVX__) + return __popcnt(uint32_t(x)) + + __popcnt(uint32_t(x >> 32)); + +#elif defined(ENABLE_MULTIARCH_x86_POPCNT) + if_likely(cpu_supports_popcnt) + return __popcnt(uint32_t(x)) + + __popcnt(uint32_t(x >> 32)); + else + return popcnt64_bitwise(x); + +#else + return popcnt64_bitwise(x); +#endif +} + +} // namespace + +#elif __cplusplus >= 202002L && \ + __has_include() + +#include + +namespace { + +/// We only use the C++ standard library as a fallback if there +/// are no compiler intrinsics available for POPCNT. +/// Compiler intrinsics often generate faster assembly. +ALWAYS_INLINE uint64_t popcnt64(uint64_t x) +{ + return std::popcount(x); +} + +} // namespace + +#else + +namespace { + +/// Portable (but slow) popcount algorithm +ALWAYS_INLINE uint64_t popcnt64(uint64_t x) +{ + return popcnt64_bitwise(x); +} + +} // namespace + +#endif + +#endif // POPCNT_HPP diff --git a/scripts/build_clang_multiarch_win_x64.bat b/scripts/build_clang_multiarch_win_x64.bat index d7b53231d..85c599762 100644 --- a/scripts/build_clang_multiarch_win_x64.bat +++ b/scripts/build_clang_multiarch_win_x64.bat @@ -1 +1 @@ -clang++ -I../include -O3 -DNDEBUG -DENABLE_MULTIARCH_AVX512 ../src/*.cpp ../src/app/*.cpp -o primesieve.exe "C:\Program Files\LLVM\lib\clang\18\lib\windows\clang_rt.builtins-x86_64.lib" +clang++ -I../include -O3 -mpopcnt -DNDEBUG -DENABLE_MULTIARCH_AVX512_VBMI2 ../src/*.cpp ../src/x86/*.cpp ../src/app/*.cpp -o primesieve.exe "C:\Program Files\LLVM\lib\clang\18\lib\windows\clang_rt.builtins-x86_64.lib" diff --git a/scripts/build_mingw64_x64.sh b/scripts/build_mingw64_x64.sh index c219e09d0..187f24ff1 100755 --- a/scripts/build_mingw64_x64.sh +++ b/scripts/build_mingw64_x64.sh @@ -45,7 +45,7 @@ git pull mkdir build-release cd build-release -g++ -static -O3 -flto -DNDEBUG -D_WIN32_WINNT=0x0A00 -Wall -Wextra -pedantic -I ../include ../src/*.cpp ../src/app/*.cpp -o primesieve.exe +g++ -static -O3 -mpopcnt -flto -DNDEBUG -D_WIN32_WINNT=0x0A00 -Wall -Wextra -pedantic -DENABLE_MULTIARCH_AVX512_VBMI2 -I ../include ../src/*.cpp ../src/x86/*.cpp ../src/app/*.cpp -o primesieve.exe strip primesieve.exe # Create a release zip archive diff --git a/src/CpuInfo.cpp b/src/CpuInfo.cpp index 365a24396..15a33e9a1 100644 --- a/src/CpuInfo.cpp +++ b/src/CpuInfo.cpp @@ -39,20 +39,12 @@ #define APPLE_SYSCTL #endif -#if defined(__i386__) || \ - defined(__x86_64__) || \ - defined(_M_IX86) || \ - defined(_M_X64) - #include - #include - #define HAS_CPUID -#endif - #if defined(_WIN32) #include #include +#include #include #include @@ -62,25 +54,29 @@ std::string getCpuName() { std::string cpuName; -#if defined(HAS_CPUID) +#if defined(__i386__) || \ + defined(__x86_64__) || \ + defined(_M_IX86) || \ + defined(_M_X64) + // Get the CPU name using CPUID. // Example: Intel(R) Core(TM) i7-6700 CPU @ 3.40GHz // https://en.wikipedia.org/wiki/CPUID int cpuInfo[4] = { 0, 0, 0, 0 }; - run_cpuid(0x80000000, 0, cpuInfo); + __cpuidex(cpuInfo, 0x80000000, 0); std::vector vect; // check if CPU name is supported if ((unsigned) cpuInfo[0] >= 0x80000004u) { - run_cpuid(0x80000002, 0, cpuInfo); + __cpuidex(cpuInfo, 0x80000002, 0); std::copy_n(cpuInfo, 4, std::back_inserter(vect)); - run_cpuid(0x80000003, 0, cpuInfo); + __cpuidex(cpuInfo, 0x80000003, 0); std::copy_n(cpuInfo, 4, std::back_inserter(vect)); - run_cpuid(0x80000004, 0, cpuInfo); + __cpuidex(cpuInfo, 0x80000004, 0); std::copy_n(cpuInfo, 4, std::back_inserter(vect)); vect.push_back(0); @@ -755,15 +751,6 @@ std::string CpuInfo::cpuName() const } } -bool CpuInfo::hasAVX512() const -{ - #if defined(HAS_CPUID) - return cpu_supports_avx512_vbmi2; - #else - return false; - #endif -} - size_t CpuInfo::logicalCpuCores() const { return logicalCpuCores_; diff --git a/src/PrimeGenerator.cpp b/src/PrimeGenerator.cpp index 3be1e0a1b..5406b298b 100644 --- a/src/PrimeGenerator.cpp +++ b/src/PrimeGenerator.cpp @@ -27,16 +27,16 @@ #include #include #include +#include #include -#include #include #include #include #include -#if defined(ENABLE_AVX512) || \ - defined(ENABLE_MULTIARCH_AVX512) +#if defined(ENABLE_AVX512_VBMI2) || \ + defined(ENABLE_MULTIARCH_AVX512_VBMI2) #include #endif @@ -458,8 +458,8 @@ void PrimeGenerator::fillNextPrimes_default(Vector& primes, std::size_ #endif -#if defined(ENABLE_AVX512) || \ - defined(ENABLE_MULTIARCH_AVX512) +#if defined(ENABLE_AVX512_VBMI2) || \ + defined(ENABLE_MULTIARCH_AVX512_VBMI2) /// This algorithm converts 1 bits from the sieve array into primes /// using AVX512. The algorithm is a modified version of the AVX512 @@ -474,10 +474,10 @@ void PrimeGenerator::fillNextPrimes_default(Vector& primes, std::size_ /// benchmarks this algorithm ran about 10% faster than the default /// fillNextPrimes() algorithm which uses __builtin_ctzll(). /// -#if defined(ENABLE_MULTIARCH_AVX512) +#if defined(ENABLE_MULTIARCH_AVX512_VBMI2) __attribute__ ((target ("avx512f,avx512vbmi,avx512vbmi2"))) #endif -void PrimeGenerator::fillNextPrimes_avx512(Vector& primes, std::size_t* size) +void PrimeGenerator::fillNextPrimes_avx512_vbmi2(Vector& primes, std::size_t* size) { *size = 0; diff --git a/src/SievingPrimes.cpp b/src/SievingPrimes.cpp index ee140cd48..a28bcd8be 100644 --- a/src/SievingPrimes.cpp +++ b/src/SievingPrimes.cpp @@ -2,7 +2,7 @@ /// @file SievingPrimes.cpp /// Generates the sieving primes up n^(1/2). /// -/// Copyright (C) 2022 Kim Walisch, +/// Copyright (C) 2024 Kim Walisch, /// /// This file is distributed under the BSD License. See the COPYING /// file in the top level directory. @@ -14,6 +14,7 @@ #include #include #include +#include #include #include diff --git a/src/app/main.cpp b/src/app/main.cpp index 6b25675da..ce0525621 100644 --- a/src/app/main.cpp +++ b/src/app/main.cpp @@ -35,6 +35,16 @@ #include #include +#if defined(ENABLE_MULTIARCH_AVX512_VBMI2) + +namespace primesieve { + +bool has_cpuid_avx512_vbmi2(); + +} // namespace + +#endif + void help(int exitCode); void version(); void stressTest(const CmdOptions& opts); @@ -223,14 +233,9 @@ void cpuInfo() else std::cout << "Logical CPU cores: unknown" << std::endl; - // Enable on x86 CPUs - #if defined(__x86_64__) || \ - defined(__i386__) || \ - defined(_M_X64) || \ - defined(_M_IX86) || \ - defined(__AVX512F__) + #if defined(ENABLE_MULTIARCH_AVX512_VBMI2) - if (cpu.hasAVX512()) + if (primesieve::has_cpuid_avx512_vbmi2()) std::cout << "Has AVX512: yes" << std::endl; else std::cout << "Has AVX512: no" << std::endl; diff --git a/src/popcount.cpp b/src/popcount.cpp index 93eab08ce..adede0c35 100644 --- a/src/popcount.cpp +++ b/src/popcount.cpp @@ -8,7 +8,7 @@ /// file in the top level directory. /// -#include +#include #include #include diff --git a/src/x86/cpuid.cpp b/src/x86/cpuid.cpp new file mode 100644 index 000000000..b571057c9 --- /dev/null +++ b/src/x86/cpuid.cpp @@ -0,0 +1,125 @@ +/// +/// @file cpuid.cpp +/// @brief CPUID for x86 and x86-64 CPUs. +/// +/// Copyright (C) 2024 Kim Walisch, +/// +/// This file is distributed under the BSD License. See the COPYING +/// file in the top level directory. +/// + +#include + +#if defined(_MSC_VER) + #include + #include +#endif + +// CPUID bits documentation: +// https://en.wikipedia.org/wiki/CPUID + +// %ebx bit flags +#define bit_AVX512F (1 << 16) + +// %ecx bit flags +#define bit_AVX512VBMI (1 << 1) +#define bit_AVX512VBMI2 (1 << 6) +#define bit_POPCNT (1 << 23) + +// xgetbv bit flags +#define XSTATE_SSE (1 << 1) +#define XSTATE_YMM (1 << 2) +#define XSTATE_ZMM (7 << 5) + +namespace { + +void run_cpuid(int eax, int ecx, int* abcd) +{ +#if defined(_MSC_VER) + __cpuidex(abcd, eax, ecx); +#else + int ebx = 0; + int edx = 0; + + #if defined(__i386__) && \ + defined(__PIC__) + // In case of PIC under 32-bit EBX cannot be clobbered + asm volatile("movl %%ebx, %%edi;" + "cpuid;" + "xchgl %%ebx, %%edi;" + : "+a" (eax), + "=D" (ebx), + "+c" (ecx), + "=d" (edx)); + #else + asm volatile("cpuid" + : "+a" (eax), + "+b" (ebx), + "+c" (ecx), + "=d" (edx)); + #endif + + abcd[0] = eax; + abcd[1] = ebx; + abcd[2] = ecx; + abcd[3] = edx; +#endif +} + +// Get Value of Extended Control Register +uint64_t get_xcr0() +{ +#if defined(_MSC_VER) + return _xgetbv(0); +#else + uint32_t eax; + uint32_t edx; + + asm volatile("xgetbv" : "=a"(eax), "=d"(edx) : "c"(0)); + return eax | (uint64_t(edx) << 32); +#endif +} + +} // namespace + +namespace primesieve { + +bool has_cpuid_popcnt() +{ + int abcd[4]; + run_cpuid(1, 0, abcd); + return (abcd[2] & bit_POPCNT) == bit_POPCNT; +} + +bool has_cpuid_avx512_vbmi2() +{ + int abcd[4]; + + run_cpuid(1, 0, abcd); + + int osxsave_mask = (1 << 27); + + // Ensure OS supports extended processor state management + if ((abcd[2] & osxsave_mask) != osxsave_mask) + return false; + + uint64_t ymm_mask = XSTATE_SSE | XSTATE_YMM; + uint64_t zmm_mask = XSTATE_SSE | XSTATE_YMM | XSTATE_ZMM; + uint64_t xcr0 = get_xcr0(); + + // Check AVX OS support + if ((xcr0 & ymm_mask) != ymm_mask) + return false; + + // Check AVX512 OS support + if ((xcr0 & zmm_mask) != zmm_mask) + return false; + + run_cpuid(7, 0, abcd); + + // PrimeGenerator::fillNextPrimes() requires AVX512F, AVX512VBMI & AVX512VBMI2 + return ((abcd[1] & bit_AVX512F) == bit_AVX512F && + (abcd[2] & (bit_AVX512VBMI | bit_AVX512VBMI2)) == (bit_AVX512VBMI | bit_AVX512VBMI2)); +} + +} // namespace diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 190ed9e7e..77fb8845c 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -5,6 +5,6 @@ foreach(file ${files}) get_filename_component(binary_name ${file} NAME_WE) add_executable(${binary_name} ${file}) target_link_libraries(${binary_name} primesieve::primesieve) - target_compile_definitions(${binary_name} PRIVATE "${ENABLE_ASSERT}" "${ENABLE_MULTIARCH_AVX512}") + target_compile_definitions(${binary_name} PRIVATE ${PRIMESIEVE_COMPILE_DEFINITIONS}) add_test(NAME ${binary_name} COMMAND ${binary_name}) endforeach() diff --git a/test/cpuid.cpp b/test/cpuid.cpp index 64c0d9361..6ed4b85c1 100644 --- a/test/cpuid.cpp +++ b/test/cpuid.cpp @@ -8,7 +8,10 @@ /// file in the top level directory. /// -#include +#if defined(ENABLE_MULTIARCH_x86_POPCNT) + #include +#endif + #include int main() @@ -18,52 +21,22 @@ int main() defined(_M_X64) || \ defined(_M_IX86) - #if defined(__POPCNT__) - #if defined(HAS_POPCNT) - std::cout << "OK: __POPCNT__ and HAS_POPCNT are defined!" << std::endl; - #else - std::cerr << "Error: HAS_POPCNT must be defined if __POPCNT__ is defined!" << std::endl; - return 1; - #endif - #endif - - #if defined(__AVX__) - #if defined(HAS_POPCNT) - std::cout << "OK: __AVX__ and HAS_POPCNT are defined!" << std::endl; - #else - std::cerr << "Error: HAS_POPCNT must be defined if __AVX__ is defined!" << std::endl; - return 1; - #endif + #if defined(__POPCNT__) && defined(ENABLE_MULTIARCH_x86_POPCNT) + std::cerr << "Error: ENABLE_MULTIARCH_x86_POPCNT must not be defined if __POPCNT__ is defined!" << std::endl; #endif - #if defined(__AVX2__) - #if defined(HAS_POPCNT) - std::cout << "OK: __AVX2__ and HAS_POPCNT are defined!" << std::endl; - #else - std::cerr << "Error: HAS_POPCNT must be defined if __AVX2__ is defined!" << std::endl; - return 1; - #endif - #endif + #if defined(_MSC_VER) && \ + !defined(__POPCNT__) - #if defined(HAS_POPCNT) - #if !defined(ENABLE_CPUID_POPCNT) - std::cout << "OK: HAS_POPCNT is defined but ENABLE_CPUID_POPCNT is not defined!" << std::endl; - #else - std::cerr << "Error: ENABLE_CPUID_POPCNT must not be defined if HAS_POPCNT is defined!" << std::endl; - return 1; + #if defined(__AVX__) && defined(ENABLE_MULTIARCH_x86_POPCNT) + std::cerr << "Error: ENABLE_MULTIARCH_x86_POPCNT must not be defined if __AVX__ is defined!" << std::endl; #endif - #endif - - #if !defined(HAS_POPCNT) - #if defined(ENABLE_CPUID_POPCNT) - std::cout << "OK: HAS_POPCNT is not defined but ENABLE_CPUID_POPCNT is defined!" << std::endl; - #else - std::cerr << "Error: ENABLE_CPUID_POPCNT must be defined if HAS_POPCNT is not defined!" << std::endl; - return 1; + #if defined(__AVX2__) && defined(ENABLE_MULTIARCH_x86_POPCNT) + std::cerr << "Error: ENABLE_MULTIARCH_x86_POPCNT must not be defined if __AVX2__ is defined!" << std::endl; #endif #endif - #if defined(ENABLE_CPUID_POPCNT) + #if defined(ENABLE_MULTIARCH_x86_POPCNT) std::cout << "CPU supports POPCNT: " << (cpu_supports_popcnt ? "yes" : "no") << std::endl; #endif