[MicroBenchmarks] Add initial loop vectorization benchmarks.

This patch adds initial micro-benchmarks with interesting loop-vectorization cases. To start with, it includes benchmarks using libm math functions. For each math function, there's a benchmark for the auto-vectorized version and a version with vectorization disabled. The auto-vec version of the benchmark also compares the results of the auto-vectorized functions to the scalar versions. Reviewed By: Meinersbur, lebedev.ri Differential Revision: https://reviews.llvm.org/D101844
yxsamliu · May 17, 2021 · 3af2314 · 3af2314
1 parent fb038de
commit 3af2314
Show file tree

Hide file tree

Showing 4 changed files with 153 additions and 0 deletions.
diff --git a/MicroBenchmarks/CMakeLists.txt b/MicroBenchmarks/CMakeLists.txt
@@ -7,4 +7,5 @@ add_subdirectory(LCALS)
 add_subdirectory(harris)
 add_subdirectory(ImageProcessing)
 add_subdirectory(LoopInterchange)
+add_subdirectory(LoopVectorization)
 add_subdirectory(MemFunctions)
diff --git a/MicroBenchmarks/LoopVectorization/CMakeLists.txt b/MicroBenchmarks/LoopVectorization/CMakeLists.txt
@@ -0,0 +1,16 @@
+llvm_test_run(WORKDIR ${CMAKE_CURRENT_BINARY_DIR})
+
+# Only enable verification of results if neither 'benchmarking only' has been
+# selected nor -ffast-math is passed.
+string(TOUPPER ${CMAKE_BUILD_TYPE} CMAKE_BUILD_TYPE_UPPER)
+set(COMBINED_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CMAKE_CXX_FLAGS_${CMAKE_BUILD_TYPE_UPPER}} ${CPPFLAGS}")
+if (NOT TEST_SUITE_BENCHMARKING_ONLY AND
+    NOT ${COMBINED_CXX_FLAGS} MATCHES ".*-ffast-math.*")
+  list(APPEND CPPFLAGS -DBENCH_AND_VERIFY)
+endif()
+
+llvm_test_executable(LoopVectorizationBenchmarks
+  main.cpp
+  MathFunctions.cpp)
+
+target_link_libraries(LoopVectorizationBenchmarks benchmark)
diff --git a/MicroBenchmarks/LoopVectorization/MathFunctions.cpp b/MicroBenchmarks/LoopVectorization/MathFunctions.cpp
@@ -0,0 +1,128 @@
+#include <iostream>
+#include <math.h>
+#include <memory>
+#include <random>
+
+#include "benchmark/benchmark.h"
+
+#define N 10000
+
+// Apply Fn(A[i]) + Fn(B[i]) in loop, with default loop vectorization settings.
+template <typename T> static void run_fn_autovec(T *A, T *B, T *C, T (*Fn)(T)) {
+  for (unsigned i = 0; i < N; i++) {
+    C[i] = Fn(A[i]) + Fn(B[i]);
+  }
+}
+
+// Apply Fn(A[i]) + Fn(B[i]) in loop, with loop vectorization disabled.
+template <typename T> static void run_fn_novec(T *A, T *B, T *C, T (*Fn)(T)) {
+#pragma clang loop vectorize(disable) interleave(disable)
+  for (unsigned i = 0; i < N; i++) {
+    C[i] = Fn(A[i]) + Fn(B[i]);
+  }
+}
+
+// Initialize arrays A, B and T with random numbers.
+template <typename T> static void init_data(T *A, T *B, T *C) {
+  std::uniform_real_distribution<T> dist(-100, 100);
+  std::mt19937 rng(12345);
+  for (unsigned i = 0; i < N; i++) {
+    A[i] = dist(rng);
+    B[i] = dist(rng);
+    C[i] = dist(rng);
+  }
+}
+
+// Benchmark auto-vectorized version using Fn.
+template <typename T>
+static void __attribute__((always_inline))
+benchmark_fn_autovec(benchmark::State &state, T (*Fn)(T)) {
+  std::unique_ptr<T[]> A(new T[N]);
+  std::unique_ptr<T[]> B(new T[N]);
+  std::unique_ptr<T[]> C(new T[N]);
+  init_data(&A[0], &B[0], &C[0]);
+
+#ifdef BENCH_AND_VERIFY
+  // Verify the vectorized and un-vectorized versions produce the same results.
+  {
+    std::unique_ptr<T[]> CNovec(new T[N]);
+    for (unsigned i = 0; i < N; i++)
+      CNovec[i] = C[i];
+
+    run_fn_novec(&A[0], &B[0], &CNovec[0], Fn);
+    run_fn_autovec(&A[0], &B[0], &C[0], Fn);
+    for (unsigned i = 0; i < N; i++)
+      // If there's a value mismatch, fall back to fpclassify.
+      if (C[i] != CNovec[i] && fpclassify(C[i]) != fpclassify(CNovec[i])) {
+        std::cerr << "ERROR: autovec result different to scalar result " << C[i]
+                  << " != " << CNovec[i] << " at index " << i << "\n";
+        exit(1);
+      }
+  }
+#endif
+
+  for (auto _ : state) {
+    run_fn_autovec(&A[0], &B[0], &C[0], Fn);
+    benchmark::DoNotOptimize(A);
+    benchmark::DoNotOptimize(B);
+    benchmark::DoNotOptimize(C);
+    benchmark::ClobberMemory();
+  }
+}
+
+// Benchmark version using Fn with vectorization disabled.
+template <typename T>
+static void __attribute__((always_inline))
+benchmark_fn_novec(benchmark::State &state, T (*Fn)(T)) {
+  std::unique_ptr<T[]> A(new T[N]);
+  std::unique_ptr<T[]> B(new T[N]);
+  std::unique_ptr<T[]> C(new T[N]);
+  init_data(&A[0], &B[0], &C[0]);
+
+  for (auto _ : state) {
+    run_fn_novec(&A[0], &B[0], &C[0], Fn);
+    benchmark::DoNotOptimize(A);
+    benchmark::DoNotOptimize(B);
+    benchmark::DoNotOptimize(C);
+  }
+}
+
+// Add add auto-vectorized and disabled vectorization benchmarks for math
+// function fn and type ty.
+#define ADD_BENCHMARK(fn, ty)                                                  \
+  void BENCHMARK_##fn##_autovec_##ty##_(benchmark::State &state) {             \
+    benchmark_fn_autovec<ty>(state, fn);                                       \
+  }                                                                            \
+  BENCHMARK(BENCHMARK_##fn##_autovec_##ty##_)->Unit(benchmark::kMicrosecond);  \
+                                                                               \
+  void BENCHMARK_##fn##_novec_##ty##_(benchmark::State &state) {               \
+    benchmark_fn_novec<ty>(state, fn);                                         \
+  }                                                                            \
+  BENCHMARK(BENCHMARK_##fn##_novec_##ty##_)->Unit(benchmark::kMicrosecond);
+
+ADD_BENCHMARK(expf, float)
+ADD_BENCHMARK(exp, double)
+
+ADD_BENCHMARK(acosf, float)
+ADD_BENCHMARK(acos, double)
+
+ADD_BENCHMARK(asinf, float)
+ADD_BENCHMARK(asin, double)
+
+ADD_BENCHMARK(atanf, float)
+ADD_BENCHMARK(atan, double)
+
+ADD_BENCHMARK(cbrtf, float)
+ADD_BENCHMARK(cbrt, double)
+
+ADD_BENCHMARK(erff, float)
+ADD_BENCHMARK(erf, double)
+
+ADD_BENCHMARK(cosf, float)
+ADD_BENCHMARK(cos, double)
+
+ADD_BENCHMARK(sinf, float)
+ADD_BENCHMARK(sin, double)
+
+ADD_BENCHMARK(sinhf, float)
+ADD_BENCHMARK(sinh, double)
diff --git a/MicroBenchmarks/LoopVectorization/main.cpp b/MicroBenchmarks/LoopVectorization/main.cpp
@@ -0,0 +1,8 @@
+#include "benchmark/benchmark.h"
+
+int main(int argc, char *argv[]) {
+  benchmark::Initialize(&argc, argv);
+
+  benchmark::RunSpecifiedBenchmarks();
+  return EXIT_SUCCESS;
+}