Skip to content

Commit

Permalink
Templated perf monitor and benchmark (cmu-db#1123)
Browse files Browse the repository at this point in the history
  • Loading branch information
mbutrovich authored Aug 26, 2020
1 parent f7b60df commit e105d4e
Show file tree
Hide file tree
Showing 6 changed files with 164 additions and 94 deletions.
1 change: 1 addition & 0 deletions benchmark/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ if (TERRIER_BUILD_BENCHMARKS)
# benchmarks

add_subdirectory(catalog)
add_subdirectory(common)
add_subdirectory(integration)
add_subdirectory(metrics)
add_subdirectory(parser)
Expand Down
1 change: 1 addition & 0 deletions benchmark/common/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# Project-provided helper macro — presumably registers each benchmark source in
# this directory as a benchmark executable/target; confirm against the
# top-level CMake modules where it is defined.
ADD_TERRIER_BENCHMARKS()
45 changes: 45 additions & 0 deletions benchmark/common/perf_monitor_benchmark.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
#include "benchmark/benchmark.h"
#include "common/perf_monitor.h"

namespace terrier {

/**
* These benchmarks exist to verify the performance difference between grouped and ungrouped perf counters. We do not
* include them in our CI regression checks since their behavior is determined more by the OS than our wrapper.
*/
// Empty fixture: these benchmarks need no shared setup/teardown state; it exists
// only to group the Basic/Inherit cases under one benchmark family name.
class PerfMonitorBenchmark : public benchmark::Fixture {};

/**
 * Benchmark with inherit flag (count children) false, i.e. the path that groups
 * all counters under one fd and reads them with a single syscall.
 * Measures the per-iteration cost of a full Start/Stop/Counters cycle.
 */
// NOLINTNEXTLINE
BENCHMARK_DEFINE_F(PerfMonitorBenchmark, Basic)(benchmark::State &state) {
  common::PerfMonitor<false> monitor;
  // NOLINTNEXTLINE
  for (auto _ : state) {
    monitor.Start();
    monitor.Stop();
    // The counters are otherwise unused; DoNotOptimize prevents the compiler
    // from discarding the read-side work we are trying to measure.
    auto counters = monitor.Counters();
    benchmark::DoNotOptimize(counters);
  }
  state.SetItemsProcessed(state.iterations());
}

/**
 * Benchmark with inherit flag (count children) true, i.e. each counter is its
 * own perf event group and must be reset/enabled/read individually.
 * Measures the per-iteration cost of a full Start/Stop/Counters cycle.
 */
// NOLINTNEXTLINE
BENCHMARK_DEFINE_F(PerfMonitorBenchmark, Inherit)(benchmark::State &state) {
  common::PerfMonitor<true> monitor;
  // NOLINTNEXTLINE
  for (auto _ : state) {
    monitor.Start();
    monitor.Stop();
    // The counters are otherwise unused; DoNotOptimize prevents the compiler
    // from discarding the read-side work we are trying to measure.
    auto counters = monitor.Counters();
    benchmark::DoNotOptimize(counters);
  }
  state.SetItemsProcessed(state.iterations());
}

// Register both cases with the framework; no custom args/iterations are set,
// so the framework's defaults apply.
BENCHMARK_REGISTER_F(PerfMonitorBenchmark, Basic);

BENCHMARK_REGISTER_F(PerfMonitorBenchmark, Inherit);
} // namespace terrier
99 changes: 68 additions & 31 deletions src/include/common/perf_monitor.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,14 +19,23 @@ namespace terrier::common {
* Wrapper around hw perf events provided by the Linux kernel. Instantiating and destroying PerfMonitors are a bit
* expensive because they open multiple file descriptors (read: syscalls). Ideally you want to keep a PerfMonitor object
* around for a portion of code you want to profile, and then just rely on Start() and Stop().
* @tparam inherit true means that any threads spawned from this thread after the perf counter instantiation will be
* accumulated into the parents' counters. This has performance implications. false otherwise (only count this thread's
* counters, regardless of spawned threads)
*/
template <bool inherit>
class PerfMonitor {
public:
/**
* Represents the struct read_format with PERF_FORMAT_GROUP enabled, PERF_FORMAT_TOTAL_TIME_ENABLED and
* PERF_FORMAT_TOTAL_TIME_RUNNING disabled. http://www.man7.org/linux/man-pages/man2/perf_event_open.2.html
*/
struct PerfCounters {
/**
* Should always be NUM_HW_EVENTS after a read since that's how many counters we have.
*/
uint64_t num_counters_;

/**
* Total cycles. Be wary of what happens during CPU frequency scaling.
*/
Expand Down Expand Up @@ -89,11 +98,9 @@ class PerfMonitor {
};

/**
* @param count_children_tasks true if spawned threads should inherit perf counters. Calling counters on parent will
* accumulate all.
* @warning a true arg seems to result in garbage counters if any are separately created in children tasks.
* Create a perf monitor and open all of the necessary file descriptors.
*/
explicit PerfMonitor(const bool count_children_tasks) {
PerfMonitor() {
#if __APPLE__
// Apple doesn't support perf events and currently doesn't expose an equivalent kernel API
valid_ = false;
Expand All @@ -106,16 +113,25 @@ class PerfMonitor {
pe.disabled = 1;
pe.exclude_kernel = 1;
pe.exclude_hv = 1;
if (count_children_tasks) {
if constexpr (inherit) { // NOLINT
// Count your children's counters
pe.inherit = 1;
pe.inherit_stat = 1;
} else { // NOLINT
// Don't read children thread counters, can optimize to read this thread's counters in group fashion
pe.read_format = PERF_FORMAT_GROUP;
}
// pe.read_format = PERF_FORMAT_GROUP;

// Open file descriptors for each perf_event that we want. We reuse the first entry of the array as the group fd.
// Open file descriptors for each perf_event that we want.
for (uint8_t i = 0; i < NUM_HW_EVENTS; i++) {
pe.config = HW_EVENTS[i];
event_files_[i] = syscall(__NR_perf_event_open, &pe, 0, -1, -1, 0);
if constexpr (inherit) { // NOLINT
// Each counter is its own group (-1 group fd)
event_files_[i] = syscall(__NR_perf_event_open, &pe, 0, -1, -1, 0);
} else { // NOLINT
// We reuse the first entry of the array as the group fd.
event_files_[i] = syscall(__NR_perf_event_open, &pe, 0, -1, event_files_[0], 0);
}
valid_ = valid_ && event_files_[i] > 2; // 0, 1, 2 are reserved for stdin, stdout, stderr respectively
}
#endif
Expand All @@ -141,11 +157,20 @@ class PerfMonitor {
// do nothing
#else
if (valid_) {
// Iterate through all of the events' file descriptors resetting and starting them
for (const auto i : event_files_) {
auto result UNUSED_ATTRIBUTE = ioctl(i, PERF_EVENT_IOC_RESET);
if constexpr (inherit) { // NOLINT
// Iterate through all of the events' file descriptors resetting and starting them
for (const auto i : event_files_) {
auto result UNUSED_ATTRIBUTE = ioctl(i, PERF_EVENT_IOC_RESET);
TERRIER_ASSERT(result >= 0, "Failed to reset events.");
result = ioctl(i, PERF_EVENT_IOC_ENABLE);
TERRIER_ASSERT(result >= 0, "Failed to enable events.");
}
} else { // NOLINT
// Reset all of the counters out with a single syscall.
auto result UNUSED_ATTRIBUTE = ioctl(event_files_[0], PERF_EVENT_IOC_RESET, PERF_IOC_FLAG_GROUP);
TERRIER_ASSERT(result >= 0, "Failed to reset events.");
result = ioctl(i, PERF_EVENT_IOC_ENABLE);
// Start all of the counters out with a single syscall.
result = ioctl(event_files_[0], PERF_EVENT_IOC_ENABLE, PERF_IOC_FLAG_GROUP);
TERRIER_ASSERT(result >= 0, "Failed to enable events.");
}
running_ = true;
Expand All @@ -162,10 +187,15 @@ class PerfMonitor {
#else
if (valid_) {
TERRIER_ASSERT(running_, "StopEvents() called without StartEvents() first.");

// Iterate through all of the events' file descriptors stopping them
for (const auto i : event_files_) {
auto result UNUSED_ATTRIBUTE = ioctl(i, PERF_EVENT_IOC_DISABLE);
if constexpr (inherit) { // NOLINT
// Iterate through all of the events' file descriptors stopping them
for (const auto i : event_files_) {
auto result UNUSED_ATTRIBUTE = ioctl(i, PERF_EVENT_IOC_DISABLE);
TERRIER_ASSERT(result >= 0, "Failed to disable events.");
}
} else { // NOLINT
// Stop all of the counters out with a single syscall.
auto result UNUSED_ATTRIBUTE = ioctl(event_files_[0], PERF_EVENT_IOC_DISABLE, PERF_IOC_FLAG_GROUP);
TERRIER_ASSERT(result >= 0, "Failed to disable events.");
}
running_ = false;
Expand All @@ -175,30 +205,37 @@ class PerfMonitor {

/**
* Read out counters for the profiled period
* @return
* @return struct representing the counters
*/
PerfCounters Counters() const {
PerfCounters counters{}; // zero initialization
if (valid_) {
// Iterate through all of the events' file descriptors reading them
if constexpr (inherit) { // NOLINT
// Iterate through all of the events' file descriptors reading them

auto bytes_read UNUSED_ATTRIBUTE = read(event_files_[0], &counters.cpu_cycles_, sizeof(uint64_t));
TERRIER_ASSERT(bytes_read == sizeof(uint64_t), "Failed to read the counter.");
auto bytes_read UNUSED_ATTRIBUTE = read(event_files_[0], &counters.cpu_cycles_, sizeof(uint64_t));
TERRIER_ASSERT(bytes_read == sizeof(uint64_t), "Failed to read the counter.");

bytes_read = read(event_files_[1], &counters.instructions_, sizeof(uint64_t));
TERRIER_ASSERT(bytes_read == sizeof(uint64_t), "Failed to read the counter.");
bytes_read = read(event_files_[1], &counters.instructions_, sizeof(uint64_t));
TERRIER_ASSERT(bytes_read == sizeof(uint64_t), "Failed to read the counter.");

bytes_read = read(event_files_[2], &counters.cache_references_, sizeof(uint64_t));
TERRIER_ASSERT(bytes_read == sizeof(uint64_t), "Failed to read the counter.");
bytes_read = read(event_files_[2], &counters.cache_references_, sizeof(uint64_t));
TERRIER_ASSERT(bytes_read == sizeof(uint64_t), "Failed to read the counter.");

bytes_read = read(event_files_[3], &counters.cache_misses_, sizeof(uint64_t));
TERRIER_ASSERT(bytes_read == sizeof(uint64_t), "Failed to read the counter.");
bytes_read = read(event_files_[3], &counters.cache_misses_, sizeof(uint64_t));
TERRIER_ASSERT(bytes_read == sizeof(uint64_t), "Failed to read the counter.");

bytes_read = read(event_files_[4], &counters.bus_cycles_, sizeof(uint64_t));
TERRIER_ASSERT(bytes_read == sizeof(uint64_t), "Failed to read the counter.");
bytes_read = read(event_files_[4], &counters.bus_cycles_, sizeof(uint64_t));
TERRIER_ASSERT(bytes_read == sizeof(uint64_t), "Failed to read the counter.");

bytes_read = read(event_files_[5], &counters.ref_cpu_cycles_, sizeof(uint64_t));
TERRIER_ASSERT(bytes_read == sizeof(uint64_t), "Failed to read the counter.");
bytes_read = read(event_files_[5], &counters.ref_cpu_cycles_, sizeof(uint64_t));
TERRIER_ASSERT(bytes_read == sizeof(uint64_t), "Failed to read the counter.");
} else { // NOLINT
// Read all of the counters out with a single syscall.
auto bytes_read UNUSED_ATTRIBUTE = read(event_files_[0], &counters, sizeof(PerfCounters));
TERRIER_ASSERT(bytes_read == sizeof(PerfCounters), "Failed to read the counters.");
TERRIER_ASSERT(counters.num_counters_ == NUM_HW_EVENTS, "Failed to read the counters.");
}
}
return counters;
}
Expand All @@ -211,7 +248,7 @@ class PerfMonitor {
private:
// set the first file descriptor to -1. Since event_files[0] is always passed into group_fd on
// perf_event_open, this has the effect of making the first event the group leader. All subsequent syscalls can use
// that fd.
// that fd if we are not inheriting child counters.
std::array<int32_t, NUM_HW_EVENTS> event_files_{-1};
bool valid_ = true;

Expand Down
6 changes: 4 additions & 2 deletions src/include/common/resource_tracker.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@ namespace terrier::common {
* thread-local level, but in theory this can be extended to track the system global resource usage.
*/
class ResourceTracker {
static constexpr bool COUNT_CHILDREN_THREADS = false;

public:
/**
* Store the start time, the duration, the perf counters and the rusage counters for the tracked event
Expand All @@ -28,7 +30,7 @@ class ResourceTracker {
/** The elapsed time of the tracked event (microseconds) */
uint64_t elapsed_us_;
/** The perf counters of the tracked event */
PerfMonitor::PerfCounters counters_;
PerfMonitor<COUNT_CHILDREN_THREADS>::PerfCounters counters_;
/** The rusage counters of the tracked event */
rusage rusage_;
/** The number of the CPU on which the thread is currently executing */
Expand Down Expand Up @@ -100,7 +102,7 @@ class ResourceTracker {
*/
void SetMemory(const size_t memory_b) { metrics_.memory_b_ = memory_b; }

PerfMonitor perf_monitor_{false};
PerfMonitor<COUNT_CHILDREN_THREADS> perf_monitor_;
RusageMonitor rusage_monitor_{false};

// The struct to hold all the tracked resource metrics
Expand Down
106 changes: 45 additions & 61 deletions test/common/perf_monitor_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,92 +2,76 @@

#include <thread> //NOLINT

#include "common/macros.h"
#include "common/managed_pointer.h"
#include "common/scoped_timer.h"
#include "main/db_main.h"
#include "test_util/storage_test_util.h"
#include "test_util/test_harness.h"

namespace terrier {

class PerfMonitorTests : public TerrierTest {
public:
static void CreateAndDestroyCatalog(common::PerfMonitor::PerfCounters *const counters) {
common::PerfMonitor monitor(false);
/**
* These tests mostly exist to make sure we can compile and use the API of perf counters. It's difficult to make any
* assertions about the counters' values due to OS behavior (unsupported on macOS, unsupported on Linux without changing
* kernel flags or possibly running the tests as root). I keep the test around as an easy sanity check via break points
* to make sure perf counters give us the data we want on a given system.
*/
class PerfMonitorTests : public TerrierTest {};

template <bool inherit, typename perf_counters>
static void CreateAndDestroyCatalog(perf_counters *const counters) {
common::PerfMonitor<inherit> monitor;
monitor.Start();

auto db_main = terrier::DBMain::Builder().SetUseGC(true).SetUseCatalog(true).Build();
monitor.Start();
db_main.reset();
auto db_main = terrier::DBMain::Builder().SetUseGC(true).SetUseCatalog(true).Build();
db_main.reset();

monitor.Stop();
*counters = monitor.Counters();
}
monitor.Stop();
*counters = monitor.Counters();
}

template <bool inherit, typename perf_counters>
static void JustSleep(perf_counters *const counters) {
common::PerfMonitor<inherit> monitor;
monitor.Start();

static void JustSleep(common::PerfMonitor::PerfCounters *const counters) {
common::PerfMonitor monitor(false);
monitor.Start();
std::this_thread::sleep_for(std::chrono::seconds(2));
monitor.Stop();
*counters = monitor.Counters();
}
};
std::this_thread::sleep_for(std::chrono::seconds(2));

monitor.Stop();
*counters = monitor.Counters();
}

/**
* Simple test that spins off 2 threads. One thread builds and then destroys a Catalog. The other sleeps. The parent
* waits. We then do comparisons on their respective perf counters based on the work performed.
* waits. We can't make strong assertions about their values due to scheduling uncertainty.
*/
// NOLINTNEXTLINE
TEST_F(PerfMonitorTests, BasicTest) {
common::PerfMonitor parent_monitor(false);
common::PerfMonitor::PerfCounters parent_counters, catalog_counters, sleep_counters;
template <bool inherit, typename perf_counters>
static void UnbalancedChildrenThreads() {
common::PerfMonitor<inherit> parent_monitor;
perf_counters parent_counters, catalog_counters, sleep_counters;
parent_monitor.Start();
std::thread thread1(CreateAndDestroyCatalog, &catalog_counters);
std::thread thread2(JustSleep, &sleep_counters);
std::thread thread1(CreateAndDestroyCatalog<inherit, perf_counters>, &catalog_counters);
std::thread thread2(JustSleep<inherit, perf_counters>, &sleep_counters);
thread1.join();
thread2.join();
parent_monitor.Stop();
parent_counters = parent_monitor.Counters();
}

EXPECT_TRUE(catalog_counters.cpu_cycles_ >= parent_counters.cpu_cycles_);
EXPECT_TRUE(catalog_counters.instructions_ >= parent_counters.instructions_);
EXPECT_TRUE(catalog_counters.cache_references_ >= parent_counters.cache_references_);
EXPECT_TRUE(catalog_counters.bus_cycles_ >= parent_counters.bus_cycles_);
EXPECT_TRUE(catalog_counters.ref_cpu_cycles_ >= parent_counters.ref_cpu_cycles_);
EXPECT_TRUE(parent_counters.cpu_cycles_ >= sleep_counters.cpu_cycles_);
EXPECT_TRUE(parent_counters.instructions_ >= sleep_counters.instructions_);
EXPECT_TRUE(parent_counters.cache_references_ >= sleep_counters.cache_references_);
EXPECT_TRUE(parent_counters.bus_cycles_ >= sleep_counters.bus_cycles_);
EXPECT_TRUE(parent_counters.ref_cpu_cycles_ >= sleep_counters.ref_cpu_cycles_);
/**
* Test with inherit flag (count children) false
*/
// NOLINTNEXTLINE
TEST_F(PerfMonitorTests, BasicTest) {
constexpr bool inherit = false;
UnbalancedChildrenThreads<inherit, common::PerfMonitor<inherit>::PerfCounters>();
}

/**
* Simple test that spins off 2 threads. One thread builds and then destroys a Catalog. The other sleeps. The parent
* waits. We then do comparisons on their respective perf counters based on the work performed. This scenario has the
* parent accumulate for sub-tasks.
* Test with inherit flag (count children) true
*/
// NOLINTNEXTLINE
TEST_F(PerfMonitorTests, InheritTest) {
common::PerfMonitor parent_monitor(true);
common::PerfMonitor::PerfCounters parent_counters, catalog_counters, sleep_counters;
parent_monitor.Start();
std::thread thread1(CreateAndDestroyCatalog, &catalog_counters);
std::thread thread2(JustSleep, &sleep_counters);
thread1.join();
thread2.join();
parent_monitor.Stop();
parent_counters = parent_monitor.Counters();

EXPECT_TRUE(parent_counters.cpu_cycles_ >= catalog_counters.cpu_cycles_);
EXPECT_TRUE(parent_counters.instructions_ >= catalog_counters.instructions_);
EXPECT_TRUE(parent_counters.cache_references_ >= catalog_counters.cache_references_);
EXPECT_TRUE(parent_counters.bus_cycles_ >= catalog_counters.bus_cycles_);
EXPECT_TRUE(parent_counters.ref_cpu_cycles_ >= catalog_counters.ref_cpu_cycles_);
EXPECT_TRUE(parent_counters.cpu_cycles_ >= sleep_counters.cpu_cycles_);
EXPECT_TRUE(parent_counters.instructions_ >= sleep_counters.instructions_);
EXPECT_TRUE(parent_counters.cache_references_ >= sleep_counters.cache_references_);
EXPECT_TRUE(parent_counters.bus_cycles_ >= sleep_counters.bus_cycles_);
EXPECT_TRUE(parent_counters.ref_cpu_cycles_ >= sleep_counters.ref_cpu_cycles_);
constexpr bool inherit = true;
UnbalancedChildrenThreads<inherit, common::PerfMonitor<inherit>::PerfCounters>();
}

} // namespace terrier

0 comments on commit e105d4e

Please sign in to comment.