Skip to content

Commit

Permalink
Templated perf monitor and benchmark (cmu-db#1123)
Browse files Browse the repository at this point in the history
  • Loading branch information
mbutrovich authored Aug 26, 2020
1 parent f7b60df commit e105d4e
Show file tree
Hide file tree
Showing 6 changed files with 164 additions and 94 deletions.
1 change: 1 addition & 0 deletions benchmark/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ if (TERRIER_BUILD_BENCHMARKS)
# benchmarks

add_subdirectory(catalog)
add_subdirectory(common)
add_subdirectory(integration)
add_subdirectory(metrics)
add_subdirectory(parser)
Expand Down
1 change: 1 addition & 0 deletions benchmark/common/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# Project-provided helper macro — presumably registers each benchmark source in
# this directory as a benchmark executable/target; confirm against the
# top-level CMake modules where it is defined.
ADD_TERRIER_BENCHMARKS()
45 changes: 45 additions & 0 deletions benchmark/common/perf_monitor_benchmark.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
#include "benchmark/benchmark.h"
#include "common/perf_monitor.h"

namespace terrier {

/**
* These benchmarks exist to verify the performance difference between grouped and ungrouped perf counters. We do not
* include them in our CI regression checks since their behavior is determined more by the OS than our wrapper.
*/
// Empty fixture: these benchmarks need no shared setup/teardown state; it exists
// only to group the Basic/Inherit cases under one benchmark family name.
class PerfMonitorBenchmark : public benchmark::Fixture {};

/**
 * Benchmark with inherit flag (count children) false, i.e. the path that groups
 * all counters under one fd and reads them with a single syscall.
 * Measures the per-iteration cost of a full Start/Stop/Counters cycle.
 */
// NOLINTNEXTLINE
BENCHMARK_DEFINE_F(PerfMonitorBenchmark, Basic)(benchmark::State &state) {
  common::PerfMonitor<false> monitor;
  // NOLINTNEXTLINE
  for (auto _ : state) {
    monitor.Start();
    monitor.Stop();
    // The counters are otherwise unused; DoNotOptimize prevents the compiler
    // from discarding the read-side work we are trying to measure.
    auto counters = monitor.Counters();
    benchmark::DoNotOptimize(counters);
  }
  state.SetItemsProcessed(state.iterations());
}

/**
 * Benchmark with inherit flag (count children) true, i.e. each counter is its
 * own perf event group and must be reset/enabled/read individually.
 * Measures the per-iteration cost of a full Start/Stop/Counters cycle.
 */
// NOLINTNEXTLINE
BENCHMARK_DEFINE_F(PerfMonitorBenchmark, Inherit)(benchmark::State &state) {
  common::PerfMonitor<true> monitor;
  // NOLINTNEXTLINE
  for (auto _ : state) {
    monitor.Start();
    monitor.Stop();
    // The counters are otherwise unused; DoNotOptimize prevents the compiler
    // from discarding the read-side work we are trying to measure.
    auto counters = monitor.Counters();
    benchmark::DoNotOptimize(counters);
  }
  state.SetItemsProcessed(state.iterations());
}

// Register both cases with the framework; no custom args/iterations are set,
// so the framework's defaults apply.
BENCHMARK_REGISTER_F(PerfMonitorBenchmark, Basic);

BENCHMARK_REGISTER_F(PerfMonitorBenchmark, Inherit);
} // namespace terrier
99 changes: 68 additions & 31 deletions src/include/common/perf_monitor.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,14 +19,23 @@ namespace terrier::common {
* Wrapper around hw perf events provided by the Linux kernel. Instantiating and destroying PerfMonitors are a bit
* expensive because they open multiple file descriptors (read: syscalls). Ideally you want to keep a PerfMonitor object
* around for a portion of code you want to profile, and then just rely on Start() and Stop().
* @tparam inherit true means that any threads spawned from this thread after the perf counter instantiation will be
* accumulated into the parents' counters. This has performance implications. false otherwise (only count this thread's
* counters, regardless of spawned threads)
*/
template <bool inherit>
class PerfMonitor {
public:
/**
* Represents the struct read_format with PERF_FORMAT_GROUP enabled, PERF_FORMAT_TOTAL_TIME_ENABLED and
* PERF_FORMAT_TOTAL_TIME_RUNNING disabled. http://www.man7.org/linux/man-pages/man2/perf_event_open.2.html
*/
struct PerfCounters {
/**
* Should always be NUM_HW_EVENTS after a read since that's how many counters we have.
*/
uint64_t num_counters_;

/**
* Total cycles. Be wary of what happens during CPU frequency scaling.
*/
Expand Down Expand Up @@ -89,11 +98,9 @@ class PerfMonitor {
};

/**
* @param count_children_tasks true if spawned threads should inherit perf counters. Calling counters on parent will
* accumulate all.
* @warning a true arg seems to result in garbage counters if any are separately created in children tasks.
* Create a perf monitor and open all of the necessary file descriptors.
*/
explicit PerfMonitor(const bool count_children_tasks) {
PerfMonitor() {
#if __APPLE__
// Apple doesn't support perf events and currently doesn't expose an equivalent kernel API
valid_ = false;
Expand All @@ -106,16 +113,25 @@ class PerfMonitor {
pe.disabled = 1;
pe.exclude_kernel = 1;
pe.exclude_hv = 1;
if (count_children_tasks) {
if constexpr (inherit) { // NOLINT
// Count your children's counters
pe.inherit = 1;
pe.inherit_stat = 1;
} else { // NOLINT
// Don't read children thread counters, can optimize to read this thread's counters in group fashion
pe.read_format = PERF_FORMAT_GROUP;
}
// pe.read_format = PERF_FORMAT_GROUP;

// Open file descriptors for each perf_event that we want. We reuse the first entry of the array as the group fd.
// Open file descriptors for each perf_event that we want.
for (uint8_t i = 0; i < NUM_HW_EVENTS; i++) {
pe.config = HW_EVENTS[i];
event_files_[i] = syscall(__NR_perf_event_open, &pe, 0, -1, -1, 0);
if constexpr (inherit) { // NOLINT
// Each counter is its own group (-1 group fd)
event_files_[i] = syscall(__NR_perf_event_open, &pe, 0, -1, -1, 0);
} else { // NOLINT
// We reuse the first entry of the array as the group fd.
event_files_[i] = syscall(__NR_perf_event_open, &pe, 0, -1, event_files_[0], 0);
}
valid_ = valid_ && event_files_[i] > 2; // 0, 1, 2 are reserved for stdin, stdout, stderr respectively
}
#endif
Expand All @@ -141,11 +157,20 @@ class PerfMonitor {
// do nothing
#else
if (valid_) {
// Iterate through all of the events' file descriptors resetting and starting them
for (const auto i : event_files_) {
auto result UNUSED_ATTRIBUTE = ioctl(i, PERF_EVENT_IOC_RESET);
if constexpr (inherit) { // NOLINT
// Iterate through all of the events' file descriptors resetting and starting them
for (const auto i : event_files_) {
auto result UNUSED_ATTRIBUTE = ioctl(i, PERF_EVENT_IOC_RESET);
TERRIER_ASSERT(result >= 0, "Failed to reset events.");
result = ioctl(i, PERF_EVENT_IOC_ENABLE);
TERRIER_ASSERT(result >= 0, "Failed to enable events.");
}
} else { // NOLINT
// Reset all of the counters out with a single syscall.
auto result UNUSED_ATTRIBUTE = ioctl(event_files_[0], PERF_EVENT_IOC_RESET, PERF_IOC_FLAG_GROUP);
TERRIER_ASSERT(result >= 0, "Failed to reset events.");
result = ioctl(i, PERF_EVENT_IOC_ENABLE);
// Start all of the counters out with a single syscall.
result = ioctl(event_files_[0], PERF_EVENT_IOC_ENABLE, PERF_IOC_FLAG_GROUP);
TERRIER_ASSERT(result >= 0, "Failed to enable events.");
}
running_ = true;
Expand All @@ -162,10 +187,15 @@ class PerfMonitor {
#else
if (valid_) {
TERRIER_ASSERT(running_, "StopEvents() called without StartEvents() first.");

// Iterate through all of the events' file descriptors stopping them
for (const auto i : event_files_) {
auto result UNUSED_ATTRIBUTE = ioctl(i, PERF_EVENT_IOC_DISABLE);
if constexpr (inherit) { // NOLINT
// Iterate through all of the events' file descriptors stopping them
for (const auto i : event_files_) {
auto result UNUSED_ATTRIBUTE = ioctl(i, PERF_EVENT_IOC_DISABLE);
TERRIER_ASSERT(result >= 0, "Failed to disable events.");
}
} else { // NOLINT
// Stop all of the counters out with a single syscall.
auto result UNUSED_ATTRIBUTE = ioctl(event_files_[0], PERF_EVENT_IOC_DISABLE, PERF_IOC_FLAG_GROUP);
TERRIER_ASSERT(result >= 0, "Failed to disable events.");
}
running_ = false;
Expand All @@ -175,30 +205,37 @@ class PerfMonitor {

/**
* Read out counters for the profiled period
* @return
* @return struct representing the counters
*/
PerfCounters Counters() const {
PerfCounters counters{}; // zero initialization
if (valid_) {
// Iterate through all of the events' file descriptors reading them
if constexpr (inherit) { // NOLINT
// Iterate through all of the events' file descriptors reading them

auto bytes_read UNUSED_ATTRIBUTE = read(event_files_[0], &counters.cpu_cycles_, sizeof(uint64_t));
TERRIER_ASSERT(bytes_read == sizeof(uint64_t), "Failed to read the counter.");
auto bytes_read UNUSED_ATTRIBUTE = read(event_files_[0], &counters.cpu_cycles_, sizeof(uint64_t));
TERRIER_ASSERT(bytes_read == sizeof(uint64_t), "Failed to read the counter.");

bytes_read = read(event_files_[1], &counters.instructions_, sizeof(uint64_t));
TERRIER_ASSERT(bytes_read == sizeof(uint64_t), "Failed to read the counter.");
bytes_read = read(event_files_[1], &counters.instructions_, sizeof(uint64_t));
TERRIER_ASSERT(bytes_read == sizeof(uint64_t), "Failed to read the counter.");

bytes_read = read(event_files_[2], &counters.cache_references_, sizeof(uint64_t));
TERRIER_ASSERT(bytes_read == sizeof(uint64_t), "Failed to read the counter.");
bytes_read = read(event_files_[2], &counters.cache_references_, sizeof(uint64_t));
TERRIER_ASSERT(bytes_read == sizeof(uint64_t), "Failed to read the counter.");

bytes_read = read(event_files_[3], &counters.cache_misses_, sizeof(uint64_t));
TERRIER_ASSERT(bytes_read == sizeof(uint64_t), "Failed to read the counter.");
bytes_read = read(event_files_[3], &counters.cache_misses_, sizeof(uint64_t));
TERRIER_ASSERT(bytes_read == sizeof(uint64_t), "Failed to read the counter.");

bytes_read = read(event_files_[4], &counters.bus_cycles_, sizeof(uint64_t));
TERRIER_ASSERT(bytes_read == sizeof(uint64_t), "Failed to read the counter.");
bytes_read = read(event_files_[4], &counters.bus_cycles_, sizeof(uint64_t));
TERRIER_ASSERT(bytes_read == sizeof(uint64_t), "Failed to read the counter.");

bytes_read = read(event_files_[5], &counters.ref_cpu_cycles_, sizeof(uint64_t));
TERRIER_ASSERT(bytes_read == sizeof(uint64_t), "Failed to read the counter.");
bytes_read = read(event_files_[5], &counters.ref_cpu_cycles_, sizeof(uint64_t));
TERRIER_ASSERT(bytes_read == sizeof(uint64_t), "Failed to read the counter.");
} else { // NOLINT
// Read all of the counters out with a single syscall.
auto bytes_read UNUSED_ATTRIBUTE = read(event_files_[0], &counters, sizeof(PerfCounters));
TERRIER_ASSERT(bytes_read == sizeof(PerfCounters), "Failed to read the counters.");
TERRIER_ASSERT(counters.num_counters_ == NUM_HW_EVENTS, "Failed to read the counters.");
}
}
return counters;
}
Expand All @@ -211,7 +248,7 @@ class PerfMonitor {
private:
// set the first file descriptor to -1. Since event_files[0] is always passed into group_fd on
// perf_event_open, this has the effect of making the first event the group leader. All subsequent syscalls can use
// that fd.
// that fd if we are not inheriting child counters.
std::array<int32_t, NUM_HW_EVENTS> event_files_{-1};
bool valid_ = true;

Expand Down
6 changes: 4 additions & 2 deletions src/include/common/resource_tracker.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@ namespace terrier::common {
* thread-local level, but in theory this can be extended to track the system global resource usage.
*/
class ResourceTracker {
static constexpr bool COUNT_CHILDREN_THREADS = false;

public:
/**
* Store the start time, the duration, the perf counters and the rusage counters for the tracked event
Expand All @@ -28,7 +30,7 @@ class ResourceTracker {
/** The elapsed time of the tracked event (microseconds) */
uint64_t elapsed_us_;
/** The perf counters of the tracked event */
PerfMonitor::PerfCounters counters_;
PerfMonitor<COUNT_CHILDREN_THREADS>::PerfCounters counters_;
/** The rusage counters of the tracked event */
rusage rusage_;
/** The number of the CPU on which the thread is currently executing */
Expand Down Expand Up @@ -100,7 +102,7 @@ class ResourceTracker {
*/
void SetMemory(const size_t memory_b) { metrics_.memory_b_ = memory_b; }

PerfMonitor perf_monitor_{false};
PerfMonitor<COUNT_CHILDREN_THREADS> perf_monitor_;
RusageMonitor rusage_monitor_{false};

// The struct to hold all the tracked resource metrics
Expand Down
106 changes: 45 additions & 61 deletions test/common/perf_monitor_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,92 +2,76 @@

#include <thread> //NOLINT

#include "common/macros.h"
#include "common/managed_pointer.h"
#include "common/scoped_timer.h"
#include "main/db_main.h"
#include "test_util/storage_test_util.h"
#include "test_util/test_harness.h"

namespace terrier {

class PerfMonitorTests : public TerrierTest {
public:
static void CreateAndDestroyCatalog(common::PerfMonitor::PerfCounters *const counters) {
common::PerfMonitor monitor(false);
/**
* These tests mostly exist to make sure we can compile and use the API of perf counters. It's difficult to make any
* assertions about the counters' values due to OS behavior (unsupported on macOS, unsupported on Linux without changing
* kernel flags or possibly running the tests as root). I keep the test around as an easy sanity check via break points
* to make sure perf counters give us the data we want on a given system.
*/
class PerfMonitorTests : public TerrierTest {};

template <bool inherit, typename perf_counters>
static void CreateAndDestroyCatalog(perf_counters *const counters) {
common::PerfMonitor<inherit> monitor;
monitor.Start();

auto db_main = terrier::DBMain::Builder().SetUseGC(true).SetUseCatalog(true).Build();
monitor.Start();
db_main.reset();
auto db_main = terrier::DBMain::Builder().SetUseGC(true).SetUseCatalog(true).Build();
db_main.reset();

monitor.Stop();
*counters = monitor.Counters();
}
monitor.Stop();
*counters = monitor.Counters();
}

template <bool inherit, typename perf_counters>
static void JustSleep(perf_counters *const counters) {
common::PerfMonitor<inherit> monitor;
monitor.Start();

static void JustSleep(common::PerfMonitor::PerfCounters *const counters) {
common::PerfMonitor monitor(false);
monitor.Start();
std::this_thread::sleep_for(std::chrono::seconds(2));
monitor.Stop();
*counters = monitor.Counters();
}
};
std::this_thread::sleep_for(std::chrono::seconds(2));

monitor.Stop();
*counters = monitor.Counters();
}

/**
* Simple test that spins off 2 threads. One thread builds and then destroys a Catalog. The other sleeps. The parent
* waits. We then do comparisons on their respective perf counters based on the work performed.
* waits. We can't make strong assertions about their values due to scheduling uncertainty.
*/
// NOLINTNEXTLINE
TEST_F(PerfMonitorTests, BasicTest) {
common::PerfMonitor parent_monitor(false);
common::PerfMonitor::PerfCounters parent_counters, catalog_counters, sleep_counters;
template <bool inherit, typename perf_counters>
static void UnbalancedChildrenThreads() {
common::PerfMonitor<inherit> parent_monitor;
perf_counters parent_counters, catalog_counters, sleep_counters;
parent_monitor.Start();
std::thread thread1(CreateAndDestroyCatalog, &catalog_counters);
std::thread thread2(JustSleep, &sleep_counters);
std::thread thread1(CreateAndDestroyCatalog<inherit, perf_counters>, &catalog_counters);
std::thread thread2(JustSleep<inherit, perf_counters>, &sleep_counters);
thread1.join();
thread2.join();
parent_monitor.Stop();
parent_counters = parent_monitor.Counters();
}

EXPECT_TRUE(catalog_counters.cpu_cycles_ >= parent_counters.cpu_cycles_);
EXPECT_TRUE(catalog_counters.instructions_ >= parent_counters.instructions_);
EXPECT_TRUE(catalog_counters.cache_references_ >= parent_counters.cache_references_);
EXPECT_TRUE(catalog_counters.bus_cycles_ >= parent_counters.bus_cycles_);
EXPECT_TRUE(catalog_counters.ref_cpu_cycles_ >= parent_counters.ref_cpu_cycles_);
EXPECT_TRUE(parent_counters.cpu_cycles_ >= sleep_counters.cpu_cycles_);
EXPECT_TRUE(parent_counters.instructions_ >= sleep_counters.instructions_);
EXPECT_TRUE(parent_counters.cache_references_ >= sleep_counters.cache_references_);
EXPECT_TRUE(parent_counters.bus_cycles_ >= sleep_counters.bus_cycles_);
EXPECT_TRUE(parent_counters.ref_cpu_cycles_ >= sleep_counters.ref_cpu_cycles_);
/**
* Test with inherit flag (count children) false
*/
// NOLINTNEXTLINE
TEST_F(PerfMonitorTests, BasicTest) {
constexpr bool inherit = false;
UnbalancedChildrenThreads<inherit, common::PerfMonitor<inherit>::PerfCounters>();
}

/**
* Simple test that spins off 2 threads. One thread builds and then destroys a Catalog. The other sleeps. The parent
* waits. We then do comparisons on their respective perf counters based on the work performed. This scenario has the
* parent accumulate for sub-tasks.
* Test with inherit flag (count children) true
*/
// NOLINTNEXTLINE
TEST_F(PerfMonitorTests, InheritTest) {
common::PerfMonitor parent_monitor(true);
common::PerfMonitor::PerfCounters parent_counters, catalog_counters, sleep_counters;
parent_monitor.Start();
std::thread thread1(CreateAndDestroyCatalog, &catalog_counters);
std::thread thread2(JustSleep, &sleep_counters);
thread1.join();
thread2.join();
parent_monitor.Stop();
parent_counters = parent_monitor.Counters();

EXPECT_TRUE(parent_counters.cpu_cycles_ >= catalog_counters.cpu_cycles_);
EXPECT_TRUE(parent_counters.instructions_ >= catalog_counters.instructions_);
EXPECT_TRUE(parent_counters.cache_references_ >= catalog_counters.cache_references_);
EXPECT_TRUE(parent_counters.bus_cycles_ >= catalog_counters.bus_cycles_);
EXPECT_TRUE(parent_counters.ref_cpu_cycles_ >= catalog_counters.ref_cpu_cycles_);
EXPECT_TRUE(parent_counters.cpu_cycles_ >= sleep_counters.cpu_cycles_);
EXPECT_TRUE(parent_counters.instructions_ >= sleep_counters.instructions_);
EXPECT_TRUE(parent_counters.cache_references_ >= sleep_counters.cache_references_);
EXPECT_TRUE(parent_counters.bus_cycles_ >= sleep_counters.bus_cycles_);
EXPECT_TRUE(parent_counters.ref_cpu_cycles_ >= sleep_counters.ref_cpu_cycles_);
constexpr bool inherit = true;
UnbalancedChildrenThreads<inherit, common::PerfMonitor<inherit>::PerfCounters>();
}

} // namespace terrier

0 comments on commit e105d4e

Please sign in to comment.