Skip to content

Commit

Permalink
Adding iree_hal_device_profiling_begin/end API.
Browse files Browse the repository at this point in the history
This extends the device interface to expose the common stateful/global
capture behavior used by GPU tooling. Most of these tools have some
pretty tricky requirements (some must be initialized before devices are
created and some after, some only ever allow one device creation per
process, etc.), so the intent is that these APIs are only enabled and
used in very specific debugging scenarios rather than in a user-facing flow.
  • Loading branch information
benvanik committed Oct 24, 2022
1 parent 7b823f1 commit ec8dc02
Show file tree
Hide file tree
Showing 7 changed files with 185 additions and 0 deletions.
15 changes: 15 additions & 0 deletions experimental/rocm/rocm_device.c
Original file line number Diff line number Diff line change
Expand Up @@ -302,6 +302,19 @@ static iree_status_t iree_hal_rocm_device_wait_semaphores(
"semaphore not implemented");
}

// Begins a profiling capture on |device| using |options|.
// The ROCm backend has no profiling integration, so this is deliberately a
// no-op that returns iree_ok_status(); neither argument is inspected.
static iree_status_t iree_hal_rocm_device_profiling_begin(
    iree_hal_device_t* device,
    const iree_hal_device_profiling_options_t* options) {
  (void)device;
  (void)options;
  return iree_ok_status();
}

// Ends a profiling capture on |device|.
// Deliberately a no-op that returns iree_ok_status(): the ROCm backend does
// not implement profiling.
static iree_status_t iree_hal_rocm_device_profiling_end(
    iree_hal_device_t* device) {
  (void)device;
  return iree_ok_status();
}

static const iree_hal_device_vtable_t iree_hal_rocm_device_vtable = {
.destroy = iree_hal_rocm_device_destroy,
.id = iree_hal_rocm_device_id,
Expand All @@ -324,4 +337,6 @@ static const iree_hal_device_vtable_t iree_hal_rocm_device_vtable = {
.queue_execute = iree_hal_rocm_device_queue_execute,
.queue_flush = iree_hal_rocm_device_queue_flush,
.wait_semaphores = iree_hal_rocm_device_wait_semaphores,
.profiling_begin = iree_hal_rocm_device_profiling_begin,
.profiling_end = iree_hal_rocm_device_profiling_end,
};
21 changes: 21 additions & 0 deletions runtime/src/iree/hal/device.c
Original file line number Diff line number Diff line change
Expand Up @@ -276,3 +276,24 @@ IREE_API_EXPORT iree_status_t iree_hal_device_wait_semaphores(
IREE_TRACE_ZONE_END(z0);
return status;
}

// Public entry point: forwards the request to the profiling_begin entry of the
// device's vtable, wrapped in a trace zone. Both |device| and |options| are
// checked with IREE_ASSERT_ARGUMENT before dispatching.
IREE_API_EXPORT iree_status_t iree_hal_device_profiling_begin(
    iree_hal_device_t* device,
    const iree_hal_device_profiling_options_t* options) {
  IREE_ASSERT_ARGUMENT(device);
  IREE_ASSERT_ARGUMENT(options);
  IREE_TRACE_ZONE_BEGIN(z0);
  iree_status_t begin_status =
      _VTABLE_DISPATCH(device, profiling_begin)(device, options);
  // The zone is closed before returning, whatever status the backend produced.
  IREE_TRACE_ZONE_END(z0);
  return begin_status;
}

// Public entry point: forwards the request to the profiling_end entry of the
// device's vtable, wrapped in a trace zone. |device| is checked with
// IREE_ASSERT_ARGUMENT before dispatching.
IREE_API_EXPORT iree_status_t
iree_hal_device_profiling_end(iree_hal_device_t* device) {
  IREE_ASSERT_ARGUMENT(device);
  IREE_TRACE_ZONE_BEGIN(z0);
  iree_status_t end_status = _VTABLE_DISPATCH(device, profiling_end)(device);
  // The zone is closed before returning, whatever status the backend produced.
  IREE_TRACE_ZONE_END(z0);
  return end_status;
}
71 changes: 71 additions & 0 deletions runtime/src/iree/hal/device.h
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,39 @@ typedef struct iree_hal_device_info_t {
iree_string_view_t name;
} iree_hal_device_info_t;

// Selects which kinds of information a profiling capture records.
// Support varies by implementation; not every mode is available everywhere.
enum iree_hal_device_profiling_mode_bits_t {
  // No profiling information is requested.
  IREE_HAL_DEVICE_PROFILING_MODE_NONE = 0u,

  // Records queue operations: command buffer submissions and the
  // transfer/dispatch commands they contain. Provides a high-level view of HAL
  // API usage at minimal overhead.
  IREE_HAL_DEVICE_PROFILING_MODE_QUEUE_OPERATIONS = (1u << 0),

  // Records dispatch performance counters aggregated across every command in
  // the profiled range.
  IREE_HAL_DEVICE_PROFILING_MODE_DISPATCH_COUNTERS = (1u << 1),

  // Records detailed executable performance counters correlated with source
  // locations. This may have a significant performance impact and is intended
  // only for investigating the performance of an individual dispatch.
  IREE_HAL_DEVICE_PROFILING_MODE_EXECUTABLE_COUNTERS = (1u << 2),
};
// Storage type for iree_hal_device_profiling_mode_bits_t values.
typedef uint32_t iree_hal_device_profiling_mode_t;

// Controls profiling options.
// Passed by const pointer to iree_hal_device_profiling_begin.
typedef struct iree_hal_device_profiling_options_t {
// Defines what kind of profiling information is captured.
// Holds iree_hal_device_profiling_mode_bits_t values. The non-zero values are
// distinct bits, so presumably several may be ORed together - confirm against
// the device implementation in use.
iree_hal_device_profiling_mode_t mode;

// A file system path where profile data will be written if supported by the
// profiling implementation. Depending on the tool this may be a template
// path/prefix for a unique per-capture name or a full path that will be
// overwritten on each capture.
// NOTE(review): the ownership and required lifetime of this string are not
// specified here - confirm whether it must remain valid until profiling ends.
const char* file_path;
} iree_hal_device_profiling_options_t;

// A transfer source or destination.
typedef struct iree_hal_transfer_buffer_t {
// A host-allocated void* buffer.
Expand Down Expand Up @@ -381,6 +414,39 @@ IREE_API_EXPORT iree_status_t iree_hal_device_wait_semaphores(
iree_hal_device_t* device, iree_hal_wait_mode_t wait_mode,
const iree_hal_semaphore_list_t semaphore_list, iree_timeout_t timeout);

// Begins a profile capture on |device| with the given |options|.
// This will use an implementation-defined profiling API to capture all
// supported device operations until iree_hal_device_profiling_end is called.
// If the device or the current build configuration does not support profiling
// this method is a no-op. See implementation-specific device creation APIs and
// driver module registration for more information.
//
// Both |device| and |options| are required; the implementation checks them
// with IREE_ASSERT_ARGUMENT.
//
// WARNING: the device must be idle before calling this method. Behavior is
// undefined if there are any in-flight or pending queue operations, or any
// access from another thread, while profiling is starting/stopping.
//
// WARNING: profiling in any mode can dramatically increase overhead. Some
// modes are significantly more expensive in both host and device time, enough
// to invalidate performance numbers from other mechanisms (perf/tracy/etc).
// When measuring end-to-end performance use only
// IREE_HAL_DEVICE_PROFILING_MODE_QUEUE_OPERATIONS.
//
// Examples of APIs this maps to (where supported):
// - CPU: perf_event_open/close or vendor APIs
// - CUDA: cuProfilerStart/cuProfilerStop
// - Direct3D: PIXBeginCapture/PIXEndCapture
// - Metal: [MTLCaptureManager startCapture/stopCapture]
// - Vulkan: vkAcquireProfilingLockKHR/vkReleaseProfilingLockKHR +
// RenderDoc StartFrameCapture/EndFrameCapture
IREE_API_EXPORT iree_status_t iree_hal_device_profiling_begin(
iree_hal_device_t* device,
const iree_hal_device_profiling_options_t* options);

// Ends a profile capture previously started with
// iree_hal_device_profiling_begin.
// The device must be idle before calling this method.
IREE_API_EXPORT iree_status_t
iree_hal_device_profiling_end(iree_hal_device_t* device);

//===----------------------------------------------------------------------===//
// iree_hal_device_t implementation details
//===----------------------------------------------------------------------===//
Expand Down Expand Up @@ -468,6 +534,11 @@ typedef struct iree_hal_device_vtable_t {
iree_status_t(IREE_API_PTR* wait_semaphores)(
iree_hal_device_t* device, iree_hal_wait_mode_t wait_mode,
const iree_hal_semaphore_list_t semaphore_list, iree_timeout_t timeout);

iree_status_t(IREE_API_PTR* profiling_begin)(
iree_hal_device_t* device,
const iree_hal_device_profiling_options_t* options);
iree_status_t(IREE_API_PTR* profiling_end)(iree_hal_device_t* device);
} iree_hal_device_vtable_t;
IREE_HAL_ASSERT_VTABLE_LAYOUT(iree_hal_device_vtable_t);

Expand Down
16 changes: 16 additions & 0 deletions runtime/src/iree/hal/drivers/cuda/cuda_device.c
Original file line number Diff line number Diff line change
Expand Up @@ -396,6 +396,20 @@ static iree_status_t iree_hal_cuda_device_wait_semaphores(
"semaphore not implemented");
}

// Begins a profiling capture on |device| using |options|.
// Not implemented for CUDA (which is acceptable): this is a no-op that returns
// iree_ok_status(). A future implementation could hook into CUPTI here or use
// the much simpler cuProfilerStart API.
static iree_status_t iree_hal_cuda_device_profiling_begin(
    iree_hal_device_t* device,
    const iree_hal_device_profiling_options_t* options) {
  (void)device;
  (void)options;
  return iree_ok_status();
}

// Ends a profiling capture on |device|.
// Not implemented for CUDA (which is acceptable): this is a no-op that returns
// iree_ok_status().
static iree_status_t iree_hal_cuda_device_profiling_end(
    iree_hal_device_t* device) {
  (void)device;
  return iree_ok_status();
}

static const iree_hal_device_vtable_t iree_hal_cuda_device_vtable = {
.destroy = iree_hal_cuda_device_destroy,
.id = iree_hal_cuda_device_id,
Expand All @@ -418,4 +432,6 @@ static const iree_hal_device_vtable_t iree_hal_cuda_device_vtable = {
.queue_execute = iree_hal_cuda_device_queue_execute,
.queue_flush = iree_hal_cuda_device_queue_flush,
.wait_semaphores = iree_hal_cuda_device_wait_semaphores,
.profiling_begin = iree_hal_cuda_device_profiling_begin,
.profiling_end = iree_hal_cuda_device_profiling_end,
};
22 changes: 22 additions & 0 deletions runtime/src/iree/hal/drivers/local_sync/sync_device.c
Original file line number Diff line number Diff line change
Expand Up @@ -373,6 +373,26 @@ static iree_status_t iree_hal_sync_device_wait_semaphores(
semaphore_list, timeout);
}

// Begins a profiling capture on |device| using |options|.
// Not implemented for the local sync device (which is acceptable): this is a
// no-op that returns iree_ok_status().
//
// A future implementation could hook into vendor APIs (Intel/ARM/etc) or the
// generic perf infrastructure:
//   https://man7.org/linux/man-pages/man2/perf_event_open.2.html
// and capture counters such as:
//   PERF_COUNT_HW_CPU_CYCLES / PERF_COUNT_HW_INSTRUCTIONS
//   PERF_COUNT_HW_CACHE_REFERENCES / PERF_COUNT_HW_CACHE_MISSES
//   etc
// TODO(benvanik): shared iree/hal/local/profiling implementation of this.
static iree_status_t iree_hal_sync_device_profiling_begin(
    iree_hal_device_t* device,
    const iree_hal_device_profiling_options_t* options) {
  (void)device;
  (void)options;
  return iree_ok_status();
}

// Ends a profiling capture on |device|.
// Not implemented for the local sync device (which is acceptable): this is a
// no-op that returns iree_ok_status().
static iree_status_t iree_hal_sync_device_profiling_end(
    iree_hal_device_t* device) {
  (void)device;
  return iree_ok_status();
}

static const iree_hal_device_vtable_t iree_hal_sync_device_vtable = {
.destroy = iree_hal_sync_device_destroy,
.id = iree_hal_sync_device_id,
Expand All @@ -395,4 +415,6 @@ static const iree_hal_device_vtable_t iree_hal_sync_device_vtable = {
.queue_execute = iree_hal_sync_device_queue_execute,
.queue_flush = iree_hal_sync_device_queue_flush,
.wait_semaphores = iree_hal_sync_device_wait_semaphores,
.profiling_begin = iree_hal_sync_device_profiling_begin,
.profiling_end = iree_hal_sync_device_profiling_end,
};
22 changes: 22 additions & 0 deletions runtime/src/iree/hal/drivers/local_task/task_device.c
Original file line number Diff line number Diff line change
Expand Up @@ -392,6 +392,26 @@ static iree_status_t iree_hal_task_device_wait_semaphores(
&device->large_block_pool);
}

// Begins a profiling capture on |device| using |options|.
// Not implemented for the local task device (which is acceptable): this is a
// no-op that returns iree_ok_status().
//
// A future implementation could hook into vendor APIs (Intel/ARM/etc) or the
// generic perf infrastructure:
//   https://man7.org/linux/man-pages/man2/perf_event_open.2.html
// and capture counters such as:
//   PERF_COUNT_HW_CPU_CYCLES / PERF_COUNT_HW_INSTRUCTIONS
//   PERF_COUNT_HW_CACHE_REFERENCES / PERF_COUNT_HW_CACHE_MISSES
//   etc
// TODO(benvanik): shared iree/hal/local/profiling implementation of this.
static iree_status_t iree_hal_task_device_profiling_begin(
    iree_hal_device_t* device,
    const iree_hal_device_profiling_options_t* options) {
  (void)device;
  (void)options;
  return iree_ok_status();
}

// Ends a profiling capture on |device|.
// Not implemented for the local task device (which is acceptable): this is a
// no-op that returns iree_ok_status().
static iree_status_t iree_hal_task_device_profiling_end(
    iree_hal_device_t* device) {
  (void)device;
  return iree_ok_status();
}

static const iree_hal_device_vtable_t iree_hal_task_device_vtable = {
.destroy = iree_hal_task_device_destroy,
.id = iree_hal_task_device_id,
Expand All @@ -414,4 +434,6 @@ static const iree_hal_device_vtable_t iree_hal_task_device_vtable = {
.queue_execute = iree_hal_task_device_queue_execute,
.queue_flush = iree_hal_task_device_queue_flush,
.wait_semaphores = iree_hal_task_device_wait_semaphores,
.profiling_begin = iree_hal_task_device_profiling_begin,
.profiling_end = iree_hal_task_device_profiling_end,
};
18 changes: 18 additions & 0 deletions runtime/src/iree/hal/drivers/vulkan/vulkan_device.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1128,6 +1128,22 @@ static iree_status_t iree_hal_vulkan_device_wait_semaphores(
device->logical_device, &semaphore_list, timeout, wait_flags);
}

// Begins a profiling capture on |device| using |options|.
// Not implemented for Vulkan (which is acceptable): this is a no-op that
// returns iree_ok_status().
//
// If counters are requested a future implementation would use
// https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VK_KHR_performance_query.html
// to acquire a lock. For most other cases something like the RenderDoc API
// could be used to directly tell an attached tool that a capture is wanted.
static iree_status_t iree_hal_vulkan_device_profiling_begin(
    iree_hal_device_t* device,
    const iree_hal_device_profiling_options_t* options) {
  (void)device;
  (void)options;
  return iree_ok_status();
}

// Ends a profiling capture on |device|.
// Not implemented for Vulkan (which is acceptable): this is a no-op that
// returns iree_ok_status().
static iree_status_t iree_hal_vulkan_device_profiling_end(
    iree_hal_device_t* device) {
  (void)device;
  return iree_ok_status();
}

namespace {
const iree_hal_device_vtable_t iree_hal_vulkan_device_vtable = {
/*.destroy=*/iree_hal_vulkan_device_destroy,
Expand All @@ -1153,5 +1169,7 @@ const iree_hal_device_vtable_t iree_hal_vulkan_device_vtable = {
/*.queue_execute=*/iree_hal_vulkan_device_queue_execute,
/*.queue_flush=*/iree_hal_vulkan_device_queue_flush,
/*.wait_semaphores=*/iree_hal_vulkan_device_wait_semaphores,
/*.profiling_begin=*/iree_hal_vulkan_device_profiling_begin,
/*.profiling_end=*/iree_hal_vulkan_device_profiling_end,
};
} // namespace

0 comments on commit ec8dc02

Please sign in to comment.