Skip to content

Commit

Permalink
Implement cuda events
Browse files Browse the repository at this point in the history
  • Loading branch information
mikex86 committed Sep 6, 2024
1 parent 26dc81a commit cb2606a
Show file tree
Hide file tree
Showing 21 changed files with 1,099 additions and 5 deletions.
8 changes: 8 additions & 0 deletions driverapi/include/librecuda.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,12 +11,14 @@ struct LibreCUcontext_;
struct LibreCUmodule_;
struct LibreCUFunction_;
struct LibreCUstream_;
struct LibreCUEvent_;

typedef LibreCUdevice_ *LibreCUdevice;
typedef LibreCUcontext_ *LibreCUcontext;
typedef LibreCUmodule_ *LibreCUmodule;
typedef LibreCUFunction_ *LibreCUFunction;
typedef LibreCUstream_ *LibreCUstream;
typedef LibreCUEvent_ *LibreCUEvent;

#define CU_CTX_SCHED_SPIN 0x01
#define CU_CTX_SCHED_YIELD 0x02
Expand Down Expand Up @@ -85,6 +87,12 @@ LIBRECUDA_EXPORT libreCudaStatus_t libreCuLaunchKernel(LibreCUFunction function,
void **extra,
bool async=false);

/**
 * Allocates a new event and stores its handle in *pEventOut.
 * Fails with LIBRECUDA_ERROR_INVALID_VALUE when pEventOut is null.
 * NOTE(review): `flags` is accepted but not read by the current implementation.
 */
LIBRECUDA_EXPORT libreCudaStatus_t libreCuEventCreate(LibreCUEvent *pEventOut, uint32_t flags);
/**
 * Records the event on the given stream by handing it to the stream's command queue.
 * Fails with LIBRECUDA_ERROR_INVALID_VALUE when event or stream is null.
 */
LIBRECUDA_EXPORT libreCudaStatus_t libreCuEventRecord(LibreCUEvent event, LibreCUstream stream);
/**
 * Waits on the CPU for the signal of a previously recorded event.
 * Fails with LIBRECUDA_ERROR_INVALID_VALUE when the event is null or was never recorded on a stream.
 */
LIBRECUDA_EXPORT libreCudaStatus_t libreCuEventSynchronize(LibreCUEvent event);
/**
 * Writes the time between two events, in milliseconds, to *pMillisecondsOut.
 * Both events must have been recorded on the same stream, otherwise LIBRECUDA_ERROR_INVALID_VALUE is returned.
 * May fail with LIBRECUDA_ERROR_NOT_READY when an event's signal has not been reached yet.
 */
LIBRECUDA_EXPORT libreCudaStatus_t libreCuEventElapsedTime(float *pMillisecondsOut, LibreCUEvent start, LibreCUEvent end);
/**
 * Frees the event object; the handle must not be used afterwards.
 * Fails with LIBRECUDA_ERROR_INVALID_VALUE when event is null.
 */
LIBRECUDA_EXPORT libreCudaStatus_t libreCuEventDestroy(LibreCUEvent event);

/**
* Submits the built up command buffer to the gpu.
* Operations performed on streams fall into two types: "compute" (eg. launch kernel) and "dma".
Expand Down
1 change: 1 addition & 0 deletions driverapi/include/librecuda_all_statuses.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ LIBRECUDA_DECLARE_STATUS(LIBRECUDA_ERROR_INVALID_DEVICE, 101)
LIBRECUDA_DECLARE_STATUS(LIBRECUDA_ERROR_INVALID_IMAGE, 200)
LIBRECUDA_DECLARE_STATUS(LIBRECUDA_ERROR_INVALID_CONTEXT, 201)
LIBRECUDA_DECLARE_STATUS(LIBRECUDA_ERROR_NOT_FOUND, 500)
LIBRECUDA_DECLARE_STATUS(LIBRECUDA_ERROR_NOT_READY, 600)
LIBRECUDA_DECLARE_STATUS(LIBRECUDA_ERROR_LAUNCH_OUT_OF_RESOURCES, 701)
LIBRECUDA_DECLARE_STATUS(LIBRECUDA_ERROR_COMPAT_NOT_SUPPORTED_ON_DEVICE, 804)
LIBRECUDA_DECLARE_STATUS(LIBRECUDA_ERROR_UNKNOWN, 999)
6 changes: 6 additions & 0 deletions driverapi/internal/cmdqueue.h
Original file line number Diff line number Diff line change
Expand Up @@ -190,6 +190,12 @@ class NvCommandQueue {

libreCudaStatus_t gpuMemcpy(void *dst, void *src, size_t numBytes, bool async);

// Associates the event with pStream and enqueues a signal notification (target value 1)
// on the queue type that is currently active; compute is used when no type is set yet.
libreCudaStatus_t recordEvent(LibreCUEvent event, LibreCUstream_ *pStream);

// Waits on the CPU until the event's signal reaches 1. The event must have been recorded before.
libreCudaStatus_t waitForEvent(LibreCUEvent event);

// Reads the time stamp stored in the event's signal into *pTimestampOut.
// Returns LIBRECUDA_ERROR_NOT_READY while the signal value is not 1.
libreCudaStatus_t getEventTimestamp(LibreCUEvent event, uint64_t *pTimestampOut);

private:

libreCudaStatus_t enqueue(NvMethod method, std::initializer_list<NvU32> arguments, QueueType type);
Expand Down
1 change: 0 additions & 1 deletion driverapi/internal/librecuda_internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,6 @@ struct LibreCUstream_ {
NvCommandQueue *command_queue;
};


#define LIBRECUDA_VALIDATE_UVM_IOCTL(ret, data_ptr) { \
int return_value = ret; \
int status = (data_ptr) != nullptr ? (data_ptr)->rmStatus : return_value; \
Expand Down
81 changes: 80 additions & 1 deletion driverapi/src/cmdqueue.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,23 @@ static inline NvU32 lo_32(NvU64 value) {
#define U64_HI_32_BITS(value) hi_32(reinterpret_cast<NvU64>(value))
#define U64_LO_32_BITS(value) lo_32(reinterpret_cast<NvU64>(value))

/**
 * Backing storage of a LibreCUEvent handle.
 * All members start out as null pointers; they are filled in when the event is recorded.
 */
struct LibreCUEvent_ {
    // stream the event was last recorded on; null until the first record
    LibreCUstream_ *stream{};
    // signal notified when the event is recorded on the compute queue; obtained on first use
    NvSignal *computeSignal{};
    // signal notified when the event is recorded on the dma queue; obtained on first use
    NvSignal *dmaSignal{};
};

// Heap-allocates a value-initialized event; ownership passes to the caller,
// who releases it with DeleteEvent().
LibreCUEvent_ *NewEvent() {
    auto *freshEvent = new LibreCUEvent_{};
    return freshEvent;
}

// Accessor for the stream stored in the event. pEvent is dereferenced
// without a null check, so callers must pass a valid event.
LibreCUstream_ *EventGetStream(LibreCUEvent_ *pEvent) {
    LibreCUstream_ *recordedStream = (*pEvent).stream;
    return recordedStream;
}

// Frees an event previously created with NewEvent().
// NOTE(review): the signals referenced by computeSignal/dmaSignal are not released here —
// confirm whether they have to be handed back to the command queue.
void DeleteEvent(LibreCUEvent_ *pEvent) {
delete pEvent;
}

static NvMethod makeNvMethod(int subcommand, int method, int size, int typ) {
return (typ << 28) | (size << 16) | (subcommand << 13) | (method >> 2);
Expand Down Expand Up @@ -546,7 +563,8 @@ NvCommandQueue::launchFunction(LibreCUFunction function,
// cuda encodes everything with these 32-bit words. The fact that this would be allowed is highly
// implausible given that even most c compilers pad struct lengths to multiples of 4 anyway,
// so cuda doing it any different would be highly implausible
LIBRECUDA_DEBUG("Encountered kernel with array parameter with size % 4 != 0! This should not be possible");
LIBRECUDA_DEBUG(
"Encountered kernel with array parameter with size % 4 != 0! This should not be possible");
LIBRECUDA_FAIL(LIBRECUDA_ERROR_INVALID_VALUE);
}
auto *param_ptr = reinterpret_cast<NvU32 *>(params[i]);
Expand Down Expand Up @@ -874,3 +892,64 @@ libreCudaStatus_t NvCommandQueue::signalWaitGpu(NvSignal *pSignal, NvU32 signalT
));
LIBRECUDA_SUCCEED();
}

/**
 * Records the event on this command queue.
 *
 * The event remembers pStream, and a notification with target value 1 is enqueued on the
 * event's signal for the active queue type. The signal is obtained lazily on first use.
 *
 * @param event   the event to record; must not be null
 * @param pStream the stream the event is recorded on
 * @return LIBRECUDA_ERROR_INVALID_VALUE for a null event, or any error raised while
 *         obtaining / notifying the signal
 */
libreCudaStatus_t NvCommandQueue::recordEvent(LibreCUEvent event, LibreCUstream_ *pStream) {
    LIBRECUDA_VALIDATE(event != nullptr, LIBRECUDA_ERROR_INVALID_VALUE);

    event->stream = pStream;

    if (currentQueueType == DMA) {
        if (event->dmaSignal == nullptr) {
            LIBRECUDA_ERR_PROPAGATE(obtainSignal(&event->dmaSignal));
        }
        LIBRECUDA_ERR_PROPAGATE(signalNotify(event->dmaSignal, 1, DMA));
    } else {
        // Either the queue is already in compute mode, or its type isn't determined yet.
        // In the latter case compute mode is forced; for an already-compute queue the
        // assignment below changes nothing.
        if (event->computeSignal == nullptr) {
            LIBRECUDA_ERR_PROPAGATE(obtainSignal(&event->computeSignal));
        }
        LIBRECUDA_ERR_PROPAGATE(signalNotify(event->computeSignal, 1, COMPUTE));
        currentQueueType = COMPUTE;
    }

    LIBRECUDA_SUCCEED();
}

/**
 * Waits on the CPU until the signal of a recorded event reaches 1.
 *
 * The compute signal is preferred when the event holds one, otherwise the dma signal is used.
 *
 * @param event the event to wait for; must have been recorded (stream and a signal set)
 * @return LIBRECUDA_ERROR_INVALID_VALUE for a null or never-recorded event, or the error
 *         reported by the CPU-side wait
 */
libreCudaStatus_t NvCommandQueue::waitForEvent(LibreCUEvent event) {
    LIBRECUDA_VALIDATE(event != nullptr, LIBRECUDA_ERROR_INVALID_VALUE);
    LIBRECUDA_VALIDATE(event->stream != nullptr, LIBRECUDA_ERROR_INVALID_VALUE);
    LIBRECUDA_VALIDATE(event->computeSignal != nullptr || event->dmaSignal != nullptr, LIBRECUDA_ERROR_INVALID_VALUE);

    // at least one of the two is non-null at this point (validated above)
    NvSignal *signal = event->computeSignal != nullptr ? event->computeSignal : event->dmaSignal;

    // Propagate the wait result instead of discarding it, so a failed wait is not reported
    // as success. NOTE(review): assumes signalWaitCpu returns libreCudaStatus_t like
    // signalWaitGpu does — confirm.
    LIBRECUDA_ERR_PROPAGATE(signalWaitCpu(signal, 1));

    LIBRECUDA_SUCCEED();
}

/**
 * Reads the time stamp stored in the signal of a recorded event.
 *
 * The signal matching the queue's current type is used when the event holds it; otherwise
 * the signal the event actually holds is used. This avoids dereferencing a null signal when
 * the queue type changed after the event was recorded, and guarantees that *pTimestampOut
 * is written whenever success is returned.
 *
 * @param event         the event to query; must have been recorded
 * @param pTimestampOut receives the signal's time stamp; must not be null
 * @return LIBRECUDA_ERROR_INVALID_VALUE for null / never-recorded arguments,
 *         LIBRECUDA_ERROR_NOT_READY when the signal value is not 1 yet
 */
libreCudaStatus_t NvCommandQueue::getEventTimestamp(LibreCUEvent event, uint64_t *pTimestampOut) {
    LIBRECUDA_VALIDATE(event != nullptr, LIBRECUDA_ERROR_INVALID_VALUE);
    LIBRECUDA_VALIDATE(pTimestampOut != nullptr, LIBRECUDA_ERROR_INVALID_VALUE);
    LIBRECUDA_VALIDATE(event->stream != nullptr, LIBRECUDA_ERROR_INVALID_VALUE);
    LIBRECUDA_VALIDATE(event->computeSignal != nullptr || event->dmaSignal != nullptr, LIBRECUDA_ERROR_INVALID_VALUE);

    // first choice: the signal that belongs to the currently active queue type
    NvSignal *signal = nullptr;
    if (currentQueueType == COMPUTE) {
        signal = event->computeSignal;
    } else if (currentQueueType == DMA) {
        signal = event->dmaSignal;
    }

    // fallback: whichever signal the event was actually recorded with
    if (signal == nullptr) {
        signal = event->computeSignal != nullptr ? event->computeSignal : event->dmaSignal;
    }

    LIBRECUDA_VALIDATE(signal->value == 1, LIBRECUDA_ERROR_NOT_READY); // event was not hit yet
    *pTimestampOut = signal->time_stamp;
    LIBRECUDA_SUCCEED();
}
46 changes: 45 additions & 1 deletion driverapi/src/librecuda.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1506,4 +1506,48 @@ libreCudaStatus_t libreCuFuncSetAttribute(LibreCUFunction function, LibreCuFunct
default: LIBRECUDA_FAIL(LIBRECUDA_ERROR_INVALID_VALUE);
}
LIBRECUDA_SUCCEED();
}
}

// Helpers defined in cmdqueue.cpp next to the LibreCUEvent_ struct definition;
// they are used here so this file can create, inspect and free events.
LibreCUEvent_ *NewEvent();
LibreCUstream_ *EventGetStream(LibreCUEvent pEvent);
void DeleteEvent(LibreCUEvent_ *pEvent);

/**
 * Creates a new event.
 *
 * @param pEventOut receives the handle of the new event; must not be null
 * @param flags     not read by this implementation
 * @return LIBRECUDA_ERROR_INVALID_VALUE when pEventOut is null
 */
libreCudaStatus_t libreCuEventCreate(LibreCUEvent *pEventOut, uint32_t flags) {
    LIBRECUDA_VALIDATE(pEventOut != nullptr, LIBRECUDA_ERROR_INVALID_VALUE);

    LibreCUEvent createdEvent = NewEvent();
    *pEventOut = createdEvent;

    LIBRECUDA_SUCCEED();
}

/**
 * Records an event on a stream by delegating to the stream's command queue.
 *
 * @param event  the event to record; must not be null
 * @param stream the stream to record on; must not be null
 * @return LIBRECUDA_ERROR_INVALID_VALUE for null arguments, or the error of the command queue
 */
libreCudaStatus_t libreCuEventRecord(LibreCUEvent event, LibreCUstream stream) {
    LIBRECUDA_VALIDATE(event != nullptr, LIBRECUDA_ERROR_INVALID_VALUE);
    LIBRECUDA_VALIDATE(stream != nullptr, LIBRECUDA_ERROR_INVALID_VALUE);

    NvCommandQueue *queue = stream->command_queue;
    LIBRECUDA_ERR_PROPAGATE(queue->recordEvent(event, stream));

    LIBRECUDA_SUCCEED();
}

/**
 * Waits for a recorded event using the command queue of the stream it was recorded on.
 *
 * @param event the event to wait for; must not be null and must have been recorded
 * @return LIBRECUDA_ERROR_INVALID_VALUE for a null or never-recorded event,
 *         or the error of the command queue
 */
libreCudaStatus_t libreCuEventSynchronize(LibreCUEvent event) {
    LIBRECUDA_VALIDATE(event != nullptr, LIBRECUDA_ERROR_INVALID_VALUE);

    // an event without a stream has never been recorded
    LibreCUstream_ *stream = EventGetStream(event);
    LIBRECUDA_VALIDATE(stream != nullptr, LIBRECUDA_ERROR_INVALID_VALUE);

    NvCommandQueue *queue = stream->command_queue;
    LIBRECUDA_ERR_PROPAGATE(queue->waitForEvent(event));

    LIBRECUDA_SUCCEED();
}

/**
 * Computes the time between two events in milliseconds.
 *
 * @param pMillisecondsOut receives end - start in milliseconds; must not be null.
 *                         The value is negative when `end` carries the earlier time stamp.
 * @param start            first event; must have been recorded
 * @param end              second event; must have been recorded on the same stream as `start`
 * @return LIBRECUDA_ERROR_INVALID_VALUE for null arguments, never-recorded events or events of
 *         different streams; otherwise any error of the time stamp query
 *         (e.g. LIBRECUDA_ERROR_NOT_READY)
 */
libreCudaStatus_t libreCuEventElapsedTime(float *pMillisecondsOut, LibreCUEvent start, LibreCUEvent end) {
    LIBRECUDA_VALIDATE(pMillisecondsOut != nullptr, LIBRECUDA_ERROR_INVALID_VALUE);
    LIBRECUDA_VALIDATE(start != nullptr, LIBRECUDA_ERROR_INVALID_VALUE);
    LibreCUstream_ *stream = EventGetStream(start);
    LIBRECUDA_VALIDATE(stream != nullptr, LIBRECUDA_ERROR_INVALID_VALUE);
    LIBRECUDA_VALIDATE(end != nullptr, LIBRECUDA_ERROR_INVALID_VALUE);
    LIBRECUDA_VALIDATE(EventGetStream(end) == stream, LIBRECUDA_ERROR_INVALID_VALUE);

    uint64_t startTimestamp{}, endTimestamp{};
    LIBRECUDA_ERR_PROPAGATE(stream->command_queue->getEventTimestamp(start, &startTimestamp));
    LIBRECUDA_ERR_PROPAGATE(stream->command_queue->getEventTimestamp(end, &endTimestamp));

    // The time stamps are unsigned: subtract the smaller from the larger and apply the sign
    // afterwards, so that end < start yields a negative duration instead of wrapping around.
    const double elapsedNs = endTimestamp >= startTimestamp
                             ? static_cast<double>(endTimestamp - startTimestamp)
                             : -static_cast<double>(startTimestamp - endTimestamp);

    *pMillisecondsOut = static_cast<float>(elapsedNs / 1e6); // ns to ms
    LIBRECUDA_SUCCEED();
}

/**
 * Destroys an event created with libreCuEventCreate.
 *
 * @param event the event to free; must not be null. The handle is invalid afterwards.
 * @return LIBRECUDA_ERROR_INVALID_VALUE when event is null
 */
libreCudaStatus_t libreCuEventDestroy(LibreCUEvent event) {
LIBRECUDA_VALIDATE(event != nullptr, LIBRECUDA_ERROR_INVALID_VALUE);
DeleteEvent(event);
LIBRECUDA_SUCCEED();
}
5 changes: 3 additions & 2 deletions tests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,8 @@ add_subdirectory(write_float)
add_subdirectory(memcopy)
add_subdirectory(dynamic_shared_mem)
add_subdirectory(compute_chronological_consistency)
add_subdirectory(test_async_kernels)
add_subdirectory(async_kernels)
add_subdirectory(dma_chronological_consistency)
add_subdirectory(kernel_struct_param)
add_subdirectory(indexing)
add_subdirectory(indexing)
add_subdirectory(stream_events)
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
11 changes: 11 additions & 0 deletions tests/stream_events/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# Test executable exercising the event API (record / synchronize / elapsed time).
add_executable(
    test_stream_events
    main.cpp
)
target_link_libraries(
    test_stream_events
    PRIVATE
    driverapi
)

# The test opens "write_float.cubin" relative to its working directory, so copy the cubin into
# this target's own binary directory under its original name. An explicit output file name is
# used because configure_file only keeps the input name when the destination is an existing
# directory.
configure_file(
    "${CMAKE_CURRENT_LIST_DIR}/write_float.cubin"
    "${CMAKE_CURRENT_BINARY_DIR}/write_float.cubin"
    COPYONLY
)
130 changes: 130 additions & 0 deletions tests/stream_events/main.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,130 @@
#include <librecuda.h>

#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <fstream>
#include <iostream>
#include <iterator>
#include <string>
#include <vector>

/**
 * Aborts the test when a driver API call failed.
 *
 * @param error status returned by the API call
 * @param file  source file of the call site (for the error message)
 * @param line  source line of the call site (for the error message)
 */
inline void cudaCheck(libreCudaStatus_t error, const char *file, int line) {
    if (error == LIBRECUDA_SUCCESS) {
        return;
    }
    // Start with a fallback so that a failed or empty lookup never hands an
    // indeterminate / null pointer to printf.
    const char *error_string = "unknown error";
    libreCuGetErrorString(error, &error_string);
    if (error_string == nullptr) {
        error_string = "unknown error";
    }
    printf("[CUDA ERROR] at file %s:%d: %s\n", file, line, error_string);
    exit(EXIT_FAILURE);
}
#define CUDA_CHECK(err) (cudaCheck(err, __FILE__, __LINE__))

int main() {
CUDA_CHECK(libreCuInit(0));

int device_count{};
CUDA_CHECK(libreCuDeviceGetCount(&device_count));
std::cout << "Device count: " + std::to_string(device_count) << std::endl;

LibreCUdevice device{};
CUDA_CHECK(libreCuDeviceGet(&device, 0));

LibreCUcontext ctx{};
CUDA_CHECK(libreCuCtxCreate_v2(&ctx, CU_CTX_SCHED_YIELD, device));

char name_buffer[256] = {};
libreCuDeviceGetName(name_buffer, 256, device);
std::cout << "Device Name: " + std::string(name_buffer) << std::endl;

LibreCUmodule module{};

// read cubin file
uint8_t *image;
size_t n_bytes;
{
std::ifstream input("write_float.cubin", std::ios::binary);
std::vector<uint8_t> bytes(
(std::istreambuf_iterator<char>(input)),
(std::istreambuf_iterator<char>()));
input.close();
image = new uint8_t[bytes.size()];
memcpy(image, bytes.data(), bytes.size());
n_bytes = bytes.size();
}
CUDA_CHECK(libreCuModuleLoadData(&module, image, n_bytes));

// read functions
uint32_t num_funcs{};
CUDA_CHECK(libreCuModuleGetFunctionCount(&num_funcs, module));
std::cout << "Num functions: " << num_funcs << std::endl;

auto *functions = new LibreCUFunction[num_funcs];
CUDA_CHECK(libreCuModuleEnumerateFunctions(functions, num_funcs, module));

for (size_t i = 0; i < num_funcs; i++) {
LibreCUFunction func = functions[i];
const char *func_name{};
CUDA_CHECK(libreCuFuncGetName(&func_name, func));
std::cout << " function \"" << func_name << "\"" << std::endl;
}

delete[] functions;

// find function
LibreCUFunction func{};
CUDA_CHECK(libreCuModuleGetFunction(&func, module, "write_float"));

// create stream
LibreCUstream stream{};
CUDA_CHECK(libreCuStreamCreate(&stream, 0));

void *float_dst_compute_va{};
void *float_dst_dma_va{};
CUDA_CHECK(libreCuMemAlloc(&float_dst_compute_va, sizeof(float), true));
CUDA_CHECK(libreCuMemAlloc(&float_dst_dma_va, sizeof(float), true));
*(float *) float_dst_compute_va = 0.0f;
*(float *) float_dst_dma_va = 0.0f;

LibreCUEvent start{}, end{};
CUDA_CHECK(libreCuEventCreate(&start, 0));
CUDA_CHECK(libreCuEventCreate(&end, 0));

CUDA_CHECK(libreCuEventRecord(start, stream));
{
void *params[] = {
&float_dst_compute_va, &float_dst_dma_va
};
CUDA_CHECK(
libreCuLaunchKernel(func,
1, 1, 1,
1, 1, 1,
0,
stream,
params, sizeof(params) / sizeof(void *),
nullptr
)
);
}
CUDA_CHECK(libreCuEventRecord(end, stream));
CUDA_CHECK(libreCuStreamCommence(stream));
CUDA_CHECK(libreCuEventSynchronize(end));

float elapsed{};
CUDA_CHECK(libreCuEventElapsedTime(&elapsed, start, end));
std::cout << "Elapsed: " << elapsed << "ms" << std::endl;

CUDA_CHECK(libreCuStreamAwait(stream));

std::cout << "Dst compute value (post exec): " << *(float *) (float_dst_compute_va) << std::endl;
std::cout << "Dst dma value (post exec): " << *(float *) (float_dst_dma_va) << std::endl;

// free memory
CUDA_CHECK(libreCuMemFree(float_dst_compute_va));
CUDA_CHECK(libreCuMemFree(float_dst_dma_va));

// destroy stream
CUDA_CHECK(libreCuStreamDestroy(stream));

// unload module
CUDA_CHECK(libreCuModuleUnload(module));

// destroy ctx
CUDA_CHECK(libreCuCtxDestroy(ctx));
return 0;
}
Loading

0 comments on commit cb2606a

Please sign in to comment.