diff --git a/.gitignore b/.gitignore index 1a8b5bc..4dc9cf7 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,7 @@ build/* # any build subdirectory in the tree **/build/ +**/build_web/ examples/hello_gpu/build/* examples/raymarch/build/* docs/html @@ -8,6 +9,7 @@ source .DS_Store third_party/lib/* third_party/local/* +third_party/dawn/* # formatter files .cmake-format.py @@ -20,3 +22,6 @@ build .cache compile_commands.json +# editor specific +.vscode/* + diff --git a/CMakeLists.txt b/CMakeLists.txt index db89df7..85911a7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,23 +1,13 @@ +# This only builds a shared lib, see cmake/example.cmake +# and cmake/gpu.cmake for more details cmake_minimum_required(VERSION 3.28) project(gpu) - -include("${CMAKE_CURRENT_SOURCE_DIR}/cmake/webgpu.cmake") - +set(PROJECT_ROOT "${CMAKE_CURRENT_SOURCE_DIR}") set(CMAKE_EXPORT_COMPILE_COMMANDS ON) # export compile_commands.json to use with # LSP -set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_STANDARD 20) set(CMAKE_CXX_STANDARD_REQUIRED ON) -option(USE_LOCAL_LIBS - "Use local libraries instead of fetching from the internet" OFF) - -# Ensure the build type is set -if(NOT CMAKE_BUILD_TYPE) - set(CMAKE_BUILD_TYPE - Release - CACHE STRING "Choose the type of build: Debug or Release" FORCE) -endif() - option(FASTBUILD "Option to enable fast builds" OFF) if(FASTBUILD) set(CMAKE_BUILD_TYPE None) # Avoid default flags of predefined build types @@ -30,21 +20,27 @@ if(DEBUG) set(CMAKE_CXX_FLAGS "-O0 -g") endif() -if(WIN64) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DWEBGPU_BACKEND_DAWN") -endif() - +include("${CMAKE_CURRENT_SOURCE_DIR}/cmake/dawn.cmake") include("${CMAKE_CURRENT_SOURCE_DIR}/cmake/gpu.cmake") -message(STATUS "CMAKE_CURRENT_SOURCE_DIR: ${CMAKE_CURRENT_SOURCE_DIR}") -message( - STATUS - "Include directories for wgpu: ${CMAKE_CURRENT_SOURCE_DIR}/third_party/headers" -) +target_link_libraries(gpu PRIVATE webgpu_dawn) + +include_directories(${CMAKE_CURRENT_SOURCE_DIR}/test) + 
+add_executable(test_gpu ${CMAKE_CURRENT_SOURCE_DIR}/test/test_gpu.cpp) +target_link_libraries(test_gpu PRIVATE gpu) + +# Platform-specific post-build actions (e.g. copying DLLs for MSVC) +if(MSVC) + add_custom_command( + TARGET test_gpu POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy + ${DAWN_BUILD_DIR}/$<CONFIG>/webgpu_dawn.dll + $<TARGET_FILE_DIR:test_gpu> + COMMENT "Copying webgpu_dawn.dll to the build directory" + ) +endif() add_library(gpud SHARED gpu.hpp) set_target_properties(gpud PROPERTIES LINKER_LANGUAGE CXX) -target_link_libraries(gpud PRIVATE wgpu) -target_link_libraries(gpud PRIVATE webgpu) target_link_libraries(gpud PRIVATE gpu) -install(TARGETS gpud) diff --git a/cmake/dawn.cmake b/cmake/dawn.cmake new file mode 100644 index 0000000..c6fed94 --- /dev/null +++ b/cmake/dawn.cmake @@ -0,0 +1,124 @@ +# Setup directories +set(FETCHCONTENT_BASE_DIR "${PROJECT_ROOT}/third_party") +set(DAWN_DIR "${FETCHCONTENT_BASE_DIR}/dawn" CACHE INTERNAL "") +set(DAWN_BUILD_DIR "${DAWN_DIR}/build" CACHE INTERNAL "") + +if(EMSCRIPTEN) + set(EM_SDK_DIR $ENV{EMSDK} CACHE INTERNAL "") + set(DAWN_BUILD_DIR "${DAWN_DIR}/build_web" CACHE INTERNAL "") + set(DAWN_EMSCRIPTEN_TOOLCHAIN ${EM_SDK_DIR}/upstream/emscripten CACHE INTERNAL "" FORCE) +else() + add_compile_definitions(USE_DAWN_API) +endif() + +# Optionally locate a prebuilt Dawn so it is not rebuilt on every configure (e.g. with flutter run) +set(ENABLE_DAWN_FIND OFF CACHE BOOL "Enable finding Dawn" FORCE) +set(DAWN_BUILD_FOUND OFF CACHE BOOL "Dawn build found" FORCE) +if(ENABLE_DAWN_FIND) + # find_library, windows adds extra folder + if(MSVC) + find_library(WEBGPU_DAWN_DEBUG webgpu_dawn + NAMES webgpu_dawn + HINTS "${DAWN_BUILD_DIR}/src/dawn/native/Debug" + ) + find_library(WEBGPU_DAWN_RELEASE webgpu_dawn + NAMES webgpu_dawn + HINTS "${DAWN_BUILD_DIR}/src/dawn/native/Release" + ) + set(DAWN_BUILD_FOUND ON) + elseif(NOT EMSCRIPTEN AND NOT MSVC) + find_library(WEBGPU_DAWN_LIB + NAMES webgpu_dawn + PATHS "${DAWN_BUILD_DIR}/src/dawn/native" + REQUIRED + ) + set(DAWN_BUILD_FOUND ON) + else() + 
set(DAWN_BUILD_FOUND ON) + endif() +endif() + +# Dawn options for more, +# see https://dawn.googlesource.com/dawn/+/refs/heads/main/CMakeLists.txt +set(DAWN_ALWAYS_ASSERT OFF CACHE INTERNAL "Always assert in Dawn" FORCE) +set(DAWN_BUILD_MONOLITHIC_LIBRARY ON CACHE INTERNAL "Build Dawn monolithically" FORCE) +set(DAWN_BUILD_EXAMPLES OFF CACHE INTERNAL "Build Dawn examples" FORCE) +set(DAWN_BUILD_SAMPLES OFF CACHE INTERNAL "Build Dawn samples" FORCE) +set(DAWN_BUILD_TESTS OFF CACHE INTERNAL "Build Dawn tests" FORCE) +set(DAWN_ENABLE_INSTALL OFF CACHE INTERNAL "Enable Dawn installation" FORCE) +set(DAWN_FETCH_DEPENDENCIES ON CACHE INTERNAL "Fetch Dawn dependencies" FORCE) +set(TINT_BUILD_TESTS OFF CACHE INTERNAL "Build Tint Tests" FORCE) +set(TINT_BUILD_IR_BINARY OFF CACHE INTERNAL "Build Tint IR binary" FORCE) +set(TINT_BUILD_CMD_TOOLS OFF CACHE INTERNAL "Build Tint command line tools" FORCE) + +if(NOT DAWN_BUILD_FOUND) + include(FetchContent) + message("webgpu_dawn not found start building") + if(EMSCRIPTEN) + set(EMSCRIPTEN_DIR "${EM_SDK_DIR}/upstream/emscripten" CACHE INTERNAL "" FORCE) + set(DAWN_EMSCRIPTEN_TOOLCHAIN ${EMSCRIPTEN_DIR} CACHE INTERNAL "" FORCE) + endif() + + FetchContent_Declare( + dawn + DOWNLOAD_DIR ${DAWN_DIR} + SOURCE_DIR ${DAWN_DIR} + SUBBUILD_DIR ${DAWN_BUILD_DIR}/tmp + BINARY_DIR ${DAWN_BUILD_DIR} + DOWNLOAD_COMMAND + cd ${DAWN_DIR} && + git init && + git fetch --depth=1 https://dawn.googlesource.com/dawn && + git reset --hard FETCH_HEAD + ) + + # Download the repository and add it as a subdirectory. 
+ FetchContent_MakeAvailable(dawn) + + # attempt fix flutter rebuilds + set(CMAKE_INCLUDE_PATH "${CMAKE_INCLUDE_PATH};${DAWN_DIR}/src" CACHE INTERNAL "") + + execute_process( + WORKING_DIRECTORY ${DAWN_DIR} + COMMAND ${CMAKE_COMMAND} -S ${DAWN_DIR} + -B ${DAWN_BUILD_DIR} + ) + + # Build Dawn + execute_process( + COMMAND ${CMAKE_COMMAND} --build ${DAWN_BUILD_DIR} + ) + + # find_library, windows adds extra folder + if(MSVC) + find_library(WEBGPU_DAWN_DEBUG webgpu_dawn + NAMES webgpu_dawn + HINTS "${DAWN_BUILD_DIR}/src/dawn/native/Debug" + ) + find_library(WEBGPU_DAWN_RELEASE webgpu_dawn + NAMES webgpu_dawn + HINTS "${DAWN_BUILD_DIR}/src/dawn/native/Release" + ) + set(DAWN_BUILD_FOUND ON) + elseif(NOT EMSCRIPTEN AND NOT MSVC) + find_library(WEBGPU_DAWN_LIB + NAMES webgpu_dawn + PATHS "${DAWN_BUILD_DIR}/src/dawn/native" + REQUIRED + ) + set(DAWN_BUILD_FOUND ON) + else() + set(DAWN_BUILD_FOUND ON) + endif() +endif() + +if(EMSCRIPTEN) + add_library(webgpu_dawn INTERFACE IMPORTED) + target_include_directories(webgpu_dawn INTERFACE ${DAWN_BUILD_DIR}/gen/src/emdawnwebgpu/include) + target_include_directories(webgpu_dawn INTERFACE ${DAWN_BUILD_DIR}/gen/src/emdawnwebgpu/include/webgpu/webgpu.h) + target_link_libraries(webgpu_dawn INTERFACE ${DAWN_BUILD_DIR}/gen/src/emdawnwebgpu/library_webgpu_enum_tables.js) + target_link_libraries(webgpu_dawn INTERFACE ${DAWN_BUILD_DIR}/gen/src/emdawnwebgpu/library_webgpu_generated_struct_info.js) + target_link_libraries(webgpu_dawn INTERFACE ${DAWN_BUILD_DIR}/gen/src/emdawnwebgpu/library_webgpu_generated_sig_info.js) + target_link_libraries(webgpu_dawn INTERFACE ${DAWN_DIR}/third_party/emdawnwebgpu/library_webgpu.js) +else() +endif() diff --git a/cmake/example.cmake b/cmake/example.cmake index eba8e7c..cf697b5 100644 --- a/cmake/example.cmake +++ b/cmake/example.cmake @@ -1,68 +1,98 @@ -set(CMAKE_EXPORT_COMPILE_COMMANDS ON) # export compile_commands.json to use with - # LSP -set(CMAKE_CXX_STANDARD 17) -set(CMAKE_CXX_STANDARD_REQUIRED ON) +# 
Getting Started with CMAKE +# Each example includes this and sets PROJECT_NAME. +# +# Example usage: +# cd examples/hello_world +# cmake -S . -B build/ -DCMAKE_BUILD_TYPE=Release +# cmake --build build/ --config Release +# ./build/hello_world (or serve the output .js/.wasm for Emscripten) +# or for emscripten +# emcmake cmake -S . -B ./build_web -DCMAKE_BUILD_TYPE=Release +# cmake --build build_web --config Release +# python3 -m http.server 8080 -d build_web +if(NOT MSVC) + set(CMAKE_CXX_STANDARD 17) +else() + set(CMAKE_CXX_STANDARD 20) +endif() + +# Locate the project root (two levels up from the current source dir) get_filename_component(PROJECT_ROOT ${CMAKE_CURRENT_SOURCE_DIR} DIRECTORY) get_filename_component(PROJECT_ROOT ${PROJECT_ROOT} DIRECTORY) -# Construct potential paths -set(FILEPATH_CURRENT_DIR "${CMAKE_CURRENT_SOURCE_DIR}/${FILENAME}") -set(FILEPATH_PROJECT_ROOT "${PROJECT_ROOT}/${FILENAME}") +# Include external libraries and helper scripts (dawn and gpu) +include("${PROJECT_ROOT}/cmake/dawn.cmake") +include("${PROJECT_ROOT}/cmake/gpu.cmake") -# Include file finding utility script -include("${CMAKE_CURRENT_SOURCE_DIR}/cmake/find_gpu.cmake") +# Create the executable +add_executable(${PROJECT_NAME} run.cpp) -# Check if the file exists in the current directory -find_project_root(${CMAKE_CURRENT_SOURCE_DIR} ${FILENAME} - TARGET_FILE_PATH) -if("${TARGET_FILE_PATH}" STREQUAL "") - find_project_root(${FILEPATH_CURRENT_DIR} ${FILENAME} - TARGET_FILE_PATH) - if("${TARGET_FILE_PATH}" STREQUAL "") - message( - FATAL_ERROR - "File ${FILENAME} not found in either ${CMAKE_CURRENT_SOURCE_DIR} or ${CMAKE_CURRENT_SOURCE_DIR}/../../" - ) - endif() -endif() +# Platform-specific linking & build settings +if(EMSCRIPTEN) + # Emscripten-specific configuration -# Ensure the build type is set -if(NOT CMAKE_BUILD_TYPE) - set(CMAKE_BUILD_TYPE - Release - CACHE STRING "Choose the type of build: Debug or Release" FORCE) -endif() + # Define a web output directory (adjust as needed) + 
set(WEB_OUTPUT_DIR "${CMAKE_CURRENT_BINARY_DIR}/web_build") -# Define architecture and build type directories or file names -if(CMAKE_SIZEOF_VOID_P EQUAL 8) - set(ARCH "x64") -else() - set(ARCH "x86") -endif() + # If necessary, include the generated WebGPU include dirs first. + include_directories(BEFORE "${DAWN_BUILD_DIR}/gen/src/emdawnwebgpu/include/") -if(CMAKE_BUILD_TYPE STREQUAL "Debug") - set(BUILD_TYPE "Debug") -else() - set(BUILD_TYPE "Release") -endif() + # Create a helper library for WebGPU support. + add_library(webgpu_web "${DAWN_DIR}/third_party/emdawnwebgpu/webgpu.cpp") + target_link_libraries(${PROJECT_NAME} PRIVATE webgpu_web) + + # Set Emscripten-specific link flags that enable WASM output and expose certain symbols. + # Needed to use updated version, emdawnwebgpu + set_target_properties(${PROJECT_NAME} PROPERTIES LINK_FLAGS "\ + -O3 \ + -sUSE_WEBGPU=0 \ + -sWASM=1 \ + -DDAWN_EMSCRIPTEN_TOOLCHAIN=${EMSCRIPTEN_DIR} \ + -sEXPORTED_FUNCTIONS=_main,_malloc,_free,_memcpy \ + -sEXPORTED_RUNTIME_METHODS=ccall \ + -sUSE_GLFW=3 \ + -sALLOW_MEMORY_GROWTH=1 -sSTACK_SIZE=15MB \ + -sASYNCIFY \ + --js-library=${DAWN_BUILD_DIR}/gen/src/emdawnwebgpu/library_webgpu_enum_tables.js \ + --js-library=${DAWN_BUILD_DIR}/gen/src/emdawnwebgpu/library_webgpu_generated_struct_info.js \ + --js-library=${DAWN_BUILD_DIR}/gen/src/emdawnwebgpu/library_webgpu_generated_sig_info.js \ + --js-library=${DAWN_DIR}/third_party/emdawnwebgpu/library_webgpu.js \ + --closure-args=--externs=${EMSCRIPTEN_DIR}/src/closure-externs/webgpu-externs.js \ + ") -if(NOT TARGET gpu) - message(STATUS "GPU_LIB not found") - include("${TARGET_FILE_PATH}/cmake/webgpu.cmake") - include("${TARGET_FILE_PATH}/cmake/gpu.cmake") +else() + # Non-Emscripten (desktop) linking + if(MSVC) + target_link_libraries(gpu + PRIVATE + $<$<CONFIG:Debug>:${WEBGPU_DAWN_DEBUG}> + $<$<CONFIG:Release>:${WEBGPU_DAWN_RELEASE}> + ) + else() + target_link_libraries(gpu PRIVATE webgpu_dawn) + endif() endif() -add_executable(${PROJECT_NAME} run.cpp) +# Link the 
gpu/dawn library to the executable. target_link_libraries(${PROJECT_NAME} PRIVATE gpu) -target_link_libraries(${PROJECT_NAME} PRIVATE wgpu) -target_link_libraries(${PROJECT_NAME} PRIVATE webgpu) -if(WIN32) - # Ensure DLL is copied if on Windows +# Platform-specific post-build actions (e.g. copying DLLs for MSVC) +if(MSVC) add_custom_command( - TARGET ${PROJECT_NAME} - POST_BUILD - COMMAND ${CMAKE_COMMAND} -E copy_if_different ${DLL_PATH} - $<TARGET_FILE_DIR:${PROJECT_NAME}>) + TARGET ${PROJECT_NAME} POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy + ${DAWN_BUILD_DIR}/$<CONFIG>/webgpu_dawn.dll + $<TARGET_FILE_DIR:${PROJECT_NAME}> + COMMENT "Copying webgpu_dawn.dll to the build directory" + ) +endif() + +if(EMSCRIPTEN) + + # Configure the HTML file by replacing @PROJECT_NAME@ with the actual target name. + configure_file(${PROJECT_ROOT}cmake/templates/index.html.in + ${CMAKE_CURRENT_BINARY_DIR}/index.html + @ONLY) + endif() diff --git a/cmake/find_gpu.cmake b/cmake/find_gpu.cmake deleted file mode 100644 index b6b7dad..0000000 --- a/cmake/find_gpu.cmake +++ /dev/null @@ -1,30 +0,0 @@ -# file name to find -set(FILENAME "gpu.hpp") - -# Function to check for file existence up the directory hierarchy -function(find_project_root current_dir filename result_var) - set(found FALSE) # Flag to indicate if the file is found - set(current_check_dir "${current_dir}") # Start from the given directory - # using 1 is jsut to supress the cmane-format warning - foreach(i RANGE 0 2 1) - set(filepath "${current_check_dir}/${filename}") - - if(EXISTS "${filepath}") - set(${result_var} - "${current_check_dir}" - PARENT_SCOPE) - set(found TRUE) - break() - endif() - - # Move one level up - get_filename_component(current_check_dir "${current_check_dir}" - DIRECTORY) - endforeach() - - if(NOT found) - set(${result_var} - "" - PARENT_SCOPE) # Set to empty if not found - endif() -endfunction() diff --git a/cmake/gpu.cmake b/cmake/gpu.cmake index 08db244..d991a18 100644 --- a/cmake/gpu.cmake +++ b/cmake/gpu.cmake @@ -1,69 +1,41 @@ -get_filename_component(PROJECT_ROOT 
${CMAKE_CURRENT_SOURCE_DIR} DIRECTORY) -get_filename_component(PROJECT_ROOT ${PROJECT_ROOT} DIRECTORY) +set(FILENAME "gpu.hpp") -# Construct potential paths -set(FILEPATH_CURRENT_DIR "${CMAKE_CURRENT_SOURCE_DIR}/${FILENAME}") -set(FILEPATH_PROJECT_ROOT "${PROJECT_ROOT}/${FILENAME}") - -# Include file finding utility script -include("${CMAKE_CURRENT_SOURCE_DIR}/cmake/find_gpu.cmake") - -# Check if the file exists in the current directory -find_project_root(${CMAKE_CURRENT_SOURCE_DIR} ${FILENAME} TARGET_FILE_PATH) -if("${TARGET_FILE_PATH}" STREQUAL "") - find_project_root(${FILEPATH_CURRENT_DIR} ${FILENAME} TARGET_FILE_PATH) - if("${TARGET_FILE_PATH}" STREQUAL "") - message( - FATAL_ERROR - "File ${FILENAME} not found in either ${CMAKE_CURRENT_SOURCE_DIR} or ${CMAKE_CURRENT_SOURCE_DIR}/../../" - ) - endif() -endif() - -# Define architecture and build type directories or file names -if(CMAKE_SIZEOF_VOID_P EQUAL 8) - set(ARCH "x64") +# Setup project root here. +if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/${FILENAME}") + set(PROJECT_ROOT "${CMAKE_CURRENT_SOURCE_DIR}") else() - set(ARCH "x86") + get_filename_component(PROJECT_ROOT ${CMAKE_CURRENT_SOURCE_DIR} DIRECTORY) + get_filename_component(PROJECT_ROOT ${PROJECT_ROOT} DIRECTORY) + set(PROJECT_ROOT "${PROJECT_ROOT}/") endif() -if(CMAKE_BUILD_TYPE STREQUAL "Debug") - set(BUILD_TYPE "Debug") +message(STATUS "PROJECT_ROOT: ${PROJECT_ROOT}") + +# Add sources +set(GPU_SOURCES + "${PROJECT_ROOT}/gpu.cpp" + "${PROJECT_ROOT}/numeric_types/half.cpp" + "${DAWN_BUILD_DIR}/gen/include/dawn/webgpu.h" +) + +# Add headers +set(GPU_HEADERS + "${PROJECT_ROOT}/gpu.hpp" + "${PROJECT_ROOT}/utils/logging.hpp" + "${PROJECT_ROOT}/utils/array_utils.hpp" + "${PROJECT_ROOT}/numeric_types/half.hpp" + +) + +# Create the STATIC library for gpu +add_library(gpu STATIC ${GPU_SOURCES} ${GPU_HEADERS}) +set_target_properties(gpu PROPERTIES LINKER_LANGUAGE CXX) +target_include_directories(gpu PUBLIC "${PROJECT_ROOT}") +if(NOT EMSCRIPTEN) + 
target_include_directories(gpu PUBLIC "${DAWN_BUILD_DIR}/gen/include/") + target_include_directories(gpu PUBLIC "${DAWN_BUILD_DIR}/gen/include/dawn/") + target_include_directories(gpu PUBLIC "${DAWN_DIR}/include/") else() - set(BUILD_TYPE "Release") + target_include_directories(gpu PUBLIC "${DAWN_BUILD_DIR}/gen/src/emdawnwebgpu/include/") + target_include_directories(gpu PUBLIC "${DAWN_BUILD_DIR}/gen/src/emdawnwebgpu/include/webgpu/") endif() - -add_library(webgpulib SHARED IMPORTED) -add_library(gpu INTERFACE) -add_library(wgpu INTERFACE) -add_dependencies(gpu webgpulib) -# Define the header-only library -target_include_directories(gpu INTERFACE ${TARGET_FILE_PATH}) - -# Add headers webgpu.h -target_include_directories(wgpu - INTERFACE ${TARGET_FILE_PATH}/third_party/headers) -include(ExternalProject) - -set(DAWN_EXT_PREFIX "${TARGET_FILE_PATH}/third_party/local/dawn") - -ExternalProject_Add( - dawn_project - PREFIX ${DAWN_EXT_PREFIX} - GIT_REPOSITORY "https://dawn.googlesource.com/dawn" - GIT_TAG "main" - SOURCE_DIR "${DAWN_EXT_PREFIX}/source" - BINARY_DIR "${DAWN_EXT_PREFIX}/build" - INSTALL_DIR "${DAWN_EXT_PREFIX}/install" - GIT_SUBMODULES "" - # setting cmake args doesn't work and I don't know why - CONFIGURE_COMMAND - ${CMAKE_COMMAND} -S ${DAWN_EXT_PREFIX}/source -B - ${DAWN_EXT_PREFIX}/build -DDAWN_FETCH_DEPENDENCIES=ON - -DDAWN_ENABLE_INSTALL=ON -DDAWN_BUILD_MONOLITHIC_LIBRARY=ON - -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} -G ${CMAKE_GENERATOR} - INSTALL_COMMAND ${CMAKE_COMMAND} --install . 
--prefix - ${DAWN_EXT_PREFIX}/install - LOG_INSTALL ON) -find_library(LIBDAWN dawn PATHS "${DAWN_EXT_PREFIX}/install/lib") -target_link_libraries(webgpulib INTERFACE ${LIBDAWN}) diff --git a/cmake/templates/index.html.in b/cmake/templates/index.html.in new file mode 100644 index 0000000..6b5957b --- /dev/null +++ b/cmake/templates/index.html.in @@ -0,0 +1,22 @@ + + + + + @PROJECT_NAME@ + + + + + + + diff --git a/cmake/webgpu.cmake b/cmake/webgpu.cmake deleted file mode 100644 index c63f1e2..0000000 --- a/cmake/webgpu.cmake +++ /dev/null @@ -1,61 +0,0 @@ -# Specify the filename to search for -set(FILENAME "gpu.hpp") - -get_filename_component(PROJECT_ROOT ${CMAKE_CURRENT_SOURCE_DIR} DIRECTORY) -get_filename_component(PROJECT_ROOT ${PROJECT_ROOT} DIRECTORY) - -# Construct potential paths -set(FILEPATH_CURRENT_DIR "${CMAKE_CURRENT_SOURCE_DIR}/${FILENAME}") -set(FILEPATH_PROJECT_ROOT "${PROJECT_ROOT}/${FILENAME}") - -# Include file finding utility script -include("${CMAKE_CURRENT_SOURCE_DIR}/cmake/find_gpu.cmake") - -# Check if the file exists in the current directory -find_project_root(${CMAKE_CURRENT_SOURCE_DIR} ${FILENAME} TARGET_FILE_PATH) -if("${TARGET_FILE_PATH}" STREQUAL "") - find_project_root(${FILEPATH_CURRENT_DIR} ${FILENAME} TARGET_FILE_PATH) - if("${TARGET_FILE_PATH}" STREQUAL "") - message( - FATAL_ERROR - "File ${FILENAME} not found in either ${CMAKE_CURRENT_SOURCE_DIR} or ${CMAKE_CURRENT_SOURCE_DIR}/../../" - ) - endif() -endif() - -include(FetchContent) - -set(FETCHCONTENT_BASE_DIR "${TARGET_FILE_PATH}/third_party/fetchcontent") -set(WEBGPU_DIST_LOCAL_PATH - "${TARGET_FILE_PATH}/third_party/local/WebGPU-distribution") - -if(USE_LOCAL_LIBS) - set(WEBGPU_DIST_GIT_REPO ${WEBGPU_DIST_LOCAL_PATH}) - message(STATUS "Using local WebGPU distribution: ${WEBGPU_DIST_LOCAL_PATH}") -else() - set(WEBGPU_DIST_GIT_REPO - "https://github.com/eliemichel/WebGPU-distribution") -endif() - -option(WEBGPU_TAG "WebGPU distribution tag to use") -if(NOT WEBGPU_TAG) - 
set(WEBGPU_TAG "dawn") -endif() -message(STATUS "Using WebGPU distribution tag: ${WEBGPU_TAG}") - -if(WEBGPU_TAG STREQUAL "dawn") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DWEBGPU_BACKEND_DAWN") - # use specific commit set(WEBGPU_TAG - # "1025b977e1927b6d0327e67352f90feb4bcf8274") set(WEBGPU_TAG - # "acf972b7b909f52e183bdae3971b93bb13d4a29e") - # add_compile_options(-UABSL_INTERNAL_AT_LEAST_CXX20) set(CMAKE_CXX_FLAGS - # "${CMAKE_CXX_FLAGS} -UABSL_INTERNAL_AT_LEAST_CXX20") - message(STATUS "Using Dawn backend") -endif() - -FetchContent_Declare( - webgpu - GIT_REPOSITORY ${WEBGPU_DIST_GIT_REPO} - GIT_TAG ${WEBGPU_TAG} - GIT_SHALLOW TRUE) -FetchContent_MakeAvailable(webgpu) diff --git a/docs/gpuflow.md b/docs/gpuflow.md new file mode 100644 index 0000000..d13a228 --- /dev/null +++ b/docs/gpuflow.md @@ -0,0 +1,78 @@ +# GPU.cpp Lifecycle + +```mermaid +flowchart TD + %% Data Preparation & Upload + subgraph "Data Preparation & Upload" + A["CPU Data"] + B["Define Data Properties
(shape, type, size)"] + C["Create GPU Buffer
(allocate raw buffer)"] + D["Create Tensor
(allocates Array with one
or more buffers
and associates Shape)"] + + E["Upload Data via toGPU
(raw buffer)
toGPU
(ctx, data, buffer, size)"] + F["Upload Data via toGPU
(Tensor overload)
toGPU(ctx, data, tensor)"] + G["Optional:
Kernel Parameters
toGPU(ctx, params, Kernel)"] + end + + %% Buffer Setup & Bindings + subgraph "Buffer & Binding Setup" + H["Define Bindings
(Bindings, TensorView)"] + I["Map GPU buffers
to shader bindings
(Collection from Tensor
or single buffers)"] + end + + %% Kernel Setup & Execution + subgraph "Kernel Setup & Execution" + J["Define KernelCode
(WGSL template, workgroup size, precision)"] + K["Create Kernel"] + L["Dispatch Kernel"] + end + + %% GPU Execution & Result Readback + subgraph "GPU Execution & Result Readback" + M["Kernel Execution
(GPU shader runs)"] + N["Readback Data
(toCPU variants)"] + end + + %% Context & Resources + O["Context
(Device, Queue,
TensorPool, KernelPool)"] + + %% Flow Connections + A --> B + B --> C + B --> D + C --> E + D --> F + F --> H + E --> H + H --> I + I --> K + J --> K + G --- K + K --> L + L --> M + M --> N + + %% Context shared by all stages + O --- D + O --- E + O --- F + O --- K + O --- L + O --- N +``` + +• The `gpu::Array` (which wraps a GPU buffer with usage and size) and the `gpu::Shape` (which defines dimensions and rank) are combined—via the creation process—to produce a `gpu::Tensor`. +• A `gpu::TensorView` provides a non‑owning view into a slice of a `gpu::Tensor`. Ex. `TensorView view = {tensor, 0, 256};` +• `gpu::Bindings` collect multiple Tensors (or TensorViews) along with view offset/size information for use in a kernel. +• The `gpu::TensorPool` (managed by the Context) is responsible for the lifetime of tensors and GPU resource cleanup. +• `gpu::KernelCode` contains the WGSL shader template plus metadata (workgroup size, precision, label, and entry point) that drive the kernel configuration. +• The `gpu::createKernelAsync/gpu::createKernel` functions (within the Execution Flow) use the `gpu::Context`, `gpu::Bindings`, and `gpu::KernelCode` to configure and construct a `gpu::Kernel` that manages all the underlying GPU resources (buffers, bind groups, compute pipeline, etc.). +• `gpu::KernelCode`’s workgroup size (a `gpu::Shape`) defines the dispatch configuration, and the `gpu::Kernel` eventually uses the underlying `gpu::Array` (contains` WGPUBuffer, WGPUBufferUsage, size_t`) and `gpu::Shape` data from the created Tensor. 
+ +`gpu::Tensor` Ranks: +Rank 0: Scalar +Rank 1: Vector +Rank 2: Matrix +Rank 3: 3D Tensor (or Cube) +Rank 4: 4D Tensor +Rank (max 8): Higher Dimensional Tensors diff --git a/examples/hello_world/run.cpp b/examples/hello_world/run.cpp index 7453869..b44934b 100644 --- a/examples/hello_world/run.cpp +++ b/examples/hello_world/run.cpp @@ -28,7 +28,13 @@ int main(int argc, char **argv) { printf("--------------\n\n"); // std::unique_ptr ctx = createContext(); + #ifdef USE_DAWN_API + Context ctx = createContextByGpuIdx(0); + auto adaptersList = listAdapters(ctx); + LOG(kDefLog, kInfo, "Available GPU adapters:\n%s", adaptersList.c_str()); + #else Context ctx = createContext(); + #endif static constexpr size_t N = 10000; std::array inputArr, outputArr; for (int i = 0; i < N; ++i) { @@ -36,13 +42,10 @@ int main(int argc, char **argv) { } Tensor input = createTensor(ctx, Shape{N}, kf32, inputArr.data()); Tensor output = createTensor(ctx, Shape{N}, kf32); - std::promise promise; - std::future future = promise.get_future(); Kernel op = createKernel(ctx, {kGelu, 256, kf32}, Bindings{input, output}, {cdiv(N, 256), 1, 1}); - dispatchKernel(ctx, op, promise); - wait(ctx, future); + dispatchKernel(ctx, op); toCPU(ctx, output, outputArr.data(), sizeof(outputArr)); for (int i = 0; i < 12; ++i) { printf(" gelu(%.2f) = %.2f\n", inputArr[i], outputArr[i]); diff --git a/examples/render/run.cpp b/examples/render/run.cpp index f2c6bec..64122cd 100644 --- a/examples/render/run.cpp +++ b/examples/render/run.cpp @@ -124,10 +124,8 @@ int main(int argc, char **argv) { cdiv({NCOLS, NROWS, 1}, wgSize), params); printf("\033[2J\033[H"); while (true) { - std::promise promise; - std::future future = promise.get_future(); - dispatchKernel(ctx, renderKernel, promise); - wait(ctx, future); + + dispatchKernel(ctx, renderKernel); toCPU(ctx, devScreen, screen.data(), sizeof(screen)); params.time = getCurrentTimeInMilliseconds() - zeroTime; @@ -149,11 +147,12 @@ int main(int argc, char **argv) { 
std::array raster; for (size_t i = 0; i < screen.size(); ++i) { - size_t index = - std::min(sizeof(intensity) - 2, - std::max(0ul, static_cast(screen[i] * - (sizeof(intensity) - 2)))); - raster[i] = intensity[index]; + // Convert all values to size_t to ensure proper type matching + const size_t intensity_max = sizeof(intensity) - 2; + const size_t scaled_value = static_cast(screen[i] * intensity_max); + size_t index = std::min(intensity_max, + std::max(static_cast(0), scaled_value)); + raster[i] = intensity[index]; } char buffer[(NROWS + 2) * (NCOLS + 2)]; diff --git a/examples/shadertui/CMakeLists.txt b/examples/shadertui/CMakeLists.txt index 0938023..b728fc8 100644 --- a/examples/shadertui/CMakeLists.txt +++ b/examples/shadertui/CMakeLists.txt @@ -1,3 +1,4 @@ +# Not working yet needs update with libs for emscripten cmake_minimum_required(VERSION 3.28) project(shadertui) diff --git a/gpu.hpp b/gpu.hpp index 5327fe7..69ed0e9 100644 --- a/gpu.hpp +++ b/gpu.hpp @@ -1,6 +1,7 @@ #ifndef GPU_HPP #define GPU_HPP +#include "webgpu.h" #include #include #include @@ -9,21 +10,20 @@ #include #include #include +#include #include #include #include #include // std::pair #include -#include "webgpu/webgpu.h" - -#include "numeric_types/half.hpp" -#include "utils/logging.hpp" - #ifdef __EMSCRIPTEN__ #include "emscripten/emscripten.h" #endif +#include "numeric_types/half.hpp" +#include "utils/logging.hpp" + #ifdef USE_DAWN_API #include "dawn/native/DawnNative.h" #endif @@ -254,6 +254,26 @@ inline std::string toString(const Shape &shape) { */ inline std::string toString(size_t value) { return std::to_string(value); } +/** + * @brief Converts a WGPUStringView to an std::string. + * + * If the view's data is null, an empty string is returned. If the view's + * length equals WGPU_STRLEN, it is assumed to be null‑terminated; otherwise, + * the explicit length is used. + * + * @param strView The WGPUStringView to convert. + * @return std::string The resulting standard string. 
+ */ +inline std::string formatWGPUStringView(WGPUStringView strView) { + if (!strView.data) { + return ""; + } + if (strView.length == WGPU_STRLEN) { + return std::string(strView.data); + } + return std::string(strView.data, strView.length); +} + /** * @brief simple in-place string replacement helper function for substituting * placeholders in a WGSL string template. @@ -430,8 +450,8 @@ struct CallbackData { WGPUBuffer buffer; // managed by owning Kernel size_t bufferSize; void *output; // non-owning, only for target memory in toCPU, not used for - // kernel invocations - std::promise *promise; + // kernel invocations + std::shared_ptr> promise; std::future *future; }; @@ -530,32 +550,27 @@ struct Context { // Default constructor Context() = default; - Context(Context&& other) noexcept - : instance(other.instance), - adapter(other.adapter), - device(other.device), + Context(Context &&other) noexcept + : instance(other.instance), adapter(other.adapter), device(other.device), queue(other.queue), // Re‐initialize pools to point to *this*: - pool(this), - kernelPool(this), - adapterStatus(other.adapterStatus), - deviceStatus(other.deviceStatus) - { + pool(this), kernelPool(this), adapterStatus(other.adapterStatus), + deviceStatus(other.deviceStatus) { LOG(kDefLog, kTrace, "Moving Context ownership"); // Move over the resources in the pools: - pool.data = std::move(other.pool.data); + pool.data = std::move(other.pool.data); kernelPool.data = std::move(other.kernelPool.data); // Null out handles in the source so its destructor won't release them. other.instance = nullptr; - other.adapter = nullptr; - other.device = nullptr; - other.queue = nullptr; + other.adapter = nullptr; + other.device = nullptr; + other.queue = nullptr; // other.adapterStatus = 0; // other.deviceStatus = 0; } - Context& operator=(Context&& other) noexcept { + Context &operator=(Context &&other) noexcept { if (this != &other) { // Free any existing resources. 
In most cases, this should be a no-op // since we typically shouldn't have two active initialized Context @@ -625,7 +640,7 @@ inline Tensor createTensor(TensorPool &pool, WGPUDevice &device, size_t numElements = size(shape); size_t size = sizeBytes(dtype) * numElements; WGPUBufferDescriptor bufferDesc = { - .label = {.data = nullptr, .length = 0}, + .label = {.data = nullptr, .length = 0}, .usage = usage, .size = size, }; @@ -794,6 +809,212 @@ inline void check(bool condition, const char *message, } } +/** + * @brief Pumps events until the provided future is ready. + * + * This helper template function continuously checks the status of the provided + * std::future until it becomes ready. On Emscripten builds, it yields + * control to the JavaScript event loop using emscripten_sleep to allow + * asynchronous callbacks to execute. On other platforms, it processes events + * from the given WGPUInstance using wgpuInstanceProcessEvents. Once the future + * is ready, its value is returned. + * + * @tparam T The type of the value contained in the future. + * @param instance The WGPUInstance used to process events. + * @param f The future to wait on. + * @return T The value retrieved from the ready future. + * + * @code + * std::future deviceFuture = requestDeviceAsync(adapter, + * devDescriptor); WGPUDevice device = wait(instance, deviceFuture); + * @endcode + */ +template T wait(Context &ctx, std::future &f) { +#ifdef __EMSCRIPTEN__ + // Poll until the future is ready. + while (f.wait_for(std::chrono::milliseconds(0)) != + std::future_status::ready) { + // Yield control to the JS event loop. + emscripten_sleep(1); + } + return f.get(); +#else + while (f.wait_for(std::chrono::milliseconds(0)) != + std::future_status::ready) { + wgpuInstanceProcessEvents(ctx.instance); + } + return f.get(); +#endif +} + +// Context Callbacks & Helpers + +/** + * @brief Waits for the provided std::future to become ready by polling its + * status. 
+ * + * This helper template function continuously checks the status of the provided + * std::future until it is ready. On Emscripten builds, it yields control to + * the JavaScript event loop using emscripten_sleep(1) for smooth asynchronous + * behavior. On non-Emscripten platforms, it sleeps for a short duration (10 + * milliseconds) between checks. Once the future is ready, its value is + * returned. + * + * @tparam T The type of the value contained in the future. + * @param f The future to wait on. + * @return T The value retrieved from the ready future. + * + * @code + * std::future contextFuture = createContext(); + * Context ctx = waitForContextFuture(contextFuture); + * @endcode + */ +template T waitForContextFuture(std::future &f, size_t sleepTime = 10) { +#ifdef __EMSCRIPTEN__ + while (f.wait_for(std::chrono::milliseconds(0)) != + std::future_status::ready) { + emscripten_sleep(1); // Yield back to the JS event loop. + } + return f.get(); +#else + while (f.wait_for(std::chrono::milliseconds(0)) != + std::future_status::ready) { + std::this_thread::sleep_for(std::chrono::milliseconds(sleepTime)); + } + return f.get(); +#endif +} + +/** + * @brief Adapter callback function invoked upon completion of an asynchronous + * WebGPU adapter request. + * + * This callback is triggered when the request for a WebGPU adapter completes. + * It verifies whether the adapter was successfully obtained. On failure, it + * logs an error message (in Emscripten builds) and sets an exception on the + * associated promise. On success, it sets the value of the promise with the + * obtained adapter. Finally, it frees the allocated memory for the promise + * pointer. + * + * @param status The status of the adapter request. Expected to be + * WGPURequestAdapterStatus_Success on success. + * @param adapter The WGPUAdapter obtained on a successful request. + * @param message A string view containing additional information about the + * adapter request. 
+ * @param userdata1 A pointer to a heap-allocated + * std::shared_ptr>. + * @param userdata2 Unused. + */ +inline void adapterCallback(WGPURequestAdapterStatus status, + WGPUAdapter adapter, WGPUStringView message, + void *userdata1, void * /*userdata2*/) { + auto *promisePtr = + reinterpret_cast> *>(userdata1); + if (status != WGPURequestAdapterStatus_Success) { +#ifdef __EMSCRIPTEN__ + LOG(kDefLog, kError, "Could not get WebGPU adapter: %.*s", + static_cast(message.length), message.data); +#endif + (*promisePtr) + ->set_exception(std::make_exception_ptr( + std::runtime_error("Request WebGPU adapter failed"))); + } else { + (*promisePtr)->set_value(adapter); + } + delete promisePtr; +} + +/** + * @brief Callback function invoked upon completion of an asynchronous WebGPU + * device request. + * + * This callback is triggered when the request for a WebGPU device completes. It + * verifies that the device was successfully created. On success, the callback + * sets the value of the associated promise; otherwise, it sets an exception. + * After fulfilling the promise, it frees the allocated memory for the promise + * pointer. + * + * @param status The status of the device request. Expected to be + * WGPURequestDeviceStatus_Success on success. + * @param device The WGPUDevice obtained on successful request. + * @param message A string view containing additional information about the + * device request. + * @param userdata1 A pointer to a heap-allocated + * std::shared_ptr>. + * @param userdata2 Unused. 
+ */ +inline void deviceCallback(WGPURequestDeviceStatus status, WGPUDevice device, + WGPUStringView message, void *userdata1, + void * /*userdata2*/) { + auto *promisePtr = + reinterpret_cast> *>(userdata1); + if (status != WGPURequestDeviceStatus_Success) { + (*promisePtr) + ->set_exception(std::make_exception_ptr( + std::runtime_error("Request WebGPU device failed"))); + } else { + LOG(kDefLog, kTrace, "Device Request succeeded %p", + static_cast(device)); + (*promisePtr)->set_value(device); + } + delete promisePtr; +} + +/** + * @brief Asynchronously requests a WebGPU adapter from the given instance. + * + * This helper function wraps the asynchronous call to request an adapter using + * the WebGPU API. It sets up a promise and registers an adapter callback, + * returning a future that will eventually hold the requested WGPUAdapter. + * + * @param instance The WGPUInstance from which to request the adapter. + * @param adapterOpts The options for requesting the adapter. + * @return std::future A future that will eventually hold the + * created WGPUAdapter. + */ +inline std::future +requestAdapterAsync(WGPUInstance instance, + const WGPURequestAdapterOptions &adapterOpts) { + auto promise = std::make_shared>(); + auto *promisePtr = new std::shared_ptr>(promise); + + WGPURequestAdapterCallbackInfo callbackInfo{ + .mode = WGPUCallbackMode_AllowSpontaneous, + .callback = adapterCallback, + .userdata1 = promisePtr, + .userdata2 = nullptr}; + wgpuInstanceRequestAdapter(instance, &adapterOpts, callbackInfo); + return promise->get_future(); +} + +/** + * @brief Asynchronously requests a WebGPU device from a given adapter. + * + * This helper function wraps the asynchronous call to request a device using + * the WebGPU API. It sets up a promise and registers a device callback, + * returning a future that will be fulfilled once the device is available. + * + * @param adapter The WGPUAdapter to request the device from. 
+ * @param devDescriptor The descriptor specifying the characteristics of the + * requested device. + * @return std::future A future that will eventually hold the + * created WGPUDevice. + */ +inline std::future +requestDeviceAsync(WGPUAdapter adapter, + const WGPUDeviceDescriptor &devDescriptor) { + auto promise = std::make_shared>(); + auto *promisePtr = new std::shared_ptr>(promise); + + WGPURequestDeviceCallbackInfo deviceCallbackInfo{ + .mode = WGPUCallbackMode_AllowSpontaneous, + .callback = deviceCallback, + .userdata1 = promisePtr, + .userdata2 = nullptr}; + wgpuAdapterRequestDevice(adapter, &devDescriptor, deviceCallbackInfo); + return promise->get_future(); +} + /** * @brief Factory function to create a GPU context, which aggregates WebGPU API * handles to interact with the GPU including the instance, adapter, device, and @@ -812,316 +1033,388 @@ inline void check(bool condition, const char *message, * @return Context instance representing the created GPU context * */ -inline Context createContext( - const WGPUInstanceDescriptor &desc = {}, - const WGPURequestAdapterOptions &adapterOpts = {}, - const WGPUDeviceDescriptor &devDescriptor = {}) -{ - Context ctx; // stack-allocated +inline std::future +createContextAsync(const WGPUInstanceDescriptor &desc = {}, + const WGPURequestAdapterOptions &adapterOpts = {}, + const WGPUDeviceDescriptor &devDescriptor = {}) { -#ifdef __EMSCRIPTEN__ - ctx.instance = wgpuCreateInstance(nullptr); -#else + auto promise = std::make_shared>(); + + // On native platforms, run our context creation in a detached thread. 
+ + Context ctx; ctx.instance = wgpuCreateInstance(&desc); -#endif - check(ctx.instance, "Initialize WebGPU", __FILE__, __LINE__); + if (!ctx.instance) { + promise->set_exception(std::make_exception_ptr( + std::runtime_error("Failed to create WebGPU instance."))); + return promise->get_future(); + } + try { + auto adapterFuture = requestAdapterAsync(ctx.instance, adapterOpts); + ctx.adapter = wait(ctx, adapterFuture); + ctx.adapterStatus = WGPURequestAdapterStatus_Success; + } catch (const std::exception &ex) { + promise->set_exception(std::make_exception_ptr(ex)); + return promise->get_future(); + } + try { + auto deviceFuture = requestDeviceAsync(ctx.adapter, devDescriptor); + ctx.device = wait(ctx, deviceFuture); + ctx.deviceStatus = WGPURequestDeviceStatus_Success; + } catch (const std::exception &ex) { + promise->set_exception(std::make_exception_ptr(ex)); + return promise->get_future(); + } + ctx.queue = wgpuDeviceGetQueue(ctx.device); + promise->set_value(std::move(ctx)); - LOG(kDefLog, kTrace, "Requesting adapter"); - { - struct AdapterData { - WGPUAdapter adapter = nullptr; - bool requestEnded = false; - WGPURequestAdapterStatus status; - }; - AdapterData adapterData; - - auto onAdapterRequestEnded = [](WGPURequestAdapterStatus status, - WGPUAdapter adapter, - WGPUStringView message, - void *pUserData, void *) { - auto &ad = *reinterpret_cast(pUserData); - ad.status = status; -#ifdef __EMSCRIPTEN__ - if (status != WGPURequestAdapterStatus_Success) { - LOG(kDefLog, kError, "Could not get WebGPU adapter: %.*s", - static_cast(message.length), message.data); - } -#endif - check(status == WGPURequestAdapterStatus_Success, - "Request WebGPU adapter", __FILE__, __LINE__); - ad.adapter = adapter; - ad.requestEnded = true; - }; + return promise->get_future(); +} - WGPURequestAdapterCallbackInfo callbackInfo { - .mode = WGPUCallbackMode_AllowSpontaneous, - .callback = onAdapterRequestEnded, - .userdata1 = &adapterData, - .userdata2 = nullptr - }; - 
wgpuInstanceRequestAdapter(ctx.instance, &adapterOpts, callbackInfo); +/** + * @brief Synchronously waits for and returns the created GPU context. + * + * This function invokes the asynchronous createContext() factory function to + * create a GPU context, then waits for its completion using + * waitForContextFuture. The returned Context holds handles to the WebGPU + * instance, adapter, device, and queue, and is used for subsequent GPU + * operations. + * + * @return Context The fully initialized GPU context. + * + * @code + * Context ctx = waitForContext(); + * // Now ctx can be used for GPU operations. + * @endcode + */ +inline Context createContext(const WGPUInstanceDescriptor &desc = {}, + const WGPURequestAdapterOptions &adapterOpts = {}, + const WGPUDeviceDescriptor &devDescriptor = {}) { + std::future contextFuture = + createContextAsync(desc, adapterOpts, devDescriptor); + return waitForContextFuture(contextFuture); +} + +#ifndef __EMSCRIPTEN__ +#if USE_DAWN_API +/** + * @brief Retrieves the list of available GPU adapters from the Dawn instance. + * + * This function creates a Dawn instance using the provided context's instance + * handle, then enumerates and returns the available GPU adapters as a vector. + * + * @param ctx The Context containing the WebGPU instance handle. + * @return std::vector A vector of available GPU + * adapters. + * + * @code + * std::vector adapters = getAdapters(ctx); + * @endcode + */ +inline std::vector getAdapters(Context &ctx) { + dawn::native::Instance dawnInstance( + reinterpret_cast(ctx.instance)); + return dawnInstance.EnumerateAdapters(); +} - while (!adapterData.requestEnded) { - processEvents(ctx.instance); +/** + * @brief Formats the given vector of Dawn adapters into a single concatenated + * string. 
+ * + * This function iterates over each Dawn adapter in the provided vector, + * retrieves its description using the WebGPU API, and converts the description + * from a WGPUStringView to an std::string using the formatWGPUStringView + * helper. The resulting descriptions are concatenated into a single string + * separated by newline characters. + * + * @param adapters A vector of Dawn adapters obtained from a WebGPU instance. + * @return std::string A newline-delimited string listing each adapter's + * description. + * + * @code + * std::string adapterList = formatAdapters(adapters); + * @endcode + */ +inline std::string +formatAdapters(const std::vector &adapters) { + std::string adapterList; + for (size_t i = 0; i < adapters.size(); ++i) { + auto adapterPtr = adapters[i].Get(); + if (adapterPtr) { + WGPUAdapterInfo info = {}; + wgpuAdapterGetInfo(adapterPtr, &info); + std::string desc = formatWGPUStringView(info.description); + adapterList += "GPU Adapter [" + std::to_string(i) + "]: " + desc + "\n"; + wgpuAdapterInfoFreeMembers(info); } - ctx.adapter = adapterData.adapter; - ctx.adapterStatus = adapterData.status; } + return adapterList; +} - LOG(kDefLog, kTrace, "Requesting device"); - { - struct DeviceData { - WGPUDevice device = nullptr; - bool requestEnded = false; - WGPURequestDeviceStatus status; - }; - DeviceData devData; - - auto onDeviceRequestEnded = [](WGPURequestDeviceStatus status, - WGPUDevice device, - WGPUStringView message, - void *pUserData, void *) { - auto &dd = *reinterpret_cast(pUserData); - dd.status = status; - check(status == WGPURequestDeviceStatus_Success, - "Could not get WebGPU device.", __FILE__, __LINE__); - LOG(kDefLog, kTrace, "Device Request succeeded %p", - static_cast(device)); - dd.device = device; - dd.requestEnded= true; - }; +/** + * @brief Lists the available GPU adapters in the current WebGPU instance. 
+ * + * This function retrieves the list of available GPU adapters using the + * getAdapters helper function, then formats and returns the adapter + * descriptions as a single string using the formatAdapters helper function. + * + * @param ctx The Context containing the WebGPU instance handle. + * @return std::string A newline-delimited string listing each adapter's + * description. + * + * @code + * std::string adapterList = listAdapters(ctx); + * @endcode + */ +inline std::string listAdapters(Context &ctx) { + auto adapters = getAdapters(ctx); + return formatAdapters(adapters); +} - WGPURequestDeviceCallbackInfo deviceCallbackInfo { - .mode = WGPUCallbackMode_AllowSpontaneous, - .callback = onDeviceRequestEnded, - .userdata1= &devData, - .userdata2= nullptr - }; - wgpuAdapterRequestDevice(ctx.adapter, &devDescriptor, deviceCallbackInfo); +/** + * @brief Asynchronously creates a GPU context using the specified GPU index. + * + * This function creates a WebGPU instance, retrieves the available GPU + * adapters, and selects the adapter at the specified index. It then requests a + * device from the selected adapter and sets up a logging callback for device + * errors. The function returns a future that will be fulfilled with the + * created Context once all operations are complete. + * + * @param gpuIdx The index of the GPU adapter to use. + * @param desc Instance descriptor for the WebGPU instance (optional) + * @param devDescriptor Device descriptor for the WebGPU device (optional) + * @return std::future A future that will eventually hold the created + * Context. 
+ * + * @code + * std::future contextFuture = createContextByGpuIdxAsync(0); + * Context ctx = waitForContextFuture(contextFuture); + * @endcode + */ +inline std::future +createContextByGpuIdxAsync(int gpuIdx, const WGPUInstanceDescriptor &desc = {}, + const WGPUDeviceDescriptor &devDescriptor = {}) { + auto promise = std::make_shared>(); + Context ctx; - LOG(kDefLog, kTrace, "Waiting for device request to end"); - while (!devData.requestEnded) { - processEvents(ctx.instance); - } - LOG(kDefLog, kTrace, "Device request ended"); - - ctx.device = devData.device; - ctx.deviceStatus = devData.status; - - // If the device was created, set up logging and fetch the queue - if (devData.status == WGPURequestDeviceStatus_Success) { - WGPULoggingCallbackInfo loggingCallbackInfo { - .nextInChain = nullptr, - .callback = - [](WGPULoggingType type, WGPUStringView message, - void *, void *) { + ctx.instance = wgpuCreateInstance(&desc); + + if (!ctx.instance) { + promise->set_exception(std::make_exception_ptr( + std::runtime_error("Failed to create WebGPU instance."))); + return promise->get_future(); + } + check(ctx.instance, "Initialize WebGPU", __FILE__, __LINE__); + + // Use helper functions to obtain and format the adapters. + auto adapters = getAdapters(ctx); + + if (gpuIdx >= adapters.size()) { + promise->set_exception( + std::make_exception_ptr(std::runtime_error("Invalid GPU index."))); + return promise->get_future(); + } + LOG(kDefLog, kInfo, "Using GPU Adapter[%d]", gpuIdx); + auto adapterPtr = adapters[gpuIdx].Get(); + if (adapterPtr) { + WGPUAdapterInfo info = {}; + wgpuAdapterGetInfo(adapterPtr, &info); + LOG(kDefLog, kInfo, "GPU(Adapter)[%d] = %s", gpuIdx, + formatWGPUStringView(info.description).c_str()); + wgpuAdapterInfoFreeMembers(info); + } + ctx.adapter = reinterpret_cast(adapterPtr); + dawn::native::GetProcs().adapterAddRef(ctx.adapter); + + LOG(kDefLog, kInfo, "Requesting device"); + // Request the device asynchronously (using our requestDeviceAsync helper). 
+ auto deviceFuture = requestDeviceAsync(ctx.adapter, devDescriptor); + try { + ctx.device = wait(ctx, deviceFuture); + ctx.deviceStatus = WGPURequestDeviceStatus_Success; + } catch (const std::exception &ex) { + promise->set_exception(std::make_exception_ptr(ex)); + return promise->get_future(); + } + + WGPULoggingCallbackInfo loggingCallbackInfo{ + .nextInChain = nullptr, + .callback = + [](WGPULoggingType type, WGPUStringView message, void *userdata1, + void *userdata2) { LOG(kDefLog, kError, "Device logging callback: %.*s", static_cast(message.length), message.data); if (type == WGPULoggingType_Error) { throw std::runtime_error("Device error logged."); } }, - .userdata1 = nullptr, - .userdata2 = nullptr - }; - wgpuDeviceSetLoggingCallback(ctx.device, loggingCallbackInfo); - ctx.queue = wgpuDeviceGetQueue(ctx.device); - } - } - - return std::move(ctx); + .userdata1 = nullptr, + .userdata2 = nullptr}; + wgpuDeviceSetLoggingCallback(ctx.device, loggingCallbackInfo); + ctx.queue = wgpuDeviceGetQueue(ctx.device); + promise->set_value(std::move(ctx)); + return promise->get_future(); } - -#ifdef USE_DAWN_API /** - * @brief Factory function to create a GPU context, which aggregates WebGPU API - * handles to interact with the GPU including the instance, adapter, device, and - * queue. + * @brief Synchronously creates a GPU context using the specified GPU index. * - * The function takes gpu index to support for multi GPUs. - * To activate this function, it needs not only webgpu's headers but also DAWN's - * headers. + * This function calls the asynchronous createContextByGpuIdxAsync function to + * create a GPU context, then waits for its completion using + * waitForContextFuture. The returned Context holds handles to the WebGPU + * instance, adapter, device, and queue, and is used for subsequent GPU + * operations. * - * If dawn is used, it also sets up an error callback for device loss. 
- * - * @param[in] gpuIdx GPU index - * @param[in] desc Instance descriptor for the WebGPU instance (optional) - * @param[in] devDescriptor Device descriptor for the WebGPU device (optional) - * @return Context instance representing the created GPU context + * @param gpuIdx The index of the GPU adapter to use. + * @param desc Instance descriptor for the WebGPU instance (optional) + * @param devDescriptor Device descriptor for the WebGPU device (optional) + * @return Context The fully initialized GPU context. * * @code - * Context ctx = createContextByGpuIdx(1); + * Context ctx = createContextByGpuIdx(0); * @endcode */ inline Context createContextByGpuIdx(int gpuIdx, const WGPUInstanceDescriptor &desc = {}, const WGPUDeviceDescriptor &devDescriptor = {}) { - Context context; - { -#ifdef __EMSCRIPTEN__ - // Emscripten does not support the instance descriptor - // and throws an assertion error if it is not nullptr. - context.instance = wgpuCreateInstance(nullptr); -#else - context.instance = wgpuCreateInstance(&desc); -#endif - // check status - check(context.instance, "Initialize WebGPU", __FILE__, __LINE__); - } - - LOG(kDefLog, kInfo, "Requesting adapter"); - { - std::vector adapters = - dawn::native::Instance( - reinterpret_cast(context.instance)) - .EnumerateAdapters(); - LOG(kDefLog, kInfo, "The number of GPUs=%d\n", adapters.size()); - // Note: Second gpu is not available on Macos, but the number of GPUs is 2 - // on Macos. - // Calling wgpuAdapterGetInfo function for the second gpu becomes - // segfault. When you check all GPUs on linux, uncomment out following - // codes. 
- // - // for (size_t i = 0; i < adapters.size(); i++) { - // WGPUAdapterInfo info {}; - // auto ptr = adapters[i].Get(); - // if (ptr && adapters[i]) { - // wgpuAdapterGetInfo(ptr, &info); - // LOG(kDefLog, kInfo, "GPU(Adapter)[%d] = %s\n", i, info.description); - // wgpuAdapterInfoFreeMembers(info); - // } - // } - - { - LOG(kDefLog, kInfo, "Use GPU(Adapter)[%d]\n", gpuIdx); - auto ptr = adapters[gpuIdx].Get(); - if (ptr) { - WGPUAdapterInfo info{}; - wgpuAdapterGetInfo(ptr, &info); - LOG(kDefLog, kInfo, "GPU(Adapter)[%d] = %s\n", gpuIdx, - info.description); - wgpuAdapterInfoFreeMembers(info); - } - context.adapter = adapters[gpuIdx].Get(); - dawn::native::GetProcs().adapterAddRef(context.adapter); - } - } + std::future contextFuture = + createContextByGpuIdxAsync(gpuIdx, desc, devDescriptor); + return waitForContextFuture(contextFuture); +} - LOG(kDefLog, kInfo, "Requesting device"); - { - struct DeviceData { - WGPUDevice device = nullptr; - bool requestEnded = false; - }; - DeviceData devData; - - auto onDeviceRequestEnded = [](WGPURequestDeviceStatus status, - WGPUDevice device, WGPUStringView message, - void *pUserData, void *) { - DeviceData &devData = *reinterpret_cast(pUserData); - check(status == WGPURequestDeviceStatus_Success, - "Could not get WebGPU device.", __FILE__, __LINE__); - LOG(kDefLog, kTrace, "Device Request succeeded %x", - static_cast(device)); - devData.device = device; - devData.requestEnded = true; - }; +#endif // USE_DAWN_API +#endif // __EMSCRIPTEN__ - WGPURequestDeviceCallbackInfo deviceCallbackInfo = { - .mode = WGPUCallbackMode_AllowSpontaneous, - .callback = onDeviceRequestEnded, - .userdata1 = &devData, - .userdata2 = nullptr}; - wgpuAdapterRequestDevice(context.adapter, &devDescriptor, - deviceCallbackInfo); - - LOG(kDefLog, kInfo, "Waiting for device request to end"); - while (!devData.requestEnded) { - processEvents(context.instance); - } - LOG(kDefLog, kInfo, "Device request ended"); - assert(devData.requestEnded); - 
context.device = devData.device; - - WGPULoggingCallbackInfo loggingCallbackInfo = { - .nextInChain = nullptr, - .callback = - [](WGPULoggingType type, WGPUStringView message, void *userdata1, - void *userdata2) { - LOG(kDefLog, kError, "Device logging callback: %.*s", - static_cast(message.length), message.data); - if (type == WGPULoggingType_Error) { - throw std::runtime_error("Device error logged."); - } - }, - .userdata1 = nullptr, - .userdata2 = nullptr}; - wgpuDeviceSetLoggingCallback(context.device, loggingCallbackInfo); - } - context.queue = wgpuDeviceGetQueue(context.device); - return context; +/** + * @brief Callback function invoked upon completion of an asynchronous GPU + * buffer mapping. + * + * This callback is triggered when the GPU buffer mapping for a readback buffer + * is completed. It verifies that the mapping operation was successful, + * retrieves the mapped memory, copies the data from the GPU buffer to a CPU + * memory region, unmaps the buffer, signals the completion by fulfilling the + * associated promise, and cleans up the allocated callback data. + * + * @param status The mapping status. Expected to be WGPUMapAsyncStatus_Success + * on success. + * @param message A string view containing additional information about the + * mapping operation. + * @param userdata1 A pointer to a heap-allocated CallbackData structure + * containing the GPU buffer, buffer size, destination CPU memory pointer, and a + * promise for signaling completion. + * @param userdata2 Unused. + */ +inline void bufferMapCallback(WGPUMapAsyncStatus status, WGPUStringView message, + void *userdata1, void * /*userdata2*/) { + const CallbackData *cbData = static_cast(userdata1); + // Check that mapping succeeded. + check(status == WGPUMapAsyncStatus_Success, "Map readbackBuffer", __FILE__, + __LINE__); + + // Get the mapped memory. 
+ const void *mappedData = + wgpuBufferGetConstMappedRange(cbData->buffer, 0, cbData->bufferSize); + check(mappedData, "Get mapped range", __FILE__, __LINE__); + + // Copy the data from the mapped GPU buffer to the CPU memory. + memcpy(cbData->output, mappedData, cbData->bufferSize); + + // Unmap the buffer. + wgpuBufferUnmap(cbData->buffer); + + // Signal that the copy has completed. + // Ensure you use the arrow operator on the shared_ptr to call set_value(). + cbData->promise->set_value(); + + // Clean up the dynamically allocated callback data. + delete cbData; } -#endif -inline void wait(Context &ctx, std::future &future) { - while (future.wait_for(std::chrono::seconds(0)) != - std::future_status::ready) { - processEvents(ctx.instance); - } +/** + * @brief Callback function invoked when the GPU queue’s submitted work is + * complete. + * + * This callback is registered with the GPU queue after submitting work. When + * invoked, it verifies that all queued work completed successfully, and then + * sets up the buffer mapping callback to initiate the asynchronous mapping of a + * readback buffer. The readback buffer is mapped to access the processed data + * on the CPU. + * + * @param status The status of the completed work. Expected to be + * WGPUQueueWorkDoneStatus_Success on success. + * @param userdata1 A pointer to a heap-allocated CallbackData structure + * containing the readback buffer, buffer size, destination CPU memory pointer, + * and a promise to signal completion. + * @param userdata2 Unused. + */ +inline void queueWorkDoneCallback(WGPUQueueWorkDoneStatus status, + void *userdata1, void * /*userdata2*/) { + const CallbackData *cbData = static_cast(userdata1); + // Ensure the queue work finished successfully. + check(status == WGPUQueueWorkDoneStatus_Success, "Queue work done", __FILE__, + __LINE__); + + // Set up the buffer mapping callback information. 
+ WGPUBufferMapCallbackInfo mapCallbackInfo = { + .mode = WGPUCallbackMode_AllowSpontaneous, + .callback = bufferMapCallback, + .userdata1 = const_cast(cbData), // Pass the callback data. + .userdata2 = nullptr // No additional user data. + }; + + // Begin the asynchronous mapping of the readback buffer. + wgpuBufferMapAsync(cbData->buffer, WGPUMapMode_Read, 0, cbData->bufferSize, + mapCallbackInfo); } /** * @brief Copies data from a GPU buffer to CPU memory. * @param[in] ctx Context instance to manage the operation - * @param[in] tensor Tensor instance representing the GPU buffer to copy from * @param[out] data Pointer to the CPU memory to copy the data to * @param[in] bufferSize Size of the data buffer in bytes * @param[in] op StagingBuffer instance to manage the operation + * @param[in] sourceOffset Offset in the GPU buffer to start copying from. * * @code * toCPU(ctx, tensor, data, bufferSize); * @endcode */ -inline void toCPU(Context &ctx, Tensor &tensor, void *data, size_t bufferSize, - CopyData &op) { + +// NOTE: I think this one is redundant? CopyData not used externally. +inline std::future toCPUAsync(Context &ctx, void *data, size_t bufferSize, + CopyData &op, size_t sourceOffset = 0) { + // Submit the command buffer and release it. wgpuQueueSubmit(ctx.queue, 1, &op.commandBuffer); wgpuCommandBufferRelease(op.commandBuffer); - CallbackData callbackData = {op.readbackBuffer, bufferSize, data, &op.promise, - &op.future}; + // Create a promise and get its future. + auto promise = std::make_shared>(); + + // Allocate callback data so it remains valid until the async + // chain finishes. + CallbackData *cbData = new CallbackData{ + op.readbackBuffer, // The GPU buffer to be read back. + bufferSize, + data, // CPU memory destination. + promise, // The promise to be signaled. + }; + + // Set up the work-done callback to initiate the buffer mapping. 
WGPUQueueWorkDoneCallbackInfo workDoneCallbackInfo = { .mode = WGPUCallbackMode_AllowSpontaneous, - .callback = - [](WGPUQueueWorkDoneStatus status, void *userdata1, void *userdata2) { - check(status == WGPUQueueWorkDoneStatus_Success, "Queue work done", - __FILE__, __LINE__); - const auto *data = static_cast(userdata1); - WGPUBufferMapCallbackInfo mapCallbackInfo = { - .mode = WGPUCallbackMode_AllowSpontaneous, - .callback = - [](WGPUMapAsyncStatus status, WGPUStringView message, - void *userdata1, void *userdata2) { - const auto *data = static_cast(userdata1); - check(status == WGPUMapAsyncStatus_Success, - "Map readbackBuffer", __FILE__, __LINE__); - const void *mappedData = wgpuBufferGetConstMappedRange( - data->buffer, /*offset=*/0, data->bufferSize); - check(mappedData, "Get mapped range", __FILE__, __LINE__); - memcpy(data->output, mappedData, data->bufferSize); - wgpuBufferUnmap(data->buffer); - data->promise->set_value(); - }, - .userdata1 = const_cast(data), - .userdata2 = nullptr}; - wgpuBufferMapAsync(data->buffer, WGPUMapMode_Read, 0, - data->bufferSize, mapCallbackInfo); - }, - .userdata1 = &callbackData, + .callback = queueWorkDoneCallback, + .userdata1 = const_cast(cbData), .userdata2 = nullptr}; + + // Begin the asynchronous chain by registering the queue work-done callback. wgpuQueueOnSubmittedWorkDone(ctx.queue, workDoneCallbackInfo); - wait(ctx, op.future); + // Release the readback buffer as it is no longer needed. + if (op.readbackBuffer) { + wgpuBufferRelease(op.readbackBuffer); + } + + return promise->get_future(); } /** @@ -1136,34 +1429,124 @@ inline void toCPU(Context &ctx, Tensor &tensor, void *data, size_t bufferSize, * * @param[in] ctx Context instance to manage the operation * @param[in] tensor Tensor instance representing the GPU buffer to copy from - * @param[in] bufferSize Size of the data buffer in bytes + * @param[in] bufferSize Size to read in bytes as out data. 
* @param[out] data Pointer to the CPU memory to copy the data to + * @param[in] sourceOffset Offset in the GPU buffer to start copying from. */ -inline void toCPU(Context &ctx, Tensor &tensor, void *data, size_t bufferSize) { +inline std::future toCPUAsync(Context &ctx, Tensor &tensor, void *data, + size_t bufferSize, + size_t sourceOffset = 0) { + // Create a promise that will later be satisfied when the async copy + // completes. + auto promise = std::make_shared>(); + + // Create a readback buffer that will be used for copying and mapping. + WGPUBufferDescriptor readbackBufferDescriptor = { + .label = {.data = nullptr, .length = 0}, + .usage = WGPUBufferUsage_CopyDst | WGPUBufferUsage_MapRead, + .size = bufferSize, // Size of the readback buffer. + }; + WGPUBuffer readbackBuffer = + wgpuDeviceCreateBuffer(ctx.device, &readbackBufferDescriptor); + + // Create a command encoder and record a copy from the tensor GPU buffer + WGPUCommandEncoder commandEncoder = + wgpuDeviceCreateCommandEncoder(ctx.device, nullptr); + wgpuCommandEncoderCopyBufferToBuffer(commandEncoder, tensor.data.buffer, + sourceOffset, readbackBuffer, 0, + bufferSize); + // Finish recording by creating a command buffer and release the encoder. + WGPUCommandBuffer commandBuffer = + wgpuCommandEncoderFinish(commandEncoder, nullptr); + wgpuCommandEncoderRelease(commandEncoder); + check(commandBuffer, "Create command buffer", __FILE__, __LINE__); + + // Submit the work to the queue and release the command buffer immediately. + wgpuQueueSubmit(ctx.queue, 1, &commandBuffer); + wgpuCommandBufferRelease(commandBuffer); + + // Allocate callback data + CallbackData *cbData = new CallbackData{ + readbackBuffer, // The readback buffer to map. + bufferSize, // The size of the copy. + data, // CPU memory destination. + promise // The promise to signal when done. + }; + + // Set up the work-done callback. 
When the queue’s submitted work is + // completed, it is routed to queueWorkDoneCallback which then starts the + // asynchronous map. + WGPUQueueWorkDoneCallbackInfo workDoneCallbackInfo = { + .mode = WGPUCallbackMode_AllowSpontaneous, + .callback = queueWorkDoneCallback, + .userdata1 = cbData, + .userdata2 = nullptr, + }; + + // Register the callback. The async chain continues inside + // queueWorkDoneCallback. + wgpuQueueOnSubmittedWorkDone(ctx.queue, workDoneCallbackInfo); + + return promise->get_future(); +} + +inline std::future toCPUAsync(Context &ctx, WGPUBuffer buffer, void *data, + size_t bufferSize, + size_t sourceOffset = 0) { + + // Create an operation structure (here we reuse CopyData solely for its + // members that we need to create a readback buffer and command buffer). CopyData op; - op.future = op.promise.get_future(); + + // Create the promise that will be fulfilled once the copy is done. + auto promise = std::make_shared>(); + + // Create a readback buffer that we can map for reading. { WGPUBufferDescriptor readbackBufferDescriptor = { - .label = {.data = nullptr, .length = 0}, + .label = {.data = nullptr, .length = 0}, .usage = WGPUBufferUsage_CopyDst | WGPUBufferUsage_MapRead, .size = bufferSize, }; op.readbackBuffer = wgpuDeviceCreateBuffer(ctx.device, &readbackBufferDescriptor); } + + // Create a command encoder which copies from the provided buffer to the + // readback buffer. 
{ - WGPUCommandEncoder commandEncoder; - commandEncoder = wgpuDeviceCreateCommandEncoder(ctx.device, nullptr); - wgpuCommandEncoderCopyBufferToBuffer(commandEncoder, tensor.data.buffer, 0, + WGPUCommandEncoder commandEncoder = + wgpuDeviceCreateCommandEncoder(ctx.device, nullptr); + wgpuCommandEncoderCopyBufferToBuffer(commandEncoder, buffer, sourceOffset, op.readbackBuffer, 0, bufferSize); op.commandBuffer = wgpuCommandEncoderFinish(commandEncoder, nullptr); wgpuCommandEncoderRelease(commandEncoder); check(op.commandBuffer, "Create command buffer", __FILE__, __LINE__); } - toCPU(ctx, tensor, data, bufferSize, op); - if (op.readbackBuffer) { - wgpuBufferRelease(op.readbackBuffer); - } + + // Submit the command and release the command buffer. + wgpuQueueSubmit(ctx.queue, 1, &op.commandBuffer); + wgpuCommandBufferRelease(op.commandBuffer); + + // Allocate callback data + CallbackData *cbData = new CallbackData{ + op.readbackBuffer, // The readback buffer created above. + bufferSize, // Size of the copy. + data, // Destination CPU memory. // Offset in the GPU buffer. + promise // Our promise to satisfy when done. + }; + + // Set up the queue work-done callback info. + WGPUQueueWorkDoneCallbackInfo workDoneCallbackInfo = { + .mode = WGPUCallbackMode_AllowSpontaneous, + .callback = queueWorkDoneCallback, // Our free function callback. + .userdata1 = cbData, // Pass the callback data pointer. + .userdata2 = nullptr}; + + // Start the asynchronous chain by registering the work-done callback. 
+ wgpuQueueOnSubmittedWorkDone(ctx.queue, workDoneCallbackInfo); + + return promise->get_future(); } /** @@ -1174,76 +1557,86 @@ inline void toCPU(Context &ctx, Tensor &tensor, void *data, size_t bufferSize) { * @param[out] data Array of floats to copy the data to * * @code - * toCPU(ctx, tensor, data); + * std::future toCPUFuture = toCPU(ctx, tensor, data); + * wait(ctx, toCPUFuture); * @endcode */ template -void toCPU(Context &ctx, Tensor &tensor, std::array &data) { - toCPU(ctx, tensor, data.data(), sizeof(data)); +inline std::future toCPUAsync(Context &ctx, Tensor &tensor, + std::array &data, + size_t sourceOffset = 0) { + return toCPUAsync(ctx, tensor, data.data(), sizeof(data), sourceOffset); } -inline void toCPU(Context &ctx, WGPUBuffer buffer, void *data, size_t size) { - uint64_t bufferSize = size; - CopyData op; - op.future = op.promise.get_future(); - { - WGPUBufferDescriptor readbackBufferDescriptor = { - .label = {.data = nullptr, .length = 0}, - .usage = WGPUBufferUsage_CopyDst | WGPUBufferUsage_MapRead, - .size = bufferSize, - }; - op.readbackBuffer = - wgpuDeviceCreateBuffer(ctx.device, &readbackBufferDescriptor); - } - { - WGPUCommandEncoder commandEncoder; - commandEncoder = wgpuDeviceCreateCommandEncoder(ctx.device, nullptr); - wgpuCommandEncoderCopyBufferToBuffer(commandEncoder, buffer, 0, - op.readbackBuffer, 0, bufferSize); - op.commandBuffer = wgpuCommandEncoderFinish(commandEncoder, nullptr); - wgpuCommandEncoderRelease(commandEncoder); - check(op.commandBuffer, "Create command buffer", __FILE__, __LINE__); - } - wgpuQueueSubmit(ctx.queue, 1, &op.commandBuffer); - wgpuCommandBufferRelease(op.commandBuffer); - CallbackData callbackData = {op.readbackBuffer, bufferSize, data, &op.promise, - &op.future}; +/** + * @brief Synchronous wrapper for copying from a Tensor GPU buffer to CPU + * memory. 
+ * + * This function synchronously waits for the asynchronous copy operation to + * complete, ensuring that the data is fully transferred from the GPU buffer to + * the CPU memory before returning. + * + * @param ctx Context instance to manage the operation + * @param tensor Tensor instance representing the GPU buffer to copy from + * @param data Pointer to the CPU memory to copy the data to + * @param bufferSize Size of the data buffer in bytes + * @param instance WGPUInstance used for processing events during waiting + * + * @code + * toCPU(ctx, tensor, data, bufferSize, instance); + * @endcode + */ +inline void toCPU(Context &ctx, Tensor &tensor, void *data, size_t bufferSize, + size_t sourceOffset = 0) { + auto future = toCPUAsync(ctx, tensor, data, bufferSize, sourceOffset); + wait(ctx, future); +} - WGPUQueueWorkDoneCallbackInfo workDoneCallbackInfo = { - .mode = WGPUCallbackMode_AllowSpontaneous, - .callback = - [](WGPUQueueWorkDoneStatus status, void *userdata1, void *userdata2) { - check(status == WGPUQueueWorkDoneStatus_Success, "Queue work done", - __FILE__, __LINE__); - const auto *data = static_cast(userdata1); - WGPUBufferMapCallbackInfo mapCallbackInfo = { - .mode = WGPUCallbackMode_AllowSpontaneous, - .callback = - [](WGPUMapAsyncStatus status, WGPUStringView message, - void *userdata1, void *userdata2) { - const auto *data = static_cast(userdata1); - check(status == WGPUMapAsyncStatus_Success, - "Map readbackBuffer", __FILE__, __LINE__); - const void *mappedData = wgpuBufferGetConstMappedRange( - data->buffer, /*offset=*/0, data->bufferSize); - check(mappedData, "Get mapped range", __FILE__, __LINE__); - memcpy(data->output, mappedData, data->bufferSize); - wgpuBufferUnmap(data->buffer); - data->promise->set_value(); - }, - .userdata1 = const_cast(data), - .userdata2 = nullptr}; - wgpuBufferMapAsync(data->buffer, WGPUMapMode_Read, 0, - data->bufferSize, mapCallbackInfo); - }, - .userdata1 = &callbackData, - .userdata2 = nullptr}; - 
wgpuQueueOnSubmittedWorkDone(ctx.queue, workDoneCallbackInfo); +/** + * @brief Synchronous wrapper for copying from a GPU buffer to CPU memory. + * + * This function synchronously waits for the asynchronous copy operation to + * complete, ensuring that the data is fully transferred from the GPU buffer to + * the CPU memory before returning. + * + * @param ctx Context instance to manage the operation + * @param buffer WGPUBuffer instance representing the GPU buffer to copy from + * @param data Pointer to the CPU memory to copy the data to + * @param size Size of the data buffer in bytes + * @param instance WGPUInstance used for processing events during waiting + * + * @code + * toCPU(ctx, buffer, data, size, instance); + * @endcode + */ +inline void toCPU(Context &ctx, WGPUBuffer buffer, void *data, size_t size, + size_t sourceOffset = 0) { + auto future = toCPUAsync(ctx, buffer, data, size, sourceOffset); + wait(ctx, future); +} - wait(ctx, op.future); - if (op.readbackBuffer) { - wgpuBufferRelease(op.readbackBuffer); - } +/** + * @brief Synchronous wrapper for copying from a Tensor GPU buffer to CPU + * memory for an array of floats instead of a pointer to a float buffer. + * + * This function synchronously waits for the asynchronous copy operation to + * complete, ensuring that the data is fully transferred from the GPU buffer to + * the CPU memory before returning. 
+ * + * @param ctx Context instance to manage the operation + * @param tensor Tensor instance representing the GPU buffer to copy from + * @param data Array of floats to copy the data to + * @param instance WGPUInstance used for processing events during waiting + * + * @code + * toCPU(ctx, tensor, data, instance); + * @endcode + */ +template +inline void toCPU(Context &ctx, Tensor &tensor, std::array &data, + size_t sourceOffset = 0) { + auto future = toCPUAsync(ctx, tensor, data, sourceOffset); + wait(ctx, future); } /** @@ -1375,9 +1768,24 @@ inline Shape cdiv(Shape total, Shape group) { } /** - * @brief A factory function to create a kernel on the GPU. The kernel is - * created with the given WGSL code, input tensors, output tensor, and - * optional parameters. + * @brief Packages the shader compilation information along with a promise for + * asynchronous signaling. + * + * This structure holds a pointer to a CompilationInfo instance that collects + * details such as status, messages, line numbers, and positions from the shader + * compilation. It also contains a shared pointer to a std::promise which + * is used to signal the completion of the asynchronous shader compilation + * process. + */ +struct CompData { + CompilationInfo *compInfo; + std::shared_ptr> compPromise; +}; + +/** + * @brief A factory function to create a kernel asynchronously on the GPU. + * The kernel is created with the given WGSL code, input tensors, + * output tensor, and optional parameters. 
* * Note that the values of the input tensors are not used here, only the * reference handles to the underlying buffers as well as the size of the @@ -1397,34 +1805,40 @@ inline Shape cdiv(Shape total, Shape group) { * @return Kernel instance representing the created kernel * * @code - * Kernel kernel = createKernel(ctx, code, dataBindings, numInputs, + * std::future kernelFuture = createKernelAsync(ctx, code, dataBindings, + numInputs, output, nThreads, params, paramsSize); + * Kernel kernel = wait(ctx.instance, kernelFuture); * @endcode - * output, nThreads, params, paramsSize); + */ -inline Kernel createKernel(Context& ctx, const KernelCode &code, - const Tensor *dataBindings, size_t numTensors, - const size_t *viewOffsets, - const Shape &totalWorkgroups, - const void *params = nullptr, size_t paramsSize = 0, - CompilationInfo *compilationInfo = nullptr, - const char *cacheKey = nullptr) { +inline std::future +createKernelAsync(Context &ctx, const KernelCode &code, + const Tensor *dataBindings, size_t numTensors, + const size_t *viewOffsets, const Shape &totalWorkgroups, + const void *params = nullptr, size_t paramsSize = 0, + CompilationInfo *compilationInfo = nullptr, + const char *cacheKey = nullptr) { // Create a cache key by the pointer values of the data bindings and the // kernel code if (cacheKey != nullptr && ctx.kernelPool.data.find(cacheKey) != ctx.kernelPool.data.end()) { - LOG(kDefLog, kInfo, "Kernel cache hit"); - return ctx.kernelPool.data[cacheKey]; + std::promise ready; + ready.set_value(ctx.kernelPool.data[cacheKey]); + return ready.get_future(); } + // Create an outer promise for the new kernel. 
+ std::promise outerPromise; + std::future outerFuture = outerPromise.get_future(); + assert(totalWorkgroups.rank == 3); WGPUDevice device = ctx.device; WGPUQueue queue = ctx.queue; Kernel op(new RawKernel()); - // paramIndex is the index into bgLayoutEntries for the parameters buffer If // there are no parameters for the kernel, paramsSize == 0 and paramIndex is // effectively undefined (== -1) - size_t paramIndex = -1; + size_t paramIndex = static_cast(-1); // Note: paramIndex is undefined unless paramsSize > 0 size_t numBindings = numTensors; if (paramsSize > 0) { @@ -1433,11 +1847,13 @@ inline Kernel createKernel(Context& ctx, const KernelCode &code, // op.buffers, op.bufferSizes and // bgLayoutEntries } + op->buffers = std::make_unique(numBindings); op->bufferSizes = std::make_unique(numBindings); op->numBindings = numBindings; - std::vector bgLayoutEntries(numBindings); + // Create layout entries for input buffers + std::vector bgLayoutEntries(numBindings); for (size_t i = 0; i < numTensors; ++i) { bgLayoutEntries[i] = WGPUBindGroupLayoutEntry{ .binding = static_cast(i), @@ -1450,8 +1866,6 @@ inline Kernel createKernel(Context& ctx, const KernelCode &code, }; } if (paramsSize > 0) { - LOG(kDefLog, kInfo, "Create layout entry for the params buffer"); - // Create layout entry for the params buffer bgLayoutEntries[paramIndex] = WGPUBindGroupLayoutEntry{ .binding = static_cast(paramIndex), .visibility = WGPUShaderStage_Compute, @@ -1464,10 +1878,11 @@ inline Kernel createKernel(Context& ctx, const KernelCode &code, } WGPUBindGroupLayoutDescriptor bgLayoutDesc = { .entryCount = static_cast(bgLayoutEntries.size()), - .entries = bgLayoutEntries.data(), - }; + .entries = bgLayoutEntries.data()}; WGPUBindGroupLayout bgLayout = wgpuDeviceCreateBindGroupLayout(device, &bgLayoutDesc); + + // Assign buffers from dataBindings. 
for (size_t i = 0; i < numTensors; ++i) { op->buffers[i] = dataBindings[i].data.buffer; op->bufferSizes[i] = dataBindings[i].data.size; @@ -1475,7 +1890,7 @@ inline Kernel createKernel(Context& ctx, const KernelCode &code, // Create a buffer for the Params struct if (paramsSize > 0) { WGPUBufferDescriptor paramsBufferDesc = { - .label = {.data = nullptr, .length = 0}, + .label = {.data = nullptr, .length = 0}, .usage = WGPUBufferUsage_Uniform | WGPUBufferUsage_CopyDst, .size = paramsSize, .mappedAtCreation = false, @@ -1487,6 +1902,8 @@ inline Kernel createKernel(Context& ctx, const KernelCode &code, } else { LOG(kDefLog, kTrace, "No params buffer needed"); } + + // Build bind group entries and the bind group. std::vector bindGroupEntries(numBindings); for (size_t i = 0; i < numTensors; ++i) { bindGroupEntries[i] = WGPUBindGroupEntry{ @@ -1514,6 +1931,7 @@ inline Kernel createKernel(Context& ctx, const KernelCode &code, }; op->bindGroup = wgpuDeviceCreateBindGroup(device, &bindGroupDesc); + // Create pipeline layout. WGPUPipelineLayoutDescriptor pipelineLayoutDesc = { .bindGroupLayoutCount = 1, .bindGroupLayouts = &bgLayout, @@ -1521,63 +1939,151 @@ inline Kernel createKernel(Context& ctx, const KernelCode &code, WGPUPipelineLayout pipelineLayout = wgpuDeviceCreatePipelineLayout(device, &pipelineLayoutDesc); + // Prepare the WGSL source and shader module descriptor. WGPUShaderSourceWGSL wgslDesc = { .chain = {.sType = WGPUSType_ShaderSourceWGSL}, .code = {.data = code.data.c_str(), .length = code.data.length()}}; - WGPUShaderModuleDescriptor shaderModuleDesc = {}; shaderModuleDesc.nextInChain = &wgslDesc.chain; shaderModuleDesc.label = {code.label.c_str(), code.label.length()}; - WGPUComputePipelineDescriptor computePipelineDesc = {}; - computePipelineDesc.layout = pipelineLayout; - computePipelineDesc.compute.module = + // Create the shader module. 
+ WGPUShaderModule shaderModule = wgpuDeviceCreateShaderModule(device, &shaderModuleDesc); + // If compilation info is requested, register the callback immediately. + if (compilationInfo) { + auto compPromise = std::make_shared>(); + std::future compFuture = compPromise->get_future(); + // Allocate helper data to pass to the callback. + auto *compData = new CompData{compilationInfo, compPromise}; + + auto compilationCallback = [](WGPUCompilationInfoRequestStatus status, + WGPUCompilationInfo const *info, + void *userdata1, void * /*userdata2*/) { + CompData *cd = reinterpret_cast(userdata1); + if (info && cd->compInfo) { + cd->compInfo->status = status; + for (uint32_t i = 0; i < info->messageCount; ++i) { + cd->compInfo->messages.push_back( + std::string(info->messages[i].message.data, + info->messages[i].message.length)); + cd->compInfo->lineNums.push_back(info->messages[i].lineNum); + cd->compInfo->linePos.push_back(info->messages[i].linePos); + } + cd->compInfo->finished = true; + } + cd->compPromise->set_value(); + delete cd; + }; + + WGPUCompilationInfoCallbackInfo compilationCallbackInfo = {}; + compilationCallbackInfo.mode = WGPUCallbackMode_AllowSpontaneous; + compilationCallbackInfo.callback = compilationCallback; + compilationCallbackInfo.userdata1 = compData; + compilationCallbackInfo.userdata2 = nullptr; + + // Register callback and then wait for the result. + wgpuShaderModuleGetCompilationInfo(shaderModule, compilationCallbackInfo); + wait(ctx, compFuture); + } + + // Now create the compute pipeline using the shader module. 
+ WGPUComputePipelineDescriptor computePipelineDesc = {}; + computePipelineDesc.layout = pipelineLayout; + computePipelineDesc.compute.module = shaderModule; computePipelineDesc.compute.entryPoint = {code.entryPoint.c_str(), code.entryPoint.length()}; computePipelineDesc.label = {code.label.c_str(), code.label.length()}; - op->computePipeline = wgpuDeviceCreateComputePipeline(device, &computePipelineDesc); + op->totalWorkgroups = {totalWorkgroups[0], totalWorkgroups[1], totalWorkgroups[2]}; + resetCommandBuffer(device, op); if (cacheKey != nullptr) ctx.kernelPool.data[cacheKey] = op; - auto compilationInfoCallback = [](WGPUCompilationInfoRequestStatus status, - WGPUCompilationInfo const *compilationInfo, - void *userdata1, void *userdata2) { - CompilationInfo *result = static_cast(userdata1); - if (compilationInfo && result) { - result->status = status; - for (uint32_t i = 0; i < compilationInfo->messageCount; ++i) { - printf("Message %d: %.*s\n", i, - static_cast(compilationInfo->messages[i].message.length), - compilationInfo->messages[i].message.data); - result->messages.push_back( - std::string(compilationInfo->messages[i].message.data, - compilationInfo->messages[i].message.length)); - result->lineNums.push_back(compilationInfo->messages[i].lineNum); - result->linePos.push_back(compilationInfo->messages[i].linePos); - } - result->finished = true; - } else { - LOG(kDefLog, kTrace, "No compilation info or result"); - } - }; + outerPromise.set_value(op); + return outerFuture; +} - WGPUCompilationInfoCallbackInfo compilationCallbackInfo = { - .mode = WGPUCallbackMode_AllowSpontaneous, - .callback = compilationInfoCallback, - .userdata1 = static_cast(compilationInfo), - .userdata2 = nullptr}; +/* + * @brief Overload which wraps the createKernelAsync factory function to create + * a kernel on the GPU. This overload uses takes a pointer and size for the + * input tensors instead of a static collection and a void pointer for params + * instead of a static type. 
+ * + * @param[in] ctx Context instance to manage the kernel + * @param[in] code WGSL code for the kernel + * @param[in] dataBindings Pointer to a span of tensors bound to the kernel + * @param[in] numTensors Number of tensors in the dataBindings span + * @param[in] totalWorkgroups Number of workgroups in the x, y, z grid, must be + * a Shape of rank == 3. + * @param[in] params Optional parameters for the kernel. If the kernel does + * not have any parameters, use NoParam. + * @return Kernel instance representing the created kernel + * + * @code + * std::future kernelFuture = createKernel(ctx, code, tensorData, + * output,totalWorkgroups, params); Kernel kernel = wait(ctx.instance, + * kernelFuture); + * @endcode + */ +inline Kernel createKernel(Context &ctx, const KernelCode &code, + const Tensor *dataBindings, size_t numTensors, + const size_t *viewOffsets, + const Shape &totalWorkgroups, + const void *params = nullptr, size_t paramsSize = 0, + CompilationInfo *compilationInfo = nullptr, + const char *cacheKey = nullptr) { + std::future kernelFuture = createKernelAsync( + ctx, code, dataBindings, numTensors, viewOffsets, totalWorkgroups, params, + paramsSize, compilationInfo, cacheKey); + return wait(ctx, kernelFuture); +} - while (compilationInfo && !compilationInfo->finished) { - processEvents(ctx.instance); +/** + * @brief Overload which wraps the createKernelAsync factory function to create + * a kernel asynchronously on the GPU. This overload uses takes a static + * collection of input tensors instead of a pointer and a statically determined + * ParamsType instead of casting params to a void pointer. + * + * @param[in] ctx Context instance to manage the kernel + * @param[in] code WGSL code for the kernel + * @param[in] dataBindings A Bindings of tensors whose GPU buffers are bound + * to the kernel as inputs and outputs. + * @param[in] totalWorkgroups Number of workgroups in the x, y, z grid, must be + * a Shape of rank == 3. 
+ * @param[in] params Optional parameters for the kernel. If the kernel does + * not have any parameters, use NoParam. + * @return Kernel instance representing the created kernel + * + * @code + * std::future kernelFuture = createKernel(ctx, code, tensorData, + * output,totalWorkgroups, params); Kernel kernel = wait(ctx.instance, + * kernelFuture); + * @endcode + */ +template +std::future +createKernelAsync(Context &ctx, const KernelCode &code, + const Bindings &dataBindings, + const Shape &totalWorkgroups, + const ParamsType ¶ms = ParamsType{}, + CompilationInfo *compilationInfo = nullptr, + const char *cacheKey = nullptr) { + if constexpr (!IsNoParam) { + return createKernelAsync(ctx, code, dataBindings.data.data(), numInputs, + dataBindings.viewOffsets.data(), totalWorkgroups, + reinterpret_cast(¶ms), + sizeof(ParamsType), compilationInfo, cacheKey); + } else { + return createKernelAsync(ctx, code, dataBindings.data.data(), numInputs, + dataBindings.viewOffsets.data(), totalWorkgroups, + nullptr, 0, compilationInfo, cacheKey); } - return op; } /** @@ -1597,9 +2103,9 @@ inline Kernel createKernel(Context& ctx, const KernelCode &code, * @return Kernel instance representing the created kernel * * @code - * Kernel kernel = createKernel(ctx, code, tensorData, output, + * Kernel kernel = createKernel(ctx, code, tensorData, output,totalWorkgroups, + * params); * @endcode - * totalWorkgroups, params); */ template Kernel createKernel(Context &ctx, const KernelCode &code, @@ -1620,6 +2126,37 @@ Kernel createKernel(Context &ctx, const KernelCode &code, } } +/** + * @brief Free‑standing callback for dispatchKernel’s asynchronous work‐done. + * + * This callback is invoked when the GPU queue signals the completion of the + * submitted workload for a kernel dispatch. It receives the work-done status + * and a userdata pointer, which is expected to be a heap‑allocated pointer to a + * std::promise. + * + * On success, the promise is fulfilled by calling set_value(). 
Otherwise, it is + * set with an exception. After setting the promise state, the allocated memory + * for the promise is freed. + * + * @param status The status of the work done. Expected to be + * WGPUQueueWorkDoneStatus_Success on success. + * @param userdata1 A heap allocated pointer to std::promise which is set + * when the work is done. + * @param userdata2 Unused. + */ +inline void dispatchKernelCallback(WGPUQueueWorkDoneStatus status, + void *userdata1, void * /*userdata2*/) { + // Cast the userdata pointer back to our heap‑allocated promise. + auto *p = reinterpret_cast *>(userdata1); + if (status == WGPUQueueWorkDoneStatus_Success) { + p->set_value(); + } else { + p->set_exception(std::make_exception_ptr( + std::runtime_error("Queue work did not complete successfully."))); + } + delete p; // free the heap allocation +} + /** * @brief Asynchronously submits a kernel to the GPU queue for execution. * It also sets up a callback to notify when the kernel has finished executing @@ -1635,30 +2172,54 @@ Kernel createKernel(Context &ctx, const KernelCode &code, * @param[in] promise Promise to set when the kernel has finished executing * * @code - * dispatchKernel(ctx, kernel); + * std::future dispatchFuture = dispatchKernel(ctx, kernel); + * wait(ctx.instance, dispatchFuture); * @endcode */ -inline void dispatchKernel(Context &ctx, Kernel &kernel, - std::promise &promise) { +inline std::future dispatchKernelAsync(Context &ctx, Kernel &kernel) { + // If the kernel was used before, reset the command buffer. if (kernel->used) { resetCommandBuffer(ctx.device, kernel); } + + // Submit the command buffer and release it. 
wgpuQueueSubmit(ctx.queue, 1, &kernel->commandBuffer); wgpuCommandBufferRelease(kernel->commandBuffer); kernel->used = true; - WGPUQueueWorkDoneCallbackInfo workDoneCallbackInfo = { - .mode = WGPUCallbackMode_AllowSpontaneous, - .callback = - [](WGPUQueueWorkDoneStatus status, void *userdata1, void *userdata2) { - check(status == WGPUQueueWorkDoneStatus_Success, "Queue work done", - __FILE__, __LINE__); - auto *promise = static_cast *>(userdata1); - promise->set_value(); - }, - .userdata1 = &promise, - .userdata2 = nullptr}; + // Allocate a promise on the heap so it remains valid beyond this function’s + // scope. + std::promise *promise = new std::promise(); + std::future future = promise->get_future(); + + // Set up the callback info. + WGPUQueueWorkDoneCallbackInfo workDoneCallbackInfo = {}; + workDoneCallbackInfo.mode = WGPUCallbackMode_AllowSpontaneous; + workDoneCallbackInfo.callback = dispatchKernelCallback; + workDoneCallbackInfo.userdata1 = reinterpret_cast(promise); + workDoneCallbackInfo.userdata2 = nullptr; + + // IMPORTANT: Pass the address of the callback info structure. wgpuQueueOnSubmittedWorkDone(ctx.queue, workDoneCallbackInfo); + + return future; +} + +/** + * @brief Synchronous wrapper for dispatchKernelAsync. This function submits + * the kernel to the GPU queue and waits for it to finish executing. 
+ * + * @param[in] ctx Context instance to manage the kernel, from which the queue + * for the GPU is obtained + * @param[in] kernel Kernel instance to dispatch + * + * @code + * dispatchKernel(ctx, kernel); + * @endcode + */ +inline void dispatchKernel(Context &ctx, Kernel &kernel) { + auto future = dispatchKernelAsync(ctx, kernel); + wait(ctx, future); } } // namespace gpu diff --git a/numeric_types/half.cpp b/numeric_types/half.cpp index e5bdaf0..c183754 100644 --- a/numeric_types/half.cpp +++ b/numeric_types/half.cpp @@ -228,12 +228,9 @@ fn main( } Tensor input = createTensor(ctx, Shape{N}, kf16, inputArr.data()); Tensor output = createTensor(ctx, Shape{N}, kf16); - std::promise promise; - std::future future = promise.get_future(); Kernel op = createKernel(ctx, {kGelu, 256, kf16}, Bindings{input, output}, {cdiv(N, 256), 1, 1}); - dispatchKernel(ctx, op, promise); - wait(ctx, future); + dispatchKernel(ctx, op); toCPU(ctx, output, outputArr.data(), sizeof(outputArr)); for (int i = 0; i < 12; ++i) { printf(" gelu(%.2f) = %.2f\n", static_cast(inputArr[i]), @@ -241,7 +238,7 @@ fn main( } } -int main() { +int testHalfMain() { printf("\nHalf-precision float tests\n==========================\n"); printf("\nRegular values float round trips\n\n"); diff --git a/test/test_gpu.cpp b/test/test_gpu.cpp new file mode 100644 index 0000000..b855712 --- /dev/null +++ b/test/test_gpu.cpp @@ -0,0 +1,250 @@ +#include "gpu.hpp" +#include +#include +#include +#include +#include +#include +#include + +using namespace gpu; +using namespace std::chrono; + + +// Forward declarations: +void testToCPUWithTensor(); +void testToCPUWithBuffer(); +void testToCPUWithTensorSourceOffset(); +void testToCPUWithBufferSourceOffset(); +void stressTestToCPU(); + +int main() { + LOG(kDefLog, kInfo, "Running GPU integration tests..."); + testToCPUWithTensor(); + testToCPUWithBuffer(); + testToCPUWithTensorSourceOffset(); + testToCPUWithBufferSourceOffset(); + stressTestToCPU(); + LOG(kDefLog, kInfo, "All 
tests passed."); + return 0; +} + + +// A simple WGSL copy kernel that copies input to output. +static const char *kCopyKernel = R"( +@group(0) @binding(0) var inp: array<{{precision}}>; +@group(0) @binding(1) var out: array<{{precision}}>; +@group(0) @binding(1) var dummy: array<{{precision}}>; +@compute @workgroup_size({{workgroupSize}}) +fn main(@builtin(global_invocation_id) gid: vec3) { + let i: u32 = gid.x; + if (i < arrayLength(&inp)) { + out[i] = inp[i]; + } +} +)"; + + +// Test using the overload that takes a Tensor. +void testToCPUWithTensor() { + LOG(kDefLog, kInfo, "Running testToCPUWithTensor..."); + +// Create a real GPU context. +#ifdef USE_DAWN_API + Context ctx = createContextByGpuIdx(0); +#else + Context ctx = createContext(); +#endif + + constexpr size_t N = 1024; + std::array inputData, outputData; + for (size_t i = 0; i < N; ++i) { + inputData[i] = static_cast(i); + outputData[i] = 0.0f; + } + + // Create input and output tensors. + Tensor inputTensor = createTensor(ctx, Shape{N}, kf32, inputData.data()); + Tensor outputTensor = createTensor(ctx, Shape{N}, kf32); + + // Create and dispatch the copy kernel. + Kernel copyKernel = + createKernel(ctx, {kCopyKernel, 256, kf32}, + Bindings{inputTensor, outputTensor}, {cdiv(N, 256), 1, 1}); + dispatchKernel(ctx, copyKernel); + + // Synchronously copy GPU output to CPU using the tensor overload. + toCPU(ctx, outputTensor, outputData.data(), sizeof(outputData)); + + // Verify the output matches the input. + for (size_t i = 0; i < N; ++i) { + LOG(kDefLog, kInfo, "inputData[%zu] = %f", i, inputData[i]); + LOG(kDefLog, kInfo, "outputData[%zu] = %f", i, outputData[i]); + assert(outputData[i] == inputData[i]); + } + LOG(kDefLog, kInfo, "testToCPUWithTensor passed."); +} + +// Test using the overload that takes a raw GPU buffer. +// We reuse the Tensor's underlying buffer for this test. 
+void testToCPUWithBuffer() { + LOG(kDefLog, kInfo, "Running testToCPUWithBuffer..."); + +#ifdef USE_DAWN_API + Context ctx = createContextByGpuIdx(0); +#else + Context ctx = createContext(); +#endif + + constexpr size_t N = 1024; + std::array data, outputData; + for (size_t i = 0; i < N; ++i) { + data[i] = static_cast(i * 2); + outputData[i] = 0.0f; + } + + // Create a tensor to allocate a GPU buffer and initialize it. + Tensor tensor = createTensor(ctx, Shape{N}, kf32, data.data()); + + // Now extract the raw GPU buffer from the tensor. + WGPUBuffer gpuBuffer = tensor.data.buffer; + + // Use the WGPUBuffer overload. This call returns a future. + auto future = + toCPUAsync(ctx, gpuBuffer, outputData.data(), sizeof(outputData), 0); + wait(ctx, future); + + // Verify that the CPU output matches the original data. + for (size_t i = 0; i < N; ++i) { + LOG(kDefLog, kInfo, "outputData[%zu] = %f", i, outputData[i]); + assert(outputData[i] == data[i]); + } + LOG(kDefLog, kInfo, "testToCPUWithBuffer passed."); +} + +void testToCPUWithTensorSourceOffset() { + LOG(kDefLog, kInfo, "Running testToCPUWithTensorSourceOffset..."); +#ifdef USE_DAWN_API + Context ctx = createContextByGpuIdx(0); +#else + Context ctx = createContext(); +#endif + + constexpr size_t numElements = 25; + constexpr size_t sourceOffsetElements = 5; // Skip first 5 elements + constexpr size_t copyCount = 10; // Number of floats to copy + size_t copySize = copyCount * sizeof(float); + + // Create an input array with known data. + std::array inputData{}; + for (size_t i = 0; i < numElements; ++i) { + inputData[i] = static_cast(i + 50); // Arbitrary values + } + // Create a tensor from the full data. + Tensor tensor = createTensor(ctx, Shape{numElements}, kf32, inputData.data()); + + // Allocate a destination CPU buffer exactly as large as the data we want to + // copy. 
+ std::vector cpuOutput(copyCount, -1.0f); + + // Set sourceOffset to skip the first few float elements + size_t sourceOffsetBytes = sourceOffsetElements * sizeof(float); + // Call the tensor overload with sourceOffset and destOffset = 0. + auto future = + toCPUAsync(ctx, tensor, cpuOutput.data(), copySize, sourceOffsetBytes); + wait(ctx, future); + + // Verify the copied data matches the expected subset. + for (size_t i = 0; i < copyCount; ++i) { + float expected = inputData[sourceOffsetElements + i]; + float actual = cpuOutput[i]; + LOG(kDefLog, kInfo, "cpuOutput[%zu] = %f", i, actual); + LOG(kDefLog, kInfo, "expected[%zu] = %f", i, expected); + assert(expected == actual); + } + LOG(kDefLog, kInfo, "testToCPUWithTensorSourceOffset passed."); +} + +void testToCPUWithBufferSourceOffset() { + LOG(kDefLog, kInfo, "Running testToCPUWithBufferSourceOffset..."); +#ifdef USE_DAWN_API + Context ctx = createContextByGpuIdx(0); +#else + Context ctx = createContext(); +#endif + + constexpr size_t numElements = 30; + constexpr size_t sourceOffsetElements = 7; // Skip first 7 elements + constexpr size_t copyCount = 12; // Number of floats to copy + size_t copySize = copyCount * sizeof(float); + + // Create an input array with arbitrary data. + std::array inputData{}; + for (size_t i = 0; i < numElements; ++i) { + inputData[i] = static_cast(i + 100); + } + // Create a tensor to initialize a GPU buffer. + Tensor tensor = createTensor(ctx, Shape{numElements}, kf32, inputData.data()); + // Extract the raw GPU buffer from the tensor. + WGPUBuffer buffer = tensor.data.buffer; + + // Allocate a destination CPU buffer exactly as large as needed. + std::vector cpuOutput(copyCount, -2.0f); + size_t sourceOffsetBytes = sourceOffsetElements * sizeof(float); + + // Call the buffer overload with sourceOffset and destOffset = 0. 
+ auto future = + toCPUAsync(ctx, buffer, cpuOutput.data(), copySize, sourceOffsetBytes); + wait(ctx, future); + + // Verify that the copied data matches the expected subset. + for (size_t i = 0; i < copyCount; ++i) { + float expected = inputData[sourceOffsetElements + i]; + float actual = cpuOutput[i]; + LOG(kDefLog, kInfo, "cpuOutput[%zu] = %f", i, actual); + LOG(kDefLog, kInfo, "expected[%zu] = %f", i, expected); + assert(expected == actual); + } + LOG(kDefLog, kInfo, "testToCPUWithBufferSourceOffset passed."); +} + +void stressTestToCPU() { + LOG(kDefLog, kInfo, "Running stressTestToCPU for 2 seconds..."); + +#ifdef USE_DAWN_API + Context ctx = createContextByGpuIdx(0); +#else + Context ctx = createContext(); +#endif + + constexpr size_t N = 1024; + // Create a persistent tensor with some test data. + std::vector inputData(N, 0.0f); + for (size_t i = 0; i < N; ++i) { + inputData[i] = static_cast(i); + } + Tensor tensor = createTensor(ctx, Shape{N}, kf32, inputData.data()); + + // Prepare to run for one second. + auto startTime = high_resolution_clock::now(); + size_t opCount = 0; + while (high_resolution_clock::now() - startTime < seconds(2)) { + // Allocate an output buffer (using a shared_ptr so it stays valid until the future completes) + auto outputData = std::make_shared>(N, 0.0f); + // Use the tensor overload; we’re copying the entire tensor (destOffset = 0) + LOG(kDefLog, kInfo, "Copying %zu bytes from GPU to CPU...", N * sizeof(float)); + // log count + LOG(kDefLog, kInfo, "opCount = %zu", opCount); + auto fut = toCPUAsync(ctx, tensor, outputData->data(), N * sizeof(float), 0); + wait(ctx, fut); + ++opCount; + } + + auto endTime = high_resolution_clock::now(); + auto totalMs = duration_cast(endTime - startTime).count(); + double throughput = (opCount / (totalMs / 1000.0)); + + LOG(kDefLog, kInfo, "Stress test completed:\n" + " %zu GPU to CPU operations in %lld ms\n" + " Throughput: %.2f ops/sec", opCount, totalMs, throughput); +}