From b748399f1f567640577a1bfd46a38814bf5c22e9 Mon Sep 17 00:00:00 2001 From: MichealReed Date: Tue, 11 Feb 2025 01:43:13 -0600 Subject: [PATCH 01/30] simplify cmake, build dawn in repo, fix render for windows --- .gitignore | 3 + CMakeLists.txt | 29 +------ cmake/example.cmake | 27 +++---- cmake/gpu.cmake | 170 +++++++++++++++++++++++++--------------- cmake/webgpu.cmake | 61 -------------- examples/render/run.cpp | 11 +-- 6 files changed, 132 insertions(+), 169 deletions(-) delete mode 100644 cmake/webgpu.cmake diff --git a/.gitignore b/.gitignore index 1a8b5bc..c7f60c3 100644 --- a/.gitignore +++ b/.gitignore @@ -20,3 +20,6 @@ build .cache compile_commands.json +# editor specific +.vscode/* + diff --git a/CMakeLists.txt b/CMakeLists.txt index db89df7..a464b34 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,23 +1,11 @@ cmake_minimum_required(VERSION 3.28) project(gpu) -include("${CMAKE_CURRENT_SOURCE_DIR}/cmake/webgpu.cmake") - set(CMAKE_EXPORT_COMPILE_COMMANDS ON) # export compile_commands.json to use with # LSP -set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_STANDARD 20) set(CMAKE_CXX_STANDARD_REQUIRED ON) -option(USE_LOCAL_LIBS - "Use local libraries instead of fetching from the internet" OFF) - -# Ensure the build type is set -if(NOT CMAKE_BUILD_TYPE) - set(CMAKE_BUILD_TYPE - Release - CACHE STRING "Choose the type of build: Debug or Release" FORCE) -endif() - option(FASTBUILD "Option to enable fast builds" OFF) if(FASTBUILD) set(CMAKE_BUILD_TYPE None) # Avoid default flags of predefined build types @@ -30,21 +18,8 @@ if(DEBUG) set(CMAKE_CXX_FLAGS "-O0 -g") endif() -if(WIN64) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DWEBGPU_BACKEND_DAWN") -endif() - include("${CMAKE_CURRENT_SOURCE_DIR}/cmake/gpu.cmake") -message(STATUS "CMAKE_CURRENT_SOURCE_DIR: ${CMAKE_CURRENT_SOURCE_DIR}") -message( - STATUS - "Include directories for wgpu: ${CMAKE_CURRENT_SOURCE_DIR}/third_party/headers" -) - add_library(gpud SHARED gpu.hpp) set_target_properties(gpud PROPERTIES LINKER_LANGUAGE CXX) -target_link_libraries(gpud PRIVATE wgpu) -target_link_libraries(gpud PRIVATE webgpu) -target_link_libraries(gpud PRIVATE gpu) -install(TARGETS gpud) +target_link_libraries(gpud PRIVATE gpu) \ No newline at end of file diff --git a/cmake/example.cmake b/cmake/example.cmake index eba8e7c..41b15fd 100644 --- a/cmake/example.cmake +++ b/cmake/example.cmake @@ -1,17 +1,17 @@ set(CMAKE_EXPORT_COMPILE_COMMANDS ON) # export compile_commands.json to use with # LSP -set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_STANDARD 20) set(CMAKE_CXX_STANDARD_REQUIRED ON) get_filename_component(PROJECT_ROOT ${CMAKE_CURRENT_SOURCE_DIR} DIRECTORY) get_filename_component(PROJECT_ROOT ${PROJECT_ROOT} DIRECTORY) # Construct potential paths -set(FILEPATH_CURRENT_DIR "${CMAKE_CURRENT_SOURCE_DIR}/${FILENAME}") -set(FILEPATH_PROJECT_ROOT "${PROJECT_ROOT}/${FILENAME}") +set(FILEPATH_CURRENT_DIR "${DIRECTORY}/") +set(FILEPATH_PROJECT_ROOT "${PROJECT_ROOT}/") # Include file finding utility script -include("${CMAKE_CURRENT_SOURCE_DIR}/cmake/find_gpu.cmake") +include("${FILEPATH_PROJECT_ROOT}/cmake/find_gpu.cmake") # Check if the file exists in the current directory find_project_root(${CMAKE_CURRENT_SOURCE_DIR} ${FILENAME} @@ -49,20 +49,19 @@ endif() if(NOT TARGET gpu) message(STATUS "GPU_LIB not found") - include("${TARGET_FILE_PATH}/cmake/webgpu.cmake") include("${TARGET_FILE_PATH}/cmake/gpu.cmake") endif() - add_executable(${PROJECT_NAME} run.cpp) target_link_libraries(${PROJECT_NAME} PRIVATE gpu) -target_link_libraries(${PROJECT_NAME} PRIVATE wgpu) 
-target_link_libraries(${PROJECT_NAME} PRIVATE webgpu) +target_link_libraries(${PROJECT_NAME} PRIVATE ${WEBGPU_DAWN}) -if(WIN32) - # Ensure DLL is copied if on Windows +if(MSVC) +# Copy webgpu_dawn.dll to the build directory add_custom_command( - TARGET ${PROJECT_NAME} - POST_BUILD - COMMAND ${CMAKE_COMMAND} -E copy_if_different ${DLL_PATH} - $) + TARGET ${PROJECT_NAME} POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy + ${DAWN_INSTALL_PREFIX}/${CMAKE_BUILD_TYPE}/webgpu_dawn.dll + $ + ) endif() + diff --git a/cmake/gpu.cmake b/cmake/gpu.cmake index 08db244..15f3b43 100644 --- a/cmake/gpu.cmake +++ b/cmake/gpu.cmake @@ -1,69 +1,115 @@ -get_filename_component(PROJECT_ROOT ${CMAKE_CURRENT_SOURCE_DIR} DIRECTORY) -get_filename_component(PROJECT_ROOT ${PROJECT_ROOT} DIRECTORY) - -# Construct potential paths -set(FILEPATH_CURRENT_DIR "${CMAKE_CURRENT_SOURCE_DIR}/${FILENAME}") -set(FILEPATH_PROJECT_ROOT "${PROJECT_ROOT}/${FILENAME}") - -# Include file finding utility script -include("${CMAKE_CURRENT_SOURCE_DIR}/cmake/find_gpu.cmake") - -# Check if the file exists in the current directory -find_project_root(${CMAKE_CURRENT_SOURCE_DIR} ${FILENAME} TARGET_FILE_PATH) -if("${TARGET_FILE_PATH}" STREQUAL "") - find_project_root(${FILEPATH_CURRENT_DIR} ${FILENAME} TARGET_FILE_PATH) - if("${TARGET_FILE_PATH}" STREQUAL "") - message( - FATAL_ERROR - "File ${FILENAME} not found in either ${CMAKE_CURRENT_SOURCE_DIR} or ${CMAKE_CURRENT_SOURCE_DIR}/../../" - ) - endif() -endif() +set(FILENAME "gpu.hpp") -# Define architecture and build type directories or file names -if(CMAKE_SIZEOF_VOID_P EQUAL 8) - set(ARCH "x64") +if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/${FILENAME}") + set(FILEPATH_PROJECT_ROOT "${CMAKE_CURRENT_SOURCE_DIR}") else() - set(ARCH "x86") + get_filename_component(PROJECT_ROOT ${CMAKE_CURRENT_SOURCE_DIR} DIRECTORY) + get_filename_component(PROJECT_ROOT ${PROJECT_ROOT} DIRECTORY) + + set(FILEPATH_PROJECT_ROOT "${PROJECT_ROOT}/") endif() -if(CMAKE_BUILD_TYPE STREQUAL "Debug") - set(BUILD_TYPE "Debug") + +include(FetchContent) + +set(FETCHCONTENT_BASE_DIR "${FILEPATH_PROJECT_ROOT}/third_party/fetchcontent/_deps") +set(DAWN_INSTALL_PREFIX "${FETCHCONTENT_BASE_DIR}/dawn-build/out/${CMAKE_BUILD_TYPE}" CACHE INTERNAL "Dawn install location" FORCE) + + +# Before fetching, set configuration options for Dawn. +# These CMake variables are “global” (cached INTERNAL) so that Dawn’s own CMakeLists.txt +# will pick them up. Adjust them as needed. 
+set(DAWN_BUILD_TYPE ${CMAKE_BUILD_TYPE} CACHE INTERNAL "Dawn build type" FORCE) +set(DCMAKE_INSTALL_PREFIX ${DAWN_INSTALL_PREFIX} CACHE INTERNAL "Dawn install location" FORCE) + +# Dawn options +set(DAWN_FETCH_DEPENDENCIES ON CACHE INTERNAL "Fetch Dawn dependencies" FORCE) +set(DAWN_ENABLE_INSTALL ON CACHE INTERNAL "Enable Dawn installation" FORCE) +set(DAWN_BUILD_MONOLITHIC_LIBRARY OFF CACHE INTERNAL "Build Dawn monolithically" FORCE) +set(DAWN_BUILD_EXAMPLES OFF CACHE INTERNAL "Build Dawn examples" FORCE) +set(DAWN_BUILD_SAMPLES OFF CACHE INTERNAL "Build Dawn samples" FORCE) +set(DAWN_BUILD_TESTS OFF CACHE INTERNAL "Build Dawn tests" FORCE) +set(DAWN_BUILD_UTILS OFF CACHE INTERNAL "Build Dawn utilities" FORCE) +set(TINT_BUILD_TESTS OFF CACHE INTERNAL "Build Tint Tests" FORCE) +set(TINT_BUILD_IR_BINARY OFF CACHE INTERNAL "Build Tint IR binary" FORCE) +set(TINT_BUILD_CMD_TOOLS OFF CACHE INTERNAL "Build Tint command line tools" FORCE) +set(BUILD_SHARED_LIBS OFF CACHE INTERNAL "Build shared libraries" FORCE) + + +# Set up an install location for Dawn – you can change this to a specific location. + + +FetchContent_Declare( + dawn + DOWNLOAD_COMMAND + cd ${FETCHCONTENT_BASE_DIR}/dawn-src && + git init && + git fetch --depth=1 https://dawn.googlesource.com/dawn && + git reset --hard FETCH_HEAD +) + + +# This call will download the repository and add it as a subdirectory. +FetchContent_MakeAvailable(dawn) + + +# At this point, assuming Dawn’s CMakeLists.txt is written so that an install step is available, +# we trigger a build of its install target. This custom target will build (and install) Dawn +# into ${DAWN_INSTALL_PREFIX}. (If Dawn already adds an install target, you may simply depend on it.) +add_custom_target(build_dawn_config ALL + COMMAND ${CMAKE_COMMAND} ${FETCHCONTENT_BASE_DIR}/dawn-src + -B ${DAWN_INSTALL_PREFIX} + -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} + -DDAWN_FETCH_DEPENDENCIES=ON + -DDAWN_ENABLE_INSTALL=ON + -DDAWN_BUILD_MONOLITHIC_LIBRARY=OFF + -DDAWN_BUILD_EXAMPLES=OFF + -DDAWN_BUILD_SAMPLES=OFF + -DDAWN_BUILD_TESTS=OFF + -DDAWN_BUILD_UTILS=OFF + -DTINT_BUILD_TESTS=OFF + -DTINT_BUILD_IR_BINARY=OFF + -DTINT_BUILD_CMD_TOOLS=OFF + -DBUILD_SHARED_LIBS=OFF + -G "${CMAKE_GENERATOR}" + COMMENT "Configuring Dawn build with custom options in ${DAWN_INSTALL_PREFIX}" +) + +add_custom_target(build_dawn_install ALL + COMMAND ${CMAKE_COMMAND} --build ${DAWN_INSTALL_PREFIX} --target install + COMMENT "Installing Dawn into ${DAWN_INSTALL_PREFIX}" +) + +include(${FETCHCONTENT_BASE_DIR}/dawn-build/cmake/DawnTargets.cmake) + +set(GPU_SOURCES + "${FILEPATH_PROJECT_ROOT}/gpu.cpp" + "${FILEPATH_PROJECT_ROOT}/numeric_types/half.cpp" +) + +set(GPU_HEADERS + "${FILEPATH_PROJECT_ROOT}/gpu.hpp" + "${FILEPATH_PROJECT_ROOT}/utils/logging.hpp" + "${FILEPATH_PROJECT_ROOT}/utils/array_utils.hpp" + "${FILEPATH_PROJECT_ROOT}/numeric_types/half.hpp" +) + +if(EMSCRIPTEN) + file(REMOVE "${FILEPATH_PROJECT_ROOT}/webgpu/webgpu.h") else() - set(BUILD_TYPE "Release") + list(APPEND GPU_HEADERS "${DAWN_INSTALL_PREFIX}/gen/webgpu-headers/webgpu.h") endif() -add_library(webgpulib SHARED IMPORTED) -add_library(gpu INTERFACE) -add_library(wgpu INTERFACE) -add_dependencies(gpu webgpulib) -# Define the header-only library -target_include_directories(gpu INTERFACE ${TARGET_FILE_PATH}) - -# Add headers webgpu.h -target_include_directories(wgpu - INTERFACE ${TARGET_FILE_PATH}/third_party/headers) -include(ExternalProject) - -set(DAWN_EXT_PREFIX "${TARGET_FILE_PATH}/third_party/local/dawn") - -ExternalProject_Add( - 
dawn_project - PREFIX ${DAWN_EXT_PREFIX} - GIT_REPOSITORY "https://dawn.googlesource.com/dawn" - GIT_TAG "main" - SOURCE_DIR "${DAWN_EXT_PREFIX}/source" - BINARY_DIR "${DAWN_EXT_PREFIX}/build" - INSTALL_DIR "${DAWN_EXT_PREFIX}/install" - GIT_SUBMODULES "" - # setting cmake args doesn't work and I don't know why - CONFIGURE_COMMAND - ${CMAKE_COMMAND} -S ${DAWN_EXT_PREFIX}/source -B - ${DAWN_EXT_PREFIX}/build -DDAWN_FETCH_DEPENDENCIES=ON - -DDAWN_ENABLE_INSTALL=ON -DDAWN_BUILD_MONOLITHIC_LIBRARY=ON - -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} -G ${CMAKE_GENERATOR} - INSTALL_COMMAND ${CMAKE_COMMAND} --install . --prefix - ${DAWN_EXT_PREFIX}/install - LOG_INSTALL ON) -find_library(LIBDAWN dawn PATHS "${DAWN_EXT_PREFIX}/install/lib") -target_link_libraries(webgpulib INTERFACE ${LIBDAWN}) + +# Create the INTERFACE library ‘gpu’ +add_library(gpu STATIC ${GPU_SOURCES} ${GPU_HEADERS}) +target_include_directories(gpu PUBLIC "${FILEPATH_PROJECT_ROOT}") +target_include_directories(gpu PUBLIC "${FILEPATH_PROJECT_ROOT}/third_party/headers") + +# Ensure that the gpu target is built only after Dawn has been installed. +add_dependencies(gpu build_dawn_install) + +find_library(WEBGPU_DAWN + NAMES webgpu_dawn + HINTS "${DAWN_INSTALL_PREFIX}/src/dawn/native/Debug/" +) \ No newline at end of file diff --git a/cmake/webgpu.cmake b/cmake/webgpu.cmake deleted file mode 100644 index c63f1e2..0000000 --- a/cmake/webgpu.cmake +++ /dev/null @@ -1,61 +0,0 @@ -# Specify the filename to search for -set(FILENAME "gpu.hpp") - -get_filename_component(PROJECT_ROOT ${CMAKE_CURRENT_SOURCE_DIR} DIRECTORY) -get_filename_component(PROJECT_ROOT ${PROJECT_ROOT} DIRECTORY) - -# Construct potential paths -set(FILEPATH_CURRENT_DIR "${CMAKE_CURRENT_SOURCE_DIR}/${FILENAME}") -set(FILEPATH_PROJECT_ROOT "${PROJECT_ROOT}/${FILENAME}") - -# Include file finding utility script -include("${CMAKE_CURRENT_SOURCE_DIR}/cmake/find_gpu.cmake") - -# Check if the file exists in the current directory -find_project_root(${CMAKE_CURRENT_SOURCE_DIR} ${FILENAME} TARGET_FILE_PATH) -if("${TARGET_FILE_PATH}" STREQUAL "") - find_project_root(${FILEPATH_CURRENT_DIR} ${FILENAME} TARGET_FILE_PATH) - if("${TARGET_FILE_PATH}" STREQUAL "") - message( - FATAL_ERROR - "File ${FILENAME} not found in either ${CMAKE_CURRENT_SOURCE_DIR} or ${CMAKE_CURRENT_SOURCE_DIR}/../../" - ) - endif() -endif() - -include(FetchContent) - -set(FETCHCONTENT_BASE_DIR "${TARGET_FILE_PATH}/third_party/fetchcontent") -set(WEBGPU_DIST_LOCAL_PATH - "${TARGET_FILE_PATH}/third_party/local/WebGPU-distribution") - -if(USE_LOCAL_LIBS) - set(WEBGPU_DIST_GIT_REPO ${WEBGPU_DIST_LOCAL_PATH}) - message(STATUS "Using local WebGPU distribution: ${WEBGPU_DIST_LOCAL_PATH}") -else() - set(WEBGPU_DIST_GIT_REPO - "https://github.com/eliemichel/WebGPU-distribution") -endif() - -option(WEBGPU_TAG "WebGPU distribution tag to use") -if(NOT WEBGPU_TAG) - set(WEBGPU_TAG "dawn") -endif() -message(STATUS "Using WebGPU distribution tag: ${WEBGPU_TAG}") - -if(WEBGPU_TAG STREQUAL "dawn") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DWEBGPU_BACKEND_DAWN") - # use specific commit set(WEBGPU_TAG - # "1025b977e1927b6d0327e67352f90feb4bcf8274") set(WEBGPU_TAG - # "acf972b7b909f52e183bdae3971b93bb13d4a29e") - # add_compile_options(-UABSL_INTERNAL_AT_LEAST_CXX20) set(CMAKE_CXX_FLAGS - # "${CMAKE_CXX_FLAGS} -UABSL_INTERNAL_AT_LEAST_CXX20") - message(STATUS "Using Dawn backend") -endif() - -FetchContent_Declare( - webgpu - GIT_REPOSITORY ${WEBGPU_DIST_GIT_REPO} - GIT_TAG ${WEBGPU_TAG} - GIT_SHALLOW TRUE) 
-FetchContent_MakeAvailable(webgpu) diff --git a/examples/render/run.cpp b/examples/render/run.cpp index f2c6bec..f9a90f9 100644 --- a/examples/render/run.cpp +++ b/examples/render/run.cpp @@ -149,11 +149,12 @@ int main(int argc, char **argv) { std::array raster; for (size_t i = 0; i < screen.size(); ++i) { - size_t index = - std::min(sizeof(intensity) - 2, - std::max(0ul, static_cast(screen[i] * - (sizeof(intensity) - 2)))); - raster[i] = intensity[index]; + // Convert all values to size_t to ensure proper type matching + const size_t intensity_max = sizeof(intensity) - 2; + const size_t scaled_value = static_cast(screen[i] * intensity_max); + size_t index = std::min(intensity_max, + std::max(static_cast(0), scaled_value)); + raster[i] = intensity[index]; } char buffer[(NROWS + 2) * (NCOLS + 2)]; From bbc3addc4a8fb5ed7bf3c9ecf525a2c91f70ff6a Mon Sep 17 00:00:00 2001 From: MichealReed Date: Tue, 11 Feb 2025 13:28:05 -0600 Subject: [PATCH 02/30] More simplification --- CMakeLists.txt | 2 + cmake/example.cmake | 68 ++++++++++-------------------- cmake/find_gpu.cmake | 30 -------------- cmake/gpu.cmake | 99 ++++++++++++++++++++------------------------ 4 files changed, 70 insertions(+), 129 deletions(-) delete mode 100644 cmake/find_gpu.cmake diff --git a/CMakeLists.txt b/CMakeLists.txt index a464b34..ca735a9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,3 +1,5 @@ +# This only builds a shared lib, see cmake/example.cmake +# and cmake/gpu.cmake for more details cmake_minimum_required(VERSION 3.28) project(gpu) diff --git a/cmake/example.cmake b/cmake/example.cmake index 41b15fd..d92c204 100644 --- a/cmake/example.cmake +++ b/cmake/example.cmake @@ -1,32 +1,20 @@ -set(CMAKE_EXPORT_COMPILE_COMMANDS ON) # export compile_commands.json to use with - # LSP -set(CMAKE_CXX_STANDARD 20) -set(CMAKE_CXX_STANDARD_REQUIRED ON) +# Getting Started with CMAKE +# Each example includes this and sets PROJECT_NAME +# cd examples/hello_world +# cmake -S . 
build/ -DCMAKE_BUILD_TYPE=Release +# cmake --build build/ --config Release +# ./build/hello_world + +if(NOT MSVC) + set(CMAKE_CXX_STANDARD 17) +else() + set(CMAKE_CXX_STANDARD 20) +endif() +# Path finding logic to find our root recipes from nested folders get_filename_component(PROJECT_ROOT ${CMAKE_CURRENT_SOURCE_DIR} DIRECTORY) get_filename_component(PROJECT_ROOT ${PROJECT_ROOT} DIRECTORY) -# Construct potential paths -set(FILEPATH_CURRENT_DIR "${DIRECTORY}/") -set(FILEPATH_PROJECT_ROOT "${PROJECT_ROOT}/") - -# Include file finding utility script -include("${FILEPATH_PROJECT_ROOT}/cmake/find_gpu.cmake") - -# Check if the file exists in the current directory -find_project_root(${CMAKE_CURRENT_SOURCE_DIR} ${FILENAME} - TARGET_FILE_PATH) -if("${TARGET_FILE_PATH}" STREQUAL "") - find_project_root(${FILEPATH_CURRENT_DIR} ${FILENAME} - TARGET_FILE_PATH) - if("${TARGET_FILE_PATH}" STREQUAL "") - message( - FATAL_ERROR - "File ${FILENAME} not found in either ${CMAKE_CURRENT_SOURCE_DIR} or ${CMAKE_CURRENT_SOURCE_DIR}/../../" - ) - endif() -endif() - # Ensure the build type is set if(NOT CMAKE_BUILD_TYPE) set(CMAKE_BUILD_TYPE @@ -34,34 +22,24 @@ if(NOT CMAKE_BUILD_TYPE) CACHE STRING "Choose the type of build: Debug or Release" FORCE) endif() -# Define architecture and build type directories or file names -if(CMAKE_SIZEOF_VOID_P EQUAL 8) - set(ARCH "x64") -else() - set(ARCH "x86") -endif() - -if(CMAKE_BUILD_TYPE STREQUAL "Debug") - set(BUILD_TYPE "Debug") -else() - set(BUILD_TYPE "Release") -endif() +# Include the gpu.cpp + Dawn library +include("${PROJECT_ROOT}/cmake/gpu.cmake") -if(NOT TARGET gpu) - message(STATUS "GPU_LIB not found") - include("${TARGET_FILE_PATH}/cmake/gpu.cmake") -endif() +# Create the executable add_executable(${PROJECT_NAME} run.cpp) + +# Link gpu + dawn library target_link_libraries(${PROJECT_NAME} PRIVATE gpu) -target_link_libraries(${PROJECT_NAME} PRIVATE ${WEBGPU_DAWN}) +# Certain platforms need to copy the library files to the build directory if(MSVC) -# Copy webgpu_dawn.dll to the build directory + # Copy webgpu_dawn.dll to the build directory + # CMake multigenerators like MSVC need --config Release on + # the cmake --build command or they will output to /Debug add_custom_command( TARGET ${PROJECT_NAME} POST_BUILD COMMAND ${CMAKE_COMMAND} -E copy ${DAWN_INSTALL_PREFIX}/${CMAKE_BUILD_TYPE}/webgpu_dawn.dll - $ - ) + $) endif() diff --git a/cmake/find_gpu.cmake b/cmake/find_gpu.cmake deleted file mode 100644 index b6b7dad..0000000 --- a/cmake/find_gpu.cmake +++ /dev/null @@ -1,30 +0,0 @@ -# file name to find -set(FILENAME "gpu.hpp") - -# Function to check for file existence up the directory hierarchy -function(find_project_root current_dir filename result_var) - set(found FALSE) # Flag to indicate if the file is found - set(current_check_dir "${current_dir}") # Start from the given directory - # using 1 is jsut to supress the cmane-format warning - foreach(i RANGE 0 2 1) - set(filepath "${current_check_dir}/${filename}") - - if(EXISTS "${filepath}") - set(${result_var} - "${current_check_dir}" - PARENT_SCOPE) - set(found TRUE) - break() - endif() - - # Move one level up - get_filename_component(current_check_dir "${current_check_dir}" - DIRECTORY) - endforeach() - - if(NOT found) - set(${result_var} - "" - PARENT_SCOPE) # Set to empty if not found - endif() -endfunction() diff --git a/cmake/gpu.cmake b/cmake/gpu.cmake index 15f3b43..c8f011a 100644 --- a/cmake/gpu.cmake +++ b/cmake/gpu.cmake @@ -1,44 +1,46 @@ set(FILENAME "gpu.hpp") +# Setup project root here. 
if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/${FILENAME}") - set(FILEPATH_PROJECT_ROOT "${CMAKE_CURRENT_SOURCE_DIR}") + set(PROJECT_ROOT "${CMAKE_CURRENT_SOURCE_DIR}") else() get_filename_component(PROJECT_ROOT ${CMAKE_CURRENT_SOURCE_DIR} DIRECTORY) get_filename_component(PROJECT_ROOT ${PROJECT_ROOT} DIRECTORY) - - set(FILEPATH_PROJECT_ROOT "${PROJECT_ROOT}/") + + set(PROJECT_ROOT "${PROJECT_ROOT}/") endif() +message(STATUS "PROJECT_ROOT: ${PROJECT_ROOT}") + include(FetchContent) -set(FETCHCONTENT_BASE_DIR "${FILEPATH_PROJECT_ROOT}/third_party/fetchcontent/_deps") +set(FETCHCONTENT_BASE_DIR "${PROJECT_ROOT}/third_party/fetchcontent/_deps") set(DAWN_INSTALL_PREFIX "${FETCHCONTENT_BASE_DIR}/dawn-build/out/${CMAKE_BUILD_TYPE}" CACHE INTERNAL "Dawn install location" FORCE) # Before fetching, set configuration options for Dawn. -# These CMake variables are “global” (cached INTERNAL) so that Dawn’s own CMakeLists.txt -# will pick them up. Adjust them as needed. -set(DAWN_BUILD_TYPE ${CMAKE_BUILD_TYPE} CACHE INTERNAL "Dawn build type" FORCE) set(DCMAKE_INSTALL_PREFIX ${DAWN_INSTALL_PREFIX} CACHE INTERNAL "Dawn install location" FORCE) +set(CMAKE_CONFIGURATION_TYPES ${CMAKE_BUILD_TYPE} CACHE INTERNAL "Dawn configuration types" FORCE) -# Dawn options -set(DAWN_FETCH_DEPENDENCIES ON CACHE INTERNAL "Fetch Dawn dependencies" FORCE) -set(DAWN_ENABLE_INSTALL ON CACHE INTERNAL "Enable Dawn installation" FORCE) -set(DAWN_BUILD_MONOLITHIC_LIBRARY OFF CACHE INTERNAL "Build Dawn monolithically" FORCE) +# Dawn options for more, +# see https://dawn.googlesource.com/dawn/+/refs/heads/main/CMakeLists.txt +set(DAWN_ALWAYS_ASSERT OFF CACHE INTERNAL "Always assert in Dawn" FORCE) +set(DAWN_BUILD_MONOLITHIC_LIBRARY ON CACHE INTERNAL "Build Dawn monolithically" FORCE) set(DAWN_BUILD_EXAMPLES OFF CACHE INTERNAL "Build Dawn examples" FORCE) set(DAWN_BUILD_SAMPLES OFF CACHE INTERNAL "Build Dawn samples" FORCE) set(DAWN_BUILD_TESTS OFF CACHE INTERNAL "Build Dawn tests" FORCE) -set(DAWN_BUILD_UTILS OFF CACHE INTERNAL "Build Dawn utilities" FORCE) +set(DAWN_ENABLE_INSTALL ON CACHE INTERNAL "Enable Dawn installation" FORCE) +set(DAWN_FETCH_DEPENDENCIES ON CACHE INTERNAL "Fetch Dawn dependencies" FORCE) + set(TINT_BUILD_TESTS OFF CACHE INTERNAL "Build Tint Tests" FORCE) set(TINT_BUILD_IR_BINARY OFF CACHE INTERNAL "Build Tint IR binary" FORCE) set(TINT_BUILD_CMD_TOOLS OFF CACHE INTERNAL "Build Tint command line tools" FORCE) -set(BUILD_SHARED_LIBS OFF CACHE INTERNAL "Build shared libraries" FORCE) - -# Set up an install location for Dawn – you can change this to a specific location. +set(BUILD_SHARED_LIBS OFF CACHE INTERNAL "Build shared libraries" FORCE) +# Fetch Setup FetchContent_Declare( dawn DOWNLOAD_COMMAND @@ -49,67 +51,56 @@ FetchContent_Declare( ) -# This call will download the repository and add it as a subdirectory. +# Download the repository and add it as a subdirectory. FetchContent_MakeAvailable(dawn) -# At this point, assuming Dawn’s CMakeLists.txt is written so that an install step is available, -# we trigger a build of its install target. This custom target will build (and install) Dawn -# into ${DAWN_INSTALL_PREFIX}. (If Dawn already adds an install target, you may simply depend on it.) -add_custom_target(build_dawn_config ALL +# Since we require Dawn to be built before linking against it, we need to configure it now. 
+execute_process( COMMAND ${CMAKE_COMMAND} ${FETCHCONTENT_BASE_DIR}/dawn-src -B ${DAWN_INSTALL_PREFIX} -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} - -DDAWN_FETCH_DEPENDENCIES=ON - -DDAWN_ENABLE_INSTALL=ON - -DDAWN_BUILD_MONOLITHIC_LIBRARY=OFF - -DDAWN_BUILD_EXAMPLES=OFF - -DDAWN_BUILD_SAMPLES=OFF - -DDAWN_BUILD_TESTS=OFF - -DDAWN_BUILD_UTILS=OFF - -DTINT_BUILD_TESTS=OFF - -DTINT_BUILD_IR_BINARY=OFF - -DTINT_BUILD_CMD_TOOLS=OFF - -DBUILD_SHARED_LIBS=OFF -G "${CMAKE_GENERATOR}" - COMMENT "Configuring Dawn build with custom options in ${DAWN_INSTALL_PREFIX}" ) -add_custom_target(build_dawn_install ALL - COMMAND ${CMAKE_COMMAND} --build ${DAWN_INSTALL_PREFIX} --target install - COMMENT "Installing Dawn into ${DAWN_INSTALL_PREFIX}" +# Build Dawn +execute_process( + WORKING_DIRECTORY ${FETCHCONTENT_BASE_DIR}/dawn-src + COMMAND ${CMAKE_COMMAND} --build ${DAWN_INSTALL_PREFIX} --config ${CMAKE_BUILD_TYPE} ) -include(${FETCHCONTENT_BASE_DIR}/dawn-build/cmake/DawnTargets.cmake) - +# Add sources set(GPU_SOURCES - "${FILEPATH_PROJECT_ROOT}/gpu.cpp" - "${FILEPATH_PROJECT_ROOT}/numeric_types/half.cpp" + "${PROJECT_ROOT}/gpu.cpp" + "${PROJECT_ROOT}/numeric_types/half.cpp" ) +# Add headers set(GPU_HEADERS - "${FILEPATH_PROJECT_ROOT}/gpu.hpp" - "${FILEPATH_PROJECT_ROOT}/utils/logging.hpp" - "${FILEPATH_PROJECT_ROOT}/utils/array_utils.hpp" - "${FILEPATH_PROJECT_ROOT}/numeric_types/half.hpp" + "${PROJECT_ROOT}/gpu.hpp" + "${PROJECT_ROOT}/utils/logging.hpp" + "${PROJECT_ROOT}/utils/array_utils.hpp" + "${PROJECT_ROOT}/numeric_types/half.hpp" ) +# Emscripten includes a header automatically if(EMSCRIPTEN) - file(REMOVE "${FILEPATH_PROJECT_ROOT}/webgpu/webgpu.h") + file(REMOVE "${PROJECT_ROOT}/webgpu/webgpu.h") else() - list(APPEND GPU_HEADERS "${DAWN_INSTALL_PREFIX}/gen/webgpu-headers/webgpu.h") + list(APPEND GPU_HEADERS "${PROJECT_ROOT}/third_party/headers/webgpu/webgpu.h") endif() -# Create the INTERFACE library ‘gpu’ +# Create the STATIC library for gpu add_library(gpu STATIC ${GPU_SOURCES} ${GPU_HEADERS}) -target_include_directories(gpu PUBLIC "${FILEPATH_PROJECT_ROOT}") -target_include_directories(gpu PUBLIC "${FILEPATH_PROJECT_ROOT}/third_party/headers") +target_include_directories(gpu PUBLIC "${PROJECT_ROOT}") +target_include_directories(gpu PUBLIC "${PROJECT_ROOT}/third_party/headers") -# Ensure that the gpu target is built only after Dawn has been installed. 
-add_dependencies(gpu build_dawn_install) - -find_library(WEBGPU_DAWN +# Find the monolithic library for Dawn +find_library(WEBGPU_DAWN_MONOLITHIC NAMES webgpu_dawn - HINTS "${DAWN_INSTALL_PREFIX}/src/dawn/native/Debug/" -) \ No newline at end of file + HINTS "${DAWN_INSTALL_PREFIX}/src/dawn/native/${CMAKE_BUILD_TYPE}" +) + +# Link the monolithic library +target_link_libraries(gpu PRIVATE ${WEBGPU_DAWN_MONOLITHIC}) From 2360ba9af7432c33a2703eb2a88706d4150387dc Mon Sep 17 00:00:00 2001 From: MichealReed Date: Tue, 11 Feb 2025 13:44:09 -0600 Subject: [PATCH 03/30] cleanup --- CMakeLists.txt | 2 +- cmake/gpu.cmake | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index ca735a9..e8e569a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -24,4 +24,4 @@ include("${CMAKE_CURRENT_SOURCE_DIR}/cmake/gpu.cmake") add_library(gpud SHARED gpu.hpp) set_target_properties(gpud PROPERTIES LINKER_LANGUAGE CXX) -target_link_libraries(gpud PRIVATE gpu) \ No newline at end of file +target_link_libraries(gpud PRIVATE gpu) diff --git a/cmake/gpu.cmake b/cmake/gpu.cmake index c8f011a..11d6c67 100644 --- a/cmake/gpu.cmake +++ b/cmake/gpu.cmake @@ -41,6 +41,8 @@ set(BUILD_SHARED_LIBS OFF CACHE INTERNAL "Build shared libraries" FORCE) # Fetch Setup +# Add a commit hash to pin the version of Dawn. +# git fetch --depth=1 url FetchContent_Declare( dawn DOWNLOAD_COMMAND From 30f7594896ecd9c4b7616bd9ad0d03598f0b4939 Mon Sep 17 00:00:00 2001 From: MichealReed Date: Tue, 11 Feb 2025 15:59:24 -0600 Subject: [PATCH 04/30] build path for msvc find library --- cmake/gpu.cmake | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/cmake/gpu.cmake b/cmake/gpu.cmake index 11d6c67..1767a50 100644 --- a/cmake/gpu.cmake +++ b/cmake/gpu.cmake @@ -6,7 +6,6 @@ if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/${FILENAME}") else() get_filename_component(PROJECT_ROOT ${CMAKE_CURRENT_SOURCE_DIR} DIRECTORY) get_filename_component(PROJECT_ROOT ${PROJECT_ROOT} DIRECTORY) - set(PROJECT_ROOT "${PROJECT_ROOT}/") endif() @@ -21,7 +20,6 @@ set(DAWN_INSTALL_PREFIX "${FETCHCONTENT_BASE_DIR}/dawn-build/out/${CMAKE_BUILD_T # Before fetching, set configuration options for Dawn. 
set(DCMAKE_INSTALL_PREFIX ${DAWN_INSTALL_PREFIX} CACHE INTERNAL "Dawn install location" FORCE) -set(CMAKE_CONFIGURATION_TYPES ${CMAKE_BUILD_TYPE} CACHE INTERNAL "Dawn configuration types" FORCE) # Dawn options for more, # see https://dawn.googlesource.com/dawn/+/refs/heads/main/CMakeLists.txt @@ -98,11 +96,18 @@ add_library(gpu STATIC ${GPU_SOURCES} ${GPU_HEADERS}) target_include_directories(gpu PUBLIC "${PROJECT_ROOT}") target_include_directories(gpu PUBLIC "${PROJECT_ROOT}/third_party/headers") -# Find the monolithic library for Dawn -find_library(WEBGPU_DAWN_MONOLITHIC +# find_library, windows adds extra folder +if(MSVC) + find_library(WEBGPU_DAWN_MONOLITHIC NAMES webgpu_dawn - HINTS "${DAWN_INSTALL_PREFIX}/src/dawn/native/${CMAKE_BUILD_TYPE}" -) + PATHS "${DAWN_INSTALL_PREFIX}/src/dawn/native/${CMAKE_BUILD_TYPE}" + ) +else() + find_library(WEBGPU_DAWN_MONOLITHIC + NAMES webgpu_dawn + PATHS "${DAWN_INSTALL_PREFIX}/src/dawn/native" + ) +endif() # Link the monolithic library target_link_libraries(gpu PRIVATE ${WEBGPU_DAWN_MONOLITHIC}) From 82ff79d1b2853e29f9e5de81c93bef9670535b4d Mon Sep 17 00:00:00 2001 From: MichealReed Date: Tue, 11 Feb 2025 16:13:29 -0600 Subject: [PATCH 05/30] require the libs so we fail early --- cmake/gpu.cmake | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cmake/gpu.cmake b/cmake/gpu.cmake index 1767a50..b687e83 100644 --- a/cmake/gpu.cmake +++ b/cmake/gpu.cmake @@ -101,11 +101,13 @@ if(MSVC) find_library(WEBGPU_DAWN_MONOLITHIC NAMES webgpu_dawn PATHS "${DAWN_INSTALL_PREFIX}/src/dawn/native/${CMAKE_BUILD_TYPE}" + REQUIRED ) else() find_library(WEBGPU_DAWN_MONOLITHIC NAMES webgpu_dawn PATHS "${DAWN_INSTALL_PREFIX}/src/dawn/native" + REQUIRED ) endif() From ebd5bcf1b7f3dc7d9f888039adba36cbf6e39b4f Mon Sep 17 00:00:00 2001 From: MichealReed Date: Tue, 11 Feb 2025 17:22:24 -0600 Subject: [PATCH 06/30] use hints for MSVC --- cmake/gpu.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/gpu.cmake b/cmake/gpu.cmake index b687e83..52a348b 100644 --- a/cmake/gpu.cmake +++ b/cmake/gpu.cmake @@ -100,7 +100,7 @@ target_include_directories(gpu PUBLIC "${PROJECT_ROOT}/third_party/headers") if(MSVC) find_library(WEBGPU_DAWN_MONOLITHIC NAMES webgpu_dawn - PATHS "${DAWN_INSTALL_PREFIX}/src/dawn/native/${CMAKE_BUILD_TYPE}" + HINTS "${DAWN_INSTALL_PREFIX}/src/dawn/native/${CMAKE_BUILD_TYPE}" REQUIRED ) else() From d1c0b81a529f49c9bddb2e27021ce3624824c32e Mon Sep 17 00:00:00 2001 From: MichealReed Date: Sun, 16 Feb 2025 18:24:49 -0600 Subject: [PATCH 07/30] adds emscripten support --- .gitignore | 2 + CMakeLists.txt | 1 + cmake/dawn.cmake | 149 ++++++++++++++++++++++++++++++ cmake/example.cmake | 94 ++++++++++++++----- cmake/gpu.cmake | 89 ++---------------- cmake/templates/index.html.in | 22 +++++ examples/shadertui/CMakeLists.txt | 1 + gpu.hpp | 6 +- numeric_types/half.cpp | 2 +- 9 files changed, 260 insertions(+), 106 deletions(-) create mode 100644 cmake/dawn.cmake create mode 100644 cmake/templates/index.html.in diff --git a/.gitignore b/.gitignore index c7f60c3..4dc9cf7 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,7 @@ build/* # any build subdirectory in the tree **/build/ +**/build_web/ examples/hello_gpu/build/* examples/raymarch/build/* docs/html @@ -8,6 +9,7 @@ source .DS_Store third_party/lib/* third_party/local/* +third_party/dawn/* # formatter files .cmake-format.py diff --git a/CMakeLists.txt b/CMakeLists.txt index e8e569a..816cdf3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -20,6 +20,7 @@ if(DEBUG) set(CMAKE_CXX_FLAGS "-O0 
-g") endif() +include("${CMAKE_CURRENT_SOURCE_DIR}/cmake/dawn.cmake") include("${CMAKE_CURRENT_SOURCE_DIR}/cmake/gpu.cmake") add_library(gpud SHARED gpu.hpp) diff --git a/cmake/dawn.cmake b/cmake/dawn.cmake new file mode 100644 index 0000000..f7ab748 --- /dev/null +++ b/cmake/dawn.cmake @@ -0,0 +1,149 @@ +# Setup directories +set(FETCHCONTENT_BASE_DIR "${PROJECT_ROOT}/third_party") +set(DAWN_DIR "${FETCHCONTENT_BASE_DIR}/dawn" CACHE INTERNAL "") +set(DAWN_BUILD_DIR "${DAWN_DIR}/build" CACHE INTERNAL "") + +if(EMSCRIPTEN) + set(EM_SDK_DIR $ENV{EMSDK} CACHE INTERNAL "") + set(DAWN_BUILD_DIR "${DAWN_DIR}/build_web" CACHE INTERNAL "") +endif() + +function(find_dawn_library) + if(MSVC) + find_library(WEBGPU_DAWN_DEBUG webgpu_dawn + NAMES webgpu_dawn + HINTS "${DAWN_BUILD_DIR}/src/dawn/native/Debug" + ) + find_library(WEBGPU_DAWN_RELEASE webgpu_dawn + NAMES webgpu_dawn + HINTS "${DAWN_BUILD_DIR}/src/dawn/native/Release" + ) + elseif(NOT EMSCRIPTEN AND NOT MSVC) + find_library(WEBGPU_DAWN_LIB + NAMES webgpu_dawn + PATHS "${DAWN_BUILD_DIR}/src/dawn/native" + REQUIRED + ) + endif() + + # Set result variables in parent scope + set(DAWN_BUILD_FOUND ON PARENT_SCOPE) + if(MSVC) + set(WEBGPU_DAWN_DEBUG ${WEBGPU_DAWN_DEBUG} PARENT_SCOPE) + set(WEBGPU_DAWN_RELEASE ${WEBGPU_DAWN_RELEASE} PARENT_SCOPE) + else() + set(WEBGPU_DAWN_LIB ${WEBGPU_DAWN_LIB} PARENT_SCOPE) + endif() +endfunction() + +# Enable find for no dawn rebuilds with flutter run +set(ENABLE_DAWN_FIND OFF CACHE BOOL "Enable finding Dawn" FORCE) +set(DAWN_BUILD_FOUND OFF CACHE BOOL "Dawn build found" FORCE) +if(ENABLE_DAWN_FIND) + # find_library, windows adds extra folder + if(MSVC) + find_library(WEBGPU_DAWN_DEBUG webgpu_dawn + NAMES webgpu_dawn + HINTS "${DAWN_BUILD_DIR}/src/dawn/native/Debug" + ) + find_library(WEBGPU_DAWN_RELEASE webgpu_dawn + NAMES webgpu_dawn + HINTS "${DAWN_BUILD_DIR}/src/dawn/native/Release" + ) + set(DAWN_BUILD_FOUND ON) + elseif(NOT EMSCRIPTEN AND NOT MSVC) + find_library(WEBGPU_DAWN_LIB + NAMES webgpu_dawn + PATHS "${DAWN_BUILD_DIR}/src/dawn/native" + REQUIRED + ) + set(DAWN_BUILD_FOUND ON) + else() + set(DAWN_BUILD_FOUND ON) + endif() +endif() + +# Dawn options for more, +# see https://dawn.googlesource.com/dawn/+/refs/heads/main/CMakeLists.txt +set(DAWN_ALWAYS_ASSERT OFF CACHE INTERNAL "Always assert in Dawn" FORCE) +set(DAWN_BUILD_MONOLITHIC_LIBRARY ON CACHE INTERNAL "Build Dawn monolithically" FORCE) +set(DAWN_BUILD_EXAMPLES OFF CACHE INTERNAL "Build Dawn examples" FORCE) +set(DAWN_BUILD_SAMPLES OFF CACHE INTERNAL "Build Dawn samples" FORCE) +set(DAWN_BUILD_TESTS OFF CACHE INTERNAL "Build Dawn tests" FORCE) +set(DAWN_ENABLE_INSTALL OFF CACHE INTERNAL "Enable Dawn installation" FORCE) +set(DAWN_FETCH_DEPENDENCIES ON CACHE INTERNAL "Fetch Dawn dependencies" FORCE) +set(TINT_BUILD_TESTS OFF CACHE INTERNAL "Build Tint Tests" FORCE) +set(TINT_BUILD_IR_BINARY OFF CACHE INTERNAL "Build Tint IR binary" FORCE) +set(TINT_BUILD_CMD_TOOLS OFF CACHE INTERNAL "Build Tint command line tools" FORCE) + +if(NOT DAWN_BUILD_FOUND) + include(FetchContent) + message("webgpu_dawn not found start building") + if(EMSCRIPTEN) + set(EMSCRIPTEN_DIR "${EM_SDK_DIR}/upstream/emscripten" CACHE INTERNAL "" FORCE) + set(DAWN_EMSCRIPTEN_TOOLCHAIN ${EMSCRIPTEN_DIR} CACHE INTERNAL "" FORCE) + endif() + + FetchContent_Declare( + dawn + DOWNLOAD_DIR ${DAWN_DIR} + SOURCE_DIR ${DAWN_DIR} + SUBBUILD_DIR ${DAWN_BUILD_DIR}/tmp + BINARY_DIR ${DAWN_BUILD_DIR} + DOWNLOAD_COMMAND + cd ${DAWN_DIR} && + git init && + git fetch --depth=1 
https://dawn.googlesource.com/dawn && + git reset --hard FETCH_HEAD + ) + + # Download the repository and add it as a subdirectory. + FetchContent_MakeAvailable(dawn) + + # attempt fix flutter rebuilds + set(CMAKE_INCLUDE_PATH "${CMAKE_INCLUDE_PATH};${DAWN_DIR}/src" CACHE INTERNAL "") + + execute_process( + WORKING_DIRECTORY ${DAWN_DIR} + COMMAND ${CMAKE_COMMAND} -S ${DAWN_DIR} + -B ${DAWN_BUILD_DIR} + ) + + # Build Dawn + execute_process( + COMMAND ${CMAKE_COMMAND} --build ${DAWN_BUILD_DIR} + ) + + # find_library, windows adds extra folder + if(MSVC) + find_library(WEBGPU_DAWN_DEBUG webgpu_dawn + NAMES webgpu_dawn + HINTS "${DAWN_BUILD_DIR}/src/dawn/native/Debug" + ) + find_library(WEBGPU_DAWN_RELEASE webgpu_dawn + NAMES webgpu_dawn + HINTS "${DAWN_BUILD_DIR}/src/dawn/native/Release" + ) + set(DAWN_BUILD_FOUND ON) + elseif(NOT EMSCRIPTEN AND NOT MSVC) + find_library(WEBGPU_DAWN_LIB + NAMES webgpu_dawn + PATHS "${DAWN_BUILD_DIR}/src/dawn/native" + REQUIRED + ) + set(DAWN_BUILD_FOUND ON) + else() + set(DAWN_BUILD_FOUND ON) + endif() +endif() + +if(EMSCRIPTEN) + add_library(webgpu_dawn INTERFACE IMPORTED) + target_include_directories(webgpu_dawn INTERFACE ${DAWN_BUILD_DIR}/gen/src/emdawnwebgpu/include) + target_include_directories(webgpu_dawn INTERFACE ${DAWN_BUILD_DIR}/gen/src/emdawnwebgpu/include/webgpu/webgpu.h) + target_link_libraries(webgpu_dawn INTERFACE ${DAWN_BUILD_DIR}/gen/src/emdawnwebgpu/library_webgpu_enum_tables.js) + target_link_libraries(webgpu_dawn INTERFACE ${DAWN_BUILD_DIR}/gen/src/emdawnwebgpu/library_webgpu_generated_struct_info.js) + target_link_libraries(webgpu_dawn INTERFACE ${DAWN_BUILD_DIR}/gen/src/emdawnwebgpu/library_webgpu_generated_sig_info.js) + target_link_libraries(webgpu_dawn INTERFACE ${DAWN_DIR}/third_party/emdawnwebgpu/library_webgpu.js) +else() +endif() \ No newline at end of file diff --git a/cmake/example.cmake b/cmake/example.cmake index d92c204..99578fd 100644 --- a/cmake/example.cmake +++ b/cmake/example.cmake @@ -1,9 +1,14 @@ # Getting Started with CMAKE -# Each example includes this and sets PROJECT_NAME -# cd examples/hello_world -# cmake -S . build/ -DCMAKE_BUILD_TYPE=Release -# cmake --build build/ --config Release -# ./build/hello_world +# Each example includes this and sets PROJECT_NAME. +# +# Example usage: +# cd examples/hello_world +# cmake -S . build/ -DCMAKE_BUILD_TYPE=Release +# cmake --build build/ --config Release +# ./build/hello_world (or serve the output .js/.wasm for Emscripten) +# or for emscripten +# emcmake cmake -S . 
-B ./build_web -DCMAKE_BUILD_TYPE=Release +# cmake --build build_web --config Release if(NOT MSVC) set(CMAKE_CXX_STANDARD 17) @@ -11,35 +16,82 @@ else() set(CMAKE_CXX_STANDARD 20) endif() -# Path finding logic to find our root recipes from nested folders +# Locate the project root (two levels up from the current source dir) get_filename_component(PROJECT_ROOT ${CMAKE_CURRENT_SOURCE_DIR} DIRECTORY) get_filename_component(PROJECT_ROOT ${PROJECT_ROOT} DIRECTORY) -# Ensure the build type is set -if(NOT CMAKE_BUILD_TYPE) - set(CMAKE_BUILD_TYPE - Release - CACHE STRING "Choose the type of build: Debug or Release" FORCE) -endif() - -# Include the gpu.cpp + Dawn library +# Include external libraries and helper scripts (dawn and gpu) +include("${PROJECT_ROOT}/cmake/dawn.cmake") include("${PROJECT_ROOT}/cmake/gpu.cmake") # Create the executable add_executable(${PROJECT_NAME} run.cpp) -# Link gpu + dawn library +# Platform-specific linking & build settings +if(EMSCRIPTEN) + # Emscripten-specific configuration + + # Define a web output directory (adjust as needed) + set(WEB_OUTPUT_DIR "${CMAKE_CURRENT_BINARY_DIR}/web_build") + + # If necessary, include the generated WebGPU include dirs first. + include_directories(BEFORE "${DAWN_BUILD_DIR}/gen/src/emdawnwebgpu/include/") + + # Create a helper library for WebGPU support. + add_library(webgpu_web "${DAWN_DIR}/third_party/emdawnwebgpu/webgpu.cpp") + target_link_libraries(${PROJECT_NAME} PRIVATE webgpu_web) + + # Set Emscripten-specific link flags that enable WASM output and expose certain symbols. + # Needed to use updated version, emdawnwebgpu + set_target_properties(${PROJECT_NAME} PROPERTIES LINK_FLAGS "\ + -sUSE_WEBGPU=0 \ + -sWASM=1 \ + -DDAWN_EMSCRIPTEN_TOOLCHAIN=${EMSCRIPTEN_DIR} \ + -sEXPORTED_FUNCTIONS=_main,_malloc,_free,_memcpy \ + -sEXPORTED_RUNTIME_METHODS=ccall \ + -sUSE_GLFW=3 \ + -sALLOW_MEMORY_GROWTH=1 -sSTACK_SIZE=10000000 \ + -sASYNCIFY \ + --js-library=${DAWN_BUILD_DIR}/gen/src/emdawnwebgpu/library_webgpu_enum_tables.js \ + --js-library=${DAWN_BUILD_DIR}/gen/src/emdawnwebgpu/library_webgpu_generated_struct_info.js \ + --js-library=${DAWN_BUILD_DIR}/gen/src/emdawnwebgpu/library_webgpu_generated_sig_info.js \ + --js-library=${DAWN_DIR}/third_party/emdawnwebgpu/library_webgpu.js \ + --closure-args=--externs=${EMSCRIPTEN_DIR}/src/closure-externs/webgpu-externs.js \ + -O3 \ + ") + +else() + # Non-Emscripten (desktop) linking + if(MSVC) + target_link_libraries(gpu + PRIVATE + $<$:${WEBGPU_DAWN_DEBUG}> + $<$:${WEBGPU_DAWN_RELEASE}> + ) + else() + target_link_libraries(gpu PRIVATE webgpu_dawn) + endif() +endif() + +# Link the gpu/dawn library to the executable. target_link_libraries(${PROJECT_NAME} PRIVATE gpu) -# Certain platforms need to copy the library files to the build directory +# Platform-specific post-build actions (e.g. copying DLLs for MSVC) if(MSVC) - # Copy webgpu_dawn.dll to the build directory - # CMake multigenerators like MSVC need --config Release on - # the cmake --build command or they will output to /Debug add_custom_command( TARGET ${PROJECT_NAME} POST_BUILD COMMAND ${CMAKE_COMMAND} -E copy - ${DAWN_INSTALL_PREFIX}/${CMAKE_BUILD_TYPE}/webgpu_dawn.dll - $) + ${DAWN_BUILD_DIR}/$/webgpu_dawn.dll + $ + COMMENT "Copying webgpu_dawn.dll to the build directory" + ) endif() +if(EMSCRIPTEN) + + # Configure the HTML file by replacing @PROJECT_NAME@ with the actual target name. 
+ configure_file(${PROJECT_ROOT}cmake/templates/index.html.in + ${CMAKE_CURRENT_BINARY_DIR}/index.html + @ONLY) + +endif() \ No newline at end of file diff --git a/cmake/gpu.cmake b/cmake/gpu.cmake index 52a348b..6cce9e6 100644 --- a/cmake/gpu.cmake +++ b/cmake/gpu.cmake @@ -11,68 +11,11 @@ endif() message(STATUS "PROJECT_ROOT: ${PROJECT_ROOT}") - -include(FetchContent) - -set(FETCHCONTENT_BASE_DIR "${PROJECT_ROOT}/third_party/fetchcontent/_deps") -set(DAWN_INSTALL_PREFIX "${FETCHCONTENT_BASE_DIR}/dawn-build/out/${CMAKE_BUILD_TYPE}" CACHE INTERNAL "Dawn install location" FORCE) - - -# Before fetching, set configuration options for Dawn. -set(DCMAKE_INSTALL_PREFIX ${DAWN_INSTALL_PREFIX} CACHE INTERNAL "Dawn install location" FORCE) - -# Dawn options for more, -# see https://dawn.googlesource.com/dawn/+/refs/heads/main/CMakeLists.txt -set(DAWN_ALWAYS_ASSERT OFF CACHE INTERNAL "Always assert in Dawn" FORCE) -set(DAWN_BUILD_MONOLITHIC_LIBRARY ON CACHE INTERNAL "Build Dawn monolithically" FORCE) -set(DAWN_BUILD_EXAMPLES OFF CACHE INTERNAL "Build Dawn examples" FORCE) -set(DAWN_BUILD_SAMPLES OFF CACHE INTERNAL "Build Dawn samples" FORCE) -set(DAWN_BUILD_TESTS OFF CACHE INTERNAL "Build Dawn tests" FORCE) -set(DAWN_ENABLE_INSTALL ON CACHE INTERNAL "Enable Dawn installation" FORCE) -set(DAWN_FETCH_DEPENDENCIES ON CACHE INTERNAL "Fetch Dawn dependencies" FORCE) - -set(TINT_BUILD_TESTS OFF CACHE INTERNAL "Build Tint Tests" FORCE) -set(TINT_BUILD_IR_BINARY OFF CACHE INTERNAL "Build Tint IR binary" FORCE) -set(TINT_BUILD_CMD_TOOLS OFF CACHE INTERNAL "Build Tint command line tools" FORCE) - -set(BUILD_SHARED_LIBS OFF CACHE INTERNAL "Build shared libraries" FORCE) - - -# Fetch Setup -# Add a commit hash to pin the version of Dawn. -# git fetch --depth=1 url -FetchContent_Declare( - dawn - DOWNLOAD_COMMAND - cd ${FETCHCONTENT_BASE_DIR}/dawn-src && - git init && - git fetch --depth=1 https://dawn.googlesource.com/dawn && - git reset --hard FETCH_HEAD -) - - -# Download the repository and add it as a subdirectory. -FetchContent_MakeAvailable(dawn) - - -# Since we require Dawn to be built before linking against it, we need to configure it now. 
-execute_process( - COMMAND ${CMAKE_COMMAND} ${FETCHCONTENT_BASE_DIR}/dawn-src - -B ${DAWN_INSTALL_PREFIX} - -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} - -G "${CMAKE_GENERATOR}" -) - -# Build Dawn -execute_process( - WORKING_DIRECTORY ${FETCHCONTENT_BASE_DIR}/dawn-src - COMMAND ${CMAKE_COMMAND} --build ${DAWN_INSTALL_PREFIX} --config ${CMAKE_BUILD_TYPE} -) - # Add sources set(GPU_SOURCES "${PROJECT_ROOT}/gpu.cpp" "${PROJECT_ROOT}/numeric_types/half.cpp" + "${DAWN_BUILD_DIR}/gen/include/dawn/webgpu.h" ) # Add headers @@ -81,35 +24,17 @@ set(GPU_HEADERS "${PROJECT_ROOT}/utils/logging.hpp" "${PROJECT_ROOT}/utils/array_utils.hpp" "${PROJECT_ROOT}/numeric_types/half.hpp" + ) -# Emscripten includes a header automatically -if(EMSCRIPTEN) - file(REMOVE "${PROJECT_ROOT}/webgpu/webgpu.h") -else() - list(APPEND GPU_HEADERS "${PROJECT_ROOT}/third_party/headers/webgpu/webgpu.h") -endif() - - # Create the STATIC library for gpu add_library(gpu STATIC ${GPU_SOURCES} ${GPU_HEADERS}) +set_target_properties(gpu PROPERTIES LINKER_LANGUAGE CXX) target_include_directories(gpu PUBLIC "${PROJECT_ROOT}") -target_include_directories(gpu PUBLIC "${PROJECT_ROOT}/third_party/headers") - -# find_library, windows adds extra folder -if(MSVC) - find_library(WEBGPU_DAWN_MONOLITHIC - NAMES webgpu_dawn - HINTS "${DAWN_INSTALL_PREFIX}/src/dawn/native/${CMAKE_BUILD_TYPE}" - REQUIRED - ) +if(NOT EMSCRIPTEN) + target_include_directories(gpu PUBLIC "${DAWN_BUILD_DIR}/gen/include/dawn/") else() - find_library(WEBGPU_DAWN_MONOLITHIC - NAMES webgpu_dawn - PATHS "${DAWN_INSTALL_PREFIX}/src/dawn/native" - REQUIRED - ) + target_include_directories(gpu PUBLIC "${DAWN_BUILD_DIR}/gen/src/emdawnwebgpu/include/") + target_include_directories(gpu PUBLIC "${DAWN_BUILD_DIR}/gen/src/emdawnwebgpu/include/webgpu/") endif() -# Link the monolithic library -target_link_libraries(gpu PRIVATE ${WEBGPU_DAWN_MONOLITHIC}) diff --git a/cmake/templates/index.html.in b/cmake/templates/index.html.in new file mode 100644 index 0000000..1bd64ca --- /dev/null +++ b/cmake/templates/index.html.in @@ -0,0 +1,22 @@ + + + + + @PROJECT_NAME@ + + + + + + + \ No newline at end of file diff --git a/examples/shadertui/CMakeLists.txt b/examples/shadertui/CMakeLists.txt index 0938023..b728fc8 100644 --- a/examples/shadertui/CMakeLists.txt +++ b/examples/shadertui/CMakeLists.txt @@ -1,3 +1,4 @@ +# Not working yet needs update with libs for emscripten cmake_minimum_required(VERSION 3.28) project(shadertui) diff --git a/gpu.hpp b/gpu.hpp index 5327fe7..edc8b38 100644 --- a/gpu.hpp +++ b/gpu.hpp @@ -15,7 +15,7 @@ #include // std::pair #include -#include "webgpu/webgpu.h" +#include "webgpu.h" #include "numeric_types/half.hpp" #include "utils/logging.hpp" @@ -910,6 +910,7 @@ inline Context createContext( // If the device was created, set up logging and fetch the queue if (devData.status == WGPURequestDeviceStatus_Success) { + #ifndef __EMSCRIPTEN__ WGPULoggingCallbackInfo loggingCallbackInfo { .nextInChain = nullptr, .callback = @@ -925,6 +926,7 @@ inline Context createContext( .userdata2 = nullptr }; wgpuDeviceSetLoggingCallback(ctx.device, loggingCallbackInfo); + #endif ctx.queue = wgpuDeviceGetQueue(ctx.device); } } @@ -1206,7 +1208,7 @@ inline void toCPU(Context &ctx, WGPUBuffer buffer, void *data, size_t size) { } wgpuQueueSubmit(ctx.queue, 1, &op.commandBuffer); wgpuCommandBufferRelease(op.commandBuffer); - CallbackData callbackData = {op.readbackBuffer, bufferSize, data, &op.promise, + CallbackData callbackData = {op.readbackBuffer, static_cast(bufferSize), data, 
&op.promise, &op.future}; WGPUQueueWorkDoneCallbackInfo workDoneCallbackInfo = { diff --git a/numeric_types/half.cpp b/numeric_types/half.cpp index e5bdaf0..fe5aab7 100644 --- a/numeric_types/half.cpp +++ b/numeric_types/half.cpp @@ -241,7 +241,7 @@ fn main( } } -int main() { +int testMain() { printf("\nHalf-precision float tests\n==========================\n"); printf("\nRegular values float round trips\n\n"); From 9247b79f3f31b87a19bc7dc0ae524608e8eea593 Mon Sep 17 00:00:00 2001 From: MichealReed Date: Sun, 16 Feb 2025 18:26:57 -0600 Subject: [PATCH 08/30] remove redundant find function --- cmake/dawn.cmake | 28 ---------------------------- 1 file changed, 28 deletions(-) diff --git a/cmake/dawn.cmake b/cmake/dawn.cmake index f7ab748..b9394d4 100644 --- a/cmake/dawn.cmake +++ b/cmake/dawn.cmake @@ -8,34 +8,6 @@ if(EMSCRIPTEN) set(DAWN_BUILD_DIR "${DAWN_DIR}/build_web" CACHE INTERNAL "") endif() -function(find_dawn_library) - if(MSVC) - find_library(WEBGPU_DAWN_DEBUG webgpu_dawn - NAMES webgpu_dawn - HINTS "${DAWN_BUILD_DIR}/src/dawn/native/Debug" - ) - find_library(WEBGPU_DAWN_RELEASE webgpu_dawn - NAMES webgpu_dawn - HINTS "${DAWN_BUILD_DIR}/src/dawn/native/Release" - ) - elseif(NOT EMSCRIPTEN AND NOT MSVC) - find_library(WEBGPU_DAWN_LIB - NAMES webgpu_dawn - PATHS "${DAWN_BUILD_DIR}/src/dawn/native" - REQUIRED - ) - endif() - - # Set result variables in parent scope - set(DAWN_BUILD_FOUND ON PARENT_SCOPE) - if(MSVC) - set(WEBGPU_DAWN_DEBUG ${WEBGPU_DAWN_DEBUG} PARENT_SCOPE) - set(WEBGPU_DAWN_RELEASE ${WEBGPU_DAWN_RELEASE} PARENT_SCOPE) - else() - set(WEBGPU_DAWN_LIB ${WEBGPU_DAWN_LIB} PARENT_SCOPE) - endif() -endfunction() - # Enable find for no dawn rebuilds with flutter run set(ENABLE_DAWN_FIND OFF CACHE BOOL "Enable finding Dawn" FORCE) set(DAWN_BUILD_FOUND OFF CACHE BOOL "Dawn build found" FORCE) From 3e59576f2b752a4f6255445d0569583d87f38d44 Mon Sep 17 00:00:00 2001 From: MichealReed Date: Sun, 16 Feb 2025 18:42:27 -0600 Subject: [PATCH 09/30] clean linker flags --- cmake/example.cmake | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cmake/example.cmake b/cmake/example.cmake index 99578fd..192358f 100644 --- a/cmake/example.cmake +++ b/cmake/example.cmake @@ -9,6 +9,7 @@ # or for emscripten # emcmake cmake -S . 
-B ./build_web -DCMAKE_BUILD_TYPE=Release # cmake --build build_web --config Release +# python3 -m http.server 8080 --d build_web if(NOT MSVC) set(CMAKE_CXX_STANDARD 17) @@ -50,14 +51,13 @@ if(EMSCRIPTEN) -sEXPORTED_FUNCTIONS=_main,_malloc,_free,_memcpy \ -sEXPORTED_RUNTIME_METHODS=ccall \ -sUSE_GLFW=3 \ - -sALLOW_MEMORY_GROWTH=1 -sSTACK_SIZE=10000000 \ + -sALLOW_MEMORY_GROWTH=1 \ -sASYNCIFY \ --js-library=${DAWN_BUILD_DIR}/gen/src/emdawnwebgpu/library_webgpu_enum_tables.js \ --js-library=${DAWN_BUILD_DIR}/gen/src/emdawnwebgpu/library_webgpu_generated_struct_info.js \ --js-library=${DAWN_BUILD_DIR}/gen/src/emdawnwebgpu/library_webgpu_generated_sig_info.js \ --js-library=${DAWN_DIR}/third_party/emdawnwebgpu/library_webgpu.js \ --closure-args=--externs=${EMSCRIPTEN_DIR}/src/closure-externs/webgpu-externs.js \ - -O3 \ ") else() From 0653a4b524e7bbb6e91d2fe02c827ee7782d5b65 Mon Sep 17 00:00:00 2001 From: MichealReed Date: Sun, 16 Feb 2025 18:47:51 -0600 Subject: [PATCH 10/30] needs large stack size --- cmake/example.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/example.cmake b/cmake/example.cmake index 192358f..8216077 100644 --- a/cmake/example.cmake +++ b/cmake/example.cmake @@ -51,7 +51,7 @@ if(EMSCRIPTEN) -sEXPORTED_FUNCTIONS=_main,_malloc,_free,_memcpy \ -sEXPORTED_RUNTIME_METHODS=ccall \ -sUSE_GLFW=3 \ - -sALLOW_MEMORY_GROWTH=1 \ + -sALLOW_MEMORY_GROWTH=1 -sSTACK_SIZE=10000000 \ -sASYNCIFY \ --js-library=${DAWN_BUILD_DIR}/gen/src/emdawnwebgpu/library_webgpu_enum_tables.js \ --js-library=${DAWN_BUILD_DIR}/gen/src/emdawnwebgpu/library_webgpu_generated_struct_info.js \ From 9f4059b7d1ae5a0eba4fa26772a42a73064a66bf Mon Sep 17 00:00:00 2001 From: MichealReed Date: Sun, 16 Feb 2025 18:50:13 -0600 Subject: [PATCH 11/30] use stack in MB instead --- cmake/example.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/example.cmake b/cmake/example.cmake index 8216077..6f195ec 100644 --- a/cmake/example.cmake +++ b/cmake/example.cmake @@ -51,7 +51,7 @@ if(EMSCRIPTEN) -sEXPORTED_FUNCTIONS=_main,_malloc,_free,_memcpy \ -sEXPORTED_RUNTIME_METHODS=ccall \ -sUSE_GLFW=3 \ - -sALLOW_MEMORY_GROWTH=1 -sSTACK_SIZE=10000000 \ + -sALLOW_MEMORY_GROWTH=1 -sSTACK_SIZE=5MB \ -sASYNCIFY \ --js-library=${DAWN_BUILD_DIR}/gen/src/emdawnwebgpu/library_webgpu_enum_tables.js \ --js-library=${DAWN_BUILD_DIR}/gen/src/emdawnwebgpu/library_webgpu_generated_struct_info.js \ From 78ae4685a15faf8f962d8f63a66f26fa86ed24ca Mon Sep 17 00:00:00 2001 From: MichealReed Date: Mon, 17 Feb 2025 17:30:11 -0600 Subject: [PATCH 12/30] must set DAWN_EMSCRIPTEN_TOOLCHAIN for build too --- cmake/dawn.cmake | 1 + 1 file changed, 1 insertion(+) diff --git a/cmake/dawn.cmake b/cmake/dawn.cmake index b9394d4..46d7403 100644 --- a/cmake/dawn.cmake +++ b/cmake/dawn.cmake @@ -6,6 +6,7 @@ set(DAWN_BUILD_DIR "${DAWN_DIR}/build" CACHE INTERNAL "") if(EMSCRIPTEN) set(EM_SDK_DIR $ENV{EMSDK} CACHE INTERNAL "") set(DAWN_BUILD_DIR "${DAWN_DIR}/build_web" CACHE INTERNAL "") + set(DAWN_EMSCRIPTEN_TOOLCHAIN ${EM_SDK_DIR}/upstream/emscripten CACHE INTERNAL "" FORCE) endif() # Enable find for no dawn rebuilds with flutter run From 6197322e8e48e1c761043d6ca6badc89724b95e3 Mon Sep 17 00:00:00 2001 From: MichealReed Date: Mon, 17 Feb 2025 22:59:26 -0600 Subject: [PATCH 13/30] EOF fixes --- cmake/dawn.cmake | 2 +- cmake/example.cmake | 2 +- cmake/templates/index.html.in | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/cmake/dawn.cmake b/cmake/dawn.cmake index 46d7403..2ead9ae 100644 --- 
a/cmake/dawn.cmake +++ b/cmake/dawn.cmake @@ -119,4 +119,4 @@ if(EMSCRIPTEN) target_link_libraries(webgpu_dawn INTERFACE ${DAWN_BUILD_DIR}/gen/src/emdawnwebgpu/library_webgpu_generated_sig_info.js) target_link_libraries(webgpu_dawn INTERFACE ${DAWN_DIR}/third_party/emdawnwebgpu/library_webgpu.js) else() -endif() \ No newline at end of file +endif() diff --git a/cmake/example.cmake b/cmake/example.cmake index 6f195ec..7cf1f8d 100644 --- a/cmake/example.cmake +++ b/cmake/example.cmake @@ -94,4 +94,4 @@ if(EMSCRIPTEN) ${CMAKE_CURRENT_BINARY_DIR}/index.html @ONLY) -endif() \ No newline at end of file +endif() diff --git a/cmake/templates/index.html.in b/cmake/templates/index.html.in index 1bd64ca..b6f130c 100644 --- a/cmake/templates/index.html.in +++ b/cmake/templates/index.html.in @@ -19,4 +19,4 @@ } - \ No newline at end of file + From 9ac780bef1c6813f43855f7d9d7d33a733876c45 Mon Sep 17 00:00:00 2001 From: MichealReed Date: Wed, 19 Feb 2025 16:30:50 -0600 Subject: [PATCH 14/30] refactors async --- cmake/templates/index.html.in | 2 +- examples/hello_world/run.cpp | 10 +- gpu.hpp | 838 ++++++++++++++++++++++------------ numeric_types/half.cpp | 15 +- 4 files changed, 550 insertions(+), 315 deletions(-) diff --git a/cmake/templates/index.html.in b/cmake/templates/index.html.in index b6f130c..6b5957b 100644 --- a/cmake/templates/index.html.in +++ b/cmake/templates/index.html.in @@ -12,7 +12,7 @@ if (typeof Module !== 'undefined') { Module.onRuntimeInitialized = function() { // Optionally, pass arguments to main in an array. - Module._main([]); + Module.ccall('main', 'number', [], [], { async: true }); }; } else { console.error('Module is undefined. Check that your generated JS file is loaded properly.'); diff --git a/examples/hello_world/run.cpp b/examples/hello_world/run.cpp index 7453869..06970a7 100644 --- a/examples/hello_world/run.cpp +++ b/examples/hello_world/run.cpp @@ -38,12 +38,14 @@ int main(int argc, char **argv) { Tensor output = createTensor(ctx, Shape{N}, kf32); std::promise promise; std::future future = promise.get_future(); - Kernel op = createKernel(ctx, {kGelu, 256, kf32}, + std::future kernelFuture = createKernel(ctx, {kGelu, 256, kf32}, Bindings{input, output}, {cdiv(N, 256), 1, 1}); - dispatchKernel(ctx, op, promise); - wait(ctx, future); - toCPU(ctx, output, outputArr.data(), sizeof(outputArr)); + Kernel op = waitForFuture(ctx.instance, kernelFuture); + std::future dispatchFuture = dispatchKernel(ctx, op); + waitForFuture(ctx.instance, dispatchFuture); + std::future cpuFuture = toCPU(ctx, output, outputArr.data(), sizeof(outputArr)); + waitForFuture(ctx.instance, cpuFuture); for (int i = 0; i < 12; ++i) { printf(" gelu(%.2f) = %.2f\n", inputArr[i], outputArr[i]); } diff --git a/gpu.hpp b/gpu.hpp index edc8b38..052c674 100644 --- a/gpu.hpp +++ b/gpu.hpp @@ -1,6 +1,7 @@ #ifndef GPU_HPP #define GPU_HPP +#include "webgpu.h" #include #include #include @@ -15,15 +16,15 @@ #include // std::pair #include -#include "webgpu.h" +#ifndef __EMSCRIPTEN__ -#include "numeric_types/half.hpp" -#include "utils/logging.hpp" - -#ifdef __EMSCRIPTEN__ +#else #include "emscripten/emscripten.h" #endif +#include "numeric_types/half.hpp" +#include "utils/logging.hpp" + #ifdef USE_DAWN_API #include "dawn/native/DawnNative.h" #endif @@ -430,8 +431,8 @@ struct CallbackData { WGPUBuffer buffer; // managed by owning Kernel size_t bufferSize; void *output; // non-owning, only for target memory in toCPU, not used for - // kernel invocations - std::promise *promise; + // kernel invocations + 
std::shared_ptr<std::promise<void>> promise;
   std::future<void> *future;
 };
 
@@ -530,32 +531,27 @@ struct Context {
   // Default constructor
   Context() = default;
 
-  Context(Context&& other) noexcept
-      : instance(other.instance),
-        adapter(other.adapter),
-        device(other.device),
+  Context(Context &&other) noexcept
+      : instance(other.instance), adapter(other.adapter), device(other.device),
         queue(other.queue),
         // Re‐initialize pools to point to *this*:
-        pool(this),
-        kernelPool(this),
-        adapterStatus(other.adapterStatus),
-        deviceStatus(other.deviceStatus)
-  {
+        pool(this), kernelPool(this), adapterStatus(other.adapterStatus),
+        deviceStatus(other.deviceStatus) {
     LOG(kDefLog, kTrace, "Moving Context ownership");
 
     // Move over the resources in the pools:
-    pool.data = std::move(other.pool.data);
+    pool.data = std::move(other.pool.data);
     kernelPool.data = std::move(other.kernelPool.data);
 
     // Null out handles in the source so its destructor won't release them.
     other.instance = nullptr;
-    other.adapter = nullptr;
-    other.device = nullptr;
-    other.queue = nullptr;
+    other.adapter = nullptr;
+    other.device = nullptr;
+    other.queue = nullptr;
 
     // other.adapterStatus = 0;
     // other.deviceStatus = 0;
   }
 
-  Context& operator=(Context&& other) noexcept {
+  Context &operator=(Context &&other) noexcept {
     if (this != &other) {
       // Free any existing resources. In most cases, this should be a no-op
       // since we typically shouldn't have two active initialized Context
@@ -625,7 +621,7 @@ inline Tensor createTensor(TensorPool &pool, WGPUDevice &device,
   size_t numElements = size(shape);
   size_t size = sizeBytes(dtype) * numElements;
   WGPUBufferDescriptor bufferDesc = {
-      .label = {.data = nullptr, .length = 0},
+      .label = {.data = nullptr, .length = 0},
       .usage = usage,
       .size = size,
   };
@@ -794,6 +790,162 @@ inline void check(bool condition, const char *message,
   }
 }
 
+/**
+ * @brief Pumps events until the provided future is ready.
+ *
+ * This helper template function continuously checks the status of the provided std::future
+ * until it becomes ready. On Emscripten builds, it yields control to the JavaScript event loop
+ * using emscripten_sleep to allow asynchronous callbacks to execute. On other platforms, it
+ * processes events from the given WGPUInstance using wgpuInstanceProcessEvents. Once the future
+ * is ready, its value is returned.
+ *
+ * @tparam T The type of the value contained in the future.
+ * @param instance The WGPUInstance used to process events.
+ * @param f The future to wait on.
+ * @return T The value retrieved from the ready future.
+ *
+ * @code
+ * std::future<WGPUDevice> deviceFuture = requestDeviceAsync(adapter, devDescriptor);
+ * WGPUDevice device = waitForFuture(instance, deviceFuture);
+ * @endcode
+ */
+template <typename T>
+T waitForFuture(WGPUInstance instance, std::future<T> &f) {
+#ifdef __EMSCRIPTEN__
+  // Poll until the future is ready.
+  while (f.wait_for(std::chrono::milliseconds(0)) !=
+         std::future_status::ready) {
+    // Yield control to the JS event loop.
+    emscripten_sleep(1);
+  }
+  return f.get();
+#else
+  while (f.wait_for(std::chrono::milliseconds(0)) !=
+         std::future_status::ready) {
+    wgpuInstanceProcessEvents(instance);
+  }
+  return f.get();
+#endif
+}
+
+// Context Callbacks & Helpers
+
+/**
+ * @brief Adapter callback function invoked upon completion of an asynchronous WebGPU adapter request.
+ *
+ * This callback is triggered when the request for a WebGPU adapter completes. It verifies whether
+ * the adapter was successfully obtained.
On failure, it logs an error message (in Emscripten builds) + * and sets an exception on the associated promise. On success, it sets the value of the promise with + * the obtained adapter. Finally, it frees the allocated memory for the promise pointer. + * + * @param status The status of the adapter request. Expected to be WGPURequestAdapterStatus_Success on success. + * @param adapter The WGPUAdapter obtained on a successful request. + * @param message A string view containing additional information about the adapter request. + * @param userdata1 A pointer to a heap-allocated std::shared_ptr>. + * @param userdata2 Unused. + */ +inline void adapterCallback(WGPURequestAdapterStatus status, + WGPUAdapter adapter, WGPUStringView message, + void *userdata1, void * /*userdata2*/) { + auto *promisePtr = + reinterpret_cast> *>(userdata1); + if (status != WGPURequestAdapterStatus_Success) { +#ifdef __EMSCRIPTEN__ + LOG(kDefLog, kError, "Could not get WebGPU adapter: %.*s", + static_cast(message.length), message.data); +#endif + (*promisePtr) + ->set_exception(std::make_exception_ptr( + std::runtime_error("Request WebGPU adapter failed"))); + } else { + (*promisePtr)->set_value(adapter); + } + delete promisePtr; +} + +/** + * @brief Callback function invoked upon completion of an asynchronous WebGPU device request. + * + * This callback is triggered when the request for a WebGPU device completes. It verifies that + * the device was successfully created. On success, the callback sets the value of the associated + * promise; otherwise, it sets an exception. After fulfilling the promise, it frees the allocated + * memory for the promise pointer. + * + * @param status The status of the device request. Expected to be WGPURequestDeviceStatus_Success on success. + * @param device The WGPUDevice obtained on successful request. + * @param message A string view containing additional information about the device request. + * @param userdata1 A pointer to a heap-allocated std::shared_ptr>. + * @param userdata2 Unused. + */ +inline void deviceCallback(WGPURequestDeviceStatus status, WGPUDevice device, + WGPUStringView message, void *userdata1, + void * /*userdata2*/) { + auto *promisePtr = + reinterpret_cast> *>(userdata1); + if (status != WGPURequestDeviceStatus_Success) { + (*promisePtr) + ->set_exception(std::make_exception_ptr( + std::runtime_error("Request WebGPU device failed"))); + } else { + LOG(kDefLog, kTrace, "Device Request succeeded %p", + static_cast(device)); + (*promisePtr)->set_value(device); + } + delete promisePtr; +} + +/** + * @brief Asynchronously requests a WebGPU adapter from the given instance. + * + * This helper function wraps the asynchronous call to request an adapter using the WebGPU API. + * It sets up a promise and registers an adapter callback, returning a future that will eventually + * hold the requested WGPUAdapter. + * + * @param instance The WGPUInstance from which to request the adapter. + * @param adapterOpts The options for requesting the adapter. + * @return std::future A future that will eventually hold the created WGPUAdapter. 
+ */ +inline std::future +requestAdapterAsync(WGPUInstance instance, + const WGPURequestAdapterOptions &adapterOpts) { + auto promise = std::make_shared>(); + auto *promisePtr = new std::shared_ptr>(promise); + + WGPURequestAdapterCallbackInfo callbackInfo{ + .mode = WGPUCallbackMode_AllowSpontaneous, + .callback = adapterCallback, + .userdata1 = promisePtr, + .userdata2 = nullptr}; + wgpuInstanceRequestAdapter(instance, &adapterOpts, callbackInfo); + return promise->get_future(); +} + +/** + * @brief Asynchronously requests a WebGPU device from a given adapter. + * + * This helper function wraps the asynchronous call to request a device using the WebGPU API. + * It sets up a promise and registers a device callback, returning a future that will be fulfilled + * once the device is available. + * + * @param adapter The WGPUAdapter to request the device from. + * @param devDescriptor The descriptor specifying the characteristics of the requested device. + * @return std::future A future that will eventually hold the created WGPUDevice. + */ +inline std::future +requestDeviceAsync(WGPUAdapter adapter, + const WGPUDeviceDescriptor &devDescriptor) { + auto promise = std::make_shared>(); + auto *promisePtr = new std::shared_ptr>(promise); + + WGPURequestDeviceCallbackInfo deviceCallbackInfo{ + .mode = WGPUCallbackMode_AllowSpontaneous, + .callback = deviceCallback, + .userdata1 = promisePtr, + .userdata2 = nullptr}; + wgpuAdapterRequestDevice(adapter, &devDescriptor, deviceCallbackInfo); + return promise->get_future(); +} + /** * @brief Factory function to create a GPU context, which aggregates WebGPU API * handles to interact with the GPU including the instance, adapter, device, and @@ -812,12 +964,10 @@ inline void check(bool condition, const char *message, * @return Context instance representing the created GPU context * */ -inline Context createContext( - const WGPUInstanceDescriptor &desc = {}, - const WGPURequestAdapterOptions &adapterOpts = {}, - const WGPUDeviceDescriptor &devDescriptor = {}) -{ - Context ctx; // stack-allocated +inline Context createContext(const WGPUInstanceDescriptor &desc = {}, + const WGPURequestAdapterOptions &adapterOpts = {}, + const WGPUDeviceDescriptor &devDescriptor = {}) { + Context ctx; // Stack-allocated Context. #ifdef __EMSCRIPTEN__ ctx.instance = wgpuCreateInstance(nullptr); @@ -826,115 +976,50 @@ inline Context createContext( #endif check(ctx.instance, "Initialize WebGPU", __FILE__, __LINE__); + // Request the adapter asynchronously. 
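Failures in these request helpers are reported by storing an exception in the promise, so they resurface at the call site when waitForFuture() calls get(). A minimal handling sketch, assuming an already-created WGPUInstance named instance (illustrative only):

    try {
      auto adapterFuture = requestAdapterAsync(instance, WGPURequestAdapterOptions{});
      WGPUAdapter adapter = waitForFuture(instance, adapterFuture); // rethrows on failure
      // ... continue with requestDeviceAsync(adapter, ...) ...
    } catch (const std::exception &ex) {
      LOG(kDefLog, kError, "Adapter request failed: %s", ex.what());
    }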
LOG(kDefLog, kTrace, "Requesting adapter"); - { - struct AdapterData { - WGPUAdapter adapter = nullptr; - bool requestEnded = false; - WGPURequestAdapterStatus status; - }; - AdapterData adapterData; - - auto onAdapterRequestEnded = [](WGPURequestAdapterStatus status, - WGPUAdapter adapter, - WGPUStringView message, - void *pUserData, void *) { - auto &ad = *reinterpret_cast(pUserData); - ad.status = status; -#ifdef __EMSCRIPTEN__ - if (status != WGPURequestAdapterStatus_Success) { - LOG(kDefLog, kError, "Could not get WebGPU adapter: %.*s", - static_cast(message.length), message.data); - } -#endif - check(status == WGPURequestAdapterStatus_Success, - "Request WebGPU adapter", __FILE__, __LINE__); - ad.adapter = adapter; - ad.requestEnded = true; - }; - - WGPURequestAdapterCallbackInfo callbackInfo { - .mode = WGPUCallbackMode_AllowSpontaneous, - .callback = onAdapterRequestEnded, - .userdata1 = &adapterData, - .userdata2 = nullptr - }; - wgpuInstanceRequestAdapter(ctx.instance, &adapterOpts, callbackInfo); - - while (!adapterData.requestEnded) { - processEvents(ctx.instance); - } - ctx.adapter = adapterData.adapter; - ctx.adapterStatus = adapterData.status; + try { + auto adapterFuture = requestAdapterAsync(ctx.instance, adapterOpts); + // Pump events until the adapter future is ready. + ctx.adapter = waitForFuture(ctx.instance, adapterFuture); + ctx.adapterStatus = WGPURequestAdapterStatus_Success; + } catch (const std::exception &ex) { + check(false, ex.what(), __FILE__, __LINE__); } + // Request the device asynchronously. LOG(kDefLog, kTrace, "Requesting device"); - { - struct DeviceData { - WGPUDevice device = nullptr; - bool requestEnded = false; - WGPURequestDeviceStatus status; - }; - DeviceData devData; - - auto onDeviceRequestEnded = [](WGPURequestDeviceStatus status, - WGPUDevice device, - WGPUStringView message, - void *pUserData, void *) { - auto &dd = *reinterpret_cast(pUserData); - dd.status = status; - check(status == WGPURequestDeviceStatus_Success, - "Could not get WebGPU device.", __FILE__, __LINE__); - LOG(kDefLog, kTrace, "Device Request succeeded %p", - static_cast(device)); - dd.device = device; - dd.requestEnded= true; - }; - - WGPURequestDeviceCallbackInfo deviceCallbackInfo { - .mode = WGPUCallbackMode_AllowSpontaneous, - .callback = onDeviceRequestEnded, - .userdata1= &devData, - .userdata2= nullptr - }; - wgpuAdapterRequestDevice(ctx.adapter, &devDescriptor, deviceCallbackInfo); - - LOG(kDefLog, kTrace, "Waiting for device request to end"); - while (!devData.requestEnded) { - processEvents(ctx.instance); - } + try { + auto deviceFuture = requestDeviceAsync(ctx.adapter, devDescriptor); + // Pump events until the device future is ready. + ctx.device = waitForFuture(ctx.instance, deviceFuture); + ctx.deviceStatus = WGPURequestDeviceStatus_Success; LOG(kDefLog, kTrace, "Device request ended"); - ctx.device = devData.device; - ctx.deviceStatus = devData.status; - - // If the device was created, set up logging and fetch the queue - if (devData.status == WGPURequestDeviceStatus_Success) { - #ifndef __EMSCRIPTEN__ - WGPULoggingCallbackInfo loggingCallbackInfo { + // If the device was created, set up logging and fetch the queue. 
+#ifndef __EMSCRIPTEN__ + WGPULoggingCallbackInfo loggingCallbackInfo{ .nextInChain = nullptr, .callback = - [](WGPULoggingType type, WGPUStringView message, - void *, void *) { - LOG(kDefLog, kError, "Device logging callback: %.*s", - static_cast(message.length), message.data); - if (type == WGPULoggingType_Error) { - throw std::runtime_error("Device error logged."); - } - }, + [](WGPULoggingType type, WGPUStringView message, void *, void *) { + LOG(kDefLog, kError, "Device logging callback: %.*s", + static_cast(message.length), message.data); + if (type == WGPULoggingType_Error) { + throw std::runtime_error("Device error logged."); + } + }, .userdata1 = nullptr, - .userdata2 = nullptr - }; - wgpuDeviceSetLoggingCallback(ctx.device, loggingCallbackInfo); - #endif - ctx.queue = wgpuDeviceGetQueue(ctx.device); - } + .userdata2 = nullptr}; + wgpuDeviceSetLoggingCallback(ctx.device, loggingCallbackInfo); +#endif + ctx.queue = wgpuDeviceGetQueue(ctx.device); + } catch (const std::exception &ex) { + check(false, ex.what(), __FILE__, __LINE__); } return std::move(ctx); } - #ifdef USE_DAWN_API /** * @brief Factory function to create a GPU context, which aggregates WebGPU API @@ -1066,11 +1151,76 @@ createContextByGpuIdx(int gpuIdx, const WGPUInstanceDescriptor &desc = {}, } #endif -inline void wait(Context &ctx, std::future &future) { - while (future.wait_for(std::chrono::seconds(0)) != - std::future_status::ready) { - processEvents(ctx.instance); - } +/** + * @brief Callback function invoked upon completion of an asynchronous GPU buffer mapping. + * + * This callback is triggered when the GPU buffer mapping for a readback buffer is completed. + * It verifies that the mapping operation was successful, retrieves the mapped memory, + * copies the data from the GPU buffer to a CPU memory region, unmaps the buffer, + * signals the completion by fulfilling the associated promise, and cleans up the allocated callback data. + * + * @param status The mapping status. Expected to be WGPUMapAsyncStatus_Success on success. + * @param message A string view containing additional information about the mapping operation. + * @param userdata1 A pointer to a heap-allocated CallbackData structure containing the GPU buffer, + * buffer size, destination CPU memory pointer, and a promise for signaling completion. + * @param userdata2 Unused. + */ +inline void bufferMapCallback(WGPUMapAsyncStatus status, WGPUStringView message, + void *userdata1, void * /*userdata2*/) { + CallbackData *cbData = reinterpret_cast(userdata1); + // Check that mapping succeeded. + check(status == WGPUMapAsyncStatus_Success, "Map readbackBuffer", __FILE__, + __LINE__); + + // Get the mapped memory. + const void *mappedData = + wgpuBufferGetConstMappedRange(cbData->buffer, 0, cbData->bufferSize); + check(mappedData, "Get mapped range", __FILE__, __LINE__); + + // Copy the data from the mapped GPU buffer to the CPU memory. + memcpy(cbData->output, mappedData, cbData->bufferSize); + + // Unmap the buffer. + wgpuBufferUnmap(cbData->buffer); + + // Signal that the copy has completed. + // Ensure you use the arrow operator on the shared_ptr to call set_value(). + cbData->promise->set_value(); + + // Clean up the dynamically allocated callback data. + delete cbData; +} + +/** + * @brief Callback function invoked when the GPU queue’s submitted work is complete. + * + * This callback is registered with the GPU queue after submitting work. 
When invoked, + * it verifies that all queued work completed successfully, and then sets up the buffer + * mapping callback to initiate the asynchronous mapping of a readback buffer. The readback + * buffer is mapped to access the processed data on the CPU. + * + * @param status The status of the completed work. Expected to be WGPUQueueWorkDoneStatus_Success on success. + * @param userdata1 A pointer to a heap-allocated CallbackData structure containing the readback buffer, + * buffer size, destination CPU memory pointer, and a promise to signal completion. + * @param userdata2 Unused. + */ +inline void queueWorkDoneCallback(WGPUQueueWorkDoneStatus status, + void *userdata1, void * /*userdata2*/) { + CallbackData *cbData = reinterpret_cast(userdata1); + // Ensure the queue work finished successfully. + check(status == WGPUQueueWorkDoneStatus_Success, "Queue work done", __FILE__, + __LINE__); + + // Set up the buffer mapping callback information. + WGPUBufferMapCallbackInfo mapCallbackInfo; + mapCallbackInfo.mode = WGPUCallbackMode_AllowSpontaneous; + mapCallbackInfo.callback = bufferMapCallback; + mapCallbackInfo.userdata1 = cbData; + mapCallbackInfo.userdata2 = nullptr; + + // Begin the asynchronous mapping of the readback buffer. + wgpuBufferMapAsync(cbData->buffer, WGPUMapMode_Read, 0, cbData->bufferSize, + mapCallbackInfo); } /** @@ -1085,45 +1235,35 @@ inline void wait(Context &ctx, std::future &future) { * toCPU(ctx, tensor, data, bufferSize); * @endcode */ -inline void toCPU(Context &ctx, Tensor &tensor, void *data, size_t bufferSize, - CopyData &op) { +inline std::future toCPU(Context &ctx, Tensor &tensor, void *data, + size_t bufferSize, CopyData &op) { + // Submit the command buffer and release it. wgpuQueueSubmit(ctx.queue, 1, &op.commandBuffer); wgpuCommandBufferRelease(op.commandBuffer); - CallbackData callbackData = {op.readbackBuffer, bufferSize, data, &op.promise, - &op.future}; - WGPUQueueWorkDoneCallbackInfo workDoneCallbackInfo = { - .mode = WGPUCallbackMode_AllowSpontaneous, - .callback = - [](WGPUQueueWorkDoneStatus status, void *userdata1, void *userdata2) { - check(status == WGPUQueueWorkDoneStatus_Success, "Queue work done", - __FILE__, __LINE__); - const auto *data = static_cast(userdata1); - WGPUBufferMapCallbackInfo mapCallbackInfo = { - .mode = WGPUCallbackMode_AllowSpontaneous, - .callback = - [](WGPUMapAsyncStatus status, WGPUStringView message, - void *userdata1, void *userdata2) { - const auto *data = static_cast(userdata1); - check(status == WGPUMapAsyncStatus_Success, - "Map readbackBuffer", __FILE__, __LINE__); - const void *mappedData = wgpuBufferGetConstMappedRange( - data->buffer, /*offset=*/0, data->bufferSize); - check(mappedData, "Get mapped range", __FILE__, __LINE__); - memcpy(data->output, mappedData, data->bufferSize); - wgpuBufferUnmap(data->buffer); - data->promise->set_value(); - }, - .userdata1 = const_cast(data), - .userdata2 = nullptr}; - wgpuBufferMapAsync(data->buffer, WGPUMapMode_Read, 0, - data->bufferSize, mapCallbackInfo); - }, - .userdata1 = &callbackData, - .userdata2 = nullptr}; + // Create a promise and get its future. + auto promise = std::make_shared>(); + + // Allocate callback data so it remains valid until the async + // chain finishes. + CallbackData *cbData = new CallbackData{ + op.readbackBuffer, // The GPU buffer to be read back. + bufferSize, + data, // CPU memory destination. + promise // The promise to be signaled. + }; + + // Set up the work-done callback to initiate the buffer mapping. 
+ WGPUQueueWorkDoneCallbackInfo workDoneCallbackInfo; + workDoneCallbackInfo.mode = WGPUCallbackMode_AllowSpontaneous; + workDoneCallbackInfo.callback = queueWorkDoneCallback; + workDoneCallbackInfo.userdata1 = cbData; // Pass the callback data. + workDoneCallbackInfo.userdata2 = nullptr; + + // Begin the asynchronous chain by registering the queue work-done callback. wgpuQueueOnSubmittedWorkDone(ctx.queue, workDoneCallbackInfo); - wait(ctx, op.future); + return promise->get_future(); } /** @@ -1141,31 +1281,59 @@ inline void toCPU(Context &ctx, Tensor &tensor, void *data, size_t bufferSize, * @param[in] bufferSize Size of the data buffer in bytes * @param[out] data Pointer to the CPU memory to copy the data to */ -inline void toCPU(Context &ctx, Tensor &tensor, void *data, size_t bufferSize) { - CopyData op; - op.future = op.promise.get_future(); - { - WGPUBufferDescriptor readbackBufferDescriptor = { - .label = {.data = nullptr, .length = 0}, - .usage = WGPUBufferUsage_CopyDst | WGPUBufferUsage_MapRead, - .size = bufferSize, - }; - op.readbackBuffer = - wgpuDeviceCreateBuffer(ctx.device, &readbackBufferDescriptor); - } - { - WGPUCommandEncoder commandEncoder; - commandEncoder = wgpuDeviceCreateCommandEncoder(ctx.device, nullptr); - wgpuCommandEncoderCopyBufferToBuffer(commandEncoder, tensor.data.buffer, 0, - op.readbackBuffer, 0, bufferSize); - op.commandBuffer = wgpuCommandEncoderFinish(commandEncoder, nullptr); - wgpuCommandEncoderRelease(commandEncoder); - check(op.commandBuffer, "Create command buffer", __FILE__, __LINE__); - } - toCPU(ctx, tensor, data, bufferSize, op); - if (op.readbackBuffer) { - wgpuBufferRelease(op.readbackBuffer); - } +inline std::future toCPU(Context &ctx, Tensor &tensor, void *data, + size_t bufferSize) { + // Create a promise that will later be satisfied when the async copy + // completes. + auto promise = std::make_shared>(); + + // Create a readback buffer that will be used for copying and mapping. + WGPUBufferDescriptor readbackBufferDescriptor = { + .label = {.data = nullptr, .length = 0}, + .usage = WGPUBufferUsage_CopyDst | WGPUBufferUsage_MapRead, + .size = bufferSize, + }; + WGPUBuffer readbackBuffer = + wgpuDeviceCreateBuffer(ctx.device, &readbackBufferDescriptor); + + // Create a command encoder and record a copy from the tensor GPU buffer + WGPUCommandEncoder commandEncoder = + wgpuDeviceCreateCommandEncoder(ctx.device, nullptr); + wgpuCommandEncoderCopyBufferToBuffer(commandEncoder, tensor.data.buffer, 0, + readbackBuffer, 0, bufferSize); + // Finish recording by creating a command buffer and release the encoder. + WGPUCommandBuffer commandBuffer = + wgpuCommandEncoderFinish(commandEncoder, nullptr); + wgpuCommandEncoderRelease(commandEncoder); + check(commandBuffer, "Create command buffer", __FILE__, __LINE__); + + // Submit the work to the queue and release the command buffer immediately. + wgpuQueueSubmit(ctx.queue, 1, &commandBuffer); + wgpuCommandBufferRelease(commandBuffer); + + // Allocate callback data + CallbackData *cbData = new CallbackData{ + readbackBuffer, // The readback buffer to map. + bufferSize, // The size of the copy. + data, // CPU memory destination. + promise // The promise to signal when done. + }; + + // Set up the work-done callback. When the queue’s submitted work is + // completed, it is routed to queueWorkDoneCallback which then starts the + // asynchronous map. 
+ WGPUQueueWorkDoneCallbackInfo workDoneCallbackInfo = { + .mode = WGPUCallbackMode_AllowSpontaneous, + .callback = queueWorkDoneCallback, + .userdata1 = cbData, + .userdata2 = nullptr, + }; + + // Register the callback. The async chain continues inside + // queueWorkDoneCallback. + wgpuQueueOnSubmittedWorkDone(ctx.queue, workDoneCallbackInfo); + + return promise->get_future(); } /** @@ -1176,76 +1344,74 @@ inline void toCPU(Context &ctx, Tensor &tensor, void *data, size_t bufferSize) { * @param[out] data Array of floats to copy the data to * * @code - * toCPU(ctx, tensor, data); + * std::future toCPUFuture = toCPU(ctx, tensor, data); + * WaitForFuture(ctx.instance, toCPUFuture); * @endcode */ template -void toCPU(Context &ctx, Tensor &tensor, std::array &data) { - toCPU(ctx, tensor, data.data(), sizeof(data)); +inline std::future toCPU(Context &ctx, Tensor &tensor, + std::array &data) { + return toCPU(ctx, tensor, data.data(), sizeof(data)); } -inline void toCPU(Context &ctx, WGPUBuffer buffer, void *data, size_t size) { +inline std::future toCPU(Context &ctx, WGPUBuffer buffer, void *data, + size_t size) { + // The size (in bytes) for the copy. uint64_t bufferSize = size; + + // Create an operation structure (here we reuse CopyData solely for its + // members that we need to create a readback buffer and command buffer). CopyData op; - op.future = op.promise.get_future(); + + // Create the promise that will be fulfilled once the copy is done. + auto promise = std::make_shared>(); + + // Create a readback buffer that we can map for reading. { WGPUBufferDescriptor readbackBufferDescriptor = { - .label = {.data = nullptr, .length = 0}, + .label = {.data = nullptr, .length = 0}, .usage = WGPUBufferUsage_CopyDst | WGPUBufferUsage_MapRead, .size = bufferSize, }; op.readbackBuffer = wgpuDeviceCreateBuffer(ctx.device, &readbackBufferDescriptor); } + + // Create a command encoder which copies from the provided buffer to the + // readback buffer. { - WGPUCommandEncoder commandEncoder; - commandEncoder = wgpuDeviceCreateCommandEncoder(ctx.device, nullptr); + WGPUCommandEncoder commandEncoder = + wgpuDeviceCreateCommandEncoder(ctx.device, nullptr); wgpuCommandEncoderCopyBufferToBuffer(commandEncoder, buffer, 0, op.readbackBuffer, 0, bufferSize); op.commandBuffer = wgpuCommandEncoderFinish(commandEncoder, nullptr); wgpuCommandEncoderRelease(commandEncoder); check(op.commandBuffer, "Create command buffer", __FILE__, __LINE__); } + + // Submit the command and release the command buffer. wgpuQueueSubmit(ctx.queue, 1, &op.commandBuffer); wgpuCommandBufferRelease(op.commandBuffer); - CallbackData callbackData = {op.readbackBuffer, static_cast(bufferSize), data, &op.promise, - &op.future}; + // Allocate callback data + CallbackData *cbData = new CallbackData{ + op.readbackBuffer, // The readback buffer created above. + static_cast(bufferSize), // Size of the copy. + data, // Destination CPU memory. + promise // Our promise to satisfy when done. + }; + + // Set up the queue work-done callback info. 
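Because each toCPU overload above now returns a std::future and allocates its own readback buffer and CallbackData, a caller can issue a readback and block only when the data is actually needed. An illustrative sketch, where the tensor and host-array names are placeholders:

    std::array<float, 8> hostA, hostB;
    std::future<void> fa = toCPU(ctx, tensorA, hostA.data(), sizeof(hostA));
    std::future<void> fb = toCPU(ctx, tensorB, hostB.data(), sizeof(hostB));
    waitForFuture(ctx.instance, fa); // pump events until the first copy lands
    waitForFuture(ctx.instance, fb); // the second readback carries its own buffer and promise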
WGPUQueueWorkDoneCallbackInfo workDoneCallbackInfo = { .mode = WGPUCallbackMode_AllowSpontaneous, - .callback = - [](WGPUQueueWorkDoneStatus status, void *userdata1, void *userdata2) { - check(status == WGPUQueueWorkDoneStatus_Success, "Queue work done", - __FILE__, __LINE__); - const auto *data = static_cast(userdata1); - WGPUBufferMapCallbackInfo mapCallbackInfo = { - .mode = WGPUCallbackMode_AllowSpontaneous, - .callback = - [](WGPUMapAsyncStatus status, WGPUStringView message, - void *userdata1, void *userdata2) { - const auto *data = static_cast(userdata1); - check(status == WGPUMapAsyncStatus_Success, - "Map readbackBuffer", __FILE__, __LINE__); - const void *mappedData = wgpuBufferGetConstMappedRange( - data->buffer, /*offset=*/0, data->bufferSize); - check(mappedData, "Get mapped range", __FILE__, __LINE__); - memcpy(data->output, mappedData, data->bufferSize); - wgpuBufferUnmap(data->buffer); - data->promise->set_value(); - }, - .userdata1 = const_cast(data), - .userdata2 = nullptr}; - wgpuBufferMapAsync(data->buffer, WGPUMapMode_Read, 0, - data->bufferSize, mapCallbackInfo); - }, - .userdata1 = &callbackData, + .callback = queueWorkDoneCallback, // Our free function callback. + .userdata1 = cbData, // Pass the callback data pointer. .userdata2 = nullptr}; + + // Start the asynchronous chain by registering the work-done callback. wgpuQueueOnSubmittedWorkDone(ctx.queue, workDoneCallbackInfo); - wait(ctx, op.future); - if (op.readbackBuffer) { - wgpuBufferRelease(op.readbackBuffer); - } + return promise->get_future(); } /** @@ -1376,6 +1542,19 @@ inline Shape cdiv(Shape total, Shape group) { return result; } +/** + * @brief Packages the shader compilation information along with a promise for asynchronous signaling. + * + * This structure holds a pointer to a CompilationInfo instance that collects + * details such as status, messages, line numbers, and positions from the shader compilation. + * It also contains a shared pointer to a std::promise which is used to signal the completion + * of the asynchronous shader compilation process. + */ +struct CompData { + CompilationInfo *compInfo; + std::shared_ptr> compPromise; +}; + /** * @brief A factory function to create a kernel on the GPU. 
The kernel is * created with the given WGSL code, input tensors, output tensor, and @@ -1399,34 +1578,38 @@ inline Shape cdiv(Shape total, Shape group) { * @return Kernel instance representing the created kernel * * @code - * Kernel kernel = createKernel(ctx, code, dataBindings, numInputs, + * std::future kernelFuture = createKernel(ctx, code, dataBindings, numInputs, output, nThreads, params, paramsSize); + * Kernel kernel = WaitForFuture(ctx.instance, kernelFuture); * @endcode - * output, nThreads, params, paramsSize); + */ -inline Kernel createKernel(Context& ctx, const KernelCode &code, - const Tensor *dataBindings, size_t numTensors, - const size_t *viewOffsets, - const Shape &totalWorkgroups, - const void *params = nullptr, size_t paramsSize = 0, - CompilationInfo *compilationInfo = nullptr, - const char *cacheKey = nullptr) { +inline std::future +createKernel(Context &ctx, const KernelCode &code, const Tensor *dataBindings, + size_t numTensors, const size_t *viewOffsets, + const Shape &totalWorkgroups, const void *params = nullptr, + size_t paramsSize = 0, CompilationInfo *compilationInfo = nullptr, + const char *cacheKey = nullptr) { // Create a cache key by the pointer values of the data bindings and the // kernel code if (cacheKey != nullptr && ctx.kernelPool.data.find(cacheKey) != ctx.kernelPool.data.end()) { - LOG(kDefLog, kInfo, "Kernel cache hit"); - return ctx.kernelPool.data[cacheKey]; + std::promise ready; + ready.set_value(ctx.kernelPool.data[cacheKey]); + return ready.get_future(); } + // Create an outer promise for the new kernel. + std::promise outerPromise; + std::future outerFuture = outerPromise.get_future(); + assert(totalWorkgroups.rank == 3); WGPUDevice device = ctx.device; WGPUQueue queue = ctx.queue; Kernel op(new RawKernel()); - // paramIndex is the index into bgLayoutEntries for the parameters buffer If // there are no parameters for the kernel, paramsSize == 0 and paramIndex is // effectively undefined (== -1) - size_t paramIndex = -1; + size_t paramIndex = static_cast(-1); // Note: paramIndex is undefined unless paramsSize > 0 size_t numBindings = numTensors; if (paramsSize > 0) { @@ -1435,11 +1618,13 @@ inline Kernel createKernel(Context& ctx, const KernelCode &code, // op.buffers, op.bufferSizes and // bgLayoutEntries } + op->buffers = std::make_unique(numBindings); op->bufferSizes = std::make_unique(numBindings); op->numBindings = numBindings; - std::vector bgLayoutEntries(numBindings); + // Create layout entries for input buffers + std::vector bgLayoutEntries(numBindings); for (size_t i = 0; i < numTensors; ++i) { bgLayoutEntries[i] = WGPUBindGroupLayoutEntry{ .binding = static_cast(i), @@ -1452,8 +1637,6 @@ inline Kernel createKernel(Context& ctx, const KernelCode &code, }; } if (paramsSize > 0) { - LOG(kDefLog, kInfo, "Create layout entry for the params buffer"); - // Create layout entry for the params buffer bgLayoutEntries[paramIndex] = WGPUBindGroupLayoutEntry{ .binding = static_cast(paramIndex), .visibility = WGPUShaderStage_Compute, @@ -1466,10 +1649,11 @@ inline Kernel createKernel(Context& ctx, const KernelCode &code, } WGPUBindGroupLayoutDescriptor bgLayoutDesc = { .entryCount = static_cast(bgLayoutEntries.size()), - .entries = bgLayoutEntries.data(), - }; + .entries = bgLayoutEntries.data()}; WGPUBindGroupLayout bgLayout = wgpuDeviceCreateBindGroupLayout(device, &bgLayoutDesc); + + // Assign buffers from dataBindings. 
for (size_t i = 0; i < numTensors; ++i) { op->buffers[i] = dataBindings[i].data.buffer; op->bufferSizes[i] = dataBindings[i].data.size; @@ -1477,7 +1661,7 @@ inline Kernel createKernel(Context& ctx, const KernelCode &code, // Create a buffer for the Params struct if (paramsSize > 0) { WGPUBufferDescriptor paramsBufferDesc = { - .label = {.data = nullptr, .length = 0}, + .label = {.data = nullptr, .length = 0}, .usage = WGPUBufferUsage_Uniform | WGPUBufferUsage_CopyDst, .size = paramsSize, .mappedAtCreation = false, @@ -1489,6 +1673,8 @@ inline Kernel createKernel(Context& ctx, const KernelCode &code, } else { LOG(kDefLog, kTrace, "No params buffer needed"); } + + // Build bind group entries and the bind group. std::vector bindGroupEntries(numBindings); for (size_t i = 0; i < numTensors; ++i) { bindGroupEntries[i] = WGPUBindGroupEntry{ @@ -1516,6 +1702,7 @@ inline Kernel createKernel(Context& ctx, const KernelCode &code, }; op->bindGroup = wgpuDeviceCreateBindGroup(device, &bindGroupDesc); + // Create pipeline layout. WGPUPipelineLayoutDescriptor pipelineLayoutDesc = { .bindGroupLayoutCount = 1, .bindGroupLayouts = &bgLayout, @@ -1523,63 +1710,101 @@ inline Kernel createKernel(Context& ctx, const KernelCode &code, WGPUPipelineLayout pipelineLayout = wgpuDeviceCreatePipelineLayout(device, &pipelineLayoutDesc); + // Prepare the WGSL source and shader module descriptor. WGPUShaderSourceWGSL wgslDesc = { .chain = {.sType = WGPUSType_ShaderSourceWGSL}, .code = {.data = code.data.c_str(), .length = code.data.length()}}; - WGPUShaderModuleDescriptor shaderModuleDesc = {}; shaderModuleDesc.nextInChain = &wgslDesc.chain; shaderModuleDesc.label = {code.label.c_str(), code.label.length()}; - WGPUComputePipelineDescriptor computePipelineDesc = {}; - computePipelineDesc.layout = pipelineLayout; - computePipelineDesc.compute.module = + // Create the shader module. + WGPUShaderModule shaderModule = wgpuDeviceCreateShaderModule(device, &shaderModuleDesc); + // If compilation info is requested, register the callback immediately. + if (compilationInfo) { + auto compPromise = std::make_shared>(); + std::future compFuture = compPromise->get_future(); + // Allocate helper data to pass to the callback. + auto *compData = new CompData{compilationInfo, compPromise}; + + auto compilationCallback = [](WGPUCompilationInfoRequestStatus status, + WGPUCompilationInfo const *info, + void *userdata1, void * /*userdata2*/) { + CompData *cd = reinterpret_cast(userdata1); + if (info && cd->compInfo) { + cd->compInfo->status = status; + for (uint32_t i = 0; i < info->messageCount; ++i) { + cd->compInfo->messages.push_back( + std::string(info->messages[i].message.data, + info->messages[i].message.length)); + cd->compInfo->lineNums.push_back(info->messages[i].lineNum); + cd->compInfo->linePos.push_back(info->messages[i].linePos); + } + cd->compInfo->finished = true; + } + cd->compPromise->set_value(); + delete cd; + }; + + WGPUCompilationInfoCallbackInfo compilationCallbackInfo = {}; + compilationCallbackInfo.mode = WGPUCallbackMode_AllowSpontaneous; + compilationCallbackInfo.callback = compilationCallback; + compilationCallbackInfo.userdata1 = compData; + compilationCallbackInfo.userdata2 = nullptr; + + // Register callback and then wait for the result. + wgpuShaderModuleGetCompilationInfo(shaderModule, compilationCallbackInfo); + waitForFuture(ctx.instance, compFuture); + } + + // Now create the compute pipeline using the shader module. 
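Passing a CompilationInfo pointer makes createKernel() wait on the shader compilation report before building the pipeline, so WGSL diagnostics can be inspected after the kernel future resolves. A usage sketch, assuming the library's empty-parameter placeholder type NoParam and the kGelu/Bindings names used in the examples:

    CompilationInfo ci;
    std::future<Kernel> kf = createKernel(ctx, {kGelu, 256, kf32},
                                          Bindings{input, output},
                                          {cdiv(N, 256), 1, 1}, NoParam{}, &ci);
    Kernel op = waitForFuture(ctx.instance, kf);
    for (size_t i = 0; i < ci.messages.size(); ++i) {
      LOG(kDefLog, kInfo, "WGSL %zu:%zu %s",
          static_cast<size_t>(ci.lineNums[i]),
          static_cast<size_t>(ci.linePos[i]), ci.messages[i].c_str());
    }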
+ WGPUComputePipelineDescriptor computePipelineDesc = {}; + computePipelineDesc.layout = pipelineLayout; + computePipelineDesc.compute.module = shaderModule; computePipelineDesc.compute.entryPoint = {code.entryPoint.c_str(), code.entryPoint.length()}; computePipelineDesc.label = {code.label.c_str(), code.label.length()}; - op->computePipeline = wgpuDeviceCreateComputePipeline(device, &computePipelineDesc); + op->totalWorkgroups = {totalWorkgroups[0], totalWorkgroups[1], totalWorkgroups[2]}; + resetCommandBuffer(device, op); if (cacheKey != nullptr) ctx.kernelPool.data[cacheKey] = op; - auto compilationInfoCallback = [](WGPUCompilationInfoRequestStatus status, - WGPUCompilationInfo const *compilationInfo, - void *userdata1, void *userdata2) { - CompilationInfo *result = static_cast(userdata1); - if (compilationInfo && result) { - result->status = status; - for (uint32_t i = 0; i < compilationInfo->messageCount; ++i) { - printf("Message %d: %.*s\n", i, - static_cast(compilationInfo->messages[i].message.length), - compilationInfo->messages[i].message.data); - result->messages.push_back( - std::string(compilationInfo->messages[i].message.data, - compilationInfo->messages[i].message.length)); - result->lineNums.push_back(compilationInfo->messages[i].lineNum); - result->linePos.push_back(compilationInfo->messages[i].linePos); - } - result->finished = true; - } else { - LOG(kDefLog, kTrace, "No compilation info or result"); - } - }; - - WGPUCompilationInfoCallbackInfo compilationCallbackInfo = { - .mode = WGPUCallbackMode_AllowSpontaneous, - .callback = compilationInfoCallback, - .userdata1 = static_cast(compilationInfo), - .userdata2 = nullptr}; + outerPromise.set_value(op); + return outerFuture; +} - while (compilationInfo && !compilationInfo->finished) { - processEvents(ctx.instance); +/** + * @brief Free‑standing callback for dispatchKernel’s asynchronous work‐done. + * + * This callback is invoked when the GPU queue signals the completion of the submitted + * workload for a kernel dispatch. It receives the work-done status and a userdata pointer, + * which is expected to be a heap‑allocated pointer to a std::promise. + * + * On success, the promise is fulfilled by calling set_value(). Otherwise, it is set with an exception. + * After setting the promise state, the allocated memory for the promise is freed. + * + * @param status The status of the work done. Expected to be WGPUQueueWorkDoneStatus_Success on success. + * @param userdata1 A heap allocated pointer to std::promise which is set when the work is done. + * @param userdata2 Unused. + */ +inline void dispatchKernelCallback(WGPUQueueWorkDoneStatus status, + void *userdata1, void * /*userdata2*/) { + // Cast the userdata pointer back to our heap‑allocated promise. 
+ auto *p = reinterpret_cast *>(userdata1); + if (status == WGPUQueueWorkDoneStatus_Success) { + p->set_value(); + } else { + p->set_exception(std::make_exception_ptr( + std::runtime_error("Queue work did not complete successfully."))); } - return op; + delete p; // free the heap allocation } /** @@ -1599,17 +1824,17 @@ inline Kernel createKernel(Context& ctx, const KernelCode &code, * @return Kernel instance representing the created kernel * * @code - * Kernel kernel = createKernel(ctx, code, tensorData, output, + * std::future kernelFuture = createKernel(ctx, code, tensorData, output,totalWorkgroups, params); + * Kernel kernel = WaitForFuture(ctx.instance, kernelFuture); * @endcode - * totalWorkgroups, params); */ template -Kernel createKernel(Context &ctx, const KernelCode &code, - const Bindings &dataBindings, - const Shape &totalWorkgroups, - const ParamsType ¶ms = ParamsType{}, - CompilationInfo *compilationInfo = nullptr, - const char *cacheKey = nullptr) { +std::future createKernel(Context &ctx, const KernelCode &code, + const Bindings &dataBindings, + const Shape &totalWorkgroups, + const ParamsType ¶ms = ParamsType{}, + CompilationInfo *compilationInfo = nullptr, + const char *cacheKey = nullptr) { if constexpr (!IsNoParam) { return createKernel(ctx, code, dataBindings.data.data(), numInputs, dataBindings.viewOffsets.data(), totalWorkgroups, @@ -1637,30 +1862,37 @@ Kernel createKernel(Context &ctx, const KernelCode &code, * @param[in] promise Promise to set when the kernel has finished executing * * @code - * dispatchKernel(ctx, kernel); + * std::future dispatchFuture = dispatchKernel(ctx, kernel); + * WaitForFuture(ctx.instance, dispatchFuture); * @endcode */ -inline void dispatchKernel(Context &ctx, Kernel &kernel, - std::promise &promise) { +inline std::future dispatchKernel(Context &ctx, Kernel &kernel) { + // If the kernel was used before, reset the command buffer. if (kernel->used) { resetCommandBuffer(ctx.device, kernel); } + + // Submit the command buffer and release it. wgpuQueueSubmit(ctx.queue, 1, &kernel->commandBuffer); wgpuCommandBufferRelease(kernel->commandBuffer); kernel->used = true; - WGPUQueueWorkDoneCallbackInfo workDoneCallbackInfo = { - .mode = WGPUCallbackMode_AllowSpontaneous, - .callback = - [](WGPUQueueWorkDoneStatus status, void *userdata1, void *userdata2) { - check(status == WGPUQueueWorkDoneStatus_Success, "Queue work done", - __FILE__, __LINE__); - auto *promise = static_cast *>(userdata1); - promise->set_value(); - }, - .userdata1 = &promise, - .userdata2 = nullptr}; + // Allocate a promise on the heap so it remains valid beyond this function’s + // scope. + std::promise *promise = new std::promise(); + std::future future = promise->get_future(); + + // Set up the callback info. + WGPUQueueWorkDoneCallbackInfo workDoneCallbackInfo = {}; + workDoneCallbackInfo.mode = WGPUCallbackMode_AllowSpontaneous; + workDoneCallbackInfo.callback = dispatchKernelCallback; + workDoneCallbackInfo.userdata1 = reinterpret_cast(promise); + workDoneCallbackInfo.userdata2 = nullptr; + + // IMPORTANT: Pass the address of the callback info structure. 
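Since dispatchKernel() now hands back a std::future instead of filling a caller-supplied promise, CPU-side work can be interleaved between submitting the dispatch and blocking on its completion. A brief sketch, where prepareNextBatch() is a hypothetical placeholder for any host-side work:

    std::future<void> done = dispatchKernel(ctx, op);
    prepareNextBatch();                 // hypothetical CPU-side work while the GPU runs
    waitForFuture(ctx.instance, done);  // pump events until the queue signals completion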
wgpuQueueOnSubmittedWorkDone(ctx.queue, workDoneCallbackInfo); + + return future; } } // namespace gpu diff --git a/numeric_types/half.cpp b/numeric_types/half.cpp index fe5aab7..75d9dc4 100644 --- a/numeric_types/half.cpp +++ b/numeric_types/half.cpp @@ -189,7 +189,8 @@ void testContainers() { std::array h = {1.0f, 0.5f, 2.0f, 3.14f, 1.0, 2.0, 3.0, 4.0}; Tensor devH = createTensor(ctx, {h.size()}, kf16, h.data()); std::array h2; - toCPU(ctx, devH, h2.data(), sizeof(h2)); + std::future toCPUFuture = toCPU(ctx, devH, h2.data(), sizeof(h2)); + waitForFuture(ctx.instance, toCPUFuture); for (int i = 0; i < 8; ++i) { printResult(h[i].data == h2[i].data, "Container round trip", static_cast(h[i]), static_cast(h2[i])); @@ -228,13 +229,13 @@ fn main( } Tensor input = createTensor(ctx, Shape{N}, kf16, inputArr.data()); Tensor output = createTensor(ctx, Shape{N}, kf16); - std::promise promise; - std::future future = promise.get_future(); - Kernel op = createKernel(ctx, {kGelu, 256, kf16}, Bindings{input, output}, + std::future kernelFuture = createKernel(ctx, {kGelu, 256, kf16}, Bindings{input, output}, {cdiv(N, 256), 1, 1}); - dispatchKernel(ctx, op, promise); - wait(ctx, future); - toCPU(ctx, output, outputArr.data(), sizeof(outputArr)); + Kernel op = waitForFuture(ctx.instance, kernelFuture); + std::future dispatchFuture = dispatchKernel(ctx, op); + waitForFuture(ctx.instance, dispatchFuture); + std::future toCPUFuture = toCPU(ctx, output, outputArr.data(), sizeof(outputArr)); + waitForFuture(ctx.instance, toCPUFuture); for (int i = 0; i < 12; ++i) { printf(" gelu(%.2f) = %.2f\n", static_cast(inputArr[i]), static_cast(outputArr[i])); From 14e7ab59a67329573bc69a7dfce5d431ba8777b3 Mon Sep 17 00:00:00 2001 From: MichealReed Date: Wed, 19 Feb 2025 18:06:26 -0600 Subject: [PATCH 15/30] use async context waitForContext() --- cmake/example.cmake | 4 +- examples/hello_world/run.cpp | 4 +- gpu.hpp | 279 ++++++++++++++++++++++------------- numeric_types/half.cpp | 5 +- 4 files changed, 180 insertions(+), 112 deletions(-) diff --git a/cmake/example.cmake b/cmake/example.cmake index 7cf1f8d..5953876 100644 --- a/cmake/example.cmake +++ b/cmake/example.cmake @@ -45,14 +45,16 @@ if(EMSCRIPTEN) # Set Emscripten-specific link flags that enable WASM output and expose certain symbols. 
# Needed to use updated version, emdawnwebgpu set_target_properties(${PROJECT_NAME} PROPERTIES LINK_FLAGS "\ + -O3 \ -sUSE_WEBGPU=0 \ -sWASM=1 \ -DDAWN_EMSCRIPTEN_TOOLCHAIN=${EMSCRIPTEN_DIR} \ -sEXPORTED_FUNCTIONS=_main,_malloc,_free,_memcpy \ -sEXPORTED_RUNTIME_METHODS=ccall \ -sUSE_GLFW=3 \ - -sALLOW_MEMORY_GROWTH=1 -sSTACK_SIZE=5MB \ + -sALLOW_MEMORY_GROWTH=1 -sSTACK_SIZE=15MB \ -sASYNCIFY \ + -sASYNCIFY_DEBUG \ --js-library=${DAWN_BUILD_DIR}/gen/src/emdawnwebgpu/library_webgpu_enum_tables.js \ --js-library=${DAWN_BUILD_DIR}/gen/src/emdawnwebgpu/library_webgpu_generated_struct_info.js \ --js-library=${DAWN_BUILD_DIR}/gen/src/emdawnwebgpu/library_webgpu_generated_sig_info.js \ diff --git a/examples/hello_world/run.cpp b/examples/hello_world/run.cpp index 06970a7..c9f22c7 100644 --- a/examples/hello_world/run.cpp +++ b/examples/hello_world/run.cpp @@ -28,7 +28,7 @@ int main(int argc, char **argv) { printf("--------------\n\n"); // std::unique_ptr ctx = createContext(); - Context ctx = createContext(); + Context ctx = waitForContext(); static constexpr size_t N = 10000; std::array inputArr, outputArr; for (int i = 0; i < N; ++i) { @@ -36,8 +36,6 @@ int main(int argc, char **argv) { } Tensor input = createTensor(ctx, Shape{N}, kf32, inputArr.data()); Tensor output = createTensor(ctx, Shape{N}, kf32); - std::promise promise; - std::future future = promise.get_future(); std::future kernelFuture = createKernel(ctx, {kGelu, 256, kf32}, Bindings{input, output}, {cdiv(N, 256), 1, 1}); diff --git a/gpu.hpp b/gpu.hpp index 052c674..0119108 100644 --- a/gpu.hpp +++ b/gpu.hpp @@ -793,10 +793,11 @@ inline void check(bool condition, const char *message, /** * @brief Pumps events until the provided future is ready. * - * This helper template function continuously checks the status of the provided std::future - * until it becomes ready. On Emscripten builds, it yields control to the JavaScript event loop - * using emscripten_sleep to allow asynchronous callbacks to execute. On other platforms, it - * processes events from the given WGPUInstance using wgpuInstanceProcessEvents. Once the future + * This helper template function continuously checks the status of the provided + * std::future until it becomes ready. On Emscripten builds, it yields + * control to the JavaScript event loop using emscripten_sleep to allow + * asynchronous callbacks to execute. On other platforms, it processes events + * from the given WGPUInstance using wgpuInstanceProcessEvents. Once the future * is ready, its value is returned. * * @tparam T The type of the value contained in the future. @@ -805,8 +806,8 @@ inline void check(bool condition, const char *message, * @return T The value retrieved from the ready future. * * @code - * std::future deviceFuture = requestDeviceAsync(adapter, devDescriptor); - * WGPUDevice device = waitForFuture(instance, deviceFuture); + * std::future deviceFuture = requestDeviceAsync(adapter, + * devDescriptor); WGPUDevice device = waitForFuture(instance, deviceFuture); * @endcode */ template @@ -831,17 +832,56 @@ T waitForFuture(WGPUInstance instance, std::future &f) { // Context Callbacks & Helpers /** - * @brief Adapter callback function invoked upon completion of an asynchronous WebGPU adapter request. + * @brief Waits for the provided std::future to become ready by polling its status. * - * This callback is triggered when the request for a WebGPU adapter completes. It verifies whether - * the adapter was successfully obtained. 
On failure, it logs an error message (in Emscripten builds) - * and sets an exception on the associated promise. On success, it sets the value of the promise with - * the obtained adapter. Finally, it frees the allocated memory for the promise pointer. + * This helper template function continuously checks the status of the provided std::future until it is ready. + * On Emscripten builds, it yields control to the JavaScript event loop using emscripten_sleep(1) for smooth asynchronous behavior. + * On non-Emscripten platforms, it sleeps for a short duration (10 milliseconds) between checks. + * Once the future is ready, its value is returned. * - * @param status The status of the adapter request. Expected to be WGPURequestAdapterStatus_Success on success. + * @tparam T The type of the value contained in the future. + * @param f The future to wait on. + * @return T The value retrieved from the ready future. + * + * @code + * std::future contextFuture = createContext(); + * Context ctx = waitForContextFuture(contextFuture); + * @endcode + */ +template T waitForContextFuture(std::future &f) { + #ifdef __EMSCRIPTEN__ + while (f.wait_for(std::chrono::milliseconds(0)) != + std::future_status::ready) { + emscripten_sleep(1); // Yield back to the JS event loop. + } + return f.get(); + #else + while (f.wait_for(std::chrono::milliseconds(0)) != + std::future_status::ready) { + std::this_thread::sleep_for(std::chrono::milliseconds(10)); + } + return f.get(); + #endif + } + +/** + * @brief Adapter callback function invoked upon completion of an asynchronous + * WebGPU adapter request. + * + * This callback is triggered when the request for a WebGPU adapter completes. + * It verifies whether the adapter was successfully obtained. On failure, it + * logs an error message (in Emscripten builds) and sets an exception on the + * associated promise. On success, it sets the value of the promise with the + * obtained adapter. Finally, it frees the allocated memory for the promise + * pointer. + * + * @param status The status of the adapter request. Expected to be + * WGPURequestAdapterStatus_Success on success. * @param adapter The WGPUAdapter obtained on a successful request. - * @param message A string view containing additional information about the adapter request. - * @param userdata1 A pointer to a heap-allocated std::shared_ptr>. + * @param message A string view containing additional information about the + * adapter request. + * @param userdata1 A pointer to a heap-allocated + * std::shared_ptr>. * @param userdata2 Unused. */ inline void adapterCallback(WGPURequestAdapterStatus status, @@ -864,17 +904,22 @@ inline void adapterCallback(WGPURequestAdapterStatus status, } /** - * @brief Callback function invoked upon completion of an asynchronous WebGPU device request. + * @brief Callback function invoked upon completion of an asynchronous WebGPU + * device request. * - * This callback is triggered when the request for a WebGPU device completes. It verifies that - * the device was successfully created. On success, the callback sets the value of the associated - * promise; otherwise, it sets an exception. After fulfilling the promise, it frees the allocated - * memory for the promise pointer. + * This callback is triggered when the request for a WebGPU device completes. It + * verifies that the device was successfully created. On success, the callback + * sets the value of the associated promise; otherwise, it sets an exception. 
+ * After fulfilling the promise, it frees the allocated memory for the promise + * pointer. * - * @param status The status of the device request. Expected to be WGPURequestDeviceStatus_Success on success. + * @param status The status of the device request. Expected to be + * WGPURequestDeviceStatus_Success on success. * @param device The WGPUDevice obtained on successful request. - * @param message A string view containing additional information about the device request. - * @param userdata1 A pointer to a heap-allocated std::shared_ptr>. + * @param message A string view containing additional information about the + * device request. + * @param userdata1 A pointer to a heap-allocated + * std::shared_ptr>. * @param userdata2 Unused. */ inline void deviceCallback(WGPURequestDeviceStatus status, WGPUDevice device, @@ -897,13 +942,14 @@ inline void deviceCallback(WGPURequestDeviceStatus status, WGPUDevice device, /** * @brief Asynchronously requests a WebGPU adapter from the given instance. * - * This helper function wraps the asynchronous call to request an adapter using the WebGPU API. - * It sets up a promise and registers an adapter callback, returning a future that will eventually - * hold the requested WGPUAdapter. + * This helper function wraps the asynchronous call to request an adapter using + * the WebGPU API. It sets up a promise and registers an adapter callback, + * returning a future that will eventually hold the requested WGPUAdapter. * * @param instance The WGPUInstance from which to request the adapter. * @param adapterOpts The options for requesting the adapter. - * @return std::future A future that will eventually hold the created WGPUAdapter. + * @return std::future A future that will eventually hold the + * created WGPUAdapter. */ inline std::future requestAdapterAsync(WGPUInstance instance, @@ -923,13 +969,15 @@ requestAdapterAsync(WGPUInstance instance, /** * @brief Asynchronously requests a WebGPU device from a given adapter. * - * This helper function wraps the asynchronous call to request a device using the WebGPU API. - * It sets up a promise and registers a device callback, returning a future that will be fulfilled - * once the device is available. + * This helper function wraps the asynchronous call to request a device using + * the WebGPU API. It sets up a promise and registers a device callback, + * returning a future that will be fulfilled once the device is available. * * @param adapter The WGPUAdapter to request the device from. - * @param devDescriptor The descriptor specifying the characteristics of the requested device. - * @return std::future A future that will eventually hold the created WGPUDevice. + * @param devDescriptor The descriptor specifying the characteristics of the + * requested device. + * @return std::future A future that will eventually hold the + * created WGPUDevice. */ inline std::future requestDeviceAsync(WGPUAdapter adapter, @@ -964,60 +1012,62 @@ requestDeviceAsync(WGPUAdapter adapter, * @return Context instance representing the created GPU context * */ -inline Context createContext(const WGPUInstanceDescriptor &desc = {}, - const WGPURequestAdapterOptions &adapterOpts = {}, - const WGPUDeviceDescriptor &devDescriptor = {}) { - Context ctx; // Stack-allocated Context. 
+inline std::future +createContext(const WGPUInstanceDescriptor &desc = {}, + const WGPURequestAdapterOptions &adapterOpts = {}, + const WGPUDeviceDescriptor &devDescriptor = {}) { -#ifdef __EMSCRIPTEN__ - ctx.instance = wgpuCreateInstance(nullptr); -#else - ctx.instance = wgpuCreateInstance(&desc); -#endif - check(ctx.instance, "Initialize WebGPU", __FILE__, __LINE__); + auto promise = std::make_shared>(); - // Request the adapter asynchronously. - LOG(kDefLog, kTrace, "Requesting adapter"); + // On native platforms, run our context creation in a detached thread. + + Context ctx; + ctx.instance = wgpuCreateInstance(&desc); + if (!ctx.instance) { + promise->set_exception(std::make_exception_ptr( + std::runtime_error("Failed to create WebGPU instance."))); + return promise->get_future(); + } try { auto adapterFuture = requestAdapterAsync(ctx.instance, adapterOpts); - // Pump events until the adapter future is ready. ctx.adapter = waitForFuture(ctx.instance, adapterFuture); ctx.adapterStatus = WGPURequestAdapterStatus_Success; } catch (const std::exception &ex) { - check(false, ex.what(), __FILE__, __LINE__); + promise->set_exception(std::make_exception_ptr(ex)); + return promise->get_future(); } - - // Request the device asynchronously. - LOG(kDefLog, kTrace, "Requesting device"); try { auto deviceFuture = requestDeviceAsync(ctx.adapter, devDescriptor); - // Pump events until the device future is ready. ctx.device = waitForFuture(ctx.instance, deviceFuture); ctx.deviceStatus = WGPURequestDeviceStatus_Success; - LOG(kDefLog, kTrace, "Device request ended"); - - // If the device was created, set up logging and fetch the queue. -#ifndef __EMSCRIPTEN__ - WGPULoggingCallbackInfo loggingCallbackInfo{ - .nextInChain = nullptr, - .callback = - [](WGPULoggingType type, WGPUStringView message, void *, void *) { - LOG(kDefLog, kError, "Device logging callback: %.*s", - static_cast(message.length), message.data); - if (type == WGPULoggingType_Error) { - throw std::runtime_error("Device error logged."); - } - }, - .userdata1 = nullptr, - .userdata2 = nullptr}; - wgpuDeviceSetLoggingCallback(ctx.device, loggingCallbackInfo); -#endif - ctx.queue = wgpuDeviceGetQueue(ctx.device); } catch (const std::exception &ex) { - check(false, ex.what(), __FILE__, __LINE__); + promise->set_exception(std::make_exception_ptr(ex)); + return promise->get_future(); } + ctx.queue = wgpuDeviceGetQueue(ctx.device); + promise->set_value(std::move(ctx)); + + return promise->get_future(); +} - return std::move(ctx); +/** + * @brief Synchronously waits for and returns the created GPU context. + * + * This function invokes the asynchronous createContext() factory function to create a GPU + * context, then waits for its completion using waitForContextFuture. The returned Context + * holds handles to the WebGPU instance, adapter, device, and queue, and is used for subsequent + * GPU operations. + * + * @return Context The fully initialized GPU context. + * + * @code + * Context ctx = waitForContext(); + * // Now ctx can be used for GPU operations. + * @endcode + */ +inline Context waitForContext() { + std::future contextFuture = createContext(); + return waitForContextFuture(contextFuture); } #ifdef USE_DAWN_API @@ -1152,17 +1202,22 @@ createContextByGpuIdx(int gpuIdx, const WGPUInstanceDescriptor &desc = {}, #endif /** - * @brief Callback function invoked upon completion of an asynchronous GPU buffer mapping. - * - * This callback is triggered when the GPU buffer mapping for a readback buffer is completed. 
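With createContext() returning a std::future of the Context, callers can either use the blocking convenience wrapper or resolve the future explicitly with waitForContextFuture(), as the numeric_types/half.cpp test does when it passes a custom device descriptor. Both styles in a short sketch:

    // Convenience wrapper: create and block in one call.
    Context ctx = waitForContext();

    // Explicit form: keep the future and resolve it yourself, e.g. when
    // supplying a device descriptor as numeric_types/half.cpp does for f16.
    std::future<Context> pending = createContext();
    Context ctx2 = waitForContextFuture(pending);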
- * It verifies that the mapping operation was successful, retrieves the mapped memory, - * copies the data from the GPU buffer to a CPU memory region, unmaps the buffer, - * signals the completion by fulfilling the associated promise, and cleans up the allocated callback data. - * - * @param status The mapping status. Expected to be WGPUMapAsyncStatus_Success on success. - * @param message A string view containing additional information about the mapping operation. - * @param userdata1 A pointer to a heap-allocated CallbackData structure containing the GPU buffer, - * buffer size, destination CPU memory pointer, and a promise for signaling completion. + * @brief Callback function invoked upon completion of an asynchronous GPU + * buffer mapping. + * + * This callback is triggered when the GPU buffer mapping for a readback buffer + * is completed. It verifies that the mapping operation was successful, + * retrieves the mapped memory, copies the data from the GPU buffer to a CPU + * memory region, unmaps the buffer, signals the completion by fulfilling the + * associated promise, and cleans up the allocated callback data. + * + * @param status The mapping status. Expected to be WGPUMapAsyncStatus_Success + * on success. + * @param message A string view containing additional information about the + * mapping operation. + * @param userdata1 A pointer to a heap-allocated CallbackData structure + * containing the GPU buffer, buffer size, destination CPU memory pointer, and a + * promise for signaling completion. * @param userdata2 Unused. */ inline void bufferMapCallback(WGPUMapAsyncStatus status, WGPUStringView message, @@ -1192,16 +1247,20 @@ inline void bufferMapCallback(WGPUMapAsyncStatus status, WGPUStringView message, } /** - * @brief Callback function invoked when the GPU queue’s submitted work is complete. - * - * This callback is registered with the GPU queue after submitting work. When invoked, - * it verifies that all queued work completed successfully, and then sets up the buffer - * mapping callback to initiate the asynchronous mapping of a readback buffer. The readback - * buffer is mapped to access the processed data on the CPU. - * - * @param status The status of the completed work. Expected to be WGPUQueueWorkDoneStatus_Success on success. - * @param userdata1 A pointer to a heap-allocated CallbackData structure containing the readback buffer, - * buffer size, destination CPU memory pointer, and a promise to signal completion. + * @brief Callback function invoked when the GPU queue’s submitted work is + * complete. + * + * This callback is registered with the GPU queue after submitting work. When + * invoked, it verifies that all queued work completed successfully, and then + * sets up the buffer mapping callback to initiate the asynchronous mapping of a + * readback buffer. The readback buffer is mapped to access the processed data + * on the CPU. + * + * @param status The status of the completed work. Expected to be + * WGPUQueueWorkDoneStatus_Success on success. + * @param userdata1 A pointer to a heap-allocated CallbackData structure + * containing the readback buffer, buffer size, destination CPU memory pointer, + * and a promise to signal completion. * @param userdata2 Unused. */ inline void queueWorkDoneCallback(WGPUQueueWorkDoneStatus status, @@ -1543,12 +1602,14 @@ inline Shape cdiv(Shape total, Shape group) { } /** - * @brief Packages the shader compilation information along with a promise for asynchronous signaling. 
+ * @brief Packages the shader compilation information along with a promise for + * asynchronous signaling. * * This structure holds a pointer to a CompilationInfo instance that collects - * details such as status, messages, line numbers, and positions from the shader compilation. - * It also contains a shared pointer to a std::promise which is used to signal the completion - * of the asynchronous shader compilation process. + * details such as status, messages, line numbers, and positions from the shader + * compilation. It also contains a shared pointer to a std::promise which + * is used to signal the completion of the asynchronous shader compilation + * process. */ struct CompData { CompilationInfo *compInfo; @@ -1578,10 +1639,11 @@ struct CompData { * @return Kernel instance representing the created kernel * * @code - * std::future kernelFuture = createKernel(ctx, code, dataBindings, numInputs, output, nThreads, params, paramsSize); + * std::future kernelFuture = createKernel(ctx, code, dataBindings, + numInputs, output, nThreads, params, paramsSize); * Kernel kernel = WaitForFuture(ctx.instance, kernelFuture); * @endcode - + */ inline std::future createKernel(Context &ctx, const KernelCode &code, const Tensor *dataBindings, @@ -1783,15 +1845,19 @@ createKernel(Context &ctx, const KernelCode &code, const Tensor *dataBindings, /** * @brief Free‑standing callback for dispatchKernel’s asynchronous work‐done. * - * This callback is invoked when the GPU queue signals the completion of the submitted - * workload for a kernel dispatch. It receives the work-done status and a userdata pointer, - * which is expected to be a heap‑allocated pointer to a std::promise. + * This callback is invoked when the GPU queue signals the completion of the + * submitted workload for a kernel dispatch. It receives the work-done status + * and a userdata pointer, which is expected to be a heap‑allocated pointer to a + * std::promise. * - * On success, the promise is fulfilled by calling set_value(). Otherwise, it is set with an exception. - * After setting the promise state, the allocated memory for the promise is freed. + * On success, the promise is fulfilled by calling set_value(). Otherwise, it is + * set with an exception. After setting the promise state, the allocated memory + * for the promise is freed. * - * @param status The status of the work done. Expected to be WGPUQueueWorkDoneStatus_Success on success. - * @param userdata1 A heap allocated pointer to std::promise which is set when the work is done. + * @param status The status of the work done. Expected to be + * WGPUQueueWorkDoneStatus_Success on success. + * @param userdata1 A heap allocated pointer to std::promise which is set + * when the work is done. * @param userdata2 Unused. 
*/ inline void dispatchKernelCallback(WGPUQueueWorkDoneStatus status, @@ -1824,8 +1890,9 @@ inline void dispatchKernelCallback(WGPUQueueWorkDoneStatus status, * @return Kernel instance representing the created kernel * * @code - * std::future kernelFuture = createKernel(ctx, code, tensorData, output,totalWorkgroups, params); - * Kernel kernel = WaitForFuture(ctx.instance, kernelFuture); + * std::future kernelFuture = createKernel(ctx, code, tensorData, + * output,totalWorkgroups, params); Kernel kernel = WaitForFuture(ctx.instance, + * kernelFuture); * @endcode */ template diff --git a/numeric_types/half.cpp b/numeric_types/half.cpp index 75d9dc4..21a0005 100644 --- a/numeric_types/half.cpp +++ b/numeric_types/half.cpp @@ -185,7 +185,7 @@ void testContainers() { testRoundTrip(h[3]); } { - Context ctx = createContext(); + Context ctx = waitForContext(); std::array h = {1.0f, 0.5f, 2.0f, 3.14f, 1.0, 2.0, 3.0, 4.0}; Tensor devH = createTensor(ctx, {h.size()}, kf16, h.data()); std::array h2; @@ -215,13 +215,14 @@ fn main( } } )"; - Context ctx = createContext( + std::future futureContext = createContext( {}, {}, /*device descriptor, enabling f16 in WGSL*/ { .requiredFeatureCount = 1, .requiredFeatures = std::array{WGPUFeatureName_ShaderF16}.data(), }); + Context ctx = waitForContextFuture(futureContext); static constexpr size_t N = 10000; std::array inputArr, outputArr; for (int i = 0; i < N; ++i) { From 9a08f8a875d74fda1644adbb367edfdf2f70838a Mon Sep 17 00:00:00 2001 From: MichealReed Date: Thu, 20 Feb 2025 13:54:02 -0600 Subject: [PATCH 16/30] adds sync wrappers --- cmake/example.cmake | 1 - examples/hello_world/run.cpp | 11 +- examples/render/run.cpp | 6 +- gpu.hpp | 372 +++++++++++++++++++++++++---------- numeric_types/half.cpp | 19 +- 5 files changed, 283 insertions(+), 126 deletions(-) diff --git a/cmake/example.cmake b/cmake/example.cmake index 5953876..cf697b5 100644 --- a/cmake/example.cmake +++ b/cmake/example.cmake @@ -54,7 +54,6 @@ if(EMSCRIPTEN) -sUSE_GLFW=3 \ -sALLOW_MEMORY_GROWTH=1 -sSTACK_SIZE=15MB \ -sASYNCIFY \ - -sASYNCIFY_DEBUG \ --js-library=${DAWN_BUILD_DIR}/gen/src/emdawnwebgpu/library_webgpu_enum_tables.js \ --js-library=${DAWN_BUILD_DIR}/gen/src/emdawnwebgpu/library_webgpu_generated_struct_info.js \ --js-library=${DAWN_BUILD_DIR}/gen/src/emdawnwebgpu/library_webgpu_generated_sig_info.js \ diff --git a/examples/hello_world/run.cpp b/examples/hello_world/run.cpp index c9f22c7..77549cf 100644 --- a/examples/hello_world/run.cpp +++ b/examples/hello_world/run.cpp @@ -28,7 +28,7 @@ int main(int argc, char **argv) { printf("--------------\n\n"); // std::unique_ptr ctx = createContext(); - Context ctx = waitForContext(); + Context ctx = createContext(); static constexpr size_t N = 10000; std::array inputArr, outputArr; for (int i = 0; i < N; ++i) { @@ -36,14 +36,11 @@ int main(int argc, char **argv) { } Tensor input = createTensor(ctx, Shape{N}, kf32, inputArr.data()); Tensor output = createTensor(ctx, Shape{N}, kf32); - std::future kernelFuture = createKernel(ctx, {kGelu, 256, kf32}, + Kernel op = createKernel(ctx, {kGelu, 256, kf32}, Bindings{input, output}, {cdiv(N, 256), 1, 1}); - Kernel op = waitForFuture(ctx.instance, kernelFuture); - std::future dispatchFuture = dispatchKernel(ctx, op); - waitForFuture(ctx.instance, dispatchFuture); - std::future cpuFuture = toCPU(ctx, output, outputArr.data(), sizeof(outputArr)); - waitForFuture(ctx.instance, cpuFuture); + dispatchKernel(ctx, op); + toCPU(ctx, output, outputArr.data(), sizeof(outputArr)); for (int i = 0; i < 12; ++i) 
{ printf(" gelu(%.2f) = %.2f\n", inputArr[i], outputArr[i]); } diff --git a/examples/render/run.cpp b/examples/render/run.cpp index f9a90f9..64122cd 100644 --- a/examples/render/run.cpp +++ b/examples/render/run.cpp @@ -124,10 +124,8 @@ int main(int argc, char **argv) { cdiv({NCOLS, NROWS, 1}, wgSize), params); printf("\033[2J\033[H"); while (true) { - std::promise promise; - std::future future = promise.get_future(); - dispatchKernel(ctx, renderKernel, promise); - wait(ctx, future); + + dispatchKernel(ctx, renderKernel); toCPU(ctx, devScreen, screen.data(), sizeof(screen)); params.time = getCurrentTimeInMilliseconds() - zeroTime; diff --git a/gpu.hpp b/gpu.hpp index 0119108..e050c87 100644 --- a/gpu.hpp +++ b/gpu.hpp @@ -807,11 +807,10 @@ inline void check(bool condition, const char *message, * * @code * std::future deviceFuture = requestDeviceAsync(adapter, - * devDescriptor); WGPUDevice device = waitForFuture(instance, deviceFuture); + * devDescriptor); WGPUDevice device = wait(instance, deviceFuture); * @endcode */ -template -T waitForFuture(WGPUInstance instance, std::future &f) { +template T wait(Context &ctx, std::future &f) { #ifdef __EMSCRIPTEN__ // Poll until the future is ready. while (f.wait_for(std::chrono::milliseconds(0)) != @@ -823,7 +822,7 @@ T waitForFuture(WGPUInstance instance, std::future &f) { #else while (f.wait_for(std::chrono::milliseconds(0)) != std::future_status::ready) { - wgpuInstanceProcessEvents(instance); + wgpuInstanceProcessEvents(ctx.instance); } return f.get(); #endif @@ -832,12 +831,15 @@ T waitForFuture(WGPUInstance instance, std::future &f) { // Context Callbacks & Helpers /** - * @brief Waits for the provided std::future to become ready by polling its status. + * @brief Waits for the provided std::future to become ready by polling its + * status. * - * This helper template function continuously checks the status of the provided std::future until it is ready. - * On Emscripten builds, it yields control to the JavaScript event loop using emscripten_sleep(1) for smooth asynchronous behavior. - * On non-Emscripten platforms, it sleeps for a short duration (10 milliseconds) between checks. - * Once the future is ready, its value is returned. + * This helper template function continuously checks the status of the provided + * std::future until it is ready. On Emscripten builds, it yields control to + * the JavaScript event loop using emscripten_sleep(1) for smooth asynchronous + * behavior. On non-Emscripten platforms, it sleeps for a short duration (10 + * milliseconds) between checks. Once the future is ready, its value is + * returned. * * @tparam T The type of the value contained in the future. * @param f The future to wait on. @@ -849,20 +851,20 @@ T waitForFuture(WGPUInstance instance, std::future &f) { * @endcode */ template T waitForContextFuture(std::future &f) { - #ifdef __EMSCRIPTEN__ - while (f.wait_for(std::chrono::milliseconds(0)) != - std::future_status::ready) { - emscripten_sleep(1); // Yield back to the JS event loop. - } - return f.get(); - #else - while (f.wait_for(std::chrono::milliseconds(0)) != - std::future_status::ready) { - std::this_thread::sleep_for(std::chrono::milliseconds(10)); - } - return f.get(); - #endif +#ifdef __EMSCRIPTEN__ + while (f.wait_for(std::chrono::milliseconds(0)) != + std::future_status::ready) { + emscripten_sleep(1); // Yield back to the JS event loop. 
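The new sync wrappers are thin shells over the futures returned by the *Async functions, with wait(ctx, f) pumping wgpuInstanceProcessEvents until the result is ready. A minimal sketch of the two call styles, assuming the gpu.hpp API as it stands after this patch (the kernel, tensors, and sizes are placeholders created elsewhere):

```cpp
#include "gpu.hpp"
using namespace gpu;

// Runs one dispatch plus readback twice: first with explicit futures, then
// with the synchronous wrappers added in this patch. `op`, `output`, `host`,
// and `bytes` are assumed to be set up elsewhere (see the hello_world example).
void runBothStyles(Context &ctx, Kernel &op, Tensor &output,
                   float *host, size_t bytes) {
  // Explicit style: keep the futures and pump events until they resolve.
  std::future<void> done = dispatchKernelAsync(ctx, op);
  wait(ctx, done);
  std::future<void> copied = toCPUAsync(ctx, output, host, bytes);
  wait(ctx, copied);

  // Wrapper style: dispatchKernel/toCPU perform the same wait internally.
  dispatchKernel(ctx, op);
  toCPU(ctx, output, host, bytes);
}
```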
+ } + return f.get(); +#else + while (f.wait_for(std::chrono::milliseconds(0)) != + std::future_status::ready) { + std::this_thread::sleep_for(std::chrono::milliseconds(10)); } + return f.get(); +#endif +} /** * @brief Adapter callback function invoked upon completion of an asynchronous @@ -1013,9 +1015,9 @@ requestDeviceAsync(WGPUAdapter adapter, * */ inline std::future -createContext(const WGPUInstanceDescriptor &desc = {}, - const WGPURequestAdapterOptions &adapterOpts = {}, - const WGPUDeviceDescriptor &devDescriptor = {}) { +createContextAsync(const WGPUInstanceDescriptor &desc = {}, + const WGPURequestAdapterOptions &adapterOpts = {}, + const WGPUDeviceDescriptor &devDescriptor = {}) { auto promise = std::make_shared>(); @@ -1030,7 +1032,7 @@ createContext(const WGPUInstanceDescriptor &desc = {}, } try { auto adapterFuture = requestAdapterAsync(ctx.instance, adapterOpts); - ctx.adapter = waitForFuture(ctx.instance, adapterFuture); + ctx.adapter = wait(ctx, adapterFuture); ctx.adapterStatus = WGPURequestAdapterStatus_Success; } catch (const std::exception &ex) { promise->set_exception(std::make_exception_ptr(ex)); @@ -1038,7 +1040,7 @@ createContext(const WGPUInstanceDescriptor &desc = {}, } try { auto deviceFuture = requestDeviceAsync(ctx.adapter, devDescriptor); - ctx.device = waitForFuture(ctx.instance, deviceFuture); + ctx.device = wait(ctx, deviceFuture); ctx.deviceStatus = WGPURequestDeviceStatus_Success; } catch (const std::exception &ex) { promise->set_exception(std::make_exception_ptr(ex)); @@ -1053,10 +1055,11 @@ createContext(const WGPUInstanceDescriptor &desc = {}, /** * @brief Synchronously waits for and returns the created GPU context. * - * This function invokes the asynchronous createContext() factory function to create a GPU - * context, then waits for its completion using waitForContextFuture. The returned Context - * holds handles to the WebGPU instance, adapter, device, and queue, and is used for subsequent - * GPU operations. + * This function invokes the asynchronous createContext() factory function to + * create a GPU context, then waits for its completion using + * waitForContextFuture. The returned Context holds handles to the WebGPU + * instance, adapter, device, and queue, and is used for subsequent GPU + * operations. * * @return Context The fully initialized GPU context. * @@ -1065,8 +1068,11 @@ createContext(const WGPUInstanceDescriptor &desc = {}, * // Now ctx can be used for GPU operations. * @endcode */ -inline Context waitForContext() { - std::future contextFuture = createContext(); +inline Context createContext(const WGPUInstanceDescriptor &desc = {}, + const WGPURequestAdapterOptions &adapterOpts = {}, + const WGPUDeviceDescriptor &devDescriptor = {}) { + std::future contextFuture = + createContextAsync(desc, adapterOpts, devDescriptor); return waitForContextFuture(contextFuture); } @@ -1294,8 +1300,8 @@ inline void queueWorkDoneCallback(WGPUQueueWorkDoneStatus status, * toCPU(ctx, tensor, data, bufferSize); * @endcode */ -inline std::future toCPU(Context &ctx, Tensor &tensor, void *data, - size_t bufferSize, CopyData &op) { +inline std::future toCPUAsync(Context &ctx, Tensor &tensor, void *data, + size_t bufferSize, CopyData &op) { // Submit the command buffer and release it. 
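For the context itself the same split applies. This sketch shows both entry points; two contexts are created only to illustrate the two call styles, and the optional descriptors are left at their defaults:

```cpp
#include "gpu.hpp"
using namespace gpu;

int main() {
  // Blocking form: returns a ready-to-use Context (instance, adapter,
  // device, queue).
  Context ctx = createContext();

  // Non-blocking form: start adapter/device acquisition, do other setup,
  // then block only when the context is actually needed.
  std::future<Context> pending = createContextAsync();
  // ... unrelated initialization could run here ...
  Context ctx2 = waitForContextFuture(pending);
  return 0;
}
```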
wgpuQueueSubmit(ctx.queue, 1, &op.commandBuffer); wgpuCommandBufferRelease(op.commandBuffer); @@ -1340,8 +1346,8 @@ inline std::future toCPU(Context &ctx, Tensor &tensor, void *data, * @param[in] bufferSize Size of the data buffer in bytes * @param[out] data Pointer to the CPU memory to copy the data to */ -inline std::future toCPU(Context &ctx, Tensor &tensor, void *data, - size_t bufferSize) { +inline std::future toCPUAsync(Context &ctx, Tensor &tensor, void *data, + size_t bufferSize) { // Create a promise that will later be satisfied when the async copy // completes. auto promise = std::make_shared>(); @@ -1395,26 +1401,8 @@ inline std::future toCPU(Context &ctx, Tensor &tensor, void *data, return promise->get_future(); } -/** - * @brief Overload of the toCPU function to copy data from a GPU buffer to CPU - * memory for an array of floats instead of a pointer to a float buffer. - * @param[in] ctx Context instance to manage the operation - * @param[in] tensor Tensor instance representing the GPU buffer to copy from - * @param[out] data Array of floats to copy the data to - * - * @code - * std::future toCPUFuture = toCPU(ctx, tensor, data); - * WaitForFuture(ctx.instance, toCPUFuture); - * @endcode - */ -template -inline std::future toCPU(Context &ctx, Tensor &tensor, - std::array &data) { - return toCPU(ctx, tensor, data.data(), sizeof(data)); -} - -inline std::future toCPU(Context &ctx, WGPUBuffer buffer, void *data, - size_t size) { +inline std::future toCPUAsync(Context &ctx, WGPUBuffer buffer, void *data, + size_t size) { // The size (in bytes) for the copy. uint64_t bufferSize = size; @@ -1473,6 +1461,92 @@ inline std::future toCPU(Context &ctx, WGPUBuffer buffer, void *data, return promise->get_future(); } +/** + * @brief Overload of the toCPU function to copy data from a GPU buffer to CPU + * memory for an array of floats instead of a pointer to a float buffer. + * @param[in] ctx Context instance to manage the operation + * @param[in] tensor Tensor instance representing the GPU buffer to copy from + * @param[out] data Array of floats to copy the data to + * + * @code + * std::future toCPUFuture = toCPU(ctx, tensor, data); + * wait(ctx, toCPUFuture); + * @endcode + */ +template +inline std::future toCPUAsync(Context &ctx, Tensor &tensor, + std::array &data) { + return toCPUAsync(ctx, tensor, data.data(), sizeof(data)); +} + +/** + * @brief Synchronous wrapper for copying from a Tensor GPU buffer to CPU + * memory. + * + * This function synchronously waits for the asynchronous copy operation to + * complete, ensuring that the data is fully transferred from the GPU buffer to + * the CPU memory before returning. + * + * @param ctx Context instance to manage the operation + * @param tensor Tensor instance representing the GPU buffer to copy from + * @param data Pointer to the CPU memory to copy the data to + * @param bufferSize Size of the data buffer in bytes + * @param instance WGPUInstance used for processing events during waiting + * + * @code + * toCPU(ctx, tensor, data, bufferSize, instance); + * @endcode + */ +inline void toCPU(Context &ctx, Tensor &tensor, void *data, size_t bufferSize) { + auto future = toCPUAsync(ctx, tensor, data, bufferSize); + wait(ctx, future); +} + +/** + * @brief Synchronous wrapper for copying from a GPU buffer to CPU memory. + * + * This function synchronously waits for the asynchronous copy operation to + * complete, ensuring that the data is fully transferred from the GPU buffer to + * the CPU memory before returning. 
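In the tests added later in this series, the raw-buffer overload is fed with the buffer backing an existing Tensor; a small sketch of that pattern (the helper name is illustrative):

```cpp
#include "gpu.hpp"
#include <vector>
using namespace gpu;

// Reads the first n floats of a tensor back through the WGPUBuffer overload.
std::vector<float> readbackRaw(Context &ctx, Tensor &tensor, size_t n) {
  std::vector<float> host(n, 0.0f);
  WGPUBuffer raw = tensor.data.buffer; // underlying GPU buffer of the tensor
  toCPU(ctx, raw, host.data(), n * sizeof(float));
  return host;
}
```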
+ * + * @param ctx Context instance to manage the operation + * @param buffer WGPUBuffer instance representing the GPU buffer to copy from + * @param data Pointer to the CPU memory to copy the data to + * @param size Size of the data buffer in bytes + * @param instance WGPUInstance used for processing events during waiting + * + * @code + * toCPU(ctx, buffer, data, size, instance); + * @endcode + */ +inline void toCPU(Context &ctx, WGPUBuffer buffer, void *data, size_t size) { + auto future = toCPUAsync(ctx, buffer, data, size); + wait(ctx, future); +} + +/** + * @brief Synchronous wrapper for copying from a Tensor GPU buffer to CPU + * memory for an array of floats instead of a pointer to a float buffer. + * + * This function synchronously waits for the asynchronous copy operation to + * complete, ensuring that the data is fully transferred from the GPU buffer to + * the CPU memory before returning. + * + * @param ctx Context instance to manage the operation + * @param tensor Tensor instance representing the GPU buffer to copy from + * @param data Array of floats to copy the data to + * @param instance WGPUInstance used for processing events during waiting + * + * @code + * toCPU(ctx, tensor, data, instance); + * @endcode + */ +template +inline void toCPU(Context &ctx, Tensor &tensor, std::array &data) { + auto future = toCPUAsync(ctx, tensor, data); + wait(ctx, future); +} + /** * @brief Copies data from CPU memory to a GPU buffer. The toGPU overloads are * effectively a convenience wrapper around the WebGPU API call @@ -1617,9 +1691,9 @@ struct CompData { }; /** - * @brief A factory function to create a kernel on the GPU. The kernel is - * created with the given WGSL code, input tensors, output tensor, and - * optional parameters. + * @brief A factory function to create a kernel asynchronously on the GPU. + * The kernel is created with the given WGSL code, input tensors, + * output tensor, and optional parameters. * * Note that the values of the input tensors are not used here, only the * reference handles to the underlying buffers as well as the size of the @@ -1639,18 +1713,19 @@ struct CompData { * @return Kernel instance representing the created kernel * * @code - * std::future kernelFuture = createKernel(ctx, code, dataBindings, + * std::future kernelFuture = createKernelAsync(ctx, code, dataBindings, numInputs, output, nThreads, params, paramsSize); - * Kernel kernel = WaitForFuture(ctx.instance, kernelFuture); + * Kernel kernel = wait(ctx.instance, kernelFuture); * @endcode */ inline std::future -createKernel(Context &ctx, const KernelCode &code, const Tensor *dataBindings, - size_t numTensors, const size_t *viewOffsets, - const Shape &totalWorkgroups, const void *params = nullptr, - size_t paramsSize = 0, CompilationInfo *compilationInfo = nullptr, - const char *cacheKey = nullptr) { +createKernelAsync(Context &ctx, const KernelCode &code, + const Tensor *dataBindings, size_t numTensors, + const size_t *viewOffsets, const Shape &totalWorkgroups, + const void *params = nullptr, size_t paramsSize = 0, + CompilationInfo *compilationInfo = nullptr, + const char *cacheKey = nullptr) { // Create a cache key by the pointer values of the data bindings and the // kernel code if (cacheKey != nullptr && @@ -1818,7 +1893,7 @@ createKernel(Context &ctx, const KernelCode &code, const Tensor *dataBindings, // Register callback and then wait for the result. 
wgpuShaderModuleGetCompilationInfo(shaderModule, compilationCallbackInfo); - waitForFuture(ctx.instance, compFuture); + wait(ctx, compFuture); } // Now create the compute pipeline using the shader module. @@ -1842,35 +1917,81 @@ createKernel(Context &ctx, const KernelCode &code, const Tensor *dataBindings, return outerFuture; } -/** - * @brief Free‑standing callback for dispatchKernel’s asynchronous work‐done. +/* + * @brief Overload which wraps the createKernelAsync factory function to create + * a kernel on the GPU. This overload uses takes a pointer and size for the + * input tensors instead of a static collection and a void pointer for params + * instead of a static type. * - * This callback is invoked when the GPU queue signals the completion of the - * submitted workload for a kernel dispatch. It receives the work-done status - * and a userdata pointer, which is expected to be a heap‑allocated pointer to a - * std::promise. + * @param[in] ctx Context instance to manage the kernel + * @param[in] code WGSL code for the kernel + * @param[in] dataBindings Pointer to a span of tensors bound to the kernel + * @param[in] numTensors Number of tensors in the dataBindings span + * @param[in] totalWorkgroups Number of workgroups in the x, y, z grid, must be + * a Shape of rank == 3. + * @param[in] params Optional parameters for the kernel. If the kernel does + * not have any parameters, use NoParam. + * @return Kernel instance representing the created kernel * - * On success, the promise is fulfilled by calling set_value(). Otherwise, it is - * set with an exception. After setting the promise state, the allocated memory - * for the promise is freed. + * @code + * std::future kernelFuture = createKernel(ctx, code, tensorData, + * output,totalWorkgroups, params); Kernel kernel = wait(ctx.instance, + * kernelFuture); + * @endcode + */ +inline Kernel createKernel(Context &ctx, const KernelCode &code, + const Tensor *dataBindings, size_t numTensors, + const size_t *viewOffsets, + const Shape &totalWorkgroups, + const void *params = nullptr, size_t paramsSize = 0, + CompilationInfo *compilationInfo = nullptr, + const char *cacheKey = nullptr) { + std::future kernelFuture = createKernelAsync( + ctx, code, dataBindings, numTensors, viewOffsets, totalWorkgroups, params, + paramsSize, compilationInfo, cacheKey); + return wait(ctx, kernelFuture); +} + +/** + * @brief Overload which wraps the createKernelAsync factory function to create + * a kernel asynchronously on the GPU. This overload uses takes a static + * collection of input tensors instead of a pointer and a statically determined + * ParamsType instead of casting params to a void pointer. * - * @param status The status of the work done. Expected to be - * WGPUQueueWorkDoneStatus_Success on success. - * @param userdata1 A heap allocated pointer to std::promise which is set - * when the work is done. - * @param userdata2 Unused. + * @param[in] ctx Context instance to manage the kernel + * @param[in] code WGSL code for the kernel + * @param[in] dataBindings A Bindings of tensors whose GPU buffers are bound + * to the kernel as inputs and outputs. + * @param[in] totalWorkgroups Number of workgroups in the x, y, z grid, must be + * a Shape of rank == 3. + * @param[in] params Optional parameters for the kernel. If the kernel does + * not have any parameters, use NoParam. 
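To make the Bindings-based overloads concrete, here is a sketch that builds a kernel through the asynchronous factory. The WGSL doubling shader is made up for the example, but it uses the same {{precision}} and {{workgroupSize}} placeholders as the kernels elsewhere in this series:

```cpp
#include "gpu.hpp"
using namespace gpu;

// Illustrative shader: writes 2*x for every input element.
static const char *kDouble = R"(
@group(0) @binding(0) var<storage, read_write> inp: array<{{precision}}>;
@group(0) @binding(1) var<storage, read_write> out: array<{{precision}}>;
@compute @workgroup_size({{workgroupSize}})
fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
  let i: u32 = gid.x;
  if (i < arrayLength(&inp)) {
    out[i] = inp[i] * 2.0;
  }
}
)";

Kernel buildDoubler(Context &ctx, Tensor &input, Tensor &output, size_t n) {
  // Asynchronous factory: returns a future that resolves to the Kernel.
  std::future<Kernel> pending = createKernelAsync(
      ctx, {kDouble, 256, kf32}, Bindings{input, output}, {cdiv(n, 256), 1, 1});

  // The synchronous overload used by the examples is equivalent to:
  //   Kernel k = createKernel(ctx, {kDouble, 256, kf32},
  //                           Bindings{input, output}, {cdiv(n, 256), 1, 1});
  return wait(ctx, pending);
}
```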
+ * @return Kernel instance representing the created kernel + * + * @code + * std::future kernelFuture = createKernel(ctx, code, tensorData, + * output,totalWorkgroups, params); Kernel kernel = wait(ctx.instance, + * kernelFuture); + * @endcode */ -inline void dispatchKernelCallback(WGPUQueueWorkDoneStatus status, - void *userdata1, void * /*userdata2*/) { - // Cast the userdata pointer back to our heap‑allocated promise. - auto *p = reinterpret_cast *>(userdata1); - if (status == WGPUQueueWorkDoneStatus_Success) { - p->set_value(); +template +std::future +createKernelAsync(Context &ctx, const KernelCode &code, + const Bindings &dataBindings, + const Shape &totalWorkgroups, + const ParamsType ¶ms = ParamsType{}, + CompilationInfo *compilationInfo = nullptr, + const char *cacheKey = nullptr) { + if constexpr (!IsNoParam) { + return createKernelAsync(ctx, code, dataBindings.data.data(), numInputs, + dataBindings.viewOffsets.data(), totalWorkgroups, + reinterpret_cast(¶ms), + sizeof(ParamsType), compilationInfo, cacheKey); } else { - p->set_exception(std::make_exception_ptr( - std::runtime_error("Queue work did not complete successfully."))); + return createKernelAsync(ctx, code, dataBindings.data.data(), numInputs, + dataBindings.viewOffsets.data(), totalWorkgroups, + nullptr, 0, compilationInfo, cacheKey); } - delete p; // free the heap allocation } /** @@ -1890,18 +2011,17 @@ inline void dispatchKernelCallback(WGPUQueueWorkDoneStatus status, * @return Kernel instance representing the created kernel * * @code - * std::future kernelFuture = createKernel(ctx, code, tensorData, - * output,totalWorkgroups, params); Kernel kernel = WaitForFuture(ctx.instance, - * kernelFuture); + * Kernel kernel = createKernel(ctx, code, tensorData, output,totalWorkgroups, + * params); * @endcode */ template -std::future createKernel(Context &ctx, const KernelCode &code, - const Bindings &dataBindings, - const Shape &totalWorkgroups, - const ParamsType ¶ms = ParamsType{}, - CompilationInfo *compilationInfo = nullptr, - const char *cacheKey = nullptr) { +Kernel createKernel(Context &ctx, const KernelCode &code, + const Bindings &dataBindings, + const Shape &totalWorkgroups, + const ParamsType ¶ms = ParamsType{}, + CompilationInfo *compilationInfo = nullptr, + const char *cacheKey = nullptr) { if constexpr (!IsNoParam) { return createKernel(ctx, code, dataBindings.data.data(), numInputs, dataBindings.viewOffsets.data(), totalWorkgroups, @@ -1914,6 +2034,37 @@ std::future createKernel(Context &ctx, const KernelCode &code, } } +/** + * @brief Free‑standing callback for dispatchKernel’s asynchronous work‐done. + * + * This callback is invoked when the GPU queue signals the completion of the + * submitted workload for a kernel dispatch. It receives the work-done status + * and a userdata pointer, which is expected to be a heap‑allocated pointer to a + * std::promise. + * + * On success, the promise is fulfilled by calling set_value(). Otherwise, it is + * set with an exception. After setting the promise state, the allocated memory + * for the promise is freed. + * + * @param status The status of the work done. Expected to be + * WGPUQueueWorkDoneStatus_Success on success. + * @param userdata1 A heap allocated pointer to std::promise which is set + * when the work is done. + * @param userdata2 Unused. + */ +inline void dispatchKernelCallback(WGPUQueueWorkDoneStatus status, + void *userdata1, void * /*userdata2*/) { + // Cast the userdata pointer back to our heap‑allocated promise. 
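Every dispatch heap-allocates one such promise and this callback fulfils it, so from the caller's side repeated dispatches stay simple. A sketch of the loop used by examples/render, assuming the kernel and readback target already exist:

```cpp
#include "gpu.hpp"
using namespace gpu;

// Dispatches the same kernel several times, reading the result back after
// each step. dispatchKernel re-records the kernel's command buffer
// automatically once it has been used.
void stepLoop(Context &ctx, Kernel &k, Tensor &result,
              float *host, size_t bytes, int steps) {
  for (int i = 0; i < steps; ++i) {
    dispatchKernel(ctx, k);
    toCPU(ctx, result, host, bytes);
  }
}
```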
+ auto *p = reinterpret_cast *>(userdata1); + if (status == WGPUQueueWorkDoneStatus_Success) { + p->set_value(); + } else { + p->set_exception(std::make_exception_ptr( + std::runtime_error("Queue work did not complete successfully."))); + } + delete p; // free the heap allocation +} + /** * @brief Asynchronously submits a kernel to the GPU queue for execution. * It also sets up a callback to notify when the kernel has finished executing @@ -1930,10 +2081,10 @@ std::future createKernel(Context &ctx, const KernelCode &code, * * @code * std::future dispatchFuture = dispatchKernel(ctx, kernel); - * WaitForFuture(ctx.instance, dispatchFuture); + * wait(ctx.instance, dispatchFuture); * @endcode */ -inline std::future dispatchKernel(Context &ctx, Kernel &kernel) { +inline std::future dispatchKernelAsync(Context &ctx, Kernel &kernel) { // If the kernel was used before, reset the command buffer. if (kernel->used) { resetCommandBuffer(ctx.device, kernel); @@ -1962,6 +2113,23 @@ inline std::future dispatchKernel(Context &ctx, Kernel &kernel) { return future; } +/** + * @brief Synchronous wrapper for dispatchKernelAsync. This function submits + * the kernel to the GPU queue and waits for it to finish executing. + * + * @param[in] ctx Context instance to manage the kernel, from which the queue + * for the GPU is obtained + * @param[in] kernel Kernel instance to dispatch + * + * @code + * dispatchKernel(ctx, kernel); + * @endcode + */ +inline void dispatchKernel(Context &ctx, Kernel &kernel) { + auto future = dispatchKernelAsync(ctx, kernel); + wait(ctx, future); +} + } // namespace gpu #endif // GPU_H diff --git a/numeric_types/half.cpp b/numeric_types/half.cpp index 21a0005..c183754 100644 --- a/numeric_types/half.cpp +++ b/numeric_types/half.cpp @@ -185,12 +185,11 @@ void testContainers() { testRoundTrip(h[3]); } { - Context ctx = waitForContext(); + Context ctx = createContext(); std::array h = {1.0f, 0.5f, 2.0f, 3.14f, 1.0, 2.0, 3.0, 4.0}; Tensor devH = createTensor(ctx, {h.size()}, kf16, h.data()); std::array h2; - std::future toCPUFuture = toCPU(ctx, devH, h2.data(), sizeof(h2)); - waitForFuture(ctx.instance, toCPUFuture); + toCPU(ctx, devH, h2.data(), sizeof(h2)); for (int i = 0; i < 8; ++i) { printResult(h[i].data == h2[i].data, "Container round trip", static_cast(h[i]), static_cast(h2[i])); @@ -215,14 +214,13 @@ fn main( } } )"; - std::future futureContext = createContext( + Context ctx = createContext( {}, {}, /*device descriptor, enabling f16 in WGSL*/ { .requiredFeatureCount = 1, .requiredFeatures = std::array{WGPUFeatureName_ShaderF16}.data(), }); - Context ctx = waitForContextFuture(futureContext); static constexpr size_t N = 10000; std::array inputArr, outputArr; for (int i = 0; i < N; ++i) { @@ -230,20 +228,17 @@ fn main( } Tensor input = createTensor(ctx, Shape{N}, kf16, inputArr.data()); Tensor output = createTensor(ctx, Shape{N}, kf16); - std::future kernelFuture = createKernel(ctx, {kGelu, 256, kf16}, Bindings{input, output}, + Kernel op = createKernel(ctx, {kGelu, 256, kf16}, Bindings{input, output}, {cdiv(N, 256), 1, 1}); - Kernel op = waitForFuture(ctx.instance, kernelFuture); - std::future dispatchFuture = dispatchKernel(ctx, op); - waitForFuture(ctx.instance, dispatchFuture); - std::future toCPUFuture = toCPU(ctx, output, outputArr.data(), sizeof(outputArr)); - waitForFuture(ctx.instance, toCPUFuture); + dispatchKernel(ctx, op); + toCPU(ctx, output, outputArr.data(), sizeof(outputArr)); for (int i = 0; i < 12; ++i) { printf(" gelu(%.2f) = %.2f\n", static_cast(inputArr[i]), 
static_cast(outputArr[i])); } } -int testMain() { +int testHalfMain() { printf("\nHalf-precision float tests\n==========================\n"); printf("\nRegular values float round trips\n\n"); From 95e587d71d25ab74207648ca91500a7594bff870 Mon Sep 17 00:00:00 2001 From: MichealReed Date: Thu, 20 Feb 2025 16:34:35 -0600 Subject: [PATCH 17/30] refactors the byIdx context function and sets USE_DAWN_API compile def on native --- cmake/dawn.cmake | 2 + cmake/gpu.cmake | 2 + examples/hello_world/run.cpp | 6 + gpu.hpp | 308 ++++++++++++++++++++++------------- 4 files changed, 201 insertions(+), 117 deletions(-) diff --git a/cmake/dawn.cmake b/cmake/dawn.cmake index 2ead9ae..c6fed94 100644 --- a/cmake/dawn.cmake +++ b/cmake/dawn.cmake @@ -7,6 +7,8 @@ if(EMSCRIPTEN) set(EM_SDK_DIR $ENV{EMSDK} CACHE INTERNAL "") set(DAWN_BUILD_DIR "${DAWN_DIR}/build_web" CACHE INTERNAL "") set(DAWN_EMSCRIPTEN_TOOLCHAIN ${EM_SDK_DIR}/upstream/emscripten CACHE INTERNAL "" FORCE) +else() + add_compile_definitions(USE_DAWN_API) endif() # Enable find for no dawn rebuilds with flutter run diff --git a/cmake/gpu.cmake b/cmake/gpu.cmake index 6cce9e6..f936991 100644 --- a/cmake/gpu.cmake +++ b/cmake/gpu.cmake @@ -32,7 +32,9 @@ add_library(gpu STATIC ${GPU_SOURCES} ${GPU_HEADERS}) set_target_properties(gpu PROPERTIES LINKER_LANGUAGE CXX) target_include_directories(gpu PUBLIC "${PROJECT_ROOT}") if(NOT EMSCRIPTEN) + target_include_directories(gpu PUBLIC "${DAWN_BUILD_DIR}/gen/include/") target_include_directories(gpu PUBLIC "${DAWN_BUILD_DIR}/gen/include/dawn/") + target_include_directories(gpu PUBLIC "${DAWN_DIR}/include/") else() target_include_directories(gpu PUBLIC "${DAWN_BUILD_DIR}/gen/src/emdawnwebgpu/include/") target_include_directories(gpu PUBLIC "${DAWN_BUILD_DIR}/gen/src/emdawnwebgpu/include/webgpu/") diff --git a/examples/hello_world/run.cpp b/examples/hello_world/run.cpp index 77549cf..b44934b 100644 --- a/examples/hello_world/run.cpp +++ b/examples/hello_world/run.cpp @@ -28,7 +28,13 @@ int main(int argc, char **argv) { printf("--------------\n\n"); // std::unique_ptr ctx = createContext(); + #ifdef USE_DAWN_API + Context ctx = createContextByGpuIdx(0); + auto adaptersList = listAdapters(ctx); + LOG(kDefLog, kInfo, "Available GPU adapters:\n%s", adaptersList.c_str()); + #else Context ctx = createContext(); + #endif static constexpr size_t N = 10000; std::array inputArr, outputArr; for (int i = 0; i < N; ++i) { diff --git a/gpu.hpp b/gpu.hpp index e050c87..906371c 100644 --- a/gpu.hpp +++ b/gpu.hpp @@ -16,9 +16,8 @@ #include // std::pair #include -#ifndef __EMSCRIPTEN__ -#else +#ifdef __EMSCRIPTEN__ #include "emscripten/emscripten.h" #endif @@ -255,6 +254,26 @@ inline std::string toString(const Shape &shape) { */ inline std::string toString(size_t value) { return std::to_string(value); } +/** + * @brief Converts a WGPUStringView to an std::string. + * + * If the view's data is null, an empty string is returned. If the view's + * length equals WGPU_STRLEN, it is assumed to be null‑terminated; otherwise, + * the explicit length is used. + * + * @param strView The WGPUStringView to convert. + * @return std::string The resulting standard string. + */ +inline std::string formatWGPUStringView(WGPUStringView strView) { + if (!strView.data) { + return ""; + } + if (strView.length == WGPU_STRLEN) { + return std::string(strView.data); + } + return std::string(strView.data, strView.length); +} + /** * @brief simple in-place string replacement helper function for substituting * placeholders in a WGSL string template. 
@@ -1076,136 +1095,191 @@ inline Context createContext(const WGPUInstanceDescriptor &desc = {}, return waitForContextFuture(contextFuture); } -#ifdef USE_DAWN_API +#ifndef __EMSCRIPTEN__ +#if USE_DAWN_API /** - * @brief Factory function to create a GPU context, which aggregates WebGPU API - * handles to interact with the GPU including the instance, adapter, device, and - * queue. + * @brief Retrieves the list of available GPU adapters from the Dawn instance. * - * The function takes gpu index to support for multi GPUs. - * To activate this function, it needs not only webgpu's headers but also DAWN's - * headers. + * This function creates a Dawn instance using the provided context's instance + * handle, then enumerates and returns the available GPU adapters as a vector. * - * If dawn is used, it also sets up an error callback for device loss. + * @param ctx The Context containing the WebGPU instance handle. + * @return std::vector A vector of available GPU + * adapters. + * + * @code + * std::vector adapters = getAdapters(ctx); + * @endcode + */ +inline std::vector getAdapters(Context &ctx) { + dawn::native::Instance dawnInstance( + reinterpret_cast(ctx.instance)); + return dawnInstance.EnumerateAdapters(); +} + +/** + * @brief Formats the given vector of Dawn adapters into a single concatenated string. * - * @param[in] gpuIdx GPU index - * @param[in] desc Instance descriptor for the WebGPU instance (optional) - * @param[in] devDescriptor Device descriptor for the WebGPU device (optional) - * @return Context instance representing the created GPU context + * This function iterates over each Dawn adapter in the provided vector, retrieves its + * description using the WebGPU API, and converts the description from a WGPUStringView + * to an std::string using the formatWGPUStringView helper. The resulting descriptions + * are concatenated into a single string separated by newline characters. * + * @param adapters A vector of Dawn adapters obtained from a WebGPU instance. + * @return std::string A newline-delimited string listing each adapter's description. + * * @code - * Context ctx = createContextByGpuIdx(1); + * std::string adapterList = formatAdapters(adapters); * @endcode */ -inline Context -createContextByGpuIdx(int gpuIdx, const WGPUInstanceDescriptor &desc = {}, - const WGPUDeviceDescriptor &devDescriptor = {}) { - Context context; - { -#ifdef __EMSCRIPTEN__ - // Emscripten does not support the instance descriptor - // and throws an assertion error if it is not nullptr. - context.instance = wgpuCreateInstance(nullptr); -#else - context.instance = wgpuCreateInstance(&desc); -#endif - // check status - check(context.instance, "Initialize WebGPU", __FILE__, __LINE__); +inline std::string formatAdapters(const std::vector &adapters) { + std::string adapterList; + for (size_t i = 0; i < adapters.size(); ++i) { + auto adapterPtr = adapters[i].Get(); + if (adapterPtr) { + WGPUAdapterInfo info = {}; + wgpuAdapterGetInfo(adapterPtr, &info); + std::string desc = formatWGPUStringView(info.description); + adapterList += "GPU Adapter [" + std::to_string(i) + "]: " + desc + "\n"; + wgpuAdapterInfoFreeMembers(info); + } } + return adapterList; +} - LOG(kDefLog, kInfo, "Requesting adapter"); - { - std::vector adapters = - dawn::native::Instance( - reinterpret_cast(context.instance)) - .EnumerateAdapters(); - LOG(kDefLog, kInfo, "The number of GPUs=%d\n", adapters.size()); - // Note: Second gpu is not available on Macos, but the number of GPUs is 2 - // on Macos. 
- // Calling wgpuAdapterGetInfo function for the second gpu becomes - // segfault. When you check all GPUs on linux, uncomment out following - // codes. - // - // for (size_t i = 0; i < adapters.size(); i++) { - // WGPUAdapterInfo info {}; - // auto ptr = adapters[i].Get(); - // if (ptr && adapters[i]) { - // wgpuAdapterGetInfo(ptr, &info); - // LOG(kDefLog, kInfo, "GPU(Adapter)[%d] = %s\n", i, info.description); - // wgpuAdapterInfoFreeMembers(info); - // } - // } - - { - LOG(kDefLog, kInfo, "Use GPU(Adapter)[%d]\n", gpuIdx); - auto ptr = adapters[gpuIdx].Get(); - if (ptr) { - WGPUAdapterInfo info{}; - wgpuAdapterGetInfo(ptr, &info); - LOG(kDefLog, kInfo, "GPU(Adapter)[%d] = %s\n", gpuIdx, - info.description); - wgpuAdapterInfoFreeMembers(info); - } - context.adapter = adapters[gpuIdx].Get(); - dawn::native::GetProcs().adapterAddRef(context.adapter); - } +/** + * @brief Lists the available GPU adapters in the current WebGPU instance. + * + * This function retrieves the list of available GPU adapters using the + * getAdapters helper function, then formats and returns the adapter + * descriptions as a single string using the formatAdapters helper function. + * + * @param ctx The Context containing the WebGPU instance handle. + * @return std::string A newline-delimited string listing each adapter's + * description. + * + * @code + * std::string adapterList = listAdapters(ctx); + * @endcode + */ +inline std::string listAdapters(Context &ctx) { + auto adapters = getAdapters(ctx); + return formatAdapters(adapters); +} + +/** + * @brief Asynchronously creates a GPU context using the specified GPU index. + * + * This function creates a WebGPU instance, retrieves the available GPU + * adapters, and selects the adapter at the specified index. It then requests a + * device from the selected adapter and sets up a logging callback for device + * errors. The function returns a future that will be fulfilled with the + * created Context once all operations are complete. + * + * @param gpuIdx The index of the GPU adapter to use. + * @param desc Instance descriptor for the WebGPU instance (optional) + * @param devDescriptor Device descriptor for the WebGPU device (optional) + * @return std::future A future that will eventually hold the created + * Context. 
+ * + * @code + * std::future contextFuture = createContextByGpuIdxAsync(0); + * Context ctx = waitForContextFuture(contextFuture); + * @endcode + */ +inline std::future +createContextByGpuIdxAsync(int gpuIdx, const WGPUInstanceDescriptor &desc = {}, + const WGPUDeviceDescriptor &devDescriptor = {}) { + auto promise = std::make_shared>(); + Context ctx; + + ctx.instance = wgpuCreateInstance(&desc); + + if (!ctx.instance) { + promise->set_exception(std::make_exception_ptr( + std::runtime_error("Failed to create WebGPU instance."))); + return promise->get_future(); } + check(ctx.instance, "Initialize WebGPU", __FILE__, __LINE__); - LOG(kDefLog, kInfo, "Requesting device"); - { - struct DeviceData { - WGPUDevice device = nullptr; - bool requestEnded = false; - }; - DeviceData devData; - - auto onDeviceRequestEnded = [](WGPURequestDeviceStatus status, - WGPUDevice device, WGPUStringView message, - void *pUserData, void *) { - DeviceData &devData = *reinterpret_cast(pUserData); - check(status == WGPURequestDeviceStatus_Success, - "Could not get WebGPU device.", __FILE__, __LINE__); - LOG(kDefLog, kTrace, "Device Request succeeded %x", - static_cast(device)); - devData.device = device; - devData.requestEnded = true; - }; + // Use helper functions to obtain and format the adapters. + auto adapters = getAdapters(ctx); - WGPURequestDeviceCallbackInfo deviceCallbackInfo = { - .mode = WGPUCallbackMode_AllowSpontaneous, - .callback = onDeviceRequestEnded, - .userdata1 = &devData, - .userdata2 = nullptr}; - wgpuAdapterRequestDevice(context.adapter, &devDescriptor, - deviceCallbackInfo); - - LOG(kDefLog, kInfo, "Waiting for device request to end"); - while (!devData.requestEnded) { - processEvents(context.instance); - } - LOG(kDefLog, kInfo, "Device request ended"); - assert(devData.requestEnded); - context.device = devData.device; - - WGPULoggingCallbackInfo loggingCallbackInfo = { - .nextInChain = nullptr, - .callback = - [](WGPULoggingType type, WGPUStringView message, void *userdata1, - void *userdata2) { - LOG(kDefLog, kError, "Device logging callback: %.*s", - static_cast(message.length), message.data); - if (type == WGPULoggingType_Error) { - throw std::runtime_error("Device error logged."); - } - }, - .userdata1 = nullptr, - .userdata2 = nullptr}; - wgpuDeviceSetLoggingCallback(context.device, loggingCallbackInfo); + if (gpuIdx >= adapters.size()) { + promise->set_exception( + std::make_exception_ptr(std::runtime_error("Invalid GPU index."))); + return promise->get_future(); + } + LOG(kDefLog, kInfo, "Using GPU Adapter[%d]", gpuIdx); + auto adapterPtr = adapters[gpuIdx].Get(); + if (adapterPtr) { + WGPUAdapterInfo info = {}; + wgpuAdapterGetInfo(adapterPtr, &info); + LOG(kDefLog, kInfo, "GPU(Adapter)[%d] = %s", gpuIdx, + formatWGPUStringView(info.description).c_str()); + wgpuAdapterInfoFreeMembers(info); + } + ctx.adapter = reinterpret_cast(adapterPtr); + dawn::native::GetProcs().adapterAddRef(ctx.adapter); + + LOG(kDefLog, kInfo, "Requesting device"); + // Request the device asynchronously (using our requestDeviceAsync helper). 
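On native Dawn builds these helpers are driven exactly as in the hello_world example; Emscripten builds have no adapter enumeration, so plain createContext remains the fallback. A minimal sketch:

```cpp
#include "gpu.hpp"
#include <cstdio>
#include <string>
using namespace gpu;

int main() {
#ifdef USE_DAWN_API
  // Pin the first adapter and print everything Dawn can see.
  Context ctx = createContextByGpuIdx(0);
  std::string adapters = listAdapters(ctx);
  printf("Available GPU adapters:\n%s", adapters.c_str());
#else
  // Emscripten / non-Dawn builds: default context selection.
  Context ctx = createContext();
#endif
  return 0;
}
```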
+ auto deviceFuture = requestDeviceAsync(ctx.adapter, devDescriptor); + try { + ctx.device = wait(ctx, deviceFuture); + ctx.deviceStatus = WGPURequestDeviceStatus_Success; + } catch (const std::exception &ex) { + promise->set_exception(std::make_exception_ptr(ex)); + return promise->get_future(); } - context.queue = wgpuDeviceGetQueue(context.device); - return context; + + WGPULoggingCallbackInfo loggingCallbackInfo{ + .nextInChain = nullptr, + .callback = + [](WGPULoggingType type, WGPUStringView message, void *userdata1, + void *userdata2) { + LOG(kDefLog, kError, "Device logging callback: %.*s", + static_cast(message.length), message.data); + if (type == WGPULoggingType_Error) { + throw std::runtime_error("Device error logged."); + } + }, + .userdata1 = nullptr, + .userdata2 = nullptr}; + wgpuDeviceSetLoggingCallback(ctx.device, loggingCallbackInfo); + ctx.queue = wgpuDeviceGetQueue(ctx.device); + promise->set_value(std::move(ctx)); + return promise->get_future(); } -#endif + +/** + * @brief Synchronously creates a GPU context using the specified GPU index. + * + * This function calls the asynchronous createContextByGpuIdxAsync function to + * create a GPU context, then waits for its completion using + * waitForContextFuture. The returned Context holds handles to the WebGPU + * instance, adapter, device, and queue, and is used for subsequent GPU + * operations. + * + * @param gpuIdx The index of the GPU adapter to use. + * @param desc Instance descriptor for the WebGPU instance (optional) + * @param devDescriptor Device descriptor for the WebGPU device (optional) + * @return Context The fully initialized GPU context. + * + * @code + * Context ctx = createContextByGpuIdx(0); + * @endcode + */ +inline Context createContextByGpuIdx(int gpuIdx, + const WGPUInstanceDescriptor &desc = {}, + const WGPUDeviceDescriptor &devDescriptor = {}) { + std::future contextFuture = + createContextByGpuIdxAsync(gpuIdx, desc, devDescriptor); + return waitForContextFuture(contextFuture); +} + +#endif // USE_DAWN_API +#endif // __EMSCRIPTEN__ /** * @brief Callback function invoked upon completion of an asynchronous GPU From 70d980287f9a7cca8889e166d67b802bc4b69319 Mon Sep 17 00:00:00 2001 From: MichealReed Date: Fri, 21 Feb 2025 22:14:41 -0600 Subject: [PATCH 18/30] tests toCPU, adds offset, adds gpuflow doc, default cmakelists builds test/test_gpu.cpp --- CMakeLists.txt | 18 +++++ docs/gpuflow.md | 78 +++++++++++++++++++ gpu.hpp | 110 ++++++++++++++++---------- test/test_gpu.cpp | 193 ++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 357 insertions(+), 42 deletions(-) create mode 100644 docs/gpuflow.md create mode 100644 test/test_gpu.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 816cdf3..a17602e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -23,6 +23,24 @@ endif() include("${CMAKE_CURRENT_SOURCE_DIR}/cmake/dawn.cmake") include("${CMAKE_CURRENT_SOURCE_DIR}/cmake/gpu.cmake") +target_link_libraries(gpu PRIVATE webgpu_dawn) + +include_directories(${CMAKE_CURRENT_SOURCE_DIR}/test) + +add_executable(test_gpu ${CMAKE_CURRENT_SOURCE_DIR}/test/test_gpu.cpp) +target_link_libraries(test_gpu PRIVATE gpu) + +# Platform-specific post-build actions (e.g. 
copying DLLs for MSVC) +if(MSVC) + add_custom_command( + TARGET test_gpu POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy + ${DAWN_BUILD_DIR}/$/webgpu_dawn.dll + $ + COMMENT "Copying webgpu_dawn.dll to the build directory" + ) +endif() + add_library(gpud SHARED gpu.hpp) set_target_properties(gpud PROPERTIES LINKER_LANGUAGE CXX) target_link_libraries(gpud PRIVATE gpu) diff --git a/docs/gpuflow.md b/docs/gpuflow.md new file mode 100644 index 0000000..d4eb37a --- /dev/null +++ b/docs/gpuflow.md @@ -0,0 +1,78 @@ +# GPU.cpp Lifecycle + +```mermaid +flowchart TD + %% Data Preparation & Upload + subgraph "Data Preparation & Upload" + A["CPU Data"] + B["Define Data Properties
(shape, type, size)"]
+        C["Create GPU Buffer<br>(allocate raw buffer)"]
+        D["Create Tensor<br>(allocates Array with one<br>or more buffers<br>and associates Shape)"]
+
+        E["Upload Data via toGPU<br>(raw buffer)<br>toGPU(ctx, data, buffer, size)"]
+        F["Upload Data via toGPU<br>(Tensor overload)<br>toGPU(ctx, data, tensor)"]
+        G["Optional: Upload Kernel Parameters<br>toGPU(ctx, params, Kernel)"]
+    end
+
+    %% Buffer Setup & Bindings
+    subgraph "Buffer & Binding Setup"
+        H["Define Bindings<br>(Bindings, TensorView)"]
+        I["Map GPU buffers<br>to shader bindings<br>(Collection from Tensor<br>or single buffers)"]
+    end
+
+    %% Kernel Setup & Execution
+    subgraph "Kernel Setup & Execution"
+        J["Define KernelCode<br>(WGSL template, workgroup size, precision)"]
+        K["Create Kernel"]
+        L["Dispatch Kernel"]
+    end
+
+    %% GPU Execution & Result Readback
+    subgraph "GPU Execution & Result Readback"
+        M["Kernel Execution<br>(GPU shader runs)"]
+        N["Readback Data<br>(toCPU variants)"]
+    end
+
+    %% Context & Resources
+    O["Context<br>(Device, Queue,
TensorPool, KernelPool)"] + + %% Flow Connections + A --> B + B --> C + B --> D + C --> E + D --> F + F --> H + E --> H + H --> I + I --> K + J --> K + G --- K + K --> L + L --> M + M --> N + + %% Context shared by all stages + O --- D + O --- E + O --- F + O --- K + O --- L + O --- N +``` + +Rank 0: Scalar +Rank 1: Vector +Rank 2: Matrix +Rank 3: 3D Tensor (or Cube) +Rank 4: 4D Tensor +Rank ..: Higher Dimensional Tensors + + +• The `gpu::Array` (which wraps a GPU buffer with usage and size) and the `gpu::Shape` (which defines dimensions and rank) are combined—via the creation process—to produce a `gpu::Tensor`. +• A `gpu::TensorView` provides a non‑owning view into a slice of a `gpu::Tensor`. Ex. `TensorView view = {tensor, 0, 256};` +• gpu::Bindings collect multiple Tensors (or TensorViews) along with view offset/size information for use in a kernel. +• The gpu::TensorPool (managed by the Context) is responsible for the lifetime of tensors and GPU resource cleanup. +• gpu::KernelCode contains the WGSL shader template plus metadata (workgroup size, precision, label, and entry point) that drive the kernel configuration. +• The gpu::createKernelAsync/gpu::createKernel functions (within the Execution Flow) use the gpu::Context, gpu::Bindings, and gpu::KernelCode to configure and construct a gpu::Kernel that manages all the underlying GPU resources (buffers, bind groups, compute pipeline, etc.). +• gpu::KernelCode’s workgroup size (a gpu::Shape) defines the dispatch configuration, and the gpu::Kernel eventually uses the underlying gpu::Array (contains WGPUBuffer, WGPUBufferUsage, size_t) and gpu::Shape data from the created Tensor. diff --git a/gpu.hpp b/gpu.hpp index 906371c..931d646 100644 --- a/gpu.hpp +++ b/gpu.hpp @@ -16,7 +16,6 @@ #include // std::pair #include - #ifdef __EMSCRIPTEN__ #include "emscripten/emscripten.h" #endif @@ -1106,7 +1105,7 @@ inline Context createContext(const WGPUInstanceDescriptor &desc = {}, * @param ctx The Context containing the WebGPU instance handle. * @return std::vector A vector of available GPU * adapters. - * + * * @code * std::vector adapters = getAdapters(ctx); * @endcode @@ -1118,21 +1117,25 @@ inline std::vector getAdapters(Context &ctx) { } /** - * @brief Formats the given vector of Dawn adapters into a single concatenated string. + * @brief Formats the given vector of Dawn adapters into a single concatenated + * string. * - * This function iterates over each Dawn adapter in the provided vector, retrieves its - * description using the WebGPU API, and converts the description from a WGPUStringView - * to an std::string using the formatWGPUStringView helper. The resulting descriptions - * are concatenated into a single string separated by newline characters. + * This function iterates over each Dawn adapter in the provided vector, + * retrieves its description using the WebGPU API, and converts the description + * from a WGPUStringView to an std::string using the formatWGPUStringView + * helper. The resulting descriptions are concatenated into a single string + * separated by newline characters. * * @param adapters A vector of Dawn adapters obtained from a WebGPU instance. - * @return std::string A newline-delimited string listing each adapter's description. - * + * @return std::string A newline-delimited string listing each adapter's + * description. 
+ * * @code * std::string adapterList = formatAdapters(adapters); * @endcode */ -inline std::string formatAdapters(const std::vector &adapters) { +inline std::string +formatAdapters(const std::vector &adapters) { std::string adapterList; for (size_t i = 0; i < adapters.size(); ++i) { auto adapterPtr = adapters[i].Get(); @@ -1157,7 +1160,7 @@ inline std::string formatAdapters(const std::vector &adap * @param ctx The Context containing the WebGPU instance handle. * @return std::string A newline-delimited string listing each adapter's * description. - * + * * @code * std::string adapterList = listAdapters(ctx); * @endcode @@ -1181,7 +1184,7 @@ inline std::string listAdapters(Context &ctx) { * @param devDescriptor Device descriptor for the WebGPU device (optional) * @return std::future A future that will eventually hold the created * Context. - * + * * @code * std::future contextFuture = createContextByGpuIdxAsync(0); * Context ctx = waitForContextFuture(contextFuture); @@ -1270,9 +1273,9 @@ createContextByGpuIdxAsync(int gpuIdx, const WGPUInstanceDescriptor &desc = {}, * Context ctx = createContextByGpuIdx(0); * @endcode */ -inline Context createContextByGpuIdx(int gpuIdx, - const WGPUInstanceDescriptor &desc = {}, - const WGPUDeviceDescriptor &devDescriptor = {}) { +inline Context +createContextByGpuIdx(int gpuIdx, const WGPUInstanceDescriptor &desc = {}, + const WGPUDeviceDescriptor &devDescriptor = {}) { std::future contextFuture = createContextByGpuIdxAsync(gpuIdx, desc, devDescriptor); return waitForContextFuture(contextFuture); @@ -1365,17 +1368,19 @@ inline void queueWorkDoneCallback(WGPUQueueWorkDoneStatus status, /** * @brief Copies data from a GPU buffer to CPU memory. * @param[in] ctx Context instance to manage the operation - * @param[in] tensor Tensor instance representing the GPU buffer to copy from * @param[out] data Pointer to the CPU memory to copy the data to * @param[in] bufferSize Size of the data buffer in bytes * @param[in] op StagingBuffer instance to manage the operation + * @param[in] sourceOffset Offset in the GPU buffer to start copying from. * * @code * toCPU(ctx, tensor, data, bufferSize); * @endcode */ -inline std::future toCPUAsync(Context &ctx, Tensor &tensor, void *data, - size_t bufferSize, CopyData &op) { + +// NOTE: I think this one is redundant? CopyData not used externally. +inline std::future toCPUAsync(Context &ctx, void *data, size_t bufferSize, + CopyData &op, size_t sourceOffset = 0) { // Submit the command buffer and release it. wgpuQueueSubmit(ctx.queue, 1, &op.commandBuffer); wgpuCommandBufferRelease(op.commandBuffer); @@ -1388,8 +1393,8 @@ inline std::future toCPUAsync(Context &ctx, Tensor &tensor, void *data, CallbackData *cbData = new CallbackData{ op.readbackBuffer, // The GPU buffer to be read back. bufferSize, - data, // CPU memory destination. - promise // The promise to be signaled. + data, // CPU memory destination. + promise, // The promise to be signaled. }; // Set up the work-done callback to initiate the buffer mapping. @@ -1402,6 +1407,11 @@ inline std::future toCPUAsync(Context &ctx, Tensor &tensor, void *data, // Begin the asynchronous chain by registering the queue work-done callback. wgpuQueueOnSubmittedWorkDone(ctx.queue, workDoneCallbackInfo); + // Release the readback buffer as it is no longer needed. 
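The new sourceOffset parameter is exercised by the tests added in this patch; in user code a partial readback looks roughly like the sketch below (the offset is in bytes, and the destination only needs to hold the requested slice):

```cpp
#include "gpu.hpp"
#include <vector>
using namespace gpu;

// Copies `count` floats starting at element `first` of a tensor to the CPU.
std::vector<float> readSlice(Context &ctx, Tensor &t, size_t first,
                             size_t count) {
  std::vector<float> host(count, 0.0f);
  size_t offsetBytes = first * sizeof(float);  // sourceOffset is in bytes
  toCPU(ctx, t, host.data(), count * sizeof(float), offsetBytes);
  return host;
}
```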
+ if (op.readbackBuffer) { + wgpuBufferRelease(op.readbackBuffer); + } + return promise->get_future(); } @@ -1417,11 +1427,13 @@ inline std::future toCPUAsync(Context &ctx, Tensor &tensor, void *data, * * @param[in] ctx Context instance to manage the operation * @param[in] tensor Tensor instance representing the GPU buffer to copy from - * @param[in] bufferSize Size of the data buffer in bytes + * @param[in] bufferSize Size to read in bytes as out data. * @param[out] data Pointer to the CPU memory to copy the data to + * @param[in] sourceOffset Offset in the GPU buffer to start copying from. */ inline std::future toCPUAsync(Context &ctx, Tensor &tensor, void *data, - size_t bufferSize) { + size_t bufferSize, + size_t sourceOffset = 0) { // Create a promise that will later be satisfied when the async copy // completes. auto promise = std::make_shared>(); @@ -1430,7 +1442,7 @@ inline std::future toCPUAsync(Context &ctx, Tensor &tensor, void *data, WGPUBufferDescriptor readbackBufferDescriptor = { .label = {.data = nullptr, .length = 0}, .usage = WGPUBufferUsage_CopyDst | WGPUBufferUsage_MapRead, - .size = bufferSize, + .size = bufferSize, // Size of the readback buffer. }; WGPUBuffer readbackBuffer = wgpuDeviceCreateBuffer(ctx.device, &readbackBufferDescriptor); @@ -1438,8 +1450,9 @@ inline std::future toCPUAsync(Context &ctx, Tensor &tensor, void *data, // Create a command encoder and record a copy from the tensor GPU buffer WGPUCommandEncoder commandEncoder = wgpuDeviceCreateCommandEncoder(ctx.device, nullptr); - wgpuCommandEncoderCopyBufferToBuffer(commandEncoder, tensor.data.buffer, 0, - readbackBuffer, 0, bufferSize); + wgpuCommandEncoderCopyBufferToBuffer(commandEncoder, tensor.data.buffer, + sourceOffset, readbackBuffer, 0, + bufferSize); // Finish recording by creating a command buffer and release the encoder. WGPUCommandBuffer commandBuffer = wgpuCommandEncoderFinish(commandEncoder, nullptr); @@ -1472,13 +1485,16 @@ inline std::future toCPUAsync(Context &ctx, Tensor &tensor, void *data, // queueWorkDoneCallback. wgpuQueueOnSubmittedWorkDone(ctx.queue, workDoneCallbackInfo); + if (readbackBuffer) { + wgpuBufferRelease(readbackBuffer); + } + return promise->get_future(); } inline std::future toCPUAsync(Context &ctx, WGPUBuffer buffer, void *data, - size_t size) { - // The size (in bytes) for the copy. - uint64_t bufferSize = size; + size_t bufferSize, + size_t sourceOffset = 0) { // Create an operation structure (here we reuse CopyData solely for its // members that we need to create a readback buffer and command buffer). @@ -1503,7 +1519,7 @@ inline std::future toCPUAsync(Context &ctx, WGPUBuffer buffer, void *data, { WGPUCommandEncoder commandEncoder = wgpuDeviceCreateCommandEncoder(ctx.device, nullptr); - wgpuCommandEncoderCopyBufferToBuffer(commandEncoder, buffer, 0, + wgpuCommandEncoderCopyBufferToBuffer(commandEncoder, buffer, sourceOffset, op.readbackBuffer, 0, bufferSize); op.commandBuffer = wgpuCommandEncoderFinish(commandEncoder, nullptr); wgpuCommandEncoderRelease(commandEncoder); @@ -1516,10 +1532,10 @@ inline std::future toCPUAsync(Context &ctx, WGPUBuffer buffer, void *data, // Allocate callback data CallbackData *cbData = new CallbackData{ - op.readbackBuffer, // The readback buffer created above. - static_cast(bufferSize), // Size of the copy. - data, // Destination CPU memory. - promise // Our promise to satisfy when done. + op.readbackBuffer, // The readback buffer created above. + bufferSize, // Size of the copy. + data, // Destination CPU memory. 
// Offset in the GPU buffer. + promise // Our promise to satisfy when done. }; // Set up the queue work-done callback info. @@ -1532,6 +1548,10 @@ inline std::future toCPUAsync(Context &ctx, WGPUBuffer buffer, void *data, // Start the asynchronous chain by registering the work-done callback. wgpuQueueOnSubmittedWorkDone(ctx.queue, workDoneCallbackInfo); + if (op.readbackBuffer) { + wgpuBufferRelease(op.readbackBuffer); + } + return promise->get_future(); } @@ -1548,9 +1568,11 @@ inline std::future toCPUAsync(Context &ctx, WGPUBuffer buffer, void *data, * @endcode */ template -inline std::future toCPUAsync(Context &ctx, Tensor &tensor, - std::array &data) { - return toCPUAsync(ctx, tensor, data.data(), sizeof(data)); +inline std::future +toCPUAsync(Context &ctx, Tensor &tensor, std::array &data, + size_t sourceOffset = 0) { + return toCPUAsync(ctx, tensor, data.data(), sizeof(data), sourceOffset + ); } /** @@ -1571,8 +1593,10 @@ inline std::future toCPUAsync(Context &ctx, Tensor &tensor, * toCPU(ctx, tensor, data, bufferSize, instance); * @endcode */ -inline void toCPU(Context &ctx, Tensor &tensor, void *data, size_t bufferSize) { - auto future = toCPUAsync(ctx, tensor, data, bufferSize); +inline void toCPU(Context &ctx, Tensor &tensor, void *data, size_t bufferSize, + size_t sourceOffset = 0) { + auto future = + toCPUAsync(ctx, tensor, data, bufferSize, sourceOffset); wait(ctx, future); } @@ -1593,8 +1617,9 @@ inline void toCPU(Context &ctx, Tensor &tensor, void *data, size_t bufferSize) { * toCPU(ctx, buffer, data, size, instance); * @endcode */ -inline void toCPU(Context &ctx, WGPUBuffer buffer, void *data, size_t size) { - auto future = toCPUAsync(ctx, buffer, data, size); +inline void toCPU(Context &ctx, WGPUBuffer buffer, void *data, size_t size, + size_t sourceOffset = 0) { + auto future = toCPUAsync(ctx, buffer, data, size, sourceOffset); wait(ctx, future); } @@ -1616,8 +1641,9 @@ inline void toCPU(Context &ctx, WGPUBuffer buffer, void *data, size_t size) { * @endcode */ template -inline void toCPU(Context &ctx, Tensor &tensor, std::array &data) { - auto future = toCPUAsync(ctx, tensor, data); +inline void toCPU(Context &ctx, Tensor &tensor, std::array &data, + size_t sourceOffset = 0) { + auto future = toCPUAsync(ctx, tensor, data, sourceOffset); wait(ctx, future); } diff --git a/test/test_gpu.cpp b/test/test_gpu.cpp new file mode 100644 index 0000000..2cc4290 --- /dev/null +++ b/test/test_gpu.cpp @@ -0,0 +1,193 @@ +//// filepath: /d:/Code/git/forks/gpu.cpp/test/test_gpu_integration.cpp +#include "gpu.hpp" +#include +#include +#include +#include +#include +#include + +using namespace gpu; + +// A simple WGSL copy kernel that copies input to output. +static const char *kCopyKernel = R"( +@group(0) @binding(0) var inp: array<{{precision}}>; +@group(0) @binding(1) var out: array<{{precision}}>; +@group(0) @binding(1) var dummy: array<{{precision}}>; +@compute @workgroup_size({{workgroupSize}}) +fn main(@builtin(global_invocation_id) gid: vec3) { + let i: u32 = gid.x; + if (i < arrayLength(&inp)) { + out[i] = inp[i]; + } +} +)"; + +// Test using the overload that takes a Tensor. +void testToCPUWithTensor() { + LOG(kDefLog, kInfo, "Running testToCPUWithTensor..."); + + // Create a real GPU context. 
+ #ifdef USE_DAWN_API + Context ctx = createContextByGpuIdx(0); + #else + Context ctx = createContext(); + #endif + + constexpr size_t N = 1024; + std::array inputData, outputData; + for (size_t i = 0; i < N; ++i) { + inputData[i] = static_cast(i); + outputData[i] = 0.0f; + } + + // Create input and output tensors. + Tensor inputTensor = createTensor(ctx, Shape{N}, kf32, inputData.data()); + Tensor outputTensor = createTensor(ctx, Shape{N}, kf32); + + // Create and dispatch the copy kernel. + Kernel copyKernel = createKernel(ctx, {kCopyKernel, 256, kf32}, + Bindings{inputTensor, outputTensor}, + {cdiv(N, 256), 1, 1}); + dispatchKernel(ctx, copyKernel); + + // Synchronously copy GPU output to CPU using the tensor overload. + toCPU(ctx, outputTensor, outputData.data(), sizeof(outputData)); + + // Verify the output matches the input. + for (size_t i = 0; i < N; ++i) { + LOG(kDefLog, kInfo, "inputData[%zu] = %f", i, inputData[i]); + LOG(kDefLog, kInfo, "outputData[%zu] = %f", i, outputData[i]); + assert(outputData[i] == inputData[i]); + } + LOG(kDefLog, kInfo, "testToCPUWithTensor passed."); +} + +// Test using the overload that takes a raw GPU buffer. +// We reuse the Tensor's underlying buffer for this test. +void testToCPUWithBuffer() { + LOG(kDefLog, kInfo, "Running testToCPUWithBuffer..."); + + #ifdef USE_DAWN_API + Context ctx = createContextByGpuIdx(0); + #else + Context ctx = createContext(); + #endif + + constexpr size_t N = 1024; + std::array data, outputData; + for (size_t i = 0; i < N; ++i) { + data[i] = static_cast(i * 2); + outputData[i] = 0.0f; + } + + // Create a tensor to allocate a GPU buffer and initialize it. + Tensor tensor = createTensor(ctx, Shape{N}, kf32, data.data()); + + // Now extract the raw GPU buffer from the tensor. + WGPUBuffer gpuBuffer = tensor.data.buffer; + + // Use the WGPUBuffer overload. This call returns a future. + auto future = toCPUAsync(ctx, gpuBuffer, outputData.data(), sizeof(outputData), 0); + wait(ctx, future); + + // Verify that the CPU output matches the original data. + for (size_t i = 0; i < N; ++i) { + LOG(kDefLog, kInfo, "outputData[%zu] = %f", i, outputData[i]); + assert(outputData[i] == data[i]); + } + LOG(kDefLog, kInfo, "testToCPUWithBuffer passed."); +} + +void testToCPUWithTensorSourceOffset() { + LOG(kDefLog, kInfo, "Running testToCPUWithTensorSourceOffset..."); +#ifdef USE_DAWN_API + Context ctx = createContextByGpuIdx(0); +#else + Context ctx = createContext(); +#endif + + constexpr size_t numElements = 25; + constexpr size_t sourceOffsetElements = 5; // Skip first 5 elements + constexpr size_t copyCount = 10; // Number of floats to copy + size_t copySize = copyCount * sizeof(float); + + // Create an input array with known data. + std::array inputData{}; + for (size_t i = 0; i < numElements; ++i) { + inputData[i] = static_cast(i + 50); // Arbitrary values + } + // Create a tensor from the full data. + Tensor tensor = createTensor(ctx, Shape{numElements}, kf32, inputData.data()); + + // Allocate a destination CPU buffer exactly as large as the data we want to copy. + std::vector cpuOutput(copyCount, -1.0f); + + // Set sourceOffset to skip the first few float elements + size_t sourceOffsetBytes = sourceOffsetElements * sizeof(float); + // Call the tensor overload with sourceOffset and destOffset = 0. + auto future = toCPUAsync(ctx, tensor, cpuOutput.data(), copySize, sourceOffsetBytes); + wait(ctx, future); + + // Verify the copied data matches the expected subset. 
+ for (size_t i = 0; i < copyCount; ++i) { + float expected = inputData[sourceOffsetElements + i]; + float actual = cpuOutput[i]; + LOG(kDefLog, kInfo, "cpuOutput[%zu] = %f", i, actual); + LOG(kDefLog, kInfo, "expected[%zu] = %f", i, expected); + assert(expected == actual); + } + LOG(kDefLog, kInfo, "testToCPUWithTensorSourceOffset passed."); +} + +void testToCPUWithBufferSourceOffset() { + LOG(kDefLog, kInfo, "Running testToCPUWithBufferSourceOffset..."); +#ifdef USE_DAWN_API + Context ctx = createContextByGpuIdx(0); +#else + Context ctx = createContext(); +#endif + + constexpr size_t numElements = 30; + constexpr size_t sourceOffsetElements = 7; // Skip first 7 elements + constexpr size_t copyCount = 12; // Number of floats to copy + size_t copySize = copyCount * sizeof(float); + + // Create an input array with arbitrary data. + std::array inputData{}; + for (size_t i = 0; i < numElements; ++i) { + inputData[i] = static_cast(i + 100); + } + // Create a tensor to initialize a GPU buffer. + Tensor tensor = createTensor(ctx, Shape{numElements}, kf32, inputData.data()); + // Extract the raw GPU buffer from the tensor. + WGPUBuffer buffer = tensor.data.buffer; + + // Allocate a destination CPU buffer exactly as large as needed. + std::vector cpuOutput(copyCount, -2.0f); + size_t sourceOffsetBytes = sourceOffsetElements * sizeof(float); + + // Call the buffer overload with sourceOffset and destOffset = 0. + auto future = toCPUAsync(ctx, buffer, cpuOutput.data(), copySize, sourceOffsetBytes); + wait(ctx, future); + + // Verify that the copied data matches the expected subset. + for (size_t i = 0; i < copyCount; ++i) { + float expected = inputData[sourceOffsetElements + i]; + float actual = cpuOutput[i]; + LOG(kDefLog, kInfo, "cpuOutput[%zu] = %f", i, actual); + LOG(kDefLog, kInfo, "expected[%zu] = %f", i, expected); + assert(expected == actual); + } + LOG(kDefLog, kInfo, "testToCPUWithBufferSourceOffset passed."); +} + +int main() { + LOG(kDefLog, kInfo, "Running GPU integration tests..."); + testToCPUWithTensor(); + testToCPUWithBuffer(); + testToCPUWithTensorSourceOffset(); + testToCPUWithBufferSourceOffset(); + LOG(kDefLog, kInfo, "All tests passed."); + return 0; +} \ No newline at end of file From 16feb9e9f32e8cc2bbc12019a448c856a061d19f Mon Sep 17 00:00:00 2001 From: MichealReed Date: Fri, 21 Feb 2025 22:16:44 -0600 Subject: [PATCH 19/30] remove path --- test/test_gpu.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/test/test_gpu.cpp b/test/test_gpu.cpp index 2cc4290..0954e44 100644 --- a/test/test_gpu.cpp +++ b/test/test_gpu.cpp @@ -1,4 +1,3 @@ -//// filepath: /d:/Code/git/forks/gpu.cpp/test/test_gpu_integration.cpp #include "gpu.hpp" #include #include From e61e80917a73406e8fb8af5a94c743982231f51b Mon Sep 17 00:00:00 2001 From: MichealReed Date: Fri, 21 Feb 2025 22:18:53 -0600 Subject: [PATCH 20/30] format --- test/test_gpu.cpp | 294 +++++++++++++++++++++++----------------------- 1 file changed, 149 insertions(+), 145 deletions(-) diff --git a/test/test_gpu.cpp b/test/test_gpu.cpp index 0954e44..48aa1bc 100644 --- a/test/test_gpu.cpp +++ b/test/test_gpu.cpp @@ -1,7 +1,7 @@ #include "gpu.hpp" #include -#include #include +#include #include #include #include @@ -24,169 +24,173 @@ fn main(@builtin(global_invocation_id) gid: vec3) { // Test using the overload that takes a Tensor. void testToCPUWithTensor() { - LOG(kDefLog, kInfo, "Running testToCPUWithTensor..."); - - // Create a real GPU context. 
- #ifdef USE_DAWN_API - Context ctx = createContextByGpuIdx(0); - #else - Context ctx = createContext(); - #endif - - constexpr size_t N = 1024; - std::array inputData, outputData; - for (size_t i = 0; i < N; ++i) { - inputData[i] = static_cast(i); - outputData[i] = 0.0f; - } - - // Create input and output tensors. - Tensor inputTensor = createTensor(ctx, Shape{N}, kf32, inputData.data()); - Tensor outputTensor = createTensor(ctx, Shape{N}, kf32); - - // Create and dispatch the copy kernel. - Kernel copyKernel = createKernel(ctx, {kCopyKernel, 256, kf32}, - Bindings{inputTensor, outputTensor}, - {cdiv(N, 256), 1, 1}); - dispatchKernel(ctx, copyKernel); - - // Synchronously copy GPU output to CPU using the tensor overload. - toCPU(ctx, outputTensor, outputData.data(), sizeof(outputData)); - - // Verify the output matches the input. - for (size_t i = 0; i < N; ++i) { - LOG(kDefLog, kInfo, "inputData[%zu] = %f", i, inputData[i]); - LOG(kDefLog, kInfo, "outputData[%zu] = %f", i, outputData[i]); - assert(outputData[i] == inputData[i]); - } - LOG(kDefLog, kInfo, "testToCPUWithTensor passed."); + LOG(kDefLog, kInfo, "Running testToCPUWithTensor..."); + +// Create a real GPU context. +#ifdef USE_DAWN_API + Context ctx = createContextByGpuIdx(0); +#else + Context ctx = createContext(); +#endif + + constexpr size_t N = 1024; + std::array inputData, outputData; + for (size_t i = 0; i < N; ++i) { + inputData[i] = static_cast(i); + outputData[i] = 0.0f; + } + + // Create input and output tensors. + Tensor inputTensor = createTensor(ctx, Shape{N}, kf32, inputData.data()); + Tensor outputTensor = createTensor(ctx, Shape{N}, kf32); + + // Create and dispatch the copy kernel. + Kernel copyKernel = + createKernel(ctx, {kCopyKernel, 256, kf32}, + Bindings{inputTensor, outputTensor}, {cdiv(N, 256), 1, 1}); + dispatchKernel(ctx, copyKernel); + + // Synchronously copy GPU output to CPU using the tensor overload. + toCPU(ctx, outputTensor, outputData.data(), sizeof(outputData)); + + // Verify the output matches the input. + for (size_t i = 0; i < N; ++i) { + LOG(kDefLog, kInfo, "inputData[%zu] = %f", i, inputData[i]); + LOG(kDefLog, kInfo, "outputData[%zu] = %f", i, outputData[i]); + assert(outputData[i] == inputData[i]); + } + LOG(kDefLog, kInfo, "testToCPUWithTensor passed."); } // Test using the overload that takes a raw GPU buffer. // We reuse the Tensor's underlying buffer for this test. void testToCPUWithBuffer() { - LOG(kDefLog, kInfo, "Running testToCPUWithBuffer..."); - - #ifdef USE_DAWN_API - Context ctx = createContextByGpuIdx(0); - #else - Context ctx = createContext(); - #endif - - constexpr size_t N = 1024; - std::array data, outputData; - for (size_t i = 0; i < N; ++i) { - data[i] = static_cast(i * 2); - outputData[i] = 0.0f; - } - - // Create a tensor to allocate a GPU buffer and initialize it. - Tensor tensor = createTensor(ctx, Shape{N}, kf32, data.data()); - - // Now extract the raw GPU buffer from the tensor. - WGPUBuffer gpuBuffer = tensor.data.buffer; - - // Use the WGPUBuffer overload. This call returns a future. - auto future = toCPUAsync(ctx, gpuBuffer, outputData.data(), sizeof(outputData), 0); - wait(ctx, future); - - // Verify that the CPU output matches the original data. 
- for (size_t i = 0; i < N; ++i) { - LOG(kDefLog, kInfo, "outputData[%zu] = %f", i, outputData[i]); - assert(outputData[i] == data[i]); - } - LOG(kDefLog, kInfo, "testToCPUWithBuffer passed."); + LOG(kDefLog, kInfo, "Running testToCPUWithBuffer..."); + +#ifdef USE_DAWN_API + Context ctx = createContextByGpuIdx(0); +#else + Context ctx = createContext(); +#endif + + constexpr size_t N = 1024; + std::array data, outputData; + for (size_t i = 0; i < N; ++i) { + data[i] = static_cast(i * 2); + outputData[i] = 0.0f; + } + + // Create a tensor to allocate a GPU buffer and initialize it. + Tensor tensor = createTensor(ctx, Shape{N}, kf32, data.data()); + + // Now extract the raw GPU buffer from the tensor. + WGPUBuffer gpuBuffer = tensor.data.buffer; + + // Use the WGPUBuffer overload. This call returns a future. + auto future = + toCPUAsync(ctx, gpuBuffer, outputData.data(), sizeof(outputData), 0); + wait(ctx, future); + + // Verify that the CPU output matches the original data. + for (size_t i = 0; i < N; ++i) { + LOG(kDefLog, kInfo, "outputData[%zu] = %f", i, outputData[i]); + assert(outputData[i] == data[i]); + } + LOG(kDefLog, kInfo, "testToCPUWithBuffer passed."); } void testToCPUWithTensorSourceOffset() { - LOG(kDefLog, kInfo, "Running testToCPUWithTensorSourceOffset..."); + LOG(kDefLog, kInfo, "Running testToCPUWithTensorSourceOffset..."); #ifdef USE_DAWN_API - Context ctx = createContextByGpuIdx(0); + Context ctx = createContextByGpuIdx(0); #else - Context ctx = createContext(); + Context ctx = createContext(); #endif - constexpr size_t numElements = 25; - constexpr size_t sourceOffsetElements = 5; // Skip first 5 elements - constexpr size_t copyCount = 10; // Number of floats to copy - size_t copySize = copyCount * sizeof(float); - - // Create an input array with known data. - std::array inputData{}; - for (size_t i = 0; i < numElements; ++i) { - inputData[i] = static_cast(i + 50); // Arbitrary values - } - // Create a tensor from the full data. - Tensor tensor = createTensor(ctx, Shape{numElements}, kf32, inputData.data()); - - // Allocate a destination CPU buffer exactly as large as the data we want to copy. - std::vector cpuOutput(copyCount, -1.0f); - - // Set sourceOffset to skip the first few float elements - size_t sourceOffsetBytes = sourceOffsetElements * sizeof(float); - // Call the tensor overload with sourceOffset and destOffset = 0. - auto future = toCPUAsync(ctx, tensor, cpuOutput.data(), copySize, sourceOffsetBytes); - wait(ctx, future); - - // Verify the copied data matches the expected subset. - for (size_t i = 0; i < copyCount; ++i) { - float expected = inputData[sourceOffsetElements + i]; - float actual = cpuOutput[i]; - LOG(kDefLog, kInfo, "cpuOutput[%zu] = %f", i, actual); - LOG(kDefLog, kInfo, "expected[%zu] = %f", i, expected); - assert(expected == actual); - } - LOG(kDefLog, kInfo, "testToCPUWithTensorSourceOffset passed."); + constexpr size_t numElements = 25; + constexpr size_t sourceOffsetElements = 5; // Skip first 5 elements + constexpr size_t copyCount = 10; // Number of floats to copy + size_t copySize = copyCount * sizeof(float); + + // Create an input array with known data. + std::array inputData{}; + for (size_t i = 0; i < numElements; ++i) { + inputData[i] = static_cast(i + 50); // Arbitrary values + } + // Create a tensor from the full data. + Tensor tensor = createTensor(ctx, Shape{numElements}, kf32, inputData.data()); + + // Allocate a destination CPU buffer exactly as large as the data we want to + // copy. 
+ std::vector cpuOutput(copyCount, -1.0f); + + // Set sourceOffset to skip the first few float elements + size_t sourceOffsetBytes = sourceOffsetElements * sizeof(float); + // Call the tensor overload with sourceOffset and destOffset = 0. + auto future = + toCPUAsync(ctx, tensor, cpuOutput.data(), copySize, sourceOffsetBytes); + wait(ctx, future); + + // Verify the copied data matches the expected subset. + for (size_t i = 0; i < copyCount; ++i) { + float expected = inputData[sourceOffsetElements + i]; + float actual = cpuOutput[i]; + LOG(kDefLog, kInfo, "cpuOutput[%zu] = %f", i, actual); + LOG(kDefLog, kInfo, "expected[%zu] = %f", i, expected); + assert(expected == actual); + } + LOG(kDefLog, kInfo, "testToCPUWithTensorSourceOffset passed."); } void testToCPUWithBufferSourceOffset() { - LOG(kDefLog, kInfo, "Running testToCPUWithBufferSourceOffset..."); + LOG(kDefLog, kInfo, "Running testToCPUWithBufferSourceOffset..."); #ifdef USE_DAWN_API - Context ctx = createContextByGpuIdx(0); + Context ctx = createContextByGpuIdx(0); #else - Context ctx = createContext(); + Context ctx = createContext(); #endif - constexpr size_t numElements = 30; - constexpr size_t sourceOffsetElements = 7; // Skip first 7 elements - constexpr size_t copyCount = 12; // Number of floats to copy - size_t copySize = copyCount * sizeof(float); - - // Create an input array with arbitrary data. - std::array inputData{}; - for (size_t i = 0; i < numElements; ++i) { - inputData[i] = static_cast(i + 100); - } - // Create a tensor to initialize a GPU buffer. - Tensor tensor = createTensor(ctx, Shape{numElements}, kf32, inputData.data()); - // Extract the raw GPU buffer from the tensor. - WGPUBuffer buffer = tensor.data.buffer; - - // Allocate a destination CPU buffer exactly as large as needed. - std::vector cpuOutput(copyCount, -2.0f); - size_t sourceOffsetBytes = sourceOffsetElements * sizeof(float); - - // Call the buffer overload with sourceOffset and destOffset = 0. - auto future = toCPUAsync(ctx, buffer, cpuOutput.data(), copySize, sourceOffsetBytes); - wait(ctx, future); - - // Verify that the copied data matches the expected subset. - for (size_t i = 0; i < copyCount; ++i) { - float expected = inputData[sourceOffsetElements + i]; - float actual = cpuOutput[i]; - LOG(kDefLog, kInfo, "cpuOutput[%zu] = %f", i, actual); - LOG(kDefLog, kInfo, "expected[%zu] = %f", i, expected); - assert(expected == actual); - } - LOG(kDefLog, kInfo, "testToCPUWithBufferSourceOffset passed."); + constexpr size_t numElements = 30; + constexpr size_t sourceOffsetElements = 7; // Skip first 7 elements + constexpr size_t copyCount = 12; // Number of floats to copy + size_t copySize = copyCount * sizeof(float); + + // Create an input array with arbitrary data. + std::array inputData{}; + for (size_t i = 0; i < numElements; ++i) { + inputData[i] = static_cast(i + 100); + } + // Create a tensor to initialize a GPU buffer. + Tensor tensor = createTensor(ctx, Shape{numElements}, kf32, inputData.data()); + // Extract the raw GPU buffer from the tensor. + WGPUBuffer buffer = tensor.data.buffer; + + // Allocate a destination CPU buffer exactly as large as needed. + std::vector cpuOutput(copyCount, -2.0f); + size_t sourceOffsetBytes = sourceOffsetElements * sizeof(float); + + // Call the buffer overload with sourceOffset and destOffset = 0. + auto future = + toCPUAsync(ctx, buffer, cpuOutput.data(), copySize, sourceOffsetBytes); + wait(ctx, future); + + // Verify that the copied data matches the expected subset. 
+ for (size_t i = 0; i < copyCount; ++i) { + float expected = inputData[sourceOffsetElements + i]; + float actual = cpuOutput[i]; + LOG(kDefLog, kInfo, "cpuOutput[%zu] = %f", i, actual); + LOG(kDefLog, kInfo, "expected[%zu] = %f", i, expected); + assert(expected == actual); + } + LOG(kDefLog, kInfo, "testToCPUWithBufferSourceOffset passed."); } int main() { - LOG(kDefLog, kInfo, "Running GPU integration tests..."); - testToCPUWithTensor(); - testToCPUWithBuffer(); - testToCPUWithTensorSourceOffset(); - testToCPUWithBufferSourceOffset(); - LOG(kDefLog, kInfo, "All tests passed."); - return 0; -} \ No newline at end of file + LOG(kDefLog, kInfo, "Running GPU integration tests..."); + testToCPUWithTensor(); + testToCPUWithBuffer(); + testToCPUWithTensorSourceOffset(); + testToCPUWithBufferSourceOffset(); + LOG(kDefLog, kInfo, "All tests passed."); + return 0; +} From ad8698dc1cb10ac89f020e6920d680328a6200ae Mon Sep 17 00:00:00 2001 From: MichealReed Date: Fri, 21 Feb 2025 22:22:25 -0600 Subject: [PATCH 21/30] doc formatting --- docs/gpuflow.md | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/docs/gpuflow.md b/docs/gpuflow.md index d4eb37a..420397d 100644 --- a/docs/gpuflow.md +++ b/docs/gpuflow.md @@ -61,18 +61,18 @@ flowchart TD O --- N ``` +• The `gpu::Array` (which wraps a GPU buffer with usage and size) and the `gpu::Shape` (which defines dimensions and rank) are combined—via the creation process—to produce a `gpu::Tensor`. +• A `gpu::TensorView` provides a non‑owning view into a slice of a `gpu::Tensor`. Ex. `TensorView view = {tensor, 0, 256};` +• `gpu::Bindings` collect multiple Tensors (or TensorViews) along with view offset/size information for use in a kernel. +• The `gpu::TensorPool` (managed by the Context) is responsible for the lifetime of tensors and GPU resource cleanup. +• `gpu::KernelCode` contains the WGSL shader template plus metadata (workgroup size, precision, label, and entry point) that drive the kernel configuration. +• The `gpu::createKernelAsync/gpu::createKernel` functions (within the Execution Flow) use the `gpu::Context`, `gpu::Bindings`, and `gpu::KernelCode` to configure and construct a `gpu::Kernel` that manages all the underlying GPU resources (buffers, bind groups, compute pipeline, etc.). +• `gpu::KernelCode`’s workgroup size (a `gpu::Shape`) defines the dispatch configuration, and the `gpu::Kernel` eventually uses the underlying `gpu::Array` (contains` WGPUBuffer, WGPUBufferUsage, size_t`) and `gpu::Shape` data from the created Tensor. + +`gpu::Tensor` Ranks: Rank 0: Scalar Rank 1: Vector Rank 2: Matrix Rank 3: 3D Tensor (or Cube) Rank 4: 4D Tensor -Rank ..: Higher Dimensional Tensors - - -• The `gpu::Array` (which wraps a GPU buffer with usage and size) and the `gpu::Shape` (which defines dimensions and rank) are combined—via the creation process—to produce a `gpu::Tensor`. -• A `gpu::TensorView` provides a non‑owning view into a slice of a `gpu::Tensor`. Ex. `TensorView view = {tensor, 0, 256};` -• gpu::Bindings collect multiple Tensors (or TensorViews) along with view offset/size information for use in a kernel. -• The gpu::TensorPool (managed by the Context) is responsible for the lifetime of tensors and GPU resource cleanup. -• gpu::KernelCode contains the WGSL shader template plus metadata (workgroup size, precision, label, and entry point) that drive the kernel configuration. 
-• The gpu::createKernelAsync/gpu::createKernel functions (within the Execution Flow) use the gpu::Context, gpu::Bindings, and gpu::KernelCode to configure and construct a gpu::Kernel that manages all the underlying GPU resources (buffers, bind groups, compute pipeline, etc.). -• gpu::KernelCode’s workgroup size (a gpu::Shape) defines the dispatch configuration, and the gpu::Kernel eventually uses the underlying gpu::Array (contains WGPUBuffer, WGPUBufferUsage, size_t) and gpu::Shape data from the created Tensor. +Rank (max 8): Higher Dimensional Tensors \ No newline at end of file From 025af2a8f4621ba9612354a6e524044da2188ac3 Mon Sep 17 00:00:00 2001 From: MichealReed Date: Fri, 21 Feb 2025 22:29:25 -0600 Subject: [PATCH 22/30] doc nits --- docs/gpuflow.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/gpuflow.md b/docs/gpuflow.md index 420397d..fee9d4c 100644 --- a/docs/gpuflow.md +++ b/docs/gpuflow.md @@ -11,7 +11,7 @@ flowchart TD E["Upload Data via toGPU
(raw buffer)<br> toGPU<br> (ctx, data, buffer, size)"]
F["Upload Data via toGPU<br> (Tensor overload)<br> toGPU(ctx, data, tensor)"]
- G["Optional: Upload Kernel Parameters<br> toGPU(ctx, params, Kernel)"]
+ G["Optional:<br> Kernel Parameters<br>
toGPU(ctx, params, Kernel)"] end %% Buffer Setup & Bindings From 3776dcd50152ba4fc18ca9029006bd9e9588dca7 Mon Sep 17 00:00:00 2001 From: MichealReed Date: Sat, 22 Feb 2025 11:33:30 -0600 Subject: [PATCH 23/30] set project root on root cmakelists --- CMakeLists.txt | 2 +- cmake/gpu.cmake | 1 - docs/gpuflow.md | 2 +- 3 files changed, 2 insertions(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index a17602e..85911a7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -2,7 +2,7 @@ # and cmake/gpu.cmake for more details cmake_minimum_required(VERSION 3.28) project(gpu) - +set(PROJECT_ROOT "${CMAKE_CURRENT_SOURCE_DIR}") set(CMAKE_EXPORT_COMPILE_COMMANDS ON) # export compile_commands.json to use with # LSP set(CMAKE_CXX_STANDARD 20) diff --git a/cmake/gpu.cmake b/cmake/gpu.cmake index f936991..d991a18 100644 --- a/cmake/gpu.cmake +++ b/cmake/gpu.cmake @@ -39,4 +39,3 @@ else() target_include_directories(gpu PUBLIC "${DAWN_BUILD_DIR}/gen/src/emdawnwebgpu/include/") target_include_directories(gpu PUBLIC "${DAWN_BUILD_DIR}/gen/src/emdawnwebgpu/include/webgpu/") endif() - diff --git a/docs/gpuflow.md b/docs/gpuflow.md index fee9d4c..d13a228 100644 --- a/docs/gpuflow.md +++ b/docs/gpuflow.md @@ -75,4 +75,4 @@ Rank 1: Vector Rank 2: Matrix Rank 3: 3D Tensor (or Cube) Rank 4: 4D Tensor -Rank (max 8): Higher Dimensional Tensors \ No newline at end of file +Rank (max 8): Higher Dimensional Tensors From d58e1911b5e015ea073c4e4350d3378a2edf80bd Mon Sep 17 00:00:00 2001 From: MichealReed Date: Sat, 22 Feb 2025 16:15:51 -0600 Subject: [PATCH 24/30] fix linux issue with callback info --- gpu.hpp | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/gpu.hpp b/gpu.hpp index 931d646..8c661bc 100644 --- a/gpu.hpp +++ b/gpu.hpp @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -1354,11 +1355,12 @@ inline void queueWorkDoneCallback(WGPUQueueWorkDoneStatus status, __LINE__); // Set up the buffer mapping callback information. - WGPUBufferMapCallbackInfo mapCallbackInfo; - mapCallbackInfo.mode = WGPUCallbackMode_AllowSpontaneous; - mapCallbackInfo.callback = bufferMapCallback; - mapCallbackInfo.userdata1 = cbData; - mapCallbackInfo.userdata2 = nullptr; + WGPUBufferMapCallbackInfo mapCallbackInfo = { + .mode = WGPUCallbackMode_AllowSpontaneous, + .callback = bufferMapCallback, + .userdata1 = cbData, // Pass the callback data. + .userdata2 = nullptr // No additional user data. + }; // Begin the asynchronous mapping of the readback buffer. wgpuBufferMapAsync(cbData->buffer, WGPUMapMode_Read, 0, cbData->bufferSize, From 498ba74b73962d8b647b844fc570cf758ebaf467 Mon Sep 17 00:00:00 2001 From: MichealReed Date: Sat, 22 Feb 2025 17:54:16 -0600 Subject: [PATCH 25/30] should not release readback buffer --- gpu.hpp | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/gpu.hpp b/gpu.hpp index 8c661bc..4854338 100644 --- a/gpu.hpp +++ b/gpu.hpp @@ -1486,11 +1486,7 @@ inline std::future toCPUAsync(Context &ctx, Tensor &tensor, void *data, // Register the callback. The async chain continues inside // queueWorkDoneCallback. wgpuQueueOnSubmittedWorkDone(ctx.queue, workDoneCallbackInfo); - - if (readbackBuffer) { - wgpuBufferRelease(readbackBuffer); - } - + return promise->get_future(); } @@ -1550,10 +1546,6 @@ inline std::future toCPUAsync(Context &ctx, WGPUBuffer buffer, void *data, // Start the asynchronous chain by registering the work-done callback. 
wgpuQueueOnSubmittedWorkDone(ctx.queue, workDoneCallbackInfo); - if (op.readbackBuffer) { - wgpuBufferRelease(op.readbackBuffer); - } - return promise->get_future(); } From 2db9be10fb3f0298294ba199d71eca894746e3a6 Mon Sep 17 00:00:00 2001 From: MichealReed Date: Sat, 22 Feb 2025 18:07:09 -0600 Subject: [PATCH 26/30] clean up callback syntax --- gpu.hpp | 36 +++++++++++++++++------------------- 1 file changed, 17 insertions(+), 19 deletions(-) diff --git a/gpu.hpp b/gpu.hpp index 4854338..b057514 100644 --- a/gpu.hpp +++ b/gpu.hpp @@ -1306,7 +1306,7 @@ createContextByGpuIdx(int gpuIdx, const WGPUInstanceDescriptor &desc = {}, */ inline void bufferMapCallback(WGPUMapAsyncStatus status, WGPUStringView message, void *userdata1, void * /*userdata2*/) { - CallbackData *cbData = reinterpret_cast(userdata1); + const CallbackData *cbData = static_cast(userdata1); // Check that mapping succeeded. check(status == WGPUMapAsyncStatus_Success, "Map readbackBuffer", __FILE__, __LINE__); @@ -1349,17 +1349,17 @@ inline void bufferMapCallback(WGPUMapAsyncStatus status, WGPUStringView message, */ inline void queueWorkDoneCallback(WGPUQueueWorkDoneStatus status, void *userdata1, void * /*userdata2*/) { - CallbackData *cbData = reinterpret_cast(userdata1); + const CallbackData *cbData = static_cast(userdata1); // Ensure the queue work finished successfully. check(status == WGPUQueueWorkDoneStatus_Success, "Queue work done", __FILE__, __LINE__); // Set up the buffer mapping callback information. WGPUBufferMapCallbackInfo mapCallbackInfo = { - .mode = WGPUCallbackMode_AllowSpontaneous, - .callback = bufferMapCallback, - .userdata1 = cbData, // Pass the callback data. - .userdata2 = nullptr // No additional user data. + .mode = WGPUCallbackMode_AllowSpontaneous, + .callback = bufferMapCallback, + .userdata1 = const_cast(cbData), // Pass the callback data. + .userdata2 = nullptr // No additional user data. }; // Begin the asynchronous mapping of the readback buffer. @@ -1400,11 +1400,11 @@ inline std::future toCPUAsync(Context &ctx, void *data, size_t bufferSize, }; // Set up the work-done callback to initiate the buffer mapping. - WGPUQueueWorkDoneCallbackInfo workDoneCallbackInfo; - workDoneCallbackInfo.mode = WGPUCallbackMode_AllowSpontaneous; - workDoneCallbackInfo.callback = queueWorkDoneCallback; - workDoneCallbackInfo.userdata1 = cbData; // Pass the callback data. - workDoneCallbackInfo.userdata2 = nullptr; + WGPUQueueWorkDoneCallbackInfo workDoneCallbackInfo = { + .mode = WGPUCallbackMode_AllowSpontaneous, + .callback = queueWorkDoneCallback, + .userdata1 = const_cast(cbData), + .userdata2 = nullptr}; // Begin the asynchronous chain by registering the queue work-done callback. wgpuQueueOnSubmittedWorkDone(ctx.queue, workDoneCallbackInfo); @@ -1486,7 +1486,7 @@ inline std::future toCPUAsync(Context &ctx, Tensor &tensor, void *data, // Register the callback. The async chain continues inside // queueWorkDoneCallback. 
wgpuQueueOnSubmittedWorkDone(ctx.queue, workDoneCallbackInfo); - + return promise->get_future(); } @@ -1562,11 +1562,10 @@ inline std::future toCPUAsync(Context &ctx, WGPUBuffer buffer, void *data, * @endcode */ template -inline std::future -toCPUAsync(Context &ctx, Tensor &tensor, std::array &data, - size_t sourceOffset = 0) { - return toCPUAsync(ctx, tensor, data.data(), sizeof(data), sourceOffset - ); +inline std::future toCPUAsync(Context &ctx, Tensor &tensor, + std::array &data, + size_t sourceOffset = 0) { + return toCPUAsync(ctx, tensor, data.data(), sizeof(data), sourceOffset); } /** @@ -1589,8 +1588,7 @@ toCPUAsync(Context &ctx, Tensor &tensor, std::array &data, */ inline void toCPU(Context &ctx, Tensor &tensor, void *data, size_t bufferSize, size_t sourceOffset = 0) { - auto future = - toCPUAsync(ctx, tensor, data, bufferSize, sourceOffset); + auto future = toCPUAsync(ctx, tensor, data, bufferSize, sourceOffset); wait(ctx, future); } From 752a53a3d426fb5bb87a89f31b601817adea25c7 Mon Sep 17 00:00:00 2001 From: MichealReed Date: Sat, 22 Feb 2025 18:38:09 -0600 Subject: [PATCH 27/30] add stress test --- test/test_gpu.cpp | 78 +++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 69 insertions(+), 9 deletions(-) diff --git a/test/test_gpu.cpp b/test/test_gpu.cpp index 48aa1bc..99a1af6 100644 --- a/test/test_gpu.cpp +++ b/test/test_gpu.cpp @@ -1,12 +1,34 @@ #include "gpu.hpp" #include #include +#include #include #include #include #include using namespace gpu; +using namespace std::chrono; + + +// Forward declarations: +void testToCPUWithTensor(); +void testToCPUWithBuffer(); +void testToCPUWithTensorSourceOffset(); +void testToCPUWithBufferSourceOffset(); +void stressTestToCPU(); + +int main() { + LOG(kDefLog, kInfo, "Running GPU integration tests..."); + testToCPUWithTensor(); + testToCPUWithBuffer(); + testToCPUWithTensorSourceOffset(); + testToCPUWithBufferSourceOffset(); + stressTestToCPU(); + LOG(kDefLog, kInfo, "All tests passed."); + return 0; +} + // A simple WGSL copy kernel that copies input to output. static const char *kCopyKernel = R"( @@ -22,6 +44,7 @@ fn main(@builtin(global_invocation_id) gid: vec3) { } )"; + // Test using the overload that takes a Tensor. void testToCPUWithTensor() { LOG(kDefLog, kInfo, "Running testToCPUWithTensor..."); @@ -185,12 +208,49 @@ void testToCPUWithBufferSourceOffset() { LOG(kDefLog, kInfo, "testToCPUWithBufferSourceOffset passed."); } -int main() { - LOG(kDefLog, kInfo, "Running GPU integration tests..."); - testToCPUWithTensor(); - testToCPUWithBuffer(); - testToCPUWithTensorSourceOffset(); - testToCPUWithBufferSourceOffset(); - LOG(kDefLog, kInfo, "All tests passed."); - return 0; -} +void stressTestToCPU() { + LOG(kDefLog, kInfo, "Running stressTestToCPU for 2 seconds..."); + +#ifdef USE_DAWN_API + Context ctx = createContextByGpuIdx(0); +#else + Context ctx = createContext(); +#endif + + constexpr size_t N = 1024; + // Create a persistent tensor with some test data. + std::vector inputData(N, 0.0f); + for (size_t i = 0; i < N; ++i) { + inputData[i] = static_cast(i); + } + Tensor tensor = createTensor(ctx, Shape{N}, kf32, inputData.data()); + + // Prepare to run for one second. 
+ auto startTime = high_resolution_clock::now(); + std::vector> futures; + size_t opCount = 0; + while (high_resolution_clock::now() - startTime < seconds(2)) { + // Allocate an output buffer (using a shared_ptr so it stays valid until the future completes) + auto outputData = std::make_shared>(N, 0.0f); + // Use the tensor overload; we’re copying the entire tensor (destOffset = 0) + LOG(kDefLog, kInfo, "Copying %zu bytes from GPU to CPU...", N * sizeof(float)); + // log count + LOG(kDefLog, kInfo, "opCount = %zu", opCount); + auto fut = toCPUAsync(ctx, tensor, outputData->data(), N * sizeof(float), 0); + futures.push_back(std::move(fut)); + ++opCount; + } + + // Wait for all submitted operations to complete. + for (auto &f : futures) { + wait(ctx, f); + } + + auto endTime = high_resolution_clock::now(); + auto totalMs = duration_cast(endTime - startTime).count(); + double throughput = (opCount / (totalMs / 1000.0)); + + LOG(kDefLog, kInfo, "Stress test completed:\n" + " %zu GPU to CPU operations in %lld ms\n" + " Throughput: %.2f ops/sec", opCount, totalMs, throughput); +} \ No newline at end of file From 5f82ff4d9e0fdd1de7f2ccf8e0a0a6d8e981b2fb Mon Sep 17 00:00:00 2001 From: MichealReed Date: Sat, 22 Feb 2025 19:07:20 -0600 Subject: [PATCH 28/30] linux has a segfault if wait for events after. --- test/test_gpu.cpp | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/test/test_gpu.cpp b/test/test_gpu.cpp index 99a1af6..aa42b83 100644 --- a/test/test_gpu.cpp +++ b/test/test_gpu.cpp @@ -227,7 +227,6 @@ void stressTestToCPU() { // Prepare to run for one second. auto startTime = high_resolution_clock::now(); - std::vector> futures; size_t opCount = 0; while (high_resolution_clock::now() - startTime < seconds(2)) { // Allocate an output buffer (using a shared_ptr so it stays valid until the future completes) @@ -237,14 +236,9 @@ void stressTestToCPU() { // log count LOG(kDefLog, kInfo, "opCount = %zu", opCount); auto fut = toCPUAsync(ctx, tensor, outputData->data(), N * sizeof(float), 0); - futures.push_back(std::move(fut)); + wait(ctx, fut); ++opCount; } - - // Wait for all submitted operations to complete. 
- for (auto &f : futures) { - wait(ctx, f); - } auto endTime = high_resolution_clock::now(); auto totalMs = duration_cast(endTime - startTime).count(); From 28dabf277eebb9fb5541870014287a9d7f533036 Mon Sep 17 00:00:00 2001 From: MichealReed Date: Sun, 23 Feb 2025 10:22:27 -0600 Subject: [PATCH 29/30] EOF newline --- test/test_gpu.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_gpu.cpp b/test/test_gpu.cpp index aa42b83..b855712 100644 --- a/test/test_gpu.cpp +++ b/test/test_gpu.cpp @@ -247,4 +247,4 @@ void stressTestToCPU() { LOG(kDefLog, kInfo, "Stress test completed:\n" " %zu GPU to CPU operations in %lld ms\n" " Throughput: %.2f ops/sec", opCount, totalMs, throughput); -} \ No newline at end of file +} From 39c816ca6b4ba0dff8808b680e0cf8f7b36973d4 Mon Sep 17 00:00:00 2001 From: MichealReed Date: Sat, 1 Mar 2025 17:34:42 -0600 Subject: [PATCH 30/30] added sleeptime optional arg --- gpu.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gpu.hpp b/gpu.hpp index b057514..69ed0e9 100644 --- a/gpu.hpp +++ b/gpu.hpp @@ -869,7 +869,7 @@ template T wait(Context &ctx, std::future &f) { * Context ctx = waitForContextFuture(contextFuture); * @endcode */ -template T waitForContextFuture(std::future &f) { +template T waitForContextFuture(std::future &f, size_t sleepTime = 10) { #ifdef __EMSCRIPTEN__ while (f.wait_for(std::chrono::milliseconds(0)) != std::future_status::ready) { @@ -879,7 +879,7 @@ template T waitForContextFuture(std::future &f) { #else while (f.wait_for(std::chrono::milliseconds(0)) != std::future_status::ready) { - std::this_thread::sleep_for(std::chrono::milliseconds(10)); + std::this_thread::sleep_for(std::chrono::milliseconds(sleepTime)); } return f.get(); #endif
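
Usage sketch for the `sourceOffset` parameter introduced in this series. This is a minimal, self-contained example assuming the plain `createContext()` path (not `createContextByGpuIdx`); the element count and the `skip` constant are illustrative and not taken from the tests above.

```cpp
#include "gpu.hpp"
#include <vector>

using namespace gpu;

int main() {
  Context ctx = createContext();

  // 16 floats on the GPU; read back only the last 12.
  constexpr size_t N = 16;
  std::vector<float> host(N);
  for (size_t i = 0; i < N; ++i)
    host[i] = static_cast<float>(i);
  Tensor t = createTensor(ctx, Shape{N}, kf32, host.data());

  // Skip the first 4 floats by passing a byte offset as sourceOffset.
  constexpr size_t skip = 4;
  std::vector<float> out(N - skip, 0.0f);
  toCPU(ctx, t, out.data(), out.size() * sizeof(float),
        skip * sizeof(float)); // blocking overload, sourceOffset in bytes

  // The asynchronous overload takes the same arguments and returns a future.
  auto fut = toCPUAsync(ctx, t, out.data(), out.size() * sizeof(float),
                        skip * sizeof(float));
  wait(ctx, fut);
  return 0;
}
```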
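
A similar sketch for the optional `sleepTime` argument added to `waitForContextFuture` in the final patch. The `std::async` wrapper around the blocking `createContext()` is only there to make the example self-contained; it stands in for however the `std::future` of a `Context` is actually produced and is not part of the library API.

```cpp
#include "gpu.hpp"
#include <future>

using namespace gpu;

int main() {
  // Assumed stand-in: wrap the blocking createContext() so we have a future.
  std::future<Context> contextFuture =
      std::async(std::launch::async, [] { return createContext(); });

  // Poll every 5 ms instead of the default 10 ms while waiting.
  Context ctx = waitForContextFuture(contextFuture, 5);
  return 0;
}
```

Per the diff, the new argument only changes the sleep interval in the non-Emscripten branch of the wait loop; the Emscripten branch is left as it was.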