From b748399f1f567640577a1bfd46a38814bf5c22e9 Mon Sep 17 00:00:00 2001 From: MichealReed Date: Tue, 11 Feb 2025 01:43:13 -0600 Subject: [PATCH 01/30] simplify cmake, build dawn in repo, fix render for windows --- .gitignore | 3 + CMakeLists.txt | 29 +------ cmake/example.cmake | 27 +++---- cmake/gpu.cmake | 170 +++++++++++++++++++++++++--------------- cmake/webgpu.cmake | 61 -------------- examples/render/run.cpp | 11 +-- 6 files changed, 132 insertions(+), 169 deletions(-) delete mode 100644 cmake/webgpu.cmake diff --git a/.gitignore b/.gitignore index 1a8b5bc..c7f60c3 100644 --- a/.gitignore +++ b/.gitignore @@ -20,3 +20,6 @@ build .cache compile_commands.json +# editor specific +.vscode/* + diff --git a/CMakeLists.txt b/CMakeLists.txt index db89df7..a464b34 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,23 +1,11 @@ cmake_minimum_required(VERSION 3.28) project(gpu) -include("${CMAKE_CURRENT_SOURCE_DIR}/cmake/webgpu.cmake") - set(CMAKE_EXPORT_COMPILE_COMMANDS ON) # export compile_commands.json to use with # LSP -set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_STANDARD 20) set(CMAKE_CXX_STANDARD_REQUIRED ON) -option(USE_LOCAL_LIBS - "Use local libraries instead of fetching from the internet" OFF) - -# Ensure the build type is set -if(NOT CMAKE_BUILD_TYPE) - set(CMAKE_BUILD_TYPE - Release - CACHE STRING "Choose the type of build: Debug or Release" FORCE) -endif() - option(FASTBUILD "Option to enable fast builds" OFF) if(FASTBUILD) set(CMAKE_BUILD_TYPE None) # Avoid default flags of predefined build types @@ -30,21 +18,8 @@ if(DEBUG) set(CMAKE_CXX_FLAGS "-O0 -g") endif() -if(WIN64) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DWEBGPU_BACKEND_DAWN") -endif() - include("${CMAKE_CURRENT_SOURCE_DIR}/cmake/gpu.cmake") -message(STATUS "CMAKE_CURRENT_SOURCE_DIR: ${CMAKE_CURRENT_SOURCE_DIR}") -message( - STATUS - "Include directories for wgpu: ${CMAKE_CURRENT_SOURCE_DIR}/third_party/headers" -) - add_library(gpud SHARED gpu.hpp) set_target_properties(gpud PROPERTIES LINKER_LANGUAGE CXX) -target_link_libraries(gpud PRIVATE wgpu) -target_link_libraries(gpud PRIVATE webgpu) -target_link_libraries(gpud PRIVATE gpu) -install(TARGETS gpud) +target_link_libraries(gpud PRIVATE gpu) \ No newline at end of file diff --git a/cmake/example.cmake b/cmake/example.cmake index eba8e7c..41b15fd 100644 --- a/cmake/example.cmake +++ b/cmake/example.cmake @@ -1,17 +1,17 @@ set(CMAKE_EXPORT_COMPILE_COMMANDS ON) # export compile_commands.json to use with # LSP -set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_STANDARD 20) set(CMAKE_CXX_STANDARD_REQUIRED ON) get_filename_component(PROJECT_ROOT ${CMAKE_CURRENT_SOURCE_DIR} DIRECTORY) get_filename_component(PROJECT_ROOT ${PROJECT_ROOT} DIRECTORY) # Construct potential paths -set(FILEPATH_CURRENT_DIR "${CMAKE_CURRENT_SOURCE_DIR}/${FILENAME}") -set(FILEPATH_PROJECT_ROOT "${PROJECT_ROOT}/${FILENAME}") +set(FILEPATH_CURRENT_DIR "${DIRECTORY}/") +set(FILEPATH_PROJECT_ROOT "${PROJECT_ROOT}/") # Include file finding utility script -include("${CMAKE_CURRENT_SOURCE_DIR}/cmake/find_gpu.cmake") +include("${FILEPATH_PROJECT_ROOT}/cmake/find_gpu.cmake") # Check if the file exists in the current directory find_project_root(${CMAKE_CURRENT_SOURCE_DIR} ${FILENAME} @@ -49,20 +49,19 @@ endif() if(NOT TARGET gpu) message(STATUS "GPU_LIB not found") - include("${TARGET_FILE_PATH}/cmake/webgpu.cmake") include("${TARGET_FILE_PATH}/cmake/gpu.cmake") endif() - add_executable(${PROJECT_NAME} run.cpp) target_link_libraries(${PROJECT_NAME} PRIVATE gpu) -target_link_libraries(${PROJECT_NAME} PRIVATE wgpu) 
-target_link_libraries(${PROJECT_NAME} PRIVATE webgpu) +target_link_libraries(${PROJECT_NAME} PRIVATE ${WEBGPU_DAWN}) -if(WIN32) - # Ensure DLL is copied if on Windows +if(MSVC) +# Copy webgpu_dawn.dll to the build directory add_custom_command( - TARGET ${PROJECT_NAME} - POST_BUILD - COMMAND ${CMAKE_COMMAND} -E copy_if_different ${DLL_PATH} - $) + TARGET ${PROJECT_NAME} POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy + ${DAWN_INSTALL_PREFIX}/${CMAKE_BUILD_TYPE}/webgpu_dawn.dll + $ + ) endif() + diff --git a/cmake/gpu.cmake b/cmake/gpu.cmake index 08db244..15f3b43 100644 --- a/cmake/gpu.cmake +++ b/cmake/gpu.cmake @@ -1,69 +1,115 @@ -get_filename_component(PROJECT_ROOT ${CMAKE_CURRENT_SOURCE_DIR} DIRECTORY) -get_filename_component(PROJECT_ROOT ${PROJECT_ROOT} DIRECTORY) - -# Construct potential paths -set(FILEPATH_CURRENT_DIR "${CMAKE_CURRENT_SOURCE_DIR}/${FILENAME}") -set(FILEPATH_PROJECT_ROOT "${PROJECT_ROOT}/${FILENAME}") - -# Include file finding utility script -include("${CMAKE_CURRENT_SOURCE_DIR}/cmake/find_gpu.cmake") - -# Check if the file exists in the current directory -find_project_root(${CMAKE_CURRENT_SOURCE_DIR} ${FILENAME} TARGET_FILE_PATH) -if("${TARGET_FILE_PATH}" STREQUAL "") - find_project_root(${FILEPATH_CURRENT_DIR} ${FILENAME} TARGET_FILE_PATH) - if("${TARGET_FILE_PATH}" STREQUAL "") - message( - FATAL_ERROR - "File ${FILENAME} not found in either ${CMAKE_CURRENT_SOURCE_DIR} or ${CMAKE_CURRENT_SOURCE_DIR}/../../" - ) - endif() -endif() +set(FILENAME "gpu.hpp") -# Define architecture and build type directories or file names -if(CMAKE_SIZEOF_VOID_P EQUAL 8) - set(ARCH "x64") +if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/${FILENAME}") + set(FILEPATH_PROJECT_ROOT "${CMAKE_CURRENT_SOURCE_DIR}") else() - set(ARCH "x86") + get_filename_component(PROJECT_ROOT ${CMAKE_CURRENT_SOURCE_DIR} DIRECTORY) + get_filename_component(PROJECT_ROOT ${PROJECT_ROOT} DIRECTORY) + + set(FILEPATH_PROJECT_ROOT "${PROJECT_ROOT}/") endif() -if(CMAKE_BUILD_TYPE STREQUAL "Debug") - set(BUILD_TYPE "Debug") + +include(FetchContent) + +set(FETCHCONTENT_BASE_DIR "${FILEPATH_PROJECT_ROOT}/third_party/fetchcontent/_deps") +set(DAWN_INSTALL_PREFIX "${FETCHCONTENT_BASE_DIR}/dawn-build/out/${CMAKE_BUILD_TYPE}" CACHE INTERNAL "Dawn install location" FORCE) + + +# Before fetching, set configuration options for Dawn. +# These CMake variables are “global” (cached INTERNAL) so that Dawn’s own CMakeLists.txt +# will pick them up. Adjust them as needed. 
+set(DAWN_BUILD_TYPE ${CMAKE_BUILD_TYPE} CACHE INTERNAL "Dawn build type" FORCE) +set(DCMAKE_INSTALL_PREFIX ${DAWN_INSTALL_PREFIX} CACHE INTERNAL "Dawn install location" FORCE) + +# Dawn options +set(DAWN_FETCH_DEPENDENCIES ON CACHE INTERNAL "Fetch Dawn dependencies" FORCE) +set(DAWN_ENABLE_INSTALL ON CACHE INTERNAL "Enable Dawn installation" FORCE) +set(DAWN_BUILD_MONOLITHIC_LIBRARY OFF CACHE INTERNAL "Build Dawn monolithically" FORCE) +set(DAWN_BUILD_EXAMPLES OFF CACHE INTERNAL "Build Dawn examples" FORCE) +set(DAWN_BUILD_SAMPLES OFF CACHE INTERNAL "Build Dawn samples" FORCE) +set(DAWN_BUILD_TESTS OFF CACHE INTERNAL "Build Dawn tests" FORCE) +set(DAWN_BUILD_UTILS OFF CACHE INTERNAL "Build Dawn utilities" FORCE) +set(TINT_BUILD_TESTS OFF CACHE INTERNAL "Build Tint Tests" FORCE) +set(TINT_BUILD_IR_BINARY OFF CACHE INTERNAL "Build Tint IR binary" FORCE) +set(TINT_BUILD_CMD_TOOLS OFF CACHE INTERNAL "Build Tint command line tools" FORCE) +set(BUILD_SHARED_LIBS OFF CACHE INTERNAL "Build shared libraries" FORCE) + + +# Set up an install location for Dawn – you can change this to a specific location. + + +FetchContent_Declare( + dawn + DOWNLOAD_COMMAND + cd ${FETCHCONTENT_BASE_DIR}/dawn-src && + git init && + git fetch --depth=1 https://dawn.googlesource.com/dawn && + git reset --hard FETCH_HEAD +) + + +# This call will download the repository and add it as a subdirectory. +FetchContent_MakeAvailable(dawn) + + +# At this point, assuming Dawn’s CMakeLists.txt is written so that an install step is available, +# we trigger a build of its install target. This custom target will build (and install) Dawn +# into ${DAWN_INSTALL_PREFIX}. (If Dawn already adds an install target, you may simply depend on it.) +add_custom_target(build_dawn_config ALL + COMMAND ${CMAKE_COMMAND} ${FETCHCONTENT_BASE_DIR}/dawn-src + -B ${DAWN_INSTALL_PREFIX} + -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} + -DDAWN_FETCH_DEPENDENCIES=ON + -DDAWN_ENABLE_INSTALL=ON + -DDAWN_BUILD_MONOLITHIC_LIBRARY=OFF + -DDAWN_BUILD_EXAMPLES=OFF + -DDAWN_BUILD_SAMPLES=OFF + -DDAWN_BUILD_TESTS=OFF + -DDAWN_BUILD_UTILS=OFF + -DTINT_BUILD_TESTS=OFF + -DTINT_BUILD_IR_BINARY=OFF + -DTINT_BUILD_CMD_TOOLS=OFF + -DBUILD_SHARED_LIBS=OFF + -G "${CMAKE_GENERATOR}" + COMMENT "Configuring Dawn build with custom options in ${DAWN_INSTALL_PREFIX}" +) + +add_custom_target(build_dawn_install ALL + COMMAND ${CMAKE_COMMAND} --build ${DAWN_INSTALL_PREFIX} --target install + COMMENT "Installing Dawn into ${DAWN_INSTALL_PREFIX}" +) + +include(${FETCHCONTENT_BASE_DIR}/dawn-build/cmake/DawnTargets.cmake) + +set(GPU_SOURCES + "${FILEPATH_PROJECT_ROOT}/gpu.cpp" + "${FILEPATH_PROJECT_ROOT}/numeric_types/half.cpp" +) + +set(GPU_HEADERS + "${FILEPATH_PROJECT_ROOT}/gpu.hpp" + "${FILEPATH_PROJECT_ROOT}/utils/logging.hpp" + "${FILEPATH_PROJECT_ROOT}/utils/array_utils.hpp" + "${FILEPATH_PROJECT_ROOT}/numeric_types/half.hpp" +) + +if(EMSCRIPTEN) + file(REMOVE "${FILEPATH_PROJECT_ROOT}/webgpu/webgpu.h") else() - set(BUILD_TYPE "Release") + list(APPEND GPU_HEADERS "${DAWN_INSTALL_PREFIX}/gen/webgpu-headers/webgpu.h") endif() -add_library(webgpulib SHARED IMPORTED) -add_library(gpu INTERFACE) -add_library(wgpu INTERFACE) -add_dependencies(gpu webgpulib) -# Define the header-only library -target_include_directories(gpu INTERFACE ${TARGET_FILE_PATH}) - -# Add headers webgpu.h -target_include_directories(wgpu - INTERFACE ${TARGET_FILE_PATH}/third_party/headers) -include(ExternalProject) - -set(DAWN_EXT_PREFIX "${TARGET_FILE_PATH}/third_party/local/dawn") - -ExternalProject_Add( - 
dawn_project - PREFIX ${DAWN_EXT_PREFIX} - GIT_REPOSITORY "https://dawn.googlesource.com/dawn" - GIT_TAG "main" - SOURCE_DIR "${DAWN_EXT_PREFIX}/source" - BINARY_DIR "${DAWN_EXT_PREFIX}/build" - INSTALL_DIR "${DAWN_EXT_PREFIX}/install" - GIT_SUBMODULES "" - # setting cmake args doesn't work and I don't know why - CONFIGURE_COMMAND - ${CMAKE_COMMAND} -S ${DAWN_EXT_PREFIX}/source -B - ${DAWN_EXT_PREFIX}/build -DDAWN_FETCH_DEPENDENCIES=ON - -DDAWN_ENABLE_INSTALL=ON -DDAWN_BUILD_MONOLITHIC_LIBRARY=ON - -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} -G ${CMAKE_GENERATOR} - INSTALL_COMMAND ${CMAKE_COMMAND} --install . --prefix - ${DAWN_EXT_PREFIX}/install - LOG_INSTALL ON) -find_library(LIBDAWN dawn PATHS "${DAWN_EXT_PREFIX}/install/lib") -target_link_libraries(webgpulib INTERFACE ${LIBDAWN}) + +# Create the INTERFACE library ‘gpu’ +add_library(gpu STATIC ${GPU_SOURCES} ${GPU_HEADERS}) +target_include_directories(gpu PUBLIC "${FILEPATH_PROJECT_ROOT}") +target_include_directories(gpu PUBLIC "${FILEPATH_PROJECT_ROOT}/third_party/headers") + +# Ensure that the gpu target is built only after Dawn has been installed. +add_dependencies(gpu build_dawn_install) + +find_library(WEBGPU_DAWN + NAMES webgpu_dawn + HINTS "${DAWN_INSTALL_PREFIX}/src/dawn/native/Debug/" +) \ No newline at end of file diff --git a/cmake/webgpu.cmake b/cmake/webgpu.cmake deleted file mode 100644 index c63f1e2..0000000 --- a/cmake/webgpu.cmake +++ /dev/null @@ -1,61 +0,0 @@ -# Specify the filename to search for -set(FILENAME "gpu.hpp") - -get_filename_component(PROJECT_ROOT ${CMAKE_CURRENT_SOURCE_DIR} DIRECTORY) -get_filename_component(PROJECT_ROOT ${PROJECT_ROOT} DIRECTORY) - -# Construct potential paths -set(FILEPATH_CURRENT_DIR "${CMAKE_CURRENT_SOURCE_DIR}/${FILENAME}") -set(FILEPATH_PROJECT_ROOT "${PROJECT_ROOT}/${FILENAME}") - -# Include file finding utility script -include("${CMAKE_CURRENT_SOURCE_DIR}/cmake/find_gpu.cmake") - -# Check if the file exists in the current directory -find_project_root(${CMAKE_CURRENT_SOURCE_DIR} ${FILENAME} TARGET_FILE_PATH) -if("${TARGET_FILE_PATH}" STREQUAL "") - find_project_root(${FILEPATH_CURRENT_DIR} ${FILENAME} TARGET_FILE_PATH) - if("${TARGET_FILE_PATH}" STREQUAL "") - message( - FATAL_ERROR - "File ${FILENAME} not found in either ${CMAKE_CURRENT_SOURCE_DIR} or ${CMAKE_CURRENT_SOURCE_DIR}/../../" - ) - endif() -endif() - -include(FetchContent) - -set(FETCHCONTENT_BASE_DIR "${TARGET_FILE_PATH}/third_party/fetchcontent") -set(WEBGPU_DIST_LOCAL_PATH - "${TARGET_FILE_PATH}/third_party/local/WebGPU-distribution") - -if(USE_LOCAL_LIBS) - set(WEBGPU_DIST_GIT_REPO ${WEBGPU_DIST_LOCAL_PATH}) - message(STATUS "Using local WebGPU distribution: ${WEBGPU_DIST_LOCAL_PATH}") -else() - set(WEBGPU_DIST_GIT_REPO - "https://github.com/eliemichel/WebGPU-distribution") -endif() - -option(WEBGPU_TAG "WebGPU distribution tag to use") -if(NOT WEBGPU_TAG) - set(WEBGPU_TAG "dawn") -endif() -message(STATUS "Using WebGPU distribution tag: ${WEBGPU_TAG}") - -if(WEBGPU_TAG STREQUAL "dawn") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DWEBGPU_BACKEND_DAWN") - # use specific commit set(WEBGPU_TAG - # "1025b977e1927b6d0327e67352f90feb4bcf8274") set(WEBGPU_TAG - # "acf972b7b909f52e183bdae3971b93bb13d4a29e") - # add_compile_options(-UABSL_INTERNAL_AT_LEAST_CXX20) set(CMAKE_CXX_FLAGS - # "${CMAKE_CXX_FLAGS} -UABSL_INTERNAL_AT_LEAST_CXX20") - message(STATUS "Using Dawn backend") -endif() - -FetchContent_Declare( - webgpu - GIT_REPOSITORY ${WEBGPU_DIST_GIT_REPO} - GIT_TAG ${WEBGPU_TAG} - GIT_SHALLOW TRUE) 
-FetchContent_MakeAvailable(webgpu) diff --git a/examples/render/run.cpp b/examples/render/run.cpp index f2c6bec..f9a90f9 100644 --- a/examples/render/run.cpp +++ b/examples/render/run.cpp @@ -149,11 +149,12 @@ int main(int argc, char **argv) { std::array raster; for (size_t i = 0; i < screen.size(); ++i) { - size_t index = - std::min(sizeof(intensity) - 2, - std::max(0ul, static_cast(screen[i] * - (sizeof(intensity) - 2)))); - raster[i] = intensity[index]; + // Convert all values to size_t to ensure proper type matching + const size_t intensity_max = sizeof(intensity) - 2; + const size_t scaled_value = static_cast(screen[i] * intensity_max); + size_t index = std::min(intensity_max, + std::max(static_cast(0), scaled_value)); + raster[i] = intensity[index]; } char buffer[(NROWS + 2) * (NCOLS + 2)]; From bbc3addc4a8fb5ed7bf3c9ecf525a2c91f70ff6a Mon Sep 17 00:00:00 2001 From: MichealReed Date: Tue, 11 Feb 2025 13:28:05 -0600 Subject: [PATCH 02/30] More simplification --- CMakeLists.txt | 2 + cmake/example.cmake | 68 ++++++++++-------------------- cmake/find_gpu.cmake | 30 -------------- cmake/gpu.cmake | 99 ++++++++++++++++++++------------------------ 4 files changed, 70 insertions(+), 129 deletions(-) delete mode 100644 cmake/find_gpu.cmake diff --git a/CMakeLists.txt b/CMakeLists.txt index a464b34..ca735a9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,3 +1,5 @@ +# This only builds a shared lib, see cmake/example.cmake +# and cmake/gpu.cmake for more details cmake_minimum_required(VERSION 3.28) project(gpu) diff --git a/cmake/example.cmake b/cmake/example.cmake index 41b15fd..d92c204 100644 --- a/cmake/example.cmake +++ b/cmake/example.cmake @@ -1,32 +1,20 @@ -set(CMAKE_EXPORT_COMPILE_COMMANDS ON) # export compile_commands.json to use with - # LSP -set(CMAKE_CXX_STANDARD 20) -set(CMAKE_CXX_STANDARD_REQUIRED ON) +# Getting Started with CMAKE +# Each example includes this and sets PROJECT_NAME +# cd examples/hello_world +# cmake -S . 
build/ -DCMAKE_BUILD_TYPE=Release +# cmake --build build/ --config Release +# ./build/hello_world + +if(NOT MSVC) + set(CMAKE_CXX_STANDARD 17) +else() + set(CMAKE_CXX_STANDARD 20) +endif() +# Path finding logic to find our root recipes from nested folders get_filename_component(PROJECT_ROOT ${CMAKE_CURRENT_SOURCE_DIR} DIRECTORY) get_filename_component(PROJECT_ROOT ${PROJECT_ROOT} DIRECTORY) -# Construct potential paths -set(FILEPATH_CURRENT_DIR "${DIRECTORY}/") -set(FILEPATH_PROJECT_ROOT "${PROJECT_ROOT}/") - -# Include file finding utility script -include("${FILEPATH_PROJECT_ROOT}/cmake/find_gpu.cmake") - -# Check if the file exists in the current directory -find_project_root(${CMAKE_CURRENT_SOURCE_DIR} ${FILENAME} - TARGET_FILE_PATH) -if("${TARGET_FILE_PATH}" STREQUAL "") - find_project_root(${FILEPATH_CURRENT_DIR} ${FILENAME} - TARGET_FILE_PATH) - if("${TARGET_FILE_PATH}" STREQUAL "") - message( - FATAL_ERROR - "File ${FILENAME} not found in either ${CMAKE_CURRENT_SOURCE_DIR} or ${CMAKE_CURRENT_SOURCE_DIR}/../../" - ) - endif() -endif() - # Ensure the build type is set if(NOT CMAKE_BUILD_TYPE) set(CMAKE_BUILD_TYPE @@ -34,34 +22,24 @@ if(NOT CMAKE_BUILD_TYPE) CACHE STRING "Choose the type of build: Debug or Release" FORCE) endif() -# Define architecture and build type directories or file names -if(CMAKE_SIZEOF_VOID_P EQUAL 8) - set(ARCH "x64") -else() - set(ARCH "x86") -endif() - -if(CMAKE_BUILD_TYPE STREQUAL "Debug") - set(BUILD_TYPE "Debug") -else() - set(BUILD_TYPE "Release") -endif() +# Include the gpu.cpp + Dawn library +include("${PROJECT_ROOT}/cmake/gpu.cmake") -if(NOT TARGET gpu) - message(STATUS "GPU_LIB not found") - include("${TARGET_FILE_PATH}/cmake/gpu.cmake") -endif() +# Create the executable add_executable(${PROJECT_NAME} run.cpp) + +# Link gpu + dawn library target_link_libraries(${PROJECT_NAME} PRIVATE gpu) -target_link_libraries(${PROJECT_NAME} PRIVATE ${WEBGPU_DAWN}) +# Certain platforms need to copy the library files to the build directory if(MSVC) -# Copy webgpu_dawn.dll to the build directory + # Copy webgpu_dawn.dll to the build directory + # CMake multigenerators like MSVC need --config Release on + # the cmake --build command or they will output to /Debug add_custom_command( TARGET ${PROJECT_NAME} POST_BUILD COMMAND ${CMAKE_COMMAND} -E copy ${DAWN_INSTALL_PREFIX}/${CMAKE_BUILD_TYPE}/webgpu_dawn.dll - $ - ) + $) endif() diff --git a/cmake/find_gpu.cmake b/cmake/find_gpu.cmake deleted file mode 100644 index b6b7dad..0000000 --- a/cmake/find_gpu.cmake +++ /dev/null @@ -1,30 +0,0 @@ -# file name to find -set(FILENAME "gpu.hpp") - -# Function to check for file existence up the directory hierarchy -function(find_project_root current_dir filename result_var) - set(found FALSE) # Flag to indicate if the file is found - set(current_check_dir "${current_dir}") # Start from the given directory - # using 1 is jsut to supress the cmane-format warning - foreach(i RANGE 0 2 1) - set(filepath "${current_check_dir}/${filename}") - - if(EXISTS "${filepath}") - set(${result_var} - "${current_check_dir}" - PARENT_SCOPE) - set(found TRUE) - break() - endif() - - # Move one level up - get_filename_component(current_check_dir "${current_check_dir}" - DIRECTORY) - endforeach() - - if(NOT found) - set(${result_var} - "" - PARENT_SCOPE) # Set to empty if not found - endif() -endfunction() diff --git a/cmake/gpu.cmake b/cmake/gpu.cmake index 15f3b43..c8f011a 100644 --- a/cmake/gpu.cmake +++ b/cmake/gpu.cmake @@ -1,44 +1,46 @@ set(FILENAME "gpu.hpp") +# Setup project root here. 
if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/${FILENAME}") - set(FILEPATH_PROJECT_ROOT "${CMAKE_CURRENT_SOURCE_DIR}") + set(PROJECT_ROOT "${CMAKE_CURRENT_SOURCE_DIR}") else() get_filename_component(PROJECT_ROOT ${CMAKE_CURRENT_SOURCE_DIR} DIRECTORY) get_filename_component(PROJECT_ROOT ${PROJECT_ROOT} DIRECTORY) - - set(FILEPATH_PROJECT_ROOT "${PROJECT_ROOT}/") + + set(PROJECT_ROOT "${PROJECT_ROOT}/") endif() +message(STATUS "PROJECT_ROOT: ${PROJECT_ROOT}") + include(FetchContent) -set(FETCHCONTENT_BASE_DIR "${FILEPATH_PROJECT_ROOT}/third_party/fetchcontent/_deps") +set(FETCHCONTENT_BASE_DIR "${PROJECT_ROOT}/third_party/fetchcontent/_deps") set(DAWN_INSTALL_PREFIX "${FETCHCONTENT_BASE_DIR}/dawn-build/out/${CMAKE_BUILD_TYPE}" CACHE INTERNAL "Dawn install location" FORCE) # Before fetching, set configuration options for Dawn. -# These CMake variables are “global” (cached INTERNAL) so that Dawn’s own CMakeLists.txt -# will pick them up. Adjust them as needed. -set(DAWN_BUILD_TYPE ${CMAKE_BUILD_TYPE} CACHE INTERNAL "Dawn build type" FORCE) set(DCMAKE_INSTALL_PREFIX ${DAWN_INSTALL_PREFIX} CACHE INTERNAL "Dawn install location" FORCE) +set(CMAKE_CONFIGURATION_TYPES ${CMAKE_BUILD_TYPE} CACHE INTERNAL "Dawn configuration types" FORCE) -# Dawn options -set(DAWN_FETCH_DEPENDENCIES ON CACHE INTERNAL "Fetch Dawn dependencies" FORCE) -set(DAWN_ENABLE_INSTALL ON CACHE INTERNAL "Enable Dawn installation" FORCE) -set(DAWN_BUILD_MONOLITHIC_LIBRARY OFF CACHE INTERNAL "Build Dawn monolithically" FORCE) +# Dawn options for more, +# see https://dawn.googlesource.com/dawn/+/refs/heads/main/CMakeLists.txt +set(DAWN_ALWAYS_ASSERT OFF CACHE INTERNAL "Always assert in Dawn" FORCE) +set(DAWN_BUILD_MONOLITHIC_LIBRARY ON CACHE INTERNAL "Build Dawn monolithically" FORCE) set(DAWN_BUILD_EXAMPLES OFF CACHE INTERNAL "Build Dawn examples" FORCE) set(DAWN_BUILD_SAMPLES OFF CACHE INTERNAL "Build Dawn samples" FORCE) set(DAWN_BUILD_TESTS OFF CACHE INTERNAL "Build Dawn tests" FORCE) -set(DAWN_BUILD_UTILS OFF CACHE INTERNAL "Build Dawn utilities" FORCE) +set(DAWN_ENABLE_INSTALL ON CACHE INTERNAL "Enable Dawn installation" FORCE) +set(DAWN_FETCH_DEPENDENCIES ON CACHE INTERNAL "Fetch Dawn dependencies" FORCE) + set(TINT_BUILD_TESTS OFF CACHE INTERNAL "Build Tint Tests" FORCE) set(TINT_BUILD_IR_BINARY OFF CACHE INTERNAL "Build Tint IR binary" FORCE) set(TINT_BUILD_CMD_TOOLS OFF CACHE INTERNAL "Build Tint command line tools" FORCE) -set(BUILD_SHARED_LIBS OFF CACHE INTERNAL "Build shared libraries" FORCE) - -# Set up an install location for Dawn – you can change this to a specific location. +set(BUILD_SHARED_LIBS OFF CACHE INTERNAL "Build shared libraries" FORCE) +# Fetch Setup FetchContent_Declare( dawn DOWNLOAD_COMMAND @@ -49,67 +51,56 @@ FetchContent_Declare( ) -# This call will download the repository and add it as a subdirectory. +# Download the repository and add it as a subdirectory. FetchContent_MakeAvailable(dawn) -# At this point, assuming Dawn’s CMakeLists.txt is written so that an install step is available, -# we trigger a build of its install target. This custom target will build (and install) Dawn -# into ${DAWN_INSTALL_PREFIX}. (If Dawn already adds an install target, you may simply depend on it.) -add_custom_target(build_dawn_config ALL +# Since we require Dawn to be built before linking against it, we need to configure it now. 
+execute_process( COMMAND ${CMAKE_COMMAND} ${FETCHCONTENT_BASE_DIR}/dawn-src -B ${DAWN_INSTALL_PREFIX} -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} - -DDAWN_FETCH_DEPENDENCIES=ON - -DDAWN_ENABLE_INSTALL=ON - -DDAWN_BUILD_MONOLITHIC_LIBRARY=OFF - -DDAWN_BUILD_EXAMPLES=OFF - -DDAWN_BUILD_SAMPLES=OFF - -DDAWN_BUILD_TESTS=OFF - -DDAWN_BUILD_UTILS=OFF - -DTINT_BUILD_TESTS=OFF - -DTINT_BUILD_IR_BINARY=OFF - -DTINT_BUILD_CMD_TOOLS=OFF - -DBUILD_SHARED_LIBS=OFF -G "${CMAKE_GENERATOR}" - COMMENT "Configuring Dawn build with custom options in ${DAWN_INSTALL_PREFIX}" ) -add_custom_target(build_dawn_install ALL - COMMAND ${CMAKE_COMMAND} --build ${DAWN_INSTALL_PREFIX} --target install - COMMENT "Installing Dawn into ${DAWN_INSTALL_PREFIX}" +# Build Dawn +execute_process( + WORKING_DIRECTORY ${FETCHCONTENT_BASE_DIR}/dawn-src + COMMAND ${CMAKE_COMMAND} --build ${DAWN_INSTALL_PREFIX} --config ${CMAKE_BUILD_TYPE} ) -include(${FETCHCONTENT_BASE_DIR}/dawn-build/cmake/DawnTargets.cmake) - +# Add sources set(GPU_SOURCES - "${FILEPATH_PROJECT_ROOT}/gpu.cpp" - "${FILEPATH_PROJECT_ROOT}/numeric_types/half.cpp" + "${PROJECT_ROOT}/gpu.cpp" + "${PROJECT_ROOT}/numeric_types/half.cpp" ) +# Add headers set(GPU_HEADERS - "${FILEPATH_PROJECT_ROOT}/gpu.hpp" - "${FILEPATH_PROJECT_ROOT}/utils/logging.hpp" - "${FILEPATH_PROJECT_ROOT}/utils/array_utils.hpp" - "${FILEPATH_PROJECT_ROOT}/numeric_types/half.hpp" + "${PROJECT_ROOT}/gpu.hpp" + "${PROJECT_ROOT}/utils/logging.hpp" + "${PROJECT_ROOT}/utils/array_utils.hpp" + "${PROJECT_ROOT}/numeric_types/half.hpp" ) +# Emscripten includes a header automatically if(EMSCRIPTEN) - file(REMOVE "${FILEPATH_PROJECT_ROOT}/webgpu/webgpu.h") + file(REMOVE "${PROJECT_ROOT}/webgpu/webgpu.h") else() - list(APPEND GPU_HEADERS "${DAWN_INSTALL_PREFIX}/gen/webgpu-headers/webgpu.h") + list(APPEND GPU_HEADERS "${PROJECT_ROOT}/third_party/headers/webgpu/webgpu.h") endif() -# Create the INTERFACE library ‘gpu’ +# Create the STATIC library for gpu add_library(gpu STATIC ${GPU_SOURCES} ${GPU_HEADERS}) -target_include_directories(gpu PUBLIC "${FILEPATH_PROJECT_ROOT}") -target_include_directories(gpu PUBLIC "${FILEPATH_PROJECT_ROOT}/third_party/headers") +target_include_directories(gpu PUBLIC "${PROJECT_ROOT}") +target_include_directories(gpu PUBLIC "${PROJECT_ROOT}/third_party/headers") -# Ensure that the gpu target is built only after Dawn has been installed. 
-add_dependencies(gpu build_dawn_install) - -find_library(WEBGPU_DAWN +# Find the monolithic library for Dawn +find_library(WEBGPU_DAWN_MONOLITHIC NAMES webgpu_dawn - HINTS "${DAWN_INSTALL_PREFIX}/src/dawn/native/Debug/" -) \ No newline at end of file + HINTS "${DAWN_INSTALL_PREFIX}/src/dawn/native/${CMAKE_BUILD_TYPE}" +) + +# Link the monolithic library +target_link_libraries(gpu PRIVATE ${WEBGPU_DAWN_MONOLITHIC}) From 2360ba9af7432c33a2703eb2a88706d4150387dc Mon Sep 17 00:00:00 2001 From: MichealReed Date: Tue, 11 Feb 2025 13:44:09 -0600 Subject: [PATCH 03/30] cleanup --- CMakeLists.txt | 2 +- cmake/gpu.cmake | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index ca735a9..e8e569a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -24,4 +24,4 @@ include("${CMAKE_CURRENT_SOURCE_DIR}/cmake/gpu.cmake") add_library(gpud SHARED gpu.hpp) set_target_properties(gpud PROPERTIES LINKER_LANGUAGE CXX) -target_link_libraries(gpud PRIVATE gpu) \ No newline at end of file +target_link_libraries(gpud PRIVATE gpu) diff --git a/cmake/gpu.cmake b/cmake/gpu.cmake index c8f011a..11d6c67 100644 --- a/cmake/gpu.cmake +++ b/cmake/gpu.cmake @@ -41,6 +41,8 @@ set(BUILD_SHARED_LIBS OFF CACHE INTERNAL "Build shared libraries" FORCE) # Fetch Setup +# Add a commit hash to pin the version of Dawn. +# git fetch --depth=1 url FetchContent_Declare( dawn DOWNLOAD_COMMAND From 30f7594896ecd9c4b7616bd9ad0d03598f0b4939 Mon Sep 17 00:00:00 2001 From: MichealReed Date: Tue, 11 Feb 2025 15:59:24 -0600 Subject: [PATCH 04/30] build path for msvc find library --- cmake/gpu.cmake | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/cmake/gpu.cmake b/cmake/gpu.cmake index 11d6c67..1767a50 100644 --- a/cmake/gpu.cmake +++ b/cmake/gpu.cmake @@ -6,7 +6,6 @@ if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/${FILENAME}") else() get_filename_component(PROJECT_ROOT ${CMAKE_CURRENT_SOURCE_DIR} DIRECTORY) get_filename_component(PROJECT_ROOT ${PROJECT_ROOT} DIRECTORY) - set(PROJECT_ROOT "${PROJECT_ROOT}/") endif() @@ -21,7 +20,6 @@ set(DAWN_INSTALL_PREFIX "${FETCHCONTENT_BASE_DIR}/dawn-build/out/${CMAKE_BUILD_T # Before fetching, set configuration options for Dawn. 
set(DCMAKE_INSTALL_PREFIX ${DAWN_INSTALL_PREFIX} CACHE INTERNAL "Dawn install location" FORCE) -set(CMAKE_CONFIGURATION_TYPES ${CMAKE_BUILD_TYPE} CACHE INTERNAL "Dawn configuration types" FORCE) # Dawn options for more, # see https://dawn.googlesource.com/dawn/+/refs/heads/main/CMakeLists.txt @@ -98,11 +96,18 @@ add_library(gpu STATIC ${GPU_SOURCES} ${GPU_HEADERS}) target_include_directories(gpu PUBLIC "${PROJECT_ROOT}") target_include_directories(gpu PUBLIC "${PROJECT_ROOT}/third_party/headers") -# Find the monolithic library for Dawn -find_library(WEBGPU_DAWN_MONOLITHIC +# find_library, windows adds extra folder +if(MSVC) + find_library(WEBGPU_DAWN_MONOLITHIC NAMES webgpu_dawn - HINTS "${DAWN_INSTALL_PREFIX}/src/dawn/native/${CMAKE_BUILD_TYPE}" -) + PATHS "${DAWN_INSTALL_PREFIX}/src/dawn/native/${CMAKE_BUILD_TYPE}" + ) +else() + find_library(WEBGPU_DAWN_MONOLITHIC + NAMES webgpu_dawn + PATHS "${DAWN_INSTALL_PREFIX}/src/dawn/native" + ) +endif() # Link the monolithic library target_link_libraries(gpu PRIVATE ${WEBGPU_DAWN_MONOLITHIC}) From 82ff79d1b2853e29f9e5de81c93bef9670535b4d Mon Sep 17 00:00:00 2001 From: MichealReed Date: Tue, 11 Feb 2025 16:13:29 -0600 Subject: [PATCH 05/30] require the libs so we fail early --- cmake/gpu.cmake | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cmake/gpu.cmake b/cmake/gpu.cmake index 1767a50..b687e83 100644 --- a/cmake/gpu.cmake +++ b/cmake/gpu.cmake @@ -101,11 +101,13 @@ if(MSVC) find_library(WEBGPU_DAWN_MONOLITHIC NAMES webgpu_dawn PATHS "${DAWN_INSTALL_PREFIX}/src/dawn/native/${CMAKE_BUILD_TYPE}" + REQUIRED ) else() find_library(WEBGPU_DAWN_MONOLITHIC NAMES webgpu_dawn PATHS "${DAWN_INSTALL_PREFIX}/src/dawn/native" + REQUIRED ) endif() From ebd5bcf1b7f3dc7d9f888039adba36cbf6e39b4f Mon Sep 17 00:00:00 2001 From: MichealReed Date: Tue, 11 Feb 2025 17:22:24 -0600 Subject: [PATCH 06/30] use hints for MSVC --- cmake/gpu.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/gpu.cmake b/cmake/gpu.cmake index b687e83..52a348b 100644 --- a/cmake/gpu.cmake +++ b/cmake/gpu.cmake @@ -100,7 +100,7 @@ target_include_directories(gpu PUBLIC "${PROJECT_ROOT}/third_party/headers") if(MSVC) find_library(WEBGPU_DAWN_MONOLITHIC NAMES webgpu_dawn - PATHS "${DAWN_INSTALL_PREFIX}/src/dawn/native/${CMAKE_BUILD_TYPE}" + HINTS "${DAWN_INSTALL_PREFIX}/src/dawn/native/${CMAKE_BUILD_TYPE}" REQUIRED ) else() From d1c0b81a529f49c9bddb2e27021ce3624824c32e Mon Sep 17 00:00:00 2001 From: MichealReed Date: Sun, 16 Feb 2025 18:24:49 -0600 Subject: [PATCH 07/30] adds emscripten support --- .gitignore | 2 + CMakeLists.txt | 1 + cmake/dawn.cmake | 149 ++++++++++++++++++++++++++++++ cmake/example.cmake | 94 ++++++++++++++----- cmake/gpu.cmake | 89 ++---------------- cmake/templates/index.html.in | 22 +++++ examples/shadertui/CMakeLists.txt | 1 + gpu.hpp | 6 +- numeric_types/half.cpp | 2 +- 9 files changed, 260 insertions(+), 106 deletions(-) create mode 100644 cmake/dawn.cmake create mode 100644 cmake/templates/index.html.in diff --git a/.gitignore b/.gitignore index c7f60c3..4dc9cf7 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,7 @@ build/* # any build subdirectory in the tree **/build/ +**/build_web/ examples/hello_gpu/build/* examples/raymarch/build/* docs/html @@ -8,6 +9,7 @@ source .DS_Store third_party/lib/* third_party/local/* +third_party/dawn/* # formatter files .cmake-format.py diff --git a/CMakeLists.txt b/CMakeLists.txt index e8e569a..816cdf3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -20,6 +20,7 @@ if(DEBUG) set(CMAKE_CXX_FLAGS "-O0 
-g") endif() +include("${CMAKE_CURRENT_SOURCE_DIR}/cmake/dawn.cmake") include("${CMAKE_CURRENT_SOURCE_DIR}/cmake/gpu.cmake") add_library(gpud SHARED gpu.hpp) diff --git a/cmake/dawn.cmake b/cmake/dawn.cmake new file mode 100644 index 0000000..f7ab748 --- /dev/null +++ b/cmake/dawn.cmake @@ -0,0 +1,149 @@ +# Setup directories +set(FETCHCONTENT_BASE_DIR "${PROJECT_ROOT}/third_party") +set(DAWN_DIR "${FETCHCONTENT_BASE_DIR}/dawn" CACHE INTERNAL "") +set(DAWN_BUILD_DIR "${DAWN_DIR}/build" CACHE INTERNAL "") + +if(EMSCRIPTEN) + set(EM_SDK_DIR $ENV{EMSDK} CACHE INTERNAL "") + set(DAWN_BUILD_DIR "${DAWN_DIR}/build_web" CACHE INTERNAL "") +endif() + +function(find_dawn_library) + if(MSVC) + find_library(WEBGPU_DAWN_DEBUG webgpu_dawn + NAMES webgpu_dawn + HINTS "${DAWN_BUILD_DIR}/src/dawn/native/Debug" + ) + find_library(WEBGPU_DAWN_RELEASE webgpu_dawn + NAMES webgpu_dawn + HINTS "${DAWN_BUILD_DIR}/src/dawn/native/Release" + ) + elseif(NOT EMSCRIPTEN AND NOT MSVC) + find_library(WEBGPU_DAWN_LIB + NAMES webgpu_dawn + PATHS "${DAWN_BUILD_DIR}/src/dawn/native" + REQUIRED + ) + endif() + + # Set result variables in parent scope + set(DAWN_BUILD_FOUND ON PARENT_SCOPE) + if(MSVC) + set(WEBGPU_DAWN_DEBUG ${WEBGPU_DAWN_DEBUG} PARENT_SCOPE) + set(WEBGPU_DAWN_RELEASE ${WEBGPU_DAWN_RELEASE} PARENT_SCOPE) + else() + set(WEBGPU_DAWN_LIB ${WEBGPU_DAWN_LIB} PARENT_SCOPE) + endif() +endfunction() + +# Enable find for no dawn rebuilds with flutter run +set(ENABLE_DAWN_FIND OFF CACHE BOOL "Enable finding Dawn" FORCE) +set(DAWN_BUILD_FOUND OFF CACHE BOOL "Dawn build found" FORCE) +if(ENABLE_DAWN_FIND) + # find_library, windows adds extra folder + if(MSVC) + find_library(WEBGPU_DAWN_DEBUG webgpu_dawn + NAMES webgpu_dawn + HINTS "${DAWN_BUILD_DIR}/src/dawn/native/Debug" + ) + find_library(WEBGPU_DAWN_RELEASE webgpu_dawn + NAMES webgpu_dawn + HINTS "${DAWN_BUILD_DIR}/src/dawn/native/Release" + ) + set(DAWN_BUILD_FOUND ON) + elseif(NOT EMSCRIPTEN AND NOT MSVC) + find_library(WEBGPU_DAWN_LIB + NAMES webgpu_dawn + PATHS "${DAWN_BUILD_DIR}/src/dawn/native" + REQUIRED + ) + set(DAWN_BUILD_FOUND ON) + else() + set(DAWN_BUILD_FOUND ON) + endif() +endif() + +# Dawn options for more, +# see https://dawn.googlesource.com/dawn/+/refs/heads/main/CMakeLists.txt +set(DAWN_ALWAYS_ASSERT OFF CACHE INTERNAL "Always assert in Dawn" FORCE) +set(DAWN_BUILD_MONOLITHIC_LIBRARY ON CACHE INTERNAL "Build Dawn monolithically" FORCE) +set(DAWN_BUILD_EXAMPLES OFF CACHE INTERNAL "Build Dawn examples" FORCE) +set(DAWN_BUILD_SAMPLES OFF CACHE INTERNAL "Build Dawn samples" FORCE) +set(DAWN_BUILD_TESTS OFF CACHE INTERNAL "Build Dawn tests" FORCE) +set(DAWN_ENABLE_INSTALL OFF CACHE INTERNAL "Enable Dawn installation" FORCE) +set(DAWN_FETCH_DEPENDENCIES ON CACHE INTERNAL "Fetch Dawn dependencies" FORCE) +set(TINT_BUILD_TESTS OFF CACHE INTERNAL "Build Tint Tests" FORCE) +set(TINT_BUILD_IR_BINARY OFF CACHE INTERNAL "Build Tint IR binary" FORCE) +set(TINT_BUILD_CMD_TOOLS OFF CACHE INTERNAL "Build Tint command line tools" FORCE) + +if(NOT DAWN_BUILD_FOUND) + include(FetchContent) + message("webgpu_dawn not found start building") + if(EMSCRIPTEN) + set(EMSCRIPTEN_DIR "${EM_SDK_DIR}/upstream/emscripten" CACHE INTERNAL "" FORCE) + set(DAWN_EMSCRIPTEN_TOOLCHAIN ${EMSCRIPTEN_DIR} CACHE INTERNAL "" FORCE) + endif() + + FetchContent_Declare( + dawn + DOWNLOAD_DIR ${DAWN_DIR} + SOURCE_DIR ${DAWN_DIR} + SUBBUILD_DIR ${DAWN_BUILD_DIR}/tmp + BINARY_DIR ${DAWN_BUILD_DIR} + DOWNLOAD_COMMAND + cd ${DAWN_DIR} && + git init && + git fetch --depth=1 
https://dawn.googlesource.com/dawn && + git reset --hard FETCH_HEAD + ) + + # Download the repository and add it as a subdirectory. + FetchContent_MakeAvailable(dawn) + + # attempt fix flutter rebuilds + set(CMAKE_INCLUDE_PATH "${CMAKE_INCLUDE_PATH};${DAWN_DIR}/src" CACHE INTERNAL "") + + execute_process( + WORKING_DIRECTORY ${DAWN_DIR} + COMMAND ${CMAKE_COMMAND} -S ${DAWN_DIR} + -B ${DAWN_BUILD_DIR} + ) + + # Build Dawn + execute_process( + COMMAND ${CMAKE_COMMAND} --build ${DAWN_BUILD_DIR} + ) + + # find_library, windows adds extra folder + if(MSVC) + find_library(WEBGPU_DAWN_DEBUG webgpu_dawn + NAMES webgpu_dawn + HINTS "${DAWN_BUILD_DIR}/src/dawn/native/Debug" + ) + find_library(WEBGPU_DAWN_RELEASE webgpu_dawn + NAMES webgpu_dawn + HINTS "${DAWN_BUILD_DIR}/src/dawn/native/Release" + ) + set(DAWN_BUILD_FOUND ON) + elseif(NOT EMSCRIPTEN AND NOT MSVC) + find_library(WEBGPU_DAWN_LIB + NAMES webgpu_dawn + PATHS "${DAWN_BUILD_DIR}/src/dawn/native" + REQUIRED + ) + set(DAWN_BUILD_FOUND ON) + else() + set(DAWN_BUILD_FOUND ON) + endif() +endif() + +if(EMSCRIPTEN) + add_library(webgpu_dawn INTERFACE IMPORTED) + target_include_directories(webgpu_dawn INTERFACE ${DAWN_BUILD_DIR}/gen/src/emdawnwebgpu/include) + target_include_directories(webgpu_dawn INTERFACE ${DAWN_BUILD_DIR}/gen/src/emdawnwebgpu/include/webgpu/webgpu.h) + target_link_libraries(webgpu_dawn INTERFACE ${DAWN_BUILD_DIR}/gen/src/emdawnwebgpu/library_webgpu_enum_tables.js) + target_link_libraries(webgpu_dawn INTERFACE ${DAWN_BUILD_DIR}/gen/src/emdawnwebgpu/library_webgpu_generated_struct_info.js) + target_link_libraries(webgpu_dawn INTERFACE ${DAWN_BUILD_DIR}/gen/src/emdawnwebgpu/library_webgpu_generated_sig_info.js) + target_link_libraries(webgpu_dawn INTERFACE ${DAWN_DIR}/third_party/emdawnwebgpu/library_webgpu.js) +else() +endif() \ No newline at end of file diff --git a/cmake/example.cmake b/cmake/example.cmake index d92c204..99578fd 100644 --- a/cmake/example.cmake +++ b/cmake/example.cmake @@ -1,9 +1,14 @@ # Getting Started with CMAKE -# Each example includes this and sets PROJECT_NAME -# cd examples/hello_world -# cmake -S . build/ -DCMAKE_BUILD_TYPE=Release -# cmake --build build/ --config Release -# ./build/hello_world +# Each example includes this and sets PROJECT_NAME. +# +# Example usage: +# cd examples/hello_world +# cmake -S . build/ -DCMAKE_BUILD_TYPE=Release +# cmake --build build/ --config Release +# ./build/hello_world (or serve the output .js/.wasm for Emscripten) +# or for emscripten +# emcmake cmake -S . 
-B ./build_web -DCMAKE_BUILD_TYPE=Release +# cmake --build build_web --config Release if(NOT MSVC) set(CMAKE_CXX_STANDARD 17) @@ -11,35 +16,82 @@ else() set(CMAKE_CXX_STANDARD 20) endif() -# Path finding logic to find our root recipes from nested folders +# Locate the project root (two levels up from the current source dir) get_filename_component(PROJECT_ROOT ${CMAKE_CURRENT_SOURCE_DIR} DIRECTORY) get_filename_component(PROJECT_ROOT ${PROJECT_ROOT} DIRECTORY) -# Ensure the build type is set -if(NOT CMAKE_BUILD_TYPE) - set(CMAKE_BUILD_TYPE - Release - CACHE STRING "Choose the type of build: Debug or Release" FORCE) -endif() - -# Include the gpu.cpp + Dawn library +# Include external libraries and helper scripts (dawn and gpu) +include("${PROJECT_ROOT}/cmake/dawn.cmake") include("${PROJECT_ROOT}/cmake/gpu.cmake") # Create the executable add_executable(${PROJECT_NAME} run.cpp) -# Link gpu + dawn library +# Platform-specific linking & build settings +if(EMSCRIPTEN) + # Emscripten-specific configuration + + # Define a web output directory (adjust as needed) + set(WEB_OUTPUT_DIR "${CMAKE_CURRENT_BINARY_DIR}/web_build") + + # If necessary, include the generated WebGPU include dirs first. + include_directories(BEFORE "${DAWN_BUILD_DIR}/gen/src/emdawnwebgpu/include/") + + # Create a helper library for WebGPU support. + add_library(webgpu_web "${DAWN_DIR}/third_party/emdawnwebgpu/webgpu.cpp") + target_link_libraries(${PROJECT_NAME} PRIVATE webgpu_web) + + # Set Emscripten-specific link flags that enable WASM output and expose certain symbols. + # Needed to use updated version, emdawnwebgpu + set_target_properties(${PROJECT_NAME} PROPERTIES LINK_FLAGS "\ + -sUSE_WEBGPU=0 \ + -sWASM=1 \ + -DDAWN_EMSCRIPTEN_TOOLCHAIN=${EMSCRIPTEN_DIR} \ + -sEXPORTED_FUNCTIONS=_main,_malloc,_free,_memcpy \ + -sEXPORTED_RUNTIME_METHODS=ccall \ + -sUSE_GLFW=3 \ + -sALLOW_MEMORY_GROWTH=1 -sSTACK_SIZE=10000000 \ + -sASYNCIFY \ + --js-library=${DAWN_BUILD_DIR}/gen/src/emdawnwebgpu/library_webgpu_enum_tables.js \ + --js-library=${DAWN_BUILD_DIR}/gen/src/emdawnwebgpu/library_webgpu_generated_struct_info.js \ + --js-library=${DAWN_BUILD_DIR}/gen/src/emdawnwebgpu/library_webgpu_generated_sig_info.js \ + --js-library=${DAWN_DIR}/third_party/emdawnwebgpu/library_webgpu.js \ + --closure-args=--externs=${EMSCRIPTEN_DIR}/src/closure-externs/webgpu-externs.js \ + -O3 \ + ") + +else() + # Non-Emscripten (desktop) linking + if(MSVC) + target_link_libraries(gpu + PRIVATE + $<$:${WEBGPU_DAWN_DEBUG}> + $<$:${WEBGPU_DAWN_RELEASE}> + ) + else() + target_link_libraries(gpu PRIVATE webgpu_dawn) + endif() +endif() + +# Link the gpu/dawn library to the executable. target_link_libraries(${PROJECT_NAME} PRIVATE gpu) -# Certain platforms need to copy the library files to the build directory +# Platform-specific post-build actions (e.g. copying DLLs for MSVC) if(MSVC) - # Copy webgpu_dawn.dll to the build directory - # CMake multigenerators like MSVC need --config Release on - # the cmake --build command or they will output to /Debug add_custom_command( TARGET ${PROJECT_NAME} POST_BUILD COMMAND ${CMAKE_COMMAND} -E copy - ${DAWN_INSTALL_PREFIX}/${CMAKE_BUILD_TYPE}/webgpu_dawn.dll - $) + ${DAWN_BUILD_DIR}/$/webgpu_dawn.dll + $ + COMMENT "Copying webgpu_dawn.dll to the build directory" + ) endif() +if(EMSCRIPTEN) + + # Configure the HTML file by replacing @PROJECT_NAME@ with the actual target name. 
+ configure_file(${PROJECT_ROOT}cmake/templates/index.html.in + ${CMAKE_CURRENT_BINARY_DIR}/index.html + @ONLY) + +endif() \ No newline at end of file diff --git a/cmake/gpu.cmake b/cmake/gpu.cmake index 52a348b..6cce9e6 100644 --- a/cmake/gpu.cmake +++ b/cmake/gpu.cmake @@ -11,68 +11,11 @@ endif() message(STATUS "PROJECT_ROOT: ${PROJECT_ROOT}") - -include(FetchContent) - -set(FETCHCONTENT_BASE_DIR "${PROJECT_ROOT}/third_party/fetchcontent/_deps") -set(DAWN_INSTALL_PREFIX "${FETCHCONTENT_BASE_DIR}/dawn-build/out/${CMAKE_BUILD_TYPE}" CACHE INTERNAL "Dawn install location" FORCE) - - -# Before fetching, set configuration options for Dawn. -set(DCMAKE_INSTALL_PREFIX ${DAWN_INSTALL_PREFIX} CACHE INTERNAL "Dawn install location" FORCE) - -# Dawn options for more, -# see https://dawn.googlesource.com/dawn/+/refs/heads/main/CMakeLists.txt -set(DAWN_ALWAYS_ASSERT OFF CACHE INTERNAL "Always assert in Dawn" FORCE) -set(DAWN_BUILD_MONOLITHIC_LIBRARY ON CACHE INTERNAL "Build Dawn monolithically" FORCE) -set(DAWN_BUILD_EXAMPLES OFF CACHE INTERNAL "Build Dawn examples" FORCE) -set(DAWN_BUILD_SAMPLES OFF CACHE INTERNAL "Build Dawn samples" FORCE) -set(DAWN_BUILD_TESTS OFF CACHE INTERNAL "Build Dawn tests" FORCE) -set(DAWN_ENABLE_INSTALL ON CACHE INTERNAL "Enable Dawn installation" FORCE) -set(DAWN_FETCH_DEPENDENCIES ON CACHE INTERNAL "Fetch Dawn dependencies" FORCE) - -set(TINT_BUILD_TESTS OFF CACHE INTERNAL "Build Tint Tests" FORCE) -set(TINT_BUILD_IR_BINARY OFF CACHE INTERNAL "Build Tint IR binary" FORCE) -set(TINT_BUILD_CMD_TOOLS OFF CACHE INTERNAL "Build Tint command line tools" FORCE) - -set(BUILD_SHARED_LIBS OFF CACHE INTERNAL "Build shared libraries" FORCE) - - -# Fetch Setup -# Add a commit hash to pin the version of Dawn. -# git fetch --depth=1 url -FetchContent_Declare( - dawn - DOWNLOAD_COMMAND - cd ${FETCHCONTENT_BASE_DIR}/dawn-src && - git init && - git fetch --depth=1 https://dawn.googlesource.com/dawn && - git reset --hard FETCH_HEAD -) - - -# Download the repository and add it as a subdirectory. -FetchContent_MakeAvailable(dawn) - - -# Since we require Dawn to be built before linking against it, we need to configure it now. 
-execute_process( - COMMAND ${CMAKE_COMMAND} ${FETCHCONTENT_BASE_DIR}/dawn-src - -B ${DAWN_INSTALL_PREFIX} - -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} - -G "${CMAKE_GENERATOR}" -) - -# Build Dawn -execute_process( - WORKING_DIRECTORY ${FETCHCONTENT_BASE_DIR}/dawn-src - COMMAND ${CMAKE_COMMAND} --build ${DAWN_INSTALL_PREFIX} --config ${CMAKE_BUILD_TYPE} -) - # Add sources set(GPU_SOURCES "${PROJECT_ROOT}/gpu.cpp" "${PROJECT_ROOT}/numeric_types/half.cpp" + "${DAWN_BUILD_DIR}/gen/include/dawn/webgpu.h" ) # Add headers @@ -81,35 +24,17 @@ set(GPU_HEADERS "${PROJECT_ROOT}/utils/logging.hpp" "${PROJECT_ROOT}/utils/array_utils.hpp" "${PROJECT_ROOT}/numeric_types/half.hpp" + ) -# Emscripten includes a header automatically -if(EMSCRIPTEN) - file(REMOVE "${PROJECT_ROOT}/webgpu/webgpu.h") -else() - list(APPEND GPU_HEADERS "${PROJECT_ROOT}/third_party/headers/webgpu/webgpu.h") -endif() - - # Create the STATIC library for gpu add_library(gpu STATIC ${GPU_SOURCES} ${GPU_HEADERS}) +set_target_properties(gpu PROPERTIES LINKER_LANGUAGE CXX) target_include_directories(gpu PUBLIC "${PROJECT_ROOT}") -target_include_directories(gpu PUBLIC "${PROJECT_ROOT}/third_party/headers") - -# find_library, windows adds extra folder -if(MSVC) - find_library(WEBGPU_DAWN_MONOLITHIC - NAMES webgpu_dawn - HINTS "${DAWN_INSTALL_PREFIX}/src/dawn/native/${CMAKE_BUILD_TYPE}" - REQUIRED - ) +if(NOT EMSCRIPTEN) + target_include_directories(gpu PUBLIC "${DAWN_BUILD_DIR}/gen/include/dawn/") else() - find_library(WEBGPU_DAWN_MONOLITHIC - NAMES webgpu_dawn - PATHS "${DAWN_INSTALL_PREFIX}/src/dawn/native" - REQUIRED - ) + target_include_directories(gpu PUBLIC "${DAWN_BUILD_DIR}/gen/src/emdawnwebgpu/include/") + target_include_directories(gpu PUBLIC "${DAWN_BUILD_DIR}/gen/src/emdawnwebgpu/include/webgpu/") endif() -# Link the monolithic library -target_link_libraries(gpu PRIVATE ${WEBGPU_DAWN_MONOLITHIC}) diff --git a/cmake/templates/index.html.in b/cmake/templates/index.html.in new file mode 100644 index 0000000..1bd64ca --- /dev/null +++ b/cmake/templates/index.html.in @@ -0,0 +1,22 @@ + + + + + @PROJECT_NAME@ + + + + + + + \ No newline at end of file diff --git a/examples/shadertui/CMakeLists.txt b/examples/shadertui/CMakeLists.txt index 0938023..b728fc8 100644 --- a/examples/shadertui/CMakeLists.txt +++ b/examples/shadertui/CMakeLists.txt @@ -1,3 +1,4 @@ +# Not working yet needs update with libs for emscripten cmake_minimum_required(VERSION 3.28) project(shadertui) diff --git a/gpu.hpp b/gpu.hpp index 5327fe7..edc8b38 100644 --- a/gpu.hpp +++ b/gpu.hpp @@ -15,7 +15,7 @@ #include // std::pair #include -#include "webgpu/webgpu.h" +#include "webgpu.h" #include "numeric_types/half.hpp" #include "utils/logging.hpp" @@ -910,6 +910,7 @@ inline Context createContext( // If the device was created, set up logging and fetch the queue if (devData.status == WGPURequestDeviceStatus_Success) { + #ifndef __EMSCRIPTEN__ WGPULoggingCallbackInfo loggingCallbackInfo { .nextInChain = nullptr, .callback = @@ -925,6 +926,7 @@ inline Context createContext( .userdata2 = nullptr }; wgpuDeviceSetLoggingCallback(ctx.device, loggingCallbackInfo); + #endif ctx.queue = wgpuDeviceGetQueue(ctx.device); } } @@ -1206,7 +1208,7 @@ inline void toCPU(Context &ctx, WGPUBuffer buffer, void *data, size_t size) { } wgpuQueueSubmit(ctx.queue, 1, &op.commandBuffer); wgpuCommandBufferRelease(op.commandBuffer); - CallbackData callbackData = {op.readbackBuffer, bufferSize, data, &op.promise, + CallbackData callbackData = {op.readbackBuffer, static_cast(bufferSize), data, 
&op.promise, &op.future}; WGPUQueueWorkDoneCallbackInfo workDoneCallbackInfo = { diff --git a/numeric_types/half.cpp b/numeric_types/half.cpp index e5bdaf0..fe5aab7 100644 --- a/numeric_types/half.cpp +++ b/numeric_types/half.cpp @@ -241,7 +241,7 @@ fn main( } } -int main() { +int testMain() { printf("\nHalf-precision float tests\n==========================\n"); printf("\nRegular values float round trips\n\n"); From 9247b79f3f31b87a19bc7dc0ae524608e8eea593 Mon Sep 17 00:00:00 2001 From: MichealReed Date: Sun, 16 Feb 2025 18:26:57 -0600 Subject: [PATCH 08/30] remove redundant find function --- cmake/dawn.cmake | 28 ---------------------------- 1 file changed, 28 deletions(-) diff --git a/cmake/dawn.cmake b/cmake/dawn.cmake index f7ab748..b9394d4 100644 --- a/cmake/dawn.cmake +++ b/cmake/dawn.cmake @@ -8,34 +8,6 @@ if(EMSCRIPTEN) set(DAWN_BUILD_DIR "${DAWN_DIR}/build_web" CACHE INTERNAL "") endif() -function(find_dawn_library) - if(MSVC) - find_library(WEBGPU_DAWN_DEBUG webgpu_dawn - NAMES webgpu_dawn - HINTS "${DAWN_BUILD_DIR}/src/dawn/native/Debug" - ) - find_library(WEBGPU_DAWN_RELEASE webgpu_dawn - NAMES webgpu_dawn - HINTS "${DAWN_BUILD_DIR}/src/dawn/native/Release" - ) - elseif(NOT EMSCRIPTEN AND NOT MSVC) - find_library(WEBGPU_DAWN_LIB - NAMES webgpu_dawn - PATHS "${DAWN_BUILD_DIR}/src/dawn/native" - REQUIRED - ) - endif() - - # Set result variables in parent scope - set(DAWN_BUILD_FOUND ON PARENT_SCOPE) - if(MSVC) - set(WEBGPU_DAWN_DEBUG ${WEBGPU_DAWN_DEBUG} PARENT_SCOPE) - set(WEBGPU_DAWN_RELEASE ${WEBGPU_DAWN_RELEASE} PARENT_SCOPE) - else() - set(WEBGPU_DAWN_LIB ${WEBGPU_DAWN_LIB} PARENT_SCOPE) - endif() -endfunction() - # Enable find for no dawn rebuilds with flutter run set(ENABLE_DAWN_FIND OFF CACHE BOOL "Enable finding Dawn" FORCE) set(DAWN_BUILD_FOUND OFF CACHE BOOL "Dawn build found" FORCE) From 3e59576f2b752a4f6255445d0569583d87f38d44 Mon Sep 17 00:00:00 2001 From: MichealReed Date: Sun, 16 Feb 2025 18:42:27 -0600 Subject: [PATCH 09/30] clean linker flags --- cmake/example.cmake | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cmake/example.cmake b/cmake/example.cmake index 99578fd..192358f 100644 --- a/cmake/example.cmake +++ b/cmake/example.cmake @@ -9,6 +9,7 @@ # or for emscripten # emcmake cmake -S . 
-B ./build_web -DCMAKE_BUILD_TYPE=Release # cmake --build build_web --config Release +# python3 -m http.server 8080 --d build_web if(NOT MSVC) set(CMAKE_CXX_STANDARD 17) @@ -50,14 +51,13 @@ if(EMSCRIPTEN) -sEXPORTED_FUNCTIONS=_main,_malloc,_free,_memcpy \ -sEXPORTED_RUNTIME_METHODS=ccall \ -sUSE_GLFW=3 \ - -sALLOW_MEMORY_GROWTH=1 -sSTACK_SIZE=10000000 \ + -sALLOW_MEMORY_GROWTH=1 \ -sASYNCIFY \ --js-library=${DAWN_BUILD_DIR}/gen/src/emdawnwebgpu/library_webgpu_enum_tables.js \ --js-library=${DAWN_BUILD_DIR}/gen/src/emdawnwebgpu/library_webgpu_generated_struct_info.js \ --js-library=${DAWN_BUILD_DIR}/gen/src/emdawnwebgpu/library_webgpu_generated_sig_info.js \ --js-library=${DAWN_DIR}/third_party/emdawnwebgpu/library_webgpu.js \ --closure-args=--externs=${EMSCRIPTEN_DIR}/src/closure-externs/webgpu-externs.js \ - -O3 \ ") else() From 0653a4b524e7bbb6e91d2fe02c827ee7782d5b65 Mon Sep 17 00:00:00 2001 From: MichealReed Date: Sun, 16 Feb 2025 18:47:51 -0600 Subject: [PATCH 10/30] needs large stack size --- cmake/example.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/example.cmake b/cmake/example.cmake index 192358f..8216077 100644 --- a/cmake/example.cmake +++ b/cmake/example.cmake @@ -51,7 +51,7 @@ if(EMSCRIPTEN) -sEXPORTED_FUNCTIONS=_main,_malloc,_free,_memcpy \ -sEXPORTED_RUNTIME_METHODS=ccall \ -sUSE_GLFW=3 \ - -sALLOW_MEMORY_GROWTH=1 \ + -sALLOW_MEMORY_GROWTH=1 -sSTACK_SIZE=10000000 \ -sASYNCIFY \ --js-library=${DAWN_BUILD_DIR}/gen/src/emdawnwebgpu/library_webgpu_enum_tables.js \ --js-library=${DAWN_BUILD_DIR}/gen/src/emdawnwebgpu/library_webgpu_generated_struct_info.js \ From 9f4059b7d1ae5a0eba4fa26772a42a73064a66bf Mon Sep 17 00:00:00 2001 From: MichealReed Date: Sun, 16 Feb 2025 18:50:13 -0600 Subject: [PATCH 11/30] use stack in MB instead --- cmake/example.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/example.cmake b/cmake/example.cmake index 8216077..6f195ec 100644 --- a/cmake/example.cmake +++ b/cmake/example.cmake @@ -51,7 +51,7 @@ if(EMSCRIPTEN) -sEXPORTED_FUNCTIONS=_main,_malloc,_free,_memcpy \ -sEXPORTED_RUNTIME_METHODS=ccall \ -sUSE_GLFW=3 \ - -sALLOW_MEMORY_GROWTH=1 -sSTACK_SIZE=10000000 \ + -sALLOW_MEMORY_GROWTH=1 -sSTACK_SIZE=5MB \ -sASYNCIFY \ --js-library=${DAWN_BUILD_DIR}/gen/src/emdawnwebgpu/library_webgpu_enum_tables.js \ --js-library=${DAWN_BUILD_DIR}/gen/src/emdawnwebgpu/library_webgpu_generated_struct_info.js \ From 78ae4685a15faf8f962d8f63a66f26fa86ed24ca Mon Sep 17 00:00:00 2001 From: MichealReed Date: Mon, 17 Feb 2025 17:30:11 -0600 Subject: [PATCH 12/30] must set DAWN_EMSCRIPTEN_TOOLCHAIN for build too --- cmake/dawn.cmake | 1 + 1 file changed, 1 insertion(+) diff --git a/cmake/dawn.cmake b/cmake/dawn.cmake index b9394d4..46d7403 100644 --- a/cmake/dawn.cmake +++ b/cmake/dawn.cmake @@ -6,6 +6,7 @@ set(DAWN_BUILD_DIR "${DAWN_DIR}/build" CACHE INTERNAL "") if(EMSCRIPTEN) set(EM_SDK_DIR $ENV{EMSDK} CACHE INTERNAL "") set(DAWN_BUILD_DIR "${DAWN_DIR}/build_web" CACHE INTERNAL "") + set(DAWN_EMSCRIPTEN_TOOLCHAIN ${EM_SDK_DIR}/upstream/emscripten CACHE INTERNAL "" FORCE) endif() # Enable find for no dawn rebuilds with flutter run From 6197322e8e48e1c761043d6ca6badc89724b95e3 Mon Sep 17 00:00:00 2001 From: MichealReed Date: Mon, 17 Feb 2025 22:59:26 -0600 Subject: [PATCH 13/30] EOF fixes --- cmake/dawn.cmake | 2 +- cmake/example.cmake | 2 +- cmake/templates/index.html.in | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/cmake/dawn.cmake b/cmake/dawn.cmake index 46d7403..2ead9ae 100644 --- 
a/cmake/dawn.cmake +++ b/cmake/dawn.cmake @@ -119,4 +119,4 @@ if(EMSCRIPTEN) target_link_libraries(webgpu_dawn INTERFACE ${DAWN_BUILD_DIR}/gen/src/emdawnwebgpu/library_webgpu_generated_sig_info.js) target_link_libraries(webgpu_dawn INTERFACE ${DAWN_DIR}/third_party/emdawnwebgpu/library_webgpu.js) else() -endif() \ No newline at end of file +endif() diff --git a/cmake/example.cmake b/cmake/example.cmake index 6f195ec..7cf1f8d 100644 --- a/cmake/example.cmake +++ b/cmake/example.cmake @@ -94,4 +94,4 @@ if(EMSCRIPTEN) ${CMAKE_CURRENT_BINARY_DIR}/index.html @ONLY) -endif() \ No newline at end of file +endif() diff --git a/cmake/templates/index.html.in b/cmake/templates/index.html.in index 1bd64ca..b6f130c 100644 --- a/cmake/templates/index.html.in +++ b/cmake/templates/index.html.in @@ -19,4 +19,4 @@ } - \ No newline at end of file + From 9ac780bef1c6813f43855f7d9d7d33a733876c45 Mon Sep 17 00:00:00 2001 From: MichealReed Date: Wed, 19 Feb 2025 16:30:50 -0600 Subject: [PATCH 14/30] refactors async --- cmake/templates/index.html.in | 2 +- examples/hello_world/run.cpp | 10 +- gpu.hpp | 838 ++++++++++++++++++++++------------ numeric_types/half.cpp | 15 +- 4 files changed, 550 insertions(+), 315 deletions(-) diff --git a/cmake/templates/index.html.in b/cmake/templates/index.html.in index b6f130c..6b5957b 100644 --- a/cmake/templates/index.html.in +++ b/cmake/templates/index.html.in @@ -12,7 +12,7 @@ if (typeof Module !== 'undefined') { Module.onRuntimeInitialized = function() { // Optionally, pass arguments to main in an array. - Module._main([]); + Module.ccall('main', 'number', [], [], { async: true }); }; } else { console.error('Module is undefined. Check that your generated JS file is loaded properly.'); diff --git a/examples/hello_world/run.cpp b/examples/hello_world/run.cpp index 7453869..06970a7 100644 --- a/examples/hello_world/run.cpp +++ b/examples/hello_world/run.cpp @@ -38,12 +38,14 @@ int main(int argc, char **argv) { Tensor output = createTensor(ctx, Shape{N}, kf32); std::promise promise; std::future future = promise.get_future(); - Kernel op = createKernel(ctx, {kGelu, 256, kf32}, + std::future kernelFuture = createKernel(ctx, {kGelu, 256, kf32}, Bindings{input, output}, {cdiv(N, 256), 1, 1}); - dispatchKernel(ctx, op, promise); - wait(ctx, future); - toCPU(ctx, output, outputArr.data(), sizeof(outputArr)); + Kernel op = waitForFuture(ctx.instance, kernelFuture); + std::future dispatchFuture = dispatchKernel(ctx, op); + waitForFuture(ctx.instance, dispatchFuture); + std::future cpuFuture = toCPU(ctx, output, outputArr.data(), sizeof(outputArr)); + waitForFuture(ctx.instance, cpuFuture); for (int i = 0; i < 12; ++i) { printf(" gelu(%.2f) = %.2f\n", inputArr[i], outputArr[i]); } diff --git a/gpu.hpp b/gpu.hpp index edc8b38..052c674 100644 --- a/gpu.hpp +++ b/gpu.hpp @@ -1,6 +1,7 @@ #ifndef GPU_HPP #define GPU_HPP +#include "webgpu.h" #include #include #include @@ -15,15 +16,15 @@ #include // std::pair #include -#include "webgpu.h" +#ifndef __EMSCRIPTEN__ -#include "numeric_types/half.hpp" -#include "utils/logging.hpp" - -#ifdef __EMSCRIPTEN__ +#else #include "emscripten/emscripten.h" #endif +#include "numeric_types/half.hpp" +#include "utils/logging.hpp" + #ifdef USE_DAWN_API #include "dawn/native/DawnNative.h" #endif @@ -430,8 +431,8 @@ struct CallbackData { WGPUBuffer buffer; // managed by owning Kernel size_t bufferSize; void *output; // non-owning, only for target memory in toCPU, not used for - // kernel invocations - std::promise *promise; + // kernel invocations + 
std::shared_ptr<std::promise<void>> promise;
   std::future<void> *future;
 };
 
@@ -530,32 +531,27 @@ struct Context {
   // Default constructor
   Context() = default;
 
-  Context(Context&& other) noexcept
-      : instance(other.instance),
-        adapter(other.adapter),
-        device(other.device),
+  Context(Context &&other) noexcept
+      : instance(other.instance), adapter(other.adapter), device(other.device),
         queue(other.queue),
         // Re‐initialize pools to point to *this*:
-        pool(this),
-        kernelPool(this),
-        adapterStatus(other.adapterStatus),
-        deviceStatus(other.deviceStatus)
-  {
+        pool(this), kernelPool(this), adapterStatus(other.adapterStatus),
+        deviceStatus(other.deviceStatus) {
     LOG(kDefLog, kTrace, "Moving Context ownership");
 
     // Move over the resources in the pools:
-    pool.data = std::move(other.pool.data);
+    pool.data = std::move(other.pool.data);
     kernelPool.data = std::move(other.kernelPool.data);
 
     // Null out handles in the source so its destructor won't release them.
     other.instance = nullptr;
-    other.adapter = nullptr;
-    other.device = nullptr;
-    other.queue = nullptr;
+    other.adapter = nullptr;
+    other.device = nullptr;
+    other.queue = nullptr;
 
     // other.adapterStatus = 0;
     // other.deviceStatus = 0;
   }
 
-  Context& operator=(Context&& other) noexcept {
+  Context &operator=(Context &&other) noexcept {
     if (this != &other) {
       // Free any existing resources. In most cases, this should be a no-op
       // since we typically shouldn't have two active initialized Context
@@ -625,7 +621,7 @@ inline Tensor createTensor(TensorPool &pool, WGPUDevice &device,
   size_t numElements = size(shape);
   size_t size = sizeBytes(dtype) * numElements;
   WGPUBufferDescriptor bufferDesc = {
-      .label = {.data = nullptr, .length = 0},
+      .label = {.data = nullptr, .length = 0},
       .usage = usage,
       .size = size,
   };
@@ -794,6 +790,162 @@ inline void check(bool condition, const char *message,
   }
 }
 
+/**
+ * @brief Pumps events until the provided future is ready.
+ *
+ * This helper template function continuously checks the status of the provided std::future
+ * until it becomes ready. On Emscripten builds, it yields control to the JavaScript event loop
+ * using emscripten_sleep to allow asynchronous callbacks to execute. On other platforms, it
+ * processes events from the given WGPUInstance using wgpuInstanceProcessEvents. Once the future
+ * is ready, its value is returned.
+ *
+ * @tparam T The type of the value contained in the future.
+ * @param instance The WGPUInstance used to process events.
+ * @param f The future to wait on.
+ * @return T The value retrieved from the ready future.
+ *
+ * @code
+ * std::future<WGPUDevice> deviceFuture = requestDeviceAsync(adapter, devDescriptor);
+ * WGPUDevice device = waitForFuture(instance, deviceFuture);
+ * @endcode
+ */
+template <typename T>
+T waitForFuture(WGPUInstance instance, std::future<T> &f) {
+#ifdef __EMSCRIPTEN__
+  // Poll until the future is ready.
+  while (f.wait_for(std::chrono::milliseconds(0)) !=
+         std::future_status::ready) {
+    // Yield control to the JS event loop.
+    emscripten_sleep(1);
+  }
+  return f.get();
+#else
+  while (f.wait_for(std::chrono::milliseconds(0)) !=
+         std::future_status::ready) {
+    wgpuInstanceProcessEvents(instance);
+  }
+  return f.get();
+#endif
+}
+
+// Context Callbacks & Helpers
+
+/**
+ * @brief Adapter callback function invoked upon completion of an asynchronous WebGPU adapter request.
+ *
+ * This callback is triggered when the request for a WebGPU adapter completes. It verifies whether
+ * the adapter was successfully obtained.
On failure, it logs an error message (in Emscripten builds) + * and sets an exception on the associated promise. On success, it sets the value of the promise with + * the obtained adapter. Finally, it frees the allocated memory for the promise pointer. + * + * @param status The status of the adapter request. Expected to be WGPURequestAdapterStatus_Success on success. + * @param adapter The WGPUAdapter obtained on a successful request. + * @param message A string view containing additional information about the adapter request. + * @param userdata1 A pointer to a heap-allocated std::shared_ptr>. + * @param userdata2 Unused. + */ +inline void adapterCallback(WGPURequestAdapterStatus status, + WGPUAdapter adapter, WGPUStringView message, + void *userdata1, void * /*userdata2*/) { + auto *promisePtr = + reinterpret_cast> *>(userdata1); + if (status != WGPURequestAdapterStatus_Success) { +#ifdef __EMSCRIPTEN__ + LOG(kDefLog, kError, "Could not get WebGPU adapter: %.*s", + static_cast(message.length), message.data); +#endif + (*promisePtr) + ->set_exception(std::make_exception_ptr( + std::runtime_error("Request WebGPU adapter failed"))); + } else { + (*promisePtr)->set_value(adapter); + } + delete promisePtr; +} + +/** + * @brief Callback function invoked upon completion of an asynchronous WebGPU device request. + * + * This callback is triggered when the request for a WebGPU device completes. It verifies that + * the device was successfully created. On success, the callback sets the value of the associated + * promise; otherwise, it sets an exception. After fulfilling the promise, it frees the allocated + * memory for the promise pointer. + * + * @param status The status of the device request. Expected to be WGPURequestDeviceStatus_Success on success. + * @param device The WGPUDevice obtained on successful request. + * @param message A string view containing additional information about the device request. + * @param userdata1 A pointer to a heap-allocated std::shared_ptr>. + * @param userdata2 Unused. + */ +inline void deviceCallback(WGPURequestDeviceStatus status, WGPUDevice device, + WGPUStringView message, void *userdata1, + void * /*userdata2*/) { + auto *promisePtr = + reinterpret_cast> *>(userdata1); + if (status != WGPURequestDeviceStatus_Success) { + (*promisePtr) + ->set_exception(std::make_exception_ptr( + std::runtime_error("Request WebGPU device failed"))); + } else { + LOG(kDefLog, kTrace, "Device Request succeeded %p", + static_cast(device)); + (*promisePtr)->set_value(device); + } + delete promisePtr; +} + +/** + * @brief Asynchronously requests a WebGPU adapter from the given instance. + * + * This helper function wraps the asynchronous call to request an adapter using the WebGPU API. + * It sets up a promise and registers an adapter callback, returning a future that will eventually + * hold the requested WGPUAdapter. + * + * @param instance The WGPUInstance from which to request the adapter. + * @param adapterOpts The options for requesting the adapter. + * @return std::future A future that will eventually hold the created WGPUAdapter. 
+ */ +inline std::future +requestAdapterAsync(WGPUInstance instance, + const WGPURequestAdapterOptions &adapterOpts) { + auto promise = std::make_shared>(); + auto *promisePtr = new std::shared_ptr>(promise); + + WGPURequestAdapterCallbackInfo callbackInfo{ + .mode = WGPUCallbackMode_AllowSpontaneous, + .callback = adapterCallback, + .userdata1 = promisePtr, + .userdata2 = nullptr}; + wgpuInstanceRequestAdapter(instance, &adapterOpts, callbackInfo); + return promise->get_future(); +} + +/** + * @brief Asynchronously requests a WebGPU device from a given adapter. + * + * This helper function wraps the asynchronous call to request a device using the WebGPU API. + * It sets up a promise and registers a device callback, returning a future that will be fulfilled + * once the device is available. + * + * @param adapter The WGPUAdapter to request the device from. + * @param devDescriptor The descriptor specifying the characteristics of the requested device. + * @return std::future A future that will eventually hold the created WGPUDevice. + */ +inline std::future +requestDeviceAsync(WGPUAdapter adapter, + const WGPUDeviceDescriptor &devDescriptor) { + auto promise = std::make_shared>(); + auto *promisePtr = new std::shared_ptr>(promise); + + WGPURequestDeviceCallbackInfo deviceCallbackInfo{ + .mode = WGPUCallbackMode_AllowSpontaneous, + .callback = deviceCallback, + .userdata1 = promisePtr, + .userdata2 = nullptr}; + wgpuAdapterRequestDevice(adapter, &devDescriptor, deviceCallbackInfo); + return promise->get_future(); +} + /** * @brief Factory function to create a GPU context, which aggregates WebGPU API * handles to interact with the GPU including the instance, adapter, device, and @@ -812,12 +964,10 @@ inline void check(bool condition, const char *message, * @return Context instance representing the created GPU context * */ -inline Context createContext( - const WGPUInstanceDescriptor &desc = {}, - const WGPURequestAdapterOptions &adapterOpts = {}, - const WGPUDeviceDescriptor &devDescriptor = {}) -{ - Context ctx; // stack-allocated +inline Context createContext(const WGPUInstanceDescriptor &desc = {}, + const WGPURequestAdapterOptions &adapterOpts = {}, + const WGPUDeviceDescriptor &devDescriptor = {}) { + Context ctx; // Stack-allocated Context. #ifdef __EMSCRIPTEN__ ctx.instance = wgpuCreateInstance(nullptr); @@ -826,115 +976,50 @@ inline Context createContext( #endif check(ctx.instance, "Initialize WebGPU", __FILE__, __LINE__); + // Request the adapter asynchronously. 
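Failures in these request helpers are reported by storing an exception in the promise, so they resurface at the call site when waitForFuture() calls get(). A minimal handling sketch, assuming an already-created WGPUInstance named instance (illustrative only):

    try {
      auto adapterFuture = requestAdapterAsync(instance, WGPURequestAdapterOptions{});
      WGPUAdapter adapter = waitForFuture(instance, adapterFuture); // rethrows on failure
      // ... continue with requestDeviceAsync(adapter, ...) ...
    } catch (const std::exception &ex) {
      LOG(kDefLog, kError, "Adapter request failed: %s", ex.what());
    }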
LOG(kDefLog, kTrace, "Requesting adapter"); - { - struct AdapterData { - WGPUAdapter adapter = nullptr; - bool requestEnded = false; - WGPURequestAdapterStatus status; - }; - AdapterData adapterData; - - auto onAdapterRequestEnded = [](WGPURequestAdapterStatus status, - WGPUAdapter adapter, - WGPUStringView message, - void *pUserData, void *) { - auto &ad = *reinterpret_cast(pUserData); - ad.status = status; -#ifdef __EMSCRIPTEN__ - if (status != WGPURequestAdapterStatus_Success) { - LOG(kDefLog, kError, "Could not get WebGPU adapter: %.*s", - static_cast(message.length), message.data); - } -#endif - check(status == WGPURequestAdapterStatus_Success, - "Request WebGPU adapter", __FILE__, __LINE__); - ad.adapter = adapter; - ad.requestEnded = true; - }; - - WGPURequestAdapterCallbackInfo callbackInfo { - .mode = WGPUCallbackMode_AllowSpontaneous, - .callback = onAdapterRequestEnded, - .userdata1 = &adapterData, - .userdata2 = nullptr - }; - wgpuInstanceRequestAdapter(ctx.instance, &adapterOpts, callbackInfo); - - while (!adapterData.requestEnded) { - processEvents(ctx.instance); - } - ctx.adapter = adapterData.adapter; - ctx.adapterStatus = adapterData.status; + try { + auto adapterFuture = requestAdapterAsync(ctx.instance, adapterOpts); + // Pump events until the adapter future is ready. + ctx.adapter = waitForFuture(ctx.instance, adapterFuture); + ctx.adapterStatus = WGPURequestAdapterStatus_Success; + } catch (const std::exception &ex) { + check(false, ex.what(), __FILE__, __LINE__); } + // Request the device asynchronously. LOG(kDefLog, kTrace, "Requesting device"); - { - struct DeviceData { - WGPUDevice device = nullptr; - bool requestEnded = false; - WGPURequestDeviceStatus status; - }; - DeviceData devData; - - auto onDeviceRequestEnded = [](WGPURequestDeviceStatus status, - WGPUDevice device, - WGPUStringView message, - void *pUserData, void *) { - auto &dd = *reinterpret_cast(pUserData); - dd.status = status; - check(status == WGPURequestDeviceStatus_Success, - "Could not get WebGPU device.", __FILE__, __LINE__); - LOG(kDefLog, kTrace, "Device Request succeeded %p", - static_cast(device)); - dd.device = device; - dd.requestEnded= true; - }; - - WGPURequestDeviceCallbackInfo deviceCallbackInfo { - .mode = WGPUCallbackMode_AllowSpontaneous, - .callback = onDeviceRequestEnded, - .userdata1= &devData, - .userdata2= nullptr - }; - wgpuAdapterRequestDevice(ctx.adapter, &devDescriptor, deviceCallbackInfo); - - LOG(kDefLog, kTrace, "Waiting for device request to end"); - while (!devData.requestEnded) { - processEvents(ctx.instance); - } + try { + auto deviceFuture = requestDeviceAsync(ctx.adapter, devDescriptor); + // Pump events until the device future is ready. + ctx.device = waitForFuture(ctx.instance, deviceFuture); + ctx.deviceStatus = WGPURequestDeviceStatus_Success; LOG(kDefLog, kTrace, "Device request ended"); - ctx.device = devData.device; - ctx.deviceStatus = devData.status; - - // If the device was created, set up logging and fetch the queue - if (devData.status == WGPURequestDeviceStatus_Success) { - #ifndef __EMSCRIPTEN__ - WGPULoggingCallbackInfo loggingCallbackInfo { + // If the device was created, set up logging and fetch the queue. 
+#ifndef __EMSCRIPTEN__ + WGPULoggingCallbackInfo loggingCallbackInfo{ .nextInChain = nullptr, .callback = - [](WGPULoggingType type, WGPUStringView message, - void *, void *) { - LOG(kDefLog, kError, "Device logging callback: %.*s", - static_cast(message.length), message.data); - if (type == WGPULoggingType_Error) { - throw std::runtime_error("Device error logged."); - } - }, + [](WGPULoggingType type, WGPUStringView message, void *, void *) { + LOG(kDefLog, kError, "Device logging callback: %.*s", + static_cast(message.length), message.data); + if (type == WGPULoggingType_Error) { + throw std::runtime_error("Device error logged."); + } + }, .userdata1 = nullptr, - .userdata2 = nullptr - }; - wgpuDeviceSetLoggingCallback(ctx.device, loggingCallbackInfo); - #endif - ctx.queue = wgpuDeviceGetQueue(ctx.device); - } + .userdata2 = nullptr}; + wgpuDeviceSetLoggingCallback(ctx.device, loggingCallbackInfo); +#endif + ctx.queue = wgpuDeviceGetQueue(ctx.device); + } catch (const std::exception &ex) { + check(false, ex.what(), __FILE__, __LINE__); } return std::move(ctx); } - #ifdef USE_DAWN_API /** * @brief Factory function to create a GPU context, which aggregates WebGPU API @@ -1066,11 +1151,76 @@ createContextByGpuIdx(int gpuIdx, const WGPUInstanceDescriptor &desc = {}, } #endif -inline void wait(Context &ctx, std::future &future) { - while (future.wait_for(std::chrono::seconds(0)) != - std::future_status::ready) { - processEvents(ctx.instance); - } +/** + * @brief Callback function invoked upon completion of an asynchronous GPU buffer mapping. + * + * This callback is triggered when the GPU buffer mapping for a readback buffer is completed. + * It verifies that the mapping operation was successful, retrieves the mapped memory, + * copies the data from the GPU buffer to a CPU memory region, unmaps the buffer, + * signals the completion by fulfilling the associated promise, and cleans up the allocated callback data. + * + * @param status The mapping status. Expected to be WGPUMapAsyncStatus_Success on success. + * @param message A string view containing additional information about the mapping operation. + * @param userdata1 A pointer to a heap-allocated CallbackData structure containing the GPU buffer, + * buffer size, destination CPU memory pointer, and a promise for signaling completion. + * @param userdata2 Unused. + */ +inline void bufferMapCallback(WGPUMapAsyncStatus status, WGPUStringView message, + void *userdata1, void * /*userdata2*/) { + CallbackData *cbData = reinterpret_cast(userdata1); + // Check that mapping succeeded. + check(status == WGPUMapAsyncStatus_Success, "Map readbackBuffer", __FILE__, + __LINE__); + + // Get the mapped memory. + const void *mappedData = + wgpuBufferGetConstMappedRange(cbData->buffer, 0, cbData->bufferSize); + check(mappedData, "Get mapped range", __FILE__, __LINE__); + + // Copy the data from the mapped GPU buffer to the CPU memory. + memcpy(cbData->output, mappedData, cbData->bufferSize); + + // Unmap the buffer. + wgpuBufferUnmap(cbData->buffer); + + // Signal that the copy has completed. + // Ensure you use the arrow operator on the shared_ptr to call set_value(). + cbData->promise->set_value(); + + // Clean up the dynamically allocated callback data. + delete cbData; +} + +/** + * @brief Callback function invoked when the GPU queue’s submitted work is complete. + * + * This callback is registered with the GPU queue after submitting work. 
When invoked, + * it verifies that all queued work completed successfully, and then sets up the buffer + * mapping callback to initiate the asynchronous mapping of a readback buffer. The readback + * buffer is mapped to access the processed data on the CPU. + * + * @param status The status of the completed work. Expected to be WGPUQueueWorkDoneStatus_Success on success. + * @param userdata1 A pointer to a heap-allocated CallbackData structure containing the readback buffer, + * buffer size, destination CPU memory pointer, and a promise to signal completion. + * @param userdata2 Unused. + */ +inline void queueWorkDoneCallback(WGPUQueueWorkDoneStatus status, + void *userdata1, void * /*userdata2*/) { + CallbackData *cbData = reinterpret_cast(userdata1); + // Ensure the queue work finished successfully. + check(status == WGPUQueueWorkDoneStatus_Success, "Queue work done", __FILE__, + __LINE__); + + // Set up the buffer mapping callback information. + WGPUBufferMapCallbackInfo mapCallbackInfo; + mapCallbackInfo.mode = WGPUCallbackMode_AllowSpontaneous; + mapCallbackInfo.callback = bufferMapCallback; + mapCallbackInfo.userdata1 = cbData; + mapCallbackInfo.userdata2 = nullptr; + + // Begin the asynchronous mapping of the readback buffer. + wgpuBufferMapAsync(cbData->buffer, WGPUMapMode_Read, 0, cbData->bufferSize, + mapCallbackInfo); } /** @@ -1085,45 +1235,35 @@ inline void wait(Context &ctx, std::future &future) { * toCPU(ctx, tensor, data, bufferSize); * @endcode */ -inline void toCPU(Context &ctx, Tensor &tensor, void *data, size_t bufferSize, - CopyData &op) { +inline std::future toCPU(Context &ctx, Tensor &tensor, void *data, + size_t bufferSize, CopyData &op) { + // Submit the command buffer and release it. wgpuQueueSubmit(ctx.queue, 1, &op.commandBuffer); wgpuCommandBufferRelease(op.commandBuffer); - CallbackData callbackData = {op.readbackBuffer, bufferSize, data, &op.promise, - &op.future}; - WGPUQueueWorkDoneCallbackInfo workDoneCallbackInfo = { - .mode = WGPUCallbackMode_AllowSpontaneous, - .callback = - [](WGPUQueueWorkDoneStatus status, void *userdata1, void *userdata2) { - check(status == WGPUQueueWorkDoneStatus_Success, "Queue work done", - __FILE__, __LINE__); - const auto *data = static_cast(userdata1); - WGPUBufferMapCallbackInfo mapCallbackInfo = { - .mode = WGPUCallbackMode_AllowSpontaneous, - .callback = - [](WGPUMapAsyncStatus status, WGPUStringView message, - void *userdata1, void *userdata2) { - const auto *data = static_cast(userdata1); - check(status == WGPUMapAsyncStatus_Success, - "Map readbackBuffer", __FILE__, __LINE__); - const void *mappedData = wgpuBufferGetConstMappedRange( - data->buffer, /*offset=*/0, data->bufferSize); - check(mappedData, "Get mapped range", __FILE__, __LINE__); - memcpy(data->output, mappedData, data->bufferSize); - wgpuBufferUnmap(data->buffer); - data->promise->set_value(); - }, - .userdata1 = const_cast(data), - .userdata2 = nullptr}; - wgpuBufferMapAsync(data->buffer, WGPUMapMode_Read, 0, - data->bufferSize, mapCallbackInfo); - }, - .userdata1 = &callbackData, - .userdata2 = nullptr}; + // Create a promise and get its future. + auto promise = std::make_shared>(); + + // Allocate callback data so it remains valid until the async + // chain finishes. + CallbackData *cbData = new CallbackData{ + op.readbackBuffer, // The GPU buffer to be read back. + bufferSize, + data, // CPU memory destination. + promise // The promise to be signaled. + }; + + // Set up the work-done callback to initiate the buffer mapping. 
+ WGPUQueueWorkDoneCallbackInfo workDoneCallbackInfo; + workDoneCallbackInfo.mode = WGPUCallbackMode_AllowSpontaneous; + workDoneCallbackInfo.callback = queueWorkDoneCallback; + workDoneCallbackInfo.userdata1 = cbData; // Pass the callback data. + workDoneCallbackInfo.userdata2 = nullptr; + + // Begin the asynchronous chain by registering the queue work-done callback. wgpuQueueOnSubmittedWorkDone(ctx.queue, workDoneCallbackInfo); - wait(ctx, op.future); + return promise->get_future(); } /** @@ -1141,31 +1281,59 @@ inline void toCPU(Context &ctx, Tensor &tensor, void *data, size_t bufferSize, * @param[in] bufferSize Size of the data buffer in bytes * @param[out] data Pointer to the CPU memory to copy the data to */ -inline void toCPU(Context &ctx, Tensor &tensor, void *data, size_t bufferSize) { - CopyData op; - op.future = op.promise.get_future(); - { - WGPUBufferDescriptor readbackBufferDescriptor = { - .label = {.data = nullptr, .length = 0}, - .usage = WGPUBufferUsage_CopyDst | WGPUBufferUsage_MapRead, - .size = bufferSize, - }; - op.readbackBuffer = - wgpuDeviceCreateBuffer(ctx.device, &readbackBufferDescriptor); - } - { - WGPUCommandEncoder commandEncoder; - commandEncoder = wgpuDeviceCreateCommandEncoder(ctx.device, nullptr); - wgpuCommandEncoderCopyBufferToBuffer(commandEncoder, tensor.data.buffer, 0, - op.readbackBuffer, 0, bufferSize); - op.commandBuffer = wgpuCommandEncoderFinish(commandEncoder, nullptr); - wgpuCommandEncoderRelease(commandEncoder); - check(op.commandBuffer, "Create command buffer", __FILE__, __LINE__); - } - toCPU(ctx, tensor, data, bufferSize, op); - if (op.readbackBuffer) { - wgpuBufferRelease(op.readbackBuffer); - } +inline std::future toCPU(Context &ctx, Tensor &tensor, void *data, + size_t bufferSize) { + // Create a promise that will later be satisfied when the async copy + // completes. + auto promise = std::make_shared>(); + + // Create a readback buffer that will be used for copying and mapping. + WGPUBufferDescriptor readbackBufferDescriptor = { + .label = {.data = nullptr, .length = 0}, + .usage = WGPUBufferUsage_CopyDst | WGPUBufferUsage_MapRead, + .size = bufferSize, + }; + WGPUBuffer readbackBuffer = + wgpuDeviceCreateBuffer(ctx.device, &readbackBufferDescriptor); + + // Create a command encoder and record a copy from the tensor GPU buffer + WGPUCommandEncoder commandEncoder = + wgpuDeviceCreateCommandEncoder(ctx.device, nullptr); + wgpuCommandEncoderCopyBufferToBuffer(commandEncoder, tensor.data.buffer, 0, + readbackBuffer, 0, bufferSize); + // Finish recording by creating a command buffer and release the encoder. + WGPUCommandBuffer commandBuffer = + wgpuCommandEncoderFinish(commandEncoder, nullptr); + wgpuCommandEncoderRelease(commandEncoder); + check(commandBuffer, "Create command buffer", __FILE__, __LINE__); + + // Submit the work to the queue and release the command buffer immediately. + wgpuQueueSubmit(ctx.queue, 1, &commandBuffer); + wgpuCommandBufferRelease(commandBuffer); + + // Allocate callback data + CallbackData *cbData = new CallbackData{ + readbackBuffer, // The readback buffer to map. + bufferSize, // The size of the copy. + data, // CPU memory destination. + promise // The promise to signal when done. + }; + + // Set up the work-done callback. When the queue’s submitted work is + // completed, it is routed to queueWorkDoneCallback which then starts the + // asynchronous map. 
+ WGPUQueueWorkDoneCallbackInfo workDoneCallbackInfo = { + .mode = WGPUCallbackMode_AllowSpontaneous, + .callback = queueWorkDoneCallback, + .userdata1 = cbData, + .userdata2 = nullptr, + }; + + // Register the callback. The async chain continues inside + // queueWorkDoneCallback. + wgpuQueueOnSubmittedWorkDone(ctx.queue, workDoneCallbackInfo); + + return promise->get_future(); } /** @@ -1176,76 +1344,74 @@ inline void toCPU(Context &ctx, Tensor &tensor, void *data, size_t bufferSize) { * @param[out] data Array of floats to copy the data to * * @code - * toCPU(ctx, tensor, data); + * std::future toCPUFuture = toCPU(ctx, tensor, data); + * WaitForFuture(ctx.instance, toCPUFuture); * @endcode */ template -void toCPU(Context &ctx, Tensor &tensor, std::array &data) { - toCPU(ctx, tensor, data.data(), sizeof(data)); +inline std::future toCPU(Context &ctx, Tensor &tensor, + std::array &data) { + return toCPU(ctx, tensor, data.data(), sizeof(data)); } -inline void toCPU(Context &ctx, WGPUBuffer buffer, void *data, size_t size) { +inline std::future toCPU(Context &ctx, WGPUBuffer buffer, void *data, + size_t size) { + // The size (in bytes) for the copy. uint64_t bufferSize = size; + + // Create an operation structure (here we reuse CopyData solely for its + // members that we need to create a readback buffer and command buffer). CopyData op; - op.future = op.promise.get_future(); + + // Create the promise that will be fulfilled once the copy is done. + auto promise = std::make_shared>(); + + // Create a readback buffer that we can map for reading. { WGPUBufferDescriptor readbackBufferDescriptor = { - .label = {.data = nullptr, .length = 0}, + .label = {.data = nullptr, .length = 0}, .usage = WGPUBufferUsage_CopyDst | WGPUBufferUsage_MapRead, .size = bufferSize, }; op.readbackBuffer = wgpuDeviceCreateBuffer(ctx.device, &readbackBufferDescriptor); } + + // Create a command encoder which copies from the provided buffer to the + // readback buffer. { - WGPUCommandEncoder commandEncoder; - commandEncoder = wgpuDeviceCreateCommandEncoder(ctx.device, nullptr); + WGPUCommandEncoder commandEncoder = + wgpuDeviceCreateCommandEncoder(ctx.device, nullptr); wgpuCommandEncoderCopyBufferToBuffer(commandEncoder, buffer, 0, op.readbackBuffer, 0, bufferSize); op.commandBuffer = wgpuCommandEncoderFinish(commandEncoder, nullptr); wgpuCommandEncoderRelease(commandEncoder); check(op.commandBuffer, "Create command buffer", __FILE__, __LINE__); } + + // Submit the command and release the command buffer. wgpuQueueSubmit(ctx.queue, 1, &op.commandBuffer); wgpuCommandBufferRelease(op.commandBuffer); - CallbackData callbackData = {op.readbackBuffer, static_cast(bufferSize), data, &op.promise, - &op.future}; + // Allocate callback data + CallbackData *cbData = new CallbackData{ + op.readbackBuffer, // The readback buffer created above. + static_cast(bufferSize), // Size of the copy. + data, // Destination CPU memory. + promise // Our promise to satisfy when done. + }; + + // Set up the queue work-done callback info. 
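Because each toCPU overload above now returns a std::future and allocates its own readback buffer and CallbackData, a caller can issue a readback and block only when the data is actually needed. An illustrative sketch, where the tensor and host-array names are placeholders:

    std::array<float, 8> hostA, hostB;
    std::future<void> fa = toCPU(ctx, tensorA, hostA.data(), sizeof(hostA));
    std::future<void> fb = toCPU(ctx, tensorB, hostB.data(), sizeof(hostB));
    waitForFuture(ctx.instance, fa); // pump events until the first copy lands
    waitForFuture(ctx.instance, fb); // the second readback carries its own buffer and promise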
WGPUQueueWorkDoneCallbackInfo workDoneCallbackInfo = { .mode = WGPUCallbackMode_AllowSpontaneous, - .callback = - [](WGPUQueueWorkDoneStatus status, void *userdata1, void *userdata2) { - check(status == WGPUQueueWorkDoneStatus_Success, "Queue work done", - __FILE__, __LINE__); - const auto *data = static_cast(userdata1); - WGPUBufferMapCallbackInfo mapCallbackInfo = { - .mode = WGPUCallbackMode_AllowSpontaneous, - .callback = - [](WGPUMapAsyncStatus status, WGPUStringView message, - void *userdata1, void *userdata2) { - const auto *data = static_cast(userdata1); - check(status == WGPUMapAsyncStatus_Success, - "Map readbackBuffer", __FILE__, __LINE__); - const void *mappedData = wgpuBufferGetConstMappedRange( - data->buffer, /*offset=*/0, data->bufferSize); - check(mappedData, "Get mapped range", __FILE__, __LINE__); - memcpy(data->output, mappedData, data->bufferSize); - wgpuBufferUnmap(data->buffer); - data->promise->set_value(); - }, - .userdata1 = const_cast(data), - .userdata2 = nullptr}; - wgpuBufferMapAsync(data->buffer, WGPUMapMode_Read, 0, - data->bufferSize, mapCallbackInfo); - }, - .userdata1 = &callbackData, + .callback = queueWorkDoneCallback, // Our free function callback. + .userdata1 = cbData, // Pass the callback data pointer. .userdata2 = nullptr}; + + // Start the asynchronous chain by registering the work-done callback. wgpuQueueOnSubmittedWorkDone(ctx.queue, workDoneCallbackInfo); - wait(ctx, op.future); - if (op.readbackBuffer) { - wgpuBufferRelease(op.readbackBuffer); - } + return promise->get_future(); } /** @@ -1376,6 +1542,19 @@ inline Shape cdiv(Shape total, Shape group) { return result; } +/** + * @brief Packages the shader compilation information along with a promise for asynchronous signaling. + * + * This structure holds a pointer to a CompilationInfo instance that collects + * details such as status, messages, line numbers, and positions from the shader compilation. + * It also contains a shared pointer to a std::promise which is used to signal the completion + * of the asynchronous shader compilation process. + */ +struct CompData { + CompilationInfo *compInfo; + std::shared_ptr> compPromise; +}; + /** * @brief A factory function to create a kernel on the GPU. 
The kernel is * created with the given WGSL code, input tensors, output tensor, and @@ -1399,34 +1578,38 @@ inline Shape cdiv(Shape total, Shape group) { * @return Kernel instance representing the created kernel * * @code - * Kernel kernel = createKernel(ctx, code, dataBindings, numInputs, + * std::future kernelFuture = createKernel(ctx, code, dataBindings, numInputs, output, nThreads, params, paramsSize); + * Kernel kernel = WaitForFuture(ctx.instance, kernelFuture); * @endcode - * output, nThreads, params, paramsSize); + */ -inline Kernel createKernel(Context& ctx, const KernelCode &code, - const Tensor *dataBindings, size_t numTensors, - const size_t *viewOffsets, - const Shape &totalWorkgroups, - const void *params = nullptr, size_t paramsSize = 0, - CompilationInfo *compilationInfo = nullptr, - const char *cacheKey = nullptr) { +inline std::future +createKernel(Context &ctx, const KernelCode &code, const Tensor *dataBindings, + size_t numTensors, const size_t *viewOffsets, + const Shape &totalWorkgroups, const void *params = nullptr, + size_t paramsSize = 0, CompilationInfo *compilationInfo = nullptr, + const char *cacheKey = nullptr) { // Create a cache key by the pointer values of the data bindings and the // kernel code if (cacheKey != nullptr && ctx.kernelPool.data.find(cacheKey) != ctx.kernelPool.data.end()) { - LOG(kDefLog, kInfo, "Kernel cache hit"); - return ctx.kernelPool.data[cacheKey]; + std::promise ready; + ready.set_value(ctx.kernelPool.data[cacheKey]); + return ready.get_future(); } + // Create an outer promise for the new kernel. + std::promise outerPromise; + std::future outerFuture = outerPromise.get_future(); + assert(totalWorkgroups.rank == 3); WGPUDevice device = ctx.device; WGPUQueue queue = ctx.queue; Kernel op(new RawKernel()); - // paramIndex is the index into bgLayoutEntries for the parameters buffer If // there are no parameters for the kernel, paramsSize == 0 and paramIndex is // effectively undefined (== -1) - size_t paramIndex = -1; + size_t paramIndex = static_cast(-1); // Note: paramIndex is undefined unless paramsSize > 0 size_t numBindings = numTensors; if (paramsSize > 0) { @@ -1435,11 +1618,13 @@ inline Kernel createKernel(Context& ctx, const KernelCode &code, // op.buffers, op.bufferSizes and // bgLayoutEntries } + op->buffers = std::make_unique(numBindings); op->bufferSizes = std::make_unique(numBindings); op->numBindings = numBindings; - std::vector bgLayoutEntries(numBindings); + // Create layout entries for input buffers + std::vector bgLayoutEntries(numBindings); for (size_t i = 0; i < numTensors; ++i) { bgLayoutEntries[i] = WGPUBindGroupLayoutEntry{ .binding = static_cast(i), @@ -1452,8 +1637,6 @@ inline Kernel createKernel(Context& ctx, const KernelCode &code, }; } if (paramsSize > 0) { - LOG(kDefLog, kInfo, "Create layout entry for the params buffer"); - // Create layout entry for the params buffer bgLayoutEntries[paramIndex] = WGPUBindGroupLayoutEntry{ .binding = static_cast(paramIndex), .visibility = WGPUShaderStage_Compute, @@ -1466,10 +1649,11 @@ inline Kernel createKernel(Context& ctx, const KernelCode &code, } WGPUBindGroupLayoutDescriptor bgLayoutDesc = { .entryCount = static_cast(bgLayoutEntries.size()), - .entries = bgLayoutEntries.data(), - }; + .entries = bgLayoutEntries.data()}; WGPUBindGroupLayout bgLayout = wgpuDeviceCreateBindGroupLayout(device, &bgLayoutDesc); + + // Assign buffers from dataBindings. 
for (size_t i = 0; i < numTensors; ++i) { op->buffers[i] = dataBindings[i].data.buffer; op->bufferSizes[i] = dataBindings[i].data.size; @@ -1477,7 +1661,7 @@ inline Kernel createKernel(Context& ctx, const KernelCode &code, // Create a buffer for the Params struct if (paramsSize > 0) { WGPUBufferDescriptor paramsBufferDesc = { - .label = {.data = nullptr, .length = 0}, + .label = {.data = nullptr, .length = 0}, .usage = WGPUBufferUsage_Uniform | WGPUBufferUsage_CopyDst, .size = paramsSize, .mappedAtCreation = false, @@ -1489,6 +1673,8 @@ inline Kernel createKernel(Context& ctx, const KernelCode &code, } else { LOG(kDefLog, kTrace, "No params buffer needed"); } + + // Build bind group entries and the bind group. std::vector bindGroupEntries(numBindings); for (size_t i = 0; i < numTensors; ++i) { bindGroupEntries[i] = WGPUBindGroupEntry{ @@ -1516,6 +1702,7 @@ inline Kernel createKernel(Context& ctx, const KernelCode &code, }; op->bindGroup = wgpuDeviceCreateBindGroup(device, &bindGroupDesc); + // Create pipeline layout. WGPUPipelineLayoutDescriptor pipelineLayoutDesc = { .bindGroupLayoutCount = 1, .bindGroupLayouts = &bgLayout, @@ -1523,63 +1710,101 @@ inline Kernel createKernel(Context& ctx, const KernelCode &code, WGPUPipelineLayout pipelineLayout = wgpuDeviceCreatePipelineLayout(device, &pipelineLayoutDesc); + // Prepare the WGSL source and shader module descriptor. WGPUShaderSourceWGSL wgslDesc = { .chain = {.sType = WGPUSType_ShaderSourceWGSL}, .code = {.data = code.data.c_str(), .length = code.data.length()}}; - WGPUShaderModuleDescriptor shaderModuleDesc = {}; shaderModuleDesc.nextInChain = &wgslDesc.chain; shaderModuleDesc.label = {code.label.c_str(), code.label.length()}; - WGPUComputePipelineDescriptor computePipelineDesc = {}; - computePipelineDesc.layout = pipelineLayout; - computePipelineDesc.compute.module = + // Create the shader module. + WGPUShaderModule shaderModule = wgpuDeviceCreateShaderModule(device, &shaderModuleDesc); + // If compilation info is requested, register the callback immediately. + if (compilationInfo) { + auto compPromise = std::make_shared>(); + std::future compFuture = compPromise->get_future(); + // Allocate helper data to pass to the callback. + auto *compData = new CompData{compilationInfo, compPromise}; + + auto compilationCallback = [](WGPUCompilationInfoRequestStatus status, + WGPUCompilationInfo const *info, + void *userdata1, void * /*userdata2*/) { + CompData *cd = reinterpret_cast(userdata1); + if (info && cd->compInfo) { + cd->compInfo->status = status; + for (uint32_t i = 0; i < info->messageCount; ++i) { + cd->compInfo->messages.push_back( + std::string(info->messages[i].message.data, + info->messages[i].message.length)); + cd->compInfo->lineNums.push_back(info->messages[i].lineNum); + cd->compInfo->linePos.push_back(info->messages[i].linePos); + } + cd->compInfo->finished = true; + } + cd->compPromise->set_value(); + delete cd; + }; + + WGPUCompilationInfoCallbackInfo compilationCallbackInfo = {}; + compilationCallbackInfo.mode = WGPUCallbackMode_AllowSpontaneous; + compilationCallbackInfo.callback = compilationCallback; + compilationCallbackInfo.userdata1 = compData; + compilationCallbackInfo.userdata2 = nullptr; + + // Register callback and then wait for the result. + wgpuShaderModuleGetCompilationInfo(shaderModule, compilationCallbackInfo); + waitForFuture(ctx.instance, compFuture); + } + + // Now create the compute pipeline using the shader module. 
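Passing a CompilationInfo pointer makes createKernel() wait on the shader compilation report before building the pipeline, so WGSL diagnostics can be inspected after the kernel future resolves. A usage sketch, assuming the library's empty-parameter placeholder type NoParam and the kGelu/Bindings names used in the examples:

    CompilationInfo ci;
    std::future<Kernel> kf = createKernel(ctx, {kGelu, 256, kf32},
                                          Bindings{input, output},
                                          {cdiv(N, 256), 1, 1}, NoParam{}, &ci);
    Kernel op = waitForFuture(ctx.instance, kf);
    for (size_t i = 0; i < ci.messages.size(); ++i) {
      LOG(kDefLog, kInfo, "WGSL %zu:%zu %s",
          static_cast<size_t>(ci.lineNums[i]),
          static_cast<size_t>(ci.linePos[i]), ci.messages[i].c_str());
    }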
+ WGPUComputePipelineDescriptor computePipelineDesc = {}; + computePipelineDesc.layout = pipelineLayout; + computePipelineDesc.compute.module = shaderModule; computePipelineDesc.compute.entryPoint = {code.entryPoint.c_str(), code.entryPoint.length()}; computePipelineDesc.label = {code.label.c_str(), code.label.length()}; - op->computePipeline = wgpuDeviceCreateComputePipeline(device, &computePipelineDesc); + op->totalWorkgroups = {totalWorkgroups[0], totalWorkgroups[1], totalWorkgroups[2]}; + resetCommandBuffer(device, op); if (cacheKey != nullptr) ctx.kernelPool.data[cacheKey] = op; - auto compilationInfoCallback = [](WGPUCompilationInfoRequestStatus status, - WGPUCompilationInfo const *compilationInfo, - void *userdata1, void *userdata2) { - CompilationInfo *result = static_cast(userdata1); - if (compilationInfo && result) { - result->status = status; - for (uint32_t i = 0; i < compilationInfo->messageCount; ++i) { - printf("Message %d: %.*s\n", i, - static_cast(compilationInfo->messages[i].message.length), - compilationInfo->messages[i].message.data); - result->messages.push_back( - std::string(compilationInfo->messages[i].message.data, - compilationInfo->messages[i].message.length)); - result->lineNums.push_back(compilationInfo->messages[i].lineNum); - result->linePos.push_back(compilationInfo->messages[i].linePos); - } - result->finished = true; - } else { - LOG(kDefLog, kTrace, "No compilation info or result"); - } - }; - - WGPUCompilationInfoCallbackInfo compilationCallbackInfo = { - .mode = WGPUCallbackMode_AllowSpontaneous, - .callback = compilationInfoCallback, - .userdata1 = static_cast(compilationInfo), - .userdata2 = nullptr}; + outerPromise.set_value(op); + return outerFuture; +} - while (compilationInfo && !compilationInfo->finished) { - processEvents(ctx.instance); +/** + * @brief Free‑standing callback for dispatchKernel’s asynchronous work‐done. + * + * This callback is invoked when the GPU queue signals the completion of the submitted + * workload for a kernel dispatch. It receives the work-done status and a userdata pointer, + * which is expected to be a heap‑allocated pointer to a std::promise. + * + * On success, the promise is fulfilled by calling set_value(). Otherwise, it is set with an exception. + * After setting the promise state, the allocated memory for the promise is freed. + * + * @param status The status of the work done. Expected to be WGPUQueueWorkDoneStatus_Success on success. + * @param userdata1 A heap allocated pointer to std::promise which is set when the work is done. + * @param userdata2 Unused. + */ +inline void dispatchKernelCallback(WGPUQueueWorkDoneStatus status, + void *userdata1, void * /*userdata2*/) { + // Cast the userdata pointer back to our heap‑allocated promise. 
+ auto *p = reinterpret_cast *>(userdata1); + if (status == WGPUQueueWorkDoneStatus_Success) { + p->set_value(); + } else { + p->set_exception(std::make_exception_ptr( + std::runtime_error("Queue work did not complete successfully."))); } - return op; + delete p; // free the heap allocation } /** @@ -1599,17 +1824,17 @@ inline Kernel createKernel(Context& ctx, const KernelCode &code, * @return Kernel instance representing the created kernel * * @code - * Kernel kernel = createKernel(ctx, code, tensorData, output, + * std::future kernelFuture = createKernel(ctx, code, tensorData, output,totalWorkgroups, params); + * Kernel kernel = WaitForFuture(ctx.instance, kernelFuture); * @endcode - * totalWorkgroups, params); */ template -Kernel createKernel(Context &ctx, const KernelCode &code, - const Bindings &dataBindings, - const Shape &totalWorkgroups, - const ParamsType ¶ms = ParamsType{}, - CompilationInfo *compilationInfo = nullptr, - const char *cacheKey = nullptr) { +std::future createKernel(Context &ctx, const KernelCode &code, + const Bindings &dataBindings, + const Shape &totalWorkgroups, + const ParamsType ¶ms = ParamsType{}, + CompilationInfo *compilationInfo = nullptr, + const char *cacheKey = nullptr) { if constexpr (!IsNoParam) { return createKernel(ctx, code, dataBindings.data.data(), numInputs, dataBindings.viewOffsets.data(), totalWorkgroups, @@ -1637,30 +1862,37 @@ Kernel createKernel(Context &ctx, const KernelCode &code, * @param[in] promise Promise to set when the kernel has finished executing * * @code - * dispatchKernel(ctx, kernel); + * std::future dispatchFuture = dispatchKernel(ctx, kernel); + * WaitForFuture(ctx.instance, dispatchFuture); * @endcode */ -inline void dispatchKernel(Context &ctx, Kernel &kernel, - std::promise &promise) { +inline std::future dispatchKernel(Context &ctx, Kernel &kernel) { + // If the kernel was used before, reset the command buffer. if (kernel->used) { resetCommandBuffer(ctx.device, kernel); } + + // Submit the command buffer and release it. wgpuQueueSubmit(ctx.queue, 1, &kernel->commandBuffer); wgpuCommandBufferRelease(kernel->commandBuffer); kernel->used = true; - WGPUQueueWorkDoneCallbackInfo workDoneCallbackInfo = { - .mode = WGPUCallbackMode_AllowSpontaneous, - .callback = - [](WGPUQueueWorkDoneStatus status, void *userdata1, void *userdata2) { - check(status == WGPUQueueWorkDoneStatus_Success, "Queue work done", - __FILE__, __LINE__); - auto *promise = static_cast *>(userdata1); - promise->set_value(); - }, - .userdata1 = &promise, - .userdata2 = nullptr}; + // Allocate a promise on the heap so it remains valid beyond this function’s + // scope. + std::promise *promise = new std::promise(); + std::future future = promise->get_future(); + + // Set up the callback info. + WGPUQueueWorkDoneCallbackInfo workDoneCallbackInfo = {}; + workDoneCallbackInfo.mode = WGPUCallbackMode_AllowSpontaneous; + workDoneCallbackInfo.callback = dispatchKernelCallback; + workDoneCallbackInfo.userdata1 = reinterpret_cast(promise); + workDoneCallbackInfo.userdata2 = nullptr; + + // IMPORTANT: Pass the address of the callback info structure. 
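Since dispatchKernel() now hands back a std::future instead of filling a caller-supplied promise, CPU-side work can be interleaved between submitting the dispatch and blocking on its completion. A brief sketch, where prepareNextBatch() is a hypothetical placeholder for any host-side work:

    std::future<void> done = dispatchKernel(ctx, op);
    prepareNextBatch();                 // hypothetical CPU-side work while the GPU runs
    waitForFuture(ctx.instance, done);  // pump events until the queue signals completion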
wgpuQueueOnSubmittedWorkDone(ctx.queue, workDoneCallbackInfo); + + return future; } } // namespace gpu diff --git a/numeric_types/half.cpp b/numeric_types/half.cpp index fe5aab7..75d9dc4 100644 --- a/numeric_types/half.cpp +++ b/numeric_types/half.cpp @@ -189,7 +189,8 @@ void testContainers() { std::array h = {1.0f, 0.5f, 2.0f, 3.14f, 1.0, 2.0, 3.0, 4.0}; Tensor devH = createTensor(ctx, {h.size()}, kf16, h.data()); std::array h2; - toCPU(ctx, devH, h2.data(), sizeof(h2)); + std::future toCPUFuture = toCPU(ctx, devH, h2.data(), sizeof(h2)); + waitForFuture(ctx.instance, toCPUFuture); for (int i = 0; i < 8; ++i) { printResult(h[i].data == h2[i].data, "Container round trip", static_cast(h[i]), static_cast(h2[i])); @@ -228,13 +229,13 @@ fn main( } Tensor input = createTensor(ctx, Shape{N}, kf16, inputArr.data()); Tensor output = createTensor(ctx, Shape{N}, kf16); - std::promise promise; - std::future future = promise.get_future(); - Kernel op = createKernel(ctx, {kGelu, 256, kf16}, Bindings{input, output}, + std::future kernelFuture = createKernel(ctx, {kGelu, 256, kf16}, Bindings{input, output}, {cdiv(N, 256), 1, 1}); - dispatchKernel(ctx, op, promise); - wait(ctx, future); - toCPU(ctx, output, outputArr.data(), sizeof(outputArr)); + Kernel op = waitForFuture(ctx.instance, kernelFuture); + std::future dispatchFuture = dispatchKernel(ctx, op); + waitForFuture(ctx.instance, dispatchFuture); + std::future toCPUFuture = toCPU(ctx, output, outputArr.data(), sizeof(outputArr)); + waitForFuture(ctx.instance, toCPUFuture); for (int i = 0; i < 12; ++i) { printf(" gelu(%.2f) = %.2f\n", static_cast(inputArr[i]), static_cast(outputArr[i])); From 14e7ab59a67329573bc69a7dfce5d431ba8777b3 Mon Sep 17 00:00:00 2001 From: MichealReed Date: Wed, 19 Feb 2025 18:06:26 -0600 Subject: [PATCH 15/30] use async context waitForContext() --- cmake/example.cmake | 4 +- examples/hello_world/run.cpp | 4 +- gpu.hpp | 279 ++++++++++++++++++++++------------- numeric_types/half.cpp | 5 +- 4 files changed, 180 insertions(+), 112 deletions(-) diff --git a/cmake/example.cmake b/cmake/example.cmake index 7cf1f8d..5953876 100644 --- a/cmake/example.cmake +++ b/cmake/example.cmake @@ -45,14 +45,16 @@ if(EMSCRIPTEN) # Set Emscripten-specific link flags that enable WASM output and expose certain symbols. 
# Needed to use updated version, emdawnwebgpu set_target_properties(${PROJECT_NAME} PROPERTIES LINK_FLAGS "\ + -O3 \ -sUSE_WEBGPU=0 \ -sWASM=1 \ -DDAWN_EMSCRIPTEN_TOOLCHAIN=${EMSCRIPTEN_DIR} \ -sEXPORTED_FUNCTIONS=_main,_malloc,_free,_memcpy \ -sEXPORTED_RUNTIME_METHODS=ccall \ -sUSE_GLFW=3 \ - -sALLOW_MEMORY_GROWTH=1 -sSTACK_SIZE=5MB \ + -sALLOW_MEMORY_GROWTH=1 -sSTACK_SIZE=15MB \ -sASYNCIFY \ + -sASYNCIFY_DEBUG \ --js-library=${DAWN_BUILD_DIR}/gen/src/emdawnwebgpu/library_webgpu_enum_tables.js \ --js-library=${DAWN_BUILD_DIR}/gen/src/emdawnwebgpu/library_webgpu_generated_struct_info.js \ --js-library=${DAWN_BUILD_DIR}/gen/src/emdawnwebgpu/library_webgpu_generated_sig_info.js \ diff --git a/examples/hello_world/run.cpp b/examples/hello_world/run.cpp index 06970a7..c9f22c7 100644 --- a/examples/hello_world/run.cpp +++ b/examples/hello_world/run.cpp @@ -28,7 +28,7 @@ int main(int argc, char **argv) { printf("--------------\n\n"); // std::unique_ptr ctx = createContext(); - Context ctx = createContext(); + Context ctx = waitForContext(); static constexpr size_t N = 10000; std::array inputArr, outputArr; for (int i = 0; i < N; ++i) { @@ -36,8 +36,6 @@ int main(int argc, char **argv) { } Tensor input = createTensor(ctx, Shape{N}, kf32, inputArr.data()); Tensor output = createTensor(ctx, Shape{N}, kf32); - std::promise promise; - std::future future = promise.get_future(); std::future kernelFuture = createKernel(ctx, {kGelu, 256, kf32}, Bindings{input, output}, {cdiv(N, 256), 1, 1}); diff --git a/gpu.hpp b/gpu.hpp index 052c674..0119108 100644 --- a/gpu.hpp +++ b/gpu.hpp @@ -793,10 +793,11 @@ inline void check(bool condition, const char *message, /** * @brief Pumps events until the provided future is ready. * - * This helper template function continuously checks the status of the provided std::future - * until it becomes ready. On Emscripten builds, it yields control to the JavaScript event loop - * using emscripten_sleep to allow asynchronous callbacks to execute. On other platforms, it - * processes events from the given WGPUInstance using wgpuInstanceProcessEvents. Once the future + * This helper template function continuously checks the status of the provided + * std::future until it becomes ready. On Emscripten builds, it yields + * control to the JavaScript event loop using emscripten_sleep to allow + * asynchronous callbacks to execute. On other platforms, it processes events + * from the given WGPUInstance using wgpuInstanceProcessEvents. Once the future * is ready, its value is returned. * * @tparam T The type of the value contained in the future. @@ -805,8 +806,8 @@ inline void check(bool condition, const char *message, * @return T The value retrieved from the ready future. * * @code - * std::future deviceFuture = requestDeviceAsync(adapter, devDescriptor); - * WGPUDevice device = waitForFuture(instance, deviceFuture); + * std::future deviceFuture = requestDeviceAsync(adapter, + * devDescriptor); WGPUDevice device = waitForFuture(instance, deviceFuture); * @endcode */ template @@ -831,17 +832,56 @@ T waitForFuture(WGPUInstance instance, std::future &f) { // Context Callbacks & Helpers /** - * @brief Adapter callback function invoked upon completion of an asynchronous WebGPU adapter request. + * @brief Waits for the provided std::future to become ready by polling its status. * - * This callback is triggered when the request for a WebGPU adapter completes. It verifies whether - * the adapter was successfully obtained. 
On failure, it logs an error message (in Emscripten builds) - * and sets an exception on the associated promise. On success, it sets the value of the promise with - * the obtained adapter. Finally, it frees the allocated memory for the promise pointer. + * This helper template function continuously checks the status of the provided std::future until it is ready. + * On Emscripten builds, it yields control to the JavaScript event loop using emscripten_sleep(1) for smooth asynchronous behavior. + * On non-Emscripten platforms, it sleeps for a short duration (10 milliseconds) between checks. + * Once the future is ready, its value is returned. * - * @param status The status of the adapter request. Expected to be WGPURequestAdapterStatus_Success on success. + * @tparam T The type of the value contained in the future. + * @param f The future to wait on. + * @return T The value retrieved from the ready future. + * + * @code + * std::future contextFuture = createContext(); + * Context ctx = waitForContextFuture(contextFuture); + * @endcode + */ +template T waitForContextFuture(std::future &f) { + #ifdef __EMSCRIPTEN__ + while (f.wait_for(std::chrono::milliseconds(0)) != + std::future_status::ready) { + emscripten_sleep(1); // Yield back to the JS event loop. + } + return f.get(); + #else + while (f.wait_for(std::chrono::milliseconds(0)) != + std::future_status::ready) { + std::this_thread::sleep_for(std::chrono::milliseconds(10)); + } + return f.get(); + #endif + } + +/** + * @brief Adapter callback function invoked upon completion of an asynchronous + * WebGPU adapter request. + * + * This callback is triggered when the request for a WebGPU adapter completes. + * It verifies whether the adapter was successfully obtained. On failure, it + * logs an error message (in Emscripten builds) and sets an exception on the + * associated promise. On success, it sets the value of the promise with the + * obtained adapter. Finally, it frees the allocated memory for the promise + * pointer. + * + * @param status The status of the adapter request. Expected to be + * WGPURequestAdapterStatus_Success on success. * @param adapter The WGPUAdapter obtained on a successful request. - * @param message A string view containing additional information about the adapter request. - * @param userdata1 A pointer to a heap-allocated std::shared_ptr>. + * @param message A string view containing additional information about the + * adapter request. + * @param userdata1 A pointer to a heap-allocated + * std::shared_ptr>. * @param userdata2 Unused. */ inline void adapterCallback(WGPURequestAdapterStatus status, @@ -864,17 +904,22 @@ inline void adapterCallback(WGPURequestAdapterStatus status, } /** - * @brief Callback function invoked upon completion of an asynchronous WebGPU device request. + * @brief Callback function invoked upon completion of an asynchronous WebGPU + * device request. * - * This callback is triggered when the request for a WebGPU device completes. It verifies that - * the device was successfully created. On success, the callback sets the value of the associated - * promise; otherwise, it sets an exception. After fulfilling the promise, it frees the allocated - * memory for the promise pointer. + * This callback is triggered when the request for a WebGPU device completes. It + * verifies that the device was successfully created. On success, the callback + * sets the value of the associated promise; otherwise, it sets an exception. 
+ * After fulfilling the promise, it frees the allocated memory for the promise + * pointer. * - * @param status The status of the device request. Expected to be WGPURequestDeviceStatus_Success on success. + * @param status The status of the device request. Expected to be + * WGPURequestDeviceStatus_Success on success. * @param device The WGPUDevice obtained on successful request. - * @param message A string view containing additional information about the device request. - * @param userdata1 A pointer to a heap-allocated std::shared_ptr>. + * @param message A string view containing additional information about the + * device request. + * @param userdata1 A pointer to a heap-allocated + * std::shared_ptr>. * @param userdata2 Unused. */ inline void deviceCallback(WGPURequestDeviceStatus status, WGPUDevice device, @@ -897,13 +942,14 @@ inline void deviceCallback(WGPURequestDeviceStatus status, WGPUDevice device, /** * @brief Asynchronously requests a WebGPU adapter from the given instance. * - * This helper function wraps the asynchronous call to request an adapter using the WebGPU API. - * It sets up a promise and registers an adapter callback, returning a future that will eventually - * hold the requested WGPUAdapter. + * This helper function wraps the asynchronous call to request an adapter using + * the WebGPU API. It sets up a promise and registers an adapter callback, + * returning a future that will eventually hold the requested WGPUAdapter. * * @param instance The WGPUInstance from which to request the adapter. * @param adapterOpts The options for requesting the adapter. - * @return std::future A future that will eventually hold the created WGPUAdapter. + * @return std::future A future that will eventually hold the + * created WGPUAdapter. */ inline std::future requestAdapterAsync(WGPUInstance instance, @@ -923,13 +969,15 @@ requestAdapterAsync(WGPUInstance instance, /** * @brief Asynchronously requests a WebGPU device from a given adapter. * - * This helper function wraps the asynchronous call to request a device using the WebGPU API. - * It sets up a promise and registers a device callback, returning a future that will be fulfilled - * once the device is available. + * This helper function wraps the asynchronous call to request a device using + * the WebGPU API. It sets up a promise and registers a device callback, + * returning a future that will be fulfilled once the device is available. * * @param adapter The WGPUAdapter to request the device from. - * @param devDescriptor The descriptor specifying the characteristics of the requested device. - * @return std::future A future that will eventually hold the created WGPUDevice. + * @param devDescriptor The descriptor specifying the characteristics of the + * requested device. + * @return std::future A future that will eventually hold the + * created WGPUDevice. */ inline std::future requestDeviceAsync(WGPUAdapter adapter, @@ -964,60 +1012,62 @@ requestDeviceAsync(WGPUAdapter adapter, * @return Context instance representing the created GPU context * */ -inline Context createContext(const WGPUInstanceDescriptor &desc = {}, - const WGPURequestAdapterOptions &adapterOpts = {}, - const WGPUDeviceDescriptor &devDescriptor = {}) { - Context ctx; // Stack-allocated Context. 
+inline std::future +createContext(const WGPUInstanceDescriptor &desc = {}, + const WGPURequestAdapterOptions &adapterOpts = {}, + const WGPUDeviceDescriptor &devDescriptor = {}) { -#ifdef __EMSCRIPTEN__ - ctx.instance = wgpuCreateInstance(nullptr); -#else - ctx.instance = wgpuCreateInstance(&desc); -#endif - check(ctx.instance, "Initialize WebGPU", __FILE__, __LINE__); + auto promise = std::make_shared>(); - // Request the adapter asynchronously. - LOG(kDefLog, kTrace, "Requesting adapter"); + // On native platforms, run our context creation in a detached thread. + + Context ctx; + ctx.instance = wgpuCreateInstance(&desc); + if (!ctx.instance) { + promise->set_exception(std::make_exception_ptr( + std::runtime_error("Failed to create WebGPU instance."))); + return promise->get_future(); + } try { auto adapterFuture = requestAdapterAsync(ctx.instance, adapterOpts); - // Pump events until the adapter future is ready. ctx.adapter = waitForFuture(ctx.instance, adapterFuture); ctx.adapterStatus = WGPURequestAdapterStatus_Success; } catch (const std::exception &ex) { - check(false, ex.what(), __FILE__, __LINE__); + promise->set_exception(std::make_exception_ptr(ex)); + return promise->get_future(); } - - // Request the device asynchronously. - LOG(kDefLog, kTrace, "Requesting device"); try { auto deviceFuture = requestDeviceAsync(ctx.adapter, devDescriptor); - // Pump events until the device future is ready. ctx.device = waitForFuture(ctx.instance, deviceFuture); ctx.deviceStatus = WGPURequestDeviceStatus_Success; - LOG(kDefLog, kTrace, "Device request ended"); - - // If the device was created, set up logging and fetch the queue. -#ifndef __EMSCRIPTEN__ - WGPULoggingCallbackInfo loggingCallbackInfo{ - .nextInChain = nullptr, - .callback = - [](WGPULoggingType type, WGPUStringView message, void *, void *) { - LOG(kDefLog, kError, "Device logging callback: %.*s", - static_cast(message.length), message.data); - if (type == WGPULoggingType_Error) { - throw std::runtime_error("Device error logged."); - } - }, - .userdata1 = nullptr, - .userdata2 = nullptr}; - wgpuDeviceSetLoggingCallback(ctx.device, loggingCallbackInfo); -#endif - ctx.queue = wgpuDeviceGetQueue(ctx.device); } catch (const std::exception &ex) { - check(false, ex.what(), __FILE__, __LINE__); + promise->set_exception(std::make_exception_ptr(ex)); + return promise->get_future(); } + ctx.queue = wgpuDeviceGetQueue(ctx.device); + promise->set_value(std::move(ctx)); + + return promise->get_future(); +} - return std::move(ctx); +/** + * @brief Synchronously waits for and returns the created GPU context. + * + * This function invokes the asynchronous createContext() factory function to create a GPU + * context, then waits for its completion using waitForContextFuture. The returned Context + * holds handles to the WebGPU instance, adapter, device, and queue, and is used for subsequent + * GPU operations. + * + * @return Context The fully initialized GPU context. + * + * @code + * Context ctx = waitForContext(); + * // Now ctx can be used for GPU operations. + * @endcode + */ +inline Context waitForContext() { + std::future contextFuture = createContext(); + return waitForContextFuture(contextFuture); } #ifdef USE_DAWN_API @@ -1152,17 +1202,22 @@ createContextByGpuIdx(int gpuIdx, const WGPUInstanceDescriptor &desc = {}, #endif /** - * @brief Callback function invoked upon completion of an asynchronous GPU buffer mapping. - * - * This callback is triggered when the GPU buffer mapping for a readback buffer is completed. 
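With createContext() returning a std::future of the Context, callers can either use the blocking convenience wrapper or resolve the future explicitly with waitForContextFuture(), as the numeric_types/half.cpp test does when it passes a custom device descriptor. Both styles in a short sketch:

    // Convenience wrapper: create and block in one call.
    Context ctx = waitForContext();

    // Explicit form: keep the future and resolve it yourself, e.g. when
    // supplying a device descriptor as numeric_types/half.cpp does for f16.
    std::future<Context> pending = createContext();
    Context ctx2 = waitForContextFuture(pending);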
- * It verifies that the mapping operation was successful, retrieves the mapped memory, - * copies the data from the GPU buffer to a CPU memory region, unmaps the buffer, - * signals the completion by fulfilling the associated promise, and cleans up the allocated callback data. - * - * @param status The mapping status. Expected to be WGPUMapAsyncStatus_Success on success. - * @param message A string view containing additional information about the mapping operation. - * @param userdata1 A pointer to a heap-allocated CallbackData structure containing the GPU buffer, - * buffer size, destination CPU memory pointer, and a promise for signaling completion. + * @brief Callback function invoked upon completion of an asynchronous GPU + * buffer mapping. + * + * This callback is triggered when the GPU buffer mapping for a readback buffer + * is completed. It verifies that the mapping operation was successful, + * retrieves the mapped memory, copies the data from the GPU buffer to a CPU + * memory region, unmaps the buffer, signals the completion by fulfilling the + * associated promise, and cleans up the allocated callback data. + * + * @param status The mapping status. Expected to be WGPUMapAsyncStatus_Success + * on success. + * @param message A string view containing additional information about the + * mapping operation. + * @param userdata1 A pointer to a heap-allocated CallbackData structure + * containing the GPU buffer, buffer size, destination CPU memory pointer, and a + * promise for signaling completion. * @param userdata2 Unused. */ inline void bufferMapCallback(WGPUMapAsyncStatus status, WGPUStringView message, @@ -1192,16 +1247,20 @@ inline void bufferMapCallback(WGPUMapAsyncStatus status, WGPUStringView message, } /** - * @brief Callback function invoked when the GPU queue’s submitted work is complete. - * - * This callback is registered with the GPU queue after submitting work. When invoked, - * it verifies that all queued work completed successfully, and then sets up the buffer - * mapping callback to initiate the asynchronous mapping of a readback buffer. The readback - * buffer is mapped to access the processed data on the CPU. - * - * @param status The status of the completed work. Expected to be WGPUQueueWorkDoneStatus_Success on success. - * @param userdata1 A pointer to a heap-allocated CallbackData structure containing the readback buffer, - * buffer size, destination CPU memory pointer, and a promise to signal completion. + * @brief Callback function invoked when the GPU queue’s submitted work is + * complete. + * + * This callback is registered with the GPU queue after submitting work. When + * invoked, it verifies that all queued work completed successfully, and then + * sets up the buffer mapping callback to initiate the asynchronous mapping of a + * readback buffer. The readback buffer is mapped to access the processed data + * on the CPU. + * + * @param status The status of the completed work. Expected to be + * WGPUQueueWorkDoneStatus_Success on success. + * @param userdata1 A pointer to a heap-allocated CallbackData structure + * containing the readback buffer, buffer size, destination CPU memory pointer, + * and a promise to signal completion. * @param userdata2 Unused. */ inline void queueWorkDoneCallback(WGPUQueueWorkDoneStatus status, @@ -1543,12 +1602,14 @@ inline Shape cdiv(Shape total, Shape group) { } /** - * @brief Packages the shader compilation information along with a promise for asynchronous signaling. 
+ * @brief Packages the shader compilation information along with a promise for + * asynchronous signaling. * * This structure holds a pointer to a CompilationInfo instance that collects - * details such as status, messages, line numbers, and positions from the shader compilation. - * It also contains a shared pointer to a std::promise which is used to signal the completion - * of the asynchronous shader compilation process. + * details such as status, messages, line numbers, and positions from the shader + * compilation. It also contains a shared pointer to a std::promise which + * is used to signal the completion of the asynchronous shader compilation + * process. */ struct CompData { CompilationInfo *compInfo; @@ -1578,10 +1639,11 @@ struct CompData { * @return Kernel instance representing the created kernel * * @code - * std::future kernelFuture = createKernel(ctx, code, dataBindings, numInputs, output, nThreads, params, paramsSize); + * std::future kernelFuture = createKernel(ctx, code, dataBindings, + numInputs, output, nThreads, params, paramsSize); * Kernel kernel = WaitForFuture(ctx.instance, kernelFuture); * @endcode - + */ inline std::future createKernel(Context &ctx, const KernelCode &code, const Tensor *dataBindings, @@ -1783,15 +1845,19 @@ createKernel(Context &ctx, const KernelCode &code, const Tensor *dataBindings, /** * @brief Free‑standing callback for dispatchKernel’s asynchronous work‐done. * - * This callback is invoked when the GPU queue signals the completion of the submitted - * workload for a kernel dispatch. It receives the work-done status and a userdata pointer, - * which is expected to be a heap‑allocated pointer to a std::promise. + * This callback is invoked when the GPU queue signals the completion of the + * submitted workload for a kernel dispatch. It receives the work-done status + * and a userdata pointer, which is expected to be a heap‑allocated pointer to a + * std::promise. * - * On success, the promise is fulfilled by calling set_value(). Otherwise, it is set with an exception. - * After setting the promise state, the allocated memory for the promise is freed. + * On success, the promise is fulfilled by calling set_value(). Otherwise, it is + * set with an exception. After setting the promise state, the allocated memory + * for the promise is freed. * - * @param status The status of the work done. Expected to be WGPUQueueWorkDoneStatus_Success on success. - * @param userdata1 A heap allocated pointer to std::promise which is set when the work is done. + * @param status The status of the work done. Expected to be + * WGPUQueueWorkDoneStatus_Success on success. + * @param userdata1 A heap allocated pointer to std::promise which is set + * when the work is done. * @param userdata2 Unused. 
*/ inline void dispatchKernelCallback(WGPUQueueWorkDoneStatus status, @@ -1824,8 +1890,9 @@ inline void dispatchKernelCallback(WGPUQueueWorkDoneStatus status, * @return Kernel instance representing the created kernel * * @code - * std::future kernelFuture = createKernel(ctx, code, tensorData, output,totalWorkgroups, params); - * Kernel kernel = WaitForFuture(ctx.instance, kernelFuture); + * std::future kernelFuture = createKernel(ctx, code, tensorData, + * output,totalWorkgroups, params); Kernel kernel = WaitForFuture(ctx.instance, + * kernelFuture); * @endcode */ template diff --git a/numeric_types/half.cpp b/numeric_types/half.cpp index 75d9dc4..21a0005 100644 --- a/numeric_types/half.cpp +++ b/numeric_types/half.cpp @@ -185,7 +185,7 @@ void testContainers() { testRoundTrip(h[3]); } { - Context ctx = createContext(); + Context ctx = waitForContext(); std::array h = {1.0f, 0.5f, 2.0f, 3.14f, 1.0, 2.0, 3.0, 4.0}; Tensor devH = createTensor(ctx, {h.size()}, kf16, h.data()); std::array h2; @@ -215,13 +215,14 @@ fn main( } } )"; - Context ctx = createContext( + std::future futureContext = createContext( {}, {}, /*device descriptor, enabling f16 in WGSL*/ { .requiredFeatureCount = 1, .requiredFeatures = std::array{WGPUFeatureName_ShaderF16}.data(), }); + Context ctx = waitForContextFuture(futureContext); static constexpr size_t N = 10000; std::array inputArr, outputArr; for (int i = 0; i < N; ++i) { From 9a08f8a875d74fda1644adbb367edfdf2f70838a Mon Sep 17 00:00:00 2001 From: MichealReed Date: Thu, 20 Feb 2025 13:54:02 -0600 Subject: [PATCH 16/30] adds sync wrappers --- cmake/example.cmake | 1 - examples/hello_world/run.cpp | 11 +- examples/render/run.cpp | 6 +- gpu.hpp | 372 +++++++++++++++++++++++++---------- numeric_types/half.cpp | 19 +- 5 files changed, 283 insertions(+), 126 deletions(-) diff --git a/cmake/example.cmake b/cmake/example.cmake index 5953876..cf697b5 100644 --- a/cmake/example.cmake +++ b/cmake/example.cmake @@ -54,7 +54,6 @@ if(EMSCRIPTEN) -sUSE_GLFW=3 \ -sALLOW_MEMORY_GROWTH=1 -sSTACK_SIZE=15MB \ -sASYNCIFY \ - -sASYNCIFY_DEBUG \ --js-library=${DAWN_BUILD_DIR}/gen/src/emdawnwebgpu/library_webgpu_enum_tables.js \ --js-library=${DAWN_BUILD_DIR}/gen/src/emdawnwebgpu/library_webgpu_generated_struct_info.js \ --js-library=${DAWN_BUILD_DIR}/gen/src/emdawnwebgpu/library_webgpu_generated_sig_info.js \ diff --git a/examples/hello_world/run.cpp b/examples/hello_world/run.cpp index c9f22c7..77549cf 100644 --- a/examples/hello_world/run.cpp +++ b/examples/hello_world/run.cpp @@ -28,7 +28,7 @@ int main(int argc, char **argv) { printf("--------------\n\n"); // std::unique_ptr ctx = createContext(); - Context ctx = waitForContext(); + Context ctx = createContext(); static constexpr size_t N = 10000; std::array inputArr, outputArr; for (int i = 0; i < N; ++i) { @@ -36,14 +36,11 @@ int main(int argc, char **argv) { } Tensor input = createTensor(ctx, Shape{N}, kf32, inputArr.data()); Tensor output = createTensor(ctx, Shape{N}, kf32); - std::future kernelFuture = createKernel(ctx, {kGelu, 256, kf32}, + Kernel op = createKernel(ctx, {kGelu, 256, kf32}, Bindings{input, output}, {cdiv(N, 256), 1, 1}); - Kernel op = waitForFuture(ctx.instance, kernelFuture); - std::future dispatchFuture = dispatchKernel(ctx, op); - waitForFuture(ctx.instance, dispatchFuture); - std::future cpuFuture = toCPU(ctx, output, outputArr.data(), sizeof(outputArr)); - waitForFuture(ctx.instance, cpuFuture); + dispatchKernel(ctx, op); + toCPU(ctx, output, outputArr.data(), sizeof(outputArr)); for (int i = 0; i < 12; ++i) 
{ printf(" gelu(%.2f) = %.2f\n", inputArr[i], outputArr[i]); } diff --git a/examples/render/run.cpp b/examples/render/run.cpp index f9a90f9..64122cd 100644 --- a/examples/render/run.cpp +++ b/examples/render/run.cpp @@ -124,10 +124,8 @@ int main(int argc, char **argv) { cdiv({NCOLS, NROWS, 1}, wgSize), params); printf("\033[2J\033[H"); while (true) { - std::promise promise; - std::future future = promise.get_future(); - dispatchKernel(ctx, renderKernel, promise); - wait(ctx, future); + + dispatchKernel(ctx, renderKernel); toCPU(ctx, devScreen, screen.data(), sizeof(screen)); params.time = getCurrentTimeInMilliseconds() - zeroTime; diff --git a/gpu.hpp b/gpu.hpp index 0119108..e050c87 100644 --- a/gpu.hpp +++ b/gpu.hpp @@ -807,11 +807,10 @@ inline void check(bool condition, const char *message, * * @code * std::future deviceFuture = requestDeviceAsync(adapter, - * devDescriptor); WGPUDevice device = waitForFuture(instance, deviceFuture); + * devDescriptor); WGPUDevice device = wait(instance, deviceFuture); * @endcode */ -template -T waitForFuture(WGPUInstance instance, std::future &f) { +template T wait(Context &ctx, std::future &f) { #ifdef __EMSCRIPTEN__ // Poll until the future is ready. while (f.wait_for(std::chrono::milliseconds(0)) != @@ -823,7 +822,7 @@ T waitForFuture(WGPUInstance instance, std::future &f) { #else while (f.wait_for(std::chrono::milliseconds(0)) != std::future_status::ready) { - wgpuInstanceProcessEvents(instance); + wgpuInstanceProcessEvents(ctx.instance); } return f.get(); #endif @@ -832,12 +831,15 @@ T waitForFuture(WGPUInstance instance, std::future &f) { // Context Callbacks & Helpers /** - * @brief Waits for the provided std::future to become ready by polling its status. + * @brief Waits for the provided std::future to become ready by polling its + * status. * - * This helper template function continuously checks the status of the provided std::future until it is ready. - * On Emscripten builds, it yields control to the JavaScript event loop using emscripten_sleep(1) for smooth asynchronous behavior. - * On non-Emscripten platforms, it sleeps for a short duration (10 milliseconds) between checks. - * Once the future is ready, its value is returned. + * This helper template function continuously checks the status of the provided + * std::future until it is ready. On Emscripten builds, it yields control to + * the JavaScript event loop using emscripten_sleep(1) for smooth asynchronous + * behavior. On non-Emscripten platforms, it sleeps for a short duration (10 + * milliseconds) between checks. Once the future is ready, its value is + * returned. * * @tparam T The type of the value contained in the future. * @param f The future to wait on. @@ -849,20 +851,20 @@ T waitForFuture(WGPUInstance instance, std::future &f) { * @endcode */ template T waitForContextFuture(std::future &f) { - #ifdef __EMSCRIPTEN__ - while (f.wait_for(std::chrono::milliseconds(0)) != - std::future_status::ready) { - emscripten_sleep(1); // Yield back to the JS event loop. - } - return f.get(); - #else - while (f.wait_for(std::chrono::milliseconds(0)) != - std::future_status::ready) { - std::this_thread::sleep_for(std::chrono::milliseconds(10)); - } - return f.get(); - #endif +#ifdef __EMSCRIPTEN__ + while (f.wait_for(std::chrono::milliseconds(0)) != + std::future_status::ready) { + emscripten_sleep(1); // Yield back to the JS event loop. 
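The new sync wrappers are thin shells over the futures returned by the *Async functions, with wait(ctx, f) pumping wgpuInstanceProcessEvents until the result is ready. A minimal sketch of the two call styles, assuming the gpu.hpp API as it stands after this patch (the kernel, tensors, and sizes are placeholders created elsewhere):

```cpp
#include "gpu.hpp"
using namespace gpu;

// Runs one dispatch plus readback twice: first with explicit futures, then
// with the synchronous wrappers added in this patch. `op`, `output`, `host`,
// and `bytes` are assumed to be set up elsewhere (see the hello_world example).
void runBothStyles(Context &ctx, Kernel &op, Tensor &output,
                   float *host, size_t bytes) {
  // Explicit style: keep the futures and pump events until they resolve.
  std::future<void> done = dispatchKernelAsync(ctx, op);
  wait(ctx, done);
  std::future<void> copied = toCPUAsync(ctx, output, host, bytes);
  wait(ctx, copied);

  // Wrapper style: dispatchKernel/toCPU perform the same wait internally.
  dispatchKernel(ctx, op);
  toCPU(ctx, output, host, bytes);
}
```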
+ } + return f.get(); +#else + while (f.wait_for(std::chrono::milliseconds(0)) != + std::future_status::ready) { + std::this_thread::sleep_for(std::chrono::milliseconds(10)); } + return f.get(); +#endif +} /** * @brief Adapter callback function invoked upon completion of an asynchronous @@ -1013,9 +1015,9 @@ requestDeviceAsync(WGPUAdapter adapter, * */ inline std::future -createContext(const WGPUInstanceDescriptor &desc = {}, - const WGPURequestAdapterOptions &adapterOpts = {}, - const WGPUDeviceDescriptor &devDescriptor = {}) { +createContextAsync(const WGPUInstanceDescriptor &desc = {}, + const WGPURequestAdapterOptions &adapterOpts = {}, + const WGPUDeviceDescriptor &devDescriptor = {}) { auto promise = std::make_shared>(); @@ -1030,7 +1032,7 @@ createContext(const WGPUInstanceDescriptor &desc = {}, } try { auto adapterFuture = requestAdapterAsync(ctx.instance, adapterOpts); - ctx.adapter = waitForFuture(ctx.instance, adapterFuture); + ctx.adapter = wait(ctx, adapterFuture); ctx.adapterStatus = WGPURequestAdapterStatus_Success; } catch (const std::exception &ex) { promise->set_exception(std::make_exception_ptr(ex)); @@ -1038,7 +1040,7 @@ createContext(const WGPUInstanceDescriptor &desc = {}, } try { auto deviceFuture = requestDeviceAsync(ctx.adapter, devDescriptor); - ctx.device = waitForFuture(ctx.instance, deviceFuture); + ctx.device = wait(ctx, deviceFuture); ctx.deviceStatus = WGPURequestDeviceStatus_Success; } catch (const std::exception &ex) { promise->set_exception(std::make_exception_ptr(ex)); @@ -1053,10 +1055,11 @@ createContext(const WGPUInstanceDescriptor &desc = {}, /** * @brief Synchronously waits for and returns the created GPU context. * - * This function invokes the asynchronous createContext() factory function to create a GPU - * context, then waits for its completion using waitForContextFuture. The returned Context - * holds handles to the WebGPU instance, adapter, device, and queue, and is used for subsequent - * GPU operations. + * This function invokes the asynchronous createContext() factory function to + * create a GPU context, then waits for its completion using + * waitForContextFuture. The returned Context holds handles to the WebGPU + * instance, adapter, device, and queue, and is used for subsequent GPU + * operations. * * @return Context The fully initialized GPU context. * @@ -1065,8 +1068,11 @@ createContext(const WGPUInstanceDescriptor &desc = {}, * // Now ctx can be used for GPU operations. * @endcode */ -inline Context waitForContext() { - std::future contextFuture = createContext(); +inline Context createContext(const WGPUInstanceDescriptor &desc = {}, + const WGPURequestAdapterOptions &adapterOpts = {}, + const WGPUDeviceDescriptor &devDescriptor = {}) { + std::future contextFuture = + createContextAsync(desc, adapterOpts, devDescriptor); return waitForContextFuture(contextFuture); } @@ -1294,8 +1300,8 @@ inline void queueWorkDoneCallback(WGPUQueueWorkDoneStatus status, * toCPU(ctx, tensor, data, bufferSize); * @endcode */ -inline std::future toCPU(Context &ctx, Tensor &tensor, void *data, - size_t bufferSize, CopyData &op) { +inline std::future toCPUAsync(Context &ctx, Tensor &tensor, void *data, + size_t bufferSize, CopyData &op) { // Submit the command buffer and release it. 
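For the context itself the same split applies. This sketch shows both entry points; two contexts are created only to illustrate the two call styles, and the optional descriptors are left at their defaults:

```cpp
#include "gpu.hpp"
using namespace gpu;

int main() {
  // Blocking form: returns a ready-to-use Context (instance, adapter,
  // device, queue).
  Context ctx = createContext();

  // Non-blocking form: start adapter/device acquisition, do other setup,
  // then block only when the context is actually needed.
  std::future<Context> pending = createContextAsync();
  // ... unrelated initialization could run here ...
  Context ctx2 = waitForContextFuture(pending);
  return 0;
}
```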
wgpuQueueSubmit(ctx.queue, 1, &op.commandBuffer); wgpuCommandBufferRelease(op.commandBuffer); @@ -1340,8 +1346,8 @@ inline std::future toCPU(Context &ctx, Tensor &tensor, void *data, * @param[in] bufferSize Size of the data buffer in bytes * @param[out] data Pointer to the CPU memory to copy the data to */ -inline std::future toCPU(Context &ctx, Tensor &tensor, void *data, - size_t bufferSize) { +inline std::future toCPUAsync(Context &ctx, Tensor &tensor, void *data, + size_t bufferSize) { // Create a promise that will later be satisfied when the async copy // completes. auto promise = std::make_shared>(); @@ -1395,26 +1401,8 @@ inline std::future toCPU(Context &ctx, Tensor &tensor, void *data, return promise->get_future(); } -/** - * @brief Overload of the toCPU function to copy data from a GPU buffer to CPU - * memory for an array of floats instead of a pointer to a float buffer. - * @param[in] ctx Context instance to manage the operation - * @param[in] tensor Tensor instance representing the GPU buffer to copy from - * @param[out] data Array of floats to copy the data to - * - * @code - * std::future toCPUFuture = toCPU(ctx, tensor, data); - * WaitForFuture(ctx.instance, toCPUFuture); - * @endcode - */ -template -inline std::future toCPU(Context &ctx, Tensor &tensor, - std::array &data) { - return toCPU(ctx, tensor, data.data(), sizeof(data)); -} - -inline std::future toCPU(Context &ctx, WGPUBuffer buffer, void *data, - size_t size) { +inline std::future toCPUAsync(Context &ctx, WGPUBuffer buffer, void *data, + size_t size) { // The size (in bytes) for the copy. uint64_t bufferSize = size; @@ -1473,6 +1461,92 @@ inline std::future toCPU(Context &ctx, WGPUBuffer buffer, void *data, return promise->get_future(); } +/** + * @brief Overload of the toCPU function to copy data from a GPU buffer to CPU + * memory for an array of floats instead of a pointer to a float buffer. + * @param[in] ctx Context instance to manage the operation + * @param[in] tensor Tensor instance representing the GPU buffer to copy from + * @param[out] data Array of floats to copy the data to + * + * @code + * std::future toCPUFuture = toCPU(ctx, tensor, data); + * wait(ctx, toCPUFuture); + * @endcode + */ +template +inline std::future toCPUAsync(Context &ctx, Tensor &tensor, + std::array &data) { + return toCPUAsync(ctx, tensor, data.data(), sizeof(data)); +} + +/** + * @brief Synchronous wrapper for copying from a Tensor GPU buffer to CPU + * memory. + * + * This function synchronously waits for the asynchronous copy operation to + * complete, ensuring that the data is fully transferred from the GPU buffer to + * the CPU memory before returning. + * + * @param ctx Context instance to manage the operation + * @param tensor Tensor instance representing the GPU buffer to copy from + * @param data Pointer to the CPU memory to copy the data to + * @param bufferSize Size of the data buffer in bytes + * @param instance WGPUInstance used for processing events during waiting + * + * @code + * toCPU(ctx, tensor, data, bufferSize, instance); + * @endcode + */ +inline void toCPU(Context &ctx, Tensor &tensor, void *data, size_t bufferSize) { + auto future = toCPUAsync(ctx, tensor, data, bufferSize); + wait(ctx, future); +} + +/** + * @brief Synchronous wrapper for copying from a GPU buffer to CPU memory. + * + * This function synchronously waits for the asynchronous copy operation to + * complete, ensuring that the data is fully transferred from the GPU buffer to + * the CPU memory before returning. 
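In the tests added later in this series, the raw-buffer overload is fed with the buffer backing an existing Tensor; a small sketch of that pattern (the helper name is illustrative):

```cpp
#include "gpu.hpp"
#include <vector>
using namespace gpu;

// Reads the first n floats of a tensor back through the WGPUBuffer overload.
std::vector<float> readbackRaw(Context &ctx, Tensor &tensor, size_t n) {
  std::vector<float> host(n, 0.0f);
  WGPUBuffer raw = tensor.data.buffer; // underlying GPU buffer of the tensor
  toCPU(ctx, raw, host.data(), n * sizeof(float));
  return host;
}
```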
+ * + * @param ctx Context instance to manage the operation + * @param buffer WGPUBuffer instance representing the GPU buffer to copy from + * @param data Pointer to the CPU memory to copy the data to + * @param size Size of the data buffer in bytes + * @param instance WGPUInstance used for processing events during waiting + * + * @code + * toCPU(ctx, buffer, data, size, instance); + * @endcode + */ +inline void toCPU(Context &ctx, WGPUBuffer buffer, void *data, size_t size) { + auto future = toCPUAsync(ctx, buffer, data, size); + wait(ctx, future); +} + +/** + * @brief Synchronous wrapper for copying from a Tensor GPU buffer to CPU + * memory for an array of floats instead of a pointer to a float buffer. + * + * This function synchronously waits for the asynchronous copy operation to + * complete, ensuring that the data is fully transferred from the GPU buffer to + * the CPU memory before returning. + * + * @param ctx Context instance to manage the operation + * @param tensor Tensor instance representing the GPU buffer to copy from + * @param data Array of floats to copy the data to + * @param instance WGPUInstance used for processing events during waiting + * + * @code + * toCPU(ctx, tensor, data, instance); + * @endcode + */ +template +inline void toCPU(Context &ctx, Tensor &tensor, std::array &data) { + auto future = toCPUAsync(ctx, tensor, data); + wait(ctx, future); +} + /** * @brief Copies data from CPU memory to a GPU buffer. The toGPU overloads are * effectively a convenience wrapper around the WebGPU API call @@ -1617,9 +1691,9 @@ struct CompData { }; /** - * @brief A factory function to create a kernel on the GPU. The kernel is - * created with the given WGSL code, input tensors, output tensor, and - * optional parameters. + * @brief A factory function to create a kernel asynchronously on the GPU. + * The kernel is created with the given WGSL code, input tensors, + * output tensor, and optional parameters. * * Note that the values of the input tensors are not used here, only the * reference handles to the underlying buffers as well as the size of the @@ -1639,18 +1713,19 @@ struct CompData { * @return Kernel instance representing the created kernel * * @code - * std::future kernelFuture = createKernel(ctx, code, dataBindings, + * std::future kernelFuture = createKernelAsync(ctx, code, dataBindings, numInputs, output, nThreads, params, paramsSize); - * Kernel kernel = WaitForFuture(ctx.instance, kernelFuture); + * Kernel kernel = wait(ctx.instance, kernelFuture); * @endcode */ inline std::future -createKernel(Context &ctx, const KernelCode &code, const Tensor *dataBindings, - size_t numTensors, const size_t *viewOffsets, - const Shape &totalWorkgroups, const void *params = nullptr, - size_t paramsSize = 0, CompilationInfo *compilationInfo = nullptr, - const char *cacheKey = nullptr) { +createKernelAsync(Context &ctx, const KernelCode &code, + const Tensor *dataBindings, size_t numTensors, + const size_t *viewOffsets, const Shape &totalWorkgroups, + const void *params = nullptr, size_t paramsSize = 0, + CompilationInfo *compilationInfo = nullptr, + const char *cacheKey = nullptr) { // Create a cache key by the pointer values of the data bindings and the // kernel code if (cacheKey != nullptr && @@ -1818,7 +1893,7 @@ createKernel(Context &ctx, const KernelCode &code, const Tensor *dataBindings, // Register callback and then wait for the result. 
wgpuShaderModuleGetCompilationInfo(shaderModule, compilationCallbackInfo); - waitForFuture(ctx.instance, compFuture); + wait(ctx, compFuture); } // Now create the compute pipeline using the shader module. @@ -1842,35 +1917,81 @@ createKernel(Context &ctx, const KernelCode &code, const Tensor *dataBindings, return outerFuture; } -/** - * @brief Free‑standing callback for dispatchKernel’s asynchronous work‐done. +/* + * @brief Overload which wraps the createKernelAsync factory function to create + * a kernel on the GPU. This overload uses takes a pointer and size for the + * input tensors instead of a static collection and a void pointer for params + * instead of a static type. * - * This callback is invoked when the GPU queue signals the completion of the - * submitted workload for a kernel dispatch. It receives the work-done status - * and a userdata pointer, which is expected to be a heap‑allocated pointer to a - * std::promise. + * @param[in] ctx Context instance to manage the kernel + * @param[in] code WGSL code for the kernel + * @param[in] dataBindings Pointer to a span of tensors bound to the kernel + * @param[in] numTensors Number of tensors in the dataBindings span + * @param[in] totalWorkgroups Number of workgroups in the x, y, z grid, must be + * a Shape of rank == 3. + * @param[in] params Optional parameters for the kernel. If the kernel does + * not have any parameters, use NoParam. + * @return Kernel instance representing the created kernel * - * On success, the promise is fulfilled by calling set_value(). Otherwise, it is - * set with an exception. After setting the promise state, the allocated memory - * for the promise is freed. + * @code + * std::future kernelFuture = createKernel(ctx, code, tensorData, + * output,totalWorkgroups, params); Kernel kernel = wait(ctx.instance, + * kernelFuture); + * @endcode + */ +inline Kernel createKernel(Context &ctx, const KernelCode &code, + const Tensor *dataBindings, size_t numTensors, + const size_t *viewOffsets, + const Shape &totalWorkgroups, + const void *params = nullptr, size_t paramsSize = 0, + CompilationInfo *compilationInfo = nullptr, + const char *cacheKey = nullptr) { + std::future kernelFuture = createKernelAsync( + ctx, code, dataBindings, numTensors, viewOffsets, totalWorkgroups, params, + paramsSize, compilationInfo, cacheKey); + return wait(ctx, kernelFuture); +} + +/** + * @brief Overload which wraps the createKernelAsync factory function to create + * a kernel asynchronously on the GPU. This overload uses takes a static + * collection of input tensors instead of a pointer and a statically determined + * ParamsType instead of casting params to a void pointer. * - * @param status The status of the work done. Expected to be - * WGPUQueueWorkDoneStatus_Success on success. - * @param userdata1 A heap allocated pointer to std::promise which is set - * when the work is done. - * @param userdata2 Unused. + * @param[in] ctx Context instance to manage the kernel + * @param[in] code WGSL code for the kernel + * @param[in] dataBindings A Bindings of tensors whose GPU buffers are bound + * to the kernel as inputs and outputs. + * @param[in] totalWorkgroups Number of workgroups in the x, y, z grid, must be + * a Shape of rank == 3. + * @param[in] params Optional parameters for the kernel. If the kernel does + * not have any parameters, use NoParam. 
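To make the Bindings-based overloads concrete, here is a sketch that builds a kernel through the asynchronous factory. The WGSL doubling shader is made up for the example, but it uses the same {{precision}} and {{workgroupSize}} placeholders as the kernels elsewhere in this series:

```cpp
#include "gpu.hpp"
using namespace gpu;

// Illustrative shader: writes 2*x for every input element.
static const char *kDouble = R"(
@group(0) @binding(0) var<storage, read_write> inp: array<{{precision}}>;
@group(0) @binding(1) var<storage, read_write> out: array<{{precision}}>;
@compute @workgroup_size({{workgroupSize}})
fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
  let i: u32 = gid.x;
  if (i < arrayLength(&inp)) {
    out[i] = inp[i] * 2.0;
  }
}
)";

Kernel buildDoubler(Context &ctx, Tensor &input, Tensor &output, size_t n) {
  // Asynchronous factory: returns a future that resolves to the Kernel.
  std::future<Kernel> pending = createKernelAsync(
      ctx, {kDouble, 256, kf32}, Bindings{input, output}, {cdiv(n, 256), 1, 1});

  // The synchronous overload used by the examples is equivalent to:
  //   Kernel k = createKernel(ctx, {kDouble, 256, kf32},
  //                           Bindings{input, output}, {cdiv(n, 256), 1, 1});
  return wait(ctx, pending);
}
```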
+ * @return Kernel instance representing the created kernel + * + * @code + * std::future kernelFuture = createKernel(ctx, code, tensorData, + * output,totalWorkgroups, params); Kernel kernel = wait(ctx.instance, + * kernelFuture); + * @endcode */ -inline void dispatchKernelCallback(WGPUQueueWorkDoneStatus status, - void *userdata1, void * /*userdata2*/) { - // Cast the userdata pointer back to our heap‑allocated promise. - auto *p = reinterpret_cast *>(userdata1); - if (status == WGPUQueueWorkDoneStatus_Success) { - p->set_value(); +template +std::future +createKernelAsync(Context &ctx, const KernelCode &code, + const Bindings &dataBindings, + const Shape &totalWorkgroups, + const ParamsType ¶ms = ParamsType{}, + CompilationInfo *compilationInfo = nullptr, + const char *cacheKey = nullptr) { + if constexpr (!IsNoParam) { + return createKernelAsync(ctx, code, dataBindings.data.data(), numInputs, + dataBindings.viewOffsets.data(), totalWorkgroups, + reinterpret_cast(¶ms), + sizeof(ParamsType), compilationInfo, cacheKey); } else { - p->set_exception(std::make_exception_ptr( - std::runtime_error("Queue work did not complete successfully."))); + return createKernelAsync(ctx, code, dataBindings.data.data(), numInputs, + dataBindings.viewOffsets.data(), totalWorkgroups, + nullptr, 0, compilationInfo, cacheKey); } - delete p; // free the heap allocation } /** @@ -1890,18 +2011,17 @@ inline void dispatchKernelCallback(WGPUQueueWorkDoneStatus status, * @return Kernel instance representing the created kernel * * @code - * std::future kernelFuture = createKernel(ctx, code, tensorData, - * output,totalWorkgroups, params); Kernel kernel = WaitForFuture(ctx.instance, - * kernelFuture); + * Kernel kernel = createKernel(ctx, code, tensorData, output,totalWorkgroups, + * params); * @endcode */ template -std::future createKernel(Context &ctx, const KernelCode &code, - const Bindings &dataBindings, - const Shape &totalWorkgroups, - const ParamsType ¶ms = ParamsType{}, - CompilationInfo *compilationInfo = nullptr, - const char *cacheKey = nullptr) { +Kernel createKernel(Context &ctx, const KernelCode &code, + const Bindings &dataBindings, + const Shape &totalWorkgroups, + const ParamsType ¶ms = ParamsType{}, + CompilationInfo *compilationInfo = nullptr, + const char *cacheKey = nullptr) { if constexpr (!IsNoParam) { return createKernel(ctx, code, dataBindings.data.data(), numInputs, dataBindings.viewOffsets.data(), totalWorkgroups, @@ -1914,6 +2034,37 @@ std::future createKernel(Context &ctx, const KernelCode &code, } } +/** + * @brief Free‑standing callback for dispatchKernel’s asynchronous work‐done. + * + * This callback is invoked when the GPU queue signals the completion of the + * submitted workload for a kernel dispatch. It receives the work-done status + * and a userdata pointer, which is expected to be a heap‑allocated pointer to a + * std::promise. + * + * On success, the promise is fulfilled by calling set_value(). Otherwise, it is + * set with an exception. After setting the promise state, the allocated memory + * for the promise is freed. + * + * @param status The status of the work done. Expected to be + * WGPUQueueWorkDoneStatus_Success on success. + * @param userdata1 A heap allocated pointer to std::promise which is set + * when the work is done. + * @param userdata2 Unused. + */ +inline void dispatchKernelCallback(WGPUQueueWorkDoneStatus status, + void *userdata1, void * /*userdata2*/) { + // Cast the userdata pointer back to our heap‑allocated promise. 
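Every dispatch heap-allocates one such promise and this callback fulfils it, so from the caller's side repeated dispatches stay simple. A sketch of the loop used by examples/render, assuming the kernel and readback target already exist:

```cpp
#include "gpu.hpp"
using namespace gpu;

// Dispatches the same kernel several times, reading the result back after
// each step. dispatchKernel re-records the kernel's command buffer
// automatically once it has been used.
void stepLoop(Context &ctx, Kernel &k, Tensor &result,
              float *host, size_t bytes, int steps) {
  for (int i = 0; i < steps; ++i) {
    dispatchKernel(ctx, k);
    toCPU(ctx, result, host, bytes);
  }
}
```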
+ auto *p = reinterpret_cast *>(userdata1); + if (status == WGPUQueueWorkDoneStatus_Success) { + p->set_value(); + } else { + p->set_exception(std::make_exception_ptr( + std::runtime_error("Queue work did not complete successfully."))); + } + delete p; // free the heap allocation +} + /** * @brief Asynchronously submits a kernel to the GPU queue for execution. * It also sets up a callback to notify when the kernel has finished executing @@ -1930,10 +2081,10 @@ std::future createKernel(Context &ctx, const KernelCode &code, * * @code * std::future dispatchFuture = dispatchKernel(ctx, kernel); - * WaitForFuture(ctx.instance, dispatchFuture); + * wait(ctx.instance, dispatchFuture); * @endcode */ -inline std::future dispatchKernel(Context &ctx, Kernel &kernel) { +inline std::future dispatchKernelAsync(Context &ctx, Kernel &kernel) { // If the kernel was used before, reset the command buffer. if (kernel->used) { resetCommandBuffer(ctx.device, kernel); @@ -1962,6 +2113,23 @@ inline std::future dispatchKernel(Context &ctx, Kernel &kernel) { return future; } +/** + * @brief Synchronous wrapper for dispatchKernelAsync. This function submits + * the kernel to the GPU queue and waits for it to finish executing. + * + * @param[in] ctx Context instance to manage the kernel, from which the queue + * for the GPU is obtained + * @param[in] kernel Kernel instance to dispatch + * + * @code + * dispatchKernel(ctx, kernel); + * @endcode + */ +inline void dispatchKernel(Context &ctx, Kernel &kernel) { + auto future = dispatchKernelAsync(ctx, kernel); + wait(ctx, future); +} + } // namespace gpu #endif // GPU_H diff --git a/numeric_types/half.cpp b/numeric_types/half.cpp index 21a0005..c183754 100644 --- a/numeric_types/half.cpp +++ b/numeric_types/half.cpp @@ -185,12 +185,11 @@ void testContainers() { testRoundTrip(h[3]); } { - Context ctx = waitForContext(); + Context ctx = createContext(); std::array h = {1.0f, 0.5f, 2.0f, 3.14f, 1.0, 2.0, 3.0, 4.0}; Tensor devH = createTensor(ctx, {h.size()}, kf16, h.data()); std::array h2; - std::future toCPUFuture = toCPU(ctx, devH, h2.data(), sizeof(h2)); - waitForFuture(ctx.instance, toCPUFuture); + toCPU(ctx, devH, h2.data(), sizeof(h2)); for (int i = 0; i < 8; ++i) { printResult(h[i].data == h2[i].data, "Container round trip", static_cast(h[i]), static_cast(h2[i])); @@ -215,14 +214,13 @@ fn main( } } )"; - std::future futureContext = createContext( + Context ctx = createContext( {}, {}, /*device descriptor, enabling f16 in WGSL*/ { .requiredFeatureCount = 1, .requiredFeatures = std::array{WGPUFeatureName_ShaderF16}.data(), }); - Context ctx = waitForContextFuture(futureContext); static constexpr size_t N = 10000; std::array inputArr, outputArr; for (int i = 0; i < N; ++i) { @@ -230,20 +228,17 @@ fn main( } Tensor input = createTensor(ctx, Shape{N}, kf16, inputArr.data()); Tensor output = createTensor(ctx, Shape{N}, kf16); - std::future kernelFuture = createKernel(ctx, {kGelu, 256, kf16}, Bindings{input, output}, + Kernel op = createKernel(ctx, {kGelu, 256, kf16}, Bindings{input, output}, {cdiv(N, 256), 1, 1}); - Kernel op = waitForFuture(ctx.instance, kernelFuture); - std::future dispatchFuture = dispatchKernel(ctx, op); - waitForFuture(ctx.instance, dispatchFuture); - std::future toCPUFuture = toCPU(ctx, output, outputArr.data(), sizeof(outputArr)); - waitForFuture(ctx.instance, toCPUFuture); + dispatchKernel(ctx, op); + toCPU(ctx, output, outputArr.data(), sizeof(outputArr)); for (int i = 0; i < 12; ++i) { printf(" gelu(%.2f) = %.2f\n", static_cast(inputArr[i]), 
static_cast(outputArr[i])); } } -int testMain() { +int testHalfMain() { printf("\nHalf-precision float tests\n==========================\n"); printf("\nRegular values float round trips\n\n"); From 95e587d71d25ab74207648ca91500a7594bff870 Mon Sep 17 00:00:00 2001 From: MichealReed Date: Thu, 20 Feb 2025 16:34:35 -0600 Subject: [PATCH 17/30] refactors the byIdx context function and sets USE_DAWN_API compile def on native --- cmake/dawn.cmake | 2 + cmake/gpu.cmake | 2 + examples/hello_world/run.cpp | 6 + gpu.hpp | 308 ++++++++++++++++++++++------------- 4 files changed, 201 insertions(+), 117 deletions(-) diff --git a/cmake/dawn.cmake b/cmake/dawn.cmake index 2ead9ae..c6fed94 100644 --- a/cmake/dawn.cmake +++ b/cmake/dawn.cmake @@ -7,6 +7,8 @@ if(EMSCRIPTEN) set(EM_SDK_DIR $ENV{EMSDK} CACHE INTERNAL "") set(DAWN_BUILD_DIR "${DAWN_DIR}/build_web" CACHE INTERNAL "") set(DAWN_EMSCRIPTEN_TOOLCHAIN ${EM_SDK_DIR}/upstream/emscripten CACHE INTERNAL "" FORCE) +else() + add_compile_definitions(USE_DAWN_API) endif() # Enable find for no dawn rebuilds with flutter run diff --git a/cmake/gpu.cmake b/cmake/gpu.cmake index 6cce9e6..f936991 100644 --- a/cmake/gpu.cmake +++ b/cmake/gpu.cmake @@ -32,7 +32,9 @@ add_library(gpu STATIC ${GPU_SOURCES} ${GPU_HEADERS}) set_target_properties(gpu PROPERTIES LINKER_LANGUAGE CXX) target_include_directories(gpu PUBLIC "${PROJECT_ROOT}") if(NOT EMSCRIPTEN) + target_include_directories(gpu PUBLIC "${DAWN_BUILD_DIR}/gen/include/") target_include_directories(gpu PUBLIC "${DAWN_BUILD_DIR}/gen/include/dawn/") + target_include_directories(gpu PUBLIC "${DAWN_DIR}/include/") else() target_include_directories(gpu PUBLIC "${DAWN_BUILD_DIR}/gen/src/emdawnwebgpu/include/") target_include_directories(gpu PUBLIC "${DAWN_BUILD_DIR}/gen/src/emdawnwebgpu/include/webgpu/") diff --git a/examples/hello_world/run.cpp b/examples/hello_world/run.cpp index 77549cf..b44934b 100644 --- a/examples/hello_world/run.cpp +++ b/examples/hello_world/run.cpp @@ -28,7 +28,13 @@ int main(int argc, char **argv) { printf("--------------\n\n"); // std::unique_ptr ctx = createContext(); + #ifdef USE_DAWN_API + Context ctx = createContextByGpuIdx(0); + auto adaptersList = listAdapters(ctx); + LOG(kDefLog, kInfo, "Available GPU adapters:\n%s", adaptersList.c_str()); + #else Context ctx = createContext(); + #endif static constexpr size_t N = 10000; std::array inputArr, outputArr; for (int i = 0; i < N; ++i) { diff --git a/gpu.hpp b/gpu.hpp index e050c87..906371c 100644 --- a/gpu.hpp +++ b/gpu.hpp @@ -16,9 +16,8 @@ #include // std::pair #include -#ifndef __EMSCRIPTEN__ -#else +#ifdef __EMSCRIPTEN__ #include "emscripten/emscripten.h" #endif @@ -255,6 +254,26 @@ inline std::string toString(const Shape &shape) { */ inline std::string toString(size_t value) { return std::to_string(value); } +/** + * @brief Converts a WGPUStringView to an std::string. + * + * If the view's data is null, an empty string is returned. If the view's + * length equals WGPU_STRLEN, it is assumed to be null‑terminated; otherwise, + * the explicit length is used. + * + * @param strView The WGPUStringView to convert. + * @return std::string The resulting standard string. + */ +inline std::string formatWGPUStringView(WGPUStringView strView) { + if (!strView.data) { + return ""; + } + if (strView.length == WGPU_STRLEN) { + return std::string(strView.data); + } + return std::string(strView.data, strView.length); +} + /** * @brief simple in-place string replacement helper function for substituting * placeholders in a WGSL string template. 
@@ -1076,136 +1095,191 @@ inline Context createContext(const WGPUInstanceDescriptor &desc = {}, return waitForContextFuture(contextFuture); } -#ifdef USE_DAWN_API +#ifndef __EMSCRIPTEN__ +#if USE_DAWN_API /** - * @brief Factory function to create a GPU context, which aggregates WebGPU API - * handles to interact with the GPU including the instance, adapter, device, and - * queue. + * @brief Retrieves the list of available GPU adapters from the Dawn instance. * - * The function takes gpu index to support for multi GPUs. - * To activate this function, it needs not only webgpu's headers but also DAWN's - * headers. + * This function creates a Dawn instance using the provided context's instance + * handle, then enumerates and returns the available GPU adapters as a vector. * - * If dawn is used, it also sets up an error callback for device loss. + * @param ctx The Context containing the WebGPU instance handle. + * @return std::vector A vector of available GPU + * adapters. + * + * @code + * std::vector adapters = getAdapters(ctx); + * @endcode + */ +inline std::vector getAdapters(Context &ctx) { + dawn::native::Instance dawnInstance( + reinterpret_cast(ctx.instance)); + return dawnInstance.EnumerateAdapters(); +} + +/** + * @brief Formats the given vector of Dawn adapters into a single concatenated string. * - * @param[in] gpuIdx GPU index - * @param[in] desc Instance descriptor for the WebGPU instance (optional) - * @param[in] devDescriptor Device descriptor for the WebGPU device (optional) - * @return Context instance representing the created GPU context + * This function iterates over each Dawn adapter in the provided vector, retrieves its + * description using the WebGPU API, and converts the description from a WGPUStringView + * to an std::string using the formatWGPUStringView helper. The resulting descriptions + * are concatenated into a single string separated by newline characters. * + * @param adapters A vector of Dawn adapters obtained from a WebGPU instance. + * @return std::string A newline-delimited string listing each adapter's description. + * * @code - * Context ctx = createContextByGpuIdx(1); + * std::string adapterList = formatAdapters(adapters); * @endcode */ -inline Context -createContextByGpuIdx(int gpuIdx, const WGPUInstanceDescriptor &desc = {}, - const WGPUDeviceDescriptor &devDescriptor = {}) { - Context context; - { -#ifdef __EMSCRIPTEN__ - // Emscripten does not support the instance descriptor - // and throws an assertion error if it is not nullptr. - context.instance = wgpuCreateInstance(nullptr); -#else - context.instance = wgpuCreateInstance(&desc); -#endif - // check status - check(context.instance, "Initialize WebGPU", __FILE__, __LINE__); +inline std::string formatAdapters(const std::vector &adapters) { + std::string adapterList; + for (size_t i = 0; i < adapters.size(); ++i) { + auto adapterPtr = adapters[i].Get(); + if (adapterPtr) { + WGPUAdapterInfo info = {}; + wgpuAdapterGetInfo(adapterPtr, &info); + std::string desc = formatWGPUStringView(info.description); + adapterList += "GPU Adapter [" + std::to_string(i) + "]: " + desc + "\n"; + wgpuAdapterInfoFreeMembers(info); + } } + return adapterList; +} - LOG(kDefLog, kInfo, "Requesting adapter"); - { - std::vector adapters = - dawn::native::Instance( - reinterpret_cast(context.instance)) - .EnumerateAdapters(); - LOG(kDefLog, kInfo, "The number of GPUs=%d\n", adapters.size()); - // Note: Second gpu is not available on Macos, but the number of GPUs is 2 - // on Macos. 
- // Calling wgpuAdapterGetInfo function for the second gpu becomes - // segfault. When you check all GPUs on linux, uncomment out following - // codes. - // - // for (size_t i = 0; i < adapters.size(); i++) { - // WGPUAdapterInfo info {}; - // auto ptr = adapters[i].Get(); - // if (ptr && adapters[i]) { - // wgpuAdapterGetInfo(ptr, &info); - // LOG(kDefLog, kInfo, "GPU(Adapter)[%d] = %s\n", i, info.description); - // wgpuAdapterInfoFreeMembers(info); - // } - // } - - { - LOG(kDefLog, kInfo, "Use GPU(Adapter)[%d]\n", gpuIdx); - auto ptr = adapters[gpuIdx].Get(); - if (ptr) { - WGPUAdapterInfo info{}; - wgpuAdapterGetInfo(ptr, &info); - LOG(kDefLog, kInfo, "GPU(Adapter)[%d] = %s\n", gpuIdx, - info.description); - wgpuAdapterInfoFreeMembers(info); - } - context.adapter = adapters[gpuIdx].Get(); - dawn::native::GetProcs().adapterAddRef(context.adapter); - } +/** + * @brief Lists the available GPU adapters in the current WebGPU instance. + * + * This function retrieves the list of available GPU adapters using the + * getAdapters helper function, then formats and returns the adapter + * descriptions as a single string using the formatAdapters helper function. + * + * @param ctx The Context containing the WebGPU instance handle. + * @return std::string A newline-delimited string listing each adapter's + * description. + * + * @code + * std::string adapterList = listAdapters(ctx); + * @endcode + */ +inline std::string listAdapters(Context &ctx) { + auto adapters = getAdapters(ctx); + return formatAdapters(adapters); +} + +/** + * @brief Asynchronously creates a GPU context using the specified GPU index. + * + * This function creates a WebGPU instance, retrieves the available GPU + * adapters, and selects the adapter at the specified index. It then requests a + * device from the selected adapter and sets up a logging callback for device + * errors. The function returns a future that will be fulfilled with the + * created Context once all operations are complete. + * + * @param gpuIdx The index of the GPU adapter to use. + * @param desc Instance descriptor for the WebGPU instance (optional) + * @param devDescriptor Device descriptor for the WebGPU device (optional) + * @return std::future A future that will eventually hold the created + * Context. 
+ * + * @code + * std::future contextFuture = createContextByGpuIdxAsync(0); + * Context ctx = waitForContextFuture(contextFuture); + * @endcode + */ +inline std::future +createContextByGpuIdxAsync(int gpuIdx, const WGPUInstanceDescriptor &desc = {}, + const WGPUDeviceDescriptor &devDescriptor = {}) { + auto promise = std::make_shared>(); + Context ctx; + + ctx.instance = wgpuCreateInstance(&desc); + + if (!ctx.instance) { + promise->set_exception(std::make_exception_ptr( + std::runtime_error("Failed to create WebGPU instance."))); + return promise->get_future(); } + check(ctx.instance, "Initialize WebGPU", __FILE__, __LINE__); - LOG(kDefLog, kInfo, "Requesting device"); - { - struct DeviceData { - WGPUDevice device = nullptr; - bool requestEnded = false; - }; - DeviceData devData; - - auto onDeviceRequestEnded = [](WGPURequestDeviceStatus status, - WGPUDevice device, WGPUStringView message, - void *pUserData, void *) { - DeviceData &devData = *reinterpret_cast(pUserData); - check(status == WGPURequestDeviceStatus_Success, - "Could not get WebGPU device.", __FILE__, __LINE__); - LOG(kDefLog, kTrace, "Device Request succeeded %x", - static_cast(device)); - devData.device = device; - devData.requestEnded = true; - }; + // Use helper functions to obtain and format the adapters. + auto adapters = getAdapters(ctx); - WGPURequestDeviceCallbackInfo deviceCallbackInfo = { - .mode = WGPUCallbackMode_AllowSpontaneous, - .callback = onDeviceRequestEnded, - .userdata1 = &devData, - .userdata2 = nullptr}; - wgpuAdapterRequestDevice(context.adapter, &devDescriptor, - deviceCallbackInfo); - - LOG(kDefLog, kInfo, "Waiting for device request to end"); - while (!devData.requestEnded) { - processEvents(context.instance); - } - LOG(kDefLog, kInfo, "Device request ended"); - assert(devData.requestEnded); - context.device = devData.device; - - WGPULoggingCallbackInfo loggingCallbackInfo = { - .nextInChain = nullptr, - .callback = - [](WGPULoggingType type, WGPUStringView message, void *userdata1, - void *userdata2) { - LOG(kDefLog, kError, "Device logging callback: %.*s", - static_cast(message.length), message.data); - if (type == WGPULoggingType_Error) { - throw std::runtime_error("Device error logged."); - } - }, - .userdata1 = nullptr, - .userdata2 = nullptr}; - wgpuDeviceSetLoggingCallback(context.device, loggingCallbackInfo); + if (gpuIdx >= adapters.size()) { + promise->set_exception( + std::make_exception_ptr(std::runtime_error("Invalid GPU index."))); + return promise->get_future(); + } + LOG(kDefLog, kInfo, "Using GPU Adapter[%d]", gpuIdx); + auto adapterPtr = adapters[gpuIdx].Get(); + if (adapterPtr) { + WGPUAdapterInfo info = {}; + wgpuAdapterGetInfo(adapterPtr, &info); + LOG(kDefLog, kInfo, "GPU(Adapter)[%d] = %s", gpuIdx, + formatWGPUStringView(info.description).c_str()); + wgpuAdapterInfoFreeMembers(info); + } + ctx.adapter = reinterpret_cast(adapterPtr); + dawn::native::GetProcs().adapterAddRef(ctx.adapter); + + LOG(kDefLog, kInfo, "Requesting device"); + // Request the device asynchronously (using our requestDeviceAsync helper). 
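On native Dawn builds these helpers are driven exactly as in the hello_world example; Emscripten builds have no adapter enumeration, so plain createContext remains the fallback. A minimal sketch:

```cpp
#include "gpu.hpp"
#include <cstdio>
#include <string>
using namespace gpu;

int main() {
#ifdef USE_DAWN_API
  // Pin the first adapter and print everything Dawn can see.
  Context ctx = createContextByGpuIdx(0);
  std::string adapters = listAdapters(ctx);
  printf("Available GPU adapters:\n%s", adapters.c_str());
#else
  // Emscripten / non-Dawn builds: default context selection.
  Context ctx = createContext();
#endif
  return 0;
}
```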
+ auto deviceFuture = requestDeviceAsync(ctx.adapter, devDescriptor); + try { + ctx.device = wait(ctx, deviceFuture); + ctx.deviceStatus = WGPURequestDeviceStatus_Success; + } catch (const std::exception &ex) { + promise->set_exception(std::make_exception_ptr(ex)); + return promise->get_future(); } - context.queue = wgpuDeviceGetQueue(context.device); - return context; + + WGPULoggingCallbackInfo loggingCallbackInfo{ + .nextInChain = nullptr, + .callback = + [](WGPULoggingType type, WGPUStringView message, void *userdata1, + void *userdata2) { + LOG(kDefLog, kError, "Device logging callback: %.*s", + static_cast(message.length), message.data); + if (type == WGPULoggingType_Error) { + throw std::runtime_error("Device error logged."); + } + }, + .userdata1 = nullptr, + .userdata2 = nullptr}; + wgpuDeviceSetLoggingCallback(ctx.device, loggingCallbackInfo); + ctx.queue = wgpuDeviceGetQueue(ctx.device); + promise->set_value(std::move(ctx)); + return promise->get_future(); } -#endif + +/** + * @brief Synchronously creates a GPU context using the specified GPU index. + * + * This function calls the asynchronous createContextByGpuIdxAsync function to + * create a GPU context, then waits for its completion using + * waitForContextFuture. The returned Context holds handles to the WebGPU + * instance, adapter, device, and queue, and is used for subsequent GPU + * operations. + * + * @param gpuIdx The index of the GPU adapter to use. + * @param desc Instance descriptor for the WebGPU instance (optional) + * @param devDescriptor Device descriptor for the WebGPU device (optional) + * @return Context The fully initialized GPU context. + * + * @code + * Context ctx = createContextByGpuIdx(0); + * @endcode + */ +inline Context createContextByGpuIdx(int gpuIdx, + const WGPUInstanceDescriptor &desc = {}, + const WGPUDeviceDescriptor &devDescriptor = {}) { + std::future contextFuture = + createContextByGpuIdxAsync(gpuIdx, desc, devDescriptor); + return waitForContextFuture(contextFuture); +} + +#endif // USE_DAWN_API +#endif // __EMSCRIPTEN__ /** * @brief Callback function invoked upon completion of an asynchronous GPU From 70d980287f9a7cca8889e166d67b802bc4b69319 Mon Sep 17 00:00:00 2001 From: MichealReed Date: Fri, 21 Feb 2025 22:14:41 -0600 Subject: [PATCH 18/30] tests toCPU, adds offset, adds gpuflow doc, default cmakelists builds test/test_gpu.cpp --- CMakeLists.txt | 18 +++++ docs/gpuflow.md | 78 +++++++++++++++++++ gpu.hpp | 110 ++++++++++++++++---------- test/test_gpu.cpp | 193 ++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 357 insertions(+), 42 deletions(-) create mode 100644 docs/gpuflow.md create mode 100644 test/test_gpu.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 816cdf3..a17602e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -23,6 +23,24 @@ endif() include("${CMAKE_CURRENT_SOURCE_DIR}/cmake/dawn.cmake") include("${CMAKE_CURRENT_SOURCE_DIR}/cmake/gpu.cmake") +target_link_libraries(gpu PRIVATE webgpu_dawn) + +include_directories(${CMAKE_CURRENT_SOURCE_DIR}/test) + +add_executable(test_gpu ${CMAKE_CURRENT_SOURCE_DIR}/test/test_gpu.cpp) +target_link_libraries(test_gpu PRIVATE gpu) + +# Platform-specific post-build actions (e.g. 
copying DLLs for MSVC) +if(MSVC) + add_custom_command( + TARGET test_gpu POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy + ${DAWN_BUILD_DIR}/$/webgpu_dawn.dll + $ + COMMENT "Copying webgpu_dawn.dll to the build directory" + ) +endif() + add_library(gpud SHARED gpu.hpp) set_target_properties(gpud PROPERTIES LINKER_LANGUAGE CXX) target_link_libraries(gpud PRIVATE gpu) diff --git a/docs/gpuflow.md b/docs/gpuflow.md new file mode 100644 index 0000000..d4eb37a --- /dev/null +++ b/docs/gpuflow.md @@ -0,0 +1,78 @@ +# GPU.cpp Lifecycle + +```mermaid +flowchart TD + %% Data Preparation & Upload + subgraph "Data Preparation & Upload" + A["CPU Data"] + B["Define Data Properties
(shape, type, size)"]
+        C["Create GPU Buffer<br>(allocate raw buffer)"]
+        D["Create Tensor<br>(allocates Array with one<br>or more buffers<br>and associates Shape)"]
+
+        E["Upload Data via toGPU<br>(raw buffer)<br>toGPU(ctx, data, buffer, size)"]
+        F["Upload Data via toGPU<br>(Tensor overload)<br>toGPU(ctx, data, tensor)"]
+        G["Optional: Upload Kernel Parameters<br>toGPU(ctx, params, Kernel)"]
+    end
+
+    %% Buffer Setup & Bindings
+    subgraph "Buffer & Binding Setup"
+        H["Define Bindings<br>(Bindings, TensorView)"]
+        I["Map GPU buffers<br>to shader bindings<br>(Collection from Tensor<br>or single buffers)"]
+    end
+
+    %% Kernel Setup & Execution
+    subgraph "Kernel Setup & Execution"
+        J["Define KernelCode<br>(WGSL template, workgroup size, precision)"]
+        K["Create Kernel"]
+        L["Dispatch Kernel"]
+    end
+
+    %% GPU Execution & Result Readback
+    subgraph "GPU Execution & Result Readback"
+        M["Kernel Execution<br>(GPU shader runs)"]
+        N["Readback Data<br>(toCPU variants)"]
+    end
+
+    %% Context & Resources
+    O["Context<br>(Device, Queue,
TensorPool, KernelPool)"] + + %% Flow Connections + A --> B + B --> C + B --> D + C --> E + D --> F + F --> H + E --> H + H --> I + I --> K + J --> K + G --- K + K --> L + L --> M + M --> N + + %% Context shared by all stages + O --- D + O --- E + O --- F + O --- K + O --- L + O --- N +``` + +Rank 0: Scalar +Rank 1: Vector +Rank 2: Matrix +Rank 3: 3D Tensor (or Cube) +Rank 4: 4D Tensor +Rank ..: Higher Dimensional Tensors + + +• The `gpu::Array` (which wraps a GPU buffer with usage and size) and the `gpu::Shape` (which defines dimensions and rank) are combined—via the creation process—to produce a `gpu::Tensor`. +• A `gpu::TensorView` provides a non‑owning view into a slice of a `gpu::Tensor`. Ex. `TensorView view = {tensor, 0, 256};` +• gpu::Bindings collect multiple Tensors (or TensorViews) along with view offset/size information for use in a kernel. +• The gpu::TensorPool (managed by the Context) is responsible for the lifetime of tensors and GPU resource cleanup. +• gpu::KernelCode contains the WGSL shader template plus metadata (workgroup size, precision, label, and entry point) that drive the kernel configuration. +• The gpu::createKernelAsync/gpu::createKernel functions (within the Execution Flow) use the gpu::Context, gpu::Bindings, and gpu::KernelCode to configure and construct a gpu::Kernel that manages all the underlying GPU resources (buffers, bind groups, compute pipeline, etc.). +• gpu::KernelCode’s workgroup size (a gpu::Shape) defines the dispatch configuration, and the gpu::Kernel eventually uses the underlying gpu::Array (contains WGPUBuffer, WGPUBufferUsage, size_t) and gpu::Shape data from the created Tensor. diff --git a/gpu.hpp b/gpu.hpp index 906371c..931d646 100644 --- a/gpu.hpp +++ b/gpu.hpp @@ -16,7 +16,6 @@ #include // std::pair #include - #ifdef __EMSCRIPTEN__ #include "emscripten/emscripten.h" #endif @@ -1106,7 +1105,7 @@ inline Context createContext(const WGPUInstanceDescriptor &desc = {}, * @param ctx The Context containing the WebGPU instance handle. * @return std::vector A vector of available GPU * adapters. - * + * * @code * std::vector adapters = getAdapters(ctx); * @endcode @@ -1118,21 +1117,25 @@ inline std::vector getAdapters(Context &ctx) { } /** - * @brief Formats the given vector of Dawn adapters into a single concatenated string. + * @brief Formats the given vector of Dawn adapters into a single concatenated + * string. * - * This function iterates over each Dawn adapter in the provided vector, retrieves its - * description using the WebGPU API, and converts the description from a WGPUStringView - * to an std::string using the formatWGPUStringView helper. The resulting descriptions - * are concatenated into a single string separated by newline characters. + * This function iterates over each Dawn adapter in the provided vector, + * retrieves its description using the WebGPU API, and converts the description + * from a WGPUStringView to an std::string using the formatWGPUStringView + * helper. The resulting descriptions are concatenated into a single string + * separated by newline characters. * * @param adapters A vector of Dawn adapters obtained from a WebGPU instance. - * @return std::string A newline-delimited string listing each adapter's description. - * + * @return std::string A newline-delimited string listing each adapter's + * description. 
+ * * @code * std::string adapterList = formatAdapters(adapters); * @endcode */ -inline std::string formatAdapters(const std::vector &adapters) { +inline std::string +formatAdapters(const std::vector &adapters) { std::string adapterList; for (size_t i = 0; i < adapters.size(); ++i) { auto adapterPtr = adapters[i].Get(); @@ -1157,7 +1160,7 @@ inline std::string formatAdapters(const std::vector &adap * @param ctx The Context containing the WebGPU instance handle. * @return std::string A newline-delimited string listing each adapter's * description. - * + * * @code * std::string adapterList = listAdapters(ctx); * @endcode @@ -1181,7 +1184,7 @@ inline std::string listAdapters(Context &ctx) { * @param devDescriptor Device descriptor for the WebGPU device (optional) * @return std::future A future that will eventually hold the created * Context. - * + * * @code * std::future contextFuture = createContextByGpuIdxAsync(0); * Context ctx = waitForContextFuture(contextFuture); @@ -1270,9 +1273,9 @@ createContextByGpuIdxAsync(int gpuIdx, const WGPUInstanceDescriptor &desc = {}, * Context ctx = createContextByGpuIdx(0); * @endcode */ -inline Context createContextByGpuIdx(int gpuIdx, - const WGPUInstanceDescriptor &desc = {}, - const WGPUDeviceDescriptor &devDescriptor = {}) { +inline Context +createContextByGpuIdx(int gpuIdx, const WGPUInstanceDescriptor &desc = {}, + const WGPUDeviceDescriptor &devDescriptor = {}) { std::future contextFuture = createContextByGpuIdxAsync(gpuIdx, desc, devDescriptor); return waitForContextFuture(contextFuture); @@ -1365,17 +1368,19 @@ inline void queueWorkDoneCallback(WGPUQueueWorkDoneStatus status, /** * @brief Copies data from a GPU buffer to CPU memory. * @param[in] ctx Context instance to manage the operation - * @param[in] tensor Tensor instance representing the GPU buffer to copy from * @param[out] data Pointer to the CPU memory to copy the data to * @param[in] bufferSize Size of the data buffer in bytes * @param[in] op StagingBuffer instance to manage the operation + * @param[in] sourceOffset Offset in the GPU buffer to start copying from. * * @code * toCPU(ctx, tensor, data, bufferSize); * @endcode */ -inline std::future toCPUAsync(Context &ctx, Tensor &tensor, void *data, - size_t bufferSize, CopyData &op) { + +// NOTE: I think this one is redundant? CopyData not used externally. +inline std::future toCPUAsync(Context &ctx, void *data, size_t bufferSize, + CopyData &op, size_t sourceOffset = 0) { // Submit the command buffer and release it. wgpuQueueSubmit(ctx.queue, 1, &op.commandBuffer); wgpuCommandBufferRelease(op.commandBuffer); @@ -1388,8 +1393,8 @@ inline std::future toCPUAsync(Context &ctx, Tensor &tensor, void *data, CallbackData *cbData = new CallbackData{ op.readbackBuffer, // The GPU buffer to be read back. bufferSize, - data, // CPU memory destination. - promise // The promise to be signaled. + data, // CPU memory destination. + promise, // The promise to be signaled. }; // Set up the work-done callback to initiate the buffer mapping. @@ -1402,6 +1407,11 @@ inline std::future toCPUAsync(Context &ctx, Tensor &tensor, void *data, // Begin the asynchronous chain by registering the queue work-done callback. wgpuQueueOnSubmittedWorkDone(ctx.queue, workDoneCallbackInfo); + // Release the readback buffer as it is no longer needed. 
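The new sourceOffset parameter is exercised by the tests added in this patch; in user code a partial readback looks roughly like the sketch below (the offset is in bytes, and the destination only needs to hold the requested slice):

```cpp
#include "gpu.hpp"
#include <vector>
using namespace gpu;

// Copies `count` floats starting at element `first` of a tensor to the CPU.
std::vector<float> readSlice(Context &ctx, Tensor &t, size_t first,
                             size_t count) {
  std::vector<float> host(count, 0.0f);
  size_t offsetBytes = first * sizeof(float);  // sourceOffset is in bytes
  toCPU(ctx, t, host.data(), count * sizeof(float), offsetBytes);
  return host;
}
```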
+ if (op.readbackBuffer) { + wgpuBufferRelease(op.readbackBuffer); + } + return promise->get_future(); } @@ -1417,11 +1427,13 @@ inline std::future toCPUAsync(Context &ctx, Tensor &tensor, void *data, * * @param[in] ctx Context instance to manage the operation * @param[in] tensor Tensor instance representing the GPU buffer to copy from - * @param[in] bufferSize Size of the data buffer in bytes + * @param[in] bufferSize Size to read in bytes as out data. * @param[out] data Pointer to the CPU memory to copy the data to + * @param[in] sourceOffset Offset in the GPU buffer to start copying from. */ inline std::future toCPUAsync(Context &ctx, Tensor &tensor, void *data, - size_t bufferSize) { + size_t bufferSize, + size_t sourceOffset = 0) { // Create a promise that will later be satisfied when the async copy // completes. auto promise = std::make_shared>(); @@ -1430,7 +1442,7 @@ inline std::future toCPUAsync(Context &ctx, Tensor &tensor, void *data, WGPUBufferDescriptor readbackBufferDescriptor = { .label = {.data = nullptr, .length = 0}, .usage = WGPUBufferUsage_CopyDst | WGPUBufferUsage_MapRead, - .size = bufferSize, + .size = bufferSize, // Size of the readback buffer. }; WGPUBuffer readbackBuffer = wgpuDeviceCreateBuffer(ctx.device, &readbackBufferDescriptor); @@ -1438,8 +1450,9 @@ inline std::future toCPUAsync(Context &ctx, Tensor &tensor, void *data, // Create a command encoder and record a copy from the tensor GPU buffer WGPUCommandEncoder commandEncoder = wgpuDeviceCreateCommandEncoder(ctx.device, nullptr); - wgpuCommandEncoderCopyBufferToBuffer(commandEncoder, tensor.data.buffer, 0, - readbackBuffer, 0, bufferSize); + wgpuCommandEncoderCopyBufferToBuffer(commandEncoder, tensor.data.buffer, + sourceOffset, readbackBuffer, 0, + bufferSize); // Finish recording by creating a command buffer and release the encoder. WGPUCommandBuffer commandBuffer = wgpuCommandEncoderFinish(commandEncoder, nullptr); @@ -1472,13 +1485,16 @@ inline std::future toCPUAsync(Context &ctx, Tensor &tensor, void *data, // queueWorkDoneCallback. wgpuQueueOnSubmittedWorkDone(ctx.queue, workDoneCallbackInfo); + if (readbackBuffer) { + wgpuBufferRelease(readbackBuffer); + } + return promise->get_future(); } inline std::future toCPUAsync(Context &ctx, WGPUBuffer buffer, void *data, - size_t size) { - // The size (in bytes) for the copy. - uint64_t bufferSize = size; + size_t bufferSize, + size_t sourceOffset = 0) { // Create an operation structure (here we reuse CopyData solely for its // members that we need to create a readback buffer and command buffer). @@ -1503,7 +1519,7 @@ inline std::future toCPUAsync(Context &ctx, WGPUBuffer buffer, void *data, { WGPUCommandEncoder commandEncoder = wgpuDeviceCreateCommandEncoder(ctx.device, nullptr); - wgpuCommandEncoderCopyBufferToBuffer(commandEncoder, buffer, 0, + wgpuCommandEncoderCopyBufferToBuffer(commandEncoder, buffer, sourceOffset, op.readbackBuffer, 0, bufferSize); op.commandBuffer = wgpuCommandEncoderFinish(commandEncoder, nullptr); wgpuCommandEncoderRelease(commandEncoder); @@ -1516,10 +1532,10 @@ inline std::future toCPUAsync(Context &ctx, WGPUBuffer buffer, void *data, // Allocate callback data CallbackData *cbData = new CallbackData{ - op.readbackBuffer, // The readback buffer created above. - static_cast(bufferSize), // Size of the copy. - data, // Destination CPU memory. - promise // Our promise to satisfy when done. + op.readbackBuffer, // The readback buffer created above. + bufferSize, // Size of the copy. + data, // Destination CPU memory. 
// Offset in the GPU buffer. + promise // Our promise to satisfy when done. }; // Set up the queue work-done callback info. @@ -1532,6 +1548,10 @@ inline std::future toCPUAsync(Context &ctx, WGPUBuffer buffer, void *data, // Start the asynchronous chain by registering the work-done callback. wgpuQueueOnSubmittedWorkDone(ctx.queue, workDoneCallbackInfo); + if (op.readbackBuffer) { + wgpuBufferRelease(op.readbackBuffer); + } + return promise->get_future(); } @@ -1548,9 +1568,11 @@ inline std::future toCPUAsync(Context &ctx, WGPUBuffer buffer, void *data, * @endcode */ template -inline std::future toCPUAsync(Context &ctx, Tensor &tensor, - std::array &data) { - return toCPUAsync(ctx, tensor, data.data(), sizeof(data)); +inline std::future +toCPUAsync(Context &ctx, Tensor &tensor, std::array &data, + size_t sourceOffset = 0) { + return toCPUAsync(ctx, tensor, data.data(), sizeof(data), sourceOffset + ); } /** @@ -1571,8 +1593,10 @@ inline std::future toCPUAsync(Context &ctx, Tensor &tensor, * toCPU(ctx, tensor, data, bufferSize, instance); * @endcode */ -inline void toCPU(Context &ctx, Tensor &tensor, void *data, size_t bufferSize) { - auto future = toCPUAsync(ctx, tensor, data, bufferSize); +inline void toCPU(Context &ctx, Tensor &tensor, void *data, size_t bufferSize, + size_t sourceOffset = 0) { + auto future = + toCPUAsync(ctx, tensor, data, bufferSize, sourceOffset); wait(ctx, future); } @@ -1593,8 +1617,9 @@ inline void toCPU(Context &ctx, Tensor &tensor, void *data, size_t bufferSize) { * toCPU(ctx, buffer, data, size, instance); * @endcode */ -inline void toCPU(Context &ctx, WGPUBuffer buffer, void *data, size_t size) { - auto future = toCPUAsync(ctx, buffer, data, size); +inline void toCPU(Context &ctx, WGPUBuffer buffer, void *data, size_t size, + size_t sourceOffset = 0) { + auto future = toCPUAsync(ctx, buffer, data, size, sourceOffset); wait(ctx, future); } @@ -1616,8 +1641,9 @@ inline void toCPU(Context &ctx, WGPUBuffer buffer, void *data, size_t size) { * @endcode */ template -inline void toCPU(Context &ctx, Tensor &tensor, std::array &data) { - auto future = toCPUAsync(ctx, tensor, data); +inline void toCPU(Context &ctx, Tensor &tensor, std::array &data, + size_t sourceOffset = 0) { + auto future = toCPUAsync(ctx, tensor, data, sourceOffset); wait(ctx, future); } diff --git a/test/test_gpu.cpp b/test/test_gpu.cpp new file mode 100644 index 0000000..2cc4290 --- /dev/null +++ b/test/test_gpu.cpp @@ -0,0 +1,193 @@ +//// filepath: /d:/Code/git/forks/gpu.cpp/test/test_gpu_integration.cpp +#include "gpu.hpp" +#include +#include +#include +#include +#include +#include + +using namespace gpu; + +// A simple WGSL copy kernel that copies input to output. +static const char *kCopyKernel = R"( +@group(0) @binding(0) var inp: array<{{precision}}>; +@group(0) @binding(1) var out: array<{{precision}}>; +@group(0) @binding(1) var dummy: array<{{precision}}>; +@compute @workgroup_size({{workgroupSize}}) +fn main(@builtin(global_invocation_id) gid: vec3) { + let i: u32 = gid.x; + if (i < arrayLength(&inp)) { + out[i] = inp[i]; + } +} +)"; + +// Test using the overload that takes a Tensor. +void testToCPUWithTensor() { + LOG(kDefLog, kInfo, "Running testToCPUWithTensor..."); + + // Create a real GPU context. 
+ #ifdef USE_DAWN_API + Context ctx = createContextByGpuIdx(0); + #else + Context ctx = createContext(); + #endif + + constexpr size_t N = 1024; + std::array inputData, outputData; + for (size_t i = 0; i < N; ++i) { + inputData[i] = static_cast(i); + outputData[i] = 0.0f; + } + + // Create input and output tensors. + Tensor inputTensor = createTensor(ctx, Shape{N}, kf32, inputData.data()); + Tensor outputTensor = createTensor(ctx, Shape{N}, kf32); + + // Create and dispatch the copy kernel. + Kernel copyKernel = createKernel(ctx, {kCopyKernel, 256, kf32}, + Bindings{inputTensor, outputTensor}, + {cdiv(N, 256), 1, 1}); + dispatchKernel(ctx, copyKernel); + + // Synchronously copy GPU output to CPU using the tensor overload. + toCPU(ctx, outputTensor, outputData.data(), sizeof(outputData)); + + // Verify the output matches the input. + for (size_t i = 0; i < N; ++i) { + LOG(kDefLog, kInfo, "inputData[%zu] = %f", i, inputData[i]); + LOG(kDefLog, kInfo, "outputData[%zu] = %f", i, outputData[i]); + assert(outputData[i] == inputData[i]); + } + LOG(kDefLog, kInfo, "testToCPUWithTensor passed."); +} + +// Test using the overload that takes a raw GPU buffer. +// We reuse the Tensor's underlying buffer for this test. +void testToCPUWithBuffer() { + LOG(kDefLog, kInfo, "Running testToCPUWithBuffer..."); + + #ifdef USE_DAWN_API + Context ctx = createContextByGpuIdx(0); + #else + Context ctx = createContext(); + #endif + + constexpr size_t N = 1024; + std::array data, outputData; + for (size_t i = 0; i < N; ++i) { + data[i] = static_cast(i * 2); + outputData[i] = 0.0f; + } + + // Create a tensor to allocate a GPU buffer and initialize it. + Tensor tensor = createTensor(ctx, Shape{N}, kf32, data.data()); + + // Now extract the raw GPU buffer from the tensor. + WGPUBuffer gpuBuffer = tensor.data.buffer; + + // Use the WGPUBuffer overload. This call returns a future. + auto future = toCPUAsync(ctx, gpuBuffer, outputData.data(), sizeof(outputData), 0); + wait(ctx, future); + + // Verify that the CPU output matches the original data. + for (size_t i = 0; i < N; ++i) { + LOG(kDefLog, kInfo, "outputData[%zu] = %f", i, outputData[i]); + assert(outputData[i] == data[i]); + } + LOG(kDefLog, kInfo, "testToCPUWithBuffer passed."); +} + +void testToCPUWithTensorSourceOffset() { + LOG(kDefLog, kInfo, "Running testToCPUWithTensorSourceOffset..."); +#ifdef USE_DAWN_API + Context ctx = createContextByGpuIdx(0); +#else + Context ctx = createContext(); +#endif + + constexpr size_t numElements = 25; + constexpr size_t sourceOffsetElements = 5; // Skip first 5 elements + constexpr size_t copyCount = 10; // Number of floats to copy + size_t copySize = copyCount * sizeof(float); + + // Create an input array with known data. + std::array inputData{}; + for (size_t i = 0; i < numElements; ++i) { + inputData[i] = static_cast(i + 50); // Arbitrary values + } + // Create a tensor from the full data. + Tensor tensor = createTensor(ctx, Shape{numElements}, kf32, inputData.data()); + + // Allocate a destination CPU buffer exactly as large as the data we want to copy. + std::vector cpuOutput(copyCount, -1.0f); + + // Set sourceOffset to skip the first few float elements + size_t sourceOffsetBytes = sourceOffsetElements * sizeof(float); + // Call the tensor overload with sourceOffset and destOffset = 0. + auto future = toCPUAsync(ctx, tensor, cpuOutput.data(), copySize, sourceOffsetBytes); + wait(ctx, future); + + // Verify the copied data matches the expected subset. 
+ for (size_t i = 0; i < copyCount; ++i) { + float expected = inputData[sourceOffsetElements + i]; + float actual = cpuOutput[i]; + LOG(kDefLog, kInfo, "cpuOutput[%zu] = %f", i, actual); + LOG(kDefLog, kInfo, "expected[%zu] = %f", i, expected); + assert(expected == actual); + } + LOG(kDefLog, kInfo, "testToCPUWithTensorSourceOffset passed."); +} + +void testToCPUWithBufferSourceOffset() { + LOG(kDefLog, kInfo, "Running testToCPUWithBufferSourceOffset..."); +#ifdef USE_DAWN_API + Context ctx = createContextByGpuIdx(0); +#else + Context ctx = createContext(); +#endif + + constexpr size_t numElements = 30; + constexpr size_t sourceOffsetElements = 7; // Skip first 7 elements + constexpr size_t copyCount = 12; // Number of floats to copy + size_t copySize = copyCount * sizeof(float); + + // Create an input array with arbitrary data. + std::array inputData{}; + for (size_t i = 0; i < numElements; ++i) { + inputData[i] = static_cast(i + 100); + } + // Create a tensor to initialize a GPU buffer. + Tensor tensor = createTensor(ctx, Shape{numElements}, kf32, inputData.data()); + // Extract the raw GPU buffer from the tensor. + WGPUBuffer buffer = tensor.data.buffer; + + // Allocate a destination CPU buffer exactly as large as needed. + std::vector cpuOutput(copyCount, -2.0f); + size_t sourceOffsetBytes = sourceOffsetElements * sizeof(float); + + // Call the buffer overload with sourceOffset and destOffset = 0. + auto future = toCPUAsync(ctx, buffer, cpuOutput.data(), copySize, sourceOffsetBytes); + wait(ctx, future); + + // Verify that the copied data matches the expected subset. + for (size_t i = 0; i < copyCount; ++i) { + float expected = inputData[sourceOffsetElements + i]; + float actual = cpuOutput[i]; + LOG(kDefLog, kInfo, "cpuOutput[%zu] = %f", i, actual); + LOG(kDefLog, kInfo, "expected[%zu] = %f", i, expected); + assert(expected == actual); + } + LOG(kDefLog, kInfo, "testToCPUWithBufferSourceOffset passed."); +} + +int main() { + LOG(kDefLog, kInfo, "Running GPU integration tests..."); + testToCPUWithTensor(); + testToCPUWithBuffer(); + testToCPUWithTensorSourceOffset(); + testToCPUWithBufferSourceOffset(); + LOG(kDefLog, kInfo, "All tests passed."); + return 0; +} \ No newline at end of file From 16feb9e9f32e8cc2bbc12019a448c856a061d19f Mon Sep 17 00:00:00 2001 From: MichealReed Date: Fri, 21 Feb 2025 22:16:44 -0600 Subject: [PATCH 19/30] remove path --- test/test_gpu.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/test/test_gpu.cpp b/test/test_gpu.cpp index 2cc4290..0954e44 100644 --- a/test/test_gpu.cpp +++ b/test/test_gpu.cpp @@ -1,4 +1,3 @@ -//// filepath: /d:/Code/git/forks/gpu.cpp/test/test_gpu_integration.cpp #include "gpu.hpp" #include #include From e61e80917a73406e8fb8af5a94c743982231f51b Mon Sep 17 00:00:00 2001 From: MichealReed Date: Fri, 21 Feb 2025 22:18:53 -0600 Subject: [PATCH 20/30] format --- test/test_gpu.cpp | 294 +++++++++++++++++++++++----------------------- 1 file changed, 149 insertions(+), 145 deletions(-) diff --git a/test/test_gpu.cpp b/test/test_gpu.cpp index 0954e44..48aa1bc 100644 --- a/test/test_gpu.cpp +++ b/test/test_gpu.cpp @@ -1,7 +1,7 @@ #include "gpu.hpp" #include -#include #include +#include #include #include #include @@ -24,169 +24,173 @@ fn main(@builtin(global_invocation_id) gid: vec3) { // Test using the overload that takes a Tensor. void testToCPUWithTensor() { - LOG(kDefLog, kInfo, "Running testToCPUWithTensor..."); - - // Create a real GPU context. 
- #ifdef USE_DAWN_API - Context ctx = createContextByGpuIdx(0); - #else - Context ctx = createContext(); - #endif - - constexpr size_t N = 1024; - std::array inputData, outputData; - for (size_t i = 0; i < N; ++i) { - inputData[i] = static_cast(i); - outputData[i] = 0.0f; - } - - // Create input and output tensors. - Tensor inputTensor = createTensor(ctx, Shape{N}, kf32, inputData.data()); - Tensor outputTensor = createTensor(ctx, Shape{N}, kf32); - - // Create and dispatch the copy kernel. - Kernel copyKernel = createKernel(ctx, {kCopyKernel, 256, kf32}, - Bindings{inputTensor, outputTensor}, - {cdiv(N, 256), 1, 1}); - dispatchKernel(ctx, copyKernel); - - // Synchronously copy GPU output to CPU using the tensor overload. - toCPU(ctx, outputTensor, outputData.data(), sizeof(outputData)); - - // Verify the output matches the input. - for (size_t i = 0; i < N; ++i) { - LOG(kDefLog, kInfo, "inputData[%zu] = %f", i, inputData[i]); - LOG(kDefLog, kInfo, "outputData[%zu] = %f", i, outputData[i]); - assert(outputData[i] == inputData[i]); - } - LOG(kDefLog, kInfo, "testToCPUWithTensor passed."); + LOG(kDefLog, kInfo, "Running testToCPUWithTensor..."); + +// Create a real GPU context. +#ifdef USE_DAWN_API + Context ctx = createContextByGpuIdx(0); +#else + Context ctx = createContext(); +#endif + + constexpr size_t N = 1024; + std::array inputData, outputData; + for (size_t i = 0; i < N; ++i) { + inputData[i] = static_cast(i); + outputData[i] = 0.0f; + } + + // Create input and output tensors. + Tensor inputTensor = createTensor(ctx, Shape{N}, kf32, inputData.data()); + Tensor outputTensor = createTensor(ctx, Shape{N}, kf32); + + // Create and dispatch the copy kernel. + Kernel copyKernel = + createKernel(ctx, {kCopyKernel, 256, kf32}, + Bindings{inputTensor, outputTensor}, {cdiv(N, 256), 1, 1}); + dispatchKernel(ctx, copyKernel); + + // Synchronously copy GPU output to CPU using the tensor overload. + toCPU(ctx, outputTensor, outputData.data(), sizeof(outputData)); + + // Verify the output matches the input. + for (size_t i = 0; i < N; ++i) { + LOG(kDefLog, kInfo, "inputData[%zu] = %f", i, inputData[i]); + LOG(kDefLog, kInfo, "outputData[%zu] = %f", i, outputData[i]); + assert(outputData[i] == inputData[i]); + } + LOG(kDefLog, kInfo, "testToCPUWithTensor passed."); } // Test using the overload that takes a raw GPU buffer. // We reuse the Tensor's underlying buffer for this test. void testToCPUWithBuffer() { - LOG(kDefLog, kInfo, "Running testToCPUWithBuffer..."); - - #ifdef USE_DAWN_API - Context ctx = createContextByGpuIdx(0); - #else - Context ctx = createContext(); - #endif - - constexpr size_t N = 1024; - std::array data, outputData; - for (size_t i = 0; i < N; ++i) { - data[i] = static_cast(i * 2); - outputData[i] = 0.0f; - } - - // Create a tensor to allocate a GPU buffer and initialize it. - Tensor tensor = createTensor(ctx, Shape{N}, kf32, data.data()); - - // Now extract the raw GPU buffer from the tensor. - WGPUBuffer gpuBuffer = tensor.data.buffer; - - // Use the WGPUBuffer overload. This call returns a future. - auto future = toCPUAsync(ctx, gpuBuffer, outputData.data(), sizeof(outputData), 0); - wait(ctx, future); - - // Verify that the CPU output matches the original data. 
- for (size_t i = 0; i < N; ++i) { - LOG(kDefLog, kInfo, "outputData[%zu] = %f", i, outputData[i]); - assert(outputData[i] == data[i]); - } - LOG(kDefLog, kInfo, "testToCPUWithBuffer passed."); + LOG(kDefLog, kInfo, "Running testToCPUWithBuffer..."); + +#ifdef USE_DAWN_API + Context ctx = createContextByGpuIdx(0); +#else + Context ctx = createContext(); +#endif + + constexpr size_t N = 1024; + std::array data, outputData; + for (size_t i = 0; i < N; ++i) { + data[i] = static_cast(i * 2); + outputData[i] = 0.0f; + } + + // Create a tensor to allocate a GPU buffer and initialize it. + Tensor tensor = createTensor(ctx, Shape{N}, kf32, data.data()); + + // Now extract the raw GPU buffer from the tensor. + WGPUBuffer gpuBuffer = tensor.data.buffer; + + // Use the WGPUBuffer overload. This call returns a future. + auto future = + toCPUAsync(ctx, gpuBuffer, outputData.data(), sizeof(outputData), 0); + wait(ctx, future); + + // Verify that the CPU output matches the original data. + for (size_t i = 0; i < N; ++i) { + LOG(kDefLog, kInfo, "outputData[%zu] = %f", i, outputData[i]); + assert(outputData[i] == data[i]); + } + LOG(kDefLog, kInfo, "testToCPUWithBuffer passed."); } void testToCPUWithTensorSourceOffset() { - LOG(kDefLog, kInfo, "Running testToCPUWithTensorSourceOffset..."); + LOG(kDefLog, kInfo, "Running testToCPUWithTensorSourceOffset..."); #ifdef USE_DAWN_API - Context ctx = createContextByGpuIdx(0); + Context ctx = createContextByGpuIdx(0); #else - Context ctx = createContext(); + Context ctx = createContext(); #endif - constexpr size_t numElements = 25; - constexpr size_t sourceOffsetElements = 5; // Skip first 5 elements - constexpr size_t copyCount = 10; // Number of floats to copy - size_t copySize = copyCount * sizeof(float); - - // Create an input array with known data. - std::array inputData{}; - for (size_t i = 0; i < numElements; ++i) { - inputData[i] = static_cast(i + 50); // Arbitrary values - } - // Create a tensor from the full data. - Tensor tensor = createTensor(ctx, Shape{numElements}, kf32, inputData.data()); - - // Allocate a destination CPU buffer exactly as large as the data we want to copy. - std::vector cpuOutput(copyCount, -1.0f); - - // Set sourceOffset to skip the first few float elements - size_t sourceOffsetBytes = sourceOffsetElements * sizeof(float); - // Call the tensor overload with sourceOffset and destOffset = 0. - auto future = toCPUAsync(ctx, tensor, cpuOutput.data(), copySize, sourceOffsetBytes); - wait(ctx, future); - - // Verify the copied data matches the expected subset. - for (size_t i = 0; i < copyCount; ++i) { - float expected = inputData[sourceOffsetElements + i]; - float actual = cpuOutput[i]; - LOG(kDefLog, kInfo, "cpuOutput[%zu] = %f", i, actual); - LOG(kDefLog, kInfo, "expected[%zu] = %f", i, expected); - assert(expected == actual); - } - LOG(kDefLog, kInfo, "testToCPUWithTensorSourceOffset passed."); + constexpr size_t numElements = 25; + constexpr size_t sourceOffsetElements = 5; // Skip first 5 elements + constexpr size_t copyCount = 10; // Number of floats to copy + size_t copySize = copyCount * sizeof(float); + + // Create an input array with known data. + std::array inputData{}; + for (size_t i = 0; i < numElements; ++i) { + inputData[i] = static_cast(i + 50); // Arbitrary values + } + // Create a tensor from the full data. + Tensor tensor = createTensor(ctx, Shape{numElements}, kf32, inputData.data()); + + // Allocate a destination CPU buffer exactly as large as the data we want to + // copy. 
+ std::vector cpuOutput(copyCount, -1.0f); + + // Set sourceOffset to skip the first few float elements + size_t sourceOffsetBytes = sourceOffsetElements * sizeof(float); + // Call the tensor overload with sourceOffset and destOffset = 0. + auto future = + toCPUAsync(ctx, tensor, cpuOutput.data(), copySize, sourceOffsetBytes); + wait(ctx, future); + + // Verify the copied data matches the expected subset. + for (size_t i = 0; i < copyCount; ++i) { + float expected = inputData[sourceOffsetElements + i]; + float actual = cpuOutput[i]; + LOG(kDefLog, kInfo, "cpuOutput[%zu] = %f", i, actual); + LOG(kDefLog, kInfo, "expected[%zu] = %f", i, expected); + assert(expected == actual); + } + LOG(kDefLog, kInfo, "testToCPUWithTensorSourceOffset passed."); } void testToCPUWithBufferSourceOffset() { - LOG(kDefLog, kInfo, "Running testToCPUWithBufferSourceOffset..."); + LOG(kDefLog, kInfo, "Running testToCPUWithBufferSourceOffset..."); #ifdef USE_DAWN_API - Context ctx = createContextByGpuIdx(0); + Context ctx = createContextByGpuIdx(0); #else - Context ctx = createContext(); + Context ctx = createContext(); #endif - constexpr size_t numElements = 30; - constexpr size_t sourceOffsetElements = 7; // Skip first 7 elements - constexpr size_t copyCount = 12; // Number of floats to copy - size_t copySize = copyCount * sizeof(float); - - // Create an input array with arbitrary data. - std::array inputData{}; - for (size_t i = 0; i < numElements; ++i) { - inputData[i] = static_cast(i + 100); - } - // Create a tensor to initialize a GPU buffer. - Tensor tensor = createTensor(ctx, Shape{numElements}, kf32, inputData.data()); - // Extract the raw GPU buffer from the tensor. - WGPUBuffer buffer = tensor.data.buffer; - - // Allocate a destination CPU buffer exactly as large as needed. - std::vector cpuOutput(copyCount, -2.0f); - size_t sourceOffsetBytes = sourceOffsetElements * sizeof(float); - - // Call the buffer overload with sourceOffset and destOffset = 0. - auto future = toCPUAsync(ctx, buffer, cpuOutput.data(), copySize, sourceOffsetBytes); - wait(ctx, future); - - // Verify that the copied data matches the expected subset. - for (size_t i = 0; i < copyCount; ++i) { - float expected = inputData[sourceOffsetElements + i]; - float actual = cpuOutput[i]; - LOG(kDefLog, kInfo, "cpuOutput[%zu] = %f", i, actual); - LOG(kDefLog, kInfo, "expected[%zu] = %f", i, expected); - assert(expected == actual); - } - LOG(kDefLog, kInfo, "testToCPUWithBufferSourceOffset passed."); + constexpr size_t numElements = 30; + constexpr size_t sourceOffsetElements = 7; // Skip first 7 elements + constexpr size_t copyCount = 12; // Number of floats to copy + size_t copySize = copyCount * sizeof(float); + + // Create an input array with arbitrary data. + std::array inputData{}; + for (size_t i = 0; i < numElements; ++i) { + inputData[i] = static_cast(i + 100); + } + // Create a tensor to initialize a GPU buffer. + Tensor tensor = createTensor(ctx, Shape{numElements}, kf32, inputData.data()); + // Extract the raw GPU buffer from the tensor. + WGPUBuffer buffer = tensor.data.buffer; + + // Allocate a destination CPU buffer exactly as large as needed. + std::vector cpuOutput(copyCount, -2.0f); + size_t sourceOffsetBytes = sourceOffsetElements * sizeof(float); + + // Call the buffer overload with sourceOffset and destOffset = 0. + auto future = + toCPUAsync(ctx, buffer, cpuOutput.data(), copySize, sourceOffsetBytes); + wait(ctx, future); + + // Verify that the copied data matches the expected subset. 
+ for (size_t i = 0; i < copyCount; ++i) { + float expected = inputData[sourceOffsetElements + i]; + float actual = cpuOutput[i]; + LOG(kDefLog, kInfo, "cpuOutput[%zu] = %f", i, actual); + LOG(kDefLog, kInfo, "expected[%zu] = %f", i, expected); + assert(expected == actual); + } + LOG(kDefLog, kInfo, "testToCPUWithBufferSourceOffset passed."); } int main() { - LOG(kDefLog, kInfo, "Running GPU integration tests..."); - testToCPUWithTensor(); - testToCPUWithBuffer(); - testToCPUWithTensorSourceOffset(); - testToCPUWithBufferSourceOffset(); - LOG(kDefLog, kInfo, "All tests passed."); - return 0; -} \ No newline at end of file + LOG(kDefLog, kInfo, "Running GPU integration tests..."); + testToCPUWithTensor(); + testToCPUWithBuffer(); + testToCPUWithTensorSourceOffset(); + testToCPUWithBufferSourceOffset(); + LOG(kDefLog, kInfo, "All tests passed."); + return 0; +} From ad8698dc1cb10ac89f020e6920d680328a6200ae Mon Sep 17 00:00:00 2001 From: MichealReed Date: Fri, 21 Feb 2025 22:22:25 -0600 Subject: [PATCH 21/30] doc formatting --- docs/gpuflow.md | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/docs/gpuflow.md b/docs/gpuflow.md index d4eb37a..420397d 100644 --- a/docs/gpuflow.md +++ b/docs/gpuflow.md @@ -61,18 +61,18 @@ flowchart TD O --- N ``` +• The `gpu::Array` (which wraps a GPU buffer with usage and size) and the `gpu::Shape` (which defines dimensions and rank) are combined—via the creation process—to produce a `gpu::Tensor`. +• A `gpu::TensorView` provides a non‑owning view into a slice of a `gpu::Tensor`. Ex. `TensorView view = {tensor, 0, 256};` +• `gpu::Bindings` collect multiple Tensors (or TensorViews) along with view offset/size information for use in a kernel. +• The `gpu::TensorPool` (managed by the Context) is responsible for the lifetime of tensors and GPU resource cleanup. +• `gpu::KernelCode` contains the WGSL shader template plus metadata (workgroup size, precision, label, and entry point) that drive the kernel configuration. +• The `gpu::createKernelAsync/gpu::createKernel` functions (within the Execution Flow) use the `gpu::Context`, `gpu::Bindings`, and `gpu::KernelCode` to configure and construct a `gpu::Kernel` that manages all the underlying GPU resources (buffers, bind groups, compute pipeline, etc.). +• `gpu::KernelCode`’s workgroup size (a `gpu::Shape`) defines the dispatch configuration, and the `gpu::Kernel` eventually uses the underlying `gpu::Array` (contains` WGPUBuffer, WGPUBufferUsage, size_t`) and `gpu::Shape` data from the created Tensor. + +`gpu::Tensor` Ranks: Rank 0: Scalar Rank 1: Vector Rank 2: Matrix Rank 3: 3D Tensor (or Cube) Rank 4: 4D Tensor -Rank ..: Higher Dimensional Tensors - - -• The `gpu::Array` (which wraps a GPU buffer with usage and size) and the `gpu::Shape` (which defines dimensions and rank) are combined—via the creation process—to produce a `gpu::Tensor`. -• A `gpu::TensorView` provides a non‑owning view into a slice of a `gpu::Tensor`. Ex. `TensorView view = {tensor, 0, 256};` -• gpu::Bindings collect multiple Tensors (or TensorViews) along with view offset/size information for use in a kernel. -• The gpu::TensorPool (managed by the Context) is responsible for the lifetime of tensors and GPU resource cleanup. -• gpu::KernelCode contains the WGSL shader template plus metadata (workgroup size, precision, label, and entry point) that drive the kernel configuration. 
-• The gpu::createKernelAsync/gpu::createKernel functions (within the Execution Flow) use the gpu::Context, gpu::Bindings, and gpu::KernelCode to configure and construct a gpu::Kernel that manages all the underlying GPU resources (buffers, bind groups, compute pipeline, etc.). -• gpu::KernelCode’s workgroup size (a gpu::Shape) defines the dispatch configuration, and the gpu::Kernel eventually uses the underlying gpu::Array (contains WGPUBuffer, WGPUBufferUsage, size_t) and gpu::Shape data from the created Tensor. +Rank (max 8): Higher Dimensional Tensors \ No newline at end of file From 025af2a8f4621ba9612354a6e524044da2188ac3 Mon Sep 17 00:00:00 2001 From: MichealReed Date: Fri, 21 Feb 2025 22:29:25 -0600 Subject: [PATCH 22/30] doc nits --- docs/gpuflow.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/gpuflow.md b/docs/gpuflow.md index 420397d..fee9d4c 100644 --- a/docs/gpuflow.md +++ b/docs/gpuflow.md @@ -11,7 +11,7 @@ flowchart TD E["Upload Data via toGPU
(raw buffer)<br> toGPU<br> (ctx, data, buffer, size)"]
F["Upload Data via toGPU<br> (Tensor overload)<br> toGPU(ctx, data, tensor)"]
- G["Optional: Upload Kernel Parameters<br> toGPU(ctx, params, Kernel)"]
+ G["Optional:<br> Kernel Parameters<br>
toGPU(ctx, params, Kernel)"] end %% Buffer Setup & Bindings From 3776dcd50152ba4fc18ca9029006bd9e9588dca7 Mon Sep 17 00:00:00 2001 From: MichealReed Date: Sat, 22 Feb 2025 11:33:30 -0600 Subject: [PATCH 23/30] set project root on root cmakelists --- CMakeLists.txt | 2 +- cmake/gpu.cmake | 1 - docs/gpuflow.md | 2 +- 3 files changed, 2 insertions(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index a17602e..85911a7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -2,7 +2,7 @@ # and cmake/gpu.cmake for more details cmake_minimum_required(VERSION 3.28) project(gpu) - +set(PROJECT_ROOT "${CMAKE_CURRENT_SOURCE_DIR}") set(CMAKE_EXPORT_COMPILE_COMMANDS ON) # export compile_commands.json to use with # LSP set(CMAKE_CXX_STANDARD 20) diff --git a/cmake/gpu.cmake b/cmake/gpu.cmake index f936991..d991a18 100644 --- a/cmake/gpu.cmake +++ b/cmake/gpu.cmake @@ -39,4 +39,3 @@ else() target_include_directories(gpu PUBLIC "${DAWN_BUILD_DIR}/gen/src/emdawnwebgpu/include/") target_include_directories(gpu PUBLIC "${DAWN_BUILD_DIR}/gen/src/emdawnwebgpu/include/webgpu/") endif() - diff --git a/docs/gpuflow.md b/docs/gpuflow.md index fee9d4c..d13a228 100644 --- a/docs/gpuflow.md +++ b/docs/gpuflow.md @@ -75,4 +75,4 @@ Rank 1: Vector Rank 2: Matrix Rank 3: 3D Tensor (or Cube) Rank 4: 4D Tensor -Rank (max 8): Higher Dimensional Tensors \ No newline at end of file +Rank (max 8): Higher Dimensional Tensors From d58e1911b5e015ea073c4e4350d3378a2edf80bd Mon Sep 17 00:00:00 2001 From: MichealReed Date: Sat, 22 Feb 2025 16:15:51 -0600 Subject: [PATCH 24/30] fix linux issue with callback info --- gpu.hpp | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/gpu.hpp b/gpu.hpp index 931d646..8c661bc 100644 --- a/gpu.hpp +++ b/gpu.hpp @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -1354,11 +1355,12 @@ inline void queueWorkDoneCallback(WGPUQueueWorkDoneStatus status, __LINE__); // Set up the buffer mapping callback information. - WGPUBufferMapCallbackInfo mapCallbackInfo; - mapCallbackInfo.mode = WGPUCallbackMode_AllowSpontaneous; - mapCallbackInfo.callback = bufferMapCallback; - mapCallbackInfo.userdata1 = cbData; - mapCallbackInfo.userdata2 = nullptr; + WGPUBufferMapCallbackInfo mapCallbackInfo = { + .mode = WGPUCallbackMode_AllowSpontaneous, + .callback = bufferMapCallback, + .userdata1 = cbData, // Pass the callback data. + .userdata2 = nullptr // No additional user data. + }; // Begin the asynchronous mapping of the readback buffer. wgpuBufferMapAsync(cbData->buffer, WGPUMapMode_Read, 0, cbData->bufferSize, From 498ba74b73962d8b647b844fc570cf758ebaf467 Mon Sep 17 00:00:00 2001 From: MichealReed Date: Sat, 22 Feb 2025 17:54:16 -0600 Subject: [PATCH 25/30] should not release readback buffer --- gpu.hpp | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/gpu.hpp b/gpu.hpp index 8c661bc..4854338 100644 --- a/gpu.hpp +++ b/gpu.hpp @@ -1486,11 +1486,7 @@ inline std::future toCPUAsync(Context &ctx, Tensor &tensor, void *data, // Register the callback. The async chain continues inside // queueWorkDoneCallback. wgpuQueueOnSubmittedWorkDone(ctx.queue, workDoneCallbackInfo); - - if (readbackBuffer) { - wgpuBufferRelease(readbackBuffer); - } - + return promise->get_future(); } @@ -1550,10 +1546,6 @@ inline std::future toCPUAsync(Context &ctx, WGPUBuffer buffer, void *data, // Start the asynchronous chain by registering the work-done callback. 
wgpuQueueOnSubmittedWorkDone(ctx.queue, workDoneCallbackInfo); - if (op.readbackBuffer) { - wgpuBufferRelease(op.readbackBuffer); - } - return promise->get_future(); } From 2db9be10fb3f0298294ba199d71eca894746e3a6 Mon Sep 17 00:00:00 2001 From: MichealReed Date: Sat, 22 Feb 2025 18:07:09 -0600 Subject: [PATCH 26/30] clean up callback syntax --- gpu.hpp | 36 +++++++++++++++++------------------- 1 file changed, 17 insertions(+), 19 deletions(-) diff --git a/gpu.hpp b/gpu.hpp index 4854338..b057514 100644 --- a/gpu.hpp +++ b/gpu.hpp @@ -1306,7 +1306,7 @@ createContextByGpuIdx(int gpuIdx, const WGPUInstanceDescriptor &desc = {}, */ inline void bufferMapCallback(WGPUMapAsyncStatus status, WGPUStringView message, void *userdata1, void * /*userdata2*/) { - CallbackData *cbData = reinterpret_cast(userdata1); + const CallbackData *cbData = static_cast(userdata1); // Check that mapping succeeded. check(status == WGPUMapAsyncStatus_Success, "Map readbackBuffer", __FILE__, __LINE__); @@ -1349,17 +1349,17 @@ inline void bufferMapCallback(WGPUMapAsyncStatus status, WGPUStringView message, */ inline void queueWorkDoneCallback(WGPUQueueWorkDoneStatus status, void *userdata1, void * /*userdata2*/) { - CallbackData *cbData = reinterpret_cast(userdata1); + const CallbackData *cbData = static_cast(userdata1); // Ensure the queue work finished successfully. check(status == WGPUQueueWorkDoneStatus_Success, "Queue work done", __FILE__, __LINE__); // Set up the buffer mapping callback information. WGPUBufferMapCallbackInfo mapCallbackInfo = { - .mode = WGPUCallbackMode_AllowSpontaneous, - .callback = bufferMapCallback, - .userdata1 = cbData, // Pass the callback data. - .userdata2 = nullptr // No additional user data. + .mode = WGPUCallbackMode_AllowSpontaneous, + .callback = bufferMapCallback, + .userdata1 = const_cast(cbData), // Pass the callback data. + .userdata2 = nullptr // No additional user data. }; // Begin the asynchronous mapping of the readback buffer. @@ -1400,11 +1400,11 @@ inline std::future toCPUAsync(Context &ctx, void *data, size_t bufferSize, }; // Set up the work-done callback to initiate the buffer mapping. - WGPUQueueWorkDoneCallbackInfo workDoneCallbackInfo; - workDoneCallbackInfo.mode = WGPUCallbackMode_AllowSpontaneous; - workDoneCallbackInfo.callback = queueWorkDoneCallback; - workDoneCallbackInfo.userdata1 = cbData; // Pass the callback data. - workDoneCallbackInfo.userdata2 = nullptr; + WGPUQueueWorkDoneCallbackInfo workDoneCallbackInfo = { + .mode = WGPUCallbackMode_AllowSpontaneous, + .callback = queueWorkDoneCallback, + .userdata1 = const_cast(cbData), + .userdata2 = nullptr}; // Begin the asynchronous chain by registering the queue work-done callback. wgpuQueueOnSubmittedWorkDone(ctx.queue, workDoneCallbackInfo); @@ -1486,7 +1486,7 @@ inline std::future toCPUAsync(Context &ctx, Tensor &tensor, void *data, // Register the callback. The async chain continues inside // queueWorkDoneCallback. 
wgpuQueueOnSubmittedWorkDone(ctx.queue, workDoneCallbackInfo); - + return promise->get_future(); } @@ -1562,11 +1562,10 @@ inline std::future toCPUAsync(Context &ctx, WGPUBuffer buffer, void *data, * @endcode */ template -inline std::future -toCPUAsync(Context &ctx, Tensor &tensor, std::array &data, - size_t sourceOffset = 0) { - return toCPUAsync(ctx, tensor, data.data(), sizeof(data), sourceOffset - ); +inline std::future toCPUAsync(Context &ctx, Tensor &tensor, + std::array &data, + size_t sourceOffset = 0) { + return toCPUAsync(ctx, tensor, data.data(), sizeof(data), sourceOffset); } /** @@ -1589,8 +1588,7 @@ toCPUAsync(Context &ctx, Tensor &tensor, std::array &data, */ inline void toCPU(Context &ctx, Tensor &tensor, void *data, size_t bufferSize, size_t sourceOffset = 0) { - auto future = - toCPUAsync(ctx, tensor, data, bufferSize, sourceOffset); + auto future = toCPUAsync(ctx, tensor, data, bufferSize, sourceOffset); wait(ctx, future); } From 752a53a3d426fb5bb87a89f31b601817adea25c7 Mon Sep 17 00:00:00 2001 From: MichealReed Date: Sat, 22 Feb 2025 18:38:09 -0600 Subject: [PATCH 27/30] add stress test --- test/test_gpu.cpp | 78 +++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 69 insertions(+), 9 deletions(-) diff --git a/test/test_gpu.cpp b/test/test_gpu.cpp index 48aa1bc..99a1af6 100644 --- a/test/test_gpu.cpp +++ b/test/test_gpu.cpp @@ -1,12 +1,34 @@ #include "gpu.hpp" #include #include +#include #include #include #include #include using namespace gpu; +using namespace std::chrono; + + +// Forward declarations: +void testToCPUWithTensor(); +void testToCPUWithBuffer(); +void testToCPUWithTensorSourceOffset(); +void testToCPUWithBufferSourceOffset(); +void stressTestToCPU(); + +int main() { + LOG(kDefLog, kInfo, "Running GPU integration tests..."); + testToCPUWithTensor(); + testToCPUWithBuffer(); + testToCPUWithTensorSourceOffset(); + testToCPUWithBufferSourceOffset(); + stressTestToCPU(); + LOG(kDefLog, kInfo, "All tests passed."); + return 0; +} + // A simple WGSL copy kernel that copies input to output. static const char *kCopyKernel = R"( @@ -22,6 +44,7 @@ fn main(@builtin(global_invocation_id) gid: vec3) { } )"; + // Test using the overload that takes a Tensor. void testToCPUWithTensor() { LOG(kDefLog, kInfo, "Running testToCPUWithTensor..."); @@ -185,12 +208,49 @@ void testToCPUWithBufferSourceOffset() { LOG(kDefLog, kInfo, "testToCPUWithBufferSourceOffset passed."); } -int main() { - LOG(kDefLog, kInfo, "Running GPU integration tests..."); - testToCPUWithTensor(); - testToCPUWithBuffer(); - testToCPUWithTensorSourceOffset(); - testToCPUWithBufferSourceOffset(); - LOG(kDefLog, kInfo, "All tests passed."); - return 0; -} +void stressTestToCPU() { + LOG(kDefLog, kInfo, "Running stressTestToCPU for 2 seconds..."); + +#ifdef USE_DAWN_API + Context ctx = createContextByGpuIdx(0); +#else + Context ctx = createContext(); +#endif + + constexpr size_t N = 1024; + // Create a persistent tensor with some test data. + std::vector inputData(N, 0.0f); + for (size_t i = 0; i < N; ++i) { + inputData[i] = static_cast(i); + } + Tensor tensor = createTensor(ctx, Shape{N}, kf32, inputData.data()); + + // Prepare to run for one second. 
+ auto startTime = high_resolution_clock::now(); + std::vector> futures; + size_t opCount = 0; + while (high_resolution_clock::now() - startTime < seconds(2)) { + // Allocate an output buffer (using a shared_ptr so it stays valid until the future completes) + auto outputData = std::make_shared>(N, 0.0f); + // Use the tensor overload; we’re copying the entire tensor (destOffset = 0) + LOG(kDefLog, kInfo, "Copying %zu bytes from GPU to CPU...", N * sizeof(float)); + // log count + LOG(kDefLog, kInfo, "opCount = %zu", opCount); + auto fut = toCPUAsync(ctx, tensor, outputData->data(), N * sizeof(float), 0); + futures.push_back(std::move(fut)); + ++opCount; + } + + // Wait for all submitted operations to complete. + for (auto &f : futures) { + wait(ctx, f); + } + + auto endTime = high_resolution_clock::now(); + auto totalMs = duration_cast(endTime - startTime).count(); + double throughput = (opCount / (totalMs / 1000.0)); + + LOG(kDefLog, kInfo, "Stress test completed:\n" + " %zu GPU to CPU operations in %lld ms\n" + " Throughput: %.2f ops/sec", opCount, totalMs, throughput); +} \ No newline at end of file From 5f82ff4d9e0fdd1de7f2ccf8e0a0a6d8e981b2fb Mon Sep 17 00:00:00 2001 From: MichealReed Date: Sat, 22 Feb 2025 19:07:20 -0600 Subject: [PATCH 28/30] linux has a segfault if wait for events after. --- test/test_gpu.cpp | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/test/test_gpu.cpp b/test/test_gpu.cpp index 99a1af6..aa42b83 100644 --- a/test/test_gpu.cpp +++ b/test/test_gpu.cpp @@ -227,7 +227,6 @@ void stressTestToCPU() { // Prepare to run for one second. auto startTime = high_resolution_clock::now(); - std::vector> futures; size_t opCount = 0; while (high_resolution_clock::now() - startTime < seconds(2)) { // Allocate an output buffer (using a shared_ptr so it stays valid until the future completes) @@ -237,14 +236,9 @@ void stressTestToCPU() { // log count LOG(kDefLog, kInfo, "opCount = %zu", opCount); auto fut = toCPUAsync(ctx, tensor, outputData->data(), N * sizeof(float), 0); - futures.push_back(std::move(fut)); + wait(ctx, fut); ++opCount; } - - // Wait for all submitted operations to complete. 
- for (auto &f : futures) { - wait(ctx, f); - } auto endTime = high_resolution_clock::now(); auto totalMs = duration_cast(endTime - startTime).count(); From 28dabf277eebb9fb5541870014287a9d7f533036 Mon Sep 17 00:00:00 2001 From: MichealReed Date: Sun, 23 Feb 2025 10:22:27 -0600 Subject: [PATCH 29/30] EOF newline --- test/test_gpu.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_gpu.cpp b/test/test_gpu.cpp index aa42b83..b855712 100644 --- a/test/test_gpu.cpp +++ b/test/test_gpu.cpp @@ -247,4 +247,4 @@ void stressTestToCPU() { LOG(kDefLog, kInfo, "Stress test completed:\n" " %zu GPU to CPU operations in %lld ms\n" " Throughput: %.2f ops/sec", opCount, totalMs, throughput); -} \ No newline at end of file +} From 39c816ca6b4ba0dff8808b680e0cf8f7b36973d4 Mon Sep 17 00:00:00 2001 From: MichealReed Date: Sat, 1 Mar 2025 17:34:42 -0600 Subject: [PATCH 30/30] added sleeptime optional arg --- gpu.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gpu.hpp b/gpu.hpp index b057514..69ed0e9 100644 --- a/gpu.hpp +++ b/gpu.hpp @@ -869,7 +869,7 @@ template T wait(Context &ctx, std::future &f) { * Context ctx = waitForContextFuture(contextFuture); * @endcode */ -template T waitForContextFuture(std::future &f) { +template T waitForContextFuture(std::future &f, size_t sleepTime = 10) { #ifdef __EMSCRIPTEN__ while (f.wait_for(std::chrono::milliseconds(0)) != std::future_status::ready) { @@ -879,7 +879,7 @@ template T waitForContextFuture(std::future &f) { #else while (f.wait_for(std::chrono::milliseconds(0)) != std::future_status::ready) { - std::this_thread::sleep_for(std::chrono::milliseconds(10)); + std::this_thread::sleep_for(std::chrono::milliseconds(sleepTime)); } return f.get(); #endif
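
Usage sketch for the `sourceOffset` parameter introduced in this series. This is a minimal, self-contained example assuming the plain `createContext()` path (not `createContextByGpuIdx`); the element count and the `skip` constant are illustrative and not taken from the tests above.

```cpp
#include "gpu.hpp"
#include <vector>

using namespace gpu;

int main() {
  Context ctx = createContext();

  // 16 floats on the GPU; read back only the last 12.
  constexpr size_t N = 16;
  std::vector<float> host(N);
  for (size_t i = 0; i < N; ++i)
    host[i] = static_cast<float>(i);
  Tensor t = createTensor(ctx, Shape{N}, kf32, host.data());

  // Skip the first 4 floats by passing a byte offset as sourceOffset.
  constexpr size_t skip = 4;
  std::vector<float> out(N - skip, 0.0f);
  toCPU(ctx, t, out.data(), out.size() * sizeof(float),
        skip * sizeof(float)); // blocking overload, sourceOffset in bytes

  // The asynchronous overload takes the same arguments and returns a future.
  auto fut = toCPUAsync(ctx, t, out.data(), out.size() * sizeof(float),
                        skip * sizeof(float));
  wait(ctx, fut);
  return 0;
}
```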
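
A similar sketch for the optional `sleepTime` argument added to `waitForContextFuture` in the final patch. The `std::async` wrapper around the blocking `createContext()` is only there to make the example self-contained; it stands in for however the `std::future` of a `Context` is actually produced and is not part of the library API.

```cpp
#include "gpu.hpp"
#include <future>

using namespace gpu;

int main() {
  // Assumed stand-in: wrap the blocking createContext() so we have a future.
  std::future<Context> contextFuture =
      std::async(std::launch::async, [] { return createContext(); });

  // Poll every 5 ms instead of the default 10 ms while waiting.
  Context ctx = waitForContextFuture(contextFuture, 5);
  return 0;
}
```

Per the diff, the new argument only changes the sleep interval in the non-Emscripten branch of the wait loop; the Emscripten branch is left as it was.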