diff --git a/.gitignore b/.gitignore index 1a8b5bc..4dc9cf7 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,7 @@ build/* # any build subdirectory in the tree **/build/ +**/build_web/ examples/hello_gpu/build/* examples/raymarch/build/* docs/html @@ -8,6 +9,7 @@ source .DS_Store third_party/lib/* third_party/local/* +third_party/dawn/* # formatter files .cmake-format.py @@ -20,3 +22,6 @@ build .cache compile_commands.json +# editor specific +.vscode/* + diff --git a/CMakeLists.txt b/CMakeLists.txt index db89df7..85911a7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,23 +1,13 @@ +# This only builds a shared lib, see cmake/example.cmake +# and cmake/gpu.cmake for more details cmake_minimum_required(VERSION 3.28) project(gpu) - -include("${CMAKE_CURRENT_SOURCE_DIR}/cmake/webgpu.cmake") - +set(PROJECT_ROOT "${CMAKE_CURRENT_SOURCE_DIR}") set(CMAKE_EXPORT_COMPILE_COMMANDS ON) # export compile_commands.json to use with # LSP -set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_STANDARD 20) set(CMAKE_CXX_STANDARD_REQUIRED ON) -option(USE_LOCAL_LIBS - "Use local libraries instead of fetching from the internet" OFF) - -# Ensure the build type is set -if(NOT CMAKE_BUILD_TYPE) - set(CMAKE_BUILD_TYPE - Release - CACHE STRING "Choose the type of build: Debug or Release" FORCE) -endif() - option(FASTBUILD "Option to enable fast builds" OFF) if(FASTBUILD) set(CMAKE_BUILD_TYPE None) # Avoid default flags of predefined build types @@ -30,21 +20,27 @@ if(DEBUG) set(CMAKE_CXX_FLAGS "-O0 -g") endif() -if(WIN64) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DWEBGPU_BACKEND_DAWN") -endif() - +include("${CMAKE_CURRENT_SOURCE_DIR}/cmake/dawn.cmake") include("${CMAKE_CURRENT_SOURCE_DIR}/cmake/gpu.cmake") -message(STATUS "CMAKE_CURRENT_SOURCE_DIR: ${CMAKE_CURRENT_SOURCE_DIR}") -message( - STATUS - "Include directories for wgpu: ${CMAKE_CURRENT_SOURCE_DIR}/third_party/headers" -) +target_link_libraries(gpu PRIVATE webgpu_dawn) + +include_directories(${CMAKE_CURRENT_SOURCE_DIR}/test) + 
+add_executable(test_gpu ${CMAKE_CURRENT_SOURCE_DIR}/test/test_gpu.cpp) +target_link_libraries(test_gpu PRIVATE gpu) + +# Platform-specific post-build actions (e.g. copying DLLs for MSVC) +if(MSVC) + add_custom_command( + TARGET test_gpu POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy + ${DAWN_BUILD_DIR}/$<CONFIG>/webgpu_dawn.dll + $<TARGET_FILE_DIR:test_gpu> + COMMENT "Copying webgpu_dawn.dll to the build directory" + ) +endif() add_library(gpud SHARED gpu.hpp) set_target_properties(gpud PROPERTIES LINKER_LANGUAGE CXX) -target_link_libraries(gpud PRIVATE wgpu) -target_link_libraries(gpud PRIVATE webgpu) target_link_libraries(gpud PRIVATE gpu) -install(TARGETS gpud) diff --git a/cmake/dawn.cmake b/cmake/dawn.cmake new file mode 100644 index 0000000..c6fed94 --- /dev/null +++ b/cmake/dawn.cmake @@ -0,0 +1,124 @@ +# Setup directories +set(FETCHCONTENT_BASE_DIR "${PROJECT_ROOT}/third_party") +set(DAWN_DIR "${FETCHCONTENT_BASE_DIR}/dawn" CACHE INTERNAL "") +set(DAWN_BUILD_DIR "${DAWN_DIR}/build" CACHE INTERNAL "") + +if(EMSCRIPTEN) + set(EM_SDK_DIR $ENV{EMSDK} CACHE INTERNAL "") + set(DAWN_BUILD_DIR "${DAWN_DIR}/build_web" CACHE INTERNAL "") + set(DAWN_EMSCRIPTEN_TOOLCHAIN ${EM_SDK_DIR}/upstream/emscripten CACHE INTERNAL "" FORCE) +else() + add_compile_definitions(USE_DAWN_API) +endif() + +# Optionally locate a prebuilt Dawn so it is not rebuilt on every configure (e.g. with flutter run) +set(ENABLE_DAWN_FIND OFF CACHE BOOL "Enable finding Dawn" FORCE) +set(DAWN_BUILD_FOUND OFF CACHE BOOL "Dawn build found" FORCE) +if(ENABLE_DAWN_FIND) + # find_library, windows adds extra folder + if(MSVC) + find_library(WEBGPU_DAWN_DEBUG webgpu_dawn + NAMES webgpu_dawn + HINTS "${DAWN_BUILD_DIR}/src/dawn/native/Debug" + ) + find_library(WEBGPU_DAWN_RELEASE webgpu_dawn + NAMES webgpu_dawn + HINTS "${DAWN_BUILD_DIR}/src/dawn/native/Release" + ) + set(DAWN_BUILD_FOUND ON) + elseif(NOT EMSCRIPTEN AND NOT MSVC) + find_library(WEBGPU_DAWN_LIB + NAMES webgpu_dawn + PATHS "${DAWN_BUILD_DIR}/src/dawn/native" + REQUIRED + ) + set(DAWN_BUILD_FOUND ON) + else() + 
set(DAWN_BUILD_FOUND ON) + endif() +endif() + +# Dawn options for more, +# see https://dawn.googlesource.com/dawn/+/refs/heads/main/CMakeLists.txt +set(DAWN_ALWAYS_ASSERT OFF CACHE INTERNAL "Always assert in Dawn" FORCE) +set(DAWN_BUILD_MONOLITHIC_LIBRARY ON CACHE INTERNAL "Build Dawn monolithically" FORCE) +set(DAWN_BUILD_EXAMPLES OFF CACHE INTERNAL "Build Dawn examples" FORCE) +set(DAWN_BUILD_SAMPLES OFF CACHE INTERNAL "Build Dawn samples" FORCE) +set(DAWN_BUILD_TESTS OFF CACHE INTERNAL "Build Dawn tests" FORCE) +set(DAWN_ENABLE_INSTALL OFF CACHE INTERNAL "Enable Dawn installation" FORCE) +set(DAWN_FETCH_DEPENDENCIES ON CACHE INTERNAL "Fetch Dawn dependencies" FORCE) +set(TINT_BUILD_TESTS OFF CACHE INTERNAL "Build Tint Tests" FORCE) +set(TINT_BUILD_IR_BINARY OFF CACHE INTERNAL "Build Tint IR binary" FORCE) +set(TINT_BUILD_CMD_TOOLS OFF CACHE INTERNAL "Build Tint command line tools" FORCE) + +if(NOT DAWN_BUILD_FOUND) + include(FetchContent) + message("webgpu_dawn not found start building") + if(EMSCRIPTEN) + set(EMSCRIPTEN_DIR "${EM_SDK_DIR}/upstream/emscripten" CACHE INTERNAL "" FORCE) + set(DAWN_EMSCRIPTEN_TOOLCHAIN ${EMSCRIPTEN_DIR} CACHE INTERNAL "" FORCE) + endif() + + FetchContent_Declare( + dawn + DOWNLOAD_DIR ${DAWN_DIR} + SOURCE_DIR ${DAWN_DIR} + SUBBUILD_DIR ${DAWN_BUILD_DIR}/tmp + BINARY_DIR ${DAWN_BUILD_DIR} + DOWNLOAD_COMMAND + cd ${DAWN_DIR} && + git init && + git fetch --depth=1 https://dawn.googlesource.com/dawn && + git reset --hard FETCH_HEAD + ) + + # Download the repository and add it as a subdirectory. 
+ FetchContent_MakeAvailable(dawn) + + # attempt fix flutter rebuilds + set(CMAKE_INCLUDE_PATH "${CMAKE_INCLUDE_PATH};${DAWN_DIR}/src" CACHE INTERNAL "") + + execute_process( + WORKING_DIRECTORY ${DAWN_DIR} + COMMAND ${CMAKE_COMMAND} -S ${DAWN_DIR} + -B ${DAWN_BUILD_DIR} + ) + + # Build Dawn + execute_process( + COMMAND ${CMAKE_COMMAND} --build ${DAWN_BUILD_DIR} + ) + + # find_library, windows adds extra folder + if(MSVC) + find_library(WEBGPU_DAWN_DEBUG webgpu_dawn + NAMES webgpu_dawn + HINTS "${DAWN_BUILD_DIR}/src/dawn/native/Debug" + ) + find_library(WEBGPU_DAWN_RELEASE webgpu_dawn + NAMES webgpu_dawn + HINTS "${DAWN_BUILD_DIR}/src/dawn/native/Release" + ) + set(DAWN_BUILD_FOUND ON) + elseif(NOT EMSCRIPTEN AND NOT MSVC) + find_library(WEBGPU_DAWN_LIB + NAMES webgpu_dawn + PATHS "${DAWN_BUILD_DIR}/src/dawn/native" + REQUIRED + ) + set(DAWN_BUILD_FOUND ON) + else() + set(DAWN_BUILD_FOUND ON) + endif() +endif() + +if(EMSCRIPTEN) + add_library(webgpu_dawn INTERFACE IMPORTED) + target_include_directories(webgpu_dawn INTERFACE ${DAWN_BUILD_DIR}/gen/src/emdawnwebgpu/include) + target_include_directories(webgpu_dawn INTERFACE ${DAWN_BUILD_DIR}/gen/src/emdawnwebgpu/include/webgpu/webgpu.h) + target_link_libraries(webgpu_dawn INTERFACE ${DAWN_BUILD_DIR}/gen/src/emdawnwebgpu/library_webgpu_enum_tables.js) + target_link_libraries(webgpu_dawn INTERFACE ${DAWN_BUILD_DIR}/gen/src/emdawnwebgpu/library_webgpu_generated_struct_info.js) + target_link_libraries(webgpu_dawn INTERFACE ${DAWN_BUILD_DIR}/gen/src/emdawnwebgpu/library_webgpu_generated_sig_info.js) + target_link_libraries(webgpu_dawn INTERFACE ${DAWN_DIR}/third_party/emdawnwebgpu/library_webgpu.js) +else() +endif() diff --git a/cmake/example.cmake b/cmake/example.cmake index eba8e7c..cf697b5 100644 --- a/cmake/example.cmake +++ b/cmake/example.cmake @@ -1,68 +1,98 @@ -set(CMAKE_EXPORT_COMPILE_COMMANDS ON) # export compile_commands.json to use with - # LSP -set(CMAKE_CXX_STANDARD 17) -set(CMAKE_CXX_STANDARD_REQUIRED ON) +# 
Getting Started with CMAKE +# Each example includes this and sets PROJECT_NAME. +# +# Example usage: +# cd examples/hello_world +# cmake -S . -B build/ -DCMAKE_BUILD_TYPE=Release +# cmake --build build/ --config Release +# ./build/hello_world (or serve the output .js/.wasm for Emscripten) +# or for emscripten +# emcmake cmake -S . -B ./build_web -DCMAKE_BUILD_TYPE=Release +# cmake --build build_web --config Release +# python3 -m http.server 8080 -d build_web +if(NOT MSVC) + set(CMAKE_CXX_STANDARD 17) +else() + set(CMAKE_CXX_STANDARD 20) +endif() + +# Locate the project root (two levels up from the current source dir) get_filename_component(PROJECT_ROOT ${CMAKE_CURRENT_SOURCE_DIR} DIRECTORY) get_filename_component(PROJECT_ROOT ${PROJECT_ROOT} DIRECTORY) -# Construct potential paths -set(FILEPATH_CURRENT_DIR "${CMAKE_CURRENT_SOURCE_DIR}/${FILENAME}") -set(FILEPATH_PROJECT_ROOT "${PROJECT_ROOT}/${FILENAME}") +# Include external libraries and helper scripts (dawn and gpu) +include("${PROJECT_ROOT}/cmake/dawn.cmake") +include("${PROJECT_ROOT}/cmake/gpu.cmake") -# Include file finding utility script -include("${CMAKE_CURRENT_SOURCE_DIR}/cmake/find_gpu.cmake") +# Create the executable +add_executable(${PROJECT_NAME} run.cpp) -# Check if the file exists in the current directory -find_project_root(${CMAKE_CURRENT_SOURCE_DIR} ${FILENAME} - TARGET_FILE_PATH) -if("${TARGET_FILE_PATH}" STREQUAL "") - find_project_root(${FILEPATH_CURRENT_DIR} ${FILENAME} - TARGET_FILE_PATH) - if("${TARGET_FILE_PATH}" STREQUAL "") - message( - FATAL_ERROR - "File ${FILENAME} not found in either ${CMAKE_CURRENT_SOURCE_DIR} or ${CMAKE_CURRENT_SOURCE_DIR}/../../" - ) - endif() -endif() +# Platform-specific linking & build settings +if(EMSCRIPTEN) + # Emscripten-specific configuration -# Ensure the build type is set -if(NOT CMAKE_BUILD_TYPE) - set(CMAKE_BUILD_TYPE - Release - CACHE STRING "Choose the type of build: Debug or Release" FORCE) -endif() + # Define a web output directory (adjust as needed) + 
set(WEB_OUTPUT_DIR "${CMAKE_CURRENT_BINARY_DIR}/web_build") -# Define architecture and build type directories or file names -if(CMAKE_SIZEOF_VOID_P EQUAL 8) - set(ARCH "x64") -else() - set(ARCH "x86") -endif() + # If necessary, include the generated WebGPU include dirs first. + include_directories(BEFORE "${DAWN_BUILD_DIR}/gen/src/emdawnwebgpu/include/") -if(CMAKE_BUILD_TYPE STREQUAL "Debug") - set(BUILD_TYPE "Debug") -else() - set(BUILD_TYPE "Release") -endif() + # Create a helper library for WebGPU support. + add_library(webgpu_web "${DAWN_DIR}/third_party/emdawnwebgpu/webgpu.cpp") + target_link_libraries(${PROJECT_NAME} PRIVATE webgpu_web) + + # Set Emscripten-specific link flags that enable WASM output and expose certain symbols. + # Needed to use updated version, emdawnwebgpu + set_target_properties(${PROJECT_NAME} PROPERTIES LINK_FLAGS "\ + -O3 \ + -sUSE_WEBGPU=0 \ + -sWASM=1 \ + -DDAWN_EMSCRIPTEN_TOOLCHAIN=${EMSCRIPTEN_DIR} \ + -sEXPORTED_FUNCTIONS=_main,_malloc,_free,_memcpy \ + -sEXPORTED_RUNTIME_METHODS=ccall \ + -sUSE_GLFW=3 \ + -sALLOW_MEMORY_GROWTH=1 -sSTACK_SIZE=15MB \ + -sASYNCIFY \ + --js-library=${DAWN_BUILD_DIR}/gen/src/emdawnwebgpu/library_webgpu_enum_tables.js \ + --js-library=${DAWN_BUILD_DIR}/gen/src/emdawnwebgpu/library_webgpu_generated_struct_info.js \ + --js-library=${DAWN_BUILD_DIR}/gen/src/emdawnwebgpu/library_webgpu_generated_sig_info.js \ + --js-library=${DAWN_DIR}/third_party/emdawnwebgpu/library_webgpu.js \ + --closure-args=--externs=${EMSCRIPTEN_DIR}/src/closure-externs/webgpu-externs.js \ + ") -if(NOT TARGET gpu) - message(STATUS "GPU_LIB not found") - include("${TARGET_FILE_PATH}/cmake/webgpu.cmake") - include("${TARGET_FILE_PATH}/cmake/gpu.cmake") +else() + # Non-Emscripten (desktop) linking + if(MSVC) + target_link_libraries(gpu + PRIVATE + $<$<CONFIG:Debug>:${WEBGPU_DAWN_DEBUG}> + $<$<CONFIG:Release>:${WEBGPU_DAWN_RELEASE}> + ) + else() + target_link_libraries(gpu PRIVATE webgpu_dawn) + endif() endif() -add_executable(${PROJECT_NAME} run.cpp) +# Link the 
gpu/dawn library to the executable. target_link_libraries(${PROJECT_NAME} PRIVATE gpu) -target_link_libraries(${PROJECT_NAME} PRIVATE wgpu) -target_link_libraries(${PROJECT_NAME} PRIVATE webgpu) -if(WIN32) - # Ensure DLL is copied if on Windows +# Platform-specific post-build actions (e.g. copying DLLs for MSVC) +if(MSVC) add_custom_command( - TARGET ${PROJECT_NAME} - POST_BUILD - COMMAND ${CMAKE_COMMAND} -E copy_if_different ${DLL_PATH} - $<TARGET_FILE_DIR:${PROJECT_NAME}>) + TARGET ${PROJECT_NAME} POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy + ${DAWN_BUILD_DIR}/$<CONFIG>/webgpu_dawn.dll + $<TARGET_FILE_DIR:${PROJECT_NAME}> + COMMENT "Copying webgpu_dawn.dll to the build directory" + ) +endif() + +if(EMSCRIPTEN) + + # Configure the HTML file by replacing @PROJECT_NAME@ with the actual target name. + configure_file(${PROJECT_ROOT}cmake/templates/index.html.in + ${CMAKE_CURRENT_BINARY_DIR}/index.html + @ONLY) + endif() diff --git a/cmake/find_gpu.cmake b/cmake/find_gpu.cmake deleted file mode 100644 index b6b7dad..0000000 --- a/cmake/find_gpu.cmake +++ /dev/null @@ -1,30 +0,0 @@ -# file name to find -set(FILENAME "gpu.hpp") - -# Function to check for file existence up the directory hierarchy -function(find_project_root current_dir filename result_var) - set(found FALSE) # Flag to indicate if the file is found - set(current_check_dir "${current_dir}") # Start from the given directory - # using 1 is jsut to supress the cmane-format warning - foreach(i RANGE 0 2 1) - set(filepath "${current_check_dir}/${filename}") - - if(EXISTS "${filepath}") - set(${result_var} - "${current_check_dir}" - PARENT_SCOPE) - set(found TRUE) - break() - endif() - - # Move one level up - get_filename_component(current_check_dir "${current_check_dir}" - DIRECTORY) - endforeach() - - if(NOT found) - set(${result_var} - "" - PARENT_SCOPE) # Set to empty if not found - endif() -endfunction() diff --git a/cmake/gpu.cmake b/cmake/gpu.cmake index 08db244..d991a18 100644 --- a/cmake/gpu.cmake +++ b/cmake/gpu.cmake @@ -1,69 +1,41 @@ -get_filename_component(PROJECT_ROOT 
${CMAKE_CURRENT_SOURCE_DIR} DIRECTORY) -get_filename_component(PROJECT_ROOT ${PROJECT_ROOT} DIRECTORY) +set(FILENAME "gpu.hpp") -# Construct potential paths -set(FILEPATH_CURRENT_DIR "${CMAKE_CURRENT_SOURCE_DIR}/${FILENAME}") -set(FILEPATH_PROJECT_ROOT "${PROJECT_ROOT}/${FILENAME}") - -# Include file finding utility script -include("${CMAKE_CURRENT_SOURCE_DIR}/cmake/find_gpu.cmake") - -# Check if the file exists in the current directory -find_project_root(${CMAKE_CURRENT_SOURCE_DIR} ${FILENAME} TARGET_FILE_PATH) -if("${TARGET_FILE_PATH}" STREQUAL "") - find_project_root(${FILEPATH_CURRENT_DIR} ${FILENAME} TARGET_FILE_PATH) - if("${TARGET_FILE_PATH}" STREQUAL "") - message( - FATAL_ERROR - "File ${FILENAME} not found in either ${CMAKE_CURRENT_SOURCE_DIR} or ${CMAKE_CURRENT_SOURCE_DIR}/../../" - ) - endif() -endif() - -# Define architecture and build type directories or file names -if(CMAKE_SIZEOF_VOID_P EQUAL 8) - set(ARCH "x64") +# Setup project root here. +if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/${FILENAME}") + set(PROJECT_ROOT "${CMAKE_CURRENT_SOURCE_DIR}") else() - set(ARCH "x86") + get_filename_component(PROJECT_ROOT ${CMAKE_CURRENT_SOURCE_DIR} DIRECTORY) + get_filename_component(PROJECT_ROOT ${PROJECT_ROOT} DIRECTORY) + set(PROJECT_ROOT "${PROJECT_ROOT}/") endif() -if(CMAKE_BUILD_TYPE STREQUAL "Debug") - set(BUILD_TYPE "Debug") +message(STATUS "PROJECT_ROOT: ${PROJECT_ROOT}") + +# Add sources +set(GPU_SOURCES + "${PROJECT_ROOT}/gpu.cpp" + "${PROJECT_ROOT}/numeric_types/half.cpp" + "${DAWN_BUILD_DIR}/gen/include/dawn/webgpu.h" +) + +# Add headers +set(GPU_HEADERS + "${PROJECT_ROOT}/gpu.hpp" + "${PROJECT_ROOT}/utils/logging.hpp" + "${PROJECT_ROOT}/utils/array_utils.hpp" + "${PROJECT_ROOT}/numeric_types/half.hpp" + +) + +# Create the STATIC library for gpu +add_library(gpu STATIC ${GPU_SOURCES} ${GPU_HEADERS}) +set_target_properties(gpu PROPERTIES LINKER_LANGUAGE CXX) +target_include_directories(gpu PUBLIC "${PROJECT_ROOT}") +if(NOT EMSCRIPTEN) + 
target_include_directories(gpu PUBLIC "${DAWN_BUILD_DIR}/gen/include/") + target_include_directories(gpu PUBLIC "${DAWN_BUILD_DIR}/gen/include/dawn/") + target_include_directories(gpu PUBLIC "${DAWN_DIR}/include/") else() - set(BUILD_TYPE "Release") + target_include_directories(gpu PUBLIC "${DAWN_BUILD_DIR}/gen/src/emdawnwebgpu/include/") + target_include_directories(gpu PUBLIC "${DAWN_BUILD_DIR}/gen/src/emdawnwebgpu/include/webgpu/") endif() - -add_library(webgpulib SHARED IMPORTED) -add_library(gpu INTERFACE) -add_library(wgpu INTERFACE) -add_dependencies(gpu webgpulib) -# Define the header-only library -target_include_directories(gpu INTERFACE ${TARGET_FILE_PATH}) - -# Add headers webgpu.h -target_include_directories(wgpu - INTERFACE ${TARGET_FILE_PATH}/third_party/headers) -include(ExternalProject) - -set(DAWN_EXT_PREFIX "${TARGET_FILE_PATH}/third_party/local/dawn") - -ExternalProject_Add( - dawn_project - PREFIX ${DAWN_EXT_PREFIX} - GIT_REPOSITORY "https://dawn.googlesource.com/dawn" - GIT_TAG "main" - SOURCE_DIR "${DAWN_EXT_PREFIX}/source" - BINARY_DIR "${DAWN_EXT_PREFIX}/build" - INSTALL_DIR "${DAWN_EXT_PREFIX}/install" - GIT_SUBMODULES "" - # setting cmake args doesn't work and I don't know why - CONFIGURE_COMMAND - ${CMAKE_COMMAND} -S ${DAWN_EXT_PREFIX}/source -B - ${DAWN_EXT_PREFIX}/build -DDAWN_FETCH_DEPENDENCIES=ON - -DDAWN_ENABLE_INSTALL=ON -DDAWN_BUILD_MONOLITHIC_LIBRARY=ON - -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} -G ${CMAKE_GENERATOR} - INSTALL_COMMAND ${CMAKE_COMMAND} --install . 
--prefix - ${DAWN_EXT_PREFIX}/install - LOG_INSTALL ON) -find_library(LIBDAWN dawn PATHS "${DAWN_EXT_PREFIX}/install/lib") -target_link_libraries(webgpulib INTERFACE ${LIBDAWN}) diff --git a/cmake/templates/index.html.in b/cmake/templates/index.html.in new file mode 100644 index 0000000..6b5957b --- /dev/null +++ b/cmake/templates/index.html.in @@ -0,0 +1,22 @@ + + + + + @PROJECT_NAME@ + + + + + + + diff --git a/cmake/webgpu.cmake b/cmake/webgpu.cmake deleted file mode 100644 index c63f1e2..0000000 --- a/cmake/webgpu.cmake +++ /dev/null @@ -1,61 +0,0 @@ -# Specify the filename to search for -set(FILENAME "gpu.hpp") - -get_filename_component(PROJECT_ROOT ${CMAKE_CURRENT_SOURCE_DIR} DIRECTORY) -get_filename_component(PROJECT_ROOT ${PROJECT_ROOT} DIRECTORY) - -# Construct potential paths -set(FILEPATH_CURRENT_DIR "${CMAKE_CURRENT_SOURCE_DIR}/${FILENAME}") -set(FILEPATH_PROJECT_ROOT "${PROJECT_ROOT}/${FILENAME}") - -# Include file finding utility script -include("${CMAKE_CURRENT_SOURCE_DIR}/cmake/find_gpu.cmake") - -# Check if the file exists in the current directory -find_project_root(${CMAKE_CURRENT_SOURCE_DIR} ${FILENAME} TARGET_FILE_PATH) -if("${TARGET_FILE_PATH}" STREQUAL "") - find_project_root(${FILEPATH_CURRENT_DIR} ${FILENAME} TARGET_FILE_PATH) - if("${TARGET_FILE_PATH}" STREQUAL "") - message( - FATAL_ERROR - "File ${FILENAME} not found in either ${CMAKE_CURRENT_SOURCE_DIR} or ${CMAKE_CURRENT_SOURCE_DIR}/../../" - ) - endif() -endif() - -include(FetchContent) - -set(FETCHCONTENT_BASE_DIR "${TARGET_FILE_PATH}/third_party/fetchcontent") -set(WEBGPU_DIST_LOCAL_PATH - "${TARGET_FILE_PATH}/third_party/local/WebGPU-distribution") - -if(USE_LOCAL_LIBS) - set(WEBGPU_DIST_GIT_REPO ${WEBGPU_DIST_LOCAL_PATH}) - message(STATUS "Using local WebGPU distribution: ${WEBGPU_DIST_LOCAL_PATH}") -else() - set(WEBGPU_DIST_GIT_REPO - "https://github.com/eliemichel/WebGPU-distribution") -endif() - -option(WEBGPU_TAG "WebGPU distribution tag to use") -if(NOT WEBGPU_TAG) - 
set(WEBGPU_TAG "dawn") -endif() -message(STATUS "Using WebGPU distribution tag: ${WEBGPU_TAG}") - -if(WEBGPU_TAG STREQUAL "dawn") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DWEBGPU_BACKEND_DAWN") - # use specific commit set(WEBGPU_TAG - # "1025b977e1927b6d0327e67352f90feb4bcf8274") set(WEBGPU_TAG - # "acf972b7b909f52e183bdae3971b93bb13d4a29e") - # add_compile_options(-UABSL_INTERNAL_AT_LEAST_CXX20) set(CMAKE_CXX_FLAGS - # "${CMAKE_CXX_FLAGS} -UABSL_INTERNAL_AT_LEAST_CXX20") - message(STATUS "Using Dawn backend") -endif() - -FetchContent_Declare( - webgpu - GIT_REPOSITORY ${WEBGPU_DIST_GIT_REPO} - GIT_TAG ${WEBGPU_TAG} - GIT_SHALLOW TRUE) -FetchContent_MakeAvailable(webgpu) diff --git a/docs/gpuflow.md b/docs/gpuflow.md new file mode 100644 index 0000000..d13a228 --- /dev/null +++ b/docs/gpuflow.md @@ -0,0 +1,78 @@ +# GPU.cpp Lifecycle + +```mermaid +flowchart TD + %% Data Preparation & Upload + subgraph "Data Preparation & Upload" + A["CPU Data"] + B["Define Data Properties
(shape, type, size)"] + C["Create GPU Buffer
(allocate raw buffer)"] + D["Create Tensor
(allocates Array with one
or more buffers
and associates Shape)"] + + E["Upload Data via toGPU
(raw buffer)
toGPU
(ctx, data, buffer, size)"] + F["Upload Data via toGPU
(Tensor overload)
toGPU(ctx, data, tensor)"] + G["Optional:
Kernel Parameters
toGPU(ctx, params, Kernel)"] + end + + %% Buffer Setup & Bindings + subgraph "Buffer & Binding Setup" + H["Define Bindings
(Bindings, TensorView)"] + I["Map GPU buffers
to shader bindings
(Collection from Tensor
or single buffers)"] + end + + %% Kernel Setup & Execution + subgraph "Kernel Setup & Execution" + J["Define KernelCode
(WGSL template, workgroup size, precision)"] + K["Create Kernel"] + L["Dispatch Kernel"] + end + + %% GPU Execution & Result Readback + subgraph "GPU Execution & Result Readback" + M["Kernel Execution
(GPU shader runs)"] + N["Readback Data
(toCPU variants)"] + end + + %% Context & Resources + O["Context
(Device, Queue,
TensorPool, KernelPool)"] + + %% Flow Connections + A --> B + B --> C + B --> D + C --> E + D --> F + F --> H + E --> H + H --> I + I --> K + J --> K + G --- K + K --> L + L --> M + M --> N + + %% Context shared by all stages + O --- D + O --- E + O --- F + O --- K + O --- L + O --- N +``` + +• The `gpu::Array` (which wraps a GPU buffer with usage and size) and the `gpu::Shape` (which defines dimensions and rank) are combined—via the creation process—to produce a `gpu::Tensor`. +• A `gpu::TensorView` provides a non‑owning view into a slice of a `gpu::Tensor`. Ex. `TensorView view = {tensor, 0, 256};` +• `gpu::Bindings` collect multiple Tensors (or TensorViews) along with view offset/size information for use in a kernel. +• The `gpu::TensorPool` (managed by the Context) is responsible for the lifetime of tensors and GPU resource cleanup. +• `gpu::KernelCode` contains the WGSL shader template plus metadata (workgroup size, precision, label, and entry point) that drive the kernel configuration. +• The `gpu::createKernelAsync/gpu::createKernel` functions (within the Execution Flow) use the `gpu::Context`, `gpu::Bindings`, and `gpu::KernelCode` to configure and construct a `gpu::Kernel` that manages all the underlying GPU resources (buffers, bind groups, compute pipeline, etc.). +• `gpu::KernelCode`’s workgroup size (a `gpu::Shape`) defines the dispatch configuration, and the `gpu::Kernel` eventually uses the underlying `gpu::Array` (contains` WGPUBuffer, WGPUBufferUsage, size_t`) and `gpu::Shape` data from the created Tensor. 
+ +`gpu::Tensor` Ranks: +Rank 0: Scalar +Rank 1: Vector +Rank 2: Matrix +Rank 3: 3D Tensor (or Cube) +Rank 4: 4D Tensor +Rank (max 8): Higher Dimensional Tensors diff --git a/examples/hello_world/run.cpp b/examples/hello_world/run.cpp index 7453869..b44934b 100644 --- a/examples/hello_world/run.cpp +++ b/examples/hello_world/run.cpp @@ -28,7 +28,13 @@ int main(int argc, char **argv) { printf("--------------\n\n"); // std::unique_ptr ctx = createContext(); + #ifdef USE_DAWN_API + Context ctx = createContextByGpuIdx(0); + auto adaptersList = listAdapters(ctx); + LOG(kDefLog, kInfo, "Available GPU adapters:\n%s", adaptersList.c_str()); + #else Context ctx = createContext(); + #endif static constexpr size_t N = 10000; std::array inputArr, outputArr; for (int i = 0; i < N; ++i) { @@ -36,13 +42,10 @@ int main(int argc, char **argv) { } Tensor input = createTensor(ctx, Shape{N}, kf32, inputArr.data()); Tensor output = createTensor(ctx, Shape{N}, kf32); - std::promise promise; - std::future future = promise.get_future(); Kernel op = createKernel(ctx, {kGelu, 256, kf32}, Bindings{input, output}, {cdiv(N, 256), 1, 1}); - dispatchKernel(ctx, op, promise); - wait(ctx, future); + dispatchKernel(ctx, op); toCPU(ctx, output, outputArr.data(), sizeof(outputArr)); for (int i = 0; i < 12; ++i) { printf(" gelu(%.2f) = %.2f\n", inputArr[i], outputArr[i]); diff --git a/examples/render/run.cpp b/examples/render/run.cpp index f2c6bec..64122cd 100644 --- a/examples/render/run.cpp +++ b/examples/render/run.cpp @@ -124,10 +124,8 @@ int main(int argc, char **argv) { cdiv({NCOLS, NROWS, 1}, wgSize), params); printf("\033[2J\033[H"); while (true) { - std::promise promise; - std::future future = promise.get_future(); - dispatchKernel(ctx, renderKernel, promise); - wait(ctx, future); + + dispatchKernel(ctx, renderKernel); toCPU(ctx, devScreen, screen.data(), sizeof(screen)); params.time = getCurrentTimeInMilliseconds() - zeroTime; @@ -149,11 +147,12 @@ int main(int argc, char **argv) { 
std::array raster; for (size_t i = 0; i < screen.size(); ++i) { - size_t index = - std::min(sizeof(intensity) - 2, - std::max(0ul, static_cast(screen[i] * - (sizeof(intensity) - 2)))); - raster[i] = intensity[index]; + // Convert all values to size_t to ensure proper type matching + const size_t intensity_max = sizeof(intensity) - 2; + const size_t scaled_value = static_cast(screen[i] * intensity_max); + size_t index = std::min(intensity_max, + std::max(static_cast(0), scaled_value)); + raster[i] = intensity[index]; } char buffer[(NROWS + 2) * (NCOLS + 2)]; diff --git a/examples/shadertui/CMakeLists.txt b/examples/shadertui/CMakeLists.txt index 0938023..b728fc8 100644 --- a/examples/shadertui/CMakeLists.txt +++ b/examples/shadertui/CMakeLists.txt @@ -1,3 +1,4 @@ +# Not working yet needs update with libs for emscripten cmake_minimum_required(VERSION 3.28) project(shadertui) diff --git a/gpu.hpp b/gpu.hpp index 5327fe7..69ed0e9 100644 --- a/gpu.hpp +++ b/gpu.hpp @@ -1,6 +1,7 @@ #ifndef GPU_HPP #define GPU_HPP +#include "webgpu.h" #include #include #include @@ -9,21 +10,20 @@ #include #include #include +#include #include #include #include #include // std::pair #include -#include "webgpu/webgpu.h" - -#include "numeric_types/half.hpp" -#include "utils/logging.hpp" - #ifdef __EMSCRIPTEN__ #include "emscripten/emscripten.h" #endif +#include "numeric_types/half.hpp" +#include "utils/logging.hpp" + #ifdef USE_DAWN_API #include "dawn/native/DawnNative.h" #endif @@ -254,6 +254,26 @@ inline std::string toString(const Shape &shape) { */ inline std::string toString(size_t value) { return std::to_string(value); } +/** + * @brief Converts a WGPUStringView to an std::string. + * + * If the view's data is null, an empty string is returned. If the view's + * length equals WGPU_STRLEN, it is assumed to be null‑terminated; otherwise, + * the explicit length is used. + * + * @param strView The WGPUStringView to convert. + * @return std::string The resulting standard string. 
+ */ +inline std::string formatWGPUStringView(WGPUStringView strView) { + if (!strView.data) { + return ""; + } + if (strView.length == WGPU_STRLEN) { + return std::string(strView.data); + } + return std::string(strView.data, strView.length); +} + /** * @brief simple in-place string replacement helper function for substituting * placeholders in a WGSL string template. @@ -430,8 +450,8 @@ struct CallbackData { WGPUBuffer buffer; // managed by owning Kernel size_t bufferSize; void *output; // non-owning, only for target memory in toCPU, not used for - // kernel invocations - std::promise *promise; + // kernel invocations + std::shared_ptr> promise; std::future *future; }; @@ -530,32 +550,27 @@ struct Context { // Default constructor Context() = default; - Context(Context&& other) noexcept - : instance(other.instance), - adapter(other.adapter), - device(other.device), + Context(Context &&other) noexcept + : instance(other.instance), adapter(other.adapter), device(other.device), queue(other.queue), // Re‐initialize pools to point to *this*: - pool(this), - kernelPool(this), - adapterStatus(other.adapterStatus), - deviceStatus(other.deviceStatus) - { + pool(this), kernelPool(this), adapterStatus(other.adapterStatus), + deviceStatus(other.deviceStatus) { LOG(kDefLog, kTrace, "Moving Context ownership"); // Move over the resources in the pools: - pool.data = std::move(other.pool.data); + pool.data = std::move(other.pool.data); kernelPool.data = std::move(other.kernelPool.data); // Null out handles in the source so its destructor won't release them. other.instance = nullptr; - other.adapter = nullptr; - other.device = nullptr; - other.queue = nullptr; + other.adapter = nullptr; + other.device = nullptr; + other.queue = nullptr; // other.adapterStatus = 0; // other.deviceStatus = 0; } - Context& operator=(Context&& other) noexcept { + Context &operator=(Context &&other) noexcept { if (this != &other) { // Free any existing resources. 
In most cases, this should be a no-op // since we typically shouldn't have two active initialized Context @@ -625,7 +640,7 @@ inline Tensor createTensor(TensorPool &pool, WGPUDevice &device, size_t numElements = size(shape); size_t size = sizeBytes(dtype) * numElements; WGPUBufferDescriptor bufferDesc = { - .label = {.data = nullptr, .length = 0}, + .label = {.data = nullptr, .length = 0}, .usage = usage, .size = size, }; @@ -794,6 +809,212 @@ inline void check(bool condition, const char *message, } } +/** + * @brief Pumps events until the provided future is ready. + * + * This helper template function continuously checks the status of the provided + * std::future until it becomes ready. On Emscripten builds, it yields + * control to the JavaScript event loop using emscripten_sleep to allow + * asynchronous callbacks to execute. On other platforms, it processes events + * from the given WGPUInstance using wgpuInstanceProcessEvents. Once the future + * is ready, its value is returned. + * + * @tparam T The type of the value contained in the future. + * @param instance The WGPUInstance used to process events. + * @param f The future to wait on. + * @return T The value retrieved from the ready future. + * + * @code + * std::future deviceFuture = requestDeviceAsync(adapter, + * devDescriptor); WGPUDevice device = wait(instance, deviceFuture); + * @endcode + */ +template T wait(Context &ctx, std::future &f) { +#ifdef __EMSCRIPTEN__ + // Poll until the future is ready. + while (f.wait_for(std::chrono::milliseconds(0)) != + std::future_status::ready) { + // Yield control to the JS event loop. + emscripten_sleep(1); + } + return f.get(); +#else + while (f.wait_for(std::chrono::milliseconds(0)) != + std::future_status::ready) { + wgpuInstanceProcessEvents(ctx.instance); + } + return f.get(); +#endif +} + +// Context Callbacks & Helpers + +/** + * @brief Waits for the provided std::future to become ready by polling its + * status. 
+ * + * This helper template function continuously checks the status of the provided + * std::future until it is ready. On Emscripten builds, it yields control to + * the JavaScript event loop using emscripten_sleep(1) for smooth asynchronous + * behavior. On non-Emscripten platforms, it sleeps for a short duration (10 + * milliseconds) between checks. Once the future is ready, its value is + * returned. + * + * @tparam T The type of the value contained in the future. + * @param f The future to wait on. + * @return T The value retrieved from the ready future. + * + * @code + * std::future contextFuture = createContext(); + * Context ctx = waitForContextFuture(contextFuture); + * @endcode + */ +template T waitForContextFuture(std::future &f, size_t sleepTime = 10) { +#ifdef __EMSCRIPTEN__ + while (f.wait_for(std::chrono::milliseconds(0)) != + std::future_status::ready) { + emscripten_sleep(1); // Yield back to the JS event loop. + } + return f.get(); +#else + while (f.wait_for(std::chrono::milliseconds(0)) != + std::future_status::ready) { + std::this_thread::sleep_for(std::chrono::milliseconds(sleepTime)); + } + return f.get(); +#endif +} + +/** + * @brief Adapter callback function invoked upon completion of an asynchronous + * WebGPU adapter request. + * + * This callback is triggered when the request for a WebGPU adapter completes. + * It verifies whether the adapter was successfully obtained. On failure, it + * logs an error message (in Emscripten builds) and sets an exception on the + * associated promise. On success, it sets the value of the promise with the + * obtained adapter. Finally, it frees the allocated memory for the promise + * pointer. + * + * @param status The status of the adapter request. Expected to be + * WGPURequestAdapterStatus_Success on success. + * @param adapter The WGPUAdapter obtained on a successful request. + * @param message A string view containing additional information about the + * adapter request. 
+ * @param userdata1 A pointer to a heap-allocated + * std::shared_ptr>. + * @param userdata2 Unused. + */ +inline void adapterCallback(WGPURequestAdapterStatus status, + WGPUAdapter adapter, WGPUStringView message, + void *userdata1, void * /*userdata2*/) { + auto *promisePtr = + reinterpret_cast> *>(userdata1); + if (status != WGPURequestAdapterStatus_Success) { +#ifdef __EMSCRIPTEN__ + LOG(kDefLog, kError, "Could not get WebGPU adapter: %.*s", + static_cast(message.length), message.data); +#endif + (*promisePtr) + ->set_exception(std::make_exception_ptr( + std::runtime_error("Request WebGPU adapter failed"))); + } else { + (*promisePtr)->set_value(adapter); + } + delete promisePtr; +} + +/** + * @brief Callback function invoked upon completion of an asynchronous WebGPU + * device request. + * + * This callback is triggered when the request for a WebGPU device completes. It + * verifies that the device was successfully created. On success, the callback + * sets the value of the associated promise; otherwise, it sets an exception. + * After fulfilling the promise, it frees the allocated memory for the promise + * pointer. + * + * @param status The status of the device request. Expected to be + * WGPURequestDeviceStatus_Success on success. + * @param device The WGPUDevice obtained on successful request. + * @param message A string view containing additional information about the + * device request. + * @param userdata1 A pointer to a heap-allocated + * std::shared_ptr>. + * @param userdata2 Unused. 
+ */ +inline void deviceCallback(WGPURequestDeviceStatus status, WGPUDevice device, + WGPUStringView message, void *userdata1, + void * /*userdata2*/) { + auto *promisePtr = + reinterpret_cast> *>(userdata1); + if (status != WGPURequestDeviceStatus_Success) { + (*promisePtr) + ->set_exception(std::make_exception_ptr( + std::runtime_error("Request WebGPU device failed"))); + } else { + LOG(kDefLog, kTrace, "Device Request succeeded %p", + static_cast(device)); + (*promisePtr)->set_value(device); + } + delete promisePtr; +} + +/** + * @brief Asynchronously requests a WebGPU adapter from the given instance. + * + * This helper function wraps the asynchronous call to request an adapter using + * the WebGPU API. It sets up a promise and registers an adapter callback, + * returning a future that will eventually hold the requested WGPUAdapter. + * + * @param instance The WGPUInstance from which to request the adapter. + * @param adapterOpts The options for requesting the adapter. + * @return std::future A future that will eventually hold the + * created WGPUAdapter. + */ +inline std::future +requestAdapterAsync(WGPUInstance instance, + const WGPURequestAdapterOptions &adapterOpts) { + auto promise = std::make_shared>(); + auto *promisePtr = new std::shared_ptr>(promise); + + WGPURequestAdapterCallbackInfo callbackInfo{ + .mode = WGPUCallbackMode_AllowSpontaneous, + .callback = adapterCallback, + .userdata1 = promisePtr, + .userdata2 = nullptr}; + wgpuInstanceRequestAdapter(instance, &adapterOpts, callbackInfo); + return promise->get_future(); +} + +/** + * @brief Asynchronously requests a WebGPU device from a given adapter. + * + * This helper function wraps the asynchronous call to request a device using + * the WebGPU API. It sets up a promise and registers a device callback, + * returning a future that will be fulfilled once the device is available. + * + * @param adapter The WGPUAdapter to request the device from. 
+ * @param devDescriptor The descriptor specifying the characteristics of the + * requested device. + * @return std::future A future that will eventually hold the + * created WGPUDevice. + */ +inline std::future +requestDeviceAsync(WGPUAdapter adapter, + const WGPUDeviceDescriptor &devDescriptor) { + auto promise = std::make_shared>(); + auto *promisePtr = new std::shared_ptr>(promise); + + WGPURequestDeviceCallbackInfo deviceCallbackInfo{ + .mode = WGPUCallbackMode_AllowSpontaneous, + .callback = deviceCallback, + .userdata1 = promisePtr, + .userdata2 = nullptr}; + wgpuAdapterRequestDevice(adapter, &devDescriptor, deviceCallbackInfo); + return promise->get_future(); +} + /** * @brief Factory function to create a GPU context, which aggregates WebGPU API * handles to interact with the GPU including the instance, adapter, device, and @@ -812,316 +1033,388 @@ inline void check(bool condition, const char *message, * @return Context instance representing the created GPU context * */ -inline Context createContext( - const WGPUInstanceDescriptor &desc = {}, - const WGPURequestAdapterOptions &adapterOpts = {}, - const WGPUDeviceDescriptor &devDescriptor = {}) -{ - Context ctx; // stack-allocated +inline std::future +createContextAsync(const WGPUInstanceDescriptor &desc = {}, + const WGPURequestAdapterOptions &adapterOpts = {}, + const WGPUDeviceDescriptor &devDescriptor = {}) { -#ifdef __EMSCRIPTEN__ - ctx.instance = wgpuCreateInstance(nullptr); -#else + auto promise = std::make_shared>(); + + // On native platforms, run our context creation in a detached thread. 
+ + Context ctx; ctx.instance = wgpuCreateInstance(&desc); -#endif - check(ctx.instance, "Initialize WebGPU", __FILE__, __LINE__); + if (!ctx.instance) { + promise->set_exception(std::make_exception_ptr( + std::runtime_error("Failed to create WebGPU instance."))); + return promise->get_future(); + } + try { + auto adapterFuture = requestAdapterAsync(ctx.instance, adapterOpts); + ctx.adapter = wait(ctx, adapterFuture); + ctx.adapterStatus = WGPURequestAdapterStatus_Success; + } catch (const std::exception &ex) { + promise->set_exception(std::make_exception_ptr(ex)); + return promise->get_future(); + } + try { + auto deviceFuture = requestDeviceAsync(ctx.adapter, devDescriptor); + ctx.device = wait(ctx, deviceFuture); + ctx.deviceStatus = WGPURequestDeviceStatus_Success; + } catch (const std::exception &ex) { + promise->set_exception(std::make_exception_ptr(ex)); + return promise->get_future(); + } + ctx.queue = wgpuDeviceGetQueue(ctx.device); + promise->set_value(std::move(ctx)); - LOG(kDefLog, kTrace, "Requesting adapter"); - { - struct AdapterData { - WGPUAdapter adapter = nullptr; - bool requestEnded = false; - WGPURequestAdapterStatus status; - }; - AdapterData adapterData; - - auto onAdapterRequestEnded = [](WGPURequestAdapterStatus status, - WGPUAdapter adapter, - WGPUStringView message, - void *pUserData, void *) { - auto &ad = *reinterpret_cast(pUserData); - ad.status = status; -#ifdef __EMSCRIPTEN__ - if (status != WGPURequestAdapterStatus_Success) { - LOG(kDefLog, kError, "Could not get WebGPU adapter: %.*s", - static_cast(message.length), message.data); - } -#endif - check(status == WGPURequestAdapterStatus_Success, - "Request WebGPU adapter", __FILE__, __LINE__); - ad.adapter = adapter; - ad.requestEnded = true; - }; + return promise->get_future(); +} - WGPURequestAdapterCallbackInfo callbackInfo { - .mode = WGPUCallbackMode_AllowSpontaneous, - .callback = onAdapterRequestEnded, - .userdata1 = &adapterData, - .userdata2 = nullptr - }; - 
wgpuInstanceRequestAdapter(ctx.instance, &adapterOpts, callbackInfo); +/** + * @brief Synchronously waits for and returns the created GPU context. + * + * This function invokes the asynchronous createContext() factory function to + * create a GPU context, then waits for its completion using + * waitForContextFuture. The returned Context holds handles to the WebGPU + * instance, adapter, device, and queue, and is used for subsequent GPU + * operations. + * + * @return Context The fully initialized GPU context. + * + * @code + * Context ctx = waitForContext(); + * // Now ctx can be used for GPU operations. + * @endcode + */ +inline Context createContext(const WGPUInstanceDescriptor &desc = {}, + const WGPURequestAdapterOptions &adapterOpts = {}, + const WGPUDeviceDescriptor &devDescriptor = {}) { + std::future contextFuture = + createContextAsync(desc, adapterOpts, devDescriptor); + return waitForContextFuture(contextFuture); +} + +#ifndef __EMSCRIPTEN__ +#if USE_DAWN_API +/** + * @brief Retrieves the list of available GPU adapters from the Dawn instance. + * + * This function creates a Dawn instance using the provided context's instance + * handle, then enumerates and returns the available GPU adapters as a vector. + * + * @param ctx The Context containing the WebGPU instance handle. + * @return std::vector A vector of available GPU + * adapters. + * + * @code + * std::vector adapters = getAdapters(ctx); + * @endcode + */ +inline std::vector getAdapters(Context &ctx) { + dawn::native::Instance dawnInstance( + reinterpret_cast(ctx.instance)); + return dawnInstance.EnumerateAdapters(); +} - while (!adapterData.requestEnded) { - processEvents(ctx.instance); +/** + * @brief Formats the given vector of Dawn adapters into a single concatenated + * string. 
+ * + * This function iterates over each Dawn adapter in the provided vector, + * retrieves its description using the WebGPU API, and converts the description + * from a WGPUStringView to an std::string using the formatWGPUStringView + * helper. The resulting descriptions are concatenated into a single string + * separated by newline characters. + * + * @param adapters A vector of Dawn adapters obtained from a WebGPU instance. + * @return std::string A newline-delimited string listing each adapter's + * description. + * + * @code + * std::string adapterList = formatAdapters(adapters); + * @endcode + */ +inline std::string +formatAdapters(const std::vector &adapters) { + std::string adapterList; + for (size_t i = 0; i < adapters.size(); ++i) { + auto adapterPtr = adapters[i].Get(); + if (adapterPtr) { + WGPUAdapterInfo info = {}; + wgpuAdapterGetInfo(adapterPtr, &info); + std::string desc = formatWGPUStringView(info.description); + adapterList += "GPU Adapter [" + std::to_string(i) + "]: " + desc + "\n"; + wgpuAdapterInfoFreeMembers(info); } - ctx.adapter = adapterData.adapter; - ctx.adapterStatus = adapterData.status; } + return adapterList; +} - LOG(kDefLog, kTrace, "Requesting device"); - { - struct DeviceData { - WGPUDevice device = nullptr; - bool requestEnded = false; - WGPURequestDeviceStatus status; - }; - DeviceData devData; - - auto onDeviceRequestEnded = [](WGPURequestDeviceStatus status, - WGPUDevice device, - WGPUStringView message, - void *pUserData, void *) { - auto &dd = *reinterpret_cast(pUserData); - dd.status = status; - check(status == WGPURequestDeviceStatus_Success, - "Could not get WebGPU device.", __FILE__, __LINE__); - LOG(kDefLog, kTrace, "Device Request succeeded %p", - static_cast(device)); - dd.device = device; - dd.requestEnded= true; - }; +/** + * @brief Lists the available GPU adapters in the current WebGPU instance. 
+ * + * This function retrieves the list of available GPU adapters using the + * getAdapters helper function, then formats and returns the adapter + * descriptions as a single string using the formatAdapters helper function. + * + * @param ctx The Context containing the WebGPU instance handle. + * @return std::string A newline-delimited string listing each adapter's + * description. + * + * @code + * std::string adapterList = listAdapters(ctx); + * @endcode + */ +inline std::string listAdapters(Context &ctx) { + auto adapters = getAdapters(ctx); + return formatAdapters(adapters); +} - WGPURequestDeviceCallbackInfo deviceCallbackInfo { - .mode = WGPUCallbackMode_AllowSpontaneous, - .callback = onDeviceRequestEnded, - .userdata1= &devData, - .userdata2= nullptr - }; - wgpuAdapterRequestDevice(ctx.adapter, &devDescriptor, deviceCallbackInfo); +/** + * @brief Asynchronously creates a GPU context using the specified GPU index. + * + * This function creates a WebGPU instance, retrieves the available GPU + * adapters, and selects the adapter at the specified index. It then requests a + * device from the selected adapter and sets up a logging callback for device + * errors. The function returns a future that will be fulfilled with the + * created Context once all operations are complete. + * + * @param gpuIdx The index of the GPU adapter to use. + * @param desc Instance descriptor for the WebGPU instance (optional) + * @param devDescriptor Device descriptor for the WebGPU device (optional) + * @return std::future A future that will eventually hold the created + * Context. 
+ * + * @code + * std::future contextFuture = createContextByGpuIdxAsync(0); + * Context ctx = waitForContextFuture(contextFuture); + * @endcode + */ +inline std::future +createContextByGpuIdxAsync(int gpuIdx, const WGPUInstanceDescriptor &desc = {}, + const WGPUDeviceDescriptor &devDescriptor = {}) { + auto promise = std::make_shared>(); + Context ctx; - LOG(kDefLog, kTrace, "Waiting for device request to end"); - while (!devData.requestEnded) { - processEvents(ctx.instance); - } - LOG(kDefLog, kTrace, "Device request ended"); - - ctx.device = devData.device; - ctx.deviceStatus = devData.status; - - // If the device was created, set up logging and fetch the queue - if (devData.status == WGPURequestDeviceStatus_Success) { - WGPULoggingCallbackInfo loggingCallbackInfo { - .nextInChain = nullptr, - .callback = - [](WGPULoggingType type, WGPUStringView message, - void *, void *) { + ctx.instance = wgpuCreateInstance(&desc); + + if (!ctx.instance) { + promise->set_exception(std::make_exception_ptr( + std::runtime_error("Failed to create WebGPU instance."))); + return promise->get_future(); + } + check(ctx.instance, "Initialize WebGPU", __FILE__, __LINE__); + + // Use helper functions to obtain and format the adapters. + auto adapters = getAdapters(ctx); + + if (gpuIdx >= adapters.size()) { + promise->set_exception( + std::make_exception_ptr(std::runtime_error("Invalid GPU index."))); + return promise->get_future(); + } + LOG(kDefLog, kInfo, "Using GPU Adapter[%d]", gpuIdx); + auto adapterPtr = adapters[gpuIdx].Get(); + if (adapterPtr) { + WGPUAdapterInfo info = {}; + wgpuAdapterGetInfo(adapterPtr, &info); + LOG(kDefLog, kInfo, "GPU(Adapter)[%d] = %s", gpuIdx, + formatWGPUStringView(info.description).c_str()); + wgpuAdapterInfoFreeMembers(info); + } + ctx.adapter = reinterpret_cast(adapterPtr); + dawn::native::GetProcs().adapterAddRef(ctx.adapter); + + LOG(kDefLog, kInfo, "Requesting device"); + // Request the device asynchronously (using our requestDeviceAsync helper). 
+ auto deviceFuture = requestDeviceAsync(ctx.adapter, devDescriptor); + try { + ctx.device = wait(ctx, deviceFuture); + ctx.deviceStatus = WGPURequestDeviceStatus_Success; + } catch (const std::exception &ex) { + promise->set_exception(std::make_exception_ptr(ex)); + return promise->get_future(); + } + + WGPULoggingCallbackInfo loggingCallbackInfo{ + .nextInChain = nullptr, + .callback = + [](WGPULoggingType type, WGPUStringView message, void *userdata1, + void *userdata2) { LOG(kDefLog, kError, "Device logging callback: %.*s", static_cast(message.length), message.data); if (type == WGPULoggingType_Error) { throw std::runtime_error("Device error logged."); } }, - .userdata1 = nullptr, - .userdata2 = nullptr - }; - wgpuDeviceSetLoggingCallback(ctx.device, loggingCallbackInfo); - ctx.queue = wgpuDeviceGetQueue(ctx.device); - } - } - - return std::move(ctx); + .userdata1 = nullptr, + .userdata2 = nullptr}; + wgpuDeviceSetLoggingCallback(ctx.device, loggingCallbackInfo); + ctx.queue = wgpuDeviceGetQueue(ctx.device); + promise->set_value(std::move(ctx)); + return promise->get_future(); } - -#ifdef USE_DAWN_API /** - * @brief Factory function to create a GPU context, which aggregates WebGPU API - * handles to interact with the GPU including the instance, adapter, device, and - * queue. + * @brief Synchronously creates a GPU context using the specified GPU index. * - * The function takes gpu index to support for multi GPUs. - * To activate this function, it needs not only webgpu's headers but also DAWN's - * headers. + * This function calls the asynchronous createContextByGpuIdxAsync function to + * create a GPU context, then waits for its completion using + * waitForContextFuture. The returned Context holds handles to the WebGPU + * instance, adapter, device, and queue, and is used for subsequent GPU + * operations. * - * If dawn is used, it also sets up an error callback for device loss. 
- * - * @param[in] gpuIdx GPU index - * @param[in] desc Instance descriptor for the WebGPU instance (optional) - * @param[in] devDescriptor Device descriptor for the WebGPU device (optional) - * @return Context instance representing the created GPU context + * @param gpuIdx The index of the GPU adapter to use. + * @param desc Instance descriptor for the WebGPU instance (optional) + * @param devDescriptor Device descriptor for the WebGPU device (optional) + * @return Context The fully initialized GPU context. * * @code - * Context ctx = createContextByGpuIdx(1); + * Context ctx = createContextByGpuIdx(0); * @endcode */ inline Context createContextByGpuIdx(int gpuIdx, const WGPUInstanceDescriptor &desc = {}, const WGPUDeviceDescriptor &devDescriptor = {}) { - Context context; - { -#ifdef __EMSCRIPTEN__ - // Emscripten does not support the instance descriptor - // and throws an assertion error if it is not nullptr. - context.instance = wgpuCreateInstance(nullptr); -#else - context.instance = wgpuCreateInstance(&desc); -#endif - // check status - check(context.instance, "Initialize WebGPU", __FILE__, __LINE__); - } - - LOG(kDefLog, kInfo, "Requesting adapter"); - { - std::vector adapters = - dawn::native::Instance( - reinterpret_cast(context.instance)) - .EnumerateAdapters(); - LOG(kDefLog, kInfo, "The number of GPUs=%d\n", adapters.size()); - // Note: Second gpu is not available on Macos, but the number of GPUs is 2 - // on Macos. - // Calling wgpuAdapterGetInfo function for the second gpu becomes - // segfault. When you check all GPUs on linux, uncomment out following - // codes. 
- // - // for (size_t i = 0; i < adapters.size(); i++) { - // WGPUAdapterInfo info {}; - // auto ptr = adapters[i].Get(); - // if (ptr && adapters[i]) { - // wgpuAdapterGetInfo(ptr, &info); - // LOG(kDefLog, kInfo, "GPU(Adapter)[%d] = %s\n", i, info.description); - // wgpuAdapterInfoFreeMembers(info); - // } - // } - - { - LOG(kDefLog, kInfo, "Use GPU(Adapter)[%d]\n", gpuIdx); - auto ptr = adapters[gpuIdx].Get(); - if (ptr) { - WGPUAdapterInfo info{}; - wgpuAdapterGetInfo(ptr, &info); - LOG(kDefLog, kInfo, "GPU(Adapter)[%d] = %s\n", gpuIdx, - info.description); - wgpuAdapterInfoFreeMembers(info); - } - context.adapter = adapters[gpuIdx].Get(); - dawn::native::GetProcs().adapterAddRef(context.adapter); - } - } + std::future contextFuture = + createContextByGpuIdxAsync(gpuIdx, desc, devDescriptor); + return waitForContextFuture(contextFuture); +} - LOG(kDefLog, kInfo, "Requesting device"); - { - struct DeviceData { - WGPUDevice device = nullptr; - bool requestEnded = false; - }; - DeviceData devData; - - auto onDeviceRequestEnded = [](WGPURequestDeviceStatus status, - WGPUDevice device, WGPUStringView message, - void *pUserData, void *) { - DeviceData &devData = *reinterpret_cast(pUserData); - check(status == WGPURequestDeviceStatus_Success, - "Could not get WebGPU device.", __FILE__, __LINE__); - LOG(kDefLog, kTrace, "Device Request succeeded %x", - static_cast(device)); - devData.device = device; - devData.requestEnded = true; - }; +#endif // USE_DAWN_API +#endif // __EMSCRIPTEN__ - WGPURequestDeviceCallbackInfo deviceCallbackInfo = { - .mode = WGPUCallbackMode_AllowSpontaneous, - .callback = onDeviceRequestEnded, - .userdata1 = &devData, - .userdata2 = nullptr}; - wgpuAdapterRequestDevice(context.adapter, &devDescriptor, - deviceCallbackInfo); - - LOG(kDefLog, kInfo, "Waiting for device request to end"); - while (!devData.requestEnded) { - processEvents(context.instance); - } - LOG(kDefLog, kInfo, "Device request ended"); - assert(devData.requestEnded); - 
context.device = devData.device; - - WGPULoggingCallbackInfo loggingCallbackInfo = { - .nextInChain = nullptr, - .callback = - [](WGPULoggingType type, WGPUStringView message, void *userdata1, - void *userdata2) { - LOG(kDefLog, kError, "Device logging callback: %.*s", - static_cast(message.length), message.data); - if (type == WGPULoggingType_Error) { - throw std::runtime_error("Device error logged."); - } - }, - .userdata1 = nullptr, - .userdata2 = nullptr}; - wgpuDeviceSetLoggingCallback(context.device, loggingCallbackInfo); - } - context.queue = wgpuDeviceGetQueue(context.device); - return context; +/** + * @brief Callback function invoked upon completion of an asynchronous GPU + * buffer mapping. + * + * This callback is triggered when the GPU buffer mapping for a readback buffer + * is completed. It verifies that the mapping operation was successful, + * retrieves the mapped memory, copies the data from the GPU buffer to a CPU + * memory region, unmaps the buffer, signals the completion by fulfilling the + * associated promise, and cleans up the allocated callback data. + * + * @param status The mapping status. Expected to be WGPUMapAsyncStatus_Success + * on success. + * @param message A string view containing additional information about the + * mapping operation. + * @param userdata1 A pointer to a heap-allocated CallbackData structure + * containing the GPU buffer, buffer size, destination CPU memory pointer, and a + * promise for signaling completion. + * @param userdata2 Unused. + */ +inline void bufferMapCallback(WGPUMapAsyncStatus status, WGPUStringView message, + void *userdata1, void * /*userdata2*/) { + const CallbackData *cbData = static_cast(userdata1); + // Check that mapping succeeded. + check(status == WGPUMapAsyncStatus_Success, "Map readbackBuffer", __FILE__, + __LINE__); + + // Get the mapped memory. 
+ const void *mappedData = + wgpuBufferGetConstMappedRange(cbData->buffer, 0, cbData->bufferSize); + check(mappedData, "Get mapped range", __FILE__, __LINE__); + + // Copy the data from the mapped GPU buffer to the CPU memory. + memcpy(cbData->output, mappedData, cbData->bufferSize); + + // Unmap the buffer. + wgpuBufferUnmap(cbData->buffer); + + // Signal that the copy has completed. + // Ensure you use the arrow operator on the shared_ptr to call set_value(). + cbData->promise->set_value(); + + // Clean up the dynamically allocated callback data. + delete cbData; } -#endif -inline void wait(Context &ctx, std::future &future) { - while (future.wait_for(std::chrono::seconds(0)) != - std::future_status::ready) { - processEvents(ctx.instance); - } +/** + * @brief Callback function invoked when the GPU queue’s submitted work is + * complete. + * + * This callback is registered with the GPU queue after submitting work. When + * invoked, it verifies that all queued work completed successfully, and then + * sets up the buffer mapping callback to initiate the asynchronous mapping of a + * readback buffer. The readback buffer is mapped to access the processed data + * on the CPU. + * + * @param status The status of the completed work. Expected to be + * WGPUQueueWorkDoneStatus_Success on success. + * @param userdata1 A pointer to a heap-allocated CallbackData structure + * containing the readback buffer, buffer size, destination CPU memory pointer, + * and a promise to signal completion. + * @param userdata2 Unused. + */ +inline void queueWorkDoneCallback(WGPUQueueWorkDoneStatus status, + void *userdata1, void * /*userdata2*/) { + const CallbackData *cbData = static_cast(userdata1); + // Ensure the queue work finished successfully. + check(status == WGPUQueueWorkDoneStatus_Success, "Queue work done", __FILE__, + __LINE__); + + // Set up the buffer mapping callback information. 
+ WGPUBufferMapCallbackInfo mapCallbackInfo = { + .mode = WGPUCallbackMode_AllowSpontaneous, + .callback = bufferMapCallback, + .userdata1 = const_cast(cbData), // Pass the callback data. + .userdata2 = nullptr // No additional user data. + }; + + // Begin the asynchronous mapping of the readback buffer. + wgpuBufferMapAsync(cbData->buffer, WGPUMapMode_Read, 0, cbData->bufferSize, + mapCallbackInfo); } /** * @brief Copies data from a GPU buffer to CPU memory. * @param[in] ctx Context instance to manage the operation - * @param[in] tensor Tensor instance representing the GPU buffer to copy from * @param[out] data Pointer to the CPU memory to copy the data to * @param[in] bufferSize Size of the data buffer in bytes * @param[in] op StagingBuffer instance to manage the operation + * @param[in] sourceOffset Offset in the GPU buffer to start copying from. * * @code * toCPU(ctx, tensor, data, bufferSize); * @endcode */ -inline void toCPU(Context &ctx, Tensor &tensor, void *data, size_t bufferSize, - CopyData &op) { + +// NOTE: I think this one is redundant? CopyData not used externally. +inline std::future toCPUAsync(Context &ctx, void *data, size_t bufferSize, + CopyData &op, size_t sourceOffset = 0) { + // Submit the command buffer and release it. wgpuQueueSubmit(ctx.queue, 1, &op.commandBuffer); wgpuCommandBufferRelease(op.commandBuffer); - CallbackData callbackData = {op.readbackBuffer, bufferSize, data, &op.promise, - &op.future}; + // Create a promise and get its future. + auto promise = std::make_shared>(); + + // Allocate callback data so it remains valid until the async + // chain finishes. + CallbackData *cbData = new CallbackData{ + op.readbackBuffer, // The GPU buffer to be read back. + bufferSize, + data, // CPU memory destination. + promise, // The promise to be signaled. + }; + + // Set up the work-done callback to initiate the buffer mapping. 
WGPUQueueWorkDoneCallbackInfo workDoneCallbackInfo = { .mode = WGPUCallbackMode_AllowSpontaneous, - .callback = - [](WGPUQueueWorkDoneStatus status, void *userdata1, void *userdata2) { - check(status == WGPUQueueWorkDoneStatus_Success, "Queue work done", - __FILE__, __LINE__); - const auto *data = static_cast(userdata1); - WGPUBufferMapCallbackInfo mapCallbackInfo = { - .mode = WGPUCallbackMode_AllowSpontaneous, - .callback = - [](WGPUMapAsyncStatus status, WGPUStringView message, - void *userdata1, void *userdata2) { - const auto *data = static_cast(userdata1); - check(status == WGPUMapAsyncStatus_Success, - "Map readbackBuffer", __FILE__, __LINE__); - const void *mappedData = wgpuBufferGetConstMappedRange( - data->buffer, /*offset=*/0, data->bufferSize); - check(mappedData, "Get mapped range", __FILE__, __LINE__); - memcpy(data->output, mappedData, data->bufferSize); - wgpuBufferUnmap(data->buffer); - data->promise->set_value(); - }, - .userdata1 = const_cast(data), - .userdata2 = nullptr}; - wgpuBufferMapAsync(data->buffer, WGPUMapMode_Read, 0, - data->bufferSize, mapCallbackInfo); - }, - .userdata1 = &callbackData, + .callback = queueWorkDoneCallback, + .userdata1 = const_cast(cbData), .userdata2 = nullptr}; + + // Begin the asynchronous chain by registering the queue work-done callback. wgpuQueueOnSubmittedWorkDone(ctx.queue, workDoneCallbackInfo); - wait(ctx, op.future); + // Release the readback buffer as it is no longer needed. + if (op.readbackBuffer) { + wgpuBufferRelease(op.readbackBuffer); + } + + return promise->get_future(); } /** @@ -1136,34 +1429,124 @@ inline void toCPU(Context &ctx, Tensor &tensor, void *data, size_t bufferSize, * * @param[in] ctx Context instance to manage the operation * @param[in] tensor Tensor instance representing the GPU buffer to copy from - * @param[in] bufferSize Size of the data buffer in bytes + * @param[in] bufferSize Size to read in bytes as out data. 
* @param[out] data Pointer to the CPU memory to copy the data to + * @param[in] sourceOffset Offset in the GPU buffer to start copying from. */ -inline void toCPU(Context &ctx, Tensor &tensor, void *data, size_t bufferSize) { +inline std::future toCPUAsync(Context &ctx, Tensor &tensor, void *data, + size_t bufferSize, + size_t sourceOffset = 0) { + // Create a promise that will later be satisfied when the async copy + // completes. + auto promise = std::make_shared>(); + + // Create a readback buffer that will be used for copying and mapping. + WGPUBufferDescriptor readbackBufferDescriptor = { + .label = {.data = nullptr, .length = 0}, + .usage = WGPUBufferUsage_CopyDst | WGPUBufferUsage_MapRead, + .size = bufferSize, // Size of the readback buffer. + }; + WGPUBuffer readbackBuffer = + wgpuDeviceCreateBuffer(ctx.device, &readbackBufferDescriptor); + + // Create a command encoder and record a copy from the tensor GPU buffer + WGPUCommandEncoder commandEncoder = + wgpuDeviceCreateCommandEncoder(ctx.device, nullptr); + wgpuCommandEncoderCopyBufferToBuffer(commandEncoder, tensor.data.buffer, + sourceOffset, readbackBuffer, 0, + bufferSize); + // Finish recording by creating a command buffer and release the encoder. + WGPUCommandBuffer commandBuffer = + wgpuCommandEncoderFinish(commandEncoder, nullptr); + wgpuCommandEncoderRelease(commandEncoder); + check(commandBuffer, "Create command buffer", __FILE__, __LINE__); + + // Submit the work to the queue and release the command buffer immediately. + wgpuQueueSubmit(ctx.queue, 1, &commandBuffer); + wgpuCommandBufferRelease(commandBuffer); + + // Allocate callback data + CallbackData *cbData = new CallbackData{ + readbackBuffer, // The readback buffer to map. + bufferSize, // The size of the copy. + data, // CPU memory destination. + promise // The promise to signal when done. + }; + + // Set up the work-done callback. 
When the queue’s submitted work is + // completed, it is routed to queueWorkDoneCallback which then starts the + // asynchronous map. + WGPUQueueWorkDoneCallbackInfo workDoneCallbackInfo = { + .mode = WGPUCallbackMode_AllowSpontaneous, + .callback = queueWorkDoneCallback, + .userdata1 = cbData, + .userdata2 = nullptr, + }; + + // Register the callback. The async chain continues inside + // queueWorkDoneCallback. + wgpuQueueOnSubmittedWorkDone(ctx.queue, workDoneCallbackInfo); + + return promise->get_future(); +} + +inline std::future toCPUAsync(Context &ctx, WGPUBuffer buffer, void *data, + size_t bufferSize, + size_t sourceOffset = 0) { + + // Create an operation structure (here we reuse CopyData solely for its + // members that we need to create a readback buffer and command buffer). CopyData op; - op.future = op.promise.get_future(); + + // Create the promise that will be fulfilled once the copy is done. + auto promise = std::make_shared>(); + + // Create a readback buffer that we can map for reading. { WGPUBufferDescriptor readbackBufferDescriptor = { - .label = {.data = nullptr, .length = 0}, + .label = {.data = nullptr, .length = 0}, .usage = WGPUBufferUsage_CopyDst | WGPUBufferUsage_MapRead, .size = bufferSize, }; op.readbackBuffer = wgpuDeviceCreateBuffer(ctx.device, &readbackBufferDescriptor); } + + // Create a command encoder which copies from the provided buffer to the + // readback buffer. 
{ - WGPUCommandEncoder commandEncoder; - commandEncoder = wgpuDeviceCreateCommandEncoder(ctx.device, nullptr); - wgpuCommandEncoderCopyBufferToBuffer(commandEncoder, tensor.data.buffer, 0, + WGPUCommandEncoder commandEncoder = + wgpuDeviceCreateCommandEncoder(ctx.device, nullptr); + wgpuCommandEncoderCopyBufferToBuffer(commandEncoder, buffer, sourceOffset, op.readbackBuffer, 0, bufferSize); op.commandBuffer = wgpuCommandEncoderFinish(commandEncoder, nullptr); wgpuCommandEncoderRelease(commandEncoder); check(op.commandBuffer, "Create command buffer", __FILE__, __LINE__); } - toCPU(ctx, tensor, data, bufferSize, op); - if (op.readbackBuffer) { - wgpuBufferRelease(op.readbackBuffer); - } + + // Submit the command and release the command buffer. + wgpuQueueSubmit(ctx.queue, 1, &op.commandBuffer); + wgpuCommandBufferRelease(op.commandBuffer); + + // Allocate callback data + CallbackData *cbData = new CallbackData{ + op.readbackBuffer, // The readback buffer created above. + bufferSize, // Size of the copy. + data, // Destination CPU memory. // Offset in the GPU buffer. + promise // Our promise to satisfy when done. + }; + + // Set up the queue work-done callback info. + WGPUQueueWorkDoneCallbackInfo workDoneCallbackInfo = { + .mode = WGPUCallbackMode_AllowSpontaneous, + .callback = queueWorkDoneCallback, // Our free function callback. + .userdata1 = cbData, // Pass the callback data pointer. + .userdata2 = nullptr}; + + // Start the asynchronous chain by registering the work-done callback. 
+ wgpuQueueOnSubmittedWorkDone(ctx.queue, workDoneCallbackInfo); + + return promise->get_future(); } /** @@ -1174,76 +1557,86 @@ inline void toCPU(Context &ctx, Tensor &tensor, void *data, size_t bufferSize) { * @param[out] data Array of floats to copy the data to * * @code - * toCPU(ctx, tensor, data); + * std::future toCPUFuture = toCPU(ctx, tensor, data); + * wait(ctx, toCPUFuture); * @endcode */ template -void toCPU(Context &ctx, Tensor &tensor, std::array &data) { - toCPU(ctx, tensor, data.data(), sizeof(data)); +inline std::future toCPUAsync(Context &ctx, Tensor &tensor, + std::array &data, + size_t sourceOffset = 0) { + return toCPUAsync(ctx, tensor, data.data(), sizeof(data), sourceOffset); } -inline void toCPU(Context &ctx, WGPUBuffer buffer, void *data, size_t size) { - uint64_t bufferSize = size; - CopyData op; - op.future = op.promise.get_future(); - { - WGPUBufferDescriptor readbackBufferDescriptor = { - .label = {.data = nullptr, .length = 0}, - .usage = WGPUBufferUsage_CopyDst | WGPUBufferUsage_MapRead, - .size = bufferSize, - }; - op.readbackBuffer = - wgpuDeviceCreateBuffer(ctx.device, &readbackBufferDescriptor); - } - { - WGPUCommandEncoder commandEncoder; - commandEncoder = wgpuDeviceCreateCommandEncoder(ctx.device, nullptr); - wgpuCommandEncoderCopyBufferToBuffer(commandEncoder, buffer, 0, - op.readbackBuffer, 0, bufferSize); - op.commandBuffer = wgpuCommandEncoderFinish(commandEncoder, nullptr); - wgpuCommandEncoderRelease(commandEncoder); - check(op.commandBuffer, "Create command buffer", __FILE__, __LINE__); - } - wgpuQueueSubmit(ctx.queue, 1, &op.commandBuffer); - wgpuCommandBufferRelease(op.commandBuffer); - CallbackData callbackData = {op.readbackBuffer, bufferSize, data, &op.promise, - &op.future}; +/** + * @brief Synchronous wrapper for copying from a Tensor GPU buffer to CPU + * memory. 
+ * + * This function synchronously waits for the asynchronous copy operation to + * complete, ensuring that the data is fully transferred from the GPU buffer to + * the CPU memory before returning. + * + * @param ctx Context instance to manage the operation + * @param tensor Tensor instance representing the GPU buffer to copy from + * @param data Pointer to the CPU memory to copy the data to + * @param bufferSize Size of the data buffer in bytes + * @param instance WGPUInstance used for processing events during waiting + * + * @code + * toCPU(ctx, tensor, data, bufferSize, instance); + * @endcode + */ +inline void toCPU(Context &ctx, Tensor &tensor, void *data, size_t bufferSize, + size_t sourceOffset = 0) { + auto future = toCPUAsync(ctx, tensor, data, bufferSize, sourceOffset); + wait(ctx, future); +} - WGPUQueueWorkDoneCallbackInfo workDoneCallbackInfo = { - .mode = WGPUCallbackMode_AllowSpontaneous, - .callback = - [](WGPUQueueWorkDoneStatus status, void *userdata1, void *userdata2) { - check(status == WGPUQueueWorkDoneStatus_Success, "Queue work done", - __FILE__, __LINE__); - const auto *data = static_cast(userdata1); - WGPUBufferMapCallbackInfo mapCallbackInfo = { - .mode = WGPUCallbackMode_AllowSpontaneous, - .callback = - [](WGPUMapAsyncStatus status, WGPUStringView message, - void *userdata1, void *userdata2) { - const auto *data = static_cast(userdata1); - check(status == WGPUMapAsyncStatus_Success, - "Map readbackBuffer", __FILE__, __LINE__); - const void *mappedData = wgpuBufferGetConstMappedRange( - data->buffer, /*offset=*/0, data->bufferSize); - check(mappedData, "Get mapped range", __FILE__, __LINE__); - memcpy(data->output, mappedData, data->bufferSize); - wgpuBufferUnmap(data->buffer); - data->promise->set_value(); - }, - .userdata1 = const_cast(data), - .userdata2 = nullptr}; - wgpuBufferMapAsync(data->buffer, WGPUMapMode_Read, 0, - data->bufferSize, mapCallbackInfo); - }, - .userdata1 = &callbackData, - .userdata2 = nullptr}; - 
wgpuQueueOnSubmittedWorkDone(ctx.queue, workDoneCallbackInfo); +/** + * @brief Synchronous wrapper for copying from a GPU buffer to CPU memory. + * + * This function synchronously waits for the asynchronous copy operation to + * complete, ensuring that the data is fully transferred from the GPU buffer to + * the CPU memory before returning. + * + * @param ctx Context instance to manage the operation + * @param buffer WGPUBuffer instance representing the GPU buffer to copy from + * @param data Pointer to the CPU memory to copy the data to + * @param size Size of the data buffer in bytes + * @param instance WGPUInstance used for processing events during waiting + * + * @code + * toCPU(ctx, buffer, data, size, instance); + * @endcode + */ +inline void toCPU(Context &ctx, WGPUBuffer buffer, void *data, size_t size, + size_t sourceOffset = 0) { + auto future = toCPUAsync(ctx, buffer, data, size, sourceOffset); + wait(ctx, future); +} - wait(ctx, op.future); - if (op.readbackBuffer) { - wgpuBufferRelease(op.readbackBuffer); - } +/** + * @brief Synchronous wrapper for copying from a Tensor GPU buffer to CPU + * memory for an array of floats instead of a pointer to a float buffer. + * + * This function synchronously waits for the asynchronous copy operation to + * complete, ensuring that the data is fully transferred from the GPU buffer to + * the CPU memory before returning. 
+ * + * @param ctx Context instance to manage the operation + * @param tensor Tensor instance representing the GPU buffer to copy from + * @param data Array of floats to copy the data to + * @param instance WGPUInstance used for processing events during waiting + * + * @code + * toCPU(ctx, tensor, data, instance); + * @endcode + */ +template +inline void toCPU(Context &ctx, Tensor &tensor, std::array &data, + size_t sourceOffset = 0) { + auto future = toCPUAsync(ctx, tensor, data, sourceOffset); + wait(ctx, future); } /** @@ -1375,9 +1768,24 @@ inline Shape cdiv(Shape total, Shape group) { } /** - * @brief A factory function to create a kernel on the GPU. The kernel is - * created with the given WGSL code, input tensors, output tensor, and - * optional parameters. + * @brief Packages the shader compilation information along with a promise for + * asynchronous signaling. + * + * This structure holds a pointer to a CompilationInfo instance that collects + * details such as status, messages, line numbers, and positions from the shader + * compilation. It also contains a shared pointer to a std::promise which + * is used to signal the completion of the asynchronous shader compilation + * process. + */ +struct CompData { + CompilationInfo *compInfo; + std::shared_ptr> compPromise; +}; + +/** + * @brief A factory function to create a kernel asynchronously on the GPU. + * The kernel is created with the given WGSL code, input tensors, + * output tensor, and optional parameters. 
* * Note that the values of the input tensors are not used here, only the * reference handles to the underlying buffers as well as the size of the @@ -1397,34 +1805,40 @@ inline Shape cdiv(Shape total, Shape group) { * @return Kernel instance representing the created kernel * * @code - * Kernel kernel = createKernel(ctx, code, dataBindings, numInputs, + * std::future kernelFuture = createKernelAsync(ctx, code, dataBindings, + numInputs, output, nThreads, params, paramsSize); + * Kernel kernel = wait(ctx.instance, kernelFuture); * @endcode - * output, nThreads, params, paramsSize); + */ -inline Kernel createKernel(Context& ctx, const KernelCode &code, - const Tensor *dataBindings, size_t numTensors, - const size_t *viewOffsets, - const Shape &totalWorkgroups, - const void *params = nullptr, size_t paramsSize = 0, - CompilationInfo *compilationInfo = nullptr, - const char *cacheKey = nullptr) { +inline std::future +createKernelAsync(Context &ctx, const KernelCode &code, + const Tensor *dataBindings, size_t numTensors, + const size_t *viewOffsets, const Shape &totalWorkgroups, + const void *params = nullptr, size_t paramsSize = 0, + CompilationInfo *compilationInfo = nullptr, + const char *cacheKey = nullptr) { // Create a cache key by the pointer values of the data bindings and the // kernel code if (cacheKey != nullptr && ctx.kernelPool.data.find(cacheKey) != ctx.kernelPool.data.end()) { - LOG(kDefLog, kInfo, "Kernel cache hit"); - return ctx.kernelPool.data[cacheKey]; + std::promise ready; + ready.set_value(ctx.kernelPool.data[cacheKey]); + return ready.get_future(); } + // Create an outer promise for the new kernel. 
+ std::promise outerPromise; + std::future outerFuture = outerPromise.get_future(); + assert(totalWorkgroups.rank == 3); WGPUDevice device = ctx.device; WGPUQueue queue = ctx.queue; Kernel op(new RawKernel()); - // paramIndex is the index into bgLayoutEntries for the parameters buffer If // there are no parameters for the kernel, paramsSize == 0 and paramIndex is // effectively undefined (== -1) - size_t paramIndex = -1; + size_t paramIndex = static_cast(-1); // Note: paramIndex is undefined unless paramsSize > 0 size_t numBindings = numTensors; if (paramsSize > 0) { @@ -1433,11 +1847,13 @@ inline Kernel createKernel(Context& ctx, const KernelCode &code, // op.buffers, op.bufferSizes and // bgLayoutEntries } + op->buffers = std::make_unique(numBindings); op->bufferSizes = std::make_unique(numBindings); op->numBindings = numBindings; - std::vector bgLayoutEntries(numBindings); + // Create layout entries for input buffers + std::vector bgLayoutEntries(numBindings); for (size_t i = 0; i < numTensors; ++i) { bgLayoutEntries[i] = WGPUBindGroupLayoutEntry{ .binding = static_cast(i), @@ -1450,8 +1866,6 @@ inline Kernel createKernel(Context& ctx, const KernelCode &code, }; } if (paramsSize > 0) { - LOG(kDefLog, kInfo, "Create layout entry for the params buffer"); - // Create layout entry for the params buffer bgLayoutEntries[paramIndex] = WGPUBindGroupLayoutEntry{ .binding = static_cast(paramIndex), .visibility = WGPUShaderStage_Compute, @@ -1464,10 +1878,11 @@ inline Kernel createKernel(Context& ctx, const KernelCode &code, } WGPUBindGroupLayoutDescriptor bgLayoutDesc = { .entryCount = static_cast(bgLayoutEntries.size()), - .entries = bgLayoutEntries.data(), - }; + .entries = bgLayoutEntries.data()}; WGPUBindGroupLayout bgLayout = wgpuDeviceCreateBindGroupLayout(device, &bgLayoutDesc); + + // Assign buffers from dataBindings. 
for (size_t i = 0; i < numTensors; ++i) { op->buffers[i] = dataBindings[i].data.buffer; op->bufferSizes[i] = dataBindings[i].data.size; @@ -1475,7 +1890,7 @@ inline Kernel createKernel(Context& ctx, const KernelCode &code, // Create a buffer for the Params struct if (paramsSize > 0) { WGPUBufferDescriptor paramsBufferDesc = { - .label = {.data = nullptr, .length = 0}, + .label = {.data = nullptr, .length = 0}, .usage = WGPUBufferUsage_Uniform | WGPUBufferUsage_CopyDst, .size = paramsSize, .mappedAtCreation = false, @@ -1487,6 +1902,8 @@ inline Kernel createKernel(Context& ctx, const KernelCode &code, } else { LOG(kDefLog, kTrace, "No params buffer needed"); } + + // Build bind group entries and the bind group. std::vector bindGroupEntries(numBindings); for (size_t i = 0; i < numTensors; ++i) { bindGroupEntries[i] = WGPUBindGroupEntry{ @@ -1514,6 +1931,7 @@ inline Kernel createKernel(Context& ctx, const KernelCode &code, }; op->bindGroup = wgpuDeviceCreateBindGroup(device, &bindGroupDesc); + // Create pipeline layout. WGPUPipelineLayoutDescriptor pipelineLayoutDesc = { .bindGroupLayoutCount = 1, .bindGroupLayouts = &bgLayout, @@ -1521,63 +1939,151 @@ inline Kernel createKernel(Context& ctx, const KernelCode &code, WGPUPipelineLayout pipelineLayout = wgpuDeviceCreatePipelineLayout(device, &pipelineLayoutDesc); + // Prepare the WGSL source and shader module descriptor. WGPUShaderSourceWGSL wgslDesc = { .chain = {.sType = WGPUSType_ShaderSourceWGSL}, .code = {.data = code.data.c_str(), .length = code.data.length()}}; - WGPUShaderModuleDescriptor shaderModuleDesc = {}; shaderModuleDesc.nextInChain = &wgslDesc.chain; shaderModuleDesc.label = {code.label.c_str(), code.label.length()}; - WGPUComputePipelineDescriptor computePipelineDesc = {}; - computePipelineDesc.layout = pipelineLayout; - computePipelineDesc.compute.module = + // Create the shader module. 
+ WGPUShaderModule shaderModule = wgpuDeviceCreateShaderModule(device, &shaderModuleDesc); + // If compilation info is requested, register the callback immediately. + if (compilationInfo) { + auto compPromise = std::make_shared>(); + std::future compFuture = compPromise->get_future(); + // Allocate helper data to pass to the callback. + auto *compData = new CompData{compilationInfo, compPromise}; + + auto compilationCallback = [](WGPUCompilationInfoRequestStatus status, + WGPUCompilationInfo const *info, + void *userdata1, void * /*userdata2*/) { + CompData *cd = reinterpret_cast(userdata1); + if (info && cd->compInfo) { + cd->compInfo->status = status; + for (uint32_t i = 0; i < info->messageCount; ++i) { + cd->compInfo->messages.push_back( + std::string(info->messages[i].message.data, + info->messages[i].message.length)); + cd->compInfo->lineNums.push_back(info->messages[i].lineNum); + cd->compInfo->linePos.push_back(info->messages[i].linePos); + } + cd->compInfo->finished = true; + } + cd->compPromise->set_value(); + delete cd; + }; + + WGPUCompilationInfoCallbackInfo compilationCallbackInfo = {}; + compilationCallbackInfo.mode = WGPUCallbackMode_AllowSpontaneous; + compilationCallbackInfo.callback = compilationCallback; + compilationCallbackInfo.userdata1 = compData; + compilationCallbackInfo.userdata2 = nullptr; + + // Register callback and then wait for the result. + wgpuShaderModuleGetCompilationInfo(shaderModule, compilationCallbackInfo); + wait(ctx, compFuture); + } + + // Now create the compute pipeline using the shader module. 
+ WGPUComputePipelineDescriptor computePipelineDesc = {}; + computePipelineDesc.layout = pipelineLayout; + computePipelineDesc.compute.module = shaderModule; computePipelineDesc.compute.entryPoint = {code.entryPoint.c_str(), code.entryPoint.length()}; computePipelineDesc.label = {code.label.c_str(), code.label.length()}; - op->computePipeline = wgpuDeviceCreateComputePipeline(device, &computePipelineDesc); + op->totalWorkgroups = {totalWorkgroups[0], totalWorkgroups[1], totalWorkgroups[2]}; + resetCommandBuffer(device, op); if (cacheKey != nullptr) ctx.kernelPool.data[cacheKey] = op; - auto compilationInfoCallback = [](WGPUCompilationInfoRequestStatus status, - WGPUCompilationInfo const *compilationInfo, - void *userdata1, void *userdata2) { - CompilationInfo *result = static_cast(userdata1); - if (compilationInfo && result) { - result->status = status; - for (uint32_t i = 0; i < compilationInfo->messageCount; ++i) { - printf("Message %d: %.*s\n", i, - static_cast(compilationInfo->messages[i].message.length), - compilationInfo->messages[i].message.data); - result->messages.push_back( - std::string(compilationInfo->messages[i].message.data, - compilationInfo->messages[i].message.length)); - result->lineNums.push_back(compilationInfo->messages[i].lineNum); - result->linePos.push_back(compilationInfo->messages[i].linePos); - } - result->finished = true; - } else { - LOG(kDefLog, kTrace, "No compilation info or result"); - } - }; + outerPromise.set_value(op); + return outerFuture; +} - WGPUCompilationInfoCallbackInfo compilationCallbackInfo = { - .mode = WGPUCallbackMode_AllowSpontaneous, - .callback = compilationInfoCallback, - .userdata1 = static_cast(compilationInfo), - .userdata2 = nullptr}; +/* + * @brief Overload which wraps the createKernelAsync factory function to create + * a kernel on the GPU. This overload uses takes a pointer and size for the + * input tensors instead of a static collection and a void pointer for params + * instead of a static type. 
+ * + * @param[in] ctx Context instance to manage the kernel + * @param[in] code WGSL code for the kernel + * @param[in] dataBindings Pointer to a span of tensors bound to the kernel + * @param[in] numTensors Number of tensors in the dataBindings span + * @param[in] totalWorkgroups Number of workgroups in the x, y, z grid, must be + * a Shape of rank == 3. + * @param[in] params Optional parameters for the kernel. If the kernel does + * not have any parameters, use NoParam. + * @return Kernel instance representing the created kernel + * + * @code + * std::future kernelFuture = createKernel(ctx, code, tensorData, + * output,totalWorkgroups, params); Kernel kernel = wait(ctx.instance, + * kernelFuture); + * @endcode + */ +inline Kernel createKernel(Context &ctx, const KernelCode &code, + const Tensor *dataBindings, size_t numTensors, + const size_t *viewOffsets, + const Shape &totalWorkgroups, + const void *params = nullptr, size_t paramsSize = 0, + CompilationInfo *compilationInfo = nullptr, + const char *cacheKey = nullptr) { + std::future kernelFuture = createKernelAsync( + ctx, code, dataBindings, numTensors, viewOffsets, totalWorkgroups, params, + paramsSize, compilationInfo, cacheKey); + return wait(ctx, kernelFuture); +} - while (compilationInfo && !compilationInfo->finished) { - processEvents(ctx.instance); +/** + * @brief Overload which wraps the createKernelAsync factory function to create + * a kernel asynchronously on the GPU. This overload uses takes a static + * collection of input tensors instead of a pointer and a statically determined + * ParamsType instead of casting params to a void pointer. + * + * @param[in] ctx Context instance to manage the kernel + * @param[in] code WGSL code for the kernel + * @param[in] dataBindings A Bindings of tensors whose GPU buffers are bound + * to the kernel as inputs and outputs. + * @param[in] totalWorkgroups Number of workgroups in the x, y, z grid, must be + * a Shape of rank == 3. 
+ * @param[in] params Optional parameters for the kernel. If the kernel does + * not have any parameters, use NoParam. + * @return Kernel instance representing the created kernel + * + * @code + * std::future kernelFuture = createKernel(ctx, code, tensorData, + * output,totalWorkgroups, params); Kernel kernel = wait(ctx.instance, + * kernelFuture); + * @endcode + */ +template +std::future +createKernelAsync(Context &ctx, const KernelCode &code, + const Bindings &dataBindings, + const Shape &totalWorkgroups, + const ParamsType ¶ms = ParamsType{}, + CompilationInfo *compilationInfo = nullptr, + const char *cacheKey = nullptr) { + if constexpr (!IsNoParam) { + return createKernelAsync(ctx, code, dataBindings.data.data(), numInputs, + dataBindings.viewOffsets.data(), totalWorkgroups, + reinterpret_cast(¶ms), + sizeof(ParamsType), compilationInfo, cacheKey); + } else { + return createKernelAsync(ctx, code, dataBindings.data.data(), numInputs, + dataBindings.viewOffsets.data(), totalWorkgroups, + nullptr, 0, compilationInfo, cacheKey); } - return op; } /** @@ -1597,9 +2103,9 @@ inline Kernel createKernel(Context& ctx, const KernelCode &code, * @return Kernel instance representing the created kernel * * @code - * Kernel kernel = createKernel(ctx, code, tensorData, output, + * Kernel kernel = createKernel(ctx, code, tensorData, output,totalWorkgroups, + * params); * @endcode - * totalWorkgroups, params); */ template Kernel createKernel(Context &ctx, const KernelCode &code, @@ -1620,6 +2126,37 @@ Kernel createKernel(Context &ctx, const KernelCode &code, } } +/** + * @brief Free‑standing callback for dispatchKernel’s asynchronous work‐done. + * + * This callback is invoked when the GPU queue signals the completion of the + * submitted workload for a kernel dispatch. It receives the work-done status + * and a userdata pointer, which is expected to be a heap‑allocated pointer to a + * std::promise. + * + * On success, the promise is fulfilled by calling set_value(). 
Otherwise, it is + * set with an exception. After setting the promise state, the allocated memory + * for the promise is freed. + * + * @param status The status of the work done. Expected to be + * WGPUQueueWorkDoneStatus_Success on success. + * @param userdata1 A heap allocated pointer to std::promise which is set + * when the work is done. + * @param userdata2 Unused. + */ +inline void dispatchKernelCallback(WGPUQueueWorkDoneStatus status, + void *userdata1, void * /*userdata2*/) { + // Cast the userdata pointer back to our heap‑allocated promise. + auto *p = reinterpret_cast *>(userdata1); + if (status == WGPUQueueWorkDoneStatus_Success) { + p->set_value(); + } else { + p->set_exception(std::make_exception_ptr( + std::runtime_error("Queue work did not complete successfully."))); + } + delete p; // free the heap allocation +} + /** * @brief Asynchronously submits a kernel to the GPU queue for execution. * It also sets up a callback to notify when the kernel has finished executing @@ -1635,30 +2172,54 @@ Kernel createKernel(Context &ctx, const KernelCode &code, * @param[in] promise Promise to set when the kernel has finished executing * * @code - * dispatchKernel(ctx, kernel); + * std::future dispatchFuture = dispatchKernel(ctx, kernel); + * wait(ctx.instance, dispatchFuture); * @endcode */ -inline void dispatchKernel(Context &ctx, Kernel &kernel, - std::promise &promise) { +inline std::future dispatchKernelAsync(Context &ctx, Kernel &kernel) { + // If the kernel was used before, reset the command buffer. if (kernel->used) { resetCommandBuffer(ctx.device, kernel); } + + // Submit the command buffer and release it. 
wgpuQueueSubmit(ctx.queue, 1, &kernel->commandBuffer); wgpuCommandBufferRelease(kernel->commandBuffer); kernel->used = true; - WGPUQueueWorkDoneCallbackInfo workDoneCallbackInfo = { - .mode = WGPUCallbackMode_AllowSpontaneous, - .callback = - [](WGPUQueueWorkDoneStatus status, void *userdata1, void *userdata2) { - check(status == WGPUQueueWorkDoneStatus_Success, "Queue work done", - __FILE__, __LINE__); - auto *promise = static_cast *>(userdata1); - promise->set_value(); - }, - .userdata1 = &promise, - .userdata2 = nullptr}; + // Allocate a promise on the heap so it remains valid beyond this function’s + // scope. + std::promise *promise = new std::promise(); + std::future future = promise->get_future(); + + // Set up the callback info. + WGPUQueueWorkDoneCallbackInfo workDoneCallbackInfo = {}; + workDoneCallbackInfo.mode = WGPUCallbackMode_AllowSpontaneous; + workDoneCallbackInfo.callback = dispatchKernelCallback; + workDoneCallbackInfo.userdata1 = reinterpret_cast(promise); + workDoneCallbackInfo.userdata2 = nullptr; + + // IMPORTANT: Pass the address of the callback info structure. wgpuQueueOnSubmittedWorkDone(ctx.queue, workDoneCallbackInfo); + + return future; +} + +/** + * @brief Synchronous wrapper for dispatchKernelAsync. This function submits + * the kernel to the GPU queue and waits for it to finish executing. 
+ * + * @param[in] ctx Context instance to manage the kernel, from which the queue + * for the GPU is obtained + * @param[in] kernel Kernel instance to dispatch + * + * @code + * dispatchKernel(ctx, kernel); + * @endcode + */ +inline void dispatchKernel(Context &ctx, Kernel &kernel) { + auto future = dispatchKernelAsync(ctx, kernel); + wait(ctx, future); } } // namespace gpu diff --git a/numeric_types/half.cpp b/numeric_types/half.cpp index e5bdaf0..c183754 100644 --- a/numeric_types/half.cpp +++ b/numeric_types/half.cpp @@ -228,12 +228,9 @@ fn main( } Tensor input = createTensor(ctx, Shape{N}, kf16, inputArr.data()); Tensor output = createTensor(ctx, Shape{N}, kf16); - std::promise promise; - std::future future = promise.get_future(); Kernel op = createKernel(ctx, {kGelu, 256, kf16}, Bindings{input, output}, {cdiv(N, 256), 1, 1}); - dispatchKernel(ctx, op, promise); - wait(ctx, future); + dispatchKernel(ctx, op); toCPU(ctx, output, outputArr.data(), sizeof(outputArr)); for (int i = 0; i < 12; ++i) { printf(" gelu(%.2f) = %.2f\n", static_cast(inputArr[i]), @@ -241,7 +238,7 @@ fn main( } } -int main() { +int testHalfMain() { printf("\nHalf-precision float tests\n==========================\n"); printf("\nRegular values float round trips\n\n"); diff --git a/test/test_gpu.cpp b/test/test_gpu.cpp new file mode 100644 index 0000000..b855712 --- /dev/null +++ b/test/test_gpu.cpp @@ -0,0 +1,250 @@ +#include "gpu.hpp" +#include +#include +#include +#include +#include +#include +#include + +using namespace gpu; +using namespace std::chrono; + + +// Forward declarations: +void testToCPUWithTensor(); +void testToCPUWithBuffer(); +void testToCPUWithTensorSourceOffset(); +void testToCPUWithBufferSourceOffset(); +void stressTestToCPU(); + +int main() { + LOG(kDefLog, kInfo, "Running GPU integration tests..."); + testToCPUWithTensor(); + testToCPUWithBuffer(); + testToCPUWithTensorSourceOffset(); + testToCPUWithBufferSourceOffset(); + stressTestToCPU(); + LOG(kDefLog, kInfo, "All 
tests passed."); + return 0; +} + + +// A simple WGSL copy kernel that copies input to output. +static const char *kCopyKernel = R"( +@group(0) @binding(0) var inp: array<{{precision}}>; +@group(0) @binding(1) var out: array<{{precision}}>; +@group(0) @binding(1) var dummy: array<{{precision}}>; +@compute @workgroup_size({{workgroupSize}}) +fn main(@builtin(global_invocation_id) gid: vec3) { + let i: u32 = gid.x; + if (i < arrayLength(&inp)) { + out[i] = inp[i]; + } +} +)"; + + +// Test using the overload that takes a Tensor. +void testToCPUWithTensor() { + LOG(kDefLog, kInfo, "Running testToCPUWithTensor..."); + +// Create a real GPU context. +#ifdef USE_DAWN_API + Context ctx = createContextByGpuIdx(0); +#else + Context ctx = createContext(); +#endif + + constexpr size_t N = 1024; + std::array inputData, outputData; + for (size_t i = 0; i < N; ++i) { + inputData[i] = static_cast(i); + outputData[i] = 0.0f; + } + + // Create input and output tensors. + Tensor inputTensor = createTensor(ctx, Shape{N}, kf32, inputData.data()); + Tensor outputTensor = createTensor(ctx, Shape{N}, kf32); + + // Create and dispatch the copy kernel. + Kernel copyKernel = + createKernel(ctx, {kCopyKernel, 256, kf32}, + Bindings{inputTensor, outputTensor}, {cdiv(N, 256), 1, 1}); + dispatchKernel(ctx, copyKernel); + + // Synchronously copy GPU output to CPU using the tensor overload. + toCPU(ctx, outputTensor, outputData.data(), sizeof(outputData)); + + // Verify the output matches the input. + for (size_t i = 0; i < N; ++i) { + LOG(kDefLog, kInfo, "inputData[%zu] = %f", i, inputData[i]); + LOG(kDefLog, kInfo, "outputData[%zu] = %f", i, outputData[i]); + assert(outputData[i] == inputData[i]); + } + LOG(kDefLog, kInfo, "testToCPUWithTensor passed."); +} + +// Test using the overload that takes a raw GPU buffer. +// We reuse the Tensor's underlying buffer for this test. 
+void testToCPUWithBuffer() { + LOG(kDefLog, kInfo, "Running testToCPUWithBuffer..."); + +#ifdef USE_DAWN_API + Context ctx = createContextByGpuIdx(0); +#else + Context ctx = createContext(); +#endif + + constexpr size_t N = 1024; + std::array data, outputData; + for (size_t i = 0; i < N; ++i) { + data[i] = static_cast(i * 2); + outputData[i] = 0.0f; + } + + // Create a tensor to allocate a GPU buffer and initialize it. + Tensor tensor = createTensor(ctx, Shape{N}, kf32, data.data()); + + // Now extract the raw GPU buffer from the tensor. + WGPUBuffer gpuBuffer = tensor.data.buffer; + + // Use the WGPUBuffer overload. This call returns a future. + auto future = + toCPUAsync(ctx, gpuBuffer, outputData.data(), sizeof(outputData), 0); + wait(ctx, future); + + // Verify that the CPU output matches the original data. + for (size_t i = 0; i < N; ++i) { + LOG(kDefLog, kInfo, "outputData[%zu] = %f", i, outputData[i]); + assert(outputData[i] == data[i]); + } + LOG(kDefLog, kInfo, "testToCPUWithBuffer passed."); +} + +void testToCPUWithTensorSourceOffset() { + LOG(kDefLog, kInfo, "Running testToCPUWithTensorSourceOffset..."); +#ifdef USE_DAWN_API + Context ctx = createContextByGpuIdx(0); +#else + Context ctx = createContext(); +#endif + + constexpr size_t numElements = 25; + constexpr size_t sourceOffsetElements = 5; // Skip first 5 elements + constexpr size_t copyCount = 10; // Number of floats to copy + size_t copySize = copyCount * sizeof(float); + + // Create an input array with known data. + std::array inputData{}; + for (size_t i = 0; i < numElements; ++i) { + inputData[i] = static_cast(i + 50); // Arbitrary values + } + // Create a tensor from the full data. + Tensor tensor = createTensor(ctx, Shape{numElements}, kf32, inputData.data()); + + // Allocate a destination CPU buffer exactly as large as the data we want to + // copy. 
+ std::vector cpuOutput(copyCount, -1.0f); + + // Set sourceOffset to skip the first few float elements + size_t sourceOffsetBytes = sourceOffsetElements * sizeof(float); + // Call the tensor overload with sourceOffset and destOffset = 0. + auto future = + toCPUAsync(ctx, tensor, cpuOutput.data(), copySize, sourceOffsetBytes); + wait(ctx, future); + + // Verify the copied data matches the expected subset. + for (size_t i = 0; i < copyCount; ++i) { + float expected = inputData[sourceOffsetElements + i]; + float actual = cpuOutput[i]; + LOG(kDefLog, kInfo, "cpuOutput[%zu] = %f", i, actual); + LOG(kDefLog, kInfo, "expected[%zu] = %f", i, expected); + assert(expected == actual); + } + LOG(kDefLog, kInfo, "testToCPUWithTensorSourceOffset passed."); +} + +void testToCPUWithBufferSourceOffset() { + LOG(kDefLog, kInfo, "Running testToCPUWithBufferSourceOffset..."); +#ifdef USE_DAWN_API + Context ctx = createContextByGpuIdx(0); +#else + Context ctx = createContext(); +#endif + + constexpr size_t numElements = 30; + constexpr size_t sourceOffsetElements = 7; // Skip first 7 elements + constexpr size_t copyCount = 12; // Number of floats to copy + size_t copySize = copyCount * sizeof(float); + + // Create an input array with arbitrary data. + std::array inputData{}; + for (size_t i = 0; i < numElements; ++i) { + inputData[i] = static_cast(i + 100); + } + // Create a tensor to initialize a GPU buffer. + Tensor tensor = createTensor(ctx, Shape{numElements}, kf32, inputData.data()); + // Extract the raw GPU buffer from the tensor. + WGPUBuffer buffer = tensor.data.buffer; + + // Allocate a destination CPU buffer exactly as large as needed. + std::vector cpuOutput(copyCount, -2.0f); + size_t sourceOffsetBytes = sourceOffsetElements * sizeof(float); + + // Call the buffer overload with sourceOffset and destOffset = 0. 
+ auto future = + toCPUAsync(ctx, buffer, cpuOutput.data(), copySize, sourceOffsetBytes); + wait(ctx, future); + + // Verify that the copied data matches the expected subset. + for (size_t i = 0; i < copyCount; ++i) { + float expected = inputData[sourceOffsetElements + i]; + float actual = cpuOutput[i]; + LOG(kDefLog, kInfo, "cpuOutput[%zu] = %f", i, actual); + LOG(kDefLog, kInfo, "expected[%zu] = %f", i, expected); + assert(expected == actual); + } + LOG(kDefLog, kInfo, "testToCPUWithBufferSourceOffset passed."); +} + +void stressTestToCPU() { + LOG(kDefLog, kInfo, "Running stressTestToCPU for 2 seconds..."); + +#ifdef USE_DAWN_API + Context ctx = createContextByGpuIdx(0); +#else + Context ctx = createContext(); +#endif + + constexpr size_t N = 1024; + // Create a persistent tensor with some test data. + std::vector inputData(N, 0.0f); + for (size_t i = 0; i < N; ++i) { + inputData[i] = static_cast(i); + } + Tensor tensor = createTensor(ctx, Shape{N}, kf32, inputData.data()); + + // Prepare to run for one second. + auto startTime = high_resolution_clock::now(); + size_t opCount = 0; + while (high_resolution_clock::now() - startTime < seconds(2)) { + // Allocate an output buffer (using a shared_ptr so it stays valid until the future completes) + auto outputData = std::make_shared>(N, 0.0f); + // Use the tensor overload; we’re copying the entire tensor (destOffset = 0) + LOG(kDefLog, kInfo, "Copying %zu bytes from GPU to CPU...", N * sizeof(float)); + // log count + LOG(kDefLog, kInfo, "opCount = %zu", opCount); + auto fut = toCPUAsync(ctx, tensor, outputData->data(), N * sizeof(float), 0); + wait(ctx, fut); + ++opCount; + } + + auto endTime = high_resolution_clock::now(); + auto totalMs = duration_cast(endTime - startTime).count(); + double throughput = (opCount / (totalMs / 1000.0)); + + LOG(kDefLog, kInfo, "Stress test completed:\n" + " %zu GPU to CPU operations in %lld ms\n" + " Throughput: %.2f ops/sec", opCount, totalMs, throughput); +}