diff --git a/examples/matmul/Makefile b/examples/matmul/Makefile index 334e8eb..d47de4d 100644 --- a/examples/matmul/Makefile +++ b/examples/matmul/Makefile @@ -10,15 +10,15 @@ ifeq ($(shell $(CXX) -std=c++17 -x c++ -E -include array - < /dev/null > /dev/nu else STDLIB := -stdlib=libc++ endif -FLAGS=-std=c++17 $(STDLIB) -I$(GPUCPP) -I$(GPUCPP)/third_party/headers -L$(GPUCPP)/third_party/lib run.cpp -ldl -ldawn +FLAGS=-std=c++17 $(STDLIB) -I$(GPUCPP) -I$(GPUCPP)/third_party/headers -L$(GPUCPP)/third_party/lib -ldl -ldawn -fexceptions run: ./build/$(TARGET) $(LIBSPEC) && ./build/$(TARGET) # Use clang -v to see the include paths # Note in this example optimization is turned on -build/$(TARGET): run.cpp - mkdir -p build && $(CXX) $(FLAGS) -o ./build/$(TARGET) +build/$(TARGET): run.cpp ../../gpu.h + mkdir -p build && $(CXX) $(FLAGS) -o $@ $< watch: @command -v entr >/dev/null 2>&1 || { echo >&2 "Please install entr with 'brew install entr' or 'sudo apt-get install entr'"; exit 1; } diff --git a/examples/matmul/run.cpp b/examples/matmul/run.cpp index 06e9297..55a99aa 100644 --- a/examples/matmul/run.cpp +++ b/examples/matmul/run.cpp @@ -15,6 +15,34 @@ using namespace gpu; +struct MatmulParameters { + size_t M; + size_t K; + size_t N; + NumType numtype; + size_t BM; + size_t BK; + size_t BN; + size_t TM; + size_t TN; + bool interactive; + bool transpose; +}; + +void show_parameters(const MatmulParameters ¶meters, double duration) { + double tflops = 2 * parameters.M * parameters.N * parameters.K / + duration / 1000000.0; + LOG(kDefLog, kInfo, + "M: %4d, K: %4d, N: %4d, " + "BM: %3d, BK: %3d, BN: %3d, TM: %3d, TN: %3d, " + "TRANSPOSE: %d, " + "TFlops: %.2f", + parameters.M, parameters.K, parameters.N, + parameters.BM, parameters.BK, parameters.BN, parameters.TM, parameters.TN, + (int) parameters.transpose, + tflops); +} + const std::string versionToStr(int version); void matmulf16_forward_cpu(half* out, @@ -677,8 +705,17 @@ void checkCPU(size_t M, size_t K, size_t N, std::unique_ptr &inputP Kernel selectMatmul(Context &ctx, int version, const Bindings &bindings, - size_t M, size_t K, size_t N, NumType numtype) { + const MatmulParameters ¶meters) { Kernel kernel; + size_t M = parameters.M; + size_t K = parameters.K; + size_t N = parameters.N; + NumType numtype = parameters.numtype; + size_t BM = parameters.BM; + size_t BK = parameters.BK; + size_t BN = parameters.BN; + size_t TM = parameters.TM; + size_t TN = parameters.TN; if (version == 1) { Shape wgSize = {256, 1, 1}; Shape nWorkgroups = cdiv({M, N, 1}, {16, 16, 1}); @@ -700,11 +737,10 @@ Kernel selectMatmul(Context &ctx, int version, createKernel(ctx, matmul, bindings, /* nWorkgroups*/ cdiv({M, N, 1}, {tileSize, tileSize, 1})); } else if (version == 4 || version == 6) { - static constexpr size_t BM = 64; - static constexpr size_t BK = 4; - static constexpr size_t BN = BM; - static constexpr size_t TM = - BN / BK; // BM * BN / TM == BM * BK, therefore TM == BN / BK + BM = 64; + BK = 4; + BN = BM; + TM = BN / BK; // BM * BN / TM == BM * BK, therefore TM == BN / BK Shape wgSize = {BM * BN / TM, 1, 1}; // BM * BN values per workgroup, TM values per thread Shape nWorkgroups = {cdiv(M, BM), cdiv(N, BN), 1}; @@ -719,11 +755,11 @@ Kernel selectMatmul(Context &ctx, int version, kernel = createKernel(ctx, matmul, bindings, /*nWorkgroups*/ nWorkgroups); } else if (version == 5 || version == 7) { - static constexpr size_t BM = 64; - static constexpr size_t BK = 8; - static constexpr size_t BN = 64; - static constexpr size_t TM = BM / BK; - static constexpr size_t TN = BN / BK; + BM = 64; + BK = 8; + BN = 64; + TM = BM / BK; + TN = BN / BK; Shape wgSize = {(BM / TM) * (BN / TN), 1, 1}; // This is the same as BK * BK. Shape nWorkgroups = {cdiv(M, BM), cdiv(N, BN), 1}; LOG(kDefLog, kInfo, "M: %d, K: %d, N: %d", M, K, N); @@ -737,11 +773,12 @@ Kernel selectMatmul(Context &ctx, int version, kernel = createKernel(ctx, matmul, bindings, /*nWorkgroups*/ nWorkgroups); } else if (version == 8 || version == 10) { - static constexpr size_t BM = 64; - static constexpr size_t BK = 8; - static constexpr size_t BN = 64; - static constexpr size_t TM = BM / BK; - static constexpr size_t TN = BN / BK; + // These parameters are optimized for m2 pro. + BM = 128; + BK = 16; + BN = 64; + TM = 4; + TN = 8; Shape wgSize = {(BM / TM) * (BN / TN), 1, 1}; // This is the same as BK * BK. Shape nWorkgroups = {cdiv(M, BM), cdiv(N, BN), 1}; LOG(kDefLog, kInfo, "M: %d, K: %d, N: %d", M, K, N); @@ -755,11 +792,11 @@ Kernel selectMatmul(Context &ctx, int version, kernel = createKernel(ctx, matmul, bindings, /*nWorkgroups*/ nWorkgroups); } else if (version == 9 || version == 11) { - static constexpr size_t BM = 64; - static constexpr size_t BK = 8; - static constexpr size_t BN = 64; - static constexpr size_t TM = BM / BK; - static constexpr size_t TN = BN / BK; + BM = 64; + BK = 8; + BN = 64; + TM = BM / BK; + TN = BN / BK; Shape wgSize = {(BM / TM) * (BN / TN), 1, 1}; // This is the same as BK * BK. Shape nWorkgroups = {cdiv(M, BM), cdiv(N, BN), 1}; LOG(kDefLog, kInfo, "M: %d, K: %d, N: %d", M, K, N); @@ -771,16 +808,39 @@ Kernel selectMatmul(Context &ctx, int version, numtype); kernel = createKernel(ctx, matmul, bindings, /*nWorkgroups*/ nWorkgroups); + } else if (version == 12) { + Shape wgSize = {(BM / TM) * (BN / TN), 1, 1}; // This is the same as BK * BK. + Shape nWorkgroups = {cdiv(M, BM), cdiv(N, BN), 1}; + LOG(kDefLog, kInfo, "M: %d, K: %d, N: %d", M, K, N); + LOG(kDefLog, kInfo, "BM: %d, BK: %d, BN: %d, TM: %d, TN: %d", BM, BK, BN, TM, TN); + LOG(kDefLog, kInfo, "wgSize: ( %s )", toString(wgSize).c_str()); + LOG(kDefLog, kInfo, "nWorkgroups: ( %s )", toString(nWorkgroups).c_str()); + if (parameters.transpose) { + KernelCode matmul = createMatmulWithTranspose(kShaderMatmulWithTranspose, M, K, N, BM, BK, BN, TM, TN, + /*wgSize*/ wgSize, + numtype); + kernel = createKernel(ctx, matmul, bindings,nWorkgroups); + } else { + KernelCode matmul = createMatmulWithVectorization(kShaderMatmulWithVectorization, M, K, N, BM, BK, BN, TM, TN, + /*wgSize*/ wgSize, + numtype, + /*Loop unrolling*/ true); + kernel = createKernel(ctx, matmul, bindings, nWorkgroups); + } } return kernel; } -template -void runTest(int version, size_t M, size_t K, size_t N, - std::unique_ptr &inputPtr, - std::unique_ptr &weightsPtr, - std::unique_ptr &outputPtr, - NumType numtype) { +template +double runTest(int version, + std::unique_ptr &inputPtr, + std::unique_ptr &weightsPtr, + std::unique_ptr &outputPtr, + const MatmulParameters ¶meters) { + size_t M = parameters.M; + size_t K = parameters.K; + size_t N = parameters.N; + NumType numtype = parameters.numtype; if constexpr (std::is_same::value) { assert(numtype == kf32); } else if constexpr (std::is_same::value) { @@ -799,8 +859,6 @@ void runTest(int version, size_t M, size_t K, size_t N, Tensor input = createTensor(ctx, Shape{M, K}, numtype, inputPtr.get()); Tensor weights = createTensor(ctx, Shape{N, K}, numtype, weightsPtr.get()); // column-major - constexpr size_t nIter = 30; - // Initialize Kernel and bind GPU buffers @@ -809,14 +867,19 @@ void runTest(int version, size_t M, size_t K, size_t N, std::array, nIter> futures; std::array kernels; std::array outputs; + auto logLevel = kDefLog; + kDefLog = {stdout, "", kError}; for (int i = 0; i < nIter; i++) { futures[i] = promises[i].get_future(); outputs[i] = createTensor(ctx, Shape{M, N}, numtype); - kernels[i] = selectMatmul(ctx, version, {input, weights, outputs[i]}, M, K, N, numtype); + kernels[i] = selectMatmul(ctx, version, {input, weights, outputs[i]}, parameters); } + kDefLog = logLevel; - printf("[ Press enter to start tests ... ]\n"); - getchar(); + if (parameters.interactive){ + printf("[ Press enter to start tests ... ]\n"); + getchar(); + } LOG(kDefLog, kInfo, "Dispatching Kernel version %d: %s, %d iterations ...", version, versionToStr(version).c_str(), nIter); @@ -851,28 +914,103 @@ void runTest(int version, size_t M, size_t K, size_t N, "GFLOPS\n================================================================" "================\n\n", M, K, N, nIter, duration.count() / static_cast(nIter) / 1000.0 /* us -> ms */, gflops); + return (duration.count() / static_cast(nIter)); } -template -void runTestWithCheck(int version, size_t M, size_t K, size_t N, - bool transposedInput, int kTestSize, NumType numtype) { - std::unique_ptr inputPtr = std::make_unique(M * K); - std::unique_ptr weightsPtr = std::make_unique(N * K); - std::unique_ptr outputPtr = std::make_unique(M * N); - - initData(M, K, N, inputPtr, weightsPtr); - if (transposedInput) { - std::unique_ptr transposedWeightPtr = std::make_unique(K * N); - transpose(weightsPtr.get(), transposedWeightPtr.get(), N, K); - runTest(version, M, K, N, inputPtr, transposedWeightPtr, outputPtr, numtype); - } else { - runTest(version, M, K, N, inputPtr, weightsPtr, outputPtr, numtype); - } +template +double runTestWithCheck(int version, + int kTestSize, + const MatmulParameters ¶meters) { + bool transposedInput = parameters.transpose; + size_t M = parameters.M; + size_t K = parameters.K; + size_t N = parameters.N; + NumType numtype = parameters.numtype; + std::unique_ptr inputPtr = std::make_unique(M * K); + std::unique_ptr weightsPtr = std::make_unique(N * K); + std::unique_ptr outputPtr = std::make_unique(M * N); + double duration; + + initData(M, K, N, inputPtr, weightsPtr); + if (transposedInput) { + std::unique_ptr transposedWeightPtr = std::make_unique(K * N); + transpose(weightsPtr.get(), transposedWeightPtr.get(), N, K); + duration = runTest(version, inputPtr, transposedWeightPtr, outputPtr, parameters); + } else { + duration = runTest(version, inputPtr, weightsPtr, outputPtr, parameters); + } + + if (kTestSize <= 1) { + // Check result with CPU reference implementation for tiny/small tests + checkCPU(M, K, N, inputPtr, weightsPtr, outputPtr); + } + return duration; +} - if (kTestSize <= 1) { - // Check result with CPU reference implementation for tiny/small tests - checkCPU(M, K, N, inputPtr, weightsPtr, outputPtr); +template +MatmulParameters runAutotune(int version, int kTestSize, const MatmulParameters& default_parameters) { + size_t BM_VALUES[]={64, 128, 256}; + size_t BN_VALUES[]={64, 128, 256}; + size_t BK_VALUES[]={8, 16, 32, 64}; + size_t TM_VALUES[]={4, 8, 16, 32}; + size_t TN_VALUES[]={4, 8, 16, 32}; + size_t NUM_THREADS_VALUES[]={64, 128, 256}; + bool TRANSPOSE_VALUES[]={false, true}; + + double min_duration = FLT_MAX; + MatmulParameters min_parameters; + + for(auto TRANSPOSE: TRANSPOSE_VALUES){ + for(auto BM: BM_VALUES){ + for(auto BN: BN_VALUES){ + for(auto BK: BK_VALUES){ + for(auto TM: TM_VALUES){ + for(auto TN: TN_VALUES){ + for(auto NUM_THREADS: NUM_THREADS_VALUES){ + MatmulParameters parameters = { + .M = default_parameters.M, + .K = default_parameters.K, + .N = default_parameters.N, + .numtype = kf16, + .BM = BM, + .BK = BK, + .BN = BN, + .TM = TM, + .TN = TN, + .interactive = false, + .transpose = TRANSPOSE, + }; + if (BM % TM == 0 && + BN % TN == 0 && + (BM * BN) / (TM * TN) == NUM_THREADS && + (BM*BK+BN*BK)*2 <= 16384 // The limit of workgroup storage + ) { + kDefLog = {stdout, "", kError}; + double duration; + try { + duration = runTestWithCheck(version, kTestSize, parameters); + } catch (const std::runtime_error& e) { + LOG(kDefLog, kError, "Exception:\n%s", e.what()); + continue; + } catch (const std::exception &e) { + LOG(kDefLog, kError, "Exception:\n%s", e.what()); + continue; + } + kDefLog = {stdout, "", kInfo}; + show_parameters(parameters, duration); + if (duration < min_duration) { + min_duration = duration; + min_parameters = parameters; + } + } + } + } + } + } + } } + } + return min_parameters; } const std::string versionToStr(int version){ @@ -888,52 +1026,59 @@ const std::string versionToStr(int version){ case 9: return "f32: 2D blocktiling with loop unrolling, vectorization and transpose"; case 10: return "f16: 2D blocktiling with loop unrolling and vectorization"; case 11: return "f16: 2D blocktiling with loop unrolling, vectorization and transpose"; + case 12: return "f16: autotune"; default: return "Not specified"; } } int main() { + // The meaning of version is the same as versionToStr's one. char* version_str = getenv("MATMUL_VERSION"); int version = version_str == NULL ? 10 : atoi(version_str); - // 1 == f32: No-Op - // 2 == f32: naive matmul - // 3 == f32: tiling - // 4 == f32: 1D blocktiling - // 5 == f32: 2D blocktiling - // 6 == f32: 1D blocktiling with loop unrolling - // 7 == f32: 2D blocktiling with loop unrolling - // 8 == f32: 2D blocktiling with loop unrolling and vectorization - // 9 == f32: 2D blocktiling with loop unrolling, vectorization and transpose - // 10 == f16: 2D blocktiling with loop unrolling and vectorization (default) - // 11 == f16: 2D blocktiling with loop unrolling, vectorization and transpose + bool enableF16 = version == 10 || version ==11; bool transposedInput = version == 9 || version == 11; NumType numtype = enableF16 ? kf16 : kf32; + bool autotune = version == 12; size_t M, K, N; // Matrix dimensions char* kTestSize_str = getenv("MATMUL_SIZE"); int kTestSize = kTestSize_str == NULL ? 2 : atoi(kTestSize_str); + + MatmulParameters parameters; + double duration = FLT_MAX; + parameters.interactive = true; + parameters.transpose = transposedInput; + if (kTestSize == 0) { // Tiny test - M = 32; - K = 32; - N = 32; + parameters.M = 32; + parameters.K = 32; + parameters.N = 32; } else if (kTestSize == 1) { // Small test - M = 256; - K = 128; - N = 512; + parameters.M = 256; + parameters.K = 128; + parameters.N = 512; } else { // Large test - M = 4096; - K = 4096; - N = 2 * 4096; + parameters.M = 4096; + parameters.K = 4096; + parameters.N = 2 * 4096; } - if (enableF16) { - runTestWithCheck(version, M, K, N, transposedInput, kTestSize, numtype); + if (autotune) { + MatmulParameters min_parameters = runAutotune(version, kTestSize, parameters); + double min_duration = runTestWithCheck(version, kTestSize, min_parameters); + show_parameters(min_parameters, min_duration); } else { - runTestWithCheck(version, M, K, N, transposedInput, kTestSize, numtype); + if (enableF16) { + parameters.numtype = kf16; + runTestWithCheck(version, kTestSize, parameters); + } else { + parameters.numtype = kf32; + runTestWithCheck(version, kTestSize, parameters); + } } LOG(kDefLog, kInfo, "Done."); diff --git a/gpu.h b/gpu.h index 5b50ec8..526aec1 100644 --- a/gpu.h +++ b/gpu.h @@ -461,6 +461,11 @@ struct Context { WGPUQueue queue; TensorPool pool = TensorPool(this); KernelPool kernelPool = KernelPool(this); + + std::mutex* error_mtx; + std::vector error_types; + std::vector error_messages; + ~Context() { LOG(kDefLog, kTrace, "Destroying context"); if (queue) { @@ -485,6 +490,9 @@ struct Context { } else { LOG(kDefLog, kWarn, "Instance is null"); } + if (error_mtx) { + delete error_mtx; + } LOG(kDefLog, kInfo, "Context destroyed"); } }; @@ -702,6 +710,7 @@ inline Context createContext(const WGPUInstanceDescriptor &desc = {}, const WGPURequestAdapterOptions &adapterOpts = {}, const WGPUDeviceDescriptor &devDescriptor = {}) { Context context; + context.error_mtx = new std::mutex(); { #ifdef __EMSCRIPTEN__ // Emscripten does not support the instance descriptor @@ -786,21 +795,38 @@ inline Context createContext(const WGPUInstanceDescriptor &desc = {}, context.device = devData.device; wgpuDeviceSetUncapturedErrorCallback( context.device, - [](WGPUErrorType type, char const *message, void *devData) { - LOG(kDefLog, kError, "Device uncaptured error: %s", message); - throw std::runtime_error("Device uncaptured exception."); + [](WGPUErrorType type, char const *message, void *userData) { + Context* pctx = reinterpret_cast(userData); + std::lock_guard lock(*pctx->error_mtx); + pctx->error_types.push_back(type); + pctx->error_messages.push_back(std::string(message)); }, - nullptr); + &context); } context.queue = wgpuDeviceGetQueue(context.device); return context; } +inline void throwDeviceUncapturedException(Context &ctx) { + std::lock_guard lock(*ctx.error_mtx); + if (ctx.error_types.size() != 0) { + std::string errs; + errs += "Device uncaptured error:\n"; + for(auto err: ctx.error_messages){ + errs += err; + } + ctx.error_types.clear(); + ctx.error_messages.clear(); + throw std::runtime_error(errs.c_str()); + } +} + inline void wait(Context &ctx, std::future &future) { while (future.wait_for(std::chrono::seconds(0)) != std::future_status::ready) { processEvents(ctx.instance); } + throwDeviceUncapturedException(ctx); } /** @@ -1185,6 +1211,8 @@ inline Kernel createKernel(Context &ctx, const KernelCode &code, while (compilationInfo && !compilationInfo->finished) { processEvents(ctx.instance); } + + throwDeviceUncapturedException(ctx); return op; }