CUTLASS 3.5.0 (NVIDIA#1411)

Daydream0929 · Mar 19, 2024 · 629f465 · 629f465
1 parent ffa34e7
commit 629f465
Show file tree

Hide file tree

Showing 468 changed files with 48,805 additions and 7,328 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -67,14 +67,13 @@ elseif (CUDA_VERSION VERSION_LESS 11.4)
   message(WARNING "CUTLASS ${CUTLASS_VERSION} support for CUDA ${CUDA_VERSION} is deprecated, please use CUDA 11.8 or higher.")
 endif()
 
-if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS 7.5)
-  message(FATAL_ERROR "GCC version must be at least 7.5!")
+if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS 7.3)
+  message(FATAL_ERROR "GCC version must be at least 7.3!")
 endif()
 
 if (CUDA_COMPILER MATCHES "[Cc]lang" AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS 7.0)
   message(FATAL_ERROR "Clang 7.0+ required for GPU compilation")
 endif()
-
 find_package(Doxygen QUIET)
 
 ################################################################################
@@ -168,6 +167,7 @@ endif()
 include(GNUInstallDirs)
 
 link_directories(${CUDA_TOOLKIT_ROOT_DIR}/lib64/stubs)
+link_directories(${CUDA_TOOLKIT_ROOT_DIR}/lib64)
 
 ###################################################################################################
 #

diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md
@@ -1,6 +1,6 @@
-![ALT](/media/images/gemm-hierarchy-with-epilogue-no-labels.png "CUTLASS")
+![ALT](./media/images/gemm-hierarchy-with-epilogue-no-labels.png "CUTLASS")
 
-[README](/README.md#documentation) > **Contributors**
+[README](./README.md#documentation) > **Contributors**
 
 # CUTLASS Developers and Contributors
 

diff --git a/CUDA.cmake b/CUDA.cmake
@@ -326,6 +326,14 @@ function(cutlass_add_library NAME)
    cxx_std_11
    )
 
+  get_target_property(TARGET_TYPE ${NAME} TYPE)
+
+  if (TARGET_TYPE MATCHES "SHARED")
+    set_target_properties(${NAME} PROPERTIES CUDA_RUNTIME_LIBRARY Shared)
+  elseif(TARGET_TYPE MATCHES "STATIC")
+    set_target_properties(${NAME} PROPERTIES CUDA_RUNTIME_LIBRARY Static)
+  endif()
+
   if(__EXPORT_NAME)
     add_library(nvidia::cutlass::${__EXPORT_NAME} ALIAS ${NAME})
     set_target_properties(${NAME} PROPERTIES EXPORT_NAME ${__EXPORT_NAME})
@@ -336,10 +344,19 @@ endfunction()
 function(cutlass_add_executable NAME)
 
   set(options)
-  set(oneValueArgs)
+  set(oneValueArgs CUDA_RUNTIME_LIBRARY)
   set(multiValueArgs)
   cmake_parse_arguments(_ "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
 
+  if (NOT DEFINED __CUDA_RUNTIME_LIBRARY)
+    set(__CUDA_RUNTIME_LIBRARY Shared)
+  endif()
+
+  set(__CUDA_RUNTIME_LIBRARY_ALLOWED None Shared Static)
+  if (NOT __CUDA_RUNTIME_LIBRARY IN_LIST __CUDA_RUNTIME_LIBRARY_ALLOWED)
+    message(FATAL_ERROR "CUDA_RUNTIME_LIBRARY value '${__CUDA_RUNTIME_LIBRARY}' is not in allowed list of '${__CUDA_RUNTIME_LIBRARY_ALLOWED}'")
+  endif()
+
   cutlass_unify_source_files(TARGET_SOURCE_ARGS ${__UNPARSED_ARGUMENTS})
 
   if(CUTLASS_NATIVE_CUDA OR CUDA_COMPILER MATCHES "clang")
@@ -359,6 +376,8 @@ function(cutlass_add_executable NAME)
    cxx_std_11
    )
 
+  set_target_properties(${NAME} PROPERTIES CUDA_RUNTIME_LIBRARY ${__CUDA_RUNTIME_LIBRARY})
+
 endfunction()
 
 function(cutlass_target_sources NAME)

diff --git a/PUBLICATIONS.md b/PUBLICATIONS.md
@@ -4,6 +4,7 @@
 
 - ["A Case Study in CUDA Kernel Fusion: Implementing FlashAttention-2 on NVIDIA Hopper Architecture using the CUTLASS Library"](https://arxiv.org/abs/2312.11918). Ganesh Bikshandi, Jay Shah. _arXiv_, December 2023.
 
+- ["Benchmarking GPU Tensor Cores on General Matrix Multiplication Kernels through CUTLASS"](https://www.mdpi.com/2076-3417/13/24/13022). Xuanteng Huang, Xianwei Zhang, Panfei Yang, Nong Xiao. _Journal of Applied Sciences_, December 2023.
 
 - ["A Speed Odyssey for Deployable Quantization of LLMs"](https://arxiv.org/abs/2311.09550). Qingyuan Li, Ran Meng, Yiduo Li, Bo Zhang, Liang Li, Yifan Lu, Xiangxiang Chu, Yerui Sun, Yuchen Xie. _arXiv_, November 2023.
 

diff --git a/README.md b/README.md
@@ -1,8 +1,8 @@
-![ALT](/media/images/gemm-hierarchy-with-epilogue-no-labels.png "Complete CUDA GEMM decomposition")
+![ALT](./media/images/gemm-hierarchy-with-epilogue-no-labels.png "Complete CUDA GEMM decomposition")
 
-# CUTLASS 3.4
+# CUTLASS 3.5
 
-_CUTLASS 3.4 - February 2024_
+_CUTLASS 3.5 - March 2024_
 
 CUTLASS is a collection of CUDA C++ template abstractions for implementing
 high-performance matrix-matrix multiplication (GEMM) and related computations at all levels 
@@ -19,16 +19,16 @@ mixed-precision computations, providing specialized data-movement and
 multiply-accumulate abstractions for half-precision floating
 point (FP16), BFloat16 (BF16), Tensor Float 32 (TF32),
 single-precision floating point (FP32),
-[FP32 emulation via tensor core instruction](/examples/27_ampere_3xtf32_fast_accurate_tensorop_gemm),
+[FP32 emulation via tensor core instruction](./examples/27_ampere_3xtf32_fast_accurate_tensorop_gemm),
 double-precision floating
 point (FP64) types, integer data types (4b and 8b), and binary data types (1b).
 CUTLASS demonstrates warp-synchronous matrix multiply operations
 targeting the programmable, high-throughput _Tensor Cores_ implemented by
 NVIDIA's Volta, Turing, Ampere, and Hopper architectures.
 
-See the [Quick Start Guide](/media/docs/quickstart.md) to get started quickly.
+See the [Quick Start Guide](./media/docs/quickstart.md) to get started quickly.
 
-See the [functionality listing](/media/docs/functionality.md) for the list of operations
+See the [functionality listing](./media/docs/functionality.md) for the list of operations
 supported at each level of the execution model hierarchy.
 
 CUTLASS 3.0 introduced a new core library, CuTe, to describe and manipulate tensors of threads and data.
@@ -37,25 +37,27 @@ CuTe is a collection of C++ CUDA template abstractions for defining and operatin
 The core abstractions of CuTe are hierarchically multidimensional layouts which can be composed with data arrays to represent tensors. The representation of layouts is powerful enough to represent nearly everything we need to implement efficient dense linear algebra. Layouts can also be combined and manipulated via functional composition, on which we build a large set of common operations such as tiling and partitioning.
 
 CUTLASS 3.0 and beyond adopts CuTe throughout the GEMM hierarchy in its templates. This greatly simplifies the design
-and improves code composability and readability. More documentation specific to CuTe can be found in its [dedicated documentation directory](/media/docs/cute/00_quickstart.md).
+and improves code composability and readability. More documentation specific to CuTe can be found in its [dedicated documentation directory](./media/docs/cute/00_quickstart.md).
 
 In addition to GEMMs, CUTLASS implements high-performance convolution via the implicit GEMM algorithm. Implicit GEMM is the formulation of a convolution operation as a GEMM thereby taking advantage of CUTLASS's modular GEMM pipeline. This allows CUTLASS to build convolutions by reusing highly-optimized GEMM components.
 
-# What's New in CUTLASS 3.4
+# What's New in CUTLASS 3.5
 
-CUTLASS 3.4.1 is an update to CUTLASS adding:
-- Statically available [CUTLASS Version macros](/include/cutlass/version.h) that allow for handling API changes between CUTLASS releases on the users' side.
-- Improvements for Hopper [Group-GEMM](/examples/57_hopper_grouped_gemm) and [Pointer-Array Batched GEMM](/examples/56_hopper_ptr_array_batched_gemm).
-- Updates and bugfixes from the community (thanks!).
+CUTLASS 3.5 is an update to CUTLASS adding:
 
-CUTLASS 3.4.0 is an update to CUTLASS adding:
-
-- Improved [Mixed-input Hopper GEMMs](/examples/55_hopper_mixed_dtype_gemm) supporting {16-bit, 8-bit} x {8-bit, 4-bit} input types with fast numerical converters and group scaling factors tuned for optimal performance on Hopper H100.
-- Beta release of [Pointer-Array Batched GEMMs](/examples/56_hopper_ptr_array_batched_gemm) utilizing TMA and Hopper H100 tensor cores now available. (Requires CUDA 12.3 or above)
-- Beta release of [Group-GEMM](/examples/57_hopper_grouped_gemm) - commonly used in optimization of Mixture-Of-Expert models, is now available on Hopper GPUs taking advantage of TMA and Hopper H100 tensor cores. (Requires CUDA 12.3 or above)
-- [Ampere Sparse GEMM](/examples/15_ampere_sparse_tensorop_gemm/ampere_sparse_tensorop_gemm_with_visitor.cu) supports Epilogue Visitor Tree (EVT) now.
-- Improvements to NamedBarriers including details of [ReservedNamedBarriers](/include/cutlass/arch/barrier.h) used within the CUTLASS library.
-- Improved [CuTe documentation](/media/docs/cute/) including improved clarity and depth of [Quickstart](/media/docs/cute/00_quickstart.md), [CuTe Layout](/media/docs/cute/01_layout.md), and [CuTe Layout Algebra](/media/docs/cute/02_layout_algebra.md). Associated code comments, post-conditions, and details in [CuTe Core Unit Tests](/test/unit/cute/core/) also improved.
+- Implicit GEMM Convolutions targeting Hopper SM90A via WGMMA + [TMA im2col](./include/cute/atom/copy_traits_sm90_im2col.hpp)
+  + Native implementation in CUTLASS 3.x using CuTe, mirroring the [same design hierarchy as that of GEMMs](./media/docs/gemm_api_3x.md).
+  + Support for 1D, 2D, and 3D convolutions in a [rank-agnostic fashion](./include/cutlass/conv/convnd_problem_shape.hpp).
+  + Support for [Fprop](./test/unit/conv/device_3x/fprop/sm90_conv3d_fprop_implicit_gemm_s8_s8_s32_tensorop_s32.cu), [Dgrad](./test/unit/conv/device_3x/dgrad/sm90_conv2d_dgrad_implicit_gemm_f16_f16_f32_tensorop_f16.cu), and [Wgrad](./test/unit/conv/device_3x/wgrad/sm90_conv1d_wgrad_implicit_gemm_f16_f16_f32_tensorop_f16.cu) algorithms
+  + [CUTLASS profiler support](./python/cutlass_library/conv3x_emitter.py) for 2D and 3D convolutions implemented via the 3.x API.
+  + NOTE: this is a beta release. Further updates to CUTLASS will include major performance improvements, feature enablement, and possible breaking changes to the API until the 3.7 release. Your feedback is welcome on the design!
+- Support for [Ada (SM89) FP8 tensor cores via the 2.x API](./examples/58_ada_fp8_gemm/ada_fp8_gemm.cu). Requires CUDA 12.4 or newer.
+- [Ampere gather/scatter convolution example](./examples/59_ampere_gather_scatter_gemm/README.md) in CuTe and CUTLASS 3.x
+  + Showcasing how custom kernels can be written and optimized using CUTLASS 3.x and CuTe and the general strategy for implementing convolutions as specializations of GETTs.
+  + Implementation of a coarse grained sparse gather/scatter kernel achieving peak performance on Ampere class tensor cores.
+- Updates to CuTe documentation for [`cute::Tensor<>`](./media/docs/cute/03_tensor.md), [MMA atoms](./media/docs/cute/0t_mma_atom.md), and an overhauled [CuTe GEMM tutorial series](./examples/cute/tutorial).
+- Extensions to CuTe to support [L2 prefetching](./include/cute/algorithm/prefetch.hpp) and [TMA store+reductions](./include/cute/arch/copy_sm90_tma.hpp#L1337).
+- Updates and bugfixes from the community (thanks!)
 
 Minimum requirements:
 
@@ -98,7 +100,7 @@ as shown in the above figure.  Tensor Core operations are implemented using CUDA
 # Compatibility
 
 CUTLASS requires a C++17 host compiler and 
-performs best when built with the [**CUDA 12.3.2 Toolkit**](https://developer.nvidia.com/cuda-downloads).
+performs best when built with the [**CUDA 12.4 Toolkit**](https://developer.nvidia.com/cuda-downloads).
 It is also compatible with CUDA 11.4, CUDA 11.5, CUDA 11.6, CUDA 11.7, CUDA 11.8, CUDA 12.0, CUDA 12.1, CUDA 12.2.2, CUDA 12.3.1 and CUDA 12.3.2.
 
 ## Operating Systems
@@ -142,28 +144,28 @@ The target architecture information is passed on to CUTLASS via the cmake flag `
 cmake .. -DCUTLASS_NVCC_ARCHS="90a" 
 ```
 
-Please refer to the [functionality documentation](media/docs/functionality.md) for details on which kernels require which target architectures.
+Please refer to the [functionality documentation](./media/docs/functionality.md) for details on which kernels require which target architectures.
 
 # Documentation
 
 CUTLASS is described in the following documents and the accompanying
 [Doxygen documentation](https://nvidia.github.io/cutlass).
 
-- [Quick Start Guide](/media/docs/quickstart.md) - build and run CUTLASS
-- [Functionality](/media/docs/functionality.md) - summarizes functionality available in CUTLASS
-- [Efficient GEMM in CUDA](media/docs/efficient_gemm.md) - describes how GEMM kernels may be implemented efficiently in CUDA
-- [CUTLASS 3.x Design](media/docs/cutlass_3x_design.md) - describes the CUTLASS 3.x design, its benefits, and how CuTe enables us to write much more composable components
-- [GEMM API 3.x](media/docs/gemm_api_3x.md) - describes the CUTLASS 3.x GEMM model and C++ template concepts
-- [GEMM API 2.x](media/docs/gemm_api.md) - describes the CUTLASS 2.x GEMM model and C++ template concepts
-- [Implicit GEMM Convolution](media/docs/implicit_gemm_convolution.md) - describes 2-D and 3-D convolution in CUTLASS
-- [Code Organization](media/docs/code_organization.md) - describes the organization and contents of the CUTLASS project
-- [Terminology](media/docs/terminology.md) - describes terms used in the code
-- [Programming Guidelines](media/docs/programming_guidelines.md) - guidelines for writing efficient modern CUDA C++
-- [Fundamental types](media/docs/fundamental_types.md) - describes basic C++ classes used in CUTLASS to represent numeric quantities and arrays
-- [Layouts](media/docs/layout.md) - describes layouts of matrices and tensors in memory
-- [Tile Iterators](media/docs/tile_iterator_concept.md) - describes C++ concepts for iterating over tiles of matrices in memory
-- [CUTLASS Profiler](media/docs/profiler.md) - command-line driven profiling application
-- [CUTLASS Utilities](media/docs/utilities.md) - additional templates used to facilate rapid development
+- [Quick Start Guide](./media/docs/quickstart.md) - build and run CUTLASS
+- [Functionality](./media/docs/functionality.md) - summarizes functionality available in CUTLASS
+- [Efficient GEMM in CUDA](./media/docs/efficient_gemm.md) - describes how GEMM kernels may be implemented efficiently in CUDA
+- [CUTLASS 3.x Design](./media/docs/cutlass_3x_design.md) - describes the CUTLASS 3.x design, its benefits, and how CuTe enables us to write much more composable components
+- [GEMM API 3.x](./media/docs/gemm_api_3x.md) - describes the CUTLASS 3.x GEMM model and C++ template concepts
+- [GEMM API 2.x](./media/docs/gemm_api.md) - describes the CUTLASS 2.x GEMM model and C++ template concepts
+- [Implicit GEMM Convolution](./media/docs/implicit_gemm_convolution.md) - describes 2-D and 3-D convolution in CUTLASS
+- [Code Organization](./media/docs/code_organization.md) - describes the organization and contents of the CUTLASS project
+- [Terminology](./media/docs/terminology.md) - describes terms used in the code
+- [Programming Guidelines](./media/docs/programming_guidelines.md) - guidelines for writing efficient modern CUDA C++
+- [Fundamental types](./media/docs/fundamental_types.md) - describes basic C++ classes used in CUTLASS to represent numeric quantities and arrays
+- [Layouts](./media/docs/layout.md) - describes layouts of matrices and tensors in memory
+- [Tile Iterators](./media/docs/tile_iterator_concept.md) - describes C++ concepts for iterating over tiles of matrices in memory
+- [CUTLASS Profiler](./media/docs/profiler.md) - command-line driven profiling application
+- [CUTLASS Utilities](./media/docs/utilities.md) - additional templates used to facilitate rapid development
 
 # Resources
 We have also described the structure of an efficient GEMM in our talk at the
@@ -182,7 +184,7 @@ projects. Client applications should target CUTLASS's `include/` directory in th
 paths.
 
 CUTLASS unit tests, examples, and utilities can be built with CMake.
-The minimum version of CMake is given in the [Quickstart guide](media/docs/quickstart.md).
+The minimum version of CMake is given in the [Quickstart guide](./media/docs/quickstart.md).
 Make sure the `CUDACXX` environment  variable points to NVCC in the CUDA Toolkit installed
 on your system.
 
@@ -227,7 +229,7 @@ CUTLASS is arranged as a header-only library along with Utilities, Tools, Exampl
 and template concepts defined in the CUTLASS project.
 
 A detailed explanation of the source code organization may be found in the 
-[CUTLASS documentation](media/docs/code_organization.md), but several main components are summarized below.
+[CUTLASS documentation](./media/docs/code_organization.md), but several main components are summarized below.
 
 ## CUTLASS Template Library
 
@@ -276,7 +278,7 @@ include/                     # client applications should target this directory
 
 ### CUTLASS SDK Examples
 
-[CUTLASS SDK examples](/examples) apply CUTLASS templates to implement basic computations.
+[CUTLASS SDK examples](./examples) apply CUTLASS templates to implement basic computations.
 
 ### Tools
 
@@ -301,7 +303,7 @@ tools/
 The `test/unit/` directory consists of unit tests implemented with Google Test that demonstrate
 basic usage of Core API components and complete tests of the CUTLASS GEMM computations.
 
-Instructions for building and running the Unit tests are described in the [Quickstart guide](media/docs/quickstart.md).
+Instructions for building and running the Unit tests are described in the [Quickstart guide](./media/docs/quickstart.md).
 
 # Performance Profiling
 
@@ -517,9 +519,9 @@ reference_device: Passed
 
 ## More Details on Compiling CUTLASS Kernels and CUTLASS Profiler
 - Please follow the links for more CMake examples on selectively compiling CUTLASS kernels:
-  - [GEMM CMake Examples](media/docs/quickstart.md#gemm-cmake-examples) 
-  - [Implicit GEMM convolution CMake Examples](media/docs/quickstart.md#convolution-cmake-examples)
-- [Further details about the CUTLASS Profiler are described here.](media/docs/profiler.md)
+  - [GEMM CMake Examples](./media/docs/quickstart.md#gemm-cmake-examples) 
+  - [Implicit GEMM convolution CMake Examples](./media/docs/quickstart.md#convolution-cmake-examples)
+- [Further details about the CUTLASS Profiler are described here.](./media/docs/profiler.md)
 
 
 # About

diff --git a/cmake/googletest.cmake b/cmake/googletest.cmake
@@ -37,7 +37,7 @@ endif()
 FetchContent_Declare(
   googletest
   GIT_REPOSITORY https://github.com/google/googletest.git
-  GIT_TAG        v1.13.0
+  GIT_TAG        v1.14.0
   )
 
 FetchContent_GetProperties(googletest)

diff --git a/examples/03_visualize_layout/visualize_layout.h b/examples/03_visualize_layout/visualize_layout.h
@@ -260,7 +260,7 @@ class VisualizeLayout : public VisualizeLayoutBase {
     if (options.vectorize <= 2) return std::make_pair(false, -1);
 
     // Boundary check.
-    if (i > elements.size() || (i + options.vectorize - 1) > elements.size())
+    if (i > int(elements.size()) || (i + options.vectorize - 1) > int(elements.size()))
       return std::make_pair(false, -1);
 
     // Check if either all elements are valid or invalid.

diff --git a/examples/04_tile_iterator/tile_iterator.cu b/examples/04_tile_iterator/tile_iterator.cu
@@ -94,7 +94,7 @@ __global__ void copy(
 
     typename Iterator::Fragment fragment;
 
-    for(int i = 0; i < fragment.size(); ++i) {
+    for(size_t i = 0; i < fragment.size(); ++i) {
       fragment[i] = 0;
     }
 

diff --git a/examples/05_batched_gemm/batched_gemm.cu b/examples/05_batched_gemm/batched_gemm.cu
@@ -207,15 +207,15 @@ cudaError_t strided_batched_gemm_nn_reference(
 
   cudaError_t result = cudaSuccess;
 
-  if (A.size() < lda * k * batch_count) {
+  if (A.size() < size_t(lda * k * batch_count)) {
     std::cout << "the size of A is too small" << std::endl;
     return cudaErrorInvalidValue;
   }
-  if (B.size() < ldb * n) {
+  if (B.size() < size_t(ldb * n)) {
     std::cout << "the size of B is too small" << std::endl;
     return cudaErrorInvalidValue;
   }
-  if (C.size() < ldc * n * batch_count) {
+  if (C.size() < size_t(ldc * n * batch_count)) {
     std::cout << "the size of C is too small" << std::endl;
     return cudaErrorInvalidValue;
   }

diff --git a/examples/13_two_tensor_op_fusion/b2b_grouped_gemm_run.h b/examples/13_two_tensor_op_fusion/b2b_grouped_gemm_run.h
@@ -102,7 +102,7 @@ struct B2bFusedGroupedGemmRun
     if (dist_kind == cutlass::Distribution::Uniform) {
 
       cutlass::reference::host::TensorFillRandomUniform(
-        view, seed, 2, -2, 0);
+        view, seed, 1, -1, 0);
     } 
     else if (dist_kind == cutlass::Distribution::Identity) {
Original file line number	Diff line number	Diff line change
Expand Up		@@ -4,6 +4,7 @@

		- ["A Case Study in CUDA Kernel Fusion: Implementing FlashAttention-2 on NVIDIA Hopper Architecture using the CUTLASS Library"](https://arxiv.org/abs/2312.11918). Ganesh Bikshandi, Jay Shah. _arXiv_, December 2023.

		- ["Benchmarking GPU Tensor Cores on General Matrix Multiplication Kernels through CUTLASS"](https://www.mdpi.com/2076-3417/13/24/13022). Xuanteng Huang, Xianwei Zhang, Panfei Yang, Nong Xiao. _Journal of Applied Sciences_, December 2023.

		- ["A Speed Odyssey for Deployable Quantization of LLMs"](https://arxiv.org/abs/2311.09550). Qingyuan Li, Ran Meng, Yiduo Li, Bo Zhang, Liang Li, Yifan Lu, Xiangxiang Chu, Yerui Sun, Yuchen Xie. _arXiv_, November 2023.

Expand Down