Skip to content

Commit

Permalink
CUTLASS 3.5.0 (NVIDIA#1411)
Browse files Browse the repository at this point in the history
  • Loading branch information
thakkarV authored Mar 19, 2024
1 parent ffa34e7 commit 629f465
Show file tree
Hide file tree
Showing 468 changed files with 48,805 additions and 7,328 deletions.
253 changes: 135 additions & 118 deletions CHANGELOG.md

Large diffs are not rendered by default.

6 changes: 3 additions & 3 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -67,14 +67,13 @@ elseif (CUDA_VERSION VERSION_LESS 11.4)
message(WARNING "CUTLASS ${CUTLASS_VERSION} support for CUDA ${CUDA_VERSION} is deprecated, please use CUDA 11.8 or higher.")
endif()

if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS 7.5)
message(FATAL_ERROR "GCC version must be at least 7.5!")
if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS 7.3)
message(FATAL_ERROR "GCC version must be at least 7.3!")
endif()

if (CUDA_COMPILER MATCHES "[Cc]lang" AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS 7.0)
message(FATAL_ERROR "Clang 7.0+ required for GPU compilation")
endif()

find_package(Doxygen QUIET)

################################################################################
Expand Down Expand Up @@ -168,6 +167,7 @@ endif()
include(GNUInstallDirs)

link_directories(${CUDA_TOOLKIT_ROOT_DIR}/lib64/stubs)
link_directories(${CUDA_TOOLKIT_ROOT_DIR}/lib64)

###################################################################################################
#
Expand Down
4 changes: 2 additions & 2 deletions CONTRIBUTORS.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
![ALT](/media/images/gemm-hierarchy-with-epilogue-no-labels.png "CUTLASS")
![ALT](./media/images/gemm-hierarchy-with-epilogue-no-labels.png "CUTLASS")

[README](/README.md#documentation) > **Contributors**
[README](./README.md#documentation) > **Contributors**

# CUTLASS Developers and Contributors

Expand Down
21 changes: 20 additions & 1 deletion CUDA.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -326,6 +326,14 @@ function(cutlass_add_library NAME)
cxx_std_11
)

get_target_property(TARGET_TYPE ${NAME} TYPE)

if (TARGET_TYPE MATCHES "SHARED")
set_target_properties(${NAME} PROPERTIES CUDA_RUNTIME_LIBRARY Shared)
elseif(TARGET_TYPE MATCHES "STATIC")
set_target_properties(${NAME} PROPERTIES CUDA_RUNTIME_LIBRARY Static)
endif()

if(__EXPORT_NAME)
add_library(nvidia::cutlass::${__EXPORT_NAME} ALIAS ${NAME})
set_target_properties(${NAME} PROPERTIES EXPORT_NAME ${__EXPORT_NAME})
Expand All @@ -336,10 +344,19 @@ endfunction()
function(cutlass_add_executable NAME)

set(options)
set(oneValueArgs)
set(oneValueArgs CUDA_RUNTIME_LIBRARY)
set(multiValueArgs)
cmake_parse_arguments(_ "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})

if (NOT DEFINED __CUDA_RUNTIME_LIBRARY)
set(__CUDA_RUNTIME_LIBRARY Shared)
endif()

set(__CUDA_RUNTIME_LIBRARY_ALLOWED None Shared Static)
if (NOT __CUDA_RUNTIME_LIBRARY IN_LIST __CUDA_RUNTIME_LIBRARY_ALLOWED)
message(FATAL_ERROR "CUDA_RUNTIME_LIBRARY value '${__CUDA_RUNTIME_LIBRARY}' is not in allowed list of '${__CUDA_RUNTIME_LIBRARY_ALLOWED}'")
endif()

cutlass_unify_source_files(TARGET_SOURCE_ARGS ${__UNPARSED_ARGUMENTS})

if(CUTLASS_NATIVE_CUDA OR CUDA_COMPILER MATCHES "clang")
Expand All @@ -359,6 +376,8 @@ function(cutlass_add_executable NAME)
cxx_std_11
)

set_target_properties(${NAME} PROPERTIES CUDA_RUNTIME_LIBRARY ${__CUDA_RUNTIME_LIBRARY})

endfunction()

function(cutlass_target_sources NAME)
Expand Down
1 change: 1 addition & 0 deletions PUBLICATIONS.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

- ["A Case Study in CUDA Kernel Fusion: Implementing FlashAttention-2 on NVIDIA Hopper Architecture using the CUTLASS Library"](https://arxiv.org/abs/2312.11918). Ganesh Bikshandi, Jay Shah. _arXiv_, December 2023.

- ["Benchmarking GPU Tensor Cores on General Matrix Multiplication Kernels through CUTLASS"](https://www.mdpi.com/2076-3417/13/24/13022). Xuanteng Huang, Xianwei Zhang, Panfei Yang, Nong Xiao. _Journal of Applied Sciences_, December 2023.

- ["A Speed Odyssey for Deployable Quantization of LLMs"](https://arxiv.org/abs/2311.09550). Qingyuan Li, Ran Meng, Yiduo Li, Bo Zhang, Liang Li, Yifan Lu, Xiangxiang Chu, Yerui Sun, Yuchen Xie. _arXiv_, November 2023.

Expand Down
90 changes: 46 additions & 44 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
![ALT](/media/images/gemm-hierarchy-with-epilogue-no-labels.png "Complete CUDA GEMM decomposition")
![ALT](./media/images/gemm-hierarchy-with-epilogue-no-labels.png "Complete CUDA GEMM decomposition")

# CUTLASS 3.4
# CUTLASS 3.5

_CUTLASS 3.4 - February 2024_
_CUTLASS 3.5 - March 2024_

CUTLASS is a collection of CUDA C++ template abstractions for implementing
high-performance matrix-matrix multiplication (GEMM) and related computations at all levels
Expand All @@ -19,16 +19,16 @@ mixed-precision computations, providing specialized data-movement and
multiply-accumulate abstractions for half-precision floating
point (FP16), BFloat16 (BF16), Tensor Float 32 (TF32),
single-precision floating point (FP32),
[FP32 emulation via tensor core instruction](/examples/27_ampere_3xtf32_fast_accurate_tensorop_gemm),
[FP32 emulation via tensor core instruction](./examples/27_ampere_3xtf32_fast_accurate_tensorop_gemm),
double-precision floating
point (FP64) types, integer data types (4b and 8b), and binary data types (1b).
CUTLASS demonstrates warp-synchronous matrix multiply operations
targeting the programmable, high-throughput _Tensor Cores_ implemented by
NVIDIA's Volta, Turing, Ampere, and Hopper architectures.

See the [Quick Start Guide](/media/docs/quickstart.md) to get started quickly.
See the [Quick Start Guide](./media/docs/quickstart.md) to get started quickly.

See the [functionality listing](/media/docs/functionality.md) for the list of operations
See the [functionality listing](./media/docs/functionality.md) for the list of operations
supported at each level of the execution model hierarchy.

CUTLASS 3.0 introduced a new core library, CuTe, to describe and manipulate tensors of threads and data.
Expand All @@ -37,25 +37,27 @@ CuTe is a collection of C++ CUDA template abstractions for defining and operatin
The core abstractions of CuTe are hierarchically multidimensional layouts which can be composed with data arrays to represent tensors. The representation of layouts is powerful enough to represent nearly everything we need to implement efficient dense linear algebra. Layouts can also be combined and manipulated via functional composition, on which we build a large set of common operations such as tiling and partitioning.

CUTLASS 3.0 and beyond adopts CuTe throughout the GEMM hierarchy in its templates. This greatly simplifies the design
and improves code composability and readability. More documentation specific to CuTe can be found in its [dedicated documentation directory](/media/docs/cute/00_quickstart.md).
and improves code composability and readability. More documentation specific to CuTe can be found in its [dedicated documentation directory](./media/docs/cute/00_quickstart.md).

In addition to GEMMs, CUTLASS implements high-performance convolution via the implicit GEMM algorithm. Implicit GEMM is the formulation of a convolution operation as a GEMM thereby taking advantage of CUTLASS's modular GEMM pipeline. This allows CUTLASS to build convolutions by reusing highly-optimized GEMM components.

# What's New in CUTLASS 3.4
# What's New in CUTLASS 3.5

CUTLASS 3.4.1 is an update to CUTLASS adding:
- Statically available [CUTLASS Version macros](/include/cutlass/version.h) that allow for handling API changes between CUTLASS releases on the users' side.
- Improvements for Hopper [Group-GEMM](/examples/57_hopper_grouped_gemm) and [Pointer-Array Batched GEMM](/examples/56_hopper_ptr_array_batched_gemm).
- Updates and bugfixes from the community (thanks!).
CUTLASS 3.5 is an update to CUTLASS adding:

CUTLASS 3.4.0 is an update to CUTLASS adding:

- Improved [Mixed-input Hopper GEMMs](/examples/55_hopper_mixed_dtype_gemm) supporting {16-bit, 8-bit} x {8-bit, 4-bit} input types with fast numerical converters and group scaling factors tuned for optimal performance on Hopper H100.
- Beta release of [Pointer-Array Batched GEMMs](/examples/56_hopper_ptr_array_batched_gemm) utilizing TMA and Hopper H100 tensor cores now available. (Requires CUDA 12.3 or above)
- Beta release of [Group-GEMM](/examples/57_hopper_grouped_gemm) - commonly used in optimization of Mixture-Of-Expert models, is now available on Hopper GPUs taking advantage of TMA and Hopper H100 tensor cores. (Requires CUDA 12.3 or above)
- [Ampere Sparse GEMM](/examples/15_ampere_sparse_tensorop_gemm/ampere_sparse_tensorop_gemm_with_visitor.cu) supports Epilogue Visitor Tree (EVT) now.
- Improvements to NamedBarriers including details of [ReservedNamedBarriers](/include/cutlass/arch/barrier.h) used within the CUTLASS library.
- Improved [CuTe documentation](/media/docs/cute/) including improved clarity and depth of [Quickstart](/media/docs/cute/00_quickstart.md), [CuTe Layout](/media/docs/cute/01_layout.md), and [CuTe Layout Algebra](/media/docs/cute/02_layout_algebra.md). Associated code comments, post-conditions, and details in [CuTe Core Unit Tests](/test/unit/cute/core/) also improved.
- Implicit GEMM Convolutions targeting Hopper SM90A via WGMMA + [TMA im2col](./include/cute/atom/copy_traits_sm90_im2col.hpp)
+ Native implementation in CUTLASS 3.x using CuTe, mirroring the [same design hierarchy as that of GEMMs](./media/docs/gemm_api_3x.md).
+ Support for 1D, 2D, and 3D convolutions in a [rank-agnostic fashion](./include/cutlass/conv/convnd_problem_shape.hpp).
+ Support for [Fprop](./test/unit/conv/device_3x/fprop/sm90_conv3d_fprop_implicit_gemm_s8_s8_s32_tensorop_s32.cu), [Dgrad](./test/unit/conv/device_3x/dgrad/sm90_conv2d_dgrad_implicit_gemm_f16_f16_f32_tensorop_f16.cu), and [Wgrad](./test/unit/conv/device_3x/wgrad/sm90_conv1d_wgrad_implicit_gemm_f16_f16_f32_tensorop_f16.cu) algorithms
+ [CUTLASS profiler support](./python/cutlass_library/conv3x_emitter.py) for 2D and 3D convolutions implemented via the 3.x API.
+ NOTE: this is a beta release. Further updates to CUTLASS will include major performance improvements, feature enablement, and possible breaking changes to the API until 3.7 release. Your feedback is welcome on the design!
- Support for [Ada (SM89) FP8 tensor cores via the 2.x API](./examples/58_ada_fp8_gemm/ada_fp8_gemm.cu). Requires CUDA 12.4 or newer.
- [Ampere gather/scatter convolution example](./examples/59_ampere_gather_scatter_gemm/README.md) in CuTe and CUTLASS 3.x
+ Showcasing how custom kernels can be written and optimized using CUTLASS 3.x and CuTe and the general strategy for implementing convolutions as specializations of GETTs.
+ Implementation of a coarse grained sparse gather/scatter kernel achieving peak performance on Ampere class tensor cores.
- Updates to CuTe documentation for [`cute::Tensor<>`](./media/docs/cute/03_tensor.md), [MMA atoms](./media/docs/cute/0t_mma_atom.md), and an overhauled [CuTe GEMM tutorial series](./examples/cute/tutorial).
- Extensions to CuTe to support [L2 prefetching](./include/cute/algorithm/prefetch.hpp) and [TMA store+reductions](./include/cute/arch/copy_sm90_tma.hpp#L1337).
- Updates and bugfixes from the community (thanks!)

Minimum requirements:

Expand Down Expand Up @@ -98,7 +100,7 @@ as shown in the above figure. Tensor Core operations are implemented using CUDA
# Compatibility

CUTLASS requires a C++17 host compiler and
performs best when built with the [**CUDA 12.3.2 Toolkit**](https://developer.nvidia.com/cuda-downloads).
performs best when built with the [**CUDA 12.4 Toolkit**](https://developer.nvidia.com/cuda-downloads).
It is also compatible with CUDA 11.4, CUDA 11.5, CUDA 11.6, CUDA 11.7, CUDA 11.8, CUDA 12.0, CUDA 12.1, CUDA 12.2.2, CUDA 12.3.1 and CUDA 12.3.2.

## Operating Systems
Expand Down Expand Up @@ -142,28 +144,28 @@ The target architecture information is passed on to CUTLASS via the cmake flag `
cmake .. -DCUTLASS_NVCC_ARCHS="90a"
```

Please refer to the [functionality documentation](media/docs/functionality.md) for details on which kernels require which target architectures.
Please refer to the [functionality documentation](./media/docs/functionality.md) for details on which kernels require which target architectures.

# Documentation

CUTLASS is described in the following documents and the accompanying
[Doxygen documentation](https://nvidia.github.io/cutlass).

- [Quick Start Guide](/media/docs/quickstart.md) - build and run CUTLASS
- [Functionality](/media/docs/functionality.md) - summarizes functionality available in CUTLASS
- [Efficient GEMM in CUDA](media/docs/efficient_gemm.md) - describes how GEMM kernels may be implemented efficiently in CUDA
- [CUTLASS 3.x Design](media/docs/cutlass_3x_design.md) - describes the CUTLASS 3.x design, its benefits, and how CuTe enables us to write much more composable components
- [GEMM API 3.x](media/docs/gemm_api_3x.md) - describes the CUTLASS 3.x GEMM model and C++ template concepts
- [GEMM API 2.x](media/docs/gemm_api.md) - describes the CUTLASS 2.x GEMM model and C++ template concepts
- [Implicit GEMM Convolution](media/docs/implicit_gemm_convolution.md) - describes 2-D and 3-D convolution in CUTLASS
- [Code Organization](media/docs/code_organization.md) - describes the organization and contents of the CUTLASS project
- [Terminology](media/docs/terminology.md) - describes terms used in the code
- [Programming Guidelines](media/docs/programming_guidelines.md) - guidelines for writing efficient modern CUDA C++
- [Fundamental types](media/docs/fundamental_types.md) - describes basic C++ classes used in CUTLASS to represent numeric quantities and arrays
- [Layouts](media/docs/layout.md) - describes layouts of matrices and tensors in memory
- [Tile Iterators](media/docs/tile_iterator_concept.md) - describes C++ concepts for iterating over tiles of matrices in memory
- [CUTLASS Profiler](media/docs/profiler.md) - command-line driven profiling application
- [CUTLASS Utilities](media/docs/utilities.md) - additional templates used to facilate rapid development
- [Quick Start Guide](./media/docs/quickstart.md) - build and run CUTLASS
- [Functionality](./media/docs/functionality.md) - summarizes functionality available in CUTLASS
- [Efficient GEMM in CUDA](./media/docs/efficient_gemm.md) - describes how GEMM kernels may be implemented efficiently in CUDA
- [CUTLASS 3.x Design](./media/docs/cutlass_3x_design.md) - describes the CUTLASS 3.x design, its benefits, and how CuTe enables us to write much more composable components
- [GEMM API 3.x](./media/docs/gemm_api_3x.md) - describes the CUTLASS 3.x GEMM model and C++ template concepts
- [GEMM API 2.x](./media/docs/gemm_api.md) - describes the CUTLASS 2.x GEMM model and C++ template concepts
- [Implicit GEMM Convolution](./media/docs/implicit_gemm_convolution.md) - describes 2-D and 3-D convolution in CUTLASS
- [Code Organization](./media/docs/code_organization.md) - describes the organization and contents of the CUTLASS project
- [Terminology](./media/docs/terminology.md) - describes terms used in the code
- [Programming Guidelines](./media/docs/programming_guidelines.md) - guidelines for writing efficient modern CUDA C++
- [Fundamental types](./media/docs/fundamental_types.md) - describes basic C++ classes used in CUTLASS to represent numeric quantities and arrays
- [Layouts](./media/docs/layout.md) - describes layouts of matrices and tensors in memory
- [Tile Iterators](./media/docs/tile_iterator_concept.md) - describes C++ concepts for iterating over tiles of matrices in memory
- [CUTLASS Profiler](./media/docs/profiler.md) - command-line driven profiling application
- [CUTLASS Utilities](./media/docs/utilities.md) - additional templates used to facilate rapid development

# Resources
We have also described the structure of an efficient GEMM in our talk at the
Expand All @@ -182,7 +184,7 @@ projects. Client applications should target CUTLASS's `include/` directory in th
paths.

CUTLASS unit tests, examples, and utilities can be build with CMake.
The minimum version of CMake is given in the [Quickstart guide](media/docs/quickstart.md).
The minimum version of CMake is given in the [Quickstart guide](./media/docs/quickstart.md).
Make sure the `CUDACXX` environment variable points to NVCC in the CUDA Toolkit installed
on your system.

Expand Down Expand Up @@ -227,7 +229,7 @@ CUTLASS is arranged as a header-only library along with Utilities, Tools, Exampl
and template concepts defined in the CUTLASS project.

A detailed explanation of the source code organization may be found in the
[CUTLASS documentation](media/docs/code_organization.md), but several main components are summarized below.
[CUTLASS documentation](./media/docs/code_organization.md), but several main components are summarized below.

## CUTLASS Template Library

Expand Down Expand Up @@ -276,7 +278,7 @@ include/ # client applications should target this directory

### CUTLASS SDK Examples

[CUTLASS SDK examples](/examples) apply CUTLASS templates to implement basic computations.
[CUTLASS SDK examples](./examples) apply CUTLASS templates to implement basic computations.

### Tools

Expand All @@ -301,7 +303,7 @@ tools/
The `test/unit/` directory consist of unit tests implemented with Google Test that demonstrate
basic usage of Core API components and complete tests of the CUTLASS GEMM computations.

Instructions for building and running the Unit tests are described in the [Quickstart guide](media/docs/quickstart.md).
Instructions for building and running the Unit tests are described in the [Quickstart guide](./media/docs/quickstart.md).

# Performance Profiling

Expand Down Expand Up @@ -517,9 +519,9 @@ reference_device: Passed

## More Details on Compiling CUTLASS Kernels and CUTLASS Profiler
- Please follow the links for more CMake examples on selectively compiling CUTLASS kernels:
- [GEMM CMake Examples](media/docs/quickstart.md#gemm-cmake-examples)
- [Implicit GEMM convolution CMake Examples](media/docs/quickstart.md#convolution-cmake-examples)
- [Further details about the CUTLASS Profiler are described here.](media/docs/profiler.md)
- [GEMM CMake Examples](./media/docs/quickstart.md#gemm-cmake-examples)
- [Implicit GEMM convolution CMake Examples](./media/docs/quickstart.md#convolution-cmake-examples)
- [Further details about the CUTLASS Profiler are described here.](./media/docs/profiler.md)


# About
Expand Down
2 changes: 1 addition & 1 deletion cmake/googletest.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ endif()
FetchContent_Declare(
googletest
GIT_REPOSITORY https://github.com/google/googletest.git
GIT_TAG v1.13.0
GIT_TAG v1.14.0
)

FetchContent_GetProperties(googletest)
Expand Down
2 changes: 1 addition & 1 deletion examples/03_visualize_layout/visualize_layout.h
Original file line number Diff line number Diff line change
Expand Up @@ -260,7 +260,7 @@ class VisualizeLayout : public VisualizeLayoutBase {
if (options.vectorize <= 2) return std::make_pair(false, -1);

// Boundary check.
if (i > elements.size() || (i + options.vectorize - 1) > elements.size())
if (i > int(elements.size()) || (i + options.vectorize - 1) > int(elements.size()))
return std::make_pair(false, -1);

// Check if either all elements are valid or invalid.
Expand Down
2 changes: 1 addition & 1 deletion examples/04_tile_iterator/tile_iterator.cu
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@ __global__ void copy(

typename Iterator::Fragment fragment;

for(int i = 0; i < fragment.size(); ++i) {
for(size_t i = 0; i < fragment.size(); ++i) {
fragment[i] = 0;
}

Expand Down
6 changes: 3 additions & 3 deletions examples/05_batched_gemm/batched_gemm.cu
Original file line number Diff line number Diff line change
Expand Up @@ -207,15 +207,15 @@ cudaError_t strided_batched_gemm_nn_reference(

cudaError_t result = cudaSuccess;

if (A.size() < lda * k * batch_count) {
if (A.size() < size_t(lda * k * batch_count)) {
std::cout << "the size of A is too small" << std::endl;
return cudaErrorInvalidValue;
}
if (B.size() < ldb * n) {
if (B.size() < size_t(ldb * n)) {
std::cout << "the size of B is too small" << std::endl;
return cudaErrorInvalidValue;
}
if (C.size() < ldc * n * batch_count) {
if (C.size() < size_t(ldc * n * batch_count)) {
std::cout << "the size of C is too small" << std::endl;
return cudaErrorInvalidValue;
}
Expand Down
2 changes: 1 addition & 1 deletion examples/13_two_tensor_op_fusion/b2b_grouped_gemm_run.h
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,7 @@ struct B2bFusedGroupedGemmRun
if (dist_kind == cutlass::Distribution::Uniform) {

cutlass::reference::host::TensorFillRandomUniform(
view, seed, 2, -2, 0);
view, seed, 1, -1, 0);
}
else if (dist_kind == cutlass::Distribution::Identity) {

Expand Down
Loading

0 comments on commit 629f465

Please sign in to comment.