From 8724aeeeedf934b04dba270bb1a87a183395a3f8 Mon Sep 17 00:00:00 2001 From: Danial Javady Date: Thu, 1 Aug 2024 14:26:31 -0400 Subject: [PATCH] Avoid LDGSTG instructions by changing default copy to be universalcopy --- .gitignore | 4 +++- .../50_hopper_gemm_with_epilogue_swizzle.cu | 6 +++--- include/cute/arch/copy.hpp | 3 ++- test/unit/cute/volta/vectorization_auto.cu | 2 +- 4 files changed, 9 insertions(+), 6 deletions(-) diff --git a/.gitignore b/.gitignore index acddb1f9d1..924d2600a0 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,5 @@ # PyCache files __pycache__/ -cutlass_library.egg-info/ \ No newline at end of file +cutlass_library.egg-info/ +build/ +.vscode/ \ No newline at end of file diff --git a/examples/50_hopper_gemm_with_epilogue_swizzle/50_hopper_gemm_with_epilogue_swizzle.cu b/examples/50_hopper_gemm_with_epilogue_swizzle/50_hopper_gemm_with_epilogue_swizzle.cu index a736e5ce31..1d0a667140 100644 --- a/examples/50_hopper_gemm_with_epilogue_swizzle/50_hopper_gemm_with_epilogue_swizzle.cu +++ b/examples/50_hopper_gemm_with_epilogue_swizzle/50_hopper_gemm_with_epilogue_swizzle.cu @@ -485,7 +485,7 @@ int main(int argc, char const **args) { // Tiled copy from Smem to Registers // Note : CuTe will vectorize this copy if the tiling + swizzling above were right using TiledCopyS2R = TiledCopy< - Copy_Atom, + Copy_Atom, ElementAcc>, Layout< Shape<_128,_16>, Stride<_16,_1>>, TileShapeS2R>; @@ -496,9 +496,9 @@ int main(int argc, char const **args) { cutlass::gemm::TagToStrideC_t, cutlass::epilogue::thread::LinearCombination, SmemLayout, - Copy_Atom, + Copy_Atom, ElementAcc>, TiledCopyS2R, - Copy_Atom>>; + Copy_Atom, ElementOutput>>>; // // Assembling the GemmKernel diff --git a/include/cute/arch/copy.hpp b/include/cute/arch/copy.hpp index 5139289995..7d95307b57 100644 --- a/include/cute/arch/copy.hpp +++ b/include/cute/arch/copy.hpp @@ -90,7 +90,8 @@ using AutoVectorizingCopy = AutoVectorizingCopyWithAssumedAlignment<128>; // DefaultCopy alias does not assume alignment of pointers or dynamic strides. // -using DefaultCopy = AutoVectorizingCopyWithAssumedAlignment<8>; + +using DefaultCopy = UniversalCopy>; // // Global memory prefetch into L2 diff --git a/test/unit/cute/volta/vectorization_auto.cu b/test/unit/cute/volta/vectorization_auto.cu index b378f8b329..b21992c85b 100644 --- a/test/unit/cute/volta/vectorization_auto.cu +++ b/test/unit/cute/volta/vectorization_auto.cu @@ -109,7 +109,7 @@ template void test_copy_vectorization(GmemLayout gmem_layout, RmemTiler rmem_tiler) { - test_copy_vectorization(DefaultCopy{}, gmem_layout, rmem_tiler); + test_copy_vectorization(AutoVectorizingCopyWithAssumedAlignment<128>{}, gmem_layout, rmem_tiler); } TEST(SM70_CuTe_Volta, SimpleVec)