diff --git a/include/cutlass/block_striped.h b/include/cutlass/block_striped.h
index 09f3fb04fc..03989c22c0 100644
--- a/include/cutlass/block_striped.h
+++ b/include/cutlass/block_striped.h
@@ -263,5 +263,36 @@ struct BlockStripedReduce :
 };
 
+/// Utility for performing block-striped access (load, store, reduce) of trivially-copyable,
+/// statically-sized array types to global memory.
+/// (Specialization for bfloat16_t. Uses __nv_bfloat162 vectorized-reduction.)
+template <
+  int BlockThreads,
+  typename ArrayT>
+struct BlockStripedReduce<BlockThreads, ArrayT, bfloat16_t> :
+  BlockStriped<
+    BlockThreads,
+    ArrayT,
+    __nv_bfloat162>
+{
+  // Each stripe is one __nv_bfloat162 (two bfloat16 elements), so the array length must be even.
+  static_assert(BlockStripedReduce::kStripes % 2 == 0, "Array of bfloat16 must be even number in length");
+
+  /// Reduce
+  CUTLASS_DEVICE
+  static void reduce(ArrayT *ptr, const ArrayT &data, int thread_idx)
+  {
+    cutlass::atomic_add<__nv_bfloat162> reduce;
+    __nv_bfloat162 *access_output = reinterpret_cast<__nv_bfloat162*>(ptr);
+    const __nv_bfloat162 *access_data = reinterpret_cast<const __nv_bfloat162*>(&data);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < BlockStripedReduce::kStripes; ++i)
+    {
+      reduce(access_output + (BlockThreads * i) + thread_idx, access_data[i]);
+    }
+  }
+};
+
+
 } // namespace cutlass
diff --git a/include/cutlass/functional.h b/include/cutlass/functional.h
index 964d2ff35f..d03a87377a 100644
--- a/include/cutlass/functional.h
+++ b/include/cutlass/functional.h
@@ -626,6 +626,23 @@ struct atomic_add
   }
 };
 
+template<>
+struct atomic_add<__nv_bfloat162>
+{
+  CUTLASS_DEVICE
+  void operator()(__nv_bfloat162 *ptr, const __nv_bfloat162 &data)
+  {
+#if !defined(__CUDA_ARCH__) || (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 900))
+    CUTLASS_UNUSED(ptr);
+    CUTLASS_UNUSED(data);
+#else
+    // Vector-2 bf16 atomic reduction requires .target sm_90 or higher
+    uint32_t word = reinterpret_cast<const uint32_t&>(data);
+    asm volatile ("red.gpu.global.add.noftz.bf16x2 [%0], %1;\n" : : "l"(ptr), "r"(word));
+#endif // (__CUDA_ARCH__ >= 900)
+  }
+};
+
 template <typename T>
 using red [[deprecated("use atomic_add<T> instead")]] = atomic_add<T>;