Skip to content

Commit

Permalink
DPP opts for wavefront 32 (ROCm#951)
Browse files Browse the repository at this point in the history
Checks the wavefront size, then adjusts the DPP reduce implementation and the number of threads accordingly.
  • Loading branch information
kahmed10 authored Sep 27, 2021
1 parent 95431eb commit 6e2df9d
Show file tree
Hide file tree
Showing 5 changed files with 61 additions and 14 deletions.
2 changes: 1 addition & 1 deletion .clang-tidy
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ CheckOptions:
- key: bugprone-unused-return-value.CheckedFunctions
value: '::std::async;::std::launder;::std::remove;::std::remove_if;::std::unique;::std::unique_ptr::release;::std::basic_string::empty;::std::vector::empty;::std::find;::std::find_if;::std::find_if_not;::std::all_of;::std::any_of;::std::none_of;::std::count;::std::count_if;::std::mismatch;::std::find_end;::std::find_first_of;::std::adjacent_find;::std::search;::std::search_n;::std::nth_element;::std::lower_bound;::std::upper_bound;::std::binary_search;::std::equal_range;::std::max;::std::max_element;::std::min;::std::min_element;::std::minmax;::std::minmax_element;::std::equal;::std::lexicographical_compare;::std::accumulate;::std::inner_product'
- key: cppcoreguidelines-macro-usage.AllowedRegexp
value: 'DEBUG|FALLTHROUGH|STRINGIZE|_HAS_|_THROW|_REQUIRES|_DECLARE_|_VISIT_|_REGISTER_|_GENERATE_|_DETAIL_|_TIDY_|_MANAGE_PTR|_MATCHER|DEVICE_SHARED'
value: 'DEBUG|FALLTHROUGH|STRINGIZE|_HAS_|_THROW|_REQUIRES|_DECLARE_|_VISIT_|_REGISTER_|_GENERATE_|_DETAIL_|_TIDY_|_MANAGE_PTR|_MATCHER|DEVICE_SHARED|_WORKAROUND_'
- key: modernize-loop-convert.MinConfidence
value: risky
- key: modernize-loop-convert.NamingStyle
Expand Down
24 changes: 15 additions & 9 deletions src/targets/gpu/device/include/migraphx/gpu/device/reduce.hpp
100755 → 100644
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,6 @@ inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {

#if __AMDGCN_WAVEFRONT_SIZE == 32
#define MIGRAPHX_NO_DPP
#endif

#ifdef MIGRAPHX_NO_DPP
template <index_int N,
class Op,
Expand Down Expand Up @@ -98,10 +94,12 @@ __device__ void dpp_reduce(T& in, Op op)
in = op(in, out);
out = dpp_mov<dpp_row_shr(8), 0xf, 0xc>(in);
in = op(in, out);
#if __AMDGCN_WAVEFRONT_SIZE == 64
out = dpp_mov<dpp_row_bcast(15), 0xa>(in);
in = op(in, out);
out = dpp_mov<dpp_row_bcast(31), 0xc>(in);
in = op(in, out);
#endif
}

__device__ inline void dpp_reduce(float& x, sum)
Expand All @@ -118,9 +116,11 @@ __device__ inline void dpp_reduce(float& x, sum)
"s_nop 1\n"
"v_add_f32 %0 %0 %0 row_shr:8 bank_mask:0xc\n"
"s_nop 1\n"
#if __AMDGCN_WAVEFRONT_SIZE == 64
"v_add_f32 %0 %0 %0 row_bcast:15 row_mask:0xa\n"
"s_nop 1\n"
"v_add_f32 %0 %0 %0 row_bcast:31 row_mask:0xc\n"
#endif
"s_nop 1\n"
: "=v"(x)
: "0"(x));
Expand All @@ -135,21 +135,27 @@ template <index_int N,
MIGRAPHX_REQUIRES(not std::is_integral<ForStride>{})>
__device__ auto block_reduce(index idx, Op op, T init, ForStride fs, F f)
{
using type = decltype(f(deduce_for_stride(fs)));
MIGRAPHX_DEVICE_SHARED type buffer[N / 64];

#if __AMDGCN_WAVEFRONT_SIZE == 32
constexpr index_int nthreads = 16;
#else
constexpr index_int nthreads = 64;
#endif
using type = decltype(f(deduce_for_stride(fs)));
MIGRAPHX_DEVICE_SHARED type buffer[N / nthreads];
type x = init;
fs([&](auto i) { x = op(x, f(i)); });
dpp_reduce(x, op);

const auto ldsidx = idx.local / 64;
if((idx.local % 64) == 63)
const auto ldsidx = idx.local / nthreads;
if((idx.local % nthreads) == nthreads - 1)
{
buffer[ldsidx] = x;
}
__syncthreads();

type y = init;
for(index_int i = 0; i < idx.nlocal() / 64; i++)
for(index_int i = 0; i < idx.nlocal() / nthreads; i++)
{
y = op(y, buffer[i]);
}
Expand Down
19 changes: 15 additions & 4 deletions src/targets/gpu/device/layernorm.cpp
100755 → 100644
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,14 @@ inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {

#ifndef MIGRAPHX_WORKAROUND_NAVI_DPP_SYNC
#if __AMDGCN_WAVEFRONT_SIZE == 32
#define MIGRAPHX_WORKAROUND_NAVI_DPP_SYNC 1
#else
#define MIGRAPHX_WORKAROUND_NAVI_DPP_SYNC 0
#endif
#endif

template <class T>
struct vector_type
{
Expand Down Expand Up @@ -86,10 +94,13 @@ __device__ void layernorm(index_int i,
const bool in_range = idx.local < relements_v;

auto mean = [&](auto z) {
return auto_block_reduce<MaxBlockSize>(
idx, sum{}, value_type(0), relements_v, [=](auto) { return z; }) /
value_type(relements);

auto m = auto_block_reduce<MaxBlockSize>(
idx, sum{}, value_type(0), relements_v, [=](auto) { return z; }) /
value_type(relements);
#if MIGRAPHX_WORKAROUND_NAVI_DPP_SYNC
__builtin_amdgcn_s_barrier();
#endif
return m;
};

// m = x - mean(x)
Expand Down
17 changes: 17 additions & 0 deletions test/verify/test_layernorm.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -81,3 +81,20 @@ struct test_layernorm_triadd : verify_program<test_layernorm_triadd>
return p;
}
};

// Verify program: layernorm applied to the sum of three {1, 384, 1024}
// float inputs (x + y + z, then layernorm over the result).
struct test_layernorm_triadd_large : verify_program<test_layernorm_triadd_large>
{
    migraphx::program create_program() const
    {
        migraphx::program p;
        auto* mod = p.get_main_module();
        const std::vector<size_t> lens = {1, 384, 1024};
        const migraphx::shape s{migraphx::shape::float_type, lens};
        auto a = mod->add_parameter("x", s);
        auto b = mod->add_parameter("y", s);
        auto c = mod->add_parameter("z", s);
        // Chain the two adds, then hand the result to the shared layernorm helper.
        auto sum_ab  = mod->add_instruction(migraphx::make_op("add"), a, b);
        auto sum_abc = mod->add_instruction(migraphx::make_op("add"), sum_ab, c);
        add_layernorm(*mod, sum_abc, lens);
        return p;
    }
};
13 changes: 13 additions & 0 deletions test/verify/test_reduce_op_large.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,3 +27,16 @@ template struct test_reduce_op_large<migraphx::op::reduce_mean, 1, migraphx::sha
template struct test_reduce_op_large<migraphx::op::reduce_min, 1, migraphx::shape::float_type>;
template struct test_reduce_op_large<migraphx::op::reduce_prod, 2, migraphx::shape::float_type>;
template struct test_reduce_op_large<migraphx::op::reduce_sum, 1, migraphx::shape::float_type>;

// Verify program: reduce_mean over axis 1 of a {1, 384, 1024} float tensor.
struct test_reduce_mean : verify_program<test_reduce_mean>
{
    migraphx::program create_program() const
    {
        migraphx::program p;
        auto* mm = p.get_main_module();
        migraphx::shape s{migraphx::shape::float_type, {1, 384, 1024}};
        auto x = mm->add_parameter("x", s);
        mm->add_instruction(migraphx::op::reduce_mean{{1}}, x);
        return p;
    } // fixed: removed stray ';' that followed the function body
};

0 comments on commit 6e2df9d

Please sign in to comment.