Skip to content

Commit

Permalink
DPP opts for wavefront 32 (ROCm#951)
Browse files Browse the repository at this point in the history
Checks the wavefront size, then adjusts the DPP reduce implementation and the number of threads accordingly.
  • Loading branch information
kahmed10 authored Sep 27, 2021
1 parent 95431eb commit 6e2df9d
Show file tree
Hide file tree
Showing 5 changed files with 61 additions and 14 deletions.
2 changes: 1 addition & 1 deletion .clang-tidy
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ CheckOptions:
- key: bugprone-unused-return-value.CheckedFunctions
value: '::std::async;::std::launder;::std::remove;::std::remove_if;::std::unique;::std::unique_ptr::release;::std::basic_string::empty;::std::vector::empty;::std::find;::std::find_if;::std::find_if_not;::std::all_of;::std::any_of;::std::none_of;::std::count;::std::count_if;::std::mismatch;::std::find_end;::std::find_first_of;::std::adjacent_find;::std::search;::std::search_n;::std::nth_element;::std::lower_bound;::std::upper_bound;::std::binary_search;::std::equal_range;::std::max;::std::max_element;::std::min;::std::min_element;::std::minmax;::std::minmax_element;::std::equal;::std::lexicographical_compare;::std::accumulate;::std::inner_product'
- key: cppcoreguidelines-macro-usage.AllowedRegexp
value: 'DEBUG|FALLTHROUGH|STRINGIZE|_HAS_|_THROW|_REQUIRES|_DECLARE_|_VISIT_|_REGISTER_|_GENERATE_|_DETAIL_|_TIDY_|_MANAGE_PTR|_MATCHER|DEVICE_SHARED'
value: 'DEBUG|FALLTHROUGH|STRINGIZE|_HAS_|_THROW|_REQUIRES|_DECLARE_|_VISIT_|_REGISTER_|_GENERATE_|_DETAIL_|_TIDY_|_MANAGE_PTR|_MATCHER|DEVICE_SHARED|_WORKAROUND_'
- key: modernize-loop-convert.MinConfidence
value: risky
- key: modernize-loop-convert.NamingStyle
Expand Down
24 changes: 15 additions & 9 deletions src/targets/gpu/device/include/migraphx/gpu/device/reduce.hpp
100755 → 100644
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,6 @@ inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {

#if __AMDGCN_WAVEFRONT_SIZE == 32
#define MIGRAPHX_NO_DPP
#endif

#ifdef MIGRAPHX_NO_DPP
template <index_int N,
class Op,
Expand Down Expand Up @@ -98,10 +94,12 @@ __device__ void dpp_reduce(T& in, Op op)
in = op(in, out);
out = dpp_mov<dpp_row_shr(8), 0xf, 0xc>(in);
in = op(in, out);
#if __AMDGCN_WAVEFRONT_SIZE == 64
out = dpp_mov<dpp_row_bcast(15), 0xa>(in);
in = op(in, out);
out = dpp_mov<dpp_row_bcast(31), 0xc>(in);
in = op(in, out);
#endif
}

__device__ inline void dpp_reduce(float& x, sum)
Expand All @@ -118,9 +116,11 @@ __device__ inline void dpp_reduce(float& x, sum)
"s_nop 1\n"
"v_add_f32 %0 %0 %0 row_shr:8 bank_mask:0xc\n"
"s_nop 1\n"
#if __AMDGCN_WAVEFRONT_SIZE == 64
"v_add_f32 %0 %0 %0 row_bcast:15 row_mask:0xa\n"
"s_nop 1\n"
"v_add_f32 %0 %0 %0 row_bcast:31 row_mask:0xc\n"
#endif
"s_nop 1\n"
: "=v"(x)
: "0"(x));
Expand All @@ -135,21 +135,27 @@ template <index_int N,
MIGRAPHX_REQUIRES(not std::is_integral<ForStride>{})>
__device__ auto block_reduce(index idx, Op op, T init, ForStride fs, F f)
{
using type = decltype(f(deduce_for_stride(fs)));
MIGRAPHX_DEVICE_SHARED type buffer[N / 64];

#if __AMDGCN_WAVEFRONT_SIZE == 32
constexpr index_int nthreads = 16;
#else
constexpr index_int nthreads = 64;
#endif
using type = decltype(f(deduce_for_stride(fs)));
MIGRAPHX_DEVICE_SHARED type buffer[N / nthreads];
type x = init;
fs([&](auto i) { x = op(x, f(i)); });
dpp_reduce(x, op);

const auto ldsidx = idx.local / 64;
if((idx.local % 64) == 63)
const auto ldsidx = idx.local / nthreads;
if((idx.local % nthreads) == nthreads - 1)
{
buffer[ldsidx] = x;
}
__syncthreads();

type y = init;
for(index_int i = 0; i < idx.nlocal() / 64; i++)
for(index_int i = 0; i < idx.nlocal() / nthreads; i++)
{
y = op(y, buffer[i]);
}
Expand Down
19 changes: 15 additions & 4 deletions src/targets/gpu/device/layernorm.cpp
100755 → 100644
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,14 @@ inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {

#ifndef MIGRAPHX_WORKAROUND_NAVI_DPP_SYNC
#if __AMDGCN_WAVEFRONT_SIZE == 32
#define MIGRAPHX_WORKAROUND_NAVI_DPP_SYNC 1
#else
#define MIGRAPHX_WORKAROUND_NAVI_DPP_SYNC 0
#endif
#endif

template <class T>
struct vector_type
{
Expand Down Expand Up @@ -86,10 +94,13 @@ __device__ void layernorm(index_int i,
const bool in_range = idx.local < relements_v;

auto mean = [&](auto z) {
return auto_block_reduce<MaxBlockSize>(
idx, sum{}, value_type(0), relements_v, [=](auto) { return z; }) /
value_type(relements);

auto m = auto_block_reduce<MaxBlockSize>(
idx, sum{}, value_type(0), relements_v, [=](auto) { return z; }) /
value_type(relements);
#if MIGRAPHX_WORKAROUND_NAVI_DPP_SYNC
__builtin_amdgcn_s_barrier();
#endif
return m;
};

// m = x - mean(x)
Expand Down
17 changes: 17 additions & 0 deletions test/verify/test_layernorm.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -81,3 +81,20 @@ struct test_layernorm_triadd : verify_program<test_layernorm_triadd>
return p;
}
};

// Verify program: layernorm applied to the sum of three {1, 384, 1024}
// float inputs (x + y + z, then layernorm over the result).
struct test_layernorm_triadd_large : verify_program<test_layernorm_triadd_large>
{
    migraphx::program create_program() const
    {
        migraphx::program p;
        auto* mod = p.get_main_module();
        const std::vector<size_t> lens = {1, 384, 1024};
        const migraphx::shape s{migraphx::shape::float_type, lens};
        auto a = mod->add_parameter("x", s);
        auto b = mod->add_parameter("y", s);
        auto c = mod->add_parameter("z", s);
        // Chain the two adds, then hand the result to the shared layernorm helper.
        auto sum_ab  = mod->add_instruction(migraphx::make_op("add"), a, b);
        auto sum_abc = mod->add_instruction(migraphx::make_op("add"), sum_ab, c);
        add_layernorm(*mod, sum_abc, lens);
        return p;
    }
};
13 changes: 13 additions & 0 deletions test/verify/test_reduce_op_large.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,3 +27,16 @@ template struct test_reduce_op_large<migraphx::op::reduce_mean, 1, migraphx::sha
template struct test_reduce_op_large<migraphx::op::reduce_min, 1, migraphx::shape::float_type>;
template struct test_reduce_op_large<migraphx::op::reduce_prod, 2, migraphx::shape::float_type>;
template struct test_reduce_op_large<migraphx::op::reduce_sum, 1, migraphx::shape::float_type>;

// Verify program: reduce_mean over axis 1 of a {1, 384, 1024} float tensor.
struct test_reduce_mean : verify_program<test_reduce_mean>
{
    migraphx::program create_program() const
    {
        migraphx::program p;
        auto* mm = p.get_main_module();
        migraphx::shape s{migraphx::shape::float_type, {1, 384, 1024}};
        auto x = mm->add_parameter("x", s);
        mm->add_instruction(migraphx::op::reduce_mean{{1}}, x);
        return p;
    } // fixed: removed stray ';' that followed the function body
};

0 comments on commit 6e2df9d

Please sign in to comment.