[GPU] BugFix reduce_b_fs_yx_fsv16 kernel (openvinotoolkit#17477)
+ Fixed an invalid calculation when reducing an un-aligned feature axis for b_fs_yx_fsv16
+ Some reduce modes are not invariant to out-of-range 0 values
+ Added the ZERO_INVARIANT_REDUCTION jit constant
+ Enabled unit tests on dGPU that had been blocked (see PR#15873)

Signed-off-by: Min, Byungil <[email protected]>
byungilm authored May 24, 2023
1 parent 60d5572 commit 0d3b636
Showing 3 changed files with 60 additions and 33 deletions.
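
For context on the second bullet: a reduction mode is "zero-invariant" when zero-filled padding cannot change its result. Sum-like modes qualify; min, max, and prod do not. A minimal standalone C++ sketch of the distinction (illustrative only, not part of this commit):

// zero_invariance_demo.cpp -- illustrative only, not part of this commit.
#include <algorithm>
#include <functional>
#include <iostream>
#include <numeric>
#include <vector>

int main() {
    std::vector<float> real   = {3.f, 5.f, 2.f};           // actual feature values
    std::vector<float> padded = {3.f, 5.f, 2.f, 0.f, 0.f}; // fsv-style zero padding

    auto sum  = [](const std::vector<float>& v) { return std::accumulate(v.begin(), v.end(), 0.f); };
    auto prod = [](const std::vector<float>& v) { return std::accumulate(v.begin(), v.end(), 1.f, std::multiplies<float>()); };

    std::cout << "sum : " << sum(real) << " vs " << sum(padded) << "\n";    // 10 vs 10 -- unchanged
    std::cout << "mean: " << sum(real) / real.size() << " vs "
              << sum(padded) / real.size() << "\n";                         // unchanged, given the true element count
    std::cout << "min : " << *std::min_element(real.begin(), real.end()) << " vs "
              << *std::min_element(padded.begin(), padded.end()) << "\n";   // 2 vs 0 -- corrupted
    std::cout << "prod: " << prod(real) << " vs " << prod(padded) << "\n";  // 30 vs 0 -- corrupted
    return 0;
}

For zero-invariant modes the kernel can therefore block-read straight through the padded tail of the feature axis, provided the padding really is zero; the changes below arrange exactly that.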
@@ -156,7 +156,7 @@ KERNEL(reduce_fsv16)(
const uint xy = (uint)get_global_id(1) * READ_OFFSET;
const uint x = xy % ALIGN(COMMON_OUTPUT_SIZE_X, READ_OFFSET);
const uint y = xy / ALIGN(COMMON_OUTPUT_SIZE_X, READ_OFFSET);
-#endif
+#endif // !IS_REDUCE_XY
const uint bf = (uint)get_global_id(2) * SIMD;
const uint b = bf / ALIGN(COMMON_OUTPUT_FEATURE_NUM, SIMD);
const uint f = bf % ALIGN(COMMON_OUTPUT_FEATURE_NUM, SIMD);
@@ -252,7 +252,7 @@ uint offset = batch_out * input_batch_pitch + ((feature_out + FSV - 1) / FSV) *
for (uint yi = y_out; yi < y_max_val; ++yi) {
for (uint xi = x_out; xi < x_max_val; ++xi) {
INPUT_VEC input = (INPUT_VEC)(INPUT_INIT_VAL);
-#if REDUCE_FEATURE && (INPUT0_FEATURE_NUM % FSV != 0)
+#if REDUCE_FEATURE && (INPUT0_FEATURE_NUM % FSV != 0) && !ZERO_INVARIANT_REDUCTION
if (fi + FSV <= INPUT0_FEATURE_NUM)
input = BLOCK_READ(data, offset);
else
@@ -269,7 +269,7 @@ uint offset = batch_out * input_batch_pitch + ((feature_out + FSV - 1) / FSV) *
#if INPUT0_SIZE_X % READ_OFFSET != 0
for (uint xi = x_leftover_start; xi < x_leftover_end; ++xi) {
INPUT0_TYPE leftovers = INIT_VAL;
-#if REDUCE_FEATURE && (INPUT0_FEATURE_NUM % FSV != 0)
+#if REDUCE_FEATURE && (INPUT0_FEATURE_NUM % FSV != 0) && !ZERO_INVARIANT_REDUCTION
if (fi + FSV <= INPUT0_FEATURE_NUM)
leftovers = DT_INPUT_BLOCK_READ(data, offset);
else
@@ -330,27 +330,25 @@ uint offset = batch_out * input_batch_pitch + ((feature_out + FSV - 1) / FSV) *
if (get_sub_group_local_id() == 0)
output[out_idx] = final_result;
#endif
-#else
+#else // !REDUCE_X
ACCUMULATOR_VEC acc = (ACCUMULATOR_VEC)(INIT_VAL);
for (uint bi = batch_out; bi < batch_max_val; ++bi) {
for (uint fi = feature_out; fi < feature_max_val; fi += FSV) {

for (uint yi = y_out; yi < y_max_val; ++yi) {
for (uint xi = x_out; xi < x_max_val; ++xi) {
-#if HANDLE_FEATURE_REMAINDER
-INPUT_VEC input = (INPUT_VEC)(INPUT_INIT_VAL);
-#if REDUCE_FEATURE && (INPUT0_FEATURE_NUM % FSV != 0)
+#if REDUCE_FEATURE && (INPUT0_FEATURE_NUM % FSV != 0) && !ZERO_INVARIANT_REDUCTION
+INPUT_VEC input = (INPUT_VEC)(INPUT_INIT_VAL);
if (fi + FSV <= INPUT0_FEATURE_NUM)
input = BLOCK_READ(data, offset);
else
if (fi + get_sub_group_local_id() < INPUT0_FEATURE_NUM)
for (int i = 0; i < READ_OFFSET; ++i)
input[i] = data[offset + get_sub_group_local_id() + i * get_max_sub_group_size()];
#else
-input = BLOCK_READ(data, offset);
+INPUT_VEC input = BLOCK_READ(data, offset);
#endif
-#else
-INPUT_VEC input = BLOCK_READ(data, offset);
-#endif

unroll_for (int i = 0; i < READ_OFFSET; ++i)
acc[i] = FUNC_CALL(apply_reduce)(acc[i], input[i]);
offset += input_x_pitch;
@@ -410,7 +408,7 @@ uint offset = batch_out * input_batch_pitch + ((feature_out + FSV - 1) / FSV) *
#endif
}
}
-#endif
+#endif // !REDUCE_X
}

#undef SIMD
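
The guarded path above exists because the last 16-wide feature slice can straddle the real feature extent, where a sub-group block read would pull padding into the accumulator. A simplified scalar C++ sketch of the two read paths (a stand-in for the OpenCL sub-group code; names are hypothetical):

// guarded_read_sketch.cpp -- scalar stand-in for the sub-group code above.
#include <cstdint>
#include <vector>

constexpr uint32_t FSV = 16; // feature slice width of b_fs_yx_fsv16

// Read one 16-wide feature slice starting at feature index `fi` of `feature_num`.
std::vector<float> read_slice(const std::vector<float>& data, uint32_t offset,
                              uint32_t fi, uint32_t feature_num, float init_val) {
    std::vector<float> slice(FSV, init_val);
    if (fi + FSV <= feature_num) {
        // Fast path: the whole slice is in range -- the BLOCK_READ equivalent.
        for (uint32_t lane = 0; lane < FSV; ++lane)
            slice[lane] = data[offset + lane];
    } else {
        // Tail path: each lane checks bounds, so slots past the real feature
        // extent keep init_val and cannot corrupt the reduction.
        for (uint32_t lane = 0; lane < FSV; ++lane)
            if (fi + lane < feature_num)
                slice[lane] = data[offset + lane];
    }
    return slice;
}

int main() {
    std::vector<float> data(32, 1.f);
    auto tail = read_slice(data, 16, 16, 20, 0.f); // 20 features: lanes 0..3 real
    return tail[4] == 0.f ? 0 : 1;                 // lanes 4..15 stayed at init_val
}

When ZERO_INVARIANT_REDUCTION is defined, the kernel skips the tail path and block-reads the zero-filled padding directly, which is why the selector changes in the next file must also guarantee that the padding stays zero.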
@@ -12,6 +12,7 @@ namespace kernel_selector {

static const size_t SIMD = 16;
static const size_t XY_OPT_F_LIMITS = 96;
+static const size_t AXIS_F = 1;
static const size_t AXIS_Y = 2;
static const size_t AXIS_X = 3;
using NDims = std::vector<kernel_selector::Tensor::Dim>;
@@ -78,6 +79,15 @@ static bool can_opt_reduce_xy(const reduce_params& params) {
input_dims[1].v <= XY_OPT_F_LIMITS;
}

+static bool reducing_unaligned_f_axis(const reduce_params& params) {
+if (count(params.reduceAxes.begin(), params.reduceAxes.end(), AXIS_F) > 0) {
+if (params.inputs[0].Feature().v % 16 != 0)
+return true;
+}
+
+return false;
+}

ParamsKey ReduceKernel_b_fs_yx_fsv16::GetSupportedKey() const {
ParamsKey k;
k.EnableInputDataType(Datatype::F16);
@@ -216,13 +226,14 @@ JitConstants ReduceKernel_b_fs_yx_fsv16::GetJitConstants(const reduce_params& pa
}
}

-// MIN/MAX mode should handle feature remainder in case reduce axes includes feature
-if (params.reduceMode == ReduceMode::MIN || params.reduceMode == ReduceMode::MAX) {
-if (count(params.reduceAxes.begin(), params.reduceAxes.end(), 1) > 0) {
-if (params.inputs[0].Feature().v % 16 != 0) {
-jit.AddConstant(MakeJitConstant("HANDLE_FEATURE_REMAINDER", 1));
-}
-}
+// Some reduce modes are affected by 0 values out of range (e.g. min, max, prod); the modes below are not.
+bool zero_invariant_mode = params.reduceMode == ReduceMode::L1 || params.reduceMode == ReduceMode::L2 ||
+params.reduceMode == ReduceMode::LOG_SUM || params.reduceMode == ReduceMode::LOG_SUM_EXP ||
+params.reduceMode == ReduceMode::MEAN || params.reduceMode == ReduceMode::OR ||
+params.reduceMode == ReduceMode::SUM || params.reduceMode == ReduceMode::SUM_SQUARE;
+
+if (zero_invariant_mode && reducing_unaligned_f_axis(params)) {
+jit.AddConstant(MakeJitConstant("ZERO_INVARIANT_REDUCTION", 1));
+}
}

return jit;
@@ -232,6 +243,8 @@ KernelsData ReduceKernel_b_fs_yx_fsv16::GetKernelsData(const Params& params, con
KernelsData kds = GetCommonKernelsData(params, options);
const reduce_params& orgParams = static_cast<const reduce_params&>(params);

+// To get a perf gain when reducing an un-aligned f axis, the reduce kernel
+// relies on out-of-range values in the inner block being 0, so memory re-use is disabled.
if (orgParams.inputs[0].Feature().v % 16 != 0) {
kds[0].can_reuse_memory = false;
}
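
The selector logic rests on how b_fs_yx_fsv16 pads the feature axis: features are stored in slices of 16, so an un-aligned feature count leaves unused slots in the last slice. A rough sketch of the slice arithmetic, matching the ((feature_out + FSV - 1) / FSV) rounding used by the kernel:

// fsv16_padding_sketch.cpp -- slice arithmetic for an un-aligned feature axis.
#include <cstdint>
#include <iostream>

int main() {
    constexpr uint32_t FSV = 16;
    uint32_t feature_num = 20;                       // 20 % 16 != 0 -> un-aligned
    uint32_t slices = (feature_num + FSV - 1) / FSV; // 2 slices allocated
    uint32_t padded = slices * FSV - feature_num;    // 12 unused slots in the last slice
    std::cout << slices << " slices, " << padded << " padded slots\n";
    return 0;
}

Those slots only hold zeros when the buffer comes freshly zero-initialized rather than recycled, which is what disabling can_reuse_memory above is there to ensure, per the comment in the diff.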
44 changes: 30 additions & 14 deletions src/plugins/intel_gpu/tests/unit/fusions/reduce_fusion_test.cpp
@@ -34,8 +34,10 @@ struct reduce_test_params {

class ReduceFusingTest : public ::BaseFusingTest<reduce_test_params> {
public:
-void execute(reduce_test_params& p, bool is_dynamic = false) {
-auto input_prim = get_mem(get_input_layout(p));
+// The input generator fills blocked formats with random values outside of the shape.
+// To avoid that, create the input in the planar format given by 'default_format' and add a reorder.
+void execute(reduce_test_params& p, bool is_dynamic = false, bool use_planar_input = false) {
+auto input_prim = get_mem(get_input_layout(p, use_planar_input));

cfg_not_fused.set_property(ov::intel_gpu::allow_new_shape_infer(is_dynamic));
cfg_fused.set_property(ov::intel_gpu::allow_new_shape_infer(is_dynamic));
@@ -63,8 +65,11 @@ class ReduceFusingTest : public ::BaseFusingTest<reduce_test_params> {
return layout{ ov::PartialShape::dynamic(p.in_shape.size()), p.data_type, p.input_format };
}

-layout get_input_layout(reduce_test_params& p) {
-return layout{ p.in_shape, p.data_type, p.input_format };
+layout get_input_layout(reduce_test_params& p, bool use_planar_input = false) {
+if (use_planar_input)
+return layout{ p.in_shape, p.data_type, format::get_default_format(p.input_format) };
+else
+return layout{ p.in_shape, p.data_type, p.input_format };
}

layout get_output_layout(reduce_test_params& p) {
@@ -113,19 +118,17 @@ class ReduceFusingTest : public ::BaseFusingTest<reduce_test_params> {

class reduce_eltwise_activation_quantize : public ReduceFusingTest {};
TEST_P(reduce_eltwise_activation_quantize, basic) {
-// TODO: Fix me, refer PR(#15873)
-if (engine.get_device_info().supports_immad)
-return;
auto p = GetParam();
update_out_shape(p);
create_topologies(
input_layout("input", get_input_layout(p)),
input_layout("input", get_input_layout(p, true)),
reorder("input_reorder", input_info("input"), p.input_format, p.data_type),
data("in_lo", get_mem(get_single_element_layout(p), min_random, 0)),
data("in_hi", get_mem(get_single_element_layout(p), 1, max_random)),
data("out_lo", get_mem(get_single_element_layout(p), -128)),
data("out_hi", get_mem(get_single_element_layout(p), 127)),
data("eltwise_data", get_mem(get_output_layout(p))),
reduce("reduce", input_info("input"), p.reduce_mode, p.reduce_axes, p.keep_dims),
reduce("reduce", input_info("input_reorder"), p.reduce_mode, p.reduce_axes, p.keep_dims),
eltwise("eltwise", { input_info("reduce"), input_info("eltwise_data") }, eltwise_mode::sum, p.default_type),
activation("activation", input_info("eltwise"), activation_func::relu),
quantize("quantize", input_info("activation"), input_info("in_lo"), input_info("in_hi"),
@@ -134,28 +137,31 @@ TEST_P(reduce_eltwise_activation_quantize, basic) {
);

tolerance = 1.f;
-execute(p);
+// Use a planar input format; 'input_reorder' converts it to 'input_format'.
+execute(p, false, true);
}

TEST_P(reduce_eltwise_activation_quantize, per_channel) {
auto p = GetParam();
update_out_shape(p);
create_topologies(
input_layout("input", get_input_layout(p)),
input_layout("input", get_input_layout(p, true)),
reorder("input_reorder", input_info("input"), p.input_format, p.data_type),
data("in_lo", get_mem(get_per_channel_layout(p), min_random, 0)),
data("in_hi", get_mem(get_per_channel_layout(p), 1, max_random)),
data("out_lo", get_mem(get_single_element_layout(p), -128)),
data("out_hi", get_mem(get_single_element_layout(p), 127)),
data("eltwise_data", get_mem(get_output_layout(p))),
reduce("reduce", input_info("input"), p.reduce_mode, p.reduce_axes, p.keep_dims),
reduce("reduce", input_info("input_reorder"), p.reduce_mode, p.reduce_axes, p.keep_dims),
eltwise("eltwise", { input_info("reduce"), input_info("eltwise_data") }, eltwise_mode::sum, p.default_type),
activation("activation", input_info("eltwise"), activation_func::relu),
quantize("quantize", input_info("activation"), input_info("in_lo"), input_info("in_hi"), input_info("out_lo"), input_info("out_hi"), 256, data_types::i8),
reorder("output_reorder", input_info("quantize"), p.default_format, data_types::f32)
);

tolerance = 1.f;
-execute(p);
+// Use a planar input format; 'input_reorder' converts it to 'input_format'.
+execute(p, false, true);
}

INSTANTIATE_TEST_SUITE_P(fusings_gpu, reduce_eltwise_activation_quantize, ::testing::ValuesIn(std::vector<reduce_test_params>{
@@ -215,7 +221,6 @@ INSTANTIATE_TEST_SUITE_P(fusings_gpu, reduce_eltwise_activation_quantize, ::test
reduce_test_params{ CASE_REDUCE_U8_1, 2, 5, reduce_mode::max, { 2, 1, 0 }, true, "reduce_ref" },
reduce_test_params{ CASE_REDUCE_U8_2, 2, 5, reduce_mode::sum, { 4, 3, 0 }, true, "reduce_ref" },
reduce_test_params{ CASE_REDUCE_U8_1, 2, 5, reduce_mode::min, { 3, 2, 1 }, true, "reduce_ref" },
-reduce_test_params{ CASE_REDUCE_U8_0, 2, 5, reduce_mode::sum, { 1, 0 }, true, "reduce_gpu_b_fs_yx_fsv16" },
reduce_test_params{ CASE_REDUCE_U8_4, 2, 5, reduce_mode::mean, { 1, 3 }, true, "reduce_gpu_b_fs_yx_fsv16" },
reduce_test_params{ CASE_REDUCE_U8_0, 2, 5, reduce_mode::max, { 2, 0 }, true, "reduce_gpu_b_fs_yx_fsv16" },
reduce_test_params{ CASE_REDUCE_U8_4, 2, 5, reduce_mode::sum, { 3, 0 }, true, "reduce_gpu_b_fs_yx_fsv16" },
@@ -228,6 +233,17 @@ INSTANTIATE_TEST_SUITE_P(fusings_gpu, reduce_eltwise_activation_quantize, ::test
reduce_test_params{ CASE_REDUCE_U8_4, 2, 5, reduce_mode::mean, { 3 }, true, "reduce_gpu_b_fs_yx_fsv16" }
}));

+INSTANTIATE_TEST_SUITE_P(fusings_gpu_bf_axis, reduce_eltwise_activation_quantize, ::testing::ValuesIn(std::vector<reduce_test_params>{
+reduce_test_params{ CASE_REDUCE_U8_0, 2, 5, reduce_mode::sum, { 1, 0 }, true, "reduce_gpu_b_fs_yx_fsv16" },
+reduce_test_params{ CASE_REDUCE_U8_0, 2, 5, reduce_mode::max, { 1, 0 }, true, "reduce_gpu_b_fs_yx_fsv16" },
+reduce_test_params{ CASE_REDUCE_U8_0, 2, 5, reduce_mode::prod, { 1, 0 }, true, "reduce_gpu_b_fs_yx_fsv16" },
+reduce_test_params{ CASE_REDUCE_U8_0, 2, 5, reduce_mode::mean, { 1, 0 }, true, "reduce_gpu_b_fs_yx_fsv16" },
+reduce_test_params{ CASE_REDUCE_U8_0, 2, 5, reduce_mode::sum_square, { 1, 0 }, true, "reduce_gpu_b_fs_yx_fsv16" },
+reduce_test_params{ CASE_REDUCE_U8_0, 2, 5, reduce_mode::l1, { 1, 0 }, true, "reduce_gpu_b_fs_yx_fsv16" },
+reduce_test_params{ CASE_REDUCE_U8_0, 2, 5, reduce_mode::l2, { 1, 0 }, true, "reduce_gpu_b_fs_yx_fsv16" },
+reduce_test_params{ CASE_REDUCE_U8_0, 2, 5, reduce_mode::log_sum, { 1, 0 }, true, "reduce_gpu_b_fs_yx_fsv16" }
+}));

class reduce_scale_activation : public ReduceFusingTest {};
TEST_P(reduce_scale_activation, basic) {
auto p = GetParam();
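
The test-side pattern above (planar input plus an explicit reorder) works because a reorder from the un-padded planar layout writes only the real elements of the blocked buffer, leaving its padding zero-filled instead of random. A toy single-batch version of such a reorder (assumed f-slice, then spatial, then fsv ordering; not the plugin's actual implementation):

// planar_to_fsv16_sketch.cpp -- toy reorder; not the plugin's implementation.
#include <cstdint>
#include <iostream>
#include <vector>

std::vector<float> planar_to_fsv16(const std::vector<float>& planar,
                                   uint32_t feature_num, uint32_t spatial) {
    constexpr uint32_t FSV = 16;
    uint32_t slices = (feature_num + FSV - 1) / FSV;
    std::vector<float> blocked(slices * FSV * spatial, 0.f); // padding stays zero
    for (uint32_t f = 0; f < feature_num; ++f)
        for (uint32_t s = 0; s < spatial; ++s)
            blocked[(f / FSV) * spatial * FSV + s * FSV + (f % FSV)] =
                planar[f * spatial + s];
    return blocked;
}

int main() {
    std::vector<float> planar(20, 1.f);    // 20 features, 1 spatial element
    auto blocked = planar_to_fsv16(planar, 20, 1);
    std::cout << blocked.size() << " slots, last = " << blocked.back() << "\n"; // 32 slots, last = 0
    return 0;
}

Generating random values directly in the blocked layout, by contrast, also fills those padded slots, which is exactly the generator issue the updated execute() helper avoids.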
