[GPU] BugFix reduce_b_fs_yx_fsv16 kernel (openvinotoolkit#17477)
+ Fixed an invalid calculation when reducing an un-aligned feature axis for b_fs_yx_fsv16
+ Some reduce modes are not invariant to out-of-range 0 values
+ Added the ZERO_INVARIANT_REDUCTION jit constant
+ Enabled unit tests on dGPU that had been blocked (see PR#15873)

Signed-off-by: Min, Byungil <[email protected]>
byungilm authored May 24, 2023
1 parent 60d5572 commit 0d3b636
Showing 3 changed files with 60 additions and 33 deletions.
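
For context on the second bullet: a reduction mode is "zero-invariant" when zero-filled padding cannot change its result. Sum-like modes qualify; min, max, and prod do not. A minimal standalone C++ sketch of the distinction (illustrative only, not part of this commit):

// zero_invariance_demo.cpp -- illustrative only, not part of this commit.
#include <algorithm>
#include <functional>
#include <iostream>
#include <numeric>
#include <vector>

int main() {
    std::vector<float> real   = {3.f, 5.f, 2.f};           // actual feature values
    std::vector<float> padded = {3.f, 5.f, 2.f, 0.f, 0.f}; // fsv-style zero padding

    auto sum  = [](const std::vector<float>& v) { return std::accumulate(v.begin(), v.end(), 0.f); };
    auto prod = [](const std::vector<float>& v) { return std::accumulate(v.begin(), v.end(), 1.f, std::multiplies<float>()); };

    std::cout << "sum : " << sum(real) << " vs " << sum(padded) << "\n";    // 10 vs 10 -- unchanged
    std::cout << "mean: " << sum(real) / real.size() << " vs "
              << sum(padded) / real.size() << "\n";                         // unchanged, given the true element count
    std::cout << "min : " << *std::min_element(real.begin(), real.end()) << " vs "
              << *std::min_element(padded.begin(), padded.end()) << "\n";   // 2 vs 0 -- corrupted
    std::cout << "prod: " << prod(real) << " vs " << prod(padded) << "\n";  // 30 vs 0 -- corrupted
    return 0;
}

For zero-invariant modes the kernel can therefore block-read straight through the padded tail of the feature axis, provided the padding really is zero; the changes below arrange exactly that.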
@@ -156,7 +156,7 @@ KERNEL(reduce_fsv16)(
const uint xy = (uint)get_global_id(1) * READ_OFFSET;
const uint x = xy % ALIGN(COMMON_OUTPUT_SIZE_X, READ_OFFSET);
const uint y = xy / ALIGN(COMMON_OUTPUT_SIZE_X, READ_OFFSET);
-#endif
+#endif // !IS_REDUCE_XY
const uint bf = (uint)get_global_id(2) * SIMD;
const uint b = bf / ALIGN(COMMON_OUTPUT_FEATURE_NUM, SIMD);
const uint f = bf % ALIGN(COMMON_OUTPUT_FEATURE_NUM, SIMD);
@@ -252,7 +252,7 @@ uint offset = batch_out * input_batch_pitch + ((feature_out + FSV - 1) / FSV) *
for (uint yi = y_out; yi < y_max_val; ++yi) {
for (uint xi = x_out; xi < x_max_val; ++xi) {
INPUT_VEC input = (INPUT_VEC)(INPUT_INIT_VAL);
-#if REDUCE_FEATURE && (INPUT0_FEATURE_NUM % FSV != 0)
+#if REDUCE_FEATURE && (INPUT0_FEATURE_NUM % FSV != 0) && !ZERO_INVARIANT_REDUCTION
if (fi + FSV <= INPUT0_FEATURE_NUM)
input = BLOCK_READ(data, offset);
else
@@ -269,7 +269,7 @@ uint offset = batch_out * input_batch_pitch + ((feature_out + FSV - 1) / FSV) *
#if INPUT0_SIZE_X % READ_OFFSET != 0
for (uint xi = x_leftover_start; xi < x_leftover_end; ++xi) {
INPUT0_TYPE leftovers = INIT_VAL;
-#if REDUCE_FEATURE && (INPUT0_FEATURE_NUM % FSV != 0)
+#if REDUCE_FEATURE && (INPUT0_FEATURE_NUM % FSV != 0) && !ZERO_INVARIANT_REDUCTION
if (fi + FSV <= INPUT0_FEATURE_NUM)
leftovers = DT_INPUT_BLOCK_READ(data, offset);
else
@@ -330,27 +330,25 @@ uint offset = batch_out * input_batch_pitch + ((feature_out + FSV - 1) / FSV) *
if (get_sub_group_local_id() == 0)
output[out_idx] = final_result;
#endif
-#else
+#else // !REDUCE_X
ACCUMULATOR_VEC acc = (ACCUMULATOR_VEC)(INIT_VAL);
for (uint bi = batch_out; bi < batch_max_val; ++bi) {
for (uint fi = feature_out; fi < feature_max_val; fi += FSV) {

for (uint yi = y_out; yi < y_max_val; ++yi) {
for (uint xi = x_out; xi < x_max_val; ++xi) {
-#if HANDLE_FEATURE_REMAINDER
-INPUT_VEC input = (INPUT_VEC)(INPUT_INIT_VAL);
-#if REDUCE_FEATURE && (INPUT0_FEATURE_NUM % FSV != 0)
+#if REDUCE_FEATURE && (INPUT0_FEATURE_NUM % FSV != 0) && !ZERO_INVARIANT_REDUCTION
+INPUT_VEC input = (INPUT_VEC)(INPUT_INIT_VAL);
if (fi + FSV <= INPUT0_FEATURE_NUM)
input = BLOCK_READ(data, offset);
else
if (fi + get_sub_group_local_id() < INPUT0_FEATURE_NUM)
for (int i = 0; i < READ_OFFSET; ++i)
input[i] = data[offset + get_sub_group_local_id() + i * get_max_sub_group_size()];
#else
-input = BLOCK_READ(data, offset);
+INPUT_VEC input = BLOCK_READ(data, offset);
#endif
-#else
-INPUT_VEC input = BLOCK_READ(data, offset);
-#endif

unroll_for (int i = 0; i < READ_OFFSET; ++i)
acc[i] = FUNC_CALL(apply_reduce)(acc[i], input[i]);
offset += input_x_pitch;
@@ -410,7 +408,7 @@ uint offset = batch_out * input_batch_pitch + ((feature_out + FSV - 1) / FSV) *
#endif
}
}
-#endif
+#endif // !REDUCE_X
}

#undef SIMD
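
The guarded path above exists because the last 16-wide feature slice can straddle the real feature extent, where a sub-group block read would pull padding into the accumulator. A simplified scalar C++ sketch of the two read paths (a stand-in for the OpenCL sub-group code; names are hypothetical):

// guarded_read_sketch.cpp -- scalar stand-in for the sub-group code above.
#include <cstdint>
#include <vector>

constexpr uint32_t FSV = 16; // feature slice width of b_fs_yx_fsv16

// Read one 16-wide feature slice starting at feature index `fi` of `feature_num`.
std::vector<float> read_slice(const std::vector<float>& data, uint32_t offset,
                              uint32_t fi, uint32_t feature_num, float init_val) {
    std::vector<float> slice(FSV, init_val);
    if (fi + FSV <= feature_num) {
        // Fast path: the whole slice is in range -- the BLOCK_READ equivalent.
        for (uint32_t lane = 0; lane < FSV; ++lane)
            slice[lane] = data[offset + lane];
    } else {
        // Tail path: each lane checks bounds, so slots past the real feature
        // extent keep init_val and cannot corrupt the reduction.
        for (uint32_t lane = 0; lane < FSV; ++lane)
            if (fi + lane < feature_num)
                slice[lane] = data[offset + lane];
    }
    return slice;
}

int main() {
    std::vector<float> data(32, 1.f);
    auto tail = read_slice(data, 16, 16, 20, 0.f); // 20 features: lanes 0..3 real
    return tail[4] == 0.f ? 0 : 1;                 // lanes 4..15 stayed at init_val
}

When ZERO_INVARIANT_REDUCTION is defined, the kernel skips the tail path and block-reads the zero-filled padding directly, which is why the selector changes in the next file must also guarantee that the padding stays zero.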
@@ -12,6 +12,7 @@ namespace kernel_selector {

static const size_t SIMD = 16;
static const size_t XY_OPT_F_LIMITS = 96;
+static const size_t AXIS_F = 1;
static const size_t AXIS_Y = 2;
static const size_t AXIS_X = 3;
using NDims = std::vector<kernel_selector::Tensor::Dim>;
@@ -78,6 +79,15 @@ static bool can_opt_reduce_xy(const reduce_params& params) {
input_dims[1].v <= XY_OPT_F_LIMITS;
}

+static bool reducing_unaligned_f_axis(const reduce_params& params) {
+if (count(params.reduceAxes.begin(), params.reduceAxes.end(), AXIS_F) > 0) {
+if (params.inputs[0].Feature().v % 16 != 0)
+return true;
+}
+
+return false;
+}

ParamsKey ReduceKernel_b_fs_yx_fsv16::GetSupportedKey() const {
ParamsKey k;
k.EnableInputDataType(Datatype::F16);
@@ -216,13 +226,14 @@ JitConstants ReduceKernel_b_fs_yx_fsv16::GetJitConstants(const reduce_params& pa
}
}

-// MIN/MAX mode should handle feature remainder in case reduce axes includes feature
-if (params.reduceMode == ReduceMode::MIN || params.reduceMode == ReduceMode::MAX) {
-if (count(params.reduceAxes.begin(), params.reduceAxes.end(), 1) > 0) {
-if (params.inputs[0].Feature().v % 16 != 0) {
-jit.AddConstant(MakeJitConstant("HANDLE_FEATURE_REMAINDER", 1));
-}
-}
+// Some reduce modes are affected by 0 values out of range (e.g. min, max, prod); the modes below are not.
+bool zero_invariant_mode = params.reduceMode == ReduceMode::L1 || params.reduceMode == ReduceMode::L2 ||
+params.reduceMode == ReduceMode::LOG_SUM || params.reduceMode == ReduceMode::LOG_SUM_EXP ||
+params.reduceMode == ReduceMode::MEAN || params.reduceMode == ReduceMode::OR ||
+params.reduceMode == ReduceMode::SUM || params.reduceMode == ReduceMode::SUM_SQUARE;
+
+if (zero_invariant_mode && reducing_unaligned_f_axis(params)) {
+jit.AddConstant(MakeJitConstant("ZERO_INVARIANT_REDUCTION", 1));
+}
}

return jit;
@@ -232,6 +243,8 @@ KernelsData ReduceKernel_b_fs_yx_fsv16::GetKernelsData(const Params& params, con
KernelsData kds = GetCommonKernelsData(params, options);
const reduce_params& orgParams = static_cast<const reduce_params&>(params);

+// To get a perf gain when reducing an un-aligned f axis, the reduce kernel
+// relies on out-of-range values in the inner block being 0, so memory re-use is disabled.
if (orgParams.inputs[0].Feature().v % 16 != 0) {
kds[0].can_reuse_memory = false;
}
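
The selector logic rests on how b_fs_yx_fsv16 pads the feature axis: features are stored in slices of 16, so an un-aligned feature count leaves unused slots in the last slice. A rough sketch of the slice arithmetic, matching the ((feature_out + FSV - 1) / FSV) rounding used by the kernel:

// fsv16_padding_sketch.cpp -- slice arithmetic for an un-aligned feature axis.
#include <cstdint>
#include <iostream>

int main() {
    constexpr uint32_t FSV = 16;
    uint32_t feature_num = 20;                       // 20 % 16 != 0 -> un-aligned
    uint32_t slices = (feature_num + FSV - 1) / FSV; // 2 slices allocated
    uint32_t padded = slices * FSV - feature_num;    // 12 unused slots in the last slice
    std::cout << slices << " slices, " << padded << " padded slots\n";
    return 0;
}

Those slots only hold zeros when the buffer comes freshly zero-initialized rather than recycled, which is what disabling can_reuse_memory above is there to ensure, per the comment in the diff.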
44 changes: 30 additions & 14 deletions src/plugins/intel_gpu/tests/unit/fusions/reduce_fusion_test.cpp
@@ -34,8 +34,10 @@ struct reduce_test_params {

class ReduceFusingTest : public ::BaseFusingTest<reduce_test_params> {
public:
-void execute(reduce_test_params& p, bool is_dynamic = false) {
-auto input_prim = get_mem(get_input_layout(p));
+// The input generator fills blocked formats with random values outside of the shape.
+// To avoid that, create the input in the planar format given by 'default_format' and add a reorder.
+void execute(reduce_test_params& p, bool is_dynamic = false, bool use_planar_input = false) {
+auto input_prim = get_mem(get_input_layout(p, use_planar_input));

cfg_not_fused.set_property(ov::intel_gpu::allow_new_shape_infer(is_dynamic));
cfg_fused.set_property(ov::intel_gpu::allow_new_shape_infer(is_dynamic));
@@ -63,8 +65,11 @@ class ReduceFusingTest : public ::BaseFusingTest<reduce_test_params> {
return layout{ ov::PartialShape::dynamic(p.in_shape.size()), p.data_type, p.input_format };
}

-layout get_input_layout(reduce_test_params& p) {
-return layout{ p.in_shape, p.data_type, p.input_format };
+layout get_input_layout(reduce_test_params& p, bool use_planar_input = false) {
+if (use_planar_input)
+return layout{ p.in_shape, p.data_type, format::get_default_format(p.input_format) };
+else
+return layout{ p.in_shape, p.data_type, p.input_format };
}

layout get_output_layout(reduce_test_params& p) {
@@ -113,19 +118,17 @@ class ReduceFusingTest : public ::BaseFusingTest<reduce_test_params> {

class reduce_eltwise_activation_quantize : public ReduceFusingTest {};
TEST_P(reduce_eltwise_activation_quantize, basic) {
-// TODO: Fix me, refer PR(#15873)
-if (engine.get_device_info().supports_immad)
-return;
auto p = GetParam();
update_out_shape(p);
create_topologies(
input_layout("input", get_input_layout(p)),
input_layout("input", get_input_layout(p, true)),
reorder("input_reorder", input_info("input"), p.input_format, p.data_type),
data("in_lo", get_mem(get_single_element_layout(p), min_random, 0)),
data("in_hi", get_mem(get_single_element_layout(p), 1, max_random)),
data("out_lo", get_mem(get_single_element_layout(p), -128)),
data("out_hi", get_mem(get_single_element_layout(p), 127)),
data("eltwise_data", get_mem(get_output_layout(p))),
reduce("reduce", input_info("input"), p.reduce_mode, p.reduce_axes, p.keep_dims),
reduce("reduce", input_info("input_reorder"), p.reduce_mode, p.reduce_axes, p.keep_dims),
eltwise("eltwise", { input_info("reduce"), input_info("eltwise_data") }, eltwise_mode::sum, p.default_type),
activation("activation", input_info("eltwise"), activation_func::relu),
quantize("quantize", input_info("activation"), input_info("in_lo"), input_info("in_hi"),
@@ -134,28 +137,31 @@ TEST_P(reduce_eltwise_activation_quantize, basic) {
);

tolerance = 1.f;
-execute(p);
+// Use a planar input format; 'input_reorder' converts it to 'input_format'.
+execute(p, false, true);
}

TEST_P(reduce_eltwise_activation_quantize, per_channel) {
auto p = GetParam();
update_out_shape(p);
create_topologies(
input_layout("input", get_input_layout(p)),
input_layout("input", get_input_layout(p, true)),
reorder("input_reorder", input_info("input"), p.input_format, p.data_type),
data("in_lo", get_mem(get_per_channel_layout(p), min_random, 0)),
data("in_hi", get_mem(get_per_channel_layout(p), 1, max_random)),
data("out_lo", get_mem(get_single_element_layout(p), -128)),
data("out_hi", get_mem(get_single_element_layout(p), 127)),
data("eltwise_data", get_mem(get_output_layout(p))),
reduce("reduce", input_info("input"), p.reduce_mode, p.reduce_axes, p.keep_dims),
reduce("reduce", input_info("input_reorder"), p.reduce_mode, p.reduce_axes, p.keep_dims),
eltwise("eltwise", { input_info("reduce"), input_info("eltwise_data") }, eltwise_mode::sum, p.default_type),
activation("activation", input_info("eltwise"), activation_func::relu),
quantize("quantize", input_info("activation"), input_info("in_lo"), input_info("in_hi"), input_info("out_lo"), input_info("out_hi"), 256, data_types::i8),
reorder("output_reorder", input_info("quantize"), p.default_format, data_types::f32)
);

tolerance = 1.f;
-execute(p);
+// Use a planar input format; 'input_reorder' converts it to 'input_format'.
+execute(p, false, true);
}

INSTANTIATE_TEST_SUITE_P(fusings_gpu, reduce_eltwise_activation_quantize, ::testing::ValuesIn(std::vector<reduce_test_params>{
@@ -215,7 +221,6 @@ INSTANTIATE_TEST_SUITE_P(fusings_gpu, reduce_eltwise_activation_quantize, ::test
reduce_test_params{ CASE_REDUCE_U8_1, 2, 5, reduce_mode::max, { 2, 1, 0 }, true, "reduce_ref" },
reduce_test_params{ CASE_REDUCE_U8_2, 2, 5, reduce_mode::sum, { 4, 3, 0 }, true, "reduce_ref" },
reduce_test_params{ CASE_REDUCE_U8_1, 2, 5, reduce_mode::min, { 3, 2, 1 }, true, "reduce_ref" },
-reduce_test_params{ CASE_REDUCE_U8_0, 2, 5, reduce_mode::sum, { 1, 0 }, true, "reduce_gpu_b_fs_yx_fsv16" },
reduce_test_params{ CASE_REDUCE_U8_4, 2, 5, reduce_mode::mean, { 1, 3 }, true, "reduce_gpu_b_fs_yx_fsv16" },
reduce_test_params{ CASE_REDUCE_U8_0, 2, 5, reduce_mode::max, { 2, 0 }, true, "reduce_gpu_b_fs_yx_fsv16" },
reduce_test_params{ CASE_REDUCE_U8_4, 2, 5, reduce_mode::sum, { 3, 0 }, true, "reduce_gpu_b_fs_yx_fsv16" },
@@ -228,6 +233,17 @@ INSTANTIATE_TEST_SUITE_P(fusings_gpu, reduce_eltwise_activation_quantize, ::test
reduce_test_params{ CASE_REDUCE_U8_4, 2, 5, reduce_mode::mean, { 3 }, true, "reduce_gpu_b_fs_yx_fsv16" }
}));

+INSTANTIATE_TEST_SUITE_P(fusings_gpu_bf_axis, reduce_eltwise_activation_quantize, ::testing::ValuesIn(std::vector<reduce_test_params>{
+reduce_test_params{ CASE_REDUCE_U8_0, 2, 5, reduce_mode::sum, { 1, 0 }, true, "reduce_gpu_b_fs_yx_fsv16" },
+reduce_test_params{ CASE_REDUCE_U8_0, 2, 5, reduce_mode::max, { 1, 0 }, true, "reduce_gpu_b_fs_yx_fsv16" },
+reduce_test_params{ CASE_REDUCE_U8_0, 2, 5, reduce_mode::prod, { 1, 0 }, true, "reduce_gpu_b_fs_yx_fsv16" },
+reduce_test_params{ CASE_REDUCE_U8_0, 2, 5, reduce_mode::mean, { 1, 0 }, true, "reduce_gpu_b_fs_yx_fsv16" },
+reduce_test_params{ CASE_REDUCE_U8_0, 2, 5, reduce_mode::sum_square, { 1, 0 }, true, "reduce_gpu_b_fs_yx_fsv16" },
+reduce_test_params{ CASE_REDUCE_U8_0, 2, 5, reduce_mode::l1, { 1, 0 }, true, "reduce_gpu_b_fs_yx_fsv16" },
+reduce_test_params{ CASE_REDUCE_U8_0, 2, 5, reduce_mode::l2, { 1, 0 }, true, "reduce_gpu_b_fs_yx_fsv16" },
+reduce_test_params{ CASE_REDUCE_U8_0, 2, 5, reduce_mode::log_sum, { 1, 0 }, true, "reduce_gpu_b_fs_yx_fsv16" }
+}));

class reduce_scale_activation : public ReduceFusingTest {};
TEST_P(reduce_scale_activation, basic) {
auto p = GetParam();
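
The test-side pattern above (planar input plus an explicit reorder) works because a reorder from the un-padded planar layout writes only the real elements of the blocked buffer, leaving its padding zero-filled instead of random. A toy single-batch version of such a reorder (assumed f-slice, then spatial, then fsv ordering; not the plugin's actual implementation):

// planar_to_fsv16_sketch.cpp -- toy reorder; not the plugin's implementation.
#include <cstdint>
#include <iostream>
#include <vector>

std::vector<float> planar_to_fsv16(const std::vector<float>& planar,
                                   uint32_t feature_num, uint32_t spatial) {
    constexpr uint32_t FSV = 16;
    uint32_t slices = (feature_num + FSV - 1) / FSV;
    std::vector<float> blocked(slices * FSV * spatial, 0.f); // padding stays zero
    for (uint32_t f = 0; f < feature_num; ++f)
        for (uint32_t s = 0; s < spatial; ++s)
            blocked[(f / FSV) * spatial * FSV + s * FSV + (f % FSV)] =
                planar[f * spatial + s];
    return blocked;
}

int main() {
    std::vector<float> planar(20, 1.f);    // 20 features, 1 spatial element
    auto blocked = planar_to_fsv16(planar, 20, 1);
    std::cout << blocked.size() << " slots, last = " << blocked.back() << "\n"; // 32 slots, last = 0
    return 0;
}

Generating random values directly in the blocked layout, by contrast, also fills those padded slots, which is exactly the generator issue the updated execute() helper avoids.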
