diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/reduce_gpu_b_fs_yx_fsv16.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/reduce_gpu_b_fs_yx_fsv16.cl
index 1f36dfbf6bbd85..0708afa74f9753 100644
--- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/reduce_gpu_b_fs_yx_fsv16.cl
+++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/reduce_gpu_b_fs_yx_fsv16.cl
@@ -156,7 +156,7 @@ KERNEL(reduce_fsv16)(
     const uint xy = (uint)get_global_id(1) * READ_OFFSET;
     const uint x = xy % ALIGN(COMMON_OUTPUT_SIZE_X, READ_OFFSET);
     const uint y = xy / ALIGN(COMMON_OUTPUT_SIZE_X, READ_OFFSET);
-#endif
+#endif // !IS_REDUCE_XY
     const uint bf = (uint)get_global_id(2) * SIMD;
     const uint b = bf / ALIGN(COMMON_OUTPUT_FEATURE_NUM, SIMD);
     const uint f = bf % ALIGN(COMMON_OUTPUT_FEATURE_NUM, SIMD);
@@ -252,7 +252,7 @@ uint offset = batch_out * input_batch_pitch + ((feature_out + FSV - 1) / FSV) *
         for (uint yi = y_out; yi < y_max_val; ++yi) {
             for (uint xi = x_out; xi < x_max_val; ++xi) {
                 INPUT_VEC input = (INPUT_VEC)(INPUT_INIT_VAL);
-                #if REDUCE_FEATURE && (INPUT0_FEATURE_NUM % FSV != 0)
+                #if REDUCE_FEATURE && (INPUT0_FEATURE_NUM % FSV != 0) && !ZERO_INVARIANT_REDUCTION
                 if (fi + FSV <= INPUT0_FEATURE_NUM)
                     input = BLOCK_READ(data, offset);
                 else
@@ -269,7 +269,7 @@ uint offset = batch_out * input_batch_pitch + ((feature_out + FSV - 1) / FSV) *
 #if INPUT0_SIZE_X % READ_OFFSET != 0
         for (uint xi = x_leftover_start; xi < x_leftover_end; ++xi) {
             INPUT0_TYPE leftovers = INIT_VAL;
-            #if REDUCE_FEATURE && (INPUT0_FEATURE_NUM % FSV != 0)
+            #if REDUCE_FEATURE && (INPUT0_FEATURE_NUM % FSV != 0) && !ZERO_INVARIANT_REDUCTION
             if (fi + FSV <= INPUT0_FEATURE_NUM)
                 leftovers = DT_INPUT_BLOCK_READ(data, offset);
             else
@@ -330,15 +330,15 @@ uint offset = batch_out * input_batch_pitch + ((feature_out + FSV - 1) / FSV) *
         if (get_sub_group_local_id() == 0)
             output[out_idx] = final_result;
 #endif
-#else
+#else // !REDUCE_X
     ACCUMULATOR_VEC acc = (ACCUMULATOR_VEC)(INIT_VAL);
     for (uint bi = batch_out; bi < batch_max_val; ++bi) {
         for (uint fi = feature_out; fi < feature_max_val; fi += FSV) {
+
             for (uint yi = y_out; yi < y_max_val; ++yi) {
                 for (uint xi = x_out; xi < x_max_val; ++xi) {
-#if HANDLE_FEATURE_REMAINDER
-                    INPUT_VEC input = (INPUT_VEC)(INPUT_INIT_VAL);
-                    #if REDUCE_FEATURE && (INPUT0_FEATURE_NUM % FSV != 0)
+                    #if REDUCE_FEATURE && (INPUT0_FEATURE_NUM % FSV != 0) && !ZERO_INVARIANT_REDUCTION
+                    INPUT_VEC input = (INPUT_VEC)(INPUT_INIT_VAL);
                     if (fi + FSV <= INPUT0_FEATURE_NUM)
                         input = BLOCK_READ(data, offset);
                     else
@@ -346,11 +346,9 @@ uint offset = batch_out * input_batch_pitch + ((feature_out + FSV - 1) / FSV) *
                         for (int i = 0; i < READ_OFFSET; ++i)
                             input[i] = data[offset + get_sub_group_local_id() + i * get_max_sub_group_size()];
                     #else
-                        input = BLOCK_READ(data, offset);
+                        INPUT_VEC input = BLOCK_READ(data, offset);
                     #endif
-#else
-                    INPUT_VEC input = BLOCK_READ(data, offset);
-#endif
+
                     unroll_for (int i = 0; i < READ_OFFSET; ++i)
                         acc[i] = FUNC_CALL(apply_reduce)(acc[i], input[i]);
                     offset += input_x_pitch;
@@ -410,7 +408,7 @@ uint offset = batch_out * input_batch_pitch + ((feature_out + FSV - 1) / FSV) *
 #endif
         }
     }
-#endif
+#endif // !REDUCE_X
 }
 
 #undef SIMD
diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/reduce/reduce_kernel_b_fs_yx_fsv16.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/reduce/reduce_kernel_b_fs_yx_fsv16.cpp
index df44d029931680..20c4e582346539
--- a/src/plugins/intel_gpu/src/kernel_selector/kernels/reduce/reduce_kernel_b_fs_yx_fsv16.cpp
+++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/reduce/reduce_kernel_b_fs_yx_fsv16.cpp
@@ -12,6 +12,7 @@ namespace kernel_selector {
 
 static const size_t SIMD = 16;
 static const size_t XY_OPT_F_LIMITS = 96;
+static const size_t AXIS_F = 1;
 static const size_t AXIS_Y = 2;
 static const size_t AXIS_X = 3;
 using NDims = std::vector<kernel_selector::Tensor::Dim>;
@@ -78,6 +79,15 @@ static bool can_opt_reduce_xy(const reduce_params& params) {
            input_dims[1].v <= XY_OPT_F_LIMITS;
 }
 
+static bool reducing_unaligned_f_axis(const reduce_params& params) {
+    if (count(params.reduceAxes.begin(), params.reduceAxes.end(), AXIS_F) > 0) {
+        if (params.inputs[0].Feature().v % 16 != 0)
+            return true;
+    }
+
+    return false;
+}
+
 ParamsKey ReduceKernel_b_fs_yx_fsv16::GetSupportedKey() const {
     ParamsKey k;
     k.EnableInputDataType(Datatype::F16);
@@ -216,13 +226,14 @@ JitConstants ReduceKernel_b_fs_yx_fsv16::GetJitConstants(const reduce_params& pa
         }
     }
 
-    // MIN/MAX mode should handle feature remainder in case reduce axes includes feature
-    if (params.reduceMode == ReduceMode::MIN || params.reduceMode == ReduceMode::MAX) {
-        if (count(params.reduceAxes.begin(), params.reduceAxes.end(), 1) > 0) {
-            if (params.inputs[0].Feature().v % 16 != 0) {
-                jit.AddConstant(MakeJitConstant("HANDLE_FEATURE_REMAINDER", 1));
-            }
-        }
+    // Zeros in the padded feature area change the result of some reduce modes (e.g. min, max, prod), but not of the modes below
+    bool zero_invariant_mode = params.reduceMode == ReduceMode::L1 || params.reduceMode == ReduceMode::L2 ||
+                               params.reduceMode == ReduceMode::LOG_SUM || params.reduceMode == ReduceMode::LOG_SUM_EXP ||
+                               params.reduceMode == ReduceMode::MEAN || params.reduceMode == ReduceMode::OR ||
+                               params.reduceMode == ReduceMode::SUM || params.reduceMode == ReduceMode::SUM_SQUARE;
+
+    if (zero_invariant_mode && reducing_unaligned_f_axis(params)) {
+        jit.AddConstant(MakeJitConstant("ZERO_INVARIANT_REDUCTION", 1));
     }
     return jit;
 }
@@ -232,6 +243,8 @@ KernelsData ReduceKernel_b_fs_yx_fsv16::GetKernelsData(const Params& params, con
     KernelsData kds = GetCommonKernelsData(params, options);
     const reduce_params& orgParams = static_cast<const reduce_params&>(params);
 
+    // To get a perf gain when reducing an unaligned f axis, the kernel relies on the out-of-range part of the
+    // inner block being zero, so memory re-use is disabled to keep that padding zero-filled
     if (orgParams.inputs[0].Feature().v % 16 != 0) {
         kds[0].can_reuse_memory = false;
     }
diff --git a/src/plugins/intel_gpu/tests/unit/fusions/reduce_fusion_test.cpp b/src/plugins/intel_gpu/tests/unit/fusions/reduce_fusion_test.cpp
index 2a7d75c107cfea..84ac3b4fc8ce87 100644
--- a/src/plugins/intel_gpu/tests/unit/fusions/reduce_fusion_test.cpp
+++ b/src/plugins/intel_gpu/tests/unit/fusions/reduce_fusion_test.cpp
@@ -34,8 +34,10 @@ struct reduce_test_params {
 
 class ReduceFusingTest : public ::BaseFusingTest<reduce_test_params> {
 public:
-    void execute(reduce_test_params& p, bool is_dynamic = false) {
-        auto input_prim = get_mem(get_input_layout(p));
+    // When the input generator fills a blocked-format buffer, it also writes random values outside of the shape.
+    // To avoid this, create the input in the planar format given by 'default_format' and add a reorder to the blocked format.
+    void execute(reduce_test_params& p, bool is_dynamic = false, bool use_planar_input = false) {
+        auto input_prim = get_mem(get_input_layout(p, use_planar_input));
 
         cfg_not_fused.set_property(ov::intel_gpu::allow_new_shape_infer(is_dynamic));
         cfg_fused.set_property(ov::intel_gpu::allow_new_shape_infer(is_dynamic));
@@ -63,8 +65,11 @@ class ReduceFusingTest : public ::BaseFusingTest<reduce_test_params> {
         return layout{ ov::PartialShape::dynamic(p.in_shape.size()), p.data_type, p.input_format };
     }
 
-    layout get_input_layout(reduce_test_params& p) {
-        return layout{ p.in_shape, p.data_type, p.input_format };
+    layout get_input_layout(reduce_test_params& p, bool use_planar_input = false) {
+        if (use_planar_input)
+            return layout{ p.in_shape, p.data_type, format::get_default_format(p.input_format)};
+        else
+            return layout{ p.in_shape, p.data_type, p.input_format };
     }
 
     layout get_output_layout(reduce_test_params& p) {
@@ -113,19 +118,17 @@
 
 class reduce_eltwise_activation_quantize : public ReduceFusingTest {};
 TEST_P(reduce_eltwise_activation_quantize, basic) {
-    // TODO: Fix me, refer PR(#15873)
-    if (engine.get_device_info().supports_immad)
-        return;
     auto p = GetParam();
     update_out_shape(p);
     create_topologies(
-        input_layout("input", get_input_layout(p)),
+        input_layout("input", get_input_layout(p, true)),
+        reorder("input_reorder", input_info("input"), p.input_format, p.data_type),
         data("in_lo", get_mem(get_single_element_layout(p), min_random, 0)),
         data("in_hi", get_mem(get_single_element_layout(p), 1, max_random)),
         data("out_lo", get_mem(get_single_element_layout(p), -128)),
         data("out_hi", get_mem(get_single_element_layout(p), 127)),
         data("eltwise_data", get_mem(get_output_layout(p))),
-        reduce("reduce", input_info("input"), p.reduce_mode, p.reduce_axes, p.keep_dims),
+        reduce("reduce", input_info("input_reorder"), p.reduce_mode, p.reduce_axes, p.keep_dims),
         eltwise("eltwise", { input_info("reduce"), input_info("eltwise_data") }, eltwise_mode::sum, p.default_type),
         activation("activation", input_info("eltwise"), activation_func::relu),
         quantize("quantize", input_info("activation"), input_info("in_lo"), input_info("in_hi"),
@@ -134,20 +137,22 @@ TEST_P(reduce_eltwise_activation_quantize, basic) {
     );
 
     tolerance = 1.f;
-    execute(p);
+    // Use a planar input format; 'input_reorder' converts it to 'input_format'
+    execute(p, false, true);
 }
 
 TEST_P(reduce_eltwise_activation_quantize, per_channel) {
     auto p = GetParam();
     update_out_shape(p);
     create_topologies(
-        input_layout("input", get_input_layout(p)),
+        input_layout("input", get_input_layout(p, true)),
+        reorder("input_reorder", input_info("input"), p.input_format, p.data_type),
         data("in_lo", get_mem(get_per_channel_layout(p), min_random, 0)),
         data("in_hi", get_mem(get_per_channel_layout(p), 1, max_random)),
         data("out_lo", get_mem(get_single_element_layout(p), -128)),
         data("out_hi", get_mem(get_single_element_layout(p), 127)),
         data("eltwise_data", get_mem(get_output_layout(p))),
-        reduce("reduce", input_info("input"), p.reduce_mode, p.reduce_axes, p.keep_dims),
+        reduce("reduce", input_info("input_reorder"), p.reduce_mode, p.reduce_axes, p.keep_dims),
         eltwise("eltwise", { input_info("reduce"), input_info("eltwise_data") }, eltwise_mode::sum, p.default_type),
         activation("activation", input_info("eltwise"), activation_func::relu),
         quantize("quantize", input_info("activation"), input_info("in_lo"), input_info("in_hi"), input_info("out_lo"), input_info("out_hi"), 256, data_types::i8),
@@ -155,7 +160,8 @@ TEST_P(reduce_eltwise_activation_quantize, per_channel) {
         reorder("output_reorder", input_info("quantize"), p.default_format, data_types::f32)
     );
     tolerance = 1.f;
-    execute(p);
+    // Use a planar input format; 'input_reorder' converts it to 'input_format'
+    execute(p, false, true);
 }
 
 INSTANTIATE_TEST_SUITE_P(fusings_gpu, reduce_eltwise_activation_quantize, ::testing::ValuesIn(std::vector<reduce_test_params>{
@@ -215,7 +221,6 @@ INSTANTIATE_TEST_SUITE_P(fusings_gpu, reduce_eltwise_activation_quantize, ::test
     reduce_test_params{ CASE_REDUCE_U8_1, 2, 5, reduce_mode::max, { 2, 1, 0 }, true, "reduce_ref" },
     reduce_test_params{ CASE_REDUCE_U8_2, 2, 5, reduce_mode::sum, { 4, 3, 0 }, true, "reduce_ref" },
     reduce_test_params{ CASE_REDUCE_U8_1, 2, 5, reduce_mode::min, { 3, 2, 1 }, true, "reduce_ref" },
-    reduce_test_params{ CASE_REDUCE_U8_0, 2, 5, reduce_mode::sum, { 1, 0 }, true, "reduce_gpu_b_fs_yx_fsv16" },
     reduce_test_params{ CASE_REDUCE_U8_4, 2, 5, reduce_mode::mean, { 1, 3 }, true, "reduce_gpu_b_fs_yx_fsv16" },
     reduce_test_params{ CASE_REDUCE_U8_0, 2, 5, reduce_mode::max, { 2, 0 }, true, "reduce_gpu_b_fs_yx_fsv16" },
     reduce_test_params{ CASE_REDUCE_U8_4, 2, 5, reduce_mode::sum, { 3, 0 }, true, "reduce_gpu_b_fs_yx_fsv16" },
@@ -228,6 +233,17 @@ INSTANTIATE_TEST_SUITE_P(fusings_gpu, reduce_eltwise_activation_quantize, ::test
     reduce_test_params{ CASE_REDUCE_U8_4, 2, 5, reduce_mode::mean, { 3 }, true, "reduce_gpu_b_fs_yx_fsv16" }
 }));
 
+INSTANTIATE_TEST_SUITE_P(fusings_gpu_bf_axis, reduce_eltwise_activation_quantize, ::testing::ValuesIn(std::vector<reduce_test_params>{
+    reduce_test_params{ CASE_REDUCE_U8_0, 2, 5, reduce_mode::sum, { 1, 0 }, true, "reduce_gpu_b_fs_yx_fsv16" },
+    reduce_test_params{ CASE_REDUCE_U8_0, 2, 5, reduce_mode::max, { 1, 0 }, true, "reduce_gpu_b_fs_yx_fsv16" },
+    reduce_test_params{ CASE_REDUCE_U8_0, 2, 5, reduce_mode::prod, { 1, 0 }, true, "reduce_gpu_b_fs_yx_fsv16" },
+    reduce_test_params{ CASE_REDUCE_U8_0, 2, 5, reduce_mode::mean, { 1, 0 }, true, "reduce_gpu_b_fs_yx_fsv16" },
+    reduce_test_params{ CASE_REDUCE_U8_0, 2, 5, reduce_mode::sum_square, { 1, 0 }, true, "reduce_gpu_b_fs_yx_fsv16" },
+    reduce_test_params{ CASE_REDUCE_U8_0, 2, 5, reduce_mode::l1, { 1, 0 }, true, "reduce_gpu_b_fs_yx_fsv16" },
+    reduce_test_params{ CASE_REDUCE_U8_0, 2, 5, reduce_mode::l2, { 1, 0 }, true, "reduce_gpu_b_fs_yx_fsv16" },
+    reduce_test_params{ CASE_REDUCE_U8_0, 2, 5, reduce_mode::log_sum, { 1, 0 }, true, "reduce_gpu_b_fs_yx_fsv16" }
"reduce_gpu_b_fs_yx_fsv16" } +})); + class reduce_scale_activation : public ReduceFusingTest {}; TEST_P(reduce_scale_activation, basic) { auto p = GetParam();