From 94d5d819613b89511f2034f414538429ee35f1b6 Mon Sep 17 00:00:00 2001
From: Ilya Znamenskiy
Date: Thu, 7 Oct 2021 10:30:07 +0300
Subject: [PATCH] [GPU] Quantize scale shift opt optimizations (#7770)

---
 .../quantize_kernel_scale_shift_opt.cpp      |  18 ++-
 .../quantize_gpu_scale_shift_opt.cl          | 127 ++++++++++++++----
 2 files changed, 116 insertions(+), 29 deletions(-)

diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/quantize/quantize_kernel_scale_shift_opt.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/quantize/quantize_kernel_scale_shift_opt.cpp
index 733f6612777fbe..8805980745c465 100644
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/quantize/quantize_kernel_scale_shift_opt.cpp
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/quantize/quantize_kernel_scale_shift_opt.cpp
@@ -44,7 +44,8 @@ CommonDispatchData QuantizeKernelScaleShift::SetDefault(const quantize_params& p
         dispatchData.lws[0] = 1;
         dispatchData.lws[1] = sub_group_size;
         dispatchData.lws[2] = 1;
-    } else if (output.GetLayout() == DataLayout::bs_fs_yx_bsv32_fsv32) {
+    } else if (output.GetLayout() == DataLayout::bs_fs_yx_bsv32_fsv32 || output.GetLayout() == DataLayout::bs_fs_yx_bsv16_fsv16 ||
+               output.GetLayout() == DataLayout::bs_fs_yx_bsv32_fsv16) {
         dispatchData.gws[0] = output.Y().v * output.X().v;
         dispatchData.gws[1] = Align(output.Feature().v, feature_size);
         dispatchData.gws[2] = Align(output.Batch().v, feature_size);
@@ -63,8 +64,9 @@ CommonDispatchData QuantizeKernelScaleShift::SetDefault(const quantize_params& p
 JitConstants QuantizeKernelScaleShift::GetJitConstants(const quantize_params& params, const CommonDispatchData& dispatchData) const {
     JitConstants jit = Parent::GetJitConstants(params, dispatchData);
 
-    if (params.output.GetLayout() == DataLayout::b_fs_yx_fsv16 ||
-        params.output.GetLayout() == DataLayout::bs_fs_yx_bsv32_fsv32) {
+    if (params.output.GetLayout() == DataLayout::b_fs_yx_fsv16 || params.output.GetLayout() == DataLayout::bs_fs_yx_bsv32_fsv32 ||
+        params.output.GetLayout() == DataLayout::bs_fs_yx_bsv16_fsv16 || params.output.GetLayout() == DataLayout::bs_fs_yx_bsv32_fsv16) {
+        jit.AddConstant(MakeJitConstant("FEATURE_BLOCKED_FORMAT", true));
         jit.AddConstant(MakeJitConstant("GWS_BATCH", 2));
         jit.AddConstant(MakeJitConstant("GWS_FEATURE", 1));
         jit.AddConstant(MakeJitConstant("GWS_YX", 0));
@@ -74,21 +76,31 @@ JitConstants QuantizeKernelScaleShift::GetJitConstants(const quantize_params& pa
         jit.Merge(tensor_jits);
     }
 
+    auto can_use_output_range = params.per_tensor_output_range && params.out_lo < params.out_hi;
+    auto has_output_range_round = !(params.output.GetDType() == Datatype::INT8 || params.output.GetDType() == Datatype::UINT8);
+
     jit.AddConstant(MakeJitConstant("HAS_POST_SCALE", params.has_post_scale));
     jit.AddConstant(MakeJitConstant("HAS_POST_SHIFT", params.has_post_shift));
     jit.AddConstant(MakeJitConstant("HAS_PRE_SHIFT", params.has_pre_shift));
     jit.AddConstant(MakeJitConstant("HAS_CLAMP", params.has_clamp));
+    jit.AddConstant(MakeJitConstant("HAS_MIN_CLAMP", params.has_min_clamp));
+    jit.AddConstant(MakeJitConstant("HAS_MAX_CLAMP", params.has_max_clamp));
     jit.AddConstant(MakeJitConstant("PER_TENSOR_INPUT_RANGE", params.per_tensor_input_range));
+    jit.AddConstant(MakeJitConstant("PER_TENSOR_OUTPUT_RANGE", params.per_tensor_output_range));
     jit.AddConstant(MakeJitConstant("PER_TENSOR_INPUT_SCALE", params.per_tensor_input_scale));
     jit.AddConstant(MakeJitConstant("PER_TENSOR_INPUT_SHIFT", params.per_tensor_input_shift));
     jit.AddConstant(MakeJitConstant("PER_TENSOR_OUTPUT_SCALE", params.per_tensor_output_scale));
     jit.AddConstant(MakeJitConstant("PER_TENSOR_OUTPUT_SHIFT", params.per_tensor_output_shift));
     jit.AddConstant(MakeJitConstant("IN_LO_VAL", params.in_lo));
     jit.AddConstant(MakeJitConstant("IN_HI_VAL", params.in_hi));
+    jit.AddConstant(MakeJitConstant("OUT_LO_VAL", params.out_lo));
+    jit.AddConstant(MakeJitConstant("OUT_HI_VAL", params.out_hi));
     jit.AddConstant(MakeJitConstant("IN_SCALE_VAL", params.in_scale));
     jit.AddConstant(MakeJitConstant("IN_SHIFT_VAL", params.in_shift));
     jit.AddConstant(MakeJitConstant("OUT_SCALE_VAL", params.out_scale));
     jit.AddConstant(MakeJitConstant("OUT_SHIFT_VAL", params.out_shift));
+    jit.AddConstant(MakeJitConstant("CAN_USE_OUTPUT_RANGE", can_use_output_range));
+    jit.AddConstant(MakeJitConstant("HAS_OUTPUT_RANGE_ROUND", has_output_range_round));
 
     return jit;
 }
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/quantize_gpu_scale_shift_opt.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/quantize_gpu_scale_shift_opt.cl
index 5b5517db5717d1..061d19074ffc25 100644
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/quantize_gpu_scale_shift_opt.cl
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/quantize_gpu_scale_shift_opt.cl
@@ -5,6 +5,9 @@
 #include "include/batch_headers/data_types.cl"
 #include "include/batch_headers/fetch_data.cl"
 
+#define TO_OUTPUT_TYPE          CAT(convert_, OUTPUT_TYPE)
+#define TO_OUTPUT_TYPE_SAT_RTE  CAT(TO_OUTPUT_TYPE, _sat_rte)
+
 #ifdef SUB_GROUP_SIZE
 __attribute__((intel_reqd_sub_group_size(SUB_GROUP_SIZE)))
 #endif
@@ -22,49 +25,60 @@ KERNEL(quantize_gpu_scale_shift_opt)(const __global INPUT0_TYPE* input,
 {
     const int b = get_global_id(GWS_BATCH);
     const int of = get_global_id(GWS_FEATURE);
+
 #if OUTPUT_DIMS <= 4
     const int yx = get_global_id(GWS_YX);
+
     const int x = yx % OUTPUT_SIZE_X;
     const int y = yx / OUTPUT_SIZE_X;
     const int z = 0;
+
+    const int output_offset = OUTPUT_GET_INDEX(b, of, y, x);
 #elif OUTPUT_DIMS == 5
     const int zyx = get_global_id(GWS_YX);
+    const int zyx_div_x = zyx / OUTPUT_SIZE_X;
+
     const int x = zyx % OUTPUT_SIZE_X;
-    const int y = (zyx / OUTPUT_SIZE_X) % OUTPUT_SIZE_Y;
-    const int z = (zyx / OUTPUT_SIZE_X) / OUTPUT_SIZE_Y;
+    const int y = zyx_div_x % OUTPUT_SIZE_Y;
+    const int z = zyx_div_x / OUTPUT_SIZE_Y;
+
+    const int output_offset = OUTPUT_GET_INDEX(b, of, z, y, x);
 #elif OUTPUT_DIMS == 6
     const int wzyx = get_global_id(GWS_YX);
+    const int wzyx_div_x = wzyx / OUTPUT_SIZE_X;
+    const int wzyx_div_xy = wzyx_div_x / OUTPUT_SIZE_Y;
+
     const int x = wzyx % OUTPUT_SIZE_X;
-    const int y = (wzyx / OUTPUT_SIZE_X) % OUTPUT_SIZE_Y;
-    const int z = ((wzyx / OUTPUT_SIZE_X) / OUTPUT_SIZE_Y) % OUTPUT_SIZE_Z;
-    const int w = ((wzyx / OUTPUT_SIZE_X) / OUTPUT_SIZE_Y) / OUTPUT_SIZE_Z;
+    const int y = wzyx_div_x % OUTPUT_SIZE_Y;
+    const int z = wzyx_div_xy % OUTPUT_SIZE_Z;
+    const int w = wzyx_div_xy / OUTPUT_SIZE_Z;
+
+    const int output_offset = OUTPUT_GET_INDEX(b, of, w, z, y, x);
+#else
+#   error quantize_gpu_scale_shift_opt.cl: output tensors with more than 6 dimensions are unsupported
 #endif
 
-#if INPUT0_DIMS == 6
-    const int input_offset = INPUT0_GET_INDEX(b, of, w, z, y, x);
+#if INPUT0_DIMS <= 4
+    const int input_offset = INPUT0_GET_INDEX(b, of, y, x);
 #elif INPUT0_DIMS == 5
     const int input_offset = INPUT0_GET_INDEX(b, of, z, y, x);
-#elif INPUT0_DIMS <= 4
-    const int input_offset = INPUT0_GET_INDEX(b, of, y, x);
-#endif
-
-#if OUTPUT_DIMS == 6
-    const int output_offset = OUTPUT_GET_INDEX(b, of, w, z, y, x);
-#elif OUTPUT_DIMS == 5
-    const int output_offset = OUTPUT_GET_INDEX(b, of, z, y, x);
-#elif OUTPUT_DIMS <= 4
-    const int output_offset = OUTPUT_GET_INDEX(b, of, y, x);
+#elif INPUT0_DIMS == 6
+    const int input_offset = INPUT0_GET_INDEX(b, of, w, z, y, x);
+#else
+#   error quantize_gpu_scale_shift_opt.cl: input tensors with more than 6 dimensions are unsupported
 #endif
 
-#if HAS_CLAMP && !PER_TENSOR_INPUT_RANGE
+#if HAS_CLAMP && !PER_TENSOR_INPUT_RANGE && !CAN_USE_OUTPUT_RANGE
 #if INPUT1_DIMS == 4
     const int in_range_offset = INPUT1_GET_INDEX_SAFE(b, of, y, x);
 #elif INPUT1_DIMS == 5
     const int in_range_offset = INPUT1_GET_INDEX_SAFE(b, of, z, y, x);
 #elif INPUT1_DIMS == 6
     const int in_range_offset = INPUT1_GET_INDEX_SAFE(b, of, w, z, y, x);
+#else
+#   error quantize_gpu_scale_shift_opt.cl: unsupported INPUT1_DIMS size
 #endif
-#endif
+#endif // HAS_CLAMP && !PER_TENSOR_INPUT_RANGE && !CAN_USE_OUTPUT_RANGE
 
 #if INPUT7_DIMS == 4
     const int scales_offset = INPUT7_GET_INDEX_SAFE(b, of, y, x);
@@ -72,6 +86,8 @@ KERNEL(quantize_gpu_scale_shift_opt)(const __global INPUT0_TYPE* input,
     const int scales_offset = INPUT7_GET_INDEX_SAFE(b, of, z, y, x);
 #elif INPUT7_DIMS == 6
     const int scales_offset = INPUT7_GET_INDEX_SAFE(b, of, w, z, y, x);
+#else
+#   error quantize_gpu_scale_shift_opt.cl: unsupported INPUT7_DIMS size
 #endif
 
 #if PER_TENSOR_INPUT_SCALE
@@ -79,6 +95,7 @@ KERNEL(quantize_gpu_scale_shift_opt)(const __global INPUT0_TYPE* input,
 #else
     INPUT1_TYPE input_scale_val = input_scale[scales_offset];
 #endif
+
 #if PER_TENSOR_INPUT_SHIFT
     INPUT1_TYPE input_shift_val = IN_SHIFT_VAL;
 #else
@@ -97,19 +114,67 @@ KERNEL(quantize_gpu_scale_shift_opt)(const __global INPUT0_TYPE* input,
     INPUT1_TYPE output_shift_val = output_shift[scales_offset];
 #endif
 
-#if PER_TENSOR_INPUT_RANGE && HAS_CLAMP
+#if HAS_CLAMP
+#if CAN_USE_OUTPUT_RANGE
+    INPUT1_TYPE output_low_val = OUT_LO_VAL;
+    INPUT1_TYPE output_high_val = OUT_HI_VAL;
+#else
+#if PER_TENSOR_INPUT_RANGE
     INPUT1_TYPE input_low_val = IN_LO_VAL;
     INPUT1_TYPE input_high_val = IN_HI_VAL;
-#elif HAS_CLAMP
+#else
     INPUT1_TYPE input_low_val = input_low[in_range_offset];
     INPUT1_TYPE input_high_val = input_high[in_range_offset];
+#endif // PER_TENSOR_INPUT_RANGE
+#endif // CAN_USE_OUTPUT_RANGE
+#endif // HAS_CLAMP
+
+// ************************************************************* //
+// Calculations for optimized branch with the output range usage //
+// ************************************************************* //
+
+#if CAN_USE_OUTPUT_RANGE
+
+#if HAS_PRE_SHIFT
+    INPUT1_TYPE val = TO_INPUT1_TYPE(input[input_offset]) * input_scale_val + input_shift_val;
+#else
+    INPUT1_TYPE val = TO_INPUT1_TYPE(input[input_offset]) * input_scale_val;
+#endif
+
+#if HAS_OUTPUT_RANGE_ROUND
+    val = round(val);
+#endif
+
+#if HAS_POST_SCALE
+    val *= output_scale_val;
+#endif
+
+#if HAS_POST_SHIFT
+    val += output_shift_val;
 #endif
 
 #if HAS_CLAMP
-    INPUT1_TYPE val = min(max(TO_INPUT1_TYPE(input[input_offset]), input_low_val), input_high_val);
+#if HAS_MIN_CLAMP && HAS_MAX_CLAMP
+    val = clamp(val, output_low_val, output_high_val);
+#elif HAS_MIN_CLAMP
+    val = max(val, output_low_val);
+#else // HAS_MAX_CLAMP
+    val = min(val, output_high_val);
+#endif
+#endif // HAS_CLAMP
+
+// ************************************************************** //
+// Calculations for alternative branch with the input range usage //
+// ************************************************************** //
+
+#else // CAN_USE_OUTPUT_RANGE
+
+#if HAS_CLAMP
+    INPUT1_TYPE val = clamp(TO_INPUT1_TYPE(input[input_offset]), input_low_val, input_high_val);
 #else
     INPUT1_TYPE val = TO_INPUT1_TYPE(input[input_offset]);
 #endif
+
 #if HAS_PRE_SHIFT
     val = round(val * input_scale_val + input_shift_val);
 #else
@@ -117,18 +182,28 @@ KERNEL(quantize_gpu_scale_shift_opt)(const __global INPUT0_TYPE* input,
 #endif
 
 #if HAS_POST_SCALE
-    val = val*output_scale_val;
+    val *= output_scale_val;
 #endif
+
 #if HAS_POST_SHIFT
     val += output_shift_val;
 #endif
 
-#if OUTPUT_LAYOUT_B_FS_YX_FSV16
+#endif // CAN_USE_OUTPUT_RANGE
+
+// *********************************** //
+// Common section with results writing //
+// *********************************** //
+
+#if FEATURE_BLOCKED_FORMAT
     if (of < OUTPUT_FEATURE_NUM)
 #endif
 #if OUTPUT_IS_FP
-    output[output_offset] = TO_OUTPUT_TYPE_SAT(val);
+        output[output_offset] = TO_OUTPUT_TYPE_SAT(val);
 #else
-    output[output_offset] = TO_OUTPUT_TYPE_SAT(round(val));
+        output[output_offset] = TO_OUTPUT_TYPE_SAT_RTE(val);
 #endif
 }
+
+#undef TO_OUTPUT_TYPE
+#undef TO_OUTPUT_TYPE_SAT_RTE
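
Note for readers following the kernel changes: the per-element math of the two branches above can be summarized by a small host-side reference in plain C++. This is a minimal sketch, not part of the patch; the function quantize_scale_shift_ref and the ScaleShiftParams struct are hypothetical stand-ins for the kernel's JIT constants of the same names, and all tensor indexing is omitted.

// Hypothetical host-side reference for the per-element math of
// quantize_gpu_scale_shift_opt.cl; names here are illustrative only.
#include <algorithm>
#include <cmath>

struct ScaleShiftParams {            // mirrors the JIT constants above
    bool  has_pre_shift, has_post_scale, has_post_shift;
    bool  has_clamp, has_min_clamp, has_max_clamp;
    bool  can_use_output_range;      // per-tensor output range with out_lo < out_hi
    bool  has_output_range_round;    // false for (U)INT8: _sat_rte rounds anyway
    float in_lo, in_hi, out_lo, out_hi;
    float in_scale, in_shift, out_scale, out_shift;
};

float quantize_scale_shift_ref(float x, const ScaleShiftParams& p) {
    float val;
    if (p.can_use_output_range) {
        // Optimized branch: scale first, then clamp against the output range.
        val = x * p.in_scale + (p.has_pre_shift ? p.in_shift : 0.0f);
        if (p.has_output_range_round) val = std::round(val);
        if (p.has_post_scale) val *= p.out_scale;
        if (p.has_post_shift) val += p.out_shift;
        if (p.has_clamp) {
            if (p.has_min_clamp) val = std::max(val, p.out_lo);
            if (p.has_max_clamp) val = std::min(val, p.out_hi);
        }
    } else {
        // Alternative branch: clamp the raw input against the input range first.
        val = p.has_clamp ? std::min(std::max(x, p.in_lo), p.in_hi) : x;
        val = p.has_pre_shift ? std::round(val * p.in_scale + p.in_shift)
                              : std::round(val * p.in_scale);
        if (p.has_post_scale) val *= p.out_scale;
        if (p.has_post_shift) val += p.out_shift;
    }
    return val;  // the kernel then saturate-converts to OUTPUT_TYPE
}

The split mirrors the design choice in the patch: when a valid per-tensor output range is known (CAN_USE_OUTPUT_RANGE), clamping can be deferred to the already-scaled value, so the per-element input_low/input_high loads disappear; and rounding is skipped for (U)INT8 outputs (has_output_range_round is false) because the final convert_<type>_sat_rte conversion already rounds to nearest-even.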