From f053cba2477c2a3eab5be29446eeed7023a1e13f Mon Sep 17 00:00:00 2001 From: Debargha Mukherjee Date: Wed, 21 Jun 2017 19:04:00 -0700 Subject: [PATCH] Reduce multiplier precision for warp least squares Includes reordering and other clamping changes, as well as changes to reduce multiplier precision. cam_lowres (60 frames): -0.092% BDRATE improvement in --disable-cdef --disable-global-motion --disable-ext-tx configuration. Change-Id: I0660c45b44fcd5a193534d8dadd1aa1ae5c5e27a --- aom_dsp/aom_dsp_common.h | 4 ++ av1/common/mv.h | 5 +- av1/common/warped_motion.c | 132 ++++++++++++++++++++++++++++--------- 3 files changed, 106 insertions(+), 35 deletions(-) diff --git a/aom_dsp/aom_dsp_common.h b/aom_dsp/aom_dsp_common.h index 7f12ee8a6..3807ae054 100644 --- a/aom_dsp/aom_dsp_common.h +++ b/aom_dsp/aom_dsp_common.h @@ -80,6 +80,10 @@ static INLINE int clamp(int value, int low, int high) { return value < low ? low : (value > high ? high : value); } +static INLINE int64_t clamp64(int64_t value, int64_t low, int64_t high) { + return value < low ? low : (value > high ? high : value); +} + static INLINE double fclamp(double value, double low, double high) { return value < low ? low : (value > high ? 
high : value); } diff --git a/av1/common/mv.h b/av1/common/mv.h index 7fff18284..959831900 100644 --- a/av1/common/mv.h +++ b/av1/common/mv.h @@ -44,9 +44,8 @@ typedef struct mv32 { #define WARPEDMODEL_ROW3HOMO_PREC_BITS 16 #define WARPEDMODEL_TRANS_CLAMP (128 << WARPEDMODEL_PREC_BITS) -#define WARPEDMODEL_DIAGAFFINE_CLAMP (1 << (WARPEDMODEL_PREC_BITS + 1)) -#define WARPEDMODEL_NONDIAGAFFINE_CLAMP (1 << (WARPEDMODEL_PREC_BITS - 1)) -#define WARPEDMODEL_ROW3HOMO_CLAMP (1 << (WARPEDMODEL_PREC_BITS - 1)) +#define WARPEDMODEL_NONDIAGAFFINE_CLAMP (1 << (WARPEDMODEL_PREC_BITS - 3)) +#define WARPEDMODEL_ROW3HOMO_CLAMP (1 << (WARPEDMODEL_PREC_BITS - 2)) // Bits of subpel precision for warped interpolation #define WARPEDPIXEL_PREC_BITS 6 diff --git a/av1/common/warped_motion.c b/av1/common/warped_motion.c index 83932888e..637648aa8 100644 --- a/av1/common/warped_motion.c +++ b/av1/common/warped_motion.c @@ -1747,6 +1747,83 @@ void av1_warp_plane(WarpedMotionParams *wm, #define LS_PRODUCT2(a, b) \ (((a) * (b)*4 + ((a) + (b)) * 2 * LS_STEP + LS_STEP * LS_STEP * 2) >> 2) +#define USE_LIMITED_PREC_MULT 0 + +#if USE_LIMITED_PREC_MULT + +#define MUL_PREC_BITS 16 +static uint16_t resolve_multiplier_64(uint64_t D, int16_t *shift) { + int msb = 0; + uint16_t mult = 0; + *shift = 0; + if (D != 0) { + msb = (int16_t)((D >> 32) ? get_msb((unsigned int)(D >> 32)) + 32 + : get_msb((unsigned int)D)); + if (msb >= MUL_PREC_BITS) { + mult = (uint16_t)ROUND_POWER_OF_TWO_64(D, msb + 1 - MUL_PREC_BITS); + *shift = msb + 1 - MUL_PREC_BITS; + } else { + mult = (uint16_t)D; + *shift = 0; + } + } + return mult; +} + +static int32_t get_mult_shift_ndiag(int64_t Px, int16_t iDet, int shift) { + int32_t ret; + int16_t mshift; + uint16_t Mul = resolve_multiplier_64(llabs(Px), &mshift); + int32_t v = (int32_t)Mul * (int32_t)iDet * (Px < 0 ? 
-1 : 1); + shift -= mshift; + if (shift > 0) { + return (int32_t)clamp(ROUND_POWER_OF_TWO_SIGNED(v, shift), + -WARPEDMODEL_NONDIAGAFFINE_CLAMP + 1, + WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1); + } else { + return (int32_t)clamp(v * (1 << (-shift)), + -WARPEDMODEL_NONDIAGAFFINE_CLAMP + 1, + WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1); + } + return ret; +} + +static int32_t get_mult_shift_diag(int64_t Px, int16_t iDet, int shift) { + int16_t mshift; + uint16_t Mul = resolve_multiplier_64(llabs(Px), &mshift); + int32_t v = (int32_t)Mul * (int32_t)iDet * (Px < 0 ? -1 : 1); + shift -= mshift; + if (shift > 0) { + return (int32_t)clamp( + ROUND_POWER_OF_TWO_SIGNED(v, shift), + (1 << WARPEDMODEL_PREC_BITS) - WARPEDMODEL_NONDIAGAFFINE_CLAMP + 1, + (1 << WARPEDMODEL_PREC_BITS) + WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1); + } else { + return (int32_t)clamp( + v * (1 << (-shift)), + (1 << WARPEDMODEL_PREC_BITS) - WARPEDMODEL_NONDIAGAFFINE_CLAMP + 1, + (1 << WARPEDMODEL_PREC_BITS) + WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1); + } +} + +#else + +static int32_t get_mult_shift_ndiag(int64_t Px, int16_t iDet, int shift) { + int64_t v = Px * (int64_t)iDet; + return (int32_t)clamp64(ROUND_POWER_OF_TWO_SIGNED_64(v, shift), + -WARPEDMODEL_NONDIAGAFFINE_CLAMP + 1, + WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1); +} + +static int32_t get_mult_shift_diag(int64_t Px, int16_t iDet, int shift) { + int64_t v = Px * (int64_t)iDet; + return (int32_t)clamp64( + ROUND_POWER_OF_TWO_SIGNED_64(v, shift), + (1 << WARPEDMODEL_PREC_BITS) - WARPEDMODEL_NONDIAGAFFINE_CLAMP + 1, + (1 << WARPEDMODEL_PREC_BITS) + WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1); +} +#endif // USE_LIMITED_PREC_MULT + static int find_affine_int(int np, int *pts1, int *pts2, BLOCK_SIZE bsize, int mvy, int mvx, WarpedMotionParams *wm, int mi_row, int mi_col) { @@ -1757,8 +1834,10 @@ static int find_affine_int(int np, int *pts1, int *pts2, BLOCK_SIZE bsize, const int bw = block_size_wide[bsize]; const int bh = block_size_high[bsize]; - const int suy = (mi_row * MI_SIZE + 
AOMMAX(bh, MI_SIZE) / 2 - 1) * 8; - const int sux = (mi_col * MI_SIZE + AOMMAX(bw, MI_SIZE) / 2 - 1) * 8; + const int isuy = (mi_row * MI_SIZE + AOMMAX(bh, MI_SIZE) / 2 - 1); + const int isux = (mi_col * MI_SIZE + AOMMAX(bw, MI_SIZE) / 2 - 1); + const int suy = isuy * 8; + const int sux = isux * 8; const int duy = suy + mvy; const int dux = sux + mvx; @@ -1845,38 +1924,27 @@ static int find_affine_int(int np, int *pts1, int *pts2, BLOCK_SIZE bsize, shift = 0; } - int64_t v; - v = Px[0] * (int64_t)iDet; - wm->wmmat[2] = (int32_t)(ROUND_POWER_OF_TWO_SIGNED_64(v, shift)); - v = Px[1] * (int64_t)iDet; - wm->wmmat[3] = (int32_t)(ROUND_POWER_OF_TWO_SIGNED_64(v, shift)); - v = ((int64_t)dux * (1 << WARPEDMODEL_PREC_BITS)) - - (int64_t)sux * wm->wmmat[2] - (int64_t)suy * wm->wmmat[3]; - wm->wmmat[0] = (int32_t)(ROUND_POWER_OF_TWO_SIGNED(v, 3)); - - v = Py[0] * (int64_t)iDet; - wm->wmmat[4] = (int32_t)(ROUND_POWER_OF_TWO_SIGNED_64(v, shift)); - v = Py[1] * (int64_t)iDet; - wm->wmmat[5] = (int32_t)(ROUND_POWER_OF_TWO_SIGNED_64(v, shift)); - v = ((int64_t)duy * (1 << WARPEDMODEL_PREC_BITS)) - - (int64_t)sux * wm->wmmat[4] - (int64_t)suy * wm->wmmat[5]; - wm->wmmat[1] = (int32_t)(ROUND_POWER_OF_TWO_SIGNED(v, 3)); + wm->wmmat[2] = get_mult_shift_diag(Px[0], iDet, shift); + wm->wmmat[3] = get_mult_shift_ndiag(Px[1], iDet, shift); + wm->wmmat[4] = get_mult_shift_ndiag(Py[0], iDet, shift); + wm->wmmat[5] = get_mult_shift_diag(Py[1], iDet, shift); + + // Note: In the vx, vy expressions below, the max value of each of the + // 2nd and 3rd terms are (2^16 - 1) * (2^13 - 1). That leaves enough room + // for the first term so that the overall sum in the worst case fits + // within 32 bits overall. 
+ int32_t vx = mvx * (1 << (WARPEDMODEL_PREC_BITS - 3)) - + (isux * (wm->wmmat[2] - (1 << WARPEDMODEL_PREC_BITS)) + + isuy * wm->wmmat[3]); + int32_t vy = mvy * (1 << (WARPEDMODEL_PREC_BITS - 3)) - + (isux * wm->wmmat[4] + + isuy * (wm->wmmat[5] - (1 << WARPEDMODEL_PREC_BITS))); + wm->wmmat[0] = + clamp(vx, -WARPEDMODEL_TRANS_CLAMP, WARPEDMODEL_TRANS_CLAMP - 1); + wm->wmmat[1] = + clamp(vy, -WARPEDMODEL_TRANS_CLAMP, WARPEDMODEL_TRANS_CLAMP - 1); wm->wmmat[6] = wm->wmmat[7] = 0; - - // Clamp values - wm->wmmat[0] = clamp(wm->wmmat[0], -WARPEDMODEL_TRANS_CLAMP, - WARPEDMODEL_TRANS_CLAMP - 1); - wm->wmmat[1] = clamp(wm->wmmat[1], -WARPEDMODEL_TRANS_CLAMP, - WARPEDMODEL_TRANS_CLAMP - 1); - wm->wmmat[2] = clamp(wm->wmmat[2], -WARPEDMODEL_DIAGAFFINE_CLAMP, - WARPEDMODEL_DIAGAFFINE_CLAMP - 1); - wm->wmmat[5] = clamp(wm->wmmat[5], -WARPEDMODEL_DIAGAFFINE_CLAMP, - WARPEDMODEL_DIAGAFFINE_CLAMP - 1); - wm->wmmat[3] = clamp(wm->wmmat[3], -WARPEDMODEL_NONDIAGAFFINE_CLAMP, - WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1); - wm->wmmat[4] = clamp(wm->wmmat[4], -WARPEDMODEL_NONDIAGAFFINE_CLAMP, - WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1); return 0; }