Commit edacd91

Merge remote-tracking branch 'upstream/3.4' into merge-3.4

alalek committed Oct 15, 2018
2 parents 5dd46b5 + 24ced3d

Showing 48 changed files with 2,752 additions and 2,274 deletions.
4 changes: 2 additions & 2 deletions modules/calib3d/src/fisheye.cpp
@@ -534,7 +534,7 @@ void cv::fisheye::undistortImage(InputArray distorted, OutputArray undistorted,
 {
     CV_INSTRUMENT_REGION();
 
-    Size size = new_size.area() != 0 ? new_size : distorted.size();
+    Size size = !new_size.empty() ? new_size : distorted.size();
 
     cv::Mat map1, map2;
     fisheye::initUndistortRectifyMap(K, D, cv::Matx33d::eye(), Knew, size, CV_16SC2, map1, map2 );
@@ -601,7 +601,7 @@ void cv::fisheye::estimateNewCameraMatrixForUndistortRectify(InputArray K, Input
     new_f[1] /= aspect_ratio;
     new_c[1] /= aspect_ratio;
 
-    if (new_size.area() > 0)
+    if (!new_size.empty())
    {
        double rx = new_size.width /(double)image_size.width;
        double ry = new_size.height/(double)image_size.height;
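
Note: both fisheye hunks, and the stereobm change below, replace an area-based test with cv::Size::empty() / cv::Rect::empty(). Besides reading better, empty() (defined as width <= 0 || height <= 0) also rejects degenerate negative sizes, where the product of two negative dimensions lets an area test pass. A small illustration of the difference, mine rather than the commit's (area() here is the plain width*height of this OpenCV era):

```cpp
#include <opencv2/core.hpp>
#include <cstdio>

int main()
{
    cv::Size s(-2, -2);
    // area() multiplies the dimensions, so (-2) * (-2) = 4 looks "non-empty";
    // empty() checks width <= 0 || height <= 0 and correctly rejects it.
    std::printf("area=%d empty=%d\n", s.area(), (int)s.empty());  // area=4 empty=1
    return 0;
}
```
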
4 changes: 2 additions & 2 deletions modules/calib3d/src/stereobm.cpp
@@ -1226,8 +1226,8 @@ class StereoBMImpl CV_FINAL : public StereoBM
         parallel_for_(Range(0, 2), PrefilterInvoker(left0, right0, left, right, _buf, _buf + bufSize1, &params), 1);
 
         Rect validDisparityRect(0, 0, width, height), R1 = params.roi1, R2 = params.roi2;
-        validDisparityRect = getValidDisparityROI(R1.area() > 0 ? R1 : validDisparityRect,
-                                                  R2.area() > 0 ? R2 : validDisparityRect,
+        validDisparityRect = getValidDisparityROI(!R1.empty() ? R1 : validDisparityRect,
+                                                  !R2.empty() ? R2 : validDisparityRect,
                                                   params.minDisparity, params.numDisparities,
                                                   params.SADWindowSize);
 
8 changes: 8 additions & 0 deletions modules/core/include/opencv2/core/hal/intrin.hpp
@@ -139,8 +139,14 @@ using namespace CV_CPU_OPTIMIZATION_HAL_NAMESPACE;
 # undef CV_FP16
 #endif
 
+#if CV_SSE2 || CV_NEON || CV_VSX
+#define CV__SIMD_FORWARD 128
+#include "opencv2/core/hal/intrin_forward.hpp"
+#endif
+
 #if CV_SSE2
 
+#include "opencv2/core/hal/intrin_sse_em.hpp"
 #include "opencv2/core/hal/intrin_sse.hpp"
 
 #elif CV_NEON

@@ -168,6 +174,8 @@ using namespace CV_CPU_OPTIMIZATION_HAL_NAMESPACE;
 // (and will be mapped to v256_ counterparts) (e.g. vx_load() => v256_load())
 #if CV_AVX2
 
+#define CV__SIMD_FORWARD 256
+#include "opencv2/core/hal/intrin_forward.hpp"
 #include "opencv2/core/hal/intrin_avx.hpp"
 
 #endif
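
Context for the new includes (my reading; intrin_forward.hpp itself is not shown in this excerpt): the new saturating operator * definitions below call v_mul_expand() and v_pack_u(), which are defined later in the same backend header, so each backend now pulls in a forward-declaration header first, with CV__SIMD_FORWARD selecting the register width to declare. A hypothetical sketch of that pattern, where everything except CV__SIMD_FORWARD and the lane-type names is an assumption:

```cpp
// intrin_forward.hpp-style sketch (illustrative, not the real file contents):
// declare, for the selected width, whatever the later definitions depend on.
#if CV__SIMD_FORWARD == 256
struct v_uint16x16;   // 256-bit AVX2 lane type
v_uint16x16 v_mul_wrap(const v_uint16x16& a, const v_uint16x16& b);
#elif CV__SIMD_FORWARD == 128
struct v_uint16x8;    // 128-bit SSE/NEON/VSX lane type
v_uint16x8 v_mul_wrap(const v_uint16x8& a, const v_uint16x8& b);
#endif
#undef CV__SIMD_FORWARD  // consumed; redefined before the next include
```
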
140 changes: 118 additions & 22 deletions modules/core/include/opencv2/core/hal/intrin_avx.hpp
@@ -82,6 +82,14 @@ inline __m128 _v256_extract_low(const __m256& v)
 inline __m128d _v256_extract_low(const __m256d& v)
 { return _mm256_castpd256_pd128(v); }
 
+inline __m256i _v256_packs_epu32(const __m256i& a, const __m256i& b)
+{
+    const __m256i m = _mm256_set1_epi32(65535);
+    __m256i am = _mm256_min_epu32(a, m);
+    __m256i bm = _mm256_min_epu32(b, m);
+    return _mm256_packus_epi32(am, bm);
+}
+
 ///////// Types ////////////
 
 struct v_uint8x32
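
Why the clamp: _mm256_packus_epi32 interprets its 32-bit inputs as signed, so an unsigned lane with the top bit set (for example 0x80000000) would be read as negative and packed to 0 instead of 65535. Clamping to 65535 with _mm256_min_epu32 first turns the signed pack into a correct unsigned-to-unsigned saturation. A per-lane scalar model (my illustration, not part of the diff):

```cpp
#include <cstdint>

// Scalar equivalent of one lane of _v256_packs_epu32 (illustrative):
static inline uint16_t pack_u32_to_u16_sat(uint32_t x)
{
    // after min(x, 65535) the value always fits in 16 bits, so the
    // signed pack instruction can no longer misread it as negative
    return (uint16_t)(x > 65535u ? 65535u : x);
}
```
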
@@ -626,10 +634,8 @@ OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_int8x32, _mm256_adds_epi8)
 OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_int8x32, _mm256_subs_epi8)
 OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_uint16x16, _mm256_adds_epu16)
 OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_uint16x16, _mm256_subs_epu16)
-OPENCV_HAL_IMPL_AVX_BIN_OP(*, v_uint16x16, _mm256_mullo_epi16)
 OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_int16x16, _mm256_adds_epi16)
 OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_int16x16, _mm256_subs_epi16)
-OPENCV_HAL_IMPL_AVX_BIN_OP(*, v_int16x16, _mm256_mullo_epi16)
 OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_uint32x8, _mm256_add_epi32)
 OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_uint32x8, _mm256_sub_epi32)
 OPENCV_HAL_IMPL_AVX_BIN_OP(*, v_uint32x8, _mm256_mullo_epi32)
@@ -650,13 +656,103 @@ OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_float64x4, _mm256_sub_pd)
 OPENCV_HAL_IMPL_AVX_BIN_OP(*, v_float64x4, _mm256_mul_pd)
 OPENCV_HAL_IMPL_AVX_BIN_OP(/, v_float64x4, _mm256_div_pd)
 
+// saturating multiply 8-bit, 16-bit
+inline v_uint8x32 operator * (const v_uint8x32& a, const v_uint8x32& b)
+{
+    v_uint16x16 c, d;
+    v_mul_expand(a, b, c, d);
+    return v_pack_u(v_reinterpret_as_s16(c), v_reinterpret_as_s16(d));
+}
+inline v_int8x32 operator * (const v_int8x32& a, const v_int8x32& b)
+{
+    v_int16x16 c, d;
+    v_mul_expand(a, b, c, d);
+    return v_pack(c, d);
+}
+inline v_uint16x16 operator * (const v_uint16x16& a, const v_uint16x16& b)
+{
+    __m256i pl = _mm256_mullo_epi16(a.val, b.val);
+    __m256i ph = _mm256_mulhi_epu16(a.val, b.val);
+    __m256i p0 = _mm256_unpacklo_epi16(pl, ph);
+    __m256i p1 = _mm256_unpackhi_epi16(pl, ph);
+    return v_uint16x16(_v256_packs_epu32(p0, p1));
+}
+inline v_int16x16 operator * (const v_int16x16& a, const v_int16x16& b)
+{
+    __m256i pl = _mm256_mullo_epi16(a.val, b.val);
+    __m256i ph = _mm256_mulhi_epi16(a.val, b.val);
+    __m256i p0 = _mm256_unpacklo_epi16(pl, ph);
+    __m256i p1 = _mm256_unpackhi_epi16(pl, ph);
+    return v_int16x16(_mm256_packs_epi32(p0, p1));
+}
+inline v_uint8x32& operator *= (v_uint8x32& a, const v_uint8x32& b)
+{ a = a * b; return a; }
+inline v_int8x32& operator *= (v_int8x32& a, const v_int8x32& b)
+{ a = a * b; return a; }
+inline v_uint16x16& operator *= (v_uint16x16& a, const v_uint16x16& b)
+{ a = a * b; return a; }
+inline v_int16x16& operator *= (v_int16x16& a, const v_int16x16& b)
+{ a = a * b; return a; }
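
With these operators, * on 8- and 16-bit vectors now saturates like + and - above (the previous _mm256_mullo_epi16-based operators, removed in the hunk before this one, wrapped around). A scalar model of the unsigned 16-bit case, my illustration: the vector code reaches the same result by interleaving _mm256_mullo_epi16 and _mm256_mulhi_epu16 into full 32-bit products and packing with unsigned saturation.

```cpp
#include <cstdint>

// Scalar model of the new saturating v_uint16x16 multiply (illustrative):
static inline uint16_t mul_sat_u16(uint16_t a, uint16_t b)
{
    uint32_t p = (uint32_t)a * (uint32_t)b;       // exact 32-bit product
    return (uint16_t)(p > 65535u ? 65535u : p);   // clamp to the lane type
}
```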

+/** Non-saturating arithmetics **/
+#define OPENCV_HAL_IMPL_AVX_BIN_FUNC(func, _Tpvec, intrin) \
+inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \
+{ return _Tpvec(intrin(a.val, b.val)); }
+
+OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_add_wrap, v_uint8x32, _mm256_add_epi8)
+OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_add_wrap, v_int8x32, _mm256_add_epi8)
+OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_add_wrap, v_uint16x16, _mm256_add_epi16)
+OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_add_wrap, v_int16x16, _mm256_add_epi16)
+OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_sub_wrap, v_uint8x32, _mm256_sub_epi8)
+OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_sub_wrap, v_int8x32, _mm256_sub_epi8)
+OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_sub_wrap, v_uint16x16, _mm256_sub_epi16)
+OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_sub_wrap, v_int16x16, _mm256_sub_epi16)
+OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_mul_wrap, v_uint16x16, _mm256_mullo_epi16)
+OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_mul_wrap, v_int16x16, _mm256_mullo_epi16)
+
+inline v_uint8x32 v_mul_wrap(const v_uint8x32& a, const v_uint8x32& b)
+{
+    __m256i ad = _mm256_srai_epi16(a.val, 8);
+    __m256i bd = _mm256_srai_epi16(b.val, 8);
+    __m256i p0 = _mm256_mullo_epi16(a.val, b.val); // even
+    __m256i p1 = _mm256_slli_epi16(_mm256_mullo_epi16(ad, bd), 8); // odd
+
+    const __m256i b01 = _mm256_set1_epi32(0xFF00FF00);
+    return v_uint8x32(_mm256_blendv_epi8(p0, p1, b01));
+}
+inline v_int8x32 v_mul_wrap(const v_int8x32& a, const v_int8x32& b)
+{
+    return v_reinterpret_as_s8(v_mul_wrap(v_reinterpret_as_u8(a), v_reinterpret_as_u8(b)));
+}
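
The 8-bit v_mul_wrap has no single AVX2 instruction behind it: even bytes are multiplied in place inside 16-bit lanes, odd bytes are first brought down with an arithmetic shift, multiplied, and shifted back up, and the two results are blended with the 0xFF00FF00 byte mask. The sign extension introduced by _mm256_srai_epi16 is harmless because the low 8 bits of a product depend only on the low 8 bits of its factors; a one-line scalar statement of that invariant (my illustration):

```cpp
#include <cstdint>

// Wrap-around byte multiply keeps only the product mod 256, so upper-bit
// garbage in either factor cannot influence the result:
static inline uint8_t mul_wrap_u8(uint8_t a, uint8_t b)
{
    return (uint8_t)((unsigned)a * b);
}
```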

 // Multiply and expand
+inline void v_mul_expand(const v_uint8x32& a, const v_uint8x32& b,
+                         v_uint16x16& c, v_uint16x16& d)
+{
+    v_uint16x16 a0, a1, b0, b1;
+    v_expand(a, a0, a1);
+    v_expand(b, b0, b1);
+    c = v_mul_wrap(a0, b0);
+    d = v_mul_wrap(a1, b1);
+}
+
+inline void v_mul_expand(const v_int8x32& a, const v_int8x32& b,
+                         v_int16x16& c, v_int16x16& d)
+{
+    v_int16x16 a0, a1, b0, b1;
+    v_expand(a, a0, a1);
+    v_expand(b, b0, b1);
+    c = v_mul_wrap(a0, b0);
+    d = v_mul_wrap(a1, b1);
+}
+
 inline void v_mul_expand(const v_int16x16& a, const v_int16x16& b,
                          v_int32x8& c, v_int32x8& d)
 {
     v_int16x16 vhi = v_int16x16(_mm256_mulhi_epi16(a.val, b.val));
 
     v_int16x16 v0, v1;
-    v_zip(a * b, vhi, v0, v1);
+    v_zip(v_mul_wrap(a, b), vhi, v0, v1);
 
     c = v_reinterpret_as_s32(v0);
     d = v_reinterpret_as_s32(v1);
@@ -668,7 +764,7 @@ inline void v_mul_expand(const v_uint16x16& a, const v_uint16x16& b,
     v_uint16x16 vhi = v_uint16x16(_mm256_mulhi_epu16(a.val, b.val));
 
     v_uint16x16 v0, v1;
-    v_zip(a * b, vhi, v0, v1);
+    v_zip(v_mul_wrap(a, b), vhi, v0, v1);
 
     c = v_reinterpret_as_u32(v0);
     d = v_reinterpret_as_u32(v1);
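
The v_zip change in both the signed and unsigned overloads follows directly from the new operator semantics: the expanded 32-bit product is rebuilt as (mulhi << 16) | mullo, and the low half must wrap, not saturate. With * now saturating, a * b would clip the low halves and corrupt the reconstruction, hence v_mul_wrap(a, b). A per-lane model (my illustration):

```cpp
#include <cstdint>

// Per-lane model of v_mul_expand for u16 -> u32 (illustrative):
static inline uint32_t mul_expand_u16(uint16_t a, uint16_t b)
{
    uint32_t p  = (uint32_t)a * (uint32_t)b;
    uint16_t lo = (uint16_t)p;          // v_mul_wrap: wrapped low 16 bits
    uint16_t hi = (uint16_t)(p >> 16);  // _mm256_mulhi_epu16
    return ((uint32_t)hi << 16) | lo;   // zip reassembles the exact product
}
```
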
@@ -685,20 +781,6 @@ inline void v_mul_expand(const v_uint32x8& a, const v_uint32x8& b,
 inline v_int16x16 v_mul_hi(const v_int16x16& a, const v_int16x16& b) { return v_int16x16(_mm256_mulhi_epi16(a.val, b.val)); }
 inline v_uint16x16 v_mul_hi(const v_uint16x16& a, const v_uint16x16& b) { return v_uint16x16(_mm256_mulhi_epu16(a.val, b.val)); }
 
-/** Non-saturating arithmetics **/
-#define OPENCV_HAL_IMPL_AVX_BIN_FUNC(func, _Tpvec, intrin) \
-inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \
-{ return _Tpvec(intrin(a.val, b.val)); }
-
-OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_add_wrap, v_uint8x32, _mm256_add_epi8)
-OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_add_wrap, v_int8x32, _mm256_add_epi8)
-OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_add_wrap, v_uint16x16, _mm256_add_epi16)
-OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_add_wrap, v_int16x16, _mm256_add_epi16)
-OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_sub_wrap, v_uint8x32, _mm256_sub_epi8)
-OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_sub_wrap, v_int8x32, _mm256_sub_epi8)
-OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_sub_wrap, v_uint16x16, _mm256_sub_epi16)
-OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_sub_wrap, v_int16x16, _mm256_sub_epi16)
-
 /** Bitwise shifts **/
 #define OPENCV_HAL_IMPL_AVX_SHIFT_OP(_Tpuvec, _Tpsvec, suffix, srai) \
 inline _Tpuvec operator << (const _Tpuvec& a, int imm) \
@@ -1385,6 +1467,10 @@ OPENCV_HAL_IMPL_AVX_TRANSPOSE4x4(v_float32x8, ps, _mm256_castps_si256, _mm256_ca
         b0.val = intrin(_v256_extract_low(a.val));  \
         b1.val = intrin(_v256_extract_high(a.val)); \
     } \
+    inline _Tpwvec v_expand_low(const _Tpvec& a) \
+    { return _Tpwvec(intrin(_v256_extract_low(a.val))); } \
+    inline _Tpwvec v_expand_high(const _Tpvec& a) \
+    { return _Tpwvec(intrin(_v256_extract_high(a.val))); } \
     inline _Tpwvec v256_load_expand(const _Tp* ptr) \
     { \
         __m128i a = _mm_loadu_si128((const __m128i*)ptr); \
@@ -1430,7 +1516,12 @@ inline void v_pack_store(schar* ptr, const v_int16x16& a)
 { v_store_low(ptr, v_pack(a, a)); }
 
 inline void v_pack_store(uchar* ptr, const v_uint16x16& a)
-{ v_store_low(ptr, v_pack(a, a)); }
+{
+    const __m256i m = _mm256_set1_epi16(255);
+    __m256i am = _mm256_min_epu16(a.val, m);
+    am = _v256_shuffle_odd_64(_mm256_packus_epi16(am, am));
+    v_store_low(ptr, v_uint8x32(am));
+}
 
 inline void v_pack_u_store(uchar* ptr, const v_int16x16& a)
 { v_store_low(ptr, v_pack_u(a, a)); }
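
The same signed-input pitfall as in _v256_packs_epu32 applies here: _mm256_packus_epi16 reads its 16-bit inputs as signed, so an unsigned lane of 0x8000 or more would previously pack to 0 rather than saturating to 255. Clamping to 255 with _mm256_min_epu16 first makes the store saturate correctly; the v_pack overloads for 32-bit lanes in the next hunk get the analogous fix.
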
@@ -1484,16 +1575,21 @@ inline v_int16x16 v_pack(const v_int32x8& a, const v_int32x8& b)
 { return v_int16x16(_v256_shuffle_odd_64(_mm256_packs_epi32(a.val, b.val))); }
 
 inline v_uint16x16 v_pack(const v_uint32x8& a, const v_uint32x8& b)
-{ return v_uint16x16(_v256_shuffle_odd_64(_mm256_packus_epi32(a.val, b.val))); }
+{ return v_uint16x16(_v256_shuffle_odd_64(_v256_packs_epu32(a.val, b.val))); }
 
 inline v_uint16x16 v_pack_u(const v_int32x8& a, const v_int32x8& b)
-{ return v_pack(v_reinterpret_as_u32(a), v_reinterpret_as_u32(b)); }
+{ return v_uint16x16(_v256_shuffle_odd_64(_mm256_packus_epi32(a.val, b.val))); }
 
 inline void v_pack_store(short* ptr, const v_int32x8& a)
 { v_store_low(ptr, v_pack(a, a)); }
 
 inline void v_pack_store(ushort* ptr, const v_uint32x8& a)
-{ v_store_low(ptr, v_pack(a, a)); }
+{
+    const __m256i m = _mm256_set1_epi32(65535);
+    __m256i am = _mm256_min_epu32(a.val, m);
+    am = _v256_shuffle_odd_64(_mm256_packus_epi32(am, am));
+    v_store_low(ptr, v_uint16x16(am));
+}
 
 inline void v_pack_u_store(ushort* ptr, const v_int32x8& a)
 { v_store_low(ptr, v_pack_u(a, a)); }
58 changes: 52 additions & 6 deletions modules/core/include/opencv2/core/hal/intrin_cpp.hpp
@@ -108,7 +108,7 @@ block and to save contents of the register to memory block.
 These operations allow to reorder or recombine elements in one or multiple vectors.
 
 - Interleave, deinterleave (2, 3 and 4 channels): @ref v_load_deinterleave, @ref v_store_interleave
-- Expand: @ref v_load_expand, @ref v_load_expand_q, @ref v_expand
+- Expand: @ref v_load_expand, @ref v_load_expand_q, @ref v_expand, @ref v_expand_low, @ref v_expand_high
 - Pack: @ref v_pack, @ref v_pack_u, @ref v_rshr_pack, @ref v_rshr_pack_u,
   @ref v_pack_store, @ref v_pack_u_store, @ref v_rshr_pack_store, @ref v_rshr_pack_u_store
 - Recombine: @ref v_zip, @ref v_recombine, @ref v_combine_low, @ref v_combine_high
@@ -185,11 +185,14 @@ Regular integers:
 |load, store        | x | x | x | x | x | x |
 |interleave         | x | x | x | x | x | x |
 |expand             | x | x | x | x | x | x |
+|expand_low         | x | x | x | x | x | x |
+|expand_high        | x | x | x | x | x | x |
 |expand_q           | x | x |   |   |   |   |
 |add, sub           | x | x | x | x | x | x |
 |add_wrap, sub_wrap | x | x | x | x |   |   |
-|mul                |   |   | x | x | x | x |
-|mul_expand         |   |   | x | x | x |   |
+|mul_wrap           | x | x | x | x |   |   |
+|mul                | x | x | x | x | x | x |
+|mul_expand         | x | x | x | x | x |   |
 |compare            | x | x | x | x | x | x |
 |shift              |   |   | x | x | x | x |
 |dotprod            |   |   |   | x |   |   |
@@ -680,7 +683,7 @@ OPENCV_HAL_IMPL_CMP_OP(!=)
 
 //! @brief Helper macro
 //! @ingroup core_hal_intrin_impl
-#define OPENCV_HAL_IMPL_ADD_SUB_OP(func, bin_op, cast_op, _Tp2) \
+#define OPENCV_HAL_IMPL_ARITHM_OP(func, bin_op, cast_op, _Tp2) \
 template<typename _Tp, int n> \
 inline v_reg<_Tp2, n> func(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
 { \

@@ -694,12 +697,17 @@ inline v_reg<_Tp2, n> func(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
 
 /** @brief Add values without saturation
 
 For 8- and 16-bit integer values. */
-OPENCV_HAL_IMPL_ADD_SUB_OP(v_add_wrap, +, (_Tp), _Tp)
+OPENCV_HAL_IMPL_ARITHM_OP(v_add_wrap, +, (_Tp), _Tp)
 
 /** @brief Subtract values without saturation
 
 For 8- and 16-bit integer values. */
-OPENCV_HAL_IMPL_ADD_SUB_OP(v_sub_wrap, -, (_Tp), _Tp)
+OPENCV_HAL_IMPL_ARITHM_OP(v_sub_wrap, -, (_Tp), _Tp)
 
+/** @brief Multiply values without saturation
+
+For 8- and 16-bit integer values. */
+OPENCV_HAL_IMPL_ARITHM_OP(v_mul_wrap, *, (_Tp), _Tp)
+
 //! @cond IGNORED
 template<typename T> inline T _absdiff(T a, T b)
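
A quick illustration of the wrap-versus-saturate distinction these macros document (my example, not from the commit):

```cpp
#include <cstdint>

int main()
{
    // 200 * 2 = 400 does not fit in 8 bits:
    uint8_t wrapped   = (uint8_t)(200 * 2);                    // v_mul_wrap -> 400 mod 256 = 144
    uint8_t saturated = (200 * 2 > 255) ? 255
                                        : (uint8_t)(200 * 2); // saturating * -> 255
    return (wrapped == 144 && saturated == 255) ? 0 : 1;      // exits 0
}
```
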
@@ -1106,6 +1114,44 @@ template<typename _Tp, int n> inline void v_expand(const v_reg<_Tp, n>& a,
     }
 }
 
+/** @brief Expand lower values to the wider pack type
+
+Same as cv::v_expand, but return lower half of the vector.
+
+Scheme:
+@code
+  int32x4     int64x2
+{A B C D} ==> {A B}
+@endcode */
+template<typename _Tp, int n>
+inline v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>
+v_expand_low(const v_reg<_Tp, n>& a)
+{
+    v_reg<typename V_TypeTraits<_Tp>::w_type, n/2> b;
+    for( int i = 0; i < (n/2); i++ )
+        b.s[i] = a.s[i];
+    return b;
+}
+
+/** @brief Expand higher values to the wider pack type
+
+Same as cv::v_expand_low, but expand higher half of the vector instead.
+
+Scheme:
+@code
+  int32x4     int64x2
+{A B C D} ==> {C D}
+@endcode */
+template<typename _Tp, int n>
+inline v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>
+v_expand_high(const v_reg<_Tp, n>& a)
+{
+    v_reg<typename V_TypeTraits<_Tp>::w_type, n/2> b;
+    for( int i = 0; i < (n/2); i++ )
+        b.s[i] = a.s[i+(n/2)];
+    return b;
+}
+
 //! @cond IGNORED
 template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::int_type, n>
 v_reinterpret_as_int(const v_reg<_Tp, n>& a)
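
A short usage sketch for the new pair (hypothetical; it assumes the 128-bit backends received matching overloads among the other files in this commit):

```cpp
#include "opencv2/core/hal/intrin.hpp"

// Widen 16 uchar lanes into two 16-bit halves without the two-output v_expand:
void widen_bytes(const cv::v_uint8x16& a, cv::v_uint16x8& lo, cv::v_uint16x8& hi)
{
    lo = cv::v_expand_low(a);   // lanes 0..7, zero-extended
    hi = cv::v_expand_high(a);  // lanes 8..15, zero-extended
}
```
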
(Diff content for the remaining 43 of the 48 changed files is not shown in this excerpt.)