Skip to content

Commit

Permalink
Merge pull request opencv#7182 from mself:two_channel_universal_intri…
Browse files Browse the repository at this point in the history
…nsics
  • Loading branch information
mshabunin committed Aug 31, 2016
2 parents d4ae7f3 + 9678d48 commit 595fd27
Show file tree
Hide file tree
Showing 4 changed files with 111 additions and 8 deletions.
55 changes: 47 additions & 8 deletions modules/core/include/opencv2/core/hal/intrin_cpp.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,7 @@ block and to save contents of the register to memory block.
These operations allow to reorder or recombine elements in one or multiple vectors.
- Interleave, deinterleave (3 and 4 channels): @ref v_load_deinterleave, @ref v_store_interleave
- Interleave, deinterleave (2, 3 and 4 channels): @ref v_load_deinterleave, @ref v_store_interleave
- Expand: @ref v_load_expand, @ref v_load_expand_q, @ref v_expand
- Pack: @ref v_pack, @ref v_pack_u, @ref v_rshr_pack, @ref v_rshr_pack_u,
@ref v_pack_store, @ref v_pack_u_store, @ref v_rshr_pack_store, @ref v_rshr_pack_u_store
Expand Down Expand Up @@ -1075,12 +1075,31 @@ v_load_expand_q(const _Tp* ptr)
return c;
}

/** @brief Load and deinterleave (4 channels)
/** @brief Load and deinterleave (2 channels)
Load data from memory deinterleave and store to 4 registers.
Load data from memory deinterleave and store to 2 registers.
Scheme:
@code
{A1 B1 C1 D1 A2 B2 C2 D2 ...} ==> {A1 A2 ...}, {B1 B2 ...}, {C1 C2 ...}, {D1 D2 ...}
{A1 B1 A2 B2 ...} ==> {A1 A2 ...}, {B1 B2 ...}
@endcode
For all types except 64-bit. */
template<typename _Tp, int n> inline void v_load_deinterleave(const _Tp* ptr, v_reg<_Tp, n>& a,
v_reg<_Tp, n>& b)
{
int i, i2;
for( i = i2 = 0; i < n; i++, i2 += 2 )
{
a.s[i] = ptr[i2];
b.s[i] = ptr[i2+1];
}
}

/** @brief Load and deinterleave (3 channels)
Load data from memory deinterleave and store to 3 registers.
Scheme:
@code
{A1 B1 C1 A2 B2 C2 ...} ==> {A1 A2 ...}, {B1 B2 ...}, {C1 C2 ...}
@endcode
For all types except 64-bit. */
template<typename _Tp, int n> inline void v_load_deinterleave(const _Tp* ptr, v_reg<_Tp, n>& a,
Expand All @@ -1095,12 +1114,12 @@ template<typename _Tp, int n> inline void v_load_deinterleave(const _Tp* ptr, v_
}
}

/** @brief Load and deinterleave (3 channels)
/** @brief Load and deinterleave (4 channels)
Load data from memory deinterleave and store to 3 registers.
Load data from memory deinterleave and store to 4 registers.
Scheme:
@code
{A1 B1 C1 A2 B2 C2 ...} ==> {A1 A2 ...}, {B1 B2 ...}, {C1 C2 ...}
{A1 B1 C1 D1 A2 B2 C2 D2 ...} ==> {A1 A2 ...}, {B1 B2 ...}, {C1 C2 ...}, {D1 D2 ...}
@endcode
For all types except 64-bit. */
template<typename _Tp, int n>
Expand All @@ -1118,12 +1137,32 @@ inline void v_load_deinterleave(const _Tp* ptr, v_reg<_Tp, n>& a,
}
}

/** @brief Interleave and store (2 channels)
Interleave and store data from 2 registers to memory.
Scheme:
@code
{A1 A2 ...}, {B1 B2 ...} ==> {A1 B1 A2 B2 ...}
@endcode
For all types except 64-bit. */
template<typename _Tp, int n>
inline void v_store_interleave( _Tp* ptr, const v_reg<_Tp, n>& a,
const v_reg<_Tp, n>& b)
{
int i, i2;
for( i = i2 = 0; i < n; i++, i2 += 2 )
{
ptr[i2] = a.s[i];
ptr[i2+1] = b.s[i];
}
}

/** @brief Interleave and store (3 channels)
Interleave and store data from 3 registers to memory.
Scheme:
@code
{A1 A2 ...}, {B1 B2 ...}, {C1 C2 ...}, {D1 D2 ...} ==> {A1 B1 C1 D1 A2 B2 C2 D2 ...}
{A1 A2 ...}, {B1 B2 ...}, {C1 C2 ...} ==> {A1 B1 C1 A2 B2 C2 ...}
@endcode
For all types except 64-bit. */
template<typename _Tp, int n>
Expand Down
13 changes: 13 additions & 0 deletions modules/core/include/opencv2/core/hal/intrin_neon.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -809,6 +809,12 @@ OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(int32x4, s32)
OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(float32x4, f32)

#define OPENCV_HAL_IMPL_NEON_INTERLEAVED(_Tpvec, _Tp, suffix) \
inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b) \
{ \
_Tpvec##x2_t v = vld2q_##suffix(ptr); \
a.val = v.val[0]; \
b.val = v.val[1]; \
} \
inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b, v_##_Tpvec& c) \
{ \
_Tpvec##x3_t v = vld3q_##suffix(ptr); \
Expand All @@ -825,6 +831,13 @@ inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b, \
c.val = v.val[2]; \
d.val = v.val[3]; \
} \
inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b) \
{ \
_Tpvec##x2_t v; \
v.val[0] = a.val; \
v.val[1] = b.val; \
vst2q_##suffix(ptr, v); \
} \
inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, const v_##_Tpvec& c) \
{ \
_Tpvec##x3_t v; \
Expand Down
24 changes: 24 additions & 0 deletions modules/core/include/opencv2/core/hal/intrin_sse.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -1374,6 +1374,18 @@ inline void v_load_deinterleave(const unsigned* ptr, v_uint32x4& a, v_uint32x4&
v_transpose4x4(u0, u1, u2, u3, a, b, c, d);
}

// 2-channel, float only
inline void v_load_deinterleave(const float* ptr, v_float32x4& a, v_float32x4& b)
{
const int mask_lo = _MM_SHUFFLE(2, 0, 2, 0), mask_hi = _MM_SHUFFLE(3, 1, 3, 1);

__m128 u0 = _mm_loadu_ps(ptr); // a0 b0 a1 b1
__m128 u1 = _mm_loadu_ps((ptr + 4)); // a2 b2 a3 b3

a.val = _mm_shuffle_ps(u0, u1, mask_lo); // a0 a1 a2 a3
b.val = _mm_shuffle_ps(u0, u1, mask_hi); // b0 b1 ab b3
}

inline void v_store_interleave( uchar* ptr, const v_uint8x16& a, const v_uint8x16& b,
const v_uint8x16& c )
{
Expand Down Expand Up @@ -1529,6 +1541,18 @@ inline void v_store_interleave(unsigned* ptr, const v_uint32x4& a, const v_uint3
v_store(ptr + 12, t3);
}

// 2-channel, float only
inline void v_store_interleave(float* ptr, const v_float32x4& a, const v_float32x4& b)
{
// a0 a1 a2 a3 ...
// b0 b1 b2 b3 ...
__m128 u0 = _mm_unpacklo_ps(a.val, b.val); // a0 b0 a1 b1
__m128 u1 = _mm_unpackhi_ps(a.val, b.val); // a2 b2 a3 b3

_mm_storeu_ps(ptr, u0);
_mm_storeu_ps((ptr + 4), u1);
}

#define OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(_Tpvec, _Tp, suffix, _Tpuvec, _Tpu, usuffix) \
inline void v_load_deinterleave( const _Tp* ptr, _Tpvec& a0, \
_Tpvec& b0, _Tpvec& c0 ) \
Expand Down
27 changes: 27 additions & 0 deletions modules/core/test/test_intrin.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,32 @@ template<typename R> struct TheTest
return *this;
}

// float32x4 only
TheTest & test_interleave_2channel()
{
Data<R> data1, data2;
data2 += 20;

R a = data1, b = data2;

LaneType buf2[R::nlanes * 2];

v_store_interleave(buf2, a, b);

Data<R> z(0);
a = b = z;

v_load_deinterleave(buf2, a, b);

for (int i = 0; i < R::nlanes; ++i)
{
EXPECT_EQ(data1, Data<R>(a));
EXPECT_EQ(data2, Data<R>(b));
}

return *this;
}

// v_expand and v_load_expand
TheTest & test_expand()
{
Expand Down Expand Up @@ -846,6 +872,7 @@ TEST(hal_intrin, float32x4) {
TheTest<v_float32x4>()
.test_loadstore()
.test_interleave()
.test_interleave_2channel()
.test_addsub()
.test_mul()
.test_div()
Expand Down

0 comments on commit 595fd27

Please sign in to comment.