Skip to content

Commit

Permalink
Add SSE2 implementation of floor and ceil in x86 targets. (Tencent#3595)
Browse files Browse the repository at this point in the history
* Add SSE2 implementation of floor and ceil in x86 targets.

* apply code-format changes

* Update the SSE2 floor implementation.

Co-authored-by: MouriNaruto <[email protected]>
  • Loading branch information
MouriNaruto and MouriNaruto authored Mar 9, 2022
1 parent f4f4cfd commit 2b4a212
Showing 1 changed file with 60 additions and 18 deletions.
78 changes: 60 additions & 18 deletions src/layer/x86/unaryop_x86.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -133,15 +133,41 @@ struct unary_op_floor
{
#if __SSE4_1__
    return (__m128)_mm_floor_ps(x);
#else
    // SSE2 fallback: build floor(x) from a truncation toward zero plus a
    // -1 correction on negative, non-integral lanes.

    // Mask with only the sign bit set in every lane (-0.0f == 0x80000000).
    const __m128 magic_sign_bit = _mm_set_ps1(-0.0f);

    // 2^23: the smallest positive float magnitude at which every
    // representable value is already an integer (no fractional part).
    const __m128 magic_smallest_no_fraction = _mm_set_ps1(8388608.0f);

    // absolute = |x| (sign bit cleared).
    __m128 absolute = _mm_andnot_ps(magic_sign_bit, x);

    // Original sign bit of each lane, kept so it can be re-applied after
    // truncating the magnitude.
    __m128 negative_mask = _mm_and_ps(magic_sign_bit, x);

    // Lanes where |x| > 2^23: the value has no fraction, and converting it
    // to int32 below could overflow, so those lanes pass through unchanged.
    // (NaN compares false here and takes the truncation path.)
    __m128 no_fraction = _mm_cmplt_ps(magic_smallest_no_fraction, absolute);

    // truncated = (float)(int32_t)|x|  -- rounds toward zero.
    __m128 truncated = _mm_cvtepi32_ps(_mm_cvttps_epi32(absolute));

    // Re-attach the original sign to the truncated magnitude.
    __m128 truncated_with_sign = _mm_or_ps(truncated, negative_mask);

    // Truncation rounds toward zero, which is one too high for negative
    // non-integral inputs: negative_fix = (x < truncated_with_sign) ? 1 : 0.
    __m128 negative_fix = _mm_and_ps(
        _mm_cmplt_ps(x, truncated_with_sign),
        _mm_set_ps1(1.0f));

    // fixed_result = truncated_with_sign - negative_fix;
    __m128 fixed_result = _mm_sub_ps(truncated_with_sign, negative_fix);

    // Select x itself where no fraction is possible, otherwise the
    // corrected truncation: (x & no_fraction) | (~no_fraction & fixed_result).
    return _mm_or_ps(
        _mm_and_ps(x, no_fraction),
        _mm_andnot_ps(no_fraction, fixed_result));
#endif // __SSE4_1__
}
#if __AVX__
__m256 operator()(const __m256& x) const
Expand All @@ -163,15 +189,31 @@ struct unary_op_ceil
{
#if __SSE4_1__
    return (__m128)_mm_ceil_ps(x);
#else
    // SSE2 fallback: ceil(x) = trunc(x) plus 1 on positive non-integral
    // lanes; truncation toward zero already equals ceil for negative input.

    const __m128 magic_negative_one = _mm_set_ps1(-1.0f);
    const __m128 magic_infinity = _mm_set_ps1(INFINITY);

    // Largest float just below 2^23; its bit pattern is used below as an
    // integer threshold for "magnitude too large to carry a fraction".
    const __m128 magic_max_fraction = _mm_set_ps1(8388607.5f);

    // Raw IEEE-754 bits of x.
    __m128i v1 = _mm_castps_si128(x);

    // Per-lane sign mask: all-ones for negative lanes, zero otherwise.
    __m128i v2 = _mm_srai_epi32(v1, 31);

    // v1 & bits(INFINITY) isolates the exponent field (sign and mantissa
    // cleared); lanes whose exponent exceeds the threshold hold values with
    // no fractional part (|x| >= 2^23, inf, NaN) where int conversion could
    // overflow -- those lanes must pass through unchanged.
    __m128i v3 = _mm_cmpgt_epi32(
        _mm_and_si128(v1, _mm_castps_si128(magic_infinity)),
        _mm_castps_si128(magic_max_fraction));

    // v4 = x where no fraction is possible, otherwise trunc(x) with the
    // original sign bit restored (keeps -0.0f for x in (-1, 0)).
    __m128 v4 = _mm_castsi128_ps(_mm_or_si128(
        _mm_andnot_si128(
            v3,
            _mm_or_si128(
                _mm_castps_si128(_mm_cvtepi32_ps(_mm_cvttps_epi32(x))),
                _mm_slli_epi32(v2, 31))),
        _mm_and_si128(v1, v3)));

    // Subtract -1.0f (i.e. add 1) in lanes that are non-negative (~v2) and
    // were not already integral (bits(x) != bits(v4)).
    return _mm_sub_ps(
        v4,
        _mm_castsi128_ps(_mm_andnot_si128(
            v2,
            _mm_andnot_si128(
                _mm_cmpeq_epi32(v1, _mm_castps_si128(v4)),
                _mm_castps_si128(magic_negative_one)))));
#endif // __SSE4_1__
}
#if __AVX__
__m256 operator()(const __m256& x) const
Expand Down

0 comments on commit 2b4a212

Please sign in to comment.