Skip to content

Commit

Permalink
add AVX-512BW optimizations of SynetMergedConvolution8iCdc, SynetMergedConvolution8iCd, SynetMergedConvolution8iDc classes.
Browse files Browse the repository at this point in the history
  • Loading branch information
ermig1979 committed Oct 5, 2020
1 parent b1419c8 commit 95ddfbb
Show file tree
Hide file tree
Showing 11 changed files with 1,597 additions and 19 deletions.
6 changes: 3 additions & 3 deletions docs/2020.html
Original file line number Diff line number Diff line change
Expand Up @@ -35,9 +35,9 @@ <h3 id="R095">November X, 2020 (version X.X.94)</h3>
<h4>Algorithms</h4>
<h5>New features</h5>
<ul>
<li>AVX2 optimizations of SynetMergedConvolution8iCdc class.</li>
<li>AVX2 optimizations of SynetMergedConvolution8iCd class.</li>
<li>AVX2 optimizations of SynetMergedConvolution8iDc class.</li>
<li>AVX2 and AVX-512BW optimizations of SynetMergedConvolution8iCdc class.</li>
<li>AVX2 and AVX-512BW optimizations of SynetMergedConvolution8iCd class.</li>
<li>AVX2 and AVX-512BW optimizations of SynetMergedConvolution8iDc class.</li>
</ul>

<a href="#HOME">Home</a>
Expand Down
4 changes: 4 additions & 0 deletions prj/vs2019/Avx512bw.vcxproj
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,7 @@
<ClCompile Include="..\..\src\Simd\SimdAvx512bwSynetConversion.cpp" />
<ClCompile Include="..\..\src\Simd\SimdAvx512bwSynetConvolution8iDepthwise.cpp" />
<ClCompile Include="..\..\src\Simd\SimdAvx512bwSynetConvolution8iDirect.cpp" />
<ClCompile Include="..\..\src\Simd\SimdAvx512bwSynetMergedConvolution8i.cpp" />
<ClCompile Include="..\..\src\Simd\SimdAvx512bwSynetPooling.cpp" />
<ClCompile Include="..\..\src\Simd\SimdAvx512bwSynetScale.cpp" />
<ClCompile Include="..\..\src\Simd\SimdAvx512bwTexture.cpp" />
Expand Down Expand Up @@ -109,8 +110,11 @@
<ClInclude Include="..\..\src\Simd\SimdStore.h" />
<ClInclude Include="..\..\src\Simd\SimdStream.h" />
<ClInclude Include="..\..\src\Simd\SimdSynet.h" />
<ClInclude Include="..\..\src\Simd\SimdSynetConvolution32fCommon.h" />
<ClInclude Include="..\..\src\Simd\SimdSynetConvolution8i.h" />
<ClInclude Include="..\..\src\Simd\SimdSynetConvolution8iCommon.h" />
<ClInclude Include="..\..\src\Simd\SimdSynetMergedConvolution8i.h" />
<ClInclude Include="..\..\src\Simd\SimdSynetScale8i.h" />
<ClInclude Include="..\..\src\Simd\SimdTime.h" />
<ClInclude Include="..\..\src\Simd\SimdUpdate.h" />
<ClInclude Include="..\..\src\Simd\SimdView.hpp" />
Expand Down
12 changes: 12 additions & 0 deletions prj/vs2019/Avx512bw.vcxproj.filters
Original file line number Diff line number Diff line change
Expand Up @@ -196,6 +196,9 @@
<ClCompile Include="..\..\src\Simd\SimdAvx512bwSynetConvolution8iDirect.cpp">
<Filter>Avx512bw</Filter>
</ClCompile>
<ClCompile Include="..\..\src\Simd\SimdAvx512bwSynetMergedConvolution8i.cpp">
<Filter>Avx512bw</Filter>
</ClCompile>
</ItemGroup>
<ItemGroup>
<Filter Include="Avx512bw">
Expand Down Expand Up @@ -317,5 +320,14 @@
<ClInclude Include="..\..\src\Simd\SimdView.hpp">
<Filter>Inc</Filter>
</ClInclude>
<ClInclude Include="..\..\src\Simd\SimdSynetConvolution32fCommon.h">
<Filter>Inc</Filter>
</ClInclude>
<ClInclude Include="..\..\src\Simd\SimdSynetMergedConvolution8i.h">
<Filter>Inc</Filter>
</ClInclude>
<ClInclude Include="..\..\src\Simd\SimdSynetScale8i.h">
<Filter>Inc</Filter>
</ClInclude>
</ItemGroup>
</Project>
5 changes: 0 additions & 5 deletions src/Simd/SimdAvx512bwSynetConvolution8iDirect.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -37,11 +37,6 @@ namespace Simd
using AlgParam = SynetConvolution8iNhwcDirect::AlgParam;
using ConvolutionPtr = SynetConvolution8iNhwcDirect::ConvolutionPtr;

// Broadcasts 4 consecutive uint8 values (read as one 32-bit word) to all
// sixteen 32-bit lanes of a 512-bit register; used to feed VPMADDUBSW-style
// quad-dot-products. Removed here in this commit — an identical definition
// is added to the shared Avx512bw namespace in SimdSynet.h so that both the
// direct and merged int8 convolutions can use it.
// NOTE(review): the *(int32_t*)src read assumes `src` points at >= 4 readable
// bytes; alignment is not required on x86 but the cast is technically UB —
// upstream convention, left as-is.
SIMD_INLINE __m512i Set4(const uint8_t* src)
{
return _mm512_set1_epi32(*(int32_t*)src);
}

template<bool overflow, Term8iType term, SimdConvolutionActivationType type, bool nofma> void ConvolutionNhwcDirect_2x1(const uint8_t * src0,
const ConvParam8i& p, const AlgParam & a, size_t dy, size_t dx, size_t srcC, size_t dstC, const int8_t * weight0,
const __m512* norm, const __m512 * bias, const __m512* params, const __m512 * scale, const __m512* shift, int32_t * buf, uint8_t* dst)
Expand Down
1,504 changes: 1,504 additions & 0 deletions src/Simd/SimdAvx512bwSynetMergedConvolution8i.cpp

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/Simd/SimdLib.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5525,7 +5525,7 @@ SIMD_API void SimdSynetMergedConvolution32fForward(void * context, const float *
// Dispatcher: creates a merged int8 convolution context, selecting the best
// SIMD implementation available on the running CPU. The diff below replaces
// the old SIMD_FUNC2 line (AVX2 -> SSE4.1 fallback chain) with SIMD_FUNC3,
// putting the new AVX-512BW implementation first in the fallback order.
// AVX-512VNNI remains commented out (not yet implemented for this class).
SIMD_API void* SimdSynetMergedConvolution8iInit(size_t batch, const SimdConvolutionParameters* convs, size_t count, SimdSynetCompatibilityType compatibility)
{
typedef void* (*SimdSynetMergedConvolution8iInitPtr) (size_t batch, const SimdConvolutionParameters* convs, size_t count, SimdSynetCompatibilityType compatibility);
// OLD (removed by this commit):
const static SimdSynetMergedConvolution8iInitPtr simdSynetMergedConvolution8iInit = SIMD_FUNC2(SynetMergedConvolution8iInit, SIMD_AVX2_FUNC, SIMD_SSE41_FUNC);// , SIMD_AVX512VNNI_FUNC, SIMD_AVX512BW_FUNC);
// NEW (added by this commit):
const static SimdSynetMergedConvolution8iInitPtr simdSynetMergedConvolution8iInit = SIMD_FUNC3(SynetMergedConvolution8iInit, SIMD_AVX512BW_FUNC, SIMD_AVX2_FUNC, SIMD_SSE41_FUNC);// , SIMD_AVX512VNNI_FUNC);

return simdSynetMergedConvolution8iInit(batch, convs, count, compatibility);
}
Expand Down
9 changes: 9 additions & 0 deletions src/Simd/SimdStore.h
Original file line number Diff line number Diff line change
Expand Up @@ -253,6 +253,15 @@ namespace Simd
Sse::Store<align>(p2, _mm512_extractf32x4_ps(a, 2));
Sse::Store<align>(p3, _mm512_extractf32x4_ps(a, 3));
}

// Converts 16 packed floats to 16 uint8 values with saturation to [0, 255].
// Active path (#if 1): float -> uint32 (VCVTPS2UDQ), clamp negatives via a
// signed max with zero (out-of-range conversion results have the sign bit
// set, so signed max-with-0 maps them to 0 — presumably the intent; confirm
// against the Intel manual's VCVTPS2UDQ out-of-range behavior), then an
// unsigned-saturating narrow to 8 bits (VPMOVUSDB) handles the upper clamp.
// Disabled path: legacy signed-convert + 16-bit narrow + AVX2 pack, kept for
// reference/fallback.
SIMD_INLINE __m128i Cvt32fTo8u(__m512 a)
{
#if 1
return _mm512_cvtusepi32_epi8(_mm512_max_epi32(_mm512_cvtps_epu32(a), _mm512_setzero_si512()));
#else
return _mm256_castsi256_si128(Avx2::PackI16ToU8(_mm512_cvtepi32_epi16(_mm512_cvtps_epi32(a)), _mm256_setzero_si256()));
#endif
}
}
#endif//SIMD_AVX512F_ENABLE

Expand Down
5 changes: 5 additions & 0 deletions src/Simd/SimdSynet.h
Original file line number Diff line number Diff line change
Expand Up @@ -354,6 +354,11 @@ namespace Simd
#ifdef SIMD_AVX512BW_ENABLE
namespace Avx512bw
{
// Broadcasts 4 consecutive uint8 values (read as one 32-bit word) to all
// sixteen 32-bit lanes of a 512-bit register. Added here (SimdSynet.h,
// Avx512bw namespace) so both the direct and the new merged int8
// convolutions share one definition; pairs with the Madd4 helpers below.
// NOTE(review): *(int32_t*)src assumes >= 4 readable bytes at `src`;
// unaligned access is fine on x86 but the cast is strictly UB — matches
// existing upstream style.
SIMD_INLINE __m512i Set4(const uint8_t* src)
{
return _mm512_set1_epi32(*(int32_t*)src);
}

template<bool overflow> void Madd4(__m512i& i32, __m512i u8, __m512i i8);

template<> SIMD_INLINE void Madd4<true>(__m512i& i32, __m512i u8, __m512i i8)
Expand Down
31 changes: 25 additions & 6 deletions src/Simd/SimdSynetConvolution8iCommon.h
Original file line number Diff line number Diff line change
Expand Up @@ -816,6 +816,9 @@ namespace Simd
{
template<SimdConvolutionActivationType type, int index> static SIMD_INLINE void Save(uint8_t* dst, int32_t* buf, __m512i sum,
const __m512* norm, const __m512* bias, const __m512* params, const __m512* scale, const __m512* shift, __m128i upper, __mmask16 tail = -1);

template<SimdConvolutionActivationType type> static SIMD_INLINE void Save(uint8_t* dst, __m512 sum,
const __m512* params, const __m512 & scale, const __m512 & shift, __m128i upper, __mmask16 tail = -1);
};

template <> struct Term8i<Term8iSingle8u>
Expand All @@ -824,10 +827,16 @@ namespace Simd
const __m512* norm, const __m512* bias, const __m512* params, const __m512* scale, const __m512* shift, __m128i upper, __mmask16 tail = -1)
{
__m512 f32 = Activate<type>(Fmadd<nofma>(_mm512_cvtepi32_ps(sum), norm[index], bias[index]), params, index);
__m512i i32 = _mm512_cvtps_epi32(Fmadd<nofma>(f32, scale[index], shift[index]));
__m128i u8 = _mm256_castsi256_si128(Avx2::PackI16ToU8(_mm512_cvtepi32_epi16(i32), Avx2::K_ZERO));
__m128i u8 = Cvt32fTo8u(Fmadd<nofma>(f32, scale[index], shift[index]));
_mm_mask_storeu_epi8(dst + index * F, tail, _mm_min_epu8(u8, upper));
}

template<SimdConvolutionActivationType type, bool nofma> static SIMD_INLINE void Save(uint8_t* dst, __m512 sum,
const __m512* params, const __m512& scale, const __m512& shift, __m128i upper, __mmask16 tail)
{
__m128i u8 = Cvt32fTo8u(Fmadd<nofma>(Activate<type>(sum, params, 0), scale, shift));
_mm_mask_storeu_epi8(dst, tail, _mm_min_epu8(u8, upper));
}
};

template <> struct Term8i<Term8iSingle32f>
Expand All @@ -838,6 +847,12 @@ namespace Simd
__m512 f32 = Activate<type>(Fmadd<nofma>(_mm512_cvtepi32_ps(sum), norm[index], bias[index]), params, index);
_mm512_mask_storeu_ps((float*)dst + index * F, tail, f32);
}

template<SimdConvolutionActivationType type, bool nofma> static SIMD_INLINE void Save(uint8_t* dst, __m512 sum,
const __m512* params, const __m512& scale, const __m512& shift, __m128i upper, __mmask16 tail)
{
_mm512_mask_storeu_ps((float*)dst, tail, Activate<type>(sum, params, 0));
}
};

template <> struct Term8i<Term8iFirst>
Expand Down Expand Up @@ -865,8 +880,7 @@ namespace Simd
{
sum = _mm512_add_epi32(_mm512_maskz_loadu_epi32(tail, buf + index * F), sum);
__m512 f32 = Activate<type>(Fmadd<nofma>(_mm512_cvtepi32_ps(sum), norm[index], bias[index]), params, index);
__m512i i32 = _mm512_cvtps_epi32(Fmadd<nofma>(f32, scale[index], shift[index]));
__m128i u8 = _mm256_castsi256_si128(Avx2::PackI16ToU8(_mm512_cvtepi32_epi16(i32), Avx2::K_ZERO));
__m128i u8 = Cvt32fTo8u(Fmadd<nofma>(f32, scale[index], shift[index]));
_mm_mask_storeu_epi8(dst + index * F, tail, _mm_min_epu8(u8, upper));
}
};
Expand Down Expand Up @@ -897,6 +911,12 @@ namespace Simd
Term8i<term>::template Save<type, 1, nofma>(dst, buf, sum1, norm, bias, params, scale, shift, upper, tail);
}

// New float-accumulator overload of Save1 (added for the merged int8
// convolution's depthwise stage): forwards a single __m512 float sum to the
// matching Term8i<term>::Save specialization, which applies the activation,
// requantizes with scale/shift (for 8u terms), clamps to `upper`, and does a
// masked store of up to 16 outputs selected by `tail`.
template<Term8iType term, SimdConvolutionActivationType type, bool nofma>
SIMD_INLINE void Save1(uint8_t* dst, __m512 sum, const __m512* params, const __m512& scale, const __m512& shift, __m128i upper, __mmask16 tail = -1)
{
Term8i<term>::template Save<type, nofma>(dst, sum, params, scale, shift, upper, tail);
}

//---------------------------------------------------------------------

template <Term8iType term> struct Term8iDepthwise
Expand All @@ -915,8 +935,7 @@ namespace Simd
__m512 f32 = Avx512f::Activate<type>(Fmadd<nofma>(_mm512_cvtepi32_ps(sum), _norm, _bias), params, offset, tail);
__m512 _scale = _mm512_maskz_loadu_ps(tail, scale + offset);
__m512 _shift = _mm512_maskz_loadu_ps(tail, shift + offset);
__m512i i32 = _mm512_cvtps_epi32(Fmadd<nofma>(f32, _scale, _shift));
__m128i u8 = _mm256_castsi256_si128(Avx2::PackI16ToU8(_mm512_cvtepi32_epi16(i32), Avx2::K_ZERO));
__m128i u8 = Cvt32fTo8u(Fmadd<nofma>(f32, _scale, _shift));
_mm_mask_storeu_epi8(dst + offset, tail, _mm_min_epu8(u8, upper));
}
};
Expand Down
25 changes: 25 additions & 0 deletions src/Simd/SimdSynetMergedConvolution8i.h
Original file line number Diff line number Diff line change
Expand Up @@ -315,6 +315,31 @@ namespace Simd
#ifdef SIMD_AVX512BW_ENABLE
namespace Avx512bw
{
// AVX-512BW merged int8 convolution, Conv-Depthwise-Conv fusion.
// Inherits all algorithm plumbing from the AVX2 version; the constructor
// (defined in SimdAvx512bwSynetMergedConvolution8i.cpp) presumably rebinds
// the kernel function pointers to AVX-512BW implementations — confirm in
// the .cpp, which is not rendered in this diff view.
class SynetMergedConvolution8iCdc : public Avx2::SynetMergedConvolution8iCdc
{
public:
SynetMergedConvolution8iCdc(const MergConvParam8i& p);

// Implementation tag reported through the framework's extension query.
virtual String Ext() const { return "Avx512bw"; }
};

// AVX-512BW merged int8 convolution, Conv-Depthwise fusion.
// Derives from the AVX2 class; constructor is defined in the new
// SimdAvx512bwSynetMergedConvolution8i.cpp (large diff, not rendered here).
class SynetMergedConvolution8iCd : public Avx2::SynetMergedConvolution8iCd
{
public:
SynetMergedConvolution8iCd(const MergConvParam8i& p);

// Implementation tag reported through the framework's extension query.
virtual String Ext() const { return "Avx512bw"; }
};

// AVX-512BW merged int8 convolution, Depthwise-Conv fusion.
// Derives from the AVX2 class; constructor is defined in the new
// SimdAvx512bwSynetMergedConvolution8i.cpp (large diff, not rendered here).
class SynetMergedConvolution8iDc : public Avx2::SynetMergedConvolution8iDc
{
public:
SynetMergedConvolution8iDc(const MergConvParam8i& p);

// Implementation tag reported through the framework's extension query.
virtual String Ext() const { return "Avx512bw"; }
};

void* SynetMergedConvolution8iInit(size_t batch, const SimdConvolutionParameters* convs, size_t count, SimdSynetCompatibilityType compatibility);
}
#endif//SIMD_AVX512BW_ENABLE

Expand Down
13 changes: 9 additions & 4 deletions src/Test/TestSynetMergedConvolution8i.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -258,13 +258,13 @@ namespace Test
//const SimdConvolutionActivationType a0 = SimdConvolutionActivationPrelu, a1 = SimdConvolutionActivationHswish, a2 = SimdConvolutionActivationIdentity;
const SimdConvolutionActivationType a0 = SimdConvolutionActivationHswish, a1 = SimdConvolutionActivationIdentity, a2 = SimdConvolutionActivationPrelu;
#if defined(NDEBUG)
#if 1
#if 0
result = result && SynetMergedConvolution8iForwardAutoTest(eps, Param(Shp(1, 128, 20, 12), Cnv(a0, 3, 1), Cnv(a1, 1, 1, 20), u8, u8, 1, n), f1, f2);
result = result && SynetMergedConvolution8iForwardAutoTest(eps, Param(Shp(1, 128, 20, 12), Cnv(a0, 3, 1), Cnv(a1, 1, 1, 20), f32, u8, 1, n), f1, f2);
result = result && SynetMergedConvolution8iForwardAutoTest(eps, Param(Shp(1, 128, 20, 12), Cnv(a0, 3, 1), Cnv(a1, 1, 1, 128), u8, u8, 1, n), f1, f2);
result = result && SynetMergedConvolution8iForwardAutoTest(eps, Param(Shp(1, 128, 20, 12), Cnv(a0, 3, 1), Cnv(a1, 1, 1, 128), f32, u8, 1, n), f1, f2);
#endif
#if 0
#if 1
result = result && SynetMergedConvolution8iForwardAutoTest(eps, Param(Shp(1, 1024, 8, 6), Cnv(a0, 1, 1, 1548), Cnv(a1, 3, 1), u8, u8, 1, n), f1, f2);
result = result && SynetMergedConvolution8iForwardAutoTest(eps, Param(Shp(1, 3, 320, 180), Cnv(a0, 3, 2, 16), Cnv(a1, 3, 1), u8, u8, 1, n), f1, f2);
result = result && SynetMergedConvolution8iForwardAutoTest(eps, Param(Shp(1, 16, 160, 90), Cnv(a0, 1, 1, 30), Cnv(a1, 3, 2), f32, u8, 1, n), f1, f2);
Expand All @@ -274,7 +274,7 @@ namespace Test
result = result && SynetMergedConvolution8iForwardAutoTest(eps, Param(Shp(1, 32, 40, 23), Cnv(a0, 1, 1, 64), Cnv(a1, 3, 1), u8, u8, 1, n), f1, f2);
result = result && SynetMergedConvolution8iForwardAutoTest(eps, Param(Shp(1, 64, 40, 23), Cnv(a0, 1, 1, 64), Cnv(a1, 3, 1), u8, u8, 1, p), f1, f2);
#endif
#if 0
#if 1
result = result && SynetMergedConvolution8iForwardAutoTest(eps, Param(Shp(1, 64, 40, 23), Cnv(a0, 3, 2), Cnv(a1, 1, 1, 128), u8, u8, 1, n), f1, f2);
result = result && SynetMergedConvolution8iForwardAutoTest(eps, Param(Shp(1, 64, 40, 23), Cnv(a0, 3, 1), Cnv(a1, 1, 1, 30), f32, u8, 1, p), f1, f2);
result = result && SynetMergedConvolution8iForwardAutoTest(eps, Param(Shp(1, 64, 40, 23), Cnv(a0, 3, 1), Cnv(a1, 1, 1, 12), u8, f32, 0, o), f1, f2);
Expand All @@ -290,7 +290,7 @@ namespace Test
result = result && SynetMergedConvolution8iForwardAutoTest(eps, Param(Shp(1, 256, 10, 6), Cnv(a0, 3, 1), Cnv(a1, 1, 1, 4), u8, u8, 1, n), f1, f2);
result = result && SynetMergedConvolution8iForwardAutoTest(eps, Param(Shp(1, 1280, 6, 8), Cnv(a0, 3, 1), Cnv(a1, 1, 1, 1024), u8, u8, 1, n), f1, f2);
#endif
#if 0
#if 1
result = result && SynetMergedConvolution8iForwardAutoTest(eps, Param(Shp(1, 256, 10, 6), Cnv(a0, 1, 1, 64), Cnv(a1, 3, 2), Cnv(a2, 1, 1, 256), u8, u8, 1, n), f1, f2);
result = result && SynetMergedConvolution8iForwardAutoTest(eps, Param(Shp(1, 1060, 6, 7), Cnv(a0, 1, 1, 960), Cnv(a1, 3, 1), Cnv(a2, 1, 1, 1060), f32, u8, 1, n), f1, f2);
result = result && SynetMergedConvolution8iForwardAutoTest(eps, Param(Shp(1, 160, 8, 13), Cnv(a0, 1, 1, 960), Cnv(a1, 3, 1), Cnv(a2, 1, 1, 160), u8, f32, 1, o), f1, f2);
Expand Down Expand Up @@ -320,6 +320,11 @@ namespace Test
result = result && SynetMergedConvolution8iForwardAutoTest(EPS, FUNC_MC(Simd::Avx2::SynetMergedConvolution8iInit), FUNC_MC(SimdSynetMergedConvolution8iInit));
#endif

#ifdef SIMD_AVX512BW_ENABLE
if (Simd::Avx512bw::Enable)
result = result && SynetMergedConvolution8iForwardAutoTest(EPS, FUNC_MC(Simd::Avx512bw::SynetMergedConvolution8iInit), FUNC_MC(SimdSynetMergedConvolution8iInit));
#endif

return result;
}
}

0 comments on commit 95ddfbb

Please sign in to comment.