Use unaligned SSE2 instructions for loading data.
When the functions in gost3411-2012-core.c are called from a shared
library, the `const unsigned char *data` argument may point to
unaligned memory.

Using `_mm_loadu_si128` instead of `_mm_load_si128` incurs no speed
penalty on recent (Sandy Bridge and later) CPUs.
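
For illustration only (not part of the commit), here is a minimal sketch of the failure mode being fixed: `_mm_load_si128` requires a 16-byte-aligned address and can fault otherwise, while `_mm_loadu_si128` accepts any address. The buffer and the one-byte offset below are hypothetical.

#include <emmintrin.h> /* SSE2 intrinsics */
#include <stdio.h>

int main(void)
{
    /* 16-byte-aligned backing store; data = buf + 1 is deliberately
       misaligned, mimicking an unaligned `const unsigned char *data`
       passed in by a shared-library client. */
    _Alignas(16) unsigned char buf[32] = {0};
    const unsigned char *data = buf + 1;

    /* OK: _mm_loadu_si128 has no alignment requirement. */
    __m128i x = _mm_loadu_si128((const __m128i *) data);

    /* Undefined behavior (a general-protection fault in practice),
       because the address is not 16-byte aligned:
       __m128i y = _mm_load_si128((const __m128i *) data); */

    printf("%d\n", _mm_cvtsi128_si32(x)); /* prints 0 */
    return 0;
}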
besser82 authored and adegtyarev committed Oct 25, 2018
1 parent 432d5de · commit c94d6ca
Showing 2 changed files with 16 additions and 16 deletions.
gost3411-2012-sse2.h (8 additions, 8 deletions)
@@ -34,10 +34,10 @@

#define LOAD(P, xmm0, xmm1, xmm2, xmm3) { \
    const __m128i *__m128p = (const __m128i *) &P[0]; \
-   xmm0 = _mm_load_si128(&__m128p[0]); \
-   xmm1 = _mm_load_si128(&__m128p[1]); \
-   xmm2 = _mm_load_si128(&__m128p[2]); \
-   xmm3 = _mm_load_si128(&__m128p[3]); \
+   xmm0 = _mm_loadu_si128(&__m128p[0]); \
+   xmm1 = _mm_loadu_si128(&__m128p[1]); \
+   xmm2 = _mm_loadu_si128(&__m128p[2]); \
+   xmm3 = _mm_loadu_si128(&__m128p[3]); \
}

#define UNLOAD(P, xmm0, xmm1, xmm2, xmm3) { \
@@ -57,10 +57,10 @@

#define X128M(P, xmm0, xmm1, xmm2, xmm3) { \
    const __m128i *__m128p = (const __m128i *) &P[0]; \
-   xmm0 = _mm_xor_si128(xmm0, _mm_load_si128(&__m128p[0])); \
-   xmm1 = _mm_xor_si128(xmm1, _mm_load_si128(&__m128p[1])); \
-   xmm2 = _mm_xor_si128(xmm2, _mm_load_si128(&__m128p[2])); \
-   xmm3 = _mm_xor_si128(xmm3, _mm_load_si128(&__m128p[3])); \
+   xmm0 = _mm_xor_si128(xmm0, _mm_loadu_si128(&__m128p[0])); \
+   xmm1 = _mm_xor_si128(xmm1, _mm_loadu_si128(&__m128p[1])); \
+   xmm2 = _mm_xor_si128(xmm2, _mm_loadu_si128(&__m128p[2])); \
+   xmm3 = _mm_xor_si128(xmm3, _mm_loadu_si128(&__m128p[3])); \
}

#define _mm_xor_64(mm0, mm1) _mm_xor_si64(mm0, _mm_cvtsi64_m64(mm1))
gost3411-2012-sse41.h (8 additions, 8 deletions)
@@ -32,10 +32,10 @@

#define LOAD(P, xmm0, xmm1, xmm2, xmm3) { \
    const __m128i *__m128p = (const __m128i *) &P[0]; \
-   xmm0 = _mm_load_si128(&__m128p[0]); \
-   xmm1 = _mm_load_si128(&__m128p[1]); \
-   xmm2 = _mm_load_si128(&__m128p[2]); \
-   xmm3 = _mm_load_si128(&__m128p[3]); \
+   xmm0 = _mm_loadu_si128(&__m128p[0]); \
+   xmm1 = _mm_loadu_si128(&__m128p[1]); \
+   xmm2 = _mm_loadu_si128(&__m128p[2]); \
+   xmm3 = _mm_loadu_si128(&__m128p[3]); \
}

#define UNLOAD(P, xmm0, xmm1, xmm2, xmm3) { \
@@ -55,10 +55,10 @@

#define X128M(P, xmm0, xmm1, xmm2, xmm3) { \
    const __m128i *__m128p = (const __m128i *) &P[0]; \
-   xmm0 = _mm_xor_si128(xmm0, _mm_load_si128(&__m128p[0])); \
-   xmm1 = _mm_xor_si128(xmm1, _mm_load_si128(&__m128p[1])); \
-   xmm2 = _mm_xor_si128(xmm2, _mm_load_si128(&__m128p[2])); \
-   xmm3 = _mm_xor_si128(xmm3, _mm_load_si128(&__m128p[3])); \
+   xmm0 = _mm_xor_si128(xmm0, _mm_loadu_si128(&__m128p[0])); \
+   xmm1 = _mm_xor_si128(xmm1, _mm_loadu_si128(&__m128p[1])); \
+   xmm2 = _mm_xor_si128(xmm2, _mm_loadu_si128(&__m128p[2])); \
+   xmm3 = _mm_xor_si128(xmm3, _mm_loadu_si128(&__m128p[3])); \
}

#define _mm_xor_64(mm0, mm1) _mm_xor_si64(mm0, _mm_cvtsi64_m64(mm1))
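
As a usage sketch (not from the repository), the patched LOAD macro can now be handed a byte pointer of any alignment; the load_block helper and its deliberately misaligned call are hypothetical.

#include <emmintrin.h>

/* The LOAD macro as patched above (gost3411-2012-sse2.h). */
#define LOAD(P, xmm0, xmm1, xmm2, xmm3) { \
    const __m128i *__m128p = (const __m128i *) &P[0]; \
    xmm0 = _mm_loadu_si128(&__m128p[0]); \
    xmm1 = _mm_loadu_si128(&__m128p[1]); \
    xmm2 = _mm_loadu_si128(&__m128p[2]); \
    xmm3 = _mm_loadu_si128(&__m128p[3]); \
}

/* Hypothetical helper: loads one 64-byte message block from `data`,
   whatever alignment the caller's buffer happens to have. */
static void load_block(const unsigned char *data)
{
    __m128i xmm0, xmm1, xmm2, xmm3;
    LOAD(data, xmm0, xmm1, xmm2, xmm3); /* safe at any alignment */
    (void) xmm0; (void) xmm1; (void) xmm2; (void) xmm3;
}

int main(void)
{
    _Alignas(16) unsigned char msg[65] = {0};
    load_block(msg + 1); /* misaligned on purpose; no fault with loadu */
    return 0;
}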
