Add vec_ld for aligned loads
noloader committed Mar 7, 2018
1 parent 9bb28a7 commit d4eda27
Showing 3 changed files with 61 additions and 19 deletions.
10 changes: 9 additions & 1 deletion README.md
@@ -36,20 +36,28 @@ The Power8 source file is a work in progress. The main problem at the moment is

Performance testing of SHA-512 has not started.

## Benchmarks
# Benchmarks

The speedups can be tricky to measure, but concrete numbers are available from Jack Lloyd's Botan. The relative speedups using a three-second benchmark under the command `./botan speed --msec=3000 SHA-1 SHA-224 SHA-256` are as follows. The measurements were taken on an Intel Celeron J3455 and an ARMv8 LeMaker HiKey.

## Intel SHA

* Intel x86, SHA-1, GCC 6.2 - approximately 4.3x
* Intel x86, SHA-1, Clang 3.8 - approximately 4.5x
* Intel x86, SHA-224, GCC 6.2 - approximately 5.8x
* Intel x86, SHA-224, Clang 3.8 - approximately 5.8x
* Intel x86, SHA-256, GCC 6.2 - approximately 5.8x
* Intel x86, SHA-256, Clang 3.8 - approximately 5.8x

## ARM SHA

* ARMv8, SHA-1, GCC 4.9 - approximately 4.8x
* ARMv8, SHA-1, Clang 3.5 - approximately 5.9x
* ARMv8, SHA-224, GCC 4.9 - approximately 9.2x
* ARMv8, SHA-224, Clang 3.5 - approximately 12.6x
* ARMv8, SHA-256, GCC 4.9 - approximately 9.2x
* ARMv8, SHA-256, Clang 3.5 - approximately 12.6x

## Power8 SHA

To be determined.
33 changes: 25 additions & 8 deletions sha256-2-p8.cxx
@@ -3,6 +3,7 @@

/* sha256-2-p8.cxx rotates working variables in the callers instead of */
/* the SHA round function. Loop unrolling penalizes performance. */
/* Loads and stores: https://gcc.gnu.org/ml/gcc/2015-03/msg00140.html. */

/* xlC -DTEST_MAIN -qarch=pwr8 -qaltivec sha256-p8.cxx -o sha256-p8.exe */
/* g++ -DTEST_MAIN -mcpu=power8 sha256-p8.cxx -o sha256-p8.exe */
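
For readers unfamiliar with the Power vector load flavors the new comment links to: vec_ld performs a 16-byte aligned load (the hardware ignores the low four bits of the effective address), while vec_vsx_ld tolerates unaligned addresses. The unaligned helper bodies are collapsed in this diff, so vec_vsx_ld below is shown only as the usual unaligned counterpart, not as the commit's actual code. A minimal standalone sketch, assuming a POWER8 toolchain with VSX enabled:

```cpp
// Minimal sketch (assumption: g++ -mcpu=power8, VSX available).
// vec_ld requires a 16-byte aligned effective address; vec_vsx_ld does not.
#include <altivec.h>
#include <stdint.h>

typedef __vector unsigned int uint32x4_p8;

int main()
{
    __attribute__((aligned(16))) uint32_t buf[8] = {1,2,3,4,5,6,7,8};

    uint32x4_p8 lo = vec_ld(0, buf);       // aligned load of buf[0..3]
    uint32x4_p8 hi = vec_vsx_ld(16, buf);  // unaligned-capable load of buf[4..7]

    vec_st(lo, 16, buf);                   // aligned store of lo into buf[4..7]
    (void)hi;
    return 0;
}
```
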
@@ -50,6 +51,14 @@ static const ALIGN16 uint32_t K[] =
0x90BEFFFA, 0xA4506CEB, 0xBEF9A3F7, 0xC67178F2
};

// Aligned load
template <class T> static inline
uint32x4_p8 VectorLoad32x4(const T* data, int offset)
{
return vec_ld(offset, (uint32_t*)data);
}

// Unaligned load
template <class T> static inline
uint32x4_p8 VectorLoad32x4u(const T* data, int offset)
{
@@ -60,6 +69,14 @@ uint32x4_p8 VectorLoad32x4u(const T* data, int offset)
#endif
}

// Aligned store
template <class T> static inline
void VectorStore32x4(const uint32x4_p8 val, T* data, int offset)
{
vec_st(val, offset, (uint32_t*)data);
}

// Unaligned store
template <class T> static inline
void VectorStore32x4u(const uint32x4_p8 val, T* data, int offset)
{
@@ -164,10 +181,10 @@ void SHA256_SCHEDULE(uint32_t W[64+2], const uint8_t* data)
#if (__LITTLE_ENDIAN__)
const uint8x16_p8 mask = {3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12};
for (unsigned int i=0; i<16; i+=4)
VectorStore32x4u(VectorPermute32x4(VectorLoad32x4u(data, i*4), mask), W, i*4);
VectorStore32x4(VectorPermute32x4(VectorLoad32x4u(data, i*4), mask), W, i*4);
#else
for (unsigned int i=0; i<16; i+=4)
VectorStore32x4u(VectorLoad32x4u(data, i*4), W, i*4);
VectorStore32x4(VectorLoad32x4u(data, i*4), W, i*4);
#endif

// At i=62, W[i-2] reads the 65th and 66th elements. W[] has 2 extra "don't care" elements.
@@ -229,15 +246,15 @@ void sha256_process_p8(uint32_t state[8], const uint8_t data[], uint32_t length)

for (unsigned int i=0; i<64; i+=8)
{
uint32x4_p8 k = VectorLoad32x4u(K, i*4);
uint32x4_p8 w = VectorLoad32x4u(W, i*4);
uint32x4_p8 k = VectorLoad32x4(K, i*4);
uint32x4_p8 w = VectorLoad32x4(W, i*4);
SHA256_ROUND<0>(w,k, a,b,c,d,e,f,g,h);
SHA256_ROUND<1>(w,k, h,a,b,c,d,e,f,g);
SHA256_ROUND<2>(w,k, g,h,a,b,c,d,e,f);
SHA256_ROUND<3>(w,k, f,g,h,a,b,c,d,e);

k = VectorLoad32x4u(K, i*4+16);
w = VectorLoad32x4u(W, i*4+16);
k = VectorLoad32x4(K, i*4+16);
w = VectorLoad32x4(W, i*4+16);
SHA256_ROUND<4>(w,k, e,f,g,h,a,b,c,d);
SHA256_ROUND<5>(w,k, d,e,f,g,h,a,b,c);
SHA256_ROUND<6>(w,k, c,d,e,f,g,h,a,b);
@@ -249,8 +266,8 @@ void sha256_process_p8(uint32_t state[8], const uint8_t data[], uint32_t length)
data += 64;
}

VectorStore32x4u(abcd, state, 0);
VectorStore32x4u(efgh, state, 16);
VectorStore32x4(abcd, state, 0);
VectorStore32x4(efgh, state, 16);
}

#if defined(TEST_MAIN)
37 changes: 27 additions & 10 deletions sha256-p8.cxx
@@ -1,8 +1,9 @@
/* sha256-p8.cxx - Power8 SHA extensions using C intrinsics */
/* Written and placed in public domain by Jeffrey Walton */

/* sha256-p8.cxx rotates working variables in the SHA round */
/* function. Loop unrolling penalizes performance. */
/* sha256-p8.cxx rotates working variables in the SHA round function */
/* and not the caller. Loop unrolling penalizes performance. */
/* Loads and stores: https://gcc.gnu.org/ml/gcc/2015-03/msg00140.html. */

/* xlC -DTEST_MAIN -qarch=pwr8 -qaltivec sha256-p8.cxx -o sha256-p8.exe */
/* g++ -DTEST_MAIN -mcpu=power8 sha256-p8.cxx -o sha256-p8.exe */
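
The revised comment above draws the contrast with sha256-2-p8.cxx: in this file the eight working variables are rotated inside SHA256_ROUND, so every call site passes a,b,c,d,e,f,g,h in the same order, whereas the other file permutes the arguments at each call. The vector round body is not part of this diff, so the following is a plain scalar illustration of the in-function rotation pattern, not the actual Power8 code:

```cpp
#include <stdint.h>

// Scalar illustration of a SHA-256 round that rotates its working variables
// itself; the real file uses Power8 vector intrinsics instead.
static inline uint32_t rotr32(uint32_t x, unsigned int n)
{
    return (x >> n) | (x << (32 - n));
}

static inline void round_rotating(uint32_t& a, uint32_t& b, uint32_t& c, uint32_t& d,
                                  uint32_t& e, uint32_t& f, uint32_t& g, uint32_t& h,
                                  uint32_t k, uint32_t w)
{
    const uint32_t S1  = rotr32(e, 6) ^ rotr32(e, 11) ^ rotr32(e, 25);
    const uint32_t ch  = (e & f) ^ (~e & g);
    const uint32_t T1  = h + S1 + ch + k + w;
    const uint32_t S0  = rotr32(a, 2) ^ rotr32(a, 13) ^ rotr32(a, 22);
    const uint32_t maj = (a & b) ^ (a & c) ^ (b & c);
    const uint32_t T2  = S0 + maj;

    // The rotation happens here, so callers keep the argument order fixed.
    h = g; g = f; f = e; e = d + T1;
    d = c; c = b; b = a; a = T1 + T2;
}

int main()
{
    // One illustrative round on the SHA-256 initial values with K[0] and w = 0.
    uint32_t a=0x6A09E667, b=0xBB67AE85, c=0x3C6EF372, d=0xA54FF53A;
    uint32_t e=0x510E527F, f=0x9B05688C, g=0x1F83D9AB, h=0x5BE0CD19;
    round_rotating(a, b, c, d, e, f, g, h, 0x428A2F98, 0);
    return 0;
}
```
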
@@ -50,6 +51,14 @@ static const ALIGN16 uint32_t K[] =
0x90BEFFFA, 0xA4506CEB, 0xBEF9A3F7, 0xC67178F2
};

// Aligned load
template <class T> static inline
uint32x4_p8 VectorLoad32x4(const T* data, int offset)
{
return vec_ld(offset, (uint32_t*)data);
}

// Unaligned load
template <class T> static inline
uint32x4_p8 VectorLoad32x4u(const T* data, int offset)
{
@@ -60,6 +69,14 @@ uint32x4_p8 VectorLoad32x4u(const T* data, int offset)
#endif
}

// Aligned store
template <class T> static inline
void VectorStore32x4(const uint32x4_p8 val, T* data, int offset)
{
vec_st(val, offset, (uint32_t*)data);
}

// Unaligned store
template <class T> static inline
void VectorStore32x4u(const uint32x4_p8 val, T* data, int offset)
{
@@ -164,10 +181,10 @@ void SHA256_SCHEDULE(uint32_t W[64+2], const uint8_t* data)
#if (__LITTLE_ENDIAN__)
const uint8x16_p8 mask = {3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12};
for (unsigned int i=0; i<16; i+=4)
VectorStore32x4u(VectorPermute32x4(VectorLoad32x4u(data, i*4), mask), W, i*4);
VectorStore32x4(VectorPermute32x4(VectorLoad32x4u(data, i*4), mask), W, i*4);
#else
for (unsigned int i=0; i<16; i+=4)
VectorStore32x4u(VectorLoad32x4u(data, i*4), W, i*4);
VectorStore32x4(VectorLoad32x4u(data, i*4), W, i*4);
#endif

// At i=62, W[i-2] reads the 65th and 66th elements. W[] has 2 extra "don't care" elements.
@@ -213,8 +230,8 @@ void sha256_process_p8(uint32_t state[8], const uint8_t data[], uint32_t length)
// +2 because Schedule reads beyond the last element
ALIGN16 uint32_t W[64+2];

uint32x4_p8 abcd = VectorLoad32x4u(state, 0);
uint32x4_p8 efgh = VectorLoad32x4u(state, 16);
uint32x4_p8 abcd = VectorLoad32x4(state, 0);
uint32x4_p8 efgh = VectorLoad32x4(state, 16);

while (blocks--)
{
@@ -230,8 +247,8 @@ void sha256_process_p8(uint32_t state[8], const uint8_t data[], uint32_t length)

for (unsigned int i=0; i<64; i+=4)
{
const uint32x4_p8 k = VectorLoad32x4u(K, i*4);
const uint32x4_p8 w = VectorLoad32x4u(W, i*4);
const uint32x4_p8 k = VectorLoad32x4(K, i*4);
const uint32x4_p8 w = VectorLoad32x4(W, i*4);
SHA256_ROUND<0>(w,k, a,b,c,d,e,f,g,h);
SHA256_ROUND<1>(w,k, a,b,c,d,e,f,g,h);
SHA256_ROUND<2>(w,k, a,b,c,d,e,f,g,h);
Expand All @@ -243,8 +260,8 @@ void sha256_process_p8(uint32_t state[8], const uint8_t data[], uint32_t length)
data += 64;
}

VectorStore32x4u(abcd, state, 0);
VectorStore32x4u(efgh, state, 16);
VectorStore32x4(abcd, state, 0);
VectorStore32x4(efgh, state, 16);
}
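
One consequence of switching the state load and store to vec_ld/vec_st, noted here as an assumption drawn from the code rather than anything stated in the commit: callers should now hand sha256_process_p8 a 16-byte aligned state buffer, as the file's own TEST_MAIN presumably does with ALIGN16. A hypothetical caller sketch:

```cpp
// Hypothetical caller sketch; the alignment attribute mirrors the file's ALIGN16 macro.
#include <stdint.h>

extern void sha256_process_p8(uint32_t state[8], const uint8_t data[], uint32_t length);

int main()
{
    __attribute__((aligned(16))) uint32_t state[8] = {
        0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,   // SHA-256 initial hash values
        0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
    };
    const uint8_t block[64] = {};           // one all-zero 64-byte block, for illustration
    sha256_process_p8(state, block, 64);    // length is in bytes
    return 0;
}
```
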

#if defined(TEST_MAIN)
