From d4eda2784062837a8aacac4c592fa2e34eb96c9c Mon Sep 17 00:00:00 2001
From: Jeffrey Walton
Date: Tue, 6 Mar 2018 22:43:05 -0500
Subject: [PATCH] Add vec_ld for aligned loads

Same for vec_st. Also see
https://gcc.gnu.org/ml/gcc/2015-03/msg00140.html
---
 README.md       | 10 +++++++++-
 sha256-2-p8.cxx | 33 +++++++++++++++++++++++++--------
 sha256-p8.cxx   | 37 +++++++++++++++++++++++++++----------
 3 files changed, 61 insertions(+), 19 deletions(-)

diff --git a/README.md b/README.md
index f0d5f80..9530c02 100644
--- a/README.md
+++ b/README.md
@@ -36,10 +36,12 @@ The Power8 source file is a work in progress. The main problem at the moment is
 
 Performance testing of SHA-512 has not started.
 
-## Benchmarks
+# Benchmarks
 
 The speedups can be tricky to measure, but concrete numbers are availble from Jack Lloyd's Botan. The relative speedups using a three second benchmark under the command `./botan speed --msec=3000 SHA-1 SHA-224 SHA-256` are as follows. The measurements were taken from a Intel Celeron J3455, and an ARMv8 LeMaker HiKey.
 
+## Intel SHA
+
 * Intel x86, SHA-1, GCC 6.2 - approximately 4.3x
 * Intel x86, SHA-1, Clang 3.8 - approximately 4.5x
 * Intel x86, SHA-224, GCC 6.2 - approximately 5.8x
@@ -47,9 +49,15 @@ The speedups can be tricky to measure, but concrete numbers are availble from Ja
 * Intel x86, SHA-256, GCC 6.2 - approximately 5.8x
 * Intel x86, SHA-256, Clang 3.8 - approximately 5.8x
 
+## ARM SHA
+
 * ARMv8, SHA-1, GCC 4.9 - approximately 4.8x
 * ARMv8, SHA-1, Clang 3.5 - approximately 5.9x
 * ARMv8, SHA-224, GCC 4.9 - approximately 9.2x
 * ARMv8, SHA-224, Clang 3.5 - approximately 12.6x
 * ARMv8, SHA-256, GCC 4.9 - approximately 9.2x
 * ARMv8, SHA-256, Clang 3.5 - approximately 12.6x
+
+## Power8 SHA
+
+To be determined.
\ No newline at end of file
diff --git a/sha256-2-p8.cxx b/sha256-2-p8.cxx
index b0e3791..b5a7bd3 100644
--- a/sha256-2-p8.cxx
+++ b/sha256-2-p8.cxx
@@ -3,6 +3,7 @@
 
 /* sha256-2-p8.cxx rotates working variables in the callers instead of */
 /* the SHA round function. Loop unrolling penalizes performance.       */
+/* Loads and stores: https://gcc.gnu.org/ml/gcc/2015-03/msg00140.html. */
 
 /* xlC -DTEST_MAIN -qarch=pwr8 -qaltivec sha256-p8.cxx -o sha256-p8.exe */
 /* g++ -DTEST_MAIN -mcpu=power8 sha256-p8.cxx -o sha256-p8.exe          */
@@ -50,6 +51,14 @@ static const ALIGN16 uint32_t K[] =
     0x90BEFFFA, 0xA4506CEB, 0xBEF9A3F7, 0xC67178F2
 };
 
+// Aligned load
+template <class T> static inline
+uint32x4_p8 VectorLoad32x4(const T* data, int offset)
+{
+    return vec_ld(offset, (uint32_t*)data);
+}
+
+// Unaligned load
 template <class T> static inline
 uint32x4_p8 VectorLoad32x4u(const T* data, int offset)
 {
@@ -60,6 +69,14 @@ uint32x4_p8 VectorLoad32x4u(const T* data, int offset)
 #endif
 }
 
+// Aligned store
+template <class T> static inline
+void VectorStore32x4(const uint32x4_p8 val, T* data, int offset)
+{
+    vec_st(val, offset, (uint32_t*)data);
+}
+
+// Unaligned store
 template <class T> static inline
 void VectorStore32x4u(const uint32x4_p8 val, T* data, int offset)
 {
@@ -164,10 +181,10 @@ void SHA256_SCHEDULE(uint32_t W[64+2], const uint8_t* data)
 #if (__LITTLE_ENDIAN__)
     const uint8x16_p8 mask = {3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12};
     for (unsigned int i=0; i<16; i+=4)
-        VectorStore32x4u(VectorPermute32x4(VectorLoad32x4u(data, i*4), mask), W, i*4);
+        VectorStore32x4(VectorPermute32x4(VectorLoad32x4u(data, i*4), mask), W, i*4);
 #else
     for (unsigned int i=0; i<16; i+=4)
-        VectorStore32x4u(VectorLoad32x4u(data, i*4), W, i*4);
+        VectorStore32x4(VectorLoad32x4u(data, i*4), W, i*4);
 #endif
 
     // At i=62, W[i-2] reads the 65th and 66th elements. W[] has 2 extra "don't care" elements.
@@ -229,15 +246,15 @@ void sha256_process_p8(uint32_t state[8], const uint8_t data[], uint32_t length)
 
         for (unsigned int i=0; i<64; i+=8)
         {
-            uint32x4_p8 k = VectorLoad32x4u(K, i*4);
-            uint32x4_p8 w = VectorLoad32x4u(W, i*4);
+            uint32x4_p8 k = VectorLoad32x4(K, i*4);
+            uint32x4_p8 w = VectorLoad32x4(W, i*4);
             SHA256_ROUND<0>(w,k, a,b,c,d,e,f,g,h);
             SHA256_ROUND<1>(w,k, h,a,b,c,d,e,f,g);
             SHA256_ROUND<2>(w,k, g,h,a,b,c,d,e,f);
             SHA256_ROUND<3>(w,k, f,g,h,a,b,c,d,e);
 
-            k = VectorLoad32x4u(K, i*4+16);
-            w = VectorLoad32x4u(W, i*4+16);
+            k = VectorLoad32x4(K, i*4+16);
+            w = VectorLoad32x4(W, i*4+16);
             SHA256_ROUND<4>(w,k, e,f,g,h,a,b,c,d);
             SHA256_ROUND<5>(w,k, d,e,f,g,h,a,b,c);
             SHA256_ROUND<6>(w,k, c,d,e,f,g,h,a,b);
@@ -249,8 +266,8 @@ void sha256_process_p8(uint32_t state[8], const uint8_t data[], uint32_t length)
         data += 64;
     }
 
-    VectorStore32x4u(abcd, state, 0);
-    VectorStore32x4u(efgh, state, 16);
+    VectorStore32x4(abcd, state, 0);
+    VectorStore32x4(efgh, state, 16);
 }
 
 #if defined(TEST_MAIN)
diff --git a/sha256-p8.cxx b/sha256-p8.cxx
index a23807e..66be46b 100644
--- a/sha256-p8.cxx
+++ b/sha256-p8.cxx
@@ -1,8 +1,9 @@
 /* sha256-p8.cxx - Power8 SHA extensions using C intrinsics */
 /* Written and placed in public domain by Jeffrey Walton    */
 
-/* sha256-p8.cxx rotates working variables in the SHA round */
-/* function. Loop unrolling penalizes performance.          */
+/* sha256-p8.cxx rotates working variables in the SHA round function   */
+/* and not the caller. Loop unrolling penalizes performance.           */
+/* Loads and stores: https://gcc.gnu.org/ml/gcc/2015-03/msg00140.html. */
 
 /* xlC -DTEST_MAIN -qarch=pwr8 -qaltivec sha256-p8.cxx -o sha256-p8.exe */
 /* g++ -DTEST_MAIN -mcpu=power8 sha256-p8.cxx -o sha256-p8.exe          */
@@ -50,6 +51,14 @@ static const ALIGN16 uint32_t K[] =
     0x90BEFFFA, 0xA4506CEB, 0xBEF9A3F7, 0xC67178F2
 };
 
+// Aligned load
+template <class T> static inline
+uint32x4_p8 VectorLoad32x4(const T* data, int offset)
+{
+    return vec_ld(offset, (uint32_t*)data);
+}
+
+// Unaligned load
 template <class T> static inline
 uint32x4_p8 VectorLoad32x4u(const T* data, int offset)
 {
@@ -60,6 +69,14 @@ uint32x4_p8 VectorLoad32x4u(const T* data, int offset)
 #endif
 }
 
+// Aligned store
+template <class T> static inline
+void VectorStore32x4(const uint32x4_p8 val, T* data, int offset)
+{
+    vec_st(val, offset, (uint32_t*)data);
+}
+
+// Unaligned store
 template <class T> static inline
 void VectorStore32x4u(const uint32x4_p8 val, T* data, int offset)
 {
@@ -164,10 +181,10 @@ void SHA256_SCHEDULE(uint32_t W[64+2], const uint8_t* data)
 #if (__LITTLE_ENDIAN__)
     const uint8x16_p8 mask = {3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12};
     for (unsigned int i=0; i<16; i+=4)
-        VectorStore32x4u(VectorPermute32x4(VectorLoad32x4u(data, i*4), mask), W, i*4);
+        VectorStore32x4(VectorPermute32x4(VectorLoad32x4u(data, i*4), mask), W, i*4);
 #else
     for (unsigned int i=0; i<16; i+=4)
-        VectorStore32x4u(VectorLoad32x4u(data, i*4), W, i*4);
+        VectorStore32x4(VectorLoad32x4u(data, i*4), W, i*4);
 #endif
 
     // At i=62, W[i-2] reads the 65th and 66th elements. W[] has 2 extra "don't care" elements.
@@ -213,8 +230,8 @@ void sha256_process_p8(uint32_t state[8], const uint8_t data[], uint32_t length)
     // +2 because Schedule reads beyond the last element
     ALIGN16 uint32_t W[64+2];
 
-    uint32x4_p8 abcd = VectorLoad32x4u(state, 0);
-    uint32x4_p8 efgh = VectorLoad32x4u(state, 16);
+    uint32x4_p8 abcd = VectorLoad32x4(state, 0);
+    uint32x4_p8 efgh = VectorLoad32x4(state, 16);
 
     while (blocks--)
     {
@@ -230,8 +247,8 @@ void sha256_process_p8(uint32_t state[8], const uint8_t data[], uint32_t length)
 
         for (unsigned int i=0; i<64; i+=4)
         {
-            const uint32x4_p8 k = VectorLoad32x4u(K, i*4);
-            const uint32x4_p8 w = VectorLoad32x4u(W, i*4);
+            const uint32x4_p8 k = VectorLoad32x4(K, i*4);
+            const uint32x4_p8 w = VectorLoad32x4(W, i*4);
             SHA256_ROUND<0>(w,k, a,b,c,d,e,f,g,h);
             SHA256_ROUND<1>(w,k, a,b,c,d,e,f,g,h);
             SHA256_ROUND<2>(w,k, a,b,c,d,e,f,g,h);
@@ -243,8 +260,8 @@ void sha256_process_p8(uint32_t state[8], const uint8_t data[], uint32_t length)
         data += 64;
    }
 
-    VectorStore32x4u(abcd, state, 0);
-    VectorStore32x4u(efgh, state, 16);
+    VectorStore32x4(abcd, state, 0);
+    VectorStore32x4(efgh, state, 16);
 }
 
 #if defined(TEST_MAIN)
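
Note (illustrative material appended after the patch, not part of it): the new aligned helpers are only correct because the buffers they touch, such as the ALIGN16 K[] and W[] arrays, are 16-byte aligned. vec_ld and vec_st clear the low four bits of the effective address, so a misaligned pointer is silently rounded down rather than faulting, while the unaligned intrinsics (vec_vsx_ld under GCC, vec_xl under XL C, which VectorLoad32x4u presumably wraps) accept any address. That is also why the caller-supplied data pointer still goes through VectorLoad32x4u. The sketch below is a minimal standalone illustration of the difference under those assumptions; the file name, buffer names, and compile lines are illustrative, and the uint32x4_p8 typedef mirrors the one in the source files.

/* vec-ld-demo.cxx - illustrative sketch only, not part of this patch.       */
/* Shows aligned vec_ld/vec_st versus the unaligned vec_vsx_ld/vec_xl.       */
/* Assumed compile lines, mirroring the source file headers:                 */
/*   g++ -mcpu=power8 vec-ld-demo.cxx -o vec-ld-demo.exe                     */
/*   xlC -qarch=pwr8 -qaltivec vec-ld-demo.cxx -o vec-ld-demo.exe            */

#include <altivec.h>
#include <stdint.h>
#include <stdio.h>

typedef __vector unsigned int uint32x4_p8;

int main()
{
    /* 16-byte aligned source buffer, analogous to the ALIGN16 K[] and W[]. */
    __attribute__((aligned(16))) uint32_t buf[8] = {0, 1, 2, 3, 4, 5, 6, 7};

    /* Aligned load and store: buf+0 is 16-byte aligned, so vec_ld/vec_st   */
    /* are safe. vec_ld computes (buf + offset) & ~0xF, so the pointer plus */
    /* byte offset must already be 16-byte aligned.                         */
    uint32x4_p8 a = vec_ld(0, buf);
    vec_st(a, 0, buf);

    /* Unaligned load: buf plus 4 bytes is not 16-byte aligned. vec_ld(4,   */
    /* buf) would silently round the address down and return {0,1,2,3}      */
    /* again, while the unaligned intrinsics return {1,2,3,4} as intended.  */
#if defined(__xlc__) || defined(__xlC__)
    uint32x4_p8 u = vec_xl(4, buf);
#else
    uint32x4_p8 u = vec_vsx_ld(4, buf);
#endif

    __attribute__((aligned(16))) uint32_t out[4];
    vec_st(u, 0, out);
    printf("%u %u %u %u\n", out[0], out[1], out[2], out[3]);  /* 1 2 3 4 */
    return 0;
}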