Add vec_ld for aligned loads
noloader committed Mar 7, 2018
1 parent 9bb28a7 commit d4eda27
Showing 3 changed files with 61 additions and 19 deletions.
10 changes: 9 additions & 1 deletion README.md
@@ -36,20 +36,28 @@ The Power8 source file is a work in progress. The main problem at the moment is

Performance testing of SHA-512 has not started.

## Benchmarks
# Benchmarks

The speedups can be tricky to measure, but concrete numbers are available from Jack Lloyd's Botan. The relative speedups using a three-second benchmark under the command `./botan speed --msec=3000 SHA-1 SHA-224 SHA-256` are as follows. The measurements were taken on an Intel Celeron J3455 and an ARMv8 LeMaker HiKey.

## Intel SHA

* Intel x86, SHA-1, GCC 6.2 - approximately 4.3x
* Intel x86, SHA-1, Clang 3.8 - approximately 4.5x
* Intel x86, SHA-224, GCC 6.2 - approximately 5.8x
* Intel x86, SHA-224, Clang 3.8 - approximately 5.8x
* Intel x86, SHA-256, GCC 6.2 - approximately 5.8x
* Intel x86, SHA-256, Clang 3.8 - approximately 5.8x

## ARM SHA

* ARMv8, SHA-1, GCC 4.9 - approximately 4.8x
* ARMv8, SHA-1, Clang 3.5 - approximately 5.9x
* ARMv8, SHA-224, GCC 4.9 - approximately 9.2x
* ARMv8, SHA-224, Clang 3.5 - approximately 12.6x
* ARMv8, SHA-256, GCC 4.9 - approximately 9.2x
* ARMv8, SHA-256, Clang 3.5 - approximately 12.6x

## Power8 SHA

To be determined.
33 changes: 25 additions & 8 deletions sha256-2-p8.cxx
@@ -3,6 +3,7 @@

/* sha256-2-p8.cxx rotates working variables in the callers instead of */
/* the SHA round function. Loop unrolling penalizes performance. */
/* Loads and stores: https://gcc.gnu.org/ml/gcc/2015-03/msg00140.html. */

/* xlC -DTEST_MAIN -qarch=pwr8 -qaltivec sha256-p8.cxx -o sha256-p8.exe */
/* g++ -DTEST_MAIN -mcpu=power8 sha256-p8.cxx -o sha256-p8.exe */
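
For readers unfamiliar with the Power vector load flavors the new comment links to: vec_ld performs a 16-byte aligned load (the hardware ignores the low four bits of the effective address), while vec_vsx_ld tolerates unaligned addresses. The unaligned helper bodies are collapsed in this diff, so vec_vsx_ld below is shown only as the usual unaligned counterpart, not as the commit's actual code. A minimal standalone sketch, assuming a POWER8 toolchain with VSX enabled:

```cpp
// Minimal sketch (assumption: g++ -mcpu=power8, VSX available).
// vec_ld requires a 16-byte aligned effective address; vec_vsx_ld does not.
#include <altivec.h>
#include <stdint.h>

typedef __vector unsigned int uint32x4_p8;

int main()
{
    __attribute__((aligned(16))) uint32_t buf[8] = {1,2,3,4,5,6,7,8};

    uint32x4_p8 lo = vec_ld(0, buf);       // aligned load of buf[0..3]
    uint32x4_p8 hi = vec_vsx_ld(16, buf);  // unaligned-capable load of buf[4..7]

    vec_st(lo, 16, buf);                   // aligned store of lo into buf[4..7]
    (void)hi;
    return 0;
}
```
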
@@ -50,6 +51,14 @@ static const ALIGN16 uint32_t K[] =
0x90BEFFFA, 0xA4506CEB, 0xBEF9A3F7, 0xC67178F2
};

// Aligned load
template <class T> static inline
uint32x4_p8 VectorLoad32x4(const T* data, int offset)
{
return vec_ld(offset, (uint32_t*)data);
}

// Unaligned load
template <class T> static inline
uint32x4_p8 VectorLoad32x4u(const T* data, int offset)
{
@@ -60,6 +69,14 @@ uint32x4_p8 VectorLoad32x4u(const T* data, int offset)
#endif
}

// Aligned store
template <class T> static inline
void VectorStore32x4(const uint32x4_p8 val, T* data, int offset)
{
vec_st(val, offset, (uint32_t*)data);
}

// Unaligned store
template <class T> static inline
void VectorStore32x4u(const uint32x4_p8 val, T* data, int offset)
{
@@ -164,10 +181,10 @@ void SHA256_SCHEDULE(uint32_t W[64+2], const uint8_t* data)
#if (__LITTLE_ENDIAN__)
const uint8x16_p8 mask = {3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12};
for (unsigned int i=0; i<16; i+=4)
VectorStore32x4u(VectorPermute32x4(VectorLoad32x4u(data, i*4), mask), W, i*4);
VectorStore32x4(VectorPermute32x4(VectorLoad32x4u(data, i*4), mask), W, i*4);
#else
for (unsigned int i=0; i<16; i+=4)
VectorStore32x4u(VectorLoad32x4u(data, i*4), W, i*4);
VectorStore32x4(VectorLoad32x4u(data, i*4), W, i*4);
#endif

// At i=62, W[i-2] reads the 65th and 66th elements. W[] has 2 extra "don't care" elements.
@@ -229,15 +246,15 @@ void sha256_process_p8(uint32_t state[8], const uint8_t data[], uint32_t length)

for (unsigned int i=0; i<64; i+=8)
{
uint32x4_p8 k = VectorLoad32x4u(K, i*4);
uint32x4_p8 w = VectorLoad32x4u(W, i*4);
uint32x4_p8 k = VectorLoad32x4(K, i*4);
uint32x4_p8 w = VectorLoad32x4(W, i*4);
SHA256_ROUND<0>(w,k, a,b,c,d,e,f,g,h);
SHA256_ROUND<1>(w,k, h,a,b,c,d,e,f,g);
SHA256_ROUND<2>(w,k, g,h,a,b,c,d,e,f);
SHA256_ROUND<3>(w,k, f,g,h,a,b,c,d,e);

k = VectorLoad32x4u(K, i*4+16);
w = VectorLoad32x4u(W, i*4+16);
k = VectorLoad32x4(K, i*4+16);
w = VectorLoad32x4(W, i*4+16);
SHA256_ROUND<4>(w,k, e,f,g,h,a,b,c,d);
SHA256_ROUND<5>(w,k, d,e,f,g,h,a,b,c);
SHA256_ROUND<6>(w,k, c,d,e,f,g,h,a,b);
@@ -249,8 +266,8 @@ void sha256_process_p8(uint32_t state[8], const uint8_t data[], uint32_t length)
data += 64;
}

VectorStore32x4u(abcd, state, 0);
VectorStore32x4u(efgh, state, 16);
VectorStore32x4(abcd, state, 0);
VectorStore32x4(efgh, state, 16);
}

#if defined(TEST_MAIN)
37 changes: 27 additions & 10 deletions sha256-p8.cxx
@@ -1,8 +1,9 @@
/* sha256-p8.cxx - Power8 SHA extensions using C intrinsics */
/* Written and placed in public domain by Jeffrey Walton */

/* sha256-p8.cxx rotates working variables in the SHA round */
/* function. Loop unrolling penalizes performance. */
/* sha256-p8.cxx rotates working variables in the SHA round function */
/* and not the caller. Loop unrolling penalizes performance. */
/* Loads and stores: https://gcc.gnu.org/ml/gcc/2015-03/msg00140.html. */

/* xlC -DTEST_MAIN -qarch=pwr8 -qaltivec sha256-p8.cxx -o sha256-p8.exe */
/* g++ -DTEST_MAIN -mcpu=power8 sha256-p8.cxx -o sha256-p8.exe */
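
The revised comment above draws the contrast with sha256-2-p8.cxx: in this file the eight working variables are rotated inside SHA256_ROUND, so every call site passes a,b,c,d,e,f,g,h in the same order, whereas the other file permutes the arguments at each call. The vector round body is not part of this diff, so the following is a plain scalar illustration of the in-function rotation pattern, not the actual Power8 code:

```cpp
#include <stdint.h>

// Scalar illustration of a SHA-256 round that rotates its working variables
// itself; the real file uses Power8 vector intrinsics instead.
static inline uint32_t rotr32(uint32_t x, unsigned int n)
{
    return (x >> n) | (x << (32 - n));
}

static inline void round_rotating(uint32_t& a, uint32_t& b, uint32_t& c, uint32_t& d,
                                  uint32_t& e, uint32_t& f, uint32_t& g, uint32_t& h,
                                  uint32_t k, uint32_t w)
{
    const uint32_t S1  = rotr32(e, 6) ^ rotr32(e, 11) ^ rotr32(e, 25);
    const uint32_t ch  = (e & f) ^ (~e & g);
    const uint32_t T1  = h + S1 + ch + k + w;
    const uint32_t S0  = rotr32(a, 2) ^ rotr32(a, 13) ^ rotr32(a, 22);
    const uint32_t maj = (a & b) ^ (a & c) ^ (b & c);
    const uint32_t T2  = S0 + maj;

    // The rotation happens here, so callers keep the argument order fixed.
    h = g; g = f; f = e; e = d + T1;
    d = c; c = b; b = a; a = T1 + T2;
}

int main()
{
    // One illustrative round on the SHA-256 initial values with K[0] and w = 0.
    uint32_t a=0x6A09E667, b=0xBB67AE85, c=0x3C6EF372, d=0xA54FF53A;
    uint32_t e=0x510E527F, f=0x9B05688C, g=0x1F83D9AB, h=0x5BE0CD19;
    round_rotating(a, b, c, d, e, f, g, h, 0x428A2F98, 0);
    return 0;
}
```
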
@@ -50,6 +51,14 @@ static const ALIGN16 uint32_t K[] =
0x90BEFFFA, 0xA4506CEB, 0xBEF9A3F7, 0xC67178F2
};

// Aligned load
template <class T> static inline
uint32x4_p8 VectorLoad32x4(const T* data, int offset)
{
return vec_ld(offset, (uint32_t*)data);
}

// Unaligned load
template <class T> static inline
uint32x4_p8 VectorLoad32x4u(const T* data, int offset)
{
@@ -60,6 +69,14 @@ uint32x4_p8 VectorLoad32x4u(const T* data, int offset)
#endif
}

// Aligned store
template <class T> static inline
void VectorStore32x4(const uint32x4_p8 val, T* data, int offset)
{
vec_st(val, offset, (uint32_t*)data);
}

// Unaligned store
template <class T> static inline
void VectorStore32x4u(const uint32x4_p8 val, T* data, int offset)
{
@@ -164,10 +181,10 @@ void SHA256_SCHEDULE(uint32_t W[64+2], const uint8_t* data)
#if (__LITTLE_ENDIAN__)
const uint8x16_p8 mask = {3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12};
for (unsigned int i=0; i<16; i+=4)
VectorStore32x4u(VectorPermute32x4(VectorLoad32x4u(data, i*4), mask), W, i*4);
VectorStore32x4(VectorPermute32x4(VectorLoad32x4u(data, i*4), mask), W, i*4);
#else
for (unsigned int i=0; i<16; i+=4)
VectorStore32x4u(VectorLoad32x4u(data, i*4), W, i*4);
VectorStore32x4(VectorLoad32x4u(data, i*4), W, i*4);
#endif

// At i=62, W[i-2] reads the 65th and 66th elements. W[] has 2 extra "don't care" elements.
@@ -213,8 +230,8 @@ void sha256_process_p8(uint32_t state[8], const uint8_t data[], uint32_t length)
// +2 because Schedule reads beyond the last element
ALIGN16 uint32_t W[64+2];

uint32x4_p8 abcd = VectorLoad32x4u(state, 0);
uint32x4_p8 efgh = VectorLoad32x4u(state, 16);
uint32x4_p8 abcd = VectorLoad32x4(state, 0);
uint32x4_p8 efgh = VectorLoad32x4(state, 16);

while (blocks--)
{
@@ -230,8 +247,8 @@ void sha256_process_p8(uint32_t state[8], const uint8_t data[], uint32_t length)

for (unsigned int i=0; i<64; i+=4)
{
const uint32x4_p8 k = VectorLoad32x4u(K, i*4);
const uint32x4_p8 w = VectorLoad32x4u(W, i*4);
const uint32x4_p8 k = VectorLoad32x4(K, i*4);
const uint32x4_p8 w = VectorLoad32x4(W, i*4);
SHA256_ROUND<0>(w,k, a,b,c,d,e,f,g,h);
SHA256_ROUND<1>(w,k, a,b,c,d,e,f,g,h);
SHA256_ROUND<2>(w,k, a,b,c,d,e,f,g,h);
Expand All @@ -243,8 +260,8 @@ void sha256_process_p8(uint32_t state[8], const uint8_t data[], uint32_t length)
data += 64;
}

VectorStore32x4u(abcd, state, 0);
VectorStore32x4u(efgh, state, 16);
VectorStore32x4(abcd, state, 0);
VectorStore32x4(efgh, state, 16);
}
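
One consequence of switching the state load and store to vec_ld/vec_st, noted here as an assumption drawn from the code rather than anything stated in the commit: callers should now hand sha256_process_p8 a 16-byte aligned state buffer, as the file's own TEST_MAIN presumably does with ALIGN16. A hypothetical caller sketch:

```cpp
// Hypothetical caller sketch; the alignment attribute mirrors the file's ALIGN16 macro.
#include <stdint.h>

extern void sha256_process_p8(uint32_t state[8], const uint8_t data[], uint32_t length);

int main()
{
    __attribute__((aligned(16))) uint32_t state[8] = {
        0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,   // SHA-256 initial hash values
        0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
    };
    const uint8_t block[64] = {};           // one all-zero 64-byte block, for illustration
    sha256_process_p8(state, block, 64);    // length is in bytes
    return 0;
}
```
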

#if defined(TEST_MAIN)
