Skip to content

Commit

Permalink
get mean30x4 to compile, but still crash
Browse files Browse the repository at this point in the history
  • Loading branch information
tromp committed Mar 24, 2018
1 parent 26c7360 commit ebbe9cb
Show file tree
Hide file tree
Showing 4 changed files with 34 additions and 44 deletions.
11 changes: 5 additions & 6 deletions doc/cuckoo.tex
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
on cycle length, guiding the latter's choice.
Trading in the other direction, algorithm "mean" uses a few dozen bits per edge but is 4x faster
in practice, becoming memory bandwidth bound rather than latency bound.
We present performance figures for optimized CPU and GPU implementations, including a CUDA port for GPUs,
We present performance figures for optimized CPU and GPU implementations,
and discuss possible ASIC implementations.
% Both algorithms are shown to parallelize well.
\end{abstract}
Expand Down Expand Up @@ -81,11 +81,10 @@ \section{Introduction}

\section{Motivation}
Cuckoo Cycle aims to be an ``egalitarian'' proof-of-work, that is,
to minimize performance-per-dollar differences across hardware architectures,
and make mining---the process of looking for proofs---on commodity hardware cost-effective.
This is to be achieved by making main memory latency a bottleneck, since
DRAM latencies have remained relatively stable while cpu-speed and memory bandwidth vary highly
across hardware architecture and process technology.
to minimize performance-per-dollar differences across hardware architectures.
This is to be achieved by making main memory latency or bandwidth the bottleneck, since
DRAM latency and bandwidth vary much less across hardware architecture and process technology
than compute power.

Our aim of a memory bound PoW translates to the following desirable properties:

Expand Down
2 changes: 1 addition & 1 deletion src/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@ mean28sx1: cuckoo.h siphash.h mean_miner.hpp mean_miner.cpp Makefile
$(GPP) -o $@ -DSAVEEDGES -DXBITS=6 -DNSIPHASH=1 -DEDGEBITS=27 mean_miner.cpp $(LIBS)

mean30x4: cuckoo.h siphash.h mean_miner.hpp mean_miner.cpp Makefile
$(GPP) -o $@ -msse2 -DNSIPHASH=4 -DEDGEBITS=29 mean_miner.cpp $(LIBS)
$(GPP) -o $@ -mno-avx2 -DNSIPHASH=4 -DEDGEBITS=29 mean_miner.cpp $(LIBS)

mean30x8: cuckoo.h siphash.h mean_miner.hpp mean_miner.cpp Makefile
$(GPP) -o $@ -mavx2 -DNSIPHASH=8 -DEDGEBITS=29 mean_miner.cpp $(LIBS)
Expand Down
51 changes: 23 additions & 28 deletions src/mean_miner.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -333,10 +333,8 @@ class edgetrimmer {
const u32 endy = NY * (id+1) / nthreads;
u32 edge = starty << YZBITS, endedge = edge + NYZ;
#if NSIPHASH == 4
static const __m256i vxmask = {XMASK, XMASK};
static const __m256i vyzmask = {YZMASK, YZMASK};
const __m128i vinit0 = _mm_load_si128((__m128i *)&sip_keys);
const __m128i vinit1 = _mm_load_si128((__m128i *)(&sip_keys + 2));
static const __m128i vxmask = {XMASK, XMASK};
static const __m128i vyzmask = {YZMASK, YZMASK};
__m128i v0, v1, v2, v3, v4, v5, v6, v7;
const u32 e2 = 2 * edge + uorv;
__m128i vpacket0 = _mm_set_epi64x(e2+2, e2+0);
Expand Down Expand Up @@ -389,41 +387,41 @@ class edgetrimmer {
}
#endif
#elif NSIPHASH == 4
v7 = v3 = _mm_unpackhi_epi64(vinit1, vinit1); // _mm_shuffle_epi32(vinit1, 0xee);
v4 = v0 = _mm_unpacklo_epi64(vinit0, vinit0); // _mm_shuffle_epi32(vinit0, 0x44);
v5 = v1 = _mm_unpackhi_epi64(vinit0, vinit0); // _mm_shuffle_epi32(vinit0, 0xee);
v6 = v2 = _mm_unpacklo_epi64(vinit1, vinit1); // _mm_shuffle_epi32(vinit1, 0x44);
v7 = v3 = _mm_set1_epi64x(sip_keys.k3);
v4 = v0 = _mm_set1_epi64x(sip_keys.k0);
v5 = v1 = _mm_set1_epi64x(sip_keys.k1);
v6 = v2 = _mm_set1_epi64x(sip_keys.k2);

v3 = XOR(v3,vpacket0); v7 = XOR(v7,vpacket1);
SIPROUNDX2N; SIPROUNDX2N;
v0 = XOR(v0,vpacket0); v4 = XOR(v4,vpacket1);
v2 = XOR(v2,_mm128_broadcastq_epi64(_mm_cvtsi64_si128(0xff)));
v6 = XOR(v6,_mm128_broadcastq_epi64(_mm_cvtsi64_si128(0xff)));
v2 = XOR(v2, _mm_set1_epi64x(0xffLL));
v6 = XOR(v6, _mm_set1_epi64x(0xffLL));
SIPROUNDX2N; SIPROUNDX2N; SIPROUNDX2N; SIPROUNDX2N;
v0 = XOR(XOR(v0,v1),XOR(v2,v3));
v4 = XOR(XOR(v4,v5),XOR(v6,v7));

vpacket0 = _mm128_add_epi64(vpacket0, vpacketinc);
vpacket1 = _mm128_add_epi64(vpacket1, vpacketinc);
v1 = _mm128_srli_epi64(v0, YZBITS) & vxmask;
v5 = _mm128_srli_epi64(v4, YZBITS) & vxmask;
vpacket0 = _mm_add_epi64(vpacket0, vpacketinc);
vpacket1 = _mm_add_epi64(vpacket1, vpacketinc);
v1 = _mm_srli_epi64(v0, YZBITS) & vxmask;
v5 = _mm_srli_epi64(v4, YZBITS) & vxmask;
v0 = (v0 & vyzmask) | vhi0;
v4 = (v4 & vyzmask) | vhi1;
vhi0 = _mm128_add_epi64(vhi0, vhiinc);
vhi1 = _mm128_add_epi64(vhi1, vhiinc);
vhi0 = _mm_add_epi64(vhi0, vhiinc);
vhi1 = _mm_add_epi64(vhi1, vhiinc);

u32 ux;
#ifndef NEEDSYNC
#define STORE0(i,v,x,w) \
ux = _mm128_extract_epi32(v,x);\
ux = _mm_extract_epi32(v,x);\
*(u64 *)(base+dst.index[ux]) = _mm_extract_epi64(w,i%2);\
dst.index[ux] += BIGSIZE0;
#else
u32 zz;
#define STORE0(i,v,x,w) \
zz = _mm128_extract_epi32(w,x);\
zz = _mm_extract_epi32(w,x);\
if (i || likely(zz)) {\
ux = _mm128_extract_epi32(v,x);\
ux = _mm_extract_epi32(v,x);\
for (; unlikely(last[ux] + NNONYZ <= edge+i); last[ux] += NNONYZ, dst.index[ux] += BIGSIZE0)\
*(u32 *)(base+dst.index[ux]) = 0;\
*(u32 *)(base+dst.index[ux]) = zz;\
Expand Down Expand Up @@ -1214,16 +1212,13 @@ class solver_ctx {
}
// bit 39..21 20..13 12..0
// write edge YYYYYY ZZZZZ
#elif NSIPHASH == 4
#elif NSIPHASH == 8
v3 = _mm256_permute4x64_epi64(vinit, 0xFF);
v0 = _mm256_permute4x64_epi64(vinit, 0x00);
v1 = _mm256_permute4x64_epi64(vinit, 0x55);
v2 = _mm256_permute4x64_epi64(vinit, 0xAA);
v7 = _mm256_permute4x64_epi64(vinit, 0xFF);
v4 = _mm256_permute4x64_epi64(vinit, 0x00);
v5 = _mm256_permute4x64_epi64(vinit, 0x55);
v6 = _mm256_permute4x64_epi64(vinit, 0xAA);

v7 = v3 = _mm256_permute4x64_epi64(vinit, 0xFF);
v4 = v0 = _mm256_permute4x64_epi64(vinit, 0x00);
v5 = v1 = _mm256_permute4x64_epi64(vinit, 0x55);
v6 = v2 = _mm256_permute4x64_epi64(vinit, 0xAA);

v3 = XOR(v3,vpacket0); v7 = XOR(v7,vpacket1);
SIPROUNDX2N; SIPROUNDX2N;
v0 = XOR(v0,vpacket0); v4 = XOR(v4,vpacket1);
Expand Down
14 changes: 5 additions & 9 deletions src/siphashxN.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,10 +18,10 @@
#define ADD(a, b) _mm_add_epi64(a, b)
#define XOR(a, b) _mm_xor_si128(a, b)
#define ROT13(x) _mm_or_si128(_mm_slli_epi64(x,13),_mm_srli_epi64(x,51))
#define ROT16(x) _mm_shufflehi_epi16(_mm_shufflelo_epi16(x, _MM_SHUFFLE(2,1,0,3)), _MM_SHUFFLE(2,1,0,3))
#define ROT17(x) _mm_or_si128(_mm_slli_epi64(x,17),_mm_srli_epi64(x,47))
#define ROT21(x) _mm_or_si128(_mm_slli_epi64(x,21),_mm_srli_epi64(x,43))
#define ROT32(x) _mm_shuffle_epi32 (x, _MM_SHUFFLE(2,3,0,1))
#define ROT16(x) _mm_shufflehi_epi16(_mm_shufflelo_epi16(x, _MM_SHUFFLE(2,1,0,3)), _MM_SHUFFLE(2,1,0,3))

#endif

Expand Down Expand Up @@ -176,14 +176,10 @@ void siphash24x2(const siphash_keys *keys, const u64 *indices, u64 *hashes) {
// 4-way sipHash-2-4 specialized to precomputed key and 8 byte nonces
void siphash24x4(const siphash_keys *keys, const u64 *indices, u64 *hashes) {
__m128i v0, v1, v2, v3, mi, v4, v5, v6, v7, m2;
v0 = _mm_set1_epi64x(keys->k0);
v1 = _mm_set1_epi64x(keys->k1);
v2 = _mm_set1_epi64x(keys->k2);
v3 = _mm_set1_epi64x(keys->k3);
v4 = _mm_set1_epi64x(keys->k0);
v5 = _mm_set1_epi64x(keys->k1);
v6 = _mm_set1_epi64x(keys->k2);
v7 = _mm_set1_epi64x(keys->k3);
v4 = v0 = _mm_set1_epi64x(keys->k0);
v5 = v1 = _mm_set1_epi64x(keys->k1);
v6 = v2 = _mm_set1_epi64x(keys->k2);
v7 = v3 = _mm_set1_epi64x(keys->k3);

mi = _mm_load_si128((__m128i *)indices);
m2 = _mm_load_si128((__m128i *)(indices + 2));
Expand Down

0 comments on commit ebbe9cb

Please sign in to comment.