Skip to content

Commit

Permalink
get mean30x4 to compile, but still crash
Browse files Browse the repository at this point in the history
  • Loading branch information
tromp committed Mar 24, 2018
1 parent 26c7360 commit ebbe9cb
Show file tree
Hide file tree
Showing 4 changed files with 34 additions and 44 deletions.
11 changes: 5 additions & 6 deletions doc/cuckoo.tex
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
on cycle length, guiding the latter's choice.
Trading in the other direction, algorithm "mean" uses a few dozen bits per edge but is 4x faster
in practice, becoming memory bandwidth bound rather than latency bound.
We present performance figures for optimized CPU and GPU implementations, including a CUDA port for GPUs,
We present performance figures for optimized CPU and GPU implementations,
and discuss possible ASIC implementations.
% Both algorithms are shown to parallelize well.
\end{abstract}
Expand Down Expand Up @@ -81,11 +81,10 @@ \section{Introduction}

\section{Motivation}
Cuckoo Cycle aims to be an ``egalitarian'' proof-of-work, that is,
to minimize performance-per-dollar differences across hardware architectures,
and make mining---the process of looking for proofs---on commodity hardware cost-effective.
This is to be achieved by making main memory latency a bottleneck, since
DRAM latencies have remained relatively stable while cpu-speed and memory bandwidth vary highly
across hardware architecture and process technology.
to minimize performance-per-dollar differences across hardware architectures.
This is to be achieved by making main memory latency or bandwidth the bottleneck, since
DRAM latency and bandwidth vary much less across hardware architecture and process technology
than compute power.

Our aim of a memory bound PoW translates to the following desirable properties:

Expand Down
2 changes: 1 addition & 1 deletion src/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@ mean28sx1: cuckoo.h siphash.h mean_miner.hpp mean_miner.cpp Makefile
$(GPP) -o $@ -DSAVEEDGES -DXBITS=6 -DNSIPHASH=1 -DEDGEBITS=27 mean_miner.cpp $(LIBS)

mean30x4: cuckoo.h siphash.h mean_miner.hpp mean_miner.cpp Makefile
$(GPP) -o $@ -msse2 -DNSIPHASH=4 -DEDGEBITS=29 mean_miner.cpp $(LIBS)
$(GPP) -o $@ -mno-avx2 -DNSIPHASH=4 -DEDGEBITS=29 mean_miner.cpp $(LIBS)

mean30x8: cuckoo.h siphash.h mean_miner.hpp mean_miner.cpp Makefile
$(GPP) -o $@ -mavx2 -DNSIPHASH=8 -DEDGEBITS=29 mean_miner.cpp $(LIBS)
Expand Down
51 changes: 23 additions & 28 deletions src/mean_miner.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -333,10 +333,8 @@ class edgetrimmer {
const u32 endy = NY * (id+1) / nthreads;
u32 edge = starty << YZBITS, endedge = edge + NYZ;
#if NSIPHASH == 4
static const __m256i vxmask = {XMASK, XMASK};
static const __m256i vyzmask = {YZMASK, YZMASK};
const __m128i vinit0 = _mm_load_si128((__m128i *)&sip_keys);
const __m128i vinit1 = _mm_load_si128((__m128i *)(&sip_keys + 2));
static const __m128i vxmask = {XMASK, XMASK};
static const __m128i vyzmask = {YZMASK, YZMASK};
__m128i v0, v1, v2, v3, v4, v5, v6, v7;
const u32 e2 = 2 * edge + uorv;
__m128i vpacket0 = _mm_set_epi64x(e2+2, e2+0);
Expand Down Expand Up @@ -389,41 +387,41 @@ class edgetrimmer {
}
#endif
#elif NSIPHASH == 4
v7 = v3 = _mm_unpackhi_epi64(vinit1, vinit1); // _mm_shuffle_epi32(vinit1, 0xee);
v4 = v0 = _mm_unpacklo_epi64(vinit0, vinit0); // _mm_shuffle_epi32(vinit0, 0x44);
v5 = v1 = _mm_unpackhi_epi64(vinit0, vinit0); // _mm_shuffle_epi32(vinit0, 0xee);
v6 = v2 = _mm_unpacklo_epi64(vinit1, vinit1); // _mm_shuffle_epi32(vinit1, 0x44);
v7 = v3 = _mm_set1_epi64x(sip_keys.k3);
v4 = v0 = _mm_set1_epi64x(sip_keys.k0);
v5 = v1 = _mm_set1_epi64x(sip_keys.k1);
v6 = v2 = _mm_set1_epi64x(sip_keys.k2);

v3 = XOR(v3,vpacket0); v7 = XOR(v7,vpacket1);
SIPROUNDX2N; SIPROUNDX2N;
v0 = XOR(v0,vpacket0); v4 = XOR(v4,vpacket1);
v2 = XOR(v2,_mm128_broadcastq_epi64(_mm_cvtsi64_si128(0xff)));
v6 = XOR(v6,_mm128_broadcastq_epi64(_mm_cvtsi64_si128(0xff)));
v2 = XOR(v2, _mm_set1_epi64x(0xffLL));
v6 = XOR(v6, _mm_set1_epi64x(0xffLL));
SIPROUNDX2N; SIPROUNDX2N; SIPROUNDX2N; SIPROUNDX2N;
v0 = XOR(XOR(v0,v1),XOR(v2,v3));
v4 = XOR(XOR(v4,v5),XOR(v6,v7));

vpacket0 = _mm128_add_epi64(vpacket0, vpacketinc);
vpacket1 = _mm128_add_epi64(vpacket1, vpacketinc);
v1 = _mm128_srli_epi64(v0, YZBITS) & vxmask;
v5 = _mm128_srli_epi64(v4, YZBITS) & vxmask;
vpacket0 = _mm_add_epi64(vpacket0, vpacketinc);
vpacket1 = _mm_add_epi64(vpacket1, vpacketinc);
v1 = _mm_srli_epi64(v0, YZBITS) & vxmask;
v5 = _mm_srli_epi64(v4, YZBITS) & vxmask;
v0 = (v0 & vyzmask) | vhi0;
v4 = (v4 & vyzmask) | vhi1;
vhi0 = _mm128_add_epi64(vhi0, vhiinc);
vhi1 = _mm128_add_epi64(vhi1, vhiinc);
vhi0 = _mm_add_epi64(vhi0, vhiinc);
vhi1 = _mm_add_epi64(vhi1, vhiinc);

u32 ux;
#ifndef NEEDSYNC
#define STORE0(i,v,x,w) \
ux = _mm128_extract_epi32(v,x);\
ux = _mm_extract_epi32(v,x);\
*(u64 *)(base+dst.index[ux]) = _mm_extract_epi64(w,i%2);\
dst.index[ux] += BIGSIZE0;
#else
u32 zz;
#define STORE0(i,v,x,w) \
zz = _mm128_extract_epi32(w,x);\
zz = _mm_extract_epi32(w,x);\
if (i || likely(zz)) {\
ux = _mm128_extract_epi32(v,x);\
ux = _mm_extract_epi32(v,x);\
for (; unlikely(last[ux] + NNONYZ <= edge+i); last[ux] += NNONYZ, dst.index[ux] += BIGSIZE0)\
*(u32 *)(base+dst.index[ux]) = 0;\
*(u32 *)(base+dst.index[ux]) = zz;\
Expand Down Expand Up @@ -1214,16 +1212,13 @@ class solver_ctx {
}
// bit 39..21 20..13 12..0
// write edge YYYYYY ZZZZZ
#elif NSIPHASH == 4
#elif NSIPHASH == 8
v3 = _mm256_permute4x64_epi64(vinit, 0xFF);
v0 = _mm256_permute4x64_epi64(vinit, 0x00);
v1 = _mm256_permute4x64_epi64(vinit, 0x55);
v2 = _mm256_permute4x64_epi64(vinit, 0xAA);
v7 = _mm256_permute4x64_epi64(vinit, 0xFF);
v4 = _mm256_permute4x64_epi64(vinit, 0x00);
v5 = _mm256_permute4x64_epi64(vinit, 0x55);
v6 = _mm256_permute4x64_epi64(vinit, 0xAA);

v7 = v3 = _mm256_permute4x64_epi64(vinit, 0xFF);
v4 = v0 = _mm256_permute4x64_epi64(vinit, 0x00);
v5 = v1 = _mm256_permute4x64_epi64(vinit, 0x55);
v6 = v2 = _mm256_permute4x64_epi64(vinit, 0xAA);

v3 = XOR(v3,vpacket0); v7 = XOR(v7,vpacket1);
SIPROUNDX2N; SIPROUNDX2N;
v0 = XOR(v0,vpacket0); v4 = XOR(v4,vpacket1);
Expand Down
14 changes: 5 additions & 9 deletions src/siphashxN.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,10 +18,10 @@
#define ADD(a, b) _mm_add_epi64(a, b)
#define XOR(a, b) _mm_xor_si128(a, b)
#define ROT13(x) _mm_or_si128(_mm_slli_epi64(x,13),_mm_srli_epi64(x,51))
#define ROT16(x) _mm_shufflehi_epi16(_mm_shufflelo_epi16(x, _MM_SHUFFLE(2,1,0,3)), _MM_SHUFFLE(2,1,0,3))
#define ROT17(x) _mm_or_si128(_mm_slli_epi64(x,17),_mm_srli_epi64(x,47))
#define ROT21(x) _mm_or_si128(_mm_slli_epi64(x,21),_mm_srli_epi64(x,43))
#define ROT32(x) _mm_shuffle_epi32 (x, _MM_SHUFFLE(2,3,0,1))
#define ROT16(x) _mm_shufflehi_epi16(_mm_shufflelo_epi16(x, _MM_SHUFFLE(2,1,0,3)), _MM_SHUFFLE(2,1,0,3))

#endif

Expand Down Expand Up @@ -176,14 +176,10 @@ void siphash24x2(const siphash_keys *keys, const u64 *indices, u64 *hashes) {
// 4-way sipHash-2-4 specialized to precomputed key and 8 byte nonces
void siphash24x4(const siphash_keys *keys, const u64 *indices, u64 *hashes) {
__m128i v0, v1, v2, v3, mi, v4, v5, v6, v7, m2;
v0 = _mm_set1_epi64x(keys->k0);
v1 = _mm_set1_epi64x(keys->k1);
v2 = _mm_set1_epi64x(keys->k2);
v3 = _mm_set1_epi64x(keys->k3);
v4 = _mm_set1_epi64x(keys->k0);
v5 = _mm_set1_epi64x(keys->k1);
v6 = _mm_set1_epi64x(keys->k2);
v7 = _mm_set1_epi64x(keys->k3);
v4 = v0 = _mm_set1_epi64x(keys->k0);
v5 = v1 = _mm_set1_epi64x(keys->k1);
v6 = v2 = _mm_set1_epi64x(keys->k2);
v7 = v3 = _mm_set1_epi64x(keys->k3);

mi = _mm_load_si128((__m128i *)indices);
m2 = _mm_load_si128((__m128i *)(indices + 2));
Expand Down

0 comments on commit ebbe9cb

Please sign in to comment.