Skip to content

Commit

Permalink
Add LEA-128 SSSE3 implementation (GH weidai11#669)
Browse files Browse the repository at this point in the history
LEA-128(128) from 6.73 cpb to 2.84 cpb on modern Core-i5 6400. LEA-128 from 10.12 cpb to 7.84 cpb antique Core2 Duo.
  • Loading branch information
noloader committed Jun 22, 2018
1 parent ea109e0 commit fa7714f
Show file tree
Hide file tree
Showing 9 changed files with 582 additions and 10 deletions.
6 changes: 6 additions & 0 deletions GNUmakefile
Original file line number Diff line number Diff line change
Expand Up @@ -250,6 +250,7 @@ ifeq ($(findstring -DCRYPTOPP_DISABLE_SSSE3,$(CXXFLAGS)),)
ifeq ($(HAVE_SSSE3),1)
ARIA_FLAG = -mssse3
CHAM_FLAG = -mssse3
LEA_FLAG = -mssse3
SSSE3_FLAG = -mssse3
SIMON_FLAG = -mssse3
SPECK_FLAG = -mssse3
Expand Down Expand Up @@ -291,6 +292,7 @@ ifeq ($(SUN_COMPILER),1)
SSSE3_FLAG = -xarch=ssse3 -D__SSSE3__=1
ARIA_FLAG = -xarch=ssse3 -D__SSSE3__=1
CHAM_FLAG = -xarch=ssse3 -D__SSSE3__=1
LEA_FLAG = -xarch=ssse3 -D__SSSE3__=1
SIMON_FLAG = -xarch=ssse3 -D__SSSE3__=1
SPECK_FLAG = -xarch=ssse3 -D__SSSE3__=1
LDFLAGS += -xarch=ssse3
Expand Down Expand Up @@ -1068,6 +1070,10 @@ crc-simd.o : crc-simd.cpp
gcm-simd.o : gcm-simd.cpp
$(CXX) $(strip $(CXXFLAGS) $(GCM_FLAG) -c) $<

# SSSE3 available
lea-simd.o : lea-simd.cpp
$(CXX) $(strip $(CXXFLAGS) $(LEA_FLAG) -c) $<

# NEON available
neon-simd.o : neon-simd.cpp
$(CXX) $(strip $(CXXFLAGS) $(NEON_FLAG) -c) $<
Expand Down
5 changes: 5 additions & 0 deletions GNUmakefile-cross
Original file line number Diff line number Diff line change
Expand Up @@ -277,6 +277,7 @@ ifneq ($(IS_i686)$(IS_x86_64),00)
ifeq ($(HAVE_SSSE3),1)
ARIA_FLAG = -mssse3
CHAM_FLAG = -mssse3
LEA_FLAG = -mssse3
SSSE3_FLAG = -mssse3
SIMON_FLAG = -mssse3
SPECK_FLAG = -mssse3
Expand Down Expand Up @@ -504,6 +505,10 @@ crc-simd.o : crc-simd.cpp
gcm-simd.o : gcm-simd.cpp
$(CXX) $(strip $(CXXFLAGS) $(GCM_FLAG) -c) $<

# SSSE3 available
lea-simd.o : lea-simd.cpp
$(CXX) $(strip $(CXXFLAGS) $(LEA_FLAG) -c) $<

# NEON available
neon-simd.o : neon-simd.cpp
$(CXX) $(strip $(CXXFLAGS) $(NEON_FLAG) -c) $<
Expand Down
24 changes: 16 additions & 8 deletions adv-simd.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
// acceleration. After several implementations we noticed a lot of copy and
// paste occuring. adv-simd.h provides a template to avoid the copy and paste.
//
// There are 8 templates provided in this file. The number following the
// There are 9 templates provided in this file. The number following the
// function name is the block size of the cipher. The name following that
// is the acceleration and arrangement. For example 4x1_SSE means Intel SSE
// using two encrypt (or decrypt) functions: one that operates on 4 blocks,
Expand All @@ -22,6 +22,14 @@
// * AdvancedProcessBlocks64_6x2_ALTIVEC
// * AdvancedProcessBlocks128_6x2_ALTIVEC
//
// If an arrangement ends in 2, like 6x2, then the template will handle the
// single block case by padding with 0's and using the two block function.
// This happens at most one time when processing multiple blocks. The extra
// processing of a zero block is trivial and worth the tradeoff.
//
// The MAYBE_CONST macro present on x86 is a SunCC workaround. Some versions
// of SunCC lose/drop the const-ness in the F1 and F4 functions. It eventually
// results in a failed link due to the const/non-const mismatch.

#ifndef CRYPTOPP_ADVANCED_SIMD_TEMPLATES
#define CRYPTOPP_ADVANCED_SIMD_TEMPLATES
Expand Down Expand Up @@ -323,7 +331,7 @@ inline size_t AdvancedProcessBlocks64_6x2_NEON(F2 func2, F6 func6,
}

/// \brief AdvancedProcessBlocks for 1 and 6 blocks
/// \tparam F1 function to process 1 128-bit blocks
/// \tparam F1 function to process 1 128-bit block
/// \tparam F6 function to process 6 128-bit blocks
/// \tparam W word type of the subkey table
/// \details AdvancedProcessBlocks128_NEON1x6 processes 6 and 2 NEON SIMD words
Expand Down Expand Up @@ -721,7 +729,7 @@ NAMESPACE_BEGIN(CryptoPP)

template <typename F1, typename F2, typename W>
inline size_t GCC_NO_UBSAN AdvancedProcessBlocks64_2x1_SSE(F1 func1, F2 func2,
const W *subKeys, size_t rounds, const byte *inBlocks,
MAYBE_CONST W *subKeys, size_t rounds, const byte *inBlocks,
const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
CRYPTOPP_ASSERT(subKeys);
Expand Down Expand Up @@ -878,7 +886,7 @@ inline size_t GCC_NO_UBSAN AdvancedProcessBlocks64_2x1_SSE(F1 func1, F2 func2,
/// same word type.
template <typename F2, typename F6, typename W>
inline size_t GCC_NO_UBSAN AdvancedProcessBlocks64_6x2_SSE(F2 func2, F6 func6,
const W *subKeys, size_t rounds, const byte *inBlocks,
MAYBE_CONST W *subKeys, size_t rounds, const byte *inBlocks,
const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
CRYPTOPP_ASSERT(subKeys);
Expand Down Expand Up @@ -1125,7 +1133,7 @@ inline size_t GCC_NO_UBSAN AdvancedProcessBlocks64_6x2_SSE(F2 func2, F6 func6,
/// same word type.
template <typename F2, typename F6, typename W>
inline size_t AdvancedProcessBlocks128_6x2_SSE(F2 func2, F6 func6,
const W *subKeys, size_t rounds, const byte *inBlocks,
MAYBE_CONST W *subKeys, size_t rounds, const byte *inBlocks,
const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
CRYPTOPP_ASSERT(subKeys);
Expand Down Expand Up @@ -1312,12 +1320,12 @@ inline size_t AdvancedProcessBlocks128_6x2_SSE(F2 func2, F6 func6,
}

/// \brief AdvancedProcessBlocks for 1 and 4 blocks
/// \tparam F1 function to process 1 128-bit blocks
/// \tparam F1 function to process 1 128-bit block
/// \tparam F4 function to process 4 128-bit blocks
/// \tparam W word type of the subkey table
/// \details AdvancedProcessBlocks128_4x1_SSE processes 4 and 1 SSE SIMD words
/// at a time.
/// \details The subkey type is usually word32 or word64. F1 and F6 must use the
/// \details The subkey type is usually word32 or word64. F1 and F4 must use the
/// same word type.
template <typename F1, typename F4, typename W>
inline size_t AdvancedProcessBlocks128_4x1_SSE(F1 func1, F4 func4,
Expand Down Expand Up @@ -1455,7 +1463,7 @@ NAMESPACE_END // CryptoPP
NAMESPACE_BEGIN(CryptoPP)

/// \brief AdvancedProcessBlocks for 1 and 6 blocks
/// \tparam F1 function to process 1 128-bit blocks
/// \tparam F1 function to process 1 128-bit block
/// \tparam F6 function to process 6 128-bit blocks
/// \tparam W word type of the subkey table
/// \details AdvancedProcessBlocks128_6x1_ALTIVEC processes 6 and 1 Altivec SIMD words
Expand Down
4 changes: 2 additions & 2 deletions cryptest.nmake
Original file line number Diff line number Diff line change
Expand Up @@ -47,9 +47,9 @@

# If you use 'make sources' from Linux makefile, then add 'winpipes.cpp' to the list below.

LIB_SRCS = cryptlib.cpp cpu.cpp integer.cpp 3way.cpp adler32.cpp algebra.cpp algparam.cpp arc4.cpp aria-simd.cpp aria.cpp ariatab.cpp asn.cpp authenc.cpp base32.cpp base64.cpp basecode.cpp bfinit.cpp blake2-simd.cpp blake2.cpp blowfish.cpp blumshub.cpp camellia.cpp cast.cpp casts.cpp cbcmac.cpp ccm.cpp chacha.cpp cham.cpp cham-simd.cpp channels.cpp cmac.cpp crc-simd.cpp crc.cpp default.cpp des.cpp dessp.cpp dh.cpp dh2.cpp dll.cpp dsa.cpp eax.cpp ec2n.cpp eccrypto.cpp ecp.cpp elgamal.cpp emsa2.cpp eprecomp.cpp esign.cpp files.cpp filters.cpp fips140.cpp fipstest.cpp gcm-simd.cpp gcm.cpp gf256.cpp gf2_32.cpp gf2n.cpp gfpcrypt.cpp gost.cpp gzip.cpp hex.cpp hmac.cpp hrtimer.cpp ida.cpp idea.cpp iterhash.cpp kalyna.cpp kalynatab.cpp keccak.cpp lea.cpp luc.cpp mars.cpp marss.cpp md2.cpp md4.cpp md5.cpp misc.cpp modes.cpp mqueue.cpp mqv.cpp nbtheory.cpp neon-simd.cpp network.cpp oaep.cpp osrng.cpp padlkrng.cpp panama.cpp pkcspad.cpp poly1305.cpp polynomi.cpp pssr.cpp pubkey.cpp queue.cpp rabin.cpp randpool.cpp rc2.cpp rc5.cpp rc6.cpp rdrand.cpp rdtables.cpp rijndael-simd.cpp rijndael.cpp ripemd.cpp rng.cpp rsa.cpp rw.cpp safer.cpp salsa.cpp scrypt.cpp seal.cpp seed.cpp serpent.cpp sha-simd.cpp sha.cpp sha3.cpp shacal2-simd.cpp shacal2.cpp shark.cpp sharkbox.cpp simon.cpp simon-simd.cpp skipjack.cpp sm3.cpp sm4.cpp socketft.cpp sosemanuk.cpp speck.cpp speck-simd.cpp square.cpp squaretb.cpp sse-simd.cpp strciphr.cpp tea.cpp tftables.cpp threefish.cpp tiger.cpp tigertab.cpp trdlocal.cpp ttmac.cpp tweetnacl.cpp twofish.cpp vmac.cpp wait.cpp wake.cpp whrlpool.cpp winpipes.cpp xtr.cpp xtrcrypt.cpp zdeflate.cpp zinflate.cpp zlib.cpp
LIB_SRCS = cryptlib.cpp cpu.cpp integer.cpp 3way.cpp adler32.cpp algebra.cpp algparam.cpp arc4.cpp aria-simd.cpp aria.cpp ariatab.cpp asn.cpp authenc.cpp base32.cpp base64.cpp basecode.cpp bfinit.cpp blake2-simd.cpp blake2.cpp blowfish.cpp blumshub.cpp camellia.cpp cast.cpp casts.cpp cbcmac.cpp ccm.cpp chacha.cpp cham.cpp cham-simd.cpp channels.cpp cmac.cpp crc-simd.cpp crc.cpp default.cpp des.cpp dessp.cpp dh.cpp dh2.cpp dll.cpp dsa.cpp eax.cpp ec2n.cpp eccrypto.cpp ecp.cpp elgamal.cpp emsa2.cpp eprecomp.cpp esign.cpp files.cpp filters.cpp fips140.cpp fipstest.cpp gcm-simd.cpp gcm.cpp gf256.cpp gf2_32.cpp gf2n.cpp gfpcrypt.cpp gost.cpp gzip.cpp hex.cpp hmac.cpp hrtimer.cpp ida.cpp idea.cpp iterhash.cpp kalyna.cpp kalynatab.cpp keccak.cpp lea.cpp lea-simd.cpp luc.cpp mars.cpp marss.cpp md2.cpp md4.cpp md5.cpp misc.cpp modes.cpp mqueue.cpp mqv.cpp nbtheory.cpp neon-simd.cpp network.cpp oaep.cpp osrng.cpp padlkrng.cpp panama.cpp pkcspad.cpp poly1305.cpp polynomi.cpp pssr.cpp pubkey.cpp queue.cpp rabin.cpp randpool.cpp rc2.cpp rc5.cpp rc6.cpp rdrand.cpp rdtables.cpp rijndael-simd.cpp rijndael.cpp ripemd.cpp rng.cpp rsa.cpp rw.cpp safer.cpp salsa.cpp scrypt.cpp seal.cpp seed.cpp serpent.cpp sha-simd.cpp sha.cpp sha3.cpp shacal2-simd.cpp shacal2.cpp shark.cpp sharkbox.cpp simon.cpp simon-simd.cpp skipjack.cpp sm3.cpp sm4.cpp socketft.cpp sosemanuk.cpp speck.cpp speck-simd.cpp square.cpp squaretb.cpp sse-simd.cpp strciphr.cpp tea.cpp tftables.cpp threefish.cpp tiger.cpp tigertab.cpp trdlocal.cpp ttmac.cpp tweetnacl.cpp twofish.cpp vmac.cpp wait.cpp wake.cpp whrlpool.cpp winpipes.cpp xtr.cpp xtrcrypt.cpp zdeflate.cpp zinflate.cpp zlib.cpp

LIB_OBJS = cryptlib.obj cpu.obj integer.obj 3way.obj adler32.obj algebra.obj algparam.obj arc4.obj aria-simd.obj aria.obj ariatab.obj asn.obj authenc.obj base32.obj base64.obj basecode.obj bfinit.obj blake2-simd.obj blake2.obj blowfish.obj blumshub.obj camellia.obj cast.obj casts.obj cbcmac.obj ccm.obj chacha.obj cham.obj cham-simd.obj channels.obj cmac.obj crc-simd.obj crc.obj default.obj des.obj dessp.obj dh.obj dh2.obj dll.obj dsa.obj eax.obj ec2n.obj eccrypto.obj ecp.obj elgamal.obj emsa2.obj eprecomp.obj esign.obj files.obj filters.obj fips140.obj fipstest.obj gcm-simd.obj gcm.obj gf256.obj gf2_32.obj gf2n.obj gfpcrypt.obj gost.obj gzip.obj hex.obj hmac.obj hrtimer.obj ida.obj idea.obj iterhash.obj kalyna.obj kalynatab.obj keccak.obj lea.obj luc.obj mars.obj marss.obj md2.obj md4.obj md5.obj misc.obj modes.obj mqueue.obj mqv.obj nbtheory.obj neon-simd.obj network.obj oaep.obj osrng.obj padlkrng.obj panama.obj pkcspad.obj poly1305.obj polynomi.obj pssr.obj pubkey.obj queue.obj rabin.obj randpool.obj rc2.obj rc5.obj rc6.obj rdrand.obj rdtables.obj rijndael-simd.obj rijndael.obj ripemd.obj rng.obj rsa.obj rw.obj safer.obj salsa.obj scrypt.obj seal.obj seed.obj serpent.obj sha-simd.obj sha.obj sha3.obj shacal2-simd.obj shacal2.obj shark.obj sharkbox.obj simon.obj simon-simd.obj skipjack.obj sm3.obj sm4.obj socketft.obj sosemanuk.obj speck.obj speck-simd.obj square.obj squaretb.obj sse-simd.obj strciphr.obj tea.obj tftables.obj threefish.obj tiger.obj tigertab.obj trdlocal.obj ttmac.obj tweetnacl.obj twofish.obj vmac.obj wait.obj wake.obj whrlpool.obj winpipes.obj xtr.obj xtrcrypt.obj zdeflate.obj zinflate.obj zlib.obj
LIB_OBJS = cryptlib.obj cpu.obj integer.obj 3way.obj adler32.obj algebra.obj algparam.obj arc4.obj aria-simd.obj aria.obj ariatab.obj asn.obj authenc.obj base32.obj base64.obj basecode.obj bfinit.obj blake2-simd.obj blake2.obj blowfish.obj blumshub.obj camellia.obj cast.obj casts.obj cbcmac.obj ccm.obj chacha.obj cham.obj cham-simd.obj channels.obj cmac.obj crc-simd.obj crc.obj default.obj des.obj dessp.obj dh.obj dh2.obj dll.obj dsa.obj eax.obj ec2n.obj eccrypto.obj ecp.obj elgamal.obj emsa2.obj eprecomp.obj esign.obj files.obj filters.obj fips140.obj fipstest.obj gcm-simd.obj gcm.obj gf256.obj gf2_32.obj gf2n.obj gfpcrypt.obj gost.obj gzip.obj hex.obj hmac.obj hrtimer.obj ida.obj idea.obj iterhash.obj kalyna.obj kalynatab.obj keccak.obj lea.obj lea-simd.obj luc.obj mars.obj marss.obj md2.obj md4.obj md5.obj misc.obj modes.obj mqueue.obj mqv.obj nbtheory.obj neon-simd.obj network.obj oaep.obj osrng.obj padlkrng.obj panama.obj pkcspad.obj poly1305.obj polynomi.obj pssr.obj pubkey.obj queue.obj rabin.obj randpool.obj rc2.obj rc5.obj rc6.obj rdrand.obj rdtables.obj rijndael-simd.obj rijndael.obj ripemd.obj rng.obj rsa.obj rw.obj safer.obj salsa.obj scrypt.obj seal.obj seed.obj serpent.obj sha-simd.obj sha.obj sha3.obj shacal2-simd.obj shacal2.obj shark.obj sharkbox.obj simon.obj simon-simd.obj skipjack.obj sm3.obj sm4.obj socketft.obj sosemanuk.obj speck.obj speck-simd.obj square.obj squaretb.obj sse-simd.obj strciphr.obj tea.obj tftables.obj threefish.obj tiger.obj tigertab.obj trdlocal.obj ttmac.obj tweetnacl.obj twofish.obj vmac.obj wait.obj wake.obj whrlpool.obj winpipes.obj xtr.obj xtrcrypt.obj zdeflate.obj zinflate.obj zlib.obj

TEST_SRCS = bench1.cpp bench2.cpp test.cpp validat0.cpp validat1.cpp validat2.cpp validat3.cpp validat4.cpp datatest.cpp regtest1.cpp regtest2.cpp regtest3.cpp fipsalgt.cpp dlltest.cpp fipstest.cpp

Expand Down
1 change: 1 addition & 0 deletions cryptlib.vcxproj
Original file line number Diff line number Diff line change
Expand Up @@ -242,6 +242,7 @@
<ClCompile Include="kalynatab.cpp" />
<ClCompile Include="keccak.cpp" />
<ClCompile Include="lea.cpp" />
<ClCompile Include="lea-simd.cpp" />
<ClCompile Include="luc.cpp" />
<ClCompile Include="mars.cpp" />
<ClCompile Include="marss.cpp" />
Expand Down
3 changes: 3 additions & 0 deletions cryptlib.vcxproj.filters
Original file line number Diff line number Diff line change
Expand Up @@ -224,6 +224,9 @@
<ClCompile Include="lea.cpp">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="lea-simd.cpp">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="luc.cpp">
<Filter>Source Files</Filter>
</ClCompile>
Expand Down
Loading

0 comments on commit fa7714f

Please sign in to comment.