diff --git a/INSTALL_LINUX b/INSTALL_LINUX index a88f888c..24927b46 100644 --- a/INSTALL_LINUX +++ b/INSTALL_LINUX @@ -32,14 +32,26 @@ but different package names. $ sudo apt-get install build-essential automake libssl-dev libcurl4-openssl-dev libjansson-dev libgmp-dev zlib1g-dev git SHA support on AMD Ryzen CPUs requires gcc version 5 or higher and -openssl 1.1.0e or higher. Add one of the following to CFLAGS for SHA -support depending on your CPU and compiler version: +openssl 1.1.0e or higher. -"-march=native" is always the best choice +znver1 and znver2 should be recognized by most recent versions of GCC and +znver3 is expected with GCC 11. GCC 11 also includes rocketlake support. +In the meantime here are some suggestions for compiling for newer CPUs: -"-march=znver1" for Ryzen 1000 & 2000 series, znver2 for 3000. +"-march=native" is usually the best choice, used by build.sh. -"-msha" Add SHA to other tuning options +"-march=znver2 -mvaes" can be used for Ryzen 5000 if znver3 is not recognized. + +"-mcascadelake -msha" or +"-mcometlake -mavx512 -msha" can be used for Rocket Lake. + +Features can also be added individually: + +"-msha" adds support for HW accelerated sha256. + +"-mavx512" adds support for 512-bit vectors. + +"-mvaes" adds support for parallel AES. Additional instructions for static compilation can be found here: https://lxadm.com/Static_compilation_of_cpuminer diff --git a/INSTALL_WINDOWS b/INSTALL_WINDOWS index f2e2c80a..b61f0915 100644 --- a/INSTALL_WINDOWS +++ b/INSTALL_WINDOWS @@ -1,5 +1,9 @@ Instructions for compiling cpuminer-opt for Windows. +These instructions may be out of date. Please consult the wiki for +the latest: + +https://github.com/JayDDee/cpuminer-opt/wiki/Compiling-from-source Windows compilation using Visual Studio is not supported. Mingw64 is used on a Linux system (bare metal or virtual machine) to cross-compile @@ -24,79 +28,76 @@ Refer to Linux compile instructions and install required packages. Additionally, install mingw-w64. -sudo apt-get install mingw-w64 +sudo apt-get install mingw-w64 libz-mingw-w64-dev 2. Create a local library directory for packages to be compiled in the next step. Suggested location is $HOME/usr/lib/ +$ mkdir $HOME/usr/lib + 3. Download and build other packages for mingw that don't have a mingw64 version available in the repositories. Download the following source code packages from their respective and -respected download locations, copy them to ~/usr/lib/ and uncompress them. +respected download locations, copy them to $HOME/usr/lib/ and uncompress them. + +openssl: https://github.com/openssl/openssl/releases + +curl: https://github.com/curl/curl/releases -openssl -curl -gmp +gmp: https://gmplib.org/download/gmp/ -In most cases the latest vesrion is ok but it's safest to download -the same major and minor version as included in your distribution. +In most cases the latest version is ok but it's safest to download the same major and minor version as included in your distribution. The following uses versions from Ubuntu 20.04. Change version numbers as required. -Run the following commands or follow the supplied instructions. -Do not run "make install" unless you are using ~/usr/lib, which isn't -recommended. +Run the following commands or follow the supplied instructions. Do not run "make install" unless you are using /usr/lib, which isn't recommended. -Some instructions insist on running "make check". If make check fails -it may still work, YMMV. +Some instructions insist on running "make check".
If make check fails it may still work, YMMV. -You can speed up "make" by using all CPU cores available with "-j n" where -n is the number of CPU threads you want to use. +You can speed up "make" by using all CPU cores available with "-j n" where n is the number of CPU threads you want to use. openssl: -./Configure mingw64 shared --cross-compile-prefix=x86_64-w64-mingw32 -make +$ ./Configure mingw64 shared --cross-compile-prefix=x86_64-w64-mingw32- +$ make + +Make may fail with an ld error; just ensure libcrypto-1_1-x64.dll is created. curl: -./configure --with-winssl --with-winidn --host=x86_64-w64-mingw32 -make +$ ./configure --with-winssl --with-winidn --host=x86_64-w64-mingw32 +$ make gmp: -./configure --host=x86_64-w64-mingw32 -make - - +$ ./configure --host=x86_64-w64-mingw32 +$ make 4. Tweak the environment. -This step is required everytime you login or the commands can be added to -.bashrc. +This step is required every time you log in, or the commands can be added to .bashrc. -Define some local variables to point to local library. +Define some local variables to point to the local library. -export LOCAL_LIB="$HOME/usr/lib" +$ export LOCAL_LIB="$HOME/usr/lib" -export LDFLAGS="-L$LOCAL_LIB/curl/lib/.libs -L$LOCAL_LIB/gmp/.libs -L$LOCAL_LIB/openssl" +$ export LDFLAGS="-L$LOCAL_LIB/curl/lib/.libs -L$LOCAL_LIB/gmp/.libs -L$LOCAL_LIB/openssl" -export CONFIGURE_ARGS="--with-curl=$LOCAL_LIB/curl --with-crypto=$LOCAL_LIB/openssl --host=x86_64-w64-mingw32" +$ export CONFIGURE_ARGS="--with-curl=$LOCAL_LIB/curl --with-crypto=$LOCAL_LIB/openssl --host=x86_64-w64-mingw32" -Create a release directory and copy some dll files previously built. -This can be done outside of cpuminer-opt and only needs to be done once. -If the release directory is in cpuminer-opt directory it needs to be -recreated every a source package is decompressed. +Adjust for gcc version: -mkdir release -cp /usr/x86_64-w64-mingw32/lib/zlib1.dll release/ -cp /usr/x86_64-w64-mingw32/lib/libwinpthread-1.dll release/ -cp /usr/lib/gcc/x86_64-w64-mingw32/7.3-win32/libstdc++-6.dll release/ -cp /usr/lib/gcc/x86_64-w64-mingw32/7.3-win32/libgcc_s_seh-1.dll release/ -cp $LOCAL_LIB/openssl/libcrypto-1_1-x64.dll release/ -cp $LOCAL_LIB/curl/lib/.libs/libcurl-4.dll release/ +$ export GCC_MINGW_LIB="/usr/lib/gcc/x86_64-w64-mingw32/9.3-win32" +Create a release directory and copy some dll files previously built. This can be done outside of cpuminer-opt and only needs to be done once. If the release directory is in the cpuminer-opt directory it needs to be recreated every time a source package is decompressed. +$ mkdir release +$ cp /usr/x86_64-w64-mingw32/lib/zlib1.dll release/ +$ cp /usr/x86_64-w64-mingw32/lib/libwinpthread-1.dll release/ +$ cp $GCC_MINGW_LIB/libstdc++-6.dll release/ +$ cp $GCC_MINGW_LIB/libgcc_s_seh-1.dll release/ +$ cp $LOCAL_LIB/openssl/libcrypto-1_1-x64.dll release/ +$ cp $LOCAL_LIB/curl/lib/.libs/libcurl-4.dll release/ The following steps need to be done every time a new source package is opened. @@ -110,63 +111,48 @@ https://github.com/JayDDee/cpuminer-opt/releases Decompress and change to the cpuminer-opt directory. - - -6. Prepare to compile +6. Compile Create a link to the locally compiled version of gmp.h -ln -s $LOCAL_LIB/gmp-version/gmp.h ./gmp.h - -Edit configure.ac to fix lipthread package name. - -sed -i 's/"-lpthread"/"-lpthreadGC2"/g' configure.ac - +$ ln -s $LOCAL_LIB/gmp-version/gmp.h ./gmp.h -7. 
Compile +$ ./autogen.sh -you can use the default compile if you intend to use cpuminer-opt on the -same CPU and the virtual machine supports that architecture. +Configure the compiler for the CPU architecture of the host machine: -./build.sh +CFLAGS="-O3 -march=native -Wall" ./configure $CONFIGURE_ARGS -Otherwise you can compile manually while setting options in CFLAGS. +or cross-compile for a specific CPU architecture: -Some common options: - -To compile for a specific CPU architecture: - -CFLAGS="-O3 -march=znver1 -Wall" ./configure --with-curl +CFLAGS="-O3 -march=znver1 -Wall" ./configure $CONFIGURE_ARGS This will compile for AMD Ryzen. -You can compile more generically for a set of specific CPU features -if you know what features you want: +You can compile more generically for a set of specific CPU features if you know what features you want: -CFLAGS="-O3 -maes -msse4.2 -Wall" ./configure --with-curl +CFLAGS="-O3 -maes -msse4.2 -Wall" ./configure $CONFIGURE_ARGS This will compile for an older CPU that does not have AVX. -You can find several examples in build-allarch.sh +You can find several examples in README.txt -If you have a CPU with more than 64 threads and Windows 7 or higher you -can enable the CPU Groups feature: +If you have a CPU with more than 64 threads and Windows 7 or higher you can enable the CPU Groups feature by adding the following to CFLAGS: --D_WIN32_WINNT==0x0601 +"-D_WIN32_WINNT=0x0601" -Once you have run configure successfully run make with n CPU threads: +Once you have run configure successfully, run the compiler with n CPU threads: -make -j n +$ make -j n -Copy cpuminer.exe to the release directory, compress and copy the release -directory to a Windows system and run cpuminer.exe from the command line. +Copy cpuminer.exe to the release directory, compress and copy the release directory to a Windows system and run cpuminer.exe from the command line. Run cpuminer -In a command windows change directories to the unzipped release folder. -to get a list of all options: +In a command window, change directories to the unzipped release folder. To get a list of all options: cpuminer.exe --help -Command options are specific to where you mine. Refer to the pool's -instructions on how to set them. +Command options are specific to where you mine. Refer to the pool's instructions on how to set them.
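For reference, the -m and -march options discussed above correspond to predefined compiler macros that source code can test at build time. The short, stand-alone C sketch below is not part of cpuminer-opt; it simply prints which features the chosen CFLAGS enabled, which can help confirm a cross-compile was configured as intended (__AVX512F__, __AVX2__, __SHA__, __VAES__, __AES__ and _WIN32_WINNT are standard GCC/MinGW predefines).

/* Stand-alone sketch, not part of cpuminer-opt: print which instruction-set
   features the chosen CFLAGS enabled at compile time. The macros are
   GCC/MinGW predefines corresponding to the -m / -march options above. */
#include <stdio.h>

int main(void)
{
#if defined(__AVX512F__)
   printf( "AVX512F enabled\n" );
#endif
#if defined(__AVX2__)
   printf( "AVX2 enabled\n" );
#endif
#if defined(__SHA__)
   printf( "SHA extensions enabled\n" );
#endif
#if defined(__VAES__)
   printf( "VAES enabled\n" );
#endif
#if defined(__AES__)
   printf( "AES-NI enabled\n" );
#endif
#if defined(_WIN32_WINNT)
   printf( "_WIN32_WINNT = 0x%04x\n", (unsigned)_WIN32_WINNT );
#endif
   return 0;
}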
+ + diff --git a/Makefile.am b/Makefile.am index 2a62650e..7fe9e9af 100644 --- a/Makefile.am +++ b/Makefile.am @@ -20,12 +20,8 @@ cpuminer_SOURCES = \ util.c \ api.c \ sysinfos.c \ - algo-gate-api.c \ - crypto/oaes_lib.c \ - crypto/c_keccak.c \ - crypto/c_groestl.c \ - crypto/c_blake256.c \ - crypto/c_jh.c \ + algo-gate-api.c\ + malloc-huge.c \ algo/argon2/argon2a/argon2a.c \ algo/argon2/argon2a/ar2/argon2.c \ algo/argon2/argon2a/ar2/opt.c \ @@ -72,24 +68,11 @@ cpuminer_SOURCES = \ algo/bmw/bmw512-gate.c \ algo/bmw/bmw512.c \ algo/bmw/bmw512-4way.c \ - algo/cubehash/sph_cubehash.c\ algo/cubehash/cubehash_sse2.c\ algo/cubehash/cube-hash-2way.c \ algo/echo/sph_echo.c \ algo/echo/echo-hash-4way.c \ - algo/echo/aes_ni/hash.c \ - algo/gr/cryptonote/crypto/aesb.c \ - algo/gr/cryptonote/crypto/hash.c \ - algo/gr/cryptonote/crypto/c_skein.c \ - algo/gr/cryptonote/cryptonight.c \ - algo/gr/cryptonote/cryptonight_dark.c \ - algo/gr/cryptonote/cryptonight_dark_lite.c \ - algo/gr/cryptonote/cryptonight_fast.c \ - algo/gr/cryptonote/cryptonight_lite.c \ - algo/gr/cryptonote/cryptonight_soft_shell.c \ - algo/gr/cryptonote/cryptonight_turtle.c \ - algo/gr/cryptonote/cryptonight_turtle_lite.c \ - algo/gr/gr-gate.c \ + algo/echo/aes_ni/hash.c\ algo/gost/sph_gost.c \ algo/groestl/groestl-gate.c \ algo/groestl/groestl512-hash-4way.c \ @@ -126,7 +109,6 @@ cpuminer_SOURCES = \ algo/keccak/sha3d-4way.c \ algo/keccak/sha3d.c \ algo/lanehash/lane.c \ - algo/luffa/sph_luffa.c \ algo/luffa/luffa_for_sse2.c \ algo/luffa/luffa-hash-2way.c \ algo/lyra2/lyra2.c \ @@ -148,7 +130,7 @@ cpuminer_SOURCES = \ algo/lyra2/allium.c \ algo/lyra2/phi2-4way.c \ algo/lyra2/phi2.c \ - algo//m7m/m7m.c \ + algo/m7m/m7m.c \ algo/m7m/magimath.cpp \ algo/nist5/nist5-gate.c \ algo/nist5/nist5-4way.c \ @@ -177,14 +159,20 @@ cpuminer_SOURCES = \ algo/ripemd/lbry.c \ algo/ripemd/lbry-4way.c \ algo/scrypt/scrypt.c \ + algo/scrypt/scrypt-core-4way.c \ algo/scrypt/neoscrypt.c \ + algo/sha/sha256-hash.c \ algo/sha/sph_sha2.c \ algo/sha/sph_sha2big.c \ algo/sha/sha256-hash-4way.c \ algo/sha/sha512-hash-4way.c \ + algo/sha/sha256-hash-opt.c \ + algo/sha/sha256-hash-2way-ni.c \ algo/sha/hmac-sha256-hash.c \ algo/sha/hmac-sha256-hash-4way.c \ + algo/sha/sha256d.c \ algo/sha/sha2.c \ + algo/sha/sha256d-4way.c \ algo/sha/sha256t-gate.c \ algo/sha/sha256t-4way.c \ algo/sha/sha256t.c \ @@ -197,7 +185,6 @@ cpuminer_SOURCES = \ algo/shavite/shavite-hash-2way.c \ algo/shavite/shavite-hash-4way.c \ algo/shavite/shavite.c \ - algo/simd/sph_simd.c \ algo/simd/nist.c \ algo/simd/vector.c \ algo/simd/simd-hash-2way.c \ @@ -212,14 +199,19 @@ cpuminer_SOURCES = \ algo/sm3/sm3-hash-4way.c \ algo/swifftx/swifftx.c \ algo/tiger/sph_tiger.c \ + algo/verthash/verthash-gate.c \ + algo/verthash/Verthash.c \ + algo/verthash/fopen_utf8.c \ + algo/verthash/tiny_sha3/sha3.c \ + algo/verthash/tiny_sha3/sha3-4way.c \ algo/whirlpool/sph_whirlpool.c \ algo/whirlpool/whirlpool-hash-4way.c \ algo/whirlpool/whirlpool-gate.c \ algo/whirlpool/whirlpool.c \ algo/whirlpool/whirlpoolx.c \ - algo/x11/0x10-gate.c \ - algo/x11/0x10.c \ - algo/x11/0x10-4way.c \ + algo/x11/hash0x10-gate.c \ + algo/x11/hash0x10.c \ + algo/x11/hash0x10-4way.c \ algo/x11/x11-gate.c \ algo/x11/x11.c \ algo/x11/x11-4way.c \ @@ -282,7 +274,6 @@ cpuminer_SOURCES = \ algo/x16/x21s-4way.c \ algo/x16/x21s.c \ algo/x16/minotaur.c \ - algo/x16/minotaurx.c \ algo/x17/x17-gate.c \ algo/x17/x17.c \ algo/x17/x17-4way.c \ diff --git a/README.md b/README.md index c5fb52a3..8cf37c19 100644 --- a/README.md +++ b/README.md @@ 
-60,6 +60,7 @@ GBT is YMMV. Supported Algorithms -------------------- + 0x10 ChainOX (CHOX) allium Garlicoin anime Animecoin argon2 Argon2 coin (AR2) @@ -79,7 +80,6 @@ Supported Algorithms decred deep Deepcoin (DCN) dmd-gr Diamond-Groestl - gr Ghostrider groestl Groestl coin hex x16r-hex hmq1725 Espers @@ -92,13 +92,12 @@ Supported Algorithms lyra2h Hppcoin lyra2re lyra2 lyra2rev2 lyra2v2 - lyra2rev3 lyrav2v3, Vertcoin + lyra2rev3 lyra2v3 lyra2z lyra2z330 Lyra2 330 rows, Zoin (ZOI) m7m Magi (XMG) minotaur Ringcoin (RNG) - minotaurx Litecoin Cash (LCC) - myr-gr Myriad-Groestl + myr-gr Myriad-Groestl neoscrypt NeoScrypt(128, 2, 1) nist5 Nist5 pentablake Pentablake @@ -127,6 +126,7 @@ Supported Algorithms tribus Denarius (DNR) vanilla blake256r8vnl (VCash) veltor (VLT) + verthash Vertcoin whirlpool whirlpoolx x11 Dash @@ -139,7 +139,7 @@ Supported Algorithms x14 X14 x15 X15 x16r - x16rv2 Ravencoin (RVN) + x16rv2 x16rt Gincoin (GIN) x16rt-veil Veil (VEIL) x16s Pigeoncoin (PGN) @@ -154,10 +154,10 @@ Supported Algorithms yescryptr16 Eli yescryptr32 WAVI yespower Cryply - yespowerarwn Arowanacoin(ARWN) + yespowerARWN Arowanacoin (ARWN) yespowerr16 Yenten (YTN) - yespowersugar Suagrchain (SUGAR) - yespowerurx UraniumX (URX) + yespowerSUGAR Sugarchain (SUGAR) + yespowerURX UraniumX (URX) yespower-b2b generic yespower + blake2b zr5 Ziftr diff --git a/README.txt b/README.txt index 36298c00..3776d85e 100644 --- a/README.txt +++ b/README.txt @@ -18,14 +18,14 @@ error to find the fastest one that works. Pay attention to the features listed at cpuminer startup to ensure you are mining at optimum speed using the best available features. -Architecture names and compile options used are only provided for Intel -Core series. Budget CPUs like Pentium and Celeron are often missing some -features. +Architecture names and compile options used are only provided for +mainstream desktop CPUs. Budget CPUs like Pentium and Celeron are often +missing some features. Check your CPU. -AMD CPUs older than Piledriver, including Athlon x2 and Phenom II x4, are not -supported by cpuminer-opt due to an incompatible implementation of SSE2 on -these CPUs. Some algos may crash the miner with an invalid instruction. -Users are recommended to use an unoptimized miner such as cpuminer-multi. +Support for AMD CPUs older than Ryzen is incomplete and without specific +recommendations. Find the best fit. CPUs older than Piledriver, including +Athlon x2 and Phenom II x4, are not supported by cpuminer-opt due to an +incompatible implementation of SSE2 on these CPUs. More information for Intel and AMD CPU architectures and their features can be found on Wikipedia.
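To help choose between the Windows binaries listed in README.txt (see the table in the next hunk), the CPU's features can also be queried at run time. The following stand-alone C sketch is not part of cpuminer-opt; it uses GCC's __builtin_cpu_supports() to suggest a build tier, and deliberately leaves out SHA and VAES, which would need a raw CPUID query.

/* Stand-alone sketch, not part of cpuminer-opt: suggest a build tier at run
   time using GCC's __builtin_cpu_supports(). SHA and VAES are not covered
   here; they would require a raw CPUID query. */
#include <stdio.h>

int main(void)
{
   __builtin_cpu_init();

   if ( __builtin_cpu_supports( "avx512f" ) )
      printf( "try an avx512 class build\n" );
   else if ( __builtin_cpu_supports( "avx2" ) )
      printf( "try an avx2 class build\n" );
   else if ( __builtin_cpu_supports( "avx" ) )
      printf( "try the avx build (Sandybridge, Ivybridge)\n" );
   else if ( __builtin_cpu_supports( "sse4.2" ) && __builtin_cpu_supports( "aes" ) )
      printf( "try the aes-sse42 build (Westmere)\n" );
   else
      printf( "fall back to the sse2 build\n" );
   return 0;
}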
@@ -34,36 +34,35 @@ https://en.wikipedia.org/wiki/List_of_Intel_CPU_microarchitectures https://en.wikipedia.org/wiki/List_of_AMD_CPU_microarchitectures +File name Architecture name -Exe file name Compile flags Arch name +cpuminer-sse2.exe Core2, Nehalem, generic x86_64 with SSE2 +cpuminer-aes-sse42.exe Westmere +cpuminer-avx.exe Sandybridge, Ivybridge +cpuminer-avx2.exe Haswell, Skylake, Kabylake, Coffeelake, Cometlake +cpuminer-avx2-sha.exe AMD Zen1, Zen2 +cpuminer-avx2-sha-vaes.exe Intel Alderlake*, AMD Zen3 +cpuminer-avx512.exe Intel HEDT Skylake-X, Cascadelake +cpuminer-avx512-sha-vaes.exe Icelake, Tigerlake, Rocketlake -cpuminer-sse2.exe "-msse2" Core2, Nehalem -cpuminer-aes-sse42.exe "-march=westmere" Westmere -cpuminer-avx.exe "-march=corei7-avx" Sandybridge, Ivybridge -cpuminer-avx2.exe "-march=core-avx2 -maes" Haswell(1) -cpuminer-avx512.exe "-march=skylake-avx512" Skylake-X, Cascadelake -cpuminer-avx512-sha.exe "-march=cascadelake -msha" Rocketlake(2) -cpuminer-avx512-sha-vaes.exe "-march=icelake-client" Icelake, Tigerlake(3) -cpuminer-zen.exe "-march=znver1" AMD Zen1, Zen2 -cpuminer-zen3.exe "-march=znver2 -mvaes" Zen3(4) - -(1) Haswell includes Broadwell, Skylake, Kabylake, Coffeelake & Cometlake. -(2) Rocketlake build uses cascadelake+sha as a workaround until Rocketlake - compiler support is avalable. -(3) Icelake & Tigerlake are only available on some laptops. Mining with a - laptop is not recommended. -(4) Zen3 build uses zen2+vaes as a workaround until Zen3 compiler support is - available. Zen2 CPUs should use Zen1 build. +* Alderlake is a hybrid architecture. With the E-cores disabled it may be + possible to enable AVX512 on the P-cores and use the avx512-sha-vaes + build. This is not officially supported by Intel at the time of writing. + Check for current information. Notes about included DLL files: Downloading DLL files from alternative sources presents an inherent security risk if their source is unknown. All DLL files included have -been copied from the Ubuntu-20.04 instalation or compiled by me from +been copied from the Ubuntu-20.04 installation or compiled by me from source code obtained from the author's official repository. The exact procedure is documented in the build instructions for Windows: https://github.com/JayDDee/cpuminer-opt/wiki/Compiling-from-source +Some DLL files may already be installed on the system by Windows or third +party packages. They will often work and may be used instead of the included +file. + If you like this software feel free to donate: BTC: 12tdvfF7KmAsihBXQXynT6E6th2c2pByTT diff --git a/RELEASE_NOTES b/RELEASE_NOTES index 1e2f27ad..fd8d114a 100644 --- a/RELEASE_NOTES +++ b/RELEASE_NOTES @@ -65,10 +65,213 @@ If not what makes it happen or not happen? Change Log ---------- +v3.19.6 + +#363 Fixed a stratum bug where the first job may be ignored, delaying the start of hashing. +Fixed handling of nonce exhaustion when hashing a fast algo with extranonce disabled. +Small optimization to Shavite. + +v3.19.5 + +Enhanced stratum-keepalive preemptively resets the stratum connection +before the server does, to avoid lost shares. + +Added build-msys2.sh script for easier compiling on Windows; see the Wiki for details. + +X16RT: eliminate unnecessary recalculations of the hash order. + +Fix a few compiler warnings. + +Fixed log colour error when a block is solved. + +v3.19.4 + +#359: Fix verthash memory allocation for non-hugepages, broken in v3.19.3.
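The #359 entry above, the malloc-huge.c file added in Makefile.am earlier, and the v3.19.3 note below all relate to Linux huge pages. The sketch that follows is not the project's malloc-huge.c; it is only a minimal illustration, assuming a 2 MiB huge page size, of how an allocation can try MAP_HUGETLB first and fall back to an ordinary allocation.

/* Illustrative Linux-only sketch (not the project's malloc-huge.c): try to
   allocate a buffer backed by 2 MiB huge pages and fall back to malloc if
   none are available. The caller would need to remember which path was used
   in order to free it correctly (munmap vs free). */
#define _GNU_SOURCE
#include <stdlib.h>
#include <sys/mman.h>

#define HUGEPAGE_SIZE ( 2UL * 1024 * 1024 )   /* assume 2 MiB huge pages */

void *alloc_maybe_huge( size_t size )
{
   /* round the request up to a whole number of huge pages */
   size_t rounded = ( size + HUGEPAGE_SIZE - 1 ) & ~( HUGEPAGE_SIZE - 1 );

   void *p = mmap( NULL, rounded, PROT_READ | PROT_WRITE,
                   MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0 );
   if ( p != MAP_FAILED ) return p;

   return malloc( size );   /* no huge pages configured, use the heap */
}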
+ +New option stratum-keepalive prevents stratum timeouts when no shares are +submitted for several minutes due to high difficulty. + +Fixed a bug displaying optimizations for some algos. + +v3.19.3 + +Linux: Faster verthash (+25%), scryptn2 (+2%) when huge pages are available. + +Small speed up for Hamsi AVX2 & AVX512, Keccak AVX512. + +v3.19.2 + +Fixed log displaying incorrect memory usage for scrypt, broken in v3.19.1. + +Reduce log noise when replies to submitted shares are lost due to stratum errors. + +Fugue prehash optimization for X16r family AVX2 & AVX512. + +Small speed improvement for Hamsi AVX2 & AVX512. + +Win: With CPU groups enabled the number of CPUs displayed in the ASCII art +affinity map is the number of CPUs in a CPU group, was number of CPUs up to 64. + +v3.19.1 + +Changes to Windows binaries package: + - builds for CPUs with AVX or lower have CPU groups disabled, + - zen3 build renamed to avx2-sha-vaes to support Alderlake as well as Zen3, + - zen build renamed to avx2-sha, supports Zen1 & Zen2, + - avx512-sha build removed, Rocketlake CPUs can use avx512-sha-vaes, + - see README.txt for compatibility details. + +Fixed a few compiler warnings that are new in GCC 11. +Other minor fixes. + +v3.19.0 + +Windows binaries now built with support for CPU groups, requires Windows 7. + +Changes to cpu-affinity: + - PR#346: Fixed incorrect CPU affinity on Windows built for CPU groups, + - added support for CPU affinity for up to 256 threads or CPUs, + - streamlined code for more efficient initialization of miner threads, + - precise affining of each miner thread to a specific CPU, + - added an option to disable CPU affinity with "--cpu-affinity 0" + +Faster sha256t with AVX512 & AVX2. + +Added stratum error count to stats log, reported only when non-zero. + +v3.18.2 + +Issue #342, fixed Groestl AES on Windows, broken in v3.18.0. + +AVX512 for sha256d. + +SSE42 and AVX may now be displayed as mining features at startup. +This is hard coded for each algo, and is only implemented for scrypt +at this time as it is the only algo with significant performance differences +with those features. + +Fixed an issue where a high hashrate algo could cause excessive invalid hash +rate log reports when starting up in benchmark mode. + +v3.18.1 + +More speed for scrypt: + - additional scryptn2 optimizations for all CPU architectures, + - AVX2 is now used by default on CPUS with SHA but not AVX512, + - scrypt:1024 performance lost in v3.18.0 is restored, + - AVX512 & AVX2 improvements to scrypt:1024. + +Big speedup for SwiFFTx AVX2 & SSE4.1: x22i +55%, x25x +22%. + +Issue #337: fixed a problem that could display negative stats values in the +first summary report if the report was forced prematurely due to a stratum +diff change. The stats will still be invalid but should display zeros. + +v3.18.0 + +Complete rewrite of Scrypt code, optimized for large N factor (scryptn2): + - AVX512 & SHA support for sha256, AVX512 has priority, + - up to 50% increase in hashrate, + - memory requirements reduced 30-60% depending on CPU architecture, + - memory usage displayed at startup, + - scrypt, default N=1024 (LTC), will likely perform slower. + +Improved stale share detection and handling for Scrypt with large N factor: + - abort and discard partially computed hash when new work is detected, + - quicker response to new job, less time wasted mining stale job. 
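The "abort and discard partially computed hash when new work is detected" behaviour described above can be pictured as a long scan loop that polls a flag owned by the stratum thread. The sketch below uses hypothetical names and is not the actual cpuminer-opt code; it only illustrates the idea.

/* Hypothetical sketch of the "abort on new work" idea: a long-running scan
   loop polls a flag set by the stratum thread and throws away the partially
   computed work. Names are illustrative, not actual cpuminer-opt interfaces. */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

static atomic_bool new_work_flag;   /* set by the stratum thread on a new job */

/* Returns true if the range was fully scanned or a share was found,
   false if the scan was abandoned because new work arrived. */
bool scan_range( uint32_t first, uint32_t last,
                 bool (*hash_one)( uint32_t nonce ) )
{
   for ( uint32_t n = first; n < last; n++ )
   {
      if ( atomic_load_explicit( &new_work_flag, memory_order_relaxed ) )
         return false;            /* stale job: discard partial work */
      if ( hash_one( n ) )
         return true;             /* share found */
   }
   return true;
}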
+ +Improved stale share handling for all algorithms: + - report possible stale share when new work received with a previously + submitted share still pending, + - when new work is detected report the submission of an already completed, + otherwise valid, but likely stale, share, + - fixed incorrect block height in stale share log. + +Small performance improvements to sha, bmw, cube & hamsi for AVX512 & AVX2. + +When stratum disconnects miner threads go to idle until reconnected. + +Colour changes to some logs. + +Some low level function name changes for clarity and consistency. + +The reference hashrate in the summary log and the benchmark total hashrate +are now the mean hashrate for the session. + +v3.17.1 + +Fixed Windows build for AES+SSE4.2 (Westmere), was missing AES. +More ternary logic optimizations for AVX512, AVX512+VAES, and AVX512+AES. +Fixed my-gr algo for VAES. + +v3.17.0 + +AVX512 optimized using ternary logic instructions. +Faster sha256t on all CPU architectures: AVX512 +30%, SHA +30%, AVX2 +9%. +Use SHA on supported CPUs to produce merkle hash. +Fixed byte order in Extranonce2 log & replaced Block height with Job ID. + +v3.16.5 + +#329: Fixed GBT incorrect target diff in stats, second attempt. +Fixed formatting error in share result log when --no-color option is used. + +v3.16.4 + +Faster sha512 and sha256 when not using SHA CPU extension. +#329: Fixed GBT incorrect target diff in stats. + +v3.16.3 + +#313 Fix compile error with GCC 11. +Incremental improvements to verthash. + +v3.16.2 + +Verthash: midstate prehash optimization for all architectures. +Verthash: AVX2 optimization. +GBT: added support for Bech32 addresses. +Linux: added CPU frequency to benchmark log. +Fixed integer overflow in time calculations. + +v3.16.1 + +New options for verthash: + --data-file to specify the name, and optionally the path, of the verthash + data file, default is "verthash.dat" in the current directory. + --verify to perform the data file integrity check at startup, default is + not to verify data file integrity. +Support for creation of default verthash data file if: + 1) --data-file option is not used, + 2) no default data file is found in the current directory, and, + 3) --verify option is used. +More detailed logs related to verthash data file. +Small verthash performance improvement. +Fixed detection of corrupt stats caused by networking issues. + +v3.16.0 + +Added verthash algo. + +v3.15.7 + +Added accepted/stale/rejected percentage to summary log report. +Added warning if share counters mismatch which could corrupt stats. +Linux: CPU temperature reporting is more responsive to rising temperature. +A few AVX2 & AVX512 tweaks. +Removed some dead code and other cleanup. + +v3.15.6 + +Implement keccak pre-hash optimization for x16* algos. +Move conditional mining test to before get_new_work in miner thread. +Add test for share reject reason when solo mining. +Add support for floating point, as well as integer, "networkhasps" in +RPC getmininginfo method. + v3.15.5 Fix stratum jobs lost if 2 jobs received in less than one second. - v3.15.4 diff --git a/algo-gate-api.c b/algo-gate-api.c index d8d5bb21..18d4b9e4 100644 --- a/algo-gate-api.c +++ b/algo-gate-api.c @@ -15,8 +15,6 @@ #include #include #include -#include -//#include "miner.h" #include "algo-gate-api.h" // Define null and standard functions. 
@@ -279,9 +277,11 @@ void init_algo_gate( algo_gate_t* gate ) #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wimplicit-function-declaration" -// called by each thread that uses the gate +// Called once by main bool register_algo_gate( int algo, algo_gate_t *gate ) { + bool rc = false; + if ( NULL == gate ) { applog(LOG_ERR,"FAIL: algo_gate registration failed, NULL gate\n"); @@ -290,112 +290,110 @@ bool register_algo_gate( int algo, algo_gate_t *gate ) init_algo_gate( gate ); - switch (algo) + switch ( algo ) { - case ALGO_0X10: register_0x10_algo ( gate ); break; - case ALGO_ALLIUM: register_allium_algo ( gate ); break; - case ALGO_ANIME: register_anime_algo ( gate ); break; - case ALGO_ARGON2: register_argon2_algo ( gate ); break; - case ALGO_ARGON2D250: register_argon2d_crds_algo ( gate ); break; - case ALGO_ARGON2D500: register_argon2d_dyn_algo ( gate ); break; - case ALGO_ARGON2D4096: register_argon2d4096_algo ( gate ); break; - case ALGO_ARGON2D16000: register_argon2d_bcrs_algo ( gate ); break; - case ALGO_AXIOM: register_axiom_algo ( gate ); break; - case ALGO_BLAKE: register_blake_algo ( gate ); break; - case ALGO_BLAKE2B: register_blake2b_algo ( gate ); break; - case ALGO_BLAKE2S: register_blake2s_algo ( gate ); break; - case ALGO_BLAKECOIN: register_blakecoin_algo ( gate ); break; - case ALGO_BMW512: register_bmw512_algo ( gate ); break; - case ALGO_C11: register_c11_algo ( gate ); break; - case ALGO_CPUPOWER: register_cpupower_algo ( gate ); break; - case ALGO_DECRED: register_decred_algo ( gate ); break; - case ALGO_DEEP: register_deep_algo ( gate ); break; - case ALGO_DMD_GR: register_dmd_gr_algo ( gate ); break; - case ALGO_GR: register_gr_algo ( gate ); break; - case ALGO_GROESTL: register_groestl_algo ( gate ); break; - case ALGO_HEX: register_hex_algo ( gate ); break; - case ALGO_HMQ1725: register_hmq1725_algo ( gate ); break; - case ALGO_HODL: register_hodl_algo ( gate ); break; - case ALGO_JHA: register_jha_algo ( gate ); break; - case ALGO_KECCAK: register_keccak_algo ( gate ); break; - case ALGO_KECCAKC: register_keccakc_algo ( gate ); break; - case ALGO_LBRY: register_lbry_algo ( gate ); break; - case ALGO_LYRA2H: register_lyra2h_algo ( gate ); break; - case ALGO_LYRA2RE: register_lyra2re_algo ( gate ); break; - case ALGO_LYRA2REV2: register_lyra2rev2_algo ( gate ); break; - case ALGO_LYRA2REV3: register_lyra2rev3_algo ( gate ); break; - case ALGO_LYRA2Z: register_lyra2z_algo ( gate ); break; - case ALGO_LYRA2Z330: register_lyra2z330_algo ( gate ); break; - case ALGO_M7M: register_m7m_algo ( gate ); break; - case ALGO_MINOTAUR: register_minotaur_algo ( gate ); break; - case ALGO_MINOTAURX: register_minotaurx_algo ( gate ); break; - case ALGO_MYR_GR: register_myriad_algo ( gate ); break; - case ALGO_NEOSCRYPT: register_neoscrypt_algo ( gate ); break; - case ALGO_NIST5: register_nist5_algo ( gate ); break; - case ALGO_PENTABLAKE: register_pentablake_algo ( gate ); break; - case ALGO_PHI1612: register_phi1612_algo ( gate ); break; - case ALGO_PHI2: register_phi2_algo ( gate ); break; - case ALGO_POLYTIMOS: register_polytimos_algo ( gate ); break; - case ALGO_POWER2B: register_power2b_algo ( gate ); break; - case ALGO_QUARK: register_quark_algo ( gate ); break; - case ALGO_QUBIT: register_qubit_algo ( gate ); break; - case ALGO_SCRYPT: register_scrypt_algo ( gate ); break; - case ALGO_SHA256D: register_sha256d_algo ( gate ); break; - case ALGO_SHA256Q: register_sha256q_algo ( gate ); break; - case ALGO_SHA256T: register_sha256t_algo ( gate ); break; - case 
ALGO_SHA3D: register_sha3d_algo ( gate ); break; - case ALGO_SHAVITE3: register_shavite_algo ( gate ); break; - case ALGO_SKEIN: register_skein_algo ( gate ); break; - case ALGO_SKEIN2: register_skein2_algo ( gate ); break; - case ALGO_SKUNK: register_skunk_algo ( gate ); break; - case ALGO_SONOA: register_sonoa_algo ( gate ); break; - case ALGO_TIMETRAVEL: register_timetravel_algo ( gate ); break; - case ALGO_TIMETRAVEL10: register_timetravel10_algo ( gate ); break; - case ALGO_TRIBUS: register_tribus_algo ( gate ); break; - case ALGO_VANILLA: register_vanilla_algo ( gate ); break; - case ALGO_VELTOR: register_veltor_algo ( gate ); break; - case ALGO_WHIRLPOOL: register_whirlpool_algo ( gate ); break; - case ALGO_WHIRLPOOLX: register_whirlpoolx_algo ( gate ); break; - case ALGO_X11: register_x11_algo ( gate ); break; - case ALGO_X11EVO: register_x11evo_algo ( gate ); break; - case ALGO_X11GOST: register_x11gost_algo ( gate ); break; - case ALGO_X12: register_x12_algo ( gate ); break; - case ALGO_X13: register_x13_algo ( gate ); break; - case ALGO_X13BCD: register_x13bcd_algo ( gate ); break; - case ALGO_X13SM3: register_x13sm3_algo ( gate ); break; - case ALGO_X14: register_x14_algo ( gate ); break; - case ALGO_X15: register_x15_algo ( gate ); break; - case ALGO_X16R: register_x16r_algo ( gate ); break; - case ALGO_X16RV2: register_x16rv2_algo ( gate ); break; - case ALGO_X16RT: register_x16rt_algo ( gate ); break; - case ALGO_X16RT_VEIL: register_x16rt_veil_algo ( gate ); break; - case ALGO_X16S: register_x16s_algo ( gate ); break; - case ALGO_X17: register_x17_algo ( gate ); break; - case ALGO_X21S: register_x21s_algo ( gate ); break; - case ALGO_X22I: register_x22i_algo ( gate ); break; - case ALGO_X25X: register_x25x_algo ( gate ); break; - case ALGO_XEVAN: register_xevan_algo ( gate ); break; - case ALGO_YESCRYPT: register_yescrypt_05_algo ( gate ); break; - case ALGO_YESCRYPTR8: register_yescryptr8_05_algo ( gate ); break; - case ALGO_YESCRYPTR8G: register_yescryptr8g_algo ( gate ); break; - case ALGO_YESCRYPTR16: register_yescryptr16_05_algo( gate ); break; - case ALGO_YESCRYPTR32: register_yescryptr32_05_algo( gate ); break; - case ALGO_YESPOWER: register_yespower_algo ( gate ); break; - case ALGO_YESPOWERARWN: register_yespowerarwn_algo ( gate ); break; - case ALGO_YESPOWERR16: register_yespowerr16_algo ( gate ); break; - case ALGO_YESPOWERSUGAR: register_yespowersugar_algo ( gate ); break; - case ALGO_YESPOWERURX: register_yespowerurx_algo ( gate ); break; - case ALGO_YESPOWER_B2B: register_yespower_b2b_algo ( gate ); break; - case ALGO_ZR5: register_zr5_algo ( gate ); break; + case ALGO_0X10: rc = register_hash0x10_algo ( gate ); break; + case ALGO_ALLIUM: rc = register_allium_algo ( gate ); break; + case ALGO_ANIME: rc = register_anime_algo ( gate ); break; + case ALGO_ARGON2: rc = register_argon2_algo ( gate ); break; + case ALGO_ARGON2D250: rc = register_argon2d_crds_algo ( gate ); break; + case ALGO_ARGON2D500: rc = register_argon2d_dyn_algo ( gate ); break; + case ALGO_ARGON2D4096: rc = register_argon2d4096_algo ( gate ); break; + case ALGO_ARGON2D16000: rc = register_argon2d_bcrs_algo ( gate ); break; + case ALGO_AXIOM: rc = register_axiom_algo ( gate ); break; + case ALGO_BLAKE: rc = register_blake_algo ( gate ); break; + case ALGO_BLAKE2B: rc = register_blake2b_algo ( gate ); break; + case ALGO_BLAKE2S: rc = register_blake2s_algo ( gate ); break; + case ALGO_BLAKECOIN: rc = register_blakecoin_algo ( gate ); break; + case ALGO_BMW512: rc = register_bmw512_algo ( gate ); 
break; + case ALGO_C11: rc = register_c11_algo ( gate ); break; + case ALGO_CPUOWER: rc = register_cpupower_algo ( gate ); break; + case ALGO_DECRED: rc = register_decred_algo ( gate ); break; + case ALGO_DEEP: rc = register_deep_algo ( gate ); break; + case ALGO_DMD_GR: rc = register_dmd_gr_algo ( gate ); break; + case ALGO_GROESTL: rc = register_groestl_algo ( gate ); break; + case ALGO_HEX: rc = register_hex_algo ( gate ); break; + case ALGO_HMQ1725: rc = register_hmq1725_algo ( gate ); break; + case ALGO_HODL: rc = register_hodl_algo ( gate ); break; + case ALGO_JHA: rc = register_jha_algo ( gate ); break; + case ALGO_KECCAK: rc = register_keccak_algo ( gate ); break; + case ALGO_KECCAKC: rc = register_keccakc_algo ( gate ); break; + case ALGO_LBRY: rc = register_lbry_algo ( gate ); break; + case ALGO_LYRA2H: rc = register_lyra2h_algo ( gate ); break; + case ALGO_LYRA2RE: rc = register_lyra2re_algo ( gate ); break; + case ALGO_LYRA2REV2: rc = register_lyra2rev2_algo ( gate ); break; + case ALGO_LYRA2REV3: rc = register_lyra2rev3_algo ( gate ); break; + case ALGO_LYRA2Z: rc = register_lyra2z_algo ( gate ); break; + case ALGO_LYRA2Z330: rc = register_lyra2z330_algo ( gate ); break; + case ALGO_M7M: rc = register_m7m_algo ( gate ); break; + case ALGO_MINOTAUR: rc = register_minotaur_algo ( gate ); break; + case ALGO_MYR_GR: rc = register_myriad_algo ( gate ); break; + case ALGO_NEOSCRYPT: rc = register_neoscrypt_algo ( gate ); break; + case ALGO_NIST5: rc = register_nist5_algo ( gate ); break; + case ALGO_PENTABLAKE: rc = register_pentablake_algo ( gate ); break; + case ALGO_PHI1612: rc = register_phi1612_algo ( gate ); break; + case ALGO_PHI2: rc = register_phi2_algo ( gate ); break; + case ALGO_POLYTIMOS: rc = register_polytimos_algo ( gate ); break; + case ALGO_POWER2B: rc = register_power2b_algo ( gate ); break; + case ALGO_QUARK: rc = register_quark_algo ( gate ); break; + case ALGO_QUBIT: rc = register_qubit_algo ( gate ); break; + case ALGO_SCRYPT: rc = register_scrypt_algo ( gate ); break; + case ALGO_SHA256D: rc = register_sha256d_algo ( gate ); break; + case ALGO_SHA256Q: rc = register_sha256q_algo ( gate ); break; + case ALGO_SHA256T: rc = register_sha256t_algo ( gate ); break; + case ALGO_SHA3D: rc = register_sha3d_algo ( gate ); break; + case ALGO_SHAVITE3: rc = register_shavite_algo ( gate ); break; + case ALGO_SKEIN: rc = register_skein_algo ( gate ); break; + case ALGO_SKEIN2: rc = register_skein2_algo ( gate ); break; + case ALGO_SKUNK: rc = register_skunk_algo ( gate ); break; + case ALGO_SONOA: rc = register_sonoa_algo ( gate ); break; + case ALGO_TIMETRAVEL: rc = register_timetravel_algo ( gate ); break; + case ALGO_TIMETRAVEL10: rc = register_timetravel10_algo ( gate ); break; + case ALGO_TRIBUS: rc = register_tribus_algo ( gate ); break; + case ALGO_VANILLA: rc = register_vanilla_algo ( gate ); break; + case ALGO_VELTOR: rc = register_veltor_algo ( gate ); break; + case ALGO_VERTHASH: rc = register_verthash_algo ( gate ); break; + case ALGO_WHIRLPOOL: rc = register_whirlpool_algo ( gate ); break; + case ALGO_WHIRLPOOLX: rc = register_whirlpoolx_algo ( gate ); break; + case ALGO_X11: rc = register_x11_algo ( gate ); break; + case ALGO_X11EVO: rc = register_x11evo_algo ( gate ); break; + case ALGO_X11GOST: rc = register_x11gost_algo ( gate ); break; + case ALGO_X12: rc = register_x12_algo ( gate ); break; + case ALGO_X13: rc = register_x13_algo ( gate ); break; + case ALGO_X13BCD: rc = register_x13bcd_algo ( gate ); break; + case ALGO_X13SM3: rc = register_x13sm3_algo 
( gate ); break; + case ALGO_X14: rc = register_x14_algo ( gate ); break; + case ALGO_X15: rc = register_x15_algo ( gate ); break; + case ALGO_X16R: rc = register_x16r_algo ( gate ); break; + case ALGO_X16RV2: rc = register_x16rv2_algo ( gate ); break; + case ALGO_X16RT: rc = register_x16rt_algo ( gate ); break; + case ALGO_X16RT_VEIL: rc = register_x16rt_veil_algo ( gate ); break; + case ALGO_X16S: rc = register_x16s_algo ( gate ); break; + case ALGO_X17: rc = register_x17_algo ( gate ); break; + case ALGO_X21S: rc = register_x21s_algo ( gate ); break; + case ALGO_X22I: rc = register_x22i_algo ( gate ); break; + case ALGO_X25X: rc = register_x25x_algo ( gate ); break; + case ALGO_XEVAN: rc = register_xevan_algo ( gate ); break; + case ALGO_YESCRYPT: rc = register_yescrypt_05_algo ( gate ); break; + case ALGO_YESCRYPTR8: rc = register_yescryptr8_05_algo ( gate ); break; + case ALGO_YESCRYPTR8G: rc = register_yescryptr8g_algo ( gate ); break; + case ALGO_YESCRYPTR16: rc = register_yescryptr16_05_algo( gate ); break; + case ALGO_YESCRYPTR32: rc = register_yescryptr32_05_algo( gate ); break; + case ALGO_YESPOWER: rc = register_yespower_algo ( gate ); break; + case ALGO_YESPOWERARWN: rc = register_yespowerarwn_algo ( gate ); break; + case ALGO_YESPOWERR16: rc = register_yespowerr16_algo ( gate ); break; + case ALGO_YESPOWERSUGAR: rc = register_yespowersugar_algo ( gate ); break; + case ALGO_YESPOWERURX: rc = register_yespowerurx_algo ( gate ); break; + case ALGO_YESPOWER_B2B: rc = register_yespower_b2b_algo ( gate ); break; + case ALGO_ZR5: rc = register_zr5_algo ( gate ); break; default: - applog(LOG_ERR,"FAIL: algo_gate registration failed, unknown algo %s.\n", algo_names[opt_algo] ); + applog(LOG_ERR,"BUG: unregistered algorithm %s.\n", algo_names[opt_algo] ); return false; } // switch - // ensure required functions were defined. 
- if ( gate->scanhash == (void*)&null_scanhash ) + if ( !rc ) { - applog(LOG_ERR, "FAIL: Required algo_gate functions undefined\n"); + applog(LOG_ERR, "FAIL: %s algorithm failed to initialize\n", algo_names[opt_algo] ); return false; } return true; @@ -423,11 +421,8 @@ void exec_hash_function( int algo, void *output, const void *pdata ) const char* const algo_alias_map[][2] = { // alias proper - { "0x10", "0x10" }, { "argon2d-dyn", "argon2d500" }, { "argon2d-uis", "argon2d4096" }, - { "argon2d-crds", "argon2d250" }, - { "argon2d-bcrs", "argon2d16000" }, { "bcd", "x13bcd" }, { "bitcore", "timetravel10" }, { "bitzeny", "yescryptr8" }, @@ -435,14 +430,11 @@ const char* const algo_alias_map[][2] = { "blake256r8vnl", "vanilla" }, { "blake256r14", "blake" }, { "blake256r14dcr", "decred" }, - { "cpupower", "cpupower" }, { "diamond", "dmd-gr" }, { "espers", "hmq1725" }, { "flax", "c11" }, - { "ghostrider", "gr" }, { "hsr", "x13sm3" }, { "jackpot", "jha" }, - { "jane", "scryptjane" }, { "lyra2", "lyra2re" }, { "lyra2v2", "lyra2rev2" }, { "lyra2v3", "lyra2rev3" }, @@ -455,10 +447,7 @@ const char* const algo_alias_map[][2] = { "timetravel8", "timetravel" }, { "veil", "x16rt-veil" }, { "x16r-hex", "hex" }, - { "yenten", "yespowerr16" }, - { "yespowerarwn", "yespowerarwn" }, - { "yespowersugar", "yespowersugar" }, - { "yespowerurx", "yespowerurx" }, + { "yenten", "yescryptr16" }, { "ziftr", "zr5" }, { NULL, NULL } }; diff --git a/algo-gate-api.h b/algo-gate-api.h index c578f85a..07108021 100644 --- a/algo-gate-api.h +++ b/algo-gate-api.h @@ -1,3 +1,6 @@ +#ifndef __ALGO_GATE_API_H__ +#define __ALGO_GATE_API_H__ 1 + #include #include #include @@ -94,7 +97,6 @@ typedef uint32_t set_t; #define SHA_OPT 0x20 // Zen1, Icelake (sha256) #define AVX512_OPT 0x40 // Skylake-X (AVX512[F,VL,DQ,BW]) #define VAES_OPT 0x80 // Icelake (VAES & AVX512) -#define VAES256_OPT 0x100 // Zen3 (VAES without AVX512) // return set containing all elements from sets a & b @@ -114,15 +116,15 @@ typedef struct // Mandatory functions, one of these is mandatory. If a generic scanhash // is used a custom target hash function must be registered, with a custom // scanhash the target hash function can be called directly and doesn't need -// to be registered in the gate. +// to be registered with the gate. int ( *scanhash ) ( struct work*, uint32_t, uint64_t*, struct thr_info* ); int ( *hash ) ( void*, const void*, int ); //optional, safe to use default in most cases -// Allocate thread local buffers and other initialization specific to miner -// threads. +// Called once by each miner thread to allocate thread local buffers and +// other initialization specific to miner threads. bool ( *miner_thread_init ) ( int ); // Get thread local copy of blockheader with unique nonce. @@ -150,7 +152,7 @@ void ( *build_stratum_request ) ( char*, struct work*, struct stratum_ctx* ); char* ( *malloc_txs_request ) ( struct work* ); -// Big or little +// Big endian or little endian void ( *set_work_data_endian ) ( struct work* ); double ( *calc_network_diff ) ( struct work* ); @@ -260,7 +262,7 @@ int scanhash_8way_64in_32out( struct work *work, uint32_t max_nonce, #endif // displays warning -int null_hash (); +int null_hash(); // optional safe targets, default listed first unless noted. 
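The blamka() and muladd() AVX512 helpers added in the Argon2 hunks below compute the BlaMka multiply-add used by Argon2's G function, f(x, y) = x + y + 2 * lo32(x) * lo32(y), on eight 64-bit lanes at once (_mm512_mul_epu32 multiplies only the low 32 bits of each lane). A scalar reference, for orientation:

/* Scalar reference for BlaMka: the permutation's additions are replaced by
   x + y + 2 * (low 32 bits of x) * (low 32 bits of y). */
#include <stdint.h>

static inline uint64_t blamka_scalar( uint64_t x, uint64_t y )
{
   uint64_t m = (uint64_t)(uint32_t)x * (uint64_t)(uint32_t)y;
   return x + y + 2 * m;
}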
@@ -281,7 +283,7 @@ void std_be_build_stratum_request( char *req, struct work *work ); char* std_malloc_txs_request( struct work *work ); -// Default is do_nothing (assumed LE) +// Default is do_nothing, little endian is assumed void set_work_data_big_endian( struct work *work ); double std_calc_network_diff( struct work *work ); @@ -319,3 +321,4 @@ void exec_hash_function( int algo, void *output, const void *pdata ); // algo name if valid alias, NULL if invalid alias or algo. void get_algo_alias( char **algo_or_alias ); +#endif diff --git a/algo/argon2/argon2a/ar2/sj/scrypt-jane-portable-x86.h b/algo/argon2/argon2a/ar2/sj/scrypt-jane-portable-x86.h index fb457945..c2f9edcc 100644 --- a/algo/argon2/argon2a/ar2/sj/scrypt-jane-portable-x86.h +++ b/algo/argon2/argon2a/ar2/sj/scrypt-jane-portable-x86.h @@ -344,7 +344,7 @@ static size_t detect_cpu(void) { //union { uint8_t s[12]; uint32_t i[3]; } vendor_string; //cpu_vendors_x86 vendor = cpu_nobody; - x86_regs regs; + x86_regs regs; regs.eax = regs.ebx = regs.ecx = 0; uint32_t max_level, max_ext_level; size_t cpu_flags = 0; #if defined(X86ASM_AVX) || defined(X86_64ASM_AVX) @@ -460,4 +460,4 @@ get_top_cpuflag_desc(size_t flag) { #endif #endif -#endif /* defined(CPU_X86) || defined(CPU_X86_64) */ \ No newline at end of file +#endif /* defined(CPU_X86) || defined(CPU_X86_64) */ diff --git a/algo/argon2/argon2a/ar2/sj/scrypt-jane-romix-basic.h b/algo/argon2/argon2a/ar2/sj/scrypt-jane-romix-basic.h index 57ba649f..3124c847 100644 --- a/algo/argon2/argon2a/ar2/sj/scrypt-jane-romix-basic.h +++ b/algo/argon2/argon2a/ar2/sj/scrypt-jane-romix-basic.h @@ -4,11 +4,12 @@ typedef void (FASTCALL *scrypt_ROMixfn)(scrypt_mix_word_t *X/*[chunkWords]*/, sc #endif /* romix pre/post nop function */ +/* static void asm_calling_convention scrypt_romix_nop(scrypt_mix_word_t *blocks, size_t nblocks) { (void)blocks; (void)nblocks; } - +*/ /* romix pre/post endian conversion function */ static void asm_calling_convention scrypt_romix_convert_endian(scrypt_mix_word_t *blocks, size_t nblocks) { diff --git a/algo/argon2/argon2d/argon2d/opt.c b/algo/argon2/argon2d/argon2d/opt.c index 31829304..5164a1e9 100644 --- a/algo/argon2/argon2d/argon2d/opt.c +++ b/algo/argon2/argon2d/argon2d/opt.c @@ -37,6 +37,13 @@ #if defined(__AVX512F__) +static inline __m512i blamka( __m512i x, __m512i y ) +{ + __m512i xy = _mm512_mul_epu32( x, y ); + return _mm512_add_epi64( _mm512_add_epi64( x, y ), + _mm512_add_epi64( xy, xy ) ); +} + static void fill_block( __m512i *state, const block *ref_block, block *next_block, int with_xor ) { diff --git a/algo/argon2/argon2d/blake2/blamka-round-opt.h b/algo/argon2/argon2d/blake2/blamka-round-opt.h index 81563314..4cb8bdad 100644 --- a/algo/argon2/argon2d/blake2/blamka-round-opt.h +++ b/algo/argon2/argon2d/blake2/blamka-round-opt.h @@ -328,9 +328,7 @@ static BLAKE2_INLINE __m128i fBlaMka(__m128i x, __m128i y) { #include -#define ror64(x, n) _mm512_ror_epi64((x), (n)) - -static __m512i muladd(__m512i x, __m512i y) +static inline __m512i muladd(__m512i x, __m512i y) { __m512i z = _mm512_mul_epu32(x, y); return _mm512_add_epi64(_mm512_add_epi64(x, y), _mm512_add_epi64(z, z)); @@ -344,8 +342,8 @@ static __m512i muladd(__m512i x, __m512i y) D0 = _mm512_xor_si512(D0, A0); \ D1 = _mm512_xor_si512(D1, A1); \ \ - D0 = ror64(D0, 32); \ - D1 = ror64(D1, 32); \ + D0 = _mm512_ror_epi64(D0, 32); \ + D1 = _mm512_ror_epi64(D1, 32); \ \ C0 = muladd(C0, D0); \ C1 = muladd(C1, D1); \ @@ -353,8 +351,8 @@ static __m512i muladd(__m512i x, __m512i y) B0 = _mm512_xor_si512(B0, 
C0); \ B1 = _mm512_xor_si512(B1, C1); \ \ - B0 = ror64(B0, 24); \ - B1 = ror64(B1, 24); \ + B0 = _mm512_ror_epi64(B0, 24); \ + B1 = _mm512_ror_epi64(B1, 24); \ } while ((void)0, 0) #define G2(A0, B0, C0, D0, A1, B1, C1, D1) \ @@ -365,8 +363,8 @@ static __m512i muladd(__m512i x, __m512i y) D0 = _mm512_xor_si512(D0, A0); \ D1 = _mm512_xor_si512(D1, A1); \ \ - D0 = ror64(D0, 16); \ - D1 = ror64(D1, 16); \ + D0 = _mm512_ror_epi64(D0, 16); \ + D1 = _mm512_ror_epi64(D1, 16); \ \ C0 = muladd(C0, D0); \ C1 = muladd(C1, D1); \ @@ -374,8 +372,8 @@ static __m512i muladd(__m512i x, __m512i y) B0 = _mm512_xor_si512(B0, C0); \ B1 = _mm512_xor_si512(B1, C1); \ \ - B0 = ror64(B0, 63); \ - B1 = ror64(B1, 63); \ + B0 = _mm512_ror_epi64(B0, 63); \ + B1 = _mm512_ror_epi64(B1, 63); \ } while ((void)0, 0) #define DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1) \ @@ -417,11 +415,10 @@ static __m512i muladd(__m512i x, __m512i y) #define SWAP_HALVES(A0, A1) \ do { \ - __m512i t0, t1; \ - t0 = _mm512_shuffle_i64x2(A0, A1, _MM_SHUFFLE(1, 0, 1, 0)); \ - t1 = _mm512_shuffle_i64x2(A0, A1, _MM_SHUFFLE(3, 2, 3, 2)); \ - A0 = t0; \ - A1 = t1; \ + __m512i t; \ + t = _mm512_shuffle_i64x2(A0, A1, _MM_SHUFFLE(1, 0, 1, 0)); \ + A1 = _mm512_shuffle_i64x2(A0, A1, _MM_SHUFFLE(3, 2, 3, 2)); \ + A0 = t; \ } while((void)0, 0) #define SWAP_QUARTERS(A0, A1) \ diff --git a/algo/blake/blake-hash-4way.h b/algo/blake/blake-hash-4way.h index fc64583d..a5d74e0a 100644 --- a/algo/blake/blake-hash-4way.h +++ b/algo/blake/blake-hash-4way.h @@ -180,6 +180,7 @@ void blake512_8way_update( void *cc, const void *data, size_t len ); void blake512_8way_close( void *cc, void *dst ); void blake512_8way_full( blake_8way_big_context *sc, void * dst, const void *data, size_t len ); +void blake512_8way_hash_le80( void *hash, const void *data ); #endif // AVX512 #endif // AVX2 diff --git a/algo/blake/blake256-hash-4way.c b/algo/blake/blake256-hash-4way.c index 3de03633..65fbe1fa 100644 --- a/algo/blake/blake256-hash-4way.c +++ b/algo/blake/blake256-hash-4way.c @@ -669,14 +669,14 @@ do { \ ROUND_S_8WAY(2); \ ROUND_S_8WAY(3); \ } \ - H0 = _mm256_xor_si256( _mm256_xor_si256( V8, V0 ), H0 ); \ - H1 = _mm256_xor_si256( _mm256_xor_si256( V9, V1 ), H1 ); \ - H2 = _mm256_xor_si256( _mm256_xor_si256( VA, V2 ), H2 ); \ - H3 = _mm256_xor_si256( _mm256_xor_si256( VB, V3 ), H3 ); \ - H4 = _mm256_xor_si256( _mm256_xor_si256( VC, V4 ), H4 ); \ - H5 = _mm256_xor_si256( _mm256_xor_si256( VD, V5 ), H5 ); \ - H6 = _mm256_xor_si256( _mm256_xor_si256( VE, V6 ), H6 ); \ - H7 = _mm256_xor_si256( _mm256_xor_si256( VF, V7 ), H7 ); \ + H0 = mm256_xor3( V8, V0, H0 ); \ + H1 = mm256_xor3( V9, V1, H1 ); \ + H2 = mm256_xor3( VA, V2, H2 ); \ + H3 = mm256_xor3( VB, V3, H3 ); \ + H4 = mm256_xor3( VC, V4, H4 ); \ + H5 = mm256_xor3( VD, V5, H5 ); \ + H6 = mm256_xor3( VE, V6, H6 ); \ + H7 = mm256_xor3( VF, V7, H7 ); \ } while (0) @@ -808,14 +808,14 @@ do { \ ROUND_S_16WAY(2); \ ROUND_S_16WAY(3); \ } \ - H0 = _mm512_xor_si512( _mm512_xor_si512( V8, V0 ), H0 ); \ - H1 = _mm512_xor_si512( _mm512_xor_si512( V9, V1 ), H1 ); \ - H2 = _mm512_xor_si512( _mm512_xor_si512( VA, V2 ), H2 ); \ - H3 = _mm512_xor_si512( _mm512_xor_si512( VB, V3 ), H3 ); \ - H4 = _mm512_xor_si512( _mm512_xor_si512( VC, V4 ), H4 ); \ - H5 = _mm512_xor_si512( _mm512_xor_si512( VD, V5 ), H5 ); \ - H6 = _mm512_xor_si512( _mm512_xor_si512( VE, V6 ), H6 ); \ - H7 = _mm512_xor_si512( _mm512_xor_si512( VF, V7 ), H7 ); \ + H0 = mm512_xor3( V8, V0, H0 ); \ + H1 = mm512_xor3( V9, V1, H1 ); \ + H2 = mm512_xor3( VA, V2, H2 ); \ + H3 = 
mm512_xor3( VB, V3, H3 ); \ + H4 = mm512_xor3( VC, V4, H4 ); \ + H5 = mm512_xor3( VD, V5, H5 ); \ + H6 = mm512_xor3( VE, V6, H6 ); \ + H7 = mm512_xor3( VF, V7, H7 ); \ } while (0) #endif diff --git a/algo/blake/blake2b-hash-4way.c b/algo/blake/blake2b-hash-4way.c index d9853c2e..f4824434 100644 --- a/algo/blake/blake2b-hash-4way.c +++ b/algo/blake/blake2b-hash-4way.c @@ -122,14 +122,14 @@ static void blake2b_8way_compress( blake2b_8way_ctx *ctx, int last ) B2B8W_G( 3, 4, 9, 14, m[ sigma[i][14] ], m[ sigma[i][15] ] ); } - ctx->h[0] = _mm512_xor_si512( _mm512_xor_si512( ctx->h[0], v[0] ), v[ 8] ); - ctx->h[1] = _mm512_xor_si512( _mm512_xor_si512( ctx->h[1], v[1] ), v[ 9] ); - ctx->h[2] = _mm512_xor_si512( _mm512_xor_si512( ctx->h[2], v[2] ), v[10] ); - ctx->h[3] = _mm512_xor_si512( _mm512_xor_si512( ctx->h[3], v[3] ), v[11] ); - ctx->h[4] = _mm512_xor_si512( _mm512_xor_si512( ctx->h[4], v[4] ), v[12] ); - ctx->h[5] = _mm512_xor_si512( _mm512_xor_si512( ctx->h[5], v[5] ), v[13] ); - ctx->h[6] = _mm512_xor_si512( _mm512_xor_si512( ctx->h[6], v[6] ), v[14] ); - ctx->h[7] = _mm512_xor_si512( _mm512_xor_si512( ctx->h[7], v[7] ), v[15] ); + ctx->h[0] = mm512_xor3( ctx->h[0], v[0], v[ 8] ); + ctx->h[1] = mm512_xor3( ctx->h[1], v[1], v[ 9] ); + ctx->h[2] = mm512_xor3( ctx->h[2], v[2], v[10] ); + ctx->h[3] = mm512_xor3( ctx->h[3], v[3], v[11] ); + ctx->h[4] = mm512_xor3( ctx->h[4], v[4], v[12] ); + ctx->h[5] = mm512_xor3( ctx->h[5], v[5], v[13] ); + ctx->h[6] = mm512_xor3( ctx->h[6], v[6], v[14] ); + ctx->h[7] = mm512_xor3( ctx->h[7], v[7], v[15] ); } int blake2b_8way_init( blake2b_8way_ctx *ctx ) diff --git a/algo/blake/blake2b-hash-4way.h b/algo/blake/blake2b-hash-4way.h index 979e4b22..1256fb18 100644 --- a/algo/blake/blake2b-hash-4way.h +++ b/algo/blake/blake2b-hash-4way.h @@ -17,7 +17,7 @@ #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) -ALIGN(128) typedef struct { +typedef struct ALIGN( 64 ) { __m512i b[16]; // input buffer __m512i h[8]; // chained state uint64_t t[2]; // total number of bytes @@ -35,7 +35,7 @@ void blake2b_8way_final( blake2b_8way_ctx *ctx, void *out ); #if defined(__AVX2__) // state context -ALIGN(128) typedef struct { +typedef struct ALIGN( 64 ) { __m256i b[16]; // input buffer __m256i h[8]; // chained state uint64_t t[2]; // total number of bytes diff --git a/algo/blake/blake2s-gate.h b/algo/blake/blake2s-gate.h index 4c621b40..4a7942c3 100644 --- a/algo/blake/blake2s-gate.h +++ b/algo/blake/blake2s-gate.h @@ -4,7 +4,6 @@ #include #include "algo-gate-api.h" -//#if defined(__SSE4_2__) #if defined(__SSE2__) #define BLAKE2S_4WAY #endif @@ -27,8 +26,6 @@ int scanhash_blake2s_16way( struct work *work, uint32_t max_nonce, #elif defined (BLAKE2S_8WAY) -//#if defined(BLAKE2S_8WAY) - void blake2s_8way_hash( void *state, const void *input ); int scanhash_blake2s_8way( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ); diff --git a/algo/blake/blake2s-hash-4way.c b/algo/blake/blake2s-hash-4way.c index 094edd0b..190ad0b7 100644 --- a/algo/blake/blake2s-hash-4way.c +++ b/algo/blake/blake2s-hash-4way.c @@ -368,7 +368,7 @@ do { \ ROUND8W( 9 ); for( size_t i = 0; i < 8; ++i ) - S->h[i] = _mm256_xor_si256( _mm256_xor_si256( S->h[i], v[i] ), v[i + 8] ); + S->h[i] = mm256_xor3( S->h[i], v[i], v[i + 8] ); #undef G8W #undef ROUND8W @@ -566,7 +566,7 @@ do { \ ROUND16W( 9 ); for( size_t i = 0; i < 8; ++i ) - S->h[i] = _mm512_xor_si512( _mm512_xor_si512( S->h[i], v[i] ), v[i + 8] ); + S->h[i] = mm512_xor3( 
S->h[i], v[i], v[i + 8] ); #undef G16W #undef ROUND16W diff --git a/algo/blake/blake2s-hash-4way.h b/algo/blake/blake2s-hash-4way.h index baf28656..fc86c4fc 100644 --- a/algo/blake/blake2s-hash-4way.h +++ b/algo/blake/blake2s-hash-4way.h @@ -60,7 +60,7 @@ typedef struct __blake2s_nway_param } blake2s_nway_param; #pragma pack(pop) -ALIGN( 64 ) typedef struct __blake2s_4way_state +typedef struct ALIGN( 64 ) __blake2s_4way_state { __m128i h[8]; uint8_t buf[ BLAKE2S_BLOCKBYTES * 4 ]; @@ -80,7 +80,7 @@ int blake2s_4way_full_blocks( blake2s_4way_state *S, void *out, #if defined(__AVX2__) -ALIGN( 64 ) typedef struct __blake2s_8way_state +typedef struct ALIGN( 64 ) __blake2s_8way_state { __m256i h[8]; uint8_t buf[ BLAKE2S_BLOCKBYTES * 8 ]; @@ -101,7 +101,7 @@ int blake2s_8way_full_blocks( blake2s_8way_state *S, void *out, #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) -ALIGN( 128 ) typedef struct __blake2s_16way_state +typedef struct ALIGN( 64 ) __blake2s_16way_state { __m512i h[8]; uint8_t buf[ BLAKE2S_BLOCKBYTES * 16 ]; diff --git a/algo/blake/blake512-hash-4way.c b/algo/blake/blake512-hash-4way.c index a5d53948..d1b5d2bf 100644 --- a/algo/blake/blake512-hash-4way.c +++ b/algo/blake/blake512-hash-4way.c @@ -293,10 +293,6 @@ static const sph_u64 CB[16] = { H5 = (state)->H[5]; \ H6 = (state)->H[6]; \ H7 = (state)->H[7]; \ - S0 = (state)->S[0]; \ - S1 = (state)->S[1]; \ - S2 = (state)->S[2]; \ - S3 = (state)->S[3]; \ T0 = (state)->T0; \ T1 = (state)->T1; \ } while (0) @@ -310,10 +306,6 @@ static const sph_u64 CB[16] = { (state)->H[5] = H5; \ (state)->H[6] = H6; \ (state)->H[7] = H7; \ - (state)->S[0] = S0; \ - (state)->S[1] = S1; \ - (state)->S[2] = S2; \ - (state)->S[3] = S3; \ (state)->T0 = T0; \ (state)->T1 = T1; \ } while (0) @@ -348,7 +340,6 @@ static const sph_u64 CB[16] = { #define DECL_STATE64_8WAY \ __m512i H0, H1, H2, H3, H4, H5, H6, H7; \ - __m512i S0, S1, S2, S3; \ uint64_t T0, T1; #define COMPRESS64_8WAY( buf ) do \ @@ -366,10 +357,10 @@ static const sph_u64 CB[16] = { V5 = H5; \ V6 = H6; \ V7 = H7; \ - V8 = _mm512_xor_si512( S0, m512_const1_64( CB0 ) ); \ - V9 = _mm512_xor_si512( S1, m512_const1_64( CB1 ) ); \ - VA = _mm512_xor_si512( S2, m512_const1_64( CB2 ) ); \ - VB = _mm512_xor_si512( S3, m512_const1_64( CB3 ) ); \ + V8 = m512_const1_64( CB0 ); \ + V9 = m512_const1_64( CB1 ); \ + VA = m512_const1_64( CB2 ); \ + VB = m512_const1_64( CB3 ); \ VC = _mm512_xor_si512( _mm512_set1_epi64( T0 ), \ m512_const1_64( CB4 ) ); \ VD = _mm512_xor_si512( _mm512_set1_epi64( T0 ), \ @@ -414,14 +405,14 @@ static const sph_u64 CB[16] = { ROUND_B_8WAY(3); \ ROUND_B_8WAY(4); \ ROUND_B_8WAY(5); \ - H0 = mm512_xor4( V8, V0, S0, H0 ); \ - H1 = mm512_xor4( V9, V1, S1, H1 ); \ - H2 = mm512_xor4( VA, V2, S2, H2 ); \ - H3 = mm512_xor4( VB, V3, S3, H3 ); \ - H4 = mm512_xor4( VC, V4, S0, H4 ); \ - H5 = mm512_xor4( VD, V5, S1, H5 ); \ - H6 = mm512_xor4( VE, V6, S2, H6 ); \ - H7 = mm512_xor4( VF, V7, S3, H7 ); \ + H0 = mm512_xor3( V8, V0, H0 ); \ + H1 = mm512_xor3( V9, V1, H1 ); \ + H2 = mm512_xor3( VA, V2, H2 ); \ + H3 = mm512_xor3( VB, V3, H3 ); \ + H4 = mm512_xor3( VC, V4, H4 ); \ + H5 = mm512_xor3( VD, V5, H5 ); \ + H6 = mm512_xor3( VE, V6, H6 ); \ + H7 = mm512_xor3( VF, V7, H7 ); \ } while (0) void blake512_8way_compress( blake_8way_big_context *sc ) @@ -440,10 +431,10 @@ void blake512_8way_compress( blake_8way_big_context *sc ) V5 = sc->H[5]; V6 = sc->H[6]; V7 = sc->H[7]; - V8 = _mm512_xor_si512( sc->S[0], m512_const1_64( CB0 ) ); - V9 = 
_mm512_xor_si512( sc->S[1], m512_const1_64( CB1 ) ); - VA = _mm512_xor_si512( sc->S[2], m512_const1_64( CB2 ) ); - VB = _mm512_xor_si512( sc->S[3], m512_const1_64( CB3 ) ); + V8 = m512_const1_64( CB0 ); + V9 = m512_const1_64( CB1 ); + VA = m512_const1_64( CB2 ); + VB = m512_const1_64( CB3 ); VC = _mm512_xor_si512( _mm512_set1_epi64( sc->T0 ), m512_const1_64( CB4 ) ); VD = _mm512_xor_si512( _mm512_set1_epi64( sc->T0 ), @@ -492,19 +483,18 @@ void blake512_8way_compress( blake_8way_big_context *sc ) ROUND_B_8WAY(4); ROUND_B_8WAY(5); - sc->H[0] = mm512_xor4( V8, V0, sc->S[0], sc->H[0] ); - sc->H[1] = mm512_xor4( V9, V1, sc->S[1], sc->H[1] ); - sc->H[2] = mm512_xor4( VA, V2, sc->S[2], sc->H[2] ); - sc->H[3] = mm512_xor4( VB, V3, sc->S[3], sc->H[3] ); - sc->H[4] = mm512_xor4( VC, V4, sc->S[0], sc->H[4] ); - sc->H[5] = mm512_xor4( VD, V5, sc->S[1], sc->H[5] ); - sc->H[6] = mm512_xor4( VE, V6, sc->S[2], sc->H[6] ); - sc->H[7] = mm512_xor4( VF, V7, sc->S[3], sc->H[7] ); + sc->H[0] = mm512_xor3( V8, V0, sc->H[0] ); + sc->H[1] = mm512_xor3( V9, V1, sc->H[1] ); + sc->H[2] = mm512_xor3( VA, V2, sc->H[2] ); + sc->H[3] = mm512_xor3( VB, V3, sc->H[3] ); + sc->H[4] = mm512_xor3( VC, V4, sc->H[4] ); + sc->H[5] = mm512_xor3( VD, V5, sc->H[5] ); + sc->H[6] = mm512_xor3( VE, V6, sc->H[6] ); + sc->H[7] = mm512_xor3( VF, V7, sc->H[7] ); } void blake512_8way_init( blake_8way_big_context *sc ) { - __m512i zero = m512_zero; casti_m512i( sc->H, 0 ) = m512_const1_64( 0x6A09E667F3BCC908 ); casti_m512i( sc->H, 1 ) = m512_const1_64( 0xBB67AE8584CAA73B ); casti_m512i( sc->H, 2 ) = m512_const1_64( 0x3C6EF372FE94F82B ); @@ -514,11 +504,6 @@ void blake512_8way_init( blake_8way_big_context *sc ) casti_m512i( sc->H, 6 ) = m512_const1_64( 0x1F83D9ABFB41BD6B ); casti_m512i( sc->H, 7 ) = m512_const1_64( 0x5BE0CD19137E2179 ); - casti_m512i( sc->S, 0 ) = zero; - casti_m512i( sc->S, 1 ) = zero; - casti_m512i( sc->S, 2 ) = zero; - casti_m512i( sc->S, 3 ) = zero; - sc->T0 = sc->T1 = 0; sc->ptr = 0; } @@ -641,11 +626,6 @@ void blake512_8way_full( blake_8way_big_context *sc, void * dst, casti_m512i( sc->H, 6 ) = m512_const1_64( 0x1F83D9ABFB41BD6B ); casti_m512i( sc->H, 7 ) = m512_const1_64( 0x5BE0CD19137E2179 ); - casti_m512i( sc->S, 0 ) = m512_zero; - casti_m512i( sc->S, 1 ) = m512_zero; - casti_m512i( sc->S, 2 ) = m512_zero; - casti_m512i( sc->S, 3 ) = m512_zero; - sc->T0 = sc->T1 = 0; sc->ptr = 0; @@ -740,7 +720,6 @@ blake512_8way_close(void *cc, void *dst) #define DECL_STATE64_4WAY \ __m256i H0, H1, H2, H3, H4, H5, H6, H7; \ - __m256i S0, S1, S2, S3; \ uint64_t T0, T1; #define COMPRESS64_4WAY do \ @@ -758,10 +737,10 @@ blake512_8way_close(void *cc, void *dst) V5 = H5; \ V6 = H6; \ V7 = H7; \ - V8 = _mm256_xor_si256( S0, m256_const1_64( CB0 ) ); \ - V9 = _mm256_xor_si256( S1, m256_const1_64( CB1 ) ); \ - VA = _mm256_xor_si256( S2, m256_const1_64( CB2 ) ); \ - VB = _mm256_xor_si256( S3, m256_const1_64( CB3 ) ); \ + V8 = m256_const1_64( CB0 ); \ + V9 = m256_const1_64( CB1 ); \ + VA = m256_const1_64( CB2 ); \ + VB = m256_const1_64( CB3 ); \ VC = _mm256_xor_si256( _mm256_set1_epi64x( T0 ), \ m256_const1_64( CB4 ) ); \ VD = _mm256_xor_si256( _mm256_set1_epi64x( T0 ), \ @@ -804,14 +783,14 @@ blake512_8way_close(void *cc, void *dst) ROUND_B_4WAY(3); \ ROUND_B_4WAY(4); \ ROUND_B_4WAY(5); \ - H0 = mm256_xor4( V8, V0, S0, H0 ); \ - H1 = mm256_xor4( V9, V1, S1, H1 ); \ - H2 = mm256_xor4( VA, V2, S2, H2 ); \ - H3 = mm256_xor4( VB, V3, S3, H3 ); \ - H4 = mm256_xor4( VC, V4, S0, H4 ); \ - H5 = mm256_xor4( VD, V5, S1, H5 ); \ - H6 = mm256_xor4( 
VE, V6, S2, H6 ); \ - H7 = mm256_xor4( VF, V7, S3, H7 ); \ + H0 = mm256_xor3( V8, V0, H0 ); \ + H1 = mm256_xor3( V9, V1, H1 ); \ + H2 = mm256_xor3( VA, V2, H2 ); \ + H3 = mm256_xor3( VB, V3, H3 ); \ + H4 = mm256_xor3( VC, V4, H4 ); \ + H5 = mm256_xor3( VD, V5, H5 ); \ + H6 = mm256_xor3( VE, V6, H6 ); \ + H7 = mm256_xor3( VF, V7, H7 ); \ } while (0) @@ -831,10 +810,10 @@ void blake512_4way_compress( blake_4way_big_context *sc ) V5 = sc->H[5]; V6 = sc->H[6]; V7 = sc->H[7]; - V8 = _mm256_xor_si256( sc->S[0], m256_const1_64( CB0 ) ); - V9 = _mm256_xor_si256( sc->S[1], m256_const1_64( CB1 ) ); - VA = _mm256_xor_si256( sc->S[2], m256_const1_64( CB2 ) ); - VB = _mm256_xor_si256( sc->S[3], m256_const1_64( CB3 ) ); + V8 = m256_const1_64( CB0 ); + V9 = m256_const1_64( CB1 ); + VA = m256_const1_64( CB2 ); + VB = m256_const1_64( CB3 ); VC = _mm256_xor_si256( _mm256_set1_epi64x( sc->T0 ), m256_const1_64( CB4 ) ); VD = _mm256_xor_si256( _mm256_set1_epi64x( sc->T0 ), @@ -880,19 +859,18 @@ void blake512_4way_compress( blake_4way_big_context *sc ) ROUND_B_4WAY(4); ROUND_B_4WAY(5); - sc->H[0] = mm256_xor4( V8, V0, sc->S[0], sc->H[0] ); - sc->H[1] = mm256_xor4( V9, V1, sc->S[1], sc->H[1] ); - sc->H[2] = mm256_xor4( VA, V2, sc->S[2], sc->H[2] ); - sc->H[3] = mm256_xor4( VB, V3, sc->S[3], sc->H[3] ); - sc->H[4] = mm256_xor4( VC, V4, sc->S[0], sc->H[4] ); - sc->H[5] = mm256_xor4( VD, V5, sc->S[1], sc->H[5] ); - sc->H[6] = mm256_xor4( VE, V6, sc->S[2], sc->H[6] ); - sc->H[7] = mm256_xor4( VF, V7, sc->S[3], sc->H[7] ); + sc->H[0] = mm256_xor3( V8, V0, sc->H[0] ); + sc->H[1] = mm256_xor3( V9, V1, sc->H[1] ); + sc->H[2] = mm256_xor3( VA, V2, sc->H[2] ); + sc->H[3] = mm256_xor3( VB, V3, sc->H[3] ); + sc->H[4] = mm256_xor3( VC, V4, sc->H[4] ); + sc->H[5] = mm256_xor3( VD, V5, sc->H[5] ); + sc->H[6] = mm256_xor3( VE, V6, sc->H[6] ); + sc->H[7] = mm256_xor3( VF, V7, sc->H[7] ); } void blake512_4way_init( blake_4way_big_context *sc ) { - __m256i zero = m256_zero; casti_m256i( sc->H, 0 ) = m256_const1_64( 0x6A09E667F3BCC908 ); casti_m256i( sc->H, 1 ) = m256_const1_64( 0xBB67AE8584CAA73B ); casti_m256i( sc->H, 2 ) = m256_const1_64( 0x3C6EF372FE94F82B ); @@ -902,11 +880,6 @@ void blake512_4way_init( blake_4way_big_context *sc ) casti_m256i( sc->H, 6 ) = m256_const1_64( 0x1F83D9ABFB41BD6B ); casti_m256i( sc->H, 7 ) = m256_const1_64( 0x5BE0CD19137E2179 ); - casti_m256i( sc->S, 0 ) = zero; - casti_m256i( sc->S, 1 ) = zero; - casti_m256i( sc->S, 2 ) = zero; - casti_m256i( sc->S, 3 ) = zero; - sc->T0 = sc->T1 = 0; sc->ptr = 0; } @@ -1026,11 +999,6 @@ void blake512_4way_full( blake_4way_big_context *sc, void * dst, casti_m256i( sc->H, 6 ) = m256_const1_64( 0x1F83D9ABFB41BD6B ); casti_m256i( sc->H, 7 ) = m256_const1_64( 0x5BE0CD19137E2179 ); - casti_m256i( sc->S, 0 ) = m256_zero; - casti_m256i( sc->S, 1 ) = m256_zero; - casti_m256i( sc->S, 2 ) = m256_zero; - casti_m256i( sc->S, 3 ) = m256_zero; - sc->T0 = sc->T1 = 0; sc->ptr = 0; diff --git a/algo/blake/decred-gate.c b/algo/blake/decred-gate.c index 0a90de7f..bee00dd1 100644 --- a/algo/blake/decred-gate.c +++ b/algo/blake/decred-gate.c @@ -8,7 +8,7 @@ uint32_t *decred_get_nonceptr( uint32_t *work_data ) return &work_data[ DECRED_NONCE_INDEX ]; } -double decred_calc_network_diff( struct work* work ) +long double decred_calc_network_diff( struct work* work ) { // sample for diff 43.281 : 1c05ea29 // todo: endian reversed on longpoll could be zr5 specific... 
@@ -16,7 +16,7 @@ double decred_calc_network_diff( struct work* work ) uint32_t bits = ( nbits & 0xffffff ); int16_t shift = ( swab32(nbits) & 0xff ); // 0x1c = 28 int m; - double d = (double)0x0000ffff / (double)bits; + long double d = (long double)0x0000ffff / (long double)bits; for ( m = shift; m < 29; m++ ) d *= 256.0; @@ -25,7 +25,7 @@ double decred_calc_network_diff( struct work* work ) if ( shift == 28 ) d *= 256.0; // testnet if ( opt_debug_diff ) - applog( LOG_DEBUG, "net diff: %f -> shift %u, bits %08x", d, + applog( LOG_DEBUG, "net diff: %f -> shift %u, bits %08x", (double)d, shift, bits ); return net_diff; } @@ -70,7 +70,10 @@ void decred_be_build_stratum_request( char *req, struct work *work, rpc_user, work->job_id, xnonce2str, ntimestr, noncestr ); free(xnonce2str); } + +#if !defined(min) #define min(a,b) (a>b ? (b) :(a)) +#endif void decred_build_extraheader( struct work* g_work, struct stratum_ctx* sctx ) { diff --git a/algo/blake/sph-blake2s.h b/algo/blake/sph-blake2s.h index 2949fa62..eb66b7a5 100644 --- a/algo/blake/sph-blake2s.h +++ b/algo/blake/sph-blake2s.h @@ -116,7 +116,7 @@ extern "C" { uint8_t personal[BLAKE2S_PERSONALBYTES]; // 32 } blake2s_param; - ALIGN( 64 ) typedef struct __blake2s_state + typedef struct ALIGN( 64 ) __blake2s_state { uint32_t h[8]; uint32_t t[2]; diff --git a/algo/blake/sph_blake2b.h b/algo/blake/sph_blake2b.h index eaae071d..17f4381c 100644 --- a/algo/blake/sph_blake2b.h +++ b/algo/blake/sph_blake2b.h @@ -18,7 +18,7 @@ #endif // state context -ALIGN(64) typedef struct { +typedef ALIGN(64) struct { uint8_t b[128]; // input buffer uint64_t h[8]; // chained state uint64_t t[2]; // total number of bytes diff --git a/algo/bmw/bmw256-hash-4way.c b/algo/bmw/bmw256-hash-4way.c index 92e71836..8b9de767 100644 --- a/algo/bmw/bmw256-hash-4way.c +++ b/algo/bmw/bmw256-hash-4way.c @@ -867,40 +867,35 @@ void compress_small_8way( const __m256i *M, const __m256i H[16], qt[30] = expand2s8( qt, M, H, 30 ); qt[31] = expand2s8( qt, M, H, 31 ); - xl = _mm256_xor_si256( - mm256_xor4( qt[16], qt[17], qt[18], qt[19] ), - mm256_xor4( qt[20], qt[21], qt[22], qt[23] ) ); - xh = _mm256_xor_si256( xl, _mm256_xor_si256( - mm256_xor4( qt[24], qt[25], qt[26], qt[27] ), - mm256_xor4( qt[28], qt[29], qt[30], qt[31] ) ) ); + xl = mm256_xor3( mm256_xor3( qt[16], qt[17], qt[18] ), + mm256_xor3( qt[19], qt[20], qt[21] ), + _mm256_xor_si256( qt[22], qt[23] ) ); + + xh = mm256_xor3( mm256_xor3( xl, qt[24], qt[25] ), + mm256_xor3( qt[26], qt[27], qt[28] ), + mm256_xor3( qt[29], qt[30], qt[31] ) ); #define DH1L( m, sl, sr, a, b, c ) \ - _mm256_add_epi32( \ - _mm256_xor_si256( M[m], \ - _mm256_xor_si256( _mm256_slli_epi32( xh, sl ), \ - _mm256_srli_epi32( qt[a], sr ) ) ), \ - _mm256_xor_si256( _mm256_xor_si256( xl, qt[b] ), qt[c] ) ) + _mm256_add_epi32( mm256_xor3( M[m], _mm256_slli_epi32( xh, sl ), \ + _mm256_srli_epi32( qt[a], sr ) ), \ + mm256_xor3( xl, qt[b], qt[c] ) ) #define DH1R( m, sl, sr, a, b, c ) \ - _mm256_add_epi32( \ - _mm256_xor_si256( M[m], \ - _mm256_xor_si256( _mm256_srli_epi32( xh, sl ), \ - _mm256_slli_epi32( qt[a], sr ) ) ), \ - _mm256_xor_si256( _mm256_xor_si256( xl, qt[b] ), qt[c] ) ) + _mm256_add_epi32( mm256_xor3( M[m], _mm256_srli_epi32( xh, sl ), \ + _mm256_slli_epi32( qt[a], sr ) ), \ + mm256_xor3( xl, qt[b], qt[c] ) ) #define DH2L( m, rl, sl, h, a, b, c ) \ _mm256_add_epi32( _mm256_add_epi32( \ - mm256_rol_32( dH[h], rl ), \ - _mm256_xor_si256( _mm256_xor_si256( xh, qt[a] ), M[m] )), \ - _mm256_xor_si256( _mm256_slli_epi32( xl, sl ), \ - _mm256_xor_si256( 
qt[b], qt[c] ) ) ); + mm256_rol_32( dH[h], rl ), \ + mm256_xor3( xh, qt[a], M[m] ) ), \ + mm256_xor3( _mm256_slli_epi32( xl, sl ), qt[b], qt[c] ) ) #define DH2R( m, rl, sr, h, a, b, c ) \ _mm256_add_epi32( _mm256_add_epi32( \ - mm256_rol_32( dH[h], rl ), \ - _mm256_xor_si256( _mm256_xor_si256( xh, qt[a] ), M[m] )), \ - _mm256_xor_si256( _mm256_srli_epi32( xl, sr ), \ - _mm256_xor_si256( qt[b], qt[c] ) ) ); + mm256_rol_32( dH[h], rl ), \ + mm256_xor3( xh, qt[a], M[m] ) ), \ + mm256_xor3( _mm256_srli_epi32( xl, sr ), qt[b], qt[c] ) ) dH[ 0] = DH1L( 0, 5, 5, 16, 24, 0 ); dH[ 1] = DH1R( 1, 7, 8, 17, 25, 1 ); @@ -924,88 +919,6 @@ void compress_small_8way( const __m256i *M, const __m256i H[16], #undef DH2L #undef DH2R -/* - dH[ 0] = _mm256_add_epi32( - _mm256_xor_si256( M[0], - _mm256_xor_si256( _mm256_slli_epi32( xh, 5 ), - _mm256_srli_epi32( qt[16], 5 ) ) ), - _mm256_xor_si256( _mm256_xor_si256( xl, qt[24] ), qt[ 0] )); - dH[ 1] = _mm256_add_epi32( - _mm256_xor_si256( M[1], - _mm256_xor_si256( _mm256_srli_epi32( xh, 7 ), - _mm256_slli_epi32( qt[17], 8 ) ) ), - _mm256_xor_si256( _mm256_xor_si256( xl, qt[25] ), qt[ 1] )); - dH[ 2] = _mm256_add_epi32( - _mm256_xor_si256( M[2], - _mm256_xor_si256( _mm256_srli_epi32( xh, 5 ), - _mm256_slli_epi32( qt[18], 5 ) ) ), - _mm256_xor_si256( _mm256_xor_si256( xl, qt[26] ), qt[ 2] )); - dH[ 3] = _mm256_add_epi32( - _mm256_xor_si256( M[3], - _mm256_xor_si256( _mm256_srli_epi32( xh, 1 ), - _mm256_slli_epi32( qt[19], 5 ) ) ), - _mm256_xor_si256( _mm256_xor_si256( xl, qt[27] ), qt[ 3] )); - dH[ 4] = _mm256_add_epi32( - _mm256_xor_si256( M[4], - _mm256_xor_si256( _mm256_srli_epi32( xh, 3 ), - _mm256_slli_epi32( qt[20], 0 ) ) ), - _mm256_xor_si256( _mm256_xor_si256( xl, qt[28] ), qt[ 4] )); - dH[ 5] = _mm256_add_epi32( - _mm256_xor_si256( M[5], - _mm256_xor_si256( _mm256_slli_epi32( xh, 6 ), - _mm256_srli_epi32( qt[21], 6 ) ) ), - _mm256_xor_si256( _mm256_xor_si256( xl, qt[29] ), qt[ 5] )); - dH[ 6] = _mm256_add_epi32( - _mm256_xor_si256( M[6], - _mm256_xor_si256( _mm256_srli_epi32( xh, 4 ), - _mm256_slli_epi32( qt[22], 6 ) ) ), - _mm256_xor_si256( _mm256_xor_si256( xl, qt[30] ), qt[ 6] )); - dH[ 7] = _mm256_add_epi32( - _mm256_xor_si256( M[7], - _mm256_xor_si256( _mm256_srli_epi32( xh, 11 ), - _mm256_slli_epi32( qt[23], 2 ) ) ), - _mm256_xor_si256( _mm256_xor_si256( xl, qt[31] ), qt[ 7] )); - dH[ 8] = _mm256_add_epi32( _mm256_add_epi32( - mm256_rol_32( dH[4], 9 ), - _mm256_xor_si256( _mm256_xor_si256( xh, qt[24] ), M[ 8] )), - _mm256_xor_si256( _mm256_slli_epi32( xl, 8 ), - _mm256_xor_si256( qt[23], qt[ 8] ) ) ); - dH[ 9] = _mm256_add_epi32( _mm256_add_epi32( - mm256_rol_32( dH[5], 10 ), - _mm256_xor_si256( _mm256_xor_si256( xh, qt[25] ), M[ 9] )), - _mm256_xor_si256( _mm256_srli_epi32( xl, 6 ), - _mm256_xor_si256( qt[16], qt[ 9] ) ) ); - dH[10] = _mm256_add_epi32( _mm256_add_epi32( - mm256_rol_32( dH[6], 11 ), - _mm256_xor_si256( _mm256_xor_si256( xh, qt[26] ), M[10] )), - _mm256_xor_si256( _mm256_slli_epi32( xl, 6 ), - _mm256_xor_si256( qt[17], qt[10] ) ) ); - dH[11] = _mm256_add_epi32( _mm256_add_epi32( - mm256_rol_32( dH[7], 12 ), - _mm256_xor_si256( _mm256_xor_si256( xh, qt[27] ), M[11] )), - _mm256_xor_si256( _mm256_slli_epi32( xl, 4 ), - _mm256_xor_si256( qt[18], qt[11] ) ) ); - dH[12] = _mm256_add_epi32( _mm256_add_epi32( - mm256_rol_32( dH[0], 13 ), - _mm256_xor_si256( _mm256_xor_si256( xh, qt[28] ), M[12] )), - _mm256_xor_si256( _mm256_srli_epi32( xl, 3 ), - _mm256_xor_si256( qt[19], qt[12] ) ) ); - dH[13] = _mm256_add_epi32( _mm256_add_epi32( - 
mm256_rol_32( dH[1], 14 ), - _mm256_xor_si256( _mm256_xor_si256( xh, qt[29] ), M[13] )), - _mm256_xor_si256( _mm256_srli_epi32( xl, 4 ), - _mm256_xor_si256( qt[20], qt[13] ) ) ); - dH[14] = _mm256_add_epi32( _mm256_add_epi32( - mm256_rol_32( dH[2], 15 ), - _mm256_xor_si256( _mm256_xor_si256( xh, qt[30] ), M[14] )), - _mm256_xor_si256( _mm256_srli_epi32( xl, 7 ), - _mm256_xor_si256( qt[21], qt[14] ) ) ); - dH[15] = _mm256_add_epi32( _mm256_add_epi32( - mm256_rol_32( dH[3], 16 ), - _mm256_xor_si256( _mm256_xor_si256( xh, qt[31] ), M[15] )), - _mm256_xor_si256( _mm256_srli_epi32( xl, 2 ), - _mm256_xor_si256( qt[22], qt[15] ) ) ); -*/ } static const __m256i final_s8[16] = @@ -1422,40 +1335,35 @@ void compress_small_16way( const __m512i *M, const __m512i H[16], qt[30] = expand2s16( qt, M, H, 30 ); qt[31] = expand2s16( qt, M, H, 31 ); - xl = _mm512_xor_si512( - mm512_xor4( qt[16], qt[17], qt[18], qt[19] ), - mm512_xor4( qt[20], qt[21], qt[22], qt[23] ) ); - xh = _mm512_xor_si512( xl, _mm512_xor_si512( - mm512_xor4( qt[24], qt[25], qt[26], qt[27] ), - mm512_xor4( qt[28], qt[29], qt[30], qt[31] ) ) ); + xl = mm512_xor3( mm512_xor3( qt[16], qt[17], qt[18] ), + mm512_xor3( qt[19], qt[20], qt[21] ), + _mm512_xor_si512( qt[22], qt[23] ) ); + + xh = mm512_xor3( mm512_xor3( xl, qt[24], qt[25] ), + mm512_xor3( qt[26], qt[27], qt[28] ), + mm512_xor3( qt[29], qt[30], qt[31] ) ); #define DH1L( m, sl, sr, a, b, c ) \ - _mm512_add_epi32( \ - _mm512_xor_si512( M[m], \ - _mm512_xor_si512( _mm512_slli_epi32( xh, sl ), \ - _mm512_srli_epi32( qt[a], sr ) ) ), \ - _mm512_xor_si512( _mm512_xor_si512( xl, qt[b] ), qt[c] ) ) + _mm512_add_epi32( mm512_xor3( M[m], _mm512_slli_epi32( xh, sl ), \ + _mm512_srli_epi32( qt[a], sr ) ), \ + mm512_xor3( xl, qt[b], qt[c] ) ) #define DH1R( m, sl, sr, a, b, c ) \ - _mm512_add_epi32( \ - _mm512_xor_si512( M[m], \ - _mm512_xor_si512( _mm512_srli_epi32( xh, sl ), \ - _mm512_slli_epi32( qt[a], sr ) ) ), \ - _mm512_xor_si512( _mm512_xor_si512( xl, qt[b] ), qt[c] ) ) + _mm512_add_epi32( mm512_xor3( M[m], _mm512_srli_epi32( xh, sl ), \ + _mm512_slli_epi32( qt[a], sr ) ), \ + mm512_xor3( xl, qt[b], qt[c] ) ) #define DH2L( m, rl, sl, h, a, b, c ) \ _mm512_add_epi32( _mm512_add_epi32( \ - mm512_rol_32( dH[h], rl ), \ - _mm512_xor_si512( _mm512_xor_si512( xh, qt[a] ), M[m] )), \ - _mm512_xor_si512( _mm512_slli_epi32( xl, sl ), \ - _mm512_xor_si512( qt[b], qt[c] ) ) ); + mm512_rol_32( dH[h], rl ), \ + mm512_xor3( xh, qt[a], M[m] ) ), \ + mm512_xor3( _mm512_slli_epi32( xl, sl ), qt[b], qt[c] ) ) #define DH2R( m, rl, sr, h, a, b, c ) \ _mm512_add_epi32( _mm512_add_epi32( \ - mm512_rol_32( dH[h], rl ), \ - _mm512_xor_si512( _mm512_xor_si512( xh, qt[a] ), M[m] )), \ - _mm512_xor_si512( _mm512_srli_epi32( xl, sr ), \ - _mm512_xor_si512( qt[b], qt[c] ) ) ); + mm512_rol_32( dH[h], rl ), \ + mm512_xor3( xh, qt[a], M[m] ) ), \ + mm512_xor3( _mm512_srli_epi32( xl, sr ), qt[b], qt[c] ) ) dH[ 0] = DH1L( 0, 5, 5, 16, 24, 0 ); dH[ 1] = DH1R( 1, 7, 8, 17, 25, 1 ); diff --git a/algo/bmw/bmw512-hash-4way.c b/algo/bmw/bmw512-hash-4way.c index ae97b942..9ab4f897 100644 --- a/algo/bmw/bmw512-hash-4way.c +++ b/algo/bmw/bmw512-hash-4way.c @@ -594,22 +594,15 @@ void bmw512_2way_close( bmw_2way_big_context *ctx, void *dst ) #define rb6(x) mm256_rol_64( x, 43 ) #define rb7(x) mm256_rol_64( x, 53 ) -#define rol_off_64( M, j, off ) \ - mm256_rol_64( M[ ( (j) + (off) ) & 0xF ] , \ - ( ( (j) + (off) ) & 0xF ) + 1 ) - -#define add_elt_b( M, H, j ) \ - _mm256_xor_si256( \ - _mm256_add_epi64( \ - _mm256_sub_epi64( 
_mm256_add_epi64( rol_off_64( M, j, 0 ), \ - rol_off_64( M, j, 3 ) ), \ - rol_off_64( M, j, 10 ) ), \ - _mm256_set1_epi64x( ( (j) + 16 ) * 0x0555555555555555ULL ) ), \ - H[ ( (j)+7 ) & 0xF ] ) +#define rol_off_64( M, j ) \ + mm256_rol_64( M[ (j) & 0xF ], ( (j) & 0xF ) + 1 ) +#define add_elt_b( mj0, mj3, mj10, h, K ) \ + _mm256_xor_si256( h, _mm256_add_epi64( K, \ + _mm256_sub_epi64( _mm256_add_epi64( mj0, mj3 ), mj10 ) ) ) -#define expand1b( qt, M, H, i ) \ - _mm256_add_epi64( mm256_add4_64( \ +#define expand1_b( qt, i ) \ + mm256_add4_64( \ mm256_add4_64( sb1( qt[ (i)-16 ] ), sb2( qt[ (i)-15 ] ), \ sb3( qt[ (i)-14 ] ), sb0( qt[ (i)-13 ] )), \ mm256_add4_64( sb1( qt[ (i)-12 ] ), sb2( qt[ (i)-11 ] ), \ @@ -617,11 +610,10 @@ void bmw512_2way_close( bmw_2way_big_context *ctx, void *dst ) mm256_add4_64( sb1( qt[ (i)- 8 ] ), sb2( qt[ (i)- 7 ] ), \ sb3( qt[ (i)- 6 ] ), sb0( qt[ (i)- 5 ] )), \ mm256_add4_64( sb1( qt[ (i)- 4 ] ), sb2( qt[ (i)- 3 ] ), \ - sb3( qt[ (i)- 2 ] ), sb0( qt[ (i)- 1 ] ) ) ), \ - add_elt_b( M, H, (i)-16 ) ) + sb3( qt[ (i)- 2 ] ), sb0( qt[ (i)- 1 ] ) ) ) -#define expand2b( qt, M, H, i) \ - _mm256_add_epi64( mm256_add4_64( \ +#define expand2_b( qt, i) \ + mm256_add4_64( \ mm256_add4_64( qt[ (i)-16 ], rb1( qt[ (i)-15 ] ), \ qt[ (i)-14 ], rb2( qt[ (i)-13 ] ) ), \ mm256_add4_64( qt[ (i)-12 ], rb3( qt[ (i)-11 ] ), \ @@ -629,159 +621,98 @@ void bmw512_2way_close( bmw_2way_big_context *ctx, void *dst ) mm256_add4_64( qt[ (i)- 8 ], rb5( qt[ (i)- 7 ] ), \ qt[ (i)- 6 ], rb6( qt[ (i)- 5 ] ) ), \ mm256_add4_64( qt[ (i)- 4 ], rb7( qt[ (i)- 3 ] ), \ - sb4( qt[ (i)- 2 ] ), sb5( qt[ (i)- 1 ] ) ) ), \ - add_elt_b( M, H, (i)-16 ) ) - - + sb4( qt[ (i)- 2 ] ), sb5( qt[ (i)- 1 ] ) ) ) #define Wb0 \ _mm256_add_epi64( \ - _mm256_add_epi64( \ - _mm256_sub_epi64( _mm256_xor_si256( M[ 5], H[ 5] ), \ - _mm256_xor_si256( M[ 7], H[ 7] ) ), \ - _mm256_xor_si256( M[10], H[10] ) ), \ - _mm256_add_epi64( _mm256_xor_si256( M[13], H[13] ), \ - _mm256_xor_si256( M[14], H[14] ) ) ) + _mm256_add_epi64( _mm256_sub_epi64( mh[ 5], mh[ 7] ), mh[10] ), \ + _mm256_add_epi64( mh[13], mh[14] ) ) #define Wb1 \ _mm256_add_epi64( \ - _mm256_add_epi64( \ - _mm256_sub_epi64( _mm256_xor_si256( M[ 6], H[ 6] ), \ - _mm256_xor_si256( M[ 8], H[ 8] ) ), \ - _mm256_xor_si256( M[11], H[11] ) ), \ - _mm256_sub_epi64( _mm256_xor_si256( M[14], H[14] ), \ - _mm256_xor_si256( M[15], H[15] ) ) ) + _mm256_add_epi64( _mm256_sub_epi64( mh[ 6], mh[ 8] ), mh[11] ), \ + _mm256_sub_epi64( mh[14], mh[15] ) ) #define Wb2 \ _mm256_sub_epi64( \ - _mm256_add_epi64( \ - _mm256_add_epi64( _mm256_xor_si256( M[ 0], H[ 0] ), \ - _mm256_xor_si256( M[ 7], H[ 7] ) ), \ - _mm256_xor_si256( M[ 9], H[ 9] ) ), \ - _mm256_sub_epi64( _mm256_xor_si256( M[12], H[12] ), \ - _mm256_xor_si256( M[15], H[15] ) ) ) + _mm256_add_epi64( _mm256_add_epi64( mh[ 0], mh[ 7] ), mh[ 9] ), \ + _mm256_sub_epi64( mh[12], mh[15] ) ) #define Wb3 \ _mm256_sub_epi64( \ - _mm256_add_epi64( \ - _mm256_sub_epi64( _mm256_xor_si256( M[ 0], H[ 0] ), \ - _mm256_xor_si256( M[ 1], H[ 1] ) ), \ - _mm256_xor_si256( M[ 8], H[ 8] ) ), \ - _mm256_sub_epi64( _mm256_xor_si256( M[10], H[10] ), \ - _mm256_xor_si256( M[13], H[13] ) ) ) + _mm256_add_epi64( _mm256_sub_epi64( mh[ 0], mh[ 1] ), mh[ 8] ), \ + _mm256_sub_epi64( mh[10], \ + mh[13] ) ) #define Wb4 \ _mm256_sub_epi64( \ - _mm256_add_epi64( \ - _mm256_add_epi64( _mm256_xor_si256( M[ 1], H[ 1] ), \ - _mm256_xor_si256( M[ 2], H[ 2] ) ), \ - _mm256_xor_si256( M[ 9], H[ 9] ) ), \ - _mm256_add_epi64( _mm256_xor_si256( M[11], H[11] ), \ - 
_mm256_xor_si256( M[14], H[14] ) ) ) + _mm256_add_epi64( _mm256_add_epi64( mh[ 1], mh[ 2] ), mh[ 9] ), \ + _mm256_add_epi64( mh[11], mh[14] ) ) #define Wb5 \ _mm256_sub_epi64( \ - _mm256_add_epi64( \ - _mm256_sub_epi64( _mm256_xor_si256( M[ 3], H[ 3] ), \ - _mm256_xor_si256( M[ 2], H[ 2] ) ), \ - _mm256_xor_si256( M[10], H[10] ) ), \ - _mm256_sub_epi64( _mm256_xor_si256( M[12], H[12] ), \ - _mm256_xor_si256( M[15], H[15] ) ) ) + _mm256_add_epi64( _mm256_sub_epi64( mh[ 3], mh[ 2] ), mh[10] ), \ + _mm256_sub_epi64( mh[12], mh[15] ) ) #define Wb6 \ _mm256_sub_epi64( \ - _mm256_sub_epi64( \ - _mm256_sub_epi64( _mm256_xor_si256( M[ 4], H[ 4] ), \ - _mm256_xor_si256( M[ 0], H[ 0] ) ), \ - _mm256_xor_si256( M[ 3], H[ 3] ) ), \ - _mm256_sub_epi64( _mm256_xor_si256( M[11], H[11] ), \ - _mm256_xor_si256( M[13], H[13] ) ) ) + _mm256_sub_epi64( _mm256_sub_epi64( mh[ 4], mh[ 0] ), mh[ 3] ), \ + _mm256_sub_epi64( mh[11], mh[13] ) ) #define Wb7 \ _mm256_sub_epi64( \ - _mm256_sub_epi64( \ - _mm256_sub_epi64( _mm256_xor_si256( M[ 1], H[ 1] ), \ - _mm256_xor_si256( M[ 4], H[ 4] ) ), \ - _mm256_xor_si256( M[ 5], H[ 5] ) ), \ - _mm256_add_epi64( _mm256_xor_si256( M[12], H[12] ), \ - _mm256_xor_si256( M[14], H[14] ) ) ) + _mm256_sub_epi64( _mm256_sub_epi64( mh[ 1], mh[ 4] ), mh[ 5] ), \ + _mm256_add_epi64( mh[12], mh[14] ) ) #define Wb8 \ _mm256_add_epi64( \ - _mm256_sub_epi64( \ - _mm256_sub_epi64( _mm256_xor_si256( M[ 2], H[ 2] ), \ - _mm256_xor_si256( M[ 5], H[ 5] ) ), \ - _mm256_xor_si256( M[ 6], H[ 6] ) ), \ - _mm256_sub_epi64( _mm256_xor_si256( M[13], H[13] ), \ - _mm256_xor_si256( M[15], H[15] ) ) ) + _mm256_sub_epi64( _mm256_sub_epi64( mh[ 2], mh[ 5] ), mh[ 6] ), \ + _mm256_sub_epi64( mh[13], mh[15] ) ) #define Wb9 \ _mm256_sub_epi64( \ - _mm256_add_epi64( \ - _mm256_sub_epi64( _mm256_xor_si256( M[ 0], H[ 0] ), \ - _mm256_xor_si256( M[ 3], H[ 3] ) ), \ - _mm256_xor_si256( M[ 6], H[ 6] ) ), \ - _mm256_sub_epi64( _mm256_xor_si256( M[ 7], H[ 7] ), \ - _mm256_xor_si256( M[14], H[14] ) ) ) + _mm256_add_epi64( _mm256_sub_epi64( mh[ 0], mh[ 3] ), mh[ 6] ), \ + _mm256_sub_epi64( mh[ 7], mh[14] ) ) #define Wb10 \ _mm256_sub_epi64( \ - _mm256_sub_epi64( \ - _mm256_sub_epi64( _mm256_xor_si256( M[ 8], H[ 8] ), \ - _mm256_xor_si256( M[ 1], H[ 1] ) ), \ - _mm256_xor_si256( M[ 4], H[ 4] ) ), \ - _mm256_sub_epi64( _mm256_xor_si256( M[ 7], H[ 7] ), \ - _mm256_xor_si256( M[15], H[15] ) ) ) + _mm256_sub_epi64( _mm256_sub_epi64( mh[ 8], mh[ 1] ), mh[ 4] ), \ + _mm256_sub_epi64( mh[ 7], mh[15] ) ) #define Wb11 \ _mm256_sub_epi64( \ - _mm256_sub_epi64( \ - _mm256_sub_epi64( _mm256_xor_si256( M[ 8], H[ 8] ), \ - _mm256_xor_si256( M[ 0], H[ 0] ) ), \ - _mm256_xor_si256( M[ 2], H[ 2] ) ), \ - _mm256_sub_epi64( _mm256_xor_si256( M[ 5], H[ 5] ), \ - _mm256_xor_si256( M[ 9], H[ 9] ) ) ) + _mm256_sub_epi64( _mm256_sub_epi64( mh[ 8], mh[ 0] ), mh[ 2] ), \ + _mm256_sub_epi64( mh[ 5], mh[ 9] ) ) #define Wb12 \ _mm256_sub_epi64( \ - _mm256_sub_epi64( \ - _mm256_add_epi64( _mm256_xor_si256( M[ 1], H[ 1] ), \ - _mm256_xor_si256( M[ 3], H[ 3] ) ), \ - _mm256_xor_si256( M[ 6], H[ 6] ) ), \ - _mm256_sub_epi64( _mm256_xor_si256( M[ 9], H[ 9] ), \ - _mm256_xor_si256( M[10], H[10] ) ) ) + _mm256_sub_epi64( _mm256_add_epi64( mh[ 1], mh[ 3] ), mh[ 6] ), \ + _mm256_sub_epi64( mh[ 9], mh[10] ) ) #define Wb13 \ _mm256_add_epi64( \ - _mm256_add_epi64( \ - _mm256_add_epi64( _mm256_xor_si256( M[ 2], H[ 2] ), \ - _mm256_xor_si256( M[ 4], H[ 4] ) ), \ - _mm256_xor_si256( M[ 7], H[ 7] ) ), \ - _mm256_add_epi64( _mm256_xor_si256( M[10], H[10] ), \ - 
_mm256_xor_si256( M[11], H[11] ) ) ) + _mm256_add_epi64( _mm256_add_epi64( mh[ 2], mh[ 4] ), mh[ 7] ), \ + _mm256_add_epi64( mh[10], mh[11] ) ) #define Wb14 \ _mm256_sub_epi64( \ - _mm256_add_epi64( \ - _mm256_sub_epi64( _mm256_xor_si256( M[ 3], H[ 3] ), \ - _mm256_xor_si256( M[ 5], H[ 5] ) ), \ - _mm256_xor_si256( M[ 8], H[ 8] ) ), \ - _mm256_add_epi64( _mm256_xor_si256( M[11], H[11] ), \ - _mm256_xor_si256( M[12], H[12] ) ) ) + _mm256_add_epi64( _mm256_sub_epi64( mh[ 3], mh[ 5] ), mh[ 8] ), \ + _mm256_add_epi64( mh[11], mh[12] ) ) #define Wb15 \ _mm256_sub_epi64( \ - _mm256_sub_epi64( \ - _mm256_sub_epi64( _mm256_xor_si256( M[12], H[12] ), \ - _mm256_xor_si256( M[ 4], H[4] ) ), \ - _mm256_xor_si256( M[ 6], H[ 6] ) ), \ - _mm256_sub_epi64( _mm256_xor_si256( M[ 9], H[ 9] ), \ - _mm256_xor_si256( M[13], H[13] ) ) ) + _mm256_sub_epi64( _mm256_sub_epi64( mh[12], mh[ 4] ), mh[ 6] ), \ + _mm256_sub_epi64( mh[ 9], mh[13] ) ) void compress_big( const __m256i *M, const __m256i H[16], __m256i dH[16] ) { __m256i qt[32], xl, xh; + __m256i mh[16]; + int i; + + for ( i = 0; i < 16; i++ ) + mh[i] = _mm256_xor_si256( M[i], H[i] ); qt[ 0] = _mm256_add_epi64( sb0( Wb0 ), H[ 1] ); qt[ 1] = _mm256_add_epi64( sb1( Wb1 ), H[ 2] ); @@ -799,22 +730,60 @@ void compress_big( const __m256i *M, const __m256i H[16], __m256i dH[16] ) qt[13] = _mm256_add_epi64( sb3( Wb13), H[14] ); qt[14] = _mm256_add_epi64( sb4( Wb14), H[15] ); qt[15] = _mm256_add_epi64( sb0( Wb15), H[ 0] ); - qt[16] = expand1b( qt, M, H, 16 ); - qt[17] = expand1b( qt, M, H, 17 ); - qt[18] = expand2b( qt, M, H, 18 ); - qt[19] = expand2b( qt, M, H, 19 ); - qt[20] = expand2b( qt, M, H, 20 ); - qt[21] = expand2b( qt, M, H, 21 ); - qt[22] = expand2b( qt, M, H, 22 ); - qt[23] = expand2b( qt, M, H, 23 ); - qt[24] = expand2b( qt, M, H, 24 ); - qt[25] = expand2b( qt, M, H, 25 ); - qt[26] = expand2b( qt, M, H, 26 ); - qt[27] = expand2b( qt, M, H, 27 ); - qt[28] = expand2b( qt, M, H, 28 ); - qt[29] = expand2b( qt, M, H, 29 ); - qt[30] = expand2b( qt, M, H, 30 ); - qt[31] = expand2b( qt, M, H, 31 ); + + __m256i mj[16]; + for ( i = 0; i < 16; i++ ) + mj[i] = rol_off_64( M, i ); + + qt[16] = add_elt_b( mj[ 0], mj[ 3], mj[10], H[ 7], + (const __m256i)_mm256_set1_epi64x( 16 * 0x0555555555555555ULL ) ); + qt[17] = add_elt_b( mj[ 1], mj[ 4], mj[11], H[ 8], + (const __m256i)_mm256_set1_epi64x( 17 * 0x0555555555555555ULL ) ); + qt[18] = add_elt_b( mj[ 2], mj[ 5], mj[12], H[ 9], + (const __m256i)_mm256_set1_epi64x( 18 * 0x0555555555555555ULL ) ); + qt[19] = add_elt_b( mj[ 3], mj[ 6], mj[13], H[10], + (const __m256i)_mm256_set1_epi64x( 19 * 0x0555555555555555ULL ) ); + qt[20] = add_elt_b( mj[ 4], mj[ 7], mj[14], H[11], + (const __m256i)_mm256_set1_epi64x( 20 * 0x0555555555555555ULL ) ); + qt[21] = add_elt_b( mj[ 5], mj[ 8], mj[15], H[12], + (const __m256i)_mm256_set1_epi64x( 21 * 0x0555555555555555ULL ) ); + qt[22] = add_elt_b( mj[ 6], mj[ 9], mj[ 0], H[13], + (const __m256i)_mm256_set1_epi64x( 22 * 0x0555555555555555ULL ) ); + qt[23] = add_elt_b( mj[ 7], mj[10], mj[ 1], H[14], + (const __m256i)_mm256_set1_epi64x( 23 * 0x0555555555555555ULL ) ); + qt[24] = add_elt_b( mj[ 8], mj[11], mj[ 2], H[15], + (const __m256i)_mm256_set1_epi64x( 24 * 0x0555555555555555ULL ) ); + qt[25] = add_elt_b( mj[ 9], mj[12], mj[ 3], H[ 0], + (const __m256i)_mm256_set1_epi64x( 25 * 0x0555555555555555ULL ) ); + qt[26] = add_elt_b( mj[10], mj[13], mj[ 4], H[ 1], + (const __m256i)_mm256_set1_epi64x( 26 * 0x0555555555555555ULL ) ); + qt[27] = add_elt_b( mj[11], mj[14], mj[ 5], H[ 2], + (const 
__m256i)_mm256_set1_epi64x( 27 * 0x0555555555555555ULL ) ); + qt[28] = add_elt_b( mj[12], mj[15], mj[ 6], H[ 3], + (const __m256i)_mm256_set1_epi64x( 28 * 0x0555555555555555ULL ) ); + qt[29] = add_elt_b( mj[13], mj[ 0], mj[ 7], H[ 4], + (const __m256i)_mm256_set1_epi64x( 29 * 0x0555555555555555ULL ) ); + qt[30] = add_elt_b( mj[14], mj[ 1], mj[ 8], H[ 5], + (const __m256i)_mm256_set1_epi64x( 30 * 0x0555555555555555ULL ) ); + qt[31] = add_elt_b( mj[15], mj[ 2], mj[ 9], H[ 6], + (const __m256i)_mm256_set1_epi64x( 31 * 0x0555555555555555ULL ) ); + + qt[16] = _mm256_add_epi64( qt[16], expand1_b( qt, 16 ) ); + qt[17] = _mm256_add_epi64( qt[17], expand1_b( qt, 17 ) ); + qt[18] = _mm256_add_epi64( qt[18], expand2_b( qt, 18 ) ); + qt[19] = _mm256_add_epi64( qt[19], expand2_b( qt, 19 ) ); + qt[20] = _mm256_add_epi64( qt[20], expand2_b( qt, 20 ) ); + qt[21] = _mm256_add_epi64( qt[21], expand2_b( qt, 21 ) ); + qt[22] = _mm256_add_epi64( qt[22], expand2_b( qt, 22 ) ); + qt[23] = _mm256_add_epi64( qt[23], expand2_b( qt, 23 ) ); + qt[24] = _mm256_add_epi64( qt[24], expand2_b( qt, 24 ) ); + qt[25] = _mm256_add_epi64( qt[25], expand2_b( qt, 25 ) ); + qt[26] = _mm256_add_epi64( qt[26], expand2_b( qt, 26 ) ); + qt[27] = _mm256_add_epi64( qt[27], expand2_b( qt, 27 ) ); + qt[28] = _mm256_add_epi64( qt[28], expand2_b( qt, 28 ) ); + qt[29] = _mm256_add_epi64( qt[29], expand2_b( qt, 29 ) ); + qt[30] = _mm256_add_epi64( qt[30], expand2_b( qt, 30 ) ); + qt[31] = _mm256_add_epi64( qt[31], expand2_b( qt, 31 ) ); xl = _mm256_xor_si256( mm256_xor4( qt[16], qt[17], qt[18], qt[19] ), @@ -823,7 +792,6 @@ void compress_big( const __m256i *M, const __m256i H[16], __m256i dH[16] ) mm256_xor4( qt[24], qt[25], qt[26], qt[27] ), mm256_xor4( qt[28], qt[29], qt[30], qt[31] ) ) ); - #define DH1L( m, sl, sr, a, b, c ) \ _mm256_add_epi64( \ _mm256_xor_si256( M[m], \ @@ -1066,21 +1034,15 @@ bmw512_4way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) #define r8b6(x) mm512_rol_64( x, 43 ) #define r8b7(x) mm512_rol_64( x, 53 ) -#define rol8w_off_64( M, j, off ) \ - mm512_rol_64( M[ ( (j) + (off) ) & 0xF ] , \ - ( ( (j) + (off) ) & 0xF ) + 1 ) +#define rol8w_off_64( M, j ) \ + mm512_rol_64( M[ (j) & 0xF ], ( (j) & 0xF ) + 1 ) -#define add_elt_b8( M, H, j ) \ - _mm512_xor_si512( \ - _mm512_add_epi64( \ - _mm512_sub_epi64( _mm512_add_epi64( rol8w_off_64( M, j, 0 ), \ - rol8w_off_64( M, j, 3 ) ), \ - rol8w_off_64( M, j, 10 ) ), \ - _mm512_set1_epi64( ( (j) + 16 ) * 0x0555555555555555ULL ) ), \ - H[ ( (j)+7 ) & 0xF ] ) +#define add_elt_b8( mj0, mj3, mj10, h, K ) \ + _mm512_xor_si512( h, _mm512_add_epi64( K, \ + _mm512_sub_epi64( _mm512_add_epi64( mj0, mj3 ), mj10 ) ) ) -#define expand1b8( qt, M, H, i ) \ - _mm512_add_epi64( mm512_add4_64( \ +#define expand1_b8( qt, i ) \ + mm512_add4_64( \ mm512_add4_64( s8b1( qt[ (i)-16 ] ), s8b2( qt[ (i)-15 ] ), \ s8b3( qt[ (i)-14 ] ), s8b0( qt[ (i)-13 ] )), \ mm512_add4_64( s8b1( qt[ (i)-12 ] ), s8b2( qt[ (i)-11 ] ), \ @@ -1088,11 +1050,10 @@ bmw512_4way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) mm512_add4_64( s8b1( qt[ (i)- 8 ] ), s8b2( qt[ (i)- 7 ] ), \ s8b3( qt[ (i)- 6 ] ), s8b0( qt[ (i)- 5 ] )), \ mm512_add4_64( s8b1( qt[ (i)- 4 ] ), s8b2( qt[ (i)- 3 ] ), \ - s8b3( qt[ (i)- 2 ] ), s8b0( qt[ (i)- 1 ] ) ) ), \ - add_elt_b8( M, H, (i)-16 ) ) + s8b3( qt[ (i)- 2 ] ), s8b0( qt[ (i)- 1 ] ) ) ) -#define expand2b8( qt, M, H, i) \ - _mm512_add_epi64( mm512_add4_64( \ +#define expand2_b8( qt, i) \ + mm512_add4_64( \ mm512_add4_64( qt[ (i)-16 ], r8b1( qt[ (i)-15 ] ), \ qt[ 
(i)-14 ], r8b2( qt[ (i)-13 ] ) ), \ mm512_add4_64( qt[ (i)-12 ], r8b3( qt[ (i)-11 ] ), \ @@ -1100,157 +1061,97 @@ bmw512_4way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) mm512_add4_64( qt[ (i)- 8 ], r8b5( qt[ (i)- 7 ] ), \ qt[ (i)- 6 ], r8b6( qt[ (i)- 5 ] ) ), \ mm512_add4_64( qt[ (i)- 4 ], r8b7( qt[ (i)- 3 ] ), \ - s8b4( qt[ (i)- 2 ] ), s8b5( qt[ (i)- 1 ] ) ) ), \ - add_elt_b8( M, H, (i)-16 ) ) + s8b4( qt[ (i)- 2 ] ), s8b5( qt[ (i)- 1 ] ) ) ) #define W8b0 \ _mm512_add_epi64( \ - _mm512_add_epi64( \ - _mm512_sub_epi64( _mm512_xor_si512( M[ 5], H[ 5] ), \ - _mm512_xor_si512( M[ 7], H[ 7] ) ), \ - _mm512_xor_si512( M[10], H[10] ) ), \ - _mm512_add_epi64( _mm512_xor_si512( M[13], H[13] ), \ - _mm512_xor_si512( M[14], H[14] ) ) ) + _mm512_add_epi64( _mm512_sub_epi64( mh[ 5], mh[ 7] ), mh[10] ), \ + _mm512_add_epi64( mh[13], mh[14] ) ) #define W8b1 \ _mm512_add_epi64( \ - _mm512_add_epi64( \ - _mm512_sub_epi64( _mm512_xor_si512( M[ 6], H[ 6] ), \ - _mm512_xor_si512( M[ 8], H[ 8] ) ), \ - _mm512_xor_si512( M[11], H[11] ) ), \ - _mm512_sub_epi64( _mm512_xor_si512( M[14], H[14] ), \ - _mm512_xor_si512( M[15], H[15] ) ) ) + _mm512_add_epi64( _mm512_sub_epi64( mh[ 6], mh[ 8] ), mh[11] ), \ + _mm512_sub_epi64( mh[14], mh[15] ) ) #define W8b2 \ _mm512_sub_epi64( \ - _mm512_add_epi64( \ - _mm512_add_epi64( _mm512_xor_si512( M[ 0], H[ 0] ), \ - _mm512_xor_si512( M[ 7], H[ 7] ) ), \ - _mm512_xor_si512( M[ 9], H[ 9] ) ), \ - _mm512_sub_epi64( _mm512_xor_si512( M[12], H[12] ), \ - _mm512_xor_si512( M[15], H[15] ) ) ) + _mm512_add_epi64( _mm512_add_epi64( mh[ 0], mh[ 7] ), mh[ 9] ), \ + _mm512_sub_epi64( mh[12], mh[15] ) ) #define W8b3 \ _mm512_sub_epi64( \ - _mm512_add_epi64( \ - _mm512_sub_epi64( _mm512_xor_si512( M[ 0], H[ 0] ), \ - _mm512_xor_si512( M[ 1], H[ 1] ) ), \ - _mm512_xor_si512( M[ 8], H[ 8] ) ), \ - _mm512_sub_epi64( _mm512_xor_si512( M[10], H[10] ), \ - _mm512_xor_si512( M[13], H[13] ) ) ) + _mm512_add_epi64( _mm512_sub_epi64( mh[ 0], mh[ 1] ), mh[ 8] ), \ + _mm512_sub_epi64( mh[10], mh[13] ) ) #define W8b4 \ _mm512_sub_epi64( \ - _mm512_add_epi64( \ - _mm512_add_epi64( _mm512_xor_si512( M[ 1], H[ 1] ), \ - _mm512_xor_si512( M[ 2], H[ 2] ) ), \ - _mm512_xor_si512( M[ 9], H[ 9] ) ), \ - _mm512_add_epi64( _mm512_xor_si512( M[11], H[11] ), \ - _mm512_xor_si512( M[14], H[14] ) ) ) + _mm512_add_epi64( _mm512_add_epi64( mh[ 1], mh[ 2] ), mh[ 9] ), \ + _mm512_add_epi64( mh[11], mh[14] ) ) #define W8b5 \ _mm512_sub_epi64( \ - _mm512_add_epi64( \ - _mm512_sub_epi64( _mm512_xor_si512( M[ 3], H[ 3] ), \ - _mm512_xor_si512( M[ 2], H[ 2] ) ), \ - _mm512_xor_si512( M[10], H[10] ) ), \ - _mm512_sub_epi64( _mm512_xor_si512( M[12], H[12] ), \ - _mm512_xor_si512( M[15], H[15] ) ) ) + _mm512_add_epi64( _mm512_sub_epi64( mh[ 3], mh[ 2] ), mh[10] ), \ + _mm512_sub_epi64( mh[12], mh[15] ) ) #define W8b6 \ _mm512_sub_epi64( \ - _mm512_sub_epi64( \ - _mm512_sub_epi64( _mm512_xor_si512( M[ 4], H[ 4] ), \ - _mm512_xor_si512( M[ 0], H[ 0] ) ), \ - _mm512_xor_si512( M[ 3], H[ 3] ) ), \ - _mm512_sub_epi64( _mm512_xor_si512( M[11], H[11] ), \ - _mm512_xor_si512( M[13], H[13] ) ) ) + _mm512_sub_epi64( _mm512_sub_epi64( mh[ 4], mh[ 0] ), mh[ 3] ), \ + _mm512_sub_epi64( mh[11], mh[13] ) ) #define W8b7 \ _mm512_sub_epi64( \ - _mm512_sub_epi64( \ - _mm512_sub_epi64( _mm512_xor_si512( M[ 1], H[ 1] ), \ - _mm512_xor_si512( M[ 4], H[ 4] ) ), \ - _mm512_xor_si512( M[ 5], H[ 5] ) ), \ - _mm512_add_epi64( _mm512_xor_si512( M[12], H[12] ), \ - _mm512_xor_si512( M[14], H[14] ) ) ) + _mm512_sub_epi64( 
_mm512_sub_epi64( mh[ 1], mh[ 4] ), mh[ 5] ), \ + _mm512_add_epi64( mh[12], mh[14] ) ) #define W8b8 \ _mm512_add_epi64( \ - _mm512_sub_epi64( \ - _mm512_sub_epi64( _mm512_xor_si512( M[ 2], H[ 2] ), \ - _mm512_xor_si512( M[ 5], H[ 5] ) ), \ - _mm512_xor_si512( M[ 6], H[ 6] ) ), \ - _mm512_sub_epi64( _mm512_xor_si512( M[13], H[13] ), \ - _mm512_xor_si512( M[15], H[15] ) ) ) + _mm512_sub_epi64( _mm512_sub_epi64( mh[ 2], mh[ 5] ), mh[ 6] ), \ + _mm512_sub_epi64( mh[13], mh[15] ) ) #define W8b9 \ _mm512_sub_epi64( \ - _mm512_add_epi64( \ - _mm512_sub_epi64( _mm512_xor_si512( M[ 0], H[ 0] ), \ - _mm512_xor_si512( M[ 3], H[ 3] ) ), \ - _mm512_xor_si512( M[ 6], H[ 6] ) ), \ - _mm512_sub_epi64( _mm512_xor_si512( M[ 7], H[ 7] ), \ - _mm512_xor_si512( M[14], H[14] ) ) ) + _mm512_add_epi64( _mm512_sub_epi64( mh[ 0], mh[ 3] ), mh[ 6] ), \ + _mm512_sub_epi64( mh[ 7], mh[14] ) ) #define W8b10 \ _mm512_sub_epi64( \ - _mm512_sub_epi64( \ - _mm512_sub_epi64( _mm512_xor_si512( M[ 8], H[ 8] ), \ - _mm512_xor_si512( M[ 1], H[ 1] ) ), \ - _mm512_xor_si512( M[ 4], H[ 4] ) ), \ - _mm512_sub_epi64( _mm512_xor_si512( M[ 7], H[ 7] ), \ - _mm512_xor_si512( M[15], H[15] ) ) ) + _mm512_sub_epi64( _mm512_sub_epi64( mh[ 8], mh[ 1] ), mh[ 4] ), \ + _mm512_sub_epi64( mh[ 7], mh[15] ) ) #define W8b11 \ _mm512_sub_epi64( \ - _mm512_sub_epi64( \ - _mm512_sub_epi64( _mm512_xor_si512( M[ 8], H[ 8] ), \ - _mm512_xor_si512( M[ 0], H[ 0] ) ), \ - _mm512_xor_si512( M[ 2], H[ 2] ) ), \ - _mm512_sub_epi64( _mm512_xor_si512( M[ 5], H[ 5] ), \ - _mm512_xor_si512( M[ 9], H[ 9] ) ) ) + _mm512_sub_epi64( _mm512_sub_epi64( mh[ 8], mh[ 0] ), mh[ 2] ), \ + _mm512_sub_epi64( mh[ 5], mh[ 9] ) ) #define W8b12 \ _mm512_sub_epi64( \ - _mm512_sub_epi64( \ - _mm512_add_epi64( _mm512_xor_si512( M[ 1], H[ 1] ), \ - _mm512_xor_si512( M[ 3], H[ 3] ) ), \ - _mm512_xor_si512( M[ 6], H[ 6] ) ), \ - _mm512_sub_epi64( _mm512_xor_si512( M[ 9], H[ 9] ), \ - _mm512_xor_si512( M[10], H[10] ) ) ) + _mm512_sub_epi64( _mm512_add_epi64( mh[ 1], mh[ 3] ), mh[ 6] ), \ + _mm512_sub_epi64( mh[ 9], mh[10] ) ) #define W8b13 \ _mm512_add_epi64( \ - _mm512_add_epi64( \ - _mm512_add_epi64( _mm512_xor_si512( M[ 2], H[ 2] ), \ - _mm512_xor_si512( M[ 4], H[ 4] ) ), \ - _mm512_xor_si512( M[ 7], H[ 7] ) ), \ - _mm512_add_epi64( _mm512_xor_si512( M[10], H[10] ), \ - _mm512_xor_si512( M[11], H[11] ) ) ) + _mm512_add_epi64( _mm512_add_epi64( mh[ 2], mh[ 4] ), mh[ 7] ), \ + _mm512_add_epi64( mh[10], mh[11] ) ) #define W8b14 \ _mm512_sub_epi64( \ - _mm512_add_epi64( \ - _mm512_sub_epi64( _mm512_xor_si512( M[ 3], H[ 3] ), \ - _mm512_xor_si512( M[ 5], H[ 5] ) ), \ - _mm512_xor_si512( M[ 8], H[ 8] ) ), \ - _mm512_add_epi64( _mm512_xor_si512( M[11], H[11] ), \ - _mm512_xor_si512( M[12], H[12] ) ) ) + _mm512_add_epi64( _mm512_sub_epi64( mh[ 3], mh[ 5] ), mh[ 8] ), \ + _mm512_add_epi64( mh[11], mh[12] ) ) #define W8b15 \ _mm512_sub_epi64( \ - _mm512_sub_epi64( \ - _mm512_sub_epi64( _mm512_xor_si512( M[12], H[12] ), \ - _mm512_xor_si512( M[ 4], H[4] ) ), \ - _mm512_xor_si512( M[ 6], H[ 6] ) ), \ - _mm512_sub_epi64( _mm512_xor_si512( M[ 9], H[ 9] ), \ - _mm512_xor_si512( M[13], H[13] ) ) ) + _mm512_sub_epi64( _mm512_sub_epi64( mh[12], mh[ 4] ), mh[ 6] ), \ + _mm512_sub_epi64( mh[ 9], mh[13] ) ) void compress_big_8way( const __m512i *M, const __m512i H[16], __m512i dH[16] ) { __m512i qt[32], xl, xh; + __m512i mh[16]; + int i; + + for ( i = 0; i < 16; i++ ) + mh[i] = _mm512_xor_si512( M[i], H[i] ); qt[ 0] = _mm512_add_epi64( s8b0( W8b0 ), H[ 1] ); qt[ 1] = _mm512_add_epi64( s8b1( W8b1 ), H[ 2] 
); @@ -1268,57 +1169,90 @@ void compress_big_8way( const __m512i *M, const __m512i H[16], qt[13] = _mm512_add_epi64( s8b3( W8b13), H[14] ); qt[14] = _mm512_add_epi64( s8b4( W8b14), H[15] ); qt[15] = _mm512_add_epi64( s8b0( W8b15), H[ 0] ); - qt[16] = expand1b8( qt, M, H, 16 ); - qt[17] = expand1b8( qt, M, H, 17 ); - qt[18] = expand2b8( qt, M, H, 18 ); - qt[19] = expand2b8( qt, M, H, 19 ); - qt[20] = expand2b8( qt, M, H, 20 ); - qt[21] = expand2b8( qt, M, H, 21 ); - qt[22] = expand2b8( qt, M, H, 22 ); - qt[23] = expand2b8( qt, M, H, 23 ); - qt[24] = expand2b8( qt, M, H, 24 ); - qt[25] = expand2b8( qt, M, H, 25 ); - qt[26] = expand2b8( qt, M, H, 26 ); - qt[27] = expand2b8( qt, M, H, 27 ); - qt[28] = expand2b8( qt, M, H, 28 ); - qt[29] = expand2b8( qt, M, H, 29 ); - qt[30] = expand2b8( qt, M, H, 30 ); - qt[31] = expand2b8( qt, M, H, 31 ); - - xl = _mm512_xor_si512( - mm512_xor4( qt[16], qt[17], qt[18], qt[19] ), - mm512_xor4( qt[20], qt[21], qt[22], qt[23] ) ); - xh = _mm512_xor_si512( xl, _mm512_xor_si512( - mm512_xor4( qt[24], qt[25], qt[26], qt[27] ), - mm512_xor4( qt[28], qt[29], qt[30], qt[31] ) ) ); + + __m512i mj[16]; + for ( i = 0; i < 16; i++ ) + mj[i] = rol8w_off_64( M, i ); + + qt[16] = add_elt_b8( mj[ 0], mj[ 3], mj[10], H[ 7], + (const __m512i)_mm512_set1_epi64( 16 * 0x0555555555555555ULL ) ); + qt[17] = add_elt_b8( mj[ 1], mj[ 4], mj[11], H[ 8], + (const __m512i)_mm512_set1_epi64( 17 * 0x0555555555555555ULL ) ); + qt[18] = add_elt_b8( mj[ 2], mj[ 5], mj[12], H[ 9], + (const __m512i)_mm512_set1_epi64( 18 * 0x0555555555555555ULL ) ); + qt[19] = add_elt_b8( mj[ 3], mj[ 6], mj[13], H[10], + (const __m512i)_mm512_set1_epi64( 19 * 0x0555555555555555ULL ) ); + qt[20] = add_elt_b8( mj[ 4], mj[ 7], mj[14], H[11], + (const __m512i)_mm512_set1_epi64( 20 * 0x0555555555555555ULL ) ); + qt[21] = add_elt_b8( mj[ 5], mj[ 8], mj[15], H[12], + (const __m512i)_mm512_set1_epi64( 21 * 0x0555555555555555ULL ) ); + qt[22] = add_elt_b8( mj[ 6], mj[ 9], mj[ 0], H[13], + (const __m512i)_mm512_set1_epi64( 22 * 0x0555555555555555ULL ) ); + qt[23] = add_elt_b8( mj[ 7], mj[10], mj[ 1], H[14], + (const __m512i)_mm512_set1_epi64( 23 * 0x0555555555555555ULL ) ); + qt[24] = add_elt_b8( mj[ 8], mj[11], mj[ 2], H[15], + (const __m512i)_mm512_set1_epi64( 24 * 0x0555555555555555ULL ) ); + qt[25] = add_elt_b8( mj[ 9], mj[12], mj[ 3], H[ 0], + (const __m512i)_mm512_set1_epi64( 25 * 0x0555555555555555ULL ) ); + qt[26] = add_elt_b8( mj[10], mj[13], mj[ 4], H[ 1], + (const __m512i)_mm512_set1_epi64( 26 * 0x0555555555555555ULL ) ); + qt[27] = add_elt_b8( mj[11], mj[14], mj[ 5], H[ 2], + (const __m512i)_mm512_set1_epi64( 27 * 0x0555555555555555ULL ) ); + qt[28] = add_elt_b8( mj[12], mj[15], mj[ 6], H[ 3], + (const __m512i)_mm512_set1_epi64( 28 * 0x0555555555555555ULL ) ); + qt[29] = add_elt_b8( mj[13], mj[ 0], mj[ 7], H[ 4], + (const __m512i)_mm512_set1_epi64( 29 * 0x0555555555555555ULL ) ); + qt[30] = add_elt_b8( mj[14], mj[ 1], mj[ 8], H[ 5], + (const __m512i)_mm512_set1_epi64( 30 * 0x0555555555555555ULL ) ); + qt[31] = add_elt_b8( mj[15], mj[ 2], mj[ 9], H[ 6], + (const __m512i)_mm512_set1_epi64( 31 * 0x0555555555555555ULL ) ); + + qt[16] = _mm512_add_epi64( qt[16], expand1_b8( qt, 16 ) ); + qt[17] = _mm512_add_epi64( qt[17], expand1_b8( qt, 17 ) ); + qt[18] = _mm512_add_epi64( qt[18], expand2_b8( qt, 18 ) ); + qt[19] = _mm512_add_epi64( qt[19], expand2_b8( qt, 19 ) ); + qt[20] = _mm512_add_epi64( qt[20], expand2_b8( qt, 20 ) ); + qt[21] = _mm512_add_epi64( qt[21], expand2_b8( qt, 21 ) ); + qt[22] = _mm512_add_epi64( 
qt[22], expand2_b8( qt, 22 ) ); + qt[23] = _mm512_add_epi64( qt[23], expand2_b8( qt, 23 ) ); + qt[24] = _mm512_add_epi64( qt[24], expand2_b8( qt, 24 ) ); + qt[25] = _mm512_add_epi64( qt[25], expand2_b8( qt, 25 ) ); + qt[26] = _mm512_add_epi64( qt[26], expand2_b8( qt, 26 ) ); + qt[27] = _mm512_add_epi64( qt[27], expand2_b8( qt, 27 ) ); + qt[28] = _mm512_add_epi64( qt[28], expand2_b8( qt, 28 ) ); + qt[29] = _mm512_add_epi64( qt[29], expand2_b8( qt, 29 ) ); + qt[30] = _mm512_add_epi64( qt[30], expand2_b8( qt, 30 ) ); + qt[31] = _mm512_add_epi64( qt[31], expand2_b8( qt, 31 ) ); + + xl = mm512_xor3( mm512_xor3( qt[16], qt[17], qt[18] ), + mm512_xor3( qt[19], qt[20], qt[21] ), + _mm512_xor_si512( qt[22], qt[23] ) ); + + xh = mm512_xor3( mm512_xor3( xl, qt[24], qt[25] ), + mm512_xor3( qt[26], qt[27], qt[28] ), + mm512_xor3( qt[29], qt[30], qt[31] ) ); #define DH1L( m, sl, sr, a, b, c ) \ - _mm512_add_epi64( \ - _mm512_xor_si512( M[m], \ - _mm512_xor_si512( _mm512_slli_epi64( xh, sl ), \ - _mm512_srli_epi64( qt[a], sr ) ) ), \ - _mm512_xor_si512( _mm512_xor_si512( xl, qt[b] ), qt[c] ) ) + _mm512_add_epi64( mm512_xor3( M[m], _mm512_slli_epi64( xh, sl ), \ + _mm512_srli_epi64( qt[a], sr ) ), \ + mm512_xor3( xl, qt[b], qt[c] ) ) #define DH1R( m, sl, sr, a, b, c ) \ - _mm512_add_epi64( \ - _mm512_xor_si512( M[m], \ - _mm512_xor_si512( _mm512_srli_epi64( xh, sl ), \ - _mm512_slli_epi64( qt[a], sr ) ) ), \ - _mm512_xor_si512( _mm512_xor_si512( xl, qt[b] ), qt[c] ) ) + _mm512_add_epi64( mm512_xor3( M[m], _mm512_srli_epi64( xh, sl ), \ + _mm512_slli_epi64( qt[a], sr ) ), \ + mm512_xor3( xl, qt[b], qt[c] ) ) #define DH2L( m, rl, sl, h, a, b, c ) \ _mm512_add_epi64( _mm512_add_epi64( \ - mm512_rol_64( dH[h], rl ), \ - _mm512_xor_si512( _mm512_xor_si512( xh, qt[a] ), M[m] )), \ - _mm512_xor_si512( _mm512_slli_epi64( xl, sl ), \ - _mm512_xor_si512( qt[b], qt[c] ) ) ); - + mm512_rol_64( dH[h], rl ), \ + mm512_xor3( xh, qt[a], M[m] ) ), \ + mm512_xor3( _mm512_slli_epi64( xl, sl ), qt[b], qt[c] ) ) + #define DH2R( m, rl, sr, h, a, b, c ) \ _mm512_add_epi64( _mm512_add_epi64( \ - mm512_rol_64( dH[h], rl ), \ - _mm512_xor_si512( _mm512_xor_si512( xh, qt[a] ), M[m] )), \ - _mm512_xor_si512( _mm512_srli_epi64( xl, sr ), \ - _mm512_xor_si512( qt[b], qt[c] ) ) ); + mm512_rol_64( dH[h], rl ), \ + mm512_xor3( xh, qt[a], M[m] ) ), \ + mm512_xor3( _mm512_srli_epi64( xl, sr ), qt[b], qt[c] ) ) dH[ 0] = DH1L( 0, 5, 5, 16, 24, 0 ); diff --git a/algo/cubehash/cube-hash-2way.c b/algo/cubehash/cube-hash-2way.c index 1201b8f2..06f7e095 100644 --- a/algo/cubehash/cube-hash-2way.c +++ b/algo/cubehash/cube-hash-2way.c @@ -98,6 +98,138 @@ static void transform_4way( cube_4way_context *sp ) _mm512_store_si512( (__m512i*)sp->h + 7, x7 ); } +// 8 ways, 4 way parallel double buffered +static void transform_4way_2buf( cube_4way_2buf_context *sp ) +{ + int r; + const int rounds = sp->rounds; + + __m512i x0, x1, x2, x3, x4, x5, x6, x7; + __m512i y0, y1, y2, y3, y4, y5, y6, y7; + __m512i tx0, tx1, ty0, ty1; + + x0 = _mm512_load_si512( (__m512i*)sp->h0 ); + x1 = _mm512_load_si512( (__m512i*)sp->h0 + 1 ); + x2 = _mm512_load_si512( (__m512i*)sp->h0 + 2 ); + x3 = _mm512_load_si512( (__m512i*)sp->h0 + 3 ); + x4 = _mm512_load_si512( (__m512i*)sp->h0 + 4 ); + x5 = _mm512_load_si512( (__m512i*)sp->h0 + 5 ); + x6 = _mm512_load_si512( (__m512i*)sp->h0 + 6 ); + x7 = _mm512_load_si512( (__m512i*)sp->h0 + 7 ); + + y0 = _mm512_load_si512( (__m512i*)sp->h1 ); + y1 = _mm512_load_si512( (__m512i*)sp->h1 + 1 ); + y2 = _mm512_load_si512( (__m512i*)sp->h1 + 2 
); + y3 = _mm512_load_si512( (__m512i*)sp->h1 + 3 ); + y4 = _mm512_load_si512( (__m512i*)sp->h1 + 4 ); + y5 = _mm512_load_si512( (__m512i*)sp->h1 + 5 ); + y6 = _mm512_load_si512( (__m512i*)sp->h1 + 6 ); + y7 = _mm512_load_si512( (__m512i*)sp->h1 + 7 ); + + + for ( r = 0; r < rounds; ++r ) + { + x4 = _mm512_add_epi32( x0, x4 ); + y4 = _mm512_add_epi32( y0, y4 ); + tx0 = x0; + ty0 = y0; + x5 = _mm512_add_epi32( x1, x5 ); + y5 = _mm512_add_epi32( y1, y5 ); + tx1 = x1; + ty1 = y1; + x0 = mm512_rol_32( x2, 7 ); + y0 = mm512_rol_32( y2, 7 ); + x6 = _mm512_add_epi32( x2, x6 ); + y6 = _mm512_add_epi32( y2, y6 ); + x1 = mm512_rol_32( x3, 7 ); + y1 = mm512_rol_32( y3, 7 ); + x7 = _mm512_add_epi32( x3, x7 ); + y7 = _mm512_add_epi32( y3, y7 ); + + + x2 = mm512_rol_32( tx0, 7 ); + y2 = mm512_rol_32( ty0, 7 ); + x0 = _mm512_xor_si512( x0, x4 ); + y0 = _mm512_xor_si512( y0, y4 ); + x4 = mm512_swap128_64( x4 ); + x3 = mm512_rol_32( tx1, 7 ); + y3 = mm512_rol_32( ty1, 7 ); + y4 = mm512_swap128_64( y4 ); + + x1 = _mm512_xor_si512( x1, x5 ); + y1 = _mm512_xor_si512( y1, y5 ); + x5 = mm512_swap128_64( x5 ); + x2 = _mm512_xor_si512( x2, x6 ); + y2 = _mm512_xor_si512( y2, y6 ); + y5 = mm512_swap128_64( y5 ); + x3 = _mm512_xor_si512( x3, x7 ); + y3 = _mm512_xor_si512( y3, y7 ); + + x6 = mm512_swap128_64( x6 ); + x4 = _mm512_add_epi32( x0, x4 ); + y4 = _mm512_add_epi32( y0, y4 ); + y6 = mm512_swap128_64( y6 ); + x5 = _mm512_add_epi32( x1, x5 ); + y5 = _mm512_add_epi32( y1, y5 ); + x7 = mm512_swap128_64( x7 ); + x6 = _mm512_add_epi32( x2, x6 ); + y6 = _mm512_add_epi32( y2, y6 ); + tx0 = x0; + ty0 = y0; + y7 = mm512_swap128_64( y7 ); + tx1 = x2; + ty1 = y2; + x0 = mm512_rol_32( x1, 11 ); + y0 = mm512_rol_32( y1, 11 ); + + x7 = _mm512_add_epi32( x3, x7 ); + y7 = _mm512_add_epi32( y3, y7 ); + + x1 = mm512_rol_32( tx0, 11 ); + y1 = mm512_rol_32( ty0, 11 ); + x0 = _mm512_xor_si512( x0, x4 ); + x4 = mm512_swap64_32( x4 ); + y0 = _mm512_xor_si512( y0, y4 ); + x2 = mm512_rol_32( x3, 11 ); + y4 = mm512_swap64_32( y4 ); + y2 = mm512_rol_32( y3, 11 ); + x1 = _mm512_xor_si512( x1, x5 ); + x5 = mm512_swap64_32( x5 ); + y1 = _mm512_xor_si512( y1, y5 ); + x3 = mm512_rol_32( tx1, 11 ); + y5 = mm512_swap64_32( y5 ); + y3 = mm512_rol_32( ty1, 11 ); + + x2 = _mm512_xor_si512( x2, x6 ); + x6 = mm512_swap64_32( x6 ); + y2 = _mm512_xor_si512( y2, y6 ); + y6 = mm512_swap64_32( y6 ); + x3 = _mm512_xor_si512( x3, x7 ); + x7 = mm512_swap64_32( x7 ); + y3 = _mm512_xor_si512( y3, y7 ); + + y7 = mm512_swap64_32( y7 ); + } + + _mm512_store_si512( (__m512i*)sp->h0, x0 ); + _mm512_store_si512( (__m512i*)sp->h0 + 1, x1 ); + _mm512_store_si512( (__m512i*)sp->h0 + 2, x2 ); + _mm512_store_si512( (__m512i*)sp->h0 + 3, x3 ); + _mm512_store_si512( (__m512i*)sp->h0 + 4, x4 ); + _mm512_store_si512( (__m512i*)sp->h0 + 5, x5 ); + _mm512_store_si512( (__m512i*)sp->h0 + 6, x6 ); + _mm512_store_si512( (__m512i*)sp->h0 + 7, x7 ); + + _mm512_store_si512( (__m512i*)sp->h1, y0 ); + _mm512_store_si512( (__m512i*)sp->h1 + 1, y1 ); + _mm512_store_si512( (__m512i*)sp->h1 + 2, y2 ); + _mm512_store_si512( (__m512i*)sp->h1 + 3, y3 ); + _mm512_store_si512( (__m512i*)sp->h1 + 4, y4 ); + _mm512_store_si512( (__m512i*)sp->h1 + 5, y5 ); + _mm512_store_si512( (__m512i*)sp->h1 + 6, y6 ); + _mm512_store_si512( (__m512i*)sp->h1 + 7, y7 ); +} + int cube_4way_init( cube_4way_context *sp, int hashbitlen, int rounds, int blockbytes ) { @@ -219,6 +351,67 @@ int cube_4way_full( cube_4way_context *sp, void *output, int hashbitlen, return 0; } +int cube_4way_2buf_full( 
cube_4way_2buf_context *sp,
+ void *output0, void *output1, int hashbitlen,
+ const void *data0, const void *data1, size_t size )
+{
+ __m512i *h0 = (__m512i*)sp->h0;
+ __m512i *h1 = (__m512i*)sp->h1;
+ __m128i *iv = (__m128i*)( hashbitlen == 512 ? (__m128i*)IV512
+ : (__m128i*)IV256 );
+ sp->hashlen = hashbitlen/128;
+ sp->blocksize = 32/16;
+ sp->rounds = 16;
+ sp->pos = 0;
+
+ h1[0] = h0[0] = m512_const1_128( iv[0] );
+ h1[1] = h0[1] = m512_const1_128( iv[1] );
+ h1[2] = h0[2] = m512_const1_128( iv[2] );
+ h1[3] = h0[3] = m512_const1_128( iv[3] );
+ h1[4] = h0[4] = m512_const1_128( iv[4] );
+ h1[5] = h0[5] = m512_const1_128( iv[5] );
+ h1[6] = h0[6] = m512_const1_128( iv[6] );
+ h1[7] = h0[7] = m512_const1_128( iv[7] );
+
+ const int len = size >> 4;
+ const __m512i *in0 = (__m512i*)data0;
+ const __m512i *in1 = (__m512i*)data1;
+ __m512i *hash0 = (__m512i*)output0;
+ __m512i *hash1 = (__m512i*)output1;
+ int i;
+
+ for ( i = 0; i < len; i++ )
+ {
+ sp->h0[ sp->pos ] = _mm512_xor_si512( sp->h0[ sp->pos ], in0[i] );
+ sp->h1[ sp->pos ] = _mm512_xor_si512( sp->h1[ sp->pos ], in1[i] );
+ sp->pos++;
+ if ( sp->pos == sp->blocksize )
+ {
+ transform_4way_2buf( sp );
+ sp->pos = 0;
+ }
+ }
+
+ // pos is zero for 64 byte data, 1 for 80 byte data.
+ __m512i tmp = m512_const2_64( 0, 0x0000000000000080 );
+ sp->h0[ sp->pos ] = _mm512_xor_si512( sp->h0[ sp->pos ], tmp );
+ sp->h1[ sp->pos ] = _mm512_xor_si512( sp->h1[ sp->pos ], tmp );
+
+ transform_4way_2buf( sp );
+
+ tmp = m512_const2_64( 0x0000000100000000, 0 );
+ sp->h0[7] = _mm512_xor_si512( sp->h0[7], tmp );
+ sp->h1[7] = _mm512_xor_si512( sp->h1[7], tmp );
+
+ for ( i = 0; i < 10; ++i )
+ transform_4way_2buf( sp );
+
+ memcpy( hash0, sp->h0, sp->hashlen<<6);
+ memcpy( hash1, sp->h1, sp->hashlen<<6);
+
+ return 0;
+}
+
 int cube_4way_update_close( cube_4way_context *sp, void *output,
 const void *data, size_t size )
@@ -259,6 +452,21 @@ int cube_4way_update_close( cube_4way_context *sp, void *output,
 // 2 way 128
+// This isn't expected to be used with AVX512 so the HW rotate instruction
+// is assumed not available.
+// Use double buffering to optimize serial bit rotations. Full double
+// buffering isn't practical because it needs twice as many registers
+// with AVX2 having only half as many as AVX512.
+#define ROL2( out0, out1, in0, in1, c ) \ +{ \ + __m256i t0 = _mm256_slli_epi32( in0, c ); \ + __m256i t1 = _mm256_slli_epi32( in1, c ); \ + out0 = _mm256_srli_epi32( in0, 32-(c) ); \ + out1 = _mm256_srli_epi32( in1, 32-(c) ); \ + out0 = _mm256_or_si256( out0, t0 ); \ + out1 = _mm256_or_si256( out1, t1 ); \ +} + static void transform_2way( cube_2way_context *sp ) { int r; @@ -283,35 +491,31 @@ static void transform_2way( cube_2way_context *sp ) x7 = _mm256_add_epi32( x3, x7 ); y0 = x0; y1 = x1; - x0 = mm256_rol_32( x2, 7 ); - x1 = mm256_rol_32( x3, 7 ); - x2 = mm256_rol_32( y0, 7 ); - x3 = mm256_rol_32( y1, 7 ); + ROL2( x0, x1, x2, x3, 7 ); + ROL2( x2, x3, y0, y1, 7 ); x0 = _mm256_xor_si256( x0, x4 ); + x4 = mm256_swap128_64( x4 ); x1 = _mm256_xor_si256( x1, x5 ); x2 = _mm256_xor_si256( x2, x6 ); - x3 = _mm256_xor_si256( x3, x7 ); - x4 = mm256_swap128_64( x4 ); x5 = mm256_swap128_64( x5 ); - x6 = mm256_swap128_64( x6 ); - x7 = mm256_swap128_64( x7 ); + x3 = _mm256_xor_si256( x3, x7 ); x4 = _mm256_add_epi32( x0, x4 ); + x6 = mm256_swap128_64( x6 ); + y0 = x0; x5 = _mm256_add_epi32( x1, x5 ); + x7 = mm256_swap128_64( x7 ); x6 = _mm256_add_epi32( x2, x6 ); - x7 = _mm256_add_epi32( x3, x7 ); - y0 = x0; y1 = x2; - x0 = mm256_rol_32( x1, 11 ); - x1 = mm256_rol_32( y0, 11 ); - x2 = mm256_rol_32( x3, 11 ); - x3 = mm256_rol_32( y1, 11 ); + ROL2( x0, x1, x1, y0, 11 ); + x7 = _mm256_add_epi32( x3, x7 ); + ROL2( x2, x3, x3, y1, 11 ); x0 = _mm256_xor_si256( x0, x4 ); - x1 = _mm256_xor_si256( x1, x5 ); - x2 = _mm256_xor_si256( x2, x6 ); - x3 = _mm256_xor_si256( x3, x7 ); x4 = mm256_swap64_32( x4 ); + x1 = _mm256_xor_si256( x1, x5 ); x5 = mm256_swap64_32( x5 ); + x2 = _mm256_xor_si256( x2, x6 ); x6 = mm256_swap64_32( x6 ); + x3 = _mm256_xor_si256( x3, x7 ); x7 = mm256_swap64_32( x7 ); } diff --git a/algo/cubehash/cube-hash-2way.h b/algo/cubehash/cube-hash-2way.h index 25df10e8..a31ffde0 100644 --- a/algo/cubehash/cube-hash-2way.h +++ b/algo/cubehash/cube-hash-2way.h @@ -17,41 +17,41 @@ struct _cube_4way_context int pos; } __attribute__ ((aligned (128))); +struct _cube_4way_2buf_context +{ + __m512i h0[8]; + __m512i h1[8]; + int hashlen; + int rounds; + int blocksize; + int pos; +} __attribute__ ((aligned (128))); + + typedef struct _cube_4way_context cube_4way_context; +typedef struct _cube_4way_2buf_context cube_4way_2buf_context; + int cube_4way_init( cube_4way_context* sp, int hashbitlen, int rounds, - int blockbytes ); + int blockbytes ); + int cube_4way_update( cube_4way_context *sp, const void *data, size_t size ); + int cube_4way_close( cube_4way_context *sp, void *output ); + int cube_4way_update_close( cube_4way_context *sp, void *output, const void *data, size_t size ); + int cube_4way_full( cube_4way_context *sp, void *output, int hashbitlen, const void *data, size_t size ); -int cube_4x256_full( cube_4way_context *sp, void *output, int hashbitlen, - const void *data, size_t size ); - -#define cube512_4way_init( sp ) cube_4way_update( sp, 512 ) -#define cube512_4way_update cube_4way_update -#define cube512_4way_update_close cube_4way_update -#define cube512_4way_close cube_4way_update -#define cube512_4way_full( sp, output, data, size ) \ - cube_4way_full( sp, output, 512, data, size ) -#define cube512_4x256_full( sp, output, data, size ) \ - cube_4x256_full( sp, output, 512, data, size ) - -#define cube256_4way_init( sp ) cube_4way_update( sp, 256 ) -#define cube256_4way_update cube_4way_update -#define cube256_4way_update_close cube_4way_update -#define cube256_4way_close cube_4way_update 
-#define cube256_4way_full( sp, output, data, size ) \ - cube_4way_full( sp, output, 256, data, size ) -#define cube256_4x256_full( sp, output, data, size ) \ - cube_4x256_full( sp, output, 256, data, size ) +int cube_4way_2buf_full( cube_4way_2buf_context *sp, + void *output0, void *output1, int hashbitlen, + const void *data0, const void *data1, size_t size ); #endif -// 2x128, 2 way parallel SSE2 +// 2x128, 2 way parallel AVX2 struct _cube_2way_context { diff --git a/algo/cubehash/cubehash_sse2.c b/algo/cubehash/cubehash_sse2.c index c87829db..5ea1b6f6 100644 --- a/algo/cubehash/cubehash_sse2.c +++ b/algo/cubehash/cubehash_sse2.c @@ -31,10 +31,14 @@ static void transform( cubehashParam *sp ) for ( r = 0; r < rounds; ++r ) { x1 = _mm512_add_epi32( x0, x1 ); - x0 = _mm512_xor_si512( mm512_rol_32( mm512_swap_256( x0 ), 7 ), x1 ); - x1 = _mm512_add_epi32( x0, mm512_swap128_64( x1 ) ); - x0 = _mm512_xor_si512( mm512_rol_32( - mm512_swap256_128( x0 ), 11 ), x1 ); + x0 = mm512_swap_256( x0 ); + x0 = mm512_rol_32( x0, 7 ); + x0 = _mm512_xor_si512( x0, x1 ); + x1 = mm512_swap128_64( x1 ); + x1 = _mm512_add_epi32( x0, x1 ); + x0 = mm512_swap256_128( x0 ); + x0 = mm512_rol_32( x0, 11 ); + x0 = _mm512_xor_si512( x0, x1 ); x1 = mm512_swap64_32( x1 ); } diff --git a/algo/echo/aes_ni/hash.c b/algo/echo/aes_ni/hash.c index 55b27c60..ca1688a4 100644 --- a/algo/echo/aes_ni/hash.c +++ b/algo/echo/aes_ni/hash.c @@ -53,10 +53,24 @@ MYALIGN const unsigned int zero[] = {0x00000000, 0x00000000, 0x00000000, 0x000 MYALIGN const unsigned int mul2ipt[] = {0x728efc00, 0x6894e61a, 0x3fc3b14d, 0x25d9ab57, 0xfd5ba600, 0x2a8c71d7, 0x1eb845e3, 0xc96f9234}; +#define ECHO_SUBBYTES4(state, j) \ + state[0][j] = _mm_aesenc_si128(state[0][j], k1);\ + k1 = _mm_add_epi32(k1, M128(const1));\ + state[1][j] = _mm_aesenc_si128(state[1][j], k1);\ + k1 = _mm_add_epi32(k1, M128(const1));\ + state[2][j] = _mm_aesenc_si128(state[2][j], k1);\ + k1 = _mm_add_epi32(k1, M128(const1));\ + state[3][j] = _mm_aesenc_si128(state[3][j], k1);\ + k1 = _mm_add_epi32(k1, M128(const1));\ + state[0][j] = _mm_aesenc_si128(state[0][j], m128_zero ); \ + state[1][j] = _mm_aesenc_si128(state[1][j], m128_zero ); \ + state[2][j] = _mm_aesenc_si128(state[2][j], m128_zero ); \ + state[3][j] = _mm_aesenc_si128(state[3][j], m128_zero ) + #define ECHO_SUBBYTES(state, i, j) \ state[i][j] = _mm_aesenc_si128(state[i][j], k1);\ - state[i][j] = _mm_aesenc_si128(state[i][j], M128(zero));\ - k1 = _mm_add_epi32(k1, M128(const1)) + k1 = _mm_add_epi32(k1, M128(const1));\ + state[i][j] = _mm_aesenc_si128(state[i][j], M128(zero)) #define ECHO_MIXBYTES(state1, state2, j, t1, t2, s2) \ s2 = _mm_add_epi8(state1[0][j], state1[0][j]);\ @@ -73,7 +87,7 @@ MYALIGN const unsigned int mul2ipt[] = {0x728efc00, 0x6894e61a, 0x3fc3b14d, 0x2 t1 = _mm_and_si128(t1, M128(lsbmask));\ t2 = _mm_shuffle_epi8(M128(mul2mask), t1);\ s2 = _mm_xor_si128(s2, t2);\ - state2[0][j] = _mm_xor_si128(state2[0][j], _mm_xor_si128(s2, state1[1][(j + 1) & 3]));\ + state2[0][j] = mm128_xor3(state2[0][j], s2, state1[1][(j + 1) & 3] );\ state2[1][j] = _mm_xor_si128(state2[1][j], s2);\ state2[2][j] = _mm_xor_si128(state2[2][j], state1[1][(j + 1) & 3]);\ state2[3][j] = _mm_xor_si128(state2[3][j], state1[1][(j + 1) & 3]);\ @@ -83,7 +97,7 @@ MYALIGN const unsigned int mul2ipt[] = {0x728efc00, 0x6894e61a, 0x3fc3b14d, 0x2 t2 = _mm_shuffle_epi8(M128(mul2mask), t1);\ s2 = _mm_xor_si128(s2, t2);\ state2[0][j] = _mm_xor_si128(state2[0][j], state1[2][(j + 2) & 3]);\ - state2[1][j] = _mm_xor_si128(state2[1][j], 
_mm_xor_si128(s2, state1[2][(j + 2) & 3]));\ + state2[1][j] = mm128_xor3(state2[1][j], s2, state1[2][(j + 2) & 3] );\ state2[2][j] = _mm_xor_si128(state2[2][j], s2);\ state2[3][j] = _mm_xor_si128(state2[3][j], state1[2][(j + 2) & 3]);\ s2 = _mm_add_epi8(state1[3][(j + 3) & 3], state1[3][(j + 3) & 3]);\ @@ -93,10 +107,29 @@ MYALIGN const unsigned int mul2ipt[] = {0x728efc00, 0x6894e61a, 0x3fc3b14d, 0x2 s2 = _mm_xor_si128(s2, t2);\ state2[0][j] = _mm_xor_si128(state2[0][j], state1[3][(j + 3) & 3]);\ state2[1][j] = _mm_xor_si128(state2[1][j], state1[3][(j + 3) & 3]);\ - state2[2][j] = _mm_xor_si128(state2[2][j], _mm_xor_si128(s2, state1[3][(j + 3) & 3]));\ + state2[2][j] = mm128_xor3(state2[2][j], s2, state1[3][(j + 3) & 3] );\ state2[3][j] = _mm_xor_si128(state2[3][j], s2) +#define ECHO_ROUND_UNROLL2 \ + ECHO_SUBBYTES4(_state, 0);\ + ECHO_SUBBYTES4(_state, 1);\ + ECHO_SUBBYTES4(_state, 2);\ + ECHO_SUBBYTES4(_state, 3);\ + ECHO_MIXBYTES(_state, _state2, 0, t1, t2, s2);\ + ECHO_MIXBYTES(_state, _state2, 1, t1, t2, s2);\ + ECHO_MIXBYTES(_state, _state2, 2, t1, t2, s2);\ + ECHO_MIXBYTES(_state, _state2, 3, t1, t2, s2);\ + ECHO_SUBBYTES4(_state2, 0);\ + ECHO_SUBBYTES4(_state2, 1);\ + ECHO_SUBBYTES4(_state2, 2);\ + ECHO_SUBBYTES4(_state2, 3);\ + ECHO_MIXBYTES(_state2, _state, 0, t1, t2, s2);\ + ECHO_MIXBYTES(_state2, _state, 1, t1, t2, s2);\ + ECHO_MIXBYTES(_state2, _state, 2, t1, t2, s2);\ + ECHO_MIXBYTES(_state2, _state, 3, t1, t2, s2) + +/* #define ECHO_ROUND_UNROLL2 \ ECHO_SUBBYTES(_state, 0, 0);\ ECHO_SUBBYTES(_state, 1, 0);\ @@ -138,7 +171,7 @@ MYALIGN const unsigned int mul2ipt[] = {0x728efc00, 0x6894e61a, 0x3fc3b14d, 0x2 ECHO_MIXBYTES(_state2, _state, 1, t1, t2, s2);\ ECHO_MIXBYTES(_state2, _state, 2, t1, t2, s2);\ ECHO_MIXBYTES(_state2, _state, 3, t1, t2, s2) - +*/ #define SAVESTATE(dst, src)\ diff --git a/algo/echo/echo-hash-4way.c b/algo/echo/echo-hash-4way.c index eb3c41c2..c8e52cae 100644 --- a/algo/echo/echo-hash-4way.c +++ b/algo/echo/echo-hash-4way.c @@ -10,22 +10,27 @@ static const unsigned int mul2ipt[] __attribute__ ((aligned (64))) = 0xfd5ba600, 0x2a8c71d7, 0x1eb845e3, 0xc96f9234 }; */ -// do these need to be reversed? 
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) - -#define mul2mask \ - m512_const2_64( 0, 0x00001b00 ) -//_mm512_set4_epi32( 0, 0, 0, 0x00001b00 ) -// _mm512_set4_epi32( 0x00001b00, 0, 0, 0 ) - -#define lsbmask m512_const1_32( 0x01010101 ) +#define ECHO_SUBBYTES4(state, j) \ + state[0][j] = _mm512_aesenc_epi128( state[0][j], k1 ); \ + k1 = _mm512_add_epi32( k1, one ); \ + state[1][j] = _mm512_aesenc_epi128( state[1][j], k1 ); \ + k1 = _mm512_add_epi32( k1, one ); \ + state[2][j] = _mm512_aesenc_epi128( state[2][j], k1 ); \ + k1 = _mm512_add_epi32( k1, one ); \ + state[3][j] = _mm512_aesenc_epi128( state[3][j], k1 ); \ + k1 = _mm512_add_epi32( k1, one ); \ + state[0][j] = _mm512_aesenc_epi128( state[0][j], m512_zero ); \ + state[1][j] = _mm512_aesenc_epi128( state[1][j], m512_zero ); \ + state[2][j] = _mm512_aesenc_epi128( state[2][j], m512_zero ); \ + state[3][j] = _mm512_aesenc_epi128( state[3][j], m512_zero ) #define ECHO_SUBBYTES( state, i, j ) \ state[i][j] = _mm512_aesenc_epi128( state[i][j], k1 ); \ - state[i][j] = _mm512_aesenc_epi128( state[i][j], m512_zero ); \ - k1 = _mm512_add_epi32( k1, m512_one_128 ); + k1 = _mm512_add_epi32( k1, one ); \ + state[i][j] = _mm512_aesenc_epi128( state[i][j], m512_zero ); #define ECHO_MIXBYTES( state1, state2, j, t1, t2, s2 ) do \ { \ @@ -46,8 +51,7 @@ static const unsigned int mul2ipt[] __attribute__ ((aligned (64))) = t1 = _mm512_and_si512( t1, lsbmask ); \ t2 = _mm512_shuffle_epi8( mul2mask, t1 ); \ s2 = _mm512_xor_si512( s2, t2 );\ - state2[ 0 ][ j ] = _mm512_xor_si512( state2[ 0 ][ j ], \ - _mm512_xor_si512( s2, state1[ 1 ][ j1 ] ) ); \ + state2[ 0 ][ j ] = mm512_xor3( state2[ 0 ][ j ], s2, state1[ 1 ][ j1 ] ); \ state2[ 1 ][ j ] = _mm512_xor_si512( state2[ 1 ][ j ], s2 ); \ state2[ 2 ][ j ] = _mm512_xor_si512( state2[ 2 ][ j ], state1[ 1 ][ j1 ] ); \ state2[ 3 ][ j ] = _mm512_xor_si512( state2[ 3 ][ j ], state1[ 1 ][ j1 ] ); \ @@ -57,8 +61,7 @@ static const unsigned int mul2ipt[] __attribute__ ((aligned (64))) = t2 = _mm512_shuffle_epi8( mul2mask, t1 ); \ s2 = _mm512_xor_si512( s2, t2 ); \ state2[ 0 ][ j ] = _mm512_xor_si512( state2[ 0 ][ j ], state1[ 2 ][ j2 ] ); \ - state2[ 1 ][ j ] = _mm512_xor_si512( state2[ 1 ][ j ], \ - _mm512_xor_si512( s2, state1[ 2 ][ j2 ] ) ); \ + state2[ 1 ][ j ] = mm512_xor3( state2[ 1 ][ j ], s2, state1[ 2 ][ j2 ] ); \ state2[ 2 ][ j ] = _mm512_xor_si512( state2[ 2 ][ j ], s2 ); \ state2[ 3 ][ j ] = _mm512_xor_si512( state2[ 3][ j ], state1[ 2 ][ j2 ] ); \ s2 = _mm512_add_epi8( state1[ 3 ][ j3 ], state1[ 3 ][ j3 ] ); \ @@ -68,11 +71,29 @@ static const unsigned int mul2ipt[] __attribute__ ((aligned (64))) = s2 = _mm512_xor_si512( s2, t2 ); \ state2[ 0 ][ j ] = _mm512_xor_si512( state2[ 0 ][ j ], state1[ 3 ][ j3 ] ); \ state2[ 1 ][ j ] = _mm512_xor_si512( state2[ 1 ][ j ], state1[ 3 ][ j3 ] ); \ - state2[ 2 ][ j ] = _mm512_xor_si512( state2[ 2 ][ j ], \ - _mm512_xor_si512( s2, state1[ 3 ][ j3] ) ); \ + state2[ 2 ][ j ] = mm512_xor3( state2[ 2 ][ j ], s2, state1[ 3 ][ j3] ); \ state2[ 3 ][ j ] = _mm512_xor_si512( state2[ 3 ][ j ], s2 ); \ } while(0) +#define ECHO_ROUND_UNROLL2 \ + ECHO_SUBBYTES4(_state, 0);\ + ECHO_SUBBYTES4(_state, 1);\ + ECHO_SUBBYTES4(_state, 2);\ + ECHO_SUBBYTES4(_state, 3);\ + ECHO_MIXBYTES(_state, _state2, 0, t1, t2, s2);\ + ECHO_MIXBYTES(_state, _state2, 1, t1, t2, s2);\ + ECHO_MIXBYTES(_state, _state2, 2, t1, t2, s2);\ + ECHO_MIXBYTES(_state, _state2, 3, t1, t2, s2);\ + ECHO_SUBBYTES4(_state2, 0);\ + ECHO_SUBBYTES4(_state2, 1);\ + 
ECHO_SUBBYTES4(_state2, 2);\ + ECHO_SUBBYTES4(_state2, 3);\ + ECHO_MIXBYTES(_state2, _state, 0, t1, t2, s2);\ + ECHO_MIXBYTES(_state2, _state, 1, t1, t2, s2);\ + ECHO_MIXBYTES(_state2, _state, 2, t1, t2, s2);\ + ECHO_MIXBYTES(_state2, _state, 3, t1, t2, s2) + +/* #define ECHO_ROUND_UNROLL2 \ ECHO_SUBBYTES(_state, 0, 0);\ ECHO_SUBBYTES(_state, 1, 0);\ @@ -114,6 +135,7 @@ static const unsigned int mul2ipt[] __attribute__ ((aligned (64))) = ECHO_MIXBYTES(_state2, _state, 1, t1, t2, s2);\ ECHO_MIXBYTES(_state2, _state, 2, t1, t2, s2);\ ECHO_MIXBYTES(_state2, _state, 3, t1, t2, s2) +*/ #define SAVESTATE(dst, src)\ dst[0][0] = src[0][0];\ @@ -140,6 +162,9 @@ void echo_4way_compress( echo_4way_context *ctx, const __m512i *pmsg, unsigned int r, b, i, j; __m512i t1, t2, s2, k1; __m512i _state[4][4], _state2[4][4], _statebackup[4][4]; + __m512i one = m512_one_128; + __m512i mul2mask = m512_const2_64( 0, 0x00001b00 ); + __m512i lsbmask = m512_const1_32( 0x01010101 ); _state[ 0 ][ 0 ] = ctx->state[ 0 ][ 0 ]; _state[ 0 ][ 1 ] = ctx->state[ 0 ][ 1 ]; @@ -404,10 +429,24 @@ int echo_4way_full( echo_4way_context *ctx, void *hashval, int nHashSize, #define lsbmask_2way m256_const1_32( 0x01010101 ) +#define ECHO_SUBBYTES4_2WAY( state, j ) \ + state[0][j] = _mm256_aesenc_epi128( state[0][j], k1 ); \ + k1 = _mm256_add_epi32( k1, m256_one_128 ); \ + state[1][j] = _mm256_aesenc_epi128( state[1][j], k1 ); \ + k1 = _mm256_add_epi32( k1, m256_one_128 ); \ + state[2][j] = _mm256_aesenc_epi128( state[2][j], k1 ); \ + k1 = _mm256_add_epi32( k1, m256_one_128 ); \ + state[3][j] = _mm256_aesenc_epi128( state[3][j], k1 ); \ + k1 = _mm256_add_epi32( k1, m256_one_128 ); \ + state[0][j] = _mm256_aesenc_epi128( state[0][j], m256_zero ); \ + state[1][j] = _mm256_aesenc_epi128( state[1][j], m256_zero ); \ + state[2][j] = _mm256_aesenc_epi128( state[2][j], m256_zero ); \ + state[3][j] = _mm256_aesenc_epi128( state[3][j], m256_zero ) + #define ECHO_SUBBYTES_2WAY( state, i, j ) \ state[i][j] = _mm256_aesenc_epi128( state[i][j], k1 ); \ + k1 = _mm256_add_epi32( k1, m256_one_128 ); \ state[i][j] = _mm256_aesenc_epi128( state[i][j], m256_zero ); \ - k1 = _mm256_add_epi32( k1, m256_one_128 ); #define ECHO_MIXBYTES_2WAY( state1, state2, j, t1, t2, s2 ) do \ { \ @@ -455,6 +494,25 @@ int echo_4way_full( echo_4way_context *ctx, void *hashval, int nHashSize, state2[ 3 ][ j ] = _mm256_xor_si256( state2[ 3 ][ j ], s2 ); \ } while(0) +#define ECHO_ROUND_UNROLL2_2WAY \ + ECHO_SUBBYTES4_2WAY(_state, 0);\ + ECHO_SUBBYTES4_2WAY(_state, 1);\ + ECHO_SUBBYTES4_2WAY(_state, 2);\ + ECHO_SUBBYTES4_2WAY(_state, 3);\ + ECHO_MIXBYTES_2WAY(_state, _state2, 0, t1, t2, s2);\ + ECHO_MIXBYTES_2WAY(_state, _state2, 1, t1, t2, s2);\ + ECHO_MIXBYTES_2WAY(_state, _state2, 2, t1, t2, s2);\ + ECHO_MIXBYTES_2WAY(_state, _state2, 3, t1, t2, s2);\ + ECHO_SUBBYTES4_2WAY(_state2, 0);\ + ECHO_SUBBYTES4_2WAY(_state2, 1);\ + ECHO_SUBBYTES4_2WAY(_state2, 2);\ + ECHO_SUBBYTES4_2WAY(_state2, 3);\ + ECHO_MIXBYTES_2WAY(_state2, _state, 0, t1, t2, s2);\ + ECHO_MIXBYTES_2WAY(_state2, _state, 1, t1, t2, s2);\ + ECHO_MIXBYTES_2WAY(_state2, _state, 2, t1, t2, s2);\ + ECHO_MIXBYTES_2WAY(_state2, _state, 3, t1, t2, s2) + +/* #define ECHO_ROUND_UNROLL2_2WAY \ ECHO_SUBBYTES_2WAY(_state, 0, 0);\ ECHO_SUBBYTES_2WAY(_state, 1, 0);\ @@ -496,6 +554,7 @@ int echo_4way_full( echo_4way_context *ctx, void *hashval, int nHashSize, ECHO_MIXBYTES_2WAY(_state2, _state, 1, t1, t2, s2);\ ECHO_MIXBYTES_2WAY(_state2, _state, 2, t1, t2, s2);\ ECHO_MIXBYTES_2WAY(_state2, _state, 3, t1, t2, s2) +*/ #define 
SAVESTATE_2WAY(dst, src)\ dst[0][0] = src[0][0];\ diff --git a/algo/fugue/fugue-aesni.c b/algo/fugue/fugue-aesni.c index 2dd253a7..8f0af139 100644 --- a/algo/fugue/fugue-aesni.c +++ b/algo/fugue/fugue-aesni.c @@ -124,7 +124,16 @@ MYALIGN const unsigned int _IV512[] = { t1 = _mm_shuffle_epi32(s30, _MM_SHUFFLE(3, 3, 0, 3));\ s7 = _mm_xor_si128(s7, t1) +#define PRESUPERMIX(t0, t1, t2, t3, t4)\ + t2 = t0;\ + t3 = _mm_add_epi8(t0, t0);\ + t4 = _mm_add_epi8(t3, t3);\ + t1 = _mm_srli_epi16(t0, 6);\ + t1 = _mm_and_si128(t1, M128(_lsbmask2));\ + t3 = _mm_xor_si128(t3, _mm_shuffle_epi8(M128(_mul2mask), t1));\ + t0 = _mm_xor_si128(t4, _mm_shuffle_epi8(M128(_mul4mask), t1)) +/* #define PRESUPERMIX(x, t1, s1, s2, t2)\ s1 = x;\ s2 = _mm_add_epi8(x, x);\ @@ -133,37 +142,59 @@ MYALIGN const unsigned int _IV512[] = { t1 = _mm_and_si128(t1, M128(_lsbmask2));\ s2 = _mm_xor_si128(s2, _mm_shuffle_epi8(M128(_mul2mask), t1));\ x = _mm_xor_si128(t2, _mm_shuffle_epi8(M128(_mul4mask), t1)) +*/ -#define SUBSTITUTE(r0, _t1, _t2, _t3, _t0)\ +#define SUBSTITUTE(r0, _t2 )\ _t2 = _mm_shuffle_epi8(r0, M128(_inv_shift_rows));\ _t2 = _mm_aesenclast_si128( _t2, m128_zero ) - + +#define SUPERMIX(t0, t1, t2, t3, t4)\ + t2 = t0;\ + t3 = _mm_add_epi8(t0, t0);\ + t4 = _mm_add_epi8(t3, t3);\ + t1 = _mm_srli_epi16(t0, 6);\ + t1 = _mm_and_si128(t1, M128(_lsbmask2));\ + t0 = _mm_xor_si128(t4, _mm_shuffle_epi8(M128(_mul4mask), t1)); \ + t4 = _mm_shuffle_epi8(t2, M128(_supermix1b));\ + t3 = _mm_xor_si128(t3, _mm_shuffle_epi8(M128(_mul2mask), t1));\ + t1 = _mm_shuffle_epi8(t4, M128(_supermix1c));\ + t4 = _mm_xor_si128(t4, t1);\ + t1 = _mm_shuffle_epi8(t4, M128(_supermix1d));\ + t4 = _mm_xor_si128(t4, t1);\ + t1 = _mm_shuffle_epi8(t2, M128(_supermix1a));\ + t2 = mm128_xor3(t2, t3, t0 );\ + t2 = _mm_shuffle_epi8(t2, M128(_supermix7a));\ + t4 = mm128_xor3( t4, t1, t2 ); \ + t2 = _mm_shuffle_epi8(t2, M128(_supermix7b));\ + t3 = _mm_shuffle_epi8(t3, M128(_supermix2a));\ + t1 = _mm_shuffle_epi8(t0, M128(_supermix4a));\ + t0 = _mm_shuffle_epi8(t0, M128(_supermix4b));\ + t4 = mm128_xor3( t4, t2, t1 ); \ + t0 = _mm_xor_si128(t0, t3);\ + t4 = mm128_xor3(t4, t0, _mm_shuffle_epi8(t0, M128(_supermix4c))); + +/* #define SUPERMIX(t0, t1, t2, t3, t4)\ PRESUPERMIX(t0, t1, t2, t3, t4);\ POSTSUPERMIX(t0, t1, t2, t3, t4) - +*/ #define POSTSUPERMIX(t0, t1, t2, t3, t4)\ - t1 = t2;\ - t1 = _mm_shuffle_epi8(t1, M128(_supermix1b));\ + t1 = _mm_shuffle_epi8(t2, M128(_supermix1b));\ t4 = t1;\ t1 = _mm_shuffle_epi8(t1, M128(_supermix1c));\ t4 = _mm_xor_si128(t4, t1);\ - t1 = t4;\ - t1 = _mm_shuffle_epi8(t1, M128(_supermix1d));\ + t1 = _mm_shuffle_epi8(t4, M128(_supermix1d));\ t4 = _mm_xor_si128(t4, t1);\ - t1 = t2;\ - t1 = _mm_shuffle_epi8(t1, M128(_supermix1a));\ + t1 = _mm_shuffle_epi8(t2, M128(_supermix1a));\ t4 = _mm_xor_si128(t4, t1);\ - t2 = _mm_xor_si128(t2, t3);\ - t2 = _mm_xor_si128(t2, t0);\ + t2 = mm128_xor3(t2, t3, t0 );\ t2 = _mm_shuffle_epi8(t2, M128(_supermix7a));\ t4 = _mm_xor_si128(t4, t2);\ t2 = _mm_shuffle_epi8(t2, M128(_supermix7b));\ t4 = _mm_xor_si128(t4, t2);\ t3 = _mm_shuffle_epi8(t3, M128(_supermix2a));\ - t1 = t0;\ - t1 = _mm_shuffle_epi8(t1, M128(_supermix4a));\ + t1 = _mm_shuffle_epi8(t0, M128(_supermix4a));\ t4 = _mm_xor_si128(t4, t1);\ t0 = _mm_shuffle_epi8(t0, M128(_supermix4b));\ t0 = _mm_xor_si128(t0, t3);\ @@ -171,59 +202,55 @@ MYALIGN const unsigned int _IV512[] = { t0 = _mm_shuffle_epi8(t0, M128(_supermix4c));\ t4 = _mm_xor_si128(t4, t0) - #define SUBROUND512_3(r1a, r1b, r1c, r1d, r2a, r2b, r2c, r2d, r3a, r3b, r3c, r3d)\ 
CMIX(r1a, r1b, r1c, r1d, _t0, _t1);\ PACK_S0(r1c, r1a, _t0);\ - SUBSTITUTE(r1c, _t1, _t2, _t3, _t0);\ + SUBSTITUTE(r1c, _t2 );\ SUPERMIX(_t2, _t3, _t0, _t1, r1c);\ _t0 = _mm_shuffle_epi32(r1c, 0x39);\ r2c = _mm_xor_si128(r2c, _t0);\ _t0 = mm128_mask_32( _t0, 8 ); \ r2d = _mm_xor_si128(r2d, _t0);\ UNPACK_S0(r1c, r1a, _t3);\ - SUBSTITUTE(r2c, _t1, _t2, _t3, _t0);\ + SUBSTITUTE(r2c, _t2 );\ SUPERMIX(_t2, _t3, _t0, _t1, r2c);\ _t0 = _mm_shuffle_epi32(r2c, 0x39);\ r3c = _mm_xor_si128(r3c, _t0);\ _t0 = mm128_mask_32( _t0, 8 ); \ r3d = _mm_xor_si128(r3d, _t0);\ UNPACK_S0(r2c, r2a, _t3);\ - SUBSTITUTE(r3c, _t1, _t2, _t3, _t0);\ + SUBSTITUTE(r3c, _t2 );\ SUPERMIX(_t2, _t3, _t0, _t1, r3c);\ UNPACK_S0(r3c, r3a, _t3) - #define SUBROUND512_4(r1a, r1b, r1c, r1d, r2a, r2b, r2c, r2d, r3a, r3b, r3c, r3d, r4a, r4b, r4c, r4d)\ CMIX(r1a, r1b, r1c, r1d, _t0, _t1);\ PACK_S0(r1c, r1a, _t0);\ - SUBSTITUTE(r1c, _t1, _t2, _t3, _t0);\ + SUBSTITUTE( r1c, _t2 );\ SUPERMIX(_t2, _t3, _t0, _t1, r1c);\ _t0 = _mm_shuffle_epi32(r1c, 0x39);\ r2c = _mm_xor_si128(r2c, _t0);\ _t0 = mm128_mask_32( _t0, 8 ); \ r2d = _mm_xor_si128(r2d, _t0);\ UNPACK_S0(r1c, r1a, _t3);\ - SUBSTITUTE(r2c, _t1, _t2, _t3, _t0);\ + SUBSTITUTE(r2c, _t2 );\ SUPERMIX(_t2, _t3, _t0, _t1, r2c);\ _t0 = _mm_shuffle_epi32(r2c, 0x39);\ r3c = _mm_xor_si128(r3c, _t0);\ _t0 = mm128_mask_32( _t0, 8 ); \ r3d = _mm_xor_si128(r3d, _t0);\ UNPACK_S0(r2c, r2a, _t3);\ - SUBSTITUTE(r3c, _t1, _t2, _t3, _t0);\ + SUBSTITUTE( r3c, _t2 );\ SUPERMIX(_t2, _t3, _t0, _t1, r3c);\ _t0 = _mm_shuffle_epi32(r3c, 0x39);\ r4c = _mm_xor_si128(r4c, _t0);\ _t0 = mm128_mask_32( _t0, 8 ); \ r4d = _mm_xor_si128(r4d, _t0);\ UNPACK_S0(r3c, r3a, _t3);\ - SUBSTITUTE(r4c, _t1, _t2, _t3, _t0);\ + SUBSTITUTE( r4c, _t2 );\ SUPERMIX(_t2, _t3, _t0, _t1, r4c);\ UNPACK_S0(r4c, r4a, _t3) - - #define LOADCOLUMN(x, s, a)\ block[0] = col[(base + a + 0) % s];\ block[1] = col[(base + a + 1) % s];\ @@ -247,14 +274,14 @@ void Compress512(hashState_fugue *ctx, const unsigned char *pmsg, unsigned int u case 1: TIX512( pmsg, ctx->state[3], ctx->state[10], ctx->state[4], ctx->state[5], ctx->state[ 6], ctx->state[8], - ctx->state[9], ctx->state[10], _t0, _t1, _t2 ); + ctx->state[9], ctx->state[10], _t0, _t1, _t2 ); - SUBROUND512_4( ctx->state[8], ctx->state[9], ctx->state[7], + SUBROUND512_4( ctx->state[8], ctx->state[9], ctx->state[7], ctx->state[1], ctx->state[7], ctx->state[8], - ctx->state[6], ctx->state[0], ctx->state[6], - ctx->state[7], ctx->state[5], ctx->state[11], - ctx->state[5], ctx->state[6], ctx->state[4], - ctx->state[10] ); + ctx->state[6], ctx->state[0], ctx->state[6], + ctx->state[7], ctx->state[5], ctx->state[11], + ctx->state[5], ctx->state[6], ctx->state[4], + ctx->state[10] ); ctx->base++; pmsg += 4; uBlockCount--; @@ -263,14 +290,14 @@ void Compress512(hashState_fugue *ctx, const unsigned char *pmsg, unsigned int u case 2: TIX512( pmsg, ctx->state[11], ctx->state[6], ctx->state[0], ctx->state[ 1], ctx->state[2], ctx->state[4], - ctx->state[ 5], ctx->state[6], _t0, _t1, _t2); + ctx->state[ 5], ctx->state[6], _t0, _t1, _t2); SUBROUND512_4( ctx->state[4], ctx->state[5], ctx->state[3], ctx->state[9], ctx->state[3], ctx->state[4], - ctx->state[2], ctx->state[8], ctx->state[2], - ctx->state[3], ctx->state[1], ctx->state[7], - ctx->state[1], ctx->state[2], ctx->state[0], - ctx->state[6]); + ctx->state[2], ctx->state[8], ctx->state[2], + ctx->state[3], ctx->state[1], ctx->state[7], + ctx->state[1], ctx->state[2], ctx->state[0], + ctx->state[6]); ctx->base = 0; pmsg += 4; @@ -278,44 +305,42 @@ void 
Compress512(hashState_fugue *ctx, const unsigned char *pmsg, unsigned int u break; } - while( uBlockCount > 0 ) { - TIX512( pmsg, ctx->state[ 7], ctx->state[2], ctx->state[8], ctx->state[9], - ctx->state[10], ctx->state[0], ctx->state[1], ctx->state[2], - _t0, _t1, _t2 ); - SUBROUND512_4( ctx->state[0], ctx->state[1], ctx->state[11], - ctx->state[5], ctx->state[11], ctx->state[0], - ctx->state[10], ctx->state[4], ctx->state[10], - ctx->state[11], ctx->state[9], ctx->state[3], - ctx->state[9], ctx->state[10], ctx->state[8], - ctx->state[2] ); + TIX512( pmsg, ctx->state[ 7],ctx->state[2],ctx->state[8],ctx->state[9], + ctx->state[10],ctx->state[0],ctx->state[1],ctx->state[2], + _t0, _t1, _t2 ); + SUBROUND512_4( ctx->state[0], ctx->state[1],ctx->state[11],ctx->state[5], + ctx->state[11],ctx->state[0],ctx->state[10],ctx->state[4], + ctx->state[10],ctx->state[11],ctx->state[9],ctx->state[3], + ctx->state[9],ctx->state[10],ctx->state[8],ctx->state[2] ); ctx->base++; pmsg += 4; uBlockCount--; if( uBlockCount == 0 ) break; - TIX512( pmsg, ctx->state[3], ctx->state[10], ctx->state[4], ctx->state[5], - ctx->state[6], ctx->state[8], ctx->state[9], ctx->state[10], - _t0, _t1, _t2 ); + TIX512( pmsg, ctx->state[3],ctx->state[10],ctx->state[4],ctx->state[5], + ctx->state[6],ctx->state[8], ctx->state[9],ctx->state[10], + _t0, _t1, _t2 ); - SUBROUND512_4( ctx->state[8], ctx->state[9], ctx->state[7], ctx->state[1], ctx->state[7], ctx->state[8], ctx->state[6], ctx->state[0], - ctx->state[6], ctx->state[7], ctx->state[5], ctx->state[11], - ctx->state[5], ctx->state[6, ctx->state[4], ctx->state[10]); + SUBROUND512_4( ctx->state[8],ctx->state[9],ctx->state[7],ctx->state[1], + ctx->state[7],ctx->state[8],ctx->state[6],ctx->state[0], + ctx->state[6],ctx->state[7],ctx->state[5],ctx->state[11], + ctx->state[5],ctx->state[6],ctx->state[4],ctx->state[10] ); ctx->base++; pmsg += 4; uBlockCount--; if( uBlockCount == 0 ) break; - TIX512( pmsg, ctx->state[11], ctx->state[6], ctx->state[0], ctx->state[1], - ctx->state[2], ctx->state[4], ctx->state[5], ctx->state[6], - _t0, _t1, _t2); - SUBROUND512_4( ctx->state[4], ctx->state[5], ctx->state[3], ctx->state[9], - ctx->state[3], ctx->state[4], ctx->state[2], ctx->state[8], - ctx->state[2], ctx->state[3], ctx->state[1], ctx->state[7], - ctx->state[1], ctx->state[2], ctx->state[0], ctx->state[6]); + TIX512( pmsg, ctx->state[11],ctx->state[6],ctx->state[0],ctx->state[1], + ctx->state[2], ctx->state[4],ctx->state[5],ctx->state[6], + _t0, _t1, _t2); + SUBROUND512_4( ctx->state[4],ctx->state[5],ctx->state[3],ctx->state[9], + ctx->state[3],ctx->state[4],ctx->state[2],ctx->state[8], + ctx->state[2],ctx->state[3],ctx->state[1],ctx->state[7], + ctx->state[1],ctx->state[2],ctx->state[0],ctx->state[6]); ctx->base = 0; pmsg += 4; @@ -326,8 +351,8 @@ void Compress512(hashState_fugue *ctx, const unsigned char *pmsg, unsigned int u void Final512(hashState_fugue *ctx, BitSequence *hashval) { - unsigned int block[4] __attribute__ ((aligned (32))); - unsigned int col[36] __attribute__ ((aligned (16))); + unsigned int block[4] __attribute__ ((aligned (32))); + unsigned int col[36] __attribute__ ((aligned (16))); unsigned int i, base; __m128i r0, _t0, _t1, _t2, _t3; @@ -357,7 +382,7 @@ void Final512(hashState_fugue *ctx, BitSequence *hashval) // SMIX LOADCOLUMN(r0, 36, 0); - SUBSTITUTE(r0, _t1, _t2, _t3, _t0); + SUBSTITUTE(r0, _t2); SUPERMIX(_t2, _t3, _t0, _t1, r0); STORECOLUMN(r0, 36); } @@ -375,7 +400,7 @@ void Final512(hashState_fugue *ctx, BitSequence *hashval) // SMIX LOADCOLUMN(r0, 36, 0); 
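   /* Illustrative only: SUBSTITUTE below applies the AES S-box to each byte of
      r0, leaving the result in _t2. Shuffling by the inverse ShiftRows
      permutation first cancels the ShiftRows step inside AESENCLAST, and the
      zero round key makes the final XOR a no-op, so only SubBytes remains.
      Scalar model, assuming a hypothetical AES S-box table aes_sbox[256]:

         for ( int b = 0; b < 16; b++ )
            ((uint8_t*)&_t2)[b] = aes_sbox[ ((uint8_t*)&r0)[b] ];
   */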
- SUBSTITUTE(r0, _t1, _t2, _t3, _t0); + SUBSTITUTE(r0, _t2); SUPERMIX(_t2, _t3, _t0, _t1, r0); STORECOLUMN(r0, 36); @@ -390,7 +415,7 @@ void Final512(hashState_fugue *ctx, BitSequence *hashval) // SMIX LOADCOLUMN(r0, 36, 0); - SUBSTITUTE(r0, _t1, _t2, _t3, _t0); + SUBSTITUTE(r0, _t2); SUPERMIX(_t2, _t3, _t0, _t1, r0); STORECOLUMN(r0, 36); @@ -405,7 +430,7 @@ void Final512(hashState_fugue *ctx, BitSequence *hashval) // SMIX LOADCOLUMN(r0, 36, 0); - SUBSTITUTE(r0, _t1, _t2, _t3, _t0); + SUBSTITUTE(r0, _t2); SUPERMIX(_t2, _t3, _t0, _t1, r0); STORECOLUMN(r0, 36); @@ -420,7 +445,7 @@ void Final512(hashState_fugue *ctx, BitSequence *hashval) // SMIX LOADCOLUMN(r0, 36, 0); - SUBSTITUTE(r0, _t1, _t2, _t3, _t0); + SUBSTITUTE(r0, _t2); SUPERMIX(_t2, _t3, _t0, _t1, r0); STORECOLUMN(r0, 36); } diff --git a/algo/fugue/fugue-aesni.h b/algo/fugue/fugue-aesni.h index be9806f4..13fd8f87 100644 --- a/algo/fugue/fugue-aesni.h +++ b/algo/fugue/fugue-aesni.h @@ -14,7 +14,11 @@ #ifndef FUGUE_HASH_API_H #define FUGUE_HASH_API_H -#if defined(__AES__) +#if defined(__AES__) + +#if !defined(__SSE4_1__) +#error "Unsupported configuration, AES needs SSE4.1. Compile without AES." +#endif #include "algo/sha/sha3_common.h" #include "simd-utils.h" @@ -33,12 +37,23 @@ typedef struct } hashState_fugue __attribute__ ((aligned (64))); + +// These functions are deprecated, use the lower case macro aliases that use +// the standard interface. This will be cleaned up at a later date. HashReturn fugue512_Init(hashState_fugue *state, int hashbitlen); HashReturn fugue512_Update(hashState_fugue *state, const void *data, DataLength databitlen); HashReturn fugue512_Final(hashState_fugue *state, void *hashval); +#define fugue512_init( state ) \ + fugue512_Init( state, 512 ) +#define fugue512_update( state, data, len ) \ + fugue512_Update( state, data, (len)<<3 ) +#define fugue512_final \ + fugue512_Final + + HashReturn fugue512_full(hashState_fugue *hs, void *hashval, const void *data, DataLength databitlen); #endif // AES diff --git a/algo/groestl/aes_ni/groestl-intr-aes.h b/algo/groestl/aes_ni/groestl-intr-aes.h index e09e8dea..f2d376e9 100644 --- a/algo/groestl/aes_ni/groestl-intr-aes.h +++ b/algo/groestl/aes_ni/groestl-intr-aes.h @@ -67,11 +67,9 @@ static const __m128i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003 }; * xmm[j] will be lost * xmm[k] has to be all 0x1b */ #define MUL2(i, j, k){\ - j = _mm_xor_si128(j, j);\ - j = _mm_cmpgt_epi8(j, i);\ + j = _mm_cmpgt_epi8( m128_zero, i);\ i = _mm_add_epi8(i, i);\ - j = _mm_and_si128(j, k);\ - i = _mm_xor_si128(i, j);\ + i = mm128_xorand(i, j, k );\ } /**/ @@ -93,6 +91,96 @@ static const __m128i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003 }; We almost fit into 16 registers, need only 3 spills to memory. This implementation costs 7.7 c/b giving total speed on SNB: 10.7c/b. K. Matusiewicz, 2011/05/29 */ + +#if defined(__AVX512VL__) + +#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\ + /* t_i = a_i + a_{i+1} */\ + b6 = a0;\ + b7 = a1;\ + a0 = _mm_xor_si128(a0, a1);\ + b0 = a2;\ + a1 = _mm_xor_si128(a1, a2);\ + b1 = a3;\ + TEMP2 = _mm_xor_si128(a2, a3);\ + b2 = a4;\ + a3 = _mm_xor_si128(a3, a4);\ + b3 = a5;\ + a4 = _mm_xor_si128(a4, a5);\ + b4 = a6;\ + a5 = _mm_xor_si128(a5, a6);\ + b5 = a7;\ + a6 = _mm_xor_si128(a6, a7);\ + a7 = _mm_xor_si128(a7, b6);\ + \ + /* build y4 y5 y6 ... 
in regs xmm8, xmm9, xmm10 by adding t_i*/\ + TEMP0 = mm128_xor3( b0, a4, a6 ); \ + /* spill values y_4, y_5 to memory */\ + TEMP1 = mm128_xor3( b1, a5, a7 );\ + b2 = mm128_xor3( b2, a6, a0 ); \ + /* save values t0, t1, t2 to xmm8, xmm9 and memory */\ + b0 = a0;\ + b3 = mm128_xor3( b3, a7, a1 ); \ + b1 = a1;\ + b6 = mm128_xor3( b6, a4, TEMP2 ); \ + b4 = mm128_xor3( b4, a0, TEMP2 ); \ + b7 = mm128_xor3( b7, a5, a3 ); \ + b5 = mm128_xor3( b5, a1, a3 ); \ + \ + /* compute x_i = t_i + t_{i+3} */\ + a0 = _mm_xor_si128(a0, a3);\ + a1 = _mm_xor_si128(a1, a4);\ + a2 = _mm_xor_si128(TEMP2, a5);\ + a3 = _mm_xor_si128(a3, a6);\ + a4 = _mm_xor_si128(a4, a7);\ + a5 = _mm_xor_si128(a5, b0);\ + a6 = _mm_xor_si128(a6, b1);\ + a7 = _mm_xor_si128(a7, TEMP2);\ + \ + /* compute z_i : double x_i using temp xmm8 and 1B xmm9 */\ + /* compute w_i : add y_{i+4} */\ + b1 = m128_const1_64( 0x1b1b1b1b1b1b1b1b );\ + MUL2(a0, b0, b1);\ + a0 = _mm_xor_si128(a0, TEMP0);\ + MUL2(a1, b0, b1);\ + a1 = _mm_xor_si128(a1, TEMP1);\ + MUL2(a2, b0, b1);\ + a2 = _mm_xor_si128(a2, b2);\ + MUL2(a3, b0, b1);\ + a3 = _mm_xor_si128(a3, b3);\ + MUL2(a4, b0, b1);\ + a4 = _mm_xor_si128(a4, b4);\ + MUL2(a5, b0, b1);\ + a5 = _mm_xor_si128(a5, b5);\ + MUL2(a6, b0, b1);\ + a6 = _mm_xor_si128(a6, b6);\ + MUL2(a7, b0, b1);\ + a7 = _mm_xor_si128(a7, b7);\ + \ + /* compute v_i : double w_i */\ + /* add to y_4 y_5 .. v3, v4, ... */\ + MUL2(a0, b0, b1);\ + b5 = _mm_xor_si128(b5, a0);\ + MUL2(a1, b0, b1);\ + b6 = _mm_xor_si128(b6, a1);\ + MUL2(a2, b0, b1);\ + b7 = _mm_xor_si128(b7, a2);\ + MUL2(a5, b0, b1);\ + b2 = _mm_xor_si128(b2, a5);\ + MUL2(a6, b0, b1);\ + b3 = _mm_xor_si128(b3, a6);\ + MUL2(a7, b0, b1);\ + b4 = _mm_xor_si128(b4, a7);\ + MUL2(a3, b0, b1);\ + MUL2(a4, b0, b1);\ + b0 = TEMP0;\ + b1 = TEMP1;\ + b0 = _mm_xor_si128(b0, a3);\ + b1 = _mm_xor_si128(b1, a4);\ +}/*MixBytes*/ + +#else + #define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\ /* t_i = a_i + a_{i+1} */\ b6 = a0;\ @@ -189,6 +277,8 @@ static const __m128i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003 }; b1 = _mm_xor_si128(b1, a4);\ }/*MixBytes*/ +#endif + /* one round * a0-a7 = input rows diff --git a/algo/groestl/aes_ni/groestl256-intr-aes.h b/algo/groestl/aes_ni/groestl256-intr-aes.h index 61c1b7b0..a8e76747 100644 --- a/algo/groestl/aes_ni/groestl256-intr-aes.h +++ b/algo/groestl/aes_ni/groestl256-intr-aes.h @@ -58,11 +58,9 @@ static const __m128i SUBSH_MASK7 = { 0x090c000306080b07, 0x02050f0a0d01040e }; * xmm[j] will be lost * xmm[k] has to be all 0x1b */ #define MUL2(i, j, k){\ - j = _mm_xor_si128(j, j);\ - j = _mm_cmpgt_epi8(j, i);\ + j = _mm_cmpgt_epi8( m128_zero, i);\ i = _mm_add_epi8(i, i);\ - j = _mm_and_si128(j, k);\ - i = _mm_xor_si128(i, j);\ + i = mm128_xorand(i, j, k );\ } /* Yet another implementation of MixBytes. @@ -82,6 +80,96 @@ static const __m128i SUBSH_MASK7 = { 0x090c000306080b07, 0x02050f0a0d01040e }; We almost fit into 16 registers, need only 3 spills to memory. This implementation costs 7.7 c/b giving total speed on SNB: 10.7c/b. K. 
Matusiewicz, 2011/05/29 */ + +#if defined(__AVX512VL__) + +#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\ + /* t_i = a_i + a_{i+1} */\ + b6 = a0;\ + b7 = a1;\ + a0 = _mm_xor_si128(a0, a1);\ + b0 = a2;\ + a1 = _mm_xor_si128(a1, a2);\ + b1 = a3;\ + TEMP2 = _mm_xor_si128(a2, a3);\ + b2 = a4;\ + a3 = _mm_xor_si128(a3, a4);\ + b3 = a5;\ + a4 = _mm_xor_si128(a4, a5);\ + b4 = a6;\ + a5 = _mm_xor_si128(a5, a6);\ + b5 = a7;\ + a6 = _mm_xor_si128(a6, a7);\ + a7 = _mm_xor_si128(a7, b6);\ + \ + /* build y4 y5 y6 ... in regs xmm8, xmm9, xmm10 by adding t_i*/\ + TEMP0 = mm128_xor3( b0, a4, a6 ); \ + /* spill values y_4, y_5 to memory */\ + TEMP1 = mm128_xor3( b1, a5, a7 );\ + b2 = mm128_xor3( b2, a6, a0 ); \ + /* save values t0, t1, t2 to xmm8, xmm9 and memory */\ + b0 = a0;\ + b3 = mm128_xor3( b3, a7, a1 ); \ + b1 = a1;\ + b6 = mm128_xor3( b6, a4, TEMP2 ); \ + b4 = mm128_xor3( b4, a0, TEMP2 ); \ + b7 = mm128_xor3( b7, a5, a3 ); \ + b5 = mm128_xor3( b5, a1, a3 ); \ + \ + /* compute x_i = t_i + t_{i+3} */\ + a0 = _mm_xor_si128(a0, a3);\ + a1 = _mm_xor_si128(a1, a4);\ + a2 = _mm_xor_si128(TEMP2, a5);\ + a3 = _mm_xor_si128(a3, a6);\ + a4 = _mm_xor_si128(a4, a7);\ + a5 = _mm_xor_si128(a5, b0);\ + a6 = _mm_xor_si128(a6, b1);\ + a7 = _mm_xor_si128(a7, TEMP2);\ + \ + /* compute z_i : double x_i using temp xmm8 and 1B xmm9 */\ + /* compute w_i : add y_{i+4} */\ + b1 = m128_const1_64( 0x1b1b1b1b1b1b1b1b );\ + MUL2(a0, b0, b1);\ + a0 = _mm_xor_si128(a0, TEMP0);\ + MUL2(a1, b0, b1);\ + a1 = _mm_xor_si128(a1, TEMP1);\ + MUL2(a2, b0, b1);\ + a2 = _mm_xor_si128(a2, b2);\ + MUL2(a3, b0, b1);\ + a3 = _mm_xor_si128(a3, b3);\ + MUL2(a4, b0, b1);\ + a4 = _mm_xor_si128(a4, b4);\ + MUL2(a5, b0, b1);\ + a5 = _mm_xor_si128(a5, b5);\ + MUL2(a6, b0, b1);\ + a6 = _mm_xor_si128(a6, b6);\ + MUL2(a7, b0, b1);\ + a7 = _mm_xor_si128(a7, b7);\ + \ + /* compute v_i : double w_i */\ + /* add to y_4 y_5 .. v3, v4, ... 
*/\ + MUL2(a0, b0, b1);\ + b5 = _mm_xor_si128(b5, a0);\ + MUL2(a1, b0, b1);\ + b6 = _mm_xor_si128(b6, a1);\ + MUL2(a2, b0, b1);\ + b7 = _mm_xor_si128(b7, a2);\ + MUL2(a5, b0, b1);\ + b2 = _mm_xor_si128(b2, a5);\ + MUL2(a6, b0, b1);\ + b3 = _mm_xor_si128(b3, a6);\ + MUL2(a7, b0, b1);\ + b4 = _mm_xor_si128(b4, a7);\ + MUL2(a3, b0, b1);\ + MUL2(a4, b0, b1);\ + b0 = TEMP0;\ + b1 = TEMP1;\ + b0 = _mm_xor_si128(b0, a3);\ + b1 = _mm_xor_si128(b1, a4);\ +}/*MixBytes*/ + +#else + #define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\ /* t_i = a_i + a_{i+1} */\ b6 = a0;\ @@ -178,6 +266,8 @@ static const __m128i SUBSH_MASK7 = { 0x090c000306080b07, 0x02050f0a0d01040e }; b1 = _mm_xor_si128(b1, a4);\ }/*MixBytes*/ +#endif + /* one round * i = round number * a0-a7 = input rows diff --git a/algo/groestl/aes_ni/hash-groestl.h b/algo/groestl/aes_ni/hash-groestl.h index 595dc3df..b76d8098 100644 --- a/algo/groestl/aes_ni/hash-groestl.h +++ b/algo/groestl/aes_ni/hash-groestl.h @@ -43,7 +43,8 @@ #define ROUNDS (ROUNDS1024) //#endif -#define ROTL64(a,n) ((((a)<<(n))|((a)>>(64-(n))))&li_64(ffffffffffffffff)) +//#define ROTL64(a,n) ((((a)<<(n))|((a)>>(64-(n))))&li_64(ffffffffffffffff)) +#define ROTL64(a,n) rol64( a, n ) #if (PLATFORM_BYTE_ORDER == IS_BIG_ENDIAN) #define EXT_BYTE(var,n) ((u8)((u64)(var) >> (8*(7-(n))))) diff --git a/algo/groestl/aes_ni/hash-groestl256.h b/algo/groestl/aes_ni/hash-groestl256.h index 9410266c..32ce1a5f 100644 --- a/algo/groestl/aes_ni/hash-groestl256.h +++ b/algo/groestl/aes_ni/hash-groestl256.h @@ -63,7 +63,8 @@ typedef crypto_uint64 u64; //#define ROUNDS (ROUNDS1024) //#endif -#define ROTL64(a,n) ((((a)<<(n))|((a)>>(64-(n))))&li_64(ffffffffffffffff)) +//#define ROTL64(a,n) ((((a)<<(n))|((a)>>(64-(n))))&li_64(ffffffffffffffff)) +#define ROTL64(a,n) rol64( a, n ) #if (PLATFORM_BYTE_ORDER == IS_BIG_ENDIAN) #define EXT_BYTE(var,n) ((u8)((u64)(var) >> (8*(7-(n))))) diff --git a/algo/groestl/groestl256-hash-4way.c b/algo/groestl/groestl256-hash-4way.c index dd82a867..adbdf664 100644 --- a/algo/groestl/groestl256-hash-4way.c +++ b/algo/groestl/groestl256-hash-4way.c @@ -51,7 +51,7 @@ int groestl256_4way_full( groestl256_4way_context* ctx, void* output, const int hashlen_m128i = 32 >> 4; // bytes to __m128i const int hash_offset = SIZE256 - hashlen_m128i; int rem = ctx->rem_ptr; - int blocks = len / SIZE256; + uint64_t blocks = len / SIZE256; __m512i* in = (__m512i*)input; int i; @@ -89,21 +89,21 @@ int groestl256_4way_full( groestl256_4way_context* ctx, void* output, if ( i == SIZE256 - 1 ) { // only 1 vector left in buffer, all padding at once - ctx->buffer[i] = m512_const2_64( (uint64_t)blocks << 56, 0x80 ); + ctx->buffer[i] = m512_const2_64( blocks << 56, 0x80 ); } else { // add first padding - ctx->buffer[i] = m512_const4_64( 0, 0x80, 0, 0x80 ); + ctx->buffer[i] = m512_const2_64( 0, 0x80 ); // add zero padding for ( i += 1; i < SIZE256 - 1; i++ ) ctx->buffer[i] = m512_zero; // add length padding, second last byte is zero unless blocks > 255 - ctx->buffer[i] = m512_const2_64( (uint64_t)blocks << 56, 0 ); + ctx->buffer[i] = m512_const2_64( blocks << 56, 0 ); } -// digest final padding block and do output transform + // digest final padding block and do output transform TF512_4way( ctx->chaining, ctx->buffer ); OF512_4way( ctx->chaining ); @@ -122,7 +122,7 @@ int groestl256_4way_update_close( groestl256_4way_context* ctx, void* output, const int hashlen_m128i = ctx->hashlen / 16; // bytes to __m128i const int hash_offset = SIZE256 - hashlen_m128i; int rem = 
ctx->rem_ptr; - int blocks = len / SIZE256; + uint64_t blocks = len / SIZE256; __m512i* in = (__m512i*)input; int i; @@ -146,20 +146,18 @@ int groestl256_4way_update_close( groestl256_4way_context* ctx, void* output, if ( i == SIZE256 - 1 ) { // only 1 vector left in buffer, all padding at once - ctx->buffer[i] = m512_const1_128( _mm_set_epi8( - blocks, blocks>>8,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0x80 ) ); + ctx->buffer[i] = m512_const2_64( blocks << 56, 0x80 ); } else { // add first padding - ctx->buffer[i] = m512_const4_64( 0, 0x80, 0, 0x80 ); + ctx->buffer[i] = m512_const2_64( 0, 0x80 ); // add zero padding for ( i += 1; i < SIZE256 - 1; i++ ) ctx->buffer[i] = m512_zero; // add length padding, second last byte is zero unless blocks > 255 - ctx->buffer[i] = m512_const1_128( _mm_set_epi8( - blocks, blocks>>8, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0 ) ); + ctx->buffer[i] = m512_const2_64( blocks << 56, 0 ); } // digest final padding block and do output transform @@ -209,23 +207,23 @@ int groestl256_2way_full( groestl256_2way_context* ctx, void* output, const int hashlen_m128i = 32 >> 4; // bytes to __m128i const int hash_offset = SIZE256 - hashlen_m128i; int rem = ctx->rem_ptr; - int blocks = len / SIZE256; + uint64_t blocks = len / SIZE256; __m256i* in = (__m256i*)input; int i; - if (ctx->chaining == NULL || ctx->buffer == NULL) - return 1; + if (ctx->chaining == NULL || ctx->buffer == NULL) + return 1; - for ( i = 0; i < SIZE256; i++ ) - { + for ( i = 0; i < SIZE256; i++ ) + { ctx->chaining[i] = m256_zero; ctx->buffer[i] = m256_zero; - } + } - // The only non-zero in the IV is len. It can be hard coded. - ctx->chaining[ 3 ] = m256_const2_64( 0, 0x0100000000000000 ); - ctx->buf_ptr = 0; - ctx->rem_ptr = 0; + // The only non-zero in the IV is len. It can be hard coded. 
+ ctx->chaining[ 3 ] = m256_const2_64( 0, 0x0100000000000000 ); + ctx->buf_ptr = 0; + ctx->rem_ptr = 0; // --- update --- @@ -247,7 +245,7 @@ int groestl256_2way_full( groestl256_2way_context* ctx, void* output, if ( i == SIZE256 - 1 ) { // only 1 vector left in buffer, all padding at once - ctx->buffer[i] = m256_const2_64( (uint64_t)blocks << 56, 0x80 ); + ctx->buffer[i] = m256_const2_64( blocks << 56, 0x80 ); } else { @@ -258,10 +256,10 @@ int groestl256_2way_full( groestl256_2way_context* ctx, void* output, ctx->buffer[i] = m256_zero; // add length padding, second last byte is zero unless blocks > 255 - ctx->buffer[i] = m256_const2_64( (uint64_t)blocks << 56, 0 ); + ctx->buffer[i] = m256_const2_64( blocks << 56, 0 ); } -// digest final padding block and do output transform + // digest final padding block and do output transform TF512_2way( ctx->chaining, ctx->buffer ); OF512_2way( ctx->chaining ); @@ -279,7 +277,7 @@ int groestl256_2way_update_close( groestl256_2way_context* ctx, void* output, const int hashlen_m128i = ctx->hashlen / 16; // bytes to __m128i const int hash_offset = SIZE256 - hashlen_m128i; int rem = ctx->rem_ptr; - int blocks = len / SIZE256; + uint64_t blocks = len / SIZE256; __m256i* in = (__m256i*)input; int i; @@ -303,8 +301,7 @@ int groestl256_2way_update_close( groestl256_2way_context* ctx, void* output, if ( i == SIZE256 - 1 ) { // only 1 vector left in buffer, all padding at once - ctx->buffer[i] = m256_const1_128( _mm_set_epi8( - blocks, blocks>>8,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0x80 ) ); + ctx->buffer[i] = m256_const2_64( blocks << 56, 0x80 ); } else { @@ -315,8 +312,7 @@ int groestl256_2way_update_close( groestl256_2way_context* ctx, void* output, ctx->buffer[i] = m256_zero; // add length padding, second last byte is zero unless blocks > 255 - ctx->buffer[i] = m256_const1_128( _mm_set_epi8( - blocks, blocks>>8, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0 ) ); + ctx->buffer[i] = m256_const2_64( blocks << 56, 0 ); } // digest final padding block and do output transform diff --git a/algo/groestl/groestl256-intr-4way.h b/algo/groestl/groestl256-intr-4way.h index 25d91713..ff62a1c3 100644 --- a/algo/groestl/groestl256-intr-4way.h +++ b/algo/groestl/groestl256-intr-4way.h @@ -96,11 +96,9 @@ static const __m512i SUBSH_MASK7 = { 0x090c000306080b07, 0x02050f0a0d01040e, * xmm[j] will be lost * xmm[k] has to be all 0x1b */ #define MUL2(i, j, k){\ - j = _mm512_xor_si512(j, j);\ - j = _mm512_movm_epi8( _mm512_cmpgt_epi8_mask(j, i) );\ + j = _mm512_movm_epi8( _mm512_cmpgt_epi8_mask( m512_zero, i) );\ i = _mm512_add_epi8(i, i);\ - j = _mm512_and_si512(j, k);\ - i = _mm512_xor_si512(i, j);\ + i = mm512_xorand( i, j, k );\ } /* Yet another implementation of MixBytes. @@ -120,6 +118,95 @@ static const __m512i SUBSH_MASK7 = { 0x090c000306080b07, 0x02050f0a0d01040e, We almost fit into 16 registers, need only 3 spills to memory. This implementation costs 7.7 c/b giving total speed on SNB: 10.7c/b. K. Matusiewicz, 2011/05/29 */ + +#define MixBytes( a0, a1, a2, a3, a4, a5, a6, a7, \ + b0, b1, b2, b3, b4, b5, b6, b7) { \ + /* t_i = a_i + a_{i+1} */\ + b6 = a0; \ + b7 = a1; \ + a0 = _mm512_xor_si512( a0, a1 ); \ + b0 = a2; \ + a1 = _mm512_xor_si512( a1, a2 ); \ + b1 = a3; \ + TEMP2 = _mm512_xor_si512( a2, a3 ); \ + b2 = a4; \ + a3 = _mm512_xor_si512( a3, a4 ); \ + b3 = a5; \ + a4 = _mm512_xor_si512( a4, a5 );\ + b4 = a6; \ + a5 = _mm512_xor_si512( a5, a6 ); \ + b5 = a7; \ + a6 = _mm512_xor_si512( a6, a7 ); \ + a7 = _mm512_xor_si512( a7, b6 ); \ + \ + /* build y4 y5 y6 ... 
in regs xmm8, xmm9, xmm10 by adding t_i*/\ + TEMP0 = mm512_xor3( b0, a4, a6 ); \ + /* spill values y_4, y_5 to memory */\ + TEMP1 = mm512_xor3( b1, a5, a7 ); \ + b2 = mm512_xor3( b2, a6, a0 ); \ + /* save values t0, t1, t2 to xmm8, xmm9 and memory */\ + b0 = a0; \ + b3 = mm512_xor3( b3, a7, a1 ); \ + b1 = a1; \ + b6 = mm512_xor3( b6, a4, TEMP2 ); \ + b4 = mm512_xor3( b4, a0, TEMP2 ); \ + b7 = mm512_xor3( b7, a5, a3 ); \ + b5 = mm512_xor3( b5, a1, a3 ); \ + \ + /* compute x_i = t_i + t_{i+3} */\ + a0 = _mm512_xor_si512( a0, a3 ); \ + a1 = _mm512_xor_si512( a1, a4 ); \ + a2 = _mm512_xor_si512( TEMP2, a5 ); \ + a3 = _mm512_xor_si512( a3, a6 ); \ + a4 = _mm512_xor_si512( a4, a7 ); \ + a5 = _mm512_xor_si512( a5, b0 ); \ + a6 = _mm512_xor_si512( a6, b1 ); \ + a7 = _mm512_xor_si512( a7, TEMP2 ); \ + \ + /* compute z_i : double x_i using temp xmm8 and 1B xmm9 */\ + /* compute w_i : add y_{i+4} */\ + b1 = m512_const1_64( 0x1b1b1b1b1b1b1b1b ); \ + MUL2( a0, b0, b1 ); \ + a0 = _mm512_xor_si512( a0, TEMP0 ); \ + MUL2( a1, b0, b1 ); \ + a1 = _mm512_xor_si512( a1, TEMP1 ); \ + MUL2( a2, b0, b1 ); \ + a2 = _mm512_xor_si512( a2, b2 ); \ + MUL2( a3, b0, b1 ); \ + a3 = _mm512_xor_si512( a3, b3 ); \ + MUL2( a4, b0, b1 ); \ + a4 = _mm512_xor_si512( a4, b4 ); \ + MUL2( a5, b0, b1 ); \ + a5 = _mm512_xor_si512( a5, b5 ); \ + MUL2( a6, b0, b1 ); \ + a6 = _mm512_xor_si512( a6, b6 ); \ + MUL2( a7, b0, b1 ); \ + a7 = _mm512_xor_si512( a7, b7 ); \ + \ + /* compute v_i : double w_i */\ + /* add to y_4 y_5 .. v3, v4, ... */\ + MUL2( a0, b0, b1 ); \ + b5 = _mm512_xor_si512( b5, a0 ); \ + MUL2( a1, b0, b1 ); \ + b6 = _mm512_xor_si512( b6, a1 ); \ + MUL2( a2, b0, b1 ); \ + b7 = _mm512_xor_si512( b7, a2 ); \ + MUL2( a5, b0, b1 ); \ + b2 = _mm512_xor_si512( b2, a5 ); \ + MUL2( a6, b0, b1 ); \ + b3 = _mm512_xor_si512( b3, a6 ); \ + MUL2( a7, b0, b1 ); \ + b4 = _mm512_xor_si512( b4, a7 ); \ + MUL2( a3, b0, b1 ); \ + MUL2( a4, b0, b1 ); \ + b0 = TEMP0;\ + b1 = TEMP1;\ + b0 = _mm512_xor_si512( b0, a3 ); \ + b1 = _mm512_xor_si512( b1, a4 ); \ +}/*MixBytes*/ + + +#if 0 #define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\ /* t_i = a_i + a_{i+1} */\ b6 = a0;\ @@ -215,7 +302,7 @@ static const __m512i SUBSH_MASK7 = { 0x090c000306080b07, 0x02050f0a0d01040e, b0 = _mm512_xor_si512(b0, a3);\ b1 = _mm512_xor_si512(b1, a4);\ }/*MixBytes*/ - +#endif #define ROUND(i, a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\ /* AddRoundConstant */\ diff --git a/algo/groestl/groestl512-hash-4way.c b/algo/groestl/groestl512-hash-4way.c index bff6af53..b7547339 100644 --- a/algo/groestl/groestl512-hash-4way.c +++ b/algo/groestl/groestl512-hash-4way.c @@ -43,7 +43,7 @@ int groestl512_4way_update_close( groestl512_4way_context* ctx, void* output, const int hashlen_m128i = 64 / 16; // bytes to __m128i const int hash_offset = SIZE512 - hashlen_m128i; int rem = ctx->rem_ptr; - int blocks = len / SIZE512; + uint64_t blocks = len / SIZE512; __m512i* in = (__m512i*)input; int i; @@ -64,16 +64,14 @@ int groestl512_4way_update_close( groestl512_4way_context* ctx, void* output, if ( i == SIZE512 - 1 ) { // only 1 vector left in buffer, all padding at once - ctx->buffer[i] = m512_const1_128( _mm_set_epi8( - blocks, blocks>>8,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0x80 ) ); + ctx->buffer[i] = m512_const2_64( blocks << 56, 0x80 ); } else { - ctx->buffer[i] = m512_const4_64( 0, 0x80, 0, 0x80 ); + ctx->buffer[i] = m512_const2_64( 0, 0x80 ); for ( i += 1; i < SIZE512 - 1; i++ ) ctx->buffer[i] = m512_zero; - ctx->buffer[i] = 
m512_const1_128( _mm_set_epi8( - blocks, blocks>>8, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0 ) ); + ctx->buffer[i] = m512_const2_64( blocks << 56, 0 ); } TF1024_4way( ctx->chaining, ctx->buffer ); @@ -124,7 +122,7 @@ int groestl512_4way_full( groestl512_4way_context* ctx, void* output, } else { - ctx->buffer[i] = m512_const4_64( 0, 0x80, 0, 0x80 ); + ctx->buffer[i] = m512_const2_64( 0, 0x80 ); for ( i += 1; i < SIZE512 - 1; i++ ) ctx->buffer[i] = m512_zero; ctx->buffer[i] = m512_const2_64( blocks << 56, 0 ); @@ -168,7 +166,7 @@ int groestl512_2way_update_close( groestl512_2way_context* ctx, void* output, const int hashlen_m128i = 64 / 16; // bytes to __m128i const int hash_offset = SIZE512 - hashlen_m128i; int rem = ctx->rem_ptr; - int blocks = len / SIZE512; + uint64_t blocks = len / SIZE512; __m256i* in = (__m256i*)input; int i; @@ -189,16 +187,14 @@ int groestl512_2way_update_close( groestl512_2way_context* ctx, void* output, if ( i == SIZE512 - 1 ) { // only 1 vector left in buffer, all padding at once - ctx->buffer[i] = m256_const1_128( _mm_set_epi8( - blocks, blocks>>8,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0x80 ) ); + ctx->buffer[i] = m256_const2_64( blocks << 56, 0x80 ); } else { ctx->buffer[i] = m256_const2_64( 0, 0x80 ); for ( i += 1; i < SIZE512 - 1; i++ ) ctx->buffer[i] = m256_zero; - ctx->buffer[i] = m256_const1_128( _mm_set_epi8( - blocks, blocks>>8, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0 ) ); + ctx->buffer[i] = m256_const2_64( blocks << 56, 0 ); } TF1024_2way( ctx->chaining, ctx->buffer ); diff --git a/algo/groestl/groestl512-intr-4way.h b/algo/groestl/groestl512-intr-4way.h index 5d8d7155..354e0187 100644 --- a/algo/groestl/groestl512-intr-4way.h +++ b/algo/groestl/groestl512-intr-4way.h @@ -104,11 +104,9 @@ static const __m512i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003, * xmm[j] will be lost * xmm[k] has to be all 0x1b */ #define MUL2(i, j, k){\ - j = _mm512_xor_si512(j, j);\ - j = _mm512_movm_epi8( _mm512_cmpgt_epi8_mask(j, i) );\ + j = _mm512_movm_epi8( _mm512_cmpgt_epi8_mask( m512_zero, i) );\ i = _mm512_add_epi8(i, i);\ - j = _mm512_and_si512(j, k);\ - i = _mm512_xor_si512(i, j);\ + i = mm512_xorand( i, j, k );\ } /**/ @@ -130,100 +128,90 @@ static const __m512i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003, We almost fit into 16 registers, need only 3 spills to memory. This implementation costs 7.7 c/b giving total speed on SNB: 10.7c/b. K. Matusiewicz, 2011/05/29 */ -#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\ +#define MixBytes( a0, a1, a2, a3, a4, a5, a6, a7, \ + b0, b1, b2, b3, b4, b5, b6, b7) { \ /* t_i = a_i + a_{i+1} */\ - b6 = a0;\ - b7 = a1;\ - a0 = _mm512_xor_si512(a0, a1);\ - b0 = a2;\ - a1 = _mm512_xor_si512(a1, a2);\ - b1 = a3;\ - a2 = _mm512_xor_si512(a2, a3);\ - b2 = a4;\ - a3 = _mm512_xor_si512(a3, a4);\ - b3 = a5;\ - a4 = _mm512_xor_si512(a4, a5);\ - b4 = a6;\ - a5 = _mm512_xor_si512(a5, a6);\ - b5 = a7;\ - a6 = _mm512_xor_si512(a6, a7);\ - a7 = _mm512_xor_si512(a7, b6);\ + b6 = a0; \ + b7 = a1; \ + a0 = _mm512_xor_si512( a0, a1 ); \ + b0 = a2; \ + a1 = _mm512_xor_si512( a1, a2 ); \ + b1 = a3; \ + TEMP2 = _mm512_xor_si512( a2, a3 ); \ + b2 = a4; \ + a3 = _mm512_xor_si512( a3, a4 ); \ + b3 = a5; \ + a4 = _mm512_xor_si512( a4, a5 );\ + b4 = a6; \ + a5 = _mm512_xor_si512( a5, a6 ); \ + b5 = a7; \ + a6 = _mm512_xor_si512( a6, a7 ); \ + a7 = _mm512_xor_si512( a7, b6 ); \ \ /* build y4 y5 y6 ... 
in regs xmm8, xmm9, xmm10 by adding t_i*/\ - b0 = _mm512_xor_si512(b0, a4);\ - b6 = _mm512_xor_si512(b6, a4);\ - b1 = _mm512_xor_si512(b1, a5);\ - b7 = _mm512_xor_si512(b7, a5);\ - b2 = _mm512_xor_si512(b2, a6);\ - b0 = _mm512_xor_si512(b0, a6);\ + TEMP0 = mm512_xor3( b0, a4, a6 ); \ /* spill values y_4, y_5 to memory */\ - TEMP0 = b0;\ - b3 = _mm512_xor_si512(b3, a7);\ - b1 = _mm512_xor_si512(b1, a7);\ - TEMP1 = b1;\ - b4 = _mm512_xor_si512(b4, a0);\ - b2 = _mm512_xor_si512(b2, a0);\ + TEMP1 = mm512_xor3( b1, a5, a7 ); \ + b2 = mm512_xor3( b2, a6, a0 ); \ /* save values t0, t1, t2 to xmm8, xmm9 and memory */\ - b0 = a0;\ - b5 = _mm512_xor_si512(b5, a1);\ - b3 = _mm512_xor_si512(b3, a1);\ - b1 = a1;\ - b6 = _mm512_xor_si512(b6, a2);\ - b4 = _mm512_xor_si512(b4, a2);\ - TEMP2 = a2;\ - b7 = _mm512_xor_si512(b7, a3);\ - b5 = _mm512_xor_si512(b5, a3);\ + b0 = a0; \ + b3 = mm512_xor3( b3, a7, a1 ); \ + b1 = a1; \ + b6 = mm512_xor3( b6, a4, TEMP2 ); \ + b4 = mm512_xor3( b4, a0, TEMP2 ); \ + b7 = mm512_xor3( b7, a5, a3 ); \ + b5 = mm512_xor3( b5, a1, a3 ); \ \ /* compute x_i = t_i + t_{i+3} */\ - a0 = _mm512_xor_si512(a0, a3);\ - a1 = _mm512_xor_si512(a1, a4);\ - a2 = _mm512_xor_si512(a2, a5);\ - a3 = _mm512_xor_si512(a3, a6);\ - a4 = _mm512_xor_si512(a4, a7);\ - a5 = _mm512_xor_si512(a5, b0);\ - a6 = _mm512_xor_si512(a6, b1);\ - a7 = _mm512_xor_si512(a7, TEMP2);\ + a0 = _mm512_xor_si512( a0, a3 ); \ + a1 = _mm512_xor_si512( a1, a4 ); \ + a2 = _mm512_xor_si512( TEMP2, a5 ); \ + a3 = _mm512_xor_si512( a3, a6 ); \ + a4 = _mm512_xor_si512( a4, a7 ); \ + a5 = _mm512_xor_si512( a5, b0 ); \ + a6 = _mm512_xor_si512( a6, b1 ); \ + a7 = _mm512_xor_si512( a7, TEMP2 ); \ \ /* compute z_i : double x_i using temp xmm8 and 1B xmm9 */\ /* compute w_i : add y_{i+4} */\ - b1 = m512_const1_64( 0x1b1b1b1b1b1b1b1b );\ - MUL2(a0, b0, b1);\ - a0 = _mm512_xor_si512(a0, TEMP0);\ - MUL2(a1, b0, b1);\ - a1 = _mm512_xor_si512(a1, TEMP1);\ - MUL2(a2, b0, b1);\ - a2 = _mm512_xor_si512(a2, b2);\ - MUL2(a3, b0, b1);\ - a3 = _mm512_xor_si512(a3, b3);\ - MUL2(a4, b0, b1);\ - a4 = _mm512_xor_si512(a4, b4);\ - MUL2(a5, b0, b1);\ - a5 = _mm512_xor_si512(a5, b5);\ - MUL2(a6, b0, b1);\ - a6 = _mm512_xor_si512(a6, b6);\ - MUL2(a7, b0, b1);\ - a7 = _mm512_xor_si512(a7, b7);\ + b1 = m512_const1_64( 0x1b1b1b1b1b1b1b1b ); \ + MUL2( a0, b0, b1 ); \ + a0 = _mm512_xor_si512( a0, TEMP0 ); \ + MUL2( a1, b0, b1 ); \ + a1 = _mm512_xor_si512( a1, TEMP1 ); \ + MUL2( a2, b0, b1 ); \ + a2 = _mm512_xor_si512( a2, b2 ); \ + MUL2( a3, b0, b1 ); \ + a3 = _mm512_xor_si512( a3, b3 ); \ + MUL2( a4, b0, b1 ); \ + a4 = _mm512_xor_si512( a4, b4 ); \ + MUL2( a5, b0, b1 ); \ + a5 = _mm512_xor_si512( a5, b5 ); \ + MUL2( a6, b0, b1 ); \ + a6 = _mm512_xor_si512( a6, b6 ); \ + MUL2( a7, b0, b1 ); \ + a7 = _mm512_xor_si512( a7, b7 ); \ \ /* compute v_i : double w_i */\ /* add to y_4 y_5 .. v3, v4, ... 
*/\ - MUL2(a0, b0, b1);\ - b5 = _mm512_xor_si512(b5, a0);\ - MUL2(a1, b0, b1);\ - b6 = _mm512_xor_si512(b6, a1);\ - MUL2(a2, b0, b1);\ - b7 = _mm512_xor_si512(b7, a2);\ - MUL2(a5, b0, b1);\ - b2 = _mm512_xor_si512(b2, a5);\ - MUL2(a6, b0, b1);\ - b3 = _mm512_xor_si512(b3, a6);\ - MUL2(a7, b0, b1);\ - b4 = _mm512_xor_si512(b4, a7);\ - MUL2(a3, b0, b1);\ - MUL2(a4, b0, b1);\ + MUL2( a0, b0, b1 ); \ + b5 = _mm512_xor_si512( b5, a0 ); \ + MUL2( a1, b0, b1 ); \ + b6 = _mm512_xor_si512( b6, a1 ); \ + MUL2( a2, b0, b1 ); \ + b7 = _mm512_xor_si512( b7, a2 ); \ + MUL2( a5, b0, b1 ); \ + b2 = _mm512_xor_si512( b2, a5 ); \ + MUL2( a6, b0, b1 ); \ + b3 = _mm512_xor_si512( b3, a6 ); \ + MUL2( a7, b0, b1 ); \ + b4 = _mm512_xor_si512( b4, a7 ); \ + MUL2( a3, b0, b1 ); \ + MUL2( a4, b0, b1 ); \ b0 = TEMP0;\ b1 = TEMP1;\ - b0 = _mm512_xor_si512(b0, a3);\ - b1 = _mm512_xor_si512(b1, a4);\ + b0 = _mm512_xor_si512( b0, a3 ); \ + b1 = _mm512_xor_si512( b1, a4 ); \ }/*MixBytes*/ /* one round @@ -709,11 +697,9 @@ static const __m256i SUBSH_MASK7_2WAY = * xmm[j] will be lost * xmm[k] has to be all 0x1b */ #define MUL2_2WAY(i, j, k){\ - j = _mm256_xor_si256(j, j);\ - j = _mm256_cmpgt_epi8(j, i );\ + j = _mm256_cmpgt_epi8( m256_zero, i );\ i = _mm256_add_epi8(i, i);\ - j = _mm256_and_si256(j, k);\ - i = _mm256_xor_si256(i, j);\ + i = mm256_xorand( i, j, k );\ } #define MixBytes_2way(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\ diff --git a/algo/groestl/myr-groestl.c b/algo/groestl/myr-groestl.c index 5a673034..4f17c644 100644 --- a/algo/groestl/myr-groestl.c +++ b/algo/groestl/myr-groestl.c @@ -11,7 +11,7 @@ #else #include "sph_groestl.h" #endif -#include "algo/sha/sph_sha2.h" +#include "algo/sha/sha256-hash.h" typedef struct { #ifdef __AES__ @@ -19,7 +19,6 @@ typedef struct { #else sph_groestl512_context groestl; #endif - sph_sha256_context sha; } myrgr_ctx_holder; myrgr_ctx_holder myrgr_ctx; @@ -31,7 +30,6 @@ void init_myrgr_ctx() #else sph_groestl512_init( &myrgr_ctx.groestl ); #endif - sph_sha256_init( &myrgr_ctx.sha ); } void myriad_hash(void *output, const void *input) @@ -49,8 +47,7 @@ void myriad_hash(void *output, const void *input) sph_groestl512_close(&ctx.groestl, hash); #endif - sph_sha256( &ctx.sha, hash, 64 ); - sph_sha256_close( &ctx.sha, hash ); + sha256_full( hash, hash, 64 ); memcpy(output, hash, 32); } diff --git a/algo/groestl/myrgr-4way.c b/algo/groestl/myrgr-4way.c index 9fca48bf..c9f558cc 100644 --- a/algo/groestl/myrgr-4way.c +++ b/algo/groestl/myrgr-4way.c @@ -44,6 +44,7 @@ void myriad_8way_hash( void *output, const void *input ) rintrlv_8x64_4x128( vhashA, vhashB, input, 640 ); groestl512_4way_update_close( &ctx.groestl, vhashA, vhashA, 640 ); + memcpy( &ctx.groestl, &myrgr_8way_ctx.groestl, sizeof(groestl512_4way_context) ); groestl512_4way_update_close( &ctx.groestl, vhashB, vhashB, 640 ); uint32_t hash0[20] __attribute__ ((aligned (64))); @@ -58,8 +59,6 @@ void myriad_8way_hash( void *output, const void *input ) // rintrlv_4x128_8x32( vhash, vhashA, vhashB, 512 ); dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA ); dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB ); - intrlv_8x32_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, - hash6, hash7 ); #else @@ -76,27 +75,27 @@ void myriad_8way_hash( void *output, const void *input ) hash4, hash5, hash6, hash7, input, 640 ); update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 640 ); - memcpy( &ctx.groestl, &myrgr_4way_ctx.groestl, sizeof(hashState_groestl) ); + memcpy( &ctx.groestl, 
&myrgr_8way_ctx.groestl, sizeof(hashState_groestl) ); update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 640 ); - memcpy( &ctx.groestl, &myrgr_4way_ctx.groestl, sizeof(hashState_groestl) ); + memcpy( &ctx.groestl, &myrgr_8way_ctx.groestl, sizeof(hashState_groestl) ); update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 640 ); - memcpy( &ctx.groestl, &myrgr_4way_ctx.groestl, sizeof(hashState_groestl) ); + memcpy( &ctx.groestl, &myrgr_8way_ctx.groestl, sizeof(hashState_groestl) ); update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 640 ); - memcpy( &ctx.groestl, &myrgr_4way_ctx.groestl, sizeof(hashState_groestl) ); + memcpy( &ctx.groestl, &myrgr_8way_ctx.groestl, sizeof(hashState_groestl) ); update_and_final_groestl( &ctx.groestl, (char*)hash4, (char*)hash4, 640 ); - memcpy( &ctx.groestl, &myrgr_4way_ctx.groestl, sizeof(hashState_groestl) ); + memcpy( &ctx.groestl, &myrgr_8way_ctx.groestl, sizeof(hashState_groestl) ); update_and_final_groestl( &ctx.groestl, (char*)hash5, (char*)hash5, 640 ); - memcpy( &ctx.groestl, &myrgr_4way_ctx.groestl, sizeof(hashState_groestl) ); + memcpy( &ctx.groestl, &myrgr_8way_ctx.groestl, sizeof(hashState_groestl) ); update_and_final_groestl( &ctx.groestl, (char*)hash6, (char*)hash6, 640 ); - memcpy( &ctx.groestl, &myrgr_4way_ctx.groestl, sizeof(hashState_groestl) ); + memcpy( &ctx.groestl, &myrgr_8way_ctx.groestl, sizeof(hashState_groestl) ); update_and_final_groestl( &ctx.groestl, (char*)hash7, (char*)hash7, 640 ); - memcpy( &ctx.groestl, &myrgr_4way_ctx.groestl, sizeof(hashState_groestl) ); - - intrlv_8x32( vhash, hash0, hash1, hash2, hash3, - hash4, hash5, hash6, hash7, 512 ); + memcpy( &ctx.groestl, &myrgr_8way_ctx.groestl, sizeof(hashState_groestl) ); #endif + intrlv_8x32_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, + hash6, hash7 ); + sha256_8way_update( &ctx.sha, vhash, 64 ); sha256_8way_close( &ctx.sha, output ); } diff --git a/algo/hamsi/hamsi-hash-4way.c b/algo/hamsi/hamsi-hash-4way.c index d86bd42d..38bf0763 100644 --- a/algo/hamsi/hamsi-hash-4way.c +++ b/algo/hamsi/hamsi-hash-4way.c @@ -545,39 +545,33 @@ static const sph_u32 T512[64][16] = { #define sE c7 #define sF m7 - #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) -// Hamsi 8 way +// Hamsi 8 way AVX512 + +// Intel says _mm512_movepi64_mask has (1L/1T) timimg while +// _mm512_cmplt_epi64_mask as (3L/1T) timing, however, when tested hashing X13 +// on i9-9940x cmplt with zero was 3% faster than movepi. 
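// Illustrative only: for any __m512i v the two forms below produce the same
// 8-bit mask, one bit per 64-bit lane, set when that lane's sign (top) bit is
// 1. INPUT_BIG8 below uses the cmplt form on the rotated message word for the
// reason noted above.
//
//    __mmask8 m_mov = _mm512_movepi64_mask( v );               // copy sign bits
//    __mmask8 m_cmp = _mm512_cmplt_epi64_mask( v, m512_zero ); // v < 0 (signed), i.e. sign bit set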
#define INPUT_BIG8 \ do { \ - __m512i db = *buf; \ - const uint64_t *tp = (uint64_t*)&T512[0][0]; \ - m0 = m1 = m2 = m3 = m4 = m5 = m6 = m7 = m512_zero; \ + __m512i db = _mm512_ror_epi64( *buf, 1 ); \ + const __m512i zero = m512_zero; \ + const uint64_t *tp = (const uint64_t*)T512; \ + m0 = m1 = m2 = m3 = m4 = m5 = m6 = m7 = zero; \ for ( int u = 0; u < 64; u++ ) \ { \ - __m512i dm = _mm512_and_si512( db, m512_one_64 ) ; \ - dm = mm512_negate_32( _mm512_or_si512( dm, \ - _mm512_slli_epi64( dm, 32 ) ) ); \ - m0 = _mm512_xor_si512( m0, _mm512_and_si512( dm, \ - m512_const1_64( tp[0] ) ) ); \ - m1 = _mm512_xor_si512( m1, _mm512_and_si512( dm, \ - m512_const1_64( tp[1] ) ) ); \ - m2 = _mm512_xor_si512( m2, _mm512_and_si512( dm, \ - m512_const1_64( tp[2] ) ) ); \ - m3 = _mm512_xor_si512( m3, _mm512_and_si512( dm, \ - m512_const1_64( tp[3] ) ) ); \ - m4 = _mm512_xor_si512( m4, _mm512_and_si512( dm, \ - m512_const1_64( tp[4] ) ) ); \ - m5 = _mm512_xor_si512( m5, _mm512_and_si512( dm, \ - m512_const1_64( tp[5] ) ) ); \ - m6 = _mm512_xor_si512( m6, _mm512_and_si512( dm, \ - m512_const1_64( tp[6] ) ) ); \ - m7 = _mm512_xor_si512( m7, _mm512_and_si512( dm, \ - m512_const1_64( tp[7] ) ) ); \ + const __mmask8 dm = _mm512_cmplt_epi64_mask( db, zero ); \ + m0 = _mm512_mask_xor_epi64( m0, dm, m0, m512_const1_64( tp[0] ) ); \ + m1 = _mm512_mask_xor_epi64( m1, dm, m1, m512_const1_64( tp[1] ) ); \ + m2 = _mm512_mask_xor_epi64( m2, dm, m2, m512_const1_64( tp[2] ) ); \ + m3 = _mm512_mask_xor_epi64( m3, dm, m3, m512_const1_64( tp[3] ) ); \ + m4 = _mm512_mask_xor_epi64( m4, dm, m4, m512_const1_64( tp[4] ) ); \ + m5 = _mm512_mask_xor_epi64( m5, dm, m5, m512_const1_64( tp[5] ) ); \ + m6 = _mm512_mask_xor_epi64( m6, dm, m6, m512_const1_64( tp[6] ) ); \ + m7 = _mm512_mask_xor_epi64( m7, dm, m7, m512_const1_64( tp[7] ) ); \ + db = _mm512_ror_epi64( db, 1 ); \ tp += 8; \ - db = _mm512_srli_epi64( db, 1 ); \ } \ } while (0) @@ -585,20 +579,13 @@ do { \ do { \ __m512i t; \ t = a; \ - a = _mm512_and_si512( a, c ); \ - a = _mm512_xor_si512( a, d ); \ - c = _mm512_xor_si512( c, b ); \ - c = _mm512_xor_si512( c, a ); \ - d = _mm512_or_si512( d, t ); \ - d = _mm512_xor_si512( d, b ); \ + a = mm512_xorand( d, a, c ); \ + c = mm512_xor3( a, b, c ); \ + b = mm512_xoror( b, d, t ); \ t = _mm512_xor_si512( t, c ); \ - b = d; \ - d = _mm512_or_si512( d, t ); \ - d = _mm512_xor_si512( d, a ); \ - a = _mm512_and_si512( a, b ); \ - t = _mm512_xor_si512( t, a ); \ - b = _mm512_xor_si512( b, d ); \ - b = _mm512_xor_si512( b, t ); \ + d = mm512_xoror( a, b, t ); \ + t = mm512_xorand( t, a, b ); \ + b = mm512_xor3( b, d, t ); \ a = c; \ c = b; \ b = d; \ @@ -609,14 +596,12 @@ do { \ do { \ a = mm512_rol_32( a, 13 ); \ c = mm512_rol_32( c, 3 ); \ - b = _mm512_xor_si512( b, _mm512_xor_si512( a, c ) ); \ - d = _mm512_xor_si512( d, _mm512_xor_si512( c, \ - _mm512_slli_epi32( a, 3 ) ) ); \ + b = mm512_xor3( a, b, c ); \ + d = mm512_xor3( d, c, _mm512_slli_epi32( a, 3 ) ); \ b = mm512_rol_32( b, 1 ); \ d = mm512_rol_32( d, 7 ); \ - a = _mm512_xor_si512( a, _mm512_xor_si512( b, d ) ); \ - c = _mm512_xor_si512( c, _mm512_xor_si512( d, \ - _mm512_slli_epi32( b, 7 ) ) ); \ + a = mm512_xor3( a, b, d ); \ + c = mm512_xor3( c, d, _mm512_slli_epi32( b, 7 ) ); \ a = mm512_rol_32( a, 5 ); \ c = mm512_rol_32( c, 22 ); \ } while (0) @@ -626,162 +611,192 @@ do { \ #define READ_STATE_BIG8(sc) \ do { \ - c0 = sc->h[0x0]; \ - c1 = sc->h[0x1]; \ - c2 = sc->h[0x2]; \ - c3 = sc->h[0x3]; \ - c4 = sc->h[0x4]; \ - c5 = sc->h[0x5]; \ - c6 = sc->h[0x6]; \ - c7 = 
sc->h[0x7]; \ + c0 = sc->h[0]; \ + c1 = sc->h[1]; \ + c2 = sc->h[2]; \ + c3 = sc->h[3]; \ + c4 = sc->h[4]; \ + c5 = sc->h[5]; \ + c6 = sc->h[6]; \ + c7 = sc->h[7]; \ } while (0) #define WRITE_STATE_BIG8(sc) \ do { \ - sc->h[0x0] = c0; \ - sc->h[0x1] = c1; \ - sc->h[0x2] = c2; \ - sc->h[0x3] = c3; \ - sc->h[0x4] = c4; \ - sc->h[0x5] = c5; \ - sc->h[0x6] = c6; \ - sc->h[0x7] = c7; \ + sc->h[0] = c0; \ + sc->h[1] = c1; \ + sc->h[2] = c2; \ + sc->h[3] = c3; \ + sc->h[4] = c4; \ + sc->h[5] = c5; \ + sc->h[6] = c6; \ + sc->h[7] = c7; \ } while (0) - -#define ROUND_BIG8(rc, alpha) \ +#define ROUND_BIG8( alpha ) \ do { \ __m512i t0, t1, t2, t3; \ - s0 = _mm512_xor_si512( s0, m512_const1_64( \ - ( (uint64_t)(rc) << 32 ) ^ ( (uint64_t*)(alpha) )[ 0] ) ); \ - s1 = _mm512_xor_si512( s1, m512_const1_64( ( (uint64_t*)(alpha) )[ 1] ) ); \ - s2 = _mm512_xor_si512( s2, m512_const1_64( ( (uint64_t*)(alpha) )[ 2] ) ); \ - s3 = _mm512_xor_si512( s3, m512_const1_64( ( (uint64_t*)(alpha) )[ 3] ) ); \ - s4 = _mm512_xor_si512( s4, m512_const1_64( ( (uint64_t*)(alpha) )[ 4] ) ); \ - s5 = _mm512_xor_si512( s5, m512_const1_64( ( (uint64_t*)(alpha) )[ 5] ) ); \ - s6 = _mm512_xor_si512( s6, m512_const1_64( ( (uint64_t*)(alpha) )[ 6] ) ); \ - s7 = _mm512_xor_si512( s7, m512_const1_64( ( (uint64_t*)(alpha) )[ 7] ) ); \ - s8 = _mm512_xor_si512( s8, m512_const1_64( ( (uint64_t*)(alpha) )[ 8] ) ); \ - s9 = _mm512_xor_si512( s9, m512_const1_64( ( (uint64_t*)(alpha) )[ 9] ) ); \ - sA = _mm512_xor_si512( sA, m512_const1_64( ( (uint64_t*)(alpha) )[10] ) ); \ - sB = _mm512_xor_si512( sB, m512_const1_64( ( (uint64_t*)(alpha) )[11] ) ); \ - sC = _mm512_xor_si512( sC, m512_const1_64( ( (uint64_t*)(alpha) )[12] ) ); \ - sD = _mm512_xor_si512( sD, m512_const1_64( ( (uint64_t*)(alpha) )[13] ) ); \ - sE = _mm512_xor_si512( sE, m512_const1_64( ( (uint64_t*)(alpha) )[14] ) ); \ - sF = _mm512_xor_si512( sF, m512_const1_64( ( (uint64_t*)(alpha) )[15] ) ); \ + s0 = _mm512_xor_si512( s0, alpha[ 0] ); /* m0 */ \ + s1 = _mm512_xor_si512( s1, alpha[ 1] ); /* c0 */ \ + s2 = _mm512_xor_si512( s2, alpha[ 2] ); /* m1 */ \ + s3 = _mm512_xor_si512( s3, alpha[ 3] ); /* c1 */ \ + s4 = _mm512_xor_si512( s4, alpha[ 4] ); /* c2 */ \ + s5 = _mm512_xor_si512( s5, alpha[ 5] ); /* m2 */ \ + s6 = _mm512_xor_si512( s6, alpha[ 6] ); /* c3 */ \ + s7 = _mm512_xor_si512( s7, alpha[ 7] ); /* m3 */ \ + s8 = _mm512_xor_si512( s8, alpha[ 8] ); /* m4 */ \ + s9 = _mm512_xor_si512( s9, alpha[ 9] ); /* c4 */ \ + sA = _mm512_xor_si512( sA, alpha[10] ); /* m5 */ \ + sB = _mm512_xor_si512( sB, alpha[11] ); /* c5 */ \ + sC = _mm512_xor_si512( sC, alpha[12] ); /* c6 */ \ + sD = _mm512_xor_si512( sD, alpha[13] ); /* m6 */ \ + sE = _mm512_xor_si512( sE, alpha[14] ); /* c7 */ \ + sF = _mm512_xor_si512( sF, alpha[15] ); /* m7 */ \ \ - SBOX8( s0, s4, s8, sC ); \ - SBOX8( s1, s5, s9, sD ); \ - SBOX8( s2, s6, sA, sE ); \ - SBOX8( s3, s7, sB, sF ); \ + SBOX8( s0, s4, s8, sC ); /* ( m0, c2, m4, c6 ) */ \ + SBOX8( s1, s5, s9, sD ); /* ( c0, m2, c4, m6 ) */ \ + SBOX8( s2, s6, sA, sE ); /* ( m1, c3, m5, c7 ) */ \ + SBOX8( s3, s7, sB, sF ); /* ( c1, m3, c5, m7 ) */ \ \ - t1 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( s4, 4 ), \ - _mm512_bslli_epi128( s5, 4 ) ); \ - t3 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( sD, 4 ), \ - _mm512_bslli_epi128( sE, 4 ) ); \ + s4 = mm512_swap64_32( s4 ); \ + s5 = mm512_swap64_32( s5 ); \ + sD = mm512_swap64_32( sD ); \ + sE = mm512_swap64_32( sE ); \ + t1 = _mm512_mask_blend_epi32( 0xaaaa, s4, s5 ); \ + t3 = _mm512_mask_blend_epi32( 
0xaaaa, sD, sE ); \ L8( s0, t1, s9, t3 ); \ - s4 = _mm512_mask_blend_epi32( 0xaaaa, s4, _mm512_bslli_epi128( t1, 4 ) ); \ - s5 = _mm512_mask_blend_epi32( 0x5555, s5, _mm512_bsrli_epi128( t1, 4 ) ); \ - sD = _mm512_mask_blend_epi32( 0xaaaa, sD, _mm512_bslli_epi128( t3, 4 ) ); \ - sE = _mm512_mask_blend_epi32( 0x5555, sE, _mm512_bsrli_epi128( t3, 4 ) ); \ + s4 = _mm512_mask_blend_epi32( 0x5555, s4, t1 ); \ + s5 = _mm512_mask_blend_epi32( 0xaaaa, s5, t1 ); \ + sD = _mm512_mask_blend_epi32( 0x5555, sD, t3 ); \ + sE = _mm512_mask_blend_epi32( 0xaaaa, sE, t3 ); \ \ - t1 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( s5, 4 ), \ - _mm512_bslli_epi128( s6, 4 ) ); \ - t3 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( sE, 4 ), \ - _mm512_bslli_epi128( sF, 4 ) ); \ + s6 = mm512_swap64_32( s6 ); \ + sF = mm512_swap64_32( sF ); \ + t1 = _mm512_mask_blend_epi32( 0xaaaa, s5, s6 ); \ + t3 = _mm512_mask_blend_epi32( 0xaaaa, sE, sF ); \ L8( s1, t1, sA, t3 ); \ - s5 = _mm512_mask_blend_epi32( 0xaaaa, s5, _mm512_bslli_epi128( t1, 4 ) ); \ - s6 = _mm512_mask_blend_epi32( 0x5555, s6, _mm512_bsrli_epi128( t1, 4 ) ); \ - sE = _mm512_mask_blend_epi32( 0xaaaa, sE, _mm512_bslli_epi128( t3, 4 ) ); \ - sF = _mm512_mask_blend_epi32( 0x5555, sF, _mm512_bsrli_epi128( t3, 4 ) ); \ + s5 = _mm512_mask_blend_epi32( 0x5555, s5, t1 ); \ + s6 = _mm512_mask_blend_epi32( 0xaaaa, s6, t1 ); \ + sE = _mm512_mask_blend_epi32( 0x5555, sE, t3 ); \ + sF = _mm512_mask_blend_epi32( 0xaaaa, sF, t3 ); \ \ - t1 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( s6, 4 ), \ - _mm512_bslli_epi128( s7, 4 ) ); \ - t3 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( sF, 4 ), \ - _mm512_bslli_epi128( sC, 4 ) ); \ + s7 = mm512_swap64_32( s7 ); \ + sC = mm512_swap64_32( sC ); \ + t1 = _mm512_mask_blend_epi32( 0xaaaa, s6, s7 ); \ + t3 = _mm512_mask_blend_epi32( 0xaaaa, sF, sC ); \ L8( s2, t1, sB, t3 ); \ - s6 = _mm512_mask_blend_epi32( 0xaaaa, s6, _mm512_bslli_epi128( t1, 4 ) ); \ - s7 = _mm512_mask_blend_epi32( 0x5555, s7, _mm512_bsrli_epi128( t1, 4 ) ); \ - sF = _mm512_mask_blend_epi32( 0xaaaa, sF, _mm512_bslli_epi128( t3, 4 ) ); \ - sC = _mm512_mask_blend_epi32( 0x5555, sC, _mm512_bsrli_epi128( t3, 4 ) ); \ + s6 = _mm512_mask_blend_epi32( 0x5555, s6, t1 ); \ + s7 = _mm512_mask_blend_epi32( 0xaaaa, s7, t1 ); \ + sF = _mm512_mask_blend_epi32( 0x5555, sF, t3 ); \ + sC = _mm512_mask_blend_epi32( 0xaaaa, sC, t3 ); \ + s6 = mm512_swap64_32( s6 ); \ + sF = mm512_swap64_32( sF ); \ \ - t1 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( s7, 4 ), \ - _mm512_bslli_epi128( s4, 4 ) ); \ - t3 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( sC, 4 ), \ - _mm512_bslli_epi128( sD, 4 ) ); \ + t1 = _mm512_mask_blend_epi32( 0xaaaa, s7, s4 ); \ + t3 = _mm512_mask_blend_epi32( 0xaaaa, sC, sD ); \ L8( s3, t1, s8, t3 ); \ - s7 = _mm512_mask_blend_epi32( 0xaaaa, s7, _mm512_bslli_epi128( t1, 4 ) ); \ - s4 = _mm512_mask_blend_epi32( 0x5555, s4, _mm512_bsrli_epi128( t1, 4 ) ); \ - sC = _mm512_mask_blend_epi32( 0xaaaa, sC, _mm512_bslli_epi128( t3, 4 ) ); \ - sD = _mm512_mask_blend_epi32( 0x5555, sD, _mm512_bsrli_epi128( t3, 4 ) ); \ + s7 = _mm512_mask_blend_epi32( 0x5555, s7, t1 ); \ + s4 = _mm512_mask_blend_epi32( 0xaaaa, s4, t1 ); \ + sC = _mm512_mask_blend_epi32( 0x5555, sC, t3 ); \ + sD = _mm512_mask_blend_epi32( 0xaaaa, sD, t3 ); \ + s7 = mm512_swap64_32( s7 ); \ + sC = mm512_swap64_32( sC ); \ \ - t0 = _mm512_mask_blend_epi32( 0xaaaa, s0, _mm512_bslli_epi128( s8, 4 ) ); \ + t0 = _mm512_mask_blend_epi32( 0xaaaa, s0, 
mm512_swap64_32( s8 ) ); \ t1 = _mm512_mask_blend_epi32( 0xaaaa, s1, s9 ); \ - t2 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( s2, 4 ), sA ); \ - t3 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( s3, 4 ), \ - _mm512_bslli_epi128( sB, 4 ) ); \ + t2 = _mm512_mask_blend_epi32( 0xaaaa, mm512_swap64_32( s2 ), sA ); \ + t3 = _mm512_mask_blend_epi32( 0x5555, s3, sB ); \ + t3 = mm512_swap64_32( t3 ); \ L8( t0, t1, t2, t3 ); \ + t3 = mm512_swap64_32( t3 ); \ s0 = _mm512_mask_blend_epi32( 0x5555, s0, t0 ); \ - s8 = _mm512_mask_blend_epi32( 0x5555, s8, _mm512_bsrli_epi128( t0, 4 ) ); \ + s8 = _mm512_mask_blend_epi32( 0x5555, s8, mm512_swap64_32( t0 ) ); \ s1 = _mm512_mask_blend_epi32( 0x5555, s1, t1 ); \ s9 = _mm512_mask_blend_epi32( 0xaaaa, s9, t1 ); \ - s2 = _mm512_mask_blend_epi32( 0xaaaa, s2, _mm512_bslli_epi128( t2, 4 ) ); \ + s2 = _mm512_mask_blend_epi32( 0xaaaa, s2, mm512_swap64_32( t2 ) ); \ sA = _mm512_mask_blend_epi32( 0xaaaa, sA, t2 ); \ - s3 = _mm512_mask_blend_epi32( 0xaaaa, s3, _mm512_bslli_epi128( t3, 4 ) ); \ - sB = _mm512_mask_blend_epi32( 0x5555, sB, _mm512_bsrli_epi128( t3, 4 ) ); \ + s3 = _mm512_mask_blend_epi32( 0xaaaa, s3, t3 ); \ + sB = _mm512_mask_blend_epi32( 0x5555, sB, t3 ); \ \ - t0 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( s4, 4 ), sC ); \ - t1 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( s5, 4 ), \ - _mm512_bslli_epi128( sD, 4 ) ); \ - t2 = _mm512_mask_blend_epi32( 0xaaaa, s6, _mm512_bslli_epi128( sE, 4 ) ); \ + t0 = _mm512_mask_blend_epi32( 0xaaaa, s4, sC ); \ + t1 = _mm512_mask_blend_epi32( 0xaaaa, s5, sD ); \ + t2 = _mm512_mask_blend_epi32( 0xaaaa, s6, sE ); \ t3 = _mm512_mask_blend_epi32( 0xaaaa, s7, sF ); \ L8( t0, t1, t2, t3 ); \ - s4 = _mm512_mask_blend_epi32( 0xaaaa, s4, _mm512_bslli_epi128( t0, 4 ) ); \ + s4 = _mm512_mask_blend_epi32( 0x5555, s4, t0 ); \ sC = _mm512_mask_blend_epi32( 0xaaaa, sC, t0 ); \ - s5 = _mm512_mask_blend_epi32( 0xaaaa, s5, _mm512_bslli_epi128( t1, 4 ) ); \ - sD = _mm512_mask_blend_epi32( 0x5555, sD, _mm512_bsrli_epi128( t1, 4 ) ); \ + s5 = _mm512_mask_blend_epi32( 0x5555, s5, t1 ); \ + sD = _mm512_mask_blend_epi32( 0xaaaa, sD, t1 ); \ s6 = _mm512_mask_blend_epi32( 0x5555, s6, t2 ); \ - sE = _mm512_mask_blend_epi32( 0x5555, sE, _mm512_bsrli_epi128( t2, 4 ) ); \ + sE = _mm512_mask_blend_epi32( 0xaaaa, sE, t2 ); \ s7 = _mm512_mask_blend_epi32( 0x5555, s7, t3 ); \ sF = _mm512_mask_blend_epi32( 0xaaaa, sF, t3 ); \ + s4 = mm512_swap64_32( s4 ); \ + s5 = mm512_swap64_32( s5 ); \ + sD = mm512_swap64_32( sD ); \ + sE = mm512_swap64_32( sE ); \ } while (0) #define P_BIG8 \ do { \ - ROUND_BIG8(0, alpha_n); \ - ROUND_BIG8(1, alpha_n); \ - ROUND_BIG8(2, alpha_n); \ - ROUND_BIG8(3, alpha_n); \ - ROUND_BIG8(4, alpha_n); \ - ROUND_BIG8(5, alpha_n); \ + __m512i alpha[16]; \ + const uint64_t A0 = ( (uint64_t*)alpha_n )[0]; \ + for( int i = 0; i < 16; i++ ) \ + alpha[i] = m512_const1_64( ( (uint64_t*)alpha_n )[i] ); \ + ROUND_BIG8( alpha ); \ + alpha[0] = m512_const1_64( (1ULL << 32) ^ A0 ); \ + ROUND_BIG8( alpha ); \ + alpha[0] = m512_const1_64( (2ULL << 32) ^ A0 ); \ + ROUND_BIG8( alpha ); \ + alpha[0] = m512_const1_64( (3ULL << 32) ^ A0 ); \ + ROUND_BIG8( alpha ); \ + alpha[0] = m512_const1_64( (4ULL << 32) ^ A0 ); \ + ROUND_BIG8( alpha ); \ + alpha[0] = m512_const1_64( (5ULL << 32) ^ A0 ); \ + ROUND_BIG8( alpha ); \ } while (0) #define PF_BIG8 \ do { \ - ROUND_BIG8( 0, alpha_f); \ - ROUND_BIG8( 1, alpha_f); \ - ROUND_BIG8( 2, alpha_f); \ - ROUND_BIG8( 3, alpha_f); \ - ROUND_BIG8( 4, alpha_f); \ - ROUND_BIG8( 
5, alpha_f); \ - ROUND_BIG8( 6, alpha_f); \ - ROUND_BIG8( 7, alpha_f); \ - ROUND_BIG8( 8, alpha_f); \ - ROUND_BIG8( 9, alpha_f); \ - ROUND_BIG8(10, alpha_f); \ - ROUND_BIG8(11, alpha_f); \ + __m512i alpha[16]; \ + const uint64_t A0 = ( (uint64_t*)alpha_f )[0]; \ + for( int i = 0; i < 16; i++ ) \ + alpha[i] = m512_const1_64( ( (uint64_t*)alpha_f )[i] ); \ + ROUND_BIG8( alpha ); \ + alpha[0] = m512_const1_64( ( 1ULL << 32) ^ A0 ); \ + ROUND_BIG8( alpha ); \ + alpha[0] = m512_const1_64( ( 2ULL << 32) ^ A0 ); \ + ROUND_BIG8( alpha ); \ + alpha[0] = m512_const1_64( ( 3ULL << 32) ^ A0 ); \ + ROUND_BIG8( alpha ); \ + alpha[0] = m512_const1_64( ( 4ULL << 32) ^ A0 ); \ + ROUND_BIG8( alpha ); \ + alpha[0] = m512_const1_64( ( 5ULL << 32) ^ A0 ); \ + ROUND_BIG8( alpha ); \ + alpha[0] = m512_const1_64( ( 6ULL << 32) ^ A0 ); \ + ROUND_BIG8( alpha ); \ + alpha[0] = m512_const1_64( ( 7ULL << 32) ^ A0 ); \ + ROUND_BIG8( alpha ); \ + alpha[0] = m512_const1_64( ( 8ULL << 32) ^ A0 ); \ + ROUND_BIG8( alpha ); \ + alpha[0] = m512_const1_64( ( 9ULL << 32) ^ A0 ); \ + ROUND_BIG8( alpha ); \ + alpha[0] = m512_const1_64( (10ULL << 32) ^ A0 ); \ + ROUND_BIG8( alpha ); \ + alpha[0] = m512_const1_64( (11ULL << 32) ^ A0 ); \ + ROUND_BIG8( alpha ); \ } while (0) #define T_BIG8 \ do { /* order is important */ \ - c7 = sc->h[ 0x7 ] = _mm512_xor_si512( sc->h[ 0x7 ], sB ); \ - c6 = sc->h[ 0x6 ] = _mm512_xor_si512( sc->h[ 0x6 ], sA ); \ - c5 = sc->h[ 0x5 ] = _mm512_xor_si512( sc->h[ 0x5 ], s9 ); \ - c4 = sc->h[ 0x4 ] = _mm512_xor_si512( sc->h[ 0x4 ], s8 ); \ - c3 = sc->h[ 0x3 ] = _mm512_xor_si512( sc->h[ 0x3 ], s3 ); \ - c2 = sc->h[ 0x2 ] = _mm512_xor_si512( sc->h[ 0x2 ], s2 ); \ - c1 = sc->h[ 0x1 ] = _mm512_xor_si512( sc->h[ 0x1 ], s1 ); \ - c0 = sc->h[ 0x0 ] = _mm512_xor_si512( sc->h[ 0x0 ], s0 ); \ + c7 = sc->h[ 7 ] = _mm512_xor_si512( sc->h[ 7 ], sB ); /* c5 */ \ + c6 = sc->h[ 6 ] = _mm512_xor_si512( sc->h[ 6 ], sA ); /* m5 */ \ + c5 = sc->h[ 5 ] = _mm512_xor_si512( sc->h[ 5 ], s9 ); /* c4 */ \ + c4 = sc->h[ 4 ] = _mm512_xor_si512( sc->h[ 4 ], s8 ); /* m4 */ \ + c3 = sc->h[ 3 ] = _mm512_xor_si512( sc->h[ 3 ], s3 ); /* c1 */ \ + c2 = sc->h[ 2 ] = _mm512_xor_si512( sc->h[ 2 ], s2 ); /* m1 */ \ + c1 = sc->h[ 1 ] = _mm512_xor_si512( sc->h[ 1 ], s1 ); /* c0 */ \ + c0 = sc->h[ 0 ] = _mm512_xor_si512( sc->h[ 0 ], s0 ); /* m0 */ \ } while (0) void hamsi_8way_big( hamsi_8way_big_context *sc, __m512i *buf, size_t num ) @@ -818,7 +833,6 @@ void hamsi_8way_big_final( hamsi_8way_big_context *sc, __m512i *buf ) WRITE_STATE_BIG8( sc ); } - void hamsi512_8way_init( hamsi_8way_big_context *sc ) { sc->partial_len = 0; @@ -849,13 +863,11 @@ void hamsi512_8way_update( hamsi_8way_big_context *sc, const void *data, void hamsi512_8way_close( hamsi_8way_big_context *sc, void *dst ) { __m512i pad[1]; - int ch, cl; + uint32_t ch, cl; sph_enc32be( &ch, sc->count_high ); sph_enc32be( &cl, sc->count_low + ( sc->partial_len << 3 ) ); - pad[0] = _mm512_set_epi32( cl, ch, cl, ch, cl, ch, cl, ch, - cl, ch, cl, ch, cl, ch, cl, ch ); -// pad[0] = m512_const2_32( cl, ch ); + pad[0] = _mm512_set1_epi64( ((uint64_t)cl << 32 ) | (uint64_t)ch ); sc->buf[0] = m512_const1_64( 0x80 ); hamsi_8way_big( sc, sc->buf, 1 ); hamsi_8way_big_final( sc, pad ); @@ -863,22 +875,19 @@ void hamsi512_8way_close( hamsi_8way_big_context *sc, void *dst ) mm512_block_bswap_32( (__m512i*)dst, sc->h ); } - #endif // AVX512 - -// Hamsi 4 way +// Hamsi 4 way AVX2 #define INPUT_BIG \ do { \ __m256i db = *buf; \ - const uint64_t *tp = (uint64_t*)&T512[0][0]; \ - m0 = m1 = m2 = m3 = m4 = 
m5 = m6 = m7 = m256_zero; \ - for ( int u = 0; u < 64; u++ ) \ + const __m256i zero = m256_zero; \ + const uint64_t *tp = (const uint64_t*)T512; \ + m0 = m1 = m2 = m3 = m4 = m5 = m6 = m7 = zero; \ + for ( int u = 63; u >= 0; u-- ) \ { \ - __m256i dm = _mm256_and_si256( db, m256_one_64 ) ; \ - dm = mm256_negate_32( _mm256_or_si256( dm, \ - _mm256_slli_epi64( dm, 32 ) ) ); \ + __m256i dm = _mm256_cmpgt_epi64( zero, _mm256_slli_epi64( db, u ) ); \ m0 = _mm256_xor_si256( m0, _mm256_and_si256( dm, \ m256_const1_64( tp[0] ) ) ); \ m1 = _mm256_xor_si256( m1, _mm256_and_si256( dm, \ @@ -896,7 +905,6 @@ do { \ m7 = _mm256_xor_si256( m7, _mm256_and_si256( dm, \ m256_const1_64( tp[7] ) ) ); \ tp += 8; \ - db = _mm256_srli_epi64( db, 1 ); \ } \ } while (0) @@ -945,180 +953,192 @@ do { \ #define READ_STATE_BIG(sc) \ do { \ - c0 = sc->h[0x0]; \ - c1 = sc->h[0x1]; \ - c2 = sc->h[0x2]; \ - c3 = sc->h[0x3]; \ - c4 = sc->h[0x4]; \ - c5 = sc->h[0x5]; \ - c6 = sc->h[0x6]; \ - c7 = sc->h[0x7]; \ + c0 = sc->h[0]; \ + c1 = sc->h[1]; \ + c2 = sc->h[2]; \ + c3 = sc->h[3]; \ + c4 = sc->h[4]; \ + c5 = sc->h[5]; \ + c6 = sc->h[6]; \ + c7 = sc->h[7]; \ } while (0) #define WRITE_STATE_BIG(sc) \ do { \ - sc->h[0x0] = c0; \ - sc->h[0x1] = c1; \ - sc->h[0x2] = c2; \ - sc->h[0x3] = c3; \ - sc->h[0x4] = c4; \ - sc->h[0x5] = c5; \ - sc->h[0x6] = c6; \ - sc->h[0x7] = c7; \ + sc->h[0] = c0; \ + sc->h[1] = c1; \ + sc->h[2] = c2; \ + sc->h[3] = c3; \ + sc->h[4] = c4; \ + sc->h[5] = c5; \ + sc->h[6] = c6; \ + sc->h[7] = c7; \ } while (0) -/* -#define s0 m0 -#define s1 c0 -#define s2 m1 -#define s3 c1 -#define s4 c2 -#define s5 m2 -#define s6 c3 -#define s7 m3 -#define s8 m4 -#define s9 c4 -#define sA m5 -#define sB c5 -#define sC c6 -#define sD m6 -#define sE c7 -#define sF m7 -*/ - -#define ROUND_BIG(rc, alpha) \ +#define ROUND_BIG( alpha ) \ do { \ __m256i t0, t1, t2, t3; \ - s0 = _mm256_xor_si256( s0, m256_const1_64( \ - ( (uint64_t)(rc) << 32 ) ^ ( (uint64_t*)(alpha) )[ 0] ) ); \ - s1 = _mm256_xor_si256( s1, m256_const1_64( ( (uint64_t*)(alpha) )[ 1] ) ); \ - s2 = _mm256_xor_si256( s2, m256_const1_64( ( (uint64_t*)(alpha) )[ 2] ) ); \ - s3 = _mm256_xor_si256( s3, m256_const1_64( ( (uint64_t*)(alpha) )[ 3] ) ); \ - s4 = _mm256_xor_si256( s4, m256_const1_64( ( (uint64_t*)(alpha) )[ 4] ) ); \ - s5 = _mm256_xor_si256( s5, m256_const1_64( ( (uint64_t*)(alpha) )[ 5] ) ); \ - s6 = _mm256_xor_si256( s6, m256_const1_64( ( (uint64_t*)(alpha) )[ 6] ) ); \ - s7 = _mm256_xor_si256( s7, m256_const1_64( ( (uint64_t*)(alpha) )[ 7] ) ); \ - s8 = _mm256_xor_si256( s8, m256_const1_64( ( (uint64_t*)(alpha) )[ 8] ) ); \ - s9 = _mm256_xor_si256( s9, m256_const1_64( ( (uint64_t*)(alpha) )[ 9] ) ); \ - sA = _mm256_xor_si256( sA, m256_const1_64( ( (uint64_t*)(alpha) )[10] ) ); \ - sB = _mm256_xor_si256( sB, m256_const1_64( ( (uint64_t*)(alpha) )[11] ) ); \ - sC = _mm256_xor_si256( sC, m256_const1_64( ( (uint64_t*)(alpha) )[12] ) ); \ - sD = _mm256_xor_si256( sD, m256_const1_64( ( (uint64_t*)(alpha) )[13] ) ); \ - sE = _mm256_xor_si256( sE, m256_const1_64( ( (uint64_t*)(alpha) )[14] ) ); \ - sF = _mm256_xor_si256( sF, m256_const1_64( ( (uint64_t*)(alpha) )[15] ) ); \ + s0 = _mm256_xor_si256( s0, alpha[ 0] ); \ + s1 = _mm256_xor_si256( s1, alpha[ 1] ); \ + s2 = _mm256_xor_si256( s2, alpha[ 2] ); \ + s3 = _mm256_xor_si256( s3, alpha[ 3] ); \ + s4 = _mm256_xor_si256( s4, alpha[ 4] ); \ + s5 = _mm256_xor_si256( s5, alpha[ 5] ); \ + s6 = _mm256_xor_si256( s6, alpha[ 6] ); \ + s7 = _mm256_xor_si256( s7, alpha[ 7] ); \ + s8 = _mm256_xor_si256( s8, alpha[ 8] 
); \ + s9 = _mm256_xor_si256( s9, alpha[ 9] ); \ + sA = _mm256_xor_si256( sA, alpha[10] ); \ + sB = _mm256_xor_si256( sB, alpha[11] ); \ + sC = _mm256_xor_si256( sC, alpha[12] ); \ + sD = _mm256_xor_si256( sD, alpha[13] ); \ + sE = _mm256_xor_si256( sE, alpha[14] ); \ + sF = _mm256_xor_si256( sF, alpha[15] ); \ \ SBOX( s0, s4, s8, sC ); \ SBOX( s1, s5, s9, sD ); \ SBOX( s2, s6, sA, sE ); \ SBOX( s3, s7, sB, sF ); \ \ - t1 = _mm256_blend_epi32( _mm256_bsrli_epi128( s4, 4 ), \ - _mm256_bslli_epi128( s5, 4 ), 0xAA ); \ - t3 = _mm256_blend_epi32( _mm256_bsrli_epi128( sD, 4 ), \ - _mm256_bslli_epi128( sE, 4 ), 0xAA ); \ + s4 = mm256_swap64_32( s4 ); \ + s5 = mm256_swap64_32( s5 ); \ + sD = mm256_swap64_32( sD ); \ + sE = mm256_swap64_32( sE ); \ + t1 = _mm256_blend_epi32( s4, s5, 0xaa ); \ + t3 = _mm256_blend_epi32( sD, sE, 0xaa ); \ L( s0, t1, s9, t3 ); \ - s4 = _mm256_blend_epi32( s4, _mm256_bslli_epi128( t1, 4 ), 0xAA );\ - s5 = _mm256_blend_epi32( s5, _mm256_bsrli_epi128( t1, 4 ), 0x55 );\ - sD = _mm256_blend_epi32( sD, _mm256_bslli_epi128( t3, 4 ), 0xAA );\ - sE = _mm256_blend_epi32( sE, _mm256_bsrli_epi128( t3, 4 ), 0x55 );\ + s4 = _mm256_blend_epi32( s4, t1, 0x55 ); \ + s5 = _mm256_blend_epi32( s5, t1, 0xaa ); \ + sD = _mm256_blend_epi32( sD, t3, 0x55 ); \ + sE = _mm256_blend_epi32( sE, t3, 0xaa ); \ \ - t1 = _mm256_blend_epi32( _mm256_bsrli_epi128( s5, 4 ), \ - _mm256_bslli_epi128( s6, 4 ), 0xAA ); \ - t3 = _mm256_blend_epi32( _mm256_bsrli_epi128( sE, 4 ), \ - _mm256_bslli_epi128( sF, 4 ), 0xAA ); \ + s6 = mm256_swap64_32( s6 ); \ + sF = mm256_swap64_32( sF ); \ + t1 = _mm256_blend_epi32( s5, s6, 0xaa ); \ + t3 = _mm256_blend_epi32( sE, sF, 0xaa ); \ L( s1, t1, sA, t3 ); \ - s5 = _mm256_blend_epi32( s5, _mm256_bslli_epi128( t1, 4 ), 0xAA );\ - s6 = _mm256_blend_epi32( s6, _mm256_bsrli_epi128( t1, 4 ), 0x55 );\ - sE = _mm256_blend_epi32( sE, _mm256_bslli_epi128( t3, 4 ), 0xAA );\ - sF = _mm256_blend_epi32( sF, _mm256_bsrli_epi128( t3, 4 ), 0x55 );\ + s5 = _mm256_blend_epi32( s5, t1, 0x55 ); \ + s6 = _mm256_blend_epi32( s6, t1, 0xaa ); \ + sE = _mm256_blend_epi32( sE, t3, 0x55 ); \ + sF = _mm256_blend_epi32( sF, t3, 0xaa ); \ \ - t1 = _mm256_blend_epi32( _mm256_bsrli_epi128( s6, 4 ), \ - _mm256_bslli_epi128( s7, 4 ), 0xAA ); \ - t3 = _mm256_blend_epi32( _mm256_bsrli_epi128( sF, 4 ), \ - _mm256_bslli_epi128( sC, 4 ), 0xAA ); \ + s7 = mm256_swap64_32( s7 ); \ + sC = mm256_swap64_32( sC ); \ + t1 = _mm256_blend_epi32( s6, s7, 0xaa ); \ + t3 = _mm256_blend_epi32( sF, sC, 0xaa ); \ L( s2, t1, sB, t3 ); \ - s6 = _mm256_blend_epi32( s6, _mm256_bslli_epi128( t1, 4 ), 0xAA );\ - s7 = _mm256_blend_epi32( s7, _mm256_bsrli_epi128( t1, 4 ), 0x55 );\ - sF = _mm256_blend_epi32( sF, _mm256_bslli_epi128( t3, 4 ), 0xAA );\ - sC = _mm256_blend_epi32( sC, _mm256_bsrli_epi128( t3, 4 ), 0x55 );\ + s6 = _mm256_blend_epi32( s6, t1, 0x55 ); \ + s7 = _mm256_blend_epi32( s7, t1, 0xaa ); \ + sF = _mm256_blend_epi32( sF, t3, 0x55 ); \ + sC = _mm256_blend_epi32( sC, t3, 0xaa ); \ + s6 = mm256_swap64_32( s6 ); \ + sF = mm256_swap64_32( sF ); \ \ - t1 = _mm256_blend_epi32( _mm256_bsrli_epi128( s7, 4 ), \ - _mm256_bslli_epi128( s4, 4 ), 0xAA ); \ - t3 = _mm256_blend_epi32( _mm256_bsrli_epi128( sC, 4 ), \ - _mm256_bslli_epi128( sD, 4 ), 0xAA ); \ + t1 = _mm256_blend_epi32( s7, s4, 0xaa ); \ + t3 = _mm256_blend_epi32( sC, sD, 0xaa ); \ L( s3, t1, s8, t3 ); \ - s7 = _mm256_blend_epi32( s7, _mm256_bslli_epi128( t1, 4 ), 0xAA );\ - s4 = _mm256_blend_epi32( s4, _mm256_bsrli_epi128( t1, 4 ), 0x55 );\ - sC = _mm256_blend_epi32( 
sC, _mm256_bslli_epi128( t3, 4 ), 0xAA );\ - sD = _mm256_blend_epi32( sD, _mm256_bsrli_epi128( t3, 4 ), 0x55 );\ + s7 = _mm256_blend_epi32( s7, t1, 0x55 ); \ + s4 = _mm256_blend_epi32( s4, t1, 0xaa ); \ + sC = _mm256_blend_epi32( sC, t3, 0x55 ); \ + sD = _mm256_blend_epi32( sD, t3, 0xaa ); \ + s7 = mm256_swap64_32( s7 ); \ + sC = mm256_swap64_32( sC ); \ \ - t0 = _mm256_blend_epi32( s0, _mm256_bslli_epi128( s8, 4 ), 0xAA ); \ - t1 = _mm256_blend_epi32( s1, s9, 0xAA ); \ - t2 = _mm256_blend_epi32( _mm256_bsrli_epi128( s2, 4 ), sA, 0xAA ); \ - t3 = _mm256_blend_epi32( _mm256_bsrli_epi128( s3, 4 ), \ - _mm256_bslli_epi128( sB, 4 ), 0xAA ); \ + t0 = _mm256_blend_epi32( s0, mm256_swap64_32( s8 ), 0xaa ); \ + t1 = _mm256_blend_epi32( s1, s9, 0xaa ); \ + t2 = _mm256_blend_epi32( mm256_swap64_32( s2 ), sA, 0xaa ); \ + t3 = _mm256_blend_epi32( s3, sB, 0x55 ); \ + t3 = mm256_swap64_32( t3 ); \ L( t0, t1, t2, t3 ); \ + t3 = mm256_swap64_32( t3 ); \ s0 = _mm256_blend_epi32( s0, t0, 0x55 ); \ - s8 = _mm256_blend_epi32( s8, _mm256_bsrli_epi128( t0, 4 ), 0x55 ); \ + s8 = _mm256_blend_epi32( s8, mm256_swap64_32( t0 ), 0x55 ); \ s1 = _mm256_blend_epi32( s1, t1, 0x55 ); \ - s9 = _mm256_blend_epi32( s9, t1, 0xAA ); \ - s2 = _mm256_blend_epi32( s2, _mm256_bslli_epi128( t2, 4 ), 0xAA ); \ - sA = _mm256_blend_epi32( sA, t2, 0xAA ); \ - s3 = _mm256_blend_epi32( s3, _mm256_bslli_epi128( t3, 4 ), 0xAA ); \ - sB = _mm256_blend_epi32( sB, _mm256_bsrli_epi128( t3, 4 ), 0x55 ); \ + s9 = _mm256_blend_epi32( s9, t1, 0xaa ); \ + s2 = _mm256_blend_epi32( s2, mm256_swap64_32( t2 ), 0xaa ); \ + sA = _mm256_blend_epi32( sA, t2, 0xaa ); \ + s3 = _mm256_blend_epi32( s3, t3, 0xaa ); \ + sB = _mm256_blend_epi32( sB, t3, 0x55 ); \ \ - t0 = _mm256_blend_epi32( _mm256_bsrli_epi128( s4, 4 ), sC, 0xAA ); \ - t1 = _mm256_blend_epi32( _mm256_bsrli_epi128( s5, 4 ), \ - _mm256_bslli_epi128( sD, 4 ), 0xAA ); \ - t2 = _mm256_blend_epi32( s6, _mm256_bslli_epi128( sE, 4 ), 0xAA ); \ - t3 = _mm256_blend_epi32( s7, sF, 0xAA ); \ + t0 = _mm256_blend_epi32( s4, sC, 0xaa ); \ + t1 = _mm256_blend_epi32( s5, sD, 0xaa ); \ + t2 = _mm256_blend_epi32( s6, sE, 0xaa ); \ + t3 = _mm256_blend_epi32( s7, sF, 0xaa ); \ L( t0, t1, t2, t3 ); \ - s4 = _mm256_blend_epi32( s4, _mm256_bslli_epi128( t0, 4 ), 0xAA ); \ - sC = _mm256_blend_epi32( sC, t0, 0xAA ); \ - s5 = _mm256_blend_epi32( s5, _mm256_bslli_epi128( t1, 4 ), 0xAA ); \ - sD = _mm256_blend_epi32( sD, _mm256_bsrli_epi128( t1, 4 ), 0x55 ); \ + s4 = _mm256_blend_epi32( s4, t0, 0x55 ); \ + sC = _mm256_blend_epi32( sC, t0, 0xaa ); \ + s5 = _mm256_blend_epi32( s5, t1, 0x55 ); \ + sD = _mm256_blend_epi32( sD, t1, 0xaa ); \ s6 = _mm256_blend_epi32( s6, t2, 0x55 ); \ - sE = _mm256_blend_epi32( sE, _mm256_bsrli_epi128( t2, 4 ), 0x55 ); \ + sE = _mm256_blend_epi32( sE, t2, 0xaa ); \ s7 = _mm256_blend_epi32( s7, t3, 0x55 ); \ - sF = _mm256_blend_epi32( sF, t3, 0xAA ); \ + sF = _mm256_blend_epi32( sF, t3, 0xaa ); \ + s4 = mm256_swap64_32( s4 ); \ + s5 = mm256_swap64_32( s5 ); \ + sD = mm256_swap64_32( sD ); \ + sE = mm256_swap64_32( sE ); \ } while (0) #define P_BIG \ do { \ - ROUND_BIG(0, alpha_n); \ - ROUND_BIG(1, alpha_n); \ - ROUND_BIG(2, alpha_n); \ - ROUND_BIG(3, alpha_n); \ - ROUND_BIG(4, alpha_n); \ - ROUND_BIG(5, alpha_n); \ + __m256i alpha[16]; \ + const uint64_t A0 = ( (uint64_t*)alpha_n )[0]; \ + for( int i = 0; i < 16; i++ ) \ + alpha[i] = m256_const1_64( ( (uint64_t*)alpha_n )[i] ); \ + ROUND_BIG( alpha ); \ + alpha[0] = m256_const1_64( (1ULL << 32) ^ A0 ); \ + ROUND_BIG( alpha ); \ + alpha[0] = 
m256_const1_64( (2ULL << 32) ^ A0 ); \ + ROUND_BIG( alpha ); \ + alpha[0] = m256_const1_64( (3ULL << 32) ^ A0 ); \ + ROUND_BIG( alpha ); \ + alpha[0] = m256_const1_64( (4ULL << 32) ^ A0 ); \ + ROUND_BIG( alpha ); \ + alpha[0] = m256_const1_64( (5ULL << 32) ^ A0 ); \ + ROUND_BIG( alpha ); \ } while (0) #define PF_BIG \ do { \ - ROUND_BIG( 0, alpha_f); \ - ROUND_BIG( 1, alpha_f); \ - ROUND_BIG( 2, alpha_f); \ - ROUND_BIG( 3, alpha_f); \ - ROUND_BIG( 4, alpha_f); \ - ROUND_BIG( 5, alpha_f); \ - ROUND_BIG( 6, alpha_f); \ - ROUND_BIG( 7, alpha_f); \ - ROUND_BIG( 8, alpha_f); \ - ROUND_BIG( 9, alpha_f); \ - ROUND_BIG(10, alpha_f); \ - ROUND_BIG(11, alpha_f); \ + __m256i alpha[16]; \ + const uint64_t A0 = ( (uint64_t*)alpha_f )[0]; \ + for( int i = 0; i < 16; i++ ) \ + alpha[i] = m256_const1_64( ( (uint64_t*)alpha_f )[i] ); \ + ROUND_BIG( alpha ); \ + alpha[0] = m256_const1_64( ( 1ULL << 32) ^ A0 ); \ + ROUND_BIG( alpha ); \ + alpha[0] = m256_const1_64( ( 2ULL << 32) ^ A0 ); \ + ROUND_BIG( alpha ); \ + alpha[0] = m256_const1_64( ( 3ULL << 32) ^ A0 ); \ + ROUND_BIG( alpha ); \ + alpha[0] = m256_const1_64( ( 4ULL << 32) ^ A0 ); \ + ROUND_BIG( alpha ); \ + alpha[0] = m256_const1_64( ( 5ULL << 32) ^ A0 ); \ + ROUND_BIG( alpha ); \ + alpha[0] = m256_const1_64( ( 6ULL << 32) ^ A0 ); \ + ROUND_BIG( alpha ); \ + alpha[0] = m256_const1_64( ( 7ULL << 32) ^ A0 ); \ + ROUND_BIG( alpha ); \ + alpha[0] = m256_const1_64( ( 8ULL << 32) ^ A0 ); \ + ROUND_BIG( alpha ); \ + alpha[0] = m256_const1_64( ( 9ULL << 32) ^ A0 ); \ + ROUND_BIG( alpha ); \ + alpha[0] = m256_const1_64( (10ULL << 32) ^ A0 ); \ + ROUND_BIG( alpha ); \ + alpha[0] = m256_const1_64( (11ULL << 32) ^ A0 ); \ + ROUND_BIG( alpha ); \ } while (0) #define T_BIG \ do { /* order is important */ \ - c7 = sc->h[ 0x7 ] = _mm256_xor_si256( sc->h[ 0x7 ], sB ); \ - c6 = sc->h[ 0x6 ] = _mm256_xor_si256( sc->h[ 0x6 ], sA ); \ - c5 = sc->h[ 0x5 ] = _mm256_xor_si256( sc->h[ 0x5 ], s9 ); \ - c4 = sc->h[ 0x4 ] = _mm256_xor_si256( sc->h[ 0x4 ], s8 ); \ - c3 = sc->h[ 0x3 ] = _mm256_xor_si256( sc->h[ 0x3 ], s3 ); \ - c2 = sc->h[ 0x2 ] = _mm256_xor_si256( sc->h[ 0x2 ], s2 ); \ - c1 = sc->h[ 0x1 ] = _mm256_xor_si256( sc->h[ 0x1 ], s1 ); \ - c0 = sc->h[ 0x0 ] = _mm256_xor_si256( sc->h[ 0x0 ], s0 ); \ + c7 = sc->h[ 7 ] = _mm256_xor_si256( sc->h[ 7 ], sB ); \ + c6 = sc->h[ 6 ] = _mm256_xor_si256( sc->h[ 6 ], sA ); \ + c5 = sc->h[ 5 ] = _mm256_xor_si256( sc->h[ 5 ], s9 ); \ + c4 = sc->h[ 4 ] = _mm256_xor_si256( sc->h[ 4 ], s8 ); \ + c3 = sc->h[ 3 ] = _mm256_xor_si256( sc->h[ 3 ], s3 ); \ + c2 = sc->h[ 2 ] = _mm256_xor_si256( sc->h[ 2 ], s2 ); \ + c1 = sc->h[ 1 ] = _mm256_xor_si256( sc->h[ 1 ], s1 ); \ + c0 = sc->h[ 0 ] = _mm256_xor_si256( sc->h[ 0 ], s0 ); \ } while (0) void hamsi_big( hamsi_4way_big_context *sc, __m256i *buf, size_t num ) @@ -1186,14 +1206,12 @@ void hamsi512_4way_update( hamsi_4way_big_context *sc, const void *data, void hamsi512_4way_close( hamsi_4way_big_context *sc, void *dst ) { __m256i pad[1]; - int ch, cl; + uint32_t ch, cl; sph_enc32be( &ch, sc->count_high ); sph_enc32be( &cl, sc->count_low + ( sc->partial_len << 3 ) ); - pad[0] = _mm256_set_epi32( cl, ch, cl, ch, cl, ch, cl, ch ); + pad[0] = _mm256_set1_epi64x( ((uint64_t)cl << 32 ) | (uint64_t)ch ); sc->buf[0] = m256_const1_64( 0x80 ); -// sc->buf[0] = _mm256_set_epi32( 0UL, 0x80UL, 0UL, 0x80UL, -// 0UL, 0x80UL, 0UL, 0x80UL ); hamsi_big( sc, sc->buf, 1 ); hamsi_big_final( sc, pad ); diff --git a/algo/haval/haval-hash-4way.c b/algo/haval/haval-hash-4way.c index 6b45e10b..20c9755f 100644 --- 
a/algo/haval/haval-hash-4way.c +++ b/algo/haval/haval-hash-4way.c @@ -522,50 +522,53 @@ do { \ // Haval-256 8 way 32 bit avx2 +#if defined (__AVX512VL__) + +// ( ~( a ^ b ) ) & c +#define mm256_andnotxor( a, b, c ) \ + _mm256_ternarylogic_epi32( a, b, c, 0x82 ) + +#else + +#define mm256_andnotxor( a, b, c ) \ + _mm256_andnot_si256( _mm256_xor_si256( a, b ), c ) + +#endif + #define F1_8W(x6, x5, x4, x3, x2, x1, x0) \ - _mm256_xor_si256( x0, \ - _mm256_xor_si256( _mm256_and_si256(_mm256_xor_si256( x0, x4 ), x1 ), \ - _mm256_xor_si256( _mm256_and_si256( x2, x5 ), \ - _mm256_and_si256( x3, x6 ) ) ) ) \ + mm256_xor3( x0, mm256_andxor( x1, x0, x4 ), \ + _mm256_xor_si256( _mm256_and_si256( x2, x5 ), \ + _mm256_and_si256( x3, x6 ) ) ) \ #define F2_8W(x6, x5, x4, x3, x2, x1, x0) \ - _mm256_xor_si256( \ - _mm256_and_si256( x2, \ - _mm256_xor_si256( _mm256_andnot_si256( x3, x1 ), \ - _mm256_xor_si256( _mm256_and_si256( x4, x5 ), \ - _mm256_xor_si256( x6, x0 ) ) ) ), \ - _mm256_xor_si256( \ - _mm256_and_si256( x4, _mm256_xor_si256( x1, x5 ) ), \ - _mm256_xor_si256( _mm256_and_si256( x3, x5 ), x0 ) ) ) \ + mm256_xor3( mm256_andxor( x2, _mm256_andnot_si256( x3, x1 ), \ + mm256_xor3( _mm256_and_si256( x4, x5 ), x6, x0 ) ), \ + mm256_andxor( x4, x1, x5 ), \ + mm256_xorand( x0, x3, x5 ) ) \ #define F3_8W(x6, x5, x4, x3, x2, x1, x0) \ - _mm256_xor_si256( \ - _mm256_and_si256( x3, \ - _mm256_xor_si256( _mm256_and_si256( x1, x2 ), \ - _mm256_xor_si256( x6, x0 ) ) ), \ - _mm256_xor_si256( _mm256_xor_si256(_mm256_and_si256( x1, x4 ), \ - _mm256_and_si256( x2, x5 ) ), x0 ) ) + mm256_xor3( x0, \ + _mm256_and_si256( x3, \ + mm256_xor3( _mm256_and_si256( x1, x2 ), x6, x0 ) ), \ + _mm256_xor_si256( _mm256_and_si256( x1, x4 ), \ + _mm256_and_si256( x2, x5 ) ) ) #define F4_8W(x6, x5, x4, x3, x2, x1, x0) \ - _mm256_xor_si256( \ - _mm256_xor_si256( \ - _mm256_and_si256( x3, \ - _mm256_xor_si256( _mm256_xor_si256( _mm256_and_si256( x1, x2 ), \ - _mm256_or_si256( x4, x6 ) ), x5 ) ), \ - _mm256_and_si256( x4, \ - _mm256_xor_si256( _mm256_xor_si256( _mm256_and_si256( mm256_not(x2), x5 ), \ - _mm256_xor_si256( x1, x6 ) ), x0 ) ) ), \ - _mm256_xor_si256( _mm256_and_si256( x2, x6 ), x0 ) ) - + mm256_xor3( \ + mm256_andxor( x3, x5, \ + _mm256_xor_si256( _mm256_and_si256( x1, x2 ), \ + _mm256_or_si256( x4, x6 ) ) ), \ + _mm256_and_si256( x4, \ + mm256_xor3( x0, _mm256_andnot_si256( x2, x5 ), \ + _mm256_xor_si256( x1, x6 ) ) ), \ + mm256_xorand( x0, x2, x6 ) ) #define F5_8W(x6, x5, x4, x3, x2, x1, x0) \ _mm256_xor_si256( \ - _mm256_and_si256( x0, \ - mm256_not( _mm256_xor_si256( \ - _mm256_and_si256( _mm256_and_si256( x1, x2 ), x3 ), x5 ) ) ), \ - _mm256_xor_si256( _mm256_xor_si256( _mm256_and_si256( x1, x4 ), \ - _mm256_and_si256( x2, x5 ) ), \ - _mm256_and_si256( x3, x6 ) ) ) + mm256_andnotxor( mm256_and3( x1, x2, x3 ), x5, x0 ), \ + mm256_xor3( _mm256_and_si256( x1, x4 ), \ + _mm256_and_si256( x2, x5 ), \ + _mm256_and_si256( x3, x6 ) ) ) #define FP3_1_8W(x6, x5, x4, x3, x2, x1, x0) \ F1_8W(x1, x0, x3, x5, x6, x2, x4) diff --git a/algo/hodl/hodl-wolf.c b/algo/hodl/hodl-wolf.c index 6ff61757..7ce79da8 100644 --- a/algo/hodl/hodl-wolf.c +++ b/algo/hodl/hodl-wolf.c @@ -7,6 +7,7 @@ #include "hodl-gate.h" #include "hodl-wolf.h" #include "miner.h" +#include "algo/sha/sha256d.h" #if defined(__AES__) diff --git a/algo/hodl/sha512-avx.h b/algo/hodl/sha512-avx.h index eb7f094a..6fbb5bf7 100644 --- a/algo/hodl/sha512-avx.h +++ b/algo/hodl/sha512-avx.h @@ -45,6 +45,6 @@ void sha512Compute32b_parallel( uint64_t *data[SHA512_PARALLEL_N], 
uint64_t *digest[SHA512_PARALLEL_N]); -void sha512ProcessBlock(Sha512Context *context); +void sha512ProcessBlock(Sha512Context contexti[2] ); #endif diff --git a/algo/jh/jh-hash-4way.c b/algo/jh/jh-hash-4way.c index 452bc8a6..98a9da01 100644 --- a/algo/jh/jh-hash-4way.c +++ b/algo/jh/jh-hash-4way.c @@ -51,15 +51,15 @@ extern "C"{ do { \ __m512i cc = _mm512_set1_epi64( c ); \ x3 = mm512_not( x3 ); \ - x0 = _mm512_xor_si512( x0, _mm512_andnot_si512( x2, cc ) ); \ - tmp = _mm512_xor_si512( cc, _mm512_and_si512( x0, x1 ) ); \ - x0 = _mm512_xor_si512( x0, _mm512_and_si512( x2, x3 ) ); \ - x3 = _mm512_xor_si512( x3, _mm512_andnot_si512( x1, x2 ) ); \ - x1 = _mm512_xor_si512( x1, _mm512_and_si512( x0, x2 ) ); \ - x2 = _mm512_xor_si512( x2, _mm512_andnot_si512( x3, x0 ) ); \ - x0 = _mm512_xor_si512( x0, _mm512_or_si512( x1, x3 ) ); \ - x3 = _mm512_xor_si512( x3, _mm512_and_si512( x1, x2 ) ); \ - x1 = _mm512_xor_si512( x1, _mm512_and_si512( tmp, x0 ) ); \ + x0 = mm512_xorandnot( x0, x2, cc ); \ + tmp = mm512_xorand( cc, x0, x1 ); \ + x0 = mm512_xorand( x0, x2, x3 ); \ + x3 = mm512_xorandnot( x3, x1, x2 ); \ + x1 = mm512_xorand( x1, x0, x2 ); \ + x2 = mm512_xorandnot( x2, x3, x0 ); \ + x0 = mm512_xoror( x0, x1, x3 ); \ + x3 = mm512_xorand( x3, x1, x2 ); \ + x1 = mm512_xorand( x1, tmp, x0 ); \ x2 = _mm512_xor_si512( x2, tmp ); \ } while (0) @@ -67,11 +67,11 @@ do { \ do { \ x4 = _mm512_xor_si512( x4, x1 ); \ x5 = _mm512_xor_si512( x5, x2 ); \ - x6 = _mm512_xor_si512( x6, _mm512_xor_si512( x3, x0 ) ); \ + x6 = mm512_xor3( x6, x3, x0 ); \ x7 = _mm512_xor_si512( x7, x0 ); \ x0 = _mm512_xor_si512( x0, x5 ); \ x1 = _mm512_xor_si512( x1, x6 ); \ - x2 = _mm512_xor_si512( x2, _mm512_xor_si512( x7, x4 ) ); \ + x2 = mm512_xor3( x2, x7, x4 ); \ x3 = _mm512_xor_si512( x3, x4 ); \ } while (0) @@ -318,12 +318,12 @@ static const sph_u64 C[] = { #define Wz_8W(x, c, n) \ do { \ __m512i t = _mm512_slli_epi64( _mm512_and_si512(x ## h, (c)), (n) ); \ - x ## h = _mm512_or_si512( _mm512_and_si512( \ - _mm512_srli_epi64(x ## h, (n)), (c)), t ); \ + x ## h = mm512_orand( t, _mm512_srli_epi64( x ## h, (n) ), (c) ); \ t = _mm512_slli_epi64( _mm512_and_si512(x ## l, (c)), (n) ); \ - x ## l = _mm512_or_si512( _mm512_and_si512((x ## l >> (n)), (c)), t ); \ + x ## l = mm512_orand( t, (x ## l >> (n)), (c) ); \ } while (0) + #define W80(x) Wz_8W(x, m512_const1_64( 0x5555555555555555 ), 1 ) #define W81(x) Wz_8W(x, m512_const1_64( 0x3333333333333333 ), 2 ) #define W82(x) Wz_8W(x, m512_const1_64( 0x0F0F0F0F0F0F0F0F ), 4 ) diff --git a/algo/keccak/keccak-gate.c b/algo/keccak/keccak-gate.c index 282ae91a..c710836b 100644 --- a/algo/keccak/keccak-gate.c +++ b/algo/keccak/keccak-gate.c @@ -1,5 +1,6 @@ #include "keccak-gate.h" #include "sph_keccak.h" +#include "algo/sha/sha256d.h" int hard_coded_eb = 1; diff --git a/algo/keccak/keccak-hash-4way.c b/algo/keccak/keccak-hash-4way.c index cc883322..458201cb 100644 --- a/algo/keccak/keccak-hash-4way.c +++ b/algo/keccak/keccak-hash-4way.c @@ -53,7 +53,8 @@ static const uint64_t RC[] = { #define WRITE_STATE(sc) #define MOV64(d, s) (d = s) -#define XOR64_IOTA XOR64 +#define XOR64_IOTA XOR + #define LPAR ( #define RPAR ) @@ -70,12 +71,16 @@ static const uint64_t RC[] = { // Targetted macros, keccak-macros.h is included for each target. 
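/* A minimal sketch of the three-input helpers relied on below -- not part of
   this patch.  The targeted macros XOR3, XOROR and XORAND map onto
   mm512_xor3, mm512_xoror and mm512_xorand, whose real definitions live in
   the project's simd-utils headers and are not shown in this diff.  Assuming
   AVX-512F, each such helper is typically a single ternary-logic instruction;
   the immediate encodes the 3-input truth table, indexed by (a<<2)|(b<<1)|c.
   The immediates here are derived from the stated expressions (the same
   convention as the 0x82 used for mm256_andnotxor in the haval hunk above),
   not copied from the project, so treat this as an illustration only. */

#include <immintrin.h>

/* a ^ b ^ c */
#define mm512_xor3( a, b, c )   _mm512_ternarylogic_epi64( a, b, c, 0x96 )
/* a ^ ( b | c ) */
#define mm512_xoror( a, b, c )  _mm512_ternarylogic_epi64( a, b, c, 0x1e )
/* a ^ ( b & c ) */
#define mm512_xorand( a, b, c ) _mm512_ternarylogic_epi64( a, b, c, 0x78 )

/* AVX2 has no ternary-logic instruction, which is why the 4-way (AVX2)
   keccak macros later in this file spell the same operations out as two
   two-input ops, e.g. XOROR(d,a,b,c) built from _mm256_xor_si256 and
   _mm256_or_si256. */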
-#define DECL64(x) __m512i x -#define XOR64(d, a, b) (d = _mm512_xor_si512(a,b)) -#define AND64(d, a, b) (d = _mm512_and_si512(a,b)) -#define OR64(d, a, b) (d = _mm512_or_si512(a,b)) -#define NOT64(d, s) (d = _mm512_xor_si512(s,m512_neg1)) -#define ROL64(d, v, n) (d = mm512_rol_64(v, n)) +#define DECL64(x) __m512i x +#define XOR(d, a, b) (d = _mm512_xor_si512(a,b)) +#define XOR64 XOR +#define AND64(d, a, b) (d = _mm512_and_si512(a,b)) +#define OR64(d, a, b) (d = _mm512_or_si512(a,b)) +#define NOT64(d, s) (d = _mm512_xor_si512(s,m512_neg1)) +#define ROL64(d, v, n) (d = mm512_rol_64(v, n)) +#define XOROR(d, a, b, c) (d = mm512_xoror(a, b, c)) +#define XORAND(d, a, b, c) (d = mm512_xorand(a, b, c)) +#define XOR3( d, a, b, c ) (d = mm512_xor3( a, b, c )) #include "keccak-macros.c" @@ -233,12 +238,15 @@ keccak512_8way_close(void *cc, void *dst) #undef INPUT_BUF #undef DECL64 #undef XOR64 +#undef XOR #undef AND64 #undef OR64 #undef NOT64 #undef ROL64 #undef KECCAK_F_1600 - +#undef XOROR +#undef XORAND +#undef XOR3 #endif // AVX512 // AVX2 @@ -250,11 +258,15 @@ keccak512_8way_close(void *cc, void *dst) } while (0) #define DECL64(x) __m256i x -#define XOR64(d, a, b) (d = _mm256_xor_si256(a,b)) +#define XOR(d, a, b) (d = _mm256_xor_si256(a,b)) +#define XOR64 XOR #define AND64(d, a, b) (d = _mm256_and_si256(a,b)) #define OR64(d, a, b) (d = _mm256_or_si256(a,b)) #define NOT64(d, s) (d = _mm256_xor_si256(s,m256_neg1)) #define ROL64(d, v, n) (d = mm256_rol_64(v, n)) +#define XOROR(d, a, b, c) (d = _mm256_xor_si256(a, _mm256_or_si256(b, c))) +#define XORAND(d, a, b, c) (d = _mm256_xor_si256(a, _mm256_and_si256(b, c))) +#define XOR3( d, a, b, c ) (d = mm256_xor3( a, b, c )) #include "keccak-macros.c" @@ -414,10 +426,14 @@ keccak512_4way_close(void *cc, void *dst) #undef INPUT_BUF #undef DECL64 #undef XOR64 +#undef XOR #undef AND64 #undef OR64 #undef NOT64 #undef ROL64 #undef KECCAK_F_1600 +#undef XOROR +#undef XORAND +#undef XOR3 #endif // AVX2 diff --git a/algo/keccak/keccak-macros.c b/algo/keccak/keccak-macros.c index 64606c37..6b7776d3 100644 --- a/algo/keccak/keccak-macros.c +++ b/algo/keccak/keccak-macros.c @@ -1,6 +1,19 @@ #ifdef TH_ELT #undef TH_ELT #endif + +#define TH_ELT(t, c0, c1, c2, c3, c4, d0, d1, d2, d3, d4) do { \ + DECL64(tt0); \ + DECL64(tt1); \ + XOR3( tt0, d0, d1, d4 ); \ + XOR( tt1, d2, d3 ); \ + XOR( tt0, tt0, tt1 ); \ + ROL64( tt0, tt0, 1 ); \ + XOR3( tt1, c0, c1, c4 ); \ + XOR3( tt0, tt0, c2, c3 ); \ + XOR( t, tt0, tt1 ); \ +} while (0) +/* #define TH_ELT(t, c0, c1, c2, c3, c4, d0, d1, d2, d3, d4) do { \ DECL64(tt0); \ DECL64(tt1); \ @@ -17,7 +30,7 @@ XOR64(tt2, tt2, tt3); \ XOR64(t, tt0, tt2); \ } while (0) - +*/ #ifdef THETA #undef THETA #endif @@ -110,20 +123,34 @@ #ifdef KHI_XO #undef KHI_XO #endif + +#define KHI_XO(d, a, b, c) do { \ + XOROR(d, a, b, c); \ + } while (0) + +/* #define KHI_XO(d, a, b, c) do { \ DECL64(kt); \ OR64(kt, b, c); \ XOR64(d, a, kt); \ } while (0) +*/ #ifdef KHI_XA #undef KHI_XA #endif + +#define KHI_XA(d, a, b, c) do { \ + XORAND(d, a, b, c); \ + } while (0) + +/* #define KHI_XA(d, a, b, c) do { \ DECL64(kt); \ AND64(kt, b, c); \ XOR64(d, a, kt); \ } while (0) +*/ #ifdef KHI #undef KHI @@ -134,65 +161,47 @@ do { \ DECL64(c0); \ DECL64(c1); \ - DECL64(c2); \ - DECL64(c3); \ - DECL64(c4); \ DECL64(bnn); \ NOT64(bnn, b20); \ KHI_XO(c0, b00, b10, b20); \ KHI_XO(c1, b10, bnn, b30); \ - KHI_XA(c2, b20, b30, b40); \ - KHI_XO(c3, b30, b40, b00); \ - KHI_XA(c4, b40, b00, b10); \ + KHI_XA(b20, b20, b30, b40); \ + KHI_XO(b30, b30, b40, b00); \ + KHI_XA(b40, b40, 
b00, b10); \ MOV64(b00, c0); \ MOV64(b10, c1); \ - MOV64(b20, c2); \ - MOV64(b30, c3); \ - MOV64(b40, c4); \ NOT64(bnn, b41); \ KHI_XO(c0, b01, b11, b21); \ KHI_XA(c1, b11, b21, b31); \ - KHI_XO(c2, b21, b31, bnn); \ - KHI_XO(c3, b31, b41, b01); \ - KHI_XA(c4, b41, b01, b11); \ + KHI_XO(b21, b21, b31, bnn); \ + KHI_XO(b31, b31, b41, b01); \ + KHI_XA(b41, b41, b01, b11); \ MOV64(b01, c0); \ MOV64(b11, c1); \ - MOV64(b21, c2); \ - MOV64(b31, c3); \ - MOV64(b41, c4); \ NOT64(bnn, b32); \ KHI_XO(c0, b02, b12, b22); \ KHI_XA(c1, b12, b22, b32); \ - KHI_XA(c2, b22, bnn, b42); \ - KHI_XO(c3, bnn, b42, b02); \ - KHI_XA(c4, b42, b02, b12); \ + KHI_XA(b22, b22, bnn, b42); \ + KHI_XO(b32, bnn, b42, b02); \ + KHI_XA(b42, b42, b02, b12); \ MOV64(b02, c0); \ MOV64(b12, c1); \ - MOV64(b22, c2); \ - MOV64(b32, c3); \ - MOV64(b42, c4); \ NOT64(bnn, b33); \ KHI_XA(c0, b03, b13, b23); \ KHI_XO(c1, b13, b23, b33); \ - KHI_XO(c2, b23, bnn, b43); \ - KHI_XA(c3, bnn, b43, b03); \ - KHI_XO(c4, b43, b03, b13); \ + KHI_XO(b23, b23, bnn, b43); \ + KHI_XA(b33, bnn, b43, b03); \ + KHI_XO(b43, b43, b03, b13); \ MOV64(b03, c0); \ MOV64(b13, c1); \ - MOV64(b23, c2); \ - MOV64(b33, c3); \ - MOV64(b43, c4); \ NOT64(bnn, b14); \ KHI_XA(c0, b04, bnn, b24); \ KHI_XO(c1, bnn, b24, b34); \ - KHI_XA(c2, b24, b34, b44); \ - KHI_XO(c3, b34, b44, b04); \ - KHI_XA(c4, b44, b04, b14); \ + KHI_XA(b24, b24, b34, b44); \ + KHI_XO(b34, b34, b44, b04); \ + KHI_XA(b44, b44, b04, b14); \ MOV64(b04, c0); \ MOV64(b14, c1); \ - MOV64(b24, c2); \ - MOV64(b34, c3); \ - MOV64(b44, c4); \ } while (0) #ifdef IOTA @@ -201,6 +210,7 @@ #define IOTA(r) XOR64_IOTA(a00, a00, r) #ifdef P0 +#undef P0 #undef P1 #undef P2 #undef P3 diff --git a/algo/luffa/luffa-hash-2way.c b/algo/luffa/luffa-hash-2way.c index aad56b63..3d1ce0d9 100644 --- a/algo/luffa/luffa-hash-2way.c +++ b/algo/luffa/luffa-hash-2way.c @@ -66,6 +66,17 @@ static const uint32 CNS_INIT[128] __attribute((aligned(64))) = { a = _mm512_xor_si512(a,c0);\ b = _mm512_xor_si512(b,c1); +#define MULT24W( a0, a1 ) \ +do { \ + __m512i b = _mm512_xor_si512( a0, \ + _mm512_maskz_shuffle_epi32( 0xbbbb, a1, 16 ) ); \ + a0 = _mm512_or_si512( _mm512_bsrli_epi128( b, 4 ), \ + _mm512_bslli_epi128( a1,12 ) ); \ + a1 = _mm512_or_si512( _mm512_bsrli_epi128( a1, 4 ), \ + _mm512_bslli_epi128( b,12 ) ); \ +} while(0) + +/* #define MULT24W( a0, a1, mask ) \ do { \ __m512i b = _mm512_xor_si512( a0, \ @@ -73,6 +84,7 @@ do { \ a0 = _mm512_or_si512( _mm512_bsrli_epi128(b,4), _mm512_bslli_epi128(a1,12) );\ a1 = _mm512_or_si512( _mm512_bsrli_epi128(a1,4), _mm512_bslli_epi128(b,12) );\ } while(0) +*/ // confirm pointer arithmetic // ok but use array indexes @@ -85,6 +97,21 @@ do { \ MIXWORD4W(*(x+3),*(x+7),*t,*(t+1));\ ADD_CONSTANT4W(*x, *(x+4), c0, c1); +#define SUBCRUMB4W(a0,a1,a2,a3,t)\ + t = a0;\ + a0 = mm512_xoror( a3, a0, a1 ); \ + a2 = _mm512_xor_si512(a2,a3);\ + a1 = _mm512_ternarylogic_epi64( a1, a3, t, 0x87 ); /* a1 xnor (a3 & t) */ \ + a3 = mm512_xorand( a2, a3, t ); \ + a2 = mm512_xorand( a1, a2, a0);\ + a1 = _mm512_or_si512(a1,a3);\ + a3 = _mm512_xor_si512(a3,a2);\ + t = _mm512_xor_si512(t,a1);\ + a2 = _mm512_and_si512(a2,a1);\ + a1 = mm512_xnor(a1,a0);\ + a0 = t; + +/* #define SUBCRUMB4W(a0,a1,a2,a3,t)\ t = _mm512_load_si512(&a0);\ a0 = _mm512_or_si512(a0,a1);\ @@ -103,7 +130,25 @@ do { \ a2 = _mm512_and_si512(a2,a1);\ a1 = _mm512_xor_si512(a1,a0);\ a0 = _mm512_load_si512(&t); +*/ +#define MIXWORD4W(a,b,t1,t2)\ + b = _mm512_xor_si512(a,b);\ + t1 = _mm512_slli_epi32(a,2);\ + t2 = _mm512_srli_epi32(a,30);\ + a = 
mm512_xoror( b, t1, t2 ); \ + t1 = _mm512_slli_epi32(b,14);\ + t2 = _mm512_srli_epi32(b,18);\ + b = _mm512_or_si512(t1,t2);\ + b = mm512_xoror( a, t1, t2 ); \ + t1 = _mm512_slli_epi32(a,10);\ + t2 = _mm512_srli_epi32(a,22);\ + a = mm512_xoror( b, t1, t2 ); \ + t1 = _mm512_slli_epi32(b,1);\ + t2 = _mm512_srli_epi32(b,31);\ + b = _mm512_or_si512(t1,t2); + +/* #define MIXWORD4W(a,b,t1,t2)\ b = _mm512_xor_si512(a,b);\ t1 = _mm512_slli_epi32(a,2);\ @@ -121,6 +166,7 @@ do { \ t1 = _mm512_slli_epi32(b,1);\ t2 = _mm512_srli_epi32(b,31);\ b = _mm512_or_si512(t1,t2); +*/ #define STEP_PART24W(a0,a1,t0,t1,c0,c1,tmp0,tmp1)\ a1 = _mm512_shuffle_epi32(a1,147);\ @@ -235,21 +281,13 @@ void rnd512_4way( luffa_4way_context *state, __m512i *msg ) __m512i msg0, msg1; __m512i tmp[2]; __m512i x[8]; - const __m512i MASK = m512_const2_64( 0, 0x00000000ffffffff ); - - t0 = chainv[0]; - t1 = chainv[1]; - t0 = _mm512_xor_si512( t0, chainv[2] ); - t1 = _mm512_xor_si512( t1, chainv[3] ); - t0 = _mm512_xor_si512( t0, chainv[4] ); - t1 = _mm512_xor_si512( t1, chainv[5] ); - t0 = _mm512_xor_si512( t0, chainv[6] ); - t1 = _mm512_xor_si512( t1, chainv[7] ); - t0 = _mm512_xor_si512( t0, chainv[8] ); - t1 = _mm512_xor_si512( t1, chainv[9] ); + t0 = mm512_xor3( chainv[0], chainv[2], chainv[4] ); + t1 = mm512_xor3( chainv[1], chainv[3], chainv[5] ); + t0 = mm512_xor3( t0, chainv[6], chainv[8] ); + t1 = mm512_xor3( t1, chainv[7], chainv[9] ); - MULT24W( t0, t1, MASK ); + MULT24W( t0, t1 ); msg0 = _mm512_shuffle_epi32( msg[0], 27 ); msg1 = _mm512_shuffle_epi32( msg[1], 27 ); @@ -268,68 +306,67 @@ void rnd512_4way( luffa_4way_context *state, __m512i *msg ) t0 = chainv[0]; t1 = chainv[1]; - MULT24W( chainv[0], chainv[1], MASK ); + MULT24W( chainv[0], chainv[1] ); chainv[0] = _mm512_xor_si512( chainv[0], chainv[2] ); chainv[1] = _mm512_xor_si512( chainv[1], chainv[3] ); - MULT24W( chainv[2], chainv[3], MASK ); + MULT24W( chainv[2], chainv[3] ); chainv[2] = _mm512_xor_si512(chainv[2], chainv[4]); chainv[3] = _mm512_xor_si512(chainv[3], chainv[5]); - MULT24W( chainv[4], chainv[5], MASK ); + MULT24W( chainv[4], chainv[5] ); chainv[4] = _mm512_xor_si512(chainv[4], chainv[6]); chainv[5] = _mm512_xor_si512(chainv[5], chainv[7]); - MULT24W( chainv[6], chainv[7], MASK ); + MULT24W( chainv[6], chainv[7] ); chainv[6] = _mm512_xor_si512(chainv[6], chainv[8]); chainv[7] = _mm512_xor_si512(chainv[7], chainv[9]); - MULT24W( chainv[8], chainv[9], MASK ); + MULT24W( chainv[8], chainv[9] ); chainv[8] = _mm512_xor_si512( chainv[8], t0 ); chainv[9] = _mm512_xor_si512( chainv[9], t1 ); t0 = chainv[8]; t1 = chainv[9]; - MULT24W( chainv[8], chainv[9], MASK ); + MULT24W( chainv[8], chainv[9] ); chainv[8] = _mm512_xor_si512( chainv[8], chainv[6] ); chainv[9] = _mm512_xor_si512( chainv[9], chainv[7] ); - MULT24W( chainv[6], chainv[7], MASK ); + MULT24W( chainv[6], chainv[7] ); chainv[6] = _mm512_xor_si512( chainv[6], chainv[4] ); chainv[7] = _mm512_xor_si512( chainv[7], chainv[5] ); - MULT24W( chainv[4], chainv[5], MASK ); + MULT24W( chainv[4], chainv[5] ); chainv[4] = _mm512_xor_si512( chainv[4], chainv[2] ); chainv[5] = _mm512_xor_si512( chainv[5], chainv[3] ); - MULT24W( chainv[2], chainv[3], MASK ); + MULT24W( chainv[2], chainv[3] ); chainv[2] = _mm512_xor_si512( chainv[2], chainv[0] ); chainv[3] = _mm512_xor_si512( chainv[3], chainv[1] ); - MULT24W( chainv[0], chainv[1], MASK ); - chainv[0] = _mm512_xor_si512( _mm512_xor_si512( chainv[0], t0 ), msg0 ); - chainv[1] = _mm512_xor_si512( _mm512_xor_si512( chainv[1], t1 ), msg1 ); + MULT24W( chainv[0], 
chainv[1] ); + chainv[0] = mm512_xor3( chainv[0], t0, msg0 ); + chainv[1] = mm512_xor3( chainv[1], t1, msg1 ); - MULT24W( msg0, msg1, MASK ); + MULT24W( msg0, msg1 ); chainv[2] = _mm512_xor_si512( chainv[2], msg0 ); chainv[3] = _mm512_xor_si512( chainv[3], msg1 ); - MULT24W( msg0, msg1, MASK ); + MULT24W( msg0, msg1 ); chainv[4] = _mm512_xor_si512( chainv[4], msg0 ); chainv[5] = _mm512_xor_si512( chainv[5], msg1 ); - MULT24W( msg0, msg1, MASK ); + MULT24W( msg0, msg1 ); chainv[6] = _mm512_xor_si512( chainv[6], msg0 ); chainv[7] = _mm512_xor_si512( chainv[7], msg1 ); - MULT24W( msg0, msg1, MASK ); + MULT24W( msg0, msg1); chainv[8] = _mm512_xor_si512( chainv[8], msg0 ); chainv[9] = _mm512_xor_si512( chainv[9], msg1 ); - MULT24W( msg0, msg1, MASK ); + MULT24W( msg0, msg1 ); - // replace with ror chainv[3] = _mm512_rol_epi32( chainv[3], 1 ); chainv[5] = _mm512_rol_epi32( chainv[5], 2 ); chainv[7] = _mm512_rol_epi32( chainv[7], 3 ); @@ -388,19 +425,11 @@ void finalization512_4way( luffa_4way_context *state, uint32 *b ) /*---- blank round with m=0 ----*/ rnd512_4way( state, zero ); - - t[0] = chainv[0]; - t[1] = chainv[1]; - - t[0] = _mm512_xor_si512( t[0], chainv[2] ); - t[1] = _mm512_xor_si512( t[1], chainv[3] ); - t[0] = _mm512_xor_si512( t[0], chainv[4] ); - t[1] = _mm512_xor_si512( t[1], chainv[5] ); - t[0] = _mm512_xor_si512( t[0], chainv[6] ); - t[1] = _mm512_xor_si512( t[1], chainv[7] ); - t[0] = _mm512_xor_si512( t[0], chainv[8] ); - t[1] = _mm512_xor_si512( t[1], chainv[9] ); - + + t[0] = mm512_xor3( chainv[0], chainv[2], chainv[4] ); + t[1] = mm512_xor3( chainv[1], chainv[3], chainv[5] ); + t[0] = mm512_xor3( t[0], chainv[6], chainv[8] ); + t[1] = mm512_xor3( t[1], chainv[7], chainv[9] ); t[0] = _mm512_shuffle_epi32( t[0], 27 ); t[1] = _mm512_shuffle_epi32( t[1], 27 ); @@ -496,7 +525,7 @@ int luffa_4way_update( luffa_4way_context *state, const void *data, { // remaining data bytes buffer[0] = _mm512_shuffle_epi8( vdata[0], shuff_bswap32 ); - buffer[1] = m512_const2_64( 0, 0x0000000080000000 ); + buffer[1] = m512_const1_i128( 0x0000000080000000 ); } return 0; } @@ -520,7 +549,7 @@ int luffa_4way_close( luffa_4way_context *state, void *hashval ) rnd512_4way( state, buffer ); else { // empty pad block, constant data - msg[0] = m512_const2_64( 0, 0x0000000080000000 ); + msg[0] = m512_const1_i128( 0x0000000080000000 ); msg[1] = m512_zero; rnd512_4way( state, msg ); } @@ -583,13 +612,13 @@ int luffa512_4way_full( luffa_4way_context *state, void *output, { // padding of partial block msg[0] = _mm512_shuffle_epi8( vdata[ 0 ], shuff_bswap32 ); - msg[1] = m512_const2_64( 0, 0x0000000080000000 ); + msg[1] = m512_const1_i128( 0x0000000080000000 ); rnd512_4way( state, msg ); } else { // empty pad block - msg[0] = m512_const2_64( 0, 0x0000000080000000 ); + msg[0] = m512_const1_i128( 0x0000000080000000 ); msg[1] = m512_zero; rnd512_4way( state, msg ); } @@ -631,13 +660,13 @@ int luffa_4way_update_close( luffa_4way_context *state, { // padding of partial block msg[0] = _mm512_shuffle_epi8( vdata[ 0 ], shuff_bswap32 ); - msg[1] = m512_const2_64( 0, 0x0000000080000000 ); + msg[1] = m512_const1_i128( 0x0000000080000000 ); rnd512_4way( state, msg ); } else { // empty pad block - msg[0] = m512_const2_64( 0, 0x0000000080000000 ); + msg[0] = m512_const1_i128( 0x0000000080000000 ); msg[1] = m512_zero; rnd512_4way( state, msg ); } @@ -666,8 +695,6 @@ do { \ a1 = _mm256_or_si256( _mm256_srli_si256(a1,4), _mm256_slli_si256(b,12) ); \ } while(0) -// confirm pointer arithmetic -// ok but use array indexes #define 
STEP_PART(x,c0,c1,t)\ SUBCRUMB(*x,*(x+1),*(x+2),*(x+3),*t);\ SUBCRUMB(*(x+5),*(x+6),*(x+7),*(x+4),*t);\ @@ -678,23 +705,23 @@ do { \ ADD_CONSTANT(*x, *(x+4), c0, c1); #define SUBCRUMB(a0,a1,a2,a3,t)\ - t = _mm256_load_si256(&a0);\ + t = a0;\ a0 = _mm256_or_si256(a0,a1);\ a2 = _mm256_xor_si256(a2,a3);\ - a1 = _mm256_andnot_si256(a1, m256_neg1 );\ + a1 = mm256_not( a1 );\ a0 = _mm256_xor_si256(a0,a3);\ a3 = _mm256_and_si256(a3,t);\ a1 = _mm256_xor_si256(a1,a3);\ a3 = _mm256_xor_si256(a3,a2);\ a2 = _mm256_and_si256(a2,a0);\ - a0 = _mm256_andnot_si256(a0, m256_neg1 );\ + a0 = mm256_not( a0 );\ a2 = _mm256_xor_si256(a2,a1);\ a1 = _mm256_or_si256(a1,a3);\ t = _mm256_xor_si256(t,a1);\ a3 = _mm256_xor_si256(a3,a2);\ a2 = _mm256_and_si256(a2,a1);\ a1 = _mm256_xor_si256(a1,a0);\ - a0 = _mm256_load_si256(&t);\ + a0 = t;\ #define MIXWORD(a,b,t1,t2)\ b = _mm256_xor_si256(a,b);\ @@ -832,7 +859,7 @@ void rnd512_2way( luffa_2way_context *state, __m256i *msg ) __m256i msg0, msg1; __m256i tmp[2]; __m256i x[8]; - const __m256i MASK = m256_const2_64( 0, 0x00000000ffffffff ); + const __m256i MASK = m256_const1_i128( 0x00000000ffffffff ); t0 = chainv[0]; t1 = chainv[1]; @@ -1088,7 +1115,7 @@ int luffa_2way_update( luffa_2way_context *state, const void *data, { // remaining data bytes buffer[0] = _mm256_shuffle_epi8( vdata[0], shuff_bswap32 ); - buffer[1] = m256_const2_64( 0, 0x0000000080000000 ); + buffer[1] = m256_const1_i128( 0x0000000080000000 ); } return 0; } @@ -1104,7 +1131,7 @@ int luffa_2way_close( luffa_2way_context *state, void *hashval ) rnd512_2way( state, buffer ); else { // empty pad block, constant data - msg[0] = m256_const2_64( 0, 0x0000000080000000 ); + msg[0] = m256_const1_i128( 0x0000000080000000 ); msg[1] = m256_zero; rnd512_2way( state, msg ); } @@ -1159,13 +1186,13 @@ int luffa512_2way_full( luffa_2way_context *state, void *output, { // padding of partial block msg[0] = _mm256_shuffle_epi8( vdata[ 0 ], shuff_bswap32 ); - msg[1] = m256_const2_64( 0, 0x0000000080000000 ); + msg[1] = m256_const1_i128( 0x0000000080000000 ); rnd512_2way( state, msg ); } else { // empty pad block - msg[0] = m256_const2_64( 0, 0x0000000080000000 ); + msg[0] = m256_const1_i128( 0x0000000080000000 ); msg[1] = m256_zero; rnd512_2way( state, msg ); } @@ -1206,13 +1233,13 @@ int luffa_2way_update_close( luffa_2way_context *state, { // padding of partial block msg[0] = _mm256_shuffle_epi8( vdata[ 0 ], shuff_bswap32 ); - msg[1] = m256_const2_64( 0, 0x0000000080000000 ); + msg[1] = m256_const1_i128( 0x0000000080000000 ); rnd512_2way( state, msg ); } else { // empty pad block - msg[0] = m256_const2_64( 0, 0x0000000080000000 ); + msg[0] = m256_const1_i128( 0x0000000080000000 ); msg[1] = m256_zero; rnd512_2way( state, msg ); } diff --git a/algo/luffa/luffa_for_sse2.c b/algo/luffa/luffa_for_sse2.c index 780e56d7..fee498a6 100644 --- a/algo/luffa/luffa_for_sse2.c +++ b/algo/luffa/luffa_for_sse2.c @@ -23,7 +23,7 @@ #include "simd-utils.h" #include "luffa_for_sse2.h" -#define MULT2(a0,a1) do \ +#define MULT2( a0, a1 ) do \ { \ __m128i b = _mm_xor_si128( a0, _mm_shuffle_epi32( _mm_and_si128(a1,MASK), 16 ) ); \ a0 = _mm_or_si128( _mm_srli_si128(b,4), _mm_slli_si128(a1,12) ); \ @@ -345,11 +345,11 @@ HashReturn update_and_final_luffa( hashState_luffa *state, BitSequence* output, // 16 byte partial block exists for 80 byte len if ( state->rembytes ) // padding of partial block - rnd512( state, m128_const_64( 0, 0x80000000 ), + rnd512( state, m128_const_i128( 0x80000000 ), mm128_bswap_32( cast_m128i( data ) ) ); else // empty pad 
block - rnd512( state, m128_zero, m128_const_64( 0, 0x80000000 ) ); + rnd512( state, m128_zero, m128_const_i128( 0x80000000 ) ); finalization512( state, (uint32*) output ); if ( state->hashbitlen > 512 ) @@ -394,11 +394,11 @@ int luffa_full( hashState_luffa *state, BitSequence* output, int hashbitlen, // 16 byte partial block exists for 80 byte len if ( state->rembytes ) // padding of partial block - rnd512( state, m128_const_64( 0, 0x80000000 ), + rnd512( state, m128_const_i128( 0x80000000 ), mm128_bswap_32( cast_m128i( data ) ) ); else // empty pad block - rnd512( state, m128_zero, m128_const_64( 0, 0x80000000 ) ); + rnd512( state, m128_zero, m128_const_i128( 0x80000000 ) ); finalization512( state, (uint32*) output ); if ( state->hashbitlen > 512 ) @@ -606,7 +606,6 @@ static void finalization512( hashState_luffa *state, uint32 *b ) casti_m256i( b, 0 ) = _mm256_shuffle_epi8( casti_m256i( hash, 0 ), shuff_bswap32 ); -// casti_m256i( b, 0 ) = mm256_bswap_32( casti_m256i( hash, 0 ) ); rnd512( state, zero, zero ); @@ -621,7 +620,6 @@ static void finalization512( hashState_luffa *state, uint32 *b ) casti_m256i( b, 1 ) = _mm256_shuffle_epi8( casti_m256i( hash, 0 ), shuff_bswap32 ); -// casti_m256i( b, 1 ) = mm256_bswap_32( casti_m256i( hash, 0 ) ); } #else diff --git a/algo/lyra2/allium-4way.c b/algo/lyra2/allium-4way.c index 833b87ec..f16047e9 100644 --- a/algo/lyra2/allium-4way.c +++ b/algo/lyra2/allium-4way.c @@ -16,7 +16,7 @@ typedef struct { blake256_16way_context blake; keccak256_8way_context keccak; - cube_4way_context cube; + cube_4way_2buf_context cube; skein256_8way_context skein; #if defined(__VAES__) groestl256_4way_context groestl; @@ -30,13 +30,7 @@ static __thread allium_16way_ctx_holder allium_16way_ctx; bool init_allium_16way_ctx() { keccak256_8way_init( &allium_16way_ctx.keccak ); - cube_4way_init( &allium_16way_ctx.cube, 256, 16, 32 ); skein256_8way_init( &allium_16way_ctx.skein ); -#if defined(__VAES__) - groestl256_4way_init( &allium_16way_ctx.groestl, 32 ); -#else - init_groestl256( &allium_16way_ctx.groestl, 32 ); -#endif return true; } @@ -75,7 +69,6 @@ void allium_16way_hash( void *state, const void *input ) intrlv_8x64( vhashB, hash8, hash9, hash10, hash11, hash12, hash13, hash14, hash15, 256 ); -// rintrlv_8x32_8x64( vhashA, vhash, 256 ); keccak256_8way_update( &ctx.keccak, vhashA, 32 ); keccak256_8way_close( &ctx.keccak, vhashA); keccak256_8way_init( &ctx.keccak ); @@ -111,12 +104,11 @@ void allium_16way_hash( void *state, const void *input ) intrlv_2x256( vhash, hash14, hash15, 256 ); LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 8, 8 ); dintrlv_2x256( hash14, hash15, vhash, 256 ); - + intrlv_4x128( vhashA, hash0, hash1, hash2, hash3, 256 ); intrlv_4x128( vhashB, hash4, hash5, hash6, hash7, 256 ); - cube_4way_full( &ctx.cube, vhashA, 256, vhashA, 32 ); - cube_4way_full( &ctx.cube, vhashB, 256, vhashB, 32 ); + cube_4way_2buf_full( &ctx.cube, vhashA, vhashB, 256, vhashA, vhashB, 32 ); dintrlv_4x128( hash0, hash1, hash2, hash3, vhashA, 256 ); dintrlv_4x128( hash4, hash5, hash6, hash7, vhashB, 256 ); @@ -124,8 +116,7 @@ void allium_16way_hash( void *state, const void *input ) intrlv_4x128( vhashA, hash8, hash9, hash10, hash11, 256 ); intrlv_4x128( vhashB, hash12, hash13, hash14, hash15, 256 ); - cube_4way_full( &ctx.cube, vhashA, 256, vhashA, 32 ); - cube_4way_full( &ctx.cube, vhashB, 256, vhashB, 32 ); + cube_4way_2buf_full( &ctx.cube, vhashA, vhashB, 256, vhashA, vhashB, 32 ); dintrlv_4x128( hash8, hash9, hash10, hash11, vhashA, 256 ); dintrlv_4x128( hash12, hash13, hash14, 
hash15, vhashB, 256 ); @@ -255,7 +246,7 @@ int scanhash_allium_16way( struct work *work, uint32_t max_nonce, typedef struct { blake256_8way_context blake; keccak256_4way_context keccak; - cubehashParam cube; + cube_2way_context cube; skein256_4way_context skein; #if defined(__VAES__) groestl256_2way_context groestl; @@ -269,13 +260,7 @@ static __thread allium_8way_ctx_holder allium_8way_ctx; bool init_allium_8way_ctx() { keccak256_4way_init( &allium_8way_ctx.keccak ); - cubehashInit( &allium_8way_ctx.cube, 256, 16, 32 ); skein256_4way_init( &allium_8way_ctx.skein ); -#if defined(__VAES__) - groestl256_2way_init( &allium_8way_ctx.groestl, 32 ); -#else - init_groestl256( &allium_8way_ctx.groestl, 32 ); -#endif return true; } @@ -298,7 +283,7 @@ void allium_8way_hash( void *hash, const void *input ) blake256_8way_close( &ctx.blake, vhashA ); dintrlv_8x32( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, - vhashA, 256 ); + vhashA, 256 ); intrlv_4x64( vhashA, hash0, hash1, hash2, hash3, 256 ); intrlv_4x64( vhashB, hash4, hash5, hash6, hash7, 256 ); @@ -320,21 +305,20 @@ void allium_8way_hash( void *hash, const void *input ) LYRA2RE( hash6, 32, hash6, 32, hash6, 32, 1, 8, 8 ); LYRA2RE( hash7, 32, hash7, 32, hash7, 32, 1, 8, 8 ); - cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*)hash0, 32 ); - cubehashInit( &ctx.cube, 256, 16, 32 ); - cubehashUpdateDigest( &ctx.cube, (byte*)hash1, (const byte*)hash1, 32 ); - cubehashInit( &ctx.cube, 256, 16, 32 ); - cubehashUpdateDigest( &ctx.cube, (byte*)hash2, (const byte*)hash2, 32 ); - cubehashInit( &ctx.cube, 256, 16, 32 ); - cubehashUpdateDigest( &ctx.cube, (byte*)hash3, (const byte*)hash3, 32 ); - cubehashInit( &ctx.cube, 256, 16, 32 ); - cubehashUpdateDigest( &ctx.cube, (byte*)hash4, (const byte*)hash4, 32 ); - cubehashInit( &ctx.cube, 256, 16, 32 ); - cubehashUpdateDigest( &ctx.cube, (byte*)hash5, (const byte*)hash5, 32 ); - cubehashInit( &ctx.cube, 256, 16, 32 ); - cubehashUpdateDigest( &ctx.cube, (byte*)hash6, (const byte*)hash6, 32 ); - cubehashInit( &ctx.cube, 256, 16, 32 ); - cubehashUpdateDigest( &ctx.cube, (byte*)hash7, (const byte*)hash7, 32 ); + + intrlv_2x128( vhashA, hash0, hash1, 256 ); + intrlv_2x128( vhashB, hash2, hash3, 256 ); + cube_2way_full( &ctx.cube, vhashA, 256, vhashA, 32 ); + cube_2way_full( &ctx.cube, vhashB, 256, vhashB, 32 ); + dintrlv_2x128( hash0, hash1, vhashA, 256 ); + dintrlv_2x128( hash2, hash3, vhashB, 256 ); + + intrlv_2x128( vhashA, hash4, hash5, 256 ); + intrlv_2x128( vhashB, hash6, hash7, 256 ); + cube_2way_full( &ctx.cube, vhashA, 256, vhashA, 32 ); + cube_2way_full( &ctx.cube, vhashB, 256, vhashB, 32 ); + dintrlv_2x128( hash4, hash5, vhashA, 256 ); + dintrlv_2x128( hash6, hash7, vhashB, 256 ); LYRA2RE( hash0, 32, hash0, 32, hash0, 32, 1, 8, 8 ); LYRA2RE( hash1, 32, hash1, 32, hash1, 32, 1, 8, 8 ); diff --git a/algo/lyra2/lyra2-gate.c b/algo/lyra2/lyra2-gate.c index c1d70e7d..8804c41c 100644 --- a/algo/lyra2/lyra2-gate.c +++ b/algo/lyra2/lyra2-gate.c @@ -188,7 +188,7 @@ bool register_allium_algo( algo_gate_t* gate ) gate->hash = (void*)&allium_hash; #endif gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT - | VAES_OPT | VAES256_OPT; + | VAES_OPT; opt_target_factor = 256.0; return true; }; diff --git a/algo/lyra2/lyra2z-4way.c b/algo/lyra2/lyra2z-4way.c index a5f8c9a4..531ce5d5 100644 --- a/algo/lyra2/lyra2z-4way.c +++ b/algo/lyra2/lyra2z-4way.c @@ -49,7 +49,7 @@ void lyra2z_16way_hash( void *state, const void *input ) dintrlv_16x32( hash0, hash1, hash2, hash3, hash4, hash5, hash6, 
hash7, hash8, hash9, hash10, hash11 ,hash12, hash13, hash14, hash15, - vhash, 256 ); + vhash, 256 ); intrlv_2x256( vhash, hash0, hash1, 256 ); LYRA2Z_2WAY( lyra2z_16way_matrix, vhash, 32, vhash, 32, 8, 8, 8 ); diff --git a/algo/lyra2/sponge.h b/algo/lyra2/sponge.h index b24b1730..1c904447 100644 --- a/algo/lyra2/sponge.h +++ b/algo/lyra2/sponge.h @@ -66,13 +66,13 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){ #define LYRA_ROUND_2WAY_AVX512( s0, s1, s2, s3 ) \ G2W_4X64( s0, s1, s2, s3 ); \ - s1 = mm512_ror256_64( s1); \ + s3 = mm512_shufll256_64( s3 ); \ + s1 = mm512_shuflr256_64( s1); \ s2 = mm512_swap256_128( s2 ); \ - s3 = mm512_rol256_64( s3 ); \ G2W_4X64( s0, s1, s2, s3 ); \ - s1 = mm512_rol256_64( s1 ); \ - s2 = mm512_swap256_128( s2 ); \ - s3 = mm512_ror256_64( s3 ); + s3 = mm512_shuflr256_64( s3 ); \ + s1 = mm512_shufll256_64( s1 ); \ + s2 = mm512_swap256_128( s2 ); #define LYRA_12_ROUNDS_2WAY_AVX512( s0, s1, s2, s3 ) \ LYRA_ROUND_2WAY_AVX512( s0, s1, s2, s3 ) \ @@ -107,13 +107,13 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){ #define LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \ G_4X64( s0, s1, s2, s3 ); \ - s1 = mm256_ror_1x64( s1); \ + s3 = mm256_shufll_64( s3 ); \ + s1 = mm256_shuflr_64( s1); \ s2 = mm256_swap_128( s2 ); \ - s3 = mm256_rol_1x64( s3 ); \ G_4X64( s0, s1, s2, s3 ); \ - s1 = mm256_rol_1x64( s1 ); \ - s2 = mm256_swap_128( s2 ); \ - s3 = mm256_ror_1x64( s3 ); + s3 = mm256_shuflr_64( s3 ); \ + s1 = mm256_shufll_64( s1 ); \ + s2 = mm256_swap_128( s2 ); #define LYRA_12_ROUNDS_AVX2( s0, s1, s2, s3 ) \ LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \ @@ -148,14 +148,14 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){ #define LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \ G_2X64( s0, s2, s4, s6 ); \ G_2X64( s1, s3, s5, s7 ); \ - mm128_ror256_64( s2, s3 ); \ + mm128_vrol256_64( s6, s7 ); \ + mm128_vror256_64( s2, s3 ); \ mm128_swap256_128( s4, s5 ); \ - mm128_rol256_64( s6, s7 ); \ G_2X64( s0, s2, s4, s6 ); \ G_2X64( s1, s3, s5, s7 ); \ - mm128_rol256_64( s2, s3 ); \ - mm128_swap256_128( s4, s5 ); \ - mm128_ror256_64( s6, s7 ); + mm128_vror256_64( s6, s7 ); \ + mm128_vrol256_64( s2, s3 ); \ + mm128_swap256_128( s4, s5 ); #define LYRA_12_ROUNDS_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \ LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \ diff --git a/algo/m7m/m7m.c b/algo/m7m/m7m.c index ab13a7e3..2bf4a11f 100644 --- a/algo/m7m/m7m.c +++ b/algo/m7m/m7m.c @@ -13,6 +13,7 @@ #include "algo/whirlpool/sph_whirlpool.h" #include "algo/ripemd/sph_ripemd.h" #include "algo/sha/sph_sha2.h" +#include "algo/sha/sha256-hash.h" #define EPSa DBL_EPSILON #define EPS1 DBL_EPSILON @@ -104,8 +105,8 @@ uint32_t sw2_( int nnounce ) } typedef struct { - sph_sha256_context sha256; - sph_sha512_context sha512; + sha256_context sha256; + sph_sha512_context sha512; sph_keccak512_context keccak; sph_whirlpool_context whirlpool; sph_haval256_5_context haval; @@ -117,7 +118,7 @@ m7m_ctx_holder m7m_ctx; void init_m7m_ctx() { - sph_sha256_init( &m7m_ctx ); + sha256_ctx_init( &m7m_ctx.sha256 ); sph_sha512_init( &m7m_ctx.sha512 ); sph_keccak512_init( &m7m_ctx.keccak ); sph_whirlpool_init( &m7m_ctx.whirlpool ); @@ -153,11 +154,10 @@ int scanhash_m7m_hash( struct work* work, uint64_t max_nonce, m7m_ctx_holder ctx1, ctx2 __attribute__ ((aligned (64))); memcpy( &ctx1, &m7m_ctx, sizeof(m7m_ctx) ); - sph_sha256_context ctxf_sha256; memcpy(data, pdata, 80); - sph_sha256( &ctx1.sha256, data, M7_MIDSTATE_LEN ); + sha256_update( &ctx1.sha256, data, M7_MIDSTATE_LEN ); sph_sha512( &ctx1.sha512, 
data, M7_MIDSTATE_LEN ); sph_keccak512( &ctx1.keccak, data, M7_MIDSTATE_LEN ); sph_whirlpool( &ctx1.whirlpool, data, M7_MIDSTATE_LEN ); @@ -189,8 +189,8 @@ int scanhash_m7m_hash( struct work* work, uint64_t max_nonce, memcpy( &ctx2, &ctx1, sizeof(m7m_ctx) ); - sph_sha256( &ctx2.sha256, data_p64, 80 - M7_MIDSTATE_LEN ); - sph_sha256_close( &ctx2.sha256, bhash[0] ); + sha256_update( &ctx2.sha256, data_p64, 80 - M7_MIDSTATE_LEN ); + sha256_final( &ctx2.sha256, bhash[0] ); sph_sha512( &ctx2.sha512, data_p64, 80 - M7_MIDSTATE_LEN ); sph_sha512_close( &ctx2.sha512, bhash[1] ); @@ -225,9 +225,7 @@ int scanhash_m7m_hash( struct work* work, uint64_t max_nonce, bytes = mpz_sizeinbase(product, 256); mpz_export((void *)bdata, NULL, -1, 1, 0, 0, product); - sph_sha256_init( &ctxf_sha256 ); - sph_sha256( &ctxf_sha256, bdata, bytes ); - sph_sha256_close( &ctxf_sha256, hash ); + sha256_full( hash, bdata, bytes ); digits=(int)((sqrt((double)(n/2))*(1.+EPS))/9000+75); mp_bitcnt_t prec = (long int)(digits*BITS_PER_DIGIT+16); @@ -260,10 +258,8 @@ int scanhash_m7m_hash( struct work* work, uint64_t max_nonce, mpzscale=bytes; mpz_export(bdata, NULL, -1, 1, 0, 0, product); - sph_sha256_init( &ctxf_sha256 ); - sph_sha256( &ctxf_sha256, bdata, bytes ); - sph_sha256_close( &ctxf_sha256, hash ); - } + sha256_full( hash, bdata, bytes ); + } if ( unlikely( valid_hash( (uint64_t*)hash, (uint64_t*)ptarget ) && !opt_benchmark ) ) diff --git a/algo/panama/panama-hash-4way.c b/algo/panama/panama-hash-4way.c index d0bc1868..912fb2ec 100644 --- a/algo/panama/panama-hash-4way.c +++ b/algo/panama/panama-hash-4way.c @@ -312,10 +312,26 @@ do { \ BUPDATE1_8W( 7, 1 ); \ } while (0) +#if defined(__AVX512VL__) + +#define GAMMA_8W(n0, n1, n2, n4) \ + ( g ## n0 = _mm256_ternarylogic_epi32( a ## n0, a ## n2, a ## n1, 0x4b ) ) + +#define THETA_8W(n0, n1, n2, n4) \ + ( g ## n0 = mm256_xor3( a ## n0, a ## n1, a ## n4 ) ) + +#else + #define GAMMA_8W(n0, n1, n2, n4) \ (g ## n0 = _mm256_xor_si256( a ## n0, \ _mm256_or_si256( a ## n1, mm256_not( a ## n2 ) ) ) ) +#define THETA_8W(n0, n1, n2, n4) \ + ( g ## n0 = _mm256_xor_si256( a ## n0, _mm256_xor_si256( a ## n1, \ + a ## n4 ) ) ) + +#endif + #define PI_ALL_8W do { \ a0 = g0; \ a1 = mm256_rol_32( g7, 1 ); \ @@ -336,9 +352,6 @@ do { \ a16 = mm256_rol_32( g10, 8 ); \ } while (0) -#define THETA_8W(n0, n1, n2, n4) \ - ( g ## n0 = _mm256_xor_si256( a ## n0, _mm256_xor_si256( a ## n1, \ - a ## n4 ) ) ) #define SIGMA_ALL_8W do { \ a0 = _mm256_xor_si256( g0, m256_one_32 ); \ diff --git a/algo/quark/quark-4way.c b/algo/quark/quark-4way.c index d15e6bd8..f29b951d 100644 --- a/algo/quark/quark-4way.c +++ b/algo/quark/quark-4way.c @@ -127,10 +127,8 @@ void quark_8way_hash( void *state, const void *input ) rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 ); - if ( ( vh_mask & 0x0f ) != 0x0f ) - groestl512_4way_full( &ctx.groestl, vhashA, vhashA, 64 ); - if ( ( vh_mask & 0xf0 ) != 0xf0 ) - groestl512_4way_full( &ctx.groestl, vhashB, vhashB, 64 ); + groestl512_4way_full( &ctx.groestl, vhashA, vhashA, 64 ); + groestl512_4way_full( &ctx.groestl, vhashB, vhashB, 64 ); rintrlv_4x128_8x64( vhash, vhashA, vhashB, 512 ); @@ -139,22 +137,14 @@ void quark_8way_hash( void *state, const void *input ) dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, vhash, 512 ); - if ( hash0[0] & 8 ) - groestl512_full( &ctx.groestl, (char*)hash0, (char*)hash0, 512 ); - if ( hash1[0] & 8 ) - groestl512_full( &ctx.groestl, (char*)hash1, (char*)hash1, 512 ); - if ( hash2[0] & 8) - groestl512_full( &ctx.groestl, 
(char*)hash2, (char*)hash2, 512 ); - if ( hash3[0] & 8 ) - groestl512_full( &ctx.groestl, (char*)hash3, (char*)hash3, 512 ); - if ( hash4[0] & 8 ) - groestl512_full( &ctx.groestl, (char*)hash4, (char*)hash4, 512 ); - if ( hash5[0] & 8 ) - groestl512_full( &ctx.groestl, (char*)hash5, (char*)hash5, 512 ); - if ( hash6[0] & 8 ) - groestl512_full( &ctx.groestl, (char*)hash6, (char*)hash6, 512 ); - if ( hash7[0] & 8 ) - groestl512_full( &ctx.groestl, (char*)hash7, (char*)hash7, 512 ); + groestl512_full( &ctx.groestl, (char*)hash0, (char*)hash0, 512 ); + groestl512_full( &ctx.groestl, (char*)hash1, (char*)hash1, 512 ); + groestl512_full( &ctx.groestl, (char*)hash2, (char*)hash2, 512 ); + groestl512_full( &ctx.groestl, (char*)hash3, (char*)hash3, 512 ); + groestl512_full( &ctx.groestl, (char*)hash4, (char*)hash4, 512 ); + groestl512_full( &ctx.groestl, (char*)hash5, (char*)hash5, 512 ); + groestl512_full( &ctx.groestl, (char*)hash6, (char*)hash6, 512 ); + groestl512_full( &ctx.groestl, (char*)hash7, (char*)hash7, 512 ); intrlv_8x64( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, 512 ); diff --git a/algo/ripemd/lbry-gate.c b/algo/ripemd/lbry-gate.c index ba38c651..d962f501 100644 --- a/algo/ripemd/lbry-gate.c +++ b/algo/ripemd/lbry-gate.c @@ -4,7 +4,7 @@ #include #include -double lbry_calc_network_diff( struct work *work ) +long double lbry_calc_network_diff( struct work *work ) { // sample for diff 43.281 : 1c05ea29 // todo: endian reversed on longpoll could be zr5 specific... @@ -12,7 +12,7 @@ double lbry_calc_network_diff( struct work *work ) uint32_t nbits = swab32( work->data[ LBRY_NBITS_INDEX ] ); uint32_t bits = (nbits & 0xffffff); int16_t shift = (swab32(nbits) & 0xff); // 0x1c = 28 - double d = (double)0x0000ffff / (double)bits; + long double d = (long double)0x0000ffff / (long double)bits; for (int m=shift; m < 29; m++) d *= 256.0; for (int m=29; m < shift; m++) d /= 256.0; diff --git a/algo/ripemd/lbry.c b/algo/ripemd/lbry.c index 94f34171..e91b287c 100644 --- a/algo/ripemd/lbry.c +++ b/algo/ripemd/lbry.c @@ -7,24 +7,19 @@ #include #include #include "sph_ripemd.h" -#include "algo/sha/sph_sha2.h" +#include "algo/sha/sha256-hash.h" void lbry_hash(void* output, const void* input) { - sph_sha256_context ctx_sha256 __attribute__ ((aligned (64))); + sha256_context ctx_sha256 __attribute__ ((aligned (64))); sph_sha512_context ctx_sha512 __attribute__ ((aligned (64))); sph_ripemd160_context ctx_ripemd __attribute__ ((aligned (64))); uint32_t _ALIGN(64) hashA[16]; uint32_t _ALIGN(64) hashB[16]; uint32_t _ALIGN(64) hashC[16]; - sph_sha256_init( &ctx_sha256 ); - sph_sha256( &ctx_sha256, input, 112 ); - sph_sha256_close( &ctx_sha256, hashA ); - - sph_sha256_init( &ctx_sha256 ); - sph_sha256( &ctx_sha256, hashA, 32 ); - sph_sha256_close( &ctx_sha256, hashA ); + sha256_full( hashA, input, 112 ); + sha256_full( hashA, hashA, 32 ); sph_sha512_init( &ctx_sha512 ); sph_sha512( &ctx_sha512, hashA, 32 ); @@ -38,15 +33,13 @@ void lbry_hash(void* output, const void* input) sph_ripemd160 ( &ctx_ripemd, hashA+8, 32 ); sph_ripemd160_close( &ctx_ripemd, hashC ); - sph_sha256_init( &ctx_sha256 ); - sph_sha256( &ctx_sha256, hashB, 20 ); - sph_sha256( &ctx_sha256, hashC, 20 ); - sph_sha256_close( &ctx_sha256, hashA ); - - sph_sha256_init( &ctx_sha256 ); - sph_sha256( &ctx_sha256, hashA, 32 ); - sph_sha256_close( &ctx_sha256, hashA ); + sha256_ctx_init( &ctx_sha256 ); + sha256_update( &ctx_sha256, hashB, 20 ); + sha256_update( &ctx_sha256, hashC, 20 ); + sha256_final( &ctx_sha256, hashA ); + 
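+   // double SHA-256: the 32 byte digest of ( hashB || hashC ) is hashed
+   // once more below, matching the sph_sha256 sequence removed above.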
sha256_full( hashA, hashA, 32 ); + memcpy( output, hashA, 32 ); } diff --git a/algo/ripemd/sph_ripemd.c b/algo/ripemd/sph_ripemd.c index f2954971..dd610966 100644 --- a/algo/ripemd/sph_ripemd.c +++ b/algo/ripemd/sph_ripemd.c @@ -35,6 +35,7 @@ #include "sph_ripemd.h" +#if 0 /* * Round functions for RIPEMD (original). */ @@ -46,6 +47,7 @@ static const sph_u32 oIV[5] = { SPH_C32(0x67452301), SPH_C32(0xEFCDAB89), SPH_C32(0x98BADCFE), SPH_C32(0x10325476) }; +#endif /* * Round functions for RIPEMD-128 and RIPEMD-160. @@ -63,6 +65,8 @@ static const sph_u32 IV[5] = { #define ROTL SPH_ROTL32 +#if 0 + /* ===================================================================== */ /* * RIPEMD (original hash, deprecated). @@ -479,7 +483,7 @@ sph_ripemd_comp(const sph_u32 msg[16], sph_u32 val[4]) * One round of RIPEMD-128. The data must be aligned for 32-bit access. */ static void -ripemd128_round(const unsigned char *data, sph_u32 r[5]) +ripemd128_round(const unsigned char *data, sph_u32 r[4]) { #if SPH_LITTLE_FAST @@ -539,6 +543,8 @@ sph_ripemd128_comp(const sph_u32 msg[16], sph_u32 val[4]) #undef RIPEMD128_IN } +#endif + /* ===================================================================== */ /* * RIPEMD-160. diff --git a/algo/ripemd/sph_ripemd.h b/algo/ripemd/sph_ripemd.h index 39fe5d1a..b677bd54 100644 --- a/algo/ripemd/sph_ripemd.h +++ b/algo/ripemd/sph_ripemd.h @@ -84,6 +84,7 @@ * can be cloned by copying the context (e.g. with a simple * memcpy()). */ +#if 0 typedef struct { #ifndef DOXYGEN_IGNORE unsigned char buf[64]; /* first field, for alignment */ @@ -204,6 +205,8 @@ void sph_ripemd128_close(void *cc, void *dst); */ void sph_ripemd128_comp(const sph_u32 msg[16], sph_u32 val[4]); +#endif + /* ===================================================================== */ /** diff --git a/algo/scrypt/neoscrypt.c b/algo/scrypt/neoscrypt.c index 7cb4c828..709b2688 100644 --- a/algo/scrypt/neoscrypt.c +++ b/algo/scrypt/neoscrypt.c @@ -69,8 +69,12 @@ typedef unsigned int uint; #define SCRYPT_HASH_BLOCK_SIZE 64U #define SCRYPT_HASH_DIGEST_SIZE 32U -#define ROTL32(a,b) (((a) << (b)) | ((a) >> (32 - b))) -#define ROTR32(a,b) (((a) >> (b)) | ((a) << (32 - b))) +//#define ROTL32(a,b) (((a) << (b)) | ((a) >> (32 - b))) +//#define ROTR32(a,b) (((a) >> (b)) | ((a) << (32 - b))) + +#define ROTL32(a,b) rol32(a,b) +#define ROTR32(a,b) ror32(a,b) + #define U8TO32_BE(p) \ (((uint32_t)((p)[0]) << 24) | ((uint32_t)((p)[1]) << 16) | \ diff --git a/algo/scrypt/scrypt-core-4way.c b/algo/scrypt/scrypt-core-4way.c new file mode 100644 index 00000000..23ad4e62 --- /dev/null +++ b/algo/scrypt/scrypt-core-4way.c @@ -0,0 +1,3265 @@ +#include "scrypt-core-4way.h" + +////////////////////////////////////////////////////////////////////////// +// +// Optimized Salsa implementation inspired by Pooler. +// Any similarities are not a coincidence. +// +// Implementations include reference X64, SSE2, AVX2 & AVX512 +// using both serial and parallel vectoring with SIMD instructions. +// +// Generic macros are provided and invoked with different targets depending +// on the level of parallelism and data organization. Targets for any macros +// needed must be defined by the calling function. XOR, ROL32 and ADD32 +// are needed in all cases. Additionally ROL_1X32, SWAP_64 and ROR_1X32 +// shuffles are needed for serial SIMD. +// +// SALSA_8ROUNDS_SIMD uses vectors on serial data rather than traditional +// n-way parallel hashing.
+// The SIMD version has different implied arguments {X0:X3}, representing +// an array of 4 vectors of 4 32 bit words, while the version used for +// regular parallel hashing has {x0:xf} representing an array of 16 32 bit +// words. +// These arguments must be defined by the calling function. +// The calling function must also define targets for all macros used for +// arithmetic, logic and shuffling: XOR, ROL32, ADD32 for all targets and +// ROL_1X32, SWAP_64, ROR_1X32 for serial SIMD targets. +// +// Serial and parallel SIMD will be combined with AVX2 doing 2 way +// parallel over 4 way linear for 8 way throughput, and AVX512 doing +// 4 way parallel over 4 way linear for 16 way throughput. +// +// The term SIMD128 here refers to vectors that contain multiple contiguous +// data from a single stream (lane) as opposed to parallel vectors that +// contain interleaved words of data from multiple streams. +// +// The sequencing of techniques in the naming convention is a little +// mixed up. The logical hierarchy top down is to put Nbuffs at the top +// where each buffer then performs another technique. +// +// Although Nway and SIMD128 are listed in top down order, Nbuffs is +// always listed last: +// +// scrypt_core_simd128_2way means a linear SIMD operation on 2 parallel +// streams of data while +// scrypt_core_2way_simd128 is 2 parallel streams of linear SIMD vectors. +// +/////////////////////////////////////////////////////////////////////////// + + +// Used by all targets, needs XOR, ROL32 & ADD32 macros defined +// Function, return typically overwrites in1 +// +#define ARX( in1, in2, in3, n ) \ + XOR( in1, ROL32( ADD32( in2, in3 ), n ) ) + +// Multi buffering has 2 main benefits and one drawback. +// Traditionally double buffering has been used to empty one bucket +// while another is filling. This requires a second (or 3rd, etc) +// bucket. The computing analogy is to use 2 registers, 1 to read +// and 1 to write, and switch back and forth. +// +// The second benefit in computing is using multiple registers to +// provide data independence that improves multiple instruction issue and +// pipelining in the CPU. The number of buffers is limited by the number +// of registers available. Three seems to be a sweet spot as a 4 variable +// data set uses 12 registers triple buffered, leaving 4 of 16 as temps. +// Many pipelined instructions require 3 clocks to complete and triple +// buffering keeps the pipeline full. Many execution units are also 3 wide +// allowing up to 3 similar instructions to be issued per clock. +// However, execution units are shared by hyperthreading which reduces +// the effect on a single thread. +// +// The drawback is the increased size of the data. Although multi buffering +// also improves memory throughput, this is offset by the amount of +// memory required and its effect on cache performance, and will eventually +// hit memory bus saturation. +// +// For example scryptn2 struggles with more than 4 buffers, multi +// buffered and parallel SIMD combined, and performance drops. This can +// be mitigated somewhat by reducing the number of CPU threads but +// ultimately excessive multi buffering has a negative impact. +// +// Unlike parallel SIMD, increasing multi buffering does not require a +// CPU technology increase, i.e. SSE2 to AVX2 or AVX2 to AVX512. +// SSE2 is limited to 4 way SIMD but there is no theoretical limit to +// multi buffering. +// Multi buffering also does not suffer the clock penalty of increasing +// parallelism.
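+//
+// As a rough scalar sketch (for illustration only, not used by this code),
+// one double buffered ARX step is just two independent ARX chains
+// interleaved so their latencies can overlap:
+//
+//    ta = rol32( a2 + a3, n );      tb = rol32( b2 + b3, n );
+//    a1 ^= ta;                      b1 ^= tb;
+//
+// which is what the ARX_2BUF and ARX_3BUF macros below expand to, with
+// TYPE, ADD32, ROL32 and XOR bound to the chosen vector width.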
+// +// Multi buffering implementations here focus on powers of 2, +// to match sha256 without re-interleaving the data. +// +// A decision will have to be made at run time, based of the N factor, +// whether to use multi buffering or serial execution. + +// Need TYPE macro defined. +#define ARX_2BUF( a1, a2, a3, b1, b2, b3, n ) \ +do{ \ + TYPE ta = ADD32( a2, a3 ); \ + TYPE tb = ADD32( b2, b3 ); \ + ta = ROL32( ta, n ); \ + tb = ROL32( tb, n ); \ + a1 = XOR( a1, ta ); \ + b1 = XOR( b1, tb ); \ +} while (0); + +#define ARX_3BUF( a1, a2, a3, b1, b2, b3, c1, c2, c3, n ) \ +do{ \ + TYPE ta = ADD32( a2, a3 ); \ + TYPE tb = ADD32( b2, b3 ); \ + TYPE tc = ADD32( c2, c3 ); \ + ta = ROL32( ta, n ); \ + tb = ROL32( tb, n ); \ + tc = ROL32( tc, n ); \ + a1 = XOR( a1, ta ); \ + b1 = XOR( b1, tb ); \ + c1 = XOR( c1, tc ); \ +} while (0); + + +// Used by SIMD128 and hybrid targets, needs also ROL_1X32, SWAP_64 & +// ROR_1X32 defined. +// +// Implied arguments ( X0 = { x3, x2, x1, x0 }, +// X1 = { x7, x6, x5, x4 }, +// X3 = { xb, xa, x9, x8 }, +// X3 = { xf, xe, xd, xc } ) +// +#define SALSA_2ROUNDS_SIMD128( X0, X1, X2, X3 ) \ + /* Operate on columns */ \ + X1 = ARX( X1, X0, X3, 7 ); /* ( x4, x0, xc, 7 ) */ \ + X2 = ARX( X2, X1, X0, 9 ); /* ( x8, x4, x0, 9 ) */ \ + X3 = ARX( X3, X2, X1, 13 ); /* ( xc, x8, x4, 13 ) */ \ + X0 = ARX( X0, X3, X2, 18 ); /* ( x0, xc, x8, 18 ) */ \ + /* Rearrange data */ \ + X1 = ROL_1X32( X1 ); \ + X3 = ROR_1X32( X3 ); \ + X2 = SWAP_64( X2 ); \ + /* Operate on rows */ \ + X3 = ARX( X3, X0, X1, 7 ); \ + X2 = ARX( X2, X3, X0, 9 ); \ + X1 = ARX( X1, X2, X3, 13 ); \ + X0 = ARX( X0, X1, X2, 18 ); \ + /* Rearrange data */ \ + X3 = ROL_1X32( X3 ); \ + X1 = ROR_1X32( X1 ); \ + X2 = SWAP_64( X2 ); \ + +// Final round optimization, don't rearange data back to original order on exit +// Used only on pre-AVX2 CPUs where blend instruction is not avaiable. +// It saves a few redundant shuffles. +#define SALSA_2ROUNDS_FINAL_SIMD128( X0, X1, X2, X3 ) \ + /* Operate on columns */ \ + X1 = ARX( X1, X0, X3, 7 ); /* ( x4, x0, xc, 7 ) */ \ + X2 = ARX( X2, X1, X0, 9 ); /* ( x8, x4, x0, 9 ) */ \ + X3 = ARX( X3, X2, X1, 13 ); /* ( xc, x8, x4, 13 ) */ \ + X0 = ARX( X0, X3, X2, 18 ); /* ( x0, xc, x8, 18 ) */ \ + /* Rearrange data */ \ + X1 = ROL_1X32( X1 ); \ + X3 = ROR_1X32( X3 ); \ + X2 = SWAP_64( X2 ); \ + /* Operate on rows */ \ + X3 = ARX( X3, X0, X1, 7 ); \ + X2 = ARX( X2, X3, X0, 9 ); \ + X1 = ARX( X1, X2, X3, 13 ); \ + X0 = ARX( X0, X1, X2, 18 ); \ + /* Final round, don't rearrange data + X1 = ROR_1X32( X1 ); \ + X2 = SWAP_64( X2 ); \ + X3 = ROL_1X32( X3 ); */ + +// Implied args ( XA0, XA1, XA2, XA3, XB0, XB1, XB2, XB3 ) +#define SALSA_2ROUNDS_SIMD128_2BUF \ + ARX_2BUF( XA1, XA0, XA3, XB1, XB0, XB3, 7 ); \ + ARX_2BUF( XA2, XA1, XA0, XB2, XB1, XB0, 9 ); \ + ARX_2BUF( XA3, XA2, XA1, XB3, XB2, XB1, 13 ); \ + ARX_2BUF( XA0, XA3, XA2, XB0, XB3, XB2, 18 ); \ + XA1 = ROL_1X32( XA1 ); \ + XB1 = ROL_1X32( XB1 ); \ + XA3 = ROR_1X32( XA3 ); \ + XB3 = ROR_1X32( XB3 ); \ + XA2 = SWAP_64( XA2 ); \ + XB2 = SWAP_64( XB2 ); \ + ARX_2BUF( XA3, XA0, XA1, XB3, XB0, XB1, 7 ); \ + ARX_2BUF( XA2, XA3, XA0, XB2, XB3, XB0, 9 ); \ + ARX_2BUF( XA1, XA2, XA3, XB1, XB2, XB3, 13 ); \ + ARX_2BUF( XA0, XA1, XA2, XB0, XB1, XB2, 18 ); \ + XA3 = ROL_1X32( XA3 ); \ + XB3 = ROL_1X32( XB3 ); \ + XA1 = ROR_1X32( XA1 ); \ + XB1 = ROR_1X32( XB1 ); \ + XA2 = SWAP_64( XA2 ); \ + XB2 = SWAP_64( XB2 ); + +// For use when fast bit rotate is not available. +// contains target specif instructions, only use with 128 bit vectrors. 
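+// (Sketch of the substitution, for illustration only: with no native vector
+// rotate, each ROL32( x, n ) is expanded by hand into a shift pair,
+//    t = x << n;   x >>= ( 32 - n );   dest ^= t;   dest ^= x;
+// so every ARX step below costs two shifts and two XORs.)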
+#define SALSA_2ROUNDS_SIMD128_2BUF_SLOROT \ +do{ \ + TYPE TA = ADD32( XA0, XA3 ); \ + TYPE TB = ADD32( XB0, XB3 ); \ + TYPE T = _mm_slli_epi32( TA, 7 ); \ + TA = _mm_srli_epi32( TA, 25 ); \ + XA1 = XOR( XA1, T ); \ + XA1 = XOR( XA1, TA ); \ + T = _mm_slli_epi32( TB, 7 );\ + TB = _mm_srli_epi32( TB, 25 ); \ + XB1 = XOR( XB1, T ); \ + XB1 = XOR( XB1, TB ); \ +\ + TA = ADD32( XA1, XA0 ); \ + TB = ADD32( XB1, XB0 ); \ + T = _mm_slli_epi32( TA, 9 ); \ + TA = _mm_srli_epi32( TA, 23 ); \ + XA2 = XOR( XA2, T ); \ + XA2 = XOR( XA2, TA ); \ + T = _mm_slli_epi32( TB, 9 );\ + TB = _mm_srli_epi32( TB, 23 );\ + XB2 = XOR( XB2, T ); \ + XB2 = XOR( XB2, TB ); \ +\ + TA = ADD32( XA2, XA1 ); \ + TB = ADD32( XB2, XB1 ); \ + T = _mm_slli_epi32( TA, 13); \ + TA = _mm_srli_epi32( TA, 19 ); \ + XA1 = ROL_1X32( XA1 ); \ + XB1 = ROL_1X32( XB1 ); \ + XA3 = XOR( XA3, T ); \ + XA3 = XOR( XA3, TA ); \ + T = _mm_slli_epi32( TB, 13); \ + TB = _mm_srli_epi32( TB, 19 ); \ + XB3 = XOR( XB3, T ); \ + XB3 = XOR( XB3, TB ); \ +\ + TA = ADD32( XA3, XA2 ); \ + TB = ADD32( XB3, XB2 ); \ + T = _mm_slli_epi32( TA, 18 ); \ + TA = _mm_srli_epi32( TA, 14 ); \ + XA2 = SWAP_64( XA2 ); \ + XB2 = SWAP_64( XB2 ); \ + XA0 = XOR( XA0, T ); \ + XA0 = XOR( XA0, TA ); \ + T = _mm_slli_epi32( TB, 18 ); \ + TB = _mm_srli_epi32( TB, 14 ); \ + XB0 = XOR( XB0, T ); \ + XB0 = XOR( XB0, TB ); \ +\ + TA = ADD32( XA0, XA1 ); \ + TB = ADD32( XB0, XB1 ); \ + T = _mm_slli_epi32( TA, 7 ); \ + TA = _mm_srli_epi32( TA, 25 ); \ + XA3 = ROR_1X32( XA3 ); \ + XA3 = XOR( XA3, T ); \ + XA3 = XOR( XA3, TA ); \ + T = _mm_slli_epi32( TB, 7 ); \ + TB = _mm_srli_epi32( TB, 25 ); \ + XB3 = ROR_1X32( XB3 ); \ + XB3 = XOR( XB3, T ); \ + XB3 = XOR( XB3, TB ); \ +\ + TA = ADD32( XA3, XA0 ); \ + TB = ADD32( XB3, XB0 ); \ + T = _mm_slli_epi32( TA, 9 ); \ + TA = _mm_srli_epi32( TA, 23 ); \ + XA2 = XOR( XA2, T ); \ + XA2 = XOR( XA2, TA ); \ + T = _mm_slli_epi32( TB, 9 ); \ + TB = _mm_srli_epi32( TB, 23 ); \ + XB2 = XOR( XB2, T ); \ + XB2 = XOR( XB2, TB ); \ +\ + TA = ADD32( XA2, XA3 ); \ + TB = ADD32( XB2, XB3 ); \ + T = _mm_slli_epi32( TA, 13 ); \ + TA = _mm_srli_epi32( TA, 19 ); \ + XA3 = ROL_1X32( XA3 ); \ + XB3 = ROL_1X32( XB3 ); \ + XA1 = XOR( XA1, T ); \ + XA1 = XOR( XA1, TA ); \ + T = _mm_slli_epi32( TB, 13 ); \ + TB = _mm_srli_epi32( TB, 19 ); \ + XB1 = XOR( XB1, T ); \ + XB1 = XOR( XB1, TB ); \ +\ + TA = ADD32( XA1, XA2 ); \ + TB = ADD32( XB1, XB2 ); \ + T = _mm_slli_epi32( TA, 18 ); \ + TA = _mm_srli_epi32( TA, 14 ); \ + XA2 = SWAP_64( XA2 ); \ + XB2 = SWAP_64( XB2 ); \ + XA0 = XOR( XA0, T ); \ + XA0 = XOR( XA0, TA ); \ + T = _mm_slli_epi32( TB, 18 ); \ + TB = _mm_srli_epi32( TB, 14 ); \ + XA1 = ROR_1X32( XA1 ); \ + XB0 = XOR( XB0, T ); \ + XB0 = XOR( XB0, TB ); \ + XB1 = ROR_1X32( XB1 ); \ +} while (0); + +#define SALSA_2ROUNDS_FINAL_SIMD128_2BUF \ + ARX_2BUF( XA1, XA0, XA3, XB1, XB0, XB3, 7 ); \ + ARX_2BUF( XA2, XA1, XA0, XB2, XB1, XB0, 9 ); \ + ARX_2BUF( XA3, XA2, XA1, XB3, XB2, XB1, 13 ); \ + ARX_2BUF( XA0, XA3, XA2, XB0, XB3, XB2, 18 ); \ + XA1 = ROL_1X32( XA1 ); \ + XB1 = ROL_1X32( XB1 ); \ + XA3 = ROR_1X32( XA3 ); \ + XB3 = ROR_1X32( XB3 ); \ + XA2 = SWAP_64( XA2 ); \ + XB2 = SWAP_64( XB2 ); \ + ARX_2BUF( XA3, XA0, XA1, XB3, XB0, XB1, 7 ); \ + ARX_2BUF( XA2, XA3, XA0, XB2, XB3, XB0, 9 ); \ + ARX_2BUF( XA1, XA2, XA3, XB1, XB2, XB3, 13 ); \ + ARX_2BUF( XA0, XA1, XA2, XB0, XB1, XB2, 18 ); + + +// Inlined ARX +#define SALSA_2ROUNDS_SIMD128_3BUF \ +do{ \ + TYPE TA = ADD32( XA0, XA3 ); \ + TYPE TB = ADD32( XB0, XB3 ); \ + TYPE TC = ADD32( XC0, XC3 ); \ + TA = ROL32( 
TA, 7 ); \ + TB = ROL32( TB, 7 ); \ + TC = ROL32( TC, 7 ); \ + XA1 = XOR( XA1, TA ); \ + XB1 = XOR( XB1, TB ); \ + XC1 = XOR( XC1, TC ); \ +\ + TA = ADD32( XA1, XA0 ); \ + TB = ADD32( XB1, XB0 ); \ + TC = ADD32( XC1, XC0 ); \ + TA = ROL32( TA, 9 ); \ + TB = ROL32( TB, 9 ); \ + TC = ROL32( TC, 9 ); \ + XA2 = XOR( XA2, TA ); \ + XB2 = XOR( XB2, TB ); \ + XC2 = XOR( XC2, TC ); \ +\ + TA = ADD32( XA2, XA1 ); \ + XA1 = ROL_1X32( XA1 ); \ + TB = ADD32( XB2, XB1 ); \ + TC = ADD32( XC2, XC1 ); \ + XB1 = ROL_1X32( XB1 ); \ + TA = ROL32( TA, 13 ); \ + XA3 = XOR( XA3, TA ); \ + XC1 = ROL_1X32( XC1 ); \ + TB = ROL32( TB, 13 ); \ + XB3 = XOR( XB3, TB ); \ + TC = ROL32( TC, 13 ); \ + XC3 = XOR( XC3, TC ); \ +\ + TA = ADD32( XA3, XA2 ); \ + XA2 = SWAP_64( XA2 ); \ + TB = ADD32( XB3, XB2 ); \ + TC = ADD32( XC3, XC2 ); \ + TA = ROL32( TA, 18 ); \ + XB2 = SWAP_64( XB2 ); \ + XA0 = XOR( XA0, TA ); \ + TB = ROL32( TB, 18 ); \ + XB0 = XOR( XB0, TB ); \ + XC2 = SWAP_64( XC2 ); \ + TC = ROL32( TC, 18 ); \ + XC0 = XOR( XC0, TC ); \ +\ + TA = ADD32( XA0, XA1 ); \ + XA3 = ROR_1X32( XA3 ); \ + TB = ADD32( XB0, XB1 ); \ + TC = ADD32( XC0, XC1 ); \ + TA = ROL32( TA, 7 ); \ + XB3 = ROR_1X32( XB3 ); \ + XA3 = XOR( XA3, TA ); \ + TB = ROL32( TB, 7 ); \ + XC3 = ROR_1X32( XC3 ); \ + XB3 = XOR( XB3, TB ); \ + TC = ROL32( TC, 7 ); \ + XC3 = XOR( XC3, TC ); \ +\ + TA = ADD32( XA3, XA0 ); \ + TB = ADD32( XB3, XB0 ); \ + TC = ADD32( XC3, XC0 ); \ + TA = ROL32( TA, 9 ); \ + TB = ROL32( TB, 9 ); \ + TC = ROL32( TC, 9 ); \ + XA2 = XOR( XA2, TA ); \ + XB2 = XOR( XB2, TB ); \ + XC2 = XOR( XC2, TC ); \ +\ + TA = ADD32( XA2, XA3 ); \ + TB = ADD32( XB2, XB3 ); \ + TA = ROL32( TA, 13 ); \ + TC = ADD32( XC2, XC3 ); \ + XA3 = ROL_1X32( XA3 ); \ + TB = ROL32( TB, 13 ); \ + XB3 = ROL_1X32( XB3 ); \ + XA1 = XOR( XA1, TA ); \ + TC = ROL32( TC, 13 ); \ + XC3 = ROL_1X32( XC3 ); \ + XB1 = XOR( XB1, TB ); \ + XC1 = XOR( XC1, TC ); \ +\ + TA = ADD32( XA1, XA2 ); \ + XA2 = SWAP_64( XA2 ); \ + TB = ADD32( XB1, XB2 ); \ + XB2 = SWAP_64( XB2 ); \ + TA = ROL32( TA, 18); \ + TC = ADD32( XC1, XC2 ); \ + XC2 = SWAP_64( XC2 ); \ + TB = ROL32( TB, 18); \ + XA0 = XOR( XA0, TA ); \ + XA1 = ROR_1X32( XA1 ); \ + TC = ROL32( TC, 18); \ + XB0 = XOR( XB0, TB ); \ + XB1 = ROR_1X32( XB1 ); \ + XC0 = XOR( XC0, TC ); \ + XC1 = ROR_1X32( XC1 ); \ +} while (0); + + +// slow rot, an attempt to optimze non-avx512 bit rotations +// Contains target specific instructions, only for use with 128 bit vectors +#define SALSA_2ROUNDS_SIMD128_3BUF_SLOROT \ +do{ \ + TYPE TA = ADD32( XA0, XA3 ); \ + TYPE TB = ADD32( XB0, XB3 ); \ + TYPE TC = ADD32( XC0, XC3 ); \ + TYPE T = _mm_slli_epi32( TA, 7 ); \ + TA = _mm_srli_epi32( TA, 25 ); \ + XA1 = XOR( XA1, T ); \ + XA1 = XOR( XA1, TA ); \ + T = _mm_slli_epi32( TB, 7 );\ + TB = _mm_srli_epi32( TB, 25 ); \ + XB1 = XOR( XB1, T ); \ + XB1 = XOR( XB1, TB ); \ + T = _mm_slli_epi32( TC, 7 );\ + TC = _mm_srli_epi32( TC, 25 );\ + XC1 = XOR( XC1, T ); \ + XC1 = XOR( XC1, TC ); \ +\ + TA = ADD32( XA1, XA0 ); \ + TB = ADD32( XB1, XB0 ); \ + TC = ADD32( XC1, XC0 ); \ + T = _mm_slli_epi32( TA, 9 ); \ + TA = _mm_srli_epi32( TA, 23 ); \ + XA2 = XOR( XA2, T ); \ + XA2 = XOR( XA2, TA ); \ + T = _mm_slli_epi32( TB, 9 );\ + TB = _mm_srli_epi32( TB, 23 );\ + XB2 = XOR( XB2, T ); \ + XB2 = XOR( XB2, TB ); \ + T = _mm_slli_epi32( TC, 9 );\ + TC = _mm_srli_epi32( TC, 23 );\ + XC2 = XOR( XC2, T ); \ + XC2 = XOR( XC2, TC ); \ +\ + TA = ADD32( XA2, XA1 ); \ + TB = ADD32( XB2, XB1 ); \ + TC = ADD32( XC2, XC1 ); \ + T = _mm_slli_epi32( TA, 13); \ + TA = 
_mm_srli_epi32( TA, 19 ); \ + XA1 = ROL_1X32( XA1 ); \ + XB1 = ROL_1X32( XB1 ); \ + XC1 = ROL_1X32( XC1 ); \ + XA3 = XOR( XA3, T ); \ + XA3 = XOR( XA3, TA ); \ + T = _mm_slli_epi32( TB, 13); \ + TB = _mm_srli_epi32( TB, 19 ); \ + XB3 = XOR( XB3, T ); \ + XB3 = XOR( XB3, TB ); \ + T = _mm_slli_epi32( TC, 13); \ + TC = _mm_srli_epi32( TC, 19 ); \ + XC3 = XOR( XC3, T ); \ + XC3 = XOR( XC3, TC ); \ +\ + TA = ADD32( XA3, XA2 ); \ + TB = ADD32( XB3, XB2 ); \ + TC = ADD32( XC3, XC2 ); \ + T = _mm_slli_epi32( TA, 18 ); \ + TA = _mm_srli_epi32( TA, 14 ); \ + XA2 = SWAP_64( XA2 ); \ + XB2 = SWAP_64( XB2 ); \ + XC2 = SWAP_64( XC2 ); \ + XA0 = XOR( XA0, T ); \ + XA0 = XOR( XA0, TA ); \ + T = _mm_slli_epi32( TB, 18 ); \ + TB = _mm_srli_epi32( TB, 14 ); \ + XB0 = XOR( XB0, T ); \ + XB0 = XOR( XB0, TB ); \ + T = _mm_slli_epi32( TC, 18 ); \ + TC = _mm_srli_epi32( TC, 14 ); \ + XC0 = XOR( XC0, T ); \ + XC0 = XOR( XC0, TC ); \ +\ + TA = ADD32( XA0, XA1 ); \ + TB = ADD32( XB0, XB1 ); \ + TC = ADD32( XC0, XC1 ); \ + T = _mm_slli_epi32( TA, 7 ); \ + TA = _mm_srli_epi32( TA, 25 ); \ + XA3 = ROR_1X32( XA3 ); \ + XA3 = XOR( XA3, T ); \ + XA3 = XOR( XA3, TA ); \ + T = _mm_slli_epi32( TB, 7 ); \ + TB = _mm_srli_epi32( TB, 25 ); \ + XB3 = ROR_1X32( XB3 ); \ + XB3 = XOR( XB3, T ); \ + XB3 = XOR( XB3, TB ); \ + T = _mm_slli_epi32( TC, 7 ); \ + TC = _mm_srli_epi32( TC, 25 ); \ + XC3 = ROR_1X32( XC3 ); \ + XC3 = XOR( XC3, T ); \ + XC3 = XOR( XC3, TC ); \ +\ + TA = ADD32( XA3, XA0 ); \ + TB = ADD32( XB3, XB0 ); \ + TC = ADD32( XC3, XC0 ); \ + T = _mm_slli_epi32( TA, 9 ); \ + TA = _mm_srli_epi32( TA, 23 ); \ + XA2 = XOR( XA2, T ); \ + XA2 = XOR( XA2, TA ); \ + T = _mm_slli_epi32( TB, 9 ); \ + TB = _mm_srli_epi32( TB, 23 ); \ + XB2 = XOR( XB2, T ); \ + XB2 = XOR( XB2, TB ); \ + T = _mm_slli_epi32( TC, 9 ); \ + TC = _mm_srli_epi32( TC, 23 ); \ + XC2 = XOR( XC2, T ); \ + XC2 = XOR( XC2, TC ); \ +\ + TA = ADD32( XA2, XA3 ); \ + TB = ADD32( XB2, XB3 ); \ + TC = ADD32( XC2, XC3 ); \ + T = _mm_slli_epi32( TA, 13 ); \ + TA = _mm_srli_epi32( TA, 19 ); \ + XA3 = ROL_1X32( XA3 ); \ + XB3 = ROL_1X32( XB3 ); \ + XC3 = ROL_1X32( XC3 ); \ + XA1 = XOR( XA1, T ); \ + XA1 = XOR( XA1, TA ); \ + T = _mm_slli_epi32( TB, 13 ); \ + TB = _mm_srli_epi32( TB, 19 ); \ + XB1 = XOR( XB1, T ); \ + XB1 = XOR( XB1, TB ); \ + T = _mm_slli_epi32( TC, 13 ); \ + TC = _mm_srli_epi32( TC, 19 ); \ + XC1 = XOR( XC1, T ); \ + XC1 = XOR( XC1, TC ); \ +\ + TA = ADD32( XA1, XA2 ); \ + TB = ADD32( XB1, XB2 ); \ + TC = ADD32( XC1, XC2 ); \ + T = _mm_slli_epi32( TA, 18 ); \ + TA = _mm_srli_epi32( TA, 14 ); \ + XA2 = SWAP_64( XA2 ); \ + XB2 = SWAP_64( XB2 ); \ + XA0 = XOR( XA0, T ); \ + XA0 = XOR( XA0, TA ); \ + T = _mm_slli_epi32( TB, 18 ); \ + TB = _mm_srli_epi32( TB, 14 ); \ + XC2 = SWAP_64( XC2 ); \ + XA1 = ROR_1X32( XA1 ); \ + XB0 = XOR( XB0, T ); \ + XB0 = XOR( XB0, TB ); \ + T = _mm_slli_epi32( TC, 18 ); \ + TC = _mm_srli_epi32( TC, 14 ); \ + XB1 = ROR_1X32( XB1 ); \ + XC1 = ROR_1X32( XC1 ); \ + XC0 = XOR( XC0, T ); \ + XC0 = XOR( XC0, TC ); \ +} while (0); + + +/* +// Standard version using ARX +#define SALSA_2ROUNDS_SIMD128_3BUF \ + ARX_3BUF( XA1, XA0, XA3, XB1, XB0, XB3, \ + XC1, XC0, XC3, 7 ); \ + ARX_3BUF( XA2, XA1, XA0, XB2, XB1, XB0, \ + XC2, XC1, XC0, 9 ); \ + ARX_3BUF( XA3, XA2, XA1, XB3, XB2, XB1, \ + XC3, XC2, XC1, 13 ); \ + ARX_3BUF( XA0, XA3, XA2, XB0, XB3, XB2, \ + XC0, XC3, XC2, 18 ); \ + XA1 = ROL_1X32( XA1 ); \ + XB1 = ROL_1X32( XB1 ); \ + XC1 = ROL_1X32( XC1 ); \ + XA3 = ROR_1X32( XA3 ); \ + XB3 = ROR_1X32( XB3 ); \ + XC3 = ROR_1X32( XC3 ); \ + 
XA2 = SWAP_64( XA2 ); \ + XB2 = SWAP_64( XB2 ); \ + XC2 = SWAP_64( XC2 ); \ + ARX_3BUF( XA3, XA0, XA1, XB3, XB0, XB1, \ + XC3, XC0, XC1, 7 ); \ + ARX_3BUF( XA2, XA3, XA0, XB2, XB3, XB0, \ + XC2, XC3, XC0, 9 ); \ + ARX_3BUF( XA1, XA2, XA3, XB1, XB2, XB3, \ + XC1, XC2, XC3, 13 ); \ + ARX_3BUF( XA0, XA1, XA2, XB0, XB1, XB2, \ + XC0, XC1, XC2, 18 ); \ + XA3 = ROL_1X32( XA3 ); \ + XB3 = ROL_1X32( XB3 ); \ + XC3 = ROL_1X32( XC3 ); \ + XA1 = ROR_1X32( XA1 ); \ + XB1 = ROR_1X32( XB1 ); \ + XC1 = ROR_1X32( XC1 ); \ + XA2 = SWAP_64( XA2 ); \ + XB2 = SWAP_64( XB2 ); \ + XC2 = SWAP_64( XC2 ); +*/ + +#define SALSA_2ROUNDS_FINAL_SIMD128_3BUF \ + ARX_3BUF( XA1, XA0, XA3, XB1, XB0, XB3, \ + XC1, XC0, XC3, 7 ); \ + ARX_3BUF( XA2, XA1, XA0, XB2, XB1, XB0, \ + XC2, XC1, XC0, 9 ); \ + ARX_3BUF( XA3, XA2, XA1, XB3, XB2, XB1, \ + XC3, XC2, XC1, 13 ); \ + ARX_3BUF( XA0, XA3, XA2, XB0, XB3, XB2, \ + XC0, XC3, XC2, 18 ); \ + XA1 = ROL_1X32( XA1 ); \ + XB1 = ROL_1X32( XB1 ); \ + XC1 = ROL_1X32( XC1 ); \ + XA3 = ROR_1X32( XA3 ); \ + XB3 = ROR_1X32( XB3 ); \ + XC3 = ROR_1X32( XC3 ); \ + XA2 = SWAP_64( XA2 ); \ + XB2 = SWAP_64( XB2 ); \ + XC2 = SWAP_64( XC2 ); \ + ARX_3BUF( XA3, XA0, XA1, XB3, XB0, XB1, \ + XC3, XC0, XC1, 7 ); \ + ARX_3BUF( XA2, XA3, XA0, XB2, XB3, XB0, \ + XC2, XC3, XC0, 9 ); \ + ARX_3BUF( XA1, XA2, XA3, XB1, XB2, XB3, \ + XC1, XC2, XC3, 13 ); \ + ARX_3BUF( XA0, XA1, XA2, XB0, XB1, XB2, \ + XC0, XC1, XC2, 18 ); + + +#define SALSA_8ROUNDS_SIMD128 \ + SALSA_2ROUNDS_SIMD128( X0, X1, X2, X3 ); \ + SALSA_2ROUNDS_SIMD128( X0, X1, X2, X3 ); \ + SALSA_2ROUNDS_SIMD128( X0, X1, X2, X3 ); \ + SALSA_2ROUNDS_SIMD128( X0, X1, X2, X3 ); + +#define SALSA_8ROUNDS_FINAL_SIMD128 \ + SALSA_2ROUNDS_SIMD128( X0, X1, X2, X3 ); \ + SALSA_2ROUNDS_SIMD128( X0, X1, X2, X3 ); \ + SALSA_2ROUNDS_SIMD128( X0, X1, X2, X3 ); \ + SALSA_2ROUNDS_FINAL_SIMD128( X0, X1, X2, X3 ); + +// Implied args ( XA0, XA1, XA2, XA3, XB0, XB1, XB2, XB3 ) +#define SALSA_8ROUNDS_SIMD128_2BUF \ + SALSA_2ROUNDS_SIMD128_2BUF; \ + SALSA_2ROUNDS_SIMD128_2BUF; \ + SALSA_2ROUNDS_SIMD128_2BUF; \ + SALSA_2ROUNDS_SIMD128_2BUF; + +#define SALSA_8ROUNDS_SIMD128_2BUF_SLOROT \ + SALSA_2ROUNDS_SIMD128_2BUF_SLOROT; \ + SALSA_2ROUNDS_SIMD128_2BUF_SLOROT; \ + SALSA_2ROUNDS_SIMD128_2BUF_SLOROT; \ + SALSA_2ROUNDS_SIMD128_2BUF_SLOROT; + +#define SALSA_8ROUNDS_FINAL_SIMD128_2BUF \ + SALSA_2ROUNDS_SIMD128_2BUF; \ + SALSA_2ROUNDS_SIMD128_2BUF; \ + SALSA_2ROUNDS_SIMD128_2BUF; \ + SALSA_2ROUNDS_FINAL_SIMD128_2BUF; + +#define SALSA_8ROUNDS_SIMD128_3BUF \ + SALSA_2ROUNDS_SIMD128_3BUF; \ + SALSA_2ROUNDS_SIMD128_3BUF; \ + SALSA_2ROUNDS_SIMD128_3BUF; \ + SALSA_2ROUNDS_SIMD128_3BUF; + +#define SALSA_8ROUNDS_SIMD128_3BUF_SLOROT \ + SALSA_2ROUNDS_SIMD128_3BUF_SLOROT; \ + SALSA_2ROUNDS_SIMD128_3BUF_SLOROT; \ + SALSA_2ROUNDS_SIMD128_3BUF_SLOROT; \ + SALSA_2ROUNDS_SIMD128_3BUF_SLOROT; + +#define SALSA_8ROUNDS_FINAL_SIMD128_3BUF \ + SALSA_2ROUNDS_SIMD128_3BUF; \ + SALSA_2ROUNDS_SIMD128_3BUF; \ + SALSA_2ROUNDS_SIMD128_3BUF; \ + SALSA_2ROUNDS_FINAL_SIMD128_3BUF; + +// Implied args ( XA0, XA1, XA2, XA3, XB0, XB1, XB2, XB3, +// XC0, XC1, XC2, XC3, XD0, XD1, XD2, XD3, ) +#define SALSA_8ROUNDS_SIMD128_4BUF \ + SALSA_2ROUNDS_SIMD128_4BUF; \ + SALSA_2ROUNDS_SIMD128_4BUF; \ + SALSA_2ROUNDS_SIMD128_4BUF; \ + SALSA_2ROUNDS_SIMD128_4BUF; + +#define SALSA_8ROUNDS_FINAL_SIMD128_4BUF \ + SALSA_2ROUNDS_SIMD128_4BUF; \ + SALSA_2ROUNDS_SIMD128_4BUF; \ + SALSA_2ROUNDS_SIMD128_4BUF; \ + SALSA_2ROUNDS_FINAL_SIMD128_4BUF; + +// Used by reference code and pure parallel implementations +// +// Implied arguments ( 
x0, x1, x2, x3, x4, x5, x6, x7, +// x8, x9, xa, xb, xc, xd, xe, xf ) +// +#define SALSA_COLUMN \ + x4 = ARX( x4, x0, xc, 7 ); \ + x9 = ARX( x9, x5, x1, 7 ); \ + xe = ARX( xe, xa, x6, 7 ); \ + x3 = ARX( x3, xf, xb, 7 ); \ + x8 = ARX( x8, x4, x0, 9 ); \ + xd = ARX( xd, x9, x5, 9 ); \ + x2 = ARX( x2, xe, xa, 9 ); \ + x7 = ARX( x7, x3, xf, 9 ); \ + xc = ARX( xc, x8, x4, 13 ); \ + x1 = ARX( x1, xd, x9, 13 ); \ + x6 = ARX( x6, x2, xe, 13 ); \ + xb = ARX( xb, x7, x3, 13 ); \ + x0 = ARX( x0, xc, x8, 18 ); \ + x5 = ARX( x5, x1, xd, 18 ); \ + xa = ARX( xa, x6, x2, 18 ); \ + xf = ARX( xf, xb, x7, 18 ) + +#define SALSA_ROW \ + x1 = ARX( x1, x0, x3, 7 ); \ + x6 = ARX( x6, x5, x4, 7 ); \ + xb = ARX( xb, xa, x9, 7 ); \ + xc = ARX( xc, xf, xe, 7 ); \ + x2 = ARX( x2, x1, x0, 9 ); \ + x7 = ARX( x7, x6, x5, 9 ); \ + x8 = ARX( x8, xb, xa, 9 ); \ + xd = ARX( xd, xc, xf, 9 ); \ + x3 = ARX( x3, x2, x1, 13 ); \ + x4 = ARX( x4, x7, x6, 13 ); \ + x9 = ARX( x9, x8, xb, 13 ); \ + xe = ARX( xe, xd, xc, 13 ); \ + x0 = ARX( x0, x3, x2, 18 ); \ + x5 = ARX( x5, x4, x7, 18 ); \ + xa = ARX( xa, x9, x8, 18 ); \ + xf = ARX( xf, xe, xd, 18 ); + +#define SALSA_2ROUNDS SALSA_COLUMN; SALSA_ROW; + +#define SALSA_8ROUNDS \ + SALSA_2ROUNDS; SALSA_2ROUNDS; SALSA_2ROUNDS; SALSA_2ROUNDS; + + +#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) + +// Tested OK but very slow +// 16 way parallel, requires 16x32 interleaving +static void xor_salsa8_16way( __m512i * const B, const __m512i * const C) +{ + __m512i x0 = B[ 0] = _mm512_xor_si512( B[ 0], C[ 0] ); + __m512i x1 = B[ 1] = _mm512_xor_si512( B[ 1], C[ 1] ); + __m512i x2 = B[ 2] = _mm512_xor_si512( B[ 2], C[ 2] ); + __m512i x3 = B[ 3] = _mm512_xor_si512( B[ 3], C[ 3] ); + __m512i x4 = B[ 4] = _mm512_xor_si512( B[ 4], C[ 4] ); + __m512i x5 = B[ 5] = _mm512_xor_si512( B[ 5], C[ 5] ); + __m512i x6 = B[ 6] = _mm512_xor_si512( B[ 6], C[ 6] ); + __m512i x7 = B[ 7] = _mm512_xor_si512( B[ 7], C[ 7] ); + __m512i x8 = B[ 8] = _mm512_xor_si512( B[ 8], C[ 8] ); + __m512i x9 = B[ 9] = _mm512_xor_si512( B[ 9], C[ 9] ); + __m512i xa = B[10] = _mm512_xor_si512( B[10], C[10] ); + __m512i xb = B[11] = _mm512_xor_si512( B[11], C[11] ); + __m512i xc = B[12] = _mm512_xor_si512( B[12], C[12] ); + __m512i xd = B[13] = _mm512_xor_si512( B[13], C[13] ); + __m512i xe = B[14] = _mm512_xor_si512( B[14], C[14] ); + __m512i xf = B[15] = _mm512_xor_si512( B[15], C[15] ); + + #define ROL32 _mm512_rol_epi32 + #define ADD32 _mm512_add_epi32 + #define XOR _mm512_xor_si512 + + SALSA_8ROUNDS; + + #undef ROL32 + #undef ADD32 + #undef XOR + + B[ 0] = _mm512_add_epi32( B[ 0], x0 ); + B[ 1] = _mm512_add_epi32( B[ 1], x1 ); + B[ 2] = _mm512_add_epi32( B[ 2], x2 ); + B[ 3] = _mm512_add_epi32( B[ 3], x3 ); + B[ 4] = _mm512_add_epi32( B[ 4], x4 ); + B[ 5] = _mm512_add_epi32( B[ 5], x5 ); + B[ 6] = _mm512_add_epi32( B[ 6], x6 ); + B[ 7] = _mm512_add_epi32( B[ 7], x7 ); + B[ 8] = _mm512_add_epi32( B[ 8], x8 ); + B[ 9] = _mm512_add_epi32( B[ 9], x9 ); + B[10] = _mm512_add_epi32( B[10], xa ); + B[11] = _mm512_add_epi32( B[11], xb ); + B[12] = _mm512_add_epi32( B[12], xc ); + B[13] = _mm512_add_epi32( B[13], xd ); + B[14] = _mm512_add_epi32( B[14], xe ); + B[15] = _mm512_add_epi32( B[15], xf ); +} + +void scrypt_core_16way( __m512i *X, __m512i *V, const uint32_t N ) +{ + for ( int n = 0; n < N; n++ ) + { + memcpy( &V[n * 32], X, 128*16 ); + xor_salsa8_16way( &X[ 0], &X[16] ); + xor_salsa8_16way( &X[16], &X[ 0] ); + } + for ( int n = 0; n < N; n++ ) + { + m512_ovly *vptr[16]; // pointer 
to V offset for each lane + m512_ovly *x16 = (m512_ovly*)(&X[16]); + + // create pointers to V for each lane using data from each lane of X[16] + // as index. + for ( int l = 0; l < 16; l++ ) + { + uint32_t xl = (*x16).u32[l]; + vptr[l] = (m512_ovly*)( &V[ 32 * ( xl & ( N-1 ) ) ] ); + } + + for ( int i = 0; i < 32; i++ ) + { + m512_ovly v; // V value assembled from different indexes + for ( int l = 0; l < 8; l++ ) + v.u32[l] = ( *(vptr[l] +i ) ) .u32[l]; + X[i] = _mm512_xor_si512( X[i], v.m512 ); + } + + xor_salsa8_16way( &X[ 0], &X[16] ); + xor_salsa8_16way( &X[16], &X[ 0] ); + } +} + +// Working, not up to date, needs stream optimization. +// 4x32 interleaving +static void salsa8_simd128_4way( __m128i *b, const __m128i *c ) +{ + __m512i X0, X1, X2, X3, Y0, Y1, Y2, Y3; + __m512i *B = (__m512i*)b; + const __m512i *C = (const __m512i*)c; + + // mix C into B then shuffle B into X + B[0] = _mm512_xor_si512( B[0], C[0] ); + B[1] = _mm512_xor_si512( B[1], C[1] ); + B[2] = _mm512_xor_si512( B[2], C[2] ); + B[3] = _mm512_xor_si512( B[3], C[3] ); + + Y0 = _mm512_mask_blend_epi64( 0x03, B[1], B[0] ); + X0 = _mm512_mask_blend_epi64( 0x30, B[3], B[2] ); + X0 = _mm512_mask_blend_epi64( 0x0f, X0, Y0 ); + + Y0 = _mm512_mask_blend_epi64( 0x03, B[2], B[1] ); + X1 = _mm512_mask_blend_epi64( 0x30, B[0], B[3] ); + X1 = _mm512_mask_blend_epi64( 0x0f, X1, Y0 ); + + Y0 = _mm512_mask_blend_epi64( 0x03, B[3], B[2] ); + X2 = _mm512_mask_blend_epi64( 0x30, B[1], B[0] ); + X2 = _mm512_mask_blend_epi64( 0x0f, X2, Y0 ); + + Y0 = _mm512_mask_blend_epi64( 0x03, B[0], B[3] ); + X3 = _mm512_mask_blend_epi64( 0x30, B[2], B[1] ); + X3 = _mm512_mask_blend_epi64( 0x0f, X3, Y0 ); + + // define targets for macros used in round function template + #define ROL_1X32 mm512_shufll_128 + #define ROR_1X32 mm512_shuflr_128 + #define SWAP_64 mm512_swap_256 + #define ROL32 _mm512_rol_epi32 + #define ADD32 _mm512_add_epi32 + #define XOR _mm512_xor_si512 + + SALSA_8ROUNDS_SIMD128; + + #undef ROL_1X32 + #undef ROR_1X32 + #undef SWAP_64 + #undef ROL32 + #undef ADD32 + #undef XOR + + Y0 = _mm512_mask_blend_epi64( 0xc0, X0, X1 ); + Y1 = _mm512_mask_blend_epi64( 0x03, X0, X1 ); + Y2 = _mm512_mask_blend_epi64( 0x0c, X0, X1 ); + Y3 = _mm512_mask_blend_epi64( 0x30, X0, X1 ); + + Y0 = _mm512_mask_blend_epi64( 0x30, Y0, X2 ); + Y1 = _mm512_mask_blend_epi64( 0xc0, Y1, X2 ); + Y2 = _mm512_mask_blend_epi64( 0x03, Y2, X2 ); + Y3 = _mm512_mask_blend_epi64( 0x0c, Y3, X2 ); + + Y0 = _mm512_mask_blend_epi64( 0x0c, Y0, X3 ); + Y1 = _mm512_mask_blend_epi64( 0x30, Y1, X3 ); + Y2 = _mm512_mask_blend_epi64( 0xc0, Y2, X3 ); + Y3 = _mm512_mask_blend_epi64( 0x03, Y3, X3 ); + + B[0] = _mm512_add_epi32( B[0], Y0 ); + B[1] = _mm512_add_epi32( B[1], Y1 ); + B[2] = _mm512_add_epi32( B[2], Y2 ); + B[3] = _mm512_add_epi32( B[3], Y3 ); +} + +// data format for 512 bits: 4 * ( 4 way 32 ) +// { l3d3, l2d3, l1d3, l0d3, l3d2, l2d2, l1d2, l0d2, +// l3d1, l2d1, l1d1, l0d1, l3d0, l2d0, l1d0, l0d0 } + +void scrypt_core_simd128_4way( __m128i *X, __m128i *V, const uint32_t N ) +{ + for ( int n = 0; n < N; n++ ) + { + memcpy( &V[n * 32], X, 4*128 ); + salsa8_simd128_4way( &X[ 0], &X[16] ); + salsa8_simd128_4way( &X[16], &X[ 0] ); + } + + for ( int n = 0; n < N; n++ ) + { + uint32_t x16[4]; // index into V for each lane + memcpy( x16, &X[16], 16 ); + x16[0] = 32 * ( x16[0] & ( N-1) ); + x16[1] = 32 * ( x16[1] & ( N-1) ); + x16[2] = 32 * ( x16[2] & ( N-1) ); + x16[3] = 32 * ( x16[3] & ( N-1) ); + m128_ovly *v = (m128_ovly*)V; + + for( int i = 0; i < 32; i++ ) + { + X[i] = 
_mm_xor_si128( X[i], _mm_set_epi32( v[ x16[3] + i ].u32[3], + v[ x16[2] + i ].u32[2], + v[ x16[1] + i ].u32[1], + v[ x16[0] + i ].u32[0] ) ); + } + + salsa8_simd128_4way( &X[ 0], &X[16] ); + salsa8_simd128_4way( &X[16], &X[ 0] ); + } +} + +// 4x memory usage +// Working +// 4x128 interleaving +static void salsa_shuffle_4way_simd128( __m512i *X ) +{ + __m512i Y0, Y1, Y2, Y3, Z0, Z1, Z2, Z3; + + Y0 = _mm512_mask_blend_epi32( 0x1111, X[1], X[0] ); + Z0 = _mm512_mask_blend_epi32( 0x4444, X[3], X[2] ); + + Y1 = _mm512_mask_blend_epi32( 0x1111, X[2], X[1] ); + Z1 = _mm512_mask_blend_epi32( 0x4444, X[0], X[3] ); + + Y2 = _mm512_mask_blend_epi32( 0x1111, X[3], X[2] ); + Z2 = _mm512_mask_blend_epi32( 0x4444, X[1], X[0] ); + + Y3 = _mm512_mask_blend_epi32( 0x1111, X[0], X[3] ); + Z3 = _mm512_mask_blend_epi32( 0x4444, X[2], X[1] ); + + X[0] = _mm512_mask_blend_epi32( 0x3333, Z0, Y0 ); + X[1] = _mm512_mask_blend_epi32( 0x3333, Z1, Y1 ); + X[2] = _mm512_mask_blend_epi32( 0x3333, Z2, Y2 ); + X[3] = _mm512_mask_blend_epi32( 0x3333, Z3, Y3 ); +} + +static void salsa_unshuffle_4way_simd128( __m512i *X ) +{ + __m512i Y0, Y1, Y2, Y3; + + Y0 = _mm512_mask_blend_epi32( 0x8888, X[0], X[1] ); + Y1 = _mm512_mask_blend_epi32( 0x1111, X[0], X[1] ); + Y2 = _mm512_mask_blend_epi32( 0x2222, X[0], X[1] ); + Y3 = _mm512_mask_blend_epi32( 0x4444, X[0], X[1] ); + + Y0 = _mm512_mask_blend_epi32( 0x4444, Y0, X[2] ); + Y1 = _mm512_mask_blend_epi32( 0x8888, Y1, X[2] ); + Y2 = _mm512_mask_blend_epi32( 0x1111, Y2, X[2] ); + Y3 = _mm512_mask_blend_epi32( 0x2222, Y3, X[2] ); + + X[0] = _mm512_mask_blend_epi32( 0x2222, Y0, X[3] ); + X[1] = _mm512_mask_blend_epi32( 0x4444, Y1, X[3] ); + X[2] = _mm512_mask_blend_epi32( 0x8888, Y2, X[3] ); + X[3] = _mm512_mask_blend_epi32( 0x1111, Y3, X[3] ); +} + +static void salsa8_4way_simd128( __m512i * const B, const __m512i * const C) +{ + __m512i X0, X1, X2, X3; + + X0 = B[0] = _mm512_xor_si512( B[0], C[0] ); + X1 = B[1] = _mm512_xor_si512( B[1], C[1] ); + X2 = B[2] = _mm512_xor_si512( B[2], C[2] ); + X3 = B[3] = _mm512_xor_si512( B[3], C[3] ); + + #define ROL_1X32 mm512_shufll128_32 // shuffle within 128 bit lanes + #define ROR_1X32 mm512_shuflr128_32 + #define SWAP_64 mm512_swap128_64 + #define ROL32 _mm512_rol_epi32 + #define ADD32 _mm512_add_epi32 + #define XOR _mm512_xor_si512 + + SALSA_8ROUNDS_SIMD128; + + #undef ROL_1X32 + #undef ROR_1X32 + #undef SWAP_64 + #undef ROL32 + #undef ADD32 + #undef XOR + + B[0] = _mm512_add_epi32( B[0], X0 ); + B[1] = _mm512_add_epi32( B[1], X1 ); + B[2] = _mm512_add_epi32( B[2], X2 ); + B[3] = _mm512_add_epi32( B[3], X3 ); +} + +void scrypt_core_4way_simd128( __m512i *X, __m512i *V, const uint32_t N ) +{ + salsa_shuffle_4way_simd128( X ); + salsa_shuffle_4way_simd128( X+4 ); + + for ( int n = 0; n < N; n++ ) + { + memcpy( &V[n * 8], X, 128*4 ); + salsa8_4way_simd128( &X[0], &X[4] ); + salsa8_4way_simd128( &X[4], &X[0] ); + } + + for ( int n = 0; n < N; n++ ) + { + m512_ovly x16; + x16 = ( (m512_ovly*)X )[4]; + uint32_t j0 = 8 * ( x16.u32[ 0] & ( N-1 ) ); + uint32_t j1 = 8 * ( x16.u32[ 4] & ( N-1 ) ); + uint32_t j2 = 8 * ( x16.u32[ 8] & ( N-1 ) ); + uint32_t j3 = 8 * ( x16.u32[12] & ( N-1 ) ); + + for ( int i = 0; i < 8; i++ ) + { + __m512i v10 = _mm512_mask_blend_epi32( 0x000f, V[ j1+i ], V[ j0+i ] ); + __m512i v32 = _mm512_mask_blend_epi32( 0x0f00, V[ j3+i ], V[ j2+i ] ); + X[i] = _mm512_xor_si512( X[i], _mm512_mask_blend_epi32( 0x00ff, + v32, v10 ) ); + } + + salsa8_4way_simd128( &X[0], &X[4] ); + salsa8_4way_simd128( &X[4], &X[0] ); + } + + 
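+   // undo the shuffle applied on entry so X is returned in standard
+   // 4x128 interleaved order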
salsa_unshuffle_4way_simd128( X ); + salsa_unshuffle_4way_simd128( X+4 ); +} + +#endif // AVX512 + +#if defined(__AVX2__) + +// 8x memory usage +// Tested OK but slow scrypt, very slow scryptn2, 2x4way is faster +// Crashes with large N & many threads, OOM? Use only for scrypt +// 8x32 interleaving +static void salsa8_8way( __m256i * const B, const __m256i * const C ) +{ + __m256i x0 = B[ 0] = _mm256_xor_si256( B[ 0], C[ 0] ); + __m256i x1 = B[ 1] = _mm256_xor_si256( B[ 1], C[ 1] ); + __m256i x2 = B[ 2] = _mm256_xor_si256( B[ 2], C[ 2] ); + __m256i x3 = B[ 3] = _mm256_xor_si256( B[ 3], C[ 3] ); + __m256i x4 = B[ 4] = _mm256_xor_si256( B[ 4], C[ 4] ); + __m256i x5 = B[ 5] = _mm256_xor_si256( B[ 5], C[ 5] ); + __m256i x6 = B[ 6] = _mm256_xor_si256( B[ 6], C[ 6] ); + __m256i x7 = B[ 7] = _mm256_xor_si256( B[ 7], C[ 7] ); + __m256i x8 = B[ 8] = _mm256_xor_si256( B[ 8], C[ 8] ); + __m256i x9 = B[ 9] = _mm256_xor_si256( B[ 9], C[ 9] ); + __m256i xa = B[10] = _mm256_xor_si256( B[10], C[10] ); + __m256i xb = B[11] = _mm256_xor_si256( B[11], C[11] ); + __m256i xc = B[12] = _mm256_xor_si256( B[12], C[12] ); + __m256i xd = B[13] = _mm256_xor_si256( B[13], C[13] ); + __m256i xe = B[14] = _mm256_xor_si256( B[14], C[14] ); + __m256i xf = B[15] = _mm256_xor_si256( B[15], C[15] ); + + #define ROL32 mm256_rol_32 + #define ADD32 _mm256_add_epi32 + #define XOR _mm256_xor_si256 + + SALSA_8ROUNDS; + + #undef ROL32 + #undef ADD32 + #undef XOR + + B[ 0] = _mm256_add_epi32( B[ 0], x0 ); + B[ 1] = _mm256_add_epi32( B[ 1], x1 ); + B[ 2] = _mm256_add_epi32( B[ 2], x2 ); + B[ 3] = _mm256_add_epi32( B[ 3], x3 ); + B[ 4] = _mm256_add_epi32( B[ 4], x4 ); + B[ 5] = _mm256_add_epi32( B[ 5], x5 ); + B[ 6] = _mm256_add_epi32( B[ 6], x6 ); + B[ 7] = _mm256_add_epi32( B[ 7], x7 ); + B[ 8] = _mm256_add_epi32( B[ 8], x8 ); + B[ 9] = _mm256_add_epi32( B[ 9], x9 ); + B[10] = _mm256_add_epi32( B[10], xa ); + B[11] = _mm256_add_epi32( B[11], xb ); + B[12] = _mm256_add_epi32( B[12], xc ); + B[13] = _mm256_add_epi32( B[13], xd ); + B[14] = _mm256_add_epi32( B[14], xe ); + B[15] = _mm256_add_epi32( B[15], xf ); +} + +void scrypt_core_8way( __m256i *X, __m256i *V, const uint32_t N ) +{ + for ( int n = 0; n < N; n++ ) + { + memcpy( &V[n * 32], X, 128*8 ); + salsa8_8way( &X[ 0], &X[16] ); + salsa8_8way( &X[16], &X[ 0] ); + } + + for ( int n = 0; n < N; n++ ) + { + m256_ovly *vptr[8]; // pointer to V offset for each lane + m256_ovly *x16 = (m256_ovly*)(&X[16]); + + // create pointers to V for each lane using data from each lane of X[16] + // as index. 
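+      // each lane's j = X[16].u32[l] & (N-1) can differ, so V (32 words per
+      // entry) is gathered lane by lane below before the xor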
+ for ( int l = 0; l < 8; l++ ) + { + uint32_t xl = (*x16).u32[l]; + vptr[l] = (m256_ovly*)( &V[ 32 * ( xl & ( N-1 ) ) ] ); + } + + for ( int i = 0; i < 32; i++ ) + { + m256_ovly v; // V value assembled from different indexes + for ( int l = 0; l < 8; l++ ) + v.u32[l] = ( *(vptr[l] +i ) ) .u32[l]; + X[i] = _mm256_xor_si256( X[i], v.m256 ); + } + + salsa8_8way( &X[ 0], &X[16] ); + salsa8_8way( &X[16], &X[ 0] ); + } +} + +// 2x memory usage +// Working +// Essentially Pooler 6way +// 2x128 interleaved simd128 +// ------- lane 1 ------- ------- lane 0 ------- +// { l1x3, l1x2, l1x1, l1x0, l0x3, l0x2, l0x1, l0x0 } b[3] B[ 7: 0] +// { l1x7, l1x6, l1x5, l1x4, l0x7, l0x6, l0x5, l0x4 } b[2] B[15: 8] +// { l1xb, l1xa, l1c9, l1x8, l0xb, l0xa, l0x9, l0x8 } b[1] B[23:16] +// { l1xf, l1xe, l1xd, l1xc, l0xf, l0xe, l0xd, l0xc } b[0] B[31:24] + +static void salsa_shuffle_2way_simd128( __m256i *X ) +{ + __m256i Y0, Y1, Y2, Y3, Z0, Z1, Z2, Z3; + + Y0 = _mm256_blend_epi32( X[1], X[0], 0x11 ); + Z0 = _mm256_blend_epi32( X[3], X[2], 0x44 ); + + Y1 = _mm256_blend_epi32( X[2], X[1], 0x11 ); + Z1 = _mm256_blend_epi32( X[0], X[3], 0x44 ); + + Y2 = _mm256_blend_epi32( X[3], X[2], 0x11 ); + Z2 = _mm256_blend_epi32( X[1], X[0], 0x44 ); + + Y3 = _mm256_blend_epi32( X[0], X[3], 0x11 ); + Z3 = _mm256_blend_epi32( X[2], X[1], 0x44 ); + + X[0] = _mm256_blend_epi32( Z0, Y0, 0x33 ); + X[1] = _mm256_blend_epi32( Z1, Y1, 0x33 ); + X[2] = _mm256_blend_epi32( Z2, Y2, 0x33 ); + X[3] = _mm256_blend_epi32( Z3, Y3, 0x33 ); +} + +static void salsa_unshuffle_2way_simd128( __m256i *X ) +{ + __m256i Y0, Y1, Y2, Y3; + + Y0 = _mm256_blend_epi32( X[0], X[1], 0x88 ); + Y1 = _mm256_blend_epi32( X[0], X[1], 0x11 ); + Y2 = _mm256_blend_epi32( X[0], X[1], 0x22 ); + Y3 = _mm256_blend_epi32( X[0], X[1], 0x44 ); + + Y0 = _mm256_blend_epi32( Y0, X[2], 0x44 ); + Y1 = _mm256_blend_epi32( Y1, X[2], 0x88 ); + Y2 = _mm256_blend_epi32( Y2, X[2], 0x11 ); + Y3 = _mm256_blend_epi32( Y3, X[2], 0x22 ); + + X[0] = _mm256_blend_epi32( Y0, X[3], 0x22 ); + X[1] = _mm256_blend_epi32( Y1, X[3], 0x44 ); + X[2] = _mm256_blend_epi32( Y2, X[3], 0x88 ); + X[3] = _mm256_blend_epi32( Y3, X[3], 0x11 ); +} + +static void salsa8_2way_simd128( __m256i * const B, const __m256i * const C) +{ + __m256i X0, X1, X2, X3; + + X0 = B[0] = _mm256_xor_si256( B[0], C[0] ); + X1 = B[1] = _mm256_xor_si256( B[1], C[1] ); + X2 = B[2] = _mm256_xor_si256( B[2], C[2] ); + X3 = B[3] = _mm256_xor_si256( B[3], C[3] ); + + // define targets for macros used in round function template + #define ROL_1X32 mm256_shufll128_32 // shuffle within 128 bit lanes + #define ROR_1X32 mm256_shuflr128_32 + #define SWAP_64 mm256_swap128_64 + #define ROL32 mm256_rol_32 + #define ADD32 _mm256_add_epi32 + #define XOR _mm256_xor_si256 + + SALSA_8ROUNDS_SIMD128; + + #undef ROL_1X32 + #undef ROR_1X32 + #undef SWAP_64 + #undef ROL32 + #undef ADD32 + #undef XOR + + B[0] = _mm256_add_epi32( B[0], X0 ); + B[1] = _mm256_add_epi32( B[1], X1 ); + B[2] = _mm256_add_epi32( B[2], X2 ); + B[3] = _mm256_add_epi32( B[3], X3 ); +} + +void scrypt_core_2way_simd128( __m256i *X, __m256i *V, const uint32_t N ) +{ + salsa_shuffle_2way_simd128( X ); + salsa_shuffle_2way_simd128( X+4 ); + + for ( int n = 0; n < N; n++ ) + { + memcpy( &V[n * 8], X, 128*2 ); + salsa8_2way_simd128( &X[0], &X[4] ); + salsa8_2way_simd128( &X[4], &X[0] ); + } + + for ( int n = 0; n < N; n++ ) + { + m256_ovly x16; + x16 = ( (m256_ovly*)X )[4]; + uint32_t j0 = 8 * ( x16.u32[0] & ( N-1 ) ); + uint32_t j1 = 8 * ( x16.u32[4] & ( N-1 ) ); + + for ( int i = 0; i < 8; 
i++ ) + X[i] = _mm256_xor_si256( X[i], _mm256_blend_epi32( V[ j1+i ], + V[ j0+i ], 0x0f ) ); + + salsa8_2way_simd128( &X[0], &X[4] ); + salsa8_2way_simd128( &X[4], &X[0] ); + } + + salsa_unshuffle_2way_simd128( X ); + salsa_unshuffle_2way_simd128( X+4 ); +} + +// Working +// 2x128 interleaving +static void salsa8_2way_simd128_2buf( __m256i * const BA, __m256i * const BB, + const __m256i * const CA, const __m256i * const CB ) +{ + __m256i XA0, XA1, XA2, XA3, XB0, XB1, XB2, XB3; + __m256i YA0, YA1, YA2, YA3, YB0, YB1, YB2, YB3; + + // mix C into B then shuffle B into X + BA[0] = _mm256_xor_si256( BA[0], CA[0] ); + BB[0] = _mm256_xor_si256( BB[0], CB[0] ); + BA[1] = _mm256_xor_si256( BA[1], CA[1] ); + BB[1] = _mm256_xor_si256( BB[1], CB[1] ); + BA[2] = _mm256_xor_si256( BA[2], CA[2] ); + BB[2] = _mm256_xor_si256( BB[2], CB[2] ); + BA[3] = _mm256_xor_si256( BA[3], CA[3] ); + BB[3] = _mm256_xor_si256( BB[3], CB[3] ); + + YA0 = _mm256_blend_epi32( BA[1], BA[0], 0x11 ); + YB0 = _mm256_blend_epi32( BB[1], BB[0], 0x11 ); + XA0 = _mm256_blend_epi32( BA[3], BA[2], 0x44 ); + XB0 = _mm256_blend_epi32( BB[3], BB[2], 0x44 ); + XA0 = _mm256_blend_epi32( XA0, YA0, 0x33); + XB0 = _mm256_blend_epi32( XB0, YB0, 0x33); + + YA0 = _mm256_blend_epi32( BA[2], BA[1], 0x11 ); + YB0 = _mm256_blend_epi32( BB[2], BB[1], 0x11 ); + XA1 = _mm256_blend_epi32( BA[0], BA[3], 0x44 ); + XB1 = _mm256_blend_epi32( BB[0], BB[3], 0x44 ); + XA1 = _mm256_blend_epi32( XA1, YA0, 0x33 ); + XB1 = _mm256_blend_epi32( XB1, YB0, 0x33 ); + + YA0 = _mm256_blend_epi32( BA[3], BA[2], 0x11 ); + YB0 = _mm256_blend_epi32( BB[3], BB[2], 0x11 ); + XA2 = _mm256_blend_epi32( BA[1], BA[0], 0x44 ); + XB2 = _mm256_blend_epi32( BB[1], BB[0], 0x44 ); + XA2 = _mm256_blend_epi32( XA2, YA0, 0x33 ); + XB2 = _mm256_blend_epi32( XB2, YB0, 0x33 ); + + YA0 = _mm256_blend_epi32( BA[0], BA[3], 0x11 ); + YB0 = _mm256_blend_epi32( BB[0], BB[3], 0x11 ); + XA3 = _mm256_blend_epi32( BA[2], BA[1], 0x44 ); + XB3 = _mm256_blend_epi32( BB[2], BB[1], 0x44 ); + XA3 = _mm256_blend_epi32( XA3, YA0, 0x33 ); + XB3 = _mm256_blend_epi32( XB3, YB0, 0x33 ); + + // define targets for macros used in round function template + #define ROL_1X32 mm256_shufll128_32 // shuffle within 128 bit lanes + #define ROR_1X32 mm256_shuflr128_32 + #define SWAP_64 mm256_swap128_64 + #define ROL32 mm256_rol_32 + #define ADD32 _mm256_add_epi32 + #define XOR _mm256_xor_si256 + #define TYPE __m256i + + SALSA_8ROUNDS_SIMD128_2BUF; + + #undef ROL_1X32 + #undef ROR_1X32 + #undef SWAP_64 + #undef ROL32 + #undef ADD32 + #undef XOR + #undef TYPE + + YA0 = _mm256_blend_epi32( XA0, XA1, 0x88 ); + YB0 = _mm256_blend_epi32( XB0, XB1, 0x88 ); + YA1 = _mm256_blend_epi32( XA0, XA1, 0x11 ); + YB1 = _mm256_blend_epi32( XB0, XB1, 0x11 ); + YA2 = _mm256_blend_epi32( XA0, XA1, 0x22 ); + YB2 = _mm256_blend_epi32( XB0, XB1, 0x22 ); + YA3 = _mm256_blend_epi32( XA0, XA1, 0x44 ); + YB3 = _mm256_blend_epi32( XB0, XB1, 0x44 ); + + YA0 = _mm256_blend_epi32( YA0, XA2, 0x44 ); + YB0 = _mm256_blend_epi32( YB0, XB2, 0x44 ); + YA1 = _mm256_blend_epi32( YA1, XA2, 0x88 ); + YB1 = _mm256_blend_epi32( YB1, XB2, 0x88 ); + YA2 = _mm256_blend_epi32( YA2, XA2, 0x11 ); + YB2 = _mm256_blend_epi32( YB2, XB2, 0x11 ); + YA3 = _mm256_blend_epi32( YA3, XA2, 0x22 ); + YB3 = _mm256_blend_epi32( YB3, XB2, 0x22 ); + + YA0 = _mm256_blend_epi32( YA0, XA3, 0x22 ); + YB0 = _mm256_blend_epi32( YB0, XB3, 0x22 ); + YA1 = _mm256_blend_epi32( YA1, XA3, 0x44 ); + YB1 = _mm256_blend_epi32( YB1, XB3, 0x44 ); + YA2 = _mm256_blend_epi32( YA2, XA3, 0x88 ); + YB2 = 
_mm256_blend_epi32( YB2, XB3, 0x88 ); + YA3 = _mm256_blend_epi32( YA3, XA3, 0x11 ); + YB3 = _mm256_blend_epi32( YB3, XB3, 0x11 ); + + BA[0] = _mm256_add_epi32( BA[0], YA0 ); + BB[0] = _mm256_add_epi32( BB[0], YB0 ); + BA[1] = _mm256_add_epi32( BA[1], YA1 ); + BB[1] = _mm256_add_epi32( BB[1], YB1 ); + BA[2] = _mm256_add_epi32( BA[2], YA2 ); + BB[2] = _mm256_add_epi32( BB[2], YB2 ); + BA[3] = _mm256_add_epi32( BA[3], YA3 ); + BB[3] = _mm256_add_epi32( BB[3], YB3 ); + +} + +void scrypt_core_2way_simd128_2buf( __m256i *X, __m256i *V, const uint32_t N ) +{ + __m256i *X0 = X; + __m256i *X1 = X + 8; + __m256i *V0 = V; + __m256i *V1 = V + 8*N; + + for ( int n = 0; n < N; n++ ) + { + for ( int i = 0; i < 8; i++ ) + { + _mm256_stream_si256( V0 + n*8 + i, X0[i] ); + _mm256_stream_si256( V1 + n*8 + i, X1[i] ); + } + salsa8_2way_simd128_2buf( &X0[0], &X1[0], &X0[4], &X1[4] ); + salsa8_2way_simd128_2buf( &X0[4], &X1[4], &X0[0], &X1[0] ); + } + for ( int n = 0; n < N; n++ ) + { + const m256_ovly x16a = ( (m256_ovly*)X0 )[4]; + const m256_ovly x16b = ( (m256_ovly*)X1 )[4]; + + const uint32_t j0a = 8 * ( x16a.u32[0] & ( N-1 ) ); + const uint32_t j0b = 8 * ( x16b.u32[0] & ( N-1 ) ); + const uint32_t j1a = 8 * ( x16a.u32[4] & ( N-1 ) ); + const uint32_t j1b = 8 * ( x16b.u32[4] & ( N-1 ) ); + + for ( int i = 0; i < 8; i++ ) + { + const __m256i V0j0a = _mm256_stream_load_si256( V0 + j0a + i ); + const __m256i V0j1a = _mm256_stream_load_si256( V0 + j1a + i ); + const __m256i V1j0b = _mm256_stream_load_si256( V1 + j0b + i ); + const __m256i V1j1b = _mm256_stream_load_si256( V1 + j1b + i ); + X0[i] = _mm256_xor_si256( X0[i], + _mm256_blend_epi32( V0j1a, V0j0a, 0x0f ) ); + X1[i] = _mm256_xor_si256( X1[i], + _mm256_blend_epi32( V1j1b, V1j0b, 0x0f ) ); + } + + salsa8_2way_simd128_2buf( &X0[0], &X1[0], &X0[4], &X1[4] ); + salsa8_2way_simd128_2buf( &X0[4], &X1[4], &X0[0], &X1[0] ); + } +} + +// Triple buffered, not up to date, needs stream optimization +// 2x128 interleaving +static void salsa8_2way_simd128_3buf( __m256i * const BA, __m256i * const BB, + __m256i * const BC, const __m256i * const CA, const __m256i * const CB, + const __m256i * const CC ) +{ + __m256i XA0, XA1, XA2, XA3, XB0, XB1, XB2, XB3, XC0, XC1, XC2, XC3; + __m256i YA0, YA1, YA2, YA3, YB0, YB1, YB2, YB3, YC0, YC1, YC2, YC3; + + // mix C into B then shuffle B into X + BA[0] = _mm256_xor_si256( BA[0], CA[0] ); + BB[0] = _mm256_xor_si256( BB[0], CB[0] ); + BC[0] = _mm256_xor_si256( BC[0], CC[0] ); + BA[1] = _mm256_xor_si256( BA[1], CA[1] ); + BB[1] = _mm256_xor_si256( BB[1], CB[1] ); + BC[1] = _mm256_xor_si256( BC[1], CC[1] ); + BA[2] = _mm256_xor_si256( BA[2], CA[2] ); + BB[2] = _mm256_xor_si256( BB[2], CB[2] ); + BC[2] = _mm256_xor_si256( BC[2], CC[2] ); + BA[3] = _mm256_xor_si256( BA[3], CA[3] ); + BB[3] = _mm256_xor_si256( BB[3], CB[3] ); + BC[3] = _mm256_xor_si256( BC[3], CC[3] ); + + YA0 = _mm256_blend_epi32( BA[1], BA[0], 0x11 ); + YB0 = _mm256_blend_epi32( BB[1], BB[0], 0x11 ); + YC0 = _mm256_blend_epi32( BC[1], BC[0], 0x11 ); + XA0 = _mm256_blend_epi32( BA[3], BA[2], 0x44 ); + XB0 = _mm256_blend_epi32( BB[3], BB[2], 0x44 ); + XC0 = _mm256_blend_epi32( BC[3], BC[2], 0x44 ); + XA0 = _mm256_blend_epi32( XA0, YA0, 0x33); + XB0 = _mm256_blend_epi32( XB0, YB0, 0x33); + XC0 = _mm256_blend_epi32( XC0, YC0, 0x33); + + YA0 = _mm256_blend_epi32( BA[2], BA[1], 0x11 ); + YB0 = _mm256_blend_epi32( BB[2], BB[1], 0x11 ); + YC0 = _mm256_blend_epi32( BC[2], BC[1], 0x11 ); + XA1 = _mm256_blend_epi32( BA[0], BA[3], 0x44 ); + XB1 = _mm256_blend_epi32( BB[0], 
BB[3], 0x44 ); + XC1 = _mm256_blend_epi32( BC[0], BC[3], 0x44 ); + XA1 = _mm256_blend_epi32( XA1, YA0, 0x33 ); + XB1 = _mm256_blend_epi32( XB1, YB0, 0x33 ); + XC1 = _mm256_blend_epi32( XC1, YC0, 0x33 ); + + YA0 = _mm256_blend_epi32( BA[3], BA[2], 0x11 ); + YB0 = _mm256_blend_epi32( BB[3], BB[2], 0x11 ); + YC0 = _mm256_blend_epi32( BC[3], BC[2], 0x11 ); + XA2 = _mm256_blend_epi32( BA[1], BA[0], 0x44 ); + XB2 = _mm256_blend_epi32( BB[1], BB[0], 0x44 ); + XC2 = _mm256_blend_epi32( BC[1], BC[0], 0x44 ); + XA2 = _mm256_blend_epi32( XA2, YA0, 0x33 ); + XB2 = _mm256_blend_epi32( XB2, YB0, 0x33 ); + XC2 = _mm256_blend_epi32( XC2, YC0, 0x33 ); + + YA0 = _mm256_blend_epi32( BA[0], BA[3], 0x11 ); + YB0 = _mm256_blend_epi32( BB[0], BB[3], 0x11 ); + YC0 = _mm256_blend_epi32( BC[0], BC[3], 0x11 ); + XA3 = _mm256_blend_epi32( BA[2], BA[1], 0x44 ); + XB3 = _mm256_blend_epi32( BB[2], BB[1], 0x44 ); + XC3 = _mm256_blend_epi32( BC[2], BC[1], 0x44 ); + XA3 = _mm256_blend_epi32( XA3, YA0, 0x33 ); + XB3 = _mm256_blend_epi32( XB3, YB0, 0x33 ); + XC3 = _mm256_blend_epi32( XC3, YC0, 0x33 ); + + // define targets for macros used in round function template + #define ROL_1X32 mm256_shufll128_32 // shuffle within 128 bit lanes + #define ROR_1X32 mm256_shuflr128_32 + #define SWAP_64 mm256_swap128_64 + #define ROL32 mm256_rol_32 + #define ADD32 _mm256_add_epi32 + #define XOR _mm256_xor_si256 + #define TYPE __m256i + + SALSA_8ROUNDS_SIMD128_3BUF; + + #undef ROL_1X32 + #undef ROR_1X32 + #undef SWAP_64 + #undef ROL32 + #undef ADD32 + #undef XOR + #undef TYPE + + YA0 = _mm256_blend_epi32( XA0, XA1, 0x88 ); + YB0 = _mm256_blend_epi32( XB0, XB1, 0x88 ); + YC0 = _mm256_blend_epi32( XC0, XC1, 0x88 ); + YA1 = _mm256_blend_epi32( XA0, XA1, 0x11 ); + YB1 = _mm256_blend_epi32( XB0, XB1, 0x11 ); + YC1 = _mm256_blend_epi32( XC0, XC1, 0x11 ); + YA2 = _mm256_blend_epi32( XA0, XA1, 0x22 ); + YB2 = _mm256_blend_epi32( XB0, XB1, 0x22 ); + YC2 = _mm256_blend_epi32( XC0, XC1, 0x22 ); + YA3 = _mm256_blend_epi32( XA0, XA1, 0x44 ); + YB3 = _mm256_blend_epi32( XB0, XB1, 0x44 ); + YC3 = _mm256_blend_epi32( XC0, XC1, 0x44 ); + + YA0 = _mm256_blend_epi32( YA0, XA2, 0x44 ); + YB0 = _mm256_blend_epi32( YB0, XB2, 0x44 ); + YC0 = _mm256_blend_epi32( YC0, XC2, 0x44 ); + YA1 = _mm256_blend_epi32( YA1, XA2, 0x88 ); + YB1 = _mm256_blend_epi32( YB1, XB2, 0x88 ); + YC1 = _mm256_blend_epi32( YC1, XC2, 0x88 ); + YA2 = _mm256_blend_epi32( YA2, XA2, 0x11 ); + YB2 = _mm256_blend_epi32( YB2, XB2, 0x11 ); + YC2 = _mm256_blend_epi32( YC2, XC2, 0x11 ); + YA3 = _mm256_blend_epi32( YA3, XA2, 0x22 ); + YB3 = _mm256_blend_epi32( YB3, XB2, 0x22 ); + YC3 = _mm256_blend_epi32( YC3, XC2, 0x22 ); + + YA0 = _mm256_blend_epi32( YA0, XA3, 0x22 ); + YB0 = _mm256_blend_epi32( YB0, XB3, 0x22 ); + YC0 = _mm256_blend_epi32( YC0, XC3, 0x22 ); + YA1 = _mm256_blend_epi32( YA1, XA3, 0x44 ); + YB1 = _mm256_blend_epi32( YB1, XB3, 0x44 ); + YC1 = _mm256_blend_epi32( YC1, XC3, 0x44 ); + YA2 = _mm256_blend_epi32( YA2, XA3, 0x88 ); + YB2 = _mm256_blend_epi32( YB2, XB3, 0x88 ); + YC2 = _mm256_blend_epi32( YC2, XC3, 0x88 ); + YA3 = _mm256_blend_epi32( YA3, XA3, 0x11 ); + YB3 = _mm256_blend_epi32( YB3, XB3, 0x11 ); + YC3 = _mm256_blend_epi32( YC3, XC3, 0x11 ); + + BA[0] = _mm256_add_epi32( BA[0], YA0 ); + BB[0] = _mm256_add_epi32( BB[0], YB0 ); + BC[0] = _mm256_add_epi32( BC[0], YC0 ); + BA[1] = _mm256_add_epi32( BA[1], YA1 ); + BB[1] = _mm256_add_epi32( BB[1], YB1 ); + BC[1] = _mm256_add_epi32( BC[1], YC1 ); + BA[2] = _mm256_add_epi32( BA[2], YA2 ); + BB[2] = _mm256_add_epi32( BB[2], YB2 ); + 
BC[2] = _mm256_add_epi32( BC[2], YC2 ); + BA[3] = _mm256_add_epi32( BA[3], YA3 ); + BB[3] = _mm256_add_epi32( BB[3], YB3 ); + BC[3] = _mm256_add_epi32( BC[3], YC3 ); + +} + +void scrypt_core_2way_simd128_3buf( __m256i *X, __m256i *V, const uint32_t N ) +{ + __m256i *X0 = X; + __m256i *X1 = X+8; + __m256i *X2 = X+16; + __m256i *V0 = V; + __m256i *V1 = V + 8*N; + __m256i *V2 = V + 16*N; + + for ( int n = 0; n < N; n++ ) + { + memcpy( &V0[n * 8], X0, 128*2 ); + memcpy( &V1[n * 8], X1, 128*2 ); + memcpy( &V2[n * 8], X2, 128*2 ); + salsa8_2way_simd128_3buf( &X0[0], &X1[0], &X2[0], + &X0[4], &X1[4], &X2[4] ); + salsa8_2way_simd128_3buf( &X0[4], &X1[4], &X2[4], + &X0[0], &X1[0], &X2[0] ); + } + for ( int n = 0; n < N; n++ ) + { + m256_ovly x16a, x16b, x16c; + x16a = ( (m256_ovly*)X0 )[4]; + x16b = ( (m256_ovly*)X1 )[4]; + x16c = ( (m256_ovly*)X2 )[4]; + + uint32_t j0a = 8 * ( x16a.u32[0] & ( N-1 ) ); + uint32_t j0b = 8 * ( x16b.u32[0] & ( N-1 ) ); + uint32_t j0c = 8 * ( x16c.u32[0] & ( N-1 ) ); + uint32_t j1a = 8 * ( x16a.u32[4] & ( N-1 ) ); + uint32_t j1b = 8 * ( x16b.u32[4] & ( N-1 ) ); + uint32_t j1c = 8 * ( x16c.u32[4] & ( N-1 ) ); + + for ( int i = 0; i < 8; i++ ) + { + X0[i] = _mm256_xor_si256( X0[i], + _mm256_blend_epi32( V0[ j1a+i ], V0[ j0a+i ], 0x0f ) ); + X1[i] = _mm256_xor_si256( X1[i], + _mm256_blend_epi32( V1[ j1b+i ], V1[ j0b+i ], 0x0f ) ); + X2[i] = _mm256_xor_si256( X2[i], + _mm256_blend_epi32( V2[ j1c+i ], V2[ j0c+i ], 0x0f ) ); + } + + salsa8_2way_simd128_3buf( &X0[0], &X1[0], &X2[0], + &X0[4], &X1[4], &X2[4] ); + salsa8_2way_simd128_3buf( &X0[4], &X1[4], &X2[4], + &X0[0], &X1[0], &X2[0] ); + } +} + + +// 2x memory usage + +// Tested OK, good speed +// +// Serial SIMD over 2 way parallel + +// Uses uint64_t as a poorman's vector then applying linear SIMD to the +// pairs of data. +// +// Interleaving is standard 2 way. +// Use 64 bit shuffles but 32 bit arithmetic. 
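+// Each uint64_t packs lane1 in its upper 32 bits and lane0 in its lower
+// 32 bits, so one 256 bit vector carries four such pairs and the 32 bit
+// Salsa arithmetic is applied to both lanes at once.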
+ +// B = { lane1, lane0 } +// b[i] = { B[4*i+3], B[4*i+2], B[4*i+1], B[4*i] } + +// 2x32 interleaving +static void salsa8_simd128_2way( uint64_t *b, const uint64_t *c ) +{ + __m256i X0, X1, X2, X3, Y0, Y1, Y2, Y3; + __m256i *B = (__m256i*)b; + const __m256i *C = (const __m256i*)c; + + // mix C into B then shuffle B into X + B[0] = _mm256_xor_si256( B[0], C[0] ); + B[1] = _mm256_xor_si256( B[1], C[1] ); + B[2] = _mm256_xor_si256( B[2], C[2] ); + B[3] = _mm256_xor_si256( B[3], C[3] ); + + Y0 = _mm256_blend_epi32( B[1], B[0], 0x03 ); + X0 = _mm256_blend_epi32( B[3], B[2], 0x30 ); + X0 = _mm256_blend_epi32( X0, Y0, 0x0f); + + Y0 = _mm256_blend_epi32( B[2], B[1], 0x03 ); + X1 = _mm256_blend_epi32( B[0], B[3], 0x30 ); + X1 = _mm256_blend_epi32( X1, Y0, 0x0f ); + + Y0 = _mm256_blend_epi32( B[3], B[2], 0x03 ); + X2 = _mm256_blend_epi32( B[1], B[0], 0x30 ); + X2 = _mm256_blend_epi32( X2, Y0, 0x0f ); + + Y0 = _mm256_blend_epi32( B[0], B[3], 0x03 ); + X3 = _mm256_blend_epi32( B[2], B[1], 0x30 ); + X3 = _mm256_blend_epi32( X3, Y0, 0x0f ); + + // define targets for macros used in round function template + #define ROL_1X32 mm256_shufll_64 + #define ROR_1X32 mm256_shuflr_64 + #define SWAP_64 mm256_swap_128 + #define ROL32 mm256_rol_32 + #define ADD32 _mm256_add_epi32 + #define XOR _mm256_xor_si256 + + SALSA_8ROUNDS_SIMD128; + + #undef ROL_1X32 + #undef ROR_1X32 + #undef SWAP_64 + #undef ROL32 + #undef ADD32 + #undef XOR + + Y0 = _mm256_blend_epi32( X0, X1, 0xc0 ); + Y1 = _mm256_blend_epi32( X0, X1, 0x03 ); + Y2 = _mm256_blend_epi32( X0, X1, 0x0c ); + Y3 = _mm256_blend_epi32( X0, X1, 0x30 ); + + Y0 = _mm256_blend_epi32( Y0, X2, 0x30 ); + Y1 = _mm256_blend_epi32( Y1, X2, 0xc0 ); + Y2 = _mm256_blend_epi32( Y2, X2, 0x03 ); + Y3 = _mm256_blend_epi32( Y3, X2, 0x0c ); + + Y0 = _mm256_blend_epi32( Y0, X3, 0x0c ); + Y1 = _mm256_blend_epi32( Y1, X3, 0x30 ); + Y2 = _mm256_blend_epi32( Y2, X3, 0xc0 ); + Y3 = _mm256_blend_epi32( Y3, X3, 0x03 ); + + B[0] = _mm256_add_epi32( B[0], Y0 ); + B[1] = _mm256_add_epi32( B[1], Y1 ); + B[2] = _mm256_add_epi32( B[2], Y2 ); + B[3] = _mm256_add_epi32( B[3], Y3 ); + +} + +// data format for 256 bits: 4 * ( 2 way 32 ) +// { l1d3, l0d3, l1d2, l0d2, l1d1, l0d1, l1d0, l0d0 } + +void scrypt_core_simd128_2way( uint64_t *X, uint64_t *V, const uint32_t N ) +{ + for ( int n = 0; n < N; n++ ) + { + for ( int i = 0; i < 8; i++ ) + _mm256_stream_si256( (__m256i*)V + n*8 + i, casti_m256i( X, i ) ); + salsa8_simd128_2way( &X[ 0], &X[16] ); + salsa8_simd128_2way( &X[16], &X[ 0] ); + } + + for ( int n = 0; n < N; n++ ) + { + // need 2 J's + const uint32_t j0 = 32 * ( (uint32_t)( X[16] ) & ( N-1 ) ); + const uint32_t j1 = 32 * ( (uint32_t)( X[16] >> 32 ) & ( N-1 ) ); + + for ( int i = 0; i < 32; i++ ) + X[i] ^= ( ( V[ j1 + i ] & 0xffffffff00000000 ) + | ( V[ j0 + i ] & 0x00000000ffffffff ) ); + + salsa8_simd128_2way( &X[ 0], &X[16] ); + salsa8_simd128_2way( &X[16], &X[ 0] ); + } +} + +// Double buffered, 4x memory usage +// 2x32 interleaving +static void salsa8_simd128_2way_2buf( uint64_t *ba, uint64_t *bb, + const uint64_t *ca, const uint64_t *cb ) +{ + __m256i XA0, XA1, XA2, XA3, XB0, XB1, XB2, XB3; + __m256i YA0, YA1, YA2, YA3, YB0, YB1, YB2, YB3; + __m256i *BA = (__m256i*)ba; + __m256i *BB = (__m256i*)bb; + const __m256i *CA = (const __m256i*)ca; + const __m256i *CB = (const __m256i*)cb; + + // mix C into B then shuffle B into X + BA[0] = _mm256_xor_si256( BA[0], CA[0] ); + BB[0] = _mm256_xor_si256( BB[0], CB[0] ); + BA[1] = _mm256_xor_si256( BA[1], CA[1] ); + BB[1] = _mm256_xor_si256( BB[1], 
CB[1] ); + BA[2] = _mm256_xor_si256( BA[2], CA[2] ); + BB[2] = _mm256_xor_si256( BB[2], CB[2] ); + BA[3] = _mm256_xor_si256( BA[3], CA[3] ); + BB[3] = _mm256_xor_si256( BB[3], CB[3] ); + + YA0 = _mm256_blend_epi32( BA[1], BA[0], 0x03 ); + YB0 = _mm256_blend_epi32( BB[1], BB[0], 0x03 ); + XA0 = _mm256_blend_epi32( BA[3], BA[2], 0x30 ); + XB0 = _mm256_blend_epi32( BB[3], BB[2], 0x30 ); + XA0 = _mm256_blend_epi32( XA0, YA0, 0x0f); + XB0 = _mm256_blend_epi32( XB0, YB0, 0x0f); + + YA0 = _mm256_blend_epi32( BA[2], BA[1], 0x03 ); + YB0 = _mm256_blend_epi32( BB[2], BB[1], 0x03 ); + XA1 = _mm256_blend_epi32( BA[0], BA[3], 0x30 ); + XB1 = _mm256_blend_epi32( BB[0], BB[3], 0x30 ); + XA1 = _mm256_blend_epi32( XA1, YA0, 0x0f ); + XB1 = _mm256_blend_epi32( XB1, YB0, 0x0f ); + + YA0 = _mm256_blend_epi32( BA[3], BA[2], 0x03 ); + YB0 = _mm256_blend_epi32( BB[3], BB[2], 0x03 ); + XA2 = _mm256_blend_epi32( BA[1], BA[0], 0x30 ); + XB2 = _mm256_blend_epi32( BB[1], BB[0], 0x30 ); + XA2 = _mm256_blend_epi32( XA2, YA0, 0x0f ); + XB2 = _mm256_blend_epi32( XB2, YB0, 0x0f ); + + YA0 = _mm256_blend_epi32( BA[0], BA[3], 0x03 ); + YB0 = _mm256_blend_epi32( BB[0], BB[3], 0x03 ); + XA3 = _mm256_blend_epi32( BA[2], BA[1], 0x30 ); + XB3 = _mm256_blend_epi32( BB[2], BB[1], 0x30 ); + XA3 = _mm256_blend_epi32( XA3, YA0, 0x0f ); + XB3 = _mm256_blend_epi32( XB3, YB0, 0x0f ); + + // define targets for macros used in round function template + #define ROL_1X32 mm256_shufll_64 + #define ROR_1X32 mm256_shuflr_64 + #define SWAP_64 mm256_swap_128 + #define ROL32 mm256_rol_32 + #define ADD32 _mm256_add_epi32 + #define XOR _mm256_xor_si256 + #define TYPE __m256i + + SALSA_8ROUNDS_SIMD128_2BUF; + + #undef ROL_1X32 + #undef ROR_1X32 + #undef SWAP_64 + #undef ROL32 + #undef ADD32 + #undef XOR + #undef TYPE + + YA0 = _mm256_blend_epi32( XA0, XA1, 0xc0 ); + YB0 = _mm256_blend_epi32( XB0, XB1, 0xc0 ); + YA1 = _mm256_blend_epi32( XA0, XA1, 0x03 ); + YB1 = _mm256_blend_epi32( XB0, XB1, 0x03 ); + YA2 = _mm256_blend_epi32( XA0, XA1, 0x0c ); + YB2 = _mm256_blend_epi32( XB0, XB1, 0x0c ); + YA3 = _mm256_blend_epi32( XA0, XA1, 0x30 ); + YB3 = _mm256_blend_epi32( XB0, XB1, 0x30 ); + + YA0 = _mm256_blend_epi32( YA0, XA2, 0x30 ); + YB0 = _mm256_blend_epi32( YB0, XB2, 0x30 ); + YA1 = _mm256_blend_epi32( YA1, XA2, 0xc0 ); + YB1 = _mm256_blend_epi32( YB1, XB2, 0xc0 ); + YA2 = _mm256_blend_epi32( YA2, XA2, 0x03 ); + YB2 = _mm256_blend_epi32( YB2, XB2, 0x03 ); + YA3 = _mm256_blend_epi32( YA3, XA2, 0x0c ); + YB3 = _mm256_blend_epi32( YB3, XB2, 0x0c ); + + YA0 = _mm256_blend_epi32( YA0, XA3, 0x0c ); + YB0 = _mm256_blend_epi32( YB0, XB3, 0x0c ); + YA1 = _mm256_blend_epi32( YA1, XA3, 0x30 ); + YB1 = _mm256_blend_epi32( YB1, XB3, 0x30 ); + YA2 = _mm256_blend_epi32( YA2, XA3, 0xc0 ); + YB2 = _mm256_blend_epi32( YB2, XB3, 0xc0 ); + YA3 = _mm256_blend_epi32( YA3, XA3, 0x03 ); + YB3 = _mm256_blend_epi32( YB3, XB3, 0x03 ); + + BA[0] = _mm256_add_epi32( BA[0], YA0 ); + BB[0] = _mm256_add_epi32( BB[0], YB0 ); + BA[1] = _mm256_add_epi32( BA[1], YA1 ); + BB[1] = _mm256_add_epi32( BB[1], YB1 ); + BA[2] = _mm256_add_epi32( BA[2], YA2 ); + BB[2] = _mm256_add_epi32( BB[2], YB2 ); + BA[3] = _mm256_add_epi32( BA[3], YA3 ); + BB[3] = _mm256_add_epi32( BB[3], YB3 ); + +} + +void scrypt_core_simd128_2way_2buf( uint64_t *X, uint64_t *V, const uint32_t N ) + +{ + uint64_t *X0 = X; + uint64_t *X1 = X+32; + uint64_t *V0 = V; + uint64_t *V1 = V + 32*N; + + for ( int n = 0; n < N; n++ ) + { + for ( int i = 0; i < 8; i++ ) + { + _mm256_stream_si256( (__m256i*)V0 + n*8 + i, casti_m256i( 
X0, i ) ); + _mm256_stream_si256( (__m256i*)V1 + n*8 + i, casti_m256i( X1, i ) ); + } + salsa8_simd128_2way_2buf( &X0[ 0], &X1[ 0], &X0[16], &X1[16] ); + salsa8_simd128_2way_2buf( &X0[16], &X1[16], &X0[ 0], &X1[ 0] ); + } + + for ( int n = 0; n < N; n++ ) + { + // need 4 J's + const uint32_t j0l = 32 * ( (const uint32_t)( X0[16] ) & ( N-1 ) ); + const uint32_t j0h = 32 * ( (const uint32_t)( X0[16] >> 32 ) & ( N-1 ) ); + const uint32_t j1l = 32 * ( (const uint32_t)( X1[16] ) & ( N-1 ) ); + const uint32_t j1h = 32 * ( (const uint32_t)( X1[16] >> 32 ) & ( N-1 ) ); + + for ( int i = 0; i < 32; i++ ) + { + X0[i] ^= ( ( V0[ j0h + i ] & 0xffffffff00000000 ) + | ( V0[ j0l + i ] & 0x00000000ffffffff ) ); + X1[i] ^= ( ( V1[ j1h + i ] & 0xffffffff00000000 ) + | ( V1[ j1l + i ] & 0x00000000ffffffff ) ); + } + salsa8_simd128_2way_2buf( &X0[ 0], &X1[ 0], &X0[16], &X1[16] ); + salsa8_simd128_2way_2buf( &X0[16], &X1[16], &X0[ 0], &X1[ 0] ); + } +} + +// Working, deprecated, not up to date +// Triple buffered 2 way, 6x memory usage +// 2x32 interleaving +static void salsa8_simd128_2way_3buf( uint64_t *BA, uint64_t *BB, + uint64_t *BC, const uint64_t *CA, const uint64_t *CB, + const uint64_t *CC ) +{ + __m256i XA0, XA1, XA2, XA3, XB0, XB1, XB2, XB3, + XC0, XC1, XC2, XC3; + __m256i *ba = (__m256i*)BA; + __m256i *bb = (__m256i*)BB; + __m256i *bc = (__m256i*)BC; + const __m256i *ca = (const __m256i*)CA; + const __m256i *cb = (const __m256i*)CB; + const __m256i *cc = (const __m256i*)CC; + m256_ovly ya[4], yb[4], yc[4], + za[4], zb[4], zc[4]; + + // mix C into B then shuffle B into X + ba[0] = _mm256_xor_si256( ba[0], ca[0] ); + bb[0] = _mm256_xor_si256( bb[0], cb[0] ); + bc[0] = _mm256_xor_si256( bc[0], cc[0] ); + ba[1] = _mm256_xor_si256( ba[1], ca[1] ); + bb[1] = _mm256_xor_si256( bb[1], cb[1] ); + bc[1] = _mm256_xor_si256( bc[1], cc[1] ); + ba[2] = _mm256_xor_si256( ba[2], ca[2] ); + bb[2] = _mm256_xor_si256( bb[2], cb[2] ); + bc[2] = _mm256_xor_si256( bc[2], cc[2] ); + ba[3] = _mm256_xor_si256( ba[3], ca[3] ); + bb[3] = _mm256_xor_si256( bb[3], cb[3] ); + bc[3] = _mm256_xor_si256( bc[3], cc[3] ); + + XA0 = _mm256_set_epi64x( BA[15], BA[10], BA[ 5], BA[ 0] ); + XB0 = _mm256_set_epi64x( BB[15], BB[10], BB[ 5], BB[ 0] ); + XC0 = _mm256_set_epi64x( BC[15], BC[10], BC[ 5], BC[ 0] ); + XA1 = _mm256_set_epi64x( BA[ 3], BA[14], BA[ 9], BA[ 4] ); + XB1 = _mm256_set_epi64x( BB[ 3], BB[14], BB[ 9], BB[ 4] ); + XC1 = _mm256_set_epi64x( BC[ 3], BC[14], BC[ 9], BC[ 4] ); + XA2 = _mm256_set_epi64x( BA[ 7], BA[ 2], BA[13], BA[ 8] ); + XB2 = _mm256_set_epi64x( BB[ 7], BB[ 2], BB[13], BB[ 8] ); + XC2 = _mm256_set_epi64x( BC[ 7], BC[ 2], BC[13], BC[ 8] ); + XA3 = _mm256_set_epi64x( BA[11], BA[ 6], BA[ 1], BA[12] ); + XB3 = _mm256_set_epi64x( BB[11], BB[ 6], BB[ 1], BB[12] ); + XC3 = _mm256_set_epi64x( BC[11], BC[ 6], BC[ 1], BC[12] ); + + // define targets for macros used in round function template + #define ROL_1X32 mm256_shufll_64 + #define ROR_1X32 mm256_shuflr_64 + #define SWAP_64 mm256_swap_128 + #define ROL32 mm256_rol_32 + #define ADD32 _mm256_add_epi32 + #define XOR _mm256_xor_si256 + #define TYPE __m256i + + SALSA_8ROUNDS_FINAL_SIMD128_3BUF; + + #undef ROL_1X32 + #undef ROR_1X32 + #undef SWAP_64 + #undef ROL32 + #undef ADD32 + #undef XOR + #undef TYPE + + ya[0].m256 = XA0; yb[0].m256 = XB0; + yc[0].m256 = XC0; + ya[1].m256 = XA1; yb[1].m256 = XB1; + yc[1].m256 = XC1; + ya[2].m256 = XA2; yb[2].m256 = XB2; + yc[2].m256 = XC2; + ya[3].m256 = XA3; yb[3].m256 = XB3; + yc[3].m256 = XC3; + + za[0].u64[0] = ya[0].u64[0]; + 
zb[0].u64[0] = yb[0].u64[0]; + zc[0].u64[0] = yc[0].u64[0]; + za[0].u64[3] = ya[1].u64[0]; + zb[0].u64[3] = yb[1].u64[0]; + zc[0].u64[3] = yc[1].u64[0]; + za[0].u64[2] = ya[2].u64[0]; + zb[0].u64[2] = yb[2].u64[0]; + zc[0].u64[2] = yc[2].u64[0]; + za[0].u64[1] = ya[3].u64[0]; + zb[0].u64[1] = yb[3].u64[0]; + zc[0].u64[1] = yc[3].u64[0]; + + za[1].u64[1] = ya[0].u64[1]; + zb[1].u64[1] = yb[0].u64[1]; + zc[1].u64[1] = yc[0].u64[1]; + za[1].u64[0] = ya[1].u64[1]; + zb[1].u64[0] = yb[1].u64[1]; + zc[1].u64[0] = yc[1].u64[1]; + za[1].u64[3] = ya[2].u64[1]; + zb[1].u64[3] = yb[2].u64[1]; + zc[1].u64[3] = yc[2].u64[1]; + za[1].u64[2] = ya[3].u64[1]; + zb[1].u64[2] = yb[3].u64[1]; + zc[1].u64[2] = yc[3].u64[1]; + + za[2].u64[2] = ya[0].u64[2]; + zb[2].u64[2] = yb[0].u64[2]; + zc[2].u64[2] = yc[0].u64[2]; + za[2].u64[1] = ya[1].u64[2]; + zb[2].u64[1] = yb[1].u64[2]; + zc[2].u64[1] = yc[1].u64[2]; + za[2].u64[0] = ya[2].u64[2]; + zb[2].u64[0] = yb[2].u64[2]; + zc[2].u64[0] = yc[2].u64[2]; + za[2].u64[3] = ya[3].u64[2]; + zb[2].u64[3] = yb[3].u64[2]; + zc[2].u64[3] = yc[3].u64[2]; + + za[3].u64[3] = ya[0].u64[3]; + zb[3].u64[3] = yb[0].u64[3]; + zc[3].u64[3] = yc[0].u64[3]; + za[3].u64[2] = ya[1].u64[3]; + zb[3].u64[2] = yb[1].u64[3]; + zc[3].u64[2] = yc[1].u64[3]; + za[3].u64[1] = ya[2].u64[3]; + zb[3].u64[1] = yb[2].u64[3]; + zc[3].u64[1] = yc[2].u64[3]; + za[3].u64[0] = ya[3].u64[3]; + zb[3].u64[0] = yb[3].u64[3]; + zc[3].u64[0] = yc[3].u64[3]; + + ba[0] = _mm256_add_epi32( ba[0], za[0].m256 ); + bb[0] = _mm256_add_epi32( bb[0], zb[0].m256 ); + bc[0] = _mm256_add_epi32( bc[0], zc[0].m256 ); + ba[1] = _mm256_add_epi32( ba[1], za[1].m256 ); + bb[1] = _mm256_add_epi32( bb[1], zb[1].m256 ); + bc[1] = _mm256_add_epi32( bc[1], zc[1].m256 ); + ba[2] = _mm256_add_epi32( ba[2], za[2].m256 ); + bb[2] = _mm256_add_epi32( bb[2], zb[2].m256 ); + bc[2] = _mm256_add_epi32( bc[2], zc[2].m256 ); + ba[3] = _mm256_add_epi32( ba[3], za[3].m256 ); + bb[3] = _mm256_add_epi32( bb[3], zb[3].m256 ); + bc[3] = _mm256_add_epi32( bc[3], zc[3].m256 ); +} + +void scrypt_core_simd128_2way_3buf( uint64_t *X, uint64_t *V, + const uint32_t N ) +{ + uint64_t *X0 = X; + uint64_t *X1 = X+32; + uint64_t *X2 = X+64; + uint64_t *V0 = V; + uint64_t *V1 = V + 32*N; + uint64_t *V2 = V + 64*N; + + for ( int n = 0; n < N; n++ ) + { + memcpy( &V0[ n*32 ], X0, 2*128 ); + memcpy( &V1[ n*32 ], X1, 2*128 ); + memcpy( &V2[ n*32 ], X2, 2*128 ); + salsa8_simd128_2way_3buf( &X0[ 0], &X1[ 0], &X2[ 0], + &X0[16], &X1[16], &X2[16] ); + salsa8_simd128_2way_3buf( &X0[16], &X1[16], &X2[16], + &X0[ 0], &X1[ 0], &X2[ 0] ); + } + + for ( int n = 0; n < N; n++ ) + { + uint32_t j0l = 32 * ( (uint32_t)( X0[16] ) & ( N-1 ) ); + uint32_t j0h = 32 * ( (uint32_t)( X0[16] >> 32 ) & ( N-1 ) ); + uint32_t j1l = 32 * ( (uint32_t)( X1[16] ) & ( N-1 ) ); + uint32_t j1h = 32 * ( (uint32_t)( X1[16] >> 32 ) & ( N-1 ) ); + uint32_t j2l = 32 * ( (uint32_t)( X2[16] ) & ( N-1 ) ); + uint32_t j2h = 32 * ( (uint32_t)( X2[16] >> 32 ) & ( N-1 ) ); + + for ( int i = 0; i < 32; i++ ) + { + X0[i] ^= ( ( V0[ j0h + i ] & 0xffffffff00000000 ) + | ( V0[ j0l + i ] & 0x00000000ffffffff ) ); + X1[i] ^= ( ( V1[ j1h + i ] & 0xffffffff00000000 ) + | ( V1[ j1l + i ] & 0x00000000ffffffff ) ); + X2[i] ^= ( ( V2[ j2h + i ] & 0xffffffff00000000 ) + | ( V2[ j2l + i ] & 0x00000000ffffffff ) ); + } + salsa8_simd128_2way_3buf( &X0[ 0], &X1[ 0], &X2[ 0], + &X0[16], &X1[16], &X2[16] ); + salsa8_simd128_2way_3buf( &X0[16], &X1[16], &X2[16], + &X0[ 0], &X1[ 0], &X2[ 0] ); + } +} + + +#endif // AVX2 + +#if 
defined(__SSE2__) // required and assumed + +// Simple 4 way parallel. +// Tested OK +// Scyptn2 a little slower than pooler +// Scrypt 2x faster than pooler +// 4x memory usage +// 4x32 interleaving +static void xor_salsa8_4way( __m128i * const B, const __m128i * const C ) +{ + __m128i x0 = B[ 0] = _mm_xor_si128( B[ 0], C[ 0] ); + __m128i x1 = B[ 1] = _mm_xor_si128( B[ 1], C[ 1] ); + __m128i x2 = B[ 2] = _mm_xor_si128( B[ 2], C[ 2] ); + __m128i x3 = B[ 3] = _mm_xor_si128( B[ 3], C[ 3] ); + __m128i x4 = B[ 4] = _mm_xor_si128( B[ 4], C[ 4] ); + __m128i x5 = B[ 5] = _mm_xor_si128( B[ 5], C[ 5] ); + __m128i x6 = B[ 6] = _mm_xor_si128( B[ 6], C[ 6] ); + __m128i x7 = B[ 7] = _mm_xor_si128( B[ 7], C[ 7] ); + __m128i x8 = B[ 8] = _mm_xor_si128( B[ 8], C[ 8] ); + __m128i x9 = B[ 9] = _mm_xor_si128( B[ 9], C[ 9] ); + __m128i xa = B[10] = _mm_xor_si128( B[10], C[10] ); + __m128i xb = B[11] = _mm_xor_si128( B[11], C[11] ); + __m128i xc = B[12] = _mm_xor_si128( B[12], C[12] ); + __m128i xd = B[13] = _mm_xor_si128( B[13], C[13] ); + __m128i xe = B[14] = _mm_xor_si128( B[14], C[14] ); + __m128i xf = B[15] = _mm_xor_si128( B[15], C[15] ); + + #define ROL32 mm128_rol_32 + #define ADD32 _mm_add_epi32 + #define XOR _mm_xor_si128 + + SALSA_8ROUNDS; + + #undef ROL32 + #undef ADD32 + #undef XOR + + B[ 0] = _mm_add_epi32( B[ 0], x0 ); + B[ 1] = _mm_add_epi32( B[ 1], x1 ); + B[ 2] = _mm_add_epi32( B[ 2], x2 ); + B[ 3] = _mm_add_epi32( B[ 3], x3 ); + B[ 4] = _mm_add_epi32( B[ 4], x4 ); + B[ 5] = _mm_add_epi32( B[ 5], x5 ); + B[ 6] = _mm_add_epi32( B[ 6], x6 ); + B[ 7] = _mm_add_epi32( B[ 7], x7 ); + B[ 8] = _mm_add_epi32( B[ 8], x8 ); + B[ 9] = _mm_add_epi32( B[ 9], x9 ); + B[10] = _mm_add_epi32( B[10], xa ); + B[11] = _mm_add_epi32( B[11], xb ); + B[12] = _mm_add_epi32( B[12], xc ); + B[13] = _mm_add_epi32( B[13], xd ); + B[14] = _mm_add_epi32( B[14], xe ); + B[15] = _mm_add_epi32( B[15], xf ); +} + +void scrypt_core_4way( __m128i *X, __m128i *V, const uint32_t N ) +{ + for ( int n = 0; n < N; n++ ) + { + memcpy( &V[ n*32 ], X, 128*4 ); + xor_salsa8_4way( &X[ 0], &X[16] ); + xor_salsa8_4way( &X[16], &X[ 0] ); + } + for ( int n = 0; n < N; n++ ) + { + m128_ovly *vptr[4]; + m128_ovly *x16 = (m128_ovly*)(&X[16]); + + for ( int l = 0; l < 4; l++ ) + { + uint32_t xl = (*x16).u32[l]; + vptr[l] = (m128_ovly*)( &V[ 32 * ( xl & ( N-1 ) ) ] ); + } + + for ( int i = 0; i < 32; i++ ) + { + m128_ovly v; + for ( int l = 0; l < 4; l++ ) + v.u32[l] = ( *(vptr[l] +i ) ) .u32[l]; + X[i] = _mm_xor_si128( X[i], v.m128 ); + } + + xor_salsa8_4way( &X[ 0], &X[16] ); + xor_salsa8_4way( &X[16], &X[ 0] ); + } +} + + +// Linear SIMD single thread. No memory increase but some shuffling overhead +// required. + +// 4 way 32 bit interleaved single 32 bit thread, interleave while loading, +// deinterleave while storing, do 2 way 128 & 4 way 128 parallel on top. +// +// SALSA_2ROUNDS( {x0,x5,xa,xf}, {x4,x9,xe,x3}, {x8,xd,x2,x7}, {xc,x1,x6,xb}) + +// Tested OK. 
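+// The four groups above are the diagonals of the 4x4 Salsa state, so each
+// __m128i holds one word from every column and the quarter rounds run as
+// straight vector add/rotate/xor, with ROL_1X32 / ROR_1X32 / SWAP_64
+// realigning the lanes between steps.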
+// No interleaving +static void salsa8_simd128( uint32_t *b, const uint32_t * const c) +{ + __m128i X0, X1, X2, X3; + __m128i *B = (__m128i*)b; + const __m128i *C = (const __m128i*)c; + + // define targets for macros used in round function template + #define ROL_1X32 mm128_shufll_32 + #define ROR_1X32 mm128_shuflr_32 + #define SWAP_64 mm128_swap_64 + #define ROL32 mm128_rol_32 + #define ADD32 _mm_add_epi32 + #define XOR _mm_xor_si128 + + // mix C into B then shuffle B into X + B[0] = _mm_xor_si128( B[0], C[0] ); + B[1] = _mm_xor_si128( B[1], C[1] ); + B[2] = _mm_xor_si128( B[2], C[2] ); + B[3] = _mm_xor_si128( B[3], C[3] ); + +#if defined(__SSE4_1__) + + __m128i Y0, Y1, Y2, Y3; + +#if defined(__AVX2__) + + Y0 = _mm_blend_epi32( B[1], B[0], 0x1 ); + X0 = _mm_blend_epi32( B[3], B[2], 0x4 ); + Y1 = _mm_blend_epi32( B[2], B[1], 0x1 ); + X1 = _mm_blend_epi32( B[0], B[3], 0x4 ); + Y2 = _mm_blend_epi32( B[3], B[2], 0x1 ); + X2 = _mm_blend_epi32( B[1], B[0], 0x4 ); + Y3 = _mm_blend_epi32( B[0], B[3], 0x1 ); + X3 = _mm_blend_epi32( B[2], B[1], 0x4 ); + X0 = _mm_blend_epi32( X0, Y0, 0x3); + X1 = _mm_blend_epi32( X1, Y1, 0x3 ); + X2 = _mm_blend_epi32( X2, Y2, 0x3 ); + X3 = _mm_blend_epi32( X3, Y3, 0x3 ); + +#else // SSE4_1 + + Y0 = _mm_blend_epi16( B[1], B[0], 0x03 ); + X0 = _mm_blend_epi16( B[3], B[2], 0x30 ); + Y1 = _mm_blend_epi16( B[2], B[1], 0x03 ); + X1 = _mm_blend_epi16( B[0], B[3], 0x30 ); + Y2 = _mm_blend_epi16( B[3], B[2], 0x03 ); + X2 = _mm_blend_epi16( B[1], B[0], 0x30 ); + Y3 = _mm_blend_epi16( B[0], B[3], 0x03 ); + X3 = _mm_blend_epi16( B[2], B[1], 0x30 ); + + X0 = _mm_blend_epi16( X0, Y0, 0x0f ); + X1 = _mm_blend_epi16( X1, Y1, 0x0f ); + X2 = _mm_blend_epi16( X2, Y2, 0x0f ); + X3 = _mm_blend_epi16( X3, Y3, 0x0f ); + +#endif // AVX2 else SSE4_1 + + SALSA_8ROUNDS_SIMD128; + +#if defined(__AVX2__) + + Y0 = _mm_blend_epi32( X0, X1, 0x8 ); + Y1 = _mm_blend_epi32( X0, X1, 0x1 ); + Y2 = _mm_blend_epi32( X0, X1, 0x2 ); + Y3 = _mm_blend_epi32( X0, X1, 0x4 ); + + Y0 = _mm_blend_epi32( Y0, X2, 0x4 ); + Y1 = _mm_blend_epi32( Y1, X2, 0x8 ); + Y2 = _mm_blend_epi32( Y2, X2, 0x1 ); + Y3 = _mm_blend_epi32( Y3, X2, 0x2 ); + + Y0 = _mm_blend_epi32( Y0, X3, 0x2 ); + Y1 = _mm_blend_epi32( Y1, X3, 0x4 ); + Y2 = _mm_blend_epi32( Y2, X3, 0x8 ); + Y3 = _mm_blend_epi32( Y3, X3, 0x1 ); + +#else // SSE4_1 + + Y0 = _mm_blend_epi16( X0, X1, 0xc0 ); + Y1 = _mm_blend_epi16( X0, X1, 0x03 ); + Y2 = _mm_blend_epi16( X0, X1, 0x0c ); + Y3 = _mm_blend_epi16( X0, X1, 0x30 ); + + Y0 = _mm_blend_epi16( Y0, X2, 0x30 ); + Y1 = _mm_blend_epi16( Y1, X2, 0xc0 ); + Y2 = _mm_blend_epi16( Y2, X2, 0x03 ); + Y3 = _mm_blend_epi16( Y3, X2, 0x0c ); + + Y0 = _mm_blend_epi16( Y0, X3, 0x0c ); + Y1 = _mm_blend_epi16( Y1, X3, 0x30 ); + Y2 = _mm_blend_epi16( Y2, X3, 0xc0 ); + Y3 = _mm_blend_epi16( Y3, X3, 0x03 ); + +#endif // AVX2 else SSE4_1 + + B[0] = _mm_add_epi32( B[0], Y0 ); + B[1] = _mm_add_epi32( B[1], Y1 ); + B[2] = _mm_add_epi32( B[2], Y2 ); + B[3] = _mm_add_epi32( B[3], Y3 ); + +#else // SSE2 + + m128_ovly y[4], z[4]; + + X0 = _mm_set_epi32( b[15], b[10], b[ 5], b[ 0] ); + X1 = _mm_set_epi32( b[ 3], b[14], b[ 9], b[ 4] ); + X2 = _mm_set_epi32( b[ 7], b[ 2], b[13], b[ 8] ); + X3 = _mm_set_epi32( b[11], b[ 6], b[ 1], b[12] ); + + SALSA_8ROUNDS_FINAL_SIMD128; + + // Final round doesn't shuffle data back to original input order, + // process it as is. 
+ // X0 is unchanged { xf, xa, x5, x0 } + // X1 is shuffled left 1 (rol_1x32) { xe, x9, x4, x3 } + // X2 is shuffled left 2 (swap_64) { xd, x8, x7, x2 } + // X3 is shuffled left 3 (ror_1x32) { xc, xb, x6, x1 } + + y[0].m128 = X0; + y[1].m128 = X1; + y[2].m128 = X2; + y[3].m128 = X3; + + z[0].u32[0] = y[0].u32[0]; + z[0].u32[3] = y[1].u32[0]; + z[0].u32[2] = y[2].u32[0]; + z[0].u32[1] = y[3].u32[0]; + + z[1].u32[1] = y[0].u32[1]; + z[1].u32[0] = y[1].u32[1]; + z[1].u32[3] = y[2].u32[1]; + z[1].u32[2] = y[3].u32[1]; + + z[2].u32[2] = y[0].u32[2]; + z[2].u32[1] = y[1].u32[2]; + z[2].u32[0] = y[2].u32[2]; + z[2].u32[3] = y[3].u32[2]; + + z[3].u32[3] = y[0].u32[3]; + z[3].u32[2] = y[1].u32[3]; + z[3].u32[1] = y[2].u32[3]; + z[3].u32[0] = y[3].u32[3]; + + B[0] = _mm_add_epi32( B[0], z[0].m128 ); + B[1] = _mm_add_epi32( B[1], z[1].m128 ); + B[2] = _mm_add_epi32( B[2], z[2].m128 ); + B[3] = _mm_add_epi32( B[3], z[3].m128 ); + +#endif + + #undef ROL_1X32 + #undef ROR_1X32 + #undef SWAP_64 + #undef ROL32 + #undef ADD32 + #undef XOR + +} + +void scrypt_core_simd128( uint32_t *X, uint32_t *V, const uint32_t N ) +{ + for ( int n = 0; n < N; n++ ) + { + for ( int i = 0; i < 8; i++ ) + _mm_stream_si128( (__m128i*)V + n*8 + i, casti_m128i( X, i ) ); + + salsa8_simd128( &X[ 0], &X[16] ); + salsa8_simd128( &X[16], &X[ 0] ); + } + for ( int n = 0; n < N; n++ ) + { + const int j = 32 * ( X[16] & ( N - 1 ) ); + for ( int i = 0; i < 32; i++ ) + X[i] ^= V[ j+i ]; + salsa8_simd128( &X[ 0], &X[16] ); + salsa8_simd128( &X[16], &X[ 0] ); + } +} + +// Double buffered, 2x memory usage +// No interleaving + +static void salsa_simd128_shuffle_2buf( uint32_t *xa, uint32_t *xb ) +{ + __m128i *XA = (__m128i*)xa; + __m128i *XB = (__m128i*)xb; + __m128i YA0, YA1, YA2, YA3, YB0, YB1, YB2, YB3; + +#if defined(__SSE4_1__) + +// __m128i YA0, YA1, YA2, YA3, YB0, YB1, YB2, YB3; + __m128i ZA0, ZA1, ZA2, ZA3, ZB0, ZB1, ZB2, ZB3; + +#if defined(__AVX2__) + + YA0 = _mm_blend_epi32( XA[1], XA[0], 0x1 ); + YB0 = _mm_blend_epi32( XB[1], XB[0], 0x1 ); + ZA0 = _mm_blend_epi32( XA[3], XA[2], 0x4 ); + ZB0 = _mm_blend_epi32( XB[3], XB[2], 0x4 ); + + YA1 = _mm_blend_epi32( XA[2], XA[1], 0x1 ); + YB1 = _mm_blend_epi32( XB[2], XB[1], 0x1 ); + ZA1 = _mm_blend_epi32( XA[0], XA[3], 0x4 ); + ZB1 = _mm_blend_epi32( XB[0], XB[3], 0x4 ); + + YA2 = _mm_blend_epi32( XA[3], XA[2], 0x1 ); + YB2 = _mm_blend_epi32( XB[3], XB[2], 0x1 ); + ZA2 = _mm_blend_epi32( XA[1], XA[0], 0x4 ); + ZB2 = _mm_blend_epi32( XB[1], XB[0], 0x4 ); + + YA3 = _mm_blend_epi32( XA[0], XA[3], 0x1 ); + YB3 = _mm_blend_epi32( XB[0], XB[3], 0x1 ); + ZA3 = _mm_blend_epi32( XA[2], XA[1], 0x4 ); + ZB3 = _mm_blend_epi32( XB[2], XB[1], 0x4 ); + + XA[0] = _mm_blend_epi32( ZA0, YA0, 0x3 ); + XB[0] = _mm_blend_epi32( ZB0, YB0, 0x3 ); + + XA[1] = _mm_blend_epi32( ZA1, YA1, 0x3 ); + XB[1] = _mm_blend_epi32( ZB1, YB1, 0x3 ); + + XA[2] = _mm_blend_epi32( ZA2, YA2, 0x3 ); + XB[2] = _mm_blend_epi32( ZB2, YB2, 0x3 ); + + XA[3] = _mm_blend_epi32( ZA3, YA3, 0x3 ); + XB[3] = _mm_blend_epi32( ZB3, YB3, 0x3 ); + +#else + +// SSE4.1 + + YA0 = _mm_blend_epi16( XA[1], XA[0], 0x03 ); + YB0 = _mm_blend_epi16( XB[1], XB[0], 0x03 ); + ZA0 = _mm_blend_epi16( XA[3], XA[2], 0x30 ); + ZB0 = _mm_blend_epi16( XB[3], XB[2], 0x30 ); + + YA1 = _mm_blend_epi16( XA[2], XA[1], 0x03 ); + YB1 = _mm_blend_epi16( XB[2], XB[1], 0x03 ); + ZA1 = _mm_blend_epi16( XA[0], XA[3], 0x30 ); + ZB1 = _mm_blend_epi16( XB[0], XB[3], 0x30 ); + + YA2 = _mm_blend_epi16( XA[3], XA[2], 0x03 ); + YB2 = _mm_blend_epi16( XB[3], XB[2], 0x03 ); + ZA2 = 
_mm_blend_epi16( XA[1], XA[0], 0x30 ); + ZB2 = _mm_blend_epi16( XB[1], XB[0], 0x30 ); + + YA3 = _mm_blend_epi16( XA[0], XA[3], 0x03 ); + YB3 = _mm_blend_epi16( XB[0], XB[3], 0x03 ); + ZA3 = _mm_blend_epi16( XA[2], XA[1], 0x30 ); + ZB3 = _mm_blend_epi16( XB[2], XB[1], 0x30 ); + + XA[0] = _mm_blend_epi16( ZA0, YA0, 0x0f ); + XB[0] = _mm_blend_epi16( ZB0, YB0, 0x0f ); + + XA[1] = _mm_blend_epi16( ZA1, YA1, 0x0f ); + XB[1] = _mm_blend_epi16( ZB1, YB1, 0x0f ); + + XA[2] = _mm_blend_epi16( ZA2, YA2, 0x0f ); + XB[2] = _mm_blend_epi16( ZB2, YB2, 0x0f ); + + XA[3] = _mm_blend_epi16( ZA3, YA3, 0x0f ); + XB[3] = _mm_blend_epi16( ZB3, YB3, 0x0f ); + +#endif // AVX2 else SSE4_1 + +#else // SSE2 + + YA0 = _mm_set_epi32( xa[15], xa[10], xa[ 5], xa[ 0] ); + YB0 = _mm_set_epi32( xb[15], xb[10], xb[ 5], xb[ 0] ); + YA1 = _mm_set_epi32( xa[ 3], xa[14], xa[ 9], xa[ 4] ); + YB1 = _mm_set_epi32( xb[ 3], xb[14], xb[ 9], xb[ 4] ); + YA2 = _mm_set_epi32( xa[ 7], xa[ 2], xa[13], xa[ 8] ); + YB2 = _mm_set_epi32( xb[ 7], xb[ 2], xb[13], xb[ 8] ); + YA3 = _mm_set_epi32( xa[11], xa[ 6], xa[ 1], xa[12] ); + YB3 = _mm_set_epi32( xb[11], xb[ 6], xb[ 1], xb[12] ); + + XA[0] = YA0; + XB[0] = YB0; + XA[1] = YA1; + XB[1] = YB1; + XA[2] = YA2; + XB[2] = YB2; + XA[3] = YA3; + XB[3] = YB3; + +#endif +} + +static void salsa_simd128_unshuffle_2buf( uint32_t* xa, uint32_t* xb ) +{ + + __m128i *XA = (__m128i*)xa; + __m128i *XB = (__m128i*)xb; + +#if defined(__SSE4_1__) + + __m128i YA0, YA1, YA2, YA3, YB0, YB1, YB2, YB3; + +#if defined(__AVX2__) + + YA0 = _mm_blend_epi32( XA[0], XA[1], 0x8 ); + YB0 = _mm_blend_epi32( XB[0], XB[1], 0x8 ); + YA1 = _mm_blend_epi32( XA[0], XA[1], 0x1 ); + YB1 = _mm_blend_epi32( XB[0], XB[1], 0x1 ); + YA2 = _mm_blend_epi32( XA[0], XA[1], 0x2 ); + YB2 = _mm_blend_epi32( XB[0], XB[1], 0x2 ); + YA3 = _mm_blend_epi32( XA[0], XA[1], 0x4 ); + YB3 = _mm_blend_epi32( XB[0], XB[1], 0x4 ); + + YA0 = _mm_blend_epi32( YA0, XA[2], 0x4 ); + YB0 = _mm_blend_epi32( YB0, XB[2], 0x4 ); + YA1 = _mm_blend_epi32( YA1, XA[2], 0x8 ); + YB1 = _mm_blend_epi32( YB1, XB[2], 0x8 ); + YA2 = _mm_blend_epi32( YA2, XA[2], 0x1 ); + YB2 = _mm_blend_epi32( YB2, XB[2], 0x1 ); + YA3 = _mm_blend_epi32( YA3, XA[2], 0x2 ); + YB3 = _mm_blend_epi32( YB3, XB[2], 0x2 ); + + XA[0] = _mm_blend_epi32( YA0, XA[3], 0x2 ); + XB[0] = _mm_blend_epi32( YB0, XB[3], 0x2 ); + XA[1] = _mm_blend_epi32( YA1, XA[3], 0x4 ); + XB[1] = _mm_blend_epi32( YB1, XB[3], 0x4 ); + XA[2] = _mm_blend_epi32( YA2, XA[3], 0x8 ); + XB[2] = _mm_blend_epi32( YB2, XB[3], 0x8 ); + XA[3] = _mm_blend_epi32( YA3, XA[3], 0x1 ); + XB[3] = _mm_blend_epi32( YB3, XB[3], 0x1 ); + +#else // SSE4_1 + + YA0 = _mm_blend_epi16( XA[0], XA[1], 0xc0 ); + YB0 = _mm_blend_epi16( XB[0], XB[1], 0xc0 ); + YA1 = _mm_blend_epi16( XA[0], XA[1], 0x03 ); + YB1 = _mm_blend_epi16( XB[0], XB[1], 0x03 ); + YA2 = _mm_blend_epi16( XA[0], XA[1], 0x0c ); + YB2 = _mm_blend_epi16( XB[0], XB[1], 0x0c ); + YA3 = _mm_blend_epi16( XA[0], XA[1], 0x30 ); + YB3 = _mm_blend_epi16( XB[0], XB[1], 0x30 ); + + YA0 = _mm_blend_epi16( YA0, XA[2], 0x30 ); + YB0 = _mm_blend_epi16( YB0, XB[2], 0x30 ); + YA1 = _mm_blend_epi16( YA1, XA[2], 0xc0 ); + YB1 = _mm_blend_epi16( YB1, XB[2], 0xc0 ); + YA2 = _mm_blend_epi16( YA2, XA[2], 0x03 ); + YB2 = _mm_blend_epi16( YB2, XB[2], 0x03 ); + YA3 = _mm_blend_epi16( YA3, XA[2], 0x0c ); + YB3 = _mm_blend_epi16( YB3, XB[2], 0x0c ); + + XA[0] = _mm_blend_epi16( YA0, XA[3], 0x0c ); + XB[0] = _mm_blend_epi16( YB0, XB[3], 0x0c ); + XA[1] = _mm_blend_epi16( YA1, XA[3], 0x30 ); + XB[1] = _mm_blend_epi16( YB1, 
XB[3], 0x30 ); + XA[2] = _mm_blend_epi16( YA2, XA[3], 0xc0 ); + XB[2] = _mm_blend_epi16( YB2, XB[3], 0xc0 ); + XA[3] = _mm_blend_epi16( YA3, XA[3], 0x03 ); + XB[3] = _mm_blend_epi16( YB3, XB[3], 0x03 ); + +#endif // AVX2 else SSE4_1 + +#else // SSE2 + + m128_ovly ya[4], za[4], yb[4], zb[4]; + + ya[0].m128 = XA[0]; + yb[0].m128 = XB[0]; + ya[1].m128 = XA[1]; + yb[1].m128 = XB[1]; + ya[2].m128 = XA[2]; + yb[2].m128 = XB[2]; + ya[3].m128 = XA[3]; + yb[3].m128 = XB[3]; + + za[0].u32[0] = ya[0].u32[0]; + zb[0].u32[0] = yb[0].u32[0]; + za[0].u32[1] = ya[3].u32[1]; + zb[0].u32[1] = yb[3].u32[1]; + za[0].u32[2] = ya[2].u32[2]; + zb[0].u32[2] = yb[2].u32[2]; + za[0].u32[3] = ya[1].u32[3]; + zb[0].u32[3] = yb[1].u32[3]; + + za[1].u32[0] = ya[1].u32[0]; + zb[1].u32[0] = yb[1].u32[0]; + za[1].u32[1] = ya[0].u32[1]; + zb[1].u32[1] = yb[0].u32[1]; + za[1].u32[2] = ya[3].u32[2]; + zb[1].u32[2] = yb[3].u32[2]; + za[1].u32[3] = ya[2].u32[3]; + zb[1].u32[3] = yb[2].u32[3]; + + za[2].u32[0] = ya[2].u32[0]; + zb[2].u32[0] = yb[2].u32[0]; + za[2].u32[1] = ya[1].u32[1]; + zb[2].u32[1] = yb[1].u32[1]; + za[2].u32[2] = ya[0].u32[2]; + zb[2].u32[2] = yb[0].u32[2]; + za[2].u32[3] = ya[3].u32[3]; + zb[2].u32[3] = yb[3].u32[3]; + + za[3].u32[0] = ya[3].u32[0]; + zb[3].u32[0] = yb[3].u32[0]; + za[3].u32[1] = ya[2].u32[1]; + zb[3].u32[1] = yb[2].u32[1]; + za[3].u32[2] = ya[1].u32[2]; + zb[3].u32[2] = yb[1].u32[2]; + za[3].u32[3] = ya[0].u32[3]; + zb[3].u32[3] = yb[0].u32[3]; + + XA[0] = za[0].m128; + XB[0] = zb[0].m128; + XA[1] = za[1].m128; + XB[1] = zb[1].m128; + XA[2] = za[2].m128; + XB[2] = zb[2].m128; + XA[3] = za[3].m128; + XB[3] = zb[3].m128; + +#endif +} + +static void salsa8_simd128_2buf( uint32_t * const ba, uint32_t * const bb, + const uint32_t * const ca, const uint32_t * const cb ) +{ + __m128i XA0, XA1, XA2, XA3, XB0, XB1, XB2, XB3; + __m128i *BA = (__m128i*)ba; + __m128i *BB = (__m128i*)bb; + const __m128i *CA = (const __m128i*)ca; + const __m128i *CB = (const __m128i*)cb; + + // define targets for macros used in round function template + #define ROL_1X32 mm128_shufll_32 + #define ROR_1X32 mm128_shuflr_32 + #define SWAP_64 mm128_swap_64 + #define ROL32 mm128_rol_32 + #define ADD32 _mm_add_epi32 + #define XOR _mm_xor_si128 + #define TYPE __m128i + + XA0 = BA[0] = _mm_xor_si128( BA[0], CA[0] ); + XB0 = BB[0] = _mm_xor_si128( BB[0], CB[0] ); + XA1 = BA[1] = _mm_xor_si128( BA[1], CA[1] ); + XB1 = BB[1] = _mm_xor_si128( BB[1], CB[1] ); + XA2 = BA[2] = _mm_xor_si128( BA[2], CA[2] ); + XB2 = BB[2] = _mm_xor_si128( BB[2], CB[2] ); + XA3 = BA[3] = _mm_xor_si128( BA[3], CA[3] ); + XB3 = BB[3] = _mm_xor_si128( BB[3], CB[3] ); + +#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) + + SALSA_8ROUNDS_SIMD128_2BUF; + +#else + + SALSA_8ROUNDS_SIMD128_2BUF_SLOROT; + +#endif + + BA[0] = _mm_add_epi32( BA[0], XA0 ); + BB[0] = _mm_add_epi32( BB[0], XB0 ); + BA[1] = _mm_add_epi32( BA[1], XA1 ); + BB[1] = _mm_add_epi32( BB[1], XB1 ); + BA[2] = _mm_add_epi32( BA[2], XA2 ); + BB[2] = _mm_add_epi32( BB[2], XB2 ); + BA[3] = _mm_add_epi32( BA[3], XA3 ); + BB[3] = _mm_add_epi32( BB[3], XB3 ); + + #undef ROL_1X32 + #undef ROR_1X32 + #undef SWAP_64 + #undef ROL32 + #undef ADD32 + #undef XOR + #undef TYPE +} + +void scrypt_core_simd128_2buf( uint32_t *X, uint32_t *V, const uint32_t N ) +{ + uint32_t *X0 = X; + uint32_t *X1 = X+32; + uint32_t *V0 = V; + uint32_t *V1 = V + 32*N; + + salsa_simd128_shuffle_2buf( X0, X1 ); + salsa_simd128_shuffle_2buf( X0+16, X1+16 ); + + for ( int n = 0; n < 
N; n++ ) + { + #if defined(__AVX__) + + for ( int i = 0; i < 4; i++ ) + { + _mm256_stream_si256( (__m256i*)V0 + n*4 + i, casti_m256i( X0, i ) ); + _mm256_stream_si256( (__m256i*)V1 + n*4 + i, casti_m256i( X1, i ) ); + } + + #elif defined(__SSE4_1__) + + for ( int i = 0; i < 8; i++ ) + { + _mm_stream_si128( (__m128i*)V0 + n*8 + i, casti_m128i( X0, i ) ); + _mm_stream_si128( (__m128i*)V1 + n*8 + i, casti_m128i( X1, i ) ); + } + + #else + + memcpy( &V0[ n*32 ], X0, 128 ); + memcpy( &V1[ n*32 ], X1, 128 ); + + #endif + + salsa8_simd128_2buf( X0, X1, X0+16, X1+16 ); + salsa8_simd128_2buf( X0+16, X1+16, X0 , X1 ); + } + + for ( int n = 0; n < N; n++ ) + { + #if defined(__AVX2__) + + const int j0 = 4 * ( X0[16] & ( N-1 ) ); + const int j1 = 4 * ( X1[16] & ( N-1 ) ); + + const __m256i v00 = _mm256_stream_load_si256( ( (__m256i*)V0 ) +j0 ); + const __m256i v10 = _mm256_stream_load_si256( ( (__m256i*)V1 ) +j1 ); + const __m256i v01 = _mm256_stream_load_si256( ( (__m256i*)V0 ) +j0+1 ); + const __m256i v11 = _mm256_stream_load_si256( ( (__m256i*)V1 ) +j1+1 ); + const __m256i v02 = _mm256_stream_load_si256( ( (__m256i*)V0 ) +j0+2 ); + const __m256i v12 = _mm256_stream_load_si256( ( (__m256i*)V1 ) +j1+2 ); + const __m256i v03 = _mm256_stream_load_si256( ( (__m256i*)V0 ) +j0+3 ); + const __m256i v13 = _mm256_stream_load_si256( ( (__m256i*)V1 ) +j1+3 ); + + casti_m256i( X0, 0 ) = _mm256_xor_si256( casti_m256i( X0, 0 ), v00 ); + casti_m256i( X1, 0 ) = _mm256_xor_si256( casti_m256i( X1, 0 ), v10 ); + casti_m256i( X0, 1 ) = _mm256_xor_si256( casti_m256i( X0, 1 ), v01 ); + casti_m256i( X1, 1 ) = _mm256_xor_si256( casti_m256i( X1, 1 ), v11 ); + casti_m256i( X0, 2 ) = _mm256_xor_si256( casti_m256i( X0, 2 ), v02 ); + casti_m256i( X1, 2 ) = _mm256_xor_si256( casti_m256i( X1, 2 ), v12 ); + casti_m256i( X0, 3 ) = _mm256_xor_si256( casti_m256i( X0, 3 ), v03 ); + casti_m256i( X1, 3 ) = _mm256_xor_si256( casti_m256i( X1, 3 ), v13 ); + + #else + + const int j0 = 8 * ( X0[16] & ( N-1 ) ); + const int j1 = 8 * ( X1[16] & ( N-1 ) ); + for ( int i = 0; i < 8; i++ ) + { + const __m128i v0 = _mm_load_si128( ( (__m128i*)V0 ) +j0+i ); + const __m128i v1 = _mm_load_si128( ( (__m128i*)V1 ) +j1+i ); + casti_m128i( X0, i ) = _mm_xor_si128( casti_m128i( X0, i ), v0 ); + casti_m128i( X1, i ) = _mm_xor_si128( casti_m128i( X1, i ), v1 ); + } + + #endif + + salsa8_simd128_2buf( X0, X1, X0+16, X1+16 ); + salsa8_simd128_2buf( X0+16, X1+16, X0 , X1 ); + } + + salsa_simd128_unshuffle_2buf( X0, X1 ); + salsa_simd128_unshuffle_2buf( X0+16, X1+16 ); +} + + +static void salsa_simd128_shuffle_3buf( uint32_t *xa, uint32_t *xb, + uint32_t *xc ) +{ + __m128i *XA = (__m128i*)xa; + __m128i *XB = (__m128i*)xb; + __m128i *XC = (__m128i*)xc; + __m128i YA0, YA1, YA2, YA3, YB0, YB1, YB2, YB3, YC0, YC1, YC2, YC3; + +#if defined(__SSE4_1__) + + __m128i ZA0, ZA1, ZA2, ZA3, ZB0, ZB1, ZB2, ZB3, ZC0, ZC1, ZC2, ZC3; + +#if defined(__AVX2__) + + YA0 = _mm_blend_epi32( XA[1], XA[0], 0x1 ); + YB0 = _mm_blend_epi32( XB[1], XB[0], 0x1 ); + YC0 = _mm_blend_epi32( XC[1], XC[0], 0x1 ); + ZA0 = _mm_blend_epi32( XA[3], XA[2], 0x4 ); + ZB0 = _mm_blend_epi32( XB[3], XB[2], 0x4 ); + ZC0 = _mm_blend_epi32( XC[3], XC[2], 0x4 ); + + YA1 = _mm_blend_epi32( XA[2], XA[1], 0x1 ); + YB1 = _mm_blend_epi32( XB[2], XB[1], 0x1 ); + YC1 = _mm_blend_epi32( XC[2], XC[1], 0x1 ); + ZA1 = _mm_blend_epi32( XA[0], XA[3], 0x4 ); + ZB1 = _mm_blend_epi32( XB[0], XB[3], 0x4 ); + ZC1 = _mm_blend_epi32( XC[0], XC[3], 0x4 ); + + YA2 = _mm_blend_epi32( XA[3], XA[2], 0x1 ); + YB2 = _mm_blend_epi32( 
XB[3], XB[2], 0x1 ); + YC2 = _mm_blend_epi32( XC[3], XC[2], 0x1 ); + ZA2 = _mm_blend_epi32( XA[1], XA[0], 0x4 ); + ZB2 = _mm_blend_epi32( XB[1], XB[0], 0x4 ); + ZC2 = _mm_blend_epi32( XC[1], XC[0], 0x4 ); + + YA3 = _mm_blend_epi32( XA[0], XA[3], 0x1 ); + YB3 = _mm_blend_epi32( XB[0], XB[3], 0x1 ); + YC3 = _mm_blend_epi32( XC[0], XC[3], 0x1 ); + ZA3 = _mm_blend_epi32( XA[2], XA[1], 0x4 ); + ZB3 = _mm_blend_epi32( XB[2], XB[1], 0x4 ); + ZC3 = _mm_blend_epi32( XC[2], XC[1], 0x4 ); + + XA[0] = _mm_blend_epi32( ZA0, YA0, 0x3 ); + XB[0] = _mm_blend_epi32( ZB0, YB0, 0x3 ); + XC[0] = _mm_blend_epi32( ZC0, YC0, 0x3 ); + + XA[1] = _mm_blend_epi32( ZA1, YA1, 0x3 ); + XB[1] = _mm_blend_epi32( ZB1, YB1, 0x3 ); + XC[1] = _mm_blend_epi32( ZC1, YC1, 0x3 ); + + XA[2] = _mm_blend_epi32( ZA2, YA2, 0x3 ); + XB[2] = _mm_blend_epi32( ZB2, YB2, 0x3 ); + XC[2] = _mm_blend_epi32( ZC2, YC2, 0x3 ); + + XA[3] = _mm_blend_epi32( ZA3, YA3, 0x3 ); + XB[3] = _mm_blend_epi32( ZB3, YB3, 0x3 ); + XC[3] = _mm_blend_epi32( ZC3, YC3, 0x3 ); + +#else + +// SSE4.1 + + YA0 = _mm_blend_epi16( XA[1], XA[0], 0x03 ); + YB0 = _mm_blend_epi16( XB[1], XB[0], 0x03 ); + YC0 = _mm_blend_epi16( XC[1], XC[0], 0x03 ); + ZA0 = _mm_blend_epi16( XA[3], XA[2], 0x30 ); + ZB0 = _mm_blend_epi16( XB[3], XB[2], 0x30 ); + ZC0 = _mm_blend_epi16( XC[3], XC[2], 0x30 ); + + YA1 = _mm_blend_epi16( XA[2], XA[1], 0x03 ); + YB1 = _mm_blend_epi16( XB[2], XB[1], 0x03 ); + YC1 = _mm_blend_epi16( XC[2], XC[1], 0x03 ); + ZA1 = _mm_blend_epi16( XA[0], XA[3], 0x30 ); + ZB1 = _mm_blend_epi16( XB[0], XB[3], 0x30 ); + ZC1 = _mm_blend_epi16( XC[0], XC[3], 0x30 ); + + YA2 = _mm_blend_epi16( XA[3], XA[2], 0x03 ); + YB2 = _mm_blend_epi16( XB[3], XB[2], 0x03 ); + YC2 = _mm_blend_epi16( XC[3], XC[2], 0x03 ); + ZA2 = _mm_blend_epi16( XA[1], XA[0], 0x30 ); + ZB2 = _mm_blend_epi16( XB[1], XB[0], 0x30 ); + ZC2 = _mm_blend_epi16( XC[1], XC[0], 0x30 ); + + YA3 = _mm_blend_epi16( XA[0], XA[3], 0x03 ); + YB3 = _mm_blend_epi16( XB[0], XB[3], 0x03 ); + YC3 = _mm_blend_epi16( XC[0], XC[3], 0x03 ); + ZA3 = _mm_blend_epi16( XA[2], XA[1], 0x30 ); + ZB3 = _mm_blend_epi16( XB[2], XB[1], 0x30 ); + ZC3 = _mm_blend_epi16( XC[2], XC[1], 0x30 ); + + XA[0] = _mm_blend_epi16( ZA0, YA0, 0x0f ); + XB[0] = _mm_blend_epi16( ZB0, YB0, 0x0f ); + XC[0] = _mm_blend_epi16( ZC0, YC0, 0x0f ); + + XA[1] = _mm_blend_epi16( ZA1, YA1, 0x0f ); + XB[1] = _mm_blend_epi16( ZB1, YB1, 0x0f ); + XC[1] = _mm_blend_epi16( ZC1, YC1, 0x0f ); + + XA[2] = _mm_blend_epi16( ZA2, YA2, 0x0f ); + XB[2] = _mm_blend_epi16( ZB2, YB2, 0x0f ); + XC[2] = _mm_blend_epi16( ZC2, YC2, 0x0f ); + + XA[3] = _mm_blend_epi16( ZA3, YA3, 0x0f ); + XB[3] = _mm_blend_epi16( ZB3, YB3, 0x0f ); + XC[3] = _mm_blend_epi16( ZC3, YC3, 0x0f ); + +#endif // AVX2 else SSE4_1 + +#else // SSE2 + + YA0 = _mm_set_epi32( xa[15], xa[10], xa[ 5], xa[ 0] ); + YB0 = _mm_set_epi32( xb[15], xb[10], xb[ 5], xb[ 0] ); + YC0 = _mm_set_epi32( xc[15], xc[10], xc[ 5], xc[ 0] ); + YA1 = _mm_set_epi32( xa[ 3], xa[14], xa[ 9], xa[ 4] ); + YB1 = _mm_set_epi32( xb[ 3], xb[14], xb[ 9], xb[ 4] ); + YC1 = _mm_set_epi32( xc[ 3], xc[14], xc[ 9], xc[ 4] ); + YA2 = _mm_set_epi32( xa[ 7], xa[ 2], xa[13], xa[ 8] ); + YB2 = _mm_set_epi32( xb[ 7], xb[ 2], xb[13], xb[ 8] ); + YC2 = _mm_set_epi32( xc[ 7], xc[ 2], xc[13], xc[ 8] ); + YA3 = _mm_set_epi32( xa[11], xa[ 6], xa[ 1], xa[12] ); + YB3 = _mm_set_epi32( xb[11], xb[ 6], xb[ 1], xb[12] ); + YC3 = _mm_set_epi32( xc[11], xc[ 6], xc[ 1], xc[12] ); + + XA[0] = YA0; + XB[0] = YB0; + XC[0] = YC0; + XA[1] = YA1; + XB[1] = YB1; + XC[1] = YC1; + 
XA[2] = YA2; + XB[2] = YB2; + XC[2] = YC2; + XA[3] = YA3; + XB[3] = YB3; + XC[3] = YC3; + +#endif +} + +static void salsa_simd128_unshuffle_3buf( uint32_t* xa, uint32_t* xb, + uint32_t* xc ) +{ + __m128i *XA = (__m128i*)xa; + __m128i *XB = (__m128i*)xb; + __m128i *XC = (__m128i*)xc; + +#if defined(__SSE4_1__) + + __m128i YA0, YA1, YA2, YA3, YB0, YB1, YB2, YB3, YC0, YC1, YC2, YC3; + +#if defined(__AVX2__) + + YA0 = _mm_blend_epi32( XA[0], XA[1], 0x8 ); + YB0 = _mm_blend_epi32( XB[0], XB[1], 0x8 ); + YC0 = _mm_blend_epi32( XC[0], XC[1], 0x8 ); + YA1 = _mm_blend_epi32( XA[0], XA[1], 0x1 ); + YB1 = _mm_blend_epi32( XB[0], XB[1], 0x1 ); + YC1 = _mm_blend_epi32( XC[0], XC[1], 0x1 ); + YA2 = _mm_blend_epi32( XA[0], XA[1], 0x2 ); + YB2 = _mm_blend_epi32( XB[0], XB[1], 0x2 ); + YC2 = _mm_blend_epi32( XC[0], XC[1], 0x2 ); + YA3 = _mm_blend_epi32( XA[0], XA[1], 0x4 ); + YB3 = _mm_blend_epi32( XB[0], XB[1], 0x4 ); + YC3 = _mm_blend_epi32( XC[0], XC[1], 0x4 ); + + YA0 = _mm_blend_epi32( YA0, XA[2], 0x4 ); + YB0 = _mm_blend_epi32( YB0, XB[2], 0x4 ); + YC0 = _mm_blend_epi32( YC0, XC[2], 0x4 ); + YA1 = _mm_blend_epi32( YA1, XA[2], 0x8 ); + YB1 = _mm_blend_epi32( YB1, XB[2], 0x8 ); + YC1 = _mm_blend_epi32( YC1, XC[2], 0x8 ); + YA2 = _mm_blend_epi32( YA2, XA[2], 0x1 ); + YB2 = _mm_blend_epi32( YB2, XB[2], 0x1 ); + YC2 = _mm_blend_epi32( YC2, XC[2], 0x1 ); + YA3 = _mm_blend_epi32( YA3, XA[2], 0x2 ); + YB3 = _mm_blend_epi32( YB3, XB[2], 0x2 ); + YC3 = _mm_blend_epi32( YC3, XC[2], 0x2 ); + + XA[0] = _mm_blend_epi32( YA0, XA[3], 0x2 ); + XB[0] = _mm_blend_epi32( YB0, XB[3], 0x2 ); + XC[0] = _mm_blend_epi32( YC0, XC[3], 0x2 ); + XA[1] = _mm_blend_epi32( YA1, XA[3], 0x4 ); + XB[1] = _mm_blend_epi32( YB1, XB[3], 0x4 ); + XC[1] = _mm_blend_epi32( YC1, XC[3], 0x4 ); + XA[2] = _mm_blend_epi32( YA2, XA[3], 0x8 ); + XB[2] = _mm_blend_epi32( YB2, XB[3], 0x8 ); + XC[2] = _mm_blend_epi32( YC2, XC[3], 0x8 ); + XA[3] = _mm_blend_epi32( YA3, XA[3], 0x1 ); + XB[3] = _mm_blend_epi32( YB3, XB[3], 0x1 ); + XC[3] = _mm_blend_epi32( YC3, XC[3], 0x1 ); + +#else // SSE4_1 + + YA0 = _mm_blend_epi16( XA[0], XA[1], 0xc0 ); + YB0 = _mm_blend_epi16( XB[0], XB[1], 0xc0 ); + YC0 = _mm_blend_epi16( XC[0], XC[1], 0xc0 ); + YA1 = _mm_blend_epi16( XA[0], XA[1], 0x03 ); + YB1 = _mm_blend_epi16( XB[0], XB[1], 0x03 ); + YC1 = _mm_blend_epi16( XC[0], XC[1], 0x03 ); + YA2 = _mm_blend_epi16( XA[0], XA[1], 0x0c ); + YB2 = _mm_blend_epi16( XB[0], XB[1], 0x0c ); + YC2 = _mm_blend_epi16( XC[0], XC[1], 0x0c ); + YA3 = _mm_blend_epi16( XA[0], XA[1], 0x30 ); + YB3 = _mm_blend_epi16( XB[0], XB[1], 0x30 ); + YC3 = _mm_blend_epi16( XC[0], XC[1], 0x30 ); + + YA0 = _mm_blend_epi16( YA0, XA[2], 0x30 ); + YB0 = _mm_blend_epi16( YB0, XB[2], 0x30 ); + YC0 = _mm_blend_epi16( YC0, XC[2], 0x30 ); + YA1 = _mm_blend_epi16( YA1, XA[2], 0xc0 ); + YB1 = _mm_blend_epi16( YB1, XB[2], 0xc0 ); + YC1 = _mm_blend_epi16( YC1, XC[2], 0xc0 ); + YA2 = _mm_blend_epi16( YA2, XA[2], 0x03 ); + YB2 = _mm_blend_epi16( YB2, XB[2], 0x03 ); + YC2 = _mm_blend_epi16( YC2, XC[2], 0x03 ); + YA3 = _mm_blend_epi16( YA3, XA[2], 0x0c ); + YB3 = _mm_blend_epi16( YB3, XB[2], 0x0c ); + YC3 = _mm_blend_epi16( YC3, XC[2], 0x0c ); + + XA[0] = _mm_blend_epi16( YA0, XA[3], 0x0c ); + XB[0] = _mm_blend_epi16( YB0, XB[3], 0x0c ); + XC[0] = _mm_blend_epi16( YC0, XC[3], 0x0c ); + XA[1] = _mm_blend_epi16( YA1, XA[3], 0x30 ); + XB[1] = _mm_blend_epi16( YB1, XB[3], 0x30 ); + XC[1] = _mm_blend_epi16( YC1, XC[3], 0x30 ); + XA[2] = _mm_blend_epi16( YA2, XA[3], 0xc0 ); + XB[2] = _mm_blend_epi16( YB2, XB[3], 0xc0 ); + 
XC[2] = _mm_blend_epi16( YC2, XC[3], 0xc0 ); + XA[3] = _mm_blend_epi16( YA3, XA[3], 0x03 ); + XB[3] = _mm_blend_epi16( YB3, XB[3], 0x03 ); + XC[3] = _mm_blend_epi16( YC3, XC[3], 0x03 ); + +#endif // AVX2 else SSE4_1 + +#else // SSE2 + + m128_ovly ya[4], za[4], yb[4], zb[4], yc[4], zc[4]; + + ya[0].m128 = XA[0]; + yb[0].m128 = XB[0]; + yc[0].m128 = XC[0]; + ya[1].m128 = XA[1]; + yb[1].m128 = XB[1]; + yc[1].m128 = XC[1]; + ya[2].m128 = XA[2]; + yb[2].m128 = XB[2]; + yc[2].m128 = XC[2]; + ya[3].m128 = XA[3]; + yb[3].m128 = XB[3]; + yc[3].m128 = XC[3]; + + za[0].u32[0] = ya[0].u32[0]; + zb[0].u32[0] = yb[0].u32[0]; + zc[0].u32[0] = yc[0].u32[0]; + za[0].u32[1] = ya[3].u32[1]; + zb[0].u32[1] = yb[3].u32[1]; + zc[0].u32[1] = yc[3].u32[1]; + za[0].u32[2] = ya[2].u32[2]; + zb[0].u32[2] = yb[2].u32[2]; + zc[0].u32[2] = yc[2].u32[2]; + za[0].u32[3] = ya[1].u32[3]; + zb[0].u32[3] = yb[1].u32[3]; + zc[0].u32[3] = yc[1].u32[3]; + + za[1].u32[0] = ya[1].u32[0]; + zb[1].u32[0] = yb[1].u32[0]; + zc[1].u32[0] = yc[1].u32[0]; + za[1].u32[1] = ya[0].u32[1]; + zb[1].u32[1] = yb[0].u32[1]; + zc[1].u32[1] = yc[0].u32[1]; + za[1].u32[2] = ya[3].u32[2]; + zb[1].u32[2] = yb[3].u32[2]; + zc[1].u32[2] = yc[3].u32[2]; + za[1].u32[3] = ya[2].u32[3]; + zb[1].u32[3] = yb[2].u32[3]; + zc[1].u32[3] = yc[2].u32[3]; + + za[2].u32[0] = ya[2].u32[0]; + zb[2].u32[0] = yb[2].u32[0]; + zc[2].u32[0] = yc[2].u32[0]; + za[2].u32[1] = ya[1].u32[1]; + zb[2].u32[1] = yb[1].u32[1]; + zc[2].u32[1] = yc[1].u32[1]; + za[2].u32[2] = ya[0].u32[2]; + zb[2].u32[2] = yb[0].u32[2]; + zc[2].u32[2] = yc[0].u32[2]; + za[2].u32[3] = ya[3].u32[3]; + zb[2].u32[3] = yb[3].u32[3]; + zc[2].u32[3] = yc[3].u32[3]; + + za[3].u32[0] = ya[3].u32[0]; + zb[3].u32[0] = yb[3].u32[0]; + zc[3].u32[0] = yc[3].u32[0]; + za[3].u32[1] = ya[2].u32[1]; + zb[3].u32[1] = yb[2].u32[1]; + zc[3].u32[1] = yc[2].u32[1]; + za[3].u32[2] = ya[1].u32[2]; + zb[3].u32[2] = yb[1].u32[2]; + zc[3].u32[2] = yc[1].u32[2]; + za[3].u32[3] = ya[0].u32[3]; + zb[3].u32[3] = yb[0].u32[3]; + zc[3].u32[3] = yc[0].u32[3]; + + XA[0] = za[0].m128; + XB[0] = zb[0].m128; + XC[0] = zc[0].m128; + XA[1] = za[1].m128; + XB[1] = zb[1].m128; + XC[1] = zc[1].m128; + XA[2] = za[2].m128; + XB[2] = zb[2].m128; + XC[2] = zc[2].m128; + XA[3] = za[3].m128; + XB[3] = zb[3].m128; + XC[3] = zc[3].m128; + +#endif +} + +// Triple buffered, 3x memory usage +// No interleaving +static void salsa8_simd128_3buf( uint32_t *ba, uint32_t *bb, uint32_t *bc, + const uint32_t *ca, const uint32_t *cb, const uint32_t *cc ) +{ + __m128i XA0, XA1, XA2, XA3, XB0, XB1, XB2, XB3, + XC0, XC1, XC2, XC3; + __m128i *BA = (__m128i*)ba; + __m128i *BB = (__m128i*)bb; + __m128i *BC = (__m128i*)bc; + const __m128i *CA = (const __m128i*)ca; + const __m128i *CB = (const __m128i*)cb; + const __m128i *CC = (const __m128i*)cc; + + // define targets for macros used in round function template + #define ROL_1X32 mm128_shufll_32 + #define ROR_1X32 mm128_shuflr_32 + #define SWAP_64 mm128_swap_64 + #define ROL32 mm128_rol_32 + #define ADD32 _mm_add_epi32 + #define XOR _mm_xor_si128 + #define TYPE __m128i + + XA0 = BA[0] = _mm_xor_si128( BA[0], CA[0] ); + XB0 = BB[0] = _mm_xor_si128( BB[0], CB[0] ); + XC0 = BC[0] = _mm_xor_si128( BC[0], CC[0] ); + XA1 = BA[1] = _mm_xor_si128( BA[1], CA[1] ); + XB1 = BB[1] = _mm_xor_si128( BB[1], CB[1] ); + XC1 = BC[1] = _mm_xor_si128( BC[1], CC[1] ); + XA2 = BA[2] = _mm_xor_si128( BA[2], CA[2] ); + XB2 = BB[2] = _mm_xor_si128( BB[2], CB[2] ); + XC2 = BC[2] = _mm_xor_si128( BC[2], CC[2] ); + XA3 = BA[3] = _mm_xor_si128( 
BA[3], CA[3] ); + XB3 = BB[3] = _mm_xor_si128( BB[3], CB[3] ); + XC3 = BC[3] = _mm_xor_si128( BC[3], CC[3] ); + +#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) + + SALSA_8ROUNDS_SIMD128_3BUF; + +#else + + SALSA_8ROUNDS_SIMD128_3BUF_SLOROT; + +#endif + + BA[0] = _mm_add_epi32( BA[0], XA0 ); + BB[0] = _mm_add_epi32( BB[0], XB0 ); + BC[0] = _mm_add_epi32( BC[0], XC0 ); + BA[1] = _mm_add_epi32( BA[1], XA1 ); + BB[1] = _mm_add_epi32( BB[1], XB1 ); + BC[1] = _mm_add_epi32( BC[1], XC1 ); + BA[2] = _mm_add_epi32( BA[2], XA2 ); + BB[2] = _mm_add_epi32( BB[2], XB2 ); + BC[2] = _mm_add_epi32( BC[2], XC2 ); + BA[3] = _mm_add_epi32( BA[3], XA3 ); + BB[3] = _mm_add_epi32( BB[3], XB3 ); + BC[3] = _mm_add_epi32( BC[3], XC3 ); + + #undef ROL_1X32 + #undef ROR_1X32 + #undef SWAP_64 + #undef ROL32 + #undef ADD32 + #undef XOR + #undef TYPE +} + +void scrypt_core_simd128_3buf( uint32_t *X, uint32_t *V, const uint32_t N ) +{ + uint32_t *X0 = X; + uint32_t *X1 = X+32; + uint32_t *X2 = X+64; + uint32_t *V0 = V; + uint32_t *V1 = V + 32*N; + uint32_t *V2 = V + 64*N; + + salsa_simd128_shuffle_3buf( X0, X1, X2 ); + salsa_simd128_shuffle_3buf( X0+16, X1+16, X2+16 ); + + for ( int n = 0; n < N; n++ ) + { + #if defined(__AVX__) + + for ( int i = 0; i < 4; i++ ) + { + _mm256_stream_si256( (__m256i*)V0 + n*4 + i, casti_m256i( X0, i ) ); + _mm256_stream_si256( (__m256i*)V1 + n*4 + i, casti_m256i( X1, i ) ); + _mm256_stream_si256( (__m256i*)V2 + n*4 + i, casti_m256i( X2, i ) ); + } + + #elif defined(__SSE4_1__) + + for ( int i = 0; i < 8; i++ ) + { + _mm_stream_si128( (__m128i*)V0 + n*8 + i, casti_m128i( X0, i ) ); + _mm_stream_si128( (__m128i*)V1 + n*8 + i, casti_m128i( X1, i ) ); + _mm_stream_si128( (__m128i*)V2 + n*8 + i, casti_m128i( X2, i ) ); + } + + #else + + memcpy( &V0[ n*32 ], X0, 128 ); + memcpy( &V1[ n*32 ], X1, 128 ); + memcpy( &V2[ n*32 ], X2, 128 ); + + #endif + + salsa8_simd128_3buf( X0, X1, X2 , X0+16, X1+16, X2+16 ); + salsa8_simd128_3buf( X0+16, X1+16, X2+16, X0, X1, X2 ); + } + + for ( int n = 0; n < N; n++ ) + { + #if defined(__AVX2__) + + const int j0 = 4 * ( X0[16] & ( N-1 ) ); + const int j1 = 4 * ( X1[16] & ( N-1 ) ); + const int j2 = 4 * ( X2[16] & ( N-1 ) ); + + const __m256i v00 = _mm256_stream_load_si256( ( (__m256i*)V0 ) +j0 ); + const __m256i v10 = _mm256_stream_load_si256( ( (__m256i*)V1 ) +j1 ); + const __m256i v20 = _mm256_stream_load_si256( ( (__m256i*)V2 ) +j2 ); + const __m256i v01 = _mm256_stream_load_si256( ( (__m256i*)V0 ) +j0+1 ); + const __m256i v11 = _mm256_stream_load_si256( ( (__m256i*)V1 ) +j1+1 ); + const __m256i v21 = _mm256_stream_load_si256( ( (__m256i*)V2 ) +j2+1 ); + const __m256i v02 = _mm256_stream_load_si256( ( (__m256i*)V0 ) +j0+2 ); + const __m256i v12 = _mm256_stream_load_si256( ( (__m256i*)V1 ) +j1+2 ); + const __m256i v22 = _mm256_stream_load_si256( ( (__m256i*)V2 ) +j2+2 ); + const __m256i v03 = _mm256_stream_load_si256( ( (__m256i*)V0 ) +j0+3 ); + const __m256i v13 = _mm256_stream_load_si256( ( (__m256i*)V1 ) +j1+3 ); + const __m256i v23 = _mm256_stream_load_si256( ( (__m256i*)V2 ) +j2+3 ); + + casti_m256i( X0, 0 ) = _mm256_xor_si256( casti_m256i( X0, 0 ), v00 ); + casti_m256i( X1, 0 ) = _mm256_xor_si256( casti_m256i( X1, 0 ), v10 ); + casti_m256i( X2, 0 ) = _mm256_xor_si256( casti_m256i( X2, 0 ), v20 ); + casti_m256i( X0, 1 ) = _mm256_xor_si256( casti_m256i( X0, 1 ), v01 ); + casti_m256i( X1, 1 ) = _mm256_xor_si256( casti_m256i( X1, 1 ), v11 ); + casti_m256i( X2, 1 ) = _mm256_xor_si256( casti_m256i( X2, 1 ), 
v21 ); + casti_m256i( X0, 2 ) = _mm256_xor_si256( casti_m256i( X0, 2 ), v02 ); + casti_m256i( X1, 2 ) = _mm256_xor_si256( casti_m256i( X1, 2 ), v12 ); + casti_m256i( X2, 2 ) = _mm256_xor_si256( casti_m256i( X2, 2 ), v22 ); + casti_m256i( X0, 3 ) = _mm256_xor_si256( casti_m256i( X0, 3 ), v03 ); + casti_m256i( X1, 3 ) = _mm256_xor_si256( casti_m256i( X1, 3 ), v13 ); + casti_m256i( X2, 3 ) = _mm256_xor_si256( casti_m256i( X2, 3 ), v23 ); + + #else + + const int j0 = 8 * ( X0[16] & ( N-1 ) ); + const int j1 = 8 * ( X1[16] & ( N-1 ) ); + const int j2 = 8 * ( X2[16] & ( N-1 ) ); + for ( int i = 0; i < 8; i++ ) + { + const __m128i v0 = _mm_load_si128( ( (__m128i*)V0 ) +j0+i ); + const __m128i v1 = _mm_load_si128( ( (__m128i*)V1 ) +j1+i ); + const __m128i v2 = _mm_load_si128( ( (__m128i*)V2 ) +j2+i ); + casti_m128i( X0, i ) = _mm_xor_si128( casti_m128i( X0, i ), v0 ); + casti_m128i( X1, i ) = _mm_xor_si128( casti_m128i( X1, i ), v1 ); + casti_m128i( X2, i ) = _mm_xor_si128( casti_m128i( X2, i ), v2 ); + } + + #endif + + salsa8_simd128_3buf( X0, X1, X2 , X0+16, X1+16, X2+16 ); + salsa8_simd128_3buf( X0+16, X1+16, X2+16, X0, X1, X2 ); + } + + salsa_simd128_unshuffle_3buf( X0, X1, X2 ); + salsa_simd128_unshuffle_3buf( X0+16, X1+16, X2+16 ); + +} + + +#endif // SSE2 + + +// Reference, used only for testing. +// Tested OK. + +static void xor_salsa8(uint32_t * const B, const uint32_t * const C) +{ + uint32_t x0 = (B[ 0] ^= C[ 0]), + x1 = (B[ 1] ^= C[ 1]), + x2 = (B[ 2] ^= C[ 2]), + x3 = (B[ 3] ^= C[ 3]); + uint32_t x4 = (B[ 4] ^= C[ 4]), + x5 = (B[ 5] ^= C[ 5]), + x6 = (B[ 6] ^= C[ 6]), + x7 = (B[ 7] ^= C[ 7]); + uint32_t x8 = (B[ 8] ^= C[ 8]), + x9 = (B[ 9] ^= C[ 9]), + xa = (B[10] ^= C[10]), + xb = (B[11] ^= C[11]); + uint32_t xc = (B[12] ^= C[12]), + xd = (B[13] ^= C[13]), + xe = (B[14] ^= C[14]), + xf = (B[15] ^= C[15]); + + + #define ROL32( a, c ) ror32( a, c ) + #define ADD32( a, b ) ( (a)+(b) ) + #define XOR( a, b ) ( (a)^(b) ) + + SALSA_8ROUNDS; + + #undef ROL32 + #undef ADD32 + #undef XOR + + B[ 0] += x0; + B[ 1] += x1; + B[ 2] += x2; + B[ 3] += x3; + B[ 4] += x4; + B[ 5] += x5; + B[ 6] += x6; + B[ 7] += x7; + B[ 8] += x8; + B[ 9] += x9; + B[10] += xa; + B[11] += xb; + B[12] += xc; + B[13] += xd; + B[14] += xe; + B[15] += xf; +} + +/** + * @param X input/ouput + * @param V scratch buffer + * @param N factor (def. 
1024) + */ + + +void scrypt_core_1way( uint32_t *X, uint32_t *V, const uint32_t N ) +{ + for ( int n = 0; n < N; n++ ) + { + memcpy( &V[ n*32 ], X, 128 ); + xor_salsa8( &X[ 0], &X[16] ); + xor_salsa8( &X[16], &X[ 0] ); + } + for ( int n = 0; n < N; n++ ) + { + int j = 32 * ( X[16] & ( N - 1 ) ); + for ( int i = 0; i < 32; i++ ) + X[i] ^= V[ j+i ]; + xor_salsa8( &X[ 0], &X[16] ); + xor_salsa8( &X[16], &X[ 0] ); + } +} + + + diff --git a/algo/scrypt/scrypt-core-4way.h b/algo/scrypt/scrypt-core-4way.h new file mode 100644 index 00000000..6567733b --- /dev/null +++ b/algo/scrypt/scrypt-core-4way.h @@ -0,0 +1,70 @@ +#ifndef SCRYPT_CORE_4WAY_H__ +#define SCRYPT_CORE_4WAY_H__ + +#include "simd-utils.h" +#include +#include + +#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) + +void scrypt_core_16way( __m512i *X, __m512i *V, const uint32_t N ); + +// Serial SIMD over 4 way parallel +void scrypt_core_simd128_4way( __m128i *X, __m128i *V, const uint32_t N ); + +// 4 way parallel over serial SIMD +void scrypt_core_4way_simd128( __m512i *X, __m512i *V, const uint32_t N ); + +#endif + +#if defined(__AVX2__) + +void scrypt_core_8way( __m256i *X, __m256i *V, uint32_t N ); + +// 2 way parallel over SIMD128 +void scrypt_core_2way_simd128( __m256i *X, __m256i *V, const uint32_t N ); + +// Double buffered 2 way parallel over SIMD128 +void scrypt_core_2way_simd128_2buf( __m256i *X, __m256i *V, const uint32_t N ); + +// Triplee buffered 2 way parallel over SIMD128 +void scrypt_core_2way_simd128_3buf( __m256i *X, __m256i *V, const uint32_t N ); + +// Serial SIMD128 over 2 way parallel +void scrypt_core_simd128_2way( uint64_t *X, uint64_t *V, const uint32_t N ); + +// Double buffered simd over parallel +void scrypt_core_simd128_2way_2buf( uint64_t *X, uint64_t *V, const uint32_t N ); + +// Triple buffered 2 way +void scrypt_core_simd128_2way_3buf( uint64_t *X, uint64_t *V, const uint32_t N ); + +// Quadruple buffered +void scrypt_core_simd128_2way_4buf( uint64_t *X, uint64_t *V, const uint32_t N ); + +#endif + +#if defined(__SSE2__) + +// Parallel 4 way, 4x memory +void scrypt_core_4way( __m128i *X, __m128i *V, const uint32_t N ); + +// Linear SIMD 1 way, 1x memory, lowest +void scrypt_core_simd128( uint32_t *X, uint32_t *V, const uint32_t N ); + +// Double buffered, 2x memory +void scrypt_core_simd128_2buf( uint32_t *X, uint32_t *V, const uint32_t N ); + +// Triple buffered +void scrypt_core_simd128_3buf( uint32_t *X, uint32_t *V, const uint32_t N ); + +// Quadruple buffered, 4x memory +void scrypt_core_simd128_4buf( uint32_t *X, uint32_t *V, const uint32_t N ); + +#endif + +// For reference only +void scrypt_core_1way( uint32_t *X, uint32_t *V, const uint32_t N ); + +#endif + diff --git a/algo/scrypt/scrypt-core-ref.c b/algo/scrypt/scrypt-core-ref.c new file mode 100644 index 00000000..ec564ed2 --- /dev/null +++ b/algo/scrypt/scrypt-core-ref.c @@ -0,0 +1,206 @@ +#include "scrypt-core-ref.h" + +#define ROTL(a, b) (((a) << (b)) | ((a) >> (32 - (b)))) + +static void xor_salsa8(uint32_t * const B, const uint32_t * const C) +{ + uint32_t x0 = (B[ 0] ^= C[ 0]), + x1 = (B[ 1] ^= C[ 1]), + x2 = (B[ 2] ^= C[ 2]), + x3 = (B[ 3] ^= C[ 3]); + uint32_t x4 = (B[ 4] ^= C[ 4]), + x5 = (B[ 5] ^= C[ 5]), + x6 = (B[ 6] ^= C[ 6]), + x7 = (B[ 7] ^= C[ 7]); + uint32_t x8 = (B[ 8] ^= C[ 8]), + x9 = (B[ 9] ^= C[ 9]), + xa = (B[10] ^= C[10]), + xb = (B[11] ^= C[11]); + uint32_t xc = (B[12] ^= C[12]), + xd = (B[13] ^= C[13]), + xe = (B[14] ^= C[14]), + xf = (B[15] ^= C[15]); + + /* 
Operate on columns. */ + x4 ^= ROTL(x0 + xc, 7); + x9 ^= ROTL(x5 + x1, 7); + xe ^= ROTL(xa + x6, 7); + x3 ^= ROTL(xf + xb, 7); + x8 ^= ROTL(x4 + x0, 9); + xd ^= ROTL(x9 + x5, 9); + x2 ^= ROTL(xe + xa, 9); + x7 ^= ROTL(x3 + xf, 9); + xc ^= ROTL(x8 + x4, 13); + x1 ^= ROTL(xd + x9, 13); + x6 ^= ROTL(x2 + xe, 13); + xb ^= ROTL(x7 + x3, 13); + x0 ^= ROTL(xc + x8, 18); + x5 ^= ROTL(x1 + xd, 18); + xa ^= ROTL(x6 + x2, 18); + xf ^= ROTL(xb + x7, 18); + + /* Operate on rows. */ + x1 ^= ROTL(x0 + x3, 7); + x6 ^= ROTL(x5 + x4, 7); + xb ^= ROTL(xa + x9, 7); + xc ^= ROTL(xf + xe, 7); + x2 ^= ROTL(x1 + x0, 9); + x7 ^= ROTL(x6 + x5, 9); + x8 ^= ROTL(xb + xa, 9); + xd ^= ROTL(xc + xf, 9); + x3 ^= ROTL(x2 + x1, 13); + x4 ^= ROTL(x7 + x6, 13); + x9 ^= ROTL(x8 + xb, 13); + xe ^= ROTL(xd + xc, 13); + x0 ^= ROTL(x3 + x2, 18); + x5 ^= ROTL(x4 + x7, 18); + xa ^= ROTL(x9 + x8, 18); + xf ^= ROTL(xe + xd, 18); + + /* Operate on columns. */ + x4 ^= ROTL(x0 + xc, 7); + x9 ^= ROTL(x5 + x1, 7); + xe ^= ROTL(xa + x6, 7); + x3 ^= ROTL(xf + xb, 7); + x8 ^= ROTL(x4 + x0, 9); + xd ^= ROTL(x9 + x5, 9); + x2 ^= ROTL(xe + xa, 9); + x7 ^= ROTL(x3 + xf, 9); + xc ^= ROTL(x8 + x4, 13); + x1 ^= ROTL(xd + x9, 13); + x6 ^= ROTL(x2 + xe, 13); + xb ^= ROTL(x7 + x3, 13); + x0 ^= ROTL(xc + x8, 18); + x5 ^= ROTL(x1 + xd, 18); + xa ^= ROTL(x6 + x2, 18); + xf ^= ROTL(xb + x7, 18); + + /* Operate on rows. */ + x1 ^= ROTL(x0 + x3, 7); + x6 ^= ROTL(x5 + x4, 7); + xb ^= ROTL(xa + x9, 7); + xc ^= ROTL(xf + xe, 7); + x2 ^= ROTL(x1 + x0, 9); + x7 ^= ROTL(x6 + x5, 9); + x8 ^= ROTL(xb + xa, 9); + xd ^= ROTL(xc + xf, 9); + x3 ^= ROTL(x2 + x1, 13); + x4 ^= ROTL(x7 + x6, 13); + x9 ^= ROTL(x8 + xb, 13); + xe ^= ROTL(xd + xc, 13); + x0 ^= ROTL(x3 + x2, 18); + x5 ^= ROTL(x4 + x7, 18); + xa ^= ROTL(x9 + x8, 18); + xf ^= ROTL(xe + xd, 18); + + /* Operate on columns. */ + x4 ^= ROTL(x0 + xc, 7); + x9 ^= ROTL(x5 + x1, 7); + xe ^= ROTL(xa + x6, 7); + x3 ^= ROTL(xf + xb, 7); + x8 ^= ROTL(x4 + x0, 9); + xd ^= ROTL(x9 + x5, 9); + x2 ^= ROTL(xe + xa, 9); + x7 ^= ROTL(x3 + xf, 9); + xc ^= ROTL(x8 + x4, 13); + x1 ^= ROTL(xd + x9, 13); + x6 ^= ROTL(x2 + xe, 13); + xb ^= ROTL(x7 + x3, 13); + x0 ^= ROTL(xc + x8, 18); + x5 ^= ROTL(x1 + xd, 18); + xa ^= ROTL(x6 + x2, 18); + xf ^= ROTL(xb + x7, 18); + + /* Operate on rows. */ + x1 ^= ROTL(x0 + x3, 7); + x6 ^= ROTL(x5 + x4, 7); + xb ^= ROTL(xa + x9, 7); + xc ^= ROTL(xf + xe, 7); + x2 ^= ROTL(x1 + x0, 9); + x7 ^= ROTL(x6 + x5, 9); + x8 ^= ROTL(xb + xa, 9); + xd ^= ROTL(xc + xf, 9); + x3 ^= ROTL(x2 + x1, 13); + x4 ^= ROTL(x7 + x6, 13); + x9 ^= ROTL(x8 + xb, 13); + xe ^= ROTL(xd + xc, 13); + x0 ^= ROTL(x3 + x2, 18); + x5 ^= ROTL(x4 + x7, 18); + xa ^= ROTL(x9 + x8, 18); + xf ^= ROTL(xe + xd, 18); + + /* Operate on columns. */ + x4 ^= ROTL(x0 + xc, 7); + x9 ^= ROTL(x5 + x1, 7); + xe ^= ROTL(xa + x6, 7); + x3 ^= ROTL(xf + xb, 7); + x8 ^= ROTL(x4 + x0, 9); + xd ^= ROTL(x9 + x5, 9); + x2 ^= ROTL(xe + xa, 9); + x7 ^= ROTL(x3 + xf, 9); + xc ^= ROTL(x8 + x4, 13); + x1 ^= ROTL(xd + x9, 13); + x6 ^= ROTL(x2 + xe, 13); + xb ^= ROTL(x7 + x3, 13); + x0 ^= ROTL(xc + x8, 18); + x5 ^= ROTL(x1 + xd, 18); + xa ^= ROTL(x6 + x2, 18); + xf ^= ROTL(xb + x7, 18); + + /* Operate on rows. 
*/ + x1 ^= ROTL(x0 + x3, 7); + x6 ^= ROTL(x5 + x4, 7); + xb ^= ROTL(xa + x9, 7); + xc ^= ROTL(xf + xe, 7); + x2 ^= ROTL(x1 + x0, 9); + x7 ^= ROTL(x6 + x5, 9); + x8 ^= ROTL(xb + xa, 9); + xd ^= ROTL(xc + xf, 9); + x3 ^= ROTL(x2 + x1, 13); + x4 ^= ROTL(x7 + x6, 13); + x9 ^= ROTL(x8 + xb, 13); + xe ^= ROTL(xd + xc, 13); + x0 ^= ROTL(x3 + x2, 18); + x5 ^= ROTL(x4 + x7, 18); + xa ^= ROTL(x9 + x8, 18); + xf ^= ROTL(xe + xd, 18); + + B[ 0] += x0; + B[ 1] += x1; + B[ 2] += x2; + B[ 3] += x3; + B[ 4] += x4; + B[ 5] += x5; + B[ 6] += x6; + B[ 7] += x7; + B[ 8] += x8; + B[ 9] += x9; + B[10] += xa; + B[11] += xb; + B[12] += xc; + B[13] += xd; + B[14] += xe; + B[15] += xf; +} + +/** + * @param X input/output + * @param V scratch buffer + * @param N factor (def. 1024) + */ +void scrypt_core_ref(uint32_t *X, uint32_t *V, uint32_t N) +{ + for (uint32_t i = 0; i < N; i++) { + memcpy(&V[i * 32], X, 128); + xor_salsa8(&X[0], &X[16]); + xor_salsa8(&X[16], &X[0]); + } + for (uint32_t i = 0; i < N; i++) { + uint32_t j = 32 * (X[16] & (N - 1)); + for (uint8_t k = 0; k < 32; k++) + X[k] ^= V[j + k]; + xor_salsa8(&X[0], &X[16]); + xor_salsa8(&X[16], &X[0]); + } +} + diff --git a/algo/scrypt/scrypt.c b/algo/scrypt/scrypt.c index e35adbf5..c36411bd 100644 --- a/algo/scrypt/scrypt.c +++ b/algo/scrypt/scrypt.c @@ -28,10 +28,13 @@ */ #include "algo-gate-api.h" - #include #include #include +#include "algo/sha/sha-hash-4way.h" +#include "algo/sha/sha256-hash.h" +#include +#include "malloc-huge.h" static const uint32_t keypad[12] = { 0x80000000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x00000280 @@ -46,81 +49,230 @@ static const uint32_t finalblk[16] = { 0x00000001, 0x80000000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x00000620 }; -static __thread char *scratchbuf; -int scratchbuf_size = 0; +static const uint32_t sha256_initial_state[8] = +{ + 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A, + 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19 +}; + +#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) + +#define SCRYPT_THROUGHPUT 16 + +#elif defined(__AVX2__) + +#define SCRYPT_THROUGHPUT 8 + +#else + +#define SCRYPT_THROUGHPUT 4 + +#endif + +// static int scrypt_throughput = 0; + +static int scratchbuf_size = 0; + +static __thread uint32_t *scratchbuf = NULL; + +// change this to a constant to be used directly as input state arg +// vectors still need an init function. 
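// Illustration of the note above (a minimal sketch using only names defined in this patch):
// with the three-operand transforms the scalar code passes the constant directly, e.g.
//    sha256_transform_le( ostate, pad, sha256_initial_state );
// instead of the old sha256_init( ostate ); sha256_transform( ostate, pad, 0 ); pair,
// as done in HMAC_SHA256_80_init() below. The vector paths still broadcast the same
// constants into each lane via sha256_4way_init_state(), sha256_8way_init_state() and
// sha256_16way_init_state().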
+static inline void sha256_init_state( uint32_t *state ) +{ + state[ 0 ] = 0x6A09E667; + state[ 1 ] = 0xBB67AE85; + state[ 2 ] = 0x3C6EF372; + state[ 3 ] = 0xA54FF53A; + state[ 4 ] = 0x510E527F; + state[ 5 ] = 0x9B05688C; + state[ 6 ] = 0x1F83D9AB; + state[ 7 ] = 0x5BE0CD19; +} static inline void HMAC_SHA256_80_init(const uint32_t *key, uint32_t *tstate, uint32_t *ostate) { - uint32_t ihash[8]; - uint32_t pad[16]; - int i; + uint32_t ihash[8]; + uint32_t pad[16]; + int i; - /* tstate is assumed to contain the midstate of key */ - memcpy(pad, key + 16, 16); - memcpy(pad + 4, keypad, 48); - sha256_transform(tstate, pad, 0); - memcpy(ihash, tstate, 32); - - sha256_init(ostate); - for (i = 0; i < 8; i++) - pad[i] = ihash[i] ^ 0x5c5c5c5c; - for (; i < 16; i++) - pad[i] = 0x5c5c5c5c; - sha256_transform(ostate, pad, 0); - - sha256_init(tstate); - for (i = 0; i < 8; i++) - pad[i] = ihash[i] ^ 0x36363636; - for (; i < 16; i++) - pad[i] = 0x36363636; - sha256_transform(tstate, pad, 0); + /* tstate is assumed to contain the midstate of key */ + memcpy(pad, key + 16, 16); + memcpy(pad + 4, keypad, 48); + + sha256_transform_le( tstate, pad, tstate ); + + memcpy( ihash, tstate, 32 ); + + for ( i = 0; i < 8; i++ ) pad[i] = ihash[i] ^ 0x5c5c5c5c; + for ( ; i < 16; i++ ) pad[i] = 0x5c5c5c5c; + + sha256_transform_le( ostate, pad, sha256_initial_state ); + + for ( i = 0; i < 8; i++ ) pad[i] = ihash[i] ^ 0x36363636; + for ( ; i < 16; i++ ) pad[i] = 0x36363636; + + sha256_transform_le( tstate, pad, sha256_initial_state ); } static inline void PBKDF2_SHA256_80_128(const uint32_t *tstate, const uint32_t *ostate, const uint32_t *salt, uint32_t *output) { - uint32_t istate[8], ostate2[8]; - uint32_t ibuf[16], obuf[16]; - int i, j; + uint32_t istate[8], ostate2[8]; + uint32_t ibuf[16], obuf[16]; + int i, j; - memcpy(istate, tstate, 32); - sha256_transform(istate, salt, 0); - - memcpy(ibuf, salt + 16, 16); - memcpy(ibuf + 5, innerpad, 44); - memcpy(obuf + 8, outerpad, 32); - - for (i = 0; i < 4; i++) { - memcpy(obuf, istate, 32); - ibuf[4] = i + 1; - sha256_transform(obuf, ibuf, 0); - - memcpy(ostate2, ostate, 32); - sha256_transform(ostate2, obuf, 0); - for (j = 0; j < 8; j++) - output[8 * i + j] = swab32(ostate2[j]); - } + sha256_transform_le( istate, salt, tstate ); + + memcpy(ibuf, salt + 16, 16); + memcpy(ibuf + 5, innerpad, 44); + memcpy(obuf + 8, outerpad, 32); + + for (i = 0; i < 4; i++) + { + memcpy(obuf, istate, 32); + ibuf[4] = i + 1; + + sha256_transform_le( obuf, ibuf, obuf ); + sha256_transform_le( ostate2, obuf, ostate ); + + for (j = 0; j < 8; j++) + output[8 * i + j] = bswap_32( ostate2[j] ); + } } static inline void PBKDF2_SHA256_128_32(uint32_t *tstate, uint32_t *ostate, const uint32_t *salt, uint32_t *output) { - uint32_t buf[16]; - int i; - - sha256_transform(tstate, salt, 1); - sha256_transform(tstate, salt + 16, 1); - sha256_transform(tstate, finalblk, 0); - memcpy(buf, tstate, 32); - memcpy(buf + 8, outerpad, 32); - - sha256_transform(ostate, buf, 0); - for (i = 0; i < 8; i++) - output[i] = swab32(ostate[i]); + uint32_t buf[16]; + int i; + + sha256_transform_be( tstate, salt, tstate ); + sha256_transform_be( tstate, salt+16, tstate ); + sha256_transform_le( tstate, finalblk, tstate ); + + memcpy(buf, tstate, 32); + memcpy(buf + 8, outerpad, 32); + + sha256_transform_le( ostate, buf, ostate ); + + for (i = 0; i < 8; i++) + output[i] = bswap_32( ostate[i] ); +} + +#if defined(__SHA__) + +static inline void HMAC_SHA256_80_init_SHA_2BUF( const uint32_t *key0, + const uint32_t *key1, uint32_t 
*tstate0, uint32_t *tstate1, + uint32_t *ostate0, uint32_t *ostate1 ) +{ + uint32_t ihash0[8], ihash1[8], pad0[16], pad1[16]; + int i; + + memcpy( pad0, key0 + 16, 16 ); + memcpy( pad0 + 4, keypad, 48 ); + memcpy( pad1, key1 + 16, 16 ); + memcpy( pad1 + 4, keypad, 48 ); + + sha256_ni2way_transform_le( tstate0, tstate1, pad0, pad1, + tstate0, tstate1 ); + + memcpy( ihash0, tstate0, 32 ); + memcpy( ihash1, tstate1, 32 ); + + for ( i = 0; i < 8; i++ ) + { + pad0[i] = ihash0[i] ^ 0x5c5c5c5c; + pad1[i] = ihash1[i] ^ 0x5c5c5c5c; + } + for ( ; i < 16; i++ ) pad0[i] = pad1[i] = 0x5c5c5c5c; + + sha256_ni2way_transform_le( ostate0, ostate1, pad0, pad1, + sha256_initial_state, sha256_initial_state ); + + for ( i = 0; i < 8; i++ ) + { + pad0[i] = ihash0[i] ^ 0x36363636; + pad1[i] = ihash1[i] ^ 0x36363636; + } + for ( ; i < 16; i++ ) pad0[i] = pad1[i] = 0x36363636; + + sha256_ni2way_transform_le( tstate0, tstate1, pad0, pad1, + sha256_initial_state, sha256_initial_state ); +} + +static inline void PBKDF2_SHA256_80_128_SHA_2BUF( const uint32_t *tstate0, + const uint32_t *tstate1, uint32_t *ostate0, uint32_t *ostate1, + const uint32_t *salt0, const uint32_t *salt1, uint32_t *output0, + uint32_t *output1 ) +{ + uint32_t istate0[8], istate1[8], ostateb0[8], ostateb1[8]; + uint32_t ibuf0[16], obuf0[16], ibuf1[16], obuf1[16]; + int i, j; + + sha256_ni2way_transform_le( istate0, istate1, salt0, salt1, + tstate0, tstate1 ); + + memcpy( ibuf0, salt0 + 16, 16 ); + memcpy( ibuf0 + 5, innerpad, 44 ); + memcpy( obuf0 + 8, outerpad, 32 ); + memcpy( ibuf1, salt1 + 16, 16 ); + memcpy( ibuf1 + 5, innerpad, 44 ); + memcpy( obuf1 + 8, outerpad, 32 ); + + for ( i = 0; i < 4; i++ ) + { + memcpy( obuf0, istate0, 32 ); + memcpy( obuf1, istate1, 32 ); + ibuf0[4] = ibuf1[4] = i + 1; + + sha256_ni2way_transform_le( obuf0, obuf1, ibuf0, ibuf1, + obuf0, obuf1 ); + sha256_ni2way_transform_le( ostateb0, ostateb1, obuf0, obuf1, + ostate0, ostate1 ); + + for ( j = 0; j < 8; j++ ) + { + output0[ 8*i + j ] = bswap_32( ostateb0[j] ); + output1[ 8*i + j ] = bswap_32( ostateb1[j] ); + } + } +} + +static inline void PBKDF2_SHA256_128_32_SHA_2BUF( uint32_t *tstate0, + uint32_t *tstate1, uint32_t *ostate0, uint32_t *ostate1, + const uint32_t *salt0, const uint32_t *salt1, + uint32_t *output0, uint32_t *output1 ) +{ + uint32_t buf0[16], buf1[16]; + int i; + + sha256_ni2way_transform_be( tstate0, tstate1, salt0, salt1, + tstate0, tstate1 ); + sha256_ni2way_transform_be( tstate0, tstate1, salt0+16, salt1+16, + tstate0, tstate1 ); + sha256_ni2way_transform_le( tstate0, tstate1, finalblk, finalblk, + tstate0, tstate1 ); + + memcpy( buf0, tstate0, 32 ); + memcpy( buf0 + 8, outerpad, 32 ); + memcpy( buf1, tstate1, 32 ); + memcpy( buf1 + 8, outerpad, 32 ); + + sha256_ni2way_transform_le( ostate0, ostate1, buf0, buf1, + ostate0, ostate1 ); + + for ( i = 0; i < 8; i++ ) + { + output0[i] = bswap_32( ostate0[i] ); + output1[i] = bswap_32( ostate1[i] ); + } } + +#endif + #ifdef HAVE_SHA256_4WAY static const uint32_t keypad_4way[4 * 12] = { @@ -160,6 +312,8 @@ static const uint32_t outerpad_4way[4 * 8] = { 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000300, 0x00000300, 0x00000300, 0x00000300 }; + +/* static const uint32_t _ALIGN(16) finalblk_4way[4 * 16] = { 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x80000000, 0x80000000, 0x80000000, 0x80000000, @@ -178,37 +332,51 @@ static const uint32_t _ALIGN(16) finalblk_4way[4 * 16] = { 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000620, 0x00000620, 0x00000620, 0x00000620 }; +*/ -static inline 
void HMAC_SHA256_80_init_4way(const uint32_t *key, - uint32_t *tstate, uint32_t *ostate) +static inline void sha256_4way_init_state( void *state ) +{ + casti_m128i( state, 0 ) = _mm_set1_epi32( 0x6A09E667 ); + casti_m128i( state, 1 ) = _mm_set1_epi32( 0xBB67AE85 ); + casti_m128i( state, 2 ) = _mm_set1_epi32( 0x3C6EF372 ); + casti_m128i( state, 3 ) = _mm_set1_epi32( 0xA54FF53A ); + casti_m128i( state, 4 ) = _mm_set1_epi32( 0x510E527F ); + casti_m128i( state, 5 ) = _mm_set1_epi32( 0x9B05688C ); + casti_m128i( state, 6 ) = _mm_set1_epi32( 0x1F83D9AB ); + casti_m128i( state, 7 ) = _mm_set1_epi32( 0x5BE0CD19 ); +} + +static inline void HMAC_SHA256_80_init_4way( const uint32_t *key, + uint32_t *tstate, uint32_t *ostate ) { uint32_t _ALIGN(16) ihash[4 * 8]; uint32_t _ALIGN(16) pad[4 * 16]; int i; /* tstate is assumed to contain the midstate of key */ - memcpy(pad, key + 4 * 16, 4 * 16); - memcpy(pad + 4 * 4, keypad_4way, 4 * 48); - sha256_transform_4way(tstate, pad, 0); - memcpy(ihash, tstate, 4 * 32); - - sha256_init_4way(ostate); - for (i = 0; i < 4 * 8; i++) - pad[i] = ihash[i] ^ 0x5c5c5c5c; - for (; i < 4 * 16; i++) - pad[i] = 0x5c5c5c5c; - sha256_transform_4way(ostate, pad, 0); - - sha256_init_4way(tstate); - for (i = 0; i < 4 * 8; i++) - pad[i] = ihash[i] ^ 0x36363636; - for (; i < 4 * 16; i++) - pad[i] = 0x36363636; - sha256_transform_4way(tstate, pad, 0); + memcpy( pad, key + 4*16, 4*16 ); + memcpy( pad + 4*4, keypad_4way, 4*48 ); + + sha256_4way_transform_le( (__m128i*)ihash, (__m128i*)pad, + (const __m128i*)tstate ); + + sha256_4way_init_state( tstate ); + + for ( i = 0; i < 4*8; i++ ) pad[i] = ihash[i] ^ 0x5c5c5c5c; + for ( ; i < 4*16; i++ ) pad[i] = 0x5c5c5c5c; + + sha256_4way_transform_le( (__m128i*)ostate, (__m128i*)pad, + (const __m128i*)tstate ); + + for ( i = 0; i < 4*8; i++ ) pad[i] = ihash[i] ^ 0x36363636; + for ( ; i < 4*16; i++ ) pad[i] = 0x36363636; + + sha256_4way_transform_le( (__m128i*)tstate, (__m128i*)pad, + (const __m128i*)tstate ); } -static inline void PBKDF2_SHA256_80_128_4way(const uint32_t *tstate, - const uint32_t *ostate, const uint32_t *salt, uint32_t *output) +static inline void PBKDF2_SHA256_80_128_4way( const uint32_t *tstate, + const uint32_t *ostate, const uint32_t *salt, uint32_t *output ) { uint32_t _ALIGN(16) istate[4 * 8]; uint32_t _ALIGN(16) ostate2[4 * 8]; @@ -216,43 +384,62 @@ static inline void PBKDF2_SHA256_80_128_4way(const uint32_t *tstate, uint32_t _ALIGN(16) obuf[4 * 16]; int i, j; - memcpy(istate, tstate, 4 * 32); - sha256_transform_4way(istate, salt, 0); + sha256_4way_transform_le( (__m128i*)istate, (__m128i*)salt, + (const __m128i*)tstate ); memcpy(ibuf, salt + 4 * 16, 4 * 16); memcpy(ibuf + 4 * 5, innerpad_4way, 4 * 44); memcpy(obuf + 4 * 8, outerpad_4way, 4 * 32); - for (i = 0; i < 4; i++) { - memcpy(obuf, istate, 4 * 32); + for ( i = 0; i < 4; i++ ) + { ibuf[4 * 4 + 0] = i + 1; ibuf[4 * 4 + 1] = i + 1; ibuf[4 * 4 + 2] = i + 1; ibuf[4 * 4 + 3] = i + 1; - sha256_transform_4way(obuf, ibuf, 0); - memcpy(ostate2, ostate, 4 * 32); - sha256_transform_4way(ostate2, obuf, 0); - for (j = 0; j < 4 * 8; j++) - output[4 * 8 * i + j] = swab32(ostate2[j]); + sha256_4way_transform_le( (__m128i*)obuf, (__m128i*)ibuf, + (const __m128i*)istate ); + + sha256_4way_transform_le( (__m128i*)ostate2, (__m128i*)obuf, + (const __m128i*)ostate ); + + for ( j = 0; j < 4 * 8; j++ ) + output[4 * 8 * i + j] = bswap_32( ostate2[j] ); } } -static inline void PBKDF2_SHA256_128_32_4way(uint32_t *tstate, - uint32_t *ostate, const uint32_t *salt, uint32_t *output) +static 
inline void PBKDF2_SHA256_128_32_4way( uint32_t *tstate, + uint32_t *ostate, const uint32_t *salt, uint32_t *output ) { - uint32_t _ALIGN(16) buf[4 * 16]; + __m128i _ALIGN(64) final[ 8*16 ]; + uint32_t _ALIGN(64) buf[4 * 16]; int i; - sha256_transform_4way(tstate, salt, 1); - sha256_transform_4way(tstate, salt + 4 * 16, 1); - sha256_transform_4way(tstate, finalblk_4way, 0); - memcpy(buf, tstate, 4 * 32); + sha256_4way_transform_be( (__m128i*)tstate, (__m128i*)salt, + (const __m128i*)tstate ); + sha256_4way_transform_be( (__m128i*)tstate, (__m128i*)( salt + 4*16), + (const __m128i*)tstate ); + + final[ 0] = _mm_set1_epi32( 0x00000001 ); + final[ 1] = _mm_set1_epi32( 0x80000000 ); + final[ 2] = final[ 3] = final[ 4] = final[ 5] = final[ 6] + = final[ 7] = final[ 8] = final[ 9] = final[10] + = final[11] = final[12] = final[13] = final[14] + = _mm_setzero_si128(); + final[15] = _mm_set1_epi32 ( 0x00000620 ); + + sha256_4way_transform_le( (__m128i*)tstate, (__m128i*)final, + (const __m128i*)tstate ); + + memcpy(buf, tstate, 4 * 32); memcpy(buf + 4 * 8, outerpad_4way, 4 * 32); - sha256_transform_4way(ostate, buf, 0); - for (i = 0; i < 4 * 8; i++) - output[i] = swab32(ostate[i]); + sha256_4way_transform_le( (__m128i*)ostate, (__m128i*)buf, + (const __m128i*)ostate ); + + for ( i = 0; i < 4 * 8; i++ ) + output[i] = bswap_32( ostate[i] ); } #endif /* HAVE_SHA256_4WAY */ @@ -260,6 +447,7 @@ static inline void PBKDF2_SHA256_128_32_4way(uint32_t *tstate, #ifdef HAVE_SHA256_8WAY +/* static const uint32_t _ALIGN(32) finalblk_8way[8 * 16] = { 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, @@ -278,41 +466,52 @@ static const uint32_t _ALIGN(32) finalblk_8way[8 * 16] = { 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000620, 0x00000620, 0x00000620, 0x00000620, 0x00000620, 0x00000620, 0x00000620, 0x00000620 }; +*/ -static inline void HMAC_SHA256_80_init_8way(const uint32_t *key, - uint32_t *tstate, uint32_t *ostate) +static inline void sha256_8way_init_state( void *state ) +{ + casti_m256i( state, 0 ) = _mm256_set1_epi32( 0x6A09E667 ); + casti_m256i( state, 1 ) = _mm256_set1_epi32( 0xBB67AE85 ); + casti_m256i( state, 2 ) = _mm256_set1_epi32( 0x3C6EF372 ); + casti_m256i( state, 3 ) = _mm256_set1_epi32( 0xA54FF53A ); + casti_m256i( state, 4 ) = _mm256_set1_epi32( 0x510E527F ); + casti_m256i( state, 5 ) = _mm256_set1_epi32( 0x9B05688C ); + casti_m256i( state, 6 ) = _mm256_set1_epi32( 0x1F83D9AB ); + casti_m256i( state, 7 ) = _mm256_set1_epi32( 0x5BE0CD19 ); +} + +static inline void HMAC_SHA256_80_init_8way( const uint32_t *key, + uint32_t *tstate, uint32_t *ostate ) { uint32_t _ALIGN(32) ihash[8 * 8]; uint32_t _ALIGN(32) pad[8 * 16]; int i; - /* tstate is assumed to contain the midstate of key */ - memcpy(pad, key + 8 * 16, 8 * 16); - for (i = 0; i < 8; i++) - pad[8 * 4 + i] = 0x80000000; - memset(pad + 8 * 5, 0x00, 8 * 40); - for (i = 0; i < 8; i++) - pad[8 * 15 + i] = 0x00000280; - sha256_transform_8way(tstate, pad, 0); - memcpy(ihash, tstate, 8 * 32); - - sha256_init_8way(ostate); - for (i = 0; i < 8 * 8; i++) - pad[i] = ihash[i] ^ 0x5c5c5c5c; - for (; i < 8 * 16; i++) - pad[i] = 0x5c5c5c5c; - sha256_transform_8way(ostate, pad, 0); - - sha256_init_8way(tstate); - for (i = 0; i < 8 * 8; i++) - pad[i] = ihash[i] ^ 0x36363636; - for (; i < 8 * 16; i++) - pad[i] = 0x36363636; - sha256_transform_8way(tstate, pad, 0); + 
memcpy( pad, key + 8*16, 8*16 ); + for ( i = 0; i < 8; i++ ) pad[ 8*4 + i ] = 0x80000000; + memset( pad + 8*5, 0x00, 8*40 ); + for ( i = 0; i < 8; i++ ) pad[ 8*15 + i ] = 0x00000280; + + sha256_8way_transform_le( (__m256i*)ihash, (__m256i*)pad, + (const __m256i*)tstate ); + + sha256_8way_init_state( tstate ); + + for ( i = 0; i < 8*8; i++ ) pad[i] = ihash[i] ^ 0x5c5c5c5c; + for ( ; i < 8*16; i++ ) pad[i] = 0x5c5c5c5c; + + sha256_8way_transform_le( (__m256i*)ostate, (__m256i*)pad, + (const __m256i*)tstate ); + + for ( i = 0; i < 8*8; i++ ) pad[i] = ihash[i] ^ 0x36363636; + for ( ; i < 8*16; i++ ) pad[i] = 0x36363636; + + sha256_8way_transform_le( (__m256i*)tstate, (__m256i*)pad, + (const __m256i*)tstate ); } -static inline void PBKDF2_SHA256_80_128_8way(const uint32_t *tstate, - const uint32_t *ostate, const uint32_t *salt, uint32_t *output) +static inline void PBKDF2_SHA256_80_128_8way( const uint32_t *tstate, + const uint32_t *ostate, const uint32_t *salt, uint32_t *output ) { uint32_t _ALIGN(32) istate[8 * 8]; uint32_t _ALIGN(32) ostate2[8 * 8]; @@ -320,24 +519,20 @@ static inline void PBKDF2_SHA256_80_128_8way(const uint32_t *tstate, uint32_t _ALIGN(32) obuf[8 * 16]; int i, j; - memcpy(istate, tstate, 8 * 32); - sha256_transform_8way(istate, salt, 0); - - memcpy(ibuf, salt + 8 * 16, 8 * 16); - for (i = 0; i < 8; i++) - ibuf[8 * 5 + i] = 0x80000000; - memset(ibuf + 8 * 6, 0x00, 8 * 36); - for (i = 0; i < 8; i++) - ibuf[8 * 15 + i] = 0x000004a0; + sha256_8way_transform_le( (__m256i*)istate, (__m256i*)salt, + (const __m256i*)tstate ); + + memcpy( ibuf, salt + 8*16, 8*16 ); + for ( i = 0; i < 8; i++ ) ibuf[ 8*5 + i ] = 0x80000000; + memset( ibuf + 8*6, 0x00, 8*36 ); + for ( i = 0; i < 8; i++ ) ibuf[ 8*15 + i ] = 0x000004a0; - for (i = 0; i < 8; i++) - obuf[8 * 8 + i] = 0x80000000; - memset(obuf + 8 * 9, 0x00, 8 * 24); - for (i = 0; i < 8; i++) - obuf[8 * 15 + i] = 0x00000300; + for ( i = 0; i < 8; i++ ) obuf[ 8*8 + i ] = 0x80000000; + memset( obuf + 8*9, 0x00, 8*24 ); + for ( i = 0; i < 8; i++ ) obuf[ 8*15 + i ] = 0x00000300; - for (i = 0; i < 4; i++) { - memcpy(obuf, istate, 8 * 32); + for ( i = 0; i < 4; i++ ) + { ibuf[8 * 4 + 0] = i + 1; ibuf[8 * 4 + 1] = i + 1; ibuf[8 * 4 + 2] = i + 1; @@ -346,48 +541,194 @@ static inline void PBKDF2_SHA256_80_128_8way(const uint32_t *tstate, ibuf[8 * 4 + 5] = i + 1; ibuf[8 * 4 + 6] = i + 1; ibuf[8 * 4 + 7] = i + 1; - sha256_transform_8way(obuf, ibuf, 0); - - memcpy(ostate2, ostate, 8 * 32); - sha256_transform_8way(ostate2, obuf, 0); - for (j = 0; j < 8 * 8; j++) - output[8 * 8 * i + j] = swab32(ostate2[j]); + + sha256_8way_transform_le( (__m256i*)obuf, (__m256i*)ibuf, + (const __m256i*)istate ); + + sha256_8way_transform_le( (__m256i*)ostate2, (__m256i*)obuf, + (const __m256i*)ostate ); + + for ( j = 0; j < 8*8; j++ ) + output[ 8*8*i + j ] = bswap_32( ostate2[j] ); } } -static inline void PBKDF2_SHA256_128_32_8way(uint32_t *tstate, - uint32_t *ostate, const uint32_t *salt, uint32_t *output) +static inline void PBKDF2_SHA256_128_32_8way( uint32_t *tstate, + uint32_t *ostate, const uint32_t *salt, uint32_t *output ) { - uint32_t _ALIGN(32) buf[8 * 16]; + __m256i _ALIGN(128) final[ 8*16 ]; + uint32_t _ALIGN(128) buf[ 8*16 ]; int i; - sha256_transform_8way(tstate, salt, 1); - sha256_transform_8way(tstate, salt + 8 * 16, 1); - sha256_transform_8way(tstate, finalblk_8way, 0); - - memcpy(buf, tstate, 8 * 32); - for (i = 0; i < 8; i++) - buf[8 * 8 + i] = 0x80000000; - memset(buf + 8 * 9, 0x00, 8 * 24); - for (i = 0; i < 8; i++) - buf[8 * 15 + i] = 0x00000300; 
- sha256_transform_8way(ostate, buf, 0); - + sha256_8way_transform_be( (__m256i*)tstate, (__m256i*)salt, + (const __m256i*)tstate ); + sha256_8way_transform_be( (__m256i*)tstate, (__m256i*)( salt + 8*16), + (const __m256i*)tstate ); + + final[ 0] = _mm256_set1_epi32( 0x00000001 ); + final[ 1] = _mm256_set1_epi32( 0x80000000 ); + final[ 2] = final[ 3] = final[ 4] = final[ 5] = final[ 6] + = final[ 7] = final[ 8] = final[ 9] = final[10] + = final[11] = final[12] = final[13] = final[14] + = _mm256_setzero_si256(); + final[15] = _mm256_set1_epi32 ( 0x00000620 ); + + sha256_8way_transform_le( (__m256i*)tstate, final, + (const __m256i*)tstate ); + + memcpy( buf, tstate, 8*32 ); + for ( i = 0; i < 8; i++ ) buf[ 8*8 + i ] = 0x80000000; + memset( buf + 8*9, 0x00, 8*24 ); + for ( i = 0; i < 8; i++ ) buf[ 8*15 + i ] = 0x00000300; + + sha256_8way_transform_le( (__m256i*)ostate, (__m256i*)buf, + (const __m256i*)ostate ); + for (i = 0; i < 8 * 8; i++) - output[i] = swab32(ostate[i]); + output[i] = bswap_32(ostate[i]); } #endif /* HAVE_SHA256_8WAY */ +#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) -//#if defined(USE_ASM) && defined(__x86_64__) +static inline void sha256_16way_init_state( void *state ) +{ + casti_m512i( state, 0 ) = _mm512_set1_epi32( 0x6A09E667 ); + casti_m512i( state, 1 ) = _mm512_set1_epi32( 0xBB67AE85 ); + casti_m512i( state, 2 ) = _mm512_set1_epi32( 0x3C6EF372 ); + casti_m512i( state, 3 ) = _mm512_set1_epi32( 0xA54FF53A ); + casti_m512i( state, 4 ) = _mm512_set1_epi32( 0x510E527F ); + casti_m512i( state, 5 ) = _mm512_set1_epi32( 0x9B05688C ); + casti_m512i( state, 6 ) = _mm512_set1_epi32( 0x1F83D9AB ); + casti_m512i( state, 7 ) = _mm512_set1_epi32( 0x5BE0CD19 ); +} + +static inline void HMAC_SHA256_80_init_16way( const uint32_t *key, + uint32_t *tstate, uint32_t *ostate ) +{ + uint32_t _ALIGN(128) pad[16*16]; + uint32_t _ALIGN(128) ihash[16* 8]; + int i; + + memcpy( pad, key + 16*16, 16*16 ); + for ( i = 0; i < 16; i++ ) pad[ 16*4 + i ] = 0x80000000; + memset( pad + 16*5, 0x00, 16*40 ); + for ( i = 0; i < 16; i++ ) pad[ 16*15 + i ] = 0x00000280; + + sha256_16way_transform_le( (__m512i*)ihash, (__m512i*)pad, + (const __m512i*)tstate ); + + sha256_16way_init_state( tstate ); + + for ( i = 0; i < 16*8; i++ ) pad[i] = ihash[i] ^ 0x5c5c5c5c; + for ( ; i < 16*16; i++ ) pad[i] = 0x5c5c5c5c; + + sha256_16way_transform_le( (__m512i*)ostate, (__m512i*)pad, + (const __m512i*)tstate ); + + for ( i = 0; i < 16*8; i++ ) pad[i] = ihash[i] ^ 0x36363636; + for ( ; i < 16*16; i++ ) pad[i] = 0x36363636; + + sha256_16way_transform_le( (__m512i*)tstate, (__m512i*)pad, + (const __m512i*)tstate ); +} + + +static inline void PBKDF2_SHA256_80_128_16way( const uint32_t *tstate, + const uint32_t *ostate, const uint32_t *salt, uint32_t *output ) +{ + uint32_t _ALIGN(128) ibuf[ 16*16 ]; + uint32_t _ALIGN(128) obuf[ 16*16 ]; + uint32_t _ALIGN(128) istate[ 16*8 ]; + uint32_t _ALIGN(128) ostate2[ 16*8 ]; + int i, j; + + sha256_16way_transform_le( (__m512i*)istate, (__m512i*)salt, + (const __m512i*)tstate ); + + memcpy( ibuf, salt + 16*16, 16*16 ); + for ( i = 0; i < 16; i++ ) ibuf[ 16*5 + i ] = 0x80000000; + memset( ibuf + 16*6, 0x00, 16*36 ); + for ( i = 0; i < 16; i++ ) ibuf[ 16*15 + i ] = 0x000004a0; + + for ( i = 0; i < 16; i++ ) obuf[ 16*8 + i ] = 0x80000000; + memset( obuf + 16*9, 0x00, 16*24 ); + for ( i = 0; i < 16; i++ ) obuf[ 16*15 + i ] = 0x00000300; + + for ( i = 0; i < 4; i++ ) + { + ibuf[ 16*4 + 0 ] = i + 1; + ibuf[ 16*4 + 1 ] = i + 1; + ibuf[ 16*4 
+ 2 ] = i + 1; + ibuf[ 16*4 + 3 ] = i + 1; + ibuf[ 16*4 + 4 ] = i + 1; + ibuf[ 16*4 + 5 ] = i + 1; + ibuf[ 16*4 + 6 ] = i + 1; + ibuf[ 16*4 + 7 ] = i + 1; + ibuf[ 16*4 + 8 ] = i + 1; + ibuf[ 16*4 + 9 ] = i + 1; + ibuf[ 16*4 + 10 ] = i + 1; + ibuf[ 16*4 + 11 ] = i + 1; + ibuf[ 16*4 + 12 ] = i + 1; + ibuf[ 16*4 + 13 ] = i + 1; + ibuf[ 16*4 + 14 ] = i + 1; + ibuf[ 16*4 + 15 ] = i + 1; + + sha256_16way_transform_le( (__m512i*)obuf, (__m512i*)ibuf, + (const __m512i*)istate ); + + sha256_16way_transform_le( (__m512i*)ostate2, (__m512i*)obuf, + (const __m512i*)ostate ); + + for ( j = 0; j < 16*8; j++ ) + output[ 16*8*i + j ] = bswap_32( ostate2[j] ); + } +} + +static inline void PBKDF2_SHA256_128_32_16way( uint32_t *tstate, + uint32_t *ostate, const uint32_t *salt, uint32_t *output ) +{ + __m512i _ALIGN(128) final[ 16*16 ]; + uint32_t _ALIGN(128) buf[ 16*16 ]; + int i; + + sha256_16way_transform_be( (__m512i*)tstate, (__m512i*)salt, + (const __m512i*)tstate ); + sha256_16way_transform_be( (__m512i*)tstate, (__m512i*)( salt + 16*16), + (const __m512i*)tstate ); + + final[ 0] = _mm512_set1_epi32( 0x00000001 ); + final[ 1] = _mm512_set1_epi32( 0x80000000 ); + final[ 2] = final[ 3] = final[ 4] = final[ 5] = final[ 6] + = final[ 7] = final[ 8] = final[ 9] = final[10] + = final[11] = final[12] = final[13] = final[14] + = _mm512_setzero_si512(); + final[15] = _mm512_set1_epi32 ( 0x00000620 ); + + sha256_16way_transform_le( (__m512i*)tstate, final, + (const __m512i*)tstate ); + + memcpy( buf, tstate, 16*32 ); + for ( i = 0; i < 16; i++ ) buf[ 16*8 + i ] = 0x80000000; + memset( buf + 16*9, 0x00, 16*24 ); + for ( i = 0; i < 16; i++ ) buf[ 16*15 + i ] = 0x00000300; + + sha256_16way_transform_le( (__m512i*)ostate, (__m512i*)buf, + (const __m512i*)ostate ); + + for ( i = 0; i < 16*8; i++ ) + output[i] = bswap_32( ostate[i] ); +} + +#endif // AVX512 #define SCRYPT_MAX_WAYS 12 #define HAVE_SCRYPT_3WAY 1 -int scrypt_best_throughput(); void scrypt_core(uint32_t *X, uint32_t *V, int N); void scrypt_core_3way(uint32_t *X, uint32_t *V, int N); -#if defined(USE_AVX2) + +#if defined(__AVX2__) #undef SCRYPT_MAX_WAYS #define SCRYPT_MAX_WAYS 24 #define HAVE_SCRYPT_6WAY 1 @@ -396,331 +737,749 @@ void scrypt_core_6way(uint32_t *X, uint32_t *V, int N); #ifndef SCRYPT_MAX_WAYS #define SCRYPT_MAX_WAYS 1 -#define scrypt_best_throughput() 1 #endif -unsigned char *scrypt_buffer_alloc(int N) -{ - return (uchar*) malloc((size_t)N * SCRYPT_MAX_WAYS * 128 + 63); -} +#include "scrypt-core-4way.h" -static bool scrypt_1024_1_1_256(const uint32_t *input, uint32_t *output, - uint32_t *midstate, unsigned char *scratchpad, int N, int thr_id ) +/* +static bool scrypt_N_1_1_256( const uint32_t *input, uint32_t *output, + uint32_t *midstate, int N, int thr_id ) { uint32_t tstate[8], ostate[8]; uint32_t X[32]; - uint32_t *V; - V = (uint32_t *)(((uintptr_t)(scratchpad) + 63) & ~ (uintptr_t)(63)); - memcpy(tstate, midstate, 32); HMAC_SHA256_80_init(input, tstate, ostate); PBKDF2_SHA256_80_128(tstate, ostate, input, X); - scrypt_core(X, V, N); + scrypt_core_simd128( X, scratchbuf, N ); // woring +// scrypt_core_1way( X, V, N ); // working +// scrypt_core(X, V, N); PBKDF2_SHA256_128_32(tstate, ostate, X, output); return true; } +*/ -#ifdef HAVE_SHA256_4WAY -static int scrypt_1024_1_1_256_4way(const uint32_t *input, - uint32_t *output, uint32_t *midstate, unsigned char *scratchpad, int N, - int thrid ) +#if ( SCRYPT_THROUGHPUT == 8 ) + +static int scrypt_N_1_1_256_8way( const uint32_t *input, uint32_t *output, + uint32_t *midstate, int N, int thrid 
) { - uint32_t _ALIGN(128) tstate[4 * 8]; - uint32_t _ALIGN(128) ostate[4 * 8]; - uint32_t _ALIGN(128) W[4 * 32]; - uint32_t _ALIGN(128) X[4 * 32]; - uint32_t *V; - int i, k; - - V = (uint32_t *)(((uintptr_t)(scratchpad) + 63) & ~ (uintptr_t)(63)); + uint32_t _ALIGN(128) tstate[ 8*8 ]; + uint32_t _ALIGN(128) ostate[ 8*8 ]; + uint32_t _ALIGN(128) W[ 8*32 ]; + uint32_t _ALIGN(128) X[ 8*32 ]; - for (i = 0; i < 20; i++) - for (k = 0; k < 4; k++) - W[4 * i + k] = input[k * 20 + i]; + intrlv_8x32( W, input, input+ 20, input+ 40, input+ 60, + input+80, input+100, input+120, input+140, 640 ); + for ( int i = 0; i < 8; i++ ) + casti_m256i( tstate, i ) = _mm256_set1_epi32( midstate[i] ); - for (i = 0; i < 8; i++) - for (k = 0; k < 4; k++) - tstate[4 * i + k] = midstate[i]; + HMAC_SHA256_80_init_8way( W, tstate, ostate ); + PBKDF2_SHA256_80_128_8way( tstate, ostate, W, W ); - HMAC_SHA256_80_init_4way(W, tstate, ostate); + dintrlv_8x32( X, X+32, X+64, X+96, X+128, X+160, X+192, X+224, W, 1024 ); + + if ( opt_param_n > 0x4000 ) + { + scrypt_core_simd128_3buf( X, scratchbuf, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128_3buf( X+ 96, scratchbuf, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128_2buf( X+192, scratchbuf, N ); + } + else + { + intrlv_2x128( W, X, X+ 32, 1024 ); + intrlv_2x128( W+ 64, X+ 64, X+ 96, 1024 ); + intrlv_2x128( W+128, X+128, X+160, 1024 ); + intrlv_2x128( W+192, X+192, X+224, 1024 ); + scrypt_core_2way_simd128( (__m256i*) W, (__m256i*)scratchbuf, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_2way_simd128( (__m256i*)(W+ 64), (__m256i*)scratchbuf, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_2way_simd128( (__m256i*)(W+128), (__m256i*)scratchbuf, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_2way_simd128( (__m256i*)(W+192), (__m256i*)scratchbuf, N ); + dintrlv_2x128( X, X+ 32, W, 1024 ); + dintrlv_2x128( X+ 64, X+ 96, W+ 64, 1024 ); + dintrlv_2x128( X+128, X+160, W+128, 1024 ); + dintrlv_2x128( X+192, X+224, W+192, 1024 ); + } + + + + // SCRYPT CORE + + // AVX2 + + + // AVX2 + // disable de/interleave for testing. 
+// scrypt_core_8way( (__m256i*)W , (__m256i*)V, N ); - PBKDF2_SHA256_80_128_4way(tstate, ostate, W, W); +/* + // AVX2 working + intrlv_2x128( W, X, X+ 32, 1024 ); + intrlv_2x128( W+ 64, X+ 64, X+ 96, 1024 ); + intrlv_2x128( W+128, X+128, X+160, 1024 ); + intrlv_2x128( W+192, X+192, X+224, 1024 ); + + // working +// scrypt_core_2way_simd128_2buf( (__m256i*) W, (__m256i*)V, N ); +// if ( work_restart[thrid].restart ) return 0; +// scrypt_core_2way_simd128_2buf( (__m256i*)(W+128), (__m256i*)V, N ); + + // working + scrypt_core_2way_simd128( (__m256i*) W, (__m256i*)V, N ); if ( work_restart[thrid].restart ) return 0; + scrypt_core_2way_simd128( (__m256i*)(W+ 64), (__m256i*)V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_2way_simd128( (__m256i*)(W+128), (__m256i*)V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_2way_simd128( (__m256i*)(W+192), (__m256i*)V, N ); + + dintrlv_2x128( X, X+ 32, W, 1024 ); + dintrlv_2x128( X+ 64, X+ 96, W+ 64, 1024 ); + dintrlv_2x128( X+128, X+160, W+128, 1024 ); + dintrlv_2x128( X+192, X+224, W+192, 1024 ); +*/ + +/* + // AVX2 + intrlv_2x32( W, X , X+ 32, 1024 ); + intrlv_2x32( W+64, X+ 64, X+ 96, 1024 ); + intrlv_2x32( W+128, X+128, X+160, 1024 ); + intrlv_2x32( W+192, X+192, X+224, 1024 ); + + // working + scrypt_core_simd128_2way_2buf( (uint64_t*) W, (uint64_t*)V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128_2way_2buf( (uint64_t*)( W+128 ), (uint64_t*)V, N ); + +// scrypt_core_simd128_2way( (uint64_t*) W, (uint64_t*)V, N ); +// if ( work_restart[thrid].restart ) return 0; +// scrypt_core_simd128_2way( (uint64_t*)( W+ 64 ), (uint64_t*)V, N ); +// if ( work_restart[thrid].restart ) return 0; +// scrypt_core_simd128_2way( (uint64_t*)( W+128 ), (uint64_t*)V, N ); +// if ( work_restart[thrid].restart ) return 0; +// scrypt_core_simd128_2way( (uint64_t*)( W+192 ), (uint64_t*)V, N ); - for (i = 0; i < 32; i++) - for (k = 0; k < 4; k++) - X[k * 32 + i] = W[4 * i + k]; + dintrlv_2x32( X, X+ 32, W, 1024 ); + dintrlv_2x32( X+ 64, X+ 96, W+ 64, 1024 ); + dintrlv_2x32( X+128, X+160, W+128, 1024 ); + dintrlv_2x32( X+192, X+224, W+192, 1024 ); +*/ + + // SSE2 + +/* + // SSE2 working + intrlv_4x32( W, X, X+ 32, X+ 64, X+ 96, 1024 ); + intrlv_4x32( W+128, X+128 , X+160, X+192, X+224, 1024 ); + scrypt_core_4way( (__m128i*) W, (__m128i*)V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_4way( (__m128i*)(W+128), (__m128i*)V, N ); + dintrlv_4x32( X, X+ 32, X+ 64, X+ 96, W, 1024 ); + dintrlv_4x32( X+128, X+160, X+192, X+224, W+128, 1024 ); +*/ + +/* + // SSE2 + scrypt_core_simd128( X, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128( X+ 32, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128( X+ 64, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128( X+ 96, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128( X+128, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128( X+160, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128( X+192, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128( X+224, V, N ); +*/ +/* + // SSE2 working + scrypt_core_simd128_2buf( X, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128_2buf( X+64, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128_2buf( X+128, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128_2buf( X+192, 
V, N ); +*/ +/************** + scrypt_core_simd128_3buf( X, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128_3buf( X+ 96, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128_2buf( X+192, V, N ); +*************/ - scrypt_core(X + 0 * 32, V, N); - scrypt_core(X + 1 * 32, V, N); - scrypt_core(X + 2 * 32, V, N); - scrypt_core(X + 3 * 32, V, N); if ( work_restart[thrid].restart ) return 0; - for (i = 0; i < 32; i++) - for (k = 0; k < 4; k++) - W[4 * i + k] = X[k * 32 + i]; + intrlv_8x32( W, X, X+32, X+64, X+96, X+128, X+160, X+192, X+224, 1024 ); - PBKDF2_SHA256_128_32_4way(tstate, ostate, W, W); + PBKDF2_SHA256_128_32_8way( tstate, ostate, W, W ); - for (i = 0; i < 8; i++) - for (k = 0; k < 4; k++) - output[k * 8 + i] = W[4 * i + k]; + dintrlv_8x32( output, output+ 8, output+16, output+24, + output+32, output+40, output+48, output+56, W, 256 ); return 1; } -#endif /* HAVE_SHA256_4WAY */ -#ifdef HAVE_SCRYPT_3WAY +#endif // AVX2 + +#if ( SCRYPT_THROUGHPUT == 16 ) -static int scrypt_1024_1_1_256_3way(const uint32_t *input, - uint32_t *output, uint32_t *midstate, unsigned char *scratchpad, int N, - int thrid ) +static int scrypt_N_1_1_256_16way( const uint32_t *input, uint32_t *output, + uint32_t *midstate, int N, int thrid ) { - uint32_t _ALIGN(64) tstate[3 * 8], ostate[3 * 8]; - uint32_t _ALIGN(64) X[3 * 32]; - uint32_t *V; - - V = (uint32_t *)(((uintptr_t)(scratchpad) + 63) & ~ (uintptr_t)(63)); + uint32_t _ALIGN(128) tstate[ 16*8 ]; + uint32_t _ALIGN(128) ostate[ 16*8 ]; + uint32_t _ALIGN(128) W[ 16*32 ]; + uint32_t _ALIGN(128) X[ 16*32 ]; + + intrlv_16x32( W, input, input+ 20, input+ 40, input+ 60, + input+ 80, input+100, input+120, input+140, + input+160, input+180, input+200, input+220, + input+240, input+260, input+280, input+300, 640 ); + for ( int i = 0; i < 8; i++ ) + casti_m512i( tstate, i ) = _mm512_set1_epi32( midstate[i] ); + + HMAC_SHA256_80_init_16way( W, tstate, ostate ); + PBKDF2_SHA256_80_128_16way( tstate, ostate, W, W ); + + dintrlv_16x32( X, X+ 32, X+ 64, X+ 96, X+128, X+160, X+192, X+224, + X+256, X+288, X+320, X+352, X+384, X+416, X+448, X+480, + W, 1024 ); + + + if ( opt_param_n > 0x4000 ) + { + scrypt_core_simd128_3buf( X, scratchbuf, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128_3buf( X+ 96, scratchbuf, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128_2buf( X+192, scratchbuf, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128_3buf( X+256, scratchbuf, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128_3buf( X+352, scratchbuf, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128_2buf( X+448, scratchbuf, N ); + } + else + { + intrlv_4x128( W, X, X+ 32, X+ 64, X+ 96, 1024 ); + intrlv_4x128( W+128, X+128, X+160, X+192, X+224, 1024 ); + intrlv_4x128( W+256, X+256, X+288, X+320, X+352, 1024 ); + intrlv_4x128( W+384, X+384, X+416, X+448, X+480, 1024 ); + scrypt_core_4way_simd128( (__m512i*) W, (__m512i*)scratchbuf, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_4way_simd128( (__m512i*)(W+128), (__m512i*)scratchbuf, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_4way_simd128( (__m512i*)(W+256), (__m512i*)scratchbuf, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_4way_simd128( (__m512i*)(W+384), (__m512i*)scratchbuf, N ); + dintrlv_4x128( X, X+ 32, X+ 64, X+ 96, W, 1024 ); + dintrlv_4x128( X+128, X+160, X+192, X+224, W+128, 1024 ); + dintrlv_4x128( 
X+256, X+288, X+320, X+352, W+256, 1024 ); + dintrlv_4x128( X+384, X+416, X+448, X+480, W+384, 1024 ); + } + + // SCRYPT CORE + + + // AVX512 +/* + // AVX512 16 way working + intrlv_16x32( W, X, X+32, X+64, X+96, X+128, X+160, X+192, X+224, + X+256, X+256+32, X+256+64, X+256+96, X+256+128, + X+256+160, X+256+192, X+256+224, 1024 ); - memcpy(tstate + 0, midstate, 32); - memcpy(tstate + 8, midstate, 32); - memcpy(tstate + 16, midstate, 32); + scrypt_core_16way( (__m512i*)W , (__m512i*)V, N ); + + dintrlv_16x32( X, X+32, X+64, X+96, X+128, X+160, X+192, X+224, + X+256, X+256+32, X+256+64, X+256+96, X+256+128, + X+256+160, X+256+192, X+256+224, W, 1024 ); +*/ +/* + // AVX512 working + intrlv_4x32( W, X, X+ 32, X+ 64, X+ 96, 1024 ); + intrlv_4x32( W+128, X+128, X+160, X+192, X+224, 1024 ); + intrlv_4x32( W+256, X+256, X+256+ 32, X+256+ 64, X+256+ 96, 1024 ); + intrlv_4x32( W+256+128, X+256+128, X+256+160, X+256+192, X+256+224, 1024 ); + scrypt_core_simd128_4way( (__m128i*)W, (__m128i*)V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128_4way( (__m128i*)(W+128), (__m128i*)V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128_4way( (__m128i*)(W+256), (__m128i*)V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128_4way( (__m128i*)(W+256+128), (__m128i*)V, N ); + dintrlv_4x32( X, X+ 32, X+ 64, X+ 96, W, 1024 ); + dintrlv_4x32( X+128, X+160, X+192, X+224, W+128, 1024 ); + dintrlv_4x32( X+256, X+256+ 32, X+256+ 64, X+256+ 96, W+256, 1024 ); + dintrlv_4x32( X+256+128, X+256+160, X+256+192, X+256+224, W+256+128, 1024 ); +*/ +/* + // AVX512, working + intrlv_4x128( W, X, X+ 32, X+ 64, X+ 96, 1024 ); + intrlv_4x128( W+128, X+128, X+160, X+192, X+224, 1024 ); + intrlv_4x128( W+256, X+256, X+256+ 32, X+256+ 64, X+256+ 96, 1024 ); + intrlv_4x128( W+256+128, X+256+128, X+256+160, X+256+192, X+256+224, 1024 ); + scrypt_core_4way_simd128( (__m512i*)W, (__m512i*)V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_4way_simd128( (__m512i*)(W+128), (__m512i*)V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_4way_simd128( (__m512i*)(W+256), (__m512i*)V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_4way_simd128( (__m512i*)(W+256+128), (__m512i*)V, N ); + dintrlv_4x128( X, X+ 32, X+ 64, X+ 96, W, 1024 ); + dintrlv_4x128( X+128, X+160, X+192, X+224, W+128, 1024 ); + dintrlv_4x128( X+256, X+256+ 32, X+256+ 64, X+256+ 96, W+256, 1024 ); + dintrlv_4x128( X+256+128, X+256+160, X+256+192, X+256+224, W+256+128, 1024 ); +*/ - HMAC_SHA256_80_init(input + 0, tstate + 0, ostate + 0); - HMAC_SHA256_80_init(input + 20, tstate + 8, ostate + 8); - HMAC_SHA256_80_init(input + 40, tstate + 16, ostate + 16); + // AVX2 + +/* + // AVX2 + // disable de/interleave for testing. 
+ scrypt_core_8way( (__m256i*)W , (__m256i*)V, N ); +*/ + +/* + // AVX2 working + intrlv_2x128( W, X, X+ 32, 1024 ); + intrlv_2x128( W+ 64, X+ 64, X+ 96, 1024 ); + intrlv_2x128( W+128, X+128, X+160, 1024 ); + intrlv_2x128( W+192, X+192, X+224, 1024 ); + intrlv_2x128( W+256, X+256, X+256+ 32, 1024 ); + intrlv_2x128( W+256+ 64, X+256+ 64, X+256+ 96, 1024 ); + intrlv_2x128( W+256+128, X+256+128, X+256+160, 1024 ); + intrlv_2x128( W+256+192, X+256+192, X+256+224, 1024 ); + + // working + scrypt_core_2way_simd128_2buf( (__m256i*) W, (__m256i*)V, N ); if ( work_restart[thrid].restart ) return 0; + scrypt_core_2way_simd128_2buf( (__m256i*)(W+128), (__m256i*)V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_2way_simd128_2buf( (__m256i*)(W+256), (__m256i*)V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_2way_simd128_2buf( (__m256i*)(W+256+128), (__m256i*)V, N ); + + // working +// scrypt_core_2way_simd128( (__m256i*) W, (__m256i*)V, N ); +// if ( work_restart[thrid].restart ) return 0; +// scrypt_core_2way_simd128( (__m256i*)(W+ 64), (__m256i*)V, N ); +// if ( work_restart[thrid].restart ) return 0; +// scrypt_core_2way_simd128( (__m256i*)(W+128), (__m256i*)V, N ); +// if ( work_restart[thrid].restart ) return 0; +// scrypt_core_2way_simd128( (__m256i*)(W+192), (__m256i*)V, N ); +// if ( work_restart[thrid].restart ) return 0; +// scrypt_core_2way_simd128( (__m256i*)(W+256), (__m256i*)V, N ); +// if ( work_restart[thrid].restart ) return 0; +// scrypt_core_2way_simd128( (__m256i*)(W+256+ 64), (__m256i*)V, N ); +// if ( work_restart[thrid].restart ) return 0; +// scrypt_core_2way_simd128( (__m256i*)(W+256+128), (__m256i*)V, N ); +// if ( work_restart[thrid].restart ) return 0; +// scrypt_core_2way_simd128( (__m256i*)(W+256+192), (__m256i*)V, N ); + + dintrlv_2x128( X, X+ 32, W, 1024 ); + dintrlv_2x128( X+ 64, X+ 96, W+ 64, 1024 ); + dintrlv_2x128( X+128, X+160, W+128, 1024 ); + dintrlv_2x128( X+192, X+224, W+192, 1024 ); + dintrlv_2x128( X+256, X+256+ 32, W+256, 1024 ); + dintrlv_2x128( X+256+ 64, X+256+ 96, W+256+ 64, 1024 ); + dintrlv_2x128( X+256+128, X+256+160, W+256+128, 1024 ); + dintrlv_2x128( X+256+192, X+256+224, W+256+192, 1024 ); +*/ - PBKDF2_SHA256_80_128(tstate + 0, ostate + 0, input + 0, X + 0); - PBKDF2_SHA256_80_128(tstate + 8, ostate + 8, input + 20, X + 32); - PBKDF2_SHA256_80_128(tstate + 16, ostate + 16, input + 40, X + 64); +/* + // AVX2 + intrlv_2x32( W, X , X+ 32, 1024 ); + intrlv_2x32( W+64, X+ 64, X+ 96, 1024 ); + intrlv_2x32( W+128, X+128, X+160, 1024 ); + intrlv_2x32( W+192, X+192, X+224, 1024 ); + + // working +// scrypt_core_simd128_2way_2buf( (uint64_t*) W, (uint64_t*)V, N ); +// if ( work_restart[thrid].restart ) return 0; +// scrypt_core_simd128_2way_2buf( (uint64_t*)( W+128 ), (uint64_t*)V, N ); +// scrypt_core_simd128_2way_2buf( (uint64_t*) W, (uint64_t*)V, N ); +// if ( work_restart[thrid].restart ) return 0; +// scrypt_core_simd128_2way_2buf( (uint64_t*)( W+128 ), (uint64_t*)V, N ); + +// scrypt_core_simd128_2way( (uint64_t*) W, (uint64_t*)V, N ); +// if ( work_restart[thrid].restart ) return 0; +// scrypt_core_simd128_2way( (uint64_t*)( W+ 64 ), (uint64_t*)V, N ); +// if ( work_restart[thrid].restart ) return 0; +// scrypt_core_simd128_2way( (uint64_t*)( W+128 ), (uint64_t*)V, N ); +// if ( work_restart[thrid].restart ) return 0; +// scrypt_core_simd128_2way( (uint64_t*)( W+192 ), (uint64_t*)V, N ); + + dintrlv_2x32( X, X+ 32, W, 1024 ); + dintrlv_2x32( X+ 64, X+ 96, W+ 64, 1024 ); + dintrlv_2x32( X+128, X+160, W+128, 1024 
); + dintrlv_2x32( X+192, X+224, W+192, 1024 ); +*/ + + // SSE2 +/* + // SSE2 working + intrlv_4x32( W, X, X+ 32, X+ 64, X+ 96, 1024 ); + intrlv_4x32( W+128, X+128 , X+160, X+192, X+224, 1024 ); + scrypt_core_4way( (__m128i*) W, (__m128i*)V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_4way( (__m128i*)(W+128), (__m128i*)V, N ); + dintrlv_4x32( X, X+ 32, X+ 64, X+ 96, W, 1024 ); + dintrlv_4x32( X+128, X+160, X+192, X+224, W+128, 1024 ); +*/ +/* + // SSE2 + scrypt_core_simd128( X, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128( X+ 32, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128( X+ 64, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128( X+ 96, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128( X+128, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128( X+160, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128( X+192, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128( X+224, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128( X+256, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128( X+288, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128( X+320, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128( X+352, V, N ); if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128( X+384, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128( X+416, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128( X+448, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128( X+480, V, N ); +*/ +/* + // SSE2 working + scrypt_core_simd128_2buf( X, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128_2buf( X+64, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128_2buf( X+128, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128_2buf( X+192, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128_2buf( X+256, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128_2buf( X+320, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128_2buf( X+384, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128_2buf( X+448, V, N ); +*/ +/*************** + scrypt_core_simd128_3buf( X, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128_3buf( X+ 96, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128_2buf( X+192, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128_3buf( X+256, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128_3buf( X+352, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128_2buf( X+448, V, N ); +********************/ +/* + scrypt_core_3way( X, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_3way( X+ 96, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128_2buf( X+192, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_3way( X+256, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_3way( X+352, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128_2buf( X+448, V, N ); +*/ - scrypt_core_3way(X, V, N); if ( work_restart[thrid].restart ) return 0; - 
PBKDF2_SHA256_128_32(tstate + 0, ostate + 0, X + 0, output + 0); - PBKDF2_SHA256_128_32(tstate + 8, ostate + 8, X + 32, output + 8); - PBKDF2_SHA256_128_32(tstate + 16, ostate + 16, X + 64, output + 16); + intrlv_16x32( W, X, X+ 32, X+ 64, X+ 96, X+128, X+160, X+192, X+224, + X+256, X+288, X+320, X+352, X+384, X+416, X+448, X+480, + 1024 ); + + PBKDF2_SHA256_128_32_16way( tstate, ostate, W, W ); + + dintrlv_16x32( output, output+ 8, output+ 16, output+ 24, + output+ 32, output+ 40, output+ 48, output+ 56, + output+ 64, output+ 72, output+ 80, output+ 88, + output+ 96, output+104, output+112, output+120, W, 256 ); return 1; } -#ifdef HAVE_SHA256_4WAY -static bool scrypt_1024_1_1_256_12way(const uint32_t *input, - uint32_t *output, uint32_t *midstate, unsigned char *scratchpad, int N, - int thrid ) + +#endif // AVX512 + +#if 0 +static int scrypt_N_1_1_256_sha_2buf( const uint32_t *input, uint32_t *output, + uint32_t *midstate, int N, int thrid ) { - uint32_t _ALIGN(128) tstate[12 * 8]; - uint32_t _ALIGN(128) ostate[12 * 8]; - uint32_t _ALIGN(128) W[12 * 32]; - uint32_t _ALIGN(128) X[12 * 32]; - uint32_t *V; - int i, j, k; - - V = (uint32_t *)(((uintptr_t)(scratchpad) + 63) & ~ (uintptr_t)(63)); + uint32_t _ALIGN(128) tstate[ 2*8 ]; + uint32_t _ALIGN(128) ostate[ 2*8 ]; + uint32_t _ALIGN(128) W[ 2*32 ]; - for (j = 0; j < 3; j++) - for (i = 0; i < 20; i++) - for (k = 0; k < 4; k++) - W[128 * j + 4 * i + k] = input[80 * j + k * 20 + i]; + memcpy( tstate, midstate, 32 ); + memcpy( tstate+ 8, midstate, 32 ); - for (j = 0; j < 3; j++) - for (i = 0; i < 8; i++) - for (k = 0; k < 4; k++) - tstate[32 * j + 4 * i + k] = midstate[i]; + HMAC_SHA256_80_init_SHA_2BUF( input, input+20, tstate, tstate+8, + ostate, ostate+8 ); + PBKDF2_SHA256_80_128_SHA_2BUF( tstate, tstate+8, ostate, ostate+8, + input, input+20, W, W+32 ); - HMAC_SHA256_80_init_4way(W + 0, tstate + 0, ostate + 0); - HMAC_SHA256_80_init_4way(W + 128, tstate + 32, ostate + 32); - HMAC_SHA256_80_init_4way(W + 256, tstate + 64, ostate + 64); + scrypt_core_simd128_2buf( W, scratchbuf, N ); + if ( work_restart[thrid].restart ) return 0; - if ( work_restart[thrid].restart ) return 0; + PBKDF2_SHA256_128_32_SHA_2BUF( tstate, tstate+8, ostate, ostate+8, W, W+32, + output, output+8 ); + + return 1; +} - PBKDF2_SHA256_80_128_4way(tstate + 0, ostate + 0, W + 0, W + 0); - PBKDF2_SHA256_80_128_4way(tstate + 32, ostate + 32, W + 128, W + 128); - PBKDF2_SHA256_80_128_4way(tstate + 64, ostate + 64, W + 256, W + 256); +static int scrypt_N_1_1_256_4way_sha( const uint32_t *input, uint32_t *output, + uint32_t *midstate, int N, int thrid ) +{ + uint32_t _ALIGN(128) tstate[4 * 8]; + uint32_t _ALIGN(128) ostate[4 * 8]; + uint32_t _ALIGN(128) W[4 * 32]; + + memcpy( tstate, midstate, 32 ); + memcpy( tstate+ 8, midstate, 32 ); + memcpy( tstate+16, midstate, 32 ); + memcpy( tstate+24, midstate, 32 ); + + HMAC_SHA256_80_init( input, tstate, ostate ); + PBKDF2_SHA256_80_128( tstate, ostate, input, W ); + + HMAC_SHA256_80_init( input +20, tstate+ 8, ostate+ 8 ); + PBKDF2_SHA256_80_128( tstate+ 8, ostate+ 8, input +20, W+32 ); + + HMAC_SHA256_80_init( input +40, tstate+16, ostate+16 ); + PBKDF2_SHA256_80_128( tstate+16, ostate+16, input +40, W+64 ); + + HMAC_SHA256_80_init( input +60, tstate+24, ostate+24 ); + PBKDF2_SHA256_80_128( tstate+24, ostate+24, input +60, W+96 ); + +/* + // Working Linear single threaded SIMD + scrypt_core_simd128( W, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128( W+32, V, N ); + if ( work_restart[thrid].restart ) 
return 0; + scrypt_core_simd128( W+64, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128( W+96, V, N ); +*/ + // working, double buffered linear simd + scrypt_core_simd128_2buf( W, scratchbuf, N ); if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128_2buf( W+64, scratchbuf, N ); - for (j = 0; j < 3; j++) - for (i = 0; i < 32; i++) - for (k = 0; k < 4; k++) - X[128 * j + k * 32 + i] = W[128 * j + 4 * i + k]; +/* + scrypt_core_simd128_3buf( W, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128( W+96, V, N ); +*/ - scrypt_core_3way(X + 0 * 96, V, N); - scrypt_core_3way(X + 1 * 96, V, N); - scrypt_core_3way(X + 2 * 96, V, N); - scrypt_core_3way(X + 3 * 96, V, N); if ( work_restart[thrid].restart ) return 0; - for (j = 0; j < 3; j++) - for (i = 0; i < 32; i++) - for (k = 0; k < 4; k++) - W[128 * j + 4 * i + k] = X[128 * j + k * 32 + i]; + PBKDF2_SHA256_128_32( tstate, ostate, W, output ); + + PBKDF2_SHA256_128_32( tstate+ 8, ostate+ 8, W+32, output+ 8 ); - PBKDF2_SHA256_128_32_4way(tstate + 0, ostate + 0, W + 0, W + 0); - PBKDF2_SHA256_128_32_4way(tstate + 32, ostate + 32, W + 128, W + 128); - PBKDF2_SHA256_128_32_4way(tstate + 64, ostate + 64, W + 256, W + 256); + PBKDF2_SHA256_128_32( tstate+16, ostate+16, W+64, output+16 ); - for (j = 0; j < 3; j++) - for (i = 0; i < 8; i++) - for (k = 0; k < 4; k++) - output[32 * j + k * 8 + i] = W[128 * j + 4 * i + k]; + PBKDF2_SHA256_128_32( tstate+24, ostate+24, W+96, output+24 ); return 1; } -#endif /* HAVE_SHA256_4WAY */ - -#endif /* HAVE_SCRYPT_3WAY */ +#endif -#ifdef HAVE_SCRYPT_6WAY -static int scrypt_1024_1_1_256_24way( const uint32_t *input, - uint32_t *output, uint32_t *midstate, - unsigned char *scratchpad, int N, int thrid ) +#if ( SCRYPT_THROUGHPUT == 4 ) +static int scrypt_N_1_1_256_4way( const uint32_t *input, uint32_t *output, + uint32_t *midstate, int N, int thrid ) { - uint32_t _ALIGN(128) tstate[24 * 8]; - uint32_t _ALIGN(128) ostate[24 * 8]; - uint32_t _ALIGN(128) W[24 * 32]; - uint32_t _ALIGN(128) X[24 * 32]; - uint32_t *V; - int i, j, k; - - V = (uint32_t *)( ( (uintptr_t)(scratchpad) + 63 ) & ~ (uintptr_t)(63) ); - - for ( j = 0; j < 3; j++ ) - for ( i = 0; i < 20; i++ ) - for ( k = 0; k < 8; k++ ) - W[8 * 32 * j + 8 * i + k] = input[8 * 20 * j + k * 20 + i]; + uint32_t _ALIGN(128) tstate[ 4*8 ]; + uint32_t _ALIGN(128) ostate[ 4*8 ]; + uint32_t _ALIGN(128) W[ 4*32 ]; - for ( j = 0; j < 3; j++ ) - for ( i = 0; i < 8; i++ ) - for ( k = 0; k < 8; k++ ) - tstate[8 * 8 * j + 8 * i + k] = midstate[i]; + intrlv_4x32( W, input, input+20, input+40, input+60, 640 ); + for ( int i = 0; i < 8; i++ ) + casti_m128i( tstate, i ) = _mm_set1_epi32( midstate[i] ); - HMAC_SHA256_80_init_8way( W + 0, tstate + 0, ostate + 0 ); - HMAC_SHA256_80_init_8way( W + 256, tstate + 64, ostate + 64 ); - HMAC_SHA256_80_init_8way( W + 512, tstate + 128, ostate + 128 ); + HMAC_SHA256_80_init_4way(W, tstate, ostate); + PBKDF2_SHA256_80_128_4way(tstate, ostate, W, W); - if ( work_restart[thrid].restart ) return 0; - - PBKDF2_SHA256_80_128_8way( tstate + 0, ostate + 0, W + 0, W + 0 ); - PBKDF2_SHA256_80_128_8way( tstate + 64, ostate + 64, W + 256, W + 256 ); - PBKDF2_SHA256_80_128_8way( tstate + 128, ostate + 128, W + 512, W + 512 ); + if ( opt_param_n > 0x4000 ) + { + uint32_t _ALIGN(128) X[ 4*32 ]; + dintrlv_4x32( X, X+32, X+64, X+96, W, 1024 ); + scrypt_core_simd128_2buf( X, scratchbuf, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128_2buf( X+64, scratchbuf, N ); + intrlv_4x32( 
W, X, X+32, X+64, X+96, 1024 ); + } + else + scrypt_core_4way( (__m128i*)W, (__m128i*)scratchbuf, N ); - if ( work_restart[thrid].restart ) return 0; - for ( j = 0; j < 3; j++ ) - for ( i = 0; i < 32; i++ ) - for ( k = 0; k < 8; k++ ) - X[8 * 32 * j + k * 32 + i] = W[8 * 32 * j + 8 * i + k]; - scrypt_core_6way( X + 0 * 32, V, N ); - scrypt_core_6way( X + 6 * 32, V, N ); +// dintrlv_4x32( X, X+32, X+64, X+96, W, 1024 ); - if ( work_restart[thrid].restart ) return 0; +////// SCRYPT_CORE - scrypt_core_6way( X + 12 * 32, V, N ); - scrypt_core_6way( X + 18 * 32, V, N ); + + // working, simple 4 way parallel, best for scrypt +// scrypt_core_4way( (__m128i*)W, (__m128i*)V, N ); +/* + // Working Linear single threaded SIMD + scrypt_core_simd128( X, V, N ); if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128( X+32, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128( X+64, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128( X+96, V, N ); +*/ +/* + // working, double buffered linear simd, best for n2 + scrypt_core_simd128_2buf( X, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128_2buf( X+64, V, N ); +*/ +/* + scrypt_core_simd128_3buf( X, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128( X+96, V, N ); +*/ - for ( j = 0; j < 3; j++ ) - for ( i = 0; i < 32; i++ ) - for ( k = 0; k < 8; k++ ) - W[8 * 32 * j + 8 * i + k] = X[8 * 32 * j + k * 32 + i]; +//////////////////////////////// - PBKDF2_SHA256_128_32_8way( tstate + 0, ostate + 0, W + 0, W + 0 ); - PBKDF2_SHA256_128_32_8way( tstate + 64, ostate + 64, W + 256, W + 256 ); - PBKDF2_SHA256_128_32_8way( tstate + 128, ostate + 128, W + 512, W + 512 ); + if ( work_restart[thrid].restart ) return 0; + +// intrlv_4x32( W, X, X+32, X+64, X+96, 1024 ); + + PBKDF2_SHA256_128_32_4way(tstate, ostate, W, W); - for ( j = 0; j < 3; j++ ) - for ( i = 0; i < 8; i++ ) - for ( k = 0; k < 8; k++ ) - output[8 * 8 * j + k * 8 + i] = W[8 * 32 * j + 8 * i + k]; + dintrlv_4x32( output, output+8, output+16, output+24, W, 256 ); return 1; } -#endif /* HAVE_SCRYPT_6WAY */ +#endif // SCRYPT_THROUGHPUT == 4 + +//#endif // SHA extern int scanhash_scrypt( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ) { + uint32_t _ALIGN(64) hash[ 8*SCRYPT_THROUGHPUT ]; + uint32_t _ALIGN(64) data[ 20*SCRYPT_THROUGHPUT ]; uint32_t *pdata = work->data; uint32_t *ptarget = work->target; - uint32_t data[SCRYPT_MAX_WAYS * 20], hash[SCRYPT_MAX_WAYS * 8]; - uint32_t midstate[8]; - uint32_t n = pdata[19] - 1; - int thr_id = mythr->id; // thr_id arg is deprecated - int throughput = scrypt_best_throughput(); - int i; + uint32_t midstate[8]; + uint32_t n = pdata[19] - 1; + int thr_id = mythr->id; + int i; volatile uint8_t *restart = &(work_restart[thr_id].restart); -#ifdef HAVE_SHA256_4WAY - if (sha256_use_4way()) - throughput *= 4; -#endif - -// applog(LOG_INFO,"Scrypt thoughput %d",throughput); + for ( i = 0; i < SCRYPT_THROUGHPUT; i++ ) + memcpy( data + i * 20, pdata, 80 ); - for (i = 0; i < throughput; i++) - memcpy(data + i * 20, pdata, 80); - - sha256_init(midstate); - sha256_transform(midstate, data, 0); - - do { + sha256_transform_le( midstate, data, sha256_initial_state ); + + do { bool rc = true; - for (i = 0; i < throughput; i++) - data[i * 20 + 19] = ++n; - -#if defined(HAVE_SHA256_4WAY) - if (throughput == 4) - rc = scrypt_1024_1_1_256_4way(data, hash, midstate, - scratchbuf, scratchbuf_size, thr_id ); - else -#endif -#if 
defined(HAVE_SCRYPT_3WAY) && defined(HAVE_SHA256_4WAY) - if (throughput == 12) - rc = scrypt_1024_1_1_256_12way(data, hash, midstate, - scratchbuf, scratchbuf_size, thr_id ); - else -#endif -#if defined(HAVE_SCRYPT_6WAY) - if (throughput == 24) - rc = scrypt_1024_1_1_256_24way(data, hash, midstate, - scratchbuf, scratchbuf_size, thr_id ); - else -#endif -#if defined(HAVE_SCRYPT_3WAY) - if (throughput == 3) - rc = scrypt_1024_1_1_256_3way(data, hash, midstate, - scratchbuf, scratchbuf_size, thr_id ); - else + for ( i = 0; i < SCRYPT_THROUGHPUT; i++ ) data[ i*20 + 19 ] = ++n; + +//#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if ( SCRYPT_THROUGHPUT == 16 ) +// if ( SCRYPT_THROUGHPUT == 16 ) + rc = scrypt_N_1_1_256_16way( data, hash, midstate, opt_param_n, + thr_id ); +// else +//#endif +//#if defined(__AVX2__) +#elif ( SCRYPT_THROUGHPUT == 8 ) +// if ( SCRYPT_THROUGHPUT == 8 ) + rc = scrypt_N_1_1_256_8way( data, hash, midstate, opt_param_n, + thr_id ); +// else +//#endif +#elif ( SCRYPT_THROUGHPUT == 4 ) +// if ( SCRYPT_THROUGHPUT == 4 ) // slower on Ryzen than 8way +//#if defined(__SHA__) +// rc = scrypt_N_1_1_256_4way_sha( data, hash, midstate, opt_param_n, +// thr_id ); +//#else + rc = scrypt_N_1_1_256_4way( data, hash, midstate, opt_param_n, + thr_id ); +#else + +#error "Invalid SCRYPT_THROUGHPUT" + #endif - rc = scrypt_1024_1_1_256(data, hash, midstate, scratchbuf, - scratchbuf_size, thr_id ); - +/* +#if defined(__SHA__) + else + if ( SCRYPT_THROUGHPUT == 2 ) // slower on Ryzen than 4way_sha & 8way + rc = scrypt_N_1_1_256_sha_2buf( data, hash, midstate, opt_param_n, + thr_id ); +#endif + else // should never get here + rc = scrypt_N_1_1_256( data, hash, midstate, opt_param_n, thr_id ); +*/ + + // test the hash if ( rc ) - for ( i = 0; i < throughput; i++ ) + for ( i = 0; i < SCRYPT_THROUGHPUT; i++ ) { - if ( unlikely( valid_hash( hash + i * 8, ptarget ) ) ) + if ( unlikely( valid_hash( hash + i*8, ptarget ) && !opt_benchmark ) ) { - pdata[19] = data[i * 20 + 19]; +// applog( LOG_INFO, "Thread %d, Lane %d", thr_id,i ); + pdata[19] = data[i * 20 + 19]; submit_solution( work, hash + i * 8, mythr ); - } + } } - } while ( likely( ( n < ( max_nonce - throughput ) ) && !(*restart) ) ); + + + } while ( likely( ( n < ( max_nonce - SCRYPT_THROUGHPUT ) ) && !(*restart) ) ); *hashes_done = n - pdata[19]; pdata[19] = n; @@ -729,28 +1488,74 @@ extern int scanhash_scrypt( struct work *work, uint32_t max_nonce, bool scrypt_miner_thread_init( int thr_id ) { - scratchbuf = scrypt_buffer_alloc( scratchbuf_size ); - if ( scratchbuf ) - return true; - applog( LOG_ERR, "Thread %u: Scrypt buffer allocation failed", thr_id ); - return false; + scratchbuf = malloc_hugepages( scratchbuf_size ); + if ( scratchbuf ) + { + if ( opt_debug ) + applog( LOG_NOTICE, "Thread %u is using huge pages", thr_id ); + } + else + scratchbuf = _mm_malloc( scratchbuf_size, 128 ); + + if ( scratchbuf ) return true; + + applog( LOG_ERR, "Thread %u: Scrypt buffer allocation failed", thr_id ); + return false; } bool register_scrypt_algo( algo_gate_t* gate ) { - gate->optimizations = SSE2_OPT | AVX2_OPT; - gate->miner_thread_init =(void*)&scrypt_miner_thread_init; - gate->scanhash = (void*)&scanhash_scrypt; - opt_target_factor = 65536.0; - - if ( !opt_param_n ) - { - opt_param_n = 1024; - scratchbuf_size = 1024; - } - else - scratchbuf_size = opt_param_n; - applog(LOG_INFO,"Scrypt paramaters: N= %d, R= 1.", opt_param_n ); - return true; +//#if defined(__SHA__) +// 
gate->optimizations = SSE2_OPT | SHA_OPT; +//#else + gate->optimizations = SSE2_OPT | SSE42_OPT | AVX_OPT | AVX2_OPT | AVX512_OPT; +//#endif + gate->miner_thread_init =(void*)&scrypt_miner_thread_init; + gate->scanhash = (void*)&scanhash_scrypt; + opt_target_factor = 65536.0; + opt_param_n = opt_param_n ? opt_param_n : 1024; + applog( LOG_INFO,"Scrypt parameters: N= %d, R= 1", opt_param_n ); + +// scrypt_throughput can be defined at compile time and used to replace +// MAX_WAYS to reduce memory usage. + +#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +// scrypt_throughput = 16; + if ( opt_param_n > 0x4000 ) + scratchbuf_size = opt_param_n * 3 * 128; // 3 buf + else + scratchbuf_size = opt_param_n * 4 * 128; // 4 way + +/* SHA is slower than AVX2 on Ryzen +#elif defined(__SHA__) + scrypt_throughput = 4; + scratchbuf_size = opt_param_n * 2 * 128; // 2 buf +*/ + +#elif defined(__AVX2__) +// scrypt_throughput = 8; + if ( opt_param_n > 0x4000 ) + scratchbuf_size = opt_param_n * 3 * 128; // 3 buf + else + scratchbuf_size = opt_param_n * 2 * 128; // 2 way +#else +// scrypt_throughput = 4; + if ( opt_param_n > 0x4000 ) + scratchbuf_size = opt_param_n * 2 * 128; // 2 buf + else + scratchbuf_size = opt_param_n * 4 * 128; // 4 way +#endif + + char t_units[4] = {0}; + char d_units[4] = {0}; + double t_size = (double)scratchbuf_size; + double d_size = (double)scratchbuf_size * opt_n_threads; + + format_number_si( &t_size, t_units ); + format_number_si( &d_size, d_units ); + applog( LOG_INFO,"Throughput %d/thr, Buffer %.0f %siB/thr, Total %.0f %siB\n", + SCRYPT_THROUGHPUT, t_size, t_units, d_size, d_units ); + + return true; }; diff --git a/algo/sha/hmac-sha256-hash.c b/algo/sha/hmac-sha256-hash.c index 3c2f4d20..2cdf9c82 100644 --- a/algo/sha/hmac-sha256-hash.c +++ b/algo/sha/hmac-sha256-hash.c @@ -39,17 +39,10 @@ void SHA256_Buf( const void * in, size_t len, uint8_t digest[32] ) { -#if defined(HMAC_SPH_SHA) - sph_sha256_context ctx; - sph_sha256_init( &ctx ); - sph_sha256( &ctx, in, len ); - sph_sha256_close( &ctx, digest ); -#else - SHA256_CTX ctx; - SHA256_Init( &ctx ); - SHA256_Update( &ctx, in, len ); - SHA256_Final( digest, &ctx ); -#endif + sha256_context ctx; + sha256_ctx_init( &ctx ); + sha256_update( &ctx, in, len ); + sha256_final( &ctx, digest ); } /** @@ -71,7 +64,7 @@ HMAC_SHA256_Buf( const void *K, size_t Klen, const void *in, size_t len, void HMAC_SHA256_Init( HMAC_SHA256_CTX *ctx, const void *_K, size_t Klen ) { - unsigned char pad[64]; + unsigned char pad[64] __attribute__ ((aligned (64))); unsigned char khash[32]; const unsigned char * K = _K; size_t i; @@ -79,51 +72,28 @@ HMAC_SHA256_Init( HMAC_SHA256_CTX *ctx, const void *_K, size_t Klen ) /* If Klen > 64, the key is really SHA256(K). */ if ( Klen > 64 ) { - -#if defined(HMAC_SPH_SHA) - sph_sha256_init( &ctx->ictx ); - sph_sha256( &ctx->ictx, K, Klen ); - sph_sha256_close( &ctx->ictx, khash ); -#else - SHA256_Init( &ctx->ictx ); - SHA256_Update( &ctx->ictx, K, Klen ); - SHA256_Final( khash, &ctx->ictx ); -#endif - K = khash; - Klen = 32; + sha256_ctx_init( &ctx->ictx ); + sha256_update( &ctx->ictx, K, Klen ); + sha256_final( &ctx->ictx, khash ); + K = khash; + Klen = 32; } /* Inner SHA256 operation is SHA256(K xor [block of 0x36] || data).
*/ -#if defined(HMAC_SPH_SHA) - sph_sha256_init( &ctx->ictx ); -#else - SHA256_Init( &ctx->ictx ); -#endif + sha256_ctx_init( &ctx->ictx ); for ( i = 0; i < Klen; i++ ) pad[i] = K[i] ^ 0x36; memset( pad + Klen, 0x36, 64 - Klen ); -#if defined(HMAC_SPH_SHA) - sph_sha256( &ctx->ictx, pad, 64 ); -#else - SHA256_Update( &ctx->ictx, pad, 64 ); -#endif + sha256_update( &ctx->ictx, pad, 64 ); /* Outer SHA256 operation is SHA256(K xor [block of 0x5c] || hash). */ -#if defined(HMAC_SPH_SHA) - sph_sha256_init( &ctx->octx ); -#else - SHA256_Init( &ctx->octx ); -#endif + sha256_ctx_init( &ctx->octx ); for ( i = 0; i < Klen; i++ ) pad[i] = K[i] ^ 0x5c; memset( pad + Klen, 0x5c, 64 - Klen ); -#if defined(HMAC_SPH_SHA) - sph_sha256( &ctx->octx, pad, 64 ); -#else - SHA256_Update( &ctx->octx, pad, 64 ); -#endif + sha256_update( &ctx->octx, pad, 64 ); } /* Add bytes to the HMAC-SHA256 operation. */ @@ -131,33 +101,17 @@ void HMAC_SHA256_Update( HMAC_SHA256_CTX *ctx, const void *in, size_t len ) { /* Feed data to the inner SHA256 operation. */ -#if defined(HMAC_SPH_SHA) - sph_sha256( &ctx->ictx, in, len ); -#else - SHA256_Update( &ctx->ictx, in, len ); -#endif + sha256_update( &ctx->ictx, in, len ); } /* Finish an HMAC-SHA256 operation. */ void -HMAC_SHA256_Final( unsigned char digest[32], HMAC_SHA256_CTX *ctx ) +HMAC_SHA256_Final( void *digest, HMAC_SHA256_CTX *ctx ) { - unsigned char ihash[32]; - -#if defined(HMAC_SPH_SHA) - sph_sha256_close( &ctx->ictx, ihash ); - sph_sha256( &ctx->octx, ihash, 32 ); - sph_sha256_close( &ctx->octx, digest ); -#else - /* Finish the inner SHA256 operation. */ - SHA256_Final( ihash, &ctx->ictx ); - - /* Feed the inner hash to the outer SHA256 operation. */ - SHA256_Update( &ctx->octx, ihash, 32 ); - - /* Finish the outer SHA256 operation. */ - SHA256_Final( digest, &ctx->octx ); -#endif + uint32_t ihash[8] __attribute__ ((aligned (32))); + sha256_final( &ctx->ictx, ihash ); + sha256_update( &ctx->octx, ihash, 32 ); + sha256_final( &ctx->octx, digest ); } /** @@ -170,8 +124,10 @@ PBKDF2_SHA256( const uint8_t *passwd, size_t passwdlen, const uint8_t *salt, size_t saltlen, uint64_t c, uint8_t *buf, size_t dkLen ) { HMAC_SHA256_CTX PShctx, hctx; - uint8_t _ALIGN(128) T[32]; - uint8_t _ALIGN(128) U[32]; + uint64_t _ALIGN(128) T[4]; + uint64_t _ALIGN(128) U[4]; +// uint8_t _ALIGN(128) T[32]; +// uint8_t _ALIGN(128) U[32]; uint32_t ivec; size_t i, clen; uint64_t j; @@ -207,10 +163,10 @@ PBKDF2_SHA256( const uint8_t *passwd, size_t passwdlen, const uint8_t *salt, // _mm_xor_si128( ((__m128i*)T)[0], ((__m128i*)U)[0] ); // _mm_xor_si128( ((__m128i*)T)[1], ((__m128i*)U)[1] ); -// for ( k = 0; k < 4; k++ ) T[k] ^= U[k]; + for ( k = 0; k < 4; k++ ) T[k] ^= U[k]; - for ( k = 0; k < 32; k++ ) - T[k] ^= U[k]; +// for ( k = 0; k < 32; k++ ) +// T[k] ^= U[k]; } /* Copy as many bytes as necessary into buf. 
*/ diff --git a/algo/sha/hmac-sha256-hash.h b/algo/sha/hmac-sha256-hash.h index 41e5673a..7a281df8 100644 --- a/algo/sha/hmac-sha256-hash.h +++ b/algo/sha/hmac-sha256-hash.h @@ -29,30 +29,20 @@ #ifndef HMAC_SHA256_H__ #define HMAC_SHA256_H__ -//#define HMAC_SSL_SHA 1 -#define HMAC_SPH_SHA 1 - #include #include -#include "sph_sha2.h" -#include - +#include "sha256-hash.h" typedef struct HMAC_SHA256Context { -#if defined(HMAC_SPH_SHA) - sph_sha256_context ictx; - sph_sha256_context octx; -#else - SHA256_CTX ictx; - SHA256_CTX octx; -#endif + sha256_context ictx; + sha256_context octx; } HMAC_SHA256_CTX; void SHA256_Buf( const void *, size_t len, uint8_t digest[32] ); void HMAC_SHA256_Init( HMAC_SHA256_CTX *, const void *, size_t ); void HMAC_SHA256_Update( HMAC_SHA256_CTX *, const void *, size_t ); -void HMAC_SHA256_Final( unsigned char [32], HMAC_SHA256_CTX * ); +void HMAC_SHA256_Final( void*, HMAC_SHA256_CTX * ); void HMAC_SHA256_Buf( const void *, size_t Klen, const void *, size_t len, uint8_t digest[32] ); diff --git a/algo/sha/sha-hash-4way.h b/algo/sha/sha-hash-4way.h index 3a0c61b0..6428e2ba 100644 --- a/algo/sha/sha-hash-4way.h +++ b/algo/sha/sha-hash-4way.h @@ -51,7 +51,6 @@ typedef struct { __m128i buf[64>>2]; __m128i val[8]; uint32_t count_high, count_low; - bool initialized; } sha256_4way_context __attribute__ ((aligned (64))); void sha256_4way_init( sha256_4way_context *sc ); @@ -59,6 +58,16 @@ void sha256_4way_update( sha256_4way_context *sc, const void *data, size_t len ); void sha256_4way_close( sha256_4way_context *sc, void *dst ); void sha256_4way_full( void *dst, const void *data, size_t len ); +void sha256_4way_transform_le( __m128i *state_out, const __m128i *data, + const __m128i *state_in ); +void sha256_4way_transform_be( __m128i *state_out, const __m128i *data, + const __m128i *state_in ); +void sha256_4way_prehash_3rounds( __m128i *state_mid, __m128i *X, + const __m128i *W, const __m128i *state_in ); +void sha256_4way_final_rounds( __m128i *state_out, const __m128i *data, + const __m128i *state_in, const __m128i *state_mid, const __m128i *X ); +int sha256_4way_transform_le_short( __m128i *state_out, const __m128i *data, + const __m128i *state_in ); #endif // SSE2 @@ -70,13 +79,23 @@ typedef struct { __m256i buf[64>>2]; __m256i val[8]; uint32_t count_high, count_low; - bool initialized; } sha256_8way_context __attribute__ ((aligned (128))); void sha256_8way_init( sha256_8way_context *sc ); void sha256_8way_update( sha256_8way_context *sc, const void *data, size_t len ); void sha256_8way_close( sha256_8way_context *sc, void *dst ); void sha256_8way_full( void *dst, const void *data, size_t len ); +void sha256_8way_transform_le( __m256i *state_out, const __m256i *data, + const __m256i *state_in ); +void sha256_8way_transform_be( __m256i *state_out, const __m256i *data, + const __m256i *state_in ); + +void sha256_8way_prehash_3rounds( __m256i *state_mid, __m256i *X, + const __m256i *W, const __m256i *state_in ); +void sha256_8way_final_rounds( __m256i *state_out, const __m256i *data, + const __m256i *state_in, const __m256i *state_mid, const __m256i *X ); +int sha256_8way_transform_le_short( __m256i *state_out, const __m256i *data, + const __m256i *state_in ); #endif // AVX2 @@ -88,13 +107,23 @@ typedef struct { __m512i buf[64>>2]; __m512i val[8]; uint32_t count_high, count_low; - bool initialized; } sha256_16way_context __attribute__ ((aligned (128))); void sha256_16way_init( sha256_16way_context *sc ); void sha256_16way_update( sha256_16way_context *sc, const void 
*data, size_t len ); void sha256_16way_close( sha256_16way_context *sc, void *dst ); void sha256_16way_full( void *dst, const void *data, size_t len ); +void sha256_16way_transform_le( __m512i *state_out, const __m512i *data, + const __m512i *state_in ); +void sha256_16way_transform_be( __m512i *state_out, const __m512i *data, + const __m512i *state_in ); +void sha256_16way_prehash_3rounds( __m512i *state_mid, __m512i *X, + const __m512i *W, const __m512i *state_in ); +void sha256_16way_final_rounds( __m512i *state_out, const __m512i *data, + const __m512i *state_in, const __m512i *state_mid, const __m512i *X ); + +int sha256_16way_transform_le_short( __m512i *state_out, const __m512i *data, + const __m512i *state_in ); #endif // AVX512 diff --git a/algo/sha/sha2.c b/algo/sha/sha2.c index 33cc6c12..ef152738 100644 --- a/algo/sha/sha2.c +++ b/algo/sha/sha2.c @@ -8,7 +8,7 @@ * any later version. See COPYING for more details. */ -#include "algo-gate-api.h" +#include "sha256d-4way.h" #include #include @@ -180,6 +180,9 @@ static const uint32_t sha256d_hash1[16] = { 0x00000000, 0x00000000, 0x00000000, 0x00000100 }; +// this performs the entire hash all over again, why? +// because main function only does 56 rounds. + static void sha256d_80_swap(uint32_t *hash, const uint32_t *data) { uint32_t S[16]; @@ -195,8 +198,29 @@ static void sha256d_80_swap(uint32_t *hash, const uint32_t *data) hash[i] = swab32(hash[i]); } -extern void sha256d(unsigned char *hash, const unsigned char *data, int len) +/* +#if defined (__SHA__) + +#include "algo/sha/sph_sha2.h" + +void sha256d(unsigned char *hash, const unsigned char *data, int len) +{ + sph_sha256_context ctx __attribute__ ((aligned (64))); + + sph_sha256_init( &ctx ); + sph_sha256( &ctx, data, len ); + sph_sha256_close( &ctx, hash ); + + sph_sha256_init( &ctx ); + sph_sha256( &ctx, hash, 32 ); + sph_sha256_close( &ctx, hash ); +} + +#else + +void sha256d(unsigned char *hash, const unsigned char *data, int len) { + uint32_t S[16], T[16]; int i, r; @@ -220,6 +244,9 @@ extern void sha256d(unsigned char *hash, const unsigned char *data, int len) be32enc((uint32_t *)hash + i, T[i]); } +#endif +*/ + static inline void sha256d_preextend(uint32_t *W) { W[16] = s1(W[14]) + W[ 9] + s0(W[ 1]) + W[ 0]; @@ -467,7 +494,7 @@ static inline void sha256d_ms(uint32_t *hash, uint32_t *W, void sha256d_ms_4way(uint32_t *hash, uint32_t *data, const uint32_t *midstate, const uint32_t *prehash); -static inline int scanhash_sha256d_4way( struct work *work, +static inline int scanhash_sha256d_4way_pooler( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ) { uint32_t *pdata = work->data; @@ -528,7 +555,7 @@ static inline int scanhash_sha256d_4way( struct work *work, void sha256d_ms_8way(uint32_t *hash, uint32_t *data, const uint32_t *midstate, const uint32_t *prehash); -static inline int scanhash_sha256d_8way( struct work *work, +static inline int scanhash_sha256d_8way_pooler( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ) { uint32_t *pdata = work->data; @@ -584,11 +611,11 @@ static inline int scanhash_sha256d_8way( struct work *work, #endif /* HAVE_SHA256_8WAY */ -int scanhash_sha256d( struct work *work, - uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ) +int scanhash_sha256d_pooler( struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ) { - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; + uint32_t *pdata = work->data; + uint32_t 
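+//
+// Two-way variant: two independent message blocks (X and Y) and their two
+// states are carried through the rounds in lockstep; interleaving independent
+// streams helps hide the latency of the sha256rnds2 and sha256msg instructions.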
*ptarget = work->target; uint32_t _ALIGN(128) data[64]; uint32_t _ALIGN(32) hash[8]; uint32_t _ALIGN(32) midstate[8]; @@ -599,12 +626,12 @@ int scanhash_sha256d( struct work *work, int thr_id = mythr->id; // thr_id arg is deprecated #ifdef HAVE_SHA256_8WAY - if (sha256_use_8way()) - return scanhash_sha256d_8way( work, max_nonce, hashes_done, mythr ); + if ( sha256_use_8way() ) + return scanhash_sha256d_8way_pooler( work, max_nonce, hashes_done, mythr ); #endif #ifdef HAVE_SHA256_4WAY - if (sha256_use_4way()) - return scanhash_sha256d_4way( work, max_nonce, hashes_done, mythr ); + if ( sha256_use_4way() ) + return scanhash_sha256d_4way_pooler( work, max_nonce, hashes_done, mythr ); #endif memcpy(data, pdata + 16, 64); @@ -631,6 +658,7 @@ int scanhash_sha256d( struct work *work, return 0; } +/* int scanhash_SHA256d( struct work *work, const uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ) { @@ -660,13 +688,20 @@ int scanhash_SHA256d( struct work *work, const uint32_t max_nonce, pdata[19] = n; return 0; } - +*/ bool register_sha256d_algo( algo_gate_t* gate ) { - gate->optimizations = SSE2_OPT | AVX2_OPT; - gate->scanhash = (void*)&scanhash_sha256d; - gate->hash = (void*)&sha256d; + gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT; +#if defined(SHA256D_16WAY) + gate->scanhash = (void*)&scanhash_sha256d_16way; +//#elif defined(SHA256D_8WAY) +// gate->scanhash = (void*)&scanhash_sha256d_8way; +#else + gate->scanhash = (void*)&scanhash_sha256d_pooler; +// gate->scanhash = (void*)&scanhash_sha256d_4way; +#endif + // gate->hash = (void*)&sha256d; return true; }; diff --git a/algo/sha/sha256-hash-2way-ni.c b/algo/sha/sha256-hash-2way-ni.c new file mode 100644 index 00000000..7fc64ca3 --- /dev/null +++ b/algo/sha/sha256-hash-2way-ni.c @@ -0,0 +1,689 @@ +/* Intel SHA extensions using C intrinsics */ +/* Written and place in public domain by Jeffrey Walton */ +/* Based on code from Intel, and by Sean Gulley for */ +/* the miTLS project. */ + +// A stripped down version with byte swapping removed. 
+ +#if defined(__SHA__) + +#include "sha256-hash.h" + +void sha256_ni2way_transform_le( uint32_t *out_X, uint32_t*out_Y, + const void *msg_X, const void *msg_Y, + const uint32_t *in_X, const uint32_t *in_Y ) +{ + __m128i STATE0_X, STATE1_X, STATE0_Y, STATE1_Y; + __m128i MSG_X, MSG_Y, TMP_X, TMP_Y; + __m128i TMSG0_X, TMSG1_X, TMSG2_X, TMSG3_X; + __m128i TMSG0_Y, TMSG1_Y, TMSG2_Y, TMSG3_Y; + __m128i ABEF_SAVE_X, CDGH_SAVE_X,ABEF_SAVE_Y, CDGH_SAVE_Y; + + // Load initial values + TMP_X = _mm_load_si128((__m128i*) &in_X[0]); + STATE1_X = _mm_load_si128((__m128i*) &in_X[4]); + TMP_Y = _mm_load_si128((__m128i*) &in_Y[0]); + STATE1_Y = _mm_load_si128((__m128i*) &in_Y[4]); + + TMP_X = _mm_shuffle_epi32(TMP_X, 0xB1); // CDAB + TMP_Y = _mm_shuffle_epi32(TMP_Y, 0xB1); // CDAB + STATE1_X = _mm_shuffle_epi32(STATE1_X, 0x1B); // EFGH + STATE1_Y = _mm_shuffle_epi32(STATE1_Y, 0x1B); // EFGH + STATE0_X = _mm_alignr_epi8(TMP_X, STATE1_X, 8); // ABEF + STATE0_Y = _mm_alignr_epi8(TMP_Y, STATE1_Y, 8); // ABEF + STATE1_X = _mm_blend_epi16(STATE1_X, TMP_X, 0xF0); // CDGH + STATE1_Y = _mm_blend_epi16(STATE1_Y, TMP_Y, 0xF0); // CDGH + + // Save current hash + ABEF_SAVE_X = STATE0_X; + ABEF_SAVE_Y = STATE0_Y; + CDGH_SAVE_X = STATE1_X; + CDGH_SAVE_Y = STATE1_Y; + + // Rounds 0-3 + TMSG0_X = _mm_load_si128((const __m128i*) (msg_X)); + TMSG0_Y = _mm_load_si128((const __m128i*) (msg_Y)); + TMP_X = _mm_set_epi64x(0xE9B5DBA5B5C0FBCFULL, 0x71374491428A2F98ULL); + MSG_X = _mm_add_epi32( TMSG0_X, TMP_X ); + MSG_Y = _mm_add_epi32( TMSG0_Y, TMP_X ); + STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); + STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); + MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); + MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); + STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); + STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); + + // Rounds 4-7 + TMSG1_X = _mm_load_si128((const __m128i*) (msg_X+16)); + TMSG1_Y = _mm_load_si128((const __m128i*) (msg_Y+16)); + TMP_X = _mm_set_epi64x(0xAB1C5ED5923F82A4ULL, 0x59F111F13956C25BULL); + MSG_X = _mm_add_epi32(TMSG1_X, TMP_X ); + MSG_Y = _mm_add_epi32(TMSG1_Y, TMP_X ); + STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); + STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); + MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); + MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); + STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); + STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); + TMSG0_X = _mm_sha256msg1_epu32(TMSG0_X, TMSG1_X); + TMSG0_Y = _mm_sha256msg1_epu32(TMSG0_Y, TMSG1_Y); + + // Rounds 8-11 + TMSG2_X = _mm_load_si128((const __m128i*) (msg_X+32)); + TMSG2_Y = _mm_load_si128((const __m128i*) (msg_Y+32)); + TMP_X = _mm_set_epi64x(0x550C7DC3243185BEULL, 0x12835B01D807AA98ULL); + MSG_X = _mm_add_epi32(TMSG2_X, TMP_X ); + MSG_Y = _mm_add_epi32(TMSG2_Y, TMP_X ); + STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); + STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); + MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); + MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); + STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); + STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); + TMSG1_X = _mm_sha256msg1_epu32(TMSG1_X, TMSG2_X); + TMSG1_Y = _mm_sha256msg1_epu32(TMSG1_Y, TMSG2_Y); + + // Rounds 12-15 + TMSG3_X = _mm_load_si128((const __m128i*) (msg_X+48)); + TMSG3_Y = _mm_load_si128((const __m128i*) (msg_Y+48)); + TMP_X = _mm_set_epi64x(0xC19BF1749BDC06A7ULL, 0x80DEB1FE72BE5D74ULL); + MSG_X = 
_mm_add_epi32(TMSG3_X, TMP_X ); + MSG_Y = _mm_add_epi32(TMSG3_Y, TMP_X ); + STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); + STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); + TMP_X = _mm_alignr_epi8(TMSG3_X, TMSG2_X, 4); + TMP_Y = _mm_alignr_epi8(TMSG3_Y, TMSG2_Y, 4); + TMSG0_X = _mm_add_epi32(TMSG0_X, TMP_X); + TMSG0_Y = _mm_add_epi32(TMSG0_Y, TMP_Y); + TMSG0_X = _mm_sha256msg2_epu32(TMSG0_X, TMSG3_X); + TMSG0_Y = _mm_sha256msg2_epu32(TMSG0_Y, TMSG3_Y); + MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); + MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); + STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); + STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); + TMSG2_X = _mm_sha256msg1_epu32(TMSG2_X, TMSG3_X); + TMSG2_Y = _mm_sha256msg1_epu32(TMSG2_Y, TMSG3_Y); + + // Rounds 16-19 + TMP_X = _mm_set_epi64x(0x240CA1CC0FC19DC6ULL, 0xEFBE4786E49B69C1ULL); + MSG_X = _mm_add_epi32(TMSG0_X, TMP_X ); + MSG_Y = _mm_add_epi32(TMSG0_Y, TMP_X ); + STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); + STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); + TMP_X = _mm_alignr_epi8(TMSG0_X, TMSG3_X, 4); + TMP_Y = _mm_alignr_epi8(TMSG0_Y, TMSG3_Y, 4); + TMSG1_X = _mm_add_epi32(TMSG1_X, TMP_X); + TMSG1_Y = _mm_add_epi32(TMSG1_Y, TMP_Y); + TMSG1_X = _mm_sha256msg2_epu32(TMSG1_X, TMSG0_X); + TMSG1_Y = _mm_sha256msg2_epu32(TMSG1_Y, TMSG0_Y); + MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); + MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); + STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); + STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); + TMSG3_X = _mm_sha256msg1_epu32(TMSG3_X, TMSG0_X); + TMSG3_Y = _mm_sha256msg1_epu32(TMSG3_Y, TMSG0_Y); + + // Rounds 20-23 + TMP_X = _mm_set_epi64x(0x76F988DA5CB0A9DCULL, 0x4A7484AA2DE92C6FULL); + MSG_X = _mm_add_epi32(TMSG1_X, TMP_X ); + MSG_Y = _mm_add_epi32(TMSG1_Y, TMP_X ); + STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); + STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); + TMP_X = _mm_alignr_epi8(TMSG1_X, TMSG0_X, 4); + TMP_Y = _mm_alignr_epi8(TMSG1_Y, TMSG0_Y, 4); + TMSG2_X = _mm_add_epi32(TMSG2_X, TMP_X); + TMSG2_Y = _mm_add_epi32(TMSG2_Y, TMP_Y); + TMSG2_X = _mm_sha256msg2_epu32(TMSG2_X, TMSG1_X); + TMSG2_Y = _mm_sha256msg2_epu32(TMSG2_Y, TMSG1_Y); + MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); + MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); + STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); + STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); + TMSG0_X = _mm_sha256msg1_epu32(TMSG0_X, TMSG1_X); + TMSG0_Y = _mm_sha256msg1_epu32(TMSG0_Y, TMSG1_Y); + + // Rounds 24-27 + TMP_X = _mm_set_epi64x(0xBF597FC7B00327C8ULL, 0xA831C66D983E5152ULL); + MSG_X = _mm_add_epi32(TMSG2_X, TMP_X ); + MSG_Y = _mm_add_epi32(TMSG2_Y, TMP_X ); + STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); + STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); + TMP_X = _mm_alignr_epi8(TMSG2_X, TMSG1_X, 4); + TMP_Y = _mm_alignr_epi8(TMSG2_Y, TMSG1_Y, 4); + TMSG3_X = _mm_add_epi32(TMSG3_X, TMP_X); + TMSG3_Y = _mm_add_epi32(TMSG3_Y, TMP_Y); + TMSG3_X = _mm_sha256msg2_epu32(TMSG3_X, TMSG2_X); + TMSG3_Y = _mm_sha256msg2_epu32(TMSG3_Y, TMSG2_Y); + MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); + MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); + STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); + STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); + TMSG1_X = _mm_sha256msg1_epu32(TMSG1_X, TMSG2_X); + TMSG1_Y = _mm_sha256msg1_epu32(TMSG1_Y, TMSG2_Y); + + // Rounds 28-31 + TMP_X = 
_mm_set_epi64x(0x1429296706CA6351ULL, 0xD5A79147C6E00BF3ULL); + MSG_X = _mm_add_epi32(TMSG3_X, TMP_X ); + MSG_Y = _mm_add_epi32(TMSG3_Y, TMP_X ); + STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); + STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); + TMP_X = _mm_alignr_epi8(TMSG3_X, TMSG2_X, 4); + TMP_Y = _mm_alignr_epi8(TMSG3_Y, TMSG2_Y, 4); + TMSG0_X = _mm_add_epi32(TMSG0_X, TMP_X); + TMSG0_Y = _mm_add_epi32(TMSG0_Y, TMP_Y); + TMSG0_X = _mm_sha256msg2_epu32(TMSG0_X, TMSG3_X); + TMSG0_Y = _mm_sha256msg2_epu32(TMSG0_Y, TMSG3_Y); + MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); + MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); + STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); + STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); + TMSG2_X = _mm_sha256msg1_epu32(TMSG2_X, TMSG3_X); + TMSG2_Y = _mm_sha256msg1_epu32(TMSG2_Y, TMSG3_Y); + + // Rounds 32-35 + TMP_X = _mm_set_epi64x(0x53380D134D2C6DFCULL, 0x2E1B213827B70A85ULL); + MSG_X = _mm_add_epi32(TMSG0_X, TMP_X ); + MSG_Y = _mm_add_epi32(TMSG0_Y, TMP_X ); + STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); + STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); + TMP_X = _mm_alignr_epi8(TMSG0_X, TMSG3_X, 4); + TMP_Y = _mm_alignr_epi8(TMSG0_Y, TMSG3_Y, 4); + TMSG1_X = _mm_add_epi32(TMSG1_X, TMP_X); + TMSG1_Y = _mm_add_epi32(TMSG1_Y, TMP_Y); + TMSG1_X = _mm_sha256msg2_epu32(TMSG1_X, TMSG0_X); + TMSG1_Y = _mm_sha256msg2_epu32(TMSG1_Y, TMSG0_Y); + MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); + MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); + STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); + STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); + TMSG3_X = _mm_sha256msg1_epu32(TMSG3_X, TMSG0_X); + TMSG3_Y = _mm_sha256msg1_epu32(TMSG3_Y, TMSG0_Y); + + // Rounds 36-39 + TMP_X = _mm_set_epi64x(0x92722C8581C2C92EULL, 0x766A0ABB650A7354ULL); + MSG_X = _mm_add_epi32(TMSG1_X, TMP_X); + MSG_Y = _mm_add_epi32(TMSG1_Y, TMP_X); + STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); + STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); + TMP_X = _mm_alignr_epi8(TMSG1_X, TMSG0_X, 4); + TMP_Y = _mm_alignr_epi8(TMSG1_Y, TMSG0_Y, 4); + TMSG2_X = _mm_add_epi32(TMSG2_X, TMP_X); + TMSG2_Y = _mm_add_epi32(TMSG2_Y, TMP_Y); + TMSG2_X = _mm_sha256msg2_epu32(TMSG2_X, TMSG1_X); + TMSG2_Y = _mm_sha256msg2_epu32(TMSG2_Y, TMSG1_Y); + MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); + MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); + STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); + STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); + TMSG0_X = _mm_sha256msg1_epu32(TMSG0_X, TMSG1_X); + TMSG0_Y = _mm_sha256msg1_epu32(TMSG0_Y, TMSG1_Y); + + // Rounds 40-43 + TMP_X = _mm_set_epi64x(0xC76C51A3C24B8B70ULL, 0xA81A664BA2BFE8A1ULL); + MSG_X = _mm_add_epi32(TMSG2_X, TMP_X); + MSG_Y = _mm_add_epi32(TMSG2_Y, TMP_X); + STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); + STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); + TMP_X = _mm_alignr_epi8(TMSG2_X, TMSG1_X, 4); + TMP_Y = _mm_alignr_epi8(TMSG2_Y, TMSG1_Y, 4); + TMSG3_X = _mm_add_epi32(TMSG3_X, TMP_X); + TMSG3_Y = _mm_add_epi32(TMSG3_Y, TMP_Y); + TMSG3_X = _mm_sha256msg2_epu32(TMSG3_X, TMSG2_X); + TMSG3_Y = _mm_sha256msg2_epu32(TMSG3_Y, TMSG2_Y); + MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); + MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); + STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); + STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); + TMSG1_X = _mm_sha256msg1_epu32(TMSG1_X, TMSG2_X); + TMSG1_Y = 
_mm_sha256msg1_epu32(TMSG1_Y, TMSG2_Y); + + // Rounds 44-47 + TMP_X = _mm_set_epi64x(0x106AA070F40E3585ULL, 0xD6990624D192E819ULL); + MSG_X = _mm_add_epi32(TMSG3_X, TMP_X); + MSG_Y = _mm_add_epi32(TMSG3_Y, TMP_X); + STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); + STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); + TMP_X = _mm_alignr_epi8(TMSG3_X, TMSG2_X, 4); + TMP_Y = _mm_alignr_epi8(TMSG3_Y, TMSG2_Y, 4); + TMSG0_X = _mm_add_epi32(TMSG0_X, TMP_X); + TMSG0_Y = _mm_add_epi32(TMSG0_Y, TMP_Y); + TMSG0_X = _mm_sha256msg2_epu32(TMSG0_X, TMSG3_X); + TMSG0_Y = _mm_sha256msg2_epu32(TMSG0_Y, TMSG3_Y); + MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); + MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); + STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); + STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); + TMSG2_X = _mm_sha256msg1_epu32(TMSG2_X, TMSG3_X); + TMSG2_Y = _mm_sha256msg1_epu32(TMSG2_Y, TMSG3_Y); + + // Rounds 48-51 + TMP_X = _mm_set_epi64x(0x34B0BCB52748774CULL, 0x1E376C0819A4C116ULL); + MSG_X = _mm_add_epi32(TMSG0_X, TMP_X ); + MSG_Y = _mm_add_epi32(TMSG0_Y, TMP_X ); + STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); + STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); + TMP_X = _mm_alignr_epi8(TMSG0_X, TMSG3_X, 4); + TMP_Y = _mm_alignr_epi8(TMSG0_Y, TMSG3_Y, 4); + TMSG1_X = _mm_add_epi32(TMSG1_X, TMP_X); + TMSG1_Y = _mm_add_epi32(TMSG1_Y, TMP_Y); + TMSG1_X = _mm_sha256msg2_epu32(TMSG1_X, TMSG0_X); + TMSG1_Y = _mm_sha256msg2_epu32(TMSG1_Y, TMSG0_Y); + MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); + MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); + STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); + STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); + TMSG3_X = _mm_sha256msg1_epu32(TMSG3_X, TMSG0_X); + TMSG3_Y = _mm_sha256msg1_epu32(TMSG3_Y, TMSG0_Y); + + // Rounds 52-55 + TMP_X = _mm_set_epi64x(0x682E6FF35B9CCA4FULL, 0x4ED8AA4A391C0CB3ULL); + MSG_X = _mm_add_epi32(TMSG1_X, TMP_X ); + MSG_Y = _mm_add_epi32(TMSG1_Y, TMP_X ); + STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); + STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); + TMP_X = _mm_alignr_epi8(TMSG1_X, TMSG0_X, 4); + TMP_Y = _mm_alignr_epi8(TMSG1_Y, TMSG0_Y, 4); + TMSG2_X = _mm_add_epi32(TMSG2_X, TMP_X); + TMSG2_Y = _mm_add_epi32(TMSG2_Y, TMP_Y); + TMSG2_X = _mm_sha256msg2_epu32(TMSG2_X, TMSG1_X); + TMSG2_Y = _mm_sha256msg2_epu32(TMSG2_Y, TMSG1_Y); + MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); + MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); + STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); + STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); + + // Rounds 56-59 + TMP_X = _mm_set_epi64x(0x8CC7020884C87814ULL, 0x78A5636F748F82EEULL); + MSG_X = _mm_add_epi32(TMSG2_X, TMP_X); + MSG_Y = _mm_add_epi32(TMSG2_Y, TMP_X); + STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); + STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); + TMP_X = _mm_alignr_epi8(TMSG2_X, TMSG1_X, 4); + TMP_Y = _mm_alignr_epi8(TMSG2_Y, TMSG1_Y, 4); + TMSG3_X = _mm_add_epi32(TMSG3_X, TMP_X); + TMSG3_Y = _mm_add_epi32(TMSG3_Y, TMP_Y); + TMSG3_X = _mm_sha256msg2_epu32(TMSG3_X, TMSG2_X); + TMSG3_Y = _mm_sha256msg2_epu32(TMSG3_Y, TMSG2_Y); + MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); + MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); + STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); + STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); + + // Rounds 60-63 + TMP_X = _mm_set_epi64x(0xC67178F2BEF9A3F7ULL, 0xA4506CEB90BEFFFAULL); + MSG_X = 
_mm_add_epi32(TMSG3_X, TMP_X); + MSG_Y = _mm_add_epi32(TMSG3_Y, TMP_X); + STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); + STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); + MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); + MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); + STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); + STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); + + // Add values back to state + STATE0_X = _mm_add_epi32(STATE0_X, ABEF_SAVE_X); + STATE1_X = _mm_add_epi32(STATE1_X, CDGH_SAVE_X); + STATE0_Y = _mm_add_epi32(STATE0_Y, ABEF_SAVE_Y); + STATE1_Y = _mm_add_epi32(STATE1_Y, CDGH_SAVE_Y); + + TMP_X = _mm_shuffle_epi32(STATE0_X, 0x1B); // FEBA + TMP_Y = _mm_shuffle_epi32(STATE0_Y, 0x1B); // FEBA + STATE1_X = _mm_shuffle_epi32(STATE1_X, 0xB1); // DCHG + STATE1_Y = _mm_shuffle_epi32(STATE1_Y, 0xB1); // DCHG + STATE0_X = _mm_blend_epi16(TMP_X, STATE1_X, 0xF0); // DCBA + STATE0_Y = _mm_blend_epi16(TMP_Y, STATE1_Y, 0xF0); // DCBA + STATE1_X = _mm_alignr_epi8(STATE1_X, TMP_X, 8); // ABEF + STATE1_Y = _mm_alignr_epi8(STATE1_Y, TMP_Y, 8); // ABEF + + // Save state + _mm_store_si128((__m128i*) &out_X[0], STATE0_X); + _mm_store_si128((__m128i*) &out_X[4], STATE1_X); + _mm_store_si128((__m128i*) &out_Y[0], STATE0_Y); + _mm_store_si128((__m128i*) &out_Y[4], STATE1_Y); +} + +void sha256_ni2way_transform_be( uint32_t *out_X, uint32_t*out_Y, + const void *msg_X, const void *msg_Y, + const uint32_t *in_X, const uint32_t *in_Y ) +{ + __m128i STATE0_X, STATE1_X, STATE0_Y, STATE1_Y; + __m128i MSG_X, MSG_Y, TMP_X, TMP_Y, MASK; + __m128i TMSG0_X, TMSG1_X, TMSG2_X, TMSG3_X; + __m128i TMSG0_Y, TMSG1_Y, TMSG2_Y, TMSG3_Y; + __m128i ABEF_SAVE_X, CDGH_SAVE_X, ABEF_SAVE_Y, CDGH_SAVE_Y; + + // Load initial values + TMP_X = _mm_load_si128((__m128i*) &in_X[0]); + STATE1_X = _mm_load_si128((__m128i*) &in_X[4]); + TMP_Y = _mm_load_si128((__m128i*) &in_Y[0]); + STATE1_Y = _mm_load_si128((__m128i*) &in_Y[4]); + MASK = _mm_set_epi64x(0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL); + + TMP_X = _mm_shuffle_epi32(TMP_X, 0xB1); // CDAB + TMP_Y = _mm_shuffle_epi32(TMP_Y, 0xB1); // CDAB + STATE1_X = _mm_shuffle_epi32(STATE1_X, 0x1B); // EFGH + STATE1_Y = _mm_shuffle_epi32(STATE1_Y, 0x1B); // EFGH + STATE0_X = _mm_alignr_epi8(TMP_X, STATE1_X, 8); // ABEF + STATE0_Y = _mm_alignr_epi8(TMP_Y, STATE1_Y, 8); // ABEF + STATE1_X = _mm_blend_epi16(STATE1_X, TMP_X, 0xF0); // CDGH + STATE1_Y = _mm_blend_epi16(STATE1_Y, TMP_Y, 0xF0); // CDGH + + // Save current hash + ABEF_SAVE_X = STATE0_X; + ABEF_SAVE_Y = STATE0_Y; + CDGH_SAVE_X = STATE1_X; + CDGH_SAVE_Y = STATE1_Y; + + // Rounds 0-3 + TMSG0_X = _mm_load_si128((const __m128i*) (msg_X)); + TMSG0_Y = _mm_load_si128((const __m128i*) (msg_Y)); + TMP_X = _mm_set_epi64x(0xE9B5DBA5B5C0FBCFULL, 0x71374491428A2F98ULL); + TMSG0_X = _mm_shuffle_epi8( TMSG0_X, MASK ); + TMSG0_Y = _mm_shuffle_epi8( TMSG0_Y, MASK ); + MSG_X = _mm_add_epi32( TMSG0_X, TMP_X ); + MSG_Y = _mm_add_epi32( TMSG0_Y, TMP_X ); + STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); + STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); + MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); + MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); + STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); + STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); + + // Rounds 4-7 + TMSG1_X = _mm_load_si128((const __m128i*) (msg_X+16)); + TMSG1_Y = _mm_load_si128((const __m128i*) (msg_Y+16)); + TMP_X = _mm_set_epi64x(0xAB1C5ED5923F82A4ULL, 0x59F111F13956C25BULL); + TMSG1_X = _mm_shuffle_epi8( 
TMSG1_X, MASK ); + TMSG1_Y = _mm_shuffle_epi8( TMSG1_Y, MASK ); + MSG_X = _mm_add_epi32(TMSG1_X, TMP_X ); + MSG_Y = _mm_add_epi32(TMSG1_Y, TMP_X ); + STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); + STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); + MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); + MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); + STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); + STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); + TMSG0_X = _mm_sha256msg1_epu32(TMSG0_X, TMSG1_X); + TMSG0_Y = _mm_sha256msg1_epu32(TMSG0_Y, TMSG1_Y); + + // Rounds 8-11 + TMSG2_X = _mm_load_si128((const __m128i*) (msg_X+32)); + TMSG2_Y = _mm_load_si128((const __m128i*) (msg_Y+32)); + TMP_X = _mm_set_epi64x(0x550C7DC3243185BEULL, 0x12835B01D807AA98ULL); + TMSG2_X = _mm_shuffle_epi8( TMSG2_X, MASK ); + TMSG2_Y = _mm_shuffle_epi8( TMSG2_Y, MASK ); + MSG_X = _mm_add_epi32(TMSG2_X, TMP_X ); + MSG_Y = _mm_add_epi32(TMSG2_Y, TMP_X ); + STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); + STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); + MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); + MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); + STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); + STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); + TMSG1_X = _mm_sha256msg1_epu32(TMSG1_X, TMSG2_X); + TMSG1_Y = _mm_sha256msg1_epu32(TMSG1_Y, TMSG2_Y); + + // Rounds 12-15 + TMSG3_X = _mm_load_si128((const __m128i*) (msg_X+48)); + TMSG3_Y = _mm_load_si128((const __m128i*) (msg_Y+48)); + TMP_X = _mm_set_epi64x(0xC19BF1749BDC06A7ULL, 0x80DEB1FE72BE5D74ULL); + TMSG3_X = _mm_shuffle_epi8( TMSG3_X, MASK ); + TMSG3_Y = _mm_shuffle_epi8( TMSG3_Y, MASK ); + MSG_X = _mm_add_epi32(TMSG3_X, TMP_X ); + MSG_Y = _mm_add_epi32(TMSG3_Y, TMP_X ); + STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); + STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); + TMP_X = _mm_alignr_epi8(TMSG3_X, TMSG2_X, 4); + TMP_Y = _mm_alignr_epi8(TMSG3_Y, TMSG2_Y, 4); + TMSG0_X = _mm_add_epi32(TMSG0_X, TMP_X); + TMSG0_Y = _mm_add_epi32(TMSG0_Y, TMP_Y); + TMSG0_X = _mm_sha256msg2_epu32(TMSG0_X, TMSG3_X); + TMSG0_Y = _mm_sha256msg2_epu32(TMSG0_Y, TMSG3_Y); + MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); + MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); + STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); + STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); + TMSG2_X = _mm_sha256msg1_epu32(TMSG2_X, TMSG3_X); + TMSG2_Y = _mm_sha256msg1_epu32(TMSG2_Y, TMSG3_Y); + + // Rounds 16-19 + TMP_X = _mm_set_epi64x(0x240CA1CC0FC19DC6ULL, 0xEFBE4786E49B69C1ULL); + MSG_X = _mm_add_epi32(TMSG0_X, TMP_X ); + MSG_Y = _mm_add_epi32(TMSG0_Y, TMP_X ); + STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); + STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); + TMP_X = _mm_alignr_epi8(TMSG0_X, TMSG3_X, 4); + TMP_Y = _mm_alignr_epi8(TMSG0_Y, TMSG3_Y, 4); + TMSG1_X = _mm_add_epi32(TMSG1_X, TMP_X); + TMSG1_Y = _mm_add_epi32(TMSG1_Y, TMP_Y); + TMSG1_X = _mm_sha256msg2_epu32(TMSG1_X, TMSG0_X); + TMSG1_Y = _mm_sha256msg2_epu32(TMSG1_Y, TMSG0_Y); + MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); + MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); + STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); + STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); + TMSG3_X = _mm_sha256msg1_epu32(TMSG3_X, TMSG0_X); + TMSG3_Y = _mm_sha256msg1_epu32(TMSG3_Y, TMSG0_Y); + + // Rounds 20-23 + TMP_X = _mm_set_epi64x(0x76F988DA5CB0A9DCULL, 0x4A7484AA2DE92C6FULL); + MSG_X = _mm_add_epi32(TMSG1_X, TMP_X 
); + MSG_Y = _mm_add_epi32(TMSG1_Y, TMP_X ); + STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); + STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); + TMP_X = _mm_alignr_epi8(TMSG1_X, TMSG0_X, 4); + TMP_Y = _mm_alignr_epi8(TMSG1_Y, TMSG0_Y, 4); + TMSG2_X = _mm_add_epi32(TMSG2_X, TMP_X); + TMSG2_Y = _mm_add_epi32(TMSG2_Y, TMP_Y); + TMSG2_X = _mm_sha256msg2_epu32(TMSG2_X, TMSG1_X); + TMSG2_Y = _mm_sha256msg2_epu32(TMSG2_Y, TMSG1_Y); + MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); + MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); + STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); + STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); + TMSG0_X = _mm_sha256msg1_epu32(TMSG0_X, TMSG1_X); + TMSG0_Y = _mm_sha256msg1_epu32(TMSG0_Y, TMSG1_Y); + + // Rounds 24-27 + TMP_X = _mm_set_epi64x(0xBF597FC7B00327C8ULL, 0xA831C66D983E5152ULL); + MSG_X = _mm_add_epi32(TMSG2_X, TMP_X ); + MSG_Y = _mm_add_epi32(TMSG2_Y, TMP_X ); + STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); + STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); + TMP_X = _mm_alignr_epi8(TMSG2_X, TMSG1_X, 4); + TMP_Y = _mm_alignr_epi8(TMSG2_Y, TMSG1_Y, 4); + TMSG3_X = _mm_add_epi32(TMSG3_X, TMP_X); + TMSG3_Y = _mm_add_epi32(TMSG3_Y, TMP_Y); + TMSG3_X = _mm_sha256msg2_epu32(TMSG3_X, TMSG2_X); + TMSG3_Y = _mm_sha256msg2_epu32(TMSG3_Y, TMSG2_Y); + MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); + MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); + STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); + STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); + TMSG1_X = _mm_sha256msg1_epu32(TMSG1_X, TMSG2_X); + TMSG1_Y = _mm_sha256msg1_epu32(TMSG1_Y, TMSG2_Y); + + // Rounds 28-31 + TMP_X = _mm_set_epi64x(0x1429296706CA6351ULL, 0xD5A79147C6E00BF3ULL); + MSG_X = _mm_add_epi32(TMSG3_X, TMP_X ); + MSG_Y = _mm_add_epi32(TMSG3_Y, TMP_X ); + STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); + STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); + TMP_X = _mm_alignr_epi8(TMSG3_X, TMSG2_X, 4); + TMP_Y = _mm_alignr_epi8(TMSG3_Y, TMSG2_Y, 4); + TMSG0_X = _mm_add_epi32(TMSG0_X, TMP_X); + TMSG0_Y = _mm_add_epi32(TMSG0_Y, TMP_Y); + TMSG0_X = _mm_sha256msg2_epu32(TMSG0_X, TMSG3_X); + TMSG0_Y = _mm_sha256msg2_epu32(TMSG0_Y, TMSG3_Y); + MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); + MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); + STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); + STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); + TMSG2_X = _mm_sha256msg1_epu32(TMSG2_X, TMSG3_X); + TMSG2_Y = _mm_sha256msg1_epu32(TMSG2_Y, TMSG3_Y); + + // Rounds 32-35 + TMP_X = _mm_set_epi64x(0x53380D134D2C6DFCULL, 0x2E1B213827B70A85ULL); + MSG_X = _mm_add_epi32(TMSG0_X, TMP_X ); + MSG_Y = _mm_add_epi32(TMSG0_Y, TMP_X ); + STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); + STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); + TMP_X = _mm_alignr_epi8(TMSG0_X, TMSG3_X, 4); + TMP_Y = _mm_alignr_epi8(TMSG0_Y, TMSG3_Y, 4); + TMSG1_X = _mm_add_epi32(TMSG1_X, TMP_X); + TMSG1_Y = _mm_add_epi32(TMSG1_Y, TMP_Y); + TMSG1_X = _mm_sha256msg2_epu32(TMSG1_X, TMSG0_X); + TMSG1_Y = _mm_sha256msg2_epu32(TMSG1_Y, TMSG0_Y); + MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); + MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); + STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); + STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); + TMSG3_X = _mm_sha256msg1_epu32(TMSG3_X, TMSG0_X); + TMSG3_Y = _mm_sha256msg1_epu32(TMSG3_Y, TMSG0_Y); + + // Rounds 36-39 + TMP_X = _mm_set_epi64x(0x92722C8581C2C92EULL, 
0x766A0ABB650A7354ULL); + MSG_X = _mm_add_epi32(TMSG1_X, TMP_X); + MSG_Y = _mm_add_epi32(TMSG1_Y, TMP_X); + STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); + STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); + TMP_X = _mm_alignr_epi8(TMSG1_X, TMSG0_X, 4); + TMP_Y = _mm_alignr_epi8(TMSG1_Y, TMSG0_Y, 4); + TMSG2_X = _mm_add_epi32(TMSG2_X, TMP_X); + TMSG2_Y = _mm_add_epi32(TMSG2_Y, TMP_Y); + TMSG2_X = _mm_sha256msg2_epu32(TMSG2_X, TMSG1_X); + TMSG2_Y = _mm_sha256msg2_epu32(TMSG2_Y, TMSG1_Y); + MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); + MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); + STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); + STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); + TMSG0_X = _mm_sha256msg1_epu32(TMSG0_X, TMSG1_X); + TMSG0_Y = _mm_sha256msg1_epu32(TMSG0_Y, TMSG1_Y); + + // Rounds 40-43 + TMP_X = _mm_set_epi64x(0xC76C51A3C24B8B70ULL, 0xA81A664BA2BFE8A1ULL); + MSG_X = _mm_add_epi32(TMSG2_X, TMP_X); + MSG_Y = _mm_add_epi32(TMSG2_Y, TMP_X); + STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); + STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); + TMP_X = _mm_alignr_epi8(TMSG2_X, TMSG1_X, 4); + TMP_Y = _mm_alignr_epi8(TMSG2_Y, TMSG1_Y, 4); + TMSG3_X = _mm_add_epi32(TMSG3_X, TMP_X); + TMSG3_Y = _mm_add_epi32(TMSG3_Y, TMP_Y); + TMSG3_X = _mm_sha256msg2_epu32(TMSG3_X, TMSG2_X); + TMSG3_Y = _mm_sha256msg2_epu32(TMSG3_Y, TMSG2_Y); + MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); + MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); + STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); + STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); + TMSG1_X = _mm_sha256msg1_epu32(TMSG1_X, TMSG2_X); + TMSG1_Y = _mm_sha256msg1_epu32(TMSG1_Y, TMSG2_Y); + + // Rounds 44-47 + TMP_X = _mm_set_epi64x(0x106AA070F40E3585ULL, 0xD6990624D192E819ULL); + MSG_X = _mm_add_epi32(TMSG3_X, TMP_X); + MSG_Y = _mm_add_epi32(TMSG3_Y, TMP_X); + STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); + STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); + TMP_X = _mm_alignr_epi8(TMSG3_X, TMSG2_X, 4); + TMP_Y = _mm_alignr_epi8(TMSG3_Y, TMSG2_Y, 4); + TMSG0_X = _mm_add_epi32(TMSG0_X, TMP_X); + TMSG0_Y = _mm_add_epi32(TMSG0_Y, TMP_Y); + TMSG0_X = _mm_sha256msg2_epu32(TMSG0_X, TMSG3_X); + TMSG0_Y = _mm_sha256msg2_epu32(TMSG0_Y, TMSG3_Y); + MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); + MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); + STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); + STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); + TMSG2_X = _mm_sha256msg1_epu32(TMSG2_X, TMSG3_X); + TMSG2_Y = _mm_sha256msg1_epu32(TMSG2_Y, TMSG3_Y); + + // Rounds 48-51 + TMP_X = _mm_set_epi64x(0x34B0BCB52748774CULL, 0x1E376C0819A4C116ULL); + MSG_X = _mm_add_epi32(TMSG0_X, TMP_X ); + MSG_Y = _mm_add_epi32(TMSG0_Y, TMP_X ); + STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); + STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); + TMP_X = _mm_alignr_epi8(TMSG0_X, TMSG3_X, 4); + TMP_Y = _mm_alignr_epi8(TMSG0_Y, TMSG3_Y, 4); + TMSG1_X = _mm_add_epi32(TMSG1_X, TMP_X); + TMSG1_Y = _mm_add_epi32(TMSG1_Y, TMP_Y); + TMSG1_X = _mm_sha256msg2_epu32(TMSG1_X, TMSG0_X); + TMSG1_Y = _mm_sha256msg2_epu32(TMSG1_Y, TMSG0_Y); + MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); + MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); + STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); + STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); + TMSG3_X = _mm_sha256msg1_epu32(TMSG3_X, TMSG0_X); + TMSG3_Y = _mm_sha256msg1_epu32(TMSG3_Y, TMSG0_Y); + + // Rounds 52-55 
+ TMP_X = _mm_set_epi64x(0x682E6FF35B9CCA4FULL, 0x4ED8AA4A391C0CB3ULL); + MSG_X = _mm_add_epi32(TMSG1_X, TMP_X ); + MSG_Y = _mm_add_epi32(TMSG1_Y, TMP_X ); + STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); + STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); + TMP_X = _mm_alignr_epi8(TMSG1_X, TMSG0_X, 4); + TMP_Y = _mm_alignr_epi8(TMSG1_Y, TMSG0_Y, 4); + TMSG2_X = _mm_add_epi32(TMSG2_X, TMP_X); + TMSG2_Y = _mm_add_epi32(TMSG2_Y, TMP_Y); + TMSG2_X = _mm_sha256msg2_epu32(TMSG2_X, TMSG1_X); + TMSG2_Y = _mm_sha256msg2_epu32(TMSG2_Y, TMSG1_Y); + MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); + MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); + STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); + STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); + + // Rounds 56-59 + TMP_X = _mm_set_epi64x(0x8CC7020884C87814ULL, 0x78A5636F748F82EEULL); + MSG_X = _mm_add_epi32(TMSG2_X, TMP_X); + MSG_Y = _mm_add_epi32(TMSG2_Y, TMP_X); + STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); + STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); + TMP_X = _mm_alignr_epi8(TMSG2_X, TMSG1_X, 4); + TMP_Y = _mm_alignr_epi8(TMSG2_Y, TMSG1_Y, 4); + TMSG3_X = _mm_add_epi32(TMSG3_X, TMP_X); + TMSG3_Y = _mm_add_epi32(TMSG3_Y, TMP_Y); + TMSG3_X = _mm_sha256msg2_epu32(TMSG3_X, TMSG2_X); + TMSG3_Y = _mm_sha256msg2_epu32(TMSG3_Y, TMSG2_Y); + MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); + MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); + STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); + STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); + + // Rounds 60-63 + TMP_X = _mm_set_epi64x(0xC67178F2BEF9A3F7ULL, 0xA4506CEB90BEFFFAULL); + MSG_X = _mm_add_epi32(TMSG3_X, TMP_X); + MSG_Y = _mm_add_epi32(TMSG3_Y, TMP_X); + STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); + STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); + MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); + MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); + STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); + STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); + + // Add values back to state + STATE0_X = _mm_add_epi32(STATE0_X, ABEF_SAVE_X); + STATE1_X = _mm_add_epi32(STATE1_X, CDGH_SAVE_X); + STATE0_Y = _mm_add_epi32(STATE0_Y, ABEF_SAVE_Y); + STATE1_Y = _mm_add_epi32(STATE1_Y, CDGH_SAVE_Y); + + TMP_X = _mm_shuffle_epi32(STATE0_X, 0x1B); // FEBA + TMP_Y = _mm_shuffle_epi32(STATE0_Y, 0x1B); // FEBA + STATE1_X = _mm_shuffle_epi32(STATE1_X, 0xB1); // DCHG + STATE1_Y = _mm_shuffle_epi32(STATE1_Y, 0xB1); // DCHG + STATE0_X = _mm_blend_epi16(TMP_X, STATE1_X, 0xF0); // DCBA + STATE0_Y = _mm_blend_epi16(TMP_Y, STATE1_Y, 0xF0); // DCBA + STATE1_X = _mm_alignr_epi8(STATE1_X, TMP_X, 8); // ABEF + STATE1_Y = _mm_alignr_epi8(STATE1_Y, TMP_Y, 8); // ABEF + + // Save state + _mm_store_si128((__m128i*) &out_X[0], STATE0_X); + _mm_store_si128((__m128i*) &out_X[4], STATE1_X); + _mm_store_si128((__m128i*) &out_Y[0], STATE0_Y); + _mm_store_si128((__m128i*) &out_Y[4], STATE1_Y); +} + + +#endif + diff --git a/algo/sha/sha256-hash-4way.c b/algo/sha/sha256-hash-4way.c index d9fb503c..dd96d79d 100644 --- a/algo/sha/sha256-hash-4way.c +++ b/algo/sha/sha256-hash-4way.c @@ -1,34 +1,3 @@ -/* $Id: sha2big.c 216 2010-06-08 09:46:57Z tp $ */ -/* - * SHA-384 / SHA-512 implementation. 
- * - * ==========================(LICENSE BEGIN)============================ - * - * Copyright (c) 2007-2010 Projet RNRT SAPHIR - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - * ===========================(LICENSE END)============================= - * - * @author Thomas Pornin - */ #if defined(__SSE2__) @@ -66,17 +35,14 @@ static const uint32_t K256[64] = 0x90BEFFFA, 0xA4506CEB, 0xBEF9A3F7, 0xC67178F2 }; -// SHA-256 4 way - -#define SHA2s_MEXP( a, b, c, d ) \ - mm128_add4_32( SSG2_1( W[a] ), W[b], SSG2_0( W[c] ), W[d] ); +// SHA-256 4 way SSE2 #define CHs(X, Y, Z) \ _mm_xor_si128( _mm_and_si128( _mm_xor_si128( Y, Z ), X ), Z ) #define MAJs(X, Y, Z) \ - _mm_or_si128( _mm_and_si128( X, Y ), \ - _mm_and_si128( _mm_or_si128( X, Y ), Z ) ) + _mm_xor_si128( Y, _mm_and_si128( X_xor_Y = _mm_xor_si128( X, Y ), \ + Y_xor_Z ) ) #define BSG2_0(x) \ _mm_xor_si128( _mm_xor_si128( \ @@ -94,37 +60,27 @@ static const uint32_t K256[64] = _mm_xor_si128( _mm_xor_si128( \ mm128_ror_32(x, 17), mm128_ror_32(x, 19) ), _mm_srli_epi32(x, 10) ) -#define SHA2s_4WAY_STEP(A, B, C, D, E, F, G, H, i, j) \ -do { \ - __m128i K = _mm_set1_epi32( K256[( (j)+(i) )] ); \ - __m128i T1 = mm128_ror_32( E, 14 ); \ - __m128i T2 = mm128_ror_32( A, 9 ); \ - __m128i T3 = _mm_xor_si128( F, G ); \ - __m128i T4 = _mm_or_si128( A, B ); \ - __m128i T5 = _mm_and_si128( A, B ); \ - K = _mm_add_epi32( K, W[i] ); \ - T1 = _mm_xor_si128( T1, E ); \ - T2 = _mm_xor_si128( T2, A ); \ - T3 = _mm_and_si128( T3, E ); \ - T4 = _mm_and_si128( T4, C ); \ - K = _mm_add_epi32( H, K ); \ - T1 = mm128_ror_32( T1, 5 ); \ - T2 = mm128_ror_32( T2, 11 ); \ - T3 = _mm_xor_si128( T3, G ); \ - T4 = _mm_or_si128( T4, T5 ); \ - T1 = _mm_xor_si128( T1, E ); \ - T2 = _mm_xor_si128( T2, A ); \ - T1 = mm128_ror_32( T1, 6 ); \ - T2 = mm128_ror_32( T2, 2 ); \ - T1 = _mm_add_epi32( T1, T3 ); \ - T2 = _mm_add_epi32( T2, T4 ); \ - T1 = _mm_add_epi32( T1, K ); \ - H = _mm_add_epi32( T1, T2 ); \ - D = _mm_add_epi32( D, T1 ); \ -} while (0) - +#define SHA2s_MEXP( a, b, c, d ) \ + mm128_add4_32( SSG2_1( a ), b, SSG2_0( c ), d ); + +#define SHA256x4_MSG_EXPANSION( W ) \ + W[ 0] = SHA2s_MEXP( W[14], W[ 9], W[ 1], W[ 0] ); \ + W[ 1] = SHA2s_MEXP( W[15], W[10], W[ 2], W[ 1] ); \ + W[ 2] = SHA2s_MEXP( W[ 0], W[11], W[ 3], W[ 2] ); \ + W[ 3] = SHA2s_MEXP( W[ 1], W[12], W[ 4], W[ 3] ); \ + W[ 4] = SHA2s_MEXP( W[ 2], W[13], W[ 5], W[ 4] ); \ + W[ 5] = SHA2s_MEXP( W[ 3], W[14], W[ 6], W[ 5] ); \ + W[ 6] = SHA2s_MEXP( W[ 4], W[15], W[ 7], W[ 6] ); \ + 
W[ 7] = SHA2s_MEXP( W[ 5], W[ 0], W[ 8], W[ 7] ); \ + W[ 8] = SHA2s_MEXP( W[ 6], W[ 1], W[ 9], W[ 8] ); \ + W[ 9] = SHA2s_MEXP( W[ 7], W[ 2], W[10], W[ 9] ); \ + W[10] = SHA2s_MEXP( W[ 8], W[ 3], W[11], W[10] ); \ + W[11] = SHA2s_MEXP( W[ 9], W[ 4], W[12], W[11] ); \ + W[12] = SHA2s_MEXP( W[10], W[ 5], W[13], W[12] ); \ + W[13] = SHA2s_MEXP( W[11], W[ 6], W[14], W[13] ); \ + W[14] = SHA2s_MEXP( W[12], W[ 7], W[15], W[14] ); \ + W[15] = SHA2s_MEXP( W[13], W[ 8], W[ 0], W[15] ); -/* #define SHA2s_4WAY_STEP(A, B, C, D, E, F, G, H, i, j) \ do { \ __m128i T1, T2; \ @@ -132,47 +88,158 @@ do { \ T1 = _mm_add_epi32( H, mm128_add4_32( BSG2_1(E), CHs(E, F, G), \ K, W[i] ) ); \ T2 = _mm_add_epi32( BSG2_0(A), MAJs(A, B, C) ); \ + Y_xor_Z = X_xor_Y; \ D = _mm_add_epi32( D, T1 ); \ H = _mm_add_epi32( T1, T2 ); \ } while (0) -*/ +#define SHA256x4_16ROUNDS( A, B, C, D, E, F, G, H, j ) \ +{ \ + __m128i X_xor_Y, Y_xor_Z = _mm_xor_si128( B, C ); \ + SHA2s_4WAY_STEP( A, B, C, D, E, F, G, H, 0, j ); \ + SHA2s_4WAY_STEP( H, A, B, C, D, E, F, G, 1, j ); \ + SHA2s_4WAY_STEP( G, H, A, B, C, D, E, F, 2, j ); \ + SHA2s_4WAY_STEP( F, G, H, A, B, C, D, E, 3, j ); \ + SHA2s_4WAY_STEP( E, F, G, H, A, B, C, D, 4, j ); \ + SHA2s_4WAY_STEP( D, E, F, G, H, A, B, C, 5, j ); \ + SHA2s_4WAY_STEP( C, D, E, F, G, H, A, B, 6, j ); \ + SHA2s_4WAY_STEP( B, C, D, E, F, G, H, A, 7, j ); \ + SHA2s_4WAY_STEP( A, B, C, D, E, F, G, H, 8, j ); \ + SHA2s_4WAY_STEP( H, A, B, C, D, E, F, G, 9, j ); \ + SHA2s_4WAY_STEP( G, H, A, B, C, D, E, F, 10, j ); \ + SHA2s_4WAY_STEP( F, G, H, A, B, C, D, E, 11, j ); \ + SHA2s_4WAY_STEP( E, F, G, H, A, B, C, D, 12, j ); \ + SHA2s_4WAY_STEP( D, E, F, G, H, A, B, C, 13, j ); \ + SHA2s_4WAY_STEP( C, D, E, F, G, H, A, B, 14, j ); \ + SHA2s_4WAY_STEP( B, C, D, E, F, G, H, A, 15, j ); \ +} -static void -sha256_4way_round( sha256_4way_context *ctx, __m128i *in, __m128i r[8] ) +// LE data, no need to byte swap +static inline void SHA256_4WAY_TRANSFORM( __m128i *out, __m128i *W, + const __m128i *in ) { - register __m128i A, B, C, D, E, F, G, H; - __m128i W[16]; + __m128i A, B, C, D, E, F, G, H; + + A = in[0]; + B = in[1]; + C = in[2]; + D = in[3]; + E = in[4]; + F = in[5]; + G = in[6]; + H = in[7]; + + SHA256x4_16ROUNDS( A, B, C, D, E, F, G, H, 0 ); + SHA256x4_MSG_EXPANSION( W ); + SHA256x4_16ROUNDS( A, B, C, D, E, F, G, H, 16 ); + SHA256x4_MSG_EXPANSION( W ); + SHA256x4_16ROUNDS( A, B, C, D, E, F, G, H, 32 ); + SHA256x4_MSG_EXPANSION( W ); + SHA256x4_16ROUNDS( A, B, C, D, E, F, G, H, 48 ); + + out[0] = _mm_add_epi32( in[0], A ); + out[1] = _mm_add_epi32( in[1], B ); + out[2] = _mm_add_epi32( in[2], C ); + out[3] = _mm_add_epi32( in[3], D ); + out[4] = _mm_add_epi32( in[4], E ); + out[5] = _mm_add_epi32( in[5], F ); + out[6] = _mm_add_epi32( in[6], G ); + out[7] = _mm_add_epi32( in[7], H ); +} - mm128_block_bswap_32( W, in ); - mm128_block_bswap_32( W+8, in+8 ); +// LE data, no need to byte swap +void sha256_4way_transform_le( __m128i *state_out, const __m128i *data, + const __m128i *state_in ) +{ + __m128i W[16]; + memcpy_128( W, data, 16 ); + SHA256_4WAY_TRANSFORM( state_out, W, state_in ); +} - if ( ctx->initialized ) - { - A = r[0]; - B = r[1]; - C = r[2]; - D = r[3]; - E = r[4]; - F = r[5]; - G = r[6]; - H = r[7]; - } - else - { - A = m128_const1_64( 0x6A09E6676A09E667 ); - B = m128_const1_64( 0xBB67AE85BB67AE85 ); - C = m128_const1_64( 0x3C6EF3723C6EF372 ); - D = m128_const1_64( 0xA54FF53AA54FF53A ); - E = m128_const1_64( 0x510E527F510E527F ); - F = m128_const1_64( 0x9B05688C9B05688C ); - G = 
m128_const1_64( 0x1F83D9AB1F83D9AB ); - H = m128_const1_64( 0x5BE0CD195BE0CD19 ); - } +// BE data, need to byte swap input data +void sha256_4way_transform_be( __m128i *state_out, const __m128i *data, + const __m128i *state_in ) +{ + __m128i W[16]; + mm128_block_bswap_32( W, data ); + mm128_block_bswap_32( W+8, data+8 ); + SHA256_4WAY_TRANSFORM( state_out, W, state_in ); +} +void sha256_4way_prehash_3rounds( __m128i *state_mid, __m128i *X, + const __m128i *W, const __m128i *state_in ) +{ + __m128i A, B, C, D, E, F, G, H; + + // precalculate constant part msg expansion for second iteration. + X[ 0] = SHA2s_MEXP( W[14], W[ 9], W[ 1], W[ 0] ); + X[ 1] = SHA2s_MEXP( W[15], W[10], W[ 2], W[ 1] ); + X[ 2] = _mm_add_epi32( _mm_add_epi32( SSG2_1( X[ 0] ), W[11] ), + W[ 2] ); + X[ 3] = _mm_add_epi32( _mm_add_epi32( SSG2_1( X[ 1] ), W[12] ), + SSG2_0( W[ 4] ) ); + X[ 4] = _mm_add_epi32( _mm_add_epi32( W[13], SSG2_0( W[ 5] ) ), + W[ 4] ); + X[ 5] = _mm_add_epi32( _mm_add_epi32( W[14], SSG2_0( W[ 6] ) ), + W[ 5] ); + X [6] = _mm_add_epi32( _mm_add_epi32( W[15], SSG2_0( W[ 7] ) ), + W[ 6] ); + X[ 7] = _mm_add_epi32( _mm_add_epi32( X[ 0], SSG2_0( W[ 8] ) ), + W[ 7] ); + X[ 8] = _mm_add_epi32( _mm_add_epi32( X[ 1], SSG2_0( W[ 9] ) ), + W[ 8] ); + X[ 9] = _mm_add_epi32( SSG2_0( W[10] ), W[ 9] ); + X[10] = _mm_add_epi32( SSG2_0( W[11] ), W[10] ); + X[11] = _mm_add_epi32( SSG2_0( W[12] ), W[11] ); + X[12] = _mm_add_epi32( SSG2_0( W[13] ), W[12] ); + X[13] = _mm_add_epi32( SSG2_0( W[14] ), W[13] ); + X[14] = _mm_add_epi32( SSG2_0( W[15] ), W[14] ); + X[15] = _mm_add_epi32( SSG2_0( X[ 0] ), W[15] ); + + A = _mm_load_si128( state_in ); + B = _mm_load_si128( state_in + 1 ); + C = _mm_load_si128( state_in + 2 ); + D = _mm_load_si128( state_in + 3 ); + E = _mm_load_si128( state_in + 4 ); + F = _mm_load_si128( state_in + 5 ); + G = _mm_load_si128( state_in + 6 ); + H = _mm_load_si128( state_in + 7 ); + + __m128i X_xor_Y, Y_xor_Z = _mm_xor_si128( B, C ); + SHA2s_4WAY_STEP( A, B, C, D, E, F, G, H, 0, 0 ); SHA2s_4WAY_STEP( H, A, B, C, D, E, F, G, 1, 0 ); SHA2s_4WAY_STEP( G, H, A, B, C, D, E, F, 2, 0 ); + + _mm_store_si128( state_mid , A ); + _mm_store_si128( state_mid + 1, B ); + _mm_store_si128( state_mid + 2, C ); + _mm_store_si128( state_mid + 3, D ); + _mm_store_si128( state_mid + 4, E ); + _mm_store_si128( state_mid + 5, F ); + _mm_store_si128( state_mid + 6, G ); + _mm_store_si128( state_mid + 7, H ); +} + +void sha256_4way_final_rounds( __m128i *state_out, const __m128i *data, + const __m128i *state_in, const __m128i *state_mid, const __m128i *X ) +{ + __m128i A, B, C, D, E, F, G, H; + __m128i W[16]; + + memcpy_128( W, data, 16 ); + + A = _mm_load_si128( state_mid ); + B = _mm_load_si128( state_mid + 1 ); + C = _mm_load_si128( state_mid + 2 ); + D = _mm_load_si128( state_mid + 3 ); + E = _mm_load_si128( state_mid + 4 ); + F = _mm_load_si128( state_mid + 5 ); + G = _mm_load_si128( state_mid + 6 ); + H = _mm_load_si128( state_mid + 7 ); + + __m128i X_xor_Y, Y_xor_Z = _mm_xor_si128( G, H ); + SHA2s_4WAY_STEP( F, G, H, A, B, C, D, E, 3, 0 ); SHA2s_4WAY_STEP( E, F, G, H, A, B, C, D, 4, 0 ); SHA2s_4WAY_STEP( D, E, F, G, H, A, B, C, 5, 0 ); @@ -187,82 +254,168 @@ sha256_4way_round( sha256_4way_context *ctx, __m128i *in, __m128i r[8] ) SHA2s_4WAY_STEP( C, D, E, F, G, H, A, B, 14, 0 ); SHA2s_4WAY_STEP( B, C, D, E, F, G, H, A, 15, 0 ); - for ( int j = 16; j < 64; j += 16 ) - { - W[ 0] = SHA2s_MEXP( 14, 9, 1, 0 ); - W[ 1] = SHA2s_MEXP( 15, 10, 2, 1 ); - W[ 2] = SHA2s_MEXP( 0, 11, 3, 2 ); - W[ 3] = SHA2s_MEXP( 1, 12, 4, 3 
); - W[ 4] = SHA2s_MEXP( 2, 13, 5, 4 ); - W[ 5] = SHA2s_MEXP( 3, 14, 6, 5 ); - W[ 6] = SHA2s_MEXP( 4, 15, 7, 6 ); - W[ 7] = SHA2s_MEXP( 5, 0, 8, 7 ); - W[ 8] = SHA2s_MEXP( 6, 1, 9, 8 ); - W[ 9] = SHA2s_MEXP( 7, 2, 10, 9 ); - W[10] = SHA2s_MEXP( 8, 3, 11, 10 ); - W[11] = SHA2s_MEXP( 9, 4, 12, 11 ); - W[12] = SHA2s_MEXP( 10, 5, 13, 12 ); - W[13] = SHA2s_MEXP( 11, 6, 14, 13 ); - W[14] = SHA2s_MEXP( 12, 7, 15, 14 ); - W[15] = SHA2s_MEXP( 13, 8, 0, 15 ); - - SHA2s_4WAY_STEP( A, B, C, D, E, F, G, H, 0, j ); - SHA2s_4WAY_STEP( H, A, B, C, D, E, F, G, 1, j ); - SHA2s_4WAY_STEP( G, H, A, B, C, D, E, F, 2, j ); - SHA2s_4WAY_STEP( F, G, H, A, B, C, D, E, 3, j ); - SHA2s_4WAY_STEP( E, F, G, H, A, B, C, D, 4, j ); - SHA2s_4WAY_STEP( D, E, F, G, H, A, B, C, 5, j ); - SHA2s_4WAY_STEP( C, D, E, F, G, H, A, B, 6, j ); - SHA2s_4WAY_STEP( B, C, D, E, F, G, H, A, 7, j ); - SHA2s_4WAY_STEP( A, B, C, D, E, F, G, H, 8, j ); - SHA2s_4WAY_STEP( H, A, B, C, D, E, F, G, 9, j ); - SHA2s_4WAY_STEP( G, H, A, B, C, D, E, F, 10, j ); - SHA2s_4WAY_STEP( F, G, H, A, B, C, D, E, 11, j ); - SHA2s_4WAY_STEP( E, F, G, H, A, B, C, D, 12, j ); - SHA2s_4WAY_STEP( D, E, F, G, H, A, B, C, 13, j ); - SHA2s_4WAY_STEP( C, D, E, F, G, H, A, B, 14, j ); - SHA2s_4WAY_STEP( B, C, D, E, F, G, H, A, 15, j ); - } - - if ( ctx->initialized ) - { - r[0] = _mm_add_epi32( r[0], A ); - r[1] = _mm_add_epi32( r[1], B ); - r[2] = _mm_add_epi32( r[2], C ); - r[3] = _mm_add_epi32( r[3], D ); - r[4] = _mm_add_epi32( r[4], E ); - r[5] = _mm_add_epi32( r[5], F ); - r[6] = _mm_add_epi32( r[6], G ); - r[7] = _mm_add_epi32( r[7], H ); - } - else - { - ctx->initialized = true; - r[0] = _mm_add_epi32( A, m128_const1_64( 0x6A09E6676A09E667 ) ); - r[1] = _mm_add_epi32( B, m128_const1_64( 0xBB67AE85BB67AE85 ) ); - r[2] = _mm_add_epi32( C, m128_const1_64( 0x3C6EF3723C6EF372 ) ); - r[3] = _mm_add_epi32( D, m128_const1_64( 0xA54FF53AA54FF53A ) ); - r[4] = _mm_add_epi32( E, m128_const1_64( 0x510E527F510E527F ) ); - r[5] = _mm_add_epi32( F, m128_const1_64( 0x9B05688C9B05688C ) ); - r[6] = _mm_add_epi32( G, m128_const1_64( 0x1F83D9AB1F83D9AB ) ); - r[7] = _mm_add_epi32( H, m128_const1_64( 0x5BE0CD195BE0CD19 ) ); - } + // update precalculated msg expansion with new nonce: W[3]. 
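+   // The X[] array from sha256_4way_prehash_3rounds holds the parts of the
+   // message expansion that do not depend on the nonce; only W[3] carries the
+   // new nonce, so just the terms derived from W[3] are recalculated below
+   // before resuming at round 3.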
+ W[ 0] = X[ 0]; + W[ 1] = X[ 1]; + W[ 2] = _mm_add_epi32( X[ 2], SSG2_0( W[ 3] ) ); + W[ 3] = _mm_add_epi32( X[ 3], W[ 3] ); + W[ 4] = _mm_add_epi32( X[ 4], SSG2_1( W[ 2] ) ); + W[ 5] = _mm_add_epi32( X[ 5], SSG2_1( W[ 3] ) ); + W[ 6] = _mm_add_epi32( X[ 6], SSG2_1( W[ 4] ) ); + W[ 7] = _mm_add_epi32( X[ 7], SSG2_1( W[ 5] ) ); + W[ 8] = _mm_add_epi32( X[ 8], SSG2_1( W[ 6] ) ); + W[ 9] = _mm_add_epi32( X[ 9], _mm_add_epi32( SSG2_1( W[ 7] ), + W[ 2] ) ); + W[10] = _mm_add_epi32( X[10], _mm_add_epi32( SSG2_1( W[ 8] ), + W[ 3] ) ); + W[11] = _mm_add_epi32( X[11], _mm_add_epi32( SSG2_1( W[ 9] ), + W[ 4] ) ); + W[12] = _mm_add_epi32( X[12], _mm_add_epi32( SSG2_1( W[10] ), + W[ 5] ) ); + W[13] = _mm_add_epi32( X[13], _mm_add_epi32( SSG2_1( W[11] ), + W[ 6] ) ); + W[14] = _mm_add_epi32( X[14], _mm_add_epi32( SSG2_1( W[12] ), + W[ 7] ) ); + W[15] = _mm_add_epi32( X[15], _mm_add_epi32( SSG2_1( W[13] ), + W[ 8] ) ); + + SHA256x4_16ROUNDS( A, B, C, D, E, F, G, H, 16 ); + SHA256x4_MSG_EXPANSION( W ); + SHA256x4_16ROUNDS( A, B, C, D, E, F, G, H, 32 ); + SHA256x4_MSG_EXPANSION( W ); + SHA256x4_16ROUNDS( A, B, C, D, E, F, G, H, 48 ); + + A = _mm_add_epi32( A, _mm_load_si128( state_in ) ); + B = _mm_add_epi32( B, _mm_load_si128( state_in + 1 ) ); + C = _mm_add_epi32( C, _mm_load_si128( state_in + 2 ) ); + D = _mm_add_epi32( D, _mm_load_si128( state_in + 3 ) ); + E = _mm_add_epi32( E, _mm_load_si128( state_in + 4 ) ); + F = _mm_add_epi32( F, _mm_load_si128( state_in + 5 ) ); + G = _mm_add_epi32( G, _mm_load_si128( state_in + 6 ) ); + H = _mm_add_epi32( H, _mm_load_si128( state_in + 7 ) ); + + _mm_store_si128( state_out , A ); + _mm_store_si128( state_out + 1, B ); + _mm_store_si128( state_out + 2, C ); + _mm_store_si128( state_out + 3, D ); + _mm_store_si128( state_out + 4, E ); + _mm_store_si128( state_out + 5, F ); + _mm_store_si128( state_out + 6, G ); + _mm_store_si128( state_out + 7, H ); } +// returns 0 if hash aborted early and invalid. 
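+// Shortened transform for mining: the value H must hold after round 60,
+// before K256[60] is added, for the last word of the final hash to be zero
+// (assuming the standard initial state) is known in advance. If no lane
+// holds that value the remaining rounds are skipped and 0 is returned.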
+int sha256_4way_transform_le_short( __m128i *state_out, const __m128i *data, + const __m128i *state_in ) +{ + __m128i A, B, C, D, E, F, G, H; + __m128i W[16]; memcpy_128( W, data, 16 ); + // Value required by H after round 60 to produce valid final hash + const __m128i H_ = m128_const1_32( 0x136032ED ); + + A = _mm_load_si128( state_in ); + B = _mm_load_si128( state_in+1 ); + C = _mm_load_si128( state_in+2 ); + D = _mm_load_si128( state_in+3 ); + E = _mm_load_si128( state_in+4 ); + F = _mm_load_si128( state_in+5 ); + G = _mm_load_si128( state_in+6 ); + H = _mm_load_si128( state_in+7 ); + + SHA256x4_16ROUNDS( A, B, C, D, E, F, G, H, 0 ); + SHA256x4_MSG_EXPANSION( W ); + SHA256x4_16ROUNDS( A, B, C, D, E, F, G, H, 16 ); + SHA256x4_MSG_EXPANSION( W ); + SHA256x4_16ROUNDS( A, B, C, D, E, F, G, H, 32 ); + + W[ 0] = SHA2s_MEXP( W[14], W[ 9], W[ 1], W[ 0] ); + W[ 1] = SHA2s_MEXP( W[15], W[10], W[ 2], W[ 1] ); + W[ 2] = SHA2s_MEXP( W[ 0], W[11], W[ 3], W[ 2] ); + W[ 3] = SHA2s_MEXP( W[ 1], W[12], W[ 4], W[ 3] ); + W[ 4] = SHA2s_MEXP( W[ 2], W[13], W[ 5], W[ 4] ); + W[ 5] = SHA2s_MEXP( W[ 3], W[14], W[ 6], W[ 5] ); + W[ 6] = SHA2s_MEXP( W[ 4], W[15], W[ 7], W[ 6] ); + W[ 7] = SHA2s_MEXP( W[ 5], W[ 0], W[ 8], W[ 7] ); + W[ 8] = SHA2s_MEXP( W[ 6], W[ 1], W[ 9], W[ 8] ); + W[ 9] = SHA2s_MEXP( W[ 7], W[ 2], W[10], W[ 9] ); + W[10] = SHA2s_MEXP( W[ 8], W[ 3], W[11], W[10] ); + W[11] = SHA2s_MEXP( W[ 9], W[ 4], W[12], W[11] ); + W[12] = SHA2s_MEXP( W[10], W[ 5], W[13], W[12] ); + + __m128i X_xor_Y, Y_xor_Z = _mm_xor_si128( B, C ); + + SHA2s_4WAY_STEP( A, B, C, D, E, F, G, H, 0, 48 ); + SHA2s_4WAY_STEP( H, A, B, C, D, E, F, G, 1, 48 ); + SHA2s_4WAY_STEP( G, H, A, B, C, D, E, F, 2, 48 ); + SHA2s_4WAY_STEP( F, G, H, A, B, C, D, E, 3, 48 ); + SHA2s_4WAY_STEP( E, F, G, H, A, B, C, D, 4, 48 ); + SHA2s_4WAY_STEP( D, E, F, G, H, A, B, C, 5, 48 ); + SHA2s_4WAY_STEP( C, D, E, F, G, H, A, B, 6, 48 ); + SHA2s_4WAY_STEP( B, C, D, E, F, G, H, A, 7, 48 ); + SHA2s_4WAY_STEP( A, B, C, D, E, F, G, H, 8, 48 ); + + __m128i T1_57 = _mm_add_epi32( G, + mm128_add4_32( BSG2_1( D ), CHs( D, E, F ), + _mm_set1_epi32( K256[57] ), W[ 9] ) ); + C = _mm_add_epi32( C, T1_57 ); + + __m128i T1_58 = _mm_add_epi32( F, + mm128_add4_32( BSG2_1( C ), CHs( C, D, E ), + _mm_set1_epi32( K256[58] ), W[10] ) ); + B = _mm_add_epi32( B, T1_58 ); + + __m128i T1_59 = _mm_add_epi32( E, + mm128_add4_32( BSG2_1( B ), CHs( B, C, D ), + _mm_set1_epi32( K256[59] ), W[11] ) ); + A = _mm_add_epi32( A, T1_59 ); + + __m128i T1_60 = mm128_add4_32( D, BSG2_1( A ), CHs( A, B, C ), W[12] ); + H = _mm_add_epi32( H, T1_60 ); + + if ( _mm_movemask_ps( (__m128)_mm_cmpeq_epi32( H, H_ ) ) == 0 ) + return 0; + + __m128i K60 = _mm_set1_epi32( K256[60] ); + H = _mm_add_epi32( H, K60 ); + + G = _mm_add_epi32( T1_57, _mm_add_epi32( BSG2_0( H ), + MAJs( H, A, B ) ) ); + F = _mm_add_epi32( T1_58, _mm_add_epi32( BSG2_0( G ), + MAJs( G, H, A ) ) ); + E = _mm_add_epi32( T1_59, _mm_add_epi32( BSG2_0( F ), + MAJs( F, G, H ) ) ); + D = mm128_add4_32( T1_60, BSG2_0( E ), MAJs( E, F, G ), K60 ); + + W[13] = SHA2s_MEXP( W[11], W[ 6], W[14], W[13] ); + W[14] = SHA2s_MEXP( W[12], W[ 7], W[15], W[14] ); + W[15] = SHA2s_MEXP( W[13], W[ 8], W[ 0], W[15] ); + + SHA2s_4WAY_STEP( D, E, F, G, H, A, B, C, 13, 48 ); + SHA2s_4WAY_STEP( C, D, E, F, G, H, A, B, 14, 48 ); + SHA2s_4WAY_STEP( B, C, D, E, F, G, H, A, 15, 48 ); + + state_out[0] = _mm_add_epi32( state_in[0], A ); + state_out[1] = _mm_add_epi32( state_in[1], B ); + state_out[2] = _mm_add_epi32( state_in[2], C ); + state_out[3] = 
_mm_add_epi32( state_in[3], D ); + state_out[4] = _mm_add_epi32( state_in[4], E ); + state_out[5] = _mm_add_epi32( state_in[5], F ); + state_out[6] = _mm_add_epi32( state_in[6], G ); + state_out[7] = _mm_add_epi32( state_in[7], H ); + return 1; +} + void sha256_4way_init( sha256_4way_context *sc ) { - sc->initialized = false; sc->count_high = sc->count_low = 0; -/* - sc->val[0] = _mm_set1_epi32( H256[0] ); - sc->val[1] = _mm_set1_epi32( H256[1] ); - sc->val[2] = _mm_set1_epi32( H256[2] ); - sc->val[3] = _mm_set1_epi32( H256[3] ); - sc->val[4] = _mm_set1_epi32( H256[4] ); - sc->val[5] = _mm_set1_epi32( H256[5] ); - sc->val[6] = _mm_set1_epi32( H256[6] ); - sc->val[7] = _mm_set1_epi32( H256[7] ); -*/ + sc->val[0] = m128_const1_64( 0x6A09E6676A09E667 ); + sc->val[1] = m128_const1_64( 0xBB67AE85BB67AE85 ); + sc->val[2] = m128_const1_64( 0x3C6EF3723C6EF372 ); + sc->val[3] = m128_const1_64( 0xA54FF53AA54FF53A ); + sc->val[4] = m128_const1_64( 0x510E527F510E527F ); + sc->val[5] = m128_const1_64( 0x9B05688C9B05688C ); + sc->val[6] = m128_const1_64( 0x1F83D9AB1F83D9AB ); + sc->val[7] = m128_const1_64( 0x5BE0CD195BE0CD19 ); } void sha256_4way_update( sha256_4way_context *sc, const void *data, size_t len ) @@ -286,7 +439,7 @@ void sha256_4way_update( sha256_4way_context *sc, const void *data, size_t len ) len -= clen; if ( ptr == buf_size ) { - sha256_4way_round( sc, sc->buf, sc->val ); + sha256_4way_transform_be( sc->val, sc->buf, sc->val ); ptr = 0; } clow = sc->count_low; @@ -311,7 +464,7 @@ void sha256_4way_close( sha256_4way_context *sc, void *dst ) if ( ptr > pad ) { memset_zero_128( sc->buf + (ptr>>2), (buf_size - ptr) >> 2 ); - sha256_4way_round( sc, sc->buf, sc->val ); + sha256_4way_transform_be( sc->val, sc->buf, sc->val ); memset_zero_128( sc->buf, pad >> 2 ); } else @@ -321,11 +474,9 @@ void sha256_4way_close( sha256_4way_context *sc, void *dst ) high = (sc->count_high << 3) | (low >> 29); low = low << 3; - sc->buf[ pad >> 2 ] = - mm128_bswap_32( m128_const1_32( high ) ); - sc->buf[ ( pad+4 ) >> 2 ] = - mm128_bswap_32( m128_const1_32( low ) ); - sha256_4way_round( sc, sc->buf, sc->val ); + sc->buf[ pad >> 2 ] = m128_const1_32( bswap_32( high ) ); + sc->buf[( pad+4 ) >> 2 ] = m128_const1_32( bswap_32( low ) ); + sha256_4way_transform_be( sc->val, sc->buf, sc->val ); mm128_block_bswap_32( dst, sc->val ); } @@ -342,78 +493,300 @@ void sha256_4way_full( void *dst, const void *data, size_t len ) // SHA-256 8 way -#define CHx(X, Y, Z) \ - _mm256_xor_si256( _mm256_and_si256( _mm256_xor_si256( Y, Z ), X ), Z ) - -#define MAJx(X, Y, Z) \ - _mm256_or_si256( _mm256_and_si256( X, Y ), \ - _mm256_and_si256( _mm256_or_si256( X, Y ), Z ) ) - #define BSG2_0x(x) \ - _mm256_xor_si256( _mm256_xor_si256( \ - mm256_ror_32(x, 2), mm256_ror_32(x, 13) ), mm256_ror_32( x, 22) ) + _mm256_xor_si256( _mm256_xor_si256( mm256_ror_32( x, 2 ), \ + mm256_ror_32( x, 13 ) ), \ + mm256_ror_32( x, 22 ) ) #define BSG2_1x(x) \ - _mm256_xor_si256( _mm256_xor_si256( \ - mm256_ror_32(x, 6), mm256_ror_32(x, 11) ), mm256_ror_32( x, 25) ) + _mm256_xor_si256( _mm256_xor_si256( mm256_ror_32( x, 6 ), \ + mm256_ror_32( x, 11 ) ), \ + mm256_ror_32( x, 25 ) ) #define SSG2_0x(x) \ - _mm256_xor_si256( _mm256_xor_si256( \ - mm256_ror_32(x, 7), mm256_ror_32(x, 18) ), _mm256_srli_epi32(x, 3) ) + _mm256_xor_si256( _mm256_xor_si256( mm256_ror_32( x, 7 ), \ + mm256_ror_32( x, 18 ) ), \ + _mm256_srli_epi32( x, 3 ) ) #define SSG2_1x(x) \ - _mm256_xor_si256( _mm256_xor_si256( \ - mm256_ror_32(x, 17), mm256_ror_32(x, 19) ), _mm256_srli_epi32(x, 10) ) 
+ _mm256_xor_si256( _mm256_xor_si256( mm256_ror_32( x, 17 ), \ + mm256_ror_32( x, 19 ) ), \ + _mm256_srli_epi32( x, 10 ) ) #define SHA2x_MEXP( a, b, c, d ) \ - mm256_add4_32( SSG2_1x( W[a] ), W[b], SSG2_0x( W[c] ), W[d] ); + mm256_add4_32( SSG2_1x( a ), b, SSG2_0x( c ), d ); + +#define SHA256x8_MSG_EXPANSION( W ) \ + W[ 0] = SHA2x_MEXP( W[14], W[ 9], W[ 1], W[ 0] ); \ + W[ 1] = SHA2x_MEXP( W[15], W[10], W[ 2], W[ 1] ); \ + W[ 2] = SHA2x_MEXP( W[ 0], W[11], W[ 3], W[ 2] ); \ + W[ 3] = SHA2x_MEXP( W[ 1], W[12], W[ 4], W[ 3] ); \ + W[ 4] = SHA2x_MEXP( W[ 2], W[13], W[ 5], W[ 4] ); \ + W[ 5] = SHA2x_MEXP( W[ 3], W[14], W[ 6], W[ 5] ); \ + W[ 6] = SHA2x_MEXP( W[ 4], W[15], W[ 7], W[ 6] ); \ + W[ 7] = SHA2x_MEXP( W[ 5], W[ 0], W[ 8], W[ 7] ); \ + W[ 8] = SHA2x_MEXP( W[ 6], W[ 1], W[ 9], W[ 8] ); \ + W[ 9] = SHA2x_MEXP( W[ 7], W[ 2], W[10], W[ 9] ); \ + W[10] = SHA2x_MEXP( W[ 8], W[ 3], W[11], W[10] ); \ + W[11] = SHA2x_MEXP( W[ 9], W[ 4], W[12], W[11] ); \ + W[12] = SHA2x_MEXP( W[10], W[ 5], W[13], W[12] ); \ + W[13] = SHA2x_MEXP( W[11], W[ 6], W[14], W[13] ); \ + W[14] = SHA2x_MEXP( W[12], W[ 7], W[15], W[14] ); \ + W[15] = SHA2x_MEXP( W[13], W[ 8], W[ 0], W[15] ); + + +// With AVX512VL ternary logic optimizations are available. +// If not optimize by forwarding the result of X^Y in MAJ to the next round +// to avoid recalculating it as Y^Z. This optimization is not applicable +// when MAJ is optimized with ternary logic. + +#if defined(__AVX512VL__) + +#define CHx(X, Y, Z) _mm256_ternarylogic_epi32( X, Y, Z, 0xca ) + +#define MAJx(X, Y, Z) _mm256_ternarylogic_epi32( X, Y, Z, 0xe8 ) + +#define SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H, i, j ) \ +do { \ + __m256i T0 = _mm256_add_epi32( _mm256_set1_epi32( K256[ (j)+(i) ] ), \ + W[ i ] ); \ + __m256i T1 = BSG2_1x( E ); \ + __m256i T2 = BSG2_0x( A ); \ + T0 = _mm256_add_epi32( T0, CHx( E, F, G ) ); \ + T1 = _mm256_add_epi32( T1, H ); \ + T2 = _mm256_add_epi32( T2, MAJx( A, B, C ) ); \ + T1 = _mm256_add_epi32( T1, T0 ); \ + D = _mm256_add_epi32( D, T1 ); \ + H = _mm256_add_epi32( T1, T2 ); \ +} while (0) -#define SHA2s_8WAY_STEP(A, B, C, D, E, F, G, H, i, j) \ +#define SHA256x8_16ROUNDS( A, B, C, D, E, F, G, H, j ) \ + SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H, 0, j ); \ + SHA2s_8WAY_STEP( H, A, B, C, D, E, F, G, 1, j ); \ + SHA2s_8WAY_STEP( G, H, A, B, C, D, E, F, 2, j ); \ + SHA2s_8WAY_STEP( F, G, H, A, B, C, D, E, 3, j ); \ + SHA2s_8WAY_STEP( E, F, G, H, A, B, C, D, 4, j ); \ + SHA2s_8WAY_STEP( D, E, F, G, H, A, B, C, 5, j ); \ + SHA2s_8WAY_STEP( C, D, E, F, G, H, A, B, 6, j ); \ + SHA2s_8WAY_STEP( B, C, D, E, F, G, H, A, 7, j ); \ + SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H, 8, j ); \ + SHA2s_8WAY_STEP( H, A, B, C, D, E, F, G, 9, j ); \ + SHA2s_8WAY_STEP( G, H, A, B, C, D, E, F, 10, j ); \ + SHA2s_8WAY_STEP( F, G, H, A, B, C, D, E, 11, j ); \ + SHA2s_8WAY_STEP( E, F, G, H, A, B, C, D, 12, j ); \ + SHA2s_8WAY_STEP( D, E, F, G, H, A, B, C, 13, j ); \ + SHA2s_8WAY_STEP( C, D, E, F, G, H, A, B, 14, j ); \ + SHA2s_8WAY_STEP( B, C, D, E, F, G, H, A, 15, j ); + +#else // AVX2 + +#define CHx(X, Y, Z) \ + _mm256_xor_si256( _mm256_and_si256( _mm256_xor_si256( Y, Z ), X ), Z ) + +// Use saved X_xor_Y from previous round, now called Y_xor_Z, +// and save new X_xor_Y, for next round. 
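+// MAJ(X,Y,Z) = (X&Y) | (X&Z) | (Y&Z) = Y ^ ( (X^Y) & (Y^Z) ), and the next
+// round's Y^Z is this round's X^Y, so one XOR per round is saved.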
+#define MAJx(X, Y, Z) \ + _mm256_xor_si256( Y, _mm256_and_si256( X_xor_Y = _mm256_xor_si256( X, Y ), \ + Y_xor_Z ) ) + + +#define SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H, i, j ) \ do { \ - __m256i T1, T2; \ - __m256i K = _mm256_set1_epi32( K256[( (j)+(i) )] ); \ - T1 = _mm256_add_epi32( H, mm256_add4_32( BSG2_1x(E), CHx(E, F, G), \ - K, W[i] ) ); \ - T2 = _mm256_add_epi32( BSG2_0x(A), MAJx(A, B, C) ); \ + __m256i T0 = _mm256_add_epi32( _mm256_set1_epi32( K256[(j)+(i)] ), W[i] ); \ + __m256i T1 = BSG2_1x( E ); \ + __m256i T2 = BSG2_0x( A ); \ + T0 = _mm256_add_epi32( T0, CHx( E, F, G ) ); \ + T1 = _mm256_add_epi32( T1, H ); \ + T2 = _mm256_add_epi32( T2, MAJx( A, B, C ) ); \ + T1 = _mm256_add_epi32( T1, T0 ); \ + Y_xor_Z = X_xor_Y; \ + D = _mm256_add_epi32( D, T1 ); \ + H = _mm256_add_epi32( T1, T2 ); \ +} while (0) + + +// read Y_xor_Z, update X_xor_Y +#define MAJ_2step(X, Y, Z, X_xor_Y, Y_xor_Z ) \ + _mm256_xor_si256( Y, _mm256_and_si256( X_xor_Y = _mm256_xor_si256( X, Y ), \ + Y_xor_Z ) ) + +// start with toc initialized to y^z: toc = B ^ C +// First round reads toc as Y_xor_Z and saves X_xor_Y as tic. +// Second round reads tic as Y_xor_Z and saves X_xor_Y as toc. + +#define SHA256_8WAY_2STEP( A, B, C, D, E, F, G, H, i0, i1, j ) \ +do { \ + __m256i T0 = _mm256_add_epi32( _mm256_set1_epi32( K256[ (j)+(i0) ] ), \ + W[ i0 ] ); \ + __m256i T1 = BSG2_1x( E ); \ + __m256i T2 = BSG2_0x( A ); \ + T0 = _mm256_add_epi32( T0, CHx( E, F, G ) ); \ + T1 = _mm256_add_epi32( T1, H ); \ + T2 = _mm256_add_epi32( T2, MAJ_2step( A, B, C, tic, toc ) ); \ + T1 = _mm256_add_epi32( T1, T0 ); \ D = _mm256_add_epi32( D, T1 ); \ H = _mm256_add_epi32( T1, T2 ); \ +\ + T0 = _mm256_add_epi32( _mm256_set1_epi32( K256[ (j)+(i1) ] ), \ + W[ (i1) ] ); \ + T1 = BSG2_1x( D ); \ + T2 = BSG2_0x( H ); \ + T0 = _mm256_add_epi32( T0, CHx( D, E, F ) ); \ + T1 = _mm256_add_epi32( T1, G ); \ + T2 = _mm256_add_epi32( T2, MAJ_2step( H, A, B, toc, tic ) ); \ + T1 = _mm256_add_epi32( T1, T0 ); \ + C = _mm256_add_epi32( C, T1 ); \ + G = _mm256_add_epi32( T1, T2 ); \ } while (0) -static void -sha256_8way_round( sha256_8way_context *ctx, __m256i *in, __m256i r[8] ) +#define SHA256x8_16ROUNDS( A, B, C, D, E, F, G, H, j ) \ +{ \ + __m256i tic, toc = _mm256_xor_si256( B, C ); \ + SHA256_8WAY_2STEP( A, B, C, D, E, F, G, H, 0, 1, j ); \ + SHA256_8WAY_2STEP( G, H, A, B, C, D, E, F, 2, 3, j ); \ + SHA256_8WAY_2STEP( E, F, G, H, A, B, C, D, 4, 5, j ); \ + SHA256_8WAY_2STEP( C, D, E, F, G, H, A, B, 6, 7, j ); \ + SHA256_8WAY_2STEP( A, B, C, D, E, F, G, H, 8, 9, j ); \ + SHA256_8WAY_2STEP( G, H, A, B, C, D, E, F, 10, 11, j ); \ + SHA256_8WAY_2STEP( E, F, G, H, A, B, C, D, 12, 13, j ); \ + SHA256_8WAY_2STEP( C, D, E, F, G, H, A, B, 14, 15, j ); \ +} + +#endif // AVX512VL else AVX2 + +static inline void SHA256_8WAY_TRANSFORM( __m256i *out, __m256i *W, + const __m256i *in ) \ { - register __m256i A, B, C, D, E, F, G, H; - __m256i W[16]; + __m256i A, B, C, D, E, F, G, H; - mm256_block_bswap_32( W , in ); - mm256_block_bswap_32( W+8, in+8 ); + A = _mm256_load_si256( in ); + B = _mm256_load_si256( in+1 ); + C = _mm256_load_si256( in+2 ); + D = _mm256_load_si256( in+3 ); + E = _mm256_load_si256( in+4 ); + F = _mm256_load_si256( in+5 ); + G = _mm256_load_si256( in+6 ); + H = _mm256_load_si256( in+7 ); - if ( ctx->initialized ) - { - A = r[0]; - B = r[1]; - C = r[2]; - D = r[3]; - E = r[4]; - F = r[5]; - G = r[6]; - H = r[7]; - } - else + SHA256x8_16ROUNDS( A, B, C, D, E, F, G, H, 0 ); + + for ( int j = 16; j < 64; j += 16 ) { - A = m256_const1_64( 
0x6A09E6676A09E667 ); - B = m256_const1_64( 0xBB67AE85BB67AE85 ); - C = m256_const1_64( 0x3C6EF3723C6EF372 ); - D = m256_const1_64( 0xA54FF53AA54FF53A ); - E = m256_const1_64( 0x510E527F510E527F ); - F = m256_const1_64( 0x9B05688C9B05688C ); - G = m256_const1_64( 0x1F83D9AB1F83D9AB ); - H = m256_const1_64( 0x5BE0CD195BE0CD19 ); + SHA256x8_MSG_EXPANSION( W ); + SHA256x8_16ROUNDS( A, B, C, D, E, F, G, H, j ); } + out[0] = _mm256_add_epi32( in[0], A ); + out[1] = _mm256_add_epi32( in[1], B ); + out[2] = _mm256_add_epi32( in[2], C ); + out[3] = _mm256_add_epi32( in[3], D ); + out[4] = _mm256_add_epi32( in[4], E ); + out[5] = _mm256_add_epi32( in[5], F ); + out[6] = _mm256_add_epi32( in[6], G ); + out[7] = _mm256_add_epi32( in[7], H ); +} + +// accepts LE input data +void sha256_8way_transform_le( __m256i *state_out, const __m256i *data, + const __m256i *state_in ) +{ + __m256i W[16]; + memcpy_256( W, data, 16 ); + SHA256_8WAY_TRANSFORM( state_out, W, state_in ); +} + +// Accepts BE input data, need to bswap +void sha256_8way_transform_be( __m256i *state_out, const __m256i *data, + const __m256i *state_in ) +{ + __m256i W[16]; + mm256_block_bswap_32( W , data ); + mm256_block_bswap_32( W+8, data+8 ); + SHA256_8WAY_TRANSFORM( state_out, W, state_in ); +} + +// Aggressive prehashing, LE byte order +void sha256_8way_prehash_3rounds( __m256i *state_mid, __m256i *X, + const __m256i *W, const __m256i *state_in ) +{ + __m256i A, B, C, D, E, F, G, H; + + X[ 0] = SHA2x_MEXP( W[14], W[ 9], W[ 1], W[ 0] ); + X[ 1] = SHA2x_MEXP( W[15], W[10], W[ 2], W[ 1] ); + X[ 2] = _mm256_add_epi32( _mm256_add_epi32( SSG2_1x( X[ 0] ), W[11] ), + W[ 2] ); + X[ 3] = _mm256_add_epi32( _mm256_add_epi32( SSG2_1x( X[ 1] ), W[12] ), + SSG2_0x( W[ 4] ) ); + X[ 4] = _mm256_add_epi32( _mm256_add_epi32( W[13], SSG2_0x( W[ 5] ) ), + W[ 4] ); + X[ 5] = _mm256_add_epi32( _mm256_add_epi32( W[14], SSG2_0x( W[ 6] ) ), + W[ 5] ); + X [6] = _mm256_add_epi32( _mm256_add_epi32( W[15], SSG2_0x( W[ 7] ) ), + W[ 6] ); + X[ 7] = _mm256_add_epi32( _mm256_add_epi32( X[ 0], SSG2_0x( W[ 8] ) ), + W[ 7] ); + X[ 8] = _mm256_add_epi32( _mm256_add_epi32( X[ 1], SSG2_0x( W[ 9] ) ), + W[ 8] ); + X[ 9] = _mm256_add_epi32( SSG2_0x( W[10] ), W[ 9] ); + X[10] = _mm256_add_epi32( SSG2_0x( W[11] ), W[10] ); + X[11] = _mm256_add_epi32( SSG2_0x( W[12] ), W[11] ); + X[12] = _mm256_add_epi32( SSG2_0x( W[13] ), W[12] ); + X[13] = _mm256_add_epi32( SSG2_0x( W[14] ), W[13] ); + X[14] = _mm256_add_epi32( SSG2_0x( W[15] ), W[14] ); + X[15] = _mm256_add_epi32( SSG2_0x( X[ 0] ), W[15] ); + + A = _mm256_load_si256( state_in ); + B = _mm256_load_si256( state_in + 1 ); + C = _mm256_load_si256( state_in + 2 ); + D = _mm256_load_si256( state_in + 3 ); + E = _mm256_load_si256( state_in + 4 ); + F = _mm256_load_si256( state_in + 5 ); + G = _mm256_load_si256( state_in + 6 ); + H = _mm256_load_si256( state_in + 7 ); + +#if !defined(__AVX512VL__) + __m256i X_xor_Y, Y_xor_Z = _mm256_xor_si256( B, C ); +#endif + SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H, 0, 0 ); SHA2s_8WAY_STEP( H, A, B, C, D, E, F, G, 1, 0 ); SHA2s_8WAY_STEP( G, H, A, B, C, D, E, F, 2, 0 ); + + _mm256_store_si256( state_mid , A ); + _mm256_store_si256( state_mid + 1, B ); + _mm256_store_si256( state_mid + 2, C ); + _mm256_store_si256( state_mid + 3, D ); + _mm256_store_si256( state_mid + 4, E ); + _mm256_store_si256( state_mid + 5, F ); + _mm256_store_si256( state_mid + 6, G ); + _mm256_store_si256( state_mid + 7, H ); +} + +void sha256_8way_final_rounds( __m256i *state_out, const __m256i *data, + const __m256i 
*state_in, const __m256i *state_mid, const __m256i *X ) +{ + __m256i A, B, C, D, E, F, G, H; + __m256i W[16]; + + memcpy_256( W, data, 16 ); + + A = _mm256_load_si256( state_mid ); + B = _mm256_load_si256( state_mid + 1 ); + C = _mm256_load_si256( state_mid + 2 ); + D = _mm256_load_si256( state_mid + 3 ); + E = _mm256_load_si256( state_mid + 4 ); + F = _mm256_load_si256( state_mid + 5 ); + G = _mm256_load_si256( state_mid + 6 ); + H = _mm256_load_si256( state_mid + 7 ); + +// SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H, 0, 0 ); +// SHA2s_8WAY_STEP( H, A, B, C, D, E, F, G, 1, 0 ); +// SHA2s_8WAY_STEP( G, H, A, B, C, D, E, F, 2, 0 ); + +#if !defined(__AVX512VL__) + __m256i X_xor_Y, Y_xor_Z = _mm256_xor_si256( G, H ); +#endif + SHA2s_8WAY_STEP( F, G, H, A, B, C, D, E, 3, 0 ); SHA2s_8WAY_STEP( E, F, G, H, A, B, C, D, 4, 0 ); SHA2s_8WAY_STEP( D, E, F, G, H, A, B, C, 5, 0 ); @@ -428,85 +801,186 @@ sha256_8way_round( sha256_8way_context *ctx, __m256i *in, __m256i r[8] ) SHA2s_8WAY_STEP( C, D, E, F, G, H, A, B, 14, 0 ); SHA2s_8WAY_STEP( B, C, D, E, F, G, H, A, 15, 0 ); - for ( int j = 16; j < 64; j += 16 ) - { - W[ 0] = SHA2x_MEXP( 14, 9, 1, 0 ); - W[ 1] = SHA2x_MEXP( 15, 10, 2, 1 ); - W[ 2] = SHA2x_MEXP( 0, 11, 3, 2 ); - W[ 3] = SHA2x_MEXP( 1, 12, 4, 3 ); - W[ 4] = SHA2x_MEXP( 2, 13, 5, 4 ); - W[ 5] = SHA2x_MEXP( 3, 14, 6, 5 ); - W[ 6] = SHA2x_MEXP( 4, 15, 7, 6 ); - W[ 7] = SHA2x_MEXP( 5, 0, 8, 7 ); - W[ 8] = SHA2x_MEXP( 6, 1, 9, 8 ); - W[ 9] = SHA2x_MEXP( 7, 2, 10, 9 ); - W[10] = SHA2x_MEXP( 8, 3, 11, 10 ); - W[11] = SHA2x_MEXP( 9, 4, 12, 11 ); - W[12] = SHA2x_MEXP( 10, 5, 13, 12 ); - W[13] = SHA2x_MEXP( 11, 6, 14, 13 ); - W[14] = SHA2x_MEXP( 12, 7, 15, 14 ); - W[15] = SHA2x_MEXP( 13, 8, 0, 15 ); - - SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H, 0, j ); - SHA2s_8WAY_STEP( H, A, B, C, D, E, F, G, 1, j ); - SHA2s_8WAY_STEP( G, H, A, B, C, D, E, F, 2, j ); - SHA2s_8WAY_STEP( F, G, H, A, B, C, D, E, 3, j ); - SHA2s_8WAY_STEP( E, F, G, H, A, B, C, D, 4, j ); - SHA2s_8WAY_STEP( D, E, F, G, H, A, B, C, 5, j ); - SHA2s_8WAY_STEP( C, D, E, F, G, H, A, B, 6, j ); - SHA2s_8WAY_STEP( B, C, D, E, F, G, H, A, 7, j ); - SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H, 8, j ); - SHA2s_8WAY_STEP( H, A, B, C, D, E, F, G, 9, j ); - SHA2s_8WAY_STEP( G, H, A, B, C, D, E, F, 10, j ); - SHA2s_8WAY_STEP( F, G, H, A, B, C, D, E, 11, j ); - SHA2s_8WAY_STEP( E, F, G, H, A, B, C, D, 12, j ); - SHA2s_8WAY_STEP( D, E, F, G, H, A, B, C, 13, j ); - SHA2s_8WAY_STEP( C, D, E, F, G, H, A, B, 14, j ); - SHA2s_8WAY_STEP( B, C, D, E, F, G, H, A, 15, j ); - } + W[ 0] = X[ 0]; + W[ 1] = X[ 1]; + W[ 2] = _mm256_add_epi32( X[ 2], SSG2_0x( W[ 3] ) ); + W[ 3] = _mm256_add_epi32( X[ 3], W[ 3] ); + W[ 4] = _mm256_add_epi32( X[ 4], SSG2_1x( W[ 2] ) ); + W[ 5] = _mm256_add_epi32( X[ 5], SSG2_1x( W[ 3] ) ); + W[ 6] = _mm256_add_epi32( X[ 6], SSG2_1x( W[ 4] ) ); + W[ 7] = _mm256_add_epi32( X[ 7], SSG2_1x( W[ 5] ) ); + W[ 8] = _mm256_add_epi32( X[ 8], SSG2_1x( W[ 6] ) ); + W[ 9] = _mm256_add_epi32( X[ 9], _mm256_add_epi32( SSG2_1x( W[ 7] ), + W[ 2] ) ); + W[10] = _mm256_add_epi32( X[10], _mm256_add_epi32( SSG2_1x( W[ 8] ), + W[ 3] ) ); + W[11] = _mm256_add_epi32( X[11], _mm256_add_epi32( SSG2_1x( W[ 9] ), + W[ 4] ) ); + W[12] = _mm256_add_epi32( X[12], _mm256_add_epi32( SSG2_1x( W[10] ), + W[ 5] ) ); + W[13] = _mm256_add_epi32( X[13], _mm256_add_epi32( SSG2_1x( W[11] ), + W[ 6] ) ); + W[14] = _mm256_add_epi32( X[14], _mm256_add_epi32( SSG2_1x( W[12] ), + W[ 7] ) ); + W[15] = _mm256_add_epi32( X[15], _mm256_add_epi32( SSG2_1x( W[13] ), + W[ 8] ) ); + + 
SHA256x8_16ROUNDS( A, B, C, D, E, F, G, H, 16 ); + SHA256x8_MSG_EXPANSION( W ); + SHA256x8_16ROUNDS( A, B, C, D, E, F, G, H, 32 ); + SHA256x8_MSG_EXPANSION( W ); + SHA256x8_16ROUNDS( A, B, C, D, E, F, G, H, 48 ); + + A = _mm256_add_epi32( A, _mm256_load_si256( state_in ) ); + B = _mm256_add_epi32( B, _mm256_load_si256( state_in + 1 ) ); + C = _mm256_add_epi32( C, _mm256_load_si256( state_in + 2 ) ); + D = _mm256_add_epi32( D, _mm256_load_si256( state_in + 3 ) ); + E = _mm256_add_epi32( E, _mm256_load_si256( state_in + 4 ) ); + F = _mm256_add_epi32( F, _mm256_load_si256( state_in + 5 ) ); + G = _mm256_add_epi32( G, _mm256_load_si256( state_in + 6 ) ); + H = _mm256_add_epi32( H, _mm256_load_si256( state_in + 7 ) ); + + _mm256_store_si256( state_out , A ); + _mm256_store_si256( state_out + 1, B ); + _mm256_store_si256( state_out + 2, C ); + _mm256_store_si256( state_out + 3, D ); + _mm256_store_si256( state_out + 4, E ); + _mm256_store_si256( state_out + 5, F ); + _mm256_store_si256( state_out + 6, G ); + _mm256_store_si256( state_out + 7, H ); +} - if ( ctx->initialized ) - { - r[0] = _mm256_add_epi32( r[0], A ); - r[1] = _mm256_add_epi32( r[1], B ); - r[2] = _mm256_add_epi32( r[2], C ); - r[3] = _mm256_add_epi32( r[3], D ); - r[4] = _mm256_add_epi32( r[4], E ); - r[5] = _mm256_add_epi32( r[5], F ); - r[6] = _mm256_add_epi32( r[6], G ); - r[7] = _mm256_add_epi32( r[7], H ); - } - else +int sha256_8way_transform_le_short( __m256i *state_out, const __m256i *data, + const __m256i *state_in ) +{ + __m256i A, B, C, D, E, F, G, H; + __m256i W[16]; memcpy_256( W, data, 16 ); + const __m256i H_ = m256_const1_32( 0x136032ED ); + + A = _mm256_load_si256( state_in ); + B = _mm256_load_si256( state_in+1 ); + C = _mm256_load_si256( state_in+2 ); + D = _mm256_load_si256( state_in+3 ); + E = _mm256_load_si256( state_in+4 ); + F = _mm256_load_si256( state_in+5 ); + G = _mm256_load_si256( state_in+6 ); + H = _mm256_load_si256( state_in+7 ); + + SHA256x8_16ROUNDS( A, B, C, D, E, F, G, H, 0 ); + + for ( int j = 16; j < 48; j += 16 ) { - ctx->initialized = true; - r[0] = _mm256_add_epi32( A, m256_const1_64( 0x6A09E6676A09E667 ) ); - r[1] = _mm256_add_epi32( B, m256_const1_64( 0xBB67AE85BB67AE85 ) ); - r[2] = _mm256_add_epi32( C, m256_const1_64( 0x3C6EF3723C6EF372 ) ); - r[3] = _mm256_add_epi32( D, m256_const1_64( 0xA54FF53AA54FF53A ) ); - r[4] = _mm256_add_epi32( E, m256_const1_64( 0x510E527F510E527F ) ); - r[5] = _mm256_add_epi32( F, m256_const1_64( 0x9B05688C9B05688C ) ); - r[6] = _mm256_add_epi32( G, m256_const1_64( 0x1F83D9AB1F83D9AB ) ); - r[7] = _mm256_add_epi32( H, m256_const1_64( 0x5BE0CD195BE0CD19 ) ); + SHA256x8_MSG_EXPANSION( W ); + SHA256x8_16ROUNDS( A, B, C, D, E, F, G, H, j ); } + + W[ 0] = SHA2x_MEXP( W[14], W[ 9], W[ 1], W[ 0] ); + W[ 1] = SHA2x_MEXP( W[15], W[10], W[ 2], W[ 1] ); + W[ 2] = SHA2x_MEXP( W[ 0], W[11], W[ 3], W[ 2] ); + W[ 3] = SHA2x_MEXP( W[ 1], W[12], W[ 4], W[ 3] ); + W[ 4] = SHA2x_MEXP( W[ 2], W[13], W[ 5], W[ 4] ); + W[ 5] = SHA2x_MEXP( W[ 3], W[14], W[ 6], W[ 5] ); + W[ 6] = SHA2x_MEXP( W[ 4], W[15], W[ 7], W[ 6] ); + W[ 7] = SHA2x_MEXP( W[ 5], W[ 0], W[ 8], W[ 7] ); + W[ 8] = SHA2x_MEXP( W[ 6], W[ 1], W[ 9], W[ 8] ); + W[ 9] = SHA2x_MEXP( W[ 7], W[ 2], W[10], W[ 9] ); + W[10] = SHA2x_MEXP( W[ 8], W[ 3], W[11], W[10] ); + W[11] = SHA2x_MEXP( W[ 9], W[ 4], W[12], W[11] ); + W[12] = SHA2x_MEXP( W[10], W[ 5], W[13], W[12] ); + +#if !defined(__AVX512VL__) + __m256i X_xor_Y, Y_xor_Z = _mm256_xor_si256( B, C ); +#endif + + SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H, 0, 48 ); + 
SHA2s_8WAY_STEP( H, A, B, C, D, E, F, G, 1, 48 ); + SHA2s_8WAY_STEP( G, H, A, B, C, D, E, F, 2, 48 ); + SHA2s_8WAY_STEP( F, G, H, A, B, C, D, E, 3, 48 ); + SHA2s_8WAY_STEP( E, F, G, H, A, B, C, D, 4, 48 ); + SHA2s_8WAY_STEP( D, E, F, G, H, A, B, C, 5, 48 ); + SHA2s_8WAY_STEP( C, D, E, F, G, H, A, B, 6, 48 ); + SHA2s_8WAY_STEP( B, C, D, E, F, G, H, A, 7, 48 ); + SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H, 8, 48 ); + + __m256i T1_57 = _mm256_add_epi32( G, + mm256_add4_32( BSG2_1x( D ), CHx( D, E, F ), + _mm256_set1_epi32( K256[57] ), W[ 9] ) ); + C = _mm256_add_epi32( C, T1_57 ); + + __m256i T1_58 = _mm256_add_epi32( F, + mm256_add4_32( BSG2_1x( C ), CHx( C, D, E ), + _mm256_set1_epi32( K256[58] ), W[10] ) ); + B = _mm256_add_epi32( B, T1_58 ); + + __m256i T1_59 = _mm256_add_epi32( E, + mm256_add4_32( BSG2_1x( B ), CHx( B, C, D ), + _mm256_set1_epi32( K256[59] ), W[11] ) ); + A = _mm256_add_epi32( A, T1_59 ); + + __m256i T1_60 = mm256_add4_32( D, BSG2_1x( A ), CHx( A, B, C ), W[12] ); + H = _mm256_add_epi32( H, T1_60 ); + + if ( _mm256_movemask_ps( (__m256)_mm256_cmpeq_epi32( H, H_ ) ) == 0 ) + return 0; + + __m256i K60 = _mm256_set1_epi32( K256[60] ); + H = _mm256_add_epi32( H, K60 ); + + G = _mm256_add_epi32( T1_57, _mm256_add_epi32( BSG2_0x( H ), + MAJx( H, A, B ) ) ); +#if !defined(__AVX512VL__) + Y_xor_Z = X_xor_Y; +#endif + + F = _mm256_add_epi32( T1_58, _mm256_add_epi32( BSG2_0x( G ), + MAJx( G, H, A ) ) ); +#if !defined(__AVX512VL__) + Y_xor_Z = X_xor_Y; +#endif + + E = _mm256_add_epi32( T1_59, _mm256_add_epi32( BSG2_0x( F ), + MAJx( F, G, H ) ) ); +#if !defined(__AVX512VL__) + Y_xor_Z = X_xor_Y; +#endif + + D = mm256_add4_32( T1_60, BSG2_0x( E ), MAJx( E, F, G ), K60 ); +#if !defined(__AVX512VL__) + Y_xor_Z = X_xor_Y; +#endif + + W[13] = SHA2x_MEXP( W[11], W[6], W[14], W[13] ); + W[14] = SHA2x_MEXP( W[12], W[7], W[15], W[14] ); + W[15] = SHA2x_MEXP( W[13], W[8], W[ 0], W[15] ); + + SHA2s_8WAY_STEP( D, E, F, G, H, A, B, C, 13, 48 ); + SHA2s_8WAY_STEP( C, D, E, F, G, H, A, B, 14, 48 ); + SHA2s_8WAY_STEP( B, C, D, E, F, G, H, A, 15, 48 ); + + state_out[0] = _mm256_add_epi32( state_in[0], A ); + state_out[1] = _mm256_add_epi32( state_in[1], B ); + state_out[2] = _mm256_add_epi32( state_in[2], C ); + state_out[3] = _mm256_add_epi32( state_in[3], D ); + state_out[4] = _mm256_add_epi32( state_in[4], E ); + state_out[5] = _mm256_add_epi32( state_in[5], F ); + state_out[6] = _mm256_add_epi32( state_in[6], G ); + state_out[7] = _mm256_add_epi32( state_in[7], H ); + return 1; } void sha256_8way_init( sha256_8way_context *sc ) { - sc->initialized = false; sc->count_high = sc->count_low = 0; -/* - sc->val[0] = _mm256_set1_epi32( H256[0] ); - sc->val[1] = _mm256_set1_epi32( H256[1] ); - sc->val[2] = _mm256_set1_epi32( H256[2] ); - sc->val[3] = _mm256_set1_epi32( H256[3] ); - sc->val[4] = _mm256_set1_epi32( H256[4] ); - sc->val[5] = _mm256_set1_epi32( H256[5] ); - sc->val[6] = _mm256_set1_epi32( H256[6] ); - sc->val[7] = _mm256_set1_epi32( H256[7] ); -*/ + sc->val[0] = m256_const1_64( 0x6A09E6676A09E667 ); + sc->val[1] = m256_const1_64( 0xBB67AE85BB67AE85 ); + sc->val[2] = m256_const1_64( 0x3C6EF3723C6EF372 ); + sc->val[3] = m256_const1_64( 0xA54FF53AA54FF53A ); + sc->val[4] = m256_const1_64( 0x510E527F510E527F ); + sc->val[5] = m256_const1_64( 0x9B05688C9B05688C ); + sc->val[6] = m256_const1_64( 0x1F83D9AB1F83D9AB ); + sc->val[7] = m256_const1_64( 0x5BE0CD195BE0CD19 ); } - // need to handle odd byte length for yespower. // Assume only last update is odd. 
@@ -531,7 +1005,7 @@ void sha256_8way_update( sha256_8way_context *sc, const void *data, size_t len ) len -= clen; if ( ptr == buf_size ) { - sha256_8way_round( sc, sc->buf, sc->val ); + sha256_8way_transform_be( sc->val, sc->buf, sc->val ); ptr = 0; } clow = sc->count_low; @@ -556,7 +1030,7 @@ void sha256_8way_close( sha256_8way_context *sc, void *dst ) if ( ptr > pad ) { memset_zero_256( sc->buf + (ptr>>2), (buf_size - ptr) >> 2 ); - sha256_8way_round( sc, sc->buf, sc->val ); + sha256_8way_transform_be( sc->val, sc->buf, sc->val ); memset_zero_256( sc->buf, pad >> 2 ); } else @@ -566,12 +1040,10 @@ void sha256_8way_close( sha256_8way_context *sc, void *dst ) high = (sc->count_high << 3) | (low >> 29); low = low << 3; - sc->buf[ pad >> 2 ] = - mm256_bswap_32( m256_const1_32( high ) ); - sc->buf[ ( pad+4 ) >> 2 ] = - mm256_bswap_32( m256_const1_32( low ) ); + sc->buf[ pad >> 2 ] = m256_const1_32( bswap_32( high ) ); + sc->buf[ ( pad+4 ) >> 2 ] = m256_const1_32( bswap_32( low ) ); - sha256_8way_round( sc, sc->buf, sc->val ); + sha256_8way_transform_be( sc->val, sc->buf, sc->val ); mm256_block_bswap_32( dst, sc->val ); } @@ -588,32 +1060,61 @@ void sha256_8way_full( void *dst, const void *data, size_t len ) // SHA-256 16 way -#define CHx16(X, Y, Z) \ - _mm512_xor_si512( _mm512_and_si512( _mm512_xor_si512( Y, Z ), X ), Z ) +#define CHx16(X, Y, Z) _mm512_ternarylogic_epi32( X, Y, Z, 0xca ) -#define MAJx16(X, Y, Z) \ - _mm512_or_si512( _mm512_and_si512( X, Y ), \ - _mm512_and_si512( _mm512_or_si512( X, Y ), Z ) ) +#define MAJx16(X, Y, Z) _mm512_ternarylogic_epi32( X, Y, Z, 0xe8 ) -#define BSG2_0x16(x) \ - _mm512_xor_si512( _mm512_xor_si512( \ - mm512_ror_32(x, 2), mm512_ror_32(x, 13) ), mm512_ror_32( x, 22) ) +#define BSG2_0x16(x) mm512_xor3( _mm512_ror_epi32( x, 2 ), \ + _mm512_ror_epi32( x, 13 ), \ + _mm512_ror_epi32( x, 22 ) ) -#define BSG2_1x16(x) \ - _mm512_xor_si512( _mm512_xor_si512( \ - mm512_ror_32(x, 6), mm512_ror_32(x, 11) ), mm512_ror_32( x, 25) ) +#define BSG2_1x16(x) mm512_xor3( _mm512_ror_epi32( x, 6 ), \ + _mm512_ror_epi32( x, 11 ), \ + _mm512_ror_epi32( x, 25 ) ) -#define SSG2_0x16(x) \ - _mm512_xor_si512( _mm512_xor_si512( \ - mm512_ror_32(x, 7), mm512_ror_32(x, 18) ), _mm512_srli_epi32(x, 3) ) +#define SSG2_0x16(x) mm512_xor3( _mm512_ror_epi32( x, 7 ), \ + _mm512_ror_epi32( x, 18 ), \ + _mm512_srli_epi32( x, 3 ) ) -#define SSG2_1x16(x) \ - _mm512_xor_si512( _mm512_xor_si512( \ - mm512_ror_32(x, 17), mm512_ror_32(x, 19) ), _mm512_srli_epi32(x, 10) ) +#define SSG2_1x16(x) mm512_xor3( _mm512_ror_epi32( x, 17 ), \ + _mm512_ror_epi32( x, 19 ), \ + _mm512_srli_epi32( x, 10 ) ) #define SHA2x16_MEXP( a, b, c, d ) \ - mm512_add4_32( SSG2_1x16( W[a] ), W[b], SSG2_0x16( W[c] ), W[d] ); - + mm512_add4_32( SSG2_1x16( a ), b, SSG2_0x16( c ), d ); + +#define SHA256x16_MSG_EXPANSION( W ) \ + W[ 0] = SHA2x16_MEXP( W[14], W[ 9], W[ 1], W[ 0] ); \ + W[ 1] = SHA2x16_MEXP( W[15], W[10], W[ 2], W[ 1] ); \ + W[ 2] = SHA2x16_MEXP( W[ 0], W[11], W[ 3], W[ 2] ); \ + W[ 3] = SHA2x16_MEXP( W[ 1], W[12], W[ 4], W[ 3] ); \ + W[ 4] = SHA2x16_MEXP( W[ 2], W[13], W[ 5], W[ 4] ); \ + W[ 5] = SHA2x16_MEXP( W[ 3], W[14], W[ 6], W[ 5] ); \ + W[ 6] = SHA2x16_MEXP( W[ 4], W[15], W[ 7], W[ 6] ); \ + W[ 7] = SHA2x16_MEXP( W[ 5], W[ 0], W[ 8], W[ 7] ); \ + W[ 8] = SHA2x16_MEXP( W[ 6], W[ 1], W[ 9], W[ 8] ); \ + W[ 9] = SHA2x16_MEXP( W[ 7], W[ 2], W[10], W[ 9] ); \ + W[10] = SHA2x16_MEXP( W[ 8], W[ 3], W[11], W[10] ); \ + W[11] = SHA2x16_MEXP( W[ 9], W[ 4], W[12], W[11] ); \ + W[12] = SHA2x16_MEXP( W[10], W[ 5], 
W[13], W[12] ); \ + W[13] = SHA2x16_MEXP( W[11], W[ 6], W[14], W[13] ); \ + W[14] = SHA2x16_MEXP( W[12], W[ 7], W[15], W[14] ); \ + W[15] = SHA2x16_MEXP( W[13], W[ 8], W[ 0], W[15] ); + +#define SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H, i, j ) \ +do { \ + __m512i T0 = _mm512_add_epi32( _mm512_set1_epi32( K256[(j)+(i)] ), W[i] ); \ + __m512i T1 = BSG2_1x16( E ); \ + __m512i T2 = BSG2_0x16( A ); \ + T0 = _mm512_add_epi32( T0, CHx16( E, F, G ) ); \ + T1 = _mm512_add_epi32( T1, H ); \ + T2 = _mm512_add_epi32( T2, MAJx16( A, B, C ) ); \ + T1 = _mm512_add_epi32( T1, T0 ); \ + D = _mm512_add_epi32( D, T1 ); \ + H = _mm512_add_epi32( T1, T2 ); \ +} while (0) + +/* #define SHA2s_16WAY_STEP(A, B, C, D, E, F, G, H, i, j) \ do { \ __m512i T1, T2; \ @@ -624,42 +1125,147 @@ do { \ D = _mm512_add_epi32( D, T1 ); \ H = _mm512_add_epi32( T1, T2 ); \ } while (0) +*/ -static void -sha256_16way_round( sha256_16way_context *ctx, __m512i *in, __m512i r[8] ) +#define SHA256x16_16ROUNDS( A, B, C, D, E, F, G, H, j ) \ + SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H, 0, j ); \ + SHA2s_16WAY_STEP( H, A, B, C, D, E, F, G, 1, j ); \ + SHA2s_16WAY_STEP( G, H, A, B, C, D, E, F, 2, j ); \ + SHA2s_16WAY_STEP( F, G, H, A, B, C, D, E, 3, j ); \ + SHA2s_16WAY_STEP( E, F, G, H, A, B, C, D, 4, j ); \ + SHA2s_16WAY_STEP( D, E, F, G, H, A, B, C, 5, j ); \ + SHA2s_16WAY_STEP( C, D, E, F, G, H, A, B, 6, j ); \ + SHA2s_16WAY_STEP( B, C, D, E, F, G, H, A, 7, j ); \ + SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H, 8, j ); \ + SHA2s_16WAY_STEP( H, A, B, C, D, E, F, G, 9, j ); \ + SHA2s_16WAY_STEP( G, H, A, B, C, D, E, F, 10, j ); \ + SHA2s_16WAY_STEP( F, G, H, A, B, C, D, E, 11, j ); \ + SHA2s_16WAY_STEP( E, F, G, H, A, B, C, D, 12, j ); \ + SHA2s_16WAY_STEP( D, E, F, G, H, A, B, C, 13, j ); \ + SHA2s_16WAY_STEP( C, D, E, F, G, H, A, B, 14, j ); \ + SHA2s_16WAY_STEP( B, C, D, E, F, G, H, A, 15, j ); + +static inline void SHA256_16WAY_TRANSFORM( __m512i *out, __m512i *W, + const __m512i *in ) \ { - register __m512i A, B, C, D, E, F, G, H; - __m512i W[16]; + __m512i A, B, C, D, E, F, G, H; + A = _mm512_load_si512( in ); + B = _mm512_load_si512( in+1 ); + C = _mm512_load_si512( in+2 ); + D = _mm512_load_si512( in+3 ); + E = _mm512_load_si512( in+4 ); + F = _mm512_load_si512( in+5 ); + G = _mm512_load_si512( in+6 ); + H = _mm512_load_si512( in+7 ); + + SHA256x16_16ROUNDS( A, B, C, D, E, F, G, H, 0 ); + SHA256x16_MSG_EXPANSION( W ); + SHA256x16_16ROUNDS( A, B, C, D, E, F, G, H, 16 ); + SHA256x16_MSG_EXPANSION( W ); + SHA256x16_16ROUNDS( A, B, C, D, E, F, G, H, 32 ); + SHA256x16_MSG_EXPANSION( W ); + SHA256x16_16ROUNDS( A, B, C, D, E, F, G, H, 48 ); + + out[0] = _mm512_add_epi32( in[0], A ); + out[1] = _mm512_add_epi32( in[1], B ); + out[2] = _mm512_add_epi32( in[2], C ); + out[3] = _mm512_add_epi32( in[3], D ); + out[4] = _mm512_add_epi32( in[4], E ); + out[5] = _mm512_add_epi32( in[5], F ); + out[6] = _mm512_add_epi32( in[6], G ); + out[7] = _mm512_add_epi32( in[7], H ); +} - mm512_block_bswap_32( W , in ); - mm512_block_bswap_32( W+8, in+8 ); +// accepts LE input data +void sha256_16way_transform_le( __m512i *state_out, const __m512i *data, + const __m512i *state_in ) +{ + __m512i W[16]; + memcpy_512( W, data, 16 ); + SHA256_16WAY_TRANSFORM( state_out, W, state_in ); +} - if ( ctx->initialized ) - { - A = r[0]; - B = r[1]; - C = r[2]; - D = r[3]; - E = r[4]; - F = r[5]; - G = r[6]; - H = r[7]; - } - else - { - A = m512_const1_64( 0x6A09E6676A09E667 ); - B = m512_const1_64( 0xBB67AE85BB67AE85 ); - C = m512_const1_64( 0x3C6EF3723C6EF372 ); - D = 
m512_const1_64( 0xA54FF53AA54FF53A ); - E = m512_const1_64( 0x510E527F510E527F ); - F = m512_const1_64( 0x9B05688C9B05688C ); - G = m512_const1_64( 0x1F83D9AB1F83D9AB ); - H = m512_const1_64( 0x5BE0CD195BE0CD19 ); - } +// Accepts BE input data, need to bswap +void sha256_16way_transform_be( __m512i *state_out, const __m512i *data, + const __m512i *state_in ) +{ + __m512i W[16]; + mm512_block_bswap_32( W , data ); + mm512_block_bswap_32( W+8, data+8 ); + SHA256_16WAY_TRANSFORM( state_out, W, state_in ); +} + +// Aggressive prehashing, LE byte order +void sha256_16way_prehash_3rounds( __m512i *state_mid, __m512i *X, + const __m512i *W, const __m512i *state_in ) +{ + __m512i A, B, C, D, E, F, G, H; + + // precalculate constant part msg expansion for second iteration. + X[ 0] = SHA2x16_MEXP( W[14], W[ 9], W[ 1], W[ 0] ); + X[ 1] = SHA2x16_MEXP( W[15], W[10], W[ 2], W[ 1] ); + X[ 2] = _mm512_add_epi32( _mm512_add_epi32( SSG2_1x16( X[ 0] ), W[11] ), + W[ 2] ); + X[ 3] = _mm512_add_epi32( _mm512_add_epi32( SSG2_1x16( X[ 1] ), W[12] ), + SSG2_0x16( W[ 4] ) ); + X[ 4] = _mm512_add_epi32( _mm512_add_epi32( W[13], SSG2_0x16( W[ 5] ) ), + W[ 4] ); + X[ 5] = _mm512_add_epi32( _mm512_add_epi32( W[14], SSG2_0x16( W[ 6] ) ), + W[ 5] ); + X [6] = _mm512_add_epi32( _mm512_add_epi32( W[15], SSG2_0x16( W[ 7] ) ), + W[ 6] ); + X[ 7] = _mm512_add_epi32( _mm512_add_epi32( X[ 0], SSG2_0x16( W[ 8] ) ), + W[ 7] ); + X[ 8] = _mm512_add_epi32( _mm512_add_epi32( X[ 1], SSG2_0x16( W[ 9] ) ), + W[ 8] ); + X[ 9] = _mm512_add_epi32( SSG2_0x16( W[10] ), W[ 9] ); + X[10] = _mm512_add_epi32( SSG2_0x16( W[11] ), W[10] ); + X[11] = _mm512_add_epi32( SSG2_0x16( W[12] ), W[11] ); + X[12] = _mm512_add_epi32( SSG2_0x16( W[13] ), W[12] ); + X[13] = _mm512_add_epi32( SSG2_0x16( W[14] ), W[13] ); + X[14] = _mm512_add_epi32( SSG2_0x16( W[15] ), W[14] ); + X[15] = _mm512_add_epi32( SSG2_0x16( X[ 0] ), W[15] ); + + A = _mm512_load_si512( state_in ); + B = _mm512_load_si512( state_in + 1 ); + C = _mm512_load_si512( state_in + 2 ); + D = _mm512_load_si512( state_in + 3 ); + E = _mm512_load_si512( state_in + 4 ); + F = _mm512_load_si512( state_in + 5 ); + G = _mm512_load_si512( state_in + 6 ); + H = _mm512_load_si512( state_in + 7 ); SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H, 0, 0 ); SHA2s_16WAY_STEP( H, A, B, C, D, E, F, G, 1, 0 ); SHA2s_16WAY_STEP( G, H, A, B, C, D, E, F, 2, 0 ); + + _mm512_store_si512( state_mid , A ); + _mm512_store_si512( state_mid + 1, B ); + _mm512_store_si512( state_mid + 2, C ); + _mm512_store_si512( state_mid + 3, D ); + _mm512_store_si512( state_mid + 4, E ); + _mm512_store_si512( state_mid + 5, F ); + _mm512_store_si512( state_mid + 6, G ); + _mm512_store_si512( state_mid + 7, H ); +} + +void sha256_16way_final_rounds( __m512i *state_out, const __m512i *data, + const __m512i *state_in, const __m512i *state_mid, const __m512i *X ) +{ + __m512i A, B, C, D, E, F, G, H; + __m512i W[16]; + + memcpy_512( W, data, 16 ); + + A = _mm512_load_si512( state_mid ); + B = _mm512_load_si512( state_mid + 1 ); + C = _mm512_load_si512( state_mid + 2 ); + D = _mm512_load_si512( state_mid + 3 ); + E = _mm512_load_si512( state_mid + 4 ); + F = _mm512_load_si512( state_mid + 5 ); + G = _mm512_load_si512( state_mid + 6 ); + H = _mm512_load_si512( state_mid + 7 ); + SHA2s_16WAY_STEP( F, G, H, A, B, C, D, E, 3, 0 ); SHA2s_16WAY_STEP( E, F, G, H, A, B, C, D, 4, 0 ); SHA2s_16WAY_STEP( D, E, F, G, H, A, B, C, 5, 0 ); @@ -674,75 +1280,168 @@ sha256_16way_round( sha256_16way_context *ctx, __m512i *in, __m512i r[8] ) SHA2s_16WAY_STEP( C, D, 
E, F, G, H, A, B, 14, 0 ); SHA2s_16WAY_STEP( B, C, D, E, F, G, H, A, 15, 0 ); - for ( int j = 16; j < 64; j += 16 ) - { - W[ 0] = SHA2x16_MEXP( 14, 9, 1, 0 ); - W[ 1] = SHA2x16_MEXP( 15, 10, 2, 1 ); - W[ 2] = SHA2x16_MEXP( 0, 11, 3, 2 ); - W[ 3] = SHA2x16_MEXP( 1, 12, 4, 3 ); - W[ 4] = SHA2x16_MEXP( 2, 13, 5, 4 ); - W[ 5] = SHA2x16_MEXP( 3, 14, 6, 5 ); - W[ 6] = SHA2x16_MEXP( 4, 15, 7, 6 ); - W[ 7] = SHA2x16_MEXP( 5, 0, 8, 7 ); - W[ 8] = SHA2x16_MEXP( 6, 1, 9, 8 ); - W[ 9] = SHA2x16_MEXP( 7, 2, 10, 9 ); - W[10] = SHA2x16_MEXP( 8, 3, 11, 10 ); - W[11] = SHA2x16_MEXP( 9, 4, 12, 11 ); - W[12] = SHA2x16_MEXP( 10, 5, 13, 12 ); - W[13] = SHA2x16_MEXP( 11, 6, 14, 13 ); - W[14] = SHA2x16_MEXP( 12, 7, 15, 14 ); - W[15] = SHA2x16_MEXP( 13, 8, 0, 15 ); - - SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H, 0, j ); - SHA2s_16WAY_STEP( H, A, B, C, D, E, F, G, 1, j ); - SHA2s_16WAY_STEP( G, H, A, B, C, D, E, F, 2, j ); - SHA2s_16WAY_STEP( F, G, H, A, B, C, D, E, 3, j ); - SHA2s_16WAY_STEP( E, F, G, H, A, B, C, D, 4, j ); - SHA2s_16WAY_STEP( D, E, F, G, H, A, B, C, 5, j ); - SHA2s_16WAY_STEP( C, D, E, F, G, H, A, B, 6, j ); - SHA2s_16WAY_STEP( B, C, D, E, F, G, H, A, 7, j ); - SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H, 8, j ); - SHA2s_16WAY_STEP( H, A, B, C, D, E, F, G, 9, j ); - SHA2s_16WAY_STEP( G, H, A, B, C, D, E, F, 10, j ); - SHA2s_16WAY_STEP( F, G, H, A, B, C, D, E, 11, j ); - SHA2s_16WAY_STEP( E, F, G, H, A, B, C, D, 12, j ); - SHA2s_16WAY_STEP( D, E, F, G, H, A, B, C, 13, j ); - SHA2s_16WAY_STEP( C, D, E, F, G, H, A, B, 14, j ); - SHA2s_16WAY_STEP( B, C, D, E, F, G, H, A, 15, j ); - } - - if ( ctx->initialized ) - { - r[0] = _mm512_add_epi32( r[0], A ); - r[1] = _mm512_add_epi32( r[1], B ); - r[2] = _mm512_add_epi32( r[2], C ); - r[3] = _mm512_add_epi32( r[3], D ); - r[4] = _mm512_add_epi32( r[4], E ); - r[5] = _mm512_add_epi32( r[5], F ); - r[6] = _mm512_add_epi32( r[6], G ); - r[7] = _mm512_add_epi32( r[7], H ); - } - else - { - ctx->initialized = true; - r[0] = _mm512_add_epi32( A, m512_const1_64( 0x6A09E6676A09E667 ) ); - r[1] = _mm512_add_epi32( B, m512_const1_64( 0xBB67AE85BB67AE85 ) ); - r[2] = _mm512_add_epi32( C, m512_const1_64( 0x3C6EF3723C6EF372 ) ); - r[3] = _mm512_add_epi32( D, m512_const1_64( 0xA54FF53AA54FF53A ) ); - r[4] = _mm512_add_epi32( E, m512_const1_64( 0x510E527F510E527F ) ); - r[5] = _mm512_add_epi32( F, m512_const1_64( 0x9B05688C9B05688C ) ); - r[6] = _mm512_add_epi32( G, m512_const1_64( 0x1F83D9AB1F83D9AB ) ); - r[7] = _mm512_add_epi32( H, m512_const1_64( 0x5BE0CD195BE0CD19 ) ); - } + // update precalculated msg expansion with new nonce: W[3]. 
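+   // Same scheme as the narrower versions: only the terms that depend on the
+   // new nonce word W[3] are folded into the precalculated X[] values.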
+ W[ 0] = X[ 0]; + W[ 1] = X[ 1]; + W[ 2] = _mm512_add_epi32( X[ 2], SSG2_0x16( W[ 3] ) ); + W[ 3] = _mm512_add_epi32( X[ 3], W[ 3] ); + W[ 4] = _mm512_add_epi32( X[ 4], SSG2_1x16( W[ 2] ) ); + W[ 5] = _mm512_add_epi32( X[ 5], SSG2_1x16( W[ 3] ) ); + W[ 6] = _mm512_add_epi32( X[ 6], SSG2_1x16( W[ 4] ) ); + W[ 7] = _mm512_add_epi32( X[ 7], SSG2_1x16( W[ 5] ) ); + W[ 8] = _mm512_add_epi32( X[ 8], SSG2_1x16( W[ 6] ) ); + W[ 9] = _mm512_add_epi32( X[ 9], _mm512_add_epi32( SSG2_1x16( W[ 7] ), + W[ 2] ) ); + W[10] = _mm512_add_epi32( X[10], _mm512_add_epi32( SSG2_1x16( W[ 8] ), + W[ 3] ) ); + W[11] = _mm512_add_epi32( X[11], _mm512_add_epi32( SSG2_1x16( W[ 9] ), + W[ 4] ) ); + W[12] = _mm512_add_epi32( X[12], _mm512_add_epi32( SSG2_1x16( W[10] ), + W[ 5] ) ); + W[13] = _mm512_add_epi32( X[13], _mm512_add_epi32( SSG2_1x16( W[11] ), + W[ 6] ) ); + W[14] = _mm512_add_epi32( X[14], _mm512_add_epi32( SSG2_1x16( W[12] ), + W[ 7] ) ); + W[15] = _mm512_add_epi32( X[15], _mm512_add_epi32( SSG2_1x16( W[13] ), + W[ 8] ) ); + + SHA256x16_16ROUNDS( A, B, C, D, E, F, G, H, 16 ); + SHA256x16_MSG_EXPANSION( W ); + SHA256x16_16ROUNDS( A, B, C, D, E, F, G, H, 32 ); + SHA256x16_MSG_EXPANSION( W ); + SHA256x16_16ROUNDS( A, B, C, D, E, F, G, H, 48 ); + + A = _mm512_add_epi32( A, _mm512_load_si512( state_in ) ); + B = _mm512_add_epi32( B, _mm512_load_si512( state_in + 1 ) ); + C = _mm512_add_epi32( C, _mm512_load_si512( state_in + 2 ) ); + D = _mm512_add_epi32( D, _mm512_load_si512( state_in + 3 ) ); + E = _mm512_add_epi32( E, _mm512_load_si512( state_in + 4 ) ); + F = _mm512_add_epi32( F, _mm512_load_si512( state_in + 5 ) ); + G = _mm512_add_epi32( G, _mm512_load_si512( state_in + 6 ) ); + H = _mm512_add_epi32( H, _mm512_load_si512( state_in + 7 ) ); + + _mm512_store_si512( state_out , A ); + _mm512_store_si512( state_out + 1, B ); + _mm512_store_si512( state_out + 2, C ); + _mm512_store_si512( state_out + 3, D ); + _mm512_store_si512( state_out + 4, E ); + _mm512_store_si512( state_out + 5, F ); + _mm512_store_si512( state_out + 6, G ); + _mm512_store_si512( state_out + 7, H ); } +// returns 0 if hash aborted early and invalid. +int sha256_16way_transform_le_short( __m512i *state_out, const __m512i *data, + const __m512i *state_in ) +{ + __m512i A, B, C, D, E, F, G, H; + __m512i W[16]; memcpy_512( W, data, 16 ); + // Value for H at round 60, before adding K, to produce valid final hash + //where H == 0. 
+ // H_ = -( H256[7] + K256[60] ); + const __m512i H_ = m512_const1_32( 0x136032ED ); + + A = _mm512_load_si512( state_in ); + B = _mm512_load_si512( state_in+1 ); + C = _mm512_load_si512( state_in+2 ); + D = _mm512_load_si512( state_in+3 ); + E = _mm512_load_si512( state_in+4 ); + F = _mm512_load_si512( state_in+5 ); + G = _mm512_load_si512( state_in+6 ); + H = _mm512_load_si512( state_in+7 ); + + SHA256x16_16ROUNDS( A, B, C, D, E, F, G, H, 0 ); + SHA256x16_MSG_EXPANSION( W ); + SHA256x16_16ROUNDS( A, B, C, D, E, F, G, H, 16 ); + SHA256x16_MSG_EXPANSION( W ); + SHA256x16_16ROUNDS( A, B, C, D, E, F, G, H, 32 ); + + W[ 0] = SHA2x16_MEXP( W[14], W[ 9], W[ 1], W[ 0] ); + W[ 1] = SHA2x16_MEXP( W[15], W[10], W[ 2], W[ 1] ); + W[ 2] = SHA2x16_MEXP( W[ 0], W[11], W[ 3], W[ 2] ); + W[ 3] = SHA2x16_MEXP( W[ 1], W[12], W[ 4], W[ 3] ); + W[ 4] = SHA2x16_MEXP( W[ 2], W[13], W[ 5], W[ 4] ); + W[ 5] = SHA2x16_MEXP( W[ 3], W[14], W[ 6], W[ 5] ); + W[ 6] = SHA2x16_MEXP( W[ 4], W[15], W[ 7], W[ 6] ); + W[ 7] = SHA2x16_MEXP( W[ 5], W[ 0], W[ 8], W[ 7] ); + W[ 8] = SHA2x16_MEXP( W[ 6], W[ 1], W[ 9], W[ 8] ); + W[ 9] = SHA2x16_MEXP( W[ 7], W[ 2], W[10], W[ 9] ); + W[10] = SHA2x16_MEXP( W[ 8], W[ 3], W[11], W[10] ); + W[11] = SHA2x16_MEXP( W[ 9], W[ 4], W[12], W[11] ); + W[12] = SHA2x16_MEXP( W[10], W[ 5], W[13], W[12] ); + + // Rounds 48 to 56 + SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H, 0, 48 ); + SHA2s_16WAY_STEP( H, A, B, C, D, E, F, G, 1, 48 ); + SHA2s_16WAY_STEP( G, H, A, B, C, D, E, F, 2, 48 ); + SHA2s_16WAY_STEP( F, G, H, A, B, C, D, E, 3, 48 ); + SHA2s_16WAY_STEP( E, F, G, H, A, B, C, D, 4, 48 ); + SHA2s_16WAY_STEP( D, E, F, G, H, A, B, C, 5, 48 ); + SHA2s_16WAY_STEP( C, D, E, F, G, H, A, B, 6, 48 ); + SHA2s_16WAY_STEP( B, C, D, E, F, G, H, A, 7, 48 ); + SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H, 8, 48 ); + + // Rounds 57 to 60 part 1 + __m512i T1_57 = _mm512_add_epi32( _mm512_set1_epi32( K256[57] ), + mm512_add4_32( BSG2_1x16( D ), CHx16( D, E, F ), W[ 9], G ) ); + C = _mm512_add_epi32( C, T1_57 ); + __m512i T1_58 = _mm512_add_epi32( _mm512_set1_epi32( K256[58] ), + mm512_add4_32( BSG2_1x16( C ), CHx16( C, D, E ), W[10], F ) ); + B = _mm512_add_epi32( B, T1_58 ); + __m512i T1_59 = _mm512_add_epi32( _mm512_set1_epi32( K256[59] ), + mm512_add4_32( BSG2_1x16( B ), CHx16( B, C, D ), W[11], E ) ); + A = _mm512_add_epi32( A, T1_59 ); + __m512i T1_60 = mm512_add4_32( BSG2_1x16( A ), CHx16( A, B, C ), W[12], D ); + H = _mm512_add_epi32( H, T1_60 ); + + // give up? 
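+   // K256[60] = 0x90BEFFFA and H256[7] = 0x5BE0CD19, so
+   // H_ = -( 0x5BE0CD19 + 0x90BEFFFA ) mod 2^32 = 0x136032ED.
+   // If no lane holds this value at round 60 the last word of the final hash
+   // cannot be zero in any lane, so the remaining rounds are skipped.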
+ if ( _mm512_cmpeq_epi32_mask( H, H_ ) == 0 ) return 0; + + // Rounds 57 to 60 part 2 + __m512i K60 = _mm512_set1_epi32( K256[60] ); + H = _mm512_add_epi32( H, K60 ); + + G = _mm512_add_epi32( T1_57, _mm512_add_epi32( BSG2_0x16( H ), + MAJx16( H, A, B ) ) ); + F = _mm512_add_epi32( T1_58, _mm512_add_epi32( BSG2_0x16( G ), + MAJx16( G, H, A ) ) ); + E = _mm512_add_epi32( T1_59, _mm512_add_epi32( BSG2_0x16( F ), + MAJx16( F, G, H ) ) ); + D = mm512_add4_32( T1_60, BSG2_0x16( E ), MAJx16( E, F, G ), K60 ); + + // Rounds 61 to 63 + W[13] = SHA2x16_MEXP( W[11], W[ 6], W[14], W[13] ); + W[14] = SHA2x16_MEXP( W[12], W[ 7], W[15], W[14] ); + W[15] = SHA2x16_MEXP( W[13], W[ 8], W[ 0], W[15] ); + + SHA2s_16WAY_STEP( D, E, F, G, H, A, B, C, 13, 48 ); + SHA2s_16WAY_STEP( C, D, E, F, G, H, A, B, 14, 48 ); + SHA2s_16WAY_STEP( B, C, D, E, F, G, H, A, 15, 48 ); + + state_out[0] = _mm512_add_epi32( state_in[0], A ); + state_out[1] = _mm512_add_epi32( state_in[1], B ); + state_out[2] = _mm512_add_epi32( state_in[2], C ); + state_out[3] = _mm512_add_epi32( state_in[3], D ); + state_out[4] = _mm512_add_epi32( state_in[4], E ); + state_out[5] = _mm512_add_epi32( state_in[5], F ); + state_out[6] = _mm512_add_epi32( state_in[6], G ); + state_out[7] = _mm512_add_epi32( state_in[7], H ); + return 1; +} + void sha256_16way_init( sha256_16way_context *sc ) { - sc->initialized = false; sc->count_high = sc->count_low = 0; + sc->val[0] = m512_const1_64( 0x6A09E6676A09E667 ); + sc->val[1] = m512_const1_64( 0xBB67AE85BB67AE85 ); + sc->val[2] = m512_const1_64( 0x3C6EF3723C6EF372 ); + sc->val[3] = m512_const1_64( 0xA54FF53AA54FF53A ); + sc->val[4] = m512_const1_64( 0x510E527F510E527F ); + sc->val[5] = m512_const1_64( 0x9B05688C9B05688C ); + sc->val[6] = m512_const1_64( 0x1F83D9AB1F83D9AB ); + sc->val[7] = m512_const1_64( 0x5BE0CD195BE0CD19 ); } - void sha256_16way_update( sha256_16way_context *sc, const void *data, size_t len ) { @@ -765,7 +1464,7 @@ void sha256_16way_update( sha256_16way_context *sc, const void *data, len -= clen; if ( ptr == buf_size ) { - sha256_16way_round( sc, sc->buf, sc->val ); + sha256_16way_transform_be( sc->val, sc->buf, sc->val ); ptr = 0; } clow = sc->count_low; @@ -790,7 +1489,7 @@ void sha256_16way_close( sha256_16way_context *sc, void *dst ) if ( ptr > pad ) { memset_zero_512( sc->buf + (ptr>>2), (buf_size - ptr) >> 2 ); - sha256_16way_round( sc, sc->buf, sc->val ); + sha256_16way_transform_be( sc->val, sc->buf, sc->val ); memset_zero_512( sc->buf, pad >> 2 ); } else @@ -800,12 +1499,10 @@ void sha256_16way_close( sha256_16way_context *sc, void *dst ) high = (sc->count_high << 3) | (low >> 29); low = low << 3; - sc->buf[ pad >> 2 ] = - mm512_bswap_32( m512_const1_32( high ) ); - sc->buf[ ( pad+4 ) >> 2 ] = - mm512_bswap_32( m512_const1_32( low ) ); + sc->buf[ pad >> 2 ] = m512_const1_32( bswap_32( high ) ); + sc->buf[ ( pad+4 ) >> 2 ] = m512_const1_32( bswap_32( low ) ); - sha256_16way_round( sc, sc->buf, sc->val ); + sha256_16way_transform_be( sc->val, sc->buf, sc->val ); mm512_block_bswap_32( dst, sc->val ); } diff --git a/algo/sha/sha256-hash-opt.c b/algo/sha/sha256-hash-opt.c index fb049b1f..e08dd60b 100644 --- a/algo/sha/sha256-hash-opt.c +++ b/algo/sha/sha256-hash-opt.c @@ -3,13 +3,203 @@ /* Based on code from Intel, and by Sean Gulley for */ /* the miTLS project. */ -// A drop in replacement for the function of the same name in sph_sha2.c. +// A stripped down version with byte swapping removed. 
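+// sha256_opt_transform_le uses the SHA-NI intrinsics ( _mm_sha256rnds2_epu32,
+// _mm_sha256msg1_epu32, _mm_sha256msg2_epu32 ) on little endian input, so the
+// byte-swap shuffle mask is left commented out.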
#if defined(__SHA__) -#include "simd-utils.h" +#include "sha256-hash.h" -static void sha2_round( const uint8_t input[], uint32_t state[8] ) +void sha256_opt_transform_le( uint32_t *state_out, const void *input, + const uint32_t *state_in ) +{ + __m128i STATE0, STATE1; + __m128i MSG, TMP; + __m128i TMSG0, TMSG1, TMSG2, TMSG3; + __m128i ABEF_SAVE, CDGH_SAVE; + + // Load initial values + TMP = _mm_load_si128((__m128i*) &state_in[0]); + STATE1 = _mm_load_si128((__m128i*) &state_in[4]); +// MASK = _mm_set_epi64x(0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL); + + TMP = _mm_shuffle_epi32(TMP, 0xB1); // CDAB + STATE1 = _mm_shuffle_epi32(STATE1, 0x1B); // EFGH + STATE0 = _mm_alignr_epi8(TMP, STATE1, 8); // ABEF + STATE1 = _mm_blend_epi16(STATE1, TMP, 0xF0); // CDGH + + // Save current hash + ABEF_SAVE = STATE0; + CDGH_SAVE = STATE1; + + // Rounds 0-3 + TMSG0 = _mm_load_si128((const __m128i*) (input+0)); +// TMSG0 = _mm_shuffle_epi8(MSG, MASK); + MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(0xE9B5DBA5B5C0FBCFULL, 0x71374491428A2F98ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + + // Rounds 4-7 + TMSG1 = _mm_load_si128((const __m128i*) (input+16)); +// TMSG1 = _mm_shuffle_epi8(TMSG1, MASK); + MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(0xAB1C5ED5923F82A4ULL, 0x59F111F13956C25BULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1); + + // Rounds 8-11 + TMSG2 = _mm_load_si128((const __m128i*) (input+32)); +// TMSG2 = _mm_shuffle_epi8(TMSG2, MASK); + MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(0x550C7DC3243185BEULL, 0x12835B01D807AA98ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2); + + // Rounds 12-15 + TMSG3 = _mm_load_si128((const __m128i*) (input+48)); +// TMSG3 = _mm_shuffle_epi8(TMSG3, MASK); + MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(0xC19BF1749BDC06A7ULL, 0x80DEB1FE72BE5D74ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4); + TMSG0 = _mm_add_epi32(TMSG0, TMP); + TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3); + + // Rounds 16-19 + MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(0x240CA1CC0FC19DC6ULL, 0xEFBE4786E49B69C1ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4); + TMSG1 = _mm_add_epi32(TMSG1, TMP); + TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0); + + // Rounds 20-23 + MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(0x76F988DA5CB0A9DCULL, 0x4A7484AA2DE92C6FULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4); + TMSG2 = _mm_add_epi32(TMSG2, TMP); + TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1); + + // Rounds 24-27 + MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(0xBF597FC7B00327C8ULL, 0xA831C66D983E5152ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + 
TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4); + TMSG3 = _mm_add_epi32(TMSG3, TMP); + TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2); + + // Rounds 28-31 + MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(0x1429296706CA6351ULL, 0xD5A79147C6E00BF3ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4); + TMSG0 = _mm_add_epi32(TMSG0, TMP); + TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3); + + // Rounds 32-35 + MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(0x53380D134D2C6DFCULL, 0x2E1B213827B70A85ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4); + TMSG1 = _mm_add_epi32(TMSG1, TMP); + TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0); + + // Rounds 36-39 + MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(0x92722C8581C2C92EULL, 0x766A0ABB650A7354ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4); + TMSG2 = _mm_add_epi32(TMSG2, TMP); + TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1); + + // Rounds 40-43 + MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(0xC76C51A3C24B8B70ULL, 0xA81A664BA2BFE8A1ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4); + TMSG3 = _mm_add_epi32(TMSG3, TMP); + TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2); + + // Rounds 44-47 + MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(0x106AA070F40E3585ULL, 0xD6990624D192E819ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4); + TMSG0 = _mm_add_epi32(TMSG0, TMP); + TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3); + + // Rounds 48-51 + MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(0x34B0BCB52748774CULL, 0x1E376C0819A4C116ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4); + TMSG1 = _mm_add_epi32(TMSG1, TMP); + TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0); + + // Rounds 52-55 + MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(0x682E6FF35B9CCA4FULL, 0x4ED8AA4A391C0CB3ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4); + TMSG2 = _mm_add_epi32(TMSG2, TMP); + TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + + // Rounds 56-59 + MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(0x8CC7020884C87814ULL, 0x78A5636F748F82EEULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4); + TMSG3 = _mm_add_epi32(TMSG3, TMP); + TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2); + MSG = 
_mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + + // Rounds 60-63 + MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(0xC67178F2BEF9A3F7ULL, 0xA4506CEB90BEFFFAULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + + // Add values back to state + STATE0 = _mm_add_epi32(STATE0, ABEF_SAVE); + STATE1 = _mm_add_epi32(STATE1, CDGH_SAVE); + + TMP = _mm_shuffle_epi32(STATE0, 0x1B); // FEBA + STATE1 = _mm_shuffle_epi32(STATE1, 0xB1); // DCHG + STATE0 = _mm_blend_epi16(TMP, STATE1, 0xF0); // DCBA + STATE1 = _mm_alignr_epi8(STATE1, TMP, 8); // ABEF + + // Save state + _mm_store_si128((__m128i*) &state_out[0], STATE0); + _mm_store_si128((__m128i*) &state_out[4], STATE1); +} + + +void sha256_opt_transform_be( uint32_t *state_out, const void *input, + const uint32_t *state_in ) { __m128i STATE0, STATE1; __m128i MSG, TMP, MASK; @@ -17,8 +207,8 @@ static void sha2_round( const uint8_t input[], uint32_t state[8] ) __m128i ABEF_SAVE, CDGH_SAVE; // Load initial values - TMP = _mm_load_si128((__m128i*) &state[0]); - STATE1 = _mm_load_si128((__m128i*) &state[4]); + TMP = _mm_load_si128((__m128i*) &state_in[0]); + STATE1 = _mm_load_si128((__m128i*) &state_in[4]); MASK = _mm_set_epi64x(0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL); TMP = _mm_shuffle_epi32(TMP, 0xB1); // CDAB @@ -31,8 +221,8 @@ static void sha2_round( const uint8_t input[], uint32_t state[8] ) CDGH_SAVE = STATE1; // Rounds 0-3 - MSG = _mm_load_si128((const __m128i*) (input+0)); - TMSG0 = _mm_shuffle_epi8(MSG, MASK); + TMSG0 = _mm_load_si128((const __m128i*) (input+0)); + TMSG0 = _mm_shuffle_epi8( TMSG0, MASK ); MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(0xE9B5DBA5B5C0FBCFULL, 0x71374491428A2F98ULL)); STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); MSG = _mm_shuffle_epi32(MSG, 0x0E); @@ -46,7 +236,6 @@ static void sha2_round( const uint8_t input[], uint32_t state[8] ) MSG = _mm_shuffle_epi32(MSG, 0x0E); STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1); - // Rounds 8-11 TMSG2 = _mm_load_si128((const __m128i*) (input+32)); TMSG2 = _mm_shuffle_epi8(TMSG2, MASK); @@ -192,9 +381,8 @@ static void sha2_round( const uint8_t input[], uint32_t state[8] ) STATE1 = _mm_alignr_epi8(STATE1, TMP, 8); // ABEF // Save state - _mm_store_si128((__m128i*) &state[0], STATE0); - _mm_store_si128((__m128i*) &state[4], STATE1); + _mm_store_si128((__m128i*) &state_out[0], STATE0); + _mm_store_si128((__m128i*) &state_out[4], STATE1); } - #endif diff --git a/algo/sha/sha256-hash.c b/algo/sha/sha256-hash.c new file mode 100644 index 00000000..ddbaacc9 --- /dev/null +++ b/algo/sha/sha256-hash.c @@ -0,0 +1,142 @@ +#include "sha256-hash.h" + +static const uint32_t SHA256_IV[8] = +{ + 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A, + 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19 +}; + +/* +static const uint8_t SHA256_PAD[64] = +{ + 0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +}; +*/ + +void sha256_ctx_init( sha256_context *ctx ) +{ + memcpy( ctx->state, SHA256_IV, sizeof SHA256_IV ); + ctx->count = 0; +} + +void sha256_update( sha256_context *ctx, const void *data, size_t len ) +{ + int ptr = ctx->count & 0x3f; + const uint8_t *src = data; + + ctx->count += (uint64_t)len; + + if ( len < 64 - ptr ) + { + memcpy( ctx->buf + ptr, src, len ); + 
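+      // Not enough data to complete a block; it stays buffered for the next
+      // update or for sha256_final.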
return; + } + + memcpy( ctx->buf + ptr, src, 64 - ptr ); + sha256_transform_be( ctx->state, (uint32_t*)ctx->buf, ctx->state ); + src += 64 - ptr; + len -= 64 - ptr; + + while ( len >= 64 ) + { + sha256_transform_be( ctx->state, (uint32_t*)src, ctx->state ); + src += 64; + len -= 64; + } + + memcpy( ctx->buf, src, len ); +} + +#if 0 +void sha256_final( sha256_context *ctx, uint32_t *hash ) +{ + size_t r; + + + /* Figure out how many bytes we have buffered. */ + r = ctx->count & 0x3f; +// r = ( ctx->count >> 3 ) & 0x3f; + +//printf("final: count= %d, r= %d\n", ctx->count, r ); + + /* Pad to 56 mod 64, transforming if we finish a block en route. */ + if ( r < 56 ) + { + /* Pad to 56 mod 64. */ + memcpy( &ctx->buf[r], SHA256_PAD, 56 - r ); + } + else + { + /* Finish the current block and mix. */ + memcpy( &ctx->buf[r], SHA256_PAD, 64 - r ); + sha256_transform_be( ctx->state, (uint32_t*)ctx->buf, ctx->state ); + +// SHA256_Transform(ctx->state, ctx->buf, &tmp32[0], &tmp32[64]); + + /* The start of the final block is all zeroes. */ + memset( &ctx->buf[0], 0, 56 ); + } + + /* Add the terminating bit-count. */ + ctx->buf[56] = bswap_64( ctx->count << 3 ); +// ctx->buf[56] = bswap_64( ctx->count ); +// be64enc( &ctx->buf[56], ctx->count ); + + /* Mix in the final block. */ + sha256_transform_be( ctx->state, (uint32_t*)ctx->buf, ctx->state ); + +// SHA256_Transform(ctx->state, ctx->buf, &tmp32[0], &tmp32[64]); + + for ( int i = 0; i < 8; i++ ) hash[i] = bswap_32( ctx->state[i] ); + +// for ( int i = 0; i < 8; i++ ) be32enc( hash + 4*i, ctx->state + i ); + +/* +// be32enc_vect(digest, ctx->state, 4); +// be32enc_vect(uint8_t * dst, const uint32_t * src, size_t len) + // Encode vector, two words at a time. + do { + be32enc(&dst[0], src[0]); + be32enc(&dst[4], src[1]); + src += 2; + dst += 8; + } while (--len); +*/ + +} +#endif + +void sha256_final( sha256_context *ctx, void *hash ) +{ + int ptr = ctx->count & 0x3f; + + ctx->buf[ ptr++ ] = 0x80; + + if ( ptr > 56 ) + { + memset( ctx->buf + ptr, 0, 64 - ptr ); + sha256_transform_be( ctx->state, (uint32_t*)ctx->buf, ctx->state ); + memset( ctx->buf, 0, 56 ); + } + else + memset( ctx->buf + ptr, 0, 56 - ptr ); + + *(uint64_t*)(&ctx->buf[56]) = bswap_64( ctx->count << 3 ); + + sha256_transform_be( ctx->state, (uint32_t*)ctx->buf, ctx->state ); + + for ( int i = 0; i < 8; i++ ) + ( (uint32_t*)hash )[i] = bswap_32( ctx->state[i] ); +} + +void sha256_full( void *hash, const void *data, size_t len ) +{ + sha256_context ctx; + sha256_ctx_init( &ctx ); + sha256_update( &ctx, data, len ); + sha256_final( &ctx, hash ); +} + diff --git a/algo/sha/sha256-hash.h b/algo/sha/sha256-hash.h new file mode 100644 index 00000000..410ca90f --- /dev/null +++ b/algo/sha/sha256-hash.h @@ -0,0 +1,60 @@ +#ifndef SHA256_HASH_H__ +#define SHA256_HASH_H__ 1 + +#include +#include "simd-utils.h" +#include "cpuminer-config.h" +#include "sph_sha2.h" + + +// generic interface + +typedef struct { + unsigned char buf[64]; /* first field, for alignment */ + uint32_t state[8]; + uint64_t count; +} sha256_context __attribute__((aligned(64))); + +void sha256_full( void *hash, const void *data, size_t len ); +void sha256_update( sha256_context *ctx, const void *data, size_t len ); +void sha256_final( sha256_context *ctx, void *hash ); +void sha256_ctx_init( sha256_context *ctx ); +void sha256_transform_le( uint32_t *state_out, const uint32_t *data, + const uint32_t *state_in ); +void sha256_transform_be( uint32_t *state_out, const uint32_t *data, + const uint32_t *state_in ); + +#if 
defined(__SHA__) + +void sha256_opt_transform_le( uint32_t *state_out, const void *input, + const uint32_t *state_in ); + +void sha256_opt_transform_be( uint32_t *state_out, const void *input, + const uint32_t *state_in ); + +// 2 way with interleaved instructions +void sha256_ni2way_transform_le( uint32_t *out_X, uint32_t*out_Y, + const void *msg_X, const void *msg_Y, + const uint32_t *in_X, const uint32_t *in_Y ); + +void sha256_ni2way_transform_be( uint32_t *out_X, uint32_t*out_Y, + const void *msg_X, const void *msg_Y, + const uint32_t *in_X, const uint32_t *in_Y ); + +// Select target +// with SHA... +#define sha256_transform_le sha256_opt_transform_le +#define sha256_transform_be sha256_opt_transform_be + +#else + +// without SHA... +#define sha256_transform_le sph_sha256_transform_le +#define sha256_transform_be sph_sha256_transform_be + +#endif + +// SHA can't do only 3 rounds +#define sha256_prehash_3rounds sph_sha256_prehash_3rounds + +#endif diff --git a/algo/sha/sha256d-4way.c b/algo/sha/sha256d-4way.c new file mode 100644 index 00000000..18eceffe --- /dev/null +++ b/algo/sha/sha256d-4way.c @@ -0,0 +1,288 @@ +#include "sha256d-4way.h" +#include +#include +#include +#include +#include "sha-hash-4way.h" + +#if defined(SHA256D_16WAY) + +int scanhash_sha256d_16way( struct work *work, const uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ) +{ + __m512i vdata[32] __attribute__ ((aligned (128))); + __m512i block[16] __attribute__ ((aligned (64))); + __m512i hash32[8] __attribute__ ((aligned (64))); + __m512i initstate[8] __attribute__ ((aligned (64))); + __m512i midstate1[8] __attribute__ ((aligned (64))); + __m512i midstate2[8] __attribute__ ((aligned (64))); + __m512i mexp_pre[16] __attribute__ ((aligned (64))); + uint32_t lane_hash[8] __attribute__ ((aligned (64))); + uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] ); + uint32_t *pdata = work->data; + const uint32_t *ptarget = work->target; + const uint32_t targ32_d7 = ptarget[7]; + const uint32_t first_nonce = pdata[19]; + const uint32_t last_nonce = max_nonce - 16; + uint32_t n = first_nonce; + __m512i *noncev = vdata + 19; + const int thr_id = mythr->id; + const bool bench = opt_benchmark; + const __m512i last_byte = m512_const1_32( 0x80000000 ); + const __m512i sixteen = m512_const1_32( 16 ); + + for ( int i = 0; i < 19; i++ ) + vdata[i] = m512_const1_32( pdata[i] ); + + *noncev = _mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+9, n+8, + n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+1, n ); + + vdata[16+4] = last_byte; + memset_zero_512( vdata+16 + 5, 10 ); + vdata[16+15] = m512_const1_32( 80*8 ); // bit count + + block[ 8] = last_byte; + memset_zero_512( block + 9, 6 ); + block[15] = m512_const1_32( 32*8 ); // bit count + + // initialize state + initstate[0] = m512_const1_64( 0x6A09E6676A09E667 ); + initstate[1] = m512_const1_64( 0xBB67AE85BB67AE85 ); + initstate[2] = m512_const1_64( 0x3C6EF3723C6EF372 ); + initstate[3] = m512_const1_64( 0xA54FF53AA54FF53A ); + initstate[4] = m512_const1_64( 0x510E527F510E527F ); + initstate[5] = m512_const1_64( 0x9B05688C9B05688C ); + initstate[6] = m512_const1_64( 0x1F83D9AB1F83D9AB ); + initstate[7] = m512_const1_64( 0x5BE0CD195BE0CD19 ); + + sha256_16way_transform_le( midstate1, vdata, initstate ); + + // Do 3 rounds on the first 12 bytes of the next block + sha256_16way_prehash_3rounds( midstate2, mexp_pre, vdata+16, midstate1 ); + + do + { + // 1. 
final 16 bytes of data, with padding + sha256_16way_final_rounds( block, vdata+16, midstate1, midstate2, + mexp_pre ); + + // 2. 32 byte hash from 1. + if ( sha256_16way_transform_le_short( hash32, block, initstate ) ) + { + // byte swap final hash for testing + mm512_block_bswap_32( hash32, hash32 ); + + for ( int lane = 0; lane < 16; lane++ ) + if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) ) + { + extr_lane_16x32( lane_hash, hash32, lane, 256 ); + if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) ) + { + pdata[19] = n + lane; + submit_solution( work, lane_hash, mythr ); + } + } + } + *noncev = _mm512_add_epi32( *noncev, sixteen ); + n += 16; + } while ( (n < last_nonce) && !work_restart[thr_id].restart ); + pdata[19] = n; + *hashes_done = n - first_nonce; + return 0; +} + +#endif + +#if defined(SHA256D_8WAY) + +int scanhash_sha256d_8way( struct work *work, const uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ) +{ + __m256i vdata[32] __attribute__ ((aligned (64))); + __m256i block[16] __attribute__ ((aligned (32))); + __m256i hash32[8] __attribute__ ((aligned (32))); + __m256i initstate[8] __attribute__ ((aligned (32))); + __m256i midstate1[8] __attribute__ ((aligned (32))); + __m256i midstate2[8] __attribute__ ((aligned (32))); + __m256i mexp_pre[16] __attribute__ ((aligned (32))); + uint32_t lane_hash[8] __attribute__ ((aligned (32))); + uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] ); + uint32_t *pdata = work->data; + const uint32_t *ptarget = work->target; + const uint32_t targ32_d7 = ptarget[7]; + const uint32_t first_nonce = pdata[19]; + const uint32_t last_nonce = max_nonce - 8; + uint32_t n = first_nonce; + __m256i *noncev = vdata + 19; + const int thr_id = mythr->id; + const bool bench = opt_benchmark; + const __m256i last_byte = m256_const1_32( 0x80000000 ); + const __m256i eight = m256_const1_32( 8 ); + + for ( int i = 0; i < 19; i++ ) + vdata[i] = m256_const1_32( pdata[i] ); + + *noncev = _mm256_set_epi32( n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+1, n ); + + vdata[16+4] = last_byte; + memset_zero_256( vdata+16 + 5, 10 ); + vdata[16+15] = m256_const1_32( 80*8 ); // bit count + + block[ 8] = last_byte; + memset_zero_256( block + 9, 6 ); + block[15] = m256_const1_32( 32*8 ); // bit count + + // initialize state + initstate[0] = m256_const1_64( 0x6A09E6676A09E667 ); + initstate[1] = m256_const1_64( 0xBB67AE85BB67AE85 ); + initstate[2] = m256_const1_64( 0x3C6EF3723C6EF372 ); + initstate[3] = m256_const1_64( 0xA54FF53AA54FF53A ); + initstate[4] = m256_const1_64( 0x510E527F510E527F ); + initstate[5] = m256_const1_64( 0x9B05688C9B05688C ); + initstate[6] = m256_const1_64( 0x1F83D9AB1F83D9AB ); + initstate[7] = m256_const1_64( 0x5BE0CD195BE0CD19 ); + + sha256_8way_transform_le( midstate1, vdata, initstate ); + + // Do 3 rounds on the first 12 bytes of the next block + sha256_8way_prehash_3rounds( midstate2, mexp_pre, vdata + 16, midstate1 ); + + do + { + // 1. final 16 bytes of data, with padding + sha256_8way_final_rounds( block, vdata+16, midstate1, midstate2, + mexp_pre ); + + // 2. 32 byte hash from 1. 
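+      //    Like the 16 way version, the _short transform returns zero when no
+      //    lane can produce a candidate hash, so the byte swap and per lane
+      //    target checks below are normally skipped.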
+ if ( unlikely( + sha256_8way_transform_le_short( hash32, block, initstate ) ) ) + { + // byte swap final hash for testing + mm256_block_bswap_32( hash32, hash32 ); + + for ( int lane = 0; lane < 8; lane++ ) + if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) ) + { + extr_lane_8x32( lane_hash, hash32, lane, 256 ); + if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) ) + { + pdata[19] = n + lane; + submit_solution( work, lane_hash, mythr ); + } + } + } + *noncev = _mm256_add_epi32( *noncev, eight ); + n += 8; + } while ( (n < last_nonce) && !work_restart[thr_id].restart ); + pdata[19] = n; + *hashes_done = n - first_nonce; + return 0; +} + +#endif + +#if defined(SHA256D_4WAY) + +int scanhash_sha256d_4way( struct work *work, const uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ) +{ + __m128i vdata[32] __attribute__ ((aligned (64))); + __m128i block[16] __attribute__ ((aligned (32))); + __m128i hash32[8] __attribute__ ((aligned (32))); + __m128i initstate[8] __attribute__ ((aligned (32))); + __m128i midstate1[8] __attribute__ ((aligned (32))); + __m128i midstate2[8] __attribute__ ((aligned (32))); + __m128i mexp_pre[16] __attribute__ ((aligned (32))); + uint32_t lane_hash[8] __attribute__ ((aligned (32))); + uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] ); + uint32_t *pdata = work->data; + const uint32_t *ptarget = work->target; + const uint32_t targ32_d7 = ptarget[7]; + const uint32_t first_nonce = pdata[19]; + const uint32_t last_nonce = max_nonce - 4; + uint32_t n = first_nonce; + __m128i *noncev = vdata + 19; + const int thr_id = mythr->id; + const bool bench = opt_benchmark; + const __m128i last_byte = m128_const1_32( 0x80000000 ); + const __m128i four = m128_const1_32( 4 ); + + for ( int i = 0; i < 19; i++ ) + vdata[i] = m128_const1_32( pdata[i] ); + + *noncev = _mm_set_epi32( n+ 3, n+ 2, n+1, n ); + + vdata[16+4] = last_byte; + memset_zero_128( vdata+16 + 5, 10 ); + vdata[16+15] = m128_const1_32( 80*8 ); // bit count + + block[ 8] = last_byte; + memset_zero_128( block + 9, 6 ); + block[15] = m128_const1_32( 32*8 ); // bit count + + // initialize state + initstate[0] = m128_const1_64( 0x6A09E6676A09E667 ); + initstate[1] = m128_const1_64( 0xBB67AE85BB67AE85 ); + initstate[2] = m128_const1_64( 0x3C6EF3723C6EF372 ); + initstate[3] = m128_const1_64( 0xA54FF53AA54FF53A ); + initstate[4] = m128_const1_64( 0x510E527F510E527F ); + initstate[5] = m128_const1_64( 0x9B05688C9B05688C ); + initstate[6] = m128_const1_64( 0x1F83D9AB1F83D9AB ); + initstate[7] = m128_const1_64( 0x5BE0CD195BE0CD19 ); + + // hash first 64 bytes of data + sha256_4way_transform_le( midstate1, vdata, initstate ); + // Do 3 rounds on the first 12 bytes of the next block + sha256_4way_prehash_3rounds( midstate2, mexp_pre, vdata + 16, midstate1 ); + + do + { + // 1. final 16 bytes of data, with padding + sha256_4way_final_rounds( block, vdata+16, midstate1, midstate2, + mexp_pre ); + + // 2. 32 byte hash from 1. 
+ if ( unlikely( + sha256_4way_transform_le_short( hash32, block, initstate ) ) ) + { + // byte swap final hash for testing + mm128_block_bswap_32( hash32, hash32 ); + + for ( int lane = 0; lane < 4; lane++ ) + if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) ) + { + extr_lane_4x32( lane_hash, hash32, lane, 256 ); + if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) ) + { + pdata[19] = n + lane; + submit_solution( work, lane_hash, mythr ); + } + } + } + *noncev = _mm_add_epi32( *noncev, four ); + n += 4; + } while ( (n < last_nonce) && !work_restart[thr_id].restart ); + pdata[19] = n; + *hashes_done = n - first_nonce; + return 0; +} + +#endif + +/* +bool register_sha256d_algo( algo_gate_t* gate ) +{ + gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT; +#if defined(SHA256D_16WAY) + gate->scanhash = (void*)&scanhash_sha256d_16way; +#elif defined(SHA256D_8WAY) + gate->scanhash = (void*)&scanhash_sha256d_8way; +#elif defined(SHA256D_4WAY) + gate->scanhash = (void*)&scanhash_sha256d_4way; +#endif + +// gate->hash = (void*)&sha256d; + return true; +}; +*/ + diff --git a/algo/sha/sha256d-4way.h b/algo/sha/sha256d-4way.h new file mode 100644 index 00000000..bae02148 --- /dev/null +++ b/algo/sha/sha256d-4way.h @@ -0,0 +1,46 @@ +#ifndef __SHA256D_4WAY_H__ +#define __SHA256D_4WAY_H__ 1 + +#include +#include "algo-gate-api.h" + +#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) + #define SHA256D_16WAY 1 +#elif defined(__AVX2__) + #define SHA256D_8WAY 1 +#else + #define SHA256D_4WAY 1 +#endif + +bool register_sha256d_algo( algo_gate_t* gate ); + +#if defined(SHA256D_16WAY) + +int scanhash_sha256d_16way( struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ); +#endif + +#if defined(SHA256D_8WAY) + +int scanhash_sha256d_8way( struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ); +#endif + +#if defined(SHA256D_4WAY) + +int scanhash_sha256d_4way( struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ); +#endif + + +/* +#if defined(__SHA__) + +int scanhash_sha256d( struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ); + +#endif +*/ + +#endif + diff --git a/algo/sha/sha256d.c b/algo/sha/sha256d.c new file mode 100644 index 00000000..ed4bd60d --- /dev/null +++ b/algo/sha/sha256d.c @@ -0,0 +1,8 @@ +#include "sha256d.h" + +void sha256d( void *hash, const void *data, int len ) +{ + sha256_full( hash, data, len ); + sha256_full( hash, hash, 32 ); +} + diff --git a/algo/sha/sha256d.h b/algo/sha/sha256d.h new file mode 100644 index 00000000..71f78eeb --- /dev/null +++ b/algo/sha/sha256d.h @@ -0,0 +1,7 @@ +#include "algo-gate-api.h" +#include +#include +#include "sha256-hash.h" + +void sha256d( void *hash, const void *data, int len ); + diff --git a/algo/sha/sha256q.c b/algo/sha/sha256q.c index cf9890e7..90a2b7bb 100644 --- a/algo/sha/sha256q.c +++ b/algo/sha/sha256q.c @@ -3,14 +3,14 @@ #include #include #include -#include "algo/sha/sph_sha2.h" +#include "algo/sha/sha256-hash.h" -static __thread sph_sha256_context sha256q_ctx __attribute__ ((aligned (64))); +static __thread sha256_context sha256q_ctx __attribute__ ((aligned (64))); void sha256q_midstate( const void* input ) { - sph_sha256_init( &sha256q_ctx ); - sph_sha256( &sha256q_ctx, input, 64 ); + sha256_ctx_init( &sha256q_ctx ); + sha256_update( &sha256q_ctx, input, 64 ); } int sha256q_hash( void* output, const void* input ) @@ -19,24 +19,16 @@ int sha256q_hash( 
void* output, const void* input ) const int midlen = 64; // bytes const int tail = 80 - midlen; // 16 - sph_sha256_context ctx __attribute__ ((aligned (64))); + sha256_context ctx __attribute__ ((aligned (64))); memcpy( &ctx, &sha256q_ctx, sizeof sha256q_ctx ); - sph_sha256( &ctx, input + midlen, tail ); - sph_sha256_close( &ctx, hash ); - - sph_sha256_init( &ctx ); - sph_sha256( &ctx, hash, 32 ); - sph_sha256_close( &ctx, hash ); - - sph_sha256_init( &ctx ); - sph_sha256( &ctx, hash, 32 ); - sph_sha256_close( &ctx, hash ); - - sph_sha256_init( &ctx ); - sph_sha256( &ctx, hash, 32 ); - sph_sha256_close( &ctx, output ); + sha256_update( &ctx, input + midlen, tail ); + sha256_final( &ctx, hash ); + sha256_full( hash, hash, 32 ); + sha256_full( hash, hash, 32 ); + sha256_full( output, hash, 32 ); + return 1; } diff --git a/algo/sha/sha256t-4way.c b/algo/sha/sha256t-4way.c index eb11744f..9c1677b1 100644 --- a/algo/sha/sha256t-4way.c +++ b/algo/sha/sha256t-4way.c @@ -7,64 +7,86 @@ #if defined(SHA256T_16WAY) -static __thread sha256_16way_context sha256_ctx16 __attribute__ ((aligned (64))); - -void sha256t_16way_hash( void* output, const void* input ) -{ - uint32_t vhash[8*16] __attribute__ ((aligned (64))); - sha256_16way_context ctx; - memcpy( &ctx, &sha256_ctx16, sizeof ctx ); - - sha256_16way_update( &ctx, input + (64<<4), 16 ); - sha256_16way_close( &ctx, vhash ); - - sha256_16way_init( &ctx ); - sha256_16way_update( &ctx, vhash, 32 ); - sha256_16way_close( &ctx, vhash ); - - sha256_16way_init( &ctx ); - sha256_16way_update( &ctx, vhash, 32 ); - sha256_16way_close( &ctx, output ); -} - int scanhash_sha256t_16way( struct work *work, const uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ) { - uint32_t vdata[20*16] __attribute__ ((aligned (64))); - uint32_t hash32[8*16] __attribute__ ((aligned (32))); - uint32_t lane_hash[8] __attribute__ ((aligned (32))); - uint32_t *hash32_d7 = &(hash32[7<<4]); + __m512i vdata[32] __attribute__ ((aligned (128))); + __m512i block[16] __attribute__ ((aligned (64))); + __m512i hash32[8] __attribute__ ((aligned (64))); + __m512i initstate[8] __attribute__ ((aligned (64))); + __m512i midstate1[8] __attribute__ ((aligned (64))); + __m512i midstate2[8] __attribute__ ((aligned (64))); + __m512i mexp_pre[16] __attribute__ ((aligned (64))); + uint32_t lane_hash[8] __attribute__ ((aligned (64))); + uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] ); uint32_t *pdata = work->data; const uint32_t *ptarget = work->target; const uint32_t targ32_d7 = ptarget[7]; const uint32_t first_nonce = pdata[19]; const uint32_t last_nonce = max_nonce - 16; uint32_t n = first_nonce; - __m512i *noncev = (__m512i*)vdata + 19; // aligned + __m512i *noncev = vdata + 19; const int thr_id = mythr->id; const bool bench = opt_benchmark; + const __m512i last_byte = m512_const1_32( 0x80000000 ); + const __m512i sixteen = m512_const1_32( 16 ); + + for ( int i = 0; i < 19; i++ ) + vdata[i] = m512_const1_32( pdata[i] ); - mm512_bswap32_intrlv80_16x32( vdata, pdata ); *noncev = _mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+9, n+8, n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+1, n ); - sha256_16way_init( &sha256_ctx16 ); - sha256_16way_update( &sha256_ctx16, vdata, 64 ); + + vdata[16+4] = last_byte; + memset_zero_512( vdata+16 + 5, 10 ); + vdata[16+15] = m512_const1_32( 80*8 ); // bit count + + block[ 8] = last_byte; + memset_zero_512( block + 9, 6 ); + block[15] = m512_const1_32( 32*8 ); // bit count + + initstate[0] = m512_const1_64( 0x6A09E6676A09E667 ); + initstate[1] = 
m512_const1_64( 0xBB67AE85BB67AE85 ); + initstate[2] = m512_const1_64( 0x3C6EF3723C6EF372 ); + initstate[3] = m512_const1_64( 0xA54FF53AA54FF53A ); + initstate[4] = m512_const1_64( 0x510E527F510E527F ); + initstate[5] = m512_const1_64( 0x9B05688C9B05688C ); + initstate[6] = m512_const1_64( 0x1F83D9AB1F83D9AB ); + initstate[7] = m512_const1_64( 0x5BE0CD195BE0CD19 ); + + sha256_16way_transform_le( midstate1, vdata, initstate ); + + // Do 3 rounds on the first 12 bytes of the next block + sha256_16way_prehash_3rounds( midstate2, mexp_pre, vdata+16, midstate1 ); do { - pdata[19] = n; - sha256t_16way_hash( hash32, vdata ); - for ( int lane = 0; lane < 16; lane++ ) - if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) ) - { - extr_lane_16x32( lane_hash, hash32, lane, 256 ); - if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) ) - { - pdata[19] = bswap_32( n + lane ); - submit_solution( work, lane_hash, mythr ); - } + // 1. final 16 bytes of data, pre-padded + sha256_16way_final_rounds( block, vdata+16, midstate1, midstate2, + mexp_pre ); + + // 2. 32 byte hash from 1. + sha256_16way_transform_le( block, block, initstate ); + + // 3. 32 byte hash from 2. + if ( unlikely( + sha256_16way_transform_le_short( hash32, block, initstate ) ) ) + { + // byte swap final hash for testing + mm512_block_bswap_32( hash32, hash32 ); + + for ( int lane = 0; lane < 16; lane++ ) + if ( hash32_d7[ lane ] <= targ32_d7 ) + { + extr_lane_16x32( lane_hash, hash32, lane, 256 ); + if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) ) + { + pdata[19] = n + lane; + submit_solution( work, lane_hash, mythr ); + } + } } - *noncev = _mm512_add_epi32( *noncev, m512_const1_32( 16 ) ); + *noncev = _mm512_add_epi32( *noncev, sixteen ); n += 16; } while ( (n < last_nonce) && !work_restart[thr_id].restart ); pdata[19] = n; @@ -72,67 +94,91 @@ int scanhash_sha256t_16way( struct work *work, const uint32_t max_nonce, return 0; } + #endif #if defined(SHA256T_8WAY) -static __thread sha256_8way_context sha256_ctx8 __attribute__ ((aligned (64))); - -void sha256t_8way_hash( void* output, const void* input ) -{ - uint32_t vhash[8*8] __attribute__ ((aligned (64))); - sha256_8way_context ctx; - memcpy( &ctx, &sha256_ctx8, sizeof ctx ); - - sha256_8way_update( &ctx, input + (64<<3), 16 ); - sha256_8way_close( &ctx, vhash ); - - sha256_8way_init( &ctx ); - sha256_8way_update( &ctx, vhash, 32 ); - sha256_8way_close( &ctx, vhash ); - - sha256_8way_init( &ctx ); - sha256_8way_update( &ctx, vhash, 32 ); - sha256_8way_close( &ctx, output ); -} - int scanhash_sha256t_8way( struct work *work, const uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ) { - uint32_t vdata[20*8] __attribute__ ((aligned (64))); - uint32_t hash32[8*8] __attribute__ ((aligned (32))); + __m256i vdata[32] __attribute__ ((aligned (64))); + __m256i block[16] __attribute__ ((aligned (32))); + __m256i hash32[8] __attribute__ ((aligned (32))); + __m256i initstate[8] __attribute__ ((aligned (32))); + __m256i midstate1[8] __attribute__ ((aligned (32))); + __m256i midstate2[8] __attribute__ ((aligned (32))); + __m256i mexp_pre[16] __attribute__ ((aligned (32))); uint32_t lane_hash[8] __attribute__ ((aligned (32))); - uint32_t *hash32_d7 = &(hash32[7<<3]); + uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] ); uint32_t *pdata = work->data; const uint32_t *ptarget = work->target; const uint32_t targ32_d7 = ptarget[7]; const uint32_t first_nonce = pdata[19]; const uint32_t last_nonce = max_nonce - 8; uint32_t n = first_nonce; - __m256i *noncev = (__m256i*)vdata + 19; 
// aligned + __m256i *noncev = vdata + 19; const int thr_id = mythr->id; const bool bench = opt_benchmark; - - mm256_bswap32_intrlv80_8x32( vdata, pdata ); - *noncev = _mm256_set_epi32( n+7, n+6, n+5, n+4, n+3, n+2, n+1, n ); - sha256_8way_init( &sha256_ctx8 ); - sha256_8way_update( &sha256_ctx8, vdata, 64 ); - + const __m256i last_byte = m256_const1_32( 0x80000000 ); + const __m256i eight = m256_const1_32( 8 ); + + for ( int i = 0; i < 19; i++ ) + vdata[i] = m256_const1_32( pdata[i] ); + + *noncev = _mm256_set_epi32( n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+1, n ); + + vdata[16+4] = last_byte; + memset_zero_256( vdata+16 + 5, 10 ); + vdata[16+15] = m256_const1_32( 80*8 ); // bit count + + block[ 8] = last_byte; + memset_zero_256( block + 9, 6 ); + block[15] = m256_const1_32( 32*8 ); // bit count + + // initialize state + initstate[0] = m256_const1_64( 0x6A09E6676A09E667 ); + initstate[1] = m256_const1_64( 0xBB67AE85BB67AE85 ); + initstate[2] = m256_const1_64( 0x3C6EF3723C6EF372 ); + initstate[3] = m256_const1_64( 0xA54FF53AA54FF53A ); + initstate[4] = m256_const1_64( 0x510E527F510E527F ); + initstate[5] = m256_const1_64( 0x9B05688C9B05688C ); + initstate[6] = m256_const1_64( 0x1F83D9AB1F83D9AB ); + initstate[7] = m256_const1_64( 0x5BE0CD195BE0CD19 ); + + sha256_8way_transform_le( midstate1, vdata, initstate ); + + // Do 3 rounds on the first 12 bytes of the next block + sha256_8way_prehash_3rounds( midstate2, mexp_pre, vdata + 16, midstate1 ); + do { - pdata[19] = n; - sha256t_8way_hash( hash32, vdata ); - for ( int lane = 0; lane < 8; lane++ ) - if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) ) - { - extr_lane_8x32( lane_hash, hash32, lane, 256 ); - if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) ) - { - pdata[19] = bswap_32( n + lane ); - submit_solution( work, lane_hash, mythr ); - } + // 1. final 16 bytes of data, with padding + sha256_8way_final_rounds( block, vdata+16, midstate1, midstate2, + mexp_pre ); + + // 2. 32 byte hash from 1. + sha256_8way_transform_le( block, block, initstate ); + + // 3. 32 byte hash from 2. 
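+      //    Only this third and final sha256 uses the _short transform with the
+      //    early abort; the first two hashes feed the next stage and must run
+      //    to completion.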
+ if ( unlikely( + sha256_8way_transform_le_short( hash32, block, initstate ) ) ) + { + // byte swap final hash for testing + mm256_block_bswap_32( hash32, hash32 ); + + for ( int lane = 0; lane < 8; lane++ ) + if ( hash32_d7[ lane ] <= targ32_d7 ) + { + extr_lane_8x32( lane_hash, hash32, lane, 256 ); + if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) ) + { + pdata[19] = n + lane; + submit_solution( work, lane_hash, mythr ); + } + } } - *noncev = _mm256_add_epi32( *noncev, m256_const1_32( 8 ) ); + *noncev = _mm256_add_epi32( *noncev, eight ); n += 8; } while ( (n < last_nonce) && !work_restart[thr_id].restart ); pdata[19] = n; @@ -142,86 +188,174 @@ int scanhash_sha256t_8way( struct work *work, const uint32_t max_nonce, #endif -#if defined(SHA256T_4WAY) -static __thread sha256_4way_context sha256_ctx4 __attribute__ ((aligned (64))); +#if defined(SHA256T_4WAY) -void sha256t_4way_hash( void* output, const void* input ) +// Optimizations are slower with AVX/SSE2 +// https://github.com/JayDDee/cpuminer-opt/issues/344 +/* +int scanhash_sha256t_4way( struct work *work, const uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ) { - uint32_t vhash[8*4] __attribute__ ((aligned (64))); - sha256_4way_context ctx; - memcpy( &ctx, &sha256_ctx4, sizeof ctx ); + __m128i vdata[32] __attribute__ ((aligned (64))); + __m128i block[16] __attribute__ ((aligned (32))); + __m128i hash32[8] __attribute__ ((aligned (32))); + __m128i initstate[8] __attribute__ ((aligned (32))); + __m128i midstate1[8] __attribute__ ((aligned (32))); + __m128i midstate2[8] __attribute__ ((aligned (32))); + __m128i mexp_pre[16] __attribute__ ((aligned (32))); + uint32_t lane_hash[8] __attribute__ ((aligned (32))); + uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] ); + uint32_t *pdata = work->data; + const uint32_t *ptarget = work->target; + const uint32_t targ32_d7 = ptarget[7]; + const uint32_t first_nonce = pdata[19]; + const uint32_t last_nonce = max_nonce - 4; + uint32_t n = first_nonce; + __m128i *noncev = vdata + 19; + const int thr_id = mythr->id; + const bool bench = opt_benchmark; + const __m128i last_byte = m128_const1_32( 0x80000000 ); + const __m128i four = m128_const1_32( 4 ); + + for ( int i = 0; i < 19; i++ ) + vdata[i] = m128_const1_32( pdata[i] ); + + *noncev = _mm_set_epi32( n+ 3, n+ 2, n+1, n ); + + vdata[16+4] = last_byte; + memset_zero_128( vdata+16 + 5, 10 ); + vdata[16+15] = m128_const1_32( 80*8 ); // bit count + + block[ 8] = last_byte; + memset_zero_128( block + 9, 6 ); + block[15] = m128_const1_32( 32*8 ); // bit count + + // initialize state + initstate[0] = m128_const1_64( 0x6A09E6676A09E667 ); + initstate[1] = m128_const1_64( 0xBB67AE85BB67AE85 ); + initstate[2] = m128_const1_64( 0x3C6EF3723C6EF372 ); + initstate[3] = m128_const1_64( 0xA54FF53AA54FF53A ); + initstate[4] = m128_const1_64( 0x510E527F510E527F ); + initstate[5] = m128_const1_64( 0x9B05688C9B05688C ); + initstate[6] = m128_const1_64( 0x1F83D9AB1F83D9AB ); + initstate[7] = m128_const1_64( 0x5BE0CD195BE0CD19 ); + + // hash first 64 bytes of data + sha256_4way_transform_le( midstate1, vdata, initstate ); + + // Do 3 rounds on the first 12 bytes of the next block + sha256_4way_prehash_3rounds( midstate2, mexp_pre, vdata + 16, midstate1 ); + + do + { + // 1. final 16 bytes of data, with padding + sha256_4way_final_rounds( block, vdata+16, midstate1, midstate2, + mexp_pre ); - sha256_4way_update( &ctx, input + (64<<2), 16 ); - sha256_4way_close( &ctx, vhash ); + // 2. 32 byte hash from 1. 
+ sha256_4way_transform_le( block, block, initstate ); - sha256_4way_init( &ctx ); - sha256_4way_update( &ctx, vhash, 32 ); - sha256_4way_close( &ctx, vhash ); + // 3. 32 byte hash from 2. + if ( unlikely( + sha256_4way_transform_le_short( hash32, block, initstate ) ) ) + { + // byte swap final hash for testing + mm128_block_bswap_32( hash32, hash32 ); - sha256_4way_init( &ctx ); - sha256_4way_update( &ctx, vhash, 32 ); - sha256_4way_close( &ctx, output ); + for ( int lane = 0; lane < 4; lane++ ) + if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) ) + { + extr_lane_4x32( lane_hash, hash32, lane, 256 ); + if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) ) + { + pdata[19] = n + lane; + submit_solution( work, lane_hash, mythr ); + } + } + } + *noncev = _mm_add_epi32( *noncev, four ); + n += 4; + } while ( (n < last_nonce) && !work_restart[thr_id].restart ); + pdata[19] = n; + *hashes_done = n - first_nonce; + return 0; } +*/ int scanhash_sha256t_4way( struct work *work, const uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ) + uint64_t *hashes_done, struct thr_info *mythr ) { - uint32_t vdata[20*4] __attribute__ ((aligned (64))); - uint32_t hash[8*4] __attribute__ ((aligned (32))); - uint32_t lane_hash[8] __attribute__ ((aligned (64))); - uint32_t *hash7 = &(hash[7<<2]); + __m128i vdata[32] __attribute__ ((aligned (64))); + __m128i block[16] __attribute__ ((aligned (32))); + __m128i hash32[8] __attribute__ ((aligned (32))); + __m128i initstate[8] __attribute__ ((aligned (32))); + __m128i midstate[8] __attribute__ ((aligned (32))); + uint32_t lane_hash[8] __attribute__ ((aligned (32))); + uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] ); uint32_t *pdata = work->data; const uint32_t *ptarget = work->target; - const uint32_t Htarg = ptarget[7]; + const uint32_t targ32_d7 = ptarget[7]; const uint32_t first_nonce = pdata[19]; + const uint32_t last_nonce = max_nonce - 4; uint32_t n = first_nonce; - __m128i *noncev = (__m128i*)vdata + 19; // aligned + __m128i *noncev = vdata + 19; const int thr_id = mythr->id; + const bool bench = opt_benchmark; + const __m128i last_byte = m128_const1_32( 0x80000000 ); + const __m128i four = m128_const1_32( 4 ); + + for ( int i = 0; i < 19; i++ ) + vdata[i] = m128_const1_32( pdata[i] ); + + *noncev = _mm_set_epi32( n+ 3, n+ 2, n+1, n ); + + vdata[16+4] = last_byte; + memset_zero_128( vdata+16 + 5, 10 ); + vdata[16+15] = m128_const1_32( 80*8 ); // bit count + + block[ 8] = last_byte; + memset_zero_128( block + 9, 6 ); + block[15] = m128_const1_32( 32*8 ); // bit count + + // initialize state + initstate[0] = m128_const1_64( 0x6A09E6676A09E667 ); + initstate[1] = m128_const1_64( 0xBB67AE85BB67AE85 ); + initstate[2] = m128_const1_64( 0x3C6EF3723C6EF372 ); + initstate[3] = m128_const1_64( 0xA54FF53AA54FF53A ); + initstate[4] = m128_const1_64( 0x510E527F510E527F ); + initstate[5] = m128_const1_64( 0x9B05688C9B05688C ); + initstate[6] = m128_const1_64( 0x1F83D9AB1F83D9AB ); + initstate[7] = m128_const1_64( 0x5BE0CD195BE0CD19 ); + + // hash first 64 bytes of data + sha256_4way_transform_le( midstate, vdata, initstate ); - const uint64_t htmax[] = { 0, - 0xF, - 0xFF, - 0xFFF, - 0xFFFF, - 0x10000000 }; - const uint32_t masks[] = { 0xFFFFFFFF, - 0xFFFFFFF0, - 0xFFFFFF00, - 0xFFFFF000, - 0xFFFF0000, - 0 }; - - mm128_bswap32_intrlv80_4x32( vdata, pdata ); - sha256_4way_init( &sha256_ctx4 ); - sha256_4way_update( &sha256_ctx4, vdata, 64 ); - - for ( int m = 0; m < 6; m++ ) if ( Htarg <= htmax[m] ) + do { - const uint32_t mask = masks[m]; - do { - 
*noncev = mm128_bswap_32( _mm_set_epi32( n+3,n+2,n+1,n ) ); - pdata[19] = n; - - sha256t_4way_hash( hash, vdata ); - - for ( int lane = 0; lane < 4; lane++ ) - if ( !( hash7[ lane ] & mask ) ) + sha256_4way_transform_le( block, vdata+16, midstate ); + sha256_4way_transform_le( block, block, initstate ); + sha256_4way_transform_le( hash32, block, initstate ); + mm128_block_bswap_32( hash32, hash32 ); + + for ( int lane = 0; lane < 4; lane++ ) + if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) ) + { + extr_lane_4x32( lane_hash, hash32, lane, 256 ); + if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) ) { - extr_lane_4x32( lane_hash, hash, lane, 256 ); - if ( fulltest( lane_hash, ptarget ) && !opt_benchmark ) - { - pdata[19] = n + lane; - submit_solution( work, lane_hash, mythr ); - } - } - n += 4; - } while ( (n < max_nonce - 4) && !work_restart[thr_id].restart ); - break; - } - *hashes_done = n - first_nonce + 1; + pdata[19] = n + lane; + submit_solution( work, lane_hash, mythr ); + } + } + *noncev = _mm_add_epi32( *noncev, four ); + n += 4; + } while ( (n < last_nonce) && !work_restart[thr_id].restart ); + pdata[19] = n; + *hashes_done = n - first_nonce; return 0; } + #endif diff --git a/algo/sha/sha256t-gate.c b/algo/sha/sha256t-gate.c index 166efe22..e05c7060 100644 --- a/algo/sha/sha256t-gate.c +++ b/algo/sha/sha256t-gate.c @@ -5,17 +5,13 @@ bool register_sha256t_algo( algo_gate_t* gate ) gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT; #if defined(SHA256T_16WAY) gate->scanhash = (void*)&scanhash_sha256t_16way; - gate->hash = (void*)&sha256t_16way_hash; #elif defined(__SHA__) gate->optimizations = SHA_OPT; gate->scanhash = (void*)&scanhash_sha256t; - gate->hash = (void*)&sha256t_hash; #elif defined(SHA256T_8WAY) gate->scanhash = (void*)&scanhash_sha256t_8way; - gate->hash = (void*)&sha256t_8way_hash; #else gate->scanhash = (void*)&scanhash_sha256t_4way; - gate->hash = (void*)&sha256t_4way_hash; #endif return true; } diff --git a/algo/sha/sha256t-gate.h b/algo/sha/sha256t-gate.h index 46266f2b..e74cfd1d 100644 --- a/algo/sha/sha256t-gate.h +++ b/algo/sha/sha256t-gate.h @@ -17,7 +17,6 @@ bool register_sha256q_algo( algo_gate_t* gate ); #if defined(SHA256T_16WAY) -void sha256t_16way_hash( void *output, const void *input ); int scanhash_sha256t_16way( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ); void sha256q_16way_hash( void *output, const void *input ); @@ -27,7 +26,6 @@ int scanhash_sha256q_16way( struct work *work, uint32_t max_nonce, #if defined(SHA256T_8WAY) -void sha256t_8way_hash( void *output, const void *input ); int scanhash_sha256t_8way( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ); void sha256q_8way_hash( void *output, const void *input ); @@ -37,7 +35,6 @@ int scanhash_sha256q_8way( struct work *work, uint32_t max_nonce, #if defined(SHA256T_4WAY) -void sha256t_4way_hash( void *output, const void *input ); int scanhash_sha256t_4way( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ); void sha256q_4way_hash( void *output, const void *input ); @@ -45,10 +42,13 @@ int scanhash_sha256q_4way( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ); #endif +#if defined(__SHA__) -int sha256t_hash( void *output, const void *input ); int scanhash_sha256t( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ); + +#endif + int sha256q_hash( void *output, const void *input ); int 
scanhash_sha256q( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ); diff --git a/algo/sha/sha256t.c b/algo/sha/sha256t.c index bd4edf0f..c528d279 100644 --- a/algo/sha/sha256t.c +++ b/algo/sha/sha256t.c @@ -3,46 +3,23 @@ #include #include #include -#include "algo/sha/sph_sha2.h" +//#include "algo/sha/sph_sha2.h" +#include "sha256-hash.h" -// Only used on CPUs with SHA - -static __thread sph_sha256_context sha256t_ctx __attribute__ ((aligned (64))); - -void sha256t_midstate( const void* input ) -{ - sph_sha256_init( &sha256t_ctx ); - sph_sha256( &sha256t_ctx, input, 64 ); -} +#if defined(__SHA__) -int sha256t_hash( void* output, const void* input ) -{ - uint32_t _ALIGN(64) hash[16]; - const int midlen = 64; // bytes - const int tail = 80 - midlen; // 16 - - sph_sha256_context ctx __attribute__ ((aligned (64))); - memcpy( &ctx, &sha256t_ctx, sizeof sha256t_ctx ); - - sph_sha256( &ctx, input + midlen, tail ); - sph_sha256_close( &ctx, hash ); - - sph_sha256_init( &ctx ); - sph_sha256( &ctx, hash, 32 ); - sph_sha256_close( &ctx, hash ); - - sph_sha256_init( &ctx ); - sph_sha256( &ctx, hash, 32 ); - sph_sha256_close( &ctx, output ); +// Only used on CPUs with SHA - return 1; -} int scanhash_sha256t( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ) { - uint32_t edata[20] __attribute__((aligned(64))); - uint32_t hash[8] __attribute__((aligned(64))); + uint32_t block0[16] __attribute__ ((aligned (64))); + uint32_t block1[16] __attribute__ ((aligned (64))); + uint32_t hash0[8] __attribute__ ((aligned (32))); + uint32_t hash1[8] __attribute__ ((aligned (32))); + uint32_t initstate[8] __attribute__ ((aligned (32))); + uint32_t midstate[8] __attribute__ ((aligned (32))); uint32_t *pdata = work->data; uint32_t *ptarget = work->target; const uint32_t first_nonce = pdata[19]; @@ -50,24 +27,76 @@ int scanhash_sha256t( struct work *work, uint32_t max_nonce, uint32_t n = first_nonce; const int thr_id = mythr->id; const bool bench = opt_benchmark; + __m128i shuf_bswap32 = + _mm_set_epi64x( 0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL ); + + // initialize state + initstate[0] = 0x6A09E667; + initstate[1] = 0xBB67AE85; + initstate[2] = 0x3C6EF372; + initstate[3] = 0xA54FF53A; + initstate[4] = 0x510E527F; + initstate[5] = 0x9B05688C; + initstate[6] = 0x1F83D9AB; + initstate[7] = 0x5BE0CD19; - mm128_bswap32_80( edata, pdata ); - sha256t_midstate( edata ); + // hash first 64 bytes of data + sha256_opt_transform_le( midstate, pdata, initstate ); do { - edata[19] = n; - if ( likely( sha256t_hash( hash, edata ) ) ) - if ( unlikely( valid_hash( hash, ptarget ) && !bench ) ) + // 1. final 16 bytes of data, with padding + memcpy( block0, pdata + 16, 16 ); + memcpy( block1, pdata + 16, 16 ); + block0[ 3] = n; + block1[ 3] = n+1; + block0[ 4] = block1[ 4] = 0x80000000; + memset( block0 + 5, 0, 40 ); + memset( block1 + 5, 0, 40 ); + block0[15] = block1[15] = 80*8; // bit count + sha256_ni2way_transform_le( hash0, hash1, block0, block1, midstate, midstate ); + + // 2. 32 byte hash from 1. + memcpy( block0, hash0, 32 ); + memcpy( block1, hash1, 32 ); + block0[ 8] = block1[ 8] = 0x80000000; + memset( block0 + 9, 0, 24 ); + memset( block1 + 9, 0, 24 ); + block0[15] = block1[15] = 32*8; // bit count + sha256_ni2way_transform_le( hash0, hash1, block0, block1, initstate, initstate ); + + // 3. 32 byte hash from 2. 
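+      // The padding byte, zero fill and bit count written for step 2 are still
+      // in block0/block1, so only the first 32 bytes need to be replaced.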
+ memcpy( block0, hash0, 32 ); + memcpy( block1, hash1, 32 ); + sha256_ni2way_transform_le( hash0, hash1, block0, block1, initstate, initstate ); + + // byte swap final hash for testing + casti_m128i( hash0, 0 ) = + _mm_shuffle_epi8( casti_m128i( hash0, 0 ), shuf_bswap32 ); + casti_m128i( hash0, 1 ) = + _mm_shuffle_epi8( casti_m128i( hash0, 1 ), shuf_bswap32 ); + casti_m128i( hash1, 0 ) = + _mm_shuffle_epi8( casti_m128i( hash1, 0 ), shuf_bswap32 ); + casti_m128i( hash1, 1 ) = + _mm_shuffle_epi8( casti_m128i( hash1, 1 ), shuf_bswap32 ); + + if ( unlikely( valid_hash( hash0, ptarget ) && !bench ) ) { - pdata[19] = bswap_32( n ); - submit_solution( work, hash, mythr ); + pdata[19] = n; + submit_solution( work, hash0, mythr ); } - n++; - } while ( n < last_nonce && !work_restart[thr_id].restart ); - *hashes_done = n - first_nonce; + if ( unlikely( valid_hash( hash1, ptarget ) && !bench ) ) + { + pdata[19] = n+1; + submit_solution( work, hash1, mythr ); + } + n += 2; + } while ( (n < last_nonce) && !work_restart[thr_id].restart ); + pdata[19] = n; + *hashes_done = n - first_nonce; return 0; } +#endif diff --git a/algo/sha/sha512-hash-4way.c b/algo/sha/sha512-hash-4way.c index 9f5349b0..7c96d2eb 100644 --- a/algo/sha/sha512-hash-4way.c +++ b/algo/sha/sha512-hash-4way.c @@ -95,84 +95,36 @@ static const uint64_t K512[80] = // SHA-512 8 way 64 bit -#define CH8W(X, Y, Z) \ - _mm512_xor_si512( _mm512_and_si512( _mm512_xor_si512( Y, Z ), X ), Z ) +#define CH8W( X, Y, Z ) _mm512_ternarylogic_epi64( X, Y, Z, 0xca ) -#define MAJ8W(X, Y, Z) \ - _mm512_or_si512( _mm512_and_si512( X, Y ), \ - _mm512_and_si512( _mm512_or_si512( X, Y ), Z ) ) +#define MAJ8W( X, Y, Z ) _mm512_ternarylogic_epi64( X, Y, Z, 0xe8 ) -#define BSG8W_5_0(x) \ - _mm512_xor_si512( _mm512_xor_si512( \ - mm512_ror_64(x, 28), mm512_ror_64(x, 34) ), mm512_ror_64(x, 39) ) +#define BSG8W_5_0( x ) mm512_xor3( _mm512_ror_epi64( x, 28 ), \ + _mm512_ror_epi64( x, 34 ), \ + _mm512_ror_epi64( x, 39 ) ) -#define BSG8W_5_1(x) \ - _mm512_xor_si512( _mm512_xor_si512( \ - mm512_ror_64(x, 14), mm512_ror_64(x, 18) ), mm512_ror_64(x, 41) ) +#define BSG8W_5_1( x ) mm512_xor3( _mm512_ror_epi64( x, 14 ), \ + _mm512_ror_epi64( x, 18 ), \ + _mm512_ror_epi64( x, 41 ) ) -#define SSG8W_5_0(x) \ - _mm512_xor_si512( _mm512_xor_si512( \ - mm512_ror_64(x, 1), mm512_ror_64(x, 8) ), _mm512_srli_epi64(x, 7) ) +#define SSG8W_5_0( x ) mm512_xor3( _mm512_ror_epi64( x, 1 ), \ + _mm512_ror_epi64( x, 8 ), \ + _mm512_srli_epi64( x, 7 ) ) -#define SSG8W_5_1(x) \ - _mm512_xor_si512( _mm512_xor_si512( \ - mm512_ror_64(x, 19), mm512_ror_64(x, 61) ), _mm512_srli_epi64(x, 6) ) +#define SSG8W_5_1( x ) mm512_xor3( _mm512_ror_epi64( x, 19 ), \ + _mm512_ror_epi64( x, 61 ), \ + _mm512_srli_epi64( x, 6 ) ) -static inline __m512i ssg8w_512_add( __m512i w0, __m512i w1 ) -{ - __m512i w0a, w1a, w0b, w1b; - w0a = mm512_ror_64( w0, 1 ); - w1a = mm512_ror_64( w1,19 ); - w0b = mm512_ror_64( w0, 8 ); - w1b = mm512_ror_64( w1,61 ); - w0a = _mm512_xor_si512( w0a, w0b ); - w1a = _mm512_xor_si512( w1a, w1b ); - w0b = _mm512_srli_epi64( w0, 7 ); - w1b = _mm512_srli_epi64( w1, 6 ); - w0a = _mm512_xor_si512( w0a, w0b ); - w1a = _mm512_xor_si512( w1a, w1b ); - return _mm512_add_epi64( w0a, w1a ); -} - - -#define SSG8W_512x2_0( w0, w1, i ) do \ -{ \ - __m512i X0a, X1a, X0b, X1b; \ - X0a = mm512_ror_64( W[i-15], 1 ); \ - X1a = mm512_ror_64( W[i-14], 1 ); \ - X0b = mm512_ror_64( W[i-15], 8 ); \ - X1b = mm512_ror_64( W[i-14], 8 ); \ - X0a = _mm512_xor_si512( X0a, X0b ); \ - X1a = _mm512_xor_si512( X1a, X1b ); 
\ - X0b = _mm512_srli_epi64( W[i-15], 7 ); \ - X1b = _mm512_srli_epi64( W[i-14], 7 ); \ - w0 = _mm512_xor_si512( X0a, X0b ); \ - w1 = _mm512_xor_si512( X1a, X1b ); \ -} while(0) - -#define SSG8W_512x2_1( w0, w1, i ) do \ -{ \ - __m512i X0a, X1a, X0b, X1b; \ - X0a = mm512_ror_64( W[i-2],19 ); \ - X1a = mm512_ror_64( W[i-1],19 ); \ - X0b = mm512_ror_64( W[i-2],61 ); \ - X1b = mm512_ror_64( W[i-1],61 ); \ - X0a = _mm512_xor_si512( X0a, X0b ); \ - X1a = _mm512_xor_si512( X1a, X1b ); \ - X0b = _mm512_srli_epi64( W[i-2], 6 ); \ - X1b = _mm512_srli_epi64( W[i-1], 6 ); \ - w0 = _mm512_xor_si512( X0a, X0b ); \ - w1 = _mm512_xor_si512( X1a, X1b ); \ -} while(0) - -#define SHA3_8WAY_STEP(A, B, C, D, E, F, G, H, i) \ +#define SHA3_8WAY_STEP( A, B, C, D, E, F, G, H, i ) \ do { \ - __m512i T1, T2; \ - __m512i K = _mm512_set1_epi64( K512[ i ] ); \ - T1 = _mm512_add_epi64( H, mm512_add4_64( BSG8W_5_1(E), CH8W(E, F, G), \ - K, W[i] ) ); \ - T2 = _mm512_add_epi64( BSG8W_5_0(A), MAJ8W(A, B, C) ); \ - D = _mm512_add_epi64( D, T1 ); \ + __m512i T0 = _mm512_add_epi64( _mm512_set1_epi64( K512[i] ), W[ i ] ); \ + __m512i T1 = BSG8W_5_1( E ); \ + __m512i T2 = BSG8W_5_0( A ); \ + T0 = _mm512_add_epi64( T0, CH8W( E, F, G ) ); \ + T1 = _mm512_add_epi64( T1, H ); \ + T2 = _mm512_add_epi64( T2, MAJ8W( A, B, C ) ); \ + T1 = _mm512_add_epi64( T1, T0 ); \ + D = _mm512_add_epi64( D, T1 ); \ H = _mm512_add_epi64( T1, T2 ); \ } while (0) @@ -187,8 +139,8 @@ sha512_8way_round( sha512_8way_context *ctx, __m512i *in, __m512i r[8] ) mm512_block_bswap_64( W+8, in+8 ); for ( i = 16; i < 80; i++ ) - W[i] = _mm512_add_epi64( ssg8w_512_add( W[i-15], W[i-2] ), - _mm512_add_epi64( W[ i- 7 ], W[ i-16 ] ) ); + W[i] = mm512_add4_64( SSG8W_5_0( W[i-15] ), SSG8W_5_1( W[i-2] ), + W[ i- 7 ], W[ i-16 ] ); if ( ctx->initialized ) { @@ -319,14 +271,13 @@ void sha512_8way_close( sha512_8way_context *sc, void *dst ) // SHA-512 4 way 64 bit -/* #define CH(X, Y, Z) \ _mm256_xor_si256( _mm256_and_si256( _mm256_xor_si256( Y, Z ), X ), Z ) #define MAJ(X, Y, Z) \ - _mm256_or_si256( _mm256_and_si256( X, Y ), \ - _mm256_and_si256( _mm256_or_si256( X, Y ), Z ) ) - + _mm256_xor_si256( Y, _mm256_and_si256( X_xor_Y = _mm256_xor_si256( X, Y ), \ + Y_xor_Z ) ) + #define BSG5_0(x) \ mm256_ror_64( _mm256_xor_si256( mm256_ror_64( \ _mm256_xor_si256( mm256_ror_64( x, 5 ), x ), 6 ), x ), 28 ) @@ -334,16 +285,7 @@ void sha512_8way_close( sha512_8way_context *sc, void *dst ) #define BSG5_1(x) \ mm256_ror_64( _mm256_xor_si256( mm256_ror_64( \ _mm256_xor_si256( mm256_ror_64( x, 23 ), x ), 4 ), x ), 14 ) -*/ -/* -#define BSG5_0(x) \ - _mm256_xor_si256( _mm256_xor_si256( \ - mm256_ror_64(x, 28), mm256_ror_64(x, 34) ), mm256_ror_64(x, 39) ) -#define BSG5_1(x) \ - _mm256_xor_si256( _mm256_xor_si256( \ - mm256_ror_64(x, 14), mm256_ror_64(x, 18) ), mm256_ror_64(x, 41) ) -*/ /* #define SSG5_0(x) \ _mm256_xor_si256( _mm256_xor_si256( \ @@ -371,98 +313,25 @@ static inline __m256i ssg512_add( __m256i w0, __m256i w1 ) return _mm256_add_epi64( w0a, w1a ); } -/* -#define SSG512x2_0( w0, w1, i ) do \ -{ \ - __m256i X0a, X1a, X0b, X1b; \ - X0a = mm256_ror_64( W[i-15], 1 ); \ - X1a = mm256_ror_64( W[i-14], 1 ); \ - X0b = mm256_ror_64( W[i-15], 8 ); \ - X1b = mm256_ror_64( W[i-14], 8 ); \ - X0a = _mm256_xor_si256( X0a, X0b ); \ - X1a = _mm256_xor_si256( X1a, X1b ); \ - X0b = _mm256_srli_epi64( W[i-15], 7 ); \ - X1b = _mm256_srli_epi64( W[i-14], 7 ); \ - w0 = _mm256_xor_si256( X0a, X0b ); \ - w1 = _mm256_xor_si256( X1a, X1b ); \ -} while(0) - -#define SSG512x2_1( w0, w1, i ) do \ -{ \ - 
__m256i X0a, X1a, X0b, X1b; \ - X0a = mm256_ror_64( W[i-2],19 ); \ - X1a = mm256_ror_64( W[i-1],19 ); \ - X0b = mm256_ror_64( W[i-2],61 ); \ - X1b = mm256_ror_64( W[i-1],61 ); \ - X0a = _mm256_xor_si256( X0a, X0b ); \ - X1a = _mm256_xor_si256( X1a, X1b ); \ - X0b = _mm256_srli_epi64( W[i-2], 6 ); \ - X1b = _mm256_srli_epi64( W[i-1], 6 ); \ - w0 = _mm256_xor_si256( X0a, X0b ); \ - w1 = _mm256_xor_si256( X1a, X1b ); \ -} while(0) -*/ - -#define SHA3_4WAY_STEP(A, B, C, D, E, F, G, H, i) \ -do { \ - __m256i K = _mm256_set1_epi64x( K512[ i ] ); \ - __m256i T1 = mm256_ror_64( E, 23 ); \ - __m256i T2 = mm256_ror_64( A, 5 ); \ - __m256i T3 = _mm256_xor_si256( F, G ); \ - __m256i T4 = _mm256_or_si256( A, B ); \ - __m256i T5 = _mm256_and_si256( A, B ); \ - K = _mm256_add_epi64( K, W[i] ); \ - T1 = _mm256_xor_si256( T1, E ); \ - T2 = _mm256_xor_si256( T2, A ); \ - T3 = _mm256_and_si256( T3, E ); \ - T4 = _mm256_and_si256( T4, C ); \ - K = _mm256_add_epi64( H, K ); \ - T1 = mm256_ror_64( T1, 4 ); \ - T2 = mm256_ror_64( T2, 6 ); \ - T3 = _mm256_xor_si256( T3, G ); \ - T4 = _mm256_or_si256( T4, T5 ); \ - T1 = _mm256_xor_si256( T1, E ); \ - T2 = _mm256_xor_si256( T2, A ); \ - T1 = mm256_ror_64( T1, 14 ); \ - T2 = mm256_ror_64( T2, 28 ); \ - T1 = _mm256_add_epi64( T1, T3 ); \ - T2 = _mm256_add_epi64( T2, T4 ); \ - T1 = _mm256_add_epi64( T1, K ); \ - H = _mm256_add_epi64( T1, T2 ); \ - D = _mm256_add_epi64( D, T1 ); \ -} while (0) - -/* -#define SHA3_4WAY_STEP(A, B, C, D, E, F, G, H, i) \ -do { \ - __m256i K = _mm256_add_epi64( W[i], _mm256_set1_epi64x( K512[ i ] ) ); \ - __m256i T1 = BSG5_1(E); \ - __m256i T2 = BSG5_0(A); \ - T1 = mm256_add4_64( T1, H, CH(E, F, G), K ); \ - T2 = _mm256_add_epi64( T2, MAJ(A, B, C) ); \ - D = _mm256_add_epi64( D, T1 ); \ - H = _mm256_add_epi64( T1, T2 ); \ -} while (0) -*/ - -/* -#define SHA3_4WAY_STEP(A, B, C, D, E, F, G, H, i) \ +#define SHA3_4WAY_STEP( A, B, C, D, E, F, G, H, i ) \ do { \ - __m256i T1, T2; \ - __m256i K = _mm256_set1_epi64x( K512[ i ] ); \ - T1 = _mm256_add_epi64( H, mm256_add4_64( BSG5_1(E), CH(E, F, G), \ - K, W[i] ) ); \ - T2 = _mm256_add_epi64( BSG5_0(A), MAJ(A, B, C) ); \ - D = _mm256_add_epi64( D, T1 ); \ + __m256i T0 = _mm256_add_epi64( _mm256_set1_epi64x( K512[i] ), W[ i ] ); \ + __m256i T1 = BSG5_1( E ); \ + __m256i T2 = BSG5_0( A ); \ + T0 = _mm256_add_epi64( T0, CH( E, F, G ) ); \ + T1 = _mm256_add_epi64( T1, H ); \ + T2 = _mm256_add_epi64( T2, MAJ( A, B, C ) ); \ + T1 = _mm256_add_epi64( T1, T0 ); \ + Y_xor_Z = X_xor_Y; \ + D = _mm256_add_epi64( D, T1 ); \ H = _mm256_add_epi64( T1, T2 ); \ } while (0) -*/ static void sha512_4way_round( sha512_4way_context *ctx, __m256i *in, __m256i r[8] ) { int i; - register __m256i A, B, C, D, E, F, G, H; + register __m256i A, B, C, D, E, F, G, H, X_xor_Y, Y_xor_Z; __m256i W[80]; mm256_block_bswap_64( W , in ); @@ -495,6 +364,8 @@ sha512_4way_round( sha512_4way_context *ctx, __m256i *in, __m256i r[8] ) H = m256_const1_64( 0x5BE0CD19137E2179 ); } + Y_xor_Z = _mm256_xor_si256( B, C ); + for ( i = 0; i < 80; i += 8 ) { SHA3_4WAY_STEP( A, B, C, D, E, F, G, H, i + 0 ); diff --git a/algo/sha/sph_sha2.c b/algo/sha/sph_sha2.c index e87936dd..cab78589 100644 --- a/algo/sha/sph_sha2.c +++ b/algo/sha/sph_sha2.c @@ -40,8 +40,8 @@ #endif #define CH(X, Y, Z) ((((Y) ^ (Z)) & (X)) ^ (Z)) -#define MAJ(X, Y, Z) (((Y) & (Z)) | (((Y) | (Z)) & (X))) - +//#define MAJ(X, Y, Z) (((Y) & (Z)) | (((Y) | (Z)) & (X))) +#define MAJ( X, Y, Z ) ( Y ^ ( ( X_xor_Y = X ^ Y ) & ( Y_xor_Z ) ) ) #define ROTR SPH_ROTR32 #define BSG2_0(x) (ROTR(x, 
2) ^ ROTR(x, 13) ^ ROTR(x, 22)) @@ -71,12 +71,8 @@ static const sph_u32 H256[8] = { * of the compression function. */ -#if defined(__SHA__) - -#include "sha256-hash-opt.c" - -#else // no SHA +/* static const sph_u32 K[64] = { SPH_C32(0x428A2F98), SPH_C32(0x71374491), SPH_C32(0xB5C0FBCF), SPH_C32(0xE9B5DBA5), @@ -111,6 +107,7 @@ static const sph_u32 K[64] = { SPH_C32(0x90BEFFFA), SPH_C32(0xA4506CEB), SPH_C32(0xBEF9A3F7), SPH_C32(0xC67178F2) }; +*/ #if SPH_SMALL_FOOTPRINT_SHA2 @@ -130,6 +127,7 @@ static const sph_u32 K[64] = { t1 = SPH_T32(h + BSG2_1(e) + CH(e, f, g) \ + K[pcount + (pc)] + W[(pc) & 0x0F]); \ t2 = SPH_T32(BSG2_0(a) + MAJ(a, b, c)); \ + Y_xor_Z = X_xor_Y; \ d = SPH_T32(d + t1); \ h = SPH_T32(t1 + t2); \ } while (0) @@ -140,7 +138,7 @@ static const sph_u32 K[64] = { SHA2_STEPn(2, a, b, c, d, e, f, g, h, in, pc) #define SHA2_ROUND_BODY(in, r) do { \ - sph_u32 A, B, C, D, E, F, G, H; \ + sph_u32 A, B, C, D, E, F, G, H, X_xor_Y, Y_xor_Z; \ sph_u32 W[16]; \ unsigned pcount; \ \ @@ -153,6 +151,7 @@ static const sph_u32 K[64] = { G = (r)[6]; \ H = (r)[7]; \ pcount = 0; \ + Y_xor_Z = B ^ C; \ SHA2_STEP1(A, B, C, D, E, F, G, H, in, 0); \ SHA2_STEP1(H, A, B, C, D, E, F, G, in, 1); \ SHA2_STEP1(G, H, A, B, C, D, E, F, in, 2); \ @@ -200,7 +199,7 @@ static const sph_u32 K[64] = { #else // large footprint (default) #define SHA2_ROUND_BODY(in, r) do { \ - sph_u32 A, B, C, D, E, F, G, H, T1, T2; \ + sph_u32 A, B, C, D, E, F, G, H, T1, T2, X_xor_Y, Y_xor_Z;; \ sph_u32 W00, W01, W02, W03, W04, W05, W06, W07; \ sph_u32 W08, W09, W10, W11, W12, W13, W14, W15; \ \ @@ -212,388 +211,453 @@ static const sph_u32 K[64] = { F = (r)[5]; \ G = (r)[6]; \ H = (r)[7]; \ + Y_xor_Z = B ^ C; \ W00 = in(0); \ T1 = SPH_T32(H + BSG2_1(E) + CH(E, F, G) \ + SPH_C32(0x428A2F98) + W00); \ T2 = SPH_T32(BSG2_0(A) + MAJ(A, B, C)); \ + Y_xor_Z = X_xor_Y; \ D = SPH_T32(D + T1); \ H = SPH_T32(T1 + T2); \ W01 = in(1); \ T1 = SPH_T32(G + BSG2_1(D) + CH(D, E, F) \ + SPH_C32(0x71374491) + W01); \ T2 = SPH_T32(BSG2_0(H) + MAJ(H, A, B)); \ + Y_xor_Z = X_xor_Y; \ C = SPH_T32(C + T1); \ G = SPH_T32(T1 + T2); \ W02 = in(2); \ T1 = SPH_T32(F + BSG2_1(C) + CH(C, D, E) \ + SPH_C32(0xB5C0FBCF) + W02); \ T2 = SPH_T32(BSG2_0(G) + MAJ(G, H, A)); \ + Y_xor_Z = X_xor_Y; \ B = SPH_T32(B + T1); \ F = SPH_T32(T1 + T2); \ W03 = in(3); \ T1 = SPH_T32(E + BSG2_1(B) + CH(B, C, D) \ + SPH_C32(0xE9B5DBA5) + W03); \ T2 = SPH_T32(BSG2_0(F) + MAJ(F, G, H)); \ + Y_xor_Z = X_xor_Y; \ A = SPH_T32(A + T1); \ E = SPH_T32(T1 + T2); \ W04 = in(4); \ T1 = SPH_T32(D + BSG2_1(A) + CH(A, B, C) \ + SPH_C32(0x3956C25B) + W04); \ T2 = SPH_T32(BSG2_0(E) + MAJ(E, F, G)); \ + Y_xor_Z = X_xor_Y; \ H = SPH_T32(H + T1); \ D = SPH_T32(T1 + T2); \ W05 = in(5); \ T1 = SPH_T32(C + BSG2_1(H) + CH(H, A, B) \ + SPH_C32(0x59F111F1) + W05); \ T2 = SPH_T32(BSG2_0(D) + MAJ(D, E, F)); \ + Y_xor_Z = X_xor_Y; \ G = SPH_T32(G + T1); \ C = SPH_T32(T1 + T2); \ W06 = in(6); \ T1 = SPH_T32(B + BSG2_1(G) + CH(G, H, A) \ + SPH_C32(0x923F82A4) + W06); \ T2 = SPH_T32(BSG2_0(C) + MAJ(C, D, E)); \ + Y_xor_Z = X_xor_Y; \ F = SPH_T32(F + T1); \ B = SPH_T32(T1 + T2); \ W07 = in(7); \ T1 = SPH_T32(A + BSG2_1(F) + CH(F, G, H) \ + SPH_C32(0xAB1C5ED5) + W07); \ T2 = SPH_T32(BSG2_0(B) + MAJ(B, C, D)); \ + Y_xor_Z = X_xor_Y; \ E = SPH_T32(E + T1); \ A = SPH_T32(T1 + T2); \ W08 = in(8); \ T1 = SPH_T32(H + BSG2_1(E) + CH(E, F, G) \ + SPH_C32(0xD807AA98) + W08); \ T2 = SPH_T32(BSG2_0(A) + MAJ(A, B, C)); \ + Y_xor_Z = X_xor_Y; \ D = SPH_T32(D + T1); \ H = SPH_T32(T1 + T2); \ W09 = in(9); \ T1 = SPH_T32(G + 
BSG2_1(D) + CH(D, E, F) \ + SPH_C32(0x12835B01) + W09); \ T2 = SPH_T32(BSG2_0(H) + MAJ(H, A, B)); \ + Y_xor_Z = X_xor_Y; \ C = SPH_T32(C + T1); \ G = SPH_T32(T1 + T2); \ W10 = in(10); \ T1 = SPH_T32(F + BSG2_1(C) + CH(C, D, E) \ + SPH_C32(0x243185BE) + W10); \ T2 = SPH_T32(BSG2_0(G) + MAJ(G, H, A)); \ + Y_xor_Z = X_xor_Y; \ B = SPH_T32(B + T1); \ F = SPH_T32(T1 + T2); \ W11 = in(11); \ T1 = SPH_T32(E + BSG2_1(B) + CH(B, C, D) \ + SPH_C32(0x550C7DC3) + W11); \ T2 = SPH_T32(BSG2_0(F) + MAJ(F, G, H)); \ + Y_xor_Z = X_xor_Y; \ A = SPH_T32(A + T1); \ E = SPH_T32(T1 + T2); \ W12 = in(12); \ T1 = SPH_T32(D + BSG2_1(A) + CH(A, B, C) \ + SPH_C32(0x72BE5D74) + W12); \ T2 = SPH_T32(BSG2_0(E) + MAJ(E, F, G)); \ + Y_xor_Z = X_xor_Y; \ H = SPH_T32(H + T1); \ D = SPH_T32(T1 + T2); \ W13 = in(13); \ T1 = SPH_T32(C + BSG2_1(H) + CH(H, A, B) \ + SPH_C32(0x80DEB1FE) + W13); \ T2 = SPH_T32(BSG2_0(D) + MAJ(D, E, F)); \ + Y_xor_Z = X_xor_Y; \ G = SPH_T32(G + T1); \ C = SPH_T32(T1 + T2); \ W14 = in(14); \ T1 = SPH_T32(B + BSG2_1(G) + CH(G, H, A) \ + SPH_C32(0x9BDC06A7) + W14); \ T2 = SPH_T32(BSG2_0(C) + MAJ(C, D, E)); \ + Y_xor_Z = X_xor_Y; \ F = SPH_T32(F + T1); \ B = SPH_T32(T1 + T2); \ W15 = in(15); \ T1 = SPH_T32(A + BSG2_1(F) + CH(F, G, H) \ + SPH_C32(0xC19BF174) + W15); \ T2 = SPH_T32(BSG2_0(B) + MAJ(B, C, D)); \ + Y_xor_Z = X_xor_Y; \ E = SPH_T32(E + T1); \ A = SPH_T32(T1 + T2); \ W00 = SPH_T32(SSG2_1(W14) + W09 + SSG2_0(W01) + W00); \ T1 = SPH_T32(H + BSG2_1(E) + CH(E, F, G) \ + SPH_C32(0xE49B69C1) + W00); \ T2 = SPH_T32(BSG2_0(A) + MAJ(A, B, C)); \ + Y_xor_Z = X_xor_Y; \ D = SPH_T32(D + T1); \ H = SPH_T32(T1 + T2); \ W01 = SPH_T32(SSG2_1(W15) + W10 + SSG2_0(W02) + W01); \ T1 = SPH_T32(G + BSG2_1(D) + CH(D, E, F) \ + SPH_C32(0xEFBE4786) + W01); \ T2 = SPH_T32(BSG2_0(H) + MAJ(H, A, B)); \ + Y_xor_Z = X_xor_Y; \ C = SPH_T32(C + T1); \ G = SPH_T32(T1 + T2); \ W02 = SPH_T32(SSG2_1(W00) + W11 + SSG2_0(W03) + W02); \ T1 = SPH_T32(F + BSG2_1(C) + CH(C, D, E) \ + SPH_C32(0x0FC19DC6) + W02); \ T2 = SPH_T32(BSG2_0(G) + MAJ(G, H, A)); \ + Y_xor_Z = X_xor_Y; \ B = SPH_T32(B + T1); \ F = SPH_T32(T1 + T2); \ W03 = SPH_T32(SSG2_1(W01) + W12 + SSG2_0(W04) + W03); \ T1 = SPH_T32(E + BSG2_1(B) + CH(B, C, D) \ + SPH_C32(0x240CA1CC) + W03); \ T2 = SPH_T32(BSG2_0(F) + MAJ(F, G, H)); \ + Y_xor_Z = X_xor_Y; \ A = SPH_T32(A + T1); \ E = SPH_T32(T1 + T2); \ W04 = SPH_T32(SSG2_1(W02) + W13 + SSG2_0(W05) + W04); \ T1 = SPH_T32(D + BSG2_1(A) + CH(A, B, C) \ + SPH_C32(0x2DE92C6F) + W04); \ T2 = SPH_T32(BSG2_0(E) + MAJ(E, F, G)); \ + Y_xor_Z = X_xor_Y; \ H = SPH_T32(H + T1); \ D = SPH_T32(T1 + T2); \ W05 = SPH_T32(SSG2_1(W03) + W14 + SSG2_0(W06) + W05); \ T1 = SPH_T32(C + BSG2_1(H) + CH(H, A, B) \ + SPH_C32(0x4A7484AA) + W05); \ T2 = SPH_T32(BSG2_0(D) + MAJ(D, E, F)); \ + Y_xor_Z = X_xor_Y; \ G = SPH_T32(G + T1); \ C = SPH_T32(T1 + T2); \ W06 = SPH_T32(SSG2_1(W04) + W15 + SSG2_0(W07) + W06); \ T1 = SPH_T32(B + BSG2_1(G) + CH(G, H, A) \ + SPH_C32(0x5CB0A9DC) + W06); \ T2 = SPH_T32(BSG2_0(C) + MAJ(C, D, E)); \ + Y_xor_Z = X_xor_Y; \ F = SPH_T32(F + T1); \ B = SPH_T32(T1 + T2); \ W07 = SPH_T32(SSG2_1(W05) + W00 + SSG2_0(W08) + W07); \ T1 = SPH_T32(A + BSG2_1(F) + CH(F, G, H) \ + SPH_C32(0x76F988DA) + W07); \ T2 = SPH_T32(BSG2_0(B) + MAJ(B, C, D)); \ + Y_xor_Z = X_xor_Y; \ E = SPH_T32(E + T1); \ A = SPH_T32(T1 + T2); \ W08 = SPH_T32(SSG2_1(W06) + W01 + SSG2_0(W09) + W08); \ T1 = SPH_T32(H + BSG2_1(E) + CH(E, F, G) \ + SPH_C32(0x983E5152) + W08); \ T2 = SPH_T32(BSG2_0(A) + MAJ(A, B, C)); \ + Y_xor_Z = X_xor_Y; \ D = SPH_T32(D + T1); \ H = 
SPH_T32(T1 + T2); \ W09 = SPH_T32(SSG2_1(W07) + W02 + SSG2_0(W10) + W09); \ T1 = SPH_T32(G + BSG2_1(D) + CH(D, E, F) \ + SPH_C32(0xA831C66D) + W09); \ T2 = SPH_T32(BSG2_0(H) + MAJ(H, A, B)); \ + Y_xor_Z = X_xor_Y; \ C = SPH_T32(C + T1); \ G = SPH_T32(T1 + T2); \ W10 = SPH_T32(SSG2_1(W08) + W03 + SSG2_0(W11) + W10); \ T1 = SPH_T32(F + BSG2_1(C) + CH(C, D, E) \ + SPH_C32(0xB00327C8) + W10); \ T2 = SPH_T32(BSG2_0(G) + MAJ(G, H, A)); \ + Y_xor_Z = X_xor_Y; \ B = SPH_T32(B + T1); \ F = SPH_T32(T1 + T2); \ W11 = SPH_T32(SSG2_1(W09) + W04 + SSG2_0(W12) + W11); \ T1 = SPH_T32(E + BSG2_1(B) + CH(B, C, D) \ + SPH_C32(0xBF597FC7) + W11); \ T2 = SPH_T32(BSG2_0(F) + MAJ(F, G, H)); \ + Y_xor_Z = X_xor_Y; \ A = SPH_T32(A + T1); \ E = SPH_T32(T1 + T2); \ W12 = SPH_T32(SSG2_1(W10) + W05 + SSG2_0(W13) + W12); \ T1 = SPH_T32(D + BSG2_1(A) + CH(A, B, C) \ + SPH_C32(0xC6E00BF3) + W12); \ T2 = SPH_T32(BSG2_0(E) + MAJ(E, F, G)); \ + Y_xor_Z = X_xor_Y; \ H = SPH_T32(H + T1); \ D = SPH_T32(T1 + T2); \ W13 = SPH_T32(SSG2_1(W11) + W06 + SSG2_0(W14) + W13); \ T1 = SPH_T32(C + BSG2_1(H) + CH(H, A, B) \ + SPH_C32(0xD5A79147) + W13); \ T2 = SPH_T32(BSG2_0(D) + MAJ(D, E, F)); \ + Y_xor_Z = X_xor_Y; \ G = SPH_T32(G + T1); \ C = SPH_T32(T1 + T2); \ W14 = SPH_T32(SSG2_1(W12) + W07 + SSG2_0(W15) + W14); \ T1 = SPH_T32(B + BSG2_1(G) + CH(G, H, A) \ + SPH_C32(0x06CA6351) + W14); \ T2 = SPH_T32(BSG2_0(C) + MAJ(C, D, E)); \ + Y_xor_Z = X_xor_Y; \ F = SPH_T32(F + T1); \ B = SPH_T32(T1 + T2); \ W15 = SPH_T32(SSG2_1(W13) + W08 + SSG2_0(W00) + W15); \ T1 = SPH_T32(A + BSG2_1(F) + CH(F, G, H) \ + SPH_C32(0x14292967) + W15); \ T2 = SPH_T32(BSG2_0(B) + MAJ(B, C, D)); \ + Y_xor_Z = X_xor_Y; \ E = SPH_T32(E + T1); \ A = SPH_T32(T1 + T2); \ W00 = SPH_T32(SSG2_1(W14) + W09 + SSG2_0(W01) + W00); \ T1 = SPH_T32(H + BSG2_1(E) + CH(E, F, G) \ + SPH_C32(0x27B70A85) + W00); \ T2 = SPH_T32(BSG2_0(A) + MAJ(A, B, C)); \ + Y_xor_Z = X_xor_Y; \ D = SPH_T32(D + T1); \ H = SPH_T32(T1 + T2); \ W01 = SPH_T32(SSG2_1(W15) + W10 + SSG2_0(W02) + W01); \ T1 = SPH_T32(G + BSG2_1(D) + CH(D, E, F) \ + SPH_C32(0x2E1B2138) + W01); \ T2 = SPH_T32(BSG2_0(H) + MAJ(H, A, B)); \ + Y_xor_Z = X_xor_Y; \ C = SPH_T32(C + T1); \ G = SPH_T32(T1 + T2); \ W02 = SPH_T32(SSG2_1(W00) + W11 + SSG2_0(W03) + W02); \ T1 = SPH_T32(F + BSG2_1(C) + CH(C, D, E) \ + SPH_C32(0x4D2C6DFC) + W02); \ T2 = SPH_T32(BSG2_0(G) + MAJ(G, H, A)); \ + Y_xor_Z = X_xor_Y; \ B = SPH_T32(B + T1); \ F = SPH_T32(T1 + T2); \ W03 = SPH_T32(SSG2_1(W01) + W12 + SSG2_0(W04) + W03); \ T1 = SPH_T32(E + BSG2_1(B) + CH(B, C, D) \ + SPH_C32(0x53380D13) + W03); \ T2 = SPH_T32(BSG2_0(F) + MAJ(F, G, H)); \ + Y_xor_Z = X_xor_Y; \ A = SPH_T32(A + T1); \ E = SPH_T32(T1 + T2); \ W04 = SPH_T32(SSG2_1(W02) + W13 + SSG2_0(W05) + W04); \ T1 = SPH_T32(D + BSG2_1(A) + CH(A, B, C) \ + SPH_C32(0x650A7354) + W04); \ T2 = SPH_T32(BSG2_0(E) + MAJ(E, F, G)); \ + Y_xor_Z = X_xor_Y; \ H = SPH_T32(H + T1); \ D = SPH_T32(T1 + T2); \ W05 = SPH_T32(SSG2_1(W03) + W14 + SSG2_0(W06) + W05); \ T1 = SPH_T32(C + BSG2_1(H) + CH(H, A, B) \ + SPH_C32(0x766A0ABB) + W05); \ T2 = SPH_T32(BSG2_0(D) + MAJ(D, E, F)); \ + Y_xor_Z = X_xor_Y; \ G = SPH_T32(G + T1); \ C = SPH_T32(T1 + T2); \ W06 = SPH_T32(SSG2_1(W04) + W15 + SSG2_0(W07) + W06); \ T1 = SPH_T32(B + BSG2_1(G) + CH(G, H, A) \ + SPH_C32(0x81C2C92E) + W06); \ T2 = SPH_T32(BSG2_0(C) + MAJ(C, D, E)); \ + Y_xor_Z = X_xor_Y; \ F = SPH_T32(F + T1); \ B = SPH_T32(T1 + T2); \ W07 = SPH_T32(SSG2_1(W05) + W00 + SSG2_0(W08) + W07); \ T1 = SPH_T32(A + BSG2_1(F) + CH(F, G, H) \ + SPH_C32(0x92722C85) + W07); \ 
T2 = SPH_T32(BSG2_0(B) + MAJ(B, C, D)); \ + Y_xor_Z = X_xor_Y; \ E = SPH_T32(E + T1); \ A = SPH_T32(T1 + T2); \ W08 = SPH_T32(SSG2_1(W06) + W01 + SSG2_0(W09) + W08); \ T1 = SPH_T32(H + BSG2_1(E) + CH(E, F, G) \ + SPH_C32(0xA2BFE8A1) + W08); \ T2 = SPH_T32(BSG2_0(A) + MAJ(A, B, C)); \ + Y_xor_Z = X_xor_Y; \ D = SPH_T32(D + T1); \ H = SPH_T32(T1 + T2); \ W09 = SPH_T32(SSG2_1(W07) + W02 + SSG2_0(W10) + W09); \ T1 = SPH_T32(G + BSG2_1(D) + CH(D, E, F) \ + SPH_C32(0xA81A664B) + W09); \ T2 = SPH_T32(BSG2_0(H) + MAJ(H, A, B)); \ + Y_xor_Z = X_xor_Y; \ C = SPH_T32(C + T1); \ G = SPH_T32(T1 + T2); \ W10 = SPH_T32(SSG2_1(W08) + W03 + SSG2_0(W11) + W10); \ T1 = SPH_T32(F + BSG2_1(C) + CH(C, D, E) \ + SPH_C32(0xC24B8B70) + W10); \ T2 = SPH_T32(BSG2_0(G) + MAJ(G, H, A)); \ + Y_xor_Z = X_xor_Y; \ B = SPH_T32(B + T1); \ F = SPH_T32(T1 + T2); \ W11 = SPH_T32(SSG2_1(W09) + W04 + SSG2_0(W12) + W11); \ T1 = SPH_T32(E + BSG2_1(B) + CH(B, C, D) \ + SPH_C32(0xC76C51A3) + W11); \ T2 = SPH_T32(BSG2_0(F) + MAJ(F, G, H)); \ + Y_xor_Z = X_xor_Y; \ A = SPH_T32(A + T1); \ E = SPH_T32(T1 + T2); \ W12 = SPH_T32(SSG2_1(W10) + W05 + SSG2_0(W13) + W12); \ T1 = SPH_T32(D + BSG2_1(A) + CH(A, B, C) \ + SPH_C32(0xD192E819) + W12); \ T2 = SPH_T32(BSG2_0(E) + MAJ(E, F, G)); \ + Y_xor_Z = X_xor_Y; \ H = SPH_T32(H + T1); \ D = SPH_T32(T1 + T2); \ W13 = SPH_T32(SSG2_1(W11) + W06 + SSG2_0(W14) + W13); \ T1 = SPH_T32(C + BSG2_1(H) + CH(H, A, B) \ + SPH_C32(0xD6990624) + W13); \ T2 = SPH_T32(BSG2_0(D) + MAJ(D, E, F)); \ + Y_xor_Z = X_xor_Y; \ G = SPH_T32(G + T1); \ C = SPH_T32(T1 + T2); \ W14 = SPH_T32(SSG2_1(W12) + W07 + SSG2_0(W15) + W14); \ T1 = SPH_T32(B + BSG2_1(G) + CH(G, H, A) \ + SPH_C32(0xF40E3585) + W14); \ T2 = SPH_T32(BSG2_0(C) + MAJ(C, D, E)); \ + Y_xor_Z = X_xor_Y; \ F = SPH_T32(F + T1); \ B = SPH_T32(T1 + T2); \ W15 = SPH_T32(SSG2_1(W13) + W08 + SSG2_0(W00) + W15); \ T1 = SPH_T32(A + BSG2_1(F) + CH(F, G, H) \ + SPH_C32(0x106AA070) + W15); \ T2 = SPH_T32(BSG2_0(B) + MAJ(B, C, D)); \ + Y_xor_Z = X_xor_Y; \ E = SPH_T32(E + T1); \ A = SPH_T32(T1 + T2); \ W00 = SPH_T32(SSG2_1(W14) + W09 + SSG2_0(W01) + W00); \ T1 = SPH_T32(H + BSG2_1(E) + CH(E, F, G) \ + SPH_C32(0x19A4C116) + W00); \ T2 = SPH_T32(BSG2_0(A) + MAJ(A, B, C)); \ + Y_xor_Z = X_xor_Y; \ D = SPH_T32(D + T1); \ H = SPH_T32(T1 + T2); \ W01 = SPH_T32(SSG2_1(W15) + W10 + SSG2_0(W02) + W01); \ T1 = SPH_T32(G + BSG2_1(D) + CH(D, E, F) \ + SPH_C32(0x1E376C08) + W01); \ T2 = SPH_T32(BSG2_0(H) + MAJ(H, A, B)); \ + Y_xor_Z = X_xor_Y; \ C = SPH_T32(C + T1); \ G = SPH_T32(T1 + T2); \ W02 = SPH_T32(SSG2_1(W00) + W11 + SSG2_0(W03) + W02); \ T1 = SPH_T32(F + BSG2_1(C) + CH(C, D, E) \ + SPH_C32(0x2748774C) + W02); \ T2 = SPH_T32(BSG2_0(G) + MAJ(G, H, A)); \ + Y_xor_Z = X_xor_Y; \ B = SPH_T32(B + T1); \ F = SPH_T32(T1 + T2); \ W03 = SPH_T32(SSG2_1(W01) + W12 + SSG2_0(W04) + W03); \ T1 = SPH_T32(E + BSG2_1(B) + CH(B, C, D) \ + SPH_C32(0x34B0BCB5) + W03); \ T2 = SPH_T32(BSG2_0(F) + MAJ(F, G, H)); \ + Y_xor_Z = X_xor_Y; \ A = SPH_T32(A + T1); \ E = SPH_T32(T1 + T2); \ W04 = SPH_T32(SSG2_1(W02) + W13 + SSG2_0(W05) + W04); \ T1 = SPH_T32(D + BSG2_1(A) + CH(A, B, C) \ + SPH_C32(0x391C0CB3) + W04); \ T2 = SPH_T32(BSG2_0(E) + MAJ(E, F, G)); \ + Y_xor_Z = X_xor_Y; \ H = SPH_T32(H + T1); \ D = SPH_T32(T1 + T2); \ W05 = SPH_T32(SSG2_1(W03) + W14 + SSG2_0(W06) + W05); \ T1 = SPH_T32(C + BSG2_1(H) + CH(H, A, B) \ + SPH_C32(0x4ED8AA4A) + W05); \ T2 = SPH_T32(BSG2_0(D) + MAJ(D, E, F)); \ + Y_xor_Z = X_xor_Y; \ G = SPH_T32(G + T1); \ C = SPH_T32(T1 + T2); \ W06 = SPH_T32(SSG2_1(W04) + W15 + 
SSG2_0(W07) + W06); \ T1 = SPH_T32(B + BSG2_1(G) + CH(G, H, A) \ + SPH_C32(0x5B9CCA4F) + W06); \ T2 = SPH_T32(BSG2_0(C) + MAJ(C, D, E)); \ + Y_xor_Z = X_xor_Y; \ F = SPH_T32(F + T1); \ B = SPH_T32(T1 + T2); \ W07 = SPH_T32(SSG2_1(W05) + W00 + SSG2_0(W08) + W07); \ T1 = SPH_T32(A + BSG2_1(F) + CH(F, G, H) \ + SPH_C32(0x682E6FF3) + W07); \ T2 = SPH_T32(BSG2_0(B) + MAJ(B, C, D)); \ + Y_xor_Z = X_xor_Y; \ E = SPH_T32(E + T1); \ A = SPH_T32(T1 + T2); \ W08 = SPH_T32(SSG2_1(W06) + W01 + SSG2_0(W09) + W08); \ T1 = SPH_T32(H + BSG2_1(E) + CH(E, F, G) \ + SPH_C32(0x748F82EE) + W08); \ T2 = SPH_T32(BSG2_0(A) + MAJ(A, B, C)); \ + Y_xor_Z = X_xor_Y; \ D = SPH_T32(D + T1); \ H = SPH_T32(T1 + T2); \ W09 = SPH_T32(SSG2_1(W07) + W02 + SSG2_0(W10) + W09); \ T1 = SPH_T32(G + BSG2_1(D) + CH(D, E, F) \ + SPH_C32(0x78A5636F) + W09); \ T2 = SPH_T32(BSG2_0(H) + MAJ(H, A, B)); \ + Y_xor_Z = X_xor_Y; \ C = SPH_T32(C + T1); \ G = SPH_T32(T1 + T2); \ W10 = SPH_T32(SSG2_1(W08) + W03 + SSG2_0(W11) + W10); \ T1 = SPH_T32(F + BSG2_1(C) + CH(C, D, E) \ + SPH_C32(0x84C87814) + W10); \ T2 = SPH_T32(BSG2_0(G) + MAJ(G, H, A)); \ + Y_xor_Z = X_xor_Y; \ B = SPH_T32(B + T1); \ F = SPH_T32(T1 + T2); \ W11 = SPH_T32(SSG2_1(W09) + W04 + SSG2_0(W12) + W11); \ T1 = SPH_T32(E + BSG2_1(B) + CH(B, C, D) \ + SPH_C32(0x8CC70208) + W11); \ T2 = SPH_T32(BSG2_0(F) + MAJ(F, G, H)); \ + Y_xor_Z = X_xor_Y; \ A = SPH_T32(A + T1); \ E = SPH_T32(T1 + T2); \ W12 = SPH_T32(SSG2_1(W10) + W05 + SSG2_0(W13) + W12); \ T1 = SPH_T32(D + BSG2_1(A) + CH(A, B, C) \ + SPH_C32(0x90BEFFFA) + W12); \ T2 = SPH_T32(BSG2_0(E) + MAJ(E, F, G)); \ + Y_xor_Z = X_xor_Y; \ H = SPH_T32(H + T1); \ D = SPH_T32(T1 + T2); \ W13 = SPH_T32(SSG2_1(W11) + W06 + SSG2_0(W14) + W13); \ T1 = SPH_T32(C + BSG2_1(H) + CH(H, A, B) \ + SPH_C32(0xA4506CEB) + W13); \ T2 = SPH_T32(BSG2_0(D) + MAJ(D, E, F)); \ + Y_xor_Z = X_xor_Y; \ G = SPH_T32(G + T1); \ C = SPH_T32(T1 + T2); \ W14 = SPH_T32(SSG2_1(W12) + W07 + SSG2_0(W15) + W14); \ T1 = SPH_T32(B + BSG2_1(G) + CH(G, H, A) \ + SPH_C32(0xBEF9A3F7) + W14); \ T2 = SPH_T32(BSG2_0(C) + MAJ(C, D, E)); \ + Y_xor_Z = X_xor_Y; \ F = SPH_T32(F + T1); \ B = SPH_T32(T1 + T2); \ W15 = SPH_T32(SSG2_1(W13) + W08 + SSG2_0(W00) + W15); \ T1 = SPH_T32(A + BSG2_1(F) + CH(F, G, H) \ + SPH_C32(0xC67178F2) + W15); \ T2 = SPH_T32(BSG2_0(B) + MAJ(B, C, D)); \ + Y_xor_Z = X_xor_Y; \ E = SPH_T32(E + T1); \ A = SPH_T32(T1 + T2); \ (r)[0] = SPH_T32((r)[0] + A); \ @@ -619,8 +683,54 @@ sha2_round(const unsigned char *data, sph_u32 r[8]) #undef SHA2_IN } -#endif // SHA else +void sph_sha256_transform_le( uint32_t *state_out, const uint32_t *data, + const uint32_t *state_in ) +{ +memcpy( state_out, state_in, 32 ); +#define SHA2_IN(x) (data[x]) + SHA2_ROUND_BODY( SHA2_IN, state_out ); +#undef SHA2_IN +} + +void sph_sha256_transform_be( uint32_t *state_out, const uint32_t *data, + const uint32_t *state_in ) +{ +memcpy( state_out, state_in, 32 ); +#define SHA2_IN(x) sph_dec32be_aligned( data+(x) ) + SHA2_ROUND_BODY( SHA2_IN, state_out ); +#undef SHA2_IN +} + +void sph_sha256_prehash_3rounds( uint32_t *state_out, const uint32_t *data, + const uint32_t *state_in ) +{ + uint32_t t1, t2, X_xor_Y, Y_xor_Z = state_in[1] ^ state_in[2]; + memcpy( state_out, state_in, 32 ); + + t1 = state_out[7] + BSG2_1( state_out[4] ) + + CH( state_out[4], state_out[5], state_out[6] ) + 0x428A2F98 + data[0]; + t2 = BSG2_0( state_out[0] ) + + MAJ( state_out[0], state_out[1], state_out[2] ); + Y_xor_Z = X_xor_Y; + state_out[3] += t1; + state_out[7] = t1 + t2; + + t1 = state_out[6] + BSG2_1( 
state_out[3] ) + + CH( state_out[3], state_out[4], state_out[5] ) + 0x71374491 + data[1]; + t2 = BSG2_0( state_out[7] ) + + MAJ( state_out[7], state_out[0], state_out[1] ); + Y_xor_Z = X_xor_Y; + state_out[2] += t1; + state_out[6] = t1 + t2; + + t1 = state_out[5] + BSG2_1( state_out[2] ) + + CH( state_out[2], state_out[3], state_out[4] ) + 0xB5C0FBCF + data[2]; + t2 = BSG2_0( state_out[6] ) + + MAJ( state_out[6], state_out[7], state_out[0] ); + state_out[1] += t1; + state_out[5] = t1 + t2; +} /* see sph_sha2.h */ void @@ -689,6 +799,14 @@ sph_sha256_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) // sph_sha256_init(cc); } +void sph_sha256_full( void *dst, const void *data, size_t len ) +{ + sph_sha256_context cc; + sph_sha256_init( &cc ); + sph_sha256( &cc, data, len ); + sph_sha256_close( &cc, dst ); +} + /* see sph_sha2.h */ //void //sph_sha224_comp(const sph_u32 msg[16], sph_u32 val[8]) diff --git a/algo/sha/sph_sha2.h b/algo/sha/sph_sha2.h index df0e8369..ab05423e 100644 --- a/algo/sha/sph_sha2.h +++ b/algo/sha/sph_sha2.h @@ -205,6 +205,20 @@ void sph_sha256_comp(const sph_u32 msg[16], sph_u32 val[8]); #define sph_sha256_comp sph_sha224_comp #endif +void sph_sha256_full( void *dst, const void *data, size_t len ); + +// These shouldn't be called directly, use sha256-hash.h generic functions +// sha256_transform_le & sha256_transform_be instead. +void sph_sha256_transform_le( uint32_t *state_out, const uint32_t *data, + const uint32_t *state_in ); + +void sph_sha256_transform_be( uint32_t *state_out, const uint32_t *data, + const uint32_t *state_in ); + +void sph_sha256_prehash_3rounds( uint32_t *state_out, const uint32_t *data, + const uint32_t *state_in ); + + #if SPH_64 /** diff --git a/algo/sha/sph_sha2big.c b/algo/sha/sph_sha2big.c index 8ea292f6..06d2d16e 100644 --- a/algo/sha/sph_sha2big.c +++ b/algo/sha/sph_sha2big.c @@ -38,7 +38,8 @@ #if SPH_64 #define CH(X, Y, Z) ((((Y) ^ (Z)) & (X)) ^ (Z)) -#define MAJ(X, Y, Z) (((X) & (Y)) | (((X) | (Y)) & (Z))) +//#define MAJ(X, Y, Z) (((X) & (Y)) | (((X) | (Y)) & (Z))) +#define MAJ( X, Y, Z ) ( Y ^ ( ( X ^ Y ) & ( Y ^ Z ) ) ) #define ROTR64 SPH_ROTR64 diff --git a/algo/shabal/shabal-hash-4way.c b/algo/shabal/shabal-hash-4way.c index dffa18d1..06116ff7 100644 --- a/algo/shabal/shabal-hash-4way.c +++ b/algo/shabal/shabal-hash-4way.c @@ -62,32 +62,34 @@ extern "C"{ #if defined(__AVX2__) #define DECL_STATE8 \ - __m256i A00, A01, A02, A03, A04, A05, A06, A07, \ - A08, A09, A0A, A0B; \ + __m256i A0, A1, A2, A3, A4, A5, A6, A7, \ + A8, A9, AA, AB; \ __m256i B0, B1, B2, B3, B4, B5, B6, B7, \ B8, B9, BA, BB, BC, BD, BE, BF; \ __m256i C0, C1, C2, C3, C4, C5, C6, C7, \ C8, C9, CA, CB, CC, CD, CE, CF; \ __m256i M0, M1, M2, M3, M4, M5, M6, M7, \ M8, M9, MA, MB, MC, MD, ME, MF; \ + const __m256i FIVE = _mm256_set1_epi32( 5 ); \ + const __m256i THREE = _mm256_set1_epi32( 3 ); \ sph_u32 Wlow, Whigh; #define READ_STATE8(state) do \ { \ if ( (state)->state_loaded ) \ { \ - A00 = (state)->A[0]; \ - A01 = (state)->A[1]; \ - A02 = (state)->A[2]; \ - A03 = (state)->A[3]; \ - A04 = (state)->A[4]; \ - A05 = (state)->A[5]; \ - A06 = (state)->A[6]; \ - A07 = (state)->A[7]; \ - A08 = (state)->A[8]; \ - A09 = (state)->A[9]; \ - A0A = (state)->A[10]; \ - A0B = (state)->A[11]; \ + A0 = (state)->A[0]; \ + A1 = (state)->A[1]; \ + A2 = (state)->A[2]; \ + A3 = (state)->A[3]; \ + A4 = (state)->A[4]; \ + A5 = (state)->A[5]; \ + A6 = (state)->A[6]; \ + A7 = (state)->A[7]; \ + A8 = (state)->A[8]; \ + A9 = (state)->A[9]; \ + AA = (state)->A[10]; \ + AB = 
(state)->A[11]; \ B0 = (state)->B[0]; \ B1 = (state)->B[1]; \ B2 = (state)->B[2]; \ @@ -124,18 +126,18 @@ extern "C"{ else \ { \ (state)->state_loaded = true; \ - A00 = m256_const1_64( 0x20728DFD20728DFD ); \ - A01 = m256_const1_64( 0x46C0BD5346C0BD53 ); \ - A02 = m256_const1_64( 0xE782B699E782B699 ); \ - A03 = m256_const1_64( 0x5530463255304632 ); \ - A04 = m256_const1_64( 0x71B4EF9071B4EF90 ); \ - A05 = m256_const1_64( 0x0EA9E82C0EA9E82C ); \ - A06 = m256_const1_64( 0xDBB930F1DBB930F1 ); \ - A07 = m256_const1_64( 0xFAD06B8BFAD06B8B ); \ - A08 = m256_const1_64( 0xBE0CAE40BE0CAE40 ); \ - A09 = m256_const1_64( 0x8BD144108BD14410 ); \ - A0A = m256_const1_64( 0x76D2ADAC76D2ADAC ); \ - A0B = m256_const1_64( 0x28ACAB7F28ACAB7F ); \ + A0 = m256_const1_64( 0x20728DFD20728DFD ); \ + A1 = m256_const1_64( 0x46C0BD5346C0BD53 ); \ + A2 = m256_const1_64( 0xE782B699E782B699 ); \ + A3 = m256_const1_64( 0x5530463255304632 ); \ + A4 = m256_const1_64( 0x71B4EF9071B4EF90 ); \ + A5 = m256_const1_64( 0x0EA9E82C0EA9E82C ); \ + A6 = m256_const1_64( 0xDBB930F1DBB930F1 ); \ + A7 = m256_const1_64( 0xFAD06B8BFAD06B8B ); \ + A8 = m256_const1_64( 0xBE0CAE40BE0CAE40 ); \ + A9 = m256_const1_64( 0x8BD144108BD14410 ); \ + AA = m256_const1_64( 0x76D2ADAC76D2ADAC ); \ + AB = m256_const1_64( 0x28ACAB7F28ACAB7F ); \ B0 = m256_const1_64( 0xC1099CB7C1099CB7 ); \ B1 = m256_const1_64( 0x07B385F307B385F3 ); \ B2 = m256_const1_64( 0xE7442C26E7442C26 ); \ @@ -174,18 +176,18 @@ extern "C"{ } while (0) #define WRITE_STATE8(state) do { \ - (state)->A[0] = A00; \ - (state)->A[1] = A01; \ - (state)->A[2] = A02; \ - (state)->A[3] = A03; \ - (state)->A[4] = A04; \ - (state)->A[5] = A05; \ - (state)->A[6] = A06; \ - (state)->A[7] = A07; \ - (state)->A[8] = A08; \ - (state)->A[9] = A09; \ - (state)->A[10] = A0A; \ - (state)->A[11] = A0B; \ + (state)->A[0] = A0; \ + (state)->A[1] = A1; \ + (state)->A[2] = A2; \ + (state)->A[3] = A3; \ + (state)->A[4] = A4; \ + (state)->A[5] = A5; \ + (state)->A[6] = A6; \ + (state)->A[7] = A7; \ + (state)->A[8] = A8; \ + (state)->A[9] = A9; \ + (state)->A[10] = AA; \ + (state)->A[11] = AB; \ (state)->B[0] = B0; \ (state)->B[1] = B1; \ (state)->B[2] = B2; \ @@ -284,8 +286,8 @@ do { \ #define XOR_W8 \ do { \ - A00 = _mm256_xor_si256( A00, _mm256_set1_epi32( Wlow ) ); \ - A01 = _mm256_xor_si256( A01, _mm256_set1_epi32( Whigh ) ); \ + A0 = _mm256_xor_si256( A0, _mm256_set1_epi32( Wlow ) ); \ + A1 = _mm256_xor_si256( A1, _mm256_set1_epi32( Whigh ) ); \ } while (0) #define SWAP_BC8 \ @@ -310,69 +312,69 @@ do { \ #define PERM_ELT8(xa0, xa1, xb0, xb1, xb2, xb3, xc, xm) \ do { \ - xa0 = _mm256_xor_si256( xm, _mm256_xor_si256( xb1, _mm256_xor_si256( \ + xa0 = mm256_xor3( xm, xb1, _mm256_xor_si256( \ _mm256_andnot_si256( xb3, xb2 ), \ - _mm256_mullo_epi32( _mm256_xor_si256( xa0, _mm256_xor_si256( xc, \ - _mm256_mullo_epi32( mm256_rol_32( xa1, 15 ), _mm256_set1_epi32(5UL) ) \ - ) ), _mm256_set1_epi32(3UL) ) ) ) ); \ - xb0 = mm256_not( _mm256_xor_si256( xa0, mm256_rol_32( xb0, 1 ) ) ); \ + _mm256_mullo_epi32( mm256_xor3( xa0, xc, \ + _mm256_mullo_epi32( mm256_rol_32( xa1, 15 ), \ + FIVE ) ), THREE ) ) ); \ + xb0 = mm256_xnor( xa0, mm256_rol_32( xb0, 1 ) ); \ } while (0) #define PERM_STEP_0_8 do { \ - PERM_ELT8(A00, A0B, B0, BD, B9, B6, C8, M0); \ - PERM_ELT8(A01, A00, B1, BE, BA, B7, C7, M1); \ - PERM_ELT8(A02, A01, B2, BF, BB, B8, C6, M2); \ - PERM_ELT8(A03, A02, B3, B0, BC, B9, C5, M3); \ - PERM_ELT8(A04, A03, B4, B1, BD, BA, C4, M4); \ - PERM_ELT8(A05, A04, B5, B2, BE, BB, C3, M5); \ - PERM_ELT8(A06, A05, B6, B3, BF, 
BC, C2, M6); \ - PERM_ELT8(A07, A06, B7, B4, B0, BD, C1, M7); \ - PERM_ELT8(A08, A07, B8, B5, B1, BE, C0, M8); \ - PERM_ELT8(A09, A08, B9, B6, B2, BF, CF, M9); \ - PERM_ELT8(A0A, A09, BA, B7, B3, B0, CE, MA); \ - PERM_ELT8(A0B, A0A, BB, B8, B4, B1, CD, MB); \ - PERM_ELT8(A00, A0B, BC, B9, B5, B2, CC, MC); \ - PERM_ELT8(A01, A00, BD, BA, B6, B3, CB, MD); \ - PERM_ELT8(A02, A01, BE, BB, B7, B4, CA, ME); \ - PERM_ELT8(A03, A02, BF, BC, B8, B5, C9, MF); \ + PERM_ELT8(A0, AB, B0, BD, B9, B6, C8, M0); \ + PERM_ELT8(A1, A0, B1, BE, BA, B7, C7, M1); \ + PERM_ELT8(A2, A1, B2, BF, BB, B8, C6, M2); \ + PERM_ELT8(A3, A2, B3, B0, BC, B9, C5, M3); \ + PERM_ELT8(A4, A3, B4, B1, BD, BA, C4, M4); \ + PERM_ELT8(A5, A4, B5, B2, BE, BB, C3, M5); \ + PERM_ELT8(A6, A5, B6, B3, BF, BC, C2, M6); \ + PERM_ELT8(A7, A6, B7, B4, B0, BD, C1, M7); \ + PERM_ELT8(A8, A7, B8, B5, B1, BE, C0, M8); \ + PERM_ELT8(A9, A8, B9, B6, B2, BF, CF, M9); \ + PERM_ELT8(AA, A9, BA, B7, B3, B0, CE, MA); \ + PERM_ELT8(AB, AA, BB, B8, B4, B1, CD, MB); \ + PERM_ELT8(A0, AB, BC, B9, B5, B2, CC, MC); \ + PERM_ELT8(A1, A0, BD, BA, B6, B3, CB, MD); \ + PERM_ELT8(A2, A1, BE, BB, B7, B4, CA, ME); \ + PERM_ELT8(A3, A2, BF, BC, B8, B5, C9, MF); \ } while (0) #define PERM_STEP_1_8 do { \ - PERM_ELT8(A04, A03, B0, BD, B9, B6, C8, M0); \ - PERM_ELT8(A05, A04, B1, BE, BA, B7, C7, M1); \ - PERM_ELT8(A06, A05, B2, BF, BB, B8, C6, M2); \ - PERM_ELT8(A07, A06, B3, B0, BC, B9, C5, M3); \ - PERM_ELT8(A08, A07, B4, B1, BD, BA, C4, M4); \ - PERM_ELT8(A09, A08, B5, B2, BE, BB, C3, M5); \ - PERM_ELT8(A0A, A09, B6, B3, BF, BC, C2, M6); \ - PERM_ELT8(A0B, A0A, B7, B4, B0, BD, C1, M7); \ - PERM_ELT8(A00, A0B, B8, B5, B1, BE, C0, M8); \ - PERM_ELT8(A01, A00, B9, B6, B2, BF, CF, M9); \ - PERM_ELT8(A02, A01, BA, B7, B3, B0, CE, MA); \ - PERM_ELT8(A03, A02, BB, B8, B4, B1, CD, MB); \ - PERM_ELT8(A04, A03, BC, B9, B5, B2, CC, MC); \ - PERM_ELT8(A05, A04, BD, BA, B6, B3, CB, MD); \ - PERM_ELT8(A06, A05, BE, BB, B7, B4, CA, ME); \ - PERM_ELT8(A07, A06, BF, BC, B8, B5, C9, MF); \ + PERM_ELT8(A4, A3, B0, BD, B9, B6, C8, M0); \ + PERM_ELT8(A5, A4, B1, BE, BA, B7, C7, M1); \ + PERM_ELT8(A6, A5, B2, BF, BB, B8, C6, M2); \ + PERM_ELT8(A7, A6, B3, B0, BC, B9, C5, M3); \ + PERM_ELT8(A8, A7, B4, B1, BD, BA, C4, M4); \ + PERM_ELT8(A9, A8, B5, B2, BE, BB, C3, M5); \ + PERM_ELT8(AA, A9, B6, B3, BF, BC, C2, M6); \ + PERM_ELT8(AB, AA, B7, B4, B0, BD, C1, M7); \ + PERM_ELT8(A0, AB, B8, B5, B1, BE, C0, M8); \ + PERM_ELT8(A1, A0, B9, B6, B2, BF, CF, M9); \ + PERM_ELT8(A2, A1, BA, B7, B3, B0, CE, MA); \ + PERM_ELT8(A3, A2, BB, B8, B4, B1, CD, MB); \ + PERM_ELT8(A4, A3, BC, B9, B5, B2, CC, MC); \ + PERM_ELT8(A5, A4, BD, BA, B6, B3, CB, MD); \ + PERM_ELT8(A6, A5, BE, BB, B7, B4, CA, ME); \ + PERM_ELT8(A7, A6, BF, BC, B8, B5, C9, MF); \ } while (0) #define PERM_STEP_2_8 do { \ - PERM_ELT8(A08, A07, B0, BD, B9, B6, C8, M0); \ - PERM_ELT8(A09, A08, B1, BE, BA, B7, C7, M1); \ - PERM_ELT8(A0A, A09, B2, BF, BB, B8, C6, M2); \ - PERM_ELT8(A0B, A0A, B3, B0, BC, B9, C5, M3); \ - PERM_ELT8(A00, A0B, B4, B1, BD, BA, C4, M4); \ - PERM_ELT8(A01, A00, B5, B2, BE, BB, C3, M5); \ - PERM_ELT8(A02, A01, B6, B3, BF, BC, C2, M6); \ - PERM_ELT8(A03, A02, B7, B4, B0, BD, C1, M7); \ - PERM_ELT8(A04, A03, B8, B5, B1, BE, C0, M8); \ - PERM_ELT8(A05, A04, B9, B6, B2, BF, CF, M9); \ - PERM_ELT8(A06, A05, BA, B7, B3, B0, CE, MA); \ - PERM_ELT8(A07, A06, BB, B8, B4, B1, CD, MB); \ - PERM_ELT8(A08, A07, BC, B9, B5, B2, CC, MC); \ - PERM_ELT8(A09, A08, BD, BA, B6, B3, CB, MD); \ - PERM_ELT8(A0A, A09, BE, BB, B7, B4, CA, 
ME); \ - PERM_ELT8(A0B, A0A, BF, BC, B8, B5, C9, MF); \ + PERM_ELT8(A8, A7, B0, BD, B9, B6, C8, M0); \ + PERM_ELT8(A9, A8, B1, BE, BA, B7, C7, M1); \ + PERM_ELT8(AA, A9, B2, BF, BB, B8, C6, M2); \ + PERM_ELT8(AB, AA, B3, B0, BC, B9, C5, M3); \ + PERM_ELT8(A0, AB, B4, B1, BD, BA, C4, M4); \ + PERM_ELT8(A1, A0, B5, B2, BE, BB, C3, M5); \ + PERM_ELT8(A2, A1, B6, B3, BF, BC, C2, M6); \ + PERM_ELT8(A3, A2, B7, B4, B0, BD, C1, M7); \ + PERM_ELT8(A4, A3, B8, B5, B1, BE, C0, M8); \ + PERM_ELT8(A5, A4, B9, B6, B2, BF, CF, M9); \ + PERM_ELT8(A6, A5, BA, B7, B3, B0, CE, MA); \ + PERM_ELT8(A7, A6, BB, B8, B4, B1, CD, MB); \ + PERM_ELT8(A8, A7, BC, B9, B5, B2, CC, MC); \ + PERM_ELT8(A9, A8, BD, BA, B6, B3, CB, MD); \ + PERM_ELT8(AA, A9, BE, BB, B7, B4, CA, ME); \ + PERM_ELT8(AB, AA, BF, BC, B8, B5, C9, MF); \ } while (0) #define APPLY_P8 \ @@ -396,42 +398,42 @@ do { \ PERM_STEP_0_8; \ PERM_STEP_1_8; \ PERM_STEP_2_8; \ - A0B = _mm256_add_epi32( A0B, C6 ); \ - A0A = _mm256_add_epi32( A0A, C5 ); \ - A09 = _mm256_add_epi32( A09, C4 ); \ - A08 = _mm256_add_epi32( A08, C3 ); \ - A07 = _mm256_add_epi32( A07, C2 ); \ - A06 = _mm256_add_epi32( A06, C1 ); \ - A05 = _mm256_add_epi32( A05, C0 ); \ - A04 = _mm256_add_epi32( A04, CF ); \ - A03 = _mm256_add_epi32( A03, CE ); \ - A02 = _mm256_add_epi32( A02, CD ); \ - A01 = _mm256_add_epi32( A01, CC ); \ - A00 = _mm256_add_epi32( A00, CB ); \ - A0B = _mm256_add_epi32( A0B, CA ); \ - A0A = _mm256_add_epi32( A0A, C9 ); \ - A09 = _mm256_add_epi32( A09, C8 ); \ - A08 = _mm256_add_epi32( A08, C7 ); \ - A07 = _mm256_add_epi32( A07, C6 ); \ - A06 = _mm256_add_epi32( A06, C5 ); \ - A05 = _mm256_add_epi32( A05, C4 ); \ - A04 = _mm256_add_epi32( A04, C3 ); \ - A03 = _mm256_add_epi32( A03, C2 ); \ - A02 = _mm256_add_epi32( A02, C1 ); \ - A01 = _mm256_add_epi32( A01, C0 ); \ - A00 = _mm256_add_epi32( A00, CF ); \ - A0B = _mm256_add_epi32( A0B, CE ); \ - A0A = _mm256_add_epi32( A0A, CD ); \ - A09 = _mm256_add_epi32( A09, CC ); \ - A08 = _mm256_add_epi32( A08, CB ); \ - A07 = _mm256_add_epi32( A07, CA ); \ - A06 = _mm256_add_epi32( A06, C9 ); \ - A05 = _mm256_add_epi32( A05, C8 ); \ - A04 = _mm256_add_epi32( A04, C7 ); \ - A03 = _mm256_add_epi32( A03, C6 ); \ - A02 = _mm256_add_epi32( A02, C5 ); \ - A01 = _mm256_add_epi32( A01, C4 ); \ - A00 = _mm256_add_epi32( A00, C3 ); \ + AB = _mm256_add_epi32( AB, C6 ); \ + AA = _mm256_add_epi32( AA, C5 ); \ + A9 = _mm256_add_epi32( A9, C4 ); \ + A8 = _mm256_add_epi32( A8, C3 ); \ + A7 = _mm256_add_epi32( A7, C2 ); \ + A6 = _mm256_add_epi32( A6, C1 ); \ + A5 = _mm256_add_epi32( A5, C0 ); \ + A4 = _mm256_add_epi32( A4, CF ); \ + A3 = _mm256_add_epi32( A3, CE ); \ + A2 = _mm256_add_epi32( A2, CD ); \ + A1 = _mm256_add_epi32( A1, CC ); \ + A0 = _mm256_add_epi32( A0, CB ); \ + AB = _mm256_add_epi32( AB, CA ); \ + AA = _mm256_add_epi32( AA, C9 ); \ + A9 = _mm256_add_epi32( A9, C8 ); \ + A8 = _mm256_add_epi32( A8, C7 ); \ + A7 = _mm256_add_epi32( A7, C6 ); \ + A6 = _mm256_add_epi32( A6, C5 ); \ + A5 = _mm256_add_epi32( A5, C4 ); \ + A4 = _mm256_add_epi32( A4, C3 ); \ + A3 = _mm256_add_epi32( A3, C2 ); \ + A2 = _mm256_add_epi32( A2, C1 ); \ + A1 = _mm256_add_epi32( A1, C0 ); \ + A0 = _mm256_add_epi32( A0, CF ); \ + AB = _mm256_add_epi32( AB, CE ); \ + AA = _mm256_add_epi32( AA, CD ); \ + A9 = _mm256_add_epi32( A9, CC ); \ + A8 = _mm256_add_epi32( A8, CB ); \ + A7 = _mm256_add_epi32( A7, CA ); \ + A6 = _mm256_add_epi32( A6, C9 ); \ + A5 = _mm256_add_epi32( A5, C8 ); \ + A4 = _mm256_add_epi32( A4, C7 ); \ + A3 = _mm256_add_epi32( A3, C6 ); \ + A2 = 
_mm256_add_epi32( A2, C5 ); \ + A1 = _mm256_add_epi32( A1, C4 ); \ + A0 = _mm256_add_epi32( A0, C3 ); \ } while (0) #define INCR_W8 do { \ @@ -658,32 +660,34 @@ shabal512_8way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) #define DECL_STATE \ - __m128i A00, A01, A02, A03, A04, A05, A06, A07, \ - A08, A09, A0A, A0B; \ + __m128i A0, A1, A2, A3, A4, A5, A6, A7, \ + A8, A9, AA, AB; \ __m128i B0, B1, B2, B3, B4, B5, B6, B7, \ B8, B9, BA, BB, BC, BD, BE, BF; \ __m128i C0, C1, C2, C3, C4, C5, C6, C7, \ C8, C9, CA, CB, CC, CD, CE, CF; \ __m128i M0, M1, M2, M3, M4, M5, M6, M7, \ M8, M9, MA, MB, MC, MD, ME, MF; \ - sph_u32 Wlow, Whigh; + const __m128i FIVE = _mm_set1_epi32( 5 ); \ + const __m128i THREE = _mm_set1_epi32( 3 ); \ + sph_u32 Wlow, Whigh; #define READ_STATE(state) do \ { \ if ( (state)->state_loaded ) \ { \ - A00 = (state)->A[0]; \ - A01 = (state)->A[1]; \ - A02 = (state)->A[2]; \ - A03 = (state)->A[3]; \ - A04 = (state)->A[4]; \ - A05 = (state)->A[5]; \ - A06 = (state)->A[6]; \ - A07 = (state)->A[7]; \ - A08 = (state)->A[8]; \ - A09 = (state)->A[9]; \ - A0A = (state)->A[10]; \ - A0B = (state)->A[11]; \ + A0 = (state)->A[0]; \ + A1 = (state)->A[1]; \ + A2 = (state)->A[2]; \ + A3 = (state)->A[3]; \ + A4 = (state)->A[4]; \ + A5 = (state)->A[5]; \ + A6 = (state)->A[6]; \ + A7 = (state)->A[7]; \ + A8 = (state)->A[8]; \ + A9 = (state)->A[9]; \ + AA = (state)->A[10]; \ + AB = (state)->A[11]; \ B0 = (state)->B[0]; \ B1 = (state)->B[1]; \ B2 = (state)->B[2]; \ @@ -720,18 +724,18 @@ shabal512_8way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) else \ { \ (state)->state_loaded = true; \ - A00 = m128_const1_64( 0x20728DFD20728DFD ); \ - A01 = m128_const1_64( 0x46C0BD5346C0BD53 ); \ - A02 = m128_const1_64( 0xE782B699E782B699 ); \ - A03 = m128_const1_64( 0x5530463255304632 ); \ - A04 = m128_const1_64( 0x71B4EF9071B4EF90 ); \ - A05 = m128_const1_64( 0x0EA9E82C0EA9E82C ); \ - A06 = m128_const1_64( 0xDBB930F1DBB930F1 ); \ - A07 = m128_const1_64( 0xFAD06B8BFAD06B8B ); \ - A08 = m128_const1_64( 0xBE0CAE40BE0CAE40 ); \ - A09 = m128_const1_64( 0x8BD144108BD14410 ); \ - A0A = m128_const1_64( 0x76D2ADAC76D2ADAC ); \ - A0B = m128_const1_64( 0x28ACAB7F28ACAB7F ); \ + A0 = m128_const1_64( 0x20728DFD20728DFD ); \ + A1 = m128_const1_64( 0x46C0BD5346C0BD53 ); \ + A2 = m128_const1_64( 0xE782B699E782B699 ); \ + A3 = m128_const1_64( 0x5530463255304632 ); \ + A4 = m128_const1_64( 0x71B4EF9071B4EF90 ); \ + A5 = m128_const1_64( 0x0EA9E82C0EA9E82C ); \ + A6 = m128_const1_64( 0xDBB930F1DBB930F1 ); \ + A7 = m128_const1_64( 0xFAD06B8BFAD06B8B ); \ + A8 = m128_const1_64( 0xBE0CAE40BE0CAE40 ); \ + A9 = m128_const1_64( 0x8BD144108BD14410 ); \ + AA = m128_const1_64( 0x76D2ADAC76D2ADAC ); \ + AB = m128_const1_64( 0x28ACAB7F28ACAB7F ); \ B0 = m128_const1_64( 0xC1099CB7C1099CB7 ); \ B1 = m128_const1_64( 0x07B385F307B385F3 ); \ B2 = m128_const1_64( 0xE7442C26E7442C26 ); \ @@ -770,18 +774,18 @@ shabal512_8way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) } while (0) #define WRITE_STATE(state) do { \ - (state)->A[0] = A00; \ - (state)->A[1] = A01; \ - (state)->A[2] = A02; \ - (state)->A[3] = A03; \ - (state)->A[4] = A04; \ - (state)->A[5] = A05; \ - (state)->A[6] = A06; \ - (state)->A[7] = A07; \ - (state)->A[8] = A08; \ - (state)->A[9] = A09; \ - (state)->A[10] = A0A; \ - (state)->A[11] = A0B; \ + (state)->A[0] = A0; \ + (state)->A[1] = A1; \ + (state)->A[2] = A2; \ + (state)->A[3] = A3; \ + (state)->A[4] = A4; \ + (state)->A[5] = A5; \ + (state)->A[6] = A6; \ + (state)->A[7] = A7; 
\ + (state)->A[8] = A8; \ + (state)->A[9] = A9; \ + (state)->A[10] = AA; \ + (state)->A[11] = AB; \ (state)->B[0] = B0; \ (state)->B[1] = B1; \ (state)->B[2] = B2; \ @@ -880,8 +884,8 @@ do { \ #define XOR_W \ do { \ - A00 = _mm_xor_si128( A00, _mm_set1_epi32( Wlow ) ); \ - A01 = _mm_xor_si128( A01, _mm_set1_epi32( Whigh ) ); \ + A0 = _mm_xor_si128( A0, _mm_set1_epi32( Wlow ) ); \ + A1 = _mm_xor_si128( A1, _mm_set1_epi32( Whigh ) ); \ } while (0) @@ -930,66 +934,66 @@ do { \ xa0 = _mm_xor_si128( xm, _mm_xor_si128( xb1, _mm_xor_si128( \ _mm_andnot_si128( xb3, xb2 ), \ _mm_mullo_epi32( _mm_xor_si128( xa0, _mm_xor_si128( xc, \ - _mm_mullo_epi32( mm128_rol_32( xa1, 15 ), _mm_set1_epi32(5UL) ) \ - ) ), _mm_set1_epi32(3UL) ) ) ) ); \ + _mm_mullo_epi32( mm128_rol_32( xa1, 15 ), FIVE ) \ + ) ), THREE ) ) ) ); \ xb0 = mm128_not( _mm_xor_si128( xa0, mm128_rol_32( xb0, 1 ) ) ); \ } while (0) #define PERM_STEP_0 do { \ - PERM_ELT(A00, A0B, B0, BD, B9, B6, C8, M0); \ - PERM_ELT(A01, A00, B1, BE, BA, B7, C7, M1); \ - PERM_ELT(A02, A01, B2, BF, BB, B8, C6, M2); \ - PERM_ELT(A03, A02, B3, B0, BC, B9, C5, M3); \ - PERM_ELT(A04, A03, B4, B1, BD, BA, C4, M4); \ - PERM_ELT(A05, A04, B5, B2, BE, BB, C3, M5); \ - PERM_ELT(A06, A05, B6, B3, BF, BC, C2, M6); \ - PERM_ELT(A07, A06, B7, B4, B0, BD, C1, M7); \ - PERM_ELT(A08, A07, B8, B5, B1, BE, C0, M8); \ - PERM_ELT(A09, A08, B9, B6, B2, BF, CF, M9); \ - PERM_ELT(A0A, A09, BA, B7, B3, B0, CE, MA); \ - PERM_ELT(A0B, A0A, BB, B8, B4, B1, CD, MB); \ - PERM_ELT(A00, A0B, BC, B9, B5, B2, CC, MC); \ - PERM_ELT(A01, A00, BD, BA, B6, B3, CB, MD); \ - PERM_ELT(A02, A01, BE, BB, B7, B4, CA, ME); \ - PERM_ELT(A03, A02, BF, BC, B8, B5, C9, MF); \ + PERM_ELT(A0, AB, B0, BD, B9, B6, C8, M0); \ + PERM_ELT(A1, A0, B1, BE, BA, B7, C7, M1); \ + PERM_ELT(A2, A1, B2, BF, BB, B8, C6, M2); \ + PERM_ELT(A3, A2, B3, B0, BC, B9, C5, M3); \ + PERM_ELT(A4, A3, B4, B1, BD, BA, C4, M4); \ + PERM_ELT(A5, A4, B5, B2, BE, BB, C3, M5); \ + PERM_ELT(A6, A5, B6, B3, BF, BC, C2, M6); \ + PERM_ELT(A7, A6, B7, B4, B0, BD, C1, M7); \ + PERM_ELT(A8, A7, B8, B5, B1, BE, C0, M8); \ + PERM_ELT(A9, A8, B9, B6, B2, BF, CF, M9); \ + PERM_ELT(AA, A9, BA, B7, B3, B0, CE, MA); \ + PERM_ELT(AB, AA, BB, B8, B4, B1, CD, MB); \ + PERM_ELT(A0, AB, BC, B9, B5, B2, CC, MC); \ + PERM_ELT(A1, A0, BD, BA, B6, B3, CB, MD); \ + PERM_ELT(A2, A1, BE, BB, B7, B4, CA, ME); \ + PERM_ELT(A3, A2, BF, BC, B8, B5, C9, MF); \ } while (0) #define PERM_STEP_1 do { \ - PERM_ELT(A04, A03, B0, BD, B9, B6, C8, M0); \ - PERM_ELT(A05, A04, B1, BE, BA, B7, C7, M1); \ - PERM_ELT(A06, A05, B2, BF, BB, B8, C6, M2); \ - PERM_ELT(A07, A06, B3, B0, BC, B9, C5, M3); \ - PERM_ELT(A08, A07, B4, B1, BD, BA, C4, M4); \ - PERM_ELT(A09, A08, B5, B2, BE, BB, C3, M5); \ - PERM_ELT(A0A, A09, B6, B3, BF, BC, C2, M6); \ - PERM_ELT(A0B, A0A, B7, B4, B0, BD, C1, M7); \ - PERM_ELT(A00, A0B, B8, B5, B1, BE, C0, M8); \ - PERM_ELT(A01, A00, B9, B6, B2, BF, CF, M9); \ - PERM_ELT(A02, A01, BA, B7, B3, B0, CE, MA); \ - PERM_ELT(A03, A02, BB, B8, B4, B1, CD, MB); \ - PERM_ELT(A04, A03, BC, B9, B5, B2, CC, MC); \ - PERM_ELT(A05, A04, BD, BA, B6, B3, CB, MD); \ - PERM_ELT(A06, A05, BE, BB, B7, B4, CA, ME); \ - PERM_ELT(A07, A06, BF, BC, B8, B5, C9, MF); \ + PERM_ELT(A4, A3, B0, BD, B9, B6, C8, M0); \ + PERM_ELT(A5, A4, B1, BE, BA, B7, C7, M1); \ + PERM_ELT(A6, A5, B2, BF, BB, B8, C6, M2); \ + PERM_ELT(A7, A6, B3, B0, BC, B9, C5, M3); \ + PERM_ELT(A8, A7, B4, B1, BD, BA, C4, M4); \ + PERM_ELT(A9, A8, B5, B2, BE, BB, C3, M5); \ + PERM_ELT(AA, A9, B6, B3, BF, BC, C2, M6); \ + 
PERM_ELT(AB, AA, B7, B4, B0, BD, C1, M7); \ + PERM_ELT(A0, AB, B8, B5, B1, BE, C0, M8); \ + PERM_ELT(A1, A0, B9, B6, B2, BF, CF, M9); \ + PERM_ELT(A2, A1, BA, B7, B3, B0, CE, MA); \ + PERM_ELT(A3, A2, BB, B8, B4, B1, CD, MB); \ + PERM_ELT(A4, A3, BC, B9, B5, B2, CC, MC); \ + PERM_ELT(A5, A4, BD, BA, B6, B3, CB, MD); \ + PERM_ELT(A6, A5, BE, BB, B7, B4, CA, ME); \ + PERM_ELT(A7, A6, BF, BC, B8, B5, C9, MF); \ } while (0) #define PERM_STEP_2 do { \ - PERM_ELT(A08, A07, B0, BD, B9, B6, C8, M0); \ - PERM_ELT(A09, A08, B1, BE, BA, B7, C7, M1); \ - PERM_ELT(A0A, A09, B2, BF, BB, B8, C6, M2); \ - PERM_ELT(A0B, A0A, B3, B0, BC, B9, C5, M3); \ - PERM_ELT(A00, A0B, B4, B1, BD, BA, C4, M4); \ - PERM_ELT(A01, A00, B5, B2, BE, BB, C3, M5); \ - PERM_ELT(A02, A01, B6, B3, BF, BC, C2, M6); \ - PERM_ELT(A03, A02, B7, B4, B0, BD, C1, M7); \ - PERM_ELT(A04, A03, B8, B5, B1, BE, C0, M8); \ - PERM_ELT(A05, A04, B9, B6, B2, BF, CF, M9); \ - PERM_ELT(A06, A05, BA, B7, B3, B0, CE, MA); \ - PERM_ELT(A07, A06, BB, B8, B4, B1, CD, MB); \ - PERM_ELT(A08, A07, BC, B9, B5, B2, CC, MC); \ - PERM_ELT(A09, A08, BD, BA, B6, B3, CB, MD); \ - PERM_ELT(A0A, A09, BE, BB, B7, B4, CA, ME); \ - PERM_ELT(A0B, A0A, BF, BC, B8, B5, C9, MF); \ + PERM_ELT(A8, A7, B0, BD, B9, B6, C8, M0); \ + PERM_ELT(A9, A8, B1, BE, BA, B7, C7, M1); \ + PERM_ELT(AA, A9, B2, BF, BB, B8, C6, M2); \ + PERM_ELT(AB, AA, B3, B0, BC, B9, C5, M3); \ + PERM_ELT(A0, AB, B4, B1, BD, BA, C4, M4); \ + PERM_ELT(A1, A0, B5, B2, BE, BB, C3, M5); \ + PERM_ELT(A2, A1, B6, B3, BF, BC, C2, M6); \ + PERM_ELT(A3, A2, B7, B4, B0, BD, C1, M7); \ + PERM_ELT(A4, A3, B8, B5, B1, BE, C0, M8); \ + PERM_ELT(A5, A4, B9, B6, B2, BF, CF, M9); \ + PERM_ELT(A6, A5, BA, B7, B3, B0, CE, MA); \ + PERM_ELT(A7, A6, BB, B8, B4, B1, CD, MB); \ + PERM_ELT(A8, A7, BC, B9, B5, B2, CC, MC); \ + PERM_ELT(A9, A8, BD, BA, B6, B3, CB, MD); \ + PERM_ELT(AA, A9, BE, BB, B7, B4, CA, ME); \ + PERM_ELT(AB, AA, BF, BC, B8, B5, C9, MF); \ } while (0) #define APPLY_P \ @@ -1013,42 +1017,42 @@ do { \ PERM_STEP_0; \ PERM_STEP_1; \ PERM_STEP_2; \ - A0B = _mm_add_epi32( A0B, C6 ); \ - A0A = _mm_add_epi32( A0A, C5 ); \ - A09 = _mm_add_epi32( A09, C4 ); \ - A08 = _mm_add_epi32( A08, C3 ); \ - A07 = _mm_add_epi32( A07, C2 ); \ - A06 = _mm_add_epi32( A06, C1 ); \ - A05 = _mm_add_epi32( A05, C0 ); \ - A04 = _mm_add_epi32( A04, CF ); \ - A03 = _mm_add_epi32( A03, CE ); \ - A02 = _mm_add_epi32( A02, CD ); \ - A01 = _mm_add_epi32( A01, CC ); \ - A00 = _mm_add_epi32( A00, CB ); \ - A0B = _mm_add_epi32( A0B, CA ); \ - A0A = _mm_add_epi32( A0A, C9 ); \ - A09 = _mm_add_epi32( A09, C8 ); \ - A08 = _mm_add_epi32( A08, C7 ); \ - A07 = _mm_add_epi32( A07, C6 ); \ - A06 = _mm_add_epi32( A06, C5 ); \ - A05 = _mm_add_epi32( A05, C4 ); \ - A04 = _mm_add_epi32( A04, C3 ); \ - A03 = _mm_add_epi32( A03, C2 ); \ - A02 = _mm_add_epi32( A02, C1 ); \ - A01 = _mm_add_epi32( A01, C0 ); \ - A00 = _mm_add_epi32( A00, CF ); \ - A0B = _mm_add_epi32( A0B, CE ); \ - A0A = _mm_add_epi32( A0A, CD ); \ - A09 = _mm_add_epi32( A09, CC ); \ - A08 = _mm_add_epi32( A08, CB ); \ - A07 = _mm_add_epi32( A07, CA ); \ - A06 = _mm_add_epi32( A06, C9 ); \ - A05 = _mm_add_epi32( A05, C8 ); \ - A04 = _mm_add_epi32( A04, C7 ); \ - A03 = _mm_add_epi32( A03, C6 ); \ - A02 = _mm_add_epi32( A02, C5 ); \ - A01 = _mm_add_epi32( A01, C4 ); \ - A00 = _mm_add_epi32( A00, C3 ); \ + AB = _mm_add_epi32( AB, C6 ); \ + AA = _mm_add_epi32( AA, C5 ); \ + A9 = _mm_add_epi32( A9, C4 ); \ + A8 = _mm_add_epi32( A8, C3 ); \ + A7 = _mm_add_epi32( A7, C2 ); \ + A6 = _mm_add_epi32( A6, 
C1 ); \ + A5 = _mm_add_epi32( A5, C0 ); \ + A4 = _mm_add_epi32( A4, CF ); \ + A3 = _mm_add_epi32( A3, CE ); \ + A2 = _mm_add_epi32( A2, CD ); \ + A1 = _mm_add_epi32( A1, CC ); \ + A0 = _mm_add_epi32( A0, CB ); \ + AB = _mm_add_epi32( AB, CA ); \ + AA = _mm_add_epi32( AA, C9 ); \ + A9 = _mm_add_epi32( A9, C8 ); \ + A8 = _mm_add_epi32( A8, C7 ); \ + A7 = _mm_add_epi32( A7, C6 ); \ + A6 = _mm_add_epi32( A6, C5 ); \ + A5 = _mm_add_epi32( A5, C4 ); \ + A4 = _mm_add_epi32( A4, C3 ); \ + A3 = _mm_add_epi32( A3, C2 ); \ + A2 = _mm_add_epi32( A2, C1 ); \ + A1 = _mm_add_epi32( A1, C0 ); \ + A0 = _mm_add_epi32( A0, CF ); \ + AB = _mm_add_epi32( AB, CE ); \ + AA = _mm_add_epi32( AA, CD ); \ + A9 = _mm_add_epi32( A9, CC ); \ + A8 = _mm_add_epi32( A8, CB ); \ + A7 = _mm_add_epi32( A7, CA ); \ + A6 = _mm_add_epi32( A6, C9 ); \ + A5 = _mm_add_epi32( A5, C8 ); \ + A4 = _mm_add_epi32( A4, C7 ); \ + A3 = _mm_add_epi32( A3, C6 ); \ + A2 = _mm_add_epi32( A2, C5 ); \ + A1 = _mm_add_epi32( A1, C4 ); \ + A0 = _mm_add_epi32( A0, C3 ); \ } while (0) #define INCR_W do { \ diff --git a/algo/shavite/shavite-hash-2way.c b/algo/shavite/shavite-hash-2way.c index 83f3e66b..7bf01d14 100644 --- a/algo/shavite/shavite-hash-2way.c +++ b/algo/shavite/shavite-hash-2way.c @@ -18,19 +18,31 @@ static const uint32_t IV512[] = 0xE275EADE, 0x502D9FCD, 0xB9357178, 0x022A4B9A }; - +/* #define mm256_ror2x256hi_1x32( a, b ) \ - _mm256_blend_epi32( mm256_ror128_32( a ), \ - mm256_ror128_32( b ), 0x88 ) + _mm256_blend_epi32( mm256_shuflr128_32( a ), \ + mm256_shuflr128_32( b ), 0x88 ) +*/ + +//#define mm256_ror2x256hi_1x32( a, b ) _mm256_alignr_epi8( b, a, 4 ) + +#if defined(__VAES__) + +#define mm256_aesenc_2x128( x, k ) \ + _mm256_aesenc_epi128( x, _mm256_castsi128_si256( k ) ) + +#else + +#define mm256_aesenc_2x128( x, k ) \ + mm256_concat_128( _mm_aesenc_si128( mm128_extr_hi128_256( x ), k ), \ + _mm_aesenc_si128( mm128_extr_lo128_256( x ), k ) ) + +#endif static void c512_2way( shavite512_2way_context *ctx, const void *msg ) { -#if defined(__VAES__) - const __m256i zero = _mm256_setzero_si256(); -#else const __m128i zero = _mm_setzero_si128(); -#endif __m256i p0, p1, p2, p3, x; __m256i k00, k01, k02, k03, k10, k11, k12, k13; __m256i *m = (__m256i*)msg; @@ -69,7 +81,7 @@ c512_2way( shavite512_2way_context *ctx, const void *msg ) { // round 1, 5, 9 - k00 = _mm256_xor_si256( k13, mm256_ror128_32( + k00 = _mm256_xor_si256( k13, mm256_shuflr128_32( mm256_aesenc_2x128( k00, zero ) ) ); if ( r == 0 ) @@ -79,7 +91,7 @@ c512_2way( shavite512_2way_context *ctx, const void *msg ) x = mm256_aesenc_2x128( _mm256_xor_si256( p0, k00 ), zero ); k01 = _mm256_xor_si256( k00, - mm256_ror128_32( mm256_aesenc_2x128( k01, zero ) ) ); + mm256_shuflr128_32( mm256_aesenc_2x128( k01, zero ) ) ); if ( r == 1 ) k01 = _mm256_xor_si256( k01, _mm256_set_epi32( @@ -88,25 +100,25 @@ c512_2way( shavite512_2way_context *ctx, const void *msg ) x = mm256_aesenc_2x128( _mm256_xor_si256( x, k01 ), zero ); k02 = _mm256_xor_si256( k01, - mm256_ror128_32( mm256_aesenc_2x128( k02, zero ) ) ); + mm256_shuflr128_32( mm256_aesenc_2x128( k02, zero ) ) ); x = mm256_aesenc_2x128( _mm256_xor_si256( x, k02 ), zero ); k03 = _mm256_xor_si256( k02, - mm256_ror128_32( mm256_aesenc_2x128( k03, zero ) ) ); + mm256_shuflr128_32( mm256_aesenc_2x128( k03, zero ) ) ); x = mm256_aesenc_2x128( _mm256_xor_si256( x, k03 ), zero ); p3 = _mm256_xor_si256( p3, x ); k10 = _mm256_xor_si256( k03, - mm256_ror128_32( mm256_aesenc_2x128( k10, zero ) ) ); + mm256_shuflr128_32( mm256_aesenc_2x128( k10, zero ) 
) ); x = mm256_aesenc_2x128( _mm256_xor_si256( p2, k10 ), zero ); k11 = _mm256_xor_si256( k10, - mm256_ror128_32( mm256_aesenc_2x128( k11, zero ) ) ); + mm256_shuflr128_32( mm256_aesenc_2x128( k11, zero ) ) ); x = mm256_aesenc_2x128( _mm256_xor_si256( x, k11 ), zero ); k12 = _mm256_xor_si256( k11, - mm256_ror128_32( mm256_aesenc_2x128( k12, zero ) ) ); + mm256_shuflr128_32( mm256_aesenc_2x128( k12, zero ) ) ); x = mm256_aesenc_2x128( _mm256_xor_si256( x, k12 ), zero ); k13 = _mm256_xor_si256( k12, - mm256_ror128_32( mm256_aesenc_2x128( k13, zero ) ) ); + mm256_shuflr128_32( mm256_aesenc_2x128( k13, zero ) ) ); if ( r == 2 ) k13 = _mm256_xor_si256( k13, _mm256_set_epi32( @@ -118,55 +130,55 @@ c512_2way( shavite512_2way_context *ctx, const void *msg ) // round 2, 6, 10 - k00 = _mm256_xor_si256( k00, mm256_ror2x256hi_1x32( k12, k13 ) ); + k00 = _mm256_xor_si256( k00, _mm256_alignr_epi8( k13, k12, 4 ) ); x = mm256_aesenc_2x128( _mm256_xor_si256( p3, k00 ), zero ); - k01 = _mm256_xor_si256( k01, mm256_ror2x256hi_1x32( k13, k00 ) ); + k01 = _mm256_xor_si256( k01, _mm256_alignr_epi8( k00, k13, 4 ) ); x = mm256_aesenc_2x128( _mm256_xor_si256( x, k01 ), zero ); - k02 = _mm256_xor_si256( k02, mm256_ror2x256hi_1x32( k00, k01 ) ); + k02 = _mm256_xor_si256( k02, _mm256_alignr_epi8( k01, k00, 4 ) ); x = mm256_aesenc_2x128( _mm256_xor_si256( x, k02 ), zero ); - k03 = _mm256_xor_si256( k03, mm256_ror2x256hi_1x32( k01, k02 ) ); + k03 = _mm256_xor_si256( k03, _mm256_alignr_epi8( k02, k01, 4 ) ); x = mm256_aesenc_2x128( _mm256_xor_si256( x, k03 ), zero ); p2 = _mm256_xor_si256( p2, x ); - k10 = _mm256_xor_si256( k10, mm256_ror2x256hi_1x32( k02, k03 ) ); + k10 = _mm256_xor_si256( k10, _mm256_alignr_epi8( k03, k02, 4 ) ); x = mm256_aesenc_2x128( _mm256_xor_si256( p1, k10 ), zero ); - k11 = _mm256_xor_si256( k11, mm256_ror2x256hi_1x32( k03, k10 ) ); + k11 = _mm256_xor_si256( k11, _mm256_alignr_epi8( k10, k03, 4 ) ); x = mm256_aesenc_2x128( _mm256_xor_si256( x, k11 ), zero ); - k12 = _mm256_xor_si256( k12, mm256_ror2x256hi_1x32( k10, k11 ) ); + k12 = _mm256_xor_si256( k12, _mm256_alignr_epi8( k11, k10, 4 ) ); x = mm256_aesenc_2x128( _mm256_xor_si256( x, k12 ), zero ); - k13 = _mm256_xor_si256( k13, mm256_ror2x256hi_1x32( k11, k12 ) ); + k13 = _mm256_xor_si256( k13, _mm256_alignr_epi8( k12, k11, 4 ) ); x = mm256_aesenc_2x128( _mm256_xor_si256( x, k13 ), zero ); p0 = _mm256_xor_si256( p0, x ); // round 3, 7, 11 - k00 = _mm256_xor_si256( mm256_ror128_32( + k00 = _mm256_xor_si256( mm256_shuflr128_32( mm256_aesenc_2x128( k00, zero ) ), k13 ); x = mm256_aesenc_2x128( _mm256_xor_si256( p2, k00 ), zero ); - k01 = _mm256_xor_si256( mm256_ror128_32( + k01 = _mm256_xor_si256( mm256_shuflr128_32( mm256_aesenc_2x128( k01, zero ) ), k00 ); x = mm256_aesenc_2x128( _mm256_xor_si256( x, k01 ), zero ); - k02 = _mm256_xor_si256( mm256_ror128_32( + k02 = _mm256_xor_si256( mm256_shuflr128_32( mm256_aesenc_2x128( k02, zero ) ), k01 ); x = mm256_aesenc_2x128( _mm256_xor_si256( x, k02 ), zero ); - k03 = _mm256_xor_si256( mm256_ror128_32( + k03 = _mm256_xor_si256( mm256_shuflr128_32( mm256_aesenc_2x128( k03, zero ) ), k02 ); x = mm256_aesenc_2x128( _mm256_xor_si256( x, k03 ), zero ); p1 = _mm256_xor_si256( p1, x ); - k10 = _mm256_xor_si256( mm256_ror128_32( + k10 = _mm256_xor_si256( mm256_shuflr128_32( mm256_aesenc_2x128( k10, zero ) ), k03 ); x = mm256_aesenc_2x128( _mm256_xor_si256( p0, k10 ), zero ); - k11 = _mm256_xor_si256( mm256_ror128_32( + k11 = _mm256_xor_si256( mm256_shuflr128_32( mm256_aesenc_2x128( k11, zero ) ), k10 ); x = 
mm256_aesenc_2x128( _mm256_xor_si256( x, k11 ), zero ); - k12 = _mm256_xor_si256( mm256_ror128_32( + k12 = _mm256_xor_si256( mm256_shuflr128_32( mm256_aesenc_2x128( k12, zero ) ), k11 ); x = mm256_aesenc_2x128( _mm256_xor_si256( x, k12 ), zero ); - k13 = _mm256_xor_si256( mm256_ror128_32( + k13 = _mm256_xor_si256( mm256_shuflr128_32( mm256_aesenc_2x128( k13, zero ) ), k12 ); x = mm256_aesenc_2x128( _mm256_xor_si256( x, k13 ), zero ); @@ -174,24 +186,24 @@ c512_2way( shavite512_2way_context *ctx, const void *msg ) // round 4, 8, 12 - k00 = _mm256_xor_si256( k00, mm256_ror2x256hi_1x32( k12, k13 ) ); + k00 = _mm256_xor_si256( k00, _mm256_alignr_epi8( k13, k12, 4 ) ); x = mm256_aesenc_2x128( _mm256_xor_si256( p1, k00 ), zero ); - k01 = _mm256_xor_si256( k01, mm256_ror2x256hi_1x32( k13, k00 ) ); + k01 = _mm256_xor_si256( k01, _mm256_alignr_epi8( k00, k13, 4 ) ); x = mm256_aesenc_2x128( _mm256_xor_si256( x, k01 ), zero ); - k02 = _mm256_xor_si256( k02, mm256_ror2x256hi_1x32( k00, k01 ) ); + k02 = _mm256_xor_si256( k02, _mm256_alignr_epi8( k01, k00, 4 ) ); x = mm256_aesenc_2x128( _mm256_xor_si256( x, k02 ), zero ); - k03 = _mm256_xor_si256( k03, mm256_ror2x256hi_1x32( k01, k02 ) ); + k03 = _mm256_xor_si256( k03, _mm256_alignr_epi8( k02, k01, 4 ) ); x = mm256_aesenc_2x128( _mm256_xor_si256( x, k03 ), zero ); p0 = _mm256_xor_si256( p0, x ); - k10 = _mm256_xor_si256( k10, mm256_ror2x256hi_1x32( k02, k03 ) ); + k10 = _mm256_xor_si256( k10, _mm256_alignr_epi8( k03, k02, 4 ) ); x = mm256_aesenc_2x128( _mm256_xor_si256( p3, k10 ), zero ); - k11 = _mm256_xor_si256( k11, mm256_ror2x256hi_1x32( k03, k10 ) ); + k11 = _mm256_xor_si256( k11, _mm256_alignr_epi8( k10, k03, 4 ) ); x = mm256_aesenc_2x128( _mm256_xor_si256( x, k11 ), zero ); - k12 = _mm256_xor_si256( k12, mm256_ror2x256hi_1x32( k10, k11 ) ); + k12 = _mm256_xor_si256( k12, _mm256_alignr_epi8( k11, k10, 4 ) ); x = mm256_aesenc_2x128( _mm256_xor_si256( x, k12 ), zero ); - k13 = _mm256_xor_si256( k13, mm256_ror2x256hi_1x32( k11, k12 ) ); + k13 = _mm256_xor_si256( k13, _mm256_alignr_epi8( k12, k11, 4 ) ); x = mm256_aesenc_2x128( _mm256_xor_si256( x, k13 ), zero ); p2 = _mm256_xor_si256( p2, x ); @@ -200,35 +212,35 @@ c512_2way( shavite512_2way_context *ctx, const void *msg ) // round 13 - k00 = _mm256_xor_si256( mm256_ror128_32( + k00 = _mm256_xor_si256( mm256_shuflr128_32( mm256_aesenc_2x128( k00, zero ) ), k13 ); x = mm256_aesenc_2x128( _mm256_xor_si256( p0, k00 ), zero ); - k01 = _mm256_xor_si256( mm256_ror128_32( + k01 = _mm256_xor_si256( mm256_shuflr128_32( mm256_aesenc_2x128( k01, zero ) ), k00 ); x = mm256_aesenc_2x128( _mm256_xor_si256( x, k01 ), zero ); - k02 = _mm256_xor_si256( mm256_ror128_32( + k02 = _mm256_xor_si256( mm256_shuflr128_32( mm256_aesenc_2x128( k02, zero ) ), k01 ); x = mm256_aesenc_2x128( _mm256_xor_si256( x, k02 ), zero ); - k03 = _mm256_xor_si256( mm256_ror128_32( + k03 = _mm256_xor_si256( mm256_shuflr128_32( mm256_aesenc_2x128( k03, zero ) ), k02 ); x = mm256_aesenc_2x128( _mm256_xor_si256( x, k03 ), zero ); p3 = _mm256_xor_si256( p3, x ); - k10 = _mm256_xor_si256( mm256_ror128_32( + k10 = _mm256_xor_si256( mm256_shuflr128_32( mm256_aesenc_2x128( k10, zero ) ), k03 ); x = mm256_aesenc_2x128( _mm256_xor_si256( p2, k10 ), zero ); - k11 = _mm256_xor_si256( mm256_ror128_32( + k11 = _mm256_xor_si256( mm256_shuflr128_32( mm256_aesenc_2x128( k11, zero ) ), k10 ); x = mm256_aesenc_2x128( _mm256_xor_si256( x, k11 ), zero ); - k12 = mm256_ror128_32( mm256_aesenc_2x128( k12, zero ) ); + k12 = mm256_shuflr128_32( mm256_aesenc_2x128( 
k12, zero ) ); k12 = _mm256_xor_si256( k12, _mm256_xor_si256( k11, _mm256_set_epi32( ~ctx->count2, ctx->count3, ctx->count0, ctx->count1, ~ctx->count2, ctx->count3, ctx->count0, ctx->count1 ) ) ); x = mm256_aesenc_2x128( _mm256_xor_si256( x, k12 ), zero ); - k13 = _mm256_xor_si256( mm256_ror128_32( + k13 = _mm256_xor_si256( mm256_shuflr128_32( mm256_aesenc_2x128( k13, zero ) ), k12 ); x = mm256_aesenc_2x128( _mm256_xor_si256( x, k13 ), zero ); @@ -308,7 +320,7 @@ void shavite512_2way_close( shavite512_2way_context *ctx, void *dst ) uint32_t vp = ctx->ptr>>5; // Terminating byte then zero pad - casti_m256i( buf, vp++ ) = m256_const2_64( 0, 0x0000000000000080 ); + casti_m256i( buf, vp++ ) = m256_const1_i128( 0x0000000000000080 ); // Zero pad full vectors up to count for ( ; vp < 6; vp++ ) @@ -388,13 +400,13 @@ void shavite512_2way_update_close( shavite512_2way_context *ctx, void *dst, if ( vp == 0 ) // empty buf, xevan. { - casti_m256i( buf, 0 ) = m256_const2_64( 0, 0x0000000000000080 ); + casti_m256i( buf, 0 ) = m256_const1_i128( 0x0000000000000080 ); memset_zero_256( (__m256i*)buf + 1, 5 ); ctx->count0 = ctx->count1 = ctx->count2 = ctx->count3 = 0; } else // half full buf, everyone else. { - casti_m256i( buf, vp++ ) = m256_const2_64( 0, 0x0000000000000080 ); + casti_m256i( buf, vp++ ) = m256_const1_i128( 0x0000000000000080 ); memset_zero_256( (__m256i*)buf + vp, 6 - vp ); } @@ -478,13 +490,13 @@ void shavite512_2way_full( shavite512_2way_context *ctx, void *dst, if ( vp == 0 ) // empty buf, xevan. { - casti_m256i( buf, 0 ) = m256_const2_64( 0, 0x0000000000000080 ); + casti_m256i( buf, 0 ) = m256_const1_i128( 0x0000000000000080 ); memset_zero_256( (__m256i*)buf + 1, 5 ); ctx->count0 = ctx->count1 = ctx->count2 = ctx->count3 = 0; } else // half full buf, everyone else. 
{ - casti_m256i( buf, vp++ ) = m256_const2_64( 0, 0x0000000000000080 ); + casti_m256i( buf, vp++ ) = m256_const1_i128( 0x0000000000000080 ); memset_zero_256( (__m256i*)buf + vp, 6 - vp ); } diff --git a/algo/shavite/shavite-hash-4way.c b/algo/shavite/shavite-hash-4way.c index eed4ba14..4dd9b490 100644 --- a/algo/shavite/shavite-hash-4way.c +++ b/algo/shavite/shavite-hash-4way.c @@ -11,10 +11,6 @@ static const uint32_t IV512[] = 0xE275EADE, 0x502D9FCD, 0xB9357178, 0x022A4B9A }; -#define mm512_ror2x512hi_1x32( a, b ) \ - _mm512_mask_blend_epi32( 0x8888, mm512_ror128_32( a ), \ - mm512_ror128_32( b ) ) - static void c512_4way( shavite512_4way_context *ctx, const void *msg ) { @@ -23,6 +19,8 @@ c512_4way( shavite512_4way_context *ctx, const void *msg ) register __m512i K0, K1, K2, K3, K4, K5, K6, K7; __m512i *M = (__m512i*)msg; __m512i *H = (__m512i*)ctx->h; + const __m512i count = _mm512_set4_epi32( ctx->count3, ctx->count2, + ctx->count1, ctx->count0 ); int r; P0 = H[0]; @@ -58,101 +56,101 @@ c512_4way( shavite512_4way_context *ctx, const void *msg ) { // round 1, 5, 9 - K0 = _mm512_xor_si512( K7, mm512_ror128_32( + K0 = _mm512_xor_si512( K7, mm512_shuflr128_32( _mm512_aesenc_epi128( K0, m512_zero ) ) ); if ( r == 0 ) - K0 = _mm512_xor_si512( K0, _mm512_set4_epi32( - ~ctx->count3, ctx->count2, ctx->count1, ctx->count0 ) ); + K0 = _mm512_xor_si512( K0, + _mm512_mask_xor_epi32( count, 0x8888, count, m512_neg1 ) ); X = _mm512_aesenc_epi128( _mm512_xor_si512( P0, K0 ), m512_zero ); K1 = _mm512_xor_si512( K0, - mm512_ror128_32( _mm512_aesenc_epi128( K1, m512_zero ) ) ); + mm512_shuflr128_32( _mm512_aesenc_epi128( K1, m512_zero ) ) ); if ( r == 1 ) - K1 = _mm512_xor_si512( K1, _mm512_set4_epi32( - ~ctx->count0, ctx->count1, ctx->count2, ctx->count3 ) ); + K1 = _mm512_xor_si512( K1, mm512_shuflr128_32( + _mm512_mask_xor_epi32( count, 0x1111, count, m512_neg1 ) ) ); X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K1 ), m512_zero ); K2 = _mm512_xor_si512( K1, - mm512_ror128_32( _mm512_aesenc_epi128( K2, m512_zero ) ) ); + mm512_shuflr128_32( _mm512_aesenc_epi128( K2, m512_zero ) ) ); X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K2 ), m512_zero ); K3 = _mm512_xor_si512( K2, - mm512_ror128_32( _mm512_aesenc_epi128( K3, m512_zero ) ) ); + mm512_shuflr128_32( _mm512_aesenc_epi128( K3, m512_zero ) ) ); X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K3 ), m512_zero ); P3 = _mm512_xor_si512( P3, X ); K4 = _mm512_xor_si512( K3, - mm512_ror128_32( _mm512_aesenc_epi128( K4, m512_zero ) ) ); + mm512_shuflr128_32( _mm512_aesenc_epi128( K4, m512_zero ) ) ); X = _mm512_aesenc_epi128( _mm512_xor_si512( P2, K4 ), m512_zero ); K5 = _mm512_xor_si512( K4, - mm512_ror128_32( _mm512_aesenc_epi128( K5, m512_zero ) ) ); + mm512_shuflr128_32( _mm512_aesenc_epi128( K5, m512_zero ) ) ); X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K5 ), m512_zero ); K6 = _mm512_xor_si512( K5, - mm512_ror128_32( _mm512_aesenc_epi128( K6, m512_zero ) ) ); + mm512_shuflr128_32( _mm512_aesenc_epi128( K6, m512_zero ) ) ); X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K6 ), m512_zero ); K7 = _mm512_xor_si512( K6, - mm512_ror128_32( _mm512_aesenc_epi128( K7, m512_zero ) ) ); + mm512_shuflr128_32( _mm512_aesenc_epi128( K7, m512_zero ) ) ); if ( r == 2 ) - K7 = _mm512_xor_si512( K7, _mm512_set4_epi32( - ~ctx->count1, ctx->count0, ctx->count3, ctx->count2 ) ); + K7 = _mm512_xor_si512( K7, mm512_swap128_64( + _mm512_mask_xor_epi32( count, 0x2222, count, m512_neg1 ) ) ); X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K7 ), m512_zero ); P1 = 
_mm512_xor_si512( P1, X ); // round 2, 6, 10 - K0 = _mm512_xor_si512( K0, mm512_ror2x512hi_1x32( K6, K7 ) ); + K0 = _mm512_xor_si512( K0, _mm512_alignr_epi8( K7, K6, 4 ) ); X = _mm512_aesenc_epi128( _mm512_xor_si512( P3, K0 ), m512_zero ); - K1 = _mm512_xor_si512( K1, mm512_ror2x512hi_1x32( K7, K0 ) ); + K1 = _mm512_xor_si512( K1, _mm512_alignr_epi8( K0, K7, 4 ) ); X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K1 ), m512_zero ); - K2 = _mm512_xor_si512( K2, mm512_ror2x512hi_1x32( K0, K1 ) ); + K2 = _mm512_xor_si512( K2, _mm512_alignr_epi8( K1, K0, 4 ) ); X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K2 ), m512_zero ); - K3 = _mm512_xor_si512( K3, mm512_ror2x512hi_1x32( K1, K2 ) ); + K3 = _mm512_xor_si512( K3, _mm512_alignr_epi8( K2, K1, 4 ) ); X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K3 ), m512_zero ); P2 = _mm512_xor_si512( P2, X ); - K4 = _mm512_xor_si512( K4, mm512_ror2x512hi_1x32( K2, K3 ) ); + K4 = _mm512_xor_si512( K4, _mm512_alignr_epi8( K3, K2, 4 ) ); X = _mm512_aesenc_epi128( _mm512_xor_si512( P1, K4 ), m512_zero ); - K5 = _mm512_xor_si512( K5, mm512_ror2x512hi_1x32( K3, K4 ) ); + K5 = _mm512_xor_si512( K5, _mm512_alignr_epi8( K4, K3, 4 ) ); X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K5 ), m512_zero ); - K6 = _mm512_xor_si512( K6, mm512_ror2x512hi_1x32( K4, K5 ) ); + K6 = _mm512_xor_si512( K6, _mm512_alignr_epi8( K5, K4, 4 ) ); X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K6 ), m512_zero ); - K7 = _mm512_xor_si512( K7, mm512_ror2x512hi_1x32( K5, K6 ) ); + K7 = _mm512_xor_si512( K7, _mm512_alignr_epi8( K6, K5, 4 ) ); X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K7 ), m512_zero ); P0 = _mm512_xor_si512( P0, X ); // round 3, 7, 11 - K0 = _mm512_xor_si512( mm512_ror128_32( + K0 = _mm512_xor_si512( mm512_shuflr128_32( _mm512_aesenc_epi128( K0, m512_zero ) ), K7 ); X = _mm512_aesenc_epi128( _mm512_xor_si512( P2, K0 ), m512_zero ); - K1 = _mm512_xor_si512( mm512_ror128_32( + K1 = _mm512_xor_si512( mm512_shuflr128_32( _mm512_aesenc_epi128( K1, m512_zero ) ), K0 ); X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K1 ), m512_zero ); - K2 = _mm512_xor_si512( mm512_ror128_32( + K2 = _mm512_xor_si512( mm512_shuflr128_32( _mm512_aesenc_epi128( K2, m512_zero ) ), K1 ); X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K2 ), m512_zero ); - K3 = _mm512_xor_si512( mm512_ror128_32( + K3 = _mm512_xor_si512( mm512_shuflr128_32( _mm512_aesenc_epi128( K3, m512_zero ) ), K2 ); X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K3 ), m512_zero ); P1 = _mm512_xor_si512( P1, X ); - K4 = _mm512_xor_si512( mm512_ror128_32( + K4 = _mm512_xor_si512( mm512_shuflr128_32( _mm512_aesenc_epi128( K4, m512_zero ) ), K3 ); X = _mm512_aesenc_epi128( _mm512_xor_si512( P0, K4 ), m512_zero ); - K5 = _mm512_xor_si512( mm512_ror128_32( + K5 = _mm512_xor_si512( mm512_shuflr128_32( _mm512_aesenc_epi128( K5, m512_zero ) ), K4 ); X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K5 ), m512_zero ); - K6 = _mm512_xor_si512( mm512_ror128_32( + K6 = _mm512_xor_si512( mm512_shuflr128_32( _mm512_aesenc_epi128( K6, m512_zero ) ), K5 ); X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K6 ), m512_zero ); - K7 = _mm512_xor_si512( mm512_ror128_32( + K7 = _mm512_xor_si512( mm512_shuflr128_32( _mm512_aesenc_epi128( K7, m512_zero ) ), K6 ); X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K7 ), m512_zero ); @@ -160,24 +158,24 @@ c512_4way( shavite512_4way_context *ctx, const void *msg ) // round 4, 8, 12 - K0 = _mm512_xor_si512( K0, mm512_ror2x512hi_1x32( K6, K7 ) ); + K0 = _mm512_xor_si512( K0, _mm512_alignr_epi8( K7, K6, 4 ) ); X = 
_mm512_aesenc_epi128( _mm512_xor_si512( P1, K0 ), m512_zero ); - K1 = _mm512_xor_si512( K1, mm512_ror2x512hi_1x32( K7, K0 ) ); + K1 = _mm512_xor_si512( K1, _mm512_alignr_epi8( K0, K7, 4 ) ); X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K1 ), m512_zero ); - K2 = _mm512_xor_si512( K2, mm512_ror2x512hi_1x32( K0, K1 ) ); + K2 = _mm512_xor_si512( K2, _mm512_alignr_epi8( K1, K0, 4 ) ); X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K2 ), m512_zero ); - K3 = _mm512_xor_si512( K3, mm512_ror2x512hi_1x32( K1, K2 ) ); + K3 = _mm512_xor_si512( K3, _mm512_alignr_epi8( K2, K1, 4 ) ); X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K3 ), m512_zero ); P0 = _mm512_xor_si512( P0, X ); - K4 = _mm512_xor_si512( K4, mm512_ror2x512hi_1x32( K2, K3 ) ); + K4 = _mm512_xor_si512( K4, _mm512_alignr_epi8( K3, K2, 4 ) ); X = _mm512_aesenc_epi128( _mm512_xor_si512( P3, K4 ), m512_zero ); - K5 = _mm512_xor_si512( K5, mm512_ror2x512hi_1x32( K3, K4 ) ); + K5 = _mm512_xor_si512( K5, _mm512_alignr_epi8( K4, K3, 4 ) ); X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K5 ), m512_zero ); - K6 = _mm512_xor_si512( K6, mm512_ror2x512hi_1x32( K4, K5 ) ); + K6 = _mm512_xor_si512( K6, _mm512_alignr_epi8( K5, K4, 4 ) ); X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K6 ), m512_zero ); - K7 = _mm512_xor_si512( K7, mm512_ror2x512hi_1x32( K5, K6 ) ); + K7 = _mm512_xor_si512( K7, _mm512_alignr_epi8( K6, K5, 4 ) ); X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K7 ), m512_zero ); P2 = _mm512_xor_si512( P2, X ); @@ -185,34 +183,34 @@ c512_4way( shavite512_4way_context *ctx, const void *msg ) // round 13 - K0 = _mm512_xor_si512( mm512_ror128_32( + K0 = _mm512_xor_si512( mm512_shuflr128_32( _mm512_aesenc_epi128( K0, m512_zero ) ), K7 ); X = _mm512_aesenc_epi128( _mm512_xor_si512( P0, K0 ), m512_zero ); - K1 = _mm512_xor_si512( mm512_ror128_32( + K1 = _mm512_xor_si512( mm512_shuflr128_32( _mm512_aesenc_epi128( K1, m512_zero ) ), K0 ); X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K1 ), m512_zero ); - K2 = _mm512_xor_si512( mm512_ror128_32( + K2 = _mm512_xor_si512( mm512_shuflr128_32( _mm512_aesenc_epi128( K2, m512_zero ) ), K1 ); X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K2 ), m512_zero ); - K3 = _mm512_xor_si512( mm512_ror128_32( + K3 = _mm512_xor_si512( mm512_shuflr128_32( _mm512_aesenc_epi128( K3, m512_zero ) ), K2 ); X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K3 ), m512_zero ); P3 = _mm512_xor_si512( P3, X ); - K4 = _mm512_xor_si512( mm512_ror128_32( + K4 = _mm512_xor_si512( mm512_shuflr128_32( _mm512_aesenc_epi128( K4, m512_zero ) ), K3 ); X = _mm512_aesenc_epi128( _mm512_xor_si512( P2, K4 ), m512_zero ); - K5 = _mm512_xor_si512( mm512_ror128_32( + K5 = _mm512_xor_si512( mm512_shuflr128_32( _mm512_aesenc_epi128( K5, m512_zero ) ), K4 ); X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K5 ), m512_zero ); - K6 = mm512_ror128_32( _mm512_aesenc_epi128( K6, m512_zero ) ); + K6 = mm512_shuflr128_32( _mm512_aesenc_epi128( K6, m512_zero ) ); K6 = _mm512_xor_si512( K6, _mm512_xor_si512( K5, _mm512_set4_epi32( ~ctx->count2, ctx->count3, ctx->count0, ctx->count1 ) ) ); X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K6 ), m512_zero ); - K7= _mm512_xor_si512( mm512_ror128_32( + K7= _mm512_xor_si512( mm512_shuflr128_32( _mm512_aesenc_epi128( K7, m512_zero ) ), K6 ); X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K7 ), m512_zero ); @@ -292,7 +290,7 @@ void shavite512_4way_close( shavite512_4way_context *ctx, void *dst ) uint32_t vp = ctx->ptr>>6; // Terminating byte then zero pad - casti_m512i( buf, vp++ ) = m512_const2_64( 0, 0x0000000000000080 
); + casti_m512i( buf, vp++ ) = m512_const1_i128( 0x0000000000000080 ); // Zero pad full vectors up to count for ( ; vp < 6; vp++ ) @@ -372,13 +370,13 @@ void shavite512_4way_update_close( shavite512_4way_context *ctx, void *dst, if ( vp == 0 ) // empty buf, xevan. { - casti_m512i( buf, 0 ) = m512_const2_64( 0, 0x0000000000000080 ); + casti_m512i( buf, 0 ) = m512_const1_i128( 0x0000000000000080 ); memset_zero_512( (__m512i*)buf + 1, 5 ); ctx->count0 = ctx->count1 = ctx->count2 = ctx->count3 = 0; } else // half full buf, everyone else. { - casti_m512i( buf, vp++ ) = m512_const2_64( 0, 0x0000000000000080 ); + casti_m512i( buf, vp++ ) = m512_const1_i128( 0x0000000000000080 ); memset_zero_512( (__m512i*)buf + vp, 6 - vp ); } @@ -463,13 +461,13 @@ void shavite512_4way_full( shavite512_4way_context *ctx, void *dst, if ( vp == 0 ) // empty buf, xevan. { - casti_m512i( buf, 0 ) = m512_const2_64( 0, 0x0000000000000080 ); + casti_m512i( buf, 0 ) = m512_const1_i128( 0x0000000000000080 ); memset_zero_512( (__m512i*)buf + 1, 5 ); ctx->count0 = ctx->count1 = ctx->count2 = ctx->count3 = 0; } else // half full buf, everyone else. { - casti_m512i( buf, vp++ ) = m512_const2_64( 0, 0x0000000000000080 ); + casti_m512i( buf, vp++ ) = m512_const1_i128( 0x0000000000000080 ); memset_zero_512( (__m512i*)buf + vp, 6 - vp ); } diff --git a/algo/shavite/sph-shavite-aesni.c b/algo/shavite/sph-shavite-aesni.c index e047d778..eaa63067 100644 --- a/algo/shavite/sph-shavite-aesni.c +++ b/algo/shavite/sph-shavite-aesni.c @@ -59,30 +59,6 @@ static const sph_u32 IV512[] = { C32(0xE275EADE), C32(0x502D9FCD), C32(0xB9357178), C32(0x022A4B9A) }; -// Partially rotate elements in two 128 bit vectors a & b as one 256 bit vector -// and return the rotated 128 bit vector a. -// a[3:0] = { b[0], a[3], a[2], a[1] } -#if defined(__SSSE3__) - -#define mm128_ror256hi_1x32( a, b ) _mm_alignr_epi8( b, a, 4 ) - -#else // SSE2 - -#define mm128_ror256hi_1x32( a, b ) \ - _mm_or_si128( _mm_srli_si128( a, 4 ), \ - _mm_slli_si128( b, 12 ) ) - -#endif - -#if defined(__AVX2__) -// 2 way version of above -// a[7:0] = { b[4], a[7], a[6], a[5], b[0], a[3], a[2], a[1] } - -#define mm256_ror2x256hi_1x32( a, b ) \ - _mm256_blend_epi32( mm256_ror256_1x32( a ), \ - mm256_rol256_3x32( b ), 0x88 ) - -#endif static void c512( sph_shavite_big_context *sc, const void *msg ) @@ -101,15 +77,6 @@ c512( sph_shavite_big_context *sc, const void *msg ) // round -// working proof of concept -/* - __m512i K = m512_const1_128( m[0] ); - __m512i X = _mm512_xor_si512( m512_const1_128( p1 ), K ); - X = _mm512_aesenc_epi128( X, m512_zero ); - k00 = _mm512_castsi512_si128( K ); - x = _mm512_castsi512_si128( X ); -*/ - k00 = m[0]; x = _mm_xor_si128( p1, k00 ); x = _mm_aesenc_si128( x, zero ); @@ -144,7 +111,7 @@ c512( sph_shavite_big_context *sc, const void *msg ) for ( r = 0; r < 3; r ++ ) { // round 1, 5, 9 - k00 = mm128_ror_1x32( _mm_aesenc_si128( k00, zero ) ); + k00 = mm128_shuflr_32( _mm_aesenc_si128( k00, zero ) ); k00 = _mm_xor_si128( k00, k13 ); if ( r == 0 ) @@ -153,7 +120,7 @@ c512( sph_shavite_big_context *sc, const void *msg ) x = _mm_xor_si128( p0, k00 ); x = _mm_aesenc_si128( x, zero ); - k01 = mm128_ror_1x32( _mm_aesenc_si128( k01, zero ) ); + k01 = mm128_shuflr_32( _mm_aesenc_si128( k01, zero ) ); k01 = _mm_xor_si128( k01, k00 ); if ( r == 1 ) @@ -162,31 +129,31 @@ c512( sph_shavite_big_context *sc, const void *msg ) x = _mm_xor_si128( x, k01 ); x = _mm_aesenc_si128( x, zero ); - k02 = mm128_ror_1x32( _mm_aesenc_si128( k02, zero ) ); + k02 = mm128_shuflr_32( 
_mm_aesenc_si128( k02, zero ) ); k02 = _mm_xor_si128( k02, k01 ); x = _mm_xor_si128( x, k02 ); x = _mm_aesenc_si128( x, zero ); - k03 = mm128_ror_1x32( _mm_aesenc_si128( k03, zero ) ); + k03 = mm128_shuflr_32( _mm_aesenc_si128( k03, zero ) ); k03 = _mm_xor_si128( k03, k02 ); x = _mm_xor_si128( x, k03 ); x = _mm_aesenc_si128( x, zero ); p3 = _mm_xor_si128( p3, x ); - k10 = mm128_ror_1x32( _mm_aesenc_si128( k10, zero ) ); + k10 = mm128_shuflr_32( _mm_aesenc_si128( k10, zero ) ); k10 = _mm_xor_si128( k10, k03 ); x = _mm_xor_si128( p2, k10 ); x = _mm_aesenc_si128( x, zero ); - k11 = mm128_ror_1x32( _mm_aesenc_si128( k11, zero ) ); + k11 = mm128_shuflr_32( _mm_aesenc_si128( k11, zero ) ); k11 = _mm_xor_si128( k11, k10 ); x = _mm_xor_si128( x, k11 ); x = _mm_aesenc_si128( x, zero ); - k12 = mm128_ror_1x32( _mm_aesenc_si128( k12, zero ) ); + k12 = mm128_shuflr_32( _mm_aesenc_si128( k12, zero ) ); k12 = _mm_xor_si128( k12, k11 ); x = _mm_xor_si128( x, k12 ); x = _mm_aesenc_si128( x, zero ); - k13 = mm128_ror_1x32( _mm_aesenc_si128( k13, zero ) ); + k13 = mm128_shuflr_32( _mm_aesenc_si128( k13, zero ) ); k13 = _mm_xor_si128( k13, k12 ); if ( r == 2 ) @@ -199,31 +166,31 @@ c512( sph_shavite_big_context *sc, const void *msg ) // round 2, 6, 10 - k00 = _mm_xor_si128( k00, mm128_ror256hi_1x32( k12, k13 ) ); + k00 = _mm_xor_si128( k00, _mm_alignr_epi8( k13, k12, 4 ) ); x = _mm_xor_si128( p3, k00 ); x = _mm_aesenc_si128( x, zero ); - k01 = _mm_xor_si128( k01, mm128_ror256hi_1x32( k13, k00 ) ); + k01 = _mm_xor_si128( k01, _mm_alignr_epi8( k00, k13, 4 ) ); x = _mm_xor_si128( x, k01 ); x = _mm_aesenc_si128( x, zero ); - k02 = _mm_xor_si128( k02, mm128_ror256hi_1x32( k00, k01 ) ); + k02 = _mm_xor_si128( k02, _mm_alignr_epi8( k01, k00, 4 ) ); x = _mm_xor_si128( x, k02 ); x = _mm_aesenc_si128( x, zero ); - k03 = _mm_xor_si128( k03, mm128_ror256hi_1x32( k01, k02 ) ); + k03 = _mm_xor_si128( k03, _mm_alignr_epi8( k02, k01, 4 ) ); x = _mm_xor_si128( x, k03 ); x = _mm_aesenc_si128( x, zero ); p2 = _mm_xor_si128( p2, x ); - k10 = _mm_xor_si128( k10, mm128_ror256hi_1x32( k02, k03 ) ); + k10 = _mm_xor_si128( k10, _mm_alignr_epi8( k03, k02, 4 ) ); x = _mm_xor_si128( p1, k10 ); x = _mm_aesenc_si128( x, zero ); - k11 = _mm_xor_si128( k11, mm128_ror256hi_1x32( k03, k10 ) ); + k11 = _mm_xor_si128( k11, _mm_alignr_epi8( k10, k03, 4 ) ); x = _mm_xor_si128( x, k11 ); x = _mm_aesenc_si128( x, zero ); - k12 = _mm_xor_si128( k12, mm128_ror256hi_1x32( k10, k11 ) ); + k12 = _mm_xor_si128( k12, _mm_alignr_epi8( k11, k10, 4 ) ); x = _mm_xor_si128( x, k12 ); x = _mm_aesenc_si128( x, zero ); - k13 = _mm_xor_si128( k13, mm128_ror256hi_1x32( k11, k12 ) ); + k13 = _mm_xor_si128( k13, _mm_alignr_epi8( k12, k11, 4 ) ); x = _mm_xor_si128( x, k13 ); x = _mm_aesenc_si128( x, zero ); @@ -231,38 +198,38 @@ c512( sph_shavite_big_context *sc, const void *msg ) // round 3, 7, 11 - k00 = mm128_ror_1x32( _mm_aesenc_si128( k00, zero ) ); + k00 = mm128_shuflr_32( _mm_aesenc_si128( k00, zero ) ); k00 = _mm_xor_si128( k00, k13 ); x = _mm_xor_si128( p2, k00 ); x = _mm_aesenc_si128( x, zero ); - k01 = mm128_ror_1x32( _mm_aesenc_si128( k01, zero ) ); + k01 = mm128_shuflr_32( _mm_aesenc_si128( k01, zero ) ); k01 = _mm_xor_si128( k01, k00 ); x = _mm_xor_si128( x, k01 ); x = _mm_aesenc_si128( x, zero ); - k02 = mm128_ror_1x32( _mm_aesenc_si128( k02, zero ) ); + k02 = mm128_shuflr_32( _mm_aesenc_si128( k02, zero ) ); k02 = _mm_xor_si128( k02, k01 ); x = _mm_xor_si128( x, k02 ); x = _mm_aesenc_si128( x, zero ); - k03 = mm128_ror_1x32( _mm_aesenc_si128( k03, 
zero ) ); + k03 = mm128_shuflr_32( _mm_aesenc_si128( k03, zero ) ); k03 = _mm_xor_si128( k03, k02 ); x = _mm_xor_si128( x, k03 ); x = _mm_aesenc_si128( x, zero ); p1 = _mm_xor_si128( p1, x ); - k10 = mm128_ror_1x32( _mm_aesenc_si128( k10, zero ) ); + k10 = mm128_shuflr_32( _mm_aesenc_si128( k10, zero ) ); k10 = _mm_xor_si128( k10, k03 ); x = _mm_xor_si128( p0, k10 ); x = _mm_aesenc_si128( x, zero ); - k11 = mm128_ror_1x32( _mm_aesenc_si128( k11, zero ) ); + k11 = mm128_shuflr_32( _mm_aesenc_si128( k11, zero ) ); k11 = _mm_xor_si128( k11, k10 ); x = _mm_xor_si128( x, k11 ); x = _mm_aesenc_si128( x, zero ); - k12 = mm128_ror_1x32( _mm_aesenc_si128( k12, zero ) ); + k12 = mm128_shuflr_32( _mm_aesenc_si128( k12, zero ) ); k12 = _mm_xor_si128( k12, k11 ); x = _mm_xor_si128( x, k12 ); x = _mm_aesenc_si128( x, zero ); - k13 = mm128_ror_1x32( _mm_aesenc_si128( k13, zero ) ); + k13 = mm128_shuflr_32( _mm_aesenc_si128( k13, zero ) ); k13 = _mm_xor_si128( k13, k12 ); x = _mm_xor_si128( x, k13 ); x = _mm_aesenc_si128( x, zero ); @@ -271,31 +238,31 @@ c512( sph_shavite_big_context *sc, const void *msg ) // round 4, 8, 12 - k00 = _mm_xor_si128( k00, mm128_ror256hi_1x32( k12, k13 ) ); + k00 = _mm_xor_si128( k00, _mm_alignr_epi8( k13, k12, 4 ) ); x = _mm_xor_si128( p1, k00 ); x = _mm_aesenc_si128( x, zero ); - k01 = _mm_xor_si128( k01, mm128_ror256hi_1x32( k13, k00 ) ); + k01 = _mm_xor_si128( k01, _mm_alignr_epi8( k00, k13, 4 ) ); x = _mm_xor_si128( x, k01 ); x = _mm_aesenc_si128( x, zero ); - k02 = _mm_xor_si128( k02, mm128_ror256hi_1x32( k00, k01 ) ); + k02 = _mm_xor_si128( k02, _mm_alignr_epi8( k01, k00, 4 ) ); x = _mm_xor_si128( x, k02 ); x = _mm_aesenc_si128( x, zero ); - k03 = _mm_xor_si128( k03, mm128_ror256hi_1x32( k01, k02 ) ); + k03 = _mm_xor_si128( k03, _mm_alignr_epi8( k02, k01, 4 ) ); x = _mm_xor_si128( x, k03 ); x = _mm_aesenc_si128( x, zero ); p0 = _mm_xor_si128( p0, x ); - k10 = _mm_xor_si128( k10, mm128_ror256hi_1x32( k02, k03 ) ); + k10 = _mm_xor_si128( k10, _mm_alignr_epi8( k03, k02, 4 ) ); x = _mm_xor_si128( p3, k10 ); x = _mm_aesenc_si128( x, zero ); - k11 = _mm_xor_si128( k11, mm128_ror256hi_1x32( k03, k10 ) ); + k11 = _mm_xor_si128( k11, _mm_alignr_epi8( k10, k03, 4 ) ); x = _mm_xor_si128( x, k11 ); x = _mm_aesenc_si128( x, zero ); - k12 = _mm_xor_si128( k12, mm128_ror256hi_1x32( k10, k11 ) ); + k12 = _mm_xor_si128( k12, _mm_alignr_epi8( k11, k10, 4 ) ); x = _mm_xor_si128( x, k12 ); x = _mm_aesenc_si128( x, zero ); - k13 = _mm_xor_si128( k13, mm128_ror256hi_1x32( k11, k12 ) ); + k13 = _mm_xor_si128( k13, _mm_alignr_epi8( k12, k11, 4 ) ); x = _mm_xor_si128( x, k13 ); x = _mm_aesenc_si128( x, zero ); @@ -304,39 +271,39 @@ c512( sph_shavite_big_context *sc, const void *msg ) // round 13 - k00 = mm128_ror_1x32( _mm_aesenc_si128( k00, zero ) ); + k00 = mm128_shuflr_32( _mm_aesenc_si128( k00, zero ) ); k00 = _mm_xor_si128( k00, k13 ); x = _mm_xor_si128( p0, k00 ); x = _mm_aesenc_si128( x, zero ); - k01 = mm128_ror_1x32( _mm_aesenc_si128( k01, zero ) ); + k01 = mm128_shuflr_32( _mm_aesenc_si128( k01, zero ) ); k01 = _mm_xor_si128( k01, k00 ); x = _mm_xor_si128( x, k01 ); x = _mm_aesenc_si128( x, zero ); - k02 = mm128_ror_1x32( _mm_aesenc_si128( k02, zero ) ); + k02 = mm128_shuflr_32( _mm_aesenc_si128( k02, zero ) ); k02 = _mm_xor_si128( k02, k01 ); x = _mm_xor_si128( x, k02 ); x = _mm_aesenc_si128( x, zero ); - k03 = mm128_ror_1x32( _mm_aesenc_si128( k03, zero ) ); + k03 = mm128_shuflr_32( _mm_aesenc_si128( k03, zero ) ); k03 = _mm_xor_si128( k03, k02 ); x = _mm_xor_si128( x, k03 ); x = 
_mm_aesenc_si128( x, zero ); p3 = _mm_xor_si128( p3, x ); - k10 = mm128_ror_1x32( _mm_aesenc_si128( k10, zero ) ); + k10 = mm128_shuflr_32( _mm_aesenc_si128( k10, zero ) ); k10 = _mm_xor_si128( k10, k03 ); x = _mm_xor_si128( p2, k10 ); x = _mm_aesenc_si128( x, zero ); - k11 = mm128_ror_1x32( _mm_aesenc_si128( k11, zero ) ); + k11 = mm128_shuflr_32( _mm_aesenc_si128( k11, zero ) ); k11 = _mm_xor_si128( k11, k10 ); x = _mm_xor_si128( x, k11 ); x = _mm_aesenc_si128( x, zero ); - k12 = mm128_ror_1x32( _mm_aesenc_si128( k12, zero ) ); + k12 = mm128_shuflr_32( _mm_aesenc_si128( k12, zero ) ); k12 = _mm_xor_si128( k12, _mm_xor_si128( k11, _mm_set_epi32( ~sc->count2, sc->count3, sc->count0, sc->count1 ) ) ); x = _mm_xor_si128( x, k12 ); x = _mm_aesenc_si128( x, zero ); - k13 = mm128_ror_1x32( _mm_aesenc_si128( k13, zero ) ); + k13 = mm128_shuflr_32( _mm_aesenc_si128( k13, zero ) ); k13 = _mm_xor_si128( k13, k12 ); x = _mm_xor_si128( x, k13 ); x = _mm_aesenc_si128( x, zero ); diff --git a/algo/shavite/sph_shavite.c b/algo/shavite/sph_shavite.c index 41988f97..3d7c8286 100644 --- a/algo/shavite/sph_shavite.c +++ b/algo/shavite/sph_shavite.c @@ -35,7 +35,7 @@ #include "sph_shavite.h" -#if !defined(__AES__) +#if !(defined(__AES__) && defined(__SSSE3__)) #ifdef __cplusplus extern "C"{ diff --git a/algo/shavite/sph_shavite.h b/algo/shavite/sph_shavite.h index cca59726..f30f4dfb 100644 --- a/algo/shavite/sph_shavite.h +++ b/algo/shavite/sph_shavite.h @@ -263,7 +263,7 @@ void sph_shavite384_addbits_and_close( void *cc, unsigned ub, unsigned n, void *dst); //Don't call these directly from application code, use the macros below. -#ifdef __AES__ +#if defined(__AES__) && defined(__SSSE3__) void sph_shavite512_aesni_init(void *cc); void sph_shavite512_aesni(void *cc, const void *data, size_t len); diff --git a/algo/simd/simd-hash-2way.c b/algo/simd/simd-hash-2way.c index f2652f35..856a07f7 100644 --- a/algo/simd/simd-hash-2way.c +++ b/algo/simd/simd-hash-2way.c @@ -747,11 +747,6 @@ void rounds512_4way( uint32_t *state, const uint8_t *msg, uint16_t *fft ) static const m512_v16 code[] = { c1_16_512(185), c1_16_512(233) }; - -// static const m512_v16 code[] = { c1_16(185), c1_16(233), -// c1_16(185), c1_16(233) }; - - S0l = _mm512_xor_si512( S[0], M[0] ); S0h = _mm512_xor_si512( S[1], M[1] ); S1l = _mm512_xor_si512( S[2], M[2] ); @@ -764,11 +759,16 @@ void rounds512_4way( uint32_t *state, const uint8_t *msg, uint16_t *fft ) // targetted, local macros don't need a unique name #define S(i) S##i +#define F_0( B, C, D ) _mm512_ternarylogic_epi32( B, C, D, 0xca ) +#define F_1( B, C, D ) _mm512_ternarylogic_epi32( B, C, D, 0xe8 ) + +/* #define F_0(B, C, D) \ _mm512_xor_si512( _mm512_and_si512( _mm512_xor_si512( C,D ), B ), D ) #define F_1(B, C, D) \ _mm512_or_si512( _mm512_and_si512( D, C ),\ _mm512_and_si512( _mm512_or_si512( D,C ), B ) ) +*/ #define Fl(a,b,c,fun) F_##fun (a##l,b##l,c##l) #define Fh(a,b,c,fun) F_##fun (a##h,b##h,c##h) diff --git a/algo/simd/vector.c b/algo/simd/vector.c index 12692db6..60f0cc76 100644 --- a/algo/simd/vector.c +++ b/algo/simd/vector.c @@ -6,10 +6,6 @@ #define PRINT_SOME 0 -/* JDD all ocurrances of macro X in this file renamed to XX - * due to name conflict - */ - int SupportedLength(int hashbitlen) { if (hashbitlen <= 0 || hashbitlen > 512) return 0; diff --git a/algo/skein/skein-4way.c b/algo/skein/skein-4way.c index a12af435..5a7cdbda 100644 --- a/algo/skein/skein-4way.c +++ b/algo/skein/skein-4way.c @@ -3,7 +3,7 @@ #include #include "skein-hash-4way.h" #include 
"algo/sha/sha-hash-4way.h" -#include "algo/sha/sph_sha2.h" +#include "algo/sha/sha256-hash.h" #if defined (SKEIN_8WAY) @@ -87,7 +87,6 @@ void skeinhash_4way( void *state, const void *input ) uint32_t hash1[16] __attribute__ ((aligned (64))); uint32_t hash2[16] __attribute__ ((aligned (64))); uint32_t hash3[16] __attribute__ ((aligned (64))); - sph_sha256_context ctx_sha256; #else uint32_t vhash32[16*4] __attribute__ ((aligned (64))); sha256_4way_context ctx_sha256; @@ -98,18 +97,12 @@ void skeinhash_4way( void *state, const void *input ) #if defined(__SHA__) dintrlv_4x64( hash0, hash1, hash2, hash3, vhash64, 512 ); - sph_sha256_init( &ctx_sha256 ); - sph_sha256( &ctx_sha256, hash0, 64 ); - sph_sha256_close( &ctx_sha256, hash0 ); - sph_sha256_init( &ctx_sha256 ); - sph_sha256( &ctx_sha256, hash1, 64 ); - sph_sha256_close( &ctx_sha256, hash1 ); - sph_sha256_init( &ctx_sha256 ); - sph_sha256( &ctx_sha256, hash2, 64 ); - sph_sha256_close( &ctx_sha256, hash2 ); - sph_sha256_init( &ctx_sha256 ); - sph_sha256( &ctx_sha256, hash3, 64 ); - sph_sha256_close( &ctx_sha256, hash3 ); + + sha256_full( hash0, hash0, 64 ); + sha256_full( hash1, hash1, 64 ); + sha256_full( hash2, hash2, 64 ); + sha256_full( hash3, hash3, 64 ); + intrlv_4x32( state, hash0, hash1, hash2, hash3, 256 ); #else diff --git a/algo/skein/skein-hash-4way.c b/algo/skein/skein-hash-4way.c index d7cd4705..711d8ac2 100644 --- a/algo/skein/skein-hash-4way.c +++ b/algo/skein/skein-hash-4way.c @@ -309,22 +309,16 @@ static const uint64_t IV512[] = { sc->bcount = bcount; \ } while (0) -// AVX2 all scalar vars are now vectors representing 4 nonces in parallel - #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) #define TFBIG_KINIT_8WAY( k0, k1, k2, k3, k4, k5, k6, k7, k8, t0, t1, t2 ) \ do { \ - k8 = _mm512_xor_si512( _mm512_xor_si512( \ - _mm512_xor_si512( _mm512_xor_si512( k0, k1 ), \ - _mm512_xor_si512( k2, k3 ) ), \ - _mm512_xor_si512( _mm512_xor_si512( k4, k5 ), \ - _mm512_xor_si512( k6, k7 ) ) ), \ - m512_const1_64( 0x1BD11BDAA9FC1A22) ); \ + k8 = mm512_xor3( mm512_xor3( k0, k1, k2 ), mm512_xor3( k3, k4, k5 ), \ + mm512_xor3( k6, k7, m512_const1_64( 0x1BD11BDAA9FC1A22) ));\ t2 = t0 ^ t1; \ } while (0) - + #define TFBIG_ADDKEY_8WAY(w0, w1, w2, w3, w4, w5, w6, w7, k, t, s) \ do { \ w0 = _mm512_add_epi64( w0, SKBI(k,s,0) ); \ @@ -340,7 +334,6 @@ do { \ m512_const1_64( s ) ) ); \ } while (0) - #define TFBIG_MIX_8WAY(x0, x1, rc) \ do { \ x0 = _mm512_add_epi64( x0, x1 ); \ diff --git a/algo/skein/skein.c b/algo/skein/skein.c index 91eb3252..be9bb82b 100644 --- a/algo/skein/skein.c +++ b/algo/skein/skein.c @@ -5,21 +5,18 @@ #include #include #include "sph_skein.h" -#include "algo/sha/sph_sha2.h" +#include "algo/sha/sha256-hash.h" void skeinhash(void *state, const void *input) { uint32_t hash[16] __attribute__ ((aligned (64))); sph_skein512_context ctx_skein; - sph_sha256_context ctx_sha256; sph_skein512_init( &ctx_skein ); sph_skein512( &ctx_skein, input, 80 ); sph_skein512_close( &ctx_skein, hash ); - sph_sha256_init( &ctx_sha256 ); - sph_sha256( &ctx_sha256, hash, 64 ); - sph_sha256_close( &ctx_sha256, hash ); + sha256_full( hash, hash, 64 ); memcpy(state, hash, 32); } @@ -27,8 +24,8 @@ void skeinhash(void *state, const void *input) int scanhash_skein( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ) { - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; uint32_t hash64[8] 
__attribute__ ((aligned (64))); uint32_t endiandata[20] __attribute__ ((aligned (64))); const uint32_t Htarg = ptarget[7]; @@ -36,7 +33,7 @@ int scanhash_skein( struct work *work, uint32_t max_nonce, uint32_t n = first_nonce; int thr_id = mythr->id; // thr_id arg is deprecated - swab32_array( endiandata, pdata, 20 ); + swab32_array( endiandata, pdata, 20 ); do { be32enc(&endiandata[19], n); diff --git a/algo/sm3/sph_sm3.h b/algo/sm3/sph_sm3.h index eab61d36..3d69e55a 100644 --- a/algo/sm3/sph_sm3.h +++ b/algo/sm3/sph_sm3.h @@ -74,7 +74,7 @@ typedef struct { void sm3_init(sm3_ctx_t *ctx); void sm3_update(sm3_ctx_t *ctx, const unsigned char* data, size_t data_len); -void sm3_final(sm3_ctx_t *ctx, unsigned char digest[SM3_DIGEST_LENGTH]); +void sm3_final(sm3_ctx_t *ctx, unsigned char *digest); void sm3_compress(uint32_t digest[8], const unsigned char block[SM3_BLOCK_SIZE]); void sm3(const unsigned char *data, size_t datalen, unsigned char digest[SM3_DIGEST_LENGTH]); diff --git a/algo/swifftx/inttypes.h b/algo/swifftx/inttypes.h index 2b6b941b..9f74eee2 100644 --- a/algo/swifftx/inttypes.h +++ b/algo/swifftx/inttypes.h @@ -18,16 +18,20 @@ #ifndef __INTTYPES_H_ #define __INTTYPES_H_ +#include + /* Use [u]intN_t if you need exactly N bits. XXX - doesn't handle the -mint8 option. */ typedef signed char swift_int8_t; typedef unsigned char swift_uint8_t; - typedef int swift_int16_t; + typedef int32_t swift_int16_t; +// typedef int swift_int16_t; typedef unsigned int swift_uint16_t; - typedef long swift_int32_t; + typedef int32_t swift_int32_t; +// typedef long swift_int32_t; typedef unsigned long swift_uint32_t; typedef long long swift_int64_t; diff --git a/algo/swifftx/stdbool.h b/algo/swifftx/stdbool.h deleted file mode 100644 index d6396c35..00000000 --- a/algo/swifftx/stdbool.h +++ /dev/null @@ -1,47 +0,0 @@ -/* - * Copyright (c) 2000 Jeroen Ruigrok van der Werven - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. 
- * - * $FreeBSD: src/include/stdbool.h,v 1.6 2002/08/16 07:33:14 alfred Exp $ - */ - -#ifndef _STDBOOL_H_ -#define _STDBOOL_H_ - -#define __bool_true_false_are_defined 1 - -#ifndef __cplusplus - -#define false 0 -#define true 1 - -//#define bool _Bool -//#if __STDC_VERSION__ < 199901L && __GNUC__ < 3 -//typedef int _Bool; -//#endif -typedef int bool; - -#endif /* !__cplusplus */ - -#endif /* !_STDBOOL_H_ */ diff --git a/algo/swifftx/swifftx-4way.c b/algo/swifftx/swifftx-4way.c deleted file mode 100644 index cc003f02..00000000 --- a/algo/swifftx/swifftx-4way.c +++ /dev/null @@ -1,912 +0,0 @@ -/////////////////////////////////////////////////////////////////////////////////////////////// -// -// SWIFFTX ANSI C OPTIMIZED 32BIT IMPLEMENTATION FOR NIST SHA-3 COMPETITION -// -// SWIFFTX.c -// -// October 2008 -// -// This is the source file of the OPTIMIZED 32BIT implementation of SWIFFTX hash function. -// SWIFFTX is a candidate function for SHA-3 NIST competition. -// More details about SWIFFTX can be found in the accompanying submission documents. -// -/////////////////////////////////////////////////////////////////////////////////////////////// -#include "swifftx.h" -// See the remarks concerning compatibility issues inside stdint.h. -#include "stdint.h" -// Remove this while using gcc: -//#include "stdbool.h" -#include - -/////////////////////////////////////////////////////////////////////////////////////////////// -// Constants and static tables portion. -/////////////////////////////////////////////////////////////////////////////////////////////// - -// In SWIFFTX we work over Z_257, so this is the modulus and the arithmetic is performed modulo -// this number. -#define FIELD_SIZE 257 - -// The size of FFT we use: -#define N 64 - -#define LOGN 6 - -#define EIGHTH_N (N / 8) - -// The number of FFTS done on the input. -#define M (SWIFFTX_INPUT_BLOCK_SIZE / 8) // 32 - -// Omega is the 128th root of unity in Z_257. -// We choose w = 42. -#define OMEGA 42 - -// The size of the inner FFT lookup table: -#define W 8 - -// Calculates the sum and the difference of two numbers. -// -// Parameters: -// - A: the first operand. After the operation stores the sum of the two operands. -// - B: the second operand. After the operation stores the difference between the first and the -// second operands. -#define ADD_SUB_4WAY( A, B ) \ -{ \ - __m128i temp = B; \ - B = _mm_sub_epi32( A, B ); \ - A = _mm_add_epi32( A, temp ); \ -} - - -//#define ADD_SUB(A, B) {register int temp = (B); B = ((A) - (B)); A = ((A) + (temp));} - -// Quickly reduces an integer modulo 257. -// -// Parameters: -// - A: the input. - -#define Q_REDUCE( A ) ( _mm_sub_epi32( \ - _mm_and_epi32( A, m128_const1_32( 0xff ) ), \ - _mm_srli_epi32( A, 8 ) ) ) - -//#define Q_REDUCE(A) (((A) & 0xff) - ((A) >> 8)) - -// Since we need to do the setup only once, this is the indicator variable: -static bool wasSetupDone = false; - -// This array stores the powers of omegas that correspond to the indices, which are the input -// values. Known also as the "outer FFT twiddle factors". -swift_int16_t multipliers[N]; - -// This array stores the powers of omegas, multiplied by the corresponding values. -// We store this table to save computation time. -// -// To calculate the intermediate value of the compression function (the first out of two -// stages), we multiply the k-th bit of x_i by w^[(2i + 1) * k]. {x_i} is the input to the -// compression function, i is between 0 and 31, x_i is a 64-bit value. 
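In other words, the first-stage term for bit k of the input word x_i is bit_k(x_i) * w^((2i+1)*k) reduced mod 257, with w = OMEGA = 42 and k running over the 64 bits. A naive scalar rendering of that product, for reference only (names are illustrative; the table-driven code below precomputes these powers in fftTable):

    #include <stdint.h>

    // Modular exponentiation in Z_257 by square-and-multiply.
    static int pow_mod257( int base, unsigned exp )
    {
        int r = 1;
        base %= 257;
        while ( exp )
        {
            if ( exp & 1 ) r = ( r * base ) % 257;
            base = ( base * base ) % 257;
            exp >>= 1;
        }
        return r;
    }

    // t[k] = k-th bit of x_i times 42^((2*i+1)*k) mod 257, i in 0..31.
    static void swifft_stage1_ref( uint64_t x_i, int i, int t[64] )
    {
        for ( int k = 0; k < 64; k++ )
            t[k] = (int)( ( x_i >> k ) & 1 )
                 * pow_mod257( 42, (unsigned)( ( 2*i + 1 ) * k ) );
    }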
-// One can see the formula for this (intermediate) stage in the SWIFFT FSE 2008 paper -- -// formula (2), section 3, page 6. -swift_int16_t fftTable[256 * EIGHTH_N]; - -// The A's we use in SWIFFTX shall be random elements of Z_257. -// We generated these A's from the decimal expansion of PI as follows: we converted each -// triple of digits into a decimal number d. If d < (257 * 3) we used (d % 257) for the next A -// element, otherwise move to the next triple of digits in the expansion. This guarntees that -// the A's are random, provided that PI digits are. -const swift_int16_t As[3 * M * N] = -{141, 78, 139, 75, 238, 205, 129, 126, 22, 245, 197, 169, 142, 118, 105, 78, - 50, 149, 29, 208, 114, 34, 85, 117, 67, 148, 86, 256, 25, 49, 133, 93, - 95, 36, 68, 231, 211, 102, 151, 128, 224, 117, 193, 27, 102, 187, 7, 105, - 45, 130, 108, 124, 171, 151, 189, 128, 218, 134, 233, 165, 14, 201, 145, 134, - 52, 203, 91, 96, 197, 69, 134, 213, 136, 93, 3, 249, 141, 16, 210, 73, - 6, 92, 58, 74, 174, 6, 254, 91, 201, 107, 110, 76, 103, 11, 73, 16, - 34, 209, 7, 127, 146, 254, 95, 176, 57, 13, 108, 245, 77, 92, 186, 117, - 124, 97, 105, 118, 34, 74, 205, 122, 235, 53, 94, 238, 210, 227, 183, 11, - 129, 159, 105, 183, 142, 129, 86, 21, 137, 138, 224, 223, 190, 188, 179, 188, - 256, 25, 217, 176, 36, 176, 238, 127, 160, 210, 155, 148, 132, 0, 54, 127, - 145, 6, 46, 85, 243, 95, 173, 123, 178, 207, 211, 183, 224, 173, 146, 35, - 71, 114, 50, 22, 175, 1, 28, 19, 112, 129, 21, 34, 161, 159, 115, 52, - 4, 193, 211, 92, 115, 49, 59, 217, 218, 96, 61, 81, 24, 202, 198, 89, - 45, 128, 8, 51, 253, 87, 171, 35, 4, 188, 171, 10, 3, 137, 238, 73, - 19, 208, 124, 163, 103, 177, 155, 147, 46, 84, 253, 233, 171, 241, 211, 217, - 159, 48, 96, 79, 237, 18, 171, 226, 99, 1, 97, 195, 216, 163, 198, 95, - 0, 201, 65, 228, 21, 153, 124, 230, 44, 35, 44, 108, 85, 156, 249, 207, - 26, 222, 131, 1, 60, 242, 197, 150, 181, 19, 116, 213, 75, 98, 124, 240, - 123, 207, 62, 255, 60, 143, 187, 157, 139, 9, 12, 104, 89, 49, 193, 146, - 104, 196, 181, 82, 198, 253, 192, 191, 255, 122, 212, 104, 47, 20, 132, 208, - 46, 170, 2, 69, 234, 36, 56, 163, 28, 152, 104, 238, 162, 56, 24, 58, - 38, 150, 193, 254, 253, 125, 173, 35, 73, 126, 247, 239, 216, 6, 199, 15, - 90, 12, 97, 122, 9, 84, 207, 127, 219, 72, 58, 30, 29, 182, 41, 192, - 235, 248, 237, 74, 72, 176, 210, 252, 45, 64, 165, 87, 202, 241, 236, 223, - 151, 242, 119, 239, 52, 112, 169, 28, 13, 37, 160, 60, 158, 81, 133, 60, - 16, 145, 249, 192, 173, 217, 214, 93, 141, 184, 54, 34, 161, 104, 157, 95, - 38, 133, 218, 227, 211, 181, 9, 66, 137, 143, 77, 33, 248, 159, 4, 55, - 228, 48, 99, 219, 222, 184, 15, 36, 254, 256, 157, 237, 87, 139, 209, 113, - 232, 85, 126, 167, 197, 100, 103, 166, 64, 225, 125, 205, 117, 135, 84, 128, - 231, 112, 90, 241, 28, 22, 210, 147, 186, 49, 230, 21, 108, 39, 194, 47, - 123, 199, 107, 114, 30, 210, 250, 143, 59, 156, 131, 133, 221, 27, 76, 99, - 208, 250, 78, 12, 211, 141, 95, 81, 195, 106, 8, 232, 150, 212, 205, 221, - 11, 225, 87, 219, 126, 136, 137, 180, 198, 48, 68, 203, 239, 252, 194, 235, - 142, 137, 174, 172, 190, 145, 250, 221, 182, 204, 1, 195, 130, 153, 83, 241, - 161, 239, 211, 138, 11, 169, 155, 245, 174, 49, 10, 166, 16, 130, 181, 139, - 222, 222, 112, 99, 124, 94, 51, 243, 133, 194, 244, 136, 35, 248, 201, 177, - 178, 186, 129, 102, 89, 184, 180, 41, 149, 96, 165, 72, 225, 231, 134, 158, - 199, 28, 249, 16, 225, 195, 10, 210, 164, 252, 138, 8, 35, 152, 213, 199, - 82, 116, 97, 230, 63, 199, 241, 35, 79, 120, 54, 174, 67, 112, 1, 76, - 
69, 222, 194, 96, 82, 94, 25, 228, 196, 145, 155, 136, 228, 234, 46, 101, - 246, 51, 103, 166, 246, 75, 9, 200, 161, 4, 108, 35, 129, 168, 208, 144, - 50, 14, 13, 220, 41, 132, 122, 127, 194, 9, 232, 234, 107, 28, 187, 8, - 51, 141, 97, 221, 225, 9, 113, 170, 166, 102, 135, 22, 231, 185, 227, 187, - 110, 145, 251, 146, 76, 22, 146, 228, 7, 53, 64, 25, 62, 198, 130, 190, - 221, 232, 169, 64, 188, 199, 237, 249, 173, 218, 196, 191, 48, 224, 5, 113, - 100, 166, 160, 21, 191, 197, 61, 162, 149, 171, 240, 183, 129, 231, 123, 204, - 192, 179, 134, 15, 47, 161, 142, 177, 239, 234, 186, 237, 231, 53, 208, 95, - 146, 36, 225, 231, 89, 142, 93, 248, 137, 124, 83, 39, 69, 77, 89, 208, - 182, 48, 85, 147, 244, 164, 246, 68, 38, 190, 220, 35, 202, 91, 157, 151, - 201, 240, 185, 218, 4, 152, 2, 132, 177, 88, 190, 196, 229, 74, 220, 135, - 137, 196, 11, 47, 5, 251, 106, 144, 163, 60, 222, 127, 52, 57, 202, 102, - 64, 140, 110, 206, 23, 182, 39, 245, 1, 163, 157, 186, 163, 80, 7, 230, - 44, 249, 176, 102, 164, 125, 147, 120, 18, 191, 186, 125, 64, 65, 198, 157, - 164, 213, 95, 61, 13, 181, 208, 91, 242, 197, 158, 34, 98, 169, 91, 14, - 17, 93, 157, 17, 65, 30, 183, 6, 139, 58, 255, 108, 100, 136, 209, 144, - 164, 6, 237, 33, 210, 110, 57, 126, 197, 136, 125, 244, 165, 151, 168, 3, - 143, 251, 247, 155, 136, 130, 88, 14, 74, 121, 250, 133, 21, 226, 185, 232, - 118, 132, 89, 64, 204, 161, 2, 70, 224, 159, 35, 204, 123, 180, 13, 52, - 231, 57, 25, 78, 66, 69, 97, 42, 198, 84, 176, 59, 8, 232, 125, 134, - 193, 2, 232, 109, 216, 69, 90, 142, 32, 38, 249, 37, 75, 180, 184, 188, - 19, 47, 120, 87, 146, 70, 232, 120, 191, 45, 33, 38, 19, 248, 110, 110, - 44, 64, 2, 84, 244, 228, 252, 228, 170, 123, 38, 144, 213, 144, 171, 212, - 243, 87, 189, 46, 128, 110, 84, 77, 65, 183, 61, 184, 101, 44, 168, 68, - 14, 106, 105, 8, 227, 211, 166, 39, 152, 43, 52, 254, 197, 55, 119, 89, - 168, 65, 53, 138, 177, 56, 219, 0, 58, 121, 148, 18, 44, 100, 215, 103, - 145, 229, 117, 196, 91, 89, 113, 143, 172, 239, 249, 184, 154, 39, 112, 65, - 204, 42, 84, 38, 155, 151, 151, 16, 100, 87, 174, 162, 145, 147, 149, 186, - 237, 145, 134, 144, 198, 235, 213, 163, 48, 230, 24, 47, 57, 71, 127, 0, - 150, 219, 12, 81, 197, 150, 131, 13, 169, 63, 175, 184, 48, 235, 65, 243, - 149, 200, 163, 254, 202, 114, 247, 67, 143, 250, 126, 228, 80, 130, 216, 214, - 36, 2, 230, 33, 119, 125, 3, 142, 237, 100, 3, 152, 197, 174, 244, 129, - 232, 30, 206, 199, 39, 210, 220, 43, 237, 221, 201, 54, 179, 42, 28, 133, - 246, 203, 198, 177, 0, 28, 194, 85, 223, 109, 155, 147, 221, 60, 133, 108, - 157, 254, 26, 75, 157, 185, 49, 142, 31, 137, 71, 43, 63, 64, 237, 148, - 237, 172, 159, 160, 155, 254, 234, 224, 140, 193, 114, 140, 62, 109, 136, 39, - 255, 8, 158, 146, 128, 49, 222, 96, 57, 209, 180, 249, 202, 127, 113, 231, - 78, 178, 46, 33, 228, 215, 104, 31, 207, 186, 82, 41, 42, 39, 103, 119, - 123, 133, 243, 254, 238, 156, 90, 186, 37, 212, 33, 107, 252, 51, 177, 36, - 237, 76, 159, 245, 93, 214, 97, 56, 190, 38, 160, 94, 105, 222, 220, 158, - 49, 16, 191, 52, 120, 87, 179, 2, 27, 144, 223, 230, 184, 6, 129, 227, - 69, 47, 215, 181, 162, 139, 72, 200, 45, 163, 159, 62, 2, 221, 124, 40, - 159, 242, 35, 208, 179, 166, 98, 67, 178, 68, 143, 225, 178, 146, 187, 159, - 57, 66, 176, 192, 236, 250, 168, 224, 122, 43, 159, 120, 133, 165, 122, 64, - 87, 74, 161, 241, 9, 87, 90, 24, 255, 113, 203, 220, 57, 139, 197, 159, - 31, 151, 27, 140, 77, 162, 7, 27, 84, 228, 187, 220, 53, 126, 162, 242, - 84, 181, 223, 103, 86, 177, 207, 31, 140, 18, 207, 256, 201, 166, 96, 23, - 
233, 103, 197, 84, 161, 75, 59, 149, 138, 154, 119, 92, 16, 53, 116, 97, - 220, 114, 35, 45, 77, 209, 40, 196, 71, 22, 81, 178, 110, 14, 3, 180, - 110, 129, 112, 47, 18, 61, 134, 78, 73, 79, 254, 232, 125, 180, 205, 54, - 220, 119, 63, 89, 181, 52, 77, 109, 151, 77, 80, 207, 144, 25, 20, 6, - 208, 47, 201, 206, 192, 14, 73, 176, 256, 201, 207, 87, 216, 60, 56, 73, - 92, 243, 179, 113, 49, 59, 55, 168, 121, 137, 69, 154, 95, 57, 187, 47, - 129, 4, 15, 92, 6, 116, 69, 196, 48, 134, 84, 81, 111, 56, 38, 176, - 239, 6, 128, 72, 242, 134, 36, 221, 59, 48, 242, 68, 130, 110, 171, 89, - 13, 220, 48, 29, 5, 75, 104, 233, 91, 129, 105, 162, 44, 113, 163, 163, - 85, 147, 190, 111, 197, 80, 213, 153, 81, 68, 203, 33, 161, 165, 10, 61, - 120, 252, 0, 205, 28, 42, 193, 64, 39, 37, 83, 175, 5, 218, 215, 174, - 128, 121, 231, 11, 150, 145, 135, 197, 136, 91, 193, 5, 107, 88, 82, 6, - 4, 188, 256, 70, 40, 2, 167, 57, 169, 203, 115, 254, 215, 172, 84, 80, - 188, 167, 34, 137, 43, 243, 2, 79, 178, 38, 188, 135, 233, 194, 208, 13, - 11, 151, 231, 196, 12, 122, 162, 56, 17, 114, 191, 207, 90, 132, 64, 238, - 187, 6, 198, 176, 240, 88, 118, 236, 15, 226, 166, 22, 193, 229, 82, 246, - 213, 64, 37, 63, 31, 243, 252, 37, 156, 38, 175, 204, 138, 141, 211, 82, - 106, 217, 97, 139, 153, 56, 129, 218, 158, 9, 83, 26, 87, 112, 71, 21, - 250, 5, 65, 141, 68, 116, 231, 113, 10, 218, 99, 205, 201, 92, 157, 4, - 97, 46, 49, 220, 72, 139, 103, 171, 149, 129, 193, 19, 69, 245, 43, 31, - 58, 68, 36, 195, 159, 22, 54, 34, 233, 141, 205, 100, 226, 96, 22, 192, - 41, 231, 24, 79, 234, 138, 30, 120, 117, 216, 172, 197, 172, 107, 86, 29, - 181, 151, 0, 6, 146, 186, 68, 55, 54, 58, 213, 182, 60, 231, 33, 232, - 77, 210, 216, 154, 80, 51, 141, 122, 68, 148, 219, 122, 254, 48, 64, 175, - 41, 115, 62, 243, 141, 81, 119, 121, 5, 68, 121, 88, 239, 29, 230, 90, - 135, 159, 35, 223, 168, 112, 49, 37, 146, 60, 126, 134, 42, 145, 115, 90, - 73, 133, 211, 86, 120, 141, 122, 241, 127, 56, 130, 36, 174, 75, 83, 246, - 112, 45, 136, 194, 201, 115, 1, 156, 114, 167, 208, 12, 176, 147, 32, 170, - 251, 100, 102, 220, 122, 210, 6, 49, 75, 201, 38, 105, 132, 135, 126, 102, - 13, 121, 76, 228, 202, 20, 61, 213, 246, 13, 207, 42, 148, 168, 37, 253, - 34, 94, 141, 185, 18, 234, 157, 109, 104, 64, 250, 125, 49, 236, 86, 48, - 196, 77, 75, 237, 156, 103, 225, 19, 110, 229, 22, 68, 177, 93, 221, 181, - 152, 153, 61, 108, 101, 74, 247, 195, 127, 216, 30, 166, 168, 61, 83, 229, - 120, 156, 96, 120, 201, 124, 43, 27, 253, 250, 120, 143, 89, 235, 189, 243, - 150, 7, 127, 119, 149, 244, 84, 185, 134, 34, 128, 193, 236, 234, 132, 117, - 137, 32, 145, 184, 44, 121, 51, 76, 11, 228, 142, 251, 39, 77, 228, 251, - 41, 58, 246, 107, 125, 187, 9, 240, 35, 8, 11, 162, 242, 220, 158, 163, - 2, 184, 163, 227, 242, 2, 100, 101, 2, 78, 129, 34, 89, 28, 26, 157, - 79, 31, 107, 250, 194, 156, 186, 69, 212, 66, 41, 180, 139, 42, 211, 253, - 256, 239, 29, 129, 104, 248, 182, 68, 1, 189, 48, 226, 36, 229, 3, 158, - 41, 53, 241, 22, 115, 174, 16, 163, 224, 19, 112, 219, 177, 233, 42, 27, - 250, 134, 18, 28, 145, 122, 68, 34, 134, 31, 147, 17, 39, 188, 150, 76, - 45, 42, 167, 249, 12, 16, 23, 182, 13, 79, 121, 3, 70, 197, 239, 44, - 86, 177, 255, 81, 64, 171, 138, 131, 73, 110, 44, 201, 254, 198, 146, 91, - 48, 9, 104, 31, 29, 161, 101, 31, 138, 180, 231, 233, 79, 137, 61, 236, - 140, 15, 249, 218, 234, 119, 99, 195, 110, 137, 237, 207, 8, 31, 45, 24, - 90, 155, 203, 253, 192, 203, 65, 176, 210, 171, 142, 214, 220, 122, 136, 237, - 189, 186, 147, 40, 80, 254, 173, 33, 191, 46, 
192, 26, 108, 255, 228, 205, - 61, 76, 39, 107, 225, 126, 228, 182, 140, 251, 143, 134, 252, 168, 221, 8, - 185, 85, 60, 233, 147, 244, 87, 137, 8, 140, 96, 80, 53, 45, 175, 160, - 124, 189, 112, 37, 144, 19, 70, 17, 170, 242, 2, 3, 28, 95, 120, 199, - 212, 43, 9, 117, 86, 151, 101, 241, 200, 145, 241, 19, 178, 69, 204, 197, - 227, 166, 94, 7, 193, 45, 247, 234, 19, 187, 212, 212, 236, 125, 33, 95, - 198, 121, 122, 103, 77, 155, 235, 49, 25, 237, 249, 11, 162, 7, 238, 24, - 16, 150, 129, 25, 152, 17, 42, 67, 247, 162, 77, 154, 31, 133, 55, 137, - 79, 119, 153, 10, 86, 28, 244, 186, 41, 169, 106, 44, 10, 49, 110, 179, - 32, 133, 155, 244, 61, 70, 131, 168, 170, 39, 231, 252, 32, 69, 92, 238, - 239, 35, 132, 136, 236, 167, 90, 32, 123, 88, 69, 22, 20, 89, 145, 166, - 30, 118, 75, 4, 49, 31, 225, 54, 11, 50, 56, 191, 246, 1, 187, 33, - 119, 107, 139, 68, 19, 240, 131, 55, 94, 113, 31, 252, 12, 179, 121, 2, - 120, 252, 0, 76, 41, 80, 185, 42, 62, 121, 105, 159, 121, 109, 111, 98, - 7, 118, 86, 29, 210, 70, 231, 179, 223, 229, 164, 70, 62, 47, 0, 206, - 204, 178, 168, 120, 224, 166, 99, 25, 103, 63, 246, 224, 117, 204, 75, 124, - 140, 133, 110, 110, 222, 88, 151, 118, 46, 37, 22, 143, 158, 40, 2, 50, - 153, 94, 190, 199, 13, 198, 127, 211, 180, 90, 183, 98, 0, 142, 210, 154, - 100, 187, 67, 231, 202, 100, 198, 235, 252, 160, 247, 124, 247, 14, 121, 221, - 57, 88, 253, 243, 185, 89, 45, 249, 221, 194, 108, 175, 193, 119, 50, 141, - 223, 133, 136, 64, 176, 250, 129, 100, 124, 94, 181, 159, 99, 185, 177, 240, - 135, 42, 103, 52, 202, 208, 143, 186, 193, 103, 154, 237, 102, 88, 225, 161, - 50, 188, 191, 109, 12, 87, 19, 227, 247, 183, 13, 52, 205, 170, 205, 146, - 89, 160, 18, 105, 192, 73, 231, 225, 184, 157, 252, 220, 61, 59, 169, 183, - 221, 20, 141, 20, 158, 101, 245, 7, 245, 225, 118, 137, 84, 55, 19, 27, - 164, 110, 35, 25, 202, 94, 150, 46, 91, 152, 130, 1, 7, 46, 16, 237, - 171, 109, 19, 200, 65, 38, 10, 213, 70, 96, 126, 226, 185, 225, 181, 46, - 10, 165, 11, 123, 53, 158, 22, 147, 64, 22, 227, 69, 182, 237, 197, 37, - 39, 49, 186, 223, 139, 128, 55, 36, 166, 178, 220, 20, 98, 172, 166, 253, - 45, 0, 120, 180, 189, 185, 158, 159, 196, 6, 214, 79, 141, 52, 156, 107, - 5, 109, 142, 159, 33, 64, 190, 133, 95, 132, 95, 202, 160, 63, 186, 23, - 231, 107, 163, 33, 234, 15, 244, 77, 108, 49, 51, 7, 164, 87, 142, 99, - 240, 202, 47, 256, 118, 190, 196, 178, 217, 42, 39, 153, 21, 192, 232, 202, - 14, 82, 179, 64, 233, 4, 219, 10, 133, 78, 43, 144, 146, 216, 202, 81, - 71, 252, 8, 201, 68, 256, 85, 233, 164, 88, 176, 30, 5, 152, 126, 179, - 249, 84, 140, 190, 159, 54, 118, 98, 2, 159, 27, 133, 74, 121, 239, 196, - 71, 149, 119, 135, 102, 20, 87, 112, 44, 75, 221, 3, 151, 158, 5, 98, - 152, 25, 97, 106, 63, 171, 240, 79, 234, 240, 230, 92, 76, 70, 173, 196, - 36, 225, 218, 133, 64, 240, 150, 41, 146, 66, 133, 51, 134, 73, 170, 238, - 140, 90, 45, 89, 46, 147, 96, 169, 174, 174, 244, 151, 90, 40, 32, 74, - 38, 154, 246, 57, 31, 14, 189, 151, 83, 243, 197, 183, 220, 185, 53, 225, - 51, 106, 188, 208, 222, 248, 93, 13, 93, 215, 131, 25, 142, 185, 113, 222, - 131, 215, 149, 50, 159, 85, 32, 5, 205, 192, 2, 227, 42, 214, 197, 42, - 126, 182, 68, 123, 109, 36, 237, 179, 170, 199, 77, 256, 5, 128, 214, 243, - 137, 177, 170, 253, 179, 180, 153, 236, 100, 196, 216, 231, 198, 37, 192, 80, - 121, 221, 246, 1, 16, 246, 29, 78, 64, 148, 124, 38, 96, 125, 28, 20, - 48, 51, 73, 187, 139, 208, 98, 253, 221, 188, 84, 129, 1, 205, 95, 205, - 117, 79, 71, 126, 134, 237, 19, 184, 137, 125, 129, 178, 223, 54, 188, 112, - 
30, 7, 225, 228, 205, 184, 233, 87, 117, 22, 58, 10, 8, 42, 2, 114, - 254, 19, 17, 13, 150, 92, 233, 179, 63, 12, 60, 171, 127, 35, 50, 5, - 195, 113, 241, 25, 249, 184, 166, 44, 221, 35, 151, 116, 8, 54, 195, 89, - 218, 186, 132, 5, 41, 89, 226, 177, 11, 41, 87, 172, 5, 23, 20, 59, - 228, 94, 76, 33, 137, 43, 151, 221, 61, 232, 4, 120, 93, 217, 80, 228, - 228, 6, 58, 25, 62, 84, 91, 48, 209, 20, 247, 243, 55, 106, 80, 79, - 235, 34, 20, 180, 146, 2, 236, 13, 236, 206, 243, 222, 204, 83, 148, 213, - 214, 117, 237, 98, 0, 90, 204, 168, 32, 41, 126, 67, 191, 74, 27, 255, - 26, 75, 240, 113, 185, 105, 167, 154, 112, 67, 151, 63, 161, 134, 239, 176, - 42, 87, 249, 130, 45, 242, 17, 100, 107, 120, 212, 218, 237, 76, 231, 162, - 175, 172, 118, 155, 92, 36, 124, 17, 121, 71, 13, 9, 82, 126, 147, 142, - 218, 148, 138, 80, 163, 106, 164, 123, 140, 129, 35, 42, 186, 154, 228, 214, - 75, 73, 8, 253, 42, 153, 232, 164, 95, 24, 110, 90, 231, 197, 90, 196, - 57, 164, 252, 181, 31, 7, 97, 256, 35, 77, 200, 212, 99, 179, 92, 227, - 17, 180, 49, 176, 9, 188, 13, 182, 93, 44, 128, 219, 134, 92, 151, 6, - 23, 126, 200, 109, 66, 30, 140, 180, 146, 134, 67, 200, 7, 9, 223, 168, - 186, 221, 3, 154, 150, 165, 43, 53, 138, 27, 86, 213, 235, 160, 70, 2, - 240, 20, 89, 212, 84, 141, 168, 246, 183, 227, 30, 167, 138, 185, 253, 83, - 52, 143, 236, 94, 59, 65, 89, 218, 194, 157, 164, 156, 111, 95, 202, 168, - 245, 256, 151, 28, 222, 194, 72, 130, 217, 134, 253, 77, 246, 100, 76, 32, - 254, 174, 182, 193, 14, 237, 74, 1, 74, 26, 135, 216, 152, 208, 112, 38, - 181, 62, 25, 71, 61, 234, 254, 97, 191, 23, 92, 256, 190, 205, 6, 16, - 134, 147, 210, 219, 148, 59, 73, 185, 24, 247, 174, 143, 116, 220, 128, 144, - 111, 126, 101, 98, 130, 136, 101, 102, 69, 127, 24, 168, 146, 226, 226, 207, - 176, 122, 149, 254, 134, 196, 22, 151, 197, 21, 50, 205, 116, 154, 65, 116, - 177, 224, 127, 77, 177, 159, 225, 69, 176, 54, 100, 104, 140, 8, 11, 126, - 11, 188, 185, 159, 107, 16, 254, 142, 80, 28, 5, 157, 104, 57, 109, 82, - 102, 80, 173, 242, 238, 207, 57, 105, 237, 160, 59, 189, 189, 199, 26, 11, - 190, 156, 97, 118, 20, 12, 254, 189, 165, 147, 142, 199, 5, 213, 64, 133, - 108, 217, 133, 60, 94, 28, 116, 136, 47, 165, 125, 42, 183, 143, 14, 129, - 223, 70, 212, 205, 181, 180, 3, 201, 182, 46, 57, 104, 239, 60, 99, 181, - 220, 231, 45, 79, 156, 89, 149, 143, 190, 103, 153, 61, 235, 73, 136, 20, - 89, 243, 16, 130, 247, 141, 134, 93, 80, 68, 85, 84, 8, 72, 194, 4, - 242, 110, 19, 133, 199, 70, 172, 92, 132, 254, 67, 74, 36, 94, 13, 90, - 154, 184, 9, 109, 118, 243, 214, 71, 36, 95, 0, 90, 201, 105, 112, 215, - 69, 196, 224, 210, 236, 242, 155, 211, 37, 134, 69, 113, 157, 97, 68, 26, - 230, 149, 219, 180, 20, 76, 172, 145, 154, 40, 129, 8, 93, 56, 162, 124, - 207, 233, 105, 19, 3, 183, 155, 134, 8, 244, 213, 78, 139, 88, 156, 37, - 51, 152, 111, 102, 112, 250, 114, 252, 201, 241, 133, 24, 136, 153, 5, 90, - 210, 197, 216, 24, 131, 17, 147, 246, 13, 86, 3, 253, 179, 237, 101, 114, - 243, 191, 207, 2, 220, 133, 244, 53, 87, 125, 154, 158, 197, 20, 8, 83, - 32, 191, 38, 241, 204, 22, 168, 59, 217, 123, 162, 82, 21, 50, 130, 89, - 239, 253, 195, 56, 253, 74, 147, 125, 234, 199, 250, 28, 65, 193, 22, 237, - 193, 94, 58, 229, 139, 176, 69, 42, 179, 164, 150, 168, 246, 214, 86, 174, - 59, 117, 15, 19, 76, 37, 214, 238, 153, 226, 154, 45, 109, 114, 198, 107, - 45, 70, 238, 196, 142, 252, 244, 71, 123, 136, 134, 188, 99, 132, 25, 42, - 240, 0, 196, 33, 26, 124, 256, 145, 27, 102, 153, 35, 28, 132, 221, 167, - 138, 133, 41, 170, 95, 224, 40, 139, 
239, 153, 1, 106, 255, 106, 170, 163, - 127, 44, 155, 232, 194, 119, 232, 117, 239, 143, 108, 41, 3, 9, 180, 256, - 144, 113, 133, 200, 79, 69, 128, 216, 31, 50, 102, 209, 249, 136, 150, 154, - 182, 51, 228, 39, 127, 142, 87, 15, 94, 92, 187, 245, 31, 236, 64, 58, - 114, 11, 17, 166, 189, 152, 218, 34, 123, 39, 58, 37, 153, 91, 63, 121, - 31, 34, 12, 254, 106, 96, 171, 14, 155, 247, 214, 69, 24, 98, 3, 204, - 202, 194, 207, 30, 253, 44, 119, 70, 14, 96, 82, 250, 63, 6, 232, 38, - 89, 144, 102, 191, 82, 254, 20, 222, 96, 162, 110, 6, 159, 58, 200, 226, - 98, 128, 42, 70, 84, 247, 128, 211, 136, 54, 143, 166, 60, 118, 99, 218, - 27, 193, 85, 81, 219, 223, 46, 41, 23, 233, 152, 222, 36, 236, 54, 181, - 56, 50, 4, 207, 129, 92, 78, 88, 197, 251, 131, 105, 31, 172, 38, 131, - 19, 204, 129, 47, 227, 106, 202, 183, 23, 6, 77, 224, 102, 147, 11, 218, - 131, 132, 60, 192, 208, 223, 236, 23, 103, 115, 89, 18, 185, 171, 70, 174, - 139, 0, 100, 160, 221, 11, 228, 60, 12, 122, 114, 12, 157, 235, 148, 57, - 83, 62, 173, 131, 169, 126, 85, 99, 93, 243, 81, 80, 29, 245, 206, 82, - 236, 227, 166, 14, 230, 213, 144, 97, 27, 111, 99, 164, 105, 150, 89, 111, - 252, 118, 140, 232, 120, 183, 137, 213, 232, 157, 224, 33, 134, 118, 186, 80, - 159, 2, 186, 193, 54, 242, 25, 237, 232, 249, 226, 213, 90, 149, 90, 160, - 118, 69, 64, 37, 10, 183, 109, 246, 30, 52, 219, 69, 189, 26, 116, 220, - 50, 244, 243, 243, 139, 137, 232, 98, 38, 45, 256, 143, 171, 101, 73, 238, - 123, 45, 194, 167, 250, 123, 12, 29, 136, 237, 141, 21, 89, 96, 199, 44, - 8, 214, 208, 17, 113, 41, 137, 26, 166, 155, 89, 85, 54, 58, 97, 160, - 50, 239, 58, 71, 21, 157, 139, 12, 37, 198, 182, 131, 149, 134, 16, 204, - 164, 181, 248, 166, 52, 216, 136, 201, 37, 255, 187, 240, 5, 101, 147, 231, - 14, 163, 253, 134, 146, 216, 8, 54, 224, 90, 220, 195, 75, 215, 186, 58, - 71, 204, 124, 105, 239, 53, 16, 85, 69, 163, 195, 223, 33, 38, 69, 88, - 88, 203, 99, 55, 176, 13, 156, 204, 236, 99, 194, 134, 75, 247, 126, 129, - 160, 124, 233, 206, 139, 144, 154, 45, 233, 51, 206, 61, 60, 55, 205, 107, - 84, 108, 96, 188, 203, 31, 89, 20, 115, 144, 137, 90, 237, 78, 231, 185, - 120, 217, 1, 176, 169, 30, 155, 176, 100, 113, 53, 42, 193, 108, 14, 121, - 176, 158, 137, 92, 178, 44, 110, 249, 108, 234, 94, 101, 128, 12, 250, 173, - 72, 202, 232, 66, 139, 152, 189, 18, 32, 197, 9, 238, 246, 55, 119, 183, - 196, 119, 113, 247, 191, 100, 200, 245, 46, 16, 234, 112, 136, 116, 232, 48, - 176, 108, 11, 237, 14, 153, 93, 177, 124, 72, 67, 121, 135, 143, 45, 18, - 97, 251, 184, 172, 136, 55, 213, 8, 103, 12, 221, 212, 13, 160, 116, 91, - 237, 127, 218, 190, 103, 131, 77, 82, 36, 100, 22, 252, 79, 69, 54, 26, - 65, 182, 115, 142, 247, 20, 89, 81, 188, 244, 27, 120, 240, 248, 13, 230, - 67, 133, 32, 201, 129, 87, 9, 245, 66, 88, 166, 34, 46, 184, 119, 218, - 144, 235, 163, 40, 138, 134, 127, 217, 64, 227, 116, 67, 55, 202, 130, 48, - 199, 42, 251, 112, 124, 153, 123, 194, 243, 49, 250, 12, 78, 157, 167, 134, - 210, 73, 156, 102, 21, 88, 216, 123, 45, 11, 208, 18, 47, 187, 20, 43, - 3, 180, 124, 2, 136, 176, 77, 111, 138, 139, 91, 225, 126, 8, 74, 255, - 88, 192, 193, 239, 138, 204, 139, 194, 166, 130, 252, 184, 140, 168, 30, 177, - 121, 98, 131, 124, 69, 171, 75, 49, 184, 34, 76, 122, 202, 115, 184, 253, - 120, 182, 33, 251, 1, 74, 216, 217, 243, 168, 70, 162, 119, 158, 197, 198, - 61, 89, 7, 5, 54, 199, 211, 170, 23, 226, 44, 247, 165, 195, 7, 225, - 91, 23, 50, 15, 51, 208, 106, 94, 12, 31, 43, 112, 146, 139, 246, 182, - 113, 1, 97, 15, 66, 2, 51, 76, 164, 184, 237, 200, 
218, 176, 72, 98, - 33, 135, 38, 147, 140, 229, 50, 94, 81, 187, 129, 17, 238, 168, 146, 203, - 181, 99, 164, 3, 104, 98, 255, 189, 114, 142, 86, 102, 229, 102, 80, 129, - 64, 84, 79, 161, 81, 156, 128, 111, 164, 197, 18, 15, 55, 196, 198, 191, - 28, 113, 117, 96, 207, 253, 19, 158, 231, 13, 53, 130, 252, 211, 58, 180, - 212, 142, 7, 219, 38, 81, 62, 109, 167, 113, 33, 56, 97, 185, 157, 130, - 186, 129, 119, 182, 196, 26, 54, 110, 65, 170, 166, 236, 30, 22, 162, 0, - 106, 12, 248, 33, 48, 72, 159, 17, 76, 244, 172, 132, 89, 171, 196, 76, - 254, 166, 76, 218, 226, 3, 52, 220, 238, 181, 179, 144, 225, 23, 3, 166, - 158, 35, 228, 154, 204, 23, 203, 71, 134, 189, 18, 168, 236, 141, 117, 138, - 2, 132, 78, 57, 154, 21, 250, 196, 184, 40, 161, 40, 10, 178, 134, 120, - 132, 123, 101, 82, 205, 121, 55, 140, 231, 56, 231, 71, 206, 246, 198, 150, - 146, 192, 45, 105, 242, 1, 125, 18, 176, 46, 222, 122, 19, 80, 113, 133, - 131, 162, 81, 51, 98, 168, 247, 161, 139, 39, 63, 162, 22, 153, 170, 92, - 91, 130, 174, 200, 45, 112, 99, 164, 132, 184, 191, 186, 200, 167, 86, 145, - 167, 227, 130, 44, 12, 158, 172, 249, 204, 17, 54, 249, 16, 200, 21, 174, - 67, 223, 105, 201, 50, 36, 133, 203, 244, 131, 228, 67, 29, 195, 91, 91, - 55, 107, 167, 154, 170, 137, 218, 183, 169, 61, 99, 175, 128, 23, 142, 183, - 66, 255, 59, 187, 66, 85, 212, 109, 168, 82, 16, 43, 67, 139, 114, 176, - 216, 255, 130, 94, 152, 79, 183, 64, 100, 23, 214, 82, 34, 230, 48, 15, - 242, 130, 50, 241, 81, 32, 5, 125, 183, 182, 184, 99, 248, 109, 159, 210, - 226, 61, 119, 129, 39, 149, 78, 214, 107, 78, 147, 124, 228, 18, 143, 188, - 84, 180, 233, 119, 64, 39, 158, 133, 177, 168, 6, 150, 80, 117, 150, 56, - 49, 72, 49, 37, 30, 242, 49, 142, 33, 156, 34, 44, 44, 72, 58, 22, - 249, 46, 168, 80, 25, 196, 64, 174, 97, 179, 244, 134, 213, 105, 63, 151, - 21, 90, 168, 90, 245, 28, 157, 65, 250, 232, 188, 27, 99, 160, 156, 127, - 68, 193, 10, 80, 205, 36, 138, 229, 12, 223, 70, 169, 251, 41, 48, 94, - 41, 177, 99, 256, 158, 0, 6, 83, 231, 191, 120, 135, 157, 146, 218, 213, - 160, 7, 47, 234, 98, 211, 79, 225, 179, 95, 175, 105, 185, 79, 115, 0, - 104, 14, 65, 124, 15, 188, 52, 9, 253, 27, 132, 137, 13, 127, 75, 238, - 185, 253, 33, 8, 52, 157, 164, 68, 232, 188, 69, 28, 209, 233, 5, 129, - 216, 90, 252, 212, 33, 200, 222, 9, 112, 15, 43, 36, 226, 114, 15, 249, - 217, 8, 148, 22, 147, 23, 143, 67, 222, 116, 235, 250, 212, 210, 39, 142, - 108, 64, 209, 83, 73, 66, 99, 34, 17, 29, 45, 151, 244, 114, 28, 241, - 144, 208, 146, 179, 132, 89, 217, 198, 252, 219, 205, 165, 75, 107, 11, 173, - 76, 6, 196, 247, 152, 216, 248, 91, 209, 178, 57, 250, 174, 60, 79, 123, - 18, 135, 9, 241, 230, 159, 184, 68, 156, 251, 215, 9, 113, 234, 75, 235, - 103, 194, 205, 129, 230, 45, 96, 73, 157, 20, 200, 212, 212, 228, 161, 7, - 231, 228, 108, 43, 198, 87, 140, 140, 4, 182, 164, 3, 53, 104, 250, 213, - 85, 38, 89, 61, 52, 187, 35, 204, 86, 249, 100, 71, 248, 213, 163, 215, - 66, 106, 252, 129, 40, 111, 47, 24, 186, 221, 85, 205, 199, 237, 122, 181, - 32, 46, 182, 135, 33, 251, 142, 34, 208, 242, 128, 255, 4, 234, 15, 33, - 167, 222, 32, 186, 191, 34, 255, 244, 98, 240, 228, 204, 30, 142, 32, 70, - 69, 83, 110, 151, 10, 243, 141, 21, 223, 69, 61, 37, 59, 209, 102, 114, - 223, 33, 129, 254, 255, 103, 86, 247, 235, 72, 126, 177, 102, 226, 102, 30, - 149, 221, 62, 247, 251, 120, 163, 173, 57, 202, 204, 24, 39, 106, 120, 143, - 202, 176, 191, 147, 37, 38, 51, 133, 47, 245, 157, 132, 154, 71, 183, 111, - 30, 180, 18, 202, 82, 96, 170, 91, 157, 181, 212, 140, 256, 8, 196, 121, 
- 149, 79, 66, 127, 113, 78, 4, 197, 84, 256, 111, 222, 102, 63, 228, 104, - 136, 223, 67, 193, 93, 154, 249, 83, 204, 101, 200, 234, 84, 252, 230, 195, - 43, 140, 120, 242, 89, 63, 166, 233, 209, 94, 43, 170, 126, 5, 205, 78, - 112, 80, 143, 151, 146, 248, 137, 203, 45, 183, 61, 1, 155, 8, 102, 59, - 68, 212, 230, 61, 254, 191, 128, 223, 176, 123, 229, 27, 146, 120, 96, 165, - 213, 12, 232, 40, 186, 225, 66, 105, 200, 195, 212, 110, 237, 238, 151, 19, - 12, 171, 150, 82, 7, 228, 79, 52, 15, 78, 62, 43, 21, 154, 114, 21, - 12, 212, 256, 232, 125, 127, 5, 51, 37, 252, 136, 13, 47, 195, 168, 191, - 231, 55, 57, 251, 214, 116, 15, 86, 210, 41, 249, 242, 119, 27, 250, 203, - 107, 69, 90, 43, 206, 154, 127, 54, 100, 78, 187, 54, 244, 177, 234, 167, - 202, 136, 209, 171, 69, 114, 133, 173, 26, 139, 78, 141, 128, 32, 124, 39, - 45, 218, 96, 68, 90, 44, 67, 62, 83, 190, 188, 256, 103, 42, 102, 64, - 249, 0, 141, 11, 61, 69, 70, 66, 233, 237, 29, 200, 251, 157, 71, 51, - 64, 133, 113, 76, 35, 125, 76, 137, 217, 145, 35, 69, 226, 180, 56, 249, - 156, 163, 176, 237, 81, 54, 85, 169, 115, 211, 129, 70, 248, 40, 252, 192, - 194, 101, 247, 8, 181, 124, 217, 191, 194, 93, 99, 127, 117, 177, 144, 151, - 228, 121, 32, 11, 89, 81, 26, 29, 183, 76, 249, 132, 179, 70, 34, 102, - 20, 66, 87, 63, 124, 205, 174, 177, 87, 219, 73, 218, 91, 87, 176, 72, - 15, 211, 47, 61, 251, 165, 39, 247, 146, 70, 150, 57, 1, 212, 36, 162, - 39, 38, 16, 216, 3, 50, 116, 200, 32, 234, 77, 181, 155, 19, 90, 188, - 36, 6, 254, 46, 46, 203, 25, 230, 181, 196, 4, 151, 225, 65, 122, 216, - 168, 86, 158, 131, 136, 16, 49, 102, 233, 64, 154, 88, 228, 52, 146, 69, - 93, 157, 243, 121, 70, 209, 126, 213, 88, 145, 236, 65, 70, 96, 204, 47, - 10, 200, 77, 8, 103, 150, 48, 153, 5, 37, 52, 235, 209, 31, 181, 126, - 83, 142, 224, 140, 6, 32, 200, 171, 160, 179, 115, 229, 75, 194, 208, 39, - 59, 223, 52, 247, 38, 197, 135, 1, 6, 189, 106, 114, 168, 5, 211, 222, - 44, 63, 90, 160, 116, 172, 170, 133, 125, 138, 39, 131, 23, 178, 10, 214, - 36, 93, 28, 59, 68, 17, 123, 25, 255, 184, 204, 102, 194, 214, 129, 94, - 159, 245, 112, 141, 62, 11, 61, 197, 124, 221, 205, 11, 79, 71, 201, 54, - 58, 150, 29, 121, 87, 46, 240, 201, 68, 20, 194, 209, 47, 152, 158, 174, - 193, 164, 120, 255, 216, 165, 247, 58, 85, 130, 220, 23, 122, 223, 188, 98, - 21, 70, 72, 170, 150, 237, 76, 143, 112, 238, 206, 146, 215, 110, 4, 250, - 68, 44, 174, 177, 30, 98, 143, 241, 180, 127, 113, 48, 0, 1, 179, 199, - 59, 106, 201, 114, 29, 86, 173, 133, 217, 44, 200, 141, 107, 172, 16, 60, - 82, 58, 239, 94, 141, 234, 186, 235, 109, 173, 249, 139, 141, 59, 100, 248, - 84, 144, 49, 160, 51, 207, 164, 103, 74, 97, 146, 202, 193, 125, 168, 134, - 236, 111, 135, 121, 59, 145, 168, 200, 181, 173, 109, 2, 255, 6, 9, 245, - 90, 202, 214, 143, 121, 65, 85, 232, 132, 77, 228, 84, 26, 54, 184, 15, - 161, 29, 177, 79, 43, 0, 156, 184, 163, 165, 62, 90, 179, 93, 45, 239, - 1, 16, 120, 189, 127, 47, 74, 166, 20, 214, 233, 226, 89, 217, 229, 26, - 156, 53, 162, 60, 21, 3, 192, 72, 111, 51, 53, 101, 181, 208, 88, 82, - 179, 160, 219, 113, 240, 108, 43, 224, 162, 147, 62, 14, 95, 81, 205, 4, - 160, 177, 225, 115, 29, 69, 235, 168, 148, 29, 128, 114, 124, 129, 172, 165, - 215, 231, 214, 86, 160, 44, 157, 91, 248, 183, 73, 164, 56, 181, 162, 92, - 141, 118, 127, 240, 196, 77, 0, 9, 244, 79, 250, 100, 195, 25, 255, 85, - 94, 35, 212, 137, 107, 34, 110, 20, 200, 104, 17, 32, 231, 43, 150, 159, - 231, 216, 223, 190, 226, 109, 162, 197, 87, 92, 224, 11, 111, 73, 60, 225, - 238, 73, 246, 169, 19, 217, 
119, 38, 121, 118, 70, 82, 99, 241, 110, 67, - 31, 76, 146, 215, 124, 240, 31, 103, 139, 224, 75, 160, 31, 78, 93, 4, - 64, 9, 103, 223, 6, 227, 119, 85, 116, 81, 21, 43, 46, 206, 234, 132, - 85, 99, 22, 131, 135, 97, 86, 13, 234, 188, 21, 14, 89, 169, 207, 238, - 219, 177, 190, 72, 157, 41, 114, 140, 92, 141, 186, 1, 63, 107, 225, 184, - 118, 150, 153, 254, 241, 106, 120, 210, 104, 144, 151, 161, 88, 206, 125, 164, - 15, 211, 173, 49, 146, 241, 71, 36, 58, 201, 46, 27, 33, 187, 91, 162, - 117, 19, 210, 213, 187, 97, 193, 50, 190, 114, 217, 60, 61, 167, 207, 213, - 213, 53, 135, 34, 156, 91, 115, 119, 46, 99, 242, 1, 90, 52, 198, 227, - 201, 91, 216, 146, 210, 82, 121, 38, 73, 133, 182, 193, 132, 148, 246, 75, - 109, 157, 179, 113, 176, 134, 205, 159, 148, 58, 103, 171, 132, 156, 133, 147, - 161, 231, 39, 100, 175, 97, 125, 28, 183, 129, 135, 191, 202, 181, 29, 218, - 43, 104, 148, 203, 189, 204, 4, 182, 169, 1, 134, 122, 141, 202, 13, 187, - 177, 112, 162, 35, 231, 6, 8, 241, 99, 6, 191, 45, 113, 113, 101, 104}; - -// The S-Box we use for further linearity breaking. -// We created it by taking the digits of decimal expansion of e. -// The code that created it can be found in 'ProduceRandomSBox.c'. -unsigned char SBox[256] = { -//0 1 2 3 4 5 6 7 8 9 A B C D E F -0x7d, 0xd1, 0x70, 0x0b, 0xfa, 0x39, 0x18, 0xc3, 0xf3, 0xbb, 0xa7, 0xd4, 0x84, 0x25, 0x3b, 0x3c, // 0 -0x2c, 0x15, 0x69, 0x9a, 0xf9, 0x27, 0xfb, 0x02, 0x52, 0xba, 0xa8, 0x4b, 0x20, 0xb5, 0x8b, 0x3a, // 1 -0x88, 0x8e, 0x26, 0xcb, 0x71, 0x5e, 0xaf, 0xad, 0x0c, 0xac, 0xa1, 0x93, 0xc6, 0x78, 0xce, 0xfc, // 2 -0x2a, 0x76, 0x17, 0x1f, 0x62, 0xc2, 0x2e, 0x99, 0x11, 0x37, 0x65, 0x40, 0xfd, 0xa0, 0x03, 0xc1, // 3 -0xca, 0x48, 0xe2, 0x9b, 0x81, 0xe4, 0x1c, 0x01, 0xec, 0x68, 0x7a, 0x5a, 0x50, 0xf8, 0x0e, 0xa3, // 4 -0xe8, 0x61, 0x2b, 0xa2, 0xeb, 0xcf, 0x8c, 0x3d, 0xb4, 0x95, 0x13, 0x08, 0x46, 0xab, 0x91, 0x7b, // 5 -0xea, 0x55, 0x67, 0x9d, 0xdd, 0x29, 0x6a, 0x8f, 0x9f, 0x22, 0x4e, 0xf2, 0x57, 0xd2, 0xa9, 0xbd, // 6 -0x38, 0x16, 0x5f, 0x4c, 0xf7, 0x9e, 0x1b, 0x2f, 0x30, 0xc7, 0x41, 0x24, 0x5c, 0xbf, 0x05, 0xf6, // 7 -0x0a, 0x31, 0xa5, 0x45, 0x21, 0x33, 0x6b, 0x6d, 0x6c, 0x86, 0xe1, 0xa4, 0xe6, 0x92, 0x9c, 0xdf, // 8 -0xe7, 0xbe, 0x28, 0xe3, 0xfe, 0x06, 0x4d, 0x98, 0x80, 0x04, 0x96, 0x36, 0x3e, 0x14, 0x4a, 0x34, // 9 -0xd3, 0xd5, 0xdb, 0x44, 0xcd, 0xf5, 0x54, 0xdc, 0x89, 0x09, 0x90, 0x42, 0x87, 0xff, 0x7e, 0x56, // A -0x5d, 0x59, 0xd7, 0x23, 0x75, 0x19, 0x97, 0x73, 0x83, 0x64, 0x53, 0xa6, 0x1e, 0xd8, 0xb0, 0x49, // B -0x3f, 0xef, 0xbc, 0x7f, 0x43, 0xf0, 0xc9, 0x72, 0x0f, 0x63, 0x79, 0x2d, 0xc0, 0xda, 0x66, 0xc8, // C -0x32, 0xde, 0x47, 0x07, 0xb8, 0xe9, 0x1d, 0xc4, 0x85, 0x74, 0x82, 0xcc, 0x60, 0x51, 0x77, 0x0d, // D -0xaa, 0x35, 0xed, 0x58, 0x7c, 0x5b, 0xb9, 0x94, 0x6e, 0x8d, 0xb1, 0xc5, 0xb7, 0xee, 0xb6, 0xae, // E -0x10, 0xe0, 0xd6, 0xd9, 0xe5, 0x4f, 0xf1, 0x12, 0x00, 0xd0, 0xf4, 0x1a, 0x6f, 0x8a, 0xb3, 0xb2 }; // F - -/////////////////////////////////////////////////////////////////////////////////////////////// -// -// Helper functions definition portion. -// -/////////////////////////////////////////////////////////////////////////////////////////////// - -// Don't vectorize, move decl to header file - -// Translates an input array with values in base 257 to output array with values in base 256. -// Returns the carry bit. -// -// Parameters: -// - input: the input array of size EIGHTH_N. Each value in the array is a number in Z_257. -// The MSB is assumed to be the last one in the array. -// - output: the input array encoded in base 256. 
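/*
 * A minimal standalone reference for the base change described above,
 * assuming the digit order stated in the comment (least significant value
 * first, MSB last) and EIGHTH_N = 8 as defined earlier in this file.
 * It is an illustrative sketch, not part of the patch: it evaluates the
 * base-257 number with 128-bit arithmetic (a GCC extension) and re-emits
 * it in base 256, which is what the optimized pair-wise code further
 * below computes with 32-bit arithmetic.  Names here are stand-ins.
 */
#include <stdint.h>

static unsigned char base257_to_base256_ref( const uint16_t in[8],
                                             unsigned char out[8] )
{
   unsigned __int128 v = 0;

   // Accumulate v = in[0] + in[1]*257 + ... + in[7]*257^7 (MSB last).
   for ( int i = 7; i >= 0; i-- )
      v = v * 257 + in[i];

   // Emit the low 64 bits as 8 base-256 digits, least significant first.
   for ( int i = 0; i < 8; i++ )
   {
      out[i] = (unsigned char)( v & 0xff );
      v >>= 8;
   }

   return (unsigned char)v;   // the carry bit (MSB), 0 or 1
}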
-// -// Returns: -// - The carry bit (MSB). -swift_int16_t TranslateToBase256(swift_int32_t input[EIGHTH_N], unsigned char output[EIGHTH_N]); - -// Translates an input integer into the range (-FIELD_SIZE / 2) <= result <= (FIELD_SIZE / 2). -// -// Parameters: -// - x: the input integer. -// -// Returns: -// - The result, which equals (x MOD FIELD_SIZE), such that |result| <= (FIELD_SIZE / 2). -int Center(int x); - -// Calculates bit reversal permutation. -// -// Parameters: -// - input: the input to reverse. -// - numOfBits: the number of bits in the input to reverse. -// -// Returns: -// - The resulting number, which is obtained from the input by reversing its bits. -int ReverseBits(int input, int numOfBits); - -// Initializes the FFT fast lookup table. -// Shall be called only once. -void InitializeSWIFFTX(); - -// Calculates the FFT. -// -// Parameters: -// - input: the input to the FFT. -// - output: the resulting output. -void FFT(const unsigned char input[EIGHTH_N], swift_int32_t *output); - -/////////////////////////////////////////////////////////////////////////////////////////////// -// Helper functions implementation portion. -/////////////////////////////////////////////////////////////////////////////////////////////// - -// Don't vectorize, delete this copy. - -swift_int16_t TranslateToBase256(swift_int32_t input[EIGHTH_N], unsigned char output[EIGHTH_N]) -{ - swift_int32_t pairs[EIGHTH_N / 2]; - int i; - - for (i = 0; i < EIGHTH_N; i += 2) - { - // input[i] + 257 * input[i + 1] - pairs[i >> 1] = input[i] + input[i + 1] + (input[i + 1] << 8); - } - - for (i = (EIGHTH_N / 2) - 1; i > 0; --i) - { - int j; - - for (j = i - 1; j < (EIGHTH_N / 2) - 1; ++j) - { - // pairs[j + 1] * 513, because 257^2 = 513 % 256^2. - register swift_int32_t temp = pairs[j] + pairs[j + 1] + (pairs[j + 1] << 9); - pairs[j] = temp & 0xffff; - pairs[j + 1] += (temp >> 16); - } - } - - for (i = 0; i < EIGHTH_N; i += 2) - { - output[i] = (unsigned char) (pairs[i >> 1] & 0xff); - output[i + 1] = (unsigned char) ((pairs[i >> 1] >> 8) & 0xff); - } - - return (pairs[EIGHTH_N/2 - 1] >> 16); -} - -int Center(int x) -{ - int result = x % FIELD_SIZE; - - if (result > (FIELD_SIZE / 2)) - result -= FIELD_SIZE; - - if (result < (FIELD_SIZE / -2)) - result += FIELD_SIZE; - - return result; -} - -int ReverseBits(int input, int numOfBits) -{ - register int reversed = 0; - - for (input |= numOfBits; input > 1; input >>= 1) - reversed = (reversed << 1) | (input & 1); - - return reversed; -} - -void InitializeSWIFFTX() -{ - int i, j, k, x; - // The powers of OMEGA - int omegaPowers[2 * N]; - omegaPowers[0] = 1; - - if (wasSetupDone) - return; - - for (i = 1; i < (2 * N); ++i) - { - omegaPowers[i] = Center(omegaPowers[i - 1] * OMEGA); - } - - for (i = 0; i < (N / W); ++i) - { - for (j = 0; j < W; ++j) - { - multipliers[(i << 3) + j] = omegaPowers[ReverseBits(i, N / W) * (2 * j + 1)]; - } - } - - for (x = 0; x < 256; ++x) - { - for (j = 0; j < 8; ++j) - { - register int temp = 0; - for (k = 0; k < 8; ++k) - { - temp += omegaPowers[(EIGHTH_N * (2 * j + 1) * ReverseBits(k, W)) % (2 * N)] - * ((x >> k) & 1); - } - - fftTable[(x << 3) + j] = Center(temp); - } - } - - wasSetupDone = true; -} - -// input should be deinterleaved in contiguos memory -// output and F are 4x32 -// multipliers & fftTable are scalar 16 - - -void FFT_4way(const unsigned char input[EIGHTH_N], swift_int32_t *output) -{ - swift_int16_t *mult = multipliers; - m128_swift_int32_t F[64]; - - for (int i = 0; i < 8; i++) - { - int j = i<<3; - -// Need to isolate 
bytes in input, 8 bytes per lane. -// Each iteration of the loop process one input vector -// Each lane reads a different index to ffttable. - -// deinterleave the input! - -// load table with 4 lanes from different indexes into fftTable -// extract bytes into m128 4x16 -// mutiply by vectorized mult - -// input[lane][byte] - - __m128i table; - table = _mm_set_epi32( fftTable[ input[3][i] ], - fftTable[ input[2][i] ], - fftTable[ input[1][i] ], - fftTable[ input[0][i] ] ); - - F[i ] = _mm_mullo_epi32( mm128_const1_32( mult[j+0] ), table ); - - table = _mm_set_epi32( fftTable[ input[3][i+1] ] - fftTable[ input[2][i+1] ] - fftTable[ input[1][i+1] ] - fftTable[ input[0][i+1] ] ); - - F[i+8] = _mm_mullo_epi32( mm128_const1_32( mult[j+0] ), table ); - - - m128_swift_int16_t *table = &( fftTable[input[i] << 3] ); - - F[i ] = _mm_mullo_epi32( mm128_const1_32( mult[j+0] ), - mm128_const1_32( table[0] ) ); - F[i+ 8] = _mm_mullo_epi32( mm128_const1_32( mult[j+1] ), - mm128_const1_32( table[1] ) ); - F[i+16] = _mm_mullo_epi32( mm128_const1_32( mult[j+2] ), - mm128_const1_32( table[2] ) ); - F[i+24] = _mm_mullo_epi32( mm128_const1_32( mult[j+3] ), - mm128_const1_32( table[3] ) ); - F[i+32] = _mm_mullo_epi32( mm128_const1_32( mult[j+4] ), - mm128_const1_32( table[4] ) ); - F[i+40] = _mm_mullo_epi32( mm128_const1_32( mult[j+5] ), - mm128_const1_32( table[5] ) ); - F[i+48] = _mm_mullo_epi32( mm128_const1_32( mult[j+6] ), - mm128_const1_32( table[6] ) ); - F[i+56] = _mm_mullo_epi32( mm128_const1_32( mult[j+7] ), - mm128_const1_32( table[7] ) ); - } - - - for ( int i = 0; i < 8; i++ ) - { - int j = i<<3; - ADD_SUB_4WAY( F[j ], F[j+1] ); - ADD_SUB_4WAY( F[j+2], F[j+3] ); - ADD_SUB_4WAY( F[j+4], F[j+5] ); - ADD_SUB_4WAY( F[j+6], F[j+7] ); - - F[j+3] = _mm_slli_epi32( F[j+3], 4 ); - F[j+7] = _mm_slli_epi32( F[j+7], 4 ); - - ADD_SUB_4WAY( F[j ], F[j+2] ); - ADD_SUB_4WAY( F[j+1], F[j+3] ); - ADD_SUB_4WAY( F[j+4], F[j+6] ); - ADD_SUB_4WAY( F[j+5], F[j+7] ); - - F[j+5] = _mm_slli_epi32( F[j+5], 2 ); - F[j+6] = _mm_slli_epi32( F[j+6], 4 ); - F[j+7] = _mm_slli_epi32( F[j+7], 6 ); - - ADD_SUB_4WAY( F[j ], F[j+4] ); - ADD_SUB_4WAY( F[j+1], F[j+5] ); - ADD_SUB_4WAY( F[j+2], F[j+6] ); - ADD_SUB_4WAY( F[j+3], F[j+7] ); - - output[i ] = Q_REDUCE_4WAY( F[j ] ); - output[i+ 8] = Q_REDUCE_4WAY( F[j+1] ); - output[i+16] = Q_REDUCE_4WAY( F[j+2] ); - output[i+24] = Q_REDUCE_4WAY( F[j+3] ); - output[i+32] = Q_REDUCE_4WAY( F[j+4] ); - output[i+40] = Q_REDUCE_4WAY( F[j+5] ); - output[i+48] = Q_REDUCE_4WAY( F[j+6] ); - output[i+56] = Q_REDUCE_4WAY( F[j+7] ); - } -} - -// Calculates the FFT part of SWIFFT. -// We divided the SWIFFT calculation into two, because that way we could save 2 computations of -// the FFT part, since in the first stage of SWIFFTX the difference between the first 3 SWIFFTs -// is only the A's part. -// -// Parameters: -// - input: the input to FFT. -// - m: the input size divided by 8. The function performs m FFTs. -// - output: will store the result. -void SWIFFTFFT(const unsigned char *input, int m, swift_int32_t *output) -{ - int i; - - for (i = 0; - i < m; - i++, input += EIGHTH_N, output += N) - { - FFT(input, output); - } -} - -// Calculates the 'sum' part of SWIFFT, including the base change at the end. -// We divided the SWIFFT calculation into two, because that way we could save 2 computations of -// the FFT part, since in the first stage of SWIFFTX the difference between the first 3 SWIFFTs -// is only the A's part. -// -// Parameters: -// - input: the input. Of size 64 * m. 
-// - m: the input size divided by 64. -// - output: will store the result. -// - a: the coefficients in the sum. Of size 64 * m. -void SWIFFTSum(const swift_int32_t *input, int m, unsigned char *output, const swift_int16_t *a) -{ - int i, j; - swift_int32_t result[N]; - register swift_int16_t carry = 0; - - for (j = 0; j < N; ++j) - { - register swift_int32_t sum = 0; - const register swift_int32_t *f = input + j; - const register swift_int16_t *k = a + j; - - for (i = 0; i < m; i++, f += N,k += N) - { - sum += (*f) * (*k); - } - - result[j] = sum; - } - - for (j = 0; j < N; ++j) - { - result[j] = ((FIELD_SIZE << 22) + result[j]) % FIELD_SIZE; - } - - for (j = 0; j < 8; ++j) - { - int register carryBit = TranslateToBase256(result + (j << 3), output + (j << 3)); - carry |= carryBit << j; - } - - output[N] = carry; -} - - -// On entry input is interleaved 4x64. SIZE is *4 lanes / 8 bytes, -// multiply by 2. - - -void ComputeSingleSWIFFTX_4way( unsigned char input[SWIFFTX_INPUT_BLOCK_SIZE], - unsigned char output[SWIFFTX_OUTPUT_BLOCK_SIZE], - bool doSmooth) -{ - int i; - // Will store the result of the FFT parts: - m128_swift_int32_t fftOut[N * M]; -// swift_int32_t fftOut[N * M]; - unsigned char intermediate[N * 3 + 8]; - unsigned char carry0,carry1,carry2; - - // Do the three SWIFFTS while remembering the three carry bytes (each carry byte gets - // overriden by the following SWIFFT): - - // 1. Compute the FFT of the input - the common part for the first 3 SWIFFTs: - SWIFFTFFT(input, M, fftOut); - - // 2. Compute the sums of the 3 SWIFFTs, each using a different set of coefficients: - - // 2a. The first SWIFFT: - SWIFFTSum(fftOut, M, intermediate, As); - // Remember the carry byte: - carry0 = intermediate[N]; - - // 2b. The second one: - SWIFFTSum(fftOut, M, intermediate + N, As + (M * N)); - carry1 = intermediate[2 * N]; - - // 2c. The third one: - SWIFFTSum(fftOut, M, intermediate + (2 * N), As + 2 * (M * N)); - carry2 = intermediate[3 * N]; - - //2d. Put three carry bytes in their place - intermediate[3 * N] = carry0; - intermediate[(3 * N) + 1] = carry1; - intermediate[(3 * N) + 2] = carry2; - - // Padding intermediate output with 5 zeroes. - memset(intermediate + (3 * N) + 3, 0, 5); - - // Apply the S-Box: - for (i = 0; i < (3 * N) + 8; ++i) - { - intermediate[i] = SBox[intermediate[i]]; - } - - // 3. The final and last SWIFFT: - SWIFFTFFT(intermediate, 3 * (N/8) + 1, fftOut); - SWIFFTSum(fftOut, 3 * (N/8) + 1, output, As); - - if (doSmooth) - { - unsigned char sum[N]; - register int i, j; - memset(sum, 0, N); - - for (i = 0; i < (N + 1) * 8; ++i) - { - register const swift_int16_t *AsRow; - register int AShift; - - if (!(output[i >> 3] & (1 << (i & 7)))) - { - continue; - } - - AsRow = As + N * M + (i & ~(N - 1)) ; - AShift = i & 63; - - for (j = AShift; j < N; ++j) - { - sum[j] += AsRow[j - AShift]; - } - - for(j = 0; j < AShift; ++j) - { - sum[j] -= AsRow[N - AShift + j]; - } - } - - for (i = 0; i < N; ++i) - { - output[i] = sum[i]; - } - - output[N] = 0; - } -} diff --git a/algo/swifftx/swifftx.c b/algo/swifftx/swifftx.c index f38ea854..c7d8c727 100644 --- a/algo/swifftx/swifftx.c +++ b/algo/swifftx/swifftx.c @@ -18,6 +18,8 @@ //#include "stdbool.h" #include +#include "simd-utils.h" + /////////////////////////////////////////////////////////////////////////////////////////////// // Constants and static tables portion. /////////////////////////////////////////////////////////////////////////////////////////////// @@ -49,20 +51,20 @@ // - A: the first operand. 
After the operation stores the sum of the two operands. // - B: the second operand. After the operation stores the difference between the first and the // second operands. -#define ADD_SUB(A, B) {register int temp = (B); B = ((A) - (B)); A = ((A) + (temp));} +//#define ADD_SUB(A, B) {register int temp = (B); B = ((A) - (B)); A = ((A) + (temp));} // Quickly reduces an integer modulo 257. // // Parameters: // - A: the input. -#define Q_REDUCE(A) (((A) & 0xff) - ((A) >> 8)) +//#define Q_REDUCE(A) (((A) & 0xff) - ((A) >> 8)) // Since we need to do the setup only once, this is the indicator variable: static bool wasSetupDone = false; // This array stores the powers of omegas that correspond to the indices, which are the input // values. Known also as the "outer FFT twiddle factors". -swift_int16_t multipliers[N]; +swift_int16_t multipliers[N] __attribute__ ((aligned (64))); // This array stores the powers of omegas, multiplied by the corresponding values. // We store this table to save computation time. @@ -72,14 +74,14 @@ swift_int16_t multipliers[N]; // compression function, i is between 0 and 31, x_i is a 64-bit value. // One can see the formula for this (intermediate) stage in the SWIFFT FSE 2008 paper -- // formula (2), section 3, page 6. -swift_int16_t fftTable[256 * EIGHTH_N]; +swift_int16_t fftTable[256 * EIGHTH_N] __attribute__ ((aligned (64))); // The A's we use in SWIFFTX shall be random elements of Z_257. // We generated these A's from the decimal expansion of PI as follows: we converted each // triple of digits into a decimal number d. If d < (257 * 3) we used (d % 257) for the next A // element, otherwise move to the next triple of digits in the expansion. This guarntees that // the A's are random, provided that PI digits are. -const swift_int16_t As[3 * M * N] = +const swift_int16_t As[3 * M * N] __attribute__ ((aligned (64))) = {141, 78, 139, 75, 238, 205, 129, 126, 22, 245, 197, 169, 142, 118, 105, 78, 50, 149, 29, 208, 114, 34, 85, 117, 67, 148, 86, 256, 25, 49, 133, 93, 95, 36, 68, 231, 211, 102, 151, 128, 224, 117, 193, 27, 102, 187, 7, 105, @@ -602,21 +604,14 @@ void InitializeSWIFFTX() int omegaPowers[2 * N]; omegaPowers[0] = 1; - if (wasSetupDone) - return; + if (wasSetupDone) return; for (i = 1; i < (2 * N); ++i) - { omegaPowers[i] = Center(omegaPowers[i - 1] * OMEGA); - } for (i = 0; i < (N / W); ++i) - { for (j = 0; j < W; ++j) - { multipliers[(i << 3) + j] = omegaPowers[ReverseBits(i, N / W) * (2 * j + 1)]; - } - } for (x = 0; x < 256; ++x) { @@ -624,10 +619,8 @@ void InitializeSWIFFTX() { register int temp = 0; for (k = 0; k < 8; ++k) - { temp += omegaPowers[(EIGHTH_N * (2 * j + 1) * ReverseBits(k, W)) % (2 * N)] * ((x >> k) & 1); - } fftTable[(x << 3) + j] = Center(temp); } @@ -636,9 +629,203 @@ void InitializeSWIFFTX() wasSetupDone = true; } +// In the original code the F matrix is rotated so it was not aranged +// the same as all the other data. Rearanging F to match all the other +// data made vectorizing possible, the compiler probably could have been +// able to auto-vectorize with proper data organisation. +// Also in the original code the custom 16 bit data types are all now 32 +// bit int32_t regardless of the type name. 
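/*
 * A minimal scalar sketch of the two helpers the vectorized FFT below is
 * built from; the names are illustrative stand-ins, not the project's.
 * ADD_SUB is the radix-2 butterfly (a, b) -> (a + b, a - b).
 * Q_REDUCE maps a to a value congruent to a mod 257: writing
 * a = 256*hi + lo and using 256 == -1 (mod 257) gives a == lo - hi.
 * For example a = 1000 = 3*256 + 232, so Q_REDUCE(1000) = 232 - 3 = 229,
 * and indeed 1000 mod 257 = 229.
 */
static inline void add_sub_scalar( int *a, int *b )
{
   int t = *b;
   *b = *a - *b;   // difference
   *a = *a + t;    // sum
}

static inline int q_reduce_scalar( int a )
{
   // low byte minus the (arithmetically shifted) high part
   return ( a & 0xff ) - ( a >> 8 );
}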
+// void FFT(const unsigned char input[EIGHTH_N], swift_int32_t *output) { - swift_int16_t *mult = multipliers; +#if defined(__AVX2__) + + __m256i F[8] __attribute__ ((aligned (64))); + __m256i *mul = (__m256i*)multipliers; + __m256i *out = (__m256i*)output; + __m256i *tbl = (__m256i*)&( fftTable[ input[0] << 3 ] ); + + F[0] = _mm256_mullo_epi32( mul[0], *tbl ); + tbl = (__m256i*)&( fftTable[ input[1] << 3 ] ); + F[1] = _mm256_mullo_epi32( mul[1], *tbl ); + tbl = (__m256i*)&( fftTable[ input[2] << 3 ] ); + F[2] = _mm256_mullo_epi32( mul[2], *tbl ); + tbl = (__m256i*)&( fftTable[ input[3] << 3 ] ); + F[3] = _mm256_mullo_epi32( mul[3], *tbl ); + tbl = (__m256i*)&( fftTable[ input[4] << 3 ] ); + F[4] = _mm256_mullo_epi32( mul[4], *tbl ); + tbl = (__m256i*)&( fftTable[ input[5] << 3 ] ); + F[5] = _mm256_mullo_epi32( mul[5], *tbl ); + tbl = (__m256i*)&( fftTable[ input[6] << 3 ] ); + F[6] = _mm256_mullo_epi32( mul[6], *tbl ); + tbl = (__m256i*)&( fftTable[ input[7] << 3 ] ); + F[7] = _mm256_mullo_epi32( mul[7], *tbl ); + + #define ADD_SUB( a, b ) \ + { \ + __m256i tmp = b; \ + b = _mm256_sub_epi32( a, b ); \ + a = _mm256_add_epi32( a, tmp ); \ + } + + ADD_SUB( F[0], F[1] ); + ADD_SUB( F[2], F[3] ); + ADD_SUB( F[4], F[5] ); + ADD_SUB( F[6], F[7] ); + + F[3] = _mm256_slli_epi32( F[3], 4 ); + F[7] = _mm256_slli_epi32( F[7], 4 ); + + ADD_SUB( F[0], F[2] ); + ADD_SUB( F[1], F[3] ); + ADD_SUB( F[4], F[6] ); + ADD_SUB( F[5], F[7] ); + + F[5] = _mm256_slli_epi32( F[5], 2 ); + F[6] = _mm256_slli_epi32( F[6], 4 ); + F[7] = _mm256_slli_epi32( F[7], 6 ); + + ADD_SUB( F[0], F[4] ); + ADD_SUB( F[1], F[5] ); + ADD_SUB( F[2], F[6] ); + ADD_SUB( F[3], F[7] ); + + #undef ADD_SUB + +#if defined (__AVX512VL__) && defined(__AVX512BW__) + + const __m256i mask = _mm256_movm_epi8( 0x11111111 ); + +#else + + const __m256i mask = m256_const1_32( 0x000000ff ); + +#endif + + #define Q_REDUCE( a ) \ + _mm256_sub_epi32( _mm256_and_si256( a, mask ), \ + _mm256_srai_epi32( a, 8 ) ) + + out[0] = Q_REDUCE( F[0] ); + out[1] = Q_REDUCE( F[1] ); + out[2] = Q_REDUCE( F[2] ); + out[3] = Q_REDUCE( F[3] ); + out[4] = Q_REDUCE( F[4] ); + out[5] = Q_REDUCE( F[5] ); + out[6] = Q_REDUCE( F[6] ); + out[7] = Q_REDUCE( F[7] ); + + #undef Q_REDUCE + +#elif defined(__SSE4_1__) + + __m128i F[16] __attribute__ ((aligned (64))); + __m128i *mul = (__m128i*)multipliers; + __m128i *out = (__m128i*)output; + __m128i *tbl = (__m128i*)&( fftTable[ input[0] << 3 ] ); + + F[ 0] = _mm_mullo_epi32( mul[ 0], tbl[0] ); + F[ 1] = _mm_mullo_epi32( mul[ 1], tbl[1] ); + tbl = (__m128i*)&( fftTable[ input[1] << 3 ] ); + F[ 2] = _mm_mullo_epi32( mul[ 2], tbl[0] ); + F[ 3] = _mm_mullo_epi32( mul[ 3], tbl[1] ); + tbl = (__m128i*)&( fftTable[ input[2] << 3 ] ); + F[ 4] = _mm_mullo_epi32( mul[ 4], tbl[0] ); + F[ 5] = _mm_mullo_epi32( mul[ 5], tbl[1] ); + tbl = (__m128i*)&( fftTable[ input[3] << 3 ] ); + F[ 6] = _mm_mullo_epi32( mul[ 6], tbl[0] ); + F[ 7] = _mm_mullo_epi32( mul[ 7], tbl[1] ); + tbl = (__m128i*)&( fftTable[ input[4] << 3 ] ); + F[ 8] = _mm_mullo_epi32( mul[ 8], tbl[0] ); + F[ 9] = _mm_mullo_epi32( mul[ 9], tbl[1] ); + tbl = (__m128i*)&( fftTable[ input[5] << 3 ] ); + F[10] = _mm_mullo_epi32( mul[10], tbl[0] ); + F[11] = _mm_mullo_epi32( mul[11], tbl[1] ); + tbl = (__m128i*)&( fftTable[ input[6] << 3 ] ); + F[12] = _mm_mullo_epi32( mul[12], tbl[0] ); + F[13] = _mm_mullo_epi32( mul[13], tbl[1] ); + tbl = (__m128i*)&( fftTable[ input[7] << 3 ] ); + F[14] = _mm_mullo_epi32( mul[14], tbl[0] ); + F[15] = _mm_mullo_epi32( mul[15], tbl[1] ); + + #define 
ADD_SUB( a, b ) \ + { \ + __m128i tmp = b; \ + b = _mm_sub_epi32( a, b ); \ + a = _mm_add_epi32( a, tmp ); \ + } + + ADD_SUB( F[ 0], F[ 2] ); + ADD_SUB( F[ 1], F[ 3] ); + ADD_SUB( F[ 4], F[ 6] ); + ADD_SUB( F[ 5], F[ 7] ); + ADD_SUB( F[ 8], F[10] ); + ADD_SUB( F[ 9], F[11] ); + ADD_SUB( F[12], F[14] ); + ADD_SUB( F[13], F[15] ); + + F[ 6] = _mm_slli_epi32( F[ 6], 4 ); + F[ 7] = _mm_slli_epi32( F[ 7], 4 ); + F[14] = _mm_slli_epi32( F[14], 4 ); + F[15] = _mm_slli_epi32( F[15], 4 ); + + ADD_SUB( F[ 0], F[ 4] ); + ADD_SUB( F[ 1], F[ 5] ); + ADD_SUB( F[ 2], F[ 6] ); + ADD_SUB( F[ 3], F[ 7] ); + ADD_SUB( F[ 8], F[12] ); + ADD_SUB( F[ 9], F[13] ); + ADD_SUB( F[10], F[14] ); + ADD_SUB( F[11], F[15] ); + + F[10] = _mm_slli_epi32( F[10], 2 ); + F[11] = _mm_slli_epi32( F[11], 2 ); + F[12] = _mm_slli_epi32( F[12], 4 ); + F[13] = _mm_slli_epi32( F[13], 4 ); + F[14] = _mm_slli_epi32( F[14], 6 ); + F[15] = _mm_slli_epi32( F[15], 6 ); + + ADD_SUB( F[ 0], F[ 8] ); + ADD_SUB( F[ 1], F[ 9] ); + ADD_SUB( F[ 2], F[10] ); + ADD_SUB( F[ 3], F[11] ); + ADD_SUB( F[ 4], F[12] ); + ADD_SUB( F[ 5], F[13] ); + ADD_SUB( F[ 6], F[14] ); + ADD_SUB( F[ 7], F[15] ); + + #undef ADD_SUB + + const __m128i mask = m128_const1_32( 0x000000ff ); + + #define Q_REDUCE( a ) \ + _mm_sub_epi32( _mm_and_si128( a, mask ), _mm_srai_epi32( a, 8 ) ) + + out[ 0] = Q_REDUCE( F[ 0] ); + out[ 1] = Q_REDUCE( F[ 1] ); + out[ 2] = Q_REDUCE( F[ 2] ); + out[ 3] = Q_REDUCE( F[ 3] ); + out[ 4] = Q_REDUCE( F[ 4] ); + out[ 5] = Q_REDUCE( F[ 5] ); + out[ 6] = Q_REDUCE( F[ 6] ); + out[ 7] = Q_REDUCE( F[ 7] ); + out[ 8] = Q_REDUCE( F[ 8] ); + out[ 9] = Q_REDUCE( F[ 9] ); + out[10] = Q_REDUCE( F[10] ); + out[11] = Q_REDUCE( F[11] ); + out[12] = Q_REDUCE( F[12] ); + out[13] = Q_REDUCE( F[13] ); + out[14] = Q_REDUCE( F[14] ); + out[15] = Q_REDUCE( F[15] ); + + #undef Q_REDUCE + +#else // < SSE4.1 + + swift_int16_t *mult = multipliers; + + // First loop unrolling: + register swift_int16_t *table = &(fftTable[input[0] << 3]); /* swift_int32_t F[64]; @@ -666,11 +853,8 @@ void FFT(const unsigned char input[EIGHTH_N], swift_int32_t *output) F50, F51, F52, F53, F54, F55, F56, F57, F58, F59, F60, F61, F62, F63; - // First loop unrolling: - register swift_int16_t *table = &(fftTable[input[0] << 3]); - - F0 = mult[0] * table[0]; - F8 = mult[1] * table[1]; + F0 = mult[0] * table[0]; + F8 = mult[1] * table[1]; F16 = mult[2] * table[2]; F24 = mult[3] * table[3]; F32 = mult[4] * table[4]; @@ -678,90 +862,93 @@ void FFT(const unsigned char input[EIGHTH_N], swift_int32_t *output) F48 = mult[6] * table[6]; F56 = mult[7] * table[7]; - mult += 8; table = &(fftTable[input[1] << 3]); - F1 = mult[0] * table[0]; - F9 = mult[1] * table[1]; - F17 = mult[2] * table[2]; - F25 = mult[3] * table[3]; - F33 = mult[4] * table[4]; - F41 = mult[5] * table[5]; - F49 = mult[6] * table[6]; - F57 = mult[7] * table[7]; + F1 = mult[ 8] * table[0]; + F9 = mult[ 9] * table[1]; + F17 = mult[10] * table[2]; + F25 = mult[11] * table[3]; + F33 = mult[12] * table[4]; + F41 = mult[13] * table[5]; + F49 = mult[14] * table[6]; + F57 = mult[15] * table[7]; - mult += 8; table = &(fftTable[input[2] << 3]); - F2 = mult[0] * table[0]; - F10 = mult[1] * table[1]; - F18 = mult[2] * table[2]; - F26 = mult[3] * table[3]; - F34 = mult[4] * table[4]; - F42 = mult[5] * table[5]; - F50 = mult[6] * table[6]; - F58 = mult[7] * table[7]; + F2 = mult[16] * table[0]; + F10 = mult[17] * table[1]; + F18 = mult[18] * table[2]; + F26 = mult[19] * table[3]; + F34 = mult[20] * table[4]; + F42 = mult[21] * table[5]; + F50 = 
mult[22] * table[6]; + F58 = mult[23] * table[7]; - mult += 8; table = &(fftTable[input[3] << 3]); - F3 = mult[0] * table[0]; - F11 = mult[1] * table[1]; - F19 = mult[2] * table[2]; - F27 = mult[3] * table[3]; - F35 = mult[4] * table[4]; - F43 = mult[5] * table[5]; - F51 = mult[6] * table[6]; - F59 = mult[7] * table[7]; + F3 = mult[24] * table[0]; + F11 = mult[25] * table[1]; + F19 = mult[26] * table[2]; + F27 = mult[27] * table[3]; + F35 = mult[28] * table[4]; + F43 = mult[29] * table[5]; + F51 = mult[30] * table[6]; + F59 = mult[31] * table[7]; - mult += 8; table = &(fftTable[input[4] << 3]); - F4 = mult[0] * table[0]; - F12 = mult[1] * table[1]; - F20 = mult[2] * table[2]; - F28 = mult[3] * table[3]; - F36 = mult[4] * table[4]; - F44 = mult[5] * table[5]; - F52 = mult[6] * table[6]; - F60 = mult[7] * table[7]; + F4 = mult[32] * table[0]; + F12 = mult[33] * table[1]; + F20 = mult[34] * table[2]; + F28 = mult[35] * table[3]; + F36 = mult[36] * table[4]; + F44 = mult[37] * table[5]; + F52 = mult[38] * table[6]; + F60 = mult[39] * table[7]; - mult += 8; table = &(fftTable[input[5] << 3]); - F5 = mult[0] * table[0]; - F13 = mult[1] * table[1]; - F21 = mult[2] * table[2]; - F29 = mult[3] * table[3]; - F37 = mult[4] * table[4]; - F45 = mult[5] * table[5]; - F53 = mult[6] * table[6]; - F61 = mult[7] * table[7]; + F5 = mult[40] * table[0]; + F13 = mult[41] * table[1]; + F21 = mult[42] * table[2]; + F29 = mult[43] * table[3]; + F37 = mult[44] * table[4]; + F45 = mult[45] * table[5]; + F53 = mult[46] * table[6]; + F61 = mult[47] * table[7]; - mult += 8; table = &(fftTable[input[6] << 3]); - F6 = mult[0] * table[0]; - F14 = mult[1] * table[1]; - F22 = mult[2] * table[2]; - F30 = mult[3] * table[3]; - F38 = mult[4] * table[4]; - F46 = mult[5] * table[5]; - F54 = mult[6] * table[6]; - F62 = mult[7] * table[7]; + F6 = mult[48] * table[0]; + F14 = mult[49] * table[1]; + F22 = mult[50] * table[2]; + F30 = mult[51] * table[3]; + F38 = mult[52] * table[4]; + F46 = mult[53] * table[5]; + F54 = mult[54] * table[6]; + F62 = mult[55] * table[7]; - mult += 8; table = &(fftTable[input[7] << 3]); - F7 = mult[0] * table[0]; - F15 = mult[1] * table[1]; - F23 = mult[2] * table[2]; - F31 = mult[3] * table[3]; - F39 = mult[4] * table[4]; - F47 = mult[5] * table[5]; - F55 = mult[6] * table[6]; - F63 = mult[7] * table[7]; - + F7 = mult[56] * table[0]; + F15 = mult[57] * table[1]; + F23 = mult[58] * table[2]; + F31 = mult[59] * table[3]; + F39 = mult[60] * table[4]; + F47 = mult[61] * table[5]; + F55 = mult[62] * table[6]; + F63 = mult[63] * table[7]; + + #define ADD_SUB( a, b ) \ + { \ + int temp = b; \ + b = a - b; \ + a = a + temp; \ + } + + #define Q_REDUCE( a ) \ + ( ( (a) & 0xff ) - ( (a) >> 8 ) ) + /* for ( int i = 0; i < 8; i++ ) @@ -800,7 +987,6 @@ void FFT(const unsigned char input[EIGHTH_N], swift_int32_t *output) } */ - // Second loop unrolling: // Iteration 0: ADD_SUB(F0, F1); @@ -1057,6 +1243,11 @@ void FFT(const unsigned char input[EIGHTH_N], swift_int32_t *output) output[47] = Q_REDUCE(F61); output[55] = Q_REDUCE(F62); output[63] = Q_REDUCE(F63); + + #undef ADD_SUB + #undef Q_REDUCE + +#endif // AVX2 elif SSE4.1 else } // Calculates the FFT part of SWIFFT. @@ -1086,24 +1277,66 @@ void SWIFFTFFT(const unsigned char *input, int m, swift_int32_t *output) // - m: the input size divided by 64. // - output: will store the result. // - a: the coefficients in the sum. Of size 64 * m. 
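/*
 * A minimal scalar sketch (illustrative names, not the project's) of the
 * accumulation that the function below performs and that the vectorized
 * paths reproduce lane by lane: for each of the N = 64 output positions j,
 * take the dot product of column j of the m stacked FFT blocks with
 * column j of the coefficient table, then reduce mod FIELD_SIZE = 257.
 * The (FIELD_SIZE << 22) bias keeps the argument of '%' non-negative,
 * since the raw sums may be negative.
 */
#include <stdint.h>

#define N_SKETCH          64
#define FIELD_SIZE_SKETCH 257

static void swifft_sum_columns_ref( const int32_t *input, int m,
                                    const int16_t *a,
                                    int32_t result[N_SKETCH] )
{
   for ( int j = 0; j < N_SKETCH; j++ )
   {
      int32_t sum = 0;
      for ( int i = 0; i < m; i++ )
         sum += input[ i*N_SKETCH + j ] * a[ i*N_SKETCH + j ];

      // Bias before '%' so a negative sum still lands in [0, 256].
      result[j] = ( ( FIELD_SIZE_SKETCH << 22 ) + sum ) % FIELD_SIZE_SKETCH;
   }
}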
-void SWIFFTSum(const swift_int32_t *input, int m, unsigned char *output, const swift_int16_t *a) +void SWIFFTSum( const swift_int32_t *input, int m, unsigned char *output, + const swift_int16_t *a ) { int i, j; - swift_int32_t result[N]; + swift_int32_t result[N] __attribute__ ((aligned (64))); register swift_int16_t carry = 0; +#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) + + __m512i *res = (__m512i*)result; + for ( j = 0; j < N/16; ++j ) + { + __m512i sum = _mm512_setzero_si512(); + const __m512i *f = (__m512i*)input + j; + const __m512i *k = (__m512i*)a + j; + for ( i = 0; i < m; i++, f += N/16, k += N/16 ) + sum = _mm512_add_epi32( sum, _mm512_mullo_epi32( *f, *k ) ); + res[j] = sum; + } + +#elif defined(__AVX2__) + + __m256i *res = (__m256i*)result; + for ( j = 0; j < N/8; ++j ) + { + __m256i sum = _mm256_setzero_si256(); + const __m256i *f = (__m256i*)input + j; + const __m256i *k = (__m256i*)a + j; + for ( i = 0; i < m; i++, f += N/8, k += N/8 ) + sum = _mm256_add_epi32( sum, _mm256_mullo_epi32( *f, *k ) ); + res[j] = sum; + } + +#elif defined(__SSE4_1__) + + __m128i *res = (__m128i*)result; + for ( j = 0; j < N/4; ++j ) + { + __m128i sum = _mm_setzero_si128(); + const __m128i *f = (__m128i*)input + j; + const __m128i *k = (__m128i*)a + j; + for ( i = 0; i < m; i++, f += N/4, k += N/4 ) + sum = _mm_add_epi32( sum, _mm_mullo_epi32( *f, *k ) ); + res[j] = sum; + } + +#else + for (j = 0; j < N; ++j) { register swift_int32_t sum = 0; const register swift_int32_t *f = input + j; const register swift_int16_t *k = a + j; - for (i = 0; i < m; i++, f += N,k += N) sum += (*f) * (*k); - result[j] = sum; } +#endif + for (j = 0; j < N; ++j) result[j] = ((FIELD_SIZE << 22) + result[j]) % FIELD_SIZE; @@ -1116,14 +1349,15 @@ void SWIFFTSum(const swift_int32_t *input, int m, unsigned char *output, const s output[N] = carry; } +/* void ComputeSingleSWIFFTX_smooth(unsigned char input[SWIFFTX_INPUT_BLOCK_SIZE], unsigned char output[SWIFFTX_OUTPUT_BLOCK_SIZE], bool doSmooth) { int i; // Will store the result of the FFT parts: - swift_int32_t fftOut[N * M]; - unsigned char intermediate[N * 3 + 8]; + swift_int32_t fftOut[N * M] __attribute__ ((aligned (64))); + unsigned char intermediate[N * 3 + 8] __attribute__ ((aligned (64))); unsigned char carry0,carry1,carry2; // Do the three SWIFFTS while remembering the three carry bytes (each carry byte gets @@ -1193,51 +1427,50 @@ void ComputeSingleSWIFFTX_smooth(unsigned char input[SWIFFTX_INPUT_BLOCK_SIZE], output[N] = 0; } } +*/ -void ComputeSingleSWIFFTX( unsigned char input[SWIFFTX_INPUT_BLOCK_SIZE], - unsigned char output[SWIFFTX_OUTPUT_BLOCK_SIZE] ) +void ComputeSingleSWIFFTX( unsigned char *input, unsigned char *output ) { int i; // Will store the result of the FFT parts: - swift_int32_t fftOut[N * M]; - unsigned char intermediate[N * 3 + 8]; + swift_int32_t fftOut[N * M] __attribute__ ((aligned (64))); + unsigned char sum[ N*3 + 8 ] __attribute__ ((aligned (64))); unsigned char carry0,carry1,carry2; // Do the three SWIFFTS while remembering the three carry bytes (each carry byte gets // overriden by the following SWIFFT): // 1. Compute the FFT of the input - the common part for the first 3 SWIFFTs: - SWIFFTFFT(input, M, fftOut); + SWIFFTFFT( input, M, fftOut ); // 2. Compute the sums of the 3 SWIFFTs, each using a different set of coefficients: // 2a. 
The first SWIFFT: - SWIFFTSum(fftOut, M, intermediate, As); - // Remember the carry byte: - carry0 = intermediate[N]; + SWIFFTSum( fftOut, M, sum, As ); + carry0 = sum[N]; // 2b. The second one: - SWIFFTSum(fftOut, M, intermediate + N, As + (M * N)); - carry1 = intermediate[2 * N]; + SWIFFTSum( fftOut, M, sum + N, As + M*N ); + carry1 = sum[ 2*N ]; // 2c. The third one: - SWIFFTSum(fftOut, M, intermediate + (2 * N), As + 2 * (M * N)); - carry2 = intermediate[3 * N]; + SWIFFTSum( fftOut, M, sum + 2*N, As + 2*M*N ); + carry2 = sum[ 3*N ]; //2d. Put three carry bytes in their place - intermediate[3 * N] = carry0; - intermediate[(3 * N) + 1] = carry1; - intermediate[(3 * N) + 2] = carry2; + sum[ 3*N ] = carry0; + sum[ 3*N + 1 ] = carry1; + sum[ 3*N + 2 ] = carry2; // Padding intermediate output with 5 zeroes. - memset(intermediate + (3 * N) + 3, 0, 5); + memset( sum + 3*N + 3, 0, 5 ); // Apply the S-Box: for ( i = 0; i < (3 * N) + 8; ++i ) - intermediate[i] = SBox[intermediate[i]]; + sum[i] = SBox[ sum[i] ]; // 3. The final and last SWIFFT: - SWIFFTFFT(intermediate, 3 * (N/8) + 1, fftOut); - SWIFFTSum(fftOut, 3 * (N/8) + 1, output, As); - + SWIFFTFFT( sum, 3 * (N/8) + 1, fftOut ); + SWIFFTSum( fftOut, 3 * (N/8) + 1, sum, As ); + memcpy( output, sum, SWIFFTX_OUTPUT_BLOCK_SIZE - 1 ); } diff --git a/algo/swifftx/swifftx.c.bak b/algo/swifftx/swifftx.c.bak deleted file mode 100644 index 24453e21..00000000 --- a/algo/swifftx/swifftx.c.bak +++ /dev/null @@ -1,1155 +0,0 @@ -/////////////////////////////////////////////////////////////////////////////////////////////// -// -// SWIFFTX ANSI C OPTIMIZED 32BIT IMPLEMENTATION FOR NIST SHA-3 COMPETITION -// -// SWIFFTX.c -// -// October 2008 -// -// This is the source file of the OPTIMIZED 32BIT implementation of SWIFFTX hash function. -// SWIFFTX is a candidate function for SHA-3 NIST competition. -// More details about SWIFFTX can be found in the accompanying submission documents. -// -/////////////////////////////////////////////////////////////////////////////////////////////// -#include "swifftx.h" -// See the remarks concerning compatibility issues inside stdint.h. -#include "stdint.h" -// Remove this while using gcc: -//#include "stdbool.h" -#include - -/////////////////////////////////////////////////////////////////////////////////////////////// -// Constants and static tables portion. -/////////////////////////////////////////////////////////////////////////////////////////////// - -// In SWIFFTX we work over Z_257, so this is the modulus and the arithmetic is performed modulo -// this number. -#define FIELD_SIZE 257 - -// The size of FFT we use: -#define N 64 - -#define LOGN 6 - -#define EIGHTH_N (N / 8) - -// The number of FFTS done on the input. -#define M (SWIFFTX_INPUT_BLOCK_SIZE / 8) // 32 - -// Omega is the 128th root of unity in Z_257. -// We choose w = 42. -#define OMEGA 42 - -// The size of the inner FFT lookup table: -#define W 8 - -// Calculates the sum and the difference of two numbers. -// -// Parameters: -// - A: the first operand. After the operation stores the sum of the two operands. -// - B: the second operand. After the operation stores the difference between the first and the -// second operands. -#define ADD_SUB(A, B) {register int temp = (B); B = ((A) - (B)); A = ((A) + (temp));} - -// Quickly reduces an integer modulo 257. -// -// Parameters: -// - A: the input. 
-#define Q_REDUCE(A) (((A) & 0xff) - ((A) >> 8)) - -// Since we need to do the setup only once, this is the indicator variable: -static bool wasSetupDone = false; - -// This array stores the powers of omegas that correspond to the indices, which are the input -// values. Known also as the "outer FFT twiddle factors". -swift_int16_t multipliers[N]; - -// This array stores the powers of omegas, multiplied by the corresponding values. -// We store this table to save computation time. -// -// To calculate the intermediate value of the compression function (the first out of two -// stages), we multiply the k-th bit of x_i by w^[(2i + 1) * k]. {x_i} is the input to the -// compression function, i is between 0 and 31, x_i is a 64-bit value. -// One can see the formula for this (intermediate) stage in the SWIFFT FSE 2008 paper -- -// formula (2), section 3, page 6. -swift_int16_t fftTable[256 * EIGHTH_N]; - -// The A's we use in SWIFFTX shall be random elements of Z_257. -// We generated these A's from the decimal expansion of PI as follows: we converted each -// triple of digits into a decimal number d. If d < (257 * 3) we used (d % 257) for the next A -// element, otherwise move to the next triple of digits in the expansion. This guarntees that -// the A's are random, provided that PI digits are. -const swift_int16_t As[3 * M * N] = -{141, 78, 139, 75, 238, 205, 129, 126, 22, 245, 197, 169, 142, 118, 105, 78, - 50, 149, 29, 208, 114, 34, 85, 117, 67, 148, 86, 256, 25, 49, 133, 93, - 95, 36, 68, 231, 211, 102, 151, 128, 224, 117, 193, 27, 102, 187, 7, 105, - 45, 130, 108, 124, 171, 151, 189, 128, 218, 134, 233, 165, 14, 201, 145, 134, - 52, 203, 91, 96, 197, 69, 134, 213, 136, 93, 3, 249, 141, 16, 210, 73, - 6, 92, 58, 74, 174, 6, 254, 91, 201, 107, 110, 76, 103, 11, 73, 16, - 34, 209, 7, 127, 146, 254, 95, 176, 57, 13, 108, 245, 77, 92, 186, 117, - 124, 97, 105, 118, 34, 74, 205, 122, 235, 53, 94, 238, 210, 227, 183, 11, - 129, 159, 105, 183, 142, 129, 86, 21, 137, 138, 224, 223, 190, 188, 179, 188, - 256, 25, 217, 176, 36, 176, 238, 127, 160, 210, 155, 148, 132, 0, 54, 127, - 145, 6, 46, 85, 243, 95, 173, 123, 178, 207, 211, 183, 224, 173, 146, 35, - 71, 114, 50, 22, 175, 1, 28, 19, 112, 129, 21, 34, 161, 159, 115, 52, - 4, 193, 211, 92, 115, 49, 59, 217, 218, 96, 61, 81, 24, 202, 198, 89, - 45, 128, 8, 51, 253, 87, 171, 35, 4, 188, 171, 10, 3, 137, 238, 73, - 19, 208, 124, 163, 103, 177, 155, 147, 46, 84, 253, 233, 171, 241, 211, 217, - 159, 48, 96, 79, 237, 18, 171, 226, 99, 1, 97, 195, 216, 163, 198, 95, - 0, 201, 65, 228, 21, 153, 124, 230, 44, 35, 44, 108, 85, 156, 249, 207, - 26, 222, 131, 1, 60, 242, 197, 150, 181, 19, 116, 213, 75, 98, 124, 240, - 123, 207, 62, 255, 60, 143, 187, 157, 139, 9, 12, 104, 89, 49, 193, 146, - 104, 196, 181, 82, 198, 253, 192, 191, 255, 122, 212, 104, 47, 20, 132, 208, - 46, 170, 2, 69, 234, 36, 56, 163, 28, 152, 104, 238, 162, 56, 24, 58, - 38, 150, 193, 254, 253, 125, 173, 35, 73, 126, 247, 239, 216, 6, 199, 15, - 90, 12, 97, 122, 9, 84, 207, 127, 219, 72, 58, 30, 29, 182, 41, 192, - 235, 248, 237, 74, 72, 176, 210, 252, 45, 64, 165, 87, 202, 241, 236, 223, - 151, 242, 119, 239, 52, 112, 169, 28, 13, 37, 160, 60, 158, 81, 133, 60, - 16, 145, 249, 192, 173, 217, 214, 93, 141, 184, 54, 34, 161, 104, 157, 95, - 38, 133, 218, 227, 211, 181, 9, 66, 137, 143, 77, 33, 248, 159, 4, 55, - 228, 48, 99, 219, 222, 184, 15, 36, 254, 256, 157, 237, 87, 139, 209, 113, - 232, 85, 126, 167, 197, 100, 103, 166, 64, 225, 125, 205, 117, 135, 84, 128, - 231, 112, 90, 241, 28, 
22, 210, 147, 186, 49, 230, 21, 108, 39, 194, 47, - 123, 199, 107, 114, 30, 210, 250, 143, 59, 156, 131, 133, 221, 27, 76, 99, - 208, 250, 78, 12, 211, 141, 95, 81, 195, 106, 8, 232, 150, 212, 205, 221, - 11, 225, 87, 219, 126, 136, 137, 180, 198, 48, 68, 203, 239, 252, 194, 235, - 142, 137, 174, 172, 190, 145, 250, 221, 182, 204, 1, 195, 130, 153, 83, 241, - 161, 239, 211, 138, 11, 169, 155, 245, 174, 49, 10, 166, 16, 130, 181, 139, - 222, 222, 112, 99, 124, 94, 51, 243, 133, 194, 244, 136, 35, 248, 201, 177, - 178, 186, 129, 102, 89, 184, 180, 41, 149, 96, 165, 72, 225, 231, 134, 158, - 199, 28, 249, 16, 225, 195, 10, 210, 164, 252, 138, 8, 35, 152, 213, 199, - 82, 116, 97, 230, 63, 199, 241, 35, 79, 120, 54, 174, 67, 112, 1, 76, - 69, 222, 194, 96, 82, 94, 25, 228, 196, 145, 155, 136, 228, 234, 46, 101, - 246, 51, 103, 166, 246, 75, 9, 200, 161, 4, 108, 35, 129, 168, 208, 144, - 50, 14, 13, 220, 41, 132, 122, 127, 194, 9, 232, 234, 107, 28, 187, 8, - 51, 141, 97, 221, 225, 9, 113, 170, 166, 102, 135, 22, 231, 185, 227, 187, - 110, 145, 251, 146, 76, 22, 146, 228, 7, 53, 64, 25, 62, 198, 130, 190, - 221, 232, 169, 64, 188, 199, 237, 249, 173, 218, 196, 191, 48, 224, 5, 113, - 100, 166, 160, 21, 191, 197, 61, 162, 149, 171, 240, 183, 129, 231, 123, 204, - 192, 179, 134, 15, 47, 161, 142, 177, 239, 234, 186, 237, 231, 53, 208, 95, - 146, 36, 225, 231, 89, 142, 93, 248, 137, 124, 83, 39, 69, 77, 89, 208, - 182, 48, 85, 147, 244, 164, 246, 68, 38, 190, 220, 35, 202, 91, 157, 151, - 201, 240, 185, 218, 4, 152, 2, 132, 177, 88, 190, 196, 229, 74, 220, 135, - 137, 196, 11, 47, 5, 251, 106, 144, 163, 60, 222, 127, 52, 57, 202, 102, - 64, 140, 110, 206, 23, 182, 39, 245, 1, 163, 157, 186, 163, 80, 7, 230, - 44, 249, 176, 102, 164, 125, 147, 120, 18, 191, 186, 125, 64, 65, 198, 157, - 164, 213, 95, 61, 13, 181, 208, 91, 242, 197, 158, 34, 98, 169, 91, 14, - 17, 93, 157, 17, 65, 30, 183, 6, 139, 58, 255, 108, 100, 136, 209, 144, - 164, 6, 237, 33, 210, 110, 57, 126, 197, 136, 125, 244, 165, 151, 168, 3, - 143, 251, 247, 155, 136, 130, 88, 14, 74, 121, 250, 133, 21, 226, 185, 232, - 118, 132, 89, 64, 204, 161, 2, 70, 224, 159, 35, 204, 123, 180, 13, 52, - 231, 57, 25, 78, 66, 69, 97, 42, 198, 84, 176, 59, 8, 232, 125, 134, - 193, 2, 232, 109, 216, 69, 90, 142, 32, 38, 249, 37, 75, 180, 184, 188, - 19, 47, 120, 87, 146, 70, 232, 120, 191, 45, 33, 38, 19, 248, 110, 110, - 44, 64, 2, 84, 244, 228, 252, 228, 170, 123, 38, 144, 213, 144, 171, 212, - 243, 87, 189, 46, 128, 110, 84, 77, 65, 183, 61, 184, 101, 44, 168, 68, - 14, 106, 105, 8, 227, 211, 166, 39, 152, 43, 52, 254, 197, 55, 119, 89, - 168, 65, 53, 138, 177, 56, 219, 0, 58, 121, 148, 18, 44, 100, 215, 103, - 145, 229, 117, 196, 91, 89, 113, 143, 172, 239, 249, 184, 154, 39, 112, 65, - 204, 42, 84, 38, 155, 151, 151, 16, 100, 87, 174, 162, 145, 147, 149, 186, - 237, 145, 134, 144, 198, 235, 213, 163, 48, 230, 24, 47, 57, 71, 127, 0, - 150, 219, 12, 81, 197, 150, 131, 13, 169, 63, 175, 184, 48, 235, 65, 243, - 149, 200, 163, 254, 202, 114, 247, 67, 143, 250, 126, 228, 80, 130, 216, 214, - 36, 2, 230, 33, 119, 125, 3, 142, 237, 100, 3, 152, 197, 174, 244, 129, - 232, 30, 206, 199, 39, 210, 220, 43, 237, 221, 201, 54, 179, 42, 28, 133, - 246, 203, 198, 177, 0, 28, 194, 85, 223, 109, 155, 147, 221, 60, 133, 108, - 157, 254, 26, 75, 157, 185, 49, 142, 31, 137, 71, 43, 63, 64, 237, 148, - 237, 172, 159, 160, 155, 254, 234, 224, 140, 193, 114, 140, 62, 109, 136, 39, - 255, 8, 158, 146, 128, 49, 222, 96, 57, 209, 180, 249, 202, 127, 113, 231, - 78, 178, 
46, 33, 228, 215, 104, 31, 207, 186, 82, 41, 42, 39, 103, 119, - 123, 133, 243, 254, 238, 156, 90, 186, 37, 212, 33, 107, 252, 51, 177, 36, - 237, 76, 159, 245, 93, 214, 97, 56, 190, 38, 160, 94, 105, 222, 220, 158, - 49, 16, 191, 52, 120, 87, 179, 2, 27, 144, 223, 230, 184, 6, 129, 227, - 69, 47, 215, 181, 162, 139, 72, 200, 45, 163, 159, 62, 2, 221, 124, 40, - 159, 242, 35, 208, 179, 166, 98, 67, 178, 68, 143, 225, 178, 146, 187, 159, - 57, 66, 176, 192, 236, 250, 168, 224, 122, 43, 159, 120, 133, 165, 122, 64, - 87, 74, 161, 241, 9, 87, 90, 24, 255, 113, 203, 220, 57, 139, 197, 159, - 31, 151, 27, 140, 77, 162, 7, 27, 84, 228, 187, 220, 53, 126, 162, 242, - 84, 181, 223, 103, 86, 177, 207, 31, 140, 18, 207, 256, 201, 166, 96, 23, - 233, 103, 197, 84, 161, 75, 59, 149, 138, 154, 119, 92, 16, 53, 116, 97, - 220, 114, 35, 45, 77, 209, 40, 196, 71, 22, 81, 178, 110, 14, 3, 180, - 110, 129, 112, 47, 18, 61, 134, 78, 73, 79, 254, 232, 125, 180, 205, 54, - 220, 119, 63, 89, 181, 52, 77, 109, 151, 77, 80, 207, 144, 25, 20, 6, - 208, 47, 201, 206, 192, 14, 73, 176, 256, 201, 207, 87, 216, 60, 56, 73, - 92, 243, 179, 113, 49, 59, 55, 168, 121, 137, 69, 154, 95, 57, 187, 47, - 129, 4, 15, 92, 6, 116, 69, 196, 48, 134, 84, 81, 111, 56, 38, 176, - 239, 6, 128, 72, 242, 134, 36, 221, 59, 48, 242, 68, 130, 110, 171, 89, - 13, 220, 48, 29, 5, 75, 104, 233, 91, 129, 105, 162, 44, 113, 163, 163, - 85, 147, 190, 111, 197, 80, 213, 153, 81, 68, 203, 33, 161, 165, 10, 61, - 120, 252, 0, 205, 28, 42, 193, 64, 39, 37, 83, 175, 5, 218, 215, 174, - 128, 121, 231, 11, 150, 145, 135, 197, 136, 91, 193, 5, 107, 88, 82, 6, - 4, 188, 256, 70, 40, 2, 167, 57, 169, 203, 115, 254, 215, 172, 84, 80, - 188, 167, 34, 137, 43, 243, 2, 79, 178, 38, 188, 135, 233, 194, 208, 13, - 11, 151, 231, 196, 12, 122, 162, 56, 17, 114, 191, 207, 90, 132, 64, 238, - 187, 6, 198, 176, 240, 88, 118, 236, 15, 226, 166, 22, 193, 229, 82, 246, - 213, 64, 37, 63, 31, 243, 252, 37, 156, 38, 175, 204, 138, 141, 211, 82, - 106, 217, 97, 139, 153, 56, 129, 218, 158, 9, 83, 26, 87, 112, 71, 21, - 250, 5, 65, 141, 68, 116, 231, 113, 10, 218, 99, 205, 201, 92, 157, 4, - 97, 46, 49, 220, 72, 139, 103, 171, 149, 129, 193, 19, 69, 245, 43, 31, - 58, 68, 36, 195, 159, 22, 54, 34, 233, 141, 205, 100, 226, 96, 22, 192, - 41, 231, 24, 79, 234, 138, 30, 120, 117, 216, 172, 197, 172, 107, 86, 29, - 181, 151, 0, 6, 146, 186, 68, 55, 54, 58, 213, 182, 60, 231, 33, 232, - 77, 210, 216, 154, 80, 51, 141, 122, 68, 148, 219, 122, 254, 48, 64, 175, - 41, 115, 62, 243, 141, 81, 119, 121, 5, 68, 121, 88, 239, 29, 230, 90, - 135, 159, 35, 223, 168, 112, 49, 37, 146, 60, 126, 134, 42, 145, 115, 90, - 73, 133, 211, 86, 120, 141, 122, 241, 127, 56, 130, 36, 174, 75, 83, 246, - 112, 45, 136, 194, 201, 115, 1, 156, 114, 167, 208, 12, 176, 147, 32, 170, - 251, 100, 102, 220, 122, 210, 6, 49, 75, 201, 38, 105, 132, 135, 126, 102, - 13, 121, 76, 228, 202, 20, 61, 213, 246, 13, 207, 42, 148, 168, 37, 253, - 34, 94, 141, 185, 18, 234, 157, 109, 104, 64, 250, 125, 49, 236, 86, 48, - 196, 77, 75, 237, 156, 103, 225, 19, 110, 229, 22, 68, 177, 93, 221, 181, - 152, 153, 61, 108, 101, 74, 247, 195, 127, 216, 30, 166, 168, 61, 83, 229, - 120, 156, 96, 120, 201, 124, 43, 27, 253, 250, 120, 143, 89, 235, 189, 243, - 150, 7, 127, 119, 149, 244, 84, 185, 134, 34, 128, 193, 236, 234, 132, 117, - 137, 32, 145, 184, 44, 121, 51, 76, 11, 228, 142, 251, 39, 77, 228, 251, - 41, 58, 246, 107, 125, 187, 9, 240, 35, 8, 11, 162, 242, 220, 158, 163, - 2, 184, 163, 227, 242, 2, 100, 101, 2, 78, 129, 
34, 89, 28, 26, 157, - 79, 31, 107, 250, 194, 156, 186, 69, 212, 66, 41, 180, 139, 42, 211, 253, - 256, 239, 29, 129, 104, 248, 182, 68, 1, 189, 48, 226, 36, 229, 3, 158, - 41, 53, 241, 22, 115, 174, 16, 163, 224, 19, 112, 219, 177, 233, 42, 27, - 250, 134, 18, 28, 145, 122, 68, 34, 134, 31, 147, 17, 39, 188, 150, 76, - 45, 42, 167, 249, 12, 16, 23, 182, 13, 79, 121, 3, 70, 197, 239, 44, - 86, 177, 255, 81, 64, 171, 138, 131, 73, 110, 44, 201, 254, 198, 146, 91, - 48, 9, 104, 31, 29, 161, 101, 31, 138, 180, 231, 233, 79, 137, 61, 236, - 140, 15, 249, 218, 234, 119, 99, 195, 110, 137, 237, 207, 8, 31, 45, 24, - 90, 155, 203, 253, 192, 203, 65, 176, 210, 171, 142, 214, 220, 122, 136, 237, - 189, 186, 147, 40, 80, 254, 173, 33, 191, 46, 192, 26, 108, 255, 228, 205, - 61, 76, 39, 107, 225, 126, 228, 182, 140, 251, 143, 134, 252, 168, 221, 8, - 185, 85, 60, 233, 147, 244, 87, 137, 8, 140, 96, 80, 53, 45, 175, 160, - 124, 189, 112, 37, 144, 19, 70, 17, 170, 242, 2, 3, 28, 95, 120, 199, - 212, 43, 9, 117, 86, 151, 101, 241, 200, 145, 241, 19, 178, 69, 204, 197, - 227, 166, 94, 7, 193, 45, 247, 234, 19, 187, 212, 212, 236, 125, 33, 95, - 198, 121, 122, 103, 77, 155, 235, 49, 25, 237, 249, 11, 162, 7, 238, 24, - 16, 150, 129, 25, 152, 17, 42, 67, 247, 162, 77, 154, 31, 133, 55, 137, - 79, 119, 153, 10, 86, 28, 244, 186, 41, 169, 106, 44, 10, 49, 110, 179, - 32, 133, 155, 244, 61, 70, 131, 168, 170, 39, 231, 252, 32, 69, 92, 238, - 239, 35, 132, 136, 236, 167, 90, 32, 123, 88, 69, 22, 20, 89, 145, 166, - 30, 118, 75, 4, 49, 31, 225, 54, 11, 50, 56, 191, 246, 1, 187, 33, - 119, 107, 139, 68, 19, 240, 131, 55, 94, 113, 31, 252, 12, 179, 121, 2, - 120, 252, 0, 76, 41, 80, 185, 42, 62, 121, 105, 159, 121, 109, 111, 98, - 7, 118, 86, 29, 210, 70, 231, 179, 223, 229, 164, 70, 62, 47, 0, 206, - 204, 178, 168, 120, 224, 166, 99, 25, 103, 63, 246, 224, 117, 204, 75, 124, - 140, 133, 110, 110, 222, 88, 151, 118, 46, 37, 22, 143, 158, 40, 2, 50, - 153, 94, 190, 199, 13, 198, 127, 211, 180, 90, 183, 98, 0, 142, 210, 154, - 100, 187, 67, 231, 202, 100, 198, 235, 252, 160, 247, 124, 247, 14, 121, 221, - 57, 88, 253, 243, 185, 89, 45, 249, 221, 194, 108, 175, 193, 119, 50, 141, - 223, 133, 136, 64, 176, 250, 129, 100, 124, 94, 181, 159, 99, 185, 177, 240, - 135, 42, 103, 52, 202, 208, 143, 186, 193, 103, 154, 237, 102, 88, 225, 161, - 50, 188, 191, 109, 12, 87, 19, 227, 247, 183, 13, 52, 205, 170, 205, 146, - 89, 160, 18, 105, 192, 73, 231, 225, 184, 157, 252, 220, 61, 59, 169, 183, - 221, 20, 141, 20, 158, 101, 245, 7, 245, 225, 118, 137, 84, 55, 19, 27, - 164, 110, 35, 25, 202, 94, 150, 46, 91, 152, 130, 1, 7, 46, 16, 237, - 171, 109, 19, 200, 65, 38, 10, 213, 70, 96, 126, 226, 185, 225, 181, 46, - 10, 165, 11, 123, 53, 158, 22, 147, 64, 22, 227, 69, 182, 237, 197, 37, - 39, 49, 186, 223, 139, 128, 55, 36, 166, 178, 220, 20, 98, 172, 166, 253, - 45, 0, 120, 180, 189, 185, 158, 159, 196, 6, 214, 79, 141, 52, 156, 107, - 5, 109, 142, 159, 33, 64, 190, 133, 95, 132, 95, 202, 160, 63, 186, 23, - 231, 107, 163, 33, 234, 15, 244, 77, 108, 49, 51, 7, 164, 87, 142, 99, - 240, 202, 47, 256, 118, 190, 196, 178, 217, 42, 39, 153, 21, 192, 232, 202, - 14, 82, 179, 64, 233, 4, 219, 10, 133, 78, 43, 144, 146, 216, 202, 81, - 71, 252, 8, 201, 68, 256, 85, 233, 164, 88, 176, 30, 5, 152, 126, 179, - 249, 84, 140, 190, 159, 54, 118, 98, 2, 159, 27, 133, 74, 121, 239, 196, - 71, 149, 119, 135, 102, 20, 87, 112, 44, 75, 221, 3, 151, 158, 5, 98, - 152, 25, 97, 106, 63, 171, 240, 79, 234, 240, 230, 92, 76, 70, 173, 196, - 36, 225, 
218, 133, 64, 240, 150, 41, 146, 66, 133, 51, 134, 73, 170, 238, - 140, 90, 45, 89, 46, 147, 96, 169, 174, 174, 244, 151, 90, 40, 32, 74, - 38, 154, 246, 57, 31, 14, 189, 151, 83, 243, 197, 183, 220, 185, 53, 225, - 51, 106, 188, 208, 222, 248, 93, 13, 93, 215, 131, 25, 142, 185, 113, 222, - 131, 215, 149, 50, 159, 85, 32, 5, 205, 192, 2, 227, 42, 214, 197, 42, - 126, 182, 68, 123, 109, 36, 237, 179, 170, 199, 77, 256, 5, 128, 214, 243, - 137, 177, 170, 253, 179, 180, 153, 236, 100, 196, 216, 231, 198, 37, 192, 80, - 121, 221, 246, 1, 16, 246, 29, 78, 64, 148, 124, 38, 96, 125, 28, 20, - 48, 51, 73, 187, 139, 208, 98, 253, 221, 188, 84, 129, 1, 205, 95, 205, - 117, 79, 71, 126, 134, 237, 19, 184, 137, 125, 129, 178, 223, 54, 188, 112, - 30, 7, 225, 228, 205, 184, 233, 87, 117, 22, 58, 10, 8, 42, 2, 114, - 254, 19, 17, 13, 150, 92, 233, 179, 63, 12, 60, 171, 127, 35, 50, 5, - 195, 113, 241, 25, 249, 184, 166, 44, 221, 35, 151, 116, 8, 54, 195, 89, - 218, 186, 132, 5, 41, 89, 226, 177, 11, 41, 87, 172, 5, 23, 20, 59, - 228, 94, 76, 33, 137, 43, 151, 221, 61, 232, 4, 120, 93, 217, 80, 228, - 228, 6, 58, 25, 62, 84, 91, 48, 209, 20, 247, 243, 55, 106, 80, 79, - 235, 34, 20, 180, 146, 2, 236, 13, 236, 206, 243, 222, 204, 83, 148, 213, - 214, 117, 237, 98, 0, 90, 204, 168, 32, 41, 126, 67, 191, 74, 27, 255, - 26, 75, 240, 113, 185, 105, 167, 154, 112, 67, 151, 63, 161, 134, 239, 176, - 42, 87, 249, 130, 45, 242, 17, 100, 107, 120, 212, 218, 237, 76, 231, 162, - 175, 172, 118, 155, 92, 36, 124, 17, 121, 71, 13, 9, 82, 126, 147, 142, - 218, 148, 138, 80, 163, 106, 164, 123, 140, 129, 35, 42, 186, 154, 228, 214, - 75, 73, 8, 253, 42, 153, 232, 164, 95, 24, 110, 90, 231, 197, 90, 196, - 57, 164, 252, 181, 31, 7, 97, 256, 35, 77, 200, 212, 99, 179, 92, 227, - 17, 180, 49, 176, 9, 188, 13, 182, 93, 44, 128, 219, 134, 92, 151, 6, - 23, 126, 200, 109, 66, 30, 140, 180, 146, 134, 67, 200, 7, 9, 223, 168, - 186, 221, 3, 154, 150, 165, 43, 53, 138, 27, 86, 213, 235, 160, 70, 2, - 240, 20, 89, 212, 84, 141, 168, 246, 183, 227, 30, 167, 138, 185, 253, 83, - 52, 143, 236, 94, 59, 65, 89, 218, 194, 157, 164, 156, 111, 95, 202, 168, - 245, 256, 151, 28, 222, 194, 72, 130, 217, 134, 253, 77, 246, 100, 76, 32, - 254, 174, 182, 193, 14, 237, 74, 1, 74, 26, 135, 216, 152, 208, 112, 38, - 181, 62, 25, 71, 61, 234, 254, 97, 191, 23, 92, 256, 190, 205, 6, 16, - 134, 147, 210, 219, 148, 59, 73, 185, 24, 247, 174, 143, 116, 220, 128, 144, - 111, 126, 101, 98, 130, 136, 101, 102, 69, 127, 24, 168, 146, 226, 226, 207, - 176, 122, 149, 254, 134, 196, 22, 151, 197, 21, 50, 205, 116, 154, 65, 116, - 177, 224, 127, 77, 177, 159, 225, 69, 176, 54, 100, 104, 140, 8, 11, 126, - 11, 188, 185, 159, 107, 16, 254, 142, 80, 28, 5, 157, 104, 57, 109, 82, - 102, 80, 173, 242, 238, 207, 57, 105, 237, 160, 59, 189, 189, 199, 26, 11, - 190, 156, 97, 118, 20, 12, 254, 189, 165, 147, 142, 199, 5, 213, 64, 133, - 108, 217, 133, 60, 94, 28, 116, 136, 47, 165, 125, 42, 183, 143, 14, 129, - 223, 70, 212, 205, 181, 180, 3, 201, 182, 46, 57, 104, 239, 60, 99, 181, - 220, 231, 45, 79, 156, 89, 149, 143, 190, 103, 153, 61, 235, 73, 136, 20, - 89, 243, 16, 130, 247, 141, 134, 93, 80, 68, 85, 84, 8, 72, 194, 4, - 242, 110, 19, 133, 199, 70, 172, 92, 132, 254, 67, 74, 36, 94, 13, 90, - 154, 184, 9, 109, 118, 243, 214, 71, 36, 95, 0, 90, 201, 105, 112, 215, - 69, 196, 224, 210, 236, 242, 155, 211, 37, 134, 69, 113, 157, 97, 68, 26, - 230, 149, 219, 180, 20, 76, 172, 145, 154, 40, 129, 8, 93, 56, 162, 124, - 207, 233, 105, 19, 3, 183, 155, 134, 8, 244, 
213, 78, 139, 88, 156, 37, - 51, 152, 111, 102, 112, 250, 114, 252, 201, 241, 133, 24, 136, 153, 5, 90, - 210, 197, 216, 24, 131, 17, 147, 246, 13, 86, 3, 253, 179, 237, 101, 114, - 243, 191, 207, 2, 220, 133, 244, 53, 87, 125, 154, 158, 197, 20, 8, 83, - 32, 191, 38, 241, 204, 22, 168, 59, 217, 123, 162, 82, 21, 50, 130, 89, - 239, 253, 195, 56, 253, 74, 147, 125, 234, 199, 250, 28, 65, 193, 22, 237, - 193, 94, 58, 229, 139, 176, 69, 42, 179, 164, 150, 168, 246, 214, 86, 174, - 59, 117, 15, 19, 76, 37, 214, 238, 153, 226, 154, 45, 109, 114, 198, 107, - 45, 70, 238, 196, 142, 252, 244, 71, 123, 136, 134, 188, 99, 132, 25, 42, - 240, 0, 196, 33, 26, 124, 256, 145, 27, 102, 153, 35, 28, 132, 221, 167, - 138, 133, 41, 170, 95, 224, 40, 139, 239, 153, 1, 106, 255, 106, 170, 163, - 127, 44, 155, 232, 194, 119, 232, 117, 239, 143, 108, 41, 3, 9, 180, 256, - 144, 113, 133, 200, 79, 69, 128, 216, 31, 50, 102, 209, 249, 136, 150, 154, - 182, 51, 228, 39, 127, 142, 87, 15, 94, 92, 187, 245, 31, 236, 64, 58, - 114, 11, 17, 166, 189, 152, 218, 34, 123, 39, 58, 37, 153, 91, 63, 121, - 31, 34, 12, 254, 106, 96, 171, 14, 155, 247, 214, 69, 24, 98, 3, 204, - 202, 194, 207, 30, 253, 44, 119, 70, 14, 96, 82, 250, 63, 6, 232, 38, - 89, 144, 102, 191, 82, 254, 20, 222, 96, 162, 110, 6, 159, 58, 200, 226, - 98, 128, 42, 70, 84, 247, 128, 211, 136, 54, 143, 166, 60, 118, 99, 218, - 27, 193, 85, 81, 219, 223, 46, 41, 23, 233, 152, 222, 36, 236, 54, 181, - 56, 50, 4, 207, 129, 92, 78, 88, 197, 251, 131, 105, 31, 172, 38, 131, - 19, 204, 129, 47, 227, 106, 202, 183, 23, 6, 77, 224, 102, 147, 11, 218, - 131, 132, 60, 192, 208, 223, 236, 23, 103, 115, 89, 18, 185, 171, 70, 174, - 139, 0, 100, 160, 221, 11, 228, 60, 12, 122, 114, 12, 157, 235, 148, 57, - 83, 62, 173, 131, 169, 126, 85, 99, 93, 243, 81, 80, 29, 245, 206, 82, - 236, 227, 166, 14, 230, 213, 144, 97, 27, 111, 99, 164, 105, 150, 89, 111, - 252, 118, 140, 232, 120, 183, 137, 213, 232, 157, 224, 33, 134, 118, 186, 80, - 159, 2, 186, 193, 54, 242, 25, 237, 232, 249, 226, 213, 90, 149, 90, 160, - 118, 69, 64, 37, 10, 183, 109, 246, 30, 52, 219, 69, 189, 26, 116, 220, - 50, 244, 243, 243, 139, 137, 232, 98, 38, 45, 256, 143, 171, 101, 73, 238, - 123, 45, 194, 167, 250, 123, 12, 29, 136, 237, 141, 21, 89, 96, 199, 44, - 8, 214, 208, 17, 113, 41, 137, 26, 166, 155, 89, 85, 54, 58, 97, 160, - 50, 239, 58, 71, 21, 157, 139, 12, 37, 198, 182, 131, 149, 134, 16, 204, - 164, 181, 248, 166, 52, 216, 136, 201, 37, 255, 187, 240, 5, 101, 147, 231, - 14, 163, 253, 134, 146, 216, 8, 54, 224, 90, 220, 195, 75, 215, 186, 58, - 71, 204, 124, 105, 239, 53, 16, 85, 69, 163, 195, 223, 33, 38, 69, 88, - 88, 203, 99, 55, 176, 13, 156, 204, 236, 99, 194, 134, 75, 247, 126, 129, - 160, 124, 233, 206, 139, 144, 154, 45, 233, 51, 206, 61, 60, 55, 205, 107, - 84, 108, 96, 188, 203, 31, 89, 20, 115, 144, 137, 90, 237, 78, 231, 185, - 120, 217, 1, 176, 169, 30, 155, 176, 100, 113, 53, 42, 193, 108, 14, 121, - 176, 158, 137, 92, 178, 44, 110, 249, 108, 234, 94, 101, 128, 12, 250, 173, - 72, 202, 232, 66, 139, 152, 189, 18, 32, 197, 9, 238, 246, 55, 119, 183, - 196, 119, 113, 247, 191, 100, 200, 245, 46, 16, 234, 112, 136, 116, 232, 48, - 176, 108, 11, 237, 14, 153, 93, 177, 124, 72, 67, 121, 135, 143, 45, 18, - 97, 251, 184, 172, 136, 55, 213, 8, 103, 12, 221, 212, 13, 160, 116, 91, - 237, 127, 218, 190, 103, 131, 77, 82, 36, 100, 22, 252, 79, 69, 54, 26, - 65, 182, 115, 142, 247, 20, 89, 81, 188, 244, 27, 120, 240, 248, 13, 230, - 67, 133, 32, 201, 129, 87, 9, 245, 66, 88, 166, 34, 46, 
184, 119, 218, - 144, 235, 163, 40, 138, 134, 127, 217, 64, 227, 116, 67, 55, 202, 130, 48, - 199, 42, 251, 112, 124, 153, 123, 194, 243, 49, 250, 12, 78, 157, 167, 134, - 210, 73, 156, 102, 21, 88, 216, 123, 45, 11, 208, 18, 47, 187, 20, 43, - 3, 180, 124, 2, 136, 176, 77, 111, 138, 139, 91, 225, 126, 8, 74, 255, - 88, 192, 193, 239, 138, 204, 139, 194, 166, 130, 252, 184, 140, 168, 30, 177, - 121, 98, 131, 124, 69, 171, 75, 49, 184, 34, 76, 122, 202, 115, 184, 253, - 120, 182, 33, 251, 1, 74, 216, 217, 243, 168, 70, 162, 119, 158, 197, 198, - 61, 89, 7, 5, 54, 199, 211, 170, 23, 226, 44, 247, 165, 195, 7, 225, - 91, 23, 50, 15, 51, 208, 106, 94, 12, 31, 43, 112, 146, 139, 246, 182, - 113, 1, 97, 15, 66, 2, 51, 76, 164, 184, 237, 200, 218, 176, 72, 98, - 33, 135, 38, 147, 140, 229, 50, 94, 81, 187, 129, 17, 238, 168, 146, 203, - 181, 99, 164, 3, 104, 98, 255, 189, 114, 142, 86, 102, 229, 102, 80, 129, - 64, 84, 79, 161, 81, 156, 128, 111, 164, 197, 18, 15, 55, 196, 198, 191, - 28, 113, 117, 96, 207, 253, 19, 158, 231, 13, 53, 130, 252, 211, 58, 180, - 212, 142, 7, 219, 38, 81, 62, 109, 167, 113, 33, 56, 97, 185, 157, 130, - 186, 129, 119, 182, 196, 26, 54, 110, 65, 170, 166, 236, 30, 22, 162, 0, - 106, 12, 248, 33, 48, 72, 159, 17, 76, 244, 172, 132, 89, 171, 196, 76, - 254, 166, 76, 218, 226, 3, 52, 220, 238, 181, 179, 144, 225, 23, 3, 166, - 158, 35, 228, 154, 204, 23, 203, 71, 134, 189, 18, 168, 236, 141, 117, 138, - 2, 132, 78, 57, 154, 21, 250, 196, 184, 40, 161, 40, 10, 178, 134, 120, - 132, 123, 101, 82, 205, 121, 55, 140, 231, 56, 231, 71, 206, 246, 198, 150, - 146, 192, 45, 105, 242, 1, 125, 18, 176, 46, 222, 122, 19, 80, 113, 133, - 131, 162, 81, 51, 98, 168, 247, 161, 139, 39, 63, 162, 22, 153, 170, 92, - 91, 130, 174, 200, 45, 112, 99, 164, 132, 184, 191, 186, 200, 167, 86, 145, - 167, 227, 130, 44, 12, 158, 172, 249, 204, 17, 54, 249, 16, 200, 21, 174, - 67, 223, 105, 201, 50, 36, 133, 203, 244, 131, 228, 67, 29, 195, 91, 91, - 55, 107, 167, 154, 170, 137, 218, 183, 169, 61, 99, 175, 128, 23, 142, 183, - 66, 255, 59, 187, 66, 85, 212, 109, 168, 82, 16, 43, 67, 139, 114, 176, - 216, 255, 130, 94, 152, 79, 183, 64, 100, 23, 214, 82, 34, 230, 48, 15, - 242, 130, 50, 241, 81, 32, 5, 125, 183, 182, 184, 99, 248, 109, 159, 210, - 226, 61, 119, 129, 39, 149, 78, 214, 107, 78, 147, 124, 228, 18, 143, 188, - 84, 180, 233, 119, 64, 39, 158, 133, 177, 168, 6, 150, 80, 117, 150, 56, - 49, 72, 49, 37, 30, 242, 49, 142, 33, 156, 34, 44, 44, 72, 58, 22, - 249, 46, 168, 80, 25, 196, 64, 174, 97, 179, 244, 134, 213, 105, 63, 151, - 21, 90, 168, 90, 245, 28, 157, 65, 250, 232, 188, 27, 99, 160, 156, 127, - 68, 193, 10, 80, 205, 36, 138, 229, 12, 223, 70, 169, 251, 41, 48, 94, - 41, 177, 99, 256, 158, 0, 6, 83, 231, 191, 120, 135, 157, 146, 218, 213, - 160, 7, 47, 234, 98, 211, 79, 225, 179, 95, 175, 105, 185, 79, 115, 0, - 104, 14, 65, 124, 15, 188, 52, 9, 253, 27, 132, 137, 13, 127, 75, 238, - 185, 253, 33, 8, 52, 157, 164, 68, 232, 188, 69, 28, 209, 233, 5, 129, - 216, 90, 252, 212, 33, 200, 222, 9, 112, 15, 43, 36, 226, 114, 15, 249, - 217, 8, 148, 22, 147, 23, 143, 67, 222, 116, 235, 250, 212, 210, 39, 142, - 108, 64, 209, 83, 73, 66, 99, 34, 17, 29, 45, 151, 244, 114, 28, 241, - 144, 208, 146, 179, 132, 89, 217, 198, 252, 219, 205, 165, 75, 107, 11, 173, - 76, 6, 196, 247, 152, 216, 248, 91, 209, 178, 57, 250, 174, 60, 79, 123, - 18, 135, 9, 241, 230, 159, 184, 68, 156, 251, 215, 9, 113, 234, 75, 235, - 103, 194, 205, 129, 230, 45, 96, 73, 157, 20, 200, 212, 212, 228, 161, 7, - 231, 228, 
108, 43, 198, 87, 140, 140, 4, 182, 164, 3, 53, 104, 250, 213, - 85, 38, 89, 61, 52, 187, 35, 204, 86, 249, 100, 71, 248, 213, 163, 215, - 66, 106, 252, 129, 40, 111, 47, 24, 186, 221, 85, 205, 199, 237, 122, 181, - 32, 46, 182, 135, 33, 251, 142, 34, 208, 242, 128, 255, 4, 234, 15, 33, - 167, 222, 32, 186, 191, 34, 255, 244, 98, 240, 228, 204, 30, 142, 32, 70, - 69, 83, 110, 151, 10, 243, 141, 21, 223, 69, 61, 37, 59, 209, 102, 114, - 223, 33, 129, 254, 255, 103, 86, 247, 235, 72, 126, 177, 102, 226, 102, 30, - 149, 221, 62, 247, 251, 120, 163, 173, 57, 202, 204, 24, 39, 106, 120, 143, - 202, 176, 191, 147, 37, 38, 51, 133, 47, 245, 157, 132, 154, 71, 183, 111, - 30, 180, 18, 202, 82, 96, 170, 91, 157, 181, 212, 140, 256, 8, 196, 121, - 149, 79, 66, 127, 113, 78, 4, 197, 84, 256, 111, 222, 102, 63, 228, 104, - 136, 223, 67, 193, 93, 154, 249, 83, 204, 101, 200, 234, 84, 252, 230, 195, - 43, 140, 120, 242, 89, 63, 166, 233, 209, 94, 43, 170, 126, 5, 205, 78, - 112, 80, 143, 151, 146, 248, 137, 203, 45, 183, 61, 1, 155, 8, 102, 59, - 68, 212, 230, 61, 254, 191, 128, 223, 176, 123, 229, 27, 146, 120, 96, 165, - 213, 12, 232, 40, 186, 225, 66, 105, 200, 195, 212, 110, 237, 238, 151, 19, - 12, 171, 150, 82, 7, 228, 79, 52, 15, 78, 62, 43, 21, 154, 114, 21, - 12, 212, 256, 232, 125, 127, 5, 51, 37, 252, 136, 13, 47, 195, 168, 191, - 231, 55, 57, 251, 214, 116, 15, 86, 210, 41, 249, 242, 119, 27, 250, 203, - 107, 69, 90, 43, 206, 154, 127, 54, 100, 78, 187, 54, 244, 177, 234, 167, - 202, 136, 209, 171, 69, 114, 133, 173, 26, 139, 78, 141, 128, 32, 124, 39, - 45, 218, 96, 68, 90, 44, 67, 62, 83, 190, 188, 256, 103, 42, 102, 64, - 249, 0, 141, 11, 61, 69, 70, 66, 233, 237, 29, 200, 251, 157, 71, 51, - 64, 133, 113, 76, 35, 125, 76, 137, 217, 145, 35, 69, 226, 180, 56, 249, - 156, 163, 176, 237, 81, 54, 85, 169, 115, 211, 129, 70, 248, 40, 252, 192, - 194, 101, 247, 8, 181, 124, 217, 191, 194, 93, 99, 127, 117, 177, 144, 151, - 228, 121, 32, 11, 89, 81, 26, 29, 183, 76, 249, 132, 179, 70, 34, 102, - 20, 66, 87, 63, 124, 205, 174, 177, 87, 219, 73, 218, 91, 87, 176, 72, - 15, 211, 47, 61, 251, 165, 39, 247, 146, 70, 150, 57, 1, 212, 36, 162, - 39, 38, 16, 216, 3, 50, 116, 200, 32, 234, 77, 181, 155, 19, 90, 188, - 36, 6, 254, 46, 46, 203, 25, 230, 181, 196, 4, 151, 225, 65, 122, 216, - 168, 86, 158, 131, 136, 16, 49, 102, 233, 64, 154, 88, 228, 52, 146, 69, - 93, 157, 243, 121, 70, 209, 126, 213, 88, 145, 236, 65, 70, 96, 204, 47, - 10, 200, 77, 8, 103, 150, 48, 153, 5, 37, 52, 235, 209, 31, 181, 126, - 83, 142, 224, 140, 6, 32, 200, 171, 160, 179, 115, 229, 75, 194, 208, 39, - 59, 223, 52, 247, 38, 197, 135, 1, 6, 189, 106, 114, 168, 5, 211, 222, - 44, 63, 90, 160, 116, 172, 170, 133, 125, 138, 39, 131, 23, 178, 10, 214, - 36, 93, 28, 59, 68, 17, 123, 25, 255, 184, 204, 102, 194, 214, 129, 94, - 159, 245, 112, 141, 62, 11, 61, 197, 124, 221, 205, 11, 79, 71, 201, 54, - 58, 150, 29, 121, 87, 46, 240, 201, 68, 20, 194, 209, 47, 152, 158, 174, - 193, 164, 120, 255, 216, 165, 247, 58, 85, 130, 220, 23, 122, 223, 188, 98, - 21, 70, 72, 170, 150, 237, 76, 143, 112, 238, 206, 146, 215, 110, 4, 250, - 68, 44, 174, 177, 30, 98, 143, 241, 180, 127, 113, 48, 0, 1, 179, 199, - 59, 106, 201, 114, 29, 86, 173, 133, 217, 44, 200, 141, 107, 172, 16, 60, - 82, 58, 239, 94, 141, 234, 186, 235, 109, 173, 249, 139, 141, 59, 100, 248, - 84, 144, 49, 160, 51, 207, 164, 103, 74, 97, 146, 202, 193, 125, 168, 134, - 236, 111, 135, 121, 59, 145, 168, 200, 181, 173, 109, 2, 255, 6, 9, 245, - 90, 202, 214, 143, 121, 65, 85, 
232, 132, 77, 228, 84, 26, 54, 184, 15, - 161, 29, 177, 79, 43, 0, 156, 184, 163, 165, 62, 90, 179, 93, 45, 239, - 1, 16, 120, 189, 127, 47, 74, 166, 20, 214, 233, 226, 89, 217, 229, 26, - 156, 53, 162, 60, 21, 3, 192, 72, 111, 51, 53, 101, 181, 208, 88, 82, - 179, 160, 219, 113, 240, 108, 43, 224, 162, 147, 62, 14, 95, 81, 205, 4, - 160, 177, 225, 115, 29, 69, 235, 168, 148, 29, 128, 114, 124, 129, 172, 165, - 215, 231, 214, 86, 160, 44, 157, 91, 248, 183, 73, 164, 56, 181, 162, 92, - 141, 118, 127, 240, 196, 77, 0, 9, 244, 79, 250, 100, 195, 25, 255, 85, - 94, 35, 212, 137, 107, 34, 110, 20, 200, 104, 17, 32, 231, 43, 150, 159, - 231, 216, 223, 190, 226, 109, 162, 197, 87, 92, 224, 11, 111, 73, 60, 225, - 238, 73, 246, 169, 19, 217, 119, 38, 121, 118, 70, 82, 99, 241, 110, 67, - 31, 76, 146, 215, 124, 240, 31, 103, 139, 224, 75, 160, 31, 78, 93, 4, - 64, 9, 103, 223, 6, 227, 119, 85, 116, 81, 21, 43, 46, 206, 234, 132, - 85, 99, 22, 131, 135, 97, 86, 13, 234, 188, 21, 14, 89, 169, 207, 238, - 219, 177, 190, 72, 157, 41, 114, 140, 92, 141, 186, 1, 63, 107, 225, 184, - 118, 150, 153, 254, 241, 106, 120, 210, 104, 144, 151, 161, 88, 206, 125, 164, - 15, 211, 173, 49, 146, 241, 71, 36, 58, 201, 46, 27, 33, 187, 91, 162, - 117, 19, 210, 213, 187, 97, 193, 50, 190, 114, 217, 60, 61, 167, 207, 213, - 213, 53, 135, 34, 156, 91, 115, 119, 46, 99, 242, 1, 90, 52, 198, 227, - 201, 91, 216, 146, 210, 82, 121, 38, 73, 133, 182, 193, 132, 148, 246, 75, - 109, 157, 179, 113, 176, 134, 205, 159, 148, 58, 103, 171, 132, 156, 133, 147, - 161, 231, 39, 100, 175, 97, 125, 28, 183, 129, 135, 191, 202, 181, 29, 218, - 43, 104, 148, 203, 189, 204, 4, 182, 169, 1, 134, 122, 141, 202, 13, 187, - 177, 112, 162, 35, 231, 6, 8, 241, 99, 6, 191, 45, 113, 113, 101, 104}; - -// The S-Box we use for further linearity breaking. -// We created it by taking the digits of decimal expansion of e. -// The code that created it can be found in 'ProduceRandomSBox.c'. 
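// Editorial illustration, not part of the original sources or of this patch:
// a byte-substitution table like the one that follows is expected to be a
// permutation of 0..255 (each value occurring exactly once), so the S-Box
// layer breaks linearity without discarding information.  A hand-pasted table
// can be sanity-checked with a few lines of C; the helper name below is
// invented for this sketch and does not exist in the code base.

static int sbox_is_permutation( const unsigned char box[256] )
{
    unsigned char seen[256] = { 0 };
    int i;

    for ( i = 0; i < 256; i++ )
    {
        if ( seen[ box[i] ] )
            return 0;           // duplicate entry: not a bijection
        seen[ box[i] ] = 1;
    }
    return 1;                   // every byte value occurred exactly once
}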
-unsigned char SBox[256] = { -//0 1 2 3 4 5 6 7 8 9 A B C D E F -0x7d, 0xd1, 0x70, 0x0b, 0xfa, 0x39, 0x18, 0xc3, 0xf3, 0xbb, 0xa7, 0xd4, 0x84, 0x25, 0x3b, 0x3c, // 0 -0x2c, 0x15, 0x69, 0x9a, 0xf9, 0x27, 0xfb, 0x02, 0x52, 0xba, 0xa8, 0x4b, 0x20, 0xb5, 0x8b, 0x3a, // 1 -0x88, 0x8e, 0x26, 0xcb, 0x71, 0x5e, 0xaf, 0xad, 0x0c, 0xac, 0xa1, 0x93, 0xc6, 0x78, 0xce, 0xfc, // 2 -0x2a, 0x76, 0x17, 0x1f, 0x62, 0xc2, 0x2e, 0x99, 0x11, 0x37, 0x65, 0x40, 0xfd, 0xa0, 0x03, 0xc1, // 3 -0xca, 0x48, 0xe2, 0x9b, 0x81, 0xe4, 0x1c, 0x01, 0xec, 0x68, 0x7a, 0x5a, 0x50, 0xf8, 0x0e, 0xa3, // 4 -0xe8, 0x61, 0x2b, 0xa2, 0xeb, 0xcf, 0x8c, 0x3d, 0xb4, 0x95, 0x13, 0x08, 0x46, 0xab, 0x91, 0x7b, // 5 -0xea, 0x55, 0x67, 0x9d, 0xdd, 0x29, 0x6a, 0x8f, 0x9f, 0x22, 0x4e, 0xf2, 0x57, 0xd2, 0xa9, 0xbd, // 6 -0x38, 0x16, 0x5f, 0x4c, 0xf7, 0x9e, 0x1b, 0x2f, 0x30, 0xc7, 0x41, 0x24, 0x5c, 0xbf, 0x05, 0xf6, // 7 -0x0a, 0x31, 0xa5, 0x45, 0x21, 0x33, 0x6b, 0x6d, 0x6c, 0x86, 0xe1, 0xa4, 0xe6, 0x92, 0x9c, 0xdf, // 8 -0xe7, 0xbe, 0x28, 0xe3, 0xfe, 0x06, 0x4d, 0x98, 0x80, 0x04, 0x96, 0x36, 0x3e, 0x14, 0x4a, 0x34, // 9 -0xd3, 0xd5, 0xdb, 0x44, 0xcd, 0xf5, 0x54, 0xdc, 0x89, 0x09, 0x90, 0x42, 0x87, 0xff, 0x7e, 0x56, // A -0x5d, 0x59, 0xd7, 0x23, 0x75, 0x19, 0x97, 0x73, 0x83, 0x64, 0x53, 0xa6, 0x1e, 0xd8, 0xb0, 0x49, // B -0x3f, 0xef, 0xbc, 0x7f, 0x43, 0xf0, 0xc9, 0x72, 0x0f, 0x63, 0x79, 0x2d, 0xc0, 0xda, 0x66, 0xc8, // C -0x32, 0xde, 0x47, 0x07, 0xb8, 0xe9, 0x1d, 0xc4, 0x85, 0x74, 0x82, 0xcc, 0x60, 0x51, 0x77, 0x0d, // D -0xaa, 0x35, 0xed, 0x58, 0x7c, 0x5b, 0xb9, 0x94, 0x6e, 0x8d, 0xb1, 0xc5, 0xb7, 0xee, 0xb6, 0xae, // E -0x10, 0xe0, 0xd6, 0xd9, 0xe5, 0x4f, 0xf1, 0x12, 0x00, 0xd0, 0xf4, 0x1a, 0x6f, 0x8a, 0xb3, 0xb2 }; // F - -/////////////////////////////////////////////////////////////////////////////////////////////// -// -// Helper functions definition portion. -// -/////////////////////////////////////////////////////////////////////////////////////////////// - -// Translates an input array with values in base 257 to output array with values in base 256. -// Returns the carry bit. -// -// Parameters: -// - input: the input array of size EIGHTH_N. Each value in the array is a number in Z_257. -// The MSB is assumed to be the last one in the array. -// - output: the input array encoded in base 256. -// -// Returns: -// - The carry bit (MSB). -swift_int16_t TranslateToBase256(swift_int32_t input[EIGHTH_N], unsigned char output[EIGHTH_N]); - -// Translates an input integer into the range (-FIELD_SIZE / 2) <= result <= (FIELD_SIZE / 2). -// -// Parameters: -// - x: the input integer. -// -// Returns: -// - The result, which equals (x MOD FIELD_SIZE), such that |result| <= (FIELD_SIZE / 2). -int Center(int x); - -// Calculates bit reversal permutation. -// -// Parameters: -// - input: the input to reverse. -// - numOfBits: the number of bits in the input to reverse. -// -// Returns: -// - The resulting number, which is obtained from the input by reversing its bits. -int ReverseBits(int input, int numOfBits); - -// Initializes the FFT fast lookup table. -// Shall be called only once. -void InitializeSWIFFTX(); - -// Calculates the FFT. -// -// Parameters: -// - input: the input to the FFT. -// - output: the resulting output. -void FFT(const unsigned char input[EIGHTH_N], swift_int32_t *output); - -/////////////////////////////////////////////////////////////////////////////////////////////// -// Helper functions implementation portion. 
-/////////////////////////////////////////////////////////////////////////////////////////////// - -swift_int16_t TranslateToBase256(swift_int32_t input[EIGHTH_N], unsigned char output[EIGHTH_N]) -{ - swift_int32_t pairs[EIGHTH_N / 2]; - int i; - - for (i = 0; i < EIGHTH_N; i += 2) - { - // input[i] + 257 * input[i + 1] - pairs[i >> 1] = input[i] + input[i + 1] + (input[i + 1] << 8); - } - - for (i = (EIGHTH_N / 2) - 1; i > 0; --i) - { - int j; - - for (j = i - 1; j < (EIGHTH_N / 2) - 1; ++j) - { - // pairs[j + 1] * 513, because 257^2 = 513 % 256^2. - register swift_int32_t temp = pairs[j] + pairs[j + 1] + (pairs[j + 1] << 9); - pairs[j] = temp & 0xffff; - pairs[j + 1] += (temp >> 16); - } - } - - for (i = 0; i < EIGHTH_N; i += 2) - { - output[i] = (unsigned char) (pairs[i >> 1] & 0xff); - output[i + 1] = (unsigned char) ((pairs[i >> 1] >> 8) & 0xff); - } - - return (pairs[EIGHTH_N/2 - 1] >> 16); -} - -int Center(int x) -{ - int result = x % FIELD_SIZE; - - if (result > (FIELD_SIZE / 2)) - result -= FIELD_SIZE; - - if (result < (FIELD_SIZE / -2)) - result += FIELD_SIZE; - - return result; -} - -int ReverseBits(int input, int numOfBits) -{ - register int reversed = 0; - - for (input |= numOfBits; input > 1; input >>= 1) - reversed = (reversed << 1) | (input & 1); - - return reversed; -} - -void InitializeSWIFFTX() -{ - int i, j, k, x; - // The powers of OMEGA - int omegaPowers[2 * N]; - omegaPowers[0] = 1; - - if (wasSetupDone) - return; - - for (i = 1; i < (2 * N); ++i) - { - omegaPowers[i] = Center(omegaPowers[i - 1] * OMEGA); - } - - for (i = 0; i < (N / W); ++i) - { - for (j = 0; j < W; ++j) - { - multipliers[(i << 3) + j] = omegaPowers[ReverseBits(i, N / W) * (2 * j + 1)]; - } - } - - for (x = 0; x < 256; ++x) - { - for (j = 0; j < 8; ++j) - { - register int temp = 0; - for (k = 0; k < 8; ++k) - { - temp += omegaPowers[(EIGHTH_N * (2 * j + 1) * ReverseBits(k, W)) % (2 * N)] - * ((x >> k) & 1); - } - - fftTable[(x << 3) + j] = Center(temp); - } - } - - wasSetupDone = true; -} - -void FFT(const unsigned char input[EIGHTH_N], swift_int32_t *output) -{ - register swift_int16_t *mult = multipliers; - register swift_int32_t F0, F1, F2, F3, F4, F5, F6, F7, F8, F9, - F10, F11, F12, F13, F14, F15, F16, F17, F18, F19, - F20, F21, F22, F23, F24, F25, F26, F27, F28, F29, - F30, F31, F32, F33, F34, F35, F36, F37, F38, F39, - F40, F41, F42, F43, F44, F45, F46, F47, F48, F49, - F50, F51, F52, F53, F54, F55, F56, F57, F58, F59, - F60, F61, F62, F63; - - // First loop unrolling: - register swift_int16_t *table = &(fftTable[input[0] << 3]); - - F0 = mult[0] * table[0]; - F8 = mult[1] * table[1]; - F16 = mult[2] * table[2]; - F24 = mult[3] * table[3]; - F32 = mult[4] * table[4]; - F40 = mult[5] * table[5]; - F48 = mult[6] * table[6]; - F56 = mult[7] * table[7]; - - mult += 8; - table = &(fftTable[input[1] << 3]); - - F1 = mult[0] * table[0]; - F9 = mult[1] * table[1]; - F17 = mult[2] * table[2]; - F25 = mult[3] * table[3]; - F33 = mult[4] * table[4]; - F41 = mult[5] * table[5]; - F49 = mult[6] * table[6]; - F57 = mult[7] * table[7]; - - mult += 8; - table = &(fftTable[input[2] << 3]); - - F2 = mult[0] * table[0]; - F10 = mult[1] * table[1]; - F18 = mult[2] * table[2]; - F26 = mult[3] * table[3]; - F34 = mult[4] * table[4]; - F42 = mult[5] * table[5]; - F50 = mult[6] * table[6]; - F58 = mult[7] * table[7]; - - mult += 8; - table = &(fftTable[input[3] << 3]); - - F3 = mult[0] * table[0]; - F11 = mult[1] * table[1]; - F19 = mult[2] * table[2]; - F27 = mult[3] * table[3]; - F35 = mult[4] * table[4]; - F43 = 
mult[5] * table[5]; - F51 = mult[6] * table[6]; - F59 = mult[7] * table[7]; - - mult += 8; - table = &(fftTable[input[4] << 3]); - - F4 = mult[0] * table[0]; - F12 = mult[1] * table[1]; - F20 = mult[2] * table[2]; - F28 = mult[3] * table[3]; - F36 = mult[4] * table[4]; - F44 = mult[5] * table[5]; - F52 = mult[6] * table[6]; - F60 = mult[7] * table[7]; - - mult += 8; - table = &(fftTable[input[5] << 3]); - - F5 = mult[0] * table[0]; - F13 = mult[1] * table[1]; - F21 = mult[2] * table[2]; - F29 = mult[3] * table[3]; - F37 = mult[4] * table[4]; - F45 = mult[5] * table[5]; - F53 = mult[6] * table[6]; - F61 = mult[7] * table[7]; - - mult += 8; - table = &(fftTable[input[6] << 3]); - - F6 = mult[0] * table[0]; - F14 = mult[1] * table[1]; - F22 = mult[2] * table[2]; - F30 = mult[3] * table[3]; - F38 = mult[4] * table[4]; - F46 = mult[5] * table[5]; - F54 = mult[6] * table[6]; - F62 = mult[7] * table[7]; - - mult += 8; - table = &(fftTable[input[7] << 3]); - - F7 = mult[0] * table[0]; - F15 = mult[1] * table[1]; - F23 = mult[2] * table[2]; - F31 = mult[3] * table[3]; - F39 = mult[4] * table[4]; - F47 = mult[5] * table[5]; - F55 = mult[6] * table[6]; - F63 = mult[7] * table[7]; - - // Second loop unrolling: - // Iteration 0: - ADD_SUB(F0, F1); - ADD_SUB(F2, F3); - ADD_SUB(F4, F5); - ADD_SUB(F6, F7); - - F3 <<= 4; - F7 <<= 4; - - ADD_SUB(F0, F2); - ADD_SUB(F1, F3); - ADD_SUB(F4, F6); - ADD_SUB(F5, F7); - - F5 <<= 2; - F6 <<= 4; - F7 <<= 6; - - ADD_SUB(F0, F4); - ADD_SUB(F1, F5); - ADD_SUB(F2, F6); - ADD_SUB(F3, F7); - - output[0] = Q_REDUCE(F0); - output[8] = Q_REDUCE(F1); - output[16] = Q_REDUCE(F2); - output[24] = Q_REDUCE(F3); - output[32] = Q_REDUCE(F4); - output[40] = Q_REDUCE(F5); - output[48] = Q_REDUCE(F6); - output[56] = Q_REDUCE(F7); - - // Iteration 1: - ADD_SUB(F8, F9); - ADD_SUB(F10, F11); - ADD_SUB(F12, F13); - ADD_SUB(F14, F15); - - F11 <<= 4; - F15 <<= 4; - - ADD_SUB(F8, F10); - ADD_SUB(F9, F11); - ADD_SUB(F12, F14); - ADD_SUB(F13, F15); - - F13 <<= 2; - F14 <<= 4; - F15 <<= 6; - - ADD_SUB(F8, F12); - ADD_SUB(F9, F13); - ADD_SUB(F10, F14); - ADD_SUB(F11, F15); - - output[1] = Q_REDUCE(F8); - output[9] = Q_REDUCE(F9); - output[17] = Q_REDUCE(F10); - output[25] = Q_REDUCE(F11); - output[33] = Q_REDUCE(F12); - output[41] = Q_REDUCE(F13); - output[49] = Q_REDUCE(F14); - output[57] = Q_REDUCE(F15); - - // Iteration 2: - ADD_SUB(F16, F17); - ADD_SUB(F18, F19); - ADD_SUB(F20, F21); - ADD_SUB(F22, F23); - - F19 <<= 4; - F23 <<= 4; - - ADD_SUB(F16, F18); - ADD_SUB(F17, F19); - ADD_SUB(F20, F22); - ADD_SUB(F21, F23); - - F21 <<= 2; - F22 <<= 4; - F23 <<= 6; - - ADD_SUB(F16, F20); - ADD_SUB(F17, F21); - ADD_SUB(F18, F22); - ADD_SUB(F19, F23); - - output[2] = Q_REDUCE(F16); - output[10] = Q_REDUCE(F17); - output[18] = Q_REDUCE(F18); - output[26] = Q_REDUCE(F19); - output[34] = Q_REDUCE(F20); - output[42] = Q_REDUCE(F21); - output[50] = Q_REDUCE(F22); - output[58] = Q_REDUCE(F23); - - // Iteration 3: - ADD_SUB(F24, F25); - ADD_SUB(F26, F27); - ADD_SUB(F28, F29); - ADD_SUB(F30, F31); - - F27 <<= 4; - F31 <<= 4; - - ADD_SUB(F24, F26); - ADD_SUB(F25, F27); - ADD_SUB(F28, F30); - ADD_SUB(F29, F31); - - F29 <<= 2; - F30 <<= 4; - F31 <<= 6; - - ADD_SUB(F24, F28); - ADD_SUB(F25, F29); - ADD_SUB(F26, F30); - ADD_SUB(F27, F31); - - output[3] = Q_REDUCE(F24); - output[11] = Q_REDUCE(F25); - output[19] = Q_REDUCE(F26); - output[27] = Q_REDUCE(F27); - output[35] = Q_REDUCE(F28); - output[43] = Q_REDUCE(F29); - output[51] = Q_REDUCE(F30); - output[59] = Q_REDUCE(F31); - - // Iteration 4: - ADD_SUB(F32, F33); 
- ADD_SUB(F34, F35); - ADD_SUB(F36, F37); - ADD_SUB(F38, F39); - - F35 <<= 4; - F39 <<= 4; - - ADD_SUB(F32, F34); - ADD_SUB(F33, F35); - ADD_SUB(F36, F38); - ADD_SUB(F37, F39); - - F37 <<= 2; - F38 <<= 4; - F39 <<= 6; - - ADD_SUB(F32, F36); - ADD_SUB(F33, F37); - ADD_SUB(F34, F38); - ADD_SUB(F35, F39); - - output[4] = Q_REDUCE(F32); - output[12] = Q_REDUCE(F33); - output[20] = Q_REDUCE(F34); - output[28] = Q_REDUCE(F35); - output[36] = Q_REDUCE(F36); - output[44] = Q_REDUCE(F37); - output[52] = Q_REDUCE(F38); - output[60] = Q_REDUCE(F39); - - // Iteration 5: - ADD_SUB(F40, F41); - ADD_SUB(F42, F43); - ADD_SUB(F44, F45); - ADD_SUB(F46, F47); - - F43 <<= 4; - F47 <<= 4; - - ADD_SUB(F40, F42); - ADD_SUB(F41, F43); - ADD_SUB(F44, F46); - ADD_SUB(F45, F47); - - F45 <<= 2; - F46 <<= 4; - F47 <<= 6; - - ADD_SUB(F40, F44); - ADD_SUB(F41, F45); - ADD_SUB(F42, F46); - ADD_SUB(F43, F47); - - output[5] = Q_REDUCE(F40); - output[13] = Q_REDUCE(F41); - output[21] = Q_REDUCE(F42); - output[29] = Q_REDUCE(F43); - output[37] = Q_REDUCE(F44); - output[45] = Q_REDUCE(F45); - output[53] = Q_REDUCE(F46); - output[61] = Q_REDUCE(F47); - - // Iteration 6: - ADD_SUB(F48, F49); - ADD_SUB(F50, F51); - ADD_SUB(F52, F53); - ADD_SUB(F54, F55); - - F51 <<= 4; - F55 <<= 4; - - ADD_SUB(F48, F50); - ADD_SUB(F49, F51); - ADD_SUB(F52, F54); - ADD_SUB(F53, F55); - - F53 <<= 2; - F54 <<= 4; - F55 <<= 6; - - ADD_SUB(F48, F52); - ADD_SUB(F49, F53); - ADD_SUB(F50, F54); - ADD_SUB(F51, F55); - - output[6] = Q_REDUCE(F48); - output[14] = Q_REDUCE(F49); - output[22] = Q_REDUCE(F50); - output[30] = Q_REDUCE(F51); - output[38] = Q_REDUCE(F52); - output[46] = Q_REDUCE(F53); - output[54] = Q_REDUCE(F54); - output[62] = Q_REDUCE(F55); - - // Iteration 7: - ADD_SUB(F56, F57); - ADD_SUB(F58, F59); - ADD_SUB(F60, F61); - ADD_SUB(F62, F63); - - F59 <<= 4; - F63 <<= 4; - - ADD_SUB(F56, F58); - ADD_SUB(F57, F59); - ADD_SUB(F60, F62); - ADD_SUB(F61, F63); - - F61 <<= 2; - F62 <<= 4; - F63 <<= 6; - - ADD_SUB(F56, F60); - ADD_SUB(F57, F61); - ADD_SUB(F58, F62); - ADD_SUB(F59, F63); - - output[7] = Q_REDUCE(F56); - output[15] = Q_REDUCE(F57); - output[23] = Q_REDUCE(F58); - output[31] = Q_REDUCE(F59); - output[39] = Q_REDUCE(F60); - output[47] = Q_REDUCE(F61); - output[55] = Q_REDUCE(F62); - output[63] = Q_REDUCE(F63); -} - -// Calculates the FFT part of SWIFFT. -// We divided the SWIFFT calculation into two, because that way we could save 2 computations of -// the FFT part, since in the first stage of SWIFFTX the difference between the first 3 SWIFFTs -// is only the A's part. -// -// Parameters: -// - input: the input to FFT. -// - m: the input size divided by 8. The function performs m FFTs. -// - output: will store the result. -void SWIFFTFFT(const unsigned char *input, int m, swift_int32_t *output) -{ - int i; - - for (i = 0; - i < m; - i++, input += EIGHTH_N, output += N) - { - FFT(input, output); - } -} - -// Calculates the 'sum' part of SWIFFT, including the base change at the end. -// We divided the SWIFFT calculation into two, because that way we could save 2 computations of -// the FFT part, since in the first stage of SWIFFTX the difference between the first 3 SWIFFTs -// is only the A's part. -// -// Parameters: -// - input: the input. Of size 64 * m. -// - m: the input size divided by 64. -// - output: will store the result. -// - a: the coefficients in the sum. Of size 64 * m. 
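// Editorial sketch, not part of the original sources or of this patch: how the
// FFT half above and the sum half below are meant to be combined.  The first
// stage of SWIFFTX runs the expensive FFT pass once and reuses it for three
// sums that differ only in their coefficient bank, which is exactly what
// ComputeSingleSWIFFTX does further down.  M, N, EIGHTH_N, As and the swift_*
// types come from headers outside this hunk; the function and buffer names
// here are invented, and the three results are laid out back to back instead
// of with the carry-byte packing that ComputeSingleSWIFFTX performs.

void SWIFFTSum( const swift_int32_t *input, int m, unsigned char *output,
                const swift_int16_t *a );    // defined just below

static void swifftx_first_stage_sketch( const unsigned char in[/* M * EIGHTH_N */],
                                        unsigned char out[/* 3 * (N + 1) */] )
{
    swift_int32_t fft[ N * M ];   // FFT of the input, shared by all three sums

    SWIFFTFFT( in, M, fft );                                    // one FFT pass
    SWIFFTSum( fft, M, out,                 As             );   // sum #1
    SWIFFTSum( fft, M, out +     (N + 1),   As +     M * N );   // sum #2
    SWIFFTSum( fft, M, out + 2 * (N + 1),   As + 2 * M * N );   // sum #3
}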
-void SWIFFTSum(const swift_int32_t *input, int m, unsigned char *output, const swift_int16_t *a) -{ - int i, j; - swift_int32_t result[N]; - register swift_int16_t carry = 0; - - for (j = 0; j < N; ++j) - { - register swift_int32_t sum = 0; - const register swift_int32_t *f = input + j; - const register swift_int16_t *k = a + j; - - for (i = 0; i < m; i++, f += N,k += N) - { - sum += (*f) * (*k); - } - - result[j] = sum; - } - - for (j = 0; j < N; ++j) - { - result[j] = ((FIELD_SIZE << 22) + result[j]) % FIELD_SIZE; - } - - for (j = 0; j < 8; ++j) - { - int register carryBit = TranslateToBase256(result + (j << 3), output + (j << 3)); - carry |= carryBit << j; - } - - output[N] = carry; -} - -void ComputeSingleSWIFFTX(unsigned char input[SWIFFTX_INPUT_BLOCK_SIZE], - unsigned char output[SWIFFTX_OUTPUT_BLOCK_SIZE], - bool doSmooth) -{ - int i; - // Will store the result of the FFT parts: - swift_int32_t fftOut[N * M]; - unsigned char intermediate[N * 3 + 8]; - unsigned char carry0,carry1,carry2; - - // Do the three SWIFFTS while remembering the three carry bytes (each carry byte gets - // overriden by the following SWIFFT): - - // 1. Compute the FFT of the input - the common part for the first 3 SWIFFTs: - SWIFFTFFT(input, M, fftOut); - - // 2. Compute the sums of the 3 SWIFFTs, each using a different set of coefficients: - - // 2a. The first SWIFFT: - SWIFFTSum(fftOut, M, intermediate, As); - // Remember the carry byte: - carry0 = intermediate[N]; - - // 2b. The second one: - SWIFFTSum(fftOut, M, intermediate + N, As + (M * N)); - carry1 = intermediate[2 * N]; - - // 2c. The third one: - SWIFFTSum(fftOut, M, intermediate + (2 * N), As + 2 * (M * N)); - carry2 = intermediate[3 * N]; - - //2d. Put three carry bytes in their place - intermediate[3 * N] = carry0; - intermediate[(3 * N) + 1] = carry1; - intermediate[(3 * N) + 2] = carry2; - - // Padding intermediate output with 5 zeroes. - memset(intermediate + (3 * N) + 3, 0, 5); - - // Apply the S-Box: - for (i = 0; i < (3 * N) + 8; ++i) - { - intermediate[i] = SBox[intermediate[i]]; - } - - // 3. The final and last SWIFFT: - SWIFFTFFT(intermediate, 3 * (N/8) + 1, fftOut); - SWIFFTSum(fftOut, 3 * (N/8) + 1, output, As); - - if (doSmooth) - { - unsigned char sum[N]; - register int i, j; - memset(sum, 0, N); - - for (i = 0; i < (N + 1) * 8; ++i) - { - register const swift_int16_t *AsRow; - register int AShift; - - if (!(output[i >> 3] & (1 << (i & 7)))) - { - continue; - } - - AsRow = As + N * M + (i & ~(N - 1)) ; - AShift = i & 63; - - for (j = AShift; j < N; ++j) - { - sum[j] += AsRow[j - AShift]; - } - - for(j = 0; j < AShift; ++j) - { - sum[j] -= AsRow[N - AShift + j]; - } - } - - for (i = 0; i < N; ++i) - { - output[i] = sum[i]; - } - - output[N] = 0; - } -} diff --git a/algo/swifftx/swifftx.h b/algo/swifftx/swifftx.h index eedbc8f0..ad2214a6 100644 --- a/algo/swifftx/swifftx.h +++ b/algo/swifftx/swifftx.h @@ -61,11 +61,10 @@ void ComputeSingleSWIFFT(unsigned char *input, unsigned short m, // // Returns: // - Success value. 
-void ComputeSingleSWIFFTX( unsigned char input[SWIFFTX_INPUT_BLOCK_SIZE], - unsigned char output[SWIFFTX_OUTPUT_BLOCK_SIZE] ); +void ComputeSingleSWIFFTX( unsigned char *input, unsigned char *output ); -void ComputeSingleSWIFFTX_smooth( unsigned char input[SWIFFTX_INPUT_BLOCK_SIZE], - unsigned char output[SWIFFTX_OUTPUT_BLOCK_SIZE], bool doSmooth); +//void ComputeSingleSWIFFTX_smooth( unsigned char input[SWIFFTX_INPUT_BLOCK_SIZE], +// unsigned char output[SWIFFTX_OUTPUT_BLOCK_SIZE], bool doSmooth); // Calculates the powers of OMEGA and generates the bit reversal permutation. // You must call this function before doing SWIFFT/X, otherwise you will get zeroes everywhere. diff --git a/algo/verthash/Verthash.c b/algo/verthash/Verthash.c new file mode 100644 index 00000000..553bb6a6 --- /dev/null +++ b/algo/verthash/Verthash.c @@ -0,0 +1,743 @@ +/* + * Copyright 2018-2021 CryptoGraphics + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. See LICENSE for more details. + */ + +#include "algo-gate-api.h" +#include "Verthash.h" +#include "mm_malloc.h" +#include "malloc-huge.h" + +//----------------------------------------------------------------------------- +// Verthash info management +int verthash_info_init(verthash_info_t* info, const char* file_name) +{ + // init fields to 0 + info->fileName = NULL; + info->data = NULL; + info->dataSize = 0; + info->bitmask = 0; + size_t fileNameLen; + + if ( !file_name || !( fileNameLen = strlen( file_name ) ) ) + { + applog( LOG_ERR, "Invalid file specification" ); + return -1; + } + + info->fileName = (char*)malloc( fileNameLen + 1 ); + if ( !info->fileName ) + { + applog( LOG_ERR, "Failed to allocate memory for Verthash data" ); + return -1; + } + + memset( info->fileName, 0, fileNameLen + 1 ); + memcpy( info->fileName, file_name, fileNameLen ); + + FILE *fileMiningData = fopen_utf8( info->fileName, "rb" ); + if ( !fileMiningData ) + { + if ( opt_data_file || !opt_verify ) + { + if ( opt_data_file ) + applog( LOG_ERR, "Verthash data file not found or invalid: %s", + info->fileName ); + else + { + applog( LOG_ERR, + "No Verthash data file specified and default not found"); + applog( LOG_NOTICE, + "Add '--verify' to create default 'verthash.dat'"); + } + return -1; + } + else + { + applog( LOG_NOTICE, "Creating default 'verthash.dat' in current directory, this will take several minutes"); + if ( verthash_generate_data_file( info->fileName ) ) + return -1; + + fileMiningData = fopen_utf8( info->fileName, "rb" ); + if ( !fileMiningData ) + { + applog( LOG_ERR, "File system error opening %s", info->fileName ); + return -1; + } + + applog( LOG_NOTICE, "Verthash data file created successfully" ); + } + } + + // Get file size + fseek(fileMiningData, 0, SEEK_END); + int fileSize = ftell(fileMiningData); + fseek(fileMiningData, 0, SEEK_SET); + + if ( fileSize < 0 ) + { + fclose(fileMiningData); + return 1; + } + + // Allocate data + info->data = (uint8_t *)malloc_hugepages( fileSize ); + if ( info->data ) + { + if ( !opt_quiet ) applog( LOG_INFO, "Verthash data is using huge pages"); + } + else + info->data = (uint8_t *)_mm_malloc( fileSize, 64 ); + + if ( !info->data ) + { + fclose( fileMiningData ); + // Memory allocation fatal error. 
+ return 2; + } + + // Load data + if ( !fread( info->data, fileSize, 1, fileMiningData ) ) + { + applog( LOG_ERR, "File system error reading %s", info->fileName ); + fclose(fileMiningData); + return -1; + } + + fclose(fileMiningData); + + // Update fields + info->bitmask = ((fileSize - VH_HASH_OUT_SIZE)/VH_BYTE_ALIGNMENT) + 1; + info->dataSize = fileSize; + + applog( LOG_NOTICE, "Using Verthash data file '%s'", info->fileName ); + return 0; +} + +//----------------------------------------------------------------------------- +void verthash_info_free(verthash_info_t* info) +{ + free(info->fileName); + free(info->data); + info->dataSize = 0; + info->bitmask = 0; +} + + +//----------------------------------------------------------------------------- +// Verthash hash +#define VH_P0_SIZE 64 +#define VH_N_ITER 8 +#define VH_N_SUBSET VH_P0_SIZE*VH_N_ITER +#define VH_N_ROT 32 +#define VH_N_INDEXES 4096 +#define VH_BYTE_ALIGNMENT 16 + +static inline uint32_t fnv1a(const uint32_t a, const uint32_t b) +{ + return (a ^ b) * 0x1000193; +} + +#if 0 +static void rotate_indexes( uint32_t *p ) +{ +#if defined(__AVX2__) + + for ( size_t x = 0; x < VH_N_SUBSET / sizeof(__m256i); x += 8 ) + { + __m256i *px = (__m256i*)p + x; + + px[0] = mm256_rol_32( px[0], 1 ); + px[1] = mm256_rol_32( px[1], 1 ); + px[2] = mm256_rol_32( px[2], 1 ); + px[3] = mm256_rol_32( px[3], 1 ); + px[4] = mm256_rol_32( px[4], 1 ); + px[5] = mm256_rol_32( px[5], 1 ); + px[6] = mm256_rol_32( px[6], 1 ); + px[7] = mm256_rol_32( px[7], 1 ); + } + +#else + + for ( size_t x = 0; x < VH_N_SUBSET / sizeof(__m128i); x += 8 ) + { + __m128i *px = (__m128i*)p0_index + x; + + px[0] = mm128_rol_32( px[0], 1 ); + px[1] = mm128_rol_32( px[1], 1 ); + px[2] = mm128_rol_32( px[2], 1 ); + px[3] = mm128_rol_32( px[3], 1 ); + px[4] = mm128_rol_32( px[4], 1 ); + px[5] = mm128_rol_32( px[5], 1 ); + px[6] = mm128_rol_32( px[6], 1 ); + px[7] = mm128_rol_32( px[7], 1 ); + } + +#endif +/* + for ( size_t x = 0; x < VH_N_SUBSET / sizeof(uint32_t); ++x ) + p[x] = ( p[x] << 1 ) | ( p[x] >> 31 ); +*/ +} +#endif +// Vectorized and targetted version of fnv1a +#if defined (__AVX2__) + +#define MULXOR \ + *(__m256i*)hash = _mm256_mullo_epi32( _mm256_xor_si256( \ + *(__m256i*)hash, *(__m256i*)blob_off ), k ); + +#elif defined(__SSE41__) + +#define MULXOR \ + casti_m128i( hash, 0 ) = _mm_mullo_epi32( _mm_xor_si128( \ + casti_m128i( hash, 0 ), casti_m128i( blob_off, 0 ) ), k ); \ + casti_m128i( hash, 1 ) = _mm_mullo_epi32( _mm_xor_si128( \ + casti_m128i( hash, 1 ), casti_m128i( blob_off, 1 ) ), k ); + +#else + +#define MULXOR \ + for ( size_t j = 0; j < VH_HASH_OUT_SIZE / sizeof(uint32_t); j++ ) \ + hash[j] = fnv1a( hash[j], blob_off[j] ); \ + +#endif + +#define UPDATE_ACCUMULATOR \ + accumulator = fnv1a( accumulator, blob_off[0] ); \ + accumulator = fnv1a( accumulator, blob_off[1] ); \ + accumulator = fnv1a( accumulator, blob_off[2] ); \ + accumulator = fnv1a( accumulator, blob_off[3] ); \ + accumulator = fnv1a( accumulator, blob_off[4] ); \ + accumulator = fnv1a( accumulator, blob_off[5] ); \ + accumulator = fnv1a( accumulator, blob_off[6] ); \ + accumulator = fnv1a( accumulator, blob_off[7] ) + + +// first pass no rotate +#define ROUND_0 \ +for ( size_t i = 0; i < VH_N_SUBSET / sizeof(uint32_t); i++ ) \ +{ \ + const uint32_t *blob_off = blob + \ + ( ( fnv1a( subset[i], accumulator ) % mdiv ) \ + * ( VH_BYTE_ALIGNMENT / sizeof(uint32_t) ) ); \ + UPDATE_ACCUMULATOR; \ + MULXOR; \ +} + +// subsequent passes rotate by r on demand, no need for mass rotate +#define ROUND_r( r 
) \ +for ( size_t i = 0; i < VH_N_SUBSET / sizeof(uint32_t); i++ ) \ +{ \ + const uint32_t *blob_off = blob + \ + ( ( fnv1a( rol32( subset[i], r ), accumulator ) % mdiv ) \ + * ( VH_BYTE_ALIGNMENT / sizeof(uint32_t) ) ); \ + UPDATE_ACCUMULATOR; \ + MULXOR; \ +} + +void verthash_hash( const void *blob_bytes, const size_t blob_size, + const void *input, void *output ) +{ + uint32_t hash[ VH_HASH_OUT_SIZE / 4 ] __attribute__ ((aligned (64))); + uint32_t subset[ VH_N_SUBSET / 4 ] __attribute__ ((aligned (64))); + const uint32_t *blob = (const uint32_t*)blob_bytes; + uint32_t accumulator = 0x811c9dc5; + const uint32_t mdiv = ( ( blob_size - VH_HASH_OUT_SIZE ) + / VH_BYTE_ALIGNMENT ) + 1; +#if defined (__AVX2__) + const __m256i k = _mm256_set1_epi32( 0x1000193 ); +#elif defined(__SSE41__) + const __m128i k = _mm_set1_epi32( 0x1000193 ); +#endif + + sha3( input, VH_HEADER_SIZE, hash, VH_HASH_OUT_SIZE ); + verthash_sha3_512_final_8( subset, ( (uint64_t*)input )[ 9 ] ); + + ROUND_0; + for ( size_t r = 1; r < VH_N_ROT; ++r ) + ROUND_r( r ); + + memcpy( output, hash, VH_HASH_OUT_SIZE ); +} + +//----------------------------------------------------------------------------- +// Verthash data file generator + +#define NODE_SIZE 32 + +struct Graph +{ + FILE *db; + int64_t log2; + int64_t pow2; + uint8_t *pk; + int64_t index; +}; + +int64_t Log2(int64_t x) +{ + int64_t r = 0; + for (; x > 1; x >>= 1) + { + r++; + } + + return r; +} + +int64_t bfsToPost(struct Graph *g, const int64_t node) +{ + return node & ~g->pow2; +} + +int64_t numXi(int64_t index) +{ + return (1 << ((uint64_t)index)) * (index + 1) * index; +} + +void WriteId(struct Graph *g, uint8_t *Node, const int64_t id) +{ + fseek(g->db, id * NODE_SIZE, SEEK_SET); + fwrite(Node, 1, NODE_SIZE, g->db); +} + +void WriteNode(struct Graph *g, uint8_t *Node, const int64_t id) +{ + const int64_t idx = bfsToPost(g, id); + WriteId(g, Node, idx); +} + +void NewNode(struct Graph *g, const int64_t id, uint8_t *hash) +{ + WriteNode(g, hash, id); +} + +uint8_t *GetId(struct Graph *g, const int64_t id) +{ + fseek(g->db, id * NODE_SIZE, SEEK_SET); + uint8_t *node = (uint8_t *)malloc(NODE_SIZE); + const size_t bytes_read = fread(node, 1, NODE_SIZE, g->db); + if(bytes_read != NODE_SIZE) { + return NULL; + } + return node; +} + +uint8_t *GetNode(struct Graph *g, const int64_t id) +{ + const int64_t idx = bfsToPost(g, id); + return GetId(g, idx); +} + +uint32_t WriteVarInt(uint8_t *buffer, int64_t val) +{ + memset(buffer, 0, NODE_SIZE); + uint64_t uval = ((uint64_t)(val)) << 1; + if (val < 0) + { + uval = ~uval; + } + uint32_t i = 0; + while (uval >= 0x80) + { + buffer[i] = (uint8_t)uval | 0x80; + uval >>= 7; + i++; + } + buffer[i] = (uint8_t)uval; + return i; +} + +void ButterflyGraph(struct Graph *g, int64_t index, int64_t *count) +{ + if (index == 0) + { + index = 1; + } + + int64_t numLevel = 2 * index; + int64_t perLevel = (int64_t)(1 << (uint64_t)index); + int64_t begin = *count - perLevel; + int64_t level, i; + + for (level = 1; level < numLevel; level++) + { + for (i = 0; i < perLevel; i++) + { + int64_t prev; + int64_t shift = index - level; + if (level > numLevel / 2) + { + shift = level - numLevel / 2; + } + if (((i >> (uint64_t)shift) & 1) == 0) + { + prev = i + (1 << (uint64_t)shift); + } + else + { + prev = i - (1 << (uint64_t)shift); + } + + uint8_t *parent0 = GetNode(g, begin + (level - 1) * perLevel + prev); + uint8_t *parent1 = GetNode(g, *count - perLevel); + uint8_t *buf = (uint8_t *)malloc(NODE_SIZE); + WriteVarInt(buf, *count); + uint8_t 
*hashInput = (uint8_t *)malloc(NODE_SIZE * 4); + memcpy(hashInput, g->pk, NODE_SIZE); + memcpy(hashInput + NODE_SIZE, buf, NODE_SIZE); + memcpy(hashInput + (NODE_SIZE * 2), parent0, NODE_SIZE); + memcpy(hashInput + (NODE_SIZE * 3), parent1, NODE_SIZE); + + uint8_t *hashOutput = (uint8_t *)malloc(NODE_SIZE); + sha3(hashInput, NODE_SIZE * 4, hashOutput, NODE_SIZE); + + NewNode(g, *count, hashOutput); + (*count)++; + + free(hashOutput); + free(hashInput); + free(parent0); + free(parent1); + free(buf); + } + } +} + +void XiGraphIter(struct Graph *g, int64_t index) +{ + int64_t count = g->pow2; + + int8_t stackSize = 5; + int64_t *stack = (int64_t *)malloc(sizeof(int64_t) * stackSize); + for (int i = 0; i < 5; i++) + stack[i] = index; + + int8_t graphStackSize = 5; + int32_t *graphStack = (int32_t *)malloc(sizeof(int32_t) * graphStackSize); + for (int i = 0; i < 5; i++) + graphStack[i] = graphStackSize - i - 1; + + int64_t i = 0; + int64_t graph = 0; + int64_t pow2index = 1 << ((uint64_t)index); + + for (i = 0; i < pow2index; i++) + { + uint8_t *buf = (uint8_t *)malloc(NODE_SIZE); + WriteVarInt(buf, count); + uint8_t *hashInput = (uint8_t *)malloc(NODE_SIZE * 2); + memcpy(hashInput, g->pk, NODE_SIZE); + memcpy(hashInput + NODE_SIZE, buf, NODE_SIZE); + uint8_t *hashOutput = (uint8_t *)malloc(NODE_SIZE); + + sha3(hashInput, NODE_SIZE * 2, hashOutput, NODE_SIZE); + NewNode(g, count, hashOutput); + count++; + + free(hashOutput); + free(hashInput); + free(buf); + } + + if (index == 1) + { + ButterflyGraph(g, index, &count); + return; + } + + while (stackSize != 0 && graphStackSize != 0) + { + + index = stack[stackSize - 1]; + graph = graphStack[graphStackSize - 1]; + + stackSize--; + if (stackSize > 0) + { + int64_t *tempStack = (int64_t *)malloc(sizeof(int64_t) * (stackSize)); + memcpy(tempStack, stack, sizeof(int64_t) * (stackSize)); + free(stack); + stack = tempStack; + } + + graphStackSize--; + if (graphStackSize > 0) + { + int32_t *tempGraphStack = (int32_t *)malloc(sizeof(int32_t) * (graphStackSize)); + memcpy(tempGraphStack, graphStack, sizeof(int32_t) * (graphStackSize)); + free(graphStack); + graphStack = tempGraphStack; + } + + int8_t indicesSize = 5; + int64_t *indices = (int64_t *)malloc(sizeof(int64_t) * indicesSize); + for (int i = 0; i < indicesSize; i++) + indices[i] = index - 1; + + int8_t graphsSize = 5; + int32_t *graphs = (int32_t *)malloc(sizeof(int32_t) * graphsSize); + for (int i = 0; i < graphsSize; i++) + graphs[i] = graphsSize - i - 1; + + int64_t pow2indexInner = 1 << ((uint64_t)index); + int64_t pow2indexInner_1 = 1 << ((uint64_t)index - 1); + + if (graph == 0) + { + uint64_t sources = count - pow2indexInner; + for (i = 0; i < pow2indexInner_1; i++) + { + uint8_t *parent0 = GetNode(g, sources + i); + uint8_t *parent1 = GetNode(g, sources + i + pow2indexInner_1); + + uint8_t *buf = (uint8_t *)malloc(NODE_SIZE); + WriteVarInt(buf, count); + + uint8_t *hashInput = (uint8_t *)malloc(NODE_SIZE * 4); + memcpy(hashInput, g->pk, NODE_SIZE); + memcpy(hashInput + NODE_SIZE, buf, NODE_SIZE); + memcpy(hashInput + (NODE_SIZE * 2), parent0, NODE_SIZE); + memcpy(hashInput + (NODE_SIZE * 3), parent1, NODE_SIZE); + + uint8_t *hashOutput = (uint8_t *)malloc(NODE_SIZE); + sha3(hashInput, NODE_SIZE * 4, hashOutput, NODE_SIZE); + + NewNode(g, count, hashOutput); + count++; + + free(hashOutput); + free(hashInput); + free(parent0); + free(parent1); + free(buf); + } + } + else if (graph == 1) + { + uint64_t firstXi = count; + for (i = 0; i < pow2indexInner_1; i++) + { + uint64_t nodeId = firstXi 
+ i; + uint8_t *parent = GetNode(g, firstXi - pow2indexInner_1 + i); + + uint8_t *buf = (uint8_t *)malloc(NODE_SIZE); + WriteVarInt(buf, nodeId); + + uint8_t *hashInput = (uint8_t *)malloc(NODE_SIZE * 3); + memcpy(hashInput, g->pk, NODE_SIZE); + memcpy(hashInput + NODE_SIZE, buf, NODE_SIZE); + memcpy(hashInput + (NODE_SIZE * 2), parent, NODE_SIZE); + + uint8_t *hashOutput = (uint8_t *)malloc(NODE_SIZE); + sha3(hashInput, NODE_SIZE * 3, hashOutput, NODE_SIZE); + + NewNode(g, count, hashOutput); + count++; + + free(hashOutput); + free(hashInput); + free(parent); + free(buf); + } + } + else if (graph == 2) + { + uint64_t secondXi = count; + for (i = 0; i < pow2indexInner_1; i++) + { + uint64_t nodeId = secondXi + i; + uint8_t *parent = GetNode(g, secondXi - pow2indexInner_1 + i); + + uint8_t *buf = (uint8_t *)malloc(NODE_SIZE); + WriteVarInt(buf, nodeId); + + uint8_t *hashInput = (uint8_t *)malloc(NODE_SIZE * 3); + memcpy(hashInput, g->pk, NODE_SIZE); + memcpy(hashInput + NODE_SIZE, buf, NODE_SIZE); + memcpy(hashInput + (NODE_SIZE * 2), parent, NODE_SIZE); + + uint8_t *hashOutput = (uint8_t *)malloc(NODE_SIZE); + sha3(hashInput, NODE_SIZE * 3, hashOutput, NODE_SIZE); + + NewNode(g, count, hashOutput); + count++; + + free(hashOutput); + free(hashInput); + free(parent); + free(buf); + } + } + else if (graph == 3) + { + uint64_t secondButter = count; + for (i = 0; i < pow2indexInner_1; i++) + { + uint64_t nodeId = secondButter + i; + uint8_t *parent = GetNode(g, secondButter - pow2indexInner_1 + i); + + uint8_t *buf = (uint8_t *)malloc(NODE_SIZE); + WriteVarInt(buf, nodeId); + + uint8_t *hashInput = (uint8_t *)malloc(NODE_SIZE * 3); + memcpy(hashInput, g->pk, NODE_SIZE); + memcpy(hashInput + NODE_SIZE, buf, NODE_SIZE); + memcpy(hashInput + (NODE_SIZE * 2), parent, NODE_SIZE); + + uint8_t *hashOutput = (uint8_t *)malloc(NODE_SIZE); + sha3(hashInput, NODE_SIZE * 3, hashOutput, NODE_SIZE); + + NewNode(g, count, hashOutput); + count++; + + free(hashOutput); + free(hashInput); + free(parent); + free(buf); + } + } + else + { + uint64_t sinks = count; + uint64_t sources = sinks + pow2indexInner - numXi(index); + for (i = 0; i < pow2indexInner_1; i++) + { + uint64_t nodeId0 = sinks + i; + uint64_t nodeId1 = sinks + i + pow2indexInner_1; + uint8_t *parent0 = GetNode(g, sinks - pow2indexInner_1 + i); + uint8_t *parent1_0 = GetNode(g, sources + i); + uint8_t *parent1_1 = GetNode(g, sources + i + pow2indexInner_1); + + uint8_t *buf = (uint8_t *)malloc(NODE_SIZE); + WriteVarInt(buf, nodeId0); + + uint8_t *hashInput = (uint8_t *)malloc(NODE_SIZE * 4); + memcpy(hashInput, g->pk, NODE_SIZE); + memcpy(hashInput + NODE_SIZE, buf, NODE_SIZE); + memcpy(hashInput + (NODE_SIZE * 2), parent0, NODE_SIZE); + memcpy(hashInput + (NODE_SIZE * 3), parent1_0, NODE_SIZE); + + uint8_t *hashOutput0 = (uint8_t *)malloc(NODE_SIZE); + sha3(hashInput, NODE_SIZE * 4, hashOutput0, NODE_SIZE); + + WriteVarInt(buf, nodeId1); + + memcpy(hashInput, g->pk, NODE_SIZE); + memcpy(hashInput + NODE_SIZE, buf, NODE_SIZE); + memcpy(hashInput + (NODE_SIZE * 2), parent0, NODE_SIZE); + memcpy(hashInput + (NODE_SIZE * 3), parent1_1, NODE_SIZE); + + uint8_t *hashOutput1 = (uint8_t *)malloc(NODE_SIZE); + sha3(hashInput, NODE_SIZE * 4, hashOutput1, NODE_SIZE); + + NewNode(g, nodeId0, hashOutput0); + NewNode(g, nodeId1, hashOutput1); + count += 2; + + free(parent0); + free(parent1_0); + free(parent1_1); + free(buf); + free(hashInput); + free(hashOutput0); + free(hashOutput1); + } + } + + if ((graph == 0 || graph == 3) || + ((graph == 1 || graph == 2) && 
index == 2)) + { + ButterflyGraph(g, index - 1, &count); + } + else if (graph == 1 || graph == 2) + { + + int64_t *tempStack = (int64_t *)malloc(sizeof(int64_t) * (stackSize + indicesSize)); + memcpy(tempStack, stack, stackSize * sizeof(int64_t)); + memcpy(tempStack + stackSize, indices, indicesSize * sizeof(int64_t)); + stackSize += indicesSize; + free(stack); + stack = tempStack; + + int32_t *tempGraphStack = (int32_t *)malloc(sizeof(int32_t) * (graphStackSize + graphsSize)); + memcpy(tempGraphStack, graphStack, graphStackSize * sizeof(int32_t)); + memcpy(tempGraphStack + graphStackSize, graphs, graphsSize * sizeof(int32_t)); + graphStackSize += graphsSize; + free(graphStack); + graphStack = tempGraphStack; + } + + free(indices); + free(graphs); + } + + free(stack); + free(graphStack); +} + +struct Graph *NewGraph(int64_t index, const char* targetFile, uint8_t *pk) +{ + uint8_t exists = 0; + FILE *db; + if ((db = fopen_utf8(targetFile, "r")) != NULL) + { + fclose(db); + exists = 1; + } + + db = fopen_utf8(targetFile, "wb+"); + int64_t size = numXi(index); + int64_t log2 = Log2(size) + 1; + int64_t pow2 = 1 << ((uint64_t)log2); + + struct Graph *g = (struct Graph *)malloc(sizeof(struct Graph)); + + if ( !g ) return NULL; + + g->db = db; + g->log2 = log2; + g->pow2 = pow2; + g->pk = pk; + g->index = index; + + if (exists == 0) + { + XiGraphIter(g, index); + } + + fclose(db); + return g; +} + +//----------------------------------------------------------------------------- + +// use info for _mm_malloc, then verify file +int verthash_generate_data_file(const char* output_file_name) +{ + const char *hashInput = "Verthash Proof-of-Space Datafile"; + uint8_t *pk = (uint8_t*)malloc( NODE_SIZE ); + + if ( !pk ) + { + applog( LOG_ERR, "Verthash data memory allocation failed"); + return -1; + } + + sha3( hashInput, 32, pk, NODE_SIZE ); + + int64_t index = 17; + if ( !NewGraph( index, output_file_name, pk ) ) + { + applog( LOG_ERR, "Verthash file creation failed"); + return -1; + } + + return 0; +} + diff --git a/algo/verthash/Verthash.h b/algo/verthash/Verthash.h new file mode 100644 index 00000000..e3e4029b --- /dev/null +++ b/algo/verthash/Verthash.h @@ -0,0 +1,57 @@ +/* + * Copyright 2018-2021 CryptoGraphics + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. See LICENSE for more details. + */ + +#ifndef Verthash_INCLUDE_ONCE +#define Verthash_INCLUDE_ONCE + +#include "tiny_sha3/sha3.h" +#include "fopen_utf8.h" + +#include +#include +#include +#include + +// Verthash constants used to compute bitmask, used inside kernel during IO pass +#define VH_HASH_OUT_SIZE 32 +#define VH_BYTE_ALIGNMENT 16 +#define VH_HEADER_SIZE 80 + +//----------------------------------------------------------------------------- +// Verthash data +//! Verthash C api for data maniputation. +typedef struct VerthashInfo +{ + char* fileName; + uint8_t* data; + uint64_t dataSize; + uint32_t bitmask; +} verthash_info_t; + +//! Must be called before usage. Reset all fields and set a mining data file name. +//! Error codes +//! 0 - Success(No error). +//! 1 - File name is invalid. +//! 2 - Memory allocation error +int verthash_info_init(verthash_info_t* info, const char* file_name); + +//! Reset all fields and free allocated data. +void verthash_info_free(verthash_info_t* info); + +//! 
Generate verthash data file and save it to specified location. +int verthash_generate_data_file(const char* output_file_name); + +void verthash_hash( const void *blob_bytes, const size_t blob_size, + const void *input, void *output ); + +void verthash_sha3_512_prehash_72( const void *input ); +void verthash_sha3_512_final_8( void *hash, const uint64_t nonce ); + +#endif // !Verthash_INCLUDE_ONCE + diff --git a/algo/verthash/fopen_utf8.c b/algo/verthash/fopen_utf8.c new file mode 100644 index 00000000..e2bd4b1d --- /dev/null +++ b/algo/verthash/fopen_utf8.c @@ -0,0 +1,181 @@ +#ifndef H_FOPEN_UTF8 +#define H_FOPEN_UTF8 + +#include "fopen_utf8.h" +#include +#include +#include +#include + +int utf8_char_size(const uint8_t *c) +{ + const uint8_t m0x = 0x80, c0x = 0x00, + m10x = 0xC0, c10x = 0x80, + m110x = 0xE0, c110x = 0xC0, + m1110x = 0xF0, c1110x = 0xE0, + m11110x = 0xF8, c11110x = 0xF0; + + if ((c[0] & m0x) == c0x) + return 1; + + if ((c[0] & m110x) == c110x) + if ((c[1] & m10x) == c10x) + return 2; + + if ((c[0] & m1110x) == c1110x) + if ((c[1] & m10x) == c10x) + if ((c[2] & m10x) == c10x) + return 3; + + if ((c[0] & m11110x) == c11110x) + if ((c[1] & m10x) == c10x) + if ((c[2] & m10x) == c10x) + if ((c[3] & m10x) == c10x) + return 4; + + if ((c[0] & m10x) == c10x) // not a first UTF-8 byte + return 0; + + return -1; // if c[0] is a first byte but the other bytes don't match +} + +uint32_t utf8_to_unicode32(const uint8_t *c, size_t *index) +{ + uint32_t v; + int size; + const uint8_t m6 = 63, m5 = 31, m4 = 15, m3 = 7; + + if (c==NULL) + return 0; + + size = utf8_char_size(c); + + if (size > 0 && index) + *index += size-1; + + switch (size) + { + case 1: + v = c[0]; + break; + case 2: + v = c[0] & m5; + v = v << 6 | (c[1] & m6); + break; + case 3: + v = c[0] & m4; + v = v << 6 | (c[1] & m6); + v = v << 6 | (c[2] & m6); + break; + case 4: + v = c[0] & m3; + v = v << 6 | (c[1] & m6); + v = v << 6 | (c[2] & m6); + v = v << 6 | (c[3] & m6); + break; + case 0: // not a first UTF-8 byte + case -1: // corrupt UTF-8 letter + default: + v = -1; + break; + } + + return v; +} + +int codepoint_utf16_size(uint32_t c) +{ + if (c < 0x10000) return 1; + if (c < 0x110000) return 2; + + return 0; +} + +uint16_t *sprint_utf16(uint16_t *str, uint32_t c) // str must be able to hold 1 to 3 entries and will be null-terminated by this function +{ + int c_size; + + if (str==NULL) + return NULL; + + c_size = codepoint_utf16_size(c); + + switch (c_size) + { + case 1: + str[0] = c; + if (c > 0) + str[1] = '\0'; + break; + + case 2: + c -= 0x10000; + str[0] = 0xD800 + (c >> 10); + str[1] = 0xDC00 + (c & 0x3FF); + str[2] = '\0'; + break; + + default: + str[0] = '\0'; + } + + return str; +} + +size_t strlen_utf8_to_utf16(const uint8_t *str) +{ + size_t i, count; + uint32_t c; + + for (i=0, count=0; ; i++) + { + if (str[i]==0) + return count; + + c = utf8_to_unicode32(&str[i], &i); + count += codepoint_utf16_size(c); + } +} + +uint16_t *utf8_to_utf16(const uint8_t *utf8, uint16_t *utf16) +{ + size_t i, j; + uint32_t c; + + if (utf8==NULL) + return NULL; + + if (utf16==NULL) + utf16 = (uint16_t *) calloc(strlen_utf8_to_utf16(utf8) + 1, sizeof(uint16_t)); + + for (i=0, j=0, c=1; c; i++) + { + c = utf8_to_unicode32(&utf8[i], &i); + sprint_utf16(&utf16[j], c); + j += codepoint_utf16_size(c); + } + + return utf16; +} + +FILE *fopen_utf8(const char *path, const char *mode) +{ + #ifdef _WIN32 + wchar_t *wpath, wmode[8]; + FILE *file; + + if (utf8_to_utf16((const uint8_t *) mode, (uint16_t *) wmode)==NULL) + return NULL; + + 
wpath = (wchar_t *) utf8_to_utf16((const uint8_t *) path, NULL); + if (wpath==NULL) + return NULL; + + file = _wfopen(wpath, wmode); + free(wpath); + return file; + #else + return fopen(path, mode); + #endif +} +#endif diff --git a/algo/verthash/fopen_utf8.h b/algo/verthash/fopen_utf8.h new file mode 100644 index 00000000..0547313f --- /dev/null +++ b/algo/verthash/fopen_utf8.h @@ -0,0 +1,25 @@ +#ifndef H_FOPEN_UTF8 +#define H_FOPEN_UTF8 +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include +#include +#include + +int utf8_char_size(const uint8_t *c); +uint32_t utf8_to_unicode32(const uint8_t *c, size_t *index); +int codepoint_utf16_size(uint32_t c); +uint16_t *sprint_utf16(uint16_t *str, uint32_t c); +size_t strlen_utf8_to_utf16(const uint8_t *str); +uint16_t *utf8_to_utf16(const uint8_t *utf8, uint16_t *utf16); + +FILE *fopen_utf8(const char *path, const char *mode); + +#ifdef __cplusplus +} +#endif +#endif + diff --git a/algo/verthash/tiny_sha3/sha3-4way.c b/algo/verthash/tiny_sha3/sha3-4way.c new file mode 100644 index 00000000..debbd775 --- /dev/null +++ b/algo/verthash/tiny_sha3/sha3-4way.c @@ -0,0 +1,292 @@ +#if defined(__AVX2__) + +// sha3-4way.c +// 19-Nov-11 Markku-Juhani O. Saarinen +// vectorization by JayDDee 2021-03-27 +// +// Revised 07-Aug-15 to match with official release of FIPS PUB 202 "SHA3" +// Revised 03-Sep-15 for portability + OpenSSL - style API + +#include "sha3-4way.h" + +// constants +static const uint64_t keccakf_rndc[24] = { + 0x0000000000000001, 0x0000000000008082, 0x800000000000808a, + 0x8000000080008000, 0x000000000000808b, 0x0000000080000001, + 0x8000000080008081, 0x8000000000008009, 0x000000000000008a, + 0x0000000000000088, 0x0000000080008009, 0x000000008000000a, + 0x000000008000808b, 0x800000000000008b, 0x8000000000008089, + 0x8000000000008003, 0x8000000000008002, 0x8000000000000080, + 0x000000000000800a, 0x800000008000000a, 0x8000000080008081, + 0x8000000000008080, 0x0000000080000001, 0x8000000080008008 + }; + +void sha3_4way_keccakf( __m256i st[25] ) +{ + int i, j, r; + __m256i t, bc[5]; + + for ( r = 0; r < KECCAKF_ROUNDS; r++ ) + { + // Theta + bc[0] = mm256_xor3( st[0], st[5], mm256_xor3( st[10], st[15], st[20] ) ); + bc[1] = mm256_xor3( st[1], st[6], mm256_xor3( st[11], st[16], st[21] ) ); + bc[2] = mm256_xor3( st[2], st[7], mm256_xor3( st[12], st[17], st[22] ) ); + bc[3] = mm256_xor3( st[3], st[8], mm256_xor3( st[13], st[18], st[23] ) ); + bc[4] = mm256_xor3( st[4], st[9], mm256_xor3( st[14], st[19], st[24] ) ); + + for ( i = 0; i < 5; i++ ) + { + t = _mm256_xor_si256( bc[ (i+4) % 5 ], + mm256_rol_64( bc[ (i+1) % 5 ], 1 ) ); + st[ i ] = _mm256_xor_si256( st[ i ], t ); + st[ i+5 ] = _mm256_xor_si256( st[ i+ 5 ], t ); + st[ i+10 ] = _mm256_xor_si256( st[ i+10 ], t ); + st[ i+15 ] = _mm256_xor_si256( st[ i+15 ], t ); + st[ i+20 ] = _mm256_xor_si256( st[ i+20 ], t ); + } + + // Rho Pi +#define RHO_PI( i, c ) \ + bc[0] = st[ i ]; \ + st[ i ] = mm256_rol_64( t, c ); \ + t = bc[0] + + t = st[1]; + + RHO_PI( 10, 1 ); + RHO_PI( 7, 3 ); + RHO_PI( 11, 6 ); + RHO_PI( 17, 10 ); + RHO_PI( 18, 15 ); + RHO_PI( 3, 21 ); + RHO_PI( 5, 28 ); + RHO_PI( 16, 36 ); + RHO_PI( 8, 45 ); + RHO_PI( 21, 55 ); + RHO_PI( 24, 2 ); + RHO_PI( 4, 14 ); + RHO_PI( 15, 27 ); + RHO_PI( 23, 41 ); + RHO_PI( 19, 56 ); + RHO_PI( 13, 8 ); + RHO_PI( 12, 25 ); + RHO_PI( 2, 43 ); + RHO_PI( 20, 62 ); + RHO_PI( 14, 18 ); + RHO_PI( 22, 39 ); + RHO_PI( 9, 61 ); + RHO_PI( 6, 20 ); + RHO_PI( 1, 44 ); + +#undef RHO_PI + + // Chi + for ( j = 0; j < 25; j += 5 ) + { + bc[0] = st[j]; + bc[1] = 
st[j+1]; + st[ j ] = mm256_xorandnot( st[ j ], st[j+1], st[j+2] ); + st[ j+1 ] = mm256_xorandnot( st[ j+1 ], st[j+2], st[j+3] ); + st[ j+2 ] = mm256_xorandnot( st[ j+2 ], st[j+3], st[j+4] ); + st[ j+3 ] = mm256_xorandnot( st[ j+3 ], st[j+4], bc[0] ); + st[ j+4 ] = mm256_xorandnot( st[ j+4 ], bc[0], bc[1] ); + } + + // Iota + st[0] = _mm256_xor_si256( st[0], + _mm256_set1_epi64x( keccakf_rndc[ r ] ) ); + } +} + +int sha3_4way_init( sha3_4way_ctx_t *c, int mdlen ) +{ + for ( int i = 0; i < 25; i++ ) c->st[ i ] = m256_zero; + c->mdlen = mdlen; + c->rsiz = 200 - 2 * mdlen; + c->pt = 0; + return 1; +} + +int sha3_4way_update( sha3_4way_ctx_t *c, const void *data, size_t len ) +{ + size_t i; + int j = c->pt; + const int rsiz = c->rsiz / 8; + const int l = len / 8; + + for ( i = 0; i < l; i++ ) + { + c->st[ j ] = _mm256_xor_si256( c->st[ j ], + ( (const __m256i*)data )[i] ); + j++; + if ( j >= rsiz ) + { + sha3_4way_keccakf( c->st ); + j = 0; + } + } + c->pt = j; + + return 1; +} + +int sha3_4way_final( void *md, sha3_4way_ctx_t *c ) +{ + c->st[ c->pt ] = _mm256_xor_si256( c->st[ c->pt ], + m256_const1_64( 6 ) ); + c->st[ c->rsiz / 8 - 1 ] = + _mm256_xor_si256( c->st[ c->rsiz / 8 - 1 ], + m256_const1_64( 0x8000000000000000 ) ); + sha3_4way_keccakf( c->st ); + memcpy( md, c->st, c->mdlen * 4 ); + return 1; +} + +void *sha3_4way( const void *in, size_t inlen, void *md, int mdlen ) +{ + sha3_4way_ctx_t ctx; + sha3_4way_init( &ctx, mdlen); + sha3_4way_update( &ctx, in, inlen ); + sha3_4way_final( md, &ctx ); + return md; +} + +#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) + +void sha3_8way_keccakf( __m512i st[25] ) +{ + int i, j, r; + __m512i t, bc[5]; + + // actual iteration + for ( r = 0; r < KECCAKF_ROUNDS; r++ ) + { + + // Theta + for ( i = 0; i < 5; i++ ) + bc[i] = _mm512_xor_si512( st[i], + mm512_xor4( st[ i+5 ], st[ i+10 ], st[ i+15 ], st[i+20 ] ) ); + + for ( i = 0; i < 5; i++ ) + { + t = _mm512_xor_si512( bc[(i + 4) % 5], + _mm512_rol_epi64( bc[(i + 1) % 5], 1 ) ); + for ( j = 0; j < 25; j += 5 ) + st[j + i] = _mm512_xor_si512( st[j + i], t ); + } + + // Rho Pi +#define RHO_PI( i, c ) \ + bc[0] = st[ i ]; \ + st[ i ] = _mm512_rol_epi64( t, c ); \ + t = bc[0] + + t = st[1]; + + RHO_PI( 10, 1 ); + RHO_PI( 7, 3 ); + RHO_PI( 11, 6 ); + RHO_PI( 17, 10 ); + RHO_PI( 18, 15 ); + RHO_PI( 3, 21 ); + RHO_PI( 5, 28 ); + RHO_PI( 16, 36 ); + RHO_PI( 8, 45 ); + RHO_PI( 21, 55 ); + RHO_PI( 24, 2 ); + RHO_PI( 4, 14 ); + RHO_PI( 15, 27 ); + RHO_PI( 23, 41 ); + RHO_PI( 19, 56 ); + RHO_PI( 13, 8 ); + RHO_PI( 12, 25 ); + RHO_PI( 2, 43 ); + RHO_PI( 20, 62 ); + RHO_PI( 14, 18 ); + RHO_PI( 22, 39 ); + RHO_PI( 9, 61 ); + RHO_PI( 6, 20 ); + RHO_PI( 1, 44 ); + +#undef RHO_PI + + // Chi + for ( j = 0; j < 25; j += 5 ) + { + for ( i = 0; i < 5; i++ ) + bc[i] = st[j + i]; + for ( i = 0; i < 5; i++ ) + st[ j+i ] = _mm512_xor_si512( st[ j+i ], _mm512_andnot_si512( + bc[ (i+1) % 5 ], bc[ (i+2) % 5 ] ) ); + } + + // Iota + st[0] = _mm512_xor_si512( st[0], _mm512_set1_epi64( keccakf_rndc[r] ) ); + } +} + +// Initialize the context for SHA3 + +int sha3_8way_init( sha3_8way_ctx_t *c, int mdlen ) +{ + for ( int i = 0; i < 25; i++ ) c->st[ i ] = m512_zero; + c->mdlen = mdlen; + c->rsiz = 200 - 2 * mdlen; + c->pt = 0; + return 1; +} + +// update state with more data + +int sha3_8way_update( sha3_8way_ctx_t *c, const void *data, size_t len ) +{ + size_t i; + int j = c->pt; + const int rsiz = c->rsiz / 8; + const int l = len / 8; + + for ( i = 0; i < l; i++ ) + { + c->st[ j ] = 
_mm512_xor_si512( c->st[ j ], + ( (const __m512i*)data )[i] ); + j++; + if ( j >= rsiz ) + { + sha3_8way_keccakf( c->st ); + j = 0; + } + } + c->pt = j; + + return 1; +} + +// finalize and output a hash + +int sha3_8way_final( void *md, sha3_8way_ctx_t *c ) +{ + c->st[ c->pt ] = + _mm512_xor_si512( c->st[ c->pt ], + m512_const1_64( 6 ) ); + c->st[ c->rsiz / 8 - 1 ] = + _mm512_xor_si512( c->st[ c->rsiz / 8 - 1 ], + m512_const1_64( 0x8000000000000000 ) ); + sha3_8way_keccakf( c->st ); + memcpy( md, c->st, c->mdlen * 8 ); + return 1; +} + +// compute a SHA-3 hash (md) of given byte length from "in" + +void *sha3_8way( const void *in, size_t inlen, void *md, int mdlen ) +{ + sha3_8way_ctx_t sha3; + sha3_8way_init( &sha3, mdlen); + sha3_8way_update( &sha3, in, inlen ); + sha3_8way_final( md, &sha3 ); + return md; +} + +#endif // AVX512 +#endif // AVX2 diff --git a/algo/verthash/tiny_sha3/sha3-4way.h b/algo/verthash/tiny_sha3/sha3-4way.h new file mode 100644 index 00000000..6723b73b --- /dev/null +++ b/algo/verthash/tiny_sha3/sha3-4way.h @@ -0,0 +1,67 @@ +// sha3.h +// 19-Nov-11 Markku-Juhani O. Saarinen +// 2021-03-27 JayDDee +// +#ifndef SHA3_4WAY_H +#define SHA3_4WAY_H + +#include +#include +#include "simd-utils.h" + +#if defined(__cplusplus) +extern "C" { +#endif + +#ifndef KECCAKF_ROUNDS +#define KECCAKF_ROUNDS 24 +#endif + +#if defined(__AVX2__) + +typedef struct +{ + __m256i st[25]; // 64-bit words * 4 lanes + int pt, rsiz, mdlen; // these don't overflow +} sha3_4way_ctx_t __attribute__ ((aligned (64)));; + +// Compression function. +void sha3_4way_keccakf( __m256i st[25] ); + +// OpenSSL - like interfece +int sha3_4way_init( sha3_4way_ctx_t *c, int mdlen ); // mdlen = hash output in bytes +int sha3_4way_update( sha3_4way_ctx_t *c, const void *data, size_t len ); +int sha3_4way_final( void *md, sha3_4way_ctx_t *c ); // digest goes to md + +// compute a sha3 hash (md) of given byte length from "in" +void *sha3_4way( const void *in, size_t inlen, void *md, int mdlen ); + + +#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) + +// state context +typedef struct +{ + __m512i st[25]; // 64-bit words * 8 lanes + int pt, rsiz, mdlen; // these don't overflow +} sha3_8way_ctx_t __attribute__ ((aligned (64)));; + +// Compression function. +void sha3_8way_keccakf( __m512i st[25] ); + +// OpenSSL - like interfece +int sha3_8way_init( sha3_8way_ctx_t *c, int mdlen ); // mdlen = hash output in bytes +int sha3_8way_update( sha3_8way_ctx_t *c, const void *data, size_t len ); +int sha3_8way_final( void *md, sha3_8way_ctx_t *c ); // digest goes to md + +// compute a sha3 hash (md) of given byte length from "in" +void *sha3_8way( const void *in, size_t inlen, void *md, int mdlen ); + +#endif // AVX512 +#endif // AVX2 + +#if defined(__cplusplus) +} +#endif + +#endif diff --git a/algo/verthash/tiny_sha3/sha3.c b/algo/verthash/tiny_sha3/sha3.c new file mode 100644 index 00000000..94b06602 --- /dev/null +++ b/algo/verthash/tiny_sha3/sha3.c @@ -0,0 +1,226 @@ +// sha3.c +// 19-Nov-11 Markku-Juhani O. 
Saarinen + +// Revised 07-Aug-15 to match with official release of FIPS PUB 202 "SHA3" +// Revised 03-Sep-15 for portability + OpenSSL - style API + +#include "sha3.h" +#include + +// update the state with given number of rounds + +void sha3_keccakf(uint64_t st[25]) +{ + // constants + const uint64_t keccakf_rndc[24] = { + 0x0000000000000001, 0x0000000000008082, 0x800000000000808a, + 0x8000000080008000, 0x000000000000808b, 0x0000000080000001, + 0x8000000080008081, 0x8000000000008009, 0x000000000000008a, + 0x0000000000000088, 0x0000000080008009, 0x000000008000000a, + 0x000000008000808b, 0x800000000000008b, 0x8000000000008089, + 0x8000000000008003, 0x8000000000008002, 0x8000000000000080, + 0x000000000000800a, 0x800000008000000a, 0x8000000080008081, + 0x8000000000008080, 0x0000000080000001, 0x8000000080008008 + }; +/* + const int keccakf_rotc[24] = { + 1, 3, 6, 10, 15, 21, 28, 36, 45, 55, 2, 14, + 27, 41, 56, 8, 25, 43, 62, 18, 39, 61, 20, 44 + }; + const int keccakf_piln[24] = { + 10, 7, 11, 17, 18, 3, 5, 16, 8, 21, 24, 4, + 15, 23, 19, 13, 12, 2, 20, 14, 22, 9, 6, 1 + }; +*/ + + // variables + int i, j, r; + uint64_t t, bc[5]; + +#if __BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__ + uint8_t *v; + + // endianess conversion. this is redundant on little-endian targets + for (i = 0; i < 25; i++) { + v = (uint8_t *) &st[i]; + st[i] = ((uint64_t) v[0]) | (((uint64_t) v[1]) << 8) | + (((uint64_t) v[2]) << 16) | (((uint64_t) v[3]) << 24) | + (((uint64_t) v[4]) << 32) | (((uint64_t) v[5]) << 40) | + (((uint64_t) v[6]) << 48) | (((uint64_t) v[7]) << 56); + } +#endif + + // actual iteration + for (r = 0; r < KECCAKF_ROUNDS; r++) { + + // Theta + for (i = 0; i < 5; i++) + bc[i] = st[i] ^ st[i + 5] ^ st[i + 10] ^ st[i + 15] ^ st[i + 20]; + + for (i = 0; i < 5; i++) { + t = bc[(i + 4) % 5] ^ ROTL64(bc[(i + 1) % 5], 1); + for (j = 0; j < 25; j += 5) + st[j + i] ^= t; + } + + + // Rho Pi +#define RHO_PI( i, c ) \ + bc[0] = st[ i ]; \ + st[ i ] = ROTL64( t, c ); \ + t = bc[0] + + t = st[1]; + + RHO_PI( 10, 1 ); + RHO_PI( 7, 3 ); + RHO_PI( 11, 6 ); + RHO_PI( 17, 10 ); + RHO_PI( 18, 15 ); + RHO_PI( 3, 21 ); + RHO_PI( 5, 28 ); + RHO_PI( 16, 36 ); + RHO_PI( 8, 45 ); + RHO_PI( 21, 55 ); + RHO_PI( 24, 2 ); + RHO_PI( 4, 14 ); + RHO_PI( 15, 27 ); + RHO_PI( 23, 41 ); + RHO_PI( 19, 56 ); + RHO_PI( 13, 8 ); + RHO_PI( 12, 25 ); + RHO_PI( 2, 43 ); + RHO_PI( 20, 62 ); + RHO_PI( 14, 18 ); + RHO_PI( 22, 39 ); + RHO_PI( 9, 61 ); + RHO_PI( 6, 20 ); + RHO_PI( 1, 44 ); + +#undef RHO_PI + +/* + for (i = 0; i < 24; i++) { + j = keccakf_piln[i]; + bc[0] = st[j]; + st[j] = ROTL64(t, keccakf_rotc[i]); + t = bc[0]; + } +*/ + + // Chi + for (j = 0; j < 25; j += 5) { + for (i = 0; i < 5; i++) + bc[i] = st[j + i]; + for (i = 0; i < 5; i++) + st[j + i] ^= (~bc[(i + 1) % 5]) & bc[(i + 2) % 5]; + } + + // Iota + st[0] ^= keccakf_rndc[r]; + } + +#if __BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__ + // endianess conversion. 
this is redundant on little-endian targets + for (i = 0; i < 25; i++) { + v = (uint8_t *) &st[i]; + t = st[i]; + v[0] = t & 0xFF; + v[1] = (t >> 8) & 0xFF; + v[2] = (t >> 16) & 0xFF; + v[3] = (t >> 24) & 0xFF; + v[4] = (t >> 32) & 0xFF; + v[5] = (t >> 40) & 0xFF; + v[6] = (t >> 48) & 0xFF; + v[7] = (t >> 56) & 0xFF; + } +#endif +} + +// Initialize the context for SHA3 + +int sha3_init(sha3_ctx_t *c, int mdlen) +{ + int i; + + for (i = 0; i < 25; i++) + c->st.q[i] = 0; + c->mdlen = mdlen; + c->rsiz = 200 - 2 * mdlen; + c->pt = 0; + + return 1; +} + +// update state with more data + +int sha3_update(sha3_ctx_t *c, const void *data, size_t len) +{ + size_t i; + int j = c->pt / 8; + const int rsiz = c->rsiz / 8; + const int l = len / 8; + + for ( i = 0; i < l; i++ ) + { + c->st.q[ j++ ] ^= ( ((const uint64_t *) data) [i] ); + if ( j >= rsiz ) + { + sha3_keccakf( c->st.q ); + j = 0; + } + } + c->pt = j*8; + + return 1; +} + +// finalize and output a hash + +int sha3_final(void *md, sha3_ctx_t *c) +{ + c->st.q[ c->pt / 8 ] ^= 6; + c->st.q[ c->rsiz / 8 - 1 ] ^= 0x8000000000000000; + sha3_keccakf(c->st.q); + memcpy( md, c->st.q, c->mdlen ); + return 1; +} + +// compute a SHA-3 hash (md) of given byte length from "in" + +void *sha3(const void *in, size_t inlen, void *md, int mdlen) +{ + sha3_ctx_t sha3; + sha3_init(&sha3, mdlen); + sha3_update(&sha3, in, inlen); + sha3_final(md, &sha3); + + return md; +} + +// SHAKE128 and SHAKE256 extensible-output functionality + +void shake_xof(sha3_ctx_t *c) +{ + c->st.b[c->pt] ^= 0x1F; + c->st.b[c->rsiz - 1] ^= 0x80; + sha3_keccakf(c->st.q); + c->pt = 0; +} + +void shake_out(sha3_ctx_t *c, void *out, size_t len) +{ + size_t i; + int j; + + j = c->pt; + for (i = 0; i < len; i++) { + if (j >= c->rsiz) { + sha3_keccakf(c->st.q); + j = 0; + } + ((uint8_t *) out)[i] = c->st.b[j++]; + } + c->pt = j; +} + diff --git a/algo/verthash/tiny_sha3/sha3.h b/algo/verthash/tiny_sha3/sha3.h new file mode 100644 index 00000000..2d7bf8d2 --- /dev/null +++ b/algo/verthash/tiny_sha3/sha3.h @@ -0,0 +1,55 @@ +// sha3.h +// 19-Nov-11 Markku-Juhani O. Saarinen + +#ifndef SHA3_H +#define SHA3_H + +#include +#include + + +#if defined(__cplusplus) +extern "C" { +#endif + +#ifndef KECCAKF_ROUNDS +#define KECCAKF_ROUNDS 24 +#endif + +#ifndef ROTL64 +#define ROTL64(x, y) (((x) << (y)) | ((x) >> (64 - (y)))) +#endif + +// state context +typedef struct { + union { // state: + uint8_t b[200]; // 8-bit bytes + uint64_t q[25]; // 64-bit words + } st; + int pt, rsiz, mdlen; // these don't overflow +} sha3_ctx_t; + +// Compression function. 
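/*
 * Minimal usage sketch of the OpenSSL-style interface declared in this
 * header, assuming a caller hashing an 80-byte block header with SHA3-512
 * (the buffer names are illustrative only):
 *
 *   uint8_t digest[64];
 *   sha3_ctx_t ctx;
 *   sha3_init( &ctx, 64 );              // 64-byte (512-bit) digest
 *   sha3_update( &ctx, header, 80 );    // absorb the message
 *   sha3_final( digest, &ctx );         // pad, permute, extract
 *
 * The one-shot helper sha3( header, 80, digest, 64 ) is equivalent.
 */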
+void sha3_keccakf(uint64_t st[25]); + +// OpenSSL - like interfece +int sha3_init(sha3_ctx_t *c, int mdlen); // mdlen = hash output in bytes +int sha3_update(sha3_ctx_t *c, const void *data, size_t len); +int sha3_final(void *md, sha3_ctx_t *c); // digest goes to md + +// compute a sha3 hash (md) of given byte length from "in" +void *sha3(const void *in, size_t inlen, void *md, int mdlen); + +// SHAKE128 and SHAKE256 extensible-output functions +#define shake128_init(c) sha3_init(c, 16) +#define shake256_init(c) sha3_init(c, 32) +#define shake_update sha3_update + +void shake_xof(sha3_ctx_t *c); +void shake_out(sha3_ctx_t *c, void *out, size_t len); + +#if defined(__cplusplus) +} +#endif + +#endif diff --git a/algo/verthash/verthash-gate.c b/algo/verthash/verthash-gate.c new file mode 100644 index 00000000..eeb2e5dd --- /dev/null +++ b/algo/verthash/verthash-gate.c @@ -0,0 +1,176 @@ +#include "algo-gate-api.h" +#include "algo/sha/sha256-hash.h" +#include "Verthash.h" +#include "tiny_sha3/sha3-4way.h" + +static verthash_info_t verthashInfo; + +// Verthash data file hash in bytes for verification +// 0x48aa21d7afededb63976d48a8ff8ec29d5b02563af4a1110b056cd43e83155a5 +static const uint8_t verthashDatFileHash_bytes[32] = +{ 0xa5, 0x55, 0x31, 0xe8, 0x43, 0xcd, 0x56, 0xb0, + 0x10, 0x11, 0x4a, 0xaf, 0x63, 0x25, 0xb0, 0xd5, + 0x29, 0xec, 0xf8, 0x8f, 0x8a, 0xd4, 0x76, 0x39, + 0xb6, 0xed, 0xed, 0xaf, 0xd7, 0x21, 0xaa, 0x48 }; + +#if defined(__AVX2__) + +static __thread sha3_4way_ctx_t sha3_mid_ctxA; +static __thread sha3_4way_ctx_t sha3_mid_ctxB; + +#else + +static __thread sha3_ctx_t sha3_mid_ctx[8]; + +#endif + +void verthash_sha3_512_prehash_72( const void *input ) +{ +#if defined(__AVX2__) + + __m256i vin[10]; + mm256_intrlv80_4x64( vin, input ); + + sha3_4way_init( &sha3_mid_ctxA, 64 ); + sha3_4way_init( &sha3_mid_ctxB, 64 ); + + vin[0] = _mm256_add_epi8( vin[0], _mm256_set_epi64x( 4,3,2,1 ) ); + sha3_4way_update( &sha3_mid_ctxA, vin, 72 ); + + vin[0] = _mm256_add_epi8( vin[0], _mm256_set1_epi64x( 4 ) ); + sha3_4way_update( &sha3_mid_ctxB, vin, 72 ); + +#else + + char in[80] __attribute__ ((aligned (64))); + memcpy( in, input, 80 ); + for ( int i = 0; i < 8; i++ ) + { + in[0] += 1; + sha3_init( &sha3_mid_ctx[i], 64 ); + sha3_update( &sha3_mid_ctx[i], in, 72 ); + } + +#endif +} + +void verthash_sha3_512_final_8( void *hash, const uint64_t nonce ) +{ +#if defined(__AVX2__) + + __m256i vhashA[ 10 ] __attribute__ ((aligned (64))); + __m256i vhashB[ 10 ] __attribute__ ((aligned (64))); + + sha3_4way_ctx_t ctx; + const __m256i vnonce = _mm256_set1_epi64x( nonce ); + + memcpy( &ctx, &sha3_mid_ctxA, sizeof ctx ); + sha3_4way_update( &ctx, &vnonce, 8 ); + sha3_4way_final( vhashA, &ctx ); + + memcpy( &ctx, &sha3_mid_ctxB, sizeof ctx ); + sha3_4way_update( &ctx, &vnonce, 8 ); + sha3_4way_final( vhashB, &ctx ); + + dintrlv_4x64( hash, hash+64, hash+128, hash+192, vhashA, 512 ); + dintrlv_4x64( hash+256, hash+320, hash+384, hash+448, vhashB, 512 ); + +#else + + for ( int i = 0; i < 8; i++ ) + { + sha3_ctx_t ctx; + memcpy( &ctx, &sha3_mid_ctx[i], sizeof ctx ); + sha3_update( &ctx, &nonce, 8 ); + sha3_final( hash + i*64, &ctx ); + } + +#endif +} + +int scanhash_verthash( struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ) +{ + uint32_t edata[20] __attribute__((aligned(64))); + uint32_t hash[8] __attribute__((aligned(64))); + uint32_t *pdata = work->data; + const uint32_t *ptarget = work->target; + const uint32_t first_nonce = pdata[19]; + const uint32_t last_nonce = 
max_nonce - 1; + uint32_t n = first_nonce; + const int thr_id = mythr->id; + const bool bench = opt_benchmark; + + mm128_bswap32_80( edata, pdata ); + verthash_sha3_512_prehash_72( edata ); + + do + { + edata[19] = n; + verthash_hash( verthashInfo.data, verthashInfo.dataSize, + edata, hash ); + if ( valid_hash( hash, ptarget ) && !bench ) + { + pdata[19] = bswap_32( n ); + submit_solution( work, hash, mythr ); + } + n++; + } while ( n < last_nonce && !work_restart[thr_id].restart ); + *hashes_done = n - first_nonce; + pdata[19] = n; + return 0; +} + +static const char *default_verthash_data_file = "verthash.dat"; + +bool register_verthash_algo( algo_gate_t* gate ) +{ + opt_target_factor = 256.0; + gate->scanhash = (void*)&scanhash_verthash; + gate->optimizations = SSE42_OPT | AVX2_OPT; + + const char *verthash_data_file = opt_data_file ? opt_data_file + : default_verthash_data_file; + + int vhLoadResult = verthash_info_init( &verthashInfo, verthash_data_file ); + if (vhLoadResult == 0) // No Error + { + if ( opt_verify ) + { + uint8_t vhDataFileHash[32] = { 0 }; + + applog( LOG_NOTICE, "Verifying Verthash data" ); + sha256_full( vhDataFileHash, verthashInfo.data, + verthashInfo.dataSize ); + if ( memcmp( vhDataFileHash, verthashDatFileHash_bytes, + sizeof(verthashDatFileHash_bytes) ) == 0 ) + applog( LOG_NOTICE, "Verthash data has been verified" ); + else + { + applog( LOG_ERR, "Verthash data verification has failed" ); + return false; + } + } + } + else + { + // Handle Verthash error codes + if ( vhLoadResult == 1 ) + { + applog( LOG_ERR, "Verthash data file not found: %s", + verthash_data_file ); + if ( !opt_data_file ) + applog( LOG_NOTICE, "Add '--verify' to create verthash.dat"); + } + else if ( vhLoadResult == 2 ) + applog( LOG_ERR, "Failed to allocate memory for Verthash data" ); +// else // for debugging purposes +// applog( LOG_ERR, "Verthash data initialization unknown error code: %d", +// vhLoadResult ); + return false; + } + + printf("\n"); + return true; +} + diff --git a/algo/whirlpool/whirlpool.c b/algo/whirlpool/whirlpool.c index 59fcf71c..1c6b6883 100644 --- a/algo/whirlpool/whirlpool.c +++ b/algo/whirlpool/whirlpool.c @@ -82,7 +82,7 @@ int scanhash_whirlpool( struct work* work, uint32_t max_nonce, be32enc(&endiandata[19], n ); whirlpool_hash(vhash, endiandata); - if (vhash[7] <= Htarg && fulltest(vhash, ptarget)) + if (vhash[7] <= Htarg && fulltest(vhash, ptarget) & ! 
opt_benchmark ) submit_solution( work, vhash, mythr ); } while ( n < max_nonce && !work_restart[thr_id].restart); diff --git a/algo/x11/0x10-gate.c b/algo/x11/0x10-gate.c deleted file mode 100644 index 3463f5f0..00000000 --- a/algo/x11/0x10-gate.c +++ /dev/null @@ -1,21 +0,0 @@ -#include "0x10-gate.h" - -bool register_0x10_algo( algo_gate_t *gate ) -{ -#if defined (0X10_8WAY) - init_0x10_8way_ctx(); - gate->scanhash = (void*)&scanhash_0x10_8way; - gate->hash = (void*)&0x10_8way_hash; -#elif defined (0X10_4WAY) - init_0x10_4way_ctx(); - gate->scanhash = (void*)&scanhash_0x10_4way; - gate->hash = (void*)&0x10_4way_hash; -#else - init_0x10_ctx(); - gate->scanhash = (void*)&scanhash_0x10; - gate->hash = (void*)&0x10_hash; -#endif - gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT ; - return true; -}; - diff --git a/algo/x11/0x10-gate.h b/algo/x11/0x10-gate.h deleted file mode 100644 index 1836977b..00000000 --- a/algo/x11/0x10-gate.h +++ /dev/null @@ -1,38 +0,0 @@ -#ifndef 0X10_GATE_H__ -#define 0X10_GATE_H__ 1 - -#include "algo-gate-api.h" -#include - -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) - #define 0X10_8WAY 1 -#elif defined(__AVX2__) && defined(__AES__) - #define 0X10_4WAY 1 -#endif - -bool register_0x10_algo( algo_gate_t* gate ); -#if defined(0X10_8WAY) - -void 0x10_8way_hash( void *state, const void *input ); -int scanhash_0x10_8way( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ); -void init_0x10_8way_ctx(); - -#elif defined(0X10_4WAY) - -void 0x10_4way_hash( void *state, const void *input ); -int scanhash_0x10_4way( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ); -void init_0x10_4way_ctx(); - -#else - -void 0x10_hash( void *state, const void *input ); -int scanhash_0x10( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ); -void init_0x10_ctx(); - -#endif - -#endif - diff --git a/algo/x11/0x10-4way.c b/algo/x11/hash0x10-4way.c similarity index 76% rename from algo/x11/0x10-4way.c rename to algo/x11/hash0x10-4way.c index ac0208b1..c49dac69 100644 --- a/algo/x11/0x10-4way.c +++ b/algo/x11/hash0x10-4way.c @@ -1,5 +1,5 @@ #include "cpuminer-config.h" -#include "0x10-gate.h" +#include "hash0x10-gate.h" #include #include #include "algo/blake/blake-hash-4way.h" @@ -20,7 +20,7 @@ #include "algo/echo/echo-hash-4way.h" #endif -#if defined (0X10_8WAY) +#if defined (HASH0X10_8WAY) typedef struct { blake512_8way_context blake; @@ -40,32 +40,32 @@ typedef struct { sph_shavite512_context shavite; hashState_echo echo; #endif -} 0x10_8way_ctx_holder; +} hash0x10_8way_ctx_holder; -0x10_8way_ctx_holder 0x10_8way_ctx; +hash0x10_8way_ctx_holder hash0x10_8way_ctx; -void init_0x10_8way_ctx() +void init_hash0x10_8way_ctx() { - blake512_8way_init( &0x10_8way_ctx.blake ); - bmw512_8way_init( &0x10_8way_ctx.bmw ); - skein512_8way_init( &0x10_8way_ctx.skein ); - jh512_8way_init( &0x10_8way_ctx.jh ); - keccak512_8way_init( &0x10_8way_ctx.keccak ); - luffa_4way_init( &0x10_8way_ctx.luffa, 512 ); - cube_4way_init( &0x10_8way_ctx.cube, 512, 16, 32 ); - simd_4way_init( &0x10_8way_ctx.simd, 512 ); + blake512_8way_init( &hash0x10_8way_ctx.blake ); + bmw512_8way_init( &hash0x10_8way_ctx.bmw ); + skein512_8way_init( &hash0x10_8way_ctx.skein ); + jh512_8way_init( &hash0x10_8way_ctx.jh ); + keccak512_8way_init( &hash0x10_8way_ctx.keccak ); + luffa_4way_init( &hash0x10_8way_ctx.luffa, 512 ); + cube_4way_init( 
&hash0x10_8way_ctx.cube, 512, 16, 32 ); + simd_4way_init( &hash0x10_8way_ctx.simd, 512 ); #if defined(__VAES__) - groestl512_4way_init( &0x10_8way_ctx.groestl, 64 ); - shavite512_4way_init( &0x10_8way_ctx.shavite ); - echo_4way_init( &0x10_8way_ctx.echo, 512 ); + groestl512_4way_init( &hash0x10_8way_ctx.groestl, 64 ); + shavite512_4way_init( &hash0x10_8way_ctx.shavite ); + echo_4way_init( &hash0x10_8way_ctx.echo, 512 ); #else - init_groestl( &0x10_8way_ctx.groestl, 64 ); - sph_shavite512_init( &0x10_8way_ctx.shavite ); - init_echo( &0x10_8way_ctx.echo, 512 ); + init_groestl( &hash0x10_8way_ctx.groestl, 64 ); + sph_shavite512_init( &hash0x10_8way_ctx.shavite ); + init_echo( &hash0x10_8way_ctx.echo, 512 ); #endif } -void 0x10_8way_hash( void *state, const void *input ) +void hash0x10_8way_hash( void *state, const void *input ) { uint64_t vhash[8*8] __attribute__ ((aligned (128))); uint64_t vhashA[4*8] __attribute__ ((aligned (64))); @@ -78,8 +78,8 @@ void 0x10_8way_hash( void *state, const void *input ) uint64_t hash5[8] __attribute__ ((aligned (64))); uint64_t hash6[8] __attribute__ ((aligned (64))); uint64_t hash7[8] __attribute__ ((aligned (64))); - 0x10_8way_ctx_holder ctx; - memcpy( &ctx, &0x10_8way_ctx, sizeof(0x10_8way_ctx) ); + hash0x10_8way_ctx_holder ctx; + memcpy( &ctx, &hash0x10_8way_ctx, sizeof(hash0x10_8way_ctx) ); blake512_8way_update( &ctx.blake, input, 80 ); blake512_8way_close( &ctx.blake, vhash ); @@ -105,25 +105,25 @@ void 0x10_8way_hash( void *state, const void *input ) vhash ); update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 ); - memcpy( &ctx.groestl, &0x10_8way_ctx.groestl, + memcpy( &ctx.groestl, &hash0x10_8way_ctx.groestl, sizeof(hashState_groestl) ); update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 ); - memcpy( &ctx.groestl, &0x10_8way_ctx.groestl, + memcpy( &ctx.groestl, &hash0x10_8way_ctx.groestl, sizeof(hashState_groestl) ); update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 ); - memcpy( &ctx.groestl, &0x10_8way_ctx.groestl, + memcpy( &ctx.groestl, &hash0x10_8way_ctx.groestl, sizeof(hashState_groestl) ); update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 ); - memcpy( &ctx.groestl, &0x10_8way_ctx.groestl, + memcpy( &ctx.groestl, &hash0x10_8way_ctx.groestl, sizeof(hashState_groestl) ); update_and_final_groestl( &ctx.groestl, (char*)hash4, (char*)hash4, 512 ); - memcpy( &ctx.groestl, &0x10_8way_ctx.groestl, + memcpy( &ctx.groestl, &hash0x10_8way_ctx.groestl, sizeof(hashState_groestl) ); update_and_final_groestl( &ctx.groestl, (char*)hash5, (char*)hash5, 512 ); - memcpy( &ctx.groestl, &0x10_8way_ctx.groestl, + memcpy( &ctx.groestl, &hash0x10_8way_ctx.groestl, sizeof(hashState_groestl) ); update_and_final_groestl( &ctx.groestl, (char*)hash6, (char*)hash6, 512 ); - memcpy( &ctx.groestl, &0x10_8way_ctx.groestl, + memcpy( &ctx.groestl, &hash0x10_8way_ctx.groestl, sizeof(hashState_groestl) ); update_and_final_groestl( &ctx.groestl, (char*)hash7, (char*)hash7, 512 ); @@ -165,31 +165,31 @@ void 0x10_8way_hash( void *state, const void *input ) sph_shavite512( &ctx.shavite, hash0, 64 ); sph_shavite512_close( &ctx.shavite, hash0 ); - memcpy( &ctx.shavite, &0x10_8way_ctx.shavite, + memcpy( &ctx.shavite, &hash0x10_8way_ctx.shavite, sizeof(sph_shavite512_context) ); sph_shavite512( &ctx.shavite, hash1, 64 ); sph_shavite512_close( &ctx.shavite, hash1 ); - memcpy( &ctx.shavite, &0x10_8way_ctx.shavite, + memcpy( &ctx.shavite, &hash0x10_8way_ctx.shavite, sizeof(sph_shavite512_context) ); 
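// Shavite has no 8-way SIMD path without VAES, so each lane is hashed
// serially; between lanes the working context is restored from the
// statically initialized hash0x10_8way_ctx copy instead of calling
// sph_shavite512_init() again.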
sph_shavite512( &ctx.shavite, hash2, 64 ); sph_shavite512_close( &ctx.shavite, hash2 ); - memcpy( &ctx.shavite, &0x10_8way_ctx.shavite, + memcpy( &ctx.shavite, &hash0x10_8way_ctx.shavite, sizeof(sph_shavite512_context) ); sph_shavite512( &ctx.shavite, hash3, 64 ); sph_shavite512_close( &ctx.shavite, hash3 ); - memcpy( &ctx.shavite, &0x10_8way_ctx.shavite, + memcpy( &ctx.shavite, &hash0x10_8way_ctx.shavite, sizeof(sph_shavite512_context) ); sph_shavite512( &ctx.shavite, hash4, 64 ); sph_shavite512_close( &ctx.shavite, hash4 ); - memcpy( &ctx.shavite, &0x10_8way_ctx.shavite, + memcpy( &ctx.shavite, &hash0x10_8way_ctx.shavite, sizeof(sph_shavite512_context) ); sph_shavite512( &ctx.shavite, hash5, 64 ); sph_shavite512_close( &ctx.shavite, hash5 ); - memcpy( &ctx.shavite, &0x10_8way_ctx.shavite, + memcpy( &ctx.shavite, &hash0x10_8way_ctx.shavite, sizeof(sph_shavite512_context) ); sph_shavite512( &ctx.shavite, hash6, 64 ); sph_shavite512_close( &ctx.shavite, hash6 ); - memcpy( &ctx.shavite, &0x10_8way_ctx.shavite, + memcpy( &ctx.shavite, &hash0x10_8way_ctx.shavite, sizeof(sph_shavite512_context) ); sph_shavite512( &ctx.shavite, hash7, 64 ); sph_shavite512_close( &ctx.shavite, hash7 ); @@ -215,25 +215,25 @@ void 0x10_8way_hash( void *state, const void *input ) update_final_echo( &ctx.echo, (BitSequence *)hash0, (const BitSequence *) hash0, 512 ); - memcpy( &ctx.echo, &0x10_8way_ctx.echo, sizeof(hashState_echo) ); + memcpy( &ctx.echo, &hash0x10_8way_ctx.echo, sizeof(hashState_echo) ); update_final_echo( &ctx.echo, (BitSequence *)hash1, (const BitSequence *) hash1, 512 ); - memcpy( &ctx.echo, &0x10_8way_ctx.echo, sizeof(hashState_echo) ); + memcpy( &ctx.echo, &hash0x10_8way_ctx.echo, sizeof(hashState_echo) ); update_final_echo( &ctx.echo, (BitSequence *)hash2, (const BitSequence *) hash2, 512 ); - memcpy( &ctx.echo, &0x10_8way_ctx.echo, sizeof(hashState_echo) ); + memcpy( &ctx.echo, &hash0x10_8way_ctx.echo, sizeof(hashState_echo) ); update_final_echo( &ctx.echo, (BitSequence *)hash3, (const BitSequence *) hash3, 512 ); - memcpy( &ctx.echo, &0x10_8way_ctx.echo, sizeof(hashState_echo) ); + memcpy( &ctx.echo, &hash0x10_8way_ctx.echo, sizeof(hashState_echo) ); update_final_echo( &ctx.echo, (BitSequence *)hash4, (const BitSequence *) hash4, 512 ); - memcpy( &ctx.echo, &0x10_8way_ctx.echo, sizeof(hashState_echo) ); + memcpy( &ctx.echo, &hash0x10_8way_ctx.echo, sizeof(hashState_echo) ); update_final_echo( &ctx.echo, (BitSequence *)hash5, (const BitSequence *) hash5, 512 ); - memcpy( &ctx.echo, &0x10_8way_ctx.echo, sizeof(hashState_echo) ); + memcpy( &ctx.echo, &hash0x10_8way_ctx.echo, sizeof(hashState_echo) ); update_final_echo( &ctx.echo, (BitSequence *)hash6, (const BitSequence *) hash6, 512 ); - memcpy( &ctx.echo, &0x10_8way_ctx.echo, sizeof(hashState_echo) ); + memcpy( &ctx.echo, &hash0x10_8way_ctx.echo, sizeof(hashState_echo) ); update_final_echo( &ctx.echo, (BitSequence *)hash7, (const BitSequence *) hash7, 512 ); @@ -249,7 +249,7 @@ void 0x10_8way_hash( void *state, const void *input ) memcpy( state+224, hash7, 32 ); } -int scanhash_0x10_8way( struct work *work, uint32_t max_nonce, +int scanhash_hash0x10_8way( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ) { uint32_t hash[8*8] __attribute__ ((aligned (128))); @@ -271,7 +271,7 @@ int scanhash_0x10_8way( struct work *work, uint32_t max_nonce, _mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0, n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev ); - 0x10_8way_hash( hash, vdata ); + hash0x10_8way_hash( hash, vdata ); 
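// One call hashes 8 nonces in parallel; the loop below checks each
// lane's result against the target.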
pdata[19] = n; for ( int i = 0; i < 8; i++ ) @@ -288,7 +288,7 @@ int scanhash_0x10_8way( struct work *work, uint32_t max_nonce, } -#elif defined (0X10_4WAY) +#elif defined (HASHhash0x10_4WAY) typedef struct { @@ -303,26 +303,26 @@ typedef struct { sph_shavite512_context shavite; simd_2way_context simd; hashState_echo echo; -} 0x10_4way_ctx_holder; +} hash0x10_4way_ctx_holder; -0x10_4way_ctx_holder 0x10_4way_ctx; +hash0x10_4way_ctx_holder hash0x10_4way_ctx; -void init_0x10_4way_ctx() +void init_hash0x10_4way_ctx() { - blake512_4way_init( &0x10_4way_ctx.blake ); - bmw512_4way_init( &0x10_4way_ctx.bmw ); - init_groestl( &0x10_4way_ctx.groestl, 64 ); - skein512_4way_init( &0x10_4way_ctx.skein ); - jh512_4way_init( &0x10_4way_ctx.jh ); - keccak512_4way_init( &0x10_4way_ctx.keccak ); - luffa_2way_init( &0x10_4way_ctx.luffa, 512 ); - cubehashInit( &0x10_4way_ctx.cube, 512, 16, 32 ); - sph_shavite512_init( &0x10_4way_ctx.shavite ); - simd_2way_init( &0x10_4way_ctx.simd, 512 ); - init_echo( &0x10_4way_ctx.echo, 512 ); + blake512_4way_init( &hash0x10_4way_ctx.blake ); + bmw512_4way_init( &hash0x10_4way_ctx.bmw ); + init_groestl( &hash0x10_4way_ctx.groestl, 64 ); + skein512_4way_init( &hash0x10_4way_ctx.skein ); + jh512_4way_init( &hash0x10_4way_ctx.jh ); + keccak512_4way_init( &hash0x10_4way_ctx.keccak ); + luffa_2way_init( &hash0x10_4way_ctx.luffa, 512 ); + cubehashInit( &hash0x10_4way_ctx.cube, 512, 16, 32 ); + sph_shavite512_init( &hash0x10_4way_ctx.shavite ); + simd_2way_init( &hash0x10_4way_ctx.simd, 512 ); + init_echo( &hash0x10_4way_ctx.echo, 512 ); } -void 0x10_4way_hash( void *state, const void *input ) +void hash0x10_4way_hash( void *state, const void *input ) { uint64_t hash0[8] __attribute__ ((aligned (64))); uint64_t hash1[8] __attribute__ ((aligned (64))); @@ -331,8 +331,8 @@ void 0x10_4way_hash( void *state, const void *input ) uint64_t vhash[8*4] __attribute__ ((aligned (64))); uint64_t vhashB[8*2] __attribute__ ((aligned (64))); - 0x10_4way_ctx_holder ctx; - memcpy( &ctx, &0x10_4way_ctx, sizeof(0x10_4way_ctx) ); + hash0x10_4way_ctx_holder ctx; + memcpy( &ctx, &hash0x10_4way_ctx, sizeof(hash0x10_4way_ctx) ); // 1 Blake 4way blake512_4way_update( &ctx.blake, input, 80 ); @@ -351,11 +351,11 @@ void 0x10_4way_hash( void *state, const void *input ) // 4 Groestl update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 ); - memcpy( &ctx.groestl, &0x10_4way_ctx.groestl, sizeof(hashState_groestl) ); + memcpy( &ctx.groestl, &hash0x10_4way_ctx.groestl, sizeof(hashState_groestl) ); update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 ); - memcpy( &ctx.groestl, &0x10_4way_ctx.groestl, sizeof(hashState_groestl) ); + memcpy( &ctx.groestl, &hash0x10_4way_ctx.groestl, sizeof(hashState_groestl) ); update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 ); - memcpy( &ctx.groestl, &0x10_4way_ctx.groestl, sizeof(hashState_groestl) ); + memcpy( &ctx.groestl, &hash0x10_4way_ctx.groestl, sizeof(hashState_groestl) ); update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 ); // 4way @@ -382,11 +382,11 @@ void 0x10_4way_hash( void *state, const void *input ) // 8 Cubehash cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, 64 ); - memcpy( &ctx.cube, &0x10_4way_ctx.cube, sizeof(cubehashParam) ); + memcpy( &ctx.cube, &hash0x10_4way_ctx.cube, sizeof(cubehashParam) ); cubehashUpdateDigest( &ctx.cube, (byte*)hash1, (const byte*) hash1, 64 ); - memcpy( &ctx.cube, &0x10_4way_ctx.cube, sizeof(cubehashParam) ); + memcpy( &ctx.cube, 
&hash0x10_4way_ctx.cube, sizeof(cubehashParam) ); cubehashUpdateDigest( &ctx.cube, (byte*)hash2, (const byte*) hash2, 64 ); - memcpy( &ctx.cube, &0x10_4way_ctx.cube, sizeof(cubehashParam) ); + memcpy( &ctx.cube, &hash0x10_4way_ctx.cube, sizeof(cubehashParam) ); cubehashUpdateDigest( &ctx.cube, (byte*)hash3, (const byte*) hash3, 64 ); // 9 Simd @@ -401,15 +401,15 @@ void 0x10_4way_hash( void *state, const void *input ) // 10 Shavite sph_shavite512( &ctx.shavite, hash0, 64 ); sph_shavite512_close( &ctx.shavite, hash0 ); - memcpy( &ctx.shavite, &0x10_4way_ctx.shavite, + memcpy( &ctx.shavite, &hash0x10_4way_ctx.shavite, sizeof(sph_shavite512_context) ); sph_shavite512( &ctx.shavite, hash1, 64 ); sph_shavite512_close( &ctx.shavite, hash1 ); - memcpy( &ctx.shavite, &0x10_4way_ctx.shavite, + memcpy( &ctx.shavite, &hash0x10_4way_ctx.shavite, sizeof(sph_shavite512_context) ); sph_shavite512( &ctx.shavite, hash2, 64 ); sph_shavite512_close( &ctx.shavite, hash2 ); - memcpy( &ctx.shavite, &0x10_4way_ctx.shavite, + memcpy( &ctx.shavite, &hash0x10_4way_ctx.shavite, sizeof(sph_shavite512_context) ); sph_shavite512( &ctx.shavite, hash3, 64 ); sph_shavite512_close( &ctx.shavite, hash3 ); @@ -417,13 +417,13 @@ void 0x10_4way_hash( void *state, const void *input ) // 11 Echo update_final_echo( &ctx.echo, (BitSequence *)hash0, (const BitSequence *) hash0, 512 ); - memcpy( &ctx.echo, &0x10_4way_ctx.echo, sizeof(hashState_echo) ); + memcpy( &ctx.echo, &hash0x10_4way_ctx.echo, sizeof(hashState_echo) ); update_final_echo( &ctx.echo, (BitSequence *)hash1, (const BitSequence *) hash1, 512 ); - memcpy( &ctx.echo, &0x10_4way_ctx.echo, sizeof(hashState_echo) ); + memcpy( &ctx.echo, &hash0x10_4way_ctx.echo, sizeof(hashState_echo) ); update_final_echo( &ctx.echo, (BitSequence *)hash2, (const BitSequence *) hash2, 512 ); - memcpy( &ctx.echo, &0x10_4way_ctx.echo, sizeof(hashState_echo) ); + memcpy( &ctx.echo, &hash0x10_4way_ctx.echo, sizeof(hashState_echo) ); update_final_echo( &ctx.echo, (BitSequence *)hash3, (const BitSequence *) hash3, 512 ); @@ -433,7 +433,7 @@ void 0x10_4way_hash( void *state, const void *input ) memcpy( state+96, hash3, 32 ); } -int scanhash_0x10_4way( struct work *work, uint32_t max_nonce, +int scanhash_hash0x10_4way( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ) { uint32_t hash[4*8] __attribute__ ((aligned (64))); @@ -446,7 +446,7 @@ int scanhash_0x10_4way( struct work *work, uint32_t max_nonce, __m256i *noncev = (__m256i*)vdata + 9; // aligned const uint32_t Htarg = ptarget[7]; uint64_t htmax[] = { 0, 0xF, 0xFF, - 0xFFF, 0xFFFF, 0x10000000 }; + 0xFFF, 0xFFFF, hash0x10000000 }; uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00, 0xFFFFF000, 0xFFFF0000, 0 }; @@ -461,7 +461,7 @@ int scanhash_0x10_4way( struct work *work, uint32_t max_nonce, *noncev = mm256_intrlv_blend_32( mm256_bswap_32( _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev ); - 0x10_4way_hash( hash, vdata ); + hash0x10_4way_hash( hash, vdata ); pdata[19] = n; for ( int i = 0; i < 4; i++ ) @@ -480,4 +480,4 @@ int scanhash_0x10_4way( struct work *work, uint32_t max_nonce, return 0; } -#endif +#endif \ No newline at end of file diff --git a/algo/x11/hash0x10-gate.c b/algo/x11/hash0x10-gate.c new file mode 100644 index 00000000..ca822637 --- /dev/null +++ b/algo/x11/hash0x10-gate.c @@ -0,0 +1,20 @@ +#include "hash0x10-gate.h" + +bool register_hash0x10_algo( algo_gate_t *gate ) +{ +#if defined (HASH0X10_8WAY) + init_hash0x10_8way_ctx(); + gate->scanhash = 
(void*)&scanhash_hash0x10_8way; + gate->hash = (void*)&hash0x10_8way_hash; +#elif defined (HASH0X10_4WAY) + init_hash0x10_4way_ctx(); + gate->scanhash = (void*)&scanhash_hash0x10_4way; + gate->hash = (void*)&hash0x10_4way_hash; +#else + init_hash0x10_ctx(); + gate->scanhash = (void*)&scanhash_hash0x10; + gate->hash = (void*)&hash0x10_hash; +#endif + gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT ; + return true; +}; diff --git a/algo/x11/hash0x10-gate.h b/algo/x11/hash0x10-gate.h new file mode 100644 index 00000000..7c5b425b --- /dev/null +++ b/algo/x11/hash0x10-gate.h @@ -0,0 +1,37 @@ +#ifndef HASH0X10_GATE_H__ +#define HASH0X10_GATE_H__ 1 + +#include "algo-gate-api.h" +#include + +#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) + #define HASH0X10_8WAY 1 +#elif defined(__AVX2__) && defined(__AES__) + #define HASH0X10_4WAY 1 +#endif + +bool register_hash0x10_algo( algo_gate_t* gate ); +#if defined(HASH0X10_8WAY) + +void hash0x10_8way_hash( void *state, const void *input ); +int scanhash_hash0x10_8way( struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ); +void init_hash0x10_8way_ctx(); + +#elif defined(HASH0X10_4WAY) + +void hash0x10_4way_hash( void *state, const void *input ); +int scanhash_hash0x10_4way( struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ); +void init_hash0x10_4way_ctx(); + +#else + +void hash0x10_hash( void *state, const void *input ); +int scanhash_hash0x10( struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ); +void init_hash0x10_ctx(); + +#endif + +#endif diff --git a/algo/x11/0x10.c b/algo/x11/hash0x10.c similarity index 80% rename from algo/x11/0x10.c rename to algo/x11/hash0x10.c index 1c3744d8..7418abb3 100644 --- a/algo/x11/0x10.c +++ b/algo/x11/hash0x10.c @@ -1,7 +1,7 @@ #include "cpuminer-config.h" -#include "0x10-gate.h" +#include "hash0x10-gate.h" -#if !defined(0X10_8WAY) && !defined(0X10_4WAY) +#if !defined(HASH0X10_8WAY) && !defined(HASH0X10_4WAY) #include #include @@ -42,35 +42,35 @@ typedef struct { cubehashParam cube; sph_shavite512_context shavite; hashState_sd simd; -} 0x10_ctx_holder; +} hash0x10_ctx_holder; -0x10_ctx_holder 0x10_ctx; +hash0x10_ctx_holder hash0x10_ctx; -void init_0x10_ctx() +void init_hash0x10_ctx() { - sph_blake512_init( &0x10_ctx.blake ); - sph_bmw512_init( &0x10_ctx.bmw ); + sph_blake512_init( &hash0x10_ctx.blake ); + sph_bmw512_init( &hash0x10_ctx.bmw ); #if defined(__AES__) - init_groestl( &0x10_ctx.groestl, 64 ); - init_echo( &0x10_ctx.echo, 512 ); + init_groestl( &hash0x10_ctx.groestl, 64 ); + init_echo( &hash0x10_ctx.echo, 512 ); #else - sph_groestl512_init( &0x10_ctx.groestl ); - sph_echo512_init( &0x10_ctx.echo ); + sph_groestl512_init( &hash0x10_ctx.groestl ); + sph_echo512_init( &hash0x10_ctx.echo ); #endif - sph_skein512_init( &0x10_ctx.skein ); - sph_jh512_init( &0x10_ctx.jh ); - sph_keccak512_init( &0x10_ctx.keccak ); - init_luffa( &0x10_ctx.luffa, 512 ); - cubehashInit( &0x10_ctx.cube, 512, 16, 32 ); - sph_shavite512_init( &0x10_ctx.shavite ); - init_sd( &0x10_ctx.simd, 512 ); + sph_skein512_init( &hash0x10_ctx.skein ); + sph_jh512_init( &hash0x10_ctx.jh ); + sph_keccak512_init( &hash0x10_ctx.keccak ); + init_luffa( &hash0x10_ctx.luffa, 512 ); + cubehashInit( &hash0x10_ctx.cube, 512, 16, 32 ); + sph_shavite512_init( &hash0x10_ctx.shavite ); + init_sd( &hash0x10_ctx.simd, 512 ); } -void 0x10_hash( void *state, const void *input ) 
+void hash0x10_hash( void *state, const void *input ) { unsigned char hash[64] __attribute__((aligned(64))); - 0x10_ctx_holder ctx; - memcpy( &ctx, &0x10_ctx, sizeof(0x10_ctx) ); + hash0x10_ctx_holder ctx; + memcpy( &ctx, &hash0x10_ctx, sizeof(hash0x10_ctx) ); sph_blake512( &ctx.blake, input, 80 ); sph_blake512_close( &ctx.blake, hash ); @@ -121,7 +121,7 @@ void 0x10_hash( void *state, const void *input ) memcpy( state, hash, 32 ); } -int scanhash_0x10( struct work *work, uint32_t max_nonce, +int scanhash_hash0x10( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ) { uint32_t endiandata[20] __attribute__((aligned(64))); @@ -160,7 +160,7 @@ int scanhash_0x10( struct work *work, uint32_t max_nonce, { pdata[19] = ++n; be32enc( &endiandata[19], n ); - 0x10_hash( hash64, &endiandata ); + hash0x10_hash( hash64, &endiandata ); if ( ( hash64[7] & mask ) == 0 ) { if ( fulltest( hash64, ptarget ) ) @@ -173,4 +173,4 @@ int scanhash_0x10( struct work *work, uint32_t max_nonce, pdata[19] = n; return 0; } -#endif +#endif \ No newline at end of file diff --git a/algo/x16/x16r-4way.c b/algo/x16/x16r-4way.c index 4d12029d..39efd257 100644 --- a/algo/x16/x16r-4way.c +++ b/algo/x16/x16r-4way.c @@ -16,8 +16,7 @@ #if defined (X16R_8WAY) -// Perform midstate prehash of hash functions with block size <= 64 bytes -// and interleave 4x64 before nonce insertion for final hash. +// Perform midstate prehash of hash functions with block size <= 72 bytes. void x16r_8way_prehash( void *vdata, void *pdata ) { @@ -34,6 +33,11 @@ void x16r_8way_prehash( void *vdata, void *pdata ) jh512_8way_init( &x16r_ctx.jh ); jh512_8way_update( &x16r_ctx.jh, vdata, 64 ); break; + case KECCAK: + mm512_bswap32_intrlv80_8x64( vdata, pdata ); + keccak512_8way_init( &x16r_ctx.keccak ); + keccak512_8way_update( &x16r_ctx.keccak, vdata, 72 ); + break; case SKEIN: mm512_bswap32_intrlv80_8x64( vdata, pdata ); skein512_8way_init( &x16r_ctx.skein ); @@ -48,15 +52,22 @@ void x16r_8way_prehash( void *vdata, void *pdata ) break; case CUBEHASH: mm128_bswap32_80( edata, pdata ); - cubehashInit( &x16r_ctx.cube, 512, 16, 32 ); - cubehashUpdate( &x16r_ctx.cube, (const byte*)edata, 64 ); - intrlv_8x64( vdata, edata, edata, edata, edata, - edata, edata, edata, edata, 640 ); + intrlv_4x128( vdata2, edata, edata, edata, edata, 640 ); + cube_4way_init( &x16r_ctx.cube, 512, 16, 32 ); + cube_4way_update( &x16r_ctx.cube, vdata2, 64 ); + rintrlv_4x128_8x64( vdata, vdata2, vdata2, 640 ); break; case HAMSI: mm512_bswap32_intrlv80_8x64( vdata, pdata ); hamsi512_8way_init( &x16r_ctx.hamsi ); - hamsi512_8way_update( &x16r_ctx.hamsi, vdata, 64 ); + hamsi512_8way_update( &x16r_ctx.hamsi, vdata, 72 ); + break; + case FUGUE: + mm128_bswap32_80( edata, pdata ); + fugue512_init( &x16r_ctx.fugue ); + fugue512_update( &x16r_ctx.fugue, edata, 76 ); + intrlv_8x64( vdata, edata, edata, edata, edata, + edata, edata, edata, edata, 640 ); break; case SHABAL: mm256_bswap32_intrlv80_8x32( vdata2, pdata ); @@ -173,13 +184,13 @@ int x16r_8way_hash_generic( void* output, const void* input, int thrid ) hash7, vhash ); break; case KECCAK: - keccak512_8way_init( &ctx.keccak ); - if ( i == 0 ) - keccak512_8way_update( &ctx.keccak, input, size ); + if ( i == 0 ) + keccak512_8way_update( &ctx.keccak, input + (72<<3), 8 ); else { intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7, size<<3 ); + keccak512_8way_init( &ctx.keccak ); keccak512_8way_update( &ctx.keccak, vhash, size ); } keccak512_8way_close( &ctx.keccak, vhash ); @@ -203,15 +214,15 @@ int 
x16r_8way_hash_generic( void* output, const void* input, int thrid ) case LUFFA: if ( i == 0 ) { - intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 ); - luffa_4way_update_close( &ctx.luffa, vhash, - vhash + (16<<2), 16 ); - dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash ); - memcpy( &ctx, &x16r_ctx, sizeof(ctx) ); - intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 ); - luffa_4way_update_close( &ctx.luffa, vhash, - vhash + (16<<2), 16 ); - dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash ); + intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 ); + luffa_4way_update_close( &ctx.luffa, vhash, + vhash + (16<<2), 16 ); + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash ); + memcpy( &ctx, &x16r_ctx, sizeof(ctx) ); + intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 ); + luffa_4way_update_close( &ctx.luffa, vhash, + vhash + (16<<2), 16 ); + dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash ); } else { @@ -226,56 +237,24 @@ int x16r_8way_hash_generic( void* output, const void* input, int thrid ) case CUBEHASH: if ( i == 0 ) { - cubehashUpdateDigest( &ctx.cube, (byte*)hash0, - (const byte*)in0 + 64, 16 ); - memcpy( &ctx, &x16r_ctx, sizeof(ctx) ); - cubehashUpdateDigest( &ctx.cube, (byte*)hash1, - (const byte*)in1 + 64, 16 ); - memcpy( &ctx, &x16r_ctx, sizeof(ctx) ); - cubehashUpdateDigest( &ctx.cube, (byte*)hash2, - (const byte*)in2 + 64, 16 ); - memcpy( &ctx, &x16r_ctx, sizeof(ctx) ); - cubehashUpdateDigest( &ctx.cube, (byte*)hash3, - (const byte*)in3 + 64, 16 ); - memcpy( &ctx, &x16r_ctx, sizeof(ctx) ); - cubehashUpdateDigest( &ctx.cube, (byte*)hash4, - (const byte*)in4 + 64, 16 ); - memcpy( &ctx, &x16r_ctx, sizeof(ctx) ); - cubehashUpdateDigest( &ctx.cube, (byte*)hash5, - (const byte*)in5 + 64, 16 ); - memcpy( &ctx, &x16r_ctx, sizeof(ctx) ); - cubehashUpdateDigest( &ctx.cube, (byte*)hash6, - (const byte*)in6 + 64, 16 ); + intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 ); + cube_4way_update_close( &ctx.cube, vhash, + vhash + (16<<2), 16 ); + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash ); memcpy( &ctx, &x16r_ctx, sizeof(ctx) ); - cubehashUpdateDigest( &ctx.cube, (byte*)hash7, - (const byte*)in7 + 64, 16 ); + intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 ); + cube_4way_update_close( &ctx.cube, vhash, + vhash + (16<<2), 16 ); + dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash ); } else { - cubehashInit( &ctx.cube, 512, 16, 32 ); - cubehashUpdateDigest( &ctx.cube, (byte*) hash0, - (const byte*)in0, size ); - cubehashInit( &ctx.cube, 512, 16, 32 ); - cubehashUpdateDigest( &ctx.cube, (byte*) hash1, - (const byte*)in1, size ); - cubehashInit( &ctx.cube, 512, 16, 32 ); - cubehashUpdateDigest( &ctx.cube, (byte*) hash2, - (const byte*)in2, size ); - cubehashInit( &ctx.cube, 512, 16, 32 ); - cubehashUpdateDigest( &ctx.cube, (byte*) hash3, - (const byte*)in3, size ); - cubehashInit( &ctx.cube, 512, 16, 32 ); - cubehashUpdateDigest( &ctx.cube, (byte*) hash4, - (const byte*)in4, size ); - cubehashInit( &ctx.cube, 512, 16, 32 ); - cubehashUpdateDigest( &ctx.cube, (byte*) hash5, - (const byte*)in5, size ); - cubehashInit( &ctx.cube, 512, 16, 32 ); - cubehashUpdateDigest( &ctx.cube, (byte*) hash6, - (const byte*)in6, size ); - cubehashInit( &ctx.cube, 512, 16, 32 ); - cubehashUpdateDigest( &ctx.cube, (byte*) hash7, - (const byte*)in7, size ); + intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 ); + cube_4way_full( &ctx.cube, vhash, 512, vhash, size ); + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash ); + intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 ); + cube_4way_full( 
&ctx.cube, vhash, 512, vhash, size ); + dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash ); } break; case SHAVITE: @@ -334,7 +313,7 @@ int x16r_8way_hash_generic( void* output, const void* input, int thrid ) break; case HAMSI: if ( i == 0 ) - hamsi512_8way_update( &ctx.hamsi, input + (64<<3), 16 ); + hamsi512_8way_update( &ctx.hamsi, input + (72<<3), 8 ); else { intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7, @@ -347,14 +326,43 @@ int x16r_8way_hash_generic( void* output, const void* input, int thrid ) hash7, vhash ); break; case FUGUE: - fugue512_full( &ctx.fugue, hash0, in0, size ); - fugue512_full( &ctx.fugue, hash1, in1, size ); - fugue512_full( &ctx.fugue, hash2, in2, size ); - fugue512_full( &ctx.fugue, hash3, in3, size ); - fugue512_full( &ctx.fugue, hash4, in4, size ); - fugue512_full( &ctx.fugue, hash5, in5, size ); - fugue512_full( &ctx.fugue, hash6, in6, size ); - fugue512_full( &ctx.fugue, hash7, in7, size ); + if ( i == 0 ) + { + fugue512_update( &ctx.fugue, in0 + 76, 4 ); + fugue512_final( &ctx.fugue, hash0 ); + memcpy( &ctx, &x16r_ctx, sizeof(hashState_fugue) ); + fugue512_update( &ctx.fugue, in1 + 76, 4 ); + fugue512_final( &ctx.fugue, hash1 ); + memcpy( &ctx, &x16r_ctx, sizeof(hashState_fugue) ); + fugue512_update( &ctx.fugue, in2 + 76, 4 ); + fugue512_final( &ctx.fugue, hash2 ); + memcpy( &ctx, &x16r_ctx, sizeof(hashState_fugue) ); + fugue512_update( &ctx.fugue, in3 + 76, 4 ); + fugue512_final( &ctx.fugue, hash3 ); + memcpy( &ctx, &x16r_ctx, sizeof(hashState_fugue) ); + fugue512_update( &ctx.fugue, in4 + 76, 4 ); + fugue512_final( &ctx.fugue, hash4 ); + memcpy( &ctx, &x16r_ctx, sizeof(hashState_fugue) ); + fugue512_update( &ctx.fugue, in5 + 76, 4 ); + fugue512_final( &ctx.fugue, hash5 ); + memcpy( &ctx, &x16r_ctx, sizeof(hashState_fugue) ); + fugue512_update( &ctx.fugue, in6 + 76, 4 ); + fugue512_final( &ctx.fugue, hash6 ); + memcpy( &ctx, &x16r_ctx, sizeof(hashState_fugue) ); + fugue512_update( &ctx.fugue, in7 + 76, 4 ); + fugue512_final( &ctx.fugue, hash7 ); + } + else + { + fugue512_full( &ctx.fugue, hash0, in0, size ); + fugue512_full( &ctx.fugue, hash1, in1, size ); + fugue512_full( &ctx.fugue, hash2, in2, size ); + fugue512_full( &ctx.fugue, hash3, in3, size ); + fugue512_full( &ctx.fugue, hash4, in4, size ); + fugue512_full( &ctx.fugue, hash5, in5, size ); + fugue512_full( &ctx.fugue, hash6, in6, size ); + fugue512_full( &ctx.fugue, hash7, in7, size ); + } break; case SHABAL: intrlv_8x32( vhash, in0, in1, in2, in3, in4, in5, in6, in7, @@ -375,25 +383,25 @@ int x16r_8way_hash_generic( void* output, const void* input, int thrid ) { sph_whirlpool( &ctx.whirlpool, in0 + 64, 16 ); sph_whirlpool_close( &ctx.whirlpool, hash0 ); - memcpy( &ctx, &x16r_ctx, sizeof(ctx) ); + memcpy( &ctx, &x16r_ctx, sizeof(sph_whirlpool_context) ); sph_whirlpool( &ctx.whirlpool, in1 + 64, 16 ); sph_whirlpool_close( &ctx.whirlpool, hash1 ); - memcpy( &ctx, &x16r_ctx, sizeof(ctx) ); + memcpy( &ctx, &x16r_ctx, sizeof(sph_whirlpool_context) ); sph_whirlpool( &ctx.whirlpool, in2 + 64, 16 ); sph_whirlpool_close( &ctx.whirlpool, hash2 ); - memcpy( &ctx, &x16r_ctx, sizeof(ctx) ); + memcpy( &ctx, &x16r_ctx, sizeof(sph_whirlpool_context) ); sph_whirlpool( &ctx.whirlpool, in3 + 64, 16 ); sph_whirlpool_close( &ctx.whirlpool, hash3 ); - memcpy( &ctx, &x16r_ctx, sizeof(ctx) ); + memcpy( &ctx, &x16r_ctx, sizeof(sph_whirlpool_context) ); sph_whirlpool( &ctx.whirlpool, in4 + 64, 16 ); sph_whirlpool_close( &ctx.whirlpool, hash4 ); - memcpy( &ctx, &x16r_ctx, sizeof(ctx) ); + memcpy( &ctx, 
&x16r_ctx, sizeof(sph_whirlpool_context) ); sph_whirlpool( &ctx.whirlpool, in5 + 64, 16 ); sph_whirlpool_close( &ctx.whirlpool, hash5 ); - memcpy( &ctx, &x16r_ctx, sizeof(ctx) ); + memcpy( &ctx, &x16r_ctx, sizeof(sph_whirlpool_context) ); sph_whirlpool( &ctx.whirlpool, in6 + 64, 16 ); sph_whirlpool_close( &ctx.whirlpool, hash6 ); - memcpy( &ctx, &x16r_ctx, sizeof(ctx) ); + memcpy( &ctx, &x16r_ctx, sizeof(sph_whirlpool_context) ); sph_whirlpool( &ctx.whirlpool, in7 + 64, 16 ); sph_whirlpool_close( &ctx.whirlpool, hash7 ); } @@ -490,6 +498,7 @@ int scanhash_x16r_8way( struct work *work, uint32_t max_nonce, { x16_r_s_getAlgoString( (const uint8_t*)bedata1, x16r_hash_order ); s_ntime = ntime; + if ( opt_debug && !thr_id ) applog( LOG_INFO, "hash order %s (%08x)", x16r_hash_order, ntime ); } @@ -533,6 +542,11 @@ void x16r_4way_prehash( void *vdata, void *pdata ) jh512_4way_init( &x16r_ctx.jh ); jh512_4way_update( &x16r_ctx.jh, vdata, 64 ); break; + case KECCAK: + mm256_bswap32_intrlv80_4x64( vdata, pdata ); + keccak512_4way_init( &x16r_ctx.keccak ); + keccak512_4way_update( &x16r_ctx.keccak, vdata, 72 ); + break; case SKEIN: mm256_bswap32_intrlv80_4x64( vdata, pdata ); skein512_4way_prehash64( &x16r_ctx.skein, vdata ); @@ -546,14 +560,21 @@ void x16r_4way_prehash( void *vdata, void *pdata ) break; case CUBEHASH: mm128_bswap32_80( edata, pdata ); - cubehashInit( &x16r_ctx.cube, 512, 16, 32 ); - cubehashUpdate( &x16r_ctx.cube, (const byte*)edata, 64 ); - intrlv_4x64( vdata, edata, edata, edata, edata, 640 ); + intrlv_2x128( vdata2, edata, edata, 640 ); + cube_2way_init( &x16r_ctx.cube, 512, 16, 32 ); + cube_2way_update( &x16r_ctx.cube, vdata2, 64 ); + rintrlv_2x128_4x64( vdata, vdata2, vdata2, 640 ); break; case HAMSI: mm256_bswap32_intrlv80_4x64( vdata, pdata ); hamsi512_4way_init( &x16r_ctx.hamsi ); - hamsi512_4way_update( &x16r_ctx.hamsi, vdata, 64 ); + hamsi512_4way_update( &x16r_ctx.hamsi, vdata, 72 ); + break; + case FUGUE: + mm128_bswap32_80( edata, pdata ); + fugue512_init( &x16r_ctx.fugue ); + fugue512_update( &x16r_ctx.fugue, edata, 76 ); + intrlv_4x64( vdata, edata, edata, edata, edata, 640 ); break; case SHABAL: mm128_bswap32_intrlv80_4x32( vdata2, pdata ); @@ -646,12 +667,12 @@ int x16r_4way_hash_generic( void* output, const void* input, int thrid ) dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); break; case KECCAK: - keccak512_4way_init( &ctx.keccak ); - if ( i == 0 ) - keccak512_4way_update( &ctx.keccak, input, size ); + if ( i == 0 ) + keccak512_4way_update( &ctx.keccak, input + (72<<2), 8 ); else { intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 ); + keccak512_4way_init( &ctx.keccak ); keccak512_4way_update( &ctx.keccak, vhash, size ); } keccak512_4way_close( &ctx.keccak, vhash ); @@ -670,13 +691,13 @@ int x16r_4way_hash_generic( void* output, const void* input, int thrid ) case LUFFA: if ( i == 0 ) { - intrlv_2x128( vhash, hash0, hash1, 640 ); - luffa_2way_update_close( &ctx.luffa, vhash, vhash + (16<<1), 16 ); - dintrlv_2x128_512( hash0, hash1, vhash ); - intrlv_2x128( vhash, hash2, hash3, 640 ); - memcpy( &ctx, &x16r_ctx, sizeof(ctx) ); - luffa_2way_update_close( &ctx.luffa, vhash, vhash + (16<<1), 16 ); - dintrlv_2x128_512( hash2, hash3, vhash ); + intrlv_2x128( vhash, hash0, hash1, 640 ); + luffa_2way_update_close( &ctx.luffa, vhash, vhash + (16<<1), 16 ); + dintrlv_2x128_512( hash0, hash1, vhash ); + intrlv_2x128( vhash, hash2, hash3, 640 ); + memcpy( &ctx, &x16r_ctx, sizeof(ctx) ); + luffa_2way_update_close( &ctx.luffa, vhash, vhash + (16<<1), 16 ); + 
dintrlv_2x128_512( hash2, hash3, vhash ); } else { @@ -691,32 +712,24 @@ int x16r_4way_hash_generic( void* output, const void* input, int thrid ) case CUBEHASH: if ( i == 0 ) { - cubehashUpdateDigest( &ctx.cube, (byte*)hash0, - (const byte*)in0 + 64, 16 ); - memcpy( &ctx, &x16r_ctx, sizeof(ctx) ); - cubehashUpdateDigest( &ctx.cube, (byte*) hash1, - (const byte*)in1 + 64, 16 ); - memcpy( &ctx, &x16r_ctx, sizeof(ctx) ); - cubehashUpdateDigest( &ctx.cube, (byte*) hash2, - (const byte*)in2 + 64, 16 ); + intrlv_2x128( vhash, in0, in1, size<<3 ); + cube_2way_update_close( &ctx.cube, vhash, + vhash + (16<<1), 16 ); + dintrlv_2x128_512( hash0, hash1, vhash ); memcpy( &ctx, &x16r_ctx, sizeof(ctx) ); - cubehashUpdateDigest( &ctx.cube, (byte*) hash3, - (const byte*)in3 + 64, 16 ); + intrlv_2x128( vhash, in2, in3, size<<3 ); + cube_2way_update_close( &ctx.cube, vhash, + vhash + (16<<1), 16 ); + dintrlv_2x128_512( hash2, hash3, vhash ); } else { - cubehashInit( &ctx.cube, 512, 16, 32 ); - cubehashUpdateDigest( &ctx.cube, (byte*) hash0, - (const byte*)in0, size ); - cubehashInit( &ctx.cube, 512, 16, 32 ); - cubehashUpdateDigest( &ctx.cube, (byte*) hash1, - (const byte*)in1, size ); - cubehashInit( &ctx.cube, 512, 16, 32 ); - cubehashUpdateDigest( &ctx.cube, (byte*) hash2, - (const byte*)in2, size ); - cubehashInit( &ctx.cube, 512, 16, 32 ); - cubehashUpdateDigest( &ctx.cube, (byte*) hash3, - (const byte*)in3, size ); + intrlv_2x128( vhash, in0, in1, size<<3 ); + cube_2way_full( &ctx.cube, vhash, 512, vhash, size ); + dintrlv_2x128_512( hash0, hash1, vhash ); + intrlv_2x128( vhash, in2, in3, size<<3 ); + cube_2way_full( &ctx.cube, vhash, 512, vhash, size ); + dintrlv_2x128_512( hash2, hash3, vhash ); } break; case SHAVITE: @@ -763,7 +776,7 @@ int x16r_4way_hash_generic( void* output, const void* input, int thrid ) break; case HAMSI: if ( i == 0 ) - hamsi512_4way_update( &ctx.hamsi, input + (64<<2), 16 ); + hamsi512_4way_update( &ctx.hamsi, input + (72<<2), 8 ); else { intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 ); @@ -774,10 +787,27 @@ int x16r_4way_hash_generic( void* output, const void* input, int thrid ) dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); break; case FUGUE: - fugue512_full( &ctx.fugue, hash0, in0, size ); - fugue512_full( &ctx.fugue, hash1, in1, size ); - fugue512_full( &ctx.fugue, hash2, in2, size ); - fugue512_full( &ctx.fugue, hash3, in3, size ); + if ( i == 0 ) + { + fugue512_update( &ctx.fugue, in0 + 76, 4 ); + fugue512_final( &ctx.fugue, hash0 ); + memcpy( &ctx, &x16r_ctx, sizeof(hashState_fugue) ); + fugue512_update( &ctx.fugue, in1 + 76, 4 ); + fugue512_final( &ctx.fugue, hash1 ); + memcpy( &ctx, &x16r_ctx, sizeof(hashState_fugue) ); + fugue512_update( &ctx.fugue, in2 + 76, 4 ); + fugue512_final( &ctx.fugue, hash2 ); + memcpy( &ctx, &x16r_ctx, sizeof(hashState_fugue) ); + fugue512_update( &ctx.fugue, in3 + 76, 4 ); + fugue512_final( &ctx.fugue, hash3 ); + } + else + { + fugue512_full( &ctx.fugue, hash0, in0, size ); + fugue512_full( &ctx.fugue, hash1, in1, size ); + fugue512_full( &ctx.fugue, hash2, in2, size ); + fugue512_full( &ctx.fugue, hash3, in3, size ); + } break; case SHABAL: intrlv_4x32( vhash, in0, in1, in2, in3, size<<3 ); @@ -883,7 +913,7 @@ int scanhash_x16r_4way( struct work *work, uint32_t max_nonce, x16_r_s_getAlgoString( (const uint8_t*)bedata1, x16r_hash_order ); s_ntime = ntime; if ( opt_debug && !thr_id ) - applog( LOG_INFO, "hash order %s (%08x)", x16r_hash_order, ntime ); + applog( LOG_INFO, "hash order %s (%08x)", x16r_hash_order, ntime ); } 
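// The per-algorithm contexts share storage in this overlay union; only
// the function selected for the current round of the hash order is live
// at any time.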
x16r_4way_prehash( vdata, pdata ); diff --git a/algo/x16/x16r-gate.c b/algo/x16/x16r-gate.c index 09315f6a..88401062 100644 --- a/algo/x16/x16r-gate.c +++ b/algo/x16/x16r-gate.c @@ -1,4 +1,5 @@ #include "x16r-gate.h" +#include "algo/sha/sha256d.h" __thread char x16r_hash_order[ X16R_HASH_FUNC_COUNT + 1 ] = { 0 }; @@ -61,8 +62,7 @@ bool register_x16r_algo( algo_gate_t* gate ) gate->scanhash = (void*)&scanhash_x16r; gate->hash = (void*)&x16r_hash; #endif - gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | - VAES_OPT | VAES256_OPT; + gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT; x16_r_s_getAlgoString = (void*)&x16r_getAlgoString; opt_target_factor = 256.0; return true; @@ -80,8 +80,7 @@ bool register_x16rv2_algo( algo_gate_t* gate ) gate->scanhash = (void*)&scanhash_x16rv2; gate->hash = (void*)&x16rv2_hash; #endif - gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | - VAES_OPT | VAES256_OPT; + gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT; x16_r_s_getAlgoString = (void*)&x16r_getAlgoString; opt_target_factor = 256.0; return true; @@ -99,8 +98,7 @@ bool register_x16s_algo( algo_gate_t* gate ) gate->scanhash = (void*)&scanhash_x16r; gate->hash = (void*)&x16r_hash; #endif - gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | - VAES_OPT | VAES256_OPT; + gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT; x16_r_s_getAlgoString = (void*)&x16s_getAlgoString; opt_target_factor = 256.0; return true; @@ -233,8 +231,7 @@ bool register_x16rt_algo( algo_gate_t* gate ) gate->scanhash = (void*)&scanhash_x16rt; gate->hash = (void*)&x16r_hash; #endif - gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | - VAES_OPT | VAES256_OPT; + gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT; opt_target_factor = 256.0; return true; }; @@ -251,8 +248,7 @@ bool register_x16rt_veil_algo( algo_gate_t* gate ) gate->scanhash = (void*)&scanhash_x16rt; gate->hash = (void*)&x16r_hash; #endif - gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | - VAES_OPT | VAES256_OPT; + gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT; gate->build_extraheader = (void*)&veil_build_extraheader; opt_target_factor = 256.0; return true; @@ -291,8 +287,7 @@ bool register_x21s_algo( algo_gate_t* gate ) gate->hash = (void*)&x21s_hash; gate->miner_thread_init = (void*)&x21s_thread_init; #endif - gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | - VAES_OPT | VAES256_OPT; + gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT; x16_r_s_getAlgoString = (void*)&x16s_getAlgoString; opt_target_factor = 256.0; return true; diff --git a/algo/x16/x16r-gate.h b/algo/x16/x16r-gate.h index 748b7fa3..76ca5e7e 100644 --- a/algo/x16/x16r-gate.h +++ b/algo/x16/x16r-gate.h @@ -37,6 +37,7 @@ #include "algo/jh/jh-hash-4way.h" #include "algo/keccak/keccak-hash-4way.h" #include "algo/luffa/luffa-hash-2way.h" +#include "algo/cubehash/cube-hash-2way.h" #include "algo/simd/simd-hash-2way.h" #include "algo/echo/aes_ni/hash_api.h" #include "algo/hamsi/hamsi-hash-4way.h" @@ -115,7 +116,7 @@ union _x16r_8way_context_overlay jh512_8way_context jh; keccak512_8way_context keccak; luffa_4way_context luffa; - cubehashParam cube; + cube_4way_context cube; simd_4way_context simd; hamsi512_8way_context hamsi; hashState_fugue fugue; @@ -164,8 +165,8 @@ union _x16r_4way_context_overlay jh512_4way_context jh; keccak512_4way_context keccak; 
luffa_2way_context luffa; + cube_2way_context cube; hashState_luffa luffa1; - cubehashParam cube; simd_2way_context simd; hamsi512_4way_context hamsi; hashState_fugue fugue; diff --git a/algo/x16/x16rt-4way.c b/algo/x16/x16rt-4way.c index fcd56af6..fee47ffa 100644 --- a/algo/x16/x16rt-4way.c +++ b/algo/x16/x16rt-4way.c @@ -24,15 +24,15 @@ int scanhash_x16rt_8way( struct work *work, uint32_t max_nonce, if ( bench ) ptarget[7] = 0x0cff; static __thread uint32_t s_ntime = UINT32_MAX; - uint32_t ntime = bswap_32( pdata[17] ); - if ( s_ntime != ntime ) + uint32_t masked_ntime = bswap_32( pdata[17] ) & 0xffffff80; + if ( s_ntime != masked_ntime ) { - x16rt_getTimeHash( ntime, &timeHash ); + x16rt_getTimeHash( masked_ntime, &timeHash ); x16rt_getAlgoString( &timeHash[0], x16r_hash_order ); - s_ntime = ntime; + s_ntime = masked_ntime; if ( opt_debug && !thr_id ) applog( LOG_INFO, "hash order: %s time: (%08x) time hash: (%08x)", - x16r_hash_order, ntime, timeHash ); + x16r_hash_order, bswap_32( pdata[17] ), timeHash ); } x16r_8way_prehash( vdata, pdata ); @@ -78,15 +78,15 @@ int scanhash_x16rt_4way( struct work *work, uint32_t max_nonce, if ( bench ) ptarget[7] = 0x0cff; static __thread uint32_t s_ntime = UINT32_MAX; - uint32_t ntime = bswap_32( pdata[17] ); - if ( s_ntime != ntime ) + uint32_t masked_ntime = bswap_32( pdata[17] ) & 0xffffff80; + if ( s_ntime != masked_ntime ) { - x16rt_getTimeHash( ntime, &timeHash ); + x16rt_getTimeHash( masked_ntime, &timeHash ); x16rt_getAlgoString( &timeHash[0], x16r_hash_order ); - s_ntime = ntime; + s_ntime = masked_ntime; if ( opt_debug && !thr_id ) applog( LOG_INFO, "hash order: %s time: (%08x) time hash: (%08x)", - x16r_hash_order, ntime, timeHash ); + x16r_hash_order, bswap_32( pdata[17] ), timeHash ); } x16r_4way_prehash( vdata, pdata ); diff --git a/algo/x16/x16rt.c b/algo/x16/x16rt.c index 62507098..7ff8dc5d 100644 --- a/algo/x16/x16rt.c +++ b/algo/x16/x16rt.c @@ -20,15 +20,15 @@ int scanhash_x16rt( struct work *work, uint32_t max_nonce, mm128_bswap32_80( edata, pdata ); static __thread uint32_t s_ntime = UINT32_MAX; - uint32_t ntime = swab32( pdata[17] ); - if ( s_ntime != ntime ) + uint32_t masked_ntime = swab32( pdata[17] ) & 0xffffff80; + if ( s_ntime != masked_ntime ) { - x16rt_getTimeHash( ntime, &timeHash ); + x16rt_getTimeHash( masked_ntime, &timeHash ); x16rt_getAlgoString( &timeHash[0], x16r_hash_order ); - s_ntime = ntime; + s_ntime = masked_ntime; if ( opt_debug && !thr_id ) applog( LOG_INFO, "hash order: %s time: (%08x) time hash: (%08x)", - x16r_hash_order, ntime, timeHash ); + x16r_hash_order, swab32( pdata[17] ), timeHash ); } x16r_prehash( edata, pdata ); diff --git a/algo/x16/x21s-4way.c b/algo/x16/x21s-4way.c index de2dbe68..2f27116f 100644 --- a/algo/x16/x21s-4way.c +++ b/algo/x16/x21s-4way.c @@ -13,7 +13,7 @@ #include "algo/gost/sph_gost.h" #include "algo/lyra2/lyra2.h" #if defined(__SHA__) - #include "algo/sha/sph_sha2.h" + #include "algo/sha/sha256-hash.h" #endif #if defined (X21S_8WAY) @@ -208,9 +208,7 @@ union _x21s_4way_context_overlay haval256_5_4way_context haval; sph_tiger_context tiger; sph_gost512_context gost; -#if defined(__SHA__) - sph_sha256_context sha256; -#else +#if !defined(__SHA__) sha256_4way_context sha256; #endif } __attribute__ ((aligned (64))); @@ -275,18 +273,10 @@ int x21s_4way_hash( void* output, const void* input, int thrid ) #if defined(__SHA__) - sph_sha256_init( &ctx.sha256 ); - sph_sha256( &ctx.sha256, hash0, 64 ); - sph_sha256_close( &ctx.sha256, output ); - sph_sha256_init( &ctx.sha256 ); - 
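The x16rt scanhash changes above key the cached hash order on ntime masked to a 128-second boundary (& 0xffffff80), so the time hash and algo order are recomputed at most once per window instead of on every ntime tick. A minimal sketch of that caching pattern, with get_order() standing in for the real x16rt_getTimeHash/x16rt_getAlgoString pair:

    #include <stdint.h>

    /* Sketch only: redo the per-window work (time hash + algo order)
       only when ntime crosses a 128-second boundary.
       get_order() is a hypothetical stand-in. */
    void update_order( uint32_t ntime_be )
    {
        static __thread uint32_t s_masked = UINT32_MAX;
        uint32_t masked = __builtin_bswap32( ntime_be ) & 0xffffff80;
        if ( masked != s_masked )
        {
            get_order( masked );   /* hypothetical helper */
            s_masked = masked;
        }
    }
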
sph_sha256( &ctx.sha256, hash1, 64 ); - sph_sha256_close( &ctx.sha256, output+32 ); - sph_sha256_init( &ctx.sha256 ); - sph_sha256( &ctx.sha256, hash2, 64 ); - sph_sha256_close( &ctx.sha256, output+64 ); - sph_sha256_init( &ctx.sha256 ); - sph_sha256( &ctx.sha256, hash3, 64 ); - sph_sha256_close( &ctx.sha256, output+96 ); + sha256_full( output, hash0, 64 ); + sha256_full( output+32, hash1, 64 ); + sha256_full( output+64, hash2, 64 ); + sha256_full( output+96, hash3, 64 ); #else diff --git a/algo/x16/x21s.c b/algo/x16/x21s.c index b81c07ec..96782e22 100644 --- a/algo/x16/x21s.c +++ b/algo/x16/x21s.c @@ -8,7 +8,7 @@ #include #include #include -#include "algo/sha/sph_sha2.h" +#include "algo/sha/sha256-hash.h" #include "algo/haval/sph-haval.h" #include "algo/tiger/sph_tiger.h" #include "algo/gost/sph_gost.h" @@ -23,7 +23,7 @@ union _x21s_context_overlay sph_haval256_5_context haval; sph_tiger_context tiger; sph_gost512_context gost; - sph_sha256_context sha256; + sha256_context sha256; }; typedef union _x21s_context_overlay x21s_context_overlay; @@ -50,9 +50,7 @@ int x21s_hash( void* output, const void* input, int thrid ) sph_gost512 ( &ctx.gost, (const void*) hash, 64 ); sph_gost512_close( &ctx.gost, (void*) hash ); - sph_sha256_init( &ctx.sha256 ); - sph_sha256( &ctx.sha256, hash, 64 ); - sph_sha256_close( &ctx.sha256, hash ); + sha256_full( hash, hash, 64 ); memcpy( output, hash, 32 ); diff --git a/algo/x17/sonoa-gate.c b/algo/x17/sonoa-gate.c index d192b0df..926beb4c 100644 --- a/algo/x17/sonoa-gate.c +++ b/algo/x17/sonoa-gate.c @@ -12,7 +12,7 @@ bool register_sonoa_algo( algo_gate_t* gate ) init_sonoa_ctx(); gate->hash = (void*)&sonoa_hash; #endif - gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT | VAES256_OPT; + gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT; return true; }; diff --git a/algo/x17/x17-4way.c b/algo/x17/x17-4way.c index fcff0b6e..1902a2de 100644 --- a/algo/x17/x17-4way.c +++ b/algo/x17/x17-4way.c @@ -37,7 +37,8 @@ union _x17_8way_context_overlay jh512_8way_context jh; keccak512_8way_context keccak; luffa_4way_context luffa; - cube_4way_context cube; +// cube_4way_context cube; + cube_4way_2buf_context cube; #if defined(__VAES__) groestl512_4way_context groestl; shavite512_4way_context shavite; @@ -119,8 +120,10 @@ int x17_8way_hash( void *state, const void *input, int thr_id ) luffa512_4way_full( &ctx.luffa, vhashA, vhashA, 64 ); luffa512_4way_full( &ctx.luffa, vhashB, vhashB, 64 ); - cube_4way_full( &ctx.cube, vhashA, 512, vhashA, 64 ); - cube_4way_full( &ctx.cube, vhashB, 512, vhashB, 64 ); + cube_4way_2buf_full( &ctx.cube, vhashA, vhashB, 512, vhashA, vhashB, 64 ); + +// cube_4way_full( &ctx.cube, vhashA, 512, vhashA, 64 ); +// cube_4way_full( &ctx.cube, vhashB, 512, vhashB, 64 ); #if defined(__VAES__) diff --git a/algo/x17/x17-gate.c b/algo/x17/x17-gate.c index 6ab09ff0..eee3d60d 100644 --- a/algo/x17/x17-gate.c +++ b/algo/x17/x17-gate.c @@ -11,7 +11,7 @@ bool register_x17_algo( algo_gate_t* gate ) #else gate->hash = (void*)&x17_hash; #endif - gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT | VAES256_OPT; + gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT; return true; }; diff --git a/algo/x17/xevan-gate.c b/algo/x17/xevan-gate.c index 545a0aa6..184ed2df 100644 --- a/algo/x17/xevan-gate.c +++ b/algo/x17/xevan-gate.c @@ -12,7 +12,7 @@ bool register_xevan_algo( algo_gate_t* gate ) init_xevan_ctx(); gate->hash = (void*)&xevan_hash; #endif - gate->optimizations = 
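The x21s/x22i/x25x conversions replace the three-call sph_sha256_init/sph_sha256/sph_sha256_close sequence with the one-shot sha256_full() from algo/sha/sha256-hash.h. A plausible shape for such a wrapper, assuming it composes the incremental sha256_ctx_init/sha256_update/sha256_final calls this patch uses in the yespower code; the real implementation may differ:

    #include <stddef.h>
    #include "algo/sha/sha256-hash.h"   /* sha256_context and the calls below */

    /* Sketch of a one-shot SHA-256 over a buffer, as used for the final
       32-byte digest of x21s/x22i/x25x. */
    void sha256_full_sketch( void *digest, const void *data, size_t len )
    {
        sha256_context ctx;
        sha256_ctx_init( &ctx );
        sha256_update( &ctx, data, len );
        sha256_final( &ctx, digest );
    }
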
SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT | VAES256_OPT; + gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT; opt_target_factor = 256.0; return true; }; diff --git a/algo/x22/x22i-4way.c b/algo/x22/x22i-4way.c index 94b34cc5..5acf3de5 100644 --- a/algo/x22/x22i-4way.c +++ b/algo/x22/x22i-4way.c @@ -28,7 +28,7 @@ #include "algo/echo/echo-hash-4way.h" #endif #if defined(__SHA__) - #include "algo/sha/sph_sha2.h" + #include "algo/sha/sha256-hash.h" #endif #if defined(X22I_8WAY) @@ -51,9 +51,7 @@ union _x22i_8way_ctx_overlay haval256_5_8way_context haval; sph_tiger_context tiger; sph_gost512_context gost; -#if defined(X22I_8WAY_SHA) - sph_sha256_context sha256; -#else +#if !defined(X22I_8WAY_SHA) sha256_8way_context sha256; #endif #if defined(__VAES__) @@ -391,30 +389,14 @@ int x22i_8way_hash( void *output, const void *input, int thrid ) #if defined(X22I_8WAY_SHA) - sph_sha256_init( &ctx.sha256 ); - sph_sha256( &ctx.sha256, hash0, 64 ); - sph_sha256_close( &ctx.sha256, output ); - sph_sha256_init( &ctx.sha256 ); - sph_sha256( &ctx.sha256, hash1, 64 ); - sph_sha256_close( &ctx.sha256, output+32 ); - sph_sha256_init( &ctx.sha256 ); - sph_sha256( &ctx.sha256, hash2, 64 ); - sph_sha256_close( &ctx.sha256, output+64 ); - sph_sha256_init( &ctx.sha256 ); - sph_sha256( &ctx.sha256, hash3, 64 ); - sph_sha256_close( &ctx.sha256, output+96 ); - sph_sha256_init( &ctx.sha256 ); - sph_sha256( &ctx.sha256, hash4, 64 ); - sph_sha256_close( &ctx.sha256, output+128 ); - sph_sha256_init( &ctx.sha256 ); - sph_sha256( &ctx.sha256, hash5, 64 ); - sph_sha256_close( &ctx.sha256, output+160 ); - sph_sha256_init( &ctx.sha256 ); - sph_sha256( &ctx.sha256, hash6, 64 ); - sph_sha256_close( &ctx.sha256, output+192 ); - sph_sha256_init( &ctx.sha256 ); - sph_sha256( &ctx.sha256, hash7, 64 ); - sph_sha256_close( &ctx.sha256, output+224 ); + sha256_full( hash0, hash0, 64 ); + sha256_full( hash1, hash1, 64 ); + sha256_full( hash2, hash2, 64 ); + sha256_full( hash3, hash3, 64 ); + sha256_full( hash4, hash4, 64 ); + sha256_full( hash5, hash5, 64 ); + sha256_full( hash6, hash6, 64 ); + sha256_full( hash7, hash7, 64 ); #else @@ -551,9 +533,7 @@ union _x22i_4way_ctx_overlay haval256_5_4way_context haval; sph_tiger_context tiger; sph_gost512_context gost; -#if defined(X22I_4WAY_SHA) - sph_sha256_context sha256; -#else +#if !defined(X22I_4WAY_SHA) sha256_4way_context sha256; #endif }; @@ -757,18 +737,10 @@ int x22i_4way_hash( void *output, const void *input, int thrid ) #if defined(X22I_4WAY_SHA) - sph_sha256_init( &ctx.sha256 ); - sph_sha256( &ctx.sha256, hash0, 64 ); - sph_sha256_close( &ctx.sha256, output ); - sph_sha256_init( &ctx.sha256 ); - sph_sha256( &ctx.sha256, hash1, 64 ); - sph_sha256_close( &ctx.sha256, output+32 ); - sph_sha256_init( &ctx.sha256 ); - sph_sha256( &ctx.sha256, hash2, 64 ); - sph_sha256_close( &ctx.sha256, output+64 ); - sph_sha256_init( &ctx.sha256 ); - sph_sha256( &ctx.sha256, hash3, 64 ); - sph_sha256_close( &ctx.sha256, output+96 ); + sha256_full( hash0, hash0, 64 ); + sha256_full( hash1, hash1, 64 ); + sha256_full( hash2, hash2, 64 ); + sha256_full( hash3, hash3, 64 ); #else diff --git a/algo/x22/x22i-gate.c b/algo/x22/x22i-gate.c index ff0cc805..826f0f88 100644 --- a/algo/x22/x22i-gate.c +++ b/algo/x22/x22i-gate.c @@ -31,8 +31,8 @@ bool register_x22i_algo( algo_gate_t* gate ) #endif - gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | SHA_OPT - | AVX512_OPT | VAES_OPT | VAES256_OPT; + gate->optimizations = SSE2_OPT | SSE42_OPT | AES_OPT | AVX2_OPT | SHA_OPT + | 
AVX512_OPT | VAES_OPT; return true; }; @@ -48,8 +48,8 @@ bool register_x25x_algo( algo_gate_t* gate ) gate->scanhash = (void*)&scanhash_x25x; gate->hash = (void*)&x25x_hash; #endif - gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | SHA_OPT | - AVX512_OPT | VAES_OPT | VAES256_OPT; + gate->optimizations = SSE2_OPT | SSE42_OPT | AES_OPT | AVX2_OPT | SHA_OPT | + AVX512_OPT | VAES_OPT; return true; }; diff --git a/algo/x22/x22i.c b/algo/x22/x22i.c index 759e44c4..d63ddf24 100644 --- a/algo/x22/x22i.c +++ b/algo/x22/x22i.c @@ -24,6 +24,7 @@ #include "algo/shabal/sph_shabal.h" #include "algo/whirlpool/sph_whirlpool.h" #include "algo/sha/sph_sha2.h" +#include "algo/sha/sha256-hash.h" #include "algo/haval/sph-haval.h" #include "algo/tiger/sph_tiger.h" #include "algo/lyra2/lyra2.h" @@ -57,7 +58,6 @@ union _x22i_context_overlay sph_haval256_5_context haval; sph_tiger_context tiger; sph_gost512_context gost; - sph_sha256_context sha256; }; typedef union _x22i_context_overlay x22i_context_overlay; @@ -172,9 +172,7 @@ int x22i_hash( void *output, const void *input, int thrid ) sph_gost512 (&ctx.gost, (const void*) hash, 64); sph_gost512_close(&ctx.gost, (void*) hash); - sph_sha256_init( &ctx.sha256 ); - sph_sha256( &ctx.sha256, hash, 64 ); - sph_sha256_close( &ctx.sha256, hash ); + sha256_full( hash, hash, 64 ); memcpy(output, hash, 32); diff --git a/algo/x22/x25x-4way.c b/algo/x22/x25x-4way.c index 86f56997..ff2888ec 100644 --- a/algo/x22/x25x-4way.c +++ b/algo/x22/x25x-4way.c @@ -33,7 +33,7 @@ #include "algo/echo/echo-hash-4way.h" #endif #if defined(__SHA__) - #include "algo/sha/sph_sha2.h" + #include "algo/sha/sha256-hash.h" #endif void x25x_shuffle( void *hash ) @@ -84,7 +84,7 @@ union _x25x_8way_ctx_overlay sph_tiger_context tiger; sph_gost512_context gost; #if defined(X25X_8WAY_SHA) - sph_sha256_context sha256; + sha256_context sha256; #else sha256_8way_context sha256; #endif @@ -447,31 +447,15 @@ int x25x_8way_hash( void *output, const void *input, int thrid ) #if defined(X25X_8WAY_SHA) - sph_sha256_init( &ctx.sha256 ); - sph_sha256( &ctx.sha256, hash0[20], 64 ); - sph_sha256_close( &ctx.sha256, hash0[21] ); - sph_sha256_init( &ctx.sha256 ); - sph_sha256( &ctx.sha256, hash1[20], 64 ); - sph_sha256_close( &ctx.sha256, hash1[21] ); - sph_sha256_init( &ctx.sha256 ); - sph_sha256( &ctx.sha256, hash2[20], 64 ); - sph_sha256_close( &ctx.sha256, hash2[21] ); - sph_sha256_init( &ctx.sha256 ); - sph_sha256( &ctx.sha256, hash3[20], 64 ); - sph_sha256_close( &ctx.sha256, hash3[21] ); - sph_sha256_init( &ctx.sha256 ); - sph_sha256( &ctx.sha256, hash4[20], 64 ); - sph_sha256_close( &ctx.sha256, hash4[21] ); - sph_sha256_init( &ctx.sha256 ); - sph_sha256( &ctx.sha256, hash5[20], 64 ); - sph_sha256_close( &ctx.sha256, hash5[21] ); - sph_sha256_init( &ctx.sha256 ); - sph_sha256( &ctx.sha256, hash6[20], 64 ); - sph_sha256_close( &ctx.sha256, hash6[21] ); - sph_sha256_init( &ctx.sha256 ); - sph_sha256( &ctx.sha256, hash7[20], 64 ); - sph_sha256_close( &ctx.sha256, hash7[21] ); - + sha256_full( hash0[21], hash0[20], 64 ); + sha256_full( hash1[21], hash1[20], 64 ); + sha256_full( hash2[21], hash2[20], 64 ); + sha256_full( hash3[21], hash3[20], 64 ); + sha256_full( hash4[21], hash4[20], 64 ); + sha256_full( hash5[21], hash5[20], 64 ); + sha256_full( hash6[21], hash6[20], 64 ); + sha256_full( hash7[21], hash7[20], 64 ); + intrlv_8x32_512( vhash, hash0[21], hash1[21], hash2[21], hash3[21], hash4[21], hash5[21], hash6[21], hash7[21] ); @@ -646,7 +630,7 @@ union _x25x_4way_ctx_overlay sph_tiger_context tiger; 
sph_gost512_context gost; #if defined(X25X_4WAY_SHA) - sph_sha256_context sha256; + sha256_context sha256; #else sha256_4way_context sha256; #endif @@ -848,18 +832,10 @@ int x25x_4way_hash( void *output, const void *input, int thrid ) #if defined(X25X_4WAY_SHA) - sph_sha256_init( &ctx.sha256 ); - sph_sha256( &ctx.sha256, hash0[20], 64 ); - sph_sha256_close( &ctx.sha256, hash0[21] ); - sph_sha256_init( &ctx.sha256 ); - sph_sha256( &ctx.sha256, hash1[20], 64 ); - sph_sha256_close( &ctx.sha256, hash1[21] ); - sph_sha256_init( &ctx.sha256 ); - sph_sha256( &ctx.sha256, hash2[20], 64 ); - sph_sha256_close( &ctx.sha256, hash2[21] ); - sph_sha256_init( &ctx.sha256 ); - sph_sha256( &ctx.sha256, hash3[20], 64 ); - sph_sha256_close( &ctx.sha256, hash3[21] ); + sha256_full( hash0[21], hash0[20], 64 ); + sha256_full( hash1[21], hash1[20], 64 ); + sha256_full( hash2[21], hash2[20], 64 ); + sha256_full( hash3[21], hash3[20], 64 ); intrlv_4x32_512( vhash, hash0[21], hash1[21], hash2[21], hash3[21] ); diff --git a/algo/x22/x25x.c b/algo/x22/x25x.c index 42e7eda0..aade6e2b 100644 --- a/algo/x22/x25x.c +++ b/algo/x22/x25x.c @@ -23,7 +23,7 @@ #include "algo/hamsi/sph_hamsi.h" #include "algo/shabal/sph_shabal.h" #include "algo/whirlpool/sph_whirlpool.h" -#include "algo/sha/sph_sha2.h" +#include "algo/sha/sha256-hash.h" #include "algo/haval/sph-haval.h" #include "algo/tiger/sph_tiger.h" #include "algo/lyra2/lyra2.h" @@ -60,7 +60,7 @@ union _x25x_context_overlay sph_haval256_5_context haval; sph_tiger_context tiger; sph_gost512_context gost; - sph_sha256_context sha256; + sha256_context sha256; sph_panama_context panama; blake2s_state blake2s; }; @@ -174,9 +174,7 @@ int x25x_hash( void *output, const void *input, int thrid ) sph_gost512 (&ctx.gost, (const void*) &hash[19], 64); sph_gost512_close(&ctx.gost, (void*) &hash[20]); - sph_sha256_init( &ctx.sha256 ); - sph_sha256( &ctx.sha256, &hash[20], 64 ); - sph_sha256_close( &ctx.sha256, &hash[21] ); + sha256_full( &hash[21], &hash[20], 64 ); sph_panama_init(&ctx.panama); sph_panama (&ctx.panama, (const void*) &hash[21], 64 ); diff --git a/algo/yespower/crypto/blake2b-yp.c b/algo/yespower/crypto/blake2b-yp.c index 407d2dde..dc6eee6a 100644 --- a/algo/yespower/crypto/blake2b-yp.c +++ b/algo/yespower/crypto/blake2b-yp.c @@ -35,9 +35,11 @@ #include "blake2b-yp.h" // Cyclic right rotation. -#ifndef ROTR64 -#define ROTR64(x, y) (((x) >> (y)) ^ ((x) << (64 - (y)))) -#endif +//#ifndef ROTR64 +//#define ROTR64(x, y) (((x) >> (y)) ^ ((x) << (64 - (y)))) +//#endif + +#define ROTR64(x, y) ror64( x, y ) // Little-endian byte access. #define B2B_GET64(p) \ diff --git a/algo/yespower/yescrypt-r8g.c b/algo/yespower/yescrypt-r8g.c index 27d1fd85..b278c36b 100644 --- a/algo/yespower/yescrypt-r8g.c +++ b/algo/yespower/yescrypt-r8g.c @@ -52,8 +52,8 @@ int scanhash_yespower_r8g( struct work *work, uint32_t max_nonce, endiandata[19] = n; // do sha256 prehash - sph_sha256_init( &sha256_prehash_ctx ); - sph_sha256( &sha256_prehash_ctx, endiandata, 64 ); + sha256_ctx_init( &sha256_prehash_ctx ); + sha256_update( &sha256_prehash_ctx, endiandata, 64 ); do { yespower_tls( (unsigned char *)endiandata, params.perslen, diff --git a/algo/yespower/yespower-gate.c b/algo/yespower/yespower-gate.c index e0b3f44e..a14f6ca5 100644 --- a/algo/yespower/yespower-gate.c +++ b/algo/yespower/yespower-gate.c @@ -27,14 +27,11 @@ * coin. 
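blake2b-yp.c above drops its private ROTR64 macro in favour of the project-wide ror64(). For reference, a version that computes the same rotation; the shifted halves do not overlap for 0 < c < 64, so the macro's XOR and the OR below are equivalent, and the real ror64 in simd-utils may compile to a single rotate instruction:

    #include <stdint.h>

    /* Sketch: cyclic right rotation of a 64-bit word, valid for 0 < c < 64. */
    static inline uint64_t ror64_sketch( uint64_t x, unsigned c )
    {
        return ( x >> c ) | ( x << ( 64 - c ) );
    }
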
*/ #include "yespower.h" - #include "algo-gate-api.h" yespower_params_t yespower_params; -//SHA256_CTX sha256_prehash_ctx; -__thread sph_sha256_context sha256_prehash_ctx; -//__thread SHA256_CTX sha256_prehash_ctx; +__thread sha256_context sha256_prehash_ctx; // YESPOWER @@ -61,8 +58,8 @@ int scanhash_yespower( struct work *work, uint32_t max_nonce, endiandata[19] = n; // do sha256 prehash - sph_sha256_init( &sha256_prehash_ctx ); - sph_sha256( &sha256_prehash_ctx, endiandata, 64 ); + sha256_ctx_init( &sha256_prehash_ctx ); + sha256_update( &sha256_prehash_ctx, endiandata, 64 ); do { if ( yespower_hash( (char*)endiandata, (char*)vhash, 80, thr_id ) ) @@ -101,10 +98,6 @@ int scanhash_yespower_b2b( struct work *work, uint32_t max_nonce, be32enc( &endiandata[k], pdata[k] ); endiandata[19] = n; - // do sha256 prehash - sph_sha256_init( &sha256_prehash_ctx ); - sph_sha256( &sha256_prehash_ctx, endiandata, 64 ); - do { if (yespower_b2b_hash( (char*) endiandata, (char*) vhash, 80, thr_id ) ) if unlikely( valid_hash( vhash, ptarget ) && !opt_benchmark ) diff --git a/algo/yespower/yespower-opt.c b/algo/yespower/yespower-opt.c index e21e4f17..5e725af7 100644 --- a/algo/yespower/yespower-opt.c +++ b/algo/yespower/yespower-opt.c @@ -203,17 +203,17 @@ static inline void salsa20_simd_unshuffle(const salsa20_blk_t *Bin, ARX(X0, X3, X2, 18) \ /* Rearrange data */ \ X1 = _mm_shuffle_epi32(X1, 0x93); \ + X3 = _mm_shuffle_epi32(X3, 0x39); \ X2 = _mm_shuffle_epi32(X2, 0x4E); \ - X3 = _mm_shuffle_epi32(X3, 0x39); \ /* Operate on "rows" */ \ ARX(X3, X0, X1, 7) \ ARX(X2, X3, X0, 9) \ ARX(X1, X2, X3, 13) \ ARX(X0, X1, X2, 18) \ /* Rearrange data */ \ + X3 = _mm_shuffle_epi32(X3, 0x93); \ X1 = _mm_shuffle_epi32(X1, 0x39); \ - X2 = _mm_shuffle_epi32(X2, 0x4E); \ - X3 = _mm_shuffle_epi32(X3, 0x93); + X2 = _mm_shuffle_epi32(X2, 0x4E); /** * Apply the Salsa20 core to the block provided in (X0 ... X3). @@ -373,6 +373,45 @@ static inline void salsa20(salsa20_blk_t *restrict B, #define INTEGERIFY (uint32_t)X.d[0] #endif +// AVX512 ternary logic optimization +#if defined(__AVX512VL__) + +#define XOR_X_XOR_X( in1, in2 ) \ + X0 = _mm_ternarylogic_epi32( X0, (in1).q[0], (in2).q[0], 0x96 ); \ + X1 = _mm_ternarylogic_epi32( X1, (in1).q[1], (in2).q[1], 0x96 ); \ + X2 = _mm_ternarylogic_epi32( X2, (in1).q[2], (in2).q[2], 0x96 ); \ + X3 = _mm_ternarylogic_epi32( X3, (in1).q[3], (in2).q[3], 0x96 ); + +#define XOR_X_2_XOR_X( in1, in2, in3 ) \ + X0 = _mm_ternarylogic_epi32( (in1).q[0], (in2).q[0], (in3).q[0], 0x96 ); \ + X1 = _mm_ternarylogic_epi32( (in1).q[1], (in2).q[1], (in3).q[1], 0x96 ); \ + X2 = _mm_ternarylogic_epi32( (in1).q[2], (in2).q[2], (in3).q[2], 0x96 ); \ + X3 = _mm_ternarylogic_epi32( (in1).q[3], (in2).q[3], (in3).q[3], 0x96 ); + +#define XOR_X_SALSA20_XOR_MEM( in1, in2, out) \ + X0 = _mm_ternarylogic_epi32( X0, (in1).q[0], (in2).q[0], 0x96 ); \ + X1 = _mm_ternarylogic_epi32( X1, (in1).q[1], (in2).q[1], 0x96 ); \ + X2 = _mm_ternarylogic_epi32( X2, (in1).q[2], (in2).q[2], 0x96 ); \ + X3 = _mm_ternarylogic_epi32( X3, (in1).q[3], (in2).q[3], 0x96 ); \ + SALSA20(out) + +#else + +#define XOR_X_XOR_X( in1, in2 ) \ + XOR_X( in1 ) \ + XOR_X( in2 ) + +#define XOR_X_2_XOR_X( in1, in2, in3 ) \ + XOR_X_2( in1, in2 ) \ + XOR_X( in3 ) + +#define XOR_X_SALSA20_XOR_MEM( in1, in2, out) \ + XOR_X(in1) \ + XOR_X(in2) \ + SALSA20( out ) + +#endif + /** * Apply the Salsa20 core to the block provided in X ^ in. 
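The __AVX512VL__ block above merges back-to-back XORs (and, in XOR_X_SALSA20_XOR_MEM, the XOR formerly done inside SALSA20) into single vpternlogd operations; the immediate 0x96 is the truth table of a ^ b ^ c, so each _mm_ternarylogic_epi32 call performs a three-input XOR in one instruction. A standalone sketch of the same idea, with a plain SSE2 fallback:

    #include <immintrin.h>

    /* Sketch: 3-input XOR of 128-bit lanes.  0x96 = truth table of a^b^c. */
    static inline __m128i xor3_128( __m128i a, __m128i b, __m128i c )
    {
    #if defined(__AVX512VL__)
        return _mm_ternarylogic_epi32( a, b, c, 0x96 );   /* one vpternlogd */
    #else
        return _mm_xor_si128( _mm_xor_si128( a, b ), c ); /* two pxor */
    #endif
    }
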
*/ @@ -406,11 +445,15 @@ static inline uint32_t blockmix_salsa_xor(const salsa20_blk_t *restrict Bin1, { DECL_X - XOR_X_2(Bin1[1], Bin2[1]) - XOR_X(Bin1[0]) + XOR_X_2_XOR_X( Bin1[1], Bin2[1], Bin1[0] ) +// XOR_X_2(Bin1[1], Bin2[1]) +// XOR_X(Bin1[0]) SALSA20_XOR_MEM(Bin2[0], Bout[0]) - XOR_X(Bin1[1]) - SALSA20_XOR_MEM(Bin2[1], Bout[1]) + +// Factor out the XOR from salsa20 to do a xor3 + XOR_X_SALSA20_XOR_MEM( Bin1[1], Bin2[1], Bout[1] ) +// XOR_X(Bin1[1]) +// SALSA20_XOR_MEM(Bin2[1], Bout[1]) return INTEGERIFY; } @@ -745,13 +788,15 @@ static uint32_t blockmix_xor(const salsa20_blk_t *restrict Bin1, i = 0; r--; do { - XOR_X(Bin1[i]) - XOR_X(Bin2[i]) + XOR_X_XOR_X( Bin1[i], Bin2[i] ) +// XOR_X(Bin1[i]) +// XOR_X(Bin2[i]) PWXFORM WRITE_X(Bout[i]) - XOR_X(Bin1[i + 1]) - XOR_X(Bin2[i + 1]) + XOR_X_XOR_X( Bin1[ i+1 ], Bin2[ i+1 ] ) +// XOR_X(Bin1[i + 1]) +// XOR_X(Bin2[i + 1]) PWXFORM if (unlikely(i >= r)) @@ -1050,7 +1095,7 @@ int yespower(yespower_local_t *local, salsa20_blk_t *V, *XY; pwxform_ctx_t ctx; uint8_t sha256[32]; - sph_sha256_context sha256_ctx; + sha256_context sha256_ctx; /* Sanity-check parameters */ if ( (version != YESPOWER_0_5 && version != YESPOWER_1_0) @@ -1093,10 +1138,9 @@ int yespower(yespower_local_t *local, // copy prehash, do tail memcpy( &sha256_ctx, &sha256_prehash_ctx, sizeof sha256_ctx ); - - sph_sha256( &sha256_ctx, src+64, srclen-64 ); - sph_sha256_close( &sha256_ctx, sha256 ); - + sha256_update( &sha256_ctx, src+64, srclen-64 ); + sha256_final( &sha256_ctx, sha256 ); + if ( version == YESPOWER_0_5 ) { PBKDF2_SHA256( sha256, sizeof(sha256), src, srclen, 1, B, B_size ); @@ -1141,7 +1185,9 @@ int yespower(yespower_local_t *local, if ( work_restart[thrid].restart ) return 0; smix_1_0( B, r, N, V, XY, &ctx ); - + + if ( work_restart[thrid].restart ) return 0; + HMAC_SHA256_Buf( B + B_size - 64, 64, sha256, sizeof(sha256), (uint8_t *)dst ); } diff --git a/algo/yespower/yespower.h b/algo/yespower/yespower.h index c5b6d78a..aa190049 100644 --- a/algo/yespower/yespower.h +++ b/algo/yespower/yespower.h @@ -34,8 +34,7 @@ #include /* for size_t */ #include "miner.h" #include "simd-utils.h" -#include "algo/sha/sph_sha2.h" -#include +#include "algo/sha/sha256-hash.h" #ifdef __cplusplus extern "C" { @@ -79,9 +78,7 @@ typedef struct { extern yespower_params_t yespower_params; -//SHA256_CTX sha256_prehash_ctx; -extern __thread sph_sha256_context sha256_prehash_ctx; -//extern __thread SHA256_CTX sha256_prehash_ctx; +extern __thread sha256_context sha256_prehash_ctx; /** * yespower_init_local(local): diff --git a/build-allarch.sh b/build-allarch.sh index 4346e7fb..0fa850e6 100755 --- a/build-allarch.sh +++ b/build-allarch.sh @@ -4,128 +4,97 @@ # during develpment. However the information contained may provide compilation # tips to users. 
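The yespower changes in this patch keep a per-thread SHA-256 midstate: scanhash hashes the constant first 64 bytes of the 80-byte header once per work item, and yespower() copies that saved context and hashes only the trailing 16 bytes (which contain the nonce) for each attempt. A condensed sketch of the pattern, using hypothetical prehash_header()/finish_header() names around the same sha256-hash.h calls:

    #include <stdint.h>
    #include "algo/sha/sha256-hash.h"

    /* Sketch: bytes 0..63 of the header never change while scanning
       nonces, so hash them once and reuse the midstate per nonce. */
    static __thread sha256_context prehash;

    void prehash_header( const void *header80 )
    {
        sha256_ctx_init( &prehash );
        sha256_update( &prehash, header80, 64 );        /* once per work item */
    }

    void finish_header( const void *header80, uint8_t digest[32] )
    {
        sha256_context ctx = prehash;                   /* copy the midstate */
        sha256_update( &ctx, (const uint8_t*)header80 + 64, 16 );
        sha256_final( &ctx, digest );                   /* per-nonce tail only */
    }
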
-rm cpuminer-avx512-sha-vaes cpuminer-avx512-sha cpuminer-avx512 cpuminer-avx2 cpuminer-aes-avx cpuminer-aes-sse42 cpuminer-sse42 cpuminer-ssse3 cpuminer-sse2 cpuminer-zen cpuminer-zen3 > /dev/null +rm cpuminer-avx512-sha-vaes cpuminer-avx512 cpuminer-avx2 cpuminer-avx cpuminer-aes-sse42 cpuminer-sse42 cpuminer-ssse3 cpuminer-sse2 cpuminer-zen cpuminer-zen3 > /dev/null -# Icelake AVX512 SHA VAES +# AVX512 SHA VAES: Intel Core Icelake, Rocketlake make distclean || echo clean rm -f config.status ./autogen.sh || echo done CFLAGS="-O3 -march=icelake-client -Wall -fno-common" ./configure --with-curl +#CFLAGS="-O3 -march=rocketlake -Wall -fno-common" ./configure --with-curl make -j $(nproc) -strip -s cpuminer.exe -mv cpuminer.exe cpuminer-avx512-sha-vaes.exe strip -s cpuminer mv cpuminer cpuminer-avx512-sha-vaes -# Rocketlake AVX512 SHA AES +# AVX512 AES: Intel Core HEDT Sylake-X, Cascadelake make clean || echo clean rm -f config.status -CFLAGS="-O3 -march=cascadelake -msha -Wall -fno-common" ./configure --with-curl -#CFLAGS="-O3 -march=skylake-avx512 -msha -Wall -fno-common" ./configure --with-curl -# CFLAGS="-O3 -march=rocketlake -Wall -fno-common" ./configure --with-curl +CFLAGS="-O3 -march=skylake-avx512 -maes -Wall -fno-common" ./configure --with-curl make -j $(nproc) -strip -s cpuminer.exe -mv cpuminer.exe cpuminer-avx512-sha.exe strip -s cpuminer -mv cpuminer cpuminer-avx512-sha +mv cpuminer cpuminer-avx512 -# Slylake-X AVX512 AES -make clean || echo clean +# AVX2 SHA VAES: Intel Alderlake, AMD Zen3 +make clean || echo done rm -f config.status -CFLAGS="-O3 -march=skylake-avx512 -Wall -fno-common" ./configure --with-curl +# vaes doesn't include aes +CFLAGS="-O3 -maes -mavx2 -msha -mvaes -Wall -fno-common" ./configure --with-curl make -j $(nproc) -strip -s cpuminer.exe -mv cpuminer.exe cpuminer-avx512.exe strip -s cpuminer -mv cpuminer cpuminer-avx512 +mv cpuminer cpuminer-avx2-sha-vaes + +# AVX2 SHA AES: AMD Zen1 +make clean || echo done +rm -f config.status +#CFLAGS="-O3 -march=znver1 -maes -Wall -fno-common" ./configure --with-curl +CFLAGS="-O3 -maes -mavx2 -msha -Wall -fno-common" ./configure --with-curl +make -j $(nproc) +strip -s cpuminer +mv cpuminer cpuminer-avx2-sha -# Haswell AVX2 AES +# AVX2 AES: Intel Haswell..Cometlake make clean || echo clean rm -f config.status # GCC 9 doesn't include AES with core-avx2 CFLAGS="-O3 -march=core-avx2 -maes -Wall -fno-common" ./configure --with-curl make -j $(nproc) -strip -s cpuminer.exe -mv cpuminer.exe cpuminer-avx2.exe strip -s cpuminer mv cpuminer cpuminer-avx2 -# Sandybridge AVX AES +# AVX AES: Intel Sandybridge, Ivybridge make clean || echo clean rm -f config.status CFLAGS="-O3 -march=corei7-avx -maes -Wall -fno-common" ./configure --with-curl make -j $(nproc) -strip -s cpuminer.exe -mv cpuminer.exe cpuminer-avx.exe strip -s cpuminer mv cpuminer cpuminer-avx -# Westmere SSE4.2 AES +# SSE4.2 AES: Intel Westmere make clean || echo clean rm -f config.status -CFLAGS="-O3 -march=westmere -Wall -fno-common" ./configure --with-curl +CFLAGS="-O3 -march=westmere -maes -Wall -fno-common" ./configure --with-curl make -j $(nproc) -strip -s cpuminer.exe -mv cpuminer.exe cpuminer-aes-sse42.exe strip -s cpuminer mv cpuminer cpuminer-aes-sse42 -# Nehalem SSE4.2 +# SSE4.2: Intel Nehalem make clean || echo clean rm -f config.status CFLAGS="-O3 -march=corei7 -Wall -fno-common" ./configure --with-curl make -j $(nproc) -strip -s cpuminer.exe -mv cpuminer.exe cpuminer-sse42.exe strip -s cpuminer mv cpuminer cpuminer-sse42 -# Core2 SSSE3 +# SSSE3: Intel Core2 
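build-allarch.sh now produces one binary per instruction-set level (AVX512+SHA+VAES down to plain SSE2), each compiled with the matching -march/-m flags. As a hedged illustration only, not part of the build scripts, a small launcher could pick among those binaries at run time with GCC's __builtin_cpu_supports:

    #include <stdio.h>

    /* Sketch: print the most capable build the host CPU can run.
       Names match the targets produced by build-allarch.sh. */
    int main( void )
    {
        __builtin_cpu_init();
        if      ( __builtin_cpu_supports( "avx512f" ) ) puts( "cpuminer-avx512" );
        else if ( __builtin_cpu_supports( "avx2" ) )    puts( "cpuminer-avx2" );
        else if ( __builtin_cpu_supports( "sse4.2" ) )  puts( "cpuminer-sse42" );
        else                                            puts( "cpuminer-sse2" );
        return 0;
    }
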
make clean || echo clean rm -f config.status CFLAGS="-O3 -march=core2 -Wall -fno-common" ./configure --with-curl make -j $(nproc) -strip -s cpuminer.exe -mv cpuminer.exe cpuminer-ssse3.exe strip -s cpuminer mv cpuminer cpuminer-ssse3 -# Generic SSE2 +# SSE2 make clean || echo clean rm -f config.status CFLAGS="-O3 -msse2 -Wall -fno-common" ./configure --with-curl make -j $(nproc) -strip -s cpuminer.exe -mv cpuminer.exe cpuminer-sse2.exe strip -s cpuminer mv cpuminer cpuminer-sse2 -# AMD Zen1 AVX2 SHA -make clean || echo done -rm -f config.status -CFLAGS="-O3 -march=znver1 -Wall -fno-common" ./configure --with-curl -make -j $(nproc) -strip -s cpuminer.exe -mv cpuminer.exe cpuminer-zen.exe -strip -s cpuminer -mv cpuminer cpuminer-zen - -# AMD Zen3 AVX2 SHA VAES -make clean || echo done -rm -f config.status -CFLAGS="-O3 -march=znver2 -mvaes -Wall -fno-common" ./configure --with-curl -# CFLAGS="-O3 -march=znver3 -Wall -fno-common" ./configure --with-curl -make -j $(nproc) -strip -s cpuminer.exe -mv cpuminer.exe cpuminer-zen3.exe -strip -s cpuminer -mv cpuminer cpuminer-zen3 - -# Native to current CPU +# Native to host CPU make clean || echo done rm -f config.status CFLAGS="-O3 -march=native -Wall -fno-common" ./configure --with-curl make -j $(nproc) -strip -s cpuminer.exe strip -s cpuminer diff --git a/build-avx2.sh b/build-avx2.sh index 7a124733..25ac4b39 100755 --- a/build-avx2.sh +++ b/build-avx2.sh @@ -22,6 +22,6 @@ rm -f config.status CFLAGS="-O3 -march=haswell -maes -Wall" ./configure --with-curl #CFLAGS="-O3 -march=native -Wall" CXXFLAGS="$CFLAGS -std=gnu++11" ./configure --with-curl -make -j 4 +make -j $(nproc) strip -s cpuminer diff --git a/build-msys2.sh b/build-msys2.sh new file mode 100755 index 00000000..8f257d40 --- /dev/null +++ b/build-msys2.sh @@ -0,0 +1,10 @@ +#!/bin/bash +# +# Compile on Windows using MSYS2 and MinGW. + +make distclean || echo clean +rm -f config.status +./autogen.sh || echo done +CFLAGS="-O3 --param=evrp-mode=legacy -march=native -Wall -D_WIN32_WINNT=0x0601" ./configure --with-curl +make -j 4 +strip -s cpuminer diff --git a/clean-all.sh b/clean-all.sh index e91bbb5b..87183d5e 100755 --- a/clean-all.sh +++ b/clean-all.sh @@ -2,8 +2,8 @@ # # make clean and rm all the targetted executables. -rm cpuminer-avx512-sha-vaes cpuminer-avx512-sha cpuminer-avx512 cpuminer-avx2 cpuminer-avx cpuminer-aes-sse42 cpuminer-sse2 cpuminer-zen cpuminer-sse42 cpuminer-ssse3 cpuminer-zen3 > /dev/null +rm cpuminer-avx512-sha-vaes cpuminer-avx512 cpuminer-avx2 cpuminer-avx cpuminer-aes-sse42 cpuminer-sse2 cpuminer-avx2-sha cpuminer-sse42 cpuminer-ssse3 cpuminer-avx2-sha-vaes > /dev/null -rm cpuminer-avx512-sha-vaes.exe cpuminer-avx512-sha.exe cpuminer-avx512.exe cpuminer-avx2.exe cpuminer-avx.exe cpuminer-aes-sse42.exe cpuminer-sse2.exe cpuminer-zen.exe cpuminer-sse42.exe cpuminer-ssse3.exe cpuminer-zen3.exe > /dev/null +rm cpuminer-avx512-sha-vaes.exe cpuminer-avx512-sha.exe cpuminer-avx512.exe cpuminer-avx2.exe cpuminer-avx.exe cpuminer-aes-sse42.exe cpuminer-sse2.exe cpuminer-avx2-sha.exe cpuminer-sse42.exe cpuminer-ssse3.exe cpuminer-avx2-sha-vaes.exe > /dev/null make distclean > /dev/null diff --git a/compat.h b/compat.h index 124bc40a..bd23f9c5 100644 --- a/compat.h +++ b/compat.h @@ -3,6 +3,10 @@ #ifdef WIN32 +#if _WIN32_WINNT==0x0601 // Windows 7 + #define WINDOWS_CPU_GROUPS_ENABLED 1 +#endif + #include #include diff --git a/configure b/configure index 07bb94c4..08b32c57 100755 --- a/configure +++ b/configure @@ -1,9 +1,10 @@ #! 
/bin/sh # Guess values for system-dependent variables and create Makefiles. -# Generated by GNU Autoconf 2.69 for cpuminer-kudaraidee 1.1.0. +# Generated by GNU Autoconf 2.71 for cpuminer-kudaraidee 1.1.0. # # -# Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc. +# Copyright (C) 1992-1996, 1998-2017, 2020-2021 Free Software Foundation, +# Inc. # # # This configure script is free software; the Free Software Foundation @@ -14,14 +15,16 @@ # Be more Bourne compatible DUALCASE=1; export DUALCASE # for MKS sh -if test -n "${ZSH_VERSION+set}" && (emulate sh) >/dev/null 2>&1; then : +as_nop=: +if test ${ZSH_VERSION+y} && (emulate sh) >/dev/null 2>&1 +then : emulate sh NULLCMD=: # Pre-4.2 versions of Zsh do word splitting on ${1+"$@"}, which # is contrary to our usage. Disable this feature. alias -g '${1+"$@"}'='"$@"' setopt NO_GLOB_SUBST -else +else $as_nop case `(set -o) 2>/dev/null` in #( *posix*) : set -o posix ;; #( @@ -31,46 +34,46 @@ esac fi + +# Reset variables that may have inherited troublesome values from +# the environment. + +# IFS needs to be set, to space, tab, and newline, in precisely that order. +# (If _AS_PATH_WALK were called with IFS unset, it would have the +# side effect of setting IFS to empty, thus disabling word splitting.) +# Quoting is to prevent editors from complaining about space-tab. as_nl=' ' export as_nl -# Printing a long string crashes Solaris 7 /usr/bin/printf. -as_echo='\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\' -as_echo=$as_echo$as_echo$as_echo$as_echo$as_echo -as_echo=$as_echo$as_echo$as_echo$as_echo$as_echo$as_echo -# Prefer a ksh shell builtin over an external printf program on Solaris, -# but without wasting forks for bash or zsh. -if test -z "$BASH_VERSION$ZSH_VERSION" \ - && (test "X`print -r -- $as_echo`" = "X$as_echo") 2>/dev/null; then - as_echo='print -r --' - as_echo_n='print -rn --' -elif (test "X`printf %s $as_echo`" = "X$as_echo") 2>/dev/null; then - as_echo='printf %s\n' - as_echo_n='printf %s' -else - if test "X`(/usr/ucb/echo -n -n $as_echo) 2>/dev/null`" = "X-n $as_echo"; then - as_echo_body='eval /usr/ucb/echo -n "$1$as_nl"' - as_echo_n='/usr/ucb/echo -n' - else - as_echo_body='eval expr "X$1" : "X\\(.*\\)"' - as_echo_n_body='eval - arg=$1; - case $arg in #( - *"$as_nl"*) - expr "X$arg" : "X\\(.*\\)$as_nl"; - arg=`expr "X$arg" : ".*$as_nl\\(.*\\)"`;; - esac; - expr "X$arg" : "X\\(.*\\)" | tr -d "$as_nl" - ' - export as_echo_n_body - as_echo_n='sh -c $as_echo_n_body as_echo' - fi - export as_echo_body - as_echo='sh -c $as_echo_body as_echo' -fi +IFS=" "" $as_nl" + +PS1='$ ' +PS2='> ' +PS4='+ ' + +# Ensure predictable behavior from utilities with locale-dependent output. +LC_ALL=C +export LC_ALL +LANGUAGE=C +export LANGUAGE + +# We cannot yet rely on "unset" to work, but we need these variables +# to be unset--not just set to an empty or harmless value--now, to +# avoid bugs in old shells (e.g. pre-3.0 UWIN ksh). This construct +# also avoids known problems related to "unset" and subshell syntax +# in other old shells (e.g. bash 2.01 and pdksh 5.2.14). +for as_var in BASH_ENV ENV MAIL MAILPATH CDPATH +do eval test \${$as_var+y} \ + && ( (unset $as_var) || exit 1) >/dev/null 2>&1 && unset $as_var || : +done + +# Ensure that fds 0, 1, and 2 are open. +if (exec 3>&0) 2>/dev/null; then :; else exec 0&1) 2>/dev/null; then :; else exec 1>/dev/null; fi +if (exec 3>&2) ; then :; else exec 2>/dev/null; fi # The user is always right. 
-if test "${PATH_SEPARATOR+set}" != set; then +if ${PATH_SEPARATOR+false} :; then PATH_SEPARATOR=: (PATH='/bin;/bin'; FPATH=$PATH; sh -c :) >/dev/null 2>&1 && { (PATH='/bin:/bin'; FPATH=$PATH; sh -c :) >/dev/null 2>&1 || @@ -79,13 +82,6 @@ if test "${PATH_SEPARATOR+set}" != set; then fi -# IFS -# We need space, tab and new line, in precisely that order. Quoting is -# there to prevent editors from complaining about space-tab. -# (If _AS_PATH_WALK were called with IFS unset, it would disable word -# splitting by setting IFS to empty value.) -IFS=" "" $as_nl" - # Find who we are. Look in the path if we contain no directory separator. as_myself= case $0 in #(( @@ -94,8 +90,12 @@ case $0 in #(( for as_dir in $PATH do IFS=$as_save_IFS - test -z "$as_dir" && as_dir=. - test -r "$as_dir/$0" && as_myself=$as_dir/$0 && break + case $as_dir in #((( + '') as_dir=./ ;; + */) ;; + *) as_dir=$as_dir/ ;; + esac + test -r "$as_dir$0" && as_myself=$as_dir$0 && break done IFS=$as_save_IFS @@ -107,30 +107,10 @@ if test "x$as_myself" = x; then as_myself=$0 fi if test ! -f "$as_myself"; then - $as_echo "$as_myself: error: cannot find myself; rerun with an absolute file name" >&2 + printf "%s\n" "$as_myself: error: cannot find myself; rerun with an absolute file name" >&2 exit 1 fi -# Unset variables that we do not need and which cause bugs (e.g. in -# pre-3.0 UWIN ksh). But do not cause bugs in bash 2.01; the "|| exit 1" -# suppresses any "Segmentation fault" message there. '((' could -# trigger a bug in pdksh 5.2.14. -for as_var in BASH_ENV ENV MAIL MAILPATH -do eval test x\${$as_var+set} = xset \ - && ( (unset $as_var) || exit 1) >/dev/null 2>&1 && unset $as_var || : -done -PS1='$ ' -PS2='> ' -PS4='+ ' - -# NLS nuisances. -LC_ALL=C -export LC_ALL -LANGUAGE=C -export LANGUAGE - -# CDPATH. -(unset CDPATH) >/dev/null 2>&1 && unset CDPATH # Use a proper internal environment variable to ensure we don't fall # into an infinite loop, continuously re-executing ourselves. @@ -152,20 +132,22 @@ esac exec $CONFIG_SHELL $as_opts "$as_myself" ${1+"$@"} # Admittedly, this is quite paranoid, since all the known shells bail # out after a failed `exec'. -$as_echo "$0: could not re-execute with $CONFIG_SHELL" >&2 -as_fn_exit 255 +printf "%s\n" "$0: could not re-execute with $CONFIG_SHELL" >&2 +exit 255 fi # We don't want this to propagate to other subprocesses. { _as_can_reexec=; unset _as_can_reexec;} if test "x$CONFIG_SHELL" = x; then - as_bourne_compatible="if test -n \"\${ZSH_VERSION+set}\" && (emulate sh) >/dev/null 2>&1; then : + as_bourne_compatible="as_nop=: +if test \${ZSH_VERSION+y} && (emulate sh) >/dev/null 2>&1 +then : emulate sh NULLCMD=: # Pre-4.2 versions of Zsh do word splitting on \${1+\"\$@\"}, which # is contrary to our usage. Disable this feature. alias -g '\${1+\"\$@\"}'='\"\$@\"' setopt NO_GLOB_SUBST -else +else \$as_nop case \`(set -o) 2>/dev/null\` in #( *posix*) : set -o posix ;; #( @@ -185,42 +167,53 @@ as_fn_success || { exitcode=1; echo as_fn_success failed.; } as_fn_failure && { exitcode=1; echo as_fn_failure succeeded.; } as_fn_ret_success || { exitcode=1; echo as_fn_ret_success failed.; } as_fn_ret_failure && { exitcode=1; echo as_fn_ret_failure succeeded.; } -if ( set x; as_fn_ret_success y && test x = \"\$1\" ); then : +if ( set x; as_fn_ret_success y && test x = \"\$1\" ) +then : -else +else \$as_nop exitcode=1; echo positional parameters were not saved. 
fi test x\$exitcode = x0 || exit 1 +blah=\$(echo \$(echo blah)) +test x\"\$blah\" = xblah || exit 1 test -x / || exit 1" as_suggested=" as_lineno_1=";as_suggested=$as_suggested$LINENO;as_suggested=$as_suggested" as_lineno_1a=\$LINENO as_lineno_2=";as_suggested=$as_suggested$LINENO;as_suggested=$as_suggested" as_lineno_2a=\$LINENO eval 'test \"x\$as_lineno_1'\$as_run'\" != \"x\$as_lineno_2'\$as_run'\" && test \"x\`expr \$as_lineno_1'\$as_run' + 1\`\" = \"x\$as_lineno_2'\$as_run'\"' || exit 1 test \$(( 1 + 1 )) = 2 || exit 1" - if (eval "$as_required") 2>/dev/null; then : + if (eval "$as_required") 2>/dev/null +then : as_have_required=yes -else +else $as_nop as_have_required=no fi - if test x$as_have_required = xyes && (eval "$as_suggested") 2>/dev/null; then : + if test x$as_have_required = xyes && (eval "$as_suggested") 2>/dev/null +then : -else +else $as_nop as_save_IFS=$IFS; IFS=$PATH_SEPARATOR as_found=false for as_dir in /bin$PATH_SEPARATOR/usr/bin$PATH_SEPARATOR$PATH do IFS=$as_save_IFS - test -z "$as_dir" && as_dir=. + case $as_dir in #((( + '') as_dir=./ ;; + */) ;; + *) as_dir=$as_dir/ ;; + esac as_found=: case $as_dir in #( /*) for as_base in sh bash ksh sh5; do # Try only shells that exist, to save several forks. - as_shell=$as_dir/$as_base + as_shell=$as_dir$as_base if { test -f "$as_shell" || test -f "$as_shell.exe"; } && - { $as_echo "$as_bourne_compatible""$as_required" | as_run=a "$as_shell"; } 2>/dev/null; then : + as_run=a "$as_shell" -c "$as_bourne_compatible""$as_required" 2>/dev/null +then : CONFIG_SHELL=$as_shell as_have_required=yes - if { $as_echo "$as_bourne_compatible""$as_suggested" | as_run=a "$as_shell"; } 2>/dev/null; then : + if as_run=a "$as_shell" -c "$as_bourne_compatible""$as_suggested" 2>/dev/null +then : break 2 fi fi @@ -228,14 +221,21 @@ fi esac as_found=false done -$as_found || { if { test -f "$SHELL" || test -f "$SHELL.exe"; } && - { $as_echo "$as_bourne_compatible""$as_required" | as_run=a "$SHELL"; } 2>/dev/null; then : - CONFIG_SHELL=$SHELL as_have_required=yes -fi; } IFS=$as_save_IFS +if $as_found +then : + +else $as_nop + if { test -f "$SHELL" || test -f "$SHELL.exe"; } && + as_run=a "$SHELL" -c "$as_bourne_compatible""$as_required" 2>/dev/null +then : + CONFIG_SHELL=$SHELL as_have_required=yes +fi +fi - if test "x$CONFIG_SHELL" != x; then : + if test "x$CONFIG_SHELL" != x +then : export CONFIG_SHELL # We cannot yet assume a decent shell, so we have to provide a # neutralization value for shells without unset; and this also @@ -253,18 +253,19 @@ esac exec $CONFIG_SHELL $as_opts "$as_myself" ${1+"$@"} # Admittedly, this is quite paranoid, since all the known shells bail # out after a failed `exec'. -$as_echo "$0: could not re-execute with $CONFIG_SHELL" >&2 +printf "%s\n" "$0: could not re-execute with $CONFIG_SHELL" >&2 exit 255 fi - if test x$as_have_required = xno; then : - $as_echo "$0: This script requires a shell more modern than all" - $as_echo "$0: the shells that I found on your system." - if test x${ZSH_VERSION+set} = xset ; then - $as_echo "$0: In particular, zsh $ZSH_VERSION has bugs and should" - $as_echo "$0: be upgraded to zsh 4.3.4 or later." + if test x$as_have_required = xno +then : + printf "%s\n" "$0: This script requires a shell more modern than all" + printf "%s\n" "$0: the shells that I found on your system." + if test ${ZSH_VERSION+y} ; then + printf "%s\n" "$0: In particular, zsh $ZSH_VERSION has bugs and should" + printf "%s\n" "$0: be upgraded to zsh 4.3.4 or later." 
else - $as_echo "$0: Please tell bug-autoconf@gnu.org about your system, + printf "%s\n" "$0: Please tell bug-autoconf@gnu.org about your system, $0: including any error possibly output before this $0: message. Then install a modern shell, or manually run $0: the script under such a shell if you do have one." @@ -291,6 +292,7 @@ as_fn_unset () } as_unset=as_fn_unset + # as_fn_set_status STATUS # ----------------------- # Set $? to STATUS, without forking. @@ -308,6 +310,14 @@ as_fn_exit () as_fn_set_status $1 exit $1 } # as_fn_exit +# as_fn_nop +# --------- +# Do nothing but, unlike ":", preserve the value of $?. +as_fn_nop () +{ + return $? +} +as_nop=as_fn_nop # as_fn_mkdir_p # ------------- @@ -322,7 +332,7 @@ as_fn_mkdir_p () as_dirs= while :; do case $as_dir in #( - *\'*) as_qdir=`$as_echo "$as_dir" | sed "s/'/'\\\\\\\\''/g"`;; #'( + *\'*) as_qdir=`printf "%s\n" "$as_dir" | sed "s/'/'\\\\\\\\''/g"`;; #'( *) as_qdir=$as_dir;; esac as_dirs="'$as_qdir' $as_dirs" @@ -331,7 +341,7 @@ $as_expr X"$as_dir" : 'X\(.*[^/]\)//*[^/][^/]*/*$' \| \ X"$as_dir" : 'X\(//\)[^/]' \| \ X"$as_dir" : 'X\(//\)$' \| \ X"$as_dir" : 'X\(/\)' \| . 2>/dev/null || -$as_echo X"$as_dir" | +printf "%s\n" X"$as_dir" | sed '/^X\(.*[^/]\)\/\/*[^/][^/]*\/*$/{ s//\1/ q @@ -370,12 +380,13 @@ as_fn_executable_p () # advantage of any shell optimizations that allow amortized linear growth over # repeated appends, instead of the typical quadratic growth present in naive # implementations. -if (eval "as_var=1; as_var+=2; test x\$as_var = x12") 2>/dev/null; then : +if (eval "as_var=1; as_var+=2; test x\$as_var = x12") 2>/dev/null +then : eval 'as_fn_append () { eval $1+=\$2 }' -else +else $as_nop as_fn_append () { eval $1=\$$1\$2 @@ -387,18 +398,27 @@ fi # as_fn_append # Perform arithmetic evaluation on the ARGs, and store the result in the # global $as_val. Take advantage of shells that can avoid forks. The arguments # must be portable across $(()) and expr. -if (eval "test \$(( 1 + 1 )) = 2") 2>/dev/null; then : +if (eval "test \$(( 1 + 1 )) = 2") 2>/dev/null +then : eval 'as_fn_arith () { as_val=$(( $* )) }' -else +else $as_nop as_fn_arith () { as_val=`expr "$@" || test $? -eq 1` } fi # as_fn_arith +# as_fn_nop +# --------- +# Do nothing but, unlike ":", preserve the value of $?. +as_fn_nop () +{ + return $? +} +as_nop=as_fn_nop # as_fn_error STATUS ERROR [LINENO LOG_FD] # ---------------------------------------- @@ -410,9 +430,9 @@ as_fn_error () as_status=$1; test $as_status -eq 0 && as_status=1 if test "$4"; then as_lineno=${as_lineno-"$3"} as_lineno_stack=as_lineno_stack=$as_lineno_stack - $as_echo "$as_me:${as_lineno-$LINENO}: error: $2" >&$4 + printf "%s\n" "$as_me:${as_lineno-$LINENO}: error: $2" >&$4 fi - $as_echo "$as_me: error: $2" >&2 + printf "%s\n" "$as_me: error: $2" >&2 as_fn_exit $as_status } # as_fn_error @@ -439,7 +459,7 @@ as_me=`$as_basename -- "$0" || $as_expr X/"$0" : '.*/\([^/][^/]*\)/*$' \| \ X"$0" : 'X\(//\)$' \| \ X"$0" : 'X\(/\)' \| . 
2>/dev/null || -$as_echo X/"$0" | +printf "%s\n" X/"$0" | sed '/^.*\/\([^/][^/]*\)\/*$/{ s//\1/ q @@ -483,7 +503,7 @@ as_cr_alnum=$as_cr_Letters$as_cr_digits s/-\n.*// ' >$as_me.lineno && chmod +x "$as_me.lineno" || - { $as_echo "$as_me: error: cannot create $as_me.lineno; rerun with a POSIX shell" >&2; as_fn_exit 1; } + { printf "%s\n" "$as_me: error: cannot create $as_me.lineno; rerun with a POSIX shell" >&2; as_fn_exit 1; } # If we had to re-execute with $CONFIG_SHELL, we're ensured to have # already done that, so ensure we don't try to do so again and fall @@ -497,6 +517,10 @@ as_cr_alnum=$as_cr_Letters$as_cr_digits exit } + +# Determine whether it's possible to make 'echo' print without a newline. +# These variables are no longer used directly by Autoconf, but are AC_SUBSTed +# for compatibility with existing Makefiles. ECHO_C= ECHO_N= ECHO_T= case `echo -n x` in #((((( -n*) @@ -510,6 +534,13 @@ case `echo -n x` in #((((( ECHO_N='-n';; esac +# For backward compatibility with old third-party macros, we provide +# the shell variables $as_echo and $as_echo_n. New code should use +# AS_ECHO(["message"]) and AS_ECHO_N(["message"]), respectively. +as_echo='printf %s\n' +as_echo_n='printf %s' + + rm -f conf$$ conf$$.exe conf$$.file if test -d conf$$.dir; then rm -f conf$$.dir/conf$$.file @@ -585,40 +616,36 @@ PACKAGE_URL='' ac_unique_file="cpu-miner.c" # Factoring default headers for most tests. ac_includes_default="\ -#include -#ifdef HAVE_SYS_TYPES_H -# include -#endif -#ifdef HAVE_SYS_STAT_H -# include +#include +#ifdef HAVE_STDIO_H +# include #endif -#ifdef STDC_HEADERS +#ifdef HAVE_STDLIB_H # include -# include -#else -# ifdef HAVE_STDLIB_H -# include -# endif #endif #ifdef HAVE_STRING_H -# if !defined STDC_HEADERS && defined HAVE_MEMORY_H -# include -# endif # include #endif -#ifdef HAVE_STRINGS_H -# include -#endif #ifdef HAVE_INTTYPES_H # include #endif #ifdef HAVE_STDINT_H # include #endif +#ifdef HAVE_STRINGS_H +# include +#endif +#ifdef HAVE_SYS_TYPES_H +# include +#endif +#ifdef HAVE_SYS_STAT_H +# include +#endif #ifdef HAVE_UNISTD_H # include #endif" +ac_header_c_list= ac_subst_vars='am__EXEEXT_FALSE am__EXEEXT_TRUE LTLIBOBJS @@ -667,7 +694,6 @@ am__nodep AMDEPBACKSLASH AMDEP_FALSE AMDEP_TRUE -am__quote am__include DEPDIR OBJEXT @@ -757,7 +783,8 @@ PACKAGE_VERSION PACKAGE_TARNAME PACKAGE_NAME PATH_SEPARATOR -SHELL' +SHELL +am__quote' ac_subst_files='' ac_user_opts=' enable_option_checking @@ -850,8 +877,6 @@ do *) ac_optarg=yes ;; esac - # Accept the important Cygnus configure options, so we can diagnose typos. - case $ac_dashdash$ac_option in --) ac_dashdash=yes ;; @@ -892,9 +917,9 @@ do ac_useropt=`expr "x$ac_option" : 'x-*disable-\(.*\)'` # Reject names that are not valid shell variable names. expr "x$ac_useropt" : ".*[^-+._$as_cr_alnum]" >/dev/null && - as_fn_error $? "invalid feature name: $ac_useropt" + as_fn_error $? "invalid feature name: \`$ac_useropt'" ac_useropt_orig=$ac_useropt - ac_useropt=`$as_echo "$ac_useropt" | sed 's/[-+.]/_/g'` + ac_useropt=`printf "%s\n" "$ac_useropt" | sed 's/[-+.]/_/g'` case $ac_user_opts in *" "enable_$ac_useropt" @@ -918,9 +943,9 @@ do ac_useropt=`expr "x$ac_option" : 'x-*enable-\([^=]*\)'` # Reject names that are not valid shell variable names. expr "x$ac_useropt" : ".*[^-+._$as_cr_alnum]" >/dev/null && - as_fn_error $? "invalid feature name: $ac_useropt" + as_fn_error $? 
"invalid feature name: \`$ac_useropt'" ac_useropt_orig=$ac_useropt - ac_useropt=`$as_echo "$ac_useropt" | sed 's/[-+.]/_/g'` + ac_useropt=`printf "%s\n" "$ac_useropt" | sed 's/[-+.]/_/g'` case $ac_user_opts in *" "enable_$ac_useropt" @@ -1131,9 +1156,9 @@ do ac_useropt=`expr "x$ac_option" : 'x-*with-\([^=]*\)'` # Reject names that are not valid shell variable names. expr "x$ac_useropt" : ".*[^-+._$as_cr_alnum]" >/dev/null && - as_fn_error $? "invalid package name: $ac_useropt" + as_fn_error $? "invalid package name: \`$ac_useropt'" ac_useropt_orig=$ac_useropt - ac_useropt=`$as_echo "$ac_useropt" | sed 's/[-+.]/_/g'` + ac_useropt=`printf "%s\n" "$ac_useropt" | sed 's/[-+.]/_/g'` case $ac_user_opts in *" "with_$ac_useropt" @@ -1147,9 +1172,9 @@ do ac_useropt=`expr "x$ac_option" : 'x-*without-\(.*\)'` # Reject names that are not valid shell variable names. expr "x$ac_useropt" : ".*[^-+._$as_cr_alnum]" >/dev/null && - as_fn_error $? "invalid package name: $ac_useropt" + as_fn_error $? "invalid package name: \`$ac_useropt'" ac_useropt_orig=$ac_useropt - ac_useropt=`$as_echo "$ac_useropt" | sed 's/[-+.]/_/g'` + ac_useropt=`printf "%s\n" "$ac_useropt" | sed 's/[-+.]/_/g'` case $ac_user_opts in *" "with_$ac_useropt" @@ -1193,9 +1218,9 @@ Try \`$0 --help' for more information" *) # FIXME: should be removed in autoconf 3.0. - $as_echo "$as_me: WARNING: you should use --build, --host, --target" >&2 + printf "%s\n" "$as_me: WARNING: you should use --build, --host, --target" >&2 expr "x$ac_option" : ".*[^-._$as_cr_alnum]" >/dev/null && - $as_echo "$as_me: WARNING: invalid host type: $ac_option" >&2 + printf "%s\n" "$as_me: WARNING: invalid host type: $ac_option" >&2 : "${build_alias=$ac_option} ${host_alias=$ac_option} ${target_alias=$ac_option}" ;; @@ -1211,7 +1236,7 @@ if test -n "$ac_unrecognized_opts"; then case $enable_option_checking in no) ;; fatal) as_fn_error $? "unrecognized options: $ac_unrecognized_opts" ;; - *) $as_echo "$as_me: WARNING: unrecognized options: $ac_unrecognized_opts" >&2 ;; + *) printf "%s\n" "$as_me: WARNING: unrecognized options: $ac_unrecognized_opts" >&2 ;; esac fi @@ -1275,7 +1300,7 @@ $as_expr X"$as_myself" : 'X\(.*[^/]\)//*[^/][^/]*/*$' \| \ X"$as_myself" : 'X\(//\)[^/]' \| \ X"$as_myself" : 'X\(//\)$' \| \ X"$as_myself" : 'X\(/\)' \| . 2>/dev/null || -$as_echo X"$as_myself" | +printf "%s\n" X"$as_myself" | sed '/^X\(.*[^/]\)\/\/*[^/][^/]*\/*$/{ s//\1/ q @@ -1381,8 +1406,7 @@ Fine tuning of the installation directories: --infodir=DIR info documentation [DATAROOTDIR/info] --localedir=DIR locale-dependent data [DATAROOTDIR/locale] --mandir=DIR man documentation [DATAROOTDIR/man] - --docdir=DIR documentation root - [DATAROOTDIR/doc/cpuminer-kudaraidee] + --docdir=DIR documentation root [DATAROOTDIR/doc/cpuminer-kudaraidee] --htmldir=DIR html documentation [DOCDIR] --dvidir=DIR dvi documentation [DOCDIR] --pdfdir=DIR pdf documentation [DOCDIR] @@ -1463,9 +1487,9 @@ if test "$ac_init_help" = "recursive"; then case "$ac_dir" in .) ac_dir_suffix= ac_top_builddir_sub=. ac_top_build_prefix= ;; *) - ac_dir_suffix=/`$as_echo "$ac_dir" | sed 's|^\.[\\/]||'` + ac_dir_suffix=/`printf "%s\n" "$ac_dir" | sed 's|^\.[\\/]||'` # A ".." for each directory in $ac_dir_suffix. - ac_top_builddir_sub=`$as_echo "$ac_dir_suffix" | sed 's|/[^\\/]*|/..|g;s|/||'` + ac_top_builddir_sub=`printf "%s\n" "$ac_dir_suffix" | sed 's|/[^\\/]*|/..|g;s|/||'` case $ac_top_builddir_sub in "") ac_top_builddir_sub=. 
ac_top_build_prefix= ;; *) ac_top_build_prefix=$ac_top_builddir_sub/ ;; @@ -1493,7 +1517,8 @@ esac ac_abs_srcdir=$ac_abs_top_srcdir$ac_dir_suffix cd "$ac_dir" || { ac_status=$?; continue; } - # Check for guested configure. + # Check for configure.gnu first; this name is used for a wrapper for + # Metaconfig's "Configure" on case-insensitive file systems. if test -f "$ac_srcdir/configure.gnu"; then echo && $SHELL "$ac_srcdir/configure.gnu" --help=recursive @@ -1501,7 +1526,7 @@ ac_abs_srcdir=$ac_abs_top_srcdir$ac_dir_suffix echo && $SHELL "$ac_srcdir/configure" --help=recursive else - $as_echo "$as_me: WARNING: no configuration information is in $ac_dir" >&2 + printf "%s\n" "$as_me: WARNING: no configuration information is in $ac_dir" >&2 fi || ac_status=$? cd "$ac_pwd" || { ac_status=$?; break; } done @@ -1511,9 +1536,9 @@ test -n "$ac_init_help" && exit $ac_status if $ac_init_version; then cat <<\_ACEOF cpuminer-kudaraidee configure 1.1.0 -generated by GNU Autoconf 2.69 +generated by GNU Autoconf 2.71 -Copyright (C) 2012 Free Software Foundation, Inc. +Copyright (C) 2021 Free Software Foundation, Inc. This configure script is free software; the Free Software Foundation gives unlimited permission to copy, distribute and modify it. _ACEOF @@ -1530,14 +1555,14 @@ fi ac_fn_c_try_compile () { as_lineno=${as_lineno-"$1"} as_lineno_stack=as_lineno_stack=$as_lineno_stack - rm -f conftest.$ac_objext + rm -f conftest.$ac_objext conftest.beam if { { ac_try="$ac_compile" case "(($ac_try" in *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; *) ac_try_echo=$ac_try;; esac eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\"" -$as_echo "$ac_try_echo"; } >&5 +printf "%s\n" "$ac_try_echo"; } >&5 (eval "$ac_compile") 2>conftest.err ac_status=$? if test -s conftest.err; then @@ -1545,14 +1570,15 @@ $as_echo "$ac_try_echo"; } >&5 cat conftest.er1 >&5 mv -f conftest.er1 conftest.err fi - $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 + printf "%s\n" "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 test $ac_status = 0; } && { test -z "$ac_c_werror_flag" || test ! -s conftest.err - } && test -s conftest.$ac_objext; then : + } && test -s conftest.$ac_objext +then : ac_retval=0 -else - $as_echo "$as_me: failed program was:" >&5 +else $as_nop + printf "%s\n" "$as_me: failed program was:" >&5 sed 's/^/| /' conftest.$ac_ext >&5 ac_retval=1 @@ -1574,7 +1600,7 @@ case "(($ac_try" in *) ac_try_echo=$ac_try;; esac eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\"" -$as_echo "$ac_try_echo"; } >&5 +printf "%s\n" "$ac_try_echo"; } >&5 (eval "$ac_cpp conftest.$ac_ext") 2>conftest.err ac_status=$? if test -s conftest.err; then @@ -1582,14 +1608,15 @@ $as_echo "$ac_try_echo"; } >&5 cat conftest.er1 >&5 mv -f conftest.er1 conftest.err fi - $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 + printf "%s\n" "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 test $ac_status = 0; } > conftest.i && { test -z "$ac_c_preproc_warn_flag$ac_c_werror_flag" || test ! 
-s conftest.err - }; then : + } +then : ac_retval=0 -else - $as_echo "$as_me: failed program was:" >&5 +else $as_nop + printf "%s\n" "$as_me: failed program was:" >&5 sed 's/^/| /' conftest.$ac_ext >&5 ac_retval=1 @@ -1605,14 +1632,14 @@ fi ac_fn_cxx_try_compile () { as_lineno=${as_lineno-"$1"} as_lineno_stack=as_lineno_stack=$as_lineno_stack - rm -f conftest.$ac_objext + rm -f conftest.$ac_objext conftest.beam if { { ac_try="$ac_compile" case "(($ac_try" in *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; *) ac_try_echo=$ac_try;; esac eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\"" -$as_echo "$ac_try_echo"; } >&5 +printf "%s\n" "$ac_try_echo"; } >&5 (eval "$ac_compile") 2>conftest.err ac_status=$? if test -s conftest.err; then @@ -1620,14 +1647,15 @@ $as_echo "$ac_try_echo"; } >&5 cat conftest.er1 >&5 mv -f conftest.er1 conftest.err fi - $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 + printf "%s\n" "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 test $ac_status = 0; } && { test -z "$ac_cxx_werror_flag" || test ! -s conftest.err - } && test -s conftest.$ac_objext; then : + } && test -s conftest.$ac_objext +then : ac_retval=0 -else - $as_echo "$as_me: failed program was:" >&5 +else $as_nop + printf "%s\n" "$as_me: failed program was:" >&5 sed 's/^/| /' conftest.$ac_ext >&5 ac_retval=1 @@ -1637,135 +1665,6 @@ fi } # ac_fn_cxx_try_compile -# ac_fn_c_try_run LINENO -# ---------------------- -# Try to link conftest.$ac_ext, and return whether this succeeded. Assumes -# that executables *can* be run. -ac_fn_c_try_run () -{ - as_lineno=${as_lineno-"$1"} as_lineno_stack=as_lineno_stack=$as_lineno_stack - if { { ac_try="$ac_link" -case "(($ac_try" in - *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; - *) ac_try_echo=$ac_try;; -esac -eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\"" -$as_echo "$ac_try_echo"; } >&5 - (eval "$ac_link") 2>&5 - ac_status=$? - $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 - test $ac_status = 0; } && { ac_try='./conftest$ac_exeext' - { { case "(($ac_try" in - *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; - *) ac_try_echo=$ac_try;; -esac -eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\"" -$as_echo "$ac_try_echo"; } >&5 - (eval "$ac_try") 2>&5 - ac_status=$? - $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 - test $ac_status = 0; }; }; then : - ac_retval=0 -else - $as_echo "$as_me: program exited with status $ac_status" >&5 - $as_echo "$as_me: failed program was:" >&5 -sed 's/^/| /' conftest.$ac_ext >&5 - - ac_retval=$ac_status -fi - rm -rf conftest.dSYM conftest_ipa8_conftest.oo - eval $as_lineno_stack; ${as_lineno_stack:+:} unset as_lineno - as_fn_set_status $ac_retval - -} # ac_fn_c_try_run - -# ac_fn_c_check_header_mongrel LINENO HEADER VAR INCLUDES -# ------------------------------------------------------- -# Tests whether HEADER exists, giving a warning if it cannot be compiled using -# the include files in INCLUDES and setting the cache variable VAR -# accordingly. -ac_fn_c_check_header_mongrel () -{ - as_lineno=${as_lineno-"$1"} as_lineno_stack=as_lineno_stack=$as_lineno_stack - if eval \${$3+:} false; then : - { $as_echo "$as_me:${as_lineno-$LINENO}: checking for $2" >&5 -$as_echo_n "checking for $2... " >&6; } -if eval \${$3+:} false; then : - $as_echo_n "(cached) " >&6 -fi -eval ac_res=\$$3 - { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_res" >&5 -$as_echo "$ac_res" >&6; } -else - # Is the header compilable? 
-{ $as_echo "$as_me:${as_lineno-$LINENO}: checking $2 usability" >&5 -$as_echo_n "checking $2 usability... " >&6; } -cat confdefs.h - <<_ACEOF >conftest.$ac_ext -/* end confdefs.h. */ -$4 -#include <$2> -_ACEOF -if ac_fn_c_try_compile "$LINENO"; then : - ac_header_compiler=yes -else - ac_header_compiler=no -fi -rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext -{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_header_compiler" >&5 -$as_echo "$ac_header_compiler" >&6; } - -# Is the header present? -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking $2 presence" >&5 -$as_echo_n "checking $2 presence... " >&6; } -cat confdefs.h - <<_ACEOF >conftest.$ac_ext -/* end confdefs.h. */ -#include <$2> -_ACEOF -if ac_fn_c_try_cpp "$LINENO"; then : - ac_header_preproc=yes -else - ac_header_preproc=no -fi -rm -f conftest.err conftest.i conftest.$ac_ext -{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_header_preproc" >&5 -$as_echo "$ac_header_preproc" >&6; } - -# So? What about this header? -case $ac_header_compiler:$ac_header_preproc:$ac_c_preproc_warn_flag in #(( - yes:no: ) - { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: $2: accepted by the compiler, rejected by the preprocessor!" >&5 -$as_echo "$as_me: WARNING: $2: accepted by the compiler, rejected by the preprocessor!" >&2;} - { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: $2: proceeding with the compiler's result" >&5 -$as_echo "$as_me: WARNING: $2: proceeding with the compiler's result" >&2;} - ;; - no:yes:* ) - { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: $2: present but cannot be compiled" >&5 -$as_echo "$as_me: WARNING: $2: present but cannot be compiled" >&2;} - { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: $2: check for missing prerequisite headers?" >&5 -$as_echo "$as_me: WARNING: $2: check for missing prerequisite headers?" >&2;} - { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: $2: see the Autoconf documentation" >&5 -$as_echo "$as_me: WARNING: $2: see the Autoconf documentation" >&2;} - { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: $2: section \"Present But Cannot Be Compiled\"" >&5 -$as_echo "$as_me: WARNING: $2: section \"Present But Cannot Be Compiled\"" >&2;} - { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: $2: proceeding with the compiler's result" >&5 -$as_echo "$as_me: WARNING: $2: proceeding with the compiler's result" >&2;} - ;; -esac - { $as_echo "$as_me:${as_lineno-$LINENO}: checking for $2" >&5 -$as_echo_n "checking for $2... " >&6; } -if eval \${$3+:} false; then : - $as_echo_n "(cached) " >&6 -else - eval "$3=\$ac_header_compiler" -fi -eval ac_res=\$$3 - { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_res" >&5 -$as_echo "$ac_res" >&6; } -fi - eval $as_lineno_stack; ${as_lineno_stack:+:} unset as_lineno - -} # ac_fn_c_check_header_mongrel - # ac_fn_c_check_header_compile LINENO HEADER VAR INCLUDES # ------------------------------------------------------- # Tests whether HEADER exists and can be compiled using the include files in @@ -1773,49 +1672,54 @@ fi ac_fn_c_check_header_compile () { as_lineno=${as_lineno-"$1"} as_lineno_stack=as_lineno_stack=$as_lineno_stack - { $as_echo "$as_me:${as_lineno-$LINENO}: checking for $2" >&5 -$as_echo_n "checking for $2... " >&6; } -if eval \${$3+:} false; then : - $as_echo_n "(cached) " >&6 -else + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for $2" >&5 +printf %s "checking for $2... " >&6; } +if eval test \${$3+y} +then : + printf %s "(cached) " >&6 +else $as_nop cat confdefs.h - <<_ACEOF >conftest.$ac_ext /* end confdefs.h. 
*/ $4 #include <$2> _ACEOF -if ac_fn_c_try_compile "$LINENO"; then : +if ac_fn_c_try_compile "$LINENO" +then : eval "$3=yes" -else +else $as_nop eval "$3=no" fi -rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext +rm -f core conftest.err conftest.$ac_objext conftest.beam conftest.$ac_ext fi eval ac_res=\$$3 - { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_res" >&5 -$as_echo "$ac_res" >&6; } + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $ac_res" >&5 +printf "%s\n" "$ac_res" >&6; } eval $as_lineno_stack; ${as_lineno_stack:+:} unset as_lineno } # ac_fn_c_check_header_compile -# ac_fn_c_check_decl LINENO SYMBOL VAR INCLUDES -# --------------------------------------------- +# ac_fn_check_decl LINENO SYMBOL VAR INCLUDES EXTRA-OPTIONS FLAG-VAR +# ------------------------------------------------------------------ # Tests whether SYMBOL is declared in INCLUDES, setting cache variable VAR -# accordingly. -ac_fn_c_check_decl () +# accordingly. Pass EXTRA-OPTIONS to the compiler, using FLAG-VAR. +ac_fn_check_decl () { as_lineno=${as_lineno-"$1"} as_lineno_stack=as_lineno_stack=$as_lineno_stack as_decl_name=`echo $2|sed 's/ *(.*//'` + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether $as_decl_name is declared" >&5 +printf %s "checking whether $as_decl_name is declared... " >&6; } +if eval test \${$3+y} +then : + printf %s "(cached) " >&6 +else $as_nop as_decl_use=`echo $2|sed -e 's/(/((/' -e 's/)/) 0&/' -e 's/,/) 0& (/g'` - { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether $as_decl_name is declared" >&5 -$as_echo_n "checking whether $as_decl_name is declared... " >&6; } -if eval \${$3+:} false; then : - $as_echo_n "(cached) " >&6 -else + eval ac_save_FLAGS=\$$6 + as_fn_append $6 " $5" cat confdefs.h - <<_ACEOF >conftest.$ac_ext /* end confdefs.h. */ $4 int -main () +main (void) { #ifndef $as_decl_name #ifdef __cplusplus @@ -1829,19 +1733,22 @@ main () return 0; } _ACEOF -if ac_fn_c_try_compile "$LINENO"; then : +if ac_fn_c_try_compile "$LINENO" +then : eval "$3=yes" -else +else $as_nop eval "$3=no" fi -rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext +rm -f core conftest.err conftest.$ac_objext conftest.beam conftest.$ac_ext + eval $6=\$ac_save_FLAGS + fi eval ac_res=\$$3 - { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_res" >&5 -$as_echo "$ac_res" >&6; } + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $ac_res" >&5 +printf "%s\n" "$ac_res" >&6; } eval $as_lineno_stack; ${as_lineno_stack:+:} unset as_lineno -} # ac_fn_c_check_decl +} # ac_fn_check_decl # ac_fn_c_check_type LINENO TYPE VAR INCLUDES # ------------------------------------------- @@ -1850,17 +1757,18 @@ $as_echo "$ac_res" >&6; } ac_fn_c_check_type () { as_lineno=${as_lineno-"$1"} as_lineno_stack=as_lineno_stack=$as_lineno_stack - { $as_echo "$as_me:${as_lineno-$LINENO}: checking for $2" >&5 -$as_echo_n "checking for $2... " >&6; } -if eval \${$3+:} false; then : - $as_echo_n "(cached) " >&6 -else + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for $2" >&5 +printf %s "checking for $2... " >&6; } +if eval test \${$3+y} +then : + printf %s "(cached) " >&6 +else $as_nop eval "$3=no" cat confdefs.h - <<_ACEOF >conftest.$ac_ext /* end confdefs.h. */ $4 int -main () +main (void) { if (sizeof ($2)) return 0; @@ -1868,12 +1776,13 @@ if (sizeof ($2)) return 0; } _ACEOF -if ac_fn_c_try_compile "$LINENO"; then : +if ac_fn_c_try_compile "$LINENO" +then : cat confdefs.h - <<_ACEOF >conftest.$ac_ext /* end confdefs.h. 
*/ $4 int -main () +main (void) { if (sizeof (($2))) return 0; @@ -1881,18 +1790,19 @@ if (sizeof (($2))) return 0; } _ACEOF -if ac_fn_c_try_compile "$LINENO"; then : +if ac_fn_c_try_compile "$LINENO" +then : -else +else $as_nop eval "$3=yes" fi -rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext +rm -f core conftest.err conftest.$ac_objext conftest.beam conftest.$ac_ext fi -rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext +rm -f core conftest.err conftest.$ac_objext conftest.beam conftest.$ac_ext fi eval ac_res=\$$3 - { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_res" >&5 -$as_echo "$ac_res" >&6; } + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $ac_res" >&5 +printf "%s\n" "$ac_res" >&6; } eval $as_lineno_stack; ${as_lineno_stack:+:} unset as_lineno } # ac_fn_c_check_type @@ -1903,14 +1813,14 @@ $as_echo "$ac_res" >&6; } ac_fn_c_try_link () { as_lineno=${as_lineno-"$1"} as_lineno_stack=as_lineno_stack=$as_lineno_stack - rm -f conftest.$ac_objext conftest$ac_exeext + rm -f conftest.$ac_objext conftest.beam conftest$ac_exeext if { { ac_try="$ac_link" case "(($ac_try" in *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; *) ac_try_echo=$ac_try;; esac eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\"" -$as_echo "$ac_try_echo"; } >&5 +printf "%s\n" "$ac_try_echo"; } >&5 (eval "$ac_link") 2>conftest.err ac_status=$? if test -s conftest.err; then @@ -1918,17 +1828,18 @@ $as_echo "$ac_try_echo"; } >&5 cat conftest.er1 >&5 mv -f conftest.er1 conftest.err fi - $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 + printf "%s\n" "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 test $ac_status = 0; } && { test -z "$ac_c_werror_flag" || test ! -s conftest.err } && test -s conftest$ac_exeext && { test "$cross_compiling" = yes || test -x conftest$ac_exeext - }; then : + } +then : ac_retval=0 -else - $as_echo "$as_me: failed program was:" >&5 +else $as_nop + printf "%s\n" "$as_me: failed program was:" >&5 sed 's/^/| /' conftest.$ac_ext >&5 ac_retval=1 @@ -1943,17 +1854,61 @@ fi } # ac_fn_c_try_link +# ac_fn_c_try_run LINENO +# ---------------------- +# Try to run conftest.$ac_ext, and return whether this succeeded. Assumes that +# executables *can* be run. +ac_fn_c_try_run () +{ + as_lineno=${as_lineno-"$1"} as_lineno_stack=as_lineno_stack=$as_lineno_stack + if { { ac_try="$ac_link" +case "(($ac_try" in + *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; + *) ac_try_echo=$ac_try;; +esac +eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\"" +printf "%s\n" "$ac_try_echo"; } >&5 + (eval "$ac_link") 2>&5 + ac_status=$? + printf "%s\n" "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 + test $ac_status = 0; } && { ac_try='./conftest$ac_exeext' + { { case "(($ac_try" in + *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; + *) ac_try_echo=$ac_try;; +esac +eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\"" +printf "%s\n" "$ac_try_echo"; } >&5 + (eval "$ac_try") 2>&5 + ac_status=$? + printf "%s\n" "$as_me:${as_lineno-$LINENO}: \$? 
= $ac_status" >&5 + test $ac_status = 0; }; } +then : + ac_retval=0 +else $as_nop + printf "%s\n" "$as_me: program exited with status $ac_status" >&5 + printf "%s\n" "$as_me: failed program was:" >&5 +sed 's/^/| /' conftest.$ac_ext >&5 + + ac_retval=$ac_status +fi + rm -rf conftest.dSYM conftest_ipa8_conftest.oo + eval $as_lineno_stack; ${as_lineno_stack:+:} unset as_lineno + as_fn_set_status $ac_retval + +} # ac_fn_c_try_run + # ac_fn_c_check_func LINENO FUNC VAR # ---------------------------------- # Tests whether FUNC exists, setting the cache variable VAR accordingly ac_fn_c_check_func () { as_lineno=${as_lineno-"$1"} as_lineno_stack=as_lineno_stack=$as_lineno_stack - { $as_echo "$as_me:${as_lineno-$LINENO}: checking for $2" >&5 -$as_echo_n "checking for $2... " >&6; } -if eval \${$3+:} false; then : - $as_echo_n "(cached) " >&6 -else + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for $2" >&5 +printf %s "checking for $2... " >&6; } +if eval test \${$3+y} +then : + printf %s "(cached) " >&6 +else $as_nop cat confdefs.h - <<_ACEOF >conftest.$ac_ext /* end confdefs.h. */ /* Define $2 to an innocuous variant, in case declares $2. @@ -1961,16 +1916,9 @@ else #define $2 innocuous_$2 /* System header to define __stub macros and hopefully few prototypes, - which can conflict with char $2 (); below. - Prefer to if __STDC__ is defined, since - exists even on freestanding compilers. */ - -#ifdef __STDC__ -# include -#else -# include -#endif + which can conflict with char $2 (); below. */ +#include #undef $2 /* Override any GCC internal prototype to avoid an error. @@ -1988,35 +1936,56 @@ choke me #endif int -main () +main (void) { return $2 (); ; return 0; } _ACEOF -if ac_fn_c_try_link "$LINENO"; then : +if ac_fn_c_try_link "$LINENO" +then : eval "$3=yes" -else +else $as_nop eval "$3=no" fi -rm -f core conftest.err conftest.$ac_objext \ +rm -f core conftest.err conftest.$ac_objext conftest.beam \ conftest$ac_exeext conftest.$ac_ext fi eval ac_res=\$$3 - { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_res" >&5 -$as_echo "$ac_res" >&6; } + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $ac_res" >&5 +printf "%s\n" "$ac_res" >&6; } eval $as_lineno_stack; ${as_lineno_stack:+:} unset as_lineno } # ac_fn_c_check_func +ac_configure_args_raw= +for ac_arg +do + case $ac_arg in + *\'*) + ac_arg=`printf "%s\n" "$ac_arg" | sed "s/'/'\\\\\\\\''/g"` ;; + esac + as_fn_append ac_configure_args_raw " '$ac_arg'" +done + +case $ac_configure_args_raw in + *$as_nl*) + ac_safe_unquote= ;; + *) + ac_unsafe_z='|&;<>()$`\\"*?[ '' ' # This string ends in space, tab. + ac_unsafe_a="$ac_unsafe_z#~" + ac_safe_unquote="s/ '\\([^$ac_unsafe_a][^$ac_unsafe_z]*\\)'/ \\1/g" + ac_configure_args_raw=` printf "%s\n" "$ac_configure_args_raw" | sed "$ac_safe_unquote"`;; +esac + cat >config.log <<_ACEOF This file contains any messages produced by compilers while running configure, to aid debugging if configure makes a mistake. It was created by cpuminer-kudaraidee $as_me 1.1.0, which was -generated by GNU Autoconf 2.69. Invocation command line was +generated by GNU Autoconf 2.71. Invocation command line was - $ $0 $@ + $ $0$ac_configure_args_raw _ACEOF exec 5>>config.log @@ -2049,8 +2018,12 @@ as_save_IFS=$IFS; IFS=$PATH_SEPARATOR for as_dir in $PATH do IFS=$as_save_IFS - test -z "$as_dir" && as_dir=. 
- $as_echo "PATH: $as_dir" + case $as_dir in #((( + '') as_dir=./ ;; + */) ;; + *) as_dir=$as_dir/ ;; + esac + printf "%s\n" "PATH: $as_dir" done IFS=$as_save_IFS @@ -2085,7 +2058,7 @@ do | -silent | --silent | --silen | --sile | --sil) continue ;; *\'*) - ac_arg=`$as_echo "$ac_arg" | sed "s/'/'\\\\\\\\''/g"` ;; + ac_arg=`printf "%s\n" "$ac_arg" | sed "s/'/'\\\\\\\\''/g"` ;; esac case $ac_pass in 1) as_fn_append ac_configure_args0 " '$ac_arg'" ;; @@ -2120,11 +2093,13 @@ done # WARNING: Use '\'' to represent an apostrophe within the trap. # WARNING: Do not start the trap code with a newline, due to a FreeBSD 4.0 bug. trap 'exit_status=$? + # Sanitize IFS. + IFS=" "" $as_nl" # Save into config.log some information that might help in debugging. { echo - $as_echo "## ---------------- ## + printf "%s\n" "## ---------------- ## ## Cache variables. ## ## ---------------- ##" echo @@ -2135,8 +2110,8 @@ trap 'exit_status=$? case $ac_val in #( *${as_nl}*) case $ac_var in #( - *_cv_*) { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: cache variable $ac_var contains a newline" >&5 -$as_echo "$as_me: WARNING: cache variable $ac_var contains a newline" >&2;} ;; + *_cv_*) { printf "%s\n" "$as_me:${as_lineno-$LINENO}: WARNING: cache variable $ac_var contains a newline" >&5 +printf "%s\n" "$as_me: WARNING: cache variable $ac_var contains a newline" >&2;} ;; esac case $ac_var in #( _ | IFS | as_nl) ;; #( @@ -2160,7 +2135,7 @@ $as_echo "$as_me: WARNING: cache variable $ac_var contains a newline" >&2;} ;; ) echo - $as_echo "## ----------------- ## + printf "%s\n" "## ----------------- ## ## Output variables. ## ## ----------------- ##" echo @@ -2168,14 +2143,14 @@ $as_echo "$as_me: WARNING: cache variable $ac_var contains a newline" >&2;} ;; do eval ac_val=\$$ac_var case $ac_val in - *\'\''*) ac_val=`$as_echo "$ac_val" | sed "s/'\''/'\''\\\\\\\\'\'''\''/g"`;; + *\'\''*) ac_val=`printf "%s\n" "$ac_val" | sed "s/'\''/'\''\\\\\\\\'\'''\''/g"`;; esac - $as_echo "$ac_var='\''$ac_val'\''" + printf "%s\n" "$ac_var='\''$ac_val'\''" done | sort echo if test -n "$ac_subst_files"; then - $as_echo "## ------------------- ## + printf "%s\n" "## ------------------- ## ## File substitutions. ## ## ------------------- ##" echo @@ -2183,15 +2158,15 @@ $as_echo "$as_me: WARNING: cache variable $ac_var contains a newline" >&2;} ;; do eval ac_val=\$$ac_var case $ac_val in - *\'\''*) ac_val=`$as_echo "$ac_val" | sed "s/'\''/'\''\\\\\\\\'\'''\''/g"`;; + *\'\''*) ac_val=`printf "%s\n" "$ac_val" | sed "s/'\''/'\''\\\\\\\\'\'''\''/g"`;; esac - $as_echo "$ac_var='\''$ac_val'\''" + printf "%s\n" "$ac_var='\''$ac_val'\''" done | sort echo fi if test -s confdefs.h; then - $as_echo "## ----------- ## + printf "%s\n" "## ----------- ## ## confdefs.h. ## ## ----------- ##" echo @@ -2199,8 +2174,8 @@ $as_echo "$as_me: WARNING: cache variable $ac_var contains a newline" >&2;} ;; echo fi test "$ac_signal" != 0 && - $as_echo "$as_me: caught signal $ac_signal" - $as_echo "$as_me: exit $exit_status" + printf "%s\n" "$as_me: caught signal $ac_signal" + printf "%s\n" "$as_me: exit $exit_status" } >&5 rm -f core *.core core.conftest.* && rm -f -r conftest* confdefs* conf$$* $ac_clean_files && @@ -2214,63 +2189,48 @@ ac_signal=0 # confdefs.h avoids OS command line length limits that DEFS can exceed. rm -f -r conftest* confdefs.h -$as_echo "/* confdefs.h */" > confdefs.h +printf "%s\n" "/* confdefs.h */" > confdefs.h # Predefined preprocessor variables. 
-cat >>confdefs.h <<_ACEOF -#define PACKAGE_NAME "$PACKAGE_NAME" -_ACEOF +printf "%s\n" "#define PACKAGE_NAME \"$PACKAGE_NAME\"" >>confdefs.h -cat >>confdefs.h <<_ACEOF -#define PACKAGE_TARNAME "$PACKAGE_TARNAME" -_ACEOF +printf "%s\n" "#define PACKAGE_TARNAME \"$PACKAGE_TARNAME\"" >>confdefs.h -cat >>confdefs.h <<_ACEOF -#define PACKAGE_VERSION "$PACKAGE_VERSION" -_ACEOF +printf "%s\n" "#define PACKAGE_VERSION \"$PACKAGE_VERSION\"" >>confdefs.h -cat >>confdefs.h <<_ACEOF -#define PACKAGE_STRING "$PACKAGE_STRING" -_ACEOF +printf "%s\n" "#define PACKAGE_STRING \"$PACKAGE_STRING\"" >>confdefs.h -cat >>confdefs.h <<_ACEOF -#define PACKAGE_BUGREPORT "$PACKAGE_BUGREPORT" -_ACEOF +printf "%s\n" "#define PACKAGE_BUGREPORT \"$PACKAGE_BUGREPORT\"" >>confdefs.h -cat >>confdefs.h <<_ACEOF -#define PACKAGE_URL "$PACKAGE_URL" -_ACEOF +printf "%s\n" "#define PACKAGE_URL \"$PACKAGE_URL\"" >>confdefs.h # Let the site file select an alternate cache file if it wants to. # Prefer an explicitly selected file to automatically selected ones. -ac_site_file1=NONE -ac_site_file2=NONE if test -n "$CONFIG_SITE"; then - # We do not want a PATH search for config.site. - case $CONFIG_SITE in #(( - -*) ac_site_file1=./$CONFIG_SITE;; - */*) ac_site_file1=$CONFIG_SITE;; - *) ac_site_file1=./$CONFIG_SITE;; - esac + ac_site_files="$CONFIG_SITE" elif test "x$prefix" != xNONE; then - ac_site_file1=$prefix/share/config.site - ac_site_file2=$prefix/etc/config.site + ac_site_files="$prefix/share/config.site $prefix/etc/config.site" else - ac_site_file1=$ac_default_prefix/share/config.site - ac_site_file2=$ac_default_prefix/etc/config.site + ac_site_files="$ac_default_prefix/share/config.site $ac_default_prefix/etc/config.site" fi -for ac_site_file in "$ac_site_file1" "$ac_site_file2" + +for ac_site_file in $ac_site_files do - test "x$ac_site_file" = xNONE && continue - if test /dev/null != "$ac_site_file" && test -r "$ac_site_file"; then - { $as_echo "$as_me:${as_lineno-$LINENO}: loading site script $ac_site_file" >&5 -$as_echo "$as_me: loading site script $ac_site_file" >&6;} + case $ac_site_file in #( + */*) : + ;; #( + *) : + ac_site_file=./$ac_site_file ;; +esac + if test -f "$ac_site_file" && test -r "$ac_site_file"; then + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: loading site script $ac_site_file" >&5 +printf "%s\n" "$as_me: loading site script $ac_site_file" >&6;} sed 's/^/| /' "$ac_site_file" >&5 . "$ac_site_file" \ - || { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5 -$as_echo "$as_me: error: in \`$ac_pwd':" >&2;} + || { { printf "%s\n" "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5 +printf "%s\n" "$as_me: error: in \`$ac_pwd':" >&2;} as_fn_error $? "failed to load site script $ac_site_file See \`config.log' for more details" "$LINENO" 5; } fi @@ -2280,138 +2240,745 @@ if test -r "$cache_file"; then # Some versions of bash will fail to source /dev/null (special files # actually), so we avoid doing that. DJGPP emulates it as a regular file. if test /dev/null != "$cache_file" && test -f "$cache_file"; then - { $as_echo "$as_me:${as_lineno-$LINENO}: loading cache $cache_file" >&5 -$as_echo "$as_me: loading cache $cache_file" >&6;} + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: loading cache $cache_file" >&5 +printf "%s\n" "$as_me: loading cache $cache_file" >&6;} case $cache_file in [\\/]* | ?:[\\/]* ) . "$cache_file";; *) . 
"./$cache_file";; esac fi else - { $as_echo "$as_me:${as_lineno-$LINENO}: creating cache $cache_file" >&5 -$as_echo "$as_me: creating cache $cache_file" >&6;} + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: creating cache $cache_file" >&5 +printf "%s\n" "$as_me: creating cache $cache_file" >&6;} >$cache_file fi -# Check that the precious variables saved in the cache have kept the same -# value. -ac_cache_corrupted=false -for ac_var in $ac_precious_vars; do - eval ac_old_set=\$ac_cv_env_${ac_var}_set - eval ac_new_set=\$ac_env_${ac_var}_set - eval ac_old_val=\$ac_cv_env_${ac_var}_value - eval ac_new_val=\$ac_env_${ac_var}_value - case $ac_old_set,$ac_new_set in - set,) - { $as_echo "$as_me:${as_lineno-$LINENO}: error: \`$ac_var' was set to \`$ac_old_val' in the previous run" >&5 -$as_echo "$as_me: error: \`$ac_var' was set to \`$ac_old_val' in the previous run" >&2;} - ac_cache_corrupted=: ;; - ,set) - { $as_echo "$as_me:${as_lineno-$LINENO}: error: \`$ac_var' was not set in the previous run" >&5 -$as_echo "$as_me: error: \`$ac_var' was not set in the previous run" >&2;} - ac_cache_corrupted=: ;; - ,);; - *) - if test "x$ac_old_val" != "x$ac_new_val"; then - # differences in whitespace do not lead to failure. - ac_old_val_w=`echo x $ac_old_val` - ac_new_val_w=`echo x $ac_new_val` - if test "$ac_old_val_w" != "$ac_new_val_w"; then - { $as_echo "$as_me:${as_lineno-$LINENO}: error: \`$ac_var' has changed since the previous run:" >&5 -$as_echo "$as_me: error: \`$ac_var' has changed since the previous run:" >&2;} - ac_cache_corrupted=: - else - { $as_echo "$as_me:${as_lineno-$LINENO}: warning: ignoring whitespace changes in \`$ac_var' since the previous run:" >&5 -$as_echo "$as_me: warning: ignoring whitespace changes in \`$ac_var' since the previous run:" >&2;} - eval $ac_var=\$ac_old_val - fi - { $as_echo "$as_me:${as_lineno-$LINENO}: former value: \`$ac_old_val'" >&5 -$as_echo "$as_me: former value: \`$ac_old_val'" >&2;} - { $as_echo "$as_me:${as_lineno-$LINENO}: current value: \`$ac_new_val'" >&5 -$as_echo "$as_me: current value: \`$ac_new_val'" >&2;} - fi;; - esac - # Pass precious variables to config.status. - if test "$ac_new_set" = set; then - case $ac_new_val in - *\'*) ac_arg=$ac_var=`$as_echo "$ac_new_val" | sed "s/'/'\\\\\\\\''/g"` ;; - *) ac_arg=$ac_var=$ac_new_val ;; - esac - case " $ac_configure_args " in - *" '$ac_arg' "*) ;; # Avoid dups. Use of quotes ensures accuracy. - *) as_fn_append ac_configure_args " '$ac_arg'" ;; - esac - fi -done -if $ac_cache_corrupted; then - { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5 -$as_echo "$as_me: error: in \`$ac_pwd':" >&2;} - { $as_echo "$as_me:${as_lineno-$LINENO}: error: changes in the environment can compromise the build" >&5 -$as_echo "$as_me: error: changes in the environment can compromise the build" >&2;} - as_fn_error $? "run \`make distclean' and/or \`rm $cache_file' and start over" "$LINENO" 5 -fi -## -------------------- ## -## Main body of script. ## -## -------------------- ## +# Test code for whether the C compiler supports C89 (global declarations) +ac_c_conftest_c89_globals=' +/* Does the compiler advertise C89 conformance? + Do not test the value of __STDC__, because some compilers set it to 0 + while being otherwise adequately conformant. 
*/ +#if !defined __STDC__ +# error "Compiler does not advertise C89 conformance" +#endif -ac_ext=c -ac_cpp='$CPP $CPPFLAGS' -ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5' -ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' -ac_compiler_gnu=$ac_cv_c_compiler_gnu +#include +#include +struct stat; +/* Most of the following tests are stolen from RCS 5.7 src/conf.sh. */ +struct buf { int x; }; +struct buf * (*rcsopen) (struct buf *, struct stat *, int); +static char *e (p, i) + char **p; + int i; +{ + return p[i]; +} +static char *f (char * (*g) (char **, int), char **p, ...) +{ + char *s; + va_list v; + va_start (v,p); + s = g (p, va_arg (v,int)); + va_end (v); + return s; +} + +/* OSF 4.0 Compaq cc is some sort of almost-ANSI by default. It has + function prototypes and stuff, but not \xHH hex character constants. + These do not provoke an error unfortunately, instead are silently treated + as an "x". The following induces an error, until -std is added to get + proper ANSI mode. Curiously \x00 != x always comes out true, for an + array size at least. It is necessary to write \x00 == 0 to get something + that is true only with -std. */ +int osf4_cc_array ['\''\x00'\'' == 0 ? 1 : -1]; + +/* IBM C 6 for AIX is almost-ANSI by default, but it replaces macro parameters + inside strings and character constants. */ +#define FOO(x) '\''x'\'' +int xlc6_cc_array[FOO(a) == '\''x'\'' ? 1 : -1]; + +int test (int i, double x); +struct s1 {int (*f) (int a);}; +struct s2 {int (*f) (double a);}; +int pairnames (int, char **, int *(*)(struct buf *, struct stat *, int), + int, int);' + +# Test code for whether the C compiler supports C89 (body of main). +ac_c_conftest_c89_main=' +ok |= (argc == 0 || f (e, argv, 0) != argv[0] || f (e, argv, 1) != argv[1]); +' + +# Test code for whether the C compiler supports C99 (global declarations) +ac_c_conftest_c99_globals=' +// Does the compiler advertise C99 conformance? +#if !defined __STDC_VERSION__ || __STDC_VERSION__ < 199901L +# error "Compiler does not advertise C99 conformance" +#endif + +#include +extern int puts (const char *); +extern int printf (const char *, ...); +extern int dprintf (int, const char *, ...); +extern void *malloc (size_t); + +// Check varargs macros. These examples are taken from C99 6.10.3.5. +// dprintf is used instead of fprintf to avoid needing to declare +// FILE and stderr. +#define debug(...) dprintf (2, __VA_ARGS__) +#define showlist(...) puts (#__VA_ARGS__) +#define report(test,...) ((test) ? puts (#test) : printf (__VA_ARGS__)) +static void +test_varargs_macros (void) +{ + int x = 1234; + int y = 5678; + debug ("Flag"); + debug ("X = %d\n", x); + showlist (The first, second, and third items.); + report (x>y, "x is %d but y is %d", x, y); +} + +// Check long long types. +#define BIG64 18446744073709551615ull +#define BIG32 4294967295ul +#define BIG_OK (BIG64 / BIG32 == 4294967297ull && BIG64 % BIG32 == 0) +#if !BIG_OK + #error "your preprocessor is broken" +#endif +#if BIG_OK +#else + #error "your preprocessor is broken" +#endif +static long long int bignum = -9223372036854775807LL; +static unsigned long long int ubignum = BIG64; + +struct incomplete_array +{ + int datasize; + double data[]; +}; + +struct named_init { + int number; + const wchar_t *name; + double average; +}; + +typedef const char *ccp; + +static inline int +test_restrict (ccp restrict text) +{ + // See if C++-style comments work. + // Iterate through items via the restricted pointer. 
+ // Also check for declarations in for loops. + for (unsigned int i = 0; *(text+i) != '\''\0'\''; ++i) + continue; + return 0; +} + +// Check varargs and va_copy. +static bool +test_varargs (const char *format, ...) +{ + va_list args; + va_start (args, format); + va_list args_copy; + va_copy (args_copy, args); + + const char *str = ""; + int number = 0; + float fnumber = 0; + + while (*format) + { + switch (*format++) + { + case '\''s'\'': // string + str = va_arg (args_copy, const char *); + break; + case '\''d'\'': // int + number = va_arg (args_copy, int); + break; + case '\''f'\'': // float + fnumber = va_arg (args_copy, double); + break; + default: + break; + } + } + va_end (args_copy); + va_end (args); + + return *str && number && fnumber; +} +' + +# Test code for whether the C compiler supports C99 (body of main). +ac_c_conftest_c99_main=' + // Check bool. + _Bool success = false; + success |= (argc != 0); + + // Check restrict. + if (test_restrict ("String literal") == 0) + success = true; + char *restrict newvar = "Another string"; + // Check varargs. + success &= test_varargs ("s, d'\'' f .", "string", 65, 34.234); + test_varargs_macros (); + // Check flexible array members. + struct incomplete_array *ia = + malloc (sizeof (struct incomplete_array) + (sizeof (double) * 10)); + ia->datasize = 10; + for (int i = 0; i < ia->datasize; ++i) + ia->data[i] = i * 1.234; + + // Check named initializers. + struct named_init ni = { + .number = 34, + .name = L"Test wide string", + .average = 543.34343, + }; + ni.number = 58; + + int dynamic_array[ni.number]; + dynamic_array[0] = argv[0][0]; + dynamic_array[ni.number - 1] = 543; + + // work around unused variable warnings + ok |= (!success || bignum == 0LL || ubignum == 0uLL || newvar[0] == '\''x'\'' + || dynamic_array[ni.number - 1] != 543); +' + +# Test code for whether the C compiler supports C11 (global declarations) +ac_c_conftest_c11_globals=' +// Does the compiler advertise C11 conformance? +#if !defined __STDC_VERSION__ || __STDC_VERSION__ < 201112L +# error "Compiler does not advertise C11 conformance" +#endif + +// Check _Alignas. +char _Alignas (double) aligned_as_double; +char _Alignas (0) no_special_alignment; +extern char aligned_as_int; +char _Alignas (0) _Alignas (int) aligned_as_int; + +// Check _Alignof. +enum +{ + int_alignment = _Alignof (int), + int_array_alignment = _Alignof (int[100]), + char_alignment = _Alignof (char) +}; +_Static_assert (0 < -_Alignof (int), "_Alignof is signed"); + +// Check _Noreturn. +int _Noreturn does_not_return (void) { for (;;) continue; } + +// Check _Static_assert. +struct test_static_assert +{ + int x; + _Static_assert (sizeof (int) <= sizeof (long int), + "_Static_assert does not work in struct"); + long int y; +}; + +// Check UTF-8 literals. +#define u8 syntax error! +char const utf8_literal[] = u8"happens to be ASCII" "another string"; + +// Check duplicate typedefs. +typedef long *long_ptr; +typedef long int *long_ptr; +typedef long_ptr long_ptr; + +// Anonymous structures and unions -- taken from C11 6.7.2.1 Example 1. +struct anonymous +{ + union { + struct { int i; int j; }; + struct { int k; long int l; } w; + }; + int m; +} v1; +' + +# Test code for whether the C compiler supports C11 (body of main). +ac_c_conftest_c11_main=' + _Static_assert ((offsetof (struct anonymous, i) + == offsetof (struct anonymous, w.k)), + "Anonymous union alignment botch"); + v1.i = 2; + v1.w.k = 5; + ok |= v1.i != 5; +' + +# Test code for whether the C compiler supports C11 (complete). 
+ac_c_conftest_c11_program="${ac_c_conftest_c89_globals} +${ac_c_conftest_c99_globals} +${ac_c_conftest_c11_globals} + +int +main (int argc, char **argv) +{ + int ok = 0; + ${ac_c_conftest_c89_main} + ${ac_c_conftest_c99_main} + ${ac_c_conftest_c11_main} + return ok; +} +" + +# Test code for whether the C compiler supports C99 (complete). +ac_c_conftest_c99_program="${ac_c_conftest_c89_globals} +${ac_c_conftest_c99_globals} + +int +main (int argc, char **argv) +{ + int ok = 0; + ${ac_c_conftest_c89_main} + ${ac_c_conftest_c99_main} + return ok; +} +" + +# Test code for whether the C compiler supports C89 (complete). +ac_c_conftest_c89_program="${ac_c_conftest_c89_globals} + +int +main (int argc, char **argv) +{ + int ok = 0; + ${ac_c_conftest_c89_main} + return ok; +} +" + +# Test code for whether the C++ compiler supports C++98 (global declarations) +ac_cxx_conftest_cxx98_globals=' +// Does the compiler advertise C++98 conformance? +#if !defined __cplusplus || __cplusplus < 199711L +# error "Compiler does not advertise C++98 conformance" +#endif + +// These inclusions are to reject old compilers that +// lack the unsuffixed header files. +#include +#include + +// and are *not* freestanding headers in C++98. +extern void assert (int); +namespace std { + extern int strcmp (const char *, const char *); +} + +// Namespaces, exceptions, and templates were all added after "C++ 2.0". +using std::exception; +using std::strcmp; + +namespace { + +void test_exception_syntax() +{ + try { + throw "test"; + } catch (const char *s) { + // Extra parentheses suppress a warning when building autoconf itself, + // due to lint rules shared with more typical C programs. + assert (!(strcmp) (s, "test")); + } +} + +template struct test_template +{ + T const val; + explicit test_template(T t) : val(t) {} + template T add(U u) { return static_cast(u) + val; } +}; + +} // anonymous namespace +' + +# Test code for whether the C++ compiler supports C++98 (body of main) +ac_cxx_conftest_cxx98_main=' + assert (argc); + assert (! argv[0]); +{ + test_exception_syntax (); + test_template tt (2.0); + assert (tt.add (4) == 6.0); + assert (true && !false); +} +' + +# Test code for whether the C++ compiler supports C++11 (global declarations) +ac_cxx_conftest_cxx11_globals=' +// Does the compiler advertise C++ 2011 conformance? +#if !defined __cplusplus || __cplusplus < 201103L +# error "Compiler does not advertise C++11 conformance" +#endif + +namespace cxx11test +{ + constexpr int get_val() { return 20; } + + struct testinit + { + int i; + double d; + }; + + class delegate + { + public: + delegate(int n) : n(n) {} + delegate(): delegate(2354) {} + + virtual int getval() { return this->n; }; + protected: + int n; + }; + + class overridden : public delegate + { + public: + overridden(int n): delegate(n) {} + virtual int getval() override final { return this->n * 2; } + }; + + class nocopy + { + public: + nocopy(int i): i(i) {} + nocopy() = default; + nocopy(const nocopy&) = delete; + nocopy & operator=(const nocopy&) = delete; + private: + int i; + }; + + // for testing lambda expressions + template Ret eval(Fn f, Ret v) + { + return f(v); + } + + // for testing variadic templates and trailing return types + template auto sum(V first) -> V + { + return first; + } + template auto sum(V first, Args... 
rest) -> V + { + return first + sum(rest...); + } +} +' + +# Test code for whether the C++ compiler supports C++11 (body of main) +ac_cxx_conftest_cxx11_main=' +{ + // Test auto and decltype + auto a1 = 6538; + auto a2 = 48573953.4; + auto a3 = "String literal"; + + int total = 0; + for (auto i = a3; *i; ++i) { total += *i; } + + decltype(a2) a4 = 34895.034; +} +{ + // Test constexpr + short sa[cxx11test::get_val()] = { 0 }; +} +{ + // Test initializer lists + cxx11test::testinit il = { 4323, 435234.23544 }; +} +{ + // Test range-based for + int array[] = {9, 7, 13, 15, 4, 18, 12, 10, 5, 3, + 14, 19, 17, 8, 6, 20, 16, 2, 11, 1}; + for (auto &x : array) { x += 23; } +} +{ + // Test lambda expressions + using cxx11test::eval; + assert (eval ([](int x) { return x*2; }, 21) == 42); + double d = 2.0; + assert (eval ([&](double x) { return d += x; }, 3.0) == 5.0); + assert (d == 5.0); + assert (eval ([=](double x) mutable { return d += x; }, 4.0) == 9.0); + assert (d == 5.0); +} +{ + // Test use of variadic templates + using cxx11test::sum; + auto a = sum(1); + auto b = sum(1, 2); + auto c = sum(1.0, 2.0, 3.0); +} +{ + // Test constructor delegation + cxx11test::delegate d1; + cxx11test::delegate d2(); + cxx11test::delegate d3(45); +} +{ + // Test override and final + cxx11test::overridden o1(55464); +} +{ + // Test nullptr + char *c = nullptr; +} +{ + // Test template brackets + test_template<::test_template> v(test_template(12)); +} +{ + // Unicode literals + char const *utf8 = u8"UTF-8 string \u2500"; + char16_t const *utf16 = u"UTF-8 string \u2500"; + char32_t const *utf32 = U"UTF-32 string \u2500"; +} +' + +# Test code for whether the C compiler supports C++11 (complete). +ac_cxx_conftest_cxx11_program="${ac_cxx_conftest_cxx98_globals} +${ac_cxx_conftest_cxx11_globals} + +int +main (int argc, char **argv) +{ + int ok = 0; + ${ac_cxx_conftest_cxx98_main} + ${ac_cxx_conftest_cxx11_main} + return ok; +} +" + +# Test code for whether the C compiler supports C++98 (complete). +ac_cxx_conftest_cxx98_program="${ac_cxx_conftest_cxx98_globals} +int +main (int argc, char **argv) +{ + int ok = 0; + ${ac_cxx_conftest_cxx98_main} + return ok; +} +" + +as_fn_append ac_header_c_list " stdio.h stdio_h HAVE_STDIO_H" +as_fn_append ac_header_c_list " stdlib.h stdlib_h HAVE_STDLIB_H" +as_fn_append ac_header_c_list " string.h string_h HAVE_STRING_H" +as_fn_append ac_header_c_list " inttypes.h inttypes_h HAVE_INTTYPES_H" +as_fn_append ac_header_c_list " stdint.h stdint_h HAVE_STDINT_H" +as_fn_append ac_header_c_list " strings.h strings_h HAVE_STRINGS_H" +as_fn_append ac_header_c_list " sys/stat.h sys_stat_h HAVE_SYS_STAT_H" +as_fn_append ac_header_c_list " sys/types.h sys_types_h HAVE_SYS_TYPES_H" +as_fn_append ac_header_c_list " unistd.h unistd_h HAVE_UNISTD_H" + +# Auxiliary files required by this configure script. +ac_aux_files="compile missing install-sh config.guess config.sub" + +# Locations in which to look for auxiliary files. +ac_aux_dir_candidates="${srcdir}${PATH_SEPARATOR}${srcdir}/..${PATH_SEPARATOR}${srcdir}/../.." + +# Search for a directory containing all of the required auxiliary files, +# $ac_aux_files, from the $PATH-style list $ac_aux_dir_candidates. +# If we don't find one directory that contains all the files we need, +# we report the set of missing files from the *first* directory in +# $ac_aux_dir_candidates and give up. 
+ac_missing_aux_files="" +ac_first_candidate=: +printf "%s\n" "$as_me:${as_lineno-$LINENO}: looking for aux files: $ac_aux_files" >&5 +as_save_IFS=$IFS; IFS=$PATH_SEPARATOR +as_found=false +for as_dir in $ac_aux_dir_candidates +do + IFS=$as_save_IFS + case $as_dir in #((( + '') as_dir=./ ;; + */) ;; + *) as_dir=$as_dir/ ;; + esac + as_found=: -ac_aux_dir= -for ac_dir in "$srcdir" "$srcdir/.." "$srcdir/../.."; do - if test -f "$ac_dir/install-sh"; then - ac_aux_dir=$ac_dir - ac_install_sh="$ac_aux_dir/install-sh -c" - break - elif test -f "$ac_dir/install.sh"; then - ac_aux_dir=$ac_dir - ac_install_sh="$ac_aux_dir/install.sh -c" - break - elif test -f "$ac_dir/shtool"; then - ac_aux_dir=$ac_dir - ac_install_sh="$ac_aux_dir/shtool install -c" + printf "%s\n" "$as_me:${as_lineno-$LINENO}: trying $as_dir" >&5 + ac_aux_dir_found=yes + ac_install_sh= + for ac_aux in $ac_aux_files + do + # As a special case, if "install-sh" is required, that requirement + # can be satisfied by any of "install-sh", "install.sh", or "shtool", + # and $ac_install_sh is set appropriately for whichever one is found. + if test x"$ac_aux" = x"install-sh" + then + if test -f "${as_dir}install-sh"; then + printf "%s\n" "$as_me:${as_lineno-$LINENO}: ${as_dir}install-sh found" >&5 + ac_install_sh="${as_dir}install-sh -c" + elif test -f "${as_dir}install.sh"; then + printf "%s\n" "$as_me:${as_lineno-$LINENO}: ${as_dir}install.sh found" >&5 + ac_install_sh="${as_dir}install.sh -c" + elif test -f "${as_dir}shtool"; then + printf "%s\n" "$as_me:${as_lineno-$LINENO}: ${as_dir}shtool found" >&5 + ac_install_sh="${as_dir}shtool install -c" + else + ac_aux_dir_found=no + if $ac_first_candidate; then + ac_missing_aux_files="${ac_missing_aux_files} install-sh" + else + break + fi + fi + else + if test -f "${as_dir}${ac_aux}"; then + printf "%s\n" "$as_me:${as_lineno-$LINENO}: ${as_dir}${ac_aux} found" >&5 + else + ac_aux_dir_found=no + if $ac_first_candidate; then + ac_missing_aux_files="${ac_missing_aux_files} ${ac_aux}" + else + break + fi + fi + fi + done + if test "$ac_aux_dir_found" = yes; then + ac_aux_dir="$as_dir" break fi + ac_first_candidate=false + + as_found=false done -if test -z "$ac_aux_dir"; then - as_fn_error $? "cannot find install-sh, install.sh, or shtool in \"$srcdir\" \"$srcdir/..\" \"$srcdir/../..\"" "$LINENO" 5 +IFS=$as_save_IFS +if $as_found +then : + +else $as_nop + as_fn_error $? "cannot find required auxiliary files:$ac_missing_aux_files" "$LINENO" 5 fi + # These three variables are undocumented and unsupported, # and are intended to be withdrawn in a future Autoconf release. # They can cause serious problems if a builder's source tree is in a directory # whose full name contains unusual characters. -ac_config_guess="$SHELL $ac_aux_dir/config.guess" # Please don't use this var. -ac_config_sub="$SHELL $ac_aux_dir/config.sub" # Please don't use this var. -ac_configure="$SHELL $ac_aux_dir/configure" # Please don't use this var. +if test -f "${ac_aux_dir}config.guess"; then + ac_config_guess="$SHELL ${ac_aux_dir}config.guess" +fi +if test -f "${ac_aux_dir}config.sub"; then + ac_config_sub="$SHELL ${ac_aux_dir}config.sub" +fi +if test -f "$ac_aux_dir/configure"; then + ac_configure="$SHELL ${ac_aux_dir}configure" +fi + +# Check that the precious variables saved in the cache have kept the same +# value. 
+ac_cache_corrupted=false +for ac_var in $ac_precious_vars; do + eval ac_old_set=\$ac_cv_env_${ac_var}_set + eval ac_new_set=\$ac_env_${ac_var}_set + eval ac_old_val=\$ac_cv_env_${ac_var}_value + eval ac_new_val=\$ac_env_${ac_var}_value + case $ac_old_set,$ac_new_set in + set,) + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: error: \`$ac_var' was set to \`$ac_old_val' in the previous run" >&5 +printf "%s\n" "$as_me: error: \`$ac_var' was set to \`$ac_old_val' in the previous run" >&2;} + ac_cache_corrupted=: ;; + ,set) + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: error: \`$ac_var' was not set in the previous run" >&5 +printf "%s\n" "$as_me: error: \`$ac_var' was not set in the previous run" >&2;} + ac_cache_corrupted=: ;; + ,);; + *) + if test "x$ac_old_val" != "x$ac_new_val"; then + # differences in whitespace do not lead to failure. + ac_old_val_w=`echo x $ac_old_val` + ac_new_val_w=`echo x $ac_new_val` + if test "$ac_old_val_w" != "$ac_new_val_w"; then + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: error: \`$ac_var' has changed since the previous run:" >&5 +printf "%s\n" "$as_me: error: \`$ac_var' has changed since the previous run:" >&2;} + ac_cache_corrupted=: + else + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: warning: ignoring whitespace changes in \`$ac_var' since the previous run:" >&5 +printf "%s\n" "$as_me: warning: ignoring whitespace changes in \`$ac_var' since the previous run:" >&2;} + eval $ac_var=\$ac_old_val + fi + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: former value: \`$ac_old_val'" >&5 +printf "%s\n" "$as_me: former value: \`$ac_old_val'" >&2;} + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: current value: \`$ac_new_val'" >&5 +printf "%s\n" "$as_me: current value: \`$ac_new_val'" >&2;} + fi;; + esac + # Pass precious variables to config.status. + if test "$ac_new_set" = set; then + case $ac_new_val in + *\'*) ac_arg=$ac_var=`printf "%s\n" "$ac_new_val" | sed "s/'/'\\\\\\\\''/g"` ;; + *) ac_arg=$ac_var=$ac_new_val ;; + esac + case " $ac_configure_args " in + *" '$ac_arg' "*) ;; # Avoid dups. Use of quotes ensures accuracy. + *) as_fn_append ac_configure_args " '$ac_arg'" ;; + esac + fi +done +if $ac_cache_corrupted; then + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5 +printf "%s\n" "$as_me: error: in \`$ac_pwd':" >&2;} + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: error: changes in the environment can compromise the build" >&5 +printf "%s\n" "$as_me: error: changes in the environment can compromise the build" >&2;} + as_fn_error $? "run \`${MAKE-make} distclean' and/or \`rm $cache_file' + and start over" "$LINENO" 5 +fi +## -------------------- ## +## Main body of script. ## +## -------------------- ## + +ac_ext=c +ac_cpp='$CPP $CPPFLAGS' +ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5' +ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' +ac_compiler_gnu=$ac_cv_c_compiler_gnu -# Make sure we can run config.sub. -$SHELL "$ac_aux_dir/config.sub" sun4 >/dev/null 2>&1 || - as_fn_error $? "cannot run $SHELL $ac_aux_dir/config.sub" "$LINENO" 5 -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking build system type" >&5 -$as_echo_n "checking build system type... " >&6; } -if ${ac_cv_build+:} false; then : - $as_echo_n "(cached) " >&6 -else + + + + + # Make sure we can run config.sub. +$SHELL "${ac_aux_dir}config.sub" sun4 >/dev/null 2>&1 || + as_fn_error $? 
"cannot run $SHELL ${ac_aux_dir}config.sub" "$LINENO" 5 + +{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking build system type" >&5 +printf %s "checking build system type... " >&6; } +if test ${ac_cv_build+y} +then : + printf %s "(cached) " >&6 +else $as_nop ac_build_alias=$build_alias test "x$ac_build_alias" = x && - ac_build_alias=`$SHELL "$ac_aux_dir/config.guess"` + ac_build_alias=`$SHELL "${ac_aux_dir}config.guess"` test "x$ac_build_alias" = x && as_fn_error $? "cannot guess build type; you must specify one" "$LINENO" 5 -ac_cv_build=`$SHELL "$ac_aux_dir/config.sub" $ac_build_alias` || - as_fn_error $? "$SHELL $ac_aux_dir/config.sub $ac_build_alias failed" "$LINENO" 5 +ac_cv_build=`$SHELL "${ac_aux_dir}config.sub" $ac_build_alias` || + as_fn_error $? "$SHELL ${ac_aux_dir}config.sub $ac_build_alias failed" "$LINENO" 5 fi -{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_build" >&5 -$as_echo "$ac_cv_build" >&6; } +{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $ac_cv_build" >&5 +printf "%s\n" "$ac_cv_build" >&6; } case $ac_cv_build in *-*-*) ;; *) as_fn_error $? "invalid value of canonical build" "$LINENO" 5;; @@ -2430,21 +2997,22 @@ IFS=$ac_save_IFS case $build_os in *\ *) build_os=`echo "$build_os" | sed 's/ /-/g'`;; esac -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking host system type" >&5 -$as_echo_n "checking host system type... " >&6; } -if ${ac_cv_host+:} false; then : - $as_echo_n "(cached) " >&6 -else +{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking host system type" >&5 +printf %s "checking host system type... " >&6; } +if test ${ac_cv_host+y} +then : + printf %s "(cached) " >&6 +else $as_nop if test "x$host_alias" = x; then ac_cv_host=$ac_cv_build else - ac_cv_host=`$SHELL "$ac_aux_dir/config.sub" $host_alias` || - as_fn_error $? "$SHELL $ac_aux_dir/config.sub $host_alias failed" "$LINENO" 5 + ac_cv_host=`$SHELL "${ac_aux_dir}config.sub" $host_alias` || + as_fn_error $? "$SHELL ${ac_aux_dir}config.sub $host_alias failed" "$LINENO" 5 fi fi -{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_host" >&5 -$as_echo "$ac_cv_host" >&6; } +{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $ac_cv_host" >&5 +printf "%s\n" "$ac_cv_host" >&6; } case $ac_cv_host in *-*-*) ;; *) as_fn_error $? "invalid value of canonical host" "$LINENO" 5;; @@ -2463,21 +3031,22 @@ IFS=$ac_save_IFS case $host_os in *\ *) host_os=`echo "$host_os" | sed 's/ /-/g'`;; esac -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking target system type" >&5 -$as_echo_n "checking target system type... " >&6; } -if ${ac_cv_target+:} false; then : - $as_echo_n "(cached) " >&6 -else +{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking target system type" >&5 +printf %s "checking target system type... " >&6; } +if test ${ac_cv_target+y} +then : + printf %s "(cached) " >&6 +else $as_nop if test "x$target_alias" = x; then ac_cv_target=$ac_cv_host else - ac_cv_target=`$SHELL "$ac_aux_dir/config.sub" $target_alias` || - as_fn_error $? "$SHELL $ac_aux_dir/config.sub $target_alias failed" "$LINENO" 5 + ac_cv_target=`$SHELL "${ac_aux_dir}config.sub" $target_alias` || + as_fn_error $? "$SHELL ${ac_aux_dir}config.sub $target_alias failed" "$LINENO" 5 fi fi -{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_target" >&5 -$as_echo "$ac_cv_target" >&6; } +{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $ac_cv_target" >&5 +printf "%s\n" "$ac_cv_target" >&6; } case $ac_cv_target in *-*-*) ;; *) as_fn_error $? 
"invalid value of canonical target" "$LINENO" 5;; @@ -2508,9 +3077,10 @@ ac_config_headers="$ac_config_headers cpuminer-config.h" -am__api_version='1.15' +am__api_version='1.16' + -# Find a good install program. We prefer a C program (faster), + # Find a good install program. We prefer a C program (faster), # so one script is as good as another. But avoid the broken or # incompatible versions: # SysV /etc/install, /usr/sbin/install @@ -2524,20 +3094,25 @@ am__api_version='1.15' # OS/2's system install, which has a completely different semantic # ./install, which can be erroneously created by make from ./install.sh. # Reject install programs that cannot install multiple files. -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for a BSD-compatible install" >&5 -$as_echo_n "checking for a BSD-compatible install... " >&6; } +{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for a BSD-compatible install" >&5 +printf %s "checking for a BSD-compatible install... " >&6; } if test -z "$INSTALL"; then -if ${ac_cv_path_install+:} false; then : - $as_echo_n "(cached) " >&6 -else +if test ${ac_cv_path_install+y} +then : + printf %s "(cached) " >&6 +else $as_nop as_save_IFS=$IFS; IFS=$PATH_SEPARATOR for as_dir in $PATH do IFS=$as_save_IFS - test -z "$as_dir" && as_dir=. - # Account for people who put trailing slashes in PATH elements. -case $as_dir/ in #(( - ./ | .// | /[cC]/* | \ + case $as_dir in #((( + '') as_dir=./ ;; + */) ;; + *) as_dir=$as_dir/ ;; + esac + # Account for fact that we put trailing slashes in our PATH walk. +case $as_dir in #(( + ./ | /[cC]/* | \ /etc/* | /usr/sbin/* | /usr/etc/* | /sbin/* | /usr/afsws/bin/* | \ ?:[\\/]os2[\\/]install[\\/]* | ?:[\\/]OS2[\\/]INSTALL[\\/]* | \ /usr/ucb/* ) ;; @@ -2547,13 +3122,13 @@ case $as_dir/ in #(( # by default. for ac_prog in ginstall scoinst install; do for ac_exec_ext in '' $ac_executable_extensions; do - if as_fn_executable_p "$as_dir/$ac_prog$ac_exec_ext"; then + if as_fn_executable_p "$as_dir$ac_prog$ac_exec_ext"; then if test $ac_prog = install && - grep dspmsg "$as_dir/$ac_prog$ac_exec_ext" >/dev/null 2>&1; then + grep dspmsg "$as_dir$ac_prog$ac_exec_ext" >/dev/null 2>&1; then # AIX install. It has an incompatible calling convention. : elif test $ac_prog = install && - grep pwplus "$as_dir/$ac_prog$ac_exec_ext" >/dev/null 2>&1; then + grep pwplus "$as_dir$ac_prog$ac_exec_ext" >/dev/null 2>&1; then # program-specific install script used by HP pwplus--don't use. : else @@ -2561,12 +3136,12 @@ case $as_dir/ in #(( echo one > conftest.one echo two > conftest.two mkdir conftest.dir - if "$as_dir/$ac_prog$ac_exec_ext" -c conftest.one conftest.two "`pwd`/conftest.dir" && + if "$as_dir$ac_prog$ac_exec_ext" -c conftest.one conftest.two "`pwd`/conftest.dir/" && test -s conftest.one && test -s conftest.two && test -s conftest.dir/conftest.one && test -s conftest.dir/conftest.two then - ac_cv_path_install="$as_dir/$ac_prog$ac_exec_ext -c" + ac_cv_path_install="$as_dir$ac_prog$ac_exec_ext -c" break 3 fi fi @@ -2582,7 +3157,7 @@ IFS=$as_save_IFS rm -rf conftest.one conftest.two conftest.dir fi - if test "${ac_cv_path_install+set}" = set; then + if test ${ac_cv_path_install+y}; then INSTALL=$ac_cv_path_install else # As a last resort, use the slow shell script. 
Don't cache a @@ -2592,8 +3167,8 @@ fi INSTALL=$ac_install_sh fi fi -{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $INSTALL" >&5 -$as_echo "$INSTALL" >&6; } +{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $INSTALL" >&5 +printf "%s\n" "$INSTALL" >&6; } # Use test -z because SunOS4 sh mishandles braces in ${var-val}. # It thinks the first close brace ends the variable substitution. @@ -2603,8 +3178,8 @@ test -z "$INSTALL_SCRIPT" && INSTALL_SCRIPT='${INSTALL}' test -z "$INSTALL_DATA" && INSTALL_DATA='${INSTALL} -m 644' -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether build environment is sane" >&5 -$as_echo_n "checking whether build environment is sane... " >&6; } +{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether build environment is sane" >&5 +printf %s "checking whether build environment is sane... " >&6; } # Reject unsafe characters in $srcdir or the absolute working directory # name. Accept space and tab only in the latter. am_lf=' @@ -2658,8 +3233,8 @@ else as_fn_error $? "newly created file is older than distributed files! Check your system clock" "$LINENO" 5 fi -{ $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5 -$as_echo "yes" >&6; } +{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: yes" >&5 +printf "%s\n" "yes" >&6; } # If we didn't sleep, we still need to ensure time stamps of config.status and # generated files are strictly newer. am_sleep_pid= @@ -2678,26 +3253,23 @@ test "$program_suffix" != NONE && # Double any \ or $. # By default was `s,x,x', remove it if useless. ac_script='s/[\\$]/&&/g;s/;s,x,x,$//' -program_transform_name=`$as_echo "$program_transform_name" | sed "$ac_script"` +program_transform_name=`printf "%s\n" "$program_transform_name" | sed "$ac_script"` + # Expand $ac_aux_dir to an absolute path. am_aux_dir=`cd "$ac_aux_dir" && pwd` -if test x"${MISSING+set}" != xset; then - case $am_aux_dir in - *\ * | *\ *) - MISSING="\${SHELL} \"$am_aux_dir/missing\"" ;; - *) - MISSING="\${SHELL} $am_aux_dir/missing" ;; - esac + + if test x"${MISSING+set}" != xset; then + MISSING="\${SHELL} '$am_aux_dir/missing'" fi # Use eval to expand $SHELL if eval "$MISSING --is-lightweight"; then am_missing_run="$MISSING " else am_missing_run= - { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: 'missing' script is too old or missing" >&5 -$as_echo "$as_me: WARNING: 'missing' script is too old or missing" >&2;} + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: WARNING: 'missing' script is too old or missing" >&5 +printf "%s\n" "$as_me: WARNING: 'missing' script is too old or missing" >&2;} fi if test x"${install_sh+set}" != xset; then @@ -2717,11 +3289,12 @@ if test "$cross_compiling" != no; then if test -n "$ac_tool_prefix"; then # Extract the first word of "${ac_tool_prefix}strip", so it can be a program name with args. set dummy ${ac_tool_prefix}strip; ac_word=$2 -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 -$as_echo_n "checking for $ac_word... " >&6; } -if ${ac_cv_prog_STRIP+:} false; then : - $as_echo_n "(cached) " >&6 -else +{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 +printf %s "checking for $ac_word... " >&6; } +if test ${ac_cv_prog_STRIP+y} +then : + printf %s "(cached) " >&6 +else $as_nop if test -n "$STRIP"; then ac_cv_prog_STRIP="$STRIP" # Let the user override the test. else @@ -2729,11 +3302,15 @@ as_save_IFS=$IFS; IFS=$PATH_SEPARATOR for as_dir in $PATH do IFS=$as_save_IFS - test -z "$as_dir" && as_dir=. 
+ case $as_dir in #((( + '') as_dir=./ ;; + */) ;; + *) as_dir=$as_dir/ ;; + esac for ac_exec_ext in '' $ac_executable_extensions; do - if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then + if as_fn_executable_p "$as_dir$ac_word$ac_exec_ext"; then ac_cv_prog_STRIP="${ac_tool_prefix}strip" - $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 + printf "%s\n" "$as_me:${as_lineno-$LINENO}: found $as_dir$ac_word$ac_exec_ext" >&5 break 2 fi done @@ -2744,11 +3321,11 @@ fi fi STRIP=$ac_cv_prog_STRIP if test -n "$STRIP"; then - { $as_echo "$as_me:${as_lineno-$LINENO}: result: $STRIP" >&5 -$as_echo "$STRIP" >&6; } + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $STRIP" >&5 +printf "%s\n" "$STRIP" >&6; } else - { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 -$as_echo "no" >&6; } + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: no" >&5 +printf "%s\n" "no" >&6; } fi @@ -2757,11 +3334,12 @@ if test -z "$ac_cv_prog_STRIP"; then ac_ct_STRIP=$STRIP # Extract the first word of "strip", so it can be a program name with args. set dummy strip; ac_word=$2 -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 -$as_echo_n "checking for $ac_word... " >&6; } -if ${ac_cv_prog_ac_ct_STRIP+:} false; then : - $as_echo_n "(cached) " >&6 -else +{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 +printf %s "checking for $ac_word... " >&6; } +if test ${ac_cv_prog_ac_ct_STRIP+y} +then : + printf %s "(cached) " >&6 +else $as_nop if test -n "$ac_ct_STRIP"; then ac_cv_prog_ac_ct_STRIP="$ac_ct_STRIP" # Let the user override the test. else @@ -2769,11 +3347,15 @@ as_save_IFS=$IFS; IFS=$PATH_SEPARATOR for as_dir in $PATH do IFS=$as_save_IFS - test -z "$as_dir" && as_dir=. + case $as_dir in #((( + '') as_dir=./ ;; + */) ;; + *) as_dir=$as_dir/ ;; + esac for ac_exec_ext in '' $ac_executable_extensions; do - if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then + if as_fn_executable_p "$as_dir$ac_word$ac_exec_ext"; then ac_cv_prog_ac_ct_STRIP="strip" - $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 + printf "%s\n" "$as_me:${as_lineno-$LINENO}: found $as_dir$ac_word$ac_exec_ext" >&5 break 2 fi done @@ -2784,11 +3366,11 @@ fi fi ac_ct_STRIP=$ac_cv_prog_ac_ct_STRIP if test -n "$ac_ct_STRIP"; then - { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_ct_STRIP" >&5 -$as_echo "$ac_ct_STRIP" >&6; } + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $ac_ct_STRIP" >&5 +printf "%s\n" "$ac_ct_STRIP" >&6; } else - { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 -$as_echo "no" >&6; } + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: no" >&5 +printf "%s\n" "no" >&6; } fi if test "x$ac_ct_STRIP" = x; then @@ -2796,8 +3378,8 @@ fi else case $cross_compiling:$ac_tool_warned in yes:) -{ $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: using cross tools not prefixed with host triplet" >&5 -$as_echo "$as_me: WARNING: using cross tools not prefixed with host triplet" >&2;} +{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: WARNING: using cross tools not prefixed with host triplet" >&5 +printf "%s\n" "$as_me: WARNING: using cross tools not prefixed with host triplet" >&2;} ac_tool_warned=yes ;; esac STRIP=$ac_ct_STRIP @@ -2809,25 +3391,31 @@ fi fi INSTALL_STRIP_PROGRAM="\$(install_sh) -c -s" -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for a thread-safe mkdir -p" >&5 -$as_echo_n "checking for a thread-safe mkdir -p... 
" >&6; } + + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for a race-free mkdir -p" >&5 +printf %s "checking for a race-free mkdir -p... " >&6; } if test -z "$MKDIR_P"; then - if ${ac_cv_path_mkdir+:} false; then : - $as_echo_n "(cached) " >&6 -else + if test ${ac_cv_path_mkdir+y} +then : + printf %s "(cached) " >&6 +else $as_nop as_save_IFS=$IFS; IFS=$PATH_SEPARATOR for as_dir in $PATH$PATH_SEPARATOR/opt/sfw/bin do IFS=$as_save_IFS - test -z "$as_dir" && as_dir=. + case $as_dir in #((( + '') as_dir=./ ;; + */) ;; + *) as_dir=$as_dir/ ;; + esac for ac_prog in mkdir gmkdir; do for ac_exec_ext in '' $ac_executable_extensions; do - as_fn_executable_p "$as_dir/$ac_prog$ac_exec_ext" || continue - case `"$as_dir/$ac_prog$ac_exec_ext" --version 2>&1` in #( - 'mkdir (GNU coreutils) '* | \ - 'mkdir (coreutils) '* | \ + as_fn_executable_p "$as_dir$ac_prog$ac_exec_ext" || continue + case `"$as_dir$ac_prog$ac_exec_ext" --version 2>&1` in #( + 'mkdir ('*'coreutils) '* | \ + 'BusyBox '* | \ 'mkdir (fileutils) '4.1*) - ac_cv_path_mkdir=$as_dir/$ac_prog$ac_exec_ext + ac_cv_path_mkdir=$as_dir$ac_prog$ac_exec_ext break 3;; esac done @@ -2838,7 +3426,7 @@ IFS=$as_save_IFS fi test -d ./--version && rmdir ./--version - if test "${ac_cv_path_mkdir+set}" = set; then + if test ${ac_cv_path_mkdir+y}; then MKDIR_P="$ac_cv_path_mkdir -p" else # As a last resort, use the slow shell script. Don't cache a @@ -2848,18 +3436,19 @@ fi MKDIR_P="$ac_install_sh -d" fi fi -{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $MKDIR_P" >&5 -$as_echo "$MKDIR_P" >&6; } +{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $MKDIR_P" >&5 +printf "%s\n" "$MKDIR_P" >&6; } for ac_prog in gawk mawk nawk awk do # Extract the first word of "$ac_prog", so it can be a program name with args. set dummy $ac_prog; ac_word=$2 -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 -$as_echo_n "checking for $ac_word... " >&6; } -if ${ac_cv_prog_AWK+:} false; then : - $as_echo_n "(cached) " >&6 -else +{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 +printf %s "checking for $ac_word... " >&6; } +if test ${ac_cv_prog_AWK+y} +then : + printf %s "(cached) " >&6 +else $as_nop if test -n "$AWK"; then ac_cv_prog_AWK="$AWK" # Let the user override the test. else @@ -2867,11 +3456,15 @@ as_save_IFS=$IFS; IFS=$PATH_SEPARATOR for as_dir in $PATH do IFS=$as_save_IFS - test -z "$as_dir" && as_dir=. + case $as_dir in #((( + '') as_dir=./ ;; + */) ;; + *) as_dir=$as_dir/ ;; + esac for ac_exec_ext in '' $ac_executable_extensions; do - if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then + if as_fn_executable_p "$as_dir$ac_word$ac_exec_ext"; then ac_cv_prog_AWK="$ac_prog" - $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 + printf "%s\n" "$as_me:${as_lineno-$LINENO}: found $as_dir$ac_word$ac_exec_ext" >&5 break 2 fi done @@ -2882,24 +3475,25 @@ fi fi AWK=$ac_cv_prog_AWK if test -n "$AWK"; then - { $as_echo "$as_me:${as_lineno-$LINENO}: result: $AWK" >&5 -$as_echo "$AWK" >&6; } + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $AWK" >&5 +printf "%s\n" "$AWK" >&6; } else - { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 -$as_echo "no" >&6; } + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: no" >&5 +printf "%s\n" "no" >&6; } fi test -n "$AWK" && break done -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether ${MAKE-make} sets \$(MAKE)" >&5 -$as_echo_n "checking whether ${MAKE-make} sets \$(MAKE)... 
" >&6; } +{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether ${MAKE-make} sets \$(MAKE)" >&5 +printf %s "checking whether ${MAKE-make} sets \$(MAKE)... " >&6; } set x ${MAKE-make} -ac_make=`$as_echo "$2" | sed 's/+/p/g; s/[^a-zA-Z0-9_]/_/g'` -if eval \${ac_cv_prog_make_${ac_make}_set+:} false; then : - $as_echo_n "(cached) " >&6 -else +ac_make=`printf "%s\n" "$2" | sed 's/+/p/g; s/[^a-zA-Z0-9_]/_/g'` +if eval test \${ac_cv_prog_make_${ac_make}_set+y} +then : + printf %s "(cached) " >&6 +else $as_nop cat >conftest.make <<\_ACEOF SHELL = /bin/sh all: @@ -2915,12 +3509,12 @@ esac rm -f conftest.make fi if eval test \$ac_cv_prog_make_${ac_make}_set = yes; then - { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5 -$as_echo "yes" >&6; } + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: yes" >&5 +printf "%s\n" "yes" >&6; } SET_MAKE= else - { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 -$as_echo "no" >&6; } + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: no" >&5 +printf "%s\n" "no" >&6; } SET_MAKE="MAKE=${MAKE-make}" fi @@ -2934,7 +3528,8 @@ fi rmdir .tst 2>/dev/null # Check whether --enable-silent-rules was given. -if test "${enable_silent_rules+set}" = set; then : +if test ${enable_silent_rules+y} +then : enableval=$enable_silent_rules; fi @@ -2944,12 +3539,13 @@ case $enable_silent_rules in # ((( *) AM_DEFAULT_VERBOSITY=1;; esac am_make=${MAKE-make} -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether $am_make supports nested variables" >&5 -$as_echo_n "checking whether $am_make supports nested variables... " >&6; } -if ${am_cv_make_support_nested_variables+:} false; then : - $as_echo_n "(cached) " >&6 -else - if $as_echo 'TRUE=$(BAR$(V)) +{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether $am_make supports nested variables" >&5 +printf %s "checking whether $am_make supports nested variables... " >&6; } +if test ${am_cv_make_support_nested_variables+y} +then : + printf %s "(cached) " >&6 +else $as_nop + if printf "%s\n" 'TRUE=$(BAR$(V)) BAR0=false BAR1=true V=1 @@ -2961,8 +3557,8 @@ else am_cv_make_support_nested_variables=no fi fi -{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $am_cv_make_support_nested_variables" >&5 -$as_echo "$am_cv_make_support_nested_variables" >&6; } +{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $am_cv_make_support_nested_variables" >&5 +printf "%s\n" "$am_cv_make_support_nested_variables" >&6; } if test $am_cv_make_support_nested_variables = yes; then AM_V='$(V)' AM_DEFAULT_V='$(AM_DEFAULT_VERBOSITY)' @@ -2997,14 +3593,10 @@ fi VERSION='1.1.0' -cat >>confdefs.h <<_ACEOF -#define PACKAGE "$PACKAGE" -_ACEOF +printf "%s\n" "#define PACKAGE \"$PACKAGE\"" >>confdefs.h -cat >>confdefs.h <<_ACEOF -#define VERSION "$VERSION" -_ACEOF +printf "%s\n" "#define VERSION \"$VERSION\"" >>confdefs.h # Some tools Automake needs. @@ -3024,8 +3616,8 @@ MAKEINFO=${MAKEINFO-"${am_missing_run}makeinfo"} # For better backward compatibility. To be removed once Automake 1.9.x # dies out for good. For more background, see: -# -# +# +# mkdir_p='$(MKDIR_P)' # We need awk for the "check" target (and possibly the TAP driver). The @@ -3076,7 +3668,7 @@ END Aborting the configuration process, to ensure you take notice of the issue. You can download and install GNU coreutils to get an 'rm' implementation -that behaves properly: . +that behaves properly: . 
If you want to complete the configuration process using your problematic 'rm' anyway, export the environment variable ACCEPT_INFERIOR_RM_PROGRAM @@ -3089,17 +3681,18 @@ fi -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether to enable maintainer-specific portions of Makefiles" >&5 -$as_echo_n "checking whether to enable maintainer-specific portions of Makefiles... " >&6; } +{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether to enable maintainer-specific portions of Makefiles" >&5 +printf %s "checking whether to enable maintainer-specific portions of Makefiles... " >&6; } # Check whether --enable-maintainer-mode was given. -if test "${enable_maintainer_mode+set}" = set; then : +if test ${enable_maintainer_mode+y} +then : enableval=$enable_maintainer_mode; USE_MAINTAINER_MODE=$enableval -else +else $as_nop USE_MAINTAINER_MODE=no fi - { $as_echo "$as_me:${as_lineno-$LINENO}: result: $USE_MAINTAINER_MODE" >&5 -$as_echo "$USE_MAINTAINER_MODE" >&6; } + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $USE_MAINTAINER_MODE" >&5 +printf "%s\n" "$USE_MAINTAINER_MODE" >&6; } if test $USE_MAINTAINER_MODE = yes; then MAINTAINER_MODE_TRUE= MAINTAINER_MODE_FALSE='#' @@ -3112,52 +3705,62 @@ fi + + + + + + + + + DEPDIR="${am__leading_dot}deps" ac_config_commands="$ac_config_commands depfiles" - -am_make=${MAKE-make} -cat > confinc << 'END' +{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether ${MAKE-make} supports the include directive" >&5 +printf %s "checking whether ${MAKE-make} supports the include directive... " >&6; } +cat > confinc.mk << 'END' am__doit: - @echo this is the am__doit target + @echo this is the am__doit target >confinc.out .PHONY: am__doit END -# If we don't find an include directive, just comment out the code. -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for style of include used by $am_make" >&5 -$as_echo_n "checking for style of include used by $am_make... " >&6; } am__include="#" am__quote= -_am_result=none -# First try GNU make style include. -echo "include confinc" > confmf -# Ignore all kinds of additional output from 'make'. -case `$am_make -s -f confmf 2> /dev/null` in #( -*the\ am__doit\ target*) - am__include=include - am__quote= - _am_result=GNU - ;; -esac -# Now try BSD make style include. -if test "$am__include" = "#"; then - echo '.include "confinc"' > confmf - case `$am_make -s -f confmf 2> /dev/null` in #( - *the\ am__doit\ target*) - am__include=.include - am__quote="\"" - _am_result=BSD +# BSD make does it like this. +echo '.include "confinc.mk" # ignored' > confmf.BSD +# Other make implementations (GNU, Solaris 10, AIX) do it like this. +echo 'include confinc.mk # ignored' > confmf.GNU +_am_result=no +for s in GNU BSD; do + { echo "$as_me:$LINENO: ${MAKE-make} -f confmf.$s && cat confinc.out" >&5 + (${MAKE-make} -f confmf.$s && cat confinc.out) >&5 2>&5 + ac_status=$? + echo "$as_me:$LINENO: \$? 
= $ac_status" >&5 + (exit $ac_status); } + case $?:`cat confinc.out 2>/dev/null` in #( + '0:this is the am__doit target') : + case $s in #( + BSD) : + am__include='.include' am__quote='"' ;; #( + *) : + am__include='include' am__quote='' ;; +esac ;; #( + *) : ;; - esac -fi - - -{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $_am_result" >&5 -$as_echo "$_am_result" >&6; } -rm -f confinc confmf +esac + if test "$am__include" != "#"; then + _am_result="yes ($s style)" + break + fi +done +rm -f confinc.* confmf.* +{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: ${_am_result}" >&5 +printf "%s\n" "${_am_result}" >&6; } # Check whether --enable-dependency-tracking was given. -if test "${enable_dependency_tracking+set}" = set; then : +if test ${enable_dependency_tracking+y} +then : enableval=$enable_dependency_tracking; fi @@ -3183,11 +3786,12 @@ ac_compiler_gnu=$ac_cv_c_compiler_gnu if test -n "$ac_tool_prefix"; then # Extract the first word of "${ac_tool_prefix}gcc", so it can be a program name with args. set dummy ${ac_tool_prefix}gcc; ac_word=$2 -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 -$as_echo_n "checking for $ac_word... " >&6; } -if ${ac_cv_prog_CC+:} false; then : - $as_echo_n "(cached) " >&6 -else +{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 +printf %s "checking for $ac_word... " >&6; } +if test ${ac_cv_prog_CC+y} +then : + printf %s "(cached) " >&6 +else $as_nop if test -n "$CC"; then ac_cv_prog_CC="$CC" # Let the user override the test. else @@ -3195,11 +3799,15 @@ as_save_IFS=$IFS; IFS=$PATH_SEPARATOR for as_dir in $PATH do IFS=$as_save_IFS - test -z "$as_dir" && as_dir=. + case $as_dir in #((( + '') as_dir=./ ;; + */) ;; + *) as_dir=$as_dir/ ;; + esac for ac_exec_ext in '' $ac_executable_extensions; do - if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then + if as_fn_executable_p "$as_dir$ac_word$ac_exec_ext"; then ac_cv_prog_CC="${ac_tool_prefix}gcc" - $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 + printf "%s\n" "$as_me:${as_lineno-$LINENO}: found $as_dir$ac_word$ac_exec_ext" >&5 break 2 fi done @@ -3210,11 +3818,11 @@ fi fi CC=$ac_cv_prog_CC if test -n "$CC"; then - { $as_echo "$as_me:${as_lineno-$LINENO}: result: $CC" >&5 -$as_echo "$CC" >&6; } + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $CC" >&5 +printf "%s\n" "$CC" >&6; } else - { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 -$as_echo "no" >&6; } + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: no" >&5 +printf "%s\n" "no" >&6; } fi @@ -3223,11 +3831,12 @@ if test -z "$ac_cv_prog_CC"; then ac_ct_CC=$CC # Extract the first word of "gcc", so it can be a program name with args. set dummy gcc; ac_word=$2 -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 -$as_echo_n "checking for $ac_word... " >&6; } -if ${ac_cv_prog_ac_ct_CC+:} false; then : - $as_echo_n "(cached) " >&6 -else +{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 +printf %s "checking for $ac_word... " >&6; } +if test ${ac_cv_prog_ac_ct_CC+y} +then : + printf %s "(cached) " >&6 +else $as_nop if test -n "$ac_ct_CC"; then ac_cv_prog_ac_ct_CC="$ac_ct_CC" # Let the user override the test. else @@ -3235,11 +3844,15 @@ as_save_IFS=$IFS; IFS=$PATH_SEPARATOR for as_dir in $PATH do IFS=$as_save_IFS - test -z "$as_dir" && as_dir=. 
+ case $as_dir in #((( + '') as_dir=./ ;; + */) ;; + *) as_dir=$as_dir/ ;; + esac for ac_exec_ext in '' $ac_executable_extensions; do - if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then + if as_fn_executable_p "$as_dir$ac_word$ac_exec_ext"; then ac_cv_prog_ac_ct_CC="gcc" - $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 + printf "%s\n" "$as_me:${as_lineno-$LINENO}: found $as_dir$ac_word$ac_exec_ext" >&5 break 2 fi done @@ -3250,11 +3863,11 @@ fi fi ac_ct_CC=$ac_cv_prog_ac_ct_CC if test -n "$ac_ct_CC"; then - { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_ct_CC" >&5 -$as_echo "$ac_ct_CC" >&6; } + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $ac_ct_CC" >&5 +printf "%s\n" "$ac_ct_CC" >&6; } else - { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 -$as_echo "no" >&6; } + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: no" >&5 +printf "%s\n" "no" >&6; } fi if test "x$ac_ct_CC" = x; then @@ -3262,8 +3875,8 @@ fi else case $cross_compiling:$ac_tool_warned in yes:) -{ $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: using cross tools not prefixed with host triplet" >&5 -$as_echo "$as_me: WARNING: using cross tools not prefixed with host triplet" >&2;} +{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: WARNING: using cross tools not prefixed with host triplet" >&5 +printf "%s\n" "$as_me: WARNING: using cross tools not prefixed with host triplet" >&2;} ac_tool_warned=yes ;; esac CC=$ac_ct_CC @@ -3276,11 +3889,12 @@ if test -z "$CC"; then if test -n "$ac_tool_prefix"; then # Extract the first word of "${ac_tool_prefix}cc", so it can be a program name with args. set dummy ${ac_tool_prefix}cc; ac_word=$2 -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 -$as_echo_n "checking for $ac_word... " >&6; } -if ${ac_cv_prog_CC+:} false; then : - $as_echo_n "(cached) " >&6 -else +{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 +printf %s "checking for $ac_word... " >&6; } +if test ${ac_cv_prog_CC+y} +then : + printf %s "(cached) " >&6 +else $as_nop if test -n "$CC"; then ac_cv_prog_CC="$CC" # Let the user override the test. else @@ -3288,11 +3902,15 @@ as_save_IFS=$IFS; IFS=$PATH_SEPARATOR for as_dir in $PATH do IFS=$as_save_IFS - test -z "$as_dir" && as_dir=. + case $as_dir in #((( + '') as_dir=./ ;; + */) ;; + *) as_dir=$as_dir/ ;; + esac for ac_exec_ext in '' $ac_executable_extensions; do - if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then + if as_fn_executable_p "$as_dir$ac_word$ac_exec_ext"; then ac_cv_prog_CC="${ac_tool_prefix}cc" - $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 + printf "%s\n" "$as_me:${as_lineno-$LINENO}: found $as_dir$ac_word$ac_exec_ext" >&5 break 2 fi done @@ -3303,11 +3921,11 @@ fi fi CC=$ac_cv_prog_CC if test -n "$CC"; then - { $as_echo "$as_me:${as_lineno-$LINENO}: result: $CC" >&5 -$as_echo "$CC" >&6; } + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $CC" >&5 +printf "%s\n" "$CC" >&6; } else - { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 -$as_echo "no" >&6; } + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: no" >&5 +printf "%s\n" "no" >&6; } fi @@ -3316,11 +3934,12 @@ fi if test -z "$CC"; then # Extract the first word of "cc", so it can be a program name with args. set dummy cc; ac_word=$2 -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 -$as_echo_n "checking for $ac_word... 
" >&6; } -if ${ac_cv_prog_CC+:} false; then : - $as_echo_n "(cached) " >&6 -else +{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 +printf %s "checking for $ac_word... " >&6; } +if test ${ac_cv_prog_CC+y} +then : + printf %s "(cached) " >&6 +else $as_nop if test -n "$CC"; then ac_cv_prog_CC="$CC" # Let the user override the test. else @@ -3329,15 +3948,19 @@ as_save_IFS=$IFS; IFS=$PATH_SEPARATOR for as_dir in $PATH do IFS=$as_save_IFS - test -z "$as_dir" && as_dir=. + case $as_dir in #((( + '') as_dir=./ ;; + */) ;; + *) as_dir=$as_dir/ ;; + esac for ac_exec_ext in '' $ac_executable_extensions; do - if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then - if test "$as_dir/$ac_word$ac_exec_ext" = "/usr/ucb/cc"; then + if as_fn_executable_p "$as_dir$ac_word$ac_exec_ext"; then + if test "$as_dir$ac_word$ac_exec_ext" = "/usr/ucb/cc"; then ac_prog_rejected=yes continue fi ac_cv_prog_CC="cc" - $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 + printf "%s\n" "$as_me:${as_lineno-$LINENO}: found $as_dir$ac_word$ac_exec_ext" >&5 break 2 fi done @@ -3353,18 +3976,18 @@ if test $ac_prog_rejected = yes; then # However, it has the same basename, so the bogon will be chosen # first if we set CC to just the basename; use the full file name. shift - ac_cv_prog_CC="$as_dir/$ac_word${1+' '}$@" + ac_cv_prog_CC="$as_dir$ac_word${1+' '}$@" fi fi fi fi CC=$ac_cv_prog_CC if test -n "$CC"; then - { $as_echo "$as_me:${as_lineno-$LINENO}: result: $CC" >&5 -$as_echo "$CC" >&6; } + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $CC" >&5 +printf "%s\n" "$CC" >&6; } else - { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 -$as_echo "no" >&6; } + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: no" >&5 +printf "%s\n" "no" >&6; } fi @@ -3375,11 +3998,12 @@ if test -z "$CC"; then do # Extract the first word of "$ac_tool_prefix$ac_prog", so it can be a program name with args. set dummy $ac_tool_prefix$ac_prog; ac_word=$2 -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 -$as_echo_n "checking for $ac_word... " >&6; } -if ${ac_cv_prog_CC+:} false; then : - $as_echo_n "(cached) " >&6 -else +{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 +printf %s "checking for $ac_word... " >&6; } +if test ${ac_cv_prog_CC+y} +then : + printf %s "(cached) " >&6 +else $as_nop if test -n "$CC"; then ac_cv_prog_CC="$CC" # Let the user override the test. else @@ -3387,11 +4011,15 @@ as_save_IFS=$IFS; IFS=$PATH_SEPARATOR for as_dir in $PATH do IFS=$as_save_IFS - test -z "$as_dir" && as_dir=. 
+ case $as_dir in #((( + '') as_dir=./ ;; + */) ;; + *) as_dir=$as_dir/ ;; + esac for ac_exec_ext in '' $ac_executable_extensions; do - if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then + if as_fn_executable_p "$as_dir$ac_word$ac_exec_ext"; then ac_cv_prog_CC="$ac_tool_prefix$ac_prog" - $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 + printf "%s\n" "$as_me:${as_lineno-$LINENO}: found $as_dir$ac_word$ac_exec_ext" >&5 break 2 fi done @@ -3402,11 +4030,11 @@ fi fi CC=$ac_cv_prog_CC if test -n "$CC"; then - { $as_echo "$as_me:${as_lineno-$LINENO}: result: $CC" >&5 -$as_echo "$CC" >&6; } + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $CC" >&5 +printf "%s\n" "$CC" >&6; } else - { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 -$as_echo "no" >&6; } + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: no" >&5 +printf "%s\n" "no" >&6; } fi @@ -3419,11 +4047,12 @@ if test -z "$CC"; then do # Extract the first word of "$ac_prog", so it can be a program name with args. set dummy $ac_prog; ac_word=$2 -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 -$as_echo_n "checking for $ac_word... " >&6; } -if ${ac_cv_prog_ac_ct_CC+:} false; then : - $as_echo_n "(cached) " >&6 -else +{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 +printf %s "checking for $ac_word... " >&6; } +if test ${ac_cv_prog_ac_ct_CC+y} +then : + printf %s "(cached) " >&6 +else $as_nop if test -n "$ac_ct_CC"; then ac_cv_prog_ac_ct_CC="$ac_ct_CC" # Let the user override the test. else @@ -3431,11 +4060,15 @@ as_save_IFS=$IFS; IFS=$PATH_SEPARATOR for as_dir in $PATH do IFS=$as_save_IFS - test -z "$as_dir" && as_dir=. + case $as_dir in #((( + '') as_dir=./ ;; + */) ;; + *) as_dir=$as_dir/ ;; + esac for ac_exec_ext in '' $ac_executable_extensions; do - if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then + if as_fn_executable_p "$as_dir$ac_word$ac_exec_ext"; then ac_cv_prog_ac_ct_CC="$ac_prog" - $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 + printf "%s\n" "$as_me:${as_lineno-$LINENO}: found $as_dir$ac_word$ac_exec_ext" >&5 break 2 fi done @@ -3446,11 +4079,11 @@ fi fi ac_ct_CC=$ac_cv_prog_ac_ct_CC if test -n "$ac_ct_CC"; then - { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_ct_CC" >&5 -$as_echo "$ac_ct_CC" >&6; } + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $ac_ct_CC" >&5 +printf "%s\n" "$ac_ct_CC" >&6; } else - { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 -$as_echo "no" >&6; } + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: no" >&5 +printf "%s\n" "no" >&6; } fi @@ -3462,34 +4095,138 @@ done else case $cross_compiling:$ac_tool_warned in yes:) -{ $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: using cross tools not prefixed with host triplet" >&5 -$as_echo "$as_me: WARNING: using cross tools not prefixed with host triplet" >&2;} +{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: WARNING: using cross tools not prefixed with host triplet" >&5 +printf "%s\n" "$as_me: WARNING: using cross tools not prefixed with host triplet" >&2;} +ac_tool_warned=yes ;; +esac + CC=$ac_ct_CC + fi +fi + +fi +if test -z "$CC"; then + if test -n "$ac_tool_prefix"; then + # Extract the first word of "${ac_tool_prefix}clang", so it can be a program name with args. +set dummy ${ac_tool_prefix}clang; ac_word=$2 +{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 +printf %s "checking for $ac_word... 
" >&6; } +if test ${ac_cv_prog_CC+y} +then : + printf %s "(cached) " >&6 +else $as_nop + if test -n "$CC"; then + ac_cv_prog_CC="$CC" # Let the user override the test. +else +as_save_IFS=$IFS; IFS=$PATH_SEPARATOR +for as_dir in $PATH +do + IFS=$as_save_IFS + case $as_dir in #((( + '') as_dir=./ ;; + */) ;; + *) as_dir=$as_dir/ ;; + esac + for ac_exec_ext in '' $ac_executable_extensions; do + if as_fn_executable_p "$as_dir$ac_word$ac_exec_ext"; then + ac_cv_prog_CC="${ac_tool_prefix}clang" + printf "%s\n" "$as_me:${as_lineno-$LINENO}: found $as_dir$ac_word$ac_exec_ext" >&5 + break 2 + fi +done + done +IFS=$as_save_IFS + +fi +fi +CC=$ac_cv_prog_CC +if test -n "$CC"; then + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $CC" >&5 +printf "%s\n" "$CC" >&6; } +else + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: no" >&5 +printf "%s\n" "no" >&6; } +fi + + +fi +if test -z "$ac_cv_prog_CC"; then + ac_ct_CC=$CC + # Extract the first word of "clang", so it can be a program name with args. +set dummy clang; ac_word=$2 +{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 +printf %s "checking for $ac_word... " >&6; } +if test ${ac_cv_prog_ac_ct_CC+y} +then : + printf %s "(cached) " >&6 +else $as_nop + if test -n "$ac_ct_CC"; then + ac_cv_prog_ac_ct_CC="$ac_ct_CC" # Let the user override the test. +else +as_save_IFS=$IFS; IFS=$PATH_SEPARATOR +for as_dir in $PATH +do + IFS=$as_save_IFS + case $as_dir in #((( + '') as_dir=./ ;; + */) ;; + *) as_dir=$as_dir/ ;; + esac + for ac_exec_ext in '' $ac_executable_extensions; do + if as_fn_executable_p "$as_dir$ac_word$ac_exec_ext"; then + ac_cv_prog_ac_ct_CC="clang" + printf "%s\n" "$as_me:${as_lineno-$LINENO}: found $as_dir$ac_word$ac_exec_ext" >&5 + break 2 + fi +done + done +IFS=$as_save_IFS + +fi +fi +ac_ct_CC=$ac_cv_prog_ac_ct_CC +if test -n "$ac_ct_CC"; then + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $ac_ct_CC" >&5 +printf "%s\n" "$ac_ct_CC" >&6; } +else + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: no" >&5 +printf "%s\n" "no" >&6; } +fi + + if test "x$ac_ct_CC" = x; then + CC="" + else + case $cross_compiling:$ac_tool_warned in +yes:) +{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: WARNING: using cross tools not prefixed with host triplet" >&5 +printf "%s\n" "$as_me: WARNING: using cross tools not prefixed with host triplet" >&2;} ac_tool_warned=yes ;; esac CC=$ac_ct_CC fi +else + CC="$ac_cv_prog_CC" fi fi -test -z "$CC" && { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5 -$as_echo "$as_me: error: in \`$ac_pwd':" >&2;} +test -z "$CC" && { { printf "%s\n" "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5 +printf "%s\n" "$as_me: error: in \`$ac_pwd':" >&2;} as_fn_error $? "no acceptable C compiler found in \$PATH See \`config.log' for more details" "$LINENO" 5; } # Provide some information about the compiler. -$as_echo "$as_me:${as_lineno-$LINENO}: checking for C compiler version" >&5 +printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for C compiler version" >&5 set X $ac_compile ac_compiler=$2 -for ac_option in --version -v -V -qversion; do +for ac_option in --version -v -V -qversion -version; do { { ac_try="$ac_compiler $ac_option >&5" case "(($ac_try" in *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; *) ac_try_echo=$ac_try;; esac eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\"" -$as_echo "$ac_try_echo"; } >&5 +printf "%s\n" "$ac_try_echo"; } >&5 (eval "$ac_compiler $ac_option >&5") 2>conftest.err ac_status=$? 
if test -s conftest.err; then @@ -3499,7 +4236,7 @@ $as_echo "$ac_try_echo"; } >&5 cat conftest.er1 >&5 fi rm -f conftest.er1 conftest.err - $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 + printf "%s\n" "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 test $ac_status = 0; } done @@ -3507,7 +4244,7 @@ cat confdefs.h - <<_ACEOF >conftest.$ac_ext /* end confdefs.h. */ int -main () +main (void) { ; @@ -3519,9 +4256,9 @@ ac_clean_files="$ac_clean_files a.out a.out.dSYM a.exe b.out" # Try to create an executable without -o first, disregard a.out. # It will help us diagnose broken compilers, and finding out an intuition # of exeext. -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether the C compiler works" >&5 -$as_echo_n "checking whether the C compiler works... " >&6; } -ac_link_default=`$as_echo "$ac_link" | sed 's/ -o *conftest[^ ]*//'` +{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether the C compiler works" >&5 +printf %s "checking whether the C compiler works... " >&6; } +ac_link_default=`printf "%s\n" "$ac_link" | sed 's/ -o *conftest[^ ]*//'` # The possible output files: ac_files="a.out conftest.exe conftest a.exe a_out.exe b.out conftest.*" @@ -3542,11 +4279,12 @@ case "(($ac_try" in *) ac_try_echo=$ac_try;; esac eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\"" -$as_echo "$ac_try_echo"; } >&5 +printf "%s\n" "$ac_try_echo"; } >&5 (eval "$ac_link_default") 2>&5 ac_status=$? - $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 - test $ac_status = 0; }; then : + printf "%s\n" "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 + test $ac_status = 0; } +then : # Autoconf-2.13 could set the ac_cv_exeext variable to `no'. # So ignore a value of `no', otherwise this would lead to `EXEEXT = no' # in a Makefile. We should not override ac_cv_exeext if it was cached, @@ -3563,7 +4301,7 @@ do # certainly right. break;; *.* ) - if test "${ac_cv_exeext+set}" = set && test "$ac_cv_exeext" != no; + if test ${ac_cv_exeext+y} && test "$ac_cv_exeext" != no; then :; else ac_cv_exeext=`expr "$ac_file" : '[^.]*\(\..*\)'` fi @@ -3579,44 +4317,46 @@ do done test "$ac_cv_exeext" = no && ac_cv_exeext= -else +else $as_nop ac_file='' fi -if test -z "$ac_file"; then : - { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 -$as_echo "no" >&6; } -$as_echo "$as_me: failed program was:" >&5 +if test -z "$ac_file" +then : + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: no" >&5 +printf "%s\n" "no" >&6; } +printf "%s\n" "$as_me: failed program was:" >&5 sed 's/^/| /' conftest.$ac_ext >&5 -{ { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5 -$as_echo "$as_me: error: in \`$ac_pwd':" >&2;} +{ { printf "%s\n" "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5 +printf "%s\n" "$as_me: error: in \`$ac_pwd':" >&2;} as_fn_error 77 "C compiler cannot create executables See \`config.log' for more details" "$LINENO" 5; } -else - { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5 -$as_echo "yes" >&6; } -fi -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for C compiler default output file name" >&5 -$as_echo_n "checking for C compiler default output file name... 
" >&6; } -{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_file" >&5 -$as_echo "$ac_file" >&6; } +else $as_nop + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: yes" >&5 +printf "%s\n" "yes" >&6; } +fi +{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for C compiler default output file name" >&5 +printf %s "checking for C compiler default output file name... " >&6; } +{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $ac_file" >&5 +printf "%s\n" "$ac_file" >&6; } ac_exeext=$ac_cv_exeext rm -f -r a.out a.out.dSYM a.exe conftest$ac_cv_exeext b.out ac_clean_files=$ac_clean_files_save -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for suffix of executables" >&5 -$as_echo_n "checking for suffix of executables... " >&6; } +{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for suffix of executables" >&5 +printf %s "checking for suffix of executables... " >&6; } if { { ac_try="$ac_link" case "(($ac_try" in *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; *) ac_try_echo=$ac_try;; esac eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\"" -$as_echo "$ac_try_echo"; } >&5 +printf "%s\n" "$ac_try_echo"; } >&5 (eval "$ac_link") 2>&5 ac_status=$? - $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 - test $ac_status = 0; }; then : + printf "%s\n" "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 + test $ac_status = 0; } +then : # If both `conftest.exe' and `conftest' are `present' (well, observable) # catch `conftest.exe'. For instance with Cygwin, `ls conftest' will # work properly (i.e., refer to `conftest.exe'), while it won't with @@ -3630,15 +4370,15 @@ for ac_file in conftest.exe conftest conftest.*; do * ) break;; esac done -else - { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5 -$as_echo "$as_me: error: in \`$ac_pwd':" >&2;} +else $as_nop + { { printf "%s\n" "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5 +printf "%s\n" "$as_me: error: in \`$ac_pwd':" >&2;} as_fn_error $? "cannot compute suffix of executables: cannot compile and link See \`config.log' for more details" "$LINENO" 5; } fi rm -f conftest conftest$ac_cv_exeext -{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_exeext" >&5 -$as_echo "$ac_cv_exeext" >&6; } +{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $ac_cv_exeext" >&5 +printf "%s\n" "$ac_cv_exeext" >&6; } rm -f conftest.$ac_ext EXEEXT=$ac_cv_exeext @@ -3647,7 +4387,7 @@ cat confdefs.h - <<_ACEOF >conftest.$ac_ext /* end confdefs.h. */ #include int -main () +main (void) { FILE *f = fopen ("conftest.out", "w"); return ferror (f) || fclose (f) != 0; @@ -3659,8 +4399,8 @@ _ACEOF ac_clean_files="$ac_clean_files conftest.out" # Check that the compiler produces executables we can run. If not, either # the compiler is broken, or we cross compile. -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether we are cross compiling" >&5 -$as_echo_n "checking whether we are cross compiling... " >&6; } +{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether we are cross compiling" >&5 +printf %s "checking whether we are cross compiling... " >&6; } if test "$cross_compiling" != yes; then { { ac_try="$ac_link" case "(($ac_try" in @@ -3668,10 +4408,10 @@ case "(($ac_try" in *) ac_try_echo=$ac_try;; esac eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\"" -$as_echo "$ac_try_echo"; } >&5 +printf "%s\n" "$ac_try_echo"; } >&5 (eval "$ac_link") 2>&5 ac_status=$? - $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 + printf "%s\n" "$as_me:${as_lineno-$LINENO}: \$? 
= $ac_status" >&5 test $ac_status = 0; } if { ac_try='./conftest$ac_cv_exeext' { { case "(($ac_try" in @@ -3679,39 +4419,40 @@ $as_echo "$ac_try_echo"; } >&5 *) ac_try_echo=$ac_try;; esac eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\"" -$as_echo "$ac_try_echo"; } >&5 +printf "%s\n" "$ac_try_echo"; } >&5 (eval "$ac_try") 2>&5 ac_status=$? - $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 + printf "%s\n" "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 test $ac_status = 0; }; }; then cross_compiling=no else if test "$cross_compiling" = maybe; then cross_compiling=yes else - { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5 -$as_echo "$as_me: error: in \`$ac_pwd':" >&2;} -as_fn_error $? "cannot run C compiled programs. + { { printf "%s\n" "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5 +printf "%s\n" "$as_me: error: in \`$ac_pwd':" >&2;} +as_fn_error 77 "cannot run C compiled programs. If you meant to cross compile, use \`--host'. See \`config.log' for more details" "$LINENO" 5; } fi fi fi -{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $cross_compiling" >&5 -$as_echo "$cross_compiling" >&6; } +{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $cross_compiling" >&5 +printf "%s\n" "$cross_compiling" >&6; } rm -f conftest.$ac_ext conftest$ac_cv_exeext conftest.out ac_clean_files=$ac_clean_files_save -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for suffix of object files" >&5 -$as_echo_n "checking for suffix of object files... " >&6; } -if ${ac_cv_objext+:} false; then : - $as_echo_n "(cached) " >&6 -else +{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for suffix of object files" >&5 +printf %s "checking for suffix of object files... " >&6; } +if test ${ac_cv_objext+y} +then : + printf %s "(cached) " >&6 +else $as_nop cat confdefs.h - <<_ACEOF >conftest.$ac_ext /* end confdefs.h. */ int -main () +main (void) { ; @@ -3725,11 +4466,12 @@ case "(($ac_try" in *) ac_try_echo=$ac_try;; esac eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\"" -$as_echo "$ac_try_echo"; } >&5 +printf "%s\n" "$ac_try_echo"; } >&5 (eval "$ac_compile") 2>&5 ac_status=$? - $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 - test $ac_status = 0; }; then : + printf "%s\n" "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 + test $ac_status = 0; } +then : for ac_file in conftest.o conftest.obj conftest.*; do test -f "$ac_file" || continue; case $ac_file in @@ -3738,31 +4480,32 @@ $as_echo "$ac_try_echo"; } >&5 break;; esac done -else - $as_echo "$as_me: failed program was:" >&5 +else $as_nop + printf "%s\n" "$as_me: failed program was:" >&5 sed 's/^/| /' conftest.$ac_ext >&5 -{ { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5 -$as_echo "$as_me: error: in \`$ac_pwd':" >&2;} +{ { printf "%s\n" "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5 +printf "%s\n" "$as_me: error: in \`$ac_pwd':" >&2;} as_fn_error $? "cannot compute suffix of object files: cannot compile See \`config.log' for more details" "$LINENO" 5; } fi rm -f conftest.$ac_cv_objext conftest.$ac_ext fi -{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_objext" >&5 -$as_echo "$ac_cv_objext" >&6; } +{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $ac_cv_objext" >&5 +printf "%s\n" "$ac_cv_objext" >&6; } OBJEXT=$ac_cv_objext ac_objext=$OBJEXT -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether we are using the GNU C compiler" >&5 -$as_echo_n "checking whether we are using the GNU C compiler... 
" >&6; } -if ${ac_cv_c_compiler_gnu+:} false; then : - $as_echo_n "(cached) " >&6 -else +{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether the compiler supports GNU C" >&5 +printf %s "checking whether the compiler supports GNU C... " >&6; } +if test ${ac_cv_c_compiler_gnu+y} +then : + printf %s "(cached) " >&6 +else $as_nop cat confdefs.h - <<_ACEOF >conftest.$ac_ext /* end confdefs.h. */ int -main () +main (void) { #ifndef __GNUC__ choke me @@ -3772,29 +4515,33 @@ main () return 0; } _ACEOF -if ac_fn_c_try_compile "$LINENO"; then : +if ac_fn_c_try_compile "$LINENO" +then : ac_compiler_gnu=yes -else +else $as_nop ac_compiler_gnu=no fi -rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext +rm -f core conftest.err conftest.$ac_objext conftest.beam conftest.$ac_ext ac_cv_c_compiler_gnu=$ac_compiler_gnu fi -{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_c_compiler_gnu" >&5 -$as_echo "$ac_cv_c_compiler_gnu" >&6; } +{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $ac_cv_c_compiler_gnu" >&5 +printf "%s\n" "$ac_cv_c_compiler_gnu" >&6; } +ac_compiler_gnu=$ac_cv_c_compiler_gnu + if test $ac_compiler_gnu = yes; then GCC=yes else GCC= fi -ac_test_CFLAGS=${CFLAGS+set} +ac_test_CFLAGS=${CFLAGS+y} ac_save_CFLAGS=$CFLAGS -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether $CC accepts -g" >&5 -$as_echo_n "checking whether $CC accepts -g... " >&6; } -if ${ac_cv_prog_cc_g+:} false; then : - $as_echo_n "(cached) " >&6 -else +{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether $CC accepts -g" >&5 +printf %s "checking whether $CC accepts -g... " >&6; } +if test ${ac_cv_prog_cc_g+y} +then : + printf %s "(cached) " >&6 +else $as_nop ac_save_c_werror_flag=$ac_c_werror_flag ac_c_werror_flag=yes ac_cv_prog_cc_g=no @@ -3803,57 +4550,60 @@ else /* end confdefs.h. */ int -main () +main (void) { ; return 0; } _ACEOF -if ac_fn_c_try_compile "$LINENO"; then : +if ac_fn_c_try_compile "$LINENO" +then : ac_cv_prog_cc_g=yes -else +else $as_nop CFLAGS="" cat confdefs.h - <<_ACEOF >conftest.$ac_ext /* end confdefs.h. */ int -main () +main (void) { ; return 0; } _ACEOF -if ac_fn_c_try_compile "$LINENO"; then : +if ac_fn_c_try_compile "$LINENO" +then : -else +else $as_nop ac_c_werror_flag=$ac_save_c_werror_flag CFLAGS="-g" cat confdefs.h - <<_ACEOF >conftest.$ac_ext /* end confdefs.h. */ int -main () +main (void) { ; return 0; } _ACEOF -if ac_fn_c_try_compile "$LINENO"; then : +if ac_fn_c_try_compile "$LINENO" +then : ac_cv_prog_cc_g=yes fi -rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext +rm -f core conftest.err conftest.$ac_objext conftest.beam conftest.$ac_ext fi -rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext +rm -f core conftest.err conftest.$ac_objext conftest.beam conftest.$ac_ext fi -rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext +rm -f core conftest.err conftest.$ac_objext conftest.beam conftest.$ac_ext ac_c_werror_flag=$ac_save_c_werror_flag fi -{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_prog_cc_g" >&5 -$as_echo "$ac_cv_prog_cc_g" >&6; } -if test "$ac_test_CFLAGS" = set; then +{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $ac_cv_prog_cc_g" >&5 +printf "%s\n" "$ac_cv_prog_cc_g" >&6; } +if test $ac_test_CFLAGS; then CFLAGS=$ac_save_CFLAGS elif test $ac_cv_prog_cc_g = yes; then if test "$GCC" = yes; then @@ -3868,94 +4618,144 @@ else CFLAGS= fi fi -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $CC option to accept ISO C89" >&5 -$as_echo_n "checking for $CC option to accept ISO C89... 
" >&6; } -if ${ac_cv_prog_cc_c89+:} false; then : - $as_echo_n "(cached) " >&6 -else - ac_cv_prog_cc_c89=no +ac_prog_cc_stdc=no +if test x$ac_prog_cc_stdc = xno +then : + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for $CC option to enable C11 features" >&5 +printf %s "checking for $CC option to enable C11 features... " >&6; } +if test ${ac_cv_prog_cc_c11+y} +then : + printf %s "(cached) " >&6 +else $as_nop + ac_cv_prog_cc_c11=no ac_save_CC=$CC cat confdefs.h - <<_ACEOF >conftest.$ac_ext /* end confdefs.h. */ -#include -#include -struct stat; -/* Most of the following tests are stolen from RCS 5.7's src/conf.sh. */ -struct buf { int x; }; -FILE * (*rcsopen) (struct buf *, struct stat *, int); -static char *e (p, i) - char **p; - int i; -{ - return p[i]; -} -static char *f (char * (*g) (char **, int), char **p, ...) -{ - char *s; - va_list v; - va_start (v,p); - s = g (p, va_arg (v,int)); - va_end (v); - return s; -} - -/* OSF 4.0 Compaq cc is some sort of almost-ANSI by default. It has - function prototypes and stuff, but not '\xHH' hex character constants. - These don't provoke an error unfortunately, instead are silently treated - as 'x'. The following induces an error, until -std is added to get - proper ANSI mode. Curiously '\x00'!='x' always comes out true, for an - array size at least. It's necessary to write '\x00'==0 to get something - that's true only with -std. */ -int osf4_cc_array ['\x00' == 0 ? 1 : -1]; +$ac_c_conftest_c11_program +_ACEOF +for ac_arg in '' -std=gnu11 +do + CC="$ac_save_CC $ac_arg" + if ac_fn_c_try_compile "$LINENO" +then : + ac_cv_prog_cc_c11=$ac_arg +fi +rm -f core conftest.err conftest.$ac_objext conftest.beam + test "x$ac_cv_prog_cc_c11" != "xno" && break +done +rm -f conftest.$ac_ext +CC=$ac_save_CC +fi -/* IBM C 6 for AIX is almost-ANSI by default, but it replaces macro parameters - inside strings and character constants. */ -#define FOO(x) 'x' -int xlc6_cc_array[FOO(a) == 'x' ? 1 : -1]; +if test "x$ac_cv_prog_cc_c11" = xno +then : + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: unsupported" >&5 +printf "%s\n" "unsupported" >&6; } +else $as_nop + if test "x$ac_cv_prog_cc_c11" = x +then : + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: none needed" >&5 +printf "%s\n" "none needed" >&6; } +else $as_nop + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $ac_cv_prog_cc_c11" >&5 +printf "%s\n" "$ac_cv_prog_cc_c11" >&6; } + CC="$CC $ac_cv_prog_cc_c11" +fi + ac_cv_prog_cc_stdc=$ac_cv_prog_cc_c11 + ac_prog_cc_stdc=c11 +fi +fi +if test x$ac_prog_cc_stdc = xno +then : + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for $CC option to enable C99 features" >&5 +printf %s "checking for $CC option to enable C99 features... " >&6; } +if test ${ac_cv_prog_cc_c99+y} +then : + printf %s "(cached) " >&6 +else $as_nop + ac_cv_prog_cc_c99=no +ac_save_CC=$CC +cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. 
*/ +$ac_c_conftest_c99_program +_ACEOF +for ac_arg in '' -std=gnu99 -std=c99 -c99 -qlanglvl=extc1x -qlanglvl=extc99 -AC99 -D_STDC_C99= +do + CC="$ac_save_CC $ac_arg" + if ac_fn_c_try_compile "$LINENO" +then : + ac_cv_prog_cc_c99=$ac_arg +fi +rm -f core conftest.err conftest.$ac_objext conftest.beam + test "x$ac_cv_prog_cc_c99" != "xno" && break +done +rm -f conftest.$ac_ext +CC=$ac_save_CC +fi -int test (int i, double x); -struct s1 {int (*f) (int a);}; -struct s2 {int (*f) (double a);}; -int pairnames (int, char **, FILE *(*)(struct buf *, struct stat *, int), int, int); -int argc; -char **argv; -int -main () -{ -return f (e, argv, 0) != argv[0] || f (e, argv, 1) != argv[1]; - ; - return 0; -} +if test "x$ac_cv_prog_cc_c99" = xno +then : + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: unsupported" >&5 +printf "%s\n" "unsupported" >&6; } +else $as_nop + if test "x$ac_cv_prog_cc_c99" = x +then : + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: none needed" >&5 +printf "%s\n" "none needed" >&6; } +else $as_nop + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $ac_cv_prog_cc_c99" >&5 +printf "%s\n" "$ac_cv_prog_cc_c99" >&6; } + CC="$CC $ac_cv_prog_cc_c99" +fi + ac_cv_prog_cc_stdc=$ac_cv_prog_cc_c99 + ac_prog_cc_stdc=c99 +fi +fi +if test x$ac_prog_cc_stdc = xno +then : + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for $CC option to enable C89 features" >&5 +printf %s "checking for $CC option to enable C89 features... " >&6; } +if test ${ac_cv_prog_cc_c89+y} +then : + printf %s "(cached) " >&6 +else $as_nop + ac_cv_prog_cc_c89=no +ac_save_CC=$CC +cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +$ac_c_conftest_c89_program _ACEOF -for ac_arg in '' -qlanglvl=extc89 -qlanglvl=ansi -std \ - -Ae "-Aa -D_HPUX_SOURCE" "-Xc -D__EXTENSIONS__" +for ac_arg in '' -qlanglvl=extc89 -qlanglvl=ansi -std -Ae "-Aa -D_HPUX_SOURCE" "-Xc -D__EXTENSIONS__" do CC="$ac_save_CC $ac_arg" - if ac_fn_c_try_compile "$LINENO"; then : + if ac_fn_c_try_compile "$LINENO" +then : ac_cv_prog_cc_c89=$ac_arg fi -rm -f core conftest.err conftest.$ac_objext +rm -f core conftest.err conftest.$ac_objext conftest.beam test "x$ac_cv_prog_cc_c89" != "xno" && break done rm -f conftest.$ac_ext CC=$ac_save_CC - fi -# AC_CACHE_VAL -case "x$ac_cv_prog_cc_c89" in - x) - { $as_echo "$as_me:${as_lineno-$LINENO}: result: none needed" >&5 -$as_echo "none needed" >&6; } ;; - xno) - { $as_echo "$as_me:${as_lineno-$LINENO}: result: unsupported" >&5 -$as_echo "unsupported" >&6; } ;; - *) - CC="$CC $ac_cv_prog_cc_c89" - { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_prog_cc_c89" >&5 -$as_echo "$ac_cv_prog_cc_c89" >&6; } ;; -esac -if test "x$ac_cv_prog_cc_c89" != xno; then : +if test "x$ac_cv_prog_cc_c89" = xno +then : + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: unsupported" >&5 +printf "%s\n" "unsupported" >&6; } +else $as_nop + if test "x$ac_cv_prog_cc_c89" = x +then : + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: none needed" >&5 +printf "%s\n" "none needed" >&6; } +else $as_nop + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $ac_cv_prog_cc_c89" >&5 +printf "%s\n" "$ac_cv_prog_cc_c89" >&6; } + CC="$CC $ac_cv_prog_cc_c89" +fi + ac_cv_prog_cc_stdc=$ac_cv_prog_cc_c89 + ac_prog_cc_stdc=c89 +fi fi ac_ext=c @@ -3964,21 +4764,23 @@ ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5' ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' ac_compiler_gnu=$ac_cv_c_compiler_gnu -ac_ext=c + + ac_ext=c ac_cpp='$CPP $CPPFLAGS' 
ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5' ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' ac_compiler_gnu=$ac_cv_c_compiler_gnu -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether $CC understands -c and -o together" >&5 -$as_echo_n "checking whether $CC understands -c and -o together... " >&6; } -if ${am_cv_prog_cc_c_o+:} false; then : - $as_echo_n "(cached) " >&6 -else +{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether $CC understands -c and -o together" >&5 +printf %s "checking whether $CC understands -c and -o together... " >&6; } +if test ${am_cv_prog_cc_c_o+y} +then : + printf %s "(cached) " >&6 +else $as_nop cat confdefs.h - <<_ACEOF >conftest.$ac_ext /* end confdefs.h. */ int -main () +main (void) { ; @@ -4006,8 +4808,8 @@ _ACEOF rm -f core conftest* unset am_i fi -{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $am_cv_prog_cc_c_o" >&5 -$as_echo "$am_cv_prog_cc_c_o" >&6; } +{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $am_cv_prog_cc_c_o" >&5 +printf "%s\n" "$am_cv_prog_cc_c_o" >&6; } if test "$am_cv_prog_cc_c_o" != yes; then # Losing compiler, so override with the script. # FIXME: It is wrong to rewrite CC. @@ -4025,11 +4827,12 @@ ac_compiler_gnu=$ac_cv_c_compiler_gnu depcc="$CC" am_compiler_list= -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking dependency style of $depcc" >&5 -$as_echo_n "checking dependency style of $depcc... " >&6; } -if ${am_cv_CC_dependencies_compiler_type+:} false; then : - $as_echo_n "(cached) " >&6 -else +{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking dependency style of $depcc" >&5 +printf %s "checking dependency style of $depcc... " >&6; } +if test ${am_cv_CC_dependencies_compiler_type+y} +then : + printf %s "(cached) " >&6 +else $as_nop if test -z "$AMDEP_TRUE" && test -f "$am_depcomp"; then # We make a subdir and do the tests there. Otherwise we can end up # making bogus files that we don't know about and never remove. For @@ -4040,334 +4843,154 @@ else mkdir conftest.dir # Copy depcomp to subdir because otherwise we won't find it if we're # using a relative directory. - cp "$am_depcomp" conftest.dir - cd conftest.dir - # We will build objects and dependencies in a subdirectory because - # it helps to detect inapplicable dependency modes. For instance - # both Tru64's cc and ICC support -MD to output dependencies as a - # side effect of compilation, but ICC will put the dependencies in - # the current directory while Tru64 will put them in the object - # directory. - mkdir sub - - am_cv_CC_dependencies_compiler_type=none - if test "$am_compiler_list" = ""; then - am_compiler_list=`sed -n 's/^#*\([a-zA-Z0-9]*\))$/\1/p' < ./depcomp` - fi - am__universal=false - case " $depcc " in #( - *\ -arch\ *\ -arch\ *) am__universal=true ;; - esac - - for depmode in $am_compiler_list; do - # Setup a source with many dependencies, because some compilers - # like to wrap large dependency lists on column 80 (with \), and - # we should not choose a depcomp mode which is confused by this. - # - # We need to recreate these files for each test, as the compiler may - # overwrite some of them when testing with obscure command lines. - # This happens at least with the AIX C compiler. - : > sub/conftest.c - for i in 1 2 3 4 5 6; do - echo '#include "conftst'$i'.h"' >> sub/conftest.c - # Using ": > sub/conftst$i.h" creates only sub/conftst1.h with - # Solaris 10 /bin/sh. 
- echo '/* dummy */' > sub/conftst$i.h - done - echo "${am__include} ${am__quote}sub/conftest.Po${am__quote}" > confmf - - # We check with '-c' and '-o' for the sake of the "dashmstdout" - # mode. It turns out that the SunPro C++ compiler does not properly - # handle '-M -o', and we need to detect this. Also, some Intel - # versions had trouble with output in subdirs. - am__obj=sub/conftest.${OBJEXT-o} - am__minus_obj="-o $am__obj" - case $depmode in - gcc) - # This depmode causes a compiler race in universal mode. - test "$am__universal" = false || continue - ;; - nosideeffect) - # After this tag, mechanisms are not by side-effect, so they'll - # only be used when explicitly requested. - if test "x$enable_dependency_tracking" = xyes; then - continue - else - break - fi - ;; - msvc7 | msvc7msys | msvisualcpp | msvcmsys) - # This compiler won't grok '-c -o', but also, the minuso test has - # not run yet. These depmodes are late enough in the game, and - # so weak that their functioning should not be impacted. - am__obj=conftest.${OBJEXT-o} - am__minus_obj= - ;; - none) break ;; - esac - if depmode=$depmode \ - source=sub/conftest.c object=$am__obj \ - depfile=sub/conftest.Po tmpdepfile=sub/conftest.TPo \ - $SHELL ./depcomp $depcc -c $am__minus_obj sub/conftest.c \ - >/dev/null 2>conftest.err && - grep sub/conftst1.h sub/conftest.Po > /dev/null 2>&1 && - grep sub/conftst6.h sub/conftest.Po > /dev/null 2>&1 && - grep $am__obj sub/conftest.Po > /dev/null 2>&1 && - ${MAKE-make} -s -f confmf > /dev/null 2>&1; then - # icc doesn't choke on unknown options, it will just issue warnings - # or remarks (even with -Werror). So we grep stderr for any message - # that says an option was ignored or not supported. - # When given -MP, icc 7.0 and 7.1 complain thusly: - # icc: Command line warning: ignoring option '-M'; no argument required - # The diagnosis changed in icc 8.0: - # icc: Command line remark: option '-MP' not supported - if (grep 'ignoring option' conftest.err || - grep 'not supported' conftest.err) >/dev/null 2>&1; then :; else - am_cv_CC_dependencies_compiler_type=$depmode - break - fi - fi - done - - cd .. - rm -rf conftest.dir -else - am_cv_CC_dependencies_compiler_type=none -fi - -fi -{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $am_cv_CC_dependencies_compiler_type" >&5 -$as_echo "$am_cv_CC_dependencies_compiler_type" >&6; } -CCDEPMODE=depmode=$am_cv_CC_dependencies_compiler_type - - if - test "x$enable_dependency_tracking" != xno \ - && test "$am_cv_CC_dependencies_compiler_type" = gcc3; then - am__fastdepCC_TRUE= - am__fastdepCC_FALSE='#' -else - am__fastdepCC_TRUE='#' - am__fastdepCC_FALSE= -fi - - - { $as_echo "$as_me:${as_lineno-$LINENO}: checking for $CC option to accept ISO C99" >&5 -$as_echo_n "checking for $CC option to accept ISO C99... " >&6; } -if ${ac_cv_prog_cc_c99+:} false; then : - $as_echo_n "(cached) " >&6 -else - ac_cv_prog_cc_c99=no -ac_save_CC=$CC -cat confdefs.h - <<_ACEOF >conftest.$ac_ext -/* end confdefs.h. */ -#include -#include -#include -#include -#include - -// Check varargs macros. These examples are taken from C99 6.10.3.5. -#define debug(...) fprintf (stderr, __VA_ARGS__) -#define showlist(...) puts (#__VA_ARGS__) -#define report(test,...) ((test) ? puts (#test) : printf (__VA_ARGS__)) -static void -test_varargs_macros (void) -{ - int x = 1234; - int y = 5678; - debug ("Flag"); - debug ("X = %d\n", x); - showlist (The first, second, and third items.); - report (x>y, "x is %d but y is %d", x, y); -} - -// Check long long types. 
-#define BIG64 18446744073709551615ull -#define BIG32 4294967295ul -#define BIG_OK (BIG64 / BIG32 == 4294967297ull && BIG64 % BIG32 == 0) -#if !BIG_OK - your preprocessor is broken; -#endif -#if BIG_OK -#else - your preprocessor is broken; -#endif -static long long int bignum = -9223372036854775807LL; -static unsigned long long int ubignum = BIG64; - -struct incomplete_array -{ - int datasize; - double data[]; -}; - -struct named_init { - int number; - const wchar_t *name; - double average; -}; - -typedef const char *ccp; - -static inline int -test_restrict (ccp restrict text) -{ - // See if C++-style comments work. - // Iterate through items via the restricted pointer. - // Also check for declarations in for loops. - for (unsigned int i = 0; *(text+i) != '\0'; ++i) - continue; - return 0; -} - -// Check varargs and va_copy. -static void -test_varargs (const char *format, ...) -{ - va_list args; - va_start (args, format); - va_list args_copy; - va_copy (args_copy, args); - - const char *str; - int number; - float fnumber; - - while (*format) - { - switch (*format++) - { - case 's': // string - str = va_arg (args_copy, const char *); - break; - case 'd': // int - number = va_arg (args_copy, int); - break; - case 'f': // float - fnumber = va_arg (args_copy, double); - break; - default: - break; - } - } - va_end (args_copy); - va_end (args); -} - -int -main () -{ - - // Check bool. - _Bool success = false; - - // Check restrict. - if (test_restrict ("String literal") == 0) - success = true; - char *restrict newvar = "Another string"; - - // Check varargs. - test_varargs ("s, d' f .", "string", 65, 34.234); - test_varargs_macros (); - - // Check flexible array members. - struct incomplete_array *ia = - malloc (sizeof (struct incomplete_array) + (sizeof (double) * 10)); - ia->datasize = 10; - for (int i = 0; i < ia->datasize; ++i) - ia->data[i] = i * 1.234; - - // Check named initializers. - struct named_init ni = { - .number = 34, - .name = L"Test wide string", - .average = 543.34343, - }; + cp "$am_depcomp" conftest.dir + cd conftest.dir + # We will build objects and dependencies in a subdirectory because + # it helps to detect inapplicable dependency modes. For instance + # both Tru64's cc and ICC support -MD to output dependencies as a + # side effect of compilation, but ICC will put the dependencies in + # the current directory while Tru64 will put them in the object + # directory. + mkdir sub - ni.number = 58; + am_cv_CC_dependencies_compiler_type=none + if test "$am_compiler_list" = ""; then + am_compiler_list=`sed -n 's/^#*\([a-zA-Z0-9]*\))$/\1/p' < ./depcomp` + fi + am__universal=false + case " $depcc " in #( + *\ -arch\ *\ -arch\ *) am__universal=true ;; + esac - int dynamic_array[ni.number]; - dynamic_array[ni.number - 1] = 543; + for depmode in $am_compiler_list; do + # Setup a source with many dependencies, because some compilers + # like to wrap large dependency lists on column 80 (with \), and + # we should not choose a depcomp mode which is confused by this. + # + # We need to recreate these files for each test, as the compiler may + # overwrite some of them when testing with obscure command lines. + # This happens at least with the AIX C compiler. + : > sub/conftest.c + for i in 1 2 3 4 5 6; do + echo '#include "conftst'$i'.h"' >> sub/conftest.c + # Using ": > sub/conftst$i.h" creates only sub/conftst1.h with + # Solaris 10 /bin/sh. 
+ echo '/* dummy */' > sub/conftst$i.h + done + echo "${am__include} ${am__quote}sub/conftest.Po${am__quote}" > confmf - // work around unused variable warnings - return (!success || bignum == 0LL || ubignum == 0uLL || newvar[0] == 'x' - || dynamic_array[ni.number - 1] != 543); + # We check with '-c' and '-o' for the sake of the "dashmstdout" + # mode. It turns out that the SunPro C++ compiler does not properly + # handle '-M -o', and we need to detect this. Also, some Intel + # versions had trouble with output in subdirs. + am__obj=sub/conftest.${OBJEXT-o} + am__minus_obj="-o $am__obj" + case $depmode in + gcc) + # This depmode causes a compiler race in universal mode. + test "$am__universal" = false || continue + ;; + nosideeffect) + # After this tag, mechanisms are not by side-effect, so they'll + # only be used when explicitly requested. + if test "x$enable_dependency_tracking" = xyes; then + continue + else + break + fi + ;; + msvc7 | msvc7msys | msvisualcpp | msvcmsys) + # This compiler won't grok '-c -o', but also, the minuso test has + # not run yet. These depmodes are late enough in the game, and + # so weak that their functioning should not be impacted. + am__obj=conftest.${OBJEXT-o} + am__minus_obj= + ;; + none) break ;; + esac + if depmode=$depmode \ + source=sub/conftest.c object=$am__obj \ + depfile=sub/conftest.Po tmpdepfile=sub/conftest.TPo \ + $SHELL ./depcomp $depcc -c $am__minus_obj sub/conftest.c \ + >/dev/null 2>conftest.err && + grep sub/conftst1.h sub/conftest.Po > /dev/null 2>&1 && + grep sub/conftst6.h sub/conftest.Po > /dev/null 2>&1 && + grep $am__obj sub/conftest.Po > /dev/null 2>&1 && + ${MAKE-make} -s -f confmf > /dev/null 2>&1; then + # icc doesn't choke on unknown options, it will just issue warnings + # or remarks (even with -Werror). So we grep stderr for any message + # that says an option was ignored or not supported. + # When given -MP, icc 7.0 and 7.1 complain thusly: + # icc: Command line warning: ignoring option '-M'; no argument required + # The diagnosis changed in icc 8.0: + # icc: Command line remark: option '-MP' not supported + if (grep 'ignoring option' conftest.err || + grep 'not supported' conftest.err) >/dev/null 2>&1; then :; else + am_cv_CC_dependencies_compiler_type=$depmode + break + fi + fi + done - ; - return 0; -} -_ACEOF -for ac_arg in '' -std=gnu99 -std=c99 -c99 -AC99 -D_STDC_C99= -qlanglvl=extc99 -do - CC="$ac_save_CC $ac_arg" - if ac_fn_c_try_compile "$LINENO"; then : - ac_cv_prog_cc_c99=$ac_arg + cd .. 
+ rm -rf conftest.dir +else + am_cv_CC_dependencies_compiler_type=none fi -rm -f core conftest.err conftest.$ac_objext - test "x$ac_cv_prog_cc_c99" != "xno" && break -done -rm -f conftest.$ac_ext -CC=$ac_save_CC fi -# AC_CACHE_VAL -case "x$ac_cv_prog_cc_c99" in - x) - { $as_echo "$as_me:${as_lineno-$LINENO}: result: none needed" >&5 -$as_echo "none needed" >&6; } ;; - xno) - { $as_echo "$as_me:${as_lineno-$LINENO}: result: unsupported" >&5 -$as_echo "unsupported" >&6; } ;; - *) - CC="$CC $ac_cv_prog_cc_c99" - { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_prog_cc_c99" >&5 -$as_echo "$ac_cv_prog_cc_c99" >&6; } ;; -esac -if test "x$ac_cv_prog_cc_c99" != xno; then : +{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $am_cv_CC_dependencies_compiler_type" >&5 +printf "%s\n" "$am_cv_CC_dependencies_compiler_type" >&6; } +CCDEPMODE=depmode=$am_cv_CC_dependencies_compiler_type + if + test "x$enable_dependency_tracking" != xno \ + && test "$am_cv_CC_dependencies_compiler_type" = gcc3; then + am__fastdepCC_TRUE= + am__fastdepCC_FALSE='#' +else + am__fastdepCC_TRUE='#' + am__fastdepCC_FALSE= fi + ac_ext=c ac_cpp='$CPP $CPPFLAGS' ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5' ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' ac_compiler_gnu=$ac_cv_c_compiler_gnu -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking how to run the C preprocessor" >&5 -$as_echo_n "checking how to run the C preprocessor... " >&6; } +{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking how to run the C preprocessor" >&5 +printf %s "checking how to run the C preprocessor... " >&6; } # On Suns, sometimes $CPP names a directory. if test -n "$CPP" && test -d "$CPP"; then CPP= fi if test -z "$CPP"; then - if ${ac_cv_prog_CPP+:} false; then : - $as_echo_n "(cached) " >&6 -else - # Double quotes because CPP needs to be expanded - for CPP in "$CC -E" "$CC -E -traditional-cpp" "/lib/cpp" + if test ${ac_cv_prog_CPP+y} +then : + printf %s "(cached) " >&6 +else $as_nop + # Double quotes because $CC needs to be expanded + for CPP in "$CC -E" "$CC -E -traditional-cpp" cpp /lib/cpp do ac_preproc_ok=false for ac_c_preproc_warn_flag in '' yes do # Use a header file that comes with gcc, so configuring glibc # with a fresh cross-compiler works. - # Prefer to if __STDC__ is defined, since - # exists even on freestanding compilers. # On the NeXT, cc -E runs the code through the compiler's parser, # not just through cpp. "Syntax error" is here to catch this case. cat confdefs.h - <<_ACEOF >conftest.$ac_ext /* end confdefs.h. */ -#ifdef __STDC__ -# include -#else -# include -#endif +#include Syntax error _ACEOF -if ac_fn_c_try_cpp "$LINENO"; then : +if ac_fn_c_try_cpp "$LINENO" +then : -else +else $as_nop # Broken: fails on valid input. continue fi @@ -4379,10 +5002,11 @@ rm -f conftest.err conftest.i conftest.$ac_ext /* end confdefs.h. */ #include _ACEOF -if ac_fn_c_try_cpp "$LINENO"; then : +if ac_fn_c_try_cpp "$LINENO" +then : # Broken: success on invalid input. continue -else +else $as_nop # Passes both tests. ac_preproc_ok=: break @@ -4392,7 +5016,8 @@ rm -f conftest.err conftest.i conftest.$ac_ext done # Because of `break', _AC_PREPROC_IFELSE's cleaning code was skipped. 
rm -f conftest.i conftest.err conftest.$ac_ext -if $ac_preproc_ok; then : +if $ac_preproc_ok +then : break fi @@ -4404,29 +5029,24 @@ fi else ac_cv_prog_CPP=$CPP fi -{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $CPP" >&5 -$as_echo "$CPP" >&6; } +{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $CPP" >&5 +printf "%s\n" "$CPP" >&6; } ac_preproc_ok=false for ac_c_preproc_warn_flag in '' yes do # Use a header file that comes with gcc, so configuring glibc # with a fresh cross-compiler works. - # Prefer to if __STDC__ is defined, since - # exists even on freestanding compilers. # On the NeXT, cc -E runs the code through the compiler's parser, # not just through cpp. "Syntax error" is here to catch this case. cat confdefs.h - <<_ACEOF >conftest.$ac_ext /* end confdefs.h. */ -#ifdef __STDC__ -# include -#else -# include -#endif +#include Syntax error _ACEOF -if ac_fn_c_try_cpp "$LINENO"; then : +if ac_fn_c_try_cpp "$LINENO" +then : -else +else $as_nop # Broken: fails on valid input. continue fi @@ -4438,10 +5058,11 @@ rm -f conftest.err conftest.i conftest.$ac_ext /* end confdefs.h. */ #include _ACEOF -if ac_fn_c_try_cpp "$LINENO"; then : +if ac_fn_c_try_cpp "$LINENO" +then : # Broken: success on invalid input. continue -else +else $as_nop # Passes both tests. ac_preproc_ok=: break @@ -4451,11 +5072,12 @@ rm -f conftest.err conftest.i conftest.$ac_ext done # Because of `break', _AC_PREPROC_IFELSE's cleaning code was skipped. rm -f conftest.i conftest.err conftest.$ac_ext -if $ac_preproc_ok; then : +if $ac_preproc_ok +then : -else - { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5 -$as_echo "$as_me: error: in \`$ac_pwd':" >&2;} +else $as_nop + { { printf "%s\n" "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5 +printf "%s\n" "$as_me: error: in \`$ac_pwd':" >&2;} as_fn_error $? "C preprocessor \"$CPP\" fails sanity check See \`config.log' for more details" "$LINENO" 5; } fi @@ -4467,11 +5089,12 @@ ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $ ac_compiler_gnu=$ac_cv_c_compiler_gnu -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for grep that handles long lines and -e" >&5 -$as_echo_n "checking for grep that handles long lines and -e... " >&6; } -if ${ac_cv_path_GREP+:} false; then : - $as_echo_n "(cached) " >&6 -else +{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for grep that handles long lines and -e" >&5 +printf %s "checking for grep that handles long lines and -e... " >&6; } +if test ${ac_cv_path_GREP+y} +then : + printf %s "(cached) " >&6 +else $as_nop if test -z "$GREP"; then ac_path_GREP_found=false # Loop through the user's path and test for each of PROGNAME-LIST @@ -4479,10 +5102,15 @@ else for as_dir in $PATH$PATH_SEPARATOR/usr/xpg4/bin do IFS=$as_save_IFS - test -z "$as_dir" && as_dir=. - for ac_prog in grep ggrep; do + case $as_dir in #((( + '') as_dir=./ ;; + */) ;; + *) as_dir=$as_dir/ ;; + esac + for ac_prog in grep ggrep + do for ac_exec_ext in '' $ac_executable_extensions; do - ac_path_GREP="$as_dir/$ac_prog$ac_exec_ext" + ac_path_GREP="$as_dir$ac_prog$ac_exec_ext" as_fn_executable_p "$ac_path_GREP" || continue # Check for GNU ac_path_GREP and select it if it is found. 
# Check for GNU $ac_path_GREP @@ -4491,13 +5119,13 @@ case `"$ac_path_GREP" --version 2>&1` in ac_cv_path_GREP="$ac_path_GREP" ac_path_GREP_found=:;; *) ac_count=0 - $as_echo_n 0123456789 >"conftest.in" + printf %s 0123456789 >"conftest.in" while : do cat "conftest.in" "conftest.in" >"conftest.tmp" mv "conftest.tmp" "conftest.in" cp "conftest.in" "conftest.nl" - $as_echo 'GREP' >> "conftest.nl" + printf "%s\n" 'GREP' >> "conftest.nl" "$ac_path_GREP" -e 'GREP$' -e '-(cannot match)-' < "conftest.nl" >"conftest.out" 2>/dev/null || break diff "conftest.out" "conftest.nl" >/dev/null 2>&1 || break as_fn_arith $ac_count + 1 && ac_count=$as_val @@ -4525,16 +5153,17 @@ else fi fi -{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_path_GREP" >&5 -$as_echo "$ac_cv_path_GREP" >&6; } +{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $ac_cv_path_GREP" >&5 +printf "%s\n" "$ac_cv_path_GREP" >&6; } GREP="$ac_cv_path_GREP" -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for egrep" >&5 -$as_echo_n "checking for egrep... " >&6; } -if ${ac_cv_path_EGREP+:} false; then : - $as_echo_n "(cached) " >&6 -else +{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for egrep" >&5 +printf %s "checking for egrep... " >&6; } +if test ${ac_cv_path_EGREP+y} +then : + printf %s "(cached) " >&6 +else $as_nop if echo a | $GREP -E '(a|b)' >/dev/null 2>&1 then ac_cv_path_EGREP="$GREP -E" else @@ -4545,10 +5174,15 @@ else for as_dir in $PATH$PATH_SEPARATOR/usr/xpg4/bin do IFS=$as_save_IFS - test -z "$as_dir" && as_dir=. - for ac_prog in egrep; do + case $as_dir in #((( + '') as_dir=./ ;; + */) ;; + *) as_dir=$as_dir/ ;; + esac + for ac_prog in egrep + do for ac_exec_ext in '' $ac_executable_extensions; do - ac_path_EGREP="$as_dir/$ac_prog$ac_exec_ext" + ac_path_EGREP="$as_dir$ac_prog$ac_exec_ext" as_fn_executable_p "$ac_path_EGREP" || continue # Check for GNU ac_path_EGREP and select it if it is found. # Check for GNU $ac_path_EGREP @@ -4557,13 +5191,13 @@ case `"$ac_path_EGREP" --version 2>&1` in ac_cv_path_EGREP="$ac_path_EGREP" ac_path_EGREP_found=:;; *) ac_count=0 - $as_echo_n 0123456789 >"conftest.in" + printf %s 0123456789 >"conftest.in" while : do cat "conftest.in" "conftest.in" >"conftest.tmp" mv "conftest.tmp" "conftest.in" cp "conftest.in" "conftest.nl" - $as_echo 'EGREP' >> "conftest.nl" + printf "%s\n" 'EGREP' >> "conftest.nl" "$ac_path_EGREP" 'EGREP$' < "conftest.nl" >"conftest.out" 2>/dev/null || break diff "conftest.out" "conftest.nl" >/dev/null 2>&1 || break as_fn_arith $ac_count + 1 && ac_count=$as_val @@ -4592,17 +5226,18 @@ fi fi fi -{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_path_EGREP" >&5 -$as_echo "$ac_cv_path_EGREP" >&6; } +{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $ac_cv_path_EGREP" >&5 +printf "%s\n" "$ac_cv_path_EGREP" >&6; } EGREP="$ac_cv_path_EGREP" if test $ac_cv_c_compiler_gnu = yes; then - { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether $CC needs -traditional" >&5 -$as_echo_n "checking whether $CC needs -traditional... " >&6; } -if ${ac_cv_prog_gcc_traditional+:} false; then : - $as_echo_n "(cached) " >&6 -else + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether $CC needs -traditional" >&5 +printf %s "checking whether $CC needs -traditional... " >&6; } +if test ${ac_cv_prog_gcc_traditional+y} +then : + printf %s "(cached) " >&6 +else $as_nop ac_pattern="Autoconf.*'x'" cat confdefs.h - <<_ACEOF >conftest.$ac_ext /* end confdefs.h. 
*/ @@ -4610,12 +5245,13 @@ else Autoconf TIOCGETP _ACEOF if (eval "$ac_cpp conftest.$ac_ext") 2>&5 | - $EGREP "$ac_pattern" >/dev/null 2>&1; then : + $EGREP "$ac_pattern" >/dev/null 2>&1 +then : ac_cv_prog_gcc_traditional=yes -else +else $as_nop ac_cv_prog_gcc_traditional=no fi -rm -f conftest* +rm -rf conftest* if test $ac_cv_prog_gcc_traditional = no; then @@ -4625,15 +5261,16 @@ rm -f conftest* Autoconf TCGETA _ACEOF if (eval "$ac_cpp conftest.$ac_ext") 2>&5 | - $EGREP "$ac_pattern" >/dev/null 2>&1; then : + $EGREP "$ac_pattern" >/dev/null 2>&1 +then : ac_cv_prog_gcc_traditional=yes fi -rm -f conftest* +rm -rf conftest* fi fi -{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_prog_gcc_traditional" >&5 -$as_echo "$ac_cv_prog_gcc_traditional" >&6; } +{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $ac_cv_prog_gcc_traditional" >&5 +printf "%s\n" "$ac_cv_prog_gcc_traditional" >&6; } if test $ac_cv_prog_gcc_traditional = yes; then CC="$CC -traditional" fi @@ -4649,11 +5286,12 @@ test "${CCASFLAGS+set}" = set || CCASFLAGS=$CFLAGS depcc="$CCAS" am_compiler_list= -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking dependency style of $depcc" >&5 -$as_echo_n "checking dependency style of $depcc... " >&6; } -if ${am_cv_CCAS_dependencies_compiler_type+:} false; then : - $as_echo_n "(cached) " >&6 -else +{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking dependency style of $depcc" >&5 +printf %s "checking dependency style of $depcc... " >&6; } +if test ${am_cv_CCAS_dependencies_compiler_type+y} +then : + printf %s "(cached) " >&6 +else $as_nop if test -z "$AMDEP_TRUE" && test -f "$am_depcomp"; then # We make a subdir and do the tests there. Otherwise we can end up # making bogus files that we don't know about and never remove. For @@ -4758,8 +5396,8 @@ else fi fi -{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $am_cv_CCAS_dependencies_compiler_type" >&5 -$as_echo "$am_cv_CCAS_dependencies_compiler_type" >&6; } +{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $am_cv_CCAS_dependencies_compiler_type" >&5 +printf "%s\n" "$am_cv_CCAS_dependencies_compiler_type" >&6; } CCASDEPMODE=depmode=$am_cv_CCAS_dependencies_compiler_type if @@ -4776,11 +5414,12 @@ fi if test -n "$ac_tool_prefix"; then # Extract the first word of "${ac_tool_prefix}ranlib", so it can be a program name with args. set dummy ${ac_tool_prefix}ranlib; ac_word=$2 -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 -$as_echo_n "checking for $ac_word... " >&6; } -if ${ac_cv_prog_RANLIB+:} false; then : - $as_echo_n "(cached) " >&6 -else +{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 +printf %s "checking for $ac_word... " >&6; } +if test ${ac_cv_prog_RANLIB+y} +then : + printf %s "(cached) " >&6 +else $as_nop if test -n "$RANLIB"; then ac_cv_prog_RANLIB="$RANLIB" # Let the user override the test. else @@ -4788,11 +5427,15 @@ as_save_IFS=$IFS; IFS=$PATH_SEPARATOR for as_dir in $PATH do IFS=$as_save_IFS - test -z "$as_dir" && as_dir=. 
+ case $as_dir in #((( + '') as_dir=./ ;; + */) ;; + *) as_dir=$as_dir/ ;; + esac for ac_exec_ext in '' $ac_executable_extensions; do - if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then + if as_fn_executable_p "$as_dir$ac_word$ac_exec_ext"; then ac_cv_prog_RANLIB="${ac_tool_prefix}ranlib" - $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 + printf "%s\n" "$as_me:${as_lineno-$LINENO}: found $as_dir$ac_word$ac_exec_ext" >&5 break 2 fi done @@ -4803,11 +5446,11 @@ fi fi RANLIB=$ac_cv_prog_RANLIB if test -n "$RANLIB"; then - { $as_echo "$as_me:${as_lineno-$LINENO}: result: $RANLIB" >&5 -$as_echo "$RANLIB" >&6; } + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $RANLIB" >&5 +printf "%s\n" "$RANLIB" >&6; } else - { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 -$as_echo "no" >&6; } + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: no" >&5 +printf "%s\n" "no" >&6; } fi @@ -4816,11 +5459,12 @@ if test -z "$ac_cv_prog_RANLIB"; then ac_ct_RANLIB=$RANLIB # Extract the first word of "ranlib", so it can be a program name with args. set dummy ranlib; ac_word=$2 -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 -$as_echo_n "checking for $ac_word... " >&6; } -if ${ac_cv_prog_ac_ct_RANLIB+:} false; then : - $as_echo_n "(cached) " >&6 -else +{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 +printf %s "checking for $ac_word... " >&6; } +if test ${ac_cv_prog_ac_ct_RANLIB+y} +then : + printf %s "(cached) " >&6 +else $as_nop if test -n "$ac_ct_RANLIB"; then ac_cv_prog_ac_ct_RANLIB="$ac_ct_RANLIB" # Let the user override the test. else @@ -4828,11 +5472,15 @@ as_save_IFS=$IFS; IFS=$PATH_SEPARATOR for as_dir in $PATH do IFS=$as_save_IFS - test -z "$as_dir" && as_dir=. + case $as_dir in #((( + '') as_dir=./ ;; + */) ;; + *) as_dir=$as_dir/ ;; + esac for ac_exec_ext in '' $ac_executable_extensions; do - if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then + if as_fn_executable_p "$as_dir$ac_word$ac_exec_ext"; then ac_cv_prog_ac_ct_RANLIB="ranlib" - $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 + printf "%s\n" "$as_me:${as_lineno-$LINENO}: found $as_dir$ac_word$ac_exec_ext" >&5 break 2 fi done @@ -4843,11 +5491,11 @@ fi fi ac_ct_RANLIB=$ac_cv_prog_ac_ct_RANLIB if test -n "$ac_ct_RANLIB"; then - { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_ct_RANLIB" >&5 -$as_echo "$ac_ct_RANLIB" >&6; } + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $ac_ct_RANLIB" >&5 +printf "%s\n" "$ac_ct_RANLIB" >&6; } else - { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 -$as_echo "no" >&6; } + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: no" >&5 +printf "%s\n" "no" >&6; } fi if test "x$ac_ct_RANLIB" = x; then @@ -4855,8 +5503,8 @@ fi else case $cross_compiling:$ac_tool_warned in yes:) -{ $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: using cross tools not prefixed with host triplet" >&5 -$as_echo "$as_me: WARNING: using cross tools not prefixed with host triplet" >&2;} +{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: WARNING: using cross tools not prefixed with host triplet" >&5 +printf "%s\n" "$as_me: WARNING: using cross tools not prefixed with host triplet" >&2;} ac_tool_warned=yes ;; esac RANLIB=$ac_ct_RANLIB @@ -4865,6 +5513,12 @@ else RANLIB="$ac_cv_prog_RANLIB" fi + + + + + + ac_ext=cpp ac_cpp='$CXXCPP $CPPFLAGS' ac_compile='$CXX -c $CXXFLAGS $CPPFLAGS conftest.$ac_ext >&5' @@ -4875,15 +5529,16 @@ if test -z "$CXX"; then CXX=$CCC else if test -n "$ac_tool_prefix"; 
then - for ac_prog in g++ c++ gpp aCC CC cxx cc++ cl.exe FCC KCC RCC xlC_r xlC + for ac_prog in g++ c++ gpp aCC CC cxx cc++ cl.exe FCC KCC RCC xlC_r xlC clang++ do # Extract the first word of "$ac_tool_prefix$ac_prog", so it can be a program name with args. set dummy $ac_tool_prefix$ac_prog; ac_word=$2 -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 -$as_echo_n "checking for $ac_word... " >&6; } -if ${ac_cv_prog_CXX+:} false; then : - $as_echo_n "(cached) " >&6 -else +{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 +printf %s "checking for $ac_word... " >&6; } +if test ${ac_cv_prog_CXX+y} +then : + printf %s "(cached) " >&6 +else $as_nop if test -n "$CXX"; then ac_cv_prog_CXX="$CXX" # Let the user override the test. else @@ -4891,11 +5546,15 @@ as_save_IFS=$IFS; IFS=$PATH_SEPARATOR for as_dir in $PATH do IFS=$as_save_IFS - test -z "$as_dir" && as_dir=. + case $as_dir in #((( + '') as_dir=./ ;; + */) ;; + *) as_dir=$as_dir/ ;; + esac for ac_exec_ext in '' $ac_executable_extensions; do - if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then + if as_fn_executable_p "$as_dir$ac_word$ac_exec_ext"; then ac_cv_prog_CXX="$ac_tool_prefix$ac_prog" - $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 + printf "%s\n" "$as_me:${as_lineno-$LINENO}: found $as_dir$ac_word$ac_exec_ext" >&5 break 2 fi done @@ -4906,11 +5565,11 @@ fi fi CXX=$ac_cv_prog_CXX if test -n "$CXX"; then - { $as_echo "$as_me:${as_lineno-$LINENO}: result: $CXX" >&5 -$as_echo "$CXX" >&6; } + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $CXX" >&5 +printf "%s\n" "$CXX" >&6; } else - { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 -$as_echo "no" >&6; } + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: no" >&5 +printf "%s\n" "no" >&6; } fi @@ -4919,15 +5578,16 @@ fi fi if test -z "$CXX"; then ac_ct_CXX=$CXX - for ac_prog in g++ c++ gpp aCC CC cxx cc++ cl.exe FCC KCC RCC xlC_r xlC + for ac_prog in g++ c++ gpp aCC CC cxx cc++ cl.exe FCC KCC RCC xlC_r xlC clang++ do # Extract the first word of "$ac_prog", so it can be a program name with args. set dummy $ac_prog; ac_word=$2 -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 -$as_echo_n "checking for $ac_word... " >&6; } -if ${ac_cv_prog_ac_ct_CXX+:} false; then : - $as_echo_n "(cached) " >&6 -else +{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 +printf %s "checking for $ac_word... " >&6; } +if test ${ac_cv_prog_ac_ct_CXX+y} +then : + printf %s "(cached) " >&6 +else $as_nop if test -n "$ac_ct_CXX"; then ac_cv_prog_ac_ct_CXX="$ac_ct_CXX" # Let the user override the test. else @@ -4935,11 +5595,15 @@ as_save_IFS=$IFS; IFS=$PATH_SEPARATOR for as_dir in $PATH do IFS=$as_save_IFS - test -z "$as_dir" && as_dir=. 
+ case $as_dir in #((( + '') as_dir=./ ;; + */) ;; + *) as_dir=$as_dir/ ;; + esac for ac_exec_ext in '' $ac_executable_extensions; do - if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then + if as_fn_executable_p "$as_dir$ac_word$ac_exec_ext"; then ac_cv_prog_ac_ct_CXX="$ac_prog" - $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 + printf "%s\n" "$as_me:${as_lineno-$LINENO}: found $as_dir$ac_word$ac_exec_ext" >&5 break 2 fi done @@ -4950,11 +5614,11 @@ fi fi ac_ct_CXX=$ac_cv_prog_ac_ct_CXX if test -n "$ac_ct_CXX"; then - { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_ct_CXX" >&5 -$as_echo "$ac_ct_CXX" >&6; } + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $ac_ct_CXX" >&5 +printf "%s\n" "$ac_ct_CXX" >&6; } else - { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 -$as_echo "no" >&6; } + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: no" >&5 +printf "%s\n" "no" >&6; } fi @@ -4966,8 +5630,8 @@ done else case $cross_compiling:$ac_tool_warned in yes:) -{ $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: using cross tools not prefixed with host triplet" >&5 -$as_echo "$as_me: WARNING: using cross tools not prefixed with host triplet" >&2;} +{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: WARNING: using cross tools not prefixed with host triplet" >&5 +printf "%s\n" "$as_me: WARNING: using cross tools not prefixed with host triplet" >&2;} ac_tool_warned=yes ;; esac CXX=$ac_ct_CXX @@ -4977,7 +5641,7 @@ fi fi fi # Provide some information about the compiler. -$as_echo "$as_me:${as_lineno-$LINENO}: checking for C++ compiler version" >&5 +printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for C++ compiler version" >&5 set X $ac_compile ac_compiler=$2 for ac_option in --version -v -V -qversion; do @@ -4987,7 +5651,7 @@ case "(($ac_try" in *) ac_try_echo=$ac_try;; esac eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\"" -$as_echo "$ac_try_echo"; } >&5 +printf "%s\n" "$ac_try_echo"; } >&5 (eval "$ac_compiler $ac_option >&5") 2>conftest.err ac_status=$? if test -s conftest.err; then @@ -4997,20 +5661,21 @@ $as_echo "$ac_try_echo"; } >&5 cat conftest.er1 >&5 fi rm -f conftest.er1 conftest.err - $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 + printf "%s\n" "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 test $ac_status = 0; } done -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether we are using the GNU C++ compiler" >&5 -$as_echo_n "checking whether we are using the GNU C++ compiler... " >&6; } -if ${ac_cv_cxx_compiler_gnu+:} false; then : - $as_echo_n "(cached) " >&6 -else +{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether the compiler supports GNU C++" >&5 +printf %s "checking whether the compiler supports GNU C++... " >&6; } +if test ${ac_cv_cxx_compiler_gnu+y} +then : + printf %s "(cached) " >&6 +else $as_nop cat confdefs.h - <<_ACEOF >conftest.$ac_ext /* end confdefs.h. 
*/ int -main () +main (void) { #ifndef __GNUC__ choke me @@ -5020,29 +5685,33 @@ main () return 0; } _ACEOF -if ac_fn_cxx_try_compile "$LINENO"; then : +if ac_fn_cxx_try_compile "$LINENO" +then : ac_compiler_gnu=yes -else +else $as_nop ac_compiler_gnu=no fi -rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext +rm -f core conftest.err conftest.$ac_objext conftest.beam conftest.$ac_ext ac_cv_cxx_compiler_gnu=$ac_compiler_gnu fi -{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_cxx_compiler_gnu" >&5 -$as_echo "$ac_cv_cxx_compiler_gnu" >&6; } +{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $ac_cv_cxx_compiler_gnu" >&5 +printf "%s\n" "$ac_cv_cxx_compiler_gnu" >&6; } +ac_compiler_gnu=$ac_cv_cxx_compiler_gnu + if test $ac_compiler_gnu = yes; then GXX=yes else GXX= fi -ac_test_CXXFLAGS=${CXXFLAGS+set} +ac_test_CXXFLAGS=${CXXFLAGS+y} ac_save_CXXFLAGS=$CXXFLAGS -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether $CXX accepts -g" >&5 -$as_echo_n "checking whether $CXX accepts -g... " >&6; } -if ${ac_cv_prog_cxx_g+:} false; then : - $as_echo_n "(cached) " >&6 -else +{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether $CXX accepts -g" >&5 +printf %s "checking whether $CXX accepts -g... " >&6; } +if test ${ac_cv_prog_cxx_g+y} +then : + printf %s "(cached) " >&6 +else $as_nop ac_save_cxx_werror_flag=$ac_cxx_werror_flag ac_cxx_werror_flag=yes ac_cv_prog_cxx_g=no @@ -5051,57 +5720,60 @@ else /* end confdefs.h. */ int -main () +main (void) { ; return 0; } _ACEOF -if ac_fn_cxx_try_compile "$LINENO"; then : +if ac_fn_cxx_try_compile "$LINENO" +then : ac_cv_prog_cxx_g=yes -else +else $as_nop CXXFLAGS="" cat confdefs.h - <<_ACEOF >conftest.$ac_ext /* end confdefs.h. */ int -main () +main (void) { ; return 0; } _ACEOF -if ac_fn_cxx_try_compile "$LINENO"; then : +if ac_fn_cxx_try_compile "$LINENO" +then : -else +else $as_nop ac_cxx_werror_flag=$ac_save_cxx_werror_flag CXXFLAGS="-g" cat confdefs.h - <<_ACEOF >conftest.$ac_ext /* end confdefs.h. */ int -main () +main (void) { ; return 0; } _ACEOF -if ac_fn_cxx_try_compile "$LINENO"; then : +if ac_fn_cxx_try_compile "$LINENO" +then : ac_cv_prog_cxx_g=yes fi -rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext +rm -f core conftest.err conftest.$ac_objext conftest.beam conftest.$ac_ext fi -rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext +rm -f core conftest.err conftest.$ac_objext conftest.beam conftest.$ac_ext fi -rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext +rm -f core conftest.err conftest.$ac_objext conftest.beam conftest.$ac_ext ac_cxx_werror_flag=$ac_save_cxx_werror_flag fi -{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_prog_cxx_g" >&5 -$as_echo "$ac_cv_prog_cxx_g" >&6; } -if test "$ac_test_CXXFLAGS" = set; then +{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $ac_cv_prog_cxx_g" >&5 +printf "%s\n" "$ac_cv_prog_cxx_g" >&6; } +if test $ac_test_CXXFLAGS; then CXXFLAGS=$ac_save_CXXFLAGS elif test $ac_cv_prog_cxx_g = yes; then if test "$GXX" = yes; then @@ -5116,6 +5788,100 @@ else CXXFLAGS= fi fi +ac_prog_cxx_stdcxx=no +if test x$ac_prog_cxx_stdcxx = xno +then : + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for $CXX option to enable C++11 features" >&5 +printf %s "checking for $CXX option to enable C++11 features... " >&6; } +if test ${ac_cv_prog_cxx_11+y} +then : + printf %s "(cached) " >&6 +else $as_nop + ac_cv_prog_cxx_11=no +ac_save_CXX=$CXX +cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. 
*/ +$ac_cxx_conftest_cxx11_program +_ACEOF +for ac_arg in '' -std=gnu++11 -std=gnu++0x -std=c++11 -std=c++0x -qlanglvl=extended0x -AA +do + CXX="$ac_save_CXX $ac_arg" + if ac_fn_cxx_try_compile "$LINENO" +then : + ac_cv_prog_cxx_cxx11=$ac_arg +fi +rm -f core conftest.err conftest.$ac_objext conftest.beam + test "x$ac_cv_prog_cxx_cxx11" != "xno" && break +done +rm -f conftest.$ac_ext +CXX=$ac_save_CXX +fi + +if test "x$ac_cv_prog_cxx_cxx11" = xno +then : + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: unsupported" >&5 +printf "%s\n" "unsupported" >&6; } +else $as_nop + if test "x$ac_cv_prog_cxx_cxx11" = x +then : + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: none needed" >&5 +printf "%s\n" "none needed" >&6; } +else $as_nop + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $ac_cv_prog_cxx_cxx11" >&5 +printf "%s\n" "$ac_cv_prog_cxx_cxx11" >&6; } + CXX="$CXX $ac_cv_prog_cxx_cxx11" +fi + ac_cv_prog_cxx_stdcxx=$ac_cv_prog_cxx_cxx11 + ac_prog_cxx_stdcxx=cxx11 +fi +fi +if test x$ac_prog_cxx_stdcxx = xno +then : + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for $CXX option to enable C++98 features" >&5 +printf %s "checking for $CXX option to enable C++98 features... " >&6; } +if test ${ac_cv_prog_cxx_98+y} +then : + printf %s "(cached) " >&6 +else $as_nop + ac_cv_prog_cxx_98=no +ac_save_CXX=$CXX +cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +$ac_cxx_conftest_cxx98_program +_ACEOF +for ac_arg in '' -std=gnu++98 -std=c++98 -qlanglvl=extended -AA +do + CXX="$ac_save_CXX $ac_arg" + if ac_fn_cxx_try_compile "$LINENO" +then : + ac_cv_prog_cxx_cxx98=$ac_arg +fi +rm -f core conftest.err conftest.$ac_objext conftest.beam + test "x$ac_cv_prog_cxx_cxx98" != "xno" && break +done +rm -f conftest.$ac_ext +CXX=$ac_save_CXX +fi + +if test "x$ac_cv_prog_cxx_cxx98" = xno +then : + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: unsupported" >&5 +printf "%s\n" "unsupported" >&6; } +else $as_nop + if test "x$ac_cv_prog_cxx_cxx98" = x +then : + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: none needed" >&5 +printf "%s\n" "none needed" >&6; } +else $as_nop + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $ac_cv_prog_cxx_cxx98" >&5 +printf "%s\n" "$ac_cv_prog_cxx_cxx98" >&6; } + CXX="$CXX $ac_cv_prog_cxx_cxx98" +fi + ac_cv_prog_cxx_stdcxx=$ac_cv_prog_cxx_cxx98 + ac_prog_cxx_stdcxx=cxx98 +fi +fi + ac_ext=c ac_cpp='$CPP $CPPFLAGS' ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5' @@ -5124,11 +5890,12 @@ ac_compiler_gnu=$ac_cv_c_compiler_gnu depcc="$CXX" am_compiler_list= -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking dependency style of $depcc" >&5 -$as_echo_n "checking dependency style of $depcc... " >&6; } -if ${am_cv_CXX_dependencies_compiler_type+:} false; then : - $as_echo_n "(cached) " >&6 -else +{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking dependency style of $depcc" >&5 +printf %s "checking dependency style of $depcc... " >&6; } +if test ${am_cv_CXX_dependencies_compiler_type+y} +then : + printf %s "(cached) " >&6 +else $as_nop if test -z "$AMDEP_TRUE" && test -f "$am_depcomp"; then # We make a subdir and do the tests there. Otherwise we can end up # making bogus files that we don't know about and never remove. For @@ -5227,307 +5994,353 @@ else fi fi done - - cd .. - rm -rf conftest.dir + + cd .. 
+ rm -rf conftest.dir +else + am_cv_CXX_dependencies_compiler_type=none +fi + +fi +{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $am_cv_CXX_dependencies_compiler_type" >&5 +printf "%s\n" "$am_cv_CXX_dependencies_compiler_type" >&6; } +CXXDEPMODE=depmode=$am_cv_CXX_dependencies_compiler_type + + if + test "x$enable_dependency_tracking" != xno \ + && test "$am_cv_CXX_dependencies_compiler_type" = gcc3; then + am__fastdepCXX_TRUE= + am__fastdepCXX_FALSE='#' +else + am__fastdepCXX_TRUE='#' + am__fastdepCXX_FALSE= +fi + + + +ac_header= ac_cache= +for ac_item in $ac_header_c_list +do + if test $ac_cache; then + ac_fn_c_check_header_compile "$LINENO" $ac_header ac_cv_header_$ac_cache "$ac_includes_default" + if eval test \"x\$ac_cv_header_$ac_cache\" = xyes; then + printf "%s\n" "#define $ac_item 1" >> confdefs.h + fi + ac_header= ac_cache= + elif test $ac_header; then + ac_cache=$ac_item + else + ac_header=$ac_item + fi +done + + + + + + + + +if test $ac_cv_header_stdlib_h = yes && test $ac_cv_header_string_h = yes +then : + +printf "%s\n" "#define STDC_HEADERS 1" >>confdefs.h + +fi +# Autoupdate added the next two lines to ensure that your configure +# script's behavior did not change. They are probably safe to remove. + +{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for egrep" >&5 +printf %s "checking for egrep... " >&6; } +if test ${ac_cv_path_EGREP+y} +then : + printf %s "(cached) " >&6 +else $as_nop + if echo a | $GREP -E '(a|b)' >/dev/null 2>&1 + then ac_cv_path_EGREP="$GREP -E" + else + if test -z "$EGREP"; then + ac_path_EGREP_found=false + # Loop through the user's path and test for each of PROGNAME-LIST + as_save_IFS=$IFS; IFS=$PATH_SEPARATOR +for as_dir in $PATH$PATH_SEPARATOR/usr/xpg4/bin +do + IFS=$as_save_IFS + case $as_dir in #((( + '') as_dir=./ ;; + */) ;; + *) as_dir=$as_dir/ ;; + esac + for ac_prog in egrep + do + for ac_exec_ext in '' $ac_executable_extensions; do + ac_path_EGREP="$as_dir$ac_prog$ac_exec_ext" + as_fn_executable_p "$ac_path_EGREP" || continue +# Check for GNU ac_path_EGREP and select it if it is found. + # Check for GNU $ac_path_EGREP +case `"$ac_path_EGREP" --version 2>&1` in +*GNU*) + ac_cv_path_EGREP="$ac_path_EGREP" ac_path_EGREP_found=:;; +*) + ac_count=0 + printf %s 0123456789 >"conftest.in" + while : + do + cat "conftest.in" "conftest.in" >"conftest.tmp" + mv "conftest.tmp" "conftest.in" + cp "conftest.in" "conftest.nl" + printf "%s\n" 'EGREP' >> "conftest.nl" + "$ac_path_EGREP" 'EGREP$' < "conftest.nl" >"conftest.out" 2>/dev/null || break + diff "conftest.out" "conftest.nl" >/dev/null 2>&1 || break + as_fn_arith $ac_count + 1 && ac_count=$as_val + if test $ac_count -gt ${ac_path_EGREP_max-0}; then + # Best one so far, save it but keep looking for a better one + ac_cv_path_EGREP="$ac_path_EGREP" + ac_path_EGREP_max=$ac_count + fi + # 10*(2^10) chars as input seems more than enough + test $ac_count -gt 10 && break + done + rm -f conftest.in conftest.tmp conftest.nl conftest.out;; +esac + + $ac_path_EGREP_found && break 3 + done + done + done +IFS=$as_save_IFS + if test -z "$ac_cv_path_EGREP"; then + as_fn_error $? 
"no acceptable egrep could be found in $PATH$PATH_SEPARATOR/usr/xpg4/bin" "$LINENO" 5 + fi else - am_cv_CXX_dependencies_compiler_type=none -fi - + ac_cv_path_EGREP=$EGREP fi -{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $am_cv_CXX_dependencies_compiler_type" >&5 -$as_echo "$am_cv_CXX_dependencies_compiler_type" >&6; } -CXXDEPMODE=depmode=$am_cv_CXX_dependencies_compiler_type - if - test "x$enable_dependency_tracking" != xno \ - && test "$am_cv_CXX_dependencies_compiler_type" = gcc3; then - am__fastdepCXX_TRUE= - am__fastdepCXX_FALSE='#' -else - am__fastdepCXX_TRUE='#' - am__fastdepCXX_FALSE= + fi fi +{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $ac_cv_path_EGREP" >&5 +printf "%s\n" "$ac_cv_path_EGREP" >&6; } + EGREP="$ac_cv_path_EGREP" -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for ANSI C header files" >&5 -$as_echo_n "checking for ANSI C header files... " >&6; } -if ${ac_cv_header_stdc+:} false; then : - $as_echo_n "(cached) " >&6 -else - cat confdefs.h - <<_ACEOF >conftest.$ac_ext -/* end confdefs.h. */ -#include -#include -#include -#include - -int -main () -{ +ac_fn_c_check_header_compile "$LINENO" "sys/endian.h" "ac_cv_header_sys_endian_h" "$ac_includes_default" +if test "x$ac_cv_header_sys_endian_h" = xyes +then : + printf "%s\n" "#define HAVE_SYS_ENDIAN_H 1" >>confdefs.h - ; - return 0; -} -_ACEOF -if ac_fn_c_try_compile "$LINENO"; then : - ac_cv_header_stdc=yes -else - ac_cv_header_stdc=no fi -rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext - -if test $ac_cv_header_stdc = yes; then - # SunOS 4.x string.h does not declare mem*, contrary to ANSI. - cat confdefs.h - <<_ACEOF >conftest.$ac_ext -/* end confdefs.h. */ -#include +ac_fn_c_check_header_compile "$LINENO" "sys/param.h" "ac_cv_header_sys_param_h" "$ac_includes_default" +if test "x$ac_cv_header_sys_param_h" = xyes +then : + printf "%s\n" "#define HAVE_SYS_PARAM_H 1" >>confdefs.h -_ACEOF -if (eval "$ac_cpp conftest.$ac_ext") 2>&5 | - $EGREP "memchr" >/dev/null 2>&1; then : - -else - ac_cv_header_stdc=no fi -rm -f conftest* +ac_fn_c_check_header_compile "$LINENO" "syslog.h" "ac_cv_header_syslog_h" "$ac_includes_default" +if test "x$ac_cv_header_syslog_h" = xyes +then : + printf "%s\n" "#define HAVE_SYSLOG_H 1" >>confdefs.h fi -if test $ac_cv_header_stdc = yes; then - # ISC 2.0.2 stdlib.h does not declare free, contrary to ANSI. - cat confdefs.h - <<_ACEOF >conftest.$ac_ext -/* end confdefs.h. */ -#include +# sys/sysctl.h requires sys/types.h on FreeBSD +# sys/sysctl.h requires sys/param.h on OpenBSD +ac_fn_c_check_header_compile "$LINENO" "sys/sysctl.h" "ac_cv_header_sys_sysctl_h" "#include +#ifdef HAVE_SYS_PARAM_H +#include +#endif -_ACEOF -if (eval "$ac_cpp conftest.$ac_ext") 2>&5 | - $EGREP "free" >/dev/null 2>&1; then : +" +if test "x$ac_cv_header_sys_sysctl_h" = xyes +then : + printf "%s\n" "#define HAVE_SYS_SYSCTL_H 1" >>confdefs.h -else - ac_cv_header_stdc=no fi -rm -f conftest* -fi -if test $ac_cv_header_stdc = yes; then - # /bin/cc in Irix-4.0.5 gets non-ANSI ctype macros unless using -ansi. - if test "$cross_compiling" = yes; then : - : -else - cat confdefs.h - <<_ACEOF >conftest.$ac_ext +{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for $CC options needed to detect all undeclared functions" >&5 +printf %s "checking for $CC options needed to detect all undeclared functions... 
" >&6; } +if test ${ac_cv_c_undeclared_builtin_options+y} +then : + printf %s "(cached) " >&6 +else $as_nop + ac_save_CFLAGS=$CFLAGS + ac_cv_c_undeclared_builtin_options='cannot detect' + for ac_arg in '' -fno-builtin; do + CFLAGS="$ac_save_CFLAGS $ac_arg" + # This test program should *not* compile successfully. + cat confdefs.h - <<_ACEOF >conftest.$ac_ext /* end confdefs.h. */ -#include -#include -#if ((' ' & 0x0FF) == 0x020) -# define ISLOWER(c) ('a' <= (c) && (c) <= 'z') -# define TOUPPER(c) (ISLOWER(c) ? 'A' + ((c) - 'a') : (c)) -#else -# define ISLOWER(c) \ - (('a' <= (c) && (c) <= 'i') \ - || ('j' <= (c) && (c) <= 'r') \ - || ('s' <= (c) && (c) <= 'z')) -# define TOUPPER(c) (ISLOWER(c) ? ((c) | 0x40) : (c)) -#endif -#define XOR(e, f) (((e) && !(f)) || (!(e) && (f))) int -main () +main (void) { - int i; - for (i = 0; i < 256; i++) - if (XOR (islower (i), ISLOWER (i)) - || toupper (i) != TOUPPER (i)) - return 2; +(void) strchr; + ; return 0; } _ACEOF -if ac_fn_c_try_run "$LINENO"; then : - -else - ac_cv_header_stdc=no -fi -rm -f core *.core core.conftest.* gmon.out bb.out conftest$ac_exeext \ - conftest.$ac_objext conftest.beam conftest.$ac_ext -fi - -fi -fi -{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_header_stdc" >&5 -$as_echo "$ac_cv_header_stdc" >&6; } -if test $ac_cv_header_stdc = yes; then - -$as_echo "#define STDC_HEADERS 1" >>confdefs.h +if ac_fn_c_try_compile "$LINENO" +then : + +else $as_nop + # This test program should compile successfully. + # No library function is consistently available on + # freestanding implementations, so test against a dummy + # declaration. Include always-available headers on the + # off chance that they somehow elicit warnings. + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +#include +#include +#include +#include +extern void ac_decl (int, char *); -fi +int +main (void) +{ +(void) ac_decl (0, (char *) 0); + (void) ac_decl; -# On IRIX 5.3, sys/types and inttypes.h are conflicting. 
-for ac_header in sys/types.h sys/stat.h stdlib.h string.h memory.h strings.h \ - inttypes.h stdint.h unistd.h -do : - as_ac_Header=`$as_echo "ac_cv_header_$ac_header" | $as_tr_sh` -ac_fn_c_check_header_compile "$LINENO" "$ac_header" "$as_ac_Header" "$ac_includes_default -" -if eval test \"x\$"$as_ac_Header"\" = x"yes"; then : - cat >>confdefs.h <<_ACEOF -#define `$as_echo "HAVE_$ac_header" | $as_tr_cpp` 1 + ; + return 0; +} _ACEOF - +if ac_fn_c_try_compile "$LINENO" +then : + if test x"$ac_arg" = x +then : + ac_cv_c_undeclared_builtin_options='none needed' +else $as_nop + ac_cv_c_undeclared_builtin_options=$ac_arg fi - -done - - -for ac_header in sys/endian.h sys/param.h syslog.h -do : - as_ac_Header=`$as_echo "ac_cv_header_$ac_header" | $as_tr_sh` -ac_fn_c_check_header_mongrel "$LINENO" "$ac_header" "$as_ac_Header" "$ac_includes_default" -if eval test \"x\$"$as_ac_Header"\" = x"yes"; then : - cat >>confdefs.h <<_ACEOF -#define `$as_echo "HAVE_$ac_header" | $as_tr_cpp` 1 -_ACEOF - + break fi - -done - -# sys/sysctl.h requires sys/types.h on FreeBSD -# sys/sysctl.h requires sys/param.h on OpenBSD -for ac_header in sys/sysctl.h -do : - ac_fn_c_check_header_compile "$LINENO" "sys/sysctl.h" "ac_cv_header_sys_sysctl_h" "#include -#ifdef HAVE_SYS_PARAM_H -#include -#endif - -" -if test "x$ac_cv_header_sys_sysctl_h" = xyes; then : - cat >>confdefs.h <<_ACEOF -#define HAVE_SYS_SYSCTL_H 1 -_ACEOF - +rm -f core conftest.err conftest.$ac_objext conftest.beam conftest.$ac_ext fi +rm -f core conftest.err conftest.$ac_objext conftest.beam conftest.$ac_ext + done + CFLAGS=$ac_save_CFLAGS + +fi +{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $ac_cv_c_undeclared_builtin_options" >&5 +printf "%s\n" "$ac_cv_c_undeclared_builtin_options" >&6; } + case $ac_cv_c_undeclared_builtin_options in #( + 'cannot detect') : + { { printf "%s\n" "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5 +printf "%s\n" "$as_me: error: in \`$ac_pwd':" >&2;} +as_fn_error $? 
"cannot make $CC report undeclared builtins +See \`config.log' for more details" "$LINENO" 5; } ;; #( + 'none needed') : + ac_c_undeclared_builtin_options='' ;; #( + *) : + ac_c_undeclared_builtin_options=$ac_cv_c_undeclared_builtin_options ;; +esac -done - - -ac_fn_c_check_decl "$LINENO" "be32dec" "ac_cv_have_decl_be32dec" "$ac_includes_default +ac_fn_check_decl "$LINENO" "be32dec" "ac_cv_have_decl_be32dec" "$ac_includes_default #ifdef HAVE_SYS_ENDIAN_H #include #endif -" -if test "x$ac_cv_have_decl_be32dec" = xyes; then : +" "$ac_c_undeclared_builtin_options" "CFLAGS" +if test "x$ac_cv_have_decl_be32dec" = xyes +then : ac_have_decl=1 -else +else $as_nop ac_have_decl=0 fi - -cat >>confdefs.h <<_ACEOF -#define HAVE_DECL_BE32DEC $ac_have_decl -_ACEOF -ac_fn_c_check_decl "$LINENO" "le32dec" "ac_cv_have_decl_le32dec" "$ac_includes_default +printf "%s\n" "#define HAVE_DECL_BE32DEC $ac_have_decl" >>confdefs.h +ac_fn_check_decl "$LINENO" "le32dec" "ac_cv_have_decl_le32dec" "$ac_includes_default #ifdef HAVE_SYS_ENDIAN_H #include #endif -" -if test "x$ac_cv_have_decl_le32dec" = xyes; then : +" "$ac_c_undeclared_builtin_options" "CFLAGS" +if test "x$ac_cv_have_decl_le32dec" = xyes +then : ac_have_decl=1 -else +else $as_nop ac_have_decl=0 fi - -cat >>confdefs.h <<_ACEOF -#define HAVE_DECL_LE32DEC $ac_have_decl -_ACEOF -ac_fn_c_check_decl "$LINENO" "be32enc" "ac_cv_have_decl_be32enc" "$ac_includes_default +printf "%s\n" "#define HAVE_DECL_LE32DEC $ac_have_decl" >>confdefs.h +ac_fn_check_decl "$LINENO" "be32enc" "ac_cv_have_decl_be32enc" "$ac_includes_default #ifdef HAVE_SYS_ENDIAN_H #include #endif -" -if test "x$ac_cv_have_decl_be32enc" = xyes; then : +" "$ac_c_undeclared_builtin_options" "CFLAGS" +if test "x$ac_cv_have_decl_be32enc" = xyes +then : ac_have_decl=1 -else +else $as_nop ac_have_decl=0 fi - -cat >>confdefs.h <<_ACEOF -#define HAVE_DECL_BE32ENC $ac_have_decl -_ACEOF -ac_fn_c_check_decl "$LINENO" "le32enc" "ac_cv_have_decl_le32enc" "$ac_includes_default +printf "%s\n" "#define HAVE_DECL_BE32ENC $ac_have_decl" >>confdefs.h +ac_fn_check_decl "$LINENO" "le32enc" "ac_cv_have_decl_le32enc" "$ac_includes_default #ifdef HAVE_SYS_ENDIAN_H #include #endif -" -if test "x$ac_cv_have_decl_le32enc" = xyes; then : +" "$ac_c_undeclared_builtin_options" "CFLAGS" +if test "x$ac_cv_have_decl_le32enc" = xyes +then : ac_have_decl=1 -else +else $as_nop ac_have_decl=0 fi - -cat >>confdefs.h <<_ACEOF -#define HAVE_DECL_LE32ENC $ac_have_decl -_ACEOF -ac_fn_c_check_decl "$LINENO" "le16dec" "ac_cv_have_decl_le16dec" "$ac_includes_default +printf "%s\n" "#define HAVE_DECL_LE32ENC $ac_have_decl" >>confdefs.h +ac_fn_check_decl "$LINENO" "le16dec" "ac_cv_have_decl_le16dec" "$ac_includes_default #ifdef HAVE_SYS_ENDIAN_H #include #endif -" -if test "x$ac_cv_have_decl_le16dec" = xyes; then : +" "$ac_c_undeclared_builtin_options" "CFLAGS" +if test "x$ac_cv_have_decl_le16dec" = xyes +then : ac_have_decl=1 -else +else $as_nop ac_have_decl=0 fi - -cat >>confdefs.h <<_ACEOF -#define HAVE_DECL_LE16DEC $ac_have_decl -_ACEOF -ac_fn_c_check_decl "$LINENO" "le16enc" "ac_cv_have_decl_le16enc" "$ac_includes_default +printf "%s\n" "#define HAVE_DECL_LE16DEC $ac_have_decl" >>confdefs.h +ac_fn_check_decl "$LINENO" "le16enc" "ac_cv_have_decl_le16enc" "$ac_includes_default #ifdef HAVE_SYS_ENDIAN_H #include #endif -" -if test "x$ac_cv_have_decl_le16enc" = xyes; then : +" "$ac_c_undeclared_builtin_options" "CFLAGS" +if test "x$ac_cv_have_decl_le16enc" = xyes +then : ac_have_decl=1 -else +else $as_nop ac_have_decl=0 fi - -cat >>confdefs.h 
<<_ACEOF -#define HAVE_DECL_LE16ENC $ac_have_decl -_ACEOF +printf "%s\n" "#define HAVE_DECL_LE16ENC $ac_have_decl" >>confdefs.h ac_fn_c_check_type "$LINENO" "size_t" "ac_cv_type_size_t" "$ac_includes_default" -if test "x$ac_cv_type_size_t" = xyes; then : +if test "x$ac_cv_type_size_t" = xyes +then : -else +else $as_nop -cat >>confdefs.h <<_ACEOF -#define size_t unsigned int -_ACEOF +printf "%s\n" "#define size_t unsigned int" >>confdefs.h fi # The Ultrix 4.2 mips builtin alloca declared by alloca.h only works # for constant arguments. Useless! -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for working alloca.h" >&5 -$as_echo_n "checking for working alloca.h... " >&6; } -if ${ac_cv_working_alloca_h+:} false; then : - $as_echo_n "(cached) " >&6 -else +{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for working alloca.h" >&5 +printf %s "checking for working alloca.h... " >&6; } +if test ${ac_cv_working_alloca_h+y} +then : + printf %s "(cached) " >&6 +else $as_nop cat confdefs.h - <<_ACEOF >conftest.$ac_ext /* end confdefs.h. */ #include int -main () +main (void) { char *p = (char *) alloca (2 * sizeof (int)); if (p) return 0; @@ -5535,52 +6348,52 @@ char *p = (char *) alloca (2 * sizeof (int)); return 0; } _ACEOF -if ac_fn_c_try_link "$LINENO"; then : +if ac_fn_c_try_link "$LINENO" +then : ac_cv_working_alloca_h=yes -else +else $as_nop ac_cv_working_alloca_h=no fi -rm -f core conftest.err conftest.$ac_objext \ +rm -f core conftest.err conftest.$ac_objext conftest.beam \ conftest$ac_exeext conftest.$ac_ext fi -{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_working_alloca_h" >&5 -$as_echo "$ac_cv_working_alloca_h" >&6; } +{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $ac_cv_working_alloca_h" >&5 +printf "%s\n" "$ac_cv_working_alloca_h" >&6; } if test $ac_cv_working_alloca_h = yes; then -$as_echo "#define HAVE_ALLOCA_H 1" >>confdefs.h +printf "%s\n" "#define HAVE_ALLOCA_H 1" >>confdefs.h fi -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for alloca" >&5 -$as_echo_n "checking for alloca... " >&6; } -if ${ac_cv_func_alloca_works+:} false; then : - $as_echo_n "(cached) " >&6 +{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for alloca" >&5 +printf %s "checking for alloca... " >&6; } +if test ${ac_cv_func_alloca_works+y} +then : + printf %s "(cached) " >&6 +else $as_nop + if test $ac_cv_working_alloca_h = yes; then + ac_cv_func_alloca_works=yes else cat confdefs.h - <<_ACEOF >conftest.$ac_ext /* end confdefs.h. 
*/ -#ifdef __GNUC__ -# define alloca __builtin_alloca -#else -# ifdef _MSC_VER +#include +#include +#ifndef alloca +# ifdef __GNUC__ +# define alloca __builtin_alloca +# elif defined _MSC_VER # include # define alloca _alloca # else -# ifdef HAVE_ALLOCA_H -# include -# else -# ifdef _AIX - #pragma alloca -# else -# ifndef alloca /* predefined by HP cc +Olibcalls */ -void *alloca (size_t); -# endif -# endif +# ifdef __cplusplus +extern "C" # endif +void *alloca (size_t); # endif #endif int -main () +main (void) { char *p = (char *) alloca (1); if (p) return 0; @@ -5588,20 +6401,22 @@ char *p = (char *) alloca (1); return 0; } _ACEOF -if ac_fn_c_try_link "$LINENO"; then : +if ac_fn_c_try_link "$LINENO" +then : ac_cv_func_alloca_works=yes -else +else $as_nop ac_cv_func_alloca_works=no fi -rm -f core conftest.err conftest.$ac_objext \ +rm -f core conftest.err conftest.$ac_objext conftest.beam \ conftest$ac_exeext conftest.$ac_ext fi -{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_func_alloca_works" >&5 -$as_echo "$ac_cv_func_alloca_works" >&6; } +{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $ac_cv_func_alloca_works" >&5 +printf "%s\n" "$ac_cv_func_alloca_works" >&6; } +fi if test $ac_cv_func_alloca_works = yes; then -$as_echo "#define HAVE_ALLOCA 1" >>confdefs.h +printf "%s\n" "#define HAVE_ALLOCA 1" >>confdefs.h else # The SVR3 libPW and SVR4 libucb both contain incompatible functions @@ -5611,58 +6426,19 @@ else ALLOCA=\${LIBOBJDIR}alloca.$ac_objext -$as_echo "#define C_ALLOCA 1" >>confdefs.h - - -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether \`alloca.c' needs Cray hooks" >&5 -$as_echo_n "checking whether \`alloca.c' needs Cray hooks... " >&6; } -if ${ac_cv_os_cray+:} false; then : - $as_echo_n "(cached) " >&6 -else - cat confdefs.h - <<_ACEOF >conftest.$ac_ext -/* end confdefs.h. */ -#if defined CRAY && ! defined CRAY2 -webecray -#else -wenotbecray -#endif - -_ACEOF -if (eval "$ac_cpp conftest.$ac_ext") 2>&5 | - $EGREP "webecray" >/dev/null 2>&1; then : - ac_cv_os_cray=yes -else - ac_cv_os_cray=no -fi -rm -f conftest* - -fi -{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_os_cray" >&5 -$as_echo "$ac_cv_os_cray" >&6; } -if test $ac_cv_os_cray = yes; then - for ac_func in _getb67 GETB67 getb67; do - as_ac_var=`$as_echo "ac_cv_func_$ac_func" | $as_tr_sh` -ac_fn_c_check_func "$LINENO" "$ac_func" "$as_ac_var" -if eval test \"x\$"$as_ac_var"\" = x"yes"; then : - -cat >>confdefs.h <<_ACEOF -#define CRAY_STACKSEG_END $ac_func -_ACEOF - - break -fi +printf "%s\n" "#define C_ALLOCA 1" >>confdefs.h - done -fi -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking stack direction for C alloca" >&5 -$as_echo_n "checking stack direction for C alloca... " >&6; } -if ${ac_cv_c_stack_direction+:} false; then : - $as_echo_n "(cached) " >&6 -else - if test "$cross_compiling" = yes; then : +{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking stack direction for C alloca" >&5 +printf %s "checking stack direction for C alloca... " >&6; } +if test ${ac_cv_c_stack_direction+y} +then : + printf %s "(cached) " >&6 +else $as_nop + if test "$cross_compiling" = yes +then : ac_cv_c_stack_direction=0 -else +else $as_nop cat confdefs.h - <<_ACEOF >conftest.$ac_ext /* end confdefs.h. 
*/ $ac_includes_default @@ -5683,9 +6459,10 @@ main (int argc, char **argv) return find_stack_direction (0, argc + !argv + 20) < 0; } _ACEOF -if ac_fn_c_try_run "$LINENO"; then : +if ac_fn_c_try_run "$LINENO" +then : ac_cv_c_stack_direction=1 -else +else $as_nop ac_cv_c_stack_direction=-1 fi rm -f core *.core core.conftest.* gmon.out bb.out conftest$ac_exeext \ @@ -5693,25 +6470,19 @@ rm -f core *.core core.conftest.* gmon.out bb.out conftest$ac_exeext \ fi fi -{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_c_stack_direction" >&5 -$as_echo "$ac_cv_c_stack_direction" >&6; } -cat >>confdefs.h <<_ACEOF -#define STACK_DIRECTION $ac_cv_c_stack_direction -_ACEOF +{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $ac_cv_c_stack_direction" >&5 +printf "%s\n" "$ac_cv_c_stack_direction" >&6; } +printf "%s\n" "#define STACK_DIRECTION $ac_cv_c_stack_direction" >>confdefs.h fi -for ac_func in getopt_long -do : - ac_fn_c_check_func "$LINENO" "getopt_long" "ac_cv_func_getopt_long" -if test "x$ac_cv_func_getopt_long" = xyes; then : - cat >>confdefs.h <<_ACEOF -#define HAVE_GETOPT_LONG 1 -_ACEOF +ac_fn_c_check_func "$LINENO" "getopt_long" "ac_cv_func_getopt_long" +if test "x$ac_cv_func_getopt_long" = xyes +then : + printf "%s\n" "#define HAVE_GETOPT_LONG 1" >>confdefs.h fi -done MINGW_TARGET=`$CC -dumpmachine 2>&1` @@ -5743,109 +6514,114 @@ case $MINGW_TARGET in esac # Check whether --enable-assembly was given. -if test "${enable_assembly+set}" = set; then : +if test ${enable_assembly+y} +then : enableval=$enable_assembly; fi if test x$enable_assembly != xno; then -$as_echo "#define USE_ASM 1" >>confdefs.h +printf "%s\n" "#define USE_ASM 1" >>confdefs.h fi if test x$enable_assembly != xno -a x$have_x86_64 = xtrue then - { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether we can compile AVX code" >&5 -$as_echo_n "checking whether we can compile AVX code... " >&6; } + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether we can compile AVX code" >&5 +printf %s "checking whether we can compile AVX code... " >&6; } cat confdefs.h - <<_ACEOF >conftest.$ac_ext /* end confdefs.h. */ int -main () +main (void) { asm ("vmovdqa %ymm0, %ymm1"); ; return 0; } _ACEOF -if ac_fn_c_try_compile "$LINENO"; then : +if ac_fn_c_try_compile "$LINENO" +then : -$as_echo "#define USE_AVX 1" >>confdefs.h +printf "%s\n" "#define USE_AVX 1" >>confdefs.h - { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5 -$as_echo "yes" >&6; } - { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether we can compile XOP code" >&5 -$as_echo_n "checking whether we can compile XOP code... " >&6; } + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: yes" >&5 +printf "%s\n" "yes" >&6; } + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether we can compile XOP code" >&5 +printf %s "checking whether we can compile XOP code... " >&6; } cat confdefs.h - <<_ACEOF >conftest.$ac_ext /* end confdefs.h. 
*/ int -main () +main (void) { asm ("vprotd \$7, %xmm0, %xmm1"); ; return 0; } _ACEOF -if ac_fn_c_try_compile "$LINENO"; then : +if ac_fn_c_try_compile "$LINENO" +then : -$as_echo "#define USE_XOP 1" >>confdefs.h +printf "%s\n" "#define USE_XOP 1" >>confdefs.h - { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5 -$as_echo "yes" >&6; } + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: yes" >&5 +printf "%s\n" "yes" >&6; } -else - { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 -$as_echo "no" >&6; } - { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: The assembler does not support the XOP instruction set." >&5 -$as_echo "$as_me: WARNING: The assembler does not support the XOP instruction set." >&2;} +else $as_nop + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: no" >&5 +printf "%s\n" "no" >&6; } + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: WARNING: The assembler does not support the XOP instruction set." >&5 +printf "%s\n" "$as_me: WARNING: The assembler does not support the XOP instruction set." >&2;} fi -rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext - { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether we can compile AVX2 code" >&5 -$as_echo_n "checking whether we can compile AVX2 code... " >&6; } +rm -f core conftest.err conftest.$ac_objext conftest.beam conftest.$ac_ext + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether we can compile AVX2 code" >&5 +printf %s "checking whether we can compile AVX2 code... " >&6; } cat confdefs.h - <<_ACEOF >conftest.$ac_ext /* end confdefs.h. */ int -main () +main (void) { asm ("vpaddd %ymm0, %ymm1, %ymm2"); ; return 0; } _ACEOF -if ac_fn_c_try_compile "$LINENO"; then : +if ac_fn_c_try_compile "$LINENO" +then : -$as_echo "#define USE_AVX2 1" >>confdefs.h +printf "%s\n" "#define USE_AVX2 1" >>confdefs.h - { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5 -$as_echo "yes" >&6; } + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: yes" >&5 +printf "%s\n" "yes" >&6; } -else - { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 -$as_echo "no" >&6; } - { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: The assembler does not support the AVX2 instruction set." >&5 -$as_echo "$as_me: WARNING: The assembler does not support the AVX2 instruction set." >&2;} +else $as_nop + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: no" >&5 +printf "%s\n" "no" >&6; } + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: WARNING: The assembler does not support the AVX2 instruction set." >&5 +printf "%s\n" "$as_me: WARNING: The assembler does not support the AVX2 instruction set." >&2;} fi -rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext +rm -f core conftest.err conftest.$ac_objext conftest.beam conftest.$ac_ext -else - { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 -$as_echo "no" >&6; } - { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: The assembler does not support the AVX instruction set." >&5 -$as_echo "$as_me: WARNING: The assembler does not support the AVX instruction set." >&2;} +else $as_nop + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: no" >&5 +printf "%s\n" "no" >&6; } + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: WARNING: The assembler does not support the AVX instruction set." >&5 +printf "%s\n" "$as_me: WARNING: The assembler does not support the AVX instruction set." 
>&2;} fi -rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext +rm -f core conftest.err conftest.$ac_objext conftest.beam conftest.$ac_ext fi -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for json_loads in -ljansson" >&5 -$as_echo_n "checking for json_loads in -ljansson... " >&6; } -if ${ac_cv_lib_jansson_json_loads+:} false; then : - $as_echo_n "(cached) " >&6 -else +{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for json_loads in -ljansson" >&5 +printf %s "checking for json_loads in -ljansson... " >&6; } +if test ${ac_cv_lib_jansson_json_loads+y} +then : + printf %s "(cached) " >&6 +else $as_nop ac_check_lib_save_LIBS=$LIBS LIBS="-ljansson $LIBS" cat confdefs.h - <<_ACEOF >conftest.$ac_ext @@ -5854,32 +6630,31 @@ cat confdefs.h - <<_ACEOF >conftest.$ac_ext /* Override any GCC internal prototype to avoid an error. Use char because int might match the return type of a GCC builtin and then its argument prototype would still apply. */ -#ifdef __cplusplus -extern "C" -#endif char json_loads (); int -main () +main (void) { return json_loads (); ; return 0; } _ACEOF -if ac_fn_c_try_link "$LINENO"; then : +if ac_fn_c_try_link "$LINENO" +then : ac_cv_lib_jansson_json_loads=yes -else +else $as_nop ac_cv_lib_jansson_json_loads=no fi -rm -f core conftest.err conftest.$ac_objext \ +rm -f core conftest.err conftest.$ac_objext conftest.beam \ conftest$ac_exeext conftest.$ac_ext LIBS=$ac_check_lib_save_LIBS fi -{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_lib_jansson_json_loads" >&5 -$as_echo "$ac_cv_lib_jansson_json_loads" >&6; } -if test "x$ac_cv_lib_jansson_json_loads" = xyes; then : +{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $ac_cv_lib_jansson_json_loads" >&5 +printf "%s\n" "$ac_cv_lib_jansson_json_loads" >&6; } +if test "x$ac_cv_lib_jansson_json_loads" = xyes +then : request_jansson=false -else +else $as_nop request_jansson=true fi @@ -5887,11 +6662,12 @@ fi # GC2 for GNU static if test "x$have_win32" = "xtrue" ; then # MinGW - { $as_echo "$as_me:${as_lineno-$LINENO}: checking for pthread_create in -lpthread" >&5 -$as_echo_n "checking for pthread_create in -lpthread... " >&6; } -if ${ac_cv_lib_pthread_pthread_create+:} false; then : - $as_echo_n "(cached) " >&6 -else + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for pthread_create in -lpthread" >&5 +printf %s "checking for pthread_create in -lpthread... " >&6; } +if test ${ac_cv_lib_pthread_pthread_create+y} +then : + printf %s "(cached) " >&6 +else $as_nop ac_check_lib_save_LIBS=$LIBS LIBS="-lpthread $LIBS" cat confdefs.h - <<_ACEOF >conftest.$ac_ext @@ -5900,39 +6676,39 @@ cat confdefs.h - <<_ACEOF >conftest.$ac_ext /* Override any GCC internal prototype to avoid an error. Use char because int might match the return type of a GCC builtin and then its argument prototype would still apply. 
*/ -#ifdef __cplusplus -extern "C" -#endif char pthread_create (); int -main () +main (void) { return pthread_create (); ; return 0; } _ACEOF -if ac_fn_c_try_link "$LINENO"; then : +if ac_fn_c_try_link "$LINENO" +then : ac_cv_lib_pthread_pthread_create=yes -else +else $as_nop ac_cv_lib_pthread_pthread_create=no fi -rm -f core conftest.err conftest.$ac_objext \ +rm -f core conftest.err conftest.$ac_objext conftest.beam \ conftest$ac_exeext conftest.$ac_ext LIBS=$ac_check_lib_save_LIBS fi -{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_lib_pthread_pthread_create" >&5 -$as_echo "$ac_cv_lib_pthread_pthread_create" >&6; } -if test "x$ac_cv_lib_pthread_pthread_create" = xyes; then : +{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $ac_cv_lib_pthread_pthread_create" >&5 +printf "%s\n" "$ac_cv_lib_pthread_pthread_create" >&6; } +if test "x$ac_cv_lib_pthread_pthread_create" = xyes +then : PTHREAD_LIBS="-lpthreadGC2" fi else - { $as_echo "$as_me:${as_lineno-$LINENO}: checking for pthread_create in -lpthread" >&5 -$as_echo_n "checking for pthread_create in -lpthread... " >&6; } -if ${ac_cv_lib_pthread_pthread_create+:} false; then : - $as_echo_n "(cached) " >&6 -else + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for pthread_create in -lpthread" >&5 +printf %s "checking for pthread_create in -lpthread... " >&6; } +if test ${ac_cv_lib_pthread_pthread_create+y} +then : + printf %s "(cached) " >&6 +else $as_nop ac_check_lib_save_LIBS=$LIBS LIBS="-lpthread $LIBS" cat confdefs.h - <<_ACEOF >conftest.$ac_ext @@ -5941,30 +6717,29 @@ cat confdefs.h - <<_ACEOF >conftest.$ac_ext /* Override any GCC internal prototype to avoid an error. Use char because int might match the return type of a GCC builtin and then its argument prototype would still apply. */ -#ifdef __cplusplus -extern "C" -#endif char pthread_create (); int -main () +main (void) { return pthread_create (); ; return 0; } _ACEOF -if ac_fn_c_try_link "$LINENO"; then : +if ac_fn_c_try_link "$LINENO" +then : ac_cv_lib_pthread_pthread_create=yes -else +else $as_nop ac_cv_lib_pthread_pthread_create=no fi -rm -f core conftest.err conftest.$ac_objext \ +rm -f core conftest.err conftest.$ac_objext conftest.beam \ conftest$ac_exeext conftest.$ac_ext LIBS=$ac_check_lib_save_LIBS fi -{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_lib_pthread_pthread_create" >&5 -$as_echo "$ac_cv_lib_pthread_pthread_create" >&6; } -if test "x$ac_cv_lib_pthread_pthread_create" = xyes; then : +{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $ac_cv_lib_pthread_pthread_create" >&5 +printf "%s\n" "$ac_cv_lib_pthread_pthread_create" >&6; } +if test "x$ac_cv_lib_pthread_pthread_create" = xyes +then : PTHREAD_LIBS="-lpthread" fi @@ -5973,32 +6748,33 @@ fi LDFLAGS="$PTHREAD_LDFLAGS $LDFLAGS" # PTHREAD_LIBS="$PTHREAD_LIBS" -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether __uint128_t is supported" >&5 -$as_echo_n "checking whether __uint128_t is supported... " >&6; } +{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether __uint128_t is supported" >&5 +printf %s "checking whether __uint128_t is supported... " >&6; } cat confdefs.h - <<_ACEOF >conftest.$ac_ext /* end confdefs.h. 
*/ static __uint128_t i = 100; int -main () +main (void) { ; return 0; } _ACEOF -if ac_fn_c_try_compile "$LINENO"; then : +if ac_fn_c_try_compile "$LINENO" +then : -$as_echo "#define USE_INT128 1" >>confdefs.h +printf "%s\n" "#define USE_INT128 1" >>confdefs.h - { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5 -$as_echo "yes" >&6; } + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: yes" >&5 +printf "%s\n" "yes" >&6; } -else - { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 -$as_echo "no" >&6; } +else $as_nop + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: no" >&5 +printf "%s\n" "no" >&6; } fi -rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext +rm -f core conftest.err conftest.$ac_objext conftest.beam conftest.$ac_ext # allow if in Makefile.am if test x$request_jansson = xtrue; then @@ -6067,7 +6843,8 @@ fi # libcurl install path (for mingw : --with-curl=/usr/local) # Check whether --with-curl was given. -if test "${with_curl+set}" = set; then : +if test ${with_curl+y} +then : withval=$with_curl; fi @@ -6082,7 +6859,8 @@ fi # SSL install path (for mingw : --with-crypto=/usr/local/ssl) # Check whether --with-crypto was given. -if test "${with_crypto+set}" = set; then : +if test ${with_crypto+y} +then : withval=$with_crypto; fi @@ -6119,7 +6897,7 @@ LDFLAGS="$LDFLAGS $LIBCURL_LDFLAGS" -ac_config_files="$ac_config_files Makefile compat/Makefile compat/jansson/Makefile" +ac_config_files="$ac_config_files Makefile compat/Makefile compat/jansson/Makefile " cat >confcache <<\_ACEOF # This file is a shell script that caches the results of configure @@ -6148,8 +6926,8 @@ _ACEOF case $ac_val in #( *${as_nl}*) case $ac_var in #( - *_cv_*) { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: cache variable $ac_var contains a newline" >&5 -$as_echo "$as_me: WARNING: cache variable $ac_var contains a newline" >&2;} ;; + *_cv_*) { printf "%s\n" "$as_me:${as_lineno-$LINENO}: WARNING: cache variable $ac_var contains a newline" >&5 +printf "%s\n" "$as_me: WARNING: cache variable $ac_var contains a newline" >&2;} ;; esac case $ac_var in #( _ | IFS | as_nl) ;; #( @@ -6179,15 +6957,15 @@ $as_echo "$as_me: WARNING: cache variable $ac_var contains a newline" >&2;} ;; /^ac_cv_env_/b end t clear :clear - s/^\([^=]*\)=\(.*[{}].*\)$/test "${\1+set}" = set || &/ + s/^\([^=]*\)=\(.*[{}].*\)$/test ${\1+y} || &/ t end s/^\([^=]*\)=\(.*\)$/\1=${\1=\2}/ :end' >>confcache if diff "$cache_file" confcache >/dev/null 2>&1; then :; else if test -w "$cache_file"; then if test "x$cache_file" != "x/dev/null"; then - { $as_echo "$as_me:${as_lineno-$LINENO}: updating cache $cache_file" >&5 -$as_echo "$as_me: updating cache $cache_file" >&6;} + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: updating cache $cache_file" >&5 +printf "%s\n" "$as_me: updating cache $cache_file" >&6;} if test ! -f "$cache_file" || test -h "$cache_file"; then cat confcache >"$cache_file" else @@ -6201,8 +6979,8 @@ $as_echo "$as_me: updating cache $cache_file" >&6;} fi fi else - { $as_echo "$as_me:${as_lineno-$LINENO}: not updating unwritable cache $cache_file" >&5 -$as_echo "$as_me: not updating unwritable cache $cache_file" >&6;} + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: not updating unwritable cache $cache_file" >&5 +printf "%s\n" "$as_me: not updating unwritable cache $cache_file" >&6;} fi fi rm -f confcache @@ -6219,7 +6997,7 @@ U= for ac_i in : $LIBOBJS; do test "x$ac_i" = x: && continue # 1. Remove the extension, and $U if already installed. 
ac_script='s/\$U\././;s/\.o$//;s/\.obj$//' - ac_i=`$as_echo "$ac_i" | sed "$ac_script"` + ac_i=`printf "%s\n" "$ac_i" | sed "$ac_script"` # 2. Prepend LIBOBJDIR. When used with automake>=1.10 LIBOBJDIR # will be set to the directory where LIBOBJS objects are built. as_fn_append ac_libobjs " \${LIBOBJDIR}$ac_i\$U.$ac_objext" @@ -6230,14 +7008,14 @@ LIBOBJS=$ac_libobjs LTLIBOBJS=$ac_ltlibobjs -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking that generated files are newer than configure" >&5 -$as_echo_n "checking that generated files are newer than configure... " >&6; } +{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking that generated files are newer than configure" >&5 +printf %s "checking that generated files are newer than configure... " >&6; } if test -n "$am_sleep_pid"; then # Hide warnings about reused PIDs. wait $am_sleep_pid 2>/dev/null fi - { $as_echo "$as_me:${as_lineno-$LINENO}: result: done" >&5 -$as_echo "done" >&6; } + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: done" >&5 +printf "%s\n" "done" >&6; } if test -n "$EXEEXT"; then am__EXEEXT_TRUE= am__EXEEXT_FALSE='#' @@ -6299,8 +7077,8 @@ fi ac_write_fail=0 ac_clean_files_save=$ac_clean_files ac_clean_files="$ac_clean_files $CONFIG_STATUS" -{ $as_echo "$as_me:${as_lineno-$LINENO}: creating $CONFIG_STATUS" >&5 -$as_echo "$as_me: creating $CONFIG_STATUS" >&6;} +{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: creating $CONFIG_STATUS" >&5 +printf "%s\n" "$as_me: creating $CONFIG_STATUS" >&6;} as_write_fail=0 cat >$CONFIG_STATUS <<_ASEOF || as_write_fail=1 #! $SHELL @@ -6323,14 +7101,16 @@ cat >>$CONFIG_STATUS <<\_ASEOF || as_write_fail=1 # Be more Bourne compatible DUALCASE=1; export DUALCASE # for MKS sh -if test -n "${ZSH_VERSION+set}" && (emulate sh) >/dev/null 2>&1; then : +as_nop=: +if test ${ZSH_VERSION+y} && (emulate sh) >/dev/null 2>&1 +then : emulate sh NULLCMD=: # Pre-4.2 versions of Zsh do word splitting on ${1+"$@"}, which # is contrary to our usage. Disable this feature. alias -g '${1+"$@"}'='"$@"' setopt NO_GLOB_SUBST -else +else $as_nop case `(set -o) 2>/dev/null` in #( *posix*) : set -o posix ;; #( @@ -6340,46 +7120,46 @@ esac fi + +# Reset variables that may have inherited troublesome values from +# the environment. + +# IFS needs to be set, to space, tab, and newline, in precisely that order. +# (If _AS_PATH_WALK were called with IFS unset, it would have the +# side effect of setting IFS to empty, thus disabling word splitting.) +# Quoting is to prevent editors from complaining about space-tab. as_nl=' ' export as_nl -# Printing a long string crashes Solaris 7 /usr/bin/printf. -as_echo='\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\' -as_echo=$as_echo$as_echo$as_echo$as_echo$as_echo -as_echo=$as_echo$as_echo$as_echo$as_echo$as_echo$as_echo -# Prefer a ksh shell builtin over an external printf program on Solaris, -# but without wasting forks for bash or zsh. 
-if test -z "$BASH_VERSION$ZSH_VERSION" \ - && (test "X`print -r -- $as_echo`" = "X$as_echo") 2>/dev/null; then - as_echo='print -r --' - as_echo_n='print -rn --' -elif (test "X`printf %s $as_echo`" = "X$as_echo") 2>/dev/null; then - as_echo='printf %s\n' - as_echo_n='printf %s' -else - if test "X`(/usr/ucb/echo -n -n $as_echo) 2>/dev/null`" = "X-n $as_echo"; then - as_echo_body='eval /usr/ucb/echo -n "$1$as_nl"' - as_echo_n='/usr/ucb/echo -n' - else - as_echo_body='eval expr "X$1" : "X\\(.*\\)"' - as_echo_n_body='eval - arg=$1; - case $arg in #( - *"$as_nl"*) - expr "X$arg" : "X\\(.*\\)$as_nl"; - arg=`expr "X$arg" : ".*$as_nl\\(.*\\)"`;; - esac; - expr "X$arg" : "X\\(.*\\)" | tr -d "$as_nl" - ' - export as_echo_n_body - as_echo_n='sh -c $as_echo_n_body as_echo' - fi - export as_echo_body - as_echo='sh -c $as_echo_body as_echo' -fi +IFS=" "" $as_nl" + +PS1='$ ' +PS2='> ' +PS4='+ ' + +# Ensure predictable behavior from utilities with locale-dependent output. +LC_ALL=C +export LC_ALL +LANGUAGE=C +export LANGUAGE + +# We cannot yet rely on "unset" to work, but we need these variables +# to be unset--not just set to an empty or harmless value--now, to +# avoid bugs in old shells (e.g. pre-3.0 UWIN ksh). This construct +# also avoids known problems related to "unset" and subshell syntax +# in other old shells (e.g. bash 2.01 and pdksh 5.2.14). +for as_var in BASH_ENV ENV MAIL MAILPATH CDPATH +do eval test \${$as_var+y} \ + && ( (unset $as_var) || exit 1) >/dev/null 2>&1 && unset $as_var || : +done + +# Ensure that fds 0, 1, and 2 are open. +if (exec 3>&0) 2>/dev/null; then :; else exec 0&1) 2>/dev/null; then :; else exec 1>/dev/null; fi +if (exec 3>&2) ; then :; else exec 2>/dev/null; fi # The user is always right. -if test "${PATH_SEPARATOR+set}" != set; then +if ${PATH_SEPARATOR+false} :; then PATH_SEPARATOR=: (PATH='/bin;/bin'; FPATH=$PATH; sh -c :) >/dev/null 2>&1 && { (PATH='/bin:/bin'; FPATH=$PATH; sh -c :) >/dev/null 2>&1 || @@ -6388,13 +7168,6 @@ if test "${PATH_SEPARATOR+set}" != set; then fi -# IFS -# We need space, tab and new line, in precisely that order. Quoting is -# there to prevent editors from complaining about space-tab. -# (If _AS_PATH_WALK were called with IFS unset, it would disable word -# splitting by setting IFS to empty value.) -IFS=" "" $as_nl" - # Find who we are. Look in the path if we contain no directory separator. as_myself= case $0 in #(( @@ -6403,8 +7176,12 @@ case $0 in #(( for as_dir in $PATH do IFS=$as_save_IFS - test -z "$as_dir" && as_dir=. - test -r "$as_dir/$0" && as_myself=$as_dir/$0 && break + case $as_dir in #((( + '') as_dir=./ ;; + */) ;; + *) as_dir=$as_dir/ ;; + esac + test -r "$as_dir$0" && as_myself=$as_dir$0 && break done IFS=$as_save_IFS @@ -6416,30 +7193,10 @@ if test "x$as_myself" = x; then as_myself=$0 fi if test ! -f "$as_myself"; then - $as_echo "$as_myself: error: cannot find myself; rerun with an absolute file name" >&2 + printf "%s\n" "$as_myself: error: cannot find myself; rerun with an absolute file name" >&2 exit 1 fi -# Unset variables that we do not need and which cause bugs (e.g. in -# pre-3.0 UWIN ksh). But do not cause bugs in bash 2.01; the "|| exit 1" -# suppresses any "Segmentation fault" message there. '((' could -# trigger a bug in pdksh 5.2.14. -for as_var in BASH_ENV ENV MAIL MAILPATH -do eval test x\${$as_var+set} = xset \ - && ( (unset $as_var) || exit 1) >/dev/null 2>&1 && unset $as_var || : -done -PS1='$ ' -PS2='> ' -PS4='+ ' - -# NLS nuisances. -LC_ALL=C -export LC_ALL -LANGUAGE=C -export LANGUAGE - -# CDPATH. 
-(unset CDPATH) >/dev/null 2>&1 && unset CDPATH # as_fn_error STATUS ERROR [LINENO LOG_FD] @@ -6452,13 +7209,14 @@ as_fn_error () as_status=$1; test $as_status -eq 0 && as_status=1 if test "$4"; then as_lineno=${as_lineno-"$3"} as_lineno_stack=as_lineno_stack=$as_lineno_stack - $as_echo "$as_me:${as_lineno-$LINENO}: error: $2" >&$4 + printf "%s\n" "$as_me:${as_lineno-$LINENO}: error: $2" >&$4 fi - $as_echo "$as_me: error: $2" >&2 + printf "%s\n" "$as_me: error: $2" >&2 as_fn_exit $as_status } # as_fn_error + # as_fn_set_status STATUS # ----------------------- # Set $? to STATUS, without forking. @@ -6485,18 +7243,20 @@ as_fn_unset () { eval $1=; unset $1;} } as_unset=as_fn_unset + # as_fn_append VAR VALUE # ---------------------- # Append the text in VALUE to the end of the definition contained in VAR. Take # advantage of any shell optimizations that allow amortized linear growth over # repeated appends, instead of the typical quadratic growth present in naive # implementations. -if (eval "as_var=1; as_var+=2; test x\$as_var = x12") 2>/dev/null; then : +if (eval "as_var=1; as_var+=2; test x\$as_var = x12") 2>/dev/null +then : eval 'as_fn_append () { eval $1+=\$2 }' -else +else $as_nop as_fn_append () { eval $1=\$$1\$2 @@ -6508,12 +7268,13 @@ fi # as_fn_append # Perform arithmetic evaluation on the ARGs, and store the result in the # global $as_val. Take advantage of shells that can avoid forks. The arguments # must be portable across $(()) and expr. -if (eval "test \$(( 1 + 1 )) = 2") 2>/dev/null; then : +if (eval "test \$(( 1 + 1 )) = 2") 2>/dev/null +then : eval 'as_fn_arith () { as_val=$(( $* )) }' -else +else $as_nop as_fn_arith () { as_val=`expr "$@" || test $? -eq 1` @@ -6544,7 +7305,7 @@ as_me=`$as_basename -- "$0" || $as_expr X/"$0" : '.*/\([^/][^/]*\)/*$' \| \ X"$0" : 'X\(//\)$' \| \ X"$0" : 'X\(/\)' \| . 2>/dev/null || -$as_echo X/"$0" | +printf "%s\n" X/"$0" | sed '/^.*\/\([^/][^/]*\)\/*$/{ s//\1/ q @@ -6566,6 +7327,10 @@ as_cr_Letters=$as_cr_letters$as_cr_LETTERS as_cr_digits='0123456789' as_cr_alnum=$as_cr_Letters$as_cr_digits + +# Determine whether it's possible to make 'echo' print without a newline. +# These variables are no longer used directly by Autoconf, but are AC_SUBSTed +# for compatibility with existing Makefiles. ECHO_C= ECHO_N= ECHO_T= case `echo -n x` in #((((( -n*) @@ -6579,6 +7344,12 @@ case `echo -n x` in #((((( ECHO_N='-n';; esac +# For backward compatibility with old third-party macros, we provide +# the shell variables $as_echo and $as_echo_n. New code should use +# AS_ECHO(["message"]) and AS_ECHO_N(["message"]), respectively. +as_echo='printf %s\n' +as_echo_n='printf %s' + rm -f conf$$ conf$$.exe conf$$.file if test -d conf$$.dir; then rm -f conf$$.dir/conf$$.file @@ -6620,7 +7391,7 @@ as_fn_mkdir_p () as_dirs= while :; do case $as_dir in #( - *\'*) as_qdir=`$as_echo "$as_dir" | sed "s/'/'\\\\\\\\''/g"`;; #'( + *\'*) as_qdir=`printf "%s\n" "$as_dir" | sed "s/'/'\\\\\\\\''/g"`;; #'( *) as_qdir=$as_dir;; esac as_dirs="'$as_qdir' $as_dirs" @@ -6629,7 +7400,7 @@ $as_expr X"$as_dir" : 'X\(.*[^/]\)//*[^/][^/]*/*$' \| \ X"$as_dir" : 'X\(//\)[^/]' \| \ X"$as_dir" : 'X\(//\)$' \| \ X"$as_dir" : 'X\(/\)' \| . 2>/dev/null || -$as_echo X"$as_dir" | +printf "%s\n" X"$as_dir" | sed '/^X\(.*[^/]\)\/\/*[^/][^/]*\/*$/{ s//\1/ q @@ -6692,7 +7463,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 # values after options handling. ac_log=" This file was extended by cpuminer-kudaraidee $as_me 1.1.0, which was -generated by GNU Autoconf 2.69. 
Invocation command line was +generated by GNU Autoconf 2.71. Invocation command line was CONFIG_FILES = $CONFIG_FILES CONFIG_HEADERS = $CONFIG_HEADERS @@ -6754,14 +7525,16 @@ $config_commands Report bugs to the package provider." _ACEOF +ac_cs_config=`printf "%s\n" "$ac_configure_args" | sed "$ac_safe_unquote"` +ac_cs_config_escaped=`printf "%s\n" "$ac_cs_config" | sed "s/^ //; s/'/'\\\\\\\\''/g"` cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 -ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`" +ac_cs_config='$ac_cs_config_escaped' ac_cs_version="\\ cpuminer-kudaraidee config.status 1.1.0 -configured by $0, generated by GNU Autoconf 2.69, +configured by $0, generated by GNU Autoconf 2.71, with options \\"\$ac_cs_config\\" -Copyright (C) 2012 Free Software Foundation, Inc. +Copyright (C) 2021 Free Software Foundation, Inc. This config.status script is free software; the Free Software Foundation gives unlimited permission to copy, distribute and modify it." @@ -6801,15 +7574,15 @@ do -recheck | --recheck | --rechec | --reche | --rech | --rec | --re | --r) ac_cs_recheck=: ;; --version | --versio | --versi | --vers | --ver | --ve | --v | -V ) - $as_echo "$ac_cs_version"; exit ;; + printf "%s\n" "$ac_cs_version"; exit ;; --config | --confi | --conf | --con | --co | --c ) - $as_echo "$ac_cs_config"; exit ;; + printf "%s\n" "$ac_cs_config"; exit ;; --debug | --debu | --deb | --de | --d | -d ) debug=: ;; --file | --fil | --fi | --f ) $ac_shift case $ac_optarg in - *\'*) ac_optarg=`$as_echo "$ac_optarg" | sed "s/'/'\\\\\\\\''/g"` ;; + *\'*) ac_optarg=`printf "%s\n" "$ac_optarg" | sed "s/'/'\\\\\\\\''/g"` ;; '') as_fn_error $? "missing file argument" ;; esac as_fn_append CONFIG_FILES " '$ac_optarg'" @@ -6817,7 +7590,7 @@ do --header | --heade | --head | --hea ) $ac_shift case $ac_optarg in - *\'*) ac_optarg=`$as_echo "$ac_optarg" | sed "s/'/'\\\\\\\\''/g"` ;; + *\'*) ac_optarg=`printf "%s\n" "$ac_optarg" | sed "s/'/'\\\\\\\\''/g"` ;; esac as_fn_append CONFIG_HEADERS " '$ac_optarg'" ac_need_defaults=false;; @@ -6826,7 +7599,7 @@ do as_fn_error $? "ambiguous option: \`$1' Try \`$0 --help' for more information.";; --help | --hel | -h ) - $as_echo "$ac_cs_usage"; exit ;; + printf "%s\n" "$ac_cs_usage"; exit ;; -q | -quiet | --quiet | --quie | --qui | --qu | --q \ | -silent | --silent | --silen | --sile | --sil | --si | --s) ac_cs_silent=: ;; @@ -6854,7 +7627,7 @@ cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 if \$ac_cs_recheck; then set X $SHELL '$0' $ac_configure_args \$ac_configure_extra_args --no-create --no-recursion shift - \$as_echo "running CONFIG_SHELL=$SHELL \$*" >&6 + \printf "%s\n" "running CONFIG_SHELL=$SHELL \$*" >&6 CONFIG_SHELL='$SHELL' export CONFIG_SHELL exec "\$@" @@ -6868,7 +7641,7 @@ exec 5>>config.log sed 'h;s/./-/g;s/^.../## /;s/...$/ ##/;p;x;p;x' <<_ASBOX ## Running $as_me. 
## _ASBOX - $as_echo "$ac_log" + printf "%s\n" "$ac_log" } >&5 _ACEOF @@ -6876,7 +7649,7 @@ cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 # # INIT-COMMANDS # -AMDEP_TRUE="$AMDEP_TRUE" ac_aux_dir="$ac_aux_dir" +AMDEP_TRUE="$AMDEP_TRUE" MAKE="${MAKE-make}" _ACEOF @@ -6888,9 +7661,10 @@ do case $ac_config_target in "cpuminer-config.h") CONFIG_HEADERS="$CONFIG_HEADERS cpuminer-config.h" ;; "depfiles") CONFIG_COMMANDS="$CONFIG_COMMANDS depfiles" ;; - "Makefile") CONFIG_FILES="$CONFIG_FILES Makefile" ;; - "compat/Makefile") CONFIG_FILES="$CONFIG_FILES compat/Makefile" ;; - "compat/jansson/Makefile") CONFIG_FILES="$CONFIG_FILES compat/jansson/Makefile" ;; + " ") CONFIG_FILES="$CONFIG_FILES " ;; + "Makefile ") CONFIG_FILES="$CONFIG_FILES Makefile " ;; + "compat/Makefile ") CONFIG_FILES="$CONFIG_FILES compat/Makefile " ;; + "compat/jansson/Makefile ") CONFIG_FILES="$CONFIG_FILES compat/jansson/Makefile " ;; *) as_fn_error $? "invalid argument: \`$ac_config_target'" "$LINENO" 5;; esac @@ -6902,9 +7676,9 @@ done # We use the long form for the default assignment because of an extremely # bizarre bug on SunOS 4.1.3. if $ac_need_defaults; then - test "${CONFIG_FILES+set}" = set || CONFIG_FILES=$config_files - test "${CONFIG_HEADERS+set}" = set || CONFIG_HEADERS=$config_headers - test "${CONFIG_COMMANDS+set}" = set || CONFIG_COMMANDS=$config_commands + test ${CONFIG_FILES+y} || CONFIG_FILES=$config_files + test ${CONFIG_HEADERS+y} || CONFIG_HEADERS=$config_headers + test ${CONFIG_COMMANDS+y} || CONFIG_COMMANDS=$config_commands fi # Have a temporary directory for convenience. Make it in the build tree @@ -7240,7 +8014,7 @@ do esac || as_fn_error 1 "cannot find input file: \`$ac_f'" "$LINENO" 5;; esac - case $ac_f in *\'*) ac_f=`$as_echo "$ac_f" | sed "s/'/'\\\\\\\\''/g"`;; esac + case $ac_f in *\'*) ac_f=`printf "%s\n" "$ac_f" | sed "s/'/'\\\\\\\\''/g"`;; esac as_fn_append ac_file_inputs " '$ac_f'" done @@ -7248,17 +8022,17 @@ do # use $as_me), people would be surprised to read: # /* config.h. Generated by config.status. */ configure_input='Generated from '` - $as_echo "$*" | sed 's|^[^:]*/||;s|:[^:]*/|, |g' + printf "%s\n" "$*" | sed 's|^[^:]*/||;s|:[^:]*/|, |g' `' by configure.' if test x"$ac_file" != x-; then configure_input="$ac_file. $configure_input" - { $as_echo "$as_me:${as_lineno-$LINENO}: creating $ac_file" >&5 -$as_echo "$as_me: creating $ac_file" >&6;} + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: creating $ac_file" >&5 +printf "%s\n" "$as_me: creating $ac_file" >&6;} fi # Neutralize special characters interpreted by sed in replacement strings. case $configure_input in #( *\&* | *\|* | *\\* ) - ac_sed_conf_input=`$as_echo "$configure_input" | + ac_sed_conf_input=`printf "%s\n" "$configure_input" | sed 's/[\\\\&|]/\\\\&/g'`;; #( *) ac_sed_conf_input=$configure_input;; esac @@ -7275,7 +8049,7 @@ $as_expr X"$ac_file" : 'X\(.*[^/]\)//*[^/][^/]*/*$' \| \ X"$ac_file" : 'X\(//\)[^/]' \| \ X"$ac_file" : 'X\(//\)$' \| \ X"$ac_file" : 'X\(/\)' \| . 2>/dev/null || -$as_echo X"$ac_file" | +printf "%s\n" X"$ac_file" | sed '/^X\(.*[^/]\)\/\/*[^/][^/]*\/*$/{ s//\1/ q @@ -7299,9 +8073,9 @@ $as_echo X"$ac_file" | case "$ac_dir" in .) ac_dir_suffix= ac_top_builddir_sub=. ac_top_build_prefix= ;; *) - ac_dir_suffix=/`$as_echo "$ac_dir" | sed 's|^\.[\\/]||'` + ac_dir_suffix=/`printf "%s\n" "$ac_dir" | sed 's|^\.[\\/]||'` # A ".." for each directory in $ac_dir_suffix. 
- ac_top_builddir_sub=`$as_echo "$ac_dir_suffix" | sed 's|/[^\\/]*|/..|g;s|/||'` + ac_top_builddir_sub=`printf "%s\n" "$ac_dir_suffix" | sed 's|/[^\\/]*|/..|g;s|/||'` case $ac_top_builddir_sub in "") ac_top_builddir_sub=. ac_top_build_prefix= ;; *) ac_top_build_prefix=$ac_top_builddir_sub/ ;; @@ -7363,8 +8137,8 @@ ac_sed_dataroot=' case `eval "sed -n \"\$ac_sed_dataroot\" $ac_file_inputs"` in *datarootdir*) ac_datarootdir_seen=yes;; *@datadir@*|*@docdir@*|*@infodir@*|*@localedir@*|*@mandir@*) - { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: $ac_file_inputs seems to ignore the --datarootdir setting" >&5 -$as_echo "$as_me: WARNING: $ac_file_inputs seems to ignore the --datarootdir setting" >&2;} + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: WARNING: $ac_file_inputs seems to ignore the --datarootdir setting" >&5 +printf "%s\n" "$as_me: WARNING: $ac_file_inputs seems to ignore the --datarootdir setting" >&2;} _ACEOF cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 ac_datarootdir_hack=' @@ -7408,9 +8182,9 @@ test -z "$ac_datarootdir_hack$ac_datarootdir_seen" && { ac_out=`sed -n '/\${datarootdir}/p' "$ac_tmp/out"`; test -n "$ac_out"; } && { ac_out=`sed -n '/^[ ]*datarootdir[ ]*:*=/p' \ "$ac_tmp/out"`; test -z "$ac_out"; } && - { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: $ac_file contains a reference to the variable \`datarootdir' + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: WARNING: $ac_file contains a reference to the variable \`datarootdir' which seems to be undefined. Please make sure it is defined" >&5 -$as_echo "$as_me: WARNING: $ac_file contains a reference to the variable \`datarootdir' +printf "%s\n" "$as_me: WARNING: $ac_file contains a reference to the variable \`datarootdir' which seems to be undefined. Please make sure it is defined" >&2;} rm -f "$ac_tmp/stdin" @@ -7426,20 +8200,20 @@ which seems to be undefined. Please make sure it is defined" >&2;} # if test x"$ac_file" != x-; then { - $as_echo "/* $configure_input */" \ + printf "%s\n" "/* $configure_input */" >&1 \ && eval '$AWK -f "$ac_tmp/defines.awk"' "$ac_file_inputs" } >"$ac_tmp/config.h" \ || as_fn_error $? "could not create $ac_file" "$LINENO" 5 if diff "$ac_file" "$ac_tmp/config.h" >/dev/null 2>&1; then - { $as_echo "$as_me:${as_lineno-$LINENO}: $ac_file is unchanged" >&5 -$as_echo "$as_me: $ac_file is unchanged" >&6;} + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: $ac_file is unchanged" >&5 +printf "%s\n" "$as_me: $ac_file is unchanged" >&6;} else rm -f "$ac_file" mv "$ac_tmp/config.h" "$ac_file" \ || as_fn_error $? "could not create $ac_file" "$LINENO" 5 fi else - $as_echo "/* $configure_input */" \ + printf "%s\n" "/* $configure_input */" >&1 \ && eval '$AWK -f "$ac_tmp/defines.awk"' "$ac_file_inputs" \ || as_fn_error $? "could not create -" "$LINENO" 5 fi @@ -7459,7 +8233,7 @@ $as_expr X"$_am_arg" : 'X\(.*[^/]\)//*[^/][^/]*/*$' \| \ X"$_am_arg" : 'X\(//\)[^/]' \| \ X"$_am_arg" : 'X\(//\)$' \| \ X"$_am_arg" : 'X\(/\)' \| . 
2>/dev/null || -$as_echo X"$_am_arg" | +printf "%s\n" X"$_am_arg" | sed '/^X\(.*[^/]\)\/\/*[^/][^/]*\/*$/{ s//\1/ q @@ -7479,8 +8253,8 @@ $as_echo X"$_am_arg" | s/.*/./; q'`/stamp-h$_am_stamp_count ;; - :C) { $as_echo "$as_me:${as_lineno-$LINENO}: executing $ac_file commands" >&5 -$as_echo "$as_me: executing $ac_file commands" >&6;} + :C) { printf "%s\n" "$as_me:${as_lineno-$LINENO}: executing $ac_file commands" >&5 +printf "%s\n" "$as_me: executing $ac_file commands" >&6;} ;; esac @@ -7490,29 +8264,35 @@ $as_echo "$as_me: executing $ac_file commands" >&6;} # Older Autoconf quotes --file arguments for eval, but not when files # are listed without --file. Let's play safe and only enable the eval # if we detect the quoting. - case $CONFIG_FILES in - *\'*) eval set x "$CONFIG_FILES" ;; - *) set x $CONFIG_FILES ;; - esac + # TODO: see whether this extra hack can be removed once we start + # requiring Autoconf 2.70 or later. + case $CONFIG_FILES in #( + *\'*) : + eval set x "$CONFIG_FILES" ;; #( + *) : + set x $CONFIG_FILES ;; #( + *) : + ;; +esac shift - for mf + # Used to flag and report bootstrapping failures. + am_rc=0 + for am_mf do # Strip MF so we end up with the name of the file. - mf=`echo "$mf" | sed -e 's/:.*$//'` - # Check whether this is an Automake generated Makefile or not. - # We used to match only the files named 'Makefile.in', but - # some people rename them; so instead we look at the file content. - # Grep'ing the first line is not enough: some people post-process - # each Makefile.in and add a new line on top of each file to say so. - # Grep'ing the whole file is not good either: AIX grep has a line + am_mf=`printf "%s\n" "$am_mf" | sed -e 's/:.*$//'` + # Check whether this is an Automake generated Makefile which includes + # dependency-tracking related rules and includes. + # Grep'ing the whole file directly is not great: AIX grep has a line # limit of 2048, but all sed's we know have understand at least 4000. - if sed -n 's,^#.*generated by automake.*,X,p' "$mf" | grep X >/dev/null 2>&1; then - dirpart=`$as_dirname -- "$mf" || -$as_expr X"$mf" : 'X\(.*[^/]\)//*[^/][^/]*/*$' \| \ - X"$mf" : 'X\(//\)[^/]' \| \ - X"$mf" : 'X\(//\)$' \| \ - X"$mf" : 'X\(/\)' \| . 2>/dev/null || -$as_echo X"$mf" | + sed -n 's,^am--depfiles:.*,X,p' "$am_mf" | grep X >/dev/null 2>&1 \ + || continue + am_dirpart=`$as_dirname -- "$am_mf" || +$as_expr X"$am_mf" : 'X\(.*[^/]\)//*[^/][^/]*/*$' \| \ + X"$am_mf" : 'X\(//\)[^/]' \| \ + X"$am_mf" : 'X\(//\)$' \| \ + X"$am_mf" : 'X\(/\)' \| . 2>/dev/null || +printf "%s\n" X"$am_mf" | sed '/^X\(.*[^/]\)\/\/*[^/][^/]*\/*$/{ s//\1/ q @@ -7530,53 +8310,50 @@ $as_echo X"$mf" | q } s/.*/./; q'` - else - continue - fi - # Extract the definition of DEPDIR, am__include, and am__quote - # from the Makefile without running 'make'. - DEPDIR=`sed -n 's/^DEPDIR = //p' < "$mf"` - test -z "$DEPDIR" && continue - am__include=`sed -n 's/^am__include = //p' < "$mf"` - test -z "$am__include" && continue - am__quote=`sed -n 's/^am__quote = //p' < "$mf"` - # Find all dependency output files, they are included files with - # $(DEPDIR) in their names. We invoke sed twice because it is the - # simplest approach to changing $(DEPDIR) to its actual value in the - # expansion. - for file in `sed -n " - s/^$am__include $am__quote\(.*(DEPDIR).*\)$am__quote"'$/\1/p' <"$mf" | \ - sed -e 's/\$(DEPDIR)/'"$DEPDIR"'/g'`; do - # Make sure the directory exists. 
- test -f "$dirpart/$file" && continue - fdir=`$as_dirname -- "$file" || -$as_expr X"$file" : 'X\(.*[^/]\)//*[^/][^/]*/*$' \| \ - X"$file" : 'X\(//\)[^/]' \| \ - X"$file" : 'X\(//\)$' \| \ - X"$file" : 'X\(/\)' \| . 2>/dev/null || -$as_echo X"$file" | - sed '/^X\(.*[^/]\)\/\/*[^/][^/]*\/*$/{ - s//\1/ - q - } - /^X\(\/\/\)[^/].*/{ + am_filepart=`$as_basename -- "$am_mf" || +$as_expr X/"$am_mf" : '.*/\([^/][^/]*\)/*$' \| \ + X"$am_mf" : 'X\(//\)$' \| \ + X"$am_mf" : 'X\(/\)' \| . 2>/dev/null || +printf "%s\n" X/"$am_mf" | + sed '/^.*\/\([^/][^/]*\)\/*$/{ s//\1/ q } - /^X\(\/\/\)$/{ + /^X\/\(\/\/\)$/{ s//\1/ q } - /^X\(\/\).*/{ + /^X\/\(\/\).*/{ s//\1/ q } s/.*/./; q'` - as_dir=$dirpart/$fdir; as_fn_mkdir_p - # echo "creating $dirpart/$file" - echo '# dummy' > "$dirpart/$file" - done + { echo "$as_me:$LINENO: cd "$am_dirpart" \ + && sed -e '/# am--include-marker/d' "$am_filepart" \ + | $MAKE -f - am--depfiles" >&5 + (cd "$am_dirpart" \ + && sed -e '/# am--include-marker/d' "$am_filepart" \ + | $MAKE -f - am--depfiles) >&5 2>&5 + ac_status=$? + echo "$as_me:$LINENO: \$? = $ac_status" >&5 + (exit $ac_status); } || am_rc=$? done + if test $am_rc -ne 0; then + { { printf "%s\n" "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5 +printf "%s\n" "$as_me: error: in \`$ac_pwd':" >&2;} +as_fn_error $? "Something went wrong bootstrapping makefile fragments + for automatic dependency tracking. If GNU make was not used, consider + re-running the configure script with MAKE=\"gmake\" (or whatever is + necessary). You can also try re-running configure with the + '--disable-dependency-tracking' option to at least be able to build + the package (albeit without support for automatic dependency tracking). +See \`config.log' for more details" "$LINENO" 5; } + fi + { am_dirpart=; unset am_dirpart;} + { am_filepart=; unset am_filepart;} + { am_mf=; unset am_mf;} + { am_rc=; unset am_rc;} + rm -f conftest-deps.mk } ;; @@ -7613,7 +8390,8 @@ if test "$no_create" != yes; then $ac_cs_success || as_fn_exit 1 fi if test -n "$ac_unrecognized_opts" && test "$enable_option_checking" != no; then - { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: unrecognized options: $ac_unrecognized_opts" >&5 -$as_echo "$as_me: WARNING: unrecognized options: $ac_unrecognized_opts" >&2;} + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: WARNING: unrecognized options: $ac_unrecognized_opts" >&5 +printf "%s\n" "$as_me: WARNING: unrecognized options: $ac_unrecognized_opts" >&2;} fi + diff --git a/configure.ac b/configure.ac index 0b9e8732..39f25b13 100644 --- a/configure.ac +++ b/configure.ac @@ -1,4 +1,4 @@ -AC_INIT([cpuminer-kudaraidee], [1.1.0]) +AC_INIT([cpuminer-opt], [3.19.6]) AC_PREREQ([2.59c]) AC_CANONICAL_SYSTEM diff --git a/cpu-miner.c b/cpu-miner.c index 87d388d4..d78f3044 100644 --- a/cpu-miner.c +++ b/cpu-miner.c @@ -3,7 +3,7 @@ * Copyright 2012-2014 pooler * Copyright 2014 Lucas Jones * Copyright 2014-2016 Tanguy Pruvot - * Copyright 2016-2020 Jay D Dee + * Copyright 2016-2021 Jay D Dee * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the Free @@ -38,6 +38,7 @@ #include #include #include "sysinfos.c" +#include "algo/sha/sha256d.h" #ifdef WIN32 #include @@ -91,10 +92,10 @@ bool opt_extranonce = true; bool want_longpoll = false; bool have_longpoll = false; bool have_gbt = true; -bool opt_minotaurx = false; // Flag for MinX on GBT bool allow_getwork = true; bool want_stratum = true; // pretty useless bool have_stratum = false; +bool 
stratum_down = true; bool allow_mininginfo = true; bool use_syslog = false; bool use_colors = true; @@ -113,28 +114,25 @@ char* opt_param_key = NULL; int opt_param_n = 0; int opt_param_r = 0; int opt_n_threads = 0; -bool opt_reset_on_stale = false; bool opt_sapling = false; - -// Windows doesn't support 128 bit affinity mask. -// Need compile time and run time test. -#if defined(__linux) && defined(GCC_INT128) -#define AFFINITY_USES_UINT128 1 -uint128_t opt_affinity = -1; -static bool affinity_uses_uint128 = true; -#else -uint64_t opt_affinity = -1; -static bool affinity_uses_uint128 = false; -#endif - -int opt_priority = 0; +static uint64_t opt_affinity = 0xFFFFFFFFFFFFFFFFULL; // default, use all cores +int opt_priority = 0; // deprecated int num_cpus = 1; -int num_cpugroups = 1; -char *rpc_url = NULL;; +int num_cpugroups = 1; // For Windows +#define max_cpus 256 // max for affinity +char *rpc_url = NULL; char *rpc_userpass = NULL; char *rpc_user, *rpc_pass; char *short_url = NULL; char *coinbase_address; +char *opt_data_file = NULL; +bool opt_verify = false; +static bool opt_stratum_keepalive = false; +static struct timeval stratum_keepalive_timer; +// Stratum typically times out in 5 minutes or 300 seconds +#define stratum_keepalive_timeout 180 // 3 minutes +static struct timeval stratum_reset_time; + // pk_buffer_size is used as a version selector by b58 code, therefore // it must be set correctly to work. @@ -164,8 +162,11 @@ uint32_t accepted_share_count = 0; uint32_t rejected_share_count = 0; uint32_t stale_share_count = 0; uint32_t solved_block_count = 0; +uint32_t stratum_errors = 0; double *thr_hashrates; double global_hashrate = 0.; +double total_hashes = 0.; +struct timeval total_hashes_time = {0,0}; double stratum_diff = 0.; double net_diff = 0.; double net_hashrate = 0.; @@ -192,7 +193,6 @@ int default_api_listen = 4048; static struct timeval session_start; static struct timeval five_min_start; static uint64_t session_first_block = 0; -static double latency_sum = 0.; static uint64_t submit_sum = 0; static uint64_t accept_sum = 0; static uint64_t stale_sum = 0; @@ -205,6 +205,7 @@ static double lowest_share = 9e99; // lowest accepted share diff static double last_targetdiff = 0.; #if !(defined(__WINDOWS__) || defined(_WIN64) || defined(_WIN32)) static uint32_t hi_temp = 0; +static uint32_t prev_temp = 0; #endif @@ -222,18 +223,25 @@ char* lp_id; static void workio_cmd_free(struct workio_cmd *wc); -static void format_affinity_map( char *map_str, uint64_t map ) +// array mapping thread to cpu +static uint8_t thread_affinity_map[ max_cpus ]; + +// display affinity mask graphically +static void format_affinity_mask( char *mask_str, uint64_t mask ) { +#if defined(WINDOWS_CPU_GROUPS_ENABLED) + int n = num_cpus / num_cpugroups; +#else int n = num_cpus < 64 ? num_cpus : 64; +#endif int i; - for ( i = 0; i < n; i++ ) { - if ( map & 1 ) map_str[i] = '!'; - else map_str[i] = '.'; - map >>= 1; + if ( mask & 1 ) mask_str[i] = '!'; + else mask_str[i] = '.'; + mask >>= 1; } - memset( &map_str[i], 0, 64 - i ); + memset( &mask_str[i], 0, 64 - i ); } #ifdef __linux /* Linux specific policy and affinity management */ @@ -255,93 +263,70 @@ static inline void drop_policy(void) #define pthread_setaffinity_np(tid,sz,s) {} /* only do process affinity */ #endif -// Linux affinity can use int128. 
-#if AFFINITY_USES_UINT128 -static void affine_to_cpu_mask( int id, uint128_t mask ) -#else -static void affine_to_cpu_mask( int id, uint64_t mask ) -#endif +static void affine_to_cpu( struct thr_info *thr ) { + int thread = thr->id; cpu_set_t set; CPU_ZERO( &set ); - uint8_t ncpus = (num_cpus > 256) ? 256 : num_cpus; - - for ( uint8_t i = 0; i < ncpus; i++ ) - { - // cpu mask -#if AFFINITY_USES_UINT128 - if( ( mask & ( (uint128_t)1 << i ) ) ) CPU_SET( i, &set ); -#else - if( (ncpus > 64) || ( mask & (1 << i) ) ) CPU_SET( i, &set ); -#endif - } - if ( id == -1 ) - { - // process affinity - sched_setaffinity(0, sizeof(&set), &set); - } - else - { - // thread only - pthread_setaffinity_np(thr_info[id].pth, sizeof(&set), &set); - } + CPU_SET( thread_affinity_map[ thread ], &set ); + if ( opt_debug ) + applog( LOG_INFO, "Binding thread %d to cpu %d", + thread, thread_affinity_map[ thread ] ); + pthread_setaffinity_np( thr->pth, sizeof(set), &set ); } #elif defined(WIN32) /* Windows */ + static inline void drop_policy(void) { } // Windows CPU groups to manage more than 64 CPUs. -static void affine_to_cpu_mask( int id, uint64_t mask ) +// mask arg is ignored +static void affine_to_cpu( struct thr_info *thr ) { - bool success; + int thread = thr->id; unsigned long last_error; -// BOOL success; -// DWORD last_error; + bool ok; - if ( id == -1 ) - success = SetProcessAffinityMask( GetCurrentProcess(), mask ); +#if defined(WINDOWS_CPU_GROUPS_ENABLED) + unsigned long group_size = GetActiveProcessorCount( 0 ); + unsigned long group = thread / group_size; + unsigned long cpu = thread_affinity_map[ thread % group_size ]; -// Are Windows CPU Groups supported? -#if _WIN32_WINNT==0x0601 - else if ( num_cpugroups == 1 ) - success = SetThreadAffinityMask( GetCurrentThread(), mask ); - else - { - // Find the correct cpu group - int cpu = id % num_cpus; - int group; - for( group = 0; group < num_cpugroups; group++ ) - { - int cpus = GetActiveProcessorCount( group ); - if ( cpu < cpus ) break; - cpu -= cpus; - } + GROUP_AFFINITY affinity; + affinity.Group = group; + affinity.Mask = 1ULL << cpu; - if (opt_debug) - applog(LOG_DEBUG, "Binding thread %d to cpu %d on cpu group %d (mask %x)", - id, cpu, group, (1ULL << cpu)); + if ( opt_debug ) + applog( LOG_INFO, "Binding thread %d to cpu %d in cpu group %d", + thread, cpu, group ); + + ok = SetThreadGroupAffinity( GetCurrentThread(), &affinity, NULL ); - GROUP_AFFINITY affinity; - affinity.Group = group; - affinity.Mask = 1ULL << cpu; - success = SetThreadGroupAffinity( GetCurrentThread(), &affinity, NULL ); - } #else - else - success = SetThreadAffinityMask( GetCurrentThread(), mask ); + + unsigned long cpu = thread_affinity_map[ thread ]; + uint64_t mask = 1ULL << cpu; + + if ( opt_debug ) + applog( LOG_INFO, "Binding thread %d to cpu %d", thread, cpu ); + + ok = SetThreadAffinityMask( GetCurrentThread(), mask ); + #endif - if (!success) + if ( !ok ) { - last_error = GetLastError(); - applog(LOG_WARNING, "affine_to_cpu_mask for %u returned %x", - id, last_error); + last_error = GetLastError(); + applog( LOG_WARNING, "affine_to_cpu_mask for %u returned 0x%x", + thread, last_error ); } -} +} #else + static inline void drop_policy(void) { } -static void affine_to_cpu_mask(int id, unsigned long mask) { } +static void affine_to_cpu( struct thr_info *thr ) { } + #endif // not very useful, just index the arrray directly. 
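
For context on the affinity rework above: the old 64/128-bit mask logic is replaced by a simple per-thread map (thread_affinity_map[]) where each mining thread binds itself to exactly one CPU. The following standalone sketch (not part of the patch) illustrates the same Linux mechanism with pthread_setaffinity_np(); THREADS, affinity_map and worker are illustrative names only, not identifiers from cpu-miner.c.

/* Minimal sketch of per-thread CPU pinning via a thread->cpu map,
   mirroring the idea behind the new affine_to_cpu(). Build with:
   gcc -pthread sketch.c */
#define _GNU_SOURCE
#include <pthread.h>
#include <sched.h>
#include <stdio.h>

#define THREADS 4
/* thread i runs on cpu affinity_map[i]; values are illustrative */
static unsigned char affinity_map[THREADS] = { 0, 1, 2, 3 };

static void *worker( void *arg )
{
    int id = *(int*)arg;
    cpu_set_t set;
    CPU_ZERO( &set );
    CPU_SET( affinity_map[id], &set );   /* pin this thread to one CPU */
    pthread_setaffinity_np( pthread_self(), sizeof(set), &set );
    printf( "thread %d bound to cpu %d\n", id, affinity_map[id] );
    return NULL;
}

int main(void)
{
    pthread_t th[THREADS];
    int ids[THREADS];
    for ( int i = 0; i < THREADS; i++ )
    {
        ids[i] = i;
        pthread_create( &th[i], NULL, worker, &ids[i] );
    }
    for ( int i = 0; i < THREADS; i++ )
        pthread_join( th[i], NULL );
    return 0;
}

On Windows the patch does the equivalent with SetThreadAffinityMask(), or SetThreadGroupAffinity() when processor groups are enabled, using the same thread_affinity_map[] to pick the CPU.
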
@@ -446,8 +431,10 @@ static bool work_decode( const json_t *val, struct work *work ) if ( !allow_mininginfo ) net_diff = algo_gate.calc_network_diff( work ); + else + net_diff = hash_to_diff( work->target ); - work->targetdiff = hash_to_diff( work->target ); + work->targetdiff = net_diff; stratum_diff = last_targetdiff = work->targetdiff; work->sharediff = 0; algo_gate.decode_extra_data( work, &net_blocks ); @@ -481,18 +468,27 @@ static bool get_mininginfo( CURL *curl, struct work *work ) // "networkhashps": 56475980 if ( res ) { + // net_diff is a global that is set from the work hash target by + // both getwork and GBT. Don't overwrite it, define a local to override + // the global. + double net_diff = 0.; json_t *key = json_object_get( res, "difficulty" ); if ( key ) { if ( json_is_object( key ) ) key = json_object_get( key, "proof-of-work" ); if ( json_is_real( key ) ) - net_diff = work->targetdiff = json_real_value( key ); + net_diff = json_real_value( key ); } key = json_object_get( res, "networkhashps" ); - if ( key && json_is_integer( key ) ) - net_hashrate = (double) json_integer_value( key ); + if ( key ) + { + if ( json_is_integer( key ) ) + net_hashrate = (double) json_integer_value( key ); + else if ( json_is_real( key ) ) + net_hashrate = (double) json_real_value( key ); + } key = json_object_get( res, "blocks" ); if ( key && json_is_integer( key ) ) @@ -507,26 +503,7 @@ static bool get_mininginfo( CURL *curl, struct work *work ) // complete missing data from getwork work->height = (uint32_t) net_blocks + 1; if ( work->height > g_work.height ) - { restart_threads(); - -/* redundant with new block log - if ( !opt_quiet ) - { - char netinfo[64] = { 0 }; - char srate[32] = { 0 }; - sprintf( netinfo, "diff %.2f", net_diff ); - if ( net_hashrate ) - { - format_hashrate( net_hashrate, srate ); - strcat( netinfo, ", net " ); - strcat( netinfo, srate ); - } - applog( LOG_BLUE, "%s block %d, %s", - algo_names[opt_algo], work->height, netinfo ); - } -*/ - } } // res } json_decref( val ); @@ -568,7 +545,11 @@ static bool gbt_work_decode( const json_t *val, struct work *work ) if ( !s ) continue; if ( !strcmp( s, "segwit" ) || !strcmp( s, "!segwit" ) ) + { segwit = true; + if ( opt_debug ) + applog( LOG_INFO, "GBT: SegWit is enabled" ); + } } } // Segwit END @@ -917,16 +898,17 @@ static bool gbt_work_decode( const json_t *val, struct work *work ) } for ( i = 0; i < ARRAY_SIZE( work->target ); i++ ) work->target[7 - i] = be32dec( target + i ); - + net_diff = work->targetdiff = hash_to_diff( work->target ); + tmp = json_object_get( val, "workid" ); if ( tmp ) { - if ( !json_is_string( tmp ) ) - { - applog( LOG_ERR, "JSON invalid workid" ); - goto out; - } - work->workid = strdup( json_string_value( tmp ) ); + if ( !json_is_string( tmp ) ) + { + applog( LOG_ERR, "JSON invalid workid" ); + goto out; + } + work->workid = strdup( json_string_value( tmp ) ); } rc = true; @@ -967,25 +949,25 @@ void scale_hash_for_display ( double* hashrate, char* prefix ) else { *prefix = 'Y'; *hashrate /= 1e24; } } -static inline void sprintf_et( char *str, int seconds ) +static inline void sprintf_et( char *str, long unsigned int seconds ) { - // sprintf doesn't like uint64_t, Linux thinks it's long, Windows long long. 
- unsigned int min = seconds / 60; - unsigned int sec = seconds % 60; - unsigned int hrs = min / 60; + long unsigned int min = seconds / 60; + long unsigned int sec = seconds % 60; + long unsigned int hrs = min / 60; + if ( unlikely( hrs ) ) { - unsigned int years = hrs / (24*365); - unsigned int days = hrs / 24; - if ( years ) - sprintf( str, "%uy%ud", years, years % 365 ); - else if ( days ) //0d00h - sprintf( str, "%ud%02uh", days, hrs % 24 ); + long unsigned int days = hrs / 24; + long unsigned int years = days / 365; + if ( years ) // 0y000d + sprintf( str, "%luy%lud", years, years % 365 ); + else if ( days ) // 0d00h + sprintf( str, "%lud%02luh", days, hrs % 24 ); else // 0h00m - sprintf( str, "%uh%02um", hrs, min % 60 ); + sprintf( str, "%luh%02lum", hrs, min % 60 ); } else // 0m00s - sprintf( str, "%um%02us", min, sec ); + sprintf( str, "%lum%02lus", min, sec ); } const long double exp32 = EXP32; // 2**32 @@ -1003,6 +985,7 @@ struct share_stats_t double share_diff; double stratum_diff; double target_diff; + uint32_t height; char job_id[32]; }; @@ -1013,48 +996,94 @@ static struct timeval last_submit_time = {0}; static inline int stats_ptr_incr( int p ) { - return ++p < s_stats_size ? p : 0; + return ++p % s_stats_size; } void report_summary_log( bool force ) { struct timeval now, et, uptime, start_time; - pthread_mutex_lock( &stats_lock ); - gettimeofday( &now, NULL ); timeval_subtract( &et, &now, &five_min_start ); - if ( !( force && ( submit_sum || ( et.tv_sec > 5 ) ) ) - && ( et.tv_sec < 300 ) ) +#if !(defined(__WINDOWS__) || defined(_WIN64) || defined(_WIN32)) + + // Display CPU temperature and clock rate. + int curr_temp = cpu_temp(0); + static struct timeval cpu_temp_time = {0}; + struct timeval diff; + + if ( !opt_quiet || ( curr_temp >= 80 ) ) { - pthread_mutex_unlock( &stats_lock ); - return; + int wait_time = curr_temp >= 90 ? 5 : curr_temp >= 80 ? 30 : + curr_temp >= 70 ? 
60 : 120; + timeval_subtract( &diff, &now, &cpu_temp_time ); + if ( ( diff.tv_sec > wait_time ) + || ( ( curr_temp > prev_temp ) && ( curr_temp >= 75 ) ) ) + { + char tempstr[32]; + float lo_freq = 0., hi_freq = 0.; + + memcpy( &cpu_temp_time, &now, sizeof(cpu_temp_time) ); + linux_cpu_hilo_freq( &lo_freq, &hi_freq ); + if ( use_colors && ( curr_temp >= 70 ) ) + { + if ( curr_temp >= 80 ) + sprintf( tempstr, "%s%d C%s", CL_RED, curr_temp, CL_WHT ); + else + sprintf( tempstr, "%s%d C%s", CL_YLW, curr_temp, CL_WHT ); + } + else + sprintf( tempstr, "%d C", curr_temp ); + + applog( LOG_NOTICE,"CPU temp: curr %s max %d, Freq: %.3f/%.3f GHz", + tempstr, hi_temp, lo_freq / 1e6, hi_freq / 1e6 ); + if ( curr_temp > hi_temp ) hi_temp = curr_temp; + if ( ( opt_max_temp > 0.0 ) && ( curr_temp > opt_max_temp ) ) + restart_threads(); + prev_temp = curr_temp; + } + } + +#endif + + if ( !( force && ( submit_sum || ( et.tv_sec > 5 ) ) ) ) + { + if ( et.tv_sec < 300 ) + return; + if ( ( s_get_ptr != s_put_ptr ) && ( et.tv_sec < 360 ) ) + return; } +// if ( !( force && ( submit_sum || ( et.tv_sec > 5 ) ) ) +// && ( et.tv_sec < 300 ) ) +// return; + // collect and reset periodic counters + pthread_mutex_lock( &stats_lock ); + uint64_t submits = submit_sum; submit_sum = 0; uint64_t accepts = accept_sum; accept_sum = 0; uint64_t rejects = reject_sum; reject_sum = 0; uint64_t stales = stale_sum; stale_sum = 0; uint64_t solved = solved_sum; solved_sum = 0; - memcpy( &start_time, &five_min_start, sizeof start_time ); memcpy( &five_min_start, &now, sizeof now ); pthread_mutex_unlock( &stats_lock ); timeval_subtract( &et, &now, &start_time ); - timeval_subtract( &uptime, &now, &session_start ); + timeval_subtract( &uptime, &total_hashes_time, &session_start ); double share_time = (double)et.tv_sec + (double)et.tv_usec / 1e6; - double ghrate = global_hashrate; - double shrate = share_time == 0. ? 0. : exp32 * last_targetdiff - * (double)(accepts) / share_time; - double sess_hrate = uptime.tv_sec == 0. ? 0. : exp32 * norm_diff_sum - / (double)uptime.tv_sec; - double submit_rate = share_time == 0. ? 0. : (double)submits*60. - / share_time; + double ghrate = safe_div( total_hashes, (double)uptime.tv_sec, 0. ); + double target_diff = exp32 * last_targetdiff; + double shrate = safe_div( target_diff * (double)(accepts), + share_time, 0. ); +// global_hashrate = ghrate; + double sess_hrate = safe_div( exp32 * norm_diff_sum, + (double)uptime.tv_sec, 0. ); + double submit_rate = safe_div( (double)submits * 60., share_time, 0. ); char shr_units[4] = {0}; char ghr_units[4] = {0}; char sess_hr_units[4] = {0}; @@ -1071,51 +1100,77 @@ void report_summary_log( bool force ) applog( LOG_BLUE, "%s: %s", algo_names[ opt_algo ], short_url ); applog2( LOG_NOTICE, "Periodic Report %s %s", et_str, upt_str ); applog2( LOG_INFO, "Share rate %.2f/min %.2f/min", - submit_rate, (double)submitted_share_count*60. / - ( (double)uptime.tv_sec + (double)uptime.tv_usec / 1e6 ) ); + submit_rate, safe_div( (double)submitted_share_count*60., + ( (double)uptime.tv_sec + (double)uptime.tv_usec / 1e6 ), 0. ) ); applog2( LOG_INFO, "Hash rate %7.2f%sh/s %7.2f%sh/s (%.2f%sh/s)", - shrate, shr_units, sess_hrate, sess_hr_units, - ghrate, ghr_units ); + shrate, shr_units, sess_hrate, sess_hr_units, ghrate, ghr_units ); if ( accepted_share_count < submitted_share_count ) { - double lost_ghrate = uptime.tv_sec == 0 ? 0. 
- : exp32 * last_targetdiff - * (double)(submitted_share_count - accepted_share_count ) - / (double)uptime.tv_sec; - double lost_shrate = share_time == 0. ? 0. - : exp32 * last_targetdiff * (double)(submits - accepts ) - / share_time; + double lost_ghrate = safe_div( target_diff + * (double)(submitted_share_count - accepted_share_count ), + (double)uptime.tv_sec, 0. ); + double lost_shrate = safe_div( target_diff * (double)(submits - accepts ), share_time, 0. ); char lshr_units[4] = {0}; char lghr_units[4] = {0}; scale_hash_for_display( &lost_shrate, lshr_units ); scale_hash_for_display( &lost_ghrate, lghr_units ); - applog2( LOG_INFO, "Lost hash rate %7.2f%sh/s %7.2f%sh/s", - lost_shrate, lshr_units, lost_ghrate, lghr_units ); + applog2( LOG_INFO, "Lost hash rate %7.2f%sh/s %7.2f%sh/s", + lost_shrate, lshr_units, lost_ghrate, lghr_units ); } - applog2( LOG_INFO,"Submitted %6d %6d", - submits, submitted_share_count ); - applog2( LOG_INFO,"Accepted %6d %6d", - accepts, accepted_share_count ); + applog2( LOG_INFO,"Submitted %7d %7d", + submits, submitted_share_count ); + applog2( LOG_INFO, "Accepted %7d %7d %5.1f%%", + accepts, accepted_share_count, + 100. * safe_div( (double)accepted_share_count, + (double)submitted_share_count, 0. ) ); if ( stale_share_count ) - applog2( LOG_INFO,"Stale %6d %6d", - stales, stale_share_count ); + { + int prio = stales ? LOG_MINR : LOG_INFO; + applog2( prio, "Stale %7d %7d %5.1f%%", + stales, stale_share_count, + 100. * safe_div( (double)stale_share_count, + (double)submitted_share_count, 0. ) ); + } if ( rejected_share_count ) - applog2( LOG_INFO,"Rejected %6d %6d", - rejects, rejected_share_count ); + { + int prio = rejects ? LOG_ERR : LOG_INFO; + applog2( prio, "Rejected %7d %7d %5.1f%%", + rejects, rejected_share_count, + 100. * safe_div( (double)rejected_share_count, + (double)submitted_share_count, 0. ) ); + } if ( solved_block_count ) - applog2( LOG_INFO,"Blocks Solved %6d %6d", - solved, solved_block_count ); + { + int prio = solved ? LOG_PINK : LOG_INFO; + applog2( prio, "Blocks Solved %7d %7d", + solved, solved_block_count ); + } + if ( stratum_errors ) + applog2( LOG_INFO, "Stratum resets %7d", stratum_errors ); + applog2( LOG_INFO, "Hi/Lo Share Diff %.5g / %.5g", - highest_share, lowest_share ); -} + highest_share, lowest_share ); -bool lowdiff_debug = false; + int mismatch = submitted_share_count + - ( accepted_share_count + stale_share_count + rejected_share_count ); + + if ( mismatch ) + { + if ( stratum_errors ) + applog2( LOG_MINR, "Count mismatch: %d, stats may be inaccurate", + mismatch ); + else if ( !opt_quiet ) + applog2( LOG_INFO, CL_LBL + "Count mismatch, submitted share may still be pending" CL_N ); + } +} -static int share_result( int result, struct work *work, const char *reason ) +static int share_result( int result, struct work *work, + const char *reason ) { - double share_time = 0.; //, share_ratio = 0.; + double share_time = 0.; double hashrate = 0.; int latency = 0; struct share_stats_t my_stats = {0}; @@ -1126,7 +1181,8 @@ static int share_result( int result, struct work *work, const char *reason ) char bres[48]; bool solved = false; bool stale = false; - char *acol = NULL, *bcol = NULL, *scol = NULL, *rcol = NULL; + char *acol, *bcol, *scol, *rcol; + acol = bcol = scol = rcol = "\0"; pthread_mutex_lock( &stats_lock ); @@ -1156,53 +1212,50 @@ static int share_result( int result, struct work *work, const char *reason ) sizeof last_submit_time ); } -/* - share_ratio = my_stats.net_diff == 0. ? 0. 
: my_stats.share_diff / - my_stats.net_diff; -*/ - // check result if ( likely( result ) ) { accepted_share_count++; - if ( ( my_stats.share_diff > 0. ) + if ( ( my_stats.share_diff > 0. ) && ( my_stats.share_diff < lowest_share ) ) lowest_share = my_stats.share_diff; if ( my_stats.share_diff > highest_share ) highest_share = my_stats.share_diff; - sprintf( sres, "Stale %d", stale_share_count ); - sprintf( rres, "Rejected %d", rejected_share_count ); + sprintf( sres, "S%d", stale_share_count ); + sprintf( rres, "R%d", rejected_share_count ); if unlikely( ( my_stats.net_diff > 0. ) - && ( my_stats.share_diff >= net_diff ) ) + && ( my_stats.share_diff >= my_stats.net_diff ) ) { solved = true; solved_block_count++; sprintf( bres, "BLOCK SOLVED %d", solved_block_count ); - sprintf( ares, "Accepted %d", accepted_share_count); + sprintf( ares, "A%d", accepted_share_count ); } else { - sprintf( bres, "BLOCK SOLVED %d", solved_block_count ); - sprintf( ares, "Accepted %d ", accepted_share_count); + sprintf( bres, "B%d", solved_block_count ); + sprintf( ares, "Accepted %d", accepted_share_count ); } } else { - sprintf( ares, "Accepted %d", accepted_share_count ); - sprintf( bres, "BLOCK SOLVED %d", solved_block_count ); - stale = work ? work->data[ algo_gate.ntime_index ] - != g_work.data[ algo_gate.ntime_index ] : false; - if ( reason ) stale = stale || strstr( reason, "job" ); + sprintf( ares, "A%d", accepted_share_count ); + sprintf( bres, "B%d", solved_block_count ); + if ( reason ) + stale = strstr( reason, "job" ); + else if ( work ) + stale = work->data[ algo_gate.ntime_index ] + != g_work.data[ algo_gate.ntime_index ]; if ( stale ) { stale_share_count++; sprintf( sres, "Stale %d", stale_share_count ); - sprintf( rres, "Rejected %d", rejected_share_count ); + sprintf( rres, "R%d", rejected_share_count ); } else { rejected_share_count++; - sprintf( sres, "Stale %d", stale_share_count ); + sprintf( sres, "S%d", stale_share_count ); sprintf( rres, "Rejected %d" , rejected_share_count ); } } @@ -1226,32 +1279,30 @@ static int share_result( int result, struct work *work, const char *reason ) else reject_sum++; } submit_sum++; - latency_sum += latency; pthread_mutex_unlock( &stats_lock ); if ( use_colors ) { - bcol = acol = scol = rcol = CL_WHT; + bcol = acol = scol = rcol = CL_N; if ( likely( result ) ) { - acol = CL_WHT CL_GRN; - if ( unlikely( solved ) ) bcol = CL_WHT CL_MAG; + acol = CL_LGR; + if ( unlikely( solved ) ) bcol = CL_LMA; } - else if ( stale ) scol = CL_WHT CL_YL2; - else rcol = CL_WHT CL_RED; + else if ( stale ) scol = CL_YL2; + else rcol = CL_LRD; } - applog( LOG_NOTICE, "%s%s, %s%s, %s%s, %s%s, " CL_WHT "Diff %.5g, %.2f H/s, %.3f sec (%dms)", - acol, ares, scol, sres, rcol, rres, bcol, - bres, my_stats.share_diff, hashrate, share_time, latency ); + applog( LOG_INFO, "%d %s%s %s%s %s%s %s%s%s, %.3f sec (%dms)", + my_stats.share_count, acol, ares, scol, sres, rcol, rres, bcol, + bres, CL_N, share_time, latency ); if ( unlikely( opt_debug || !result || solved ) ) { if ( have_stratum ) applog2( LOG_INFO, "Diff %.5g, Block %d, Job %s", - my_stats.share_diff, stratum.block_height, - my_stats.job_id ); + my_stats.share_diff, my_stats.height, my_stats.job_id ); else applog2( LOG_INFO, "Diff %.5g, Block %d", my_stats.share_diff, work ? 
work->height : last_block_height ); @@ -1260,14 +1311,14 @@ static int share_result( int result, struct work *work, const char *reason ) if ( unlikely( !( opt_quiet || result || stale ) ) ) { uint32_t str[8]; + uint32_t *targ; - if ( reason ) - applog( LOG_WARNING, "Reject reason: %s", reason ); + if ( reason ) applog( LOG_MINR, "Reject reason: %s", reason ); - // display share hash and target for troubleshooting diff_to_hash( str, my_stats.share_diff ); - applog2( LOG_INFO, "Hash: %08x%08x%08x...", str[7], str[6], str[5] ); - uint32_t *targ; + applog2( LOG_INFO, "Hash: %08x%08x%08x%08x%08x%08x", str[7], str[6], + str[5], str[4], str[3],str[2], str[1], str[0] ); + if ( work ) targ = work->target; else @@ -1275,7 +1326,8 @@ static int share_result( int result, struct work *work, const char *reason ) diff_to_hash( str, my_stats.target_diff ); targ = &str[0]; } - applog2( LOG_INFO, "Target: %08x%08x%08x...", targ[7], targ[6], targ[5] ); + applog2( LOG_INFO, "Target: %08x%08x%08x%08x%08x%08x", targ[7], targ[6], + targ[5], targ[4], targ[3], targ[2], targ[1], targ[0] ); } return 1; } @@ -1498,12 +1550,6 @@ const char *gbt_lp_req = */ // Segwit END -// Parameters for MinX BlockTemplate BEGIN -#define MINX_PARAMS "\"powalgo\": \"minotaurx\"" -static const char *gbt_req_minx = "{\"method\": \"getblocktemplate\", \"params\": [{"MINX_PARAMS", \"capabilities\": "GBT_CAPABILITIES", \"rules\": "GBT_RULES"}], \"id\":0}\r\n"; -static const char *gbt_lp_req_minx = "{\"method\": \"getblocktemplate\", \"params\": [{"MINX_PARAMS", \"capabilities\": "GBT_CAPABILITIES", \"rules\": "GBT_RULES", \"longpollid\": \"%s\"}], \"id\":0}\r\n"; -// Parameters for MinX BlockTemplate END - static bool get_upstream_work( CURL *curl, struct work *work ) { json_t *val; @@ -1513,20 +1559,11 @@ static bool get_upstream_work( CURL *curl, struct work *work ) start: gettimeofday( &tv_start, NULL ); - // Parameters for MinX BlockTemplate BEGIN -/* + val = json_rpc_call( curl, rpc_url, rpc_userpass, have_gbt ? gbt_req : getwork_req, &err, have_gbt ? JSON_RPC_QUIET_404 : 0); -*/ - if(!opt_minotaurx) - { - val = json_rpc_call(curl, rpc_url, rpc_userpass, have_gbt ? gbt_req : getwork_req, &err, have_gbt ? JSON_RPC_QUIET_404 : 0); - } else - { - val = json_rpc_call(curl, rpc_url, rpc_userpass, have_gbt ? gbt_req_minx : getwork_req, &err, have_gbt ? JSON_RPC_QUIET_404 : 0); - } - // Parameters for MinX BlockTemplate BEGIN + gettimeofday( &tv_end, NULL ); if ( have_stratum ) @@ -1595,6 +1632,7 @@ static bool get_upstream_work( CURL *curl, struct work *work ) { double miner_hr = 0.; double net_hr = net_hashrate; + double nd = net_diff * exp32; char net_hr_units[4] = {0}; char miner_hr_units[4] = {0}; char net_ttf[32]; @@ -1609,11 +1647,11 @@ static bool get_upstream_work( CURL *curl, struct work *work ) pthread_mutex_unlock( &stats_lock ); if ( net_hr > 0. ) - sprintf_et( net_ttf, ( net_diff * exp32 ) / net_hr ); + sprintf_et( net_ttf, nd / net_hr ); else sprintf( net_ttf, "NA" ); if ( miner_hr > 0. 
) - sprintf_et( miner_ttf, ( net_diff * exp32 ) / miner_hr ); + sprintf_et( miner_ttf, nd / miner_hr ); else sprintf( miner_ttf, "NA" ); @@ -1828,6 +1866,7 @@ static void update_submit_stats( struct work *work, const void *hash ) share_stats[ s_put_ptr ].net_diff = net_diff; share_stats[ s_put_ptr ].stratum_diff = stratum_diff; share_stats[ s_put_ptr ].target_diff = work->targetdiff; + share_stats[ s_put_ptr ].height = work->height; if ( have_stratum ) strncpy( share_stats[ s_put_ptr ].job_id, work->job_id, 30 ); s_put_ptr = stats_ptr_incr( s_put_ptr ); @@ -1838,6 +1877,10 @@ static void update_submit_stats( struct work *work, const void *hash ) bool submit_solution( struct work *work, const void *hash, struct thr_info *thr ) { + // Job went stale during hashing of a valid share. + if ( !opt_quiet && work_restart[ thr->id ].restart ) + applog( LOG_INFO, CL_LBL "Share may be stale, submitting anyway..." CL_N ); + work->sharediff = hash_to_diff( hash ); if ( likely( submit_work( thr, work ) ) ) { @@ -1854,19 +1897,28 @@ bool submit_solution( struct work *work, const void *hash, if ( !opt_quiet ) { if ( have_stratum ) - applog( LOG_NOTICE, "Share %d Submitted Diff %.5g, Block %d, Job %s", + applog( LOG_INFO, "%d Submitted Diff %.5g, Block %d, Job %s", submitted_share_count, work->sharediff, work->height, work->job_id ); else - applog( LOG_NOTICE, "Share %d Submitted Diff %.5g, Block %d, Ntime %08x", + applog( LOG_INFO, "%d Submitted Diff %.5g, Block %d, Ntime %08x", submitted_share_count, work->sharediff, work->height, work->data[ algo_gate.ntime_index ] ); } - if ( unlikely( lowdiff_debug ) ) + if ( opt_debug ) { uint32_t* h = (uint32_t*)hash; uint32_t* t = (uint32_t*)work->target; + uint32_t* d = (uint32_t*)work->data; + + unsigned char *xnonce2str = abin2hex( work->xnonce2, + work->xnonce2_len ); + applog(LOG_INFO,"Thread %d, Nonce %08x, Xnonce2 %s", thr->id, + work->data[ algo_gate.nonce_index ], xnonce2str ); + free( xnonce2str ); + applog(LOG_INFO,"Data[0:19]: %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x", d[0],d[1],d[2],d[3],d[4],d[5],d[6],d[7],d[8],d[9] ); + applog(LOG_INFO," : %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x", d[10],d[11],d[12],d[13],d[14],d[15],d[16],d[17],d[18],d[19]); applog(LOG_INFO,"Hash[7:0]: %08x %08x %08x %08x %08x %08x %08x %08x", h[7],h[6],h[5],h[4],h[3],h[2],h[1],h[0]); applog(LOG_INFO,"Targ[7:0]: %08x %08x %08x %08x %08x %08x %08x %08x", @@ -1947,6 +1999,10 @@ void set_work_data_big_endian( struct work *work ) // calculate net diff from nbits. double std_calc_network_diff( struct work* work ) { + uint32_t nbits = work->data[ algo_gate.nbits_index ]; + uint32_t shift = nbits & 0xff; + uint32_t bits = bswap_32( nbits ) & 0x00ffffff; +/* // sample for diff 43.281 : 1c05ea29 // todo: endian reversed on longpoll could be zr5 specific... 
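The new lines above read the compact target ("nbits") straight out of the work data and turn it into a difficulty: divide the difficulty-1 mantissa 0x0000ffff by the 24-bit mantissa, then scale by 256 for each step the exponent sits below (or above) 29. A standalone sketch of the same decode, assuming the canonical big-endian compact encoding rather than the byte order used inside work->data; the helper name is illustrative only:

    #include <stdint.h>
    #include <stdio.h>

    // Decode a canonical compact target, e.g. 0x1c05ea29, to a difficulty.
    static double nbits_to_diff( uint32_t nbits )
    {
        uint32_t shift = nbits >> 24;          // exponent byte, 0x1c = 28
        uint32_t bits  = nbits & 0x00ffffff;   // 24-bit mantissa, 0x05ea29
        double d = (double)0x0000ffff / (double)bits;
        for ( int m = shift; m < 29; m++ ) d *= 256.0;
        for ( int m = 29; m < shift; m++ ) d /= 256.0;
        return d;
    }

    int main()
    {
        printf( "%.3f\n", nbits_to_diff( 0x1c05ea29 ) );  // prints about 43.2
        return 0;
    }
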
int nbits_index = algo_gate.nbits_index; @@ -1954,15 +2010,17 @@ double std_calc_network_diff( struct work* work ) : swab32( work->data[ nbits_index ] ); uint32_t bits = ( nbits & 0xffffff ); int16_t shift = ( swab32(nbits) & 0xff ); // 0x1c = 28 +*/ + int m; - double d = (double)0x0000ffff / (double)bits; + long double d = (long double)0x0000ffff / (long double)bits; for ( m = shift; m < 29; m++ ) d *= 256.0; for ( m = 29; m < shift; m++ ) d /= 256.0; if ( opt_debug_diff ) - applog(LOG_DEBUG, "net diff: %f -> shift %u, bits %08x", d, shift, bits); - return d; + applog(LOG_DEBUG, "net diff: %8f -> shift %u, bits %08x", (double)d, shift, bits); + return (double)d; } void std_get_new_work( struct work* work, struct work* g_work, int thr_id, @@ -2006,7 +2064,7 @@ static void stratum_gen_work( struct stratum_ctx *sctx, struct work *g_work ) pthread_rwlock_wrlock( &g_work_lock ); pthread_mutex_lock( &sctx->work_lock ); - new_job = sctx->new_job; + new_job = sctx->new_job; // otherwise just increment extranonce2 sctx->new_job = false; free( g_work->job_id ); @@ -2022,7 +2080,8 @@ static void stratum_gen_work( struct stratum_ctx *sctx, struct work *g_work ) / ( opt_target_factor * opt_diff_factor ); diff_to_hash( g_work->target, g_work->targetdiff ); - // Increment extranonce2 + // Pre increment extranonce2 in case of being called again before receiving + // a new job for ( int t = 0; t < sctx->xnonce2_size && !( ++sctx->job.xnonce2[t] ); t++ ); @@ -2043,20 +2102,20 @@ static void stratum_gen_work( struct stratum_ctx *sctx, struct work *g_work ) pthread_mutex_unlock( &stats_lock ); if ( stratum_diff != sctx->job.diff ) - applog( LOG_BLUE, "New Stratum Diff %g", - sctx->job.diff ); + applog( LOG_BLUE, "New Stratum Diff %g, Block %d, Job %s", + sctx->job.diff, sctx->block_height, g_work->job_id ); else if ( last_block_height != sctx->block_height ) - applog( LOG_BLUE, "New Block %d, Job %s", - sctx->block_height, g_work->job_id ); + applog( LOG_BLUE, "New Block %d, Net diff %.5g, Job %s", + sctx->block_height, net_diff, g_work->job_id ); else if ( g_work->job_id && new_job ) - applog( LOG_BLUE, "%s %s Block %d, Job %s, network diff %.5g", - short_url, algo_names[ opt_algo ], sctx->block_height, g_work->job_id, net_diff ); + applog( LOG_BLUE, "New Work: Block %d, Net diff %.5g, Job %s", + sctx->block_height, net_diff, g_work->job_id ); else if ( !opt_quiet ) { - unsigned char *xnonce2str = abin2hex( g_work->xnonce2, - g_work->xnonce2_len ); - applog( LOG_INFO, "Extranonce2 %s, Block %d, Net Diff %.5g", - xnonce2str, sctx->block_height, net_diff ); + unsigned char *xnonce2str = bebin2hex( g_work->xnonce2, + g_work->xnonce2_len ); + applog( LOG_INFO, "Extranonce2 0x%s, Block %d, Job %s", + xnonce2str, sctx->block_height, g_work->job_id ); free( xnonce2str ); } @@ -2081,11 +2140,12 @@ static void stratum_gen_work( struct stratum_ctx *sctx, struct work *g_work ) if ( likely( hr > 0. ) ) { + double nd = net_diff * exp32; char hr_units[4] = {0}; char block_ttf[32]; char share_ttf[32]; - sprintf_et( block_ttf, ( net_diff * exp32 ) / hr ); + sprintf_et( block_ttf, nd / hr ); sprintf_et( share_ttf, ( g_work->targetdiff * exp32 ) / hr ); scale_hash_for_display ( &hr, hr_units ); applog2( LOG_INFO, "TTF @ %.2f %sh/s: Block %s, Share %s", @@ -2099,11 +2159,10 @@ static void stratum_gen_work( struct stratum_ctx *sctx, struct work *g_work ) uint64_t net_ttf = ( last_block_height - session_first_block ) == 0 ? 
0 : et.tv_sec / ( last_block_height - session_first_block ); - if ( net_diff && net_ttf ) + if ( net_diff > 0. && net_ttf ) { - double net_hr = net_diff * exp32 / net_ttf; + double net_hr = nd / net_ttf; char net_hr_units[4] = {0}; - scale_hash_for_display ( &net_hr, net_hr_units ); applog2( LOG_INFO, "Net hash rate (est) %.2f %sh/s", net_hr, net_hr_units ); @@ -2112,6 +2171,17 @@ static void stratum_gen_work( struct stratum_ctx *sctx, struct work *g_work ) } // hr > 0 } // !quiet } // new diff/block + + if ( new_job && !( opt_quiet || stratum_errors ) ) + { + int mismatch = submitted_share_count - ( accepted_share_count + + stale_share_count + + rejected_share_count ); + if ( mismatch ) + applog( LOG_INFO, + CL_LBL "%d Submitted share pending, maybe stale" CL_N, + submitted_share_count ); + } } static void *miner_thread( void *userdata ) @@ -2138,11 +2208,11 @@ static void *miner_thread( void *userdata ) /* Set worker threads to nice 19 and then preferentially to SCHED_IDLE * and if that fails, then SCHED_BATCH. No need for this to be an * error if it fails */ - if (!opt_benchmark && opt_priority == 0) + if ( !opt_priority ) { setpriority(PRIO_PROCESS, 0, 19); - if ( !thr_id && !opt_quiet ) - applog(LOG_INFO, "Miner thread priority %d (nice 19)", opt_priority ); + if ( !thr_id && opt_debug ) + applog(LOG_INFO, "Default miner thread priority %d (nice 19)", opt_priority ); drop_policy(); } else @@ -2159,82 +2229,69 @@ static void *miner_thread( void *userdata ) case 4: prio = -10; break; case 5: prio = -15; } - if ( !( thr_id || opt_quiet ) ) - applog( LOG_INFO, "Miner thread priority %d (nice %d)", + if ( !thr_id ) + { + applog( LOG_INFO, "User set miner thread priority %d (nice %d)", opt_priority, prio ); + applog( LOG_WARNING, "High priority mining threads may cause system instability"); + } #endif setpriority(PRIO_PROCESS, 0, prio); if ( opt_priority == 0 ) drop_policy(); } + // CPU thread affinity - if ( num_cpus > 1 ) - { -#if AFFINITY_USES_UINT128 - // Default affinity - if ( (opt_affinity == (uint128_t)(-1) ) && opt_n_threads > 1 ) - { - affine_to_cpu_mask( thr_id, (uint128_t)1 << (thr_id % num_cpus) ); - if ( opt_debug ) - applog( LOG_INFO, "Binding thread %d to cpu %d.", - thr_id, thr_id % num_cpus, - u128_hi64( (uint128_t)1 << (thr_id % num_cpus) ), - u128_lo64( (uint128_t)1 << (thr_id % num_cpus) ) ); - } -#else - if ( ( opt_affinity == -1 ) && ( opt_n_threads > 1 ) ) - { - affine_to_cpu_mask( thr_id, 1 << (thr_id % num_cpus) ); - if (opt_debug) - applog( LOG_DEBUG, "Binding thread %d to cpu %d.", - thr_id, thr_id % num_cpus, 1 << (thr_id % num_cpus)) ; - } -#endif - else // Custom affinity - { - affine_to_cpu_mask( thr_id, opt_affinity ); - if ( opt_debug ) - { -#if AFFINITY_USES_UINT128 - if ( num_cpus > 64 ) - applog( LOG_INFO, "Binding thread %d to mask %016llx %016llx", - thr_id, u128_hi64( opt_affinity ), - u128_lo64( opt_affinity ) ); - else - applog( LOG_INFO, "Binding thread %d to mask %016llx", - thr_id, opt_affinity ); -#else - applog( LOG_INFO, "Binding thread %d to mask %016llx", - thr_id, opt_affinity ); -#endif - } - } - } // num_cpus > 1 + if ( opt_affinity && num_cpus > 1 ) affine_to_cpu( mythr ); if ( !algo_gate.miner_thread_init( thr_id ) ) { - applog( LOG_ERR, "FAIL: thread %u failed to initialize", thr_id ); + applog( LOG_ERR, "FAIL: thread %d failed to initialize", thr_id ); exit (1); } // wait for stratum to send first job - if ( have_stratum ) while ( unlikely( !g_work.job_id ) ) sleep(1); + if ( have_stratum ) while ( unlikely( stratum_down ) ) + { + if 
( opt_debug ) + applog( LOG_INFO, "Thread %d waiting for first job", thr_id ); + sleep(1); + } + // nominal startng values + int64_t max64 = 20; + thr_hashrates[thr_id] = 20; while (1) { uint64_t hashes_done; struct timeval tv_start, tv_end, diff; - int64_t max64 = 1000; +// int64_t max64 = 1000; int nonce_found = 0; if ( likely( algo_gate.do_this_thread( thr_id ) ) ) { - if ( have_stratum ) + if ( have_stratum ) { - if ( *nonceptr >= end_nonce ) - stratum_gen_work( &stratum, &g_work ); + while ( unlikely( stratum_down ) ) + sleep( 1 ); + if ( unlikely( ( *nonceptr >= end_nonce ) + && !work_restart[thr_id].restart ) ) + { + if ( opt_extranonce ) + stratum_gen_work( &stratum, &g_work ); + else + { + if ( !thr_id ) + { + applog( LOG_WARNING, "nonce range exhausted, extranonce not subscribed" ); + applog( LOG_WARNING, "waiting for new work..."); + } + while ( !work_restart[thr_id].restart ) + sleep ( 1 ); + } + } } - else + else // GBT or getwork { pthread_rwlock_wrlock( &g_work_lock ); @@ -2245,8 +2302,7 @@ static void *miner_thread( void *userdata ) if ( unlikely( !get_work( mythr, &g_work ) ) ) { pthread_rwlock_unlock( &g_work_lock ); - applog( LOG_ERR, "work retrieval failed, exiting " - "mining thread %d", thr_id ); + applog( LOG_ERR, "work retrieval failed, exiting miner thread %d", thr_id ); goto out; } g_work_time = time(NULL); @@ -2268,12 +2324,6 @@ static void *miner_thread( void *userdata ) if ( unlikely( !algo_gate.ready_to_mine( &work, &stratum, thr_id ) ) ) continue; - // conditional mining - if ( unlikely( !wanna_mine( thr_id ) ) ) - { - sleep(5); - continue; - } // LP_SCANTIME overrides opt_scantime option, is this right? @@ -2343,11 +2393,15 @@ static void *miner_thread( void *userdata ) if ( diff.tv_usec || diff.tv_sec ) { pthread_mutex_lock( &stats_lock ); + total_hashes += hashes_done; + total_hashes_time = tv_end; thr_hashrates[thr_id] = hashes_done / ( diff.tv_sec + diff.tv_usec * 1e-6 ); pthread_mutex_unlock( &stats_lock ); } + // This code is deprecated, scanhash should never return true. + // This remains as a backup in case some old implementations still exist. // If unsubmiited nonce(s) found, submit now. if ( unlikely( nonce_found && !opt_benchmark ) ) { @@ -2374,48 +2428,6 @@ static void *miner_thread( void *userdata ) } } -#if !(defined(__WINDOWS__) || defined(_WIN64) || defined(_WIN32)) - - // Display CPU temperature and clock rate. - int curr_temp, prev_hi_temp; - static struct timeval cpu_temp_time = {0}; - - pthread_mutex_lock( &stats_lock ); - - prev_hi_temp = hi_temp; - curr_temp = cpu_temp(0); - if ( curr_temp > hi_temp ) hi_temp = curr_temp; - - pthread_mutex_unlock( &stats_lock ); - - if ( !opt_quiet || ( curr_temp >= 80 ) ) - { - int wait_time = curr_temp >= 80 ? 20 : curr_temp >= 70 ? 
60 : 120; - timeval_subtract( &diff, &tv_end, &cpu_temp_time ); - if ( ( diff.tv_sec > wait_time ) || ( curr_temp > prev_hi_temp ) ) - { - char tempstr[32]; - float lo_freq = 0., hi_freq = 0.; - - memcpy( &cpu_temp_time, &tv_end, sizeof(cpu_temp_time) ); - linux_cpu_hilo_freq( &lo_freq, &hi_freq ); - if ( use_colors && ( curr_temp >= 70 ) ) - { - if ( curr_temp >= 80 ) - sprintf( tempstr, "%s%d C%s", CL_RED, curr_temp, CL_WHT ); - else - sprintf( tempstr, "%s%d C%s", CL_YLW, curr_temp, CL_WHT ); - } - else - sprintf( tempstr, "%d C", curr_temp ); - - applog( LOG_NOTICE,"CPU temp: curr %s (max %d), Freq: %.3f/%.3f GHz", - tempstr, prev_hi_temp, lo_freq / 1e6, hi_freq / 1e6 ); - } - } - -#endif - // display hashrate if ( unlikely( opt_hash_meter ) ) { @@ -2439,7 +2451,6 @@ static void *miner_thread( void *userdata ) && thr_id == opt_n_threads - 1 ) ) { double hashrate = 0.; - pthread_mutex_lock( &stats_lock ); for ( i = 0; i < opt_n_threads; i++ ) hashrate += thr_hashrates[i]; @@ -2448,18 +2459,37 @@ static void *miner_thread( void *userdata ) if ( opt_benchmark ) { + struct timeval uptime; char hr[16]; char hr_units[2] = {0,0}; - scale_hash_for_display( &hashrate, hr_units ); - sprintf( hr, "%.2f", hashrate ); -#if ((defined(_WIN64) || defined(__WINDOWS__)) || defined(_WIN32)) - applog( LOG_NOTICE, "Total: %s %sH/s", hr, hr_units ); + timeval_subtract( &uptime, &total_hashes_time, &session_start ); + double hashrate = safe_div( total_hashes, uptime.tv_sec, 0. ); + + if ( hashrate > 0. ) + { + scale_hash_for_display( &hashrate, hr_units ); + sprintf( hr, "%.2f", hashrate ); +#if (defined(_WIN64) || defined(__WINDOWS__) || defined(_WIN32)) + applog( LOG_NOTICE, "Total: %s %sH/s", hr, hr_units ); #else - applog( LOG_NOTICE, "Total: %s %sH/s, CPU temp: %dC", - hr, hr_units, (uint32_t)cpu_temp(0) ); + float lo_freq = 0., hi_freq = 0.; + linux_cpu_hilo_freq( &lo_freq, &hi_freq ); + applog( LOG_NOTICE, + "Total: %s %sH/s, Temp: %dC, Freq: %.3f/%.3f GHz", + hr, hr_units, (uint32_t)cpu_temp(0), lo_freq / 1e6, + hi_freq / 1e6 ); #endif - } + } + } } // benchmark + + // conditional mining + if ( unlikely( !wanna_mine( thr_id ) ) ) + { + sleep(5); + continue; + } + } // miner_thread loop out: @@ -2481,21 +2511,8 @@ json_t *std_longpoll_rpc_call( CURL *curl, int *err, char* lp_url ) char *req = NULL; if (have_gbt) { - // Parameters for MinX BlockTemplate BEGIN - /* - req = (char*) malloc(strlen(gbt_lp_req) + strlen(lp_id) + 1); - sprintf(req, gbt_lp_req, lp_id); - */ - if(!opt_minotaurx) - { - req = (char*) malloc(strlen(gbt_lp_req) + strlen(lp_id) + 1); - sprintf(req, gbt_lp_req, lp_id); - } else - { - req = (char*) malloc(strlen(gbt_lp_req_minx) + strlen(lp_id) + 1); - sprintf(req, gbt_lp_req_minx, lp_id); - } - // Parameters for MinX BlockTemplate END + req = (char*) malloc( strlen(gbt_lp_req) + strlen(lp_id) + 1 ); + sprintf( req, gbt_lp_req, lp_id ); } val = json_rpc_call( curl, rpc_url, rpc_userpass, getwork_req, err, JSON_RPC_LONGPOLL ); @@ -2590,7 +2607,7 @@ static void *longpoll_thread(void *userdata) if (!opt_quiet) { char netinfo[64] = { 0 }; - if (net_diff > 0.) + if ( net_diff > 0. 
) { sprintf(netinfo, ", diff %.3f", net_diff); } @@ -2729,6 +2746,18 @@ void std_build_extraheader( struct work* g_work, struct stratum_ctx* sctx ) sctx->job.final_sapling_hash ); } +// Loop is out of order: +// +// connect/reconnect +// handle message +// get new message +// +// change to +// connect/reconnect +// get new message +// handle message + + static void *stratum_thread(void *userdata ) { struct thr_info *mythr = (struct thr_info *) userdata; @@ -2746,6 +2775,9 @@ static void *stratum_thread(void *userdata ) if ( unlikely( stratum_need_reset ) ) { stratum_need_reset = false; + gettimeofday( &stratum_reset_time, NULL ); + stratum_down = true; + stratum_errors++; stratum_disconnect( &stratum ); if ( strcmp( stratum.url, rpc_url ) ) { @@ -2753,14 +2785,17 @@ static void *stratum_thread(void *userdata ) stratum.url = strdup( rpc_url ); applog(LOG_BLUE, "Connection changed to %s", short_url); } - else // if ( !opt_quiet ) - applog(LOG_WARNING, "Stratum connection reset"); + else + applog(LOG_BLUE, "Stratum connection reset"); // reset stats queue as well - s_get_ptr = s_put_ptr = 0; + restart_threads(); + if ( s_get_ptr != s_put_ptr ) s_get_ptr = s_put_ptr = 0; } while ( !stratum.curl ) { + stratum_down = true; + restart_threads(); pthread_rwlock_wrlock( &g_work_lock ); g_work_time = 0; pthread_rwlock_unlock( &g_work_lock ); @@ -2781,17 +2816,14 @@ static void *stratum_thread(void *userdata ) } else { - restart_threads(); + stratum_down = false; applog(LOG_BLUE,"Stratum connection established" ); + if ( stratum.new_job ) // prime first job + stratum_gen_work( &stratum, &g_work ); } } - report_summary_log( ( stratum_diff != stratum.job.diff ) - && ( stratum_diff != 0. ) ); - - if ( stratum.new_job ) - stratum_gen_work( &stratum, &g_work ); - + // Wait for new message from server if ( likely( stratum_socket_full( &stratum, opt_timeout ) ) ) { if ( likely( s = stratum_recv_line( &stratum ) ) ) @@ -2802,73 +2834,309 @@ static void *stratum_thread(void *userdata ) } else { - applog(LOG_WARNING, "Stratum connection interrupted"); - stratum_disconnect( &stratum ); +// applog(LOG_WARNING, "Stratum connection interrupted"); +// stratum_disconnect( &stratum ); + stratum_need_reset = true; } } else { applog(LOG_ERR, "Stratum connection timeout"); - stratum_disconnect( &stratum ); + stratum_need_reset = true; +// stratum_disconnect( &stratum ); } + report_summary_log( ( stratum_diff != stratum.job.diff ) + && ( stratum_diff != 0. ) ); + + if ( !stratum_need_reset ) + { + // Is keepalive needed? Mutex would normally be required but that + // would block any attempt to submit a share. A share is more + // important even if it messes up the keepalive. + + if ( opt_stratum_keepalive ) + { + struct timeval now, et; + gettimeofday( &now, NULL ); + // any shares submitted since last keepalive? 
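The keepalive handling that follows boils down to two timeouts: if no share has been submitted for the keepalive period, a lower difficulty is suggested to the pool; if nothing has been submitted for a further 60 seconds beyond that, measured from the later of the last share and the last connection reset, the stratum connection is reset. A simplified sketch of that decision with illustrative names; it ignores that the real timer is also restarted each time a lower difficulty is suggested:

    #include <time.h>

    typedef enum { KA_NONE, KA_SUGGEST_LOWER_DIFF, KA_RESET_CONNECTION } ka_action_t;

    // T is the keepalive timeout in seconds; the other times are epoch seconds.
    static ka_action_t keepalive_action( time_t now, time_t last_share,
                                         time_t last_reset, time_t T )
    {
        time_t last_progress = last_share > last_reset ? last_share : last_reset;
        if ( now - last_progress > T + 60 ) return KA_RESET_CONNECTION;
        if ( now - last_share > T )         return KA_SUGGEST_LOWER_DIFF;
        return KA_NONE;
    }
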
+ if ( last_submit_time.tv_sec > stratum_keepalive_timer.tv_sec ) + memcpy( &stratum_keepalive_timer, &last_submit_time, + sizeof (struct timeval) ); + + timeval_subtract( &et, &now, &stratum_keepalive_timer ); + + if ( et.tv_sec > stratum_keepalive_timeout ) + { + double diff = stratum.job.diff * 0.5; + stratum_keepalive_timer = now; + if ( !opt_quiet ) + applog( LOG_BLUE, + "Stratum keepalive requesting lower difficulty" ); + stratum_suggest_difficulty( &stratum, diff ); + } + + if ( last_submit_time.tv_sec > stratum_reset_time.tv_sec ) + timeval_subtract( &et, &now, &last_submit_time ); + else + timeval_subtract( &et, &now, &stratum_reset_time ); + + if ( et.tv_sec > stratum_keepalive_timeout + 60 ) + { + applog( LOG_NOTICE, "No shares submitted, resetting stratum connection" ); + stratum_need_reset = true; + stratum_keepalive_timer = now; + } + } // stratum_keepalive + + if ( stratum.new_job && !stratum_need_reset ) + stratum_gen_work( &stratum, &g_work ); + + } // stratum_need_reset } // loop out: return NULL; } -void show_version_and_exit(void) +static void show_credits() { - printf("\n built on " __DATE__ -#ifdef _MSC_VER - " with VC++ 2013\n"); -#elif defined(__GNUC__) - " with GCC"); - printf(" %d.%d.%d\n", __GNUC__, __GNUC_MINOR__, __GNUC_PATCHLEVEL__); -#endif - - printf(" features:" -#if defined(USE_ASM) && defined(__i386__) - " i386" -#endif -#if defined(USE_ASM) && defined(__x86_64__) - " x86_64" -#endif -#if defined(USE_ASM) && (defined(__i386__) || defined(__x86_64__)) - " SSE2" -#endif -#if defined(__x86_64__) && defined(USE_AVX) - " AVX" -#endif -#if defined(__x86_64__) && defined(USE_AVX2) - " AVX2" -#endif -#if defined(__x86_64__) && defined(USE_XOP) - " XOP" -#endif -#if defined(USE_ASM) && defined(__arm__) && defined(__APCS_32__) - " ARM" -#if defined(__ARM_ARCH_5E__) || defined(__ARM_ARCH_5TE__) || \ - defined(__ARM_ARCH_5TEJ__) || defined(__ARM_ARCH_6__) || \ - defined(__ARM_ARCH_6J__) || defined(__ARM_ARCH_6K__) || \ - defined(__ARM_ARCH_6M__) || defined(__ARM_ARCH_6T2__) || \ - defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6ZK__) || \ - defined(__ARM_ARCH_7__) || \ - defined(__ARM_ARCH_7A__) || defined(__ARM_ARCH_7R__) || \ - defined(__ARM_ARCH_7M__) || defined(__ARM_ARCH_7EM__) - " ARMv5E" -#endif -#if defined(__ARM_NEON__) - " NEON" -#endif -#endif - "\n\n"); + printf("\n ********** "PACKAGE_NAME" "PACKAGE_VERSION" *********** \n"); + printf(" A CPU miner with multi algo support and optimized for CPUs\n"); + printf(" with AVX512, SHA and VAES extensions by JayDDee.\n"); + printf(" BTC donation address: 12tdvfF7KmAsihBXQXynT6E6th2c2pByTT\n\n"); +} - /* dependencies versions */ - printf("%s\n", curl_version()); -#ifdef JANSSON_VERSION - printf("jansson/%s ", JANSSON_VERSION); -#endif +#define check_cpu_capability() cpu_capability( false ) +#define display_cpu_capability() cpu_capability( true ) +static bool cpu_capability( bool display_only ) +{ + char cpu_brand[0x40]; + bool cpu_has_sse2 = has_sse2(); + bool cpu_has_aes = has_aes_ni(); + bool cpu_has_sse42 = has_sse42(); + bool cpu_has_avx = has_avx(); + bool cpu_has_avx2 = has_avx2(); + bool cpu_has_sha = has_sha(); + bool cpu_has_avx512 = has_avx512(); + bool cpu_has_vaes = has_vaes(); + bool sw_has_aes = false; + bool sw_has_sse2 = false; + bool sw_has_sse42 = false; + bool sw_has_avx = false; + bool sw_has_avx2 = false; + bool sw_has_avx512 = false; + bool sw_has_sha = false; + bool sw_has_vaes = false; + set_t algo_features = algo_gate.optimizations; + bool algo_has_sse2 = set_incl( SSE2_OPT, algo_features 
); + bool algo_has_aes = set_incl( AES_OPT, algo_features ); + bool algo_has_sse42 = set_incl( SSE42_OPT, algo_features ); + bool algo_has_avx = set_incl( AVX_OPT, algo_features ); + bool algo_has_avx2 = set_incl( AVX2_OPT, algo_features ); + bool algo_has_avx512 = set_incl( AVX512_OPT, algo_features ); + bool algo_has_sha = set_incl( SHA_OPT, algo_features ); + bool algo_has_vaes = set_incl( VAES_OPT, algo_features ); + bool use_aes; + bool use_sse2; + bool use_sse42; + bool use_avx; + bool use_avx2; + bool use_avx512; + bool use_sha; + bool use_vaes; + bool use_none; + + #ifdef __AES__ + sw_has_aes = true; + #endif + #ifdef __SSE2__ + sw_has_sse2 = true; + #endif + #ifdef __SSE4_2__ + sw_has_sse42 = true; + #endif + #ifdef __AVX__ + sw_has_avx = true; + #endif + #ifdef __AVX2__ + sw_has_avx2 = true; + #endif + #if (defined(__AVX512F__) && defined(__AVX512DQ__) && defined(__AVX512BW__) && defined(__AVX512VL__)) + sw_has_avx512 = true; + #endif + #ifdef __SHA__ + sw_has_sha = true; + #endif + #ifdef __VAES__ + sw_has_vaes = true; + #endif + + +// #if !((__AES__) || (__SSE2__)) +// printf("Neither __AES__ nor __SSE2__ defined.\n"); +// #endif + + cpu_brand_string( cpu_brand ); + printf( "CPU: %s\n", cpu_brand ); + + printf("SW built on " __DATE__ + #ifdef _MSC_VER + " with VC++ 2013\n"); + #elif defined(__GNUC__) + " with GCC"); + printf(" %d.%d.%d\n", __GNUC__, __GNUC_MINOR__, __GNUC_PATCHLEVEL__); + #else + printf("\n"); + #endif + + printf("CPU features: "); + if ( cpu_has_avx512 ) printf( " AVX512" ); + else if ( cpu_has_avx2 ) printf( " AVX2 " ); + else if ( cpu_has_avx ) printf( " AVX " ); + else if ( cpu_has_sse42 ) printf( " SSE4.2" ); + else if ( cpu_has_sse2 ) printf( " SSE2 " ); + if ( cpu_has_vaes ) printf( " VAES" ); + else if ( cpu_has_aes ) printf( " AES" ); + if ( cpu_has_sha ) printf( " SHA" ); + + printf("\nSW features: "); + if ( sw_has_avx512 ) printf( " AVX512" ); + else if ( sw_has_avx2 ) printf( " AVX2 " ); + else if ( sw_has_avx ) printf( " AVX " ); + else if ( sw_has_sse42 ) printf( " SSE4.2" ); + else if ( sw_has_sse2 ) printf( " SSE2 " ); + if ( sw_has_vaes ) printf( " VAES" ); + else if ( sw_has_aes ) printf( " AES" ); + if ( sw_has_sha ) printf( " SHA" ); + + if ( !display_only ) + { + printf("\nAlgo features:"); + if ( algo_features == EMPTY_SET ) printf( " None" ); + else + { + if ( algo_has_avx512 ) printf( " AVX512" ); + else if ( algo_has_avx2 ) printf( " AVX2 " ); + else if ( algo_has_sse42 ) printf( " SSE4.2" ); + else if ( algo_has_sse2 ) printf( " SSE2 " ); + if ( algo_has_vaes ) printf( " VAES" ); + else if ( algo_has_aes ) printf( " AES" ); + if ( algo_has_sha ) printf( " SHA" ); + } + } + printf("\n"); + + if ( display_only ) return true; + + // Check for CPU and build incompatibilities + if ( !cpu_has_sse2 ) + { + printf( "A CPU with SSE2 is required to use cpuminer-opt\n" ); + return false; + } + if ( sw_has_avx2 && !( cpu_has_avx2 && cpu_has_aes ) ) + { + printf( "The SW build requires a CPU with AES and AVX2!\n" ); + return false; + } + if ( sw_has_sse42 && !cpu_has_sse42 ) + { + printf( "The SW build requires a CPU with SSE4.2!\n" ); + return false; + } + if ( sw_has_aes && !cpu_has_aes ) + { + printf( "The SW build requires a CPU with AES!\n" ); + return false; + } + if ( sw_has_sha && !cpu_has_sha ) + { + printf( "The SW build requires a CPU with SHA!\n" ); + return false; + } + + // Determine mining options + use_sse2 = cpu_has_sse2 && algo_has_sse2; + use_sse42 = cpu_has_sse42 && sw_has_sse42 && algo_has_sse42; + use_avx = cpu_has_avx && 
sw_has_avx && algo_has_avx; + use_aes = cpu_has_aes && sw_has_aes && algo_has_aes; + use_avx2 = cpu_has_avx2 && sw_has_avx2 && algo_has_avx2; + use_avx512 = cpu_has_avx512 && sw_has_avx512 && algo_has_avx512; + use_sha = cpu_has_sha && sw_has_sha && algo_has_sha; + use_vaes = cpu_has_vaes && sw_has_vaes && algo_has_vaes; + use_none = !( use_sse2 || use_sse42 || use_avx || use_aes || use_avx512 + || use_avx2 || use_sha || use_vaes ); + + // Display best options + printf( "\nStarting miner with" ); + if ( use_none ) printf( " no optimizations" ); + else + { + if ( use_avx512 ) printf( " AVX512" ); + else if ( use_avx2 ) printf( " AVX2" ); + else if ( use_avx ) printf( " AVX" ); + else if ( use_sse42 ) printf( " SSE42" ); + else if ( use_sse2 ) printf( " SSE2" ); + if ( use_vaes ) printf( " VAES" ); + else if ( use_aes ) printf( " AES" ); + if ( use_sha ) printf( " SHA" ); + } + printf( "...\n\n" ); + + return true; +} + +void show_version_and_exit(void) +{ + printf("\n built on " __DATE__ +#ifdef _MSC_VER + " with VC++ 2013\n"); +#elif defined(__GNUC__) + " with GCC"); + printf(" %d.%d.%d\n", __GNUC__, __GNUC_MINOR__, __GNUC_PATCHLEVEL__); +#endif + + printf(" features:" +#if defined(USE_ASM) && defined(__i386__) + " i386" +#endif +#if defined(USE_ASM) && defined(__x86_64__) + " x86_64" +#endif +#if defined(USE_ASM) && (defined(__i386__) || defined(__x86_64__)) + " SSE2" +#endif +#if defined(__x86_64__) && defined(USE_AVX) + " AVX" +#endif +#if defined(__x86_64__) && defined(USE_AVX2) + " AVX2" +#endif +#if defined(__x86_64__) && defined(USE_XOP) + " XOP" +#endif +#if defined(USE_ASM) && defined(__arm__) && defined(__APCS_32__) + " ARM" +#if defined(__ARM_ARCH_5E__) || defined(__ARM_ARCH_5TE__) || \ + defined(__ARM_ARCH_5TEJ__) || defined(__ARM_ARCH_6__) || \ + defined(__ARM_ARCH_6J__) || defined(__ARM_ARCH_6K__) || \ + defined(__ARM_ARCH_6M__) || defined(__ARM_ARCH_6T2__) || \ + defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6ZK__) || \ + defined(__ARM_ARCH_7__) || \ + defined(__ARM_ARCH_7A__) || defined(__ARM_ARCH_7R__) || \ + defined(__ARM_ARCH_7M__) || defined(__ARM_ARCH_7EM__) + " ARMv5E" +#endif +#if defined(__ARM_NEON__) + " NEON" +#endif +#endif + "\n\n"); + + printf("%s\n", curl_version()); +#ifdef JANSSON_VERSION + printf("jansson/%s ", JANSSON_VERSION); +#endif #ifdef PTW32_VERSION printf("pthreads/%d.%d.%d.%d ", PTW32_VERSION); #endif @@ -2876,7 +3144,6 @@ void show_version_and_exit(void) exit(0); } - void show_usage_and_exit(int status) { if (status) @@ -2897,7 +3164,7 @@ void parse_arg(int key, char *arg ) { char *p; int v, i; - uint64_t ul; +// uint64_t ul; double d; switch( key ) @@ -3208,26 +3475,14 @@ void parse_arg(int key, char *arg ) break; #endif case 1020: // cpu-affinity - p = strstr(arg, "0x"); - if ( p ) - ul = strtoull( p, NULL, 16 ); - else - ul = atoll( arg ); -// if ( ul > ( 1ULL << num_cpus ) - 1ULL ) -// ul = -1LL; -#if AFFINITY_USES_UINT128 -// replicate the low 64 bits to make a full 128 bit mask if there are more -// than 64 CPUs, otherwise zero extend the upper half. - opt_affinity = (uint128_t)ul; - if ( num_cpus > 64 ) - opt_affinity = (opt_affinity << 64 ) | opt_affinity; -#else - opt_affinity = ul; -#endif - break; + p = strstr( arg, "0x" ); + opt_affinity = p ? 
strtoull( p, NULL, 16 ) + : atoll( arg ); + break; case 1021: // cpu-priority v = atoi(arg); - if (v < 0 || v > 5) /* sanity check */ + applog(LOG_NOTICE,"--cpu-priority is deprecated and will be removed from a future release"); + if (v < 0 || v > 5) /* sanity check */ show_usage_and_exit(1); opt_priority = v; break; @@ -3263,12 +3518,20 @@ void parse_arg(int key, char *arg ) break; case 1024: opt_randomize = true; - break; - case 1026: - opt_reset_on_stale = true; + applog(LOG_NOTICE,"--randomize is deprecated and will be removed from a future release"); break; - case 'V': - show_version_and_exit(); + case 1027: // data-file + opt_data_file = strdup( arg ); + break; + case 1028: // verify + opt_verify = true; + break; + case 1029: // stratum-keepalive + opt_stratum_keepalive = true; + break; + case 'V': + display_cpu_capability(); + exit(0); case 'h': show_usage_and_exit(0); @@ -3323,20 +3586,18 @@ static void parse_cmdline(int argc, char *argv[]) while (1) { #if HAVE_GETOPT_LONG - key = getopt_long(argc, argv, short_options, options, NULL); + key = getopt_long(argc, argv, short_options, options, NULL); #else - key = getopt(argc, argv, short_options); + key = getopt(argc, argv, short_options); #endif - if (key < 0) - break; - - parse_arg(key, optarg); + if ( key < 0 ) break; + parse_arg( key, optarg ); } - if (optind < argc) + if ( optind < argc ) { - fprintf(stderr, "%s: unsupported non-option argument -- '%s'\n", - argv[0], argv[optind]); - show_usage_and_exit(1); + fprintf( stderr, "%s: unsupported non-option argument -- '%s'\n", + argv[0], argv[optind]); + show_usage_and_exit(1); } } @@ -3385,193 +3646,6 @@ static int thread_create(struct thr_info *thr, void* func) return err; } -static void show_credits() -{ - printf("\n ********** "PACKAGE_NAME" "PACKAGE_VERSION" *********** \n"); - printf(" A CPU miner with multi algo support and optimized for CPUs \n"); - printf(" with AVX512, SHA and VAES extensions by JayDDee. \n"); - printf(" BTC donation address: 12tdvfF7KmAsihBXQXynT6E6th2c2pByTT \n"); - printf(""); - printf(" _ _ \n"); - printf(" ___ ___ _ _ _____|_|___ ___ ___ ___ ___ ___| |_ \n"); - printf(" | _| . | | | | | | -_| _|___| . | . 
| _| \n"); - printf(" |___| _|___|_|_|_|_|_|_|___|_| |___| _|_| \n"); - printf(" |_| |_| \n"); - printf(""); - -} - -bool check_cpu_capability () -{ - char cpu_brand[0x40]; - bool cpu_has_sse2 = has_sse2(); - bool cpu_has_aes = has_aes_ni(); - bool cpu_has_sse42 = has_sse42(); - bool cpu_has_avx = has_avx(); - bool cpu_has_avx2 = has_avx2(); - bool cpu_has_sha = has_sha(); - bool cpu_has_avx512 = has_avx512(); - bool cpu_has_vaes = has_vaes(); - bool sw_has_aes = false; - bool sw_has_sse2 = false; - bool sw_has_sse42 = false; - bool sw_has_avx = false; - bool sw_has_avx2 = false; - bool sw_has_avx512 = false; - bool sw_has_sha = false; - bool sw_has_vaes = false; - set_t algo_features = algo_gate.optimizations; - bool algo_has_sse2 = set_incl( SSE2_OPT, algo_features ); - bool algo_has_aes = set_incl( AES_OPT, algo_features ); - bool algo_has_sse42 = set_incl( SSE42_OPT, algo_features ); - bool algo_has_avx2 = set_incl( AVX2_OPT, algo_features ); - bool algo_has_avx512 = set_incl( AVX512_OPT, algo_features ); - bool algo_has_sha = set_incl( SHA_OPT, algo_features ); - bool algo_has_vaes = set_incl( VAES_OPT, algo_features ); - bool algo_has_vaes256 = set_incl( VAES256_OPT, algo_features ); - bool use_aes; - bool use_sse2; - bool use_sse42; - bool use_avx2; - bool use_avx512; - bool use_sha; - bool use_vaes; - bool use_none; - - #ifdef __AES__ - sw_has_aes = true; - #endif - #ifdef __SSE2__ - sw_has_sse2 = true; - #endif - #ifdef __SSE4_2__ - sw_has_sse42 = true; - #endif - #ifdef __AVX__ - sw_has_avx = true; - #endif - #ifdef __AVX2__ - sw_has_avx2 = true; - #endif - #if (defined(__AVX512F__) && defined(__AVX512DQ__) && defined(__AVX512BW__) && defined(__AVX512VL__)) - sw_has_avx512 = true; - #endif - #ifdef __SHA__ - sw_has_sha = true; - #endif - #ifdef __VAES__ - sw_has_vaes = true; - #endif - - -// #if !((__AES__) || (__SSE2__)) -// printf("Neither __AES__ nor __SSE2__ defined.\n"); -// #endif - - cpu_brand_string( cpu_brand ); - printf( "CPU: %s\n", cpu_brand ); - - printf("SW built on " __DATE__ - #ifdef _MSC_VER - " with VC++ 2013\n"); - #elif defined(__GNUC__) - " with GCC"); - printf(" %d.%d.%d\n", __GNUC__, __GNUC_MINOR__, __GNUC_PATCHLEVEL__); - #else - printf("\n"); - #endif - - printf("CPU features: "); - if ( cpu_has_avx512 ) printf( " AVX512" ); - else if ( cpu_has_avx2 ) printf( " AVX2 " ); - else if ( cpu_has_avx ) printf( " AVX " ); - else if ( cpu_has_sse42 ) printf( " SSE4.2" ); - else if ( cpu_has_sse2 ) printf( " SSE2 " ); - if ( cpu_has_vaes ) printf( " VAES" ); - else if ( cpu_has_aes ) printf( " AES" ); - if ( cpu_has_sha ) printf( " SHA" ); - - printf("\nSW features: "); - if ( sw_has_avx512 ) printf( " AVX512" ); - else if ( sw_has_avx2 ) printf( " AVX2 " ); - else if ( sw_has_avx ) printf( " AVX " ); - else if ( sw_has_sse42 ) printf( " SSE4.2" ); - else if ( sw_has_sse2 ) printf( " SSE2 " ); - if ( sw_has_vaes ) printf( " VAES" ); - else if ( sw_has_aes ) printf( " AES" ); - if ( sw_has_sha ) printf( " SHA" ); - - printf("\nAlgo features:"); - if ( algo_features == EMPTY_SET ) printf( " None" ); - else - { - if ( algo_has_avx512 ) printf( " AVX512" ); - else if ( algo_has_avx2 ) printf( " AVX2 " ); - else if ( algo_has_sse42 ) printf( " SSE4.2" ); - else if ( algo_has_sse2 ) printf( " SSE2 " ); - if ( algo_has_vaes ) printf( " VAES" ); - else if ( algo_has_aes ) printf( " AES" ); - if ( algo_has_sha ) printf( " SHA" ); - } - printf("\n"); - - // Check for CPU and build incompatibilities - if ( !cpu_has_sse2 ) - { - printf( "A CPU with SSE2 is required to use 
cpuminer-opt\n" ); - return false; - } - if ( sw_has_avx2 && !( cpu_has_avx2 && cpu_has_aes ) ) - { - printf( "The SW build requires a CPU with AES and AVX2!\n" ); - return false; - } - if ( sw_has_sse42 && !cpu_has_sse42 ) - { - printf( "The SW build requires a CPU with SSE4.2!\n" ); - return false; - } - if ( sw_has_aes && !cpu_has_aes ) - { - printf( "The SW build requires a CPU with AES!\n" ); - return false; - } - if ( sw_has_sha && !cpu_has_sha ) - { - printf( "The SW build requires a CPU with SHA!\n" ); - return false; - } - - // Determine mining options - use_sse2 = cpu_has_sse2 && algo_has_sse2; - use_aes = cpu_has_aes && sw_has_aes && algo_has_aes; - use_sse42 = cpu_has_sse42 && sw_has_sse42 && algo_has_sse42; - use_avx2 = cpu_has_avx2 && sw_has_avx2 && algo_has_avx2; - use_avx512 = cpu_has_avx512 && sw_has_avx512 && algo_has_avx512; - use_sha = cpu_has_sha && sw_has_sha && algo_has_sha; - use_vaes = cpu_has_vaes && sw_has_vaes && algo_has_vaes - && ( use_avx512 || algo_has_vaes256 ); - use_none = !( use_sse2 || use_aes || use_sse42 || use_avx512 || use_avx2 || - use_sha || use_vaes ); - - // Display best options - printf( "\nStarting miner with" ); - if ( use_none ) printf( " no optimizations" ); - else - { - if ( use_avx512 ) printf( " AVX512" ); - else if ( use_avx2 ) printf( " AVX2" ); - else if ( use_sse42 ) printf( " SSE4.2" ); - else if ( use_sse2 ) printf( " SSE2" ); - if ( use_vaes ) printf( " VAES" ); - else if ( use_aes ) printf( " AES" ); - if ( use_sha ) printf( " SHA" ); - } - printf( "...\n\n" ); - - return true; -} - void get_defconfig_path(char *out, size_t bufsize, char *argv0); int main(int argc, char *argv[]) @@ -3587,26 +3661,21 @@ int main(int argc, char *argv[]) rpc_user = strdup(""); rpc_pass = strdup(""); - parse_cmdline(argc, argv); - #if defined(WIN32) -// SYSTEM_INFO sysinfo; -// GetSystemInfo(&sysinfo); -// num_cpus = sysinfo.dwNumberOfProcessors; -// What happens if GetActiveProcessorGroupCount called if groups not enabled? // Are Windows CPU Groups supported? -#if _WIN32_WINNT==0x0601 +#if defined(WINDOWS_CPU_GROUPS_ENABLED) num_cpus = 0; num_cpugroups = GetActiveProcessorGroupCount(); - for( i = 0; i < num_cpugroups; i++ ) + for( i = 0; i < num_cpugroups; i++ ) { - int cpus = GetActiveProcessorCount(i); + int cpus = GetActiveProcessorCount( i ); num_cpus += cpus; if (opt_debug) - applog(LOG_DEBUG, "Found %d cpus on cpu group %d", cpus, i); + applog( LOG_INFO, "Found %d CPUs in CPU group %d", cpus, i ); } + #else SYSTEM_INFO sysinfo; GetSystemInfo(&sysinfo); @@ -3622,25 +3691,27 @@ int main(int argc, char *argv[]) #else num_cpus = 1; #endif - if (num_cpus < 1) - num_cpus = 1; - if (!opt_n_threads) - opt_n_threads = num_cpus; + if ( num_cpus < 1 ) + num_cpus = 1; + opt_n_threads = num_cpus; + + parse_cmdline( argc, argv ); if ( opt_algo == ALGO_NULL ) { - fprintf(stderr, "%s: no algo supplied\n", argv[0]); + fprintf( stderr, "%s: No algo parameter specified\n", argv[0] ); show_usage_and_exit(1); - } + } + + // need to register to get algo optimizations for cpu capabilities + // but that causes registration logs before cpu capabilities is output. + // Would need to split register function into 2 parts. First part sets algo + // optimizations but no logging, second part does any logging. 
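check_cpu_capability() only enables a feature when three independent conditions hold: the CPU reports it at run time, the binary was compiled with it, and the selected algorithm has an implementation for it. A minimal sketch of that gating, using the AVX2 case and a hypothetical helper name for the build-time check:

    #include <stdbool.h>

    // Build-time flag: was the compiler allowed to emit AVX2 code?
    static bool sw_built_with_avx2( void )
    {
    #if defined(__AVX2__)
        return true;
    #else
        return false;
    #endif
    }

    // A feature is used only if CPU, build and algorithm all support it, e.g.
    //    use_avx2 = has_avx2() && sw_built_with_avx2() && algo_has_avx2;
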
+ if ( !register_algo_gate( opt_algo, &algo_gate ) ) exit(1); + + if ( !check_cpu_capability() ) exit(1); - if (opt_algo == ALGO_MINOTAURX) - { - // Activating MinX for GBT - // applog(LOG_INFO, "Activating the BlockTemplate for MinotaurX"); - opt_minotaurx = true; - } - if ( !opt_benchmark ) { if ( !short_url ) @@ -3679,9 +3750,6 @@ int main(int argc, char *argv[]) return 1; } - // All options must be set before starting the gate - if ( !register_algo_gate( opt_algo, &algo_gate ) ) exit(1); - if ( coinbase_address ) { pk_script_size = address_to_script( pk_script, pk_buffer_size, @@ -3693,14 +3761,6 @@ int main(int argc, char *argv[]) } } - // Initialize stats times and counters - memset( share_stats, 0, s_stats_size * sizeof (struct share_stats_t) ); - gettimeofday( &last_submit_time, NULL ); - memcpy( &five_min_start, &last_submit_time, sizeof (struct timeval) ); - memcpy( &session_start, &last_submit_time, sizeof (struct timeval) ); - - if ( !check_cpu_capability() ) exit(1); - pthread_mutex_init( &stats_lock, NULL ); pthread_rwlock_init( &g_work_lock, NULL ); pthread_mutex_init( &stratum.sock_lock, NULL ); @@ -3770,44 +3830,28 @@ int main(int argc, char *argv[]) } #endif -// To be confirmed with more than 64 cpus - if ( opt_affinity != -1 ) + if ( opt_affinity && num_cpus > max_cpus ) { - if ( !affinity_uses_uint128 && num_cpus > 64 ) + applog( LOG_WARNING, "More than %d CPUs, CPU affinity is disabled", + max_cpus ); + opt_affinity = 0ULL; + } + + if ( opt_affinity ) + { + for ( int thr = 0, cpu = 0; thr < opt_n_threads; thr++, cpu++ ) { - applog(LOG_WARNING,"Setting CPU affinity with more than 64 CPUs is only"); - applog(LOG_WARNING,"available on Linux. Using default affinity."); - opt_affinity = -1; + while ( !( ( opt_affinity >> ( cpu&63 ) ) & 1ULL ) ) cpu++; + thread_affinity_map[ thr ] = cpu % num_cpus; } -/* - else + if ( !opt_quiet ) { - affine_to_cpu_mask( -1, opt_affinity ); - if ( !opt_quiet ) - { -#if AFFINITY_USES_UINT128 - if ( num_cpus > 64 ) - applog(LOG_DEBUG, "Binding process to cpu mask %x", - u128_hi64( opt_affinity ), u128_lo64( opt_affinity ) ); - else - applog(LOG_DEBUG, "Binding process to cpu mask %x", - opt_affinity ); -#else - applog(LOG_DEBUG, "Binding process to cpu mask %x", - opt_affinity ); -#endif - } + char affinity_mask[64]; + format_affinity_mask( affinity_mask, opt_affinity ); + applog( LOG_INFO, "CPU affinity [%s]", affinity_mask ); } -*/ } - - if ( !opt_quiet && ( opt_n_threads < num_cpus ) ) - { - char affinity_map[64]; - format_affinity_map( affinity_map, opt_affinity ); - applog( LOG_INFO, "CPU affinity [%s]", affinity_map ); - } - + #ifdef HAVE_SYSLOG_H if (use_syslog) openlog("cpuminer", LOG_PID, LOG_USER); @@ -3862,11 +3906,14 @@ int main(int argc, char *argv[]) return 1; } } - if ( have_stratum ) + + if ( have_stratum ) { if ( opt_debug ) applog(LOG_INFO,"Creating stratum thread"); + stratum.new_job = false; // just to make sure + /* init stratum thread info */ stratum_thr_id = opt_n_threads + 2; thr = &thr_info[stratum_thr_id]; @@ -3904,31 +3951,45 @@ int main(int argc, char *argv[]) return 1; } if ( !opt_quiet ) - applog( LOG_INFO,"API listnening to %s:%d", opt_api_allow, + applog( LOG_INFO,"API listening to %s:%d", opt_api_allow, opt_api_listen ); } + // hold the stats lock while starting miner threads + pthread_mutex_lock( &stats_lock ); + /* start mining threads */ - for (i = 0; i < opt_n_threads; i++) + for ( i = 0; i < opt_n_threads; i++ ) { - usleep( 5000 ); +// usleep( 5000 ); thr = &thr_info[i]; thr->id = i; thr->q = tq_new(); - 
if (!thr->q) + if ( !thr->q ) return 1; - err = thread_create(thr, miner_thread); - if (err) { - applog(LOG_ERR, "Miner thread %d create failed", i); + err = thread_create( thr, miner_thread ); + if ( err ) + { + applog( LOG_ERR, "Miner thread %d create failed", i ); return 1; } } - applog( LOG_INFO, "%d of %d miner threads started using '%s' algorithm", - opt_n_threads, num_cpus, algo_names[opt_algo] ); + // Initialize stats times and counters + memset( share_stats, 0, s_stats_size * sizeof (struct share_stats_t) ); + gettimeofday( &last_submit_time, NULL ); + memcpy( &five_min_start, &last_submit_time, sizeof (struct timeval) ); + memcpy( &session_start, &last_submit_time, sizeof (struct timeval) ); + memcpy( &stratum_keepalive_timer, &last_submit_time, sizeof (struct timeval) ); + memcpy( &stratum_reset_time, &last_submit_time, sizeof (struct timeval) ); + memcpy( &total_hashes_time, &last_submit_time, sizeof (struct timeval) ); + pthread_mutex_unlock( &stats_lock ); + + applog( LOG_INFO, "%d of %d miner threads started using '%s' algorithm", + opt_n_threads, num_cpus, algo_names[opt_algo] ); /* main loop - simply wait for workio thread to exit */ pthread_join( thr_info[work_thr_id].pth, NULL ); applog( LOG_WARNING, "workio thread dead, exiting." ); return 0; -} +} \ No newline at end of file diff --git a/malloc-huge.c b/malloc-huge.c new file mode 100644 index 00000000..75c0165d --- /dev/null +++ b/malloc-huge.c @@ -0,0 +1,36 @@ +#include "malloc-huge.h" +#include "miner.h" + +#define HUGEPAGE_SIZE_2M (2 * 1024 * 1024) + +void *malloc_hugepages( size_t size ) +{ +#if !(defined(MAP_HUGETLB) && defined(MAP_ANON)) +// applog( LOG_WARNING, "Huge pages not available",size); + return NULL; +#else + + if ( size < HUGEPAGE_MIN_ALLOC ) + { +// applog( LOG_WARNING, "Block too small for huge pages: %lu bytes",size); + return NULL; + } + + const size_t hugepage_mask = (size_t)HUGEPAGE_SIZE_2M - 1; + void *p = NULL; + int flags = + #ifdef MAP_NOCORE + MAP_NOCORE | + #endif + MAP_HUGETLB | MAP_ANON | MAP_PRIVATE; + + // round size up to next page boundary + size = ( size + hugepage_mask ) & (~hugepage_mask); + + p = mmap( NULL, size, PROT_READ | PROT_WRITE, flags, -1, 0 ); + if ( p == MAP_FAILED ) + p = NULL; + return p; +#endif +} + diff --git a/malloc-huge.h b/malloc-huge.h new file mode 100644 index 00000000..371e09a5 --- /dev/null +++ b/malloc-huge.h @@ -0,0 +1,24 @@ +#if !(defined(MALLOC_HUGE__)) +#define MALLOC_HUGE__ + +#include +#include +#include +#include + +#ifdef __unix__ +#include +#endif + +#if defined(MAP_HUGETLB) + +// Minimum block size 6 MiB to use huge pages +#define HUGEPAGE_MIN_ALLOC (6 * 1024 * 1024) + +#endif + +// Attempt to allocate memory backed by 2 MiB pages, returns NULL on failure. 
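malloc_hugepages() only succeeds when the kernel actually has 2 MiB pages reserved (for example via vm.nr_hugepages on Linux); otherwise the mmap fails and it returns NULL, so callers need a fallback. A possible call pattern, with illustrative buffer name and size:

    #include <stdbool.h>
    #include <stdlib.h>
    #include <string.h>
    #include "malloc-huge.h"

    #define SCRATCHPAD_SIZE ( 16 * 1024 * 1024 )    // large enough to qualify for huge pages

    static void *alloc_scratchpad( bool *is_huge )
    {
        void *p = malloc_hugepages( SCRATCHPAD_SIZE );
        *is_huge = ( p != NULL );
        if ( !p ) p = malloc( SCRATCHPAD_SIZE );    // fall back to normal pages
        if ( p ) memset( p, 0, SCRATCHPAD_SIZE );   // touch the pages up front
        return p;
    }

A buffer that did come from huge pages is backed by mmap, so it would be released with munmap( p, size ) rather than free(); in practice such buffers usually live for the whole mining session.
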
+void *malloc_hugepages( size_t size ); + +#endif + diff --git a/miner.h b/miner.h index 2834101f..440b59e7 100644 --- a/miner.h +++ b/miner.h @@ -70,17 +70,25 @@ void *alloca (size_t); #ifdef HAVE_SYSLOG_H #include -#define LOG_BLUE 0x10 /* unique value */ +#define LOG_BLUE 0x10 /* unique value */ +#define LOG_MAJR 0x11 /* unique value */ +#define LOG_MINR 0x12 /* unique value */ +#define LOG_GREEN 0x13 /* unique value */ +#define LOG_PINK 0x14 /* unique value */ #else enum { - LOG_ERR, + LOG_CRIT, + LOG_ERR, LOG_WARNING, LOG_NOTICE, LOG_INFO, LOG_DEBUG, - /* custom notices */ - LOG_BLUE = 0x10, -}; + /* custom notices */ + LOG_BLUE = 0x10, + LOG_MAJR = 0x11, + LOG_MINR = 0x12, + LOG_GREEN = 0x13, + LOG_PINK = 0x14 }; #endif extern bool is_power_of_2( int n ); @@ -216,7 +224,7 @@ json_t* json_load_url(char* cfg_url, json_error_t *err); void sha256_init(uint32_t *state); void sha256_transform(uint32_t *state, const uint32_t *block, int swap); -void sha256d(unsigned char *hash, const unsigned char *data, int len); +//void sha256d(unsigned char *hash, const unsigned char *data, int len); #ifdef USE_ASM #if defined(__ARM_NEON__) || defined(__i386__) || defined(__x86_64__) @@ -225,7 +233,8 @@ int sha256_use_4way(); void sha256_init_4way(uint32_t *state); void sha256_transform_4way(uint32_t *state, const uint32_t *block, int swap); #endif -#if defined(__x86_64__) && defined(USE_AVX2) +//#if defined(__x86_64__) && defined(USE_AVX2) +#if defined(__x86_64__) && defined(__AVX2__) #define HAVE_SHA256_8WAY 1 int sha256_use_8way(); void sha256_init_8way(uint32_t *state); @@ -271,9 +280,9 @@ struct thr_api { #define CL_N "\x1B[0m" #define CL_RED "\x1B[31m" #define CL_GRN "\x1B[32m" -#define CL_YLW "\x1B[33m" +#define CL_YLW "\x1B[33m" // dark yellow #define CL_BLU "\x1B[34m" -#define CL_MAG "\x1B[35m" +#define CL_MAG "\x1B[35m" // purple #define CL_CYN "\x1B[36m" #define CL_BLK "\x1B[22;30m" /* black */ @@ -281,7 +290,7 @@ struct thr_api { #define CL_GR2 "\x1B[22;32m" /* green */ #define CL_BRW "\x1B[22;33m" /* brown */ #define CL_BL2 "\x1B[22;34m" /* blue */ -#define CL_MA2 "\x1B[22;35m" /* magenta */ +#define CL_MA2 "\x1B[22;35m" /* purple */ #define CL_CY2 "\x1B[22;36m" /* cyan */ #define CL_SIL "\x1B[22;37m" /* gray */ @@ -290,9 +299,9 @@ struct thr_api { #else #define CL_GRY "\x1B[90m" /* dark gray selectable in putty */ #endif -#define CL_LRD "\x1B[01;31m" /* light red */ -#define CL_LGR "\x1B[01;32m" /* light green */ -#define CL_YL2 "\x1B[01;33m" /* yellow */ +#define CL_LRD "\x1B[01;31m" /* bright red */ +#define CL_LGR "\x1B[01;32m" /* bright green */ +#define CL_YL2 "\x1B[01;33m" /* bright yellow */ #define CL_LBL "\x1B[01;34m" /* light blue */ #define CL_LMA "\x1B[01;35m" /* light magenta */ #define CL_LCY "\x1B[01;36m" /* light cyan */ @@ -307,6 +316,7 @@ extern json_t *json_rpc_call( CURL *curl, const char *url, const char *userpass, extern void cbin2hex(char *out, const char *in, size_t len); void bin2hex( char *s, const unsigned char *p, size_t len ); char *abin2hex( const unsigned char *p, size_t len ); +char *bebin2hex( const unsigned char *p, size_t len ); bool hex2bin( unsigned char *p, const char *hexstr, size_t len ); bool jobj_binary( const json_t *obj, const char *key, void *buf, size_t buflen ); @@ -456,9 +466,7 @@ void stratum_disconnect(struct stratum_ctx *sctx); bool stratum_subscribe(struct stratum_ctx *sctx); bool stratum_authorize(struct stratum_ctx *sctx, const char *user, const char *pass); bool stratum_handle_method(struct stratum_ctx *sctx, const char *s); - -extern 
bool lowdiff_debug; - +bool stratum_suggest_difficulty( struct stratum_ctx *sctx, double diff ); extern bool aes_ni_supported; @@ -483,7 +491,7 @@ void format_hashrate(double hashrate, char *output); void print_hash_tests(void); void scale_hash_for_display ( double* hashrate, char* units ); - +void format_number_si( double* hashrate, char* si_units ); void report_summary_log( bool force ); /* @@ -518,27 +526,26 @@ uint32_t* get_stratum_job_ntime(); enum algos { ALGO_NULL, - ALGO_0X10, + ALGO_0X10, ALGO_ALLIUM, ALGO_ANIME, ALGO_ARGON2, ALGO_ARGON2D250, ALGO_ARGON2D500, ALGO_ARGON2D4096, - ALGO_ARGON2D16000, - ALGO_AXIOM, - ALGO_BLAKE, + ALGO_ARGON2D16000, + ALGO_AXIOM, + ALGO_BLAKE, ALGO_BLAKE2B, ALGO_BLAKE2S, ALGO_BLAKECOIN, ALGO_BMW, ALGO_BMW512, ALGO_C11, - ALGO_CPUPOWER, + ALGO_CPUOWER, ALGO_DECRED, ALGO_DEEP, ALGO_DMD_GR, - ALGO_GR, ALGO_GROESTL, ALGO_HEX, ALGO_HMQ1725, @@ -553,9 +560,8 @@ enum algos { ALGO_LYRA2REV3, ALGO_LYRA2Z, ALGO_LYRA2Z330, - ALGO_M7M, + ALGO_M7M, ALGO_MINOTAUR, - ALGO_MINOTAURX, ALGO_MYR_GR, ALGO_NEOSCRYPT, ALGO_NIST5, @@ -581,6 +587,7 @@ enum algos { ALGO_TRIBUS, ALGO_VANILLA, ALGO_VELTOR, + ALGO_VERTHASH, ALGO_WHIRLPOOL, ALGO_WHIRLPOOLX, ALGO_X11, @@ -608,10 +615,10 @@ enum algos { ALGO_YESCRYPTR16, ALGO_YESCRYPTR32, ALGO_YESPOWER, - ALGO_YESPOWERARWN, + ALGO_YESPOWERARWN, ALGO_YESPOWERR16, - ALGO_YESPOWERSUGAR, - ALGO_YESPOWERURX, + ALGO_YESPOWERSUGAR, + ALGO_YESPOWERURX, ALGO_YESPOWER_B2B, ALGO_ZR5, ALGO_COUNT @@ -627,19 +634,18 @@ static const char* const algo_names[] = { "argon2d4096", "argon2d16000", "axiom", - "blake", + "blake", "blake2b", "blake2s", "blakecoin", "bmw", "bmw512", "c11", - "cpupower", + "cpupower", "decred", "deep", "dmd-gr", - "gr", - "groestl", + "groestl", "hex", "hmq1725", "hodl", @@ -655,8 +661,7 @@ static const char* const algo_names[] = { "lyra2z330", "m7m", "minotaur", - "minotaurx", - "myr-gr", + "myr-gr", "neoscrypt", "nist5", "pentablake", @@ -681,6 +686,7 @@ static const char* const algo_names[] = { "tribus", "vanilla", "veltor", + "verthash", "whirlpool", "whirlpoolx", "x11", @@ -708,10 +714,10 @@ static const char* const algo_names[] = { "yescryptr16", "yescryptr32", "yespower", - "yespowerarwn", + "yespowerARWN", "yespowerr16", - "yespowersugar", - "yespowerurx", + "yespowerSUGAR", + "yespowerURX", "yespower-b2b", "zr5", "\0" @@ -731,7 +737,6 @@ extern int opt_timeout; extern bool want_longpoll; extern bool have_longpoll; extern bool have_gbt; -extern bool opt_minotaurx; extern char* lp_id; extern char *rpc_userpass; extern const char *gbt_lp_req; @@ -755,7 +760,6 @@ extern uint32_t opt_work_size; extern double *thr_hashrates; extern double global_hashrate; extern double stratum_diff; -extern bool opt_reset_on_stale; extern double net_diff; extern double net_hashrate; extern int opt_param_n; @@ -780,6 +784,8 @@ extern pthread_mutex_t stats_lock; extern bool opt_sapling; extern const int pk_buffer_size_max; extern int pk_buffer_size; +extern char *opt_data_file; +extern bool opt_verify; static char const usage[] = "\ Usage: cpuminer [OPTIONS]\n\ @@ -789,23 +795,23 @@ Options:\n\ allium Garlicoin (GRLC)\n\ anime Animecoin (ANI)\n\ argon2 Argon2 Coin (AR2)\n\ - argon2d250 argon2d-crds, Credits (CRDS)\n\ + argon2d250\n\ argon2d500 argon2d-dyn, Dynamic (DYN)\n\ argon2d4096 argon2d-uis, Unitus (UIS)\n\ + argon2d16000 argon2d16000, Alterdot (ADOT)\n\ axiom Shabal-256 MemoHash\n\ - blake blake256r14 (SFR)\n\ + blake blake256r14 (SFR)\n\ blake2b Blake2b 256\n\ blake2s Blake-2 S\n\ blakecoin blake256r8\n\ bmw BMW 256\n\ bmw512 BMW 
512\n\ c11 Chaincoin\n\ - cpupower CPUchain\n\ + cpupower CPUchain (CPU)\n\ decred Blake256r14dcr\n\ deep Deepcoin (DCN)\n\ dmd-gr Diamond\n\ - gr GhostRider\n\ - groestl Groestl coin\n\ + groestl Groestl coin\n\ hex x16r-hex\n\ hmq1725 Espers\n\ hodl Hodlcoin\n\ @@ -816,14 +822,13 @@ Options:\n\ lyra2h Hppcoin\n\ lyra2re lyra2\n\ lyra2rev2 lyrav2\n\ - lyra2rev3 lyrav2v3, Vertcoin\n\ + lyra2rev3 lyrav2v3\n\ lyra2z\n\ lyra2z330 Lyra2 330 rows\n\ m7m Magi (XMG)\n\ myr-gr Myriad-Groestl\n\ minotaur Ringcoin (RNG)\n\ - minotaurx Litecoin Cash (LCC)\n\ - neoscrypt NeoScrypt(128, 2, 1)\n\ + neoscrypt NeoScrypt(128, 2, 1)\n\ nist5 Nist5\n\ pentablake 5 x blake512\n\ phi1612 phi\n\ @@ -834,11 +839,11 @@ Options:\n\ qubit Qubit\n\ scrypt scrypt(1024, 1, 1) (default)\n\ scrypt:N scrypt(N, 1, 1)\n\ - scryptn2 scrypt:1048576\n\ + scryptn2 scrypt(1048576, 1,1)\n\ sha256d Double SHA-256\n\ sha256q Quad SHA-256, Pyrite (PYE)\n\ sha256t Triple SHA-256, Onecoin (OC)\n\ - sha3d Double Keccak256 (BSHA3)\n\ + sha3d Double Keccak256 (BSHA3)\n\ shavite3 Shavite3\n\ skein Skein+Sha (Skeincoin)\n\ skein2 Double Skein (Woodcoin)\n\ @@ -849,6 +854,7 @@ Options:\n\ tribus Denarius (DNR)\n\ vanilla blake256r8vnl (VCash)\n\ veltor\n\ + verthash\n\ whirlpool\n\ whirlpoolx\n\ x11 Dash\n\ @@ -876,15 +882,15 @@ Options:\n\ yescryptr16 Eli\n\ yescryptr32 WAVI\n\ yespower Cryply\n\ - yespowerarwn Arowanacoin (ARWN)\n\ + yespowerARWN Arowanacoin\n\ yespowerr16 Yenten (YTN)\n\ - yespowersugar Sugarchain (SUGAR)\n\ - yespowerurx UraniumX (URX)\n\ + yespowerSUGAR Sugarchain\n\ + yespowerURX UraniumX\n\ yespower-b2b generic yespower + blake2b\n\ zr5 Ziftr\n\ - -N, --param-n N parameter for scrypt based algos\n\ - -R, --param-r R parameter for scrypt based algos\n\ - -K, --param-key Key (pers) parameter for algos that use it\n\ + -N, --param-n=N N parameter for scrypt based algos\n\ + -R, --param-r=N R parameter for scrypt based algos\n\ + -K, --param-key=STRING Key (pers) parameter for algos that use it\n\ -o, --url=URL URL of mining server\n\ -O, --userpass=U:P username:password pair for mining server\n\ -u, --user=USERNAME username for mining server\n\ @@ -899,20 +905,19 @@ Options:\n\ -T, --timeout=N timeout for long poll and stratum (default: 300 seconds)\n\ -s, --scantime=N upper bound on time spent scanning current work when\n\ long polling is unavailable, in seconds (default: 5)\n\ - --randomize Randomize scan range start to reduce duplicates\n\ - --reset-on-stale Workaround reset stratum if too many stale shares\n\ - -f, --diff-factor Divide req. difficulty by this factor (std is 1.0)\n\ - -m, --diff-multiplier Multiply difficulty by this factor (std is 1.0)\n\ - --hash-meter Display thread hash rates\n\ + --randomize randomize scan range (deprecated)\n\ + -f, --diff-factor=N divide req. 
difficulty by this factor (std is 1.0)\n\ + -m, --diff-multiplier=N Multiply difficulty by this factor (std is 1.0)\n\ + --hash-meter display thread hash rates\n\ --coinbase-addr=ADDR payout address for solo mining\n\ --coinbase-sig=TEXT data to insert in the coinbase when possible\n\ --no-longpoll disable long polling support\n\ --no-getwork disable getwork support\n\ --no-gbt disable getblocktemplate support\n\ --no-stratum disable X-Stratum support\n\ - --no-extranonce disable Stratum extranonce support\n\ + --no-extranonce disable Stratum extranonce subscribe\n\ --no-redirect ignore requests to change the URL of the mining server\n\ - -q, --quiet disable per-thread hashmeter output\n\ + -q, --quiet reduce log verbosity\n\ --no-color disable colored output\n\ -D, --debug enable debug output\n\ -P, --protocol-dump verbose dump of protocol-level activities\n" @@ -924,14 +929,17 @@ Options:\n\ -B, --background run the miner in the background\n\ --benchmark run in offline benchmark mode\n\ --cpu-affinity set process affinity to cpu core(s), mask 0x3 for cores 0 and 1\n\ - --cpu-priority set process priority (default: 0 idle, 2 normal to 5 highest)\n\ - -b, --api-bind IP/Port for the miner API (default: 127.0.0.1:4048)\n\ - --api-remote Allow remote control\n\ - --max-temp=N Only mine if cpu temp is less than specified value (linux)\n\ - --max-rate=N[KMG] Only mine if net hashrate is less than specified value\n\ - --max-diff=N Only mine if net difficulty is less than specified value\n\ + --cpu-priority set process priority (default: 0 idle, 2 normal to 5 highest) (deprecated)\n\ + -b, --api-bind=address[:port] IP address for the miner API, default port is 4048)\n\ + --api-remote allow remote control\n\ + --max-temp=N only mine if cpu temp is less than specified value (linux)\n\ + --max-rate=N[KMG] only mine if net hashrate is less than specified value\n\ + --max-diff=N only mine if net difficulty is less than specified value\n\ -c, --config=FILE load a JSON-format configuration file\n\ - -V, --version display version information and exit\n\ + --data-file=FILE path and name of data file\n\ + --verify enable additional time consuming start up tests\n\ + --stratum-keepalive prevent disconnects when difficulty is too high\n\ + -V, --version display version and CPU information and exit\n\ -h, --help display this help text and exit\n\ "; @@ -988,7 +996,6 @@ static struct option const options[] = { { "retries", 1, NULL, 'r' }, { "retry-pause", 1, NULL, 1025 }, { "randomize", 0, NULL, 1024 }, - { "reset-on-stale", 0, NULL, 1026 }, { "scantime", 1, NULL, 's' }, #ifdef HAVE_SYSLOG_H { "syslog", 0, NULL, 'S' }, @@ -999,6 +1006,9 @@ static struct option const options[] = { { "url", 1, NULL, 'o' }, { "user", 1, NULL, 'u' }, { "userpass", 1, NULL, 'O' }, + { "data-file", 1, NULL, 1027 }, + { "verify", 0, NULL, 1028 }, + { "stratum-keepalive", 0, NULL, 1029 }, { "version", 0, NULL, 'V' }, { 0, 0, 0, 0 } }; diff --git a/simd-utils.h b/simd-utils.h index f8ee35fd..f2e201d6 100644 --- a/simd-utils.h +++ b/simd-utils.h @@ -78,6 +78,8 @@ // - specialized shift and rotate functions that move elements around // use the notation "1x32" to indicate the distance moved as units of // the element size. +// Vector shuffle rotations are being renamed to "vrol" and "vror" +// to avoid confusion with bit rotations. // - there is a subset of some functions for scalar data. They may have // no prefix nor vec-size, just one size, the size of the data. // - Some integer functions are also defined which use a similar notation. 
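The shuffle-based transposes added to intrlv.h further down implement 4-way interleaving of 32-bit words: word i of lane n ends up at interleaved index 4*i + n, so one 128-bit vector holds word i of all four input lanes. As a plain scalar reference for what those transposes produce (sketch only, illustrative name):

    #include <stdint.h>

    static void intrlv_4x32_ref( uint32_t *d, const uint32_t *s0, const uint32_t *s1,
                                 const uint32_t *s2, const uint32_t *s3, int words )
    {
        for ( int i = 0; i < words; i++ )
        {
            d[ 4*i + 0 ] = s0[i];   // lane 0, word i
            d[ 4*i + 1 ] = s1[i];   // lane 1, word i
            d[ 4*i + 2 ] = s2[i];   // lane 2, word i
            d[ 4*i + 3 ] = s3[i];   // lane 3, word i
        }
    }
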
@@ -131,7 +133,7 @@ // If a sequence of constants is to be used it can be more efficient to // use arithmetic with already existing constants to generate new ones. // -// ex: const __m512i one = _mm512_const1_64( 1 ); +// ex: const __m512i one = m512_one_64; // const __m512i two = _mm512_add_epi64( one, one ); // ////////////////////////////////////////////////////////////////////////// diff --git a/simd-utils/intrlv.h b/simd-utils/intrlv.h index 93a5e19b..00fb1516 100644 --- a/simd-utils/intrlv.h +++ b/simd-utils/intrlv.h @@ -11,6 +11,53 @@ // // 32 bit data +// Transpose 1 block consisting of 4x4x32 bit integers. +#define MM128_ILEAVE32( d0, d1, d2, d3, s0, s1, s2, s3 ) \ +{ \ + __m128i t0 = mm128_shuffle2_32( s0, s1, 0x44 ); \ + __m128i t1 = mm128_shuffle2_32( s0, s1, 0xee ); \ + __m128i t2 = mm128_shuffle2_32( s2, s3, 0x44 ); \ + __m128i t3 = mm128_shuffle2_32( s2, s3, 0xee ); \ + d0 = mm128_shuffle2_32( t0, t2, 0x88 ); \ + d1 = mm128_shuffle2_32( t0, t2, 0xdd ); \ + d2 = mm128_shuffle2_32( t1, t3, 0x88 ); \ + d3 = mm128_shuffle2_32( t1, t3, 0xdd ); \ +} + +#if defined(__AVX2__) + +// Transpose 2 contiguous blocks +#define MM256_ILEAVE32( d0, d1, d2, d3, s0, s1, s2, s3 ) \ +{ \ + __m256i t0 = mm256_shuffle2_32( s0, s1, 0x44 ); \ + __m256i t1 = mm256_shuffle2_32( s0, s1, 0xee ); \ + __m256i t2 = mm256_shuffle2_32( s2, s3, 0x44 ); \ + __m256i t3 = mm256_shuffle2_32( s2, s3, 0xee ); \ + d0 = mm256_shuffle2_32( t0, t2, 0x88 ); \ + d1 = mm256_shuffle2_32( t0, t2, 0xdd ); \ + d2 = mm256_shuffle2_32( t1, t3, 0x88 ); \ + d3 = mm256_shuffle2_32( t1, t3, 0xdd ); \ +} + +#endif + +#if defined(__AVX512F__) + +// Transpose 4 contiguous blocks. +#define MM512_ILEAVE32( d0, d1, d2, d3, s0, s1, s2, s3 ) \ +{ \ + __m512i t0 = mm512_shuffle2_32( s0, s1, 0x44 ); \ + __m512i t1 = mm512_shuffle2_32( s0, s1, 0xee ); \ + __m512i t2 = mm512_shuffle2_32( s2, s3, 0x44 ); \ + __m512i t3 = mm512_shuffle2_32( s2, s3, 0xee ); \ + d0 = mm512_shuffle2_32( t0, t2, 0x88 ); \ + d1 = mm512_shuffle2_32( t0, t2, 0xdd ); \ + d2 = mm512_shuffle2_32( t1, t3, 0x88 ); \ + d3 = mm512_shuffle2_32( t1, t3, 0xdd ); \ +} + +#endif + // 2x32 static inline void intrlv_2x32( void *dst, const void *src0, @@ -65,7 +112,7 @@ static inline void dintrlv_2x32( void *dst0, void *dst1, d0[24] = s[48]; d1[24] = s[49]; d0[25] = s[50]; d1[25] = s[51]; d0[26] = s[52]; d1[26] = s[53]; d0[27] = s[54]; d1[27] = s[55]; d0[28] = s[56]; d1[28] = s[57]; d0[29] = s[58]; d1[29] = s[59]; - d0[30] = s[60]; d1[30] = s[61]; d0[31] = s[61]; d1[31] = s[63]; + d0[30] = s[60]; d1[30] = s[61]; d0[31] = s[62]; d1[31] = s[63]; } static inline void extr_lane_2x32( void *dst, const void *src, @@ -86,104 +133,37 @@ static inline void extr_lane_2x32( void *dst, const void *src, // 4x32 /* -static inline void intrlv_4x32( void *dst, const void *src0, - const void *src1, const void *src2, const void *src3, int bit_len ) -{ - __m64 *d = (__m64*)dst; - const __m64 *s0 = (const __m64*)src0; - const __m64 *s1 = (const __m64*)src1; - const __m64 *s2 = (const __m64*)src2; - const __m64 *s3 = (const __m64*)src3; - - d[ 0] = _mm_unpacklo_pi32( s0[0], s1[0] ); - d[ 1] = _mm_unpacklo_pi32( s2[0], s3[0] ); - d[ 2] = _mm_unpackhi_pi32( s0[0], s1[0] ); - d[ 3] = _mm_unpackhi_pi32( s2[0], s3[0] ); - - d[ 4] = _mm_unpacklo_pi32( s0[1], s1[1] ); - d[ 5] = _mm_unpacklo_pi32( s2[1], s3[1] ); - d[ 6] = _mm_unpackhi_pi32( s0[1], s1[1] ); - d[ 7] = _mm_unpackhi_pi32( s2[1], s3[1] ); - - d[ 8] = _mm_unpacklo_pi32( s0[2], s1[2] ); - d[ 9] = _mm_unpacklo_pi32( s2[2], s3[2] ); - d[10] = 
_mm_unpackhi_pi32( s0[2], s1[2] ); - d[11] = _mm_unpackhi_pi32( s2[2], s3[2] ); - - d[12] = _mm_unpacklo_pi32( s0[3], s1[3] ); - d[13] = _mm_unpacklo_pi32( s2[3], s3[3] ); - d[14] = _mm_unpackhi_pi32( s0[3], s1[3] ); - d[15] = _mm_unpackhi_pi32( s2[3], s3[3] ); - - if ( bit_len <= 256 ) return; - - d[16] = _mm_unpacklo_pi32( s0[4], s1[4] ); - d[17] = _mm_unpacklo_pi32( s2[4], s3[4] ); - d[18] = _mm_unpackhi_pi32( s0[4], s1[4] ); - d[19] = _mm_unpackhi_pi32( s2[4], s3[4] ); +static inline void intrlv_4x32( void *dst, const void *src0, const void *src1, + const void *src2, const void *src3, const int bit_len ) +{ + __m128i *d = (__m128i*)dst; + const __m128i *s0 = (const __m128i*)src0; + const __m128i *s1 = (const __m128i*)src1; + const __m128i *s2 = (const __m128i*)src2; + const __m128i *s3 = (const __m128i*)src3; - d[20] = _mm_unpacklo_pi32( s0[5], s1[5] ); - d[21] = _mm_unpacklo_pi32( s2[5], s3[5] ); - d[22] = _mm_unpackhi_pi32( s0[5], s1[5] ); - d[23] = _mm_unpackhi_pi32( s2[5], s3[5] ); + MM128_ILEAVE32( d[ 0], d[ 1], d[ 2], d[ 3], s0[0], s1[0], s2[0], s3[0] ); + MM128_ILEAVE32( d[ 4], d[ 5], d[ 6], d[ 7], s0[1], s1[1], s2[1], s3[1] ); - d[24] = _mm_unpacklo_pi32( s0[6], s1[6] ); - d[25] = _mm_unpacklo_pi32( s2[6], s3[6] ); - d[26] = _mm_unpackhi_pi32( s0[6], s1[6] ); - d[27] = _mm_unpackhi_pi32( s2[6], s3[6] ); + if ( bit_len <= 256 ) return; - d[28] = _mm_unpacklo_pi32( s0[7], s1[7] ); - d[29] = _mm_unpacklo_pi32( s2[7], s3[7] ); - d[30] = _mm_unpackhi_pi32( s0[7], s1[7] ); - d[31] = _mm_unpackhi_pi32( s2[7], s3[7] ); + MM128_ILEAVE32( d[ 8], d[ 9], d[10], d[11], s0[2], s1[2], s2[2], s3[2] ); + MM128_ILEAVE32( d[12], d[13], d[14], d[15], s0[3], s1[3], s2[3], s3[3] ); if ( bit_len <= 512 ) return; - d[32] = _mm_unpacklo_pi32( s0[8], s1[8] ); - d[33] = _mm_unpacklo_pi32( s2[8], s3[8] ); - d[34] = _mm_unpackhi_pi32( s0[8], s1[8] ); - d[35] = _mm_unpackhi_pi32( s2[8], s3[8] ); + MM128_ILEAVE32( d[16], d[17], d[18], d[19], s0[4], s1[4], s2[4], s3[4] ); - d[36] = _mm_unpacklo_pi32( s0[9], s1[9] ); - d[37] = _mm_unpacklo_pi32( s2[9], s3[9] ); - d[38] = _mm_unpackhi_pi32( s0[9], s1[9] ); - d[39] = _mm_unpackhi_pi32( s2[9], s3[9] ); - if ( bit_len <= 640 ) return; - d[40] = _mm_unpacklo_pi32( s0[10], s1[10] ); - d[41] = _mm_unpacklo_pi32( s2[10], s3[10] ); - d[42] = _mm_unpackhi_pi32( s0[10], s1[10] ); - d[43] = _mm_unpackhi_pi32( s2[10], s3[10] ); - - d[44] = _mm_unpacklo_pi32( s0[11], s1[11] ); - d[45] = _mm_unpacklo_pi32( s2[11], s3[11] ); - d[46] = _mm_unpackhi_pi32( s0[11], s1[11] ); - d[47] = _mm_unpackhi_pi32( s2[11], s3[11] ); - - d[48] = _mm_unpacklo_pi32( s0[12], s1[12] ); - d[49] = _mm_unpacklo_pi32( s2[12], s3[12] ); - d[50] = _mm_unpackhi_pi32( s0[12], s1[12] ); - d[51] = _mm_unpackhi_pi32( s2[12], s3[12] ); - - d[52] = _mm_unpacklo_pi32( s0[13], s1[13] ); - d[53] = _mm_unpacklo_pi32( s2[13], s3[13] ); - d[54] = _mm_unpackhi_pi32( s0[13], s1[13] ); - d[55] = _mm_unpackhi_pi32( s2[13], s3[13] ); - - d[56] = _mm_unpacklo_pi32( s0[14], s1[14] ); - d[57] = _mm_unpacklo_pi32( s2[14], s3[14] ); - d[58] = _mm_unpackhi_pi32( s0[14], s1[14] ); - d[59] = _mm_unpackhi_pi32( s2[14], s3[14] ); - - d[60] = _mm_unpacklo_pi32( s0[15], s1[15] ); - d[61] = _mm_unpacklo_pi32( s2[15], s3[15] ); - d[62] = _mm_unpackhi_pi32( s0[15], s1[15] ); - d[63] = _mm_unpackhi_pi32( s2[15], s3[15] ); -} + MM128_ILEAVE32( d[20], d[21], d[22], d[23], s0[5], s1[5], s2[5], s3[5] ); + MM128_ILEAVE32( d[24], d[25], d[26], d[27], s0[6], s1[6], s2[6], s3[6] ); + MM128_ILEAVE32( d[28], d[29], d[30], d[31], s0[4], s1[4], 
s2[4], s3[4] ); +} */ -static inline void intrlv_4x32( void *dst, const void *src0, const void *src1, +static inline void intrlv_4x32( void * + dst, const void *src0, const void *src1, const void *src2, const void *src3, const int bit_len ) { uint32_t *d = (uint32_t*)dst; @@ -230,53 +210,45 @@ static inline void intrlv_4x32( void *dst, const void *src0, const void *src1, /* static inline void intrlv_4x32_512( void *dst, const void *src0, - const void *src1, const void *src2, const void *src3 ) + const void *src1, const void *src2, const void *src3 ) { - __m64 *d = (__m64*)dst; - const __m64 *s0 = (const __m64*)src0; - const __m64 *s1 = (const __m64*)src1; - const __m64 *s2 = (const __m64*)src2; - const __m64 *s3 = (const __m64*)src3; - - d[ 0] = _mm_unpacklo_pi32( s0[0], s1[0] ); - d[ 1] = _mm_unpacklo_pi32( s2[0], s3[0] ); - d[ 2] = _mm_unpackhi_pi32( s0[0], s1[0] ); - d[ 3] = _mm_unpackhi_pi32( s2[0], s3[0] ); - - d[ 4] = _mm_unpacklo_pi32( s0[1], s1[1] ); - d[ 5] = _mm_unpacklo_pi32( s2[1], s3[1] ); - d[ 6] = _mm_unpackhi_pi32( s0[1], s1[1] ); - d[ 7] = _mm_unpackhi_pi32( s2[1], s3[1] ); - - d[ 8] = _mm_unpacklo_pi32( s0[2], s1[2] ); - d[ 9] = _mm_unpacklo_pi32( s2[2], s3[2] ); - d[10] = _mm_unpackhi_pi32( s0[2], s1[2] ); - d[11] = _mm_unpackhi_pi32( s2[2], s3[2] ); - - d[12] = _mm_unpacklo_pi32( s0[3], s1[3] ); - d[13] = _mm_unpacklo_pi32( s2[3], s3[3] ); - d[14] = _mm_unpackhi_pi32( s0[3], s1[3] ); - d[15] = _mm_unpackhi_pi32( s2[3], s3[3] ); - - d[16] = _mm_unpacklo_pi32( s0[4], s1[4] ); - d[17] = _mm_unpacklo_pi32( s2[4], s3[4] ); - d[18] = _mm_unpackhi_pi32( s0[4], s1[4] ); - d[19] = _mm_unpackhi_pi32( s2[4], s3[4] ); - - d[20] = _mm_unpacklo_pi32( s0[5], s1[5] ); - d[21] = _mm_unpacklo_pi32( s2[5], s3[5] ); - d[22] = _mm_unpackhi_pi32( s0[5], s1[5] ); - d[23] = _mm_unpackhi_pi32( s2[5], s3[5] ); - - d[24] = _mm_unpacklo_pi32( s0[6], s1[6] ); - d[25] = _mm_unpacklo_pi32( s2[6], s3[6] ); - d[26] = _mm_unpackhi_pi32( s0[6], s1[6] ); - d[27] = _mm_unpackhi_pi32( s2[6], s3[6] ); - - d[28] = _mm_unpacklo_pi32( s0[7], s1[7] ); - d[29] = _mm_unpacklo_pi32( s2[7], s3[7] ); - d[30] = _mm_unpackhi_pi32( s0[7], s1[7] ); - d[31] = _mm_unpackhi_pi32( s2[7], s3[7] ); +#if defined(__AVX2__) + + __m256i *d = (__m256i*)dst; + const __m256i *s0 = (const __m256i*)src0; + const __m256i *s1 = (const __m256i*)src1; + const __m256i *s2 = (const __m256i*)src2; + const __m256i *s3 = (const __m256i*)src3; + __m256i dt0, dt1, dt2, dt3; + + MM256_ILEAVE32( dt0, dt1, dt2, dt3, s0[0], s1[0], s2[0], s3[0] ); + + d[0] = _mm256_permute2x128_si256( dt0, dt1, 0x20 ); + d[1] = _mm256_permute2x128_si256( dt2, dt3, 0x20 ); + d[2] = _mm256_permute2x128_si256( dt0, dt1, 0x31 ); + d[3] = _mm256_permute2x128_si256( dt2, dt3, 0x31 ); + + MM256_ILEAVE32( dt0, dt1, dt2, dt3, s0[1], s1[1], s2[1], s3[1] ); + + d[4] = _mm256_permute2x128_si256( dt0, dt1, 0x20 ); + d[5] = _mm256_permute2x128_si256( dt2, dt3, 0x20 ); + d[6] = _mm256_permute2x128_si256( dt0, dt1, 0x31 ); + d[7] = _mm256_permute2x128_si256( dt2, dt3, 0x31 ); + +#else + + __m128i *d = (__m128i*)dst; + const __m128i *s0 = (const __m128i*)src0; + const __m128i *s1 = (const __m128i*)src1; + const __m128i *s2 = (const __m128i*)src2; + const __m128i *s3 = (const __m128i*)src3; + + MM128_ILEAVE32( d[ 0], d[ 1], d[ 2], d[ 3], s0[0], s1[0], s2[0], s3[0] ); + MM128_ILEAVE32( d[ 4], d[ 5], d[ 6], d[ 7], s0[1], s1[1], s2[1], s3[1] ); + MM128_ILEAVE32( d[ 8], d[ 9], d[10], d[11], s0[2], s1[2], s2[2], s3[2] ); + MM128_ILEAVE32( d[12], d[13], d[14], d[15], s0[3], s1[3], s2[3], 
s3[3] ); + +#endif } */ @@ -306,100 +278,34 @@ static inline void intrlv_4x32_512( void *dst, const void *src0, d[ 60] = s0[15]; d[ 61] = s1[15]; d[ 62] = s2[15]; d[ 63] = s3[15]; } + /* static inline void dintrlv_4x32( void *dst0, void *dst1, void *dst2, - void *dst3, const void *src, int bit_len ) -{ - __m64 *d0 = (__m64*)dst0; - __m64 *d1 = (__m64*)dst1; - __m64 *d2 = (__m64*)dst2; - __m64 *d3 = (__m64*)dst3; - const __m64 *s = (const __m64*)src; - d0[0] = _mm_unpacklo_pi32( s[ 0], s[ 2] ); - d1[0] = _mm_unpackhi_pi32( s[ 0], s[ 2] ); - d2[0] = _mm_unpacklo_pi32( s[ 1], s[ 3] ); - d3[0] = _mm_unpackhi_pi32( s[ 1], s[ 3] ); - - d0[1] = _mm_unpacklo_pi32( s[ 4], s[ 6] ); - d1[1] = _mm_unpackhi_pi32( s[ 4], s[ 6] ); - d2[1] = _mm_unpacklo_pi32( s[ 5], s[ 7] ); - d3[1] = _mm_unpackhi_pi32( s[ 5], s[ 7] ); - - d0[2] = _mm_unpacklo_pi32( s[ 8], s[10] ); - d1[2] = _mm_unpackhi_pi32( s[ 8], s[10] ); - d2[2] = _mm_unpacklo_pi32( s[ 9], s[11] ); - d3[2] = _mm_unpackhi_pi32( s[ 9], s[11] ); - - d0[3] = _mm_unpacklo_pi32( s[12], s[14] ); - d1[3] = _mm_unpackhi_pi32( s[12], s[14] ); - d2[3] = _mm_unpacklo_pi32( s[13], s[15] ); - d3[3] = _mm_unpackhi_pi32( s[13], s[15] ); - - if ( bit_len <= 256 ) return; - - d0[4] = _mm_unpacklo_pi32( s[16], s[18] ); - d1[4] = _mm_unpackhi_pi32( s[16], s[18] ); - d2[4] = _mm_unpacklo_pi32( s[17], s[19] ); - d3[4] = _mm_unpackhi_pi32( s[17], s[19] ); + void *dst3, const void *src, const int bit_len ) +{ + __m128i *d0 = (__m128i*)dst0; + __m128i *d1 = (__m128i*)dst1; + __m128i *d2 = (__m128i*)dst2; + __m128i *d3 = (__m128i*)dst3; + const __m128i *s = (const __m128i*)src; - d0[5] = _mm_unpacklo_pi32( s[20], s[22] ); - d1[5] = _mm_unpackhi_pi32( s[20], s[22] ); - d2[5] = _mm_unpacklo_pi32( s[21], s[23] ); - d3[5] = _mm_unpackhi_pi32( s[21], s[23] ); + MM128_ILEAVE32( d0[0], d1[0], d2[0], d3[0], s[ 0], s[ 1], s[ 2], s[ 3] ); + MM128_ILEAVE32( d0[1], d1[1], d2[1], d3[1], s[ 4], s[ 5], s[ 6], s[ 7] ); - d0[6] = _mm_unpacklo_pi32( s[24], s[26] ); - d1[6] = _mm_unpackhi_pi32( s[24], s[26] ); - d2[6] = _mm_unpacklo_pi32( s[25], s[27] ); - d3[6] = _mm_unpackhi_pi32( s[25], s[27] ); + if ( bit_len <= 256 ) return; - d0[7] = _mm_unpacklo_pi32( s[28], s[30] ); - d1[7] = _mm_unpackhi_pi32( s[28], s[30] ); - d2[7] = _mm_unpacklo_pi32( s[29], s[31] ); - d3[7] = _mm_unpackhi_pi32( s[29], s[31] ); + MM128_ILEAVE32( d0[2], d1[2], d2[2], d3[2], s[ 8], s[ 9], s[10], s[11] ); + MM128_ILEAVE32( d0[3], d1[3], d2[3], d3[3], s[12], s[13], s[14], s[15] ); if ( bit_len <= 512 ) return; - d0[8] = _mm_unpacklo_pi32( s[32], s[34] ); - d1[8] = _mm_unpackhi_pi32( s[32], s[34] ); - d2[8] = _mm_unpacklo_pi32( s[33], s[35] ); - d3[8] = _mm_unpackhi_pi32( s[33], s[35] ); - - d0[9] = _mm_unpacklo_pi32( s[36], s[38] ); - d1[9] = _mm_unpackhi_pi32( s[36], s[38] ); - d2[9] = _mm_unpacklo_pi32( s[37], s[39] ); - d3[9] = _mm_unpackhi_pi32( s[37], s[39] ); + MM128_ILEAVE32( d0[4], d1[4], d2[4], d3[4], s[16], s[17], s[18], s[19] ); if ( bit_len <= 640 ) return; - d0[10] = _mm_unpacklo_pi32( s[40], s[42] ); - d1[10] = _mm_unpackhi_pi32( s[40], s[42] ); - d2[10] = _mm_unpacklo_pi32( s[41], s[43] ); - d3[10] = _mm_unpackhi_pi32( s[41], s[43] ); - - d0[11] = _mm_unpacklo_pi32( s[44], s[46] ); - d1[11] = _mm_unpackhi_pi32( s[44], s[46] ); - d2[11] = _mm_unpacklo_pi32( s[45], s[47] ); - d3[11] = _mm_unpackhi_pi32( s[45], s[47] ); - - d0[12] = _mm_unpacklo_pi32( s[48], s[50] ); - d1[12] = _mm_unpackhi_pi32( s[48], s[50] ); - d2[12] = _mm_unpacklo_pi32( s[49], s[51] ); - d3[12] = _mm_unpackhi_pi32( s[49], s[51] ); - - 
d0[13] = _mm_unpacklo_pi32( s[52], s[54] ); - d1[13] = _mm_unpackhi_pi32( s[52], s[54] ); - d2[13] = _mm_unpacklo_pi32( s[53], s[55] ); - d3[13] = _mm_unpackhi_pi32( s[53], s[55] ); - - d0[14] = _mm_unpacklo_pi32( s[56], s[58] ); - d1[14] = _mm_unpackhi_pi32( s[56], s[58] ); - d2[14] = _mm_unpacklo_pi32( s[57], s[59] ); - d3[14] = _mm_unpackhi_pi32( s[57], s[59] ); - - d0[15] = _mm_unpacklo_pi32( s[60], s[62] ); - d1[15] = _mm_unpackhi_pi32( s[60], s[62] ); - d2[15] = _mm_unpacklo_pi32( s[61], s[62] ); - d3[15] = _mm_unpackhi_pi32( s[61], s[62] ); + MM128_ILEAVE32( d0[5], d1[5], d2[5], d3[5], s[20], s[21], s[22], s[23] ); + MM128_ILEAVE32( d0[6], d1[6], d2[6], d3[6], s[24], s[25], s[26], s[27] ); + MM128_ILEAVE32( d0[7], d1[7], d2[7], d3[7], s[28], s[29], s[30], s[31] ); } */ @@ -452,47 +358,42 @@ static inline void dintrlv_4x32( void *dst0, void *dst1, void *dst2, static inline void dintrlv_4x32_512( void *dst0, void *dst1, void *dst2, void *dst3, const void *src ) { - __m64 *d0 = (__m64*)dst0; - __m64 *d1 = (__m64*)dst1; - __m64 *d2 = (__m64*)dst2; - __m64 *d3 = (__m64*)dst3; - const __m64 *s = (const __m64*)src; - - d0[0] = _mm_unpacklo_pi32( s[ 0], s[ 2] ); - d1[0] = _mm_unpackhi_pi32( s[ 0], s[ 2] ); - d2[0] = _mm_unpacklo_pi32( s[ 1], s[ 3] ); - d3[0] = _mm_unpackhi_pi32( s[ 1], s[ 3] ); - d0[1] = _mm_unpacklo_pi32( s[ 4], s[ 6] ); - d1[1] = _mm_unpackhi_pi32( s[ 4], s[ 6] ); - d2[1] = _mm_unpacklo_pi32( s[ 5], s[ 7] ); - d3[1] = _mm_unpackhi_pi32( s[ 5], s[ 7] ); - - d0[2] = _mm_unpacklo_pi32( s[ 8], s[10] ); - d1[2] = _mm_unpackhi_pi32( s[ 8], s[10] ); - d2[2] = _mm_unpacklo_pi32( s[ 9], s[11] ); - d3[2] = _mm_unpackhi_pi32( s[ 9], s[11] ); - d0[3] = _mm_unpacklo_pi32( s[12], s[14] ); - d1[3] = _mm_unpackhi_pi32( s[12], s[14] ); - d2[3] = _mm_unpacklo_pi32( s[13], s[15] ); - d3[3] = _mm_unpackhi_pi32( s[13], s[15] ); - - d0[4] = _mm_unpacklo_pi32( s[16], s[18] ); - d1[4] = _mm_unpackhi_pi32( s[16], s[18] ); - d2[4] = _mm_unpacklo_pi32( s[17], s[19] ); - d3[4] = _mm_unpackhi_pi32( s[17], s[19] ); - d0[5] = _mm_unpacklo_pi32( s[20], s[22] ); - d1[5] = _mm_unpackhi_pi32( s[20], s[22] ); - d2[5] = _mm_unpacklo_pi32( s[21], s[23] ); - d3[5] = _mm_unpackhi_pi32( s[21], s[23] ); - - d0[6] = _mm_unpacklo_pi32( s[24], s[26] ); - d1[6] = _mm_unpackhi_pi32( s[24], s[26] ); - d2[6] = _mm_unpacklo_pi32( s[25], s[27] ); - d3[6] = _mm_unpackhi_pi32( s[25], s[27] ); - d0[7] = _mm_unpacklo_pi32( s[28], s[30] ); - d1[7] = _mm_unpackhi_pi32( s[28], s[30] ); - d2[7] = _mm_unpacklo_pi32( s[29], s[31] ); - d3[7] = _mm_unpackhi_pi32( s[29], s[31] ); +#if defined(__AVX2__) + + __m256i *d0 = (__m256i*)dst0; + __m256i *d1 = (__m256i*)dst1; + __m256i *d2 = (__m256i*)dst2; + __m256i *d3 = (__m256i*)dst3; + const __m256i *s = (const __m256i*)src; + + __m256i st0 = _mm256_permute2x128_si256( s[0], s[2], 0x20 ); + __m256i st2 = _mm256_permute2x128_si256( s[1], s[3], 0x20 ); + __m256i st1 = _mm256_permute2x128_si256( s[0], s[2], 0x31 ); + __m256i st3 = _mm256_permute2x128_si256( s[1], s[3], 0x31 ); + + MM256_ILEAVE32( d0[0], d1[0], d2[0], d3[0], st0, st1, st2, st3 ); + + st0 = _mm256_permute2x128_si256( s[4], s[6], 0x20 ); + st2 = _mm256_permute2x128_si256( s[5], s[7], 0x20 ); + st1 = _mm256_permute2x128_si256( s[4], s[6], 0x31 ); + st3 = _mm256_permute2x128_si256( s[5], s[7], 0x31 ); + + MM256_ILEAVE32( d0[1], d1[1], d2[1], d3[1], st0, st1, st2, st3 ); + +#else + + __m128i *d0 = (__m128i*)dst0; + __m128i *d1 = (__m128i*)dst1; + __m128i *d2 = (__m128i*)dst2; + __m128i *d3 = (__m128i*)dst3; + const __m128i *s = 
(const __m128i*)src; + + MM128_ILEAVE32( d0[0], d1[0], d2[0], d3[0], s[ 0], s[ 1], s[ 2], s[ 3] ); + MM128_ILEAVE32( d0[1], d1[1], d2[1], d3[1], s[ 4], s[ 5], s[ 6], s[ 7] ); + MM128_ILEAVE32( d0[2], d1[2], d2[2], d3[2], s[ 8], s[ 9], s[10], s[11] ); + MM128_ILEAVE32( d0[3], d1[3], d2[3], d3[3], s[12], s[13], s[14], s[15] ); + +#endif } */ @@ -662,6 +563,204 @@ static inline void mm128_bswap32_intrlv80_4x32( void *d, const void *src ) } // 8x32 +/* +static inline void intrlv_8x32( void *dst, const void *src0, + const void *src1, const void *src2, const void *src3, const void *src4, + const void *src5, const void *src6, const void *src7, const int bit_len ) +{ + __m128i *d = (__m128i*)dst; + const __m128i *s0 = (const __m128i*)src0; + const __m128i *s1 = (const __m128i*)src1; + const __m128i *s2 = (const __m128i*)src2; + const __m128i *s3 = (const __m128i*)src3; + const __m128i *s4 = (const __m128i*)src4; + const __m128i *s5 = (const __m128i*)src5; + const __m128i *s6 = (const __m128i*)src6; + const __m128i *s7 = (const __m128i*)src7; + + MM128_ILEAVE32( d[ 0], d[ 2], d[ 4], d[ 6], s0[0], s1[0], s2[0], s3[0] ); + MM128_ILEAVE32( d[ 1], d[ 3], d[ 5], d[ 7], s4[0], s5[0], s6[0], s7[0] ); + MM128_ILEAVE32( d[ 8], d[10], d[12], d[14], s0[1], s1[1], s2[1], s3[1] ); + MM128_ILEAVE32( d[ 9], d[11], d[13], d[15], s4[1], s5[1], s6[1], s7[1] ); + + if ( bit_len <= 256 ) return; + + MM128_ILEAVE32( d[16], d[18], d[20], d[22], s0[2], s1[2], s2[2], s3[2] ); + MM128_ILEAVE32( d[17], d[19], d[21], d[23], s4[2], s5[2], s6[2], s7[2] ); + MM128_ILEAVE32( d[24], d[26], d[28], d[30], s0[3], s1[3], s2[3], s3[3] ); + MM128_ILEAVE32( d[25], d[27], d[29], d[31], s4[3], s5[3], s6[3], s7[3] ); + + if ( bit_len <= 512 ) return; + + MM128_ILEAVE32( d[32], d[34], d[36], d[38], s0[4], s1[4], s2[4], s3[4] ); + MM128_ILEAVE32( d[33], d[35], d[37], d[39], s4[4], s5[4], s6[4], s7[4] ); + + if ( bit_len <= 640 ) return; + + MM128_ILEAVE32( d[40], d[42], d[44], d[46], s0[5], s1[5], s2[5], s3[5] ); + MM128_ILEAVE32( d[41], d[43], d[45], d[47], s4[5], s5[5], s6[5], s7[5] ); + + MM128_ILEAVE32( d[48], d[50], d[52], d[54], s0[6], s1[6], s2[6], s3[6] ); + MM128_ILEAVE32( d[49], d[51], d[53], d[55], s4[6], s5[6], s6[6], s7[6] ); + MM128_ILEAVE32( d[56], d[58], d[60], d[62], s0[7], s1[7], s2[7], s3[7] ); + MM128_ILEAVE32( d[57], d[59], d[61], d[63], s4[7], s5[7], s6[7], s7[7] ); +} + +// Not used +static inline void intrlv_8x32_256( void *dst, const void *src0, + const void *src1, const void *src2, const void *src3, const void *src4, + const void *src5, const void *src6, const void *src7 ) +{ +#if defined(__AVX2__) + + __m256i *d = (__m256i*)dst; + const __m256i *s0 = (const __m256i*)src0; + const __m256i *s1 = (const __m256i*)src1; + const __m256i *s2 = (const __m256i*)src2; + const __m256i *s3 = (const __m256i*)src3; + const __m256i *s4 = (const __m256i*)src4; + const __m256i *s5 = (const __m256i*)src5; + const __m256i *s6 = (const __m256i*)src6; + const __m256i *s7 = (const __m256i*)src7; + __m256i dt0, dt1, dt2, dt3, dt4, dt5, dt6, dt7; + + MM256_ILEAVE32( dt0, dt1, dt2, dt3, s0[0], s1[0], s2[0], s3[0] ); + MM256_ILEAVE32( dt4, dt5, dt6, dt7, s4[0], s5[0], s6[0], s7[0] ); + + d[0] = _mm256_permute2x128_si256( dt0, dt4, 0x20 ); + d[4] = _mm256_permute2x128_si256( dt0, dt4, 0x31 ); + d[1] = _mm256_permute2x128_si256( dt1, dt5, 0x20 ); + d[5] = _mm256_permute2x128_si256( dt1, dt5, 0x31 ); + d[2] = _mm256_permute2x128_si256( dt2, dt6, 0x20 ); + d[6] = _mm256_permute2x128_si256( dt2, dt6, 0x31 ); + d[3] = _mm256_permute2x128_si256( 
dt3, dt7, 0x20 ); + d[7] = _mm256_permute2x128_si256( dt3, dt7, 0x31 ); + +#else +// Shouldn't get here, 8x32 used only with AVX2 + + __m128i *d = (__m128i*)dst; + const __m128i *s0 = (const __m128i*)src0; + const __m128i *s1 = (const __m128i*)src1; + const __m128i *s2 = (const __m128i*)src2; + const __m128i *s3 = (const __m128i*)src3; + const __m128i *s4 = (const __m128i*)src4; + const __m128i *s5 = (const __m128i*)src5; + const __m128i *s6 = (const __m128i*)src6; + const __m128i *s7 = (const __m128i*)src7; + + MM128_ILEAVE32( d[ 0], d[ 2], d[ 4], d[ 6], s0[0], s1[0], s2[0], s3[0] ); + MM128_ILEAVE32( d[ 1], d[ 3], d[ 5], d[ 7], s4[0], s5[0], s6[0], s7[0] ); + MM128_ILEAVE32( d[ 8], d[10], d[12], d[14], s0[1], s1[1], s2[1], s3[1] ); + MM128_ILEAVE32( d[ 9], d[11], d[13], d[15], s4[1], s5[1], s6[1], s7[1] ); + +#endif +} + +static inline void intrlv_8x32_512( void *dst, const void *src0, + const void *src1, const void *src2, const void *src3, const void *src4, + const void *src5, const void *src6, const void *src7 ) +{ +#if 0 //defined(__AVX512F__) + + __m512i *d = (__m512i*)dst; + const __m512i *s0 = (const __m512i*)src0; + const __m512i *s1 = (const __m512i*)src1; + const __m512i *s2 = (const __m512i*)src2; + const __m512i *s3 = (const __m512i*)src3; + const __m512i *s4 = (const __m512i*)src4; + const __m512i *s5 = (const __m512i*)src5; + const __m512i *s6 = (const __m512i*)src6; + const __m512i *s7 = (const __m512i*)src7; + + __m512i dt0, dt1, dt2, dt3, dt4, dt5, dt6, dt7, t0, t1, t2, t3; + + MM512_ILEAVE32( dt0, dt1, dt2, dt3, s0[0], s1[0], s2[0], s3[0] ); + MM512_ILEAVE32( dt4, dt5, dt6, dt7, s4[0], s5[0], s6[0], s7[0] ); + + t0 = _mm512_shuffle_i32x4( dt0, dt4, 0x44 ); + t2 = _mm512_shuffle_i32x4( dt1, dt5, 0x44 ); + t1 = _mm512_shuffle_i32x4( dt0, dt4, 0xee ); + t3 = _mm512_shuffle_i32x4( dt1, dt5, 0xee ); + + d[0] = _mm512_shuffle_i32x4( t0, t2, 0x88 ); + d[2] = _mm512_shuffle_i32x4( t0, t2, 0xdd ); + d[4] = _mm512_shuffle_i32x4( t1, t3, 0x88 ); + d[6] = _mm512_shuffle_i32x4( t1, t3, 0xdd ); + + t0 = _mm512_shuffle_i32x4( dt2, dt6, 0x44 ); + t2 = _mm512_shuffle_i32x4( dt3, dt7, 0x44 ); + t1 = _mm512_shuffle_i32x4( dt2, dt6, 0xee ); + t3 = _mm512_shuffle_i32x4( dt3, dt7, 0xee ); + + d[1] = _mm512_shuffle_i32x4( t0, t2, 0x88 ); + d[3] = _mm512_shuffle_i32x4( t0, t2, 0xdd ); + d[5] = _mm512_shuffle_i32x4( t1, t3, 0x88 ); + d[7] = _mm512_shuffle_i32x4( t1, t3, 0xdd ); + +#elif defined(__AVX2__) + + __m256i *d = (__m256i*)dst; + const __m256i *s0 = (const __m256i*)src0; + const __m256i *s1 = (const __m256i*)src1; + const __m256i *s2 = (const __m256i*)src2; + const __m256i *s3 = (const __m256i*)src3; + const __m256i *s4 = (const __m256i*)src4; + const __m256i *s5 = (const __m256i*)src5; + const __m256i *s6 = (const __m256i*)src6; + const __m256i *s7 = (const __m256i*)src7; + __m256i dt0, dt1, dt2, dt3, dt4, dt5, dt6, dt7; + + MM256_ILEAVE32( dt0, dt1, dt2, dt3, s0[0], s1[0], s2[0], s3[0] ); + MM256_ILEAVE32( dt4, dt5, dt6, dt7, s4[0], s5[0], s6[0], s7[0] ); + + d[0] = _mm256_permute2x128_si256( dt0, dt4, 0x20 ); + d[1] = _mm256_permute2x128_si256( dt1, dt5, 0x20 ); + d[4] = _mm256_permute2x128_si256( dt0, dt4, 0x31 ); + d[5] = _mm256_permute2x128_si256( dt1, dt5, 0x31 ); + d[2] = _mm256_permute2x128_si256( dt2, dt6, 0x20 ); + d[3] = _mm256_permute2x128_si256( dt3, dt7, 0x20 ); + d[6] = _mm256_permute2x128_si256( dt2, dt6, 0x31 ); + d[7] = _mm256_permute2x128_si256( dt3, dt7, 0x31 ); + + MM256_ILEAVE32( dt0, dt1, dt2, dt3, s0[1], s1[1], s2[1], s3[1] ); + MM256_ILEAVE32( dt4, dt5, dt6, 
dt7, s4[1], s5[1], s6[1], s7[1] ); + + d[ 8] = _mm256_permute2x128_si256( dt0, dt4, 0x20 ); + d[ 9] = _mm256_permute2x128_si256( dt1, dt5, 0x20 ); + d[12] = _mm256_permute2x128_si256( dt0, dt4, 0x31 ); + d[13] = _mm256_permute2x128_si256( dt1, dt5, 0x31 ); + d[10] = _mm256_permute2x128_si256( dt2, dt6, 0x20 ); + d[11] = _mm256_permute2x128_si256( dt3, dt7, 0x20 ); + d[14] = _mm256_permute2x128_si256( dt2, dt6, 0x31 ); + d[15] = _mm256_permute2x128_si256( dt3, dt7, 0x31 ); + +#else +// Shouldn't get here, 8x32 only used with AVX2 or AVX512 + + __m128i *d = (__m128i*)dst; + const __m128i *s0 = (const __m128i*)src0; + const __m128i *s1 = (const __m128i*)src1; + const __m128i *s2 = (const __m128i*)src2; + const __m128i *s3 = (const __m128i*)src3; + const __m128i *s4 = (const __m128i*)src4; + const __m128i *s5 = (const __m128i*)src5; + const __m128i *s6 = (const __m128i*)src6; + const __m128i *s7 = (const __m128i*)src7; + + MM128_ILEAVE32( d[ 0], d[ 2], d[ 4], d[ 6], s0[0], s1[0], s2[0], s3[0] ); + MM128_ILEAVE32( d[ 1], d[ 3], d[ 5], d[ 7], s4[0], s5[0], s6[0], s7[0] ); + MM128_ILEAVE32( d[ 8], d[10], d[12], d[14], s0[1], s1[1], s2[1], s3[1] ); + MM128_ILEAVE32( d[ 9], d[11], d[13], d[15], s4[1], s5[1], s6[1], s7[1] ); + + MM128_ILEAVE32( d[16], d[18], d[20], d[22], s0[2], s1[2], s2[2], s3[2] ); + MM128_ILEAVE32( d[17], d[19], d[21], d[23], s4[2], s5[2], s6[2], s7[2] ); + MM128_ILEAVE32( d[24], d[26], d[28], d[30], s0[3], s1[3], s2[3], s3[3] ); + MM128_ILEAVE32( d[25], d[27], d[29], d[31], s4[3], s5[3], s6[3], s7[3] ); + +#endif +} +*/ #define ILEAVE_8x32( i ) do \ { \ @@ -684,6 +783,7 @@ static inline void intrlv_8x32b( void *dst, const void *s0, const void *s1, ILEAVE_8x32( i ); } + static inline void intrlv_8x32( void *dst, const void *s0, const void *s1, const void *s2, const void *s3, const void *s4, const void *s5, const void *s6, const void *s7, const int bit_len ) @@ -709,6 +809,8 @@ static inline void intrlv_8x32( void *dst, const void *s0, const void *s1, ILEAVE_8x32( 30 ); ILEAVE_8x32( 31 ); } + + static inline void intrlv_8x32_512( void *dst, const void *s0, const void *s1, const void *s2, const void *s3, const void *s4, const void *s5, const void *s6, const void *s7 ) @@ -723,8 +825,205 @@ static inline void intrlv_8x32_512( void *dst, const void *s0, const void *s1, ILEAVE_8x32( 14 ); ILEAVE_8x32( 15 ); } + #undef ILEAVE_8x32 +/* +static inline void dintrlv_8x32( void *dst0, void *dst1, void *dst2, void *dst3, + void *dst4, void *dst5, void *dst6, void *dst7, const void *src, + const int bit_len ) +{ + __m128i *d0 = (__m128i*)dst0; + __m128i *d1 = (__m128i*)dst1; + __m128i *d2 = (__m128i*)dst2; + __m128i *d3 = (__m128i*)dst3; + __m128i *d4 = (__m128i*)dst4; + __m128i *d5 = (__m128i*)dst5; + __m128i *d6 = (__m128i*)dst6; + __m128i *d7 = (__m128i*)dst7; + const __m128i *s = (const __m128i*)src; + + MM128_ILEAVE32( d0[0], d1[0], d2[0], d3[0], s[ 0], s[ 2], s[ 4], s[ 6] ); + MM128_ILEAVE32( d4[0], d5[0], d6[0], d7[0], s[ 1], s[ 3], s[ 5], s[ 7] ); + MM128_ILEAVE32( d0[1], d1[1], d2[1], d3[1], s[ 8], s[10], s[12], s[14] ); + MM128_ILEAVE32( d4[1], d5[1], d6[1], d7[1], s[ 9], s[11], s[13], s[15] ); + + if ( bit_len <= 256 ) return; + + MM128_ILEAVE32( d0[2], d1[2], d2[2], d3[2], s[16], s[18], s[20], s[22] ); + MM128_ILEAVE32( d4[2], d5[2], d6[2], d7[2], s[17], s[19], s[21], s[23] ); + MM128_ILEAVE32( d0[3], d1[3], d2[3], d3[3], s[24], s[26], s[28], s[30] ); + MM128_ILEAVE32( d4[3], d5[3], d6[3], d7[3], s[25], s[27], s[29], s[31] ); + + if ( bit_len <= 512 ) return; + + MM128_ILEAVE32( 
d0[4], d1[4], d2[4], d3[4], s[32], s[34], s[36], s[38] ); + MM128_ILEAVE32( d4[4], d5[4], d6[4], d7[4], s[33], s[35], s[37], s[39] ); + + if ( bit_len <= 640 ) return; + + MM128_ILEAVE32( d0[5], d1[5], d2[5], d3[5], s[40], s[42], s[44], s[46] ); + MM128_ILEAVE32( d4[5], d5[5], d6[5], d7[5], s[41], s[43], s[45], s[47] ); + MM128_ILEAVE32( d0[6], d1[6], d2[6], d3[6], s[48], s[50], s[52], s[54] ); + MM128_ILEAVE32( d4[6], d5[6], d6[6], d7[6], s[49], s[51], s[53], s[55] ); + MM128_ILEAVE32( d0[7], d1[7], d2[7], d3[7], s[56], s[58], s[60], s[62] ); + MM128_ILEAVE32( d4[7], d5[7], d6[7], d7[7], s[57], s[59], s[61], s[63] ); +} + +static inline void dintrlv_8x32_256( void *dst0, void *dst1, void *dst2, + void *dst3, void *dst4, void *dst5, void *dst6, void *dst7, + const void *src ) +{ +#if defined(__AVX2__) + + __m256i *d0 = (__m256i*)dst0; + __m256i *d1 = (__m256i*)dst1; + __m256i *d2 = (__m256i*)dst2; + __m256i *d3 = (__m256i*)dst3; + __m256i *d4 = (__m256i*)dst4; + __m256i *d5 = (__m256i*)dst5; + __m256i *d6 = (__m256i*)dst6; + __m256i *d7 = (__m256i*)dst7; + const __m256i *s = (const __m256i*)src; + + __m256i st0 = _mm256_permute2x128_si256( s[0], s[4], 0x20 ); + __m256i st1 = _mm256_permute2x128_si256( s[0], s[4], 0x31 ); + __m256i st2 = _mm256_permute2x128_si256( s[1], s[5], 0x20 ); + __m256i st3 = _mm256_permute2x128_si256( s[1], s[5], 0x31 ); + __m256i st4 = _mm256_permute2x128_si256( s[2], s[6], 0x20 ); + __m256i st5 = _mm256_permute2x128_si256( s[2], s[6], 0x31 ); + __m256i st6 = _mm256_permute2x128_si256( s[3], s[7], 0x20 ); + __m256i st7 = _mm256_permute2x128_si256( s[3], s[7], 0x31 ); + + MM256_ILEAVE32( d0[0], d1[0], d2[0], d3[0], st0, st2, st4, st6 ); + MM256_ILEAVE32( d4[0], d5[0], d6[0], d7[0], st1, st3, st5, st7 ); + +#else +// Not needed, 8x32 used only with AVX2, AVX512 + + __m128i *d0 = (__m128i*)dst0; + __m128i *d1 = (__m128i*)dst1; + __m128i *d2 = (__m128i*)dst2; + __m128i *d3 = (__m128i*)dst3; + __m128i *d4 = (__m128i*)dst4; + __m128i *d5 = (__m128i*)dst5; + __m128i *d6 = (__m128i*)dst6; + __m128i *d7 = (__m128i*)dst7; + const __m128i *s = (const __m128i*)src; + + MM128_ILEAVE32( d0[0], d1[0], d2[0], d3[0], s[ 0], s[ 2], s[ 4], s[ 6] ); + MM128_ILEAVE32( d4[0], d5[0], d6[0], d7[0], s[ 1], s[ 3], s[ 5], s[ 7] ); + MM128_ILEAVE32( d0[1], d1[1], d2[1], d3[1], s[ 8], s[10], s[12], s[14] ); + MM128_ILEAVE32( d4[1], d5[1], d6[1], d7[1], s[ 9], s[11], s[13], s[15] ); + +#endif +} + +static inline void dintrlv_8x32_512( void *dst0, void *dst1, void *dst2, + void *dst3, void *dst4, void *dst5, void *dst6, void *dst7, + const void *src ) +{ +#if 0 // defined(__AVX512F__) + + __m512i *d0 = (__m512i*)dst0; + __m512i *d1 = (__m512i*)dst1; + __m512i *d2 = (__m512i*)dst2; + __m512i *d3 = (__m512i*)dst3; + __m512i *d4 = (__m512i*)dst4; + __m512i *d5 = (__m512i*)dst5; + __m512i *d6 = (__m512i*)dst6; + __m512i *d7 = (__m512i*)dst7; + + + const __m512i *s = (const __m512i*)src; + + __m512i st0, st1, st2, st3, st4, st5, st6, st7, t0, t1, t2, t3; + + t0 = _mm512_shuffle_i32x4( s[0], s[2], 0x44 ); + t2 = _mm512_shuffle_i32x4( s[4], s[6], 0x44 ); + t1 = _mm512_shuffle_i32x4( s[0], s[2], 0xee ); + t3 = _mm512_shuffle_i32x4( s[4], s[6], 0xee ); + + st0 = _mm512_shuffle_i32x4( t0, t2, 0x88 ); + st4 = _mm512_shuffle_i32x4( t0, t2, 0xdd ); + st1 = _mm512_shuffle_i32x4( t1, t3, 0x88 ); + st5 = _mm512_shuffle_i32x4( t1, t3, 0xdd ); + + t0 = _mm512_shuffle_i32x4( s[1], s[3], 0x44 ); + t2 = _mm512_shuffle_i32x4( s[5], s[7], 0x44 ); + t1 = _mm512_shuffle_i32x4( s[1], s[3], 0xee ); + t3 = 
_mm512_shuffle_i32x4( s[5], s[7], 0xee ); + + st2 = _mm512_shuffle_i32x4( t0, t2, 0x88 ); + st6 = _mm512_shuffle_i32x4( t0, t2, 0xdd ); + st3 = _mm512_shuffle_i32x4( t1, t3, 0x88 ); + st7 = _mm512_shuffle_i32x4( t1, t3, 0xdd ); + + MM512_ILEAVE32( d0[0], d1[0], d2[0], d3[0], st0, st1, st2, st3 ); + MM512_ILEAVE32( d4[0], d5[0], d6[0], d7[0], st4, st5, st6, st7 ); + +#elif defined(__AVX2__) + + __m256i *d0 = (__m256i*)dst0; + __m256i *d1 = (__m256i*)dst1; + __m256i *d2 = (__m256i*)dst2; + __m256i *d3 = (__m256i*)dst3; + __m256i *d4 = (__m256i*)dst4; + __m256i *d5 = (__m256i*)dst5; + __m256i *d6 = (__m256i*)dst6; + __m256i *d7 = (__m256i*)dst7; + const __m256i *s = (const __m256i*)src; + + __m256i st0 = _mm256_permute2x128_si256( s[0], s[4], 0x20 ); + __m256i st2 = _mm256_permute2x128_si256( s[1], s[5], 0x20 ); + __m256i st1 = _mm256_permute2x128_si256( s[0], s[4], 0x31 ); + __m256i st3 = _mm256_permute2x128_si256( s[1], s[5], 0x31 ); + __m256i st4 = _mm256_permute2x128_si256( s[2], s[6], 0x20 ); + __m256i st6 = _mm256_permute2x128_si256( s[3], s[7], 0x20 ); + __m256i st5 = _mm256_permute2x128_si256( s[2], s[6], 0x31 ); + __m256i st7 = _mm256_permute2x128_si256( s[3], s[7], 0x31 ); + + MM256_ILEAVE32( d0[0], d1[0], d2[0], d3[0], st0, st2, st4, st6 ); + MM256_ILEAVE32( d4[0], d5[0], d6[0], d7[0], st1, st3, st5, st7 ); + + st0 = _mm256_permute2x128_si256( s[ 8], s[12], 0x20 ); + st2 = _mm256_permute2x128_si256( s[ 9], s[13], 0x20 ); + st1 = _mm256_permute2x128_si256( s[ 8], s[12], 0x31 ); + st3 = _mm256_permute2x128_si256( s[ 9], s[13], 0x31 ); + st4 = _mm256_permute2x128_si256( s[10], s[14], 0x20 ); + st6 = _mm256_permute2x128_si256( s[11], s[15], 0x20 ); + st5 = _mm256_permute2x128_si256( s[10], s[14], 0x31 ); + st7 = _mm256_permute2x128_si256( s[11], s[15], 0x31 ); + + MM256_ILEAVE32( d0[1], d1[1], d2[1], d3[1], st0, st2, st4, st6 ); + MM256_ILEAVE32( d4[1], d5[1], d6[1], d7[1], st1, st3, st5, st7 ); + +#else + + __m128i *d0 = (__m128i*)dst0; + __m128i *d1 = (__m128i*)dst1; + __m128i *d2 = (__m128i*)dst2; + __m128i *d3 = (__m128i*)dst3; + __m128i *d4 = (__m128i*)dst4; + __m128i *d5 = (__m128i*)dst5; + __m128i *d6 = (__m128i*)dst6; + __m128i *d7 = (__m128i*)dst7; + const __m128i *s = (const __m128i*)src; + + MM128_ILEAVE32( d0[0], d1[0], d2[0], d3[0], s[ 0], s[ 2], s[ 4], s[ 6] ); + MM128_ILEAVE32( d4[0], d5[0], d6[0], d7[0], s[ 1], s[ 3], s[ 5], s[ 7] ); + MM128_ILEAVE32( d0[1], d1[1], d2[1], d3[1], s[ 8], s[10], s[12], s[14] ); + MM128_ILEAVE32( d4[1], d5[1], d6[1], d7[1], s[ 9], s[11], s[13], s[15] ); + + MM128_ILEAVE32( d0[2], d1[2], d2[2], d3[2], s[16], s[18], s[20], s[22] ); + MM128_ILEAVE32( d4[2], d5[2], d6[2], d7[2], s[17], s[19], s[21], s[23] ); + MM128_ILEAVE32( d0[3], d1[3], d2[3], d3[3], s[24], s[26], s[28], s[30] ); + MM128_ILEAVE32( d4[3], d5[3], d6[3], d7[3], s[25], s[27], s[29], s[31] ); + +#endif +} +*/ + #define DLEAVE_8x32( i ) do \ { \ const uint32_t *s = (const uint32_t*)(src) + ( (i) << 3 ); \ @@ -771,6 +1070,7 @@ static inline void dintrlv_8x32( void *d0, void *d1, void *d2, void *d3, DLEAVE_8x32( 30 ); DLEAVE_8x32( 31 ); } + static inline void dintrlv_8x32_512( void *d0, void *d1, void *d2, void *d3, void *d4, void *d5, void *d6, void *d7, const void *src ) { @@ -874,6 +1174,210 @@ static inline void mm256_bswap32_intrlv80_8x32( void *d, const void *src ) #endif // AVX2 // 16x32 +/* +static inline void intrlv_16x32( void *dst, const void *src00, + const void *src01, const void *src02, const void *src03, const void *src04, + const void *src05, const void *src06, const 
void *src07, const void *src08, + const void *src09, const void *src10, const void *src11, const void *src12, + const void *src13, const void *src14, const void *src15, const int bit_len ) +{ + __m128i *d = (__m128i*)dst; + const __m128i *s00 = (const __m128i*)src00; + const __m128i *s01 = (const __m128i*)src01; + const __m128i *s02 = (const __m128i*)src02; + const __m128i *s03 = (const __m128i*)src03; + const __m128i *s04 = (const __m128i*)src04; + const __m128i *s05 = (const __m128i*)src05; + const __m128i *s06 = (const __m128i*)src06; + const __m128i *s07 = (const __m128i*)src07; + const __m128i *s08 = (const __m128i*)src08; + const __m128i *s09 = (const __m128i*)src09; + const __m128i *s10 = (const __m128i*)src10; + const __m128i *s11 = (const __m128i*)src11; + const __m128i *s12 = (const __m128i*)src12; + const __m128i *s13 = (const __m128i*)src13; + const __m128i *s14 = (const __m128i*)src14; + const __m128i *s15 = (const __m128i*)src15; + + MM128_ILEAVE32( d[ 0], d[ 4], d[ 8], d[12], s00[0], s01[0], s02[0], s03[0] ); + MM128_ILEAVE32( d[ 1], d[ 5], d[ 9], d[13], s04[0], s05[0], s06[0], s07[0] ); + MM128_ILEAVE32( d[ 2], d[ 6], d[10], d[14], s08[0], s09[0], s10[0], s11[0] ); + MM128_ILEAVE32( d[ 3], d[ 7], d[11], d[15], s12[0], s13[0], s14[0], s15[0] ); + + MM128_ILEAVE32( d[16], d[20], d[24], d[28], s00[1], s01[1], s02[1], s03[1] ); + MM128_ILEAVE32( d[17], d[21], d[25], d[29], s04[1], s05[1], s06[1], s07[1] ); + MM128_ILEAVE32( d[18], d[22], d[26], d[30], s08[1], s09[1], s10[1], s11[1] ); + MM128_ILEAVE32( d[19], d[23], d[27], d[31], s12[1], s13[1], s14[1], s15[1] ); + + if ( bit_len <= 256 ) return; + + MM128_ILEAVE32( d[32], d[36], d[40], d[44], s00[2], s01[2], s02[2], s03[2] ); + MM128_ILEAVE32( d[33], d[37], d[41], d[45], s04[2], s05[2], s06[2], s07[2] ); + MM128_ILEAVE32( d[34], d[38], d[42], d[46], s08[2], s09[2], s10[2], s11[2] ); + MM128_ILEAVE32( d[35], d[39], d[43], d[47], s12[2], s13[2], s14[2], s15[2] ); + + MM128_ILEAVE32( d[48], d[52], d[56], d[60], s00[3], s01[3], s02[3], s03[3] ); + MM128_ILEAVE32( d[49], d[53], d[57], d[61], s04[3], s05[3], s06[3], s07[3] ); + MM128_ILEAVE32( d[50], d[54], d[58], d[62], s08[3], s09[3], s10[3], s11[3] ); + MM128_ILEAVE32( d[51], d[55], d[59], d[63], s12[3], s13[3], s14[3], s15[3] ); + + if ( bit_len <= 512 ) return; + + MM128_ILEAVE32( d[64], d[68], d[72], d[76], s00[4], s01[4], s02[4], s03[4] ); + MM128_ILEAVE32( d[65], d[69], d[73], d[77], s04[4], s05[4], s06[4], s07[4] ); + MM128_ILEAVE32( d[66], d[70], d[74], d[78], s08[4], s09[4], s10[4], s11[4] ); + MM128_ILEAVE32( d[67], d[71], d[75], d[79], s12[4], s13[4], s14[4], s15[4] ); + + if ( bit_len <= 640 ) return; + + MM128_ILEAVE32( d[80], d[84], d[88], d[92], s00[5], s01[5], s02[5], s03[5] ); + MM128_ILEAVE32( d[81], d[85], d[89], d[93], s04[5], s05[5], s06[5], s07[5] ); + MM128_ILEAVE32( d[82], d[86], d[90], d[94], s08[5], s09[5], s10[5], s11[5] ); + MM128_ILEAVE32( d[83], d[87], d[91], d[95], s12[5], s13[5], s14[5], s15[5] ); + + MM128_ILEAVE32( d[ 96], d[100], d[104], d[108], s00[6], s01[6], s02[6], s03[6] ); + MM128_ILEAVE32( d[ 97], d[101], d[105], d[109], s04[6], s05[6], s06[6], s07[6] ); + MM128_ILEAVE32( d[ 98], d[102], d[106], d[110], s08[6], s09[6], s10[6], s11[6] ); + MM128_ILEAVE32( d[ 99], d[103], d[107], d[111], s12[6], s13[6], s14[6], s15[6] ); + + MM128_ILEAVE32( d[112], d[116], d[120], d[124], s00[7], s01[7], s02[7], s03[7] ); + MM128_ILEAVE32( d[113], d[117], d[121], d[125], s04[7], s05[7], s06[7], s07[7] ); + MM128_ILEAVE32( d[114], d[118], d[122], d[126], 
s08[7], s09[7], s10[7], s11[7] ); + MM128_ILEAVE32( d[115], d[119], d[123], d[127], s12[7], s13[7], s14[7], s15[7] ); +} + +// Not used, only potential use is with AVX512 +#if defined(__AVX2__) + +static inline void intrlv_16x32_256( void *dst, const void *src00, + const void *src01, const void *src02, const void *src03, const void *src04, + const void *src05, const void *src06, const void *src07, const void *src08, + const void *src09, const void *src10, const void *src11, const void *src12, + const void *src13, const void *src14, const void *src15 ) +{ + __m256i *d = (__m256i*)dst; + const __m256i *s00 = (const __m256i*)src00; + const __m256i *s01 = (const __m256i*)src01; + const __m256i *s02 = (const __m256i*)src02; + const __m256i *s03 = (const __m256i*)src03; + const __m256i *s04 = (const __m256i*)src04; + const __m256i *s05 = (const __m256i*)src05; + const __m256i *s06 = (const __m256i*)src06; + const __m256i *s07 = (const __m256i*)src07; + const __m256i *s08 = (const __m256i*)src08; + const __m256i *s09 = (const __m256i*)src09; + const __m256i *s10 = (const __m256i*)src10; + const __m256i *s11 = (const __m256i*)src11; + const __m256i *s12 = (const __m256i*)src12; + const __m256i *s13 = (const __m256i*)src13; + const __m256i *s14 = (const __m256i*)src14; + const __m256i *s15 = (const __m256i*)src15; + __m256i dt0, dt1, dt2, dt3, dt4, dt5, dt6, dt7; + + MM256_ILEAVE32( dt0, dt1, dt2, dt3, s00[0], s01[0], s02[0], s03[0] ); + MM256_ILEAVE32( dt4, dt5, dt6, dt7, s04[0], s05[0], s06[0], s07[0] ); + + d[ 0] = _mm256_permute2x128_si256( dt0, dt4, 0x20 ); + d[ 8] = _mm256_permute2x128_si256( dt0, dt4, 0x31 ); + d[ 2] = _mm256_permute2x128_si256( dt1, dt5, 0x20 ); + d[10] = _mm256_permute2x128_si256( dt1, dt5, 0x31 ); + d[ 4] = _mm256_permute2x128_si256( dt2, dt6, 0x20 ); + d[12] = _mm256_permute2x128_si256( dt2, dt6, 0x31 ); + d[ 6] = _mm256_permute2x128_si256( dt3, dt7, 0x20 ); + d[14] = _mm256_permute2x128_si256( dt3, dt7, 0x31 ); + + MM256_ILEAVE32( dt0, dt1, dt1, dt3, s08[0], s09[0], s10[0], s11[0] ); + MM256_ILEAVE32( dt4, dt5, dt6, dt7, s12[0], s13[0], s14[0], s15[0] ); + + d[ 1] = _mm256_permute2x128_si256( dt0, dt4, 0x20 ); + d[ 9] = _mm256_permute2x128_si256( dt0, dt4, 0x31 ); + d[ 3] = _mm256_permute2x128_si256( dt1, dt5, 0x20 ); + d[11] = _mm256_permute2x128_si256( dt1, dt5, 0x31 ); + d[ 5] = _mm256_permute2x128_si256( dt2, dt6, 0x20 ); + d[13] = _mm256_permute2x128_si256( dt2, dt6, 0x31 ); + d[ 7] = _mm256_permute2x128_si256( dt3, dt7, 0x20 ); + d[15] = _mm256_permute2x128_si256( dt3, dt7, 0x31 ); +} +#endif + +// Not used +static inline void intrlv_16x32_512( void *dst, const void *src00, + const void *src01, const void *src02, const void *src03, const void *src04, + const void *src05, const void *src06, const void *src07, const void *src08, + const void *src09, const void *src10, const void *src11, const void *src12, + const void *src13, const void *src14, const void *src15 ) +{ +#if defined(__AVX512F__) + + __m512i *d = (__m512i*)dst; + const __m512i *s00 = (const __m512i*)src00; + const __m512i *s01 = (const __m512i*)src01; + const __m512i *s02 = (const __m512i*)src02; + const __m512i *s03 = (const __m512i*)src03; + const __m512i *s04 = (const __m512i*)src04; + const __m512i *s05 = (const __m512i*)src05; + const __m512i *s06 = (const __m512i*)src06; + const __m512i *s07 = (const __m512i*)src07; + const __m512i *s08 = (const __m512i*)src08; + const __m512i *s09 = (const __m512i*)src09; + const __m512i *s10 = (const __m512i*)src10; + const __m512i *s11 = (const __m512i*)src11; 
+ const __m512i *s12 = (const __m512i*)src12; + const __m512i *s13 = (const __m512i*)src13; + const __m512i *s14 = (const __m512i*)src14; + const __m512i *s15 = (const __m512i*)src15; + __m512i st00, st01, st02, st03, st04, st05, st06, st07, + st08, st09, st10, st11, st12, st13, st14, st15, + t0, t1, t2, t3; + + MM512_ILEAVE32( st00, st01, st02, st03, s00[0], s01[0], s02[0], s03[0] ); + MM512_ILEAVE32( st04, st05, st06, st07, s04[0], s05[0], s06[0], s07[0] ); + MM512_ILEAVE32( st08, st09, st10, st11, s08[0], s09[0], s10[0], s11[0] ); + MM512_ILEAVE32( st12, st13, st14, st15, s12[0], s13[0], s14[0], s15[0] ); + + t0 = _mm512_shuffle_i32x4( st00, st04, 0x88 ); + t1 = _mm512_shuffle_i32x4( st00, st04, 0xdd ); + t2 = _mm512_shuffle_i32x4( st08, st12, 0x88 ); + t3 = _mm512_shuffle_i32x4( st08, st12, 0xdd ); + + d[ 0] = _mm512_shuffle_i32x4( t0, t2, 0x88 ); + d[ 8] = _mm512_shuffle_i32x4( t0, t2, 0xdd ); + d[ 4] = _mm512_shuffle_i32x4( t1, t3, 0x88 ); + d[12] = _mm512_shuffle_i32x4( t1, t3, 0xdd ); + + t0 = _mm512_shuffle_i32x4( st01, st05, 0x88 ); + t1 = _mm512_shuffle_i32x4( st01, st05, 0xdd ); + t2 = _mm512_shuffle_i32x4( st09, st13, 0x88 ); + t3 = _mm512_shuffle_i32x4( st09, st13, 0xdd ); + + d[ 1] = _mm512_shuffle_i32x4( t0, t2, 0x88 ); + d[ 9] = _mm512_shuffle_i32x4( t0, t2, 0xdd ); + d[ 5] = _mm512_shuffle_i32x4( t1, t3, 0x88 ); + d[13] = _mm512_shuffle_i32x4( t1, t3, 0xdd ); + + t0 = _mm512_shuffle_i32x4( st02, st06, 0x88 ); + t1 = _mm512_shuffle_i32x4( st02, st06, 0xdd ); + t2 = _mm512_shuffle_i32x4( st10, st14, 0x88 ); + t3 = _mm512_shuffle_i32x4( st10, st14, 0xdd ); + + d[ 2] = _mm512_shuffle_i32x4( t0, t2, 0x88 ); + d[10] = _mm512_shuffle_i32x4( t0, t2, 0xdd ); + d[ 6] = _mm512_shuffle_i32x4( t1, t3, 0x88 ); + d[14] = _mm512_shuffle_i32x4( t1, t3, 0xdd ); + + t0 = _mm512_shuffle_i32x4( st03, st07, 0x88 ); + t1 = _mm512_shuffle_i32x4( st03, st07, 0xdd ); + t2 = _mm512_shuffle_i32x4( st11, st15, 0x88 ); + t3 = _mm512_shuffle_i32x4( st11, st15, 0xdd ); + + d[ 3] = _mm512_shuffle_i32x4( t0, t2, 0x88 ); + d[11] = _mm512_shuffle_i32x4( t0, t2, 0xdd ); + d[ 7] = _mm512_shuffle_i32x4( t1, t3, 0x88 ); + d[15] = _mm512_shuffle_i32x4( t1, t3, 0xdd ); + +#endif +} +*/ #define ILEAVE_16x32( i ) do \ { \ @@ -923,6 +1427,7 @@ static inline void intrlv_16x32( void *dst, const void *s00, ILEAVE_16x32( 30 ); ILEAVE_16x32( 31 ); } + static inline void intrlv_16x32_512( void *dst, const void *s00, const void *s01, const void *s02, const void *s03, const void *s04, const void *s05, const void *s06, const void *s07, const void *s08, @@ -941,6 +1446,187 @@ static inline void intrlv_16x32_512( void *dst, const void *s00, #undef ILEAVE_16x32 +/* +static inline void dintrlv_16x32( void *dst00, void *dst01, void *dst02, + void *dst03, void *dst04, void *dst05, void *dst06, void *dst07, + void *dst08, void *dst09, void *dst10, void *dst11, void *dst12, + void *dst13, void *dst14, void *dst15, const void *src, + const int bit_len ) +{ + __m128i *d00 = (__m128i*)dst00; + __m128i *d01 = (__m128i*)dst01; + __m128i *d02 = (__m128i*)dst02; + __m128i *d03 = (__m128i*)dst03; + __m128i *d04 = (__m128i*)dst04; + __m128i *d05 = (__m128i*)dst05; + __m128i *d06 = (__m128i*)dst06; + __m128i *d07 = (__m128i*)dst07; + __m128i *d08 = (__m128i*)dst08; + __m128i *d09 = (__m128i*)dst09; + __m128i *d10 = (__m128i*)dst10; + __m128i *d11 = (__m128i*)dst11; + __m128i *d12 = (__m128i*)dst12; + __m128i *d13 = (__m128i*)dst13; + __m128i *d14 = (__m128i*)dst14; + __m128i *d15 = (__m128i*)dst15; + const __m128i *s = (const __m128i*)src; + 
+ MM128_ILEAVE32( d00[0], d01[0], d02[0], d03[0], s[ 0], s[ 4], s[ 8], s[12] ); + MM128_ILEAVE32( d04[0], d05[0], d06[0], d07[0], s[ 1], s[ 5], s[ 9], s[13] ); + MM128_ILEAVE32( d08[0], d09[0], d10[0], d11[0], s[ 2], s[ 6], s[10], s[14] ); + MM128_ILEAVE32( d12[0], d13[0], d14[0], d15[0], s[ 3], s[ 7], s[11], s[15] ); + + MM128_ILEAVE32( d00[1], d01[1], d02[1], d03[1], s[16], s[20], s[24], s[28] ); + MM128_ILEAVE32( d04[1], d05[1], d06[1], d07[1], s[17], s[21], s[25], s[29] ); + MM128_ILEAVE32( d08[1], d09[1], d10[1], d11[1], s[18], s[22], s[26], s[30] ); + MM128_ILEAVE32( d12[1], d13[1], d14[1], d15[1], s[19], s[23], s[27], s[31] ); + + if ( bit_len <= 256 ) return; + + MM128_ILEAVE32( d00[2], d01[2], d02[2], d03[2], s[32], s[36], s[40], s[44] ); + MM128_ILEAVE32( d04[2], d05[2], d06[2], d07[2], s[33], s[37], s[41], s[45] ); + MM128_ILEAVE32( d08[2], d09[2], d10[2], d11[2], s[34], s[38], s[42], s[46] ); + MM128_ILEAVE32( d12[2], d13[2], d14[2], d15[2], s[35], s[39], s[43], s[47] ); + + MM128_ILEAVE32( d00[3], d01[3], d02[3], d03[3], s[48], s[52], s[56], s[60] ); + MM128_ILEAVE32( d04[3], d05[3], d06[3], d07[3], s[49], s[53], s[57], s[61] ); + MM128_ILEAVE32( d08[3], d09[3], d10[3], d11[3], s[50], s[54], s[58], s[62] ); + MM128_ILEAVE32( d12[3], d13[3], d14[3], d15[3], s[51], s[55], s[59], s[63] ); + + if ( bit_len <= 512 ) return; + + MM128_ILEAVE32( d00[4], d01[4], d02[4], d03[4], s[64], s[68], s[72], s[76] ); + MM128_ILEAVE32( d04[4], d05[4], d06[4], d07[4], s[65], s[69], s[73], s[77] ); + MM128_ILEAVE32( d08[4], d09[4], d10[4], d11[4], s[66], s[70], s[74], s[78] ); + MM128_ILEAVE32( d12[4], d13[4], d14[4], d15[4], s[67], s[71], s[75], s[79] ); + + if ( bit_len <= 640 ) return; + + MM128_ILEAVE32( d00[5], d01[5], d02[5], d03[5], s[80], s[84], s[88], s[92] ); + MM128_ILEAVE32( d04[5], d05[5], d06[5], d07[5], s[81], s[85], s[89], s[93] ); + MM128_ILEAVE32( d08[5], d09[5], d10[5], d11[5], s[82], s[86], s[90], s[94] ); + MM128_ILEAVE32( d12[5], d13[5], d14[5], d15[5], s[83], s[87], s[91], s[95] ); + + MM128_ILEAVE32( d00[6], d01[6], d02[6], d03[6], s[ 96], s[100], s[104], s[108] ); + MM128_ILEAVE32( d04[6], d05[6], d06[6], d07[6], s[ 97], s[101], s[105], s[109] ); + MM128_ILEAVE32( d08[6], d09[6], d10[6], d11[6], s[ 98], s[102], s[106], s[110] ); + MM128_ILEAVE32( d12[6], d13[6], d14[6], d15[6], s[ 99], s[103], s[107], s[111] ); + + MM128_ILEAVE32( d00[7], d01[7], d02[7], d03[7], s[112], s[116], s[120], s[124] ); + MM128_ILEAVE32( d04[7], d05[7], d06[7], d07[7], s[113], s[117], s[121], s[125] ); + MM128_ILEAVE32( d08[7], d09[7], d10[7], d11[7], s[114], s[118], s[122], s[126] ); + MM128_ILEAVE32( d12[7], d13[7], d14[7], d15[7], s[115], s[119], s[123], s[127] ); +} + +// 4 interleave algorithms same memory footprint: +// +// 1. 32 bit integer move +// +// Most instructions, all 32 bit loads & stores, use general purpose regs +// +// 2. SSE2 128 bit shuffle +// +// 128 bit loads and stores + fast shuffles, fewer total instructions: .75, +// uses 128 bit simd regs +// +// 3. AVX2 2x128 bit shuffle with 256 bit permute +// +// 256 bit loads and stores + slow 256 bit permutes, even fewer instructions: +// additional .5, uses 256 bit simd regs +// +// 4. 
AVX2 2x128 bit shuffle with union +// +// 128 bit loads, 256 bit stores + 128 bit moves using union + overhead +// converting from mm128 to mm256, compiler may choose mem ovly or + +static inline void dintrlv_16x32_256( void *dst00, void *dst01, void *dst02, + void *dst03, void *dst04, void *dst05, void *dst06, void *dst07, + void *dst08, void *dst09, void *dst10, void *dst11, void *dst12, + void *dst13, void *dst14, void *dst15, const void *src ) +{ +#if defined(__AVX2__) +// Can't use AVX512, min bit_len is 512 unless a single contiguous +// output buffer is used. + + const __m256i *s = (const __m256i*)src; + __m256i *d00 = (__m256i*)dst00; + __m256i *d01 = (__m256i*)dst01; + __m256i *d02 = (__m256i*)dst02; + __m256i *d03 = (__m256i*)dst03; + __m256i *d04 = (__m256i*)dst04; + __m256i *d05 = (__m256i*)dst05; + __m256i *d06 = (__m256i*)dst06; + __m256i *d07 = (__m256i*)dst07; + __m256i *d08 = (__m256i*)dst08; + __m256i *d09 = (__m256i*)dst09; + __m256i *d10 = (__m256i*)dst10; + __m256i *d11 = (__m256i*)dst11; + __m256i *d12 = (__m256i*)dst12; + __m256i *d13 = (__m256i*)dst13; + __m256i *d14 = (__m256i*)dst14; + __m256i *d15 = (__m256i*)dst15; + __m256i st0, st1, st2, st3, st4, st5, st6, st7; + + st0 = _mm256_permute2x128_si256( s[ 0], s[ 8], 0x20 ); + st4 = _mm256_permute2x128_si256( s[ 0], s[ 8], 0x31 ); + st1 = _mm256_permute2x128_si256( s[ 2], s[10], 0x20 ); + st5 = _mm256_permute2x128_si256( s[ 2], s[10], 0x31 ); + st2 = _mm256_permute2x128_si256( s[ 4], s[12], 0x20 ); + st6 = _mm256_permute2x128_si256( s[ 4], s[12], 0x31 ); + st3 = _mm256_permute2x128_si256( s[ 6], s[14], 0x20 ); + st7 = _mm256_permute2x128_si256( s[ 6], s[14], 0x31 ); + + MM256_ILEAVE32( d00[0], d01[0], d02[0], d03[0], st0, st1, st2, st3 ); + MM256_ILEAVE32( d04[0], d05[0], d06[0], d07[0], st4, st5, st6, st7 ); + + st0 = _mm256_permute2x128_si256( s[ 1], s[ 9], 0x20 ); + st4 = _mm256_permute2x128_si256( s[ 1], s[ 9], 0x31 ); + st1 = _mm256_permute2x128_si256( s[ 3], s[11], 0x20 ); + st5 = _mm256_permute2x128_si256( s[ 3], s[11], 0x31 ); + st2 = _mm256_permute2x128_si256( s[ 5], s[13], 0x20 ); + st6 = _mm256_permute2x128_si256( s[ 5], s[13], 0x31 ); + st3 = _mm256_permute2x128_si256( s[ 7], s[15], 0x20 ); + st7 = _mm256_permute2x128_si256( s[ 7], s[15], 0x31 ); + + MM256_ILEAVE32( d08[0], d09[0], d10[0], d11[0], st0, st1, st2, st3 ); + MM256_ILEAVE32( d12[0], d13[0], d14[0], d15[0], st4, st5, st6, st7 ); + + +#else +// not needed, 16x32 is only used with AVX512 + + __m128i *d00 = (__m128i*)dst00; + __m128i *d01 = (__m128i*)dst01; + __m128i *d02 = (__m128i*)dst02; + __m128i *d03 = (__m128i*)dst03; + __m128i *d04 = (__m128i*)dst04; + __m128i *d05 = (__m128i*)dst05; + __m128i *d06 = (__m128i*)dst06; + __m128i *d07 = (__m128i*)dst07; + __m128i *d08 = (__m128i*)dst08; + __m128i *d09 = (__m128i*)dst09; + __m128i *d10 = (__m128i*)dst10; + __m128i *d11 = (__m128i*)dst11; + __m128i *d12 = (__m128i*)dst12; + __m128i *d13 = (__m128i*)dst13; + __m128i *d14 = (__m128i*)dst14; + __m128i *d15 = (__m128i*)dst15; + const __m128i *s = (const __m128i*)src; + + MM128_ILEAVE32( d00[0], d01[0], d02[0], d03[0], s[ 0], s[ 4], s[ 8], s[12] ); + MM128_ILEAVE32( d04[0], d05[0], d06[0], d07[0], s[ 1], s[ 5], s[ 9], s[13] ); + MM128_ILEAVE32( d08[0], d09[0], d10[0], d11[0], s[ 2], s[ 6], s[10], s[14] ); + MM128_ILEAVE32( d12[0], d13[0], d14[0], d15[0], s[ 3], s[ 7], s[11], s[15] ); + + MM128_ILEAVE32( d00[1], d01[1], d02[1], d03[1], s[16], s[20], s[24], s[28] ); + MM128_ILEAVE32( d04[1], d05[1], d06[1], d07[1], s[17], s[21], s[25], s[29] ); + 
MM128_ILEAVE32( d08[1], d09[1], d10[1], d11[1], s[18], s[22], s[26], s[30] ); + MM128_ILEAVE32( d12[1], d13[1], d14[1], d15[1], s[19], s[23], s[27], s[31] ); + +#endif +} +*/ + #define DLEAVE_16x32( i ) do \ { \ const uint32_t *s = (const uint32_t*)(src) + ( (i) << 4 ); \ @@ -962,6 +1648,7 @@ static inline void intrlv_16x32_512( void *dst, const void *s00, *( (uint32_t*)(d15) +(i) ) = s[15]; \ } while(0) + static inline void dintrlv_16x32( void *d00, void *d01, void *d02, void *d03, void *d04, void *d05, void *d06, void *d07, void *d08, void *d09, void *d10, void *d11, void *d12, void *d13, void *d14, void *d15, @@ -988,6 +1675,7 @@ static inline void dintrlv_16x32( void *d00, void *d01, void *d02, void *d03, DLEAVE_16x32( 30 ); DLEAVE_16x32( 31 ); } + static inline void dintrlv_16x32_512( void *d00, void *d01, void *d02, void *d03, void *d04, void *d05, void *d06, void *d07, void *d08, void *d09, void *d10, void *d11, void *d12, @@ -1005,6 +1693,7 @@ static inline void dintrlv_16x32_512( void *d00, void *d01, void *d02, #undef DLEAVE_16x32 + static inline void extr_lane_16x32( void *d, const void *s, const int lane, const int bit_len ) { @@ -1225,37 +1914,6 @@ static inline void intrlv_4x64( void *dst, const void *src0, d[31] = _mm_unpackhi_epi64( s2[7], s3[7] ); } -/* -static inline void intrlv_4x64( void *dst, void *src0, - void *src1, void *src2, void *src3, int bit_len ) -{ - uint64_t *d = (uint64_t*)dst; - uint64_t *s0 = (uint64_t*)src0; - uint64_t *s1 = (uint64_t*)src1; - uint64_t *s2 = (uint64_t*)src2; - uint64_t *s3 = (uint64_t*)src3; - d[ 0] = s0[ 0]; d[ 1] = s1[ 0]; d[ 2] = s2[ 0]; d[ 3] = s3[ 0]; - d[ 4] = s0[ 1]; d[ 5] = s1[ 1]; d[ 6] = s2[ 1]; d[ 7] = s3[ 1]; - d[ 8] = s0[ 2]; d[ 9] = s1[ 2]; d[ 10] = s2[ 2]; d[ 11] = s3[ 2]; - d[ 12] = s0[ 3]; d[ 13] = s1[ 3]; d[ 14] = s2[ 3]; d[ 15] = s3[ 3]; - if ( bit_len <= 256 ) return; - d[ 16] = s0[ 4]; d[ 17] = s1[ 4]; d[ 18] = s2[ 4]; d[ 19] = s3[ 4]; - d[ 20] = s0[ 5]; d[ 21] = s1[ 5]; d[ 22] = s2[ 5]; d[ 23] = s3[ 5]; - d[ 24] = s0[ 6]; d[ 25] = s1[ 6]; d[ 26] = s2[ 6]; d[ 27] = s3[ 6]; - d[ 28] = s0[ 7]; d[ 29] = s1[ 7]; d[ 30] = s2[ 7]; d[ 31] = s3[ 7]; - if ( bit_len <= 512 ) return; - d[ 32] = s0[ 8]; d[ 33] = s1[ 8]; d[ 34] = s2[ 8]; d[ 35] = s3[ 8]; - d[ 36] = s0[ 9]; d[ 37] = s1[ 9]; d[ 38] = s2[ 9]; d[ 39] = s3[ 9]; - if ( bit_len <= 640 ) return; - d[ 40] = s0[10]; d[ 41] = s1[10]; d[ 42] = s2[10]; d[ 43] = s3[10]; - d[ 44] = s0[11]; d[ 45] = s1[11]; d[ 46] = s2[11]; d[ 47] = s3[11]; - d[ 48] = s0[12]; d[ 49] = s1[12]; d[ 50] = s2[12]; d[ 51] = s3[12]; - d[ 52] = s0[13]; d[ 53] = s1[13]; d[ 54] = s2[13]; d[ 55] = s3[13]; - d[ 56] = s0[14]; d[ 57] = s1[14]; d[ 58] = s2[14]; d[ 59] = s3[14]; - d[ 60] = s0[15]; d[ 61] = s1[15]; d[ 62] = s2[15]; d[ 63] = s3[15]; -} -*/ - static inline void intrlv_4x64_512( void *dst, const void *src0, const void *src1, const void *src2, const void *src3 ) { @@ -1282,26 +1940,6 @@ static inline void intrlv_4x64_512( void *dst, const void *src0, d[15] = _mm_unpackhi_epi64( s2[3], s3[3] ); } -/* -static inline void intrlv_4x64_512( void *dst, const void *src0, - const void *src1, const void *src2, const void *src3 ) -{ - uint64_t *d = (uint64_t*)dst; - const uint64_t *s0 = (const uint64_t*)src0; - const uint64_t *s1 = (const uint64_t*)src1; - const uint64_t *s2 = (const uint64_t*)src2; - const uint64_t *s3 = (const uint64_t*)src3; - d[ 0] = s0[ 0]; d[ 1] = s1[ 0]; d[ 2] = s2[ 0]; d[ 3] = s3[ 0]; - d[ 4] = s0[ 1]; d[ 5] = s1[ 1]; d[ 6] = s2[ 1]; d[ 7] = s3[ 1]; - d[ 8] = s0[ 2]; d[ 9] = s1[ 2]; 
d[ 10] = s2[ 2]; d[ 11] = s3[ 2]; - d[ 12] = s0[ 3]; d[ 13] = s1[ 3]; d[ 14] = s2[ 3]; d[ 15] = s3[ 3]; - d[ 16] = s0[ 4]; d[ 17] = s1[ 4]; d[ 18] = s2[ 4]; d[ 19] = s3[ 4]; - d[ 20] = s0[ 5]; d[ 21] = s1[ 5]; d[ 22] = s2[ 5]; d[ 23] = s3[ 5]; - d[ 24] = s0[ 6]; d[ 25] = s1[ 6]; d[ 26] = s2[ 6]; d[ 27] = s3[ 6]; - d[ 28] = s0[ 7]; d[ 29] = s1[ 7]; d[ 30] = s2[ 7]; d[ 31] = s3[ 7]; -} -*/ - static inline void dintrlv_4x64( void *dst0, void *dst1, void *dst2, void *dst3, const void *src, const int bit_len ) { @@ -1347,38 +1985,6 @@ static inline void dintrlv_4x64( void *dst0, void *dst1, void *dst2, d3[7] = _mm_unpackhi_epi64( s[29], s[31] ); } - -/* -static inline void dintrlv_4x64( void *dst0, void *dst1, void *dst2, - void *dst3, const void *src, int bit_len ) -{ - uint64_t *d0 = (uint64_t*)dst0; - uint64_t *d1 = (uint64_t*)dst1; - uint64_t *d2 = (uint64_t*)dst2; - uint64_t *d3 = (uint64_t*)dst3; - const uint64_t *s = (const uint64_t*)src; - d0[ 0] = s[ 0]; d1[ 0] = s[ 1]; d2[ 0] = s[ 2]; d3[ 0] = s[ 3]; - d0[ 1] = s[ 4]; d1[ 1] = s[ 5]; d2[ 1] = s[ 6]; d3[ 1] = s[ 7]; - d0[ 2] = s[ 8]; d1[ 2] = s[ 9]; d2[ 2] = s[10]; d3[ 2] = s[11]; - d0[ 3] = s[12]; d1[ 3] = s[13]; d2[ 3] = s[14]; d3[ 3] = s[15]; - if ( bit_len <= 256 ) return; - d0[ 4] = s[16]; d1[ 4] = s[17]; d2[ 4] = s[18]; d3[ 4] = s[19]; - d0[ 5] = s[20]; d1[ 5] = s[21]; d2[ 5] = s[22]; d3[ 5] = s[23]; - d0[ 6] = s[24]; d1[ 6] = s[25]; d2[ 6] = s[26]; d3[ 6] = s[27]; - d0[ 7] = s[28]; d1[ 7] = s[29]; d2[ 7] = s[30]; d3[ 7] = s[31]; - if ( bit_len <= 512 ) return; - d0[ 8] = s[32]; d1[ 8] = s[33]; d2[ 8] = s[34]; d3[ 8] = s[35]; - d0[ 9] = s[36]; d1[ 9] = s[37]; d2[ 9] = s[38]; d3[ 9] = s[39]; - if ( bit_len <= 640 ) return; - d0[10] = s[40]; d1[10] = s[41]; d2[10] = s[42]; d3[10] = s[43]; - d0[11] = s[44]; d1[11] = s[45]; d2[11] = s[46]; d3[11] = s[47]; - d0[12] = s[48]; d1[12] = s[49]; d2[12] = s[50]; d3[12] = s[51]; - d0[13] = s[52]; d1[13] = s[53]; d2[13] = s[54]; d3[13] = s[55]; - d0[14] = s[56]; d1[14] = s[57]; d2[14] = s[58]; d3[14] = s[59]; - d0[15] = s[60]; d1[15] = s[61]; d2[15] = s[62]; d3[15] = s[63]; -} -*/ - static inline void dintrlv_4x64_512( void *dst0, void *dst1, void *dst2, void *dst3, const void *src ) { @@ -1405,26 +2011,33 @@ static inline void dintrlv_4x64_512( void *dst0, void *dst1, void *dst2, d3[3] = _mm_unpackhi_epi64( s[13], s[15] ); } -/* -static inline void dintrlv_4x64_512( void *dst0, void *dst1, void *dst2, - void *dst3, const void *src ) + +static inline void extr_lane_4x64( void *dst, const void *src, const int lane, + const int bit_len ) { - uint64_t *d0 = (uint64_t*)dst0; - uint64_t *d1 = (uint64_t*)dst1; - uint64_t *d2 = (uint64_t*)dst2; - uint64_t *d3 = (uint64_t*)dst3; - const uint64_t *s = (const uint64_t*)src; - d0[ 0] = s[ 0]; d1[ 0] = s[ 1]; d2[ 0] = s[ 2]; d3[ 0] = s[ 3]; - d0[ 1] = s[ 4]; d1[ 1] = s[ 5]; d2[ 1] = s[ 6]; d3[ 1] = s[ 7]; - d0[ 2] = s[ 8]; d1[ 2] = s[ 9]; d2[ 2] = s[10]; d3[ 2] = s[11]; - d0[ 3] = s[12]; d1[ 3] = s[13]; d2[ 3] = s[14]; d3[ 3] = s[15]; - d0[ 4] = s[16]; d1[ 4] = s[17]; d2[ 4] = s[18]; d3[ 4] = s[19]; - d0[ 5] = s[20]; d1[ 5] = s[21]; d2[ 5] = s[22]; d3[ 5] = s[23]; - d0[ 6] = s[24]; d1[ 6] = s[25]; d2[ 6] = s[26]; d3[ 6] = s[27]; - d0[ 7] = s[28]; d1[ 7] = s[29]; d2[ 7] = s[30]; d3[ 7] = s[31]; + __m128i *d = (__m128i*)dst; + const __m128i *s = (const __m128i*)src; + int i = lane / 2; + if ( lane % 2 ) // odd lanes + { + d[0] = _mm_unpackhi_epi64( s[ i+ 0 ], s[ i+ 2 ] ); + d[1] = _mm_unpackhi_epi64( s[ i+ 4 ], s[ i+ 6 ] ); + if ( bit_len <= 256 ) return; + 
d[2] = _mm_unpackhi_epi64( s[ i+ 8 ], s[ i+10 ] ); + d[3] = _mm_unpackhi_epi64( s[ i+12 ], s[ i+14 ] ); + } + else // even lanes + { + d[0] = _mm_unpacklo_epi64( s[ i+ 0 ], s[ i+ 2 ] ); + d[1] = _mm_unpacklo_epi64( s[ i+ 4 ], s[ i+ 6 ] ); + if ( bit_len <= 256 ) return; + d[2] = _mm_unpacklo_epi64( s[ i+ 8 ], s[ i+10 ] ); + d[3] = _mm_unpacklo_epi64( s[ i+12 ], s[ i+14 ] ); + } + return; // bit_len == 512 } -*/ +/* static inline void extr_lane_4x64( void *d, const void *s, const int lane, const int bit_len ) { @@ -1438,11 +2051,44 @@ static inline void extr_lane_4x64( void *d, const void *s, ((uint64_t*)d)[ 6] = ((const uint64_t*)s)[ lane+24 ]; ((uint64_t*)d)[ 7] = ((const uint64_t*)s)[ lane+28 ]; } +*/ #if defined(__AVX2__) +// Doesn't really need AVX2, just SSSE3, but is only used with AVX2 code. + +static inline void mm256_intrlv80_4x64( void *d, const void *src ) +{ + __m128i s0 = casti_m128i( src,0 ); + __m128i s1 = casti_m128i( src,1 ); + __m128i s2 = casti_m128i( src,2 ); + __m128i s3 = casti_m128i( src,3 ); + __m128i s4 = casti_m128i( src,4 ); -// There a alignment problems with the source buffer on Wwindows, -// can't use 256 bit bswap. + casti_m128i( d, 0 ) = + casti_m128i( d, 1 ) = _mm_shuffle_epi32( s0, 0x44 ); + casti_m128i( d, 2 ) = + casti_m128i( d, 3 ) = _mm_shuffle_epi32( s0, 0xee ); + + casti_m128i( d, 4 ) = + casti_m128i( d, 5 ) = _mm_shuffle_epi32( s1, 0x44 ); + casti_m128i( d, 6 ) = + casti_m128i( d, 7 ) = _mm_shuffle_epi32( s1, 0xee ); + + casti_m128i( d, 8 ) = + casti_m128i( d, 9 ) = _mm_shuffle_epi32( s2, 0x44 ); + casti_m128i( d, 10 ) = + casti_m128i( d, 11 ) = _mm_shuffle_epi32( s2, 0xee ); + + casti_m128i( d, 12 ) = + casti_m128i( d, 13 ) = _mm_shuffle_epi32( s3, 0x44 ); + casti_m128i( d, 14 ) = + casti_m128i( d, 15 ) = _mm_shuffle_epi32( s3, 0xee ); + + casti_m128i( d, 16 ) = + casti_m128i( d, 17 ) = _mm_shuffle_epi32( s4, 0x44 ); + casti_m128i( d, 18 ) = + casti_m128i( d, 19 ) = _mm_shuffle_epi32( s4, 0xee ); +} static inline void mm256_bswap32_intrlv80_4x64( void *d, const void *src ) { @@ -1636,40 +2282,6 @@ static inline void intrlv_8x64_512( void *dst, const void *src0, d[31] = _mm_unpackhi_epi64( s6[3], s7[3] ); } -/* -#define ILEAVE_8x64( i ) do \ -{ \ - uint64_t *d = (uint64_t*)(dst) + ( (i) << 3 ); \ - d[0] = *( (const uint64_t*)(s0) +(i) ); \ - d[1] = *( (const uint64_t*)(s1) +(i) ); \ - d[2] = *( (const uint64_t*)(s2) +(i) ); \ - d[3] = *( (const uint64_t*)(s3) +(i) ); \ - d[4] = *( (const uint64_t*)(s4) +(i) ); \ - d[5] = *( (const uint64_t*)(s5) +(i) ); \ - d[6] = *( (const uint64_t*)(s6) +(i) ); \ - d[7] = *( (const uint64_t*)(s7) +(i) ); \ -} while(0) - -static inline void intrlv_8x64( void *dst, const void *s0, - const void *s1, const void *s2, const void *s3, const void *s4, - const void *s5, const void *s6, const void *s7, int bit_len ) -{ - ILEAVE_8x64( 0 ); ILEAVE_8x64( 1 ); - ILEAVE_8x64( 2 ); ILEAVE_8x64( 3 ); - if ( bit_len <= 256 ) return; - ILEAVE_8x64( 4 ); ILEAVE_8x64( 5 ); - ILEAVE_8x64( 6 ); ILEAVE_8x64( 7 ); - if ( bit_len <= 512 ) return; - ILEAVE_8x64( 8 ); ILEAVE_8x64( 9 ); - if ( bit_len <= 640 ) return; - ILEAVE_8x64( 10 ); ILEAVE_8x64( 11 ); - ILEAVE_8x64( 12 ); ILEAVE_8x64( 13 ); - ILEAVE_8x64( 14 ); ILEAVE_8x64( 15 ); -} - -#undef ILEAVE_8x64 -*/ - static inline void dintrlv_8x64( void *dst0, void *dst1, void *dst2, void *dst3, void *dst4, void *dst5, void *dst6, void *dst7, @@ -1815,39 +2427,32 @@ static inline void dintrlv_8x64_512( void *dst0, void *dst1, void *dst2, d7[3] = _mm_unpackhi_epi64( s[27], s[31] ); } -/* 
-#define DLEAVE_8x64( i ) do \ -{ \ - const uint64_t *s = (const uint64_t*)(src) + ( (i) << 3 ); \ - *( (uint64_t*)(d0) +(i) ) = s[0]; \ - *( (uint64_t*)(d1) +(i) ) = s[1]; \ - *( (uint64_t*)(d2) +(i) ) = s[2]; \ - *( (uint64_t*)(d3) +(i) ) = s[3]; \ - *( (uint64_t*)(d4) +(i) ) = s[4]; \ - *( (uint64_t*)(d5) +(i) ) = s[5]; \ - *( (uint64_t*)(d6) +(i) ) = s[6]; \ - *( (uint64_t*)(d7) +(i) ) = s[7]; \ -} while(0) - -static inline void dintrlv_8x64( void *d0, void *d1, void *d2, void *d3, - void *d4, void *d5, void *d6, void *d7, const void *src, int bit_len ) +static inline void extr_lane_8x64( void *dst, const void *src, const int lane, + const int bit_len ) { - DLEAVE_8x64( 0 ); DLEAVE_8x64( 1 ); - DLEAVE_8x64( 2 ); DLEAVE_8x64( 3 ); - if ( bit_len <= 256 ) return; - DLEAVE_8x64( 4 ); DLEAVE_8x64( 5 ); - DLEAVE_8x64( 6 ); DLEAVE_8x64( 7 ); - if ( bit_len <= 512 ) return; - DLEAVE_8x64( 8 ); DLEAVE_8x64( 9 ); - if ( bit_len <= 640 ) return; - DLEAVE_8x64( 10 ); DLEAVE_8x64( 11 ); - DLEAVE_8x64( 12 ); DLEAVE_8x64( 13 ); - DLEAVE_8x64( 14 ); DLEAVE_8x64( 15 ); + __m128i *d = (__m128i*)dst; + const __m128i *s = (const __m128i*)src; + int i = lane / 2; + if ( lane % 2 ) // odd lanes + { + d[0] = _mm_unpackhi_epi64( s[ i+ 0], s[ i+ 4] ); + d[1] = _mm_unpackhi_epi64( s[ i+ 8], s[ i+12] ); + if ( bit_len <= 256 ) return; + d[2] = _mm_unpackhi_epi64( s[ i+16], s[ i+20] ); + d[3] = _mm_unpackhi_epi64( s[ i+24], s[ i+28] ); + } + else // even lanes + { + d[0] = _mm_unpacklo_epi64( s[ i+ 0], s[ i+ 4] ); + d[1] = _mm_unpacklo_epi64( s[ i+ 8], s[ i+12] ); + if ( bit_len <= 256 ) return; + d[2] = _mm_unpacklo_epi64( s[ i+16], s[ i+20] ); + d[3] = _mm_unpacklo_epi64( s[ i+24], s[ i+28] ); + } + return; } -#undef DLEAVE_8x64 -*/ - +/* static inline void extr_lane_8x64( void *d, const void *s, const int lane, const int bit_len ) { @@ -1861,6 +2466,7 @@ static inline void extr_lane_8x64( void *d, const void *s, ((uint64_t*)d)[ 6] = ((const uint64_t*)s)[ lane+ 48 ]; ((uint64_t*)d)[ 7] = ((const uint64_t*)s)[ lane+ 56 ]; } +*/ #if defined(__AVX512F__) && defined(__AVX512VL__) diff --git a/simd-utils/simd-128.h b/simd-utils/simd-128.h index 8b1fbeba..b5a36ab4 100644 --- a/simd-utils/simd-128.h +++ b/simd-utils/simd-128.h @@ -27,13 +27,22 @@ // All of the utilities here assume all data is in registers except // in rare cases where arguments are pointers. // +// Some constants are generated using a memory overlay on the stack. +// // Intrinsics automatically promote from REX to VEX when AVX is available // but ASM needs to be done manually. // /////////////////////////////////////////////////////////////////////////// -// Efficient and convenient moving bwtween GP & low bits of XMM. +// Used instead if casting. +typedef union +{ + __m128i m128; + uint32_t u32[4]; +} __attribute__ ((aligned (16))) m128_ovly; + +// Efficient and convenient moving between GP & low bits of XMM. // Use VEX when available to give access to xmm8-15 and zero extend for // larger vectors. 
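For reference, the data layout that the extr_lane_4x64 and extr_lane_8x64 helpers above operate on can be modelled with plain scalar code, equivalent to the commented-out versions. This is a minimal standalone sketch, not part of the patch; the function name is local to the example and it assumes bit_len is a multiple of 64.

#include <stdint.h>

// Scalar model of lane extraction from 64 bit interleaved data: the words of
// a lane are stored nlanes apart, so word i of lane L lives at s[ nlanes*i + L ].
static void extr_lane_ref( uint64_t *d, const uint64_t *s, const int lane,
                           const int nlanes, const int bit_len )
{
   for ( int i = 0; i < bit_len/64; i++ )
      d[i] = s[ nlanes*i + lane ];
}

// extr_lane_4x64( d, s, lane, 256 ) produces the same bytes as
// extr_lane_ref( d, s, lane, 4, 256 ); extr_lane_8x64 uses nlanes = 8.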
@@ -59,7 +68,10 @@ static inline __m128i mm128_mov32_128( const uint32_t n ) return a; } -static inline uint64_t mm128_mov128_64( const __m128i a ) +// Inconstant naming, prefix should reflect return value: +// u64_mov128_64 + +static inline uint64_t u64_mov128_64( const __m128i a ) { uint64_t n; #if defined(__AVX__) @@ -70,7 +82,7 @@ static inline uint64_t mm128_mov128_64( const __m128i a ) return n; } -static inline uint32_t mm128_mov128_32( const __m128i a ) +static inline uint32_t u32_mov128_32( const __m128i a ) { uint32_t n; #if defined(__AVX__) @@ -81,6 +93,23 @@ static inline uint32_t mm128_mov128_32( const __m128i a ) return n; } +// Equivalent of set1, broadcast integer to all elements. +#define m128_const_i128( i ) mm128_mov64_128( i ) +#define m128_const1_64( i ) _mm_shuffle_epi32( mm128_mov64_128( i ), 0x44 ) +#define m128_const1_32( i ) _mm_shuffle_epi32( mm128_mov32_128( i ), 0x00 ) + +#if defined(__SSE4_1__) + +// Assign 64 bit integers to respective elements: {hi, lo} +#define m128_const_64( hi, lo ) \ + _mm_insert_epi64( mm128_mov64_128( lo ), hi, 1 ) + +#else // No insert in SSE2 + +#define m128_const_64 _mm_set_epi64x + +#endif + // Pseudo constants #define m128_zero _mm_setzero_si128() @@ -107,44 +136,70 @@ static inline __m128i mm128_neg1_fn() } #define m128_neg1 mm128_neg1_fn() +#if defined(__SSE4_1__) -// const functions work best when arguments are immediate constants or -// are known to be in registers. If data needs to loaded from memory or cache -// use set. +///////////////////////////// +// +// _mm_insert_ps( _mm128i v1, __m128i v2, imm8 c ) +// +// Fast and powerful but very limited in its application. +// It requires SSE4.1 but only works with 128 bit vectors with 32 bit +// elements. There is no equivalent instruction for 256 bit or 512 bit vectors. +// There's no integer version. There's no 64 bit, 16 bit or byte element +// sizing. It's unique. +// +// It can: +// - zero 32 bit elements of a 128 bit vector. +// - extract any 32 bit element from one 128 bit vector and insert the +// data to any 32 bit element of another 128 bit vector, or the same vector. +// - do both simultaneoulsly. +// +// It can be used as a more efficient replacement for _mm_insert_epi32 +// or _mm_extract_epi32. +// +// Control byte definition: +// c[3:0] zero mask +// c[5:4] destination element selector +// c[7:6] source element selector -// Equivalent of set1, broadcast 64 bit integer to all elements. -#define m128_const1_64( i ) _mm_shuffle_epi32( mm128_mov64_128( i ), 0x44 ) -#define m128_const1_32( i ) _mm_shuffle_epi32( mm128_mov32_128( i ), 0x00 ) +// Convert type and abbreviate name: e"x"tract "i"nsert "m"ask +#define mm128_xim_32( v1, v2, c ) \ + _mm_castps_si128( _mm_insert_ps( _mm_castsi128_ps( v1 ), \ + _mm_castsi128_ps( v2 ), c ) ) -#if defined(__SSE4_1__) +// Some examples of simple operations: -// Assign 64 bit integers to respective elements: {hi, lo} -#define m128_const_64( hi, lo ) \ - _mm_insert_epi64( mm128_mov64_128( lo ), hi, 1 ) +// Insert 32 bit integer into v at element c and return modified v. +static inline __m128i mm128_insert_32( const __m128i v, const uint32_t i, + const int c ) +{ return mm128_xim_32( v, mm128_mov32_128( i ), c<<4 ); } -#else // No insert in SSE2 +// Extract 32 bit element c from v and return as integer. +static inline uint32_t mm128_extract_32( const __m128i v, const int c ) +{ return u32_mov128_32( mm128_xim_32( v, v, c<<6 ) ); } -#define m128_const_64 _mm_set_epi64x +// Clear (zero) 32 bit elements based on bits set in 4 bit mask. 
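As a rough standalone check of the _mm_insert_ps control byte layout described above (source element selector in c[7:6], destination element selector in c[5:4], zero mask in c[3:0]), the sketch below copies one 32 bit element between two vectors. It assumes SSE4.1 (compile with -msse4.1); the macro and variable names are local to the example.

#include <immintrin.h>
#include <stdint.h>
#include <stdio.h>

// Same cast wrapper as mm128_xim_32: v1 is the destination, v2 the source.
#define xim_32( v1, v2, c ) \
   _mm_castps_si128( _mm_insert_ps( _mm_castsi128_ps( v1 ), \
                                    _mm_castsi128_ps( v2 ), c ) )

int main()
{
   __m128i a = _mm_set_epi32( 40, 30, 20, 10 );   // a = { 10, 20, 30, 40 }
   __m128i b = _mm_set_epi32(  4,  3,  2,  1 );   // b = {  1,  2,  3,  4 }
   // Copy element 3 of b (value 4) into element 1 of a: c = (1<<4) | (3<<6).
   __m128i r = xim_32( a, b, (1<<4) | (3<<6) );
   uint32_t out[4];
   _mm_storeu_si128( (__m128i*)out, r );
   printf( "%u %u %u %u\n", out[0], out[1], out[2], out[3] );  // 10 4 30 40
   return 0;
}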
+static inline __m128i mm128_mask_32( const __m128i v, const int m ) +{ return mm128_xim_32( v, v, m ); } -#endif +// Move element i2 of v2 to element i1 of v1. For reference and convenience, +// it's faster to precalculate the index. +#define mm128_shuflmov_32( v1, i1, v2, i2 ) \ + mm128_xim_32( v1, v2, ( (i1)<<4 ) | ( (i2)<<6 ) ) +#endif // SSE4_1 // // Basic operations without equivalent SIMD intrinsic // Bitwise not (~v) -#define mm128_not( v ) _mm_xor_si128( (v), m128_neg1 ) +#define mm128_not( v ) _mm_xor_si128( v, m128_neg1 ) // Unary negation of elements (-v) #define mm128_negate_64( v ) _mm_sub_epi64( m128_zero, v ) #define mm128_negate_32( v ) _mm_sub_epi32( m128_zero, v ) #define mm128_negate_16( v ) _mm_sub_epi16( m128_zero, v ) -// Clear (zero) 32 bit elements based on bits set in 4 bit mask. -// Fast, avoids using vector mask, but only available for 128 bit vectors. -#define mm128_mask_32( a, mask ) \ - _mm_castps_si128( _mm_insert_ps( _mm_castsi128_ps( a ), \ - _mm_castsi128_ps( a ), mask ) ) // Add 4 values, fewer dependencies than sequential addition. #define mm128_add4_64( a, b, c, d ) \ @@ -162,27 +217,6 @@ static inline __m128i mm128_neg1_fn() #define mm128_xor4( a, b, c, d ) \ _mm_xor_si128( _mm_xor_si128( a, b ), _mm_xor_si128( c, d ) ) -// Horizontal vector testing - -#if defined(__SSE4_1__) - -#define mm128_allbits0( a ) _mm_testz_si128( a, a ) -#define mm128_allbits1( a ) _mm_testc_si128( a, m128_neg1 ) -// probably broken, avx2 is -//#define mm128_allbitsne( a ) _mm_testnzc_si128( a, m128_neg1 ) -#define mm128_anybits0( a ) mm128_allbits1( a ) -#define mm128_anybits1( a ) mm128_allbits0( a ) - -#else // SSE2 - -// Bit-wise test of entire vector, useful to test results of cmp. -#define mm128_anybits0( a ) (uint128_t)(a) -#define mm128_anybits1( a ) (((uint128_t)(a))+1) - -#define mm128_allbits0( a ) ( !mm128_anybits1(a) ) -#define mm128_allbits1( a ) ( !mm128_anybits0(a) ) - -#endif // SSE4.1 else SSE2 // // Vector pointer cast @@ -204,11 +238,6 @@ static inline __m128i mm128_neg1_fn() #define casto_m128i(p,o) (((__m128i*)(p))+(o)) -// Memory functions -// Mostly for convenience, avoids calculating bytes. -// Assumes data is alinged and integral. -// n = number of __m128i, bytes/16 - // Memory functions // Mostly for convenience, avoids calculating bytes. // Assumes data is alinged and integral. @@ -223,6 +252,56 @@ static inline void memset_128( __m128i *dst, const __m128i a, const int n ) static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n ) { for ( int i = 0; i < n; i ++ ) dst[i] = src[i]; } +#if defined(__AVX512VL__) + +// a ^ b ^ c +#define mm128_xor3( a, b, c ) \ + _mm_ternarylogic_epi64( a, b, c, 0x96 ) + +// a ^ ( b & c ) +#define mm128_xorand( a, b, c ) \ + _mm_ternarylogic_epi64( a, b, c, 0x78 ) + +#else + +#define mm128_xor3( a, b, c ) \ + _mm_xor_si128( a, _mm_xor_si128( b, c ) ) + +#define mm128_xorand( a, b, c ) \ + _mm_xor_si128( a, _mm_and_si128( b, c ) ) + +#endif + +// Mask making + +// Equivalent of AVX512 _mm_movepi64_mask & _mm_movepi32_mask. +// Returns 2 or 4 bit integer mask from MSB of 64 or 32 bit elements. 
+ +#define mm_movmask_64( v ) \ + _mm_castpd_si128( _mm_movmask_pd( _mm_castsi128_pd( v ) ) ) + +#define mm_movmask_32( v ) \ + _mm_castps_si128( _mm_movmask_ps( _mm_castsi128_ps( v ) ) ) + + +// Diagonal blend + +// Blend 4 32 bit elements from 4 vectors + +#if defined (__AVX2__) + +#define mm128_diagonal_32( v3, v2, v1, v0 ) \ + mm_blend_epi32( _mm_blend_epi32( s3, s2, 0x4 ), \ + _mm_blend_epi32( s1, s0, 0x1 ), 0x3 ) + +#elif defined(__SSE4_1__) + +#define mm128_diagonal_32( v3, v2, v1, v0 ) \ + mm_blend_epi16( _mm_blend_epi16( s3, s2, 0x30 ), \ + _mm_blend_epi16( s1, s0, 0x03 ), 0x0f ) + +#endif + // // Bit rotations @@ -230,6 +309,10 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n ) // AVX512VL has implemented bit rotation for 128 bit vectors with // 64 and 32 bit elements. +// x2 rotates elements in 2 individual vectors in a double buffered +// optimization for SSE2, does nothing for AVX512 but is there for +// transparency. + // compiler doesn't like when a variable is used for the last arg of // _mm_rol_epi32, must be "8 bit immediate". Oddly _mm_slli has the same // specification but works with a variable. Therefore use rol_var where @@ -249,21 +332,78 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n ) _mm_or_si128( _mm_slli_epi32( v, c ), _mm_srli_epi32( v, 32-(c) ) ) -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(__AVX512VL__) +//#if defined(__AVX512F__) && defined(__AVX512VL__) #define mm128_ror_64 _mm_ror_epi64 #define mm128_rol_64 _mm_rol_epi64 #define mm128_ror_32 _mm_ror_epi32 #define mm128_rol_32 _mm_rol_epi32 -#else +#define mm128_rorx2_64( v1, v0, c ) \ + _mm_ror_epi64( v0, c ); \ + _mm_ror_epi64( v1, c ) + +#define mm128_rolx2_64( v1, v0, c ) \ + _mm_rol_epi64( v0, c ); \ + _mm_rol_epi64( v1, c ) + +#define mm128_rorx2_32( v1, v0, c ) \ + _mm_ror_epi32( v0, c ); \ + _mm_ror_epi32( v1, c ) + +#define mm128_rolx2_32( v1, v0, c ) \ + _mm_rol_epi32( v0, c ); \ + _mm_rol_epi32( v1, c ) + +#else // SSE2 #define mm128_ror_64 mm128_ror_var_64 #define mm128_rol_64 mm128_rol_var_64 #define mm128_ror_32 mm128_ror_var_32 #define mm128_rol_32 mm128_rol_var_32 -#endif // AVX512 else +#define mm128_rorx2_64( v1, v0, c ) \ +{ \ + __m128i t0 = _mm_srli_epi64( v0, c ); \ + __m128i t1 = _mm_srli_epi64( v1, c ); \ + v0 = _mm_slli_epi64( v0, 64-(c) ); \ + v1 = _mm_slli_epi64( v1, 64-(c) ); \ + v0 = _mm_or_si256( v0, t0 ); \ + v1 = _mm_or_si256( v1, t1 ); \ +} + +#define mm128_rolx2_64( v1, v0, c ) \ +{ \ + __m128i t0 = _mm_slli_epi64( v0, c ); \ + __m128i t1 = _mm_slli_epi64( v1, c ); \ + v0 = _mm_srli_epi64( v0, 64-(c) ); \ + v1 = _mm_srli_epi64( v1, 64-(c) ); \ + v0 = _mm_or_si256( v0, t0 ); \ + v1 = _mm_or_si256( v1, t1 ); \ +} + +#define mm128_rorx2_32( v1, v0, c ) \ +{ \ + __m128i t0 = _mm_srli_epi32( v0, c ); \ + __m128i t1 = _mm_srli_epi32( v1, c ); \ + v0 = _mm_slli_epi32( v0, 32-(c) ); \ + v1 = _mm_slli_epi32( v1, 32-(c) ); \ + v0 = _mm_or_si256( v0, t0 ); \ + v1 = _mm_or_si256( v1, t1 ); \ +} + +#define mm128_rolx2_32( v1, v0, c ) \ +{ \ + __m128i t0 = _mm_slli_epi32( v0, c ); \ + __m128i t1 = _mm_slli_epi32( v1, c ); \ + v0 = _mm_srli_epi32( v0, 32-(c) ); \ + v1 = _mm_srli_epi32( v1, 32-(c) ); \ + v0 = _mm_or_si256( v0, t0 ); \ + v1 = _mm_or_si256( v1, t1 ); \ +} + +#endif // AVX512 else SSE2 #define mm128_ror_16( v, c ) \ _mm_or_si128( _mm_srli_epi16( v, c ), _mm_slli_epi16( v, 16-(c) ) ) @@ -271,67 +411,41 @@ static inline void memcpy_128( __m128i 
*dst, const __m128i *src, const int n ) #define mm128_rol_16( v, c ) \ _mm_or_si128( _mm_slli_epi16( v, c ), _mm_srli_epi16( v, 16-(c) ) ) -// -// Rotate vector elements accross all lanes - -#define mm128_swap_64( v ) _mm_shuffle_epi32( v, 0x4e ) -#define mm128_ror_1x32( v ) _mm_shuffle_epi32( v, 0x39 ) -#define mm128_rol_1x32( v ) _mm_shuffle_epi32( v, 0x93 ) -//#define mm128_swap_64( v ) _mm_alignr_epi8( v, v, 8 ) -//#define mm128_ror_1x32( v ) _mm_alignr_epi8( v, v, 4 ) -//#define mm128_rol_1x32( v ) _mm_alignr_epi8( v, v, 12 ) -#define mm128_ror_1x16( v ) _mm_alignr_epi8( v, v, 2 ) -#define mm128_rol_1x16( v ) _mm_alignr_epi8( v, v, 14 ) -#define mm128_ror_1x8( v ) _mm_alignr_epi8( v, v, 1 ) -#define mm128_rol_1x8( v ) _mm_alignr_epi8( v, v, 15 ) - -// Rotate by c bytes -#define mm128_ror_x8( v, c ) _mm_alignr_epi8( v, c ) -#define mm128_rol_x8( v, c ) _mm_alignr_epi8( v, 16-(c) ) +// Limited 2 input shuffle +#define mm128_shuffle2_64( a, b, c ) \ + _mm_castpd_si128( _mm_shuffle_pd( _mm_castsi128_pd( a ), \ + _mm_castsi128_pd( b ), c ) ); +#define mm128_shuffle2_32( a, b, c ) \ + _mm_castps_si128( _mm_shuffle_ps( _mm_castsi128_ps( a ), \ + _mm_castsi128_ps( b ), c ) ); -// Invert vector: {3,2,1,0} -> {0,1,2,3} -#define mm128_invert_32( v ) _mm_shuffle_epi32( v, 0x1b ) -#if defined(__SSSE3__) - -#define mm128_invert_16( v ) \ - _mm_shuffle_epi8( v, mm128_const_64( 0x0100030205040706, \ - 0x09080b0a0d0c0f0e ) -#define mm128_invert_8( v ) \ - _mm_shuffle_epi8( v, mm128_const_64( 0x0001020304050607, \ - 0x08090a0b0c0d0e0f ) +// +// Rotate vector elements accross all lanes -#endif // SSSE3 +#define mm128_swap_64( v ) _mm_shuffle_epi32( v, 0x4e ) +#define mm128_shuflr_64 mm128_swap_64 +#define mm128_shufll_64 mm128_swap_64 +#define mm128_shuflr_32( v ) _mm_shuffle_epi32( v, 0x39 ) +#define mm128_shufll_32( v ) _mm_shuffle_epi32( v, 0x93 ) -// -// Rotate elements within lanes. +// Swap 32 bit elements in 64 bit lanes #define mm128_swap64_32( v ) _mm_shuffle_epi32( v, 0xb1 ) +#define mm128_shuflr64_32 mm128_swap64_32 +#define mm128_shufll64_32 mm128_swap64_32 -#define mm128_rol64_8( v, c ) \ - _mm_or_si128( _mm_slli_epi64( v, ( ( (c)<<3 ) ), \ - _mm_srli_epi64( v, ( ( 64 - ( (c)<<3 ) ) ) ) - -#define mm128_ror64_8( v, c ) \ - _mm_or_si128( _mm_srli_epi64( v, ( ( (c)<<3 ) ), \ - _mm_slli_epi64( v, ( ( 64 - ( (c)<<3 ) ) ) ) - -#define mm128_rol32_8( v, c ) \ - _mm_or_si128( _mm_slli_epi32( v, ( ( (c)<<3 ) ), \ - _mm_srli_epi32( v, ( ( 32 - ( (c)<<3 ) ) ) ) +#if defined(__SSSE3__) -#define mm128_ror32_8( v, c ) \ - _mm_or_si128( _mm_srli_epi32( v, ( ( (c)<<3 ) ), \ - _mm_slli_epi32( v, ( ( 32 - ( (c)<<3 ) ) ) ) - +// Rotate right by c bytes, no SSE2 equivalent. +static inline __m128i mm128_shuflr_x8( const __m128i v, const int c ) +{ return _mm_alignr_epi8( v, v, c ); } // // Endian byte swap. -#if defined(__SSSE3__) - #define mm128_bswap_64( v ) \ _mm_shuffle_epi8( v, m128_const_64( 0x08090a0b0c0d0e0f, \ 0x0001020304050607 ) ) @@ -374,7 +488,6 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n ) #else // SSE2 -// Use inline function instead of macro due to multiple statements. static inline __m128i mm128_bswap_64( __m128i v ) { v = _mm_or_si128( _mm_slli_epi16( v, 8 ), _mm_srli_epi16( v, 8 ) ); @@ -431,59 +544,86 @@ static inline void mm128_block_bswap_32( __m128i *d, const __m128i *s ) v1 = _mm_xor_si128( v1, v2 ); +// Two input shuffle-rotate. // Concatenate v1 & v2 and rotate as one 256 bit vector. 
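The two input shuffle-rotate treats a pair of 128 bit vectors as one logical 256 bit value. The sketch below is a standalone illustration of the right rotate by one 64 bit element, the same palignr pattern used by the vror256_64 macro that follows; it assumes SSSE3 (compile with -mssse3) and the variable names are local to the example.

#include <immintrin.h>
#include <stdint.h>
#include <stdio.h>

int main()
{
   // Logical 256 bit value { v2, v1 } holds the 64 bit elements [ 3 2 1 0 ].
   __m128i v1 = _mm_set_epi64x( 1, 0 );           // low half
   __m128i v2 = _mm_set_epi64x( 3, 2 );           // high half
   __m128i t  = _mm_alignr_epi8( v1, v2, 8 );     // becomes the new high half
   v1 = _mm_alignr_epi8( v2, v1, 8 );             // new low half
   v2 = t;
   uint64_t out[4];
   _mm_storeu_si128( (__m128i*)out,       v1 );
   _mm_storeu_si128( (__m128i*)(out + 2), v2 );
   // Rotated value is [ 0 3 2 1 ], printed low element first: 1 2 3 0
   printf( "%llu %llu %llu %llu\n", (unsigned long long)out[0],
           (unsigned long long)out[1], (unsigned long long)out[2],
           (unsigned long long)out[3] );
   return 0;
}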
-#if defined(__SSE4_1__) +// Continue to use vror/vrol for now to avoid confusion with +// shufl2r/shufl2l function macros available with AVX512. + +#if defined(__SSSE3__) + +// Function macro with two inputs and one output, inputs are preserved. +// Two input functions are not available without SSSE3. Use procedure +// macros below instead. + +#define mm128_shufl2r_64( v1, v2 ) _mm_alignr_epi8( v2, v1, 8 ) +#define mm128_shufl2l_64( v1, v2 ) _mm_alignr_epi8( v1, v2, 8 ) + +#define mm128_shufl2r_32( v1, v2 ) _mm_alignr_epi8( v2, v1, 4 ) +#define mm128_shufl2l_32( v1, v2 ) _mm_alignr_epi8( v1, v2, 4 ) + +#define mm128_shufl2r_16( v1, v2 ) _mm_alignr_epi8( v2, v1, 2 ) +#define mm128_shufl2l_16( v1, v2 ) _mm_alignr_epi8( v1, v2, 2 ) + +#define mm128_shufl2r_8( v1, v2 ) _mm_alignr_epi8( v2, v1, 8 ) +#define mm128_shufl2l_8( v1, v2 ) _mm_alignr_epi8( v1, v2, 8 ) + +// Procedure macros with 2 inputs and 2 outputs, inputs args are overwritten. + +// These macros retain the vrol/vror name for now to avoid +// confusion with the shufl2r/shuffle2l function macros above. +// These may be renamed to something like shufl2r2 for 2 nputs and +// 2 outputs, ie SHUFfLe 2 inputs Right with 2 outputs. -#define mm128_ror256_64( v1, v2 ) \ +#define mm128_vror256_64( v1, v2 ) \ do { \ __m128i t = _mm_alignr_epi8( v1, v2, 8 ); \ v1 = _mm_alignr_epi8( v2, v1, 8 ); \ v2 = t; \ } while(0) -#define mm128_rol256_64( v1, v2 ) \ +#define mm128_vrol256_64( v1, v2 ) \ do { \ __m128i t = _mm_alignr_epi8( v1, v2, 8 ); \ v2 = _mm_alignr_epi8( v2, v1, 8 ); \ v1 = t; \ } while(0) -#define mm128_ror256_32( v1, v2 ) \ +#define mm128_vror256_32( v1, v2 ) \ do { \ __m128i t = _mm_alignr_epi8( v1, v2, 4 ); \ v1 = _mm_alignr_epi8( v2, v1, 4 ); \ v2 = t; \ } while(0) -#define mm128_rol256_32( v1, v2 ) \ +#define mm128_vrol256_32( v1, v2 ) \ do { \ __m128i t = _mm_alignr_epi8( v1, v2, 12 ); \ v2 = _mm_alignr_epi8( v2, v1, 12 ); \ v1 = t; \ } while(0) -#define mm128_ror256_16( v1, v2 ) \ +#define mm128_vror256_16( v1, v2 ) \ do { \ __m128i t = _mm_alignr_epi8( v1, v2, 2 ); \ v1 = _mm_alignr_epi8( v2, v1, 2 ); \ v2 = t; \ } while(0) -#define mm128_rol256_16( v1, v2 ) \ +#define mm128_vrol256_16( v1, v2 ) \ do { \ __m128i t = _mm_alignr_epi8( v1, v2, 14 ); \ v2 = _mm_alignr_epi8( v2, v1, 14 ); \ v1 = t; \ } while(0) -#define mm128_ror256_8( v1, v2 ) \ +#define mm128_vror256_8( v1, v2 ) \ do { \ __m128i t = _mm_alignr_epi8( v1, v2, 1 ); \ v1 = _mm_alignr_epi8( v2, v1, 1 ); \ v2 = t; \ } while(0) -#define mm128_rol256_8( v1, v2 ) \ +#define mm128_vrol256_8( v1, v2 ) \ do { \ __m128i t = _mm_alignr_epi8( v1, v2, 15 ); \ v2 = _mm_alignr_epi8( v2, v1, 15 ); \ @@ -492,7 +632,7 @@ do { \ #else // SSE2 -#define mm128_ror256_64( v1, v2 ) \ +#define mm128_vror256_64( v1, v2 ) \ do { \ __m128i t = _mm_or_si128( _mm_srli_si128( v1, 8 ), \ _mm_slli_si128( v2, 8 ) ); \ @@ -501,7 +641,7 @@ do { \ v1 = t; \ } while(0) -#define mm128_rol256_64( v1, v2 ) \ +#define mm128_vrol256_64( v1, v2 ) \ do { \ __m128i t = _mm_or_si128( _mm_slli_si128( v1, 8 ), \ _mm_srli_si128( v2, 8 ) ); \ @@ -510,7 +650,7 @@ do { \ v1 = t; \ } while(0) -#define mm128_ror256_32( v1, v2 ) \ +#define mm128_vror256_32( v1, v2 ) \ do { \ __m128i t = _mm_or_si128( _mm_srli_si128( v1, 4 ), \ _mm_slli_si128( v2, 12 ) ); \ @@ -519,7 +659,7 @@ do { \ v1 = t; \ } while(0) -#define mm128_rol256_32( v1, v2 ) \ +#define mm128_vrol256_32( v1, v2 ) \ do { \ __m128i t = _mm_or_si128( _mm_slli_si128( v1, 4 ), \ _mm_srli_si128( v2, 12 ) ); \ @@ -528,7 +668,7 @@ do { \ v1 = t; \ } while(0) -#define 
mm128_ror256_16( v1, v2 ) \ +#define mm128_vror256_16( v1, v2 ) \ do { \ __m128i t = _mm_or_si128( _mm_srli_si128( v1, 2 ), \ _mm_slli_si128( v2, 14 ) ); \ @@ -537,7 +677,7 @@ do { \ v1 = t; \ } while(0) -#define mm128_rol256_16( v1, v2 ) \ +#define mm128_vrol256_16( v1, v2 ) \ do { \ __m128i t = _mm_or_si128( _mm_slli_si128( v1, 2 ), \ _mm_srli_si128( v2, 14 ) ); \ @@ -546,7 +686,7 @@ do { \ v1 = t; \ } while(0) -#define mm128_ror256_8( v1, v2 ) \ +#define mm128_vror256_8( v1, v2 ) \ do { \ __m128i t = _mm_or_si128( _mm_srli_si128( v1, 1 ), \ _mm_slli_si128( v2, 15 ) ); \ @@ -555,7 +695,7 @@ do { \ v1 = t; \ } while(0) -#define mm128_rol256_8( v1, v2 ) \ +#define mm128_vrol256_8( v1, v2 ) \ do { \ __m128i t = _mm_or_si128( _mm_slli_si128( v1, 1 ), \ _mm_srli_si128( v2, 15 ) ); \ diff --git a/simd-utils/simd-256.h b/simd-utils/simd-256.h index 5f94cbc8..bede65c7 100644 --- a/simd-utils/simd-256.h +++ b/simd-utils/simd-256.h @@ -1,7 +1,7 @@ #if !defined(SIMD_256_H__) #define SIMD_256_H__ 1 -#if defined(__AVX2__) +//#if defined(__AVX2__) ///////////////////////////////////////////////////////////////////// // @@ -14,34 +14,75 @@ // is limited because 256 bit vectors are less likely to be used when 512 // is available. +#if defined(__AVX__) + +// Used instead of casting. +typedef union +{ + __m256i m256; + __m128i m128[2]; + uint64_t u64[4]; + uint32_t u32[8]; +} __attribute__ ((aligned (32))) m256_ovly; + +// +// Pointer casting + +// p = any aligned pointer +// returns p as pointer to vector type, not very useful +#define castp_m256i(p) ((__m256i*)(p)) + +// p = any aligned pointer +// returns *p, watch your pointer arithmetic +#define cast_m256i(p) (*((__m256i*)(p))) + +// p = any aligned pointer, i = scaled array index +// returns value p[i] +#define casti_m256i(p,i) (((__m256i*)(p))[(i)]) + +// p = any aligned pointer, o = scaled offset +// returns pointer p+o +#define casto_m256i(p,o) (((__m256i*)(p))+(o)) + +#endif +#if defined(__AVX2__) + + // Move integer to low element of vector, other elements are set to zero. +#define mm256_mov64_256( i ) _mm256_castsi128_si256( mm128_mov64_128( i ) ) +#define mm256_mov32_256( i ) _mm256_castsi128_si256( mm128_mov32_128( i ) ) + +// Move low element of vector to integer. +#define u64_mov256_64( v ) u64_mov128_64( _mm256_castsi256_si128( v ) ) +#define u32_mov256_32( v ) u32_mov128_32( _mm256_castsi256_si128( v ) ) -#define mm256_mov64_256( n ) _mm256_castsi128_si256( mm128_mov64_128( n ) ) -#define mm256_mov32_256( n ) _mm256_castsi128_si256( mm128_mov32_128( n ) ) +// deprecated +//#define mm256_mov256_64 u64_mov256_64 +//#define mm256_mov256_32 u32_mov256_32 -#define mm256_mov256_64( a ) mm128_mov128_64( _mm256_castsi256_si128( a ) ) -#define mm256_mov256_32( a ) mm128_mov128_32( _mm256_castsi256_si128( a ) ) // concatenate two 128 bit vectors into one 256 bit vector: { hi, lo } #define mm256_concat_128( hi, lo ) \ _mm256_inserti128_si256( _mm256_castsi128_si256( lo ), hi, 1 ) -// Equavalent of set, move 64 bit integer constants to respective 64 bit +// Equivalent of set, move 64 bit integer constants to respective 64 bit // elements. 
static inline __m256i m256_const_64( const uint64_t i3, const uint64_t i2, const uint64_t i1, const uint64_t i0 ) { - __m128i hi, lo; - lo = mm128_mov64_128( i0 ); - hi = mm128_mov64_128( i2 ); - lo = _mm_insert_epi64( lo, i1, 1 ); - hi = _mm_insert_epi64( hi, i3, 1 ); - return mm256_concat_128( hi, lo ); + union { __m256i m256i; + uint64_t u64[4]; } v; + v.u64[0] = i0; v.u64[1] = i1; v.u64[2] = i2; v.u64[3] = i3; + return v.m256i; } -// Equivalent of set1, broadcast integer constant to all elements. -#define m256_const1_128( v ) _mm256_broadcastsi128_si256( v ) +// Equivalent of set1. +// 128 bit vector argument +#define m256_const1_128( v ) \ + _mm256_permute4x64_epi64( _mm256_castsi128_si256( v ), 0x44 ) +// 64 bit integer argument zero extended to 128 bits. +#define m256_const1_i128( i ) m256_const1_128( mm128_mov64_128( i ) ) #define m256_const1_64( i ) _mm256_broadcastq_epi64( mm128_mov64_128( i ) ) #define m256_const1_32( i ) _mm256_broadcastd_epi32( mm128_mov32_128( i ) ) #define m256_const1_16( i ) _mm256_broadcastw_epi16( mm128_mov32_128( i ) ) @@ -50,189 +91,195 @@ static inline __m256i m256_const_64( const uint64_t i3, const uint64_t i2, #define m256_const2_64( i1, i0 ) \ m256_const1_128( m128_const_64( i1, i0 ) ) -#define m126_const2_32( i1, i0 ) \ - m256_const1_64( ( (uint64_t)(i1) << 32 ) | ( (uint64_t)(i0) & 0xffffffff ) ) - - // // All SIMD constant macros are actually functions containing executable // code and therefore can't be used as compile time initializers. -#define m256_zero _mm256_setzero_si256() -#define m256_one_256 mm256_mov64_256( 1 ) -#define m256_one_128 \ - _mm256_permute4x64_epi64( _mm256_castsi128_si256( \ - mm128_mov64_128( 1 ) ), 0x44 ) -#define m256_one_64 _mm256_broadcastq_epi64( mm128_mov64_128( 1 ) ) -#define m256_one_32 _mm256_broadcastd_epi32( mm128_mov64_128( 1 ) ) -#define m256_one_16 _mm256_broadcastw_epi16( mm128_mov64_128( 1 ) ) -#define m256_one_8 _mm256_broadcastb_epi8 ( mm128_mov64_128( 1 ) ) +#define m256_zero _mm256_setzero_si256() +#define m256_one_256 mm256_mov64_256( 1 ) +#define m256_one_128 m256_const1_i128( 1 ) +#define m256_one_64 _mm256_broadcastq_epi64( mm128_mov64_128( 1 ) ) +#define m256_one_32 _mm256_broadcastd_epi32( mm128_mov64_128( 1 ) ) +#define m256_one_16 _mm256_broadcastw_epi16( mm128_mov64_128( 1 ) ) +#define m256_one_8 _mm256_broadcastb_epi8 ( mm128_mov64_128( 1 ) ) static inline __m256i mm256_neg1_fn() { - __m256i a; - asm( "vpcmpeqq %0, %0, %0\n\t" : "=x"(a) ); - return a; + __m256i v; + asm( "vpcmpeqq %0, %0, %0\n\t" : "=x"(v) ); + return v; } #define m256_neg1 mm256_neg1_fn() +// Consistent naming for similar operations. +#define mm128_extr_lo128_256( v ) _mm256_castsi256_si128( v ) +#define mm128_extr_hi128_256( v ) _mm256_extracti128_si256( v, 1 ) // -// Vector size conversion. -// -// Allows operations on either or both halves of a 256 bit vector serially. -// Handy for parallel AES. -// Caveats when writing: -// _mm256_castsi256_si128 is free and without side effects. -// _mm256_castsi128_si256 is also free but leaves the high half -// undefined. That's ok if the hi half will be subseqnently assigned. -// If assigning both, do lo first, If assigning only 1, use -// _mm256_inserti128_si256. 
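The new m256_const_64 above builds the vector through a stack overlay union instead of a chain of inserts. Below is a minimal sketch of the technique, assuming AVX2 (compile with -mavx2); the function name is local to the example and the result is checked against the equivalent _mm256_set_epi64x.

#include <immintrin.h>
#include <stdint.h>
#include <stdio.h>

static inline __m256i const_64_ovly( const uint64_t i3, const uint64_t i2,
                                     const uint64_t i1, const uint64_t i0 )
{
   union { __m256i m256i; uint64_t u64[4]; } v;
   v.u64[0] = i0; v.u64[1] = i1; v.u64[2] = i2; v.u64[3] = i3;
   return v.m256i;              // compiles to a single 32 byte load
}

int main()
{
   __m256i a = const_64_ovly( 7, 5, 3, 1 );
   __m256i b = _mm256_set_epi64x( 7, 5, 3, 1 );   // reference
   // movemask is -1 (every byte equal) when the two vectors match.
   printf( "match: %d\n",
           _mm256_movemask_epi8( _mm256_cmpeq_epi64( a, b ) ) == -1 );
   return 0;
}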
+// Memory functions +// n = number of 256 bit (32 byte) vectors + +static inline void memset_zero_256( __m256i *dst, const int n ) +{ for ( int i = 0; i < n; i++ ) dst[i] = m256_zero; } + +static inline void memset_256( __m256i *dst, const __m256i a, const int n ) +{ for ( int i = 0; i < n; i++ ) dst[i] = a; } + +static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n ) +{ for ( int i = 0; i < n; i ++ ) dst[i] = src[i]; } + + // -#define mm128_extr_lo128_256( a ) _mm256_castsi256_si128( a ) -#define mm128_extr_hi128_256( a ) _mm256_extracti128_si256( a, 1 ) - -// Extract integers from 256 bit vector, ineficient, avoid if possible.. -#define mm256_extr_4x64( a3, a2, a1, a0, src ) \ -do { \ - __m128i hi = _mm256_extracti128_si256( src, 1 ); \ - a0 = mm128_mov128_64( _mm256_castsi256_si128( src) ); \ - a1 = _mm_extract_epi64( _mm256_castsi256_si128( src ), 1 ); \ - a2 = mm128_mov128_64( hi ); \ - a3 = _mm_extract_epi64( hi, 1 ); \ -} while(0) +// Basic operations without SIMD equivalent -#define mm256_extr_8x32( a7, a6, a5, a4, a3, a2, a1, a0, src ) \ -do { \ - uint64_t t = _mm_extract_epi64( _mm256_castsi256_si128( src ), 1 ); \ - __m128i hi = _mm256_extracti128_si256( src, 1 ); \ - a0 = mm256_mov256_32( src ); \ - a1 = _mm_extract_epi32( _mm256_castsi256_si128( src ), 1 ); \ - a2 = (uint32_t)( t ); \ - a3 = (uint32_t)( t<<32 ); \ - t = _mm_extract_epi64( hi, 1 ); \ - a4 = mm128_mov128_32( hi ); \ - a5 = _mm_extract_epi32( hi, 1 ); \ - a6 = (uint32_t)( t ); \ - a7 = (uint32_t)( t<<32 ); \ -} while(0) +// Bitwise not ( ~v ) +#define mm256_not( v ) _mm256_xor_si256( v, m256_neg1 ) \ +// Unary negation of each element ( -v ) +#define mm256_negate_64( v ) _mm256_sub_epi64( m256_zero, v ) +#define mm256_negate_32( v ) _mm256_sub_epi32( m256_zero, v ) +#define mm256_negate_16( v ) _mm256_sub_epi16( m256_zero, v ) -// Bytewise test of all 256 bits -#define mm256_all0_8( a ) \ - ( _mm256_movemask_epi8( a ) == 0 ) -#define mm256_all1_8( a ) \ - ( _mm256_movemask_epi8( a ) == -1 ) +// Add 4 values, fewer dependencies than sequential addition. +#define mm256_add4_64( a, b, c, d ) \ + _mm256_add_epi64( _mm256_add_epi64( a, b ), _mm256_add_epi64( c, d ) ) -#define mm256_anybits0( a ) \ - ( _mm256_movemask_epi8( a ) & 0xffffffff ) +#define mm256_add4_32( a, b, c, d ) \ + _mm256_add_epi32( _mm256_add_epi32( a, b ), _mm256_add_epi32( c, d ) ) -#define mm256_anybits1( a ) \ - ( ( _mm256_movemask_epi8( a ) & 0xffffffff ) != 0xffffffff ) +#define mm256_add4_16( a, b, c, d ) \ + _mm256_add_epi16( _mm256_add_epi16( a, b ), _mm256_add_epi16( c, d ) ) +#define mm256_add4_8( a, b, c, d ) \ + _mm256_add_epi8( _mm256_add_epi8( a, b ), _mm256_add_epi8( c, d ) ) -// Bitwise test of all 256 bits -#define mm256_allbits0( a ) _mm256_testc_si256( a, m256_neg1 ) -#define mm256_allbits1( a ) _mm256_testc_si256( m256_zero, a ) -//#define mm256_anybits0( a ) !mm256_allbits1( a ) -//#define mm256_anybits1( a ) !mm256_allbits0( a ) +#if defined(__AVX512VL__) +// AVX512 has ternary logic that supports any 3 input boolean expression. -// Parallel AES, for when x is expected to be in a 256 bit register. -// Use same 128 bit key. 
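The ternary logic wrappers that follow encode any 3 input boolean function as an 8 bit truth table immediate: bit (a<<2 | b<<1 | c) of the immediate holds the desired result for that input combination. Here is a small scalar sketch of how constants such as 0x96, 0x78 and 0xf8 can be derived; all names in it are local to the example.

#include <stdio.h>

static unsigned char tt_imm( int (*f)( int, int, int ) )
{
   unsigned char imm = 0;
   for ( int a = 0; a < 2; a++ )
   for ( int b = 0; b < 2; b++ )
   for ( int c = 0; c < 2; c++ )
      if ( f( a, b, c ) ) imm |= 1 << ( (a<<2) | (b<<1) | c );
   return imm;
}

static int f_xor3( int a, int b, int c )   { return a ^ b ^ c; }
static int f_xorand( int a, int b, int c ) { return a ^ ( b & c ); }
static int f_orand( int a, int b, int c )  { return a | ( b & c ); }

int main()
{
   printf( "xor3   = 0x%02x\n", tt_imm( f_xor3 ) );    // 0x96
   printf( "xorand = 0x%02x\n", tt_imm( f_xorand ) );  // 0x78
   printf( "orand  = 0x%02x\n", tt_imm( f_orand ) );   // 0xf8
   return 0;
}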
+// a ^ b ^ c +#define mm256_xor3( a, b, c ) \ + _mm256_ternarylogic_epi64( a, b, c, 0x96 ) -#if defined(__VAES__) +// legacy convenience only +#define mm256_xor4( a, b, c, d ) \ + _mm256_xor_si256( a, mm256_xor3( b, c, d ) ) -#define mm256_aesenc_2x128( x, k ) \ - _mm256_aesenc_epi128( x, k ) +// a & b & c +#define mm256_and3( a, b, c ) \ + _mm256_ternarylogic_epi64( a, b, c, 0x80 ) -#else +// a | b | c +#define mm256_or3( a, b, c ) \ + _mm256_ternarylogic_epi64( a, b, c, 0xfe ) -#define mm256_aesenc_2x128( x, k ) \ - mm256_concat_128( _mm_aesenc_si128( mm128_extr_hi128_256( x ), k ), \ - _mm_aesenc_si128( mm128_extr_lo128_256( x ), k ) ) +// a ^ ( b & c ) +#define mm256_xorand( a, b, c ) \ + _mm256_ternarylogic_epi64( a, b, c, 0x78 ) -#endif +// a & ( b ^ c ) +#define mm256_andxor( a, b, c ) \ + _mm256_ternarylogic_epi64( a, b, c, 0x60 ) -#define mm256_paesenc_2x128( y, x, k ) do \ -{ \ - __m128i *X = (__m128i*)x; \ - __m128i *Y = (__m128i*)y; \ - Y[0] = _mm_aesenc_si128( X[0], k ); \ - Y[1] = _mm_aesenc_si128( X[1], k ); \ -} while(0); +// a ^ ( b | c ) +#define mm256_xoror( a, b, c ) \ + _mm256_ternarylogic_epi64( a, b, c, 0x1e ) -// -// Pointer casting +// a ^ ( ~b & c ) +#define mm256_xorandnot( a, b, c ) \ + _mm256_ternarylogic_epi64( a, b, c, 0xd2 ) -// p = any aligned pointer -// returns p as pointer to vector type, not very useful -#define castp_m256i(p) ((__m256i*)(p)) +// a | ( b & c ) +#define mm256_orand( a, b, c ) \ + _mm256_ternarylogic_epi64( a, b, c, 0xf8 ) -// p = any aligned pointer -// returns *p, watch your pointer arithmetic -#define cast_m256i(p) (*((__m256i*)(p))) +// ~( a ^ b ), same as (~a) ^ b +#define mm256_xnor( a, b ) \ + _mm256_ternarylogic_epi64( a, b, b, 0x81 ) + +#else -// p = any aligned pointer, i = scaled array index -// returns value p[i] -#define casti_m256i(p,i) (((__m256i*)(p))[(i)]) +#define mm256_xor3( a, b, c ) \ + _mm256_xor_si256( a, _mm256_xor_si256( b, c ) ) -// p = any aligned pointer, o = scaled offset -// returns pointer p+o -#define casto_m256i(p,o) (((__m256i*)(p))+(o)) +#define mm256_xor4( a, b, c, d ) \ + _mm256_xor_si256( _mm256_xor_si256( a, b ), _mm256_xor_si256( c, d ) ) +#define mm256_and3( a, b, c ) \ + _mm256_and_si256( a, _mm256_and_si256( b, c ) ) -// -// Memory functions -// n = number of 256 bit (32 byte) vectors +#define mm256_or3( a, b, c ) \ + _mm256_or_si256( a, _mm256_or_si256( b, c ) ) -static inline void memset_zero_256( __m256i *dst, const int n ) -{ for ( int i = 0; i < n; i++ ) dst[i] = m256_zero; } +#define mm256_xorand( a, b, c ) \ + _mm256_xor_si256( a, _mm256_and_si256( b, c ) ) -static inline void memset_256( __m256i *dst, const __m256i a, const int n ) -{ for ( int i = 0; i < n; i++ ) dst[i] = a; } +#define mm256_andxor( a, b, c ) \ + _mm256_and_si256( a, _mm256_xor_si256( b, c )) -static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n ) -{ for ( int i = 0; i < n; i ++ ) dst[i] = src[i]; } +#define mm256_xoror( a, b, c ) \ + _mm256_xor_si256( a, _mm256_or_si256( b, c ) ) +#define mm256_xorandnot( a, b, c ) \ + _mm256_xor_si256( a, _mm256_andnot_si256( b, c ) ) -// -// Basic operations without SIMD equivalent +#define mm256_orand( a, b, c ) \ + _mm256_or_si256( a, _mm256_and_si256( b, c ) ) -// Bitwise not ( ~x ) -#define mm256_not( x ) _mm256_xor_si256( (x), m256_neg1 ) \ +#define mm256_xnor( a, b ) \ + mm256_not( _mm256_xor_si256( a, b ) ) -// Unary negation of each element ( -a ) -#define mm256_negate_64( a ) _mm256_sub_epi64( m256_zero, a ) -#define mm256_negate_32( a ) 
_mm256_sub_epi32( m256_zero, a ) -#define mm256_negate_16( a ) _mm256_sub_epi16( m256_zero, a ) +#endif +// Mask making -// Add 4 values, fewer dependencies than sequential addition. +// Equivalent of AVX512 _mm256_movepi64_mask & _mm256_movepi32_mask. +// Returns 4 or 8 bit integer mask from MSB of 64 or 32 bit elements. -#define mm256_add4_64( a, b, c, d ) \ - _mm256_add_epi64( _mm256_add_epi64( a, b ), _mm256_add_epi64( c, d ) ) +#define mm256_movmask_64( v ) \ + _mm256_castpd_si256( _mm256_movmask_pd( _mm256_castsi256_pd( v ) ) ) -#define mm256_add4_32( a, b, c, d ) \ - _mm256_add_epi32( _mm256_add_epi32( a, b ), _mm256_add_epi32( c, d ) ) +#define mm256_movmask_32( v ) \ + _mm256_castps_si256( _mm256_movmask_ps( _mm256_castsi256_ps( v ) ) ) -#define mm256_add4_16( a, b, c, d ) \ - _mm256_add_epi16( _mm256_add_epi16( a, b ), _mm256_add_epi16( c, d ) ) -#define mm256_add4_8( a, b, c, d ) \ - _mm256_add_epi8( _mm256_add_epi8( a, b ), _mm256_add_epi8( c, d ) ) +// Diagonal blending + +// Blend 4 64 bit elements from 4 vectors +#define mm256_diagonal_64( v3, v2, v1, v0 ) \ + mm256_blend_epi32( _mm256_blend_epi32( v3, v2, 0x30 ), \ + _mm256_blend_epi32( v1, v0, 0x03 ), 0x0f ) + +// Blend 8 32 bit elements from 8 vectors +#define mm256_diagonal_32( v7, v6, v5, v4, v3, v2, v1, v0 ) \ + _mm256_blend_epi32( \ + _mm256_blend_epi32( \ + _mm256_blend_epi32( v7, v6, 0x40 ), \ + _mm256_blend_epi32( v5, v4, 0x10 ) 0x30 ), \ + _mm256_blend_epi32( \ + _mm256_blend_epi32( v3, v2, 0x04) \ + _mm256_blend_epi32( v1, v0, 0x01 ), 0x03 ), 0x0f ) + + +// Blend 4 32 bit elements from each 128 bit lane. +#define mm256_diagonal128_32( v3, v2, v1, v0 ) \ + _mm256_blend_epi32( \ + _mm256_blend_epi32( v3, v2, 0x44) \ + _mm256_blend_epi32( v1, v0, 0x11 ) ) -#define mm256_xor4( a, b, c, d ) \ - _mm256_xor_si256( _mm256_xor_si256( a, b ), _mm256_xor_si256( c, d ) ) // // Bit rotations. // -// The only bit shift for more than 64 bits is with __int128. +// The only bit shift for more than 64 bits is with __int128 which is slow. // // AVX512 has bit rotate for 256 bit vectors with 64 or 32 bit elements +// +// x2 rotates elements in 2 individual vectors in a double buffered +// optimization for SSE2, does nothing for AVX512 but is there for +// transparency. // compiler doesn't like when a variable is used for the last arg of @@ -256,7 +303,10 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n ) _mm256_srli_epi32( v, 32-(c) ) ) -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +// The spec says both F & VL are required, but just in case AMD +// decides to implement ROL/R without AVX512F. +#if defined(__AVX512VL__) +//#if defined(__AVX512F__) && defined(__AVX512VL__) // AVX512, control must be 8 bit immediate. @@ -265,17 +315,70 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n ) #define mm256_ror_32 _mm256_ror_epi32 #define mm256_rol_32 _mm256_rol_epi32 -#else +#define mm256_rorx2_64( v1, v0, c ) \ + _mm256_ror_epi64( v0, c ); \ + _mm256_ror_epi64( v1, c ) + +#define mm256_rolx2_64( v1, v0, c ) \ + _mm256_rol_epi64( v0, c ); \ + _mm256_rol_epi64( v1, c ) +#define mm256_rorx2_32( v1, v0, c ) \ + _mm256_ror_epi32( v0, c ); \ + _mm256_ror_epi32( v1, c ) -// No AVX512, use fallback. 
+#define mm256_rolx2_32( v1, v0, c ) \ + _mm256_rol_epi32( v0, c ); \ + _mm256_rol_epi32( v1, c ) + +#else // AVX2 #define mm256_ror_64 mm256_ror_var_64 #define mm256_rol_64 mm256_rol_var_64 #define mm256_ror_32 mm256_ror_var_32 #define mm256_rol_32 mm256_rol_var_32 -#endif // AVX512 else +#define mm256_rorx2_64( v1, v0, c ) \ +{ \ + __m256i t0 = _mm256_srli_epi64( v0, c ); \ + __m256i t1 = _mm256_srli_epi64( v1, c ); \ + v0 = _mm256_slli_epi64( v0, 64-(c) ); \ + v1 = _mm256_slli_epi64( v1, 64-(c) ); \ + v0 = _mm256_or_si256( v0, t0 ); \ + v1 = _mm256_or_si256( v1, t1 ); \ +} + +#define mm256_rolx2_64( v1, v0, c ) \ +{ \ + __m256i t0 = _mm256_slli_epi64( v0, c ); \ + __m256i t1 = _mm256_slli_epi64( v1, c ); \ + v0 = _mm256_srli_epi64( v0, 64-(c) ); \ + v1 = _mm256_srli_epi64( v1, 64-(c) ); \ + v0 = _mm256_or_si256( v0, t0 ); \ + v1 = _mm256_or_si256( v1, t1 ); \ +} + +#define mm256_rorx2_32( v1, v0, c ) \ +{ \ + __m256i t0 = _mm256_srli_epi32( v0, c ); \ + __m256i t1 = _mm256_srli_epi32( v1, c ); \ + v0 = _mm256_slli_epi32( v0, 32-(c) ); \ + v1 = _mm256_slli_epi32( v1, 32-(c) ); \ + v0 = _mm256_or_si256( v0, t0 ); \ + v1 = _mm256_or_si256( v1, t1 ); \ +} + +#define mm256_rolx2_32( v1, v0, c ) \ +{ \ + __m256i t0 = _mm256_slli_epi32( v0, c ); \ + __m256i t1 = _mm256_slli_epi32( v1, c ); \ + v0 = _mm256_srli_epi32( v0, 32-(c) ); \ + v1 = _mm256_srli_epi32( v1, 32-(c) ); \ + v0 = _mm256_or_si256( v0, t0 ); \ + v1 = _mm256_or_si256( v1, t1 ); \ +} + +#endif // AVX512 else AVX2 #define mm256_ror_16( v, c ) \ _mm256_or_si256( _mm256_srli_epi16( v, c ), \ @@ -285,223 +388,59 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n ) _mm256_or_si256( _mm256_slli_epi16( v, c ), \ _mm256_srli_epi16( v, 16-(c) ) ) -// Rotate bits in each element of v by the amount in corresponding element of -// index vector c -#define mm256_rorv_64( v, c ) \ - _mm256_or_si256( \ - _mm256_srlv_epi64( v, c ), \ - _mm256_sllv_epi64( v, _mm256_sub_epi64( \ - _mm256_set1_epi64x( 64 ), c ) ) ) - -#define mm256_rolv_64( v, c ) \ - _mm256_or_si256( \ - _mm256_sllv_epi64( v, c ), \ - _mm256_srlv_epi64( v, _mm256_sub_epi64( \ - _mm256_set1_epi64x( 64 ), c ) ) ) - -#define mm256_rorv_32( v, c ) \ - _mm256_or_si256( \ - _mm256_srlv_epi32( v, c ), \ - _mm256_sllv_epi32( v, _mm256_sub_epi32( \ - _mm256_set1_epi32( 32 ), c ) ) ) - -#define mm256_rolv_32( v, c ) \ - _mm256_or_si256( \ - _mm256_sllv_epi32( v, c ), \ - _mm256_srlv_epi32( v, _mm256_sub_epi32( \ - _mm256_set1_epi32( 32 ), c ) ) ) - -// AVX512 can do 16 bit elements. -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) - -#define mm256_rorv_16( v, c ) \ - _mm256_or_si256( \ - _mm256_srlv_epi16( v, _mm256_set1_epi16( c ) ), \ - _mm256_sllv_epi16( v, _mm256_set1_epi16( 16-(c) ) ) ) - -#define mm256_rolv_16( v, c ) \ - _mm256_or_si256( \ - _mm256_sllv_epi16( v, _mm256_set1_epi16( c ) ), \ - _mm256_srlv_epi16( v, _mm256_set1_epi16( 16-(c) ) ) ) - -#endif // AVX512 // // Rotate elements accross all lanes. -// -// AVX2 has no full vector permute for elements less than 32 bits. -// AVX512 has finer granularity full vector permutes. 
-// AVX512 has full vector alignr which might be faster, especially for 32 bit - - -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) - -#define mm256_swap_128( v ) _mm256_alignr_epi64( v, v, 2 ) -#define mm256_ror_1x64( v ) _mm256_alignr_epi64( v, v, 1 ) -#define mm256_rol_1x64( v ) _mm256_alignr_epi64( v, v, 3 ) -#define mm256_ror_1x32( v ) _mm256_alignr_epi32( v, v, 1 ) -#define mm256_rol_1x32( v ) _mm256_alignr_epi32( v, v, 7 ) -#define mm256_ror_3x32( v ) _mm256_alignr_epi32( v, v, 3 ) -#define mm256_rol_3x32( v ) _mm256_alignr_epi32( v, v, 5 ) - -#else // AVX2 // Swap 128 bit elements in 256 bit vector. #define mm256_swap_128( v ) _mm256_permute4x64_epi64( v, 0x4e ) +#define mm256_shuflr_128 mm256_swap_128 +#define mm256_shufll_128 mm256_swap_128 // Rotate 256 bit vector by one 64 bit element -#define mm256_ror_1x64( v ) _mm256_permute4x64_epi64( v, 0x39 ) -#define mm256_rol_1x64( v ) _mm256_permute4x64_epi64( v, 0x93 ) +#define mm256_shuflr_64( v ) _mm256_permute4x64_epi64( v, 0x39 ) + +#define mm256_shufll_64( v ) _mm256_permute4x64_epi64( v, 0x93 ) // Rotate 256 bit vector by one 32 bit element. -#define mm256_ror_1x32( v ) \ +#define mm256_shuflr_32( v ) \ _mm256_permutevar8x32_epi32( v, \ m256_const_64( 0x0000000000000007, 0x0000000600000005, \ - 0x0000000400000003, 0x0000000200000001 ) + 0x0000000400000003, 0x0000000200000001 ) ) -#define mm256_rol_1x32( v ) \ +#define mm256_shufll_32( v ) \ _mm256_permutevar8x32_epi32( v, \ m256_const_64( 0x0000000600000005, 0x0000000400000003, \ - 0x0000000200000001, 0x0000000000000007 ) - -// Rotate 256 bit vector by three 32 bit elements (96 bits). -#define mm256_ror_3x32( v ) \ - _mm256_permutevar8x32_epi32( v, \ - m256_const_64( 0x0000000200000001, 0x0000000000000007, \ - 0x0000000600000005, 0x0000000400000003 ) - -#define mm256_rol_3x32( v ) \ - _mm256_permutevar8x32_epi32( v, \ - m256_const_64( 0x0000000400000003, 0x0000000200000001, \ - 0x0000000000000007, 0x0000000600000005 ) - -#endif // AVX512 else AVX2 - - -// AVX512 can do 16 & 8 bit elements. -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) - -// Rotate 256 bit vector by one 16 bit element. -#define mm256_ror_1x16( v ) \ - _mm256_permutexvar_epi16( m256_const_64( \ - 0x0000000f000e000d, 0x000c000b000a0009, \ - 0x0008000700060005, 0x0004000300020001 ), v ) - -#define mm256_rol_1x16( v ) \ - _mm256_permutexvar_epi16( m256_const_64( \ - 0x000e000d000c000b, 0x000a000900080007, \ - 0x0006000500040003, 0x000200010000000f ), v ) - -#if defined (__AVX512VBMI__) - -// Rotate 256 bit vector by one byte. 
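The mm256_shuflr_32 permute index above can be sanity checked in isolation. The sketch below rotates { 7 6 5 4 3 2 1 0 } right by one 32 bit element with _mm256_permutevar8x32_epi32 using the same index values as the macro; it assumes AVX2 and all names are local to the example.

#include <immintrin.h>
#include <stdint.h>
#include <stdio.h>

int main()
{
   __m256i v   = _mm256_set_epi32( 7, 6, 5, 4, 3, 2, 1, 0 );
   // Element i of the result takes element idx[i] of v: a rotate right by one.
   __m256i idx = _mm256_set_epi32( 0, 7, 6, 5, 4, 3, 2, 1 );
   __m256i r   = _mm256_permutevar8x32_epi32( v, idx );
   uint32_t out[8];
   _mm256_storeu_si256( (__m256i*)out, r );
   for ( int i = 0; i < 8; i++ ) printf( "%u ", out[i] );  // 1 2 3 4 5 6 7 0
   printf( "\n" );
   return 0;
}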
-#define mm256_ror_1x8( v ) _mm256_permutexvar_epi8( m256_const_64( \ - 0x001f1e1d1c1b1a19, 0x1817161514131211, \ - 0x100f0e0d0c0b0a09, 0x0807060504030201 ), v ) - -#define mm256_rol_1x8( v ) _mm256_permutexvar_epi16( m256_const_64( \ - 0x1e1d1c1b1a191817, 0x161514131211100f, \ - 0x0e0d0c0b0a090807, 0x060504030201001f ), v ) - -#endif // VBMI - -#endif // AVX512 - - -// Invert vector: {3,2,1,0} -> {0,1,2,3} - -#define mm256_invert_64 ( v ) _mm256_permute4x64_epi64( v, 0x1b ) - -#define mm256_invert_32 ( v ) _mm256_permutevar8x32_epi32( v, \ - m256_const_64( 0x0000000000000001, 0x0000000200000003 \ - 0x0000000400000005, 0x0000000600000007 ) - -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) - -// Invert vector: {7,6,5,4,3,2,1,0} -> {0,1,2,3,4,5,6,7} -#define mm256_invert_16 ( v ) \ - _mm256_permutexvar_epi16( m256_const_64( \ - 0x0000000100020003, 0x0004000500060007, \ - 0x00080009000a000b, 0x000c000d000e000f ), v ) - -#if defined(__AVX512VBMI__) - -#define mm256_invert_8( v ) \ - _mm256_permutexvar_epi8( m256_const_64( \ - 0x0001020304050607, 0x08090a0b0c0d0e0f, \ - 0x1011121314151617, 0x18191a1b1c1d1e1f ), v ) -#endif // VBMI -#endif // AVX512 - + 0x0000000200000001, 0x0000000000000007 ) ) + // // Rotate elements within each 128 bit lane of 256 bit vector. -#define mm256_swap128_64( v ) _mm256_shuffle_epi32( v, 0x4e ) - -#define mm256_ror128_32( v ) _mm256_shuffle_epi32( v, 0x39 ) - -#define mm256_rol128_32( v ) _mm256_shuffle_epi32( v, 0x93 ) - -#define mm256_ror128_x8( v, c ) _mm256_alignr_epi8( v, v, c ) +// Limited 2 input shuffle +#define mm256_shuffle2_64( a, b, c ) \ + _mm256_castpd_si256( _mm256_shuffle_pd( _mm256_castsi256_pd( a ), \ + _mm256_castsi256_pd( b ), c ) ); -/* -// Rotate each 128 bit lane by c elements. 
-#define mm256_ror128_8( v, c ) \ - _mm256_or_si256( _mm256_bsrli_epi128( v, c ), \ - _mm256_bslli_epi128( v, 16-(c) ) ) -#define mm256_rol128_8( v, c ) \ - _mm256_or_si256( _mm256_bslli_epi128( v, c ), \ - _mm256_bsrli_epi128( v, 16-(c) ) ) -*/ +#define mm256_shuffle2_32( a, b, c ) \ + _mm256_castps_si256( _mm256_shuffle_ps( _mm256_castsi256_ps( a ), \ + _mm256_castsi256_ps( b ), c ) ); -// Rotate elements in each 64 bit lane -#define mm256_swap64_32( v ) _mm256_shuffle_epi32( v, 0xb1 ) +#define mm256_swap128_64( v ) _mm256_shuffle_epi32( v, 0x4e ) +#define mm256_shuflr128_64 mm256_swap128_64 +#define mm256_shufll128_64 mm256_swap128_64 -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#define mm256_shuflr128_32( v ) _mm256_shuffle_epi32( v, 0x39 ) +#define mm256_shufll128_32( v ) _mm256_shuffle_epi32( v, 0x93 ) -#define mm256_rol64_8( v, c ) _mm256_rol_epi64( v, ((c)<<3) ) -#define mm256_ror64_8( v, c ) _mm256_ror_epi64( v, ((c)<<3) ) - -#else - -#define mm256_rol64_8( v, c ) \ - _mm256_or_si256( _mm256_slli_epi64( v, ( ( (c)<<3 ) ), \ - _mm256_srli_epi64( v, ( ( 64 - ( (c)<<3 ) ) ) ) - -#define mm256_ror64_8( v, c ) \ - _mm256_or_si256( _mm256_srli_epi64( v, ( ( (c)<<3 ) ), \ - _mm256_slli_epi64( v, ( ( 64 - ( (c)<<3 ) ) ) ) - -#endif - - -// Rotate elements in each 32 bit lane - -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) - -#define mm256_swap32_16( v ) _mm256_rol_epi32( v, 16 ) - -#define mm256_rol32_8( v ) _mm256_rol_epi32( v, 8 ) -#define mm256_ror32_8( v ) _mm256_ror_epi32( v, 8 ) - -#else - -#define mm256_swap32_16( v ) \ - _mm256_or_si256( _mm256_slli_epi32( v, 16 ), \ - _mm256_srli_epi32( v, 16 ) ) - -#define mm256_rol32_8( v ) \ - _mm256_or_si256( _mm256_slli_epi32( v, 8 ), \ - _mm256_srli_epi32( v, 8 ) ) - -#define mm256_ror32_8( v, c ) \ - _mm256_or_si256( _mm256_srli_epi32( v, 8 ), \ - _mm256_slli_epi32( v, 8 ) ) - -#endif +static inline __m256i mm256_shuflr128_x8( const __m256i v, const int c ) +{ return _mm256_alignr_epi8( v, v, c ); } +// Swap 32 bit elements in each 64 bit lane. +#define mm256_swap64_32( v ) _mm256_shuffle_epi32( v, 0xb1 ) +#define mm256_shuflr64_32 mm256_swap64_32 +#define mm256_shufll64_32 mm256_swap64_32 // // Swap bytes in vector elements, endian bswap. @@ -561,25 +500,13 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n ) // _mm256_alignr_epi 64/32 are only available with AVX512 but AVX512 also // makes these macros unnecessary. +// continue using vror/vrol notation for now to avoid confusion with +// shufl2r/shufl2l macro functions available with AVX512. #define mm256_swap512_256( v1, v2 ) \ v1 = _mm256_xor_si256( v1, v2 ); \ v2 = _mm256_xor_si256( v1, v2 ); \ v1 = _mm256_xor_si256( v1, v2 ); -#define mm256_ror512_128( v1, v2 ) \ -do { \ - __m256i t = _mm256_permute2x128( v1, v2, 0x03 ); \ - v1 = _mm256_permute2x128( v2, v1, 0x21 ); \ - v2 = t; \ -} while(0) - -#define mm256_rol512_128( v1, v2 ) \ -do { \ - __m256i t = _mm256_permute2x128( v1, v2, 0x03 ); \ - v2 = _mm256_permute2x128( v2, v1, 0x21 ); \ - v1 = t; \ -} while(0) - #endif // __AVX2__ #endif // SIMD_256_H__ diff --git a/simd-utils/simd-512.h b/simd-utils/simd-512.h index a13e88f4..6867a3d9 100644 --- a/simd-utils/simd-512.h +++ b/simd-utils/simd-512.h @@ -26,9 +26,6 @@ // _mm512_permutex_epi64 only shuffles within 256 bit lanes. Permute // usually shuffles accross all lanes. 
// -// Some instructions like cmp and blend use a mask regsiter now instead -// a mask vector. -// // permutexvar has args reversed, index is first arg. Previously all // permutes and shuffles have the index last. // @@ -64,7 +61,7 @@ // // Additionally, permutations using smaller vectors can be more efficient // if the permutation doesn't cross lane boundaries, typically 128 bits, -// and the smnaller vector can use an imm comtrol. +// and the smaller vector can use an imm comtrol. // // If the permutation doesn't cross lane boundaries a shuffle instructions // can be used with imm control instead of permute. @@ -77,60 +74,64 @@ // __AVX512VBMI__ __AVX512VAES__ // +// Used instead if casting. +typedef union +{ + __m512i m512; + __m128i m128[4]; + uint32_t u32[16]; + uint64_t u64[8]; +} __attribute__ ((aligned (64))) m512_ovly; + // Move integer to/from element 0 of vector. #define mm512_mov64_512( n ) _mm512_castsi128_si512( mm128_mov64_128( n ) ) #define mm512_mov32_512( n ) _mm512_castsi128_si512( mm128_mov32_128( n ) ) -#define mm512_mov256_64( a ) mm128_mov128_64( _mm256_castsi512_si128( a ) ) -#define mm512_mov256_32( a ) mm128_mov128_32( _mm256_castsi512_si128( a ) ) - - -// Insert and extract integers is a multistage operation. -// Insert integer into __m128i, then insert __m128i to __m256i, finally -// insert __256i into __m512i. Reverse the order for extract. -// Do not use __m512_insert_epi64 or _mm256_insert_epi64 to perform multiple -// inserts. -// Avoid small integers for multiple inserts. -// Shortcuts: -// Use castsi to reference the low bits of a vector or sub-vector. (free) -// Use mov to insert integer into low bits of vector or sub-vector. (cheap) -// Use _mm_insert only to reference the high bits of __m128i. (expensive) -// Sequence instructions to minimize data dependencies. -// Use const or const1 only when integer is either immediate or known to be in -// a GP register. Use set/set1 when data needs to be loaded from memory or -// cache. +#define u64_mov512_64( a ) u64_mov128_64( _mm256_castsi512_si128( a ) ) +#define u32_mov512_32( a ) u32_mov128_32( _mm256_castsi512_si128( a ) ) + +// A simple 128 bit permute, using function instead of macro avoids +// problems if the v arg passed as an expression. +static inline __m512i mm512_perm_128( const __m512i v, const int c ) +{ return _mm512_shuffle_i64x2( v, v, c ); } // Concatenate two 256 bit vectors into one 512 bit vector {hi, lo} #define mm512_concat_256( hi, lo ) \ _mm512_inserti64x4( _mm512_castsi256_si512( lo ), hi, 1 ) +#define m512_const_128( v3, v2, v1, v0 ) \ + mm512_concat_256( mm256_concat_128( v3, v2 ), \ + mm256_concat_128( v1, v0 ) ) + // Equivalent of set, assign 64 bit integers to respective 64 bit elements. 
+// Use stack memory overlay static inline __m512i m512_const_64( const uint64_t i7, const uint64_t i6, const uint64_t i5, const uint64_t i4, const uint64_t i3, const uint64_t i2, const uint64_t i1, const uint64_t i0 ) { - __m256i hi, lo; - __m128i hi1, lo1; - lo = mm256_mov64_256( i0 ); - lo1 = mm128_mov64_128( i2 ); - hi = mm256_mov64_256( i4 ); - hi1 = mm128_mov64_128( i6 ); - lo = _mm256_castsi128_si256( - _mm_insert_epi64( _mm256_castsi256_si128( lo ), i1, 1 ) ); - lo1 = _mm_insert_epi64( lo1, i3, 1 ); - hi = _mm256_castsi128_si256( - _mm_insert_epi64( _mm256_castsi256_si128( hi ), i5, 1 ) ); - hi1 = _mm_insert_epi64( hi1, i7, 1 ); - lo = _mm256_inserti128_si256( lo, lo1, 1 ); - hi = _mm256_inserti128_si256( hi, hi1, 1 ); - return mm512_concat_256( hi, lo ); + union { __m512i m512i; + uint64_t u64[8]; } v; + v.u64[0] = i0; v.u64[1] = i1; + v.u64[2] = i2; v.u64[3] = i3; + v.u64[4] = i4; v.u64[5] = i5; + v.u64[6] = i6; v.u64[7] = i7; + return v.m512i; } -// Equivalent of set1, broadcast 64 bit constant to all 64 bit elements. -#define m512_const1_256( v ) _mm512_broadcast_i64x4( v ) -#define m512_const1_128( v ) _mm512_broadcast_i64x2( v ) +// Equivalent of set1, broadcast lo element to all elements. +static inline __m512i m512_const1_256( const __m256i v ) +{ return _mm512_inserti64x4( _mm512_castsi256_si512( v ), v, 1 ); } + +#define m512_const1_128( v ) \ + mm512_perm_128( _mm512_castsi128_si512( v ), 0 ) +// Integer input argument up to 64 bits +#define m512_const1_i128( i ) \ + mm512_perm_128( _mm512_castsi128_si512( mm128_mov64_128( i ) ), 0 ) + +//#define m512_const1_256( v ) _mm512_broadcast_i64x4( v ) +//#define m512_const1_128( v ) _mm512_broadcast_i64x2( v ) #define m512_const1_64( i ) _mm512_broadcastq_epi64( mm128_mov64_128( i ) ) #define m512_const1_32( i ) _mm512_broadcastd_epi32( mm128_mov32_128( i ) ) #define m512_const1_16( i ) _mm512_broadcastw_epi16( mm128_mov32_128( i ) ) @@ -142,23 +143,17 @@ static inline __m512i m512_const_64( const uint64_t i7, const uint64_t i6, #define m512_const2_64( i1, i0 ) \ m512_const1_128( m128_const_64( i1, i0 ) ) -#define m512_const2_32( i1, i0 ) \ - m512_const1_64( ( (uint64_t)(i1) << 32 ) | ( (uint64_t)(i0) & 0xffffffff ) ) - -// { m128_1, m128_1, m128_0, m128_0 } -#define m512_const_2x128( v1, v0 ) \ - m512_mask_blend_epi64( 0x0f, m512_const1_128( v1 ), m512_const1_128( v0 ) ) static inline __m512i m512_const4_64( const uint64_t i3, const uint64_t i2, const uint64_t i1, const uint64_t i0 ) { - __m256i lo = mm256_mov64_256( i0 ); - __m128i hi = mm128_mov64_128( i2 ); - lo = _mm256_castsi128_si256( - _mm_insert_epi64( _mm256_castsi256_si128( - lo ), i1, 1 ) ); - hi = _mm_insert_epi64( hi, i3, 1 ); - return _mm512_broadcast_i64x4( _mm256_inserti128_si256( lo, hi, 1 ) ); + union { __m512i m512i; + uint64_t u64[8]; } v; + v.u64[0] = v.u64[4] = i0; + v.u64[1] = v.u64[5] = i1; + v.u64[2] = v.u64[6] = i2; + v.u64[3] = v.u64[7] = i3; + return v.m512i; } // @@ -170,20 +165,23 @@ static inline __m512i m512_const4_64( const uint64_t i3, const uint64_t i2, #define m512_zero _mm512_setzero_si512() #define m512_one_512 mm512_mov64_512( 1 ) -#define m512_one_256 _mm512_broadcast_i64x4 ( mm256_mov64_256( 1 ) ) -#define m512_one_128 _mm512_broadcast_i64x2 ( mm128_mov64_128( 1 ) ) -#define m512_one_64 _mm512_broadcastq_epi64( mm128_mov64_128( 1 ) ) -#define m512_one_32 _mm512_broadcastd_epi32( mm128_mov64_128( 1 ) ) -#define m512_one_16 _mm512_broadcastw_epi16( mm128_mov64_128( 1 ) ) -#define m512_one_8 _mm512_broadcastb_epi8 ( mm128_mov64_128( 1 ) ) 
+#define m512_one_256 _mm512_inserti64x4( m512_one_512, m256_one_256, 1 ) +#define m512_one_128 m512_const1_i128( 1 ) +#define m512_one_64 m512_const1_64( 1 ) +#define m512_one_32 m512_const1_32( 1 ) +#define m512_one_16 m512_const1_16( 1 ) +#define m512_one_8 m512_const1_8( 1 ) -#define m512_neg1 m512_const1_64( 0xffffffffffffffff ) +//#define m512_neg1 m512_const1_64( 0xffffffffffffffff ) +#define m512_neg1 _mm512_movm_epi64( 0xff ) // // Basic operations without SIMD equivalent // ~x -#define mm512_not( x ) _mm512_xor_si512( x, m512_neg1 ) +// #define mm512_not( x ) _mm512_xor_si512( x, m512_neg1 ) +static inline __m512i mm512_not( const __m512i x ) +{ return _mm512_ternarylogic_epi64( x, x, x, 1 ); } // -x #define mm512_negate_64( x ) _mm512_sub_epi64( m512_zero, x ) @@ -238,20 +236,82 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n ) #define mm512_add4_8( a, b, c, d ) \ _mm512_add_epi8( _mm512_add_epi8( a, b ), _mm512_add_epi8( c, d ) ) +// +// Ternary logic uses 8 bit truth table to define any 3 input logical +// expression using any number or combinations of AND, OR, XOR, NOT. + +// a ^ b ^ c +#define mm512_xor3( a, b, c ) \ + _mm512_ternarylogic_epi64( a, b, c, 0x96 ) + +// legacy convenience only #define mm512_xor4( a, b, c, d ) \ - _mm512_xor_si512( _mm512_xor_si512( a, b ), _mm512_xor_si512( c, d ) ) + _mm512_xor_si512( a, mm512_xor3( b, c, d ) ) +// a & b & c +#define mm512_and3( a, b, c ) \ + _mm512_ternarylogic_epi64( a, b, c, 0x80 ) + +// a | b | c +#define mm512_or3( a, b, c ) \ + _mm512_ternarylogic_epi64( a, b, c, 0xfe ) + +// a ^ ( b & c ) +#define mm512_xorand( a, b, c ) \ + _mm512_ternarylogic_epi64( a, b, c, 0x78 ) + +// a & ( b ^ c ) +#define mm512_andxor( a, b, c ) \ + _mm512_ternarylogic_epi64( a, b, c, 0x60 ) + +// a ^ ( b | c ) +#define mm512_xoror( a, b, c ) \ + _mm512_ternarylogic_epi64( a, b, c, 0x1e ) + +// a ^ ( ~b & c ) xor( a, andnot( b, c ) ) +#define mm512_xorandnot( a, b, c ) \ + _mm512_ternarylogic_epi64( a, b, c, 0xd2 ) + +// a | ( b & c ) +#define mm512_orand( a, b, c ) \ + _mm512_ternarylogic_epi64( a, b, c, 0xf8 ) + +// Some 2 input operations that don't have their own instruction mnemonic. + +// ~( a | b ), (~a) & (~b) +#define mm512_nor( a, b ) \ + _mm512_ternarylogic_epi64( a, b, b, 0x01 ) + +// ~( a ^ b ), (~a) ^ b +#define mm512_xnor( a, b ) \ + _mm512_ternarylogic_epi64( a, b, b, 0x81 ) + +// ~( a & b ) +#define mm512_nand( a, b ) \ + _mm512_ternarylogic_epi64( a, b, b, 0xef ) + + +// Diagonal blending +// Blend 8 64 bit elements from 8 vectors +#define mm512_diagonal_64( v7, v6, v5, v4, v3, v2, v1, v0 ) \ + _mm512_mask_blend_epi64( 0x0f, \ + _mm512_mask_blend_epi64( 0x30, \ + _mm512_mask_blend_epi64( 0x40, v7, v6 ), \ + _mm512_mask_blend_epi64( 0x40, v5, v4 ) ), \ + _mm512_mask_blend_epi64( 0x03, \ + _mm512_mask_blend_epi64( 0x04, v3, v2 ) \ + _mm512_mask_blend_epi64( 0x01, v1, v0 ) ) ) + + +// Blend 4 32 bit elements from each 128 bit lane. +#define mm512_diagonal128_32( v3, v2, v1, v0 ) \ + _mm512_mask_blend_epi32( 0x3333, \ + _mm512_mask_blend_epi32( 0x4444, v3, v2 ), \ + _mm512_mask_blend_epi32( 0x1111, v1, v0 ) ) -// Horizontal vector testing -// Returns bit __mmask8 -#define mm512_allbits0( a ) _mm512_cmpeq_epi64_mask( a, m512_zero ) -#define mm512_allbits1( a ) _mm512_cmpeq_epi64_mask( a, m512_neg1 ) -#define mm512_anybits0( a ) _mm512_cmpneq_epi64_mask( a, m512_neg1 ) -#define mm512_anybits1( a ) _mm512_cmpneq_epi64_mask( a, m512_zero ) -// // Bit rotations. 
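The ternary logic macros above encode any three input Boolean function as an 8 bit truth table: bit ((a<<2)|(b<<1)|c) of the immediate is the function's value for that combination of input bits. A plain C sketch that derives the constants used above; no intrinsics required, helper names illustrative only:

#include <stdio.h>
#include <stdint.h>

// Build the _mm512_ternarylogic_* immediate from a 3 input Boolean function.
static uint8_t tl_imm8( int (*f)( int, int, int ) )
{
   uint8_t imm = 0;
   for ( int a = 0; a < 2; a++ )
   for ( int b = 0; b < 2; b++ )
   for ( int c = 0; c < 2; c++ )
      if ( f( a, b, c ) ) imm |= 1 << ( (a << 2) | (b << 1) | c );
   return imm;
}

static int f_xor3( int a, int b, int c )   { return a ^ b ^ c; }
static int f_xorand( int a, int b, int c ) { return a ^ ( b & c ); }
static int f_orand( int a, int b, int c )  { return a | ( b & c ); }

int main()
{
   printf( "a^b^c   -> 0x%02x\n", tl_imm8( f_xor3 ) );
   printf( "a^(b&c) -> 0x%02x\n", tl_imm8( f_xorand ) );
   printf( "a|(b&c) -> 0x%02x\n", tl_imm8( f_orand ) );
   return 0;
}

It prints 0x96, 0x78 and 0xf8, the immediates used by mm512_xor3, mm512_xorand and mm512_orand.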
// AVX512F has built-in fixed and variable bit rotation for 64 & 32 bit @@ -262,37 +322,47 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n ) // _mm512_rolv_epi64, _mm512_rorv_epi64, _mm512_rolv_epi32, _mm512_rorv_epi32 // +// For convenience and consistency with AVX2 #define mm512_ror_64 _mm512_ror_epi64 #define mm512_rol_64 _mm512_rol_epi64 #define mm512_ror_32 _mm512_ror_epi32 #define mm512_rol_32 _mm512_rol_epi32 -#define mm512_ror_var_64( v, c ) \ - _mm512_or_si512( _mm512_srli_epi64( v, c ), \ - _mm512_slli_epi64( v, 64-(c) ) ) - -#define mm512_rol_var_64( v, c ) \ - _mm512_or_si512( _mm512_slli_epi64( v, c ), \ - _mm512_srli_epi64( v, 64-(c) ) ) - -#define mm512_ror_var_32( v, c ) \ - _mm512_or_si512( _mm512_srli_epi32( v, c ), \ - _mm512_slli_epi32( v, 32-(c) ) ) +static inline __m512i mm512_ror_var_64( const __m512i v, const int c ) +{ + return _mm512_or_si512( _mm512_srli_epi64( v, c ), + _mm512_slli_epi64( v, 64-c ) ); +} -#define mm512_rol_var_32( v, c ) \ - _mm512_or_si512( _mm512_slli_epi32( v, c ), \ - _mm512_srli_epi32( v, 32-(c) ) ) +static inline __m512i mm512_rol_var_64( const __m512i v, const int c ) +{ + return _mm512_or_si512( _mm512_slli_epi64( v, c ), + _mm512_srli_epi64( v, 64-c ) ); +} +static inline __m512i mm512_ror_var_32( const __m512i v, const int c ) +{ + return _mm512_or_si512( _mm512_srli_epi32( v, c ), + _mm512_slli_epi32( v, 32-c ) ); +} -// Here is a fixed bit rotate for 16 bit elements: -#define mm512_ror_16( v, c ) \ - _mm512_or_si512( _mm512_srli_epi16( v, c ), \ - _mm512_slli_epi16( v, 16-(c) ) -#define mm512_rol_16( v, c ) \ - _mm512_or_si512( _mm512_slli_epi16( v, c ), \ - _mm512_srli_epi16( v, 16-(c) ) +static inline __m512i mm512_rol_var_32( const __m512i v, const int c ) +{ + return _mm512_or_si512( _mm512_slli_epi32( v, c ), + _mm512_srli_epi32( v, 32-c ) ); +} +static inline __m512i mm512_ror_16( __m512i const v, const int c ) +{ + return _mm512_or_si512( _mm512_srli_epi16( v, c ), + _mm512_slli_epi16( v, 16-c ) ); +} +static inline __m512i mm512_rol_16( const __m512i v, const int c ) +{ + return _mm512_or_si512( _mm512_slli_epi16( v, c ), + _mm512_srli_epi16( v, 16-c ) ); +} // Rotations using a vector control index are very slow due to overhead // to generate the index vector. Repeated rotations using the same index @@ -359,253 +429,213 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n ) casti_m512i( d, 7 ) = _mm512_shuffle_epi8( casti_m512i( s, 7 ), ctl ); \ } while(0) - // -// Rotate elements in 512 bit vector. +// Shift with zero fill & shuffle-rotate elements in 512 bit vector. +// + +// rename plan change ror to vror for Vector ROtate Right, +// and vrol for Vector ROtate Left, not to be confused with +//variable rotate rorv, rolv, +// Plan changed, use shuflr & shufll instead symbolizing a shuffle-rotate +// operation. 1xNN notaion ia also removed and replaced with simpler NN. +// Swap will still have its own mnemonic and will be aliased as both +// left and right shuffles. + +// Shift elements right or left in 512 bit vector, filling with zeros. +// Multiple element shifts can be combined into a single larger +// element shift. 
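The ror_var/rol_var helpers above lean on the vector shift intrinsics producing an all zero result once the count reaches the element width, so the c == 0 case needs no special handling. A scalar analogue must mask the count instead, because shifting a 64 bit integer by 64 is undefined in C. A minimal sketch in portable C, function names illustrative only:

#include <stdint.h>
#include <stdio.h>

static inline uint64_t ror64( uint64_t x, unsigned c )
{
   c &= 63;                                       // keep the shift count defined
   return c ? ( x >> c ) | ( x << ( 64 - c ) ) : x;
}

static inline uint64_t rol64( uint64_t x, unsigned c )
{
   c &= 63;
   return c ? ( x << c ) | ( x >> ( 64 - c ) ) : x;
}

int main()
{
   printf( "%016llx\n", (unsigned long long) ror64( 0x0123456789abcdefULL, 8 ) );
   printf( "%016llx\n", (unsigned long long) rol64( 0x0123456789abcdefULL, 8 ) );
   return 0;
}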
+ +#define mm512_shiftr_256( v ) \ + _mm512_alignr_epi64( _mm512_setzero, v, 4 ) +#define mm512_shiftl_256( v ) mm512_shifr_256 + +#define mm512_shiftr_128( v ) \ + _mm512_alignr_epi64( _mm512_setzero, v, 2 ) +#define mm512_shiftl_128( v ) \ + _mm512_alignr_epi64( v, _mm512_setzero, 6 ) + +#define mm512_shiftr_64( v ) \ + _mm512_alignr_epi64( _mm512_setzero, v, 1 ) +#define mm512_shiftl_64( v ) \ + _mm512_alignr_epi64( v, _mm512_setzero, 7 ) +#define mm512_shiftr_32( v ) \ + _mm512_alignr_epi32( _mm512_setzero, v, 1 ) +#define mm512_shiftl_32( v ) \ + _mm512_alignr_epi32( v, _mm512_setzero, 15 ) -#define mm512_swap_256( v ) _mm512_alignr_epi64( v, v, 4 ) +// Shuffle-rotate elements left or right in 512 bit vector. -// 1x64 notation used to disinguish from bit rotation. -#define mm512_ror_1x128( v ) _mm512_alignr_epi64( v, v, 2 ) -#define mm512_rol_1x128( v ) _mm512_alignr_epi64( v, v, 6 ) +static inline __m512i mm512_swap_256( const __m512i v ) +{ return _mm512_alignr_epi64( v, v, 4 ); } +#define mm512_shuflr_256( v ) mm512_swap_256 +#define mm512_shufll_256( v ) mm512_swap_256 -#define mm512_ror_1x64( v ) _mm512_alignr_epi64( v, v, 1 ) -#define mm512_rol_1x64( v ) _mm512_alignr_epi64( v, v, 7 ) +static inline __m512i mm512_shuflr_128( const __m512i v ) +{ return _mm512_alignr_epi64( v, v, 2 ); } -#define mm512_ror_1x32( v ) _mm512_alignr_epi32( v, v, 1 ) -#define mm512_rol_1x32( v ) _mm512_alignr_epi32( v, v, 15 ) +static inline __m512i mm512_shufll_128( const __m512i v ) +{ return _mm512_alignr_epi64( v, v, 6 ); } -// Generic for odd rotations -#define mm512_ror_x64( v, n ) _mm512_alignr_epi64( v, v, n ) -#define mm512_rol_x64( v, n ) _mm512_alignr_epi64( v, v, 8-(n) ) +static inline __m512i mm512_shuflr_64( const __m512i v ) +{ return _mm512_alignr_epi64( v, v, 1 ); } -#define mm512_ror_x32( v, n ) _mm512_alignr_epi32( v, v, n ) -#define mm512_rol_x32( v, n ) _mm512_alignr_epi32( v, v, 16-(n) ) +static inline __m512i mm512_shufll_64( const __m512i v ) +{ return _mm512_alignr_epi64( v, v, 7 ); } -#define mm512_ror_1x16( v ) \ +static inline __m512i mm512_shuflr_32( const __m512i v ) +{ return _mm512_alignr_epi32( v, v, 1 ); } + +static inline __m512i mm512_shufll_32( const __m512i v ) +{ return _mm512_alignr_epi32( v, v, 15 ); } + +// Generic +static inline __m512i mm512_shuflr_x64( const __m512i v, const int n ) +{ return _mm512_alignr_epi64( v, v, n ); } + +static inline __m512i mm512_shuflr_x32( const __m512i v, const int n ) +{ return _mm512_alignr_epi32( v, v, n ); } + +#define mm512_shuflr_16( v ) \ _mm512_permutexvar_epi16( m512_const_64( \ 0x0000001F001E001D, 0x001C001B001A0019, \ 0X0018001700160015, 0X0014001300120011, \ 0X0010000F000E000D, 0X000C000B000A0009, \ 0X0008000700060005, 0X0004000300020001 ), v ) -#define mm512_rol_1x16( v ) \ +#define mm512_shufll_16( v ) \ _mm512_permutexvar_epi16( m512_const_64( \ 0x001E001D001C001B, 0x001A001900180017, \ 0X0016001500140013, 0X001200110010000F, \ 0X000E000D000C000B, 0X000A000900080007, \ 0X0006000500040003, 0X000200010000001F ), v ) -#define mm512_ror_1x8( v ) \ +#define mm512_shuflr_8( v ) \ _mm512_shuffle_epi8( v, m512_const_64( \ 0x003F3E3D3C3B3A39, 0x3837363534333231, \ 0x302F2E2D2C2B2A29, 0x2827262524232221, \ 0x201F1E1D1C1B1A19. 0x1817161514131211, \ 0x100F0E0D0C0B0A09, 0x0807060504030201 ) ) -#define mm512_rol_1x8( v ) \ +#define mm512_shufll_8( v ) \ _mm512_shuffle_epi8( v, m512_const_64( \ 0x3E3D3C3B3A393837, 0x363534333231302F. 
\ 0x2E2D2C2B2A292827, 0x262524232221201F, \ 0x1E1D1C1B1A191817, 0x161514131211100F, \ 0x0E0D0C0B0A090807, 0x060504030201003F ) ) - -// Invert vector: {3,2,1,0} -> {0,1,2,3} -#define mm512_invert_256( v ) \ - _mm512_permutexvar_epi64( v, m512_const_64( 3,2,1,0,7,6,5,4 ) ) - -#define mm512_invert_128( v ) \ - _mm512_permutexvar_epi64( v, m512_const_64( 1,0,3,2,5,4,7,6 ) ) - -#define mm512_invert_64( v ) \ - _mm512_permutexvar_epi64( v, m512_const_64( 0,1,2,3,4,5,6,7 ) ) - -#define mm512_invert_32( v ) \ - _mm512_permutexvar_epi32( m512_const_64( \ - 0x0000000000000001,0x0000000200000003, \ - 0x0000000400000005,0x0000000600000007, \ - 0x0000000800000009,0x0000000a0000000b, \ - 0x0000000c0000000d,0x0000000e0000000f ), v ) - -#define mm512_invert_16( v ) \ - _mm512_permutexvar_epi16( m512_const_64( \ - 0x0000000100020003, 0x0004000500060007, \ - 0x00080009000A000B, 0x000C000D000E000F, \ - 0x0010001100120013, 0x0014001500160017, \ - 0x00180019001A001B, 0x001C001D001E001F ), v ) - -#define mm512_invert_8( v ) \ - _mm512_shuffle_epi8( v, m512_const_64( \ - 0x0001020304050607, 0x08090A0B0C0D0E0F, \ - 0x1011121314151617, 0x18191A1B1C1D1E1F, \ - 0x2021222324252627, 0x28292A2B2C2D2E2F, \ - 0x3031323334353637, 0x38393A3B3C3D3E3F ) ) - // // Rotate elements within 256 bit lanes of 512 bit vector. +// 128 bit lane shift is handled by bslli bsrli. // Swap hi & lo 128 bits in each 256 bit lane #define mm512_swap256_128( v ) _mm512_permutex_epi64( v, 0x4e ) +#define mm512_shuflr256_128 mm512_swap256_128 +#define mm512_shufll256_128 mm512_swap256_128 // Rotate 256 bit lanes by one 64 bit element -#define mm512_ror256_64( v ) _mm512_permutex_epi64( v, 0x39 ) -#define mm512_rol256_64( v ) _mm512_permutex_epi64( v, 0x93 ) +#define mm512_shuflr256_64( v ) _mm512_permutex_epi64( v, 0x39 ) +#define mm512_shufll256_64( v ) _mm512_permutex_epi64( v, 0x93 ) // Rotate 256 bit lanes by one 32 bit element - -#define mm512_ror256_32( v ) \ +#define mm512_shuflr256_32( v ) \ _mm512_permutexvar_epi32( m512_const_64( \ 0x000000080000000f, 0x0000000e0000000d, \ 0x0000000c0000000b, 0x0000000a00000009, \ 0x0000000000000007, 0x0000000600000005, \ 0x0000000400000003, 0x0000000200000001 ), v ) -#define mm512_rol256_32( v ) \ +#define mm512_shufll256_32( v ) \ _mm512_permutexvar_epi32( m512_const_64( \ 0x0000000e0000000d, 0x0000000c0000000b, \ 0x0000000a00000009, 0x000000080000000f, \ 0x0000000600000005, 0x0000000400000003, \ 0x0000000200000001, 0x0000000000000007 ), v ) -#define mm512_ror256_16( v ) \ +#define mm512_shuflr256_16( v ) \ _mm512_permutexvar_epi16( m512_const_64( \ 0x00100001001e001d, 0x001c001b001a0019, \ 0x0018001700160015, 0x0014001300120011, \ 0x0000000f000e000d, 0x000c000b000a0009, \ 0x0008000700060005, 0x0004000300020001 ), v ) -#define mm512_rol256_16( v ) \ +#define mm512_shufll256_16( v ) \ _mm512_permutexvar_epi16( m512_const_64( \ 0x001e001d001c001b, 0x001a001900180017, \ 0x0016001500140013, 0x001200110010001f, \ 0x000e000d000c000b, 0x000a000900080007, \ 0x0006000500040003, 0x000200010000000f ), v ) -#define mm512_ror256_8( v ) \ +#define mm512_shuflr256_8( v ) \ _mm512_shuffle_epi8( v, m512_const_64( \ 0x203f3e3d3c3b3a39, 0x3837363534333231, \ 0x302f2e2d2c2b2a29, 0x2827262524232221, \ 0x001f1e1d1c1b1a19, 0x1817161514131211, \ - 0x100f0e0d0c0b0a09, 0x0807060504030201 ), v ) + 0x100f0e0d0c0b0a09, 0x0807060504030201 ) ) -#define mm512_rol256_8( v ) \ +#define mm512_shufll256_8( v ) \ _mm512_shuffle_epi8( v, m512_const_64( \ 0x3e3d3c3b3a393837, 0x363534333231302f, \ 0x2e2d2c2b2a292827, 0x262524232221203f, \ 
0x1e1d1c1b1a191817, 0x161514131211100f, \ - 0x0e0d0c0b0a090807, 0x060504030201001f ), v ) - -// -// Rotate elements within 128 bit lanes of 512 bit vector. - -// Swap hi & lo 64 bits in each 128 bit lane -#define mm512_swap128_64( v ) _mm512_shuffle_epi32( v, 0x4e ) - -// Rotate 128 bit lanes by one 32 bit element -#define mm512_ror128_32( v ) _mm512_shuffle_epi32( v, 0x39 ) -#define mm512_rol128_32( v ) _mm512_shuffle_epi32( v, 0x93 ) - -#define mm512_ror128_x8( v, c ) _mm512_alignr_epi8( v, v, c ) - -/* -// Rotate 128 bit lanes by c bytes, faster than building that monstrous -// constant above. -#define mm512_ror128_8( v, c ) \ - _mm512_or_si512( _mm512_bsrli_epi128( v, c ), \ - _mm512_bslli_epi128( v, 16-(c) ) ) -#define mm512_rol128_8( v, c ) \ - _mm512_or_si512( _mm512_bslli_epi128( v, c ), \ - _mm512_bsrli_epi128( v, 16-(c) ) ) -*/ - -// -// Rotate elements within 64 bit lanes. - -#define mm512_rol64_x8( v, c ) _mm512_rol_epi64( v, ((c)<<3) ) -#define mm512_ror64_x8( v, c ) _mm512_ror_epi64( v, ((c)<<3) ) - -// Swap 32 bit elements in each 64 bit lane -#define mm512_swap64_32( v ) _mm512_shuffle_epi32( v, 0xb1 ) - -// Rotate each 64 bit lane by one 16 bit element. -#define mm512_ror64_16( v ) _mm512_ror_epi64( v, 16 ) -#define mm512_rol64_16( v ) _mm512_rol_epi64( v, 16 ) -#define mm512_ror64_8( v ) _mm512_ror_epi64( v, 8 ) -#define mm512_rol64_8( v ) _mm512_rol_epi64( v, 8 ) - -// -// Rotate elements within 32 bit lanes. - -#define mm512_rol32_x8( v, c ) _mm512_rol_epi32( v, ((c)<<2) ) -#define mm512_ror32_x8( v, c ) _mm512_ror_epi32( v, ((c)<<2) ) - + 0x0e0d0c0b0a090807, 0x060504030201001f ) ) // -// Rotate elements from 2 512 bit vectors in place, source arguments -// are overwritten. - -#define mm512_swap1024_512(v1, v2) \ - v1 = _mm512_xor_si512(v1, v2); \ - v2 = _mm512_xor_si512(v1, v2); \ - v1 = _mm512_xor_si512(v1, v2); +// Shuffle/rotate elements within 128 bit lanes of 512 bit vector. + +// Limited 2 input, 1 output shuffle within 128 bit lanes. 
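Before the 128 bit lane shuffles below, a scalar model of the byte shuffle tables used above: mm512_shuflr256_8 is an ordinary byte permutation, so the m512_const_64 index table just lists, for each destination byte, the source byte within the same 256 bit lane. Plain C, illustrative only:

#include <stdint.h>
#include <stdio.h>

int main()
{
   uint8_t src[64], dst[64];
   for ( int i = 0; i < 64; i++ ) src[i] = (uint8_t) i;

   // two independent 32 byte lanes, each rotated right by one byte position
   for ( int lane = 0; lane < 64; lane += 32 )
      for ( int i = 0; i < 32; i++ )
         dst[ lane + i ] = src[ lane + ( ( i + 1 ) & 31 ) ];

   for ( int i = 0; i < 64; i++ )
      printf( "%02x%s", dst[i], ( i % 16 == 15 ) ? "\n" : " " );
   return 0;
}

dst[i] = src[(i+1) & 31] inside each 32 byte lane is exactly what the index constants above encode.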
+#define mm512_shuffle2_64( a, b, c ) \ + _mm512_castpd_si512( _mm512_shuffle_pd( _mm512_castsi512_pd( a ), \ + _mm512_castsi512_pd( b ), c ) ); -#define mm512_ror1024_256( v1, v2 ) \ -do { \ - __m512i t = _mm512_alignr_epi64( v1, v2, 4 ); \ - v1 = _mm512_alignr_epi64( v2, v1, 4 ); \ - v2 = t; \ -} while(0) +#define mm512_shuffle2_32( a, b, c ) \ + _mm512_castps_si512( _mm512_shuffle_ps( _mm512_castsi512_ps( a ), \ + _mm512_castsi512_ps( b ), c ) ); -#define mm512_rol1024_256( v1, v2 ) \ -do { \ - __m512i t = _mm512_alignr_epi64( v1, v2, 4 ); \ - v2 = _mm512_alignr_epi64( v2, v1, 4 ); \ - v1 = t; \ -} while(0) +// Swap 64 bits in each 128 bit lane +#define mm512_swap128_64( v ) _mm512_shuffle_epi32( v, 0x4e ) +#define mm512_shuflr128_64 mm512_swap128_64 +#define mm512_shufll128_64 mm512_swap128_64 -#define mm512_ror1024_128( v1, v2 ) \ -do { \ - __m512i t = _mm512_alignr_epi64( v1, v2, 2 ); \ - v1 = _mm512_alignr_epi64( v2, v1, 2 ); \ - v2 = t; \ -} while(0) - -#define mm512_rol1024_128( v1, v2 ) \ -do { \ - __m512i t = _mm512_alignr_epi64( v1, v2, 6 ); \ - v2 = _mm512_alignr_epi64( v2, v1, 6 ); \ - v1 = t; \ -} while(0) - -#define mm512_ror1024_64( v1, v2 ) \ -do { \ - __m512i t = _mm512_alignr_epi64( v1, v2, 1 ); \ - v1 = _mm512_alignr_epi64( v2, v1, 1 ); \ - v2 = t; \ -} while(0) - -#define mm512_rol1024_64( v1, v2 ) \ -do { \ - __m512i t = _mm512_alignr_epi64( v1, v2, 7 ); \ - v2 = _mm512_alignr_epi64( v2, v1, 7 ); \ - v1 = t; \ -} while(0) - -#define mm512_ror1024_32( v1, v2 ) \ -do { \ - __m512i t = _mm512_alignr_epi32( v1, v2, 1 ); \ - v1 = _mm512_alignr_epi32( v2, v1, 1 ); \ - v2 = t; \ -} while(0) - -#define mm512_rol1024_32( v1, v2 ) \ -do { \ - __m512i t = _mm512_alignr_epi32( v1, v2, 15 ); \ - v2 = _mm512_alignr_epi32( v2, v1, 15 ); \ - v1 = t; \ -} while(0) +// Rotate 128 bit lanes by one 32 bit element +#define mm512_shuflr128_32( v ) _mm512_shuffle_epi32( v, 0x39 ) +#define mm512_shufll128_32( v ) _mm512_shuffle_epi32( v, 0x93 ) + +// Rotate right 128 bit lanes by c bytes, versatile and just as fast +static inline __m512i mm512_shuflr128_8( const __m512i v, const int c ) +{ return _mm512_alignr_epi8( v, v, c ); } + +// Swap 32 bits in each 64 bit lane. Can be done with rotate instruction +// but only with AVX512. Shuffle is just as fast and availble with AVX2 +// & SSE2. +#define mm512_swap64_32( v ) _mm512_shuffle_epi32( v, 0xb1 ) +#define mm512_shuflr64_32 mm512_swap64_32 +#define mm512_shufll64_32 mm512_swap64_32 + +// Need good way to distinguish 1 input shuffles, 2 input shuffle functions, +// and 2 input 2 output shuffle macros. +// +// shuflr is 1 input +// shufl2r is 2 input ... +// Drop macros? They can easilly be rebuilt using shufl2 functions + +// 2 input, 1 output +// Shuffle concatenated { v1, v2 ) right or left by 256 bits and return +// rotated v1 +// visually confusing for shif2r because of arg order. First arg is always +// the target for modification, either update by reference or by function +// return. 
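The two input shuffles described above, and the shufl2 macros that follow, are thin wrappers around valignq: the two vectors are concatenated, the 16 element result is shifted right by n elements, and the low 8 elements are returned. A scalar model in plain C, names illustrative only:

#include <stdint.h>
#include <stdio.h>

// model of _mm512_alignr_epi64( hi, lo, n ) for 0 <= n <= 7
static void alignr64_model( uint64_t out[8], const uint64_t hi[8],
                            const uint64_t lo[8], int n )
{
   uint64_t cat[16];
   for ( int i = 0; i < 8; i++ ) { cat[i] = lo[i]; cat[i+8] = hi[i]; }
   for ( int i = 0; i < 8; i++ ) out[i] = cat[i+n];
}

int main()
{
   uint64_t v1[8], v2[8], r[8];
   for ( int i = 0; i < 8; i++ ) { v1[i] = 0x10 + i; v2[i] = 0x20 + i; }
   alignr64_model( r, v2, v1, 1 );               // models mm512_shufl2r_64( v1, v2 )
   for ( int i = 0; i < 8; i++ ) printf( "%02llx ", (unsigned long long) r[i] );
   printf( "\n" );                               // prints: 11 12 13 14 15 16 17 20
   return 0;
}

So mm512_shufl2r_64( v1, v2 ) returns v1 shifted right one element with v2's lowest element filling the top slot, the "rotate the concatenation and keep the v1 half" behaviour described above.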
+#define mm512_shufl2r_256( v1, v2 ) _mm512_alignr_epi64( v2, v1, 4 ) +#define mm512_shufl2l_256( v1, v2 ) _mm512_alignr_epi64( v1, v2, 4 ) + +#define mm512_shufl2r_128( v1, v2 ) _mm512_alignr_epi64( v2, v1, 2 ) +#define mm512_shufl2l_128( v1, v2 ) _mm512_alignr_epi64( v1, v2, 2 ) + +#define mm512_shufl2r_64( v1, v2 ) _mm512_alignr_epi64( v2, v1, 1 ) +#define mm512_shufl2l_64( v1, v2 ) _mm512_alignr_epi64( v1, v2, 1 ) + +#define mm512_shufl2r_32( v1, v2 ) _mm512_alignr_epi32( v2, v1, 1 ) +#define mm512_shufl2l_32( v1, v2 ) _mm512_alignr_epi32( v1, v2, 1 ) #endif // AVX512 #endif // SIMD_512_H__ diff --git a/simd-utils/simd-64.h b/simd-utils/simd-64.h index 2f50ec1a..31b0b89a 100644 --- a/simd-utils/simd-64.h +++ b/simd-utils/simd-64.h @@ -1,18 +1,18 @@ #if !defined(SIMD_64_H__) #define SIMD_64_H__ 1 -#if defined(__MMX__) +#if defined(__MMX__) && defined(__SSE__) //////////////////////////////////////////////////////////////// // // 64 bit MMX vectors. // -// There are rumours MMX wil be removed. Although casting with int64 -// works there is likely some overhead to move the data to An MMX register -// and back. - +// This code is not used anywhere annd likely never will. It's intent was +// to support 2 way parallel hashing using SSE2 for 64 bit, and MMX for 32 +// bit hash functions, but was never implemented. // Pseudo constants + /* #define m64_zero _mm_setzero_si64() #define m64_one_64 _mm_set_pi32( 0UL, 1UL ) @@ -30,79 +30,67 @@ #define casti_m64(p,i) (((__m64*)(p))[(i)]) -// cast all arguments as the're likely to be uint64_t - // Bitwise not: ~(a) //#define mm64_not( a ) _mm_xor_si64( (__m64)a, m64_neg1 ) #define mm64_not( a ) ( (__m64)( ~( (uint64_t)(a) ) ) // Unary negate elements -#define mm64_negate_32( v ) _mm_sub_pi32( m64_zero, (__m64)v ) -#define mm64_negate_16( v ) _mm_sub_pi16( m64_zero, (__m64)v ) -#define mm64_negate_8( v ) _mm_sub_pi8( m64_zero, (__m64)v ) +#define mm64_negate_32( v ) _mm_sub_pi32( m64_zero, v ) +#define mm64_negate_16( v ) _mm_sub_pi16( m64_zero, v ) +#define mm64_negate_8( v ) _mm_sub_pi8( m64_zero, v ) // Rotate bits in packed elements of 64 bit vector #define mm64_rol_64( a, n ) \ - _mm_or_si64( _mm_slli_si64( (__m64)(a), n ), \ - _mm_srli_si64( (__m64)(a), 64-(n) ) ) + _mm_or_si64( _mm_slli_si64( a, n ), \ + _mm_srli_si64( a, 64-(n) ) ) #define mm64_ror_64( a, n ) \ - _mm_or_si64( _mm_srli_si64( (__m64)(a), n ), \ - _mm_slli_si64( (__m64)(a), 64-(n) ) ) + _mm_or_si64( _mm_srli_si64( a, n ), \ + _mm_slli_si64( a, 64-(n) ) ) #define mm64_rol_32( a, n ) \ - _mm_or_si64( _mm_slli_pi32( (__m64)(a), n ), \ - _mm_srli_pi32( (__m64)(a), 32-(n) ) ) + _mm_or_si64( _mm_slli_pi32( a, n ), \ + _mm_srli_pi32( a, 32-(n) ) ) #define mm64_ror_32( a, n ) \ - _mm_or_si64( _mm_srli_pi32( (__m64)(a), n ), \ - _mm_slli_pi32( (__m64)(a), 32-(n) ) ) + _mm_or_si64( _mm_srli_pi32( a, n ), \ + _mm_slli_pi32( a, 32-(n) ) ) #define mm64_rol_16( a, n ) \ - _mm_or_si64( _mm_slli_pi16( (__m64)(a), n ), \ - _mm_srli_pi16( (__m64)(a), 16-(n) ) ) + _mm_or_si64( _mm_slli_pi16( a, n ), \ + _mm_srli_pi16( a, 16-(n) ) ) #define mm64_ror_16( a, n ) \ - _mm_or_si64( _mm_srli_pi16( (__m64)(a), n ), \ - _mm_slli_pi16( (__m64)(a), 16-(n) ) ) + _mm_or_si64( _mm_srli_pi16( a, n ), \ + _mm_slli_pi16( a, 16-(n) ) ) // Rotate packed elements accross lanes. Useful for byte swap and byte // rotation. -// _mm_shuffle_pi8 requires SSSE3 while _mm_shuffle_pi16 requires SSE -// even though these are MMX instructions. - // Swap hi & lo 32 bits. 
-#define mm64_swap32( a ) _mm_shuffle_pi16( (__m64)(a), 0x4e ) +#define mm64_swap_32( a ) _mm_shuffle_pi16( a, 0x4e ) -#define mm64_ror1x16_64( a ) _mm_shuffle_pi16( (__m64)(a), 0x39 ) -#define mm64_rol1x16_64( a ) _mm_shuffle_pi16( (__m64)(a), 0x93 ) +#define mm64_shulfr_16( a ) _mm_shuffle_pi16( a, 0x39 ) +#define mm64_shufll_16( a ) _mm_shuffle_pi16( a, 0x93 ) // Swap hi & lo 16 bits of each 32 bit element -#define mm64_swap16_32( a ) _mm_shuffle_pi16( (__m64)(a), 0xb1 ) +#define mm64_swap32_16( a ) _mm_shuffle_pi16( a, 0xb1 ) #if defined(__SSSE3__) // Endian byte swap packed elements -// A vectorized version of the u64 bswap, use when data already in MMX reg. -#define mm64_bswap_64( v ) \ - _mm_shuffle_pi8( (__m64)v, (__m64)0x0001020304050607 ) - #define mm64_bswap_32( v ) \ - _mm_shuffle_pi8( (__m64)v, (__m64)0x0405060700010203 ) + _mm_shuffle_pi8( v, (__m64)0x0405060700010203 ) #define mm64_bswap_16( v ) \ - _mm_shuffle_pi8( (__m64)v, (__m64)0x0607040502030001 ); + _mm_shuffle_pi8( v, (__m64)0x0607040502030001 ); -#else +// Rotate right by c bytes +static inline __m64 mm64_vror_x8( __m64 v, const int c ) +{ return _mm_alignr_pi8( v, v, c ); } -#define mm64_bswap_64( v ) \ - (__m64)__builtin_bswap64( (uint64_t)v ) +#else -// These exist only for compatibility with CPUs without SSSE3. MMX doesn't -// have extract 32 instruction so pointers are needed to access elements. -// It' more efficient for the caller to use scalar variables and call -// bswap_32 directly. #define mm64_bswap_32( v ) \ _mm_set_pi32( __builtin_bswap32( ((uint32_t*)&v)[1] ), \ __builtin_bswap32( ((uint32_t*)&v)[0] ) ) @@ -115,17 +103,6 @@ #endif -// 64 bit mem functions use integral sizes instead of bytes, data must -// be aligned to 64 bits. -static inline void memcpy_m64( __m64 *dst, const __m64 *src, int n ) -{ for ( int i = 0; i < n; i++ ) dst[i] = src[i]; } - -static inline void memset_zero_m64( __m64 *src, int n ) -{ for ( int i = 0; i < n; i++ ) src[i] = (__m64)0ULL; } - -static inline void memset_m64( __m64 *dst, const __m64 a, int n ) -{ for ( int i = 0; i < n; i++ ) dst[i] = a; } - #endif // MMX #endif // SIMD_64_H__ diff --git a/simd-utils/simd-int.h b/simd-utils/simd-int.h index 711134c8..58caa3e5 100644 --- a/simd-utils/simd-int.h +++ b/simd-utils/simd-int.h @@ -1,69 +1,28 @@ #if !defined(SIMD_INT_H__) #define SIMD_INT_H__ 1 -/////////////////////////////////// -// -// Integers up to 128 bits. -// -// These utilities enhance support for integers up to 128 bits. -// All standard operations are supported on 128 bit integers except -// numeric constant representation and IO. 128 bit integers must be built -// and displayed as 2 64 bit halves, just like the old times. -// -// Some utilities are also provided for smaller integers, most notably -// bit rotation. - - - -// MMX has no extract instruction for 32 bit elements so this: -// Lo is trivial, high is a simple shift. -// Input may be uint64_t or __m64, returns uint32_t. -#define u64_extr_lo32(a) ( (uint32_t)( (uint64_t)(a) ) ) -#define u64_extr_hi32(a) ( (uint32_t)( ((uint64_t)(a)) >> 32) ) - -#define u64_extr_32( a, n ) ( (uint32_t)( (a) >> ( ( 2-(n)) <<5 ) ) ) -#define u64_extr_16( a, n ) ( (uint16_t)( (a) >> ( ( 4-(n)) <<4 ) ) ) -#define u64_extr_8( a, n ) ( (uint8_t) ( (a) >> ( ( 8-(n)) <<3 ) ) ) - -// Rotate bits in various sized integers. 
-#define u64_ror_64( x, c ) \ - (uint64_t)( ( (uint64_t)(x) >> (c) ) | ( (uint64_t)(x) << (64-(c)) ) ) -#define u64_rol_64( x, c ) \ - (uint64_t)( ( (uint64_t)(x) << (c) ) | ( (uint64_t)(x) >> (64-(c)) ) ) -#define u32_ror_32( x, c ) \ - (uint32_t)( ( (uint32_t)(x) >> (c) ) | ( (uint32_t)(x) << (32-(c)) ) ) -#define u32_rol_32( x, c ) \ - (uint32_t)( ( (uint32_t)(x) << (c) ) | ( (uint32_t)(x) >> (32-(c)) ) ) -#define u16_ror_16( x, c ) \ - (uint16_t)( ( (uint16_t)(x) >> (c) ) | ( (uint16_t)(x) << (16-(c)) ) ) -#define u16_rol_16( x, c ) \ - (uint16_t)( ( (uint16_t)(x) << (c) ) | ( (uint16_t)(x) >> (16-(c)) ) ) -#define u8_ror_8( x, c ) \ - (uint8_t) ( ( (uint8_t) (x) >> (c) ) | ( (uint8_t) (x) << ( 8-(c)) ) ) -#define u8_rol_8( x, c ) \ - (uint8_t) ( ( (uint8_t) (x) << (c) ) | ( (uint8_t) (x) >> ( 8-(c)) ) ) - // Endian byte swap -#define bswap_64( a ) __builtin_bswap64( a ) -#define bswap_32( a ) __builtin_bswap32( a ) - -// 64 bit mem functions use integral sizes instead of bytes, data must -// be aligned to 64 bits. Mostly for scaled indexing convenience. -static inline void memcpy_64( uint64_t *dst, const uint64_t *src, int n ) -{ for ( int i = 0; i < n; i++ ) dst[i] = src[i]; } +#define bswap_64 __builtin_bswap64 +#define bswap_32 __builtin_bswap32 -static inline void memset_zero_64( uint64_t *src, int n ) -{ for ( int i = 0; i < n; i++ ) src[i] = 0ull; } +// Bit rotation +#define rol64 __rolq +#define ror64 __rorq +#define rol32 __rold +#define ror32 __rord -static inline void memset_64( uint64_t *dst, const uint64_t a, int n ) -{ for ( int i = 0; i < n; i++ ) dst[i] = a; } +// Safe division, integer or floating point. For floating point it's as +// safe as 0 is precisely zero. +// Returns safe_result if division by zero, typically zero. +#define safe_div( dividend, divisor, safe_result ) \ + ( (divisor) == 0 ? safe_result : ( (dividend) / (divisor) ) ) /////////////////////////////////////// // // 128 bit integers // -// 128 bit integers are inneficient and not a shortcut for __m128i. +// 128 bit integers are inneficient and not a shortcut for __m128i. // Native type __int128 supported starting with GCC-4.8. // // __int128 uses two 64 bit GPRs to hold the data. The main benefits are @@ -78,12 +37,14 @@ static inline void memset_64( uint64_t *dst, const uint64_t a, int n ) // __m256i v256 = _mm256_set_m128i( (__m128i)my_int128, (__m128i)my_int128 ); // my_int128 = (uint128_t)_mm256_extracti128_si256( v256, 1 ); +// obsolete test // Compiler check for __int128 support // Configure also has a test for int128. #if ( __GNUC__ > 4 ) || ( ( __GNUC__ == 4 ) && ( __GNUC_MINOR__ >= 8 ) ) #define GCC_INT128 1 #endif +// obsolte test #if !defined(GCC_INT128) #warning "__int128 not supported, requires GCC-4.8 or newer." #endif @@ -94,31 +55,12 @@ static inline void memset_64( uint64_t *dst, const uint64_t a, int n ) typedef __int128 int128_t; typedef unsigned __int128 uint128_t; - - -// Maybe usefull for making constants. -#define mk_uint128( hi, lo ) \ - ( ( (uint128_t)(hi) << 64 ) | ( (uint128_t)(lo) ) ) - - // Extracting the low bits is a trivial cast. // These specialized functions are optimized while providing a // consistent interface. #define u128_hi64( x ) ( (uint64_t)( (uint128_t)(x) >> 64 ) ) #define u128_lo64( x ) ( (uint64_t)(x) ) -// Generic extract, don't use for extracting low bits, cast instead. 
-#define u128_extr_64( a, n ) ( (uint64_t)( (a) >> ( ( 2-(n)) <<6 ) ) ) -#define u128_extr_32( a, n ) ( (uint32_t)( (a) >> ( ( 4-(n)) <<5 ) ) ) -#define u128_extr_16( a, n ) ( (uint16_t)( (a) >> ( ( 8-(n)) <<4 ) ) ) -#define u128_extr_8( a, n ) ( (uint8_t) ( (a) >> ( (16-(n)) <<3 ) ) ) - -// Not much need for this but it fills a gap. -#define u128_ror_128( x, c ) \ - ( ( (uint128_t)(x) >> (c) ) | ( (uint128_t)(x) << (128-(c)) ) ) -#define u128_rol_128( x, c ) \ - ( ( (uint128_t)(x) << (c) ) | ( (uint128_t)(x) >> (128-(c)) ) ) - #endif // GCC_INT128 #endif // SIMD_INT_H__ diff --git a/sysinfos.c b/sysinfos.c index 010c78f4..999df9fe 100644 --- a/sysinfos.c +++ b/sysinfos.c @@ -209,7 +209,7 @@ static inline void cpu_getname(char *outbuf, size_t maxsz) { memset(outbuf, 0, maxsz); #ifdef WIN32 - char brand[0xC0] = { 0 }; + char brand[256] = { 0 }; int output[4] = { 0 }, ext; cpuid(0x80000000, output); ext = output[0]; @@ -218,7 +218,7 @@ static inline void cpu_getname(char *outbuf, size_t maxsz) for (int i = 2; i <= (ext & 0xF); i++) { cpuid(0x80000000+i, output); - memcpy(&brand[(i-2) * 4*sizeof(int)], output, 4*sizeof(int)); + memcpy(&brand[(i-2) * 4*sizeof(int)], output, 4*sizeof(int)); } snprintf(outbuf, maxsz, "%s", brand); } diff --git a/util.c b/util.c index 5df1eb93..b746ef9a 100644 --- a/util.c +++ b/util.c @@ -47,6 +47,7 @@ //#include "miner.h" #include "elist.h" #include "algo-gate-api.h" +#include "algo/sha/sha256d.h" //extern pthread_mutex_t stats_lock; @@ -129,17 +130,19 @@ void applog2( int prio, const char *fmt, ... ) // localtime_r(&now, &tm); - switch (prio) { + switch ( prio ) + { + case LOG_CRIT: color = CL_LRD; break; case LOG_ERR: color = CL_RED; break; - case LOG_WARNING: color = CL_YLW; break; + case LOG_WARNING: color = CL_YL2; break; + case LOG_MAJR: color = CL_YL2; break; case LOG_NOTICE: color = CL_WHT; break; case LOG_INFO: color = ""; break; case LOG_DEBUG: color = CL_GRY; break; - - case LOG_BLUE: - prio = LOG_NOTICE; - color = CL_CYN; - break; + case LOG_MINR: color = CL_YLW; break; + case LOG_GREEN: color = CL_GRN; prio = LOG_INFO; break; + case LOG_BLUE: color = CL_CYN; prio = LOG_NOTICE; break; + case LOG_PINK: color = CL_LMA; prio = LOG_NOTICE; break; } if (!use_colors) color = ""; @@ -206,17 +209,19 @@ void applog(int prio, const char *fmt, ...) 
localtime_r(&now, &tm); - switch (prio) { - case LOG_ERR: color = CL_RED; break; - case LOG_WARNING: color = CL_YLW; break; + switch ( prio ) + { + case LOG_CRIT: color = CL_LRD; break; + case LOG_ERR: color = CL_RED; break; + case LOG_WARNING: color = CL_YL2; break; + case LOG_MAJR: color = CL_YL2; break; case LOG_NOTICE: color = CL_WHT; break; - case LOG_INFO: color = ""; break; + case LOG_INFO: color = ""; break; case LOG_DEBUG: color = CL_GRY; break; - - case LOG_BLUE: - prio = LOG_NOTICE; - color = CL_CYN; - break; + case LOG_MINR: color = CL_YLW; break; + case LOG_GREEN: color = CL_GRN; prio = LOG_INFO; break; + case LOG_BLUE: color = CL_CYN; prio = LOG_NOTICE; break; + case LOG_PINK: color = CL_LMA; prio = LOG_NOTICE; break; } if (!use_colors) color = ""; @@ -303,6 +308,29 @@ void format_hashrate(double hashrate, char *output) ); } +// For use with MiB etc +void format_number_si( double* n, char* si_units ) +{ + if ( *n < 1024*10 ) { *si_units = 0; return; } + *n /= 1024; + if ( *n < 1024*10 ) { *si_units = 'k'; return; } + *n /= 1024; + if ( *n < 1024*10 ) { *si_units = 'M'; return; } + *n /= 1024; + if ( *n < 1024*10 ) { *si_units = 'G'; return; } + *n /= 1024; + if ( *n < 1024*10 ) { *si_units = 'T'; return; } + *n /= 1024; + if ( *n < 1024*10 ) { *si_units = 'P'; return; } + *n /= 1024; + if ( *n < 1024*10 ) { *si_units = 'E'; return; } + *n /= 1024; + if ( *n < 1024*10 ) { *si_units = 'Z'; return; } + *n /= 1024; + *si_units = 'Y'; +} + + /* Modify the representation of integer numbers which would cause an overflow * so that they are treated as floating-point numbers. * This is a hack to overcome the limitations of some versions of Jansson. */ @@ -795,6 +823,15 @@ char *abin2hex(const unsigned char *p, size_t len) return s; } +char *bebin2hex(const unsigned char *p, size_t len) +{ + char *s = (char*) malloc((len * 2) + 1); + if (!s) return NULL; + for ( size_t i = 0, j = len - 1; i < len; i++, j-- ) + sprintf( s + ( i*2 ), "%02x", (unsigned int) p[ j ] ); + return s; +} + bool hex2bin(unsigned char *p, const char *hexstr, size_t len) { char hex_byte[3]; @@ -943,6 +980,140 @@ bool jobj_binary(const json_t *obj, const char *key, void *buf, size_t buflen) return true; } +static uint32_t bech32_polymod_step(uint32_t pre) { + uint8_t b = pre >> 25; + return ((pre & 0x1FFFFFF) << 5) ^ + (-((b >> 0) & 1) & 0x3b6a57b2UL) ^ + (-((b >> 1) & 1) & 0x26508e6dUL) ^ + (-((b >> 2) & 1) & 0x1ea119faUL) ^ + (-((b >> 3) & 1) & 0x3d4233ddUL) ^ + (-((b >> 4) & 1) & 0x2a1462b3UL); +} + +static const int8_t bech32_charset_rev[128] = { + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + 15, -1, 10, 17, 21, 20, 26, 30, 7, 5, -1, -1, -1, -1, -1, -1, + -1, 29, -1, 24, 13, 25, 9, 8, 23, -1, 18, 22, 31, 27, 19, -1, + 1, 0, 3, 16, 11, 28, 12, 14, 6, 4, 2, -1, -1, -1, -1, -1, + -1, 29, -1, 24, 13, 25, 9, 8, 23, -1, 18, 22, 31, 27, 19, -1, + 1, 0, 3, 16, 11, 28, 12, 14, 6, 4, 2, -1, -1, -1, -1, -1 +}; + +static bool bech32_decode(char *hrp, uint8_t *data, size_t *data_len, const char *input) { + uint32_t chk = 1; + size_t i; + size_t input_len = strlen(input); + size_t hrp_len; + int have_lower = 0, have_upper = 0; + if (input_len < 8 || input_len > 90) { + return false; + } + *data_len = 0; + while (*data_len < input_len && input[(input_len - 1) - *data_len] != '1') { + ++(*data_len); + } + hrp_len = input_len - (1 + *data_len); + if (1 + *data_len >= input_len 
|| *data_len < 6) { + return false; + } + *(data_len) -= 6; + for (i = 0; i < hrp_len; ++i) { + int ch = input[i]; + if (ch < 33 || ch > 126) { + return false; + } + if (ch >= 'a' && ch <= 'z') { + have_lower = 1; + } else if (ch >= 'A' && ch <= 'Z') { + have_upper = 1; + ch = (ch - 'A') + 'a'; + } + hrp[i] = ch; + chk = bech32_polymod_step(chk) ^ (ch >> 5); + } + hrp[i] = 0; + chk = bech32_polymod_step(chk); + for (i = 0; i < hrp_len; ++i) { + chk = bech32_polymod_step(chk) ^ (input[i] & 0x1f); + } + ++i; + while (i < input_len) { + int v = (input[i] & 0x80) ? -1 : bech32_charset_rev[(int)input[i]]; + if (input[i] >= 'a' && input[i] <= 'z') have_lower = 1; + if (input[i] >= 'A' && input[i] <= 'Z') have_upper = 1; + if (v == -1) { + return false; + } + chk = bech32_polymod_step(chk) ^ v; + if (i + 6 < input_len) { + data[i - (1 + hrp_len)] = v; + } + ++i; + } + if (have_lower && have_upper) { + return false; + } + return chk == 1; +} + +static bool convert_bits(uint8_t *out, size_t *outlen, int outbits, const uint8_t *in, size_t inlen, int inbits, int pad) { + uint32_t val = 0; + int bits = 0; + uint32_t maxv = (((uint32_t)1) << outbits) - 1; + while (inlen--) { + val = (val << inbits) | *(in++); + bits += inbits; + while (bits >= outbits) { + bits -= outbits; + out[(*outlen)++] = (val >> bits) & maxv; + } + } + if (pad) { + if (bits) { + out[(*outlen)++] = (val << (outbits - bits)) & maxv; + } + } else if (((val << (outbits - bits)) & maxv) || bits >= inbits) { + return false; + } + return true; +} + +static bool segwit_addr_decode(int *witver, uint8_t *witdata, size_t *witdata_len, const char *addr) { + uint8_t data[84]; + char hrp_actual[84]; + size_t data_len; + if (!bech32_decode(hrp_actual, data, &data_len, addr)) return false; + if (data_len == 0 || data_len > 65) return false; + if (data[0] > 16) return false; + *witdata_len = 0; + if (!convert_bits(witdata, witdata_len, 8, data + 1, data_len - 1, 5, 0)) return false; + if (*witdata_len < 2 || *witdata_len > 40) return false; + if (data[0] == 0 && *witdata_len != 20 && *witdata_len != 32) return false; + *witver = data[0]; + return true; +} + +static size_t bech32_to_script(uint8_t *out, size_t outsz, const char *addr) { + uint8_t witprog[40]; + size_t witprog_len; + int witver; + + if (!segwit_addr_decode(&witver, witprog, &witprog_len, addr)) + return 0; + if (outsz < witprog_len + 2) + return 0; + out[0] = witver ? (0x50 + witver) : 0; + out[1] = witprog_len; + memcpy(out + 2, witprog, witprog_len); + + if ( opt_debug ) + applog( LOG_INFO, "Coinbase address uses Bech32 coding"); + + return witprog_len + 2; +} + size_t address_to_script( unsigned char *out, size_t outsz, const char *addr ) { unsigned char addrbin[ pk_buffer_size_max ]; @@ -950,12 +1121,15 @@ size_t address_to_script( unsigned char *out, size_t outsz, const char *addr ) size_t rv; if ( !b58dec( addrbin, outsz, addr ) ) - return 0; + return bech32_to_script( out, outsz, addr ); addrver = b58check( addrbin, outsz, addr ); if ( addrver < 0 ) return 0; + if ( opt_debug ) + applog( LOG_INFO, "Coinbase address uses B58 coding"); + switch ( addrver ) { case 5: /* Bitcoin script hash */ @@ -1048,53 +1222,51 @@ bool fulltest( const uint32_t *hash, const uint32_t *target ) return rc; } -// Mathmatically the difficulty is simply the reciprocal of the hash. +// Mathmatically the difficulty is simply the reciprocal of the hash: d = 1/h. 
 // Both are real numbers but the hash (target) is represented as a 256 bit
-// number with the upper 32 bits representing the whole integer part and the
-// lower 224 bits representing the fractional part:
+// fixed point number with the upper 32 bits representing the whole integer
+// part and the lower 224 bits representing the fractional part:
 //    target[ 255:224 ] = trunc( 1/diff )
 //    target[ 223:  0 ] = frac( 1/diff )
 //
 // The 256 bit hash is exact but any floating point representation is not.
-// Stratum provides the target difficulty as double precision, inexcact, and
+// Stratum provides the target difficulty as double precision, inexact,
 // which must be converted to a hash target. The converted hash target will
-// likely be less precise to to inexact input and conversion error.
-// converted to 256 bit hash which will also be inexact and likelyless
-// accurate to to error in conversion.
+// likely be less precise due to inexact input and conversion error.
 // On the other hand getwork provides a 256 bit hash target which is exact.
 //
 // How much precision is needed?
 //
-// 128 bit types are implemented in software by the compiler using 64 bit
+// 128 bit types are implemented in software by the compiler on 64 bit
 // hardware resulting in lower performance and more error than would be
-// expected with a hardware 128 bit implementtaion.
+// expected with a hardware 128 bit implementation.
 // Float80 exploits the internals of the FP unit which provide a 64 bit
 // mantissa in an 80 bit register with hardware rounding. When the destination
 // is double the data is rounded to float64 format. Long double returns all
 // 80 bits without rounding and including any accumulated computation error.
 // Float80 does not fit efficiently in memory.
 //
-// 256 bit hash:  76
+// Significant digits:
+// 256 bit hash:  76
 // float:          7   (float32, 80 bits with rounding to 32 bits)
 // double:        15   (float64, 80 bits with rounding to 64 bits)
-// long double    19   (float80, 80 bits with no rounding)
-// __float128     33   (128 bits with no rounding)
+// long double:   19   (float80, 80 bits with no rounding)
+// __float128:    33   (128 bits with no rounding)
 // uint32_t:       9
 // uint64_t:      19
 // uint128_t      38
 //
 // The concept of significant digits doesn't apply to the 256 bit hash
-// representation. It's fixed point making leading zeros significant
-// Leading zeros count in the 256 bit
+// representation. It's fixed point, making leading zeros significant and
+// limiting its range and precision due to fewer non-zero significant digits.
 //
 // Doing calculations with float128 and uint128 increases precision for
 // target_to_diff, but doesn't help with stratum diff being limited to
 // double precision. Is the extra precision really worth the extra cost?
-//
-// With double the error rate is 1/1e15, or one hash in every Petahash
-// with a very low difficulty, not a likely sitiation. Higher difficulty
-// increases the effective precision. Due to the floating nature of the
-// decimal point leading zeros aren't counted.
+// With float128 the error rate is 1/1e33 compared with 1/1e15 for double.
+// For double that's 1 error in every petahash with a very low difficulty,
+// not a likely situation. With higher difficulty effective precision
+// increases.
 //
 // Unfortunately I can't get float128 to work so long double (float80) is
 // as precise as it gets.
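A minimal sketch of the 32.224 fixed point layout just described, using long double as suggested; it only illustrates the split into one integer word and 224 fraction bits, it is not the conversion routine the miner ships:

#include <stdint.h>
#include <stdio.h>
#include <math.h>

// target is stored as 8 little-endian 32 bit words, target[7] most significant,
// mirroring the layout described above.
static void diff_to_target_sketch( uint32_t target[8], long double diff )
{
   long double x = 1.0L / diff;              // the real valued target
   for ( int i = 7; i >= 0; i-- )
   {
      long double w = floorl( x );           // next 32 bit fixed point digit
      target[i] = (uint32_t) w;
      x = ( x - w ) * 4294967296.0L;         // bring the next 32 bits above the point
   }
}

int main()
{
   uint32_t t[8];
   diff_to_target_sketch( t, 1.0L );
   for ( int i = 7; i >= 0; i-- ) printf( "%08x", t[i] );   // 00000001 then all zeros
   printf( "\n" );
   return 0;
}

For difficulty 1 the top word is 1 and the fraction is zero; higher difficulty pushes the first non-zero digits down into the fraction, which is why leading zeros matter for the effective precision.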
@@ -1486,11 +1658,8 @@ static bool stratum_parse_extranonce(struct stratum_ctx *sctx, json_t *params, i pthread_mutex_unlock(&sctx->work_lock); if ( !opt_quiet ) /* pool dynamic change */ - applog( LOG_INFO, "Stratum extranonce1= %s, extranonce2 size= %d", + applog( LOG_INFO, "Stratum extranonce1 0x%s, extranonce2 size %d", xnonce1, xn2_size); -// if (pndx == 0 && opt_debug) -// applog(LOG_DEBUG, "Stratum set nonce %s with extranonce2 size=%d", -// xnonce1, xn2_size); return true; out: @@ -1640,8 +1809,6 @@ bool stratum_authorize(struct stratum_ctx *sctx, const char *user, const char *p opt_extranonce = false; goto out; } - if ( !opt_quiet ) - applog( LOG_INFO, "Extranonce subscription enabled" ); sret = stratum_recv_line( sctx ); if ( sret ) @@ -1659,10 +1826,14 @@ bool stratum_authorize(struct stratum_ctx *sctx, const char *user, const char *p if ( !stratum_handle_method( sctx, sret ) ) applog( LOG_WARNING, "Stratum answer id is not correct!" ); } - res_val = json_object_get( extra, "result" ); -// if (opt_debug && (!res_val || json_is_false(res_val))) -// applog(LOG_DEBUG, "extranonce subscribe not supported"); - json_decref( extra ); + else + { + res_val = json_object_get( extra, "result" ); + if ( opt_debug && ( !res_val || json_is_false( res_val ) ) ) + applog( LOG_DEBUG, + "Method extranonce.subscribe is not supported" ); + } + json_decref( extra ); } free(sret); } @@ -1675,6 +1846,25 @@ bool stratum_authorize(struct stratum_ctx *sctx, const char *user, const char *p return ret; } +bool stratum_suggest_difficulty( struct stratum_ctx *sctx, double diff ) +{ + char *s; + s = (char*) malloc( 80 ); + bool rc = true; + + // response is handled seperately, what ID? + sprintf( s, "{\"id\": 1, \"method\": \"mining.suggest_difficulty\", \"params\": [\"%f\"]}", diff ); + if ( !stratum_send_line( sctx, s ) ) + { + applog(LOG_WARNING,"stratum.suggest_difficulty send failed"); + rc = false; + } + free ( s ); + return rc; +} + + + /** * Extract bloc height L H... here len=3, height=0x1333e8 * "...0000000000ffffffff2703e83313062f503253482f043d61105408" diff --git a/verthash-help.txt b/verthash-help.txt new file mode 100644 index 00000000..f8e02db4 --- /dev/null +++ b/verthash-help.txt @@ -0,0 +1,80 @@ +Quickstart: +---------- + +First time mining verthash or don't have a Verthash data file: + +--algo verthash --verify --url ... + +Verthash data file already exists: + +--algo verthash --data-file /path/to/verthash.dat --url ... + + +Background: +---------- + +Verthash algorithm requires a data file for hashing. This file is +static, portable, and only needs to be created once. + +A Verthash data file created by VerthashMiner can also be used by cpuminer-opt +and used simultaneously by both miners. + +Due to its size >1GB it is recommened one data file be created and +stored in a permanent location accessible to any miner that wants to use it. + +New command line options: +------------------------ + +cpuminer-opt adds two new command line options for verthash. The names +and some behaviour is changed from VerthashMiner. + +--data-file /path/to/verthash.dat + default when not used is verthash.dat in current working directory. + +--verify + verify integrity of file specified by --data-file, or if not specified + the default data file if it exists, or create a default file and verify it + if one does not yet exist. Data file verification is disabled by default. 
+
+Detailed usage:
+--------------
+
+If a data file already exists it can be selected using the --data-file
+option to specify the path and name of the file.
+
+--algo verthash --data-file /path/to/verthash.dat --url ...
+
+If the --data-file option is not used the default is to use 'verthash.dat'
+from the current working directory.
+
+If no data file exists it can be created by using the --verify option
+without the --data-file option. If the default data file is not found in
+the current directory it will be created.
+
+--algo verthash --verify --url ...
+
+Data file creation can take up to 30 minutes on a spinning hard drive.
+Once created the new data file will be verified and used immediately
+if a valid url and user were included on the command line.
+
+A default data file can be created by omitting the url option. That will
+either verify an existing default data file or create one and verify it,
+then exit.
+
+--algo verthash --verify
+
+A data file will never be created if --data-file is specified. The miner
+will exit with an error if the file is not found. This is to avoid accidentally
+creating an unwanted data file due to a typo.
+
+After creation the data file can be moved to a more convenient location and
+referenced by --data-file, or left where it is and used by default without the
+--data-file option.
+
+Data file verification takes a few seconds and is disabled by default.
+VerthashMiner enables data file verification by default and has an option to
+disable it.
+
+The --verify option is intended primarily to create a new file. It's
+not necessary or useful to verify a file every time the miner is started.
+
diff --git a/winbuild-cross.sh b/winbuild-cross.sh
index 21764bfd..26d10769 100755
--- a/winbuild-cross.sh
+++ b/winbuild-cross.sh
@@ -16,21 +16,22 @@
export MINGW_LIB="/usr/x86_64-w64-mingw32/lib"
export GCC_MINGW_LIB="/usr/lib/gcc/x86_64-w64-mingw32/9.3-win32" # used by GCC
export LDFLAGS="-L$LOCAL_LIB/curl/lib/.libs -L$LOCAL_LIB/gmp/.libs -L$LOCAL_LIB/openssl"
+# Support for Windows 7 CPU groups, AES sometimes not included in -march
+export DEFAULT_CFLAGS="-maes -O3 -Wall -D_WIN32_WINNT=0x0601"
+export DEFAULT_CFLAGS_OLD="-O3 -Wall"

# make link to local gmp header file.
ln -s $LOCAL_LIB/gmp/gmp.h ./gmp.h

-# edit configure to fix pthread lib name for Windows.
-#sed -i 's/"-lpthread"/"-lpthreadGC2"/g' configure.ac
-
# make release directory and copy selected DLLs.

rm -rf release > /dev/null
-
mkdir release
+
cp README.txt release/
cp README.md release/
cp RELEASE_NOTES release/
+cp verthash-help.txt release/
cp $MINGW_LIB/zlib1.dll release/
cp $MINGW_LIB/libwinpthread-1.dll release/
cp $GCC_MINGW_LIB/libstdc++-6.dll release/
@@ -40,82 +41,67 @@ cp $LOCAL_LIB/curl/lib/.libs/libcurl-4.dll release/

# Start building...
-# Icelake AVX512 SHA VAES +# AVX512 SHA VAES: Intel Core Icelake, Rocketlake ./clean-all.sh || echo clean rm -f config.status ./autogen.sh || echo done -CFLAGS="-O3 -march=icelake-client -Wall" ./configure $CONFIGURE_ARGS -make -j $(nproc) +CFLAGS="-march=icelake-client $DEFAULT_CFLAGS" ./configure $CONFIGURE_ARGS +make -j 8 strip -s cpuminer.exe mv cpuminer.exe release/cpuminer-avx512-sha-vaes.exe -# Rocketlake AVX512 SHA AES -make clean || echo clean -rm -f config.status -CFLAGS="-O3 -march=cascadelake -msha -Wall" ./configure $CONFIGURE_ARGS -#CFLAGS="-O3 -march=rocketlake -Wall" ./configure $CONFIGURE_ARGS -make -j $(nproc) -strip -s cpuminer.exe -mv cpuminer.exe release/cpuminer-avx512-sha.exe - -# Zen1 AVX2 AES SHA +# AVX512 AES: Intel Core HEDT Slylake-X, Cascadelake make clean || echo clean rm -f config.status -CFLAGS="-O3 -march=znver1 -Wall" ./configure $CONFIGURE_ARGS -make -j $(nproc) +CFLAGS="-march=skylake-avx512 $DEFAULT_CFLAGS" ./configure $CONFIGURE_ARGS +make -j 8 strip -s cpuminer.exe -mv cpuminer.exe release/cpuminer-zen.exe +mv cpuminer.exe release/cpuminer-avx512.exe -# Zen3 AVX2 SHA VAES -make clean || echo clean +# AVX2 SHA VAES: Intel Alderlake, AMD Zen3 +make clean || echo done rm -f config.status -CFLAGS="-O3 -march=znver2 -mvaes -Wall" ./configure $CONFIGURE_ARGS -# CFLAGS="-O3 -march=znver3 -Wall" ./configure $CONFIGURE_ARGS -make -j $(nproc) +CFLAGS="-mavx2 -msha -mvaes $DEFAULT_CFLAGS" ./configure $CONFIGURE_ARGS +make -j 8 strip -s cpuminer.exe -mv cpuminer.exe release/cpuminer-zen3.exe +mv cpuminer.exe release/cpuminer-avx2-sha-vaes.exe -# Slylake-X AVX512 AES -# mingw won't compile avx512 without -fno-asynchronous-unwind-tables +# AVX2 AES SHA: AMD Zen1 make clean || echo clean rm -f config.status -CFLAGS="-O3 -march=skylake-avx512 -Wall" ./configure $CONFIGURE_ARGS -#CFLAGS="-O3 -march=skylake-avx512 -Wall -fno-asynchronous-unwind-tables" ./configure $CONFIGURE_ARGS -make -j $(nproc) +CFLAGS="-march=znver1 $DEFAULT_CFLAGS" ./configure $CONFIGURE_ARGS +make -j 8 strip -s cpuminer.exe -mv cpuminer.exe release/cpuminer-avx512.exe +mv cpuminer.exe release/cpuminer-avx2-sha.exe -# Haswell AVX2 AES +# AVX2 AES: Intel Core Haswell, Skylake, Kabylake, Coffeelake, Cometlake make clean || echo clean rm -f config.status -# GCC 9 doesn't include AES in -march=core-avx2 -CFLAGS="-O3 -march=core-avx2 -maes -Wall" ./configure $CONFIGURE_ARGS -make -j $(nproc) +CFLAGS="-march=core-avx2 $DEFAULT_CFLAGS" ./configure $CONFIGURE_ARGS +make -j 8 strip -s cpuminer.exe mv cpuminer.exe release/cpuminer-avx2.exe -# Sandybridge AVX AES +# AVX AES: Intel Sandybridge, Ivybridge make clean || echo clean rm -f config.status -# -march=corei7-avx still includes aes, but just in case -CFLAGS="-O3 -march=corei7-avx -maes -Wall" ./configure $CONFIGURE_ARGS -make -j $(nproc) +CFLAGS="-march=corei7-avx -maes $DEFAULT_CFLAGS_OLD" ./configure $CONFIGURE_ARGS +make -j 8 strip -s cpuminer.exe mv cpuminer.exe release/cpuminer-avx.exe -# Westmere SSE4.2 AES +# SSE4.2 AES: Intel Westmere make clean || echo clean rm -f config.status -CFLAGS="-O3 -march=westmere -Wall" ./configure $CONFIGURE_ARGS -#CFLAGS="-O3 -maes -msse4.2 -Wall" ./configure $CONFIGURE_ARGS -make -j $(nproc) +CFLAGS="-march=westmere -maes $DEFAULT_CFLAGS_OLD" ./configure $CONFIGURE_ARGS +make -j 8 strip -s cpuminer.exe mv cpuminer.exe release/cpuminer-aes-sse42.exe # Nehalem SSE4.2 #make clean || echo clean #rm -f config.status -#CFLAGS="-O3 -march=corei7 -Wall" ./configure $CONFIGURE_ARGS +#CFLAGS="$DEFAULT_CFLAGS_OLD 
-march=corei7" ./configure $CONFIGURE_ARGS #make #strip -s cpuminer.exe #mv cpuminer.exe release/cpuminer-sse42.exe @@ -123,7 +109,7 @@ mv cpuminer.exe release/cpuminer-aes-sse42.exe # Core2 SSSE3 #make clean || echo clean #rm -f config.status -#CFLAGS="-O3 -march=core2 -Wall" ./configure $CONFIGURE_ARGS +#CFLAGS="$DEFAULT_CFLAGS_OLD -march=core2" ./configure $CONFIGURE_ARGS #make #strip -s cpuminer.exe #mv cpuminer.exe release/cpuminer-ssse3.exe @@ -132,9 +118,16 @@ mv cpuminer.exe release/cpuminer-aes-sse42.exe # Generic SSE2 make clean || echo clean rm -f config.status -CFLAGS="-O3 -msse2 -Wall" ./configure $CONFIGURE_ARGS -make -j $(nproc) +CFLAGS="-msse2 $DEFAULT_CFLAGS_OLD" ./configure $CONFIGURE_ARGS +make -j 8 strip -s cpuminer.exe mv cpuminer.exe release/cpuminer-sse2.exe make clean || echo clean +# Native with CPU groups ennabled +make clean || echo clean +rm -f config.status +CFLAGS="-march=native $DEFAULT_CFLAGS" ./configure $CONFIGURE_ARGS +make -j 8 +strip -s cpuminer.exe +