From be88afc349ba38c5c25da733b0139a6c2207fd4b Mon Sep 17 00:00:00 2001
From: Jay D Dee
Date: Thu, 21 Sep 2023 12:34:06 -0400
Subject: [PATCH] v3.23.2

---
 Makefile.am | 7 +-
 RELEASE_NOTES | 5 +
 algo-gate-api.c | 2 +-
 algo-gate-api.h | 2 +-
 algo/argon2/argon2a/argon2a.c | 2 +-
 algo/blake/sph-blake2s.c | 2 +-
 algo/blake/sph_blake.h | 2 +-
 algo/blake/sph_blake2b.c | 2 +-
 algo/bmw/bmw-hash-4way.h | 6 +-
 algo/bmw/bmw256-hash-4way.c | 4 +-
 algo/bmw/bmw512-hash-4way.c | 24 +-
 algo/bmw/sph_bmw.h | 2 +-
 algo/cubehash/cubehash_sse2.c | 1 -
 algo/cubehash/cubehash_sse2.h | 2 +-
 algo/cubehash/sph_cubehash.h | 2 +-
 algo/echo/aes_ni/hash_api.h | 2 +-
 algo/echo/sph_echo.c | 2 +-
 algo/echo/sph_echo.h | 2 +-
 algo/fugue/fugue-aesni.h | 2 +-
 algo/fugue/sph_fugue.h | 2 +-
 algo/gost/sph_gost.h | 2 +-
 algo/groestl/aes_ni/hash-groestl.h | 4 +-
 algo/groestl/aes_ni/hash-groestl256.h | 3 +-
 algo/groestl/groestl-gate.c | 2 +-
 algo/groestl/groestl256-hash-4way.h | 4 -
 algo/groestl/myrgr-4way.c | 2 +-
 algo/groestl/sph_groestl.h | 2 +-
 algo/hamsi/hamsi-hash-4way.c | 1686 ++++++++++++++++---
 algo/hamsi/hamsi-hash-4way.h | 70 +-
 algo/hamsi/sph_hamsi.h | 2 +-
 algo/haval/haval-4way-helper.c | 4 +-
 algo/haval/haval-hash-4way.c | 10 +-
 algo/haval/haval-hash-4way.h | 3 +-
 algo/haval/sph-haval.h | 2 +-
 algo/jh/sph_jh.h | 2 +-
 algo/keccak/keccak-4way.c | 1 -
 algo/keccak/keccak-gate.c | 2 +-
 algo/keccak/keccak-hash-4way.h | 43 -
 algo/keccak/sha3d-4way.c | 1 -
 algo/keccak/sph_keccak.h | 2 +-
 algo/lanehash/lane.h | 1 -
 algo/luffa/luffa-hash-2way.c | 6 +-
 algo/luffa/luffa-hash-2way.h | 6 +-
 algo/luffa/luffa_for_sse2.h | 2 +-
 algo/luffa/sph_luffa.h | 2 +-
 algo/lyra2/lyra2.h | 3 +-
 algo/lyra2/lyra2rev2.c | 1 -
 algo/lyra2/lyra2rev3.c | 1 -
 algo/panama/sph_panama.h | 2 +-
 algo/quark/hmq1725-4way.c | 2 +-
 algo/ripemd/lbry-4way.c | 3 +-
 algo/ripemd/ripemd-hash-4way.h | 1 -
 algo/ripemd/sph_ripemd.h | 2 +-
 algo/scrypt/scrypt.c | 1 -
 algo/sha/hmac-sha256-hash-4way.h | 2 +-
 algo/sha/sha-hash-4way.h | 168 ---
 algo/sha/sha256-hash-2way-ni.c | 689 ----------
 algo/sha/sha256-hash-4way.c | 269 ++--
 algo/sha/sha256-hash-opt.c | 388 ------
 algo/sha/sha256-hash.c | 1386 +++++++++++++++++++-
 algo/sha/sha256-hash.h | 106 +-
 algo/sha/sha256d-4way.c | 327 ++---
 algo/sha/sha256dt.c | 198 +--
 algo/sha/sha256q-4way.c | 2 +-
 algo/sha/sha256t-4way.c | 121 +-
 algo/sha/sha256t-gate.c | 6 +-
 algo/sha/sha256t-gate.h | 6 +-
 algo/sha/sha256t.c | 102 --
 algo/sha/sha512-hash-4way.c | 2 +-
 algo/sha/sha512-hash.h | 46 +
 algo/sha/sha512256d-4way.c | 3 +-
 algo/sha/sph_sha2.h | 2 +-
 algo/shabal/shabal-hash-4way.c | 4 +-
 algo/shabal/shabal-hash-4way.h | 48 +-
 algo/shabal/sph_shabal.h | 2 +-
 algo/shavite/shavite-hash-2way.c | 2 -
 algo/shavite/sph_shavite.c | 2 +-
 algo/shavite/sph_shavite.h | 2 +-
 algo/simd/nist.h | 2 +-
 algo/simd/simd-compat.h | 2 +-
 algo/simd/sph_simd.h | 2 +-
 algo/skein/skein-4way.c | 1 -
 algo/skein/skein-gate.c | 1 -
 algo/skein/sph_skein.h | 2 +-
 algo/tiger/sph_tiger.h | 2 +-
 algo/whirlpool/sph_whirlpool.h | 2 +-
 algo/x11/x11-4way.c | 80 +-
 algo/x16/x16r-gate.c | 2 +-
 algo/x16/x16r-gate.h | 3 +-
 algo/x16/x21s-4way.c | 4 +-
 algo/x17/sonoa-4way.c | 2 +-
 algo/x17/x17-4way.c | 4 +-
 algo/x17/xevan-4way.c | 2 +-
 algo/x22/x22i-4way.c | 6 +-
 algo/x22/x25x-4way.c | 6 +-
 asm/aesb-x64.S | 72 --
 asm/aesb-x86.S | 21 -
 comp.log | 50 -
 {algo/sha => compat}/aes_helper.c | 19 +-
 {algo/sha => compat}/brg_types.h | 0
 {algo/sha => compat}/sha3-defs.h | 0
 {algo/sha => compat}/sha3_common.h | 0
 {algo/sha => compat}/sph_types.h | 0
 config-template.json | 22 +
 configure | 20 +-
 configure.ac | 2 +-
 configure~ | 20 +-
 cpu-miner.c | 58 +-
 cpuminer-conf.json | 20 -
 simd-utils/simd-128.h | 3 +
 simd-utils/simd-256.h | 4 +
 simd-utils/simd-512.h | 3 +
 verthash-help.txt | 4 +-
 113 files changed, 3354 insertions(+), 2925 deletions(-)
 delete mode 100644 algo/sha/sha-hash-4way.h
 delete mode 100644 algo/sha/sha256-hash-2way-ni.c
 delete mode 100644 algo/sha/sha256-hash-opt.c
 delete mode 100644 algo/sha/sha256t.c
 create mode 100644 algo/sha/sha512-hash.h
 delete mode 100644 asm/aesb-x64.S
 delete mode 100644 asm/aesb-x86.S
 delete mode 100644 comp.log
 rename {algo/sha => compat}/aes_helper.c (98%)
 rename {algo/sha => compat}/brg_types.h (100%)
 rename {algo/sha => compat}/sha3-defs.h (100%)
 rename {algo/sha => compat}/sha3_common.h (100%)
 rename {algo/sha => compat}/sph_types.h (100%)
 create mode 100644 config-template.json
 delete mode 100644 cpuminer-conf.json

diff --git a/Makefile.am b/Makefile.am
index c7a051f9..92a18eb2 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -163,8 +163,6 @@ cpuminer_SOURCES = \
   algo/sha/sph_sha2big.c \
   algo/sha/sha256-hash-4way.c \
   algo/sha/sha512-hash-4way.c \
-  algo/sha/sha256-hash-opt.c \
-  algo/sha/sha256-hash-2way-ni.c \
   algo/sha/hmac-sha256-hash.c \
   algo/sha/hmac-sha256-hash-4way.c \
   algo/sha/sha256d.c \
@@ -172,7 +170,6 @@ cpuminer_SOURCES = \
   algo/sha/sha256d-4way.c \
   algo/sha/sha256t-gate.c \
   algo/sha/sha256t-4way.c \
-  algo/sha/sha256t.c \
   algo/sha/sha256q-4way.c \
   algo/sha/sha256q.c \
   algo/sha/sha512256d-4way.c \
@@ -294,10 +291,10 @@ disable_flags =
 if USE_ASM
   cpuminer_SOURCES += asm/neoscrypt_asm.S
 if ARCH_x86
-  cpuminer_SOURCES += asm/sha2-x86.S asm/scrypt-x86.S asm/aesb-x86.S
+  cpuminer_SOURCES += asm/sha2-x86.S asm/scrypt-x86.S
 endif
 if ARCH_x86_64
-  cpuminer_SOURCES += asm/sha2-x64.S asm/scrypt-x64.S asm/aesb-x64.S
+  cpuminer_SOURCES += asm/sha2-x64.S asm/scrypt-x64.S
 endif
 if ARCH_ARM
   cpuminer_SOURCES += asm/sha2-arm.S asm/scrypt-arm.S
diff --git a/RELEASE_NOTES b/RELEASE_NOTES
index 2e49122e..e561f497 100644
--- a/RELEASE_NOTES
+++ b/RELEASE_NOTES
@@ -65,6 +65,11 @@ If not what makes it happen or not happen?
 Change Log
 ----------
 
+v3.23.2
+
+sha256dt, sha256t & sha256d +10% with SHA, small improvement with AVX2.
+Other small improvements and code cleanup.
+
 v3.23.1
 
 #349: Fix sha256t low difficulty shares and low effective hash rate.
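Note on the "+10% with SHA" entry above: "SHA" here refers to the x86 SHA
extensions (SHA-NI), not the algorithm itself. As an illustration only (this is
not code from this patch; the helper name and variable layout are hypothetical,
following the standard Intel usage pattern), four SHA-256 rounds map onto the
SHA-NI intrinsics from <immintrin.h> roughly like this:

    #include <immintrin.h>

    /* Illustrative sketch: four SHA-256 rounds via x86 SHA-NI.
       state0 holds working state words A,B,E,F; state1 holds C,D,G,H;
       msg holds message words W[t..t+3], k the round constants K[t..t+3]. */
    static inline void sha256_rounds_x4( __m128i *state0, __m128i *state1,
                                         __m128i msg, __m128i k )
    {
        __m128i m = _mm_add_epi32( msg, k );                    // W + K
        *state1 = _mm_sha256rnds2_epu32( *state1, *state0, m ); // rounds t, t+1
        m = _mm_shuffle_epi32( m, 0x0e );                       // expose upper two W+K
        *state0 = _mm_sha256rnds2_epu32( *state0, *state1, m ); // rounds t+2, t+3
    }

Compiled with -msha, each such step retires four rounds in a handful of
instructions, which is why a SHA-NI path can outrun a generic SIMD path on a
single hash.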
diff --git a/algo-gate-api.c b/algo-gate-api.c
index 7f971bd9..e86b304f 100644
--- a/algo-gate-api.c
+++ b/algo-gate-api.c
@@ -248,7 +248,7 @@ int null_hash()
    return 0;
 };
 
-void init_algo_gate( algo_gate_t* gate )
+static void init_algo_gate( algo_gate_t* gate )
 {
    gate->miner_thread_init = (void*)&return_true;
    gate->scanhash = (void*)&scanhash_generic;
diff --git a/algo-gate-api.h b/algo-gate-api.h
index 12b606a7..5cd31826 100644
--- a/algo-gate-api.h
+++ b/algo-gate-api.h
@@ -269,7 +269,7 @@ void std_get_new_work( struct work *work, struct work *g_work, int thr_id,
 void sha256d_gen_merkle_root( char *merkle_root, struct stratum_ctx *sctx );
 void sha256_gen_merkle_root ( char *merkle_root, struct stratum_ctx *sctx );
 // OpenSSL sha256 deprecated
-void SHA256_gen_merkle_root ( char *merkle_root, struct stratum_ctx *sctx );
+//void SHA256_gen_merkle_root ( char *merkle_root, struct stratum_ctx *sctx );
 
 bool std_le_work_decode( struct work *work );
 bool std_be_work_decode( struct work *work );
diff --git a/algo/argon2/argon2a/argon2a.c b/algo/argon2/argon2a/argon2a.c
index 5a7c54d1..51a34aae 100644
--- a/algo/argon2/argon2a/argon2a.c
+++ b/algo/argon2/argon2a/argon2a.c
@@ -77,7 +77,7 @@ bool register_argon2_algo( algo_gate_t* gate )
   gate->optimizations = SSE2_OPT | AVX_OPT | AVX2_OPT;
   gate->scanhash = (void*)&scanhash_argon2;
   gate->hash = (void*)&argon2hash;
-  gate->gen_merkle_root = (void*)&SHA256_gen_merkle_root;
+  gate->gen_merkle_root = (void*)&sha256_gen_merkle_root;
   opt_target_factor = 65536.0;
 
   return true;
diff --git a/algo/blake/sph-blake2s.c b/algo/blake/sph-blake2s.c
index 0ebe547b..32aad562 100644
--- a/algo/blake/sph-blake2s.c
+++ b/algo/blake/sph-blake2s.c
@@ -15,7 +15,7 @@
 #include
 #include
 
-#include "algo/sha/sph_types.h"
+#include "compat/sph_types.h"
 #include "sph-blake2s.h"
 
 static const uint32_t blake2s_IV[8] =
diff --git a/algo/blake/sph_blake.h b/algo/blake/sph_blake.h
index 37fb6516..087c23d5 100644
--- a/algo/blake/sph_blake.h
+++ b/algo/blake/sph_blake.h
@@ -42,7 +42,7 @@ extern "C"{
 #endif
 
 #include
-#include "algo/sha/sph_types.h"
+#include "compat/sph_types.h"
 
 /**
  * Output size (in bits) for BLAKE-224.
diff --git a/algo/blake/sph_blake2b.c b/algo/blake/sph_blake2b.c
index 50a97586..19c73196 100644
--- a/algo/blake/sph_blake2b.c
+++ b/algo/blake/sph_blake2b.c
@@ -31,7 +31,7 @@
 #include
 #include
 #include "simd-utils.h"
-#include "algo/sha/sph_types.h"
+#include "compat/sph_types.h"
 #include "sph_blake2b.h"
 
 // Little-endian byte access.
diff --git a/algo/bmw/bmw-hash-4way.h b/algo/bmw/bmw-hash-4way.h index afeecfe1..2befb994 100644 --- a/algo/bmw/bmw-hash-4way.h +++ b/algo/bmw/bmw-hash-4way.h @@ -41,8 +41,6 @@ extern "C"{ #endif #include - -#include "algo/sha/sph_types.h" #include "simd-utils.h" #define SPH_SIZE_bmw256 256 @@ -57,7 +55,7 @@ typedef struct { __m128i buf[64]; __m128i H[16]; size_t ptr; - sph_u32 bit_count; // assume bit_count fits in 32 bits + uint32_t bit_count; // assume bit_count fits in 32 bits } bmw_4way_small_context; typedef bmw_4way_small_context bmw256_4way_context; @@ -144,7 +142,7 @@ typedef struct { __m256i buf[16]; __m256i H[16]; size_t ptr; - sph_u64 bit_count; + uint64_t bit_count; } bmw_4way_big_context __attribute__((aligned(128))); typedef bmw_4way_big_context bmw512_4way_context; diff --git a/algo/bmw/bmw256-hash-4way.c b/algo/bmw/bmw256-hash-4way.c index 08f7621f..d15890b0 100644 --- a/algo/bmw/bmw256-hash-4way.c +++ b/algo/bmw/bmw256-hash-4way.c @@ -109,7 +109,7 @@ static const uint32_t IV256[] = { _mm_sub_epi32( _mm_add_epi32( rol_off_32( M, j, 0 ), \ rol_off_32( M, j, 3 ) ), \ rol_off_32( M, j, 10 ) ), \ - _mm_set1_epi32( ( (j)+16 ) * SPH_C32(0x05555555UL) ) ), \ + _mm_set1_epi32( ( (j)+16 ) * 0x05555555UL ) ), \ H[ ( (j)+7 ) & 0xF ] ) @@ -485,7 +485,7 @@ bmw32_4way(bmw_4way_small_context *sc, const void *data, size_t len) size_t ptr; const int buf_size = 64; // bytes of one lane, compatible with len - sc->bit_count += (sph_u32)len << 3; + sc->bit_count += (uint32_t)len << 3; buf = sc->buf; ptr = sc->ptr; h1 = sc->H; diff --git a/algo/bmw/bmw512-hash-4way.c b/algo/bmw/bmw512-hash-4way.c index 81378a0c..6773bd07 100644 --- a/algo/bmw/bmw512-hash-4way.c +++ b/algo/bmw/bmw512-hash-4way.c @@ -45,15 +45,15 @@ extern "C"{ #define LPAR ( -static const sph_u64 IV512[] = { - SPH_C64(0x8081828384858687), SPH_C64(0x88898A8B8C8D8E8F), - SPH_C64(0x9091929394959697), SPH_C64(0x98999A9B9C9D9E9F), - SPH_C64(0xA0A1A2A3A4A5A6A7), SPH_C64(0xA8A9AAABACADAEAF), - SPH_C64(0xB0B1B2B3B4B5B6B7), SPH_C64(0xB8B9BABBBCBDBEBF), - SPH_C64(0xC0C1C2C3C4C5C6C7), SPH_C64(0xC8C9CACBCCCDCECF), - SPH_C64(0xD0D1D2D3D4D5D6D7), SPH_C64(0xD8D9DADBDCDDDEDF), - SPH_C64(0xE0E1E2E3E4E5E6E7), SPH_C64(0xE8E9EAEBECEDEEEF), - SPH_C64(0xF0F1F2F3F4F5F6F7), SPH_C64(0xF8F9FAFBFCFDFEFF) +static const uint64_t IV512[] = { + 0x8081828384858687, 0x88898A8B8C8D8E8F, + 0x9091929394959697, 0x98999A9B9C9D9E9F, + 0xA0A1A2A3A4A5A6A7, 0xA8A9AAABACADAEAF, + 0xB0B1B2B3B4B5B6B7, 0xB8B9BABBBCBDBEBF, + 0xC0C1C2C3C4C5C6C7, 0xC8C9CACBCCCDCECF, + 0xD0D1D2D3D4D5D6D7, 0xD8D9DADBDCDDDEDF, + 0xE0E1E2E3E4E5E6E7, 0xE8E9EAEBECEDEEEF, + 0xF0F1F2F3F4F5F6F7, 0xF8F9FAFBFCFDFEFF }; #if defined(__SSE2__) @@ -894,7 +894,7 @@ static const __m256i final_b[16] = }; static void -bmw64_4way_init( bmw_4way_big_context *sc, const sph_u64 *iv ) +bmw64_4way_init( bmw_4way_big_context *sc, const uint64_t *iv ) { sc->H[ 0] = _mm256_set1_epi64x( 0x8081828384858687 ); sc->H[ 1] = _mm256_set1_epi64x( 0x88898A8B8C8D8E8F ); @@ -926,7 +926,7 @@ bmw64_4way( bmw_4way_big_context *sc, const void *data, size_t len ) size_t ptr; const int buf_size = 128; // bytes of one lane, compatible with len - sc->bit_count += (sph_u64)len << 3; + sc->bit_count += (uint64_t)len << 3; buf = sc->buf; ptr = sc->ptr; h1 = sc->H; @@ -1377,7 +1377,7 @@ static const __m512i final_b8[16] = void bmw512_8way_init( bmw512_8way_context *ctx ) -//bmw64_4way_init( bmw_4way_big_context *sc, const sph_u64 *iv ) +//bmw64_4way_init( bmw_4way_big_context *sc, const uint64_t *iv ) { ctx->H[ 0] = _mm512_set1_epi64( 
0x8081828384858687 ); ctx->H[ 1] = _mm512_set1_epi64( 0x88898A8B8C8D8E8F ); diff --git a/algo/bmw/sph_bmw.h b/algo/bmw/sph_bmw.h index f53dd27f..e1d06838 100644 --- a/algo/bmw/sph_bmw.h +++ b/algo/bmw/sph_bmw.h @@ -41,7 +41,7 @@ extern "C"{ #endif #include -#include "algo/sha/sph_types.h" +#include "compat/sph_types.h" /** * Output size (in bits) for BMW-224. diff --git a/algo/cubehash/cubehash_sse2.c b/algo/cubehash/cubehash_sse2.c index 20967fbf..7f620993 100644 --- a/algo/cubehash/cubehash_sse2.c +++ b/algo/cubehash/cubehash_sse2.c @@ -9,7 +9,6 @@ #include #endif #include "cubehash_sse2.h" -#include "algo/sha/sha3-defs.h" #include #include #include diff --git a/algo/cubehash/cubehash_sse2.h b/algo/cubehash/cubehash_sse2.h index 1f06ebae..5b69ac77 100644 --- a/algo/cubehash/cubehash_sse2.h +++ b/algo/cubehash/cubehash_sse2.h @@ -3,7 +3,7 @@ #include "compat.h" #include -#include "algo/sha/sha3-defs.h" +#include "compat/sha3-defs.h" #define OPTIMIZE_SSE2 diff --git a/algo/cubehash/sph_cubehash.h b/algo/cubehash/sph_cubehash.h index 4ef6794f..08e96ddc 100644 --- a/algo/cubehash/sph_cubehash.h +++ b/algo/cubehash/sph_cubehash.h @@ -42,7 +42,7 @@ extern "C"{ #endif #include -#include "algo/sha/sph_types.h" +#include "compat/sph_types.h" /** * Output size (in bits) for CubeHash-224. diff --git a/algo/echo/aes_ni/hash_api.h b/algo/echo/aes_ni/hash_api.h index a5500885..816d4579 100644 --- a/algo/echo/aes_ni/hash_api.h +++ b/algo/echo/aes_ni/hash_api.h @@ -22,7 +22,7 @@ #endif -#include "algo/sha/sha3_common.h" +#include "compat/sha3_common.h" #include diff --git a/algo/echo/sph_echo.c b/algo/echo/sph_echo.c index 99e7dacd..b7b3c065 100644 --- a/algo/echo/sph_echo.c +++ b/algo/echo/sph_echo.c @@ -73,7 +73,7 @@ extern "C"{ #endif #define AES_BIG_ENDIAN 0 -#include "algo/sha/aes_helper.c" +#include "compat/aes_helper.c" #if SPH_ECHO_64 diff --git a/algo/echo/sph_echo.h b/algo/echo/sph_echo.h index ae5a3507..8165f7b0 100644 --- a/algo/echo/sph_echo.h +++ b/algo/echo/sph_echo.h @@ -43,7 +43,7 @@ extern "C"{ #endif #include -#include "algo/sha/sph_types.h" +#include "compat/sph_types.h" /** * Output size (in bits) for ECHO-224. diff --git a/algo/fugue/fugue-aesni.h b/algo/fugue/fugue-aesni.h index 13fd8f87..389e5793 100644 --- a/algo/fugue/fugue-aesni.h +++ b/algo/fugue/fugue-aesni.h @@ -20,7 +20,7 @@ #error "Unsupported configuration, AES needs SSE4.1. Compile without AES." #endif -#include "algo/sha/sha3_common.h" +#include "compat/sha3_common.h" #include "simd-utils.h" diff --git a/algo/fugue/sph_fugue.h b/algo/fugue/sph_fugue.h index 08d4dde0..6a73d5c5 100644 --- a/algo/fugue/sph_fugue.h +++ b/algo/fugue/sph_fugue.h @@ -2,7 +2,7 @@ #define SPH_FUGUE_H__ #include -#include "algo/sha/sph_types.h" +#include "compat/sph_types.h" #ifdef __cplusplus extern "C"{ diff --git a/algo/gost/sph_gost.h b/algo/gost/sph_gost.h index 5f8f3491..3467ae9a 100644 --- a/algo/gost/sph_gost.h +++ b/algo/gost/sph_gost.h @@ -41,7 +41,7 @@ extern "C"{ #endif #include -#include "algo/sha/sph_types.h" +#include "compat/sph_types.h" /** * Output size (in bits) for GOST-256. 
diff --git a/algo/groestl/aes_ni/hash-groestl.h b/algo/groestl/aes_ni/hash-groestl.h index b76d8098..558215a7 100644 --- a/algo/groestl/aes_ni/hash-groestl.h +++ b/algo/groestl/aes_ni/hash-groestl.h @@ -20,8 +20,8 @@ #define LENGTH (512) #include "brg_endian.h" -#define NEED_UINT_64T -#include "algo/sha/brg_types.h" +//#define NEED_UINT_64T +#include "compat/brg_types.h" /* some sizes (number of bytes) */ #define ROWS (8) diff --git a/algo/groestl/aes_ni/hash-groestl256.h b/algo/groestl/aes_ni/hash-groestl256.h index 32ce1a5f..24544a50 100644 --- a/algo/groestl/aes_ni/hash-groestl256.h +++ b/algo/groestl/aes_ni/hash-groestl256.h @@ -34,8 +34,7 @@ typedef crypto_uint64 u64; //#define LENGTH (512) #include "brg_endian.h" -#define NEED_UINT_64T -#include "algo/sha/brg_types.h" +#include "compat/brg_types.h" #ifdef IACA_TRACE #include IACA_MARKS diff --git a/algo/groestl/groestl-gate.c b/algo/groestl/groestl-gate.c index 92c79bce..eb2d4988 100644 --- a/algo/groestl/groestl-gate.c +++ b/algo/groestl/groestl-gate.c @@ -17,7 +17,7 @@ bool register_dmd_gr_algo( algo_gate_t *gate ) bool register_groestl_algo( algo_gate_t* gate ) { register_dmd_gr_algo( gate ); - gate->gen_merkle_root = (void*)&SHA256_gen_merkle_root; + gate->gen_merkle_root = (void*)&sha256_gen_merkle_root; return true; }; diff --git a/algo/groestl/groestl256-hash-4way.h b/algo/groestl/groestl256-hash-4way.h index 59c62708..05ddccb9 100644 --- a/algo/groestl/groestl256-hash-4way.h +++ b/algo/groestl/groestl256-hash-4way.h @@ -22,10 +22,6 @@ #define LENGTH (256) -//#include "brg_endian.h" -//#define NEED_UINT_64T -//#include "algo/sha/brg_types.h" - /* some sizes (number of bytes) */ #define ROWS (8) #define LENGTHFIELDLEN (ROWS) diff --git a/algo/groestl/myrgr-4way.c b/algo/groestl/myrgr-4way.c index c9f558cc..0b13ad21 100644 --- a/algo/groestl/myrgr-4way.c +++ b/algo/groestl/myrgr-4way.c @@ -4,7 +4,7 @@ #include #include #include "aes_ni/hash-groestl.h" -#include "algo/sha/sha-hash-4way.h" +#include "algo/sha/sha256-hash.h" #if defined(__VAES__) #include "groestl512-hash-4way.h" #endif diff --git a/algo/groestl/sph_groestl.h b/algo/groestl/sph_groestl.h index 02465e3c..899d716e 100644 --- a/algo/groestl/sph_groestl.h +++ b/algo/groestl/sph_groestl.h @@ -40,7 +40,7 @@ extern "C"{ #endif #include -#include "algo/sha/sph_types.h" +#include "compat/sph_types.h" #if !defined(__AES__) /** diff --git a/algo/hamsi/hamsi-hash-4way.c b/algo/hamsi/hamsi-hash-4way.c index 89f8646c..3e61cc6d 100644 --- a/algo/hamsi/hamsi-hash-4way.c +++ b/algo/hamsi/hamsi-hash-4way.c @@ -34,498 +34,303 @@ #include #include "hamsi-hash-4way.h" -#if defined(__AVX2__) - -#ifdef __cplusplus -extern "C"{ -#endif - -/* - * The SPH_HAMSI_EXPAND_* define how many input bits we handle in one - * table lookup during message expansion (1 to 8, inclusive). If we note - * w the number of bits per message word (w=32 for Hamsi-224/256, w=64 - * for Hamsi-384/512), r the size of a "row" in 32-bit words (r=8 for - * Hamsi-224/256, r=16 for Hamsi-384/512), and n the expansion level, - * then we will get t tables (where t=ceil(w/n)) of individual size - * 2^n*r*4 (in bytes). The last table may be shorter (e.g. with w=32 and - * n=5, there are 7 tables, but the last one uses only two bits on - * input, not five). - * - * Also, we read t rows of r words from RAM. 
Words in a given row are - * concatenated in RAM in that order, so most of the cost is about - * reading the first row word; comparatively, cache misses are thus - * less expensive with Hamsi-512 (r=16) than with Hamsi-256 (r=8). - * - * When n=1, tables are "special" in that we omit the first entry of - * each table (which always contains 0), so that total table size is - * halved. - * - * We thus have the following (size1 is the cumulative table size of - * Hamsi-224/256; size2 is for Hamsi-384/512; similarly, t1 and t2 - * are for Hamsi-224/256 and Hamsi-384/512, respectively). - * - * n size1 size2 t1 t2 - * --------------------------------------- - * 1 1024 4096 32 64 - * 2 2048 8192 16 32 - * 3 2688 10880 11 22 - * 4 4096 16384 8 16 - * 5 6272 25600 7 13 - * 6 10368 41984 6 11 - * 7 16896 73856 5 10 - * 8 32768 131072 4 8 - * - * So there is a trade-off: a lower n makes the tables fit better in - * L1 cache, but increases the number of memory accesses. The optimal - * value depends on the amount of available L1 cache and the relative - * impact of a cache miss. - * - * Experimentally, in ideal benchmark conditions (which are not necessarily - * realistic with regards to L1 cache contention), it seems that n=8 is - * the best value on "big" architectures (those with 32 kB or more of L1 - * cache), while n=4 is better on "small" architectures. This was tested - * on an Intel Core2 Q6600 (both 32-bit and 64-bit mode), a PowerPC G3 - * (32 kB L1 cache, hence "big"), and a MIPS-compatible Broadcom BCM3302 - * (8 kB L1 cache). - * - * Note: with n=1, the 32 tables (actually implemented as one big table) - * are read entirely and sequentially, regardless of the input data, - * thus avoiding any data-dependent table access pattern. - */ +#include -#ifdef _MSC_VER -#pragma warning (disable: 4146) -#endif +#if defined(__AVX2__) //#include "hamsi-helper-4way.c" -/* -static const sph_u32 IV512[] = { - SPH_C32(0x73746565), SPH_C32(0x6c706172), SPH_C32(0x6b204172), - SPH_C32(0x656e6265), SPH_C32(0x72672031), SPH_C32(0x302c2062), - SPH_C32(0x75732032), SPH_C32(0x3434362c), SPH_C32(0x20422d33), - SPH_C32(0x30303120), SPH_C32(0x4c657576), SPH_C32(0x656e2d48), - SPH_C32(0x65766572), SPH_C32(0x6c65652c), SPH_C32(0x2042656c), - SPH_C32(0x6769756d) -}; -*/ -static const sph_u32 alpha_n[] = { - SPH_C32(0xff00f0f0), SPH_C32(0xccccaaaa), SPH_C32(0xf0f0cccc), - SPH_C32(0xff00aaaa), SPH_C32(0xccccaaaa), SPH_C32(0xf0f0ff00), - SPH_C32(0xaaaacccc), SPH_C32(0xf0f0ff00), SPH_C32(0xf0f0cccc), - SPH_C32(0xaaaaff00), SPH_C32(0xccccff00), SPH_C32(0xaaaaf0f0), - SPH_C32(0xaaaaf0f0), SPH_C32(0xff00cccc), SPH_C32(0xccccf0f0), - SPH_C32(0xff00aaaa), SPH_C32(0xccccaaaa), SPH_C32(0xff00f0f0), - SPH_C32(0xff00aaaa), SPH_C32(0xf0f0cccc), SPH_C32(0xf0f0ff00), - SPH_C32(0xccccaaaa), SPH_C32(0xf0f0ff00), SPH_C32(0xaaaacccc), - SPH_C32(0xaaaaff00), SPH_C32(0xf0f0cccc), SPH_C32(0xaaaaf0f0), - SPH_C32(0xccccff00), SPH_C32(0xff00cccc), SPH_C32(0xaaaaf0f0), - SPH_C32(0xff00aaaa), SPH_C32(0xccccf0f0) + +static const uint32_t HAMSI_IV512[] = +{ + 0x73746565, 0x6c706172, 0x6b204172, 0x656e6265, + 0x72672031, 0x302c2062, 0x75732032, 0x3434362c, + 0x20422d33, 0x30303120, 0x4c657576, 0x656e2d48, + 0x65766572, 0x6c65652c, 0x2042656c, 0x6769756d }; -static const sph_u32 alpha_f[] = { - SPH_C32(0xcaf9639c), SPH_C32(0x0ff0f9c0), SPH_C32(0x639c0ff0), - SPH_C32(0xcaf9f9c0), SPH_C32(0x0ff0f9c0), SPH_C32(0x639ccaf9), - SPH_C32(0xf9c00ff0), SPH_C32(0x639ccaf9), SPH_C32(0x639c0ff0), - SPH_C32(0xf9c0caf9), SPH_C32(0x0ff0caf9), SPH_C32(0xf9c0639c), - 
SPH_C32(0xf9c0639c), SPH_C32(0xcaf90ff0), SPH_C32(0x0ff0639c), - SPH_C32(0xcaf9f9c0), SPH_C32(0x0ff0f9c0), SPH_C32(0xcaf9639c), - SPH_C32(0xcaf9f9c0), SPH_C32(0x639c0ff0), SPH_C32(0x639ccaf9), - SPH_C32(0x0ff0f9c0), SPH_C32(0x639ccaf9), SPH_C32(0xf9c00ff0), - SPH_C32(0xf9c0caf9), SPH_C32(0x639c0ff0), SPH_C32(0xf9c0639c), - SPH_C32(0x0ff0caf9), SPH_C32(0xcaf90ff0), SPH_C32(0xf9c0639c), - SPH_C32(0xcaf9f9c0), SPH_C32(0x0ff0639c) +static const uint32_t alpha_n[] = { + 0xff00f0f0, 0xccccaaaa, 0xf0f0cccc, 0xff00aaaa, + 0xccccaaaa, 0xf0f0ff00, 0xaaaacccc, 0xf0f0ff00, + 0xf0f0cccc, 0xaaaaff00, 0xccccff00, 0xaaaaf0f0, + 0xaaaaf0f0, 0xff00cccc, 0xccccf0f0, 0xff00aaaa, + 0xccccaaaa, 0xff00f0f0, 0xff00aaaa, 0xf0f0cccc, + 0xf0f0ff00, 0xccccaaaa, 0xf0f0ff00, 0xaaaacccc, + 0xaaaaff00, 0xf0f0cccc, 0xaaaaf0f0, 0xccccff00, + 0xff00cccc, 0xaaaaf0f0, 0xff00aaaa, 0xccccf0f0 }; +static const uint32_t alpha_f[] = { + 0xcaf9639c, 0x0ff0f9c0, 0x639c0ff0, 0xcaf9f9c0, + 0x0ff0f9c0, 0x639ccaf9, 0xf9c00ff0, 0x639ccaf9, + 0x639c0ff0, 0xf9c0caf9, 0x0ff0caf9, 0xf9c0639c, + 0xf9c0639c, 0xcaf90ff0, 0x0ff0639c, 0xcaf9f9c0, + 0x0ff0f9c0, 0xcaf9639c, 0xcaf9f9c0, 0x639c0ff0, + 0x639ccaf9, 0x0ff0f9c0, 0x639ccaf9, 0xf9c00ff0, + 0xf9c0caf9, 0x639c0ff0, 0xf9c0639c, 0x0ff0caf9, + 0xcaf90ff0, 0xf9c0639c, 0xcaf9f9c0, 0x0ff0639c +}; // imported from hamsi helper /* Note: this table lists bits within each byte from least siginificant to most significant. */ -static const sph_u32 T512[64][16] = { - { SPH_C32(0xef0b0270), SPH_C32(0x3afd0000), SPH_C32(0x5dae0000), - SPH_C32(0x69490000), SPH_C32(0x9b0f3c06), SPH_C32(0x4405b5f9), - SPH_C32(0x66140a51), SPH_C32(0x924f5d0a), SPH_C32(0xc96b0030), - SPH_C32(0xe7250000), SPH_C32(0x2f840000), SPH_C32(0x264f0000), - SPH_C32(0x08695bf9), SPH_C32(0x6dfcf137), SPH_C32(0x509f6984), - SPH_C32(0x9e69af68) }, - { SPH_C32(0xc96b0030), SPH_C32(0xe7250000), SPH_C32(0x2f840000), - SPH_C32(0x264f0000), SPH_C32(0x08695bf9), SPH_C32(0x6dfcf137), - SPH_C32(0x509f6984), SPH_C32(0x9e69af68), SPH_C32(0x26600240), - SPH_C32(0xddd80000), SPH_C32(0x722a0000), SPH_C32(0x4f060000), - SPH_C32(0x936667ff), SPH_C32(0x29f944ce), SPH_C32(0x368b63d5), - SPH_C32(0x0c26f262) }, - { SPH_C32(0x145a3c00), SPH_C32(0xb9e90000), SPH_C32(0x61270000), - SPH_C32(0xf1610000), SPH_C32(0xce613d6c), SPH_C32(0xb0493d78), - SPH_C32(0x47a96720), SPH_C32(0xe18e24c5), SPH_C32(0x23671400), - SPH_C32(0xc8b90000), SPH_C32(0xf4c70000), SPH_C32(0xfb750000), - SPH_C32(0x73cd2465), SPH_C32(0xf8a6a549), SPH_C32(0x02c40a3f), - SPH_C32(0xdc24e61f) }, - { SPH_C32(0x23671400), SPH_C32(0xc8b90000), SPH_C32(0xf4c70000), - SPH_C32(0xfb750000), SPH_C32(0x73cd2465), SPH_C32(0xf8a6a549), - SPH_C32(0x02c40a3f), SPH_C32(0xdc24e61f), SPH_C32(0x373d2800), - SPH_C32(0x71500000), SPH_C32(0x95e00000), SPH_C32(0x0a140000), - SPH_C32(0xbdac1909), SPH_C32(0x48ef9831), SPH_C32(0x456d6d1f), - SPH_C32(0x3daac2da) }, - { SPH_C32(0x54285c00), SPH_C32(0xeaed0000), SPH_C32(0xc5d60000), - SPH_C32(0xa1c50000), SPH_C32(0xb3a26770), SPH_C32(0x94a5c4e1), - SPH_C32(0x6bb0419d), SPH_C32(0x551b3782), SPH_C32(0x9cbb1800), - SPH_C32(0xb0d30000), SPH_C32(0x92510000), SPH_C32(0xed930000), - SPH_C32(0x593a4345), SPH_C32(0xe114d5f4), SPH_C32(0x430633da), - SPH_C32(0x78cace29) }, - { SPH_C32(0x9cbb1800), SPH_C32(0xb0d30000), SPH_C32(0x92510000), - SPH_C32(0xed930000), SPH_C32(0x593a4345), SPH_C32(0xe114d5f4), - SPH_C32(0x430633da), SPH_C32(0x78cace29), SPH_C32(0xc8934400), - SPH_C32(0x5a3e0000), SPH_C32(0x57870000), SPH_C32(0x4c560000), - SPH_C32(0xea982435), SPH_C32(0x75b11115), 
SPH_C32(0x28b67247), - SPH_C32(0x2dd1f9ab) }, - { SPH_C32(0x29449c00), SPH_C32(0x64e70000), SPH_C32(0xf24b0000), - SPH_C32(0xc2f30000), SPH_C32(0x0ede4e8f), SPH_C32(0x56c23745), - SPH_C32(0xf3e04259), SPH_C32(0x8d0d9ec4), SPH_C32(0x466d0c00), - SPH_C32(0x08620000), SPH_C32(0xdd5d0000), SPH_C32(0xbadd0000), - SPH_C32(0x6a927942), SPH_C32(0x441f2b93), SPH_C32(0x218ace6f), - SPH_C32(0xbf2c0be2) }, - { SPH_C32(0x466d0c00), SPH_C32(0x08620000), SPH_C32(0xdd5d0000), - SPH_C32(0xbadd0000), SPH_C32(0x6a927942), SPH_C32(0x441f2b93), - SPH_C32(0x218ace6f), SPH_C32(0xbf2c0be2), SPH_C32(0x6f299000), - SPH_C32(0x6c850000), SPH_C32(0x2f160000), SPH_C32(0x782e0000), - SPH_C32(0x644c37cd), SPH_C32(0x12dd1cd6), SPH_C32(0xd26a8c36), - SPH_C32(0x32219526) }, - { SPH_C32(0xf6800005), SPH_C32(0x3443c000), SPH_C32(0x24070000), - SPH_C32(0x8f3d0000), SPH_C32(0x21373bfb), SPH_C32(0x0ab8d5ae), - SPH_C32(0xcdc58b19), SPH_C32(0xd795ba31), SPH_C32(0xa67f0001), - SPH_C32(0x71378000), SPH_C32(0x19fc0000), SPH_C32(0x96db0000), - SPH_C32(0x3a8b6dfd), SPH_C32(0xebcaaef3), SPH_C32(0x2c6d478f), - SPH_C32(0xac8e6c88) }, - { SPH_C32(0xa67f0001), SPH_C32(0x71378000), SPH_C32(0x19fc0000), - SPH_C32(0x96db0000), SPH_C32(0x3a8b6dfd), SPH_C32(0xebcaaef3), - SPH_C32(0x2c6d478f), SPH_C32(0xac8e6c88), SPH_C32(0x50ff0004), - SPH_C32(0x45744000), SPH_C32(0x3dfb0000), SPH_C32(0x19e60000), - SPH_C32(0x1bbc5606), SPH_C32(0xe1727b5d), SPH_C32(0xe1a8cc96), - SPH_C32(0x7b1bd6b9) }, - { SPH_C32(0xf7750009), SPH_C32(0xcf3cc000), SPH_C32(0xc3d60000), - SPH_C32(0x04920000), SPH_C32(0x029519a9), SPH_C32(0xf8e836ba), - SPH_C32(0x7a87f14e), SPH_C32(0x9e16981a), SPH_C32(0xd46a0000), - SPH_C32(0x8dc8c000), SPH_C32(0xa5af0000), SPH_C32(0x4a290000), - SPH_C32(0xfc4e427a), SPH_C32(0xc9b4866c), SPH_C32(0x98369604), - SPH_C32(0xf746c320) }, - { SPH_C32(0xd46a0000), SPH_C32(0x8dc8c000), SPH_C32(0xa5af0000), - SPH_C32(0x4a290000), SPH_C32(0xfc4e427a), SPH_C32(0xc9b4866c), - SPH_C32(0x98369604), SPH_C32(0xf746c320), SPH_C32(0x231f0009), - SPH_C32(0x42f40000), SPH_C32(0x66790000), SPH_C32(0x4ebb0000), - SPH_C32(0xfedb5bd3), SPH_C32(0x315cb0d6), SPH_C32(0xe2b1674a), - SPH_C32(0x69505b3a) }, - { SPH_C32(0x774400f0), SPH_C32(0xf15a0000), SPH_C32(0xf5b20000), - SPH_C32(0x34140000), SPH_C32(0x89377e8c), SPH_C32(0x5a8bec25), - SPH_C32(0x0bc3cd1e), SPH_C32(0xcf3775cb), SPH_C32(0xf46c0050), - SPH_C32(0x96180000), SPH_C32(0x14a50000), SPH_C32(0x031f0000), - SPH_C32(0x42947eb8), SPH_C32(0x66bf7e19), SPH_C32(0x9ca470d2), - SPH_C32(0x8a341574) }, - { SPH_C32(0xf46c0050), SPH_C32(0x96180000), SPH_C32(0x14a50000), - SPH_C32(0x031f0000), SPH_C32(0x42947eb8), SPH_C32(0x66bf7e19), - SPH_C32(0x9ca470d2), SPH_C32(0x8a341574), SPH_C32(0x832800a0), - SPH_C32(0x67420000), SPH_C32(0xe1170000), SPH_C32(0x370b0000), - SPH_C32(0xcba30034), SPH_C32(0x3c34923c), SPH_C32(0x9767bdcc), - SPH_C32(0x450360bf) }, - { SPH_C32(0xe8870170), SPH_C32(0x9d720000), SPH_C32(0x12db0000), - SPH_C32(0xd4220000), SPH_C32(0xf2886b27), SPH_C32(0xa921e543), - SPH_C32(0x4ef8b518), SPH_C32(0x618813b1), SPH_C32(0xb4370060), - SPH_C32(0x0c4c0000), SPH_C32(0x56c20000), SPH_C32(0x5cae0000), - SPH_C32(0x94541f3f), SPH_C32(0x3b3ef825), SPH_C32(0x1b365f3d), - SPH_C32(0xf3d45758) }, - { SPH_C32(0xb4370060), SPH_C32(0x0c4c0000), SPH_C32(0x56c20000), - SPH_C32(0x5cae0000), SPH_C32(0x94541f3f), SPH_C32(0x3b3ef825), - SPH_C32(0x1b365f3d), SPH_C32(0xf3d45758), SPH_C32(0x5cb00110), - SPH_C32(0x913e0000), SPH_C32(0x44190000), SPH_C32(0x888c0000), - SPH_C32(0x66dc7418), SPH_C32(0x921f1d66), SPH_C32(0x55ceea25), - 
SPH_C32(0x925c44e9) }, - { SPH_C32(0x0c720000), SPH_C32(0x49e50f00), SPH_C32(0x42790000), - SPH_C32(0x5cea0000), SPH_C32(0x33aa301a), SPH_C32(0x15822514), - SPH_C32(0x95a34b7b), SPH_C32(0xb44b0090), SPH_C32(0xfe220000), - SPH_C32(0xa7580500), SPH_C32(0x25d10000), SPH_C32(0xf7600000), - SPH_C32(0x893178da), SPH_C32(0x1fd4f860), SPH_C32(0x4ed0a315), - SPH_C32(0xa123ff9f) }, - { SPH_C32(0xfe220000), SPH_C32(0xa7580500), SPH_C32(0x25d10000), - SPH_C32(0xf7600000), SPH_C32(0x893178da), SPH_C32(0x1fd4f860), - SPH_C32(0x4ed0a315), SPH_C32(0xa123ff9f), SPH_C32(0xf2500000), - SPH_C32(0xeebd0a00), SPH_C32(0x67a80000), SPH_C32(0xab8a0000), - SPH_C32(0xba9b48c0), SPH_C32(0x0a56dd74), SPH_C32(0xdb73e86e), - SPH_C32(0x1568ff0f) }, - { SPH_C32(0x45180000), SPH_C32(0xa5b51700), SPH_C32(0xf96a0000), - SPH_C32(0x3b480000), SPH_C32(0x1ecc142c), SPH_C32(0x231395d6), - SPH_C32(0x16bca6b0), SPH_C32(0xdf33f4df), SPH_C32(0xb83d0000), - SPH_C32(0x16710600), SPH_C32(0x379a0000), SPH_C32(0xf5b10000), - SPH_C32(0x228161ac), SPH_C32(0xae48f145), SPH_C32(0x66241616), - SPH_C32(0xc5c1eb3e) }, - { SPH_C32(0xb83d0000), SPH_C32(0x16710600), SPH_C32(0x379a0000), - SPH_C32(0xf5b10000), SPH_C32(0x228161ac), SPH_C32(0xae48f145), - SPH_C32(0x66241616), SPH_C32(0xc5c1eb3e), SPH_C32(0xfd250000), - SPH_C32(0xb3c41100), SPH_C32(0xcef00000), SPH_C32(0xcef90000), - SPH_C32(0x3c4d7580), SPH_C32(0x8d5b6493), SPH_C32(0x7098b0a6), - SPH_C32(0x1af21fe1) }, - { SPH_C32(0x75a40000), SPH_C32(0xc28b2700), SPH_C32(0x94a40000), - SPH_C32(0x90f50000), SPH_C32(0xfb7857e0), SPH_C32(0x49ce0bae), - SPH_C32(0x1767c483), SPH_C32(0xaedf667e), SPH_C32(0xd1660000), - SPH_C32(0x1bbc0300), SPH_C32(0x9eec0000), SPH_C32(0xf6940000), - SPH_C32(0x03024527), SPH_C32(0xcf70fcf2), SPH_C32(0xb4431b17), - SPH_C32(0x857f3c2b) }, - { SPH_C32(0xd1660000), SPH_C32(0x1bbc0300), SPH_C32(0x9eec0000), - SPH_C32(0xf6940000), SPH_C32(0x03024527), SPH_C32(0xcf70fcf2), - SPH_C32(0xb4431b17), SPH_C32(0x857f3c2b), SPH_C32(0xa4c20000), - SPH_C32(0xd9372400), SPH_C32(0x0a480000), SPH_C32(0x66610000), - SPH_C32(0xf87a12c7), SPH_C32(0x86bef75c), SPH_C32(0xa324df94), - SPH_C32(0x2ba05a55) }, - { SPH_C32(0x75c90003), SPH_C32(0x0e10c000), SPH_C32(0xd1200000), - SPH_C32(0xbaea0000), SPH_C32(0x8bc42f3e), SPH_C32(0x8758b757), - SPH_C32(0xbb28761d), SPH_C32(0x00b72e2b), SPH_C32(0xeecf0001), - SPH_C32(0x6f564000), SPH_C32(0xf33e0000), SPH_C32(0xa79e0000), - SPH_C32(0xbdb57219), SPH_C32(0xb711ebc5), SPH_C32(0x4a3b40ba), - SPH_C32(0xfeabf254) }, - { SPH_C32(0xeecf0001), SPH_C32(0x6f564000), SPH_C32(0xf33e0000), - SPH_C32(0xa79e0000), SPH_C32(0xbdb57219), SPH_C32(0xb711ebc5), - SPH_C32(0x4a3b40ba), SPH_C32(0xfeabf254), SPH_C32(0x9b060002), - SPH_C32(0x61468000), SPH_C32(0x221e0000), SPH_C32(0x1d740000), - SPH_C32(0x36715d27), SPH_C32(0x30495c92), SPH_C32(0xf11336a7), - SPH_C32(0xfe1cdc7f) }, - { SPH_C32(0x86790000), SPH_C32(0x3f390002), SPH_C32(0xe19ae000), - SPH_C32(0x98560000), SPH_C32(0x9565670e), SPH_C32(0x4e88c8ea), - SPH_C32(0xd3dd4944), SPH_C32(0x161ddab9), SPH_C32(0x30b70000), - SPH_C32(0xe5d00000), SPH_C32(0xf4f46000), SPH_C32(0x42c40000), - SPH_C32(0x63b83d6a), SPH_C32(0x78ba9460), SPH_C32(0x21afa1ea), - SPH_C32(0xb0a51834) }, - { SPH_C32(0x30b70000), SPH_C32(0xe5d00000), SPH_C32(0xf4f46000), - SPH_C32(0x42c40000), SPH_C32(0x63b83d6a), SPH_C32(0x78ba9460), - SPH_C32(0x21afa1ea), SPH_C32(0xb0a51834), SPH_C32(0xb6ce0000), - SPH_C32(0xdae90002), SPH_C32(0x156e8000), SPH_C32(0xda920000), - SPH_C32(0xf6dd5a64), SPH_C32(0x36325c8a), SPH_C32(0xf272e8ae), - SPH_C32(0xa6b8c28d) }, - { 
SPH_C32(0x14190000), SPH_C32(0x23ca003c), SPH_C32(0x50df0000), - SPH_C32(0x44b60000), SPH_C32(0x1b6c67b0), SPH_C32(0x3cf3ac75), - SPH_C32(0x61e610b0), SPH_C32(0xdbcadb80), SPH_C32(0xe3430000), - SPH_C32(0x3a4e0014), SPH_C32(0xf2c60000), SPH_C32(0xaa4e0000), - SPH_C32(0xdb1e42a6), SPH_C32(0x256bbe15), SPH_C32(0x123db156), - SPH_C32(0x3a4e99d7) }, - { SPH_C32(0xe3430000), SPH_C32(0x3a4e0014), SPH_C32(0xf2c60000), - SPH_C32(0xaa4e0000), SPH_C32(0xdb1e42a6), SPH_C32(0x256bbe15), - SPH_C32(0x123db156), SPH_C32(0x3a4e99d7), SPH_C32(0xf75a0000), - SPH_C32(0x19840028), SPH_C32(0xa2190000), SPH_C32(0xeef80000), - SPH_C32(0xc0722516), SPH_C32(0x19981260), SPH_C32(0x73dba1e6), - SPH_C32(0xe1844257) }, - { SPH_C32(0x54500000), SPH_C32(0x0671005c), SPH_C32(0x25ae0000), - SPH_C32(0x6a1e0000), SPH_C32(0x2ea54edf), SPH_C32(0x664e8512), - SPH_C32(0xbfba18c3), SPH_C32(0x7e715d17), SPH_C32(0xbc8d0000), - SPH_C32(0xfc3b0018), SPH_C32(0x19830000), SPH_C32(0xd10b0000), - SPH_C32(0xae1878c4), SPH_C32(0x42a69856), SPH_C32(0x0012da37), - SPH_C32(0x2c3b504e) }, - { SPH_C32(0xbc8d0000), SPH_C32(0xfc3b0018), SPH_C32(0x19830000), - SPH_C32(0xd10b0000), SPH_C32(0xae1878c4), SPH_C32(0x42a69856), - SPH_C32(0x0012da37), SPH_C32(0x2c3b504e), SPH_C32(0xe8dd0000), - SPH_C32(0xfa4a0044), SPH_C32(0x3c2d0000), SPH_C32(0xbb150000), - SPH_C32(0x80bd361b), SPH_C32(0x24e81d44), SPH_C32(0xbfa8c2f4), - SPH_C32(0x524a0d59) }, - { SPH_C32(0x69510000), SPH_C32(0xd4e1009c), SPH_C32(0xc3230000), - SPH_C32(0xac2f0000), SPH_C32(0xe4950bae), SPH_C32(0xcea415dc), - SPH_C32(0x87ec287c), SPH_C32(0xbce1a3ce), SPH_C32(0xc6730000), - SPH_C32(0xaf8d000c), SPH_C32(0xa4c10000), SPH_C32(0x218d0000), - SPH_C32(0x23111587), SPH_C32(0x7913512f), SPH_C32(0x1d28ac88), - SPH_C32(0x378dd173) }, - { SPH_C32(0xc6730000), SPH_C32(0xaf8d000c), SPH_C32(0xa4c10000), - SPH_C32(0x218d0000), SPH_C32(0x23111587), SPH_C32(0x7913512f), - SPH_C32(0x1d28ac88), SPH_C32(0x378dd173), SPH_C32(0xaf220000), - SPH_C32(0x7b6c0090), SPH_C32(0x67e20000), SPH_C32(0x8da20000), - SPH_C32(0xc7841e29), SPH_C32(0xb7b744f3), SPH_C32(0x9ac484f4), - SPH_C32(0x8b6c72bd) }, - { SPH_C32(0xcc140000), SPH_C32(0xa5630000), SPH_C32(0x5ab90780), - SPH_C32(0x3b500000), SPH_C32(0x4bd013ff), SPH_C32(0x879b3418), - SPH_C32(0x694348c1), SPH_C32(0xca5a87fe), SPH_C32(0x819e0000), - SPH_C32(0xec570000), SPH_C32(0x66320280), SPH_C32(0x95f30000), - SPH_C32(0x5da92802), SPH_C32(0x48f43cbc), SPH_C32(0xe65aa22d), - SPH_C32(0x8e67b7fa) }, - { SPH_C32(0x819e0000), SPH_C32(0xec570000), SPH_C32(0x66320280), - SPH_C32(0x95f30000), SPH_C32(0x5da92802), SPH_C32(0x48f43cbc), - SPH_C32(0xe65aa22d), SPH_C32(0x8e67b7fa), SPH_C32(0x4d8a0000), - SPH_C32(0x49340000), SPH_C32(0x3c8b0500), SPH_C32(0xaea30000), - SPH_C32(0x16793bfd), SPH_C32(0xcf6f08a4), SPH_C32(0x8f19eaec), - SPH_C32(0x443d3004) }, - { SPH_C32(0x78230000), SPH_C32(0x12fc0000), SPH_C32(0xa93a0b80), - SPH_C32(0x90a50000), SPH_C32(0x713e2879), SPH_C32(0x7ee98924), - SPH_C32(0xf08ca062), SPH_C32(0x636f8bab), SPH_C32(0x02af0000), - SPH_C32(0xb7280000), SPH_C32(0xba1c0300), SPH_C32(0x56980000), - SPH_C32(0xba8d45d3), SPH_C32(0x8048c667), SPH_C32(0xa95c149a), - SPH_C32(0xf4f6ea7b) }, - { SPH_C32(0x02af0000), SPH_C32(0xb7280000), SPH_C32(0xba1c0300), - SPH_C32(0x56980000), SPH_C32(0xba8d45d3), SPH_C32(0x8048c667), - SPH_C32(0xa95c149a), SPH_C32(0xf4f6ea7b), SPH_C32(0x7a8c0000), - SPH_C32(0xa5d40000), SPH_C32(0x13260880), SPH_C32(0xc63d0000), - SPH_C32(0xcbb36daa), SPH_C32(0xfea14f43), SPH_C32(0x59d0b4f8), - SPH_C32(0x979961d0) }, - { SPH_C32(0xac480000), 
SPH_C32(0x1ba60000), SPH_C32(0x45fb1380), - SPH_C32(0x03430000), SPH_C32(0x5a85316a), SPH_C32(0x1fb250b6), - SPH_C32(0xfe72c7fe), SPH_C32(0x91e478f6), SPH_C32(0x1e4e0000), - SPH_C32(0xdecf0000), SPH_C32(0x6df80180), SPH_C32(0x77240000), - SPH_C32(0xec47079e), SPH_C32(0xf4a0694e), SPH_C32(0xcda31812), - SPH_C32(0x98aa496e) }, - { SPH_C32(0x1e4e0000), SPH_C32(0xdecf0000), SPH_C32(0x6df80180), - SPH_C32(0x77240000), SPH_C32(0xec47079e), SPH_C32(0xf4a0694e), - SPH_C32(0xcda31812), SPH_C32(0x98aa496e), SPH_C32(0xb2060000), - SPH_C32(0xc5690000), SPH_C32(0x28031200), SPH_C32(0x74670000), - SPH_C32(0xb6c236f4), SPH_C32(0xeb1239f8), SPH_C32(0x33d1dfec), - SPH_C32(0x094e3198) }, - { SPH_C32(0xaec30000), SPH_C32(0x9c4f0001), SPH_C32(0x79d1e000), - SPH_C32(0x2c150000), SPH_C32(0x45cc75b3), SPH_C32(0x6650b736), - SPH_C32(0xab92f78f), SPH_C32(0xa312567b), SPH_C32(0xdb250000), - SPH_C32(0x09290000), SPH_C32(0x49aac000), SPH_C32(0x81e10000), - SPH_C32(0xcafe6b59), SPH_C32(0x42793431), SPH_C32(0x43566b76), - SPH_C32(0xe86cba2e) }, - { SPH_C32(0xdb250000), SPH_C32(0x09290000), SPH_C32(0x49aac000), - SPH_C32(0x81e10000), SPH_C32(0xcafe6b59), SPH_C32(0x42793431), - SPH_C32(0x43566b76), SPH_C32(0xe86cba2e), SPH_C32(0x75e60000), - SPH_C32(0x95660001), SPH_C32(0x307b2000), SPH_C32(0xadf40000), - SPH_C32(0x8f321eea), SPH_C32(0x24298307), SPH_C32(0xe8c49cf9), - SPH_C32(0x4b7eec55) }, - { SPH_C32(0x58430000), SPH_C32(0x807e0000), SPH_C32(0x78330001), - SPH_C32(0xc66b3800), SPH_C32(0xe7375cdc), SPH_C32(0x79ad3fdd), - SPH_C32(0xac73fe6f), SPH_C32(0x3a4479b1), SPH_C32(0x1d5a0000), - SPH_C32(0x2b720000), SPH_C32(0x488d0000), SPH_C32(0xaf611800), - SPH_C32(0x25cb2ec5), SPH_C32(0xc879bfd0), SPH_C32(0x81a20429), - SPH_C32(0x1e7536a6) }, - { SPH_C32(0x1d5a0000), SPH_C32(0x2b720000), SPH_C32(0x488d0000), - SPH_C32(0xaf611800), SPH_C32(0x25cb2ec5), SPH_C32(0xc879bfd0), - SPH_C32(0x81a20429), SPH_C32(0x1e7536a6), SPH_C32(0x45190000), - SPH_C32(0xab0c0000), SPH_C32(0x30be0001), SPH_C32(0x690a2000), - SPH_C32(0xc2fc7219), SPH_C32(0xb1d4800d), SPH_C32(0x2dd1fa46), - SPH_C32(0x24314f17) }, - { SPH_C32(0xa53b0000), SPH_C32(0x14260000), SPH_C32(0x4e30001e), - SPH_C32(0x7cae0000), SPH_C32(0x8f9e0dd5), SPH_C32(0x78dfaa3d), - SPH_C32(0xf73168d8), SPH_C32(0x0b1b4946), SPH_C32(0x07ed0000), - SPH_C32(0xb2500000), SPH_C32(0x8774000a), SPH_C32(0x970d0000), - SPH_C32(0x437223ae), SPH_C32(0x48c76ea4), SPH_C32(0xf4786222), - SPH_C32(0x9075b1ce) }, - { SPH_C32(0x07ed0000), SPH_C32(0xb2500000), SPH_C32(0x8774000a), - SPH_C32(0x970d0000), SPH_C32(0x437223ae), SPH_C32(0x48c76ea4), - SPH_C32(0xf4786222), SPH_C32(0x9075b1ce), SPH_C32(0xa2d60000), - SPH_C32(0xa6760000), SPH_C32(0xc9440014), SPH_C32(0xeba30000), - SPH_C32(0xccec2e7b), SPH_C32(0x3018c499), SPH_C32(0x03490afa), - SPH_C32(0x9b6ef888) }, - { SPH_C32(0x88980000), SPH_C32(0x1f940000), SPH_C32(0x7fcf002e), - SPH_C32(0xfb4e0000), SPH_C32(0xf158079a), SPH_C32(0x61ae9167), - SPH_C32(0xa895706c), SPH_C32(0xe6107494), SPH_C32(0x0bc20000), - SPH_C32(0xdb630000), SPH_C32(0x7e88000c), SPH_C32(0x15860000), - SPH_C32(0x91fd48f3), SPH_C32(0x7581bb43), SPH_C32(0xf460449e), - SPH_C32(0xd8b61463) }, - { SPH_C32(0x0bc20000), SPH_C32(0xdb630000), SPH_C32(0x7e88000c), - SPH_C32(0x15860000), SPH_C32(0x91fd48f3), SPH_C32(0x7581bb43), - SPH_C32(0xf460449e), SPH_C32(0xd8b61463), SPH_C32(0x835a0000), - SPH_C32(0xc4f70000), SPH_C32(0x01470022), SPH_C32(0xeec80000), - SPH_C32(0x60a54f69), SPH_C32(0x142f2a24), SPH_C32(0x5cf534f2), - SPH_C32(0x3ea660f7) }, - { SPH_C32(0x52500000), SPH_C32(0x29540000), 
SPH_C32(0x6a61004e), - SPH_C32(0xf0ff0000), SPH_C32(0x9a317eec), SPH_C32(0x452341ce), - SPH_C32(0xcf568fe5), SPH_C32(0x5303130f), SPH_C32(0x538d0000), - SPH_C32(0xa9fc0000), SPH_C32(0x9ef70006), SPH_C32(0x56ff0000), - SPH_C32(0x0ae4004e), SPH_C32(0x92c5cdf9), SPH_C32(0xa9444018), - SPH_C32(0x7f975691) }, - { SPH_C32(0x538d0000), SPH_C32(0xa9fc0000), SPH_C32(0x9ef70006), - SPH_C32(0x56ff0000), SPH_C32(0x0ae4004e), SPH_C32(0x92c5cdf9), - SPH_C32(0xa9444018), SPH_C32(0x7f975691), SPH_C32(0x01dd0000), - SPH_C32(0x80a80000), SPH_C32(0xf4960048), SPH_C32(0xa6000000), - SPH_C32(0x90d57ea2), SPH_C32(0xd7e68c37), SPH_C32(0x6612cffd), - SPH_C32(0x2c94459e) }, - { SPH_C32(0xe6280000), SPH_C32(0x4c4b0000), SPH_C32(0xa8550000), - SPH_C32(0xd3d002e0), SPH_C32(0xd86130b8), SPH_C32(0x98a7b0da), - SPH_C32(0x289506b4), SPH_C32(0xd75a4897), SPH_C32(0xf0c50000), - SPH_C32(0x59230000), SPH_C32(0x45820000), SPH_C32(0xe18d00c0), - SPH_C32(0x3b6d0631), SPH_C32(0xc2ed5699), SPH_C32(0xcbe0fe1c), - SPH_C32(0x56a7b19f) }, - { SPH_C32(0xf0c50000), SPH_C32(0x59230000), SPH_C32(0x45820000), - SPH_C32(0xe18d00c0), SPH_C32(0x3b6d0631), SPH_C32(0xc2ed5699), - SPH_C32(0xcbe0fe1c), SPH_C32(0x56a7b19f), SPH_C32(0x16ed0000), - SPH_C32(0x15680000), SPH_C32(0xedd70000), SPH_C32(0x325d0220), - SPH_C32(0xe30c3689), SPH_C32(0x5a4ae643), SPH_C32(0xe375f8a8), - SPH_C32(0x81fdf908) }, - { SPH_C32(0xb4310000), SPH_C32(0x77330000), SPH_C32(0xb15d0000), - SPH_C32(0x7fd004e0), SPH_C32(0x78a26138), SPH_C32(0xd116c35d), - SPH_C32(0xd256d489), SPH_C32(0x4e6f74de), SPH_C32(0xe3060000), - SPH_C32(0xbdc10000), SPH_C32(0x87130000), SPH_C32(0xbff20060), - SPH_C32(0x2eba0a1a), SPH_C32(0x8db53751), SPH_C32(0x73c5ab06), - SPH_C32(0x5bd61539) }, - { SPH_C32(0xe3060000), SPH_C32(0xbdc10000), SPH_C32(0x87130000), - SPH_C32(0xbff20060), SPH_C32(0x2eba0a1a), SPH_C32(0x8db53751), - SPH_C32(0x73c5ab06), SPH_C32(0x5bd61539), SPH_C32(0x57370000), - SPH_C32(0xcaf20000), SPH_C32(0x364e0000), SPH_C32(0xc0220480), - SPH_C32(0x56186b22), SPH_C32(0x5ca3f40c), SPH_C32(0xa1937f8f), - SPH_C32(0x15b961e7) }, - { SPH_C32(0x02f20000), SPH_C32(0xa2810000), SPH_C32(0x873f0000), - SPH_C32(0xe36c7800), SPH_C32(0x1e1d74ef), SPH_C32(0x073d2bd6), - SPH_C32(0xc4c23237), SPH_C32(0x7f32259e), SPH_C32(0xbadd0000), - SPH_C32(0x13ad0000), SPH_C32(0xb7e70000), SPH_C32(0xf7282800), - SPH_C32(0xdf45144d), SPH_C32(0x361ac33a), SPH_C32(0xea5a8d14), - SPH_C32(0x2a2c18f0) }, - { SPH_C32(0xbadd0000), SPH_C32(0x13ad0000), SPH_C32(0xb7e70000), - SPH_C32(0xf7282800), SPH_C32(0xdf45144d), SPH_C32(0x361ac33a), - SPH_C32(0xea5a8d14), SPH_C32(0x2a2c18f0), SPH_C32(0xb82f0000), - SPH_C32(0xb12c0000), SPH_C32(0x30d80000), SPH_C32(0x14445000), - SPH_C32(0xc15860a2), SPH_C32(0x3127e8ec), SPH_C32(0x2e98bf23), - SPH_C32(0x551e3d6e) }, - { SPH_C32(0x1e6c0000), SPH_C32(0xc4420000), SPH_C32(0x8a2e0000), - SPH_C32(0xbcb6b800), SPH_C32(0x2c4413b6), SPH_C32(0x8bfdd3da), - SPH_C32(0x6a0c1bc8), SPH_C32(0xb99dc2eb), SPH_C32(0x92560000), - SPH_C32(0x1eda0000), SPH_C32(0xea510000), SPH_C32(0xe8b13000), - SPH_C32(0xa93556a5), SPH_C32(0xebfb6199), SPH_C32(0xb15c2254), - SPH_C32(0x33c5244f) }, - { SPH_C32(0x92560000), SPH_C32(0x1eda0000), SPH_C32(0xea510000), - SPH_C32(0xe8b13000), SPH_C32(0xa93556a5), SPH_C32(0xebfb6199), - SPH_C32(0xb15c2254), SPH_C32(0x33c5244f), SPH_C32(0x8c3a0000), - SPH_C32(0xda980000), SPH_C32(0x607f0000), SPH_C32(0x54078800), - SPH_C32(0x85714513), SPH_C32(0x6006b243), SPH_C32(0xdb50399c), - SPH_C32(0x8a58e6a4) }, - { SPH_C32(0x033d0000), SPH_C32(0x08b30000), SPH_C32(0xf33a0000), - 
SPH_C32(0x3ac20007), SPH_C32(0x51298a50), SPH_C32(0x6b6e661f), - SPH_C32(0x0ea5cfe3), SPH_C32(0xe6da7ffe), SPH_C32(0xa8da0000), - SPH_C32(0x96be0000), SPH_C32(0x5c1d0000), SPH_C32(0x07da0002), - SPH_C32(0x7d669583), SPH_C32(0x1f98708a), SPH_C32(0xbb668808), - SPH_C32(0xda878000) }, - { SPH_C32(0xa8da0000), SPH_C32(0x96be0000), SPH_C32(0x5c1d0000), - SPH_C32(0x07da0002), SPH_C32(0x7d669583), SPH_C32(0x1f98708a), - SPH_C32(0xbb668808), SPH_C32(0xda878000), SPH_C32(0xabe70000), - SPH_C32(0x9e0d0000), SPH_C32(0xaf270000), SPH_C32(0x3d180005), - SPH_C32(0x2c4f1fd3), SPH_C32(0x74f61695), SPH_C32(0xb5c347eb), - SPH_C32(0x3c5dfffe) }, - { SPH_C32(0x01930000), SPH_C32(0xe7820000), SPH_C32(0xedfb0000), - SPH_C32(0xcf0c000b), SPH_C32(0x8dd08d58), SPH_C32(0xbca3b42e), - SPH_C32(0x063661e1), SPH_C32(0x536f9e7b), SPH_C32(0x92280000), - SPH_C32(0xdc850000), SPH_C32(0x57fa0000), SPH_C32(0x56dc0003), - SPH_C32(0xbae92316), SPH_C32(0x5aefa30c), SPH_C32(0x90cef752), - SPH_C32(0x7b1675d7) }, - { SPH_C32(0x92280000), SPH_C32(0xdc850000), SPH_C32(0x57fa0000), - SPH_C32(0x56dc0003), SPH_C32(0xbae92316), SPH_C32(0x5aefa30c), - SPH_C32(0x90cef752), SPH_C32(0x7b1675d7), SPH_C32(0x93bb0000), - SPH_C32(0x3b070000), SPH_C32(0xba010000), SPH_C32(0x99d00008), - SPH_C32(0x3739ae4e), SPH_C32(0xe64c1722), SPH_C32(0x96f896b3), - SPH_C32(0x2879ebac) }, - { SPH_C32(0x5fa80000), SPH_C32(0x56030000), SPH_C32(0x43ae0000), - SPH_C32(0x64f30013), SPH_C32(0x257e86bf), SPH_C32(0x1311944e), - SPH_C32(0x541e95bf), SPH_C32(0x8ea4db69), SPH_C32(0x00440000), - SPH_C32(0x7f480000), SPH_C32(0xda7c0000), SPH_C32(0x2a230001), - SPH_C32(0x3badc9cc), SPH_C32(0xa9b69c87), SPH_C32(0x030a9e60), - SPH_C32(0xbe0a679e) }, - { SPH_C32(0x00440000), SPH_C32(0x7f480000), SPH_C32(0xda7c0000), - SPH_C32(0x2a230001), SPH_C32(0x3badc9cc), SPH_C32(0xa9b69c87), - SPH_C32(0x030a9e60), SPH_C32(0xbe0a679e), SPH_C32(0x5fec0000), - SPH_C32(0x294b0000), SPH_C32(0x99d20000), SPH_C32(0x4ed00012), - SPH_C32(0x1ed34f73), SPH_C32(0xbaa708c9), SPH_C32(0x57140bdf), - SPH_C32(0x30aebcf7) }, - { SPH_C32(0xee930000), SPH_C32(0xd6070000), SPH_C32(0x92c10000), - SPH_C32(0x2b9801e0), SPH_C32(0x9451287c), SPH_C32(0x3b6cfb57), - SPH_C32(0x45312374), SPH_C32(0x201f6a64), SPH_C32(0x7b280000), - SPH_C32(0x57420000), SPH_C32(0xa9e50000), SPH_C32(0x634300a0), - SPH_C32(0x9edb442f), SPH_C32(0x6d9995bb), SPH_C32(0x27f83b03), - SPH_C32(0xc7ff60f0) }, - { SPH_C32(0x7b280000), SPH_C32(0x57420000), SPH_C32(0xa9e50000), - SPH_C32(0x634300a0), SPH_C32(0x9edb442f), SPH_C32(0x6d9995bb), - SPH_C32(0x27f83b03), SPH_C32(0xc7ff60f0), SPH_C32(0x95bb0000), - SPH_C32(0x81450000), SPH_C32(0x3b240000), SPH_C32(0x48db0140), - SPH_C32(0x0a8a6c53), SPH_C32(0x56f56eec), SPH_C32(0x62c91877), - SPH_C32(0xe7e00a94) } +static const uint32_t T512[64][16] = { + { 0xef0b0270, 0x3afd0000, 0x5dae0000, 0x69490000, + 0x9b0f3c06, 0x4405b5f9, 0x66140a51, 0x924f5d0a, + 0xc96b0030, 0xe7250000, 0x2f840000, 0x264f0000, + 0x08695bf9, 0x6dfcf137, 0x509f6984, 0x9e69af68 }, + { 0xc96b0030, 0xe7250000, 0x2f840000, 0x264f0000, + 0x08695bf9, 0x6dfcf137, 0x509f6984, 0x9e69af68, + 0x26600240, 0xddd80000, 0x722a0000, 0x4f060000, + 0x936667ff, 0x29f944ce, 0x368b63d5, 0x0c26f262 }, + { 0x145a3c00, 0xb9e90000, 0x61270000, 0xf1610000, + 0xce613d6c, 0xb0493d78, 0x47a96720, 0xe18e24c5, + 0x23671400, 0xc8b90000, 0xf4c70000, 0xfb750000, + 0x73cd2465, 0xf8a6a549, 0x02c40a3f, 0xdc24e61f }, + { 0x23671400, 0xc8b90000, 0xf4c70000, 0xfb750000, + 0x73cd2465, 0xf8a6a549, 0x02c40a3f, 0xdc24e61f, + 0x373d2800, 0x71500000, 0x95e00000, 0x0a140000, + 
0xbdac1909, 0x48ef9831, 0x456d6d1f, 0x3daac2da }, + { 0x54285c00, 0xeaed0000, 0xc5d60000, 0xa1c50000, + 0xb3a26770, 0x94a5c4e1, 0x6bb0419d, 0x551b3782, + 0x9cbb1800, 0xb0d30000, 0x92510000, 0xed930000, + 0x593a4345, 0xe114d5f4, 0x430633da, 0x78cace29 }, + { 0x9cbb1800, 0xb0d30000, 0x92510000, 0xed930000, + 0x593a4345, 0xe114d5f4, 0x430633da, 0x78cace29, + 0xc8934400, 0x5a3e0000, 0x57870000, 0x4c560000, + 0xea982435, 0x75b11115, 0x28b67247, 0x2dd1f9ab }, + { 0x29449c00, 0x64e70000, 0xf24b0000, 0xc2f30000, + 0x0ede4e8f, 0x56c23745, 0xf3e04259, 0x8d0d9ec4, + 0x466d0c00, 0x08620000, 0xdd5d0000, 0xbadd0000, + 0x6a927942, 0x441f2b93, 0x218ace6f, 0xbf2c0be2 }, + { 0x466d0c00, 0x08620000, 0xdd5d0000, 0xbadd0000, + 0x6a927942, 0x441f2b93, 0x218ace6f, 0xbf2c0be2, + 0x6f299000, 0x6c850000, 0x2f160000, 0x782e0000, + 0x644c37cd, 0x12dd1cd6, 0xd26a8c36, 0x32219526 }, + { 0xf6800005, 0x3443c000, 0x24070000, 0x8f3d0000, + 0x21373bfb, 0x0ab8d5ae, 0xcdc58b19, 0xd795ba31, + 0xa67f0001, 0x71378000, 0x19fc0000, 0x96db0000, + 0x3a8b6dfd, 0xebcaaef3, 0x2c6d478f, 0xac8e6c88 }, + { 0xa67f0001, 0x71378000, 0x19fc0000, 0x96db0000, + 0x3a8b6dfd, 0xebcaaef3, 0x2c6d478f, 0xac8e6c88, + 0x50ff0004, 0x45744000, 0x3dfb0000, 0x19e60000, + 0x1bbc5606, 0xe1727b5d, 0xe1a8cc96, 0x7b1bd6b9 }, + { 0xf7750009, 0xcf3cc000, 0xc3d60000, 0x04920000, + 0x029519a9, 0xf8e836ba, 0x7a87f14e, 0x9e16981a, + 0xd46a0000, 0x8dc8c000, 0xa5af0000, 0x4a290000, + 0xfc4e427a, 0xc9b4866c, 0x98369604, 0xf746c320 }, + { 0xd46a0000, 0x8dc8c000, 0xa5af0000, 0x4a290000, + 0xfc4e427a, 0xc9b4866c, 0x98369604, 0xf746c320, + 0x231f0009, 0x42f40000, 0x66790000, 0x4ebb0000, + 0xfedb5bd3, 0x315cb0d6, 0xe2b1674a, 0x69505b3a }, + { 0x774400f0, 0xf15a0000, 0xf5b20000, 0x34140000, + 0x89377e8c, 0x5a8bec25, 0x0bc3cd1e, 0xcf3775cb, + 0xf46c0050, 0x96180000, 0x14a50000, 0x031f0000, + 0x42947eb8, 0x66bf7e19, 0x9ca470d2, 0x8a341574 }, + { 0xf46c0050, 0x96180000, 0x14a50000, 0x031f0000, + 0x42947eb8, 0x66bf7e19, 0x9ca470d2, 0x8a341574, + 0x832800a0, 0x67420000, 0xe1170000, 0x370b0000, + 0xcba30034, 0x3c34923c, 0x9767bdcc, 0x450360bf }, + { 0xe8870170, 0x9d720000, 0x12db0000, 0xd4220000, + 0xf2886b27, 0xa921e543, 0x4ef8b518, 0x618813b1, + 0xb4370060, 0x0c4c0000, 0x56c20000, 0x5cae0000, + 0x94541f3f, 0x3b3ef825, 0x1b365f3d, 0xf3d45758 }, + { 0xb4370060, 0x0c4c0000, 0x56c20000, 0x5cae0000, + 0x94541f3f, 0x3b3ef825, 0x1b365f3d, 0xf3d45758, + 0x5cb00110, 0x913e0000, 0x44190000, 0x888c0000, + 0x66dc7418, 0x921f1d66, 0x55ceea25, 0x925c44e9 }, + { 0x0c720000, 0x49e50f00, 0x42790000, 0x5cea0000, + 0x33aa301a, 0x15822514, 0x95a34b7b, 0xb44b0090, + 0xfe220000, 0xa7580500, 0x25d10000, 0xf7600000, + 0x893178da, 0x1fd4f860, 0x4ed0a315, 0xa123ff9f }, + { 0xfe220000, 0xa7580500, 0x25d10000, 0xf7600000, + 0x893178da, 0x1fd4f860, 0x4ed0a315, 0xa123ff9f, + 0xf2500000, 0xeebd0a00, 0x67a80000, 0xab8a0000, + 0xba9b48c0, 0x0a56dd74, 0xdb73e86e, 0x1568ff0f }, + { 0x45180000, 0xa5b51700, 0xf96a0000, 0x3b480000, + 0x1ecc142c, 0x231395d6, 0x16bca6b0, 0xdf33f4df, + 0xb83d0000, 0x16710600, 0x379a0000, 0xf5b10000, + 0x228161ac, 0xae48f145, 0x66241616, 0xc5c1eb3e }, + { 0xb83d0000, 0x16710600, 0x379a0000, 0xf5b10000, + 0x228161ac, 0xae48f145, 0x66241616, 0xc5c1eb3e, + 0xfd250000, 0xb3c41100, 0xcef00000, 0xcef90000, + 0x3c4d7580, 0x8d5b6493, 0x7098b0a6, 0x1af21fe1 }, + { 0x75a40000, 0xc28b2700, 0x94a40000, 0x90f50000, + 0xfb7857e0, 0x49ce0bae, 0x1767c483, 0xaedf667e, + 0xd1660000, 0x1bbc0300, 0x9eec0000, 0xf6940000, + 0x03024527, 0xcf70fcf2, 0xb4431b17, 0x857f3c2b }, + { 0xd1660000, 0x1bbc0300, 
0x9eec0000, 0xf6940000, + 0x03024527, 0xcf70fcf2, 0xb4431b17, 0x857f3c2b, + 0xa4c20000, 0xd9372400, 0x0a480000, 0x66610000, + 0xf87a12c7, 0x86bef75c, 0xa324df94, 0x2ba05a55 }, + { 0x75c90003, 0x0e10c000, 0xd1200000, 0xbaea0000, + 0x8bc42f3e, 0x8758b757, 0xbb28761d, 0x00b72e2b, + 0xeecf0001, 0x6f564000, 0xf33e0000, 0xa79e0000, + 0xbdb57219, 0xb711ebc5, 0x4a3b40ba, 0xfeabf254 }, + { 0xeecf0001, 0x6f564000, 0xf33e0000, 0xa79e0000, + 0xbdb57219, 0xb711ebc5, 0x4a3b40ba, 0xfeabf254, + 0x9b060002, 0x61468000, 0x221e0000, 0x1d740000, + 0x36715d27, 0x30495c92, 0xf11336a7, 0xfe1cdc7f }, + { 0x86790000, 0x3f390002, 0xe19ae000, 0x98560000, + 0x9565670e, 0x4e88c8ea, 0xd3dd4944, 0x161ddab9, + 0x30b70000, 0xe5d00000, 0xf4f46000, 0x42c40000, + 0x63b83d6a, 0x78ba9460, 0x21afa1ea, 0xb0a51834 }, + { 0x30b70000, 0xe5d00000, 0xf4f46000, 0x42c40000, + 0x63b83d6a, 0x78ba9460, 0x21afa1ea, 0xb0a51834, + 0xb6ce0000, 0xdae90002, 0x156e8000, 0xda920000, + 0xf6dd5a64, 0x36325c8a, 0xf272e8ae, 0xa6b8c28d }, + { 0x14190000, 0x23ca003c, 0x50df0000, 0x44b60000, + 0x1b6c67b0, 0x3cf3ac75, 0x61e610b0, 0xdbcadb80, + 0xe3430000, 0x3a4e0014, 0xf2c60000, 0xaa4e0000, + 0xdb1e42a6, 0x256bbe15, 0x123db156, 0x3a4e99d7 }, + { 0xe3430000, 0x3a4e0014, 0xf2c60000, 0xaa4e0000, + 0xdb1e42a6, 0x256bbe15, 0x123db156, 0x3a4e99d7, + 0xf75a0000, 0x19840028, 0xa2190000, 0xeef80000, + 0xc0722516, 0x19981260, 0x73dba1e6, 0xe1844257 }, + { 0x54500000, 0x0671005c, 0x25ae0000, 0x6a1e0000, + 0x2ea54edf, 0x664e8512, 0xbfba18c3, 0x7e715d17, + 0xbc8d0000, 0xfc3b0018, 0x19830000, 0xd10b0000, + 0xae1878c4, 0x42a69856, 0x0012da37, 0x2c3b504e }, + { 0xbc8d0000, 0xfc3b0018, 0x19830000, 0xd10b0000, + 0xae1878c4, 0x42a69856, 0x0012da37, 0x2c3b504e, + 0xe8dd0000, 0xfa4a0044, 0x3c2d0000, 0xbb150000, + 0x80bd361b, 0x24e81d44, 0xbfa8c2f4, 0x524a0d59 }, + { 0x69510000, 0xd4e1009c, 0xc3230000, 0xac2f0000, + 0xe4950bae, 0xcea415dc, 0x87ec287c, 0xbce1a3ce, + 0xc6730000, 0xaf8d000c, 0xa4c10000, 0x218d0000, + 0x23111587, 0x7913512f, 0x1d28ac88, 0x378dd173 }, + { 0xc6730000, 0xaf8d000c, 0xa4c10000, 0x218d0000, + 0x23111587, 0x7913512f, 0x1d28ac88, 0x378dd173, + 0xaf220000, 0x7b6c0090, 0x67e20000, 0x8da20000, + 0xc7841e29, 0xb7b744f3, 0x9ac484f4, 0x8b6c72bd }, + { 0xcc140000, 0xa5630000, 0x5ab90780, 0x3b500000, + 0x4bd013ff, 0x879b3418, 0x694348c1, 0xca5a87fe, + 0x819e0000, 0xec570000, 0x66320280, 0x95f30000, + 0x5da92802, 0x48f43cbc, 0xe65aa22d, 0x8e67b7fa }, + { 0x819e0000, 0xec570000, 0x66320280, 0x95f30000, + 0x5da92802, 0x48f43cbc, 0xe65aa22d, 0x8e67b7fa, + 0x4d8a0000, 0x49340000, 0x3c8b0500, 0xaea30000, + 0x16793bfd, 0xcf6f08a4, 0x8f19eaec, 0x443d3004 }, + { 0x78230000, 0x12fc0000, 0xa93a0b80, 0x90a50000, + 0x713e2879, 0x7ee98924, 0xf08ca062, 0x636f8bab, + 0x02af0000, 0xb7280000, 0xba1c0300, 0x56980000, + 0xba8d45d3, 0x8048c667, 0xa95c149a, 0xf4f6ea7b }, + { 0x02af0000, 0xb7280000, 0xba1c0300, 0x56980000, + 0xba8d45d3, 0x8048c667, 0xa95c149a, 0xf4f6ea7b, + 0x7a8c0000, 0xa5d40000, 0x13260880, 0xc63d0000, + 0xcbb36daa, 0xfea14f43, 0x59d0b4f8, 0x979961d0 }, + { 0xac480000, 0x1ba60000, 0x45fb1380, 0x03430000, + 0x5a85316a, 0x1fb250b6, 0xfe72c7fe, 0x91e478f6, + 0x1e4e0000, 0xdecf0000, 0x6df80180, 0x77240000, + 0xec47079e, 0xf4a0694e, 0xcda31812, 0x98aa496e }, + { 0x1e4e0000, 0xdecf0000, 0x6df80180, 0x77240000, + 0xec47079e, 0xf4a0694e, 0xcda31812, 0x98aa496e, + 0xb2060000, 0xc5690000, 0x28031200, 0x74670000, + 0xb6c236f4, 0xeb1239f8, 0x33d1dfec, 0x094e3198 }, + { 0xaec30000, 0x9c4f0001, 0x79d1e000, 0x2c150000, + 0x45cc75b3, 0x6650b736, 0xab92f78f, 0xa312567b, + 
0xdb250000, 0x09290000, 0x49aac000, 0x81e10000, + 0xcafe6b59, 0x42793431, 0x43566b76, 0xe86cba2e }, + { 0xdb250000, 0x09290000, 0x49aac000, 0x81e10000, + 0xcafe6b59, 0x42793431, 0x43566b76, 0xe86cba2e, + 0x75e60000, 0x95660001, 0x307b2000, 0xadf40000, + 0x8f321eea, 0x24298307, 0xe8c49cf9, 0x4b7eec55 }, + { 0x58430000, 0x807e0000, 0x78330001, 0xc66b3800, + 0xe7375cdc, 0x79ad3fdd, 0xac73fe6f, 0x3a4479b1, + 0x1d5a0000, 0x2b720000, 0x488d0000, 0xaf611800, + 0x25cb2ec5, 0xc879bfd0, 0x81a20429, 0x1e7536a6 }, + { 0x1d5a0000, 0x2b720000, 0x488d0000, 0xaf611800, + 0x25cb2ec5, 0xc879bfd0, 0x81a20429, 0x1e7536a6, + 0x45190000, 0xab0c0000, 0x30be0001, 0x690a2000, + 0xc2fc7219, 0xb1d4800d, 0x2dd1fa46, 0x24314f17 }, + { 0xa53b0000, 0x14260000, 0x4e30001e, 0x7cae0000, + 0x8f9e0dd5, 0x78dfaa3d, 0xf73168d8, 0x0b1b4946, + 0x07ed0000, 0xb2500000, 0x8774000a, 0x970d0000, + 0x437223ae, 0x48c76ea4, 0xf4786222, 0x9075b1ce }, + { 0x07ed0000, 0xb2500000, 0x8774000a, 0x970d0000, + 0x437223ae, 0x48c76ea4, 0xf4786222, 0x9075b1ce, + 0xa2d60000, 0xa6760000, 0xc9440014, 0xeba30000, + 0xccec2e7b, 0x3018c499, 0x03490afa, 0x9b6ef888 }, + { 0x88980000, 0x1f940000, 0x7fcf002e, 0xfb4e0000, + 0xf158079a, 0x61ae9167, 0xa895706c, 0xe6107494, + 0x0bc20000, 0xdb630000, 0x7e88000c, 0x15860000, + 0x91fd48f3, 0x7581bb43, 0xf460449e, 0xd8b61463 }, + { 0x0bc20000, 0xdb630000, 0x7e88000c, 0x15860000, + 0x91fd48f3, 0x7581bb43, 0xf460449e, 0xd8b61463, + 0x835a0000, 0xc4f70000, 0x01470022, 0xeec80000, + 0x60a54f69, 0x142f2a24, 0x5cf534f2, 0x3ea660f7 }, + { 0x52500000, 0x29540000, 0x6a61004e, 0xf0ff0000, + 0x9a317eec, 0x452341ce, 0xcf568fe5, 0x5303130f, + 0x538d0000, 0xa9fc0000, 0x9ef70006, 0x56ff0000, + 0x0ae4004e, 0x92c5cdf9, 0xa9444018, 0x7f975691 }, + { 0x538d0000, 0xa9fc0000, 0x9ef70006, 0x56ff0000, + 0x0ae4004e, 0x92c5cdf9, 0xa9444018, 0x7f975691, + 0x01dd0000, 0x80a80000, 0xf4960048, 0xa6000000, + 0x90d57ea2, 0xd7e68c37, 0x6612cffd, 0x2c94459e }, + { 0xe6280000, 0x4c4b0000, 0xa8550000, 0xd3d002e0, + 0xd86130b8, 0x98a7b0da, 0x289506b4, 0xd75a4897, + 0xf0c50000, 0x59230000, 0x45820000, 0xe18d00c0, + 0x3b6d0631, 0xc2ed5699, 0xcbe0fe1c, 0x56a7b19f }, + { 0xf0c50000, 0x59230000, 0x45820000, 0xe18d00c0, + 0x3b6d0631, 0xc2ed5699, 0xcbe0fe1c, 0x56a7b19f, + 0x16ed0000, 0x15680000, 0xedd70000, 0x325d0220, + 0xe30c3689, 0x5a4ae643, 0xe375f8a8, 0x81fdf908 }, + { 0xb4310000, 0x77330000, 0xb15d0000, 0x7fd004e0, + 0x78a26138, 0xd116c35d, 0xd256d489, 0x4e6f74de, + 0xe3060000, 0xbdc10000, 0x87130000, 0xbff20060, + 0x2eba0a1a, 0x8db53751, 0x73c5ab06, 0x5bd61539 }, + { 0xe3060000, 0xbdc10000, 0x87130000, 0xbff20060, + 0x2eba0a1a, 0x8db53751, 0x73c5ab06, 0x5bd61539, + 0x57370000, 0xcaf20000, 0x364e0000, 0xc0220480, + 0x56186b22, 0x5ca3f40c, 0xa1937f8f, 0x15b961e7 }, + { 0x02f20000, 0xa2810000, 0x873f0000, 0xe36c7800, + 0x1e1d74ef, 0x073d2bd6, 0xc4c23237, 0x7f32259e, + 0xbadd0000, 0x13ad0000, 0xb7e70000, 0xf7282800, + 0xdf45144d, 0x361ac33a, 0xea5a8d14, 0x2a2c18f0 }, + { 0xbadd0000, 0x13ad0000, 0xb7e70000, 0xf7282800, + 0xdf45144d, 0x361ac33a, 0xea5a8d14, 0x2a2c18f0, + 0xb82f0000, 0xb12c0000, 0x30d80000, 0x14445000, + 0xc15860a2, 0x3127e8ec, 0x2e98bf23, 0x551e3d6e }, + { 0x1e6c0000, 0xc4420000, 0x8a2e0000, 0xbcb6b800, + 0x2c4413b6, 0x8bfdd3da, 0x6a0c1bc8, 0xb99dc2eb, + 0x92560000, 0x1eda0000, 0xea510000, 0xe8b13000, + 0xa93556a5, 0xebfb6199, 0xb15c2254, 0x33c5244f }, + { 0x92560000, 0x1eda0000, 0xea510000, 0xe8b13000, + 0xa93556a5, 0xebfb6199, 0xb15c2254, 0x33c5244f, + 0x8c3a0000, 0xda980000, 0x607f0000, 0x54078800, + 0x85714513, 0x6006b243, 0xdb50399c, 
0x8a58e6a4 }, + { 0x033d0000, 0x08b30000, 0xf33a0000, 0x3ac20007, + 0x51298a50, 0x6b6e661f, 0x0ea5cfe3, 0xe6da7ffe, + 0xa8da0000, 0x96be0000, 0x5c1d0000, 0x07da0002, + 0x7d669583, 0x1f98708a, 0xbb668808, 0xda878000 }, + { 0xa8da0000, 0x96be0000, 0x5c1d0000, 0x07da0002, + 0x7d669583, 0x1f98708a, 0xbb668808, 0xda878000, + 0xabe70000, 0x9e0d0000, 0xaf270000, 0x3d180005, + 0x2c4f1fd3, 0x74f61695, 0xb5c347eb, 0x3c5dfffe }, + { 0x01930000, 0xe7820000, 0xedfb0000, 0xcf0c000b, + 0x8dd08d58, 0xbca3b42e, 0x063661e1, 0x536f9e7b, + 0x92280000, 0xdc850000, 0x57fa0000, 0x56dc0003, + 0xbae92316, 0x5aefa30c, 0x90cef752, 0x7b1675d7 }, + { 0x92280000, 0xdc850000, 0x57fa0000, 0x56dc0003, + 0xbae92316, 0x5aefa30c, 0x90cef752, 0x7b1675d7, + 0x93bb0000, 0x3b070000, 0xba010000, 0x99d00008, + 0x3739ae4e, 0xe64c1722, 0x96f896b3, 0x2879ebac }, + { 0x5fa80000, 0x56030000, 0x43ae0000, 0x64f30013, + 0x257e86bf, 0x1311944e, 0x541e95bf, 0x8ea4db69, + 0x00440000, 0x7f480000, 0xda7c0000, 0x2a230001, + 0x3badc9cc, 0xa9b69c87, 0x030a9e60, 0xbe0a679e }, + { 0x00440000, 0x7f480000, 0xda7c0000, 0x2a230001, + 0x3badc9cc, 0xa9b69c87, 0x030a9e60, 0xbe0a679e, + 0x5fec0000, 0x294b0000, 0x99d20000, 0x4ed00012, + 0x1ed34f73, 0xbaa708c9, 0x57140bdf, 0x30aebcf7 }, + { 0xee930000, 0xd6070000, 0x92c10000, 0x2b9801e0, + 0x9451287c, 0x3b6cfb57, 0x45312374, 0x201f6a64, + 0x7b280000, 0x57420000, 0xa9e50000, 0x634300a0, + 0x9edb442f, 0x6d9995bb, 0x27f83b03, 0xc7ff60f0 }, + { 0x7b280000, 0x57420000, 0xa9e50000, 0x634300a0, + 0x9edb442f, 0x6d9995bb, 0x27f83b03, 0xc7ff60f0, + 0x95bb0000, 0x81450000, 0x3b240000, 0x48db0140, + 0x0a8a6c53, 0x56f56eec, 0x62c91877, 0xe7e00a94 } }; #define s0 m0 @@ -545,6 +350,39 @@ static const sph_u32 T512[64][16] = { #define sE c7 #define sF m7 +#define S00 M0 +#define S01 M1 +#define S02 C0 +#define S03 C1 +#define S04 M2 +#define S05 M3 +#define S06 C2 +#define S07 C3 +#define S08 C4 +#define S09 C5 +#define S0A M4 +#define S0B M5 +#define S0C C6 +#define S0D C7 +#define S0E M6 +#define S0F M7 +#define S10 M8 +#define S11 M9 +#define S12 C8 +#define S13 C9 +#define S14 MA +#define S15 MB +#define S16 CA +#define S17 CB +#define S18 CC +#define S19 CD +#define S1A MC +#define S1B MD +#define S1C CE +#define S1D CF +#define S1E ME +#define S1F MF + #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) // Hamsi 8 way AVX512 @@ -562,14 +400,14 @@ do { \ for ( int u = 0; u < 64; u++ ) \ { \ const __mmask8 dm = _mm512_cmplt_epi64_mask( db, zero ); \ - m0 = _mm512_mask_xor_epi64( m0, dm, m0, _mm512_set1_epi64( tp[0] ) ); \ - m1 = _mm512_mask_xor_epi64( m1, dm, m1, _mm512_set1_epi64( tp[1] ) ); \ - m2 = _mm512_mask_xor_epi64( m2, dm, m2, _mm512_set1_epi64( tp[2] ) ); \ - m3 = _mm512_mask_xor_epi64( m3, dm, m3, _mm512_set1_epi64( tp[3] ) ); \ - m4 = _mm512_mask_xor_epi64( m4, dm, m4, _mm512_set1_epi64( tp[4] ) ); \ - m5 = _mm512_mask_xor_epi64( m5, dm, m5, _mm512_set1_epi64( tp[5] ) ); \ - m6 = _mm512_mask_xor_epi64( m6, dm, m6, _mm512_set1_epi64( tp[6] ) ); \ - m7 = _mm512_mask_xor_epi64( m7, dm, m7, _mm512_set1_epi64( tp[7] ) ); \ + m0 = _mm512_mask_xor_epi64( m0, dm, m0, v512_64( tp[0] ) ); \ + m1 = _mm512_mask_xor_epi64( m1, dm, m1, v512_64( tp[1] ) ); \ + m2 = _mm512_mask_xor_epi64( m2, dm, m2, v512_64( tp[2] ) ); \ + m3 = _mm512_mask_xor_epi64( m3, dm, m3, v512_64( tp[3] ) ); \ + m4 = _mm512_mask_xor_epi64( m4, dm, m4, v512_64( tp[4] ) ); \ + m5 = _mm512_mask_xor_epi64( m5, dm, m5, v512_64( tp[5] ) ); \ + m6 = _mm512_mask_xor_epi64( m6, dm, m6, v512_64( tp[6] ) ); \ + m7 
= _mm512_mask_xor_epi64( m7, dm, m7, v512_64( tp[7] ) ); \ db = _mm512_ror_epi64( db, 1 ); \ tp += 8; \ } \ @@ -656,7 +494,6 @@ do { \ SBOX8( s1, s5, s9, sD ); /* ( c0, m2, c4, m6 ) */ \ SBOX8( s2, s6, sA, sE ); /* ( m1, c3, m5, c7 ) */ \ SBOX8( s3, s7, sB, sF ); /* ( c1, m3, c5, m7 ) */ \ -\ s4 = mm512_swap64_32( s4 ); \ s5 = mm512_swap64_32( s5 ); \ sD = mm512_swap64_32( sD ); \ @@ -664,7 +501,6 @@ do { \ t0 = _mm512_mask_blend_epi32( 0xaaaa, s4, s5 ); \ t1 = _mm512_mask_blend_epi32( 0xaaaa, sD, sE ); \ L8( s0, t0, s9, t1 ); \ -\ s6 = mm512_swap64_32( s6 ); \ sF = mm512_swap64_32( sF ); \ t2 = _mm512_mask_blend_epi32( 0xaaaa, s5, s6 ); \ @@ -733,17 +569,17 @@ do { \ __m512i alpha[16]; \ const uint64_t A0 = ( (uint64_t*)alpha_n )[0]; \ for( int i = 0; i < 16; i++ ) \ - alpha[i] = _mm512_set1_epi64( ( (uint64_t*)alpha_n )[i] ); \ + alpha[i] = v512_64( ( (uint64_t*)alpha_n )[i] ); \ ROUND_BIG8( alpha ); \ - alpha[0] = _mm512_set1_epi64( (1ULL << 32) ^ A0 ); \ + alpha[0] = v512_64( (1ULL << 32) ^ A0 ); \ ROUND_BIG8( alpha ); \ - alpha[0] = _mm512_set1_epi64( (2ULL << 32) ^ A0 ); \ + alpha[0] = v512_64( (2ULL << 32) ^ A0 ); \ ROUND_BIG8( alpha ); \ - alpha[0] = _mm512_set1_epi64( (3ULL << 32) ^ A0 ); \ + alpha[0] = v512_64( (3ULL << 32) ^ A0 ); \ ROUND_BIG8( alpha ); \ - alpha[0] = _mm512_set1_epi64( (4ULL << 32) ^ A0 ); \ + alpha[0] = v512_64( (4ULL << 32) ^ A0 ); \ ROUND_BIG8( alpha ); \ - alpha[0] = _mm512_set1_epi64( (5ULL << 32) ^ A0 ); \ + alpha[0] = v512_64( (5ULL << 32) ^ A0 ); \ ROUND_BIG8( alpha ); \ } while (0) @@ -752,29 +588,29 @@ do { \ __m512i alpha[16]; \ const uint64_t A0 = ( (uint64_t*)alpha_f )[0]; \ for( int i = 0; i < 16; i++ ) \ - alpha[i] = _mm512_set1_epi64( ( (uint64_t*)alpha_f )[i] ); \ + alpha[i] = v512_64( ( (uint64_t*)alpha_f )[i] ); \ ROUND_BIG8( alpha ); \ - alpha[0] = _mm512_set1_epi64( ( 1ULL << 32) ^ A0 ); \ + alpha[0] = v512_64( ( 1ULL << 32) ^ A0 ); \ ROUND_BIG8( alpha ); \ - alpha[0] = _mm512_set1_epi64( ( 2ULL << 32) ^ A0 ); \ + alpha[0] = v512_64( ( 2ULL << 32) ^ A0 ); \ ROUND_BIG8( alpha ); \ - alpha[0] = _mm512_set1_epi64( ( 3ULL << 32) ^ A0 ); \ + alpha[0] = v512_64( ( 3ULL << 32) ^ A0 ); \ ROUND_BIG8( alpha ); \ - alpha[0] = _mm512_set1_epi64( ( 4ULL << 32) ^ A0 ); \ + alpha[0] = v512_64( ( 4ULL << 32) ^ A0 ); \ ROUND_BIG8( alpha ); \ - alpha[0] = _mm512_set1_epi64( ( 5ULL << 32) ^ A0 ); \ + alpha[0] = v512_64( ( 5ULL << 32) ^ A0 ); \ ROUND_BIG8( alpha ); \ - alpha[0] = _mm512_set1_epi64( ( 6ULL << 32) ^ A0 ); \ + alpha[0] = v512_64( ( 6ULL << 32) ^ A0 ); \ ROUND_BIG8( alpha ); \ - alpha[0] = _mm512_set1_epi64( ( 7ULL << 32) ^ A0 ); \ + alpha[0] = v512_64( ( 7ULL << 32) ^ A0 ); \ ROUND_BIG8( alpha ); \ - alpha[0] = _mm512_set1_epi64( ( 8ULL << 32) ^ A0 ); \ + alpha[0] = v512_64( ( 8ULL << 32) ^ A0 ); \ ROUND_BIG8( alpha ); \ - alpha[0] = _mm512_set1_epi64( ( 9ULL << 32) ^ A0 ); \ + alpha[0] = v512_64( ( 9ULL << 32) ^ A0 ); \ ROUND_BIG8( alpha ); \ - alpha[0] = _mm512_set1_epi64( (10ULL << 32) ^ A0 ); \ + alpha[0] = v512_64( (10ULL << 32) ^ A0 ); \ ROUND_BIG8( alpha ); \ - alpha[0] = _mm512_set1_epi64( (11ULL << 32) ^ A0 ); \ + alpha[0] = v512_64( (11ULL << 32) ^ A0 ); \ ROUND_BIG8( alpha ); \ } while (0) @@ -790,13 +626,335 @@ do { /* order is important */ \ c0 = sc->h[ 0 ] = _mm512_xor_si512( sc->h[ 0 ], s0 ); /* m0 */ \ } while (0) +/////////////////////// +// +// Experimental + +// Hamsi 16 way 32 bit. 
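+// For reference, a minimal scalar sketch of the expansion that the
+// INPUT_16X32 macro below vectorises across 16 lanes: each of the 64
+// message bits selects one 16-word row of T512 and the selected rows
+// are XORed together. Illustration only, kept out of the build; the
+// bit order shown assumes it matches the ror/cmplt walk of the vector
+// code.
+#if 0
+static void hamsi512_expand_ref( uint32_t m[16], const uint32_t msg[2] )
+{
+   const uint32_t *tp = (const uint32_t*)T512;
+   for ( int i = 0; i < 16; i++ ) m[i] = 0;
+   for ( int v = 0; v < 2; v++ )        // two 32-bit message words
+   for ( int u = 0; u < 32; u++ )       // one T512 row per message bit
+   {
+      if ( ( msg[v] >> u ) & 1 )
+         for ( int i = 0; i < 16; i++ ) m[i] ^= tp[i];
+      tp += 16;
+   }
+}
+#endif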
+ +#define DECL_STATE_16X32 \ + __m512i C0, C1, C2, C3, C4, C5, C6, C7, C8, C9, CA, CB, CC, CD, CE, CF; \ + +#define READ_STATE_16X32(sc) \ + C0 = sc->h[ 0]; \ + C1 = sc->h[ 1]; \ + C2 = sc->h[ 2]; \ + C3 = sc->h[ 3]; \ + C4 = sc->h[ 4]; \ + C5 = sc->h[ 5]; \ + C6 = sc->h[ 6]; \ + C7 = sc->h[ 7]; \ + C8 = sc->h[ 8]; \ + C9 = sc->h[ 9]; \ + CA = sc->h[10]; \ + CB = sc->h[11]; \ + CC = sc->h[12]; \ + CD = sc->h[13]; \ + CE = sc->h[14]; \ + CF = sc->h[15]; + +#define WRITE_STATE_16X32(sc) \ + sc->h[ 0] = C0; \ + sc->h[ 1] = C1; \ + sc->h[ 2] = C2; \ + sc->h[ 3] = C3; \ + sc->h[ 4] = C4; \ + sc->h[ 5] = C5; \ + sc->h[ 6] = C6; \ + sc->h[ 7] = C7; \ + sc->h[ 8] = C8; \ + sc->h[ 9] = C9; \ + sc->h[10] = CA; \ + sc->h[11] = CB; \ + sc->h[12] = CC; \ + sc->h[13] = CD; \ + sc->h[14] = CE; \ + sc->h[15] = CF; + + +#define INPUT_16X32 \ +{ \ + const __m512i zero = _mm512_setzero_si512(); \ + const uint32_t *tp = (const uint32_t*)T512; \ + M0 = M1 = M2 = M3 = M4 = M5 = M6 = M7 = \ + M8 = M9 = MA = MB = MC = MD = ME = MF = zero; \ + for ( int v = 0; v < 2; v++ ) \ + { \ + __m512i db = _mm512_ror_epi32( buf[v], 1 ); \ + for ( int u = 0; u < 32; u++ ) \ + { \ + __mmask16 dm = _mm512_cmplt_epi32_mask( db, zero ); \ + M0 = _mm512_mask_xor_epi32( M0, dm, M0, v512_32( tp[ 0] ) ); \ + M1 = _mm512_mask_xor_epi32( M1, dm, M1, v512_32( tp[ 1] ) ); \ + M2 = _mm512_mask_xor_epi32( M2, dm, M2, v512_32( tp[ 2] ) ); \ + M3 = _mm512_mask_xor_epi32( M3, dm, M3, v512_32( tp[ 3] ) ); \ + M4 = _mm512_mask_xor_epi32( M4, dm, M4, v512_32( tp[ 4] ) ); \ + M5 = _mm512_mask_xor_epi32( M5, dm, M5, v512_32( tp[ 5] ) ); \ + M6 = _mm512_mask_xor_epi32( M6, dm, M6, v512_32( tp[ 6] ) ); \ + M7 = _mm512_mask_xor_epi32( M7, dm, M7, v512_32( tp[ 7] ) ); \ + M8 = _mm512_mask_xor_epi32( M8, dm, M8, v512_32( tp[ 8] ) ); \ + M9 = _mm512_mask_xor_epi32( M9, dm, M9, v512_32( tp[ 9] ) ); \ + MA = _mm512_mask_xor_epi32( MA, dm, MA, v512_32( tp[10] ) ); \ + MB = _mm512_mask_xor_epi32( MB, dm, MB, v512_32( tp[11] ) ); \ + MC = _mm512_mask_xor_epi32( MC, dm, MC, v512_32( tp[12] ) ); \ + MD = _mm512_mask_xor_epi32( MD, dm, MD, v512_32( tp[13] ) ); \ + ME = _mm512_mask_xor_epi32( ME, dm, ME, v512_32( tp[14] ) ); \ + MF = _mm512_mask_xor_epi32( MF, dm, MF, v512_32( tp[15] ) ); \ + db = _mm512_ror_epi32( db, 1 ); \ + tp += 16; \ + } \ + } \ +} + +#define SBOX_16X32 SBOX8 +#define L_16X32 L8 + +#define ROUND_16X32( rc, alpha ) \ +{ \ + S00 = _mm512_xor_si512( S00, v512_32( alpha[ 0] ) ); \ + S01 = _mm512_xor_si512( S01, v512_32( alpha[ 1] ^ rc ) ); \ + S02 = _mm512_xor_si512( S02, v512_32( alpha[ 2] ) ); \ + S03 = _mm512_xor_si512( S03, v512_32( alpha[ 3] ) ); \ + S04 = _mm512_xor_si512( S04, v512_32( alpha[ 4] ) ); \ + S05 = _mm512_xor_si512( S05, v512_32( alpha[ 5] ) ); \ + S06 = _mm512_xor_si512( S06, v512_32( alpha[ 6] ) ); \ + S07 = _mm512_xor_si512( S07, v512_32( alpha[ 7] ) ); \ + S08 = _mm512_xor_si512( S08, v512_32( alpha[ 8] ) ); \ + S09 = _mm512_xor_si512( S09, v512_32( alpha[ 9] ) ); \ + S0A = _mm512_xor_si512( S0A, v512_32( alpha[10] ) ); \ + S0B = _mm512_xor_si512( S0B, v512_32( alpha[11] ) ); \ + S0C = _mm512_xor_si512( S0C, v512_32( alpha[12] ) ); \ + S0D = _mm512_xor_si512( S0D, v512_32( alpha[13] ) ); \ + S0E = _mm512_xor_si512( S0E, v512_32( alpha[14] ) ); \ + S0F = _mm512_xor_si512( S0F, v512_32( alpha[15] ) ); \ + S10 = _mm512_xor_si512( S10, v512_32( alpha[16] ) ); \ + S11 = _mm512_xor_si512( S11, v512_32( alpha[17] ) ); \ + S12 = _mm512_xor_si512( S12, v512_32( alpha[18] ) ); \ + S13 = _mm512_xor_si512( S13, v512_32( alpha[19] ) 
); \ + S14 = _mm512_xor_si512( S14, v512_32( alpha[20] ) ); \ + S15 = _mm512_xor_si512( S15, v512_32( alpha[21] ) ); \ + S16 = _mm512_xor_si512( S16, v512_32( alpha[22] ) ); \ + S17 = _mm512_xor_si512( S17, v512_32( alpha[23] ) ); \ + S18 = _mm512_xor_si512( S18, v512_32( alpha[24] ) ); \ + S19 = _mm512_xor_si512( S19, v512_32( alpha[25] ) ); \ + S1A = _mm512_xor_si512( S1A, v512_32( alpha[26] ) ); \ + S1B = _mm512_xor_si512( S1B, v512_32( alpha[27] ) ); \ + S1C = _mm512_xor_si512( S1C, v512_32( alpha[28] ) ); \ + S1D = _mm512_xor_si512( S1D, v512_32( alpha[29] ) ); \ + S1E = _mm512_xor_si512( S1E, v512_32( alpha[30] ) ); \ + S1F = _mm512_xor_si512( S1F, v512_32( alpha[31] ) ); \ + SBOX_16X32( S00, S08, S10, S18 ); \ + SBOX_16X32( S01, S09, S11, S19 ); \ + SBOX_16X32( S02, S0A, S12, S1A ); \ + SBOX_16X32( S03, S0B, S13, S1B ); \ + SBOX_16X32( S04, S0C, S14, S1C ); \ + SBOX_16X32( S05, S0D, S15, S1D ); \ + SBOX_16X32( S06, S0E, S16, S1E ); \ + SBOX_16X32( S07, S0F, S17, S1F ); \ + L_16X32( S00, S09, S12, S1B ); \ + L_16X32( S01, S0A, S13, S1C ); \ + L_16X32( S02, S0B, S14, S1D ); \ + L_16X32( S03, S0C, S15, S1E ); \ + L_16X32( S04, S0D, S16, S1F ); \ + L_16X32( S05, S0E, S17, S18 ); \ + L_16X32( S06, S0F, S10, S19 ); \ + L_16X32( S07, S08, S11, S1A ); \ + L_16X32( S00, S02, S05, S07 ); \ + L_16X32( S10, S13, S15, S16 ); \ + L_16X32( S09, S0B, S0C, S0E ); \ + L_16X32( S19, S1A, S1C, S1F ); \ +} + +#define P_16X32 \ + ROUND_16X32( 0, alpha_n ); \ + ROUND_16X32( 1, alpha_n ); \ + ROUND_16X32( 2, alpha_n ); \ + ROUND_16X32( 3, alpha_n ); \ + ROUND_16X32( 4, alpha_n ); \ + ROUND_16X32( 5, alpha_n ); + +#define PF_16X32 \ + ROUND_16X32( 0, alpha_f ); \ + ROUND_16X32( 1, alpha_f ); \ + ROUND_16X32( 2, alpha_f ); \ + ROUND_16X32( 3, alpha_f ); \ + ROUND_16X32( 4, alpha_f ); \ + ROUND_16X32( 5, alpha_f ); \ + ROUND_16X32( 6, alpha_f ); \ + ROUND_16X32( 7, alpha_f ); \ + ROUND_16X32( 8, alpha_f ); \ + ROUND_16X32( 9, alpha_f ); \ + ROUND_16X32( 10, alpha_f ); \ + ROUND_16X32( 11, alpha_f ); + +#define T_16X32 \ + /* order is important */ \ + CF = sc->h[15] = _mm512_xor_si512( sc->h[15], S17 ); \ + CE = sc->h[14] = _mm512_xor_si512( sc->h[14], S16 ); \ + CD = sc->h[13] = _mm512_xor_si512( sc->h[13], S15 ); \ + CC = sc->h[12] = _mm512_xor_si512( sc->h[12], S14 ); \ + CB = sc->h[11] = _mm512_xor_si512( sc->h[11], S13 ); \ + CA = sc->h[10] = _mm512_xor_si512( sc->h[10], S12 ); \ + C9 = sc->h[ 9] = _mm512_xor_si512( sc->h[ 9], S11 ); \ + C8 = sc->h[ 8] = _mm512_xor_si512( sc->h[ 8], S10 ); \ + C7 = sc->h[ 7] = _mm512_xor_si512( sc->h[ 7], S07 ); \ + C6 = sc->h[ 6] = _mm512_xor_si512( sc->h[ 6], S06 ); \ + C5 = sc->h[ 5] = _mm512_xor_si512( sc->h[ 5], S05 ); \ + C4 = sc->h[ 4] = _mm512_xor_si512( sc->h[ 4], S04 ); \ + C3 = sc->h[ 3] = _mm512_xor_si512( sc->h[ 3], S03 ); \ + C2 = sc->h[ 2] = _mm512_xor_si512( sc->h[ 2], S02 ); \ + C1 = sc->h[ 1] = _mm512_xor_si512( sc->h[ 1], S01 ); \ + C0 = sc->h[ 0] = _mm512_xor_si512( sc->h[ 0], S00 ); + +void hamsi_16x32_big( hamsi_16x32_big_context *sc, __m512i *buf, size_t num ) +{ + DECL_STATE_16X32 + uint32_t tmp = num << 6; + + sc->count_low = sc->count_low + tmp; + sc->count_high += (uint32_t)( (num >> 13) >> 13 ); + if ( sc->count_low < tmp ) + sc->count_high++; + + READ_STATE_16X32( sc ); + while ( num-- > 0 ) + { + __m512i M0, M1, M2, M3, M4, M5, M6, M7; + __m512i M8, M9, MA, MB, MC, MD, ME, MF; + INPUT_16X32; + P_16X32; + T_16X32; + buf += 2; + } + WRITE_STATE_16X32( sc ); +} + +void hamsi_16x32_big_final( hamsi_16x32_big_context *sc, __m512i *buf ) +{ + 
DECL_STATE_16X32 + READ_STATE_16X32( sc ); + __m512i M0, M1, M2, M3, M4, M5, M6, M7; + __m512i M8, M9, MA, MB, MC, MD, ME, MF; + INPUT_16X32; + PF_16X32; + T_16X32; + WRITE_STATE_16X32( sc ); +} + +void hamsi512_16x32_init( hamsi_16x32_big_context *sc ) +{ + sc->partial_len = 0; + sc->count_high = sc->count_low = 0; + sc->h[ 0] = v512_32( HAMSI_IV512[ 0] ); + sc->h[ 1] = v512_32( HAMSI_IV512[ 1] ); + sc->h[ 2] = v512_32( HAMSI_IV512[ 2] ); + sc->h[ 3] = v512_32( HAMSI_IV512[ 3] ); + sc->h[ 4] = v512_32( HAMSI_IV512[ 4] ); + sc->h[ 5] = v512_32( HAMSI_IV512[ 5] ); + sc->h[ 6] = v512_32( HAMSI_IV512[ 6] ); + sc->h[ 7] = v512_32( HAMSI_IV512[ 7] ); + sc->h[ 8] = v512_32( HAMSI_IV512[ 8] ); + sc->h[ 9] = v512_32( HAMSI_IV512[ 9] ); + sc->h[10] = v512_32( HAMSI_IV512[10] ); + sc->h[11] = v512_32( HAMSI_IV512[11] ); + sc->h[12] = v512_32( HAMSI_IV512[12] ); + sc->h[13] = v512_32( HAMSI_IV512[13] ); + sc->h[14] = v512_32( HAMSI_IV512[14] ); + sc->h[15] = v512_32( HAMSI_IV512[15] ); +} + +void hamsi512_16x32_update( hamsi_16x32_big_context *sc, const void *data, + size_t len ) +{ + __m512i *vdata = (__m512i*)data; + + hamsi_16x32_big( sc, vdata, len>>3 ); + vdata += ( (len & ~(size_t)7) >> 3 ); + len &= (size_t)7; + memcpy_512( sc->buf, vdata, len>>3 ); + sc->partial_len = len; +} + +void hamsi512_16x32_close( hamsi_16x32_big_context *sc, void *dst ) +{ + __m512i pad[2]; + uint32_t ch, cl; + + ch = bswap_32( sc->count_high ); + cl = bswap_32( sc->count_low + ( sc->partial_len << 3 ) ); + pad[0] = v512_32( ch ); + pad[1] = v512_32( cl ); + sc->buf[0] = v512_32( 0x80 ); + sc->buf[1] = _mm512_setzero_si512(); + hamsi_16x32_big( sc, sc->buf, 1 ); + hamsi_16x32_big_final( sc, pad ); + + mm512_block_bswap_32( (__m512i*)dst, sc->h ); + mm512_block_bswap_32( (__m512i*)dst + 8, sc->h + 8 ); +} + +void hamsi512_16x32_full( hamsi_16x32_big_context *sc, void *dst, + const void *data, size_t len ) +{ + // init + sc->partial_len = 0; + sc->count_high = sc->count_low = 0; + sc->h[ 0] = v512_32( HAMSI_IV512[ 0] ); + sc->h[ 1] = v512_32( HAMSI_IV512[ 1] ); + sc->h[ 2] = v512_32( HAMSI_IV512[ 2] ); + sc->h[ 3] = v512_32( HAMSI_IV512[ 3] ); + sc->h[ 4] = v512_32( HAMSI_IV512[ 4] ); + sc->h[ 5] = v512_32( HAMSI_IV512[ 5] ); + sc->h[ 6] = v512_32( HAMSI_IV512[ 6] ); + sc->h[ 7] = v512_32( HAMSI_IV512[ 7] ); + sc->h[ 8] = v512_32( HAMSI_IV512[ 8] ); + sc->h[ 9] = v512_32( HAMSI_IV512[ 9] ); + sc->h[10] = v512_32( HAMSI_IV512[10] ); + sc->h[11] = v512_32( HAMSI_IV512[11] ); + sc->h[12] = v512_32( HAMSI_IV512[12] ); + sc->h[13] = v512_32( HAMSI_IV512[13] ); + sc->h[14] = v512_32( HAMSI_IV512[14] ); + sc->h[15] = v512_32( HAMSI_IV512[15] ); + + // update + __m512i *vdata = (__m512i*)data; + + hamsi_16x32_big( sc, vdata, len>>3 ); + vdata += ( (len & ~(size_t)7) >> 3 ); + len &= (size_t)7; + memcpy_512( sc->buf, vdata, len>>3 ); + sc->partial_len = len; + + // close + __m512i pad[2]; + uint32_t ch, cl; + + ch = bswap_32( sc->count_high ); + cl = bswap_32( sc->count_low + ( sc->partial_len << 3 ) ); + pad[0] = v512_32( ch ); + pad[1] = v512_32( cl ); + sc->buf[0] = v512_32( 0x80 ); + sc->buf[1] = _mm512_setzero_si512(); + hamsi_16x32_big( sc, sc->buf, 1 ); + hamsi_16x32_big_final( sc, pad ); + + mm512_block_bswap_32( (__m512i*)dst, sc->h ); + mm512_block_bswap_32( (__m512i*)dst + 8, sc->h + 8 ); +} + +// +// +// +///////////////////////////////// + + void hamsi_8way_big( hamsi_8way_big_context *sc, __m512i *buf, size_t num ) { DECL_STATE_BIG8 uint32_t tmp = num << 6; - sc->count_low = SPH_T32( sc->count_low + tmp ); - 
sc->count_high += (sph_u32)( (num >> 13) >> 13 ); + sc->count_low = sc->count_low + tmp; + sc->count_high += (uint32_t)( (num >> 13) >> 13 ); if ( sc->count_low < tmp ) sc->count_high++; @@ -804,7 +962,6 @@ void hamsi_8way_big( hamsi_8way_big_context *sc, __m512i *buf, size_t num ) while ( num-- > 0 ) { __m512i m0, m1, m2, m3, m4, m5, m6, m7; - INPUT_BIG8; P_BIG8; T_BIG8; @@ -816,6 +973,7 @@ void hamsi_8way_big( hamsi_8way_big_context *sc, __m512i *buf, size_t num ) void hamsi_8way_big_final( hamsi_8way_big_context *sc, __m512i *buf ) { __m512i m0, m1, m2, m3, m4, m5, m6, m7; + DECL_STATE_BIG8 READ_STATE_BIG8( sc ); INPUT_BIG8; @@ -828,16 +986,27 @@ void hamsi512_8way_init( hamsi_8way_big_context *sc ) { sc->partial_len = 0; sc->count_high = sc->count_low = 0; - - sc->h[0] = _mm512_set1_epi64( 0x6c70617273746565 ); - sc->h[1] = _mm512_set1_epi64( 0x656e62656b204172 ); - sc->h[2] = _mm512_set1_epi64( 0x302c206272672031 ); - sc->h[3] = _mm512_set1_epi64( 0x3434362c75732032 ); - sc->h[4] = _mm512_set1_epi64( 0x3030312020422d33 ); - sc->h[5] = _mm512_set1_epi64( 0x656e2d484c657576 ); - sc->h[6] = _mm512_set1_epi64( 0x6c65652c65766572 ); - sc->h[7] = _mm512_set1_epi64( 0x6769756d2042656c ); -} + const uint64_t *iv = (const uint64_t*)HAMSI_IV512; + + sc->h[0] = v512_64( iv[0] ); + sc->h[1] = v512_64( iv[1] ); + sc->h[2] = v512_64( iv[2] ); + sc->h[3] = v512_64( iv[3] ); + sc->h[4] = v512_64( iv[4] ); + sc->h[5] = v512_64( iv[5] ); + sc->h[6] = v512_64( iv[6] ); + sc->h[7] = v512_64( iv[7] ); +/* + sc->h[0] = v512_64( 0x6c70617273746565 ); + sc->h[1] = v512_64( 0x656e62656b204172 ); + sc->h[2] = v512_64( 0x302c206272672031 ); + sc->h[3] = v512_64( 0x3434362c75732032 ); + sc->h[4] = v512_64( 0x3030312020422d33 ); + sc->h[5] = v512_64( 0x656e2d484c657576 ); + sc->h[6] = v512_64( 0x6c65652c65766572 ); + sc->h[7] = v512_64( 0x6769756d2042656c ); +*/ +} void hamsi512_8way_update( hamsi_8way_big_context *sc, const void *data, size_t len ) @@ -855,11 +1024,11 @@ void hamsi512_8way_close( hamsi_8way_big_context *sc, void *dst ) { __m512i pad[1]; uint32_t ch, cl; - - sph_enc32be( &ch, sc->count_high ); - sph_enc32be( &cl, sc->count_low + ( sc->partial_len << 3 ) ); - pad[0] = _mm512_set1_epi64( ((uint64_t)cl << 32 ) | (uint64_t)ch ); - sc->buf[0] = _mm512_set1_epi64( 0x80 ); + + ch = bswap_32( sc->count_high ); + cl = bswap_32( sc->count_low + ( sc->partial_len << 3 ) ); + pad[0] = v512_64( ((uint64_t)cl << 32 ) | (uint64_t)ch ); + sc->buf[0] = v512_64( 0x80 ); hamsi_8way_big( sc, sc->buf, 1 ); hamsi_8way_big_final( sc, pad ); @@ -881,14 +1050,14 @@ do { \ for ( int u = 0; u < 64; u++ ) \ { \ const __mmask8 dm = _mm256_cmplt_epi64_mask( db, zero ); \ - m0 = _mm256_mask_xor_epi64( m0, dm, m0, _mm256_set1_epi64x( tp[0] ) ); \ - m1 = _mm256_mask_xor_epi64( m1, dm, m1, _mm256_set1_epi64x( tp[1] ) ); \ - m2 = _mm256_mask_xor_epi64( m2, dm, m2, _mm256_set1_epi64x( tp[2] ) ); \ - m3 = _mm256_mask_xor_epi64( m3, dm, m3, _mm256_set1_epi64x( tp[3] ) ); \ - m4 = _mm256_mask_xor_epi64( m4, dm, m4, _mm256_set1_epi64x( tp[4] ) ); \ - m5 = _mm256_mask_xor_epi64( m5, dm, m5, _mm256_set1_epi64x( tp[5] ) ); \ - m6 = _mm256_mask_xor_epi64( m6, dm, m6, _mm256_set1_epi64x( tp[6] ) ); \ - m7 = _mm256_mask_xor_epi64( m7, dm, m7, _mm256_set1_epi64x( tp[7] ) ); \ + m0 = _mm256_mask_xor_epi64( m0, dm, m0, v256_64( tp[0] ) ); \ + m1 = _mm256_mask_xor_epi64( m1, dm, m1, v256_64( tp[1] ) ); \ + m2 = _mm256_mask_xor_epi64( m2, dm, m2, v256_64( tp[2] ) ); \ + m3 = _mm256_mask_xor_epi64( m3, dm, m3, v256_64( tp[3] ) ); \ + m4 = 
_mm256_mask_xor_epi64( m4, dm, m4, v256_64( tp[4] ) ); \ + m5 = _mm256_mask_xor_epi64( m5, dm, m5, v256_64( tp[5] ) ); \ + m6 = _mm256_mask_xor_epi64( m6, dm, m6, v256_64( tp[6] ) ); \ + m7 = _mm256_mask_xor_epi64( m7, dm, m7, v256_64( tp[7] ) ); \ db = _mm256_ror_epi64( db, 1 ); \ tp += 8; \ } \ @@ -906,21 +1075,21 @@ do { \ { \ __m256i dm = _mm256_cmpgt_epi64( zero, _mm256_slli_epi64( db, u ) ); \ m0 = _mm256_xor_si256( m0, _mm256_and_si256( dm, \ - _mm256_set1_epi64x( tp[0] ) ) ); \ + v256_64( tp[0] ) ) ); \ m1 = _mm256_xor_si256( m1, _mm256_and_si256( dm, \ - _mm256_set1_epi64x( tp[1] ) ) ); \ + v256_64( tp[1] ) ) ); \ m2 = _mm256_xor_si256( m2, _mm256_and_si256( dm, \ - _mm256_set1_epi64x( tp[2] ) ) ); \ + v256_64( tp[2] ) ) ); \ m3 = _mm256_xor_si256( m3, _mm256_and_si256( dm, \ - _mm256_set1_epi64x( tp[3] ) ) ); \ + v256_64( tp[3] ) ) ); \ m4 = _mm256_xor_si256( m4, _mm256_and_si256( dm, \ - _mm256_set1_epi64x( tp[4] ) ) ); \ + v256_64( tp[4] ) ) ); \ m5 = _mm256_xor_si256( m5, _mm256_and_si256( dm, \ - _mm256_set1_epi64x( tp[5] ) ) ); \ + v256_64( tp[5] ) ) ); \ m6 = _mm256_xor_si256( m6, _mm256_and_si256( dm, \ - _mm256_set1_epi64x( tp[6] ) ) ); \ + v256_64( tp[6] ) ) ); \ m7 = _mm256_xor_si256( m7, _mm256_and_si256( dm, \ - _mm256_set1_epi64x( tp[7] ) ) ); \ + v256_64( tp[7] ) ) ); \ tp += 8; \ } \ } while (0) @@ -1126,17 +1295,17 @@ do { \ __m256i alpha[16]; \ const uint64_t A0 = ( (uint64_t*)alpha_n )[0]; \ for( int i = 0; i < 16; i++ ) \ - alpha[i] = _mm256_set1_epi64x( ( (uint64_t*)alpha_n )[i] ); \ + alpha[i] = v256_64( ( (uint64_t*)alpha_n )[i] ); \ ROUND_BIG( alpha ); \ - alpha[0] = _mm256_set1_epi64x( (1ULL << 32) ^ A0 ); \ + alpha[0] = v256_64( (1ULL << 32) ^ A0 ); \ ROUND_BIG( alpha ); \ - alpha[0] = _mm256_set1_epi64x( (2ULL << 32) ^ A0 ); \ + alpha[0] = v256_64( (2ULL << 32) ^ A0 ); \ ROUND_BIG( alpha ); \ - alpha[0] = _mm256_set1_epi64x( (3ULL << 32) ^ A0 ); \ + alpha[0] = v256_64( (3ULL << 32) ^ A0 ); \ ROUND_BIG( alpha ); \ - alpha[0] = _mm256_set1_epi64x( (4ULL << 32) ^ A0 ); \ + alpha[0] = v256_64( (4ULL << 32) ^ A0 ); \ ROUND_BIG( alpha ); \ - alpha[0] = _mm256_set1_epi64x( (5ULL << 32) ^ A0 ); \ + alpha[0] = v256_64( (5ULL << 32) ^ A0 ); \ ROUND_BIG( alpha ); \ } while (0) @@ -1145,29 +1314,29 @@ do { \ __m256i alpha[16]; \ const uint64_t A0 = ( (uint64_t*)alpha_f )[0]; \ for( int i = 0; i < 16; i++ ) \ - alpha[i] = _mm256_set1_epi64x( ( (uint64_t*)alpha_f )[i] ); \ + alpha[i] = v256_64( ( (uint64_t*)alpha_f )[i] ); \ ROUND_BIG( alpha ); \ - alpha[0] = _mm256_set1_epi64x( ( 1ULL << 32) ^ A0 ); \ + alpha[0] = v256_64( ( 1ULL << 32) ^ A0 ); \ ROUND_BIG( alpha ); \ - alpha[0] = _mm256_set1_epi64x( ( 2ULL << 32) ^ A0 ); \ + alpha[0] = v256_64( ( 2ULL << 32) ^ A0 ); \ ROUND_BIG( alpha ); \ - alpha[0] = _mm256_set1_epi64x( ( 3ULL << 32) ^ A0 ); \ + alpha[0] = v256_64( ( 3ULL << 32) ^ A0 ); \ ROUND_BIG( alpha ); \ - alpha[0] = _mm256_set1_epi64x( ( 4ULL << 32) ^ A0 ); \ + alpha[0] = v256_64( ( 4ULL << 32) ^ A0 ); \ ROUND_BIG( alpha ); \ - alpha[0] = _mm256_set1_epi64x( ( 5ULL << 32) ^ A0 ); \ + alpha[0] = v256_64( ( 5ULL << 32) ^ A0 ); \ ROUND_BIG( alpha ); \ - alpha[0] = _mm256_set1_epi64x( ( 6ULL << 32) ^ A0 ); \ + alpha[0] = v256_64( ( 6ULL << 32) ^ A0 ); \ ROUND_BIG( alpha ); \ - alpha[0] = _mm256_set1_epi64x( ( 7ULL << 32) ^ A0 ); \ + alpha[0] = v256_64( ( 7ULL << 32) ^ A0 ); \ ROUND_BIG( alpha ); \ - alpha[0] = _mm256_set1_epi64x( ( 8ULL << 32) ^ A0 ); \ + alpha[0] = v256_64( ( 8ULL << 32) ^ A0 ); \ ROUND_BIG( alpha ); \ - alpha[0] = _mm256_set1_epi64x( ( 9ULL 
<< 32) ^ A0 ); \ + alpha[0] = v256_64( ( 9ULL << 32) ^ A0 ); \ ROUND_BIG( alpha ); \ - alpha[0] = _mm256_set1_epi64x( (10ULL << 32) ^ A0 ); \ + alpha[0] = v256_64( (10ULL << 32) ^ A0 ); \ ROUND_BIG( alpha ); \ - alpha[0] = _mm256_set1_epi64x( (11ULL << 32) ^ A0 ); \ + alpha[0] = v256_64( (11ULL << 32) ^ A0 ); \ ROUND_BIG( alpha ); \ } while (0) @@ -1183,14 +1352,376 @@ do { /* order is important */ \ c0 = sc->h[ 0 ] = _mm256_xor_si256( sc->h[ 0 ], s0 ); \ } while (0) + +// Hamsi-512 8x32 + +// Experimental untested + + +#define DECL_STATE_8X32 \ + __m256i C0, C1, C2, C3, C4, C5, C6, C7, C8, C9, CA, CB, CC, CD, CE, CF; \ + +#define READ_STATE_8X32(sc) \ + C0 = sc->h[ 0]; \ + C1 = sc->h[ 1]; \ + C2 = sc->h[ 2]; \ + C3 = sc->h[ 3]; \ + C4 = sc->h[ 4]; \ + C5 = sc->h[ 5]; \ + C6 = sc->h[ 6]; \ + C7 = sc->h[ 7]; \ + C8 = sc->h[ 8]; \ + C9 = sc->h[ 9]; \ + CA = sc->h[10]; \ + CB = sc->h[11]; \ + CC = sc->h[12]; \ + CD = sc->h[13]; \ + CE = sc->h[14]; \ + CF = sc->h[15]; + +#define WRITE_STATE_8X32(sc) \ + sc->h[ 0] = C0; \ + sc->h[ 1] = C1; \ + sc->h[ 2] = C2; \ + sc->h[ 3] = C3; \ + sc->h[ 4] = C4; \ + sc->h[ 5] = C5; \ + sc->h[ 6] = C6; \ + sc->h[ 7] = C7; \ + sc->h[ 8] = C8; \ + sc->h[ 9] = C9; \ + sc->h[10] = CA; \ + sc->h[11] = CB; \ + sc->h[12] = CC; \ + sc->h[13] = CD; \ + sc->h[14] = CE; \ + sc->h[15] = CF; + +#if defined(__AVX512VL__) + +#define INPUT_8X32 \ +{ \ + const __m256i zero = _mm256_setzero_si256(); \ + const uint32_t *tp = (const uint32_t*)T512; \ + M0 = M1 = M2 = M3 = M4 = M5 = M6 = M7 = \ + M8 = M9 = MA = MB = MC = MD = ME = MF = zero; \ + for ( int v = 0; v < 2; v++ ) \ + { \ + __m256i db = _mm256_ror_epi32( buf[v], 1 ); \ + for ( int u = 0; u < 32; u++ ) \ + { \ + __mmask8 dm = _mm256_cmplt_epi32_mask( db, zero ); \ + M0 = _mm256_mask_xor_epi32( M0, dm, M0, v256_32( tp[ 0] ) ); \ + M1 = _mm256_mask_xor_epi32( M1, dm, M1, v256_32( tp[ 1] ) ); \ + M2 = _mm256_mask_xor_epi32( M2, dm, M2, v256_32( tp[ 2] ) ); \ + M3 = _mm256_mask_xor_epi32( M3, dm, M3, v256_32( tp[ 3] ) ); \ + M4 = _mm256_mask_xor_epi32( M4, dm, M4, v256_32( tp[ 4] ) ); \ + M5 = _mm256_mask_xor_epi32( M5, dm, M5, v256_32( tp[ 5] ) ); \ + M6 = _mm256_mask_xor_epi32( M6, dm, M6, v256_32( tp[ 6] ) ); \ + M7 = _mm256_mask_xor_epi32( M7, dm, M7, v256_32( tp[ 7] ) ); \ + M8 = _mm256_mask_xor_epi32( M8, dm, M8, v256_32( tp[ 8] ) ); \ + M9 = _mm256_mask_xor_epi32( M9, dm, M9, v256_32( tp[ 9] ) ); \ + MA = _mm256_mask_xor_epi32( MA, dm, MA, v256_32( tp[10] ) ); \ + MB = _mm256_mask_xor_epi32( MB, dm, MB, v256_32( tp[11] ) ); \ + MC = _mm256_mask_xor_epi32( MC, dm, MC, v256_32( tp[12] ) ); \ + MD = _mm256_mask_xor_epi32( MD, dm, MD, v256_32( tp[13] ) ); \ + ME = _mm256_mask_xor_epi32( ME, dm, ME, v256_32( tp[14] ) ); \ + MF = _mm256_mask_xor_epi32( MF, dm, MF, v256_32( tp[15] ) ); \ + db = _mm256_ror_epi32( db, 1 ); \ + tp += 16; \ + } \ + } \ +} + +#else + +#define INPUT_8X32 \ +{ \ + const __m256i zero = _mm256_setzero_si256(); \ + const uint32_t *tp = (const uint32_t*)T512; \ + M0 = M1 = M2 = M3 = M4 = M5 = M6 = M7 = \ + M8 = M9 = MA = MB = MC = MD = ME = MF = zero; \ + for ( int v = 0; v < 2; v++ ) \ + { \ + __m256i db = buf[v]; \ + for ( int u = 31; u >= 0; u-- ) \ + { \ + __m256i dm = _mm256_cmpgt_epi32( zero, _mm256_slli_epi32( db, u ) ); \ + M0 = _mm256_xor_si256( M0, _mm256_and_si256( dm, v256_32( tp[ 0] ) ) ); \ + M1 = _mm256_xor_si256( M1, _mm256_and_si256( dm, v256_32( tp[ 1] ) ) ); \ + M2 = _mm256_xor_si256( M2, _mm256_and_si256( dm, v256_32( tp[ 2] ) ) ); \ + M3 = _mm256_xor_si256( M3, _mm256_and_si256( 
dm, v256_32( tp[ 3] ) ) ); \ + M4 = _mm256_xor_si256( M4, _mm256_and_si256( dm, v256_32( tp[ 4] ) ) ); \ + M5 = _mm256_xor_si256( M5, _mm256_and_si256( dm, v256_32( tp[ 5] ) ) ); \ + M6 = _mm256_xor_si256( M6, _mm256_and_si256( dm, v256_32( tp[ 6] ) ) ); \ + M7 = _mm256_xor_si256( M7, _mm256_and_si256( dm, v256_32( tp[ 7] ) ) ); \ + M8 = _mm256_xor_si256( M8, _mm256_and_si256( dm, v256_32( tp[ 8] ) ) ); \ + M9 = _mm256_xor_si256( M9, _mm256_and_si256( dm, v256_32( tp[ 9] ) ) ); \ + MA = _mm256_xor_si256( MA, _mm256_and_si256( dm, v256_32( tp[10] ) ) ); \ + MB = _mm256_xor_si256( MB, _mm256_and_si256( dm, v256_32( tp[11] ) ) ); \ + MC = _mm256_xor_si256( MC, _mm256_and_si256( dm, v256_32( tp[12] ) ) ); \ + MD = _mm256_xor_si256( MD, _mm256_and_si256( dm, v256_32( tp[13] ) ) ); \ + ME = _mm256_xor_si256( ME, _mm256_and_si256( dm, v256_32( tp[14] ) ) ); \ + MF = _mm256_xor_si256( MF, _mm256_and_si256( dm, v256_32( tp[15] ) ) ); \ + tp += 16; \ + } \ + } \ +} + +#endif + +#define SBOX_8X32 SBOX +#define L_8X32 L + +#define ROUND_8X32( rc, alpha ) \ +{ \ + S00 = _mm256_xor_si256( S00, v256_32( alpha[ 0] ) ); \ + S01 = _mm256_xor_si256( S01, v256_32( (alpha[ 1]) ^ (rc) ) ); \ + S02 = _mm256_xor_si256( S02, v256_32( alpha[ 2] ) ); \ + S03 = _mm256_xor_si256( S03, v256_32( alpha[ 3] ) ); \ + S04 = _mm256_xor_si256( S04, v256_32( alpha[ 4] ) ); \ + S05 = _mm256_xor_si256( S05, v256_32( alpha[ 5] ) ); \ + S06 = _mm256_xor_si256( S06, v256_32( alpha[ 6] ) ); \ + S07 = _mm256_xor_si256( S07, v256_32( alpha[ 7] ) ); \ + S08 = _mm256_xor_si256( S08, v256_32( alpha[ 8] ) ); \ + S09 = _mm256_xor_si256( S09, v256_32( alpha[ 9] ) ); \ + S0A = _mm256_xor_si256( S0A, v256_32( alpha[10] ) ); \ + S0B = _mm256_xor_si256( S0B, v256_32( alpha[11] ) ); \ + S0C = _mm256_xor_si256( S0C, v256_32( alpha[12] ) ); \ + S0D = _mm256_xor_si256( S0D, v256_32( alpha[13] ) ); \ + S0E = _mm256_xor_si256( S0E, v256_32( alpha[14] ) ); \ + S0F = _mm256_xor_si256( S0F, v256_32( alpha[15] ) ); \ + S10 = _mm256_xor_si256( S10, v256_32( alpha[16] ) ); \ + S11 = _mm256_xor_si256( S11, v256_32( alpha[17] ) ); \ + S12 = _mm256_xor_si256( S12, v256_32( alpha[18] ) ); \ + S13 = _mm256_xor_si256( S13, v256_32( alpha[19] ) ); \ + S14 = _mm256_xor_si256( S14, v256_32( alpha[20] ) ); \ + S15 = _mm256_xor_si256( S15, v256_32( alpha[21] ) ); \ + S16 = _mm256_xor_si256( S16, v256_32( alpha[22] ) ); \ + S17 = _mm256_xor_si256( S17, v256_32( alpha[23] ) ); \ + S18 = _mm256_xor_si256( S18, v256_32( alpha[24] ) ); \ + S19 = _mm256_xor_si256( S19, v256_32( alpha[25] ) ); \ + S1A = _mm256_xor_si256( S1A, v256_32( alpha[26] ) ); \ + S1B = _mm256_xor_si256( S1B, v256_32( alpha[27] ) ); \ + S1C = _mm256_xor_si256( S1C, v256_32( alpha[28] ) ); \ + S1D = _mm256_xor_si256( S1D, v256_32( alpha[29] ) ); \ + S1E = _mm256_xor_si256( S1E, v256_32( alpha[30] ) ); \ + S1F = _mm256_xor_si256( S1F, v256_32( alpha[31] ) ); \ + SBOX_8X32( S00, S08, S10, S18 ); \ + SBOX_8X32( S01, S09, S11, S19 ); \ + SBOX_8X32( S02, S0A, S12, S1A ); \ + SBOX_8X32( S03, S0B, S13, S1B ); \ + SBOX_8X32( S04, S0C, S14, S1C ); \ + SBOX_8X32( S05, S0D, S15, S1D ); \ + SBOX_8X32( S06, S0E, S16, S1E ); \ + SBOX_8X32( S07, S0F, S17, S1F ); \ + L_8X32( S00, S09, S12, S1B ); \ + L_8X32( S01, S0A, S13, S1C ); \ + L_8X32( S02, S0B, S14, S1D ); \ + L_8X32( S03, S0C, S15, S1E ); \ + L_8X32( S04, S0D, S16, S1F ); \ + L_8X32( S05, S0E, S17, S18 ); \ + L_8X32( S06, S0F, S10, S19 ); \ + L_8X32( S07, S08, S11, S1A ); \ + L_8X32( S00, S02, S05, S07 ); \ + L_8X32( S10, S13, S15, S16 ); \ + L_8X32( S09, S0B, 
S0C, S0E ); \ + L_8X32( S19, S1A, S1C, S1F ); \ +} + +#define P_8X32 \ + ROUND_8X32( 0, alpha_n ); \ + ROUND_8X32( 1, alpha_n ); \ + ROUND_8X32( 2, alpha_n ); \ + ROUND_8X32( 3, alpha_n ); \ + ROUND_8X32( 4, alpha_n ); \ + ROUND_8X32( 5, alpha_n ); + +#define PF_8X32 \ + ROUND_8X32( 0, alpha_f ); \ + ROUND_8X32( 1, alpha_f ); \ + ROUND_8X32( 2, alpha_f ); \ + ROUND_8X32( 3, alpha_f ); \ + ROUND_8X32( 4, alpha_f ); \ + ROUND_8X32( 5, alpha_f ); \ + ROUND_8X32( 6, alpha_f ); \ + ROUND_8X32( 7, alpha_f ); \ + ROUND_8X32( 8, alpha_f ); \ + ROUND_8X32( 9, alpha_f ); \ + ROUND_8X32( 10, alpha_f ); \ + ROUND_8X32( 11, alpha_f ); + +#define T_8X32 \ + /* order is important */ \ + CF = sc->h[15] = _mm256_xor_si256( sc->h[15], S17 ); \ + CE = sc->h[14] = _mm256_xor_si256( sc->h[14], S16 ); \ + CD = sc->h[13] = _mm256_xor_si256( sc->h[13], S15 ); \ + CC = sc->h[12] = _mm256_xor_si256( sc->h[12], S14 ); \ + CB = sc->h[11] = _mm256_xor_si256( sc->h[11], S13 ); \ + CA = sc->h[10] = _mm256_xor_si256( sc->h[10], S12 ); \ + C9 = sc->h[ 9] = _mm256_xor_si256( sc->h[ 9], S11 ); \ + C8 = sc->h[ 8] = _mm256_xor_si256( sc->h[ 8], S10 ); \ + C7 = sc->h[ 7] = _mm256_xor_si256( sc->h[ 7], S07 ); \ + C6 = sc->h[ 6] = _mm256_xor_si256( sc->h[ 6], S06 ); \ + C5 = sc->h[ 5] = _mm256_xor_si256( sc->h[ 5], S05 ); \ + C4 = sc->h[ 4] = _mm256_xor_si256( sc->h[ 4], S04 ); \ + C3 = sc->h[ 3] = _mm256_xor_si256( sc->h[ 3], S03 ); \ + C2 = sc->h[ 2] = _mm256_xor_si256( sc->h[ 2], S02 ); \ + C1 = sc->h[ 1] = _mm256_xor_si256( sc->h[ 1], S01 ); \ + C0 = sc->h[ 0] = _mm256_xor_si256( sc->h[ 0], S00 ); + + +void hamsi_8x32_big( hamsi_8x32_big_context *sc, __m256i *buf, size_t num ) +{ + DECL_STATE_8X32 + uint32_t tmp; + + tmp = (uint32_t)num << 6; + sc->count_low = sc->count_low + tmp; + sc->count_high += (uint32_t)( (num >> 13) >> 13 ); + if ( sc->count_low < tmp ) + sc->count_high++; + + READ_STATE_8X32( sc ); + while ( num-- > 0 ) + { + __m256i M0, M1, M2, M3, M4, M5, M6, M7; + __m256i M8, M9, MA, MB, MC, MD, ME, MF; + INPUT_8X32; + P_8X32; + T_8X32; + buf += 2; + } + WRITE_STATE_8X32( sc ); +} + +void hamsi_8x32_big_final( hamsi_8x32_big_context *sc, __m256i *buf ) +{ + __m256i M0, M1, M2, M3, M4, M5, M6, M7; + __m256i M8, M9, MA, MB, MC, MD, ME, MF; + + DECL_STATE_8X32 + READ_STATE_8X32( sc ); + INPUT_8X32; + PF_8X32; + T_8X32; + WRITE_STATE_8X32( sc ); +} + +void hamsi512_8x32_init( hamsi512_8x32_context *sc ) +{ + sc->partial_len = 0; + sc->count_high = sc->count_low = 0; + + sc->h[ 0] = v256_32( HAMSI_IV512[ 0] ); + sc->h[ 1] = v256_32( HAMSI_IV512[ 1] ); + sc->h[ 2] = v256_32( HAMSI_IV512[ 2] ); + sc->h[ 3] = v256_32( HAMSI_IV512[ 3] ); + sc->h[ 4] = v256_32( HAMSI_IV512[ 4] ); + sc->h[ 5] = v256_32( HAMSI_IV512[ 5] ); + sc->h[ 6] = v256_32( HAMSI_IV512[ 6] ); + sc->h[ 7] = v256_32( HAMSI_IV512[ 7] ); + sc->h[ 8] = v256_32( HAMSI_IV512[ 8] ); + sc->h[ 9] = v256_32( HAMSI_IV512[ 9] ); + sc->h[10] = v256_32( HAMSI_IV512[10] ); + sc->h[11] = v256_32( HAMSI_IV512[11] ); + sc->h[12] = v256_32( HAMSI_IV512[12] ); + sc->h[13] = v256_32( HAMSI_IV512[13] ); + sc->h[14] = v256_32( HAMSI_IV512[14] ); + sc->h[15] = v256_32( HAMSI_IV512[15] ); +} + +void hamsi512_8x32_update( hamsi512_8x32_context *sc, const void *data, + size_t len ) +{ + __m256i *vdata = (__m256i*)data; + + hamsi_8x32_big( sc, vdata, len >> 3 ); + vdata += ( (len & ~(size_t)7) >> 3 ); + len &= (size_t)7; + memcpy_256( sc->buf, vdata, len>> 3 ); + sc->partial_len = len; +} + +void hamsi512_8x32_close( hamsi512_8x32_context *sc, void *dst ) +{ + __m256i pad[2]; + 
uint32_t ch, cl; + + ch = bswap_32( sc->count_high ); + cl = bswap_32( sc->count_low + ( sc->partial_len << 3 ) ); + pad[0] = v256_32( ch ); + pad[1] = v256_32( cl ); + sc->buf[0] = v256_32( 0x80 ); + sc->buf[1] = _mm256_setzero_si256(); + hamsi_8x32_big( sc, sc->buf, 1 ); + hamsi_8x32_big_final( sc, pad ); + + mm256_block_bswap_32( (__m256i*)dst, sc->h ); + mm256_block_bswap_32( (__m256i*)dst + 8, sc->h + 8 ); +} + +void hamsi512_8x32_full( hamsi512_8x32_context *sc, void * dst, + const void *data, size_t len ) +{ + // init + sc->partial_len = 0; + sc->count_high = sc->count_low = 0; + + sc->h[ 0] = v256_32( HAMSI_IV512[ 0] ); + sc->h[ 1] = v256_32( HAMSI_IV512[ 1] ); + sc->h[ 2] = v256_32( HAMSI_IV512[ 2] ); + sc->h[ 3] = v256_32( HAMSI_IV512[ 3] ); + sc->h[ 4] = v256_32( HAMSI_IV512[ 4] ); + sc->h[ 5] = v256_32( HAMSI_IV512[ 5] ); + sc->h[ 6] = v256_32( HAMSI_IV512[ 6] ); + sc->h[ 7] = v256_32( HAMSI_IV512[ 7] ); + sc->h[ 8] = v256_32( HAMSI_IV512[ 8] ); + sc->h[ 9] = v256_32( HAMSI_IV512[ 9] ); + sc->h[10] = v256_32( HAMSI_IV512[10] ); + sc->h[11] = v256_32( HAMSI_IV512[11] ); + sc->h[12] = v256_32( HAMSI_IV512[12] ); + sc->h[13] = v256_32( HAMSI_IV512[13] ); + sc->h[14] = v256_32( HAMSI_IV512[14] ); + sc->h[15] = v256_32( HAMSI_IV512[15] ); + + //update + __m256i *vdata = (__m256i*)data; + + hamsi_8x32_big( sc, vdata, len >> 3 ); + vdata += ( (len & ~(size_t)7) >> 3 ); + len &= (size_t)7; + memcpy_256( sc->buf, vdata, len>> 3 ); + sc->partial_len = len; + + // close + __m256i pad[2]; + uint32_t ch, cl; + + ch = bswap_32( sc->count_high ); + cl = bswap_32( sc->count_low + ( sc->partial_len << 3 ) ); + pad[0] = v256_32( ch ); + pad[1] = v256_32( cl ); + sc->buf[0] = v256_32( 0x80 ); + sc->buf[1] = _mm256_setzero_si256(); + hamsi_8x32_big( sc, sc->buf, 1 ); + hamsi_8x32_big_final( sc, pad ); + + mm256_block_bswap_32( (__m256i*)dst, sc->h ); + mm256_block_bswap_32( (__m256i*)dst + 8, sc->h + 8 ); +} + + +//////////// + void hamsi_big( hamsi_4way_big_context *sc, __m256i *buf, size_t num ) { DECL_STATE_BIG - sph_u32 tmp; + uint32_t tmp; - tmp = SPH_T32( (sph_u32)num << 6 ); - sc->count_low = SPH_T32( sc->count_low + tmp ); - sc->count_high += (sph_u32)( (num >> 13) >> 13 ); + tmp = (uint32_t)num << 6; + sc->count_low = sc->count_low + tmp; + sc->count_high += (uint32_t)( (num >> 13) >> 13 ); if ( sc->count_low < tmp ) sc->count_high++; @@ -1223,14 +1754,14 @@ void hamsi512_4way_init( hamsi_4way_big_context *sc ) sc->partial_len = 0; sc->count_high = sc->count_low = 0; - sc->h[0] = _mm256_set1_epi64x( 0x6c70617273746565 ); - sc->h[1] = _mm256_set1_epi64x( 0x656e62656b204172 ); - sc->h[2] = _mm256_set1_epi64x( 0x302c206272672031 ); - sc->h[3] = _mm256_set1_epi64x( 0x3434362c75732032 ); - sc->h[4] = _mm256_set1_epi64x( 0x3030312020422d33 ); - sc->h[5] = _mm256_set1_epi64x( 0x656e2d484c657576 ); - sc->h[6] = _mm256_set1_epi64x( 0x6c65652c65766572 ); - sc->h[7] = _mm256_set1_epi64x( 0x6769756d2042656c ); + sc->h[0] = v256_64( 0x6c70617273746565 ); + sc->h[1] = v256_64( 0x656e62656b204172 ); + sc->h[2] = v256_64( 0x302c206272672031 ); + sc->h[3] = v256_64( 0x3434362c75732032 ); + sc->h[4] = v256_64( 0x3030312020422d33 ); + sc->h[5] = v256_64( 0x656e2d484c657576 ); + sc->h[6] = v256_64( 0x6c65652c65766572 ); + sc->h[7] = v256_64( 0x6769756d2042656c ); } void hamsi512_4way_update( hamsi_4way_big_context *sc, const void *data, @@ -1250,17 +1781,14 @@ void hamsi512_4way_close( hamsi_4way_big_context *sc, void *dst ) __m256i pad[1]; uint32_t ch, cl; - sph_enc32be( &ch, sc->count_high ); - 
sph_enc32be( &cl, sc->count_low + ( sc->partial_len << 3 ) ); - pad[0] = _mm256_set1_epi64x( ((uint64_t)cl << 32 ) | (uint64_t)ch ); - sc->buf[0] = _mm256_set1_epi64x( 0x80 ); + ch = bswap_32( sc->count_high ); + cl = bswap_32( sc->count_low + ( sc->partial_len << 3 ) ); + pad[0] = v256_64( ((uint64_t)cl << 32 ) | (uint64_t)ch ); + sc->buf[0] = v256_64( 0x80 ); hamsi_big( sc, sc->buf, 1 ); hamsi_big_final( sc, pad ); mm256_block_bswap_32( (__m256i*)dst, sc->h ); } -#ifdef __cplusplus -} -#endif #endif diff --git a/algo/hamsi/hamsi-hash-4way.h b/algo/hamsi/hamsi-hash-4way.h index 60e33b24..8e21219a 100644 --- a/algo/hamsi/hamsi-hash-4way.h +++ b/algo/hamsi/hamsi-hash-4way.h @@ -36,44 +36,64 @@ #define HAMSI_4WAY_H__ #include -#include "algo/sha/sph_types.h" #if defined (__AVX2__) #include "simd-utils.h" -#ifdef __cplusplus -extern "C"{ -#endif - -#define SPH_SIZE_hamsi512 512 +// Hamsi-512 4x64 // Partial is only scalar but needs pointer ref for hamsi-helper // deprecate partial_len -typedef struct { +typedef struct +{ __m256i h[8]; __m256i buf[1]; size_t partial_len; - sph_u32 count_high, count_low; + uint32_t count_high, count_low; } hamsi_4way_big_context; - typedef hamsi_4way_big_context hamsi512_4way_context; void hamsi512_4way_init( hamsi512_4way_context *sc ); void hamsi512_4way_update( hamsi512_4way_context *sc, const void *data, size_t len ); -//#define hamsi512_4way hamsi512_4way_update void hamsi512_4way_close( hamsi512_4way_context *sc, void *dst ); +#define hamsi512_4x64_context hamsi512_4way_context +#define hamsi512_4x64_init hamsi512_4way_init +#define hamsi512_4x64_update hamsi512_4way_update +#define hamsi512_4x64_close hamsi512_4way_close + +// Hamsi-512 8x32 + +typedef struct +{ + __m256i h[16]; + __m256i buf[2]; + size_t partial_len; + uint32_t count_high, count_low; +} hamsi_8x32_big_context; +typedef hamsi_8x32_big_context hamsi512_8x32_context; + +void hamsi512_8x32_init( hamsi512_8x32_context *sc ); +void hamsi512_8x32_update( hamsi512_8x32_context *sc, const void *data, + size_t len ); +void hamsi512_8x32_close( hamsi512_8x32_context *sc, void *dst ); +void hamsi512_8x32_full( hamsi512_8x32_context *sc, void *dst, const void *data, + size_t len ); + +#endif + #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +// Hamsi-512 8x64 + typedef struct { __m512i h[8]; __m512i buf[1]; size_t partial_len; - sph_u32 count_high, count_low; + uint32_t count_high, count_low; } hamsi_8way_big_context; - typedef hamsi_8way_big_context hamsi512_8way_context; void hamsi512_8way_init( hamsi512_8way_context *sc ); @@ -81,15 +101,29 @@ void hamsi512_8way_update( hamsi512_8way_context *sc, const void *data, size_t len ); void hamsi512_8way_close( hamsi512_8way_context *sc, void *dst ); +#define hamsi512_8x64_context hamsi512_8way_context +#define hamsi512_8x64_init hamsi512_8way_init +#define hamsi512_8x64_update hamsi512_8way_update +#define hamsi512_8x64_close hamsi512_8way_close +// Hamsi-512 16x32 -#endif - +typedef struct +{ + __m512i h[16]; + __m512i buf[2]; + size_t partial_len; + uint32_t count_high, count_low; +} hamsi_16x32_big_context; +typedef hamsi_16x32_big_context hamsi512_16x32_context; -#ifdef __cplusplus -} -#endif +void hamsi512_16x32_init( hamsi512_16x32_context *sc ); +void hamsi512_16x32_update( hamsi512_16x32_context *sc, const void *data, + size_t len ); +void hamsi512_16x32_close( hamsi512_16x32_context *sc, void *dst ); +void hamsi512_16x32_full( hamsi512_16x32_context *sc, void *dst, + const void *data, size_t 
len ); -#endif +#endif // AVX512 #endif diff --git a/algo/hamsi/sph_hamsi.h b/algo/hamsi/sph_hamsi.h index b0cb20c0..0d55ccfe 100644 --- a/algo/hamsi/sph_hamsi.h +++ b/algo/hamsi/sph_hamsi.h @@ -36,7 +36,7 @@ #define SPH_HAMSI_H__ #include -#include "algo/sha/sph_types.h" +#include "compat/sph_types.h" #ifdef __cplusplus extern "C"{ diff --git a/algo/haval/haval-4way-helper.c b/algo/haval/haval-4way-helper.c index 313b23fa..e3ae285a 100644 --- a/algo/haval/haval-4way-helper.c +++ b/algo/haval/haval-4way-helper.c @@ -48,7 +48,7 @@ SPH_XCAT(SPH_XCAT(haval, PASSES), _4way_update) while ( len > 0 ) { unsigned clen; - sph_u32 clow, clow2; + uint32_t clow, clow2; clen = 128U - current; if ( clen > len ) @@ -67,7 +67,7 @@ SPH_XCAT(SPH_XCAT(haval, PASSES), _4way_update) current = 0; } clow = sc->count_low; - clow2 = SPH_T32(clow + clen); + clow2 = clow + clen; sc->count_low = clow2; if ( clow2 < clow ) sc->count_high ++; diff --git a/algo/haval/haval-hash-4way.c b/algo/haval/haval-hash-4way.c index f3981c1c..39bbb756 100644 --- a/algo/haval/haval-hash-4way.c +++ b/algo/haval/haval-hash-4way.c @@ -292,7 +292,9 @@ static const unsigned MP5[32] = { 2, 23, 16, 22, 4, 1, 25, 15 }; -static const sph_u32 RK2[32] = { +#define SPH_C32(x) (x) + +static const uint32_t RK2[32] = { SPH_C32(0x452821E6), SPH_C32(0x38D01377), SPH_C32(0xBE5466CF), SPH_C32(0x34E90C6C), SPH_C32(0xC0AC29B7), SPH_C32(0xC97C50DD), @@ -311,7 +313,7 @@ static const sph_u32 RK2[32] = { SPH_C32(0x7B54A41D), SPH_C32(0xC25A59B5) }; -static const sph_u32 RK3[32] = { +static const uint32_t RK3[32] = { SPH_C32(0x9C30D539), SPH_C32(0x2AF26013), SPH_C32(0xC5D1B023), SPH_C32(0x286085F0), SPH_C32(0xCA417918), SPH_C32(0xB8DB38EF), @@ -330,7 +332,7 @@ static const sph_u32 RK3[32] = { SPH_C32(0xAFD6BA33), SPH_C32(0x6C24CF5C) }; -static const sph_u32 RK4[32] = { +static const uint32_t RK4[32] = { SPH_C32(0x7A325381), SPH_C32(0x28958677), SPH_C32(0x3B8F4898), SPH_C32(0x6B4BB9AF), SPH_C32(0xC4BFE81B), SPH_C32(0x66282193), @@ -349,7 +351,7 @@ static const sph_u32 RK4[32] = { SPH_C32(0x6EEF0B6C), SPH_C32(0x137A3BE4) }; -static const sph_u32 RK5[32] = { +static const uint32_t RK5[32] = { SPH_C32(0xBA3BF050), SPH_C32(0x7EFB2A98), SPH_C32(0xA1F1651D), SPH_C32(0x39AF0176), SPH_C32(0x66CA593E), SPH_C32(0x82430E88), diff --git a/algo/haval/haval-hash-4way.h b/algo/haval/haval-hash-4way.h index 9164d2fd..d032e1bc 100644 --- a/algo/haval/haval-hash-4way.h +++ b/algo/haval/haval-hash-4way.h @@ -68,7 +68,6 @@ extern "C"{ #endif #include -#include "algo/sha/sph_types.h" #include "simd-utils.h" #define SPH_SIZE_haval256_5 256 @@ -77,7 +76,7 @@ typedef struct { __m128i buf[32]; __m128i s0, s1, s2, s3, s4, s5, s6, s7; unsigned olen, passes; - sph_u32 count_high, count_low; + uint32_t count_high, count_low; } haval_4way_context; typedef haval_4way_context haval256_5_4way_context; diff --git a/algo/haval/sph-haval.h b/algo/haval/sph-haval.h index 9ec57721..710393a4 100644 --- a/algo/haval/sph-haval.h +++ b/algo/haval/sph-haval.h @@ -66,7 +66,7 @@ extern "C"{ #endif #include -#include "algo/sha/sph_types.h" +#include "compat/sph_types.h" /** * Output size (in bits) for HAVAL-128/3. diff --git a/algo/jh/sph_jh.h b/algo/jh/sph_jh.h index 77a0fdb4..a5c37695 100644 --- a/algo/jh/sph_jh.h +++ b/algo/jh/sph_jh.h @@ -41,7 +41,7 @@ extern "C"{ #endif #include -#include "algo/sha/sph_types.h" +#include "compat/sph_types.h" /** * Output size (in bits) for JH-224. 
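A usage sketch for the 16-lane Hamsi-512 interface declared in hamsi-hash-4way.h above. This is an illustration, not code from the patch: it assumes the input is already interleaved 32 bits per lane across 16 lanes, and that len counts bytes per lane (a multiple of the 8-byte block size in practice), as the update and close code implies.

#include "algo/hamsi/hamsi-hash-4way.h"

// Hash 16 independent messages in one pass (AVX512 build).
// data16: 32-bit interleaved input, 16 lanes per __m512i.
// dst: receives 16 interleaved 64-byte digests in the same layout.
static void hamsi512_16lanes_sketch( void *dst, const void *data16,
                                     size_t lane_len )
{
   hamsi512_16x32_context ctx;
   hamsi512_16x32_init( &ctx );
   hamsi512_16x32_update( &ctx, data16, lane_len );
   hamsi512_16x32_close( &ctx, dst );
}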
diff --git a/algo/keccak/keccak-4way.c b/algo/keccak/keccak-4way.c index f8b0cd1a..95f437e0 100644 --- a/algo/keccak/keccak-4way.c +++ b/algo/keccak/keccak-4way.c @@ -2,7 +2,6 @@ #include #include #include -#include "sph_keccak.h" #include "keccak-hash-4way.h" #if defined(KECCAK_8WAY) diff --git a/algo/keccak/keccak-gate.c b/algo/keccak/keccak-gate.c index c710836b..b2f0a212 100644 --- a/algo/keccak/keccak-gate.c +++ b/algo/keccak/keccak-gate.c @@ -9,7 +9,7 @@ int hard_coded_eb = 1; bool register_keccak_algo( algo_gate_t* gate ) { gate->optimizations = AVX2_OPT | AVX512_OPT; - gate->gen_merkle_root = (void*)&SHA256_gen_merkle_root; + gate->gen_merkle_root = (void*)&sha256_gen_merkle_root; opt_target_factor = 128.0; #if defined (KECCAK_8WAY) gate->scanhash = (void*)&scanhash_keccak_8way; diff --git a/algo/keccak/keccak-hash-4way.h b/algo/keccak/keccak-hash-4way.h index 5b91bcfe..20554091 100644 --- a/algo/keccak/keccak-hash-4way.h +++ b/algo/keccak/keccak-hash-4way.h @@ -1,45 +1,6 @@ -/* $Id: sph_keccak.h 216 2010-06-08 09:46:57Z tp $ */ -/** - * Keccak interface. This is the interface for Keccak with the - * recommended parameters for SHA-3, with output lengths 224, 256, - * 384 and 512 bits. - * - * ==========================(LICENSE BEGIN)============================ - * - * Copyright (c) 2007-2010 Projet RNRT SAPHIR - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - * ===========================(LICENSE END)============================= - * - * @file sph_keccak.h - * @author Thomas Pornin - */ - #ifndef KECCAK_HASH_4WAY_H__ #define KECCAK_HASH_4WAY_H__ -#ifdef __cplusplus -extern "C"{ -#endif - #ifdef __AVX2__ #include @@ -100,8 +61,4 @@ void keccak512_4way_addbits_and_close( #endif -#ifdef __cplusplus -} -#endif - #endif diff --git a/algo/keccak/sha3d-4way.c b/algo/keccak/sha3d-4way.c index ffb4056d..ca5ab726 100644 --- a/algo/keccak/sha3d-4way.c +++ b/algo/keccak/sha3d-4way.c @@ -2,7 +2,6 @@ #include #include #include -#include "sph_keccak.h" #include "keccak-hash-4way.h" #if defined(KECCAK_8WAY) diff --git a/algo/keccak/sph_keccak.h b/algo/keccak/sph_keccak.h index ec2dbfc7..b075f150 100644 --- a/algo/keccak/sph_keccak.h +++ b/algo/keccak/sph_keccak.h @@ -41,7 +41,7 @@ extern "C"{ #endif #include -#include "algo/sha/sph_types.h" +#include "compat/sph_types.h" /** * Output size (in bits) for Keccak-224. 
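The gen_merkle_root and scanhash assignments in keccak-gate.c above follow the repository's usual algo-gate dispatch pattern: pick the widest compiled SIMD path at registration time. A generic sketch with a hypothetical algo name (the field names and OPT flags come from the hunk above; the bool return convention is assumed):

#include "algo-gate-api.h"

bool register_myalgo_algo( algo_gate_t *gate )
{
   gate->optimizations = AVX2_OPT | AVX512_OPT;
#if defined(MYALGO_8WAY)
   gate->scanhash = (void*)&scanhash_myalgo_8way;   // widest SIMD path
#else
   gate->scanhash = (void*)&scanhash_myalgo_4way;   // narrower fallback
#endif
   return true;
}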
diff --git a/algo/lanehash/lane.h b/algo/lanehash/lane.h index 4a02e643..a0e85f60 100644 --- a/algo/lanehash/lane.h +++ b/algo/lanehash/lane.h @@ -23,7 +23,6 @@ #define LANE_H #include -//#include "algo/sha/sha3-defs.h" #include typedef unsigned char BitSequence; diff --git a/algo/luffa/luffa-hash-2way.c b/algo/luffa/luffa-hash-2way.c index 64d14069..f9b049b9 100644 --- a/algo/luffa/luffa-hash-2way.c +++ b/algo/luffa/luffa-hash-2way.c @@ -7,8 +7,10 @@ #include "simd-utils.h" +#define uint32 uint32_t + /* initial values of chaining variables */ -static const uint32 IV[40] __attribute((aligned(64))) = { +static const uint32_t IV[40] __attribute((aligned(64))) = { 0xdbf78465,0x4eaa6fb4,0x44b051e0,0x6d251e69, 0xdef610bb,0xee058139,0x90152df4,0x6e292011, 0xde099fa3,0x70eee9a0,0xd9d2f256,0xc3b44b95, @@ -22,7 +24,7 @@ static const uint32 IV[40] __attribute((aligned(64))) = { }; /* Round Constants */ -static const uint32 CNS_INIT[128] __attribute((aligned(64))) = { +static const uint32_t CNS_INIT[128] __attribute((aligned(64))) = { 0xb213afa5,0xfc20d9d2,0xb6de10ed,0x303994a6, 0xe028c9bf,0xe25e72c1,0x01685f3d,0xe0337818, 0xc84ebe95,0x34552e25,0x70f47aae,0xc0e65299, diff --git a/algo/luffa/luffa-hash-2way.h b/algo/luffa/luffa-hash-2way.h index ba9bc147..a274995f 100644 --- a/algo/luffa/luffa-hash-2way.h +++ b/algo/luffa/luffa-hash-2way.h @@ -23,7 +23,7 @@ #if defined(__AVX2__) #include -#include "algo/sha/sha3-defs.h" +//#include "algo/sha/sha3-defs.h" #include "simd-utils.h" /* The length of digests*/ @@ -54,7 +54,7 @@ #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) typedef struct { - uint32 buffer[8*4]; + uint32_t buffer[8*4]; __m512i chainv[10]; /* Chaining values */ int hashbitlen; int rembytes; @@ -82,7 +82,7 @@ int luffa512_4way_update_close( luffa_4way_context *state, void *output, #endif typedef struct { - uint32 buffer[8*2]; + uint32_t buffer[8*2]; __m256i chainv[10]; /* Chaining values */ int hashbitlen; int rembytes; diff --git a/algo/luffa/luffa_for_sse2.h b/algo/luffa/luffa_for_sse2.h index f20a400c..aaa066e6 100644 --- a/algo/luffa/luffa_for_sse2.h +++ b/algo/luffa/luffa_for_sse2.h @@ -22,7 +22,7 @@ */ #include -#include "algo/sha/sha3-defs.h" +#include "compat/sha3-defs.h" /* The length of digests*/ #define DIGEST_BIT_LEN_224 224 #define DIGEST_BIT_LEN_256 256 diff --git a/algo/luffa/sph_luffa.h b/algo/luffa/sph_luffa.h index 5201d2fc..5cc24b7d 100644 --- a/algo/luffa/sph_luffa.h +++ b/algo/luffa/sph_luffa.h @@ -41,7 +41,7 @@ extern "C"{ #endif #include -#include "algo/sha/sph_types.h" +#include "compat/sph_types.h" /** * Output size (in bits) for Luffa-224. 
diff --git a/algo/lyra2/lyra2.h b/algo/lyra2/lyra2.h index 5ab0b813..71c0a3f9 100644 --- a/algo/lyra2/lyra2.h +++ b/algo/lyra2/lyra2.h @@ -21,9 +21,8 @@ #define LYRA2_H_ #include -#include "algo/sha/sha3-defs.h" -//typedef unsigned char byte; +typedef unsigned char byte; //Block length required so Blake2's Initialization Vector (IV) is not overwritten (THIS SHOULD NOT BE MODIFIED) #define BLOCK_LEN_BLAKE2_SAFE_INT64 8 //512 bits (=64 bytes, =8 uint64_t) diff --git a/algo/lyra2/lyra2rev2.c b/algo/lyra2/lyra2rev2.c index 69155182..8e052971 100644 --- a/algo/lyra2/lyra2rev2.c +++ b/algo/lyra2/lyra2rev2.c @@ -4,7 +4,6 @@ #include #include "algo/blake/sph_blake.h" -#include "algo/cubehash/sph_cubehash.h" #include "algo/keccak/sph_keccak.h" #include "algo/skein/sph_skein.h" #include "algo/bmw/sph_bmw.h" diff --git a/algo/lyra2/lyra2rev3.c b/algo/lyra2/lyra2rev3.c index e72ec88c..d1e5b518 100644 --- a/algo/lyra2/lyra2rev3.c +++ b/algo/lyra2/lyra2rev3.c @@ -4,7 +4,6 @@ #include #include "algo/blake/sph_blake.h" -#include "algo/cubehash/sph_cubehash.h" #include "algo/bmw/sph_bmw.h" #include "algo/cubehash/cubehash_sse2.h" //#include "lyra2.h" diff --git a/algo/panama/sph_panama.h b/algo/panama/sph_panama.h index 6f9d3e8a..638e4313 100644 --- a/algo/panama/sph_panama.h +++ b/algo/panama/sph_panama.h @@ -58,7 +58,7 @@ #define SPH_PANAMA_H__ #include -#include "algo/sha/sph_types.h" +#include "compat/sph_types.h" /** * Output size (in bits) for PANAMA. diff --git a/algo/quark/hmq1725-4way.c b/algo/quark/hmq1725-4way.c index 45b10115..96c6d0f8 100644 --- a/algo/quark/hmq1725-4way.c +++ b/algo/quark/hmq1725-4way.c @@ -21,7 +21,7 @@ #include "algo/shabal/shabal-hash-4way.h" #include "algo/whirlpool/sph_whirlpool.h" #include "algo/haval/haval-hash-4way.h" -#include "algo/sha/sha-hash-4way.h" +#include "algo/sha/sha512-hash.h" #if defined(__VAES__) #include "algo/groestl/groestl512-hash-4way.h" #include "algo/shavite/shavite-hash-4way.h" diff --git a/algo/ripemd/lbry-4way.c b/algo/ripemd/lbry-4way.c index 386e2452..990a4af3 100644 --- a/algo/ripemd/lbry-4way.c +++ b/algo/ripemd/lbry-4way.c @@ -3,7 +3,8 @@ #include #include #include -#include "algo/sha/sha-hash-4way.h" +#include "algo/sha/sha256-hash.h" +#include "algo/sha/sha512-hash.h" #include "ripemd-hash-4way.h" #define LBRY_INPUT_SIZE 112 diff --git a/algo/ripemd/ripemd-hash-4way.h b/algo/ripemd/ripemd-hash-4way.h index 71fb3d73..c0c87db4 100644 --- a/algo/ripemd/ripemd-hash-4way.h +++ b/algo/ripemd/ripemd-hash-4way.h @@ -2,7 +2,6 @@ #define RIPEMD_HASH_4WAY_H__ #include -#include "algo/sha/sph_types.h" #if defined(__SSE4_2__) diff --git a/algo/ripemd/sph_ripemd.h b/algo/ripemd/sph_ripemd.h index b677bd54..c0019f9c 100644 --- a/algo/ripemd/sph_ripemd.h +++ b/algo/ripemd/sph_ripemd.h @@ -57,7 +57,7 @@ #define SPH_RIPEMD_H__ #include -#include "algo/sha/sph_types.h" +#include "compat/sph_types.h" /** * Output size (in bits) for RIPEMD. 
diff --git a/algo/scrypt/scrypt.c b/algo/scrypt/scrypt.c index c36411bd..b60a5ba8 100644 --- a/algo/scrypt/scrypt.c +++ b/algo/scrypt/scrypt.c @@ -31,7 +31,6 @@ #include #include #include -#include "algo/sha/sha-hash-4way.h" #include "algo/sha/sha256-hash.h" #include #include "malloc-huge.h" diff --git a/algo/sha/hmac-sha256-hash-4way.h b/algo/sha/hmac-sha256-hash-4way.h index f33fa23a..31d51cd9 100644 --- a/algo/sha/hmac-sha256-hash-4way.h +++ b/algo/sha/hmac-sha256-hash-4way.h @@ -36,7 +36,7 @@ #include #include #include "simd-utils.h" -#include "sha-hash-4way.h" +#include "sha256-hash.h" typedef struct _hmac_sha256_4way_context { diff --git a/algo/sha/sha-hash-4way.h b/algo/sha/sha-hash-4way.h deleted file mode 100644 index 2e95c7f0..00000000 --- a/algo/sha/sha-hash-4way.h +++ /dev/null @@ -1,168 +0,0 @@ -/* $Id: sph_sha2.h 216 2010-06-08 09:46:57Z tp $ */ -/** - * SHA-224, SHA-256, SHA-384 and SHA-512 interface. - * - * SHA-256 has been published in FIPS 180-2, now amended with a change - * notice to include SHA-224 as well (which is a simple variation on - * SHA-256). SHA-384 and SHA-512 are also defined in FIPS 180-2. FIPS - * standards can be found at: - * http://csrc.nist.gov/publications/fips/ - * - * ==========================(LICENSE BEGIN)============================ - * - * Copyright (c) 2007-2010 Projet RNRT SAPHIR - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- * - * ===========================(LICENSE END)============================= - * - * @file sph_sha2.h - * @author Thomas Pornin - */ - -#ifndef SHA2_HASH_4WAY_H__ -#define SHA2_HASH_4WAY_H__ 1 - -#include -#include "simd-utils.h" - -#if defined(__SSE2__) - -// SHA-256 4 way - -typedef struct { - __m128i buf[64>>2]; - __m128i val[8]; - uint32_t count_high, count_low; -} sha256_4way_context __attribute__ ((aligned (64))); - -void sha256_4way_init( sha256_4way_context *sc ); -void sha256_4way_update( sha256_4way_context *sc, const void *data, - size_t len ); -void sha256_4way_close( sha256_4way_context *sc, void *dst ); -void sha256_4way_full( void *dst, const void *data, size_t len ); -void sha256_4way_transform_le( __m128i *state_out, const __m128i *data, - const __m128i *state_in ); -void sha256_4way_transform_be( __m128i *state_out, const __m128i *data, - const __m128i *state_in ); -void sha256_4way_prehash_3rounds( __m128i *state_mid, __m128i *X, - const __m128i *W, const __m128i *state_in ); -void sha256_4way_final_rounds( __m128i *state_out, const __m128i *data, - const __m128i *state_in, const __m128i *state_mid, const __m128i *X ); -int sha256_4way_transform_le_short( __m128i *state_out, const __m128i *data, - const __m128i *state_in, const uint32_t *target ); - -#endif // SSE2 - -#if defined (__AVX2__) - -// SHA-256 8 way - -typedef struct { - __m256i buf[64>>2]; - __m256i val[8]; - uint32_t count_high, count_low; -} sha256_8way_context __attribute__ ((aligned (128))); - -void sha256_8way_init( sha256_8way_context *sc ); -void sha256_8way_update( sha256_8way_context *sc, const void *data, size_t len ); -void sha256_8way_close( sha256_8way_context *sc, void *dst ); -void sha256_8way_full( void *dst, const void *data, size_t len ); -void sha256_8way_transform_le( __m256i *state_out, const __m256i *data, - const __m256i *state_in ); -void sha256_8way_transform_be( __m256i *state_out, const __m256i *data, - const __m256i *state_in ); - -void sha256_8way_prehash_3rounds( __m256i *state_mid, __m256i *X, - const __m256i *W, const __m256i *state_in ); -void sha256_8way_final_rounds( __m256i *state_out, const __m256i *data, - const __m256i *state_in, const __m256i *state_mid, const __m256i *X ); -int sha256_8way_transform_le_short( __m256i *state_out, const __m256i *data, - const __m256i *state_in, const uint32_t *target ); - -#endif // AVX2 - -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) - -// SHA-256 16 way - -typedef struct { - __m512i buf[64>>2]; - __m512i val[8]; - uint32_t count_high, count_low; -} sha256_16way_context __attribute__ ((aligned (128))); - -void sha256_16way_init( sha256_16way_context *sc ); -void sha256_16way_update( sha256_16way_context *sc, const void *data, size_t len ); -void sha256_16way_close( sha256_16way_context *sc, void *dst ); -void sha256_16way_full( void *dst, const void *data, size_t len ); -void sha256_16way_transform_le( __m512i *state_out, const __m512i *data, - const __m512i *state_in ); -void sha256_16way_transform_be( __m512i *state_out, const __m512i *data, - const __m512i *state_in ); -void sha256_16way_prehash_3rounds( __m512i *state_mid, __m512i *X, - const __m512i *W, const __m512i *state_in ); -void sha256_16way_final_rounds( __m512i *state_out, const __m512i *data, - const __m512i *state_in, const __m512i *state_mid, const __m512i *X ); - -int sha256_16way_transform_le_short( __m512i *state_out, const __m512i *data, - const __m512i *state_in, const uint32_t *target ); - -#endif // AVX512 - 
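Everything this header declared survives its deletion: the SHA-256 interface moves into sha256-hash.h and the SHA-512 interface below moves into the new sha512-hash.h. All of the N-way functions operate on vertically interleaved lanes: 32-bit element k of vector word i is word i of message k. A minimal usage sketch of the 4-way variant under that layout; interleave_4x32 and sha256_4way_hash4 are illustrative helpers, not part of the source:

#include <stdint.h>
#include <string.h>
#include <emmintrin.h>
#include "algo/sha/sha256-hash.h"   // home of sha256_4way_full after this patch

// Pack four independent messages into 4-way form: vector word i holds
// word i of messages 0..3, one message per 32-bit lane.
static void interleave_4x32( __m128i *dst, const uint32_t *m0,
                             const uint32_t *m1, const uint32_t *m2,
                             const uint32_t *m3, int nwords )
{
   for ( int i = 0; i < nwords; i++ )
      dst[i] = _mm_set_epi32( m3[i], m2[i], m1[i], m0[i] );
}

// Hash four 64-byte messages in one pass. The digests come back in the
// same interleaved layout and are scattered to per-message buffers.
static void sha256_4way_hash4( uint32_t hash[4][8], const uint32_t msg[4][16] )
{
   __m128i vmsg[16] __attribute__ ((aligned (64)));
   __m128i vhash[8] __attribute__ ((aligned (64)));
   uint32_t lane[4];

   interleave_4x32( vmsg, msg[0], msg[1], msg[2], msg[3], 16 );
   sha256_4way_full( vhash, vmsg, 64 );

   for ( int i = 0; i < 8; i++ )
   {
      memcpy( lane, &vhash[i], sizeof(lane) );
      hash[0][i] = lane[0];
      hash[1][i] = lane[1];
      hash[2][i] = lane[2];
      hash[3][i] = lane[3];
   }
}

The same layout scales to the 8-way (__m256i) and 16-way (__m512i) variants; only the lane count and the required ISA change.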
-#if defined (__AVX2__) - -// SHA-512 4 way - -typedef struct { - __m256i buf[128>>3]; - __m256i val[8]; - uint64_t count; - bool initialized; -} sha512_4way_context __attribute__ ((aligned (128))); - -void sha512_4way_init( sha512_4way_context *sc); -void sha512_4way_update( sha512_4way_context *sc, const void *data, - size_t len ); -void sha512_4way_close( sha512_4way_context *sc, void *dst ); -void sha512_4way_full( void *dst, const void *data, size_t len ); - -#endif // AVX2 - -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) - -// SHA-512 8 way - -typedef struct { - __m512i buf[128>>3]; - __m512i val[8]; - uint64_t count; - bool initialized; -} sha512_8way_context __attribute__ ((aligned (128))); - -void sha512_8way_init( sha512_8way_context *sc); -void sha512_8way_update( sha512_8way_context *sc, const void *data, - size_t len ); -void sha512_8way_close( sha512_8way_context *sc, void *dst ); -void sha512_8way_full( void *dst, const void *data, size_t len ); - -#endif // AVX512 - -#endif // SHA256_4WAY_H__ diff --git a/algo/sha/sha256-hash-2way-ni.c b/algo/sha/sha256-hash-2way-ni.c deleted file mode 100644 index 7fc64ca3..00000000 --- a/algo/sha/sha256-hash-2way-ni.c +++ /dev/null @@ -1,689 +0,0 @@ -/* Intel SHA extensions using C intrinsics */ -/* Written and place in public domain by Jeffrey Walton */ -/* Based on code from Intel, and by Sean Gulley for */ -/* the miTLS project. */ - -// A stripped down version with byte swapping removed. - -#if defined(__SHA__) - -#include "sha256-hash.h" - -void sha256_ni2way_transform_le( uint32_t *out_X, uint32_t*out_Y, - const void *msg_X, const void *msg_Y, - const uint32_t *in_X, const uint32_t *in_Y ) -{ - __m128i STATE0_X, STATE1_X, STATE0_Y, STATE1_Y; - __m128i MSG_X, MSG_Y, TMP_X, TMP_Y; - __m128i TMSG0_X, TMSG1_X, TMSG2_X, TMSG3_X; - __m128i TMSG0_Y, TMSG1_Y, TMSG2_Y, TMSG3_Y; - __m128i ABEF_SAVE_X, CDGH_SAVE_X,ABEF_SAVE_Y, CDGH_SAVE_Y; - - // Load initial values - TMP_X = _mm_load_si128((__m128i*) &in_X[0]); - STATE1_X = _mm_load_si128((__m128i*) &in_X[4]); - TMP_Y = _mm_load_si128((__m128i*) &in_Y[0]); - STATE1_Y = _mm_load_si128((__m128i*) &in_Y[4]); - - TMP_X = _mm_shuffle_epi32(TMP_X, 0xB1); // CDAB - TMP_Y = _mm_shuffle_epi32(TMP_Y, 0xB1); // CDAB - STATE1_X = _mm_shuffle_epi32(STATE1_X, 0x1B); // EFGH - STATE1_Y = _mm_shuffle_epi32(STATE1_Y, 0x1B); // EFGH - STATE0_X = _mm_alignr_epi8(TMP_X, STATE1_X, 8); // ABEF - STATE0_Y = _mm_alignr_epi8(TMP_Y, STATE1_Y, 8); // ABEF - STATE1_X = _mm_blend_epi16(STATE1_X, TMP_X, 0xF0); // CDGH - STATE1_Y = _mm_blend_epi16(STATE1_Y, TMP_Y, 0xF0); // CDGH - - // Save current hash - ABEF_SAVE_X = STATE0_X; - ABEF_SAVE_Y = STATE0_Y; - CDGH_SAVE_X = STATE1_X; - CDGH_SAVE_Y = STATE1_Y; - - // Rounds 0-3 - TMSG0_X = _mm_load_si128((const __m128i*) (msg_X)); - TMSG0_Y = _mm_load_si128((const __m128i*) (msg_Y)); - TMP_X = _mm_set_epi64x(0xE9B5DBA5B5C0FBCFULL, 0x71374491428A2F98ULL); - MSG_X = _mm_add_epi32( TMSG0_X, TMP_X ); - MSG_Y = _mm_add_epi32( TMSG0_Y, TMP_X ); - STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); - STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); - MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); - MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); - STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); - STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); - - // Rounds 4-7 - TMSG1_X = _mm_load_si128((const __m128i*) (msg_X+16)); - TMSG1_Y = _mm_load_si128((const __m128i*) (msg_Y+16)); - TMP_X = 
_mm_set_epi64x(0xAB1C5ED5923F82A4ULL, 0x59F111F13956C25BULL); - MSG_X = _mm_add_epi32(TMSG1_X, TMP_X ); - MSG_Y = _mm_add_epi32(TMSG1_Y, TMP_X ); - STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); - STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); - MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); - MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); - STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); - STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); - TMSG0_X = _mm_sha256msg1_epu32(TMSG0_X, TMSG1_X); - TMSG0_Y = _mm_sha256msg1_epu32(TMSG0_Y, TMSG1_Y); - - // Rounds 8-11 - TMSG2_X = _mm_load_si128((const __m128i*) (msg_X+32)); - TMSG2_Y = _mm_load_si128((const __m128i*) (msg_Y+32)); - TMP_X = _mm_set_epi64x(0x550C7DC3243185BEULL, 0x12835B01D807AA98ULL); - MSG_X = _mm_add_epi32(TMSG2_X, TMP_X ); - MSG_Y = _mm_add_epi32(TMSG2_Y, TMP_X ); - STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); - STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); - MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); - MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); - STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); - STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); - TMSG1_X = _mm_sha256msg1_epu32(TMSG1_X, TMSG2_X); - TMSG1_Y = _mm_sha256msg1_epu32(TMSG1_Y, TMSG2_Y); - - // Rounds 12-15 - TMSG3_X = _mm_load_si128((const __m128i*) (msg_X+48)); - TMSG3_Y = _mm_load_si128((const __m128i*) (msg_Y+48)); - TMP_X = _mm_set_epi64x(0xC19BF1749BDC06A7ULL, 0x80DEB1FE72BE5D74ULL); - MSG_X = _mm_add_epi32(TMSG3_X, TMP_X ); - MSG_Y = _mm_add_epi32(TMSG3_Y, TMP_X ); - STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); - STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); - TMP_X = _mm_alignr_epi8(TMSG3_X, TMSG2_X, 4); - TMP_Y = _mm_alignr_epi8(TMSG3_Y, TMSG2_Y, 4); - TMSG0_X = _mm_add_epi32(TMSG0_X, TMP_X); - TMSG0_Y = _mm_add_epi32(TMSG0_Y, TMP_Y); - TMSG0_X = _mm_sha256msg2_epu32(TMSG0_X, TMSG3_X); - TMSG0_Y = _mm_sha256msg2_epu32(TMSG0_Y, TMSG3_Y); - MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); - MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); - STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); - STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); - TMSG2_X = _mm_sha256msg1_epu32(TMSG2_X, TMSG3_X); - TMSG2_Y = _mm_sha256msg1_epu32(TMSG2_Y, TMSG3_Y); - - // Rounds 16-19 - TMP_X = _mm_set_epi64x(0x240CA1CC0FC19DC6ULL, 0xEFBE4786E49B69C1ULL); - MSG_X = _mm_add_epi32(TMSG0_X, TMP_X ); - MSG_Y = _mm_add_epi32(TMSG0_Y, TMP_X ); - STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); - STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); - TMP_X = _mm_alignr_epi8(TMSG0_X, TMSG3_X, 4); - TMP_Y = _mm_alignr_epi8(TMSG0_Y, TMSG3_Y, 4); - TMSG1_X = _mm_add_epi32(TMSG1_X, TMP_X); - TMSG1_Y = _mm_add_epi32(TMSG1_Y, TMP_Y); - TMSG1_X = _mm_sha256msg2_epu32(TMSG1_X, TMSG0_X); - TMSG1_Y = _mm_sha256msg2_epu32(TMSG1_Y, TMSG0_Y); - MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); - MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); - STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); - STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); - TMSG3_X = _mm_sha256msg1_epu32(TMSG3_X, TMSG0_X); - TMSG3_Y = _mm_sha256msg1_epu32(TMSG3_Y, TMSG0_Y); - - // Rounds 20-23 - TMP_X = _mm_set_epi64x(0x76F988DA5CB0A9DCULL, 0x4A7484AA2DE92C6FULL); - MSG_X = _mm_add_epi32(TMSG1_X, TMP_X ); - MSG_Y = _mm_add_epi32(TMSG1_Y, TMP_X ); - STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); - STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); - TMP_X = 
_mm_alignr_epi8(TMSG1_X, TMSG0_X, 4); - TMP_Y = _mm_alignr_epi8(TMSG1_Y, TMSG0_Y, 4); - TMSG2_X = _mm_add_epi32(TMSG2_X, TMP_X); - TMSG2_Y = _mm_add_epi32(TMSG2_Y, TMP_Y); - TMSG2_X = _mm_sha256msg2_epu32(TMSG2_X, TMSG1_X); - TMSG2_Y = _mm_sha256msg2_epu32(TMSG2_Y, TMSG1_Y); - MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); - MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); - STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); - STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); - TMSG0_X = _mm_sha256msg1_epu32(TMSG0_X, TMSG1_X); - TMSG0_Y = _mm_sha256msg1_epu32(TMSG0_Y, TMSG1_Y); - - // Rounds 24-27 - TMP_X = _mm_set_epi64x(0xBF597FC7B00327C8ULL, 0xA831C66D983E5152ULL); - MSG_X = _mm_add_epi32(TMSG2_X, TMP_X ); - MSG_Y = _mm_add_epi32(TMSG2_Y, TMP_X ); - STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); - STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); - TMP_X = _mm_alignr_epi8(TMSG2_X, TMSG1_X, 4); - TMP_Y = _mm_alignr_epi8(TMSG2_Y, TMSG1_Y, 4); - TMSG3_X = _mm_add_epi32(TMSG3_X, TMP_X); - TMSG3_Y = _mm_add_epi32(TMSG3_Y, TMP_Y); - TMSG3_X = _mm_sha256msg2_epu32(TMSG3_X, TMSG2_X); - TMSG3_Y = _mm_sha256msg2_epu32(TMSG3_Y, TMSG2_Y); - MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); - MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); - STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); - STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); - TMSG1_X = _mm_sha256msg1_epu32(TMSG1_X, TMSG2_X); - TMSG1_Y = _mm_sha256msg1_epu32(TMSG1_Y, TMSG2_Y); - - // Rounds 28-31 - TMP_X = _mm_set_epi64x(0x1429296706CA6351ULL, 0xD5A79147C6E00BF3ULL); - MSG_X = _mm_add_epi32(TMSG3_X, TMP_X ); - MSG_Y = _mm_add_epi32(TMSG3_Y, TMP_X ); - STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); - STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); - TMP_X = _mm_alignr_epi8(TMSG3_X, TMSG2_X, 4); - TMP_Y = _mm_alignr_epi8(TMSG3_Y, TMSG2_Y, 4); - TMSG0_X = _mm_add_epi32(TMSG0_X, TMP_X); - TMSG0_Y = _mm_add_epi32(TMSG0_Y, TMP_Y); - TMSG0_X = _mm_sha256msg2_epu32(TMSG0_X, TMSG3_X); - TMSG0_Y = _mm_sha256msg2_epu32(TMSG0_Y, TMSG3_Y); - MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); - MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); - STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); - STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); - TMSG2_X = _mm_sha256msg1_epu32(TMSG2_X, TMSG3_X); - TMSG2_Y = _mm_sha256msg1_epu32(TMSG2_Y, TMSG3_Y); - - // Rounds 32-35 - TMP_X = _mm_set_epi64x(0x53380D134D2C6DFCULL, 0x2E1B213827B70A85ULL); - MSG_X = _mm_add_epi32(TMSG0_X, TMP_X ); - MSG_Y = _mm_add_epi32(TMSG0_Y, TMP_X ); - STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); - STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); - TMP_X = _mm_alignr_epi8(TMSG0_X, TMSG3_X, 4); - TMP_Y = _mm_alignr_epi8(TMSG0_Y, TMSG3_Y, 4); - TMSG1_X = _mm_add_epi32(TMSG1_X, TMP_X); - TMSG1_Y = _mm_add_epi32(TMSG1_Y, TMP_Y); - TMSG1_X = _mm_sha256msg2_epu32(TMSG1_X, TMSG0_X); - TMSG1_Y = _mm_sha256msg2_epu32(TMSG1_Y, TMSG0_Y); - MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); - MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); - STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); - STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); - TMSG3_X = _mm_sha256msg1_epu32(TMSG3_X, TMSG0_X); - TMSG3_Y = _mm_sha256msg1_epu32(TMSG3_Y, TMSG0_Y); - - // Rounds 36-39 - TMP_X = _mm_set_epi64x(0x92722C8581C2C92EULL, 0x766A0ABB650A7354ULL); - MSG_X = _mm_add_epi32(TMSG1_X, TMP_X); - MSG_Y = _mm_add_epi32(TMSG1_Y, TMP_X); - STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); - STATE1_Y = 
_mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); - TMP_X = _mm_alignr_epi8(TMSG1_X, TMSG0_X, 4); - TMP_Y = _mm_alignr_epi8(TMSG1_Y, TMSG0_Y, 4); - TMSG2_X = _mm_add_epi32(TMSG2_X, TMP_X); - TMSG2_Y = _mm_add_epi32(TMSG2_Y, TMP_Y); - TMSG2_X = _mm_sha256msg2_epu32(TMSG2_X, TMSG1_X); - TMSG2_Y = _mm_sha256msg2_epu32(TMSG2_Y, TMSG1_Y); - MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); - MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); - STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); - STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); - TMSG0_X = _mm_sha256msg1_epu32(TMSG0_X, TMSG1_X); - TMSG0_Y = _mm_sha256msg1_epu32(TMSG0_Y, TMSG1_Y); - - // Rounds 40-43 - TMP_X = _mm_set_epi64x(0xC76C51A3C24B8B70ULL, 0xA81A664BA2BFE8A1ULL); - MSG_X = _mm_add_epi32(TMSG2_X, TMP_X); - MSG_Y = _mm_add_epi32(TMSG2_Y, TMP_X); - STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); - STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); - TMP_X = _mm_alignr_epi8(TMSG2_X, TMSG1_X, 4); - TMP_Y = _mm_alignr_epi8(TMSG2_Y, TMSG1_Y, 4); - TMSG3_X = _mm_add_epi32(TMSG3_X, TMP_X); - TMSG3_Y = _mm_add_epi32(TMSG3_Y, TMP_Y); - TMSG3_X = _mm_sha256msg2_epu32(TMSG3_X, TMSG2_X); - TMSG3_Y = _mm_sha256msg2_epu32(TMSG3_Y, TMSG2_Y); - MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); - MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); - STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); - STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); - TMSG1_X = _mm_sha256msg1_epu32(TMSG1_X, TMSG2_X); - TMSG1_Y = _mm_sha256msg1_epu32(TMSG1_Y, TMSG2_Y); - - // Rounds 44-47 - TMP_X = _mm_set_epi64x(0x106AA070F40E3585ULL, 0xD6990624D192E819ULL); - MSG_X = _mm_add_epi32(TMSG3_X, TMP_X); - MSG_Y = _mm_add_epi32(TMSG3_Y, TMP_X); - STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); - STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); - TMP_X = _mm_alignr_epi8(TMSG3_X, TMSG2_X, 4); - TMP_Y = _mm_alignr_epi8(TMSG3_Y, TMSG2_Y, 4); - TMSG0_X = _mm_add_epi32(TMSG0_X, TMP_X); - TMSG0_Y = _mm_add_epi32(TMSG0_Y, TMP_Y); - TMSG0_X = _mm_sha256msg2_epu32(TMSG0_X, TMSG3_X); - TMSG0_Y = _mm_sha256msg2_epu32(TMSG0_Y, TMSG3_Y); - MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); - MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); - STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); - STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); - TMSG2_X = _mm_sha256msg1_epu32(TMSG2_X, TMSG3_X); - TMSG2_Y = _mm_sha256msg1_epu32(TMSG2_Y, TMSG3_Y); - - // Rounds 48-51 - TMP_X = _mm_set_epi64x(0x34B0BCB52748774CULL, 0x1E376C0819A4C116ULL); - MSG_X = _mm_add_epi32(TMSG0_X, TMP_X ); - MSG_Y = _mm_add_epi32(TMSG0_Y, TMP_X ); - STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); - STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); - TMP_X = _mm_alignr_epi8(TMSG0_X, TMSG3_X, 4); - TMP_Y = _mm_alignr_epi8(TMSG0_Y, TMSG3_Y, 4); - TMSG1_X = _mm_add_epi32(TMSG1_X, TMP_X); - TMSG1_Y = _mm_add_epi32(TMSG1_Y, TMP_Y); - TMSG1_X = _mm_sha256msg2_epu32(TMSG1_X, TMSG0_X); - TMSG1_Y = _mm_sha256msg2_epu32(TMSG1_Y, TMSG0_Y); - MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); - MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); - STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); - STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); - TMSG3_X = _mm_sha256msg1_epu32(TMSG3_X, TMSG0_X); - TMSG3_Y = _mm_sha256msg1_epu32(TMSG3_Y, TMSG0_Y); - - // Rounds 52-55 - TMP_X = _mm_set_epi64x(0x682E6FF35B9CCA4FULL, 0x4ED8AA4A391C0CB3ULL); - MSG_X = _mm_add_epi32(TMSG1_X, TMP_X ); - MSG_Y = _mm_add_epi32(TMSG1_Y, TMP_X ); - STATE1_X = 
_mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); - STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); - TMP_X = _mm_alignr_epi8(TMSG1_X, TMSG0_X, 4); - TMP_Y = _mm_alignr_epi8(TMSG1_Y, TMSG0_Y, 4); - TMSG2_X = _mm_add_epi32(TMSG2_X, TMP_X); - TMSG2_Y = _mm_add_epi32(TMSG2_Y, TMP_Y); - TMSG2_X = _mm_sha256msg2_epu32(TMSG2_X, TMSG1_X); - TMSG2_Y = _mm_sha256msg2_epu32(TMSG2_Y, TMSG1_Y); - MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); - MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); - STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); - STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); - - // Rounds 56-59 - TMP_X = _mm_set_epi64x(0x8CC7020884C87814ULL, 0x78A5636F748F82EEULL); - MSG_X = _mm_add_epi32(TMSG2_X, TMP_X); - MSG_Y = _mm_add_epi32(TMSG2_Y, TMP_X); - STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); - STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); - TMP_X = _mm_alignr_epi8(TMSG2_X, TMSG1_X, 4); - TMP_Y = _mm_alignr_epi8(TMSG2_Y, TMSG1_Y, 4); - TMSG3_X = _mm_add_epi32(TMSG3_X, TMP_X); - TMSG3_Y = _mm_add_epi32(TMSG3_Y, TMP_Y); - TMSG3_X = _mm_sha256msg2_epu32(TMSG3_X, TMSG2_X); - TMSG3_Y = _mm_sha256msg2_epu32(TMSG3_Y, TMSG2_Y); - MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); - MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); - STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); - STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); - - // Rounds 60-63 - TMP_X = _mm_set_epi64x(0xC67178F2BEF9A3F7ULL, 0xA4506CEB90BEFFFAULL); - MSG_X = _mm_add_epi32(TMSG3_X, TMP_X); - MSG_Y = _mm_add_epi32(TMSG3_Y, TMP_X); - STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); - STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); - MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); - MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); - STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); - STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); - - // Add values back to state - STATE0_X = _mm_add_epi32(STATE0_X, ABEF_SAVE_X); - STATE1_X = _mm_add_epi32(STATE1_X, CDGH_SAVE_X); - STATE0_Y = _mm_add_epi32(STATE0_Y, ABEF_SAVE_Y); - STATE1_Y = _mm_add_epi32(STATE1_Y, CDGH_SAVE_Y); - - TMP_X = _mm_shuffle_epi32(STATE0_X, 0x1B); // FEBA - TMP_Y = _mm_shuffle_epi32(STATE0_Y, 0x1B); // FEBA - STATE1_X = _mm_shuffle_epi32(STATE1_X, 0xB1); // DCHG - STATE1_Y = _mm_shuffle_epi32(STATE1_Y, 0xB1); // DCHG - STATE0_X = _mm_blend_epi16(TMP_X, STATE1_X, 0xF0); // DCBA - STATE0_Y = _mm_blend_epi16(TMP_Y, STATE1_Y, 0xF0); // DCBA - STATE1_X = _mm_alignr_epi8(STATE1_X, TMP_X, 8); // ABEF - STATE1_Y = _mm_alignr_epi8(STATE1_Y, TMP_Y, 8); // ABEF - - // Save state - _mm_store_si128((__m128i*) &out_X[0], STATE0_X); - _mm_store_si128((__m128i*) &out_X[4], STATE1_X); - _mm_store_si128((__m128i*) &out_Y[0], STATE0_Y); - _mm_store_si128((__m128i*) &out_Y[4], STATE1_Y); -} - -void sha256_ni2way_transform_be( uint32_t *out_X, uint32_t*out_Y, - const void *msg_X, const void *msg_Y, - const uint32_t *in_X, const uint32_t *in_Y ) -{ - __m128i STATE0_X, STATE1_X, STATE0_Y, STATE1_Y; - __m128i MSG_X, MSG_Y, TMP_X, TMP_Y, MASK; - __m128i TMSG0_X, TMSG1_X, TMSG2_X, TMSG3_X; - __m128i TMSG0_Y, TMSG1_Y, TMSG2_Y, TMSG3_Y; - __m128i ABEF_SAVE_X, CDGH_SAVE_X, ABEF_SAVE_Y, CDGH_SAVE_Y; - - // Load initial values - TMP_X = _mm_load_si128((__m128i*) &in_X[0]); - STATE1_X = _mm_load_si128((__m128i*) &in_X[4]); - TMP_Y = _mm_load_si128((__m128i*) &in_Y[0]); - STATE1_Y = _mm_load_si128((__m128i*) &in_Y[4]); - MASK = _mm_set_epi64x(0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL); - - TMP_X = 
_mm_shuffle_epi32(TMP_X, 0xB1); // CDAB - TMP_Y = _mm_shuffle_epi32(TMP_Y, 0xB1); // CDAB - STATE1_X = _mm_shuffle_epi32(STATE1_X, 0x1B); // EFGH - STATE1_Y = _mm_shuffle_epi32(STATE1_Y, 0x1B); // EFGH - STATE0_X = _mm_alignr_epi8(TMP_X, STATE1_X, 8); // ABEF - STATE0_Y = _mm_alignr_epi8(TMP_Y, STATE1_Y, 8); // ABEF - STATE1_X = _mm_blend_epi16(STATE1_X, TMP_X, 0xF0); // CDGH - STATE1_Y = _mm_blend_epi16(STATE1_Y, TMP_Y, 0xF0); // CDGH - - // Save current hash - ABEF_SAVE_X = STATE0_X; - ABEF_SAVE_Y = STATE0_Y; - CDGH_SAVE_X = STATE1_X; - CDGH_SAVE_Y = STATE1_Y; - - // Rounds 0-3 - TMSG0_X = _mm_load_si128((const __m128i*) (msg_X)); - TMSG0_Y = _mm_load_si128((const __m128i*) (msg_Y)); - TMP_X = _mm_set_epi64x(0xE9B5DBA5B5C0FBCFULL, 0x71374491428A2F98ULL); - TMSG0_X = _mm_shuffle_epi8( TMSG0_X, MASK ); - TMSG0_Y = _mm_shuffle_epi8( TMSG0_Y, MASK ); - MSG_X = _mm_add_epi32( TMSG0_X, TMP_X ); - MSG_Y = _mm_add_epi32( TMSG0_Y, TMP_X ); - STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); - STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); - MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); - MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); - STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); - STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); - - // Rounds 4-7 - TMSG1_X = _mm_load_si128((const __m128i*) (msg_X+16)); - TMSG1_Y = _mm_load_si128((const __m128i*) (msg_Y+16)); - TMP_X = _mm_set_epi64x(0xAB1C5ED5923F82A4ULL, 0x59F111F13956C25BULL); - TMSG1_X = _mm_shuffle_epi8( TMSG1_X, MASK ); - TMSG1_Y = _mm_shuffle_epi8( TMSG1_Y, MASK ); - MSG_X = _mm_add_epi32(TMSG1_X, TMP_X ); - MSG_Y = _mm_add_epi32(TMSG1_Y, TMP_X ); - STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); - STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); - MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); - MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); - STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); - STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); - TMSG0_X = _mm_sha256msg1_epu32(TMSG0_X, TMSG1_X); - TMSG0_Y = _mm_sha256msg1_epu32(TMSG0_Y, TMSG1_Y); - - // Rounds 8-11 - TMSG2_X = _mm_load_si128((const __m128i*) (msg_X+32)); - TMSG2_Y = _mm_load_si128((const __m128i*) (msg_Y+32)); - TMP_X = _mm_set_epi64x(0x550C7DC3243185BEULL, 0x12835B01D807AA98ULL); - TMSG2_X = _mm_shuffle_epi8( TMSG2_X, MASK ); - TMSG2_Y = _mm_shuffle_epi8( TMSG2_Y, MASK ); - MSG_X = _mm_add_epi32(TMSG2_X, TMP_X ); - MSG_Y = _mm_add_epi32(TMSG2_Y, TMP_X ); - STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); - STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); - MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); - MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); - STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); - STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); - TMSG1_X = _mm_sha256msg1_epu32(TMSG1_X, TMSG2_X); - TMSG1_Y = _mm_sha256msg1_epu32(TMSG1_Y, TMSG2_Y); - - // Rounds 12-15 - TMSG3_X = _mm_load_si128((const __m128i*) (msg_X+48)); - TMSG3_Y = _mm_load_si128((const __m128i*) (msg_Y+48)); - TMP_X = _mm_set_epi64x(0xC19BF1749BDC06A7ULL, 0x80DEB1FE72BE5D74ULL); - TMSG3_X = _mm_shuffle_epi8( TMSG3_X, MASK ); - TMSG3_Y = _mm_shuffle_epi8( TMSG3_Y, MASK ); - MSG_X = _mm_add_epi32(TMSG3_X, TMP_X ); - MSG_Y = _mm_add_epi32(TMSG3_Y, TMP_X ); - STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); - STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); - TMP_X = _mm_alignr_epi8(TMSG3_X, TMSG2_X, 4); - TMP_Y = _mm_alignr_epi8(TMSG3_Y, TMSG2_Y, 4); - TMSG0_X = 
_mm_add_epi32(TMSG0_X, TMP_X); - TMSG0_Y = _mm_add_epi32(TMSG0_Y, TMP_Y); - TMSG0_X = _mm_sha256msg2_epu32(TMSG0_X, TMSG3_X); - TMSG0_Y = _mm_sha256msg2_epu32(TMSG0_Y, TMSG3_Y); - MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); - MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); - STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); - STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); - TMSG2_X = _mm_sha256msg1_epu32(TMSG2_X, TMSG3_X); - TMSG2_Y = _mm_sha256msg1_epu32(TMSG2_Y, TMSG3_Y); - - // Rounds 16-19 - TMP_X = _mm_set_epi64x(0x240CA1CC0FC19DC6ULL, 0xEFBE4786E49B69C1ULL); - MSG_X = _mm_add_epi32(TMSG0_X, TMP_X ); - MSG_Y = _mm_add_epi32(TMSG0_Y, TMP_X ); - STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); - STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); - TMP_X = _mm_alignr_epi8(TMSG0_X, TMSG3_X, 4); - TMP_Y = _mm_alignr_epi8(TMSG0_Y, TMSG3_Y, 4); - TMSG1_X = _mm_add_epi32(TMSG1_X, TMP_X); - TMSG1_Y = _mm_add_epi32(TMSG1_Y, TMP_Y); - TMSG1_X = _mm_sha256msg2_epu32(TMSG1_X, TMSG0_X); - TMSG1_Y = _mm_sha256msg2_epu32(TMSG1_Y, TMSG0_Y); - MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); - MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); - STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); - STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); - TMSG3_X = _mm_sha256msg1_epu32(TMSG3_X, TMSG0_X); - TMSG3_Y = _mm_sha256msg1_epu32(TMSG3_Y, TMSG0_Y); - - // Rounds 20-23 - TMP_X = _mm_set_epi64x(0x76F988DA5CB0A9DCULL, 0x4A7484AA2DE92C6FULL); - MSG_X = _mm_add_epi32(TMSG1_X, TMP_X ); - MSG_Y = _mm_add_epi32(TMSG1_Y, TMP_X ); - STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); - STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); - TMP_X = _mm_alignr_epi8(TMSG1_X, TMSG0_X, 4); - TMP_Y = _mm_alignr_epi8(TMSG1_Y, TMSG0_Y, 4); - TMSG2_X = _mm_add_epi32(TMSG2_X, TMP_X); - TMSG2_Y = _mm_add_epi32(TMSG2_Y, TMP_Y); - TMSG2_X = _mm_sha256msg2_epu32(TMSG2_X, TMSG1_X); - TMSG2_Y = _mm_sha256msg2_epu32(TMSG2_Y, TMSG1_Y); - MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); - MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); - STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); - STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); - TMSG0_X = _mm_sha256msg1_epu32(TMSG0_X, TMSG1_X); - TMSG0_Y = _mm_sha256msg1_epu32(TMSG0_Y, TMSG1_Y); - - // Rounds 24-27 - TMP_X = _mm_set_epi64x(0xBF597FC7B00327C8ULL, 0xA831C66D983E5152ULL); - MSG_X = _mm_add_epi32(TMSG2_X, TMP_X ); - MSG_Y = _mm_add_epi32(TMSG2_Y, TMP_X ); - STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); - STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); - TMP_X = _mm_alignr_epi8(TMSG2_X, TMSG1_X, 4); - TMP_Y = _mm_alignr_epi8(TMSG2_Y, TMSG1_Y, 4); - TMSG3_X = _mm_add_epi32(TMSG3_X, TMP_X); - TMSG3_Y = _mm_add_epi32(TMSG3_Y, TMP_Y); - TMSG3_X = _mm_sha256msg2_epu32(TMSG3_X, TMSG2_X); - TMSG3_Y = _mm_sha256msg2_epu32(TMSG3_Y, TMSG2_Y); - MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); - MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); - STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); - STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); - TMSG1_X = _mm_sha256msg1_epu32(TMSG1_X, TMSG2_X); - TMSG1_Y = _mm_sha256msg1_epu32(TMSG1_Y, TMSG2_Y); - - // Rounds 28-31 - TMP_X = _mm_set_epi64x(0x1429296706CA6351ULL, 0xD5A79147C6E00BF3ULL); - MSG_X = _mm_add_epi32(TMSG3_X, TMP_X ); - MSG_Y = _mm_add_epi32(TMSG3_Y, TMP_X ); - STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); - STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); - TMP_X = _mm_alignr_epi8(TMSG3_X, TMSG2_X, 4); - 
TMP_Y = _mm_alignr_epi8(TMSG3_Y, TMSG2_Y, 4); - TMSG0_X = _mm_add_epi32(TMSG0_X, TMP_X); - TMSG0_Y = _mm_add_epi32(TMSG0_Y, TMP_Y); - TMSG0_X = _mm_sha256msg2_epu32(TMSG0_X, TMSG3_X); - TMSG0_Y = _mm_sha256msg2_epu32(TMSG0_Y, TMSG3_Y); - MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); - MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); - STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); - STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); - TMSG2_X = _mm_sha256msg1_epu32(TMSG2_X, TMSG3_X); - TMSG2_Y = _mm_sha256msg1_epu32(TMSG2_Y, TMSG3_Y); - - // Rounds 32-35 - TMP_X = _mm_set_epi64x(0x53380D134D2C6DFCULL, 0x2E1B213827B70A85ULL); - MSG_X = _mm_add_epi32(TMSG0_X, TMP_X ); - MSG_Y = _mm_add_epi32(TMSG0_Y, TMP_X ); - STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); - STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); - TMP_X = _mm_alignr_epi8(TMSG0_X, TMSG3_X, 4); - TMP_Y = _mm_alignr_epi8(TMSG0_Y, TMSG3_Y, 4); - TMSG1_X = _mm_add_epi32(TMSG1_X, TMP_X); - TMSG1_Y = _mm_add_epi32(TMSG1_Y, TMP_Y); - TMSG1_X = _mm_sha256msg2_epu32(TMSG1_X, TMSG0_X); - TMSG1_Y = _mm_sha256msg2_epu32(TMSG1_Y, TMSG0_Y); - MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); - MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); - STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); - STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); - TMSG3_X = _mm_sha256msg1_epu32(TMSG3_X, TMSG0_X); - TMSG3_Y = _mm_sha256msg1_epu32(TMSG3_Y, TMSG0_Y); - - // Rounds 36-39 - TMP_X = _mm_set_epi64x(0x92722C8581C2C92EULL, 0x766A0ABB650A7354ULL); - MSG_X = _mm_add_epi32(TMSG1_X, TMP_X); - MSG_Y = _mm_add_epi32(TMSG1_Y, TMP_X); - STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); - STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); - TMP_X = _mm_alignr_epi8(TMSG1_X, TMSG0_X, 4); - TMP_Y = _mm_alignr_epi8(TMSG1_Y, TMSG0_Y, 4); - TMSG2_X = _mm_add_epi32(TMSG2_X, TMP_X); - TMSG2_Y = _mm_add_epi32(TMSG2_Y, TMP_Y); - TMSG2_X = _mm_sha256msg2_epu32(TMSG2_X, TMSG1_X); - TMSG2_Y = _mm_sha256msg2_epu32(TMSG2_Y, TMSG1_Y); - MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); - MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); - STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); - STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); - TMSG0_X = _mm_sha256msg1_epu32(TMSG0_X, TMSG1_X); - TMSG0_Y = _mm_sha256msg1_epu32(TMSG0_Y, TMSG1_Y); - - // Rounds 40-43 - TMP_X = _mm_set_epi64x(0xC76C51A3C24B8B70ULL, 0xA81A664BA2BFE8A1ULL); - MSG_X = _mm_add_epi32(TMSG2_X, TMP_X); - MSG_Y = _mm_add_epi32(TMSG2_Y, TMP_X); - STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); - STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); - TMP_X = _mm_alignr_epi8(TMSG2_X, TMSG1_X, 4); - TMP_Y = _mm_alignr_epi8(TMSG2_Y, TMSG1_Y, 4); - TMSG3_X = _mm_add_epi32(TMSG3_X, TMP_X); - TMSG3_Y = _mm_add_epi32(TMSG3_Y, TMP_Y); - TMSG3_X = _mm_sha256msg2_epu32(TMSG3_X, TMSG2_X); - TMSG3_Y = _mm_sha256msg2_epu32(TMSG3_Y, TMSG2_Y); - MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); - MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); - STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); - STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); - TMSG1_X = _mm_sha256msg1_epu32(TMSG1_X, TMSG2_X); - TMSG1_Y = _mm_sha256msg1_epu32(TMSG1_Y, TMSG2_Y); - - // Rounds 44-47 - TMP_X = _mm_set_epi64x(0x106AA070F40E3585ULL, 0xD6990624D192E819ULL); - MSG_X = _mm_add_epi32(TMSG3_X, TMP_X); - MSG_Y = _mm_add_epi32(TMSG3_Y, TMP_X); - STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); - STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); - 
TMP_X = _mm_alignr_epi8(TMSG3_X, TMSG2_X, 4); - TMP_Y = _mm_alignr_epi8(TMSG3_Y, TMSG2_Y, 4); - TMSG0_X = _mm_add_epi32(TMSG0_X, TMP_X); - TMSG0_Y = _mm_add_epi32(TMSG0_Y, TMP_Y); - TMSG0_X = _mm_sha256msg2_epu32(TMSG0_X, TMSG3_X); - TMSG0_Y = _mm_sha256msg2_epu32(TMSG0_Y, TMSG3_Y); - MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); - MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); - STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); - STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); - TMSG2_X = _mm_sha256msg1_epu32(TMSG2_X, TMSG3_X); - TMSG2_Y = _mm_sha256msg1_epu32(TMSG2_Y, TMSG3_Y); - - // Rounds 48-51 - TMP_X = _mm_set_epi64x(0x34B0BCB52748774CULL, 0x1E376C0819A4C116ULL); - MSG_X = _mm_add_epi32(TMSG0_X, TMP_X ); - MSG_Y = _mm_add_epi32(TMSG0_Y, TMP_X ); - STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); - STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); - TMP_X = _mm_alignr_epi8(TMSG0_X, TMSG3_X, 4); - TMP_Y = _mm_alignr_epi8(TMSG0_Y, TMSG3_Y, 4); - TMSG1_X = _mm_add_epi32(TMSG1_X, TMP_X); - TMSG1_Y = _mm_add_epi32(TMSG1_Y, TMP_Y); - TMSG1_X = _mm_sha256msg2_epu32(TMSG1_X, TMSG0_X); - TMSG1_Y = _mm_sha256msg2_epu32(TMSG1_Y, TMSG0_Y); - MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); - MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); - STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); - STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); - TMSG3_X = _mm_sha256msg1_epu32(TMSG3_X, TMSG0_X); - TMSG3_Y = _mm_sha256msg1_epu32(TMSG3_Y, TMSG0_Y); - - // Rounds 52-55 - TMP_X = _mm_set_epi64x(0x682E6FF35B9CCA4FULL, 0x4ED8AA4A391C0CB3ULL); - MSG_X = _mm_add_epi32(TMSG1_X, TMP_X ); - MSG_Y = _mm_add_epi32(TMSG1_Y, TMP_X ); - STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); - STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); - TMP_X = _mm_alignr_epi8(TMSG1_X, TMSG0_X, 4); - TMP_Y = _mm_alignr_epi8(TMSG1_Y, TMSG0_Y, 4); - TMSG2_X = _mm_add_epi32(TMSG2_X, TMP_X); - TMSG2_Y = _mm_add_epi32(TMSG2_Y, TMP_Y); - TMSG2_X = _mm_sha256msg2_epu32(TMSG2_X, TMSG1_X); - TMSG2_Y = _mm_sha256msg2_epu32(TMSG2_Y, TMSG1_Y); - MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); - MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); - STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); - STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); - - // Rounds 56-59 - TMP_X = _mm_set_epi64x(0x8CC7020884C87814ULL, 0x78A5636F748F82EEULL); - MSG_X = _mm_add_epi32(TMSG2_X, TMP_X); - MSG_Y = _mm_add_epi32(TMSG2_Y, TMP_X); - STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); - STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); - TMP_X = _mm_alignr_epi8(TMSG2_X, TMSG1_X, 4); - TMP_Y = _mm_alignr_epi8(TMSG2_Y, TMSG1_Y, 4); - TMSG3_X = _mm_add_epi32(TMSG3_X, TMP_X); - TMSG3_Y = _mm_add_epi32(TMSG3_Y, TMP_Y); - TMSG3_X = _mm_sha256msg2_epu32(TMSG3_X, TMSG2_X); - TMSG3_Y = _mm_sha256msg2_epu32(TMSG3_Y, TMSG2_Y); - MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); - MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); - STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); - STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); - - // Rounds 60-63 - TMP_X = _mm_set_epi64x(0xC67178F2BEF9A3F7ULL, 0xA4506CEB90BEFFFAULL); - MSG_X = _mm_add_epi32(TMSG3_X, TMP_X); - MSG_Y = _mm_add_epi32(TMSG3_Y, TMP_X); - STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); - STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); - MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); - MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); - STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); - STATE0_Y = 
_mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); - - // Add values back to state - STATE0_X = _mm_add_epi32(STATE0_X, ABEF_SAVE_X); - STATE1_X = _mm_add_epi32(STATE1_X, CDGH_SAVE_X); - STATE0_Y = _mm_add_epi32(STATE0_Y, ABEF_SAVE_Y); - STATE1_Y = _mm_add_epi32(STATE1_Y, CDGH_SAVE_Y); - - TMP_X = _mm_shuffle_epi32(STATE0_X, 0x1B); // FEBA - TMP_Y = _mm_shuffle_epi32(STATE0_Y, 0x1B); // FEBA - STATE1_X = _mm_shuffle_epi32(STATE1_X, 0xB1); // DCHG - STATE1_Y = _mm_shuffle_epi32(STATE1_Y, 0xB1); // DCHG - STATE0_X = _mm_blend_epi16(TMP_X, STATE1_X, 0xF0); // DCBA - STATE0_Y = _mm_blend_epi16(TMP_Y, STATE1_Y, 0xF0); // DCBA - STATE1_X = _mm_alignr_epi8(STATE1_X, TMP_X, 8); // ABEF - STATE1_Y = _mm_alignr_epi8(STATE1_Y, TMP_Y, 8); // ABEF - - // Save state - _mm_store_si128((__m128i*) &out_X[0], STATE0_X); - _mm_store_si128((__m128i*) &out_X[4], STATE1_X); - _mm_store_si128((__m128i*) &out_Y[0], STATE0_Y); - _mm_store_si128((__m128i*) &out_Y[4], STATE1_Y); -} - - -#endif - diff --git a/algo/sha/sha256-hash-4way.c b/algo/sha/sha256-hash-4way.c index 549b2b7c..1f2d4e9d 100644 --- a/algo/sha/sha256-hash-4way.c +++ b/algo/sha/sha256-hash-4way.c @@ -3,7 +3,7 @@ #include #include -#include "sha-hash-4way.h" +#include "sha256-hash.h" #include "compat.h" /* @@ -610,6 +610,16 @@ do { \ SHA256_8WAY_ROUND( C, D, E, F, G, H, A, B, 14, j ); \ SHA256_8WAY_ROUND( B, C, D, E, F, G, H, A, 15, j ); +// Not used with AVX512, needed to satisfy the compiler +#define SHA256_8WAY_ROUND_NOMSG( A, B, C, D, E, F, G, H, i, j ) \ +{ \ + __m256i T1 = mm256_add4_32( H, BSG2_1x(E), CHx(E, F, G), \ + _mm256_set1_epi32( K256[(i)+(j)] ) ); \ + __m256i T2 = _mm256_add_epi32( BSG2_0x(A), MAJx(A, B, C) ); \ + D = _mm256_add_epi32( D, T1 ); \ + H = _mm256_add_epi32( T1, T2 ); \ +} + #else // AVX2 #define CHx(X, Y, Z) \ @@ -621,6 +631,16 @@ do { \ _mm256_xor_si256( Y, _mm256_and_si256( X_xor_Y = _mm256_xor_si256( X, Y ), \ Y_xor_Z ) ) +#define SHA256_8WAY_ROUND_NOMSG( A, B, C, D, E, F, G, H, i, j ) \ +{ \ + __m256i T1 = mm256_add4_32( H, BSG2_1x(E), CHx(E, F, G), \ + _mm256_set1_epi32( K256[(i)+(j)] ) ); \ + __m256i T2 = _mm256_add_epi32( BSG2_0x(A), MAJx(A, B, C) ); \ + Y_xor_Z = X_xor_Y; \ + D = _mm256_add_epi32( D, T1 ); \ + H = _mm256_add_epi32( T1, T2 ); \ +} + #define SHA256_8WAY_ROUND( A, B, C, D, E, F, G, H, i, j ) \ do { \ __m256i T0 = _mm256_add_epi32( _mm256_set1_epi32( K256[(j)+(i)] ), W[i] ); \ @@ -635,7 +655,6 @@ do { \ H = _mm256_add_epi32( T1, T2 ); \ } while (0) - // read Y_xor_Z, update X_xor_Y #define MAJ_2step(X, Y, Z, X_xor_Y, Y_xor_Z ) \ _mm256_xor_si256( Y, _mm256_and_si256( X_xor_Y = _mm256_xor_si256( X, Y ), \ @@ -769,7 +788,7 @@ void sha256_8way_prehash_3rounds( __m256i *state_mid, __m256i *X, SHA256_8WAY_ROUND( H, A, B, C, D, E, F, G, 1, 0 ); SHA256_8WAY_ROUND( G, H, A, B, C, D, E, F, 2, 0 ); - // round 3 part 1, ignore nonces W[3] + // round 3 part 1, avoid nonces W[3] T1 = mm256_add4_32( E, BSG2_1x(B), CHx(B, C, D), _mm256_set1_epi32( K256[3] ) ); A = _mm256_add_epi32( A, T1 ); @@ -807,23 +826,22 @@ void sha256_8way_final_rounds( __m256i *state_out, const __m256i *data, __m256i X_xor_Y, Y_xor_Z = _mm256_xor_si256( F, G ); #endif - // round 3 part 2, inject nonces + // round 3 part 2, add nonces A = _mm256_add_epi32( A, W[3] ); E = _mm256_add_epi32( E, W[3] ); -// SHA256_8WAY_ROUND( F, G, H, A, B, C, D, E, 3, 0 ); - SHA256_8WAY_ROUND( E, F, G, H, A, B, C, D, 4, 0 ); - SHA256_8WAY_ROUND( D, E, F, G, H, A, B, C, 5, 0 ); - SHA256_8WAY_ROUND( C, D, E, F, G, H, A, B, 6, 0 ); - SHA256_8WAY_ROUND( B, C, D, E, F, 
G, H, A, 7, 0 ); - SHA256_8WAY_ROUND( A, B, C, D, E, F, G, H, 8, 0 ); - SHA256_8WAY_ROUND( H, A, B, C, D, E, F, G, 9, 0 ); - SHA256_8WAY_ROUND( G, H, A, B, C, D, E, F, 10, 0 ); - SHA256_8WAY_ROUND( F, G, H, A, B, C, D, E, 11, 0 ); - SHA256_8WAY_ROUND( E, F, G, H, A, B, C, D, 12, 0 ); - SHA256_8WAY_ROUND( D, E, F, G, H, A, B, C, 13, 0 ); - SHA256_8WAY_ROUND( C, D, E, F, G, H, A, B, 14, 0 ); - SHA256_8WAY_ROUND( B, C, D, E, F, G, H, A, 15, 0 ); + SHA256_8WAY_ROUND( E, F, G, H, A, B, C, D, 4, 0 ); + SHA256_8WAY_ROUND_NOMSG( D, E, F, G, H, A, B, C, 5, 0 ); + SHA256_8WAY_ROUND_NOMSG( C, D, E, F, G, H, A, B, 6, 0 ); + SHA256_8WAY_ROUND_NOMSG( B, C, D, E, F, G, H, A, 7, 0 ); + SHA256_8WAY_ROUND_NOMSG( A, B, C, D, E, F, G, H, 8, 0 ); + SHA256_8WAY_ROUND_NOMSG( H, A, B, C, D, E, F, G, 9, 0 ); + SHA256_8WAY_ROUND_NOMSG( G, H, A, B, C, D, E, F, 10, 0 ); + SHA256_8WAY_ROUND_NOMSG( F, G, H, A, B, C, D, E, 11, 0 ); + SHA256_8WAY_ROUND_NOMSG( E, F, G, H, A, B, C, D, 12, 0 ); + SHA256_8WAY_ROUND_NOMSG( D, E, F, G, H, A, B, C, 13, 0 ); + SHA256_8WAY_ROUND_NOMSG( C, D, E, F, G, H, A, B, 14, 0 ); + SHA256_8WAY_ROUND( B, C, D, E, F, G, H, A, 15, 0 ); W[ 0] = X[ 0]; W[ 1] = X[ 1]; @@ -865,6 +883,7 @@ void sha256_8way_final_rounds( __m256i *state_out, const __m256i *data, W[15] = SHA256_8WAY_MEXP( W[13], W[ 8], W[ 0], W[15] ); SHA256_8WAY_16ROUNDS( A, B, C, D, E, F, G, H, 32 ); + SHA256_8WAY_MEXP_16ROUNDS( W ); SHA256_8WAY_16ROUNDS( A, B, C, D, E, F, G, H, 48 ); @@ -887,8 +906,6 @@ void sha256_8way_final_rounds( __m256i *state_out, const __m256i *data, _mm256_store_si256( state_out + 7, H ); } - -// It's working with a high hit rate but performance is lower int sha256_8way_transform_le_short( __m256i *state_out, const __m256i *data, const __m256i *state_in, const uint32_t *target ) { @@ -912,14 +929,37 @@ int sha256_8way_transform_le_short( __m256i *state_out, const __m256i *data, const __m256i IV7 = H; const __m256i IV6 = G; - SHA256_8WAY_16ROUNDS( A, B, C, D, E, F, G, H, 0 ); +#if !defined(__AVX512VL__) + __m256i X_xor_Y, Y_xor_Z = _mm256_xor_si256( B, C ); +#endif - for ( int j = 16; j < 48; j += 16 ) - { - SHA256_8WAY_MEXP_16ROUNDS( W ); - SHA256_8WAY_16ROUNDS( A, B, C, D, E, F, G, H, j ); - } + // rounds 0 to 15, ignore zero padding W[9..14] + SHA256_8WAY_ROUND( A, B, C, D, E, F, G, H, 0, 0 ); + SHA256_8WAY_ROUND( H, A, B, C, D, E, F, G, 1, 0 ); + SHA256_8WAY_ROUND( G, H, A, B, C, D, E, F, 2, 0 ); + SHA256_8WAY_ROUND( F, G, H, A, B, C, D, E, 3, 0 ); + SHA256_8WAY_ROUND( E, F, G, H, A, B, C, D, 4, 0 ); + SHA256_8WAY_ROUND( D, E, F, G, H, A, B, C, 5, 0 ); + SHA256_8WAY_ROUND( C, D, E, F, G, H, A, B, 6, 0 ); + SHA256_8WAY_ROUND( B, C, D, E, F, G, H, A, 7, 0 ); + SHA256_8WAY_ROUND( A, B, C, D, E, F, G, H, 8, 0 ); + SHA256_8WAY_ROUND_NOMSG( H, A, B, C, D, E, F, G, 9, 0 ); + SHA256_8WAY_ROUND_NOMSG( G, H, A, B, C, D, E, F, 10, 0 ); + SHA256_8WAY_ROUND_NOMSG( F, G, H, A, B, C, D, E, 11, 0 ); + SHA256_8WAY_ROUND_NOMSG( E, F, G, H, A, B, C, D, 12, 0 ); + SHA256_8WAY_ROUND_NOMSG( D, E, F, G, H, A, B, C, 13, 0 ); + SHA256_8WAY_ROUND_NOMSG( C, D, E, F, G, H, A, B, 14, 0 ); + SHA256_8WAY_ROUND( B, C, D, E, F, G, H, A, 15, 0 ); + + // rounds 16 to 31 + SHA256_8WAY_MEXP_16ROUNDS( W ); + SHA256_8WAY_16ROUNDS( A, B, C, D, E, F, G, H, 16 ); + // rounds 32 to 47 + SHA256_8WAY_MEXP_16ROUNDS( W ); + SHA256_8WAY_16ROUNDS( A, B, C, D, E, F, G, H, 32 ); + + // rounds 48 to 60 mexp W[ 0] = SHA256_8WAY_MEXP( W[14], W[ 9], W[ 1], W[ 0] ); W[ 1] = SHA256_8WAY_MEXP( W[15], W[10], W[ 2], W[ 1] ); W[ 2] = SHA256_8WAY_MEXP( W[ 0], W[11],
W[ 3], W[ 2] ); @@ -935,9 +975,10 @@ int sha256_8way_transform_le_short( __m256i *state_out, const __m256i *data, W[12] = SHA256_8WAY_MEXP( W[10], W[ 5], W[13], W[12] ); #if !defined(__AVX512VL__) - __m256i X_xor_Y, Y_xor_Z = _mm256_xor_si256( B, C ); + Y_xor_Z = _mm256_xor_si256( B, C ); #endif + // rounds 48 to 57 SHA256_8WAY_ROUND( A, B, C, D, E, F, G, H, 0, 48 ); SHA256_8WAY_ROUND( H, A, B, C, D, E, F, G, 1, 48 ); SHA256_8WAY_ROUND( G, H, A, B, C, D, E, F, 2, 48 ); @@ -968,7 +1009,7 @@ int sha256_8way_transform_le_short( __m256i *state_out, const __m256i *data, flip = ( (int)target[7] < 0 ? -1 : 0 ) ^ mm256_movmask_32( hash ); if ( likely( 0xff == ( flip ^ mm256_movmask_32( _mm256_cmpgt_epi32( hash, targ ) ) ) )) - return 0; + return 0; t6_mask = mm256_movmask_32( vmask =_mm256_cmpeq_epi32( hash, targ ) ); @@ -983,28 +1024,29 @@ int sha256_8way_transform_le_short( __m256i *state_out, const __m256i *data, if ( t6_mask ) { - // Testing H inconclusive: hash7 == target7, need to test G + // Testing H was inconclusive: hash7 == target7, need to test G targ = _mm256_and_si256( vmask, _mm256_set1_epi32( target[6] ) ); hash = _mm256_shuffle_epi8( _mm256_add_epi32( G, IV6 ), bswap_shuf ); - if ( unlikely( 0 != ( t6_mask & mm256_movmask_32( + if ( likely( 0 == ( t6_mask & mm256_movmask_32( _mm256_cmpeq_epi32( hash, targ ) ) ) )) - return 0; - else { flip = ( (int)target[6] < 0 ? -1 : 0 ) ^ mm256_movmask_32( hash ); if ( likely( 0 != ( t6_mask & ( flip ^ mm256_movmask_32( _mm256_cmpgt_epi32( hash, targ ) ) ) ) )) return 0; - else if ( likely( target[6] == 0x80000000 )) - { - if ( 0 == ( t6_mask & mm256_movmask_32( _mm256_cmpgt_epi32( - hash, _mm256_xor_si256( hash, hash ) ) ) ) ) - return 0; - } + if ( likely( ( target[6] == 0x80000000 ) + && ( 0 == ( t6_mask & mm256_movmask_32( _mm256_cmpgt_epi32( hash, _mm256_xor_si256( hash, hash ) ) ) ) ) )) + return 0; } +// else inconclusive, testing targ5 isn't practical, finish hashing } +// At this point the hash is either good or the test was inconclusive. +// If the latter, the target difficulty is probably high and the hash is +// nearly equal to it, so it still has a good chance of being good.
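The rewritten short transform trades a handful of redundant rounds for an early out: once word 7 of the would-be hash is fixed, its byte-swapped value is tested against target[7], and only ties fall through to word 6. AVX2 has no unsigned 32-bit compare, so the code derives one from _mm256_cmpgt_epi32: the flip mask folds in the sign bits of hash and target, correcting the lanes where the signed compare disagrees with the unsigned one. A scalar model of both pieces; lane_may_be_valid and unsigned_gt are illustrative names, not in the source:

#include <stdbool.h>
#include <stdint.h>

// A signed compare gives the unsigned answer when the sign bits agree;
// when they differ, the operand with the sign bit set is the larger
// unsigned value, so the signed result must be flipped.
static bool unsigned_gt( uint32_t a, uint32_t b )
{
   bool signed_gt = (int32_t)a > (int32_t)b;
   bool flip = ( a ^ b ) >> 31;        // do the sign bits differ?
   return signed_gt ^ flip;
}

// Per-lane model of the early-exit decision: reject when big-endian
// hash word 7 is above target[7]; on a tie, decide on word 6. A tie
// on word 6 as well is left inconclusive and hashing just finishes.
static bool lane_may_be_valid( uint32_t h7, uint32_t t7,
                               uint32_t h6, uint32_t t6 )
{
   if ( h7 != t7 ) return !unsigned_gt( h7, t7 );
   return !unsigned_gt( h6, t6 );
}

The vector version returns 0 only when every lane is rejected; a single inconclusive lane means the block is finished and the final hash is checked normally.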
+ // rounds 59 to 61 part 2 E = _mm256_add_epi32( T1, _mm256_add_epi32( BSG2_0x( F ), MAJx( F, G, H ) ) ); @@ -1179,6 +1221,15 @@ do { \ H = _mm512_add_epi32( T1, T2 ); \ } while (0) +#define SHA256_16WAY_ROUND_NOMSG( A, B, C, D, E, F, G, H, i, j ) \ +{ \ + __m512i T1 = mm512_add4_32( H, BSG2_1x16(E), CHx16(E, F, G), \ + _mm512_set1_epi32( K256[(i)+(j)] ) ); \ + __m512i T2 = _mm512_add_epi32( BSG2_0x16(A), MAJx16(A, B, C) ); \ + D = _mm512_add_epi32( D, T1 ); \ + H = _mm512_add_epi32( T1, T2 ); \ +} + /* #define SHA256_16WAY_ROUND(A, B, C, D, E, F, G, H, i, j) \ do { \ @@ -1292,7 +1343,7 @@ void sha256_16way_prehash_3rounds( __m512i *state_mid, __m512i *X, SHA256_16WAY_ROUND( H, A, B, C, D, E, F, G, 1, 0 ); SHA256_16WAY_ROUND( G, H, A, B, C, D, E, F, 2, 0 ); - // round 3 part 1, ignore nonces W[3] + // round 3 part 1, avoid nonces W[3] T1 = mm512_add4_32( E, BSG2_1x16(B), CHx16(B, C, D), _mm512_set1_epi32( K256[3] ) ); A = _mm512_add_epi32( A, T1 ); @@ -1312,7 +1363,7 @@ void sha256_16way_prehash_3rounds( __m512i *state_mid, __m512i *X, void sha256_16way_final_rounds( __m512i *state_out, const __m512i *data, const __m512i *state_in, const __m512i *state_mid, const __m512i *X ) { - __m512i A, B, C, D, E, F, G, H, T1, T2; + __m512i A, B, C, D, E, F, G, H; __m512i W[16]; memcpy_512( W, data, 16 ); @@ -1326,87 +1377,25 @@ void sha256_16way_final_rounds( __m512i *state_out, const __m512i *data, G = _mm512_load_si512( state_mid + 6 ); H = _mm512_load_si512( state_mid + 7 ); - // round 3 part 2, inject nonces + // round 3 part 2, add nonces A = _mm512_add_epi32( A, W[3] ); E = _mm512_add_epi32( E, W[3] ); - // round 4 - SHA256_16WAY_ROUND( E, F, G, H, A, B, C, D, 4, 0 ); - - // round 5 - T1 = mm512_add4_32( C, BSG2_1x16(H), CHx16(H, A, B), - _mm512_set1_epi32( K256[5] ) ); - T2 = _mm512_add_epi32( BSG2_0x16(D), MAJx16(D, E, F) ); - G = _mm512_add_epi32( G, T1 ); - C = _mm512_add_epi32( T1, T2 ); - - // round 6 - T1 = mm512_add4_32( B, BSG2_1x16(G), CHx16(G, H, A), - _mm512_set1_epi32( K256[6] ) ); - T2 = _mm512_add_epi32( BSG2_0x16(C), MAJx16(C, D, E) ); - F = _mm512_add_epi32( F, T1 ); - B = _mm512_add_epi32( T1, T2 ); - - // round 7 - T1 = mm512_add4_32( A, BSG2_1x16(F), CHx16(F, G, H), - _mm512_set1_epi32( K256[7] ) ); - T2 = _mm512_add_epi32( BSG2_0x16(B), MAJx16(B, C, D) ); - E = _mm512_add_epi32( E, T1 ); - A = _mm512_add_epi32( T1, T2 ); - - // round 8 - T1 = mm512_add4_32( H, BSG2_1x16(E), CHx16(E, F, G), - _mm512_set1_epi32( K256[8] ) ); - T2 = _mm512_add_epi32( BSG2_0x16(A), MAJx16(A, B, C) ); - D = _mm512_add_epi32( D, T1 ); - H = _mm512_add_epi32( T1, T2 ); - - // round 9 - T1 = mm512_add4_32( G, BSG2_1x16(D), CHx16(D, E, F), - _mm512_set1_epi32( K256[9] ) ); - T2 = _mm512_add_epi32( BSG2_0x16(H), MAJx16(H, A, B) ); - C = _mm512_add_epi32( C, T1 ); - G = _mm512_add_epi32( T1, T2 ); - - // round 10 - T1 = mm512_add4_32( F, BSG2_1x16(C), CHx16(C, D, E), - _mm512_set1_epi32( K256[10] ) ); - T2 = _mm512_add_epi32( BSG2_0x16(G), MAJx16(G, H, A) ); - B = _mm512_add_epi32( B, T1 ); - F = _mm512_add_epi32( T1, T2 ); - - // round 11 - T1 = mm512_add4_32( E, BSG2_1x16(B), CHx16(B, C, D), - _mm512_set1_epi32( K256[11] ) ); - T2 = _mm512_add_epi32( BSG2_0x16(F), MAJx16(F, G, H) ); - A = _mm512_add_epi32( A, T1 ); - E = _mm512_add_epi32( T1, T2 ); - - // round 12 - T1 = mm512_add4_32( D, BSG2_1x16(A), CHx16(A, B, C), - _mm512_set1_epi32( K256[12] ) ); - T2 = _mm512_add_epi32( BSG2_0x16(E), MAJx16(E, F, G) ); - H = _mm512_add_epi32( H, T1 ); - D = _mm512_add_epi32( T1, T2 ); - - // round 13 - T1 = 
mm512_add4_32( C, BSG2_1x16(H), CHx16(H, A, B), - _mm512_set1_epi32( K256[13] ) ); - T2 = _mm512_add_epi32( BSG2_0x16(D), MAJx16(D, E, F) ); - G = _mm512_add_epi32( G, T1 ); - C = _mm512_add_epi32( T1, T2 ); - - // round 14 - T1 = mm512_add4_32( B, BSG2_1x16(G), CHx16(G, H, A), - _mm512_set1_epi32( K256[14] ) ); - T2 = _mm512_add_epi32( BSG2_0x16(C), MAJx16(C, D, E) ); - F = _mm512_add_epi32( F, T1 ); - B = _mm512_add_epi32( T1, T2 ); - - // round 15 - SHA256_16WAY_ROUND( B, C, D, E, F, G, H, A, 15, 0 ); - - // rounds 16 to 31 mexp part 2, inject nonces. + // rounds 4 to 15, ignore zero padding W[5..14] + SHA256_16WAY_ROUND ( E, F, G, H, A, B, C, D, 4, 0 ); + SHA256_16WAY_ROUND_NOMSG( D, E, F, G, H, A, B, C, 5, 0 ); + SHA256_16WAY_ROUND_NOMSG( C, D, E, F, G, H, A, B, 6, 0 ); + SHA256_16WAY_ROUND_NOMSG( B, C, D, E, F, G, H, A, 7, 0 ); + SHA256_16WAY_ROUND_NOMSG( A, B, C, D, E, F, G, H, 8, 0 ); + SHA256_16WAY_ROUND_NOMSG( H, A, B, C, D, E, F, G, 9, 0 ); + SHA256_16WAY_ROUND_NOMSG( G, H, A, B, C, D, E, F, 10, 0 ); + SHA256_16WAY_ROUND_NOMSG( F, G, H, A, B, C, D, E, 11, 0 ); + SHA256_16WAY_ROUND_NOMSG( E, F, G, H, A, B, C, D, 12, 0 ); + SHA256_16WAY_ROUND_NOMSG( D, E, F, G, H, A, B, C, 13, 0 ); + SHA256_16WAY_ROUND_NOMSG( C, D, E, F, G, H, A, B, 14, 0 ); + SHA256_16WAY_ROUND ( B, C, D, E, F, G, H, A, 15, 0 ); + + // rounds 16 to 31 mexp part 2, add nonces. W[ 0] = X[ 0]; W[ 1] = X[ 1]; W[ 2] = _mm512_add_epi32( X[ 2], SSG2_0x16( W[ 3] ) ); @@ -1428,6 +1417,7 @@ void sha256_16way_final_rounds( __m512i *state_out, const __m512i *data, SHA256_16WAY_16ROUNDS( A, B, C, D, E, F, G, H, 16 ); + // rounds 32 to 63 W[ 0] = _mm512_add_epi32( X[ 6], _mm512_add_epi32( SSG2_1x16( W[14] ), W[ 9] ) ); W[ 1] = SHA256_16WAY_MEXP( W[15], W[10], W[ 2], W[ 1] ); @@ -1505,41 +1495,12 @@ int sha256_16way_transform_le_short( __m512i *state_out, const __m512i *data, SHA256_16WAY_ROUND( A, B, C, D, E, F, G, H, 8, 0 ); // rounds 9 to 14, ignore zero padding - T1 = mm512_add4_32( G, BSG2_1x16(D), CHx16(D, E, F), - _mm512_set1_epi32( K256[9] ) ); - T2 = _mm512_add_epi32( BSG2_0x16(H), MAJx16(H, A, B) ); - C = _mm512_add_epi32( C, T1 ); - G = _mm512_add_epi32( T1, T2 ); - - T1 = mm512_add4_32( F, BSG2_1x16(C), CHx16(C, D, E), - _mm512_set1_epi32( K256[10] ) ); - T2 = _mm512_add_epi32( BSG2_0x16(G), MAJx16(G, H, A) ); - B = _mm512_add_epi32( B, T1 ); - F = _mm512_add_epi32( T1, T2 ); - - T1 = mm512_add4_32( E, BSG2_1x16(B), CHx16(B, C, D), - _mm512_set1_epi32( K256[11] ) ); - T2 = _mm512_add_epi32( BSG2_0x16(F), MAJx16(F, G, H) ); - A = _mm512_add_epi32( A, T1 ); - E = _mm512_add_epi32( T1, T2 ); - - T1 = mm512_add4_32( D, BSG2_1x16(A), CHx16(A, B, C), - _mm512_set1_epi32( K256[12] ) ); - T2 = _mm512_add_epi32( BSG2_0x16(E), MAJx16(E, F, G) ); - H = _mm512_add_epi32( H, T1 ); - D = _mm512_add_epi32( T1, T2 ); - - T1 = mm512_add4_32( C, BSG2_1x16(H), CHx16(H, A, B), - _mm512_set1_epi32( K256[13] ) ); - T2 = _mm512_add_epi32( BSG2_0x16(D), MAJx16(D, E, F) ); - G = _mm512_add_epi32( G, T1 ); - C = _mm512_add_epi32( T1, T2 ); - - T1 = mm512_add4_32( B, BSG2_1x16(G), CHx16(G, H, A), - _mm512_set1_epi32( K256[14] ) ); - T2 = _mm512_add_epi32( BSG2_0x16(C), MAJx16(C, D, E) ); - F = _mm512_add_epi32( F, T1 ); - B = _mm512_add_epi32( T1, T2 ); + SHA256_16WAY_ROUND_NOMSG( H, A, B, C, D, E, F, G, 9, 0 ); + SHA256_16WAY_ROUND_NOMSG( G, H, A, B, C, D, E, F, 10, 0 ); + SHA256_16WAY_ROUND_NOMSG( F, G, H, A, B, C, D, E, 11, 0 ); + SHA256_16WAY_ROUND_NOMSG( E, F, G, H, A, B, C, D, 12, 0 ); + SHA256_16WAY_ROUND_NOMSG( D, E, F, G, H, A, B, C, 
13, 0 ); + SHA256_16WAY_ROUND_NOMSG( C, D, E, F, G, H, A, B, 14, 0 ); // round 15 SHA256_16WAY_ROUND( B, C, D, E, F, G, H, A, 15, 0 ); @@ -1575,7 +1536,6 @@ int sha256_16way_transform_le_short( __m512i *state_out, const __m512i *data, // rounds 32 to 47 SHA256_MEXP_16WAY_16ROUNDS( W ); - SHA256_16WAY_16ROUNDS( A, B, C, D, E, F, G, H, 32 ); // rounds 48 to 60 mexp @@ -1640,8 +1600,7 @@ int sha256_16way_transform_le_short( __m512i *state_out, const __m512i *data, { hash = _mm512_shuffle_epi8( _mm512_add_epi32( G, IV6 ), bswap_shuf ); targ = _mm512_set1_epi32( target[6] ); - if ( likely( 0 == _mm512_mask_cmple_epu32_mask( t6_mask, - hash, targ ) )) + if ( likely( 0 == _mm512_mask_cmple_epu32_mask( t6_mask, hash, targ ) )) return 0; } diff --git a/algo/sha/sha256-hash-opt.c b/algo/sha/sha256-hash-opt.c deleted file mode 100644 index e08dd60b..00000000 --- a/algo/sha/sha256-hash-opt.c +++ /dev/null @@ -1,388 +0,0 @@ -/* Intel SHA extensions using C intrinsics */ -/* Written and place in public domain by Jeffrey Walton */ -/* Based on code from Intel, and by Sean Gulley for */ -/* the miTLS project. */ - -// A stripped down version with byte swapping removed. - -#if defined(__SHA__) - -#include "sha256-hash.h" - -void sha256_opt_transform_le( uint32_t *state_out, const void *input, - const uint32_t *state_in ) -{ - __m128i STATE0, STATE1; - __m128i MSG, TMP; - __m128i TMSG0, TMSG1, TMSG2, TMSG3; - __m128i ABEF_SAVE, CDGH_SAVE; - - // Load initial values - TMP = _mm_load_si128((__m128i*) &state_in[0]); - STATE1 = _mm_load_si128((__m128i*) &state_in[4]); -// MASK = _mm_set_epi64x(0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL); - - TMP = _mm_shuffle_epi32(TMP, 0xB1); // CDAB - STATE1 = _mm_shuffle_epi32(STATE1, 0x1B); // EFGH - STATE0 = _mm_alignr_epi8(TMP, STATE1, 8); // ABEF - STATE1 = _mm_blend_epi16(STATE1, TMP, 0xF0); // CDGH - - // Save current hash - ABEF_SAVE = STATE0; - CDGH_SAVE = STATE1; - - // Rounds 0-3 - TMSG0 = _mm_load_si128((const __m128i*) (input+0)); -// TMSG0 = _mm_shuffle_epi8(MSG, MASK); - MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(0xE9B5DBA5B5C0FBCFULL, 0x71374491428A2F98ULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - - // Rounds 4-7 - TMSG1 = _mm_load_si128((const __m128i*) (input+16)); -// TMSG1 = _mm_shuffle_epi8(TMSG1, MASK); - MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(0xAB1C5ED5923F82A4ULL, 0x59F111F13956C25BULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1); - - // Rounds 8-11 - TMSG2 = _mm_load_si128((const __m128i*) (input+32)); -// TMSG2 = _mm_shuffle_epi8(TMSG2, MASK); - MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(0x550C7DC3243185BEULL, 0x12835B01D807AA98ULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2); - - // Rounds 12-15 - TMSG3 = _mm_load_si128((const __m128i*) (input+48)); -// TMSG3 = _mm_shuffle_epi8(TMSG3, MASK); - MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(0xC19BF1749BDC06A7ULL, 0x80DEB1FE72BE5D74ULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4); - TMSG0 = _mm_add_epi32(TMSG0, TMP); - TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, 
STATE1, MSG); - TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3); - - // Rounds 16-19 - MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(0x240CA1CC0FC19DC6ULL, 0xEFBE4786E49B69C1ULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4); - TMSG1 = _mm_add_epi32(TMSG1, TMP); - TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0); - - // Rounds 20-23 - MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(0x76F988DA5CB0A9DCULL, 0x4A7484AA2DE92C6FULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4); - TMSG2 = _mm_add_epi32(TMSG2, TMP); - TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1); - - // Rounds 24-27 - MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(0xBF597FC7B00327C8ULL, 0xA831C66D983E5152ULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4); - TMSG3 = _mm_add_epi32(TMSG3, TMP); - TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2); - - // Rounds 28-31 - MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(0x1429296706CA6351ULL, 0xD5A79147C6E00BF3ULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4); - TMSG0 = _mm_add_epi32(TMSG0, TMP); - TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3); - - // Rounds 32-35 - MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(0x53380D134D2C6DFCULL, 0x2E1B213827B70A85ULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4); - TMSG1 = _mm_add_epi32(TMSG1, TMP); - TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0); - - // Rounds 36-39 - MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(0x92722C8581C2C92EULL, 0x766A0ABB650A7354ULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4); - TMSG2 = _mm_add_epi32(TMSG2, TMP); - TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1); - - // Rounds 40-43 - MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(0xC76C51A3C24B8B70ULL, 0xA81A664BA2BFE8A1ULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4); - TMSG3 = _mm_add_epi32(TMSG3, TMP); - TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2); - - // Rounds 44-47 - MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(0x106AA070F40E3585ULL, 0xD6990624D192E819ULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4); - TMSG0 = _mm_add_epi32(TMSG0, TMP); - TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3); - - // Rounds 48-51 - MSG = 
_mm_add_epi32(TMSG0, _mm_set_epi64x(0x34B0BCB52748774CULL, 0x1E376C0819A4C116ULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4); - TMSG1 = _mm_add_epi32(TMSG1, TMP); - TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0); - - // Rounds 52-55 - MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(0x682E6FF35B9CCA4FULL, 0x4ED8AA4A391C0CB3ULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4); - TMSG2 = _mm_add_epi32(TMSG2, TMP); - TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - - // Rounds 56-59 - MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(0x8CC7020884C87814ULL, 0x78A5636F748F82EEULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4); - TMSG3 = _mm_add_epi32(TMSG3, TMP); - TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - - // Rounds 60-63 - MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(0xC67178F2BEF9A3F7ULL, 0xA4506CEB90BEFFFAULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - - // Add values back to state - STATE0 = _mm_add_epi32(STATE0, ABEF_SAVE); - STATE1 = _mm_add_epi32(STATE1, CDGH_SAVE); - - TMP = _mm_shuffle_epi32(STATE0, 0x1B); // FEBA - STATE1 = _mm_shuffle_epi32(STATE1, 0xB1); // DCHG - STATE0 = _mm_blend_epi16(TMP, STATE1, 0xF0); // DCBA - STATE1 = _mm_alignr_epi8(STATE1, TMP, 8); // ABEF - - // Save state - _mm_store_si128((__m128i*) &state_out[0], STATE0); - _mm_store_si128((__m128i*) &state_out[4], STATE1); -} - - -void sha256_opt_transform_be( uint32_t *state_out, const void *input, - const uint32_t *state_in ) -{ - __m128i STATE0, STATE1; - __m128i MSG, TMP, MASK; - __m128i TMSG0, TMSG1, TMSG2, TMSG3; - __m128i ABEF_SAVE, CDGH_SAVE; - - // Load initial values - TMP = _mm_load_si128((__m128i*) &state_in[0]); - STATE1 = _mm_load_si128((__m128i*) &state_in[4]); - MASK = _mm_set_epi64x(0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL); - - TMP = _mm_shuffle_epi32(TMP, 0xB1); // CDAB - STATE1 = _mm_shuffle_epi32(STATE1, 0x1B); // EFGH - STATE0 = _mm_alignr_epi8(TMP, STATE1, 8); // ABEF - STATE1 = _mm_blend_epi16(STATE1, TMP, 0xF0); // CDGH - - // Save current hash - ABEF_SAVE = STATE0; - CDGH_SAVE = STATE1; - - // Rounds 0-3 - TMSG0 = _mm_load_si128((const __m128i*) (input+0)); - TMSG0 = _mm_shuffle_epi8( TMSG0, MASK ); - MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(0xE9B5DBA5B5C0FBCFULL, 0x71374491428A2F98ULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - - // Rounds 4-7 - TMSG1 = _mm_load_si128((const __m128i*) (input+16)); - TMSG1 = _mm_shuffle_epi8(TMSG1, MASK); - MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(0xAB1C5ED5923F82A4ULL, 0x59F111F13956C25BULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1); - // Rounds 8-11 - TMSG2 = _mm_load_si128((const __m128i*) (input+32)); - TMSG2 = _mm_shuffle_epi8(TMSG2, MASK); - MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(0x550C7DC3243185BEULL, 
0x12835B01D807AA98ULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2); - - // Rounds 12-15 - TMSG3 = _mm_load_si128((const __m128i*) (input+48)); - TMSG3 = _mm_shuffle_epi8(TMSG3, MASK); - MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(0xC19BF1749BDC06A7ULL, 0x80DEB1FE72BE5D74ULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4); - TMSG0 = _mm_add_epi32(TMSG0, TMP); - TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3); - - // Rounds 16-19 - MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(0x240CA1CC0FC19DC6ULL, 0xEFBE4786E49B69C1ULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4); - TMSG1 = _mm_add_epi32(TMSG1, TMP); - TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0); - - // Rounds 20-23 - MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(0x76F988DA5CB0A9DCULL, 0x4A7484AA2DE92C6FULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4); - TMSG2 = _mm_add_epi32(TMSG2, TMP); - TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1); - - // Rounds 24-27 - MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(0xBF597FC7B00327C8ULL, 0xA831C66D983E5152ULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4); - TMSG3 = _mm_add_epi32(TMSG3, TMP); - TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2); - - // Rounds 28-31 - MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(0x1429296706CA6351ULL, 0xD5A79147C6E00BF3ULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4); - TMSG0 = _mm_add_epi32(TMSG0, TMP); - TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3); - - // Rounds 32-35 - MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(0x53380D134D2C6DFCULL, 0x2E1B213827B70A85ULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4); - TMSG1 = _mm_add_epi32(TMSG1, TMP); - TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0); - - // Rounds 36-39 - MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(0x92722C8581C2C92EULL, 0x766A0ABB650A7354ULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4); - TMSG2 = _mm_add_epi32(TMSG2, TMP); - TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1); - - // Rounds 40-43 - MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(0xC76C51A3C24B8B70ULL, 0xA81A664BA2BFE8A1ULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4); - TMSG3 = 
_mm_add_epi32(TMSG3, TMP); - TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2); - - // Rounds 44-47 - MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(0x106AA070F40E3585ULL, 0xD6990624D192E819ULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4); - TMSG0 = _mm_add_epi32(TMSG0, TMP); - TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3); - - // Rounds 48-51 - MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(0x34B0BCB52748774CULL, 0x1E376C0819A4C116ULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4); - TMSG1 = _mm_add_epi32(TMSG1, TMP); - TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0); - - // Rounds 52-55 - MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(0x682E6FF35B9CCA4FULL, 0x4ED8AA4A391C0CB3ULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4); - TMSG2 = _mm_add_epi32(TMSG2, TMP); - TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - - // Rounds 56-59 - MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(0x8CC7020884C87814ULL, 0x78A5636F748F82EEULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4); - TMSG3 = _mm_add_epi32(TMSG3, TMP); - TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - - // Rounds 60-63 - MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(0xC67178F2BEF9A3F7ULL, 0xA4506CEB90BEFFFAULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - - // Add values back to state - STATE0 = _mm_add_epi32(STATE0, ABEF_SAVE); - STATE1 = _mm_add_epi32(STATE1, CDGH_SAVE); - - TMP = _mm_shuffle_epi32(STATE0, 0x1B); // FEBA - STATE1 = _mm_shuffle_epi32(STATE1, 0xB1); // DCHG - STATE0 = _mm_blend_epi16(TMP, STATE1, 0xF0); // DCBA - STATE1 = _mm_alignr_epi8(STATE1, TMP, 8); // ABEF - - // Save state - _mm_store_si128((__m128i*) &state_out[0], STATE0); - _mm_store_si128((__m128i*) &state_out[4], STATE1); -} - -#endif diff --git a/algo/sha/sha256-hash.c b/algo/sha/sha256-hash.c index 9d81b20e..823ee72f 100644 --- a/algo/sha/sha256-hash.c +++ b/algo/sha/sha256-hash.c @@ -6,15 +6,1385 @@ static const uint32_t SHA256_IV[8] = 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19 }; -/* -static const uint8_t SHA256_PAD[64] = +#if defined(__SHA__) + +void sha256_opt_transform_le( uint32_t *state_out, const void *input, + const uint32_t *state_in ) { - 0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 -}; -*/ + __m128i STATE0, STATE1; + __m128i MSG, TMP; + __m128i TMSG0, TMSG1, TMSG2, TMSG3; + __m128i ABEF_SAVE, CDGH_SAVE; + + // Load initial values + TMP = _mm_load_si128((__m128i*) &state_in[0]); + STATE1 = _mm_load_si128((__m128i*) &state_in[4]); + + TMP = _mm_shuffle_epi32(TMP, 0xB1); // CDAB + STATE1 = _mm_shuffle_epi32(STATE1, 0x1B); // EFGH 
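+ // sha256rnds2 operates on the state packed as ABEF & CDGH halves rather + // than the canonical ABCD & EFGH, hence these shuffles; the inverse + // shuffles restore the standard order before the state is stored.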
+ STATE0 = _mm_alignr_epi8(TMP, STATE1, 8); // ABEF + STATE1 = _mm_blend_epi16(STATE1, TMP, 0xF0); // CDGH + + // Save current hash + ABEF_SAVE = STATE0; + CDGH_SAVE = STATE1; + + // Rounds 0-3 + TMSG0 = _mm_load_si128((const __m128i*) (input+0)); + MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(0xE9B5DBA5B5C0FBCFULL, 0x71374491428A2F98ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + + // Rounds 4-7 + TMSG1 = _mm_load_si128((const __m128i*) (input+16)); + MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(0xAB1C5ED5923F82A4ULL, 0x59F111F13956C25BULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1); + + + // Rounds 8-11 + TMSG2 = _mm_load_si128((const __m128i*) (input+32)); + MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(0x550C7DC3243185BEULL, 0x12835B01D807AA98ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2); + + // Rounds 12-15 + TMSG3 = _mm_load_si128((const __m128i*) (input+48)); + MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(0xC19BF1749BDC06A7ULL, 0x80DEB1FE72BE5D74ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4); + TMSG0 = _mm_add_epi32(TMSG0, TMP); + TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3); + + // Rounds 16-19 + MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(0x240CA1CC0FC19DC6ULL, 0xEFBE4786E49B69C1ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4); + TMSG1 = _mm_add_epi32(TMSG1, TMP); + TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0); + + // Rounds 20-23 + MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(0x76F988DA5CB0A9DCULL, 0x4A7484AA2DE92C6FULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4); + TMSG2 = _mm_add_epi32(TMSG2, TMP); + TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1); + + // Rounds 24-27 + MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(0xBF597FC7B00327C8ULL, 0xA831C66D983E5152ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4); + TMSG3 = _mm_add_epi32(TMSG3, TMP); + TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2); + + // Rounds 28-31 + MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(0x1429296706CA6351ULL, 0xD5A79147C6E00BF3ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4); + TMSG0 = _mm_add_epi32(TMSG0, TMP); + TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3); + + // Rounds 32-35 + MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(0x53380D134D2C6DFCULL, 0x2E1B213827B70A85ULL)); + STATE1 = 
_mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4); + TMSG1 = _mm_add_epi32(TMSG1, TMP); + TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0); + + // Rounds 36-39 + MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(0x92722C8581C2C92EULL, 0x766A0ABB650A7354ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4); + TMSG2 = _mm_add_epi32(TMSG2, TMP); + TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1); + + // Rounds 40-43 + MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(0xC76C51A3C24B8B70ULL, 0xA81A664BA2BFE8A1ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4); + TMSG3 = _mm_add_epi32(TMSG3, TMP); + TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2); + + // Rounds 44-47 + MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(0x106AA070F40E3585ULL, 0xD6990624D192E819ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4); + TMSG0 = _mm_add_epi32(TMSG0, TMP); + TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3); + + // Rounds 48-51 + MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(0x34B0BCB52748774CULL, 0x1E376C0819A4C116ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4); + TMSG1 = _mm_add_epi32(TMSG1, TMP); + TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0); + + // Rounds 52-55 + MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(0x682E6FF35B9CCA4FULL, 0x4ED8AA4A391C0CB3ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4); + TMSG2 = _mm_add_epi32(TMSG2, TMP); + TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + + // Rounds 56-59 + MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(0x8CC7020884C87814ULL, 0x78A5636F748F82EEULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4); + TMSG3 = _mm_add_epi32(TMSG3, TMP); + TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + + // Rounds 60-63 + MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(0xC67178F2BEF9A3F7ULL, 0xA4506CEB90BEFFFAULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + + // Add values back to state + STATE0 = _mm_add_epi32(STATE0, ABEF_SAVE); + STATE1 = _mm_add_epi32(STATE1, CDGH_SAVE); + + TMP = _mm_shuffle_epi32(STATE0, 0x1B); // FEBA + STATE1 = _mm_shuffle_epi32(STATE1, 0xB1); // DCHG + STATE0 = _mm_blend_epi16(TMP, STATE1, 0xF0); // DCBA + STATE1 = _mm_alignr_epi8(STATE1, TMP, 8); // ABEF + + // Save state + _mm_store_si128((__m128i*) &state_out[0], STATE0); + _mm_store_si128((__m128i*) &state_out[4], STATE1); +} + + +void 
sha256_opt_transform_be( uint32_t *state_out, const void *input, + const uint32_t *state_in ) +{ + __m128i STATE0, STATE1; + __m128i MSG, TMP, MASK; + __m128i TMSG0, TMSG1, TMSG2, TMSG3; + __m128i ABEF_SAVE, CDGH_SAVE; + + // Load initial values + TMP = _mm_load_si128((__m128i*) &state_in[0]); + STATE1 = _mm_load_si128((__m128i*) &state_in[4]); + MASK = _mm_set_epi64x(0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL); + + TMP = _mm_shuffle_epi32(TMP, 0xB1); // CDAB + STATE1 = _mm_shuffle_epi32(STATE1, 0x1B); // EFGH + STATE0 = _mm_alignr_epi8(TMP, STATE1, 8); // ABEF + STATE1 = _mm_blend_epi16(STATE1, TMP, 0xF0); // CDGH + + // Save current hash + ABEF_SAVE = STATE0; + CDGH_SAVE = STATE1; + + // Rounds 0-3 + TMSG0 = _mm_load_si128((const __m128i*) (input+0)); + TMSG0 = _mm_shuffle_epi8( TMSG0, MASK ); + MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(0xE9B5DBA5B5C0FBCFULL, 0x71374491428A2F98ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + + // Rounds 4-7 + TMSG1 = _mm_load_si128((const __m128i*) (input+16)); + TMSG1 = _mm_shuffle_epi8(TMSG1, MASK); + MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(0xAB1C5ED5923F82A4ULL, 0x59F111F13956C25BULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1); + // Rounds 8-11 + TMSG2 = _mm_load_si128((const __m128i*) (input+32)); + TMSG2 = _mm_shuffle_epi8(TMSG2, MASK); + MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(0x550C7DC3243185BEULL, 0x12835B01D807AA98ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2); + + // Rounds 12-15 + TMSG3 = _mm_load_si128((const __m128i*) (input+48)); + TMSG3 = _mm_shuffle_epi8(TMSG3, MASK); + MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(0xC19BF1749BDC06A7ULL, 0x80DEB1FE72BE5D74ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4); + TMSG0 = _mm_add_epi32(TMSG0, TMP); + TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3); + + // Rounds 16-19 + MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(0x240CA1CC0FC19DC6ULL, 0xEFBE4786E49B69C1ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4); + TMSG1 = _mm_add_epi32(TMSG1, TMP); + TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0); + + // Rounds 20-23 + MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(0x76F988DA5CB0A9DCULL, 0x4A7484AA2DE92C6FULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4); + TMSG2 = _mm_add_epi32(TMSG2, TMP); + TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1); + + // Rounds 24-27 + MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(0xBF597FC7B00327C8ULL, 0xA831C66D983E5152ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4); + TMSG3 = _mm_add_epi32(TMSG3, TMP); + TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2); + MSG = 
_mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2); + + // Rounds 28-31 + MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(0x1429296706CA6351ULL, 0xD5A79147C6E00BF3ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4); + TMSG0 = _mm_add_epi32(TMSG0, TMP); + TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3); + + // Rounds 32-35 + MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(0x53380D134D2C6DFCULL, 0x2E1B213827B70A85ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4); + TMSG1 = _mm_add_epi32(TMSG1, TMP); + TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0); + + // Rounds 36-39 + MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(0x92722C8581C2C92EULL, 0x766A0ABB650A7354ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4); + TMSG2 = _mm_add_epi32(TMSG2, TMP); + TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1); + + // Rounds 40-43 + MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(0xC76C51A3C24B8B70ULL, 0xA81A664BA2BFE8A1ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4); + TMSG3 = _mm_add_epi32(TMSG3, TMP); + TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2); + + // Rounds 44-47 + MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(0x106AA070F40E3585ULL, 0xD6990624D192E819ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4); + TMSG0 = _mm_add_epi32(TMSG0, TMP); + TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3); + + // Rounds 48-51 + MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(0x34B0BCB52748774CULL, 0x1E376C0819A4C116ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4); + TMSG1 = _mm_add_epi32(TMSG1, TMP); + TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0); + + // Rounds 52-55 + MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(0x682E6FF35B9CCA4FULL, 0x4ED8AA4A391C0CB3ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4); + TMSG2 = _mm_add_epi32(TMSG2, TMP); + TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + + // Rounds 56-59 + MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(0x8CC7020884C87814ULL, 0x78A5636F748F82EEULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4); + TMSG3 = _mm_add_epi32(TMSG3, TMP); + TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + + // Rounds 60-63 + MSG = _mm_add_epi32(TMSG3, 
_mm_set_epi64x(0xC67178F2BEF9A3F7ULL, 0xA4506CEB90BEFFFAULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + + // Add values back to state + STATE0 = _mm_add_epi32(STATE0, ABEF_SAVE); + STATE1 = _mm_add_epi32(STATE1, CDGH_SAVE); + + TMP = _mm_shuffle_epi32(STATE0, 0x1B); // FEBA + STATE1 = _mm_shuffle_epi32(STATE1, 0xB1); // DCHG + STATE0 = _mm_blend_epi16(TMP, STATE1, 0xF0); // DCBA + STATE1 = _mm_alignr_epi8(STATE1, TMP, 8); // ABEF + + // Save state + _mm_store_si128((__m128i*) &state_out[0], STATE0); + _mm_store_si128((__m128i*) &state_out[4], STATE1); +} + +// 2 way double buffered + +void sha256_ni2way_transform_le( uint32_t *out_X, uint32_t *out_Y, + const void *msg_X, const void *msg_Y, + const uint32_t *in_X, const uint32_t *in_Y ) +{ + __m128i STATE0_X, STATE1_X, STATE0_Y, STATE1_Y; + __m128i MSG_X, MSG_Y, TMP_X, TMP_Y; + __m128i TMSG0_X, TMSG1_X, TMSG2_X, TMSG3_X; + __m128i TMSG0_Y, TMSG1_Y, TMSG2_Y, TMSG3_Y; + __m128i ABEF_SAVE_X, CDGH_SAVE_X, ABEF_SAVE_Y, CDGH_SAVE_Y; + + // Load initial values + TMP_X = _mm_load_si128((__m128i*) &in_X[0]); + STATE1_X = _mm_load_si128((__m128i*) &in_X[4]); + TMP_Y = _mm_load_si128((__m128i*) &in_Y[0]); + STATE1_Y = _mm_load_si128((__m128i*) &in_Y[4]); + + TMP_X = _mm_shuffle_epi32(TMP_X, 0xB1); // CDAB + TMP_Y = _mm_shuffle_epi32(TMP_Y, 0xB1); // CDAB + STATE1_X = _mm_shuffle_epi32(STATE1_X, 0x1B); // EFGH + STATE1_Y = _mm_shuffle_epi32(STATE1_Y, 0x1B); // EFGH + STATE0_X = _mm_alignr_epi8(TMP_X, STATE1_X, 8); // ABEF + STATE0_Y = _mm_alignr_epi8(TMP_Y, STATE1_Y, 8); // ABEF + STATE1_X = _mm_blend_epi16(STATE1_X, TMP_X, 0xF0); // CDGH + STATE1_Y = _mm_blend_epi16(STATE1_Y, TMP_Y, 0xF0); // CDGH + + // Save current hash + ABEF_SAVE_X = STATE0_X; + ABEF_SAVE_Y = STATE0_Y; + CDGH_SAVE_X = STATE1_X; + CDGH_SAVE_Y = STATE1_Y; + + // Rounds 0-3 + TMSG0_X = _mm_load_si128((const __m128i*) (msg_X)); + TMSG0_Y = _mm_load_si128((const __m128i*) (msg_Y)); + TMP_X = _mm_set_epi64x(0xE9B5DBA5B5C0FBCFULL, 0x71374491428A2F98ULL); + MSG_X = _mm_add_epi32( TMSG0_X, TMP_X ); + MSG_Y = _mm_add_epi32( TMSG0_Y, TMP_X ); + STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); + STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); + MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); + MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); + STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); + STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); + + // Rounds 4-7 + TMSG1_X = _mm_load_si128((const __m128i*) (msg_X+16)); + TMSG1_Y = _mm_load_si128((const __m128i*) (msg_Y+16)); + TMP_X = _mm_set_epi64x(0xAB1C5ED5923F82A4ULL, 0x59F111F13956C25BULL); + MSG_X = _mm_add_epi32(TMSG1_X, TMP_X ); + MSG_Y = _mm_add_epi32(TMSG1_Y, TMP_X ); + STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); + STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); + MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); + MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); + STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); + STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); + TMSG0_X = _mm_sha256msg1_epu32(TMSG0_X, TMSG1_X); + TMSG0_Y = _mm_sha256msg1_epu32(TMSG0_Y, TMSG1_Y); + + // Rounds 8-11 + TMSG2_X = _mm_load_si128((const __m128i*) (msg_X+32)); + TMSG2_Y = _mm_load_si128((const __m128i*) (msg_Y+32)); + TMP_X = _mm_set_epi64x(0x550C7DC3243185BEULL, 0x12835B01D807AA98ULL); + MSG_X = _mm_add_epi32(TMSG2_X, TMP_X ); + MSG_Y = _mm_add_epi32(TMSG2_Y, TMP_X ); + STATE1_X =
_mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); + STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); + MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); + MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); + STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); + STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); + TMSG1_X = _mm_sha256msg1_epu32(TMSG1_X, TMSG2_X); + TMSG1_Y = _mm_sha256msg1_epu32(TMSG1_Y, TMSG2_Y); + + // Rounds 12-15 + TMSG3_X = _mm_load_si128((const __m128i*) (msg_X+48)); + TMSG3_Y = _mm_load_si128((const __m128i*) (msg_Y+48)); + TMP_X = _mm_set_epi64x(0xC19BF1749BDC06A7ULL, 0x80DEB1FE72BE5D74ULL); + MSG_X = _mm_add_epi32(TMSG3_X, TMP_X ); + MSG_Y = _mm_add_epi32(TMSG3_Y, TMP_X ); + STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); + STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); + TMP_X = _mm_alignr_epi8(TMSG3_X, TMSG2_X, 4); + TMP_Y = _mm_alignr_epi8(TMSG3_Y, TMSG2_Y, 4); + TMSG0_X = _mm_add_epi32(TMSG0_X, TMP_X); + TMSG0_Y = _mm_add_epi32(TMSG0_Y, TMP_Y); + TMSG0_X = _mm_sha256msg2_epu32(TMSG0_X, TMSG3_X); + TMSG0_Y = _mm_sha256msg2_epu32(TMSG0_Y, TMSG3_Y); + MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); + MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); + STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); + STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); + TMSG2_X = _mm_sha256msg1_epu32(TMSG2_X, TMSG3_X); + TMSG2_Y = _mm_sha256msg1_epu32(TMSG2_Y, TMSG3_Y); + + // Rounds 16-19 + TMP_X = _mm_set_epi64x(0x240CA1CC0FC19DC6ULL, 0xEFBE4786E49B69C1ULL); + MSG_X = _mm_add_epi32(TMSG0_X, TMP_X ); + MSG_Y = _mm_add_epi32(TMSG0_Y, TMP_X ); + STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); + STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); + TMP_X = _mm_alignr_epi8(TMSG0_X, TMSG3_X, 4); + TMP_Y = _mm_alignr_epi8(TMSG0_Y, TMSG3_Y, 4); + TMSG1_X = _mm_add_epi32(TMSG1_X, TMP_X); + TMSG1_Y = _mm_add_epi32(TMSG1_Y, TMP_Y); + TMSG1_X = _mm_sha256msg2_epu32(TMSG1_X, TMSG0_X); + TMSG1_Y = _mm_sha256msg2_epu32(TMSG1_Y, TMSG0_Y); + MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); + MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); + STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); + STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); + TMSG3_X = _mm_sha256msg1_epu32(TMSG3_X, TMSG0_X); + TMSG3_Y = _mm_sha256msg1_epu32(TMSG3_Y, TMSG0_Y); + + // Rounds 20-23 + TMP_X = _mm_set_epi64x(0x76F988DA5CB0A9DCULL, 0x4A7484AA2DE92C6FULL); + MSG_X = _mm_add_epi32(TMSG1_X, TMP_X ); + MSG_Y = _mm_add_epi32(TMSG1_Y, TMP_X ); + STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); + STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); + TMP_X = _mm_alignr_epi8(TMSG1_X, TMSG0_X, 4); + TMP_Y = _mm_alignr_epi8(TMSG1_Y, TMSG0_Y, 4); + TMSG2_X = _mm_add_epi32(TMSG2_X, TMP_X); + TMSG2_Y = _mm_add_epi32(TMSG2_Y, TMP_Y); + TMSG2_X = _mm_sha256msg2_epu32(TMSG2_X, TMSG1_X); + TMSG2_Y = _mm_sha256msg2_epu32(TMSG2_Y, TMSG1_Y); + MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); + MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); + STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); + STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); + TMSG0_X = _mm_sha256msg1_epu32(TMSG0_X, TMSG1_X); + TMSG0_Y = _mm_sha256msg1_epu32(TMSG0_Y, TMSG1_Y); + + // Rounds 24-27 + TMP_X = _mm_set_epi64x(0xBF597FC7B00327C8ULL, 0xA831C66D983E5152ULL); + MSG_X = _mm_add_epi32(TMSG2_X, TMP_X ); + MSG_Y = _mm_add_epi32(TMSG2_Y, TMP_X ); + STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); + STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); + 
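// The alignr/add/sha256msg2 sequence below, together with the earlier + // sha256msg1, completes the next 4 message schedule words for both streams. +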
TMP_X = _mm_alignr_epi8(TMSG2_X, TMSG1_X, 4); + TMP_Y = _mm_alignr_epi8(TMSG2_Y, TMSG1_Y, 4); + TMSG3_X = _mm_add_epi32(TMSG3_X, TMP_X); + TMSG3_Y = _mm_add_epi32(TMSG3_Y, TMP_Y); + TMSG3_X = _mm_sha256msg2_epu32(TMSG3_X, TMSG2_X); + TMSG3_Y = _mm_sha256msg2_epu32(TMSG3_Y, TMSG2_Y); + MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); + MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); + STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); + STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); + TMSG1_X = _mm_sha256msg1_epu32(TMSG1_X, TMSG2_X); + TMSG1_Y = _mm_sha256msg1_epu32(TMSG1_Y, TMSG2_Y); + + // Rounds 28-31 + TMP_X = _mm_set_epi64x(0x1429296706CA6351ULL, 0xD5A79147C6E00BF3ULL); + MSG_X = _mm_add_epi32(TMSG3_X, TMP_X ); + MSG_Y = _mm_add_epi32(TMSG3_Y, TMP_X ); + STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); + STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); + TMP_X = _mm_alignr_epi8(TMSG3_X, TMSG2_X, 4); + TMP_Y = _mm_alignr_epi8(TMSG3_Y, TMSG2_Y, 4); + TMSG0_X = _mm_add_epi32(TMSG0_X, TMP_X); + TMSG0_Y = _mm_add_epi32(TMSG0_Y, TMP_Y); + TMSG0_X = _mm_sha256msg2_epu32(TMSG0_X, TMSG3_X); + TMSG0_Y = _mm_sha256msg2_epu32(TMSG0_Y, TMSG3_Y); + MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); + MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); + STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); + STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); + TMSG2_X = _mm_sha256msg1_epu32(TMSG2_X, TMSG3_X); + TMSG2_Y = _mm_sha256msg1_epu32(TMSG2_Y, TMSG3_Y); + + // Rounds 32-35 + TMP_X = _mm_set_epi64x(0x53380D134D2C6DFCULL, 0x2E1B213827B70A85ULL); + MSG_X = _mm_add_epi32(TMSG0_X, TMP_X ); + MSG_Y = _mm_add_epi32(TMSG0_Y, TMP_X ); + STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); + STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); + TMP_X = _mm_alignr_epi8(TMSG0_X, TMSG3_X, 4); + TMP_Y = _mm_alignr_epi8(TMSG0_Y, TMSG3_Y, 4); + TMSG1_X = _mm_add_epi32(TMSG1_X, TMP_X); + TMSG1_Y = _mm_add_epi32(TMSG1_Y, TMP_Y); + TMSG1_X = _mm_sha256msg2_epu32(TMSG1_X, TMSG0_X); + TMSG1_Y = _mm_sha256msg2_epu32(TMSG1_Y, TMSG0_Y); + MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); + MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); + STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); + STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); + TMSG3_X = _mm_sha256msg1_epu32(TMSG3_X, TMSG0_X); + TMSG3_Y = _mm_sha256msg1_epu32(TMSG3_Y, TMSG0_Y); + + // Rounds 36-39 + TMP_X = _mm_set_epi64x(0x92722C8581C2C92EULL, 0x766A0ABB650A7354ULL); + MSG_X = _mm_add_epi32(TMSG1_X, TMP_X); + MSG_Y = _mm_add_epi32(TMSG1_Y, TMP_X); + STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); + STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); + TMP_X = _mm_alignr_epi8(TMSG1_X, TMSG0_X, 4); + TMP_Y = _mm_alignr_epi8(TMSG1_Y, TMSG0_Y, 4); + TMSG2_X = _mm_add_epi32(TMSG2_X, TMP_X); + TMSG2_Y = _mm_add_epi32(TMSG2_Y, TMP_Y); + TMSG2_X = _mm_sha256msg2_epu32(TMSG2_X, TMSG1_X); + TMSG2_Y = _mm_sha256msg2_epu32(TMSG2_Y, TMSG1_Y); + MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); + MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); + STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); + STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); + TMSG0_X = _mm_sha256msg1_epu32(TMSG0_X, TMSG1_X); + TMSG0_Y = _mm_sha256msg1_epu32(TMSG0_Y, TMSG1_Y); + + // Rounds 40-43 + TMP_X = _mm_set_epi64x(0xC76C51A3C24B8B70ULL, 0xA81A664BA2BFE8A1ULL); + MSG_X = _mm_add_epi32(TMSG2_X, TMP_X); + MSG_Y = _mm_add_epi32(TMSG2_Y, TMP_X); + STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); + STATE1_Y = 
_mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); + TMP_X = _mm_alignr_epi8(TMSG2_X, TMSG1_X, 4); + TMP_Y = _mm_alignr_epi8(TMSG2_Y, TMSG1_Y, 4); + TMSG3_X = _mm_add_epi32(TMSG3_X, TMP_X); + TMSG3_Y = _mm_add_epi32(TMSG3_Y, TMP_Y); + TMSG3_X = _mm_sha256msg2_epu32(TMSG3_X, TMSG2_X); + TMSG3_Y = _mm_sha256msg2_epu32(TMSG3_Y, TMSG2_Y); + MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); + MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); + STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); + STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); + TMSG1_X = _mm_sha256msg1_epu32(TMSG1_X, TMSG2_X); + TMSG1_Y = _mm_sha256msg1_epu32(TMSG1_Y, TMSG2_Y); + + // Rounds 44-47 + TMP_X = _mm_set_epi64x(0x106AA070F40E3585ULL, 0xD6990624D192E819ULL); + MSG_X = _mm_add_epi32(TMSG3_X, TMP_X); + MSG_Y = _mm_add_epi32(TMSG3_Y, TMP_X); + STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); + STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); + TMP_X = _mm_alignr_epi8(TMSG3_X, TMSG2_X, 4); + TMP_Y = _mm_alignr_epi8(TMSG3_Y, TMSG2_Y, 4); + TMSG0_X = _mm_add_epi32(TMSG0_X, TMP_X); + TMSG0_Y = _mm_add_epi32(TMSG0_Y, TMP_Y); + TMSG0_X = _mm_sha256msg2_epu32(TMSG0_X, TMSG3_X); + TMSG0_Y = _mm_sha256msg2_epu32(TMSG0_Y, TMSG3_Y); + MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); + MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); + STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); + STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); + TMSG2_X = _mm_sha256msg1_epu32(TMSG2_X, TMSG3_X); + TMSG2_Y = _mm_sha256msg1_epu32(TMSG2_Y, TMSG3_Y); + + // Rounds 48-51 + TMP_X = _mm_set_epi64x(0x34B0BCB52748774CULL, 0x1E376C0819A4C116ULL); + MSG_X = _mm_add_epi32(TMSG0_X, TMP_X ); + MSG_Y = _mm_add_epi32(TMSG0_Y, TMP_X ); + STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); + STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); + TMP_X = _mm_alignr_epi8(TMSG0_X, TMSG3_X, 4); + TMP_Y = _mm_alignr_epi8(TMSG0_Y, TMSG3_Y, 4); + TMSG1_X = _mm_add_epi32(TMSG1_X, TMP_X); + TMSG1_Y = _mm_add_epi32(TMSG1_Y, TMP_Y); + TMSG1_X = _mm_sha256msg2_epu32(TMSG1_X, TMSG0_X); + TMSG1_Y = _mm_sha256msg2_epu32(TMSG1_Y, TMSG0_Y); + MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); + MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); + STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); + STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); + TMSG3_X = _mm_sha256msg1_epu32(TMSG3_X, TMSG0_X); + TMSG3_Y = _mm_sha256msg1_epu32(TMSG3_Y, TMSG0_Y); + + // Rounds 52-55 + TMP_X = _mm_set_epi64x(0x682E6FF35B9CCA4FULL, 0x4ED8AA4A391C0CB3ULL); + MSG_X = _mm_add_epi32(TMSG1_X, TMP_X ); + MSG_Y = _mm_add_epi32(TMSG1_Y, TMP_X ); + STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); + STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); + TMP_X = _mm_alignr_epi8(TMSG1_X, TMSG0_X, 4); + TMP_Y = _mm_alignr_epi8(TMSG1_Y, TMSG0_Y, 4); + TMSG2_X = _mm_add_epi32(TMSG2_X, TMP_X); + TMSG2_Y = _mm_add_epi32(TMSG2_Y, TMP_Y); + TMSG2_X = _mm_sha256msg2_epu32(TMSG2_X, TMSG1_X); + TMSG2_Y = _mm_sha256msg2_epu32(TMSG2_Y, TMSG1_Y); + MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); + MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); + STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); + STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); + + // Rounds 56-59 + TMP_X = _mm_set_epi64x(0x8CC7020884C87814ULL, 0x78A5636F748F82EEULL); + MSG_X = _mm_add_epi32(TMSG2_X, TMP_X); + MSG_Y = _mm_add_epi32(TMSG2_Y, TMP_X); + STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); + STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); + 
TMP_X = _mm_alignr_epi8(TMSG2_X, TMSG1_X, 4); + TMP_Y = _mm_alignr_epi8(TMSG2_Y, TMSG1_Y, 4); + TMSG3_X = _mm_add_epi32(TMSG3_X, TMP_X); + TMSG3_Y = _mm_add_epi32(TMSG3_Y, TMP_Y); + TMSG3_X = _mm_sha256msg2_epu32(TMSG3_X, TMSG2_X); + TMSG3_Y = _mm_sha256msg2_epu32(TMSG3_Y, TMSG2_Y); + MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); + MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); + STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); + STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); + + // Rounds 60-63 + TMP_X = _mm_set_epi64x(0xC67178F2BEF9A3F7ULL, 0xA4506CEB90BEFFFAULL); + MSG_X = _mm_add_epi32(TMSG3_X, TMP_X); + MSG_Y = _mm_add_epi32(TMSG3_Y, TMP_X); + STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); + STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); + MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); + MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); + STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); + STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); + + // Add values back to state + STATE0_X = _mm_add_epi32(STATE0_X, ABEF_SAVE_X); + STATE1_X = _mm_add_epi32(STATE1_X, CDGH_SAVE_X); + STATE0_Y = _mm_add_epi32(STATE0_Y, ABEF_SAVE_Y); + STATE1_Y = _mm_add_epi32(STATE1_Y, CDGH_SAVE_Y); + + TMP_X = _mm_shuffle_epi32(STATE0_X, 0x1B); // FEBA + TMP_Y = _mm_shuffle_epi32(STATE0_Y, 0x1B); // FEBA + STATE1_X = _mm_shuffle_epi32(STATE1_X, 0xB1); // DCHG + STATE1_Y = _mm_shuffle_epi32(STATE1_Y, 0xB1); // DCHG + STATE0_X = _mm_blend_epi16(TMP_X, STATE1_X, 0xF0); // DCBA + STATE0_Y = _mm_blend_epi16(TMP_Y, STATE1_Y, 0xF0); // DCBA + STATE1_X = _mm_alignr_epi8(STATE1_X, TMP_X, 8); // ABEF + STATE1_Y = _mm_alignr_epi8(STATE1_Y, TMP_Y, 8); // ABEF + + // Save state + _mm_store_si128((__m128i*) &out_X[0], STATE0_X); + _mm_store_si128((__m128i*) &out_X[4], STATE1_X); + _mm_store_si128((__m128i*) &out_Y[0], STATE0_Y); + _mm_store_si128((__m128i*) &out_Y[4], STATE1_Y); +} + +void sha256_ni2way_transform_be( uint32_t *out_X, uint32_t *out_Y, + const void *msg_X, const void *msg_Y, + const uint32_t *in_X, const uint32_t *in_Y ) +{ + __m128i STATE0_X, STATE1_X, STATE0_Y, STATE1_Y; + __m128i MSG_X, MSG_Y, TMP_X, TMP_Y, MASK; + __m128i TMSG0_X, TMSG1_X, TMSG2_X, TMSG3_X; + __m128i TMSG0_Y, TMSG1_Y, TMSG2_Y, TMSG3_Y; + __m128i ABEF_SAVE_X, CDGH_SAVE_X, ABEF_SAVE_Y, CDGH_SAVE_Y; + + // Load initial values + TMP_X = _mm_load_si128((__m128i*) &in_X[0]); + STATE1_X = _mm_load_si128((__m128i*) &in_X[4]); + TMP_Y = _mm_load_si128((__m128i*) &in_Y[0]); + STATE1_Y = _mm_load_si128((__m128i*) &in_Y[4]); + MASK = _mm_set_epi64x(0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL); + + TMP_X = _mm_shuffle_epi32(TMP_X, 0xB1); // CDAB + TMP_Y = _mm_shuffle_epi32(TMP_Y, 0xB1); // CDAB + STATE1_X = _mm_shuffle_epi32(STATE1_X, 0x1B); // EFGH + STATE1_Y = _mm_shuffle_epi32(STATE1_Y, 0x1B); // EFGH + STATE0_X = _mm_alignr_epi8(TMP_X, STATE1_X, 8); // ABEF + STATE0_Y = _mm_alignr_epi8(TMP_Y, STATE1_Y, 8); // ABEF + STATE1_X = _mm_blend_epi16(STATE1_X, TMP_X, 0xF0); // CDGH + STATE1_Y = _mm_blend_epi16(STATE1_Y, TMP_Y, 0xF0); // CDGH + + // Save current hash + ABEF_SAVE_X = STATE0_X; + ABEF_SAVE_Y = STATE0_Y; + CDGH_SAVE_X = STATE1_X; + CDGH_SAVE_Y = STATE1_Y; + + // Rounds 0-3 + TMSG0_X = _mm_load_si128((const __m128i*) (msg_X)); + TMSG0_Y = _mm_load_si128((const __m128i*) (msg_Y)); + TMP_X = _mm_set_epi64x(0xE9B5DBA5B5C0FBCFULL, 0x71374491428A2F98ULL); + TMSG0_X = _mm_shuffle_epi8( TMSG0_X, MASK ); + TMSG0_Y = _mm_shuffle_epi8( TMSG0_Y, MASK ); + MSG_X = _mm_add_epi32( TMSG0_X, TMP_X ); + MSG_Y =
_mm_add_epi32( TMSG0_Y, TMP_X ); + STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); + STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); + MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); + MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); + STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); + STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); + + // Rounds 4-7 + TMSG1_X = _mm_load_si128((const __m128i*) (msg_X+16)); + TMSG1_Y = _mm_load_si128((const __m128i*) (msg_Y+16)); + TMP_X = _mm_set_epi64x(0xAB1C5ED5923F82A4ULL, 0x59F111F13956C25BULL); + TMSG1_X = _mm_shuffle_epi8( TMSG1_X, MASK ); + TMSG1_Y = _mm_shuffle_epi8( TMSG1_Y, MASK ); + MSG_X = _mm_add_epi32(TMSG1_X, TMP_X ); + MSG_Y = _mm_add_epi32(TMSG1_Y, TMP_X ); + STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); + STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); + MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); + MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); + STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); + STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); + TMSG0_X = _mm_sha256msg1_epu32(TMSG0_X, TMSG1_X); + TMSG0_Y = _mm_sha256msg1_epu32(TMSG0_Y, TMSG1_Y); + + // Rounds 8-11 + TMSG2_X = _mm_load_si128((const __m128i*) (msg_X+32)); + TMSG2_Y = _mm_load_si128((const __m128i*) (msg_Y+32)); + TMP_X = _mm_set_epi64x(0x550C7DC3243185BEULL, 0x12835B01D807AA98ULL); + TMSG2_X = _mm_shuffle_epi8( TMSG2_X, MASK ); + TMSG2_Y = _mm_shuffle_epi8( TMSG2_Y, MASK ); + MSG_X = _mm_add_epi32(TMSG2_X, TMP_X ); + MSG_Y = _mm_add_epi32(TMSG2_Y, TMP_X ); + STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); + STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); + MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); + MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); + STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); + STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); + TMSG1_X = _mm_sha256msg1_epu32(TMSG1_X, TMSG2_X); + TMSG1_Y = _mm_sha256msg1_epu32(TMSG1_Y, TMSG2_Y); + + // Rounds 12-15 + TMSG3_X = _mm_load_si128((const __m128i*) (msg_X+48)); + TMSG3_Y = _mm_load_si128((const __m128i*) (msg_Y+48)); + TMP_X = _mm_set_epi64x(0xC19BF1749BDC06A7ULL, 0x80DEB1FE72BE5D74ULL); + TMSG3_X = _mm_shuffle_epi8( TMSG3_X, MASK ); + TMSG3_Y = _mm_shuffle_epi8( TMSG3_Y, MASK ); + MSG_X = _mm_add_epi32(TMSG3_X, TMP_X ); + MSG_Y = _mm_add_epi32(TMSG3_Y, TMP_X ); + STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); + STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); + TMP_X = _mm_alignr_epi8(TMSG3_X, TMSG2_X, 4); + TMP_Y = _mm_alignr_epi8(TMSG3_Y, TMSG2_Y, 4); + TMSG0_X = _mm_add_epi32(TMSG0_X, TMP_X); + TMSG0_Y = _mm_add_epi32(TMSG0_Y, TMP_Y); + TMSG0_X = _mm_sha256msg2_epu32(TMSG0_X, TMSG3_X); + TMSG0_Y = _mm_sha256msg2_epu32(TMSG0_Y, TMSG3_Y); + MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); + MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); + STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); + STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); + TMSG2_X = _mm_sha256msg1_epu32(TMSG2_X, TMSG3_X); + TMSG2_Y = _mm_sha256msg1_epu32(TMSG2_Y, TMSG3_Y); + + // Rounds 16-19 + TMP_X = _mm_set_epi64x(0x240CA1CC0FC19DC6ULL, 0xEFBE4786E49B69C1ULL); + MSG_X = _mm_add_epi32(TMSG0_X, TMP_X ); + MSG_Y = _mm_add_epi32(TMSG0_Y, TMP_X ); + STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); + STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); + TMP_X = _mm_alignr_epi8(TMSG0_X, TMSG3_X, 4); + TMP_Y = _mm_alignr_epi8(TMSG0_Y, TMSG3_Y, 4); + TMSG1_X = 
_mm_add_epi32(TMSG1_X, TMP_X); + TMSG1_Y = _mm_add_epi32(TMSG1_Y, TMP_Y); + TMSG1_X = _mm_sha256msg2_epu32(TMSG1_X, TMSG0_X); + TMSG1_Y = _mm_sha256msg2_epu32(TMSG1_Y, TMSG0_Y); + MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); + MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); + STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); + STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); + TMSG3_X = _mm_sha256msg1_epu32(TMSG3_X, TMSG0_X); + TMSG3_Y = _mm_sha256msg1_epu32(TMSG3_Y, TMSG0_Y); + + // Rounds 20-23 + TMP_X = _mm_set_epi64x(0x76F988DA5CB0A9DCULL, 0x4A7484AA2DE92C6FULL); + MSG_X = _mm_add_epi32(TMSG1_X, TMP_X ); + MSG_Y = _mm_add_epi32(TMSG1_Y, TMP_X ); + STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); + STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); + TMP_X = _mm_alignr_epi8(TMSG1_X, TMSG0_X, 4); + TMP_Y = _mm_alignr_epi8(TMSG1_Y, TMSG0_Y, 4); + TMSG2_X = _mm_add_epi32(TMSG2_X, TMP_X); + TMSG2_Y = _mm_add_epi32(TMSG2_Y, TMP_Y); + TMSG2_X = _mm_sha256msg2_epu32(TMSG2_X, TMSG1_X); + TMSG2_Y = _mm_sha256msg2_epu32(TMSG2_Y, TMSG1_Y); + MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); + MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); + STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); + STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); + TMSG0_X = _mm_sha256msg1_epu32(TMSG0_X, TMSG1_X); + TMSG0_Y = _mm_sha256msg1_epu32(TMSG0_Y, TMSG1_Y); + + // Rounds 24-27 + TMP_X = _mm_set_epi64x(0xBF597FC7B00327C8ULL, 0xA831C66D983E5152ULL); + MSG_X = _mm_add_epi32(TMSG2_X, TMP_X ); + MSG_Y = _mm_add_epi32(TMSG2_Y, TMP_X ); + STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); + STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); + TMP_X = _mm_alignr_epi8(TMSG2_X, TMSG1_X, 4); + TMP_Y = _mm_alignr_epi8(TMSG2_Y, TMSG1_Y, 4); + TMSG3_X = _mm_add_epi32(TMSG3_X, TMP_X); + TMSG3_Y = _mm_add_epi32(TMSG3_Y, TMP_Y); + TMSG3_X = _mm_sha256msg2_epu32(TMSG3_X, TMSG2_X); + TMSG3_Y = _mm_sha256msg2_epu32(TMSG3_Y, TMSG2_Y); + MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); + MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); + STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); + STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); + TMSG1_X = _mm_sha256msg1_epu32(TMSG1_X, TMSG2_X); + TMSG1_Y = _mm_sha256msg1_epu32(TMSG1_Y, TMSG2_Y); + + // Rounds 28-31 + TMP_X = _mm_set_epi64x(0x1429296706CA6351ULL, 0xD5A79147C6E00BF3ULL); + MSG_X = _mm_add_epi32(TMSG3_X, TMP_X ); + MSG_Y = _mm_add_epi32(TMSG3_Y, TMP_X ); + STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); + STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); + TMP_X = _mm_alignr_epi8(TMSG3_X, TMSG2_X, 4); + TMP_Y = _mm_alignr_epi8(TMSG3_Y, TMSG2_Y, 4); + TMSG0_X = _mm_add_epi32(TMSG0_X, TMP_X); + TMSG0_Y = _mm_add_epi32(TMSG0_Y, TMP_Y); + TMSG0_X = _mm_sha256msg2_epu32(TMSG0_X, TMSG3_X); + TMSG0_Y = _mm_sha256msg2_epu32(TMSG0_Y, TMSG3_Y); + MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); + MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); + STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); + STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); + TMSG2_X = _mm_sha256msg1_epu32(TMSG2_X, TMSG3_X); + TMSG2_Y = _mm_sha256msg1_epu32(TMSG2_Y, TMSG3_Y); + + // Rounds 32-35 + TMP_X = _mm_set_epi64x(0x53380D134D2C6DFCULL, 0x2E1B213827B70A85ULL); + MSG_X = _mm_add_epi32(TMSG0_X, TMP_X ); + MSG_Y = _mm_add_epi32(TMSG0_Y, TMP_X ); + STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); + STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); + TMP_X = _mm_alignr_epi8(TMSG0_X, TMSG3_X, 4); + 
TMP_Y = _mm_alignr_epi8(TMSG0_Y, TMSG3_Y, 4); + TMSG1_X = _mm_add_epi32(TMSG1_X, TMP_X); + TMSG1_Y = _mm_add_epi32(TMSG1_Y, TMP_Y); + TMSG1_X = _mm_sha256msg2_epu32(TMSG1_X, TMSG0_X); + TMSG1_Y = _mm_sha256msg2_epu32(TMSG1_Y, TMSG0_Y); + MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); + MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); + STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); + STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); + TMSG3_X = _mm_sha256msg1_epu32(TMSG3_X, TMSG0_X); + TMSG3_Y = _mm_sha256msg1_epu32(TMSG3_Y, TMSG0_Y); + + // Rounds 36-39 + TMP_X = _mm_set_epi64x(0x92722C8581C2C92EULL, 0x766A0ABB650A7354ULL); + MSG_X = _mm_add_epi32(TMSG1_X, TMP_X); + MSG_Y = _mm_add_epi32(TMSG1_Y, TMP_X); + STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); + STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); + TMP_X = _mm_alignr_epi8(TMSG1_X, TMSG0_X, 4); + TMP_Y = _mm_alignr_epi8(TMSG1_Y, TMSG0_Y, 4); + TMSG2_X = _mm_add_epi32(TMSG2_X, TMP_X); + TMSG2_Y = _mm_add_epi32(TMSG2_Y, TMP_Y); + TMSG2_X = _mm_sha256msg2_epu32(TMSG2_X, TMSG1_X); + TMSG2_Y = _mm_sha256msg2_epu32(TMSG2_Y, TMSG1_Y); + MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); + MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); + STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); + STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); + TMSG0_X = _mm_sha256msg1_epu32(TMSG0_X, TMSG1_X); + TMSG0_Y = _mm_sha256msg1_epu32(TMSG0_Y, TMSG1_Y); + + // Rounds 40-43 + TMP_X = _mm_set_epi64x(0xC76C51A3C24B8B70ULL, 0xA81A664BA2BFE8A1ULL); + MSG_X = _mm_add_epi32(TMSG2_X, TMP_X); + MSG_Y = _mm_add_epi32(TMSG2_Y, TMP_X); + STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); + STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); + TMP_X = _mm_alignr_epi8(TMSG2_X, TMSG1_X, 4); + TMP_Y = _mm_alignr_epi8(TMSG2_Y, TMSG1_Y, 4); + TMSG3_X = _mm_add_epi32(TMSG3_X, TMP_X); + TMSG3_Y = _mm_add_epi32(TMSG3_Y, TMP_Y); + TMSG3_X = _mm_sha256msg2_epu32(TMSG3_X, TMSG2_X); + TMSG3_Y = _mm_sha256msg2_epu32(TMSG3_Y, TMSG2_Y); + MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); + MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); + STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); + STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); + TMSG1_X = _mm_sha256msg1_epu32(TMSG1_X, TMSG2_X); + TMSG1_Y = _mm_sha256msg1_epu32(TMSG1_Y, TMSG2_Y); + + // Rounds 44-47 + TMP_X = _mm_set_epi64x(0x106AA070F40E3585ULL, 0xD6990624D192E819ULL); + MSG_X = _mm_add_epi32(TMSG3_X, TMP_X); + MSG_Y = _mm_add_epi32(TMSG3_Y, TMP_X); + STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); + STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); + TMP_X = _mm_alignr_epi8(TMSG3_X, TMSG2_X, 4); + TMP_Y = _mm_alignr_epi8(TMSG3_Y, TMSG2_Y, 4); + TMSG0_X = _mm_add_epi32(TMSG0_X, TMP_X); + TMSG0_Y = _mm_add_epi32(TMSG0_Y, TMP_Y); + TMSG0_X = _mm_sha256msg2_epu32(TMSG0_X, TMSG3_X); + TMSG0_Y = _mm_sha256msg2_epu32(TMSG0_Y, TMSG3_Y); + MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); + MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); + STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); + STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); + TMSG2_X = _mm_sha256msg1_epu32(TMSG2_X, TMSG3_X); + TMSG2_Y = _mm_sha256msg1_epu32(TMSG2_Y, TMSG3_Y); + + // Rounds 48-51 + TMP_X = _mm_set_epi64x(0x34B0BCB52748774CULL, 0x1E376C0819A4C116ULL); + MSG_X = _mm_add_epi32(TMSG0_X, TMP_X ); + MSG_Y = _mm_add_epi32(TMSG0_Y, TMP_X ); + STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); + STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); + 
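// sha256rnds2 computes 2 rounds per call; interleaving the independent X + // and Y streams helps hide its latency. +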
TMP_X = _mm_alignr_epi8(TMSG0_X, TMSG3_X, 4); + TMP_Y = _mm_alignr_epi8(TMSG0_Y, TMSG3_Y, 4); + TMSG1_X = _mm_add_epi32(TMSG1_X, TMP_X); + TMSG1_Y = _mm_add_epi32(TMSG1_Y, TMP_Y); + TMSG1_X = _mm_sha256msg2_epu32(TMSG1_X, TMSG0_X); + TMSG1_Y = _mm_sha256msg2_epu32(TMSG1_Y, TMSG0_Y); + MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); + MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); + STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); + STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); + TMSG3_X = _mm_sha256msg1_epu32(TMSG3_X, TMSG0_X); + TMSG3_Y = _mm_sha256msg1_epu32(TMSG3_Y, TMSG0_Y); + + // Rounds 52-55 + TMP_X = _mm_set_epi64x(0x682E6FF35B9CCA4FULL, 0x4ED8AA4A391C0CB3ULL); + MSG_X = _mm_add_epi32(TMSG1_X, TMP_X ); + MSG_Y = _mm_add_epi32(TMSG1_Y, TMP_X ); + STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); + STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); + TMP_X = _mm_alignr_epi8(TMSG1_X, TMSG0_X, 4); + TMP_Y = _mm_alignr_epi8(TMSG1_Y, TMSG0_Y, 4); + TMSG2_X = _mm_add_epi32(TMSG2_X, TMP_X); + TMSG2_Y = _mm_add_epi32(TMSG2_Y, TMP_Y); + TMSG2_X = _mm_sha256msg2_epu32(TMSG2_X, TMSG1_X); + TMSG2_Y = _mm_sha256msg2_epu32(TMSG2_Y, TMSG1_Y); + MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); + MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); + STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); + STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); + + // Rounds 56-59 + TMP_X = _mm_set_epi64x(0x8CC7020884C87814ULL, 0x78A5636F748F82EEULL); + MSG_X = _mm_add_epi32(TMSG2_X, TMP_X); + MSG_Y = _mm_add_epi32(TMSG2_Y, TMP_X); + STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); + STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); + TMP_X = _mm_alignr_epi8(TMSG2_X, TMSG1_X, 4); + TMP_Y = _mm_alignr_epi8(TMSG2_Y, TMSG1_Y, 4); + TMSG3_X = _mm_add_epi32(TMSG3_X, TMP_X); + TMSG3_Y = _mm_add_epi32(TMSG3_Y, TMP_Y); + TMSG3_X = _mm_sha256msg2_epu32(TMSG3_X, TMSG2_X); + TMSG3_Y = _mm_sha256msg2_epu32(TMSG3_Y, TMSG2_Y); + MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); + MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); + STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); + STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); + + // Rounds 60-63 + TMP_X = _mm_set_epi64x(0xC67178F2BEF9A3F7ULL, 0xA4506CEB90BEFFFAULL); + MSG_X = _mm_add_epi32(TMSG3_X, TMP_X); + MSG_Y = _mm_add_epi32(TMSG3_Y, TMP_X); + STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); + STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); + MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); + MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); + STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); + STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); + + // Add values back to state + STATE0_X = _mm_add_epi32(STATE0_X, ABEF_SAVE_X); + STATE1_X = _mm_add_epi32(STATE1_X, CDGH_SAVE_X); + STATE0_Y = _mm_add_epi32(STATE0_Y, ABEF_SAVE_Y); + STATE1_Y = _mm_add_epi32(STATE1_Y, CDGH_SAVE_Y); + + TMP_X = _mm_shuffle_epi32(STATE0_X, 0x1B); // FEBA + TMP_Y = _mm_shuffle_epi32(STATE0_Y, 0x1B); // FEBA + STATE1_X = _mm_shuffle_epi32(STATE1_X, 0xB1); // DCHG + STATE1_Y = _mm_shuffle_epi32(STATE1_Y, 0xB1); // DCHG + STATE0_X = _mm_blend_epi16(TMP_X, STATE1_X, 0xF0); // DCBA + STATE0_Y = _mm_blend_epi16(TMP_Y, STATE1_Y, 0xF0); // DCBA + STATE1_X = _mm_alignr_epi8(STATE1_X, TMP_X, 8); // ABEF + STATE1_Y = _mm_alignr_epi8(STATE1_Y, TMP_Y, 8); // ABEF + + // Save state + _mm_store_si128((__m128i*) &out_X[0], STATE0_X); + _mm_store_si128((__m128i*) &out_X[4], STATE1_X); + _mm_store_si128((__m128i*) 
&out_Y[0], STATE0_Y); + _mm_store_si128((__m128i*) &out_Y[4], STATE1_Y); +} + +// The next 2 functions work together to separate the low frequency data +// (outer loop) from the high frequency data containing the nonce (inner loop) +// when hashing the second block (tail) of the first sha256 hash. +// The goal is to avoid any redundant processing in the final rounds. Prehash +// covers almost 4 full rounds, missing only the final addition of the nonce. +// The nonce must be set to zero for prehash. +void sha256_ni_prehash_3rounds( uint32_t *ostate, const void *msg, + uint32_t *sstate, const uint32_t *istate ) +{ + __m128i STATE0, STATE1, MSG, TMP; + + // Load initial values + TMP = casti_m128i( istate, 0 ); + STATE1 = casti_m128i( istate, 1 ); + + TMP = _mm_shuffle_epi32( TMP, 0xB1 ); // CDAB + STATE1 = _mm_shuffle_epi32( STATE1, 0x1B ); // EFGH + STATE0 = _mm_alignr_epi8( TMP, STATE1, 8 ); // ABEF + STATE1 = _mm_blend_epi16( STATE1, TMP, 0xF0 ); // CDGH + + // Save current hash + casti_m128i( sstate, 0 ) = STATE0; + casti_m128i( sstate, 1 ) = STATE1; + + // Rounds 0 to 3 + MSG = casti_m128i( msg, 0 ); + TMP = _mm_set_epi64x( 0xE9B5DBA5B5C0FBCFULL, 0x71374491428A2F98ULL ); + MSG = _mm_add_epi32( MSG, TMP ); + STATE1 = _mm_sha256rnds2_epu32( STATE1, STATE0, MSG ); + MSG = _mm_shuffle_epi32( MSG, 0x0E ); + casti_m128i( ostate, 0 ) = _mm_sha256rnds2_epu32( STATE0, STATE1, MSG ); + casti_m128i( ostate, 1 ) = STATE1; +} + +void sha256_ni2way_final_rounds( uint32_t *out_X, uint32_t *out_Y, + const void *msg_X, const void *msg_Y, + const uint32_t *state_mid_X, const uint32_t *state_mid_Y, + const uint32_t *state_save_X, const uint32_t *state_save_Y ) +{ + __m128i STATE0_X, STATE1_X, STATE0_Y, STATE1_Y; + __m128i MSG_X, MSG_Y, TMP_X, TMP_Y; + __m128i TMSG0_X, TMSG1_X, TMSG2_X, TMSG3_X; + __m128i TMSG0_Y, TMSG1_Y, TMSG2_Y, TMSG3_Y; + + STATE0_X = casti_m128i( state_mid_X, 0 ); + STATE1_X = casti_m128i( state_mid_X, 1 ); + STATE0_Y = casti_m128i( state_mid_Y, 0 ); + STATE1_Y = casti_m128i( state_mid_Y, 1 ); + + // Add the nonces (msg[0] lane 3) to A & E (STATE0 lanes 1 & 3) + TMSG0_X = casti_m128i( msg_X, 0 ); + TMSG0_Y = casti_m128i( msg_Y, 0 ); + TMP_X = mm128_xim_32( TMSG0_X, TMSG0_X, 0xd5 ); + TMP_Y = mm128_xim_32( TMSG0_Y, TMSG0_Y, 0xd5 ); + STATE0_X = _mm_add_epi32( STATE0_X, TMP_X ); + STATE0_Y = _mm_add_epi32( STATE0_Y, TMP_Y ); + + // Rounds 4 to 7 + TMSG1_X = casti_m128i( msg_X, 1 ); + TMSG1_Y = casti_m128i( msg_Y, 1 ); + TMP_X = _mm_set_epi64x( 0xAB1C5ED5923F82A4ULL, 0x59F111F13956C25BULL ); + MSG_X = _mm_add_epi32( TMSG1_X, TMP_X ); + MSG_Y = _mm_add_epi32( TMSG1_Y, TMP_X ); + STATE1_X = _mm_sha256rnds2_epu32( STATE1_X, STATE0_X, MSG_X ); + STATE1_Y = _mm_sha256rnds2_epu32( STATE1_Y, STATE0_Y, MSG_Y ); + MSG_X = _mm_shuffle_epi32( MSG_X, 0x0E ); + MSG_Y = _mm_shuffle_epi32( MSG_Y, 0x0E ); + STATE0_X = _mm_sha256rnds2_epu32( STATE0_X, STATE1_X, MSG_X ); + STATE0_Y = _mm_sha256rnds2_epu32( STATE0_Y, STATE1_Y, MSG_Y ); + TMSG0_X = _mm_sha256msg1_epu32( TMSG0_X, TMSG1_X ); + TMSG0_Y = _mm_sha256msg1_epu32( TMSG0_Y, TMSG1_Y ); + + // Rounds 8 to 11: skip TMSG2, it's zero until round 22 + MSG_X = _mm_set_epi64x( 0x550C7DC3243185BEULL, 0x12835B01D807AA98ULL ); + STATE1_X = _mm_sha256rnds2_epu32( STATE1_X, STATE0_X, MSG_X ); + STATE1_Y = _mm_sha256rnds2_epu32( STATE1_Y, STATE0_Y, MSG_X ); + MSG_X = _mm_shuffle_epi32( MSG_X, 0x0E ); + STATE0_X = _mm_sha256rnds2_epu32( STATE0_X, STATE1_X, MSG_X ); + STATE0_Y = _mm_sha256rnds2_epu32( STATE0_Y, STATE1_Y, MSG_X ); + + // Rounds 12 to 15 + TMSG3_X = casti_m128i( msg_X,
3 ); + TMSG3_Y = casti_m128i( msg_Y, 3 ); + TMP_X = _mm_set_epi64x( 0xC19BF1749BDC06A7ULL, 0x80DEB1FE72BE5D74ULL ); + MSG_X = _mm_add_epi32( TMSG3_X, TMP_X ); + MSG_Y = _mm_add_epi32( TMSG3_Y, TMP_X ); + STATE1_X = _mm_sha256rnds2_epu32( STATE1_X, STATE0_X, MSG_X ); + STATE1_Y = _mm_sha256rnds2_epu32( STATE1_Y, STATE0_Y, MSG_Y ); + TMSG0_X = _mm_sha256msg2_epu32( TMSG0_X, TMSG3_X ); + TMSG0_Y = _mm_sha256msg2_epu32( TMSG0_Y, TMSG3_Y ); + MSG_X = _mm_shuffle_epi32( MSG_X, 0x0E ); + MSG_Y = _mm_shuffle_epi32( MSG_Y, 0x0E ); + STATE0_X = _mm_sha256rnds2_epu32( STATE0_X, STATE1_X, MSG_X ); + STATE0_Y = _mm_sha256rnds2_epu32( STATE0_Y, STATE1_Y, MSG_Y ); + + // Rounds 16 to 19 + TMP_X = _mm_set_epi64x( 0x240CA1CC0FC19DC6ULL, 0xEFBE4786E49B69C1ULL ); + MSG_X = _mm_add_epi32( TMSG0_X, TMP_X ); + MSG_Y = _mm_add_epi32( TMSG0_Y, TMP_X ); + STATE1_X = _mm_sha256rnds2_epu32( STATE1_X, STATE0_X, MSG_X ); + STATE1_Y = _mm_sha256rnds2_epu32( STATE1_Y, STATE0_Y, MSG_Y ); + TMP_X = _mm_alignr_epi8( TMSG0_X, TMSG3_X, 4 ); + TMP_Y = _mm_alignr_epi8( TMSG0_Y, TMSG3_Y, 4 ); + TMSG1_X = _mm_add_epi32( TMSG1_X, TMP_X ); + TMSG1_Y = _mm_add_epi32( TMSG1_Y, TMP_Y ); + TMSG1_X = _mm_sha256msg2_epu32( TMSG1_X, TMSG0_X ); + TMSG1_Y = _mm_sha256msg2_epu32( TMSG1_Y, TMSG0_Y ); + MSG_X = _mm_shuffle_epi32( MSG_X, 0x0E ); + MSG_Y = _mm_shuffle_epi32( MSG_Y, 0x0E ); + STATE0_X = _mm_sha256rnds2_epu32( STATE0_X, STATE1_X, MSG_X ); + STATE0_Y = _mm_sha256rnds2_epu32( STATE0_Y, STATE1_Y, MSG_Y ); + TMSG3_X = _mm_sha256msg1_epu32( TMSG3_X, TMSG0_X ); + TMSG3_Y = _mm_sha256msg1_epu32( TMSG3_Y, TMSG0_Y ); + + // Rounds 20 to 23 + TMP_X = _mm_set_epi64x( 0x76F988DA5CB0A9DCULL, 0x4A7484AA2DE92C6FULL ); + MSG_X = _mm_add_epi32( TMSG1_X, TMP_X ); + MSG_Y = _mm_add_epi32( TMSG1_Y, TMP_X ); + STATE1_X = _mm_sha256rnds2_epu32( STATE1_X, STATE0_X, MSG_X ); + STATE1_Y = _mm_sha256rnds2_epu32( STATE1_Y, STATE0_Y, MSG_Y ); + TMSG2_X = _mm_alignr_epi8( TMSG1_X, TMSG0_X, 4 ); + TMSG2_Y = _mm_alignr_epi8( TMSG1_Y, TMSG0_Y, 4 ); + TMSG2_X = _mm_sha256msg2_epu32( TMSG2_X, TMSG1_X ); + TMSG2_Y = _mm_sha256msg2_epu32( TMSG2_Y, TMSG1_Y ); + MSG_X = _mm_shuffle_epi32( MSG_X, 0x0E ); + MSG_Y = _mm_shuffle_epi32( MSG_Y, 0x0E ); + STATE0_X = _mm_sha256rnds2_epu32( STATE0_X, STATE1_X, MSG_X ); + STATE0_Y = _mm_sha256rnds2_epu32( STATE0_Y, STATE1_Y, MSG_Y ); + TMSG0_X = _mm_sha256msg1_epu32( TMSG0_X, TMSG1_X ); + TMSG0_Y = _mm_sha256msg1_epu32( TMSG0_Y, TMSG1_Y ); + + // Rounds 24 to 27 + TMP_X = _mm_set_epi64x( 0xBF597FC7B00327C8ULL, 0xA831C66D983E5152ULL ); + MSG_X = _mm_add_epi32( TMSG2_X, TMP_X ); + MSG_Y = _mm_add_epi32( TMSG2_Y, TMP_X ); + STATE1_X = _mm_sha256rnds2_epu32( STATE1_X, STATE0_X, MSG_X ); + STATE1_Y = _mm_sha256rnds2_epu32( STATE1_Y, STATE0_Y, MSG_Y ); + TMP_X = _mm_alignr_epi8( TMSG2_X, TMSG1_X, 4 ); + TMP_Y = _mm_alignr_epi8( TMSG2_Y, TMSG1_Y, 4 ); + TMSG3_X = _mm_add_epi32( TMSG3_X, TMP_X ); + TMSG3_Y = _mm_add_epi32( TMSG3_Y, TMP_Y ); + TMSG3_X = _mm_sha256msg2_epu32( TMSG3_X, TMSG2_X ); + TMSG3_Y = _mm_sha256msg2_epu32( TMSG3_Y, TMSG2_Y ); + MSG_X = _mm_shuffle_epi32( MSG_X, 0x0E ); + MSG_Y = _mm_shuffle_epi32( MSG_Y, 0x0E ); + STATE0_X = _mm_sha256rnds2_epu32( STATE0_X, STATE1_X, MSG_X ); + STATE0_Y = _mm_sha256rnds2_epu32( STATE0_Y, STATE1_Y, MSG_Y ); + TMSG1_X = _mm_sha256msg1_epu32( TMSG1_X, TMSG2_X ); + TMSG1_Y = _mm_sha256msg1_epu32( TMSG1_Y, TMSG2_Y ); + + // Rounds 28 to 31 + TMP_X = _mm_set_epi64x( 0x1429296706CA6351ULL, 0xD5A79147C6E00BF3ULL ); + MSG_X = _mm_add_epi32( TMSG3_X, TMP_X ); + MSG_Y = _mm_add_epi32( 
TMSG3_Y, TMP_X ); + STATE1_X = _mm_sha256rnds2_epu32( STATE1_X, STATE0_X, MSG_X ); + STATE1_Y = _mm_sha256rnds2_epu32( STATE1_Y, STATE0_Y, MSG_Y ); + TMP_X = _mm_alignr_epi8( TMSG3_X, TMSG2_X, 4 ); + TMP_Y = _mm_alignr_epi8( TMSG3_Y, TMSG2_Y, 4 ); + TMSG0_X = _mm_add_epi32( TMSG0_X, TMP_X ); + TMSG0_Y = _mm_add_epi32( TMSG0_Y, TMP_Y ); + TMSG0_X = _mm_sha256msg2_epu32( TMSG0_X, TMSG3_X ); + TMSG0_Y = _mm_sha256msg2_epu32( TMSG0_Y, TMSG3_Y ); + MSG_X = _mm_shuffle_epi32( MSG_X, 0x0E ); + MSG_Y = _mm_shuffle_epi32( MSG_Y, 0x0E ); + STATE0_X = _mm_sha256rnds2_epu32( STATE0_X, STATE1_X, MSG_X ); + STATE0_Y = _mm_sha256rnds2_epu32( STATE0_Y, STATE1_Y, MSG_Y ); + TMSG2_X = _mm_sha256msg1_epu32( TMSG2_X, TMSG3_X ); + TMSG2_Y = _mm_sha256msg1_epu32( TMSG2_Y, TMSG3_Y ); + + // Rounds 32 to 35 + TMP_X = _mm_set_epi64x( 0x53380D134D2C6DFCULL, 0x2E1B213827B70A85ULL ); + MSG_X = _mm_add_epi32( TMSG0_X, TMP_X ); + MSG_Y = _mm_add_epi32( TMSG0_Y, TMP_X ); + STATE1_X = _mm_sha256rnds2_epu32( STATE1_X, STATE0_X, MSG_X ); + STATE1_Y = _mm_sha256rnds2_epu32( STATE1_Y, STATE0_Y, MSG_Y ); + TMP_X = _mm_alignr_epi8( TMSG0_X, TMSG3_X, 4 ); + TMP_Y = _mm_alignr_epi8( TMSG0_Y, TMSG3_Y, 4 ); + TMSG1_X = _mm_add_epi32( TMSG1_X, TMP_X ); + TMSG1_Y = _mm_add_epi32( TMSG1_Y, TMP_Y ); + TMSG1_X = _mm_sha256msg2_epu32( TMSG1_X, TMSG0_X ); + TMSG1_Y = _mm_sha256msg2_epu32( TMSG1_Y, TMSG0_Y ); + MSG_X = _mm_shuffle_epi32( MSG_X, 0x0E ); + MSG_Y = _mm_shuffle_epi32( MSG_Y, 0x0E ); + STATE0_X = _mm_sha256rnds2_epu32( STATE0_X, STATE1_X, MSG_X ); + STATE0_Y = _mm_sha256rnds2_epu32( STATE0_Y, STATE1_Y, MSG_Y ); + TMSG3_X = _mm_sha256msg1_epu32( TMSG3_X, TMSG0_X ); + TMSG3_Y = _mm_sha256msg1_epu32( TMSG3_Y, TMSG0_Y ); + + // Rounds 36 to 39 + TMP_X = _mm_set_epi64x( 0x92722C8581C2C92EULL, 0x766A0ABB650A7354ULL ); + MSG_X = _mm_add_epi32( TMSG1_X, TMP_X ); + MSG_Y = _mm_add_epi32( TMSG1_Y, TMP_X ); + STATE1_X = _mm_sha256rnds2_epu32( STATE1_X, STATE0_X, MSG_X ); + STATE1_Y = _mm_sha256rnds2_epu32( STATE1_Y, STATE0_Y, MSG_Y ); + TMP_X = _mm_alignr_epi8( TMSG1_X, TMSG0_X, 4 ); + TMP_Y = _mm_alignr_epi8( TMSG1_Y, TMSG0_Y, 4 ); + TMSG2_X = _mm_add_epi32( TMSG2_X, TMP_X ); + TMSG2_Y = _mm_add_epi32( TMSG2_Y, TMP_Y ); + TMSG2_X = _mm_sha256msg2_epu32( TMSG2_X, TMSG1_X ); + TMSG2_Y = _mm_sha256msg2_epu32( TMSG2_Y, TMSG1_Y ); + MSG_X = _mm_shuffle_epi32( MSG_X, 0x0E ); + MSG_Y = _mm_shuffle_epi32( MSG_Y, 0x0E ); + STATE0_X = _mm_sha256rnds2_epu32( STATE0_X, STATE1_X, MSG_X ); + STATE0_Y = _mm_sha256rnds2_epu32( STATE0_Y, STATE1_Y, MSG_Y ); + TMSG0_X = _mm_sha256msg1_epu32( TMSG0_X, TMSG1_X ); + TMSG0_Y = _mm_sha256msg1_epu32( TMSG0_Y, TMSG1_Y ); + + // Rounds 40 to 43 + TMP_X = _mm_set_epi64x( 0xC76C51A3C24B8B70ULL, 0xA81A664BA2BFE8A1ULL ); + MSG_X = _mm_add_epi32( TMSG2_X, TMP_X ); + MSG_Y = _mm_add_epi32( TMSG2_Y, TMP_X ); + STATE1_X = _mm_sha256rnds2_epu32( STATE1_X, STATE0_X, MSG_X ); + STATE1_Y = _mm_sha256rnds2_epu32( STATE1_Y, STATE0_Y, MSG_Y ); + TMP_X = _mm_alignr_epi8( TMSG2_X, TMSG1_X, 4 ); + TMP_Y = _mm_alignr_epi8( TMSG2_Y, TMSG1_Y, 4 ); + TMSG3_X = _mm_add_epi32( TMSG3_X, TMP_X ); + TMSG3_Y = _mm_add_epi32( TMSG3_Y, TMP_Y ); + TMSG3_X = _mm_sha256msg2_epu32( TMSG3_X, TMSG2_X ); + TMSG3_Y = _mm_sha256msg2_epu32( TMSG3_Y, TMSG2_Y ); + MSG_X = _mm_shuffle_epi32( MSG_X, 0x0E ); + MSG_Y = _mm_shuffle_epi32( MSG_Y, 0x0E ); + STATE0_X = _mm_sha256rnds2_epu32( STATE0_X, STATE1_X, MSG_X ); + STATE0_Y = _mm_sha256rnds2_epu32( STATE0_Y, STATE1_Y, MSG_Y ); + TMSG1_X = _mm_sha256msg1_epu32( TMSG1_X, TMSG2_X ); + TMSG1_Y = 
_mm_sha256msg1_epu32( TMSG1_Y, TMSG2_Y ); + + // Rounds 44 to 47 + TMP_X = _mm_set_epi64x( 0x106AA070F40E3585ULL, 0xD6990624D192E819ULL ); + MSG_X = _mm_add_epi32( TMSG3_X, TMP_X ); + MSG_Y = _mm_add_epi32( TMSG3_Y, TMP_X ); + STATE1_X = _mm_sha256rnds2_epu32( STATE1_X, STATE0_X, MSG_X ); + STATE1_Y = _mm_sha256rnds2_epu32( STATE1_Y, STATE0_Y, MSG_Y ); + TMP_X = _mm_alignr_epi8( TMSG3_X, TMSG2_X, 4 ); + TMP_Y = _mm_alignr_epi8( TMSG3_Y, TMSG2_Y, 4 ); + TMSG0_X = _mm_add_epi32( TMSG0_X, TMP_X ); + TMSG0_Y = _mm_add_epi32( TMSG0_Y, TMP_Y ); + TMSG0_X = _mm_sha256msg2_epu32( TMSG0_X, TMSG3_X ); + TMSG0_Y = _mm_sha256msg2_epu32( TMSG0_Y, TMSG3_Y ); + MSG_X = _mm_shuffle_epi32( MSG_X, 0x0E ); + MSG_Y = _mm_shuffle_epi32( MSG_Y, 0x0E ); + STATE0_X = _mm_sha256rnds2_epu32( STATE0_X, STATE1_X, MSG_X ); + STATE0_Y = _mm_sha256rnds2_epu32( STATE0_Y, STATE1_Y, MSG_Y ); + TMSG2_X = _mm_sha256msg1_epu32( TMSG2_X, TMSG3_X ); + TMSG2_Y = _mm_sha256msg1_epu32( TMSG2_Y, TMSG3_Y ); + + // Rounds 48 to 51 + TMP_X = _mm_set_epi64x( 0x34B0BCB52748774CULL, 0x1E376C0819A4C116ULL ); + MSG_X = _mm_add_epi32( TMSG0_X, TMP_X ); + MSG_Y = _mm_add_epi32( TMSG0_Y, TMP_X ); + STATE1_X = _mm_sha256rnds2_epu32( STATE1_X, STATE0_X, MSG_X ); + STATE1_Y = _mm_sha256rnds2_epu32( STATE1_Y, STATE0_Y, MSG_Y ); + TMP_X = _mm_alignr_epi8( TMSG0_X, TMSG3_X, 4 ); + TMP_Y = _mm_alignr_epi8( TMSG0_Y, TMSG3_Y, 4 ); + TMSG1_X = _mm_add_epi32( TMSG1_X, TMP_X ); + TMSG1_Y = _mm_add_epi32( TMSG1_Y, TMP_Y ); + TMSG1_X = _mm_sha256msg2_epu32( TMSG1_X, TMSG0_X ); + TMSG1_Y = _mm_sha256msg2_epu32( TMSG1_Y, TMSG0_Y ); + MSG_X = _mm_shuffle_epi32( MSG_X, 0x0E ); + MSG_Y = _mm_shuffle_epi32( MSG_Y, 0x0E ); + STATE0_X = _mm_sha256rnds2_epu32( STATE0_X, STATE1_X, MSG_X ); + STATE0_Y = _mm_sha256rnds2_epu32( STATE0_Y, STATE1_Y, MSG_Y ); + TMSG3_X = _mm_sha256msg1_epu32( TMSG3_X, TMSG0_X ); + TMSG3_Y = _mm_sha256msg1_epu32( TMSG3_Y, TMSG0_Y ); + + // Rounds 52 to 55 + TMP_X = _mm_set_epi64x( 0x682E6FF35B9CCA4FULL, 0x4ED8AA4A391C0CB3ULL ); + MSG_X = _mm_add_epi32( TMSG1_X, TMP_X ); + MSG_Y = _mm_add_epi32( TMSG1_Y, TMP_X ); + STATE1_X = _mm_sha256rnds2_epu32( STATE1_X, STATE0_X, MSG_X ); + STATE1_Y = _mm_sha256rnds2_epu32( STATE1_Y, STATE0_Y, MSG_Y ); + TMP_X = _mm_alignr_epi8( TMSG1_X, TMSG0_X, 4 ); + TMP_Y = _mm_alignr_epi8( TMSG1_Y, TMSG0_Y, 4 ); + TMSG2_X = _mm_add_epi32( TMSG2_X, TMP_X ); + TMSG2_Y = _mm_add_epi32( TMSG2_Y, TMP_Y ); + TMSG2_X = _mm_sha256msg2_epu32( TMSG2_X, TMSG1_X ); + TMSG2_Y = _mm_sha256msg2_epu32( TMSG2_Y, TMSG1_Y ); + MSG_X = _mm_shuffle_epi32( MSG_X, 0x0E ); + MSG_Y = _mm_shuffle_epi32( MSG_Y, 0x0E ); + STATE0_X = _mm_sha256rnds2_epu32( STATE0_X, STATE1_X, MSG_X ); + STATE0_Y = _mm_sha256rnds2_epu32( STATE0_Y, STATE1_Y, MSG_Y ); + + // Rounds 56 to 59 + TMP_X = _mm_set_epi64x( 0x8CC7020884C87814ULL, 0x78A5636F748F82EEULL ); + MSG_X = _mm_add_epi32( TMSG2_X, TMP_X ); + MSG_Y = _mm_add_epi32( TMSG2_Y, TMP_X ); + STATE1_X = _mm_sha256rnds2_epu32( STATE1_X, STATE0_X, MSG_X ); + STATE1_Y = _mm_sha256rnds2_epu32( STATE1_Y, STATE0_Y, MSG_Y ); + TMP_X = _mm_alignr_epi8( TMSG2_X, TMSG1_X, 4 ); + TMP_Y = _mm_alignr_epi8( TMSG2_Y, TMSG1_Y, 4 ); + TMSG3_X = _mm_add_epi32( TMSG3_X, TMP_X ); + TMSG3_Y = _mm_add_epi32( TMSG3_Y, TMP_Y ); + TMSG3_X = _mm_sha256msg2_epu32( TMSG3_X, TMSG2_X ); + TMSG3_Y = _mm_sha256msg2_epu32( TMSG3_Y, TMSG2_Y ); + MSG_X = _mm_shuffle_epi32( MSG_X, 0x0E ); + MSG_Y = _mm_shuffle_epi32( MSG_Y, 0x0E ); + STATE0_X = _mm_sha256rnds2_epu32( STATE0_X, STATE1_X, MSG_X ); + STATE0_Y = _mm_sha256rnds2_epu32( STATE0_Y, 
STATE1_Y, MSG_Y ); + + // Rounds 60 to 63 + TMP_X = _mm_set_epi64x( 0xC67178F2BEF9A3F7ULL, 0xA4506CEB90BEFFFAULL ); + MSG_X = _mm_add_epi32( TMSG3_X, TMP_X ); + MSG_Y = _mm_add_epi32( TMSG3_Y, TMP_X ); + STATE1_X = _mm_sha256rnds2_epu32( STATE1_X, STATE0_X, MSG_X ); + STATE1_Y = _mm_sha256rnds2_epu32( STATE1_Y, STATE0_Y, MSG_Y ); + MSG_X = _mm_shuffle_epi32( MSG_X, 0x0E ); + MSG_Y = _mm_shuffle_epi32( MSG_Y, 0x0E ); + STATE0_X = _mm_sha256rnds2_epu32( STATE0_X, STATE1_X, MSG_X ); + STATE0_Y = _mm_sha256rnds2_epu32( STATE0_Y, STATE1_Y, MSG_Y ); + + // Add saved state to new state + STATE0_X = _mm_add_epi32( STATE0_X, casti_m128i( state_save_X, 0 ) ); + STATE1_X = _mm_add_epi32( STATE1_X, casti_m128i( state_save_X, 1 ) ); + STATE0_Y = _mm_add_epi32( STATE0_Y, casti_m128i( state_save_Y, 0 ) ); + STATE1_Y = _mm_add_epi32( STATE1_Y, casti_m128i( state_save_Y, 1 ) ); + + // Unshuffle & save state + TMP_X = _mm_shuffle_epi32( STATE0_X, 0x1B ); // FEBA + TMP_Y = _mm_shuffle_epi32( STATE0_Y, 0x1B ); + STATE1_X = _mm_shuffle_epi32( STATE1_X, 0xB1 ); // DCHG + STATE1_Y = _mm_shuffle_epi32( STATE1_Y, 0xB1 ); + casti_m128i( out_X, 0 ) = _mm_blend_epi16( TMP_X, STATE1_X, 0xF0 ); // DCBA + casti_m128i( out_Y, 0 ) = _mm_blend_epi16( TMP_Y, STATE1_Y, 0xF0 ); + casti_m128i( out_X, 1 ) = _mm_alignr_epi8( STATE1_X, TMP_X, 8 ); // ABEF + casti_m128i( out_Y, 1 ) = _mm_alignr_epi8( STATE1_Y, TMP_Y, 8 ); +} + +#endif + void sha256_ctx_init( sha256_context *ctx ) { diff --git a/algo/sha/sha256-hash.h b/algo/sha/sha256-hash.h index 410ca90f..763b405f 100644 --- a/algo/sha/sha256-hash.h +++ b/algo/sha/sha256-hash.h @@ -4,17 +4,18 @@ #include #include "simd-utils.h" #include "cpuminer-config.h" -#include "sph_sha2.h" - // generic interface -typedef struct { +typedef struct +{ unsigned char buf[64]; /* first field, for alignment */ uint32_t state[8]; uint64_t count; } sha256_context __attribute__((aligned(64))); +static const uint32_t SHA256_IV[8]; + void sha256_full( void *hash, const void *data, size_t len ); void sha256_update( sha256_context *ctx, const void *data, size_t len ); void sha256_final( sha256_context *ctx, void *hash ); @@ -41,20 +42,113 @@ void sha256_ni2way_transform_be( uint32_t *out_X, uint32_t*out_Y, const void *msg_X, const void *msg_Y, const uint32_t *in_X, const uint32_t *in_Y ); +void sha256_ni_prehash_3rounds( uint32_t *ostate, const void *msg, + uint32_t *sstate, const uint32_t *istate ); + +void sha256_ni2way_final_rounds( uint32_t *state_out_X, uint32_t *state_out_Y, + const void *msg_X, const void *msg_Y, + const uint32_t *state_mid_X, const uint32_t *state_mid_Y, + const uint32_t *state_save_X, const uint32_t *state_save_Y ); + // Select target // with SHA... #define sha256_transform_le sha256_opt_transform_le #define sha256_transform_be sha256_opt_transform_be #else - // without SHA... 
+#include "sph_sha2.h" + #define sha256_transform_le sph_sha256_transform_le #define sha256_transform_be sph_sha256_transform_be #endif -// SHA can't do only 3 rounds -#define sha256_prehash_3rounds sph_sha256_prehash_3rounds +#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) + +// SHA-256 16 way + +typedef struct +{ + __m512i buf[64>>2]; + __m512i val[8]; + uint32_t count_high, count_low; +} sha256_16way_context __attribute__ ((aligned (128))); + +void sha256_16way_init( sha256_16way_context *sc ); +void sha256_16way_update( sha256_16way_context *sc, const void *data, size_t len ); +void sha256_16way_close( sha256_16way_context *sc, void *dst ); +void sha256_16way_full( void *dst, const void *data, size_t len ); +void sha256_16way_transform_le( __m512i *state_out, const __m512i *data, + const __m512i *state_in ); +void sha256_16way_transform_be( __m512i *state_out, const __m512i *data, + const __m512i *state_in ); +void sha256_16way_prehash_3rounds( __m512i *state_mid, __m512i *X, + const __m512i *W, const __m512i *state_in ); +void sha256_16way_final_rounds( __m512i *state_out, const __m512i *data, + const __m512i *state_in, const __m512i *state_mid, const __m512i *X ); + +int sha256_16way_transform_le_short( __m512i *state_out, const __m512i *data, + const __m512i *state_in, const uint32_t *target ); + +#endif // AVX512 + +#if defined (__AVX2__) + +// SHA-256 8 way + +typedef struct +{ + __m256i buf[64>>2]; + __m256i val[8]; + uint32_t count_high, count_low; +} sha256_8way_context __attribute__ ((aligned (64))); + +void sha256_8way_init( sha256_8way_context *sc ); +void sha256_8way_update( sha256_8way_context *sc, const void *data, size_t len ); +void sha256_8way_close( sha256_8way_context *sc, void *dst ); +void sha256_8way_full( void *dst, const void *data, size_t len ); +void sha256_8way_transform_le( __m256i *state_out, const __m256i *data, + const __m256i *state_in ); +void sha256_8way_transform_be( __m256i *state_out, const __m256i *data, + const __m256i *state_in ); + +void sha256_8way_prehash_3rounds( __m256i *state_mid, __m256i *X, + const __m256i *W, const __m256i *state_in ); +void sha256_8way_final_rounds( __m256i *state_out, const __m256i *data, + const __m256i *state_in, const __m256i *state_mid, const __m256i *X ); +int sha256_8way_transform_le_short( __m256i *state_out, const __m256i *data, + const __m256i *state_in, const uint32_t *target ); + +#endif // AVX2 + +#if defined(__SSE2__) + +// SHA-256 4 way + +typedef struct +{ + __m128i buf[64>>2]; + __m128i val[8]; + uint32_t count_high, count_low; +} sha256_4way_context __attribute__ ((aligned (32))); + +void sha256_4way_init( sha256_4way_context *sc ); +void sha256_4way_update( sha256_4way_context *sc, const void *data, + size_t len ); +void sha256_4way_close( sha256_4way_context *sc, void *dst ); +void sha256_4way_full( void *dst, const void *data, size_t len ); +void sha256_4way_transform_le( __m128i *state_out, const __m128i *data, + const __m128i *state_in ); +void sha256_4way_transform_be( __m128i *state_out, const __m128i *data, + const __m128i *state_in ); +void sha256_4way_prehash_3rounds( __m128i *state_mid, __m128i *X, + const __m128i *W, const __m128i *state_in ); +void sha256_4way_final_rounds( __m128i *state_out, const __m128i *data, + const __m128i *state_in, const __m128i *state_mid, const __m128i *X ); +int sha256_4way_transform_le_short( __m128i *state_out, const __m128i *data, + const __m128i *state_in, const uint32_t *target ); + +#endif // SSE2 #endif 
diff --git a/algo/sha/sha256d-4way.c b/algo/sha/sha256d-4way.c index 24b588ec..b4a54c47 100644 --- a/algo/sha/sha256d-4way.c +++ b/algo/sha/sha256d-4way.c @@ -4,7 +4,6 @@ #include #include #include "sha256-hash.h" -#include "sha-hash-4way.h" static const uint32_t sha256_iv[8] __attribute__ ((aligned (32))) = { @@ -17,11 +16,15 @@ static const uint32_t sha256_iv[8] __attribute__ ((aligned (32))) = int scanhash_sha256d_sha( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ) { - uint32_t block0[16] __attribute__ ((aligned (64))); - uint32_t block1[16] __attribute__ ((aligned (64))); - uint32_t hash0[8] __attribute__ ((aligned (32))); - uint32_t hash1[8] __attribute__ ((aligned (32))); - uint32_t mstate[8] __attribute__ ((aligned (32))); + uint32_t block1a[16] __attribute__ ((aligned (64))); + uint32_t block1b[16] __attribute__ ((aligned (64))); + uint32_t block2a[16] __attribute__ ((aligned (64))); + uint32_t block2b[16] __attribute__ ((aligned (64))); + uint32_t hasha[8] __attribute__ ((aligned (32))); + uint32_t hashb[8] __attribute__ ((aligned (32))); + uint32_t mstatea[8] __attribute__ ((aligned (32))); + uint32_t mstateb[8] __attribute__ ((aligned (32))); + uint32_t sstate[8] __attribute__ ((aligned (32))); uint32_t *pdata = work->data; uint32_t *ptarget = work->target; const uint32_t first_nonce = pdata[19]; @@ -32,56 +35,60 @@ int scanhash_sha256d_sha( struct work *work, uint32_t max_nonce, const __m128i shuf_bswap32 = _mm_set_epi64x( 0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL ); - // hash first 64 bytes of data - sha256_opt_transform_le( mstate, pdata, sha256_iv ); + // hash first 64 byte block of data + sha256_opt_transform_le( mstatea, pdata, sha256_iv ); + + // fill & pad second block without nonce + memcpy( block1a, pdata + 16, 12 ); + memcpy( block1b, pdata + 16, 12 ); + block1a[ 3] = 0; + block1b[ 3] = 0; + block1a[ 4] = block1b[ 4] = 0x80000000; + memset( block1a + 5, 0, 40 ); + memset( block1b + 5, 0, 40 ); + block1a[15] = block1b[15] = 80*8; // bit count + + sha256_ni_prehash_3rounds( mstateb, block1a, sstate, mstatea ); + + // Pad third block + block2a[ 8] = block2b[ 8] = 0x80000000; + memset( block2a + 9, 0, 24 ); + memset( block2b + 9, 0, 24 ); + block2a[15] = block2b[15] = 32*8; // bit count do { - // 1. final 16 bytes of data, with padding - memcpy( block0, pdata + 16, 16 ); - memcpy( block1, pdata + 16, 16 ); - block0[ 3] = n; - block1[ 3] = n+1; - block0[ 4] = block1[ 4] = 0x80000000; - memset( block0 + 5, 0, 40 ); - memset( block1 + 5, 0, 40 ); - block0[15] = block1[15] = 80*8; // bit count - sha256_ni2way_transform_le( hash0, hash1, block0, block1, - mstate, mstate ); - - // 2. 32 byte hash from 1.
- memcpy( block0, hash0, 32 ); - memcpy( block1, hash1, 32 ); - block0[ 8] = block1[ 8] = 0x80000000; - memset( block0 + 9, 0, 24 ); - memset( block1 + 9, 0, 24 ); - block0[15] = block1[15] = 32*8; // bit count - sha256_ni2way_transform_le( hash0, hash1, block0, block1, + // Insert nonce for second block + block1a[3] = n; + block1b[3] = n+1; + sha256_ni2way_final_rounds( block2a, block2b, block1a, block1b, + mstateb, mstateb, sstate, sstate ); + + sha256_ni2way_transform_le( hasha, hashb, block2a, block2b, sha256_iv, sha256_iv ); - if ( unlikely( bswap_32( hash0[7] ) <= ptarget[7] ) ) + if ( unlikely( bswap_32( hasha[7] ) <= ptarget[7] ) ) { - casti_m128i( hash0, 0 ) = - _mm_shuffle_epi8( casti_m128i( hash0, 0 ), shuf_bswap32 ); - casti_m128i( hash0, 1 ) = - _mm_shuffle_epi8( casti_m128i( hash0, 1 ), shuf_bswap32 ); - if ( likely( valid_hash( hash0, ptarget ) && !bench ) ) + casti_m128i( hasha, 0 ) = + _mm_shuffle_epi8( casti_m128i( hasha, 0 ), shuf_bswap32 ); + casti_m128i( hasha, 1 ) = + _mm_shuffle_epi8( casti_m128i( hasha, 1 ), shuf_bswap32 ); + if ( likely( valid_hash( hasha, ptarget ) && !bench ) ) { pdata[19] = n; - submit_solution( work, hash0, mythr ); + submit_solution( work, hasha, mythr ); } } - - if ( unlikely( bswap_32( hash1[7] ) <= ptarget[7] ) ) + if ( unlikely( bswap_32( hashb[7] ) <= ptarget[7] ) ) { - casti_m128i( hash1, 0 ) = - _mm_shuffle_epi8( casti_m128i( hash1, 0 ), shuf_bswap32 ); - casti_m128i( hash1, 1 ) = - _mm_shuffle_epi8( casti_m128i( hash1, 1 ), shuf_bswap32 ); - if ( likely( valid_hash( hash1, ptarget ) && !bench ) ) + casti_m128i( hashb, 0 ) = + _mm_shuffle_epi8( casti_m128i( hashb, 0 ), shuf_bswap32 ); + casti_m128i( hashb, 1 ) = + _mm_shuffle_epi8( casti_m128i( hashb, 1 ), shuf_bswap32 ); + if ( likely( valid_hash( hashb, ptarget ) && !bench ) ) { pdata[19] = n+1; - submit_solution( work, hash1, mythr ); + submit_solution( work, hashb, mythr ); } } n += 2; @@ -99,18 +106,16 @@ int scanhash_sha256d_sha( struct work *work, uint32_t max_nonce, int scanhash_sha256d_16way( struct work *work, const uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ) { - __m512i hash32[8] __attribute__ ((aligned (128))); - __m512i block[16] __attribute__ ((aligned (64))); + __m512i block[16] __attribute__ ((aligned (128))); __m512i buf[16] __attribute__ ((aligned (64))); + __m512i hash32[8] __attribute__ ((aligned (64))); __m512i mstate1[8] __attribute__ ((aligned (64))); __m512i mstate2[8] __attribute__ ((aligned (64))); __m512i istate[8] __attribute__ ((aligned (64))); __m512i mexp_pre[8] __attribute__ ((aligned (64))); uint32_t phash[8] __attribute__ ((aligned (32))); uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; - uint32_t *hash32_d7 = (uint32_t*)&(hash32[7]); - const uint32_t targ32_d7 = ptarget[7]; + const uint32_t *ptarget = work->target; const uint32_t first_nonce = pdata[19]; const uint32_t last_nonce = max_nonce - 16; const __m512i last_byte = _mm512_set1_epi32( 0x80000000 ); @@ -134,7 +139,7 @@ int scanhash_sha256d_16way( struct work *work, const uint32_t max_nonce, mstate1[6] = _mm512_set1_epi32( phash[6] ); mstate1[7] = _mm512_set1_epi32( phash[7] ); - // second message block data, with nonce & padding + // second message block data, with nonce & padding buf[0] = _mm512_set1_epi32( pdata[16] ); buf[1] = _mm512_set1_epi32( pdata[17] ); buf[2] = _mm512_set1_epi32( pdata[18] ); @@ -142,12 +147,12 @@ int scanhash_sha256d_16way( struct work *work, const uint32_t max_nonce, n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n +1, n ); buf[4] = 
last_byte; memset_zero_512( buf+5, 10 ); - buf[15] = _mm512_set1_epi32( 80*8 ); // bit count + buf[15] = _mm512_set1_epi32( 80*8 ); // bit count // partially pre-expand & prehash second message block, avoiding the nonces sha256_16way_prehash_3rounds( mstate2, mexp_pre, buf, mstate1 ); - // vectorize IV for 2nd & 3rd sha256 + // vectorize IV for second hash istate[0] = _mm512_set1_epi32( sha256_iv[0] ); istate[1] = _mm512_set1_epi32( sha256_iv[1] ); istate[2] = _mm512_set1_epi32( sha256_iv[2] ); @@ -157,27 +162,26 @@ int scanhash_sha256d_16way( struct work *work, const uint32_t max_nonce, istate[6] = _mm512_set1_epi32( sha256_iv[6] ); istate[7] = _mm512_set1_epi32( sha256_iv[7] ); - // initialize padding for 2nd sha256 + // initialize padding for second hash block[ 8] = last_byte; - memset_zero_512( block + 9, 6 ); + memset_zero_512( block+9, 6 ); block[15] = _mm512_set1_epi32( 32*8 ); // bit count do { sha256_16way_final_rounds( block, buf, mstate1, mstate2, mexp_pre ); - - if ( sha256_16way_transform_le_short( hash32, block, istate, ptarget ) ) + if ( unlikely( sha256_16way_transform_le_short( + hash32, block, istate, ptarget ) ) ) { for ( int lane = 0; lane < 16; lane++ ) - if ( bswap_32( hash32_d7[ lane ] ) <= targ32_d7 ) { extr_lane_16x32( phash, hash32, lane, 256 ); casti_m256i( phash, 0 ) = - _mm256_shuffle_epi8( casti_m256i( phash, 0 ), bswap_shuf ); + _mm256_shuffle_epi8( casti_m256i( phash, 0 ), bswap_shuf ); if ( likely( valid_hash( phash, ptarget ) && !bench ) ) { - pdata[19] = n + lane; - submit_solution( work, phash, mythr ); + pdata[19] = n + lane; + submit_solution( work, phash, mythr ); } } } @@ -188,92 +192,7 @@ int scanhash_sha256d_16way( struct work *work, const uint32_t max_nonce, *hashes_done = n - first_nonce; return 0; } - - -/* -int scanhash_sha256d_16way( struct work *work, const uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ) -{ - __m512i vdata[32] __attribute__ ((aligned (128))); - __m512i block[16] __attribute__ ((aligned (64))); - __m512i hash32[8] __attribute__ ((aligned (64))); - __m512i initstate[8] __attribute__ ((aligned (64))); - __m512i midstate1[8] __attribute__ ((aligned (64))); - __m512i midstate2[8] __attribute__ ((aligned (64))); - __m512i mexp_pre[16] __attribute__ ((aligned (64))); - uint32_t lane_hash[8] __attribute__ ((aligned (64))); - uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] ); - uint32_t *pdata = work->data; - const uint32_t *ptarget = work->target; - const uint32_t targ32_d7 = ptarget[7]; - const uint32_t first_nonce = pdata[19]; - const uint32_t last_nonce = max_nonce - 16; - uint32_t n = first_nonce; - __m512i *noncev = vdata + 19; - const int thr_id = mythr->id; - const bool bench = opt_benchmark; - const __m512i last_byte = _mm512_set1_epi32( 0x80000000 ); - const __m512i sixteen = _mm512_set1_epi32( 16 ); - - for ( int i = 0; i < 19; i++ ) - vdata[i] = _mm512_set1_epi32( pdata[i] ); - - *noncev = _mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+9, n+8, - n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+1, n ); - - vdata[16+4] = last_byte; - memset_zero_512( vdata+16 + 5, 10 ); - vdata[16+15] = _mm512_set1_epi32( 80*8 ); // bit count - - block[ 8] = last_byte; - memset_zero_512( block + 9, 6 ); - block[15] = _mm512_set1_epi32( 32*8 ); // bit count - // initialize state - initstate[0] = _mm512_set1_epi64( 0x6A09E6676A09E667 ); - initstate[1] = _mm512_set1_epi64( 0xBB67AE85BB67AE85 ); - initstate[2] = _mm512_set1_epi64( 0x3C6EF3723C6EF372 ); - initstate[3] = _mm512_set1_epi64( 0xA54FF53AA54FF53A ); - initstate[4] = 
_mm512_set1_epi64( 0x510E527F510E527F ); - initstate[5] = _mm512_set1_epi64( 0x9B05688C9B05688C ); - initstate[6] = _mm512_set1_epi64( 0x1F83D9AB1F83D9AB ); - initstate[7] = _mm512_set1_epi64( 0x5BE0CD195BE0CD19 ); - - sha256_16way_transform_le( midstate1, vdata, initstate ); - - // Do 3 rounds on the first 12 bytes of the next block - sha256_16way_prehash_3rounds( midstate2, mexp_pre, vdata+16, midstate1 ); - - do - { - // 1. final 16 bytes of data, with padding - sha256_16way_final_rounds( block, vdata+16, midstate1, midstate2, - mexp_pre ); - - // 2. 32 byte hash from 1. - sha256_16way_transform_le( hash32, block, initstate ); - // byte swap final hash for testing - mm512_block_bswap_32( hash32, hash32 ); - - for ( int lane = 0; lane < 16; lane++ ) - if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) ) - { - extr_lane_16x32( lane_hash, hash32, lane, 256 ); - if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) ) - { - pdata[19] = n + lane; - submit_solution( work, lane_hash, mythr ); - } - } - *noncev = _mm512_add_epi32( *noncev, sixteen ); - n += 16; - } while ( (n < last_nonce) && !work_restart[thr_id].restart ); - pdata[19] = n; - *hashes_done = n - first_nonce; - return 0; -} -*/ - #endif #if defined(SHA256D_8WAY) @@ -284,15 +203,13 @@ int scanhash_sha256d_8way( struct work *work, const uint32_t max_nonce, __m256i vdata[32] __attribute__ ((aligned (64))); __m256i block[16] __attribute__ ((aligned (32))); __m256i hash32[8] __attribute__ ((aligned (32))); - __m256i initstate[8] __attribute__ ((aligned (32))); - __m256i midstate1[8] __attribute__ ((aligned (32))); - __m256i midstate2[8] __attribute__ ((aligned (32))); + __m256i istate[8] __attribute__ ((aligned (32))); + __m256i mstate1[8] __attribute__ ((aligned (32))); + __m256i mstate2[8] __attribute__ ((aligned (32))); __m256i mexp_pre[8] __attribute__ ((aligned (32))); uint32_t lane_hash[8] __attribute__ ((aligned (32))); - uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] ); uint32_t *pdata = work->data; const uint32_t *ptarget = work->target; - const uint32_t targ32_d7 = ptarget[7]; const uint32_t first_nonce = pdata[19]; const uint32_t last_nonce = max_nonce - 8; uint32_t n = first_nonce; @@ -301,6 +218,8 @@ int scanhash_sha256d_8way( struct work *work, const uint32_t max_nonce, const bool bench = opt_benchmark; const __m256i last_byte = _mm256_set1_epi32( 0x80000000 ); const __m256i eight = _mm256_set1_epi32( 8 ); + const __m256i bswap_shuf = mm256_bcast_m128( _mm_set_epi64x( + 0x0c0d0e0f08090a0b, 0x0405060700010203 ) ); for ( int i = 0; i < 19; i++ ) vdata[i] = _mm256_set1_epi32( pdata[i] ); @@ -309,50 +228,47 @@ int scanhash_sha256d_8way( struct work *work, const uint32_t max_nonce, vdata[16+4] = last_byte; memset_zero_256( vdata+16 + 5, 10 ); - vdata[16+15] = _mm256_set1_epi32( 80*8 ); // bit count + vdata[16+15] = _mm256_set1_epi32( 80*8 ); block[ 8] = last_byte; memset_zero_256( block + 9, 6 ); - block[15] = _mm256_set1_epi32( 32*8 ); // bit count - - // initialize state - initstate[0] = _mm256_set1_epi64x( 0x6A09E6676A09E667 ); - initstate[1] = _mm256_set1_epi64x( 0xBB67AE85BB67AE85 ); - initstate[2] = _mm256_set1_epi64x( 0x3C6EF3723C6EF372 ); - initstate[3] = _mm256_set1_epi64x( 0xA54FF53AA54FF53A ); - initstate[4] = _mm256_set1_epi64x( 0x510E527F510E527F ); - initstate[5] = _mm256_set1_epi64x( 0x9B05688C9B05688C ); - initstate[6] = _mm256_set1_epi64x( 0x1F83D9AB1F83D9AB ); - initstate[7] = _mm256_set1_epi64x( 0x5BE0CD195BE0CD19 ); - - sha256_8way_transform_le( midstate1, vdata, initstate ); + block[15] = 
_mm256_set1_epi32( 32*8 ); - // Do 3 rounds on the first 12 bytes of the next block - sha256_8way_prehash_3rounds( midstate2, mexp_pre, vdata + 16, midstate1 ); + // initialize state for second hash + istate[0] = _mm256_set1_epi32( sha256_iv[0] ); + istate[1] = _mm256_set1_epi32( sha256_iv[1] ); + istate[2] = _mm256_set1_epi32( sha256_iv[2] ); + istate[3] = _mm256_set1_epi32( sha256_iv[3] ); + istate[4] = _mm256_set1_epi32( sha256_iv[4] ); + istate[5] = _mm256_set1_epi32( sha256_iv[5] ); + istate[6] = _mm256_set1_epi32( sha256_iv[6] ); + istate[7] = _mm256_set1_epi32( sha256_iv[7] ); + + sha256_8way_transform_le( mstate1, vdata, istate ); + // Do 3 rounds on the first 12 bytes of the next block + sha256_8way_prehash_3rounds( mstate2, mexp_pre, vdata + 16, mstate1 ); + do { - // 1. final 16 bytes of data, with padding - sha256_8way_final_rounds( block, vdata+16, midstate1, midstate2, - mexp_pre ); - - // 2. 32 byte hash from 1. - sha256_8way_transform_le( hash32, block, initstate ); - // byte swap final hash for testing - mm256_block_bswap_32( hash32, hash32 ); - - for ( int lane = 0; lane < 8; lane++ ) - if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) ) + sha256_8way_final_rounds( block, vdata+16, mstate1, mstate2, mexp_pre ); + if ( unlikely( sha256_8way_transform_le_short( hash32, block, + istate, ptarget ) ) ) { - extr_lane_8x32( lane_hash, hash32, lane, 256 ); - if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) ) + for ( int lane = 0; lane < 8; lane++ ) { - pdata[19] = n + lane; - submit_solution( work, lane_hash, mythr ); + extr_lane_8x32( lane_hash, hash32, lane, 256 ); + casti_m256i( lane_hash, 0 ) = + _mm256_shuffle_epi8( casti_m256i( lane_hash, 0 ), bswap_shuf ); + if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) ) + { + pdata[19] = n + lane; + submit_solution( work, lane_hash, mythr ); + } } - } - *noncev = _mm256_add_epi32( *noncev, eight ); - n += 8; + } + *noncev = _mm256_add_epi32( *noncev, eight ); + n += 8; } while ( (n < last_nonce) && !work_restart[thr_id].restart ); pdata[19] = n; *hashes_done = n - first_nonce; @@ -366,12 +282,12 @@ int scanhash_sha256d_8way( struct work *work, const uint32_t max_nonce, int scanhash_sha256d_4way( struct work *work, const uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ) { - __m128i vdata[32] __attribute__ ((aligned (64))); - __m128i block[16] __attribute__ ((aligned (32))); - __m128i hash32[8] __attribute__ ((aligned (32))); - __m128i initstate[8] __attribute__ ((aligned (32))); - __m128i midstate1[8] __attribute__ ((aligned (32))); - uint32_t lane_hash[8] __attribute__ ((aligned (32))); + __m128i vdata[32] __attribute__ ((aligned (64))); + __m128i block[16] __attribute__ ((aligned (32))); + __m128i hash32[8] __attribute__ ((aligned (32))); + __m128i istate[8] __attribute__ ((aligned (32))); + __m128i mstate[8] __attribute__ ((aligned (32))); + uint32_t lane_hash[8] __attribute__ ((aligned (32))); uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] ); uint32_t *pdata = work->data; const uint32_t *ptarget = work->target; @@ -392,33 +308,30 @@ int scanhash_sha256d_4way( struct work *work, const uint32_t max_nonce, vdata[16+4] = last_byte; memset_zero_128( vdata+16 + 5, 10 ); - vdata[16+15] = _mm_set1_epi32( 80*8 ); // bit count + vdata[16+15] = _mm_set1_epi32( 80*8 ); block[ 8] = last_byte; memset_zero_128( block + 9, 6 ); - block[15] = _mm_set1_epi32( 32*8 ); // bit count - + block[15] = _mm_set1_epi32( 32*8 ); + // initialize state - initstate[0] = _mm_set1_epi64x( 0x6A09E6676A09E667 ); - initstate[1] = 
_mm_set1_epi64x( 0xBB67AE85BB67AE85 ); - initstate[2] = _mm_set1_epi64x( 0x3C6EF3723C6EF372 ); - initstate[3] = _mm_set1_epi64x( 0xA54FF53AA54FF53A ); - initstate[4] = _mm_set1_epi64x( 0x510E527F510E527F ); - initstate[5] = _mm_set1_epi64x( 0x9B05688C9B05688C ); - initstate[6] = _mm_set1_epi64x( 0x1F83D9AB1F83D9AB ); - initstate[7] = _mm_set1_epi64x( 0x5BE0CD195BE0CD19 ); + istate[0] = _mm_set1_epi32( sha256_iv[0] ); + istate[1] = _mm_set1_epi32( sha256_iv[1] ); + istate[2] = _mm_set1_epi32( sha256_iv[2] ); + istate[3] = _mm_set1_epi32( sha256_iv[3] ); + istate[4] = _mm_set1_epi32( sha256_iv[4] ); + istate[5] = _mm_set1_epi32( sha256_iv[5] ); + istate[6] = _mm_set1_epi32( sha256_iv[6] ); + istate[7] = _mm_set1_epi32( sha256_iv[7] ); // hash first 64 bytes of data - sha256_4way_transform_le( midstate1, vdata, initstate ); + sha256_4way_transform_le( mstate, vdata, istate ); do { - // 1. final 16 bytes of data, with padding - sha256_4way_transform_le( block, vdata+16, initstate ); + sha256_4way_transform_le( block, vdata+16, mstate ); + sha256_4way_transform_le( hash32, block, istate ); - // 2. 32 byte hash from 1. - sha256_4way_transform_le( hash32, block, initstate ); - // byte swap final hash for testing mm128_block_bswap_32( hash32, hash32 ); for ( int lane = 0; lane < 4; lane++ ) @@ -440,3 +353,5 @@ int scanhash_sha256d_4way( struct work *work, const uint32_t max_nonce, } #endif + + diff --git a/algo/sha/sha256dt.c b/algo/sha/sha256dt.c index ac339c86..e1703126 100644 --- a/algo/sha/sha256dt.c +++ b/algo/sha/sha256dt.c @@ -4,7 +4,6 @@ #include #include #include "sha256-hash.h" -#include "sha-hash-4way.h" #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) #define SHA256DT_16WAY 1 @@ -22,14 +21,104 @@ static const uint32_t sha256dt_iv[8] __attribute__ ((aligned (32))) = 0xaa3ff126, 0x475bbf30, 0x8fd52e5b, 0x9f75c9ad }; +#if defined(SHA256DT_SHA) + +int scanhash_sha256dt_sha( struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ) +{ + uint32_t block1a[16] __attribute__ ((aligned (64))); + uint32_t block1b[16] __attribute__ ((aligned (64))); + uint32_t block2a[16] __attribute__ ((aligned (64))); + uint32_t block2b[16] __attribute__ ((aligned (64))); + uint32_t hasha[8] __attribute__ ((aligned (32))); + uint32_t hashb[8] __attribute__ ((aligned (32))); + uint32_t mstatea[8] __attribute__ ((aligned (32))); + uint32_t mstateb[8] __attribute__ ((aligned (32))); + uint32_t sstate[8] __attribute__ ((aligned (32))); + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + const uint32_t first_nonce = pdata[19]; + const uint32_t last_nonce = max_nonce - 2; + uint32_t n = first_nonce; + const int thr_id = mythr->id; + const bool bench = opt_benchmark; + const __m128i shuf_bswap32 = + _mm_set_epi64x( 0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL ); + + // hash first 64 byte block of data + sha256_opt_transform_le( mstatea, pdata, sha256dt_iv ); + + // fill & pad second block without nonce + memcpy( block1a, pdata + 16, 12 ); + memcpy( block1b, pdata + 16, 12 ); + block1a[ 3] = 0; + block1b[ 3] = 0; + block1a[ 4] = block1b[ 4] = 0x80000000; + memset( block1a + 5, 0, 40 ); + memset( block1b + 5, 0, 40 ); + block1a[15] = block1b[15] = 0x480; // funky bit count + + sha256_ni_prehash_3rounds( mstateb, block1a, sstate, mstatea ); + + // Pad third block + block2a[ 8] = block2b[ 8] = 0x80000000; + memset( block2a + 9, 0, 24 ); + memset( block2b + 9, 0, 24 ); + block2a[15] = block2b[15] = 0x300; // bit count + + do +
{ + // Insert nonce for second block + block1a[3] = n; + block1b[3] = n+1; + sha256_ni2way_final_rounds( block2a, block2b, block1a, block1b, + mstateb, mstateb, sstate, sstate ); + + sha256_ni2way_transform_le( hasha, hashb, block2a, block2b, + sha256dt_iv, sha256dt_iv ); + + if ( unlikely( bswap_32( hasha[7] ) <= ptarget[7] ) ) + { + casti_m128i( hasha, 0 ) = + _mm_shuffle_epi8( casti_m128i( hasha, 0 ), shuf_bswap32 ); + casti_m128i( hasha, 1 ) = + _mm_shuffle_epi8( casti_m128i( hasha, 1 ), shuf_bswap32 ); + if ( likely( valid_hash( hasha, ptarget ) && !bench ) ) + { + pdata[19] = n; + submit_solution( work, hasha, mythr ); + } + } + if ( unlikely( bswap_32( hashb[7] ) <= ptarget[7] ) ) + { + casti_m128i( hashb, 0 ) = + _mm_shuffle_epi8( casti_m128i( hashb, 0 ), shuf_bswap32 ); + casti_m128i( hashb, 1 ) = + _mm_shuffle_epi8( casti_m128i( hashb, 1 ), shuf_bswap32 ); + if ( likely( valid_hash( hashb, ptarget ) && !bench ) ) + { + pdata[19] = n+1; + submit_solution( work, hashb, mythr ); + } + } + n += 2; + } while ( (n < last_nonce) && !work_restart[thr_id].restart ); + + pdata[19] = n; + *hashes_done = n - first_nonce; + return 0; +} + +#endif + #if defined(SHA256DT_16WAY) int scanhash_sha256dt_16way( struct work *work, const uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ) { - __m512i hash32[8] __attribute__ ((aligned (128))); - __m512i block[16] __attribute__ ((aligned (64))); + __m512i block[16] __attribute__ ((aligned (128))); __m512i buf[16] __attribute__ ((aligned (64))); + __m512i hash32[8] __attribute__ ((aligned (64))); __m512i mstate1[8] __attribute__ ((aligned (64))); __m512i mstate2[8] __attribute__ ((aligned (64))); __m512i istate[8] __attribute__ ((aligned (64))); @@ -37,8 +126,6 @@ int scanhash_sha256dt_16way( struct work *work, const uint32_t max_nonce, uint32_t phash[8] __attribute__ ((aligned (32))); uint32_t *pdata = work->data; const uint32_t *ptarget = work->target; -// uint32_t *hash32_d7 = (uint32_t*)&(hash32[7]); -// const uint32_t targ32_d7 = ptarget[7]; const uint32_t first_nonce = pdata[19]; const uint32_t last_nonce = max_nonce - 16; const __m512i last_byte = _mm512_set1_epi32( 0x80000000 ); @@ -75,7 +162,7 @@ int scanhash_sha256dt_16way( struct work *work, const uint32_t max_nonce, // partially pre-expand & prehash second message block, avoiding the nonces sha256_16way_prehash_3rounds( mstate2, mexp_pre, buf, mstate1 ); - // vectorize IV for 2nd sha256 + // vectorize IV for second hash istate[0] = _mm512_set1_epi32( sha256dt_iv[0] ); istate[1] = _mm512_set1_epi32( sha256dt_iv[1] ); istate[2] = _mm512_set1_epi32( sha256dt_iv[2] ); @@ -85,20 +172,18 @@ int scanhash_sha256dt_16way( struct work *work, const uint32_t max_nonce, istate[6] = _mm512_set1_epi32( sha256dt_iv[6] ); istate[7] = _mm512_set1_epi32( sha256dt_iv[7] ); - // initialize padding for 2nd sha256 + // initialize padding for second hash block[ 8] = last_byte; memset_zero_512( block+9, 6 ); block[15] = _mm512_set1_epi32( 0x300 ); // bit count do { - // finish second block with nonces sha256_16way_final_rounds( block, buf, mstate1, mstate2, mexp_pre ); if ( unlikely( sha256_16way_transform_le_short( hash32, block, istate, ptarget ) ) ) { for ( int lane = 0; lane < 16; lane++ ) -// if ( bswap_32( hash32_d7[ lane ] ) <= targ32_d7 ) { extr_lane_16x32( phash, hash32, lane, 256 ); casti_m256i( phash, 0 ) = @@ -118,86 +203,9 @@ int scanhash_sha256dt_16way( struct work *work, const uint32_t max_nonce, return 0; } -#elif defined(SHA256DT_SHA) - -int scanhash_sha256dt_sha( struct work *work, 
uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ) -{ - uint32_t block0[16] __attribute__ ((aligned (64))); - uint32_t block1[16] __attribute__ ((aligned (64))); - uint32_t hash0[8] __attribute__ ((aligned (32))); - uint32_t hash1[8] __attribute__ ((aligned (32))); - uint32_t mstate[8] __attribute__ ((aligned (32))); - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; - const uint32_t first_nonce = pdata[19]; - const uint32_t last_nonce = max_nonce - 2; - uint32_t n = first_nonce; - const int thr_id = mythr->id; - const bool bench = opt_benchmark; - const __m128i shuf_bswap32 = - _mm_set_epi64x( 0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL ); - - // hash first 64 bytes of data - sha256_opt_transform_le( mstate, pdata, sha256dt_iv ); - - do - { - // 1. final 16 bytes of data, with padding - memcpy( block0, pdata + 16, 16 ); - memcpy( block1, pdata + 16, 16 ); - block0[ 3] = n; - block1[ 3] = n+1; - block0[ 4] = block1[ 4] = 0x80000000; - memset( block0 + 5, 0, 40 ); - memset( block1 + 5, 0, 40 ); - block0[15] = block1[15] = 0x480; // funky bit count - sha256_ni2way_transform_le( hash0, hash1, block0, block1, - mstate, mstate ); - - // 2. 32 byte hash from 1. - memcpy( block0, hash0, 32 ); - memcpy( block1, hash1, 32 ); - block0[ 8] = block1[ 8] = 0x80000000; - memset( block0 + 9, 0, 24 ); - memset( block1 + 9, 0, 24 ); - block0[15] = block1[15] = 0x300; // bit count - sha256_ni2way_transform_le( hash0, hash1, block0, block1, - sha256dt_iv, sha256dt_iv ); - - if ( unlikely( bswap_32( hash0[7] ) <= ptarget[7] ) ) - { - casti_m128i( hash0, 0 ) = - _mm_shuffle_epi8( casti_m128i( hash0, 0 ), shuf_bswap32 ); - casti_m128i( hash0, 1 ) = - _mm_shuffle_epi8( casti_m128i( hash0, 1 ), shuf_bswap32 ); - if ( likely( valid_hash( hash0, ptarget ) && !bench ) ) - { - pdata[19] = n; - submit_solution( work, hash0, mythr ); - } - } - if ( unlikely( bswap_32( hash1[7] ) <= ptarget[7] ) ) - { - casti_m128i( hash1, 0 ) = - _mm_shuffle_epi8( casti_m128i( hash1, 0 ), shuf_bswap32 ); - casti_m128i( hash1, 1 ) = - _mm_shuffle_epi8( casti_m128i( hash1, 1 ), shuf_bswap32 ); - if ( likely( valid_hash( hash1, ptarget ) && !bench ) ) - { - pdata[19] = n+1; - submit_solution( work, hash1, mythr ); - } - } - n += 2; - } while ( (n < last_nonce) && !work_restart[thr_id].restart ); - - pdata[19] = n; - *hashes_done = n - first_nonce; - return 0; -} +#endif -#elif defined(SHA256DT_8WAY) +#if defined(SHA256DT_8WAY) int scanhash_sha256dt_8way( struct work *work, const uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ) @@ -236,7 +244,7 @@ int scanhash_sha256dt_8way( struct work *work, const uint32_t max_nonce, memset_zero_256( block + 9, 6 ); block[15] = _mm256_set1_epi32( 0x300 ); - // initialize state + // initialize state for second hash istate[0] = _mm256_set1_epi64x( 0xdfa9bf2cdfa9bf2c ); istate[1] = _mm256_set1_epi64x( 0xb72074d4b72074d4 ); istate[2] = _mm256_set1_epi64x( 0x6bb011226bb01122 ); @@ -253,11 +261,9 @@ int scanhash_sha256dt_8way( struct work *work, const uint32_t max_nonce, do { - sha256_8way_final_rounds( block, vdata+16, mstate1, mstate2, - mexp_pre ); - - if ( unlikely( sha256_8way_transform_le_short( - hash32, block, istate, ptarget ) ) ) + sha256_8way_final_rounds( block, vdata+16, mstate1, mstate2, mexp_pre ); + if ( unlikely( sha256_8way_transform_le_short( hash32, block, + istate, ptarget ) ) ) { for ( int lane = 0; lane < 8; lane++ ) { @@ -279,7 +285,9 @@ int scanhash_sha256dt_8way( struct work *work, const uint32_t max_nonce, return 0; } -#elif
defined(SHA256DT_4WAY) +#endif + +#if defined(SHA256DT_4WAY) int scanhash_sha256dt_4way( struct work *work, const uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ) diff --git a/algo/sha/sha256q-4way.c b/algo/sha/sha256q-4way.c index a57c80b3..0d07a396 100644 --- a/algo/sha/sha256q-4way.c +++ b/algo/sha/sha256q-4way.c @@ -3,7 +3,7 @@ #include #include #include -#include "sha-hash-4way.h" +#include "sha256-hash.h" #if defined(SHA256T_16WAY) diff --git a/algo/sha/sha256t-4way.c b/algo/sha/sha256t-4way.c index 6a54a116..411d6f58 100644 --- a/algo/sha/sha256t-4way.c +++ b/algo/sha/sha256t-4way.c @@ -4,7 +4,12 @@ #include #include #include "sha256-hash.h" -#include "sha-hash-4way.h" + + static const uint32_t sha256_iv[8] __attribute__ ((aligned (32))) = + { + 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A, + 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19 + }; #if defined(SHA256T_16WAY) @@ -19,11 +24,6 @@ int scanhash_sha256t_16way( struct work *work, const uint32_t max_nonce, __m512i istate[8] __attribute__ ((aligned (64))); __m512i mexp_pre[8] __attribute__ ((aligned (64))); uint32_t phash[8] __attribute__ ((aligned (32))); - static const uint32_t IV[8] __attribute__ ((aligned (32))) = - { - 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A, - 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19 - }; uint32_t *pdata = work->data; uint32_t *ptarget = work->target; uint32_t *hash32_d7 = (uint32_t*)&(hash32[7]); @@ -39,7 +39,7 @@ int scanhash_sha256t_16way( struct work *work, const uint32_t max_nonce, 0x0c0d0e0f08090a0b, 0x0405060700010203 ) ); // prehash first block directly from pdata - sha256_transform_le( phash, pdata, IV ); + sha256_transform_le( phash, pdata, sha256_iv ); // vectorize block 0 hash for second block mstate1[0] = _mm512_set1_epi32( phash[0] ); @@ -65,14 +65,14 @@ int scanhash_sha256t_16way( struct work *work, const uint32_t max_nonce, sha256_16way_prehash_3rounds( mstate2, mexp_pre, buf, mstate1 ); // vectorize IV for 2nd & 3rd sha256 - istate[0] = _mm512_set1_epi32( IV[0] ); - istate[1] = _mm512_set1_epi32( IV[1] ); - istate[2] = _mm512_set1_epi32( IV[2] ); - istate[3] = _mm512_set1_epi32( IV[3] ); - istate[4] = _mm512_set1_epi32( IV[4] ); - istate[5] = _mm512_set1_epi32( IV[5] ); - istate[6] = _mm512_set1_epi32( IV[6] ); - istate[7] = _mm512_set1_epi32( IV[7] ); + istate[0] = _mm512_set1_epi32( sha256_iv[0] ); + istate[1] = _mm512_set1_epi32( sha256_iv[1] ); + istate[2] = _mm512_set1_epi32( sha256_iv[2] ); + istate[3] = _mm512_set1_epi32( sha256_iv[3] ); + istate[4] = _mm512_set1_epi32( sha256_iv[4] ); + istate[5] = _mm512_set1_epi32( sha256_iv[5] ); + istate[6] = _mm512_set1_epi32( sha256_iv[6] ); + istate[7] = _mm512_set1_epi32( sha256_iv[7] ); // initialize padding for 2nd & 3rd sha256 block[ 8] = last_byte; @@ -110,6 +110,97 @@ int scanhash_sha256t_16way( struct work *work, const uint32_t max_nonce, #endif +#if defined(__SHA__) + +int scanhash_sha256t_sha( struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ) +{ + uint32_t block1a[16] __attribute__ ((aligned (64))); + uint32_t block1b[16] __attribute__ ((aligned (64))); + uint32_t block2a[16] __attribute__ ((aligned (64))); + uint32_t block2b[16] __attribute__ ((aligned (64))); + uint32_t hasha[8] __attribute__ ((aligned (32))); + uint32_t hashb[8] __attribute__ ((aligned (32))); + uint32_t mstatea[8] __attribute__ ((aligned (32))); + uint32_t mstateb[8] __attribute__ ((aligned (32))); + uint32_t sstate[8] __attribute__ ((aligned (32))); + uint32_t *pdata = work->data; + 
uint32_t *ptarget = work->target; + const uint32_t first_nonce = pdata[19]; + const uint32_t last_nonce = max_nonce - 2; + uint32_t n = first_nonce; + const int thr_id = mythr->id; + const bool bench = opt_benchmark; + const __m128i shuf_bswap32 = + _mm_set_epi64x( 0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL ); + + // hash first 64 byte block of data + sha256_opt_transform_le( mstatea, pdata, sha256_iv ); + + // fill & pad second block without nonce + memcpy( block1a, pdata + 16, 12 ); + memcpy( block1b, pdata + 16, 12 ); + block1a[ 3] = 0; + block1b[ 3] = 0; + block1a[ 4] = block1b[ 4] = 0x80000000; + memset( block1a + 5, 0, 40 ); + memset( block1b + 5, 0, 40 ); + block1a[15] = block1b[15] = 80*8; // bit count + + sha256_ni_prehash_3rounds( mstateb, block1a, sstate, mstatea ); + + // Pad third block + block2a[ 8] = block2b[ 8] = 0x80000000; + memset( block2a + 9, 0, 24 ); + memset( block2b + 9, 0, 24 ); + block2a[15] = block2b[15] = 32*8; // bit count + + do + { + // Insert nonce for second block + block1a[3] = n; + block1b[3] = n+1; + sha256_ni2way_final_rounds( block2a, block2b, block1a, block1b, + mstateb, mstateb, sstate, sstate ); + sha256_ni2way_transform_le( block2a, block2b, block2a, block2b, + sha256_iv, sha256_iv ); + sha256_ni2way_transform_le( hasha, hashb, block2a, block2b, + sha256_iv, sha256_iv ); + + if ( unlikely( bswap_32( hasha[7] ) <= ptarget[7] ) ) + { + casti_m128i( hasha, 0 ) = + _mm_shuffle_epi8( casti_m128i( hasha, 0 ), shuf_bswap32 ); + casti_m128i( hasha, 1 ) = + _mm_shuffle_epi8( casti_m128i( hasha, 1 ), shuf_bswap32 ); + if ( likely( valid_hash( hasha, ptarget ) && !bench ) ) + { + pdata[19] = n; + submit_solution( work, hasha, mythr ); + } + } + if ( unlikely( bswap_32( hashb[7] ) <= ptarget[7] ) ) + { + casti_m128i( hashb, 0 ) = + _mm_shuffle_epi8( casti_m128i( hashb, 0 ), shuf_bswap32 ); + casti_m128i( hashb, 1 ) = + _mm_shuffle_epi8( casti_m128i( hashb, 1 ), shuf_bswap32 ); + if ( likely( valid_hash( hashb, ptarget ) && !bench ) ) + { + pdata[19] = n+1; + submit_solution( work, hashb, mythr ); + } + } + n += 2; + } while ( (n < last_nonce) && !work_restart[thr_id].restart ); + + pdata[19] = n; + *hashes_done = n - first_nonce; + return 0; +} + +#endif + #if defined(SHA256T_8WAY) int scanhash_sha256t_8way( struct work *work, const uint32_t max_nonce, diff --git a/algo/sha/sha256t-gate.c b/algo/sha/sha256t-gate.c index e05c7060..e369f27b 100644 --- a/algo/sha/sha256t-gate.c +++ b/algo/sha/sha256t-gate.c @@ -5,9 +5,9 @@ bool register_sha256t_algo( algo_gate_t* gate ) gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT; #if defined(SHA256T_16WAY) gate->scanhash = (void*)&scanhash_sha256t_16way; -#elif defined(__SHA__) +#elif defined(SHA256T_SHA) gate->optimizations = SHA_OPT; - gate->scanhash = (void*)&scanhash_sha256t; + gate->scanhash = (void*)&scanhash_sha256t_sha; #elif defined(SHA256T_8WAY) gate->scanhash = (void*)&scanhash_sha256t_8way; #else @@ -22,7 +22,7 @@ bool register_sha256q_algo( algo_gate_t* gate ) #if defined(SHA256T_16WAY) gate->scanhash = (void*)&scanhash_sha256q_16way; gate->hash = (void*)&sha256q_16way_hash; -#elif defined(__SHA__) +#elif defined(SHA256T_SHA) gate->optimizations = SHA_OPT; gate->scanhash = (void*)&scanhash_sha256q; gate->hash = (void*)&sha256q_hash; diff --git a/algo/sha/sha256t-gate.h b/algo/sha/sha256t-gate.h index e74cfd1d..a20b3dd0 100644 --- a/algo/sha/sha256t-gate.h +++ b/algo/sha/sha256t-gate.h @@ -6,6 +6,8 @@ #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define SHA256T_16WAY 1 +#elif defined(__SHA__) + #define SHA256T_SHA 1 #elif defined(__AVX2__) #define SHA256T_8WAY 1 #else @@ -42,9 +44,9 @@ int scanhash_sha256q_4way( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ); #endif -#if defined(__SHA__) +#if defined(SHA256T_SHA) -int scanhash_sha256t( struct work *work, uint32_t max_nonce, +int scanhash_sha256t_sha( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ); #endif diff --git a/algo/sha/sha256t.c b/algo/sha/sha256t.c deleted file mode 100644 index 298b5f09..00000000 --- a/algo/sha/sha256t.c +++ /dev/null @@ -1,102 +0,0 @@ -#include "sha256t-gate.h" -#include -#include -#include -#include -//#include "algo/sha/sph_sha2.h" -#include "sha256-hash.h" - -#if defined(__SHA__) - -// Only used on CPUs with SHA - - -int scanhash_sha256t( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ) -{ - uint32_t block0[16] __attribute__ ((aligned (64))); - uint32_t block1[16] __attribute__ ((aligned (64))); - uint32_t hash0[8] __attribute__ ((aligned (32))); - uint32_t hash1[8] __attribute__ ((aligned (32))); - uint32_t initstate[8] __attribute__ ((aligned (32))); - uint32_t midstate[8] __attribute__ ((aligned (32))); - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; - const uint32_t first_nonce = pdata[19]; - const uint32_t last_nonce = max_nonce - 2; - uint32_t n = first_nonce; - const int thr_id = mythr->id; - const bool bench = opt_benchmark; - __m128i shuf_bswap32 = - _mm_set_epi64x( 0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL ); - - // initialize state - initstate[0] = 0x6A09E667; - initstate[1] = 0xBB67AE85; - initstate[2] = 0x3C6EF372; - initstate[3] = 0xA54FF53A; - initstate[4] = 0x510E527F; - initstate[5] = 0x9B05688C; - initstate[6] = 0x1F83D9AB; - initstate[7] = 0x5BE0CD19; - - // hash first 64 bytes of data - sha256_opt_transform_le( midstate, pdata, initstate ); - - do - { - // 1. final 16 bytes of data, with padding - memcpy( block0, pdata + 16, 16 ); - memcpy( block1, pdata + 16, 16 ); - block0[ 3] = n; - block1[ 3] = n+1; - block0[ 4] = block1[ 4] = 0x80000000; - memset( block0 + 5, 0, 40 ); - memset( block1 + 5, 0, 40 ); - block0[15] = block1[15] = 80*8; // bit count - sha256_ni2way_transform_le( hash0, hash1, block0, block1, midstate, midstate ); - - // 2. 32 byte hash from 1. - memcpy( block0, hash0, 32 ); - memcpy( block1, hash1, 32 ); - block0[ 8] = block1[ 8] = 0x80000000; - memset( block0 + 9, 0, 24 ); - memset( block1 + 9, 0, 24 ); - block0[15] = block1[15] = 32*8; // bit count - sha256_ni2way_transform_le( hash0, hash1, block0, block1, initstate, initstate ); - - // 3. 32 byte hash from 2. 
- memcpy( block0, hash0, 32 ); - memcpy( block1, hash1, 32 ); - sha256_ni2way_transform_le( hash0, hash1, block0, block1, initstate, initstate ); - - // byte swap final hash for testing - casti_m128i( hash0, 0 ) = - _mm_shuffle_epi8( casti_m128i( hash0, 0 ), shuf_bswap32 ); - casti_m128i( hash0, 1 ) = - _mm_shuffle_epi8( casti_m128i( hash0, 1 ), shuf_bswap32 ); - casti_m128i( hash1, 0 ) = - _mm_shuffle_epi8( casti_m128i( hash1, 0 ), shuf_bswap32 ); - casti_m128i( hash1, 1 ) = - _mm_shuffle_epi8( casti_m128i( hash1, 1 ), shuf_bswap32 ); - - if ( unlikely( valid_hash( hash0, ptarget ) && !bench ) ) - { - pdata[19] = n; - submit_solution( work, hash0, mythr ); - } - if ( unlikely( valid_hash( hash1, ptarget ) && !bench ) ) - { - pdata[19] = n+1; - submit_solution( work, hash1, mythr ); - } - n += 2; - } while ( (n < last_nonce) && !work_restart[thr_id].restart ); - - pdata[19] = n; - *hashes_done = n - first_nonce; - return 0; -} - -#endif - diff --git a/algo/sha/sha512-hash-4way.c b/algo/sha/sha512-hash-4way.c index 0cbd989c..cc4481bd 100644 --- a/algo/sha/sha512-hash-4way.c +++ b/algo/sha/sha512-hash-4way.c @@ -34,7 +34,7 @@ #include #include -#include "sha-hash-4way.h" +#include "sha512-hash.h" /* static const uit64_t H512[8] = diff --git a/algo/sha/sha512-hash.h b/algo/sha/sha512-hash.h new file mode 100644 index 00000000..58ef67c4 --- /dev/null +++ b/algo/sha/sha512-hash.h @@ -0,0 +1,46 @@ +#ifndef SHA512_HASH_H__ +#define SHA512_HASH_H__ 1 + +#include +#include "simd-utils.h" +#include "sph_sha2.h" + +#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) + +// SHA-512 8 way + +typedef struct { + __m512i buf[128>>3]; + __m512i val[8]; + uint64_t count; + bool initialized; +} sha512_8way_context __attribute__ ((aligned (128))); + +void sha512_8way_init( sha512_8way_context *sc); +void sha512_8way_update( sha512_8way_context *sc, const void *data, + size_t len ); +void sha512_8way_close( sha512_8way_context *sc, void *dst ); +void sha512_8way_full( void *dst, const void *data, size_t len ); + +#endif // AVX512 + +#if defined (__AVX2__) + +// SHA-512 4 way + +typedef struct { + __m256i buf[128>>3]; + __m256i val[8]; + uint64_t count; + bool initialized; +} sha512_4way_context __attribute__ ((aligned (64))); + +void sha512_4way_init( sha512_4way_context *sc); +void sha512_4way_update( sha512_4way_context *sc, const void *data, + size_t len ); +void sha512_4way_close( sha512_4way_context *sc, void *dst ); +void sha512_4way_full( void *dst, const void *data, size_t len ); + +#endif // AVX2 + +#endif diff --git a/algo/sha/sha512256d-4way.c b/algo/sha/sha512256d-4way.c index 68218c41..72129b07 100644 --- a/algo/sha/sha512256d-4way.c +++ b/algo/sha/sha512256d-4way.c @@ -1,5 +1,6 @@ #include "algo-gate-api.h" -#include "sha-hash-4way.h" +#include "sha256-hash.h" +#include "sha512-hash.h" #include #include diff --git a/algo/sha/sph_sha2.h b/algo/sha/sph_sha2.h index ab05423e..bd001960 100644 --- a/algo/sha/sph_sha2.h +++ b/algo/sha/sph_sha2.h @@ -41,7 +41,7 @@ #define SPH_SHA2_H__ #include -#include "sph_types.h" +#include "compat/sph_types.h" /** * Output size (in bits) for SHA-224. 
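
As a usage note for the new sha512-hash.h interface above: the 4-way and 8-way contexts follow the same init/update/close pattern as the other interleaved contexts in this codebase, with input and output in the interleaved lane layout (4x64 for the AVX2 variant). A minimal sketch, with hypothetical names and assuming AVX2:

#include <stddef.h>
#include "sha512-hash.h"

#if defined(__AVX2__)
// Hash four messages in parallel; 'data' holds the four inputs
// interleaved 64 bits at a time, 'len' is the per-lane length in bytes,
// and 'dst' receives the four interleaved 64-byte digests.
static void sha512_4way_sketch( void *dst, const void *data, size_t len )
{
   sha512_4way_context ctx;
   sha512_4way_init( &ctx );
   sha512_4way_update( &ctx, data, len );
   sha512_4way_close( &ctx, dst );
   // one-shot equivalent: sha512_4way_full( dst, data, len );
}
#endif
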
diff --git a/algo/shabal/shabal-hash-4way.c b/algo/shabal/shabal-hash-4way.c index 7cab8215..cd77fc6b 100644 --- a/algo/shabal/shabal-hash-4way.c +++ b/algo/shabal/shabal-hash-4way.c @@ -58,7 +58,7 @@ extern "C"{ M8, M9, MA, MB, MC, MD, ME, MF; \ const __m256i FIVE = _mm256_set1_epi32( 5 ); \ const __m256i THREE = _mm256_set1_epi32( 3 ); \ - sph_u32 Wlow, Whigh; + uint32_t Wlow, Whigh; #define READ_STATE8(state) do \ { \ @@ -653,7 +653,7 @@ shabal512_8way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) M8, M9, MA, MB, MC, MD, ME, MF; \ const __m128i FIVE = _mm_set1_epi32( 5 ); \ const __m128i THREE = _mm_set1_epi32( 3 ); \ - sph_u32 Wlow, Whigh; + uint32_t Wlow, Whigh; #define READ_STATE(state) do \ { \ diff --git a/algo/shabal/shabal-hash-4way.h b/algo/shabal/shabal-hash-4way.h index 550a3c6f..cec80fe7 100644 --- a/algo/shabal/shabal-hash-4way.h +++ b/algo/shabal/shabal-hash-4way.h @@ -1,51 +1,11 @@ -/* $Id: sph_shabal.h 175 2010-05-07 16:03:20Z tp $ */ -/** - * Shabal interface. Shabal is a family of functions which differ by - * their output size; this implementation defines Shabal for output - * sizes 192, 224, 256, 384 and 512 bits. - * - * ==========================(LICENSE BEGIN)============================ - * - * Copyright (c) 2007-2010 Projet RNRT SAPHIR - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- * - * ===========================(LICENSE END)============================= - * - * @file sph_shabal.h - * @author Thomas Pornin - */ - #ifndef SHABAL_HASH_4WAY_H__ #define SHABAL_HASH_4WAY_H__ 1 #ifdef __SSE4_1__ #include -#include "algo/sha/sph_types.h" #include "simd-utils.h" -#ifdef __cplusplus -extern "C"{ -#endif - #define SPH_SIZE_shabal256 256 #define SPH_SIZE_shabal512 512 @@ -55,7 +15,7 @@ extern "C"{ typedef struct { __m256i buf[16]; __m256i A[12], B[16], C[16]; - sph_u32 Whigh, Wlow; + uint32_t Whigh, Wlow; size_t ptr; bool state_loaded; } shabal_8way_context __attribute__ ((aligned (64))); @@ -80,7 +40,7 @@ void shabal512_8way_addbits_and_close( void *cc, unsigned ub, unsigned n, typedef struct { __m128i buf[16] __attribute__ ((aligned (64))); __m128i A[12], B[16], C[16]; - sph_u32 Whigh, Wlow; + uint32_t Whigh, Wlow; size_t ptr; bool state_loaded; } shabal_4way_context; @@ -100,10 +60,6 @@ void shabal512_4way_close( void *cc, void *dst ); void shabal512_4way_addbits_and_close( void *cc, unsigned ub, unsigned n, void *dst ); -#ifdef __cplusplus -} -#endif - #endif #endif diff --git a/algo/shabal/sph_shabal.h b/algo/shabal/sph_shabal.h index 4d230fb9..c743ee8b 100644 --- a/algo/shabal/sph_shabal.h +++ b/algo/shabal/sph_shabal.h @@ -37,7 +37,7 @@ #define SPH_SHABAL_H__ #include -#include "algo/sha/sph_types.h" +#include "compat/sph_types.h" #ifdef __cplusplus extern "C"{ #endif diff --git a/algo/shavite/shavite-hash-2way.c b/algo/shavite/shavite-hash-2way.c index b96a393a..26a9ab25 100644 --- a/algo/shavite/shavite-hash-2way.c +++ b/algo/shavite/shavite-hash-2way.c @@ -1,6 +1,4 @@ #include "shavite-hash-2way.h" -#include "algo/sha/sph_types.h" - #include // This is a fake, it actually does not do parallel AES, that requires VAES. diff --git a/algo/shavite/sph_shavite.c b/algo/shavite/sph_shavite.c index 3d7c8286..728a273a 100644 --- a/algo/shavite/sph_shavite.c +++ b/algo/shavite/sph_shavite.c @@ -64,7 +64,7 @@ extern "C"{ */ #define AES_BIG_ENDIAN 0 -#include "algo/sha/aes_helper.c" +#include "compat/aes_helper.c" static const sph_u32 IV224[] = { C32(0x6774F31C), C32(0x990AE210), C32(0xC87D4274), C32(0xC9546371), diff --git a/algo/shavite/sph_shavite.h b/algo/shavite/sph_shavite.h index f30f4dfb..c470e6db 100644 --- a/algo/shavite/sph_shavite.h +++ b/algo/shavite/sph_shavite.h @@ -39,7 +39,7 @@ #define SPH_SHAVITE_H__ #include -#include "algo/sha/sph_types.h" +#include "compat/sph_types.h" #ifdef __cplusplus extern "C"{ diff --git a/algo/simd/nist.h b/algo/simd/nist.h index b4737ffb..052bf71e 100644 --- a/algo/simd/nist.h +++ b/algo/simd/nist.h @@ -9,7 +9,7 @@ #endif #include "simd-compat.h" -#include "algo/sha/sha3-defs.h" +#include "compat/sha3-defs.h" /* * NIST API Specific types. */ diff --git a/algo/simd/simd-compat.h b/algo/simd/simd-compat.h index 721ab906..1c2b379a 100644 --- a/algo/simd/simd-compat.h +++ b/algo/simd/simd-compat.h @@ -24,7 +24,7 @@ */ #include -#include "algo/sha/brg_types.h" +#include "compat/brg_types.h" #define C32(x) ((u32)(x)) diff --git a/algo/simd/sph_simd.h b/algo/simd/sph_simd.h index 2c6b7bf1..3397c8c8 100644 --- a/algo/simd/sph_simd.h +++ b/algo/simd/sph_simd.h @@ -41,7 +41,7 @@ extern "C"{ #endif #include -#include "algo/sha/sph_types.h" +#include "compat/sph_types.h" /** * Output size (in bits) for SIMD-224. 
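The sph_u32 fields replaced above hold Shabal's block counter W, a 64-bit count
kept as two 32-bit halves; sph_u32 is an exact 32-bit unsigned type, so
uint32_t is a drop-in replacement and the carry between halves is still handled
explicitly. A hedged sketch of that logic (the helper name is illustrative, not
the macro actually used in the file):

#include <stdint.h>

// Shabal counts processed message blocks in a 64 bit counter split into
// two 32 bit words; the low word wraps and carries into the high word.
static inline void shabal_incr_w( uint32_t *Wlow, uint32_t *Whigh )
{
   *Wlow += 1;           // unsigned wraparound is well defined in C
   if ( *Wlow == 0 )     // carry out of the low half
      *Whigh += 1;
}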
diff --git a/algo/skein/skein-4way.c b/algo/skein/skein-4way.c index 4e781681..b66c6c14 100644 --- a/algo/skein/skein-4way.c +++ b/algo/skein/skein-4way.c @@ -2,7 +2,6 @@ #include #include #include "skein-hash-4way.h" -#include "algo/sha/sha-hash-4way.h" #include "algo/sha/sha256-hash.h" #if defined (SKEIN_8WAY) diff --git a/algo/skein/skein-gate.c b/algo/skein/skein-gate.c index 7adeac9f..191aa154 100644 --- a/algo/skein/skein-gate.c +++ b/algo/skein/skein-gate.c @@ -1,5 +1,4 @@ #include "skein-gate.h" -#include "sph_skein.h" #include "skein-hash-4way.h" bool register_skein_algo( algo_gate_t* gate ) diff --git a/algo/skein/sph_skein.h b/algo/skein/sph_skein.h index 2ba7e334..0b9ba5d0 100644 --- a/algo/skein/sph_skein.h +++ b/algo/skein/sph_skein.h @@ -46,7 +46,7 @@ extern "C"{ #endif #include -#include "algo/sha/sph_types.h" +#include "compat/sph_types.h" #if SPH_64 diff --git a/algo/tiger/sph_tiger.h b/algo/tiger/sph_tiger.h index 6461b475..8107c091 100644 --- a/algo/tiger/sph_tiger.h +++ b/algo/tiger/sph_tiger.h @@ -45,7 +45,7 @@ #define SPH_TIGER_H__ #include -#include "algo/sha/sph_types.h" +#include "compat/sph_types.h" #if SPH_64 diff --git a/algo/whirlpool/sph_whirlpool.h b/algo/whirlpool/sph_whirlpool.h index 801a9f92..10a21f36 100644 --- a/algo/whirlpool/sph_whirlpool.h +++ b/algo/whirlpool/sph_whirlpool.h @@ -49,7 +49,7 @@ #define SPH_WHIRLPOOL_H__ #include -#include "algo/sha/sph_types.h" +#include "compat/sph_types.h" #if SPH_64 diff --git a/algo/x11/x11-4way.c b/algo/x11/x11-4way.c index fada82f6..75e6c0db 100644 --- a/algo/x11/x11-4way.c +++ b/algo/x11/x11-4way.c @@ -65,6 +65,9 @@ void init_x11_8way_ctx() #endif } +static __thread __m512i x11_8way_midstate[16] __attribute__((aligned(64))); + + void x11_8way_hash( void *state, const void *input ) { uint64_t vhash[8*8] __attribute__ ((aligned (128))); @@ -80,8 +83,9 @@ void x11_8way_hash( void *state, const void *input ) uint64_t hash7[8] __attribute__ ((aligned (64))); x11_8way_ctx_holder ctx; memcpy( &ctx, &x11_8way_ctx, sizeof(x11_8way_ctx) ); - blake512_8way_update( &ctx.blake, input, 80 ); - blake512_8way_close( &ctx.blake, vhash ); + + blake512_8way_final_le( &ctx.blake, vhash, casti_m512i( input, 9 ), + x11_8way_midstate ); bmw512_8way_update( &ctx.bmw, vhash, 64 ); bmw512_8way_close( &ctx.bmw, vhash ); @@ -252,39 +256,45 @@ void x11_8way_hash( void *state, const void *input ) int scanhash_x11_8way( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ) { - uint32_t hash[8*8] __attribute__ ((aligned (128))); - uint32_t vdata[24*8] __attribute__ ((aligned (64))); - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; - uint32_t n = pdata[19]; - const uint32_t first_nonce = pdata[19]; - int thr_id = mythr->id; - __m512i *noncev = (__m512i*)vdata + 9; // aligned - const uint32_t Htarg = ptarget[7]; - - const uint32_t last_nonce = max_nonce -8; - mm512_bswap32_intrlv80_8x64( vdata, pdata ); - - do - { - *noncev = mm512_intrlv_blend_32( mm512_bswap_32( - _mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0, - n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev ); - - x11_8way_hash( hash, vdata ); - pdata[19] = n; - - for ( int i = 0; i < 8; i++ ) - if ( ( hash+(i<<3) )[7] <= Htarg - && fulltest( hash+(i<<3), ptarget ) && !opt_benchmark ) - { - pdata[19] = n+i; - submit_solution( work, hash+(i<<3), mythr ); - } - n += 8; - } while ( ( n < last_nonce ) && !work_restart[thr_id].restart ); - *hashes_done = n - first_nonce; - return 0; + uint32_t hash[8*8] __attribute__ ((aligned (128))); + uint32_t 
vdata[20*8] __attribute__ ((aligned (64))); + __m128i edata[5] __attribute__ ((aligned (64))); + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + uint32_t n = pdata[19]; + const uint32_t first_nonce = pdata[19]; + int thr_id = mythr->id; + __m512i *noncev = (__m512i*)vdata + 9; + const uint32_t last_nonce = max_nonce -8; + const __m512i eight = _mm512_set1_epi64( 8 ); + + // convert LE32 to LE64 + edata[0] = mm128_swap64_32( casti_m128i( pdata, 0 ) ); + edata[1] = mm128_swap64_32( casti_m128i( pdata, 1 ) ); + edata[2] = mm128_swap64_32( casti_m128i( pdata, 2 ) ); + edata[3] = mm128_swap64_32( casti_m128i( pdata, 3 ) ); + edata[4] = mm128_swap64_32( casti_m128i( pdata, 4 ) ); + + mm512_intrlv80_8x64( vdata, edata ); + *noncev = _mm512_add_epi32( *noncev, _mm512_set_epi32( + 0,7, 0,6, 0,5, 0,4, 0,3, 0,2, 0,1, 0,0 ) ); + blake512_8way_prehash_le( &x11_8way_ctx.blake, x11_8way_midstate, vdata ); + + do + { + x11_8way_hash( hash, vdata ); + + for ( int i = 0; i < 8; i++ ) + if ( unlikely( valid_hash( hash+(i<<3), ptarget ) && !opt_benchmark )) + { + pdata[19] = n+i; + submit_solution( work, hash+(i<<3), mythr ); + } + *noncev = _mm512_add_epi32( *noncev, eight ); + n += 8; + } while ( ( n < last_nonce ) && !work_restart[thr_id].restart ); + *hashes_done = n - first_nonce; + return 0; } diff --git a/algo/x16/x16r-gate.c b/algo/x16/x16r-gate.c index c498ff7a..19ba317f 100644 --- a/algo/x16/x16r-gate.c +++ b/algo/x16/x16r-gate.c @@ -263,7 +263,7 @@ bool register_hex_algo( algo_gate_t* gate ) gate->scanhash = (void*)&scanhash_hex; gate->hash = (void*)&x16r_hash; gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT; - gate->gen_merkle_root = (void*)&SHA256_gen_merkle_root; + gate->gen_merkle_root = (void*)&sha256_gen_merkle_root; opt_target_factor = 128.0; return true; }; diff --git a/algo/x16/x16r-gate.h b/algo/x16/x16r-gate.h index 76ca5e7e..be425c41 100644 --- a/algo/x16/x16r-gate.h +++ b/algo/x16/x16r-gate.h @@ -20,7 +20,7 @@ #include "algo/fugue/sph_fugue.h" #include "algo/shabal/sph_shabal.h" #include "algo/whirlpool/sph_whirlpool.h" -#include "algo/sha/sph_sha2.h" +#include "algo/sha/sha512-hash.h" #if defined(__AES__) #include "algo/echo/aes_ni/hash_api.h" @@ -42,7 +42,6 @@ #include "algo/echo/aes_ni/hash_api.h" #include "algo/hamsi/hamsi-hash-4way.h" #include "algo/shabal/shabal-hash-4way.h" -#include "algo/sha/sha-hash-4way.h" #if defined(__VAES__) #include "algo/groestl/groestl512-hash-4way.h" diff --git a/algo/x16/x21s-4way.c b/algo/x16/x21s-4way.c index 103bc636..74eddd52 100644 --- a/algo/x16/x21s-4way.c +++ b/algo/x16/x21s-4way.c @@ -12,9 +12,7 @@ #include "algo/tiger/sph_tiger.h" #include "algo/gost/sph_gost.h" #include "algo/lyra2/lyra2.h" -#if defined(__SHA__) - #include "algo/sha/sha256-hash.h" -#endif +#include "algo/sha/sha256-hash.h" #if defined (X21S_8WAY) diff --git a/algo/x17/sonoa-4way.c b/algo/x17/sonoa-4way.c index d29db77f..203cd2ac 100644 --- a/algo/x17/sonoa-4way.c +++ b/algo/x17/sonoa-4way.c @@ -20,7 +20,7 @@ #include "algo/shabal/shabal-hash-4way.h" #include "algo/whirlpool/sph_whirlpool.h" #include "algo/haval/haval-hash-4way.h" -#include "algo/sha/sha-hash-4way.h" +#include "algo/sha/sha512-hash.h" #if defined(__VAES__) #include "algo/groestl/groestl512-hash-4way.h" #include "algo/shavite/shavite-hash-4way.h" diff --git a/algo/x17/x17-4way.c b/algo/x17/x17-4way.c index e84bad42..5b6a7f73 100644 --- a/algo/x17/x17-4way.c +++ b/algo/x17/x17-4way.c @@ -25,7 +25,7 @@ #include "algo/shabal/shabal-hash-4way.h" #include 
"algo/whirlpool/sph_whirlpool.h" #include "algo/haval/haval-hash-4way.h" -#include "algo/sha/sha-hash-4way.h" +#include "algo/sha/sha512-hash.h" #if defined(X17_8WAY) @@ -37,7 +37,6 @@ union _x17_8way_context_overlay jh512_8way_context jh; keccak512_8way_context keccak; luffa_4way_context luffa; -// cube_4way_context cube; cube_4way_2buf_context cube; #if defined(__VAES__) groestl512_4way_context groestl; @@ -190,7 +189,6 @@ int x17_8way_hash( void *state, const void *input, int thr_id ) hamsi512_8way_init( &ctx.hamsi ); hamsi512_8way_update( &ctx.hamsi, vhash, 64 ); hamsi512_8way_close( &ctx.hamsi, vhash ); - dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, vhash ); diff --git a/algo/x17/xevan-4way.c b/algo/x17/xevan-4way.c index 54e3051c..3566e7fd 100644 --- a/algo/x17/xevan-4way.c +++ b/algo/x17/xevan-4way.c @@ -19,7 +19,7 @@ #include "algo/fugue/fugue-aesni.h" #include "algo/shabal/shabal-hash-4way.h" #include "algo/whirlpool/sph_whirlpool.h" -#include "algo/sha/sha-hash-4way.h" +#include "algo/sha/sha512-hash.h" #include "algo/haval/haval-hash-4way.h" #if defined(__VAES__) #include "algo/groestl/groestl512-hash-4way.h" diff --git a/algo/x22/x22i-4way.c b/algo/x22/x22i-4way.c index e94cb1c8..40e60217 100644 --- a/algo/x22/x22i-4way.c +++ b/algo/x22/x22i-4way.c @@ -16,7 +16,8 @@ #include "algo/fugue/fugue-aesni.h" #include "algo/shabal/shabal-hash-4way.h" #include "algo/whirlpool/sph_whirlpool.h" -#include "algo/sha/sha-hash-4way.h" +#include "algo/sha/sha512-hash.h" +#include "algo/sha/sha256-hash.h" #include "algo/haval/haval-hash-4way.h" #include "algo/tiger/sph_tiger.h" #include "algo/lyra2/lyra2.h" @@ -26,9 +27,6 @@ #include "algo/shavite/shavite-hash-4way.h" #include "algo/echo/echo-hash-4way.h" #endif -#if defined(__SHA__) - #include "algo/sha/sha256-hash.h" -#endif #if defined(X22I_8WAY) diff --git a/algo/x22/x25x-4way.c b/algo/x22/x25x-4way.c index 76191219..07445591 100644 --- a/algo/x22/x25x-4way.c +++ b/algo/x22/x25x-4way.c @@ -6,7 +6,8 @@ #include "algo/keccak/keccak-hash-4way.h" #include "algo/hamsi/hamsi-hash-4way.h" #include "algo/shabal/shabal-hash-4way.h" -#include "algo/sha/sha-hash-4way.h" +#include "algo/sha/sha512-hash.h" +#include "algo/sha/sha256-hash.h" #include "algo/haval/haval-hash-4way.h" #include "algo/blake/blake2s-hash-4way.h" #include "algo/echo/aes_ni/hash_api.h" @@ -31,9 +32,6 @@ #include "algo/shavite/shavite-hash-4way.h" #include "algo/echo/echo-hash-4way.h" #endif -#if defined(__SHA__) - #include "algo/sha/sha256-hash.h" -#endif void x25x_shuffle( void *hash ) { diff --git a/asm/aesb-x64.S b/asm/aesb-x64.S deleted file mode 100644 index 75c04200..00000000 --- a/asm/aesb-x64.S +++ /dev/null @@ -1,72 +0,0 @@ -#include - -#if defined(__linux__) && defined(__ELF__) - .section .note.GNU-stack,"",%progbits -#endif - - .text - .p2align 6 - .globl fast_aesb_single_round - .globl _fast_aesb_single_round -fast_aesb_single_round: -_fast_aesb_single_round: -#if defined(_WIN64) || defined(__CYGWIN__) - movdqa (%rcx), %xmm1 - aesenc (%r8), %xmm1 - movdqa %xmm1, (%rdx) -#else - movdqa (%rdi), %xmm1 - aesenc (%rdx), %xmm1 - movdqa %xmm1, (%rsi) -#endif - ret - - .text - .p2align 6 - .globl fast_aesb_pseudo_round_mut - .globl _fast_aesb_pseudo_round_mut -fast_aesb_pseudo_round_mut: -_fast_aesb_pseudo_round_mut: -#if defined(_WIN64) || defined(__CYGWIN__) - mov %rdx, %r9 - add $0xA0, %r9 - movdqa (%rcx), %xmm1 - - .LOOP: - aesenc (%rdx), %xmm1 - add $0x10, %rdx - cmp %r9, %rdx - jl .LOOP - - movdqa %xmm1, (%rcx) -#else - mov %rsi, %r9 - add 
$0xA0, %r9 - movdqa (%rdi), %xmm1 - - .LOOP: - aesenc (%rsi), %xmm1 - add $0x10, %rsi - cmp %r9, %rsi - jl .LOOP - - movdqa %xmm1, (%rdi) -#endif - ret - - .text - .globl mul128 - .globl _mul128 -mul128: -_mul128: -#if defined(_WIN64) || defined(__CYGWIN__) - mov %rcx, %rax - mul %rdx - mov %rdx, (%r8) -#else - mov %rdx, %r8 - mov %rdi, %rax - mul %rsi - mov %rdx, (%r8) -#endif - ret diff --git a/asm/aesb-x86.S b/asm/aesb-x86.S deleted file mode 100644 index ab3d1eab..00000000 --- a/asm/aesb-x86.S +++ /dev/null @@ -1,21 +0,0 @@ -#include - -#if defined(__linux__) && defined(__ELF__) - .section .note.GNU-stack,"",%progbits -#endif - - .text - .p2align 6 - .globl fast_aesb_single_round - .globl _fast_aesb_single_round -fast_aesb_single_round: -_fast_aesb_single_round: - ret - - .text - .p2align 6 - .globl fast_aesb_pseudo_round_mut - .globl _fast_aesb_pseudo_round_mut -fast_aesb_pseudo_round_mut: -_fast_aesb_pseudo_round_mut: - ret diff --git a/comp.log b/comp.log deleted file mode 100644 index 096dbf73..00000000 --- a/comp.log +++ /dev/null @@ -1,50 +0,0 @@ -make all-recursive -make[1]: Entering directory `/home/coin/devel/cpuminer/cpuminer/3.1.10/cpuminer-opt-3.1.10-dev' -Making all in compat -make[2]: Entering directory `/home/coin/devel/cpuminer/cpuminer/3.1.10/cpuminer-opt-3.1.10-dev/compat' -Making all in jansson -make[3]: Entering directory `/home/coin/devel/cpuminer/cpuminer/3.1.10/cpuminer-opt-3.1.10-dev/compat/jansson' -make[3]: Nothing to be done for `all'. -make[3]: Leaving directory `/home/coin/devel/cpuminer/cpuminer/3.1.10/cpuminer-opt-3.1.10-dev/compat/jansson' -make[3]: Entering directory `/home/coin/devel/cpuminer/cpuminer/3.1.10/cpuminer-opt-3.1.10-dev/compat' -make[3]: Nothing to be done for `all-am'. -make[3]: Leaving directory `/home/coin/devel/cpuminer/cpuminer/3.1.10/cpuminer-opt-3.1.10-dev/compat' -make[2]: Leaving directory `/home/coin/devel/cpuminer/cpuminer/3.1.10/cpuminer-opt-3.1.10-dev/compat' -make[2]: Entering directory `/home/coin/devel/cpuminer/cpuminer/3.1.10/cpuminer-opt-3.1.10-dev' -gcc -std=gnu99 -DHAVE_CONFIG_H -I. -Iyes/include -Iyes/include -fno-strict-aliasing -I./compat/jansson -I. -Iyes/include -Iyes/include -Wno-pointer-sign -Wno-pointer-to-int-cast -O3 -march=native -Iyes/include -Iyes/include -MT cpuminer-cpu-miner.o -MD -MP -MF .deps/cpuminer-cpu-miner.Tpo -c -o cpuminer-cpu-miner.o `test -f 'cpu-miner.c' || echo './'`cpu-miner.c -mv -f .deps/cpuminer-cpu-miner.Tpo .deps/cpuminer-cpu-miner.Po -gcc -std=gnu99 -DHAVE_CONFIG_H -I. -Iyes/include -Iyes/include -fno-strict-aliasing -I./compat/jansson -I. -Iyes/include -Iyes/include -Wno-pointer-sign -Wno-pointer-to-int-cast -O3 -march=native -Iyes/include -Iyes/include -MT cpuminer-util.o -MD -MP -MF .deps/cpuminer-util.Tpo -c -o cpuminer-util.o `test -f 'util.c' || echo './'`util.c -mv -f .deps/cpuminer-util.Tpo .deps/cpuminer-util.Po -gcc -std=gnu99 -DHAVE_CONFIG_H -I. -Iyes/include -Iyes/include -fno-strict-aliasing -I./compat/jansson -I. -Iyes/include -Iyes/include -Wno-pointer-sign -Wno-pointer-to-int-cast -O3 -march=native -Iyes/include -Iyes/include -MT cpuminer-algo-gate-api.o -MD -MP -MF .deps/cpuminer-algo-gate-api.Tpo -c -o cpuminer-algo-gate-api.o `test -f 'algo-gate-api.c' || echo './'`algo-gate-api.c -mv -f .deps/cpuminer-algo-gate-api.Tpo .deps/cpuminer-algo-gate-api.Po -gcc -std=gnu99 -DHAVE_CONFIG_H -I. -Iyes/include -Iyes/include -fno-strict-aliasing -I./compat/jansson -I. 
-Iyes/include -Iyes/include -Wno-pointer-sign -Wno-pointer-to-int-cast -O3 -march=native -Iyes/include -Iyes/include -MT algo/shavite/cpuminer-shavite.o -MD -MP -MF algo/shavite/.deps/cpuminer-shavite.Tpo -c -o algo/shavite/cpuminer-shavite.o `test -f 'algo/shavite/shavite.c' || echo './'`algo/shavite/shavite.c -mv -f algo/shavite/.deps/cpuminer-shavite.Tpo algo/shavite/.deps/cpuminer-shavite.Po -gcc -std=gnu99 -DHAVE_CONFIG_H -I. -Iyes/include -Iyes/include -fno-strict-aliasing -I./compat/jansson -I. -Iyes/include -Iyes/include -Wno-pointer-sign -Wno-pointer-to-int-cast -O3 -march=native -Iyes/include -Iyes/include -MT algo/keccak/cpuminer-keccak.o -MD -MP -MF algo/keccak/.deps/cpuminer-keccak.Tpo -c -o algo/keccak/cpuminer-keccak.o `test -f 'algo/keccak/keccak.c' || echo './'`algo/keccak/keccak.c -mv -f algo/keccak/.deps/cpuminer-keccak.Tpo algo/keccak/.deps/cpuminer-keccak.Po -gcc -std=gnu99 -DHAVE_CONFIG_H -I. -Iyes/include -Iyes/include -fno-strict-aliasing -I./compat/jansson -I. -Iyes/include -Iyes/include -Wno-pointer-sign -Wno-pointer-to-int-cast -O3 -march=native -Iyes/include -Iyes/include -MT algo/cpuminer-axiom.o -MD -MP -MF algo/.deps/cpuminer-axiom.Tpo -c -o algo/cpuminer-axiom.o `test -f 'algo/axiom.c' || echo './'`algo/axiom.c -mv -f algo/.deps/cpuminer-axiom.Tpo algo/.deps/cpuminer-axiom.Po -gcc -std=gnu99 -DHAVE_CONFIG_H -I. -Iyes/include -Iyes/include -fno-strict-aliasing -I./compat/jansson -I. -Iyes/include -Iyes/include -Wno-pointer-sign -Wno-pointer-to-int-cast -O3 -march=native -Iyes/include -Iyes/include -MT algo/blake/cpuminer-blake.o -MD -MP -MF algo/blake/.deps/cpuminer-blake.Tpo -c -o algo/blake/cpuminer-blake.o `test -f 'algo/blake/blake.c' || echo './'`algo/blake/blake.c -mv -f algo/blake/.deps/cpuminer-blake.Tpo algo/blake/.deps/cpuminer-blake.Po -gcc -std=gnu99 -DHAVE_CONFIG_H -I. -Iyes/include -Iyes/include -fno-strict-aliasing -I./compat/jansson -I. -Iyes/include -Iyes/include -Wno-pointer-sign -Wno-pointer-to-int-cast -O3 -march=native -Iyes/include -Iyes/include -MT algo/blake/cpuminer-blake2.o -MD -MP -MF algo/blake/.deps/cpuminer-blake2.Tpo -c -o algo/blake/cpuminer-blake2.o `test -f 'algo/blake/blake2.c' || echo './'`algo/blake/blake2.c -mv -f algo/blake/.deps/cpuminer-blake2.Tpo algo/blake/.deps/cpuminer-blake2.Po -gcc -std=gnu99 -DHAVE_CONFIG_H -I. -Iyes/include -Iyes/include -fno-strict-aliasing -I./compat/jansson -I. -Iyes/include -Iyes/include -Wno-pointer-sign -Wno-pointer-to-int-cast -O3 -march=native -Iyes/include -Iyes/include -MT algo/blake/cpuminer-blakecoin.o -MD -MP -MF algo/blake/.deps/cpuminer-blakecoin.Tpo -c -o algo/blake/cpuminer-blakecoin.o `test -f 'algo/blake/blakecoin.c' || echo './'`algo/blake/blakecoin.c -mv -f algo/blake/.deps/cpuminer-blakecoin.Tpo algo/blake/.deps/cpuminer-blakecoin.Po -gcc -std=gnu99 -DHAVE_CONFIG_H -I. -Iyes/include -Iyes/include -fno-strict-aliasing -I./compat/jansson -I. -Iyes/include -Iyes/include -Wno-pointer-sign -Wno-pointer-to-int-cast -O3 -march=native -Iyes/include -Iyes/include -MT algo/blake/cpuminer-decred.o -MD -MP -MF algo/blake/.deps/cpuminer-decred.Tpo -c -o algo/blake/cpuminer-decred.o `test -f 'algo/blake/decred.c' || echo './'`algo/blake/decred.c -mv -f algo/blake/.deps/cpuminer-decred.Tpo algo/blake/.deps/cpuminer-decred.Po -gcc -std=gnu99 -DHAVE_CONFIG_H -I. -Iyes/include -Iyes/include -fno-strict-aliasing -I./compat/jansson -I. 
-Iyes/include -Iyes/include -Wno-pointer-sign -Wno-pointer-to-int-cast -O3 -march=native -Iyes/include -Iyes/include -MT algo/blake/cpuminer-pentablake.o -MD -MP -MF algo/blake/.deps/cpuminer-pentablake.Tpo -c -o algo/blake/cpuminer-pentablake.o `test -f 'algo/blake/pentablake.c' || echo './'`algo/blake/pentablake.c -mv -f algo/blake/.deps/cpuminer-pentablake.Tpo algo/blake/.deps/cpuminer-pentablake.Po -gcc -std=gnu99 -DHAVE_CONFIG_H -I. -Iyes/include -Iyes/include -fno-strict-aliasing -I./compat/jansson -I. -Iyes/include -Iyes/include -Wno-pointer-sign -Wno-pointer-to-int-cast -O3 -march=native -Iyes/include -Iyes/include -MT algo/bmw/cpuminer-bmw256.o -MD -MP -MF algo/bmw/.deps/cpuminer-bmw256.Tpo -c -o algo/bmw/cpuminer-bmw256.o `test -f 'algo/bmw/bmw256.c' || echo './'`algo/bmw/bmw256.c -mv -f algo/bmw/.deps/cpuminer-bmw256.Tpo algo/bmw/.deps/cpuminer-bmw256.Po -gcc -std=gnu99 -DHAVE_CONFIG_H -I. -Iyes/include -Iyes/include -fno-strict-aliasing -I./compat/jansson -I. -Iyes/include -Iyes/include -Wno-pointer-sign -Wno-pointer-to-int-cast -O3 -march=native -Iyes/include -Iyes/include -MT algo/cpuminer-c11.o -MD -MP -MF algo/.deps/cpuminer-c11.Tpo -c -o algo/cpuminer-c11.o `test -f 'algo/c11.c' || echo './'`algo/c11.c -mv -f algo/.deps/cpuminer-c11.Tpo algo/.deps/cpuminer-c11.Po -gcc -std=gnu99 -DHAVE_CONFIG_H -I. -Iyes/include -Iyes/include -fno-strict-aliasing -I./compat/jansson -I. -Iyes/include -Iyes/include -Wno-pointer-sign -Wno-pointer-to-int-cast -O3 -march=native -Iyes/include -Iyes/include -MT algo/cpuminer-cryptolight.o -MD -MP -MF algo/.deps/cpuminer-cryptolight.Tpo -c -o algo/cpuminer-cryptolight.o `test -f 'algo/cryptolight.c' || echo './'`algo/cryptolight.c -mv -f algo/.deps/cpuminer-cryptolight.Tpo algo/.deps/cpuminer-cryptolight.Po -gcc -std=gnu99 -DHAVE_CONFIG_H -I. -Iyes/include -Iyes/include -fno-strict-aliasing -I./compat/jansson -I. -Iyes/include -Iyes/include -Wno-pointer-sign -Wno-pointer-to-int-cast -O3 -march=native -Iyes/include -Iyes/include -MT algo/cryptonight/cpuminer-cryptonight-common.o -MD -MP -MF algo/cryptonight/.deps/cpuminer-cryptonight-common.Tpo -c -o algo/cryptonight/cpuminer-cryptonight-common.o `test -f 'algo/cryptonight/cryptonight-common.c' || echo './'`algo/cryptonight/cryptonight-common.c -mv -f algo/cryptonight/.deps/cpuminer-cryptonight-common.Tpo algo/cryptonight/.deps/cpuminer-cryptonight-common.Po -gcc -std=gnu99 -DHAVE_CONFIG_H -I. -Iyes/include -Iyes/include -fno-strict-aliasing -I./compat/jansson -I. -Iyes/include -Iyes/include -Wno-pointer-sign -Wno-pointer-to-int-cast -O3 -march=native -Iyes/include -Iyes/include -MT algo/cpuminer-drop.o -MD -MP -MF algo/.deps/cpuminer-drop.Tpo -c -o algo/cpuminer-drop.o `test -f 'algo/drop.c' || echo './'`algo/drop.c -mv -f algo/.deps/cpuminer-drop.Tpo algo/.deps/cpuminer-drop.Po -gcc -std=gnu99 -DHAVE_CONFIG_H -I. -Iyes/include -Iyes/include -fno-strict-aliasing -I./compat/jansson -I. -Iyes/include -Iyes/include -Wno-pointer-sign -Wno-pointer-to-int-cast -O3 -march=native -Iyes/include -Iyes/include -MT algo/cpuminer-fresh.o -MD -MP -MF algo/.deps/cpuminer-fresh.Tpo -c -o algo/cpuminer-fresh.o `test -f 'algo/fresh.c' || echo './'`algo/fresh.c -mv -f algo/.deps/cpuminer-fresh.Tpo algo/.deps/cpuminer-fresh.Po -gcc -std=gnu99 -DHAVE_CONFIG_H -I. -Iyes/include -Iyes/include -fno-strict-aliasing -I./compat/jansson -I. 
-Iyes/include -Iyes/include -Wno-pointer-sign -Wno-pointer-to-int-cast -O3 -march=native -Iyes/include -Iyes/include -MT algo/groestl/cpuminer-groestl.o -MD -MP -MF algo/groestl/.deps/cpuminer-groestl.Tpo -c -o algo/groestl/cpuminer-groestl.o `test -f 'algo/groestl/groestl.c' || echo './'`algo/groestl/groestl.c -make[2]: Leaving directory `/home/coin/devel/cpuminer/cpuminer/3.1.10/cpuminer-opt-3.1.10-dev' -make[1]: Leaving directory `/home/coin/devel/cpuminer/cpuminer/3.1.10/cpuminer-opt-3.1.10-dev' diff --git a/algo/sha/aes_helper.c b/compat/aes_helper.c similarity index 98% rename from algo/sha/aes_helper.c rename to compat/aes_helper.c index 75b7cc69..30063440 100644 --- a/algo/sha/aes_helper.c +++ b/compat/aes_helper.c @@ -43,16 +43,15 @@ * @author Thomas Pornin */ -#include "sph_types.h" #ifdef __cplusplus extern "C"{ #endif #if AES_BIG_ENDIAN -#define AESx(x) ( ((SPH_C32(x) >> 24) & SPH_C32(0x000000FF)) \ - | ((SPH_C32(x) >> 8) & SPH_C32(0x0000FF00)) \ - | ((SPH_C32(x) << 8) & SPH_C32(0x00FF0000)) \ - | ((SPH_C32(x) << 24) & SPH_C32(0xFF000000))) +#define AESx(x) ( (((x) >> 24) & 0x000000FF) \ + | (((x) >> 8) & 0x0000FF00) \ + | (((x) << 8) & 0x00FF0000) \ + | (((x) << 24) & 0xFF000000)) #define AES0 AES0_BE #define AES1 AES1_BE @@ -83,7 +82,7 @@ extern "C"{ #else -#define AESx(x) SPH_C32(x) +#define AESx(x) (x) #define AES0 AES0_LE #define AES1 AES1_LE #define AES2 AES2_LE @@ -119,7 +118,7 @@ extern "C"{ * MixColumns for the column where that byte goes after ShiftRows. */ -static const sph_u32 AES0[256] = { +static const uint32_t AES0[256] = { AESx(0xA56363C6), AESx(0x847C7CF8), AESx(0x997777EE), AESx(0x8D7B7BF6), AESx(0x0DF2F2FF), AESx(0xBD6B6BD6), AESx(0xB16F6FDE), AESx(0x54C5C591), AESx(0x50303060), AESx(0x03010102), AESx(0xA96767CE), AESx(0x7D2B2B56), @@ -186,7 +185,7 @@ static const sph_u32 AES0[256] = { AESx(0xCBB0B07B), AESx(0xFC5454A8), AESx(0xD6BBBB6D), AESx(0x3A16162C) }; -static const sph_u32 AES1[256] = { +static const uint32_t AES1[256] = { AESx(0x6363C6A5), AESx(0x7C7CF884), AESx(0x7777EE99), AESx(0x7B7BF68D), AESx(0xF2F2FF0D), AESx(0x6B6BD6BD), AESx(0x6F6FDEB1), AESx(0xC5C59154), AESx(0x30306050), AESx(0x01010203), AESx(0x6767CEA9), AESx(0x2B2B567D), @@ -253,7 +252,7 @@ static const sph_u32 AES1[256] = { AESx(0xB0B07BCB), AESx(0x5454A8FC), AESx(0xBBBB6DD6), AESx(0x16162C3A) }; -static const sph_u32 AES2[256] = { +static const uint32_t AES2[256] = { AESx(0x63C6A563), AESx(0x7CF8847C), AESx(0x77EE9977), AESx(0x7BF68D7B), AESx(0xF2FF0DF2), AESx(0x6BD6BD6B), AESx(0x6FDEB16F), AESx(0xC59154C5), AESx(0x30605030), AESx(0x01020301), AESx(0x67CEA967), AESx(0x2B567D2B), @@ -320,7 +319,7 @@ static const sph_u32 AES2[256] = { AESx(0xB07BCBB0), AESx(0x54A8FC54), AESx(0xBB6DD6BB), AESx(0x162C3A16) }; -static const sph_u32 AES3[256] = { +static const uint32_t AES3[256] = { AESx(0xC6A56363), AESx(0xF8847C7C), AESx(0xEE997777), AESx(0xF68D7B7B), AESx(0xFF0DF2F2), AESx(0xD6BD6B6B), AESx(0xDEB16F6F), AESx(0x9154C5C5), AESx(0x60503030), AESx(0x02030101), AESx(0xCEA96767), AESx(0x567D2B2B), diff --git a/algo/sha/brg_types.h b/compat/brg_types.h similarity index 100% rename from algo/sha/brg_types.h rename to compat/brg_types.h diff --git a/algo/sha/sha3-defs.h b/compat/sha3-defs.h similarity index 100% rename from algo/sha/sha3-defs.h rename to compat/sha3-defs.h diff --git a/algo/sha/sha3_common.h b/compat/sha3_common.h similarity index 100% rename from algo/sha/sha3_common.h rename to compat/sha3_common.h diff --git a/algo/sha/sph_types.h b/compat/sph_types.h similarity index 100% 
rename from algo/sha/sph_types.h rename to compat/sph_types.h diff --git a/config-template.json b/config-template.json new file mode 100644 index 00000000..9f82214d --- /dev/null +++ b/config-template.json @@ -0,0 +1,22 @@ +{ + "_comment" : "Any long-format command line argument ", + "_comment" : "may be used in this JSON configuration file", + "_comment" : "Additional arguments may be added to the command line.", + "_comment" : "Usage: cpuminer -c myconfig.json [additional arguments]", + + "_comment" : "Required arguments, replace dummy values", + + "url" : "stratum+tcp://example.com:3333", + "user" : "read.pool.instructions", + "pass" : "x.often.works", + "algo" : "algo", + + "_comment" : "Often used optional arguments with default values selected.", + "_comment" : "Change values, add or delete arguments as desired.", + + "threads" : 0, + "cpu-affinity" : -1, + "api-bind" : "127.0.0.1:4048", + "benchmark" : false, + "quiet" : false +} diff --git a/configure b/configure index 2403ac96..2fd8e71b 100755 --- a/configure +++ b/configure @@ -1,6 +1,6 @@ #! /bin/sh # Guess values for system-dependent variables and create Makefiles. -# Generated by GNU Autoconf 2.71 for cpuminer-opt 3.23.1. +# Generated by GNU Autoconf 2.71 for cpuminer-opt 3.23.2. # # # Copyright (C) 1992-1996, 1998-2017, 2020-2021 Free Software Foundation, @@ -608,8 +608,8 @@ MAKEFLAGS= # Identity of this package. PACKAGE_NAME='cpuminer-opt' PACKAGE_TARNAME='cpuminer-opt' -PACKAGE_VERSION='3.23.1' -PACKAGE_STRING='cpuminer-opt 3.23.1' +PACKAGE_VERSION='3.23.2' +PACKAGE_STRING='cpuminer-opt 3.23.2' PACKAGE_BUGREPORT='' PACKAGE_URL='' @@ -1360,7 +1360,7 @@ if test "$ac_init_help" = "long"; then # Omit some internal or obsolete options to make the list less imposing. # This message is too long to be a string in the A/UX 3.1 sh. cat <<_ACEOF -\`configure' configures cpuminer-opt 3.23.1 to adapt to many kinds of systems. +\`configure' configures cpuminer-opt 3.23.2 to adapt to many kinds of systems. Usage: $0 [OPTION]... [VAR=VALUE]... @@ -1432,7 +1432,7 @@ fi if test -n "$ac_init_help"; then case $ac_init_help in - short | recursive ) echo "Configuration of cpuminer-opt 3.23.1:";; + short | recursive ) echo "Configuration of cpuminer-opt 3.23.2:";; esac cat <<\_ACEOF @@ -1538,7 +1538,7 @@ fi test -n "$ac_init_help" && exit $ac_status if $ac_init_version; then cat <<\_ACEOF -cpuminer-opt configure 3.23.1 +cpuminer-opt configure 3.23.2 generated by GNU Autoconf 2.71 Copyright (C) 2021 Free Software Foundation, Inc. @@ -1985,7 +1985,7 @@ cat >config.log <<_ACEOF This file contains any messages produced by compilers while running configure, to aid debugging if configure makes a mistake. -It was created by cpuminer-opt $as_me 3.23.1, which was +It was created by cpuminer-opt $as_me 3.23.2, which was generated by GNU Autoconf 2.71. Invocation command line was $ $0$ac_configure_args_raw @@ -3593,7 +3593,7 @@ fi # Define the identity of the package. PACKAGE='cpuminer-opt' - VERSION='3.23.1' + VERSION='3.23.2' printf "%s\n" "#define PACKAGE \"$PACKAGE\"" >>confdefs.h @@ -7508,7 +7508,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 # report actual input values of CONFIG_FILES etc. instead of their # values after options handling. ac_log=" -This file was extended by cpuminer-opt $as_me 3.23.1, which was +This file was extended by cpuminer-opt $as_me 3.23.2, which was generated by GNU Autoconf 2.71. 
Invocation command line was CONFIG_FILES = $CONFIG_FILES @@ -7576,7 +7576,7 @@ ac_cs_config_escaped=`printf "%s\n" "$ac_cs_config" | sed "s/^ //; s/'/'\\\\\\\\ cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 ac_cs_config='$ac_cs_config_escaped' ac_cs_version="\\ -cpuminer-opt config.status 3.23.1 +cpuminer-opt config.status 3.23.2 configured by $0, generated by GNU Autoconf 2.71, with options \\"\$ac_cs_config\\" diff --git a/configure.ac b/configure.ac index 8276943b..d6a28a5e 100644 --- a/configure.ac +++ b/configure.ac @@ -1,4 +1,4 @@ -AC_INIT([cpuminer-opt], [3.23.1]) +AC_INIT([cpuminer-opt], [3.23.2]) AC_PREREQ([2.59c]) AC_CANONICAL_SYSTEM diff --git a/configure~ b/configure~ index d69fd1f9..5e85cc30 100755 --- a/configure~ +++ b/configure~ @@ -1,6 +1,6 @@ #! /bin/sh # Guess values for system-dependent variables and create Makefiles. -# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.23.1. +# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.23.2. # # # Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc. @@ -577,8 +577,8 @@ MAKEFLAGS= # Identity of this package. PACKAGE_NAME='cpuminer-opt' PACKAGE_TARNAME='cpuminer-opt' -PACKAGE_VERSION='3.23.1' -PACKAGE_STRING='cpuminer-opt 3.23.1' +PACKAGE_VERSION='3.23.2' +PACKAGE_STRING='cpuminer-opt 3.23.2' PACKAGE_BUGREPORT='' PACKAGE_URL='' @@ -1332,7 +1332,7 @@ if test "$ac_init_help" = "long"; then # Omit some internal or obsolete options to make the list less imposing. # This message is too long to be a string in the A/UX 3.1 sh. cat <<_ACEOF -\`configure' configures cpuminer-opt 3.23.1 to adapt to many kinds of systems. +\`configure' configures cpuminer-opt 3.23.2 to adapt to many kinds of systems. Usage: $0 [OPTION]... [VAR=VALUE]... @@ -1404,7 +1404,7 @@ fi if test -n "$ac_init_help"; then case $ac_init_help in - short | recursive ) echo "Configuration of cpuminer-opt 3.23.1:";; + short | recursive ) echo "Configuration of cpuminer-opt 3.23.2:";; esac cat <<\_ACEOF @@ -1509,7 +1509,7 @@ fi test -n "$ac_init_help" && exit $ac_status if $ac_init_version; then cat <<\_ACEOF -cpuminer-opt configure 3.23.1 +cpuminer-opt configure 3.23.2 generated by GNU Autoconf 2.69 Copyright (C) 2012 Free Software Foundation, Inc. @@ -2012,7 +2012,7 @@ cat >config.log <<_ACEOF This file contains any messages produced by compilers while running configure, to aid debugging if configure makes a mistake. -It was created by cpuminer-opt $as_me 3.23.1, which was +It was created by cpuminer-opt $as_me 3.23.2, which was generated by GNU Autoconf 2.69. Invocation command line was $ $0 $@ @@ -2993,7 +2993,7 @@ fi # Define the identity of the package. PACKAGE='cpuminer-opt' - VERSION='3.23.1' + VERSION='3.23.2' cat >>confdefs.h <<_ACEOF @@ -6718,7 +6718,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 # report actual input values of CONFIG_FILES etc. instead of their # values after options handling. ac_log=" -This file was extended by cpuminer-opt $as_me 3.23.1, which was +This file was extended by cpuminer-opt $as_me 3.23.2, which was generated by GNU Autoconf 2.69. 
Invocation command line was CONFIG_FILES = $CONFIG_FILES @@ -6784,7 +6784,7 @@ _ACEOF cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`" ac_cs_version="\\ -cpuminer-opt config.status 3.23.1 +cpuminer-opt config.status 3.23.2 configured by $0, generated by GNU Autoconf 2.69, with options \\"\$ac_cs_config\\" diff --git a/cpu-miner.c b/cpu-miner.c index 37234ddb..e86e6a77 100644 --- a/cpu-miner.c +++ b/cpu-miner.c @@ -954,10 +954,10 @@ static inline void sprintf_et( char *str, long unsigned int seconds ) sprintf( str, "%lum%02lus", min, sec ); } -const long double exp32 = EXP32; // 2**32 -const long double exp48 = EXP32 * EXP16; // 2**48 -const long double exp64 = EXP32 * EXP32; // 2**64 -const long double exp96 = EXP32 * EXP32 * EXP32; // 2**96 +const long double exp32 = EXP32; // 2**32 +const long double exp48 = EXP32 * EXP16; // 2**48 +const long double exp64 = EXP32 * EXP32; // 2**64 +const long double exp96 = EXP32 * EXP32 * EXP32; // 2**96 const long double exp128 = EXP32 * EXP32 * EXP32 * EXP32; // 2**128 const long double exp160 = EXP32 * EXP32 * EXP32 * EXP32 * EXP16; // 2**160 @@ -1280,53 +1280,11 @@ static int share_result( int result, struct work *work, applog( LOG_INFO, "%d %s%s %s%s %s%s %s%s%s, %.3f sec (%dms)", my_stats.share_count, acol, ares, scol, sres, rcol, rres, bcol, bres, CL_N, share_time, latency ); - -/* - if ( unlikely( opt_debug || !result || solved ) ) - { - if ( have_stratum ) - applog2( LOG_INFO, "Diff %.5g, Block %d, Job %s", - my_stats.share_diff, my_stats.height, my_stats.job_id ); - else - applog2( LOG_INFO, "Diff %.5g, Block %d", - my_stats.share_diff, work ? work->height : last_block_height ); - } -*/ - if ( unlikely( !( opt_quiet || result || stale ) ) ) { -// uint32_t str[8]; -// uint32_t *targ; - - if ( reason ) applog2( LOG_MINR, "Reject reason: %s", reason ); - { - // The exact hash is not avaiable here, it's just an imprecise - // approximation calculated from the share difficulty. It's useless - // for anything other than low diff rejects. Until and unless a - // solution is implemented to make the hash and targets avaiable - // don't bother displaying them. In the meantime display the diff for - // low diff rejects. - - if ( strstr( reason, "difficulty" ) ) - applog2( LOG_MINR, "Share diff: %.5g, Target: %.5g", - my_stats.share_diff, my_stats.target_diff ); - -/* - diff_to_hash( str, my_stats.share_diff ); - applog2( LOG_INFO, "Hash: %08x%08x%08x%08x%08x%08x", str[7], str[6], - str[5], str[4], str[3],str[2], str[1], str[0] ); - - if ( work ) - targ = work->target; - else - { - diff_to_hash( str, my_stats.target_diff ); - targ = &str[0]; - } - applog2( LOG_INFO, "Target: %08x%08x%08x%08x%08x%08x", targ[7], targ[6], - targ[5], targ[4], targ[3], targ[2], targ[1], targ[0] ); -*/ - } + applog2( LOG_INFO, "Reject reason: %s", reason ? 
reason : "NULL" ); + applog2( LOG_INFO, "Share diff: %.5g, Target: %.5g", + my_stats.share_diff, my_stats.target_diff ); } return 1; } @@ -1986,6 +1944,7 @@ void sha256_gen_merkle_root( char* merkle_root, struct stratum_ctx* sctx ) sha256d( merkle_root, merkle_root, 64 ); } } +/* // OpenSSL single sha256, deprecated void SHA256_gen_merkle_root( char* merkle_root, struct stratum_ctx* sctx ) { @@ -1996,6 +1955,7 @@ void SHA256_gen_merkle_root( char* merkle_root, struct stratum_ctx* sctx ) sha256d( merkle_root, merkle_root, 64 ); } } +*/ // Default is do_nothing (assumed LE) void set_work_data_big_endian( struct work *work ) diff --git a/cpuminer-conf.json b/cpuminer-conf.json deleted file mode 100644 index d464f528..00000000 --- a/cpuminer-conf.json +++ /dev/null @@ -1,20 +0,0 @@ -{ - "_comment1" : "Any long-format command line argument ", - "_comment2" : "may be used in this JSON configuration file", - - "api-bind" : "127.0.0.1:4048", - - "url" : "stratum+tcp://mine.xpool.ca:1131", - "user" : "XeVrkPrWB7pDbdFLfKhF1Z3xpqhsx6wkH3", - "pass" : "cpuminer", - - "algo" : "x11", - "threads" : 0, - "cpu-priority" : 0, - "cpu-affinity" : -1, - - "benchmark" : false, - "debug" : false, - "protocol": false, - "quiet" : false -} diff --git a/simd-utils/simd-128.h b/simd-utils/simd-128.h index ad895256..bafcded5 100644 --- a/simd-utils/simd-128.h +++ b/simd-utils/simd-128.h @@ -43,6 +43,9 @@ typedef union } __attribute__ ((aligned (16))) m128_ovly; +#define v128_64(i) _mm_set1_epi64x(i) +#define v128_32(i) _mm_set1_epi32(i) + // Deprecated. AVX512 adds EVEX encoding (3rd operand) and other improvements // that make these functions either unnecessary or inefficient. // In cases where an explicit move betweeen GP & SIMD registers is still diff --git a/simd-utils/simd-256.h b/simd-utils/simd-256.h index d5425c8b..2f86a3f4 100644 --- a/simd-utils/simd-256.h +++ b/simd-utils/simd-256.h @@ -33,6 +33,10 @@ typedef union uint32_t u32[8]; } __attribute__ ((aligned (32))) m256_ovly; + +#define v256_64(i) _mm256_set1_epi64x(i) +#define v256_32(i) _mm256_set1_epi32(i) + // // Pointer casting diff --git a/simd-utils/simd-512.h b/simd-utils/simd-512.h index ebd7d764..7b902823 100644 --- a/simd-utils/simd-512.h +++ b/simd-utils/simd-512.h @@ -97,6 +97,9 @@ typedef union uint64_t u64[8]; } __attribute__ ((aligned (64))) m512_ovly; +#define v512_64(i) _mm512_set1_epi64(i) +#define v512_32(i) _mm512_set1_epi32(i) + // A simple 128 bit permute, using function instead of macro avoids // problems if the v arg passed as an expression. static inline __m512i mm512_perm_128( const __m512i v, const int c ) diff --git a/verthash-help.txt b/verthash-help.txt index f8e02db4..c7228950 100644 --- a/verthash-help.txt +++ b/verthash-help.txt @@ -64,8 +64,8 @@ then exit. --algo verthash --verify A data file will never be created if --data-file is specified. The miner -will exit with an error if the file is not found. This is to avoid accidentally -creating an unwanted data file due to a typo. +will exit with an error if the file is not found in the specified location. +This is to avoid accidentally creating an unwanted data file due to a typo. After creation the data file can moved to a more convenient location and referenced by --data-file, or left where it is and used by default without the