From 042d13d1e10395764c6607f9b038d588536d638a Mon Sep 17 00:00:00 2001
From: Jay D Dee
Date: Mon, 20 May 2024 23:08:50 -0400
Subject: [PATCH] v24.2

---
 RELEASE_NOTES | 6 +
 algo-gate-api.c | 2 +-
 algo-gate-api.h | 5 +-
 algo/argon2d/argon2d/opt.c | 4 +-
 algo/argon2d/blake2/blamka-round-opt.h | 2 +-
 algo/blake/blake256-hash.c | 4 +-
 algo/blake/blake256-hash.h | 2 +-
 algo/blake/blake2b-hash.c | 2 +-
 algo/blake/blake2b-hash.h | 2 +-
 algo/blake/blake2b.c | 2 +-
 algo/blake/blake2s-hash.c | 2 +-
 algo/blake/blake2s-hash.h | 34 +-
 algo/blake/blake2s.c | 2 +-
 algo/blake/blake512-hash.c | 50 +--
 algo/blake/blake512-hash.h | 2 +-
 algo/blake/blakecoin-gate.h | 2 +-
 algo/blake/sph_blake2b.c | 8 +-
 algo/bmw/bmw-hash-4way.h | 4 +-
 algo/bmw/bmw256-hash-4way.c | 2 +-
 algo/bmw/bmw512-gate.h | 2 +-
 algo/bmw/bmw512-hash-4way.c | 2 +-
 algo/cubehash/cube-hash-2way.c | 2 +-
 algo/cubehash/cube-hash-2way.h | 2 +-
 algo/cubehash/cubehash_sse2.c | 2 +-
 algo/echo/echo-hash-4way.c | 2 +-
 algo/echo/echo-hash-4way.h | 2 +-
 algo/gost/sph_gost.c | 2 +-
 algo/groestl/aes_ni/groestl-intr-aes.h | 2 +-
 algo/groestl/aes_ni/groestl256-intr-aes.h | 2 +-
 algo/groestl/groestl-gate.h | 2 +-
 algo/groestl/groestl256-hash-4way.c | 2 +-
 algo/groestl/groestl256-hash-4way.h | 2 +-
 algo/groestl/groestl256-intr-4way.h | 2 +-
 algo/groestl/groestl512-hash-4way.c | 2 +-
 algo/groestl/groestl512-hash-4way.h | 2 +-
 algo/groestl/groestl512-intr-4way.h | 2 +-
 algo/groestl/myrgr-gate.h | 2 +-
 algo/hamsi/hamsi-hash-4way.c | 6 +-
 algo/hamsi/hamsi-hash-4way.h | 2 +-
 algo/haval/haval-hash-4way.c | 6 +-
 algo/haval/haval-hash-4way.h | 2 +-
 algo/jh/jh-hash-4way.c | 7 +-
 algo/jh/jh-hash-4way.h | 2 +-
 algo/keccak/keccak-gate.h | 4 +-
 algo/keccak/keccak-hash-4way.c | 2 +-
 algo/keccak/keccak-hash-4way.h | 2 +-
 algo/luffa/luffa-hash-2way.c | 5 +-
 algo/luffa/luffa-hash-2way.h | 2 +-
 algo/luffa/luffa_for_sse2.c | 6 +-
 algo/luffa/luffa_for_sse2.h | 2 +-
 algo/lyra2/allium-4way.c | 2 +-
 algo/lyra2/lyra2-gate.h | 6 +-
 algo/lyra2/lyra2-hash-2way.c | 2 +-
 algo/lyra2/lyra2.h | 2 +-
 algo/lyra2/lyra2z-4way.c | 2 +-
 algo/lyra2/phi2-4way.c | 2 +-
 algo/lyra2/sponge-2way.c | 2 +-
 algo/lyra2/sponge.h | 12 +-
 algo/nist5/nist5-gate.h | 2 +-
 algo/panama/panama-hash-4way.c | 5 +-
 algo/quark/anime-gate.h | 2 +-
 algo/quark/hmq1725-gate.h | 2 +-
 algo/quark/quark-gate.h | 2 +-
 algo/qubit/qubit-gate.h | 2 +-
 algo/ripemd/lbry-gate.h | 2 +-
 algo/ripemd/ripemd-hash-4way.c | 10 +-
 algo/ripemd/ripemd-hash-4way.h | 2 +-
 algo/scrypt/scrypt-core-4way.c | 6 +-
 algo/scrypt/scrypt-core-4way.h | 2 +-
 algo/scrypt/scrypt.c | 6 +-
 algo/sha/hmac-sha256-hash-4way.c | 2 +-
 algo/sha/hmac-sha256-hash-4way.h | 2 +-
 algo/sha/sha256-hash-4way.c | 12 +-
 algo/sha/sha256-hash.c | 52 ++-
 algo/sha/sha256-hash.h | 2 +-
 algo/sha/sha256d-4way.h | 2 +-
 algo/sha/sha256d.h | 2 +-
 algo/sha/sha256dt.c | 2 +-
 algo/sha/sha256t-gate.h | 2 +-
 algo/sha/sha512-hash-4way.c | 383 ++++++++++++++--------
 algo/sha/sha512-hash.h | 2 +-
 algo/sha/sha512256d-4way.c | 2 +-
 algo/shabal/shabal-hash-4way.c | 2 +-
 algo/shabal/shabal-hash-4way.h | 2 +-
 algo/shavite/shavite-hash-2way.c | 3 +-
 algo/shavite/shavite-hash-4way.c | 2 +-
 algo/shavite/shavite-hash-4way.h | 4 +-
 algo/simd/simd-hash-2way.c | 5 +-
 algo/simd/simd-hash-2way.h | 2 +-
 algo/skein/skein-gate.h | 2 +-
 algo/skein/skein-hash-4way.c | 4 +-
 algo/skein/skein-hash-4way.h | 2 +-
 algo/swifftx/swifftx.c | 4 +-
 algo/verthash/tiny_sha3/sha3-4way.c | 2 +-
 algo/verthash/tiny_sha3/sha3-4way.h | 2 +-
 algo/x11/c11-gate.h | 2 +-
 algo/x11/tribus-gate.h | 2 +-
 algo/x11/x11-gate.h | 2 +-
algo/x11/x11gost-gate.h | 2 +- algo/x12/x12-gate.h | 2 +- algo/x13/phi1612-gate.h | 2 +- algo/x13/skunk-gate.h | 2 +- algo/x13/x13-gate.h | 2 +- algo/x13/x13sm3-gate.h | 2 +- algo/x14/x14-gate.h | 2 +- algo/x15/x15-gate.h | 2 +- algo/x16/x16r-gate.h | 8 +- algo/x16/x20r.c | 2 +- algo/x17/sonoa-gate.h | 2 +- algo/x17/x17-gate.h | 2 +- algo/x17/xevan-gate.h | 2 +- algo/x22/x22i-gate.c | 4 +- algo/x22/x22i-gate.h | 4 +- algo/yespower/yespower-blake2b.c | 2 +- algo/yespower/yespower-opt.c | 20 +- armbuild-all.sh | 35 +- build-allarch.sh | 31 +- clean-all.sh | 2 +- configure | 20 +- configure.ac | 2 +- configure~ | 20 +- cpu-miner.c | 51 ++- simd-utils.h | 37 +++ simd-utils/intrlv.h | 44 ++- simd-utils/simd-128.h | 34 +- simd-utils/simd-256.h | 41 ++- simd-utils/simd-512.h | 11 +- simd-utils/simd-neon.h | 153 +++++---- sysinfos.c | 29 +- 129 files changed, 835 insertions(+), 538 deletions(-) diff --git a/RELEASE_NOTES b/RELEASE_NOTES index 5927c158..feff95f0 100644 --- a/RELEASE_NOTES +++ b/RELEASE_NOTES @@ -75,6 +75,12 @@ If not what makes it happen or not happen? Change Log ---------- +v24.2 + +x86_64: Fixed blakes2s for AVX2 & AVX512, x25x for AVX512, broken in v3.23.4. +x86_64: Initial support for CPUs with AVX10, needs GCC-14. +ARM NEON: Various code optimisations. + v24.1 #414: fix bug in merkle error handling. diff --git a/algo-gate-api.c b/algo-gate-api.c index a456c873..2b057ac4 100644 --- a/algo-gate-api.c +++ b/algo-gate-api.c @@ -184,7 +184,7 @@ int scanhash_4way_64in_32out( struct work *work, uint32_t max_nonce, #endif -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) //int scanhash_8way_64_64( struct work *work, uint32_t max_nonce, // uint64_t *hashes_done, struct thr_info *mythr ) diff --git a/algo-gate-api.h b/algo-gate-api.h index e9ac10e0..3626a521 100644 --- a/algo-gate-api.h +++ b/algo-gate-api.h @@ -99,8 +99,11 @@ typedef uint32_t set_t; #define AES_OPT 1 << 7 // Intel Westmere, AArch64 #define VAES_OPT 1 << 8 // Icelake, Zen3 #define SHA_OPT 1 << 9 // Zen1, Icelake, AArch64 +#define SHA256_OPT 1 << 9 // Zen1, Icelake, AArch64 #define SHA512_OPT 1 << 10 // Intel Arrow Lake, AArch64 #define NEON_OPT 1 << 11 // AArch64 +#define AVX10_256 1 << 12 +#define AVX10_512 1 << 13 // AVX10 does not have explicit algo features: // AVX10_512 is compatible with AVX512 + VAES @@ -246,7 +249,7 @@ int scanhash_4way_64in_32out( struct work *work, uint32_t max_nonce, #endif -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) //int scanhash_8way_64in_64out( struct work *work, uint32_t max_nonce, // uint64_t *hashes_done, struct thr_info *mythr ); diff --git a/algo/argon2d/argon2d/opt.c b/algo/argon2d/argon2d/opt.c index a363a614..d32aebd0 100644 --- a/algo/argon2d/argon2d/opt.c +++ b/algo/argon2d/argon2d/opt.c @@ -35,7 +35,7 @@ * @pre all block pointers must be valid */ -#if defined(__AVX512F__) +#if defined(SIMD512) static inline __m512i blamka( __m512i x, __m512i y ) { @@ -237,7 +237,7 @@ void fill_segment(const argon2_instance_t *instance, uint64_t pseudo_rand, ref_index, ref_lane; uint32_t prev_offset, curr_offset; uint32_t starting_index, i; -#if defined(__AVX512F__) +#if defined(SIMD512) __m512i state[ARGON2_512BIT_WORDS_IN_BLOCK]; #elif defined(__AVX2__) __m256i state[ARGON2_HWORDS_IN_BLOCK]; diff --git a/algo/argon2d/blake2/blamka-round-opt.h b/algo/argon2d/blake2/blamka-round-opt.h index 3e0fc3c5..0b4cd786 100644 --- 
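The recurring change in the hunks above and below is the replacement of the long feature test `#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)` with a single `#if defined(SIMD512)`, alongside the new AVX10_256 / AVX10_512 algo-gate feature bits. The actual definition of SIMD512 lives in simd-utils.h, which this patch modifies but which is not shown in this excerpt. The following is only a sketch of what such a consolidation could look like; the AVX10 predefined-macro name follows GCC-14 conventions and is an assumption here.

/* Sketch only -- not the patch's simd-utils.h code. SIMD512 stands in for
 * "full 512-bit vector support", i.e. the old four-macro AVX512 test, and
 * (per the release note) could also be satisfied by an AVX10/512 target. */
#if ( defined(__AVX512F__) && defined(__AVX512VL__) && \
      defined(__AVX512DQ__) && defined(__AVX512BW__) ) \
    || defined(__AVX10_1_512__)   /* assumed GCC-14 macro for AVX10.1-512 */
  #define SIMD512 1
#endif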
a/algo/argon2d/blake2/blamka-round-opt.h +++ b/algo/argon2d/blake2/blamka-round-opt.h @@ -21,7 +21,7 @@ #include "blake2-impl.h" #include "simd-utils.h" -#if !defined(__AVX512F__) +#if !defined(SIMD512) #if !defined(__AVX2__) diff --git a/algo/blake/blake256-hash.c b/algo/blake/blake256-hash.c index e60376f5..066e8e7d 100644 --- a/algo/blake/blake256-hash.c +++ b/algo/blake/blake256-hash.c @@ -1611,7 +1611,7 @@ void blake256_8way_final_rounds_le( void *final_hash, const void *midstate, #endif -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) /////////////////////////////////////// // @@ -2617,7 +2617,7 @@ blake32_8way_close_le( blake_8way_small_context *sc, unsigned ub, unsigned n, #endif -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) //Blake-256 16 way AVX512 diff --git a/algo/blake/blake256-hash.h b/algo/blake/blake256-hash.h index 2408e540..6b53ef57 100644 --- a/algo/blake/blake256-hash.h +++ b/algo/blake/blake256-hash.h @@ -147,7 +147,7 @@ void blake256r8_8way_close(void *cc, void *dst); #define blake256r8_8x32_update blake256r14_8way_update #define blake256r8_8x32_close blake256r14_8way_close -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) /////////////////////////////////// // diff --git a/algo/blake/blake2b-hash.c b/algo/blake/blake2b-hash.c index eb4679a4..cf178375 100644 --- a/algo/blake/blake2b-hash.c +++ b/algo/blake/blake2b-hash.c @@ -226,7 +226,7 @@ static const uint8_t sigma[12][16] = #define Mx_(n) Mx__(n) #define Mx__(n) M ## n -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) #define B2B8W_G(a, b, c, d, x, y) \ { \ diff --git a/algo/blake/blake2b-hash.h b/algo/blake/blake2b-hash.h index 1256fb18..88f5b415 100644 --- a/algo/blake/blake2b-hash.h +++ b/algo/blake/blake2b-hash.h @@ -15,7 +15,7 @@ #endif -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) typedef struct ALIGN( 64 ) { __m512i b[16]; // input buffer diff --git a/algo/blake/blake2b.c b/algo/blake/blake2b.c index 7707366c..e2b62765 100644 --- a/algo/blake/blake2b.c +++ b/algo/blake/blake2b.c @@ -3,7 +3,7 @@ #include #include "blake2b-hash.h" -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) #define BLAKE2B_8WAY #elif defined(__AVX2__) #define BLAKE2B_4WAY diff --git a/algo/blake/blake2s-hash.c b/algo/blake/blake2s-hash.c index ae8030b0..ee895453 100644 --- a/algo/blake/blake2s-hash.c +++ b/algo/blake/blake2s-hash.c @@ -497,7 +497,7 @@ int blake2s_8way_full_blocks( blake2s_8way_state *S, void *out, #endif // __AVX2__ -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) // Blake2s-256 16 way diff --git a/algo/blake/blake2s-hash.h b/algo/blake/blake2s-hash.h index 2764a899..4e2d66c3 100644 --- a/algo/blake/blake2s-hash.h +++ b/algo/blake/blake2s-hash.h @@ -29,20 +29,20 @@ #define ALIGN(x) __attribute__((aligned(x))) #endif - typedef struct __blake2s_nway_param - { - uint8_t digest_length; // 1 - uint8_t key_length; // 2 - uint8_t fanout; // 3 - uint8_t depth; // 4 - uint32_t leaf_length; // 8 - uint8_t node_offset[6];// 14 - uint8_t node_depth; // 15 - uint8_t inner_length; // 16 - // uint8_t reserved[0]; - uint8_t 
salt[8]; // 24 - uint8_t personal[8]; // 32 - } blake2s_nway_param; +typedef struct __blake2s_nway_param +{ + uint8_t digest_length; // 1 + uint8_t key_length; // 2 + uint8_t fanout; // 3 + uint8_t depth; // 4 + uint32_t leaf_length; // 8 + uint8_t node_offset[6];// 14 + uint8_t node_depth; // 15 + uint8_t inner_length; // 16 + // uint8_t reserved[0]; + uint8_t salt[8]; // 24 + uint8_t personal[8]; // 32 +} blake2s_nway_param; typedef struct ALIGN( 64 ) __blake2s_4way_state { @@ -67,7 +67,7 @@ int blake2s_4way_full_blocks( blake2s_4way_state *S, void *out, typedef struct ALIGN( 64 ) __blake2s_8way_state { __m256i h[8]; - uint8_t buf[ 32 * 8 ]; + uint8_t buf[ 64 * 8 ]; uint32_t t[2]; uint32_t f[2]; size_t buflen; @@ -83,12 +83,12 @@ int blake2s_8way_full_blocks( blake2s_8way_state *S, void *out, #endif -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) typedef struct ALIGN( 64 ) __blake2s_16way_state { __m512i h[8]; - uint8_t buf[ 32 * 16 ]; + uint8_t buf[ 64 * 16 ]; uint32_t t[2]; uint32_t f[2]; size_t buflen; diff --git a/algo/blake/blake2s.c b/algo/blake/blake2s.c index dba7ffb6..011bad37 100644 --- a/algo/blake/blake2s.c +++ b/algo/blake/blake2s.c @@ -3,7 +3,7 @@ #include #include -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) #define BLAKE2S_16WAY #elif defined(__AVX2__) #define BLAKE2S_8WAY diff --git a/algo/blake/blake512-hash.c b/algo/blake/blake512-hash.c index 2ab7159e..73799e20 100644 --- a/algo/blake/blake512-hash.c +++ b/algo/blake/blake512-hash.c @@ -349,16 +349,16 @@ void blake512_transform( uint64_t *H, const uint64_t *buf, const uint64_t T0, Va = v128_add64( Va, v128_add64( Vb, \ v128_set64( CBx( r, Sd ) ^ Mx( r, Sc ), \ CBx( r, Sb ) ^ Mx( r, Sa ) ) ) ); \ - Vd = v128_ror64( v128_xor( Vd, Va ), 32 ); \ + Vd = v128_ror64xor( Vd, Va, 32 ); \ Vc = v128_add64( Vc, Vd ); \ - Vb = v128_ror64( v128_xor( Vb, Vc ), 25 ); \ + Vb = v128_ror64xor( Vb, Vc, 25 ); \ \ Va = v128_add64( Va, v128_add64( Vb, \ v128_set64( CBx( r, Sc ) ^ Mx( r, Sd ), \ CBx( r, Sa ) ^ Mx( r, Sb ) ) ) ); \ - Vd = v128_ror64( v128_xor( Vd, Va ), 16 ); \ + Vd = v128_ror64xor( Vd, Va, 16 ); \ Vc = v128_add64( Vc, Vd ); \ - Vb = v128_ror64( v128_xor( Vb, Vc ), 11 ); \ + Vb = v128_ror64xor( Vb, Vc, 11 ); \ } #define BLAKE512_ROUND( R ) \ @@ -559,7 +559,7 @@ void blake512_full( blake512_context *sc, void *dst, const void *data, #if defined(__AVX2__) -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) //////////////////////////////////// // @@ -1887,13 +1887,13 @@ blake512_4x64_close(void *cc, void *dst) #define GB_2X64( m0, m1, c0, c1, a, b, c, d ) \ { \ a = v128_add64( v128_add64( v128_xor( v128_64( c1 ), m0 ), b ), a ); \ - d = v128_ror64( v128_xor( d, a ), 32 ); \ + d = v128_ror64xor( d, a, 32 ); \ c = v128_add64( c, d ); \ - b = v128_ror64( v128_xor( b, c ), 25 ); \ + b = v128_ror64xor( b, c, 25 ); \ a = v128_add64( v128_add64( v128_xor( v128_64( c0 ), m1 ), b ), a ); \ - d = v128_ror64( v128_xor( d, a ), 16 ); \ + d = v128_ror64xor( d, a, 16 ); \ c = v128_add64( c, d ); \ - b = v128_ror64( v128_xor( b, c ), 11 ); \ + b = v128_ror64xor( b, c, 11 ); \ } #define ROUND_B_2X64(r) \ @@ -2054,9 +2054,9 @@ void blake512_2x64_prehash_part1_le( blake_2x64_big_context *sc, // G4 skip nonce V0 = v128_add64( v128_add64( v128_xor( v128_64( CB9 ), sc->buf[ 8] ), V5 ), V0 ); - VF = v128_ror64( v128_xor( VF, V0 ), 
32 ); + VF = v128_ror64xor( VF, V0, 32 ); VA = v128_add64( VA, VF ); - V5 = v128_ror64( v128_xor( V5, VA ), 25 ); + V5 = v128_ror64xor( V5, VA, 25 ); V0 = v128_add64( V0, V5 ); GB_2X64( sc->buf[10], sc->buf[11], CBA, CBB, V1, V6, VB, VC ); @@ -2137,9 +2137,9 @@ void blake512_2x64_prehash_part2_le( blake_2x64_big_context *sc, void *hash, // finish round 0, with the nonce now available V0 = v128_add64( V0, v128_xor( v128_64( CB8 ), M9 ) ); - VF = v128_ror64( v128_xor( VF, V0 ), 16 ); + VF = v128_ror64xor( VF, V0, 16 ); VA = v128_add64( VA, VF ); - V5 = v128_ror64( v128_xor( V5, VA ), 11 ); + V5 = v128_ror64xor( V5, VA, 11 ); // Round 1 // G0 @@ -2147,34 +2147,34 @@ void blake512_2x64_prehash_part2_le( blake_2x64_big_context *sc, void *hash, // G1 V1 = v128_add64( V1, V5 ); - VD = v128_ror64( v128_xor( VD, V1 ), 32 ); + VD = v128_ror64xor( VD, V1, 32 ); V9 = v128_add64( V9, VD ); - V5 = v128_ror64( v128_xor( V5, V9 ), 25 ); + V5 = v128_ror64xor( V5, V9, 25 ); V1 = v128_add64( V1, v128_add64( v128_xor( v128_64( CBx(1,2) ), Mx(1,3) ), V5 ) ); - VD = v128_ror64( v128_xor( VD, V1 ), 16 ); + VD = v128_ror64xor( VD, V1, 16 ); V9 = v128_add64( V9, VD ); - V5 = v128_ror64( v128_xor( V5, V9 ), 11 ); + V5 = v128_ror64xor( V5, V9, 11 ); // G2 V2 = v128_add64( V2, v128_xor( v128_64( CBF ), M9 ) ); - VE = v128_ror64( v128_xor( VE, V2 ), 32 ); + VE = v128_ror64xor( VE, V2, 32 ); VA = v128_add64( VA, VE ); - V6 = v128_ror64( v128_xor( V6, VA ), 25 ); + V6 = v128_ror64xor( V6, VA, 25 ); V2 = v128_add64( V2, v128_add64( v128_xor( v128_64( CB9 ), MF ), V6 ) ); - VE = v128_ror64( v128_xor( VE, V2 ), 16 ); + VE = v128_ror64xor( VE, V2, 16 ); VA = v128_add64( VA, VE ); - V6 = v128_ror64( v128_xor( V6, VA ), 11 ); + V6 = v128_ror64xor( V6, VA, 11 ); // G3 - VF = v128_ror64( v128_xor( VF, V3 ), 32 ); + VF = v128_ror64xor( VF, V3, 32 ); VB = v128_add64( VB, VF ); - V7 = v128_ror64( v128_xor( V7, VB ), 25 ); + V7 = v128_ror64xor( V7, VB, 25 ); V3 = v128_add64( V3, v128_add64( v128_xor( v128_64( CBx(1, 6) ), Mx(1, 7) ), V7 ) ); - VF = v128_ror64( v128_xor( VF, V3 ), 16 ); + VF = v128_ror64xor( VF, V3, 16 ); VB = v128_add64( VB, VF ); - V7 = v128_ror64( v128_xor( V7, VB ), 11 ); + V7 = v128_ror64xor( V7, VB, 11 ); // G4, G5, G6, G7 GB_2X64(Mx(1, 8), Mx(1, 9), CBx(1, 8), CBx(1, 9), V0, V5, VA, VF); diff --git a/algo/blake/blake512-hash.h b/algo/blake/blake512-hash.h index 32de52ae..12e401e4 100644 --- a/algo/blake/blake512-hash.h +++ b/algo/blake/blake512-hash.h @@ -92,7 +92,7 @@ void blake512_4x64_final_le( blake_4x64_big_context *sc, void *hash, #define blake512_4way_prehash_le blake512_4x64_prehash_le #define blake512_4way_final_le blake512_4x64_final_le -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) //////////////////////////// // diff --git a/algo/blake/blakecoin-gate.h b/algo/blake/blakecoin-gate.h index 1abeef19..73adfc0a 100644 --- a/algo/blake/blakecoin-gate.h +++ b/algo/blake/blakecoin-gate.h @@ -4,7 +4,7 @@ #include "algo-gate-api.h" #include -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) #define BLAKECOIN_16WAY #elif defined(__AVX2__) #define BLAKECOIN_8WAY diff --git a/algo/blake/sph_blake2b.c b/algo/blake/sph_blake2b.c index 40bcfa30..6ee4fe54 100644 --- a/algo/blake/sph_blake2b.c +++ b/algo/blake/sph_blake2b.c @@ -101,15 +101,15 @@ { \ Va = v128_add64( Va, v128_add64( Vb, \ v128_set64( m[ sigmaR[ Sc ] ], m[ sigmaR[ Sa ] ] ) ) ); \ - Vd = v128_ror64( 
v128_xor( Vd, Va ), 32 ); \ + Vd = v128_ror64xor( Vd, Va, 32 ); \ Vc = v128_add64( Vc, Vd ); \ - Vb = v128_ror64( v128_xor( Vb, Vc ), 24 ); \ + Vb = v128_ror64xor( Vb, Vc, 24 ); \ \ Va = v128_add64( Va, v128_add64( Vb, \ v128_set64( m[ sigmaR[ Sd ] ], m[ sigmaR[ Sb ] ] ) ) ); \ - Vd = v128_ror64( v128_xor( Vd, Va ), 16 ); \ + Vd = v128_ror64xor( Vd, Va, 16 ); \ Vc = v128_add64( Vc, Vd ); \ - Vb = v128_ror64( v128_xor( Vb, Vc ), 63 ); \ + Vb = v128_ror64xor( Vb, Vc, 63 ); \ } #define BLAKE2B_ROUND( R ) \ diff --git a/algo/bmw/bmw-hash-4way.h b/algo/bmw/bmw-hash-4way.h index ecba9e44..4ab3ab01 100644 --- a/algo/bmw/bmw-hash-4way.h +++ b/algo/bmw/bmw-hash-4way.h @@ -87,7 +87,7 @@ void bmw256_8way_close( bmw256_8way_context *ctx, void *dst ); #endif -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) // BMW-256 16 way 32 @@ -157,7 +157,7 @@ void bmw512_4way_addbits_and_close( #endif // __AVX2__ -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) // BMW-512 64 bit 8 way typedef struct diff --git a/algo/bmw/bmw256-hash-4way.c b/algo/bmw/bmw256-hash-4way.c index 38acd699..6392028b 100644 --- a/algo/bmw/bmw256-hash-4way.c +++ b/algo/bmw/bmw256-hash-4way.c @@ -1057,7 +1057,7 @@ void bmw256_8way_close( bmw256_8way_context *ctx, void *dst ) #endif // __AVX2__ -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) // BMW-256 16 way 32 diff --git a/algo/bmw/bmw512-gate.h b/algo/bmw/bmw512-gate.h index e7542cac..059ea595 100644 --- a/algo/bmw/bmw512-gate.h +++ b/algo/bmw/bmw512-gate.h @@ -4,7 +4,7 @@ #include "algo-gate-api.h" #include -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) #define BMW512_8WAY 1 #elif defined(__AVX2__) #define BMW512_4WAY 1 diff --git a/algo/bmw/bmw512-hash-4way.c b/algo/bmw/bmw512-hash-4way.c index 58e82387..e1645d38 100644 --- a/algo/bmw/bmw512-hash-4way.c +++ b/algo/bmw/bmw512-hash-4way.c @@ -950,7 +950,7 @@ bmw512_4way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) #endif // __AVX2__ -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) // BMW-512 8 WAY diff --git a/algo/cubehash/cube-hash-2way.c b/algo/cubehash/cube-hash-2way.c index 5888c2a4..4e12cdd6 100644 --- a/algo/cubehash/cube-hash-2way.c +++ b/algo/cubehash/cube-hash-2way.c @@ -26,7 +26,7 @@ static const uint64_t IV512[] = 0xA5A70E75D65C8A2B, 0xBC796576B1C62456, 0xE7989AF11921C8F7, 0xD43E3B447795D246 }; -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) // 4 way 128 is handy to avoid reinterleaving in many algos. 
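The Blake-512 and Blake2b G-function hunks above fold the former ror64( xor( a, b ), c ) pair into a single v128_ror64xor( a, b, c ) primitive. Its definition comes from the simd-utils headers this patch touches (simd-neon.h, simd-128.h) and is not shown here; the sketch below, under that assumption, shows why a dedicated primitive is worthwhile: AArch64 with the SHA3 extension has XAR, which performs the rotate of an XOR in one instruction, matching the "ARM NEON: Various code optimisations" line in the release notes.

/* Sketch only, assuming a NEON uint64x2_t vector type for the 128-bit lanes. */
#if defined(__ARM_NEON) && defined(__ARM_FEATURE_SHA3)
  /* XAR: ror64( a ^ b, c ) in a single instruction. */
  #define v128_ror64xor( a, b, c )   vxarq_u64( a, b, c )
#else
  /* Generic fallback, identical to the code being replaced above. */
  #define v128_ror64xor( a, b, c )   v128_ror64( v128_xor( a, b ), c )
#endif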
// If reinterleaving is necessary it may be more efficient to use diff --git a/algo/cubehash/cube-hash-2way.h b/algo/cubehash/cube-hash-2way.h index a31ffde0..3d2b0f0d 100644 --- a/algo/cubehash/cube-hash-2way.h +++ b/algo/cubehash/cube-hash-2way.h @@ -6,7 +6,7 @@ #if defined(__AVX2__) -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) struct _cube_4way_context { diff --git a/algo/cubehash/cubehash_sse2.c b/algo/cubehash/cubehash_sse2.c index 518a227d..6fab0ef2 100644 --- a/algo/cubehash/cubehash_sse2.c +++ b/algo/cubehash/cubehash_sse2.c @@ -13,7 +13,7 @@ static void transform( cubehashParam *sp ) int r; const int rounds = sp->rounds; -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) register __m512i x0, x1; diff --git a/algo/echo/echo-hash-4way.c b/algo/echo/echo-hash-4way.c index 7891ec52..6fa25500 100644 --- a/algo/echo/echo-hash-4way.c +++ b/algo/echo/echo-hash-4way.c @@ -11,7 +11,7 @@ static const unsigned int mul2ipt[] __attribute__ ((aligned (64))) = }; */ -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) #define ECHO_SUBBYTES4(state, j) \ state[0][j] = _mm512_aesenc_epi128( state[0][j], k1 ); \ diff --git a/algo/echo/echo-hash-4way.h b/algo/echo/echo-hash-4way.h index 58086859..e9e10919 100644 --- a/algo/echo/echo-hash-4way.h +++ b/algo/echo/echo-hash-4way.h @@ -5,7 +5,7 @@ #include "simd-utils.h" -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) typedef struct { diff --git a/algo/gost/sph_gost.c b/algo/gost/sph_gost.c index c3eb91d8..a67584bb 100644 --- a/algo/gost/sph_gost.c +++ b/algo/gost/sph_gost.c @@ -696,7 +696,7 @@ static void AddModulo512(const void *a,const void *b,void *c) static void AddXor512(const void *a,const void *b,void *c) { -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) casti_m512i( c, 0 ) = _mm512_xor_si512( casti_m512i( a, 0 ), casti_m512i( b, 0 ) ); #elif defined(__AVX2__) diff --git a/algo/groestl/aes_ni/groestl-intr-aes.h b/algo/groestl/aes_ni/groestl-intr-aes.h index 0c102ad8..963c56ef 100644 --- a/algo/groestl/aes_ni/groestl-intr-aes.h +++ b/algo/groestl/aes_ni/groestl-intr-aes.h @@ -103,7 +103,7 @@ static const v128u32_t gr_mask __attribute__ ((aligned (16))) = This implementation costs 7.7 c/b giving total speed on SNB: 10.7c/b. K. Matusiewicz, 2011/05/29 */ -#if defined(__AVX512VL__) +#if defined(VL256) #define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\ /* t_i = a_i + a_{i+1} */\ diff --git a/algo/groestl/aes_ni/groestl256-intr-aes.h b/algo/groestl/aes_ni/groestl256-intr-aes.h index 358ea104..cf5c41c3 100644 --- a/algo/groestl/aes_ni/groestl256-intr-aes.h +++ b/algo/groestl/aes_ni/groestl256-intr-aes.h @@ -95,7 +95,7 @@ static const v128u32_t gr_mask __attribute__ ((aligned (16))) = This implementation costs 7.7 c/b giving total speed on SNB: 10.7c/b. K. 
Matusiewicz, 2011/05/29 */ -#if defined(__AVX512VL__) +#if defined(VL256) #define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\ /* t_i = a_i + a_{i+1} */\ diff --git a/algo/groestl/groestl-gate.h b/algo/groestl/groestl-gate.h index 25551e60..0e00800e 100644 --- a/algo/groestl/groestl-gate.h +++ b/algo/groestl/groestl-gate.h @@ -4,7 +4,7 @@ #include "algo-gate-api.h" #include -#if defined(__VAES__) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(__VAES__) && defined(SIMD512) #define GROESTL_4WAY_VAES 1 #endif diff --git a/algo/groestl/groestl256-hash-4way.c b/algo/groestl/groestl256-hash-4way.c index 6d804cb4..30b55460 100644 --- a/algo/groestl/groestl256-hash-4way.c +++ b/algo/groestl/groestl256-hash-4way.c @@ -17,7 +17,7 @@ #if defined(__AVX2__) && defined(__VAES__) -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) int groestl256_4way_init( groestl256_4way_context* ctx, uint64_t hashlen ) diff --git a/algo/groestl/groestl256-hash-4way.h b/algo/groestl/groestl256-hash-4way.h index 1439ef18..7446d431 100644 --- a/algo/groestl/groestl256-hash-4way.h +++ b/algo/groestl/groestl256-hash-4way.h @@ -43,7 +43,7 @@ #define SIZE256 (SIZE_512/16) -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) typedef struct { __attribute__ ((aligned (128))) __m512i chaining[SIZE256]; diff --git a/algo/groestl/groestl256-intr-4way.h b/algo/groestl/groestl256-intr-4way.h index db724720..8e647ad1 100644 --- a/algo/groestl/groestl256-intr-4way.h +++ b/algo/groestl/groestl256-intr-4way.h @@ -42,7 +42,7 @@ static const __m128i round_const_l7[] __attribute__ ((aligned (64))) = { 0x0000000000000000, 0x8696a6b6c6d6e6f6 } }; -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) static const __m512i TRANSP_MASK = { 0x0d0509010c040800, 0x0f070b030e060a02, 0x1d1519111c141810, 0x1f171b131e161a12, diff --git a/algo/groestl/groestl512-hash-4way.c b/algo/groestl/groestl512-hash-4way.c index b52e02ca..5afd014a 100644 --- a/algo/groestl/groestl512-hash-4way.c +++ b/algo/groestl/groestl512-hash-4way.c @@ -17,7 +17,7 @@ #if defined(__AVX2__) && defined(__VAES__) -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) int groestl512_4way_init( groestl512_4way_context* ctx, uint64_t hashlen ) { diff --git a/algo/groestl/groestl512-hash-4way.h b/algo/groestl/groestl512-hash-4way.h index 99870c0c..68236fe8 100644 --- a/algo/groestl/groestl512-hash-4way.h +++ b/algo/groestl/groestl512-hash-4way.h @@ -33,7 +33,7 @@ #define SIZE512 (SIZE_1024/16) -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) typedef struct { __attribute__ ((aligned (128))) __m512i chaining[SIZE512]; diff --git a/algo/groestl/groestl512-intr-4way.h b/algo/groestl/groestl512-intr-4way.h index 33336a5c..99dfa895 100644 --- a/algo/groestl/groestl512-intr-4way.h +++ b/algo/groestl/groestl512-intr-4way.h @@ -50,7 +50,7 @@ static const __m128i round_const_q[] __attribute__ ((aligned (64))) = { 0x8292a2b2c2d2e2f2, 0x0212223242526272 } }; -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) static const __m512i TRANSP_MASK = { 0x0d0509010c040800, 0x0f070b030e060a02, 
0x1d1519111c141810, 0x1f171b131e161a12, diff --git a/algo/groestl/myrgr-gate.h b/algo/groestl/myrgr-gate.h index 80cc3fd8..477053b3 100644 --- a/algo/groestl/myrgr-gate.h +++ b/algo/groestl/myrgr-gate.h @@ -4,7 +4,7 @@ #include "algo-gate-api.h" #include -#if defined(__VAES__) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(__VAES__) && defined(SIMD512) #define MYRGR_8WAY 1 #elif defined(__AVX2__) && defined(__AES__) && !defined(__SHA__) #define MYRGR_4WAY 1 diff --git a/algo/hamsi/hamsi-hash-4way.c b/algo/hamsi/hamsi-hash-4way.c index b14b1281..f21f27bf 100644 --- a/algo/hamsi/hamsi-hash-4way.c +++ b/algo/hamsi/hamsi-hash-4way.c @@ -382,7 +382,7 @@ static const uint32_t T512[64][16] __attribute__ ((aligned (32))) = #define S1F MF -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) // Hamsi 8 way AVX512 @@ -1122,7 +1122,7 @@ void hamsi512_8way_close( hamsi_8way_big_context *sc, void *dst ) // Hamsi 4 way AVX2 -#if defined(__AVX512VL__) +#if defined(VL256) #define INPUT_BIG \ do { \ @@ -1501,7 +1501,7 @@ do { /* order is important */ \ sc->h[14] = CE; \ sc->h[15] = CF; -#if defined(__AVX512VL__) +#if defined(VL256) #define INPUT_8X32 \ { \ diff --git a/algo/hamsi/hamsi-hash-4way.h b/algo/hamsi/hamsi-hash-4way.h index 4850ca12..15d964ea 100644 --- a/algo/hamsi/hamsi-hash-4way.h +++ b/algo/hamsi/hamsi-hash-4way.h @@ -104,7 +104,7 @@ void hamsi512_8x32_full( hamsi512_8x32_context *sc, void *dst, const void *data, #endif -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) // Hamsi-512 8x64 diff --git a/algo/haval/haval-hash-4way.c b/algo/haval/haval-hash-4way.c index bf1fca37..651124ab 100644 --- a/algo/haval/haval-hash-4way.c +++ b/algo/haval/haval-hash-4way.c @@ -53,7 +53,7 @@ extern "C"{ #define SPH_SMALL_FOOTPRINT_HAVAL 1 //#endif -#if defined(__AVX512VL__) +#if defined(VL256) // ( ~( a ^ b ) ) & c #define v128_andnotxor( a, b, c ) \ @@ -583,7 +583,7 @@ do { \ // Haval-256 8 way 32 bit avx2 -#if defined (__AVX512VL__) +#if defined (VL256) // ( ~( a ^ b ) ) & c #define mm256_andnotxor( a, b, c ) \ @@ -882,7 +882,7 @@ do { \ #endif // AVX2 -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) // ( ~( a ^ b ) ) & c #define mm512_andnotxor( a, b, c ) \ diff --git a/algo/haval/haval-hash-4way.h b/algo/haval/haval-hash-4way.h index db14188d..0006d965 100644 --- a/algo/haval/haval-hash-4way.h +++ b/algo/haval/haval-hash-4way.h @@ -107,7 +107,7 @@ void haval256_5_8way_close( void *cc, void *dst ); #endif // AVX2 -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) typedef struct { __m512i buf[32]; diff --git a/algo/jh/jh-hash-4way.c b/algo/jh/jh-hash-4way.c index fbc30180..a468a90f 100644 --- a/algo/jh/jh-hash-4way.c +++ b/algo/jh/jh-hash-4way.c @@ -204,7 +204,7 @@ static const uint64_t IV512[] = (state)->H[15] = h7l; \ } while (0) -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) #define Sb_8W(x0, x1, x2, x3, c) \ { \ @@ -364,8 +364,7 @@ static const uint64_t IV512[] = #if defined(__AVX2__) -#if defined(__AVX512VL__) -//TODO enable for AVX10_256, not used with AVX512VL +#if defined(VL256) #define notxorandnot( a, b, c ) \ _mm256_ternarylogic_epi64( a, b, c, 0x2d ) @@ -522,7 +521,7 @@ 
static const uint64_t IV512[] = #endif // AVX2 -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) void jh256_8x64_init( jh_8x64_context *sc ) { diff --git a/algo/jh/jh-hash-4way.h b/algo/jh/jh-hash-4way.h index faf3bb87..4551e59a 100644 --- a/algo/jh/jh-hash-4way.h +++ b/algo/jh/jh-hash-4way.h @@ -55,7 +55,7 @@ * memcpy()). */ -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) typedef struct { diff --git a/algo/keccak/keccak-gate.h b/algo/keccak/keccak-gate.h index bd2b6a3f..be776eb4 100644 --- a/algo/keccak/keccak-gate.h +++ b/algo/keccak/keccak-gate.h @@ -4,7 +4,7 @@ #include "algo-gate-api.h" #include -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) #define KECCAK_8WAY 1 #elif defined(__AVX2__) #define KECCAK_4WAY 1 @@ -12,7 +12,7 @@ #define KECCAK_2WAY 1 #endif -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) #define SHA3D_8WAY 1 #elif defined(__AVX2__) #define SHA3D_4WAY 1 diff --git a/algo/keccak/keccak-hash-4way.c b/algo/keccak/keccak-hash-4way.c index 6d4a1a0d..bbffb732 100644 --- a/algo/keccak/keccak-hash-4way.c +++ b/algo/keccak/keccak-hash-4way.c @@ -57,7 +57,7 @@ static const uint64_t RC[] = { #define DO(x) x -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) #define INPUT_BUF(size) do { \ size_t j; \ diff --git a/algo/keccak/keccak-hash-4way.h b/algo/keccak/keccak-hash-4way.h index d387da03..c16465c7 100644 --- a/algo/keccak/keccak-hash-4way.h +++ b/algo/keccak/keccak-hash-4way.h @@ -4,7 +4,7 @@ #include #include "simd-utils.h" -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) typedef struct { diff --git a/algo/luffa/luffa-hash-2way.c b/algo/luffa/luffa-hash-2way.c index 45e27fa2..54df4887 100644 --- a/algo/luffa/luffa-hash-2way.c +++ b/algo/luffa/luffa-hash-2way.c @@ -59,7 +59,7 @@ static const uint32_t CNS_INIT[128] __attribute((aligned(64))) = { }; -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) #define cns4w(i) mm512_bcast_m128( ( (__m128i*)CNS_INIT)[i] ) @@ -524,8 +524,7 @@ int luffa_4way_update_close( luffa_4way_context *state, a = _mm256_xor_si256( a, c0 ); \ b = _mm256_xor_si256( b, c1 ); -//TODO Enable for AVX10_256, not used with AVX512 or AVX10_512 -#if defined(__AVX512VL__) +#if defined(VL256) #define MULT2( a0, a1 ) \ { \ diff --git a/algo/luffa/luffa-hash-2way.h b/algo/luffa/luffa-hash-2way.h index a274995f..24a2aae7 100644 --- a/algo/luffa/luffa-hash-2way.h +++ b/algo/luffa/luffa-hash-2way.h @@ -51,7 +51,7 @@ #define LIMIT_512 128 /*********************************/ -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) typedef struct { uint32_t buffer[8*4]; diff --git a/algo/luffa/luffa_for_sse2.c b/algo/luffa/luffa_for_sse2.c index ef06313a..90a3acdc 100644 --- a/algo/luffa/luffa_for_sse2.c +++ b/algo/luffa/luffa_for_sse2.c @@ -28,8 +28,7 @@ a = v128_xor( a, c0 ); \ b = v128_xor( b, c1 ); \ -#if defined(__AVX512VL__) -//TODO enable for AVX10_512 AVX10_256 +#if defined(VL256) #define MULT2( a0, a1 ) \ { \ @@ -69,8 +68,7 @@ #endif -#if defined(__AVX512VL__) -//TODO enable for AVX10_512 
AVX10_256 +#if defined(VL256) #define SUBCRUMB( a0, a1, a2, a3 ) \ { \ diff --git a/algo/luffa/luffa_for_sse2.h b/algo/luffa/luffa_for_sse2.h index bbad313f..1d404bf8 100644 --- a/algo/luffa/luffa_for_sse2.h +++ b/algo/luffa/luffa_for_sse2.h @@ -68,4 +68,4 @@ int update_and_final_luffa( hashState_luffa *state, void* output, int luffa_full( hashState_luffa *state, void* output, int hashbitlen, const void* data, size_t inlen ); -#endif // LUFFA_FOR_SSE2_H___ +#endif // LUFFA_FOR_SSE2_H__ diff --git a/algo/lyra2/allium-4way.c b/algo/lyra2/allium-4way.c index 0bf4d6c5..453ab126 100644 --- a/algo/lyra2/allium-4way.c +++ b/algo/lyra2/allium-4way.c @@ -15,7 +15,7 @@ #include "algo/groestl/sph_groestl.h" #endif -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) #define ALLIUM_16WAY 1 #elif defined(__AVX2__) #define ALLIUM_8WAY 1 diff --git a/algo/lyra2/lyra2-gate.h b/algo/lyra2/lyra2-gate.h index 6648da3e..196dd51f 100644 --- a/algo/lyra2/lyra2-gate.h +++ b/algo/lyra2/lyra2-gate.h @@ -5,7 +5,7 @@ #include #include "lyra2.h" -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) #define LYRA2REV3_16WAY 1 #elif defined(__AVX2__) #define LYRA2REV3_8WAY 1 @@ -49,7 +49,7 @@ bool init_lyra2rev3_ctx(); ////////////////////////////////// -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) #define LYRA2REV2_16WAY 1 #elif defined(__AVX2__) #define LYRA2REV2_8WAY 1 @@ -108,7 +108,7 @@ bool lyra2h_thread_init(); ///////////////////////////////////////// -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) #define PHI2_8WAY 1 #elif defined(__AVX2__) && defined(__AES__) #define PHI2_4WAY 1 diff --git a/algo/lyra2/lyra2-hash-2way.c b/algo/lyra2/lyra2-hash-2way.c index 80124bae..acd02677 100644 --- a/algo/lyra2/lyra2-hash-2way.c +++ b/algo/lyra2/lyra2-hash-2way.c @@ -41,7 +41,7 @@ // lyra2z330, lyra2h, -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) /** * Executes Lyra2 based on the G function from Blake2b. 
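A second recurring substitution, visible in the groestl, hamsi, haval, jh and luffa hunks above, replaces bare `#if defined(__AVX512VL__)` tests with `#if defined(VL256)` and drops the old "TODO enable for AVX10_256" comments at the same time. VL256 is defined in the simd-utils headers (not shown in this excerpt); judging by the removed TODOs it is meant to cover both AVX512VL and 256-bit-only AVX10 targets. A plausible sketch, with the GCC-14 AVX10 macro name treated as an assumption:

/* Sketch only: "VL256" = AVX-512VL style instructions usable on 256-bit and
 * 128-bit vectors, whether provided by AVX512VL or by a 256-bit AVX10 target. */
#if defined(__AVX512VL__) || defined(__AVX10_1_256__)   /* assumed macro name */
  #define VL256 1
#endif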
This version supports salts and passwords diff --git a/algo/lyra2/lyra2.h b/algo/lyra2/lyra2.h index 91447b6e..62a546eb 100644 --- a/algo/lyra2/lyra2.h +++ b/algo/lyra2/lyra2.h @@ -59,7 +59,7 @@ int LYRA2Z( uint64_t*, void *K, uint64_t kLen, const void *pwd, int LYRA2(void *K, int64_t kLen, const void *pwd, int32_t pwdlen, const void *salt, int32_t saltlen, int64_t timeCost, const int16_t nRows, const int16_t nCols); -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) int LYRA2RE_2WAY( void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen, uint64_t timeCost, uint64_t nRows, uint64_t nCols ); diff --git a/algo/lyra2/lyra2z-4way.c b/algo/lyra2/lyra2z-4way.c index 5c36e00e..203e2520 100644 --- a/algo/lyra2/lyra2z-4way.c +++ b/algo/lyra2/lyra2z-4way.c @@ -3,7 +3,7 @@ #include "lyra2.h" #include "algo/blake/blake256-hash.h" -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) #define LYRA2Z_16WAY 1 #elif defined(__AVX2__) #define LYRA2Z_8WAY 1 diff --git a/algo/lyra2/phi2-4way.c b/algo/lyra2/phi2-4way.c index 3d385c90..ec4e3856 100644 --- a/algo/lyra2/phi2-4way.c +++ b/algo/lyra2/phi2-4way.c @@ -4,7 +4,7 @@ #include "algo/gost/sph_gost.h" #include "algo/cubehash/cubehash_sse2.h" #include "lyra2.h" -#if defined(__VAES__) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(__VAES__) && defined(SIMD512) #include "algo/echo/echo-hash-4way.h" #elif defined(__AES__) #include "algo/echo/aes_ni/hash_api.h" diff --git a/algo/lyra2/sponge-2way.c b/algo/lyra2/sponge-2way.c index f798ef2e..1b8f3ebf 100644 --- a/algo/lyra2/sponge-2way.c +++ b/algo/lyra2/sponge-2way.c @@ -27,7 +27,7 @@ #include "lyra2.h" #include "simd-utils.h" -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) inline void squeeze_2way( uint64_t *State, byte *Out, unsigned int len ) { diff --git a/algo/lyra2/sponge.h b/algo/lyra2/sponge.h index e089ee94..7937981e 100644 --- a/algo/lyra2/sponge.h +++ b/algo/lyra2/sponge.h @@ -43,7 +43,7 @@ static const uint64_t blake2b_IV[8] = 0x1f83d9abfb41bd6bULL, 0x5be0cd19137e2179ULL }; -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) #define G2W_4X64(a,b,c,d) \ a = _mm512_add_epi64( a, b ); \ @@ -150,13 +150,13 @@ static const uint64_t blake2b_IV[8] = // returns void, all args updated #define G_2X64(a,b,c,d) \ a = v128_add64( a, b ); \ - d = v128_ror64( v128_xor( d, a), 32 ); \ + d = v128_ror64xor( d, a, 32 ); \ c = v128_add64( c, d ); \ - b = v128_ror64( v128_xor( b, c ), 24 ); \ + b = v128_ror64xor( b, c, 24 ); \ a = v128_add64( a, b ); \ - d = v128_ror64( v128_xor( d, a ), 16 ); \ + d = v128_ror64xor( d, a, 16 ); \ c = v128_add64( c, d ); \ - b = v128_ror64( v128_xor( b, c ), 63 ); + b = v128_ror64xor( b, c, 63 ); #define LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \ { \ @@ -222,7 +222,7 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){ G( r, 7, v[ 3], v[ 4], v[ 9], v[14] ); -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) union _ovly_512 { diff --git a/algo/nist5/nist5-gate.h b/algo/nist5/nist5-gate.h index 1846806d..af081b37 100644 --- a/algo/nist5/nist5-gate.h +++ b/algo/nist5/nist5-gate.h @@ -4,7 +4,7 @@ #include "algo-gate-api.h" #include -#if 
defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) #define NIST5_8WAY 1 #elif defined(__AVX2__) && defined(__AES__) #define NIST5_4WAY 1 diff --git a/algo/panama/panama-hash-4way.c b/algo/panama/panama-hash-4way.c index 98b57e13..f586cc57 100644 --- a/algo/panama/panama-hash-4way.c +++ b/algo/panama/panama-hash-4way.c @@ -71,8 +71,7 @@ do { \ } while (0) #define GAMMA_4W(n0, n1, n2, n4) \ - (g ## n0 = v128_xor( a ## n0, \ - v128_or( a ## n1, v128_not( a ## n2 ) ) ) ) + (g ## n0 = v128_xor( a ## n0, v128_ornot( a ## n2, a ## n1 ) ) ) #define PI_ALL_4W do { \ a0 = g0; \ @@ -312,7 +311,7 @@ do { \ BUPDATE1_8W( 7, 1 ); \ } while (0) -#if defined(__AVX512VL__) +#if defined(VL256) #define GAMMA_8W(n0, n1, n2, n4) \ ( g ## n0 = _mm256_ternarylogic_epi32( a ## n0, a ## n2, a ## n1, 0x4b ) ) diff --git a/algo/quark/anime-gate.h b/algo/quark/anime-gate.h index a7b08376..db80fe82 100644 --- a/algo/quark/anime-gate.h +++ b/algo/quark/anime-gate.h @@ -4,7 +4,7 @@ #include "algo-gate-api.h" #include -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) #define ANIME_8WAY 1 #elif defined(__AVX2__) && defined(__AES__) #define ANIME_4WAY 1 diff --git a/algo/quark/hmq1725-gate.h b/algo/quark/hmq1725-gate.h index bc0ff99b..fcaa9a8f 100644 --- a/algo/quark/hmq1725-gate.h +++ b/algo/quark/hmq1725-gate.h @@ -4,7 +4,7 @@ #include "algo-gate-api.h" #include -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) #define HMQ1725_8WAY 1 #elif defined(__AVX2__) && defined(__AES__) #define HMQ1725_4WAY 1 diff --git a/algo/quark/quark-gate.h b/algo/quark/quark-gate.h index 69ec5605..d39102c3 100644 --- a/algo/quark/quark-gate.h +++ b/algo/quark/quark-gate.h @@ -4,7 +4,7 @@ #include "algo-gate-api.h" #include -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) #define QUARK_8WAY 1 #elif defined(__AVX2__) && defined(__AES__) #define QUARK_4WAY 1 diff --git a/algo/qubit/qubit-gate.h b/algo/qubit/qubit-gate.h index 2a29e03d..de439842 100644 --- a/algo/qubit/qubit-gate.h +++ b/algo/qubit/qubit-gate.h @@ -5,7 +5,7 @@ #include -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) #define QUBIT_4WAY 1 #elif defined(__AVX2__) && defined(__AES__) #define QUBIT_2WAY 1 diff --git a/algo/ripemd/lbry-gate.h b/algo/ripemd/lbry-gate.h index 2aedd6b4..64eb05fc 100644 --- a/algo/ripemd/lbry-gate.h +++ b/algo/ripemd/lbry-gate.h @@ -5,7 +5,7 @@ #include -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) #define LBRY_16WAY 1 #elif defined(__AVX2__) #define LBRY_8WAY 1 diff --git a/algo/ripemd/ripemd-hash-4way.c b/algo/ripemd/ripemd-hash-4way.c index 292949c4..b60c333f 100644 --- a/algo/ripemd/ripemd-hash-4way.c +++ b/algo/ripemd/ripemd-hash-4way.c @@ -35,13 +35,13 @@ static const uint32_t IV[5] = _mm_xor_si128( _mm_and_si128( _mm_xor_si128( y, z ), x ), z ) #define F3(x, y, z) \ - _mm_xor_si128( _mm_or_si128( x, v128_not( y ) ), z ) + _mm_xor_si128( v128_ornot( y, x ), z ) #define F4(x, y, z) \ _mm_xor_si128( _mm_and_si128( _mm_xor_si128( x, y ), z ), y ) #define F5(x, y, z) \ - _mm_xor_si128( x, _mm_or_si128( y, v128_not( z ) ) ) + _mm_xor_si128( x, v128_ornot( z, y ) ) #define RR(a, b, c, d, e, f, s, r, k) \ do{ \ @@ -335,13 
+335,13 @@ void ripemd160_4way_close( ripemd160_4way_context *sc, void *dst ) _mm256_xor_si256( _mm256_and_si256( _mm256_xor_si256( y, z ), x ), z ) #define F8W_3(x, y, z) \ - _mm256_xor_si256( _mm256_or_si256( x, mm256_not( y ) ), z ) + _mm256_xor_si256( mm256_ornot( y, x ), z ) #define F8W_4(x, y, z) \ _mm256_xor_si256( _mm256_and_si256( _mm256_xor_si256( x, y ), z ), y ) #define F8W_5(x, y, z) \ - _mm256_xor_si256( x, _mm256_or_si256( y, mm256_not( z ) ) ) + _mm256_xor_si256( x, mm256_ornot( z, y ) ) #define RR_8W(a, b, c, d, e, f, s, r, k) \ do{ \ @@ -625,7 +625,7 @@ void ripemd160_8way_close( ripemd160_8way_context *sc, void *dst ) #endif // __AVX2__ -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) // RIPEMD-160 16 way diff --git a/algo/ripemd/ripemd-hash-4way.h b/algo/ripemd/ripemd-hash-4way.h index c0c87db4..2f0fceb5 100644 --- a/algo/ripemd/ripemd-hash-4way.h +++ b/algo/ripemd/ripemd-hash-4way.h @@ -33,7 +33,7 @@ void ripemd160_8way_update( ripemd160_8way_context *sc, const void *data, size_t len ); void ripemd160_8way_close( ripemd160_8way_context *sc, void *dst ); -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) typedef struct { diff --git a/algo/scrypt/scrypt-core-4way.c b/algo/scrypt/scrypt-core-4way.c index 3c83b983..633d77c0 100644 --- a/algo/scrypt/scrypt-core-4way.c +++ b/algo/scrypt/scrypt-core-4way.c @@ -745,7 +745,7 @@ do{ \ SALSA_2ROUNDS; SALSA_2ROUNDS; SALSA_2ROUNDS; SALSA_2ROUNDS; -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) // Tested OK but very slow // 16 way parallel, requires 16x32 interleaving @@ -2487,7 +2487,7 @@ static void salsa8_simd128_2buf( uint32_t * const ba, uint32_t * const bb, XA3 = BA[3] = v128_xor( BA[3], CA[3] ); XB3 = BB[3] = v128_xor( BB[3], CB[3] ); -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) SALSA_8ROUNDS_SIMD128_2BUF; @@ -2886,7 +2886,7 @@ static void salsa8_simd128_3buf( uint32_t *ba, uint32_t *bb, uint32_t *bc, XB3 = BB[3] = v128_xor( BB[3], CB[3] ); XC3 = BC[3] = v128_xor( BC[3], CC[3] ); -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) SALSA_8ROUNDS_SIMD128_3BUF; diff --git a/algo/scrypt/scrypt-core-4way.h b/algo/scrypt/scrypt-core-4way.h index 709ba674..7107bd3f 100644 --- a/algo/scrypt/scrypt-core-4way.h +++ b/algo/scrypt/scrypt-core-4way.h @@ -5,7 +5,7 @@ #include #include -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) void scrypt_core_16way( __m512i *X, __m512i *V, const uint32_t N ); diff --git a/algo/scrypt/scrypt.c b/algo/scrypt/scrypt.c index e0ab88be..13cd0aa4 100644 --- a/algo/scrypt/scrypt.c +++ b/algo/scrypt/scrypt.c @@ -35,7 +35,7 @@ //#include #include "malloc-huge.h" -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) #define SCRYPT_THROUGHPUT 16 #elif defined(__SHA__) || defined(__ARM_FEATURE_SHA2) #define SCRYPT_THROUGHPUT 2 @@ -592,7 +592,7 @@ static inline void PBKDF2_SHA256_128_32_8way( uint32_t *tstate, #endif /* HAVE_SHA256_8WAY */ -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) static inline void sha256_16way_init_state( void 
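The panama GAMMA and ripemd F3/F5 hunks above replace the explicit or( x, not( y ) ) pattern with v128_ornot / mm256_ornot helpers. From the call sites the convention appears to be ornot( a, b ) == b | ~a, i.e. the first operand is the complemented one. The real macros are in the simd-utils headers changed by this patch; the following is only a sketch of definitions consistent with that convention.

/* Sketch only: ornot( a, b ) == b | ~a. */
#if defined(__ARM_NEON)
  #define v128_ornot( a, b )   vornq_u32( b, a )                  /* ORN: b | ~a */
#elif defined(__AVX512VL__)
  #define v128_ornot( a, b )   _mm_ternarylogic_epi32( a, b, b, 0xCF )
#else
  #define v128_ornot( a, b )   _mm_or_si128( b, v128_not( a ) )
#endif
/* mm256_ornot would follow the same pattern with the 256-bit intrinsics. */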
*state ) { @@ -1494,7 +1494,7 @@ bool register_scrypt_algo( algo_gate_t* gate ) // scrypt_throughput defined at compile time and used to replace // MAX_WAYS to reduce memory usage. -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) // scrypt_throughput = 16; if ( opt_param_n > 0x4000 ) scratchbuf_size = opt_param_n * 3 * 128; // 3 buf diff --git a/algo/sha/hmac-sha256-hash-4way.c b/algo/sha/hmac-sha256-hash-4way.c index c039ac94..bc4d095e 100644 --- a/algo/sha/hmac-sha256-hash-4way.c +++ b/algo/sha/hmac-sha256-hash-4way.c @@ -306,7 +306,7 @@ pbkdf2_sha256_8way( uint8_t *buf, size_t dkLen, const uint8_t *passwd, } } -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) // HMAC 16-way AVX512 diff --git a/algo/sha/hmac-sha256-hash-4way.h b/algo/sha/hmac-sha256-hash-4way.h index c096b08f..320c27b4 100644 --- a/algo/sha/hmac-sha256-hash-4way.h +++ b/algo/sha/hmac-sha256-hash-4way.h @@ -84,7 +84,7 @@ void pbkdf2_sha256_8way( uint8_t *, size_t, const uint8_t *, size_t, #endif // AVX2 -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) typedef struct _hmac_sha256_16way_context { diff --git a/algo/sha/sha256-hash-4way.c b/algo/sha/sha256-hash-4way.c index 5e37c0b1..bfe4729f 100644 --- a/algo/sha/sha256-hash-4way.c +++ b/algo/sha/sha256-hash-4way.c @@ -580,7 +580,7 @@ void sha256_4x32_full( void *dst, const void *data, size_t len ) // to avoid recalculating it as Y^Z. This optimization is not applicable // when MAJ is optimized with ternary logic. -#if defined(__AVX512VL__) +#if defined(VL256) #define CHx(X, Y, Z) _mm256_ternarylogic_epi32( X, Y, Z, 0xca ) @@ -788,7 +788,7 @@ void sha256_8way_prehash_3rounds( __m256i *state_mid, __m256i *X, G = _mm256_load_si256( state_in + 6 ); H = _mm256_load_si256( state_in + 7 ); -#if !defined(__AVX512VL__) +#if !defined(VL256) __m256i X_xor_Y, Y_xor_Z = _mm256_xor_si256( B, C ); #endif @@ -830,7 +830,7 @@ void sha256_8way_final_rounds( __m256i *state_out, const __m256i *data, G = _mm256_load_si256( state_mid + 6 ); H = _mm256_load_si256( state_mid + 7 ); -#if !defined(__AVX512VL__) +#if !defined(VL256) __m256i X_xor_Y, Y_xor_Z = _mm256_xor_si256( F, G ); #endif @@ -936,7 +936,7 @@ int sha256_8way_transform_le_short( __m256i *state_out, const __m256i *data, const __m256i IV7 = H; const __m256i IV6 = G; -#if !defined(__AVX512VL__) +#if !defined(VL256) __m256i X_xor_Y, Y_xor_Z = _mm256_xor_si256( B, C ); #endif @@ -981,7 +981,7 @@ int sha256_8way_transform_le_short( __m256i *state_out, const __m256i *data, W[11] = SHA256_8WAY_MEXP( W[ 9], W[ 4], W[12], W[11] ); W[12] = SHA256_8WAY_MEXP( W[10], W[ 5], W[13], W[12] ); -#if !defined(__AVX512VL__) +#if !defined(VL256) Y_xor_Z = _mm256_xor_si256( B, C ); #endif @@ -1172,7 +1172,7 @@ void sha256_8way_full( void *dst, const void *data, size_t len ) sha256_8way_close( &ctx, dst ); } -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) // SHA-256 16 way diff --git a/algo/sha/sha256-hash.c b/algo/sha/sha256-hash.c index e2c10e94..5110faea 100644 --- a/algo/sha/sha256-hash.c +++ b/algo/sha/sha256-hash.c @@ -10,6 +10,28 @@ static const uint32_t SHA256_IV[8] = #if defined(__x86_64__) && defined(__SHA__) + +/* common code used for rounds 12 through 51 */ + +#define sha256_generic_qround( s0, s1, m, a, b, c ) \ + TMP = _mm_alignr_epi8( a, c, 4 ); \ + 
s1 = _mm_sha256rnds2_epu32( s1, s0, m ); \ + b = _mm_add_epi32( b, TMP ); \ + b = _mm_sha256msg2_epu32( b, a ); \ + m = _mm_shuffle_epi32( m, 0x0e ); \ + s0 = _mm_sha256rnds2_epu32( s0, s1, m ); \ + c = _mm_sha256msg1_epu32( c, a ); + +// r12-15 +// sha256_generic_qround( s0, s1, m, t3, t0, t2 ) +// r16-19 +// sha256_generic_qround( s0, s1, m, t0, t1, t3 ) +// r20-23 +// sha256_generic_qround( s0, s1, m, t1, t2, t0 ) +// r24-27 +// sha256_generic_qround( s0, s1, m, t2, t3, t1 ) ... + + #define sha256_opt_rounds( state_out, input, state_in ) \ { \ __m128i STATE0, STATE1; \ @@ -887,14 +909,14 @@ static const uint32_t K256[64] = #define sha256_neon_rounds( state_out, input, state_in ) \ { \ - uint32x4_t STATE0, STATE1, ABEF_SAVE, CDGH_SAVE; \ + uint32x4_t STATE0, STATE1, ABCD_SAVE, EFGH_SAVE; \ uint32x4_t MSG0, MSG1, MSG2, MSG3; \ uint32x4_t TMP0, TMP1, TMP2; \ \ STATE0 = vld1q_u32( state_in ); \ STATE1 = vld1q_u32( state_in+4 ); \ - ABEF_SAVE = STATE0; \ - CDGH_SAVE = STATE1; \ + ABCD_SAVE = STATE0; \ + EFGH_SAVE = STATE1; \ \ MSG0 = load_msg( input, 0 ); \ MSG1 = load_msg( input, 1 ); \ @@ -1004,8 +1026,8 @@ static const uint32_t K256[64] = TMP2 = STATE0; \ STATE0 = vsha256hq_u32( STATE0, STATE1, TMP1 ); \ STATE1 = vsha256h2q_u32( STATE1, TMP2, TMP1 ); \ - STATE0 = vaddq_u32( STATE0, ABEF_SAVE ); \ - STATE1 = vaddq_u32( STATE1, CDGH_SAVE ); \ + STATE0 = vaddq_u32( STATE0, ABCD_SAVE ); \ + STATE1 = vaddq_u32( STATE1, EFGH_SAVE ); \ vst1q_u32( state_out , STATE0 ); \ vst1q_u32( state_out+4, STATE1 ); \ } @@ -1029,8 +1051,8 @@ void sha256_neon_sha_transform_le( uint32_t *state_out, const void *input, #define sha256_neon_x2sha_rounds( state_out_X, state_out_Y, input_X, \ input_Y, state_in_X, state_in_Y ) \ { \ - uint32x4_t STATE0_X, STATE1_X, ABEF_SAVE_X, CDGH_SAVE_X; \ - uint32x4_t STATE0_Y, STATE1_Y, ABEF_SAVE_Y, CDGH_SAVE_Y; \ + uint32x4_t STATE0_X, STATE1_X, ABCD_SAVE_X, EFGH_SAVE_X; \ + uint32x4_t STATE0_Y, STATE1_Y, ABCD_SAVE_Y, EFGH_SAVE_Y; \ uint32x4_t MSG0_X, MSG1_X, MSG2_X, MSG3_X; \ uint32x4_t MSG0_Y, MSG1_Y, MSG2_Y, MSG3_Y; \ uint32x4_t TMP0_X, TMP1_X, TMP2_X; \ @@ -1040,10 +1062,10 @@ void sha256_neon_sha_transform_le( uint32_t *state_out, const void *input, STATE0_Y = vld1q_u32( state_in_Y ); \ STATE1_X = vld1q_u32( state_in_X+4 ); \ STATE1_Y = vld1q_u32( state_in_Y+4 ); \ - ABEF_SAVE_X = STATE0_X; \ - ABEF_SAVE_Y = STATE0_Y; \ - CDGH_SAVE_X = STATE1_X; \ - CDGH_SAVE_Y = STATE1_Y; \ + ABCD_SAVE_X = STATE0_X; \ + ABCD_SAVE_Y = STATE0_Y; \ + EFGH_SAVE_X = STATE1_X; \ + EFGH_SAVE_Y = STATE1_Y; \ \ MSG0_X = load_msg( input_X, 0 ); \ MSG0_Y = load_msg( input_Y, 0 ); \ @@ -1245,10 +1267,10 @@ void sha256_neon_sha_transform_le( uint32_t *state_out, const void *input, STATE0_Y = vsha256hq_u32( STATE0_Y, STATE1_Y, TMP1_Y ); \ STATE1_X = vsha256h2q_u32( STATE1_X, TMP2_X, TMP1_X ); \ STATE1_Y = vsha256h2q_u32( STATE1_Y, TMP2_Y, TMP1_Y ); \ - STATE0_X = vaddq_u32( STATE0_X, ABEF_SAVE_X ); \ - STATE0_Y = vaddq_u32( STATE0_Y, ABEF_SAVE_Y ); \ - STATE1_X = vaddq_u32( STATE1_X, CDGH_SAVE_X ); \ - STATE1_Y = vaddq_u32( STATE1_Y, CDGH_SAVE_Y ); \ + STATE0_X = vaddq_u32( STATE0_X, ABCD_SAVE_X ); \ + STATE0_Y = vaddq_u32( STATE0_Y, ABCD_SAVE_Y ); \ + STATE1_X = vaddq_u32( STATE1_X, EFGH_SAVE_X ); \ + STATE1_Y = vaddq_u32( STATE1_Y, EFGH_SAVE_Y ); \ vst1q_u32( state_out_X , STATE0_X ); \ vst1q_u32( state_out_Y , STATE0_Y ); \ vst1q_u32( state_out_X+4, STATE1_X ); \ diff --git a/algo/sha/sha256-hash.h b/algo/sha/sha256-hash.h index 4a201110..70c652a3 100644 --- a/algo/sha/sha256-hash.h +++ 
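The sha256_generic_qround macro introduced above packages the x86 SHA-NI quad-round body (two sha256rnds2 steps plus the sha256msg1/sha256msg2/alignr message-schedule work) so that rounds 12 through 51 can be written as ten calls with a rotating register triple, as the r12-15 ... r24-27 comments indicate. As an illustration only, with s0/s1, t0..t3, m and TMP assumed to be the __m128i locals of the surrounding transform and K256 the round-constant table defined in this file, the first two calls would look like this (the K add is done outside the macro, since the macro consumes m already prepared):

/* Rounds 12-15: newest block is t3, t0 is being expanded, t2 feeds msg1. */
m = _mm_add_epi32( t3, _mm_loadu_si128( (const __m128i*)&K256[12] ) );
sha256_generic_qround( s0, s1, m, t3, t0, t2 );
/* Rounds 16-19: the (a,b,c) triple rotates to t0, t1, t3. */
m = _mm_add_epi32( t0, _mm_loadu_si128( (const __m128i*)&K256[16] ) );
sha256_generic_qround( s0, s1, m, t0, t1, t3 );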
b/algo/sha/sha256-hash.h @@ -113,7 +113,7 @@ void sha256_neon_x2sha_final_rounds( uint32_t *state_out_X, #endif -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) // SHA-256 16 way x86_64 diff --git a/algo/sha/sha256d-4way.h b/algo/sha/sha256d-4way.h index ce459e9c..7ec1143e 100644 --- a/algo/sha/sha256d-4way.h +++ b/algo/sha/sha256d-4way.h @@ -4,7 +4,7 @@ #include #include "algo-gate-api.h" -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) #define SHA256D_16WAY 1 #elif defined(__SHA__) #define SHA256D_SHA 1 diff --git a/algo/sha/sha256d.h b/algo/sha/sha256d.h index 35a093d8..d20319a1 100644 --- a/algo/sha/sha256d.h +++ b/algo/sha/sha256d.h @@ -4,7 +4,7 @@ #include #include "algo-gate-api.h" -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) #define SHA256D_16WAY 1 #elif defined(__SHA__) #define SHA256D_SHA 1 diff --git a/algo/sha/sha256dt.c b/algo/sha/sha256dt.c index 24cbdb4d..9a906e00 100644 --- a/algo/sha/sha256dt.c +++ b/algo/sha/sha256dt.c @@ -6,7 +6,7 @@ #include "sha256-hash.h" #include "sph_sha2.h" -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) #define SHA256DT_16X32 1 #elif defined(__x86_64__) && defined(__SHA__) #define SHA256DT_X86_SHA256 1 diff --git a/algo/sha/sha256t-gate.h b/algo/sha/sha256t-gate.h index 5933555c..9e0bc82d 100644 --- a/algo/sha/sha256t-gate.h +++ b/algo/sha/sha256t-gate.h @@ -4,7 +4,7 @@ #include #include "algo-gate-api.h" -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) #define SHA256T_16WAY 1 #elif defined(__SHA__) #define SHA256T_SHA 1 diff --git a/algo/sha/sha512-hash-4way.c b/algo/sha/sha512-hash-4way.c index 2495c861..381219e9 100644 --- a/algo/sha/sha512-hash-4way.c +++ b/algo/sha/sha512-hash-4way.c @@ -73,29 +73,10 @@ static const uint64_t K512[80] = // Experimental. Not tested. Not reviewed. Compile tested only. -// Needs GCC-13 for compilation. -// Needs Intel Lunar lake or Arrow Lake CPU, or AMD Zen-{5,6}? for execution. +// Needs GCC-14 for compilation. +// Needs Intel Lunarlake or Arrowlake CPU, or AMD Zen-6? for execution. // Modelled after noloader sha256 implementation. -// It's not clear how SHA512 will be supported before AVX10 considering how -// dependant it is on _mm256_alignr_epi64 which is only available with AVX512VL -// until AVX10-256. 
- -#if defined(__AVX512VL__) - -#define mm256_alignr_1x64( v1, v0 ) _mm256_alignr_epi64( v1, v0, 1 ) - -#else -// Ugly workaround to make it work with AVX2 - -static const __m256i mask __attribute__ ((aligned (32))) - = { 0xffffffffffffffffull, 0ull, 0ull, 0ull }; - -#define mm256_alignr_1x64( v1, v0 ) \ - _mm256_or_si256( _mm256_and_si256( mm256_shuflr_64( v1 ), mask ), \ - _mm256_and_si256( mm256_shuflr_64( v0 ), mm256_not(mask) ) ); - -#endif void sha512_opt_transform_be( uint64_t *state_out, const void *input, const uint64_t *state_in ) @@ -109,7 +90,7 @@ void sha512_opt_transform_be( uint64_t *state_out, const void *input, TMP = _mm256_load_si256( (__m256i*) &state_in[0] ); STATE1 = _mm256_load_si256( (__m256i*) &state_in[4] ); BSWAP64 = mm256_bcast_m128( _mm_set_epi64x( 0x08090a0b0c0d0e0f, - 0x0001020304050607 ) ) + 0x0001020304050607 ) ); TMP = _mm256_permute4x64_epi64( TMP, 0xB1 ); // CDAB STATE1 = _mm256_permute4x64_epi64( STATE1, 0x1B ); // EFGH STATE0 = _mm256_permute2x128_si256( TMP, STATE1, 0x21 ); // ABEF @@ -123,153 +104,233 @@ void sha512_opt_transform_be( uint64_t *state_out, const void *input, TMSG0 = _mm256_load_si256( (const __m256i*) (input+0) ); TMSG0 = _mm256_shuffle_epi8( TMSG0, BSWAP64 ); MSG = _mm256_add_epi64( TMSG0, casti_m256i( K512, 0 ) ); - STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG ); + STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, + _mm256_castsi256_si128 (MSG ) ); MSG = _mm256_permute4x64_epi64( MSG, 0x0E ); - STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG ); + STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, + _mm256_castsi256_si128( MSG ) ); // Rounds 4-7 TMSG1 = _mm256_load_si256( (const __m256i*) (input+16) ); TMSG1 = _mm256_shuffle_epi8( TMSG1, BSWAP64 ); MSG = _mm256_add_epi64( TMSG1, casti_m256i( K512, 1 ) ); - STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG ); + STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, + _mm256_castsi256_si128( MSG ) ); MSG = _mm256_permute4x64_epi64( MSG, 0x0E ); - STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG ); - TMSG0 = _mm256_sha512msg1_epi64( TMSG0, TMSG1 ); + STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, + _mm256_castsi256_si128( MSG ) ); + TMSG0 = _mm256_sha512msg1_epi64( TMSG0, _mm256_castsi256_si128( TMSG1 ) ); // Rounds 8-11 TMSG2 = _mm256_load_si256( (const __m256i*) (input+32) ); TMSG2 = _mm256_shuffle_epi8( TMSG2, BSWAP64 ); MSG = _mm256_add_epi64( TMSG2, casti_m256i( K512, 2 ) ); - STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG ); + STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, + _mm256_castsi256_si128( MSG ) ); MSG = _mm256_permute4x64_epi64( MSG, 0x0E ); - STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG ); - TMSG1 = _mm256_sha512msg1_epi64( TMSG1, TMSG2 ); + STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, + _mm256_castsi256_si128( MSG ) ); + TMSG1 = _mm256_sha512msg1_epi64( TMSG1, _mm256_castsi256_si128( TMSG2 ) ); // Rounds 12-15 TMSG3 = _mm256_load_si256( (const __m256i*) (input+48) ); TMSG3 = _mm256_shuffle_epi8( TMSG3, BSWAP64 ); MSG = _mm256_add_epi64( TMSG3, casti_m256i( K512, 3 ) ); - STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG ); - TMP = _mm256_shuffle2_64( TMSG3, TMSG2, 1 ); + STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, + _mm256_castsi256_si128( MSG ) ); + TMP = mm256_shuffle2_64( TMSG3, TMSG2, 1 ); TMSG0 = _mm256_add_epi32( TMSG0, TMP ); TMSG0 = _mm256_sha512msg2_epi64( TMSG0, TMSG3 ); MSG = _mm256_permute4x64_epi64( MSG, 0x0E ); - STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG ); - TMSG2 = 
_mm256_sha512msg1_epi64( TMSG2, TMSG3 ); + STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, + _mm256_castsi256_si128( MSG ) ); + TMSG2 = _mm256_sha512msg1_epi64( TMSG2, _mm256_castsi256_si128( TMSG3 ) ); // Rounds 16-19 MSG = _mm256_add_epi64( TMSG0, casti_m256i( K512, 4 ) ); - STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG ); - TMP = mm256_alignr_1x64( TMSG0, TMSG3 ); + STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, + _mm256_castsi256_si128( MSG ) ); + TMP = mm256_alignr64( TMSG0, TMSG3, 1 ); TMSG1 = _mm256_add_epi64( TMSG1, TMP ); TMSG1 = _mm256_sha512msg2_epi64( TMSG1, TMSG0 ); MSG = _mm256_permute4x64_epi64( MSG, 0x0E ); - STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG ); - TMSG3 = _mm256_sha512msg1_epi64( TMSG3, TMSG0 ); + STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, + _mm256_castsi256_si128( MSG ) ); + TMSG3 = _mm256_sha512msg1_epi64( TMSG3, _mm256_castsi256_si128( TMSG0 ) ); // Rounds 20-23 MSG = _mm256_add_epi64( TMSG1, casti_m256i( K512, 5 ) ); - STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG ); - TMP = mm256_alignr_1x64( TMSG1, TMSG0 ); + STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, + _mm256_castsi256_si128( MSG ) ); + TMP = mm256_alignr64( TMSG1, TMSG0, 1 ); TMSG2 = _mm256_add_epi64( TMSG2, TMP ); TMSG2 = _mm256_sha512msg2_epi64( TMSG2, TMSG1 ); MSG = _mm256_permute4x64_epi64( MSG, 0x0E ); - STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG ); - TMSG0 = _mm256_sha512msg1_epi64( TMSG0, TMSG1 ); + STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, + _mm256_castsi256_si128( MSG ) ); + TMSG0 = _mm256_sha512msg1_epi64( TMSG0, _mm256_castsi256_si128( TMSG1 ) ); // Rounds 24-27 MSG = _mm256_add_epi64( TMSG2, casti_m256i( K512, 6 ) ); - STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG ); - TMP = mm256_alignr_1x64( TMSG2, TMSG1 ); + STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, + _mm256_castsi256_si128( MSG ) ); + TMP = mm256_alignr64( TMSG2, TMSG1, 1 ); TMSG3 = _mm256_add_epi32( TMSG3, TMP ); TMSG3 = _mm256_sha512msg2_epi64( TMSG3, TMSG2 ); MSG = _mm256_permute4x64_epi64( MSG, 0x0E ); - STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG ); - TMSG1 = _mm256_sha512msg1_epi64( TMSG1, TMSG2 ); + STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, + _mm256_castsi256_si128( MSG ) ); + TMSG1 = _mm256_sha512msg1_epi64( TMSG1, _mm256_castsi256_si128( TMSG2 ) ); // Rounds 28-31 MSG = _mm256_add_epi64( TMSG3, casti_m256i( K512, 7 ) ); - STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG) ; - TMP = mm256_alignr_1x64( TMSG3, TMSG2 ); + STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, + _mm256_castsi256_si128( MSG ) ); + TMP = mm256_alignr64( TMSG3, TMSG2, 1 ); TMSG0 = _mm256_add_epi64( TMSG0, TMP ); TMSG0 = _mm256_sha512msg2_epi64( TMSG0, TMSG3 ); MSG = _mm256_permute4x64_epi64( MSG, 0x0E ); - STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG ); - TMSG2 = _mm256_sha512msg1_epi64( TMSG2, TMSG3 ); + STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, + _mm256_castsi256_si128( MSG ) ); + TMSG2 = _mm256_sha512msg1_epi64( TMSG2, _mm256_castsi256_si128( TMSG3 ) ); // Rounds 32-35 MSG = _mm256_add_epi64( TMSG0, casti_m256i( K512, 8 ) ); - STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG ); - TMP = mm256_alignr_1x64( TMSG0, TMSG3 ); + STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, + _mm256_castsi256_si128( MSG ) ); + TMP = mm256_alignr64( TMSG0, TMSG3, 1 ); TMSG1 = _mm256_add_epi64( TMSG1, TMP ); TMSG1 = _mm256_sha512msg2_epi64( TMSG1, TMSG0 ); MSG = _mm256_permute4x64_epi64( MSG, 0x0E ); - STATE0 = _mm256_sha512rnds2_epi64( 
STATE0, STATE1, MSG ); - TMSG3 = _mm256_sha512msg1_epi64( TMSG3, TMSG0 ); + STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, + _mm256_castsi256_si128( MSG ) ); + TMSG3 = _mm256_sha512msg1_epi64( TMSG3, _mm256_castsi256_si128( TMSG0 ) ); // Rounds 36-39 MSG = _mm256_add_epi64( TMSG1, casti_m256i( K512, 9 ) ); - STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG ); - TMP = mm256_alignr_1x64( TMSG1, TMSG0 ); + STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, + _mm256_castsi256_si128( MSG ) ); + TMP = mm256_alignr64( TMSG1, TMSG0, 1 ); TMSG2 = _mm256_add_epi64( TMSG2, TMP ); TMSG2 = _mm256_sha512msg2_epi64( TMSG2, TMSG1 ); MSG = _mm256_permute4x64_epi64( MSG, 0x0E ); - STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG ); - TMSG0 = _mm256_sha512msg1_epi64( TMSG0, TMSG1 ); + STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, + _mm256_castsi256_si128( MSG ) ); + TMSG0 = _mm256_sha512msg1_epi64( TMSG0, _mm256_castsi256_si128( TMSG1 ) ); // Rounds 40-43 MSG = _mm256_add_epi64( TMSG2, casti_m256i( K512, 10 ) ); - STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG ); - TMP = mm256_alignr_1x64( TMSG2, TMSG1 ); + STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, + _mm256_castsi256_si128( MSG ) ); + TMP = mm256_alignr64( TMSG2, TMSG1, 1 ); TMSG3 = _mm256_add_epi64( TMSG3, TMP ); TMSG3 = _mm256_sha512msg2_epi64( TMSG3, TMSG2 ); MSG = _mm256_permute4x64_epi64( MSG, 0x0E ); - STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG ); - TMSG1 = _mm256_sha512msg1_epi64( TMSG1, TMSG2 ); + STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, + _mm256_castsi256_si128( MSG ) ); + TMSG1 = _mm256_sha512msg1_epi64( TMSG1, _mm256_castsi256_si128( TMSG2 ) ); // Rounds 44-47 MSG = _mm256_add_epi64( TMSG3, casti_m256i( K512, 11 ) ); - STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG ); - TMP = mm256_alignr_1x64( TMSG3, TMSG2 ); + STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, + _mm256_castsi256_si128( MSG ) ); + TMP = mm256_alignr64( TMSG3, TMSG2, 1 ); TMSG0 = _mm256_add_epi64( TMSG0, TMP ); TMSG0 = _mm256_sha512msg2_epi64( TMSG0, TMSG3 ); MSG = _mm256_permute4x64_epi64( MSG, 0x0E ); - STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG ); - TMSG2 = _mm256_sha512msg1_epi64( TMSG2, TMSG3 ); + STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, + _mm256_castsi256_si128( MSG ) ); + TMSG2 = _mm256_sha512msg1_epi64( TMSG2, _mm256_castsi256_si128( TMSG3 ) ); // Rounds 48-51 MSG = _mm256_add_epi64( TMSG0, casti_m256i( K512, 12 ) ); - STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG ); - TMP = mm256_alignr_1x64( TMSG0, TMSG3 ); + STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, + _mm256_castsi256_si128( MSG ) ); + TMP = mm256_alignr64( TMSG0, TMSG3, 1 ); TMSG1 = _mm256_add_epi64( TMSG1, TMP ); TMSG1 = _mm256_sha512msg2_epi64( TMSG1, TMSG0 ); MSG = _mm256_permute4x64_epi64( MSG, 0x0E ); - STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG ); - TMSG3 = _mm256_sha512msg1_epi64( TMSG3, TMSG0 ); + STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, + _mm256_castsi256_si128( MSG ) ); + TMSG3 = _mm256_sha512msg1_epi64( TMSG3, _mm256_castsi256_si128( TMSG0 ) ); // Rounds 52-55 MSG = _mm256_add_epi64( TMSG1, casti_m256i( K512, 13 ) ); - STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG ); - TMP = mm256_alignr_1x64( TMSG1, TMSG0 ); + STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, + _mm256_castsi256_si128( MSG ) ); + TMP = mm256_alignr64( TMSG1, TMSG0, 1 ); TMSG2 = _mm256_add_epi64( TMSG2, TMP ); TMSG2 = _mm256_sha512msg2_epi64( TMSG2, TMSG1 ); MSG = _mm256_permute4x64_epi64( MSG, 0x0E ); - 
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG ); + STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, + _mm256_castsi256_si128( MSG ) ); + TMSG0 = _mm256_sha512msg1_epi64( TMSG0, _mm256_castsi256_si128( TMSG1 ) ); // Rounds 56-59 MSG = _mm256_add_epi64( TMSG2, casti_m256i( K512, 14 ) ); - STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG ); - TMP = mm256_alignr_1x64( TMSG2, TMSG1 ) ; + STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, + _mm256_castsi256_si128( MSG ) ); + TMP = mm256_alignr64( TMSG2, TMSG1, 1 ); TMSG3 = _mm256_add_epi64( TMSG3, TMP ); TMSG3 = _mm256_sha512msg2_epi64( TMSG3, TMSG2 ); MSG = _mm256_permute4x64_epi64( MSG, 0x0E ); - STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG ); + STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, + _mm256_castsi256_si128( MSG ) ); + TMSG1 = _mm256_sha512msg1_epi64( TMSG1, _mm256_castsi256_si128( TMSG2 ) ); // Rounds 60-63 MSG = _mm256_add_epi64( TMSG3, casti_m256i( K512, 15 ) ); - STATE1 = _mm256_sha512nds2_epi64( STATE1, STATE0, MSG ); + STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, + _mm256_castsi256_si128( MSG ) ); + TMP = mm256_alignr64( TMSG3, TMSG2, 1 ); + TMSG0 = _mm256_add_epi64( TMSG0, TMP ); + TMSG0 = _mm256_sha512msg2_epi64( TMSG0, TMSG3 ); + MSG = _mm256_permute4x64_epi64( MSG, 0x0E ); + STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, + _mm256_castsi256_si128( MSG ) ); + TMSG2 = _mm256_sha512msg1_epi64( TMSG2, _mm256_castsi256_si128( TMSG3 ) ); + + // Rounds 64-67 + MSG = _mm256_add_epi64( TMSG0, casti_m256i( K512, 16 ) ); + STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, + _mm256_castsi256_si128( MSG ) ); + TMP = mm256_alignr64( TMSG0, TMSG3, 1 ); + TMSG1 = _mm256_add_epi64( TMSG1, TMP ); + TMSG1 = _mm256_sha512msg2_epi64( TMSG1, TMSG0 ); + MSG = _mm256_permute4x64_epi64( MSG, 0x0E ); + STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, + _mm256_castsi256_si128( MSG ) ); + TMSG3 = _mm256_sha512msg1_epi64( TMSG3, _mm256_castsi256_si128( TMSG0 ) ); + + // Rounds 68-71 + MSG = _mm256_add_epi64( TMSG1, casti_m256i( K512, 17 ) ); + STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, + _mm256_castsi256_si128( MSG ) ); + TMP = mm256_alignr64( TMSG1, TMSG0, 1 ); + TMSG2 = _mm256_add_epi64( TMSG2, TMP ); + TMSG2 = _mm256_sha512msg2_epi64( TMSG2, TMSG1 ); MSG = _mm256_permute4x64_epi64( MSG, 0x0E ); - STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG ); + STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, + _mm256_castsi256_si128( MSG ) ); + + // Rounds 72-75 + MSG = _mm256_add_epi64( TMSG2, casti_m256i( K512, 18 ) ); + STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, + _mm256_castsi256_si128( MSG ) ); + TMP = mm256_alignr64( TMSG2, TMSG1, 1 ); + TMSG3 = _mm256_add_epi64( TMSG3, TMP ); + TMSG3 = _mm256_sha512msg2_epi64( TMSG3, TMSG2 ); + MSG = _mm256_permute4x64_epi64( MSG, 0x0E ); + STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, + _mm256_castsi256_si128( MSG ) ); + + // Rounds 76-79 + MSG = _mm256_add_epi64( TMSG3, casti_m256i( K512, 19 ) ); + STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, + _mm256_castsi256_si128( MSG ) ); + MSG = _mm256_permute4x64_epi64( MSG, 0x0E ); + STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, + _mm256_castsi256_si128( MSG ) ); // Add initial state STATE0 = _mm256_add_epi64( STATE0, ABEF_SAVE ); @@ -289,7 +350,7 @@ void sha512_opt_transform_le( uint64_t *state_out, const void *input, const uint64_t *state_in ) { __m256i STATE0, STATE1; - __m256i MSG, TMP, BSWAP64; + __m256i MSG, TMP; __m256i TMSG0, TMSG1, TMSG2, TMSG3; __m256i ABEF_SAVE, CDGH_SAVE; @@ -308,141 
+369,190 @@ void sha512_opt_transform_le( uint64_t *state_out, const void *input, // Rounds 0-3 TMSG0 = _mm256_load_si256( (const __m256i*) (input+0) ); MSG = _mm256_add_epi64( TMSG0, casti_m256i( K512, 0 ) ); - STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG ); + STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, _mm256_castsi256_si128( MSG ) ); MSG = _mm256_permute4x64_epi64( MSG, 0x0E ); - STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG ); + STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, _mm256_castsi256_si128( MSG ) ); // Rounds 4-7 TMSG1 = _mm256_load_si256( (const __m256i*) (input+16) ); MSG = _mm256_add_epi64( TMSG1, casti_m256i( K512, 1 ) ); - STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG ); + STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, _mm256_castsi256_si128( MSG ) ); MSG = _mm256_permute4x64_epi64( MSG, 0x0E ); - STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG ); - TMSG0 = _mm256_sha512msg1_epi64( TMSG0, TMSG1 ); + STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, _mm256_castsi256_si128( MSG ) ); + TMSG0 = _mm256_sha512msg1_epi64( TMSG0, _mm256_castsi256_si128( TMSG1 ) ); // Rounds 8-11 TMSG2 = _mm256_load_si256( (const __m256i*) (input+32) ); MSG = _mm256_add_epi64( TMSG2, casti_m256i( K512, 2 ) ); - STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG ); + STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, _mm256_castsi256_si128( MSG ) ); MSG = _mm256_permute4x64_epi64( MSG, 0x0E ); - STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG ); - TMSG1 = _mm256_sha512msg1_epi64( TMSG1, TMSG2 ); + STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, _mm256_castsi256_si128( MSG ) ); + TMSG1 = _mm256_sha512msg1_epi64( TMSG1, _mm256_castsi256_si128( TMSG2 ) ); // Rounds 12-15 TMSG3 = _mm256_load_si256( (const __m256i*) (input+48) ); MSG = _mm256_add_epi64( TMSG3, casti_m256i( K512, 3 ) ); - STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG ); - TMP = _mm256_shuffle2_64( TMSG3, TMSG2, 1 ); + STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, _mm256_castsi256_si128( MSG ) ); + TMP = mm256_shuffle2_64( TMSG3, TMSG2, 1 ); TMSG0 = _mm256_add_epi32( TMSG0, TMP ); TMSG0 = _mm256_sha512msg2_epi64( TMSG0, TMSG3 ); MSG = _mm256_permute4x64_epi64( MSG, 0x0E ); - STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG ); - TMSG2 = _mm256_sha512msg1_epi64( TMSG2, TMSG3 ); + STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, _mm256_castsi256_si128( MSG ) ); + TMSG2 = _mm256_sha512msg1_epi64( TMSG2, _mm256_castsi256_si128( TMSG3 ) ); // Rounds 16-19 MSG = _mm256_add_epi64( TMSG0, casti_m256i( K512, 4 ) ); - STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG ); - TMP = mm256_alignr_1x64( TMSG0, TMSG3 ); + STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, _mm256_castsi256_si128( MSG ) ); + TMP = mm256_alignr64( TMSG0, TMSG3, 1 ); TMSG1 = _mm256_add_epi64( TMSG1, TMP ); TMSG1 = _mm256_sha512msg2_epi64( TMSG1, TMSG0 ); MSG = _mm256_permute4x64_epi64( MSG, 0x0E ); - STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG ); - TMSG3 = _mm256_sha512msg1_epi64( TMSG3, TMSG0 ); + STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, _mm256_castsi256_si128( MSG ) ); + TMSG3 = _mm256_sha512msg1_epi64( TMSG3, _mm256_castsi256_si128( TMSG0 ) ); // Rounds 20-23 MSG = _mm256_add_epi64( TMSG1, casti_m256i( K512, 5 ) ); - STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG ); - TMP = mm256_alignr_1x64( TMSG1, TMSG0 ); + STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, _mm256_castsi256_si128( MSG ) ); + TMP = mm256_alignr64( TMSG1, TMSG0, 1 ); TMSG2 = 
_mm256_add_epi64( TMSG2, TMP ); TMSG2 = _mm256_sha512msg2_epi64( TMSG2, TMSG1 ); MSG = _mm256_permute4x64_epi64( MSG, 0x0E ); - STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG ); - TMSG0 = _mm256_sha512msg1_epi64( TMSG0, TMSG1 ); + STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, _mm256_castsi256_si128( MSG ) ); + TMSG0 = _mm256_sha512msg1_epi64( TMSG0, _mm256_castsi256_si128( TMSG1 ) ); // Rounds 24-27 MSG = _mm256_add_epi64( TMSG2, casti_m256i( K512, 6 ) ); - STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG ); - TMP = mm256_alignr_1x64( TMSG2, TMSG1 ); + STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, _mm256_castsi256_si128( MSG ) ); + TMP = mm256_alignr64( TMSG2, TMSG1, 1 ); TMSG3 = _mm256_add_epi32( TMSG3, TMP ); TMSG3 = _mm256_sha512msg2_epi64( TMSG3, TMSG2 ); MSG = _mm256_permute4x64_epi64( MSG, 0x0E ); - STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG ); - TMSG1 = _mm256_sha512msg1_epi64( TMSG1, TMSG2 ); + STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, _mm256_castsi256_si128( MSG ) ); + TMSG1 = _mm256_sha512msg1_epi64( TMSG1, _mm256_castsi256_si128( TMSG2 ) ); // Rounds 28-31 MSG = _mm256_add_epi64( TMSG3, casti_m256i( K512, 7 ) ); - STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG) ; - TMP = mm256_alignr_1x64( TMSG3, TMSG2 ); + STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, _mm256_castsi256_si128( MSG ) ); + TMP = mm256_alignr64( TMSG3, TMSG2, 1 ); TMSG0 = _mm256_add_epi64( TMSG0, TMP ); TMSG0 = _mm256_sha512msg2_epi64( TMSG0, TMSG3 ); MSG = _mm256_permute4x64_epi64( MSG, 0x0E ); - STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG ); - TMSG2 = _mm256_sha512msg1_epi64( TMSG2, TMSG3 ); + STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, _mm256_castsi256_si128( MSG ) ); + TMSG2 = _mm256_sha512msg1_epi64( TMSG2, _mm256_castsi256_si128( TMSG3 ) ); // Rounds 32-35 MSG = _mm256_add_epi64( TMSG0, casti_m256i( K512, 8 ) ); - STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG ); - TMP = mm256_alignr_1x64( TMSG0, TMSG3 ); + STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, _mm256_castsi256_si128( MSG ) ); + TMP = mm256_alignr64( TMSG0, TMSG3, 1 ); TMSG1 = _mm256_add_epi64( TMSG1, TMP ); TMSG1 = _mm256_sha512msg2_epi64( TMSG1, TMSG0 ); MSG = _mm256_permute4x64_epi64( MSG, 0x0E ); - STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG ); - TMSG3 = _mm256_sha512msg1_epi64( TMSG3, TMSG0 ); + STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, _mm256_castsi256_si128( MSG ) ); + TMSG3 = _mm256_sha512msg1_epi64( TMSG3, _mm256_castsi256_si128( TMSG0 ) ); // Rounds 36-39 MSG = _mm256_add_epi64( TMSG1, casti_m256i( K512, 9 ) ); - STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG ); - TMP = mm256_alignr_1x64( TMSG1, TMSG0 ); + STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, _mm256_castsi256_si128( MSG ) ); + TMP = mm256_alignr64( TMSG1, TMSG0, 1 ); TMSG2 = _mm256_add_epi64( TMSG2, TMP ); TMSG2 = _mm256_sha512msg2_epi64( TMSG2, TMSG1 ); MSG = _mm256_permute4x64_epi64( MSG, 0x0E ); - STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG ); - TMSG0 = _mm256_sha512msg1_epi64( TMSG0, TMSG1 ); + STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, _mm256_castsi256_si128( MSG ) ); + TMSG0 = _mm256_sha512msg1_epi64( TMSG0, _mm256_castsi256_si128( TMSG1 ) ); // Rounds 40-43 MSG = _mm256_add_epi64( TMSG2, casti_m256i( K512, 10 ) ); - STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG ); - TMP = mm256_alignr_1x64( TMSG2, TMSG1 ); + STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, _mm256_castsi256_si128( MSG ) ); + TMP = mm256_alignr64( TMSG2, TMSG1, 1 
); TMSG3 = _mm256_add_epi64( TMSG3, TMP ); TMSG3 = _mm256_sha512msg2_epi64( TMSG3, TMSG2 ); MSG = _mm256_permute4x64_epi64( MSG, 0x0E ); - STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG ); - TMSG1 = _mm256_sha512msg1_epi64( TMSG1, TMSG2 ); + STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, _mm256_castsi256_si128( MSG ) ); + TMSG1 = _mm256_sha512msg1_epi64( TMSG1, _mm256_castsi256_si128( TMSG2 ) ); // Rounds 44-47 MSG = _mm256_add_epi64( TMSG3, casti_m256i( K512, 11 ) ); - STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG ); - TMP = mm256_alignr_1x64( TMSG3, TMSG2 ); + STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, _mm256_castsi256_si128( MSG ) ); + TMP = mm256_alignr64( TMSG3, TMSG2, 1 ); TMSG0 = _mm256_add_epi64( TMSG0, TMP ); TMSG0 = _mm256_sha512msg2_epi64( TMSG0, TMSG3 ); MSG = _mm256_permute4x64_epi64( MSG, 0x0E ); - STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG ); - TMSG2 = _mm256_sha512msg1_epi64( TMSG2, TMSG3 ); + STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, _mm256_castsi256_si128( MSG ) ); + TMSG2 = _mm256_sha512msg1_epi64( TMSG2, _mm256_castsi256_si128( TMSG3 ) ); // Rounds 48-51 MSG = _mm256_add_epi64( TMSG0, casti_m256i( K512, 12 ) ); - STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG ); - TMP = mm256_alignr_1x64( TMSG0, TMSG3 ); + STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, _mm256_castsi256_si128( MSG ) ); + TMP = mm256_alignr64( TMSG0, TMSG3, 1 ); TMSG1 = _mm256_add_epi64( TMSG1, TMP ); TMSG1 = _mm256_sha512msg2_epi64( TMSG1, TMSG0 ); MSG = _mm256_permute4x64_epi64( MSG, 0x0E ); - STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG ); - TMSG3 = _mm256_sha512msg1_epi64( TMSG3, TMSG0 ); + STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, _mm256_castsi256_si128( MSG ) ); + TMSG3 = _mm256_sha512msg1_epi64( TMSG3, _mm256_castsi256_si128( TMSG0 ) ); + + // Rounds 52-55 + MSG = _mm256_add_epi64( TMSG1, casti_m256i( K512, 13 ) ); + STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, _mm256_castsi256_si128( MSG ) ); + TMP = mm256_alignr64( TMSG1, TMSG0, 1 ); + TMSG2 = _mm256_add_epi64( TMSG2, TMP ); + TMSG2 = _mm256_sha512msg2_epi64( TMSG2, TMSG1 ); + MSG = _mm256_permute4x64_epi64( MSG, 0x0E ); + STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, _mm256_castsi256_si128( MSG ) ); + TMSG0 = _mm256_sha512msg1_epi64( TMSG0, _mm256_castsi256_si128( TMSG1 ) ); // Rounds 56-59 MSG = _mm256_add_epi64( TMSG2, casti_m256i( K512, 14 ) ); - STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG ); - TMP = mm256_alignr_1x64( TMSG2, TMSG1 ) ; + STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, _mm256_castsi256_si128( MSG ) ); + TMP = mm256_alignr64( TMSG2, TMSG1, 1 ); TMSG3 = _mm256_add_epi64( TMSG3, TMP ); TMSG3 = _mm256_sha512msg2_epi64( TMSG3, TMSG2 ); MSG = _mm256_permute4x64_epi64( MSG, 0x0E ); - STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG ); + STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, _mm256_castsi256_si128( MSG ) ); + TMSG1 = _mm256_sha512msg1_epi64( TMSG1, _mm256_castsi256_si128( TMSG2 ) ); // Rounds 60-63 MSG = _mm256_add_epi64( TMSG3, casti_m256i( K512, 15 ) ); - STATE1 = _mm256_sha512nds2_epi64( STATE1, STATE0, MSG ); + STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, _mm256_castsi256_si128( MSG ) ); + TMP = mm256_alignr64( TMSG3, TMSG2, 1 ); + TMSG0 = _mm256_add_epi64( TMSG0, TMP ); + TMSG0 = _mm256_sha512msg2_epi64( TMSG0, TMSG3 ); + MSG = _mm256_permute4x64_epi64( MSG, 0x0E ); + STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, _mm256_castsi256_si128( MSG ) ); + TMSG2 = _mm256_sha512msg1_epi64( TMSG2, 
_mm256_castsi256_si128( TMSG3 ) ); + + // Rounds 64-67 + MSG = _mm256_add_epi64( TMSG0, casti_m256i( K512, 16 ) ); + STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, _mm256_castsi256_si128( MSG ) ); + TMP = mm256_alignr64( TMSG0, TMSG3, 1 ); + TMSG1 = _mm256_add_epi64( TMSG1, TMP ); + TMSG1 = _mm256_sha512msg2_epi64( TMSG1, TMSG0 ); + MSG = _mm256_permute4x64_epi64( MSG, 0x0E ); + STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, _mm256_castsi256_si128( MSG ) ); + TMSG3 = _mm256_sha512msg1_epi64( TMSG3, _mm256_castsi256_si128( TMSG0 ) ); + + // Rounds 68-71 + MSG = _mm256_add_epi64( TMSG1, casti_m256i( K512, 17 ) ); + STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, _mm256_castsi256_si128( MSG ) ); + TMP = mm256_alignr64( TMSG1, TMSG0, 1 ); + TMSG2 = _mm256_add_epi64( TMSG2, TMP ); + TMSG2 = _mm256_sha512msg2_epi64( TMSG2, TMSG1 ); + MSG = _mm256_permute4x64_epi64( MSG, 0x0E ); + STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, _mm256_castsi256_si128( MSG ) ); + + // Rounds 72-75 + MSG = _mm256_add_epi64( TMSG2, casti_m256i( K512, 18 ) ); + STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, _mm256_castsi256_si128( MSG ) ); + TMP = mm256_alignr64( TMSG2, TMSG1, 1 ); + TMSG3 = _mm256_add_epi64( TMSG3, TMP ); + TMSG3 = _mm256_sha512msg2_epi64( TMSG3, TMSG2 ); + MSG = _mm256_permute4x64_epi64( MSG, 0x0E ); + STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, _mm256_castsi256_si128( MSG ) ); + + // Rounds 76-79 + MSG = _mm256_add_epi64( TMSG3, casti_m256i( K512, 19 ) ); + STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, _mm256_castsi256_si128( MSG ) ); MSG = _mm256_permute4x64_epi64( MSG, 0x0E ); - STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG ); + STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, _mm256_castsi256_si128( MSG ) ); // Add initial state STATE0 = _mm256_add_epi64( STATE0, ABEF_SAVE ); @@ -462,7 +572,7 @@ void sha512_opt_transform_le( uint64_t *state_out, const void *input, #endif -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) // SHA-512 8 way 64 bit @@ -664,8 +774,7 @@ void sha512_8x64_ctx( sha512_8x64_context *sc, void *dst, const void *data, mm256_ror_64( x, 61 ), \ _mm256_srli_epi64( x, 6 ) ) -#if defined(__AVX512VL__) -//TODO Enable for AVX10_256 +#if defined(VL256) // 4 way is not used whith AVX512 but will be whith AVX10_256 when it // becomes available. 
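Here and in several hunks below, ad hoc __AVX512VL__ tests are replaced by the VL256 gate this patch introduces (described in the simd-utils.h changes at the end of this patch), which marks AVX-512VL-class instructions applied to 256- and 128-bit vectors so the same paths can later be enabled for AVX10-256. A hedged sketch of the kind of construct these gates select, here a one-instruction three-way XOR with its plain AVX2 fallback, using the same 0x96 ternary-logic pattern the yespower hunks below rely on:

#include <immintrin.h>

// Sketch only, not taken from this file.
static inline __m256i xor3_256_sketch( __m256i a, __m256i b, __m256i c )
{
#if defined(VL256)   // AVX512VL today, AVX10-256 eventually
   return _mm256_ternarylogic_epi32( a, b, c, 0x96 );       // a ^ b ^ c in one instruction
#else                // plain AVX2
   return _mm256_xor_si256( a, _mm256_xor_si256( b, c ) );
#endif
}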
@@ -717,7 +826,7 @@ sha512_4x64_round( sha512_4x64_context *ctx, __m256i *in, __m256i r[8] ) int i; register __m256i A, B, C, D, E, F, G, H; -#if !defined(__AVX512VL__) +#if !defined(VL256) // Disable for AVX10_256 __m256i X_xor_Y, Y_xor_Z; #endif @@ -754,7 +863,7 @@ sha512_4x64_round( sha512_4x64_context *ctx, __m256i *in, __m256i r[8] ) H = v256_64( 0x5BE0CD19137E2179 ); } -#if !defined(__AVX512VL__) +#if !defined(VL256) // Disable for AVX10_256 Y_xor_Z = _mm256_xor_si256( B, C ); #endif diff --git a/algo/sha/sha512-hash.h b/algo/sha/sha512-hash.h index bb8d8665..0f8cda91 100644 --- a/algo/sha/sha512-hash.h +++ b/algo/sha/sha512-hash.h @@ -25,7 +25,7 @@ void sha512_opt_transform_le( uint64_t *state_out, const void *input, #endif -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) // SHA-512 8 way diff --git a/algo/sha/sha512256d-4way.c b/algo/sha/sha512256d-4way.c index 8d38da76..62edcac1 100644 --- a/algo/sha/sha512256d-4way.c +++ b/algo/sha/sha512256d-4way.c @@ -4,7 +4,7 @@ #include #include -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) #define SHA512256D_8WAY 1 #elif defined(__AVX2__) #define SHA512256D_4WAY 1 diff --git a/algo/shabal/shabal-hash-4way.c b/algo/shabal/shabal-hash-4way.c index d411389d..2bd4bddb 100644 --- a/algo/shabal/shabal-hash-4way.c +++ b/algo/shabal/shabal-hash-4way.c @@ -34,7 +34,7 @@ #include #include "shabal-hash-4way.h" -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) #define DECL_STATE16 \ __m512i A0, A1, A2, A3, A4, A5, A6, A7, \ diff --git a/algo/shabal/shabal-hash-4way.h b/algo/shabal/shabal-hash-4way.h index 81707021..fa51e3d1 100644 --- a/algo/shabal/shabal-hash-4way.h +++ b/algo/shabal/shabal-hash-4way.h @@ -8,7 +8,7 @@ #define SPH_SIZE_shabal512 512 -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) typedef struct { __m512i buf[16]; diff --git a/algo/shavite/shavite-hash-2way.c b/algo/shavite/shavite-hash-2way.c index 66288ed4..6f8a3db5 100644 --- a/algo/shavite/shavite-hash-2way.c +++ b/algo/shavite/shavite-hash-2way.c @@ -30,8 +30,7 @@ static const uint32_t IV512[] = #endif -#if defined (__AVX512VL__) -//TODO Enable for AVX10_256 +#if defined (VL256) #define DECL_m256i_count \ const __m256i count = \ diff --git a/algo/shavite/shavite-hash-4way.c b/algo/shavite/shavite-hash-4way.c index 9d3956db..cb8e7212 100644 --- a/algo/shavite/shavite-hash-4way.c +++ b/algo/shavite/shavite-hash-4way.c @@ -1,7 +1,7 @@ #include "shavite-hash-4way.h" #include -#if defined(__VAES__) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(__VAES__) && defined(SIMD512) static const uint32_t IV512[] = { diff --git a/algo/shavite/shavite-hash-4way.h b/algo/shavite/shavite-hash-4way.h index 10ff0957..1e55271b 100644 --- a/algo/shavite/shavite-hash-4way.h +++ b/algo/shavite/shavite-hash-4way.h @@ -1,10 +1,10 @@ #ifndef SHAVITE_HASH_4WAY_H__ #define SHAVITE_HASH_4WAY_H__ 1 -#if defined(__VAES__) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) - #include "simd-utils.h" +#if defined(__VAES__) && defined(SIMD512) + typedef struct { unsigned char buf[128<<2]; uint32_t h[16<<2]; diff --git a/algo/simd/simd-hash-2way.c b/algo/simd/simd-hash-2way.c index 5debceca..a4f9d4ff 100644 --- 
a/algo/simd/simd-hash-2way.c +++ b/algo/simd/simd-hash-2way.c @@ -803,8 +803,7 @@ static const m256_v16 FFT256_Twiddle[] = #define shufxor2w(x,s) XCAT(SHUFXOR_,s)(x) -#if defined(__AVX512VL__) -//TODO Enable for AVX10_256 +#if defined(VL256) #define REDUCE(x) \ _mm256_sub_epi16( _mm256_maskz_mov_epi8( 0x55555555, x ), \ @@ -1500,7 +1499,7 @@ int simd512_2way( void *hashval, const void *data, int datalen ) #endif -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) //////////////////////////////////// // diff --git a/algo/simd/simd-hash-2way.h b/algo/simd/simd-hash-2way.h index 2a94518c..9bb0709a 100644 --- a/algo/simd/simd-hash-2way.h +++ b/algo/simd/simd-hash-2way.h @@ -52,7 +52,7 @@ int simd512_2way( void *hashval, const void *data, int datalen ); #endif -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) typedef struct { diff --git a/algo/skein/skein-gate.h b/algo/skein/skein-gate.h index 1bdae7f4..96a15e85 100644 --- a/algo/skein/skein-gate.h +++ b/algo/skein/skein-gate.h @@ -3,7 +3,7 @@ #include #include "algo-gate-api.h" -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) #define SKEIN_8WAY 1 #elif defined(__AVX2__) #define SKEIN_4WAY 1 diff --git a/algo/skein/skein-hash-4way.c b/algo/skein/skein-hash-4way.c index 3a2327f8..33677ebd 100644 --- a/algo/skein/skein-hash-4way.c +++ b/algo/skein/skein-hash-4way.c @@ -298,7 +298,7 @@ static const uint64_t IV512[] = { sc->bcount = bcount; -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) #define TFBIG_KINIT_8WAY( k0, k1, k2, k3, k4, k5, k6, k7, k8, t0, t1, t2 ) \ k8 = mm512_xor3( mm512_xor3( k0, k1, k2 ), \ @@ -511,7 +511,7 @@ do { \ #endif // AVX2 -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) void skein256_8way_init( skein256_8way_context *sc ) { diff --git a/algo/skein/skein-hash-4way.h b/algo/skein/skein-hash-4way.h index cc1fb96a..f1f2d427 100644 --- a/algo/skein/skein-hash-4way.h +++ b/algo/skein/skein-hash-4way.h @@ -44,7 +44,7 @@ #include #include "simd-utils.h" -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) typedef struct { diff --git a/algo/swifftx/swifftx.c b/algo/swifftx/swifftx.c index 66d63886..7bd2d2a4 100644 --- a/algo/swifftx/swifftx.c +++ b/algo/swifftx/swifftx.c @@ -687,7 +687,7 @@ void FFT( const unsigned char input[EIGHTH_N], swift_int32_t *output ) #undef ADD_SUB -#if defined (__AVX512VL__) && defined(__AVX512BW__) +#if defined(VL256) #define Q_REDUCE( a ) \ _mm256_sub_epi32( _mm256_maskz_mov_epi8( 0x11111111, a ), \ @@ -1233,7 +1233,7 @@ void SWIFFTSum( const swift_int32_t *input, int m, unsigned char *output, swift_int32_t result[N] __attribute__ ((aligned (64))); register swift_int16_t carry = 0; -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) __m512i *res = (__m512i*)result; for ( j = 0; j < N/16; ++j ) diff --git a/algo/verthash/tiny_sha3/sha3-4way.c b/algo/verthash/tiny_sha3/sha3-4way.c index 1fada155..2aff1191 100644 --- a/algo/verthash/tiny_sha3/sha3-4way.c +++ b/algo/verthash/tiny_sha3/sha3-4way.c @@ -152,7 +152,7 @@ void *sha3_4way( const void *in, size_t inlen, void *md, int mdlen ) return md; } -#if 
defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) void sha3_8way_keccakf( __m512i st[25] ) { diff --git a/algo/verthash/tiny_sha3/sha3-4way.h b/algo/verthash/tiny_sha3/sha3-4way.h index 6723b73b..f1c3a92e 100644 --- a/algo/verthash/tiny_sha3/sha3-4way.h +++ b/algo/verthash/tiny_sha3/sha3-4way.h @@ -37,7 +37,7 @@ int sha3_4way_final( void *md, sha3_4way_ctx_t *c ); // digest goes to md void *sha3_4way( const void *in, size_t inlen, void *md, int mdlen ); -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) // state context typedef struct diff --git a/algo/x11/c11-gate.h b/algo/x11/c11-gate.h index 712e7873..81ecaf32 100644 --- a/algo/x11/c11-gate.h +++ b/algo/x11/c11-gate.h @@ -4,7 +4,7 @@ #include "algo-gate-api.h" #include -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) #define C11_8WAY 1 #elif defined(__AVX2__) && defined(__AES__) #define C11_4WAY 1 diff --git a/algo/x11/tribus-gate.h b/algo/x11/tribus-gate.h index fbcf61d2..01758628 100644 --- a/algo/x11/tribus-gate.h +++ b/algo/x11/tribus-gate.h @@ -4,7 +4,7 @@ #include "algo-gate-api.h" #include -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) #define TRIBUS_8WAY 1 #elif defined(__AVX2__) && defined(__AES__) #define TRIBUS_4WAY 1 diff --git a/algo/x11/x11-gate.h b/algo/x11/x11-gate.h index aed68370..c1afe3eb 100644 --- a/algo/x11/x11-gate.h +++ b/algo/x11/x11-gate.h @@ -4,7 +4,7 @@ #include "algo-gate-api.h" #include -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) #define X11_8WAY 1 #elif defined(__AVX2__) && defined(__AES__) #define X11_4WAY 1 diff --git a/algo/x11/x11gost-gate.h b/algo/x11/x11gost-gate.h index f24dbfc5..a6a820d5 100644 --- a/algo/x11/x11gost-gate.h +++ b/algo/x11/x11gost-gate.h @@ -4,7 +4,7 @@ #include "algo-gate-api.h" #include -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) #define X11GOST_8WAY 1 #elif defined(__AVX2__) && defined(__AES__) #define X11GOST_4WAY 1 diff --git a/algo/x12/x12-gate.h b/algo/x12/x12-gate.h index 998f09bb..5bdba0aa 100644 --- a/algo/x12/x12-gate.h +++ b/algo/x12/x12-gate.h @@ -4,7 +4,7 @@ #include "algo-gate-api.h" #include -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) #define X12_8WAY 1 #elif defined(__AVX2__) && defined(__AES__) #define X12_4WAY 1 diff --git a/algo/x13/phi1612-gate.h b/algo/x13/phi1612-gate.h index b4151919..11523e00 100644 --- a/algo/x13/phi1612-gate.h +++ b/algo/x13/phi1612-gate.h @@ -4,7 +4,7 @@ #include "algo-gate-api.h" #include -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) #define PHI1612_8WAY 1 #elif defined(__AVX2__) && defined(__AES__) #define PHI1612_4WAY 1 diff --git a/algo/x13/skunk-gate.h b/algo/x13/skunk-gate.h index e5ade93f..b1905c25 100644 --- a/algo/x13/skunk-gate.h +++ b/algo/x13/skunk-gate.h @@ -4,7 +4,7 @@ #include "algo-gate-api.h" #include -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) #define SKUNK_8WAY 1 #elif defined(__AVX2__) #define SKUNK_4WAY 1 diff --git a/algo/x13/x13-gate.h 
b/algo/x13/x13-gate.h index 6718eb37..f5fa4f6e 100644 --- a/algo/x13/x13-gate.h +++ b/algo/x13/x13-gate.h @@ -4,7 +4,7 @@ #include "algo-gate-api.h" #include -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) #define X13_8WAY 1 #elif defined(__AVX2__) && defined(__AES__) #define X13_4WAY 1 diff --git a/algo/x13/x13sm3-gate.h b/algo/x13/x13sm3-gate.h index fc6154ac..85d4cda7 100644 --- a/algo/x13/x13sm3-gate.h +++ b/algo/x13/x13sm3-gate.h @@ -26,7 +26,7 @@ void init_x13sm3_ctx(); #endif -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) #define X13BCD_8WAY 1 #elif defined(__AVX2__) && defined(__AES__) #define X13BCD_4WAY 1 diff --git a/algo/x14/x14-gate.h b/algo/x14/x14-gate.h index 97f4800d..09065c96 100644 --- a/algo/x14/x14-gate.h +++ b/algo/x14/x14-gate.h @@ -4,7 +4,7 @@ #include "algo-gate-api.h" #include -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) #define X14_8WAY 1 #elif defined(__AVX2__) && defined(__AES__) #define X14_4WAY 1 diff --git a/algo/x15/x15-gate.h b/algo/x15/x15-gate.h index 44568c28..e5e1174c 100644 --- a/algo/x15/x15-gate.h +++ b/algo/x15/x15-gate.h @@ -4,7 +4,7 @@ #include "algo-gate-api.h" #include -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) #define X15_8WAY 1 #elif defined(__AVX2__) && defined(__AES__) #define X15_4WAY 1 diff --git a/algo/x16/x16r-gate.h b/algo/x16/x16r-gate.h index 146338d1..b5bc1209 100644 --- a/algo/x16/x16r-gate.h +++ b/algo/x16/x16r-gate.h @@ -51,7 +51,7 @@ #endif // X16R, X16S -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) #define X16R_8WAY 1 #elif defined(__AVX2__) && defined(__AES__) #define X16R_4WAY 1 @@ -59,7 +59,7 @@ #define X16R_2WAY 1 #endif -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) #define X16RV2_8WAY 1 #elif defined(__AVX2__) && defined(__AES__) #define X16RV2_4WAY 1 @@ -68,7 +68,7 @@ #endif // X16RT, VEIL -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) #define X16RT_8WAY 1 #elif defined(__AVX2__) && defined(__AES__) #define X16RT_4WAY 1 @@ -76,7 +76,7 @@ #define X16RT_2WAY 1 #endif -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) #define X21S_8WAY 1 #elif defined(__AVX2__) && defined(__AES__) #define X21S_4WAY 1 diff --git a/algo/x16/x20r.c b/algo/x16/x20r.c index b6c63966..5d8b60c2 100644 --- a/algo/x16/x20r.c +++ b/algo/x16/x20r.c @@ -22,7 +22,7 @@ #include "algo/sha/sph_sha2.h" #include "x16r-gate.h" -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) #define X20R_8WAY 1 #elif defined(__AVX2__) && defined(__AES__) #define X20R_4WAY 1 diff --git a/algo/x17/sonoa-gate.h b/algo/x17/sonoa-gate.h index 997bff18..d431b2e1 100644 --- a/algo/x17/sonoa-gate.h +++ b/algo/x17/sonoa-gate.h @@ -4,7 +4,7 @@ #include "algo-gate-api.h" #include -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) #define SONOA_8WAY 1 #elif defined(__AVX2__) && defined(__AES__) #define SONOA_4WAY 1 diff --git 
a/algo/x17/x17-gate.h b/algo/x17/x17-gate.h index 4a5b035a..6bc030c6 100644 --- a/algo/x17/x17-gate.h +++ b/algo/x17/x17-gate.h @@ -4,7 +4,7 @@ #include "algo-gate-api.h" #include -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) #define X17_8WAY 1 // #define X17_16X32 1 #elif defined(__AVX2__) && defined(__AES__) diff --git a/algo/x17/xevan-gate.h b/algo/x17/xevan-gate.h index 8ef9a2e6..eccd433f 100644 --- a/algo/x17/xevan-gate.h +++ b/algo/x17/xevan-gate.h @@ -4,7 +4,7 @@ #include "algo-gate-api.h" #include -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) #define XEVAN_8WAY 1 #elif defined(__AVX2__) && defined(__AES__) #define XEVAN_4WAY 1 diff --git a/algo/x22/x22i-gate.c b/algo/x22/x22i-gate.c index b8e087f1..432cf949 100644 --- a/algo/x22/x22i-gate.c +++ b/algo/x22/x22i-gate.c @@ -32,7 +32,7 @@ bool register_x22i_algo( algo_gate_t* gate ) #endif gate->optimizations = SSE2_OPT | SSE42_OPT | AES_OPT | AVX2_OPT | SHA_OPT - | AVX512_OPT | VAES_OPT; + | AVX512_OPT | VAES_OPT | NEON_OPT; return true; }; @@ -49,7 +49,7 @@ bool register_x25x_algo( algo_gate_t* gate ) gate->hash = (void*)&x25x_hash; #endif gate->optimizations = SSE2_OPT | SSE42_OPT | AES_OPT | AVX2_OPT | SHA_OPT | - AVX512_OPT | VAES_OPT; + AVX512_OPT | VAES_OPT | NEON_OPT; InitializeSWIFFTX(); return true; }; diff --git a/algo/x22/x22i-gate.h b/algo/x22/x22i-gate.h index 4dc1bf24..17b7999d 100644 --- a/algo/x22/x22i-gate.h +++ b/algo/x22/x22i-gate.h @@ -7,7 +7,7 @@ #include #include "algo/swifftx/swifftx.h" -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) #define X22I_8WAY 1 #elif defined(__AVX2__) && defined(__AES__) #define X22I_4WAY 1 @@ -50,7 +50,7 @@ int scanhash_x22i( struct work *work, uint32_t max_nonce, #endif -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) #define X25X_8WAY 1 #elif defined(__AVX2__) && defined(__AES__) #define X25X_4WAY 1 diff --git a/algo/yespower/yespower-blake2b.c b/algo/yespower/yespower-blake2b.c index 41dec41b..a5b34928 100644 --- a/algo/yespower/yespower-blake2b.c +++ b/algo/yespower/yespower-blake2b.c @@ -259,7 +259,7 @@ static inline void salsa20_simd_unshuffle(const salsa20_blk_t *Bin, #define WRITE_X(out) \ (out).q[0] = X0; (out).q[1] = X1; (out).q[2] = X2; (out).q[3] = X3; -#if defined(__AVX512VL__) +#if defined(VL256) #define ARX(out, in1, in2, s) \ out = _mm_xor_si128(out, _mm_rol_epi32(_mm_add_epi32(in1, in2), s)); diff --git a/algo/yespower/yespower-opt.c b/algo/yespower/yespower-opt.c index 9315baa4..dcfbe0f2 100644 --- a/algo/yespower/yespower-opt.c +++ b/algo/yespower/yespower-opt.c @@ -93,12 +93,12 @@ typedef union #if defined(__AVX2__) __m256i m256[2]; #endif -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(YESPOWER_USE_AVX512) && defined(SIMD512) __m512i m512; #endif } salsa20_blk_t; -#if defined(YESPOWER_USE_AVX512) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(YESPOWER_USE_AVX512) && defined(SIMD512) // Slow static const __m512i simd_shuffle_index = @@ -114,7 +114,7 @@ static const __m512i simd_unshuffle_index = #elif defined(__AVX2__) -#if defined(__AVX512VL__) +#if defined(VL256) // alternative when not using 512 bit vectors static const __m256i 
simd_shuffle_index = @@ -138,13 +138,13 @@ static const __m256i simd_shuffle_index = static inline void salsa20_simd_shuffle(const salsa20_blk_t *Bin, salsa20_blk_t *Bout) { -#if defined(YESPOWER_USE_AVX512) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(YESPOWER_USE_AVX512) && defined(SIMD512) Bout->m512 = _mm512_permutexvar_epi32( simd_shuffle_index, Bin->m512 ); #elif defined(__AVX2__) -#if defined(__AVX512VL__) +#if defined(VL256) Bout->m256[0] = _mm256_permutex2var_epi32( Bin->m256[0], simd_shuffle_index, Bin->m256[1] ); @@ -193,13 +193,13 @@ static inline void salsa20_simd_shuffle(const salsa20_blk_t *Bin, static inline void salsa20_simd_unshuffle(const salsa20_blk_t *Bin, salsa20_blk_t *Bout) { -#if defined(YESPOWER_USE_AVX512) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(YESPOWER_USE_AVX512) && defined(SIMD512) Bout->m512 = _mm512_permutexvar_epi32( simd_unshuffle_index, Bin->m512 ); #elif defined(__AVX2__) -#if defined(__AVX512VL__) +#if defined(VL256) Bout->m256[0] = _mm256_permutex2var_epi32( Bin->m256[0], simd_unshuffle_index, Bin->m256[1] ); @@ -318,7 +318,7 @@ static inline void salsa20_simd_unshuffle(const salsa20_blk_t *Bin, // AVX512 ternary logic optimization -#if defined(__AVX512VL__) +#if defined(VL256) #define XOR_X_XOR_X( in1, in2 ) \ X0 = _mm_ternarylogic_epi32( X0, (in1).m128[0], (in2).m128[0], 0x96 ); \ @@ -335,7 +335,7 @@ static inline void salsa20_simd_unshuffle(const salsa20_blk_t *Bin, #endif // General vectored optimizations -#if defined(YESPOWER_USE_AVX512) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(YESPOWER_USE_AVX512) && defined(SIMD512) #define READ_X( in ) \ X.m512 = (in).m512; @@ -379,7 +379,7 @@ static inline void salsa20_simd_unshuffle(const salsa20_blk_t *Bin, X.m256[0] = (in).m256[0]; \ X.m256[1] = (in).m256[1]; -#if defined(__AVX512VL__) +#if defined(VL256) #define XOR_X_2_XOR_X( in1, in2, in3 ) \ X.m256[0] = _mm256_ternarylogic_epi32( (in1).m256[0], (in2).m256[0], \ diff --git a/armbuild-all.sh b/armbuild-all.sh index 328fd71c..61690aac 100755 --- a/armbuild-all.sh +++ b/armbuild-all.sh @@ -4,11 +4,37 @@ # during develpment. However the information contained may provide compilation # tips to users. 
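armbuild-all.sh now also builds armv9, SVE2 and plain armv8+crypto targets (armv9 needs gcc-13). A quick, hedged way to confirm what a given -march string actually enables at compile time is to test the same ACLE predefined macros that cpu-miner.c checks (or stubs out) later in this patch:

#include <stdio.h>

// Sketch: compile with the CFLAGS of interest and run. The SHA3 macro is
// what this miner uses to gate its AArch64 SHA-512 path.
int main( void )
{
#if defined(__ARM_NEON)
   puts( "NEON" );
#endif
#if defined(__ARM_FEATURE_AES)
   puts( "AES" );
#endif
#if defined(__ARM_FEATURE_SHA2)
   puts( "SHA2 (SHA-256)" );
#endif
#if defined(__ARM_FEATURE_SHA3)
   puts( "SHA3 (SHA-512 path)" );
#endif
#if defined(__ARM_FEATURE_SVE2)
   puts( "SVE2" );
#endif
   return 0;
}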
-rm cpuminer cpuminer-armv8 cpuminer-armv8-aes cpuminer-armv8-sha2 cpuminer-armv8-aes-sha2 cpuminer-avx512-sha-vaes cpuminer-avx512 cpuminer-avx2-sha cpuminer-avx2-sha-vaes cpuminer-avx2 cpuminer-avx cpuminer-aes-sse42 cpuminer-sse42 cpuminer-ssse3 cpuminer-sse2 cpuminer-zen cpuminer-zen3 cpuminer-zen4 cpuminer-alderlake cpuminer-x64 > /dev/null +rm cpuminer cpuminer-armv9-aes-sha3 cpuminer-armv9-aes-sha3-sve2 cpuminer-armv8.2-aes-sha3-sve2 cpuminer-armv8-aes-sha2-sve2 cpuminer-armv8 cpuminer-armv8-crypto cpuminer-armv8-aes cpuminer-armv8-sha2 cpuminer-armv8-aes-sha2 cpuminer-avx512-sha-vaes cpuminer-avx512 cpuminer-avx2-sha cpuminer-avx2-sha-vaes cpuminer-avx2 cpuminer-avx cpuminer-aes-sse42 cpuminer-sse42 cpuminer-ssse3 cpuminer-sse2 cpuminer-zen cpuminer-zen3 cpuminer-zen4 cpuminer-alderlake cpuminer-x64 > /dev/null + +# armv9 needs gcc-13 make distclean || echo clean rm -f config.status ./autogen.sh || echo done +CFLAGS="-O3 -march=armv9-a+crypto+sha3+aes -Wall -flax-vector-conversions" ./configure --with-curl +make -j $(nproc) +strip -s cpuminer +mv cpuminer cpuminer-armv9-aes-sha3 + +make clean || echo clean +CFLAGS="-O3 -march=armv9-a+crypto+sha3+aes+sve2 -Wall -flax-vector-conversions" ./configure --with-curl +make -j $(nproc) +strip -s cpuminer +mv cpuminer cpuminer-armv9-aes-sha3-sve2 + +make clean || echo clean +CFLAGS="-O3 -march=armv8.2-a+crypto+sha3+aes+sve2 -Wall -flax-vector-conversions" ./configure --with-curl +make -j $(nproc) +strip -s cpuminer +mv cpuminer cpuminer-armv8.2-aes-sha3-sve2 + +make clean || echo clean +CFLAGS="-O3 -march=armv8-a+crypto+sha2+aes+sve2 -Wall -flax-vector-conversions" ./configure --with-curl +make -j $(nproc) +strip -s cpuminer +mv cpuminer cpuminer-armv8-aes-sha2-sve2 + +make clean || echo clean CFLAGS="-O3 -march=armv8-a+crypto+sha2+aes -Wall -flax-vector-conversions" ./configure --with-curl make -j $(nproc) strip -s cpuminer @@ -28,6 +54,13 @@ make -j $(nproc) strip -s cpuminer mv cpuminer cpuminer-armv8-aes +make clean || echo clean +rm -f config.status +CFLAGS="-O3 -march=armv8-a+crypto -Wall -flax-vector-conversions" ./configure --with-curl +make -j $(nproc) +strip -s cpuminer +mv cpuminer cpuminer-armv8-crypto + make clean || echo clean rm -f config.status CFLAGS="-O3 -march=armv8-a -Wall -flax-vector-conversions" ./configure --with-curl diff --git a/build-allarch.sh b/build-allarch.sh index 7fbebc27..28f66ba9 100755 --- a/build-allarch.sh +++ b/build-allarch.sh @@ -4,7 +4,7 @@ # during develpment. However the information contained may provide compilation # tips to users. 
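The x86_64 build list that follows gains an Alderlake target plus commented-out Arrowlake and Zen5 recipes (both need gcc-14), and the Zen4 build keeps spelling out the AVX-512 sub-features explicitly because -march=znver4 needs gcc-12.3. Since the new SIMD512 gate (see the simd-utils.h changes at the end of this patch) requires AVX512F, VL, DQ and BW together, a build could guard against an incomplete flag set with something like this hedged sketch (WANT_SIMD512 is a hypothetical toggle, not part of this patch):

#if defined(WANT_SIMD512) && \
    !( defined(__AVX10_1_512__) || \
       ( defined(__AVX512F__) && defined(__AVX512VL__) && \
         defined(__AVX512DQ__) && defined(__AVX512BW__) ) )
   #error "CFLAGS do not enable AVX512 F/VL/DQ/BW (or AVX10.1-512)"
#endif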
-rm cpuminer-avx512-sha-vaes cpuminer-avx512 cpuminer-avx2 cpuminer-avx cpuminer-aes-sse42 cpuminer-sse42 cpuminer-ssse3 cpuminer-sse2 cpuminer-zen cpuminer-zen3 cpuminer-zen4 cpuminer-alderlake cpuminer-x64 cpuminer-armv8 cpuminer-armv8-aes cpuminer-armv8-sha2 cpuminer-armv8-aes-sha2 > /dev/null +rm cpuminer-avx512-sha-vaes cpuminer-avx512 cpuminer-avx2 cpuminer-avx cpuminer-aes-sse42 cpuminer-sse42 cpuminer-ssse3 cpuminer-sse2 cpuminer-zen cpuminer-zen3 cpuminer-zen4 cpuminer-zen5 cpuminer-alderlake cpuminer-x64 cpuminer-armv8 cpuminer-armv8-aes cpuminer-armv8-sha2 cpuminer-armv8-aes-sha2 > /dev/null # AVX512 SHA VAES: Intel Core Icelake, Rocketlake make distclean || echo clean @@ -17,20 +17,35 @@ make -j $(nproc) strip -s cpuminer mv cpuminer cpuminer-avx512-sha-vaes -# AVX256 SHA VAES: Intel Core Alderlake, needs gcc-12 +# Intel Core Alderlake: AVX2 SHA VAES, needs gcc-12 +make clean || echo clean +rm -f config.status +CFLAGS="-O3 -march=alderlake -Wall" ./configure --with-curl +make -j 8 +strip -s cpuminer +mv cpuminer cpuminer-alderlake + +# Intel Core Arrowlake: AVX2 SHA512 VAES, needs gcc-14 #make clean || echo clean #rm -f config.status -#./autogen.sh || echo done -#CFLAGS="-O3 -march=alderlake -Wall" ./configure --with-curl +#CFLAGS="-O3 -march=arrowlake-s -Wall" ./configure --with-curl #make -j 8 #strip -s cpuminer -#mv cpuminer cpuminer-alderlake +#mv cpuminer cpuminer-arrowlake + +# Zen5: AVX512 SHA VAES, requires gcc-14. +#make clean || echo clean +#rm -f config.status +#CFLAGS="-O3 -march=znver5" ./configure --with-curl +#make -j $(nproc) +#strip -s cpuminer +#mv cpuminer cpuminer-zen4 -# Zen4 AVX512 SHA VAES +# Zen4: AVX512 SHA VAES make clean || echo clean rm -f config.status # znver3 needs gcc-11, znver4 needs gcc-12.3. -#CFLAGS="-O3 -march=znver4" ./configure --with-curl +#CFLAGS="-O3 -march=znver4 -Wall" ./configure --with-curl # Inclomplete list of Zen4 AVX512 extensions but includes all extensions used by cpuminer. CFLAGS="-O3 -march=znver3 -mavx512f -mavx512cd -mavx512dq -mavx512bw -mavx512vl -mavx512vbmi -mavx512vbmi2 -mavx512bitalg -mavx512vpopcntdq -Wall" ./configure --with-curl #CFLAGS="-O3 -march=znver2 -mvaes -mavx512f -mavx512dq -mavx512bw -mavx512vl -mavx512vbmi -Wall" ./configure --with-curl @@ -55,7 +70,7 @@ make -j $(nproc) strip -s cpuminer mv cpuminer cpuminer-avx512 -# AVX2 SHA VAES: Intel Alderlake, AMD Zen3 +# AVX2 SHA VAES: generic make clean || echo done rm -f config.status # vaes doesn't include aes diff --git a/clean-all.sh b/clean-all.sh index 8364e408..98979c28 100755 --- a/clean-all.sh +++ b/clean-all.sh @@ -2,7 +2,7 @@ # # make clean and rm all the targetted executables. 
-rm cpuminer-avx512-sha-vaes cpuminer-avx512 cpuminer-avx2 cpuminer-avx cpuminer-aes-sse42 cpuminer-sse2 cpuminer-avx2-sha cpuminer-sse42 cpuminer-ssse3 cpuminer-avx2-sha-vaes cpuminer-zen3 cpuminer-zen4 cpuminer-x64 cpuminer-armv8 cpuminer-armv8-aes cpuminer-armv8-aes-sha2 cpuminer-armv8-sha2 > /dev/null +rm cpuminer-avx512-sha-vaes cpuminer-alderlake cpuminer-avx512 cpuminer-avx2 cpuminer-avx cpuminer-aes-sse42 cpuminer-sse2 cpuminer-avx2-sha cpuminer-sse42 cpuminer-ssse3 cpuminer-avx2-sha-vaes cpuminer-zen3 cpuminer-zen4 cpuminer-x64 cpuminer-armv9-aes-sha3 cpuminer-armv9-aes-sha3-sve2 cpuminer-armv8.2-aes-sha3-sve2 cpuminer-armv8-aes-sha3-sve2 cpuminer-armv8-aes-sha2-sve2 cpuminer-armv8-crypto cpuminer-armv8 cpuminer-armv8-aes cpuminer-armv8-aes-sha3 cpuminer-armv8-aes-sha2 cpuminer-armv8-sha2 > /dev/null rm cpuminer-avx512-sha-vaes.exe cpuminer-avx512-sha.exe cpuminer-avx512.exe cpuminer-avx2.exe cpuminer-avx.exe cpuminer-aes-sse42.exe cpuminer-sse2.exe cpuminer-avx2-sha.exe cpuminer-sse42.exe cpuminer-ssse3.exe cpuminer-avx2-sha-vaes.exe cpuminer-zen3.exe cpuminer-zen4.exe cpuminer-x64.exe > /dev/null diff --git a/configure b/configure index dd6ce7fc..b7f7c371 100755 --- a/configure +++ b/configure @@ -1,6 +1,6 @@ #! /bin/sh # Guess values for system-dependent variables and create Makefiles. -# Generated by GNU Autoconf 2.71 for cpuminer-opt 24.1. +# Generated by GNU Autoconf 2.71 for cpuminer-opt 24.2. # # # Copyright (C) 1992-1996, 1998-2017, 2020-2021 Free Software Foundation, @@ -608,8 +608,8 @@ MAKEFLAGS= # Identity of this package. PACKAGE_NAME='cpuminer-opt' PACKAGE_TARNAME='cpuminer-opt' -PACKAGE_VERSION='24.1' -PACKAGE_STRING='cpuminer-opt 24.1' +PACKAGE_VERSION='24.2' +PACKAGE_STRING='cpuminer-opt 24.2' PACKAGE_BUGREPORT='' PACKAGE_URL='' @@ -1360,7 +1360,7 @@ if test "$ac_init_help" = "long"; then # Omit some internal or obsolete options to make the list less imposing. # This message is too long to be a string in the A/UX 3.1 sh. cat <<_ACEOF -\`configure' configures cpuminer-opt 24.1 to adapt to many kinds of systems. +\`configure' configures cpuminer-opt 24.2 to adapt to many kinds of systems. Usage: $0 [OPTION]... [VAR=VALUE]... @@ -1432,7 +1432,7 @@ fi if test -n "$ac_init_help"; then case $ac_init_help in - short | recursive ) echo "Configuration of cpuminer-opt 24.1:";; + short | recursive ) echo "Configuration of cpuminer-opt 24.2:";; esac cat <<\_ACEOF @@ -1538,7 +1538,7 @@ fi test -n "$ac_init_help" && exit $ac_status if $ac_init_version; then cat <<\_ACEOF -cpuminer-opt configure 24.1 +cpuminer-opt configure 24.2 generated by GNU Autoconf 2.71 Copyright (C) 2021 Free Software Foundation, Inc. @@ -1985,7 +1985,7 @@ cat >config.log <<_ACEOF This file contains any messages produced by compilers while running configure, to aid debugging if configure makes a mistake. -It was created by cpuminer-opt $as_me 24.1, which was +It was created by cpuminer-opt $as_me 24.2, which was generated by GNU Autoconf 2.71. Invocation command line was $ $0$ac_configure_args_raw @@ -3593,7 +3593,7 @@ fi # Define the identity of the package. PACKAGE='cpuminer-opt' - VERSION='24.1' + VERSION='24.2' printf "%s\n" "#define PACKAGE \"$PACKAGE\"" >>confdefs.h @@ -7508,7 +7508,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 # report actual input values of CONFIG_FILES etc. instead of their # values after options handling. ac_log=" -This file was extended by cpuminer-opt $as_me 24.1, which was +This file was extended by cpuminer-opt $as_me 24.2, which was generated by GNU Autoconf 2.71. 
Invocation command line was CONFIG_FILES = $CONFIG_FILES @@ -7576,7 +7576,7 @@ ac_cs_config_escaped=`printf "%s\n" "$ac_cs_config" | sed "s/^ //; s/'/'\\\\\\\\ cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 ac_cs_config='$ac_cs_config_escaped' ac_cs_version="\\ -cpuminer-opt config.status 24.1 +cpuminer-opt config.status 24.2 configured by $0, generated by GNU Autoconf 2.71, with options \\"\$ac_cs_config\\" diff --git a/configure.ac b/configure.ac index 09fbbdb2..ff354444 100644 --- a/configure.ac +++ b/configure.ac @@ -1,4 +1,4 @@ -AC_INIT([cpuminer-opt], [24.1]) +AC_INIT([cpuminer-opt], [24.2]) AC_PREREQ([2.59c]) AC_CANONICAL_SYSTEM diff --git a/configure~ b/configure~ index dab87859..dd6ce7fc 100755 --- a/configure~ +++ b/configure~ @@ -1,6 +1,6 @@ #! /bin/sh # Guess values for system-dependent variables and create Makefiles. -# Generated by GNU Autoconf 2.71 for cpuminer-opt 23.16. +# Generated by GNU Autoconf 2.71 for cpuminer-opt 24.1. # # # Copyright (C) 1992-1996, 1998-2017, 2020-2021 Free Software Foundation, @@ -608,8 +608,8 @@ MAKEFLAGS= # Identity of this package. PACKAGE_NAME='cpuminer-opt' PACKAGE_TARNAME='cpuminer-opt' -PACKAGE_VERSION='23.16' -PACKAGE_STRING='cpuminer-opt 23.16' +PACKAGE_VERSION='24.1' +PACKAGE_STRING='cpuminer-opt 24.1' PACKAGE_BUGREPORT='' PACKAGE_URL='' @@ -1360,7 +1360,7 @@ if test "$ac_init_help" = "long"; then # Omit some internal or obsolete options to make the list less imposing. # This message is too long to be a string in the A/UX 3.1 sh. cat <<_ACEOF -\`configure' configures cpuminer-opt 23.16 to adapt to many kinds of systems. +\`configure' configures cpuminer-opt 24.1 to adapt to many kinds of systems. Usage: $0 [OPTION]... [VAR=VALUE]... @@ -1432,7 +1432,7 @@ fi if test -n "$ac_init_help"; then case $ac_init_help in - short | recursive ) echo "Configuration of cpuminer-opt 23.16:";; + short | recursive ) echo "Configuration of cpuminer-opt 24.1:";; esac cat <<\_ACEOF @@ -1538,7 +1538,7 @@ fi test -n "$ac_init_help" && exit $ac_status if $ac_init_version; then cat <<\_ACEOF -cpuminer-opt configure 23.16 +cpuminer-opt configure 24.1 generated by GNU Autoconf 2.71 Copyright (C) 2021 Free Software Foundation, Inc. @@ -1985,7 +1985,7 @@ cat >config.log <<_ACEOF This file contains any messages produced by compilers while running configure, to aid debugging if configure makes a mistake. -It was created by cpuminer-opt $as_me 23.16, which was +It was created by cpuminer-opt $as_me 24.1, which was generated by GNU Autoconf 2.71. Invocation command line was $ $0$ac_configure_args_raw @@ -3593,7 +3593,7 @@ fi # Define the identity of the package. PACKAGE='cpuminer-opt' - VERSION='23.16' + VERSION='24.1' printf "%s\n" "#define PACKAGE \"$PACKAGE\"" >>confdefs.h @@ -7508,7 +7508,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 # report actual input values of CONFIG_FILES etc. instead of their # values after options handling. ac_log=" -This file was extended by cpuminer-opt $as_me 23.16, which was +This file was extended by cpuminer-opt $as_me 24.1, which was generated by GNU Autoconf 2.71. 
Invocation command line was CONFIG_FILES = $CONFIG_FILES @@ -7576,7 +7576,7 @@ ac_cs_config_escaped=`printf "%s\n" "$ac_cs_config" | sed "s/^ //; s/'/'\\\\\\\\ cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 ac_cs_config='$ac_cs_config_escaped' ac_cs_version="\\ -cpuminer-opt config.status 23.16 +cpuminer-opt config.status 24.1 configured by $0, generated by GNU Autoconf 2.71, with options \\"\$ac_cs_config\\" diff --git a/cpu-miner.c b/cpu-miner.c index ad7cc03e..7ccbab0f 100644 --- a/cpu-miner.c +++ b/cpu-miner.c @@ -2852,12 +2852,14 @@ static bool cpu_capability( bool display_only ) bool cpu_has_avx10 = has_avx10(); bool cpu_has_aes = has_aes_ni(); // x86_64 or AArch64 AES bool cpu_has_vaes = has_vaes(); - bool cpu_has_sha = has_sha(); // x86_64 or AArch64 + bool cpu_has_sha256 = has_sha(); // x86_64 or AArch64 bool cpu_has_sha512 = has_sha512(); bool sw_has_x86_64 = false; bool sw_has_aarch64 = false; - int sw_arm_arch = 0; - bool sw_has_neon = false; + int sw_arm_arch = 0; // AArch64 + bool sw_has_neon = false; // AArch64 +// bool sw_has_sve = false; // AArch64 +// bool sw_has_sve2 = false; // AArch64 bool sw_has_sse2 = false; // x86_64 bool sw_has_ssse3 = false; // x86_64 bool sw_has_sse41 = false; // x86_64 @@ -2865,9 +2867,11 @@ static bool cpu_capability( bool display_only ) bool sw_has_avx = false; bool sw_has_avx2 = false; bool sw_has_avx512 = false; + bool sw_has_avx10_256 = false; + bool sw_has_avx10_512 = false; bool sw_has_aes = false; bool sw_has_vaes = false; - bool sw_has_sha = false; // x86_64 or AArch64 SHA2 + bool sw_has_sha256 = false; // x86_64 or AArch64 SHA2 bool sw_has_sha512 = false; // x86_64 or AArch64 SHA3 set_t algo_features = algo_gate.optimizations; bool algo_has_sse2 = set_incl( SSE2_OPT, algo_features ); @@ -2877,7 +2881,7 @@ static bool cpu_capability( bool display_only ) bool algo_has_avx512 = set_incl( AVX512_OPT, algo_features ); bool algo_has_aes = set_incl( AES_OPT, algo_features ); bool algo_has_vaes = set_incl( VAES_OPT, algo_features ); - bool algo_has_sha = set_incl( SHA_OPT, algo_features ); + bool algo_has_sha256 = set_incl( SHA_OPT, algo_features ); bool algo_has_sha512 = set_incl( SHA512_OPT, algo_features ); bool algo_has_neon = set_incl( NEON_OPT, algo_features ); bool use_sse2; @@ -2887,7 +2891,7 @@ static bool cpu_capability( bool display_only ) bool use_avx512; bool use_aes; bool use_vaes; - bool use_sha; + bool use_sha256; bool use_sha512; bool use_neon; bool use_none; @@ -2925,6 +2929,13 @@ static bool cpu_capability( bool display_only ) #if (defined(__AVX512F__) && defined(__AVX512DQ__) && defined(__AVX512BW__) && defined(__AVX512VL__)) sw_has_avx512 = true; #endif + #if defined(__AVX10_1_256__) + sw_has_avx10_256 = true; + #endif + #if defined(__AVX10_1_512__) + sw_has_avx10_512 = true; + #endif + #if defined(__AES__) || defined(__ARM_FEATURE_AES) sw_has_aes = true; #endif @@ -2932,16 +2943,21 @@ static bool cpu_capability( bool display_only ) sw_has_vaes = true; #endif #if defined(__SHA__) || defined(__ARM_FEATURE_SHA2) - sw_has_sha = true; + sw_has_sha256 = true; #endif - #if defined(__SHA512__) || defined(____ARM_FEATURE_SHA3) + #if defined(__SHA512__) || defined(__ARM_FEATURE_SHA3) sw_has_sha512 = true; #endif #if defined(__ARM_NEON) sw_has_neon = true; #endif +// #if defined(__ARM_FEATURE_SVE) +// sw_has_sve = true; +// #endif +// #if defined(__ARM_FEATURE_SVE2) +// sw_has_sve2 = true; +// #endif - cpu_brand_string( cpu_brand ); printf( "CPU: %s\n", cpu_brand ); @@ -2983,7 +2999,7 @@ static bool cpu_capability( bool 
display_only ) if ( cpu_has_vaes ) printf( " VAES" ); else if ( cpu_has_aes ) printf( " AES" ); if ( cpu_has_sha512 ) printf( " SHA512" ); - else if ( cpu_has_sha ) printf( " SHA256" ); + else if ( cpu_has_sha256 ) printf( " SHA256" ); if ( cpu_has_avx10 ) printf( " AVX10.%d-%d", avx10_version(), avx10_vector_length() ); @@ -2998,17 +3014,22 @@ static bool cpu_capability( bool display_only ) else if ( sw_has_sse41 ) printf( " SSE4.1" ); else if ( sw_has_ssse3 ) printf( " SSSE3 " ); else if ( sw_has_sse2 ) printf( " SSE2 " ); + if ( sw_has_avx10_512 ) printf( " AVX10-512" ); + else if ( sw_has_avx10_256 ) printf( " AVX10-256" ); } else if ( sw_has_aarch64 ) { printf( " AArch64" ); if ( sw_arm_arch ) printf( " armv%d", sw_arm_arch ); if ( sw_has_neon ) printf( " NEON" ); +// if ( sw_has_sve ) printf( " SVE" ); +// else if ( sw_has_sve2 ) printf( " SVE2" ); + } if ( sw_has_vaes ) printf( " VAES" ); else if ( sw_has_aes ) printf( " AES" ); if ( sw_has_sha512 ) printf( " SHA512" ); - else if ( sw_has_sha ) printf( " SHA256" ); + else if ( sw_has_sha256 ) printf( " SHA256" ); if ( !display_only ) { @@ -3024,7 +3045,7 @@ static bool cpu_capability( bool display_only ) if ( algo_has_vaes ) printf( " VAES" ); else if ( algo_has_aes ) printf( " AES" ); if ( algo_has_sha512 ) printf( " SHA512" ); - else if ( algo_has_sha ) printf( " SHA256" ); + else if ( algo_has_sha256 ) printf( " SHA256" ); } } printf("\n"); @@ -3068,11 +3089,11 @@ static bool cpu_capability( bool display_only ) use_avx512 = cpu_has_avx512 && sw_has_avx512 && algo_has_avx512; use_aes = cpu_has_aes && sw_has_aes && algo_has_aes; use_vaes = cpu_has_vaes && sw_has_vaes && algo_has_vaes; - use_sha = cpu_has_sha && sw_has_sha && algo_has_sha; + use_sha256 = cpu_has_sha256 && sw_has_sha256 && algo_has_sha256; use_sha512 = cpu_has_sha512 && sw_has_sha512 && algo_has_sha512; use_neon = sw_has_aarch64 && sw_has_neon && algo_has_neon; use_none = !( use_sse2 || use_sse42 || use_avx || use_aes || use_avx512 - || use_avx2 || use_sha || use_vaes || use_sha512 || use_neon ); + || use_avx2 || use_sha256 || use_vaes || use_sha512 || use_neon ); // Display best options applog_nl( "Enabled optimizations:" ); @@ -3090,7 +3111,7 @@ static bool cpu_capability( bool display_only ) if ( use_vaes ) printf( " VAES" ); else if ( use_aes ) printf( " AES" ); if ( use_sha512 ) printf( " SHA512" ); - else if ( use_sha ) printf( " SHA256" ); + else if ( use_sha256 ) printf( " SHA256" ); if ( use_neon ) printf( " NEON" ); } printf( "\n" ); diff --git a/simd-utils.h b/simd-utils.h index 98552aee..406ed82f 100644 --- a/simd-utils.h +++ b/simd-utils.h @@ -139,6 +139,43 @@ #include #include #include +#include + +// SIMD512: Use 512, 256 & 128 bit vectors, excludes AVX512VBMI +// VL256: Include AVX512VL instructions on 256 & 128 bit vectors +// VBMI: Include AVX512VBMI instructions on all vectors. + +// AVX10 can exist without support for 512 bit vectors. +#if defined(__AVX10_1_512__) + #define SIMD512 1 +#elif !defined(__AVX10_1__) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) + #define SIMD512 1 +#endif + +// AVX512VL instructions applied to 256 & 128 bit vectors is supported with +// either AVX512VL or any version of AVX10. 
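/* Illustrative sketch, not part of the patch: a minimal probe of the feature
 * gates introduced here (SIMD512 above, VL256 and VBMI just below). It assumes
 * simd-utils.h can be included on its own; the file name probe.c is only an
 * example.
 *
 *   gcc -mavx512f -mavx512vl -mavx512dq -mavx512bw probe.c  ->  SIMD512 VL256
 *   gcc-14 -mavx10.1-256 probe.c                            ->  VL256 VBMI
 *   gcc-14 -mavx10.1-512 probe.c                            ->  SIMD512 VL256 VBMI
 */
#include <stdio.h>
#include "simd-utils.h"

int main(void)
{
#if defined(SIMD512)
   puts( "SIMD512: 512, 256 & 128 bit vector paths enabled" );
#endif
#if defined(VL256)
   puts( "VL256:   AVX512VL-class instructions usable on 256 & 128 bit vectors" );
#endif
#if defined(VBMI)
   puts( "VBMI:    VBMI byte-permute paths enabled" );
#endif
   return 0;
}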
+#if defined(__AVX10_1__) + #define VL256 1 +#elif defined(__AVX512VL__) + #define VL256 1 +#endif + +// VBMI does not exist on early versions of AVX512 +#if defined(__AVX10_1__) || defined(__AVX512VBMI__) + #define VBMI 1 +#endif + +/* +#if defined(SIMD512) +#warning "SIMD512" +#endif +#if defined(VBMI) +#warning "VBMI" +#endif +#if defined(VL256) +#warning "VL256" +#endif +*/ #if defined(__x86_64__) diff --git a/simd-utils/intrlv.h b/simd-utils/intrlv.h index d5a87fd8..93687d8d 100644 --- a/simd-utils/intrlv.h +++ b/simd-utils/intrlv.h @@ -469,7 +469,7 @@ static inline void v128_bswap32_intrlv80_4x32( void *d, const void *src ) #if defined(__SSSE3__) const v128_t bswap_shuf = _mm_set_epi64x( 0x0c0d0e0f08090a0b, - 0x0405060700010203 ); + 0x0405060700010203 ); s0 = _mm_shuffle_epi8( s0, bswap_shuf ); s1 = _mm_shuffle_epi8( s1, bswap_shuf ); @@ -913,9 +913,7 @@ static inline void extr_lane_8x32( void *d, const void *s, #if defined(__AVX2__) -#if defined(__AVX512VL__) && defined(__AVX512VBMI__) - -//TODO Enable for AVX10_256 AVX10_512 +#if defined(VL256) && defined(VBMI) // Combine byte swap & broadcast in one permute static inline void mm256_bswap32_intrlv80_8x32( void *d, const void *src ) @@ -977,7 +975,7 @@ static inline void mm256_bswap32_intrlv80_8x32( void *d, const void *src ) static inline void mm256_bswap32_intrlv80_8x32( void *d, const void *src ) { const v128_t bswap_shuf = _mm_set_epi64x( 0x0c0d0e0f08090a0b, - 0x0405060700010203 ); + 0x0405060700010203 ); const __m256i c1 = v256_32( 1 ); const __m256i c2 = _mm256_add_epi32( c1, c1 ); const __m256i c3 = _mm256_add_epi32( c2, c1 ); @@ -1035,7 +1033,8 @@ static inline void mm256_bswap32_intrlv80_8x32( void *d, const void *src ) _mm256_castsi128_si256( s4 ), c3 ); } -#endif // AVX512VBMI else +#endif + #endif // AVX2 // 16x32 @@ -1417,11 +1416,9 @@ static inline void extr_lane_16x32( void *d, const void *s, ((uint32_t*)d)[15] = ((const uint32_t*)s)[ lane+240 ]; } -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) -#if defined(__AVX512VBMI__) - -// TODO Enable for AVX10_512 +#if defined(VBMI) // Combine byte swap & broadcast in one permute static inline void mm512_bswap32_intrlv80_16x32( void *d, const void *src ) @@ -1540,7 +1537,7 @@ static inline void mm512_bswap32_intrlv80_16x32( void *d, const void *src ) _mm512_castsi128_si512( s4 ) ); } -#endif // VBMI else +#endif #endif // AVX512 /////////////////////////// @@ -1983,9 +1980,9 @@ static inline void mm256_intrlv80_4x64( void *d, const void *src ) #endif -#if defined(__AVX512VL__) && defined(__AVX512VBMI__) +#if defined(__AVX2__) -//TODO Enable for AVX10_256 AVX10_512 +#if defined(VL256) && defined(VBMI) static inline void mm256_bswap32_intrlv80_4x64( void *d, const void *src ) { @@ -2019,7 +2016,7 @@ static inline void mm256_bswap32_intrlv80_4x64( void *d, const void *src ) _mm256_castsi128_si256( s4 ) ); } -#elif defined(__AVX2__) +#else static inline void mm256_bswap32_intrlv80_4x64( void *d, const void *src ) { @@ -2049,6 +2046,8 @@ static inline void mm256_bswap32_intrlv80_4x64( void *d, const void *src ) _mm256_castsi128_si256( s4 ), 0x55 ); } +#endif + #endif // AVX2 #endif // SSE2 @@ -2375,9 +2374,7 @@ static inline void extr_lane_8x64( void *dst, const void *src, const int lane, #endif // SSE2 -#if defined(__AVX512F__) && defined(__AVX512VL__) - -//TODO Enable for AVX10_512 +#if defined(SIMD512) // broadcast to all lanes static inline void mm512_intrlv80_8x64( void *dst, const void *src ) @@ 
-2399,7 +2396,7 @@ static inline void mm512_intrlv80_8x64( void *dst, const void *src ) // byte swap and broadcast to all lanes -#if defined(__AVX512VBMI__) +#if defined(VBMI) // Combine byte swap & broadcast in one permute static inline void mm512_bswap32_intrlv80_8x64( void *d, const void *src ) @@ -2626,10 +2623,9 @@ static inline void dintrlv_4x128_512( void *dst0, void *dst1, void *dst2, #endif // SSE2 -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) -#if defined(__AVX512VBMI__) -//TODO Enable for AVX10_512 +#if defined(VBMI) static inline void mm512_bswap32_intrlv80_4x128( void *d, const void *src ) { @@ -3532,9 +3528,7 @@ do { \ #endif // AVX2 -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) - -//TODO Enable for AVX10_512 +#if defined(SIMD512) /* #define mm512_intrlv_blend_128( hi, lo ) \ @@ -3559,7 +3553,7 @@ do { \ dst[7] = _mm512_mask_blend_epi64( mask, a[7], b[7] ); \ } while(0) -#endif // AVX512 +#endif // SIMD512 #undef ILEAVE_4x32 #undef LOAD_SRCE_4x32 diff --git a/simd-utils/simd-128.h b/simd-utils/simd-128.h index 2fa9898f..b963c24f 100644 --- a/simd-utils/simd-128.h +++ b/simd-utils/simd-128.h @@ -207,12 +207,12 @@ static inline __m128i mm128_mov32_128( const uint32_t n ) #endif -// broadcast (replicate) lane l to all lanes -#define v128_replane64( v, l ) \ +// Broadcast lane l to all lanes +#define v128_duplane64( v, l ) \ ( (l) == 0 ) ? _mm_shuffle_epi32( v, 0x44 ) \ : _mm_shuffle_epi32( v, 0xee ) -#define v128_replane32( v, l ) \ +#define v128_duplane32( v, l ) \ ( (l) == 0 ) ? _mm_shuffle_epi32( v, 0x00 ) \ : ( (l) == 1 ) ? _mm_shuffle_epi32( v, 0x55 ) \ : ( (l) == 2 ) ? _mm_shuffle_epi32( v, 0xaa ) \ @@ -347,8 +347,7 @@ static inline __m128i v128_neg1_fn() // Basic operations without equivalent SIMD intrinsic // Bitwise not (~v) -#if defined(__AVX512VL__) -//TODO Enable for AVX10_256 +#if defined(VL256) static inline __m128i v128_not( const __m128i v ) { return _mm_ternarylogic_epi64( v, v, v, 1 ); } @@ -402,8 +401,10 @@ static inline void v128_memcpy( v128_t *dst, const v128_t *src, const int n ) { for ( int i = 0; i < n; i ++ ) dst[i] = src[i]; } #define memcpy_128 v128_memcpy -#if defined(__AVX512VL__) -//TODO Enable for AVX10_256 +#if defined(VL256) + +// ~v1 | v0 +#define v128_ornot( v1, v0 ) _mm_ternarylogic_epi64( v1, v0, v0, 0xcf ) // a ^ b ^ c #define v128_xor3( a, b, c ) _mm_ternarylogic_epi64( a, b, c, 0x96 ) @@ -434,6 +435,8 @@ static inline void v128_memcpy( v128_t *dst, const v128_t *src, const int n ) #else +#define v128_ornot( v1, v0 ) _mm_or_si128( v1, v128_not( v0 ) ) + #define v128_xor3( a, b, c ) _mm_xor_si128( a, _mm_xor_si128( b, c ) ) #define v128_and3( a, b, c ) _mm_and_si128( a, _mm_and_si128( b, c ) ) @@ -454,7 +457,6 @@ static inline void v128_memcpy( v128_t *dst, const v128_t *src, const int n ) #endif -#define v128_ornot( a, b ) _mm_or_si128( a, v128_not( b ) ) // Mask making // Equivalent of AVX512 _mm_movepi64_mask & _mm_movepi32_mask. @@ -494,7 +496,7 @@ static inline void v128_memcpy( v128_t *dst, const v128_t *src, const int n ) #define v128_rol32_sse2( v, c ) \ _mm_or_si128( _mm_slli_epi32( v, c ), _mm_srli_epi32( v, 32-(c) ) ) -#if defined(__AVX512VL__) +#if defined(VL256) // AVX512 fastest for all rotations. 
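/* Illustrative sketch, not part of the patch: what the VL256 gate buys for the
 * rotate macros defined next. With AVX512VL (or any AVX10 level) a 64-bit
 * rotate is a single vprorq; otherwise the shift+shift+or pattern of the
 * *_sse2 fallbacks above is required. ror64_17 is just an example name. */
#include <immintrin.h>

static inline __m128i ror64_17( const __m128i v )
{
#if defined(VL256)
   return _mm_ror_epi64( v, 17 );                        // one instruction
#else
   return _mm_or_si128( _mm_srli_epi64( v, 17 ),         // bits shifted out the bottom...
                        _mm_slli_epi64( v, 64 - 17 ) );  // ...re-enter at the top
#endif
}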
#define v128_ror64 _mm_ror_epi64 @@ -609,13 +611,15 @@ static inline void v128_memcpy( v128_t *dst, const v128_t *src, const int n ) // deprecated #define mm128_rol_32 v128_rol32 +// ror( v1 ^ v0, n ) +#define v128_ror64xor( v1, v0, n ) v128_ror64( v128_xor( v1, v0 ), n ) + /* not used // x2 rotates elements in 2 individual vectors in a double buffered // optimization for SSE2, does nothing for AVX512 but is there for // transparency. -#if defined(__AVX512VL__) -//TODO Enable for AVX10_256 +#if defined(VL256) #define v128_2ror64( v1, v0, c ) \ _mm_ror_epi64( v0, c ); \ @@ -917,10 +921,8 @@ static inline void mm128_block_bswap32_512( __m128i *d, const __m128i *s ) #define v128_block_bswap32 mm128_block_bswap_32 #define v128_block_bswap64 mm128_block_bswap_64 - // alignr instruction for 32 & 64 bit elements is only available with AVX512 // but emulated here. Behaviour is consistent with Intel alignr intrinsics. - #if defined(__SSSE3__) #define v128_alignr8 _mm_alignr_epi8 @@ -929,6 +931,9 @@ static inline void mm128_block_bswap32_512( __m128i *d, const __m128i *s ) #else +#define v128_alignr8( hi, lo, c ) \ + _mm_or_si128( _mm_slli_si128( hi, c ), _mm_srli_si128( lo, c ) ) + #define v128_alignr64( hi, lo, c ) \ _mm_or_si128( _mm_slli_si128( hi, (c)*8 ), _mm_srli_si128( lo, (c)*8 ) ) @@ -937,12 +942,15 @@ static inline void mm128_block_bswap32_512( __m128i *d, const __m128i *s ) #endif +// blend using vector mask #if defined(__SSE4_1__) +// Bytewise using sign bit of each byte element of mask #define v128_blendv _mm_blendv_epi8 #else +// Bitwise #define v128_blendv( v1, v0, mask ) \ v128_or( v128_andnot( mask, v1 ), v128_and( mask, v0 ) ) diff --git a/simd-utils/simd-256.h b/simd-utils/simd-256.h index c568dafa..f48bfd5a 100644 --- a/simd-utils/simd-256.h +++ b/simd-utils/simd-256.h @@ -66,8 +66,7 @@ typedef union // Set either the low or high 64 bit elements in 128 bit lanes, other elements // are set to zero. 
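/* Illustrative sketch, not part of the patch: how the 8-bit immediates used by
 * the vpternlog-based macros in simd-128.h above and the 256/512-bit headers
 * that follow (not, ornot, xor3, ...) are derived. For
 * _mm_ternarylogic_epi64( a, b, c, imm ) each result bit is
 * bit [ (a<<2) | (b<<1) | c ] of imm, so the immediate is simply the truth
 * table of the desired expression:
 *
 *   a ^ b ^ c                       ->  bits 7..0 = 10010110  =  0x96  (xor3)
 *   ~a | b   (called with b == c)   ->  bits 7..0 = 11001111  =  0xcf  (ornot)
 *   ~a       (a==b==c, only rows 000 and 111 occur)          ->  0x01  (not)
 *
 * The scalar helper below rebuilds an immediate from any 3-input boolean
 * function; it is only a checking aid, not part of the miner. */
#include <stdint.h>

static inline uint8_t ternlog_imm( int (*f)( int, int, int ) )
{
   uint8_t imm = 0;
   for ( int i = 0; i < 8; i++ )
      if ( f( (i >> 2) & 1, (i >> 1) & 1, i & 1 ) )
         imm |= (uint8_t)( 1 << i );
   return imm;
}

static int xor3_fn ( int a, int b, int c ) { return a ^ b ^ c; }
static int ornot_fn( int a, int b, int c ) { (void)c; return (!a) | b; }

// ternlog_imm( xor3_fn )  == 0x96
// ternlog_imm( ornot_fn ) == 0xcf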
-#if defined(__AVX512VL__) -//TODO Enable for AVX10_256 +#if defined(VL256) #define mm256_bcast128lo_64( i64 ) _mm256_maskz_set1_epi64( 0x55, i64 ) #define mm256_bcast128hi_64( i64 ) _mm256_maskz_set1_epi64( 0xaa, i64 ) @@ -117,8 +116,7 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n ) // // Basic operations without SIMD equivalent -#if defined(__AVX512VL__) -//TODO Enable for AVX10_256 +#if defined(VL256) static inline __m256i mm256_not( const __m256i v ) { return _mm256_ternarylogic_epi64( v, v, v, 1 ); } @@ -137,8 +135,10 @@ static inline __m256i mm256_not( const __m256i v ) #define mm256_add4_32( a, b, c, d ) \ _mm256_add_epi32( _mm256_add_epi32( a, b ), _mm256_add_epi32( c, d ) ) -#if defined(__AVX512VL__) -//TODO Enable for AVX10_256 +#if defined(VL256) + +// ~v1 | v0 +#define mm256_ornot( v1, v0 ) _mm256_ternarylogic_epi64( v1, v0, v0, 0xcf ) // a ^ b ^ c #define mm256_xor3( a, b, c ) _mm256_ternarylogic_epi64( a, b, c, 0x96 ) @@ -172,6 +172,8 @@ static inline __m256i mm256_not( const __m256i v ) #else +#define mm256_ornot( v1, v0 ) _mm256_or_si256( v1, mm256_not( v0 ) ) + #define mm256_xor3( a, b, c ) \ _mm256_xor_si256( a, _mm256_xor_si256( b, c ) ) @@ -257,7 +259,7 @@ static inline __m256i mm256_not( const __m256i v ) _mm256_or_si256( _mm256_slli_epi32( v, c ), \ _mm256_srli_epi32( v, 32-(c) ) ) -#if defined(__AVX512VL__) +#if defined(VL256) #define mm256_ror_64 _mm256_ror_epi64 #define mm256_rol_64 _mm256_rol_epi64 @@ -343,8 +345,7 @@ static inline __m256i mm256_not( const __m256i v ) // optimization for AVX2, does nothing for AVX512 but is here for // transparency. -#if defined(__AVX512VL__) -//TODO Enable for AVX10_256 +#if defined(VL256) /* #define mm256_ror_64 _mm256_ror_epi64 #define mm256_rol_64 _mm256_rol_epi64 @@ -470,7 +471,7 @@ static inline __m256i mm256_not( const __m256i v ) /* Not used // Rotate 256 bit vector by one 32 bit element. -#if defined(__AVX512VL__) +#if defined(VL256) static inline __m256i mm256_shuflr_32( const __m256i v ) { return _mm256_alignr_epi32( v, v, 1 ); } static inline __m256i mm256_shufll_32( const __m256i v ) @@ -507,8 +508,8 @@ static inline __m256i mm256_shufll_32( const __m256i v ) #define mm256_shuflr128_32(v) _mm256_shuffle_epi32( v, 0x39 ) #define mm256_shufll128_32(v) _mm256_shuffle_epi32( v, 0x93 ) -#define mm256_shuflr128_16(v) _mm256_shuffle_epi16( v, 0x39 ) -#define mm256_shufll128_16(v) _mm256_shuffle_epi16( v, 0x93 ) +#define mm256_shuflr128_16(v) mm256_shuffle_16( v, 0x39 ) +#define mm256_shufll128_16(v) mm256_shuffle_16( v, 0x93 ) /* Not used static inline __m256i mm256_shuflr128_x8( const __m256i v, const int c ) @@ -606,6 +607,22 @@ static inline __m256i mm256_shuflr128_x8( const __m256i v, const int c ) casti_m256i( d,15 ) = _mm256_shuffle_epi8( casti_m256i( s,15 ), ctl ); \ } +#if defined(VL256) + +#define mm256_alignr64 _mm256_alignr_epi64 + +#else + +#define mm256_alignr64( v1, v0, c ) \ + ( ( (c) & 3 ) == 1 ) ? _mm256_blend_epi32( mm256_shuflr_64( v1 ), \ + mm256_shuflr_64( v0 ), 0x3f ) \ + : ( ( (c) & 3 ) == 2 ) ? _mm256_blend_epi32( mm256_rev_128( v1 ), \ + mm256_rev_128( v0 ), 0x0f ) \ + : ( ( (c) & 3 ) == 3 ) ? _mm256_blend_epi32( mm256_shufll_64( v1 ), \ + mm256_shufll_64( v0 ), 0x03 ) \ + : v0 + +#endif #endif // __AVX2__ #endif // SIMD_256_H__ diff --git a/simd-utils/simd-512.h b/simd-utils/simd-512.h index 66ea45cf..7675ada4 100644 --- a/simd-utils/simd-512.h +++ b/simd-utils/simd-512.h @@ -14,7 +14,13 @@ // vectors. 
It is therefore not technically required for any 512 bit vector // utilities defined below. -#if defined(__x86_64__) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +// if avx10 // avx512 is always set +// if evex512: yes +// else if avx512 : yes // avx512 is set but not avx10 +// else : no // avx512 not set or avx10.1 is set without evex512 + + +#if defined(SIMD512) // AVX512 intrinsics have a few changes from previous conventions. // @@ -180,6 +186,9 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n ) // Ternary logic uses 8 bit truth table to define any 3 input logical // expression using any number or combinations of AND, OR, XOR, NOT. +// ~v1 | v0 +#define mm512_ornot( v1, v0 ) _mm512_ternarylogic_epi64( v1, v0, v0, 0xcf ) + // a ^ b ^ c #define mm512_xor3( a, b, c ) _mm512_ternarylogic_epi64( a, b, c, 0x96 ) diff --git a/simd-utils/simd-neon.h b/simd-utils/simd-neon.h index b8a31da8..9c55b16c 100644 --- a/simd-utils/simd-neon.h +++ b/simd-utils/simd-neon.h @@ -4,22 +4,20 @@ #if defined(__aarch64__) && defined(__ARM_NEON) // Targeted functions supporting NEON SIMD 128 & 64 bit vectors. -// Size matters! +// Element size matters! // // Intel naming is generally used. // -// documented instructions that aren't defined on RPi 4. -// They seem to be all 3 op instructionsi. +// Some advanced logical operations that require SHA3. Prior to GCC-13 +// they also require armv8.2 // -// veor3q ie xor3 -// vxarq_u64( v1, v0, n ) ror( xor( v1, v0 ), n ) -// vraxlq_u64( v1, v0 ) xor( rol( v1, 1 ), rol( v0, 1 ) ) -// vbcaxq( v2, v1, v0 ) xor( v2, and( v1, not(v0) ) ) -// vsraq_n( v1, v0, n ) add( v1, sr( v0, n ) ) +// veor3q( v2, v1, v0 ) xor3 v2 ^ v1 ^ v0 +// vxarq_u64( v1, v0, n ) ror64xor ( v1 ^ v0 ) >>> n ) +// vbcaxq_u{64,32,16,8}( v2, v1, v0 ) xorandnot v2 ^ ( v1 & ~v0 ) // -// Doesn't work on RPi but works on OPi: -// -// vornq( v1, v0 ) or( v1, not( v0 ) ) +// not used anywhere yet +// vrax1q_u64( v1, v0 ) v1 ^ ( v0 <<< 1 ) +// vsraq_n_u{64,32,16,8}( v1, v0, n ) v1 + ( v0 >> n ) #define v128_t uint32x4_t // default, #define v128u64_t uint64x2_t @@ -87,15 +85,15 @@ static inline uint64x2_t v128_mulw32( uint32x4_t v1, uint32x4_t v0 ) // Not yet needed //#define v128_cmpeq1 // Signed -#define v128_cmpgt64( v1, v0 ) vcgtq_s64( (int64x2_t)v1, (int64x2_t)v0 ) -#define v128_cmpgt32( v1, v0 ) vcgtq_s32( (int32x4_t)v1, (int32x4_t)v0 ) -#define v128_cmpgt16( v1, v0 ) vcgtq_s16( (int16x8_t)v1, (int16x8_t)v0 ) -#define v128_cmpgt8( v1, v0 ) vcgtq_s8( (int8x16_t)v1, (int8x16_t)v0 ) +#define v128_cmpgt64( v1, v0 ) vcgtq_s64( (int64x2_t)v1, (int64x2_t)(v0) ) +#define v128_cmpgt32( v1, v0 ) vcgtq_s32( (int32x4_t)v1, (int32x4_t)(v0) ) +#define v128_cmpgt16( v1, v0 ) vcgtq_s16( (int16x8_t)v1, (int16x8_t)(v0) ) +#define v128_cmpgt8( v1, v0 ) vcgtq_s8( (int8x16_t)v1, (int8x16_t)(v0) ) -#define v128_cmplt64( v1, v0 ) vcltq_s64( (int64x2_t)v1, (int64x2_t)v0 ) -#define v128_cmplt32( v1, v0 ) vcltq_s32( (int32x4_t)v1, (int32x4_t)v0 ) -#define v128_cmplt16( v1, v0 ) vcltq_s16( (int16x8_t)v1, (int16x8_t)v0 ) -#define v128_cmplt8( v1, v0 ) vcltq_s8( (int8x16_t)v1, (int8x16_t)v0 ) +#define v128_cmplt64( v1, v0 ) vcltq_s64( (int64x2_t)v1, (int64x2_t)(v0) ) +#define v128_cmplt32( v1, v0 ) vcltq_s32( (int32x4_t)v1, (int32x4_t)(v0) ) +#define v128_cmplt16( v1, v0 ) vcltq_s16( (int16x8_t)v1, (int16x8_t)(v0) ) +#define v128_cmplt8( v1, v0 ) vcltq_s8( (int8x16_t)v1, (int8x16_t)(v0) ) // Logical bit shift #define v128_sl64 vshlq_n_u64 @@ -109,33 
+107,38 @@ static inline uint64x2_t v128_mulw32( uint32x4_t v1, uint32x4_t v0 ) #define v128_sr8 vshrq_n_u8 // Arithmetic shift. -#define v128_sra64( v, c ) vshrq_n_s64( (int64x2_t)v, c ) -#define v128_sra32( v, c ) vshrq_n_s32( (int32x4_t)v, c ) -#define v128_sra16( v, c ) vshrq_n_s16( (int16x8_t)v, c ) +#define v128_sra64( v, c ) vshrq_n_s64( (int64x2_t)(v), c ) +#define v128_sra32( v, c ) vshrq_n_s32( (int32x4_t)(v), c ) +#define v128_sra16( v, c ) vshrq_n_s16( (int16x8_t)(v), c ) // unary logic + #define v128_not vmvnq_u32 // binary logic + #define v128_or vorrq_u32 #define v128_and vandq_u32 #define v128_xor veorq_u32 // ~v1 & v0 -#define v128_andnot( v1, v0 ) vandq_u32( vmvnq_u32( v1 ), v0 ) +#define v128_andnot( v1, v0 ) vbicq_u32( v0, v1 ) // ~( a ^ b ), same as (~a) ^ b #define v128_xnor( v1, v0 ) v128_not( v128_xor( v1, v0 ) ) -// ~v1 | v0, x86_64 convention, first arg is not'ed -#define v128_ornot( v1, v0 ) vornq_u32( v0, v1 ) +// ~v1 | v0, args reversed for consistency with x86_64 +#define v128_ornot( v1, v0 ) vornq_u32( v0, v1 ) // ternary logic -// v2 ^ v1 ^ v0 -// veorq_u32 not defined -//#define v128_xor3 veor3q_u32 -#define v128_xor3( v2, v1, v0 ) veorq_u32( v2, veorq_u32( v1, v0 ) ) +// This will compile with GCC-11 on armv8.2 and above. At this time there is no +// known way to test arm minor version. +#if defined(__ARM_FEATURE_SHA3) + #define v128_xor3 veor3q_u32 +#else + #define v128_xor3( v2, v1, v0 ) veorq_u32( v2, veorq_u32( v1, v0 ) ) +#endif // v2 & v1 & v0 #define v128_and3( v2, v1, v0 ) v128_and( v2, v128_and( v1, v0 ) ) @@ -143,8 +146,12 @@ static inline uint64x2_t v128_mulw32( uint32x4_t v1, uint32x4_t v0 ) // v2 | v1 | v0 #define v128_or3( v2, v1, v0 ) v128_or( v2, v128_or( v1, v0 ) ) -// a ^ ( ~b & c ) -#define v128_xorandnot( v2, v1, v0 ) v128_xor( v2, v128_andnot( v1, v0 ) ) +// v2 ^ ( ~v1 & v0 ) +#if defined(__ARM_FEATURE_SHA3) + #define v128_xorandnot( v2, v1, v0 ) vbcaxq_u32( v2, v0, v1 ) +#else + #define v128_xorandnot( v2, v1, v0 ) v128_xor( v2, v128_andnot( v1, v0 ) ) +#endif // a ^ ( b & c ) #define v128_xorand( v2, v1, v0 ) v128_xor( v2, v128_and( v1, v0 ) ) @@ -158,12 +165,12 @@ static inline uint64x2_t v128_mulw32( uint32x4_t v1, uint32x4_t v0 ) // v2 | ( v1 & v0 ) #define v128_orand( v2, v1, v0 ) v128_or( v2, v128_and( v1, v0 ) ) -// shift 2 concatenated vectors right. +// shift 2 concatenated vectors right, args reversed for consistency with x86_64 #define v128_alignr64( v1, v0, c ) vextq_u64( v0, v1, c ) #define v128_alignr32( v1, v0, c ) vextq_u32( v0, v1, c ) #define v128_alignr8( v1, v0, c ) vextq_u8( v0, v1, c ) -// Intetleave high or low half of 2 vectors. +// Interleave high or low half of 2 vectors. 
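/* Illustrative sketch, not part of the patch: a quick self-check of the operand
 * order conventions noted above. v128_andnot( v1, v0 ) is ~v1 & v0 (x86_64
 * style) and maps to vbicq_u32( v0, v1 ) = v0 & ~v1; v128_alignr64( v1, v0, 1 )
 * maps to vextq_u64( v0, v1, 1 ) and yields { v0.hi, v1.lo }, matching the
 * SSE2/SSSE3 emulation in simd-128.h. */
#include <stdio.h>
#include <arm_neon.h>

int main(void)
{
   const uint64x2_t v0 = { 0x00, 0x11 };            // lane 0 = low, lane 1 = high
   const uint64x2_t v1 = { 0x22, 0x33 };

   uint64x2_t r = vextq_u64( v0, v1, 1 );           // v128_alignr64( v1, v0, 1 )
   printf( "alignr64: %llx %llx\n",                 // expected: 11 22
           (unsigned long long)vgetq_lane_u64( r, 0 ),
           (unsigned long long)vgetq_lane_u64( r, 1 ) );

   uint32x4_t a = vdupq_n_u32( 0x0f0f0f0f );
   uint32x4_t m = vdupq_n_u32( 0xff00ff00 );
   uint32x4_t n = vbicq_u32( a, m );                // v128_andnot( m, a ) = ~m & a
   printf( "andnot:   %08x\n", vgetq_lane_u32( n, 0 ) );   // expected: 000f000f
   return 0;
}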
#define v128_unpacklo64( v1, v0 ) vzip1q_u64( v1, v0 ) #define v128_unpackhi64( v1, v0 ) vzip2q_u64( v1, v0 ) #define v128_unpacklo32( v1, v0 ) vzip1q_u32( v1, v0 ) @@ -214,10 +221,10 @@ typedef union #define v128_bcast32(v) vdupq_laneq_u32( v, 0 ) #define v128_bcast16(v) vdupq_laneq_u16( v, 0 ) -// Replicate (broadcast) lane l to all lanes -#define v128_replane64( v, l ) vdupq_laneq_u64( v, l ) -#define v128_replane32( v, l ) vdupq_laneq_u32( v, l ) -#define v128_replane16( v, l ) vdupq_laneq_u16( v, l ) +// Broadcast lane l to all lanes +#define v128_duplane64( v, l ) vdupq_laneq_u64( v, l ) +#define v128_duplane32( v, l ) vdupq_laneq_u32( v, l ) +#define v128_duplane16( v, l ) vdupq_laneq_u16( v, l ) // pointer indexing #define casti_v128( p, i ) (((uint32x4_t*)(p))[i]) @@ -232,16 +239,6 @@ typedef union #define cast_v128u32( p ) (*((uint32x4_t*)(p))) #define castp_v128u32( p ) ((uint32x4_t*)(p)) -// use C cast, flexible source type -#define u32_to_u64 vreinterpretq_u64_u32 -#define u64_to_u32 vreinterpretq_u32_u64 - -#define u64_to_u8 vreinterpretq_u8_u64 -#define u8_to_u64 vreinterpretq_u64_u8 - -#define u32_to_u8 vreinterpretq_u8_u32 -#define u8_to_u32 vreinterpretq_u32_u8 - #define v128_zero v128_64( 0ull ) #define v128_cmpeq_zero vceqzq_u64 @@ -336,35 +333,56 @@ static inline void v128_memcpy( void *dst, const void *src, const int n ) #define v128_movmask64 // Bit rotation + #define v128_ror64( v, c ) \ - ( (c) == 32 ) ? (uint64x2_t)vrev64q_u32( ((uint32x4_t)v) ) \ - : vsriq_n_u64( vshlq_n_u64( ((uint64x2_t)v), 64-c ), ((uint64x2_t)v), c ) + ( (c) == 32 ) ? (uint64x2_t)vrev64q_u32( ((uint32x4_t)(v)) ) \ + : vsriq_n_u64( vshlq_n_u64( ((uint64x2_t)(v)), 64-(c) ), \ + ((uint64x2_t)(v)), c ) #define v128_rol64( v, c ) \ - ( (c) == 32 ) ? (uint64x2_t)vrev64q_u32( ((uint32x4_t)v) ) \ - : vsliq_n_u64( vshrq_n_u64( ((uint64x2_t)v), 64-c ), ((uint64x2_t)v), c ) + ( (c) == 32 ) ? (uint64x2_t)vrev64q_u32( ((uint32x4_t)(v)) ) \ + : vsliq_n_u64( vshrq_n_u64( ((uint64x2_t)(v)), 64-(c) ), \ + ((uint64x2_t)(v)), c ) #define v128_ror32( v, c ) \ - ( (c) == 16 ) ? (uint32x4_t)vrev32q_u16( ((uint16x8_t)v) ) \ - : vsriq_n_u32( vshlq_n_u32( ((uint32x4_t)v), 32-c ), ((uint32x4_t)v), c ) + ( (c) == 16 ) ? (uint32x4_t)vrev32q_u16( ((uint16x8_t)(v)) ) \ + : vsriq_n_u32( vshlq_n_u32( ((uint32x4_t)(v)), 32-(c) ), \ + ((uint32x4_t)(v)), c ) #define v128_rol32( v, c ) \ - ( (c) == 16 ) ? (uint32x4_t)vrev32q_u16( ((uint16x8_t)v) ) \ - : vsliq_n_u32( vshrq_n_u32( ((uint32x4_t)v), 32-c ), ((uint32x4_t)v), c ) + ( (c) == 16 ) ? (uint32x4_t)vrev32q_u16( ((uint16x8_t)(v)) ) \ + : vsliq_n_u32( vshrq_n_u32( ((uint32x4_t)(v)), 32-(c) ), \ + ((uint32x4_t)(v)), c ) #define v128_ror16( v, c ) \ - ( (c) == 8 ) ? (uint16x8_t)vrev16q_u8( ((uint8x16_t)v) ) \ - : vsriq_n_u16( vshlq_n_u16( ((uint16x8_t)v), 16-c ), ((uint16x8_t)v), c ) + ( (c) == 8 ) ? (uint16x8_t)vrev16q_u8( ((uint8x16_t)(v)) ) \ + : vsriq_n_u16( vshlq_n_u16( ((uint16x8_t)(v)), 16-(c) ), \ + ((uint16x8_t)(v)), c ) #define v128_rol16( v, c ) \ ( (c) == 8 ) ? 
(uint16x8_t)vrev16q_u8( ((uint8x16_t)v) ) \ - : vsliq_n_u16( vshrq_n_u16( ((uint16x8_t)v), 16-c ), ((uint16x8_t)v), c ) + : vsliq_n_u16( vshrq_n_u16( ((uint16x8_t)(v)), 16-(c) ), \ + ((uint16x8_t)(v)), c ) #define v128_ror8( v, c ) \ - vsriq_n_u8( vshlq_n_u8( ((uint8x16_t)v), 8-c ), ((uint8x16_t)v), c ) + vsriq_n_u8( vshlq_n_u8( ((uint8x16_t)(v)), 8-(c) ), \ + ((uint8x16_t)(v)), c ) #define v128_rol8( v, c ) \ - vsliq_n_u8( vshrq_n_u8( ((uint8x16_t)v), 8-c ), ((uint8x16_t)v), c ) + vsliq_n_u8( vshrq_n_u8( ((uint8x16_t)(v)), 8-(c) ), \ + ((uint8x16_t)(v)), c ) + + +// ror( v1 ^ v0, n ) +#if defined(__ARM_FEATURE_SHA3) + +#define v128_ror64xor( v1, v0, n ) vxarq_u64( v1, v0, n ) + +#else + +#define v128_ror64xor( v1, v0, n ) v128_ror64( v128_xor( v1, v0 ), n ) + +#endif #define v128_2ror64( v1, v0, c ) \ { \ @@ -416,7 +434,7 @@ static inline void v128_memcpy( void *dst, const void *src, const int n ) */ #define v128_shuffle8( v, vmask ) \ - vqtbl1q_u8( (uint8x16_t)v, (uint8x16_t)vmask ) + vqtbl1q_u8( (uint8x16_t)(v), (uint8x16_t)(vmask) ) // sub-vector shuffles sometimes mirror bit rotation. Shuffle is faster. // Bit rotation already promotes faster widths. Usage is context sensitive. @@ -465,7 +483,6 @@ static inline uint16x8_t v128_shufll16( uint16x8_t v ) #define v128_bswap32(v) (uint32x4_t)vrev32q_u8( (uint8x16_t)(v) ) #define v128_bswap64(v) (uint64x2_t)vrev64q_u8( (uint8x16_t)(v) ) #define v128_bswap128(v) (uint32x4_t)v128_swap64( v128_bswap64(v) ) -#define v128_bswap256(p) v128_bswap128( (p)[0], (p)[1] ) // Usefull for x86_64 but does nothing for ARM #define v128_block_bswap32( dst, src ) \ @@ -534,16 +551,8 @@ static inline uint16x8_t v128_shufll16( uint16x8_t v ) casti_v128u64( dst,15 ) = v128_bswap64( casti_v128u64( src,15 ) ); \ } -// Blendv -#define v128_blendv( v1, v0, mask ) \ - v128_or( v128_andnot( mask, v1 ), v128_and( mask, v0 ) ) - -/* -// vbcaxq not defined -#define v128_blendv( v1, v0, mask ) \ - vbcaxq_u32( v128_and( mask, v1 ), v0, mask ) -*/ +// Bitwise blend using vector mask +#define v128_blendv( v1, v0, mask ) vbslq_u32( mask, v1, v0 ) #endif // __ARM_NEON - #endif // SIMD_NEON_H__ diff --git a/sysinfos.c b/sysinfos.c index 18fb507f..a6c3c769 100644 --- a/sysinfos.c +++ b/sysinfos.c @@ -436,19 +436,40 @@ static inline void cpu_getmodelid(char *outbuf, size_t maxsz) #warning "__ARM_NEON" #endif #ifdef __ARM_FEATURE_CRYPTO -#warning "___ARM_FEATURE_CRYPTO" +#warning "__ARM_FEATURE_CRYPTO" #endif #ifdef __ARM_FEATURE_AES -#warning "___ARM_FEATURE_AES" +#warning "__ARM_FEATURE_AES" #endif #ifdef __ARM_FEATURE_SHA2 -#warning "___ARM_FEATURE_SHA2" +#warning "__ARM_FEATURE_SHA2" #endif #ifdef __ARM_FEATURE_SHA3 -#warning "___ARM_FEATURE_SHA3" +#warning "__ARM_FEATURE_SHA3" #endif */ +// GCC-14.1: the AVX512 macros are defined even when compiled with only +// -mavx10.1-256, causing compile errors in AVX512 code. Only with +// -mavx10.1-512 does it compile successfully. +// __EVEX512__ is set only when compiled with -mavx10.1-512. +// Adding -fno-evex512 doesn't help. +// Building with -mapxf fails to configure on a CPU without APX because it can +// run the test program. +/* +#ifdef __AVX10_1__ +#warning "__AVX10_1__" +#endif +#ifdef __AVX10_1_256__ +#warning "__AVX10_1_256__" +#endif +#ifdef __AVX10_1_512__ +#warning "__AVX10_1_512__" +#endif +#ifdef __EVEX512__ +#warning "__EVEX512__" +#endif +*/ // Typical display format: AVX10.[version]_[vectorlength], if vector length is