diff --git a/README.md b/README.md index c4532bde..ef9279d5 100644 --- a/README.md +++ b/README.md @@ -87,7 +87,6 @@ Supported Algorithms groestl Groestl coin hex x16r-hex hmq1725 - hodl Hodlcoin jha Jackpotcoin keccak Maxcoin keccakc Creative coin @@ -115,9 +114,11 @@ Supported Algorithms scrypt:N scrypt(N, 1, 1) scryptn2 scrypt(1048576, 1, 1) sha256d Double SHA-256 + sha256dt sha256q Quad SHA-256 sha256t Triple SHA-256 sha3d Double keccak256 (BSHA3) + sha512256d skein Skein+Sha (Skeincoin) skein2 Double Skein (Woodcoin) skunk Signatum (SIGT) @@ -145,6 +146,7 @@ Supported Algorithms x16rt-veil veil x16s x17 + x20r x21s x22i x25x diff --git a/RELEASE_NOTES b/RELEASE_NOTES index 3d232b97..5dbbb71b 100644 --- a/RELEASE_NOTES +++ b/RELEASE_NOTES @@ -75,6 +75,13 @@ If not what makes it happen or not happen? Change Log ---------- +v23.14 + +ARM: Groestl AES optimizations enabled. +All: Small optimization to Shabal 4way. +x86_64: Extend Shabal 4way support to SSE2 from SSE4.1. +All: deleted some unused files. + v23.13 Added x20r algo. diff --git a/algo/blake/blake-4way.c b/algo/blake/blake-4way.c index 6671bfae..7662bec4 100644 --- a/algo/blake/blake-4way.c +++ b/algo/blake/blake-4way.c @@ -39,7 +39,7 @@ int scanhash_blake_4way( struct work *work, uint32_t max_nonce, blake256r14_4way_update( &blake_4w_ctx, vdata, 64 ); do { - *noncev = mm128_bswap_32( _mm_set_epi32( n+3, n+2, n+1, n ) ); + *noncev = v128_bswap32( _mm_set_epi32( n+3, n+2, n+1, n ) ); blakehash_4way( hash, vdata ); diff --git a/algo/blake/blakecoin-4way.c b/algo/blake/blakecoin-4way.c index 6ebd75d8..9d9befab 100644 --- a/algo/blake/blakecoin-4way.c +++ b/algo/blake/blakecoin-4way.c @@ -182,7 +182,7 @@ int scanhash_blakecoin_4way( struct work *work, uint32_t max_nonce, blake256r8_4way_update( &blakecoin_4w_ctx, vdata, 64 ); do { - *noncev = mm128_bswap_32( _mm_set_epi32( n+3, n+2, n+1, n ) ); + *noncev = v128_bswap32( _mm_set_epi32( n+3, n+2, n+1, n ) ); pdata[19] = n; blakecoin_4way_hash( hash, vdata ); diff --git a/algo/groestl/aes_ni/groestl-intr-aes.h b/algo/groestl/aes_ni/groestl-intr-aes.h index ec27470f..0c102ad8 100644 --- a/algo/groestl/aes_ni/groestl-intr-aes.h +++ b/algo/groestl/aes_ni/groestl-intr-aes.h @@ -60,54 +60,17 @@ static const v128u64_t SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003 }; #if defined(__ARM_NEON) -// No fast shuffle on NEON -//static const uint32x4_t vmask_d8 = { 3, 1, 2, 0 }; -static const v128u32_t BLEND_MASK = { 0xffffffff, 0, 0, 0xffffffff }; - -#define gr_shuffle32( v ) v128_blendv( v128_qrev32( v ), v, BLEND_MASK ) - -/* -#define TRANSP_MASK \ - 0xd,0x5,0x9,0x1,0xc,0x4,0x8,0x0,0xf,0x7,0xb,0x3,0xe,0x6,0xa,0x2 -#define SUBSH_MASK0 \ - 0xb,0xe,0x1,0x4,0x7,0xa,0xd,0x0,0x3,0x6,0x9,0xc,0xf,0x2,0x5,0x8 -#define SUBSH_MASK1 \ - 0xc,0xf,0x2,0x5,0x8,0xb,0xe,0x1,0x4,0x7,0xa,0xd,0x0,0x3,0x6,0x9 -#define SUBSH_MASK2 \ - 0xd,0x0,0x3,0x6,0x9,0xc,0xf,0x2,0x5,0x8,0xb,0xe,0x1,0x4,0x7,0xa -#define SUBSH_MASK3 \ - 0xe,0x1,0x4,0x7,0xa,0xd,0x0,0x3,0x6,0x9,0xc,0xf,0x2,0x5,0x8,0xb -#define SUBSH_MASK4 \ - 0xf,0x2,0x5,0x8,0xb,0xe,0x1,0x4,0x7,0xa,0xd,0x0,0x3,0x6,0x9,0xc -#define SUBSH_MASK5 \ - 0x0,0x3,0x6,0x9,0xc,0xf,0x2,0x5,0x8,0xb,0xe,0x1,0x4,0x7,0xa,0xd -#define SUBSH_MASK6 \ - 0x1,0x4,0x7,0xa,0xd,0x0,0x3,0x6,0x9,0xc,0xf,0x2,0x5,0x8,0xb,0xe -#define SUBSH_MASK7 \ - 0x6,0x9,0xc,0xf,0x2,0x5,0x8,0xb,0xe,0x1,0x4,0x7,0xa,0xd,0x0,0x3 - -//#define gr_shuffle8( v, c ) v128_shullfev8( v, c ) - - -#define gr_shuffle8( v, c15, c14, c13, c12, c11, c10, c09, c08, \ - c07, c06, c05, c04, c03, c02, c01, c00 ) \ - v128_movlane8( v128_movlane8( v128_movlane8( v128_movlane8( \ - v128_movlane8( v128_movlane8( v128_movlane8( v128_movlane8( \ - v128_movlane8( v128_movlane8( v128_movlane8( v128_movlane8( \ - v128_movlane8( v128_movlane8( v128_movlane8( v128_movlane8( \ - v, 15, v, c15 ), 14, v, c14 ), 13, v, c13 ), 12, v, c12 ), \ - 11, v, c11 ), 10, v, c10 ), 9, v, c09 ), 8, v, c08 ), \ - 7, v, c07 ), 6, v, c06 ), 5, v, c05 ), 4, v, c04 ), \ - 3, v, c03 ), 2, v, c02 ), 1, v, c01 ), 0, v, c00 ) -*/ +static const v128u32_t gr_mask __attribute__ ((aligned (16))) = + { 0x03020100, 0x0b0a0908, 0x07060504, 0x0f0e0d0c }; + +#define gr_shuffle32(v) vqtbl1q_u8( v, gr_mask ) #else -#define gr_shuffle32( v ) _mm_shuffle_epi32( v, 0xd8 ) +#define gr_shuffle32(v) _mm_shuffle_epi32( v, 0xd8 ) #endif - #define tos(a) #a #define tostr(a) tos(a) @@ -334,17 +297,16 @@ static const v128u32_t BLEND_MASK = { 0xffffffff, 0, 0, 0xffffffff }; */ #define SUBMIX(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\ /* SubBytes */\ - b0 = v128_xor(b0, b0);\ - a0 = v128_aesenclast(a0, b0);\ - a1 = v128_aesenclast(a1, b0);\ - a2 = v128_aesenclast(a2, b0);\ - a3 = v128_aesenclast(a3, b0);\ - a4 = v128_aesenclast(a4, b0);\ - a5 = v128_aesenclast(a5, b0);\ - a6 = v128_aesenclast(a6, b0);\ - a7 = v128_aesenclast(a7, b0);\ + a0 = v128_aesenclast_nokey( a0 ); \ + a1 = v128_aesenclast_nokey( a1 ); \ + a2 = v128_aesenclast_nokey( a2 ); \ + a3 = v128_aesenclast_nokey( a3 ); \ + a4 = v128_aesenclast_nokey( a4 ); \ + a5 = v128_aesenclast_nokey( a5 ); \ + a6 = v128_aesenclast_nokey( a6 ); \ + a7 = v128_aesenclast_nokey( a7 ); \ /* MixBytes */\ - MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7);\ + MixBytes( a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7 ); \ } #define ROUNDS_P(){\ @@ -362,10 +324,9 @@ static const v128u32_t BLEND_MASK = { 0xffffffff, 0, 0, 0xffffffff }; xmm13 = v128_shuffle8( xmm13, SUBSH_MASK5 ); \ xmm14 = v128_shuffle8( xmm14, SUBSH_MASK6 ); \ xmm15 = v128_shuffle8( xmm15, SUBSH_MASK7 ); \ - /* SubBytes + MixBytes */\ + /* SubBytes + MixBytes */\ SUBMIX( xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, \ xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7 ); \ - \ /* AddRoundConstant P1024 */\ xmm0 = v128_xor( xmm0, \ casti_v128( round_const_p, round_counter+1 ) ); \ @@ -467,7 +428,6 @@ static const v128u32_t BLEND_MASK = { 0xffffffff, 0, 0, 0xffffffff }; t1 = v128_unpackhi16(t1, i3);\ i2 = v128_unpacklo16(i2, i3);\ i0 = v128_unpacklo16(i0, i1);\ -\ /* shuffle with immediate */\ t0 = gr_shuffle32( t0 ); \ t1 = gr_shuffle32( t1 ); \ @@ -477,7 +437,6 @@ static const v128u32_t BLEND_MASK = { 0xffffffff, 0, 0, 0xffffffff }; i2 = gr_shuffle32( i2 ); \ i4 = gr_shuffle32( i4 ); \ i6 = gr_shuffle32( i6 ); \ -\ /* continue with unpack */\ t4 = i0;\ i0 = v128_unpacklo32(i0, i2);\ @@ -584,7 +543,8 @@ static const v128u32_t BLEND_MASK = { 0xffffffff, 0, 0, 0xffffffff }; /* transpose done */\ }/**/ - +#if 0 +// not used void INIT( v128_t* chaining ) { static v128_t xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; @@ -613,6 +573,7 @@ void INIT( v128_t* chaining ) chaining[6] = xmm14; chaining[7] = xmm15; } +#endif void TF1024( v128_t* chaining, const v128_t* message ) { diff --git a/algo/groestl/aes_ni/groestl256-intr-aes.h b/algo/groestl/aes_ni/groestl256-intr-aes.h index 11d05ef7..358ea104 100644 --- a/algo/groestl/aes_ni/groestl256-intr-aes.h +++ b/algo/groestl/aes_ni/groestl256-intr-aes.h @@ -1,3 +1,6 @@ +#if !defined GROESTL256_INTR_AES_H__ +#define GROESTL256_INTR_AES_H__ + /* groestl-intr-aes.h Aug 2011 * * Groestl implementation with intrinsics using ssse3, sse4.1, and aes @@ -50,18 +53,17 @@ static const v128u64_t SUBSH_MASK7 = { 0x090c000306080b07, 0x02050f0a0d01040e }; #if defined(__ARM_NEON) -// No fast shuffle on NEON -static const uint32x4_t vmask_d8 = { 3, 1, 2, 0 }; +static const v128u32_t gr_mask __attribute__ ((aligned (16))) = + { 0x03020100, 0x0b0a0908, 0x07060504, 0x0f0e0d0c }; -#define gr_shuffle32( v ) v128_shufflev32( v, vmask_d8 ) +#define gr_shuffle32(v) vqtbl1q_u8( v, gr_mask ) #else -#define gr_shuffle32( v ) _mm_shuffle_epi32( v, 0xd8 ) +#define gr_shuffle32(v) _mm_shuffle_epi32( v, 0xd8 ) #endif - #define tos(a) #a #define tostr(a) tos(a) @@ -598,4 +600,4 @@ void OF512( v128_t* chaining ) chaining[3] = xmm11; } - +#endif diff --git a/algo/groestl/aes_ni/hash-groestl.c b/algo/groestl/aes_ni/hash-groestl.c index 1f66f62e..089fdb46 100644 --- a/algo/groestl/aes_ni/hash-groestl.c +++ b/algo/groestl/aes_ni/hash-groestl.c @@ -146,7 +146,7 @@ int groestl512( hashState_groestl* ctx, void* output, const void* input, const int hash_offset = SIZE512 - hashlen_m128i; uint64_t blocks = len / SIZE512; v128_t* in = (v128_t*)input; - + // digest any full blocks, process directly from input for ( i = 0; i < blocks; i++ ) TF1024( ctx->chaining, &in[ i * SIZE512 ] ); @@ -181,6 +181,7 @@ int groestl512( hashState_groestl* ctx, void* output, const void* input, // digest final padding block and do output transform TF1024( ctx->chaining, ctx->buffer ); + OF1024( ctx->chaining ); // store hash result in output diff --git a/algo/groestl/aes_ni/hash-groestl.h b/algo/groestl/aes_ni/hash-groestl.h index fe6c1111..c56e980c 100644 --- a/algo/groestl/aes_ni/hash-groestl.h +++ b/algo/groestl/aes_ni/hash-groestl.h @@ -87,6 +87,7 @@ int final_groestl( hashState_groestl*, void* ); int update_and_final_groestl( hashState_groestl*, void*, const void*, int ); int groestl512( hashState_groestl*, void*, const void*, uint64_t ); #define groestl512_full groestl512 +#define groestl512_ctx groestl512 #endif /* __hash_h */ diff --git a/algo/groestl/groestl256-intr-4way.h b/algo/groestl/groestl256-intr-4way.h index a4ffb645..db724720 100644 --- a/algo/groestl/groestl256-intr-4way.h +++ b/algo/groestl/groestl256-intr-4way.h @@ -626,7 +626,7 @@ static const __m256i SUBSH_MASK7_2WAY = #define ROUND_2WAY(i, a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\ /* AddRoundConstant */\ - b1 = mm256_bcast_m128( mm128_mask_32( v128_neg1, 0x3 ) ); \ + b1 = mm256_bcast_m128( v128_mask32( v128_neg1, 0x3 ) ); \ a0 = _mm256_xor_si256( a0, mm256_bcast_m128( round_const_l0[i] ) );\ a1 = _mm256_xor_si256( a1, b1 );\ a2 = _mm256_xor_si256( a2, b1 );\ diff --git a/algo/groestl/myrgr-4way.c b/algo/groestl/myrgr-4way.c index 856a7fc8..1a374d33 100644 --- a/algo/groestl/myrgr-4way.c +++ b/algo/groestl/myrgr-4way.c @@ -213,7 +213,7 @@ int scanhash_myriad_4way( struct work *work, uint32_t max_nonce, v128_bswap32_intrlv80_4x32( vdata, pdata ); do { - *noncev = mm128_bswap_32( _mm_set_epi32( n+3,n+2,n+1,n ) ); + *noncev = v128_bswap32( _mm_set_epi32( n+3,n+2,n+1,n ) ); myriad_4way_hash( hash, vdata ); pdata[19] = n; diff --git a/algo/lyra2/allium-4way.c b/algo/lyra2/allium-4way.c index c1e840af..0bf4d6c5 100644 --- a/algo/lyra2/allium-4way.c +++ b/algo/lyra2/allium-4way.c @@ -465,12 +465,8 @@ typedef union { keccak256_2x64_context keccak; cubehashParam cube; -//#if defined(__x86_64__) skein256_2x64_context skein; -//#else -// sph_skein512_context skein; -//#endif -#if defined(__AES__) // || defined(__ARM_FEATURE_AES) +#if defined(__AES__) || defined(__ARM_FEATURE_AES) hashState_groestl256 groestl; #else sph_groestl256_context groestl; @@ -516,7 +512,6 @@ static void allium_4way_hash( void *hash, const void *midstate_vars, LYRA2RE( hash2, 32, hash2, 32, hash2, 32, 1, 8, 8 ); LYRA2RE( hash3, 32, hash3, 32, hash3, 32, 1, 8, 8 ); -//#if defined(__x86_64__) intrlv_2x64( vhashA, hash0, hash1, 256 ); skein256_2x64_init( &ctx.skein ); skein256_2x64_update( &ctx.skein, vhashA, 32 ); @@ -527,23 +522,8 @@ static void allium_4way_hash( void *hash, const void *midstate_vars, skein256_2x64_update( &ctx.skein, vhashA, 32 ); skein256_2x64_close( &ctx.skein, vhashA ); dintrlv_2x64( hash2, hash3, vhashA, 256 ); -/* -#else - sph_skein256_init( &ctx.skein ); - sph_skein256( &ctx.skein, hash0, 32 ); - sph_skein256_close( &ctx.skein, hash0 ); - sph_skein256_init( &ctx.skein ); - sph_skein256( &ctx.skein, hash1, 32 ); - sph_skein256_close( &ctx.skein, hash1 ); - sph_skein256_init( &ctx.skein ); - sph_skein256( &ctx.skein, hash2, 32 ); - sph_skein256_close( &ctx.skein, hash2 ); - sph_skein256_init( &ctx.skein ); - sph_skein256( &ctx.skein, hash3, 32 ); - sph_skein256_close( &ctx.skein, hash3 ); -#endif -*/ -#if defined(__AES__) // || defined(__ARM_FEATURE_AES) + +#if defined(__AES__) || defined(__ARM_FEATURE_AES) groestl256_full( &ctx.groestl, hash0, hash0, 256 ); groestl256_full( &ctx.groestl, hash1, hash1, 256 ); groestl256_full( &ctx.groestl, hash2, hash2, 256 ); diff --git a/algo/lyra2/lyra2h-4way.c b/algo/lyra2/lyra2h-4way.c index 1a92629c..453177cc 100644 --- a/algo/lyra2/lyra2h-4way.c +++ b/algo/lyra2/lyra2h-4way.c @@ -67,7 +67,7 @@ int scanhash_lyra2h_4way( struct work *work, uint32_t max_nonce, lyra2h_4way_midstate( vdata ); do { - *noncev = mm128_bswap_32( _mm_set_epi32( n+3, n+2, n+1, n ) ); + *noncev = v128_bswap32( _mm_set_epi32( n+3, n+2, n+1, n ) ); lyra2h_4way_hash( hash, vdata ); for ( int i = 0; i < 4; i++ ) diff --git a/algo/lyra2/lyra2rev2-4way.c b/algo/lyra2/lyra2rev2-4way.c index 5632fddd..32ef6390 100644 --- a/algo/lyra2/lyra2rev2-4way.c +++ b/algo/lyra2/lyra2rev2-4way.c @@ -456,7 +456,7 @@ int scanhash_lyra2rev2_4way( struct work *work, uint32_t max_nonce, do { - *noncev = mm128_bswap_32( _mm_set_epi32( n+3, n+2, n+1, n ) ); + *noncev = v128_bswap32( _mm_set_epi32( n+3, n+2, n+1, n ) ); lyra2rev2_4way_hash( hash, vdata ); diff --git a/algo/quark/hmq1725.c b/algo/quark/hmq1725.c index 67f055e5..1cb724e0 100644 --- a/algo/quark/hmq1725.c +++ b/algo/quark/hmq1725.c @@ -7,15 +7,15 @@ #include "algo/blake/blake512-hash.h" #include "algo/bmw/sph_bmw.h" #if defined(__AES__) - #include "algo/groestl/aes_ni/hash-groestl.h" #include "algo/fugue/fugue-aesni.h" #else - #include "algo/groestl/sph_groestl.h" #include "algo/fugue/sph_fugue.h" #endif #if defined(__AES__) || defined(__ARM_FEATURE_AES) + #include "algo/groestl/aes_ni/hash-groestl.h" #include "algo/echo/aes_ni/hash_api.h" #else + #include "algo/groestl/sph_groestl.h" #include "algo/echo/sph_echo.h" #endif #include "algo/jh/sph_jh.h" @@ -33,18 +33,18 @@ union _hmq1725_ctx_holder { - blake512_context blake; + blake512_context blake; sph_bmw512_context bmw; #if defined(__AES__) - hashState_groestl groestl; hashState_fugue fugue; #else - sph_groestl512_context groestl; sph_fugue512_context fugue; #endif #if defined(__AES__) || defined(__ARM_FEATURE_AES) + hashState_groestl groestl; hashState_echo echo; #else + sph_groestl512_context groestl; sph_echo512_context echo; #endif sph_skein512_context skein; @@ -62,9 +62,6 @@ union _hmq1725_ctx_holder }; typedef union _hmq1725_ctx_holder hmq1725_ctx_holder; -//static hmq1725_ctx_holder hmq1725_ctx __attribute__ ((aligned (64))); -//static __thread sph_bmw512_context hmq_bmw_mid __attribute__ ((aligned (64))); - extern void hmq1725hash(void *state, const void *input) { const uint32_t mask = 24; @@ -82,7 +79,7 @@ extern void hmq1725hash(void *state, const void *input) if ( hashB[0] & mask ) //1 { -#if defined(__AES__) +#if defined(__AES__) || defined(__ARM_FEATURE_AES) groestl512_full( &ctx.groestl, hashA, hashB, 512 ); #else sph_groestl512_init( &ctx.groestl ); @@ -226,7 +223,7 @@ extern void hmq1725hash(void *state, const void *input) sph_sha512_close( &ctx.sha, hashA ); } -#if defined(__AES__) +#if defined(__AES__) || defined(__ARM_FEATURE_AES) groestl512_full( &ctx.groestl, hashB, hashA, 512 ); #else sph_groestl512_init( &ctx.groestl ); diff --git a/algo/sha/sha256-hash.c b/algo/sha/sha256-hash.c index 2f3b3015..51971101 100644 --- a/algo/sha/sha256-hash.c +++ b/algo/sha/sha256-hash.c @@ -587,8 +587,8 @@ void sha256_ni2x_final_rounds( uint32_t *out_X, uint32_t *out_Y, // Add the nonces (msg[0] lane 3) to A & E (STATE0 lanes 1 & 3) TMSG0_X = casti_m128i( msg_X, 0 ); TMSG0_Y = casti_m128i( msg_Y, 0 ); - TMP_X = mm128_xim_32( TMSG0_X, TMSG0_X, 0xd5 ); - TMP_Y = mm128_xim_32( TMSG0_Y, TMSG0_Y, 0xd5 ); + TMP_X = v128_xim32( TMSG0_X, TMSG0_X, 0xd5 ); + TMP_Y = v128_xim32( TMSG0_Y, TMSG0_Y, 0xd5 ); STATE0_X = _mm_add_epi32( STATE0_X, TMP_X ); STATE0_Y = _mm_add_epi32( STATE0_Y, TMP_Y ); diff --git a/algo/shabal/shabal-hash-4way.c b/algo/shabal/shabal-hash-4way.c index 8ccee88d..d411389d 100644 --- a/algo/shabal/shabal-hash-4way.c +++ b/algo/shabal/shabal-hash-4way.c @@ -34,8 +34,6 @@ #include #include "shabal-hash-4way.h" -//#if defined(__SSE4_1__) || defined(__ARM_NEON) - #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) #define DECL_STATE16 \ @@ -47,8 +45,6 @@ C8, C9, CA, CB, CC, CD, CE, CF; \ __m512i M0, M1, M2, M3, M4, M5, M6, M7, \ M8, M9, MA, MB, MC, MD, ME, MF; \ - const __m512i FIVE = v512_32( 5 ); \ - const __m512i THREE = v512_32( 3 ); \ uint32_t Wlow, Whigh; #define READ_STATE16(state) do \ @@ -292,11 +288,21 @@ do { \ mm512_swap1024_512( BF, CF ); \ } while (0) +static inline __m512i v512_mult_x3( const __m512i x ) +{ + return _mm512_add_epi32( x, _mm512_slli_epi32( x, 1 ) ); +} + +static inline __m512i v512_mult_x5( const __m512i x ) +{ + return _mm512_add_epi32( x, _mm512_slli_epi32( x, 2 ) ); +} + #define PERM_ELT16( xa0, xa1, xb0, xb1, xb2, xb3, xc, xm ) \ do { \ xa0 = mm512_xor3( xm, xb1, mm512_xorandnot( \ - _mm512_mullo_epi32( mm512_xor3( xa0, xc, \ - _mm512_mullo_epi32( mm512_rol_32( xa1, 15 ), FIVE ) ), THREE ), \ + v512_mult_x3( mm512_xor3( xa0, xc, \ + v512_mult_x5( mm512_rol_32( xa1, 15 ) ) ) ), \ xb3, xb2 ) ); \ xb0 = mm512_xnor( xa0, mm512_rol_32( xb0, 1 ) ); \ } while (0) @@ -644,8 +650,6 @@ shabal512_16way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) C8, C9, CA, CB, CC, CD, CE, CF; \ __m256i M0, M1, M2, M3, M4, M5, M6, M7, \ M8, M9, MA, MB, MC, MD, ME, MF; \ - const __m256i FIVE = v256_32( 5 ); \ - const __m256i THREE = v256_32( 3 ); \ uint32_t Wlow, Whigh; #define READ_STATE8(state) do \ @@ -889,11 +893,21 @@ do { \ mm256_swap512_256( BF, CF ); \ } while (0) +static inline __m256i v256_mult_x3( const __m256i x ) +{ + return _mm256_add_epi32( x, _mm256_slli_epi32( x, 1 ) ); +} + +static inline __m256i v256_mult_x5( const __m256i x ) +{ + return _mm256_add_epi32( x, _mm256_slli_epi32( x, 2 ) ); +} + #define PERM_ELT8( xa0, xa1, xb0, xb1, xb2, xb3, xc, xm ) \ do { \ xa0 = mm256_xor3( xm, xb1, mm256_xorandnot( \ - _mm256_mullo_epi32( mm256_xor3( xa0, xc, \ - _mm256_mullo_epi32( mm256_rol_32( xa1, 15 ), FIVE ) ), THREE ), \ + v256_mult_x3( mm256_xor3( xa0, xc, \ + v256_mult_x5( mm256_rol_32( xa1, 15 ) ) ) ), \ xb3, xb2 ) ); \ xb0 = mm256_xnor( xa0, mm256_rol_32( xb0, 1 ) ); \ } while (0) @@ -1226,15 +1240,13 @@ shabal512_8way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) #endif // AVX2 -#if defined(__SSE4_1__) || defined(__ARM_NEON) +#if defined(__SSE2__) || defined(__ARM_NEON) #define DECL_STATE \ v128u32_t A0, A1, A2, A3, A4, A5, A6, A7, A8, A9, AA, AB; \ v128u32_t B0, B1, B2, B3, B4, B5, B6, B7, B8, B9, BA, BB, BC, BD, BE, BF; \ v128u32_t C0, C1, C2, C3, C4, C5, C6, C7, C8, C9, CA, CB, CC, CD, CE, CF; \ v128u32_t M0, M1, M2, M3, M4, M5, M6, M7, M8, M9, MA, MB, MC, MD, ME, MF; \ - const v128u32_t FIVE = v128_32( 5 ); \ - const v128u32_t THREE = v128_32( 3 ); \ uint32_t Wlow, Whigh; #define READ_STATE( state ) \ @@ -1479,12 +1491,22 @@ shabal512_8way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) v128_swap256_128( BF, CF ); \ } +static inline v128_t v128_mult_x3( const v128_t x ) +{ + return v128_add32( x, v128_sl32( x, 1 ) ); +} + +static inline v128_t v128_mult_x5( const v128_t x ) +{ + return v128_add32( x, v128_sl32( x, 2 ) ); +} + #define PERM_ELT( xa0, xa1, xb0, xb1, xb2, xb3, xc, xm ) \ { \ xa0 = v128_xor3( xm, xb1, v128_xorandnot( \ - v128_mul32( v128_xor3( xa0, xc, \ - v128_mul32( v128_rol32( xa1, 15 ), FIVE ) ), THREE ), \ - xb3, xb2 ) ); \ + v128_mult_x3( v128_xor3( xa0, xc, \ + v128_mult_x5( v128_rol32( xa1, 15 ) ) ) ), \ + xb3, xb2 ) ); \ xb0 = v128_not( v128_xor( xa0, v128_rol32( xb0, 1 ) ) ); \ } diff --git a/algo/shabal/shabal-hash-4way.h b/algo/shabal/shabal-hash-4way.h index 4bce0c4b..81707021 100644 --- a/algo/shabal/shabal-hash-4way.h +++ b/algo/shabal/shabal-hash-4way.h @@ -62,7 +62,7 @@ void shabal512_8way_addbits_and_close( void *cc, unsigned ub, unsigned n, #endif -#if defined(__SSE4_1__) || defined(__ARM_NEON) +#if defined(__SSE2__) || defined(__ARM_NEON) typedef struct { v128_t buf[16] __attribute__ ((aligned (64))); diff --git a/algo/swifftx/Swifftx_sha3.cpp b/algo/swifftx/Swifftx_sha3.cpp deleted file mode 100644 index a71099f7..00000000 --- a/algo/swifftx/Swifftx_sha3.cpp +++ /dev/null @@ -1,369 +0,0 @@ -#include "Swifftx_sha3.h" -extern "C" { -#include "SWIFFTX.h" -} -#include -#include -#include - -// The default salt value. -// This is the expansion of e (Euler's number) - the 19 digits after 2.71: -// 8281828459045235360. -// The above in base 256, from MSB to LSB: -BitSequence SWIF_saltValueChar[SWIF_HAIFA_SALT_SIZE] = {114, 238, 247, 26, 192, 28, 170, 160}; - -// All the IVs here below were produced from the decimal digits of e's expansion. -// The code can be found in 'ProduceRandomIV.c'. -// The initial value for 224 digest size. -const BitSequence SWIF_HAIFA_IV_224[SWIFFTX_OUTPUT_BLOCK_SIZE] = -{37, 242, 132, 2, 167, 81, 158, 237, 113, 77, 162, 60, 65, 236, 108, 246, -101, 72, 190, 109, 58, 205, 99, 6, 114, 169, 104, 114, 38, 146, 121, 142, - 59, 98, 233, 84, 72, 227, 22, 199, 17, 102, 198, 145, 24, 178, 37, 1, -215, 245, 66, 120, 230, 193, 113, 253, 165, 218, 66, 134, 49, 231, 124, 204, - 0}; - -// The initial value for 256 digest size. -const BitSequence SWIF_HAIFA_IV_256[SWIFFTX_OUTPUT_BLOCK_SIZE] = -{250, 50, 42, 40, 14, 233, 53, 48, 227, 42, 237, 187, 211, 120, 209, 234, - 27, 144, 4, 61, 243, 244, 29, 247, 37, 162, 70, 11, 231, 196, 53, 6, - 193, 240, 94, 126, 204, 132, 104, 46, 114, 29, 3, 104, 118, 184, 201, 3, - 57, 77, 91, 101, 31, 155, 84, 199, 228, 39, 198, 42, 248, 198, 201, 178, - 8}; - -// The initial value for 384 digest size. -const BitSequence SWIF_HAIFA_IV_384[SWIFFTX_OUTPUT_BLOCK_SIZE] = -{40, 145, 193, 100, 205, 171, 47, 76, 254, 10, 196, 41, 165, 207, 200, 79, -109, 13, 75, 201, 17, 172, 64, 162, 217, 22, 88, 39, 51, 30, 220, 151, -133, 73, 216, 233, 184, 203, 77, 0, 248, 13, 28, 199, 30, 147, 232, 242, -227, 124, 169, 174, 14, 45, 27, 87, 254, 73, 68, 136, 135, 159, 83, 152, - 0}; - -// The initial value for 512 digest size. -const BitSequence SWIF_HAIFA_IV_512[SWIFFTX_OUTPUT_BLOCK_SIZE] = -{195, 126, 197, 167, 157, 114, 99, 126, 208, 105, 200, 90, 71, 195, 144, 138, - 142, 122, 123, 116, 24, 214, 168, 173, 203, 183, 194, 210, 102, 117, 138, 42, - 114, 118, 132, 33, 35, 149, 143, 163, 163, 183, 243, 175, 72, 22, 201, 255, - 102, 243, 22, 187, 211, 167, 239, 76, 164, 70, 80, 182, 181, 212, 9, 185, - 0}; - - -/////////////////////////////////////////////////////////////////////////////////////////////// -// NIST API implementation portion. -/////////////////////////////////////////////////////////////////////////////////////////////// - -int Swifftx::Init(int hashbitlen) -{ - switch(hashbitlen) - { - case 224: - swifftxState.hashbitlen = hashbitlen; - // Initializes h_0 in HAIFA: - memcpy(swifftxState.currOutputBlock, SWIF_HAIFA_IV_224, SWIFFTX_OUTPUT_BLOCK_SIZE); - break; - case 256: - swifftxState.hashbitlen = hashbitlen; - memcpy(swifftxState.currOutputBlock, SWIF_HAIFA_IV_256, SWIFFTX_OUTPUT_BLOCK_SIZE); - break; - case 384: - swifftxState.hashbitlen = hashbitlen; - memcpy(swifftxState.currOutputBlock, SWIF_HAIFA_IV_384, SWIFFTX_OUTPUT_BLOCK_SIZE); - break; - case 512: - swifftxState.hashbitlen = hashbitlen; - memcpy(swifftxState.currOutputBlock, SWIF_HAIFA_IV_512, SWIFFTX_OUTPUT_BLOCK_SIZE); - break; - default: - return BAD_HASHBITLEN; - } - - swifftxState.wasUpdated = false; - swifftxState.remainingSize = 0; - memset(swifftxState.remaining, 0, SWIF_HAIFA_INPUT_BLOCK_SIZE); - memset(swifftxState.numOfBitsChar, 0, SWIF_HAIFA_NUM_OF_BITS_SIZE); - // Initialize the salt with the default value. - memcpy(swifftxState.salt, SWIF_saltValueChar, SWIF_HAIFA_SALT_SIZE); - - InitializeSWIFFTX(); - - return SUCCESS; -} - -int Swifftx::Update(const BitSequence *data, DataLength databitlen) -{ - // The size of input in bytes after putting the remaining data from previous invocation. - int sizeOfInputAfterRemaining = 0; - // The input block to compression function of SWIFFTX: - BitSequence currInputBlock[SWIFFTX_INPUT_BLOCK_SIZE] = {0}; - // Whether we handled a single block. - bool wasSingleBlockHandled = false; - - swifftxState.wasUpdated = true; - - // Handle an empty message as required by NIST. Since 'Final()' is oblivious to the input - // (but of course uses the output of the compression function from the previous round, - // which is called h_{i-1} in HAIFA article), we have to do nothing here. - if (databitlen == 0) - return SUCCESS; - - // If we had before an input with unaligned length, return an error - if (swifftxState.remainingSize % 8) - { - return INPUT_DATA_NOT_ALIGNED; - } - - // Convert remaining size to bytes. - swifftxState.remainingSize /= 8; - - // As long as we have enough data combined from (remaining + data) to fill input block - //NASTAVENIE RUND - while (((databitlen / 8) + swifftxState.remainingSize) >= SWIF_HAIFA_INPUT_BLOCK_SIZE) - { - // Fill the input block with data: - // 1. The output of the previous block: - memcpy(currInputBlock, swifftxState.currOutputBlock, SWIFFTX_OUTPUT_BLOCK_SIZE); - // 2. The input part of the block: - // 2a. The remaining data from the previous 'Update()' call: - if (swifftxState.remainingSize) - memcpy(currInputBlock + SWIFFTX_OUTPUT_BLOCK_SIZE, swifftxState.remaining, - swifftxState.remainingSize); - // 2b. The input data that we have place for after the 'remaining': - sizeOfInputAfterRemaining = SWIFFTX_INPUT_BLOCK_SIZE - SWIFFTX_OUTPUT_BLOCK_SIZE - - ((int) swifftxState.remainingSize) - SWIF_HAIFA_NUM_OF_BITS_SIZE - - SWIF_HAIFA_SALT_SIZE; - memcpy(currInputBlock + SWIFFTX_OUTPUT_BLOCK_SIZE + swifftxState.remainingSize, - data, sizeOfInputAfterRemaining); - - // 3. The #bits part of the block: - memcpy(currInputBlock + SWIFFTX_OUTPUT_BLOCK_SIZE + swifftxState.remainingSize - + sizeOfInputAfterRemaining, - swifftxState.numOfBitsChar, SWIF_HAIFA_NUM_OF_BITS_SIZE); - // 4. The salt part of the block: - memcpy(currInputBlock + SWIFFTX_OUTPUT_BLOCK_SIZE + swifftxState.remainingSize - + sizeOfInputAfterRemaining + SWIF_HAIFA_NUM_OF_BITS_SIZE, - swifftxState.salt, SWIF_HAIFA_SALT_SIZE); - - ComputeSingleSWIFFTX(currInputBlock, swifftxState.currOutputBlock, false); - - // Update the #bits field with SWIF_HAIFA_INPUT_BLOCK_SIZE. - AddToCurrInBase256(swifftxState.numOfBitsChar, SWIF_HAIFA_INPUT_BLOCK_SIZE * 8); - wasSingleBlockHandled = true; - data += sizeOfInputAfterRemaining; - databitlen -= (sizeOfInputAfterRemaining * 8); - swifftxState.remainingSize = 0; - } - - // Update the swifftxState.remaining and swifftxState.remainingSize. - // remainingSize will be in bits after exiting 'Update()'. - if (wasSingleBlockHandled) - { - swifftxState.remainingSize = (unsigned int) databitlen; // now remaining size is in bits. - if (swifftxState.remainingSize) - memcpy(swifftxState.remaining, data, (swifftxState.remainingSize + 7) / 8); - } - else - { - memcpy(swifftxState.remaining + swifftxState.remainingSize, data, - (size_t) (databitlen + 7) / 8); - swifftxState.remainingSize = (swifftxState.remainingSize * 8) + (unsigned short) databitlen; - } - - return SUCCESS; -} - -int Swifftx::Final(BitSequence *hashval) -{ - int i; - // Whether to add one last block. True if the padding appended to the last block overflows - // the block size. - bool toAddFinalBlock = false; - bool toPutOneInFinalBlock = false; - unsigned short oneShift = 0; - // The size of the last input block before the zeroes padding. We add 1 here because we - // include the final '1' bit in the calculation and 7 as we round the length to bytes. - unsigned short sizeOfLastInputBlock = (swifftxState.remainingSize + 1 + 7) / 8; - // The number of bytes of zero in the padding part. - // The padding contains: - // 1. A single 1 bit. - // 2. As many zeroes as needed. - // 3. The message length in bits. Occupies SWIF_HAIFA_NUM_OF_BITS_SIZE bytes. - // 4. The digest size. Maximum is 512, so we need 2 bytes. - // If the total number achieved is negative, add an additional block, as HAIFA specifies. - short numOfZeroBytesInPadding = (short) SWIFFTX_INPUT_BLOCK_SIZE - SWIFFTX_OUTPUT_BLOCK_SIZE - - sizeOfLastInputBlock - (2 * SWIF_HAIFA_NUM_OF_BITS_SIZE) - 2 - - SWIF_HAIFA_SALT_SIZE; - // The input block to compression function of SWIFFTX: - BitSequence currInputBlock[SWIFFTX_INPUT_BLOCK_SIZE] = {0}; - // The message length in base 256. - BitSequence messageLengthChar[SWIF_HAIFA_NUM_OF_BITS_SIZE] = {0}; - // The digest size used for padding: - unsigned char digestSizeLSB = swifftxState.hashbitlen % 256; - unsigned char digestSizeMSB = (swifftxState.hashbitlen - digestSizeLSB) / 256; - - if (numOfZeroBytesInPadding < 1) - toAddFinalBlock = true; - - // Fill the input block with data: - // 1. The output of the previous block: - memcpy(currInputBlock, swifftxState.currOutputBlock, SWIFFTX_OUTPUT_BLOCK_SIZE); - // 2a. The input part of the block, which is the remaining data from the previous 'Update()' - // call, if exists and an extra '1' bit (maybe all we have is this extra 1): - - // Add the last 1 in big-endian convention ... - if (swifftxState.remainingSize % 8 == 0) - { - swifftxState.remaining[sizeOfLastInputBlock - 1] = 0x80; - } - else - { - swifftxState.remaining[sizeOfLastInputBlock - 1] |= (1 << (7 - (swifftxState.remainingSize % 8))); - } - - if (sizeOfLastInputBlock) - memcpy(currInputBlock + SWIFFTX_OUTPUT_BLOCK_SIZE, swifftxState.remaining, - sizeOfLastInputBlock); - - // Compute the message length in base 256: - for (i = 0; i < SWIF_HAIFA_NUM_OF_BITS_SIZE; ++i) - messageLengthChar[i] = swifftxState.numOfBitsChar[i]; - if (sizeOfLastInputBlock) - AddToCurrInBase256(messageLengthChar, sizeOfLastInputBlock * 8); - - if (!toAddFinalBlock) - { - // 2b. Put the zeroes: - memset(currInputBlock + SWIFFTX_OUTPUT_BLOCK_SIZE + sizeOfLastInputBlock, - 0, numOfZeroBytesInPadding); - // 2c. Pad the message length: - for (i = 0; i < SWIF_HAIFA_NUM_OF_BITS_SIZE; ++i) - currInputBlock[SWIFFTX_OUTPUT_BLOCK_SIZE + sizeOfLastInputBlock - + numOfZeroBytesInPadding + i] = messageLengthChar[i]; - // 2d. Pad the digest size: - currInputBlock[SWIFFTX_OUTPUT_BLOCK_SIZE + sizeOfLastInputBlock - + numOfZeroBytesInPadding + SWIF_HAIFA_NUM_OF_BITS_SIZE] = digestSizeMSB; - currInputBlock[SWIFFTX_OUTPUT_BLOCK_SIZE + sizeOfLastInputBlock - + numOfZeroBytesInPadding + SWIF_HAIFA_NUM_OF_BITS_SIZE + 1] = digestSizeLSB; - } - else - { - // 2b. Put the zeroes, if at all: - if ((SWIF_HAIFA_INPUT_BLOCK_SIZE - sizeOfLastInputBlock) > 0) - { - memset(currInputBlock + SWIFFTX_OUTPUT_BLOCK_SIZE + sizeOfLastInputBlock, - 0, SWIF_HAIFA_INPUT_BLOCK_SIZE - sizeOfLastInputBlock); - } - } - - // 3. The #bits part of the block: - memcpy(currInputBlock + SWIFFTX_OUTPUT_BLOCK_SIZE + SWIF_HAIFA_INPUT_BLOCK_SIZE, - swifftxState.numOfBitsChar, SWIF_HAIFA_NUM_OF_BITS_SIZE); - // 4. The salt part of the block: - memcpy(currInputBlock + SWIFFTX_OUTPUT_BLOCK_SIZE + SWIF_HAIFA_INPUT_BLOCK_SIZE - + SWIF_HAIFA_NUM_OF_BITS_SIZE, - swifftxState.salt, - SWIF_HAIFA_SALT_SIZE); - - ComputeSingleSWIFFTX(currInputBlock, swifftxState.currOutputBlock, !toAddFinalBlock); - - // If we have to add one more block, it is now: - if (toAddFinalBlock) - { - // 1. The previous output block, as usual. - memcpy(currInputBlock, swifftxState.currOutputBlock, SWIFFTX_OUTPUT_BLOCK_SIZE); - - // 2a. Instead of the input, zeroes: - memset(currInputBlock + SWIFFTX_OUTPUT_BLOCK_SIZE , 0, - SWIF_HAIFA_INPUT_BLOCK_SIZE - SWIF_HAIFA_NUM_OF_BITS_SIZE - 2); - // 2b. Instead of the input, the message length: - memcpy(currInputBlock + SWIFFTX_OUTPUT_BLOCK_SIZE + SWIF_HAIFA_INPUT_BLOCK_SIZE - - SWIF_HAIFA_NUM_OF_BITS_SIZE - 2, - messageLengthChar, - SWIF_HAIFA_NUM_OF_BITS_SIZE); - // 2c. Instead of the input, the digest size: - currInputBlock[SWIFFTX_OUTPUT_BLOCK_SIZE + SWIF_HAIFA_INPUT_BLOCK_SIZE - 2] = digestSizeMSB; - currInputBlock[SWIFFTX_OUTPUT_BLOCK_SIZE + SWIF_HAIFA_INPUT_BLOCK_SIZE - 1] = digestSizeLSB; - // 3. The #bits part of the block, which is zero in case of additional block: - memset(currInputBlock + SWIFFTX_OUTPUT_BLOCK_SIZE + SWIF_HAIFA_INPUT_BLOCK_SIZE, - 0, - SWIF_HAIFA_NUM_OF_BITS_SIZE); - // 4. The salt part of the block: - memcpy(currInputBlock + SWIFFTX_OUTPUT_BLOCK_SIZE + SWIF_HAIFA_INPUT_BLOCK_SIZE - + SWIF_HAIFA_NUM_OF_BITS_SIZE, - swifftxState.salt, - SWIF_HAIFA_SALT_SIZE); - - ComputeSingleSWIFFTX(currInputBlock, swifftxState.currOutputBlock, true); - } - - // Finally, copy the result into 'hashval'. In case the digest size is not 512bit, copy the - // first hashbitlen of them: - for (i = 0; i < (swifftxState.hashbitlen / 8); ++i) - hashval[i] = swifftxState.currOutputBlock[i]; - - return SUCCESS; -} - -int Swifftx::Hash(int hashbitlen, const BitSequence *data, DataLength databitlen, - BitSequence *hashval) -{ - int result; - //hashState state; - // The pointer to the current place in the input we take into the compression function. - DataLength currInputIndex = 0; - - result = Swifftx::Init(hashbitlen); - - if (result != SUCCESS) - return result; - - for ( ; (databitlen / 8) > SWIF_HAIFA_INPUT_BLOCK_SIZE; - currInputIndex += SWIF_HAIFA_INPUT_BLOCK_SIZE, databitlen -= (SWIF_HAIFA_INPUT_BLOCK_SIZE * 8)) - { - result = Swifftx::Update(data + currInputIndex, SWIF_HAIFA_INPUT_BLOCK_SIZE * 8); - if (result != SUCCESS) - return result; - } - - // The length of the last block may be shorter than (SWIF_HAIFA_INPUT_BLOCK_SIZE * 8) - result = Swifftx::Update(data + currInputIndex, databitlen); - if (result != SUCCESS) - { - return result; - } - - return Swifftx::Final(hashval); -} - -/////////////////////////////////////////////////////////////////////////////////////////////// -// Helper fuction implementation portion. -/////////////////////////////////////////////////////////////////////////////////////////////// - -void Swifftx::AddToCurrInBase256(BitSequence value[SWIF_HAIFA_NUM_OF_BITS_SIZE], - unsigned short toAdd) -{ - unsigned char remainder = 0; - short i; - BitSequence currValueInBase256[8] = {0}; - unsigned short currIndex = 7; - unsigned short temp = 0; - - do - { - remainder = toAdd % 256; - currValueInBase256[currIndex--] = remainder; - toAdd -= remainder; - toAdd /= 256; - } - while(toAdd != 0); - - for (i = 7; i >= 0; --i) - { - temp = value[i] + currValueInBase256[i]; - if (temp > 255) - { - value[i] = temp % 256; - currValueInBase256[i - 1]++; - } - else - value[i] = (unsigned char) temp; - } -} \ No newline at end of file diff --git a/algo/swifftx/Swifftx_sha3.h b/algo/swifftx/Swifftx_sha3.h deleted file mode 100644 index 9f6985e9..00000000 --- a/algo/swifftx/Swifftx_sha3.h +++ /dev/null @@ -1,79 +0,0 @@ -#ifndef SWIFFTX_SHA3_H -#define SWIFFTX_SHA3_H - -#include "sha3_interface.h" -#include "stdbool.h" -#include "stdint.h" - -class Swifftx : public SHA3 { - -#define SWIFFTX_INPUT_BLOCK_SIZE 256 -#define SWIFFTX_OUTPUT_BLOCK_SIZE 65 -#define SWIF_HAIFA_SALT_SIZE 8 -#define SWIF_HAIFA_NUM_OF_BITS_SIZE 8 -#define SWIF_HAIFA_INPUT_BLOCK_SIZE (SWIFFTX_INPUT_BLOCK_SIZE - SWIFFTX_OUTPUT_BLOCK_SIZE \ - - SWIF_HAIFA_NUM_OF_BITS_SIZE - SWIF_HAIFA_SALT_SIZE) - - typedef unsigned char BitSequence; -//const DataLength SWIF_SALT_VALUE; - -#define SWIF_HAIFA_IV 0 - -/*const BitSequence SWIF_HAIFA_IV_224[SWIFFTX_OUTPUT_BLOCK_SIZE]; -const BitSequence SWIF_HAIFA_IV_256[SWIFFTX_OUTPUT_BLOCK_SIZE]; -const BitSequence SWIF_HAIFA_IV_384[SWIFFTX_OUTPUT_BLOCK_SIZE]; -const BitSequence SWIF_HAIFA_IV_512[SWIFFTX_OUTPUT_BLOCK_SIZE];*/ - -typedef enum -{ - SUCCESS = 0, - FAIL = 1, - BAD_HASHBITLEN = 2, - BAD_SALT_SIZE = 3, - SET_SALT_VALUE_FAILED = 4, - INPUT_DATA_NOT_ALIGNED = 5 -} HashReturn; - -typedef struct hashState { - unsigned short hashbitlen; - - // The data remained after the recent call to 'Update()'. - BitSequence remaining[SWIF_HAIFA_INPUT_BLOCK_SIZE + 1]; - - // The size of the remaining data in bits. - // Is 0 in case there is no remaning data at all. - unsigned int remainingSize; - - // The current output of the compression function. At the end will contain the final digest - // (which may be needed to be truncated, depending on hashbitlen). - BitSequence currOutputBlock[SWIFFTX_OUTPUT_BLOCK_SIZE]; - - // The value of '#bits hashed so far' field in HAIFA, in base 256. - BitSequence numOfBitsChar[SWIF_HAIFA_NUM_OF_BITS_SIZE]; - - // The salt value currently in use: - BitSequence salt[SWIF_HAIFA_SALT_SIZE]; - - // Indicates whether a single 'Update()' occured. - // Ater a call to 'Update()' the key and the salt values cannot be changed. - bool wasUpdated; -} hashState; - -private: -int swifftxNumRounds; -hashState swifftxState; - - -public: -int Init(int hashbitlen); -int Update(const BitSequence *data, DataLength databitlen); -int Final(BitSequence *hashval); -int Hash(int hashbitlen, const BitSequence *data, DataLength databitlen, - BitSequence *hashval); - -private: -static void AddToCurrInBase256(BitSequence value[SWIF_HAIFA_NUM_OF_BITS_SIZE], unsigned short toAdd); - -}; - -#endif \ No newline at end of file diff --git a/algo/swifftx/hash_interface.h b/algo/swifftx/hash_interface.h deleted file mode 100644 index 8857cb73..00000000 --- a/algo/swifftx/hash_interface.h +++ /dev/null @@ -1,21 +0,0 @@ -#pragma once - -#include - -namespace hash { - -using BitSequence = unsigned char; -using DataLength = unsigned long long; - -struct hash_interface { - virtual ~hash_interface() = default; - - virtual int Init(int hash_bitsize) = 0; - virtual int Update(const BitSequence *data, DataLength data_bitsize) = 0; - virtual int Final(BitSequence *hash) = 0; - - virtual int - Hash(int hash_bitsize, const BitSequence *data, DataLength data_bitsize, BitSequence *hash) = 0; -}; - -} // namespace hash diff --git a/algo/swifftx/sha3_interface.h b/algo/swifftx/sha3_interface.h deleted file mode 100644 index 1ae23609..00000000 --- a/algo/swifftx/sha3_interface.h +++ /dev/null @@ -1,14 +0,0 @@ -#pragma once - -#include -//#include -#include "hash_interface.h" - -namespace sha3 { - -using BitSequence = hash::BitSequence; -using DataLength = hash::DataLength; - -struct sha3_interface : hash::hash_interface {}; - -} // namespace sha3 diff --git a/algo/x16/minotaur.c b/algo/x16/minotaur.c index 1ffb7b01..9f676088 100644 --- a/algo/x16/minotaur.c +++ b/algo/x16/minotaur.c @@ -27,7 +27,7 @@ #else #include "algo/echo/sph_echo.h" #endif -#if defined(__AES__) // || defined(__ARM_FEATURE_AES) +#if defined(__AES__) || defined(__ARM_FEATURE_AES) #include "algo/groestl/aes_ni/hash-groestl.h" #else #include "algo/groestl/sph_groestl.h" @@ -50,7 +50,7 @@ typedef struct TortureGarden TortureGarden; // Graph of hash algos plus SPH contexts struct TortureGarden { -#if defined(__AES__) // || defined(__ARM_FEATURE_AES) +#if defined(__AES__) || defined(__ARM_FEATURE_AES) hashState_groestl groestl; #else sph_groestl512_context groestl; @@ -123,7 +123,7 @@ static int get_hash( void *output, const void *input, TortureGarden *garden, #endif break; case 5: -#if defined(__AES__) // || defined(__ARM_FEATURE_AES) +#if defined(__AES__) || defined(__ARM_FEATURE_AES) groestl512_full( &garden->groestl, hash, input, 512 ); #else sph_groestl512_init( &garden->groestl) ; diff --git a/algo/x16/x16r-4way.c b/algo/x16/x16r-4way.c index 300d866e..43b583f4 100644 --- a/algo/x16/x16r-4way.c +++ b/algo/x16/x16r-4way.c @@ -1092,7 +1092,7 @@ int x16r_2x64_hash_generic( void* output, const void* input, int thrid, dintrlv_2x64( hash0, hash1, vhash, 512 ); break; case GROESTL: -#if defined(__AES__) // || defined(__ARM_FEATURE_AES) +#if defined(__AES__) || defined(__ARM_FEATURE_AES) groestl512_full( &ctx.groestl, hash0, in0, size<<3 ); groestl512_full( &ctx.groestl, hash1, in1, size<<3 ); #else @@ -1173,7 +1173,7 @@ int x16r_2x64_hash_generic( void* output, const void* input, int thrid, simd512_ctx( &ctx.simd, hash1, in1, size ); break; case ECHO: -#if defined(__AES__) +#if defined(__AES__) || defined(__ARM_FEATURE_AES) echo_full( &ctx.echo, hash0, 512, in0, size ); echo_full( &ctx.echo, hash1, 512, in1, size ); #else diff --git a/algo/x16/x16r-gate.h b/algo/x16/x16r-gate.h index 7302d8eb..f8d93aed 100644 --- a/algo/x16/x16r-gate.h +++ b/algo/x16/x16r-gate.h @@ -218,7 +218,7 @@ union _x16r_2x64_context_overlay { blake512_2x64_context blake; bmw512_2x64_context bmw; -#if defined(__AES__) // || defined(__ARM_FEATURE_AES) +#if defined(__AES__) || defined(__ARM_FEATURE_AES) hashState_groestl groestl; #else sph_groestl512_context groestl; diff --git a/algo/x16/x16rv2-4way.c b/algo/x16/x16rv2-4way.c index c271d2ca..212b7268 100644 --- a/algo/x16/x16rv2-4way.c +++ b/algo/x16/x16rv2-4way.c @@ -1208,7 +1208,7 @@ union _x16rv2_2x64_context_overlay { blake512_2x64_context blake; bmw512_2x64_context bmw; -#if defined(__AES__) // || defined(__ARM_FEATURE_AES) +#if defined(__AES__) || defined(__ARM_FEATURE_AES) hashState_groestl groestl; #else sph_groestl512_context groestl; @@ -1294,7 +1294,7 @@ int x16rv2_2x64_hash( void* output, const void* input, int thrid ) dintrlv_2x64( hash0, hash1, vhash, 512 ); break; case GROESTL: -#if defined(__AES__) +#if defined(__AES__) || defined(__ARM_FEATURE_AES) groestl512_full( &ctx.groestl, hash0, in0, size<<3 ); groestl512_full( &ctx.groestl, hash1, in1, size<<3 ); #else @@ -1400,7 +1400,7 @@ int x16rv2_2x64_hash( void* output, const void* input, int thrid ) simd512_ctx( &ctx.simd, hash1, in1, size ); break; case ECHO: -#if defined(__AES__) +#if defined(__AES__) || defined(__ARM_FEATURE_AES) echo_full( &ctx.echo, hash0, 512, in0, size ); echo_full( &ctx.echo, hash1, 512, in1, size ); #else diff --git a/algo/x16/x21s-4way.c b/algo/x16/x21s-4way.c index 7a66081b..8769480b 100644 --- a/algo/x16/x21s-4way.c +++ b/algo/x16/x21s-4way.c @@ -294,7 +294,6 @@ int scanhash_x21s_4way( struct work *work, uint32_t max_nonce, { uint32_t hash[16*4] __attribute__ ((aligned (64))); uint32_t vdata[20*4] __attribute__ ((aligned (64))); - uint32_t bedata1[2] __attribute__((aligned(64))); uint32_t *pdata = work->data; uint32_t *ptarget = work->target; const uint32_t first_nonce = pdata[19]; diff --git a/algo/x17/x17-4way.c b/algo/x17/x17-4way.c index 0c39dbd4..068dc066 100644 --- a/algo/x17/x17-4way.c +++ b/algo/x17/x17-4way.c @@ -938,7 +938,7 @@ int scanhash_x17_4x64( struct work *work, uint32_t max_nonce, #endif #include "algo/shabal/sph_shabal.h" #include "algo/haval/sph-haval.h" -#if !( defined(__AES__) ) //|| defined(__ARM_FEATURE_AES) ) +#if !( defined(__AES__) || defined(__ARM_FEATURE_AES) ) #include "algo/groestl/sph_groestl.h" #endif #if !( defined(__AES__) || defined(__ARM_FEATURE_AES) ) @@ -950,7 +950,7 @@ union _x17_context_overlay { blake512_2x64_context blake; bmw512_2x64_context bmw; -#if defined(__AES__) // || defined(__ARM_FEATURE_AES) +#if defined(__AES__) || defined(__ARM_FEATURE_AES) hashState_groestl groestl; #else sph_groestl512_context groestl; @@ -1000,7 +1000,7 @@ int x17_2x64_hash( void *output, const void *input, int thr_id ) dintrlv_2x64( hash0, hash1, vhash, 512 ); -#if defined(__AES__) // || defined(__ARM_FEATURE_AES) +#if defined(__AES__) || defined(__ARM_FEATURE_AES) groestl512_full( &ctx.groestl, hash0, hash0, 512 ); groestl512_full( &ctx.groestl, hash1, hash1, 512 ); #else diff --git a/algo/x22/x22i.c b/algo/x22/x22i.c index d804ef2e..4aa848a6 100644 --- a/algo/x22/x22i.c +++ b/algo/x22/x22i.c @@ -5,15 +5,15 @@ #include "algo/blake/blake512-hash.h" #include "algo/bmw/sph_bmw.h" #if defined(__AES__) - #include "algo/groestl/aes_ni/hash-groestl.h" #include "algo/fugue/fugue-aesni.h" #else - #include "algo/groestl/sph_groestl.h" #include "algo/fugue/sph_fugue.h" #endif #if defined(__AES__) || defined(__ARM_FEATURE_AES) + #include "algo/groestl/aes_ni/hash-groestl.h" #include "algo/echo/aes_ni/hash_api.h" #else + #include "algo/groestl/sph_groestl.h" #include "algo/echo/sph_echo.h" #endif #include "algo/skein/sph_skein.h" @@ -39,15 +39,15 @@ union _x22i_context_overlay blake512_context blake; sph_bmw512_context bmw; #if defined(__AES__) - hashState_groestl groestl; hashState_fugue fugue; #else - sph_groestl512_context groestl; sph_fugue512_context fugue; #endif #if defined(__AES__) || defined(__ARM_FEATURE_AES) + hashState_groestl groestl; hashState_echo echo; #else + sph_groestl512_context groestl; sph_echo512_context echo; #endif sph_jh512_context jh; @@ -81,7 +81,7 @@ int x22i_hash( void *output, const void *input, int thrid ) sph_bmw512(&ctx.bmw, (const void*) hash, 64); sph_bmw512_close(&ctx.bmw, hash); -#if defined(__AES__) +#if defined(__AES__) || defined(__ARM_FEATURE_AES) groestl512_full( &ctx.groestl, hash, hash, 512 ); #else sph_groestl512_init( &ctx.groestl ); diff --git a/algo/x22/x25x.c b/algo/x22/x25x.c index 99827b5b..d58e3435 100644 --- a/algo/x22/x25x.c +++ b/algo/x22/x25x.c @@ -5,15 +5,15 @@ #include "algo/blake/blake512-hash.h" #include "algo/bmw/sph_bmw.h" #if defined(__AES__) - #include "algo/groestl/aes_ni/hash-groestl.h" #include "algo/fugue/fugue-aesni.h" #else - #include "algo/groestl/sph_groestl.h" #include "algo/fugue/sph_fugue.h" #endif #if defined(__AES__) || defined(__ARM_FEATURE_AES) + #include "algo/groestl/aes_ni/hash-groestl.h" #include "algo/echo/aes_ni/hash_api.h" #else + #include "algo/groestl/sph_groestl.h" #include "algo/echo/sph_echo.h" #endif #include "algo/skein/sph_skein.h" @@ -42,15 +42,15 @@ union _x25x_context_overlay blake512_context blake; sph_bmw512_context bmw; #if defined(__AES__) - hashState_groestl groestl; hashState_fugue fugue; #else - sph_groestl512_context groestl; sph_fugue512_context fugue; #endif #if defined(__AES__) || defined(__ARM_FEATURE_AES) + hashState_groestl groestl; hashState_echo echo; #else + sph_groestl512_context groestl; sph_echo512_context echo; #endif sph_jh512_context jh; @@ -86,7 +86,7 @@ int x25x_hash( void *output, const void *input, int thrid ) sph_bmw512(&ctx.bmw, (const void*) &hash[0], 64); sph_bmw512_close(&ctx.bmw, &hash[1]); -#if defined(__AES__) +#if defined(__AES__) || defined(__ARM_FEATURE_AES) groestl512_full( &ctx.groestl, (void*)&hash[2], (const void*)&hash[1], 512 ); #else sph_groestl512_init( &ctx.groestl ); @@ -119,7 +119,7 @@ int x25x_hash( void *output, const void *input, int thrid ) simd512_ctx( &ctx.simd, (void*)&hash[9], (const void*)&hash[8], 64 ); #if defined(__AES__) || defined(__ARM_FEATURE_AES) - echo_full( &ctx.echo, (void*)&hash[10], 512, (const void*)&hash[9], 64 ); + echo_full( &ctx.echo, (void*)&hash[10], 512, (const void*)&hash[9], 64 ); #else sph_echo512_init( &ctx.echo ); sph_echo512( &ctx.echo, &hash[9], 64 ); diff --git a/configure b/configure index 8424a26e..e8088bfc 100755 --- a/configure +++ b/configure @@ -1,6 +1,6 @@ #! /bin/sh # Guess values for system-dependent variables and create Makefiles. -# Generated by GNU Autoconf 2.71 for cpuminer-opt 23.13. +# Generated by GNU Autoconf 2.71 for cpuminer-opt 23.14. # # # Copyright (C) 1992-1996, 1998-2017, 2020-2021 Free Software Foundation, @@ -608,8 +608,8 @@ MAKEFLAGS= # Identity of this package. PACKAGE_NAME='cpuminer-opt' PACKAGE_TARNAME='cpuminer-opt' -PACKAGE_VERSION='23.13' -PACKAGE_STRING='cpuminer-opt 23.13' +PACKAGE_VERSION='23.14' +PACKAGE_STRING='cpuminer-opt 23.14' PACKAGE_BUGREPORT='' PACKAGE_URL='' @@ -1360,7 +1360,7 @@ if test "$ac_init_help" = "long"; then # Omit some internal or obsolete options to make the list less imposing. # This message is too long to be a string in the A/UX 3.1 sh. cat <<_ACEOF -\`configure' configures cpuminer-opt 23.13 to adapt to many kinds of systems. +\`configure' configures cpuminer-opt 23.14 to adapt to many kinds of systems. Usage: $0 [OPTION]... [VAR=VALUE]... @@ -1432,7 +1432,7 @@ fi if test -n "$ac_init_help"; then case $ac_init_help in - short | recursive ) echo "Configuration of cpuminer-opt 23.13:";; + short | recursive ) echo "Configuration of cpuminer-opt 23.14:";; esac cat <<\_ACEOF @@ -1538,7 +1538,7 @@ fi test -n "$ac_init_help" && exit $ac_status if $ac_init_version; then cat <<\_ACEOF -cpuminer-opt configure 23.13 +cpuminer-opt configure 23.14 generated by GNU Autoconf 2.71 Copyright (C) 2021 Free Software Foundation, Inc. @@ -1985,7 +1985,7 @@ cat >config.log <<_ACEOF This file contains any messages produced by compilers while running configure, to aid debugging if configure makes a mistake. -It was created by cpuminer-opt $as_me 23.13, which was +It was created by cpuminer-opt $as_me 23.14, which was generated by GNU Autoconf 2.71. Invocation command line was $ $0$ac_configure_args_raw @@ -3593,7 +3593,7 @@ fi # Define the identity of the package. PACKAGE='cpuminer-opt' - VERSION='23.13' + VERSION='23.14' printf "%s\n" "#define PACKAGE \"$PACKAGE\"" >>confdefs.h @@ -7508,7 +7508,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 # report actual input values of CONFIG_FILES etc. instead of their # values after options handling. ac_log=" -This file was extended by cpuminer-opt $as_me 23.13, which was +This file was extended by cpuminer-opt $as_me 23.14, which was generated by GNU Autoconf 2.71. Invocation command line was CONFIG_FILES = $CONFIG_FILES @@ -7576,7 +7576,7 @@ ac_cs_config_escaped=`printf "%s\n" "$ac_cs_config" | sed "s/^ //; s/'/'\\\\\\\\ cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 ac_cs_config='$ac_cs_config_escaped' ac_cs_version="\\ -cpuminer-opt config.status 23.13 +cpuminer-opt config.status 23.14 configured by $0, generated by GNU Autoconf 2.71, with options \\"\$ac_cs_config\\" diff --git a/configure.ac b/configure.ac index 9f839b1e..51c9f5ba 100644 --- a/configure.ac +++ b/configure.ac @@ -1,4 +1,4 @@ -AC_INIT([cpuminer-opt], [23.13]) +AC_INIT([cpuminer-opt], [23.14]) AC_PREREQ([2.59c]) AC_CANONICAL_SYSTEM diff --git a/configure~ b/configure~ index 9f87ae04..6f1b6965 100755 --- a/configure~ +++ b/configure~ @@ -1,6 +1,6 @@ #! /bin/sh # Guess values for system-dependent variables and create Makefiles. -# Generated by GNU Autoconf 2.69 for cpuminer-opt 23.13. +# Generated by GNU Autoconf 2.69 for cpuminer-opt 23.14. # # # Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc. @@ -577,8 +577,8 @@ MAKEFLAGS= # Identity of this package. PACKAGE_NAME='cpuminer-opt' PACKAGE_TARNAME='cpuminer-opt' -PACKAGE_VERSION='23.13' -PACKAGE_STRING='cpuminer-opt 23.13' +PACKAGE_VERSION='23.14' +PACKAGE_STRING='cpuminer-opt 23.14' PACKAGE_BUGREPORT='' PACKAGE_URL='' @@ -1332,7 +1332,7 @@ if test "$ac_init_help" = "long"; then # Omit some internal or obsolete options to make the list less imposing. # This message is too long to be a string in the A/UX 3.1 sh. cat <<_ACEOF -\`configure' configures cpuminer-opt 23.13 to adapt to many kinds of systems. +\`configure' configures cpuminer-opt 23.14 to adapt to many kinds of systems. Usage: $0 [OPTION]... [VAR=VALUE]... @@ -1404,7 +1404,7 @@ fi if test -n "$ac_init_help"; then case $ac_init_help in - short | recursive ) echo "Configuration of cpuminer-opt 23.13:";; + short | recursive ) echo "Configuration of cpuminer-opt 23.14:";; esac cat <<\_ACEOF @@ -1509,7 +1509,7 @@ fi test -n "$ac_init_help" && exit $ac_status if $ac_init_version; then cat <<\_ACEOF -cpuminer-opt configure 23.13 +cpuminer-opt configure 23.14 generated by GNU Autoconf 2.69 Copyright (C) 2012 Free Software Foundation, Inc. @@ -2012,7 +2012,7 @@ cat >config.log <<_ACEOF This file contains any messages produced by compilers while running configure, to aid debugging if configure makes a mistake. -It was created by cpuminer-opt $as_me 23.13, which was +It was created by cpuminer-opt $as_me 23.14, which was generated by GNU Autoconf 2.69. Invocation command line was $ $0 $@ @@ -2993,7 +2993,7 @@ fi # Define the identity of the package. PACKAGE='cpuminer-opt' - VERSION='23.13' + VERSION='23.14' cat >>confdefs.h <<_ACEOF @@ -6718,7 +6718,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 # report actual input values of CONFIG_FILES etc. instead of their # values after options handling. ac_log=" -This file was extended by cpuminer-opt $as_me 23.13, which was +This file was extended by cpuminer-opt $as_me 23.14, which was generated by GNU Autoconf 2.69. Invocation command line was CONFIG_FILES = $CONFIG_FILES @@ -6784,7 +6784,7 @@ _ACEOF cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`" ac_cs_version="\\ -cpuminer-opt config.status 23.13 +cpuminer-opt config.status 23.14 configured by $0, generated by GNU Autoconf 2.69, with options \\"\$ac_cs_config\\" diff --git a/cpu-miner.c b/cpu-miner.c index 6db8dae2..5d87a683 100644 --- a/cpu-miner.c +++ b/cpu-miner.c @@ -3666,11 +3666,6 @@ static int thread_create(struct thr_info *thr, void* func) void get_defconfig_path(char *out, size_t bufsize, char *argv0); - -#include "simd-utils.h" -#include "algo/echo/aes_ni/hash_api.h" -#include "compat/aes_helper.c" - int main(int argc, char *argv[]) { struct thr_info *thr; diff --git a/simd-utils/simd-128.h b/simd-utils/simd-128.h index 02b3deb5..a34801aa 100644 --- a/simd-utils/simd-128.h +++ b/simd-utils/simd-128.h @@ -322,6 +322,7 @@ static inline __m128i v128_neg1_fn() #define mm128_xim_32( v1, v0, c ) \ _mm_castps_si128( _mm_insert_ps( _mm_castsi128_ps( v1 ), \ _mm_castsi128_ps( v0 ), c ) ) +#define v128_xim32 mm128_xim_32 // Examples of simple operations using xim: /* diff --git a/simd-utils/simd-neon.h b/simd-utils/simd-neon.h index 2e7a4bc5..21e503f8 100644 --- a/simd-utils/simd-neon.h +++ b/simd-utils/simd-neon.h @@ -68,7 +68,7 @@ #define v128_mul32 vmulq_u32 #define v128_mul16 vmulq_u16 -// slow, tested with argon2d +// Widening, shuffle high element to align with Intel static inline uint64x2_t v128_mulw32( uint32x4_t v1, uint32x4_t v0 ) { return vmull_u32( vget_low_u32( vcopyq_laneq_u32( v1, 1, v1, 2 ) ), @@ -86,7 +86,7 @@ static inline uint64x2_t v128_mulw32( uint32x4_t v1, uint32x4_t v0 ) // Not yet needed //#define v128_cmpeq1 - +// Signed #define v128_cmpgt64( v1, v0 ) vcgtq_s64( (int64x2_t)v1, (int64x2_t)v0 ) #define v128_cmpgt32( v1, v0 ) vcgtq_s32( (int32x4_t)v1, (int32x4_t)v0 ) #define v128_cmpgt16( v1, v0 ) vcgtq_s16( (int16x8_t)v1, (int16x8_t)v0 ) @@ -406,34 +406,15 @@ static inline void v128_memcpy( void *dst, const void *src, const int n ) v1 = vorrq_u32( v1, t1 ); \ } -// Cross lane shuffles, no programmable shuffle in NEON - -// vector mask, use as last resort. prefer rev, alignr, etc +// vector mask, use as last resort. prefer tbl, rev, alignr, etc #define v128_shufflev32( v, vmask ) \ v128_set32( ((uint32_t*)&v)[ ((uint32_t*)(&vmask))[3] ], \ ((uint32_t*)&v)[ ((uint32_t*)(&vmask))[2] ], \ ((uint32_t*)&v)[ ((uint32_t*)(&vmask))[1] ], \ ((uint32_t*)&v)[ ((uint32_t*)(&vmask))[0] ] ) \ -// compatible with x86_64, but very slow, avoid #define v128_shuffle8( v, vmask ) \ - v128_set8( ((uint8_t*)&v)[ ((uint8_t*)(&vmask))[15] ], \ - ((uint8_t*)&v)[ ((uint8_t*)(&vmask))[14] ], \ - ((uint8_t*)&v)[ ((uint8_t*)(&vmask))[13] ], \ - ((uint8_t*)&v)[ ((uint8_t*)(&vmask))[12] ], \ - ((uint8_t*)&v)[ ((uint8_t*)(&vmask))[11] ], \ - ((uint8_t*)&v)[ ((uint8_t*)(&vmask))[10] ], \ - ((uint8_t*)&v)[ ((uint8_t*)(&vmask))[ 9] ], \ - ((uint8_t*)&v)[ ((uint8_t*)(&vmask))[ 8] ], \ - ((uint8_t*)&v)[ ((uint8_t*)(&vmask))[ 7] ], \ - ((uint8_t*)&v)[ ((uint8_t*)(&vmask))[ 6] ], \ - ((uint8_t*)&v)[ ((uint8_t*)(&vmask))[ 5] ], \ - ((uint8_t*)&v)[ ((uint8_t*)(&vmask))[ 4] ], \ - ((uint8_t*)&v)[ ((uint8_t*)(&vmask))[ 3] ], \ - ((uint8_t*)&v)[ ((uint8_t*)(&vmask))[ 2] ], \ - ((uint8_t*)&v)[ ((uint8_t*)(&vmask))[ 1] ], \ - ((uint8_t*)&v)[ ((uint8_t*)(&vmask))[ 0] ] ) - + vqtbl1q_u8( (uint8x16_t)v, (uint8x16_t)vmask ); // sub-vector shuffles sometimes mirror bit rotation. Shuffle is faster. // Bit rotation already promotes faster widths. Usage is context sensitive.