diff --git a/RELEASE_NOTES b/RELEASE_NOTES
index 4f532716..06cb8000 100644
--- a/RELEASE_NOTES
+++ b/RELEASE_NOTES
@@ -73,6 +73,14 @@ If not what makes it happen or not happen?
 Change Log
 ----------
 
+v23.10
+
+x86_64: Fixed scrypt, scryptn2 algos SSE2.
+Fixed sha512256d algo AVX2, SSE2, NEON.
+Fixed a bug in Skein N-way that reduced performance.
+ARM: Skein algo optimized for NEON & SHA2.
+Skein2 algo 2-way optimized for NEON & SSE2.
+
 v23.9
 
 x86_64: fixed minotaurx crash, broken in 23.7.
diff --git a/algo/blake/blake256-hash.c b/algo/blake/blake256-hash.c
index 4e146ae3..e60376f5 100644
--- a/algo/blake/blake256-hash.c
+++ b/algo/blake/blake256-hash.c
@@ -429,7 +429,7 @@ void blake256_transform_le( uint32_t *H, const uint32_t *buf,
 #define BLAKE256_4X32_BLOCK_BSWAP32 \
 { \
 v128_t shuf_bswap32 = v128_set64( 0x0c0d0e0f08090a0b, \
- 0x0405060700010203 ); \
+ 0x0405060700010203 ); \
 M0 = _mm_shuffle_epi8( buf[ 0], shuf_bswap32 ); \
 M1 = _mm_shuffle_epi8( buf[ 1], shuf_bswap32 ); \
 M2 = _mm_shuffle_epi8( buf[ 2], shuf_bswap32 ); \
@@ -931,14 +931,14 @@ void blake256_4x32_final_rounds_le( void *final_hash, const void *midstate,
 const v128_t shuf_bswap32 = v128_set64( 0x0c0d0e0f08090a0b,
 0x0405060700010203 );
 
- H[0] = _mm_shuffle_epi8( mm128_xor3( V8, V0, h[0] ), shuf_bswap32 );
- H[1] = _mm_shuffle_epi8( mm128_xor3( V9, V1, h[1] ), shuf_bswap32 );
- H[2] = _mm_shuffle_epi8( mm128_xor3( VA, V2, h[2] ), shuf_bswap32 );
- H[3] = _mm_shuffle_epi8( mm128_xor3( VB, V3, h[3] ), shuf_bswap32 );
- H[4] = _mm_shuffle_epi8( mm128_xor3( VC, V4, h[4] ), shuf_bswap32 );
- H[5] = _mm_shuffle_epi8( mm128_xor3( VD, V5, h[5] ), shuf_bswap32 );
- H[6] = _mm_shuffle_epi8( mm128_xor3( VE, V6, h[6] ), shuf_bswap32 );
- H[7] = _mm_shuffle_epi8( mm128_xor3( VF, V7, h[7] ), shuf_bswap32 );
+ H[0] = _mm_shuffle_epi8( v128_xor3( V8, V0, h[0] ), shuf_bswap32 );
+ H[1] = _mm_shuffle_epi8( v128_xor3( V9, V1, h[1] ), shuf_bswap32 );
+ H[2] = _mm_shuffle_epi8( v128_xor3( VA, V2, h[2] ), shuf_bswap32 );
+ H[3] = _mm_shuffle_epi8( v128_xor3( VB, V3, h[3] ), shuf_bswap32 );
+ H[4] = _mm_shuffle_epi8( v128_xor3( VC, V4, h[4] ), shuf_bswap32 );
+ H[5] = _mm_shuffle_epi8( v128_xor3( VD, V5, h[5] ), shuf_bswap32 );
+ H[6] = _mm_shuffle_epi8( v128_xor3( VE, V6, h[6] ), shuf_bswap32 );
+ H[7] = _mm_shuffle_epi8( v128_xor3( VF, V7, h[7] ), shuf_bswap32 );
 
 #else
 
diff --git a/algo/blake/sph_blake2b.c b/algo/blake/sph_blake2b.c
index 3bb21175..40bcfa30 100644
--- a/algo/blake/sph_blake2b.c
+++ b/algo/blake/sph_blake2b.c
@@ -131,47 +131,7 @@ V[7] = v128_alignr64( V6, V7, 1 ); \
 }
 
-/*
-#elif defined(__SSE2__)
-// always true
-
-#define BLAKE2B_G( Va, Vb, Vc, Vd, Sa, Sb, Sc, Sd ) \
-{ \
- Va = _mm_add_epi64( Va, _mm_add_epi64( Vb, \
- _mm_set_epi64x( m[ sigmaR[ Sc ] ], m[ sigmaR[ Sa ] ] ) ) ); \
- Vd = mm128_swap64_32( _mm_xor_si128( Vd, Va ) ); \
- Vc = _mm_add_epi64( Vc, Vd ); \
- Vb = mm128_shuflr64_24( _mm_xor_si128( Vb, Vc ) ); \
-\
- Va = _mm_add_epi64( Va, _mm_add_epi64( Vb, \
- _mm_set_epi64x( m[ sigmaR[ Sd ] ], m[ sigmaR[ Sb ] ] ) ) ); \
- Vd = mm128_shuflr64_16( _mm_xor_si128( Vd, Va ) ); \
- Vc = _mm_add_epi64( Vc, Vd ); \
- Vb = mm128_ror_64( _mm_xor_si128( Vb, Vc ), 63 ); \
-}
-
-#define BLAKE2B_ROUND( R ) \
-{ \
- v128_t *V = (v128_t*)v; \
- v128_t V2, V3, V6, V7; \
- const uint8_t *sigmaR = sigma[R]; \
- BLAKE2B_G( V[0], V[2], V[4], V[6], 0, 1, 2, 3 ); \
- BLAKE2B_G( V[1], V[3], V[5], V[7], 4, 5, 6, 7 ); \
- V2 = mm128_alignr_64( V[3], V[2], 1 ); \
- V3 = mm128_alignr_64( V[2], V[3], 1 ); \
- V6 = mm128_alignr_64( V[6], V[7],
1 ); \ - V7 = mm128_alignr_64( V[7], V[6], 1 ); \ - BLAKE2B_G( V[0], V2, V[5], V6, 8, 9, 10, 11 ); \ - BLAKE2B_G( V[1], V3, V[4], V7, 12, 13, 14, 15 ); \ - V[2] = mm128_alignr_64( V2, V3, 1 ); \ - V[3] = mm128_alignr_64( V3, V2, 1 ); \ - V[6] = mm128_alignr_64( V7, V6, 1 ); \ - V[7] = mm128_alignr_64( V6, V7, 1 ); \ -} -*/ - #else -// never used, SSE2 is always available #ifndef ROTR64 #define ROTR64(x, y) (((x) >> (y)) ^ ((x) << (64 - (y)))) diff --git a/algo/bmw/bmw256-hash-4way.c b/algo/bmw/bmw256-hash-4way.c index d15890b0..38acd699 100644 --- a/algo/bmw/bmw256-hash-4way.c +++ b/algo/bmw/bmw256-hash-4way.c @@ -62,78 +62,78 @@ static const uint32_t IV256[] = { */ #define ss0(x) \ - _mm_xor_si128( _mm_xor_si128( _mm_srli_epi32( (x), 1), \ - _mm_slli_epi32( (x), 3) ), \ - _mm_xor_si128( mm128_rol_32( (x), 4), \ - mm128_rol_32( (x), 19) ) ) + v128_xor( v128_xor( v128_sr32( (x), 1), \ + v128_sl32( (x), 3) ), \ + v128_xor( v128_rol32( (x), 4), \ + v128_rol32( (x), 19) ) ) #define ss1(x) \ - _mm_xor_si128( _mm_xor_si128( _mm_srli_epi32( (x), 1), \ - _mm_slli_epi32( (x), 2) ), \ - _mm_xor_si128( mm128_rol_32( (x), 8), \ - mm128_rol_32( (x), 23) ) ) + v128_xor( v128_xor( v128_sr32( (x), 1), \ + v128_sl32( (x), 2) ), \ + v128_xor( v128_rol32( (x), 8), \ + v128_rol32( (x), 23) ) ) #define ss2(x) \ - _mm_xor_si128( _mm_xor_si128( _mm_srli_epi32( (x), 2), \ - _mm_slli_epi32( (x), 1) ), \ - _mm_xor_si128( mm128_rol_32( (x), 12), \ - mm128_rol_32( (x), 25) ) ) + v128_xor( v128_xor( v128_sr32( (x), 2), \ + v128_sl32( (x), 1) ), \ + v128_xor( v128_rol32( (x), 12), \ + v128_rol32( (x), 25) ) ) #define ss3(x) \ - _mm_xor_si128( _mm_xor_si128( _mm_srli_epi32( (x), 2), \ - _mm_slli_epi32( (x), 2) ), \ - _mm_xor_si128( mm128_rol_32( (x), 15), \ - mm128_rol_32( (x), 29) ) ) + v128_xor( v128_xor( v128_sr32( (x), 2), \ + v128_sl32( (x), 2) ), \ + v128_xor( v128_rol32( (x), 15), \ + v128_rol32( (x), 29) ) ) #define ss4(x) \ - _mm_xor_si128( (x), _mm_srli_epi32( (x), 1 ) ) + v128_xor( (x), v128_sr32( (x), 1 ) ) #define ss5(x) \ - _mm_xor_si128( (x), _mm_srli_epi32( (x), 2 ) ) + v128_xor( (x), v128_sr32( (x), 2 ) ) -#define rs1(x) mm128_rol_32( x, 3 ) -#define rs2(x) mm128_rol_32( x, 7 ) -#define rs3(x) mm128_rol_32( x, 13 ) -#define rs4(x) mm128_rol_32( x, 16 ) -#define rs5(x) mm128_rol_32( x, 19 ) -#define rs6(x) mm128_rol_32( x, 23 ) -#define rs7(x) mm128_rol_32( x, 27 ) +#define rs1(x) v128_rol32( x, 3 ) +#define rs2(x) v128_rol32( x, 7 ) +#define rs3(x) v128_rol32( x, 13 ) +#define rs4(x) v128_rol32( x, 16 ) +#define rs5(x) v128_rol32( x, 19 ) +#define rs6(x) v128_rol32( x, 23 ) +#define rs7(x) v128_rol32( x, 27 ) #define rol_off_32( M, j, off ) \ - mm128_rol_32( M[ ( (j) + (off) ) & 0xF ] , \ + v128_rol32( M[ ( (j) + (off) ) & 0xF ] , \ ( ( (j) + (off) ) & 0xF ) + 1 ) #define add_elt_s( M, H, j ) \ - _mm_xor_si128( \ - _mm_add_epi32( \ - _mm_sub_epi32( _mm_add_epi32( rol_off_32( M, j, 0 ), \ + v128_xor( \ + v128_add32( \ + v128_sub32( v128_add32( rol_off_32( M, j, 0 ), \ rol_off_32( M, j, 3 ) ), \ rol_off_32( M, j, 10 ) ), \ - _mm_set1_epi32( ( (j)+16 ) * 0x05555555UL ) ), \ + v128_32( ( (j)+16 ) * 0x05555555UL ) ), \ H[ ( (j)+7 ) & 0xF ] ) #define expand1s( qt, M, H, i ) \ - _mm_add_epi32( mm128_add4_32( \ - mm128_add4_32( ss1( qt[ (i)-16 ] ), ss2( qt[ (i)-15 ] ), \ + v128_add32( v128_add4_32( \ + v128_add4_32( ss1( qt[ (i)-16 ] ), ss2( qt[ (i)-15 ] ), \ ss3( qt[ (i)-14 ] ), ss0( qt[ (i)-13 ] ) ), \ - mm128_add4_32( ss1( qt[ (i)-12 ] ), ss2( qt[ (i)-11 ] ), \ + v128_add4_32( ss1( qt[ (i)-12 ] ), ss2( qt[ 
(i)-11 ] ), \ ss3( qt[ (i)-10 ] ), ss0( qt[ (i)- 9 ] ) ), \ - mm128_add4_32( ss1( qt[ (i)- 8 ] ), ss2( qt[ (i)- 7 ] ), \ + v128_add4_32( ss1( qt[ (i)- 8 ] ), ss2( qt[ (i)- 7 ] ), \ ss3( qt[ (i)- 6 ] ), ss0( qt[ (i)- 5 ] ) ), \ - mm128_add4_32( ss1( qt[ (i)- 4 ] ), ss2( qt[ (i)- 3 ] ), \ + v128_add4_32( ss1( qt[ (i)- 4 ] ), ss2( qt[ (i)- 3 ] ), \ ss3( qt[ (i)- 2 ] ), ss0( qt[ (i)- 1 ] ) ) ), \ add_elt_s( M, H, (i)-16 ) ) #define expand2s( qt, M, H, i) \ - _mm_add_epi32( mm128_add4_32( \ - mm128_add4_32( qt[ (i)-16 ], rs1( qt[ (i)-15 ] ), \ + v128_add32( v128_add4_32( \ + v128_add4_32( qt[ (i)-16 ], rs1( qt[ (i)-15 ] ), \ qt[ (i)-14 ], rs2( qt[ (i)-13 ] ) ), \ - mm128_add4_32( qt[ (i)-12 ], rs3( qt[ (i)-11 ] ), \ + v128_add4_32( qt[ (i)-12 ], rs3( qt[ (i)-11 ] ), \ qt[ (i)-10 ], rs4( qt[ (i)- 9 ] ) ), \ - mm128_add4_32( qt[ (i)- 8 ], rs5( qt[ (i)- 7 ] ), \ + v128_add4_32( qt[ (i)- 8 ], rs5( qt[ (i)- 7 ] ), \ qt[ (i)- 6 ], rs6( qt[ (i)- 5 ] ) ), \ - mm128_add4_32( qt[ (i)- 4 ], rs7( qt[ (i)- 3 ] ), \ + v128_add4_32( qt[ (i)- 4 ], rs7( qt[ (i)- 3 ] ), \ ss4( qt[ (i)- 2 ] ), ss5( qt[ (i)- 1 ] ) ) ), \ add_elt_s( M, H, (i)-16 ) ) @@ -141,169 +141,169 @@ static const uint32_t IV256[] = { // resulting in some sign changes compared to the reference code. #define Ws0 \ - _mm_add_epi32( \ - _mm_add_epi32( \ - _mm_sub_epi32( _mm_xor_si128( M[ 5], H[ 5] ), \ - _mm_xor_si128( M[ 7], H[ 7] ) ), \ - _mm_xor_si128( M[10], H[10] ) ), \ - _mm_add_epi32( _mm_xor_si128( M[13], H[13] ), \ - _mm_xor_si128( M[14], H[14] ) ) ) + v128_add32( \ + v128_add32( \ + v128_sub32( v128_xor( M[ 5], H[ 5] ), \ + v128_xor( M[ 7], H[ 7] ) ), \ + v128_xor( M[10], H[10] ) ), \ + v128_add32( v128_xor( M[13], H[13] ), \ + v128_xor( M[14], H[14] ) ) ) #define Ws1 \ - _mm_add_epi32( \ - _mm_add_epi32( \ - _mm_sub_epi32( _mm_xor_si128( M[ 6], H[ 6] ), \ - _mm_xor_si128( M[ 8], H[ 8] ) ), \ - _mm_xor_si128( M[11], H[11] ) ), \ - _mm_sub_epi32( _mm_xor_si128( M[14], H[14] ), \ - _mm_xor_si128( M[15], H[15] ) ) ) + v128_add32( \ + v128_add32( \ + v128_sub32( v128_xor( M[ 6], H[ 6] ), \ + v128_xor( M[ 8], H[ 8] ) ), \ + v128_xor( M[11], H[11] ) ), \ + v128_sub32( v128_xor( M[14], H[14] ), \ + v128_xor( M[15], H[15] ) ) ) #define Ws2 \ - _mm_sub_epi32( \ - _mm_add_epi32( \ - _mm_add_epi32( _mm_xor_si128( M[ 0], H[ 0] ), \ - _mm_xor_si128( M[ 7], H[ 7] ) ), \ - _mm_xor_si128( M[ 9], H[ 9] ) ), \ - _mm_sub_epi32( _mm_xor_si128( M[12], H[12] ), \ - _mm_xor_si128( M[15], H[15] ) ) ) + v128_sub32( \ + v128_add32( \ + v128_add32( v128_xor( M[ 0], H[ 0] ), \ + v128_xor( M[ 7], H[ 7] ) ), \ + v128_xor( M[ 9], H[ 9] ) ), \ + v128_sub32( v128_xor( M[12], H[12] ), \ + v128_xor( M[15], H[15] ) ) ) #define Ws3 \ - _mm_sub_epi32( \ - _mm_add_epi32( \ - _mm_sub_epi32( _mm_xor_si128( M[ 0], H[ 0] ), \ - _mm_xor_si128( M[ 1], H[ 1] ) ), \ - _mm_xor_si128( M[ 8], H[ 8] ) ), \ - _mm_sub_epi32( _mm_xor_si128( M[10], H[10] ), \ - _mm_xor_si128( M[13], H[13] ) ) ) + v128_sub32( \ + v128_add32( \ + v128_sub32( v128_xor( M[ 0], H[ 0] ), \ + v128_xor( M[ 1], H[ 1] ) ), \ + v128_xor( M[ 8], H[ 8] ) ), \ + v128_sub32( v128_xor( M[10], H[10] ), \ + v128_xor( M[13], H[13] ) ) ) #define Ws4 \ - _mm_sub_epi32( \ - _mm_add_epi32( \ - _mm_add_epi32( _mm_xor_si128( M[ 1], H[ 1] ), \ - _mm_xor_si128( M[ 2], H[ 2] ) ), \ - _mm_xor_si128( M[ 9], H[ 9] ) ), \ - _mm_add_epi32( _mm_xor_si128( M[11], H[11] ), \ - _mm_xor_si128( M[14], H[14] ) ) ) + v128_sub32( \ + v128_add32( \ + v128_add32( v128_xor( M[ 1], H[ 1] ), \ + v128_xor( M[ 2], H[ 2] ) ), \ + v128_xor( M[ 9], H[ 9] ) 
), \ + v128_add32( v128_xor( M[11], H[11] ), \ + v128_xor( M[14], H[14] ) ) ) #define Ws5 \ - _mm_sub_epi32( \ - _mm_add_epi32( \ - _mm_sub_epi32( _mm_xor_si128( M[ 3], H[ 3] ), \ - _mm_xor_si128( M[ 2], H[ 2] ) ), \ - _mm_xor_si128( M[10], H[10] ) ), \ - _mm_sub_epi32( _mm_xor_si128( M[12], H[12] ), \ - _mm_xor_si128( M[15], H[15] ) ) ) + v128_sub32( \ + v128_add32( \ + v128_sub32( v128_xor( M[ 3], H[ 3] ), \ + v128_xor( M[ 2], H[ 2] ) ), \ + v128_xor( M[10], H[10] ) ), \ + v128_sub32( v128_xor( M[12], H[12] ), \ + v128_xor( M[15], H[15] ) ) ) #define Ws6 \ - _mm_sub_epi32( \ - _mm_sub_epi32( \ - _mm_sub_epi32( _mm_xor_si128( M[ 4], H[ 4] ), \ - _mm_xor_si128( M[ 0], H[ 0] ) ), \ - _mm_xor_si128( M[ 3], H[ 3] ) ), \ - _mm_sub_epi32( _mm_xor_si128( M[11], H[11] ), \ - _mm_xor_si128( M[13], H[13] ) ) ) + v128_sub32( \ + v128_sub32( \ + v128_sub32( v128_xor( M[ 4], H[ 4] ), \ + v128_xor( M[ 0], H[ 0] ) ), \ + v128_xor( M[ 3], H[ 3] ) ), \ + v128_sub32( v128_xor( M[11], H[11] ), \ + v128_xor( M[13], H[13] ) ) ) #define Ws7 \ - _mm_sub_epi32( \ - _mm_sub_epi32( \ - _mm_sub_epi32( _mm_xor_si128( M[ 1], H[ 1] ), \ - _mm_xor_si128( M[ 4], H[ 4] ) ), \ - _mm_xor_si128( M[ 5], H[ 5] ) ), \ - _mm_add_epi32( _mm_xor_si128( M[12], H[12] ), \ - _mm_xor_si128( M[14], H[14] ) ) ) + v128_sub32( \ + v128_sub32( \ + v128_sub32( v128_xor( M[ 1], H[ 1] ), \ + v128_xor( M[ 4], H[ 4] ) ), \ + v128_xor( M[ 5], H[ 5] ) ), \ + v128_add32( v128_xor( M[12], H[12] ), \ + v128_xor( M[14], H[14] ) ) ) #define Ws8 \ - _mm_add_epi32( \ - _mm_sub_epi32( \ - _mm_sub_epi32( _mm_xor_si128( M[ 2], H[ 2] ), \ - _mm_xor_si128( M[ 5], H[ 5] ) ), \ - _mm_xor_si128( M[ 6], H[ 6] ) ), \ - _mm_sub_epi32( _mm_xor_si128( M[13], H[13] ), \ - _mm_xor_si128( M[15], H[15] ) ) ) + v128_add32( \ + v128_sub32( \ + v128_sub32( v128_xor( M[ 2], H[ 2] ), \ + v128_xor( M[ 5], H[ 5] ) ), \ + v128_xor( M[ 6], H[ 6] ) ), \ + v128_sub32( v128_xor( M[13], H[13] ), \ + v128_xor( M[15], H[15] ) ) ) #define Ws9 \ - _mm_sub_epi32( \ - _mm_add_epi32( \ - _mm_sub_epi32( _mm_xor_si128( M[ 0], H[ 0] ), \ - _mm_xor_si128( M[ 3], H[ 3] ) ), \ - _mm_xor_si128( M[ 6], H[ 6] ) ), \ - _mm_sub_epi32( _mm_xor_si128( M[ 7], H[ 7] ), \ - _mm_xor_si128( M[14], H[14] ) ) ) + v128_sub32( \ + v128_add32( \ + v128_sub32( v128_xor( M[ 0], H[ 0] ), \ + v128_xor( M[ 3], H[ 3] ) ), \ + v128_xor( M[ 6], H[ 6] ) ), \ + v128_sub32( v128_xor( M[ 7], H[ 7] ), \ + v128_xor( M[14], H[14] ) ) ) #define Ws10 \ - _mm_sub_epi32( \ - _mm_sub_epi32( \ - _mm_sub_epi32( _mm_xor_si128( M[ 8], H[ 8] ), \ - _mm_xor_si128( M[ 1], H[ 1] ) ), \ - _mm_xor_si128( M[ 4], H[ 4] ) ), \ - _mm_sub_epi32( _mm_xor_si128( M[ 7], H[ 7] ), \ - _mm_xor_si128( M[15], H[15] ) ) ) + v128_sub32( \ + v128_sub32( \ + v128_sub32( v128_xor( M[ 8], H[ 8] ), \ + v128_xor( M[ 1], H[ 1] ) ), \ + v128_xor( M[ 4], H[ 4] ) ), \ + v128_sub32( v128_xor( M[ 7], H[ 7] ), \ + v128_xor( M[15], H[15] ) ) ) #define Ws11 \ - _mm_sub_epi32( \ - _mm_sub_epi32( \ - _mm_sub_epi32( _mm_xor_si128( M[ 8], H[ 8] ), \ - _mm_xor_si128( M[ 0], H[ 0] ) ), \ - _mm_xor_si128( M[ 2], H[ 2] ) ), \ - _mm_sub_epi32( _mm_xor_si128( M[ 5], H[ 5] ), \ - _mm_xor_si128( M[ 9], H[ 9] ) ) ) + v128_sub32( \ + v128_sub32( \ + v128_sub32( v128_xor( M[ 8], H[ 8] ), \ + v128_xor( M[ 0], H[ 0] ) ), \ + v128_xor( M[ 2], H[ 2] ) ), \ + v128_sub32( v128_xor( M[ 5], H[ 5] ), \ + v128_xor( M[ 9], H[ 9] ) ) ) #define Ws12 \ - _mm_sub_epi32( \ - _mm_sub_epi32( \ - _mm_add_epi32( _mm_xor_si128( M[ 1], H[ 1] ), \ - _mm_xor_si128( M[ 3], H[ 3] ) ), \ - _mm_xor_si128( M[ 6], 
H[ 6] ) ), \ - _mm_sub_epi32( _mm_xor_si128( M[ 9], H[ 9] ), \ - _mm_xor_si128( M[10], H[10] ) ) ) + v128_sub32( \ + v128_sub32( \ + v128_add32( v128_xor( M[ 1], H[ 1] ), \ + v128_xor( M[ 3], H[ 3] ) ), \ + v128_xor( M[ 6], H[ 6] ) ), \ + v128_sub32( v128_xor( M[ 9], H[ 9] ), \ + v128_xor( M[10], H[10] ) ) ) #define Ws13 \ - _mm_add_epi32( \ - _mm_add_epi32( \ - _mm_add_epi32( _mm_xor_si128( M[ 2], H[ 2] ), \ - _mm_xor_si128( M[ 4], H[ 4] ) ), \ - _mm_xor_si128( M[ 7], H[ 7] ) ), \ - _mm_add_epi32( _mm_xor_si128( M[10], H[10] ), \ - _mm_xor_si128( M[11], H[11] ) ) ) + v128_add32( \ + v128_add32( \ + v128_add32( v128_xor( M[ 2], H[ 2] ), \ + v128_xor( M[ 4], H[ 4] ) ), \ + v128_xor( M[ 7], H[ 7] ) ), \ + v128_add32( v128_xor( M[10], H[10] ), \ + v128_xor( M[11], H[11] ) ) ) #define Ws14 \ - _mm_sub_epi32( \ - _mm_add_epi32( \ - _mm_sub_epi32( _mm_xor_si128( M[ 3], H[ 3] ), \ - _mm_xor_si128( M[ 5], H[ 5] ) ), \ - _mm_xor_si128( M[ 8], H[ 8] ) ), \ - _mm_add_epi32( _mm_xor_si128( M[11], H[11] ), \ - _mm_xor_si128( M[12], H[12] ) ) ) + v128_sub32( \ + v128_add32( \ + v128_sub32( v128_xor( M[ 3], H[ 3] ), \ + v128_xor( M[ 5], H[ 5] ) ), \ + v128_xor( M[ 8], H[ 8] ) ), \ + v128_add32( v128_xor( M[11], H[11] ), \ + v128_xor( M[12], H[12] ) ) ) #define Ws15 \ - _mm_sub_epi32( \ - _mm_sub_epi32( \ - _mm_sub_epi32( _mm_xor_si128( M[12], H[12] ), \ - _mm_xor_si128( M[ 4], H[4] ) ), \ - _mm_xor_si128( M[ 6], H[ 6] ) ), \ - _mm_sub_epi32( _mm_xor_si128( M[ 9], H[ 9] ), \ - _mm_xor_si128( M[13], H[13] ) ) ) + v128_sub32( \ + v128_sub32( \ + v128_sub32( v128_xor( M[12], H[12] ), \ + v128_xor( M[ 4], H[4] ) ), \ + v128_xor( M[ 6], H[ 6] ) ), \ + v128_sub32( v128_xor( M[ 9], H[ 9] ), \ + v128_xor( M[13], H[13] ) ) ) -void compress_small( const __m128i *M, const __m128i H[16], __m128i dH[16] ) +void compress_small( const v128u64_t *M, const v128u64_t H[16], v128u64_t dH[16] ) { - __m128i qt[32], xl, xh; \ - - qt[ 0] = _mm_add_epi32( ss0( Ws0 ), H[ 1] ); - qt[ 1] = _mm_add_epi32( ss1( Ws1 ), H[ 2] ); - qt[ 2] = _mm_add_epi32( ss2( Ws2 ), H[ 3] ); - qt[ 3] = _mm_add_epi32( ss3( Ws3 ), H[ 4] ); - qt[ 4] = _mm_add_epi32( ss4( Ws4 ), H[ 5] ); - qt[ 5] = _mm_add_epi32( ss0( Ws5 ), H[ 6] ); - qt[ 6] = _mm_add_epi32( ss1( Ws6 ), H[ 7] ); - qt[ 7] = _mm_add_epi32( ss2( Ws7 ), H[ 8] ); - qt[ 8] = _mm_add_epi32( ss3( Ws8 ), H[ 9] ); - qt[ 9] = _mm_add_epi32( ss4( Ws9 ), H[10] ); - qt[10] = _mm_add_epi32( ss0( Ws10), H[11] ); - qt[11] = _mm_add_epi32( ss1( Ws11), H[12] ); - qt[12] = _mm_add_epi32( ss2( Ws12), H[13] ); - qt[13] = _mm_add_epi32( ss3( Ws13), H[14] ); - qt[14] = _mm_add_epi32( ss4( Ws14), H[15] ); - qt[15] = _mm_add_epi32( ss0( Ws15), H[ 0] ); + v128u64_t qt[32], xl, xh; \ + + qt[ 0] = v128_add32( ss0( Ws0 ), H[ 1] ); + qt[ 1] = v128_add32( ss1( Ws1 ), H[ 2] ); + qt[ 2] = v128_add32( ss2( Ws2 ), H[ 3] ); + qt[ 3] = v128_add32( ss3( Ws3 ), H[ 4] ); + qt[ 4] = v128_add32( ss4( Ws4 ), H[ 5] ); + qt[ 5] = v128_add32( ss0( Ws5 ), H[ 6] ); + qt[ 6] = v128_add32( ss1( Ws6 ), H[ 7] ); + qt[ 7] = v128_add32( ss2( Ws7 ), H[ 8] ); + qt[ 8] = v128_add32( ss3( Ws8 ), H[ 9] ); + qt[ 9] = v128_add32( ss4( Ws9 ), H[10] ); + qt[10] = v128_add32( ss0( Ws10), H[11] ); + qt[11] = v128_add32( ss1( Ws11), H[12] ); + qt[12] = v128_add32( ss2( Ws12), H[13] ); + qt[13] = v128_add32( ss3( Ws13), H[14] ); + qt[14] = v128_add32( ss4( Ws14), H[15] ); + qt[15] = v128_add32( ss0( Ws15), H[ 0] ); qt[16] = expand1s( qt, M, H, 16 ); qt[17] = expand1s( qt, M, H, 17 ); qt[18] = expand2s( qt, M, H, 18 ); @@ -321,92 +321,92 @@ void 
compress_small( const __m128i *M, const __m128i H[16], __m128i dH[16] ) qt[30] = expand2s( qt, M, H, 30 ); qt[31] = expand2s( qt, M, H, 31 ); - xl = _mm_xor_si128( mm128_xor4( qt[16], qt[17], qt[18], qt[19] ), - mm128_xor4( qt[20], qt[21], qt[22], qt[23] ) ); - xh = _mm_xor_si128( xl, _mm_xor_si128( - mm128_xor4( qt[24], qt[25], qt[26], qt[27] ), - mm128_xor4( qt[28], qt[29], qt[30], qt[31] ) ) ); - - dH[ 0] = _mm_add_epi32( - _mm_xor_si128( M[0], - _mm_xor_si128( _mm_slli_epi32( xh, 5 ), - _mm_srli_epi32( qt[16], 5 ) ) ), - _mm_xor_si128( _mm_xor_si128( xl, qt[24] ), qt[ 0] )); - dH[ 1] = _mm_add_epi32( - _mm_xor_si128( M[1], - _mm_xor_si128( _mm_srli_epi32( xh, 7 ), - _mm_slli_epi32( qt[17], 8 ) ) ), - _mm_xor_si128( _mm_xor_si128( xl, qt[25] ), qt[ 1] )); - dH[ 2] = _mm_add_epi32( - _mm_xor_si128( M[2], - _mm_xor_si128( _mm_srli_epi32( xh, 5 ), - _mm_slli_epi32( qt[18], 5 ) ) ), - _mm_xor_si128( _mm_xor_si128( xl, qt[26] ), qt[ 2] )); - dH[ 3] = _mm_add_epi32( - _mm_xor_si128( M[3], - _mm_xor_si128( _mm_srli_epi32( xh, 1 ), - _mm_slli_epi32( qt[19], 5 ) ) ), - _mm_xor_si128( _mm_xor_si128( xl, qt[27] ), qt[ 3] )); - dH[ 4] = _mm_add_epi32( - _mm_xor_si128( M[4], - _mm_xor_si128( _mm_srli_epi32( xh, 3 ), - _mm_slli_epi32( qt[20], 0 ) ) ), - _mm_xor_si128( _mm_xor_si128( xl, qt[28] ), qt[ 4] )); - dH[ 5] = _mm_add_epi32( - _mm_xor_si128( M[5], - _mm_xor_si128( _mm_slli_epi32( xh, 6 ), - _mm_srli_epi32( qt[21], 6 ) ) ), - _mm_xor_si128( _mm_xor_si128( xl, qt[29] ), qt[ 5] )); - dH[ 6] = _mm_add_epi32( - _mm_xor_si128( M[6], - _mm_xor_si128( _mm_srli_epi32( xh, 4 ), - _mm_slli_epi32( qt[22], 6 ) ) ), - _mm_xor_si128( _mm_xor_si128( xl, qt[30] ), qt[ 6] )); - dH[ 7] = _mm_add_epi32( - _mm_xor_si128( M[7], - _mm_xor_si128( _mm_srli_epi32( xh, 11 ), - _mm_slli_epi32( qt[23], 2 ) ) ), - _mm_xor_si128( _mm_xor_si128( xl, qt[31] ), qt[ 7] )); - dH[ 8] = _mm_add_epi32( _mm_add_epi32( - mm128_rol_32( dH[4], 9 ), - _mm_xor_si128( _mm_xor_si128( xh, qt[24] ), M[ 8] )), - _mm_xor_si128( _mm_slli_epi32( xl, 8 ), - _mm_xor_si128( qt[23], qt[ 8] ) ) ); - dH[ 9] = _mm_add_epi32( _mm_add_epi32( - mm128_rol_32( dH[5], 10 ), - _mm_xor_si128( _mm_xor_si128( xh, qt[25] ), M[ 9] )), - _mm_xor_si128( _mm_srli_epi32( xl, 6 ), - _mm_xor_si128( qt[16], qt[ 9] ) ) ); - dH[10] = _mm_add_epi32( _mm_add_epi32( - mm128_rol_32( dH[6], 11 ), - _mm_xor_si128( _mm_xor_si128( xh, qt[26] ), M[10] )), - _mm_xor_si128( _mm_slli_epi32( xl, 6 ), - _mm_xor_si128( qt[17], qt[10] ) ) ); - dH[11] = _mm_add_epi32( _mm_add_epi32( - mm128_rol_32( dH[7], 12 ), - _mm_xor_si128( _mm_xor_si128( xh, qt[27] ), M[11] )), - _mm_xor_si128( _mm_slli_epi32( xl, 4 ), - _mm_xor_si128( qt[18], qt[11] ) ) ); - dH[12] = _mm_add_epi32( _mm_add_epi32( - mm128_rol_32( dH[0], 13 ), - _mm_xor_si128( _mm_xor_si128( xh, qt[28] ), M[12] )), - _mm_xor_si128( _mm_srli_epi32( xl, 3 ), - _mm_xor_si128( qt[19], qt[12] ) ) ); - dH[13] = _mm_add_epi32( _mm_add_epi32( - mm128_rol_32( dH[1], 14 ), - _mm_xor_si128( _mm_xor_si128( xh, qt[29] ), M[13] )), - _mm_xor_si128( _mm_srli_epi32( xl, 4 ), - _mm_xor_si128( qt[20], qt[13] ) ) ); - dH[14] = _mm_add_epi32( _mm_add_epi32( - mm128_rol_32( dH[2], 15 ), - _mm_xor_si128( _mm_xor_si128( xh, qt[30] ), M[14] )), - _mm_xor_si128( _mm_srli_epi32( xl, 7 ), - _mm_xor_si128( qt[21], qt[14] ) ) ); - dH[15] = _mm_add_epi32( _mm_add_epi32( - mm128_rol_32( dH[3], 16 ), - _mm_xor_si128( _mm_xor_si128( xh, qt[31] ), M[15] )), - _mm_xor_si128( _mm_srli_epi32( xl, 2 ), - _mm_xor_si128( qt[22], qt[15] ) ) ); + xl = v128_xor( v128_xor4( 
qt[16], qt[17], qt[18], qt[19] ), + v128_xor4( qt[20], qt[21], qt[22], qt[23] ) ); + xh = v128_xor( xl, v128_xor( + v128_xor4( qt[24], qt[25], qt[26], qt[27] ), + v128_xor4( qt[28], qt[29], qt[30], qt[31] ) ) ); + + dH[ 0] = v128_add32( + v128_xor( M[0], + v128_xor( v128_sl32( xh, 5 ), + v128_sr32( qt[16], 5 ) ) ), + v128_xor( v128_xor( xl, qt[24] ), qt[ 0] )); + dH[ 1] = v128_add32( + v128_xor( M[1], + v128_xor( v128_sr32( xh, 7 ), + v128_sl32( qt[17], 8 ) ) ), + v128_xor( v128_xor( xl, qt[25] ), qt[ 1] )); + dH[ 2] = v128_add32( + v128_xor( M[2], + v128_xor( v128_sr32( xh, 5 ), + v128_sl32( qt[18], 5 ) ) ), + v128_xor( v128_xor( xl, qt[26] ), qt[ 2] )); + dH[ 3] = v128_add32( + v128_xor( M[3], + v128_xor( v128_sr32( xh, 1 ), + v128_sl32( qt[19], 5 ) ) ), + v128_xor( v128_xor( xl, qt[27] ), qt[ 3] )); + dH[ 4] = v128_add32( + v128_xor( M[4], + v128_xor( v128_sr32( xh, 3 ), + v128_sl32( qt[20], 0 ) ) ), + v128_xor( v128_xor( xl, qt[28] ), qt[ 4] )); + dH[ 5] = v128_add32( + v128_xor( M[5], + v128_xor( v128_sl32( xh, 6 ), + v128_sr32( qt[21], 6 ) ) ), + v128_xor( v128_xor( xl, qt[29] ), qt[ 5] )); + dH[ 6] = v128_add32( + v128_xor( M[6], + v128_xor( v128_sr32( xh, 4 ), + v128_sl32( qt[22], 6 ) ) ), + v128_xor( v128_xor( xl, qt[30] ), qt[ 6] )); + dH[ 7] = v128_add32( + v128_xor( M[7], + v128_xor( v128_sr32( xh, 11 ), + v128_sl32( qt[23], 2 ) ) ), + v128_xor( v128_xor( xl, qt[31] ), qt[ 7] )); + dH[ 8] = v128_add32( v128_add32( + v128_rol32( dH[4], 9 ), + v128_xor( v128_xor( xh, qt[24] ), M[ 8] )), + v128_xor( v128_sl32( xl, 8 ), + v128_xor( qt[23], qt[ 8] ) ) ); + dH[ 9] = v128_add32( v128_add32( + v128_rol32( dH[5], 10 ), + v128_xor( v128_xor( xh, qt[25] ), M[ 9] )), + v128_xor( v128_sr32( xl, 6 ), + v128_xor( qt[16], qt[ 9] ) ) ); + dH[10] = v128_add32( v128_add32( + v128_rol32( dH[6], 11 ), + v128_xor( v128_xor( xh, qt[26] ), M[10] )), + v128_xor( v128_sl32( xl, 6 ), + v128_xor( qt[17], qt[10] ) ) ); + dH[11] = v128_add32( v128_add32( + v128_rol32( dH[7], 12 ), + v128_xor( v128_xor( xh, qt[27] ), M[11] )), + v128_xor( v128_sl32( xl, 4 ), + v128_xor( qt[18], qt[11] ) ) ); + dH[12] = v128_add32( v128_add32( + v128_rol32( dH[0], 13 ), + v128_xor( v128_xor( xh, qt[28] ), M[12] )), + v128_xor( v128_sr32( xl, 3 ), + v128_xor( qt[19], qt[12] ) ) ); + dH[13] = v128_add32( v128_add32( + v128_rol32( dH[1], 14 ), + v128_xor( v128_xor( xh, qt[29] ), M[13] )), + v128_xor( v128_sr32( xl, 4 ), + v128_xor( qt[20], qt[13] ) ) ); + dH[14] = v128_add32( v128_add32( + v128_rol32( dH[2], 15 ), + v128_xor( v128_xor( xh, qt[30] ), M[14] )), + v128_xor( v128_sr32( xl, 7 ), + v128_xor( qt[21], qt[14] ) ) ); + dH[15] = v128_add32( v128_add32( + v128_rol32( dH[3], 16 ), + v128_xor( v128_xor( xh, qt[31] ), M[15] )), + v128_xor( v128_sr32( xl, 2 ), + v128_xor( qt[22], qt[15] ) ) ); } static const uint32_t final_s[16][4] = @@ -429,7 +429,7 @@ static const uint32_t final_s[16][4] = { 0xaaaaaaaf, 0xaaaaaaaf, 0xaaaaaaaf, 0xaaaaaaaf } }; /* -static const __m128i final_s[16] = +static const v128u64_t final_s[16] = { { 0xaaaaaaa0aaaaaaa0, 0xaaaaaaa0aaaaaaa0 }, { 0xaaaaaaa1aaaaaaa1, 0xaaaaaaa1aaaaaaa1 }, @@ -451,26 +451,26 @@ static const __m128i final_s[16] = */ void bmw256_4way_init( bmw256_4way_context *ctx ) { - ctx->H[ 0] = _mm_set1_epi64x( 0x4041424340414243 ); - ctx->H[ 1] = _mm_set1_epi64x( 0x4445464744454647 ); - ctx->H[ 2] = _mm_set1_epi64x( 0x48494A4B48494A4B ); - ctx->H[ 3] = _mm_set1_epi64x( 0x4C4D4E4F4C4D4E4F ); - ctx->H[ 4] = _mm_set1_epi64x( 0x5051525350515253 ); - ctx->H[ 5] = _mm_set1_epi64x( 
0x5455565754555657 ); - ctx->H[ 6] = _mm_set1_epi64x( 0x58595A5B58595A5B ); - ctx->H[ 7] = _mm_set1_epi64x( 0x5C5D5E5F5C5D5E5F ); - ctx->H[ 8] = _mm_set1_epi64x( 0x6061626360616263 ); - ctx->H[ 9] = _mm_set1_epi64x( 0x6465666764656667 ); - ctx->H[10] = _mm_set1_epi64x( 0x68696A6B68696A6B ); - ctx->H[11] = _mm_set1_epi64x( 0x6C6D6E6F6C6D6E6F ); - ctx->H[12] = _mm_set1_epi64x( 0x7071727370717273 ); - ctx->H[13] = _mm_set1_epi64x( 0x7475767774757677 ); - ctx->H[14] = _mm_set1_epi64x( 0x78797A7B78797A7B ); - ctx->H[15] = _mm_set1_epi64x( 0x7C7D7E7F7C7D7E7F ); + ctx->H[ 0] = v128_64( 0x4041424340414243 ); + ctx->H[ 1] = v128_64( 0x4445464744454647 ); + ctx->H[ 2] = v128_64( 0x48494A4B48494A4B ); + ctx->H[ 3] = v128_64( 0x4C4D4E4F4C4D4E4F ); + ctx->H[ 4] = v128_64( 0x5051525350515253 ); + ctx->H[ 5] = v128_64( 0x5455565754555657 ); + ctx->H[ 6] = v128_64( 0x58595A5B58595A5B ); + ctx->H[ 7] = v128_64( 0x5C5D5E5F5C5D5E5F ); + ctx->H[ 8] = v128_64( 0x6061626360616263 ); + ctx->H[ 9] = v128_64( 0x6465666764656667 ); + ctx->H[10] = v128_64( 0x68696A6B68696A6B ); + ctx->H[11] = v128_64( 0x6C6D6E6F6C6D6E6F ); + ctx->H[12] = v128_64( 0x7071727370717273 ); + ctx->H[13] = v128_64( 0x7475767774757677 ); + ctx->H[14] = v128_64( 0x78797A7B78797A7B ); + ctx->H[15] = v128_64( 0x7C7D7E7F7C7D7E7F ); // for ( int i = 0; i < 16; i++ ) -// sc->H[i] = _mm_set1_epi32( iv[i] ); +// sc->H[i] = v128_32( iv[i] ); ctx->ptr = 0; ctx->bit_count = 0; } @@ -478,10 +478,10 @@ void bmw256_4way_init( bmw256_4way_context *ctx ) static void bmw32_4way(bmw_4way_small_context *sc, const void *data, size_t len) { - __m128i *vdata = (__m128i*)data; - __m128i *buf; - __m128i htmp[16]; - __m128i *h1, *h2; + v128u64_t *vdata = (v128u64_t*)data; + v128u64_t *buf; + v128u64_t htmp[16]; + v128u64_t *h1, *h2; size_t ptr; const int buf_size = 64; // bytes of one lane, compatible with len @@ -497,13 +497,13 @@ bmw32_4way(bmw_4way_small_context *sc, const void *data, size_t len) clen = buf_size - ptr; if ( clen > len ) clen = len; - memcpy_128( buf + (ptr>>2), vdata, clen >> 2 ); + v128_memcpy( buf + (ptr>>2), vdata, clen >> 2 ); vdata += ( clen >> 2 ); len -= clen; ptr += clen; if ( ptr == buf_size ) { - __m128i *ht; + v128u64_t *ht; compress_small( buf, h1, h2 ); ht = h1; h1 = h2; @@ -513,46 +513,45 @@ bmw32_4way(bmw_4way_small_context *sc, const void *data, size_t len) } sc->ptr = ptr; - if ( h1 != sc->H ) - memcpy_128( sc->H, h1, 16 ); + v128_memcpy( sc->H, h1, 16 ); } static void bmw32_4way_close(bmw_4way_small_context *sc, unsigned ub, unsigned n, void *dst, size_t out_size_w32) { - __m128i *buf; - __m128i h1[16], h2[16], *h; + v128u64_t *buf; + v128u64_t h1[16], h2[16], *h; size_t ptr, u, v; const int buf_size = 64; // bytes of one lane, compatible with len buf = sc->buf; ptr = sc->ptr; - buf[ ptr>>2 ] = _mm_set1_epi64x( 0x0000008000000080 ); + buf[ ptr>>2 ] = v128_64( 0x0000008000000080 ); ptr += 4; h = sc->H; // assume bit_count fits in 32 bits if ( ptr > buf_size - 4 ) { - memset_zero_128( buf + (ptr>>2), (buf_size - ptr) >> 2 ); + v128_memset_zero( buf + (ptr>>2), (buf_size - ptr) >> 2 ); compress_small( buf, h, h1 ); ptr = 0; h = h1; } - memset_zero_128( buf + (ptr>>2), (buf_size - 8 - ptr) >> 2 ); - buf[ (buf_size - 8) >> 2 ] = _mm_set1_epi32( sc->bit_count + n ); - buf[ (buf_size - 4) >> 2 ] = m128_zero; + v128_memset_zero( buf + (ptr>>2), (buf_size - 8 - ptr) >> 2 ); + buf[ (buf_size - 8) >> 2 ] = v128_32( sc->bit_count + n ); + buf[ (buf_size - 4) >> 2 ] = v128_zero; compress_small( buf, h, h2 ); for ( u = 0; u < 16; u ++ ) 
buf[u] = h2[u]; - compress_small( buf, (__m128i*)final_s, h1 ); + compress_small( buf, (v128u64_t*)final_s, h1 ); for (u = 0, v = 16 - out_size_w32; u < out_size_w32; u ++, v ++) - casti_m128i( dst, u ) = h1[v]; + casti_v128( dst, u ) = h1[v]; } /* diff --git a/algo/fugue/fugue-aesni.c b/algo/fugue/fugue-aesni.c index 42d7b1d5..b3f5ba59 100644 --- a/algo/fugue/fugue-aesni.c +++ b/algo/fugue/fugue-aesni.c @@ -146,7 +146,7 @@ MYALIGN const unsigned int _IV512[] = { #define SUBSTITUTE(r0, _t2 )\ _t2 = _mm_shuffle_epi8(r0, M128(_inv_shift_rows));\ - _t2 = _mm_aesenclast_si128( _t2, m128_zero ) + _t2 = _mm_aesenclast_si128( _t2, v128_zero ) #define SUPERMIX(t0, t1, t2, t3, t4)\ t2 = t0;\ @@ -162,16 +162,16 @@ MYALIGN const unsigned int _IV512[] = { t1 = _mm_shuffle_epi8(t4, M128(_supermix1d));\ t4 = _mm_xor_si128(t4, t1);\ t1 = _mm_shuffle_epi8(t2, M128(_supermix1a));\ - t2 = mm128_xor3(t2, t3, t0 );\ + t2 = v128_xor3(t2, t3, t0 );\ t2 = _mm_shuffle_epi8(t2, M128(_supermix7a));\ - t4 = mm128_xor3( t4, t1, t2 ); \ + t4 = v128_xor3( t4, t1, t2 ); \ t2 = _mm_shuffle_epi8(t2, M128(_supermix7b));\ t3 = _mm_shuffle_epi8(t3, M128(_supermix2a));\ t1 = _mm_shuffle_epi8(t0, M128(_supermix4a));\ t0 = _mm_shuffle_epi8(t0, M128(_supermix4b));\ - t4 = mm128_xor3( t4, t2, t1 ); \ + t4 = v128_xor3( t4, t2, t1 ); \ t0 = _mm_xor_si128(t0, t3);\ - t4 = mm128_xor3(t4, t0, _mm_shuffle_epi8(t0, M128(_supermix4c))); + t4 = v128_xor3(t4, t0, _mm_shuffle_epi8(t0, M128(_supermix4c))); /* #define SUPERMIX(t0, t1, t2, t3, t4)\ @@ -188,7 +188,7 @@ MYALIGN const unsigned int _IV512[] = { t4 = _mm_xor_si128(t4, t1);\ t1 = _mm_shuffle_epi8(t2, M128(_supermix1a));\ t4 = _mm_xor_si128(t4, t1);\ - t2 = mm128_xor3(t2, t3, t0 );\ + t2 = v128_xor3(t2, t3, t0 );\ t2 = _mm_shuffle_epi8(t2, M128(_supermix7a));\ t4 = _mm_xor_si128(t4, t2);\ t2 = _mm_shuffle_epi8(t2, M128(_supermix7b));\ @@ -485,7 +485,7 @@ HashReturn fugue512_Init(hashState_fugue *ctx, int nHashSize) ctx->uBlockLength = 4; for(i = 0; i < 6; i++) - ctx->state[i] = m128_zero; + ctx->state[i] = v128_zero; ctx->state[6] = _mm_load_si128((__m128i*)_IV512 + 0); ctx->state[7] = _mm_load_si128((__m128i*)_IV512 + 1); diff --git a/algo/groestl/aes_ni/groestl-intr-aes.h b/algo/groestl/aes_ni/groestl-intr-aes.h index ceb69ce8..ec27470f 100644 --- a/algo/groestl/aes_ni/groestl-intr-aes.h +++ b/algo/groestl/aes_ni/groestl-intr-aes.h @@ -66,7 +66,40 @@ static const v128u32_t BLEND_MASK = { 0xffffffff, 0, 0, 0xffffffff }; #define gr_shuffle32( v ) v128_blendv( v128_qrev32( v ), v, BLEND_MASK ) -//#define gr_shuffle32( v ) v128_shufflev32( v, vmask_d8 ) +/* +#define TRANSP_MASK \ + 0xd,0x5,0x9,0x1,0xc,0x4,0x8,0x0,0xf,0x7,0xb,0x3,0xe,0x6,0xa,0x2 +#define SUBSH_MASK0 \ + 0xb,0xe,0x1,0x4,0x7,0xa,0xd,0x0,0x3,0x6,0x9,0xc,0xf,0x2,0x5,0x8 +#define SUBSH_MASK1 \ + 0xc,0xf,0x2,0x5,0x8,0xb,0xe,0x1,0x4,0x7,0xa,0xd,0x0,0x3,0x6,0x9 +#define SUBSH_MASK2 \ + 0xd,0x0,0x3,0x6,0x9,0xc,0xf,0x2,0x5,0x8,0xb,0xe,0x1,0x4,0x7,0xa +#define SUBSH_MASK3 \ + 0xe,0x1,0x4,0x7,0xa,0xd,0x0,0x3,0x6,0x9,0xc,0xf,0x2,0x5,0x8,0xb +#define SUBSH_MASK4 \ + 0xf,0x2,0x5,0x8,0xb,0xe,0x1,0x4,0x7,0xa,0xd,0x0,0x3,0x6,0x9,0xc +#define SUBSH_MASK5 \ + 0x0,0x3,0x6,0x9,0xc,0xf,0x2,0x5,0x8,0xb,0xe,0x1,0x4,0x7,0xa,0xd +#define SUBSH_MASK6 \ + 0x1,0x4,0x7,0xa,0xd,0x0,0x3,0x6,0x9,0xc,0xf,0x2,0x5,0x8,0xb,0xe +#define SUBSH_MASK7 \ + 0x6,0x9,0xc,0xf,0x2,0x5,0x8,0xb,0xe,0x1,0x4,0x7,0xa,0xd,0x0,0x3 + +//#define gr_shuffle8( v, c ) v128_shullfev8( v, c ) + + +#define gr_shuffle8( v, c15, c14, c13, c12, c11, c10, c09, c08, \ + c07, c06, c05, 
c04, c03, c02, c01, c00 ) \ + v128_movlane8( v128_movlane8( v128_movlane8( v128_movlane8( \ + v128_movlane8( v128_movlane8( v128_movlane8( v128_movlane8( \ + v128_movlane8( v128_movlane8( v128_movlane8( v128_movlane8( \ + v128_movlane8( v128_movlane8( v128_movlane8( v128_movlane8( \ + v, 15, v, c15 ), 14, v, c14 ), 13, v, c13 ), 12, v, c12 ), \ + 11, v, c11 ), 10, v, c10 ), 9, v, c09 ), 8, v, c08 ), \ + 7, v, c07 ), 6, v, c06 ), 5, v, c05 ), 4, v, c04 ), \ + 3, v, c03 ), 2, v, c02 ), 1, v, c01 ), 0, v, c00 ) +*/ #else diff --git a/algo/groestl/groestl256-intr-4way.h b/algo/groestl/groestl256-intr-4way.h index 1981a69b..a4ffb645 100644 --- a/algo/groestl/groestl256-intr-4way.h +++ b/algo/groestl/groestl256-intr-4way.h @@ -626,7 +626,7 @@ static const __m256i SUBSH_MASK7_2WAY = #define ROUND_2WAY(i, a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\ /* AddRoundConstant */\ - b1 = mm256_bcast_m128( mm128_mask_32( m128_neg1, 0x3 ) ); \ + b1 = mm256_bcast_m128( mm128_mask_32( v128_neg1, 0x3 ) ); \ a0 = _mm256_xor_si256( a0, mm256_bcast_m128( round_const_l0[i] ) );\ a1 = _mm256_xor_si256( a1, b1 );\ a2 = _mm256_xor_si256( a2, b1 );\ diff --git a/algo/hamsi/hamsi-hash-4way.h b/algo/hamsi/hamsi-hash-4way.h index d01b615f..4850ca12 100644 --- a/algo/hamsi/hamsi-hash-4way.h +++ b/algo/hamsi/hamsi-hash-4way.h @@ -38,7 +38,7 @@ #include #include "simd-utils.h" -// SSE2 or NEON Hamsi-512 2x64 +#if defined(__SSE4_2__) || defined(__ARM_NEON) typedef struct { @@ -57,6 +57,8 @@ void hamsi512_2x64_ctx( hamsi512_2x64_context *sc, void *dst, const void *data, size_t len ); void hamsi512_2x64( void *dst, const void *data, size_t len ); +#endif + #if defined (__AVX2__) // Hamsi-512 4x64 diff --git a/algo/luffa/luffa_for_sse2.c b/algo/luffa/luffa_for_sse2.c index e8296bc1..ef06313a 100644 --- a/algo/luffa/luffa_for_sse2.c +++ b/algo/luffa/luffa_for_sse2.c @@ -75,16 +75,16 @@ #define SUBCRUMB( a0, a1, a2, a3 ) \ { \ v128_t t = a0; \ - a0 = mm128_xoror( a3, a0, a1 ); \ + a0 = v128_xoror( a3, a0, a1 ); \ a2 = v128_xor( a2, a3 ); \ a1 = _mm_ternarylogic_epi64( a1, a3, t, 0x87 ); /* a1 xnor (a3 & t) */ \ - a3 = mm128_xorand( a2, a3, t ); \ - a2 = mm128_xorand( a1, a2, a0 ); \ + a3 = v128_xorand( a2, a3, t ); \ + a2 = v128_xorand( a1, a2, a0 ); \ a1 = v128_or( a1, a3 ); \ a3 = v128_xor( a3, a2 ); \ t = v128_xor( t, a1 ); \ a2 = v128_and( a2, a1 ); \ - a1 = mm128_xnor( a1, a0 ); \ + a1 = v128_xnor( a1, a0 ); \ a0 = t; \ } diff --git a/algo/ripemd/ripemd-hash-4way.c b/algo/ripemd/ripemd-hash-4way.c index 2a98afb1..d03bd57d 100644 --- a/algo/ripemd/ripemd-hash-4way.c +++ b/algo/ripemd/ripemd-hash-4way.c @@ -35,13 +35,13 @@ static const uint32_t IV[5] = _mm_xor_si128( _mm_and_si128( _mm_xor_si128( y, z ), x ), z ) #define F3(x, y, z) \ - _mm_xor_si128( _mm_or_si128( x, mm128_not( y ) ), z ) + _mm_xor_si128( _mm_or_si128( x, v128_not( y ) ), z ) #define F4(x, y, z) \ _mm_xor_si128( _mm_and_si128( _mm_xor_si128( x, y ), z ), y ) #define F5(x, y, z) \ - _mm_xor_si128( x, _mm_or_si128( y, mm128_not( z ) ) ) + _mm_xor_si128( x, _mm_or_si128( y, v128_not( z ) ) ) #define RR(a, b, c, d, e, f, s, r, k) \ do{ \ diff --git a/algo/sha/sha256-hash-4way.c b/algo/sha/sha256-hash-4way.c index 224e6ba9..5e37c0b1 100644 --- a/algo/sha/sha256-hash-4way.c +++ b/algo/sha/sha256-hash-4way.c @@ -319,7 +319,7 @@ int sha256_4x32_transform_le_short( v128_t *state_out, const v128_t *data, v128_t A, B, C, D, E, F, G, H, T0, T1, T2; v128_t vmask, targ, hash; int t6_mask, flip; - v128_t W[16]; memcpy_128( W, data, 16 ); + v128_t W[16]; 
v128_memcpy( W, data, 16 ); A = v128_load( state_in ); B = v128_load( state_in+1 ); diff --git a/algo/sha/sha512256d-4way.c b/algo/sha/sha512256d-4way.c index ec376df3..8d38da76 100644 --- a/algo/sha/sha512256d-4way.c +++ b/algo/sha/sha512256d-4way.c @@ -5,11 +5,11 @@ #include #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) - #define SHA512256D_8WAY 1 +#define SHA512256D_8WAY 1 #elif defined(__AVX2__) - #define SHA512256D_4WAY 1 +#define SHA512256D_4WAY 1 #elif defined(__SSE2__) || defined(__ARM_NEON) - #define SHA512256D_2WAY 1 +#define SHA512256D_2WAY 1 #endif #if defined(SHA512256D_8WAY) @@ -110,14 +110,13 @@ int scanhash_sha512256d_4way( struct work *work, uint32_t max_nonce, const uint32_t first_nonce = pdata[19]; const uint32_t last_nonce = max_nonce - 4; uint32_t n = first_nonce; - __m256i *noncev = (__m256i*)vdata + 9; const int thr_id = mythr->id; const bool bench = opt_benchmark; const __m256i four = v256_64( 0x0000000400000000 ); mm256_bswap32_intrlv80_4x64( vdata, pdata ); - *noncev = mm256_intrlv_blend_32( - _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev ); + casti_m256i( vdata,9 ) = mm256_intrlv_blend_32( _mm256_set_epi32( + n+3, 0, n+2, 0, n+1, 0, n, 0 ), casti_m256i( vdata,9 ) ); do { sha512256d_4way_init( &ctx ); @@ -138,7 +137,7 @@ int scanhash_sha512256d_4way( struct work *work, uint32_t max_nonce, submit_solution( work, lane_hash, mythr ); } } - *noncev = _mm256_add_epi32( *noncev, four ); + casti_m256i( vdata,9 ) = _mm256_add_epi32( casti_m256i( vdata,9 ), four ); n += 4; } while ( (n < last_nonce) && !work_restart[thr_id].restart ); @@ -180,11 +179,10 @@ int scanhash_sha512256d_2x64( struct work *work, uint32_t max_nonce, v128u64_t *noncev = (v128u64_t*)vdata + 9; const int thr_id = mythr->id; const bool bench = opt_benchmark; - const v128u64_t two = v128_64( 0x0000000200000000 ); + const v128_t two = v128_64( 0x0000000200000000 ); v128_bswap32_intrlv80_2x64( vdata, pdata ); - *noncev = v128_add32( v128_set32( 1, 0, 0, 0 ), *noncev ); -// *noncev = v128_intrlv_blend_32( v128_set32( n+1, 0, n, 0 ), *noncev ); + *noncev = v128_intrlv_blend_32( v128_set32( n+1, 0, n, 0 ), *noncev ); do { @@ -279,7 +277,7 @@ int scanhash_sha512256d( struct work *work, uint32_t max_nonce, bool register_sha512256d_algo( algo_gate_t* gate ) { - gate->optimizations = AVX2_OPT | AVX512_OPT; + gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT | NEON_OPT; #if defined(SHA512256D_8WAY) gate->scanhash = (void*)&scanhash_sha512256d_8way; #elif defined(SHA512256D_4WAY) diff --git a/algo/shavite/shavite-hash-2way.c b/algo/shavite/shavite-hash-2way.c index 26a9ab25..66288ed4 100644 --- a/algo/shavite/shavite-hash-2way.c +++ b/algo/shavite/shavite-hash-2way.c @@ -71,7 +71,7 @@ static const uint32_t IV512[] = static void c512_2way( shavite512_2way_context *ctx, const void *msg ) { - const __m128i zero = _mm_setzero_si128(); + const v128_t zero = v128_zero; __m256i p0, p1, p2, p3, x; __m256i k00, k01, k02, k03, k10, k11, k12, k13; __m256i *m = (__m256i*)msg; @@ -278,7 +278,7 @@ c512_2way( shavite512_2way_context *ctx, const void *msg ) void shavite512_2way_init( shavite512_2way_context *ctx ) { __m256i *h = (__m256i*)ctx->h; - __m128i *iv = (__m128i*)IV512; + v128_t *iv = (v128_t*)IV512; h[0] = mm256_bcast_m128( iv[0] ); h[1] = mm256_bcast_m128( iv[1] ); @@ -358,7 +358,7 @@ void shavite512_2way_close( shavite512_2way_context *ctx, void *dst ) count.u32[3] = ctx->count3; casti_m256i( buf, 6 ) = mm256_bcast_m128( - _mm_insert_epi16( 
m128_zero, count.u16[0], 7 ) ); + _mm_insert_epi16( v128_zero, count.u16[0], 7 ) ); casti_m256i( buf, 7 ) = mm256_bcast_m128( _mm_set_epi16( 0x0200, count.u16[7], count.u16[6], count.u16[5], count.u16[4], count.u16[3], count.u16[2], count.u16[1] ) ); @@ -434,7 +434,7 @@ void shavite512_2way_update_close( shavite512_2way_context *ctx, void *dst, } casti_m256i( buf, 6 ) = mm256_bcast_m128( - _mm_insert_epi16( m128_zero, count.u16[0], 7 ) ); + _mm_insert_epi16( v128_zero, count.u16[0], 7 ) ); casti_m256i( buf, 7 ) = mm256_bcast_m128( _mm_set_epi16( 0x0200, count.u16[7], count.u16[6], count.u16[5], count.u16[4], count.u16[3], count.u16[2], count.u16[1] ) ); @@ -451,7 +451,7 @@ void shavite512_2way_full( shavite512_2way_context *ctx, void *dst, const void *data, size_t len ) { __m256i *h = (__m256i*)ctx->h; - __m128i *iv = (__m128i*)IV512; + v128_t *iv = (v128_t*)IV512; h[0] = mm256_bcast_m128( iv[0] ); h[1] = mm256_bcast_m128( iv[1] ); @@ -524,7 +524,7 @@ void shavite512_2way_full( shavite512_2way_context *ctx, void *dst, } casti_m256i( buf, 6 ) = mm256_bcast_m128( - _mm_insert_epi16( m128_zero, count.u16[0], 7 ) ); + _mm_insert_epi16( v128_zero, count.u16[0], 7 ) ); casti_m256i( buf, 7 ) = mm256_bcast_m128( _mm_set_epi16( 0x0200, count.u16[7], count.u16[6], count.u16[5], count.u16[4], count.u16[3], count.u16[2], count.u16[1] ) ); diff --git a/algo/shavite/shavite-hash-4way.c b/algo/shavite/shavite-hash-4way.c index 2e95e93d..9d3956db 100644 --- a/algo/shavite/shavite-hash-4way.c +++ b/algo/shavite/shavite-hash-4way.c @@ -303,7 +303,7 @@ void shavite512_4way_close( shavite512_4way_context *ctx, void *dst ) count.u32[3] = ctx->count3; casti_m512i( buf, 6 ) = mm512_bcast_m128( - _mm_insert_epi16( m128_zero, count.u16[0], 7 ) ); + _mm_insert_epi16( v128_zero, count.u16[0], 7 ) ); casti_m512i( buf, 7 ) = mm512_bcast_m128( _mm_set_epi16( 0x0200, count.u16[7], count.u16[6], count.u16[5], count.u16[4], count.u16[3], count.u16[2], count.u16[1] ) ); @@ -379,7 +379,7 @@ void shavite512_4way_update_close( shavite512_4way_context *ctx, void *dst, } casti_m512i( buf, 6 ) = mm512_bcast_m128( - _mm_insert_epi16( m128_zero, count.u16[0], 7 ) ); + _mm_insert_epi16( v128_zero, count.u16[0], 7 ) ); casti_m512i( buf, 7 ) = mm512_bcast_m128( _mm_set_epi16( 0x0200, count.u16[7], count.u16[6], count.u16[5], count.u16[4], count.u16[3], count.u16[2], count.u16[1] ) ); @@ -470,7 +470,7 @@ void shavite512_4way_full( shavite512_4way_context *ctx, void *dst, } casti_m512i( buf, 6 ) = mm512_bcast_m128( - _mm_insert_epi16( m128_zero, count.u16[0], 7 ) ); + _mm_insert_epi16( v128_zero, count.u16[0], 7 ) ); casti_m512i( buf, 7 ) = mm512_bcast_m128( _mm_set_epi16( 0x0200, count.u16[7], count.u16[6], count.u16[5], count.u16[4], count.u16[3], count.u16[2], count.u16[1] ) ); diff --git a/algo/skein/skein-4way.c b/algo/skein/skein-4way.c index b66c6c14..4c0be169 100644 --- a/algo/skein/skein-4way.c +++ b/algo/skein/skein-4way.c @@ -159,4 +159,69 @@ int scanhash_skein_4way( struct work *work, uint32_t max_nonce, return 0; } +#elif defined(SKEIN_2WAY) + +static __thread skein512_2x64_context skein512_2x64_ctx + __attribute__ ((aligned (64))); + +void skeinhash_2x64( void *state, const void *input ) +{ + uint64_t vhash64[8*2] __attribute__ ((aligned (32))); + uint32_t hash0[16] __attribute__ ((aligned (32))); + uint32_t hash1[16] __attribute__ ((aligned (32))); + skein512_2x64_context ctx_skein; + memcpy( &ctx_skein, &skein512_2x64_ctx, sizeof( ctx_skein ) ); + + skein512_2x64_final16( &ctx_skein, vhash64, input + (64*2) ); + + 
dintrlv_2x64( hash0, hash1, vhash64, 512 ); + + sha256_full( hash0, hash0, 64 ); + sha256_full( hash1, hash1, 64 ); + + intrlv_2x32( state, hash0, hash1, 256 ); +} + +int scanhash_skein_2x64( struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ) +{ + uint32_t vdata[20*2] __attribute__ ((aligned (32))); + uint32_t hash[8*2] __attribute__ ((aligned (32))); + uint32_t lane_hash[8] __attribute__ ((aligned (32))); + uint32_t *hash_d7 = &(hash[7<<1]); + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + const uint32_t targ_d7 = ptarget[7]; + const uint32_t first_nonce = pdata[19]; + const uint32_t last_nonce = max_nonce - 2; + uint32_t n = first_nonce; + v128u32_t *noncev = (v128u32_t*)vdata + 9; + const int thr_id = mythr->id; + const bool bench = opt_benchmark; + + v128_bswap32_intrlv80_2x64( vdata, pdata ); + skein512_2x64_prehash64( &skein512_2x64_ctx, vdata ); + *noncev = v128_intrlv_blend_32( v128_set32( n+1, 0, n, 0 ), *noncev ); + do + { + skeinhash_2x64( hash, vdata ); + for ( int lane = 0; lane < 2; lane++ ) + if ( unlikely( ( hash_d7[ lane ] <= targ_d7 ) && !bench ) ) + { + extr_lane_2x32( lane_hash, hash, lane, 256 ); + if ( valid_hash( lane_hash, ptarget ) ) + { + pdata[19] = bswap_32( n + lane ); + submit_solution( work, lane_hash, mythr ); + } + } + *noncev = v128_add32( *noncev, v128_64( 0x0000000200000000 ) ); + n += 2; + } while ( likely( (n < last_nonce) && !work_restart[thr_id].restart ) ); + + pdata[19] = n; + *hashes_done = n - first_nonce; + return 0; +} + #endif diff --git a/algo/skein/skein-gate.c b/algo/skein/skein-gate.c index 191aa154..ee964d04 100644 --- a/algo/skein/skein-gate.c +++ b/algo/skein/skein-gate.c @@ -3,16 +3,20 @@ bool register_skein_algo( algo_gate_t* gate ) { -#if defined (SKEIN_8WAY) - gate->optimizations = AVX2_OPT | AVX512_OPT; +#if defined(SKEIN_8WAY) + gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT; gate->scanhash = (void*)&scanhash_skein_8way; gate->hash = (void*)&skeinhash_8way; -#elif defined (SKEIN_4WAY) - gate->optimizations = AVX2_OPT | AVX512_OPT | SHA_OPT; +#elif defined(SKEIN_4WAY) + gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT | SHA_OPT | NEON_OPT; gate->scanhash = (void*)&scanhash_skein_4way; gate->hash = (void*)&skeinhash_4way; +#elif defined(SKEIN_2WAY) + gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT | SHA_OPT | NEON_OPT; + gate->scanhash = (void*)&scanhash_skein_2x64; + gate->hash = (void*)&skeinhash_2x64; #else - gate->optimizations = AVX2_OPT | AVX512_OPT | SHA_OPT; + gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT | SHA_OPT | NEON_OPT; gate->scanhash = (void*)&scanhash_skein; gate->hash = (void*)&skeinhash; #endif @@ -21,16 +25,15 @@ bool register_skein_algo( algo_gate_t* gate ) bool register_skein2_algo( algo_gate_t* gate ) { - gate->optimizations = AVX2_OPT | AVX512_OPT; -#if defined (SKEIN_8WAY) + gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT | NEON_OPT; +#if defined(SKEIN_8WAY) gate->scanhash = (void*)&scanhash_skein2_8way; - gate->hash = (void*)&skein2hash_8way; -#elif defined (SKEIN_4WAY) +#elif defined(SKEIN_4WAY) gate->scanhash = (void*)&scanhash_skein2_4way; - gate->hash = (void*)&skein2hash_4way; +#elif defined(SKEIN_2WAY) + gate->scanhash = (void*)&scanhash_skein2_2x64; #else gate->scanhash = (void*)&scanhash_skein2; - gate->hash = (void*)&skein2hash; #endif return true; }; diff --git a/algo/skein/skein-gate.h b/algo/skein/skein-gate.h index eba535ea..1bdae7f4 100644 --- a/algo/skein/skein-gate.h +++ b/algo/skein/skein-gate.h 
@@ -7,6 +7,8 @@ #define SKEIN_8WAY 1 #elif defined(__AVX2__) #define SKEIN_4WAY 1 +#elif defined(__SSE2__) || defined(__ARM_NEON) + #define SKEIN_2WAY 1 #endif #if defined(SKEIN_8WAY) @@ -29,6 +31,16 @@ void skein2hash_4way( void *output, const void *input ); int scanhash_skein2_4way( struct work *work, uint32_t max_nonce, uint64_t* hashes_done, struct thr_info *mythr ); +#elif defined(SKEIN_2WAY) + +void skeinhash_2x64( void *output, const void *input ); +int scanhash_skein_2x64( struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ); + +void skein2hash_2x64( void *output, const void *input ); +int scanhash_skein2_2x64( struct work *work, uint32_t max_nonce, + uint64_t* hashes_done, struct thr_info *mythr ); + #else void skeinhash( void *output, const void *input ); diff --git a/algo/skein/skein-hash-4way.c b/algo/skein/skein-hash-4way.c index e96e59aa..3a2327f8 100644 --- a/algo/skein/skein-hash-4way.c +++ b/algo/skein/skein-hash-4way.c @@ -675,11 +675,13 @@ void skein512_8way_full( skein512_8way_context *sc, void *out, const void *data, // Close - unsigned et; - - memset_zero_512( buf + (ptr>>3), (buf_size - ptr) >> 3 ); - et = 352 + ((bcount == 0) << 7); - UBI_BIG_8WAY( et, ptr ); + if ( ptr ) + { + unsigned et; + memset_zero_512( buf + (ptr>>3), (buf_size - ptr) >> 3 ); + et = 352 + ((bcount == 0) << 7); + UBI_BIG_8WAY( et, ptr ); + } memset_zero_512( buf, buf_size >> 3 ); bcount = 0; @@ -970,11 +972,13 @@ skein512_4way_full( skein512_4way_context *sc, void *out, const void *data, // Close - unsigned et; - - memset_zero_256( buf + (ptr>>3), (buf_size - ptr) >> 3 ); - et = 352 + ((bcount == 0) << 7); - UBI_BIG_4WAY( et, ptr ); + if ( ptr ) + { + unsigned et; + memset_zero_256( buf + (ptr>>3), (buf_size - ptr) >> 3 ); + et = 352 + ((bcount == 0) << 7); + UBI_BIG_4WAY( et, ptr ); + } memset_zero_256( buf, buf_size >> 3 ); bcount = 0; @@ -1364,11 +1368,13 @@ skein512_2x64_full( skein512_2x64_context *sc, void *out, const void *data, // Close - unsigned et; - - v128_memset_zero( buf + (ptr>>3), (buf_size - ptr) >> 3 ); - et = 352 + ((bcount == 0) << 7); - UBI_BIG_2WAY( et, ptr ); + if ( ptr ) + { + unsigned et; + v128_memset_zero( buf + (ptr>>3), (buf_size - ptr) >> 3 ); + et = 352 + ((bcount == 0) << 7); + UBI_BIG_2WAY( et, ptr ); + } v128_memset_zero( buf, buf_size >> 3 ); bcount = 0; diff --git a/algo/skein/skein2-4way.c b/algo/skein/skein2-4way.c index 16b1627f..697d64b0 100644 --- a/algo/skein/skein2-4way.c +++ b/algo/skein/skein2-4way.c @@ -5,19 +5,6 @@ #if defined(SKEIN_8WAY) - static __thread skein512_8way_context skein512_8way_ctx - __attribute__ ((aligned (64))); - -void skein2hash_8way( void *output, const void *input ) -{ - uint64_t hash[16*8] __attribute__ ((aligned (128))); - skein512_8way_context ctx; - memcpy( &ctx, &skein512_8way_ctx, sizeof( ctx ) ); - - skein512_8way_final16( &ctx, hash, input + (64*8) ); - skein512_8way_full( &ctx, output, hash, 64 ); -} - int scanhash_skein2_8way( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ) { @@ -68,19 +55,6 @@ int scanhash_skein2_8way( struct work *work, uint32_t max_nonce, #elif defined(SKEIN_4WAY) -static __thread skein512_4way_context skein512_4way_ctx - __attribute__ ((aligned (64))); - -void skein2hash_4way( void *output, const void *input ) -{ - skein512_4way_context ctx; - memcpy( &ctx, &skein512_4way_ctx, sizeof( ctx ) ); - uint64_t hash[16*4] __attribute__ ((aligned (64))); - - skein512_4way_final16( &ctx, hash, input + (64*4) ); - 
skein512_4way_full( &ctx, output, hash, 64 ); -} - int scanhash_skein2_4way( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ) { @@ -128,4 +102,53 @@ int scanhash_skein2_4way( struct work *work, uint32_t max_nonce, return 0; } +#elif defined(SKEIN_2WAY) + +int scanhash_skein2_2x64( struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ) +{ + uint64_t hash[8*2] __attribute__ ((aligned (64))); + uint32_t vdata[20*2] __attribute__ ((aligned (64))); + skein512_2x64_context ctx; + uint32_t lane_hash[8] __attribute__ ((aligned (32))); + uint64_t *hash_q3 = &(hash[3*2]); + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + const uint64_t targ_q3 = ((uint64_t*)ptarget)[3]; + const uint32_t first_nonce = pdata[19]; + const uint32_t last_nonce = max_nonce - 4; + uint32_t n = first_nonce; + v128u64_t *noncev = (v128u64_t*)vdata + 9; + const int thr_id = mythr->id; + const bool bench = opt_benchmark; + const v128u64_t two = v128_64( 0x0000000200000000 ); + + v128_bswap32_intrlv80_2x64( vdata, pdata ); + skein512_2x64_prehash64( &ctx, vdata ); + *noncev = v128_intrlv_blend_32( v128_set32( n+1, 0, n, 0 ), *noncev ); + + do + { + skein512_2x64_final16( &ctx, hash, vdata + (16*2) ); + skein512_2x64_full( &ctx, hash, hash, 64 ); + + for ( int lane = 0; lane < 2; lane++ ) + if ( hash_q3[ lane ] <= targ_q3 ) + { + extr_lane_2x64( lane_hash, hash, lane, 256 ); + if ( valid_hash( lane_hash, ptarget ) && !bench ) + { + pdata[19] = bswap_32( n + lane ); + submit_solution( work, lane_hash, mythr ); + } + } + *noncev = v128_add32( *noncev, two ); + n += 2; + } while ( (n < last_nonce) && !work_restart[thr_id].restart ); + + pdata[19] = n; + *hashes_done = n - first_nonce; + return 0; +} + #endif diff --git a/algo/verthash/Verthash.c b/algo/verthash/Verthash.c index 2b70f93d..cbd8d84d 100644 --- a/algo/verthash/Verthash.c +++ b/algo/verthash/Verthash.c @@ -191,7 +191,7 @@ static void rotate_indexes( uint32_t *p ) *(__m256i*)hash = _mm256_mullo_epi32( _mm256_xor_si256( \ *(__m256i*)hash, *(__m256i*)blob_off ), k ); -#elif defined(__SSE4_1__) // || defined(__ARM_NEON) +#elif defined(__SSE4_1__) || defined(__ARM_NEON) #define MULXOR \ casti_v128( hash, 0 ) = v128_mul32( v128_xor( \ @@ -251,7 +251,7 @@ void verthash_hash( const void *blob_bytes, const size_t blob_size, / VH_BYTE_ALIGNMENT ) + 1; #if defined (__AVX2__) const __m256i k = _mm256_set1_epi32( 0x1000193 ); -#elif defined(__SSE4_1__) // || defined(__ARM_NEON) +#elif defined(__SSE4_1__) || defined(__ARM_NEON) const v128u32_t k = v128_32( 0x1000193 ); #endif diff --git a/algo/verthash/verthash-gate.c b/algo/verthash/verthash-gate.c index d816984c..3238fb7d 100644 --- a/algo/verthash/verthash-gate.c +++ b/algo/verthash/verthash-gate.c @@ -129,7 +129,7 @@ bool register_verthash_algo( algo_gate_t* gate ) { opt_target_factor = 256.0; gate->scanhash = (void*)&scanhash_verthash; - gate->optimizations = SSE42_OPT | AVX2_OPT; + gate->optimizations = SSE42_OPT | AVX2_OPT | NEON_OPT; const char *verthash_data_file = opt_data_file ? 
opt_data_file : default_verthash_data_file; diff --git a/algo/x16/minotaur.c b/algo/x16/minotaur.c index da97d218..d1e3e772 100644 --- a/algo/x16/minotaur.c +++ b/algo/x16/minotaur.c @@ -11,7 +11,9 @@ #include "algo/keccak/sph_keccak.h" #include "algo/skein/sph_skein.h" #include "algo/shavite/sph_shavite.h" +#include "algo/luffa/luffa_for_sse2.h" #include "algo/cubehash/cubehash_sse2.h" +#include "algo/simd/simd-hash-2way.h" #if defined(__aarch64__) #include "algo/simd/sph_simd.h" #endif @@ -31,8 +33,6 @@ #else #include "algo/fugue/sph_fugue.h" #endif -#include "algo/luffa/luffa_for_sse2.h" -#include "algo/simd/nist.h" // Config #define MINOTAUR_ALGO_COUNT 16 @@ -69,11 +69,7 @@ struct TortureGarden cubehashParam cube; shavite512_context shavite; hashState_luffa luffa; -#if defined(__aarch64__) - sph_simd512_context simd; -#else - hashState_sd simd; -#endif + simd512_context simd; sph_hamsi512_context hamsi; sph_shabal512_context shabal; sph_whirlpool_context whirlpool; @@ -165,13 +161,7 @@ static int get_hash( void *output, const void *input, TortureGarden *garden, sph_shavite512_close( &garden->shavite, hash ); break; case 13: -#if defined(__aarch64__) - sph_simd512_init( &garden->simd ); - sph_simd512( &garden->simd, input, 64); - sph_simd512_close( &garden->simd, hash ); -#else - simd_full( &garden->simd, (BitSequence *)hash, input, 512 ); -#endif + simd512_ctx( &garden->simd, hash, input, 64 ); break; case 14: sph_skein512_init( &garden->skein ); diff --git a/algo/x17/x17-4way.c b/algo/x17/x17-4way.c index 75626f2f..edc9a3c1 100644 --- a/algo/x17/x17-4way.c +++ b/algo/x17/x17-4way.c @@ -931,15 +931,19 @@ int scanhash_x17_4x64( struct work *work, uint32_t max_nonce, // Need sph in some cases #include "algo/luffa/luffa_for_sse2.h" #include "algo/cubehash/cubehash_sse2.h" -#include "algo/simd/sph_simd.h" -#include "algo/simd/nist.h" -#include "algo/hamsi/sph_hamsi.h" +//#include "algo/simd/sph_simd.h" +//#include "algo/simd/nist.h" +#if !( defined(__SSE4_2__) || defined(__ARM_NEON) ) + #include "algo/hamsi/sph_hamsi.h" +#endif #include "algo/shabal/sph_shabal.h" #include "algo/haval/sph-haval.h" -//#if !( defined(__AES__) || defined(__ARM_FEATURE_AES) ) +#if !( defined(__AES__) ) //|| defined(__ARM_FEATURE_AES) ) #include "algo/groestl/sph_groestl.h" +#endif +#if !( defined(__AES__) || defined(__ARM_FEATURE_AES) ) #include "algo/echo/sph_echo.h" -//#endif +#endif #include "algo/fugue/sph_fugue.h" union _x17_context_overlay @@ -967,12 +971,8 @@ union _x17_context_overlay hashState_luffa luffa; cubehashParam cube; sph_shavite512_context shavite; -#if defined(__x86_64__) simd512_context simd; -#else - sph_simd512_context simd; -#endif -#if defined(__SSE4_2__) // || defined(__ARM_NEON) +#if defined(__SSE4_2__) || defined(__ARM_NEON) hamsi_2x64_context hamsi; #else sph_hamsi512_context hamsi; @@ -1033,17 +1033,8 @@ int x17_2x64_hash( void *output, const void *input, int thr_id ) sph_shavite512( &ctx.shavite, hash1, 64 ); sph_shavite512_close( &ctx.shavite, hash1 ); -#if defined(__x86_64__) simd512_ctx( &ctx.simd, hash0, hash0, 64 ); simd512_ctx( &ctx.simd, hash1, hash1, 64 ); -#else - sph_simd512_init( &ctx.simd ); - sph_simd512( &ctx.simd, hash0, 64 ); - sph_simd512_close( &ctx.simd, hash0 ); - sph_simd512_init( &ctx.simd ); - sph_simd512( &ctx.simd, hash1, 64 ); - sph_simd512_close( &ctx.simd, hash1 ); -#endif #if defined(__AES__) || defined(__ARM_FEATURE_AES) echo_full( &ctx.echo, hash0, 512, hash0, 64 ); @@ -1057,7 +1048,7 @@ int x17_2x64_hash( void *output, const void *input, int thr_id ) 
sph_echo512_close( &ctx.echo, hash1 ); #endif -#if defined(__SSE4_2__) // || defined(__ARM_NEON) +#if defined(__SSE4_2__) || defined(__ARM_NEON) intrlv_2x64( vhash, hash0, hash1, 512 ); hamsi512_2x64_ctx( &ctx.hamsi, vhash, vhash, 64 ); dintrlv_2x64( hash0, hash1, vhash, 512 ); diff --git a/configure b/configure index 94a78192..1dc92efb 100755 --- a/configure +++ b/configure @@ -1,6 +1,6 @@ #! /bin/sh # Guess values for system-dependent variables and create Makefiles. -# Generated by GNU Autoconf 2.71 for cpuminer-opt 23.9. +# Generated by GNU Autoconf 2.71 for cpuminer-opt 23.10. # # # Copyright (C) 1992-1996, 1998-2017, 2020-2021 Free Software Foundation, @@ -608,8 +608,8 @@ MAKEFLAGS= # Identity of this package. PACKAGE_NAME='cpuminer-opt' PACKAGE_TARNAME='cpuminer-opt' -PACKAGE_VERSION='23.9' -PACKAGE_STRING='cpuminer-opt 23.9' +PACKAGE_VERSION='23.10' +PACKAGE_STRING='cpuminer-opt 23.10' PACKAGE_BUGREPORT='' PACKAGE_URL='' @@ -1360,7 +1360,7 @@ if test "$ac_init_help" = "long"; then # Omit some internal or obsolete options to make the list less imposing. # This message is too long to be a string in the A/UX 3.1 sh. cat <<_ACEOF -\`configure' configures cpuminer-opt 23.9 to adapt to many kinds of systems. +\`configure' configures cpuminer-opt 23.10 to adapt to many kinds of systems. Usage: $0 [OPTION]... [VAR=VALUE]... @@ -1432,7 +1432,7 @@ fi if test -n "$ac_init_help"; then case $ac_init_help in - short | recursive ) echo "Configuration of cpuminer-opt 23.9:";; + short | recursive ) echo "Configuration of cpuminer-opt 23.10:";; esac cat <<\_ACEOF @@ -1538,7 +1538,7 @@ fi test -n "$ac_init_help" && exit $ac_status if $ac_init_version; then cat <<\_ACEOF -cpuminer-opt configure 23.9 +cpuminer-opt configure 23.10 generated by GNU Autoconf 2.71 Copyright (C) 2021 Free Software Foundation, Inc. @@ -1985,7 +1985,7 @@ cat >config.log <<_ACEOF This file contains any messages produced by compilers while running configure, to aid debugging if configure makes a mistake. -It was created by cpuminer-opt $as_me 23.9, which was +It was created by cpuminer-opt $as_me 23.10, which was generated by GNU Autoconf 2.71. Invocation command line was $ $0$ac_configure_args_raw @@ -3593,7 +3593,7 @@ fi # Define the identity of the package. PACKAGE='cpuminer-opt' - VERSION='23.9' + VERSION='23.10' printf "%s\n" "#define PACKAGE \"$PACKAGE\"" >>confdefs.h @@ -7508,7 +7508,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 # report actual input values of CONFIG_FILES etc. instead of their # values after options handling. ac_log=" -This file was extended by cpuminer-opt $as_me 23.9, which was +This file was extended by cpuminer-opt $as_me 23.10, which was generated by GNU Autoconf 2.71. Invocation command line was CONFIG_FILES = $CONFIG_FILES @@ -7576,7 +7576,7 @@ ac_cs_config_escaped=`printf "%s\n" "$ac_cs_config" | sed "s/^ //; s/'/'\\\\\\\\ cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 ac_cs_config='$ac_cs_config_escaped' ac_cs_version="\\ -cpuminer-opt config.status 23.9 +cpuminer-opt config.status 23.10 configured by $0, generated by GNU Autoconf 2.71, with options \\"\$ac_cs_config\\" diff --git a/configure.ac b/configure.ac index 37033443..63be3584 100644 --- a/configure.ac +++ b/configure.ac @@ -1,4 +1,4 @@ -AC_INIT([cpuminer-opt], [23.9]) +AC_INIT([cpuminer-opt], [23.10]) AC_PREREQ([2.59c]) AC_CANONICAL_SYSTEM diff --git a/configure~ b/configure~ index 95a00e24..94a78192 100755 --- a/configure~ +++ b/configure~ @@ -1,6 +1,6 @@ #! 
/bin/sh # Guess values for system-dependent variables and create Makefiles. -# Generated by GNU Autoconf 2.71 for cpuminer-opt 23.8. +# Generated by GNU Autoconf 2.71 for cpuminer-opt 23.9. # # # Copyright (C) 1992-1996, 1998-2017, 2020-2021 Free Software Foundation, @@ -608,8 +608,8 @@ MAKEFLAGS= # Identity of this package. PACKAGE_NAME='cpuminer-opt' PACKAGE_TARNAME='cpuminer-opt' -PACKAGE_VERSION='23.8' -PACKAGE_STRING='cpuminer-opt 23.8' +PACKAGE_VERSION='23.9' +PACKAGE_STRING='cpuminer-opt 23.9' PACKAGE_BUGREPORT='' PACKAGE_URL='' @@ -1360,7 +1360,7 @@ if test "$ac_init_help" = "long"; then # Omit some internal or obsolete options to make the list less imposing. # This message is too long to be a string in the A/UX 3.1 sh. cat <<_ACEOF -\`configure' configures cpuminer-opt 23.8 to adapt to many kinds of systems. +\`configure' configures cpuminer-opt 23.9 to adapt to many kinds of systems. Usage: $0 [OPTION]... [VAR=VALUE]... @@ -1432,7 +1432,7 @@ fi if test -n "$ac_init_help"; then case $ac_init_help in - short | recursive ) echo "Configuration of cpuminer-opt 23.8:";; + short | recursive ) echo "Configuration of cpuminer-opt 23.9:";; esac cat <<\_ACEOF @@ -1538,7 +1538,7 @@ fi test -n "$ac_init_help" && exit $ac_status if $ac_init_version; then cat <<\_ACEOF -cpuminer-opt configure 23.8 +cpuminer-opt configure 23.9 generated by GNU Autoconf 2.71 Copyright (C) 2021 Free Software Foundation, Inc. @@ -1985,7 +1985,7 @@ cat >config.log <<_ACEOF This file contains any messages produced by compilers while running configure, to aid debugging if configure makes a mistake. -It was created by cpuminer-opt $as_me 23.8, which was +It was created by cpuminer-opt $as_me 23.9, which was generated by GNU Autoconf 2.71. Invocation command line was $ $0$ac_configure_args_raw @@ -3593,7 +3593,7 @@ fi # Define the identity of the package. PACKAGE='cpuminer-opt' - VERSION='23.8' + VERSION='23.9' printf "%s\n" "#define PACKAGE \"$PACKAGE\"" >>confdefs.h @@ -7508,7 +7508,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 # report actual input values of CONFIG_FILES etc. instead of their # values after options handling. ac_log=" -This file was extended by cpuminer-opt $as_me 23.8, which was +This file was extended by cpuminer-opt $as_me 23.9, which was generated by GNU Autoconf 2.71. Invocation command line was CONFIG_FILES = $CONFIG_FILES @@ -7576,7 +7576,7 @@ ac_cs_config_escaped=`printf "%s\n" "$ac_cs_config" | sed "s/^ //; s/'/'\\\\\\\\ cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 ac_cs_config='$ac_cs_config_escaped' ac_cs_version="\\ -cpuminer-opt config.status 23.8 +cpuminer-opt config.status 23.9 configured by $0, generated by GNU Autoconf 2.71, with options \\"\$ac_cs_config\\" diff --git a/cpu-miner.c b/cpu-miner.c index bbcd0ab6..9e5b33c6 100644 --- a/cpu-miner.c +++ b/cpu-miner.c @@ -2968,8 +2968,12 @@ static bool cpu_capability( bool display_only ) printf(" Linux\n"); #elif defined(WIN32) printf(" Windows\n"); + #elif defined(__APPLE__) + printf(" MacOS\n"); +#elif defined(__unix__) || defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__NetBSD__) + printf(" Unix\n"); #else - printf("\n"); + printf("\n"); #endif printf("CPU features: "); diff --git a/miner.h b/miner.h index 0c6d70f1..cfa54a2d 100644 --- a/miner.h +++ b/miner.h @@ -3,12 +3,16 @@ #include +#if !( defined(__SSE2__) || ( defined(__aarch64__) && defined(__ARM_NEON) ) ) +#warning "Unknown or unsupported CPU, requires x86_64 with SSE2 or AArch64 with NEON." 
+#endif + #if defined(__x86_64__) #define USER_AGENT_ARCH "x64" // Intel, AMD x86_64 #elif defined(__aarch64__) #define USER_AGENT_ARCH "arm" // AArch64 //#elif -// #define USER_AGENT_ARCH "R5" // RISC-V +// #define USER_AGENT_ARCH "r5" // RISC-V #else #define USER_AGENT_ARCH #endif diff --git a/simd-utils/intrlv.h b/simd-utils/intrlv.h index e9813575..dfb9c71b 100644 --- a/simd-utils/intrlv.h +++ b/simd-utils/intrlv.h @@ -411,11 +411,11 @@ static inline void v128_bswap32_80( void *d, void *s ) { const v128_t bswap_shuf = _mm_set_epi64x( 0x0c0d0e0f08090a0b, 0x0405060700010203 ); - casti_m128i( d, 0 ) = _mm_shuffle_epi8( casti_m128i( s, 0 ), bswap_shuf ); - casti_m128i( d, 1 ) = _mm_shuffle_epi8( casti_m128i( s, 1 ), bswap_shuf ); - casti_m128i( d, 2 ) = _mm_shuffle_epi8( casti_m128i( s, 2 ), bswap_shuf ); - casti_m128i( d, 3 ) = _mm_shuffle_epi8( casti_m128i( s, 3 ), bswap_shuf ); - casti_m128i( d, 4 ) = _mm_shuffle_epi8( casti_m128i( s, 4 ), bswap_shuf ); + casti_v128( d, 0 ) = _mm_shuffle_epi8( casti_v128( s, 0 ), bswap_shuf ); + casti_v128( d, 1 ) = _mm_shuffle_epi8( casti_v128( s, 1 ), bswap_shuf ); + casti_v128( d, 2 ) = _mm_shuffle_epi8( casti_v128( s, 2 ), bswap_shuf ); + casti_v128( d, 3 ) = _mm_shuffle_epi8( casti_v128( s, 3 ), bswap_shuf ); + casti_v128( d, 4 ) = _mm_shuffle_epi8( casti_v128( s, 4 ), bswap_shuf ); } #elif defined(__aarch64__) && defined(__ARM_NEON) @@ -461,11 +461,11 @@ static inline void v128_bswap32_80( void *d, void *s ) static inline void v128_bswap32_intrlv80_4x32( void *d, const void *src ) { - v128_t s0 = casti_m128i( src,0 ); - v128_t s1 = casti_m128i( src,1 ); - v128_t s2 = casti_m128i( src,2 ); - v128_t s3 = casti_m128i( src,3 ); - v128_t s4 = casti_m128i( src,4 ); + v128_t s0 = casti_v128( src,0 ); + v128_t s1 = casti_v128( src,1 ); + v128_t s2 = casti_v128( src,2 ); + v128_t s3 = casti_v128( src,3 ); + v128_t s4 = casti_v128( src,4 ); #if defined(__SSSE3__) @@ -480,38 +480,38 @@ static inline void v128_bswap32_intrlv80_4x32( void *d, const void *src ) #else - s0 = mm128_bswap_32( s0 ); - s1 = mm128_bswap_32( s1 ); - s2 = mm128_bswap_32( s2 ); - s3 = mm128_bswap_32( s3 ); - s4 = mm128_bswap_32( s4 ); + s0 = v128_bswap32( s0 ); + s1 = v128_bswap32( s1 ); + s2 = v128_bswap32( s2 ); + s3 = v128_bswap32( s3 ); + s4 = v128_bswap32( s4 ); #endif - casti_m128i( d, 0 ) = _mm_shuffle_epi32( s0, 0x00 ); - casti_m128i( d, 1 ) = _mm_shuffle_epi32( s0, 0x55 ); - casti_m128i( d, 2 ) = _mm_shuffle_epi32( s0, 0xaa ); - casti_m128i( d, 3 ) = _mm_shuffle_epi32( s0, 0xff ); + casti_v128( d, 0 ) = _mm_shuffle_epi32( s0, 0x00 ); + casti_v128( d, 1 ) = _mm_shuffle_epi32( s0, 0x55 ); + casti_v128( d, 2 ) = _mm_shuffle_epi32( s0, 0xaa ); + casti_v128( d, 3 ) = _mm_shuffle_epi32( s0, 0xff ); - casti_m128i( d, 4 ) = _mm_shuffle_epi32( s1, 0x00 ); - casti_m128i( d, 5 ) = _mm_shuffle_epi32( s1, 0x55 ); - casti_m128i( d, 6 ) = _mm_shuffle_epi32( s1, 0xaa ); - casti_m128i( d, 7 ) = _mm_shuffle_epi32( s1, 0xff ); + casti_v128( d, 4 ) = _mm_shuffle_epi32( s1, 0x00 ); + casti_v128( d, 5 ) = _mm_shuffle_epi32( s1, 0x55 ); + casti_v128( d, 6 ) = _mm_shuffle_epi32( s1, 0xaa ); + casti_v128( d, 7 ) = _mm_shuffle_epi32( s1, 0xff ); - casti_m128i( d, 8 ) = _mm_shuffle_epi32( s2, 0x00 ); - casti_m128i( d, 9 ) = _mm_shuffle_epi32( s2, 0x55 ); - casti_m128i( d,10 ) = _mm_shuffle_epi32( s2, 0xaa ); - casti_m128i( d,11 ) = _mm_shuffle_epi32( s2, 0xff ); + casti_v128( d, 8 ) = _mm_shuffle_epi32( s2, 0x00 ); + casti_v128( d, 9 ) = _mm_shuffle_epi32( s2, 0x55 ); + casti_v128( d,10 ) = 
_mm_shuffle_epi32( s2, 0xaa ); + casti_v128( d,11 ) = _mm_shuffle_epi32( s2, 0xff ); - casti_m128i( d,12 ) = _mm_shuffle_epi32( s3, 0x00 ); - casti_m128i( d,13 ) = _mm_shuffle_epi32( s3, 0x55 ); - casti_m128i( d,14 ) = _mm_shuffle_epi32( s3, 0xaa ); - casti_m128i( d,15 ) = _mm_shuffle_epi32( s3, 0xff ); + casti_v128( d,12 ) = _mm_shuffle_epi32( s3, 0x00 ); + casti_v128( d,13 ) = _mm_shuffle_epi32( s3, 0x55 ); + casti_v128( d,14 ) = _mm_shuffle_epi32( s3, 0xaa ); + casti_v128( d,15 ) = _mm_shuffle_epi32( s3, 0xff ); - casti_m128i( d,16 ) = _mm_shuffle_epi32( s4, 0x00 ); - casti_m128i( d,17 ) = _mm_shuffle_epi32( s4, 0x55 ); - casti_m128i( d,18 ) = _mm_shuffle_epi32( s4, 0xaa ); - casti_m128i( d,19 ) = _mm_shuffle_epi32( s4, 0xff ); + casti_v128( d,16 ) = _mm_shuffle_epi32( s4, 0x00 ); + casti_v128( d,17 ) = _mm_shuffle_epi32( s4, 0x55 ); + casti_v128( d,18 ) = _mm_shuffle_epi32( s4, 0xaa ); + casti_v128( d,19 ) = _mm_shuffle_epi32( s4, 0xff ); } #elif defined(__aarch64__) && defined(__ARM_NEON) @@ -797,11 +797,11 @@ static inline void mm256_bswap32_intrlv80_8x32( void *d, const void *src ) const __m256i c1 = v256_32( 0x04050607 ); const __m256i c2 = v256_32( 0x08090a0b ); const __m256i c3 = v256_32( 0x0c0d0e0f ); - const v128_t s0 = casti_m128i( src,0 ); - const v128_t s1 = casti_m128i( src,1 ); - const v128_t s2 = casti_m128i( src,2 ); - const v128_t s3 = casti_m128i( src,3 ); - const v128_t s4 = casti_m128i( src,4 ); + const v128_t s0 = casti_v128( src,0 ); + const v128_t s1 = casti_v128( src,1 ); + const v128_t s2 = casti_v128( src,2 ); + const v128_t s3 = casti_v128( src,3 ); + const v128_t s4 = casti_v128( src,4 ); casti_m256i( d, 0 ) = _mm256_permutexvar_epi8( c0, _mm256_castsi128_si256( s0 ) ); @@ -855,11 +855,11 @@ static inline void mm256_bswap32_intrlv80_8x32( void *d, const void *src ) const __m256i c2 = _mm256_add_epi32( c1, c1 ); const __m256i c3 = _mm256_add_epi32( c2, c1 ); - v128_t s0 = casti_m128i( src,0 ); - v128_t s1 = casti_m128i( src,1 ); - v128_t s2 = casti_m128i( src,2 ); - v128_t s3 = casti_m128i( src,3 ); - v128_t s4 = casti_m128i( src,4 ); + v128_t s0 = casti_v128( src,0 ); + v128_t s1 = casti_v128( src,1 ); + v128_t s2 = casti_v128( src,2 ); + v128_t s3 = casti_v128( src,3 ); + v128_t s4 = casti_v128( src,4 ); s0 = _mm_shuffle_epi8( s0, bswap_shuf ); s1 = _mm_shuffle_epi8( s1, bswap_shuf ); @@ -1303,11 +1303,11 @@ static inline void mm512_bswap32_intrlv80_16x32( void *d, const void *src ) const __m512i c1 = v512_32( 0x04050607 ); const __m512i c2 = v512_32( 0x08090a0b ); const __m512i c3 = v512_32( 0x0c0d0e0f ); - const v128_t s0 = casti_m128i( src,0 ); - const v128_t s1 = casti_m128i( src,1 ); - const v128_t s2 = casti_m128i( src,2 ); - const v128_t s3 = casti_m128i( src,3 ); - const v128_t s4 = casti_m128i( src,4 ); + const v128_t s0 = casti_v128( src,0 ); + const v128_t s1 = casti_v128( src,1 ); + const v128_t s2 = casti_v128( src,2 ); + const v128_t s3 = casti_v128( src,3 ); + const v128_t s4 = casti_v128( src,4 ); casti_m512i( d, 0 ) = _mm512_permutexvar_epi8( c0, _mm512_castsi128_si512( s0 ) ); @@ -1360,11 +1360,11 @@ static inline void mm512_bswap32_intrlv80_16x32( void *d, const void *src ) const __m512i c1 = v512_32( 1 ); const __m512i c2 = _mm512_add_epi32( c1, c1 ); const __m512i c3 = _mm512_add_epi32( c2, c1 ); - v128_t s0 = casti_m128i( src,0 ); - v128_t s1 = casti_m128i( src,1 ); - v128_t s2 = casti_m128i( src,2 ); - v128_t s3 = casti_m128i( src,3 ); - v128_t s4 = casti_m128i( src,4 ); + v128_t s0 = casti_v128( src,0 ); + v128_t s1 = casti_v128( 
src,1 ); + v128_t s2 = casti_v128( src,2 ); + v128_t s3 = casti_v128( src,3 ); + v128_t s4 = casti_v128( src,4 ); s0 = _mm_shuffle_epi8( s0, bswap_shuf ); s1 = _mm_shuffle_epi8( s1, bswap_shuf ); @@ -1492,20 +1492,20 @@ static inline void v128_bswap32_intrlv80_2x64( void *d, const void *src ) #if defined(__SSE2__) - casti_m128i( d,0 ) = _mm_shuffle_epi32( s0, 0x44 ); - casti_m128i( d,1 ) = _mm_shuffle_epi32( s0, 0xee ); + casti_v128( d,0 ) = _mm_shuffle_epi32( s0, 0x44 ); + casti_v128( d,1 ) = _mm_shuffle_epi32( s0, 0xee ); - casti_m128i( d,2 ) = _mm_shuffle_epi32( s1, 0x44 ); - casti_m128i( d,3 ) = _mm_shuffle_epi32( s1, 0xee ); + casti_v128( d,2 ) = _mm_shuffle_epi32( s1, 0x44 ); + casti_v128( d,3 ) = _mm_shuffle_epi32( s1, 0xee ); - casti_m128i( d,4 ) = _mm_shuffle_epi32( s2, 0x44 ); - casti_m128i( d,5 ) = _mm_shuffle_epi32( s2, 0xee ); + casti_v128( d,4 ) = _mm_shuffle_epi32( s2, 0x44 ); + casti_v128( d,5 ) = _mm_shuffle_epi32( s2, 0xee ); - casti_m128i( d,6 ) = _mm_shuffle_epi32( s3, 0x44 ); - casti_m128i( d,7 ) = _mm_shuffle_epi32( s3, 0xee ); + casti_v128( d,6 ) = _mm_shuffle_epi32( s3, 0x44 ); + casti_v128( d,7 ) = _mm_shuffle_epi32( s3, 0xee ); - casti_m128i( d,8 ) = _mm_shuffle_epi32( s4, 0x44 ); - casti_m128i( d,9 ) = _mm_shuffle_epi32( s4, 0xee ); + casti_v128( d,8 ) = _mm_shuffle_epi32( s4, 0x44 ); + casti_v128( d,9 ) = _mm_shuffle_epi32( s4, 0xee ); #elif defined(__ARM_NEON) @@ -1719,7 +1719,7 @@ static inline void mm256_intrlv80_4x64( void *d, const void *src ) { __m256i s0 = casti_m256i( src,0 ); __m256i s1 = casti_m256i( src,1 ); - v128_t s4 = casti_m128i( src,4 ); + v128_t s4 = casti_v128( src,4 ); casti_m256i( d, 0 ) = _mm256_permute4x64_epi64( s0, 0x00 ); casti_m256i( d, 1 ) = _mm256_permute4x64_epi64( s0, 0x55 ); @@ -1747,11 +1747,11 @@ static inline void mm256_bswap32_intrlv80_4x64( void *d, const void *src ) { const __m256i c0 = v256_64( 0x0405060700010203 ); const __m256i c1 = v256_64( 0x0c0d0e0f08090a0b ); - const v128_t s0 = casti_m128i( src,0 ); - const v128_t s1 = casti_m128i( src,1 ); - const v128_t s2 = casti_m128i( src,2 ); - const v128_t s3 = casti_m128i( src,3 ); - const v128_t s4 = casti_m128i( src,4 ); + const v128_t s0 = casti_v128( src,0 ); + const v128_t s1 = casti_v128( src,1 ); + const v128_t s2 = casti_v128( src,2 ); + const v128_t s3 = casti_v128( src,3 ); + const v128_t s4 = casti_v128( src,4 ); casti_m256i( d,0 ) = _mm256_permutexvar_epi8( c0, _mm256_castsi128_si256( s0 ) ); @@ -1783,7 +1783,7 @@ static inline void mm256_bswap32_intrlv80_4x64( void *d, const void *src ) _mm_set_epi64x( 0x0c0d0e0f08090a0b, 0x0405060700010203 ) ); __m256i s0 = casti_m256i( src,0 ); __m256i s1 = casti_m256i( src,1 ); - v128_t s4 = casti_m128i( src,4 ); + v128_t s4 = casti_v128( src,4 ); s0 = _mm256_shuffle_epi8( s0, bswap_shuf ); s1 = _mm256_shuffle_epi8( s1, bswap_shuf ); @@ -2162,11 +2162,11 @@ static inline void mm512_bswap32_intrlv80_8x64( void *d, const void *src ) { const __m512i c0 = v512_64( 0x0405060700010203 ); const __m512i c1 = v512_64( 0x0c0d0e0f08090a0b ); - const v128_t s0 = casti_m128i( src,0 ); - const v128_t s1 = casti_m128i( src,1 ); - const v128_t s2 = casti_m128i( src,2 ); - const v128_t s3 = casti_m128i( src,3 ); - const v128_t s4 = casti_m128i( src,4 ); + const v128_t s0 = casti_v128( src,0 ); + const v128_t s1 = casti_v128( src,1 ); + const v128_t s2 = casti_v128( src,2 ); + const v128_t s3 = casti_v128( src,3 ); + const v128_t s4 = casti_v128( src,4 ); casti_m512i( d,0 ) = _mm512_permutexvar_epi8( c0, _mm512_castsi128_si512( s0 ) ); @@ 
-2197,11 +2197,11 @@ static inline void mm512_bswap32_intrlv80_8x64( void *d, const void *src ) const v128_t bswap_shuf = _mm_set_epi64x( 0x0c0d0e0f08090a0b, 0x0405060700010203 ); const __m512i c1 = v512_64( 1 ); - v128_t s0 = casti_m128i( src,0 ); - v128_t s1 = casti_m128i( src,1 ); - v128_t s2 = casti_m128i( src,2 ); - v128_t s3 = casti_m128i( src,3 ); - v128_t s4 = casti_m128i( src,4 ); + v128_t s0 = casti_v128( src,0 ); + v128_t s1 = casti_v128( src,1 ); + v128_t s2 = casti_v128( src,2 ); + v128_t s3 = casti_v128( src,3 ); + v128_t s4 = casti_v128( src,4 ); s0 = _mm_shuffle_epi8( s0, bswap_shuf ); s1 = _mm_shuffle_epi8( s1, bswap_shuf ); @@ -2391,11 +2391,11 @@ static inline void mm512_bswap32_intrlv80_4x128( void *d, const void *src ) { const __m512i bswap_shuf = mm512_bcast_m128( _mm_set_epi64x( 0x0c0d0e0f08090a0b, 0x0405060700010203 ) ); - const v128_t s0 = casti_m128i( src,0 ); - const v128_t s1 = casti_m128i( src,1 ); - const v128_t s2 = casti_m128i( src,2 ); - const v128_t s3 = casti_m128i( src,3 ); - const v128_t s4 = casti_m128i( src,4 ); + const v128_t s0 = casti_v128( src,0 ); + const v128_t s1 = casti_v128( src,1 ); + const v128_t s2 = casti_v128( src,2 ); + const v128_t s3 = casti_v128( src,3 ); + const v128_t s4 = casti_v128( src,4 ); casti_m512i( d,0 ) = _mm512_permutexvar_epi8( _mm512_castsi128_si512( s0 ), bswap_shuf ); @@ -2415,11 +2415,11 @@ static inline void mm512_bswap32_intrlv80_4x128( void *d, const void *src ) { const v128_t bswap_shuf = _mm_set_epi64x( 0x0c0d0e0f08090a0b, 0x0405060700010203 ); - v128_t s0 = casti_m128i( src,0 ); - v128_t s1 = casti_m128i( src,1 ); - v128_t s2 = casti_m128i( src,2 ); - v128_t s3 = casti_m128i( src,3 ); - v128_t s4 = casti_m128i( src,4 ); + v128_t s0 = casti_v128( src,0 ); + v128_t s1 = casti_v128( src,1 ); + v128_t s2 = casti_v128( src,2 ); + v128_t s3 = casti_v128( src,3 ); + v128_t s4 = casti_v128( src,4 ); s0 = _mm_shuffle_epi8( s0, bswap_shuf ); s1 = _mm_shuffle_epi8( s1, bswap_shuf ); @@ -2489,44 +2489,44 @@ static inline void rintrlv_4x64_4x32( void *dst, const void *src, const v128_t *s = (const v128_t*)src; v128_t *d = (v128_t*)dst; - d[ 0] = mm128_shuffle2_32( s[ 0], s[ 1], 0x88 ); - d[ 1] = mm128_shuffle2_32( s[ 0], s[ 1], 0xdd ); - d[ 2] = mm128_shuffle2_32( s[ 2], s[ 3], 0x88 ); - d[ 3] = mm128_shuffle2_32( s[ 2], s[ 3], 0xdd ); - d[ 4] = mm128_shuffle2_32( s[ 4], s[ 5], 0x88 ); - d[ 5] = mm128_shuffle2_32( s[ 4], s[ 5], 0xdd ); - d[ 6] = mm128_shuffle2_32( s[ 6], s[ 7], 0x88 ); - d[ 7] = mm128_shuffle2_32( s[ 6], s[ 7], 0xdd ); + d[ 0] = v128_shuffle2_32( s[ 0], s[ 1], 0x88 ); + d[ 1] = v128_shuffle2_32( s[ 0], s[ 1], 0xdd ); + d[ 2] = v128_shuffle2_32( s[ 2], s[ 3], 0x88 ); + d[ 3] = v128_shuffle2_32( s[ 2], s[ 3], 0xdd ); + d[ 4] = v128_shuffle2_32( s[ 4], s[ 5], 0x88 ); + d[ 5] = v128_shuffle2_32( s[ 4], s[ 5], 0xdd ); + d[ 6] = v128_shuffle2_32( s[ 6], s[ 7], 0x88 ); + d[ 7] = v128_shuffle2_32( s[ 6], s[ 7], 0xdd ); if ( bit_len <= 256 ) return; - d[ 8] = mm128_shuffle2_32( s[ 8], s[ 9], 0x88 ); - d[ 9] = mm128_shuffle2_32( s[ 8], s[ 9], 0xdd ); - d[10] = mm128_shuffle2_32( s[10], s[11], 0x88 ); - d[11] = mm128_shuffle2_32( s[10], s[11], 0xdd ); - d[12] = mm128_shuffle2_32( s[12], s[13], 0x88 ); - d[13] = mm128_shuffle2_32( s[12], s[13], 0xdd ); - d[14] = mm128_shuffle2_32( s[14], s[15], 0x88 ); - d[15] = mm128_shuffle2_32( s[14], s[15], 0xdd ); + d[ 8] = v128_shuffle2_32( s[ 8], s[ 9], 0x88 ); + d[ 9] = v128_shuffle2_32( s[ 8], s[ 9], 0xdd ); + d[10] = v128_shuffle2_32( s[10], s[11], 0x88 ); + d[11] = 
v128_shuffle2_32( s[10], s[11], 0xdd ); + d[12] = v128_shuffle2_32( s[12], s[13], 0x88 ); + d[13] = v128_shuffle2_32( s[12], s[13], 0xdd ); + d[14] = v128_shuffle2_32( s[14], s[15], 0x88 ); + d[15] = v128_shuffle2_32( s[14], s[15], 0xdd ); if ( bit_len <= 512 ) return; - d[16] = mm128_shuffle2_32( s[16], s[17], 0x88 ); - d[17] = mm128_shuffle2_32( s[16], s[17], 0xdd ); - d[18] = mm128_shuffle2_32( s[18], s[19], 0x88 ); - d[19] = mm128_shuffle2_32( s[18], s[19], 0xdd ); - d[20] = mm128_shuffle2_32( s[20], s[21], 0x88 ); - d[21] = mm128_shuffle2_32( s[20], s[21], 0xdd ); - d[22] = mm128_shuffle2_32( s[22], s[23], 0x88 ); - d[23] = mm128_shuffle2_32( s[22], s[23], 0xdd ); - d[24] = mm128_shuffle2_32( s[24], s[25], 0x88 ); - d[25] = mm128_shuffle2_32( s[24], s[25], 0xdd ); - d[26] = mm128_shuffle2_32( s[26], s[27], 0x88 ); - d[27] = mm128_shuffle2_32( s[26], s[27], 0xdd ); - d[28] = mm128_shuffle2_32( s[28], s[29], 0x88 ); - d[29] = mm128_shuffle2_32( s[28], s[29], 0xdd ); - d[30] = mm128_shuffle2_32( s[30], s[31], 0x88 ); - d[31] = mm128_shuffle2_32( s[30], s[31], 0xdd ); + d[16] = v128_shuffle2_32( s[16], s[17], 0x88 ); + d[17] = v128_shuffle2_32( s[16], s[17], 0xdd ); + d[18] = v128_shuffle2_32( s[18], s[19], 0x88 ); + d[19] = v128_shuffle2_32( s[18], s[19], 0xdd ); + d[20] = v128_shuffle2_32( s[20], s[21], 0x88 ); + d[21] = v128_shuffle2_32( s[20], s[21], 0xdd ); + d[22] = v128_shuffle2_32( s[22], s[23], 0x88 ); + d[23] = v128_shuffle2_32( s[22], s[23], 0xdd ); + d[24] = v128_shuffle2_32( s[24], s[25], 0x88 ); + d[25] = v128_shuffle2_32( s[24], s[25], 0xdd ); + d[26] = v128_shuffle2_32( s[26], s[27], 0x88 ); + d[27] = v128_shuffle2_32( s[26], s[27], 0xdd ); + d[28] = v128_shuffle2_32( s[28], s[29], 0x88 ); + d[29] = v128_shuffle2_32( s[28], s[29], 0xdd ); + d[30] = v128_shuffle2_32( s[30], s[31], 0x88 ); + d[31] = v128_shuffle2_32( s[30], s[31], 0xdd ); // if ( bit_len <= 1024 ) return; } @@ -2537,77 +2537,77 @@ static inline void rintrlv_8x64_8x32( void *dst, const void *src, const v128_t *s = (const v128_t*)src; v128_t *d = (v128_t*)dst; - d[ 0] = mm128_shuffle2_32( s[ 0], s[ 1], 0x88 ); - d[ 1] = mm128_shuffle2_32( s[ 2], s[ 3], 0x88 ); - d[ 2] = mm128_shuffle2_32( s[ 0], s[ 1], 0xdd ); - d[ 3] = mm128_shuffle2_32( s[ 2], s[ 3], 0xdd ); - d[ 4] = mm128_shuffle2_32( s[ 4], s[ 5], 0x88 ); - d[ 5] = mm128_shuffle2_32( s[ 6], s[ 7], 0x88 ); - d[ 6] = mm128_shuffle2_32( s[ 4], s[ 5], 0xdd ); - d[ 7] = mm128_shuffle2_32( s[ 6], s[ 7], 0xdd ); - d[ 8] = mm128_shuffle2_32( s[ 8], s[ 9], 0x88 ); - d[ 9] = mm128_shuffle2_32( s[10], s[11], 0x88 ); - d[10] = mm128_shuffle2_32( s[ 8], s[ 9], 0xdd ); - d[11] = mm128_shuffle2_32( s[10], s[11], 0xdd ); - d[12] = mm128_shuffle2_32( s[12], s[13], 0x88 ); - d[13] = mm128_shuffle2_32( s[14], s[15], 0x88 ); - d[14] = mm128_shuffle2_32( s[12], s[13], 0xdd ); - d[15] = mm128_shuffle2_32( s[14], s[15], 0xdd ); + d[ 0] = v128_shuffle2_32( s[ 0], s[ 1], 0x88 ); + d[ 1] = v128_shuffle2_32( s[ 2], s[ 3], 0x88 ); + d[ 2] = v128_shuffle2_32( s[ 0], s[ 1], 0xdd ); + d[ 3] = v128_shuffle2_32( s[ 2], s[ 3], 0xdd ); + d[ 4] = v128_shuffle2_32( s[ 4], s[ 5], 0x88 ); + d[ 5] = v128_shuffle2_32( s[ 6], s[ 7], 0x88 ); + d[ 6] = v128_shuffle2_32( s[ 4], s[ 5], 0xdd ); + d[ 7] = v128_shuffle2_32( s[ 6], s[ 7], 0xdd ); + d[ 8] = v128_shuffle2_32( s[ 8], s[ 9], 0x88 ); + d[ 9] = v128_shuffle2_32( s[10], s[11], 0x88 ); + d[10] = v128_shuffle2_32( s[ 8], s[ 9], 0xdd ); + d[11] = v128_shuffle2_32( s[10], s[11], 0xdd ); + d[12] = v128_shuffle2_32( s[12], s[13], 0x88 ); + d[13] = 
v128_shuffle2_32( s[14], s[15], 0x88 ); + d[14] = v128_shuffle2_32( s[12], s[13], 0xdd ); + d[15] = v128_shuffle2_32( s[14], s[15], 0xdd ); if ( bit_len <= 256 ) return; - d[16] = mm128_shuffle2_32( s[16], s[17], 0x88 ); - d[17] = mm128_shuffle2_32( s[18], s[19], 0x88 ); - d[18] = mm128_shuffle2_32( s[16], s[17], 0xdd ); - d[19] = mm128_shuffle2_32( s[18], s[19], 0xdd ); - d[20] = mm128_shuffle2_32( s[20], s[21], 0x88 ); - d[21] = mm128_shuffle2_32( s[22], s[23], 0x88 ); - d[22] = mm128_shuffle2_32( s[20], s[21], 0xdd ); - d[23] = mm128_shuffle2_32( s[22], s[23], 0xdd ); - d[24] = mm128_shuffle2_32( s[24], s[25], 0x88 ); - d[25] = mm128_shuffle2_32( s[26], s[27], 0x88 ); - d[26] = mm128_shuffle2_32( s[24], s[25], 0xdd ); - d[27] = mm128_shuffle2_32( s[26], s[27], 0xdd ); - d[28] = mm128_shuffle2_32( s[28], s[29], 0x88 ); - d[29] = mm128_shuffle2_32( s[30], s[31], 0x88 ); - d[30] = mm128_shuffle2_32( s[28], s[29], 0xdd ); - d[31] = mm128_shuffle2_32( s[30], s[31], 0xdd ); + d[16] = v128_shuffle2_32( s[16], s[17], 0x88 ); + d[17] = v128_shuffle2_32( s[18], s[19], 0x88 ); + d[18] = v128_shuffle2_32( s[16], s[17], 0xdd ); + d[19] = v128_shuffle2_32( s[18], s[19], 0xdd ); + d[20] = v128_shuffle2_32( s[20], s[21], 0x88 ); + d[21] = v128_shuffle2_32( s[22], s[23], 0x88 ); + d[22] = v128_shuffle2_32( s[20], s[21], 0xdd ); + d[23] = v128_shuffle2_32( s[22], s[23], 0xdd ); + d[24] = v128_shuffle2_32( s[24], s[25], 0x88 ); + d[25] = v128_shuffle2_32( s[26], s[27], 0x88 ); + d[26] = v128_shuffle2_32( s[24], s[25], 0xdd ); + d[27] = v128_shuffle2_32( s[26], s[27], 0xdd ); + d[28] = v128_shuffle2_32( s[28], s[29], 0x88 ); + d[29] = v128_shuffle2_32( s[30], s[31], 0x88 ); + d[30] = v128_shuffle2_32( s[28], s[29], 0xdd ); + d[31] = v128_shuffle2_32( s[30], s[31], 0xdd ); if ( bit_len <= 512 ) return; - d[32] = mm128_shuffle2_32( s[32], s[33], 0x88 ); - d[33] = mm128_shuffle2_32( s[34], s[35], 0x88 ); - d[34] = mm128_shuffle2_32( s[32], s[33], 0xdd ); - d[35] = mm128_shuffle2_32( s[34], s[35], 0xdd ); - d[36] = mm128_shuffle2_32( s[36], s[37], 0x88 ); - d[37] = mm128_shuffle2_32( s[38], s[39], 0x88 ); - d[38] = mm128_shuffle2_32( s[36], s[37], 0xdd ); - d[39] = mm128_shuffle2_32( s[38], s[39], 0xdd ); - d[40] = mm128_shuffle2_32( s[40], s[41], 0x88 ); - d[41] = mm128_shuffle2_32( s[42], s[43], 0x88 ); - d[42] = mm128_shuffle2_32( s[40], s[41], 0xdd ); - d[43] = mm128_shuffle2_32( s[42], s[43], 0xdd ); - d[44] = mm128_shuffle2_32( s[44], s[45], 0x88 ); - d[45] = mm128_shuffle2_32( s[46], s[47], 0x88 ); - d[46] = mm128_shuffle2_32( s[44], s[45], 0xdd ); - d[47] = mm128_shuffle2_32( s[46], s[47], 0xdd ); - - d[48] = mm128_shuffle2_32( s[48], s[49], 0x88 ); - d[49] = mm128_shuffle2_32( s[50], s[51], 0x88 ); - d[50] = mm128_shuffle2_32( s[48], s[49], 0xdd ); - d[51] = mm128_shuffle2_32( s[50], s[51], 0xdd ); - d[52] = mm128_shuffle2_32( s[52], s[53], 0x88 ); - d[53] = mm128_shuffle2_32( s[54], s[55], 0x88 ); - d[54] = mm128_shuffle2_32( s[52], s[53], 0xdd ); - d[55] = mm128_shuffle2_32( s[54], s[55], 0xdd ); - d[56] = mm128_shuffle2_32( s[56], s[57], 0x88 ); - d[57] = mm128_shuffle2_32( s[58], s[59], 0x88 ); - d[58] = mm128_shuffle2_32( s[56], s[57], 0xdd ); - d[59] = mm128_shuffle2_32( s[58], s[59], 0xdd ); - d[60] = mm128_shuffle2_32( s[60], s[61], 0x88 ); - d[61] = mm128_shuffle2_32( s[62], s[63], 0x88 ); - d[62] = mm128_shuffle2_32( s[60], s[61], 0xdd ); - d[63] = mm128_shuffle2_32( s[62], s[63], 0xdd ); + d[32] = v128_shuffle2_32( s[32], s[33], 0x88 ); + d[33] = v128_shuffle2_32( s[34], s[35], 0x88 ); + 
d[34] = v128_shuffle2_32( s[32], s[33], 0xdd ); + d[35] = v128_shuffle2_32( s[34], s[35], 0xdd ); + d[36] = v128_shuffle2_32( s[36], s[37], 0x88 ); + d[37] = v128_shuffle2_32( s[38], s[39], 0x88 ); + d[38] = v128_shuffle2_32( s[36], s[37], 0xdd ); + d[39] = v128_shuffle2_32( s[38], s[39], 0xdd ); + d[40] = v128_shuffle2_32( s[40], s[41], 0x88 ); + d[41] = v128_shuffle2_32( s[42], s[43], 0x88 ); + d[42] = v128_shuffle2_32( s[40], s[41], 0xdd ); + d[43] = v128_shuffle2_32( s[42], s[43], 0xdd ); + d[44] = v128_shuffle2_32( s[44], s[45], 0x88 ); + d[45] = v128_shuffle2_32( s[46], s[47], 0x88 ); + d[46] = v128_shuffle2_32( s[44], s[45], 0xdd ); + d[47] = v128_shuffle2_32( s[46], s[47], 0xdd ); + + d[48] = v128_shuffle2_32( s[48], s[49], 0x88 ); + d[49] = v128_shuffle2_32( s[50], s[51], 0x88 ); + d[50] = v128_shuffle2_32( s[48], s[49], 0xdd ); + d[51] = v128_shuffle2_32( s[50], s[51], 0xdd ); + d[52] = v128_shuffle2_32( s[52], s[53], 0x88 ); + d[53] = v128_shuffle2_32( s[54], s[55], 0x88 ); + d[54] = v128_shuffle2_32( s[52], s[53], 0xdd ); + d[55] = v128_shuffle2_32( s[54], s[55], 0xdd ); + d[56] = v128_shuffle2_32( s[56], s[57], 0x88 ); + d[57] = v128_shuffle2_32( s[58], s[59], 0x88 ); + d[58] = v128_shuffle2_32( s[56], s[57], 0xdd ); + d[59] = v128_shuffle2_32( s[58], s[59], 0xdd ); + d[60] = v128_shuffle2_32( s[60], s[61], 0x88 ); + d[61] = v128_shuffle2_32( s[62], s[63], 0x88 ); + d[62] = v128_shuffle2_32( s[60], s[61], 0xdd ); + d[63] = v128_shuffle2_32( s[62], s[63], 0xdd ); // if ( bit_len <= 1024 ) return; } @@ -3248,12 +3248,21 @@ static inline void rintrlv_2x256_8x64( void *dst, const void *src0, // blend 2 vectors while interleaving: { hi[n], lo[n-1], ... hi[1], lo[0] } #if defined(__SSE4_1__) -// No SSE2 implementation. -//#define mm128_intrlv_blend_64( hi, lo ) _mm_blend_epi16( hi, lo, 0x0f ) -//#define mm128_intrlv_blend_32( hi, lo ) _mm_blend_epi16( hi, lo, 0x33 ) +#define v128_intrlv_blend_64( hi, lo ) _mm_blend_epi16( hi, lo, 0x0f ) +#define v128_intrlv_blend_32( hi, lo ) _mm_blend_epi16( hi, lo, 0x33 ) + +#elif defined(__SSE2__) || defined(__ARM_NEON) -#endif // SSE4_1 +#define v128_intrlv_blend_64( hi, lo ) \ + v128_blendv( hi, lo, v128_set64( 0ull, 0xffffffffffffffffull ) ) + +#define v128_intrlv_blend_32( hi, lo ) \ + v128_blendv( hi, lo, v128_set64( 0xffffffffull, 0xffffffffull ) ) + +#else +// unknown, unsupported architecture +#endif #if defined(__AVX2__) diff --git a/simd-utils/simd-128.h b/simd-utils/simd-128.h index 787cd676..02b3deb5 100644 --- a/simd-utils/simd-128.h +++ b/simd-utils/simd-128.h @@ -35,17 +35,17 @@ /////////////////////////////////////////////////////////////////////////////// // New architecturally agnostic syntax: -// All users of 128 bit SIMD should use new syntax or protect SSE2 only -// code segments. -// Other vector sizes continue with old syntax for now. -// Definitionns here will gradually be converted to new synytax. -// For consistency the larger vector utilities should do the same. - +// +// __m128i -> v128_t +// _mm_ -> v128_ +// mm128_ -> v128_ +// +// There is also new syntax to accomodate ARM's stricter type checking of +// vector element size. They have no effect on x86_64. 
// direct translation of native intrinsics #define v128_t __m128i -// Needed for ARM #define v128u64_t v128_t #define v128u32_t v128_t #define v128u16_t v128_t @@ -56,9 +56,9 @@ // Needed for ARM, Doesn't do anything special on x86_64 #define v128_load1_64(p) _mm_set1_epi64x(*(uint64_t*)(p) ) -#define v128_load1_32(p) _mm_set_epi32( *(uint32_t*)(p) ) -#define v128_load1_16(p) _mm_set_epi16( *(uint16_t*)(p) ) -#define v128_load1_8( p) _mm_set_epi8( *(uint8_t*) (p) ) +#define v128_load1_32(p) _mm_set1_epi32( *(uint32_t*)(p) ) +#define v128_load1_16(p) _mm_set1_epi16( *(uint16_t*)(p) ) +#define v128_load1_8( p) _mm_set1_epi8( *(uint8_t*) (p) ) // arithmetic #define v128_add64 _mm_add_epi64 @@ -80,7 +80,7 @@ #define v128_mulw32 _mm_mul_epu32 #define v128_mulw16 _mm_mul_epu16 -// compare +// signed compare #define v128_cmpeq64 _mm_cmpeq_epi64 #define v128_cmpeq32 _mm_cmpeq_epi32 #define v128_cmpeq16 _mm_cmpeq_epi16 @@ -118,20 +118,6 @@ #define v128_xor _mm_xor_si128 #define v128_xorq _mm_xor_si128 #define v128_andnot _mm_andnot_si128 -#define v128_xnor( a, b ) v128_not( _mm_xor_si128( a, b ) ) -#define v128_ornot( a, b ) _mm_or_si128( a, v128_not( b ) ) - -// ternary -#define v128_xorandnot( v2, v1, v0 ) \ - _mm_xor_si128( v2, _mm_andnot_si128( v1, v0 ) ) -#define v128_xor3( v2, v1, v0 ) \ - _mm_xor_si128( v2, _mm_xor_si128( v1, v0 ) ) -#define v128_and3( a, b, c ) _mm_and_si128( a, _mm_and_si128( b, c ) ) -#define v128_or3( a, b, c ) _mm_or_si128( a, _mm_or_si128( b, c ) ) -#define v128_xorand( a, b, c ) _mm_xor_si128( a, _mm_and_si128( b, c ) ) -#define v128_andxor( a, b, c ) _mm_and_si128( a, _mm_xor_si128( b, c )) -#define v128_xoror( a, b, c ) _mm_xor_si128( a, _mm_or_si128( b, c ) ) -#define v128_orand( a, b, c ) _mm_or_si128( a, _mm_and_si128( b, c ) ) // unpack #define v128_unpacklo64 _mm_unpacklo_epi64 @@ -234,24 +220,22 @@ static inline __m128i mm128_mov32_128( const uint32_t n ) // Pseudo constants #define v128_zero _mm_setzero_si128() -#define m128_zero _mm_setzero_si128() - #if defined(__SSE4_1__) // Bitwise AND, return 1 if result is all bits clear. -#define v128_and_eq0 _mm_testz_si128 +#define v128_and_eq0(v1, v0) _mm_testz_si128(v1, v0) +// v128_is_zero? static inline int v128_cmpeq0( v128_t v ) { return v128_and_eq0( v, v ); } #endif // Bitwise compare return 1 if all bits set. -#define v128_cmpeq1 _mm_test_all ones +#define v128_cmpeq1(v) _mm_test_all ones(v) -#define v128_one mm128_mov64_128( 1 ) -#define m128_one_128 v128_one +#define v128_one mm128_mov64_128(1) // ASM avoids the need to initialize return variable to avoid compiler warning. // Macro abstracts function parentheses to look like an identifier. 
@@ -265,17 +249,14 @@ static inline __m128i v128_neg1_fn() #endif return a; } -#define m128_neg1_fn v128_neg1_fn #define v128_neg1 v128_neg1_fn() -#define m128_neg1 v128_neg1 // // Vector pointer cast // p = any aligned pointer // returns p as pointer to vector type -#define castp_m128i(p) ((__m128i*)(p)) -#define castp_v128 castp_m128i +#define castp_v128(p) ((__m128i*)(p)) #define castp_v128u64 castp_v128 #define castp_v128u32 castp_v128 #define castp_v128u16 castp_v128 @@ -283,8 +264,7 @@ static inline __m128i v128_neg1_fn() // p = any aligned pointer // returns *p, watch your pointer arithmetic -#define cast_m128i(p) (*((__m128i*)(p))) -#define cast_v128 cast_m128i +#define cast_v128(p) (*((__m128i*)(p))) #define cast_v128u64 cast_v128 #define cast_v128u32 cast_v128 #define cast_v128u16 cast_v128 @@ -292,8 +272,8 @@ static inline __m128i v128_neg1_fn() // p = any aligned pointer, i = scaled array index // returns value p[i] -#define casti_m128i(p,i) (((__m128i*)(p))[(i)]) -#define casti_v128 casti_m128i +#define casti_v128(p,i) (((__m128i*)(p))[(i)]) +#define casti_m128i casti_v128 // deprecated #define casti_v128u64 casti_v128 #define casti_v128u32 casti_v128 #define casti_v128u16 casti_v128 @@ -301,7 +281,7 @@ static inline __m128i v128_neg1_fn() // p = any aligned pointer, o = scaled offset // returns pointer p+o -#define casto_m128i(p,o) (((__m128i*)(p))+(o)) +#define casto_v128(p,o) (((__m128i*)(p))+(o)) #if defined(__SSE4_1__) #define v128_get64( v, l ) _mm_extract_epi64( v, l ) @@ -316,7 +296,7 @@ static inline __m128i v128_neg1_fn() ///////////////////////////////////////////////////////////// // -// _mm_insert_ps( _mm128i v1, __m128i v2, imm8 c ) +// _mm_insert_ps( __m128i v1, __m128i v2, imm8 c ) // // Fast and powerful but very limited in its application. // It requires SSE4.1 but only works with 128 bit vectors with 32 bit @@ -371,37 +351,31 @@ static inline __m128i v128_neg1_fn() #if defined(__AVX512VL__) //TODO Enable for AVX10_256 -static inline __m128i mm128_not( const __m128i v ) +static inline __m128i v128_not( const __m128i v ) { return _mm_ternarylogic_epi64( v, v, v, 1 ); } #else -#define mm128_not( v ) _mm_xor_si128( v, m128_neg1 ) +#define v128_not( v ) _mm_xor_si128( v, v128_neg1 ) #endif -#define v128_not mm128_not -static inline __m128i mm128_negate_64( __m128i v ) +static inline v128u64_t v128_negate_64( v128u64_t v ) { return _mm_sub_epi64( _mm_xor_si128( v, v ), v ); } -#define v128_negate64 mm128_negate_64 -static inline __m128i mm128_negate_32( __m128i v ) +static inline v128u32_t v128_negate_32( v128u32_t v ) { return _mm_sub_epi32( _mm_xor_si128( v, v ), v ); } -#define v128_negate32 mm128_negate_32 -static inline __m128i mm128_negate_16( __m128i v ) +static inline v128u16_t v128_negate_16( v128u16_t v ) { return _mm_sub_epi16( _mm_xor_si128( v, v ), v ); } -#define v128_negate16 mm128_negate_16 // Add 4 values, fewer dependencies than sequential addition. 
#define v128_add4_64( a, b, c, d ) \ _mm_add_epi64( _mm_add_epi64( a, b ), _mm_add_epi64( c, d ) ) -#define mm128_add4_64 v128_add4_64 #define v128_add4_32( a, b, c, d ) \ _mm_add_epi32( _mm_add_epi32( a, b ), _mm_add_epi32( c, d ) ) -#define mm128_add4_32 v128_add4_32 #define v128_add4_16( a, b, c, d ) \ _mm_add_epi16( _mm_add_epi16( a, b ), _mm_add_epi16( c, d ) ) @@ -411,7 +385,6 @@ static inline __m128i mm128_negate_16( __m128i v ) #define v128_xor4( a, b, c, d ) \ _mm_xor_si128( _mm_xor_si128( a, b ), _mm_xor_si128( c, d ) ) -#define mm128_xor4 v128_xor4 // Memory functions @@ -419,70 +392,71 @@ static inline __m128i mm128_negate_16( __m128i v ) // Assumes data is alinged and integral. // n = number of __m128i, bytes/16 -static inline void memset_zero_128( __m128i *dst, const int n ) -{ for ( int i = 0; i < n; i++ ) dst[i] = m128_zero; } -#define v128_memset_zero memset_zero_128 +static inline void v128_memset_zero( v128_t *dst, const int n ) +{ for ( int i = 0; i < n; i++ ) dst[i] = v128_zero; } +#define memset_zero_128 v128_memset_zero -static inline void memset_128( __m128i *dst, const __m128i a, const int n ) +static inline void v128_memset( v128_t *dst, const v128_t a, const int n ) { for ( int i = 0; i < n; i++ ) dst[i] = a; } -#define v128_memset memset_128 -static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n ) +static inline void v128_memcpy( v128_t *dst, const v128_t *src, const int n ) { for ( int i = 0; i < n; i ++ ) dst[i] = src[i]; } -#define v128_memcpy memcpy_128 +#define memcpy_128 v128_memcpy #if defined(__AVX512VL__) //TODO Enable for AVX10_256 // a ^ b ^ c -#define mm128_xor3( a, b, c ) _mm_ternarylogic_epi64( a, b, c, 0x96 ) +#define v128_xor3( a, b, c ) _mm_ternarylogic_epi64( a, b, c, 0x96 ) // a & b & c -#define mm128_and3( a, b, c ) _mm_ternarylogic_epi64( a, b, c, 0x80 ) +#define v128_and3( a, b, c ) _mm_ternarylogic_epi64( a, b, c, 0x80 ) // a | b | c -#define mm128_or3( a, b, c ) _mm_ternarylogic_epi64( a, b, c, 0xfe ) +#define v128_or3( a, b, c ) _mm_ternarylogic_epi64( a, b, c, 0xfe ) // a ^ ( b & c ) -#define mm128_xorand( a, b, c ) _mm_ternarylogic_epi64( a, b, c, 0x78 ) +#define v128_xorand( a, b, c ) _mm_ternarylogic_epi64( a, b, c, 0x78 ) // a & ( b ^ c ) -#define mm128_andxor( a, b, c ) _mm_ternarylogic_epi64( a, b, c, 0x60 ) +#define v128_andxor( a, b, c ) _mm_ternarylogic_epi64( a, b, c, 0x60 ) // a ^ ( b | c ) -#define mm128_xoror( a, b, c ) _mm_ternarylogic_epi64( a, b, c, 0x1e ) +#define v128_xoror( a, b, c ) _mm_ternarylogic_epi64( a, b, c, 0x1e ) // a ^ ( ~b & c ) -#define mm128_xorandnot( a, b, c ) _mm_ternarylogic_epi64( a, b, c, 0xd2 ) +#define v128_xorandnot( a, b, c ) _mm_ternarylogic_epi64( a, b, c, 0xd2 ) // a | ( b & c ) -#define mm128_orand( a, b, c ) _mm_ternarylogic_epi64( a, b, c, 0xf8 ) +#define v128_orand( a, b, c ) _mm_ternarylogic_epi64( a, b, c, 0xf8 ) // ~( a ^ b ), same as (~a) ^ b -#define mm128_xnor( a, b ) _mm_ternarylogic_epi64( a, b, b, 0x81 ) +#define v128_xnor( a, b ) _mm_ternarylogic_epi64( a, b, b, 0x81 ) #else -#define mm128_xor3( a, b, c ) _mm_xor_si128( a, _mm_xor_si128( b, c ) ) +#define v128_xor3( a, b, c ) _mm_xor_si128( a, _mm_xor_si128( b, c ) ) -#define mm128_and3( a, b, c ) _mm_and_si128( a, _mm_and_si128( b, c ) ) +#define v128_and3( a, b, c ) _mm_and_si128( a, _mm_and_si128( b, c ) ) -#define mm128_or3( a, b, c ) _mm_or_si128( a, _mm_or_si128( b, c ) ) +#define v128_or3( a, b, c ) _mm_or_si128( a, _mm_or_si128( b, c ) ) -#define mm128_xorand( a, b, c ) _mm_xor_si128( a, 
_mm_and_si128( b, c ) ) +#define v128_xorand( a, b, c ) _mm_xor_si128( a, _mm_and_si128( b, c ) ) -#define mm128_andxor( a, b, c ) _mm_and_si128( a, _mm_xor_si128( b, c )) +#define v128_andxor( a, b, c ) _mm_and_si128( a, _mm_xor_si128( b, c )) -#define mm128_xoror( a, b, c ) _mm_xor_si128( a, _mm_or_si128( b, c ) ) +#define v128_xoror( a, b, c ) _mm_xor_si128( a, _mm_or_si128( b, c ) ) -#define mm128_xorandnot( a, b, c ) _mm_xor_si128( a, _mm_andnot_si128( b, c ) ) +#define v128_xorandnot( a, b, c ) _mm_xor_si128( a, _mm_andnot_si128( b, c ) ) -#define mm128_orand( a, b, c ) _mm_or_si128( a, _mm_and_si128( b, c ) ) +#define v128_orand( a, b, c ) _mm_or_si128( a, _mm_and_si128( b, c ) ) -#define mm128_xnor( a, b ) mm128_not( _mm_xor_si128( a, b ) ) +#define v128_xnor( a, b ) mm128_not( _mm_xor_si128( a, b ) ) #endif +#define v128_ornot( a, b ) _mm_or_si128( a, v128_not( b ) ) + // Mask making // Equivalent of AVX512 _mm_movepi64_mask & _mm_movepi32_mask. // Returns 2 or 4 bit integer mask from MSBit of 64 or 32 bit elements. @@ -508,7 +482,7 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n ) #define v128_qrev16(v) v128_shuffle16( v, 0x1b ) #define v128_lrev16(v) v128_shuffle16( v, 0xb1 ) -// These sgould never be callled from application code, use rol/ror. +// These should never be callled from application code, use rol/ror. #define v128_ror64_sse2( v, c ) \ _mm_or_si128( _mm_srli_epi64( v, c ), _mm_slli_epi64( v, 64-(c) ) ) @@ -524,12 +498,12 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n ) #if defined(__AVX512VL__) // AVX512 fastest all rotations. -#define mm128_ror_64 _mm_ror_epi64 -#define mm128_rol_64 _mm_rol_epi64 -#define mm128_ror_32 _mm_ror_epi32 -#define mm128_rol_32 _mm_rol_epi32 +#define v128_ror64 _mm_ror_epi64 +#define v128_rol64 _mm_rol_epi64 +#define v128_ror32 _mm_ror_epi32 +#define v128_rol32 _mm_rol_epi32 -// ror/rol will alway find the fastest but these names may fit better with +// ror/rol will always find the fastest but these names may fit better with // application code performing shuffles rather than bit rotations. #define v128_shuflr64_8( v) _mm_ror_epi64( v, 8 ) #define v128_shufll64_8( v) _mm_rol_epi64( v, 8 ) @@ -543,7 +517,7 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n ) #define v128_shufll32_16(v) _mm_rol_epi32( v, 16 ) #elif defined(__SSSE3__) -// SSE2: fastest 32 bit, very fast 16, fast 8 +// SSSE3: fastest 32 bit, very fast 16, fast 8 #define v128_shuflr64_8( v ) \ _mm_shuffle_epi8( v, _mm_set_epi64x( \ @@ -569,7 +543,7 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n ) _mm_shuffle_epi8( v, _mm_set_epi64x( \ 0x0e0d0c0f0a09080b, 0x0605040702010003 ) ) -#define mm128_ror_64( v, c ) \ +#define v128_ror64( v, c ) \ ( (c) == 8 ) ? v128_shuflr64_8( v ) \ : ( (c) == 16 ) ? v128_shuffle16( v, 0x39 ) \ : ( (c) == 24 ) ? v128_shuflr64_24( v ) \ @@ -579,7 +553,7 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n ) : ( (c) == 56 ) ? v128_shufll64_8( v ) \ : v128_ror64_sse2( v, c ) -#define mm128_rol_64( v, c ) \ +#define v128_rol64( v, c ) \ ( (c) == 8 ) ? v128_shufll64_8( v ) \ : ( (c) == 16 ) ? v128_shuffle16( v, 0x93 ) \ : ( (c) == 24 ) ? v128_shufll64_24( v ) \ @@ -589,13 +563,13 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n ) : ( (c) == 56 ) ? v128_shuflr64_8( v ) \ : v128_rol64_sse2( v, c ) -#define mm128_ror_32( v, c ) \ +#define v128_ror32( v, c ) \ ( (c) == 8 ) ? 
v128_shuflr32_8( v ) \ : ( (c) == 16 ) ? v128_lrev16( v ) \ : ( (c) == 24 ) ? v128_shufll32_8( v ) \ : v128_ror32_sse2( v, c ) -#define mm128_rol_32( v, c ) \ +#define v128_rol32( v, c ) \ ( (c) == 8 ) ? v128_shufll32_8( v ) \ : ( (c) == 16 ) ? v128_lrev16( v ) \ : ( (c) == 24 ) ? v128_shuflr32_8( v ) \ @@ -604,42 +578,41 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n ) #elif defined(__SSE2__) // SSE2: fastest 32 bit, very fast 16 -#define mm128_ror_64( v, c ) \ +#define v128_ror64( v, c ) \ ( (c) == 16 ) ? v128_shuffle16( v, 0x39 ) \ : ( (c) == 32 ) ? _mm_shuffle_epi32( v, 0xb1 ) \ : ( (c) == 48 ) ? v128_shuffle16( v, 0x93 ) \ : v128_ror64_sse2( v, c ) -#define mm128_rol_64( v, c ) \ +#define v128_rol64( v, c ) \ ( (c) == 16 ) ? v128_shuffle16( v, 0x93 ) \ : ( (c) == 32 ) ? _mm_shuffle_epi32( v, 0xb1 ) \ : ( (c) == 48 ) ? v128_shuffle16( v, 0x39 ) \ : v128_rol64_sse2( v, c ) -#define mm128_ror_32( v, c ) \ +#define v128_ror32( v, c ) \ ( (c) == 16 ) ? v128_lrev16( v ) \ : v128_ror32_sse2( v, c ) -#define mm128_rol_32( v, c ) \ +#define v128_rol32( v, c ) \ ( (c) == 16 ) ? v128_lrev16( v ) \ : v128_rol32_sse2( v, c ) #else -#define mm128_ror_64 v128_ror64_sse2 -#define mm128_rol_64 v128_rol64_sse2 -#define mm128_ror_32 v128_ror32_sse2 -#define mm128_rol_32 v128_rol32_sse2 +#define v128_ror64 v128_ror64_sse2 +#define v128_rol64 v128_rol64_sse2 +#define v128_ror32 v128_ror32_sse2 +#define v128_rol32 v128_rol32_sse2 #endif -// Generic names for portable code -#define v128_ror64 mm128_ror_64 -#define v128_rol64 mm128_rol_64 -#define v128_ror32 mm128_ror_32 -#define v128_rol32 mm128_rol_32 - +//#define v128_ror64 mm128_ror_64 +//#define v128_rol64 mm128_rol_64 +//#define v128_ror32 mm128_ror_32 +#define mm128_rol_32 v128_rol32 +/* not used // x2 rotates elements in 2 individual vectors in a double buffered // optimization for SSE2, does nothing for AVX512 but is there for // transparency. 
@@ -647,25 +620,25 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n ) #if defined(__AVX512VL__) //TODO Enable for AVX10_256 -#define mm128_rorx2_64( v1, v0, c ) \ +#define v128_2ror64( v1, v0, c ) \ _mm_ror_epi64( v0, c ); \ _mm_ror_epi64( v1, c ) -#define mm128_rolx2_64( v1, v0, c ) \ +#define v128_2rol64( v1, v0, c ) \ _mm_rol_epi64( v0, c ); \ _mm_rol_epi64( v1, c ) -#define mm128_rorx2_32( v1, v0, c ) \ +#define v128_2ror32( v1, v0, c ) \ _mm_ror_epi32( v0, c ); \ _mm_ror_epi32( v1, c ) -#define mm128_rolx2_32( v1, v0, c ) \ +#define mm128_2rol32( v1, v0, c ) \ _mm_rol_epi32( v0, c ); \ _mm_rol_epi32( v1, c ) #else // SSE2 -#define mm128_rorx2_64( v1, v0, c ) \ +#define v128_2ror64( v1, v0, c ) \ { \ __m128i t0 = _mm_srli_epi64( v0, c ); \ __m128i t1 = _mm_srli_epi64( v1, c ); \ @@ -675,7 +648,7 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n ) v1 = _mm_or_si256( v1, t1 ); \ } -#define mm128_rolx2_64( v1, v0, c ) \ +#define v128_2rol64( v1, v0, c ) \ { \ __m128i t0 = _mm_slli_epi64( v0, c ); \ __m128i t1 = _mm_slli_epi64( v1, c ); \ @@ -685,7 +658,7 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n ) v1 = _mm_or_si256( v1, t1 ); \ } -#define mm128_rorx2_32( v1, v0, c ) \ +#define v128_2ror32( v1, v0, c ) \ { \ __m128i t0 = _mm_srli_epi32( v0, c ); \ __m128i t1 = _mm_srli_epi32( v1, c ); \ @@ -695,7 +668,7 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n ) v1 = _mm_or_si256( v1, t1 ); \ } -#define mm128_rolx2_32( v1, v0, c ) \ +#define v128_2rol32( v1, v0, c ) \ { \ __m128i t0 = _mm_slli_epi32( v0, c ); \ __m128i t1 = _mm_slli_epi32( v1, c ); \ @@ -706,12 +679,7 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n ) } #endif // AVX512 else SSE2 - -#define v128_2ror64 mm128_rorx2_64 -#define v128_2rol64 mm128_rolx2_64 -#define v128_2ror32 mm128_rorx2_32 -#define v128_2rol32 mm128_rolx2_32 - +*/ // Cross lane shuffles @@ -750,95 +718,76 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n ) #define v128_shuflr16(v) v128_shuffle16( v, 0x39 ) #define v128_shufll16(v) v128_shuffle16( v, 0x93 ) - -//TODO fix this -// alias bswap -//#define v128_qrev8(v) _mm_shuffle_epi8( v, v128_8( 0,1,2,3,4,5,6,7 ) ) -//#define v128_lrev8(v) _mm_shuffle_epi8( v, v128_8( 4,5,6,7, 0,1,2,3 ) ) -//#define v128_wrev8(v) _mm_shuffle_epi8( v, v128_8( 6,7, 4,5, 2,3, 1,0 ) ) - -// reverse bits, can it be done? -//#define v128_bitrev8( v ) vrbitq_u8 - -/* Not used -#if defined(__SSSE3__) - -// Rotate right by c bytes, no SSE2 equivalent. -static inline __m128i mm128_shuflr_x8( const __m128i v, const int c ) -{ return _mm_alignr_epi8( v, v, c ); } - -#endif -*/ - // Endian byte swap. 
#if defined(__SSSE3__) -#define mm128_bswap_128( v ) \ +#define v128_bswap128( v ) \ _mm_shuffle_epi8( v, _mm_set_epi64x( 0x0001020304050607, \ 0x08090a0b0c0d0e0f ) ) -#define mm128_bswap_64( v ) \ +#define v128_bswap64( v ) \ _mm_shuffle_epi8( v, _mm_set_epi64x( 0x08090a0b0c0d0e0f, \ 0x0001020304050607 ) ) -#define mm128_bswap_32( v ) \ +#define v128_bswap32( v ) \ _mm_shuffle_epi8( v, _mm_set_epi64x( 0x0c0d0e0f08090a0b, \ 0x0405060700010203 ) ) +#define mm128_bswap_32 v128_bswap32 -#define mm128_bswap_16( v ) \ +#define v128_bswap16( v ) \ _mm_shuffle_epi8( v, _mm_set_epi64x( 0x0e0f0c0d0a0b0809, \ 0x0607040502030001 ) // 8 byte qword * 8 qwords * 2 lanes = 128 bytes #define mm128_block_bswap_64( d, s ) \ { \ - __m128i ctl = _mm_set_epi64x( 0x08090a0b0c0d0e0f, 0x0001020304050607 ); \ - casti_m128i( d,0 ) = _mm_shuffle_epi8( casti_m128i( s,0 ), ctl ); \ - casti_m128i( d,1 ) = _mm_shuffle_epi8( casti_m128i( s,1 ), ctl ); \ - casti_m128i( d,2 ) = _mm_shuffle_epi8( casti_m128i( s,2 ), ctl ); \ - casti_m128i( d,3 ) = _mm_shuffle_epi8( casti_m128i( s,3 ), ctl ); \ - casti_m128i( d,4 ) = _mm_shuffle_epi8( casti_m128i( s,4 ), ctl ); \ - casti_m128i( d,5 ) = _mm_shuffle_epi8( casti_m128i( s,5 ), ctl ); \ - casti_m128i( d,6 ) = _mm_shuffle_epi8( casti_m128i( s,6 ), ctl ); \ - casti_m128i( d,7 ) = _mm_shuffle_epi8( casti_m128i( s,7 ), ctl ); \ + v128_t ctl = _mm_set_epi64x( 0x08090a0b0c0d0e0f, 0x0001020304050607 ); \ + casti_v128( d,0 ) = _mm_shuffle_epi8( casti_v128( s,0 ), ctl ); \ + casti_v128( d,1 ) = _mm_shuffle_epi8( casti_v128( s,1 ), ctl ); \ + casti_v128( d,2 ) = _mm_shuffle_epi8( casti_v128( s,2 ), ctl ); \ + casti_v128( d,3 ) = _mm_shuffle_epi8( casti_v128( s,3 ), ctl ); \ + casti_v128( d,4 ) = _mm_shuffle_epi8( casti_v128( s,4 ), ctl ); \ + casti_v128( d,5 ) = _mm_shuffle_epi8( casti_v128( s,5 ), ctl ); \ + casti_v128( d,6 ) = _mm_shuffle_epi8( casti_v128( s,6 ), ctl ); \ + casti_v128( d,7 ) = _mm_shuffle_epi8( casti_v128( s,7 ), ctl ); \ } #define mm128_block_bswap64_512 mm128_block_bswap_64 #define v128_block_bswap64_512 mm128_block_bswap_64 #define v128_block_bswap64_1024( d, s ) \ { \ - __m128i ctl = _mm_set_epi64x( 0x08090a0b0c0d0e0f, 0x0001020304050607 ); \ - casti_m128i( d, 0 ) = _mm_shuffle_epi8( casti_m128i( s, 0 ), ctl ); \ - casti_m128i( d, 1 ) = _mm_shuffle_epi8( casti_m128i( s, 1 ), ctl ); \ - casti_m128i( d, 2 ) = _mm_shuffle_epi8( casti_m128i( s, 2 ), ctl ); \ - casti_m128i( d, 3 ) = _mm_shuffle_epi8( casti_m128i( s, 3 ), ctl ); \ - casti_m128i( d, 4 ) = _mm_shuffle_epi8( casti_m128i( s, 4 ), ctl ); \ - casti_m128i( d, 5 ) = _mm_shuffle_epi8( casti_m128i( s, 5 ), ctl ); \ - casti_m128i( d, 6 ) = _mm_shuffle_epi8( casti_m128i( s, 6 ), ctl ); \ - casti_m128i( d, 7 ) = _mm_shuffle_epi8( casti_m128i( s, 7 ), ctl ); \ - casti_m128i( d, 8 ) = _mm_shuffle_epi8( casti_m128i( s, 8 ), ctl ); \ - casti_m128i( d, 9 ) = _mm_shuffle_epi8( casti_m128i( s, 9 ), ctl ); \ - casti_m128i( d,10 ) = _mm_shuffle_epi8( casti_m128i( s,10 ), ctl ); \ - casti_m128i( d,11 ) = _mm_shuffle_epi8( casti_m128i( s,11 ), ctl ); \ - casti_m128i( d,12 ) = _mm_shuffle_epi8( casti_m128i( s,12 ), ctl ); \ - casti_m128i( d,13 ) = _mm_shuffle_epi8( casti_m128i( s,13 ), ctl ); \ - casti_m128i( d,14 ) = _mm_shuffle_epi8( casti_m128i( s,14 ), ctl ); \ - casti_m128i( d,15 ) = _mm_shuffle_epi8( casti_m128i( s,15 ), ctl ); \ + v128_t ctl = _mm_set_epi64x( 0x08090a0b0c0d0e0f, 0x0001020304050607 ); \ + casti_v128( d, 0 ) = _mm_shuffle_epi8( casti_v128( s, 0 ), ctl ); \ + casti_v128( d, 1 ) = _mm_shuffle_epi8( casti_v128( 
s, 1 ), ctl ); \ + casti_v128( d, 2 ) = _mm_shuffle_epi8( casti_v128( s, 2 ), ctl ); \ + casti_v128( d, 3 ) = _mm_shuffle_epi8( casti_v128( s, 3 ), ctl ); \ + casti_v128( d, 4 ) = _mm_shuffle_epi8( casti_v128( s, 4 ), ctl ); \ + casti_v128( d, 5 ) = _mm_shuffle_epi8( casti_v128( s, 5 ), ctl ); \ + casti_v128( d, 6 ) = _mm_shuffle_epi8( casti_v128( s, 6 ), ctl ); \ + casti_v128( d, 7 ) = _mm_shuffle_epi8( casti_v128( s, 7 ), ctl ); \ + casti_v128( d, 8 ) = _mm_shuffle_epi8( casti_v128( s, 8 ), ctl ); \ + casti_v128( d, 9 ) = _mm_shuffle_epi8( casti_v128( s, 9 ), ctl ); \ + casti_v128( d,10 ) = _mm_shuffle_epi8( casti_v128( s,10 ), ctl ); \ + casti_v128( d,11 ) = _mm_shuffle_epi8( casti_v128( s,11 ), ctl ); \ + casti_v128( d,12 ) = _mm_shuffle_epi8( casti_v128( s,12 ), ctl ); \ + casti_v128( d,13 ) = _mm_shuffle_epi8( casti_v128( s,13 ), ctl ); \ + casti_v128( d,14 ) = _mm_shuffle_epi8( casti_v128( s,14 ), ctl ); \ + casti_v128( d,15 ) = _mm_shuffle_epi8( casti_v128( s,15 ), ctl ); \ } // 4 byte dword * 8 dwords * 4 lanes = 128 bytes #define mm128_block_bswap_32( d, s ) \ { \ - __m128i ctl = _mm_set_epi64x( 0x0c0d0e0f08090a0b, 0x0405060700010203 ); \ - casti_m128i( d,0 ) = _mm_shuffle_epi8( casti_m128i( s,0 ), ctl ); \ - casti_m128i( d,1 ) = _mm_shuffle_epi8( casti_m128i( s,1 ), ctl ); \ - casti_m128i( d,2 ) = _mm_shuffle_epi8( casti_m128i( s,2 ), ctl ); \ - casti_m128i( d,3 ) = _mm_shuffle_epi8( casti_m128i( s,3 ), ctl ); \ - casti_m128i( d,4 ) = _mm_shuffle_epi8( casti_m128i( s,4 ), ctl ); \ - casti_m128i( d,5 ) = _mm_shuffle_epi8( casti_m128i( s,5 ), ctl ); \ - casti_m128i( d,6 ) = _mm_shuffle_epi8( casti_m128i( s,6 ), ctl ); \ - casti_m128i( d,7 ) = _mm_shuffle_epi8( casti_m128i( s,7 ), ctl ); \ + v128_t ctl = _mm_set_epi64x( 0x0c0d0e0f08090a0b, 0x0405060700010203 ); \ + casti_v128( d,0 ) = _mm_shuffle_epi8( casti_v128( s,0 ), ctl ); \ + casti_v128( d,1 ) = _mm_shuffle_epi8( casti_v128( s,1 ), ctl ); \ + casti_v128( d,2 ) = _mm_shuffle_epi8( casti_v128( s,2 ), ctl ); \ + casti_v128( d,3 ) = _mm_shuffle_epi8( casti_v128( s,3 ), ctl ); \ + casti_v128( d,4 ) = _mm_shuffle_epi8( casti_v128( s,4 ), ctl ); \ + casti_v128( d,5 ) = _mm_shuffle_epi8( casti_v128( s,5 ), ctl ); \ + casti_v128( d,6 ) = _mm_shuffle_epi8( casti_v128( s,6 ), ctl ); \ + casti_v128( d,7 ) = _mm_shuffle_epi8( casti_v128( s,7 ), ctl ); \ } #define mm128_block_bswap32_256 mm128_block_bswap_32 #define v128_block_bswap32_256 mm128_block_bswap_32 @@ -846,129 +795,127 @@ static inline __m128i mm128_shuflr_x8( const __m128i v, const int c ) #define mm128_block_bswap32_128( d, s ) \ { \ - __m128i ctl = _mm_set_epi64x( 0x0c0d0e0f08090a0b, 0x0405060700010203 ); \ - casti_m128i( d,0 ) = _mm_shuffle_epi8( casti_m128i( s,0 ), ctl ); \ - casti_m128i( d,1 ) = _mm_shuffle_epi8( casti_m128i( s,1 ), ctl ); \ - casti_m128i( d,2 ) = _mm_shuffle_epi8( casti_m128i( s,2 ), ctl ); \ - casti_m128i( d,3 ) = _mm_shuffle_epi8( casti_m128i( s,3 ), ctl ); \ + v128_t ctl = _mm_set_epi64x( 0x0c0d0e0f08090a0b, 0x0405060700010203 ); \ + casti_v128( d,0 ) = _mm_shuffle_epi8( casti_v128( s,0 ), ctl ); \ + casti_v128( d,1 ) = _mm_shuffle_epi8( casti_v128( s,1 ), ctl ); \ + casti_v128( d,2 ) = _mm_shuffle_epi8( casti_v128( s,2 ), ctl ); \ + casti_v128( d,3 ) = _mm_shuffle_epi8( casti_v128( s,3 ), ctl ); \ } #define v128_block_bswap32_512( d, s ) \ { \ - __m128i ctl = _mm_set_epi64x( 0x0c0d0e0f08090a0b, 0x0405060700010203 ); \ - casti_m128i( d, 0 ) = _mm_shuffle_epi8( casti_m128i( s, 0 ), ctl ); \ - casti_m128i( d, 1 ) = _mm_shuffle_epi8( casti_m128i( s, 1 ), 
ctl ); \ - casti_m128i( d, 2 ) = _mm_shuffle_epi8( casti_m128i( s, 2 ), ctl ); \ - casti_m128i( d, 3 ) = _mm_shuffle_epi8( casti_m128i( s, 3 ), ctl ); \ - casti_m128i( d, 4 ) = _mm_shuffle_epi8( casti_m128i( s, 4 ), ctl ); \ - casti_m128i( d, 5 ) = _mm_shuffle_epi8( casti_m128i( s, 5 ), ctl ); \ - casti_m128i( d, 6 ) = _mm_shuffle_epi8( casti_m128i( s, 6 ), ctl ); \ - casti_m128i( d, 7 ) = _mm_shuffle_epi8( casti_m128i( s, 7 ), ctl ); \ - casti_m128i( d, 8 ) = _mm_shuffle_epi8( casti_m128i( s, 8 ), ctl ); \ - casti_m128i( d, 9 ) = _mm_shuffle_epi8( casti_m128i( s, 9 ), ctl ); \ - casti_m128i( d,10 ) = _mm_shuffle_epi8( casti_m128i( s,10 ), ctl ); \ - casti_m128i( d,11 ) = _mm_shuffle_epi8( casti_m128i( s,11 ), ctl ); \ - casti_m128i( d,12 ) = _mm_shuffle_epi8( casti_m128i( s,12 ), ctl ); \ - casti_m128i( d,13 ) = _mm_shuffle_epi8( casti_m128i( s,13 ), ctl ); \ - casti_m128i( d,14 ) = _mm_shuffle_epi8( casti_m128i( s,14 ), ctl ); \ - casti_m128i( d,15 ) = _mm_shuffle_epi8( casti_m128i( s,15 ), ctl ); \ + v128_t ctl = _mm_set_epi64x( 0x0c0d0e0f08090a0b, 0x0405060700010203 ); \ + casti_v128( d, 0 ) = _mm_shuffle_epi8( casti_v128( s, 0 ), ctl ); \ + casti_v128( d, 1 ) = _mm_shuffle_epi8( casti_v128( s, 1 ), ctl ); \ + casti_v128( d, 2 ) = _mm_shuffle_epi8( casti_v128( s, 2 ), ctl ); \ + casti_v128( d, 3 ) = _mm_shuffle_epi8( casti_v128( s, 3 ), ctl ); \ + casti_v128( d, 4 ) = _mm_shuffle_epi8( casti_v128( s, 4 ), ctl ); \ + casti_v128( d, 5 ) = _mm_shuffle_epi8( casti_v128( s, 5 ), ctl ); \ + casti_v128( d, 6 ) = _mm_shuffle_epi8( casti_v128( s, 6 ), ctl ); \ + casti_v128( d, 7 ) = _mm_shuffle_epi8( casti_v128( s, 7 ), ctl ); \ + casti_v128( d, 8 ) = _mm_shuffle_epi8( casti_v128( s, 8 ), ctl ); \ + casti_v128( d, 9 ) = _mm_shuffle_epi8( casti_v128( s, 9 ), ctl ); \ + casti_v128( d,10 ) = _mm_shuffle_epi8( casti_v128( s,10 ), ctl ); \ + casti_v128( d,11 ) = _mm_shuffle_epi8( casti_v128( s,11 ), ctl ); \ + casti_v128( d,12 ) = _mm_shuffle_epi8( casti_v128( s,12 ), ctl ); \ + casti_v128( d,13 ) = _mm_shuffle_epi8( casti_v128( s,13 ), ctl ); \ + casti_v128( d,14 ) = _mm_shuffle_epi8( casti_v128( s,14 ), ctl ); \ + casti_v128( d,15 ) = _mm_shuffle_epi8( casti_v128( s,15 ), ctl ); \ } #else // SSE2 -static inline __m128i mm128_bswap_64( __m128i v ) +static inline v128_t v128_bswap64( __m128i v ) { v = _mm_or_si128( _mm_slli_epi16( v, 8 ), _mm_srli_epi16( v, 8 ) ); v = _mm_shufflelo_epi16( v, _MM_SHUFFLE( 0, 1, 2, 3 ) ); return _mm_shufflehi_epi16( v, _MM_SHUFFLE( 0, 1, 2, 3 ) ); } -static inline __m128i mm128_bswap_32( __m128i v ) +static inline v128_t v128_bswap32( __m128i v ) { v = _mm_or_si128( _mm_slli_epi16( v, 8 ), _mm_srli_epi16( v, 8 ) ); v = _mm_shufflelo_epi16( v, _MM_SHUFFLE( 2, 3, 0, 1 ) ); return _mm_shufflehi_epi16( v, _MM_SHUFFLE( 2, 3, 0, 1 ) ); } +#define mm128_bswap_32 v128_bswap32 -static inline __m128i mm128_bswap_16( __m128i v ) +static inline v128_t v128_bswap16( __m128i v ) { return _mm_or_si128( _mm_slli_epi16( v, 8 ), _mm_srli_epi16( v, 8 ) ); } -#define mm128_bswap_128( v ) v128_qrev32( v128_bswap64( v ) ) +#define v128_bswap128( v ) v128_qrev32( v128_bswap64( v ) ) static inline void mm128_block_bswap_64( __m128i *d, const __m128i *s ) { - d[0] = mm128_bswap_64( s[0] ); - d[1] = mm128_bswap_64( s[1] ); - d[2] = mm128_bswap_64( s[2] ); - d[3] = mm128_bswap_64( s[3] ); - d[4] = mm128_bswap_64( s[4] ); - d[5] = mm128_bswap_64( s[5] ); - d[6] = mm128_bswap_64( s[6] ); - d[7] = mm128_bswap_64( s[7] ); + d[0] = v128_bswap64( s[0] ); + d[1] = v128_bswap64( s[1] ); + d[2] = 
v128_bswap64( s[2] ); + d[3] = v128_bswap64( s[3] ); + d[4] = v128_bswap64( s[4] ); + d[5] = v128_bswap64( s[5] ); + d[6] = v128_bswap64( s[6] ); + d[7] = v128_bswap64( s[7] ); } #define v128_block_bswap64_512 mm128_block_bswap_64 static inline void mm128_block_bswap64_1024( __m128i *d, const __m128i *s ) { - d[ 0] = mm128_bswap_64( s[ 0] ); - d[ 1] = mm128_bswap_64( s[ 1] ); - d[ 2] = mm128_bswap_64( s[ 2] ); - d[ 3] = mm128_bswap_64( s[ 3] ); - d[ 4] = mm128_bswap_64( s[ 4] ); - d[ 5] = mm128_bswap_64( s[ 5] ); - d[ 6] = mm128_bswap_64( s[ 6] ); - d[ 7] = mm128_bswap_64( s[ 7] ); - d[ 8] = mm128_bswap_64( s[ 8] ); - d[ 9] = mm128_bswap_64( s[ 9] ); - d[10] = mm128_bswap_64( s[10] ); - d[11] = mm128_bswap_64( s[11] ); - d[14] = mm128_bswap_64( s[12] ); - d[13] = mm128_bswap_64( s[13] ); - d[14] = mm128_bswap_64( s[14] ); - d[15] = mm128_bswap_64( s[15] ); + d[ 0] = v128_bswap64( s[ 0] ); + d[ 1] = v128_bswap64( s[ 1] ); + d[ 2] = v128_bswap64( s[ 2] ); + d[ 3] = v128_bswap64( s[ 3] ); + d[ 4] = v128_bswap64( s[ 4] ); + d[ 5] = v128_bswap64( s[ 5] ); + d[ 6] = v128_bswap64( s[ 6] ); + d[ 7] = v128_bswap64( s[ 7] ); + d[ 8] = v128_bswap64( s[ 8] ); + d[ 9] = v128_bswap64( s[ 9] ); + d[10] = v128_bswap64( s[10] ); + d[11] = v128_bswap64( s[11] ); + d[14] = v128_bswap64( s[12] ); + d[13] = v128_bswap64( s[13] ); + d[14] = v128_bswap64( s[14] ); + d[15] = v128_bswap64( s[15] ); } static inline void mm128_block_bswap_32( __m128i *d, const __m128i *s ) { - d[0] = mm128_bswap_32( s[0] ); - d[1] = mm128_bswap_32( s[1] ); - d[2] = mm128_bswap_32( s[2] ); - d[3] = mm128_bswap_32( s[3] ); - d[4] = mm128_bswap_32( s[4] ); - d[5] = mm128_bswap_32( s[5] ); - d[6] = mm128_bswap_32( s[6] ); - d[7] = mm128_bswap_32( s[7] ); + d[0] = v128_bswap32( s[0] ); + d[1] = v128_bswap32( s[1] ); + d[2] = v128_bswap32( s[2] ); + d[3] = v128_bswap32( s[3] ); + d[4] = v128_bswap32( s[4] ); + d[5] = v128_bswap32( s[5] ); + d[6] = v128_bswap32( s[6] ); + d[7] = v128_bswap32( s[7] ); } #define mm128_block_bswap32_256 mm128_block_bswap_32 #define v128_block_bswap32_256 mm128_block_bswap_32 static inline void mm128_block_bswap32_512( __m128i *d, const __m128i *s ) { - d[ 0] = mm128_bswap_32( s[ 0] ); - d[ 1] = mm128_bswap_32( s[ 1] ); - d[ 2] = mm128_bswap_32( s[ 2] ); - d[ 3] = mm128_bswap_32( s[ 3] ); - d[ 4] = mm128_bswap_32( s[ 4] ); - d[ 5] = mm128_bswap_32( s[ 5] ); - d[ 6] = mm128_bswap_32( s[ 6] ); - d[ 7] = mm128_bswap_32( s[ 7] ); - d[ 8] = mm128_bswap_32( s[ 8] ); - d[ 9] = mm128_bswap_32( s[ 9] ); - d[10] = mm128_bswap_32( s[10] ); - d[11] = mm128_bswap_32( s[11] ); - d[12] = mm128_bswap_32( s[12] ); - d[13] = mm128_bswap_32( s[13] ); - d[14] = mm128_bswap_32( s[14] ); - d[15] = mm128_bswap_32( s[15] ); + d[ 0] = v128_bswap32( s[ 0] ); + d[ 1] = v128_bswap32( s[ 1] ); + d[ 2] = v128_bswap32( s[ 2] ); + d[ 3] = v128_bswap32( s[ 3] ); + d[ 4] = v128_bswap32( s[ 4] ); + d[ 5] = v128_bswap32( s[ 5] ); + d[ 6] = v128_bswap32( s[ 6] ); + d[ 7] = v128_bswap32( s[ 7] ); + d[ 8] = v128_bswap32( s[ 8] ); + d[ 9] = v128_bswap32( s[ 9] ); + d[10] = v128_bswap32( s[10] ); + d[11] = v128_bswap32( s[11] ); + d[12] = v128_bswap32( s[12] ); + d[13] = v128_bswap32( s[13] ); + d[14] = v128_bswap32( s[14] ); + d[15] = v128_bswap32( s[15] ); } #endif // SSSE3 else SSE2 -#define v128_bswap32 mm128_bswap_32 -#define v128_bswap64 mm128_bswap_64 -#define v128_bswap128 mm128_bswap_128 #define v128_block_bswap32 mm128_block_bswap_32 #define v128_block_bswap64 mm128_block_bswap_64 @@ -991,16 +938,7 @@ static inline void 
diff --git a/simd-utils/simd-256.h b/simd-utils/simd-256.h
index 64895229..1a894c15 100644
--- a/simd-utils/simd-256.h
+++ b/simd-utils/simd-256.h
@@ -90,7 +90,7 @@ typedef union
 // code and therefore can't be used as compile time initializers.
 
 #define m256_zero         _mm256_setzero_si256()
-#define m256_one_128      mm256_bcast_m128( m128_one_128 )
+#define m256_one_128      mm256_bcast_m128( v128_one )
 
 static inline __m256i mm256_neg1_fn()
 {
diff --git a/simd-utils/simd-neon.h b/simd-utils/simd-neon.h
index 9f08802c..94df1dce 100644
--- a/simd-utils/simd-neon.h
+++ b/simd-utils/simd-neon.h
@@ -21,36 +21,36 @@
 //
 //   vornq( v1, v0 )     or( v1, not( v0 ) )
 
-#define v128_t         uint32x4_t   // default,
-#define v128u64_t      uint64x2_t
-#define v128u32_t      uint32x4_t
-#define v128u16_t      uint16x8_t
-#define v128u8_t       uint8x16_t
+#define v128_t         uint32x4_t   // default,
+#define v128u64_t      uint64x2_t
+#define v128u32_t      uint32x4_t
+#define v128u16_t      uint16x8_t
+#define v128u8_t       uint8x16_t
 
 // load & store
-#define v128_load( p )          vld1q_u32( (uint32_t*)(p) )
-#define v128_store( p, v )      vst1q_u32( (uint32_t*)(p), v )
-
-#define v128u64_load( p )       vld1q_u64( (uint64_t*)(p) )
-#define v128u64_store( p, v )   vst1q_u64( (uint64_t*)(p), v )
-#define v128u32_load( p )       vld1q_u32( (uint32_t*)(p) )
-#define v128u32_store( p, v )   vst1q_u32( (uint32_t*)(p), v )
-#define v128u16_load( p )       vld1q_u16( (uint16_t*)(p) )
-#define v128u16_store( p, v )   vst1q_u16( (uint16_t*)(p), v )
-#define v128u8_load( p )        vld1q_u16( (uint8_t*)(p) )
-#define v128u8_store( p, v )    vst1q_u16( (uint8_t*)(p), v )
+#define v128_load( p )          vld1q_u32( (uint32_t*)(p) )
+#define v128_store( p, v )      vst1q_u32( (uint32_t*)(p), v )
+
+#define v128u64_load( p )       vld1q_u64( (uint64_t*)(p) )
+#define v128u64_store( p, v )   vst1q_u64( (uint64_t*)(p), v )
+#define v128u32_load( p )       vld1q_u32( (uint32_t*)(p) )
+#define v128u32_store( p, v )   vst1q_u32( (uint32_t*)(p), v )
+#define v128u16_load( p )       vld1q_u16( (uint16_t*)(p) )
+#define v128u16_store( p, v )   vst1q_u16( (uint16_t*)(p), v )
+#define v128u8_load( p )        vld1q_u16( (uint8_t*)(p) )
+#define v128u8_store( p, v )    vst1q_u16( (uint8_t*)(p), v )
 
 // load & set1 combined
-#define v128_load1_64(p)        vld1q_dup_u64( (uint64_t*)(p) )
-#define v128_load1_32(p)        vld1q_dup_u32( (uint32_t*)(p) )
-#define v128_load1_16(p)        vld1q_dup_u16( (uint16_t*)(p) )
-#define v128_load1_8( p)        vld1q_dup_u8(  (uint8_t*) (p) )
+#define v128_load1_64(p)        vld1q_dup_u64( (uint64_t*)(p) )
+#define v128_load1_32(p)        vld1q_dup_u32( (uint32_t*)(p) )
+#define v128_load1_16(p)        vld1q_dup_u16( (uint16_t*)(p) )
+#define v128_load1_8( p)        vld1q_dup_u8(  (uint8_t*) (p) )
 
 // arithmetic
-#define v128_add64     vaddq_u64
-#define v128_add32     vaddq_u32
-#define v128_add16     vaddq_u16
-#define v128_add8      vaddq_u8
+#define v128_add64     vaddq_u64
+#define v128_add32     vaddq_u32
+#define v128_add16     vaddq_u16
+#define v128_add8      vaddq_u8
 
 #define v128_add4_64( v3, v2, v1, v0 ) \
    vaddq_u64( vaddq_u64( v3, v2 ), vaddq_u64( v1, v0 ) )
 
@@ -58,15 +58,15 @@
 #define v128_add4_32( v3, v2, v1, v0 ) \
    vaddq_u32( vaddq_u32( v3, v2 ), vaddq_u32( v1, v0 ) )
 
-#define v128_sub64     vsubq_u64
-#define v128_sub32     vsubq_u32
-#define v128_sub16     vsubq_u16
-#define v128_sub8      vsubq_u8
+#define v128_sub64     vsubq_u64
+#define v128_sub32     vsubq_u32
+#define v128_sub16     vsubq_u16
+#define v128_sub8      vsubq_u8
 
 // returns low half, u64 undocumented, may not exist.
-#define v128_mul64     vmulq_u64
-#define v128_mul32     vmulq_u32
-#define v128_mul16     vmulq_u16
+#define v128_mul64     vmulq_u64
+#define v128_mul32     vmulq_u32
+#define v128_mul16     vmulq_u16
 
 // slow, tested with argon2d
 static inline uint64x2_t v128_mulw32( uint32x4_t v1, uint32x4_t v0 )
@@ -76,101 +76,102 @@ static inline uint64x2_t v128_mulw32( uint32x4_t v1, uint32x4_t v0 )
 }
 
 // compare
-#define v128_cmpeq64   vceqq_u64
-#define v128_cmpeq32   vceqq_u32
-#define v128_cmpeq16   vceqq_u16
-#define v128_cmpeq8    vceqq_u8
+#define v128_cmpeq64   vceqq_u64
+#define v128_cmpeq32   vceqq_u32
+#define v128_cmpeq16   vceqq_u16
+#define v128_cmpeq8    vceqq_u8
 
-#define v128_iszero    vceqzq_u64
+// v128_cmp0, v128_cmpz, v128 testz
+#define v128_iszero    vceqzq_u64
 
 // Not yet needed
 //#define v128_cmpeq1
 
-#define v128_cmpgt64   vcgtq_u64
-#define v128_cmpgt32   vcgtq_u32
-#define v128_cmpgt16   vcgtq_u16
-#define v128_cmpgt8    vcgtq_u8
+#define v128_cmpgt64( v1, v0 )   vcgtq_s64( (int64x2_t)v1, (int64x2_t)v0 )
+#define v128_cmpgt32( v1, v0 )   vcgtq_s32( (int32x4_t)v1, (int32x4_t)v0 )
+#define v128_cmpgt16( v1, v0 )   vcgtq_s16( (int16x8_t)v1, (int16x8_t)v0 )
+#define v128_cmpgt8( v1, v0 )    vcgtq_s8(  (int8x16_t)v1, (int8x16_t)v0 )
 
-#define v128_cmplt64   vcltq_u64
-#define v128_cmplt32   vcltq_u32
-#define v128_cmplt16   vcltq_u16
-#define v128_cmplt8    vcltq_u8
+#define v128_cmplt64( v1, v0 )   vcltq_s64( (int64x2_t)v1, (int64x2_t)v0 )
+#define v128_cmplt32( v1, v0 )   vcltq_s32( (int32x4_t)v1, (int32x4_t)v0 )
+#define v128_cmplt16( v1, v0 )   vcltq_s16( (int16x8_t)v1, (int16x8_t)v0 )
+#define v128_cmplt8( v1, v0 )    vcltq_s8(  (int8x16_t)v1, (int8x16_t)v0 )
 
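Note: the compare wrappers above now cast to signed element types and call the signed vcgtq/vcltq forms; the same reasoning motivates the v128_sra* casts further down. A standalone sketch of why the distinction matters, not taken from the patch (plain NEON intrinsics, assumes an AArch64 toolchain):

#include <stdio.h>
#include <arm_neon.h>

int main(void)
{
   uint32x4_t a = vdupq_n_u32( 0xFFFFFFFFu );   // -1 when viewed as signed
   uint32x4_t b = vdupq_n_u32( 1 );

   // Unsigned compare: 0xFFFFFFFF > 1 is true in every lane.
   uint32x4_t gt_u = vcgtq_u32( a, b );
   // Signed compare, as v128_cmpgt32 now does: -1 > 1 is false.
   uint32x4_t gt_s = vcgtq_s32( vreinterpretq_s32_u32( a ),
                                vreinterpretq_s32_u32( b ) );

   printf( "unsigned: %08x   signed: %08x\n",
           vgetq_lane_u32( gt_u, 0 ), vgetq_lane_u32( gt_s, 0 ) );
   // expected: unsigned: ffffffff   signed: 00000000
   return 0;
}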
 // bit shift
-#define v128_sl64      vshlq_n_u64
-#define v128_sl32      vshlq_n_u32
-#define v128_sl16      vshlq_n_u16
-#define v128_sl8       vshlq_n_u8
+#define v128_sl64      vshlq_n_u64
+#define v128_sl32      vshlq_n_u32
+#define v128_sl16      vshlq_n_u16
+#define v128_sl8       vshlq_n_u8
 
-#define v128_sr64      vshrq_n_u64
-#define v128_sr32      vshrq_n_u32
-#define v128_sr16      vshrq_n_u16
-#define v128_sr8       vshrq_n_u8
+#define v128_sr64      vshrq_n_u64
+#define v128_sr32      vshrq_n_u32
+#define v128_sr16      vshrq_n_u16
+#define v128_sr8       vshrq_n_u8
 
 // Unit tested, working.
-#define v128_sra64     vshrq_n_s64
-#define v128_sra32     vshrq_n_s32
-#define v128_sra16     vshrq_n_s16
+#define v128_sra64( v, c )   vshrq_n_s64( (int64x2_t)v, c )
+#define v128_sra32( v, c )   vshrq_n_s32( (int32x4_t)v, c )
+#define v128_sra16( v, c )   vshrq_n_s16( (int16x8_t)v, c )
 
 // unary logic
-#define v128_not       vmvnq_u32
+#define v128_not       vmvnq_u32
 
 // binary logic
-#define v128_or        vorrq_u32
-#define v128_and       vandq_u32
-#define v128_xor       veorq_u32
+#define v128_or        vorrq_u32
+#define v128_and       vandq_u32
+#define v128_xor       veorq_u32
 
 // ~v1 & v0
-#define v128_andnot( v1, v0 )   vandq_u32( vmvnq_u32( v1 ), v0 )
+#define v128_andnot( v1, v0 )   vandq_u32( vmvnq_u32( v1 ), v0 )
 
 // ~( a ^ b ), same as (~a) ^ b
-#define v128_xnor( v1, v0 )     v128_not( v128_xor( v1, v0 ) )
+#define v128_xnor( v1, v0 )     v128_not( v128_xor( v1, v0 ) )
 
 // ~v1 | v0, x86_64 convention, first arg is not'ed
-#define v128_ornot( v1, v0 )    vornq_u32( v0, v1 )
+#define v128_ornot( v1, v0 )    vornq_u32( v0, v1 )
 
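Note: as the comments above point out, these wrappers keep the x86_64 convention where the first argument is the one complemented, while the native NEON bic/orn forms complement their second operand. A standalone sketch of that mapping, not taken from the patch (the helper name is hypothetical):

#include <stdio.h>
#include <arm_neon.h>

// x86-style andnot, same shape as the v128_andnot macro above: ~v1 & v0.
static inline uint32x4_t andnot_x86_order( uint32x4_t v1, uint32x4_t v0 )
{
   return vandq_u32( vmvnq_u32( v1 ), v0 );
}

int main(void)
{
   uint32x4_t v1 = vdupq_n_u32( 0x0000FFFFu );
   uint32x4_t v0 = vdupq_n_u32( 0x00FF00FFu );

   uint32x4_t a = andnot_x86_order( v1, v0 );   // ~v1 & v0
   uint32x4_t b = vbicq_u32( v0, v1 );          // v0 & ~v1, native operand order

   printf( "wrapper: %08x   vbicq: %08x\n",
           vgetq_lane_u32( a, 0 ), vgetq_lane_u32( b, 0 ) );
   // both print 00ff0000: same value, but the complement lands on a different
   // positional argument, hence the explicit convention in the comments.
   return 0;
}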
 // ternary logic
 
 // v2 ^ v1 ^ v0
 // veorq_u32 not defined
 //#define v128_xor3   veor3q_u32
-#define v128_xor3( v2, v1, v0 )        veorq_u32( v2, veorq_u32( v1, v0 ) )
+#define v128_xor3( v2, v1, v0 )        veorq_u32( v2, veorq_u32( v1, v0 ) )
 
 // v2 & v1 & v0
-#define v128_and3( v2, v1, v0 )        v128_and( v2, v128_and( v1, v0 ) )
+#define v128_and3( v2, v1, v0 )        v128_and( v2, v128_and( v1, v0 ) )
 
 // v2 | v1 | v0
-#define v128_or3( v2, v1, v0 )         v128_or( v2, v128_or( v1, v0 ) )
+#define v128_or3( v2, v1, v0 )         v128_or( v2, v128_or( v1, v0 ) )
 
 // a ^ ( ~b & c )
-#define v128_xorandnot( v2, v1, v0 )   v128_xor( v2, v128_andnot( v1, v0 ) )
+#define v128_xorandnot( v2, v1, v0 )   v128_xor( v2, v128_andnot( v1, v0 ) )
 
 // a ^ ( b & c )
-#define v128_xorand( v2, v1, v0 )      v128_xor( v2, v128_and( v1, v0 ) )
+#define v128_xorand( v2, v1, v0 )      v128_xor( v2, v128_and( v1, v0 ) )
 
 // a & ( b ^ c )
-#define v128_andxor( v2, v1, v0 )      v128_and( v2, v128_xor( v1, v0 ) )
+#define v128_andxor( v2, v1, v0 )      v128_and( v2, v128_xor( v1, v0 ) )
 
 // a ^ ( b | c )
-#define v128_xoror( v2, v1, v0 )       v128_xor( v2, v128_or( v1, v0 ) )
+#define v128_xoror( v2, v1, v0 )       v128_xor( v2, v128_or( v1, v0 ) )
 
 // v2 | ( v1 & v0 )
-#define v128_orand( v2, v1, v0 )       v128_or( v2, v128_and( v1, v0 ) )
+#define v128_orand( v2, v1, v0 )       v128_or( v2, v128_and( v1, v0 ) )
 
 // shift 2 concatenated vectors right.
-#define v128_alignr64( v1, v0, c )     vextq_u64( v0, v1, c )
-#define v128_alignr32( v1, v0, c )     vextq_u32( v0, v1, c )
-#define v128_alignr8( v1, v0, c )      vextq_u8( v0, v1, c )
+#define v128_alignr64( v1, v0, c )     vextq_u64( v0, v1, c )
+#define v128_alignr32( v1, v0, c )     vextq_u32( v0, v1, c )
+#define v128_alignr8( v1, v0, c )      vextq_u8( v0, v1, c )
 
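Note: v128_alignr* above swaps the operand order when mapping onto vextq, since vextq forms its result from the top lanes of its first operand and the bottom lanes of its second; the wrapper thereby keeps the x86-style "shift the concatenation v1:v0 right" view described in the comment. A standalone sketch, not taken from the patch (plain NEON intrinsics):

#include <stdio.h>
#include <inttypes.h>
#include <arm_neon.h>

int main(void)
{
   uint64_t hi[2] = { 0x1111111111111111ull, 0x2222222222222222ull };
   uint64_t lo[2] = { 0x3333333333333333ull, 0x4444444444444444ull };
   uint64x2_t v1 = vld1q_u64( hi );   // high half of the concatenation
   uint64x2_t v0 = vld1q_u64( lo );   // low half

   // v128_alignr64( v1, v0, 1 ) expands to vextq_u64( v0, v1, 1 ):
   // drop the lowest lane of v0 and shift in the lowest lane of v1.
   uint64x2_t r = vextq_u64( v0, v1, 1 );

   printf( "lane0 %016" PRIx64 "   lane1 %016" PRIx64 "\n",
           vgetq_lane_u64( r, 0 ), vgetq_lane_u64( r, 1 ) );
   // expected: lane0 4444444444444444   lane1 1111111111111111
   return 0;
}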
 // Interleave high or low half of 2 vectors.
-#define v128_unpacklo64( v1, v0 )   vzip1q_u64( v1, v0 )
-#define v128_unpackhi64( v1, v0 )   vzip2q_u64( v1, v0 )
-#define v128_unpacklo32( v1, v0 )   vzip1q_u32( v1, v0 )
-#define v128_unpackhi32( v1, v0 )   vzip2q_u32( v1, v0 )
-#define v128_unpacklo16( v1, v0 )   vzip1q_u16( v1, v0 )
-#define v128_unpackhi16( v1, v0 )   vzip2q_u16( v1, v0 )
-#define v128_unpacklo8( v1, v0 )    vzip1q_u8( v1, v0 )
-#define v128_unpackhi8( v1, v0 )    vzip2q_u8( v1, v0 )
+#define v128_unpacklo64( v1, v0 )   vzip1q_u64( v1, v0 )
+#define v128_unpackhi64( v1, v0 )   vzip2q_u64( v1, v0 )
+#define v128_unpacklo32( v1, v0 )   vzip1q_u32( v1, v0 )
+#define v128_unpackhi32( v1, v0 )   vzip2q_u32( v1, v0 )
+#define v128_unpacklo16( v1, v0 )   vzip1q_u16( v1, v0 )
+#define v128_unpackhi16( v1, v0 )   vzip2q_u16( v1, v0 )
+#define v128_unpacklo8( v1, v0 )    vzip1q_u8( v1, v0 )
+#define v128_unpackhi8( v1, v0 )    vzip2q_u8( v1, v0 )
 
 // AES
 
@@ -184,19 +185,19 @@ static inline uint64x2_t v128_mulw32( uint32x4_t v1, uint32x4_t v0 )
 #define v128_aesenclast( v, k ) \
    v128_xor( k, vaeseq_u8( v, v128_zero ) )
 
-#define v128_aesenclast_nokey( v, k ) \
+#define v128_aesenclast_nokey( v ) \
    vaeseq_u8( v, v128_zero )
 
 #define v128_aesdec( v, k ) \
    v128_xor( k, vaesimcq_u8( vaesdq_u8( v, v128_zero ) ) )
 
-#define v128_aesdec_nokey( v, k ) \
+#define v128_aesdec_nokey( v ) \
    vaesimcq_u8( vaesdq_u8( v, v128_zero ) )
 
 #define v128_aesdeclast( v, k ) \
    v128_xor( k, vaesdq_u8( v, v128_zero ) )
 
-#define v128_aesdeclast_nokey( v, k ) \
+#define v128_aesdeclast_nokey( v ) \
    vaesdq_u8( v, v128_zero )
 
@@ -433,6 +434,7 @@ static inline void v128_memcpy( void *dst, const void *src, const int n )
                     ((uint8_t*)&v)[ ((uint8_t*)(&vmask))[ 1] ], \
                     ((uint8_t*)&v)[ ((uint8_t*)(&vmask))[ 0] ] )
 
+
 // sub-vector shuffles sometimes mirror bit rotation. Shuffle is faster.
 // Bit rotation already promotes faster widths. Usage is context sensitive.
 // preferred.
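Note: the *_nokey macros above drop the unused key parameter, so NEON call sites now pass a single operand. A standalone sketch of what the enc-last form expands to and how a call site looks after the change, not taken from the patch (the helper name is hypothetical; requires ARMv8 Crypto, e.g. -march=armv8-a+crypto):

#include <stdio.h>
#include <stdint.h>
#include <arm_neon.h>

// What the reworked enc-last macro expands to: one AES SubBytes + ShiftRows
// step with a zero round key folded in, and no separate key argument.
static inline uint8x16_t aesenclast_nokey( uint8x16_t v )
{
   return vaeseq_u8( v, vdupq_n_u8( 0 ) );
}

int main(void)
{
   uint8_t buf[16] = { 0 };
   uint8x16_t s = vld1q_u8( buf );

   s = aesenclast_nokey( s );   // new call shape: no key operand
   // the pre-change shape was aesenclast_nokey( s, k ) with k ignored

   vst1q_u8( buf, s );
   for ( int i = 0; i < 16; i++ ) printf( "%02x", buf[i] );
   printf( "\n" );   // S-box of 0x00 is 0x63, so every byte prints 63
   return 0;
}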