diff --git a/Makefile.am b/Makefile.am index db71cf12..82eeb6f6 100644 --- a/Makefile.am +++ b/Makefile.am @@ -289,7 +289,7 @@ cpuminer_SOURCES = \ algo/yescrypt/yescrypt-best.c \ algo/yespower/yespower-gate.c \ algo/yespower/yespower-blake2b.c \ - algo/yespower/crypto/blake2b-yp.c \ + algo/yespower/crypto/hmac-blake2b.c \ algo/yespower/yescrypt-r8g.c \ algo/yespower/yespower-opt.c diff --git a/RELEASE_NOTES b/RELEASE_NOTES index ef93a255..5e3c78b2 100644 --- a/RELEASE_NOTES +++ b/RELEASE_NOTES @@ -65,6 +65,15 @@ If not what makes it happen or not happen? Change Log ---------- +v3.20.1 + +sph_blake2b optimized 1-way SSSE3 & AVX2. +Removed duplicate Blake2b used by Power2b algo, will now use optimized sph_blake2b. +Removed imprecise hash & target display from rejected share log. +Share and target difficulty is now displayed only for low diificulty shares. +Updated configure.ac to check for AVX512 asm support. +Small optimization to Lyra2 SSE2. + v3.20.0 #375 Fixed segfault in algos using Groestl VAES due to use of uninitialized data. diff --git a/algo/blake/blake2b-hash-4way.c b/algo/blake/blake2b-hash-4way.c index f4824434..d04601f3 100644 --- a/algo/blake/blake2b-hash-4way.c +++ b/algo/blake/blake2b-hash-4way.c @@ -52,6 +52,180 @@ static const uint8_t sigma[12][16] = }; +#define Z00 0 +#define Z01 1 +#define Z02 2 +#define Z03 3 +#define Z04 4 +#define Z05 5 +#define Z06 6 +#define Z07 7 +#define Z08 8 +#define Z09 9 +#define Z0A A +#define Z0B B +#define Z0C C +#define Z0D D +#define Z0E E +#define Z0F F + +#define Z10 E +#define Z11 A +#define Z12 4 +#define Z13 8 +#define Z14 9 +#define Z15 F +#define Z16 D +#define Z17 6 +#define Z18 1 +#define Z19 C +#define Z1A 0 +#define Z1B 2 +#define Z1C B +#define Z1D 7 +#define Z1E 5 +#define Z1F 3 + +#define Z20 B +#define Z21 8 +#define Z22 C +#define Z23 0 +#define Z24 5 +#define Z25 2 +#define Z26 F +#define Z27 D +#define Z28 A +#define Z29 E +#define Z2A 3 +#define Z2B 6 +#define Z2C 7 +#define Z2D 1 +#define Z2E 9 +#define Z2F 4 + +#define Z30 7 +#define Z31 9 +#define Z32 3 +#define Z33 1 +#define Z34 D +#define Z35 C +#define Z36 B +#define Z37 E +#define Z38 2 +#define Z39 6 +#define Z3A 5 +#define Z3B A +#define Z3C 4 +#define Z3D 0 +#define Z3E F +#define Z3F 8 + +#define Z40 9 +#define Z41 0 +#define Z42 5 +#define Z43 7 +#define Z44 2 +#define Z45 4 +#define Z46 A +#define Z47 F +#define Z48 E +#define Z49 1 +#define Z4A B +#define Z4B C +#define Z4C 6 +#define Z4D 8 +#define Z4E 3 +#define Z4F D + +#define Z50 2 +#define Z51 C +#define Z52 6 +#define Z53 A +#define Z54 0 +#define Z55 B +#define Z56 8 +#define Z57 3 +#define Z58 4 +#define Z59 D +#define Z5A 7 +#define Z5B 5 +#define Z5C F +#define Z5D E +#define Z5E 1 +#define Z5F 9 + +#define Z60 C +#define Z61 5 +#define Z62 1 +#define Z63 F +#define Z64 E +#define Z65 D +#define Z66 4 +#define Z67 A +#define Z68 0 +#define Z69 7 +#define Z6A 6 +#define Z6B 3 +#define Z6C 9 +#define Z6D 2 +#define Z6E 8 +#define Z6F B + +#define Z70 D +#define Z71 B +#define Z72 7 +#define Z73 E +#define Z74 C +#define Z75 1 +#define Z76 3 +#define Z77 9 +#define Z78 5 +#define Z79 0 +#define Z7A F +#define Z7B 4 +#define Z7C 8 +#define Z7D 6 +#define Z7E 2 +#define Z7F A + +#define Z80 6 +#define Z81 F +#define Z82 E +#define Z83 9 +#define Z84 B +#define Z85 3 +#define Z86 0 +#define Z87 8 +#define Z88 C +#define Z89 2 +#define Z8A D +#define Z8B 7 +#define Z8C 1 +#define Z8D 4 +#define Z8E A +#define Z8F 5 + +#define Z90 A +#define Z91 2 +#define Z92 8 +#define Z93 4 +#define Z94 7 +#define Z95 6 +#define Z96 1 +#define Z97 5 +#define Z98 F +#define Z99 B +#define Z9A 9 +#define Z9B E +#define Z9C 3 +#define Z9D C +#define Z9E D +#define Z9F 0 + +#define Mx(r, i) Mx_(Z ## r ## i) +#define Mx_(n) Mx__(n) +#define Mx__(n) M ## n + #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) #define B2B8W_G(a, b, c, d, x, y) \ diff --git a/algo/blake/blake512-hash-4way.c b/algo/blake/blake512-hash-4way.c index 246e3af0..5c947e18 100644 --- a/algo/blake/blake512-hash-4way.c +++ b/algo/blake/blake512-hash-4way.c @@ -757,7 +757,6 @@ void blake512_8way_final_le( blake_8way_big_context *sc, void *hash, GB_8WAY(Mx(1, C), Mx(1, D), CBx(1, C), CBx(1, D), V2, V7, V8, VD); GB_8WAY(Mx(1, E), Mx(1, F), CBx(1, E), CBx(1, F), V3, V4, V9, VE); - // remaining rounds ROUND_B_8WAY(2); ROUND_B_8WAY(3); diff --git a/algo/blake/sph_blake2b.c b/algo/blake/sph_blake2b.c index a17d7d75..9e13fe4f 100644 --- a/algo/blake/sph_blake2b.c +++ b/algo/blake/sph_blake2b.c @@ -30,16 +30,10 @@ #include #include #include - +#include "simd-utils.h" #include "algo/sha/sph_types.h" #include "sph_blake2b.h" -// Cyclic right rotation. - -#ifndef ROTR64 -#define ROTR64(x, y) (((x) >> (y)) ^ ((x) << (64 - (y)))) -#endif - // Little-endian byte access. #define B2B_GET64(p) \ @@ -54,45 +48,131 @@ // G Mixing function. -#define B2B_G(a, b, c, d, x, y) { \ - v[a] = v[a] + v[b] + x; \ - v[d] = ROTR64(v[d] ^ v[a], 32); \ - v[c] = v[c] + v[d]; \ - v[b] = ROTR64(v[b] ^ v[c], 24); \ - v[a] = v[a] + v[b] + y; \ - v[d] = ROTR64(v[d] ^ v[a], 16); \ - v[c] = v[c] + v[d]; \ - v[b] = ROTR64(v[b] ^ v[c], 63); } +#if defined(__AVX2__) + +#define BLAKE2B_G( R, Sa, Sb, Sc, Sd, Na, Nb ) \ +{ \ + V[0] = _mm256_add_epi64( V[0], _mm256_add_epi64( V[1], \ + _mm256_set_epi64x( m[ sigma[R][Sd] ], m[ sigma[R][Sc] ], \ + m[ sigma[R][Sb] ], m[ sigma[R][Sa] ] ) ) ); \ + V[3] = mm256_ror_64( _mm256_xor_si256( V[3], V[0] ), Na ); \ + V[2] = _mm256_add_epi64( V[2], V[3] ); \ + V[1] = mm256_ror_64( _mm256_xor_si256( V[1], V[2] ), Nb ); \ +} + +#define BLAKE2B_ROUND( R ) \ +{ \ + __m256i *V = (__m256i*)v; \ + BLAKE2B_G( R, 0, 2, 4, 6, 32, 24 ); \ + BLAKE2B_G( R, 1, 3, 5, 7, 16, 63 ); \ + V[3] = mm256_shufll_64( V[3] ); \ + V[2] = mm256_swap_128( V[2] ); \ + V[1] = mm256_shuflr_64( V[1] ); \ + BLAKE2B_G( R, 8, 10, 12, 14, 32, 24 ); \ + BLAKE2B_G( R, 9, 11, 13, 15, 16, 63 ); \ + V[3] = mm256_shuflr_64( V[3] ); \ + V[2] = mm256_swap_128( V[2] ); \ + V[1] = mm256_shufll_64( V[1] ); \ +} + +#elif defined(__SSSE3__) + +#define BLAKE2B_G( R, Va, Vb, Vc, Vd, Sa, Sb, Na, Nb ) \ +{ \ + Va = _mm_add_epi64( Va, _mm_add_epi64( Vb, \ + _mm_set_epi64x( m[ sigma[R][Sb] ], m[ sigma[R][Sa] ] ) ) ); \ + Vd = mm128_ror_64( _mm_xor_si128( Vd, Va ), Na ); \ + Vc = _mm_add_epi64( Vc, Vd ); \ + Vb = mm128_ror_64( _mm_xor_si128( Vb, Vc ), Nb ); \ +} + +#define BLAKE2B_ROUND( R ) \ +{ \ + __m128i *V = (__m128i*)v; \ + __m128i V2, V3, V6, V7; \ + BLAKE2B_G( R, V[0], V[2], V[4], V[6], 0, 2, 32, 24 ); \ + BLAKE2B_G( R, V[0], V[2], V[4], V[6], 1, 3, 16, 63 ); \ + BLAKE2B_G( R, V[1], V[3], V[5], V[7], 4, 6, 32, 24 ); \ + BLAKE2B_G( R, V[1], V[3], V[5], V[7], 5, 7, 16, 63 ); \ + V2 = mm128_shufl2r_64( V[2], V[3] ); \ + V3 = mm128_shufl2r_64( V[3], V[2] ); \ + V6 = mm128_shufl2l_64( V[6], V[7] ); \ + V7 = mm128_shufl2l_64( V[7], V[6] ); \ + BLAKE2B_G( R, V[0], V2, V[5], V6, 8, 10, 32, 24 ); \ + BLAKE2B_G( R, V[0], V2, V[5], V6, 9, 11, 16, 63 ); \ + BLAKE2B_G( R, V[1], V3, V[4], V7, 12, 14, 32, 24 ); \ + BLAKE2B_G( R, V[1], V3, V[4], V7, 13, 15, 16, 63 ); \ + V[2] = mm128_shufl2l_64( V2, V3 ); \ + V[3] = mm128_shufl2l_64( V3, V2 ); \ + V[6] = mm128_shufl2r_64( V6, V7 ); \ + V[7] = mm128_shufl2r_64( V7, V6 ); \ +} + +#else + +#ifndef ROTR64 +#define ROTR64(x, y) (((x) >> (y)) ^ ((x) << (64 - (y)))) +#endif + +#define BLAKE2B_G( R, Va, Vb, Vc, Vd, Sa, Sb ) \ +{ \ + Va = Va + Vb + m[ sigma[R][Sa] ]; \ + Vd = ROTR64( Vd ^ Va, 32 ); \ + Vc = Vc + Vd; \ + Vb = ROTR64( Vb ^ Vc, 24 ); \ + Va = Va + Vb + m[ sigma[R][Sb] ]; \ + Vd = ROTR64( Vd ^ Va, 16 ); \ + Vc = Vc + Vd; \ + Vb = ROTR64( Vb ^ Vc, 63 ); \ +} + +#define BLAKE2B_ROUND( R ) \ +{ \ + BLAKE2B_G( R, v[ 0], v[ 4], v[ 8], v[12], 0, 1 ); \ + BLAKE2B_G( R, v[ 1], v[ 5], v[ 9], v[13], 2, 3 ); \ + BLAKE2B_G( R, v[ 2], v[ 6], v[10], v[14], 4, 5 ); \ + BLAKE2B_G( R, v[ 3], v[ 7], v[11], v[15], 6, 7 ); \ + BLAKE2B_G( R, v[ 0], v[ 5], v[10], v[15], 8, 9 ); \ + BLAKE2B_G( R, v[ 1], v[ 6], v[11], v[12], 10, 11 ); \ + BLAKE2B_G( R, v[ 2], v[ 7], v[ 8], v[13], 12, 13 ); \ + BLAKE2B_G( R, v[ 3], v[ 4], v[ 9], v[14], 14, 15 ); \ +} + +#endif // Initialization Vector. -static const uint64_t blake2b_iv[8] = { +static const uint64_t blake2b_iv[8] __attribute__ ((aligned (32))) = +{ 0x6A09E667F3BCC908, 0xBB67AE8584CAA73B, 0x3C6EF372FE94F82B, 0xA54FF53A5F1D36F1, 0x510E527FADE682D1, 0x9B05688C2B3E6C1F, 0x1F83D9ABFB41BD6B, 0x5BE0CD19137E2179 }; +static const uint8_t sigma[12][16] __attribute__ ((aligned (32))) = +{ + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, + { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 }, + { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 }, + { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 }, + { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 }, + { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 }, + { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 }, + { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 }, + { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 }, + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 } +}; + // Compression function. "last" flag indicates last block. static void blake2b_compress( sph_blake2b_ctx *ctx, int last ) { - const uint8_t sigma[12][16] = { - { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, - { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, - { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 }, - { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 }, - { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 }, - { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 }, - { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 }, - { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 }, - { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 }, - { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 }, - { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, - { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 } - }; - int i; - uint64_t v[16], m[16]; + uint64_t v[16] __attribute__ ((aligned (32))); + uint64_t m[16] __attribute__ ((aligned (32))); + int i; for (i = 0; i < 8; i++) { // init work variables v[i] = ctx->h[i]; @@ -106,16 +186,8 @@ static void blake2b_compress( sph_blake2b_ctx *ctx, int last ) for (i = 0; i < 16; i++) // get little-endian words m[i] = B2B_GET64(&ctx->b[8 * i]); - for (i = 0; i < 12; i++) { // twelve rounds - B2B_G( 0, 4, 8, 12, m[sigma[i][ 0]], m[sigma[i][ 1]]); - B2B_G( 1, 5, 9, 13, m[sigma[i][ 2]], m[sigma[i][ 3]]); - B2B_G( 2, 6, 10, 14, m[sigma[i][ 4]], m[sigma[i][ 5]]); - B2B_G( 3, 7, 11, 15, m[sigma[i][ 6]], m[sigma[i][ 7]]); - B2B_G( 0, 5, 10, 15, m[sigma[i][ 8]], m[sigma[i][ 9]]); - B2B_G( 1, 6, 11, 12, m[sigma[i][10]], m[sigma[i][11]]); - B2B_G( 2, 7, 8, 13, m[sigma[i][12]], m[sigma[i][13]]); - B2B_G( 3, 4, 9, 14, m[sigma[i][14]], m[sigma[i][15]]); - } + for (i = 0; i < 12; i++) + BLAKE2B_ROUND( i ); for( i = 0; i < 8; ++i ) ctx->h[i] ^= v[i] ^ v[i + 8]; diff --git a/algo/lyra2/sponge.h b/algo/lyra2/sponge.h index 1c904447..2385640e 100644 --- a/algo/lyra2/sponge.h +++ b/algo/lyra2/sponge.h @@ -150,12 +150,10 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){ G_2X64( s1, s3, s5, s7 ); \ mm128_vrol256_64( s6, s7 ); \ mm128_vror256_64( s2, s3 ); \ - mm128_swap256_128( s4, s5 ); \ - G_2X64( s0, s2, s4, s6 ); \ - G_2X64( s1, s3, s5, s7 ); \ + G_2X64( s0, s2, s5, s6 ); \ + G_2X64( s1, s3, s4, s7 ); \ mm128_vror256_64( s6, s7 ); \ - mm128_vrol256_64( s2, s3 ); \ - mm128_swap256_128( s4, s5 ); + mm128_vrol256_64( s2, s3 ); #define LYRA_12_ROUNDS_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \ LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \ diff --git a/algo/yespower/crypto/blake2b-yp.c b/algo/yespower/crypto/blake2b-yp.c deleted file mode 100644 index dc6eee6a..00000000 --- a/algo/yespower/crypto/blake2b-yp.c +++ /dev/null @@ -1,323 +0,0 @@ -/* - * Copyright 2009 Colin Percival, 2014 savale - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * This file was originally written by Colin Percival as part of the Tarsnap - * online backup system. - */ - -#include -#include -#include -#include "simd-utils.h" -#include -#include "blake2b-yp.h" - -// Cyclic right rotation. -//#ifndef ROTR64 -//#define ROTR64(x, y) (((x) >> (y)) ^ ((x) << (64 - (y)))) -//#endif - -#define ROTR64(x, y) ror64( x, y ) - -// Little-endian byte access. -#define B2B_GET64(p) \ - (((uint64_t) ((uint8_t *) (p))[0]) ^ \ - (((uint64_t) ((uint8_t *) (p))[1]) << 8) ^ \ - (((uint64_t) ((uint8_t *) (p))[2]) << 16) ^ \ - (((uint64_t) ((uint8_t *) (p))[3]) << 24) ^ \ - (((uint64_t) ((uint8_t *) (p))[4]) << 32) ^ \ - (((uint64_t) ((uint8_t *) (p))[5]) << 40) ^ \ - (((uint64_t) ((uint8_t *) (p))[6]) << 48) ^ \ - (((uint64_t) ((uint8_t *) (p))[7]) << 56)) - -// G Mixing function. -#define B2B_G(a, b, c, d, x, y) { \ - v[a] = v[a] + v[b] + x; \ - v[d] = ROTR64(v[d] ^ v[a], 32); \ - v[c] = v[c] + v[d]; \ - v[b] = ROTR64(v[b] ^ v[c], 24); \ - v[a] = v[a] + v[b] + y; \ - v[d] = ROTR64(v[d] ^ v[a], 16); \ - v[c] = v[c] + v[d]; \ - v[b] = ROTR64(v[b] ^ v[c], 63); } - -// Initialization Vector. -static const uint64_t blake2b_iv[8] = { - 0x6A09E667F3BCC908, 0xBB67AE8584CAA73B, - 0x3C6EF372FE94F82B, 0xA54FF53A5F1D36F1, - 0x510E527FADE682D1, 0x9B05688C2B3E6C1F, - 0x1F83D9ABFB41BD6B, 0x5BE0CD19137E2179 -}; - -// Compression function. "last" flag indicates last block. -static void blake2b_compress(blake2b_yp_ctx *ctx, int last) -{ - const uint8_t sigma[12][16] = { - { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, - { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, - { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 }, - { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 }, - { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 }, - { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 }, - { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 }, - { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 }, - { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 }, - { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 }, - { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, - { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 } - }; - int i; - uint64_t v[16], m[16]; - - // init work variables - for (i = 0; i < 8; i++) { - v[i] = ctx->h[i]; - v[i + 8] = blake2b_iv[i]; - } - - v[12] ^= ctx->t[0]; // low 64 bits of offset - v[13] ^= ctx->t[1]; // high 64 bits - - // last block flag set ? - if (last) { - v[14] = ~v[14]; - } - - // get little-endian words - for (i = 0; i < 16; i++) { - m[i] = B2B_GET64(&ctx->b[8 * i]); - } - - // twelve rounds - for (i = 0; i < 12; i++) { - B2B_G( 0, 4, 8, 12, m[sigma[i][ 0]], m[sigma[i][ 1]]); - B2B_G( 1, 5, 9, 13, m[sigma[i][ 2]], m[sigma[i][ 3]]); - B2B_G( 2, 6, 10, 14, m[sigma[i][ 4]], m[sigma[i][ 5]]); - B2B_G( 3, 7, 11, 15, m[sigma[i][ 6]], m[sigma[i][ 7]]); - B2B_G( 0, 5, 10, 15, m[sigma[i][ 8]], m[sigma[i][ 9]]); - B2B_G( 1, 6, 11, 12, m[sigma[i][10]], m[sigma[i][11]]); - B2B_G( 2, 7, 8, 13, m[sigma[i][12]], m[sigma[i][13]]); - B2B_G( 3, 4, 9, 14, m[sigma[i][14]], m[sigma[i][15]]); - } - - for(i = 0; i < 8; ++i) { - ctx->h[i] ^= v[i] ^ v[i + 8]; - } -} - -// Initialize the hashing context "ctx" with optional key "key". -// 1 <= outlen <= 64 gives the digest size in bytes. -// Secret key (also <= 64 bytes) is optional (keylen = 0). -int blake2b_yp_init(blake2b_yp_ctx *ctx, size_t outlen, - const void *key, size_t keylen) // (keylen=0: no key) -{ - size_t i; - - // illegal parameters - if (outlen == 0 || outlen > 64 || keylen > 64) { - return -1; - } - - // state, "param block" - for (i = 0; i < 8; i++) { - ctx->h[i] = blake2b_iv[i]; - } - - ctx->h[0] ^= 0x01010000 ^ (keylen << 8) ^ outlen; - - ctx->t[0] = 0; // input count low word - ctx->t[1] = 0; // input count high word - ctx->c = 0; // pointer within buffer - ctx->outlen = outlen; - - // zero input block - for (i = keylen; i < 128; i++) { - ctx->b[i] = 0; - } - - if (keylen > 0) { - blake2b_yp_update(ctx, key, keylen); - ctx->c = 128; // at the end - } - - return 0; -} - -// Add "inlen" bytes from "in" into the hash. -void blake2b_yp_update(blake2b_yp_ctx *ctx, - const void *in, size_t inlen) // data bytes -{ - size_t i; - for (i = 0; i < inlen; i++) { - if (ctx->c == 128) { // buffer full ? - ctx->t[0] += ctx->c; // add counters - if (ctx->t[0] < ctx->c) // carry overflow ? - ctx->t[1]++; // high word - blake2b_compress(ctx, 0); // compress (not last) - ctx->c = 0; // counter to zero - } - ctx->b[ctx->c++] = ((const uint8_t *) in)[i]; - } -} - -// Generate the message digest (size given in init). -// Result placed in "out". -void blake2b_yp_final(blake2b_yp_ctx *ctx, void *out) -{ - size_t i; - - ctx->t[0] += ctx->c; // mark last block offset - // carry overflow - if (ctx->t[0] < ctx->c) { - ctx->t[1]++; // high word - } - - // fill up with zeros - while (ctx->c < 128) { - ctx->b[ctx->c++] = 0; - } - - blake2b_compress(ctx, 1); // final block flag = 1 - - // little endian convert and store - for (i = 0; i < ctx->outlen; i++) { - ((uint8_t *) out)[i] = - (ctx->h[i >> 3] >> (8 * (i & 7))) & 0xFF; - } -} - -// inlen = number of bytes -void blake2b_yp_hash(void *out, const void *in, size_t inlen) { - blake2b_yp_ctx ctx; - blake2b_yp_init(&ctx, 32, NULL, 0); - blake2b_yp_update(&ctx, in, inlen); - blake2b_yp_final(&ctx, out); -} - -// // keylen = number of bytes -void hmac_blake2b_yp_init(hmac_yp_ctx *hctx, const void *_key, size_t keylen) { - const uint8_t *key = _key; - uint8_t keyhash[32]; - uint8_t pad[64]; - uint64_t i; - - if (keylen > 64) { - blake2b_yp_hash(keyhash, key, keylen); - key = keyhash; - keylen = 32; - } - - blake2b_yp_init(&hctx->inner, 32, NULL, 0); - memset(pad, 0x36, 64); - for (i = 0; i < keylen; ++i) { - pad[i] ^= key[i]; - } - - blake2b_yp_update(&hctx->inner, pad, 64); - blake2b_yp_init(&hctx->outer, 32, NULL, 0); - memset(pad, 0x5c, 64); - for (i = 0; i < keylen; ++i) { - pad[i] ^= key[i]; - } - - blake2b_yp_update(&hctx->outer, pad, 64); - memset(keyhash, 0, 32); -} - -// datalen = number of bits -void hmac_blake2b_yp_update(hmac_yp_ctx *hctx, const void *data, size_t datalen) { - // update the inner state - blake2b_yp_update(&hctx->inner, data, datalen); -} - -void hmac_blake2b_yp_final(hmac_yp_ctx *hctx, uint8_t *digest) { - uint8_t ihash[32]; - blake2b_yp_final(&hctx->inner, ihash); - blake2b_yp_update(&hctx->outer, ihash, 32); - blake2b_yp_final(&hctx->outer, digest); - memset(ihash, 0, 32); -} - -// // keylen = number of bytes; inlen = number of bytes -void hmac_blake2b_yp_hash(void *out, const void *key, size_t keylen, const void *in, size_t inlen) { - hmac_yp_ctx hctx; - hmac_blake2b_yp_init(&hctx, key, keylen); - hmac_blake2b_yp_update(&hctx, in, inlen); - hmac_blake2b_yp_final(&hctx, out); -} - -void pbkdf2_blake2b_yp(const uint8_t * passwd, size_t passwdlen, const uint8_t * salt, - size_t saltlen, uint64_t c, uint8_t * buf, size_t dkLen) -{ - hmac_yp_ctx PShctx, hctx; - size_t i; - uint32_t ivec; - uint8_t U[32]; - uint8_t T[32]; - uint64_t j; - int k; - size_t clen; - - /* Compute HMAC state after processing P and S. */ - hmac_blake2b_yp_init(&PShctx, passwd, passwdlen); - hmac_blake2b_yp_update(&PShctx, salt, saltlen); - - /* Iterate through the blocks. */ - for (i = 0; i * 32 < dkLen; i++) { - /* Generate INT(i + 1). */ - ivec = bswap_32( i+1 ); - - /* Compute U_1 = PRF(P, S || INT(i)). */ - memcpy(&hctx, &PShctx, sizeof(hmac_yp_ctx)); - hmac_blake2b_yp_update(&hctx, &ivec, 4); - hmac_blake2b_yp_final(&hctx, U); - - /* T_i = U_1 ... */ - memcpy(T, U, 32); - - for (j = 2; j <= c; j++) { - /* Compute U_j. */ - hmac_blake2b_yp_init(&hctx, passwd, passwdlen); - hmac_blake2b_yp_update(&hctx, U, 32); - hmac_blake2b_yp_final(&hctx, U); - - /* ... xor U_j ... */ - for (k = 0; k < 32; k++) { - T[k] ^= U[k]; - } - } - - /* Copy as many bytes as necessary into buf. */ - clen = dkLen - i * 32; - if (clen > 32) { - clen = 32; - } - - memcpy(&buf[i * 32], T, clen); - } - - /* Clean PShctx, since we never called _Final on it. */ - memset(&PShctx, 0, sizeof(hmac_yp_ctx)); -} diff --git a/algo/yespower/crypto/blake2b-yp.h b/algo/yespower/crypto/blake2b-yp.h deleted file mode 100644 index a240bc6a..00000000 --- a/algo/yespower/crypto/blake2b-yp.h +++ /dev/null @@ -1,42 +0,0 @@ -#pragma once -#ifndef __BLAKE2B_H__ -#define __BLAKE2B_H__ - -#include -#include - -#if defined(_MSC_VER) || defined(__x86_64__) || defined(__x86__) -#define NATIVE_LITTLE_ENDIAN -#endif - -// state context -typedef struct { - uint8_t b[128]; // input buffer - uint64_t h[8]; // chained state - uint64_t t[2]; // total number of bytes - size_t c; // pointer for b[] - size_t outlen; // digest size -} blake2b_yp_ctx; - -typedef struct { - blake2b_yp_ctx inner; - blake2b_yp_ctx outer; -} hmac_yp_ctx; - -#if defined(__cplusplus) -extern "C" { -#endif - -int blake2b_yp_init(blake2b_yp_ctx *ctx, size_t outlen, const void *key, size_t keylen); -void blake2b_yp_update(blake2b_yp_ctx *ctx, const void *in, size_t inlen); -void blake2b_yp_final(blake2b_yp_ctx *ctx, void *out); -void blake2b_yp_hash(void *out, const void *in, size_t inlen); -void hmac_blake2b_yp_hash(void *out, const void *key, size_t keylen, const void *in, size_t inlen); -void pbkdf2_blake2b_yp(const uint8_t * passwd, size_t passwdlen, const uint8_t * salt, - size_t saltlen, uint64_t c, uint8_t * buf, size_t dkLen); - -#if defined(__cplusplus) -} -#endif - -#endif diff --git a/algo/yespower/crypto/hmac-blake2b.c b/algo/yespower/crypto/hmac-blake2b.c new file mode 100644 index 00000000..83b9e58a --- /dev/null +++ b/algo/yespower/crypto/hmac-blake2b.c @@ -0,0 +1,150 @@ +/* + * Copyright 2009 Colin Percival, 2014 savale + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * This file was originally written by Colin Percival as part of the Tarsnap + * online backup system. + */ + +#include +#include +#include +#include "simd-utils.h" +#include "hmac-blake2b.h" + +// keylen = number of bytes +void hmac_blake2b_init( hmac_blake2b_ctx *hctx, const void *_key, + size_t keylen ) +{ + const uint8_t *key = _key; + uint8_t keyhash[32]; + uint8_t pad[64]; + uint64_t i; + + if (keylen > 64) + { + sph_blake2b_ctx ctx; + sph_blake2b_init( &ctx, 32, NULL, 0 ); + sph_blake2b_update( &ctx, key, keylen ); + sph_blake2b_final( &ctx, keyhash ); + key = keyhash; + keylen = 32; + } + + sph_blake2b_init( &hctx->inner, 32, NULL, 0 ); + memset( pad, 0x36, 64 ); + for ( i = 0; i < keylen; ++i ) + pad[i] ^= key[i]; + + sph_blake2b_update( &hctx->inner, pad, 64 ); + sph_blake2b_init( &hctx->outer, 32, NULL, 0 ); + memset( pad, 0x5c, 64 ); + for ( i = 0; i < keylen; ++i ) + pad[i] ^= key[i]; + + sph_blake2b_update( &hctx->outer, pad, 64 ); + memset( keyhash, 0, 32 ); +} + +// datalen = number of bits +void hmac_blake2b_update( hmac_blake2b_ctx *hctx, const void *data, + size_t datalen ) +{ + // update the inner state + sph_blake2b_update( &hctx->inner, data, datalen ); +} + +void hmac_blake2b_final( hmac_blake2b_ctx *hctx, uint8_t *digest ) +{ + uint8_t ihash[32]; + sph_blake2b_final( &hctx->inner, ihash ); + sph_blake2b_update( &hctx->outer, ihash, 32 ); + sph_blake2b_final( &hctx->outer, digest ); + memset( ihash, 0, 32 ); +} + +// // keylen = number of bytes; inlen = number of bytes +void hmac_blake2b_hash( void *out, const void *key, size_t keylen, + const void *in, size_t inlen ) +{ + hmac_blake2b_ctx hctx; + hmac_blake2b_init( &hctx, key, keylen ); + hmac_blake2b_update( &hctx, in, inlen ); + hmac_blake2b_final( &hctx, out ); +} + +void pbkdf2_blake2b( const uint8_t *passwd, size_t passwdlen, + const uint8_t *salt, size_t saltlen, uint64_t c, + uint8_t *buf, size_t dkLen ) +{ + hmac_blake2b_ctx PShctx, hctx; + size_t i; + uint32_t ivec; + uint8_t U[32]; + uint8_t T[32]; + uint64_t j; + int k; + size_t clen; + + /* Compute HMAC state after processing P and S. */ + hmac_blake2b_init( &PShctx, passwd, passwdlen ); + hmac_blake2b_update( &PShctx, salt, saltlen ); + + /* Iterate through the blocks. */ + for ( i = 0; i * 32 < dkLen; i++ ) + { + /* Generate INT(i + 1). */ + ivec = bswap_32( i+1 ); + + /* Compute U_1 = PRF(P, S || INT(i)). */ + memcpy( &hctx, &PShctx, sizeof(hmac_blake2b_ctx) ); + hmac_blake2b_update( &hctx, &ivec, 4 ); + hmac_blake2b_final( &hctx, U ); + + /* T_i = U_1 ... */ + memcpy( T, U, 32 ); + + for ( j = 2; j <= c; j++ ) + { + /* Compute U_j. */ + hmac_blake2b_init( &hctx, passwd, passwdlen ); + hmac_blake2b_update( &hctx, U, 32 ); + hmac_blake2b_final( &hctx, U ); + + /* ... xor U_j ... */ + for ( k = 0; k < 32; k++ ) + T[k] ^= U[k]; + } + + /* Copy as many bytes as necessary into buf. */ + clen = dkLen - i * 32; + if (clen > 32) + clen = 32; + + memcpy( &buf[i * 32], T, clen ); + } + + /* Clean PShctx, since we never called _Final on it. */ + memset( &PShctx, 0, sizeof(hmac_blake2b_ctx) ); +} diff --git a/algo/yespower/crypto/hmac-blake2b.h b/algo/yespower/crypto/hmac-blake2b.h new file mode 100644 index 00000000..d90b6438 --- /dev/null +++ b/algo/yespower/crypto/hmac-blake2b.h @@ -0,0 +1,34 @@ +#pragma once +#ifndef __HMAC_BLAKE2B_H__ +#define __HMAC_BLAKE2B_H__ + +#include +#include +#include "algo/blake/sph_blake2b.h" + +#if defined(_MSC_VER) || defined(__x86_64__) || defined(__x86__) +#define NATIVE_LITTLE_ENDIAN +#endif + +typedef struct +{ + sph_blake2b_ctx inner; + sph_blake2b_ctx outer; +} hmac_blake2b_ctx; + +#if defined(__cplusplus) +extern "C" { +#endif + +void hmac_blake2b_hash( void *out, const void *key, size_t keylen, + const void *in, size_t inlen ); + +void pbkdf2_blake2b( const uint8_t * passwd, size_t passwdlen, + const uint8_t * salt, size_t saltlen, uint64_t c, + uint8_t * buf, size_t dkLen ); + +#if defined(__cplusplus) +} +#endif + +#endif diff --git a/algo/yespower/crypto/sph_types.h b/algo/yespower/crypto/sph_types.h deleted file mode 100644 index cef79bde..00000000 --- a/algo/yespower/crypto/sph_types.h +++ /dev/null @@ -1,1976 +0,0 @@ -/* $Id: sph_types.h 260 2011-07-21 01:02:38Z tp $ */ -/** - * Basic type definitions. - * - * This header file defines the generic integer types that will be used - * for the implementation of hash functions; it also contains helper - * functions which encode and decode multi-byte integer values, using - * either little-endian or big-endian conventions. - * - * This file contains a compile-time test on the size of a byte - * (the unsigned char C type). If bytes are not octets, - * i.e. if they do not have a size of exactly 8 bits, then compilation - * is aborted. Architectures where bytes are not octets are relatively - * rare, even in the embedded devices market. We forbid non-octet bytes - * because there is no clear convention on how octet streams are encoded - * on such systems. - * - * ==========================(LICENSE BEGIN)============================ - * - * Copyright (c) 2007-2010 Projet RNRT SAPHIR - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - * ===========================(LICENSE END)============================= - * - * @file sph_types.h - * @author Thomas Pornin - */ - -#ifndef SPH_TYPES_H__ -#define SPH_TYPES_H__ - -#include - -/* - * All our I/O functions are defined over octet streams. We do not know - * how to handle input data if bytes are not octets. - */ -#if CHAR_BIT != 8 -#error This code requires 8-bit bytes -#endif - -/* ============= BEGIN documentation block for Doxygen ============ */ - -#ifdef DOXYGEN_IGNORE - -/** @mainpage sphlib C code documentation - * - * @section overview Overview - * - * sphlib is a library which contains implementations of - * various cryptographic hash functions. These pages have been generated - * with doxygen and - * document the API for the C implementations. - * - * The API is described in appropriate header files, which are available - * in the "Files" section. Each hash function family has its own header, - * whose name begins with "sph_" and contains the family - * name. For instance, the API for the RIPEMD hash functions is available - * in the header file sph_ripemd.h. - * - * @section principles API structure and conventions - * - * @subsection io Input/output conventions - * - * In all generality, hash functions operate over strings of bits. - * Individual bits are rarely encountered in C programming or actual - * communication protocols; most protocols converge on the ubiquitous - * "octet" which is a group of eight bits. Data is thus expressed as a - * stream of octets. The C programming language contains the notion of a - * "byte", which is a data unit managed under the type "unsigned - * char". The C standard prescribes that a byte should hold at - * least eight bits, but possibly more. Most modern architectures, even - * in the embedded world, feature eight-bit bytes, i.e. map bytes to - * octets. - * - * Nevertheless, for some of the implemented hash functions, an extra - * API has been added, which allows the input of arbitrary sequences of - * bits: when the computation is about to be closed, 1 to 7 extra bits - * can be added. The functions for which this API is implemented include - * the SHA-2 functions and all SHA-3 candidates. - * - * sphlib defines hash function which may hash octet streams, - * i.e. streams of bits where the number of bits is a multiple of eight. - * The data input functions in the sphlib API expect data - * as anonymous pointers ("const void *") with a length - * (of type "size_t") which gives the input data chunk length - * in bytes. A byte is assumed to be an octet; the sph_types.h - * header contains a compile-time test which prevents compilation on - * architectures where this property is not met. - * - * The hash function output is also converted into bytes. All currently - * implemented hash functions have an output width which is a multiple of - * eight, and this is likely to remain true for new designs. - * - * Most hash functions internally convert input data into 32-bit of 64-bit - * words, using either little-endian or big-endian conversion. The hash - * output also often consists of such words, which are encoded into output - * bytes with a similar endianness convention. Some hash functions have - * been only loosely specified on that subject; when necessary, - * sphlib has been tested against published "reference" - * implementations in order to use the same conventions. - * - * @subsection shortname Function short name - * - * Each implemented hash function has a "short name" which is used - * internally to derive the identifiers for the functions and context - * structures which the function uses. For instance, MD5 has the short - * name "md5". Short names are listed in the next section, - * for the implemented hash functions. In subsequent sections, the - * short name will be assumed to be "XXX": replace with the - * actual hash function name to get the C identifier. - * - * Note: some functions within the same family share the same core - * elements, such as update function or context structure. Correspondingly, - * some of the defined types or functions may actually be macros which - * transparently evaluate to another type or function name. - * - * @subsection context Context structure - * - * Each implemented hash fonction has its own context structure, available - * under the type name "sph_XXX_context" for the hash function - * with short name "XXX". This structure holds all needed - * state for a running hash computation. - * - * The contents of these structures are meant to be opaque, and private - * to the implementation. However, these contents are specified in the - * header files so that application code which uses sphlib - * may access the size of those structures. - * - * The caller is responsible for allocating the context structure, - * whether by dynamic allocation (malloc() or equivalent), - * static allocation (a global permanent variable), as an automatic - * variable ("on the stack"), or by any other mean which ensures proper - * structure alignment. sphlib code performs no dynamic - * allocation by itself. - * - * The context must be initialized before use, using the - * sph_XXX_init() function. This function sets the context - * state to proper initial values for hashing. - * - * Since all state data is contained within the context structure, - * sphlib is thread-safe and reentrant: several hash - * computations may be performed in parallel, provided that they do not - * operate on the same context. Moreover, a running computation can be - * cloned by copying the context (with a simple memcpy()): - * the context and its clone are then independant and may be updated - * with new data and/or closed without interfering with each other. - * Similarly, a context structure can be moved in memory at will: - * context structures contain no pointer, in particular no pointer to - * themselves. - * - * @subsection dataio Data input - * - * Hashed data is input with the sph_XXX() fonction, which - * takes as parameters a pointer to the context, a pointer to the data - * to hash, and the number of data bytes to hash. The context is updated - * with the new data. - * - * Data can be input in one or several calls, with arbitrary input lengths. - * However, it is best, performance wise, to input data by relatively big - * chunks (say a few kilobytes), because this allows sphlib to - * optimize things and avoid internal copying. - * - * When all data has been input, the context can be closed with - * sph_XXX_close(). The hash output is computed and written - * into the provided buffer. The caller must take care to provide a - * buffer of appropriate length; e.g., when using SHA-1, the output is - * a 20-byte word, therefore the output buffer must be at least 20-byte - * long. - * - * For some hash functions, the sph_XXX_addbits_and_close() - * function can be used instead of sph_XXX_close(). This - * function can take a few extra bits to be added at - * the end of the input message. This allows hashing messages with a - * bit length which is not a multiple of 8. The extra bits are provided - * as an unsigned integer value, and a bit count. The bit count must be - * between 0 and 7, inclusive. The extra bits are provided as bits 7 to - * 0 (bits of numerical value 128, 64, 32... downto 0), in that order. - * For instance, to add three bits of value 1, 1 and 0, the unsigned - * integer will have value 192 (1*128 + 1*64 + 0*32) and the bit count - * will be 3. - * - * The SPH_SIZE_XXX macro is defined for each hash function; - * it evaluates to the function output size, expressed in bits. For instance, - * SPH_SIZE_sha1 evaluates to 160. - * - * When closed, the context is automatically reinitialized and can be - * immediately used for another computation. It is not necessary to call - * sph_XXX_init() after a close. Note that - * sph_XXX_init() can still be called to "reset" a context, - * i.e. forget previously input data, and get back to the initial state. - * - * @subsection alignment Data alignment - * - * "Alignment" is a property of data, which is said to be "properly - * aligned" when its emplacement in memory is such that the data can - * be optimally read by full words. This depends on the type of access; - * basically, some hash functions will read data by 32-bit or 64-bit - * words. sphlib does not mandate such alignment for input - * data, but using aligned data can substantially improve performance. - * - * As a rule, it is best to input data by chunks whose length (in bytes) - * is a multiple of eight, and which begins at "generally aligned" - * addresses, such as the base address returned by a call to - * malloc(). - * - * @section functions Implemented functions - * - * We give here the list of implemented functions. They are grouped by - * family; to each family corresponds a specific header file. Each - * individual function has its associated "short name". Please refer to - * the documentation for that header file to get details on the hash - * function denomination and provenance. - * - * Note: the functions marked with a '(64)' in the list below are - * available only if the C compiler provides an integer type of length - * 64 bits or more. Such a type is mandatory in the latest C standard - * (ISO 9899:1999, aka "C99") and is present in several older compilers - * as well, so chances are that such a type is available. - * - * - HAVAL family: file sph_haval.h - * - HAVAL-128/3 (128-bit, 3 passes): short name: haval128_3 - * - HAVAL-128/4 (128-bit, 4 passes): short name: haval128_4 - * - HAVAL-128/5 (128-bit, 5 passes): short name: haval128_5 - * - HAVAL-160/3 (160-bit, 3 passes): short name: haval160_3 - * - HAVAL-160/4 (160-bit, 4 passes): short name: haval160_4 - * - HAVAL-160/5 (160-bit, 5 passes): short name: haval160_5 - * - HAVAL-192/3 (192-bit, 3 passes): short name: haval192_3 - * - HAVAL-192/4 (192-bit, 4 passes): short name: haval192_4 - * - HAVAL-192/5 (192-bit, 5 passes): short name: haval192_5 - * - HAVAL-224/3 (224-bit, 3 passes): short name: haval224_3 - * - HAVAL-224/4 (224-bit, 4 passes): short name: haval224_4 - * - HAVAL-224/5 (224-bit, 5 passes): short name: haval224_5 - * - HAVAL-256/3 (256-bit, 3 passes): short name: haval256_3 - * - HAVAL-256/4 (256-bit, 4 passes): short name: haval256_4 - * - HAVAL-256/5 (256-bit, 5 passes): short name: haval256_5 - * - MD2: file sph_md2.h, short name: md2 - * - MD4: file sph_md4.h, short name: md4 - * - MD5: file sph_md5.h, short name: md5 - * - PANAMA: file sph_panama.h, short name: panama - * - RadioGatun family: file sph_radiogatun.h - * - RadioGatun[32]: short name: radiogatun32 - * - RadioGatun[64]: short name: radiogatun64 (64) - * - RIPEMD family: file sph_ripemd.h - * - RIPEMD: short name: ripemd - * - RIPEMD-128: short name: ripemd128 - * - RIPEMD-160: short name: ripemd160 - * - SHA-0: file sph_sha0.h, short name: sha0 - * - SHA-1: file sph_sha1.h, short name: sha1 - * - SHA-2 family, 32-bit hashes: file sph_sha2.h - * - SHA-224: short name: sha224 - * - SHA-256: short name: sha256 - * - SHA-384: short name: sha384 (64) - * - SHA-512: short name: sha512 (64) - * - Tiger family: file sph_tiger.h - * - Tiger: short name: tiger (64) - * - Tiger2: short name: tiger2 (64) - * - WHIRLPOOL family: file sph_whirlpool.h - * - WHIRLPOOL-0: short name: whirlpool0 (64) - * - WHIRLPOOL-1: short name: whirlpool1 (64) - * - WHIRLPOOL: short name: whirlpool (64) - * - * The fourteen second-round SHA-3 candidates are also implemented; - * when applicable, the implementations follow the "final" specifications - * as published for the third round of the SHA-3 competition (BLAKE, - * Groestl, JH, Keccak and Skein have been tweaked for third round). - * - * - BLAKE family: file sph_blake.h - * - BLAKE-224: short name: blake224 - * - BLAKE-256: short name: blake256 - * - BLAKE-384: short name: blake384 - * - BLAKE-512: short name: blake512 - * - BMW (Blue Midnight Wish) family: file sph_bmw.h - * - BMW-224: short name: bmw224 - * - BMW-256: short name: bmw256 - * - BMW-384: short name: bmw384 (64) - * - BMW-512: short name: bmw512 (64) - * - CubeHash family: file sph_cubehash.h (specified as - * CubeHash16/32 in the CubeHash specification) - * - CubeHash-224: short name: cubehash224 - * - CubeHash-256: short name: cubehash256 - * - CubeHash-384: short name: cubehash384 - * - CubeHash-512: short name: cubehash512 - * - ECHO family: file sph_echo.h - * - ECHO-224: short name: echo224 - * - ECHO-256: short name: echo256 - * - ECHO-384: short name: echo384 - * - ECHO-512: short name: echo512 - * - Fugue family: file sph_fugue.h - * - Fugue-224: short name: fugue224 - * - Fugue-256: short name: fugue256 - * - Fugue-384: short name: fugue384 - * - Fugue-512: short name: fugue512 - * - Groestl family: file sph_groestl.h - * - Groestl-224: short name: groestl224 - * - Groestl-256: short name: groestl256 - * - Groestl-384: short name: groestl384 - * - Groestl-512: short name: groestl512 - * - Hamsi family: file sph_hamsi.h - * - Hamsi-224: short name: hamsi224 - * - Hamsi-256: short name: hamsi256 - * - Hamsi-384: short name: hamsi384 - * - Hamsi-512: short name: hamsi512 - * - JH family: file sph_jh.h - * - JH-224: short name: jh224 - * - JH-256: short name: jh256 - * - JH-384: short name: jh384 - * - JH-512: short name: jh512 - * - Keccak family: file sph_keccak.h - * - Keccak-224: short name: keccak224 - * - Keccak-256: short name: keccak256 - * - Keccak-384: short name: keccak384 - * - Keccak-512: short name: keccak512 - * - Luffa family: file sph_luffa.h - * - Luffa-224: short name: luffa224 - * - Luffa-256: short name: luffa256 - * - Luffa-384: short name: luffa384 - * - Luffa-512: short name: luffa512 - * - Shabal family: file sph_shabal.h - * - Shabal-192: short name: shabal192 - * - Shabal-224: short name: shabal224 - * - Shabal-256: short name: shabal256 - * - Shabal-384: short name: shabal384 - * - Shabal-512: short name: shabal512 - * - SHAvite-3 family: file sph_shavite.h - * - SHAvite-224 (nominally "SHAvite-3 with 224-bit output"): - * short name: shabal224 - * - SHAvite-256 (nominally "SHAvite-3 with 256-bit output"): - * short name: shabal256 - * - SHAvite-384 (nominally "SHAvite-3 with 384-bit output"): - * short name: shabal384 - * - SHAvite-512 (nominally "SHAvite-3 with 512-bit output"): - * short name: shabal512 - * - SIMD family: file sph_simd.h - * - SIMD-224: short name: simd224 - * - SIMD-256: short name: simd256 - * - SIMD-384: short name: simd384 - * - SIMD-512: short name: simd512 - * - Skein family: file sph_skein.h - * - Skein-224 (nominally specified as Skein-512-224): short name: - * skein224 (64) - * - Skein-256 (nominally specified as Skein-512-256): short name: - * skein256 (64) - * - Skein-384 (nominally specified as Skein-512-384): short name: - * skein384 (64) - * - Skein-512 (nominally specified as Skein-512-512): short name: - * skein512 (64) - * - * For the second-round SHA-3 candidates, the functions are as specified - * for round 2, i.e. with the "tweaks" that some candidates added - * between round 1 and round 2. Also, some of the submitted packages for - * round 2 contained errors, in the specification, reference code, or - * both. sphlib implements the corrected versions. - */ - -/** @hideinitializer - * Unsigned integer type whose length is at least 32 bits; on most - * architectures, it will have a width of exactly 32 bits. Unsigned C - * types implement arithmetics modulo a power of 2; use the - * SPH_T32() macro to ensure that the value is truncated - * to exactly 32 bits. Unless otherwise specified, all macros and - * functions which accept sph_u32 values assume that these - * values fit on 32 bits, i.e. do not exceed 2^32-1, even on architectures - * where sph_u32 is larger than that. - */ -typedef __arch_dependant__ sph_u32; - -/** @hideinitializer - * Signed integer type corresponding to sph_u32; it has - * width 32 bits or more. - */ -typedef __arch_dependant__ sph_s32; - -/** @hideinitializer - * Unsigned integer type whose length is at least 64 bits; on most - * architectures which feature such a type, it will have a width of - * exactly 64 bits. C99-compliant platform will have this type; it - * is also defined when the GNU compiler (gcc) is used, and on - * platforms where unsigned long is large enough. If this - * type is not available, then some hash functions which depends on - * a 64-bit type will not be available (most notably SHA-384, SHA-512, - * Tiger and WHIRLPOOL). - */ -typedef __arch_dependant__ sph_u64; - -/** @hideinitializer - * Signed integer type corresponding to sph_u64; it has - * width 64 bits or more. - */ -typedef __arch_dependant__ sph_s64; - -/** - * This macro expands the token x into a suitable - * constant expression of type sph_u32. Depending on - * how this type is defined, a suffix such as UL may - * be appended to the argument. - * - * @param x the token to expand into a suitable constant expression - */ -#define SPH_C32(x) - -/** - * Truncate a 32-bit value to exactly 32 bits. On most systems, this is - * a no-op, recognized as such by the compiler. - * - * @param x the value to truncate (of type sph_u32) - */ -#define SPH_T32(x) - -/** - * Rotate a 32-bit value by a number of bits to the left. The rotate - * count must reside between 1 and 31. This macro assumes that its - * first argument fits in 32 bits (no extra bit allowed on machines where - * sph_u32 is wider); both arguments may be evaluated - * several times. - * - * @param x the value to rotate (of type sph_u32) - * @param n the rotation count (between 1 and 31, inclusive) - */ -#define SPH_ROTL32(x, n) - -/** - * Rotate a 32-bit value by a number of bits to the left. The rotate - * count must reside between 1 and 31. This macro assumes that its - * first argument fits in 32 bits (no extra bit allowed on machines where - * sph_u32 is wider); both arguments may be evaluated - * several times. - * - * @param x the value to rotate (of type sph_u32) - * @param n the rotation count (between 1 and 31, inclusive) - */ -#define SPH_ROTR32(x, n) - -/** - * This macro is defined on systems for which a 64-bit type has been - * detected, and is used for sph_u64. - */ -#define SPH_64 - -/** - * This macro is defined on systems for the "native" integer size is - * 64 bits (64-bit values fit in one register). - */ -#define SPH_64_TRUE - -/** - * This macro expands the token x into a suitable - * constant expression of type sph_u64. Depending on - * how this type is defined, a suffix such as ULL may - * be appended to the argument. This macro is defined only if a - * 64-bit type was detected and used for sph_u64. - * - * @param x the token to expand into a suitable constant expression - */ -#define SPH_C64(x) - -/** - * Truncate a 64-bit value to exactly 64 bits. On most systems, this is - * a no-op, recognized as such by the compiler. This macro is defined only - * if a 64-bit type was detected and used for sph_u64. - * - * @param x the value to truncate (of type sph_u64) - */ -#define SPH_T64(x) - -/** - * Rotate a 64-bit value by a number of bits to the left. The rotate - * count must reside between 1 and 63. This macro assumes that its - * first argument fits in 64 bits (no extra bit allowed on machines where - * sph_u64 is wider); both arguments may be evaluated - * several times. This macro is defined only if a 64-bit type was detected - * and used for sph_u64. - * - * @param x the value to rotate (of type sph_u64) - * @param n the rotation count (between 1 and 63, inclusive) - */ -#define SPH_ROTL64(x, n) - -/** - * Rotate a 64-bit value by a number of bits to the left. The rotate - * count must reside between 1 and 63. This macro assumes that its - * first argument fits in 64 bits (no extra bit allowed on machines where - * sph_u64 is wider); both arguments may be evaluated - * several times. This macro is defined only if a 64-bit type was detected - * and used for sph_u64. - * - * @param x the value to rotate (of type sph_u64) - * @param n the rotation count (between 1 and 63, inclusive) - */ -#define SPH_ROTR64(x, n) - -/** - * This macro evaluates to inline or an equivalent construction, - * if available on the compilation platform, or to nothing otherwise. This - * is used to declare inline functions, for which the compiler should - * endeavour to include the code directly in the caller. Inline functions - * are typically defined in header files as replacement for macros. - */ -#define SPH_INLINE - -/** - * This macro is defined if the platform has been detected as using - * little-endian convention. This implies that the sph_u32 - * type (and the sph_u64 type also, if it is defined) has - * an exact width (i.e. exactly 32-bit, respectively 64-bit). - */ -#define SPH_LITTLE_ENDIAN - -/** - * This macro is defined if the platform has been detected as using - * big-endian convention. This implies that the sph_u32 - * type (and the sph_u64 type also, if it is defined) has - * an exact width (i.e. exactly 32-bit, respectively 64-bit). - */ -#define SPH_BIG_ENDIAN - -/** - * This macro is defined if 32-bit words (and 64-bit words, if defined) - * can be read from and written to memory efficiently in little-endian - * convention. This is the case for little-endian platforms, and also - * for the big-endian platforms which have special little-endian access - * opcodes (e.g. Ultrasparc). - */ -#define SPH_LITTLE_FAST - -/** - * This macro is defined if 32-bit words (and 64-bit words, if defined) - * can be read from and written to memory efficiently in big-endian - * convention. This is the case for little-endian platforms, and also - * for the little-endian platforms which have special big-endian access - * opcodes. - */ -#define SPH_BIG_FAST - -/** - * On some platforms, this macro is defined to an unsigned integer type - * into which pointer values may be cast. The resulting value can then - * be tested for being a multiple of 2, 4 or 8, indicating an aligned - * pointer for, respectively, 16-bit, 32-bit or 64-bit memory accesses. - */ -#define SPH_UPTR - -/** - * When defined, this macro indicates that unaligned memory accesses - * are possible with only a minor penalty, and thus should be prefered - * over strategies which first copy data to an aligned buffer. - */ -#define SPH_UNALIGNED - -/** - * Byte-swap a 32-bit word (i.e. 0x12345678 becomes - * 0x78563412). This is an inline function which resorts - * to inline assembly on some platforms, for better performance. - * - * @param x the 32-bit value to byte-swap - * @return the byte-swapped value - */ -static inline sph_u32 sph_bswap32(sph_u32 x); - -/** - * Byte-swap a 64-bit word. This is an inline function which resorts - * to inline assembly on some platforms, for better performance. This - * function is defined only if a suitable 64-bit type was found for - * sph_u64 - * - * @param x the 64-bit value to byte-swap - * @return the byte-swapped value - */ -static inline sph_u64 sph_bswap64(sph_u64 x); - -/** - * Decode a 16-bit unsigned value from memory, in little-endian convention - * (least significant byte comes first). - * - * @param src the source address - * @return the decoded value - */ -static inline unsigned sph_dec16le(const void *src); - -/** - * Encode a 16-bit unsigned value into memory, in little-endian convention - * (least significant byte comes first). - * - * @param dst the destination buffer - * @param val the value to encode - */ -static inline void sph_enc16le(void *dst, unsigned val); - -/** - * Decode a 16-bit unsigned value from memory, in big-endian convention - * (most significant byte comes first). - * - * @param src the source address - * @return the decoded value - */ -static inline unsigned sph_dec16be(const void *src); - -/** - * Encode a 16-bit unsigned value into memory, in big-endian convention - * (most significant byte comes first). - * - * @param dst the destination buffer - * @param val the value to encode - */ -static inline void sph_enc16be(void *dst, unsigned val); - -/** - * Decode a 32-bit unsigned value from memory, in little-endian convention - * (least significant byte comes first). - * - * @param src the source address - * @return the decoded value - */ -static inline sph_u32 sph_dec32le(const void *src); - -/** - * Decode a 32-bit unsigned value from memory, in little-endian convention - * (least significant byte comes first). This function assumes that the - * source address is suitably aligned for a direct access, if the platform - * supports such things; it can thus be marginally faster than the generic - * sph_dec32le() function. - * - * @param src the source address - * @return the decoded value - */ -static inline sph_u32 sph_dec32le_aligned(const void *src); - -/** - * Encode a 32-bit unsigned value into memory, in little-endian convention - * (least significant byte comes first). - * - * @param dst the destination buffer - * @param val the value to encode - */ -static inline void sph_enc32le(void *dst, sph_u32 val); - -/** - * Encode a 32-bit unsigned value into memory, in little-endian convention - * (least significant byte comes first). This function assumes that the - * destination address is suitably aligned for a direct access, if the - * platform supports such things; it can thus be marginally faster than - * the generic sph_enc32le() function. - * - * @param dst the destination buffer - * @param val the value to encode - */ -static inline void sph_enc32le_aligned(void *dst, sph_u32 val); - -/** - * Decode a 32-bit unsigned value from memory, in big-endian convention - * (most significant byte comes first). - * - * @param src the source address - * @return the decoded value - */ -static inline sph_u32 sph_dec32be(const void *src); - -/** - * Decode a 32-bit unsigned value from memory, in big-endian convention - * (most significant byte comes first). This function assumes that the - * source address is suitably aligned for a direct access, if the platform - * supports such things; it can thus be marginally faster than the generic - * sph_dec32be() function. - * - * @param src the source address - * @return the decoded value - */ -static inline sph_u32 sph_dec32be_aligned(const void *src); - -/** - * Encode a 32-bit unsigned value into memory, in big-endian convention - * (most significant byte comes first). - * - * @param dst the destination buffer - * @param val the value to encode - */ -static inline void sph_enc32be(void *dst, sph_u32 val); - -/** - * Encode a 32-bit unsigned value into memory, in big-endian convention - * (most significant byte comes first). This function assumes that the - * destination address is suitably aligned for a direct access, if the - * platform supports such things; it can thus be marginally faster than - * the generic sph_enc32be() function. - * - * @param dst the destination buffer - * @param val the value to encode - */ -static inline void sph_enc32be_aligned(void *dst, sph_u32 val); - -/** - * Decode a 64-bit unsigned value from memory, in little-endian convention - * (least significant byte comes first). This function is defined only - * if a suitable 64-bit type was detected and used for sph_u64. - * - * @param src the source address - * @return the decoded value - */ -static inline sph_u64 sph_dec64le(const void *src); - -/** - * Decode a 64-bit unsigned value from memory, in little-endian convention - * (least significant byte comes first). This function assumes that the - * source address is suitably aligned for a direct access, if the platform - * supports such things; it can thus be marginally faster than the generic - * sph_dec64le() function. This function is defined only - * if a suitable 64-bit type was detected and used for sph_u64. - * - * @param src the source address - * @return the decoded value - */ -static inline sph_u64 sph_dec64le_aligned(const void *src); - -/** - * Encode a 64-bit unsigned value into memory, in little-endian convention - * (least significant byte comes first). This function is defined only - * if a suitable 64-bit type was detected and used for sph_u64. - * - * @param dst the destination buffer - * @param val the value to encode - */ -static inline void sph_enc64le(void *dst, sph_u64 val); - -/** - * Encode a 64-bit unsigned value into memory, in little-endian convention - * (least significant byte comes first). This function assumes that the - * destination address is suitably aligned for a direct access, if the - * platform supports such things; it can thus be marginally faster than - * the generic sph_enc64le() function. This function is defined - * only if a suitable 64-bit type was detected and used for - * sph_u64. - * - * @param dst the destination buffer - * @param val the value to encode - */ -static inline void sph_enc64le_aligned(void *dst, sph_u64 val); - -/** - * Decode a 64-bit unsigned value from memory, in big-endian convention - * (most significant byte comes first). This function is defined only - * if a suitable 64-bit type was detected and used for sph_u64. - * - * @param src the source address - * @return the decoded value - */ -static inline sph_u64 sph_dec64be(const void *src); - -/** - * Decode a 64-bit unsigned value from memory, in big-endian convention - * (most significant byte comes first). This function assumes that the - * source address is suitably aligned for a direct access, if the platform - * supports such things; it can thus be marginally faster than the generic - * sph_dec64be() function. This function is defined only - * if a suitable 64-bit type was detected and used for sph_u64. - * - * @param src the source address - * @return the decoded value - */ -static inline sph_u64 sph_dec64be_aligned(const void *src); - -/** - * Encode a 64-bit unsigned value into memory, in big-endian convention - * (most significant byte comes first). This function is defined only - * if a suitable 64-bit type was detected and used for sph_u64. - * - * @param dst the destination buffer - * @param val the value to encode - */ -static inline void sph_enc64be(void *dst, sph_u64 val); - -/** - * Encode a 64-bit unsigned value into memory, in big-endian convention - * (most significant byte comes first). This function assumes that the - * destination address is suitably aligned for a direct access, if the - * platform supports such things; it can thus be marginally faster than - * the generic sph_enc64be() function. This function is defined - * only if a suitable 64-bit type was detected and used for - * sph_u64. - * - * @param dst the destination buffer - * @param val the value to encode - */ -static inline void sph_enc64be_aligned(void *dst, sph_u64 val); - -#endif - -/* ============== END documentation block for Doxygen ============= */ - -#ifndef DOXYGEN_IGNORE - -/* - * We want to define the types "sph_u32" and "sph_u64" which hold - * unsigned values of at least, respectively, 32 and 64 bits. These - * tests should select appropriate types for most platforms. The - * macro "SPH_64" is defined if the 64-bit is supported. - */ - -#undef SPH_64 -#undef SPH_64_TRUE - -#if defined __STDC__ && __STDC_VERSION__ >= 199901L - -/* - * On C99 implementations, we can use to get an exact 64-bit - * type, if any, or otherwise use a wider type (which must exist, for - * C99 conformance). - */ - -#include - -#ifdef UINT32_MAX -typedef uint32_t sph_u32; -typedef int32_t sph_s32; -#else -typedef uint_fast32_t sph_u32; -typedef int_fast32_t sph_s32; -#endif -#if !SPH_NO_64 -#ifdef UINT64_MAX -typedef uint64_t sph_u64; -typedef int64_t sph_s64; -#else -typedef uint_fast64_t sph_u64; -typedef int_fast64_t sph_s64; -#endif -#endif - -#define SPH_C32(x) ((sph_u32)(x)) -#if !SPH_NO_64 -#define SPH_C64(x) ((sph_u64)(x)) -#define SPH_64 1 -#endif - -#else - -/* - * On non-C99 systems, we use "unsigned int" if it is wide enough, - * "unsigned long" otherwise. This supports all "reasonable" architectures. - * We have to be cautious: pre-C99 preprocessors handle constants - * differently in '#if' expressions. Hence the shifts to test UINT_MAX. - */ - -#if ((UINT_MAX >> 11) >> 11) >= 0x3FF - -typedef unsigned int sph_u32; -typedef int sph_s32; - -#define SPH_C32(x) ((sph_u32)(x ## U)) - -#else - -typedef unsigned long sph_u32; -typedef long sph_s32; - -#define SPH_C32(x) ((sph_u32)(x ## UL)) - -#endif - -#if !SPH_NO_64 - -/* - * We want a 64-bit type. We use "unsigned long" if it is wide enough (as - * is common on 64-bit architectures such as AMD64, Alpha or Sparcv9), - * "unsigned long long" otherwise, if available. We use ULLONG_MAX to - * test whether "unsigned long long" is available; we also know that - * gcc features this type, even if the libc header do not know it. - */ - -#if ((ULONG_MAX >> 31) >> 31) >= 3 - -typedef unsigned long sph_u64; -typedef long sph_s64; - -#define SPH_C64(x) ((sph_u64)(x ## UL)) - -#define SPH_64 1 - -#elif ((ULLONG_MAX >> 31) >> 31) >= 3 || defined __GNUC__ - -typedef unsigned long long sph_u64; -typedef long long sph_s64; - -#define SPH_C64(x) ((sph_u64)(x ## ULL)) - -#define SPH_64 1 - -#else - -/* - * No 64-bit type... - */ - -#endif - -#endif - -#endif - -/* - * If the "unsigned long" type has length 64 bits or more, then this is - * a "true" 64-bit architectures. This is also true with Visual C on - * amd64, even though the "long" type is limited to 32 bits. - */ -#if SPH_64 && (((ULONG_MAX >> 31) >> 31) >= 3 || defined _M_X64) -#define SPH_64_TRUE 1 -#endif - -/* - * Implementation note: some processors have specific opcodes to perform - * a rotation. Recent versions of gcc recognize the expression above and - * use the relevant opcodes, when appropriate. - */ - -#define SPH_T32(x) ((x) & SPH_C32(0xFFFFFFFF)) -#define SPH_ROTL32(x, n) SPH_T32(((x) << (n)) | ((x) >> (32 - (n)))) -#define SPH_ROTR32(x, n) SPH_ROTL32(x, (32 - (n))) - -#if SPH_64 - -#define SPH_T64(x) ((x) & SPH_C64(0xFFFFFFFFFFFFFFFF)) -#define SPH_ROTL64(x, n) SPH_T64(((x) << (n)) | ((x) >> (64 - (n)))) -#define SPH_ROTR64(x, n) SPH_ROTL64(x, (64 - (n))) - -#endif - -#ifndef DOXYGEN_IGNORE -/* - * Define SPH_INLINE to be an "inline" qualifier, if available. We define - * some small macro-like functions which benefit greatly from being inlined. - */ -#if (defined __STDC__ && __STDC_VERSION__ >= 199901L) || defined __GNUC__ -#define SPH_INLINE inline -#elif defined _MSC_VER -#define SPH_INLINE __inline -#else -#define SPH_INLINE -#endif -#endif - -/* - * We define some macros which qualify the architecture. These macros - * may be explicit set externally (e.g. as compiler parameters). The - * code below sets those macros if they are not already defined. - * - * Most macros are boolean, thus evaluate to either zero or non-zero. - * The SPH_UPTR macro is special, in that it evaluates to a C type, - * or is not defined. - * - * SPH_UPTR if defined: unsigned type to cast pointers into - * - * SPH_UNALIGNED non-zero if unaligned accesses are efficient - * SPH_LITTLE_ENDIAN non-zero if architecture is known to be little-endian - * SPH_BIG_ENDIAN non-zero if architecture is known to be big-endian - * SPH_LITTLE_FAST non-zero if little-endian decoding is fast - * SPH_BIG_FAST non-zero if big-endian decoding is fast - * - * If SPH_UPTR is defined, then encoding and decoding of 32-bit and 64-bit - * values will try to be "smart". Either SPH_LITTLE_ENDIAN or SPH_BIG_ENDIAN - * _must_ be non-zero in those situations. The 32-bit and 64-bit types - * _must_ also have an exact width. - * - * SPH_SPARCV9_GCC_32 UltraSPARC-compatible with gcc, 32-bit mode - * SPH_SPARCV9_GCC_64 UltraSPARC-compatible with gcc, 64-bit mode - * SPH_SPARCV9_GCC UltraSPARC-compatible with gcc - * SPH_I386_GCC x86-compatible (32-bit) with gcc - * SPH_I386_MSVC x86-compatible (32-bit) with Microsoft Visual C - * SPH_AMD64_GCC x86-compatible (64-bit) with gcc - * SPH_AMD64_MSVC x86-compatible (64-bit) with Microsoft Visual C - * SPH_PPC32_GCC PowerPC, 32-bit, with gcc - * SPH_PPC64_GCC PowerPC, 64-bit, with gcc - * - * TODO: enhance automatic detection, for more architectures and compilers. - * Endianness is the most important. SPH_UNALIGNED and SPH_UPTR help with - * some very fast functions (e.g. MD4) when using unaligned input data. - * The CPU-specific-with-GCC macros are useful only for inline assembly, - * normally restrained to this header file. - */ - -/* - * 32-bit x86, aka "i386 compatible". - */ -#if defined __i386__ || defined _M_IX86 - -#define SPH_DETECT_UNALIGNED 1 -#define SPH_DETECT_LITTLE_ENDIAN 1 -#define SPH_DETECT_UPTR sph_u32 -#ifdef __GNUC__ -#define SPH_DETECT_I386_GCC 1 -#endif -#ifdef _MSC_VER -#define SPH_DETECT_I386_MSVC 1 -#endif - -/* - * 64-bit x86, hereafter known as "amd64". - */ -#elif defined __x86_64 || defined _M_X64 - -#define SPH_DETECT_UNALIGNED 1 -#define SPH_DETECT_LITTLE_ENDIAN 1 -#define SPH_DETECT_UPTR sph_u64 -#ifdef __GNUC__ -#define SPH_DETECT_AMD64_GCC 1 -#endif -#ifdef _MSC_VER -#define SPH_DETECT_AMD64_MSVC 1 -#endif - -/* - * 64-bit Sparc architecture (implies v9). - */ -#elif ((defined __sparc__ || defined __sparc) && defined __arch64__) \ - || defined __sparcv9 - -#define SPH_DETECT_BIG_ENDIAN 1 -#define SPH_DETECT_UPTR sph_u64 -#ifdef __GNUC__ -#define SPH_DETECT_SPARCV9_GCC_64 1 -#define SPH_DETECT_LITTLE_FAST 1 -#endif - -/* - * 32-bit Sparc. - */ -#elif (defined __sparc__ || defined __sparc) \ - && !(defined __sparcv9 || defined __arch64__) - -#define SPH_DETECT_BIG_ENDIAN 1 -#define SPH_DETECT_UPTR sph_u32 -#if defined __GNUC__ && defined __sparc_v9__ -#define SPH_DETECT_SPARCV9_GCC_32 1 -#define SPH_DETECT_LITTLE_FAST 1 -#endif - -/* - * ARM, little-endian. - */ -#elif defined __arm__ && __ARMEL__ - -#define SPH_DETECT_LITTLE_ENDIAN 1 - -/* - * MIPS, little-endian. - */ -#elif MIPSEL || _MIPSEL || __MIPSEL || __MIPSEL__ - -#define SPH_DETECT_LITTLE_ENDIAN 1 - -/* - * MIPS, big-endian. - */ -#elif MIPSEB || _MIPSEB || __MIPSEB || __MIPSEB__ - -#define SPH_DETECT_BIG_ENDIAN 1 - -/* - * PowerPC. - */ -#elif defined __powerpc__ || defined __POWERPC__ || defined __ppc__ \ - || defined _ARCH_PPC - -/* - * Note: we do not declare cross-endian access to be "fast": even if - * using inline assembly, implementation should still assume that - * keeping the decoded word in a temporary is faster than decoding - * it again. - */ -#if defined __GNUC__ -#if SPH_64_TRUE -#define SPH_DETECT_PPC64_GCC 1 -#else -#define SPH_DETECT_PPC32_GCC 1 -#endif -#endif - -#if defined __BIG_ENDIAN__ || defined _BIG_ENDIAN -#define SPH_DETECT_BIG_ENDIAN 1 -#elif defined __LITTLE_ENDIAN__ || defined _LITTLE_ENDIAN -#define SPH_DETECT_LITTLE_ENDIAN 1 -#endif - -/* - * Itanium, 64-bit. - */ -#elif defined __ia64 || defined __ia64__ \ - || defined __itanium__ || defined _M_IA64 - -#if defined __BIG_ENDIAN__ || defined _BIG_ENDIAN -#define SPH_DETECT_BIG_ENDIAN 1 -#else -#define SPH_DETECT_LITTLE_ENDIAN 1 -#endif -#if defined __LP64__ || defined _LP64 -#define SPH_DETECT_UPTR sph_u64 -#else -#define SPH_DETECT_UPTR sph_u32 -#endif - -#endif - -#if defined SPH_DETECT_SPARCV9_GCC_32 || defined SPH_DETECT_SPARCV9_GCC_64 -#define SPH_DETECT_SPARCV9_GCC 1 -#endif - -#if defined SPH_DETECT_UNALIGNED && !defined SPH_UNALIGNED -#define SPH_UNALIGNED SPH_DETECT_UNALIGNED -#endif -#if defined SPH_DETECT_UPTR && !defined SPH_UPTR -#define SPH_UPTR SPH_DETECT_UPTR -#endif -#if defined SPH_DETECT_LITTLE_ENDIAN && !defined SPH_LITTLE_ENDIAN -#define SPH_LITTLE_ENDIAN SPH_DETECT_LITTLE_ENDIAN -#endif -#if defined SPH_DETECT_BIG_ENDIAN && !defined SPH_BIG_ENDIAN -#define SPH_BIG_ENDIAN SPH_DETECT_BIG_ENDIAN -#endif -#if defined SPH_DETECT_LITTLE_FAST && !defined SPH_LITTLE_FAST -#define SPH_LITTLE_FAST SPH_DETECT_LITTLE_FAST -#endif -#if defined SPH_DETECT_BIG_FAST && !defined SPH_BIG_FAST -#define SPH_BIG_FAST SPH_DETECT_BIG_FAST -#endif -#if defined SPH_DETECT_SPARCV9_GCC_32 && !defined SPH_SPARCV9_GCC_32 -#define SPH_SPARCV9_GCC_32 SPH_DETECT_SPARCV9_GCC_32 -#endif -#if defined SPH_DETECT_SPARCV9_GCC_64 && !defined SPH_SPARCV9_GCC_64 -#define SPH_SPARCV9_GCC_64 SPH_DETECT_SPARCV9_GCC_64 -#endif -#if defined SPH_DETECT_SPARCV9_GCC && !defined SPH_SPARCV9_GCC -#define SPH_SPARCV9_GCC SPH_DETECT_SPARCV9_GCC -#endif -#if defined SPH_DETECT_I386_GCC && !defined SPH_I386_GCC -#define SPH_I386_GCC SPH_DETECT_I386_GCC -#endif -#if defined SPH_DETECT_I386_MSVC && !defined SPH_I386_MSVC -#define SPH_I386_MSVC SPH_DETECT_I386_MSVC -#endif -#if defined SPH_DETECT_AMD64_GCC && !defined SPH_AMD64_GCC -#define SPH_AMD64_GCC SPH_DETECT_AMD64_GCC -#endif -#if defined SPH_DETECT_AMD64_MSVC && !defined SPH_AMD64_MSVC -#define SPH_AMD64_MSVC SPH_DETECT_AMD64_MSVC -#endif -#if defined SPH_DETECT_PPC32_GCC && !defined SPH_PPC32_GCC -#define SPH_PPC32_GCC SPH_DETECT_PPC32_GCC -#endif -#if defined SPH_DETECT_PPC64_GCC && !defined SPH_PPC64_GCC -#define SPH_PPC64_GCC SPH_DETECT_PPC64_GCC -#endif - -#if SPH_LITTLE_ENDIAN && !defined SPH_LITTLE_FAST -#define SPH_LITTLE_FAST 1 -#endif -#if SPH_BIG_ENDIAN && !defined SPH_BIG_FAST -#define SPH_BIG_FAST 1 -#endif - -#if defined SPH_UPTR && !(SPH_LITTLE_ENDIAN || SPH_BIG_ENDIAN) -#error SPH_UPTR defined, but endianness is not known. -#endif - -#if SPH_I386_GCC && !SPH_NO_ASM - -/* - * On x86 32-bit, with gcc, we use the bswapl opcode to byte-swap 32-bit - * values. - */ - -static SPH_INLINE sph_u32 -sph_bswap32(sph_u32 x) -{ - __asm__ __volatile__ ("bswapl %0" : "=r" (x) : "0" (x)); - return x; -} - -#if SPH_64 - -static SPH_INLINE sph_u64 -sph_bswap64(sph_u64 x) -{ - return ((sph_u64)sph_bswap32((sph_u32)x) << 32) - | (sph_u64)sph_bswap32((sph_u32)(x >> 32)); -} - -#endif - -#elif SPH_AMD64_GCC && !SPH_NO_ASM - -/* - * On x86 64-bit, with gcc, we use the bswapl opcode to byte-swap 32-bit - * and 64-bit values. - */ - -static SPH_INLINE sph_u32 -sph_bswap32(sph_u32 x) -{ - __asm__ __volatile__ ("bswapl %0" : "=r" (x) : "0" (x)); - return x; -} - -#if SPH_64 - -static SPH_INLINE sph_u64 -sph_bswap64(sph_u64 x) -{ - __asm__ __volatile__ ("bswapq %0" : "=r" (x) : "0" (x)); - return x; -} - -#endif - -/* - * Disabled code. Apparently, Microsoft Visual C 2005 is smart enough - * to generate proper opcodes for endianness swapping with the pure C - * implementation below. - * - -#elif SPH_I386_MSVC && !SPH_NO_ASM - -static __inline sph_u32 __declspec(naked) __fastcall -sph_bswap32(sph_u32 x) -{ - __asm { - bswap ecx - mov eax,ecx - ret - } -} - -#if SPH_64 - -static SPH_INLINE sph_u64 -sph_bswap64(sph_u64 x) -{ - return ((sph_u64)sph_bswap32((sph_u32)x) << 32) - | (sph_u64)sph_bswap32((sph_u32)(x >> 32)); -} - -#endif - - * - * [end of disabled code] - */ - -#else - -static SPH_INLINE sph_u32 -sph_bswap32(sph_u32 x) -{ - x = SPH_T32((x << 16) | (x >> 16)); - x = ((x & SPH_C32(0xFF00FF00)) >> 8) - | ((x & SPH_C32(0x00FF00FF)) << 8); - return x; -} - -#if SPH_64 - -/** - * Byte-swap a 64-bit value. - * - * @param x the input value - * @return the byte-swapped value - */ -static SPH_INLINE sph_u64 -sph_bswap64(sph_u64 x) -{ - x = SPH_T64((x << 32) | (x >> 32)); - x = ((x & SPH_C64(0xFFFF0000FFFF0000)) >> 16) - | ((x & SPH_C64(0x0000FFFF0000FFFF)) << 16); - x = ((x & SPH_C64(0xFF00FF00FF00FF00)) >> 8) - | ((x & SPH_C64(0x00FF00FF00FF00FF)) << 8); - return x; -} - -#endif - -#endif - -#if SPH_SPARCV9_GCC && !SPH_NO_ASM - -/* - * On UltraSPARC systems, native ordering is big-endian, but it is - * possible to perform little-endian read accesses by specifying the - * address space 0x88 (ASI_PRIMARY_LITTLE). Basically, either we use - * the opcode "lda [%reg]0x88,%dst", where %reg is the register which - * contains the source address and %dst is the destination register, - * or we use "lda [%reg+imm]%asi,%dst", which uses the %asi register - * to get the address space name. The latter format is better since it - * combines an addition and the actual access in a single opcode; but - * it requires the setting (and subsequent resetting) of %asi, which is - * slow. Some operations (i.e. MD5 compression function) combine many - * successive little-endian read accesses, which may share the same - * %asi setting. The macros below contain the appropriate inline - * assembly. - */ - -#define SPH_SPARCV9_SET_ASI \ - sph_u32 sph_sparcv9_asi; \ - __asm__ __volatile__ ( \ - "rd %%asi,%0\n\twr %%g0,0x88,%%asi" : "=r" (sph_sparcv9_asi)); - -#define SPH_SPARCV9_RESET_ASI \ - __asm__ __volatile__ ("wr %%g0,%0,%%asi" : : "r" (sph_sparcv9_asi)); - -#define SPH_SPARCV9_DEC32LE(base, idx) ({ \ - sph_u32 sph_sparcv9_tmp; \ - __asm__ __volatile__ ("lda [%1+" #idx "*4]%%asi,%0" \ - : "=r" (sph_sparcv9_tmp) : "r" (base)); \ - sph_sparcv9_tmp; \ - }) - -#endif - -static SPH_INLINE void -sph_enc16be(void *dst, unsigned val) -{ - ((unsigned char *)dst)[0] = (val >> 8); - ((unsigned char *)dst)[1] = val; -} - -static SPH_INLINE unsigned -sph_dec16be(const void *src) -{ - return ((unsigned)(((const unsigned char *)src)[0]) << 8) - | (unsigned)(((const unsigned char *)src)[1]); -} - -static SPH_INLINE void -sph_enc16le(void *dst, unsigned val) -{ - ((unsigned char *)dst)[0] = val; - ((unsigned char *)dst)[1] = val >> 8; -} - -static SPH_INLINE unsigned -sph_dec16le(const void *src) -{ - return (unsigned)(((const unsigned char *)src)[0]) - | ((unsigned)(((const unsigned char *)src)[1]) << 8); -} - -/** - * Encode a 32-bit value into the provided buffer (big endian convention). - * - * @param dst the destination buffer - * @param val the 32-bit value to encode - */ -static SPH_INLINE void -sph_enc32be(void *dst, sph_u32 val) -{ -#if defined SPH_UPTR -#if SPH_UNALIGNED -#if SPH_LITTLE_ENDIAN - val = sph_bswap32(val); -#endif - *(sph_u32 *)dst = val; -#else - if (((SPH_UPTR)dst & 3) == 0) { -#if SPH_LITTLE_ENDIAN - val = sph_bswap32(val); -#endif - *(sph_u32 *)dst = val; - } else { - ((unsigned char *)dst)[0] = (val >> 24); - ((unsigned char *)dst)[1] = (val >> 16); - ((unsigned char *)dst)[2] = (val >> 8); - ((unsigned char *)dst)[3] = val; - } -#endif -#else - ((unsigned char *)dst)[0] = (val >> 24); - ((unsigned char *)dst)[1] = (val >> 16); - ((unsigned char *)dst)[2] = (val >> 8); - ((unsigned char *)dst)[3] = val; -#endif -} - -/** - * Encode a 32-bit value into the provided buffer (big endian convention). - * The destination buffer must be properly aligned. - * - * @param dst the destination buffer (32-bit aligned) - * @param val the value to encode - */ -static SPH_INLINE void -sph_enc32be_aligned(void *dst, sph_u32 val) -{ -#if SPH_LITTLE_ENDIAN - *(sph_u32 *)dst = sph_bswap32(val); -#elif SPH_BIG_ENDIAN - *(sph_u32 *)dst = val; -#else - ((unsigned char *)dst)[0] = (val >> 24); - ((unsigned char *)dst)[1] = (val >> 16); - ((unsigned char *)dst)[2] = (val >> 8); - ((unsigned char *)dst)[3] = val; -#endif -} - -/** - * Decode a 32-bit value from the provided buffer (big endian convention). - * - * @param src the source buffer - * @return the decoded value - */ -static SPH_INLINE sph_u32 -sph_dec32be(const void *src) -{ -#if defined SPH_UPTR -#if SPH_UNALIGNED -#if SPH_LITTLE_ENDIAN - return sph_bswap32(*(const sph_u32 *)src); -#else - return *(const sph_u32 *)src; -#endif -#else - if (((SPH_UPTR)src & 3) == 0) { -#if SPH_LITTLE_ENDIAN - return sph_bswap32(*(const sph_u32 *)src); -#else - return *(const sph_u32 *)src; -#endif - } else { - return ((sph_u32)(((const unsigned char *)src)[0]) << 24) - | ((sph_u32)(((const unsigned char *)src)[1]) << 16) - | ((sph_u32)(((const unsigned char *)src)[2]) << 8) - | (sph_u32)(((const unsigned char *)src)[3]); - } -#endif -#else - return ((sph_u32)(((const unsigned char *)src)[0]) << 24) - | ((sph_u32)(((const unsigned char *)src)[1]) << 16) - | ((sph_u32)(((const unsigned char *)src)[2]) << 8) - | (sph_u32)(((const unsigned char *)src)[3]); -#endif -} - -/** - * Decode a 32-bit value from the provided buffer (big endian convention). - * The source buffer must be properly aligned. - * - * @param src the source buffer (32-bit aligned) - * @return the decoded value - */ -static SPH_INLINE sph_u32 -sph_dec32be_aligned(const void *src) -{ -#if SPH_LITTLE_ENDIAN - return sph_bswap32(*(const sph_u32 *)src); -#elif SPH_BIG_ENDIAN - return *(const sph_u32 *)src; -#else - return ((sph_u32)(((const unsigned char *)src)[0]) << 24) - | ((sph_u32)(((const unsigned char *)src)[1]) << 16) - | ((sph_u32)(((const unsigned char *)src)[2]) << 8) - | (sph_u32)(((const unsigned char *)src)[3]); -#endif -} - -/** - * Encode a 32-bit value into the provided buffer (little endian convention). - * - * @param dst the destination buffer - * @param val the 32-bit value to encode - */ -static SPH_INLINE void -sph_enc32le(void *dst, sph_u32 val) -{ -#if defined SPH_UPTR -#if SPH_UNALIGNED -#if SPH_BIG_ENDIAN - val = sph_bswap32(val); -#endif - *(sph_u32 *)dst = val; -#else - if (((SPH_UPTR)dst & 3) == 0) { -#if SPH_BIG_ENDIAN - val = sph_bswap32(val); -#endif - *(sph_u32 *)dst = val; - } else { - ((unsigned char *)dst)[0] = val; - ((unsigned char *)dst)[1] = (val >> 8); - ((unsigned char *)dst)[2] = (val >> 16); - ((unsigned char *)dst)[3] = (val >> 24); - } -#endif -#else - ((unsigned char *)dst)[0] = val; - ((unsigned char *)dst)[1] = (val >> 8); - ((unsigned char *)dst)[2] = (val >> 16); - ((unsigned char *)dst)[3] = (val >> 24); -#endif -} - -/** - * Encode a 32-bit value into the provided buffer (little endian convention). - * The destination buffer must be properly aligned. - * - * @param dst the destination buffer (32-bit aligned) - * @param val the value to encode - */ -static SPH_INLINE void -sph_enc32le_aligned(void *dst, sph_u32 val) -{ -#if SPH_LITTLE_ENDIAN - *(sph_u32 *)dst = val; -#elif SPH_BIG_ENDIAN - *(sph_u32 *)dst = sph_bswap32(val); -#else - ((unsigned char *)dst)[0] = val; - ((unsigned char *)dst)[1] = (val >> 8); - ((unsigned char *)dst)[2] = (val >> 16); - ((unsigned char *)dst)[3] = (val >> 24); -#endif -} - -/** - * Decode a 32-bit value from the provided buffer (little endian convention). - * - * @param src the source buffer - * @return the decoded value - */ -static SPH_INLINE sph_u32 -sph_dec32le(const void *src) -{ -#if defined SPH_UPTR -#if SPH_UNALIGNED -#if SPH_BIG_ENDIAN - return sph_bswap32(*(const sph_u32 *)src); -#else - return *(const sph_u32 *)src; -#endif -#else - if (((SPH_UPTR)src & 3) == 0) { -#if SPH_BIG_ENDIAN -#if SPH_SPARCV9_GCC && !SPH_NO_ASM - sph_u32 tmp; - - /* - * "__volatile__" is needed here because without it, - * gcc-3.4.3 miscompiles the code and performs the - * access before the test on the address, thus triggering - * a bus error... - */ - __asm__ __volatile__ ( - "lda [%1]0x88,%0" : "=r" (tmp) : "r" (src)); - return tmp; -/* - * On PowerPC, this turns out not to be worth the effort: the inline - * assembly makes GCC optimizer uncomfortable, which tends to nullify - * the decoding gains. - * - * For most hash functions, using this inline assembly trick changes - * hashing speed by less than 5% and often _reduces_ it. The biggest - * gains are for MD4 (+11%) and CubeHash (+30%). For all others, it is - * less then 10%. The speed gain on CubeHash is probably due to the - * chronic shortage of registers that CubeHash endures; for the other - * functions, the generic code appears to be efficient enough already. - * -#elif (SPH_PPC32_GCC || SPH_PPC64_GCC) && !SPH_NO_ASM - sph_u32 tmp; - - __asm__ __volatile__ ( - "lwbrx %0,0,%1" : "=r" (tmp) : "r" (src)); - return tmp; - */ -#else - return sph_bswap32(*(const sph_u32 *)src); -#endif -#else - return *(const sph_u32 *)src; -#endif - } else { - return (sph_u32)(((const unsigned char *)src)[0]) - | ((sph_u32)(((const unsigned char *)src)[1]) << 8) - | ((sph_u32)(((const unsigned char *)src)[2]) << 16) - | ((sph_u32)(((const unsigned char *)src)[3]) << 24); - } -#endif -#else - return (sph_u32)(((const unsigned char *)src)[0]) - | ((sph_u32)(((const unsigned char *)src)[1]) << 8) - | ((sph_u32)(((const unsigned char *)src)[2]) << 16) - | ((sph_u32)(((const unsigned char *)src)[3]) << 24); -#endif -} - -/** - * Decode a 32-bit value from the provided buffer (little endian convention). - * The source buffer must be properly aligned. - * - * @param src the source buffer (32-bit aligned) - * @return the decoded value - */ -static SPH_INLINE sph_u32 -sph_dec32le_aligned(const void *src) -{ -#if SPH_LITTLE_ENDIAN - return *(const sph_u32 *)src; -#elif SPH_BIG_ENDIAN -#if SPH_SPARCV9_GCC && !SPH_NO_ASM - sph_u32 tmp; - - __asm__ __volatile__ ("lda [%1]0x88,%0" : "=r" (tmp) : "r" (src)); - return tmp; -/* - * Not worth it generally. - * -#elif (SPH_PPC32_GCC || SPH_PPC64_GCC) && !SPH_NO_ASM - sph_u32 tmp; - - __asm__ __volatile__ ("lwbrx %0,0,%1" : "=r" (tmp) : "r" (src)); - return tmp; - */ -#else - return sph_bswap32(*(const sph_u32 *)src); -#endif -#else - return (sph_u32)(((const unsigned char *)src)[0]) - | ((sph_u32)(((const unsigned char *)src)[1]) << 8) - | ((sph_u32)(((const unsigned char *)src)[2]) << 16) - | ((sph_u32)(((const unsigned char *)src)[3]) << 24); -#endif -} - -#if SPH_64 - -/** - * Encode a 64-bit value into the provided buffer (big endian convention). - * - * @param dst the destination buffer - * @param val the 64-bit value to encode - */ -static SPH_INLINE void -sph_enc64be(void *dst, sph_u64 val) -{ -#if defined SPH_UPTR -#if SPH_UNALIGNED -#if SPH_LITTLE_ENDIAN - val = sph_bswap64(val); -#endif - *(sph_u64 *)dst = val; -#else - if (((SPH_UPTR)dst & 7) == 0) { -#if SPH_LITTLE_ENDIAN - val = sph_bswap64(val); -#endif - *(sph_u64 *)dst = val; - } else { - ((unsigned char *)dst)[0] = (val >> 56); - ((unsigned char *)dst)[1] = (val >> 48); - ((unsigned char *)dst)[2] = (val >> 40); - ((unsigned char *)dst)[3] = (val >> 32); - ((unsigned char *)dst)[4] = (val >> 24); - ((unsigned char *)dst)[5] = (val >> 16); - ((unsigned char *)dst)[6] = (val >> 8); - ((unsigned char *)dst)[7] = val; - } -#endif -#else - ((unsigned char *)dst)[0] = (val >> 56); - ((unsigned char *)dst)[1] = (val >> 48); - ((unsigned char *)dst)[2] = (val >> 40); - ((unsigned char *)dst)[3] = (val >> 32); - ((unsigned char *)dst)[4] = (val >> 24); - ((unsigned char *)dst)[5] = (val >> 16); - ((unsigned char *)dst)[6] = (val >> 8); - ((unsigned char *)dst)[7] = val; -#endif -} - -/** - * Encode a 64-bit value into the provided buffer (big endian convention). - * The destination buffer must be properly aligned. - * - * @param dst the destination buffer (64-bit aligned) - * @param val the value to encode - */ -static SPH_INLINE void -sph_enc64be_aligned(void *dst, sph_u64 val) -{ -#if SPH_LITTLE_ENDIAN - *(sph_u64 *)dst = sph_bswap64(val); -#elif SPH_BIG_ENDIAN - *(sph_u64 *)dst = val; -#else - ((unsigned char *)dst)[0] = (val >> 56); - ((unsigned char *)dst)[1] = (val >> 48); - ((unsigned char *)dst)[2] = (val >> 40); - ((unsigned char *)dst)[3] = (val >> 32); - ((unsigned char *)dst)[4] = (val >> 24); - ((unsigned char *)dst)[5] = (val >> 16); - ((unsigned char *)dst)[6] = (val >> 8); - ((unsigned char *)dst)[7] = val; -#endif -} - -/** - * Decode a 64-bit value from the provided buffer (big endian convention). - * - * @param src the source buffer - * @return the decoded value - */ -static SPH_INLINE sph_u64 -sph_dec64be(const void *src) -{ -#if defined SPH_UPTR -#if SPH_UNALIGNED -#if SPH_LITTLE_ENDIAN - return sph_bswap64(*(const sph_u64 *)src); -#else - return *(const sph_u64 *)src; -#endif -#else - if (((SPH_UPTR)src & 7) == 0) { -#if SPH_LITTLE_ENDIAN - return sph_bswap64(*(const sph_u64 *)src); -#else - return *(const sph_u64 *)src; -#endif - } else { - return ((sph_u64)(((const unsigned char *)src)[0]) << 56) - | ((sph_u64)(((const unsigned char *)src)[1]) << 48) - | ((sph_u64)(((const unsigned char *)src)[2]) << 40) - | ((sph_u64)(((const unsigned char *)src)[3]) << 32) - | ((sph_u64)(((const unsigned char *)src)[4]) << 24) - | ((sph_u64)(((const unsigned char *)src)[5]) << 16) - | ((sph_u64)(((const unsigned char *)src)[6]) << 8) - | (sph_u64)(((const unsigned char *)src)[7]); - } -#endif -#else - return ((sph_u64)(((const unsigned char *)src)[0]) << 56) - | ((sph_u64)(((const unsigned char *)src)[1]) << 48) - | ((sph_u64)(((const unsigned char *)src)[2]) << 40) - | ((sph_u64)(((const unsigned char *)src)[3]) << 32) - | ((sph_u64)(((const unsigned char *)src)[4]) << 24) - | ((sph_u64)(((const unsigned char *)src)[5]) << 16) - | ((sph_u64)(((const unsigned char *)src)[6]) << 8) - | (sph_u64)(((const unsigned char *)src)[7]); -#endif -} - -/** - * Decode a 64-bit value from the provided buffer (big endian convention). - * The source buffer must be properly aligned. - * - * @param src the source buffer (64-bit aligned) - * @return the decoded value - */ -static SPH_INLINE sph_u64 -sph_dec64be_aligned(const void *src) -{ -#if SPH_LITTLE_ENDIAN - return sph_bswap64(*(const sph_u64 *)src); -#elif SPH_BIG_ENDIAN - return *(const sph_u64 *)src; -#else - return ((sph_u64)(((const unsigned char *)src)[0]) << 56) - | ((sph_u64)(((const unsigned char *)src)[1]) << 48) - | ((sph_u64)(((const unsigned char *)src)[2]) << 40) - | ((sph_u64)(((const unsigned char *)src)[3]) << 32) - | ((sph_u64)(((const unsigned char *)src)[4]) << 24) - | ((sph_u64)(((const unsigned char *)src)[5]) << 16) - | ((sph_u64)(((const unsigned char *)src)[6]) << 8) - | (sph_u64)(((const unsigned char *)src)[7]); -#endif -} - -/** - * Encode a 64-bit value into the provided buffer (little endian convention). - * - * @param dst the destination buffer - * @param val the 64-bit value to encode - */ -static SPH_INLINE void -sph_enc64le(void *dst, sph_u64 val) -{ -#if defined SPH_UPTR -#if SPH_UNALIGNED -#if SPH_BIG_ENDIAN - val = sph_bswap64(val); -#endif - *(sph_u64 *)dst = val; -#else - if (((SPH_UPTR)dst & 7) == 0) { -#if SPH_BIG_ENDIAN - val = sph_bswap64(val); -#endif - *(sph_u64 *)dst = val; - } else { - ((unsigned char *)dst)[0] = val; - ((unsigned char *)dst)[1] = (val >> 8); - ((unsigned char *)dst)[2] = (val >> 16); - ((unsigned char *)dst)[3] = (val >> 24); - ((unsigned char *)dst)[4] = (val >> 32); - ((unsigned char *)dst)[5] = (val >> 40); - ((unsigned char *)dst)[6] = (val >> 48); - ((unsigned char *)dst)[7] = (val >> 56); - } -#endif -#else - ((unsigned char *)dst)[0] = val; - ((unsigned char *)dst)[1] = (val >> 8); - ((unsigned char *)dst)[2] = (val >> 16); - ((unsigned char *)dst)[3] = (val >> 24); - ((unsigned char *)dst)[4] = (val >> 32); - ((unsigned char *)dst)[5] = (val >> 40); - ((unsigned char *)dst)[6] = (val >> 48); - ((unsigned char *)dst)[7] = (val >> 56); -#endif -} - -/** - * Encode a 64-bit value into the provided buffer (little endian convention). - * The destination buffer must be properly aligned. - * - * @param dst the destination buffer (64-bit aligned) - * @param val the value to encode - */ -static SPH_INLINE void -sph_enc64le_aligned(void *dst, sph_u64 val) -{ -#if SPH_LITTLE_ENDIAN - *(sph_u64 *)dst = val; -#elif SPH_BIG_ENDIAN - *(sph_u64 *)dst = sph_bswap64(val); -#else - ((unsigned char *)dst)[0] = val; - ((unsigned char *)dst)[1] = (val >> 8); - ((unsigned char *)dst)[2] = (val >> 16); - ((unsigned char *)dst)[3] = (val >> 24); - ((unsigned char *)dst)[4] = (val >> 32); - ((unsigned char *)dst)[5] = (val >> 40); - ((unsigned char *)dst)[6] = (val >> 48); - ((unsigned char *)dst)[7] = (val >> 56); -#endif -} - -/** - * Decode a 64-bit value from the provided buffer (little endian convention). - * - * @param src the source buffer - * @return the decoded value - */ -static SPH_INLINE sph_u64 -sph_dec64le(const void *src) -{ -#if defined SPH_UPTR -#if SPH_UNALIGNED -#if SPH_BIG_ENDIAN - return sph_bswap64(*(const sph_u64 *)src); -#else - return *(const sph_u64 *)src; -#endif -#else - if (((SPH_UPTR)src & 7) == 0) { -#if SPH_BIG_ENDIAN -#if SPH_SPARCV9_GCC_64 && !SPH_NO_ASM - sph_u64 tmp; - - __asm__ __volatile__ ( - "ldxa [%1]0x88,%0" : "=r" (tmp) : "r" (src)); - return tmp; -/* - * Not worth it generally. - * -#elif SPH_PPC32_GCC && !SPH_NO_ASM - return (sph_u64)sph_dec32le_aligned(src) - | ((sph_u64)sph_dec32le_aligned( - (const char *)src + 4) << 32); -#elif SPH_PPC64_GCC && !SPH_NO_ASM - sph_u64 tmp; - - __asm__ __volatile__ ( - "ldbrx %0,0,%1" : "=r" (tmp) : "r" (src)); - return tmp; - */ -#else - return sph_bswap64(*(const sph_u64 *)src); -#endif -#else - return *(const sph_u64 *)src; -#endif - } else { - return (sph_u64)(((const unsigned char *)src)[0]) - | ((sph_u64)(((const unsigned char *)src)[1]) << 8) - | ((sph_u64)(((const unsigned char *)src)[2]) << 16) - | ((sph_u64)(((const unsigned char *)src)[3]) << 24) - | ((sph_u64)(((const unsigned char *)src)[4]) << 32) - | ((sph_u64)(((const unsigned char *)src)[5]) << 40) - | ((sph_u64)(((const unsigned char *)src)[6]) << 48) - | ((sph_u64)(((const unsigned char *)src)[7]) << 56); - } -#endif -#else - return (sph_u64)(((const unsigned char *)src)[0]) - | ((sph_u64)(((const unsigned char *)src)[1]) << 8) - | ((sph_u64)(((const unsigned char *)src)[2]) << 16) - | ((sph_u64)(((const unsigned char *)src)[3]) << 24) - | ((sph_u64)(((const unsigned char *)src)[4]) << 32) - | ((sph_u64)(((const unsigned char *)src)[5]) << 40) - | ((sph_u64)(((const unsigned char *)src)[6]) << 48) - | ((sph_u64)(((const unsigned char *)src)[7]) << 56); -#endif -} - -/** - * Decode a 64-bit value from the provided buffer (little endian convention). - * The source buffer must be properly aligned. - * - * @param src the source buffer (64-bit aligned) - * @return the decoded value - */ -static SPH_INLINE sph_u64 -sph_dec64le_aligned(const void *src) -{ -#if SPH_LITTLE_ENDIAN - return *(const sph_u64 *)src; -#elif SPH_BIG_ENDIAN -#if SPH_SPARCV9_GCC_64 && !SPH_NO_ASM - sph_u64 tmp; - - __asm__ __volatile__ ("ldxa [%1]0x88,%0" : "=r" (tmp) : "r" (src)); - return tmp; -/* - * Not worth it generally. - * -#elif SPH_PPC32_GCC && !SPH_NO_ASM - return (sph_u64)sph_dec32le_aligned(src) - | ((sph_u64)sph_dec32le_aligned((const char *)src + 4) << 32); -#elif SPH_PPC64_GCC && !SPH_NO_ASM - sph_u64 tmp; - - __asm__ __volatile__ ("ldbrx %0,0,%1" : "=r" (tmp) : "r" (src)); - return tmp; - */ -#else - return sph_bswap64(*(const sph_u64 *)src); -#endif -#else - return (sph_u64)(((const unsigned char *)src)[0]) - | ((sph_u64)(((const unsigned char *)src)[1]) << 8) - | ((sph_u64)(((const unsigned char *)src)[2]) << 16) - | ((sph_u64)(((const unsigned char *)src)[3]) << 24) - | ((sph_u64)(((const unsigned char *)src)[4]) << 32) - | ((sph_u64)(((const unsigned char *)src)[5]) << 40) - | ((sph_u64)(((const unsigned char *)src)[6]) << 48) - | ((sph_u64)(((const unsigned char *)src)[7]) << 56); -#endif -} - -#endif - -#endif /* Doxygen excluded block */ - -#endif diff --git a/algo/yespower/yespower-blake2b.c b/algo/yespower/yespower-blake2b.c index 8dd85c4f..41dec41b 100644 --- a/algo/yespower/yespower-blake2b.c +++ b/algo/yespower/yespower-blake2b.c @@ -95,7 +95,7 @@ #include #include #include -#include "crypto/blake2b-yp.h" +#include "crypto/hmac-blake2b.h" #include "yespower.h" #ifdef __unix__ @@ -1136,6 +1136,7 @@ int yespower_b2b(yespower_local_t *local, salsa20_blk_t *V, *XY; pwxform_ctx_t ctx; uint8_t init_hash[32]; + sph_blake2b_ctx blake2b_ctx; /* Sanity-check parameters */ if ((N < 1024 || N > 512 * 1024 || r < 8 || r > 32 || @@ -1167,7 +1168,9 @@ int yespower_b2b(yespower_local_t *local, ctx.S0 = S; ctx.S1 = S + Swidth_to_Sbytes1(Swidth); - blake2b_yp_hash(init_hash, src, srclen); + sph_blake2b_init( &blake2b_ctx, 32, NULL, 0 ); + sph_blake2b_update( &blake2b_ctx, src, srclen ); + sph_blake2b_final( &blake2b_ctx, init_hash ); ctx.S2 = S + 2 * Swidth_to_Sbytes1(Swidth); ctx.w = 0; @@ -1181,7 +1184,7 @@ int yespower_b2b(yespower_local_t *local, if ( work_restart[thrid].restart ) return false; - pbkdf2_blake2b_yp(init_hash, sizeof(init_hash), src, srclen, 1, B, 128); + pbkdf2_blake2b(init_hash, sizeof(init_hash), src, srclen, 1, B, 128); if ( work_restart[thrid].restart ) return false; @@ -1190,7 +1193,7 @@ int yespower_b2b(yespower_local_t *local, if ( work_restart[thrid].restart ) return false; - hmac_blake2b_yp_hash((uint8_t *)dst, B + B_size - 64, 64, init_hash, sizeof(init_hash)); + hmac_blake2b_hash((uint8_t *)dst, B + B_size - 64, 64, init_hash, sizeof(init_hash)); /* Success! */ return 1; diff --git a/algo/yespower/yespower-gate.c b/algo/yespower/yespower-gate.c index 89680371..a52dea2c 100644 --- a/algo/yespower/yespower-gate.c +++ b/algo/yespower/yespower-gate.c @@ -249,7 +249,7 @@ bool register_power2b_algo( algo_gate_t* gate ) applog( LOG_NOTICE,"Key= \"%s\"", yespower_params.pers ); applog( LOG_NOTICE,"Key length= %d\n", yespower_params.perslen ); - gate->optimizations = SSE2_OPT; + gate->optimizations = SSE2_OPT | AVX2_OPT; gate->scanhash = (void*)&scanhash_yespower_b2b; gate->hash = (void*)&yespower_b2b_hash; opt_target_factor = 65536.0; diff --git a/configure b/configure index 8778a4c6..93e54874 100755 --- a/configure +++ b/configure @@ -1,6 +1,6 @@ #! /bin/sh # Guess values for system-dependent variables and create Makefiles. -# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.20.0. +# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.20.1. # # # Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc. @@ -577,8 +577,8 @@ MAKEFLAGS= # Identity of this package. PACKAGE_NAME='cpuminer-opt' PACKAGE_TARNAME='cpuminer-opt' -PACKAGE_VERSION='3.20.0' -PACKAGE_STRING='cpuminer-opt 3.20.0' +PACKAGE_VERSION='3.20.1' +PACKAGE_STRING='cpuminer-opt 3.20.1' PACKAGE_BUGREPORT='' PACKAGE_URL='' @@ -1332,7 +1332,7 @@ if test "$ac_init_help" = "long"; then # Omit some internal or obsolete options to make the list less imposing. # This message is too long to be a string in the A/UX 3.1 sh. cat <<_ACEOF -\`configure' configures cpuminer-opt 3.20.0 to adapt to many kinds of systems. +\`configure' configures cpuminer-opt 3.20.1 to adapt to many kinds of systems. Usage: $0 [OPTION]... [VAR=VALUE]... @@ -1404,7 +1404,7 @@ fi if test -n "$ac_init_help"; then case $ac_init_help in - short | recursive ) echo "Configuration of cpuminer-opt 3.20.0:";; + short | recursive ) echo "Configuration of cpuminer-opt 3.20.1:";; esac cat <<\_ACEOF @@ -1509,7 +1509,7 @@ fi test -n "$ac_init_help" && exit $ac_status if $ac_init_version; then cat <<\_ACEOF -cpuminer-opt configure 3.20.0 +cpuminer-opt configure 3.20.1 generated by GNU Autoconf 2.69 Copyright (C) 2012 Free Software Foundation, Inc. @@ -2012,7 +2012,7 @@ cat >config.log <<_ACEOF This file contains any messages produced by compilers while running configure, to aid debugging if configure makes a mistake. -It was created by cpuminer-opt $as_me 3.20.0, which was +It was created by cpuminer-opt $as_me 3.20.1, which was generated by GNU Autoconf 2.69. Invocation command line was $ $0 $@ @@ -2993,7 +2993,7 @@ fi # Define the identity of the package. PACKAGE='cpuminer-opt' - VERSION='3.20.0' + VERSION='3.20.1' cat >>confdefs.h <<_ACEOF @@ -5820,6 +5820,34 @@ $as_echo "#define USE_AVX2 1" >>confdefs.h { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5 $as_echo "yes" >&6; } + { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether we can compile AVX512 code" >&5 +$as_echo_n "checking whether we can compile AVX512 code... " >&6; } + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ + +int +main () +{ +asm ("vpaddd %zmm0, %zmm1, %zmm2{%k1}"); + ; + return 0; +} +_ACEOF +if ac_fn_c_try_compile "$LINENO"; then : + +$as_echo "#define USE_AVX512 1" >>confdefs.h + + { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5 +$as_echo "yes" >&6; } + +else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } + { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: The assembler does not support the AVX512 instruction set." >&5 +$as_echo "$as_me: WARNING: The assembler does not support the AVX512 instruction set." >&2;} + +fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext else { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 @@ -6690,7 +6718,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 # report actual input values of CONFIG_FILES etc. instead of their # values after options handling. ac_log=" -This file was extended by cpuminer-opt $as_me 3.20.0, which was +This file was extended by cpuminer-opt $as_me 3.20.1, which was generated by GNU Autoconf 2.69. Invocation command line was CONFIG_FILES = $CONFIG_FILES @@ -6756,7 +6784,7 @@ _ACEOF cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`" ac_cs_version="\\ -cpuminer-opt config.status 3.20.0 +cpuminer-opt config.status 3.20.1 configured by $0, generated by GNU Autoconf 2.69, with options \\"\$ac_cs_config\\" diff --git a/configure.ac b/configure.ac index 1e0589b0..d0835055 100644 --- a/configure.ac +++ b/configure.ac @@ -1,4 +1,4 @@ -AC_INIT([cpuminer-opt], [3.20.0]) +AC_INIT([cpuminer-opt], [3.20.1]) AC_PREREQ([2.59c]) AC_CANONICAL_SYSTEM @@ -93,6 +93,14 @@ then AC_COMPILE_IFELSE([AC_LANG_PROGRAM(,[asm ("vpaddd %ymm0, %ymm1, %ymm2");])], AC_DEFINE(USE_AVX2, 1, [Define to 1 if AVX2 assembly is available.]) AC_MSG_RESULT(yes) + AC_MSG_CHECKING(whether we can compile AVX512 code) + AC_COMPILE_IFELSE([AC_LANG_PROGRAM(,[asm ("vpaddd %zmm0, %zmm1, %zmm2{%k1}");])], + AC_DEFINE(USE_AVX512, 1, [Define to 1 if AVX512 assembly is available.]) + AC_MSG_RESULT(yes) + , + AC_MSG_RESULT(no) + AC_MSG_WARN([The assembler does not support the AVX512 instruction set.]) + ) , AC_MSG_RESULT(no) AC_MSG_WARN([The assembler does not support the AVX2 instruction set.]) diff --git a/cpu-miner.c b/cpu-miner.c index 8fc6c7a0..64f30935 100644 --- a/cpu-miner.c +++ b/cpu-miner.c @@ -1300,6 +1300,7 @@ static int share_result( int result, struct work *work, my_stats.share_count, acol, ares, scol, sres, rcol, rres, bcol, bres, CL_N, share_time, latency ); +/* if ( unlikely( opt_debug || !result || solved ) ) { if ( have_stratum ) @@ -1309,14 +1310,27 @@ static int share_result( int result, struct work *work, applog2( LOG_INFO, "Diff %.5g, Block %d", my_stats.share_diff, work ? work->height : last_block_height ); } +*/ if ( unlikely( !( opt_quiet || result || stale ) ) ) { - uint32_t str[8]; - uint32_t *targ; +// uint32_t str[8]; +// uint32_t *targ; + + if ( reason ) applog2( LOG_MINR, "Reject reason: %s", reason ); + { + // The exact hash is not avaiable here, it's just an imprecise + // approximation calculated from the share difficulty. It's useless + // for anything other than low diff rejects. Until and unless a + // solution is implemented to make the hash and targets avaiable + // don't bother displaying them. In the meantime display the diff for + // low diff rejects. + + if ( strstr( reason, "difficulty" ) ) + applog2( LOG_MINR, "Share diff: %.5g, Target: %.5g", + my_stats.share_diff, my_stats.target_diff ); - if ( reason ) applog( LOG_MINR, "Reject reason: %s", reason ); - +/* diff_to_hash( str, my_stats.share_diff ); applog2( LOG_INFO, "Hash: %08x%08x%08x%08x%08x%08x", str[7], str[6], str[5], str[4], str[3],str[2], str[1], str[0] ); @@ -1330,6 +1344,8 @@ static int share_result( int result, struct work *work, } applog2( LOG_INFO, "Target: %08x%08x%08x%08x%08x%08x", targ[7], targ[6], targ[5], targ[4], targ[3], targ[2], targ[1], targ[0] ); +*/ + } } return 1; } diff --git a/simd-utils/simd-128.h b/simd-utils/simd-128.h index c91a35e9..2a52eb2f 100644 --- a/simd-utils/simd-128.h +++ b/simd-utils/simd-128.h @@ -546,14 +546,13 @@ static inline void mm128_block_bswap_32( __m128i *d, const __m128i *s ) // Two input shuffle-rotate. -// Concatenate v1 & v2 and rotate as one 256 bit vector. -// Continue to use vror/vrol for now to avoid confusion with -// shufl2r/shufl2l function macros available with AVX512. +// Concatenate v1 & v2 and bit rotate as one 256 bit vector. #if defined(__SSSE3__) -// Function macro with two inputs and one output, inputs are preserved. -// Two input functions are not available without SSSE3. Use procedure +// Function macros with two inputs and one output, inputs are preserved. +// Returns the high 128 bits, ie updated v1. +// These two-input functions are not available without SSSE3. Use procedure // macros below instead. #define mm128_shufl2r_64( v1, v2 ) _mm_alignr_epi8( v2, v1, 8 ) @@ -568,12 +567,9 @@ static inline void mm128_block_bswap_32( __m128i *d, const __m128i *s ) #define mm128_shufl2r_8( v1, v2 ) _mm_alignr_epi8( v2, v1, 8 ) #define mm128_shufl2l_8( v1, v2 ) _mm_alignr_epi8( v1, v2, 8 ) -// Procedure macros with 2 inputs and 2 outputs, inputs args are overwritten. - -// These macros retain the vrol/vror name for now to avoid -// confusion with the shufl2r/shuffle2l function macros above. -// These may be renamed to something like shufl2r2 for 2 nputs and -// 2 outputs, ie SHUFfLe 2 inputs Right with 2 outputs. +// Procedure macros with 2 inputs and 2 outputs, input args are overwritten. +// Deprecated for SSSE3 and above, they exist for SSSE3 only for compatibility +// with existing code. The function macros above can be used more effciently. #define mm128_vror256_64( v1, v2 ) \ do { \