diff --git a/Makefile.am b/Makefile.am index f4163820..d5398c00 100644 --- a/Makefile.am +++ b/Makefile.am @@ -196,6 +196,7 @@ cpuminer_SOURCES = \ algo/verthash/Verthash.c \ algo/verthash/fopen_utf8.c \ algo/verthash/tiny_sha3/sha3.c \ + algo/verthash/tiny_sha3/sha3-4way.c \ algo/whirlpool/sph_whirlpool.c \ algo/whirlpool/whirlpool-hash-4way.c \ algo/whirlpool/whirlpool-gate.c \ diff --git a/README.md b/README.md index 65b3f0e7..d740fd7b 100644 --- a/README.md +++ b/README.md @@ -135,7 +135,7 @@ Supported Algorithms x14 X14 x15 X15 x16r - x16rv2 Ravencoin (RVN) + x16rv2 x16rt Gincoin (GIN) x16rt-veil Veil (VEIL) x16s Pigeoncoin (PGN) diff --git a/RELEASE_NOTES b/RELEASE_NOTES index 1c7aca5e..a1133db1 100644 --- a/RELEASE_NOTES +++ b/RELEASE_NOTES @@ -65,6 +65,14 @@ If not what makes it happen or not happen? Change Log ---------- +v3.16.2 + +Verthash: midstate prehash optimization for all architectures. +Verthash: AVX2 optimization. +GBT: added support for Bech32 addresses, untested. +Linux: added CPU frequency to benchmark log. +Fixed integer overflow in time calculations. + v3.16.1 New options for verthash: @@ -72,16 +80,12 @@ New options for verthash: data file, default is "verthash.dat" in the current directory. --verify to perform the data file integrity check at startup, default is not to verify data file integrity. - Support for creation of default verthash data file if: 1) --data-file option is not used, 2) no default data file is found in the current directory, and, 3) --verify option is used. - More detailed logs related to verthash data file. - Small verthash performance improvement. - Fixed detection of corrupt stats caused by networking issues. v3.16.0 diff --git a/algo/verthash/Verthash.c b/algo/verthash/Verthash.c index 072df372..475c79a7 100644 --- a/algo/verthash/Verthash.c +++ b/algo/verthash/Verthash.c @@ -134,87 +134,117 @@ static inline uint32_t fnv1a(const uint32_t a, const uint32_t b) return (a ^ b) * 0x1000193; } -void verthash_hash(const unsigned char* blob_bytes, - const size_t blob_size, - const unsigned char(*input)[VH_HEADER_SIZE], - unsigned char(*output)[VH_HASH_OUT_SIZE]) +void verthash_hash( const unsigned char* blob_bytes, + const size_t blob_size, + const unsigned char(*input)[VH_HEADER_SIZE], + unsigned char(*output)[VH_HASH_OUT_SIZE] ) { - unsigned char p1[VH_HASH_OUT_SIZE] __attribute__ ((aligned (64))); - sha3(&input[0], VH_HEADER_SIZE, &p1[0], VH_HASH_OUT_SIZE); - - unsigned char p0[VH_N_SUBSET]; - - unsigned char input_header[VH_HEADER_SIZE] __attribute__ ((aligned (64))); - memcpy(input_header, input, VH_HEADER_SIZE); - - for (size_t i = 0; i < VH_N_ITER; ++i) - { - input_header[0] += 1; - sha3(&input_header[0], VH_HEADER_SIZE, p0 + i * VH_P0_SIZE, VH_P0_SIZE); - } - - uint32_t* p0_index = (uint32_t*)p0; + unsigned char p1[ VH_HASH_OUT_SIZE ] __attribute__ ((aligned (64))); + unsigned char p0[ VH_N_SUBSET ] __attribute__ ((aligned (64))); uint32_t seek_indexes[VH_N_INDEXES] __attribute__ ((aligned (64))); + uint32_t* p0_index = (uint32_t*)p0; + verthash_sha3_512_final_8( p0, ( (uint64_t*)input )[ 9 ] ); + for ( size_t x = 0; x < VH_N_ROT; ++x ) { memcpy( seek_indexes + x * (VH_N_SUBSET / sizeof(uint32_t)), p0, VH_N_SUBSET); -//#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) -// 512 bit vector processing is actually slower because it reduces the CPU -// clock significantly, which also slows mem access. The AVX512 rol instruction -// is still available for smaller vectors. 
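Note on the rotate helpers used in the vectorized loops below: mm256_rol_32 and mm128_rol_32 rotate every 32-bit lane left by a constant, which is exactly what the original scalar loop did one uint32_t at a time. Without the AVX512VL rotate instruction this reduces to a shift/shift/or sequence; a minimal sketch of the AVX2 form, assuming only <immintrin.h> (the helper name is illustrative, not the macro from simd-utils):

#include <immintrin.h>

/* Rotate each 32-bit lane of v left by c bits: (x << c) | (x >> (32 - c)).
   With AVX512VL the same operation is a single _mm256_rol_epi32. */
static inline __m256i rol32_256_sketch( __m256i v, const int c )
{
    return _mm256_or_si256( _mm256_slli_epi32( v, c ),
                            _mm256_srli_epi32( v, 32 - c ) );
}
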
- -// for ( size_t y = 0; y < VH_N_SUBSET / sizeof(uint32_t); y += 16 ) -// { -// __m512i *p0_v = (__m512i*)( p0_index + y ); -// *p0_v = mm512_rol_32( *p0_v, 1 ); -// } - #if defined(__AVX2__) - for ( size_t y = 0; y < VH_N_SUBSET / sizeof(uint32_t); y += 8 ) + for ( size_t y = 0; y < VH_N_SUBSET / sizeof(__m256i); y += 8) { - __m256i *p0_v = (__m256i*)( p0_index + y ); - *p0_v = mm256_rol_32( *p0_v, 1 ); + casti_m256i( p0_index, y ) = mm256_rol_32( + casti_m256i( p0_index, y ), 1 ); + casti_m256i( p0_index, y+1 ) = mm256_rol_32( + casti_m256i( p0_index, y+1 ), 1 ); + casti_m256i( p0_index, y+2 ) = mm256_rol_32( + casti_m256i( p0_index, y+2 ), 1 ); + casti_m256i( p0_index, y+3 ) = mm256_rol_32( + casti_m256i( p0_index, y+3 ), 1 ); + casti_m256i( p0_index, y+4 ) = mm256_rol_32( + casti_m256i( p0_index, y+4 ), 1 ); + casti_m256i( p0_index, y+5 ) = mm256_rol_32( + casti_m256i( p0_index, y+5 ), 1 ); + casti_m256i( p0_index, y+6 ) = mm256_rol_32( + casti_m256i( p0_index, y+6 ), 1 ); + casti_m256i( p0_index, y+7 ) = mm256_rol_32( + casti_m256i( p0_index, y+7 ), 1 ); } #else - for ( size_t y = 0; y < VH_N_SUBSET / sizeof(uint32_t); y += 4 ) + for ( size_t y = 0; y < VH_N_SUBSET / sizeof(__m128i); y += 8) { - __m128i *p0_v = (__m128i*)( p0_index + y ); - *p0_v = mm128_rol_32( *p0_v, 1 ); + casti_m128i( p0_index, y ) = mm128_rol_32( + casti_m128i( p0_index, y ), 1 ); + casti_m128i( p0_index, y+1 ) = mm128_rol_32( + casti_m128i( p0_index, y+1 ), 1 ); + casti_m128i( p0_index, y+2 ) = mm128_rol_32( + casti_m128i( p0_index, y+2 ), 1 ); + casti_m128i( p0_index, y+3 ) = mm128_rol_32( + casti_m128i( p0_index, y+3 ), 1 ); + casti_m128i( p0_index, y+4 ) = mm128_rol_32( + casti_m128i( p0_index, y+4 ), 1 ); + casti_m128i( p0_index, y+5 ) = mm128_rol_32( + casti_m128i( p0_index, y+5 ), 1 ); + casti_m128i( p0_index, y+6 ) = mm128_rol_32( + casti_m128i( p0_index, y+6 ), 1 ); + casti_m128i( p0_index, y+7 ) = mm128_rol_32( + casti_m128i( p0_index, y+7 ), 1 ); } - + #endif -// for (size_t y = 0; y < VH_N_SUBSET / sizeof(uint32_t); ++y) -// { -// *(p0_index + y) = ( *(p0_index + y) << 1 ) -// | ( 1 & (*(p0_index + y) >> 31) ); -// } } + sha3( &input[0], VH_HEADER_SIZE, &p1[0], VH_HASH_OUT_SIZE ); + uint32_t* p1_32 = (uint32_t*)p1; uint32_t* blob_bytes_32 = (uint32_t*)blob_bytes; uint32_t value_accumulator = 0x811c9dc5; - const uint32_t mdiv = ((blob_size - VH_HASH_OUT_SIZE) / VH_BYTE_ALIGNMENT) + 1; - for (size_t i = 0; i < VH_N_INDEXES; i++) + const uint32_t mdiv = ( ( blob_size - VH_HASH_OUT_SIZE ) + / VH_BYTE_ALIGNMENT ) + 1; +#if defined (__AVX2__) + const __m256i k = _mm256_set1_epi32( 0x1000193 ); +#elif defined(__SSE41__) + const __m128i k = _mm_set1_epi32( 0x1000193 ); +#endif + + for ( size_t i = 0; i < VH_N_INDEXES; i++ ) { - const uint32_t offset = (fnv1a(seek_indexes[i], value_accumulator) % mdiv) * VH_BYTE_ALIGNMENT / sizeof(uint32_t); + const uint32_t offset = + ( fnv1a( seek_indexes[i], value_accumulator) % mdiv ) + * ( VH_BYTE_ALIGNMENT / sizeof(uint32_t) ); const uint32_t *blob_off = blob_bytes_32 + offset; - for (size_t i2 = 0; i2 < VH_HASH_OUT_SIZE / sizeof(uint32_t); i2++) - { - const uint32_t value = *( blob_off + i2 ); - uint32_t* p1_ptr = p1_32 + i2; - *p1_ptr = fnv1a( *p1_ptr, value ); - value_accumulator = fnv1a( value_accumulator, value ); - } + + // update value accumulator for next seek index + value_accumulator = fnv1a( value_accumulator, blob_off[0] ); + value_accumulator = fnv1a( value_accumulator, blob_off[1] ); + value_accumulator = fnv1a( value_accumulator, blob_off[2] ); + 
value_accumulator = fnv1a( value_accumulator, blob_off[3] ); + value_accumulator = fnv1a( value_accumulator, blob_off[4] ); + value_accumulator = fnv1a( value_accumulator, blob_off[5] ); + value_accumulator = fnv1a( value_accumulator, blob_off[6] ); + value_accumulator = fnv1a( value_accumulator, blob_off[7] ); + +#if defined (__AVX2__) + *(__m256i*)p1_32 = _mm256_mullo_epi32( _mm256_xor_si256( + *(__m256i*)p1_32, *(__m256i*)blob_off ), k ); +#elif defined(__SSE41__) + casti_m128i( p1_32, 0 ) = _mm_mullo_epi32( _mm_xor_si128( + casti_m128i( p1_32, 0 ), casti_m128i( blob_off, 0 ) ), k ); + casti_m128i( p1_32, 1 ) = _mm_mullo_epi32( _mm_xor_si128( + casti_m128i( p1_32, 1 ), casti_m128i( blob_off, 1 ) ), k ); +#else + for ( size_t i2 = 0; i2 < VH_HASH_OUT_SIZE / sizeof(uint32_t); i2++ ) + p1_32[i2] = fnv1a( p1_32[i2], blob_off[i2] ); +#endif + } - memcpy(output, p1, VH_HASH_OUT_SIZE); + memcpy( output, p1, VH_HASH_OUT_SIZE ); } //----------------------------------------------------------------------------- diff --git a/algo/verthash/Verthash.h b/algo/verthash/Verthash.h index f81c9750..5cce653a 100644 --- a/algo/verthash/Verthash.h +++ b/algo/verthash/Verthash.h @@ -52,6 +52,8 @@ void verthash_hash(const unsigned char* blob_bytes, const unsigned char(*input)[VH_HEADER_SIZE], unsigned char(*output)[VH_HASH_OUT_SIZE]); +void verthash_sha3_512_prehash_72( const void *input ); +void verthash_sha3_512_final_8( void *hash, const uint64_t nonce ); #endif // !Verthash_INCLUDE_ONCE diff --git a/algo/verthash/tiny_sha3/sha3-4way.c b/algo/verthash/tiny_sha3/sha3-4way.c new file mode 100644 index 00000000..abbc8483 --- /dev/null +++ b/algo/verthash/tiny_sha3/sha3-4way.c @@ -0,0 +1,301 @@ +#if defined(__AVX2__) + +// sha3-4way.c +// 19-Nov-11 Markku-Juhani O. Saarinen +// vectorization by JayDDee 2021-03-27 +// +// Revised 07-Aug-15 to match with official release of FIPS PUB 202 "SHA3" +// Revised 03-Sep-15 for portability + OpenSSL - style API + +#include "sha3-4way.h" + +// constants +static const uint64_t keccakf_rndc[24] = { + 0x0000000000000001, 0x0000000000008082, 0x800000000000808a, + 0x8000000080008000, 0x000000000000808b, 0x0000000080000001, + 0x8000000080008081, 0x8000000000008009, 0x000000000000008a, + 0x0000000000000088, 0x0000000080008009, 0x000000008000000a, + 0x000000008000808b, 0x800000000000008b, 0x8000000000008089, + 0x8000000000008003, 0x8000000000008002, 0x8000000000000080, + 0x000000000000800a, 0x800000008000000a, 0x8000000080008081, + 0x8000000000008080, 0x0000000080000001, 0x8000000080008008 + }; + +void sha3_4way_keccakf( __m256i st[25] ) +{ + int i, j, r; + __m256i t, bc[5]; + + for ( r = 0; r < KECCAKF_ROUNDS; r++ ) + { + // Theta + bc[0] = _mm256_xor_si256( st[0], + mm256_xor4( st[5], st[10], st[15], st[20] ) ); + bc[1] = _mm256_xor_si256( st[1], + mm256_xor4( st[6], st[11], st[16], st[21] ) ); + bc[2] = _mm256_xor_si256( st[2], + mm256_xor4( st[7], st[12], st[17], st[22] ) ); + bc[3] = _mm256_xor_si256( st[3], + mm256_xor4( st[8], st[13], st[18], st[23] ) ); + bc[4] = _mm256_xor_si256( st[4], + mm256_xor4( st[9], st[14], st[19], st[24] ) ); + + for ( i = 0; i < 5; i++ ) + { + t = _mm256_xor_si256( bc[ (i+4) % 5 ], + mm256_rol_64( bc[ (i+1) % 5 ], 1 ) ); + st[ i ] = _mm256_xor_si256( st[ i ], t ); + st[ i+5 ] = _mm256_xor_si256( st[ i+ 5 ], t ); + st[ i+10 ] = _mm256_xor_si256( st[ i+10 ], t ); + st[ i+15 ] = _mm256_xor_si256( st[ i+15 ], t ); + st[ i+20 ] = _mm256_xor_si256( st[ i+20 ], t ); + } + + // Rho Pi +#define RHO_PI( i, c ) \ + bc[0] = st[ i ]; \ + st[ i ] = 
mm256_rol_64( t, c ); \ + t = bc[0] + + t = st[1]; + + RHO_PI( 10, 1 ); + RHO_PI( 7, 3 ); + RHO_PI( 11, 6 ); + RHO_PI( 17, 10 ); + RHO_PI( 18, 15 ); + RHO_PI( 3, 21 ); + RHO_PI( 5, 28 ); + RHO_PI( 16, 36 ); + RHO_PI( 8, 45 ); + RHO_PI( 21, 55 ); + RHO_PI( 24, 2 ); + RHO_PI( 4, 14 ); + RHO_PI( 15, 27 ); + RHO_PI( 23, 41 ); + RHO_PI( 19, 56 ); + RHO_PI( 13, 8 ); + RHO_PI( 12, 25 ); + RHO_PI( 2, 43 ); + RHO_PI( 20, 62 ); + RHO_PI( 14, 18 ); + RHO_PI( 22, 39 ); + RHO_PI( 9, 61 ); + RHO_PI( 6, 20 ); + RHO_PI( 1, 44 ); + +#undef RHO_PI + + // Chi + for ( j = 0; j < 25; j += 5 ) + { + memcpy( bc, &st[ j ], 5*32 ); + st[ j ] = _mm256_xor_si256( st[ j ], + _mm256_andnot_si256( bc[1], bc[2] ) ); + st[ j+1 ] = _mm256_xor_si256( st[ j+1 ], + _mm256_andnot_si256( bc[2], bc[3] ) ); + st[ j+2 ] = _mm256_xor_si256( st[ j+2 ], + _mm256_andnot_si256( bc[3], bc[4] ) ); + st[ j+3 ] = _mm256_xor_si256( st[ j+3 ], + _mm256_andnot_si256( bc[4], bc[0] ) ); + st[ j+4 ] = _mm256_xor_si256( st[ j+4 ], + _mm256_andnot_si256( bc[0], bc[1] ) ); + } + + // Iota + st[0] = _mm256_xor_si256( st[0], + _mm256_set1_epi64x( keccakf_rndc[ r ] ) ); + } +} + +int sha3_4way_init( sha3_4way_ctx_t *c, int mdlen ) +{ + for ( int i = 0; i < 25; i++ ) c->st[ i ] = m256_zero; + c->mdlen = mdlen; + c->rsiz = 200 - 2 * mdlen; + c->pt = 0; + return 1; +} + +int sha3_4way_update( sha3_4way_ctx_t *c, const void *data, size_t len ) +{ + size_t i; + int j = c->pt; + const int rsiz = c->rsiz / 8; + const int l = len / 8; + + for ( i = 0; i < l; i++ ) + { + c->st[ j ] = _mm256_xor_si256( c->st[ j ], + ( (const __m256i*)data )[i] ); + j++; + if ( j >= rsiz ) + { + sha3_4way_keccakf( c->st ); + j = 0; + } + } + c->pt = j; + + return 1; +} + +int sha3_4way_final( void *md, sha3_4way_ctx_t *c ) +{ + c->st[ c->pt ] = _mm256_xor_si256( c->st[ c->pt ], + m256_const1_64( 6 ) ); + c->st[ c->rsiz / 8 - 1 ] = + _mm256_xor_si256( c->st[ c->rsiz / 8 - 1 ], + m256_const1_64( 0x8000000000000000 ) ); + sha3_4way_keccakf( c->st ); + memcpy( md, c->st, c->mdlen * 4 ); + return 1; +} + +void *sha3_4way( const void *in, size_t inlen, void *md, int mdlen ) +{ + sha3_4way_ctx_t ctx; + sha3_4way_init( &ctx, mdlen); + sha3_4way_update( &ctx, in, inlen ); + sha3_4way_final( md, &ctx ); + return md; +} + +#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) + +void sha3_8way_keccakf( __m512i st[25] ) +{ + int i, j, r; + __m512i t, bc[5]; + + // actual iteration + for ( r = 0; r < KECCAKF_ROUNDS; r++ ) + { + + // Theta + for ( i = 0; i < 5; i++ ) + bc[i] = _mm512_xor_si512( st[i], + mm512_xor4( st[ i+5 ], st[ i+10 ], st[ i+15 ], st[i+20 ] ) ); + + for ( i = 0; i < 5; i++ ) + { + t = _mm512_xor_si512( bc[(i + 4) % 5], + _mm512_rol_epi64( bc[(i + 1) % 5], 1 ) ); + for ( j = 0; j < 25; j += 5 ) + st[j + i] = _mm512_xor_si512( st[j + i], t ); + } + + // Rho Pi +#define RHO_PI( i, c ) \ + bc[0] = st[ i ]; \ + st[ i ] = _mm512_rol_epi64( t, c ); \ + t = bc[0] + + t = st[1]; + + RHO_PI( 10, 1 ); + RHO_PI( 7, 3 ); + RHO_PI( 11, 6 ); + RHO_PI( 17, 10 ); + RHO_PI( 18, 15 ); + RHO_PI( 3, 21 ); + RHO_PI( 5, 28 ); + RHO_PI( 16, 36 ); + RHO_PI( 8, 45 ); + RHO_PI( 21, 55 ); + RHO_PI( 24, 2 ); + RHO_PI( 4, 14 ); + RHO_PI( 15, 27 ); + RHO_PI( 23, 41 ); + RHO_PI( 19, 56 ); + RHO_PI( 13, 8 ); + RHO_PI( 12, 25 ); + RHO_PI( 2, 43 ); + RHO_PI( 20, 62 ); + RHO_PI( 14, 18 ); + RHO_PI( 22, 39 ); + RHO_PI( 9, 61 ); + RHO_PI( 6, 20 ); + RHO_PI( 1, 44 ); + +#undef RHO_PI + + // Chi + for ( j = 0; j < 25; j += 5 ) + { + for ( i = 0; i < 5; i++ ) + bc[i] 
= st[j + i]; + for ( i = 0; i < 5; i++ ) + st[ j+i ] = _mm512_xor_si512( st[ j+i ], _mm512_andnot_si512( + bc[ (i+1) % 5 ], bc[ (i+2) % 5 ] ) ); + } + + // Iota + st[0] = _mm512_xor_si512( st[0], _mm512_set1_epi64( keccakf_rndc[r] ) ); + } +} + +// Initialize the context for SHA3 + +int sha3_8way_init( sha3_8way_ctx_t *c, int mdlen ) +{ + for ( int i = 0; i < 25; i++ ) c->st[ i ] = m512_zero; + c->mdlen = mdlen; + c->rsiz = 200 - 2 * mdlen; + c->pt = 0; + return 1; +} + +// update state with more data + +int sha3_8way_update( sha3_8way_ctx_t *c, const void *data, size_t len ) +{ + size_t i; + int j = c->pt; + const int rsiz = c->rsiz / 8; + const int l = len / 8; + + for ( i = 0; i < l; i++ ) + { + c->st[ j ] = _mm512_xor_si512( c->st[ j ], + ( (const __m512i*)data )[i] ); + j++; + if ( j >= rsiz ) + { + sha3_8way_keccakf( c->st ); + j = 0; + } + } + c->pt = j; + + return 1; +} + +// finalize and output a hash + +int sha3_8way_final( void *md, sha3_8way_ctx_t *c ) +{ + c->st[ c->pt ] = + _mm512_xor_si512( c->st[ c->pt ], + m512_const1_64( 6 ) ); + c->st[ c->rsiz / 8 - 1 ] = + _mm512_xor_si512( c->st[ c->rsiz / 8 - 1 ], + m512_const1_64( 0x8000000000000000 ) ); + sha3_8way_keccakf( c->st ); + memcpy( md, c->st, c->mdlen * 8 ); + return 1; +} + +// compute a SHA-3 hash (md) of given byte length from "in" + +void *sha3_8way( const void *in, size_t inlen, void *md, int mdlen ) +{ + sha3_8way_ctx_t sha3; + sha3_8way_init( &sha3, mdlen); + sha3_8way_update( &sha3, in, inlen ); + sha3_8way_final( md, &sha3 ); + return md; +} + +#endif // AVX512 +#endif // AVX2 diff --git a/algo/verthash/tiny_sha3/sha3-4way.h b/algo/verthash/tiny_sha3/sha3-4way.h new file mode 100644 index 00000000..6723b73b --- /dev/null +++ b/algo/verthash/tiny_sha3/sha3-4way.h @@ -0,0 +1,67 @@ +// sha3.h +// 19-Nov-11 Markku-Juhani O. Saarinen +// 2021-03-27 JayDDee +// +#ifndef SHA3_4WAY_H +#define SHA3_4WAY_H + +#include +#include +#include "simd-utils.h" + +#if defined(__cplusplus) +extern "C" { +#endif + +#ifndef KECCAKF_ROUNDS +#define KECCAKF_ROUNDS 24 +#endif + +#if defined(__AVX2__) + +typedef struct +{ + __m256i st[25]; // 64-bit words * 4 lanes + int pt, rsiz, mdlen; // these don't overflow +} sha3_4way_ctx_t __attribute__ ((aligned (64)));; + +// Compression function. +void sha3_4way_keccakf( __m256i st[25] ); + +// OpenSSL - like interfece +int sha3_4way_init( sha3_4way_ctx_t *c, int mdlen ); // mdlen = hash output in bytes +int sha3_4way_update( sha3_4way_ctx_t *c, const void *data, size_t len ); +int sha3_4way_final( void *md, sha3_4way_ctx_t *c ); // digest goes to md + +// compute a sha3 hash (md) of given byte length from "in" +void *sha3_4way( const void *in, size_t inlen, void *md, int mdlen ); + + +#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) + +// state context +typedef struct +{ + __m512i st[25]; // 64-bit words * 8 lanes + int pt, rsiz, mdlen; // these don't overflow +} sha3_8way_ctx_t __attribute__ ((aligned (64)));; + +// Compression function. 
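Usage note for the 4-way and 8-way contexts declared here: they follow the same init/update/final pattern as the scalar tiny_sha3, but each state word is a vector carrying one 64-bit word per message, so callers pass data already interleaved at 64-bit granularity. A minimal sketch of the AVX2 4-way path, broadcasting one 80-byte header to all four lanes the way verthash-gate.c does; the interleave helpers and include paths are assumptions from the tree layout:

#include "simd-utils.h"
#include "tiny_sha3/sha3-4way.h"

// Hash the first 72 bytes of one 80-byte header in four lanes at once and
// de-interleave the four (identical) 64-byte digests.
static void sha3_512_4way_sketch( unsigned char hash[4*64], const void *header )
{
    __m256i in4[ 10 ] __attribute__ ((aligned (64)));  // 4 lanes x 80 bytes, interleaved 4x64
    __m256i md4[ 8 ]  __attribute__ ((aligned (64)));  // 4 lanes x 64 bytes, interleaved 4x64

    mm256_intrlv80_4x64( in4, header );   // broadcast the header to all 4 lanes
    sha3_4way( in4, 72, md4, 64 );        // SHA3-512 over 72 bytes per lane
    dintrlv_4x64( hash, hash+64, hash+128, hash+192, md4, 512 );
}
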
+void sha3_8way_keccakf( __m512i st[25] ); + +// OpenSSL - like interfece +int sha3_8way_init( sha3_8way_ctx_t *c, int mdlen ); // mdlen = hash output in bytes +int sha3_8way_update( sha3_8way_ctx_t *c, const void *data, size_t len ); +int sha3_8way_final( void *md, sha3_8way_ctx_t *c ); // digest goes to md + +// compute a sha3 hash (md) of given byte length from "in" +void *sha3_8way( const void *in, size_t inlen, void *md, int mdlen ); + +#endif // AVX512 +#endif // AVX2 + +#if defined(__cplusplus) +} +#endif + +#endif diff --git a/algo/verthash/tiny_sha3/sha3.c b/algo/verthash/tiny_sha3/sha3.c index 931ae020..94b06602 100644 --- a/algo/verthash/tiny_sha3/sha3.c +++ b/algo/verthash/tiny_sha3/sha3.c @@ -5,6 +5,7 @@ // Revised 03-Sep-15 for portability + OpenSSL - style API #include "sha3.h" +#include // update the state with given number of rounds @@ -21,6 +22,7 @@ void sha3_keccakf(uint64_t st[25]) 0x000000000000800a, 0x800000008000000a, 0x8000000080008081, 0x8000000000008080, 0x0000000080000001, 0x8000000080008008 }; +/* const int keccakf_rotc[24] = { 1, 3, 6, 10, 15, 21, 28, 36, 45, 55, 2, 14, 27, 41, 56, 8, 25, 43, 62, 18, 39, 61, 20, 44 @@ -29,6 +31,7 @@ void sha3_keccakf(uint64_t st[25]) 10, 7, 11, 17, 18, 3, 5, 16, 8, 21, 24, 4, 15, 23, 19, 13, 12, 2, 20, 14, 22, 9, 6, 1 }; +*/ // variables int i, j, r; @@ -60,14 +63,50 @@ void sha3_keccakf(uint64_t st[25]) st[j + i] ^= t; } + // Rho Pi +#define RHO_PI( i, c ) \ + bc[0] = st[ i ]; \ + st[ i ] = ROTL64( t, c ); \ + t = bc[0] + t = st[1]; + + RHO_PI( 10, 1 ); + RHO_PI( 7, 3 ); + RHO_PI( 11, 6 ); + RHO_PI( 17, 10 ); + RHO_PI( 18, 15 ); + RHO_PI( 3, 21 ); + RHO_PI( 5, 28 ); + RHO_PI( 16, 36 ); + RHO_PI( 8, 45 ); + RHO_PI( 21, 55 ); + RHO_PI( 24, 2 ); + RHO_PI( 4, 14 ); + RHO_PI( 15, 27 ); + RHO_PI( 23, 41 ); + RHO_PI( 19, 56 ); + RHO_PI( 13, 8 ); + RHO_PI( 12, 25 ); + RHO_PI( 2, 43 ); + RHO_PI( 20, 62 ); + RHO_PI( 14, 18 ); + RHO_PI( 22, 39 ); + RHO_PI( 9, 61 ); + RHO_PI( 6, 20 ); + RHO_PI( 1, 44 ); + +#undef RHO_PI + +/* for (i = 0; i < 24; i++) { j = keccakf_piln[i]; bc[0] = st[j]; st[j] = ROTL64(t, keccakf_rotc[i]); t = bc[0]; } +*/ // Chi for (j = 0; j < 25; j += 5) { @@ -118,17 +157,20 @@ int sha3_init(sha3_ctx_t *c, int mdlen) int sha3_update(sha3_ctx_t *c, const void *data, size_t len) { size_t i; - int j; - - j = c->pt; - for (i = 0; i < len; i++) { - c->st.b[j++] ^= ((const uint8_t *) data)[i]; - if (j >= c->rsiz) { - sha3_keccakf(c->st.q); + int j = c->pt / 8; + const int rsiz = c->rsiz / 8; + const int l = len / 8; + + for ( i = 0; i < l; i++ ) + { + c->st.q[ j++ ] ^= ( ((const uint64_t *) data) [i] ); + if ( j >= rsiz ) + { + sha3_keccakf( c->st.q ); j = 0; } } - c->pt = j; + c->pt = j*8; return 1; } @@ -137,16 +179,10 @@ int sha3_update(sha3_ctx_t *c, const void *data, size_t len) int sha3_final(void *md, sha3_ctx_t *c) { - int i; - - c->st.b[c->pt] ^= 0x06; - c->st.b[c->rsiz - 1] ^= 0x80; + c->st.q[ c->pt / 8 ] ^= 6; + c->st.q[ c->rsiz / 8 - 1 ] ^= 0x8000000000000000; sha3_keccakf(c->st.q); - - for (i = 0; i < c->mdlen; i++) { - ((uint8_t *) md)[i] = c->st.b[i]; - } - + memcpy( md, c->st.q, c->mdlen ); return 1; } @@ -155,7 +191,6 @@ int sha3_final(void *md, sha3_ctx_t *c) void *sha3(const void *in, size_t inlen, void *md, int mdlen) { sha3_ctx_t sha3; - sha3_init(&sha3, mdlen); sha3_update(&sha3, in, inlen); sha3_final(md, &sha3); diff --git a/algo/verthash/verthash-gate.c b/algo/verthash/verthash-gate.c index 00c137fc..029ce462 100644 --- a/algo/verthash/verthash-gate.c +++ b/algo/verthash/verthash-gate.c @@ -1,6 +1,7 
@@ #include "algo-gate-api.h" #include "algo/sha/sph_sha2.h" #include "Verthash.h" +#include "tiny_sha3/sha3-4way.h" static verthash_info_t verthashInfo; @@ -12,6 +13,82 @@ static const uint8_t verthashDatFileHash_bytes[32] = 0x29, 0xec, 0xf8, 0x8f, 0x8a, 0xd4, 0x76, 0x39, 0xb6, 0xed, 0xed, 0xaf, 0xd7, 0x21, 0xaa, 0x48 }; +#if defined(__AVX2__) + +static __thread sha3_4way_ctx_t sha3_mid_ctxA; +static __thread sha3_4way_ctx_t sha3_mid_ctxB; + +#else + +static __thread sha3_ctx_t sha3_mid_ctx[8]; + +#endif + +void verthash_sha3_512_prehash_72( const void *input ) +{ +#if defined(__AVX2__) + + __m256i vin[10]; + mm256_intrlv80_4x64( vin, input ); + + sha3_4way_init( &sha3_mid_ctxA, 64 ); + sha3_4way_init( &sha3_mid_ctxB, 64 ); + + vin[0] = _mm256_add_epi8( vin[0], _mm256_set_epi64x( 4,3,2,1 ) ); + sha3_4way_update( &sha3_mid_ctxA, vin, 72 ); + + vin[0] = _mm256_add_epi8( vin[0], _mm256_set1_epi64x( 4 ) ); + sha3_4way_update( &sha3_mid_ctxB, vin, 72 ); + +#else + + char in[80] __attribute__ ((aligned (64))); + memcpy( in, input, 80 ); + for ( int i = 0; i < 8; i++ ) + { + in[0] += 1; + sha3_init( &sha3_mid_ctx[i], 64 ); + sha3_update( &sha3_mid_ctx[i], in, 72 ); + } + +#endif +} + +void verthash_sha3_512_final_8( void *hash, const uint64_t nonce ) +{ +#if defined(__AVX2__) + + __m256i vhashA[ 10 ] __attribute__ ((aligned (64))); + __m256i vhashB[ 10 ] __attribute__ ((aligned (64))); + + sha3_4way_ctx_t ctx; + __m256i vnonce = _mm256_set1_epi64x( nonce ); + + memcpy( &ctx, &sha3_mid_ctxA, sizeof ctx ); + sha3_4way_update( &ctx, &vnonce, 8 ); + sha3_4way_final( vhashA, &ctx ); + + memcpy( &ctx, &sha3_mid_ctxB, sizeof ctx ); + sha3_4way_update( &ctx, &vnonce, 8 ); + sha3_4way_final( vhashB, &ctx ); + + dintrlv_4x64( hash, hash+64, hash+128, hash+192, vhashA, 512 ); + dintrlv_4x64( hash+256, hash+320, hash+384, hash+448, vhashB, 512 ); + +#else + + for ( int i = 0; i < 8; i++ ) + { + sha3_ctx_t ctx; + memcpy( &ctx, &sha3_mid_ctx[i], sizeof ctx ); + sha3_update( &ctx, &nonce, 8 ); + sha3_final( hash + i*64, &ctx ); + } + +#endif +} + + int scanhash_verthash( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ) { @@ -26,6 +103,8 @@ int scanhash_verthash( struct work *work, uint32_t max_nonce, const bool bench = opt_benchmark; mm128_bswap32_80( edata, pdata ); + verthash_sha3_512_prehash_72( edata ); + do { edata[19] = n; @@ -51,15 +130,14 @@ bool register_verthash_algo( algo_gate_t* gate ) opt_target_factor = 256.0; gate->scanhash = (void*)&scanhash_verthash; + gate->optimizations = AVX2_OPT; - // verthash data file char *verthash_data_file = opt_data_file ? opt_data_file : default_verthash_data_file; int vhLoadResult = verthash_info_init( &verthashInfo, verthash_data_file ); if (vhLoadResult == 0) // No Error { - // and verify data file(if it was enabled) if ( opt_verify ) { uint8_t vhDataFileHash[32] = { 0 }; @@ -78,7 +156,6 @@ bool register_verthash_algo( algo_gate_t* gate ) } } else - { // Handle Verthash error codes if ( vhLoadResult == 1 ) diff --git a/configure b/configure index 00998223..e18537aa 100755 --- a/configure +++ b/configure @@ -1,6 +1,6 @@ #! /bin/sh # Guess values for system-dependent variables and create Makefiles. -# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.16.1. +# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.16.2. # # # Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc. @@ -577,8 +577,8 @@ MAKEFLAGS= # Identity of this package. 
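Background on the midstate prehash introduced in verthash-gate.c above: SHA3-512 has a 72-byte rate (200 - 2*64), so an 80-byte block header splits into one constant 72-byte block and an 8-byte tail that carries the nonce. The constant block is absorbed once per job; each nonce only copies the saved context, absorbs the tail and finalizes. A single-lane sketch of that split (the real code also bumps the first header byte per lane to produce the eight different subset hashes; the helper names here are illustrative):

#include <string.h>
#include <stdint.h>
#include "tiny_sha3/sha3.h"

static sha3_ctx_t mid;                         // saved midstate, one per job

static void prehash_72_sketch( const unsigned char header[80] )
{
    sha3_init( &mid, 64 );
    sha3_update( &mid, header, 72 );           // exactly one full rate block
}

static void final_8_sketch( void *hash64, const unsigned char header[80] )
{
    sha3_ctx_t ctx = mid;                      // resume from the midstate
    uint64_t tail;
    memcpy( &tail, header + 72, 8 );           // bytes 72..79 include the nonce
    sha3_update( &ctx, &tail, 8 );
    sha3_final( hash64, &ctx );                // same digest as sha3( header, 80, hash64, 64 )
}
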
PACKAGE_NAME='cpuminer-opt' PACKAGE_TARNAME='cpuminer-opt' -PACKAGE_VERSION='3.16.1' -PACKAGE_STRING='cpuminer-opt 3.16.1' +PACKAGE_VERSION='3.16.2' +PACKAGE_STRING='cpuminer-opt 3.16.2' PACKAGE_BUGREPORT='' PACKAGE_URL='' @@ -1332,7 +1332,7 @@ if test "$ac_init_help" = "long"; then # Omit some internal or obsolete options to make the list less imposing. # This message is too long to be a string in the A/UX 3.1 sh. cat <<_ACEOF -\`configure' configures cpuminer-opt 3.16.1 to adapt to many kinds of systems. +\`configure' configures cpuminer-opt 3.16.2 to adapt to many kinds of systems. Usage: $0 [OPTION]... [VAR=VALUE]... @@ -1404,7 +1404,7 @@ fi if test -n "$ac_init_help"; then case $ac_init_help in - short | recursive ) echo "Configuration of cpuminer-opt 3.16.1:";; + short | recursive ) echo "Configuration of cpuminer-opt 3.16.2:";; esac cat <<\_ACEOF @@ -1509,7 +1509,7 @@ fi test -n "$ac_init_help" && exit $ac_status if $ac_init_version; then cat <<\_ACEOF -cpuminer-opt configure 3.16.1 +cpuminer-opt configure 3.16.2 generated by GNU Autoconf 2.69 Copyright (C) 2012 Free Software Foundation, Inc. @@ -2012,7 +2012,7 @@ cat >config.log <<_ACEOF This file contains any messages produced by compilers while running configure, to aid debugging if configure makes a mistake. -It was created by cpuminer-opt $as_me 3.16.1, which was +It was created by cpuminer-opt $as_me 3.16.2, which was generated by GNU Autoconf 2.69. Invocation command line was $ $0 $@ @@ -2993,7 +2993,7 @@ fi # Define the identity of the package. PACKAGE='cpuminer-opt' - VERSION='3.16.1' + VERSION='3.16.2' cat >>confdefs.h <<_ACEOF @@ -6690,7 +6690,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 # report actual input values of CONFIG_FILES etc. instead of their # values after options handling. ac_log=" -This file was extended by cpuminer-opt $as_me 3.16.1, which was +This file was extended by cpuminer-opt $as_me 3.16.2, which was generated by GNU Autoconf 2.69. Invocation command line was CONFIG_FILES = $CONFIG_FILES @@ -6756,7 +6756,7 @@ _ACEOF cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`" ac_cs_version="\\ -cpuminer-opt config.status 3.16.1 +cpuminer-opt config.status 3.16.2 configured by $0, generated by GNU Autoconf 2.69, with options \\"\$ac_cs_config\\" diff --git a/configure.ac b/configure.ac index 29b3b9a7..5ee7b2f7 100644 --- a/configure.ac +++ b/configure.ac @@ -1,4 +1,4 @@ -AC_INIT([cpuminer-opt], [3.16.1]) +AC_INIT([cpuminer-opt], [3.16.2]) AC_PREREQ([2.59c]) AC_CANONICAL_SYSTEM diff --git a/cpu-miner.c b/cpu-miner.c index 0ed29b92..e52168bd 100644 --- a/cpu-miner.c +++ b/cpu-miner.c @@ -555,7 +555,11 @@ static bool gbt_work_decode( const json_t *val, struct work *work ) if ( !s ) continue; if ( !strcmp( s, "segwit" ) || !strcmp( s, "!segwit" ) ) + { segwit = true; + if ( opt_debug ) + applog( LOG_INFO, "GBT: SegWit is enabled" ); + } } } // Segwit END @@ -954,25 +958,25 @@ void scale_hash_for_display ( double* hashrate, char* prefix ) else { *prefix = 'Y'; *hashrate /= 1e24; } } -static inline void sprintf_et( char *str, int seconds ) +static inline void sprintf_et( char *str, long unsigned int seconds ) { - // sprintf doesn't like uint64_t, Linux thinks it's long, Windows long long. 
- unsigned int min = seconds / 60; - unsigned int sec = seconds % 60; - unsigned int hrs = min / 60; + long unsigned int min = seconds / 60; + long unsigned int sec = seconds % 60; + long unsigned int hrs = min / 60; + if ( unlikely( hrs ) ) { - unsigned int years = hrs / (24*365); - unsigned int days = hrs / 24; - if ( years ) - sprintf( str, "%uy%ud", years, years % 365 ); - else if ( days ) //0d00h - sprintf( str, "%ud%02uh", days, hrs % 24 ); + long unsigned int days = hrs / 24; + long unsigned int years = days / 365; + if ( years ) // 0y000d + sprintf( str, "%luy%lud", years, years % 365 ); + else if ( days ) // 0d00h + sprintf( str, "%lud%02luh", days, hrs % 24 ); else // 0h00m - sprintf( str, "%uh%02um", hrs, min % 60 ); + sprintf( str, "%luh%02lum", hrs, min % 60 ); } else // 0m00s - sprintf( str, "%um%02us", min, sec ); + sprintf( str, "%lum%02lus", min, sec ); } const long double exp32 = EXP32; // 2**32 @@ -1071,7 +1075,8 @@ void report_summary_log( bool force ) double share_time = (double)et.tv_sec + (double)et.tv_usec / 1e6; double ghrate = global_hashrate; - double shrate = safe_div( exp32 * last_targetdiff * (double)(accepts), + double target_diff = exp32 * last_targetdiff; + double shrate = safe_div( target_diff * (double)(accepts), share_time, 0. ); double sess_hrate = safe_div( exp32 * norm_diff_sum, (double)uptime.tv_sec, 0. ); @@ -1099,12 +1104,12 @@ void report_summary_log( bool force ) if ( accepted_share_count < submitted_share_count ) { - double ltd = exp32 * last_targetdiff; double lost_ghrate = uptime.tv_sec == 0 ? 0. - : ltd * (double)(submitted_share_count - accepted_share_count ) + : target_diff + * (double)(submitted_share_count - accepted_share_count ) / (double)uptime.tv_sec; double lost_shrate = share_time == 0. ? 0. 
- : ltd * (double)(submits - accepts ) / share_time; + : target_diff * (double)(submits - accepts ) / share_time; char lshr_units[4] = {0}; char lghr_units[4] = {0}; scale_hash_for_display( &lost_shrate, lshr_units ); @@ -2437,10 +2442,14 @@ static void *miner_thread( void *userdata ) #if ((defined(_WIN64) || defined(__WINDOWS__)) || defined(_WIN32)) applog( LOG_NOTICE, "Total: %s %sH/s", hr, hr_units ); #else - applog( LOG_NOTICE, "Total: %s %sH/s, CPU temp: %dC", - hr, hr_units, (uint32_t)cpu_temp(0) ); + float lo_freq = 0., hi_freq = 0.; + linux_cpu_hilo_freq( &lo_freq, &hi_freq ); + applog( LOG_NOTICE, + "Total: %s %sH/s, Temp: %dC, Freq: %.3f/%.3f GHz", + hr, hr_units, (uint32_t)cpu_temp(0), lo_freq / 1e6, + hi_freq / 1e6 ); #endif - } + } } // benchmark // conditional mining diff --git a/miner.h b/miner.h index 9e2749a3..bea4f68a 100644 --- a/miner.h +++ b/miner.h @@ -900,7 +900,7 @@ Options:\n\ --benchmark run in offline benchmark mode\n\ --cpu-affinity set process affinity to cpu core(s), mask 0x3 for cores 0 and 1\n\ --cpu-priority set process priority (default: 0 idle, 2 normal to 5 highest)\n\ - -b, --api-bind IP/Port for the miner API (default: 127.0.0.1:4048)\n\ + -b, --api-bind=address[:port] IP address for the miner API, default port is 4048)\n\ --api-remote Allow remote control\n\ --max-temp=N Only mine if cpu temp is less than specified value (linux)\n\ --max-rate=N[KMG] Only mine if net hashrate is less than specified value\n\ diff --git a/simd-utils/intrlv.h b/simd-utils/intrlv.h index 93a5e19b..cedcae34 100644 --- a/simd-utils/intrlv.h +++ b/simd-utils/intrlv.h @@ -1225,37 +1225,6 @@ static inline void intrlv_4x64( void *dst, const void *src0, d[31] = _mm_unpackhi_epi64( s2[7], s3[7] ); } -/* -static inline void intrlv_4x64( void *dst, void *src0, - void *src1, void *src2, void *src3, int bit_len ) -{ - uint64_t *d = (uint64_t*)dst; - uint64_t *s0 = (uint64_t*)src0; - uint64_t *s1 = (uint64_t*)src1; - uint64_t *s2 = (uint64_t*)src2; - uint64_t *s3 = (uint64_t*)src3; - d[ 0] = s0[ 0]; d[ 1] = s1[ 0]; d[ 2] = s2[ 0]; d[ 3] = s3[ 0]; - d[ 4] = s0[ 1]; d[ 5] = s1[ 1]; d[ 6] = s2[ 1]; d[ 7] = s3[ 1]; - d[ 8] = s0[ 2]; d[ 9] = s1[ 2]; d[ 10] = s2[ 2]; d[ 11] = s3[ 2]; - d[ 12] = s0[ 3]; d[ 13] = s1[ 3]; d[ 14] = s2[ 3]; d[ 15] = s3[ 3]; - if ( bit_len <= 256 ) return; - d[ 16] = s0[ 4]; d[ 17] = s1[ 4]; d[ 18] = s2[ 4]; d[ 19] = s3[ 4]; - d[ 20] = s0[ 5]; d[ 21] = s1[ 5]; d[ 22] = s2[ 5]; d[ 23] = s3[ 5]; - d[ 24] = s0[ 6]; d[ 25] = s1[ 6]; d[ 26] = s2[ 6]; d[ 27] = s3[ 6]; - d[ 28] = s0[ 7]; d[ 29] = s1[ 7]; d[ 30] = s2[ 7]; d[ 31] = s3[ 7]; - if ( bit_len <= 512 ) return; - d[ 32] = s0[ 8]; d[ 33] = s1[ 8]; d[ 34] = s2[ 8]; d[ 35] = s3[ 8]; - d[ 36] = s0[ 9]; d[ 37] = s1[ 9]; d[ 38] = s2[ 9]; d[ 39] = s3[ 9]; - if ( bit_len <= 640 ) return; - d[ 40] = s0[10]; d[ 41] = s1[10]; d[ 42] = s2[10]; d[ 43] = s3[10]; - d[ 44] = s0[11]; d[ 45] = s1[11]; d[ 46] = s2[11]; d[ 47] = s3[11]; - d[ 48] = s0[12]; d[ 49] = s1[12]; d[ 50] = s2[12]; d[ 51] = s3[12]; - d[ 52] = s0[13]; d[ 53] = s1[13]; d[ 54] = s2[13]; d[ 55] = s3[13]; - d[ 56] = s0[14]; d[ 57] = s1[14]; d[ 58] = s2[14]; d[ 59] = s3[14]; - d[ 60] = s0[15]; d[ 61] = s1[15]; d[ 62] = s2[15]; d[ 63] = s3[15]; -} -*/ - static inline void intrlv_4x64_512( void *dst, const void *src0, const void *src1, const void *src2, const void *src3 ) { @@ -1282,26 +1251,6 @@ static inline void intrlv_4x64_512( void *dst, const void *src0, d[15] = _mm_unpackhi_epi64( s2[3], s3[3] ); } -/* -static inline void intrlv_4x64_512( void *dst, const void 
*src0, - const void *src1, const void *src2, const void *src3 ) -{ - uint64_t *d = (uint64_t*)dst; - const uint64_t *s0 = (const uint64_t*)src0; - const uint64_t *s1 = (const uint64_t*)src1; - const uint64_t *s2 = (const uint64_t*)src2; - const uint64_t *s3 = (const uint64_t*)src3; - d[ 0] = s0[ 0]; d[ 1] = s1[ 0]; d[ 2] = s2[ 0]; d[ 3] = s3[ 0]; - d[ 4] = s0[ 1]; d[ 5] = s1[ 1]; d[ 6] = s2[ 1]; d[ 7] = s3[ 1]; - d[ 8] = s0[ 2]; d[ 9] = s1[ 2]; d[ 10] = s2[ 2]; d[ 11] = s3[ 2]; - d[ 12] = s0[ 3]; d[ 13] = s1[ 3]; d[ 14] = s2[ 3]; d[ 15] = s3[ 3]; - d[ 16] = s0[ 4]; d[ 17] = s1[ 4]; d[ 18] = s2[ 4]; d[ 19] = s3[ 4]; - d[ 20] = s0[ 5]; d[ 21] = s1[ 5]; d[ 22] = s2[ 5]; d[ 23] = s3[ 5]; - d[ 24] = s0[ 6]; d[ 25] = s1[ 6]; d[ 26] = s2[ 6]; d[ 27] = s3[ 6]; - d[ 28] = s0[ 7]; d[ 29] = s1[ 7]; d[ 30] = s2[ 7]; d[ 31] = s3[ 7]; -} -*/ - static inline void dintrlv_4x64( void *dst0, void *dst1, void *dst2, void *dst3, const void *src, const int bit_len ) { @@ -1347,38 +1296,6 @@ static inline void dintrlv_4x64( void *dst0, void *dst1, void *dst2, d3[7] = _mm_unpackhi_epi64( s[29], s[31] ); } - -/* -static inline void dintrlv_4x64( void *dst0, void *dst1, void *dst2, - void *dst3, const void *src, int bit_len ) -{ - uint64_t *d0 = (uint64_t*)dst0; - uint64_t *d1 = (uint64_t*)dst1; - uint64_t *d2 = (uint64_t*)dst2; - uint64_t *d3 = (uint64_t*)dst3; - const uint64_t *s = (const uint64_t*)src; - d0[ 0] = s[ 0]; d1[ 0] = s[ 1]; d2[ 0] = s[ 2]; d3[ 0] = s[ 3]; - d0[ 1] = s[ 4]; d1[ 1] = s[ 5]; d2[ 1] = s[ 6]; d3[ 1] = s[ 7]; - d0[ 2] = s[ 8]; d1[ 2] = s[ 9]; d2[ 2] = s[10]; d3[ 2] = s[11]; - d0[ 3] = s[12]; d1[ 3] = s[13]; d2[ 3] = s[14]; d3[ 3] = s[15]; - if ( bit_len <= 256 ) return; - d0[ 4] = s[16]; d1[ 4] = s[17]; d2[ 4] = s[18]; d3[ 4] = s[19]; - d0[ 5] = s[20]; d1[ 5] = s[21]; d2[ 5] = s[22]; d3[ 5] = s[23]; - d0[ 6] = s[24]; d1[ 6] = s[25]; d2[ 6] = s[26]; d3[ 6] = s[27]; - d0[ 7] = s[28]; d1[ 7] = s[29]; d2[ 7] = s[30]; d3[ 7] = s[31]; - if ( bit_len <= 512 ) return; - d0[ 8] = s[32]; d1[ 8] = s[33]; d2[ 8] = s[34]; d3[ 8] = s[35]; - d0[ 9] = s[36]; d1[ 9] = s[37]; d2[ 9] = s[38]; d3[ 9] = s[39]; - if ( bit_len <= 640 ) return; - d0[10] = s[40]; d1[10] = s[41]; d2[10] = s[42]; d3[10] = s[43]; - d0[11] = s[44]; d1[11] = s[45]; d2[11] = s[46]; d3[11] = s[47]; - d0[12] = s[48]; d1[12] = s[49]; d2[12] = s[50]; d3[12] = s[51]; - d0[13] = s[52]; d1[13] = s[53]; d2[13] = s[54]; d3[13] = s[55]; - d0[14] = s[56]; d1[14] = s[57]; d2[14] = s[58]; d3[14] = s[59]; - d0[15] = s[60]; d1[15] = s[61]; d2[15] = s[62]; d3[15] = s[63]; -} -*/ - static inline void dintrlv_4x64_512( void *dst0, void *dst1, void *dst2, void *dst3, const void *src ) { @@ -1405,26 +1322,6 @@ static inline void dintrlv_4x64_512( void *dst0, void *dst1, void *dst2, d3[3] = _mm_unpackhi_epi64( s[13], s[15] ); } -/* -static inline void dintrlv_4x64_512( void *dst0, void *dst1, void *dst2, - void *dst3, const void *src ) -{ - uint64_t *d0 = (uint64_t*)dst0; - uint64_t *d1 = (uint64_t*)dst1; - uint64_t *d2 = (uint64_t*)dst2; - uint64_t *d3 = (uint64_t*)dst3; - const uint64_t *s = (const uint64_t*)src; - d0[ 0] = s[ 0]; d1[ 0] = s[ 1]; d2[ 0] = s[ 2]; d3[ 0] = s[ 3]; - d0[ 1] = s[ 4]; d1[ 1] = s[ 5]; d2[ 1] = s[ 6]; d3[ 1] = s[ 7]; - d0[ 2] = s[ 8]; d1[ 2] = s[ 9]; d2[ 2] = s[10]; d3[ 2] = s[11]; - d0[ 3] = s[12]; d1[ 3] = s[13]; d2[ 3] = s[14]; d3[ 3] = s[15]; - d0[ 4] = s[16]; d1[ 4] = s[17]; d2[ 4] = s[18]; d3[ 4] = s[19]; - d0[ 5] = s[20]; d1[ 5] = s[21]; d2[ 5] = s[22]; d3[ 5] = s[23]; - d0[ 6] = s[24]; d1[ 6] = s[25]; d2[ 6] = s[26]; d3[ 
6] = s[27]; - d0[ 7] = s[28]; d1[ 7] = s[29]; d2[ 7] = s[30]; d3[ 7] = s[31]; -} -*/ - static inline void extr_lane_4x64( void *d, const void *s, const int lane, const int bit_len ) { @@ -1440,9 +1337,41 @@ static inline void extr_lane_4x64( void *d, const void *s, } #if defined(__AVX2__) +// Doesn't really need AVX2, just SSSE3, but is only used with AVX2 code. + +static inline void mm256_intrlv80_4x64( void *d, const void *src ) +{ + __m128i s0 = casti_m128i( src,0 ); + __m128i s1 = casti_m128i( src,1 ); + __m128i s2 = casti_m128i( src,2 ); + __m128i s3 = casti_m128i( src,3 ); + __m128i s4 = casti_m128i( src,4 ); + + casti_m128i( d, 0 ) = + casti_m128i( d, 1 ) = _mm_shuffle_epi32( s0, 0x44 ); + casti_m128i( d, 2 ) = + casti_m128i( d, 3 ) = _mm_shuffle_epi32( s0, 0xee ); + + casti_m128i( d, 4 ) = + casti_m128i( d, 5 ) = _mm_shuffle_epi32( s1, 0x44 ); + casti_m128i( d, 6 ) = + casti_m128i( d, 7 ) = _mm_shuffle_epi32( s1, 0xee ); + + casti_m128i( d, 8 ) = + casti_m128i( d, 9 ) = _mm_shuffle_epi32( s2, 0x44 ); + casti_m128i( d, 10 ) = + casti_m128i( d, 11 ) = _mm_shuffle_epi32( s2, 0xee ); + + casti_m128i( d, 12 ) = + casti_m128i( d, 13 ) = _mm_shuffle_epi32( s3, 0x44 ); + casti_m128i( d, 14 ) = + casti_m128i( d, 15 ) = _mm_shuffle_epi32( s3, 0xee ); -// There a alignment problems with the source buffer on Wwindows, -// can't use 256 bit bswap. + casti_m128i( d, 16 ) = + casti_m128i( d, 17 ) = _mm_shuffle_epi32( s4, 0x44 ); + casti_m128i( d, 18 ) = + casti_m128i( d, 19 ) = _mm_shuffle_epi32( s4, 0xee ); +} static inline void mm256_bswap32_intrlv80_4x64( void *d, const void *src ) { @@ -1636,40 +1565,6 @@ static inline void intrlv_8x64_512( void *dst, const void *src0, d[31] = _mm_unpackhi_epi64( s6[3], s7[3] ); } -/* -#define ILEAVE_8x64( i ) do \ -{ \ - uint64_t *d = (uint64_t*)(dst) + ( (i) << 3 ); \ - d[0] = *( (const uint64_t*)(s0) +(i) ); \ - d[1] = *( (const uint64_t*)(s1) +(i) ); \ - d[2] = *( (const uint64_t*)(s2) +(i) ); \ - d[3] = *( (const uint64_t*)(s3) +(i) ); \ - d[4] = *( (const uint64_t*)(s4) +(i) ); \ - d[5] = *( (const uint64_t*)(s5) +(i) ); \ - d[6] = *( (const uint64_t*)(s6) +(i) ); \ - d[7] = *( (const uint64_t*)(s7) +(i) ); \ -} while(0) - -static inline void intrlv_8x64( void *dst, const void *s0, - const void *s1, const void *s2, const void *s3, const void *s4, - const void *s5, const void *s6, const void *s7, int bit_len ) -{ - ILEAVE_8x64( 0 ); ILEAVE_8x64( 1 ); - ILEAVE_8x64( 2 ); ILEAVE_8x64( 3 ); - if ( bit_len <= 256 ) return; - ILEAVE_8x64( 4 ); ILEAVE_8x64( 5 ); - ILEAVE_8x64( 6 ); ILEAVE_8x64( 7 ); - if ( bit_len <= 512 ) return; - ILEAVE_8x64( 8 ); ILEAVE_8x64( 9 ); - if ( bit_len <= 640 ) return; - ILEAVE_8x64( 10 ); ILEAVE_8x64( 11 ); - ILEAVE_8x64( 12 ); ILEAVE_8x64( 13 ); - ILEAVE_8x64( 14 ); ILEAVE_8x64( 15 ); -} - -#undef ILEAVE_8x64 -*/ - static inline void dintrlv_8x64( void *dst0, void *dst1, void *dst2, void *dst3, void *dst4, void *dst5, void *dst6, void *dst7, @@ -1815,39 +1710,6 @@ static inline void dintrlv_8x64_512( void *dst0, void *dst1, void *dst2, d7[3] = _mm_unpackhi_epi64( s[27], s[31] ); } -/* -#define DLEAVE_8x64( i ) do \ -{ \ - const uint64_t *s = (const uint64_t*)(src) + ( (i) << 3 ); \ - *( (uint64_t*)(d0) +(i) ) = s[0]; \ - *( (uint64_t*)(d1) +(i) ) = s[1]; \ - *( (uint64_t*)(d2) +(i) ) = s[2]; \ - *( (uint64_t*)(d3) +(i) ) = s[3]; \ - *( (uint64_t*)(d4) +(i) ) = s[4]; \ - *( (uint64_t*)(d5) +(i) ) = s[5]; \ - *( (uint64_t*)(d6) +(i) ) = s[6]; \ - *( (uint64_t*)(d7) +(i) ) = s[7]; \ -} while(0) - -static inline void dintrlv_8x64( 
void *d0, void *d1, void *d2, void *d3, - void *d4, void *d5, void *d6, void *d7, const void *src, int bit_len ) -{ - DLEAVE_8x64( 0 ); DLEAVE_8x64( 1 ); - DLEAVE_8x64( 2 ); DLEAVE_8x64( 3 ); - if ( bit_len <= 256 ) return; - DLEAVE_8x64( 4 ); DLEAVE_8x64( 5 ); - DLEAVE_8x64( 6 ); DLEAVE_8x64( 7 ); - if ( bit_len <= 512 ) return; - DLEAVE_8x64( 8 ); DLEAVE_8x64( 9 ); - if ( bit_len <= 640 ) return; - DLEAVE_8x64( 10 ); DLEAVE_8x64( 11 ); - DLEAVE_8x64( 12 ); DLEAVE_8x64( 13 ); - DLEAVE_8x64( 14 ); DLEAVE_8x64( 15 ); -} - -#undef DLEAVE_8x64 -*/ - static inline void extr_lane_8x64( void *d, const void *s, const int lane, const int bit_len ) { diff --git a/simd-utils/simd-128.h b/simd-utils/simd-128.h index 35be6109..e166b14d 100644 --- a/simd-utils/simd-128.h +++ b/simd-utils/simd-128.h @@ -178,7 +178,7 @@ static inline __m128i mm128_mask_32( const __m128i v, const int m ) // Basic operations without equivalent SIMD intrinsic // Bitwise not (~v) -#define mm128_not( v ) _mm_xor_si128( (v), m128_neg1 ) +#define mm128_not( v ) _mm_xor_si128( v, m128_neg1 ) // Unary negation of elements (-v) #define mm128_negate_64( v ) _mm_sub_epi64( m128_zero, v ) @@ -263,7 +263,8 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n ) _mm_or_si128( _mm_slli_epi32( v, c ), _mm_srli_epi32( v, 32-(c) ) ) -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(__AVX512VL__) +//#if defined(__AVX512F__) && defined(__AVX512VL__) #define mm128_ror_64 _mm_ror_epi64 #define mm128_rol_64 _mm_rol_epi64 @@ -291,16 +292,13 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n ) #define mm128_swap_64( v ) _mm_shuffle_epi32( v, 0x4e ) #define mm128_ror_1x32( v ) _mm_shuffle_epi32( v, 0x39 ) #define mm128_rol_1x32( v ) _mm_shuffle_epi32( v, 0x93 ) -//#define mm128_swap_64( v ) _mm_alignr_epi8( v, v, 8 ) -//#define mm128_ror_1x32( v ) _mm_alignr_epi8( v, v, 4 ) -//#define mm128_rol_1x32( v ) _mm_alignr_epi8( v, v, 12 ) // Swap 32 bit elements in 64 bit lanes #define mm128_swap64_32( v ) _mm_shuffle_epi32( v, 0xb1 ) #if defined(__SSSE3__) -// Rotate right by c bytes +// Rotate right by c bytes, no SSE2 equivalent. static inline __m128i mm128_ror_x8( const __m128i v, const int c ) { return _mm_alignr_epi8( v, v, c ); } diff --git a/simd-utils/simd-256.h b/simd-utils/simd-256.h index 635eb4f2..7a37012a 100644 --- a/simd-utils/simd-256.h +++ b/simd-utils/simd-256.h @@ -18,7 +18,7 @@ #define mm256_mov64_256( i ) _mm256_castsi128_si256( mm128_mov64_128( i ) ) #define mm256_mov32_256( i ) _mm256_castsi128_si256( mm128_mov32_128( i ) ) -// Mo0ve low element of vector to integer. +// Move low element of vector to integer. #define mm256_mov256_64( v ) mm128_mov128_64( _mm256_castsi256_si128( v ) ) #define mm256_mov256_32( v ) mm128_mov128_32( _mm256_castsi256_si128( v ) ) @@ -42,7 +42,7 @@ static inline __m256i m256_const_64( const uint64_t i3, const uint64_t i2, // 128 bit vector argument #define m256_const1_128( v ) \ _mm256_permute4x64_epi64( _mm256_castsi128_si256( v ), 0x44 ) -// 64 bit integer argument +// 64 bit integer argument zero extended to 128 bits. 
#define m256_const1_i128( i ) m256_const1_128( mm128_mov64_128( i ) ) #define m256_const1_64( i ) _mm256_broadcastq_epi64( mm128_mov64_128( i ) ) #define m256_const1_32( i ) _mm256_broadcastd_epi32( mm128_mov32_128( i ) ) @@ -168,7 +168,10 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n ) _mm256_srli_epi32( v, 32-(c) ) ) -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +// The spec says both F & VL are required, but just in case AMD +// decides to implement ROL/R without AVX512F. +#if defined(__AVX512VL__) +//#if defined(__AVX512F__) && defined(__AVX512VL__) // AVX512, control must be 8 bit immediate. @@ -198,21 +201,14 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n ) // // Rotate elements accross all lanes. // -// AVX2 has no full vector permute for elements less than 32 bits. -// AVX512 has finer granularity full vector permutes. -// AVX512 has full vector alignr which might be faster, especially for 32 bit - - -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) - -static inline __m256i mm256_swap_128( const __m256i v ) -{ return _mm256_alignr_epi64( v, v, 2 ); } +// Swap 128 bit elements in 256 bit vector. +#define mm256_swap_128( v ) _mm256_permute4x64_epi64( v, 0x4e ) -static inline __m256i mm256_ror_1x64( const __m256i v ) -{ return _mm256_alignr_epi64( v, v, 1 ); } +// Rotate 256 bit vector by one 64 bit element +#define mm256_ror_1x64( v ) _mm256_permute4x64_epi64( v, 0x39 ) +#define mm256_rol_1x64( v ) _mm256_permute4x64_epi64( v, 0x93 ) -static inline __m256i mm256_rol_1x64( const __m256i v ) -{ return _mm256_alignr_epi64( v, v, 3 ); } +#if defined(__AVX512F__) && defined(__AVX512VL__) static inline __m256i mm256_ror_1x32( const __m256i v ) { return _mm256_alignr_epi32( v, v, 1 ); } @@ -220,21 +216,8 @@ static inline __m256i mm256_ror_1x32( const __m256i v ) static inline __m256i mm256_rol_1x32( const __m256i v ) { return _mm256_alignr_epi32( v, v, 7 ); } -static inline __m256i mm256_ror_3x32( const __m256i v ) -{ return _mm256_alignr_epi32( v, v, 3 ); } - -static inline __m256i mm256_rol_3x32( const __m256i v ) -{ return _mm256_alignr_epi32( v, v, 5 ); } - #else // AVX2 -// Swap 128 bit elements in 256 bit vector. -#define mm256_swap_128( v ) _mm256_permute4x64_epi64( v, 0x4e ) - -// Rotate 256 bit vector by one 64 bit element -#define mm256_ror_1x64( v ) _mm256_permute4x64_epi64( v, 0x39 ) -#define mm256_rol_1x64( v ) _mm256_permute4x64_epi64( v, 0x93 ) - // Rotate 256 bit vector by one 32 bit element. #define mm256_ror_1x32( v ) \ _mm256_permutevar8x32_epi32( v, \ @@ -246,17 +229,6 @@ static inline __m256i mm256_rol_3x32( const __m256i v ) m256_const_64( 0x0000000600000005, 0x0000000400000003, \ 0x0000000200000001, 0x0000000000000007 ) -// Rotate 256 bit vector by three 32 bit elements (96 bits). 
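The element rotations above are pure permutations: _mm256_permute4x64_epi64 selects each destination 64-bit element with a 2-bit field of the immediate, so the constants used here decode as shown below (result elements listed low to high; assumes AVX2 and <immintrin.h>, helper name illustrative):

#include <immintrin.h>

/* _mm256_permute4x64_epi64( v, imm ): result element i = v[ (imm >> 2*i) & 3 ].
     0x4e -> { v[2], v[3], v[0], v[1] }   swap 128-bit halves
     0x39 -> { v[1], v[2], v[3], v[0] }   rotate right by one 64-bit element
     0x93 -> { v[3], v[0], v[1], v[2] }   rotate left  by one 64-bit element */
static inline __m256i ror_1x64_sketch( __m256i v )
{
    return _mm256_permute4x64_epi64( v, 0x39 );
}
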
-#define mm256_ror_3x32( v ) \ - _mm256_permutevar8x32_epi32( v, \ - m256_const_64( 0x0000000200000001, 0x0000000000000007, \ - 0x0000000600000005, 0x0000000400000003 ) - -#define mm256_rol_3x32( v ) \ - _mm256_permutevar8x32_epi32( v, \ - m256_const_64( 0x0000000400000003, 0x0000000200000001, \ - 0x0000000000000007, 0x0000000600000005 ) - #endif // AVX512 else AVX2 // diff --git a/util.c b/util.c index 6a7a0503..a3b764e2 100644 --- a/util.c +++ b/util.c @@ -943,6 +943,140 @@ bool jobj_binary(const json_t *obj, const char *key, void *buf, size_t buflen) return true; } +static uint32_t bech32_polymod_step(uint32_t pre) { + uint8_t b = pre >> 25; + return ((pre & 0x1FFFFFF) << 5) ^ + (-((b >> 0) & 1) & 0x3b6a57b2UL) ^ + (-((b >> 1) & 1) & 0x26508e6dUL) ^ + (-((b >> 2) & 1) & 0x1ea119faUL) ^ + (-((b >> 3) & 1) & 0x3d4233ddUL) ^ + (-((b >> 4) & 1) & 0x2a1462b3UL); +} + +static const int8_t bech32_charset_rev[128] = { + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + 15, -1, 10, 17, 21, 20, 26, 30, 7, 5, -1, -1, -1, -1, -1, -1, + -1, 29, -1, 24, 13, 25, 9, 8, 23, -1, 18, 22, 31, 27, 19, -1, + 1, 0, 3, 16, 11, 28, 12, 14, 6, 4, 2, -1, -1, -1, -1, -1, + -1, 29, -1, 24, 13, 25, 9, 8, 23, -1, 18, 22, 31, 27, 19, -1, + 1, 0, 3, 16, 11, 28, 12, 14, 6, 4, 2, -1, -1, -1, -1, -1 +}; + +static bool bech32_decode(char *hrp, uint8_t *data, size_t *data_len, const char *input) { + uint32_t chk = 1; + size_t i; + size_t input_len = strlen(input); + size_t hrp_len; + int have_lower = 0, have_upper = 0; + if (input_len < 8 || input_len > 90) { + return false; + } + *data_len = 0; + while (*data_len < input_len && input[(input_len - 1) - *data_len] != '1') { + ++(*data_len); + } + hrp_len = input_len - (1 + *data_len); + if (1 + *data_len >= input_len || *data_len < 6) { + return false; + } + *(data_len) -= 6; + for (i = 0; i < hrp_len; ++i) { + int ch = input[i]; + if (ch < 33 || ch > 126) { + return false; + } + if (ch >= 'a' && ch <= 'z') { + have_lower = 1; + } else if (ch >= 'A' && ch <= 'Z') { + have_upper = 1; + ch = (ch - 'A') + 'a'; + } + hrp[i] = ch; + chk = bech32_polymod_step(chk) ^ (ch >> 5); + } + hrp[i] = 0; + chk = bech32_polymod_step(chk); + for (i = 0; i < hrp_len; ++i) { + chk = bech32_polymod_step(chk) ^ (input[i] & 0x1f); + } + ++i; + while (i < input_len) { + int v = (input[i] & 0x80) ? 
-1 : bech32_charset_rev[(int)input[i]]; + if (input[i] >= 'a' && input[i] <= 'z') have_lower = 1; + if (input[i] >= 'A' && input[i] <= 'Z') have_upper = 1; + if (v == -1) { + return false; + } + chk = bech32_polymod_step(chk) ^ v; + if (i + 6 < input_len) { + data[i - (1 + hrp_len)] = v; + } + ++i; + } + if (have_lower && have_upper) { + return false; + } + return chk == 1; +} + +static bool convert_bits(uint8_t *out, size_t *outlen, int outbits, const uint8_t *in, size_t inlen, int inbits, int pad) { + uint32_t val = 0; + int bits = 0; + uint32_t maxv = (((uint32_t)1) << outbits) - 1; + while (inlen--) { + val = (val << inbits) | *(in++); + bits += inbits; + while (bits >= outbits) { + bits -= outbits; + out[(*outlen)++] = (val >> bits) & maxv; + } + } + if (pad) { + if (bits) { + out[(*outlen)++] = (val << (outbits - bits)) & maxv; + } + } else if (((val << (outbits - bits)) & maxv) || bits >= inbits) { + return false; + } + return true; +} + +static bool segwit_addr_decode(int *witver, uint8_t *witdata, size_t *witdata_len, const char *addr) { + uint8_t data[84]; + char hrp_actual[84]; + size_t data_len; + if (!bech32_decode(hrp_actual, data, &data_len, addr)) return false; + if (data_len == 0 || data_len > 65) return false; + if (data[0] > 16) return false; + *witdata_len = 0; + if (!convert_bits(witdata, witdata_len, 8, data + 1, data_len - 1, 5, 0)) return false; + if (*witdata_len < 2 || *witdata_len > 40) return false; + if (data[0] == 0 && *witdata_len != 20 && *witdata_len != 32) return false; + *witver = data[0]; + return true; +} + +static size_t bech32_to_script(uint8_t *out, size_t outsz, const char *addr) { + uint8_t witprog[40]; + size_t witprog_len; + int witver; + + if (!segwit_addr_decode(&witver, witprog, &witprog_len, addr)) + return 0; + if (outsz < witprog_len + 2) + return 0; + out[0] = witver ? (0x50 + witver) : 0; + out[1] = witprog_len; + memcpy(out + 2, witprog, witprog_len); + + if ( opt_debug ) + applog( LOG_INFO, "Coinbase address uses Bech32 coding"); + + return witprog_len + 2; +} + size_t address_to_script( unsigned char *out, size_t outsz, const char *addr ) { unsigned char addrbin[ pk_buffer_size_max ]; @@ -950,12 +1084,15 @@ size_t address_to_script( unsigned char *out, size_t outsz, const char *addr ) size_t rv; if ( !b58dec( addrbin, outsz, addr ) ) - return 0; + return bech32_to_script( out, outsz, addr ); addrver = b58check( addrbin, outsz, addr ); if ( addrver < 0 ) return 0; + if ( opt_debug ) + applog( LOG_INFO, "Coinbase address uses B58 coding"); + switch ( addrver ) { case 5: /* Bitcoin script hash */ @@ -1486,9 +1623,6 @@ static bool stratum_parse_extranonce(struct stratum_ctx *sctx, json_t *params, i if ( !opt_quiet ) /* pool dynamic change */ applog( LOG_INFO, "Stratum extranonce1= %s, extranonce2 size= %d", xnonce1, xn2_size); -// if (pndx == 0 && opt_debug) -// applog(LOG_DEBUG, "Stratum set nonce %s with extranonce2 size=%d", -// xnonce1, xn2_size); return true; out: @@ -1638,8 +1772,6 @@ bool stratum_authorize(struct stratum_ctx *sctx, const char *user, const char *p opt_extranonce = false; goto out; } - if ( !opt_quiet ) - applog( LOG_INFO, "Extranonce subscription enabled" ); sret = stratum_recv_line( sctx ); if ( sret ) @@ -1658,8 +1790,8 @@ bool stratum_authorize(struct stratum_ctx *sctx, const char *user, const char *p applog( LOG_WARNING, "Stratum answer id is not correct!" 
); } res_val = json_object_get( extra, "result" ); -// if (opt_debug && (!res_val || json_is_false(res_val))) -// applog(LOG_DEBUG, "extranonce subscribe not supported"); + if (opt_debug && (!res_val || json_is_false(res_val))) + applog(LOG_DEBUG, "Method extranonce.subscribe is not supported"); json_decref( extra ); } free(sret);
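
For context on the new GBT Bech32 support in util.c: bech32_to_script builds the standard segwit scriptPubKey, an OP_n witness-version opcode followed by a direct push of the witness program, and address_to_script now falls back to it when Base58 decoding fails. The sketch below restates that layout as a self-contained helper; the name is illustrative, and the real function is static in util.c and decodes the address first:

#include <string.h>
#include <stdint.h>
#include <stddef.h>

/* Segwit output script: an OP_n witness version (OP_0 = 0x00,
   OP_1..OP_16 = 0x51..0x60) followed by a direct push of the witness
   program.  P2WPKH (version 0, 20 bytes) yields 0x00 0x14 <20-byte hash>;
   P2WSH (version 0, 32 bytes) yields 0x00 0x20 <32-byte hash>. */
static size_t segwit_script_sketch( uint8_t *out, size_t outsz, int witver,
                                    const uint8_t *witprog, size_t witprog_len )
{
    if ( outsz < witprog_len + 2 ) return 0;
    out[0] = witver ? ( 0x50 + witver ) : 0;   // witness version opcode
    out[1] = (uint8_t) witprog_len;            // push length
    memcpy( out + 2, witprog, witprog_len );   // witness program
    return witprog_len + 2;
}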