From a053690170cf3c3b0232cb27ee0cf3463330055a Mon Sep 17 00:00:00 2001 From: Jay D Dee Date: Wed, 23 Jun 2021 21:52:42 -0400 Subject: [PATCH] v3.16.4 --- RELEASE_NOTES | 5 +++++ algo/sha/sha256-hash-4way.c | 38 +++++++++++++++++++++++++++++++++-- algo/sha/sha512-hash-4way.c | 40 +++++++++++++++++++++++++++++-------- algo/sha/sph_sha2.c | 4 ++-- algo/sha/sph_sha2big.c | 3 ++- configure | 20 +++++++++---------- configure.ac | 2 +- cpu-miner.c | 7 +++++-- 8 files changed, 93 insertions(+), 26 deletions(-) diff --git a/RELEASE_NOTES b/RELEASE_NOTES index e1bd547c..bf9aec58 100644 --- a/RELEASE_NOTES +++ b/RELEASE_NOTES @@ -65,6 +65,11 @@ If not what makes it happen or not happen? Change Log ---------- +v3.16.4 + +Faster sha512 and sha256 when not using SHA CPU extension. +#329: Fixed GBT incorrect target diff in stats. + v3.16.3 #313 Fix compile error with GCC 11. diff --git a/algo/sha/sha256-hash-4way.c b/algo/sha/sha256-hash-4way.c index d9fb503c..a1f657e1 100644 --- a/algo/sha/sha256-hash-4way.c +++ b/algo/sha/sha256-hash-4way.c @@ -74,9 +74,15 @@ static const uint32_t K256[64] = #define CHs(X, Y, Z) \ _mm_xor_si128( _mm_and_si128( _mm_xor_si128( Y, Z ), X ), Z ) +/* #define MAJs(X, Y, Z) \ _mm_or_si128( _mm_and_si128( X, Y ), \ _mm_and_si128( _mm_or_si128( X, Y ), Z ) ) +*/ + +#define MAJs(X, Y, Z) \ + _mm_xor_si128( Y, _mm_and_si128( _mm_xor_si128( X, Y ), \ + _mm_xor_si128( Y, Z ) ) ) #define BSG2_0(x) \ _mm_xor_si128( _mm_xor_si128( \ @@ -345,9 +351,20 @@ void sha256_4way_full( void *dst, const void *data, size_t len ) #define CHx(X, Y, Z) \ _mm256_xor_si256( _mm256_and_si256( _mm256_xor_si256( Y, Z ), X ), Z ) +/* #define MAJx(X, Y, Z) \ _mm256_or_si256( _mm256_and_si256( X, Y ), \ _mm256_and_si256( _mm256_or_si256( X, Y ), Z ) ) +*/ +/* +#define MAJx(X, Y, Z) \ + _mm256_xor_si256( Y, _mm256_and_si256( _mm256_xor_si256( X, Y ), \ + _mm256_xor_si256( Y, Z ) ) ) +*/ + +#define MAJx(X, Y, Z) \ + _mm256_xor_si256( Y, _mm256_and_si256( X_xor_Y = _mm256_xor_si256( X, Y ), \ + Y_xor_Z ) ) #define BSG2_0x(x) \ _mm256_xor_si256( _mm256_xor_si256( \ @@ -375,6 +392,7 @@ do { \ T1 = _mm256_add_epi32( H, mm256_add4_32( BSG2_1x(E), CHx(E, F, G), \ K, W[i] ) ); \ T2 = _mm256_add_epi32( BSG2_0x(A), MAJx(A, B, C) ); \ + Y_xor_Z = X_xor_Y; \ D = _mm256_add_epi32( D, T1 ); \ H = _mm256_add_epi32( T1, T2 ); \ } while (0) @@ -382,7 +400,7 @@ do { \ static void sha256_8way_round( sha256_8way_context *ctx, __m256i *in, __m256i r[8] ) { - register __m256i A, B, C, D, E, F, G, H; + register __m256i A, B, C, D, E, F, G, H, X_xor_Y, Y_xor_Z; __m256i W[16]; mm256_block_bswap_32( W , in ); @@ -411,6 +429,8 @@ sha256_8way_round( sha256_8way_context *ctx, __m256i *in, __m256i r[8] ) H = m256_const1_64( 0x5BE0CD195BE0CD19 ); } + Y_xor_Z = _mm256_xor_si256( B, C ); + SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H, 0, 0 ); SHA2s_8WAY_STEP( H, A, B, C, D, E, F, G, 1, 0 ); SHA2s_8WAY_STEP( G, H, A, B, C, D, E, F, 2, 0 ); @@ -591,9 +611,20 @@ void sha256_8way_full( void *dst, const void *data, size_t len ) #define CHx16(X, Y, Z) \ _mm512_xor_si512( _mm512_and_si512( _mm512_xor_si512( Y, Z ), X ), Z ) +/* #define MAJx16(X, Y, Z) \ _mm512_or_si512( _mm512_and_si512( X, Y ), \ _mm512_and_si512( _mm512_or_si512( X, Y ), Z ) ) +*/ +/* +#define MAJx16(X, Y, Z) \ + _mm512_xor_si512( Y, _mm512_and_si512( _mm512_xor_si512( X, Y ), \ + _mm512_xor_si512( Y, Z ) ) ) +*/ + +#define MAJx16(X, Y, Z) \ + _mm512_xor_si512( Y, _mm512_and_si512( X_xor_Y = _mm512_xor_si512( X, Y ), \ + Y_xor_Z ) ) #define BSG2_0x16(x) \ _mm512_xor_si512( _mm512_xor_si512( \ @@ -621,6 +652,7 @@ do { \ T1 = _mm512_add_epi32( H, mm512_add4_32( BSG2_1x16(E), CHx16(E, F, G), \ K, W[i] ) ); \ T2 = _mm512_add_epi32( BSG2_0x16(A), MAJx16(A, B, C) ); \ + Y_xor_Z = X_xor_Y; \ D = _mm512_add_epi32( D, T1 ); \ H = _mm512_add_epi32( T1, T2 ); \ } while (0) @@ -628,7 +660,7 @@ do { \ static void sha256_16way_round( sha256_16way_context *ctx, __m512i *in, __m512i r[8] ) { - register __m512i A, B, C, D, E, F, G, H; + register __m512i A, B, C, D, E, F, G, H, X_xor_Y, Y_xor_Z; __m512i W[16]; mm512_block_bswap_32( W , in ); @@ -657,6 +689,8 @@ sha256_16way_round( sha256_16way_context *ctx, __m512i *in, __m512i r[8] ) H = m512_const1_64( 0x5BE0CD195BE0CD19 ); } + Y_xor_Z = _mm512_xor_si512( B, C ); + SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H, 0, 0 ); SHA2s_16WAY_STEP( H, A, B, C, D, E, F, G, 1, 0 ); SHA2s_16WAY_STEP( G, H, A, B, C, D, E, F, 2, 0 ); diff --git a/algo/sha/sha512-hash-4way.c b/algo/sha/sha512-hash-4way.c index 9f5349b0..803c42f5 100644 --- a/algo/sha/sha512-hash-4way.c +++ b/algo/sha/sha512-hash-4way.c @@ -98,9 +98,21 @@ static const uint64_t K512[80] = #define CH8W(X, Y, Z) \ _mm512_xor_si512( _mm512_and_si512( _mm512_xor_si512( Y, Z ), X ), Z ) +/* #define MAJ8W(X, Y, Z) \ _mm512_or_si512( _mm512_and_si512( X, Y ), \ _mm512_and_si512( _mm512_or_si512( X, Y ), Z ) ) +*/ +/* Functionally identical to original but optimizable, + * subexpression X^Y from one step can be reused in the next step as Y^Z +#define MAJ8W(X, Y, Z) \ + _mm512_xor_si512( Y, _mm512_and_si512( _mm512_xor_si512( X, Y ), \ + _mm512_xor_si512( Y, Z ) ) ) +*/ + +#define MAJ8W(X, Y, Z) \ + _mm512_xor_si512( Y, _mm512_and_si512( X_xor_Y = _mm512_xor_si512( X, Y ), \ + Y_xor_Z ) ) #define BSG8W_5_0(x) \ _mm512_xor_si512( _mm512_xor_si512( \ @@ -172,6 +184,7 @@ do { \ T1 = _mm512_add_epi64( H, mm512_add4_64( BSG8W_5_1(E), CH8W(E, F, G), \ K, W[i] ) ); \ T2 = _mm512_add_epi64( BSG8W_5_0(A), MAJ8W(A, B, C) ); \ + Y_xor_Z = X_xor_Y; \ D = _mm512_add_epi64( D, T1 ); \ H = _mm512_add_epi64( T1, T2 ); \ } while (0) @@ -180,7 +193,7 @@ static void sha512_8way_round( sha512_8way_context *ctx, __m512i *in, __m512i r[8] ) { int i; - register __m512i A, B, C, D, E, F, G, H; + register __m512i A, B, C, D, E, F, G, H, X_xor_Y, Y_xor_Z; __m512i W[80]; mm512_block_bswap_64( W , in ); @@ -213,6 +226,8 @@ sha512_8way_round( sha512_8way_context *ctx, __m512i *in, __m512i r[8] ) H = m512_const1_64( 0x5BE0CD19137E2179 ); } + Y_xor_Z = _mm512_xor_si512( B, C ); + for ( i = 0; i < 80; i += 8 ) { SHA3_8WAY_STEP( A, B, C, D, E, F, G, H, i + 0 ); @@ -319,14 +334,20 @@ void sha512_8way_close( sha512_8way_context *sc, void *dst ) // SHA-512 4 way 64 bit -/* + #define CH(X, Y, Z) \ _mm256_xor_si256( _mm256_and_si256( _mm256_xor_si256( Y, Z ), X ), Z ) +/* #define MAJ(X, Y, Z) \ _mm256_or_si256( _mm256_and_si256( X, Y ), \ _mm256_and_si256( _mm256_or_si256( X, Y ), Z ) ) +*/ +#define MAJ(X, Y, Z) \ + _mm256_xor_si256( Y, _mm256_and_si256( X_xor_Y = _mm256_xor_si256( X, Y ), \ + Y_xor_Z ) ) + #define BSG5_0(x) \ mm256_ror_64( _mm256_xor_si256( mm256_ror_64( \ _mm256_xor_si256( mm256_ror_64( x, 5 ), x ), 6 ), x ), 28 ) @@ -334,7 +355,7 @@ void sha512_8way_close( sha512_8way_context *sc, void *dst ) #define BSG5_1(x) \ mm256_ror_64( _mm256_xor_si256( mm256_ror_64( \ _mm256_xor_si256( mm256_ror_64( x, 23 ), x ), 4 ), x ), 14 ) -*/ + /* #define BSG5_0(x) \ _mm256_xor_si256( _mm256_xor_si256( \ @@ -402,7 +423,7 @@ static inline __m256i ssg512_add( __m256i w0, __m256i w1 ) w1 = _mm256_xor_si256( X1a, X1b ); \ } while(0) */ - +/* #define SHA3_4WAY_STEP(A, B, C, D, E, F, G, H, i) \ do { \ __m256i K = _mm256_set1_epi64x( K512[ i ] ); \ @@ -431,7 +452,7 @@ do { \ H = _mm256_add_epi64( T1, T2 ); \ D = _mm256_add_epi64( D, T1 ); \ } while (0) - +*/ /* #define SHA3_4WAY_STEP(A, B, C, D, E, F, G, H, i) \ do { \ @@ -445,7 +466,7 @@ do { \ } while (0) */ -/* + #define SHA3_4WAY_STEP(A, B, C, D, E, F, G, H, i) \ do { \ __m256i T1, T2; \ @@ -453,16 +474,17 @@ do { \ T1 = _mm256_add_epi64( H, mm256_add4_64( BSG5_1(E), CH(E, F, G), \ K, W[i] ) ); \ T2 = _mm256_add_epi64( BSG5_0(A), MAJ(A, B, C) ); \ + Y_xor_Z = X_xor_Y; \ D = _mm256_add_epi64( D, T1 ); \ H = _mm256_add_epi64( T1, T2 ); \ } while (0) -*/ + static void sha512_4way_round( sha512_4way_context *ctx, __m256i *in, __m256i r[8] ) { int i; - register __m256i A, B, C, D, E, F, G, H; + register __m256i A, B, C, D, E, F, G, H, X_xor_Y, Y_xor_Z; __m256i W[80]; mm256_block_bswap_64( W , in ); @@ -495,6 +517,8 @@ sha512_4way_round( sha512_4way_context *ctx, __m256i *in, __m256i r[8] ) H = m256_const1_64( 0x5BE0CD19137E2179 ); } + Y_xor_Z = _mm256_xor_si256( B, C ); + for ( i = 0; i < 80; i += 8 ) { SHA3_4WAY_STEP( A, B, C, D, E, F, G, H, i + 0 ); diff --git a/algo/sha/sph_sha2.c b/algo/sha/sph_sha2.c index e96a2d1c..b67b0143 100644 --- a/algo/sha/sph_sha2.c +++ b/algo/sha/sph_sha2.c @@ -40,8 +40,8 @@ #endif #define CH(X, Y, Z) ((((Y) ^ (Z)) & (X)) ^ (Z)) -#define MAJ(X, Y, Z) (((Y) & (Z)) | (((Y) | (Z)) & (X))) - +//#define MAJ(X, Y, Z) (((Y) & (Z)) | (((Y) | (Z)) & (X))) +#define MAJ( X, Y, Z ) ( Y ^ ( ( X ^ Y ) & ( Y ^ Z ) ) ) #define ROTR SPH_ROTR32 #define BSG2_0(x) (ROTR(x, 2) ^ ROTR(x, 13) ^ ROTR(x, 22)) diff --git a/algo/sha/sph_sha2big.c b/algo/sha/sph_sha2big.c index 8ea292f6..06d2d16e 100644 --- a/algo/sha/sph_sha2big.c +++ b/algo/sha/sph_sha2big.c @@ -38,7 +38,8 @@ #if SPH_64 #define CH(X, Y, Z) ((((Y) ^ (Z)) & (X)) ^ (Z)) -#define MAJ(X, Y, Z) (((X) & (Y)) | (((X) | (Y)) & (Z))) +//#define MAJ(X, Y, Z) (((X) & (Y)) | (((X) | (Y)) & (Z))) +#define MAJ( X, Y, Z ) ( Y ^ ( ( X ^ Y ) & ( Y ^ Z ) ) ) #define ROTR64 SPH_ROTR64 diff --git a/configure b/configure index 1d15c406..00e7ac37 100755 --- a/configure +++ b/configure @@ -1,6 +1,6 @@ #! /bin/sh # Guess values for system-dependent variables and create Makefiles. -# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.16.3. +# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.16.4. # # # Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc. @@ -577,8 +577,8 @@ MAKEFLAGS= # Identity of this package. PACKAGE_NAME='cpuminer-opt' PACKAGE_TARNAME='cpuminer-opt' -PACKAGE_VERSION='3.16.3' -PACKAGE_STRING='cpuminer-opt 3.16.3' +PACKAGE_VERSION='3.16.4' +PACKAGE_STRING='cpuminer-opt 3.16.4' PACKAGE_BUGREPORT='' PACKAGE_URL='' @@ -1332,7 +1332,7 @@ if test "$ac_init_help" = "long"; then # Omit some internal or obsolete options to make the list less imposing. # This message is too long to be a string in the A/UX 3.1 sh. cat <<_ACEOF -\`configure' configures cpuminer-opt 3.16.3 to adapt to many kinds of systems. +\`configure' configures cpuminer-opt 3.16.4 to adapt to many kinds of systems. Usage: $0 [OPTION]... [VAR=VALUE]... @@ -1404,7 +1404,7 @@ fi if test -n "$ac_init_help"; then case $ac_init_help in - short | recursive ) echo "Configuration of cpuminer-opt 3.16.3:";; + short | recursive ) echo "Configuration of cpuminer-opt 3.16.4:";; esac cat <<\_ACEOF @@ -1509,7 +1509,7 @@ fi test -n "$ac_init_help" && exit $ac_status if $ac_init_version; then cat <<\_ACEOF -cpuminer-opt configure 3.16.3 +cpuminer-opt configure 3.16.4 generated by GNU Autoconf 2.69 Copyright (C) 2012 Free Software Foundation, Inc. @@ -2012,7 +2012,7 @@ cat >config.log <<_ACEOF This file contains any messages produced by compilers while running configure, to aid debugging if configure makes a mistake. -It was created by cpuminer-opt $as_me 3.16.3, which was +It was created by cpuminer-opt $as_me 3.16.4, which was generated by GNU Autoconf 2.69. Invocation command line was $ $0 $@ @@ -2993,7 +2993,7 @@ fi # Define the identity of the package. PACKAGE='cpuminer-opt' - VERSION='3.16.3' + VERSION='3.16.4' cat >>confdefs.h <<_ACEOF @@ -6690,7 +6690,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 # report actual input values of CONFIG_FILES etc. instead of their # values after options handling. ac_log=" -This file was extended by cpuminer-opt $as_me 3.16.3, which was +This file was extended by cpuminer-opt $as_me 3.16.4, which was generated by GNU Autoconf 2.69. Invocation command line was CONFIG_FILES = $CONFIG_FILES @@ -6756,7 +6756,7 @@ _ACEOF cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`" ac_cs_version="\\ -cpuminer-opt config.status 3.16.3 +cpuminer-opt config.status 3.16.4 configured by $0, generated by GNU Autoconf 2.69, with options \\"\$ac_cs_config\\" diff --git a/configure.ac b/configure.ac index 82a90496..6a4059da 100644 --- a/configure.ac +++ b/configure.ac @@ -1,4 +1,4 @@ -AC_INIT([cpuminer-opt], [3.16.3]) +AC_INIT([cpuminer-opt], [3.16.4]) AC_PREREQ([2.59c]) AC_CANONICAL_SYSTEM diff --git a/cpu-miner.c b/cpu-miner.c index 6b62a3c6..26e48b96 100644 --- a/cpu-miner.c +++ b/cpu-miner.c @@ -447,8 +447,10 @@ static bool work_decode( const json_t *val, struct work *work ) if ( !allow_mininginfo ) net_diff = algo_gate.calc_network_diff( work ); + else + net_diff = hash_to_diff( work->target ); - work->targetdiff = hash_to_diff( work->target ); + work->targetdiff = net_diff; stratum_diff = last_targetdiff = work->targetdiff; work->sharediff = 0; algo_gate.decode_extra_data( work, &net_blocks ); @@ -908,7 +910,8 @@ static bool gbt_work_decode( const json_t *val, struct work *work ) } for ( i = 0; i < ARRAY_SIZE( work->target ); i++ ) work->target[7 - i] = be32dec( target + i ); - + net_diff = work->targetdiff = hash_to_diff( work->target ); + tmp = json_object_get( val, "workid" ); if ( tmp ) {