diff --git a/RELEASE_NOTES b/RELEASE_NOTES index 4c6e60f7..1c4d4066 100644 --- a/RELEASE_NOTES +++ b/RELEASE_NOTES @@ -65,6 +65,12 @@ If not what makes it happen or not happen? Change Log ---------- +v3.15.1 + +Fix compile on AMD Zen3 CPUs with VAES. +Force new work immediately after solving a block solo. + + v3.15.0 Fugue optimized with AES, improves many sha3 algos. diff --git a/algo/fugue/fugue-aesni.c b/algo/fugue/fugue-aesni.c index dde1b21c..2dd253a7 100644 --- a/algo/fugue/fugue-aesni.c +++ b/algo/fugue/fugue-aesni.c @@ -34,13 +34,11 @@ MYALIGN const unsigned long long _supermix4c[] = {0x0706050403020000, 0x03020000 MYALIGN const unsigned long long _supermix7a[] = {0x010c0b060d080702, 0x0904030e03000104}; MYALIGN const unsigned long long _supermix7b[] = {0x8080808080808080, 0x0504070605040f06}; MYALIGN const unsigned long long _k_n[] = {0x4E4E4E4E4E4E4E4E, 0x1B1B1B1B0E0E0E0E}; -MYALIGN const unsigned int _maskd3n[] = {0xffffffff, 0xffffffff, 0xffffffff, 0x00000000}; MYALIGN const unsigned char _shift_one_mask[] = {7, 4, 5, 6, 11, 8, 9, 10, 15, 12, 13, 14, 3, 0, 1, 2}; MYALIGN const unsigned char _shift_four_mask[] = {13, 14, 15, 12, 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8}; MYALIGN const unsigned char _shift_seven_mask[] = {10, 11, 8, 9, 14, 15, 12, 13, 2, 3, 0, 1, 6, 7, 4, 5}; MYALIGN const unsigned char _aes_shift_rows[] = {0, 5, 10, 15, 4, 9, 14, 3, 8, 13, 2, 7, 12, 1, 6, 11}; MYALIGN const unsigned int _inv_shift_rows[] = {0x070a0d00, 0x0b0e0104, 0x0f020508, 0x0306090c}; -MYALIGN const unsigned int _zero[] = {0x00000000, 0x00000000, 0x00000000, 0x00000000}; MYALIGN const unsigned int _mul2mask[] = {0x1b1b0000, 0x00000000, 0x00000000, 0x00000000}; MYALIGN const unsigned int _mul4mask[] = {0x2d361b00, 0x00000000, 0x00000000, 0x00000000}; MYALIGN const unsigned int _lsbmask2[] = {0x03030303, 0x03030303, 0x03030303, 0x03030303}; @@ -61,7 +59,7 @@ MYALIGN const unsigned int _IV512[] = { #define UNPACK_S0(s0, s1, t1)\ s1 = _mm_castps_si128(_mm_insert_ps(_mm_castsi128_ps(s1), _mm_castsi128_ps(s0), 0xc0));\ - s0 = _mm_and_si128(s0, M128(_maskd3n)) + s0 = mm128_mask_32( s0, 8 ) #define CMIX(s1, s2, r1, r2, t1, t2)\ t1 = s1;\ @@ -78,7 +76,7 @@ MYALIGN const unsigned int _IV512[] = { #define UNPACK_S0(s0, s1, t1)\ t1 = _mm_shuffle_epi32(s0, _MM_SHUFFLE(3, 3, 3, 3));\ s1 = _mm_castps_si128(_mm_move_ss(_mm_castsi128_ps(s1), _mm_castsi128_ps(t1)));\ - s0 = _mm_and_si128(s0, M128(_maskd3n)) + s0 = mm128_mask_32( s0, 8 ) #define CMIX(s1, s2, r1, r2, t1, t2)\ t1 = _mm_shuffle_epi32(s1, 0xf9);\ @@ -138,7 +136,7 @@ MYALIGN const unsigned int _IV512[] = { #define SUBSTITUTE(r0, _t1, _t2, _t3, _t0)\ _t2 = _mm_shuffle_epi8(r0, M128(_inv_shift_rows));\ - _t2 = _mm_aesenclast_si128(_t2, M128(_zero)) + _t2 = _mm_aesenclast_si128( _t2, m128_zero ) #define SUPERMIX(t0, t1, t2, t3, t4)\ PRESUPERMIX(t0, t1, t2, t3, t4);\ @@ -181,14 +179,14 @@ MYALIGN const unsigned int _IV512[] = { SUPERMIX(_t2, _t3, _t0, _t1, r1c);\ _t0 = _mm_shuffle_epi32(r1c, 0x39);\ r2c = _mm_xor_si128(r2c, _t0);\ - _t0 = _mm_and_si128(_t0, M128(_maskd3n));\ + _t0 = mm128_mask_32( _t0, 8 ); \ r2d = _mm_xor_si128(r2d, _t0);\ UNPACK_S0(r1c, r1a, _t3);\ SUBSTITUTE(r2c, _t1, _t2, _t3, _t0);\ SUPERMIX(_t2, _t3, _t0, _t1, r2c);\ _t0 = _mm_shuffle_epi32(r2c, 0x39);\ r3c = _mm_xor_si128(r3c, _t0);\ - _t0 = _mm_and_si128(_t0, M128(_maskd3n));\ + _t0 = mm128_mask_32( _t0, 8 ); \ r3d = _mm_xor_si128(r3d, _t0);\ UNPACK_S0(r2c, r2a, _t3);\ SUBSTITUTE(r3c, _t1, _t2, _t3, _t0);\ @@ -203,21 +201,21 @@ MYALIGN const unsigned int _IV512[] = { SUPERMIX(_t2, 
_t3, _t0, _t1, r1c);\ _t0 = _mm_shuffle_epi32(r1c, 0x39);\ r2c = _mm_xor_si128(r2c, _t0);\ - _t0 = _mm_and_si128(_t0, M128(_maskd3n));\ + _t0 = mm128_mask_32( _t0, 8 ); \ r2d = _mm_xor_si128(r2d, _t0);\ UNPACK_S0(r1c, r1a, _t3);\ SUBSTITUTE(r2c, _t1, _t2, _t3, _t0);\ SUPERMIX(_t2, _t3, _t0, _t1, r2c);\ _t0 = _mm_shuffle_epi32(r2c, 0x39);\ r3c = _mm_xor_si128(r3c, _t0);\ - _t0 = _mm_and_si128(_t0, M128(_maskd3n));\ + _t0 = mm128_mask_32( _t0, 8 ); \ r3d = _mm_xor_si128(r3d, _t0);\ UNPACK_S0(r2c, r2a, _t3);\ SUBSTITUTE(r3c, _t1, _t2, _t3, _t0);\ SUPERMIX(_t2, _t3, _t0, _t1, r3c);\ _t0 = _mm_shuffle_epi32(r3c, 0x39);\ r4c = _mm_xor_si128(r4c, _t0);\ - _t0 = _mm_and_si128(_t0, M128(_maskd3n));\ + _t0 = mm128_mask_32( _t0, 8 ); \ r4d = _mm_xor_si128(r4d, _t0);\ UNPACK_S0(r3c, r3a, _t3);\ SUBSTITUTE(r4c, _t1, _t2, _t3, _t0);\ @@ -462,7 +460,7 @@ HashReturn fugue512_Init(hashState_fugue *ctx, int nHashSize) ctx->uBlockLength = 4; for(i = 0; i < 6; i++) - ctx->state[i] = _mm_setzero_si128(); + ctx->state[i] = m128_zero; ctx->state[6] = _mm_load_si128((__m128i*)_IV512 + 0); ctx->state[7] = _mm_load_si128((__m128i*)_IV512 + 1); diff --git a/algo/fugue/fugue-aesni.h b/algo/fugue/fugue-aesni.h index 92a0a2ef..be9806f4 100644 --- a/algo/fugue/fugue-aesni.h +++ b/algo/fugue/fugue-aesni.h @@ -17,7 +17,7 @@ #if defined(__AES__) #include "algo/sha/sha3_common.h" -#include +#include "simd-utils.h" typedef struct diff --git a/algo/groestl/groestl256-intr-4way.h b/algo/groestl/groestl256-intr-4way.h index 32f642bb..8175f745 100644 --- a/algo/groestl/groestl256-intr-4way.h +++ b/algo/groestl/groestl256-intr-4way.h @@ -7,13 +7,13 @@ * This code is placed in the public domain */ - #if !defined(GROESTL256_INTR_4WAY_H__) #define GROESTL256_INTR_4WAY_H__ 1 #include "groestl256-hash-4way.h" -#if defined(__VAES__) +#if defined(__VAES__) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) + static const __m128i round_const_l0[] __attribute__ ((aligned (64))) = { { 0x7060504030201000, 0xffffffffffffffff }, diff --git a/algo/groestl/groestl512-intr-4way.h b/algo/groestl/groestl512-intr-4way.h index a6453798..96788f43 100644 --- a/algo/groestl/groestl512-intr-4way.h +++ b/algo/groestl/groestl512-intr-4way.h @@ -7,13 +7,12 @@ * This code is placed in the public domain */ - #if !defined(GROESTL512_INTR_4WAY_H__) #define GROESTL512_INTR_4WAY_H__ 1 #include "groestl512-hash-4way.h" -#if defined(__VAES__) +#if defined(__VAES__) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) static const __m128i round_const_p[] __attribute__ ((aligned (64))) = { diff --git a/algo/lyra2/phi2-4way.c b/algo/lyra2/phi2-4way.c index cecfd8d3..3d385c90 100644 --- a/algo/lyra2/phi2-4way.c +++ b/algo/lyra2/phi2-4way.c @@ -4,7 +4,7 @@ #include "algo/gost/sph_gost.h" #include "algo/cubehash/cubehash_sse2.h" #include "lyra2.h" -#if defined(__VAES__) +#if defined(__VAES__) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) #include "algo/echo/echo-hash-4way.h" #elif defined(__AES__) #include "algo/echo/aes_ni/hash_api.h" diff --git a/algo/x13/phi1612.c b/algo/x13/phi1612.c index 13c79a3e..33a17ee9 100644 --- a/algo/x13/phi1612.c +++ b/algo/x13/phi1612.c @@ -42,7 +42,6 @@ void init_phi1612_ctx() sph_skein512_init( &phi_ctx.skein ); sph_jh512_init( &phi_ctx.jh ); cubehashInit( &phi_ctx.cube, 512, 16, 32 ); - sph_fugue512_init( &phi_ctx.fugue ); sph_gost512_init( &phi_ctx.gost ); #ifdef __AES__ init_echo( 
&phi_ctx.echo, 512 ); diff --git a/algo/x16/minotaur.c b/algo/x16/minotaur.c index 99575640..ed532424 100644 --- a/algo/x16/minotaur.c +++ b/algo/x16/minotaur.c @@ -7,6 +7,7 @@ #include #include "algo/blake/sph_blake.h" #include "algo/bmw/sph_bmw.h" +//#include "algo/jh/jh-hash-sse2.h" #include "algo/jh/sph_jh.h" #include "algo/keccak/sph_keccak.h" #include "algo/skein/sph_skein.h" @@ -49,6 +50,7 @@ struct TortureGarden sph_blake512_context blake; sph_bmw512_context bmw; sph_skein512_context skein; +// jh512_sse2_hashState jh; sph_jh512_context jh; sph_keccak512_context keccak; hashState_luffa luffa; @@ -125,6 +127,7 @@ static void get_hash( void *output, const void *input, TortureGarden *garden, SHA512_Final( (unsigned char*)hash, &garden->sha512 ); break; case 8: +// jh512_sse2_full( &garden->jh, hash, input, 64 ); sph_jh512_init(&garden->jh); sph_jh512(&garden->jh, input, 64); sph_jh512_close(&garden->jh, hash); diff --git a/build-allarch.sh b/build-allarch.sh index 8b022d02..50a5865f 100755 --- a/build-allarch.sh +++ b/build-allarch.sh @@ -4,7 +4,7 @@ # during develpment. However the information contained may provide compilation # tips to users. -rm cpuminer-avx512-sha-vaes cpuminer-avx512 cpuminer-avx2 cpuminer-aes-avx cpuminer-aes-sse42 cpuminer-sse42 cpuminer-ssse3 cpuminer-sse2 cpuminer-zen > /dev/null +rm cpuminer-avx512-sha-vaes cpuminer-avx512 cpuminer-avx2 cpuminer-aes-avx cpuminer-aes-sse42 cpuminer-sse42 cpuminer-ssse3 cpuminer-sse2 cpuminer-zen cpuminer-zen3 > /dev/null make distclean || echo clean rm -f config.status @@ -87,6 +87,16 @@ mv cpuminer.exe cpuminer-zen.exe strip -s cpuminer mv cpuminer cpuminer-zen +make clean || echo done +rm -f config.status +CFLAGS="-O3 -march=znver2 -mvaes -Wall -fno-common" ./configure --with-curl +# CFLAGS="-O3 -march=znver3 -Wall -fno-common" ./configure --with-curl +make -j 8 +strip -s cpuminer.exe +mv cpuminer.exe cpuminer-zen3.exe +strip -s cpuminer +mv cpuminer cpuminer-zen3 + make clean || echo done rm -f config.status CFLAGS="-O3 -march=native -Wall -fno-common" ./configure --with-curl diff --git a/clean-all.sh b/clean-all.sh index 42aa3ffc..2ca980ec 100755 --- a/clean-all.sh +++ b/clean-all.sh @@ -1,10 +1,9 @@ #!/bin/bash # -# imake clean and rm all the targetted executables. -# tips to users. +# make clean and rm all the targetted executables. -rm cpuminer-avx512-sha-vaes cpuminer-avx512 cpuminer-avx2 cpuminer-avx cpuminer-aes-sse42 cpuminer-sse2 cpuminer-zen cpuminer-sse42 cpuminer-ssse3 > /dev/null +rm cpuminer-avx512-sha-vaes cpuminer-avx512 cpuminer-avx2 cpuminer-avx cpuminer-aes-sse42 cpuminer-sse2 cpuminer-zen cpuminer-sse42 cpuminer-ssse3 cpuminer-zen3 > /dev/null -rm cpuminer-avx512-sha-vaes.exe cpuminer-avx512.exe cpuminer-avx2.exe cpuminer-avx.exe cpuminer-aes-sse42.exe cpuminer-sse2.exe cpuminer-zen.exe cpuminer-sse42 cpuminer-ssse3 > /dev/null +rm cpuminer-avx512-sha-vaes.exe cpuminer-avx512.exe cpuminer-avx2.exe cpuminer-avx.exe cpuminer-aes-sse42.exe cpuminer-sse2.exe cpuminer-zen.exe cpuminer-sse42.exe cpuminer-ssse3.exe cpuminer-zen3.exe > /dev/null make distclean > /dev/null diff --git a/configure b/configure index 6bb6cb34..fcbefb8c 100755 --- a/configure +++ b/configure @@ -1,6 +1,6 @@ #! /bin/sh # Guess values for system-dependent variables and create Makefiles. -# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.15.0. +# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.15.1. # # # Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc. @@ -577,8 +577,8 @@ MAKEFLAGS= # Identity of this package. 
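
The hunks above repeatedly tighten the VAES gate from #if defined(__VAES__) to also require AVX512F/VL/DQ/BW, so Zen 3 CPUs, which advertise VAES without any AVX512, fall back to the AES-NI code paths instead of breaking the compile. A minimal sketch of how that gate could be centralized in one place; the VAES512_ENABLED name is hypothetical and not part of the source tree:

/* Compile-time gate for 512-bit VAES code paths, mirroring the condition
   added to groestl256-intr-4way.h, groestl512-intr-4way.h and phi2-4way.c.
   The VAES512_ENABLED name is illustrative only. */
#if defined(__VAES__) && defined(__AVX512F__) && defined(__AVX512VL__) \
 && defined(__AVX512DQ__) && defined(__AVX512BW__)
  /* Full AVX512 is targeted as well, so the 4-way 512-bit kernels are safe. */
  #define VAES512_ENABLED 1
#else
  /* The VAES bit alone (e.g. -march=znver2 -mvaes as in build-allarch.sh)
     is not enough for the 512-bit kernels; use the AES-NI paths instead. */
  #define VAES512_ENABLED 0
#endif
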
PACKAGE_NAME='cpuminer-opt' PACKAGE_TARNAME='cpuminer-opt' -PACKAGE_VERSION='3.15.0' -PACKAGE_STRING='cpuminer-opt 3.15.0' +PACKAGE_VERSION='3.15.1' +PACKAGE_STRING='cpuminer-opt 3.15.1' PACKAGE_BUGREPORT='' PACKAGE_URL='' @@ -1332,7 +1332,7 @@ if test "$ac_init_help" = "long"; then # Omit some internal or obsolete options to make the list less imposing. # This message is too long to be a string in the A/UX 3.1 sh. cat <<_ACEOF -\`configure' configures cpuminer-opt 3.15.0 to adapt to many kinds of systems. +\`configure' configures cpuminer-opt 3.15.1 to adapt to many kinds of systems. Usage: $0 [OPTION]... [VAR=VALUE]... @@ -1404,7 +1404,7 @@ fi if test -n "$ac_init_help"; then case $ac_init_help in - short | recursive ) echo "Configuration of cpuminer-opt 3.15.0:";; + short | recursive ) echo "Configuration of cpuminer-opt 3.15.1:";; esac cat <<\_ACEOF @@ -1509,7 +1509,7 @@ fi test -n "$ac_init_help" && exit $ac_status if $ac_init_version; then cat <<\_ACEOF -cpuminer-opt configure 3.15.0 +cpuminer-opt configure 3.15.1 generated by GNU Autoconf 2.69 Copyright (C) 2012 Free Software Foundation, Inc. @@ -2012,7 +2012,7 @@ cat >config.log <<_ACEOF This file contains any messages produced by compilers while running configure, to aid debugging if configure makes a mistake. -It was created by cpuminer-opt $as_me 3.15.0, which was +It was created by cpuminer-opt $as_me 3.15.1, which was generated by GNU Autoconf 2.69. Invocation command line was $ $0 $@ @@ -2993,7 +2993,7 @@ fi # Define the identity of the package. PACKAGE='cpuminer-opt' - VERSION='3.15.0' + VERSION='3.15.1' cat >>confdefs.h <<_ACEOF @@ -6690,7 +6690,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 # report actual input values of CONFIG_FILES etc. instead of their # values after options handling. ac_log=" -This file was extended by cpuminer-opt $as_me 3.15.0, which was +This file was extended by cpuminer-opt $as_me 3.15.1, which was generated by GNU Autoconf 2.69. 
Invocation command line was CONFIG_FILES = $CONFIG_FILES @@ -6756,7 +6756,7 @@ _ACEOF cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`" ac_cs_version="\\ -cpuminer-opt config.status 3.15.0 +cpuminer-opt config.status 3.15.1 configured by $0, generated by GNU Autoconf 2.69, with options \\"\$ac_cs_config\\" diff --git a/configure.ac b/configure.ac index 69a0f573..c0b3c5f3 100644 --- a/configure.ac +++ b/configure.ac @@ -1,4 +1,4 @@ -AC_INIT([cpuminer-opt], [3.15.0]) +AC_INIT([cpuminer-opt], [3.15.1]) AC_PREREQ([2.59c]) AC_CANONICAL_SYSTEM diff --git a/cpu-miner.c b/cpu-miner.c index 0b80c530..ebfdb7d9 100644 --- a/cpu-miner.c +++ b/cpu-miner.c @@ -1829,10 +1829,11 @@ bool submit_solution( struct work *work, const void *hash, update_submit_stats( work, hash ); if unlikely( !have_stratum && !have_longpoll ) - { // block solved, force getwork + { // solo, block solved, force getwork pthread_rwlock_wrlock( &g_work_lock ); g_work_time = 0; pthread_rwlock_unlock( &g_work_lock ); + restart_threads(); } if ( !opt_quiet ) @@ -1868,25 +1869,25 @@ static bool wanna_mine(int thr_id) bool state = true; if (opt_max_temp > 0.0) - { + { float temp = cpu_temp(0); if (temp > opt_max_temp) - { + { if (!thr_id && !conditional_state[thr_id] && !opt_quiet) applog(LOG_INFO, "temperature too high (%.0fC), waiting...", temp); state = false; } } if (opt_max_diff > 0.0 && net_diff > opt_max_diff) - { + { if (!thr_id && !conditional_state[thr_id] && !opt_quiet) applog(LOG_INFO, "network diff too high, waiting..."); state = false; } if (opt_max_rate > 0.0 && net_hashrate > opt_max_rate) - { + { if (!thr_id && !conditional_state[thr_id] && !opt_quiet) - { + { char rate[32]; format_hashrate(opt_max_rate, rate); applog(LOG_INFO, "network hashrate too high, waiting %s...", rate); @@ -1903,7 +1904,7 @@ static bool wanna_mine(int thr_id) // default void sha256d_gen_merkle_root( char* merkle_root, struct stratum_ctx* sctx ) { - sha256d(merkle_root, sctx->job.coinbase, (int) sctx->job.coinbase_size); + sha256d( merkle_root, sctx->job.coinbase, (int) sctx->job.coinbase_size ); for ( int i = 0; i < sctx->job.merkle_count; i++ ) { memcpy( merkle_root + 32, sctx->job.merkle[i], 32 ); @@ -2038,7 +2039,7 @@ static void stratum_gen_work( struct stratum_ctx *sctx, struct work *g_work ) { unsigned char *xnonce2str = abin2hex( g_work->xnonce2, g_work->xnonce2_len ); - applog( LOG_INFO, "Extranonce %s, Block %d, Net Diff %.5g", + applog( LOG_INFO, "Extranonce2 %s, Block %d, Net Diff %.5g", xnonce2str, sctx->block_height, net_diff ); free( xnonce2str ); } @@ -3509,7 +3510,7 @@ bool check_cpu_capability () use_avx2 = cpu_has_avx2 && sw_has_avx2 && algo_has_avx2; use_avx512 = cpu_has_avx512 && sw_has_avx512 && algo_has_avx512; use_sha = cpu_has_sha && sw_has_sha && algo_has_sha; - use_vaes = cpu_has_vaes && sw_has_vaes && algo_has_vaes; + use_vaes = cpu_has_vaes && sw_has_vaes && algo_has_vaes && use_avx512; use_none = !( use_sse2 || use_aes || use_sse42 || use_avx512 || use_avx2 || use_sha || use_vaes ); diff --git a/simd-utils/simd-128.h b/simd-utils/simd-128.h index 986304ce..8b1fbeba 100644 --- a/simd-utils/simd-128.h +++ b/simd-utils/simd-128.h @@ -135,11 +135,17 @@ static inline __m128i mm128_neg1_fn() // Bitwise not (~v) #define mm128_not( v ) _mm_xor_si128( (v), m128_neg1 ) -// Unary negation of elements +// Unary negation of elements (-v) #define mm128_negate_64( v ) _mm_sub_epi64( m128_zero, v ) #define mm128_negate_32( v ) _mm_sub_epi32( m128_zero, 
v ) #define mm128_negate_16( v ) _mm_sub_epi16( m128_zero, v ) +// Clear (zero) 32 bit elements based on bits set in 4 bit mask. +// Fast, avoids using vector mask, but only available for 128 bit vectors. +#define mm128_mask_32( a, mask ) \ + _mm_castps_si128( _mm_insert_ps( _mm_castsi128_ps( a ), \ + _mm_castsi128_ps( a ), mask ) ) + // Add 4 values, fewer dependencies than sequential addition. #define mm128_add4_64( a, b, c, d ) \ _mm_add_epi64( _mm_add_epi64( a, b ), _mm_add_epi64( c, d ) ) @@ -269,11 +275,8 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n ) // Rotate vector elements accross all lanes #define mm128_swap_64( v ) _mm_shuffle_epi32( v, 0x4e ) - #define mm128_ror_1x32( v ) _mm_shuffle_epi32( v, 0x39 ) #define mm128_rol_1x32( v ) _mm_shuffle_epi32( v, 0x93 ) - - //#define mm128_swap_64( v ) _mm_alignr_epi8( v, v, 8 ) //#define mm128_ror_1x32( v ) _mm_alignr_epi8( v, v, 4 ) //#define mm128_rol_1x32( v ) _mm_alignr_epi8( v, v, 12 ) @@ -282,53 +285,11 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n ) #define mm128_ror_1x8( v ) _mm_alignr_epi8( v, v, 1 ) #define mm128_rol_1x8( v ) _mm_alignr_epi8( v, v, 15 ) +// Rotate by c bytes #define mm128_ror_x8( v, c ) _mm_alignr_epi8( v, c ) #define mm128_rol_x8( v, c ) _mm_alignr_epi8( v, 16-(c) ) -/* -// Rotate 16 byte (128 bit) vector by c bytes. -// Less efficient using shift but more versatile. Use only for odd number -// byte rotations. Use shuffle above whenever possible. -#define mm128_ror_x8( v, c ) \ - _mm_or_si128( _mm_srli_si128( v, c ), _mm_slli_si128( v, 16-(c) ) ) - -#define mm128_rol_x8( v, c ) \ - _mm_or_si128( _mm_slli_si128( v, c ), _mm_srli_si128( v, 16-(c) ) ) - -#if defined (__SSE3__) -// no SSE2 implementation, no current users - -#define mm128_ror_1x16( v ) \ - _mm_shuffle_epi8( v, m128_const_64( 0x01000f0e0d0c0b0a, \ - 0x0908070605040302 ) ) -#define mm128_rol_1x16( v ) \ - _mm_shuffle_epi8( v, m128_const_64( 0x0d0c0b0a09080706, \ - 0x0504030201000f0e ) ) -#define mm128_ror_1x8( v ) \ - _mm_shuffle_epi8( v, m128_const_64( 0x000f0e0d0c0b0a09, \ - 0x0807060504030201 ) ) -#define mm128_rol_1x8( v ) \ - _mm_shuffle_epi8( v, m128_const_64( 0x0e0d0c0b0a090807, \ - 0x060504030201000f ) ) -#else // SSE2 - -#define mm128_ror_1x16( v ) \ - _mm_or_si128( _mm_srli_si128( v, 2 ), _mm_slli_si128( v, 14 ) ) - -#define mm128_rol_1x16( v ) \ - _mm_or_si128( _mm_slli_si128( v, 2 ), _mm_srli_si128( v, 14 ) ) - -#define mm128_ror_1x8( v ) \ - _mm_or_si128( _mm_srli_si128( v, 1 ), _mm_slli_si128( v, 15 ) ) - -#define mm128_rol_1x8( v ) \ - _mm_or_si128( _mm_slli_si128( v, 1 ), _mm_srli_si128( v, 15 ) ) - -#endif // SSE3 else SSE2 -*/ - - // Invert vector: {3,2,1,0} -> {0,1,2,3} #define mm128_invert_32( v ) _mm_shuffle_epi32( v, 0x1b ) diff --git a/simd-utils/simd-256.h b/simd-utils/simd-256.h index 9a165285..155293a9 100644 --- a/simd-utils/simd-256.h +++ b/simd-utils/simd-256.h @@ -26,8 +26,6 @@ #define mm256_concat_128( hi, lo ) \ _mm256_inserti128_si256( _mm256_castsi128_si256( lo ), hi, 1 ) -#define m256_const1_128( v ) \ - _mm256_broadcastsi128_si256( v ) // Equavalent of set, move 64 bit integer constants to respective 64 bit // elements. @@ -144,10 +142,11 @@ do { \ // Parallel AES, for when x is expected to be in a 256 bit register. // Use same 128 bit key. 
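
The new mm128_mask_32( a, mask ) macro above is what replaces the _maskd3n loads removed from fugue-aesni.c: _mm_insert_ps zeroes the 32-bit elements selected by the low four bits of its immediate, so a mask of 8 clears element 3, exactly what ANDing with { -1, -1, -1, 0 } did, without loading a constant from memory. A small standalone check of that equivalence, assuming SSE4.1 (compile with -msse4.1); the test harness itself is not part of the project:

#include <stdio.h>
#include <smmintrin.h>   /* SSE4.1: _mm_insert_ps, _mm_extract_epi32 */

/* Same definition as the one added to simd-utils/simd-128.h. */
#define mm128_mask_32( a, mask ) \
  _mm_castps_si128( _mm_insert_ps( _mm_castsi128_ps( a ), \
                                   _mm_castsi128_ps( a ), mask ) )

int main()
{
   __m128i v        = _mm_set_epi32( 0x44444444, 0x33333333,
                                     0x22222222, 0x11111111 );
   __m128i old_mask = _mm_set_epi32( 0, -1, -1, -1 );      /* _maskd3n */
   __m128i a = mm128_mask_32( v, 8 );                      /* zero element 3 */
   __m128i b = _mm_and_si128( v, old_mask );               /* removed code path */
   /* Prints "1 22222222": the two results match, element 1 is untouched. */
   printf( "%d %08x\n",
           _mm_movemask_epi8( _mm_cmpeq_epi32( a, b ) ) == 0xffff,
           _mm_extract_epi32( a, 1 ) );
   return 0;
}
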
-#if defined(__VAES__) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +//#if defined(__VAES__) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if 0 #define mm256_aesenc_2x128( x, k ) \ - _mm256_aesenc_epi128( x, m256_const1_128(k ) ) + _mm256_aesenc_epi128( x, k ) #else diff --git a/simd-utils/simd-512.h b/simd-utils/simd-512.h index 571c36b4..a13e88f4 100644 --- a/simd-utils/simd-512.h +++ b/simd-utils/simd-512.h @@ -56,15 +56,15 @@ // If an expensive constant is to be reused in the same function it should // be declared as a local variable defined once and reused. // -// Permutations cab be very exppensive if they use a vector control index, +// Permutations can be very expensive if they use a vector control index, // even if the permutation itself is quite efficient. // The index is essentially a constant with all the baggage that brings. // The same rules apply, if an index is to be reused it should be defined // as a local. This applies specifically to bswap operations. // // Additionally, permutations using smaller vectors can be more efficient -// if the permutation doesn't cross lane boundaries ,typically 128 bits, -// ans the smnaller vector can use an imm comtrol. +// if the permutation doesn't cross lane boundaries, typically 128 bits, +// and the smnaller vector can use an imm comtrol. // // If the permutation doesn't cross lane boundaries a shuffle instructions // can be used with imm control instead of permute. @@ -182,7 +182,10 @@ static inline __m512i m512_const4_64( const uint64_t i3, const uint64_t i2, // // Basic operations without SIMD equivalent +// ~x #define mm512_not( x ) _mm512_xor_si512( x, m512_neg1 ) + +// -x #define mm512_negate_64( x ) _mm512_sub_epi64( m512_zero, x ) #define mm512_negate_32( x ) _mm512_sub_epi32( m512_zero, x ) #define mm512_negate_16( x ) _mm512_sub_epi16( m512_zero, x ) @@ -443,20 +446,13 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n ) // // Rotate elements within 256 bit lanes of 512 bit vector. -// Rename these for consistency. Element size is always last. 
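
The mm256_aesenc_2x128 hunk above disables the single-instruction _mm256_aesenc_epi128 path with #if 0, since 256-bit VAES is only guaranteed together with AVX512VL under the original Intel definition and proper detection of the Zen 3 variant is not in place yet; the #else branch, not shown in this hunk, performs the operation with two AES-NI rounds. A sketch of that split-and-recombine technique, assuming the fallback broadly has this shape (the function name is illustrative, not the project's code):

#include <immintrin.h>   /* needs -mavx2 -maes */

/* One AES round on both 128-bit halves of a 256-bit register, using the
   same 128-bit round key for each half, as the comment above describes. */
static inline __m256i aesenc_2x128_sketch( __m256i x, __m128i k )
{
   __m128i lo = _mm256_castsi256_si128( x );        /* low 128 bits  */
   __m128i hi = _mm256_extracti128_si256( x, 1 );   /* high 128 bits */
   lo = _mm_aesenc_si128( lo, k );
   hi = _mm_aesenc_si128( hi, k );
   return _mm256_inserti128_si256( _mm256_castsi128_si256( lo ), hi, 1 );
}
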
-// mm__ - - // Swap hi & lo 128 bits in each 256 bit lane - #define mm512_swap256_128( v ) _mm512_permutex_epi64( v, 0x4e ) // Rotate 256 bit lanes by one 64 bit element - #define mm512_ror256_64( v ) _mm512_permutex_epi64( v, 0x39 ) #define mm512_rol256_64( v ) _mm512_permutex_epi64( v, 0x93 ) - // Rotate 256 bit lanes by one 32 bit element #define mm512_ror256_32( v ) \ diff --git a/sysinfos.c b/sysinfos.c index c010a9af..1d5cdf39 100644 --- a/sysinfos.c +++ b/sysinfos.c @@ -331,16 +331,20 @@ static inline void cpu_getmodelid(char *outbuf, size_t maxsz) // Feature flags // CPU_INFO ECX -#define XSAVE_Flag (1<<26) -#define OSXSAVE_Flag (1<<27) -#define AVX_Flag (1<<28) +#define SSE3_Flag 1 +#define SSSE3_Flag (1<< 9) #define XOP_Flag (1<<11) #define FMA3_Flag (1<<12) #define AES_Flag (1<<25) +#define SSE41_Flag (1<<19) #define SSE42_Flag (1<<20) +#define AES_Flag (1<<25) +#define XSAVE_Flag (1<<26) +#define OSXSAVE_Flag (1<<27) +#define AVX_Flag (1<<28) // CPU_INFO EDX -#define SSE_Flag (1<<25) // EDX +#define SSE_Flag (1<<25) #define SSE2_Flag (1<<26) // EXTENDED_FEATURES EBX @@ -359,8 +363,8 @@ static inline void cpu_getmodelid(char *outbuf, size_t maxsz) // Use this to detect presence of feature #define AVX_mask (AVX_Flag|XSAVE_Flag|OSXSAVE_Flag) -#define FMA3_mask (FMA3_Flag|AVX_mask) -#define AVX512_mask (AVX512VL_Flag|AVX512BW_Flag|AVX512DQ_Flag|AVX512F_Flag) +#define FMA3_mask (FMA3_Flag|AVX_mask) +#define AVX512_mask (AVX512VL_Flag|AVX512BW_Flag|AVX512DQ_Flag|AVX512F_Flag) static inline bool has_sha() { @@ -476,6 +480,15 @@ static inline bool has_avx512() #endif } +// AMD Zen3 added support for 256 bit VAES without requiring AVX512. +// The original Intel spec requires AVX512F to support 512 bit VAES and +// requires AVX512VL to support 256 bit VAES. +// cpuminer-opt only uses VAES512, simply testing the VAES bit is sufficient. +// However, proper detection of VAES512 and VAES256 requires more work: +// VAES512 = VAES && AVX512F (may not support VAES256) +// VAES256 = AVX512VL ? VAES : ( AVX && VAES ) (may not support VAES512) +// VAES = VAES && AVX512F && AVX512VL (supports both) + static inline bool has_vaes() { #ifdef __arm__ diff --git a/util.c b/util.c index 0eee4282..decd65ba 100644 --- a/util.c +++ b/util.c @@ -1485,9 +1485,12 @@ static bool stratum_parse_extranonce(struct stratum_ctx *sctx, json_t *params, i sctx->xnonce2_size = xn2_size; pthread_mutex_unlock(&sctx->work_lock); - if (pndx == 0 && opt_debug) /* pool dynamic change */ - applog(LOG_DEBUG, "Stratum set nonce %s with extranonce2 size=%d", - xnonce1, xn2_size); + if ( !opt_quiet ) /* pool dynamic change */ + applog( LOG_INFO, "Stratum extranonce1= %s, extranonce2 size= %d", + xnonce1, xn2_size); +// if (pndx == 0 && opt_debug) +// applog(LOG_DEBUG, "Stratum set nonce %s with extranonce2 size=%d", +// xnonce1, xn2_size); return true; out: @@ -1581,8 +1584,6 @@ bool stratum_subscribe(struct stratum_ctx *sctx) return ret; } -extern bool opt_extranonce; - bool stratum_authorize(struct stratum_ctx *sctx, const char *user, const char *pass) { json_t *val = NULL, *res_val, *err_val;
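
The sysinfos.c comment above separates three cases: VAES512 needs AVX512F, VAES256 needs AVX512VL on Intel but only AVX plus the VAES bit on Zen 3, and full VAES needs both. Because cpuminer-opt only uses the 512-bit form, check_cpu_capability() in cpu-miner.c simply requires use_avx512 alongside use_vaes; a finer-grained runtime split could look like the sketch below. has_vaes() exists in sysinfos.c, while has_avx512f(), has_avx512vl() and has_avx() are assumed helpers in the same style and may not exist under these names.

/* Sketch of the finer-grained detection described in the comment above,
   written as if it lived alongside the other has_*() helpers in sysinfos.c.
   Helper names other than has_vaes() are assumptions, not the project's API. */
static inline bool has_vaes512()
{
   /* 512-bit VAES requires AVX512F; the part may still lack 256-bit VAES. */
   return has_vaes() && has_avx512f();
}

static inline bool has_vaes256()
{
   /* Intel: 256-bit VAES requires AVX512VL.
      AMD Zen3: VAES with plain AVX and no AVX512 at all. */
   return has_avx512vl() ? has_vaes() : ( has_avx() && has_vaes() );
}

Either way the effect matches the cpu-miner.c change earlier in this patch: a Zen 3 CPU that reports VAES no longer selects 512-bit code paths it cannot execute.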