v3.10.1

JayDDee · Dec 6, 2019 · 73430b1
1 parent 4003938
commit 73430b1
Show file tree

Hide file tree

Showing 52 changed files with 4,531 additions and 890 deletions.
diff --git a/README.md b/README.md
@@ -144,6 +144,9 @@ Supported Algorithms
 Errata
 ------
 
+Old algorithms that are no longer used frequently will not have the latest
+optimizations.
+
 Cryptonight and variants are no longer supported, use another miner.
 
 Neoscrypt crashes on Windows, use legacy version.

diff --git a/README.txt b/README.txt
@@ -15,8 +15,8 @@ the features listed at cpuminer startup to ensure you are mining at
 optimum speed using the best available features.
 
 Architecture names and compile options used are only provided for Intel
-Core series. Even the newest Pentium and Celeron CPUs are often missing
-features.
+Core series. Budget CPUs like Pentium and Celeron are often missing the
+latest features.
 
 AMD CPUs older than Piledriver, including Athlon x2 and Phenom II x4, are not
 supported by cpuminer-opt due to an incompatible implementation of SSE2 on
@@ -28,7 +28,7 @@ Exe name                Compile flags            Arch name
 cpuminer-sse2.exe      "-msse2"                  Core2, Nehalem   
 cpuminer-aes-sse42.exe "-march=westmere"         Westmere
 cpuminer-avx.exe       "-march=corei7-avx"       Sandy-Ivybridge
-cpuminer-avx2.exe      "-march=core-avx2"        Haswell, Sky-Kaby-Coffeelake
+cpuminer-avx2.exe      "-march=core-avx2 -maes"  Haswell, Sky-Kaby-Coffeelake
 cpuminer-avx512.exe    "-march=skylake-avx512"   Skylake-X, Cascadelake-X
 cpuminer-zen           "-march=znver1"           AMD Ryzen, Threadripper
 

diff --git a/RELEASE_NOTES b/RELEASE_NOTES
@@ -31,9 +31,20 @@ FreeBSD YMMV.
 Change Log
 ----------
 
+v3.10.1
+
+AVX512 for blake2b, nist5, quark, tribus.
+
+More broken lane fixes.
+
+Fixed buffer overflow in skein AVX512.
+
+Only the highest ranking feature in a class is listed at startup; lower ranking
+features are still available but are no longer listed.
+
 v3.10.0
 
-AVX-512 is now supported on selected algos, Windows binary is now available.
+AVX512 is now supported on selected algos; a Windows binary is now available.
 AVX512 optimizations are available for argon2d, blake2s, keccak, keccakc,
 skein & skein2.
 
@@ -45,7 +56,7 @@ Fixed some previously undetected buffer overflows.
 
 Lyra2rev2 3% faster SSE2 and AVX2.
 
-Added "-fno-asynchronous-unwind-tables" to AVX512 build acript for Windows
+Added "-fno-asynchronous-unwind-tables" to AVX512 build script for Windows
 to fix known mingw issue.
 
 Changed AVX2 build script to explicitly add AES to address change in

diff --git a/algo/argon2/argon2d/argon2d/opt.c b/algo/argon2/argon2d/argon2d/opt.c
@@ -21,7 +21,7 @@
 
 #include "argon2.h"
 #include "core.h"
-
+#include "simd-utils.h"
 #include "../blake2/blake2.h"
 #include "../blake2/blamka-round-opt.h"
 
@@ -37,24 +37,28 @@
 
 #if defined(__AVX512F__)
 
-static void fill_block(__m512i *state, const block *ref_block,
-                       block *next_block, int with_xor) {
+static void fill_block( __m512i *state, const block *ref_block,
+                       block *next_block, int with_xor )
+{
     __m512i block_XY[ARGON2_512BIT_WORDS_IN_BLOCK];
     unsigned int i;
 
-    if (with_xor) {
-        for (i = 0; i < ARGON2_512BIT_WORDS_IN_BLOCK; i++) {
-            state[i] = _mm512_xor_si512(
-                state[i], _mm512_loadu_si512((const __m512i *)ref_block->v + i));
-            block_XY[i] = _mm512_xor_si512(
-                state[i], _mm512_loadu_si512((const __m512i *)next_block->v + i));
-        }
-    } else {
-        for (i = 0; i < ARGON2_512BIT_WORDS_IN_BLOCK; i++) {
-            block_XY[i] = state[i] = _mm512_xor_si512(
-                state[i], _mm512_loadu_si512((const __m512i *)ref_block->v + i));
+    if ( with_xor )
+    {
+        for ( i = 0; i < ARGON2_512BIT_WORDS_IN_BLOCK; i++ )
+        {
+            state[i] = _mm512_xor_si512( state[i],
+                      _mm512_load_si512( (const __m512i*)ref_block->v + i ) );
+            block_XY[i] = _mm512_xor_si512( state[i],
+                      _mm512_load_si512( (const __m512i*)next_block->v + i ) );
         }
     }
+    else
+    {
+        for ( i = 0; i < ARGON2_512BIT_WORDS_IN_BLOCK; i++ )
+            block_XY[i] = state[i] = _mm512_xor_si512( state[i],
+                      _mm512_load_si512( (const __m512i*)ref_block->v + i ) );
+    }
 
     BLAKE2_ROUND_1( state[ 0], state[ 1], state[ 2], state[ 3],
                     state[ 4], state[ 5], state[ 6], state[ 7] );
@@ -66,23 +70,10 @@ static void fill_block(__m512i *state, const block *ref_block,
     BLAKE2_ROUND_2( state[ 1], state[ 3], state[ 5], state[ 7],
                     state[ 9], state[11], state[13], state[15] );
 
-/*
-    for (i = 0; i < 2; ++i) {
-        BLAKE2_ROUND_1(
-            state[8 * i + 0], state[8 * i + 1], state[8 * i + 2], state[8 * i + 3],
-            state[8 * i + 4], state[8 * i + 5], state[8 * i + 6], state[8 * i + 7]);
-    }
-
-    for (i = 0; i < 2; ++i) {
-        BLAKE2_ROUND_2(
-            state[2 * 0 + i], state[2 * 1 + i], state[2 * 2 + i], state[2 * 3 + i],
-            state[2 * 4 + i], state[2 * 5 + i], state[2 * 6 + i], state[2 * 7 + i]);
-    }
-*/
-
-    for (i = 0; i < ARGON2_512BIT_WORDS_IN_BLOCK; i++) {
-        state[i] = _mm512_xor_si512(state[i], block_XY[i]);
-        _mm512_storeu_si512((__m512i *)next_block->v + i, state[i]);
+    for ( i = 0; i < ARGON2_512BIT_WORDS_IN_BLOCK; i++ )
+    {
+        state[i] = _mm512_xor_si512( state[i], block_XY[i] );
+        _mm512_store_si512( (__m512i*)next_block->v + i, state[i] );
     }
 }
 
@@ -125,18 +116,6 @@ static void fill_block(__m256i *state, const block *ref_block,
     BLAKE2_ROUND_2( state[ 3], state[ 7], state[11], state[15],
                     state[19], state[23], state[27], state[31] );
 
-/*
-    for (i = 0; i < 4; ++i) {
-        BLAKE2_ROUND_1(state[8 * i + 0], state[8 * i + 4], state[8 * i + 1], state[8 * i + 5],
-                       state[8 * i + 2], state[8 * i + 6], state[8 * i + 3], state[8 * i + 7]);
-    }
-
-    for (i = 0; i < 4; ++i) {
-        BLAKE2_ROUND_2(state[ 0 + i], state[ 4 + i], state[ 8 + i], state[12 + i],
-                       state[16 + i], state[20 + i], state[24 + i], state[28 + i]);
-    }
-*/
-
     for (i = 0; i < ARGON2_HWORDS_IN_BLOCK; i++) {
         state[i] = _mm256_xor_si256(state[i], block_XY[i]);
         _mm256_store_si256((__m256i *)next_block->v + i, state[i]);
@@ -153,14 +132,14 @@ static void fill_block(__m128i *state, const block *ref_block,
     if (with_xor) {
         for (i = 0; i < ARGON2_OWORDS_IN_BLOCK; i++) {
             state[i] = _mm_xor_si128(
-                state[i], _mm_loadu_si128((const __m128i *)ref_block->v + i));
+                state[i], _mm_load_si128((const __m128i *)ref_block->v + i));
             block_XY[i] = _mm_xor_si128(
-                state[i], _mm_loadu_si128((const __m128i *)next_block->v + i));
+                state[i], _mm_load_si128((const __m128i *)next_block->v + i));
         }
     } else {
         for (i = 0; i < ARGON2_OWORDS_IN_BLOCK; i++) {
             block_XY[i] = state[i] = _mm_xor_si128(
-                state[i], _mm_loadu_si128((const __m128i *)ref_block->v + i));
+                state[i], _mm_load_si128((const __m128i *)ref_block->v + i));
         }
     }
 
@@ -198,22 +177,9 @@ static void fill_block(__m128i *state, const block *ref_block,
     BLAKE2_ROUND( state[ 7], state[15], state[23], state[31],  
                   state[39], state[47], state[55], state[63] );
 
-/*
-    for (i = 0; i < 8; ++i) {
-        BLAKE2_ROUND(state[8 * i + 0], state[8 * i + 1], state[8 * i + 2],
-            state[8 * i + 3], state[8 * i + 4], state[8 * i + 5],
-            state[8 * i + 6], state[8 * i + 7]);
-    }
-
-    for (i = 0; i < 8; ++i) {
-        BLAKE2_ROUND(state[8 * 0 + i], state[8 * 1 + i], state[8 * 2 + i],
-            state[8 * 3 + i], state[8 * 4 + i], state[8 * 5 + i],
-            state[8 * 6 + i], state[8 * 7 + i]);
-    }
-*/
     for (i = 0; i < ARGON2_OWORDS_IN_BLOCK; i++) {
         state[i] = _mm_xor_si128(state[i], block_XY[i]);
-        _mm_storeu_si128((__m128i *)next_block->v + i, state[i]);
+        _mm_store_si128((__m128i *)next_block->v + i, state[i]);
     }
 }
 

diff --git a/algo/argon2/argon2d/blake2/blamka-round-opt.h b/algo/argon2/argon2d/blake2/blamka-round-opt.h
@@ -427,14 +427,14 @@ static __m512i muladd(__m512i x, __m512i y)
 #define SWAP_QUARTERS(A0, A1) \
     do { \
         SWAP_HALVES(A0, A1); \
-        A0 = _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 1, 4, 5, 2, 3, 6, 7), A0); \
-        A1 = _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 1, 4, 5, 2, 3, 6, 7), A1); \
+        A0 = _mm512_shuffle_i64x2( A0, A0, 0xd8 ); \
+        A1 = _mm512_shuffle_i64x2( A1, A1, 0xd8 ); \
     } while((void)0, 0)
 
 #define UNSWAP_QUARTERS(A0, A1) \
     do { \
-        A0 = _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 1, 4, 5, 2, 3, 6, 7), A0); \
-        A1 = _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 1, 4, 5, 2, 3, 6, 7), A1); \
+        A0 = _mm512_shuffle_i64x2( A0, A0, 0xd8 ); \
+        A1 = _mm512_shuffle_i64x2( A1, A1, 0xd8 ); \
         SWAP_HALVES(A0, A1); \
     } while((void)0, 0)
 

diff --git a/algo/blake/blake-hash-4way.h b/algo/blake/blake-hash-4way.h
@@ -118,20 +118,42 @@ void blake256r8_8way_close(void *cc, void *dst);
 // Blake-512 4 way
 
 typedef struct {
-   __m256i buf[16] __attribute__ ((aligned (64)));
+   __m256i buf[16];
    __m256i H[8];
    __m256i S[4];   
    size_t ptr;
    sph_u64 T0, T1;
-} blake_4way_big_context;
+} blake_4way_big_context __attribute__ ((aligned (128)));
 
 typedef blake_4way_big_context blake512_4way_context;
 
-void blake512_4way_init(void *cc);
-void blake512_4way(void *cc, const void *data, size_t len);
-void blake512_4way_close(void *cc, void *dst);
-void blake512_4way_addbits_and_close(
-	void *cc, unsigned ub, unsigned n, void *dst);
+void blake512_4way_init( void *cc );
+void blake512_4way_update( void *cc, const void *data, size_t len );
+#define blake512_4way blake512_4way_update
+void blake512_4way_close( void *cc, void *dst );
+void blake512_4way_addbits_and_close( void *cc, unsigned ub, unsigned n,
+                                      void *dst );
+
+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+
+typedef struct {
+   __m512i buf[16];
+   __m512i H[8];
+   __m512i S[4];
+   size_t ptr;
+   sph_u64 T0, T1;
+} blake_8way_big_context __attribute__ ((aligned (128)));
+
+typedef blake_8way_big_context blake512_8way_context;
+
+void blake512_8way_init( void *cc );
+void blake512_8way_update( void *cc, const void *data, size_t len );
+void blake512_8way_close( void *cc, void *dst );
+void blake512_8way_addbits_and_close( void *cc, unsigned ub, unsigned n,
+                                      void *dst );
+
+#endif  // AVX512
+
 
 #endif  // AVX2
 

diff --git a/algo/blake/blake2b-4way.c b/algo/blake/blake2b-4way.c
@@ -17,7 +17,7 @@ int scanhash_blake2b_8way( struct work *work, uint32_t max_nonce,
    uint32_t vdata[20*8] __attribute__ ((aligned (64)));;
    uint32_t lane_hash[8] __attribute__ ((aligned (64)));
    blake2b_8way_ctx ctx __attribute__ ((aligned (64)));
-   uint32_t *hash7 = &(hash[25]);   // 3*8+1
+   uint32_t *hash7 = &(hash[49]);   // 3*16+1
    uint32_t *pdata = work->data;
    uint32_t *ptarget = work->target;
    int thr_id = mythr->id;