Skip to content

Commit

Permalink
v3.10.1
Browse files Browse the repository at this point in the history
  • Loading branch information
JayDDee committed Dec 6, 2019
1 parent 4003938 commit 73430b1
Show file tree
Hide file tree
Showing 52 changed files with 4,531 additions and 890 deletions.
3 changes: 3 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,9 @@ Supported Algorithms
Errata
------

Old algorithms that are no longer used frequently will not have the latest
optimizations.

Cryptonight and variants are no longer supported, use another miner.

Neoscrypt crashes on Windows, use legacy version.
Expand Down
6 changes: 3 additions & 3 deletions README.txt
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,8 @@ the features listed at cpuminer startup to ensure you are mining at
optimum speed using the best available features.

Architecture names and compile options used are only provided for Intel
Core series. Even the newest Pentium and Celeron CPUs are often missing
features.
Core series. Budget CPUs like Pentium and Celeron are often missing the
latest features.

AMD CPUs older than Piledriver, including Athlon x2 and Phenom II x4, are not
supported by cpuminer-opt due to an incompatible implementation of SSE2 on
Expand All @@ -28,7 +28,7 @@ Exe name Compile flags Arch name
cpuminer-sse2.exe "-msse2" Core2, Nehalem
cpuminer-aes-sse42.exe "-march=westmere" Westmere
cpuminer-avx.exe "-march=corei7-avx" Sandy-Ivybridge
cpuminer-avx2.exe "-march=core-avx2" Haswell, Sky-Kaby-Coffeelake
cpuminer-avx2.exe "-march=core-avx2 -maes" Haswell, Sky-Kaby-Coffeelake
cpuminer-avx512.exe "-march=skylake-avx512" Skylake-X, Cascadelake-X
cpuminer-zen "-march=znver1" AMD Ryzen, Threadripper

Expand Down
15 changes: 13 additions & 2 deletions RELEASE_NOTES
Original file line number Diff line number Diff line change
Expand Up @@ -31,9 +31,20 @@ FreeBSD YMMV.
Change Log
----------

v3.10.1

AVX512 for blake2b, nist5, quark, tribus.

More broken lane fixes.

Fixed buffer overflow in skein AVX512.

Only the highest ranking feature in a class is listed at startup, lower ranking
features are available but no longer listed.

v3.10.0

AVX-512 is now supported on selected algos, Windows binary is now available.
AVX512 is now supported on selected algos, Windows binary is now available.
AVX512 optimizations are available for argon2d, blake2s, keccak, keccakc,
skein & skein2.

Expand All @@ -45,7 +56,7 @@ Fixed some previously undetected buffer overflows.

Lyra2rev2 3% faster SSE2 and AVX2.

Added "-fno-asynchronous-unwind-tables" to AVX512 build acript for Windows
Added "-fno-asynchronous-unwind-tables" to AVX512 build script for Windows
to fix known mingw issue.

Changed AVX2 build script to explicitly add AES to address change in
Expand Down
86 changes: 26 additions & 60 deletions algo/argon2/argon2d/argon2d/opt.c
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@

#include "argon2.h"
#include "core.h"

#include "simd-utils.h"
#include "../blake2/blake2.h"
#include "../blake2/blamka-round-opt.h"

Expand All @@ -37,24 +37,28 @@

#if defined(__AVX512F__)

static void fill_block(__m512i *state, const block *ref_block,
block *next_block, int with_xor) {
static void fill_block( __m512i *state, const block *ref_block,
block *next_block, int with_xor )
{
__m512i block_XY[ARGON2_512BIT_WORDS_IN_BLOCK];
unsigned int i;

if (with_xor) {
for (i = 0; i < ARGON2_512BIT_WORDS_IN_BLOCK; i++) {
state[i] = _mm512_xor_si512(
state[i], _mm512_loadu_si512((const __m512i *)ref_block->v + i));
block_XY[i] = _mm512_xor_si512(
state[i], _mm512_loadu_si512((const __m512i *)next_block->v + i));
}
} else {
for (i = 0; i < ARGON2_512BIT_WORDS_IN_BLOCK; i++) {
block_XY[i] = state[i] = _mm512_xor_si512(
state[i], _mm512_loadu_si512((const __m512i *)ref_block->v + i));
if ( with_xor )
{
for ( i = 0; i < ARGON2_512BIT_WORDS_IN_BLOCK; i++ )
{
state[i] = _mm512_xor_si512( state[i],
_mm512_load_si512( (const __m512i*)ref_block->v + i ) );
block_XY[i] = _mm512_xor_si512( state[i],
_mm512_load_si512( (const __m512i*)next_block->v + i ) );
}
}
else
{
for ( i = 0; i < ARGON2_512BIT_WORDS_IN_BLOCK; i++ )
block_XY[i] = state[i] = _mm512_xor_si512( state[i],
_mm512_load_si512( (const __m512i*)ref_block->v + i ) );
}

BLAKE2_ROUND_1( state[ 0], state[ 1], state[ 2], state[ 3],
state[ 4], state[ 5], state[ 6], state[ 7] );
Expand All @@ -66,23 +70,10 @@ static void fill_block(__m512i *state, const block *ref_block,
BLAKE2_ROUND_2( state[ 1], state[ 3], state[ 5], state[ 7],
state[ 9], state[11], state[13], state[15] );

/*
for (i = 0; i < 2; ++i) {
BLAKE2_ROUND_1(
state[8 * i + 0], state[8 * i + 1], state[8 * i + 2], state[8 * i + 3],
state[8 * i + 4], state[8 * i + 5], state[8 * i + 6], state[8 * i + 7]);
}
for (i = 0; i < 2; ++i) {
BLAKE2_ROUND_2(
state[2 * 0 + i], state[2 * 1 + i], state[2 * 2 + i], state[2 * 3 + i],
state[2 * 4 + i], state[2 * 5 + i], state[2 * 6 + i], state[2 * 7 + i]);
}
*/

for (i = 0; i < ARGON2_512BIT_WORDS_IN_BLOCK; i++) {
state[i] = _mm512_xor_si512(state[i], block_XY[i]);
_mm512_storeu_si512((__m512i *)next_block->v + i, state[i]);
for ( i = 0; i < ARGON2_512BIT_WORDS_IN_BLOCK; i++ )
{
state[i] = _mm512_xor_si512( state[i], block_XY[i] );
_mm512_store_si512( (__m512i*)next_block->v + i, state[i] );
}
}

Expand Down Expand Up @@ -125,18 +116,6 @@ static void fill_block(__m256i *state, const block *ref_block,
BLAKE2_ROUND_2( state[ 3], state[ 7], state[11], state[15],
state[19], state[23], state[27], state[31] );

/*
for (i = 0; i < 4; ++i) {
BLAKE2_ROUND_1(state[8 * i + 0], state[8 * i + 4], state[8 * i + 1], state[8 * i + 5],
state[8 * i + 2], state[8 * i + 6], state[8 * i + 3], state[8 * i + 7]);
}
for (i = 0; i < 4; ++i) {
BLAKE2_ROUND_2(state[ 0 + i], state[ 4 + i], state[ 8 + i], state[12 + i],
state[16 + i], state[20 + i], state[24 + i], state[28 + i]);
}
*/

for (i = 0; i < ARGON2_HWORDS_IN_BLOCK; i++) {
state[i] = _mm256_xor_si256(state[i], block_XY[i]);
_mm256_store_si256((__m256i *)next_block->v + i, state[i]);
Expand All @@ -153,14 +132,14 @@ static void fill_block(__m128i *state, const block *ref_block,
if (with_xor) {
for (i = 0; i < ARGON2_OWORDS_IN_BLOCK; i++) {
state[i] = _mm_xor_si128(
state[i], _mm_loadu_si128((const __m128i *)ref_block->v + i));
state[i], _mm_load_si128((const __m128i *)ref_block->v + i));
block_XY[i] = _mm_xor_si128(
state[i], _mm_loadu_si128((const __m128i *)next_block->v + i));
state[i], _mm_load_si128((const __m128i *)next_block->v + i));
}
} else {
for (i = 0; i < ARGON2_OWORDS_IN_BLOCK; i++) {
block_XY[i] = state[i] = _mm_xor_si128(
state[i], _mm_loadu_si128((const __m128i *)ref_block->v + i));
state[i], _mm_load_si128((const __m128i *)ref_block->v + i));
}
}

Expand Down Expand Up @@ -198,22 +177,9 @@ static void fill_block(__m128i *state, const block *ref_block,
BLAKE2_ROUND( state[ 7], state[15], state[23], state[31],
state[39], state[47], state[55], state[63] );

/*
for (i = 0; i < 8; ++i) {
BLAKE2_ROUND(state[8 * i + 0], state[8 * i + 1], state[8 * i + 2],
state[8 * i + 3], state[8 * i + 4], state[8 * i + 5],
state[8 * i + 6], state[8 * i + 7]);
}
for (i = 0; i < 8; ++i) {
BLAKE2_ROUND(state[8 * 0 + i], state[8 * 1 + i], state[8 * 2 + i],
state[8 * 3 + i], state[8 * 4 + i], state[8 * 5 + i],
state[8 * 6 + i], state[8 * 7 + i]);
}
*/
for (i = 0; i < ARGON2_OWORDS_IN_BLOCK; i++) {
state[i] = _mm_xor_si128(state[i], block_XY[i]);
_mm_storeu_si128((__m128i *)next_block->v + i, state[i]);
_mm_store_si128((__m128i *)next_block->v + i, state[i]);
}
}

Expand Down
8 changes: 4 additions & 4 deletions algo/argon2/argon2d/blake2/blamka-round-opt.h
Original file line number Diff line number Diff line change
Expand Up @@ -427,14 +427,14 @@ static __m512i muladd(__m512i x, __m512i y)
#define SWAP_QUARTERS(A0, A1) \
do { \
SWAP_HALVES(A0, A1); \
A0 = _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 1, 4, 5, 2, 3, 6, 7), A0); \
A1 = _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 1, 4, 5, 2, 3, 6, 7), A1); \
A0 = _mm512_shuffle_i64x2( A0, A0, 0xd8 ); \
A1 = _mm512_shuffle_i64x2( A1, A1, 0xd8 ); \
} while((void)0, 0)

#define UNSWAP_QUARTERS(A0, A1) \
do { \
A0 = _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 1, 4, 5, 2, 3, 6, 7), A0); \
A1 = _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 1, 4, 5, 2, 3, 6, 7), A1); \
A0 = _mm512_shuffle_i64x2( A0, A0, 0xd8 ); \
A1 = _mm512_shuffle_i64x2( A1, A1, 0xd8 ); \
SWAP_HALVES(A0, A1); \
} while((void)0, 0)

Expand Down
36 changes: 29 additions & 7 deletions algo/blake/blake-hash-4way.h
Original file line number Diff line number Diff line change
Expand Up @@ -118,20 +118,42 @@ void blake256r8_8way_close(void *cc, void *dst);
// Blake-512 4 way

typedef struct {
__m256i buf[16] __attribute__ ((aligned (64)));
__m256i buf[16];
__m256i H[8];
__m256i S[4];
size_t ptr;
sph_u64 T0, T1;
} blake_4way_big_context;
} blake_4way_big_context __attribute__ ((aligned (128)));

typedef blake_4way_big_context blake512_4way_context;

void blake512_4way_init(void *cc);
void blake512_4way(void *cc, const void *data, size_t len);
void blake512_4way_close(void *cc, void *dst);
void blake512_4way_addbits_and_close(
void *cc, unsigned ub, unsigned n, void *dst);
void blake512_4way_init( void *cc );
void blake512_4way_update( void *cc, const void *data, size_t len );
#define blake512_4way blake512_4way_update
void blake512_4way_close( void *cc, void *dst );
void blake512_4way_addbits_and_close( void *cc, unsigned ub, unsigned n,
void *dst );

#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)

typedef struct {
__m512i buf[16];
__m512i H[8];
__m512i S[4];
size_t ptr;
sph_u64 T0, T1;
} blake_8way_big_context __attribute__ ((aligned (128)));

typedef blake_8way_big_context blake512_8way_context;

void blake512_8way_init( void *cc );
void blake512_8way_update( void *cc, const void *data, size_t len );
void blake512_8way_close( void *cc, void *dst );
void blake512_8way_addbits_and_close( void *cc, unsigned ub, unsigned n,
void *dst );

#endif // AVX512


#endif // AVX2

Expand Down
2 changes: 1 addition & 1 deletion algo/blake/blake2b-4way.c
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ int scanhash_blake2b_8way( struct work *work, uint32_t max_nonce,
uint32_t vdata[20*8] __attribute__ ((aligned (64)));;
uint32_t lane_hash[8] __attribute__ ((aligned (64)));
blake2b_8way_ctx ctx __attribute__ ((aligned (64)));
uint32_t *hash7 = &(hash[25]); // 3*8+1
uint32_t *hash7 = &(hash[49]); // 3*16+1
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
int thr_id = mythr->id;
Expand Down
Loading

0 comments on commit 73430b1

Please sign in to comment.