From 19b0ac6d5c0f48dd12596c44c6a0b77c1581dfff Mon Sep 17 00:00:00 2001 From: Jay D Dee Date: Thu, 13 Feb 2020 04:25:33 -0500 Subject: [PATCH] v3.12.3 --- RELEASE_NOTES | 11 + algo-gate-api.c | 64 +- algo/cubehash/cube-hash-2way.c | 16 - algo/cubehash/cube-hash-2way.h | 21 + algo/echo/echo-hash-4way.h | 10 +- algo/fugue/sph_fugue.h | 8 + algo/quark/anime-4way.c | 361 ++++++-- algo/quark/anime-gate.c | 8 +- algo/quark/anime-gate.h | 15 +- algo/quark/hmq1725-4way.c | 1379 ++++++++++-------------------- algo/quark/quark-4way.c | 318 +++---- algo/scrypt/scrypt.c | 3 +- algo/shavite/sph-shavite-aesni.c | 2 +- algo/shavite/sph_shavite.c | 5 + algo/shavite/sph_shavite.h | 31 +- algo/skein/skein-4way.c | 23 +- algo/skein/skein-hash-4way.c | 157 ++++ algo/skein/skein-hash-4way.h | 8 + algo/skein/skein2-4way.c | 37 +- algo/whirlpool/sph_whirlpool.h | 7 + algo/x13/skunk-4way.c | 74 +- algo/x16/hex.c | 25 +- algo/x16/x16r-4way.c | 152 +--- algo/x16/x16r-gate.h | 6 +- algo/x16/x16r.c | 17 +- algo/x16/x16rv2-4way.c | 303 +++--- algo/x16/x16rv2.c | 14 +- algo/x17/sonoa-4way.c | 752 +++++----------- algo/x17/x17-4way.c | 152 +--- algo/x17/xevan-4way.c | 256 ++---- configure | 20 +- configure.ac | 2 +- cpu-miner.c | 2 + simd-utils/simd-256.h | 25 +- 34 files changed, 1714 insertions(+), 2570 deletions(-) diff --git a/RELEASE_NOTES b/RELEASE_NOTES index 257fd843..4d0e4476 100644 --- a/RELEASE_NOTES +++ b/RELEASE_NOTES @@ -65,6 +65,17 @@ If not what makes it happen or not happen? Change Log ---------- +v3.12.3 + +Issue #238: Fixed skunk AVX2. + +Issue #239: Faster AVX2 & AVX512 for skein +44%, skein2 +30%, plus marginal +increases for skunk, x16r, x16rv2, x16rt, x16rt-veil, x16s, x21s. + +Faster anime VAES +57%, AVX512 +21%, AVX2 +3%. + +Redesigned code responsible for #236. + v3.12.2 Fixed xevan, skein, skein2 AVX2, #238. 
diff --git a/algo-gate-api.c b/algo-gate-api.c index c8517227..4047d94c 100644 --- a/algo-gate-api.c +++ b/algo-gate-api.c @@ -281,39 +281,37 @@ void exec_hash_function( int algo, void *output, const void *pdata ) const char* const algo_alias_map[][2] = { // alias proper - { "argon2d-crds", "argon2d250" }, - { "argon2d-dyn", "argon2d500" }, - { "argon2d-uis", "argon2d4096" }, - { "bcd", "x13bcd" }, - { "bitcore", "timetravel10" }, - { "bitzeny", "yescryptr8" }, - { "blake256r8", "blakecoin" }, - { "blake256r8vnl", "vanilla" }, - { "blake256r14", "blake" }, - { "blake256r14dcr", "decred" }, - { "cryptonote", "cryptonight" }, - { "cryptonight-light", "cryptolight" }, - { "diamond", "dmd-gr" }, - { "droplp", "drop" }, - { "espers", "hmq1725" }, - { "flax", "c11" }, - { "hsr", "x13sm3" }, - { "jackpot", "jha" }, - { "jane", "scryptjane" }, - { "lyra2", "lyra2re" }, - { "lyra2v2", "lyra2rev2" }, - { "lyra2v3", "lyra2rev3" }, - { "myrgr", "myr-gr" }, - { "myriad", "myr-gr" }, - { "neo", "neoscrypt" }, - { "phi", "phi1612" }, - { "sib", "x11gost" }, - { "timetravel8", "timetravel" }, - { "veil", "x16rt-veil" }, - { "x16r-hex", "hex" }, - { "yenten", "yescryptr16" }, - { "ziftr", "zr5" }, - { NULL, NULL } + { "argon2d-crds", "argon2d250" }, + { "argon2d-dyn", "argon2d500" }, + { "argon2d-uis", "argon2d4096" }, + { "bcd", "x13bcd" }, + { "bitcore", "timetravel10" }, + { "bitzeny", "yescryptr8" }, + { "blake256r8", "blakecoin" }, + { "blake256r8vnl", "vanilla" }, + { "blake256r14", "blake" }, + { "blake256r14dcr", "decred" }, + { "diamond", "dmd-gr" }, + { "espers", "hmq1725" }, + { "flax", "c11" }, + { "hsr", "x13sm3" }, + { "jackpot", "jha" }, + { "jane", "scryptjane" }, + { "lyra2", "lyra2re" }, + { "lyra2v2", "lyra2rev2" }, + { "lyra2v3", "lyra2rev3" }, + { "myrgr", "myr-gr" }, + { "myriad", "myr-gr" }, + { "neo", "neoscrypt" }, + { "phi", "phi1612" }, + { "scryptn2", "scrypt:1048576" }, + { "sib", "x11gost" }, + { "timetravel8", "timetravel" }, + { "veil", "x16rt-veil" 
}, + { "x16r-hex", "hex" }, + { "yenten", "yescryptr16" }, + { "ziftr", "zr5" }, + { NULL, NULL } }; // if arg is a valid alias for a known algo it is updated with the proper diff --git a/algo/cubehash/cube-hash-2way.c b/algo/cubehash/cube-hash-2way.c index 9a9dfc81..1201b8f2 100644 --- a/algo/cubehash/cube-hash-2way.c +++ b/algo/cubehash/cube-hash-2way.c @@ -179,14 +179,6 @@ int cube_4way_full( cube_4way_context *sp, void *output, int hashbitlen, sp->rounds = 16; sp->pos = 0; - h[ 0] = m512_const1_128( iv[0] ); - h[ 1] = m512_const1_128( iv[1] ); - h[ 2] = m512_const1_128( iv[2] ); - h[ 3] = m512_const1_128( iv[3] ); - h[ 4] = m512_const1_128( iv[4] ); - h[ 5] = m512_const1_128( iv[5] ); - h[ 6] = m512_const1_128( iv[6] ); - h[ 7] = m512_const1_128( iv[7] ); h[ 0] = m512_const1_128( iv[0] ); h[ 1] = m512_const1_128( iv[1] ); h[ 2] = m512_const1_128( iv[2] ); @@ -447,14 +439,6 @@ int cube_2way_full( cube_2way_context *sp, void *output, int hashbitlen, sp->rounds = 16; sp->pos = 0; - h[ 0] = m256_const1_128( iv[0] ); - h[ 1] = m256_const1_128( iv[1] ); - h[ 2] = m256_const1_128( iv[2] ); - h[ 3] = m256_const1_128( iv[3] ); - h[ 4] = m256_const1_128( iv[4] ); - h[ 5] = m256_const1_128( iv[5] ); - h[ 6] = m256_const1_128( iv[6] ); - h[ 7] = m256_const1_128( iv[7] ); h[ 0] = m256_const1_128( iv[0] ); h[ 1] = m256_const1_128( iv[1] ); h[ 2] = m256_const1_128( iv[2] ); diff --git a/algo/cubehash/cube-hash-2way.h b/algo/cubehash/cube-hash-2way.h index eddd8130..25df10e8 100644 --- a/algo/cubehash/cube-hash-2way.h +++ b/algo/cubehash/cube-hash-2way.h @@ -28,6 +28,27 @@ int cube_4way_update_close( cube_4way_context *sp, void *output, int cube_4way_full( cube_4way_context *sp, void *output, int hashbitlen, const void *data, size_t size ); +int cube_4x256_full( cube_4way_context *sp, void *output, int hashbitlen, + const void *data, size_t size ); + +#define cube512_4way_init( sp ) cube_4way_update( sp, 512 ) +#define cube512_4way_update cube_4way_update +#define 
cube512_4way_update_close cube_4way_update +#define cube512_4way_close cube_4way_update +#define cube512_4way_full( sp, output, data, size ) \ + cube_4way_full( sp, output, 512, data, size ) +#define cube512_4x256_full( sp, output, data, size ) \ + cube_4x256_full( sp, output, 512, data, size ) + +#define cube256_4way_init( sp ) cube_4way_update( sp, 256 ) +#define cube256_4way_update cube_4way_update +#define cube256_4way_update_close cube_4way_update +#define cube256_4way_close cube_4way_update +#define cube256_4way_full( sp, output, data, size ) \ + cube_4way_full( sp, output, 256, data, size ) +#define cube256_4x256_full( sp, output, data, size ) \ + cube_4x256_full( sp, output, 256, data, size ) + #endif // 2x128, 2 way parallel SSE2 diff --git a/algo/echo/echo-hash-4way.h b/algo/echo/echo-hash-4way.h index 014c789e..f9e906f2 100644 --- a/algo/echo/echo-hash-4way.h +++ b/algo/echo/echo-hash-4way.h @@ -22,18 +22,26 @@ typedef struct } echo_4way_context __attribute__ ((aligned (64))); int echo_4way_init( echo_4way_context *state, int hashbitlen ); - +#define echo512_4way_init( state ) echo_4way_init( state, 512 ) +#define echo256_4way_init( state ) echo_4way_init( state, 256 ) int echo_4way_update( echo_4way_context *state, const void *data, unsigned int databitlen); +#define echo512_4way_update echo_4way_update int echo_close( echo_4way_context *state, void *hashval ); +#define echo512_4way_close echo_4way_close int echo_4way_update_close( echo_4way_context *state, void *hashval, const void *data, int databitlen ); +#define echo512_4way_update_close echo_4way_update_close int echo_4way_full( echo_4way_context *ctx, void *hashval, int nHashSize, const void *data, int datalen ); +#define echo512_4way_full( state, hashval, data, datalen ) \ + echo_4way_full( state, hashval, 512, data, datalen ) +#define echo256_4way_full( state, hashval, data, datalen ) \ + echo_4way_full( state, hashval, 256, data, datalen ) #endif #endif diff --git a/algo/fugue/sph_fugue.h 
b/algo/fugue/sph_fugue.h index d8d0ea04..08d4dde0 100644 --- a/algo/fugue/sph_fugue.h +++ b/algo/fugue/sph_fugue.h @@ -74,6 +74,14 @@ void sph_fugue512_close(void *cc, void *dst); void sph_fugue512_addbits_and_close( void *cc, unsigned ub, unsigned n, void *dst); +#define sph_fugue512_full( cc, dst, data, len ) \ +do{ \ + sph_fugue512_init( cc ); \ + sph_fugue512( cc, data, len ); \ + sph_fugue512_close( cc, dst ); \ +}while(0) + + #ifdef __cplusplus } #endif diff --git a/algo/quark/anime-4way.c b/algo/quark/anime-4way.c index a329d593..994d2909 100644 --- a/algo/quark/anime-4way.c +++ b/algo/quark/anime-4way.c @@ -1,18 +1,241 @@ #include "cpuminer-config.h" #include "anime-gate.h" - -#if defined (ANIME_4WAY) - #include #include #include - #include "algo/blake/blake-hash-4way.h" #include "algo/bmw/bmw-hash-4way.h" #include "algo/skein/skein-hash-4way.h" #include "algo/jh/jh-hash-4way.h" #include "algo/keccak/keccak-hash-4way.h" #include "algo/groestl/aes_ni/hash-groestl.h" +#if defined(__VAES__) + #include "algo/groestl/groestl512-hash-4way.h" +#endif + +#if defined (ANIME_8WAY) + +typedef struct { + blake512_8way_context blake; + bmw512_8way_context bmw; +#if defined(__VAES__) + groestl512_4way_context groestl; +#else + hashState_groestl groestl; +#endif + jh512_8way_context jh; + skein512_8way_context skein; + keccak512_8way_context keccak; +} anime_8way_ctx_holder; + +anime_8way_ctx_holder anime_8way_ctx __attribute__ ((aligned (64))); + +void init_anime_8way_ctx() +{ + blake512_8way_init( &anime_8way_ctx.blake ); + bmw512_8way_init( &anime_8way_ctx.bmw ); +#if defined(__VAES__) + groestl512_4way_init( &anime_8way_ctx.groestl, 64 ); +#else + init_groestl( &anime_8way_ctx.groestl, 64 ); +#endif + skein512_8way_init( &anime_8way_ctx.skein ); + jh512_8way_init( &anime_8way_ctx.jh ); + keccak512_8way_init( &anime_8way_ctx.keccak ); +} + +void anime_8way_hash( void *state, const void *input ) +{ + uint64_t vhash[8*8] __attribute__ ((aligned (128))); + uint64_t 
vhashA[8*8] __attribute__ ((aligned (64))); + uint64_t vhashB[8*8] __attribute__ ((aligned (64))); + uint64_t vhashC[8*8] __attribute__ ((aligned (64))); +#if !defined(__VAES__) + uint64_t hash0[8] __attribute__ ((aligned (64))); + uint64_t hash1[8] __attribute__ ((aligned (64))); + uint64_t hash2[8] __attribute__ ((aligned (64))); + uint64_t hash3[8] __attribute__ ((aligned (64))); + uint64_t hash4[8] __attribute__ ((aligned (64))); + uint64_t hash5[8] __attribute__ ((aligned (64))); + uint64_t hash6[8] __attribute__ ((aligned (64))); + uint64_t hash7[8] __attribute__ ((aligned (64))); +#endif + __m512i* vh = (__m512i*)vhash; + __m512i* vhA = (__m512i*)vhashA; + __m512i* vhB = (__m512i*)vhashB; + __m512i* vhC = (__m512i*)vhashC; + const __m512i bit3_mask = m512_const1_64( 8 ); + const __m512i zero = _mm512_setzero_si512(); + __mmask8 vh_mask; + anime_8way_ctx_holder ctx; + memcpy( &ctx, &anime_8way_ctx, sizeof(anime_8way_ctx) ); + + bmw512_8way_full( &ctx.bmw, vhash, input, 80 ); + + blake512_8way_full( &ctx.blake, vhash, vhash, 64 ); + + vh_mask = _mm512_cmpeq_epi64_mask( _mm512_and_si512( vh[0], bit3_mask ), + zero ); + +#if defined(__VAES__) + + rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 ); + + if ( ( vh_mask & 0x0f ) != 0x0f ) + groestl512_4way_full( &ctx.groestl, vhashA, vhashA, 64 ); + if ( ( vh_mask & 0xf0 ) != 0xf0 ) + groestl512_4way_full( &ctx.groestl, vhashB, vhashB, 64 ); + + rintrlv_4x128_8x64( vhashC, vhashA, vhashB, 512 ); + +#else + + dintrlv_8x64_512( hash0, hash1, hash2, hash3, + hash4, hash5, hash6, hash7, vhash ); + + if ( hash0[0] & 8 ) + groestl512_full( &ctx.groestl, (char*)hash0, (char*)hash0, 512 ); + if ( hash1[0] & 8 ) + groestl512_full( &ctx.groestl, (char*)hash1, (char*)hash1, 512 ); + if ( hash2[0] & 8) + groestl512_full( &ctx.groestl, (char*)hash2, (char*)hash2, 512 ); + if ( hash3[0] & 8 ) + groestl512_full( &ctx.groestl, (char*)hash3, (char*)hash3, 512 ); + if ( hash4[0] & 8 ) + groestl512_full( &ctx.groestl, (char*)hash4, 
(char*)hash4, 512 ); + if ( hash5[0] & 8 ) + groestl512_full( &ctx.groestl, (char*)hash5, (char*)hash5, 512 ); + if ( hash6[0] & 8 ) + groestl512_full( &ctx.groestl, (char*)hash6, (char*)hash6, 512 ); + if ( hash7[0] & 8 ) + groestl512_full( &ctx.groestl, (char*)hash7, (char*)hash7, 512 ); + + intrlv_8x64_512( vhashC, hash0, hash1, hash2, hash3, + hash4, hash5, hash6, hash7 ); + +#endif + + if ( vh_mask & 0xff ) + skein512_8way_full( &ctx.skein, vhashB, vhash, 64 ); + + mm512_blend_hash_8x64( vh, vhC, vhB, vh_mask ); + +#if defined(__VAES__) + + rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 ); + + groestl512_4way_full( &ctx.groestl, vhashA, vhashA, 64 ); + groestl512_4way_full( &ctx.groestl, vhashB, vhashB, 64 ); + + rintrlv_4x128_8x64( vhash, vhashA, vhashB, 512 ); + +#else + + dintrlv_8x64_512( hash0, hash1, hash2, hash3, + hash4, hash5, hash6, hash7, vhash ); + + groestl512_full( &ctx.groestl, (char*)hash0, (char*)hash0, 512 ); + groestl512_full( &ctx.groestl, (char*)hash1, (char*)hash1, 512 ); + groestl512_full( &ctx.groestl, (char*)hash2, (char*)hash2, 512 ); + groestl512_full( &ctx.groestl, (char*)hash3, (char*)hash3, 512 ); + groestl512_full( &ctx.groestl, (char*)hash4, (char*)hash4, 512 ); + groestl512_full( &ctx.groestl, (char*)hash5, (char*)hash5, 512 ); + groestl512_full( &ctx.groestl, (char*)hash6, (char*)hash6, 512 ); + groestl512_full( &ctx.groestl, (char*)hash7, (char*)hash7, 512 ); + + intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, + hash4, hash5, hash6, hash7 ); + +#endif + + jh512_8way_init( &ctx.jh ); + jh512_8way_update( &ctx.jh, vhash, 64 ); + jh512_8way_close( &ctx.jh, vhash ); + + vh_mask = _mm512_cmpeq_epi64_mask( _mm512_and_si512( vh[0], bit3_mask ), + zero ); + + if ( ( vh_mask & 0xff ) != 0xff ) + blake512_8way_full( &ctx.blake, vhashA, vhash, 64 ); + if ( vh_mask & 0xff ) + bmw512_8way_full( &ctx.bmw, vhashB, vhash, 64 ); + + mm512_blend_hash_8x64( vh, vhA, vhB, vh_mask ); + + keccak512_8way_init( &ctx.keccak ); + 
keccak512_8way_update( &ctx.keccak, vhash, 64 ); + keccak512_8way_close( &ctx.keccak, vhash ); + + skein512_8way_full( &ctx.skein, vhash, vhash, 64 ); + + vh_mask = _mm512_cmpeq_epi64_mask( _mm512_and_si512( vh[0], bit3_mask ), + zero ); + + if ( ( vh_mask & 0xff ) != 0xff ) + { + keccak512_8way_init( &ctx.keccak ); + keccak512_8way_update( &ctx.keccak, vhash, 64 ); + keccak512_8way_close( &ctx.keccak, vhashA ); + } + if ( vh_mask & 0xff ) + { + jh512_8way_init( &ctx.jh ); + jh512_8way_update( &ctx.jh, vhash, 64 ); + jh512_8way_close( &ctx.jh, vhashB ); + } + + casti_m512i( state,0 ) = _mm512_mask_blend_epi64( vh_mask, vhA[0], vhB[0] ); + casti_m512i( state,1 ) = _mm512_mask_blend_epi64( vh_mask, vhA[1], vhB[1] ); + casti_m512i( state,2 ) = _mm512_mask_blend_epi64( vh_mask, vhA[2], vhB[2] ); + casti_m512i( state,3 ) = _mm512_mask_blend_epi64( vh_mask, vhA[3], vhB[3] ); +} + +int scanhash_anime_8way( struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ) +{ + uint64_t hash64[4*8] __attribute__ ((aligned (64))); + uint32_t vdata[20*8] __attribute__ ((aligned (64))); + uint32_t lane_hash[8] __attribute__ ((aligned (64))); + uint64_t *hash64_q3 = &(hash64[3*8]); + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + const uint64_t targ64_q3 = ((uint64_t*)ptarget)[3]; + uint32_t n = pdata[19]; + const uint32_t first_nonce = pdata[19]; + const uint32_t last_nonce = max_nonce - 8; + __m512i *noncev = (__m512i*)vdata + 9; + const int thr_id = mythr->id; + const bool bench = opt_benchmark; + + mm512_bswap32_intrlv80_8x64( vdata, pdata ); + *noncev = mm512_intrlv_blend_32( + _mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0, + n+3, 0, n+2, 0, n+1, 0, n , 0 ), *noncev ); + + do + { + anime_8way_hash( hash64, vdata ); + + for ( int lane = 0; lane < 8; lane++ ) + if ( unlikely( hash64_q3[ lane ] <= targ64_q3 && !bench ) ) + { + extr_lane_8x64( lane_hash, hash64, lane, 256 ); + if ( valid_hash( lane_hash, ptarget ) ) + { + 
pdata[19] = bswap_32( n + lane ); + submit_lane_solution( work, lane_hash, mythr, lane ); + } + } + *noncev = _mm512_add_epi32( *noncev, + m512_const1_64( 0x0000000800000000 ) ); + n += 8; + } while ( likely( ( n < last_nonce ) && !work_restart[thr_id].restart ) ); + pdata[19] = n; + *hashes_done = n - first_nonce; + return 0; +} + +#elif defined (ANIME_4WAY) typedef struct { blake512_4way_context blake; @@ -23,18 +246,6 @@ typedef struct { keccak512_4way_context keccak; } anime_4way_ctx_holder; -anime_4way_ctx_holder anime_4way_ctx __attribute__ ((aligned (64))); - -void init_anime_4way_ctx() -{ - blake512_4way_init( &anime_4way_ctx.blake ); - bmw512_4way_init( &anime_4way_ctx.bmw ); - init_groestl( &anime_4way_ctx.groestl, 64 ); - skein512_4way_init( &anime_4way_ctx.skein ); - jh512_4way_init( &anime_4way_ctx.jh ); - keccak512_4way_init( &anime_4way_ctx.keccak ); -} - void anime_4way_hash( void *state, const void *input ) { uint64_t hash0[8] __attribute__ ((aligned (64))); @@ -48,81 +259,61 @@ void anime_4way_hash( void *state, const void *input ) __m256i* vhA = (__m256i*)vhashA; __m256i* vhB = (__m256i*)vhashB; __m256i vh_mask; - const uint32_t mask = 8; + int h_mask; const __m256i bit3_mask = m256_const1_64( 8 ); const __m256i zero = _mm256_setzero_si256(); anime_4way_ctx_holder ctx; - memcpy( &ctx, &anime_4way_ctx, sizeof(anime_4way_ctx) ); + bmw512_4way_init( &ctx.bmw ); bmw512_4way_update( &ctx.bmw, input, 80 ); bmw512_4way_close( &ctx.bmw, vhash ); - blake512_4way_update( &ctx.blake, vhash, 64 ); - blake512_4way_close( &ctx.blake, vhash ); + blake512_4way_full( &ctx.blake, vhash, vhash, 64 ); vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ), zero ); + h_mask = _mm256_movemask_epi8( vh_mask ); dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); - if ( hash0[0] & mask ) - { - update_and_final_groestl( &ctx.groestl, (char*)hash0, - (char*)hash0, 512 ); - } - if ( hash1[0] & mask ) - { - reinit_groestl( &ctx.groestl ); - 
update_and_final_groestl( &ctx.groestl, (char*)hash1, - (char*)hash1, 512 ); - } - if ( hash2[0] & mask ) - { - reinit_groestl( &ctx.groestl ); - update_and_final_groestl( &ctx.groestl, (char*)hash2, - (char*)hash2, 512 ); - } - if ( hash3[0] & mask ) - { - reinit_groestl( &ctx.groestl ); - update_and_final_groestl( &ctx.groestl, (char*)hash3, - (char*)hash3, 512 ); - } + // A + if ( hash0[0] & 8 ) + groestl512_full( &ctx.groestl, (char*)hash0, (char*)hash0, 512 ); + if ( hash1[0] & 8 ) + groestl512_full( &ctx.groestl, (char*)hash1, (char*)hash1, 512 ); + if ( hash2[0] & 8) + groestl512_full( &ctx.groestl, (char*)hash2, (char*)hash2, 512 ); + if ( hash3[0] & 8 ) + groestl512_full( &ctx.groestl, (char*)hash3, (char*)hash3, 512 ); intrlv_4x64( vhashA, hash0, hash1, hash2, hash3, 512 ); - if ( mm256_anybits0( vh_mask ) ) - { - skein512_4way_update( &ctx.skein, vhash, 64 ); - skein512_4way_close( &ctx.skein, vhashB ); - } + // B + if ( h_mask & 0xffffffff ) + skein512_4way_full( &ctx.skein, vhashB, vhash, 64 ); mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask ); dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); - reinit_groestl( &ctx.groestl ); - update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 ); - reinit_groestl( &ctx.groestl ); - update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 ); - reinit_groestl( &ctx.groestl ); - update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 ); - reinit_groestl( &ctx.groestl ); - update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 ); + groestl512_full( &ctx.groestl, (char*)hash0, (char*)hash0, 512 ); + groestl512_full( &ctx.groestl, (char*)hash1, (char*)hash1, 512 ); + groestl512_full( &ctx.groestl, (char*)hash2, (char*)hash2, 512 ); + groestl512_full( &ctx.groestl, (char*)hash3, (char*)hash3, 512 ); intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 ); + jh512_4way_init( &ctx.jh ); jh512_4way_update( &ctx.jh, vhash, 64 ); jh512_4way_close( &ctx.jh, 
vhash ); vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ), zero ); + h_mask = _mm256_movemask_epi8( vh_mask ); - if ( mm256_anybits1( vh_mask ) ) - { - blake512_4way_init( &ctx.blake ); - blake512_4way_update( &ctx.blake, vhash, 64 ); - blake512_4way_close( &ctx.blake, vhashA ); - } - if ( mm256_anybits0( vh_mask ) ) + // A + if ( ( h_mask & 0xffffffff ) != 0xffffffff ) + blake512_4way_full( &ctx.blake, vhashA, vhash, 64 ); + // B + if ( h_mask & 0xffffffff ) { bmw512_4way_init( &ctx.bmw ); bmw512_4way_update( &ctx.bmw, vhash, 64 ); @@ -131,64 +322,74 @@ void anime_4way_hash( void *state, const void *input ) mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask ); + keccak512_4way_init( &ctx.keccak ); keccak512_4way_update( &ctx.keccak, vhash, 64 ); keccak512_4way_close( &ctx.keccak, vhash ); - skein512_4way_init( &ctx.skein ); - skein512_4way_update( &ctx.skein, vhash, 64 ); - skein512_4way_close( &ctx.skein, vhash ); + skein512_4way_full( &ctx.skein, vhash, vhash, 64 ); vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ), zero ); + h_mask = _mm256_movemask_epi8( vh_mask ); - if ( mm256_anybits1( vh_mask ) ) + // A + if ( ( h_mask & 0xffffffff ) != 0xffffffff ) { keccak512_4way_init( &ctx.keccak ); keccak512_4way_update( &ctx.keccak, vhash, 64 ); keccak512_4way_close( &ctx.keccak, vhashA ); } - if ( mm256_anybits0( vh_mask ) ) + // B + if ( h_mask & 0xffffffff ) { jh512_4way_init( &ctx.jh ); jh512_4way_update( &ctx.jh, vhash, 64 ); jh512_4way_close( &ctx.jh, vhashB ); } - mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask ); - - dintrlv_4x64( state, state+32, state+64, state+96, vhash, 256 ); + casti_m256i( state, 0 ) = _mm256_blendv_epi8( vhA[0], vhB[0], vh_mask ); + casti_m256i( state, 1 ) = _mm256_blendv_epi8( vhA[1], vhB[1], vh_mask ); + casti_m256i( state, 2 ) = _mm256_blendv_epi8( vhA[2], vhB[2], vh_mask ); + casti_m256i( state, 3 ) = _mm256_blendv_epi8( vhA[3], vhB[3], vh_mask ); } int scanhash_anime_4way( struct work *work, uint32_t 
max_nonce, uint64_t *hashes_done, struct thr_info *mythr ) { - uint32_t hash[4*8] __attribute__ ((aligned (64))); + uint64_t hash64[4*4] __attribute__ ((aligned (64))); uint32_t vdata[20*4] __attribute__ ((aligned (64))); + uint32_t lane_hash[8] __attribute__ ((aligned (64))); + uint64_t *hash64_q3 = &(hash64[3*4]); uint32_t *pdata = work->data; uint32_t *ptarget = work->target; + const uint64_t targ64_q3 = ((uint64_t*)ptarget)[3]; uint32_t n = pdata[19]; const uint32_t first_nonce = pdata[19]; const uint32_t last_nonce = max_nonce - 4; - __m256i *noncev = (__m256i*)vdata + 9; // aligned + __m256i *noncev = (__m256i*)vdata + 9; const int thr_id = mythr->id; + const bool bench = opt_benchmark; mm256_bswap32_intrlv80_4x64( vdata, pdata ); *noncev = mm256_intrlv_blend_32( _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev ); - do { - anime_4way_hash( hash, vdata ); + anime_4way_hash( hash64, vdata ); - for ( int i = 0; i < 4; i++ ) - if ( valid_hash( hash+(i<<3), ptarget ) && !opt_benchmark ) + for ( int lane = 0; lane < 4; lane++ ) + if ( unlikely( hash64_q3[ lane ] <= targ64_q3 && !bench ) ) { - pdata[19] = bswap_32( n+i ); - submit_solution( work, hash+(i<<3), mythr ); + extr_lane_4x64( lane_hash, hash64, lane, 256 ); + if ( valid_hash( lane_hash, ptarget ) ) + { + pdata[19] = bswap_32( n + lane ); + submit_lane_solution( work, lane_hash, mythr, lane ); + } } *noncev = _mm256_add_epi32( *noncev, m256_const1_64( 0x0000000400000000 ) ); n += 4; - } while ( ( n < last_nonce ) && !work_restart[thr_id].restart ); + } while ( likely( ( n < last_nonce ) && !work_restart[thr_id].restart ) ); pdata[19] = n; *hashes_done = n - first_nonce; return 0; diff --git a/algo/quark/anime-gate.c b/algo/quark/anime-gate.c index 53a06e1d..5e03c4aa 100644 --- a/algo/quark/anime-gate.c +++ b/algo/quark/anime-gate.c @@ -2,8 +2,10 @@ bool register_anime_algo( algo_gate_t* gate ) { -#if defined (ANIME_4WAY) - init_anime_4way_ctx(); +#if defined (ANIME_8WAY) + gate->scanhash = 
(void*)&scanhash_anime_8way; + gate->hash = (void*)&anime_8way_hash; +#elif defined (ANIME_4WAY) gate->scanhash = (void*)&scanhash_anime_4way; gate->hash = (void*)&anime_4way_hash; #else @@ -11,7 +13,7 @@ bool register_anime_algo( algo_gate_t* gate ) gate->scanhash = (void*)&scanhash_anime; gate->hash = (void*)&anime_hash; #endif - gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT; + gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT; return true; }; diff --git a/algo/quark/anime-gate.h b/algo/quark/anime-gate.h index fdf34b4c..a7b08376 100644 --- a/algo/quark/anime-gate.h +++ b/algo/quark/anime-gate.h @@ -4,18 +4,25 @@ #include "algo-gate-api.h" #include -#if defined(__AVX2__) && defined(__AES__) - #define ANIME_4WAY +#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) + #define ANIME_8WAY 1 +#elif defined(__AVX2__) && defined(__AES__) + #define ANIME_4WAY 1 #endif bool register_anime_algo( algo_gate_t* gate ); -#if defined(ANIME_4WAY) +#if defined(ANIME_8WAY) + +void anime_8way_hash( void *state, const void *input ); +int scanhash_anime_8way( struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ); + +#elif defined(ANIME_4WAY) void anime_4way_hash( void *state, const void *input ); int scanhash_anime_4way( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ); -void init_anime_4way_ctx(); #endif diff --git a/algo/quark/hmq1725-4way.c b/algo/quark/hmq1725-4way.c index 22d249f2..fd78d5b4 100644 --- a/algo/quark/hmq1725-4way.c +++ b/algo/quark/hmq1725-4way.c @@ -81,127 +81,68 @@ extern void hmq1725_8way_hash(void *state, const void *input) __m512i* vhB = (__m512i*)vhashB; __m512i* vhC = (__m512i*)vhashC; - bmw512_8way_init( &ctx.bmw ); - bmw512_8way_update( &ctx.bmw, input, 80 ); - bmw512_8way_close( &ctx.bmw, vhash ); - - dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, - vhash ); - - 
sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash0, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash0 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash1, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash1 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash2, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash2 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash3, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash3 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash4, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash4 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash5, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash5 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash6, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash6 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash7, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash7 ); - - intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, - hash7 ); + bmw512_8way_full( &ctx.bmw, vhash, input, 80 ); + + dintrlv_8x64_512( hash0, hash1, hash2, hash3, + hash4, hash5, hash6, hash7, vhash ); + + sph_whirlpool512_full( &ctx.whirlpool, hash0, hash0, 64 ); + sph_whirlpool512_full( &ctx.whirlpool, hash1, hash1, 64 ); + sph_whirlpool512_full( &ctx.whirlpool, hash2, hash2, 64 ); + sph_whirlpool512_full( &ctx.whirlpool, hash3, hash3, 64 ); + sph_whirlpool512_full( &ctx.whirlpool, hash4, hash4, 64 ); + sph_whirlpool512_full( &ctx.whirlpool, hash5, hash5, 64 ); + sph_whirlpool512_full( &ctx.whirlpool, hash6, hash6, 64 ); + sph_whirlpool512_full( &ctx.whirlpool, hash7, hash7, 64 ); + + intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, + hash4, hash5, hash6, hash7 ); + vh_mask = _mm512_cmpeq_epi64_mask( _mm512_and_si512( vh[0], vmask ), m512_zero ); // A - #if defined(__VAES__) 
rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 ); - if ( likely( ( vh_mask & 0x0f ) != 0x0f ) ) - { - groestl512_4way_init( &ctx.groestl, 64 ); - groestl512_4way_update_close( &ctx.groestl, vhashA, vhashA, 512 ); - } - if ( likely( ( vh_mask & 0xf0 ) != 0xf0 ) ) - { - groestl512_4way_init( &ctx.groestl, 64 ); - groestl512_4way_update_close( &ctx.groestl, vhashB, vhashB, 512 ); - } + if ( ( vh_mask & 0x0f ) != 0x0f ) + groestl512_4way_full( &ctx.groestl, vhashA, vhashA, 64 ); + if ( ( vh_mask & 0xf0 ) != 0xf0 ) + groestl512_4way_full( &ctx.groestl, vhashB, vhashB, 64 ); + rintrlv_4x128_8x64( vhashC, vhashA, vhashB, 512 ); #else - dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, - vhash ); + dintrlv_8x64_512( hash0, hash1, hash2, hash3, + hash4, hash5, hash6, hash7, vhash ); if ( hash0[0] & mask ) - { - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash0, - (char*)hash0, 512 ); - } + groestl512_full( &ctx.groestl, (char*)hash0, (char*)hash0, 512 ); if ( hash1[0] & mask ) - { - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash1, - (char*)hash1, 512 ); - } + groestl512_full( &ctx.groestl, (char*)hash1, (char*)hash1, 512 ); if ( hash2[0] & mask ) - { - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash2, - (char*)hash2, 512 ); - } + groestl512_full( &ctx.groestl, (char*)hash2, (char*)hash2, 512 ); if ( hash3[0] & mask ) - { - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash3, - (char*)hash3, 512 ); - } + groestl512_full( &ctx.groestl, (char*)hash3, (char*)hash3, 512 ); if ( hash4[0] & mask ) - { - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash4, - (char*)hash4, 512 ); - } + groestl512_full( &ctx.groestl, (char*)hash4, (char*)hash4, 512 ); if ( hash5[0] & mask ) - { - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash5, - 
(char*)hash5, 512 ); - } + groestl512_full( &ctx.groestl, (char*)hash5, (char*)hash5, 512 ); if ( hash6[0] & mask ) - { - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash6, - (char*)hash6, 512 ); - } + groestl512_full( &ctx.groestl, (char*)hash6, (char*)hash6, 512 ); if ( hash7[0] & mask ) - { - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash7, - (char*)hash7, 512 ); - } + groestl512_full( &ctx.groestl, (char*)hash7, (char*)hash7, 512 ); - intrlv_8x64_512( vhashC, hash0, hash1, hash2, hash3, hash4, hash5, hash6, - hash7 ); + intrlv_8x64_512( vhashC, hash0, hash1, hash2, hash3, + hash4, hash5, hash6, hash7 ); #endif // B if ( likely( vh_mask & 0xff ) ) - { - skein512_8way_init( &ctx.skein ); - skein512_8way_update( &ctx.skein, vhash, 64 ); - skein512_8way_close( &ctx.skein, vhashB ); - } + skein512_8way_full( &ctx.skein, vhashB, vhash, 64 ); mm512_blend_hash_8x64( vh, vhC, vhB, vh_mask ); @@ -216,32 +157,21 @@ extern void hmq1725_8way_hash(void *state, const void *input) vh_mask = _mm512_cmpeq_epi64_mask( _mm512_and_si512( vh[0], vmask ), m512_zero ); - if ( likely( ( vh_mask & 0xff ) != 0xff ) ) - { - blake512_8way_init( &ctx.blake ); - blake512_8way_update( &ctx.blake, vhash, 64 ); - blake512_8way_close( &ctx.blake, vhashA ); - } - - if ( likely( vh_mask & 0xff ) ) - { - bmw512_8way_init( &ctx.bmw ); - bmw512_8way_update( &ctx.bmw, vhash, 64 ); - bmw512_8way_close( &ctx.bmw, vhashB ); - } + // A + if ( ( vh_mask & 0xff ) != 0xff ) + blake512_8way_full( &ctx.blake, vhashA, vhash, 64 ); + // B + if ( vh_mask & 0xff ) + bmw512_8way_full( &ctx.bmw, vhashB, vhash, 64 ); mm512_blend_hash_8x64( vh, vhA, vhB, vh_mask ); rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 ); - luffa_4way_init( &ctx.luffa, 512 ); - luffa_4way_update_close( &ctx.luffa, vhashA, vhashA, 64 ); - luffa_4way_init( &ctx.luffa, 512 ); - luffa_4way_update_close( &ctx.luffa, vhashB, vhashB, 64 ); + luffa512_4way_full( 
&ctx.luffa, vhashA, vhashA, 64 ); + luffa512_4way_full( &ctx.luffa, vhashB, vhashB, 64 ); - cube_4way_init( &ctx.cube, 512, 16, 32 ); - cube_4way_update_close( &ctx.cube, vhashA, vhashA, 64 ); - cube_4way_init( &ctx.cube, 512, 16, 32 ); - cube_4way_update_close( &ctx.cube, vhashB, vhashB, 64 ); + cube_4way_full( &ctx.cube, vhashA, 512, vhashA, 64 ); + cube_4way_full( &ctx.cube, vhashB, 512, vhashB, 64 ); rintrlv_4x128_8x64( vhash, vhashA, vhashB, 512 ); vh_mask = _mm512_cmpeq_epi64_mask( _mm512_and_si512( vh[0], vmask ), @@ -267,114 +197,60 @@ extern void hmq1725_8way_hash(void *state, const void *input) rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 ); - shavite512_4way_init( &ctx.shavite ); - shavite512_4way_update_close( &ctx.shavite, vhashA, vhashA, 64 ); - shavite512_4way_init( &ctx.shavite ); - shavite512_4way_update_close( &ctx.shavite, vhashB, vhashB, 64 ); + shavite512_4way_full( &ctx.shavite, vhashA, vhashA, 64 ); + shavite512_4way_full( &ctx.shavite, vhashB, vhashB, 64 ); - rintrlv_4x128_8x64( vhash, vhashA, vhashB, 512 ); - #else - dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, - vhash ); - - sph_shavite512_init( &ctx.shavite ); - sph_shavite512 ( &ctx.shavite, hash0, 64 ); - sph_shavite512_close( &ctx.shavite, hash0 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512 ( &ctx.shavite, hash1, 64 ); - sph_shavite512_close( &ctx.shavite, hash1 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512 ( &ctx.shavite, hash2, 64 ); - sph_shavite512_close( &ctx.shavite, hash2 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512 ( &ctx.shavite, hash3, 64 ); - sph_shavite512_close( &ctx.shavite, hash3 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512 ( &ctx.shavite, hash4, 64 ); - sph_shavite512_close( &ctx.shavite, hash4 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512 ( &ctx.shavite, hash5, 64 ); - sph_shavite512_close( &ctx.shavite, hash5 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512 ( 
&ctx.shavite, hash6, 64 ); - sph_shavite512_close( &ctx.shavite, hash6 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512 ( &ctx.shavite, hash7, 64 ); - sph_shavite512_close( &ctx.shavite, hash7 ); + dintrlv_8x64_512( hash0, hash1, hash2, hash3, + hash4, hash5, hash6, hash7, vhash ); + + shavite512_full( &ctx.shavite, hash0, hash0, 64 ); + shavite512_full( &ctx.shavite, hash1, hash1, 64 ); + shavite512_full( &ctx.shavite, hash2, hash2, 64 ); + shavite512_full( &ctx.shavite, hash3, hash3, 64 ); + shavite512_full( &ctx.shavite, hash4, hash4, 64 ); + shavite512_full( &ctx.shavite, hash5, hash5, 64 ); + shavite512_full( &ctx.shavite, hash6, hash6, 64 ); + shavite512_full( &ctx.shavite, hash7, hash7, 64 ); intrlv_4x128_512( vhashA, hash0, hash1, hash2, hash3 ); intrlv_4x128_512( vhashB, hash4, hash5, hash6, hash7 ); #endif - simd_4way_init( &ctx.simd, 512 ); - simd_4way_update_close( &ctx.simd, vhashA, vhashA, 512 ); - simd_4way_init( &ctx.simd, 512 ); - simd_4way_update_close( &ctx.simd, vhashB, vhashB, 512 ); + simd512_4way_full( &ctx.simd, vhashA, vhashA, 64 ); + simd512_4way_full( &ctx.simd, vhashB, vhashB, 64 ); rintrlv_4x128_8x64( vhash, vhashA, vhashB, 512 ); vh_mask = _mm512_cmpeq_epi64_mask( _mm512_and_si512( vh[0], vmask ), m512_zero ); - dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, - vhash ); + dintrlv_8x64_512( hash0, hash1, hash2, hash3, + hash4, hash5, hash6, hash7, vhash ); // 4x32 for haval - intrlv_8x32_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, - hash7 ); + intrlv_8x32_512( vhash, hash0, hash1, hash2, hash3, + hash4, hash5, hash6, hash7 ); // A if ( hash0[0] & mask ) - { - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash0, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash0 ); - } + sph_whirlpool512_full( &ctx.whirlpool, hash0, hash0, 64 ); if ( hash1[0] & mask ) - { - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash1, 64 ); - 
sph_whirlpool_close( &ctx.whirlpool, hash1 ); - } + sph_whirlpool512_full( &ctx.whirlpool, hash1, hash1, 64 ); if ( hash2[0] & mask ) - { - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash2, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash2 ); - } + sph_whirlpool512_full( &ctx.whirlpool, hash2, hash2, 64 ); if ( hash3[0] & mask ) - { - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash3, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash3 ); - } + sph_whirlpool512_full( &ctx.whirlpool, hash3, hash3, 64 ); if ( hash4[0] & mask ) - { - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash4, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash4 ); - } + sph_whirlpool512_full( &ctx.whirlpool, hash4, hash4, 64 ); if ( hash5[0] & mask ) - { - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash5, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash5 ); - } + sph_whirlpool512_full( &ctx.whirlpool, hash5, hash5, 64 ); if ( hash6[0] & mask ) - { - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash6, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash6 ); - } + sph_whirlpool512_full( &ctx.whirlpool, hash6, hash6, 64 ); if ( hash7[0] & mask ) - { - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash7, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash7 ); - } + sph_whirlpool512_full( &ctx.whirlpool, hash7, hash7, 64 ); - intrlv_8x64_512( vhashA, hash0, hash1, hash2, hash3, hash4, hash5, hash6, - hash7 ); + intrlv_8x64_512( vhashA, hash0, hash1, hash2, hash3, + hash4, hash5, hash6, hash7 ); // B if ( likely( vh_mask & 0xff ) ) @@ -392,51 +268,39 @@ extern void hmq1725_8way_hash(void *state, const void *input) rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 ); - echo_4way_init( &ctx.echo, 512 ); - echo_4way_update_close( &ctx.echo, vhashA, vhashA, 512 ); - echo_4way_init( &ctx.echo, 512 ); - echo_4way_update_close( &ctx.echo, vhashB, vhashB, 
512 ); + echo_4way_full( &ctx.echo, vhashA, 512, vhashA, 64 ); + echo_4way_full( &ctx.echo, vhashB, 512, vhashB, 64 ); rintrlv_4x128_8x64( vhash, vhashA, vhashB, 512 ); #else - dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, - vhash ); - - init_echo( &ctx.echo, 512 ); - update_final_echo( &ctx.echo, (BitSequence *)hash0, - (const BitSequence *)hash0, 512 ); - init_echo( &ctx.echo, 512 ); - update_final_echo( &ctx.echo, (BitSequence *)hash1, - (const BitSequence *)hash1, 512 ); - init_echo( &ctx.echo, 512 ); - update_final_echo( &ctx.echo, (BitSequence *)hash2, - (const BitSequence *)hash2, 512 ); - init_echo( &ctx.echo, 512 ); - update_final_echo( &ctx.echo, (BitSequence *)hash3, - (const BitSequence *)hash3, 512 ); - init_echo( &ctx.echo, 512 ); - update_final_echo( &ctx.echo, (BitSequence *)hash4, - (const BitSequence *)hash4, 512 ); - init_echo( &ctx.echo, 512 ); - update_final_echo( &ctx.echo, (BitSequence *)hash5, - (const BitSequence *)hash5, 512 ); - init_echo( &ctx.echo, 512 ); - update_final_echo( &ctx.echo, (BitSequence *)hash6, - (const BitSequence *)hash6, 512 ); - init_echo( &ctx.echo, 512 ); - update_final_echo( &ctx.echo, (BitSequence *)hash7, - (const BitSequence *)hash7, 512 ); - - intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, - hash7 ); + dintrlv_8x64_512( hash0, hash1, hash2, hash3, + hash4, hash5, hash6, hash7, vhash ); + + echo_full( &ctx.echo, (BitSequence *)hash0, 512, + (const BitSequence *)hash0, 64 ); + echo_full( &ctx.echo, (BitSequence *)hash1, 512, + (const BitSequence *)hash1, 64 ); + echo_full( &ctx.echo, (BitSequence *)hash2, 512, + (const BitSequence *)hash2, 64 ); + echo_full( &ctx.echo, (BitSequence *)hash3, 512, + (const BitSequence *)hash3, 64 ); + echo_full( &ctx.echo, (BitSequence *)hash4, 512, + (const BitSequence *)hash4, 64 ); + echo_full( &ctx.echo, (BitSequence *)hash5, 512, + (const BitSequence *)hash5, 64 ); + echo_full( &ctx.echo, (BitSequence *)hash6, 512, + (const 
BitSequence *)hash6, 64 ); + echo_full( &ctx.echo, (BitSequence *)hash7, 512, + (const BitSequence *)hash7, 64 ); + + intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, + hash4, hash5, hash6, hash7 ); #endif - blake512_8way_init( &ctx.blake ); - blake512_8way_update( &ctx.blake, vhash, 64 ); - blake512_8way_close( &ctx.blake, vhash ); + blake512_8way_full( &ctx.blake, vhash, vhash, 64 ); vh_mask = _mm512_cmpeq_epi64_mask( _mm512_and_si512( vh[0], vmask ), m512_zero ); @@ -447,74 +311,36 @@ extern void hmq1725_8way_hash(void *state, const void *input) rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 ); if ( likely( ( vh_mask & 0x0f ) != 0x0f ) ) - { - shavite512_4way_init( &ctx.shavite ); - shavite512_4way_update_close( &ctx.shavite, vhashA, vhashA, 64 ); - } + shavite512_4way_full( &ctx.shavite, vhashA, vhashA, 64 ); if ( likely( ( vh_mask & 0xf0 ) != 0xf0 ) ) - { - shavite512_4way_init( &ctx.shavite ); - shavite512_4way_update_close( &ctx.shavite, vhashB, vhashB, 64 ); - } + shavite512_4way_full( &ctx.shavite, vhashB, vhashB, 64 ); rintrlv_4x128_8x64( vhashC, vhashA, vhashB, 512 ); #else - dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, - vhash ); + dintrlv_8x64_512( hash0, hash1, hash2, hash3, + hash4, hash5, hash6, hash7, vhash ); if ( hash0[0] & mask ) - { - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash0, 64 ); // - sph_shavite512_close( &ctx.shavite, hash0 ); //8 - } + shavite512_full( &ctx.shavite, hash0, hash0, 64 ); // if ( hash1[0] & mask ) - { - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash1, 64 ); // - sph_shavite512_close( &ctx.shavite, hash1 ); //8 - } + shavite512_full( &ctx.shavite, hash1, hash1, 64 ); // if ( hash2[0] & mask ) - { - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash2, 64 ); // - sph_shavite512_close( &ctx.shavite, hash2 ); //8 - } + shavite512_full( &ctx.shavite, hash2, hash2, 64 ); // if ( hash3[0] & mask ) - { - 
sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash3, 64 ); // - sph_shavite512_close( &ctx.shavite, hash3 ); //8 - } + shavite512_full( &ctx.shavite, hash3, hash3, 64 ); // if ( hash4[0] & mask ) - { - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash4, 64 ); // - sph_shavite512_close( &ctx.shavite, hash4 ); //8 - } + shavite512_full( &ctx.shavite, hash4, hash4, 64 ); // if ( hash5[0] & mask ) - { - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash5, 64 ); // - sph_shavite512_close( &ctx.shavite, hash5 ); //8 - } + shavite512_full( &ctx.shavite, hash5, hash5, 64 ); // if ( hash6[0] & mask ) - { - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash6, 64 ); // - sph_shavite512_close( &ctx.shavite, hash6 ); //8 - } + shavite512_full( &ctx.shavite, hash6, hash6, 64 ); // if ( hash7[0] & mask ) - { - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash7, 64 ); // - sph_shavite512_close( &ctx.shavite, hash7 ); //8 - } + shavite512_full( &ctx.shavite, hash7, hash7, 64 ); // - intrlv_8x64_512( vhashC, hash0, hash1, hash2, hash3, hash4, hash5, hash6, - hash7 ); + intrlv_8x64_512( vhashC, hash0, hash1, hash2, hash3, + hash4, hash5, hash6, hash7 ); #endif @@ -522,15 +348,9 @@ extern void hmq1725_8way_hash(void *state, const void *input) rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 ); if ( likely( vh_mask & 0x0f ) ) - { - luffa_4way_init( &ctx.luffa, 512 ); - luffa_4way_update_close( &ctx.luffa, vhashA, vhashA, 64 ); - } + luffa512_4way_full( &ctx.luffa, vhashA, vhashA, 64 ); if ( likely( vh_mask & 0xf0 ) ) - { - luffa_4way_init( &ctx.luffa, 512 ); - luffa_4way_update_close( &ctx.luffa, vhash, vhashB, 64 ); - } + luffa512_4way_full( &ctx.luffa, vhash, vhashB, 64 ); rintrlv_4x128_8x64( vhashB, vhashA, vhash, 512 ); @@ -540,110 +360,64 @@ extern void hmq1725_8way_hash(void *state, const void *input) hamsi512_8way_update( &ctx.hamsi, vhash, 64 ); 
hamsi512_8way_close( &ctx.hamsi, vhash ); - dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, - vhash ); - - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash0, 64 ); - sph_fugue512_close( &ctx.fugue, hash0 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash1, 64 ); - sph_fugue512_close( &ctx.fugue, hash1 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash2, 64 ); - sph_fugue512_close( &ctx.fugue, hash2 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash3, 64 ); - sph_fugue512_close( &ctx.fugue, hash3 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash4, 64 ); - sph_fugue512_close( &ctx.fugue, hash4 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash5, 64 ); - sph_fugue512_close( &ctx.fugue, hash5 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash6, 64 ); - sph_fugue512_close( &ctx.fugue, hash6 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash7, 64 ); - sph_fugue512_close( &ctx.fugue, hash7 ); - - intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, - hash7 ); + dintrlv_8x64_512( hash0, hash1, hash2, hash3, + hash4, hash5, hash6, hash7, vhash ); + + sph_fugue512_full( &ctx.fugue, hash0, hash0, 64 ); + sph_fugue512_full( &ctx.fugue, hash1, hash1, 64 ); + sph_fugue512_full( &ctx.fugue, hash2, hash2, 64 ); + sph_fugue512_full( &ctx.fugue, hash3, hash3, 64 ); + sph_fugue512_full( &ctx.fugue, hash4, hash4, 64 ); + sph_fugue512_full( &ctx.fugue, hash5, hash5, 64 ); + sph_fugue512_full( &ctx.fugue, hash6, hash6, 64 ); + sph_fugue512_full( &ctx.fugue, hash7, hash7, 64 ); + + intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, + hash4, hash5, hash6, hash7 ); vh_mask = _mm512_cmpeq_epi64_mask( _mm512_and_si512( vh[0], vmask ), m512_zero ); // A #if defined(__VAES__) - rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 ); + rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 ); if ( likely( ( 
vh_mask & 0x0f ) != 0x0f ) ) - { - echo_4way_init( &ctx.echo, 512 ); - echo_4way_update_close( &ctx.echo, vhashA, vhashA, 512 ); - } + echo_4way_full( &ctx.echo, vhashA, 512, vhashA, 64 ); if ( likely( ( vh_mask & 0xf0 ) != 0xf0 ) ) - { - echo_4way_init( &ctx.echo, 512 ); - echo_4way_update_close( &ctx.echo, vhashB, vhashB, 512 ); - } + echo_4way_full( &ctx.echo, vhashB, 512, vhashB, 64 ); rintrlv_4x128_8x64( vhashC, vhashA, vhashB, 512 ); #else if ( hash0[0] & mask ) //4 - { - init_echo( &ctx.echo, 512 ); - update_final_echo( &ctx.echo, (BitSequence *)hash0, - (const BitSequence *)hash0, 512 ); - } + echo_full( &ctx.echo, (BitSequence *)hash0, 512, + (const BitSequence *)hash0, 64 ); if ( hash1[0] & mask ) //4 - { - init_echo( &ctx.echo, 512 ); - update_final_echo( &ctx.echo, (BitSequence *)hash1, - (const BitSequence *)hash1, 512 ); - } + echo_full( &ctx.echo, (BitSequence *)hash1, 512, + (const BitSequence *)hash1, 64 ); if ( hash2[0] & mask ) //4 - { - init_echo( &ctx.echo, 512 ); - update_final_echo( &ctx.echo, (BitSequence *)hash2, - (const BitSequence *)hash2, 512 ); - } + echo_full( &ctx.echo, (BitSequence *)hash2, 512, + (const BitSequence *)hash2, 64 ); if ( hash3[0] & mask ) //4 - { - init_echo( &ctx.echo, 512 ); - update_final_echo( &ctx.echo, (BitSequence *)hash3, - (const BitSequence *)hash3, 512 ); - } + echo_full( &ctx.echo, (BitSequence *)hash3, 512, + (const BitSequence *)hash3, 64 ); if ( hash4[0] & mask ) //4 - { - init_echo( &ctx.echo, 512 ); - update_final_echo( &ctx.echo, (BitSequence *)hash4, - (const BitSequence *)hash4, 512 ); - } + echo_full( &ctx.echo, (BitSequence *)hash4, 512, + (const BitSequence *)hash4, 64 ); if ( hash5[0] & mask ) //4 - { - init_echo( &ctx.echo, 512 ); - update_final_echo( &ctx.echo, (BitSequence *)hash5, - (const BitSequence *)hash5, 512 ); - } + echo_full( &ctx.echo, (BitSequence *)hash5, 512, + (const BitSequence *)hash5, 64 ); if ( hash6[0] & mask ) //4 - { - init_echo( &ctx.echo, 512 ); - update_final_echo( 
&ctx.echo, (BitSequence *)hash6, - (const BitSequence *)hash6, 512 ); - } + echo_full( &ctx.echo, (BitSequence *)hash6, 512, + (const BitSequence *)hash6, 64 ); if ( hash7[0] & mask ) //4 - { - init_echo( &ctx.echo, 512 ); - update_final_echo( &ctx.echo, (BitSequence *)hash7, - (const BitSequence *)hash7, 512 ); - } + echo_full( &ctx.echo, (BitSequence *)hash7, 512, + (const BitSequence *)hash7, 64 ); - intrlv_8x64_512( vhashC, hash0, hash1, hash2, hash3, hash4, hash5, hash6, - hash7 ); + intrlv_8x64_512( vhashC, hash0, hash1, hash2, hash3, + hash4, hash5, hash6, hash7 ); #endif @@ -651,15 +425,9 @@ extern void hmq1725_8way_hash(void *state, const void *input) rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 ); if ( likely( vh_mask & 0x0f ) ) - { - simd_4way_init( &ctx.simd, 512 ); - simd_4way_update_close( &ctx.simd, vhashA, vhashA, 512 ); - } + simd512_4way_full( &ctx.simd, vhashA, vhashA, 64 ); if ( likely( vh_mask & 0xf0 ) ) - { - simd_4way_init( &ctx.simd, 512 ); - simd_4way_update_close( &ctx.simd, vhash, vhashB, 512 ); - } + simd512_4way_full( &ctx.simd, vhash, vhashB, 64 ); rintrlv_4x128_8x64( vhashB, vhashA, vhash, 512 ); @@ -671,92 +439,44 @@ extern void hmq1725_8way_hash(void *state, const void *input) shabal512_8way_update( &ctx.shabal, vhashA, 64 ); shabal512_8way_close( &ctx.shabal, vhash ); - dintrlv_8x32_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, - vhash ); - - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash0, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash0 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash1, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash1 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash2, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash2 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash3, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash3 ); - sph_whirlpool_init( &ctx.whirlpool ); - 
sph_whirlpool( &ctx.whirlpool, hash4, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash4 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash5, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash5 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash6, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash6 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash7, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash7 ); + dintrlv_8x32_512( hash0, hash1, hash2, hash3, + hash4, hash5, hash6, hash7, vhash ); + + sph_whirlpool512_full( &ctx.whirlpool, hash0, hash0, 64 ); + sph_whirlpool512_full( &ctx.whirlpool, hash1, hash1, 64 ); + sph_whirlpool512_full( &ctx.whirlpool, hash2, hash2, 64 ); + sph_whirlpool512_full( &ctx.whirlpool, hash3, hash3, 64 ); + sph_whirlpool512_full( &ctx.whirlpool, hash4, hash4, 64 ); + sph_whirlpool512_full( &ctx.whirlpool, hash5, hash5, 64 ); + sph_whirlpool512_full( &ctx.whirlpool, hash6, hash6, 64 ); + sph_whirlpool512_full( &ctx.whirlpool, hash7, hash7, 64 ); // A - intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, - hash7 ); + intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, + hash4, hash5, hash6, hash7 ); vh_mask = _mm512_cmpeq_epi64_mask( _mm512_and_si512( vh[0], vmask ), m512_zero ); if ( hash0[0] & mask ) - { - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash0, 64 ); - sph_fugue512_close( &ctx.fugue, hash0 ); - } + sph_fugue512_full( &ctx.fugue, hash0, hash0, 64 ); if ( hash1[0] & mask ) - { - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash1, 64 ); - sph_fugue512_close( &ctx.fugue, hash1 ); - } + sph_fugue512_full( &ctx.fugue, hash1, hash1, 64 ); if ( hash2[0] & mask ) - { - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash2, 64 ); - sph_fugue512_close( &ctx.fugue, hash2 ); - } + sph_fugue512_full( &ctx.fugue, hash2, hash2, 64 ); if ( hash3[0] & mask ) - { - sph_fugue512_init( &ctx.fugue 
); - sph_fugue512( &ctx.fugue, hash3, 64 ); - sph_fugue512_close( &ctx.fugue, hash3 ); - } + sph_fugue512_full( &ctx.fugue, hash3, hash3, 64 ); if ( hash4[0] & mask ) - { - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash4, 64 ); - sph_fugue512_close( &ctx.fugue, hash4 ); - } + sph_fugue512_full( &ctx.fugue, hash4, hash4, 64 ); if ( hash5[0] & mask ) - { - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash5, 64 ); - sph_fugue512_close( &ctx.fugue, hash5 ); - } + sph_fugue512_full( &ctx.fugue, hash5, hash5, 64 ); if ( hash6[0] & mask ) - { - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash6, 64 ); - sph_fugue512_close( &ctx.fugue, hash6 ); - } + sph_fugue512_full( &ctx.fugue, hash6, hash6, 64 ); if ( hash7[0] & mask ) - { - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash7, 64 ); - sph_fugue512_close( &ctx.fugue, hash7 ); - } + sph_fugue512_full( &ctx.fugue, hash7, hash7, 64 ); - intrlv_8x64_512( vhashA, hash0, hash1, hash2, hash3, hash4, hash5, hash6, - hash7 ); + intrlv_8x64_512( vhashA, hash0, hash1, hash2, hash3, + hash4, hash5, hash6, hash7 ); // B if ( likely( vh_mask & 0xff ) ) @@ -770,39 +490,29 @@ extern void hmq1725_8way_hash(void *state, const void *input) #if defined(__VAES__) - rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 ); + rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 ); - groestl512_4way_init( &ctx.groestl, 64 ); - groestl512_4way_update_close( &ctx.groestl, vhashA, vhashA, 512 ); - groestl512_4way_init( &ctx.groestl, 64 ); - groestl512_4way_update_close( &ctx.groestl, vhashB, vhashB, 512 ); + groestl512_4way_full( &ctx.groestl, vhashA, vhashA, 64 ); + groestl512_4way_full( &ctx.groestl, vhashB, vhashB, 64 ); - rintrlv_4x128_8x64( vhash, vhashA, vhashB, 512 ); + rintrlv_4x128_8x64( vhash, vhashA, vhashB, 512 ); #else - dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, - vhash ); - - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, 
(char*)hash0, (char*)hash0, 512 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash4, (char*)hash4, 512 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash5, (char*)hash5, 512 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash6, (char*)hash6, 512 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash7, (char*)hash7, 512 ); - - intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, - hash7 ); + dintrlv_8x64_512( hash0, hash1, hash2, hash3, + hash4, hash5, hash6, hash7, vhash ); + + groestl512_full( &ctx.groestl, (char*)hash0, (char*)hash0, 512 ); + groestl512_full( &ctx.groestl, (char*)hash1, (char*)hash1, 512 ); + groestl512_full( &ctx.groestl, (char*)hash2, (char*)hash2, 512 ); + groestl512_full( &ctx.groestl, (char*)hash3, (char*)hash3, 512 ); + groestl512_full( &ctx.groestl, (char*)hash4, (char*)hash4, 512 ); + groestl512_full( &ctx.groestl, (char*)hash5, (char*)hash5, 512 ); + groestl512_full( &ctx.groestl, (char*)hash6, (char*)hash6, 512 ); + groestl512_full( &ctx.groestl, (char*)hash7, (char*)hash7, 512 ); + + intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, + hash4, hash5, hash6, hash7 ); #endif @@ -812,8 +522,8 @@ extern void hmq1725_8way_hash(void *state, const void *input) vh_mask = _mm512_cmpeq_epi64_mask( _mm512_and_si512( vh[0], vmask ), m512_zero ); - dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, - vhash ); + dintrlv_8x64_512( hash0, hash1, hash2, hash3, + hash4, hash5, hash6, hash7, vhash ); // A if ( likely( ( 
vh_mask & 0xff ) != 0xff ) ) @@ -829,53 +539,21 @@ extern void hmq1725_8way_hash(void *state, const void *input) // B if ( !( hash0[0] & mask ) ) - { - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash0, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash0 ); - } + sph_whirlpool512_full( &ctx.whirlpool, hash0, hash0, 64 ); if ( !( hash1[0] & mask ) ) - { - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash1, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash1 ); - } + sph_whirlpool512_full( &ctx.whirlpool, hash1, hash1, 64 ); if ( !( hash2[0] & mask ) ) - { - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash2, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash2 ); - } + sph_whirlpool512_full( &ctx.whirlpool, hash2, hash2, 64 ); if ( !( hash3[0] & mask ) ) - { - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash3, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash3 ); - } + sph_whirlpool512_full( &ctx.whirlpool, hash3, hash3, 64 ); if ( !( hash4[0] & mask ) ) - { - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash4, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash4 ); - } + sph_whirlpool512_full( &ctx.whirlpool, hash4, hash4, 64 ); if ( !( hash5[0] & mask ) ) - { - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash5, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash5 ); - } + sph_whirlpool512_full( &ctx.whirlpool, hash5, hash5, 64 ); if ( !( hash6[0] & mask ) ) - { - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash6, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash6 ); - } + sph_whirlpool512_full( &ctx.whirlpool, hash6, hash6, 64 ); if ( !( hash7[0] & mask ) ) - { - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash7, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash7 ); - } + sph_whirlpool512_full( &ctx.whirlpool, hash7, hash7, 64 ); intrlv_8x64_512( vhashB, hash0, 
hash1, hash2, hash3, hash4, hash5, hash6, hash7 ); @@ -889,41 +567,44 @@ extern void hmq1725_8way_hash(void *state, const void *input) int scanhash_hmq1725_8way( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ) { - uint32_t hash[16*8] __attribute__ ((aligned (128))); + uint64_t hash64[8*8] __attribute__ ((aligned (128))); uint32_t vdata[20*8] __attribute__ ((aligned (64))); uint32_t lane_hash[8] __attribute__ ((aligned (64))); - uint32_t *hash7 = &(hash[49]); + uint64_t *hash64_q3 = &(hash64[3*8]); uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; - const uint32_t Htarg = ptarget[7]; + uint64_t *ptarget = (uint64_t*)work->target; + const uint64_t targ64_q3 = ptarget[3]; const uint32_t first_nonce = pdata[19]; uint32_t n = first_nonce; - const uint32_t last_nonce = max_nonce - 4; - __m512i *noncev = (__m512i*)vdata + 9; // aligned - int thr_id = mythr->id; + const uint32_t last_nonce = max_nonce - 8; + __m512i *noncev = (__m512i*)vdata + 9; + const int thr_id = mythr->id; + const bool bench = opt_benchmark; mm512_bswap32_intrlv80_8x64( vdata, pdata ); + *noncev = mm512_intrlv_blend_32( + _mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0, + n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev ); do { - *noncev = mm512_intrlv_blend_32( mm512_bswap_32( - _mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0, - n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev ); - - hmq1725_8way_hash( hash, vdata ); + hmq1725_8way_hash( hash64, vdata ); for ( int lane = 0; lane < 8; lane++ ) - if ( hash7[ lane<<1 ] <= Htarg ) + if ( hash64_q3[ lane ] <= targ64_q3 && !bench ) { - extr_lane_8x64( lane_hash, hash, lane, 256 ); - if ( fulltest( lane_hash, ptarget ) && !opt_benchmark ) + extr_lane_8x64( lane_hash, hash64, lane, 256 ); + if ( valid_hash( lane_hash, ptarget ) ) { - pdata[19] = n + lane; + pdata[19] = bswap_32( n + lane ); submit_lane_solution( work, lane_hash, mythr, lane ); } } + *noncev = _mm512_add_epi32( *noncev, + m512_const1_64( 
0x0000000800000000 ) ); n += 8; - } while ( ( n < last_nonce ) && !work_restart[thr_id].restart ); + } while ( likely( ( n < last_nonce ) && !work_restart[thr_id].restart ) ); + pdata[19] = n; *hashes_done = n - first_nonce; return 0; } @@ -939,7 +620,9 @@ union _hmq1725_4way_context_overlay jh512_4way_context jh; keccak512_4way_context keccak; hashState_luffa luffa; + luffa_2way_context luffa2; cubehashParam cube; + cube_2way_context cube2; sph_shavite512_context shavite; hashState_sd sd; simd_2way_context simd; @@ -956,338 +639,217 @@ typedef union _hmq1725_4way_context_overlay hmq1725_4way_context_overlay; extern void hmq1725_4way_hash(void *state, const void *input) { - uint32_t hash0 [16] __attribute__ ((aligned (64))); - uint32_t hash1 [16] __attribute__ ((aligned (64))); - uint32_t hash2 [16] __attribute__ ((aligned (64))); - uint32_t hash3 [16] __attribute__ ((aligned (64))); - uint32_t vhash [16<<2] __attribute__ ((aligned (64))); - uint32_t vhashA[16<<2] __attribute__ ((aligned (64))); - uint32_t vhashB[16<<2] __attribute__ ((aligned (64))); - hmq1725_4way_context_overlay ctx __attribute__ ((aligned (64))); - __m256i vh_mask; - const __m256i vmask = m256_const1_64( 24 ); - const uint32_t mask = 24; - __m256i* vh = (__m256i*)vhash; - __m256i* vhA = (__m256i*)vhashA; - __m256i* vhB = (__m256i*)vhashB; - - bmw512_4way_init( &ctx.bmw ); - bmw512_4way_update( &ctx.bmw, input, 80 ); - bmw512_4way_close( &ctx.bmw, vhash ); - - dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); - - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash0, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash0 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash1, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash1 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash2, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash2 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash3, 64 ); 
- sph_whirlpool_close( &ctx.whirlpool, hash3 ); + uint32_t hash0 [16] __attribute__ ((aligned (64))); + uint32_t hash1 [16] __attribute__ ((aligned (64))); + uint32_t hash2 [16] __attribute__ ((aligned (64))); + uint32_t hash3 [16] __attribute__ ((aligned (64))); + uint32_t vhash [16<<2] __attribute__ ((aligned (64))); + uint32_t vhashA[16<<2] __attribute__ ((aligned (64))); + uint32_t vhashB[16<<2] __attribute__ ((aligned (64))); + hmq1725_4way_context_overlay ctx __attribute__ ((aligned (64))); + __m256i vh_mask; + int h_mask; + const __m256i vmask = m256_const1_64( 24 ); + const uint32_t mask = 24; + __m256i* vh = (__m256i*)vhash; + __m256i* vhA = (__m256i*)vhashA; + __m256i* vhB = (__m256i*)vhashB; + + bmw512_4way_init( &ctx.bmw ); + bmw512_4way_update( &ctx.bmw, input, 80 ); + bmw512_4way_close( &ctx.bmw, vhash ); + + dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); + + sph_whirlpool512_full( &ctx.whirlpool, hash0, hash0, 64 ); + sph_whirlpool512_full( &ctx.whirlpool, hash1, hash1, 64 ); + sph_whirlpool512_full( &ctx.whirlpool, hash2, hash2, 64 ); + sph_whirlpool512_full( &ctx.whirlpool, hash3, hash3, 64 ); // first fork, A is groestl serial, B is skein parallel. 
- intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 ); + intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 ); - vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], vmask ), - m256_zero ); + vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], vmask ), m256_zero ); + h_mask = _mm256_movemask_epi8( vh_mask ); // A - if ( hash0[0] & mask ) - { - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash0, - (char*)hash0, 512 ); - } - if ( hash1[0] & mask ) - { - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash1, - (char*)hash1, 512 ); - } - if ( hash2[0] & mask ) - { - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash2, - (char*)hash2, 512 ); - } - if ( hash3[0] & mask ) - { - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash3, - (char*)hash3, 512 ); - } - - intrlv_4x64( vhashA, hash0, hash1, hash2, hash3, 512 ); + if ( hash0[0] & mask ) + groestl512_full( &ctx.groestl, (char*)hash0, (char*)hash0, 512 ); + if ( hash1[0] & mask ) + groestl512_full( &ctx.groestl, (char*)hash1, (char*)hash1, 512 ); + if ( hash2[0] & mask ) + groestl512_full( &ctx.groestl, (char*)hash2, (char*)hash2, 512 ); + if ( hash3[0] & mask ) + groestl512_full( &ctx.groestl, (char*)hash3, (char*)hash3, 512 ); + + intrlv_4x64( vhashA, hash0, hash1, hash2, hash3, 512 ); // B - if ( mm256_anybits0( vh_mask ) ) - { - skein512_4way_init( &ctx.skein ); - skein512_4way_update( &ctx.skein, vhash, 64 ); - skein512_4way_close( &ctx.skein, vhashB ); - } + if ( h_mask & 0xffffffff ) + skein512_4way_full( &ctx.skein, vhashB, vhash, 64 ); - mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask ); + mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask ); - jh512_4way_init( &ctx.jh ); - jh512_4way_update( &ctx.jh, vhash, 64 ); - jh512_4way_close( &ctx.jh, vhash ); + jh512_4way_init( &ctx.jh ); + jh512_4way_update( &ctx.jh, vhash, 64 ); + jh512_4way_close( &ctx.jh, vhash ); - 
keccak512_4way_init( &ctx.keccak ); - keccak512_4way_update( &ctx.keccak, vhash, 64 ); - keccak512_4way_close( &ctx.keccak, vhash ); + keccak512_4way_init( &ctx.keccak ); + keccak512_4way_update( &ctx.keccak, vhash, 64 ); + keccak512_4way_close( &ctx.keccak, vhash ); // second fork, A = blake parallel, B= bmw parallel. - vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], vmask ), - m256_zero ); + vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], vmask ), m256_zero ); + h_mask = _mm256_movemask_epi8( vh_mask ); - if ( mm256_anybits1( vh_mask ) ) - { - blake512_4way_init( &ctx.blake ); - blake512_4way_update( &ctx.blake, vhash, 64 ); - blake512_4way_close( &ctx.blake, vhashA ); - } + if ( ( h_mask & 0xffffffff ) != 0xffffffff ) + blake512_4way_full( &ctx.blake, vhashA, vhash, 64 ); - if ( mm256_anybits0( vh_mask ) ) - { + if ( h_mask & 0xffffffff ) + { bmw512_4way_init( &ctx.bmw ); bmw512_4way_update( &ctx.bmw, vhash, 64 ); bmw512_4way_close( &ctx.bmw, vhashB ); - } + } - mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask ); + mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask ); - dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); - - init_luffa( &ctx.luffa, 512 ); - update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0, - (const BitSequence*)hash0, 64 ); - init_luffa( &ctx.luffa, 512 ); - update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1, - (const BitSequence*)hash1, 64 ); - init_luffa( &ctx.luffa, 512 ); - update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2, - (const BitSequence*)hash2, 64 ); - init_luffa( &ctx.luffa, 512 ); - update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3, - (const BitSequence*)hash3, 64 ); - - cubehashInit( &ctx.cube, 512, 16, 32 ); - cubehashUpdateDigest( &ctx.cube, (BitSequence *)hash0, - (const BitSequence *)hash0, 64 ); - cubehashInit( &ctx.cube, 512, 16, 32 ); - cubehashUpdateDigest( &ctx.cube, (BitSequence *)hash1, - (const BitSequence *)hash1, 64 ); - cubehashInit( &ctx.cube, 512, 16, 32 ); - cubehashUpdateDigest( 
&ctx.cube, (BitSequence *)hash2, - (const BitSequence *)hash2, 64 ); - cubehashInit( &ctx.cube, 512, 16, 32 ); - cubehashUpdateDigest( &ctx.cube, (BitSequence *)hash3, - (const BitSequence *)hash3, 64 ); - - intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 ); + dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); + rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 ); + + luffa512_2way_full( &ctx.luffa2, vhashA, vhashA, 64 ); + luffa512_2way_full( &ctx.luffa2, vhashB, vhashB, 64 ); + + cube_2way_full( &ctx.cube2, vhashA, 512, vhashA, 64 ); + cube_2way_full( &ctx.cube2, vhashB, 512, vhashB, 64 ); + + rintrlv_2x128_4x64( vhash, vhashA, vhashB, 512 ); // A= keccak parallel, B= jh parallel - vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], vmask ), - m256_zero ); + vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], vmask ), m256_zero ); + h_mask = _mm256_movemask_epi8( vh_mask ); - if ( mm256_anybits1( vh_mask ) ) - { + if ( ( h_mask & 0xffffffff ) != 0xffffffff ) + { keccak512_4way_init( &ctx.keccak ); keccak512_4way_update( &ctx.keccak, vhash, 64 ); keccak512_4way_close( &ctx.keccak, vhashA ); - } + } - if ( mm256_anybits0( vh_mask ) ) - { + if ( h_mask & 0xffffffff ) + { jh512_4way_init( &ctx.jh ); jh512_4way_update( &ctx.jh, vhash, 64 ); jh512_4way_close( &ctx.jh, vhashB ); - } + } - mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask ); + mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask ); - dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); + dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512 ( &ctx.shavite, hash0, 64 ); - sph_shavite512_close( &ctx.shavite, hash0 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512 ( &ctx.shavite, hash1, 64 ); - sph_shavite512_close( &ctx.shavite, hash1 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512 ( &ctx.shavite, hash2, 64 ); - sph_shavite512_close( &ctx.shavite, hash2 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512 ( &ctx.shavite, 
hash3, 64 ); - sph_shavite512_close( &ctx.shavite, hash3 ); + shavite512_full( &ctx.shavite, hash0, hash0, 64 ); + shavite512_full( &ctx.shavite, hash1, hash1, 64 ); + shavite512_full( &ctx.shavite, hash2, hash2, 64 ); + shavite512_full( &ctx.shavite, hash3, hash3, 64 ); - intrlv_2x128_512( vhashA, hash0, hash1 ); - intrlv_2x128_512( vhashB, hash2, hash3 ); + intrlv_2x128_512( vhashA, hash0, hash1 ); + intrlv_2x128_512( vhashB, hash2, hash3 ); - simd_2way_init( &ctx.simd, 512 ); - simd_2way_update_close( &ctx.simd, vhashA, vhashA, 512 ); - simd_2way_init( &ctx.simd, 512 ); - simd_2way_update_close( &ctx.simd, vhashB, vhashB, 512 ); + simd512_2way_full( &ctx.simd, vhashA, vhashA, 64 ); + simd512_2way_full( &ctx.simd, vhashB, vhashB, 64 ); - rintrlv_2x128_4x64( vhash, vhashA, vhashB, 512 ); + rintrlv_2x128_4x64( vhash, vhashA, vhashB, 512 ); - vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], vmask ), - m256_zero ); + vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], vmask ), m256_zero ); + h_mask = _mm256_movemask_epi8( vh_mask ); - dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); + dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); + intrlv_4x32_512( vhash, hash0, hash1, hash2, hash3 ); - // 4x32 for haval - intrlv_4x32_512( vhash, hash0, hash1, hash2, hash3 ); + // A + if ( hash0[0] & mask ) + sph_whirlpool512_full( &ctx.whirlpool, hash0, hash0, 64 ); + if ( hash1[0] & mask ) + sph_whirlpool512_full( &ctx.whirlpool, hash1, hash1, 64 ); + if ( hash2[0] & mask ) + sph_whirlpool512_full( &ctx.whirlpool, hash2, hash2, 64 ); + if ( hash3[0] & mask ) + sph_whirlpool512_full( &ctx.whirlpool, hash3, hash3, 64 ); - // A - - if ( hash0[0] & mask ) - { - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash0, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash0 ); - } - if ( hash1[0] & mask ) - { - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash1, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash1 ); - } - 
if ( hash2[0] & mask ) - { - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash2, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash2 ); - } - if ( hash3[0] & mask ) - { - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash3, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash3 ); - } - - intrlv_4x64( vhashA, hash0, hash1, hash2, hash3, 512 ); + intrlv_4x64( vhashA, hash0, hash1, hash2, hash3, 512 ); -// B - if ( mm256_anybits0( vh_mask ) ) - { - haval256_5_4way_init( &ctx.haval ); - haval256_5_4way_update( &ctx.haval, vhash, 64 ); - haval256_5_4way_close( &ctx.haval, vhash ); - memset( &vhash[8<<2], 0, 32<<2 ); - rintrlv_4x32_4x64( vhashB, vhash, 512 ); - } - - mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask ); - - dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); - - init_echo( &ctx.echo, 512 ); - update_final_echo( &ctx.echo, (BitSequence *)hash0, - (const BitSequence *)hash0, 512 ); - init_echo( &ctx.echo, 512 ); - update_final_echo( &ctx.echo, (BitSequence *)hash1, - (const BitSequence *)hash1, 512 ); - init_echo( &ctx.echo, 512 ); - update_final_echo( &ctx.echo, (BitSequence *)hash2, - (const BitSequence *)hash2, 512 ); - init_echo( &ctx.echo, 512 ); - update_final_echo( &ctx.echo, (BitSequence *)hash3, - (const BitSequence *)hash3, 512 ); - - intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 ); - - blake512_4way_init( &ctx.blake ); - blake512_4way_update( &ctx.blake, vhash, 64 ); - blake512_4way_close( &ctx.blake, vhash ); + // B + if ( h_mask & 0xffffffff ) + { + haval256_5_4way_init( &ctx.haval ); + haval256_5_4way_update( &ctx.haval, vhash, 64 ); + haval256_5_4way_close( &ctx.haval, vhash ); + memset( &vhash[8<<2], 0, 32<<2 ); + rintrlv_4x32_4x64( vhashB, vhash, 512 ); + } - dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); + mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask ); -// shavite & luffa, both serial, select individually. 
+ dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); + + echo_full( &ctx.echo, (BitSequence *)hash0, 512, + (const BitSequence *)hash0, 64 ); + echo_full( &ctx.echo, (BitSequence *)hash1, 512, + (const BitSequence *)hash1, 64 ); + echo_full( &ctx.echo, (BitSequence *)hash2, 512, + (const BitSequence *)hash2, 64 ); + echo_full( &ctx.echo, (BitSequence *)hash3, 512, + (const BitSequence *)hash3, 64 ); - if ( hash0[0] & mask ) - { - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash0, 64 ); // - sph_shavite512_close( &ctx.shavite, hash0 ); //8 - } - else - { - init_luffa( &ctx.luffa, 512 ); - update_and_final_luffa( &ctx.luffa, (BitSequence *)hash0, - (const BitSequence *)hash0, 64 ); - } + intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 ); + + blake512_4way_full( &ctx.blake, vhash, vhash, 64 ); - if ( hash1[0] & mask ) - { - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash1, 64 ); // - sph_shavite512_close( &ctx.shavite, hash1 ); //8 - } - else - { - init_luffa( &ctx.luffa, 512 ); - update_and_final_luffa( &ctx.luffa, (BitSequence *)hash1, - (const BitSequence *)hash1, 64 ); - } + dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); - if ( hash2[0] & mask ) - { - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash2, 64 ); // - sph_shavite512_close( &ctx.shavite, hash2 ); //8 - } - else - { - init_luffa( &ctx.luffa, 512 ); - update_and_final_luffa( &ctx.luffa, (BitSequence *)hash2, - (const BitSequence *)hash2, 64 ); - } +// shavite & luffa, both serial, select individually. 
- if ( hash3[0] & mask ) - { - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash3, 64 ); // - sph_shavite512_close( &ctx.shavite, hash3 ); //8 - } - else - { - init_luffa( &ctx.luffa, 512 ); - update_and_final_luffa( &ctx.luffa, (BitSequence *)hash3, - (const BitSequence *)hash3, 64 ); - } + if ( hash0[0] & mask ) + shavite512_full( &ctx.shavite, hash0, hash0, 64 ); // + else + luffa_full( &ctx.luffa, (BitSequence*)hash0, 512, + (const BitSequence*)hash0, 64 ); + + if ( hash1[0] & mask ) + shavite512_full( &ctx.shavite, hash1, hash1, 64 ); // + else + luffa_full( &ctx.luffa, (BitSequence*)hash1, 512, + (const BitSequence*)hash1, 64 ); + + if ( hash2[0] & mask ) + shavite512_full( &ctx.shavite, hash2, hash2, 64 ); // + else + luffa_full( &ctx.luffa, (BitSequence*)hash2, 512, + (const BitSequence*)hash2, 64 ); + + if ( hash3[0] & mask ) + shavite512_full( &ctx.shavite, hash3, hash3, 64 ); // + else + luffa_full( &ctx.luffa, (BitSequence*)hash3, 512, + (const BitSequence*)hash3, 64 ); - intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 ); + intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 ); - hamsi512_4way_init( &ctx.hamsi ); - hamsi512_4way_update( &ctx.hamsi, vhash, 64 ); - hamsi512_4way_close( &ctx.hamsi, vhash ); + hamsi512_4way_init( &ctx.hamsi ); + hamsi512_4way_update( &ctx.hamsi, vhash, 64 ); + hamsi512_4way_close( &ctx.hamsi, vhash ); - dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); + dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash0, 64 ); - sph_fugue512_close( &ctx.fugue, hash0 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash1, 64 ); - sph_fugue512_close( &ctx.fugue, hash1 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash2, 64 ); - sph_fugue512_close( &ctx.fugue, hash2 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash3, 64 ); - sph_fugue512_close( &ctx.fugue, hash3 ); + 
sph_fugue512_full( &ctx.fugue, hash0, hash0, 64 ); + sph_fugue512_full( &ctx.fugue, hash1, hash1, 64 ); + sph_fugue512_full( &ctx.fugue, hash2, hash2, 64 ); + sph_fugue512_full( &ctx.fugue, hash3, hash3, 64 ); // In this situation serial simd seems to be faster. @@ -1295,61 +857,46 @@ extern void hmq1725_4way_hash(void *state, const void *input) vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], vmask ), m256_zero ); + h_mask = _mm256_movemask_epi8( vh_mask ); - if ( hash0[0] & mask ) //4 - { - init_echo( &ctx.echo, 512 ); - update_final_echo( &ctx.echo, (BitSequence *)hash0, - (const BitSequence *)hash0, 512 ); - } - - else - { + if ( hash0[0] & mask ) //4 + echo_full( &ctx.echo, (BitSequence *)hash0, 512, + (const BitSequence *)hash0, 64 ); + else + { init_sd( &ctx.sd, 512 ); update_final_sd( &ctx.sd, (BitSequence *)hash0, - (const BitSequence *)hash0, 512 ); - } + (const BitSequence *)hash0, 512 ); + } if ( hash1[0] & mask ) //4 - { - init_echo( &ctx.echo, 512 ); - update_final_echo( &ctx.echo, (BitSequence *)hash1, - (const BitSequence *)hash1, 512 ); - } - + echo_full( &ctx.echo, (BitSequence *)hash1, 512, + (const BitSequence *)hash1, 64 ); else { init_sd( &ctx.sd, 512 ); update_final_sd( &ctx.sd, (BitSequence *)hash1, - (const BitSequence *)hash1, 512 ); + (const BitSequence *)hash1, 512 ); } if ( hash2[0] & mask ) //4 - { - init_echo( &ctx.echo, 512 ); - update_final_echo( &ctx.echo, (BitSequence *)hash2, - (const BitSequence *)hash2, 512 ); - } - + echo_full( &ctx.echo, (BitSequence *)hash2, 512, + (const BitSequence *)hash2, 64 ); else { init_sd( &ctx.sd, 512 ); update_final_sd( &ctx.sd, (BitSequence *)hash2, - (const BitSequence *)hash2, 512 ); + (const BitSequence *)hash2, 512 ); } if ( hash3[0] & mask ) //4 - { - init_echo( &ctx.echo, 512 ); - update_final_echo( &ctx.echo, (BitSequence *)hash3, - (const BitSequence *)hash3, 512 ); - } - + echo_full( &ctx.echo, (BitSequence *)hash3, 512, + (const BitSequence *)hash3, 64 ); else { init_sd( &ctx.sd, 
512 ); update_final_sd( &ctx.sd, (BitSequence *)hash3, - (const BitSequence *)hash3, 512 ); + (const BitSequence *)hash3, 512 ); } intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 ); @@ -1360,54 +907,30 @@ extern void hmq1725_4way_hash(void *state, const void *input) dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash0, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash0 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash1, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash1 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash2, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash2 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash3, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash3 ); + sph_whirlpool512_full( &ctx.whirlpool, hash0, hash0, 64 ); + sph_whirlpool512_full( &ctx.whirlpool, hash1, hash1, 64 ); + sph_whirlpool512_full( &ctx.whirlpool, hash2, hash2, 64 ); + sph_whirlpool512_full( &ctx.whirlpool, hash3, hash3, 64 ); // A = fugue serial, B = sha512 prarallel intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 ); - vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], vmask ), - m256_zero ); + vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], vmask ), m256_zero ); + h_mask = _mm256_movemask_epi8( vh_mask ); if ( hash0[0] & mask ) - { - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash0, 64 ); - sph_fugue512_close( &ctx.fugue, hash0 ); - } + sph_fugue512_full( &ctx.fugue, hash0, hash0, 64 ); if ( hash1[0] & mask ) - { - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash1, 64 ); - sph_fugue512_close( &ctx.fugue, hash1 ); - } + sph_fugue512_full( &ctx.fugue, hash1, hash1, 64 ); if ( hash2[0] & mask ) - { - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash2, 64 ); - sph_fugue512_close( &ctx.fugue, hash2 ); - } + sph_fugue512_full( &ctx.fugue, 
hash2, hash2, 64 ); if ( hash3[0] & mask ) - { - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash3, 64 ); - sph_fugue512_close( &ctx.fugue, hash3 ); - } + sph_fugue512_full( &ctx.fugue, hash3, hash3, 64 ); intrlv_4x64( vhashA, hash0, hash1, hash2, hash3, 512 ); - if ( mm256_anybits0( vh_mask ) ) + if ( h_mask & 0xffffffff ) { sha512_4way_init( &ctx.sha512 ); sha512_4way_update( &ctx.sha512, vhash, 64 ); @@ -1418,14 +941,10 @@ extern void hmq1725_4way_hash(void *state, const void *input) dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 ); + groestl512_full( &ctx.groestl, (char*)hash0, (char*)hash0, 512 ); + groestl512_full( &ctx.groestl, (char*)hash1, (char*)hash1, 512 ); + groestl512_full( &ctx.groestl, (char*)hash2, (char*)hash2, 512 ); + groestl512_full( &ctx.groestl, (char*)hash3, (char*)hash3, 512 ); intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 ); @@ -1435,15 +954,13 @@ extern void hmq1725_4way_hash(void *state, const void *input) // A = haval parallel, B = Whirlpool serial - vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], vmask ), - m256_zero ); + vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], vmask ), m256_zero ); + h_mask = _mm256_movemask_epi8( vh_mask ); dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); - - // 4x32 for haval intrlv_4x32_512( vhash, hash0, hash1, hash2, hash3 ); - if ( mm256_anybits1( vh_mask ) ) + if ( ( h_mask & 0xffffffff ) != 0xffffffff ) { haval256_5_4way_init( &ctx.haval ); haval256_5_4way_update( &ctx.haval, vhash, 64 ); @@ -1453,29 +970,13 
@@ extern void hmq1725_4way_hash(void *state, const void *input) } if ( !( hash0[0] & mask ) ) - { - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash0, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash0 ); - } + sph_whirlpool512_full( &ctx.whirlpool, hash0, hash0, 64 ); if ( !( hash1[0] & mask ) ) - { - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash1, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash1 ); - } + sph_whirlpool512_full( &ctx.whirlpool, hash1, hash1, 64 ); if ( !( hash2[0] & mask ) ) - { - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash2, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash2 ); - } + sph_whirlpool512_full( &ctx.whirlpool, hash2, hash2, 64 ); if ( !( hash3[0] & mask ) ) - { - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash3, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash3 ); - } + sph_whirlpool512_full( &ctx.whirlpool, hash3, hash3, 64 ); intrlv_4x64( vhashB, hash0, hash1, hash2, hash3, 512 ); @@ -1483,48 +984,48 @@ extern void hmq1725_4way_hash(void *state, const void *input) bmw512_4way_init( &ctx.bmw ); bmw512_4way_update( &ctx.bmw, vhash, 64 ); - bmw512_4way_close( &ctx.bmw, vhash ); - - memcpy(state, vhash, 32<<2 ); + bmw512_4way_close( &ctx.bmw, state ); } int scanhash_hmq1725_4way( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ) + uint64_t *hashes_done, struct thr_info *mythr ) { - uint32_t hash[16*4] __attribute__ ((aligned (64))); + uint64_t hash64[8*4] __attribute__ ((aligned (64))); uint32_t vdata[20*4] __attribute__ ((aligned (64))); uint32_t lane_hash[8] __attribute__ ((aligned (64))); - uint32_t *hash7 = &(hash[25]); + uint64_t *hash64_q3 = &(hash64[3*4]); uint32_t *pdata = work->data; uint32_t *ptarget = work->target; - const uint32_t Htarg = ptarget[7]; + const uint64_t targ64_q3 = ((uint64_t*)ptarget)[3]; + uint32_t n = pdata[19]; const uint32_t first_nonce = pdata[19]; 
- uint32_t n = first_nonce; const uint32_t last_nonce = max_nonce - 4; - __m256i *noncev = (__m256i*)vdata + 9; // aligned - int thr_id = mythr->id; + __m256i *noncev = (__m256i*)vdata + 9; + const int thr_id = mythr->id; + const bool bench = opt_benchmark; mm256_bswap32_intrlv80_4x64( vdata, pdata ); + *noncev = mm256_intrlv_blend_32( + _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev ); do { - *noncev = mm256_intrlv_blend_32( mm256_bswap_32( - _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev ); - - hmq1725_4way_hash( hash, vdata ); + hmq1725_4way_hash( hash64, vdata ); for ( int lane = 0; lane < 4; lane++ ) - if ( unlikely( hash7[ lane<<1 ] <= Htarg ) ) + if ( unlikely( hash64_q3[ lane ] <= targ64_q3 && !bench ) ) { - extr_lane_4x64( lane_hash, hash, lane, 256 ); - if ( likely( fulltest( lane_hash, ptarget ) && !opt_benchmark ) ) + extr_lane_4x64( lane_hash, hash64, lane, 256 ); + if ( valid_hash( lane_hash, ptarget ) ) { - pdata[19] = n + lane; + pdata[19] = bswap_32( n + lane ); submit_lane_solution( work, lane_hash, mythr, lane ); } } + *noncev = _mm256_add_epi32( *noncev, + m256_const1_64( 0x0000000400000000 ) ); n += 4; - } while ( ( n < last_nonce ) && !work_restart[thr_id].restart ); - + } while ( likely( ( n < last_nonce ) && !work_restart[thr_id].restart ) ); + pdata[19] = n; *hashes_done = n - first_nonce; return 0; } diff --git a/algo/quark/quark-4way.c b/algo/quark/quark-4way.c index c8b96012..5e02c390 100644 --- a/algo/quark/quark-4way.c +++ b/algo/quark/quark-4way.c @@ -72,12 +72,10 @@ void quark_8way_hash( void *state, const void *input ) memcpy( &ctx, &quark_8way_ctx, sizeof(quark_8way_ctx) ); - blake512_8way_update( &ctx.blake, input, 80 ); - blake512_8way_close( &ctx.blake, vhash ); - - bmw512_8way_update( &ctx.bmw, vhash, 64 ); - bmw512_8way_close( &ctx.bmw, vhash ); + blake512_8way_full( &ctx.blake, vhash, input, 80 ); + bmw512_8way_full( &ctx.bmw, vhash, vhash, 64 ); + vh_mask = _mm512_cmpeq_epi64_mask( _mm512_and_si512( 
vh[0], bit3_mask ), zero ); @@ -86,70 +84,34 @@ void quark_8way_hash( void *state, const void *input ) rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 ); - if ( ( vh_mask & 0x0f ) != 0x0f ) - { - groestl512_4way_init( &ctx.groestl, 64 ); - groestl512_4way_update_close( &ctx.groestl, vhashA, vhashA, 512 ); - } - if ( ( vh_mask & 0xf0 ) != 0xf0 ) - { - groestl512_4way_init( &ctx.groestl, 64 ); - groestl512_4way_update_close( &ctx.groestl, vhashB, vhashB, 512 ); - } - rintrlv_4x128_8x64( vhashC, vhashA, vhashB, 512 ); + if ( ( vh_mask & 0x0f ) != 0x0f ) + groestl512_4way_full( &ctx.groestl, vhashA, vhashA, 64 ); + if ( ( vh_mask & 0xf0 ) != 0xf0 ) + groestl512_4way_full( &ctx.groestl, vhashB, vhashB, 64 ); + + rintrlv_4x128_8x64( vhashC, vhashA, vhashB, 512 ); #else dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, vhash, 512 ); - if ( hash0[0] & mask ) - { - update_and_final_groestl( &ctx.groestl, (char*)hash0, - (char*)hash0, 512 ); - } - if ( hash1[0] & mask ) - { - reinit_groestl( &ctx.groestl ); - update_and_final_groestl( &ctx.groestl, (char*)hash1, - (char*)hash1, 512 ); - } - if ( hash2[0] & mask ) - { - reinit_groestl( &ctx.groestl ); - update_and_final_groestl( &ctx.groestl, (char*)hash2, - (char*)hash2, 512 ); - } - if ( hash3[0] & mask ) - { - reinit_groestl( &ctx.groestl ); - update_and_final_groestl( &ctx.groestl, (char*)hash3, - (char*)hash3, 512 ); - } - if ( hash4[0] & mask ) - { - reinit_groestl( &ctx.groestl ); - update_and_final_groestl( &ctx.groestl, (char*)hash4, - (char*)hash4, 512 ); - } - if ( hash5[0] & mask ) - { - reinit_groestl( &ctx.groestl ); - update_and_final_groestl( &ctx.groestl, (char*)hash5, - (char*)hash5, 512 ); - } - if ( hash6[0] & mask ) - { - reinit_groestl( &ctx.groestl ); - update_and_final_groestl( &ctx.groestl, (char*)hash6, - (char*)hash6, 512 ); - } - if ( hash7[0] & mask ) - { - reinit_groestl( &ctx.groestl ); - update_and_final_groestl( &ctx.groestl, (char*)hash7, - (char*)hash7, 512 ); - } + if ( 
hash0[0] & 8 ) + groestl512_full( &ctx.groestl, (char*)hash0, (char*)hash0, 512 ); + if ( hash1[0] & 8 ) + groestl512_full( &ctx.groestl, (char*)hash1, (char*)hash1, 512 ); + if ( hash2[0] & 8) + groestl512_full( &ctx.groestl, (char*)hash2, (char*)hash2, 512 ); + if ( hash3[0] & 8 ) + groestl512_full( &ctx.groestl, (char*)hash3, (char*)hash3, 512 ); + if ( hash4[0] & 8 ) + groestl512_full( &ctx.groestl, (char*)hash4, (char*)hash4, 512 ); + if ( hash5[0] & 8 ) + groestl512_full( &ctx.groestl, (char*)hash5, (char*)hash5, 512 ); + if ( hash6[0] & 8 ) + groestl512_full( &ctx.groestl, (char*)hash6, (char*)hash6, 512 ); + if ( hash7[0] & 8 ) + groestl512_full( &ctx.groestl, (char*)hash7, (char*)hash7, 512 ); intrlv_8x64( vhashC, hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, 512 ); @@ -157,10 +119,7 @@ void quark_8way_hash( void *state, const void *input ) #endif if ( vh_mask & 0xff ) - { - skein512_8way_update( &ctx.skein, vhash, 64 ); - skein512_8way_close( &ctx.skein, vhashB ); - } + skein512_8way_full( &ctx.skein, vhashB, vhash, 64 ); mm512_blend_hash_8x64( vh, vhC, vhB, vh_mask ); @@ -168,10 +127,10 @@ void quark_8way_hash( void *state, const void *input ) rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 ); - groestl512_4way_init( &ctx.groestl, 64 ); - groestl512_4way_update_close( &ctx.groestl, vhashA, vhashA, 512 ); - groestl512_4way_init( &ctx.groestl, 64 ); - groestl512_4way_update_close( &ctx.groestl, vhashB, vhashB, 512 ); + if ( ( vh_mask & 0x0f ) != 0x0f ) + groestl512_4way_full( &ctx.groestl, vhashA, vhashA, 64 ); + if ( ( vh_mask & 0xf0 ) != 0xf0 ) + groestl512_4way_full( &ctx.groestl, vhashB, vhashB, 64 ); rintrlv_4x128_8x64( vhash, vhashA, vhashB, 512 ); @@ -180,22 +139,22 @@ void quark_8way_hash( void *state, const void *input ) dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, vhash, 512 ); - reinit_groestl( &ctx.groestl ); - update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 ); - reinit_groestl( 
&ctx.groestl ); - update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 ); - reinit_groestl( &ctx.groestl ); - update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 ); - reinit_groestl( &ctx.groestl ); - update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 ); - reinit_groestl( &ctx.groestl ); - update_and_final_groestl( &ctx.groestl, (char*)hash4, (char*)hash4, 512 ); - reinit_groestl( &ctx.groestl ); - update_and_final_groestl( &ctx.groestl, (char*)hash5, (char*)hash5, 512 ); - reinit_groestl( &ctx.groestl ); - update_and_final_groestl( &ctx.groestl, (char*)hash6, (char*)hash6, 512 ); - reinit_groestl( &ctx.groestl ); - update_and_final_groestl( &ctx.groestl, (char*)hash7, (char*)hash7, 512 ); + if ( hash0[0] & 8 ) + groestl512_full( &ctx.groestl, (char*)hash0, (char*)hash0, 512 ); + if ( hash1[0] & 8 ) + groestl512_full( &ctx.groestl, (char*)hash1, (char*)hash1, 512 ); + if ( hash2[0] & 8) + groestl512_full( &ctx.groestl, (char*)hash2, (char*)hash2, 512 ); + if ( hash3[0] & 8 ) + groestl512_full( &ctx.groestl, (char*)hash3, (char*)hash3, 512 ); + if ( hash4[0] & 8 ) + groestl512_full( &ctx.groestl, (char*)hash4, (char*)hash4, 512 ); + if ( hash5[0] & 8 ) + groestl512_full( &ctx.groestl, (char*)hash5, (char*)hash5, 512 ); + if ( hash6[0] & 8 ) + groestl512_full( &ctx.groestl, (char*)hash6, (char*)hash6, 512 ); + if ( hash7[0] & 8 ) + groestl512_full( &ctx.groestl, (char*)hash7, (char*)hash7, 512 ); intrlv_8x64( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, 512 ); @@ -209,27 +168,16 @@ void quark_8way_hash( void *state, const void *input ) zero ); if ( ( vh_mask & 0xff ) != 0xff ) - { - blake512_8way_init( &ctx.blake ); - blake512_8way_update( &ctx.blake, vhash, 64 ); - blake512_8way_close( &ctx.blake, vhashA ); - } - + blake512_8way_full( &ctx.blake, vhashA, vhash, 64 ); if ( vh_mask & 0xff ) - { - bmw512_8way_init( &ctx.bmw ); - bmw512_8way_update( &ctx.bmw, vhash, 64 ); - bmw512_8way_close( 
&ctx.bmw, vhashB ); - } + bmw512_8way_full( &ctx.bmw, vhashB, vhash, 64 ); mm512_blend_hash_8x64( vh, vhA, vhB, vh_mask ); keccak512_8way_update( &ctx.keccak, vhash, 64 ); keccak512_8way_close( &ctx.keccak, vhash ); - skein512_8way_init( &ctx.skein ); - skein512_8way_update( &ctx.skein, vhash, 64 ); - skein512_8way_close( &ctx.skein, vhash ); + skein512_8way_full( &ctx.skein, vhash, vhash, 64 ); vh_mask = _mm512_cmpeq_epi64_mask( _mm512_and_si512( vh[0], bit3_mask ), zero ); @@ -258,41 +206,44 @@ void quark_8way_hash( void *state, const void *input ) int scanhash_quark_8way( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ) { - uint32_t hash[8*8] __attribute__ ((aligned (128))); - uint32_t vdata[24*8] __attribute__ ((aligned (64))); + uint64_t hash64[4*8] __attribute__ ((aligned (128))); + uint32_t vdata[20*8] __attribute__ ((aligned (64))); uint32_t lane_hash[8] __attribute__ ((aligned (64))); - uint32_t *hash7 = &(hash[49]); - uint32_t *pdata = work->data; + uint64_t *hash64_q3 = &(hash64[3*8]); uint32_t *ptarget = work->target; + const uint64_t targ64_q3 = ((uint64_t*)ptarget)[3]; + uint32_t *pdata = work->data; uint32_t n = pdata[19]; const uint32_t first_nonce = pdata[19]; - __m512i *noncev = (__m512i*)vdata + 9; // aligned - int thr_id = mythr->id; - const uint32_t Htarg = ptarget[7]; + const uint32_t last_nonce = max_nonce - 8; + __m512i *noncev = (__m512i*)vdata + 9; + const int thr_id = mythr->id; + const bool bench = opt_benchmark; mm512_bswap32_intrlv80_8x64( vdata, pdata ); + *noncev = mm512_intrlv_blend_32( + _mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0, + n+3, 0, n+2, 0, n+1, 0, n , 0 ), *noncev ); do { - *noncev = mm512_intrlv_blend_32( mm512_bswap_32( - _mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0, - n+3, 0, n+2, 0, n+1, 0, n , 0 ) ), *noncev ); - - quark_8way_hash( hash, vdata ); - pdata[19] = n; + quark_8way_hash( hash64, vdata ); - for ( int i = 0; i < 8; i++ ) - if ( unlikely( hash7[ i<<1 ] <= Htarg ) 
) + for ( int lane = 0; lane < 8; lane++ ) + if ( unlikely( hash64_q3[ lane ] <= targ64_q3 && !bench ) ) { - extr_lane_8x64( lane_hash, hash, i, 256 ); - if ( likely( fulltest( lane_hash, ptarget ) && !opt_benchmark ) ) + extr_lane_8x64( lane_hash, hash64, lane, 256 ); + if ( valid_hash( lane_hash, ptarget ) ) { - pdata[19] = n+i; - submit_lane_solution( work, lane_hash, mythr, i ); + pdata[19] = bswap_32( n + lane ); + submit_lane_solution( work, lane_hash, mythr, lane ); } } + *noncev = _mm512_add_epi32( *noncev, + m512_const1_64( 0x0000000800000000 ) ); n += 8; - } while ( ( n < max_nonce-8 ) && !work_restart[thr_id].restart ); + } while ( likely( ( n < last_nonce ) && !work_restart[thr_id].restart ) ); + pdata[19] = n; *hashes_done = n - first_nonce; return 0; } @@ -333,67 +284,47 @@ void quark_4way_hash( void *state, const void *input ) __m256i* vhA = (__m256i*)vhashA; __m256i* vhB = (__m256i*)vhashB; __m256i vh_mask; + int h_mask; quark_4way_ctx_holder ctx; const __m256i bit3_mask = m256_const1_64( 8 ); - const uint32_t mask = 8; const __m256i zero = _mm256_setzero_si256(); memcpy( &ctx, &quark_4way_ctx, sizeof(quark_4way_ctx) ); - blake512_4way_update( &ctx.blake, input, 80 ); - blake512_4way_close( &ctx.blake, vhash ); + blake512_4way_full( &ctx.blake, vhash, input, 80 ); bmw512_4way_update( &ctx.bmw, vhash, 64 ); bmw512_4way_close( &ctx.bmw, vhash ); vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ), zero ); + h_mask = _mm256_movemask_epi8( vh_mask ); dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); - if ( hash0[0] & mask ) - { - update_and_final_groestl( &ctx.groestl, (char*)hash0, - (char*)hash0, 512 ); - } - if ( hash1[0] & mask ) - { - reinit_groestl( &ctx.groestl ); - update_and_final_groestl( &ctx.groestl, (char*)hash1, - (char*)hash1, 512 ); - } - if ( hash2[0] & mask ) - { - reinit_groestl( &ctx.groestl ); - update_and_final_groestl( &ctx.groestl, (char*)hash2, - (char*)hash2, 512 ); - } - if ( hash3[0] & mask ) - { - 
reinit_groestl( &ctx.groestl ); - update_and_final_groestl( &ctx.groestl, (char*)hash3, - (char*)hash3, 512 ); - } + // A + if ( hash0[0] & 8 ) + groestl512_full( &ctx.groestl, (char*)hash0, (char*)hash0, 512 ); + if ( hash1[0] & 8 ) + groestl512_full( &ctx.groestl, (char*)hash1, (char*)hash1, 512 ); + if ( hash2[0] & 8) + groestl512_full( &ctx.groestl, (char*)hash2, (char*)hash2, 512 ); + if ( hash3[0] & 8 ) + groestl512_full( &ctx.groestl, (char*)hash3, (char*)hash3, 512 ); intrlv_4x64( vhashA, hash0, hash1, hash2, hash3, 512 ); - if ( mm256_anybits1( vh_mask ) ) - { - skein512_4way_update( &ctx.skein, vhash, 64 ); - skein512_4way_close( &ctx.skein, vhashB ); - } + // B + if ( likely( h_mask & 0xffffffff ) ) + skein512_4way_full( &ctx.skein, vhashB, vhash, 64 ); mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask ); dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); - reinit_groestl( &ctx.groestl ); - update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 ); - reinit_groestl( &ctx.groestl ); - update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 ); - reinit_groestl( &ctx.groestl ); - update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 ); - reinit_groestl( &ctx.groestl ); - update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 ); + groestl512_full( &ctx.groestl, (char*)hash0, (char*)hash0, 512 ); + groestl512_full( &ctx.groestl, (char*)hash1, (char*)hash1, 512 ); + groestl512_full( &ctx.groestl, (char*)hash2, (char*)hash2, 512 ); + groestl512_full( &ctx.groestl, (char*)hash3, (char*)hash3, 512 ); intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 ); @@ -401,15 +332,13 @@ void quark_4way_hash( void *state, const void *input ) jh512_4way_close( &ctx.jh, vhash ); vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ), zero ); + h_mask = _mm256_movemask_epi8( vh_mask ); - if ( mm256_anybits1( vh_mask ) ) - { - blake512_4way_init( &ctx.blake ); - blake512_4way_update( &ctx.blake, 
vhash, 64 ); - blake512_4way_close( &ctx.blake, vhashA ); - } - - if ( mm256_anybits0( vh_mask ) ) + // A + if ( likely( ( h_mask & 0xffffffff ) != 0xffffffff ) ) + blake512_4way_full( &ctx.blake, vhashA, vhash, 64 ); + // B + if ( likely( h_mask & 0xffffffff ) ) { bmw512_4way_init( &ctx.bmw ); bmw512_4way_update( &ctx.bmw, vhash, 64 ); @@ -421,20 +350,20 @@ void quark_4way_hash( void *state, const void *input ) keccak512_4way_update( &ctx.keccak, vhash, 64 ); keccak512_4way_close( &ctx.keccak, vhash ); - skein512_4way_init( &ctx.skein ); - skein512_4way_update( &ctx.skein, vhash, 64 ); - skein512_4way_close( &ctx.skein, vhash ); + skein512_4way_full( &ctx.skein, vhash, vhash, 64 ); vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ), zero ); + h_mask = _mm256_movemask_epi8( vh_mask ); - if ( mm256_anybits1( vh_mask ) ) + // A + if ( likely( ( h_mask & 0xffffffff ) != 0xffffffff ) ) { keccak512_4way_init( &ctx.keccak ); keccak512_4way_update( &ctx.keccak, vhash, 64 ); keccak512_4way_close( &ctx.keccak, vhashA ); } - - if ( mm256_anybits0( vh_mask ) ) + // B + if ( likely( h_mask & 0xffffffff ) ) { jh512_4way_init( &ctx.jh ); jh512_4way_update( &ctx.jh, vhash, 64 ); @@ -451,41 +380,44 @@ void quark_4way_hash( void *state, const void *input ) int scanhash_quark_4way( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ) { - uint32_t hash[4*8] __attribute__ ((aligned (64))); - uint32_t vdata[24*4] __attribute__ ((aligned (64))); + uint64_t hash64[4*4] __attribute__ ((aligned (64))); + uint32_t vdata[20*4] __attribute__ ((aligned (64))); uint32_t lane_hash[8] __attribute__ ((aligned (64))); - uint32_t *hash7 = &(hash[25]); + uint64_t *hash64_q3 = &(hash64[3*4]); uint32_t *pdata = work->data; uint32_t *ptarget = work->target; + const uint64_t targ64_q3 = ((uint64_t*)ptarget)[3]; uint32_t n = pdata[19]; const uint32_t first_nonce = pdata[19]; - __m256i *noncev = (__m256i*)vdata + 9; // aligned - int thr_id = mythr->id; - 
const uint32_t Htarg = ptarget[7]; + const uint32_t last_nonce = max_nonce - 4; + __m256i *noncev = (__m256i*)vdata + 9; + const int thr_id = mythr->id; + const bool bench = opt_benchmark; mm256_bswap32_intrlv80_4x64( vdata, pdata ); + *noncev = mm256_intrlv_blend_32( + _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev ); do { - *noncev = mm256_intrlv_blend_32( mm256_bswap_32( - _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev ); - - quark_4way_hash( hash, vdata ); - pdata[19] = n; + quark_4way_hash( hash64, vdata ); - for ( int i = 0; i < 4; i++ ) - if ( unlikely( hash7[ i<<1 ] <= Htarg ) ) + for ( int lane = 0; lane < 4; lane++ ) + if ( hash64_q3[ lane ] <= targ64_q3 && !bench ) { - extr_lane_4x64( lane_hash, hash, i, 256 ); - if ( likely( fulltest( lane_hash, ptarget ) && !opt_benchmark ) ) + extr_lane_4x64( lane_hash, hash64, lane, 256 ); + if ( valid_hash( lane_hash, ptarget ) ) { - pdata[19] = n+i; - submit_lane_solution( work, lane_hash, mythr, i ); + pdata[19] = bswap_32( n + lane ); + submit_lane_solution( work, lane_hash, mythr, lane ); } } + *noncev = _mm256_add_epi32( *noncev, + m256_const1_64( 0x0000000400000000 ) ); n += 4; - } while ( ( n < max_nonce ) && !work_restart[thr_id].restart ); + } while ( likely( ( n < last_nonce ) && !work_restart[thr_id].restart ) ); - *hashes_done = n - first_nonce + 1; + pdata[19] = n; + *hashes_done = n - first_nonce; return 0; } diff --git a/algo/scrypt/scrypt.c b/algo/scrypt/scrypt.c index a14821ce..003af36b 100644 --- a/algo/scrypt/scrypt.c +++ b/algo/scrypt/scrypt.c @@ -707,6 +707,7 @@ extern int scanhash_scrypt( struct work *work, uint32_t max_nonce, int thr_id = mythr->id; // thr_id arg is deprecated int throughput = scrypt_best_throughput(); int i; + volatile uint8_t *restart = &(work_restart[thr_id].restart); #ifdef HAVE_SHA256_4WAY if (sha256_use_4way()) @@ -757,7 +758,7 @@ extern int scanhash_scrypt( struct work *work, uint32_t max_nonce, submit_solution( work, hash, mythr ); } } - } while 
(likely(n < max_nonce && !work_restart[thr_id].restart)); + } while ( likely( n < max_nonce && !(*restart) ) ); *hashes_done = n - pdata[19] + 1; pdata[19] = n; diff --git a/algo/shavite/sph-shavite-aesni.c b/algo/shavite/sph-shavite-aesni.c index e9536a0b..e047d778 100644 --- a/algo/shavite/sph-shavite-aesni.c +++ b/algo/shavite/sph-shavite-aesni.c @@ -33,7 +33,7 @@ #include #include -#ifdef __AES__ +#if defined(__AES__) #include "sph_shavite.h" #include "simd-utils.h" diff --git a/algo/shavite/sph_shavite.c b/algo/shavite/sph_shavite.c index ba4384b4..41988f97 100644 --- a/algo/shavite/sph_shavite.c +++ b/algo/shavite/sph_shavite.c @@ -35,6 +35,8 @@ #include "sph_shavite.h" +#if !defined(__AES__) + #ifdef __cplusplus extern "C"{ #endif @@ -1762,3 +1764,6 @@ sph_shavite512_sw_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst #ifdef __cplusplus } #endif + +#endif // !AES + diff --git a/algo/shavite/sph_shavite.h b/algo/shavite/sph_shavite.h index ed06ca69..cca59726 100644 --- a/algo/shavite/sph_shavite.h +++ b/algo/shavite/sph_shavite.h @@ -262,15 +262,9 @@ void sph_shavite384_close(void *cc, void *dst); void sph_shavite384_addbits_and_close( void *cc, unsigned ub, unsigned n, void *dst); -// Always define sw but only define aesni when available -// Define fptrs for aesni or sw, not both. -void sph_shavite512_sw_init(void *cc); -void sph_shavite512_sw(void *cc, const void *data, size_t len); -void sph_shavite512_sw_close(void *cc, void *dst); -void sph_shavite512_sw_addbits_and_close( - void *cc, unsigned ub, unsigned n, void *dst); - +//Don't call these directly from application code, use the macros below. 
#ifdef __AES__ + void sph_shavite512_aesni_init(void *cc); void sph_shavite512_aesni(void *cc, const void *data, size_t len); void sph_shavite512_aesni_close(void *cc, void *dst); @@ -285,6 +279,13 @@ void sph_shavite512_aesni_addbits_and_close( #else +void sph_shavite512_sw_init(void *cc); +void sph_shavite512_sw(void *cc, const void *data, size_t len); +void sph_shavite512_sw_close(void *cc, void *dst); +void sph_shavite512_sw_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); + + #define sph_shavite512_init sph_shavite512_sw_init #define sph_shavite512 sph_shavite512_sw #define sph_shavite512_close sph_shavite512_sw_close @@ -293,6 +294,20 @@ void sph_shavite512_aesni_addbits_and_close( #endif +// Use these macros from application code. +#define shavite512_context sph_shavite512_context + +#define shavite512_init sph_shavite512_init +#define shavite512_update sph_shavite512 +#define shavite512_close sph_shavite512_close + +#define shavite512_full( cc, dst, data, len ) \ +do{ \ + shavite512_init( cc ); \ + shavite512_update( cc, data, len ); \ + shavite512_close( cc, dst ); \ +}while(0) + #ifdef __cplusplus } #endif diff --git a/algo/skein/skein-4way.c b/algo/skein/skein-4way.c index 68467855..14957273 100644 --- a/algo/skein/skein-4way.c +++ b/algo/skein/skein-4way.c @@ -24,11 +24,7 @@ void skeinhash_8way( void *state, const void *input ) uint32_t vhash32[16*8] __attribute__ ((aligned (128))); sha256_8way_context ctx_sha256; - skein512_8way_full( &ctx_skein, vhash64, input, 80 ); - -// skein512_8way_update( &ctx_skein, input + (64*8), 16 ); -// skein512_8way_close( &ctx_skein, vhash64 ); - + skein512_8way_final16( &ctx_skein, vhash64, input + (64*8) ); rintrlv_8x64_8x32( vhash32, vhash64, 512 ); sha256_8way_init( &ctx_sha256 ); @@ -57,8 +53,7 @@ int scanhash_skein_8way( struct work *work, uint32_t max_nonce, *noncev = mm512_intrlv_blend_32( _mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0, n+3, 0, n+2, 0, n+1, 0, n , 0 ), *noncev ); -// 
skein512_8way_init( &skein512_8way_ctx ); -// skein512_8way_update( &skein512_8way_ctx, vdata, 64 ); + skein512_8way_prehash64( &skein512_8way_ctx, vdata ); do { skeinhash_8way( hash, vdata ); @@ -85,14 +80,14 @@ int scanhash_skein_8way( struct work *work, uint32_t max_nonce, #elif defined (SKEIN_4WAY) -//static __thread skein512_4way_context skein512_4way_ctx -// __attribute__ ((aligned (64))); +static __thread skein512_4way_context skein512_4way_ctx + __attribute__ ((aligned (64))); void skeinhash_4way( void *state, const void *input ) { uint64_t vhash64[8*4] __attribute__ ((aligned (128))); skein512_4way_context ctx_skein; -// memcpy( &ctx_skein, &skein512_4way_ctx, sizeof( ctx_skein ) ); + memcpy( &ctx_skein, &skein512_4way_ctx, sizeof( ctx_skein ) ); #if defined(__SHA__) uint32_t hash0[16] __attribute__ ((aligned (64))); uint32_t hash1[16] __attribute__ ((aligned (64))); @@ -104,10 +99,7 @@ void skeinhash_4way( void *state, const void *input ) sha256_4way_context ctx_sha256; #endif - skein512_4way_full( &ctx_skein, vhash64, input, 80 ); - -// skein512_4way_update( &ctx_skein, input + (64*4), 16 ); -// skein512_4way_close( &ctx_skein, vhash64 ); + skein512_4way_final16( &ctx_skein, vhash64, input + (64*4) ); #if defined(__SHA__) dintrlv_4x64( hash0, hash1, hash2, hash3, vhash64, 512 ); @@ -156,8 +148,7 @@ int scanhash_skein_4way( struct work *work, uint32_t max_nonce, const bool bench = opt_benchmark; mm256_bswap32_intrlv80_4x64( vdata, pdata ); -// skein512_4way_init( &skein512_4way_ctx ); -// skein512_4way_update( &skein512_4way_ctx, vdata, 64 ); + skein512_4way_prehash64( &skein512_4way_ctx, vdata ); *noncev = mm256_intrlv_blend_32( _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev ); diff --git a/algo/skein/skein-hash-4way.c b/algo/skein/skein-hash-4way.c index 51b63ddc..2a36d558 100644 --- a/algo/skein/skein-hash-4way.c +++ b/algo/skein/skein-hash-4way.c @@ -728,6 +728,86 @@ void skein512_8way_full( skein512_8way_context *sc, void *out, const void 
*data, casti_m512i( out, 7 ) = h7; } +void +skein512_8way_prehash64( skein512_8way_context *sc, const void *data ) +{ + __m512i *vdata = (__m512i*)data; + __m512i *buf = sc->buf; + buf[0] = vdata[0]; + buf[1] = vdata[1]; + buf[2] = vdata[2]; + buf[3] = vdata[3]; + buf[4] = vdata[4]; + buf[5] = vdata[5]; + buf[6] = vdata[6]; + buf[7] = vdata[7]; + register __m512i h0 = m512_const1_64( 0x4903ADFF749C51CE ); + register __m512i h1 = m512_const1_64( 0x0D95DE399746DF03 ); + register __m512i h2 = m512_const1_64( 0x8FD1934127C79BCE ); + register __m512i h3 = m512_const1_64( 0x9A255629FF352CB1 ); + register __m512i h4 = m512_const1_64( 0x5DB62599DF6CA7B0 ); + register __m512i h5 = m512_const1_64( 0xEABE394CA9D5C3F4 ); + register __m512i h6 = m512_const1_64( 0x991112C71A75B523 ); + register __m512i h7 = m512_const1_64( 0xAE18A40B660FCC33 ); + uint64_t bcount = 1; + + UBI_BIG_8WAY( 224, 0 ); + sc->h0 = h0; + sc->h1 = h1; + sc->h2 = h2; + sc->h3 = h3; + sc->h4 = h4; + sc->h5 = h5; + sc->h6 = h6; + sc->h7 = h7; +} + +void +skein512_8way_final16( skein512_8way_context *sc, void *output, + const void *data ) +{ + __m512i *in = (__m512i*)data; + __m512i *buf = sc->buf; + __m512i *out = (__m512i*)output; + register __m512i h0 = sc->h0; + register __m512i h1 = sc->h1; + register __m512i h2 = sc->h2; + register __m512i h3 = sc->h3; + register __m512i h4 = sc->h4; + register __m512i h5 = sc->h5; + register __m512i h6 = sc->h6; + register __m512i h7 = sc->h7; + + const __m512i zero = m512_zero; + buf[0] = in[0]; + buf[1] = in[1]; + buf[2] = zero; + buf[3] = zero; + buf[4] = zero; + buf[5] = zero; + buf[6] = zero; + buf[7] = zero; + + uint64_t bcount = 1; + UBI_BIG_8WAY( 352, 16 ); + + buf[0] = zero; + buf[1] = zero; + + bcount = 0; + UBI_BIG_8WAY( 510, 8 ); + + out[0] = h0; + out[1] = h1; + out[2] = h2; + out[3] = h3; + out[4] = h4; + out[5] = h5; + out[6] = h6; + out[7] = h7; +} + + void skein256_8way_update(void *cc, const void *data, size_t len) { @@ -942,6 +1022,83 @@ 
skein512_4way_full( skein512_4way_context *sc, void *out, const void *data, casti_m256i( out, 7 ) = h7; } +void +skein512_4way_prehash64( skein512_4way_context *sc, const void *data ) +{ + __m256i *vdata = (__m256i*)data; + __m256i *buf = sc->buf; + buf[0] = vdata[0]; + buf[1] = vdata[1]; + buf[2] = vdata[2]; + buf[3] = vdata[3]; + buf[4] = vdata[4]; + buf[5] = vdata[5]; + buf[6] = vdata[6]; + buf[7] = vdata[7]; + register __m256i h0 = m256_const1_64( 0x4903ADFF749C51CE ); + register __m256i h1 = m256_const1_64( 0x0D95DE399746DF03 ); + register __m256i h2 = m256_const1_64( 0x8FD1934127C79BCE ); + register __m256i h3 = m256_const1_64( 0x9A255629FF352CB1 ); + register __m256i h4 = m256_const1_64( 0x5DB62599DF6CA7B0 ); + register __m256i h5 = m256_const1_64( 0xEABE394CA9D5C3F4 ); + register __m256i h6 = m256_const1_64( 0x991112C71A75B523 ); + register __m256i h7 = m256_const1_64( 0xAE18A40B660FCC33 ); + uint64_t bcount = 1; + + UBI_BIG_4WAY( 224, 0 ); + sc->h0 = h0; + sc->h1 = h1; + sc->h2 = h2; + sc->h3 = h3; + sc->h4 = h4; + sc->h5 = h5; + sc->h6 = h6; + sc->h7 = h7; +} + +void +skein512_4way_final16( skein512_4way_context *sc, void *out, const void *data ) +{ + __m256i *vdata = (__m256i*)data; + __m256i *buf = sc->buf; + register __m256i h0 = sc->h0; + register __m256i h1 = sc->h1; + register __m256i h2 = sc->h2; + register __m256i h3 = sc->h3; + register __m256i h4 = sc->h4; + register __m256i h5 = sc->h5; + register __m256i h6 = sc->h6; + register __m256i h7 = sc->h7; + + const __m256i zero = m256_zero; + buf[0] = vdata[0]; + buf[1] = vdata[1]; + buf[2] = zero; + buf[3] = zero; + buf[4] = zero; + buf[5] = zero; + buf[6] = zero; + buf[7] = zero; + + uint64_t bcount = 1; + UBI_BIG_4WAY( 352, 16 ); + + buf[0] = zero; + buf[1] = zero; + + bcount = 0; + UBI_BIG_4WAY( 510, 8 ); + + casti_m256i( out, 0 ) = h0; + casti_m256i( out, 1 ) = h1; + casti_m256i( out, 2 ) = h2; + casti_m256i( out, 3 ) = h3; + casti_m256i( out, 4 ) = h4; + casti_m256i( out, 5 ) = h5; + 
casti_m256i( out, 6 ) = h6; + casti_m256i( out, 7 ) = h7; +} + void skein256_4way_update(void *cc, const void *data, size_t len) { diff --git a/algo/skein/skein-hash-4way.h b/algo/skein/skein-hash-4way.h index c60ba5d3..eb857d29 100644 --- a/algo/skein/skein-hash-4way.h +++ b/algo/skein/skein-hash-4way.h @@ -69,6 +69,10 @@ void skein512_8way_init( skein512_8way_context *sc ); void skein512_8way_update( void *cc, const void *data, size_t len ); void skein512_8way_close( void *cc, void *dst ); +void skein512_8way_prehash64( skein512_8way_context *sc, const void *data ); +void skein512_8way_final16( skein512_8way_context *sc, void *out, + const void *data ); + void skein256_8way_init( skein256_8way_context *sc ); void skein256_8way_update( void *cc, const void *data, size_t len ); void skein256_8way_close( void *cc, void *dst ); @@ -96,6 +100,10 @@ void skein256_4way_init( skein256_4way_context *sc ); void skein256_4way_update( void *cc, const void *data, size_t len ); void skein256_4way_close( void *cc, void *dst ); +void skein512_4way_prehash64( skein512_4way_context *sc, const void *data ); +void skein512_4way_final16( skein512_4way_context *sc, void *out, + const void *data ); + #ifdef __cplusplus } #endif diff --git a/algo/skein/skein2-4way.c b/algo/skein/skein2-4way.c index 2469271b..6fd1c274 100644 --- a/algo/skein/skein2-4way.c +++ b/algo/skein/skein2-4way.c @@ -5,20 +5,16 @@ #if defined(SKEIN_8WAY) -// static __thread skein512_8way_context skein512_8way_ctx -// __attribute__ ((aligned (64))); + static __thread skein512_8way_context skein512_8way_ctx + __attribute__ ((aligned (64))); void skein2hash_8way( void *output, const void *input ) { uint64_t hash[16*8] __attribute__ ((aligned (128))); skein512_8way_context ctx; -// memcpy( &ctx, &skein512_8way_ctx, sizeof( ctx ) ); - - skein512_8way_full( &ctx, hash, input, 80 ); - -// skein512_8way_update( &ctx, input + (64*8), 16 ); -// skein512_8way_close( &ctx, hash ); + memcpy( &ctx, &skein512_8way_ctx, sizeof( 
ctx ) ); + skein512_8way_final16( &ctx, hash, input + (64*8) ); skein512_8way_full( &ctx, output, hash, 64 ); } @@ -38,16 +34,17 @@ int scanhash_skein2_8way( struct work *work, uint32_t max_nonce, __m512i *noncev = (__m512i*)vdata + 9; const int thr_id = mythr->id; const bool bench = opt_benchmark; + skein512_8way_context ctx; mm512_bswap32_intrlv80_8x64( vdata, pdata ); *noncev = mm512_intrlv_blend_32( _mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0, n+3, 0, n+2, 0, n+1, 0, n , 0 ), *noncev ); -// skein512_8way_init( &skein512_8way_ctx ); -// skein512_8way_update( &skein512_8way_ctx, vdata, 64 ); + skein512_8way_prehash64( &ctx, vdata ); do { - skein2hash_8way( hash, vdata ); + skein512_8way_final16( &ctx, hash, vdata + (16*8) ); + skein512_8way_full( &ctx, hash, hash, 64 ); for ( int lane = 0; lane < 8; lane++ ) if ( unlikely( hashq3[ lane ] <= targq3 && !bench ) ) @@ -71,19 +68,16 @@ int scanhash_skein2_8way( struct work *work, uint32_t max_nonce, #elif defined(SKEIN_4WAY) -//static __thread skein512_4way_context skein512_4way_ctx -// __attribute__ ((aligned (64))); +static __thread skein512_4way_context skein512_4way_ctx + __attribute__ ((aligned (64))); void skein2hash_4way( void *output, const void *input ) { skein512_4way_context ctx; -// memcpy( &ctx, &skein512_4way_ctx, sizeof( ctx ) ); + memcpy( &ctx, &skein512_4way_ctx, sizeof( ctx ) ); uint64_t hash[16*4] __attribute__ ((aligned (64))); -// skein512_4way_update( &ctx, input + (64*4), 16 ); -// skein512_4way_close( &ctx, hash ); - - skein512_4way_full( &ctx, hash, input, 80 ); + skein512_4way_final16( &ctx, hash, input + (64*4) ); skein512_4way_full( &ctx, output, hash, 64 ); } @@ -103,15 +97,16 @@ int scanhash_skein2_4way( struct work *work, uint32_t max_nonce, __m256i *noncev = (__m256i*)vdata + 9; const int thr_id = mythr->id; const bool bench = opt_benchmark; + skein512_4way_context ctx; mm256_bswap32_intrlv80_4x64( vdata, pdata ); -// skein512_4way_init( &skein512_4way_ctx ); -// 
skein512_4way_update( &skein512_4way_ctx, vdata, 64 ); + skein512_4way_prehash64( &ctx, vdata ); *noncev = mm256_intrlv_blend_32( _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev ); do { - skein2hash_4way( hash, vdata ); + skein512_4way_final16( &ctx, hash, vdata + (16*4) ); + skein512_4way_full( &ctx, hash, hash, 64 ); for ( int lane = 0; lane < 4; lane++ ) if ( hash_q3[ lane ] <= targ_q3 ) diff --git a/algo/whirlpool/sph_whirlpool.h b/algo/whirlpool/sph_whirlpool.h index 70dc7fa4..801a9f92 100644 --- a/algo/whirlpool/sph_whirlpool.h +++ b/algo/whirlpool/sph_whirlpool.h @@ -120,6 +120,13 @@ void sph_whirlpool(void *cc, const void *data, size_t len); */ void sph_whirlpool_close(void *cc, void *dst); +#define sph_whirlpool512_full( cc, dst, data, len ) \ +do{ \ + sph_whirlpool_init( cc ); \ + sph_whirlpool( cc, data, len ); \ + sph_whirlpool_close( cc, dst ); \ +}while(0) + /** * WHIRLPOOL-0 uses the same structure than plain WHIRLPOOL. */ diff --git a/algo/x13/skunk-4way.c b/algo/x13/skunk-4way.c index 566f5458..c7f6fd68 100644 --- a/algo/x13/skunk-4way.c +++ b/algo/x13/skunk-4way.c @@ -35,8 +35,7 @@ void skunk_8way_hash( void *output, const void *input ) skunk_8way_ctx_holder ctx __attribute__ ((aligned (64))); memcpy( &ctx, &skunk_8way_ctx, sizeof(skunk_8way_ctx) ); - skein512_8way_update( &ctx.skein, input, 80 ); - skein512_8way_close( &ctx.skein, vhash ); + skein512_8way_final16( &ctx.skein, vhash, input ); dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, vhash, 512 ); @@ -104,35 +103,35 @@ int scanhash_skunk_8way( struct work *work, uint32_t max_nonce, uint32_t *pdata = work->data; uint32_t *ptarget = work->target; const uint32_t first_nonce = pdata[19]; + const uint32_t last_nonce = max_nonce - 8; uint32_t n = first_nonce; - __m512i *noncev = (__m512i*)vdata + 9; // aligned - const uint32_t Htarg = ptarget[7]; - int thr_id = mythr->id; + __m512i *noncev = (__m512i*)vdata + 9; + const int thr_id = mythr->id; volatile uint8_t 
*restart = &(work_restart[thr_id].restart); + const bool bench = opt_benchmark; - if ( opt_benchmark ) - ((uint32_t*)ptarget)[7] = 0x0cff; + if ( bench ) ptarget[7] = 0x0fff; mm512_bswap32_intrlv80_8x64( vdata, pdata ); + skein512_8way_prehash64( &skunk_8way_ctx.skein, vdata ); + *noncev = mm512_intrlv_blend_32( + _mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0, + n+3, 0, n+2, 0, n+1, 0, n , 0 ), *noncev ); do { - *noncev = mm512_intrlv_blend_32( mm512_bswap_32( - _mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0, - n+3, 0, n+2, 0, n+1, 0, n , 0 ) ), *noncev ); - skunk_8way_hash( hash, vdata ); - pdata[19] = n; for ( int i = 0; i < 8; i++ ) - if ( unlikely( (hash+(i<<3))[7] <= Htarg ) ) - if ( likely( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark ) ) + if ( unlikely( valid_hash( hash+(i<<3), ptarget ) && !bench ) ) { - pdata[19] = n+i; + pdata[19] = bswap_32( n+i ); submit_lane_solution( work, hash+(i<<3), mythr, i ); } + *noncev = _mm512_add_epi32( *noncev, + m512_const1_64( 0x0000000800000000 ) ); n +=8; - } while ( likely( ( n < max_nonce-8 ) && !(*restart) ) ); - + } while ( likely( ( n < last_nonce ) && !( *restart ) ) ); + pdata[19] = n; *hashes_done = n - first_nonce; return 0; } @@ -159,17 +158,16 @@ static __thread skunk_4way_ctx_holder skunk_4way_ctx; void skunk_4way_hash( void *output, const void *input ) { + uint64_t vhash[8*4] __attribute__ ((aligned (128))); uint64_t hash0[8] __attribute__ ((aligned (64))); uint64_t hash1[8] __attribute__ ((aligned (64))); uint64_t hash2[8] __attribute__ ((aligned (64))); uint64_t hash3[8] __attribute__ ((aligned (64))); - uint64_t vhash[8*4] __attribute__ ((aligned (64))); skunk_4way_ctx_holder ctx __attribute__ ((aligned (64))); memcpy( &ctx, &skunk_4way_ctx, sizeof(skunk_4way_ctx) ); - skein512_4way_update( &ctx.skein, input, 80 ); - skein512_4way_close( &ctx.skein, vhash ); + skein512_4way_final16( &ctx.skein, vhash, input + (64*4) ); dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); 
cubehashUpdateDigest( &ctx.cube, (byte*) hash0, (const byte*)hash0, 64 ); @@ -213,40 +211,40 @@ void skunk_4way_hash( void *output, const void *input ) int scanhash_skunk_4way( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ) { - uint32_t hash[4*8] __attribute__ ((aligned (64))); - uint32_t vdata[24*4] __attribute__ ((aligned (64))); + uint32_t hash[4*8] __attribute__ ((aligned (128))); + uint32_t vdata[20*4] __attribute__ ((aligned (64))); uint32_t *pdata = work->data; uint32_t *ptarget = work->target; const uint32_t first_nonce = pdata[19]; + const uint32_t last_nonce = max_nonce - 4; uint32_t n = first_nonce; - __m256i *noncev = (__m256i*)vdata + 9; // aligned - const uint32_t Htarg = ptarget[7]; - int thr_id = mythr->id; // thr_id arg is deprecated - volatile uint8_t *restart = &(work_restart[thr_id].restart); + __m256i *noncev = (__m256i*)vdata + 9; + const int thr_id = mythr->id; + volatile uint8_t *restart = &( work_restart[ thr_id ].restart ); + const bool bench = opt_benchmark; - if ( opt_benchmark ) - ((uint32_t*)ptarget)[7] = 0x0cff; + if ( bench ) ptarget[7] = 0x0fff; mm256_bswap32_intrlv80_4x64( vdata, pdata ); + skein512_4way_prehash64( &skunk_4way_ctx.skein, vdata ); + *noncev = mm256_intrlv_blend_32( + _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev ); do { - *noncev = mm256_intrlv_blend_32( mm256_bswap_32( - _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev ); - skunk_4way_hash( hash, vdata ); - pdata[19] = n; for ( int i = 0; i < 4; i++ ) - if ( (hash+(i<<3))[7] <= Htarg ) - if ( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark ) + if ( unlikely( valid_hash( hash+(i<<3), ptarget ) && !bench ) ) { - pdata[19] = n+i; + pdata[19] = bswap_32( n + i ); submit_lane_solution( work, hash+(i<<3), mythr, i ); } + *noncev = _mm256_add_epi32( *noncev, + m256_const1_64( 0x0000000400000000 ) ); n +=4; - } while ( ( n < max_nonce ) && !(*restart) ); - - *hashes_done = n - first_nonce + 1; + } while ( 
likely( ( n < last_nonce ) && !( *restart ) ) ); + pdata[19] = n; + *hashes_done = n - first_nonce; return 0; } diff --git a/algo/x16/hex.c b/algo/x16/hex.c index bd9294e2..2ed56479 100644 --- a/algo/x16/hex.c +++ b/algo/x16/hex.c @@ -47,6 +47,7 @@ static void hex_getAlgoString(const uint32_t* prevblock, char *output) *sptr = '\0'; } +/* union _hex_context_overlay { #if defined(__AES__) @@ -63,7 +64,7 @@ union _hex_context_overlay sph_keccak512_context keccak; hashState_luffa luffa; cubehashParam cube; - sph_shavite512_context shavite; + shavite512_context shavite; hashState_sd simd; sph_hamsi512_context hamsi; sph_fugue512_context fugue; @@ -72,13 +73,14 @@ union _hex_context_overlay SHA512_CTX sha512; }; typedef union _hex_context_overlay hex_context_overlay; +*/ -static __thread hex_context_overlay hex_ctx; +static __thread x16r_context_overlay hex_ctx; void hex_hash( void* output, const void* input ) { uint32_t _ALIGN(128) hash[16]; - hex_context_overlay ctx; + x16r_context_overlay ctx; memcpy( &ctx, &hex_ctx, sizeof(ctx) ); void *in = (void*) input; int size = 80; @@ -157,9 +159,7 @@ void hex_hash( void* output, const void* input ) } break; case SHAVITE: - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, in, size ); - sph_shavite512_close( &ctx.shavite, hash ); + shavite512_full( &ctx.shavite, hash, in, size ); break; case SIMD: init_sd( &ctx.simd, 512 ); @@ -187,9 +187,7 @@ void hex_hash( void* output, const void* input ) sph_hamsi512_close( &ctx.hamsi, hash ); break; case FUGUE: - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, in, size ); - sph_fugue512_close( &ctx.fugue, hash ); + sph_fugue512_full( &ctx.fugue, hash, in, size ); break; case SHABAL: if ( i == 0 ) @@ -203,13 +201,12 @@ void hex_hash( void* output, const void* input ) break; case WHIRLPOOL: if ( i == 0 ) - sph_whirlpool( &ctx.whirlpool, in+64, 16 ); - else { - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, in, size ); + sph_whirlpool( 
&ctx.whirlpool, in+64, 16 ); + sph_whirlpool_close( &ctx.whirlpool, hash ); } - sph_whirlpool_close( &ctx.whirlpool, hash ); + else + sph_whirlpool512_full( &ctx.whirlpool, hash, in, size ); break; case SHA_512: SHA512_Init( &ctx.sha512 ); diff --git a/algo/x16/x16r-4way.c b/algo/x16/x16r-4way.c index 32a39a8f..4f3880ab 100644 --- a/algo/x16/x16r-4way.c +++ b/algo/x16/x16r-4way.c @@ -287,30 +287,14 @@ void x16r_8way_hash_generic( void* output, const void* input ) shavite512_4way_full( &ctx.shavite, vhash, vhash, size ); dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash ); #else - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, in0, size ); - sph_shavite512_close( &ctx.shavite, hash0 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, in1, size ); - sph_shavite512_close( &ctx.shavite, hash1 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, in2, size ); - sph_shavite512_close( &ctx.shavite, hash2 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, in3, size ); - sph_shavite512_close( &ctx.shavite, hash3 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, in4, size ); - sph_shavite512_close( &ctx.shavite, hash4 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, in5, size ); - sph_shavite512_close( &ctx.shavite, hash5 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, in6, size ); - sph_shavite512_close( &ctx.shavite, hash6 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, in7, size ); - sph_shavite512_close( &ctx.shavite, hash7 ); + shavite512_full( &ctx.shavite, hash0, in0, size ); + shavite512_full( &ctx.shavite, hash1, in1, size ); + shavite512_full( &ctx.shavite, hash2, in2, size ); + shavite512_full( &ctx.shavite, hash3, in3, size ); + shavite512_full( &ctx.shavite, hash4, in4, size ); + shavite512_full( &ctx.shavite, hash5, in5, size ); + shavite512_full( &ctx.shavite, hash6, in6, 
size ); + shavite512_full( &ctx.shavite, hash7, in7, size ); #endif break; case SIMD: @@ -363,30 +347,14 @@ void x16r_8way_hash_generic( void* output, const void* input ) hash7, vhash ); break; case FUGUE: - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, in0, size ); - sph_fugue512_close( &ctx.fugue, hash0 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, in1, size ); - sph_fugue512_close( &ctx.fugue, hash1 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, in2, size ); - sph_fugue512_close( &ctx.fugue, hash2 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, in3, size ); - sph_fugue512_close( &ctx.fugue, hash3 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, in4, size ); - sph_fugue512_close( &ctx.fugue, hash4 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, in5, size ); - sph_fugue512_close( &ctx.fugue, hash5 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, in6, size ); - sph_fugue512_close( &ctx.fugue, hash6 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, in7, size ); - sph_fugue512_close( &ctx.fugue, hash7 ); + sph_fugue512_full( &ctx.fugue, hash0, in0, size ); + sph_fugue512_full( &ctx.fugue, hash1, in1, size ); + sph_fugue512_full( &ctx.fugue, hash2, in2, size ); + sph_fugue512_full( &ctx.fugue, hash3, in3, size ); + sph_fugue512_full( &ctx.fugue, hash4, in4, size ); + sph_fugue512_full( &ctx.fugue, hash5, in5, size ); + sph_fugue512_full( &ctx.fugue, hash6, in6, size ); + sph_fugue512_full( &ctx.fugue, hash7, in7, size ); break; case SHABAL: intrlv_8x32( vhash, in0, in1, in2, in3, in4, in5, in6, in7, @@ -431,30 +399,14 @@ void x16r_8way_hash_generic( void* output, const void* input ) } else { - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, in0, size ); - sph_whirlpool_close( &ctx.whirlpool, hash0 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, in1, size ); - sph_whirlpool_close( 
&ctx.whirlpool, hash1 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, in2, size ); - sph_whirlpool_close( &ctx.whirlpool, hash2 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, in3, size ); - sph_whirlpool_close( &ctx.whirlpool, hash3 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, in4, size ); - sph_whirlpool_close( &ctx.whirlpool, hash4 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, in5, size ); - sph_whirlpool_close( &ctx.whirlpool, hash5 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, in6, size ); - sph_whirlpool_close( &ctx.whirlpool, hash6 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, in7, size ); - sph_whirlpool_close( &ctx.whirlpool, hash7 ); + sph_whirlpool512_full( &ctx.whirlpool, hash0, in0, size ); + sph_whirlpool512_full( &ctx.whirlpool, hash1, in1, size ); + sph_whirlpool512_full( &ctx.whirlpool, hash2, in2, size ); + sph_whirlpool512_full( &ctx.whirlpool, hash3, in3, size ); + sph_whirlpool512_full( &ctx.whirlpool, hash4, in4, size ); + sph_whirlpool512_full( &ctx.whirlpool, hash5, in5, size ); + sph_whirlpool512_full( &ctx.whirlpool, hash6, in6, size ); + sph_whirlpool512_full( &ctx.whirlpool, hash7, in7, size ); } break; case SHA_512: @@ -576,8 +528,7 @@ void x16r_4way_prehash( void *vdata, void *pdata ) break; case SKEIN: mm256_bswap32_intrlv80_4x64( vdata, pdata ); - skein512_4way_init( &x16r_ctx.skein ); - skein512_4way_update( &x16r_ctx.skein, vdata, 64 ); + skein512_4way_prehash64( &x16r_ctx.skein, vdata ); break; case LUFFA: mm128_bswap32_80( edata, pdata ); @@ -692,10 +643,7 @@ void x16r_4way_hash_generic( void* output, const void* input ) break; case SKEIN: if ( i == 0 ) - { - skein512_4way_update( &ctx.skein, input + (64<<2), 16 ); - skein512_4way_close( &ctx.skein, vhash ); - } + skein512_4way_final16( &ctx.skein, vhash, input + (64*4) ); else { intrlv_4x64( vhash, in0, 
in1, in2, in3, size<<3 ); @@ -756,18 +704,10 @@ void x16r_4way_hash_generic( void* output, const void* input ) } break; case SHAVITE: - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, in0, size ); - sph_shavite512_close( &ctx.shavite, hash0 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, in1, size ); - sph_shavite512_close( &ctx.shavite, hash1 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, in2, size ); - sph_shavite512_close( &ctx.shavite, hash2 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, in3, size ); - sph_shavite512_close( &ctx.shavite, hash3 ); + shavite512_full( &ctx.shavite, hash0, in0, size ); + shavite512_full( &ctx.shavite, hash1, in1, size ); + shavite512_full( &ctx.shavite, hash2, in2, size ); + shavite512_full( &ctx.shavite, hash3, in3, size ); break; case SIMD: intrlv_2x128( vhash, in0, in1, size<<3 ); @@ -800,18 +740,10 @@ void x16r_4way_hash_generic( void* output, const void* input ) dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); break; case FUGUE: - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, in0, size ); - sph_fugue512_close( &ctx.fugue, hash0 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, in1, size ); - sph_fugue512_close( &ctx.fugue, hash1 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, in2, size ); - sph_fugue512_close( &ctx.fugue, hash2 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, in3, size ); - sph_fugue512_close( &ctx.fugue, hash3 ); + sph_fugue512_full( &ctx.fugue, hash0, in0, size ); + sph_fugue512_full( &ctx.fugue, hash1, in1, size ); + sph_fugue512_full( &ctx.fugue, hash2, in2, size ); + sph_fugue512_full( &ctx.fugue, hash3, in3, size ); break; case SHABAL: intrlv_4x32( vhash, in0, in1, in2, in3, size<<3 ); @@ -842,18 +774,10 @@ void x16r_4way_hash_generic( void* output, const void* input ) } else { - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( 
&ctx.whirlpool, in0, size ); - sph_whirlpool_close( &ctx.whirlpool, hash0 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, in1, size ); - sph_whirlpool_close( &ctx.whirlpool, hash1 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, in2, size ); - sph_whirlpool_close( &ctx.whirlpool, hash2 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, in3, size ); - sph_whirlpool_close( &ctx.whirlpool, hash3 ); + sph_whirlpool512_full( &ctx.whirlpool, hash0, in0, size ); + sph_whirlpool512_full( &ctx.whirlpool, hash1, in1, size ); + sph_whirlpool512_full( &ctx.whirlpool, hash2, in2, size ); + sph_whirlpool512_full( &ctx.whirlpool, hash3, in3, size ); } break; case SHA_512: diff --git a/algo/x16/x16r-gate.h b/algo/x16/x16r-gate.h index d58aab83..a75ca829 100644 --- a/algo/x16/x16r-gate.h +++ b/algo/x16/x16r-gate.h @@ -121,7 +121,7 @@ union _x16r_8way_context_overlay echo_4way_context echo; #else hashState_groestl groestl; - sph_shavite512_context shavite; + shavite512_context shavite; hashState_echo echo; #endif } __attribute__ ((aligned (64))); @@ -152,7 +152,7 @@ union _x16r_4way_context_overlay luffa_2way_context luffa; hashState_luffa luffa1; cubehashParam cube; - sph_shavite512_context shavite; + shavite512_context shavite; simd_2way_context simd; hamsi512_4way_context hamsi; sph_fugue512_context fugue; @@ -191,7 +191,7 @@ union _x16r_context_overlay sph_keccak512_context keccak; hashState_luffa luffa; cubehashParam cube; - sph_shavite512_context shavite; + shavite512_context shavite; hashState_sd simd; sph_hamsi512_context hamsi; sph_fugue512_context fugue; diff --git a/algo/x16/x16r.c b/algo/x16/x16r.c index 08fd5317..09e89665 100644 --- a/algo/x16/x16r.c +++ b/algo/x16/x16r.c @@ -124,9 +124,7 @@ void x16r_hash_generic( void* output, const void* input ) (byte*)in, size ); break; case SHAVITE: - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, in, size ); - 
sph_shavite512_close( &ctx.shavite, hash ); + shavite512_full( &ctx.shavite, hash, in, size ); break; case SIMD: simd_full( &ctx.simd, (BitSequence *)hash, @@ -153,9 +151,7 @@ void x16r_hash_generic( void* output, const void* input ) sph_hamsi512_close( &ctx.hamsi, hash ); break; case FUGUE: - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, in, size ); - sph_fugue512_close( &ctx.fugue, hash ); + sph_fugue512_full( &ctx.fugue, hash, in, size ); break; case SHABAL: if ( i == 0 ) @@ -169,13 +165,12 @@ void x16r_hash_generic( void* output, const void* input ) break; case WHIRLPOOL: if ( i == 0 ) - sph_whirlpool( &ctx.whirlpool, in+64, 16 ); - else { - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, in, size ); + sph_whirlpool( &ctx.whirlpool, in+64, 16 ); + sph_whirlpool_close( &ctx.whirlpool, hash ); } - sph_whirlpool_close( &ctx.whirlpool, hash ); + else + sph_whirlpool512_full( &ctx.whirlpool, hash, in, size ); break; case SHA_512: SHA512_Init( &ctx.sha512 ); diff --git a/algo/x16/x16rv2-4way.c b/algo/x16/x16rv2-4way.c index 33d4a89f..9a7ea443 100644 --- a/algo/x16/x16rv2-4way.c +++ b/algo/x16/x16rv2-4way.c @@ -57,7 +57,7 @@ union _x16rv2_8way_context_overlay echo_4way_context echo; #else hashState_groestl groestl; - sph_shavite512_context shavite; + shavite512_context shavite; hashState_echo echo; #endif } __attribute__ ((aligned (64))); @@ -371,30 +371,14 @@ void x16rv2_8way_hash( void* output, const void* input ) shavite512_4way_full( &ctx.shavite, vhash, vhash, size ); dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash ); #else - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, in0, size ); - sph_shavite512_close( &ctx.shavite, hash0 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, in1, size ); - sph_shavite512_close( &ctx.shavite, hash1 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, in2, size ); - sph_shavite512_close( &ctx.shavite, hash2 ); - 
sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, in3, size ); - sph_shavite512_close( &ctx.shavite, hash3 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, in4, size ); - sph_shavite512_close( &ctx.shavite, hash4 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, in5, size ); - sph_shavite512_close( &ctx.shavite, hash5 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, in6, size ); - sph_shavite512_close( &ctx.shavite, hash6 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, in7, size ); - sph_shavite512_close( &ctx.shavite, hash7 ); + shavite512_full( &ctx.shavite, hash0, in0, size ); + shavite512_full( &ctx.shavite, hash1, in1, size ); + shavite512_full( &ctx.shavite, hash2, in2, size ); + shavite512_full( &ctx.shavite, hash3, in3, size ); + shavite512_full( &ctx.shavite, hash4, in4, size ); + shavite512_full( &ctx.shavite, hash5, in5, size ); + shavite512_full( &ctx.shavite, hash6, in6, size ); + shavite512_full( &ctx.shavite, hash7, in7, size ); #endif break; case SIMD: @@ -448,30 +432,14 @@ void x16rv2_8way_hash( void* output, const void* input ) hash7, vhash ); break; case FUGUE: - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, in0, size ); - sph_fugue512_close( &ctx.fugue, hash0 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, in1, size ); - sph_fugue512_close( &ctx.fugue, hash1 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, in2, size ); - sph_fugue512_close( &ctx.fugue, hash2 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, in3, size ); - sph_fugue512_close( &ctx.fugue, hash3 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, in4, size ); - sph_fugue512_close( &ctx.fugue, hash4 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, in5, size ); - sph_fugue512_close( &ctx.fugue, hash5 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, in6, 
size ); - sph_fugue512_close( &ctx.fugue, hash6 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, in7, size ); - sph_fugue512_close( &ctx.fugue, hash7 ); + sph_fugue512_full( &ctx.fugue, hash0, in0, size ); + sph_fugue512_full( &ctx.fugue, hash1, in1, size ); + sph_fugue512_full( &ctx.fugue, hash2, in2, size ); + sph_fugue512_full( &ctx.fugue, hash3, in3, size ); + sph_fugue512_full( &ctx.fugue, hash4, in4, size ); + sph_fugue512_full( &ctx.fugue, hash5, in5, size ); + sph_fugue512_full( &ctx.fugue, hash6, in6, size ); + sph_fugue512_full( &ctx.fugue, hash7, in7, size ); break; case SHABAL: intrlv_8x32( vhash, in0, in1, in2, in3, in4, in5, in6, in7, @@ -516,30 +484,14 @@ void x16rv2_8way_hash( void* output, const void* input ) } else { - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, in0, size ); - sph_whirlpool_close( &ctx.whirlpool, hash0 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, in1, size ); - sph_whirlpool_close( &ctx.whirlpool, hash1 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, in2, size ); - sph_whirlpool_close( &ctx.whirlpool, hash2 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, in3, size ); - sph_whirlpool_close( &ctx.whirlpool, hash3 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, in4, size ); - sph_whirlpool_close( &ctx.whirlpool, hash4 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, in5, size ); - sph_whirlpool_close( &ctx.whirlpool, hash5 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, in6, size ); - sph_whirlpool_close( &ctx.whirlpool, hash6 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, in7, size ); - sph_whirlpool_close( &ctx.whirlpool, hash7 ); + sph_whirlpool512_full( &ctx.whirlpool, hash0, in0, size ); + sph_whirlpool512_full( &ctx.whirlpool, hash1, in1, size ); + sph_whirlpool512_full( &ctx.whirlpool, 
hash2, in2, size ); + sph_whirlpool512_full( &ctx.whirlpool, hash3, in3, size ); + sph_whirlpool512_full( &ctx.whirlpool, hash4, in4, size ); + sph_whirlpool512_full( &ctx.whirlpool, hash5, in5, size ); + sph_whirlpool512_full( &ctx.whirlpool, hash6, in6, size ); + sph_whirlpool512_full( &ctx.whirlpool, hash7, in7, size ); } break; case SHA_512: @@ -747,7 +699,7 @@ union _x16rv2_4way_context_overlay keccak512_4way_context keccak; luffa_2way_context luffa; cubehashParam cube; - sph_shavite512_context shavite; + shavite512_context shavite; simd_2way_context simd; hamsi512_4way_context hamsi; sph_fugue512_context fugue; @@ -831,47 +783,47 @@ void x16rv2_4way_hash( void* output, const void* input ) dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); break; case KECCAK: - if ( i == 0 ) - { - sph_tiger( &ctx.tiger, in0 + 64, 16 ); - sph_tiger_close( &ctx.tiger, hash0 ); - memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) ); - sph_tiger( &ctx.tiger, in1 + 64, 16 ); - sph_tiger_close( &ctx.tiger, hash1 ); - memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) ); - sph_tiger( &ctx.tiger, in2 + 64, 16 ); - sph_tiger_close( &ctx.tiger, hash2 ); - memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) ); - sph_tiger( &ctx.tiger, in3 + 64, 16 ); - sph_tiger_close( &ctx.tiger, hash3 ); - } - else - { - sph_tiger_init( &ctx.tiger ); - sph_tiger( &ctx.tiger, in0, size ); - sph_tiger_close( &ctx.tiger, hash0 ); - sph_tiger_init( &ctx.tiger ); - sph_tiger( &ctx.tiger, in1, size ); - sph_tiger_close( &ctx.tiger, hash1 ); - sph_tiger_init( &ctx.tiger ); - sph_tiger( &ctx.tiger, in2, size ); - sph_tiger_close( &ctx.tiger, hash2 ); - sph_tiger_init( &ctx.tiger ); - sph_tiger( &ctx.tiger, in3, size ); - sph_tiger_close( &ctx.tiger, hash3 ); - } - for ( int i = (24/4); i < (64/4); i++ ) + if ( i == 0 ) + { + sph_tiger( &ctx.tiger, in0 + 64, 16 ); + sph_tiger_close( &ctx.tiger, hash0 ); + memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) ); + sph_tiger( &ctx.tiger, in1 + 64, 16 ); + sph_tiger_close( &ctx.tiger, hash1 ); + memcpy( &ctx, 
&x16rv2_ctx, sizeof(ctx) ); + sph_tiger( &ctx.tiger, in2 + 64, 16 ); + sph_tiger_close( &ctx.tiger, hash2 ); + memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) ); + sph_tiger( &ctx.tiger, in3 + 64, 16 ); + sph_tiger_close( &ctx.tiger, hash3 ); + } + else + { + sph_tiger_init( &ctx.tiger ); + sph_tiger( &ctx.tiger, in0, size ); + sph_tiger_close( &ctx.tiger, hash0 ); + sph_tiger_init( &ctx.tiger ); + sph_tiger( &ctx.tiger, in1, size ); + sph_tiger_close( &ctx.tiger, hash1 ); + sph_tiger_init( &ctx.tiger ); + sph_tiger( &ctx.tiger, in2, size ); + sph_tiger_close( &ctx.tiger, hash2 ); + sph_tiger_init( &ctx.tiger ); + sph_tiger( &ctx.tiger, in3, size ); + sph_tiger_close( &ctx.tiger, hash3 ); + } + for ( int i = (24/4); i < (64/4); i++ ) hash0[i] = hash1[i] = hash2[i] = hash3[i] = 0; - intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 ); - keccak512_4way_init( &ctx.keccak ); - keccak512_4way_update( &ctx.keccak, vhash, 64 ); - keccak512_4way_close( &ctx.keccak, vhash ); - dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); + intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 ); + keccak512_4way_init( &ctx.keccak ); + keccak512_4way_update( &ctx.keccak, vhash, 64 ); + keccak512_4way_close( &ctx.keccak, vhash ); + dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); break; case SKEIN: if ( i == 0 ) - skein512_4way_update( &ctx.skein, input + (64<<2), 16 ); + skein512_4way_final16( &ctx.skein, vhash, input + (64*4) ); else { intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 ); @@ -882,46 +834,46 @@ void x16rv2_4way_hash( void* output, const void* input ) dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); break; case LUFFA: - if ( i == 0 ) - { - sph_tiger( &ctx.tiger, in0 + 64, 16 ); - sph_tiger_close( &ctx.tiger, hash0 ); - memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) ); - sph_tiger( &ctx.tiger, in1 + 64, 16 ); - sph_tiger_close( &ctx.tiger, hash1 ); - memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) ); - sph_tiger( &ctx.tiger, in2 + 64, 16 ); - sph_tiger_close( &ctx.tiger, hash2 ); - 
memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) ); - sph_tiger( &ctx.tiger, in3 + 64, 16 ); - sph_tiger_close( &ctx.tiger, hash3 ); - } - else - { - sph_tiger_init( &ctx.tiger ); - sph_tiger( &ctx.tiger, in0, size ); - sph_tiger_close( &ctx.tiger, hash0 ); - sph_tiger_init( &ctx.tiger ); - sph_tiger( &ctx.tiger, in1, size ); - sph_tiger_close( &ctx.tiger, hash1 ); - sph_tiger_init( &ctx.tiger ); - sph_tiger( &ctx.tiger, in2, size ); - sph_tiger_close( &ctx.tiger, hash2 ); - sph_tiger_init( &ctx.tiger ); - sph_tiger( &ctx.tiger, in3, size ); - sph_tiger_close( &ctx.tiger, hash3 ); - } - for ( int i = (24/4); i < (64/4); i++ ) + if ( i == 0 ) + { + sph_tiger( &ctx.tiger, in0 + 64, 16 ); + sph_tiger_close( &ctx.tiger, hash0 ); + memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) ); + sph_tiger( &ctx.tiger, in1 + 64, 16 ); + sph_tiger_close( &ctx.tiger, hash1 ); + memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) ); + sph_tiger( &ctx.tiger, in2 + 64, 16 ); + sph_tiger_close( &ctx.tiger, hash2 ); + memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) ); + sph_tiger( &ctx.tiger, in3 + 64, 16 ); + sph_tiger_close( &ctx.tiger, hash3 ); + } + else + { + sph_tiger_init( &ctx.tiger ); + sph_tiger( &ctx.tiger, in0, size ); + sph_tiger_close( &ctx.tiger, hash0 ); + sph_tiger_init( &ctx.tiger ); + sph_tiger( &ctx.tiger, in1, size ); + sph_tiger_close( &ctx.tiger, hash1 ); + sph_tiger_init( &ctx.tiger ); + sph_tiger( &ctx.tiger, in2, size ); + sph_tiger_close( &ctx.tiger, hash2 ); + sph_tiger_init( &ctx.tiger ); + sph_tiger( &ctx.tiger, in3, size ); + sph_tiger_close( &ctx.tiger, hash3 ); + } + for ( int i = (24/4); i < (64/4); i++ ) hash0[i] = hash1[i] = hash2[i] = hash3[i] = 0; - intrlv_2x128( vhash, hash0, hash1, 512 ); - luffa_2way_init( &ctx.luffa, 512 ); - luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 ); - dintrlv_2x128( hash0, hash1, vhash, 512 ); - intrlv_2x128( vhash, hash2, hash3, 512 ); - luffa_2way_init( &ctx.luffa, 512 ); - luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 ); - dintrlv_2x128( hash2, 
hash3, vhash, 512 ); + intrlv_2x128( vhash, hash0, hash1, 512 ); + luffa_2way_init( &ctx.luffa, 512 ); + luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 ); + dintrlv_2x128( hash0, hash1, vhash, 512 ); + intrlv_2x128( vhash, hash2, hash3, 512 ); + luffa_2way_init( &ctx.luffa, 512 ); + luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 ); + dintrlv_2x128( hash2, hash3, vhash, 512 ); break; case CUBEHASH: if ( i == 0 ) @@ -955,18 +907,10 @@ void x16rv2_4way_hash( void* output, const void* input ) } break; case SHAVITE: - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, in0, size ); - sph_shavite512_close( &ctx.shavite, hash0 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, in1, size ); - sph_shavite512_close( &ctx.shavite, hash1 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, in2, size ); - sph_shavite512_close( &ctx.shavite, hash2 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, in3, size ); - sph_shavite512_close( &ctx.shavite, hash3 ); + shavite512_full( &ctx.shavite, hash0, in0, size ); + shavite512_full( &ctx.shavite, hash1, in1, size ); + shavite512_full( &ctx.shavite, hash2, in2, size ); + shavite512_full( &ctx.shavite, hash3, in3, size ); break; case SIMD: intrlv_2x128( vhash, in0, in1, size<<3 ); @@ -999,18 +943,10 @@ void x16rv2_4way_hash( void* output, const void* input ) dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); break; case FUGUE: - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, in0, size ); - sph_fugue512_close( &ctx.fugue, hash0 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, in1, size ); - sph_fugue512_close( &ctx.fugue, hash1 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, in2, size ); - sph_fugue512_close( &ctx.fugue, hash2 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, in3, size ); - sph_fugue512_close( &ctx.fugue, hash3 ); + sph_fugue512_full( &ctx.fugue, hash0, 
in0, size ); + sph_fugue512_full( &ctx.fugue, hash1, in1, size ); + sph_fugue512_full( &ctx.fugue, hash2, in2, size ); + sph_fugue512_full( &ctx.fugue, hash3, in3, size ); break; case SHABAL: intrlv_4x32( vhash, in0, in1, in2, in3, size<<3 ); @@ -1041,18 +977,10 @@ void x16rv2_4way_hash( void* output, const void* input ) } else { - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, in0, size ); - sph_whirlpool_close( &ctx.whirlpool, hash0 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, in1, size ); - sph_whirlpool_close( &ctx.whirlpool, hash1 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, in2, size ); - sph_whirlpool_close( &ctx.whirlpool, hash2 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, in3, size ); - sph_whirlpool_close( &ctx.whirlpool, hash3 ); + sph_whirlpool512_full( &ctx.whirlpool, hash0, in0, size ); + sph_whirlpool512_full( &ctx.whirlpool, hash1, in1, size ); + sph_whirlpool512_full( &ctx.whirlpool, hash2, in2, size ); + sph_whirlpool512_full( &ctx.whirlpool, hash3, in3, size ); } break; case SHA_512: @@ -1117,7 +1045,7 @@ int scanhash_x16rv2_4way( struct work *work, uint32_t max_nonce, const uint32_t last_nonce = max_nonce - 4; uint32_t n = first_nonce; const int thr_id = mythr->id; - __m256i *noncev = (__m256i*)vdata + 9; // aligned + __m256i *noncev = (__m256i*)vdata + 9; volatile uint8_t *restart = &(work_restart[thr_id].restart); const bool bench = opt_benchmark; @@ -1134,7 +1062,7 @@ int scanhash_x16rv2_4way( struct work *work, uint32_t max_nonce, x16_r_s_getAlgoString( (const uint8_t*)bedata1, x16r_hash_order ); s_ntime = ntime; if ( opt_debug && !thr_id ) - applog( LOG_DEBUG, "hash order %s (%08x)", x16r_hash_order, ntime ); + applog( LOG_INFO, "hash order %s (%08x)", x16r_hash_order, ntime ); } // Do midstate prehash on hash functions with block size <= 64 bytes. 
@@ -1157,8 +1085,7 @@ int scanhash_x16rv2_4way( struct work *work, uint32_t max_nonce, break; case SKEIN: mm256_bswap32_intrlv80_4x64( vdata, pdata ); - skein512_4way_init( &x16rv2_ctx.skein ); - skein512_4way_update( &x16rv2_ctx.skein, vdata, 64 ); + skein512_4way_prehash64( &x16r_ctx.skein, vdata ); break; case CUBEHASH: mm128_bswap32_80( edata, pdata ); diff --git a/algo/x16/x16rv2.c b/algo/x16/x16rv2.c index e56dd1db..2c85c885 100644 --- a/algo/x16/x16rv2.c +++ b/algo/x16/x16rv2.c @@ -51,7 +51,7 @@ union _x16rv2_context_overlay sph_keccak512_context keccak; hashState_luffa luffa; cubehashParam cube; - sph_shavite512_context shavite; + shavite512_context shavite; hashState_sd simd; sph_hamsi512_context hamsi; sph_fugue512_context fugue; @@ -136,9 +136,7 @@ void x16rv2_hash( void* output, const void* input ) (const byte*)in, size ); break; case SHAVITE: - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, in, size ); - sph_shavite512_close( &ctx.shavite, hash ); + shavite512_full( &ctx.shavite, hash, in, size ); break; case SIMD: init_sd( &ctx.simd, 512 ); @@ -162,9 +160,7 @@ void x16rv2_hash( void* output, const void* input ) sph_hamsi512_close( &ctx.hamsi, hash ); break; case FUGUE: - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, in, size ); - sph_fugue512_close( &ctx.fugue, hash ); + sph_fugue512_full( &ctx.fugue, hash, in, size ); break; case SHABAL: sph_shabal512_init( &ctx.shabal ); @@ -172,9 +168,7 @@ void x16rv2_hash( void* output, const void* input ) sph_shabal512_close( &ctx.shabal, hash ); break; case WHIRLPOOL: - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, in, size ); - sph_whirlpool_close( &ctx.whirlpool, hash ); + sph_whirlpool512_full( &ctx.whirlpool, hash, in, size ); break; case SHA_512: sph_tiger_init( &ctx.tiger ); diff --git a/algo/x17/sonoa-4way.c b/algo/x17/sonoa-4way.c index ce01f58b..9b9380bc 100644 --- a/algo/x17/sonoa-4way.c +++ b/algo/x17/sonoa-4way.c @@ -127,40 +127,22 @@ void 
sonoa_8way_hash( void *state, const void *input ) #if defined(__VAES__) - shavite512_4way_init( &ctx.shavite ); - shavite512_4way_update_close( &ctx.shavite, vhashA, vhashA, 64 ); - shavite512_4way_init( &ctx.shavite ); - shavite512_4way_update_close( &ctx.shavite, vhashB, vhashB, 64 ); + shavite512_4way_full( &ctx.shavite, vhashA, vhashA, 64 ); + shavite512_4way_full( &ctx.shavite, vhashB, vhashB, 64 ); #else dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA ); dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash0, 64 ); - sph_shavite512_close( &ctx.shavite, hash0 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash1, 64 ); - sph_shavite512_close( &ctx.shavite, hash1 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash2, 64 ); - sph_shavite512_close( &ctx.shavite, hash2 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash3, 64 ); - sph_shavite512_close( &ctx.shavite, hash3 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash4, 64 ); - sph_shavite512_close( &ctx.shavite, hash4 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash5, 64 ); - sph_shavite512_close( &ctx.shavite, hash5 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash6, 64 ); - sph_shavite512_close( &ctx.shavite, hash6 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash7, 64 ); - sph_shavite512_close( &ctx.shavite, hash7 ); + shavite512_full( &ctx.shavite, hash0, hash0, 64 ); + shavite512_full( &ctx.shavite, hash1, hash1, 64 ); + shavite512_full( &ctx.shavite, hash2, hash2, 64 ); + shavite512_full( &ctx.shavite, hash3, hash3, 64 ); + shavite512_full( &ctx.shavite, hash4, hash4, 64 ); + shavite512_full( &ctx.shavite, hash5, hash5, 64 ); + shavite512_full( &ctx.shavite, hash6, hash6, 64 ); + shavite512_full( &ctx.shavite, hash7, hash7, 64 ); 
intrlv_4x128_512( vhashA, hash0, hash1, hash2, hash3 ); intrlv_4x128_512( vhashB, hash4, hash5, hash6, hash7 ); @@ -236,9 +218,7 @@ void sonoa_8way_hash( void *state, const void *input ) #endif - skein512_8way_init( &ctx.skein ); - skein512_8way_update( &ctx.skein, vhash, 64 ); - skein512_8way_close( &ctx.skein, vhash ); + skein512_8way_full( &ctx.skein, vhash, vhash, 64 ); jh512_8way_init( &ctx.jh ); jh512_8way_update( &ctx.jh, vhash, 64 ); @@ -258,40 +238,22 @@ void sonoa_8way_hash( void *state, const void *input ) #if defined(__VAES__) - shavite512_4way_init( &ctx.shavite ); - shavite512_4way_update_close( &ctx.shavite, vhashA, vhashA, 64 ); - shavite512_4way_init( &ctx.shavite ); - shavite512_4way_update_close( &ctx.shavite, vhashB, vhashB, 64 ); + shavite512_4way_full( &ctx.shavite, vhashA, vhashA, 64 ); + shavite512_4way_full( &ctx.shavite, vhashB, vhashB, 64 ); #else dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA ); dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash0, 64 ); - sph_shavite512_close( &ctx.shavite, hash0 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash1, 64 ); - sph_shavite512_close( &ctx.shavite, hash1 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash2, 64 ); - sph_shavite512_close( &ctx.shavite, hash2 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash3, 64 ); - sph_shavite512_close( &ctx.shavite, hash3 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash4, 64 ); - sph_shavite512_close( &ctx.shavite, hash4 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash5, 64 ); - sph_shavite512_close( &ctx.shavite, hash5 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash6, 64 ); - sph_shavite512_close( &ctx.shavite, hash6 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash7, 64 
); - sph_shavite512_close( &ctx.shavite, hash7 ); + shavite512_full( &ctx.shavite, hash0, hash0, 64 ); + shavite512_full( &ctx.shavite, hash1, hash1, 64 ); + shavite512_full( &ctx.shavite, hash2, hash2, 64 ); + shavite512_full( &ctx.shavite, hash3, hash3, 64 ); + shavite512_full( &ctx.shavite, hash4, hash4, 64 ); + shavite512_full( &ctx.shavite, hash5, hash5, 64 ); + shavite512_full( &ctx.shavite, hash6, hash6, 64 ); + shavite512_full( &ctx.shavite, hash7, hash7, 64 ); intrlv_4x128_512( vhashA, hash0, hash1, hash2, hash3 ); intrlv_4x128_512( vhashB, hash4, hash5, hash6, hash7 ); @@ -393,40 +355,22 @@ void sonoa_8way_hash( void *state, const void *input ) #if defined(__VAES__) - shavite512_4way_init( &ctx.shavite ); - shavite512_4way_update_close( &ctx.shavite, vhashA, vhashA, 64 ); - shavite512_4way_init( &ctx.shavite ); - shavite512_4way_update_close( &ctx.shavite, vhashB, vhashB, 64 ); + shavite512_4way_full( &ctx.shavite, vhashA, vhashA, 64 ); + shavite512_4way_full( &ctx.shavite, vhashB, vhashB, 64 ); #else dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA ); dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash0, 64 ); - sph_shavite512_close( &ctx.shavite, hash0 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash1, 64 ); - sph_shavite512_close( &ctx.shavite, hash1 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash2, 64 ); - sph_shavite512_close( &ctx.shavite, hash2 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash3, 64 ); - sph_shavite512_close( &ctx.shavite, hash3 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash4, 64 ); - sph_shavite512_close( &ctx.shavite, hash4 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash5, 64 ); - sph_shavite512_close( &ctx.shavite, hash5 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, 
hash6, 64 ); - sph_shavite512_close( &ctx.shavite, hash6 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash7, 64 ); - sph_shavite512_close( &ctx.shavite, hash7 ); + shavite512_full( &ctx.shavite, hash0, hash0, 64 ); + shavite512_full( &ctx.shavite, hash1, hash1, 64 ); + shavite512_full( &ctx.shavite, hash2, hash2, 64 ); + shavite512_full( &ctx.shavite, hash3, hash3, 64 ); + shavite512_full( &ctx.shavite, hash4, hash4, 64 ); + shavite512_full( &ctx.shavite, hash5, hash5, 64 ); + shavite512_full( &ctx.shavite, hash6, hash6, 64 ); + shavite512_full( &ctx.shavite, hash7, hash7, 64 ); intrlv_4x128_512( vhashA, hash0, hash1, hash2, hash3 ); intrlv_4x128_512( vhashB, hash4, hash5, hash6, hash7 ); @@ -477,30 +421,14 @@ void sonoa_8way_hash( void *state, const void *input ) dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, vhash ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash0, 64 ); - sph_fugue512_close( &ctx.fugue, hash0 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash1, 64 ); - sph_fugue512_close( &ctx.fugue, hash1 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash2, 64 ); - sph_fugue512_close( &ctx.fugue, hash2 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash3, 64 ); - sph_fugue512_close( &ctx.fugue, hash3 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash4, 64 ); - sph_fugue512_close( &ctx.fugue, hash4 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash5, 64 ); - sph_fugue512_close( &ctx.fugue, hash5 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash6, 64 ); - sph_fugue512_close( &ctx.fugue, hash6 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash7, 64 ); - sph_fugue512_close( &ctx.fugue, hash7 ); + sph_fugue512_full( &ctx.fugue, hash0, hash0, 64 ); + sph_fugue512_full( &ctx.fugue, hash1, hash1, 64 ); + sph_fugue512_full( &ctx.fugue, hash2, hash2, 64 ); + 
sph_fugue512_full( &ctx.fugue, hash3, hash3, 64 ); + sph_fugue512_full( &ctx.fugue, hash4, hash4, 64 ); + sph_fugue512_full( &ctx.fugue, hash5, hash5, 64 ); + sph_fugue512_full( &ctx.fugue, hash6, hash6, 64 ); + sph_fugue512_full( &ctx.fugue, hash7, hash7, 64 ); // 4 @@ -537,9 +465,7 @@ void sonoa_8way_hash( void *state, const void *input ) #endif - skein512_8way_init( &ctx.skein ); - skein512_8way_update( &ctx.skein, vhash, 64 ); - skein512_8way_close( &ctx.skein, vhash ); + skein512_8way_full( &ctx.skein, vhash, vhash, 64 ); jh512_8way_init( &ctx.jh ); jh512_8way_update( &ctx.jh, vhash, 64 ); @@ -559,40 +485,22 @@ void sonoa_8way_hash( void *state, const void *input ) #if defined(__VAES__) - shavite512_4way_init( &ctx.shavite ); - shavite512_4way_update_close( &ctx.shavite, vhashA, vhashA, 64 ); - shavite512_4way_init( &ctx.shavite ); - shavite512_4way_update_close( &ctx.shavite, vhashB, vhashB, 64 ); + shavite512_4way_full( &ctx.shavite, vhashA, vhashA, 64 ); + shavite512_4way_full( &ctx.shavite, vhashB, vhashB, 64 ); #else dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA ); dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash0, 64 ); - sph_shavite512_close( &ctx.shavite, hash0 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash1, 64 ); - sph_shavite512_close( &ctx.shavite, hash1 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash2, 64 ); - sph_shavite512_close( &ctx.shavite, hash2 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash3, 64 ); - sph_shavite512_close( &ctx.shavite, hash3 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash4, 64 ); - sph_shavite512_close( &ctx.shavite, hash4 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash5, 64 ); - sph_shavite512_close( &ctx.shavite, hash5 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( 
&ctx.shavite, hash6, 64 ); - sph_shavite512_close( &ctx.shavite, hash6 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash7, 64 ); - sph_shavite512_close( &ctx.shavite, hash7 ); + shavite512_full( &ctx.shavite, hash0, hash0, 64 ); + shavite512_full( &ctx.shavite, hash1, hash1, 64 ); + shavite512_full( &ctx.shavite, hash2, hash2, 64 ); + shavite512_full( &ctx.shavite, hash3, hash3, 64 ); + shavite512_full( &ctx.shavite, hash4, hash4, 64 ); + shavite512_full( &ctx.shavite, hash5, hash5, 64 ); + shavite512_full( &ctx.shavite, hash6, hash6, 64 ); + shavite512_full( &ctx.shavite, hash7, hash7, 64 ); intrlv_4x128_512( vhashA, hash0, hash1, hash2, hash3 ); intrlv_4x128_512( vhashB, hash4, hash5, hash6, hash7 ); @@ -643,30 +551,14 @@ void sonoa_8way_hash( void *state, const void *input ) dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, vhash ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash0, 64 ); - sph_fugue512_close( &ctx.fugue, hash0 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash1, 64 ); - sph_fugue512_close( &ctx.fugue, hash1 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash2, 64 ); - sph_fugue512_close( &ctx.fugue, hash2 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash3, 64 ); - sph_fugue512_close( &ctx.fugue, hash3 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash4, 64 ); - sph_fugue512_close( &ctx.fugue, hash4 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash5, 64 ); - sph_fugue512_close( &ctx.fugue, hash5 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash6, 64 ); - sph_fugue512_close( &ctx.fugue, hash6 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash7, 64 ); - sph_fugue512_close( &ctx.fugue, hash7 ); + sph_fugue512_full( &ctx.fugue, hash0, hash0, 64 ); + sph_fugue512_full( &ctx.fugue, hash1, hash1, 64 ); + sph_fugue512_full( &ctx.fugue, hash2, 
hash2, 64 ); + sph_fugue512_full( &ctx.fugue, hash3, hash3, 64 ); + sph_fugue512_full( &ctx.fugue, hash4, hash4, 64 ); + sph_fugue512_full( &ctx.fugue, hash5, hash5, 64 ); + sph_fugue512_full( &ctx.fugue, hash6, hash6, 64 ); + sph_fugue512_full( &ctx.fugue, hash7, hash7, 64 ); intrlv_8x32_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7 ); @@ -714,39 +606,21 @@ void sonoa_8way_hash( void *state, const void *input ) #if defined(__VAES__) - shavite512_4way_init( &ctx.shavite ); - shavite512_4way_update_close( &ctx.shavite, vhashA, vhashA, 64 ); - shavite512_4way_init( &ctx.shavite ); - shavite512_4way_update_close( &ctx.shavite, vhashB, vhashB, 64 ); + shavite512_4way_full( &ctx.shavite, vhashA, vhashA, 64 ); + shavite512_4way_full( &ctx.shavite, vhashB, vhashB, 64 ); rintrlv_4x128_8x64( vhash, vhashA, vhashB, 512 ); #else - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash0, 64 ); - sph_shavite512_close( &ctx.shavite, hash0 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash1, 64 ); - sph_shavite512_close( &ctx.shavite, hash1 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash2, 64 ); - sph_shavite512_close( &ctx.shavite, hash2 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash3, 64 ); - sph_shavite512_close( &ctx.shavite, hash3 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash4, 64 ); - sph_shavite512_close( &ctx.shavite, hash4 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash5, 64 ); - sph_shavite512_close( &ctx.shavite, hash5 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash6, 64 ); - sph_shavite512_close( &ctx.shavite, hash6 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash7, 64 ); - sph_shavite512_close( &ctx.shavite, hash7 ); + shavite512_full( &ctx.shavite, hash0, hash0, 64 ); + shavite512_full( &ctx.shavite, hash1, hash1, 
64 ); + shavite512_full( &ctx.shavite, hash2, hash2, 64 ); + shavite512_full( &ctx.shavite, hash3, hash3, 64 ); + shavite512_full( &ctx.shavite, hash4, hash4, 64 ); + shavite512_full( &ctx.shavite, hash5, hash5, 64 ); + shavite512_full( &ctx.shavite, hash6, hash6, 64 ); + shavite512_full( &ctx.shavite, hash7, hash7, 64 ); intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7 ); @@ -791,9 +665,7 @@ void sonoa_8way_hash( void *state, const void *input ) #endif - skein512_8way_init( &ctx.skein ); - skein512_8way_update( &ctx.skein, vhash, 64 ); - skein512_8way_close( &ctx.skein, vhash ); + skein512_8way_full( &ctx.skein, vhash, vhash, 64 ); jh512_8way_init( &ctx.jh ); jh512_8way_update( &ctx.jh, vhash, 64 ); @@ -813,40 +685,22 @@ void sonoa_8way_hash( void *state, const void *input ) #if defined(__VAES__) - shavite512_4way_init( &ctx.shavite ); - shavite512_4way_update_close( &ctx.shavite, vhashA, vhashA, 64 ); - shavite512_4way_init( &ctx.shavite ); - shavite512_4way_update_close( &ctx.shavite, vhashB, vhashB, 64 ); + shavite512_4way_full( &ctx.shavite, vhashA, vhashA, 64 ); + shavite512_4way_full( &ctx.shavite, vhashB, vhashB, 64 ); #else dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA ); dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash0, 64 ); - sph_shavite512_close( &ctx.shavite, hash0 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash1, 64 ); - sph_shavite512_close( &ctx.shavite, hash1 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash2, 64 ); - sph_shavite512_close( &ctx.shavite, hash2 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash3, 64 ); - sph_shavite512_close( &ctx.shavite, hash3 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash4, 64 ); - sph_shavite512_close( &ctx.shavite, hash4 ); - sph_shavite512_init( &ctx.shavite ); - 
sph_shavite512( &ctx.shavite, hash5, 64 ); - sph_shavite512_close( &ctx.shavite, hash5 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash6, 64 ); - sph_shavite512_close( &ctx.shavite, hash6 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash7, 64 ); - sph_shavite512_close( &ctx.shavite, hash7 ); + shavite512_full( &ctx.shavite, hash0, hash0, 64 ); + shavite512_full( &ctx.shavite, hash1, hash1, 64 ); + shavite512_full( &ctx.shavite, hash2, hash2, 64 ); + shavite512_full( &ctx.shavite, hash3, hash3, 64 ); + shavite512_full( &ctx.shavite, hash4, hash4, 64 ); + shavite512_full( &ctx.shavite, hash5, hash5, 64 ); + shavite512_full( &ctx.shavite, hash6, hash6, 64 ); + shavite512_full( &ctx.shavite, hash7, hash7, 64 ); intrlv_4x128_512( vhashA, hash0, hash1, hash2, hash3 ); intrlv_4x128_512( vhashB, hash4, hash5, hash6, hash7 ); @@ -897,30 +751,14 @@ void sonoa_8way_hash( void *state, const void *input ) dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, vhash ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash0, 64 ); - sph_fugue512_close( &ctx.fugue, hash0 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash1, 64 ); - sph_fugue512_close( &ctx.fugue, hash1 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash2, 64 ); - sph_fugue512_close( &ctx.fugue, hash2 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash3, 64 ); - sph_fugue512_close( &ctx.fugue, hash3 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash4, 64 ); - sph_fugue512_close( &ctx.fugue, hash4 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash5, 64 ); - sph_fugue512_close( &ctx.fugue, hash5 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash6, 64 ); - sph_fugue512_close( &ctx.fugue, hash6 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash7, 64 ); - sph_fugue512_close( &ctx.fugue, hash7 ); 
+ sph_fugue512_full( &ctx.fugue, hash0, hash0, 64 ); + sph_fugue512_full( &ctx.fugue, hash1, hash1, 64 ); + sph_fugue512_full( &ctx.fugue, hash2, hash2, 64 ); + sph_fugue512_full( &ctx.fugue, hash3, hash3, 64 ); + sph_fugue512_full( &ctx.fugue, hash4, hash4, 64 ); + sph_fugue512_full( &ctx.fugue, hash5, hash5, 64 ); + sph_fugue512_full( &ctx.fugue, hash6, hash6, 64 ); + sph_fugue512_full( &ctx.fugue, hash7, hash7, 64 ); intrlv_8x32_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7 ); @@ -932,30 +770,14 @@ void sonoa_8way_hash( void *state, const void *input ) dintrlv_8x32_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, vhash ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash0, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash0 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash1, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash1 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash2, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash2 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash3, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash3 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash4, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash4 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash5, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash5 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash6, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash6 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash7, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash7 ); + sph_whirlpool512_full( &ctx.whirlpool, hash0, hash0, 64 ); + sph_whirlpool512_full( &ctx.whirlpool, hash1, hash1, 64 ); + sph_whirlpool512_full( &ctx.whirlpool, hash2, hash2, 64 ); + sph_whirlpool512_full( &ctx.whirlpool, hash3, hash3, 64 ); + 
sph_whirlpool512_full( &ctx.whirlpool, hash4, hash4, 64 ); + sph_whirlpool512_full( &ctx.whirlpool, hash5, hash5, 64 ); + sph_whirlpool512_full( &ctx.whirlpool, hash6, hash6, 64 ); + sph_whirlpool512_full( &ctx.whirlpool, hash7, hash7, 64 ); // 6 @@ -992,9 +814,7 @@ void sonoa_8way_hash( void *state, const void *input ) #endif - skein512_8way_init( &ctx.skein ); - skein512_8way_update( &ctx.skein, vhash, 64 ); - skein512_8way_close( &ctx.skein, vhash ); + skein512_8way_full( &ctx.skein, vhash, vhash, 64 ); jh512_8way_init( &ctx.jh ); jh512_8way_update( &ctx.jh, vhash, 64 ); @@ -1014,40 +834,22 @@ void sonoa_8way_hash( void *state, const void *input ) #if defined(__VAES__) - shavite512_4way_init( &ctx.shavite ); - shavite512_4way_update_close( &ctx.shavite, vhashA, vhashA, 64 ); - shavite512_4way_init( &ctx.shavite ); - shavite512_4way_update_close( &ctx.shavite, vhashB, vhashB, 64 ); + shavite512_4way_full( &ctx.shavite, vhashA, vhashA, 64 ); + shavite512_4way_full( &ctx.shavite, vhashB, vhashB, 64 ); #else dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA ); dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash0, 64 ); - sph_shavite512_close( &ctx.shavite, hash0 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash1, 64 ); - sph_shavite512_close( &ctx.shavite, hash1 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash2, 64 ); - sph_shavite512_close( &ctx.shavite, hash2 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash3, 64 ); - sph_shavite512_close( &ctx.shavite, hash3 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash4, 64 ); - sph_shavite512_close( &ctx.shavite, hash4 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash5, 64 ); - sph_shavite512_close( &ctx.shavite, hash5 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, 
hash6, 64 ); - sph_shavite512_close( &ctx.shavite, hash6 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash7, 64 ); - sph_shavite512_close( &ctx.shavite, hash7 ); + shavite512_full( &ctx.shavite, hash0, hash0, 64 ); + shavite512_full( &ctx.shavite, hash1, hash1, 64 ); + shavite512_full( &ctx.shavite, hash2, hash2, 64 ); + shavite512_full( &ctx.shavite, hash3, hash3, 64 ); + shavite512_full( &ctx.shavite, hash4, hash4, 64 ); + shavite512_full( &ctx.shavite, hash5, hash5, 64 ); + shavite512_full( &ctx.shavite, hash6, hash6, 64 ); + shavite512_full( &ctx.shavite, hash7, hash7, 64 ); intrlv_4x128_512( vhashA, hash0, hash1, hash2, hash3 ); intrlv_4x128_512( vhashB, hash4, hash5, hash6, hash7 ); @@ -1098,30 +900,14 @@ void sonoa_8way_hash( void *state, const void *input ) dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, vhash ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash0, 64 ); - sph_fugue512_close( &ctx.fugue, hash0 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash1, 64 ); - sph_fugue512_close( &ctx.fugue, hash1 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash2, 64 ); - sph_fugue512_close( &ctx.fugue, hash2 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash3, 64 ); - sph_fugue512_close( &ctx.fugue, hash3 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash4, 64 ); - sph_fugue512_close( &ctx.fugue, hash4 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash5, 64 ); - sph_fugue512_close( &ctx.fugue, hash5 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash6, 64 ); - sph_fugue512_close( &ctx.fugue, hash6 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash7, 64 ); - sph_fugue512_close( &ctx.fugue, hash7 ); + sph_fugue512_full( &ctx.fugue, hash0, hash0, 64 ); + sph_fugue512_full( &ctx.fugue, hash1, hash1, 64 ); + sph_fugue512_full( &ctx.fugue, hash2, hash2, 64 ); 
+ sph_fugue512_full( &ctx.fugue, hash3, hash3, 64 ); + sph_fugue512_full( &ctx.fugue, hash4, hash4, 64 ); + sph_fugue512_full( &ctx.fugue, hash5, hash5, 64 ); + sph_fugue512_full( &ctx.fugue, hash6, hash6, 64 ); + sph_fugue512_full( &ctx.fugue, hash7, hash7, 64 ); intrlv_8x32_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7 ); @@ -1133,30 +919,14 @@ void sonoa_8way_hash( void *state, const void *input ) dintrlv_8x32_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, vhash ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash0, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash0 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash1, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash1 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash2, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash2 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash3, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash3 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash4, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash4 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash5, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash5 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash6, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash6 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash7, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash7 ); + sph_whirlpool512_full( &ctx.whirlpool, hash0, hash0, 64 ); + sph_whirlpool512_full( &ctx.whirlpool, hash1, hash1, 64 ); + sph_whirlpool512_full( &ctx.whirlpool, hash2, hash2, 64 ); + sph_whirlpool512_full( &ctx.whirlpool, hash3, hash3, 64 ); + sph_whirlpool512_full( &ctx.whirlpool, hash4, hash4, 64 ); + sph_whirlpool512_full( &ctx.whirlpool, hash5, hash5, 64 ); + sph_whirlpool512_full( &ctx.whirlpool, hash6, 
hash6, 64 ); + sph_whirlpool512_full( &ctx.whirlpool, hash7, hash7, 64 ); intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7 ); @@ -1168,30 +938,14 @@ void sonoa_8way_hash( void *state, const void *input ) dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, vhash ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash0, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash0 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash1, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash1 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash2, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash2 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash3, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash3 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash4, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash4 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash5, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash5 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash6, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash6 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash7, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash7 ); + sph_whirlpool512_full( &ctx.whirlpool, hash0, hash0, 64 ); + sph_whirlpool512_full( &ctx.whirlpool, hash1, hash1, 64 ); + sph_whirlpool512_full( &ctx.whirlpool, hash2, hash2, 64 ); + sph_whirlpool512_full( &ctx.whirlpool, hash3, hash3, 64 ); + sph_whirlpool512_full( &ctx.whirlpool, hash4, hash4, 64 ); + sph_whirlpool512_full( &ctx.whirlpool, hash5, hash5, 64 ); + sph_whirlpool512_full( &ctx.whirlpool, hash6, hash6, 64 ); + sph_whirlpool512_full( &ctx.whirlpool, hash7, hash7, 64 ); // 7 @@ -1248,40 +1002,22 @@ void sonoa_8way_hash( void *state, const void *input ) #if defined(__VAES__) - 
shavite512_4way_init( &ctx.shavite ); - shavite512_4way_update_close( &ctx.shavite, vhashA, vhashA, 64 ); - shavite512_4way_init( &ctx.shavite ); - shavite512_4way_update_close( &ctx.shavite, vhashB, vhashB, 64 ); + shavite512_4way_full( &ctx.shavite, vhashA, vhashA, 64 ); + shavite512_4way_full( &ctx.shavite, vhashB, vhashB, 64 ); #else dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA ); dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash0, 64 ); - sph_shavite512_close( &ctx.shavite, hash0 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash1, 64 ); - sph_shavite512_close( &ctx.shavite, hash1 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash2, 64 ); - sph_shavite512_close( &ctx.shavite, hash2 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash3, 64 ); - sph_shavite512_close( &ctx.shavite, hash3 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash4, 64 ); - sph_shavite512_close( &ctx.shavite, hash4 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash5, 64 ); - sph_shavite512_close( &ctx.shavite, hash5 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash6, 64 ); - sph_shavite512_close( &ctx.shavite, hash6 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash7, 64 ); - sph_shavite512_close( &ctx.shavite, hash7 ); + shavite512_full( &ctx.shavite, hash0, hash0, 64 ); + shavite512_full( &ctx.shavite, hash1, hash1, 64 ); + shavite512_full( &ctx.shavite, hash2, hash2, 64 ); + shavite512_full( &ctx.shavite, hash3, hash3, 64 ); + shavite512_full( &ctx.shavite, hash4, hash4, 64 ); + shavite512_full( &ctx.shavite, hash5, hash5, 64 ); + shavite512_full( &ctx.shavite, hash6, hash6, 64 ); + shavite512_full( &ctx.shavite, hash7, hash7, 64 ); intrlv_4x128_512( vhashA, hash0, hash1, hash2, hash3 ); intrlv_4x128_512( 
vhashB, hash4, hash5, hash6, hash7 ); @@ -1332,30 +1068,14 @@ void sonoa_8way_hash( void *state, const void *input ) dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, vhash ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash0, 64 ); - sph_fugue512_close( &ctx.fugue, hash0 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash1, 64 ); - sph_fugue512_close( &ctx.fugue, hash1 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash2, 64 ); - sph_fugue512_close( &ctx.fugue, hash2 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash3, 64 ); - sph_fugue512_close( &ctx.fugue, hash3 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash4, 64 ); - sph_fugue512_close( &ctx.fugue, hash4 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash5, 64 ); - sph_fugue512_close( &ctx.fugue, hash5 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash6, 64 ); - sph_fugue512_close( &ctx.fugue, hash6 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash7, 64 ); - sph_fugue512_close( &ctx.fugue, hash7 ); + sph_fugue512_full( &ctx.fugue, hash0, hash0, 64 ); + sph_fugue512_full( &ctx.fugue, hash1, hash1, 64 ); + sph_fugue512_full( &ctx.fugue, hash2, hash2, 64 ); + sph_fugue512_full( &ctx.fugue, hash3, hash3, 64 ); + sph_fugue512_full( &ctx.fugue, hash4, hash4, 64 ); + sph_fugue512_full( &ctx.fugue, hash5, hash5, 64 ); + sph_fugue512_full( &ctx.fugue, hash6, hash6, 64 ); + sph_fugue512_full( &ctx.fugue, hash7, hash7, 64 ); intrlv_8x32_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7 ); @@ -1367,30 +1087,14 @@ void sonoa_8way_hash( void *state, const void *input ) dintrlv_8x32_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, vhash ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash0, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash0 ); - sph_whirlpool_init( &ctx.whirlpool ); - 
sph_whirlpool( &ctx.whirlpool, hash1, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash1 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash2, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash2 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash3, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash3 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash4, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash4 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash5, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash5 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash6, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash6 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash7, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash7 ); + sph_whirlpool512_full( &ctx.whirlpool, hash0, hash0, 64 ); + sph_whirlpool512_full( &ctx.whirlpool, hash1, hash1, 64 ); + sph_whirlpool512_full( &ctx.whirlpool, hash2, hash2, 64 ); + sph_whirlpool512_full( &ctx.whirlpool, hash3, hash3, 64 ); + sph_whirlpool512_full( &ctx.whirlpool, hash4, hash4, 64 ); + sph_whirlpool512_full( &ctx.whirlpool, hash5, hash5, 64 ); + sph_whirlpool512_full( &ctx.whirlpool, hash6, hash6, 64 ); + sph_whirlpool512_full( &ctx.whirlpool, hash7, hash7, 64 ); intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7 ); @@ -1657,18 +1361,10 @@ void sonoa_4way_hash( void *state, const void *input ) dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash0, 64 ); - sph_fugue512_close( &ctx.fugue, hash0 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash1, 64 ); - sph_fugue512_close( &ctx.fugue, hash1 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash2, 64 ); - sph_fugue512_close( &ctx.fugue, hash2 ); - sph_fugue512_init( &ctx.fugue ); - 
sph_fugue512( &ctx.fugue, hash3, 64 ); - sph_fugue512_close( &ctx.fugue, hash3 ); + sph_fugue512_full( &ctx.fugue, hash0, hash0, 64 ); + sph_fugue512_full( &ctx.fugue, hash1, hash1, 64 ); + sph_fugue512_full( &ctx.fugue, hash2, hash2, 64 ); + sph_fugue512_full( &ctx.fugue, hash3, hash3, 64 ); // 4 intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 ); @@ -1730,18 +1426,10 @@ void sonoa_4way_hash( void *state, const void *input ) dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash0, 64 ); - sph_fugue512_close( &ctx.fugue, hash0 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash1, 64 ); - sph_fugue512_close( &ctx.fugue, hash1 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash2, 64 ); - sph_fugue512_close( &ctx.fugue, hash2 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash3, 64 ); - sph_fugue512_close( &ctx.fugue, hash3 ); + sph_fugue512_full( &ctx.fugue, hash0, hash0, 64 ); + sph_fugue512_full( &ctx.fugue, hash1, hash1, 64 ); + sph_fugue512_full( &ctx.fugue, hash2, hash2, 64 ); + sph_fugue512_full( &ctx.fugue, hash3, hash3, 64 ); intrlv_4x32_512( vhash, hash0, hash1, hash2, hash3 ); @@ -1840,18 +1528,10 @@ void sonoa_4way_hash( void *state, const void *input ) dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash0, 64 ); - sph_fugue512_close( &ctx.fugue, hash0 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash1, 64 ); - sph_fugue512_close( &ctx.fugue, hash1 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash2, 64 ); - sph_fugue512_close( &ctx.fugue, hash2 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash3, 64 ); - sph_fugue512_close( &ctx.fugue, hash3 ); + sph_fugue512_full( &ctx.fugue, hash0, hash0, 64 ); + sph_fugue512_full( &ctx.fugue, hash1, hash1, 64 ); + sph_fugue512_full( &ctx.fugue, hash2, hash2, 64 ); + 
sph_fugue512_full( &ctx.fugue, hash3, hash3, 64 ); intrlv_4x32_512( vhash, hash0, hash1, hash2, hash3 ); @@ -1861,18 +1541,10 @@ void sonoa_4way_hash( void *state, const void *input ) dintrlv_4x32_512( hash0, hash1, hash2, hash3, vhash ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash0, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash0 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash1, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash1 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash2, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash2 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash3, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash3 ); + sph_whirlpool512_full( &ctx.whirlpool, hash0, hash0, 64 ); + sph_whirlpool512_full( &ctx.whirlpool, hash1, hash1, 64 ); + sph_whirlpool512_full( &ctx.whirlpool, hash2, hash2, 64 ); + sph_whirlpool512_full( &ctx.whirlpool, hash3, hash3, 64 ); // 6 @@ -1935,18 +1607,10 @@ void sonoa_4way_hash( void *state, const void *input ) dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash0, 64 ); - sph_fugue512_close( &ctx.fugue, hash0 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash1, 64 ); - sph_fugue512_close( &ctx.fugue, hash1 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash2, 64 ); - sph_fugue512_close( &ctx.fugue, hash2 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash3, 64 ); - sph_fugue512_close( &ctx.fugue, hash3 ); + sph_fugue512_full( &ctx.fugue, hash0, hash0, 64 ); + sph_fugue512_full( &ctx.fugue, hash1, hash1, 64 ); + sph_fugue512_full( &ctx.fugue, hash2, hash2, 64 ); + sph_fugue512_full( &ctx.fugue, hash3, hash3, 64 ); intrlv_4x32_512( vhash, hash0, hash1, hash2, hash3 ); @@ -1956,18 +1620,10 @@ void sonoa_4way_hash( void *state, const void *input ) dintrlv_4x32_512( 
hash0, hash1, hash2, hash3, vhash ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash0, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash0 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash1, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash1 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash2, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash2 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash3, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash3 ); + sph_whirlpool512_full( &ctx.whirlpool, hash0, hash0, 64 ); + sph_whirlpool512_full( &ctx.whirlpool, hash1, hash1, 64 ); + sph_whirlpool512_full( &ctx.whirlpool, hash2, hash2, 64 ); + sph_whirlpool512_full( &ctx.whirlpool, hash3, hash3, 64 ); intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 ); @@ -1977,18 +1633,10 @@ void sonoa_4way_hash( void *state, const void *input ) dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash0, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash0 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash1, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash1 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash2, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash2 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash3, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash3 ); + sph_whirlpool512_full( &ctx.whirlpool, hash0, hash0, 64 ); + sph_whirlpool512_full( &ctx.whirlpool, hash1, hash1, 64 ); + sph_whirlpool512_full( &ctx.whirlpool, hash2, hash2, 64 ); + sph_whirlpool512_full( &ctx.whirlpool, hash3, hash3, 64 ); // 7 @@ -2051,18 +1699,10 @@ void sonoa_4way_hash( void *state, const void *input ) dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash0, 64 ); 
- sph_fugue512_close( &ctx.fugue, hash0 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash1, 64 ); - sph_fugue512_close( &ctx.fugue, hash1 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash2, 64 ); - sph_fugue512_close( &ctx.fugue, hash2 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash3, 64 ); - sph_fugue512_close( &ctx.fugue, hash3 ); + sph_fugue512_full( &ctx.fugue, hash0, hash0, 64 ); + sph_fugue512_full( &ctx.fugue, hash1, hash1, 64 ); + sph_fugue512_full( &ctx.fugue, hash2, hash2, 64 ); + sph_fugue512_full( &ctx.fugue, hash3, hash3, 64 ); intrlv_4x32_512( vhash, hash0, hash1, hash2, hash3 ); @@ -2072,18 +1712,10 @@ void sonoa_4way_hash( void *state, const void *input ) dintrlv_4x32_512( hash0, hash1, hash2, hash3, vhash ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash0, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash0 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash1, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash1 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash2, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash2 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash3, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash3 ); + sph_whirlpool512_full( &ctx.whirlpool, hash0, hash0, 64 ); + sph_whirlpool512_full( &ctx.whirlpool, hash1, hash1, 64 ); + sph_whirlpool512_full( &ctx.whirlpool, hash2, hash2, 64 ); + sph_whirlpool512_full( &ctx.whirlpool, hash3, hash3, 64 ); intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 ); diff --git a/algo/x17/x17-4way.c b/algo/x17/x17-4way.c index 7f1780f8..8846dbd5 100644 --- a/algo/x17/x17-4way.c +++ b/algo/x17/x17-4way.c @@ -132,30 +132,14 @@ void x17_8way_hash( void *state, const void *input ) dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA ); dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB ); - sph_shavite512_init( &ctx.shavite 
); - sph_shavite512( &ctx.shavite, hash0, 64 ); - sph_shavite512_close( &ctx.shavite, hash0 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash1, 64 ); - sph_shavite512_close( &ctx.shavite, hash1 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash2, 64 ); - sph_shavite512_close( &ctx.shavite, hash2 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash3, 64 ); - sph_shavite512_close( &ctx.shavite, hash3 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash4, 64 ); - sph_shavite512_close( &ctx.shavite, hash4 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash5, 64 ); - sph_shavite512_close( &ctx.shavite, hash5 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash6, 64 ); - sph_shavite512_close( &ctx.shavite, hash6 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash7, 64 ); - sph_shavite512_close( &ctx.shavite, hash7 ); + shavite512_full( &ctx.shavite, hash0, hash0, 64 ); + shavite512_full( &ctx.shavite, hash1, hash1, 64 ); + shavite512_full( &ctx.shavite, hash2, hash2, 64 ); + shavite512_full( &ctx.shavite, hash3, hash3, 64 ); + shavite512_full( &ctx.shavite, hash4, hash4, 64 ); + shavite512_full( &ctx.shavite, hash5, hash5, 64 ); + shavite512_full( &ctx.shavite, hash6, hash6, 64 ); + shavite512_full( &ctx.shavite, hash7, hash7, 64 ); intrlv_4x128_512( vhashA, hash0, hash1, hash2, hash3 ); intrlv_4x128_512( vhashB, hash4, hash5, hash6, hash7 ); @@ -206,30 +190,14 @@ void x17_8way_hash( void *state, const void *input ) dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, vhash ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash0, 64 ); - sph_fugue512_close( &ctx.fugue, hash0 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash1, 64 ); - sph_fugue512_close( &ctx.fugue, hash1 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( 
&ctx.fugue, hash2, 64 ); - sph_fugue512_close( &ctx.fugue, hash2 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash3, 64 ); - sph_fugue512_close( &ctx.fugue, hash3 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash4, 64 ); - sph_fugue512_close( &ctx.fugue, hash4 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash5, 64 ); - sph_fugue512_close( &ctx.fugue, hash5 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash6, 64 ); - sph_fugue512_close( &ctx.fugue, hash6 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash7, 64 ); - sph_fugue512_close( &ctx.fugue, hash7 ); + sph_fugue512_full( &ctx.fugue, hash0, hash0, 64 ); + sph_fugue512_full( &ctx.fugue, hash1, hash1, 64 ); + sph_fugue512_full( &ctx.fugue, hash2, hash2, 64 ); + sph_fugue512_full( &ctx.fugue, hash3, hash3, 64 ); + sph_fugue512_full( &ctx.fugue, hash4, hash4, 64 ); + sph_fugue512_full( &ctx.fugue, hash5, hash5, 64 ); + sph_fugue512_full( &ctx.fugue, hash6, hash6, 64 ); + sph_fugue512_full( &ctx.fugue, hash7, hash7, 64 ); intrlv_8x32_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7 ); @@ -241,30 +209,14 @@ void x17_8way_hash( void *state, const void *input ) dintrlv_8x32_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, vhash ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash0, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash0 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash1, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash1 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash2, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash2 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash3, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash3 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash4, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash4 ); - 
sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash5, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash5 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash6, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash6 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash7, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash7 ); + sph_whirlpool512_full( &ctx.whirlpool, hash0, hash0, 64 ); + sph_whirlpool512_full( &ctx.whirlpool, hash1, hash1, 64 ); + sph_whirlpool512_full( &ctx.whirlpool, hash2, hash2, 64 ); + sph_whirlpool512_full( &ctx.whirlpool, hash3, hash3, 64 ); + sph_whirlpool512_full( &ctx.whirlpool, hash4, hash4, 64 ); + sph_whirlpool512_full( &ctx.whirlpool, hash5, hash5, 64 ); + sph_whirlpool512_full( &ctx.whirlpool, hash6, hash6, 64 ); + sph_whirlpool512_full( &ctx.whirlpool, hash7, hash7, 64 ); intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7 ); @@ -283,10 +235,10 @@ void x17_8way_hash( void *state, const void *input ) int scanhash_x17_8way( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ) { - uint32_t hash[8*8] __attribute__ ((aligned (128))); + uint32_t hash32[8*8] __attribute__ ((aligned (128))); uint32_t vdata[20*8] __attribute__ ((aligned (64))); uint32_t lane_hash[8] __attribute__ ((aligned (64))); - uint32_t *hashd7 = &(hash[7*8]); + uint32_t *hash32_d7 = &(hash32[7*8]); uint32_t *pdata = work->data; const uint32_t *ptarget = work->target; const uint32_t first_nonce = pdata[19]; @@ -294,7 +246,7 @@ int scanhash_x17_8way( struct work *work, uint32_t max_nonce, __m512i *noncev = (__m512i*)vdata + 9; uint32_t n = first_nonce; const int thr_id = mythr->id; - const uint32_t targ32 = ptarget[7]; + const uint32_t targ32_d7 = ptarget[7]; const bool bench = opt_benchmark; mm512_bswap32_intrlv80_8x64( vdata, pdata ); @@ -303,12 +255,12 @@ int scanhash_x17_8way( struct work *work, uint32_t max_nonce, n+3, 0, n+2, 0, n+1, 
0, n, 0 ), *noncev ); do { - x17_8way_hash( hash, vdata ); + x17_8way_hash( hash32, vdata ); for ( int lane = 0; lane < 8; lane++ ) - if ( unlikely( ( hashd7[ lane ] <= targ32 ) && !bench ) ) + if ( unlikely( ( hash32_d7[ lane ] <= targ32_d7 ) && !bench ) ) { - extr_lane_8x32( lane_hash, hash, lane, 256 ); + extr_lane_8x32( lane_hash, hash32, lane, 256 ); if ( likely( valid_hash( lane_hash, ptarget ) ) ) { pdata[19] = bswap_32( n + lane ); @@ -418,18 +370,10 @@ void x17_4way_hash( void *state, const void *input ) dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash0, 64 ); - sph_fugue512_close( &ctx.fugue, hash0 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash1, 64 ); - sph_fugue512_close( &ctx.fugue, hash1 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash2, 64 ); - sph_fugue512_close( &ctx.fugue, hash2 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash3, 64 ); - sph_fugue512_close( &ctx.fugue, hash3 ); + sph_fugue512_full( &ctx.fugue, hash0, hash0, 64 ); + sph_fugue512_full( &ctx.fugue, hash1, hash1, 64 ); + sph_fugue512_full( &ctx.fugue, hash2, hash2, 64 ); + sph_fugue512_full( &ctx.fugue, hash3, hash3, 64 ); intrlv_4x32_512( vhash, hash0, hash1, hash2, hash3 ); @@ -439,18 +383,10 @@ void x17_4way_hash( void *state, const void *input ) dintrlv_4x32_512( hash0, hash1, hash2, hash3, vhash ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash0, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash0 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash1, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash1 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash2, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash2 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash3, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash3 ); + 
sph_whirlpool512_full( &ctx.whirlpool, hash0, hash0, 64 ); + sph_whirlpool512_full( &ctx.whirlpool, hash1, hash1, 64 ); + sph_whirlpool512_full( &ctx.whirlpool, hash2, hash2, 64 ); + sph_whirlpool512_full( &ctx.whirlpool, hash3, hash3, 64 ); intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 ); @@ -468,10 +404,10 @@ void x17_4way_hash( void *state, const void *input ) int scanhash_x17_4way( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ) { - uint32_t hash[8*4] __attribute__ ((aligned (64))); + uint32_t hash32[8*4] __attribute__ ((aligned (64))); uint32_t vdata[20*4] __attribute__ ((aligned (64))); uint32_t lane_hash[8] __attribute__ ((aligned (64))); - uint32_t *hashd7 = &(hash[ 7*4 ]); + uint32_t *hash32_d7 = &(hash32[ 7*4 ]); uint32_t *pdata = work->data; const uint32_t *ptarget = work->target; const uint32_t first_nonce = pdata[19]; @@ -479,7 +415,7 @@ int scanhash_x17_4way( struct work *work, uint32_t max_nonce, __m256i *noncev = (__m256i*)vdata + 9; uint32_t n = first_nonce; const int thr_id = mythr->id; - const uint32_t targ32 = ptarget[7]; + const uint32_t targ32_d7 = ptarget[7]; const bool bench = opt_benchmark; mm256_bswap32_intrlv80_4x64( vdata, pdata ); @@ -487,12 +423,12 @@ int scanhash_x17_4way( struct work *work, uint32_t max_nonce, _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev ); do { - x17_4way_hash( hash, vdata ); + x17_4way_hash( hash32, vdata ); for ( int lane = 0; lane < 4; lane++ ) - if ( unlikely( hashd7[ lane ] <= targ32 && !bench ) ) + if ( unlikely( hash32_d7[ lane ] <= targ32_d7 && !bench ) ) { - extr_lane_4x32( lane_hash, hash, lane, 256 ); + extr_lane_4x32( lane_hash, hash32, lane, 256 ); if ( valid_hash( lane_hash, ptarget ) ) { pdata[19] = bswap_32( n + lane ); diff --git a/algo/x17/xevan-4way.c b/algo/x17/xevan-4way.c index 8d40a11d..02b0e13c 100644 --- a/algo/x17/xevan-4way.c +++ b/algo/x17/xevan-4way.c @@ -134,30 +134,14 @@ void xevan_8way_hash( void *output, const void *input ) 
dintrlv_4x128( hash0, hash1, hash2, hash3, vhashA, dataLen<<3 ); dintrlv_4x128( hash4, hash5, hash6, hash7, vhashB, dataLen<<3 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash0, dataLen ); - sph_shavite512_close( &ctx.shavite, hash0 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash1, dataLen ); - sph_shavite512_close( &ctx.shavite, hash1 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash2, dataLen ); - sph_shavite512_close( &ctx.shavite, hash2 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash3, dataLen ); - sph_shavite512_close( &ctx.shavite, hash3 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash4, dataLen ); - sph_shavite512_close( &ctx.shavite, hash4 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash5, dataLen ); - sph_shavite512_close( &ctx.shavite, hash5 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash6, dataLen ); - sph_shavite512_close( &ctx.shavite, hash6 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash7, dataLen ); - sph_shavite512_close( &ctx.shavite, hash7 ); + shavite512_full( &ctx.shavite, hash0, hash0, dataLen ); + shavite512_full( &ctx.shavite, hash1, hash1, dataLen ); + shavite512_full( &ctx.shavite, hash2, hash2, dataLen ); + shavite512_full( &ctx.shavite, hash3, hash3, dataLen ); + shavite512_full( &ctx.shavite, hash4, hash4, dataLen ); + shavite512_full( &ctx.shavite, hash5, hash5, dataLen ); + shavite512_full( &ctx.shavite, hash6, hash6, dataLen ); + shavite512_full( &ctx.shavite, hash7, hash7, dataLen ); intrlv_4x128( vhashA, hash0, hash1, hash2, hash3, dataLen<<3 ); intrlv_4x128( vhashB, hash4, hash5, hash6, hash7, dataLen<<3 ); @@ -208,30 +192,14 @@ void xevan_8way_hash( void *output, const void *input ) dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, vhash, dataLen<<3 ); - sph_fugue512_init( 
&ctx.fugue ); - sph_fugue512( &ctx.fugue, hash0, dataLen ); - sph_fugue512_close( &ctx.fugue, hash0 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash1, dataLen ); - sph_fugue512_close( &ctx.fugue, hash1 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash2, dataLen ); - sph_fugue512_close( &ctx.fugue, hash2 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash3, dataLen ); - sph_fugue512_close( &ctx.fugue, hash3 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash4, dataLen ); - sph_fugue512_close( &ctx.fugue, hash4 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash5, dataLen ); - sph_fugue512_close( &ctx.fugue, hash5 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash6, dataLen ); - sph_fugue512_close( &ctx.fugue, hash6 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash7, dataLen ); - sph_fugue512_close( &ctx.fugue, hash7 ); + sph_fugue512_full( &ctx.fugue, hash0, hash0, dataLen ); + sph_fugue512_full( &ctx.fugue, hash1, hash1, dataLen ); + sph_fugue512_full( &ctx.fugue, hash2, hash2, dataLen ); + sph_fugue512_full( &ctx.fugue, hash3, hash3, dataLen ); + sph_fugue512_full( &ctx.fugue, hash4, hash4, dataLen ); + sph_fugue512_full( &ctx.fugue, hash5, hash5, dataLen ); + sph_fugue512_full( &ctx.fugue, hash6, hash6, dataLen ); + sph_fugue512_full( &ctx.fugue, hash7, hash7, dataLen ); intrlv_8x32( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, dataLen<<3 ); @@ -243,30 +211,14 @@ void xevan_8way_hash( void *output, const void *input ) dintrlv_8x32( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, vhash, dataLen<<3 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash0, dataLen ); - sph_whirlpool_close( &ctx.whirlpool, hash0 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash1, dataLen ); - sph_whirlpool_close( &ctx.whirlpool, hash1 ); - sph_whirlpool_init( 
&ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash2, dataLen ); - sph_whirlpool_close( &ctx.whirlpool, hash2 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash3, dataLen ); - sph_whirlpool_close( &ctx.whirlpool, hash3 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash4, dataLen ); - sph_whirlpool_close( &ctx.whirlpool, hash4 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash5, dataLen ); - sph_whirlpool_close( &ctx.whirlpool, hash5 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash6, dataLen ); - sph_whirlpool_close( &ctx.whirlpool, hash6 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash7, dataLen ); - sph_whirlpool_close( &ctx.whirlpool, hash7 ); + sph_whirlpool512_full( &ctx.whirlpool, hash0, hash0, dataLen ); + sph_whirlpool512_full( &ctx.whirlpool, hash1, hash1, dataLen ); + sph_whirlpool512_full( &ctx.whirlpool, hash2, hash2, dataLen ); + sph_whirlpool512_full( &ctx.whirlpool, hash3, hash3, dataLen ); + sph_whirlpool512_full( &ctx.whirlpool, hash4, hash4, dataLen ); + sph_whirlpool512_full( &ctx.whirlpool, hash5, hash5, dataLen ); + sph_whirlpool512_full( &ctx.whirlpool, hash6, hash6, dataLen ); + sph_whirlpool512_full( &ctx.whirlpool, hash7, hash7, dataLen ); intrlv_8x64( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, dataLen<<3 ); @@ -345,30 +297,14 @@ void xevan_8way_hash( void *output, const void *input ) dintrlv_4x128( hash0, hash1, hash2, hash3, vhashA, dataLen<<3 ); dintrlv_4x128( hash4, hash5, hash6, hash7, vhashB, dataLen<<3 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash0, dataLen ); - sph_shavite512_close( &ctx.shavite, hash0 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash1, dataLen ); - sph_shavite512_close( &ctx.shavite, hash1 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash2, dataLen ); 
- sph_shavite512_close( &ctx.shavite, hash2 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash3, dataLen ); - sph_shavite512_close( &ctx.shavite, hash3 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash4, dataLen ); - sph_shavite512_close( &ctx.shavite, hash4 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash5, dataLen ); - sph_shavite512_close( &ctx.shavite, hash5 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash6, dataLen ); - sph_shavite512_close( &ctx.shavite, hash6 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash7, dataLen ); - sph_shavite512_close( &ctx.shavite, hash7 ); + shavite512_full( &ctx.shavite, hash0, hash0, dataLen ); + shavite512_full( &ctx.shavite, hash1, hash1, dataLen ); + shavite512_full( &ctx.shavite, hash2, hash2, dataLen ); + shavite512_full( &ctx.shavite, hash3, hash3, dataLen ); + shavite512_full( &ctx.shavite, hash4, hash4, dataLen ); + shavite512_full( &ctx.shavite, hash5, hash5, dataLen ); + shavite512_full( &ctx.shavite, hash6, hash6, dataLen ); + shavite512_full( &ctx.shavite, hash7, hash7, dataLen ); intrlv_4x128( vhashA, hash0, hash1, hash2, hash3, dataLen<<3 ); intrlv_4x128( vhashB, hash4, hash5, hash6, hash7, dataLen<<3 ); @@ -419,30 +355,14 @@ void xevan_8way_hash( void *output, const void *input ) dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, vhash, dataLen<<3 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash0, dataLen ); - sph_fugue512_close( &ctx.fugue, hash0 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash1, dataLen ); - sph_fugue512_close( &ctx.fugue, hash1 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash2, dataLen ); - sph_fugue512_close( &ctx.fugue, hash2 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash3, dataLen ); - sph_fugue512_close( &ctx.fugue, hash3 ); - 
sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash4, dataLen ); - sph_fugue512_close( &ctx.fugue, hash4 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash5, dataLen ); - sph_fugue512_close( &ctx.fugue, hash5 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash6, dataLen ); - sph_fugue512_close( &ctx.fugue, hash6 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash7, dataLen ); - sph_fugue512_close( &ctx.fugue, hash7 ); + sph_fugue512_full( &ctx.fugue, hash0, hash0, dataLen ); + sph_fugue512_full( &ctx.fugue, hash1, hash1, dataLen ); + sph_fugue512_full( &ctx.fugue, hash2, hash2, dataLen ); + sph_fugue512_full( &ctx.fugue, hash3, hash3, dataLen ); + sph_fugue512_full( &ctx.fugue, hash4, hash4, dataLen ); + sph_fugue512_full( &ctx.fugue, hash5, hash5, dataLen ); + sph_fugue512_full( &ctx.fugue, hash6, hash6, dataLen ); + sph_fugue512_full( &ctx.fugue, hash7, hash7, dataLen ); intrlv_8x32( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, dataLen<<3 ); @@ -454,30 +374,14 @@ void xevan_8way_hash( void *output, const void *input ) dintrlv_8x32( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, vhash, dataLen<<3 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash0, dataLen ); - sph_whirlpool_close( &ctx.whirlpool, hash0 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash1, dataLen ); - sph_whirlpool_close( &ctx.whirlpool, hash1 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash2, dataLen ); - sph_whirlpool_close( &ctx.whirlpool, hash2 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash3, dataLen ); - sph_whirlpool_close( &ctx.whirlpool, hash3 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash4, dataLen ); - sph_whirlpool_close( &ctx.whirlpool, hash4 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash5, dataLen 
); - sph_whirlpool_close( &ctx.whirlpool, hash5 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash6, dataLen ); - sph_whirlpool_close( &ctx.whirlpool, hash6 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash7, dataLen ); - sph_whirlpool_close( &ctx.whirlpool, hash7 ); + sph_whirlpool512_full( &ctx.whirlpool, hash0, hash0, dataLen ); + sph_whirlpool512_full( &ctx.whirlpool, hash1, hash1, dataLen ); + sph_whirlpool512_full( &ctx.whirlpool, hash2, hash2, dataLen ); + sph_whirlpool512_full( &ctx.whirlpool, hash3, hash3, dataLen ); + sph_whirlpool512_full( &ctx.whirlpool, hash4, hash4, dataLen ); + sph_whirlpool512_full( &ctx.whirlpool, hash5, hash5, dataLen ); + sph_whirlpool512_full( &ctx.whirlpool, hash6, hash6, dataLen ); + sph_whirlpool512_full( &ctx.whirlpool, hash7, hash7, dataLen ); intrlv_8x64( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, dataLen<<3 ); @@ -636,18 +540,10 @@ void xevan_4way_hash( void *output, const void *input ) dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, dataLen<<3 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash0, dataLen ); - sph_fugue512_close( &ctx.fugue, hash0 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash1, dataLen ); - sph_fugue512_close( &ctx.fugue, hash1 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash2, dataLen ); - sph_fugue512_close( &ctx.fugue, hash2 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash3, dataLen ); - sph_fugue512_close( &ctx.fugue, hash3 ); + sph_fugue512_full( &ctx.fugue, hash0, hash0, 64 ); + sph_fugue512_full( &ctx.fugue, hash1, hash1, 64 ); + sph_fugue512_full( &ctx.fugue, hash2, hash2, 64 ); + sph_fugue512_full( &ctx.fugue, hash3, hash3, 64 ); // Parallel 4way 32 bit intrlv_4x32( vhash, hash0, hash1, hash2, hash3, dataLen<<3 ); @@ -659,18 +555,10 @@ void xevan_4way_hash( void *output, const void *input ) dintrlv_4x32( hash0, hash1, hash2, 
hash3, vhash, dataLen<<3 ); // Serial - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash0, dataLen ); - sph_whirlpool_close( &ctx.whirlpool, hash0 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash1, dataLen ); - sph_whirlpool_close( &ctx.whirlpool, hash1 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash2, dataLen ); - sph_whirlpool_close( &ctx.whirlpool, hash2 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash3, dataLen ); - sph_whirlpool_close( &ctx.whirlpool, hash3 ); + sph_whirlpool512_full( &ctx.whirlpool, hash0, hash0, dataLen ); + sph_whirlpool512_full( &ctx.whirlpool, hash1, hash1, dataLen ); + sph_whirlpool512_full( &ctx.whirlpool, hash2, hash2, dataLen ); + sph_whirlpool512_full( &ctx.whirlpool, hash3, hash3, dataLen ); intrlv_4x64( vhash, hash0, hash1, hash2, hash3, dataLen<<3 ); @@ -749,18 +637,10 @@ void xevan_4way_hash( void *output, const void *input ) dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, dataLen<<3 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash0, dataLen ); - sph_fugue512_close( &ctx.fugue, hash0 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash1, dataLen ); - sph_fugue512_close( &ctx.fugue, hash1 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash2, dataLen ); - sph_fugue512_close( &ctx.fugue, hash2 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash3, dataLen ); - sph_fugue512_close( &ctx.fugue, hash3 ); + sph_fugue512_full( &ctx.fugue, hash0, hash0, 64 ); + sph_fugue512_full( &ctx.fugue, hash1, hash1, 64 ); + sph_fugue512_full( &ctx.fugue, hash2, hash2, 64 ); + sph_fugue512_full( &ctx.fugue, hash3, hash3, 64 ); intrlv_4x32( vhash, hash0, hash1, hash2, hash3, dataLen<<3 ); @@ -770,18 +650,10 @@ void xevan_4way_hash( void *output, const void *input ) dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, dataLen<<3 ); - sph_whirlpool_init( 
&ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash0, dataLen ); - sph_whirlpool_close( &ctx.whirlpool, hash0 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash1, dataLen ); - sph_whirlpool_close( &ctx.whirlpool, hash1 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash2, dataLen ); - sph_whirlpool_close( &ctx.whirlpool, hash2 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash3, dataLen ); - sph_whirlpool_close( &ctx.whirlpool, hash3 ); + sph_whirlpool512_full( &ctx.whirlpool, hash0, hash0, dataLen ); + sph_whirlpool512_full( &ctx.whirlpool, hash1, hash1, dataLen ); + sph_whirlpool512_full( &ctx.whirlpool, hash2, hash2, dataLen ); + sph_whirlpool512_full( &ctx.whirlpool, hash3, hash3, dataLen ); intrlv_4x64( vhash, hash0, hash1, hash2, hash3, dataLen<<3 ); diff --git a/configure b/configure index e48b7208..7657c1ef 100755 --- a/configure +++ b/configure @@ -1,6 +1,6 @@ #! /bin/sh # Guess values for system-dependent variables and create Makefiles. -# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.12.2. +# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.12.3. # # # Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc. @@ -577,8 +577,8 @@ MAKEFLAGS= # Identity of this package. PACKAGE_NAME='cpuminer-opt' PACKAGE_TARNAME='cpuminer-opt' -PACKAGE_VERSION='3.12.2' -PACKAGE_STRING='cpuminer-opt 3.12.2' +PACKAGE_VERSION='3.12.3' +PACKAGE_STRING='cpuminer-opt 3.12.3' PACKAGE_BUGREPORT='' PACKAGE_URL='' @@ -1332,7 +1332,7 @@ if test "$ac_init_help" = "long"; then # Omit some internal or obsolete options to make the list less imposing. # This message is too long to be a string in the A/UX 3.1 sh. cat <<_ACEOF -\`configure' configures cpuminer-opt 3.12.2 to adapt to many kinds of systems. +\`configure' configures cpuminer-opt 3.12.3 to adapt to many kinds of systems. Usage: $0 [OPTION]... [VAR=VALUE]... 
@@ -1404,7 +1404,7 @@ fi if test -n "$ac_init_help"; then case $ac_init_help in - short | recursive ) echo "Configuration of cpuminer-opt 3.12.2:";; + short | recursive ) echo "Configuration of cpuminer-opt 3.12.3:";; esac cat <<\_ACEOF @@ -1509,7 +1509,7 @@ fi test -n "$ac_init_help" && exit $ac_status if $ac_init_version; then cat <<\_ACEOF -cpuminer-opt configure 3.12.2 +cpuminer-opt configure 3.12.3 generated by GNU Autoconf 2.69 Copyright (C) 2012 Free Software Foundation, Inc. @@ -2012,7 +2012,7 @@ cat >config.log <<_ACEOF This file contains any messages produced by compilers while running configure, to aid debugging if configure makes a mistake. -It was created by cpuminer-opt $as_me 3.12.2, which was +It was created by cpuminer-opt $as_me 3.12.3, which was generated by GNU Autoconf 2.69. Invocation command line was $ $0 $@ @@ -2993,7 +2993,7 @@ fi # Define the identity of the package. PACKAGE='cpuminer-opt' - VERSION='3.12.2' + VERSION='3.12.3' cat >>confdefs.h <<_ACEOF @@ -6690,7 +6690,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 # report actual input values of CONFIG_FILES etc. instead of their # values after options handling. ac_log=" -This file was extended by cpuminer-opt $as_me 3.12.2, which was +This file was extended by cpuminer-opt $as_me 3.12.3, which was generated by GNU Autoconf 2.69. 
Invocation command line was CONFIG_FILES = $CONFIG_FILES @@ -6756,7 +6756,7 @@ _ACEOF cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`" ac_cs_version="\\ -cpuminer-opt config.status 3.12.2 +cpuminer-opt config.status 3.12.3 configured by $0, generated by GNU Autoconf 2.69, with options \\"\$ac_cs_config\\" diff --git a/configure.ac b/configure.ac index 382d0df8..f8acae9a 100644 --- a/configure.ac +++ b/configure.ac @@ -1,4 +1,4 @@ -AC_INIT([cpuminer-opt], [3.12.2]) +AC_INIT([cpuminer-opt], [3.12.3]) AC_PREREQ([2.59c]) AC_CANONICAL_SYSTEM diff --git a/cpu-miner.c b/cpu-miner.c index 2c89fc19..43cd4f52 100644 --- a/cpu-miner.c +++ b/cpu-miner.c @@ -2458,6 +2458,8 @@ void std_stratum_gen_work( struct stratum_ctx *sctx, struct work *g_work ) pthread_mutex_unlock( &sctx->work_lock ); + restart_threads(); + if ( opt_debug ) { unsigned char *xnonce2str = abin2hex( g_work->xnonce2, diff --git a/simd-utils/simd-256.h b/simd-utils/simd-256.h index 7e8f61b1..e3df4403 100644 --- a/simd-utils/simd-256.h +++ b/simd-utils/simd-256.h @@ -120,11 +120,26 @@ do { \ } while(0) -// Horizontal vector testing -#define mm256_allbits0( a ) _mm256_testc_si256( a, m256_neg1 ) -#define mm256_allbits1( a ) _mm256_testz_si256( a, a ) -#define mm256_anybits0( a ) !mm256_allbits1( a ) -#define mm256_anybits1( a ) !mm256_allbits0( a ) +// Bytewise test of all 256 bits +#define mm256_all0_8( a ) \ + ( _mm256_movemask_epi8( a ) == 0 ) + +#define mm256_all1_8( a ) \ + ( _mm256_movemask_epi8( a ) == -1 ) + + +#define mm256_anybits0( a ) \ + ( _mm256_movemask_epi8( a ) & 0xffffffff ) + +#define mm256_anybits1( a ) \ + ( ( _mm256_movemask_epi8( a ) & 0xffffffff ) != 0xffffffff ) + + +// Bitwise test of all 256 bits +#define mm256_allbits0( a ) _mm256_testc_si256( a, m256_neg1 ) +#define mm256_allbits1( a ) _mm256_testc_si256( m256_zero, a ) +//#define mm256_anybits0( a ) !mm256_allbits1( a ) +//#define mm256_anybits1( a ) 
!mm256_allbits0( a ) // Parallel AES, for when x is expected to be in a 256 bit register.