v3.16.3

JayDDee · May 6, 2021 · 3c5e892 · 3c5e892
1 parent f3333b0
commit 3c5e892
Show file tree

Hide file tree

Showing 14 changed files with 183 additions and 170 deletions.
diff --git a/RELEASE_NOTES b/RELEASE_NOTES
@@ -65,11 +65,16 @@ If not what makes it happen or not happen?
 Change Log
 ----------
 
+v3.16.3
+
+#313 Fix compile error with GCC 11.
+Incremental improvements to verthash.
+
 v3.16.2
 
 Verthash: midstate prehash optimization for all architectures.
 Verthash: AVX2 optimization.
-GBT: added support for Bech32 addresses, untested.
+GBT: added support for Bech32 addresses.
 Linux: added CPU frequency to benchmark log.
 Fixed integer overflow in time calculations.
 
@@ -111,7 +116,6 @@ RPC getmininginfo method.
 v3.15.5
 
 Fix stratum jobs lost if 2 jobs received in less than one second.
-
 
 v3.15.4
 

diff --git a/algo/blake/blake2b-hash-4way.h b/algo/blake/blake2b-hash-4way.h
@@ -17,7 +17,7 @@
 
 #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
 
-ALIGN(128) typedef struct {
+typedef struct ALIGN( 64 ) {
    __m512i b[16]; // input buffer
    __m512i h[8];  // chained state
    uint64_t t[2];  // total number of bytes
@@ -35,7 +35,7 @@ void blake2b_8way_final( blake2b_8way_ctx *ctx, void *out );
 #if defined(__AVX2__)
 
 // state context
-ALIGN(128) typedef struct {
+typedef struct ALIGN( 64 ) {
 	__m256i b[16]; // input buffer
 	__m256i h[8];  // chained state
 	uint64_t t[2];  // total number of bytes

diff --git a/algo/blake/blake2s-hash-4way.h b/algo/blake/blake2s-hash-4way.h
@@ -60,7 +60,7 @@ typedef struct __blake2s_nway_param
 } blake2s_nway_param;
 #pragma pack(pop)
 
-ALIGN( 64 ) typedef struct __blake2s_4way_state
+typedef struct ALIGN( 64 ) __blake2s_4way_state
 {
    __m128i h[8];
    uint8_t  buf[ BLAKE2S_BLOCKBYTES * 4 ];
@@ -80,7 +80,7 @@ int blake2s_4way_full_blocks( blake2s_4way_state *S, void *out,
 
 #if defined(__AVX2__)
 
-ALIGN( 64 ) typedef struct __blake2s_8way_state
+typedef struct ALIGN( 64 ) __blake2s_8way_state
 {
    __m256i h[8];
    uint8_t  buf[ BLAKE2S_BLOCKBYTES * 8 ];
@@ -101,7 +101,7 @@ int blake2s_8way_full_blocks( blake2s_8way_state *S, void *out,
 
 #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
 
-ALIGN( 128 ) typedef struct __blake2s_16way_state
+typedef struct ALIGN( 64 ) __blake2s_16way_state
 {
    __m512i h[8];
    uint8_t  buf[ BLAKE2S_BLOCKBYTES * 16 ];

diff --git a/algo/blake/sph-blake2s.c b/algo/blake/sph-blake2s.c
@@ -323,7 +323,7 @@ int blake2s_final( blake2s_state *S, uint8_t *out, uint8_t outlen )
 
 int blake2s( uint8_t *out, const void *in, const void *key, const uint8_t outlen, const uint64_t inlen, uint8_t keylen )
 {
-	blake2s_state S[1];
+	blake2s_state S;
 
 	/* Verify parameters */
 	if ( NULL == in ) return -1;
@@ -334,15 +334,15 @@ int blake2s( uint8_t *out, const void *in, const void *key, const uint8_t outlen
 
 	if( keylen > 0 )
 	{
-		if( blake2s_init_key( S, outlen, key, keylen ) < 0 ) return -1;
+		if( blake2s_init_key( &S, outlen, key, keylen ) < 0 ) return -1;
 	}
 	else
 	{
-		if( blake2s_init( S, outlen ) < 0 ) return -1;
+		if( blake2s_init( &S, outlen ) < 0 ) return -1;
 	}
 
-	blake2s_update( S, ( uint8_t * )in, inlen );
-	blake2s_final( S, out, outlen );
+	blake2s_update( &S, ( uint8_t * )in, inlen );
+	blake2s_final( &S, out, outlen );
 	return 0;
 }
 

diff --git a/algo/blake/sph-blake2s.h b/algo/blake/sph-blake2s.h
@@ -116,7 +116,7 @@ extern "C" {
 		uint8_t  personal[BLAKE2S_PERSONALBYTES];  // 32
 	} blake2s_param;
 
-	ALIGN( 64 ) typedef struct __blake2s_state
+	typedef struct ALIGN( 64 ) __blake2s_state
 	{
 		uint32_t h[8];
 		uint32_t t[2];

diff --git a/algo/blake/sph_blake2b.h b/algo/blake/sph_blake2b.h
@@ -18,7 +18,7 @@
 #endif
 
 // state context
-ALIGN(64) typedef struct {
+typedef ALIGN(64) struct {
 	uint8_t b[128]; // input buffer
 	uint64_t h[8];  // chained state
 	uint64_t t[2];  // total number of bytes

diff --git a/algo/quark/quark-4way.c b/algo/quark/quark-4way.c
@@ -127,10 +127,8 @@ void quark_8way_hash( void *state, const void *input )
 
      rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 );
 
-     if ( ( vh_mask & 0x0f ) != 0x0f )
-       groestl512_4way_full( &ctx.groestl, vhashA, vhashA, 64 );
-     if ( ( vh_mask & 0xf0 ) != 0xf0 )
-       groestl512_4way_full( &ctx.groestl, vhashB, vhashB, 64 );
+     groestl512_4way_full( &ctx.groestl, vhashA, vhashA, 64 );
+     groestl512_4way_full( &ctx.groestl, vhashB, vhashB, 64 );
 
      rintrlv_4x128_8x64( vhash, vhashA, vhashB, 512 );
 
@@ -139,22 +137,14 @@ void quark_8way_hash( void *state, const void *input )
     dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
                   vhash, 512 );
 
-    if ( hash0[0] & 8 )
-       groestl512_full( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
-    if ( hash1[0] & 8 )
-       groestl512_full( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
-    if ( hash2[0] & 8)
-       groestl512_full( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
-    if ( hash3[0] & 8 )
-       groestl512_full( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
-    if ( hash4[0] & 8 )
-       groestl512_full( &ctx.groestl, (char*)hash4, (char*)hash4, 512 );
-    if ( hash5[0] & 8 )
-       groestl512_full( &ctx.groestl, (char*)hash5, (char*)hash5, 512 );
-    if ( hash6[0] & 8 )
-       groestl512_full( &ctx.groestl, (char*)hash6, (char*)hash6, 512 );
-    if ( hash7[0] & 8 )
-       groestl512_full( &ctx.groestl, (char*)hash7, (char*)hash7, 512 );
+    groestl512_full( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
+    groestl512_full( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
+    groestl512_full( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
+    groestl512_full( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
+    groestl512_full( &ctx.groestl, (char*)hash4, (char*)hash4, 512 );
+    groestl512_full( &ctx.groestl, (char*)hash5, (char*)hash5, 512 );
+    groestl512_full( &ctx.groestl, (char*)hash6, (char*)hash6, 512 );
+    groestl512_full( &ctx.groestl, (char*)hash7, (char*)hash7, 512 );
 
     intrlv_8x64( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
                  512 );

diff --git a/algo/verthash/Verthash.c b/algo/verthash/Verthash.c
@@ -44,8 +44,8 @@ int verthash_info_init(verthash_info_t* info, const char* file_name)
        if ( opt_data_file || !opt_verify ) 
        {
           if ( opt_data_file )
-             applog( LOG_ERR,
-                     "Verthash data file not found or invalid: %s", info->fileName );
+             applog( LOG_ERR, "Verthash data file not found or invalid: %s",
+                     info->fileName );
           else
           {
              applog( LOG_ERR,
@@ -134,117 +134,133 @@ static inline uint32_t fnv1a(const uint32_t a, const uint32_t b)
     return (a ^ b) * 0x1000193;
 }
 
-void verthash_hash( const unsigned char* blob_bytes,
-                    const size_t blob_size,
-                    const unsigned char(*input)[VH_HEADER_SIZE],
-                    unsigned char(*output)[VH_HASH_OUT_SIZE] )
+#if 0
+static void rotate_indexes( uint32_t *p )
 {
-    unsigned char p1[ VH_HASH_OUT_SIZE ] __attribute__ ((aligned (64)));
-    unsigned char p0[ VH_N_SUBSET ] __attribute__ ((aligned (64)));
-    uint32_t seek_indexes[VH_N_INDEXES] __attribute__ ((aligned (64)));
-    uint32_t* p0_index = (uint32_t*)p0;
+#if defined(__AVX2__)
 
-    verthash_sha3_512_final_8( p0, ( (uint64_t*)input )[ 9 ] );
-
-    for ( size_t x = 0; x < VH_N_ROT; ++x )
-    {
-        memcpy( seek_indexes + x * (VH_N_SUBSET / sizeof(uint32_t)),
-                p0, VH_N_SUBSET);
+   for ( size_t x = 0; x < VH_N_SUBSET / sizeof(__m256i); x += 8 )
+   {
+      __m256i *px = (__m256i*)p + x;
 
-#if defined(__AVX2__)
+      px[0] = mm256_rol_32( px[0], 1 );
+      px[1] = mm256_rol_32( px[1], 1 );
+      px[2] = mm256_rol_32( px[2], 1 );
+      px[3] = mm256_rol_32( px[3], 1 );
+      px[4] = mm256_rol_32( px[4], 1 );
+      px[5] = mm256_rol_32( px[5], 1 );
+      px[6] = mm256_rol_32( px[6], 1 );
+      px[7] = mm256_rol_32( px[7], 1 );
+   }
 
-        for ( size_t y = 0; y < VH_N_SUBSET / sizeof(__m256i); y += 8)
-        {
-           casti_m256i( p0_index, y   ) = mm256_rol_32(
-                                            casti_m256i( p0_index, y   ), 1 );
-           casti_m256i( p0_index, y+1 ) = mm256_rol_32( 
-                                            casti_m256i( p0_index, y+1 ), 1 );
-           casti_m256i( p0_index, y+2 ) = mm256_rol_32(
-                                            casti_m256i( p0_index, y+2 ), 1 );
-           casti_m256i( p0_index, y+3 ) = mm256_rol_32(
-                                            casti_m256i( p0_index, y+3 ), 1 );
-           casti_m256i( p0_index, y+4 ) = mm256_rol_32(
-                                            casti_m256i( p0_index, y+4 ), 1 );
-           casti_m256i( p0_index, y+5 ) = mm256_rol_32(
-                                            casti_m256i( p0_index, y+5 ), 1 );
-           casti_m256i( p0_index, y+6 ) = mm256_rol_32(
-                                            casti_m256i( p0_index, y+6 ), 1 );
-           casti_m256i( p0_index, y+7 ) = mm256_rol_32(
-                                            casti_m256i( p0_index, y+7 ), 1 );
-        }
+#else
+
+   for ( size_t x = 0; x < VH_N_SUBSET / sizeof(__m128i); x += 8 )   // 8 vectors per iteration
+   {
+      __m128i *px = (__m128i*)p + x;   // index from the parameter p, like the AVX2 branch
+
+      px[0] = mm128_rol_32( px[0], 1 );
+      px[1] = mm128_rol_32( px[1], 1 );
+      px[2] = mm128_rol_32( px[2], 1 );
+      px[3] = mm128_rol_32( px[3], 1 );
+      px[4] = mm128_rol_32( px[4], 1 );
+      px[5] = mm128_rol_32( px[5], 1 );
+      px[6] = mm128_rol_32( px[6], 1 );
+      px[7] = mm128_rol_32( px[7], 1 );
+   }
+
+#endif
+/*   
+   for ( size_t x = 0; x < VH_N_SUBSET / sizeof(uint32_t); ++x )
+      p[x] = ( p[x] << 1 ) | ( p[x] >> 31 );
+*/
+}
+#endif
+
+static inline uint32_t rotl32( uint32_t a, size_t r )   // rotate the 32-bit value a left by r bits
+{   // precondition: 0 < r < 32
+   return ( a << r ) | ( a >> (32-r) );   // r == 0 would shift right by 32, undefined for a 32-bit operand
+}
+
+// Vectorized and targeted version of fnv1a; MULXOR expects hash, blob_off (and k for SIMD) in scope
+#if defined (__AVX2__)        // AVX2: one 256-bit xor followed by a 32-bit lane multiply by k
+
+#define MULXOR \
+   *(__m256i*)hash = _mm256_mullo_epi32( _mm256_xor_si256( \
+                                 *(__m256i*)hash, *(__m256i*)blob_off ), k );
+
+#elif defined(__SSE41__)   // NOTE(review): GCC and Clang predefine __SSE4_1__, not __SSE41__ - confirm the build defines this name
+
+#define MULXOR \
+   casti_m128i( hash, 0 ) = _mm_mullo_epi32( _mm_xor_si128( \
+                  casti_m128i( hash, 0 ), casti_m128i( blob_off, 0 ) ), k ); \
+   casti_m128i( hash, 1 ) = _mm_mullo_epi32( _mm_xor_si128( \
+                  casti_m128i( hash, 1 ), casti_m128i( blob_off, 1 ) ), k );
 
 #else
 
-        for ( size_t y = 0; y < VH_N_SUBSET / sizeof(__m128i); y += 8)
-        {
-           casti_m128i( p0_index, y   ) = mm128_rol_32(
-                                            casti_m128i( p0_index, y   ), 1 );
-           casti_m128i( p0_index, y+1 ) = mm128_rol_32(
-                                            casti_m128i( p0_index, y+1 ), 1 );
-           casti_m128i( p0_index, y+2 ) = mm128_rol_32(
-                                            casti_m128i( p0_index, y+2 ), 1 );
-           casti_m128i( p0_index, y+3 ) = mm128_rol_32(
-                                            casti_m128i( p0_index, y+3 ), 1 );
-           casti_m128i( p0_index, y+4 ) = mm128_rol_32(
-                                            casti_m128i( p0_index, y+4 ), 1 );
-           casti_m128i( p0_index, y+5 ) = mm128_rol_32(
-                                            casti_m128i( p0_index, y+5 ), 1 );
-           casti_m128i( p0_index, y+6 ) = mm128_rol_32(
-                                            casti_m128i( p0_index, y+6 ), 1 );
-           casti_m128i( p0_index, y+7 ) = mm128_rol_32(
-                                            casti_m128i( p0_index, y+7 ), 1 );
-        }
-
+#define MULXOR /* scalar fallback: fnv1a applied to each 32-bit word of hash */ \
+   for ( size_t j = 0; j < VH_HASH_OUT_SIZE / sizeof(uint32_t); j++ ) \
+       hash[j] = fnv1a( hash[j], blob_off[j] ); \
+
 #endif
 
-    }
+#define UPDATE_ACCUMULATOR /* folds blob_off[0..7] into accumulator with fnv1a, in index order */ \
+   accumulator = fnv1a( accumulator, blob_off[0] ); \
+   accumulator = fnv1a( accumulator, blob_off[1] ); \
+   accumulator = fnv1a( accumulator, blob_off[2] ); \
+   accumulator = fnv1a( accumulator, blob_off[3] ); \
+   accumulator = fnv1a( accumulator, blob_off[4] ); \
+   accumulator = fnv1a( accumulator, blob_off[5] ); \
+   accumulator = fnv1a( accumulator, blob_off[6] ); \
+   accumulator = fnv1a( accumulator, blob_off[7] )   // no trailing ';' here, the use site supplies it
+
+
+// First pass: seek indexes are derived from the subset words as-is (no rotation).
+#define ROUND_0 /* expects blob, subset, accumulator, mdiv plus MULXOR's names in scope */ \
+for ( size_t i = 0; i < VH_N_SUBSET / sizeof(uint32_t); i++ ) /* one lookup per 32-bit subset word */ \
+{ \
+   const uint32_t *blob_off = blob + /* word offset = (fnv1a result mod mdiv) * words per alignment unit */ \
+                         ( ( fnv1a( subset[i], accumulator ) % mdiv ) \
+                         * ( VH_BYTE_ALIGNMENT / sizeof(uint32_t) ) ); \
+   UPDATE_ACCUMULATOR; \
+   MULXOR; \
+}
 
-    sha3( &input[0], VH_HEADER_SIZE, &p1[0], VH_HASH_OUT_SIZE );
-
-    uint32_t* p1_32 = (uint32_t*)p1;
-    uint32_t* blob_bytes_32 = (uint32_t*)blob_bytes;
-    uint32_t value_accumulator = 0x811c9dc5;
+// Subsequent passes: rotate each subset word left by r at the point of use instead of pre-rotating the whole subset.
+#define ROUND_r( r ) /* expects blob, subset, accumulator, mdiv plus MULXOR's names in scope */ \
+for ( size_t i = 0; i < VH_N_SUBSET / sizeof(uint32_t); i++ ) /* one lookup per 32-bit subset word */ \
+{ \
+   const uint32_t *blob_off = blob + /* same offset formula as ROUND_0, fed with the rotated word */ \
+                 ( ( fnv1a( rotl32( subset[i], r ), accumulator ) % mdiv ) \
+                 * ( VH_BYTE_ALIGNMENT / sizeof(uint32_t) ) ); \
+   UPDATE_ACCUMULATOR; \
+   MULXOR; \
+}
+
+void verthash_hash( const void *blob_bytes, const size_t blob_size,   // blob_bytes is read as 32-bit words; blob_size feeds the mdiv modulus
+                    const void *input, void *output )   // input: VH_HEADER_SIZE bytes are passed to sha3; output: receives VH_HASH_OUT_SIZE bytes
+{
+    uint32_t hash[ VH_HASH_OUT_SIZE / 4 ] __attribute__ ((aligned (64)));   // running hash words, copied to output at the end
+    uint32_t subset[ VH_N_SUBSET / 4 ] __attribute__ ((aligned (64)));   // seed words for the seek indexes, filled below
+    const uint32_t *blob = (const uint32_t*)blob_bytes;   // word view of the blob used by ROUND_0 / ROUND_r
+    uint32_t accumulator = 0x811c9dc5;   // NOTE(review): looks like the 32-bit FNV offset basis - confirm
     const uint32_t mdiv = ( ( blob_size - VH_HASH_OUT_SIZE )
                              / VH_BYTE_ALIGNMENT ) + 1;
 #if defined (__AVX2__)        
     const __m256i k = _mm256_set1_epi32( 0x1000193 );
 #elif defined(__SSE41__)
     const __m128i k = _mm_set1_epi32( 0x1000193 );
 #endif
+
+    sha3( input, VH_HEADER_SIZE, hash, VH_HASH_OUT_SIZE );   // seeds hash from input via sha3
+    verthash_sha3_512_final_8( subset, ( (uint64_t*)input )[ 9 ] );   // passes the 64-bit word at index 9 of input (byte offset 72)
 
-    for ( size_t i = 0; i < VH_N_INDEXES; i++ )
-    {
-        const uint32_t offset =
-                      ( fnv1a( seek_indexes[i], value_accumulator) % mdiv )
-                      * ( VH_BYTE_ALIGNMENT / sizeof(uint32_t) );
-        const uint32_t *blob_off = blob_bytes_32 + offset;
-
-        // update value accumulator for next seek index
-        value_accumulator = fnv1a( value_accumulator, blob_off[0] );
-        value_accumulator = fnv1a( value_accumulator, blob_off[1] );
-        value_accumulator = fnv1a( value_accumulator, blob_off[2] );
-        value_accumulator = fnv1a( value_accumulator, blob_off[3] );
-        value_accumulator = fnv1a( value_accumulator, blob_off[4] );
-        value_accumulator = fnv1a( value_accumulator, blob_off[5] );
-        value_accumulator = fnv1a( value_accumulator, blob_off[6] );
-        value_accumulator = fnv1a( value_accumulator, blob_off[7] );
-
-#if defined (__AVX2__)        
-        *(__m256i*)p1_32 = _mm256_mullo_epi32( _mm256_xor_si256(
-                                  *(__m256i*)p1_32, *(__m256i*)blob_off ), k );
-#elif defined(__SSE41__)
-        casti_m128i( p1_32, 0 ) = _mm_mullo_epi32( _mm_xor_si128( 
-                    casti_m128i( p1_32, 0 ), casti_m128i( blob_off, 0 ) ), k );
-        casti_m128i( p1_32, 1 ) = _mm_mullo_epi32( _mm_xor_si128( 
-                    casti_m128i( p1_32, 1 ), casti_m128i( blob_off, 1 ) ), k );
-#else
-         for ( size_t i2 = 0; i2 < VH_HASH_OUT_SIZE / sizeof(uint32_t); i2++ )
-            p1_32[i2] = fnv1a( p1_32[i2], blob_off[i2] );
-#endif
-
-    }
+    ROUND_0;   // pass 0: subset words used unrotated
+    for ( size_t r = 1; r < VH_N_ROT; ++r )   // passes 1 .. VH_N_ROT-1
+       ROUND_r( r );   // subset words rotated left by r bits at the point of use
 
-    memcpy( output, p1, VH_HASH_OUT_SIZE );
+    memcpy( output, hash, VH_HASH_OUT_SIZE );   // hand the final hash words to the caller
 }
 
 //-----------------------------------------------------------------------------