diff --git a/Makefile.am b/Makefile.am
index db71cf12..82eeb6f6 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -289,7 +289,7 @@ cpuminer_SOURCES = \
   algo/yescrypt/yescrypt-best.c \
   algo/yespower/yespower-gate.c \
   algo/yespower/yespower-blake2b.c \
-  algo/yespower/crypto/blake2b-yp.c \
+  algo/yespower/crypto/hmac-blake2b.c \
   algo/yespower/yescrypt-r8g.c \
   algo/yespower/yespower-opt.c
 
diff --git a/RELEASE_NOTES b/RELEASE_NOTES
index ef93a255..5e3c78b2 100644
--- a/RELEASE_NOTES
+++ b/RELEASE_NOTES
@@ -65,6 +65,15 @@ If not what makes it happen or not happen?
 Change Log
 ----------
 
+v3.20.1
+
+sph_blake2b optimized 1-way SSSE3 & AVX2.
+Removed duplicate Blake2b used by Power2b algo, will now use optimized sph_blake2b.
+Removed imprecise hash & target display from rejected share log.
+Share and target difficulty are now displayed only for low difficulty shares.
+Updated configure.ac to check for AVX512 asm support.
+Small optimization to Lyra2 SSE2.
+
 v3.20.0
 
 #375 Fixed segfault in algos using Groestl VAES due to use of uninitialized data.
diff --git a/algo/blake/blake2b-hash-4way.c b/algo/blake/blake2b-hash-4way.c
index f4824434..d04601f3 100644
--- a/algo/blake/blake2b-hash-4way.c
+++ b/algo/blake/blake2b-hash-4way.c
@@ -52,6 +52,180 @@ static const uint8_t sigma[12][16] =
 };
 
 
+#define Z00   0   // Z<r><i>: one hex digit consumed by Mx(r, i); values equal rows 0-9 of the BLAKE2b sigma permutation. Round 0:
+#define Z01   1
+#define Z02   2
+#define Z03   3
+#define Z04   4
+#define Z05   5
+#define Z06   6
+#define Z07   7
+#define Z08   8
+#define Z09   9
+#define Z0A   A
+#define Z0B   B
+#define Z0C   C
+#define Z0D   D
+#define Z0E   E
+#define Z0F   F
+// Round 1
+#define Z10   E
+#define Z11   A
+#define Z12   4
+#define Z13   8
+#define Z14   9
+#define Z15   F
+#define Z16   D
+#define Z17   6
+#define Z18   1
+#define Z19   C
+#define Z1A   0
+#define Z1B   2
+#define Z1C   B
+#define Z1D   7
+#define Z1E   5
+#define Z1F   3
+// Round 2
+#define Z20   B
+#define Z21   8
+#define Z22   C
+#define Z23   0
+#define Z24   5
+#define Z25   2
+#define Z26   F
+#define Z27   D
+#define Z28   A
+#define Z29   E
+#define Z2A   3
+#define Z2B   6
+#define Z2C   7
+#define Z2D   1
+#define Z2E   9
+#define Z2F   4
+// Round 3
+#define Z30   7
+#define Z31   9
+#define Z32   3
+#define Z33   1
+#define Z34   D
+#define Z35   C
+#define Z36   B
+#define Z37   E
+#define Z38   2
+#define Z39   6
+#define Z3A   5
+#define Z3B   A
+#define Z3C   4
+#define Z3D   0
+#define Z3E   F
+#define Z3F   8
+// Round 4
+#define Z40   9
+#define Z41   0
+#define Z42   5
+#define Z43   7
+#define Z44   2
+#define Z45   4
+#define Z46   A
+#define Z47   F
+#define Z48   E
+#define Z49   1
+#define Z4A   B
+#define Z4B   C
+#define Z4C   6
+#define Z4D   8
+#define Z4E   3
+#define Z4F   D
+// Round 5
+#define Z50   2
+#define Z51   C
+#define Z52   6
+#define Z53   A
+#define Z54   0
+#define Z55   B
+#define Z56   8
+#define Z57   3
+#define Z58   4
+#define Z59   D
+#define Z5A   7
+#define Z5B   5
+#define Z5C   F
+#define Z5D   E
+#define Z5E   1
+#define Z5F   9
+// Round 6
+#define Z60   C
+#define Z61   5
+#define Z62   1
+#define Z63   F
+#define Z64   E
+#define Z65   D
+#define Z66   4
+#define Z67   A
+#define Z68   0
+#define Z69   7
+#define Z6A   6
+#define Z6B   3
+#define Z6C   9
+#define Z6D   2
+#define Z6E   8
+#define Z6F   B
+// Round 7
+#define Z70   D
+#define Z71   B
+#define Z72   7
+#define Z73   E
+#define Z74   C
+#define Z75   1
+#define Z76   3
+#define Z77   9
+#define Z78   5
+#define Z79   0
+#define Z7A   F
+#define Z7B   4
+#define Z7C   8
+#define Z7D   6
+#define Z7E   2
+#define Z7F   A
+// Round 8
+#define Z80   6
+#define Z81   F
+#define Z82   E
+#define Z83   9
+#define Z84   B
+#define Z85   3
+#define Z86   0
+#define Z87   8
+#define Z88   C
+#define Z89   2
+#define Z8A   D
+#define Z8B   7
+#define Z8C   1
+#define Z8D   4
+#define Z8E   A
+#define Z8F   5
+// Round 9
+#define Z90   A
+#define Z91   2
+#define Z92   8
+#define Z93   4
+#define Z94   7
+#define Z95   6
+#define Z96   1
+#define Z97   5
+#define Z98   F
+#define Z99   B
+#define Z9A   9
+#define Z9B   E
+#define Z9C   3
+#define Z9D   C
+#define Z9E   D
+#define Z9F   0
+
+#define Mx(r, i)    Mx_(Z ## r ## i)   // paste the table name Z<r><i>, e.g. Mx(1, C) -> Mx_(Z1C)
+#define Mx_(n)      Mx__(n)            // extra level so Z<r><i> is macro-expanded to its hex digit before pasting
+#define Mx__(n)     M ## n             // paste onto M, e.g. Z1C = B yields MB; M0..MF must exist at the use site
+
 #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
 
 #define B2B8W_G(a, b, c, d, x, y) \
diff --git a/algo/blake/blake512-hash-4way.c b/algo/blake/blake512-hash-4way.c
index 246e3af0..5c947e18 100644
--- a/algo/blake/blake512-hash-4way.c
+++ b/algo/blake/blake512-hash-4way.c
@@ -757,7 +757,6 @@ void blake512_8way_final_le( blake_8way_big_context *sc, void *hash,
    GB_8WAY(Mx(1, C), Mx(1, D), CBx(1, C), CBx(1, D), V2, V7, V8, VD);
    GB_8WAY(Mx(1, E), Mx(1, F), CBx(1, E), CBx(1, F), V3, V4, V9, VE);
 
-
    // remaining rounds  
    ROUND_B_8WAY(2);
    ROUND_B_8WAY(3);
diff --git a/algo/blake/sph_blake2b.c b/algo/blake/sph_blake2b.c
index a17d7d75..9e13fe4f 100644
--- a/algo/blake/sph_blake2b.c
+++ b/algo/blake/sph_blake2b.c
@@ -30,16 +30,10 @@
 #include <stdlib.h>
 #include <stdint.h>
 #include <string.h>
-
+#include "simd-utils.h"
 #include "algo/sha/sph_types.h"
 #include "sph_blake2b.h"
 
-// Cyclic right rotation.
-
-#ifndef ROTR64
-#define ROTR64(x, y)  (((x) >> (y)) ^ ((x) << (64 - (y))))
-#endif
-
 // Little-endian byte access.
 
 #define B2B_GET64(p)                            \
@@ -54,45 +48,131 @@
 
 // G Mixing function.
 
-#define B2B_G(a, b, c, d, x, y) {   \
-	v[a] = v[a] + v[b] + x;         \
-	v[d] = ROTR64(v[d] ^ v[a], 32); \
-	v[c] = v[c] + v[d];             \
-	v[b] = ROTR64(v[b] ^ v[c], 24); \
-	v[a] = v[a] + v[b] + y;         \
-	v[d] = ROTR64(v[d] ^ v[a], 16); \
-	v[c] = v[c] + v[d];             \
-	v[b] = ROTR64(v[b] ^ v[c], 63); }
+#if defined(__AVX2__)
+// AVX2: half of G on 4 lanes at once. Uses V, m and sigma from the expansion site; Sa..Sd are the sigma slots for lanes 0..3.
+#define BLAKE2B_G( R, Sa, Sb, Sc, Sd, Na, Nb ) \
+{ \
+  V[0] = _mm256_add_epi64( V[0], _mm256_add_epi64( V[1], \
+              _mm256_set_epi64x( m[ sigma[R][Sd] ], m[ sigma[R][Sc] ], \
+                                 m[ sigma[R][Sb] ], m[ sigma[R][Sa] ] ) ) ); \
+  V[3] = mm256_ror_64( _mm256_xor_si256( V[3], V[0] ), Na ); \
+  V[2] = _mm256_add_epi64( V[2], V[3] ); \
+  V[1] = mm256_ror_64( _mm256_xor_si256( V[1], V[2] ), Nb ); \
+}
+// One round on v[16] viewed as four __m256i: G, shuffle V[1..3] (presumably onto the diagonals - see simd-utils.h), G, shuffle back.
+#define BLAKE2B_ROUND( R ) \
+{ \
+  __m256i *V = (__m256i*)v; \
+  BLAKE2B_G( R,  0,  2,  4,  6, 32, 24 ); \
+  BLAKE2B_G( R,  1,  3,  5,  7, 16, 63 ); \
+  V[3] = mm256_shufll_64( V[3] ); \
+  V[2] = mm256_swap_128( V[2] ); \
+  V[1] = mm256_shuflr_64( V[1] ); \
+  BLAKE2B_G( R,  8, 10, 12, 14, 32, 24 ); \
+  BLAKE2B_G( R,  9, 11, 13, 15, 16, 63 ); \
+  V[3] = mm256_shuflr_64( V[3] ); \
+  V[2] = mm256_swap_128( V[2] ); \
+  V[1] = mm256_shufll_64( V[1] ); \
+}
+
+#elif defined(__SSSE3__)
+// SSSE3: half of G on 2 lanes at once. Sa and Sb are the sigma slots for lanes 0 and 1; Na and Nb are the two rotate counts.
+#define BLAKE2B_G( R, Va, Vb, Vc, Vd, Sa, Sb, Na, Nb ) \
+{ \
+   Va = _mm_add_epi64( Va, _mm_add_epi64( Vb, \
+                 _mm_set_epi64x( m[ sigma[R][Sb] ], m[ sigma[R][Sa] ] ) ) ); \
+   Vd = mm128_ror_64( _mm_xor_si128( Vd, Va ), Na ); \
+   Vc = _mm_add_epi64( Vc, Vd ); \
+   Vb = mm128_ror_64( _mm_xor_si128( Vb, Vc ), Nb ); \
+}
+// One round on v[16] viewed as eight __m128i: G on V[0..7], then G again using V2/V3/V6/V7 temporaries, which are written back shuffled.
+#define BLAKE2B_ROUND( R ) \
+{ \
+   __m128i *V = (__m128i*)v; \
+   __m128i V2, V3, V6, V7; \
+   BLAKE2B_G( R, V[0], V[2], V[4], V[6], 0, 2, 32, 24 ); \
+   BLAKE2B_G( R, V[0], V[2], V[4], V[6], 1, 3, 16, 63 ); \
+   BLAKE2B_G( R, V[1], V[3], V[5], V[7], 4, 6, 32, 24 ); \
+   BLAKE2B_G( R, V[1], V[3], V[5], V[7], 5, 7, 16, 63 ); \
+   V2 = mm128_shufl2r_64( V[2], V[3] ); \
+   V3 = mm128_shufl2r_64( V[3], V[2] ); \
+   V6 = mm128_shufl2l_64( V[6], V[7] ); \
+   V7 = mm128_shufl2l_64( V[7], V[6] ); \
+   BLAKE2B_G( R, V[0], V2, V[5], V6,  8, 10, 32, 24 ); \
+   BLAKE2B_G( R, V[0], V2, V[5], V6,  9, 11, 16, 63 ); \
+   BLAKE2B_G( R, V[1], V3, V[4], V7, 12, 14, 32, 24 ); \
+   BLAKE2B_G( R, V[1], V3, V[4], V7, 13, 15, 16, 63 ); \
+   V[2] = mm128_shufl2l_64( V2, V3 ); \
+   V[3] = mm128_shufl2l_64( V3, V2 ); \
+   V[6] = mm128_shufl2r_64( V6, V7 ); \
+   V[7] = mm128_shufl2r_64( V7, V6 ); \
+}
+
+#else
+// Scalar fallback used when neither AVX2 nor SSSE3 is enabled at compile time.
+#ifndef ROTR64
+#define ROTR64(x, y)  (((x) >> (y)) ^ ((x) << (64 - (y))))
+#endif
+// Full G on four state words; Sa and Sb select the two sigma slots of round R whose message words are mixed in.
+#define BLAKE2B_G( R, Va, Vb, Vc, Vd, Sa, Sb ) \
+{ \
+   Va = Va + Vb + m[ sigma[R][Sa] ]; \
+   Vd = ROTR64( Vd ^ Va, 32 ); \
+   Vc = Vc + Vd; \
+   Vb = ROTR64( Vb ^ Vc, 24 ); \
+   Va = Va + Vb + m[ sigma[R][Sb] ]; \
+   Vd = ROTR64( Vd ^ Va, 16 ); \
+   Vc = Vc + Vd; \
+   Vb = ROTR64( Vb ^ Vc, 63 ); \
+}
+// One round: G on the four columns (0,4,8,12)..(3,7,11,15), then on the four diagonals (0,5,10,15)..(3,4,9,14) of v[].
+#define BLAKE2B_ROUND( R ) \
+{ \
+   BLAKE2B_G( R, v[ 0], v[ 4], v[ 8], v[12],  0,  1 ); \
+   BLAKE2B_G( R, v[ 1], v[ 5], v[ 9], v[13],  2,  3 ); \
+   BLAKE2B_G( R, v[ 2], v[ 6], v[10], v[14],  4,  5 ); \
+   BLAKE2B_G( R, v[ 3], v[ 7], v[11], v[15],  6,  7 ); \
+   BLAKE2B_G( R, v[ 0], v[ 5], v[10], v[15],  8,  9 ); \
+   BLAKE2B_G( R, v[ 1], v[ 6], v[11], v[12], 10, 11 ); \
+   BLAKE2B_G( R, v[ 2], v[ 7], v[ 8], v[13], 12, 13 ); \
+   BLAKE2B_G( R, v[ 3], v[ 4], v[ 9], v[14], 14, 15 ); \
+}
+
+#endif
 
 // Initialization Vector.
 
-static const uint64_t blake2b_iv[8] = {
+static const uint64_t blake2b_iv[8] __attribute__ ((aligned (32))) =
+{
 	0x6A09E667F3BCC908, 0xBB67AE8584CAA73B,
 	0x3C6EF372FE94F82B, 0xA54FF53A5F1D36F1,
 	0x510E527FADE682D1, 0x9B05688C2B3E6C1F,
 	0x1F83D9ABFB41BD6B, 0x5BE0CD19137E2179
 };
 
+static const uint8_t sigma[12][16] __attribute__ ((aligned (32))) =
+{     // sigma[r][s]: index into m[] of the message word used at slot s of round r (read by the BLAKE2B_G macros)
+      { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
+      { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 },
+      { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 },
+      { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 },
+      { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 },
+      { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 },
+      { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 },
+      { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 },
+      { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 },
+      { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 },
+      { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },   // rounds 10 and 11 repeat rows 0 and 1
+      { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }
+};
+
 // Compression function. "last" flag indicates last block.
 
 static void blake2b_compress( sph_blake2b_ctx *ctx, int last )
 {
-	const uint8_t sigma[12][16] = {
-		{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
-		{ 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 },
-		{ 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 },
-		{ 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 },
-		{ 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 },
-		{ 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 },
-		{ 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 },
-		{ 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 },
-		{ 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 },
-		{ 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 },
-		{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
-		{ 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }
-	};
-	int i;
-	uint64_t v[16], m[16];
+	uint64_t v[16] __attribute__ ((aligned (32)));
+   uint64_t m[16] __attribute__ ((aligned (32)));
+   int i;
 
 	for (i = 0; i < 8; i++) {           // init work variables
 		v[i] = ctx->h[i];
@@ -106,16 +186,8 @@ static void blake2b_compress( sph_blake2b_ctx *ctx, int last )
 	for (i = 0; i < 16; i++)            // get little-endian words
 		m[i] = B2B_GET64(&ctx->b[8 * i]);
 
-	for (i = 0; i < 12; i++) {          // twelve rounds
-		B2B_G( 0, 4,  8, 12, m[sigma[i][ 0]], m[sigma[i][ 1]]);
-		B2B_G( 1, 5,  9, 13, m[sigma[i][ 2]], m[sigma[i][ 3]]);
-		B2B_G( 2, 6, 10, 14, m[sigma[i][ 4]], m[sigma[i][ 5]]);
-		B2B_G( 3, 7, 11, 15, m[sigma[i][ 6]], m[sigma[i][ 7]]);
-		B2B_G( 0, 5, 10, 15, m[sigma[i][ 8]], m[sigma[i][ 9]]);
-		B2B_G( 1, 6, 11, 12, m[sigma[i][10]], m[sigma[i][11]]);
-		B2B_G( 2, 7,  8, 13, m[sigma[i][12]], m[sigma[i][13]]);
-		B2B_G( 3, 4,  9, 14, m[sigma[i][14]], m[sigma[i][15]]);
-	}
+	for (i = 0; i < 12; i++)
+      BLAKE2B_ROUND( i );   
 
 	for( i = 0; i < 8; ++i )
 		ctx->h[i] ^= v[i] ^ v[i + 8];
diff --git a/algo/lyra2/sponge.h b/algo/lyra2/sponge.h
index 1c904447..2385640e 100644
--- a/algo/lyra2/sponge.h
+++ b/algo/lyra2/sponge.h
@@ -150,12 +150,10 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
    G_2X64( s1, s3, s5, s7 ); \
    mm128_vrol256_64( s6, s7 ); \
    mm128_vror256_64( s2, s3 ); \
-   mm128_swap256_128( s4, s5 ); \
-   G_2X64( s0, s2, s4, s6 ); \
-   G_2X64( s1, s3, s5, s7 ); \
+   G_2X64( s0, s2, s5, s6 ); \
+   G_2X64( s1, s3, s4, s7 ); \
    mm128_vror256_64( s6, s7 ); \
-   mm128_vrol256_64( s2, s3 ); \
-   mm128_swap256_128( s4, s5 );
+   mm128_vrol256_64( s2, s3 );
 
 #define LYRA_12_ROUNDS_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
    LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
diff --git a/algo/yespower/crypto/blake2b-yp.c b/algo/yespower/crypto/blake2b-yp.c
deleted file mode 100644
index dc6eee6a..00000000
--- a/algo/yespower/crypto/blake2b-yp.c
+++ /dev/null
@@ -1,323 +0,0 @@
-/*
- * Copyright 2009 Colin Percival, 2014 savale
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- *  notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- *  notice, this list of conditions and the following disclaimer in the
- *  documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- *
- * This file was originally written by Colin Percival as part of the Tarsnap
- * online backup system.
- */
-
-#include <stdlib.h>
-#include <stdint.h>
-#include <string.h>
-#include "simd-utils.h"
-#include <algo/yespower/crypto/sph_types.h>
-#include "blake2b-yp.h"
-
-// Cyclic right rotation.
-//#ifndef ROTR64
-//#define ROTR64(x, y)  (((x) >> (y)) ^ ((x) << (64 - (y))))
-//#endif
-
-#define ROTR64(x, y) ror64( x, y )
-
-// Little-endian byte access.
-#define B2B_GET64(p)                            \
-    (((uint64_t) ((uint8_t *) (p))[0]) ^        \
-    (((uint64_t) ((uint8_t *) (p))[1]) << 8) ^  \
-    (((uint64_t) ((uint8_t *) (p))[2]) << 16) ^ \
-    (((uint64_t) ((uint8_t *) (p))[3]) << 24) ^ \
-    (((uint64_t) ((uint8_t *) (p))[4]) << 32) ^ \
-    (((uint64_t) ((uint8_t *) (p))[5]) << 40) ^ \
-    (((uint64_t) ((uint8_t *) (p))[6]) << 48) ^ \
-    (((uint64_t) ((uint8_t *) (p))[7]) << 56))
-
-// G Mixing function.
-#define B2B_G(a, b, c, d, x, y) {   \
-    v[a] = v[a] + v[b] + x;      \
-    v[d] = ROTR64(v[d] ^ v[a], 32); \
-    v[c] = v[c] + v[d];          \
-    v[b] = ROTR64(v[b] ^ v[c], 24); \
-    v[a] = v[a] + v[b] + y;      \
-    v[d] = ROTR64(v[d] ^ v[a], 16); \
-    v[c] = v[c] + v[d];          \
-    v[b] = ROTR64(v[b] ^ v[c], 63); }
-
-// Initialization Vector.
-static const uint64_t blake2b_iv[8] = {
-    0x6A09E667F3BCC908, 0xBB67AE8584CAA73B,
-    0x3C6EF372FE94F82B, 0xA54FF53A5F1D36F1,
-    0x510E527FADE682D1, 0x9B05688C2B3E6C1F,
-    0x1F83D9ABFB41BD6B, 0x5BE0CD19137E2179
-};
-
-// Compression function. "last" flag indicates last block.
-static void blake2b_compress(blake2b_yp_ctx *ctx, int last)
-{
-    const uint8_t sigma[12][16] = {
-        { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
-        { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 },
-        { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 },
-        { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 },
-        { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 },
-        { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 },
-        { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 },
-        { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 },
-        { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 },
-        { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 },
-        { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
-        { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }
-    };
-    int i;
-    uint64_t v[16], m[16];
-
-    // init work variables
-    for (i = 0; i < 8; i++) {
-        v[i] = ctx->h[i];
-        v[i + 8] = blake2b_iv[i];
-    }
-
-    v[12] ^= ctx->t[0]; // low 64 bits of offset
-    v[13] ^= ctx->t[1]; // high 64 bits
-
-    // last block flag set ?
-    if (last) { 
-        v[14] = ~v[14];
-    }
-
-    // get little-endian words
-    for (i = 0; i < 16; i++) {
-        m[i] = B2B_GET64(&ctx->b[8 * i]);
-    }
-
-    // twelve rounds
-    for (i = 0; i < 12; i++) {
-        B2B_G( 0, 4,  8, 12, m[sigma[i][ 0]], m[sigma[i][ 1]]);
-        B2B_G( 1, 5,  9, 13, m[sigma[i][ 2]], m[sigma[i][ 3]]);
-        B2B_G( 2, 6, 10, 14, m[sigma[i][ 4]], m[sigma[i][ 5]]);
-        B2B_G( 3, 7, 11, 15, m[sigma[i][ 6]], m[sigma[i][ 7]]);
-        B2B_G( 0, 5, 10, 15, m[sigma[i][ 8]], m[sigma[i][ 9]]);
-        B2B_G( 1, 6, 11, 12, m[sigma[i][10]], m[sigma[i][11]]);
-        B2B_G( 2, 7,  8, 13, m[sigma[i][12]], m[sigma[i][13]]);
-        B2B_G( 3, 4,  9, 14, m[sigma[i][14]], m[sigma[i][15]]);
-    }
-
-    for(i = 0; i < 8; ++i) {
-        ctx->h[i] ^= v[i] ^ v[i + 8];
-    }
-}
-
-// Initialize the hashing context "ctx" with optional key "key".
-// 1 <= outlen <= 64 gives the digest size in bytes.
-// Secret key (also <= 64 bytes) is optional (keylen = 0).
-int blake2b_yp_init(blake2b_yp_ctx *ctx, size_t outlen,
-    const void *key, size_t keylen) // (keylen=0: no key)
-{
-    size_t i;
-
-    // illegal parameters
-    if (outlen == 0 || outlen > 64 || keylen > 64) {
-        return -1;
-    }
-
-    // state, "param block"
-    for (i = 0; i < 8; i++) {
-        ctx->h[i] = blake2b_iv[i];
-    }
-
-    ctx->h[0] ^= 0x01010000 ^ (keylen << 8) ^ outlen;
-
-    ctx->t[0] = 0; // input count low word
-    ctx->t[1] = 0; // input count high word
-    ctx->c = 0; // pointer within buffer
-    ctx->outlen = outlen;
-
-    // zero input block
-    for (i = keylen; i < 128; i++) {
-        ctx->b[i] = 0;
-    }
-
-    if (keylen > 0) {
-        blake2b_yp_update(ctx, key, keylen);
-        ctx->c = 128; // at the end
-    }
-
-    return 0;
-}
-
-// Add "inlen" bytes from "in" into the hash.
-void blake2b_yp_update(blake2b_yp_ctx *ctx,
-    const void *in, size_t inlen) // data bytes
-{
-    size_t i;
-    for (i = 0; i < inlen; i++) {
-        if (ctx->c == 128) { // buffer full ?
-            ctx->t[0] += ctx->c; // add counters
-            if (ctx->t[0] < ctx->c) // carry overflow ?
-                ctx->t[1]++; // high word
-            blake2b_compress(ctx, 0); // compress (not last)
-            ctx->c = 0; // counter to zero
-        }
-        ctx->b[ctx->c++] = ((const uint8_t *) in)[i];
-    }
-}
-
-// Generate the message digest (size given in init).
-// Result placed in "out".
-void blake2b_yp_final(blake2b_yp_ctx *ctx, void *out)
-{
-    size_t i;
-
-    ctx->t[0] += ctx->c; // mark last block offset
-    // carry overflow
-    if (ctx->t[0] < ctx->c) {
-        ctx->t[1]++; // high word
-    }
-
-    // fill up with zeros
-    while (ctx->c < 128) {
-        ctx->b[ctx->c++] = 0;
-    }
-
-    blake2b_compress(ctx, 1); // final block flag = 1
-
-    // little endian convert and store
-    for (i = 0; i < ctx->outlen; i++) {
-        ((uint8_t *) out)[i] =
-            (ctx->h[i >> 3] >> (8 * (i & 7))) & 0xFF;
-    }
-}
-
-// inlen = number of bytes
-void blake2b_yp_hash(void *out, const void *in, size_t inlen) {
-    blake2b_yp_ctx ctx;
-    blake2b_yp_init(&ctx, 32, NULL, 0);
-    blake2b_yp_update(&ctx, in, inlen);
-    blake2b_yp_final(&ctx, out);
-}
-
-// // keylen = number of bytes
-void hmac_blake2b_yp_init(hmac_yp_ctx *hctx, const void *_key, size_t keylen) {
-    const uint8_t *key = _key;
-    uint8_t keyhash[32];
-    uint8_t pad[64];
-    uint64_t i;
-
-    if (keylen > 64) {
-        blake2b_yp_hash(keyhash, key, keylen);
-        key = keyhash;
-        keylen = 32;
-    }
-
-    blake2b_yp_init(&hctx->inner, 32, NULL, 0);
-    memset(pad, 0x36, 64);
-    for (i = 0; i < keylen; ++i) {
-        pad[i] ^= key[i];
-    }
-
-    blake2b_yp_update(&hctx->inner, pad, 64);
-    blake2b_yp_init(&hctx->outer, 32, NULL, 0);
-    memset(pad, 0x5c, 64);
-    for (i = 0; i < keylen; ++i) {
-        pad[i] ^= key[i];
-    }
-
-    blake2b_yp_update(&hctx->outer, pad, 64);
-    memset(keyhash, 0, 32);
-}
-
-// datalen = number of bits
-void hmac_blake2b_yp_update(hmac_yp_ctx *hctx, const void *data, size_t datalen) {
-    // update the inner state
-    blake2b_yp_update(&hctx->inner, data, datalen);
-}
-
-void hmac_blake2b_yp_final(hmac_yp_ctx *hctx, uint8_t *digest) {
-    uint8_t ihash[32];
-    blake2b_yp_final(&hctx->inner, ihash);
-    blake2b_yp_update(&hctx->outer, ihash, 32);
-    blake2b_yp_final(&hctx->outer, digest);
-    memset(ihash, 0, 32);
-}
-
-// // keylen = number of bytes; inlen = number of bytes
-void hmac_blake2b_yp_hash(void *out, const void *key, size_t keylen, const void *in, size_t inlen) {
-    hmac_yp_ctx hctx;
-    hmac_blake2b_yp_init(&hctx, key, keylen);
-    hmac_blake2b_yp_update(&hctx, in, inlen);
-    hmac_blake2b_yp_final(&hctx, out);
-}
-
-void pbkdf2_blake2b_yp(const uint8_t * passwd, size_t passwdlen, const uint8_t * salt,
-    size_t saltlen, uint64_t c, uint8_t * buf, size_t dkLen)
-{
-    hmac_yp_ctx PShctx, hctx;
-    size_t i;
-    uint32_t ivec;
-    uint8_t U[32];
-    uint8_t T[32];
-    uint64_t j;
-    int k;
-    size_t clen;
-
-    /* Compute HMAC state after processing P and S. */
-    hmac_blake2b_yp_init(&PShctx, passwd, passwdlen);
-    hmac_blake2b_yp_update(&PShctx, salt, saltlen);
-
-    /* Iterate through the blocks. */
-    for (i = 0; i * 32 < dkLen; i++) {
-        /* Generate INT(i + 1). */
-        ivec = bswap_32( i+1 );
-
-        /* Compute U_1 = PRF(P, S || INT(i)). */
-        memcpy(&hctx, &PShctx, sizeof(hmac_yp_ctx));
-        hmac_blake2b_yp_update(&hctx, &ivec, 4);
-        hmac_blake2b_yp_final(&hctx, U);
-
-        /* T_i = U_1 ... */
-        memcpy(T, U, 32);
-
-        for (j = 2; j <= c; j++) {
-            /* Compute U_j. */
-            hmac_blake2b_yp_init(&hctx, passwd, passwdlen);
-            hmac_blake2b_yp_update(&hctx, U, 32);
-            hmac_blake2b_yp_final(&hctx, U);
-
-            /* ... xor U_j ... */
-            for (k = 0; k < 32; k++) {
-                T[k] ^= U[k];
-            }
-        }
-
-        /* Copy as many bytes as necessary into buf. */
-        clen = dkLen - i * 32;
-        if (clen > 32) {
-            clen = 32;
-        }
-
-        memcpy(&buf[i * 32], T, clen);
-    }
-
-    /* Clean PShctx, since we never called _Final on it. */
-    memset(&PShctx, 0, sizeof(hmac_yp_ctx));
-}
diff --git a/algo/yespower/crypto/blake2b-yp.h b/algo/yespower/crypto/blake2b-yp.h
deleted file mode 100644
index a240bc6a..00000000
--- a/algo/yespower/crypto/blake2b-yp.h
+++ /dev/null
@@ -1,42 +0,0 @@
-#pragma once
-#ifndef __BLAKE2B_H__
-#define __BLAKE2B_H__
-
-#include <stddef.h>
-#include <stdint.h>
-
-#if defined(_MSC_VER) || defined(__x86_64__) || defined(__x86__)
-#define NATIVE_LITTLE_ENDIAN
-#endif
-
-// state context
-typedef struct {
-    uint8_t b[128]; // input buffer
-    uint64_t h[8];  // chained state
-    uint64_t t[2];  // total number of bytes
-    size_t c;       // pointer for b[]
-    size_t outlen;  // digest size
-} blake2b_yp_ctx;
-
-typedef struct {
-    blake2b_yp_ctx inner;
-    blake2b_yp_ctx outer;
-} hmac_yp_ctx;
-
-#if defined(__cplusplus)
-extern "C" {
-#endif
-
-int blake2b_yp_init(blake2b_yp_ctx *ctx, size_t outlen, const void *key, size_t keylen);
-void blake2b_yp_update(blake2b_yp_ctx *ctx, const void *in, size_t inlen);
-void blake2b_yp_final(blake2b_yp_ctx *ctx, void *out);
-void blake2b_yp_hash(void *out, const void *in, size_t inlen);
-void hmac_blake2b_yp_hash(void *out, const void *key, size_t keylen, const void *in, size_t inlen);
-void pbkdf2_blake2b_yp(const uint8_t * passwd, size_t passwdlen, const uint8_t * salt,
-    size_t saltlen, uint64_t c, uint8_t * buf, size_t dkLen);
-
-#if defined(__cplusplus)
-}
-#endif
-
-#endif
diff --git a/algo/yespower/crypto/hmac-blake2b.c b/algo/yespower/crypto/hmac-blake2b.c
new file mode 100644
index 00000000..83b9e58a
--- /dev/null
+++ b/algo/yespower/crypto/hmac-blake2b.c
@@ -0,0 +1,150 @@
+/*
+ * Copyright 2009 Colin Percival, 2014 savale
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *  notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *  notice, this list of conditions and the following disclaimer in the
+ *  documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * This file was originally written by Colin Percival as part of the Tarsnap
+ * online backup system.
+ */
+
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include "simd-utils.h"
+#include "hmac-blake2b.h"
+
+// Key both hash states of hctx from _key; keylen = number of bytes.
+void hmac_blake2b_init( hmac_blake2b_ctx *hctx, const void *_key,
+                        size_t keylen )
+{
+    const uint8_t *key = _key;
+    uint8_t keyhash[32];   // replacement key when the caller's key exceeds 64 bytes
+    uint8_t pad[64];       // constant-filled block with the key XORed into its first keylen bytes
+    uint64_t i;
+
+    if (keylen > 64)       // key would not fit in pad[]: hash it down to 32 bytes first
+    {
+       sph_blake2b_ctx ctx;
+       sph_blake2b_init( &ctx, 32, NULL, 0 );
+       sph_blake2b_update( &ctx, key, keylen );
+       sph_blake2b_final( &ctx, keyhash );
+       key = keyhash;
+       keylen = 32;
+    }
+
+    sph_blake2b_init( &hctx->inner, 32, NULL, 0 );
+    memset( pad, 0x36, 64 );                        // inner pad byte
+    for ( i = 0; i < keylen; ++i )
+        pad[i] ^= key[i];
+
+    sph_blake2b_update( &hctx->inner, pad, 64 );    // inner hash begins with the (key ^ 0x36) block
+    sph_blake2b_init( &hctx->outer, 32, NULL, 0 );
+    memset( pad, 0x5c, 64 );                        // outer pad byte
+    for ( i = 0; i < keylen; ++i )
+        pad[i] ^= key[i];
+
+    sph_blake2b_update( &hctx->outer, pad, 64 );    // outer hash begins with the (key ^ 0x5c) block
+    memset( keyhash, 0, 32 );   // wipe the hashed key; NOTE(review): pad[] still holds key-derived bytes on return
+}
+
+// datalen = number of bytes (callers in this file pass 4 and 32 for 4- and 32-byte buffers)
+void hmac_blake2b_update( hmac_blake2b_ctx *hctx, const void *data,
+                          size_t datalen )
+{
+    // Message data goes into the inner hash only; the outer hash is fed in hmac_blake2b_final.
+    sph_blake2b_update( &hctx->inner, data, datalen );
+}
+
+void hmac_blake2b_final( hmac_blake2b_ctx *hctx, uint8_t *digest )   // digest receives the outer hash output
+{
+    uint8_t ihash[32];                                // inner result; both states are initialised with 32 in hmac_blake2b_init
+    sph_blake2b_final( &hctx->inner, ihash );
+    sph_blake2b_update( &hctx->outer, ihash, 32 );    // outer hash absorbs the finished inner hash
+    sph_blake2b_final( &hctx->outer, digest );
+    memset( ihash, 0, 32 );                           // clear the intermediate value
+}
+
+// One-shot init/update/final: keylen = number of bytes; inlen = number of bytes
+void hmac_blake2b_hash( void *out, const void *key, size_t keylen,
+                        const void *in, size_t inlen )
+{
+    hmac_blake2b_ctx hctx;
+    hmac_blake2b_init( &hctx, key, keylen );
+    hmac_blake2b_update( &hctx, in, inlen );
+    hmac_blake2b_final( &hctx, out );   // NOTE(review): hctx is not cleared before returning
+}
+
+void pbkdf2_blake2b( const uint8_t *passwd, size_t passwdlen,
+                     const uint8_t *salt, size_t saltlen, uint64_t c,
+                     uint8_t *buf, size_t dkLen )
+{   /* PBKDF2-style derivation over hmac_blake2b_*: c iterations per 32-byte block, dkLen bytes written to buf. */
+    hmac_blake2b_ctx PShctx, hctx;   /* PShctx: saved state after password and salt; hctx: per-block working copy */
+    size_t i;
+    uint32_t ivec;                   /* byte-swapped 1-based block number */
+    uint8_t U[32];                   /* most recent PRF output U_j */
+    uint8_t T[32];                   /* running XOR U_1 ^ ... ^ U_j for the current block */
+    uint64_t j;
+    int k;
+    size_t clen;
+
+    /* Compute HMAC state after processing P and S. */
+    hmac_blake2b_init( &PShctx, passwd, passwdlen );
+    hmac_blake2b_update( &PShctx, salt, saltlen );
+
+    /* Iterate through the 32-byte output blocks. */
+    for ( i = 0; i * 32 < dkLen; i++ )
+    {
+        /* Generate INT(i + 1); bswap_32 presumably yields big-endian bytes on a little-endian host - confirm. */
+        ivec = bswap_32( i+1 );
+
+        /* Compute U_1 = PRF(P, S || INT(i + 1)), starting from the saved P,S state. */
+        memcpy( &hctx, &PShctx, sizeof(hmac_blake2b_ctx) );
+        hmac_blake2b_update( &hctx, &ivec, 4 );
+        hmac_blake2b_final( &hctx, U );
+
+        /* T_i = U_1 ... */
+        memcpy( T, U, 32 );
+
+        for ( j = 2; j <= c; j++ )
+        {
+            /* Compute U_j = PRF(P, U_{j-1}); the HMAC is re-keyed from passwd on every iteration. */
+            hmac_blake2b_init( &hctx, passwd, passwdlen );
+            hmac_blake2b_update( &hctx, U, 32 );
+            hmac_blake2b_final( &hctx, U );
+
+            /* ... xor U_j ... */
+            for ( k = 0; k < 32; k++ )
+                T[k] ^= U[k];
+        }
+
+        /* Copy as many bytes as necessary into buf (the last block may be shorter than 32). */
+        clen = dkLen - i * 32;
+        if (clen > 32)
+            clen = 32;
+
+        memcpy( &buf[i * 32], T, clen );
+    }
+
+    /* Clean PShctx, since we never called _Final on it. hctx, U and T are left as they are. */
+    memset( &PShctx, 0, sizeof(hmac_blake2b_ctx) );
+}
diff --git a/algo/yespower/crypto/hmac-blake2b.h b/algo/yespower/crypto/hmac-blake2b.h
new file mode 100644
index 00000000..d90b6438
--- /dev/null
+++ b/algo/yespower/crypto/hmac-blake2b.h
@@ -0,0 +1,34 @@
+#pragma once
+#ifndef __HMAC_BLAKE2B_H__
+#define __HMAC_BLAKE2B_H__
+
+#include <stddef.h>
+#include <stdint.h>
+#include "algo/blake/sph_blake2b.h"
+
+#if defined(_MSC_VER) || defined(__x86_64__) || defined(__x86__)
+#define NATIVE_LITTLE_ENDIAN
+#endif
+
+typedef struct   // working state for the hmac_blake2b_* functions in hmac-blake2b.c
+{
+    sph_blake2b_ctx inner;   // seeded with the (key ^ 0x36) block; absorbs the message data
+    sph_blake2b_ctx outer;   // seeded with the (key ^ 0x5c) block; absorbs the finished inner hash
+} hmac_blake2b_ctx;
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+void hmac_blake2b_hash( void *out, const void *key, size_t keylen,
+                        const void *in, size_t inlen );   // one-shot HMAC; keylen and inlen are byte counts
+// Key derivation over HMAC-BLAKE2b: c iterations per block, dkLen bytes written to buf.
+void pbkdf2_blake2b( const uint8_t * passwd, size_t passwdlen,
+                     const uint8_t * salt, size_t saltlen, uint64_t c,
+                     uint8_t * buf, size_t dkLen );
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif
diff --git a/algo/yespower/crypto/sph_types.h b/algo/yespower/crypto/sph_types.h
deleted file mode 100644
index cef79bde..00000000
--- a/algo/yespower/crypto/sph_types.h
+++ /dev/null
@@ -1,1976 +0,0 @@
-/* $Id: sph_types.h 260 2011-07-21 01:02:38Z tp $ */
-/**
- * Basic type definitions.
- *
- * This header file defines the generic integer types that will be used
- * for the implementation of hash functions; it also contains helper
- * functions which encode and decode multi-byte integer values, using
- * either little-endian or big-endian conventions.
- *
- * This file contains a compile-time test on the size of a byte
- * (the <code>unsigned char</code> C type). If bytes are not octets,
- * i.e. if they do not have a size of exactly 8 bits, then compilation
- * is aborted. Architectures where bytes are not octets are relatively
- * rare, even in the embedded devices market. We forbid non-octet bytes
- * because there is no clear convention on how octet streams are encoded
- * on such systems.
- *
- * ==========================(LICENSE BEGIN)============================
- *
- * Copyright (c) 2007-2010  Projet RNRT SAPHIR
- * 
- * Permission is hereby granted, free of charge, to any person obtaining
- * a copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sublicense, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- * 
- * The above copyright notice and this permission notice shall be
- * included in all copies or substantial portions of the Software.
- * 
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
- * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- * ===========================(LICENSE END)=============================
- *
- * @file     sph_types.h
- * @author   Thomas Pornin <thomas.pornin@cryptolog.com>
- */
-
-#ifndef SPH_TYPES_H__
-#define SPH_TYPES_H__
-
-#include <limits.h>
-
-/*
- * All our I/O functions are defined over octet streams. We do not know
- * how to handle input data if bytes are not octets.
- */
-#if CHAR_BIT != 8
-#error This code requires 8-bit bytes
-#endif
-
-/* ============= BEGIN documentation block for Doxygen ============ */
-
-#ifdef DOXYGEN_IGNORE
-
-/** @mainpage sphlib C code documentation
- *
- * @section overview Overview
- *
- * <code>sphlib</code> is a library which contains implementations of
- * various cryptographic hash functions. These pages have been generated
- * with <a href="http://www.doxygen.org/index.html">doxygen</a> and
- * document the API for the C implementations.
- *
- * The API is described in appropriate header files, which are available
- * in the "Files" section. Each hash function family has its own header,
- * whose name begins with <code>"sph_"</code> and contains the family
- * name. For instance, the API for the RIPEMD hash functions is available
- * in the header file <code>sph_ripemd.h</code>.
- *
- * @section principles API structure and conventions
- *
- * @subsection io Input/output conventions
- *
- * In all generality, hash functions operate over strings of bits.
- * Individual bits are rarely encountered in C programming or actual
- * communication protocols; most protocols converge on the ubiquitous
- * "octet" which is a group of eight bits. Data is thus expressed as a
- * stream of octets. The C programming language contains the notion of a
- * "byte", which is a data unit managed under the type <code>"unsigned
- * char"</code>. The C standard prescribes that a byte should hold at
- * least eight bits, but possibly more. Most modern architectures, even
- * in the embedded world, feature eight-bit bytes, i.e. map bytes to
- * octets.
- *
- * Nevertheless, for some of the implemented hash functions, an extra
- * API has been added, which allows the input of arbitrary sequences of
- * bits: when the computation is about to be closed, 1 to 7 extra bits
- * can be added. The functions for which this API is implemented include
- * the SHA-2 functions and all SHA-3 candidates.
- *
- * <code>sphlib</code> defines hash function which may hash octet streams,
- * i.e. streams of bits where the number of bits is a multiple of eight.
- * The data input functions in the <code>sphlib</code> API expect data
- * as anonymous pointers (<code>"const void *"</code>) with a length
- * (of type <code>"size_t"</code>) which gives the input data chunk length
- * in bytes. A byte is assumed to be an octet; the <code>sph_types.h</code>
- * header contains a compile-time test which prevents compilation on
- * architectures where this property is not met.
- *
- * The hash function output is also converted into bytes. All currently
- * implemented hash functions have an output width which is a multiple of
- * eight, and this is likely to remain true for new designs.
- *
- * Most hash functions internally convert input data into 32-bit of 64-bit
- * words, using either little-endian or big-endian conversion. The hash
- * output also often consists of such words, which are encoded into output
- * bytes with a similar endianness convention. Some hash functions have
- * been only loosely specified on that subject; when necessary,
- * <code>sphlib</code> has been tested against published "reference"
- * implementations in order to use the same conventions.
- *
- * @subsection shortname Function short name
- *
- * Each implemented hash function has a "short name" which is used
- * internally to derive the identifiers for the functions and context
- * structures which the function uses. For instance, MD5 has the short
- * name <code>"md5"</code>. Short names are listed in the next section,
- * for the implemented hash functions. In subsequent sections, the
- * short name will be assumed to be <code>"XXX"</code>: replace with the
- * actual hash function name to get the C identifier.
- *
- * Note: some functions within the same family share the same core
- * elements, such as update function or context structure. Correspondingly,
- * some of the defined types or functions may actually be macros which
- * transparently evaluate to another type or function name.
- *
- * @subsection context Context structure
- *
- * Each implemented hash fonction has its own context structure, available
- * under the type name <code>"sph_XXX_context"</code> for the hash function
- * with short name <code>"XXX"</code>. This structure holds all needed
- * state for a running hash computation.
- *
- * The contents of these structures are meant to be opaque, and private
- * to the implementation. However, these contents are specified in the
- * header files so that application code which uses <code>sphlib</code>
- * may access the size of those structures.
- *
- * The caller is responsible for allocating the context structure,
- * whether by dynamic allocation (<code>malloc()</code> or equivalent),
- * static allocation (a global permanent variable), as an automatic
- * variable ("on the stack"), or by any other mean which ensures proper
- * structure alignment. <code>sphlib</code> code performs no dynamic
- * allocation by itself.
- *
- * The context must be initialized before use, using the
- * <code>sph_XXX_init()</code> function. This function sets the context
- * state to proper initial values for hashing.
- *
- * Since all state data is contained within the context structure,
- * <code>sphlib</code> is thread-safe and reentrant: several hash
- * computations may be performed in parallel, provided that they do not
- * operate on the same context. Moreover, a running computation can be
- * cloned by copying the context (with a simple <code>memcpy()</code>):
- * the context and its clone are then independant and may be updated
- * with new data and/or closed without interfering with each other.
- * Similarly, a context structure can be moved in memory at will:
- * context structures contain no pointer, in particular no pointer to
- * themselves.
- *
- * @subsection dataio Data input
- *
- * Hashed data is input with the <code>sph_XXX()</code> fonction, which
- * takes as parameters a pointer to the context, a pointer to the data
- * to hash, and the number of data bytes to hash. The context is updated
- * with the new data.
- *
- * Data can be input in one or several calls, with arbitrary input lengths.
- * However, it is best, performance wise, to input data by relatively big
- * chunks (say a few kilobytes), because this allows <code>sphlib</code> to
- * optimize things and avoid internal copying.
- *
- * When all data has been input, the context can be closed with
- * <code>sph_XXX_close()</code>. The hash output is computed and written
- * into the provided buffer. The caller must take care to provide a
- * buffer of appropriate length; e.g., when using SHA-1, the output is
- * a 20-byte word, therefore the output buffer must be at least 20-byte
- * long.
- *
- * For some hash functions, the <code>sph_XXX_addbits_and_close()</code>
- * function can be used instead of <code>sph_XXX_close()</code>. This
- * function can take a few extra <strong>bits</strong> to be added at
- * the end of the input message. This allows hashing messages with a
- * bit length which is not a multiple of 8. The extra bits are provided
- * as an unsigned integer value, and a bit count. The bit count must be
- * between 0 and 7, inclusive. The extra bits are provided as bits 7 to
- * 0 (bits of numerical value 128, 64, 32... downto 0), in that order.
- * For instance, to add three bits of value 1, 1 and 0, the unsigned
- * integer will have value 192 (1*128 + 1*64 + 0*32) and the bit count
- * will be 3.
- *
- * The <code>SPH_SIZE_XXX</code> macro is defined for each hash function;
- * it evaluates to the function output size, expressed in bits. For instance,
- * <code>SPH_SIZE_sha1</code> evaluates to <code>160</code>.
- *
- * When closed, the context is automatically reinitialized and can be
- * immediately used for another computation. It is not necessary to call
- * <code>sph_XXX_init()</code> after a close. Note that
- * <code>sph_XXX_init()</code> can still be called to "reset" a context,
- * i.e. forget previously input data, and get back to the initial state.
- *
- * @subsection alignment Data alignment
- *
- * "Alignment" is a property of data, which is said to be "properly
- * aligned" when its emplacement in memory is such that the data can
- * be optimally read by full words. This depends on the type of access;
- * basically, some hash functions will read data by 32-bit or 64-bit
- * words. <code>sphlib</code> does not mandate such alignment for input
- * data, but using aligned data can substantially improve performance.
- *
- * As a rule, it is best to input data by chunks whose length (in bytes)
- * is a multiple of eight, and which begins at "generally aligned"
- * addresses, such as the base address returned by a call to
- * <code>malloc()</code>.
- *
- * @section functions Implemented functions
- *
- * We give here the list of implemented functions. They are grouped by
- * family; to each family corresponds a specific header file. Each
- * individual function has its associated "short name". Please refer to
- * the documentation for that header file to get details on the hash
- * function denomination and provenance.
- *
- * Note: the functions marked with a '(64)' in the list below are
- * available only if the C compiler provides an integer type of length
- * 64 bits or more. Such a type is mandatory in the latest C standard
- * (ISO 9899:1999, aka "C99") and is present in several older compilers
- * as well, so chances are that such a type is available.
- *
- * - HAVAL family: file <code>sph_haval.h</code>
- *   - HAVAL-128/3 (128-bit, 3 passes): short name: <code>haval128_3</code>
- *   - HAVAL-128/4 (128-bit, 4 passes): short name: <code>haval128_4</code>
- *   - HAVAL-128/5 (128-bit, 5 passes): short name: <code>haval128_5</code>
- *   - HAVAL-160/3 (160-bit, 3 passes): short name: <code>haval160_3</code>
- *   - HAVAL-160/4 (160-bit, 4 passes): short name: <code>haval160_4</code>
- *   - HAVAL-160/5 (160-bit, 5 passes): short name: <code>haval160_5</code>
- *   - HAVAL-192/3 (192-bit, 3 passes): short name: <code>haval192_3</code>
- *   - HAVAL-192/4 (192-bit, 4 passes): short name: <code>haval192_4</code>
- *   - HAVAL-192/5 (192-bit, 5 passes): short name: <code>haval192_5</code>
- *   - HAVAL-224/3 (224-bit, 3 passes): short name: <code>haval224_3</code>
- *   - HAVAL-224/4 (224-bit, 4 passes): short name: <code>haval224_4</code>
- *   - HAVAL-224/5 (224-bit, 5 passes): short name: <code>haval224_5</code>
- *   - HAVAL-256/3 (256-bit, 3 passes): short name: <code>haval256_3</code>
- *   - HAVAL-256/4 (256-bit, 4 passes): short name: <code>haval256_4</code>
- *   - HAVAL-256/5 (256-bit, 5 passes): short name: <code>haval256_5</code>
- * - MD2: file <code>sph_md2.h</code>, short name: <code>md2</code>
- * - MD4: file <code>sph_md4.h</code>, short name: <code>md4</code>
- * - MD5: file <code>sph_md5.h</code>, short name: <code>md5</code>
- * - PANAMA: file <code>sph_panama.h</code>, short name: <code>panama</code>
- * - RadioGatun family: file <code>sph_radiogatun.h</code>
- *   - RadioGatun[32]: short name: <code>radiogatun32</code>
- *   - RadioGatun[64]: short name: <code>radiogatun64</code> (64)
- * - RIPEMD family: file <code>sph_ripemd.h</code>
- *   - RIPEMD: short name: <code>ripemd</code>
- *   - RIPEMD-128: short name: <code>ripemd128</code>
- *   - RIPEMD-160: short name: <code>ripemd160</code>
- * - SHA-0: file <code>sph_sha0.h</code>, short name: <code>sha0</code>
- * - SHA-1: file <code>sph_sha1.h</code>, short name: <code>sha1</code>
- * - SHA-2 family, 32-bit hashes: file <code>sph_sha2.h</code>
- *   - SHA-224: short name: <code>sha224</code>
- *   - SHA-256: short name: <code>sha256</code>
- *   - SHA-384: short name: <code>sha384</code> (64)
- *   - SHA-512: short name: <code>sha512</code> (64)
- * - Tiger family: file <code>sph_tiger.h</code>
- *   - Tiger: short name: <code>tiger</code> (64)
- *   - Tiger2: short name: <code>tiger2</code> (64)
- * - WHIRLPOOL family: file <code>sph_whirlpool.h</code>
- *   - WHIRLPOOL-0: short name: <code>whirlpool0</code> (64)
- *   - WHIRLPOOL-1: short name: <code>whirlpool1</code> (64)
- *   - WHIRLPOOL: short name: <code>whirlpool</code> (64)
- *
- * The fourteen second-round SHA-3 candidates are also implemented;
- * when applicable, the implementations follow the "final" specifications
- * as published for the third round of the SHA-3 competition (BLAKE,
- * Groestl, JH, Keccak and Skein have been tweaked for third round).
- *
- * - BLAKE family: file <code>sph_blake.h</code>
- *   - BLAKE-224: short name: <code>blake224</code>
- *   - BLAKE-256: short name: <code>blake256</code>
- *   - BLAKE-384: short name: <code>blake384</code>
- *   - BLAKE-512: short name: <code>blake512</code>
- * - BMW (Blue Midnight Wish) family: file <code>sph_bmw.h</code>
- *   - BMW-224: short name: <code>bmw224</code>
- *   - BMW-256: short name: <code>bmw256</code>
- *   - BMW-384: short name: <code>bmw384</code> (64)
- *   - BMW-512: short name: <code>bmw512</code> (64)
- * - CubeHash family: file <code>sph_cubehash.h</code> (specified as
- *   CubeHash16/32 in the CubeHash specification)
- *   - CubeHash-224: short name: <code>cubehash224</code>
- *   - CubeHash-256: short name: <code>cubehash256</code>
- *   - CubeHash-384: short name: <code>cubehash384</code>
- *   - CubeHash-512: short name: <code>cubehash512</code>
- * - ECHO family: file <code>sph_echo.h</code>
- *   - ECHO-224: short name: <code>echo224</code>
- *   - ECHO-256: short name: <code>echo256</code>
- *   - ECHO-384: short name: <code>echo384</code>
- *   - ECHO-512: short name: <code>echo512</code>
- * - Fugue family: file <code>sph_fugue.h</code>
- *   - Fugue-224: short name: <code>fugue224</code>
- *   - Fugue-256: short name: <code>fugue256</code>
- *   - Fugue-384: short name: <code>fugue384</code>
- *   - Fugue-512: short name: <code>fugue512</code>
- * - Groestl family: file <code>sph_groestl.h</code>
- *   - Groestl-224: short name: <code>groestl224</code>
- *   - Groestl-256: short name: <code>groestl256</code>
- *   - Groestl-384: short name: <code>groestl384</code>
- *   - Groestl-512: short name: <code>groestl512</code>
- * - Hamsi family: file <code>sph_hamsi.h</code>
- *   - Hamsi-224: short name: <code>hamsi224</code>
- *   - Hamsi-256: short name: <code>hamsi256</code>
- *   - Hamsi-384: short name: <code>hamsi384</code>
- *   - Hamsi-512: short name: <code>hamsi512</code>
- * - JH family: file <code>sph_jh.h</code>
- *   - JH-224: short name: <code>jh224</code>
- *   - JH-256: short name: <code>jh256</code>
- *   - JH-384: short name: <code>jh384</code>
- *   - JH-512: short name: <code>jh512</code>
- * - Keccak family: file <code>sph_keccak.h</code>
- *   - Keccak-224: short name: <code>keccak224</code>
- *   - Keccak-256: short name: <code>keccak256</code>
- *   - Keccak-384: short name: <code>keccak384</code>
- *   - Keccak-512: short name: <code>keccak512</code>
- * - Luffa family: file <code>sph_luffa.h</code>
- *   - Luffa-224: short name: <code>luffa224</code>
- *   - Luffa-256: short name: <code>luffa256</code>
- *   - Luffa-384: short name: <code>luffa384</code>
- *   - Luffa-512: short name: <code>luffa512</code>
- * - Shabal family: file <code>sph_shabal.h</code>
- *   - Shabal-192: short name: <code>shabal192</code>
- *   - Shabal-224: short name: <code>shabal224</code>
- *   - Shabal-256: short name: <code>shabal256</code>
- *   - Shabal-384: short name: <code>shabal384</code>
- *   - Shabal-512: short name: <code>shabal512</code>
- * - SHAvite-3 family: file <code>sph_shavite.h</code>
- *   - SHAvite-224 (nominally "SHAvite-3 with 224-bit output"):
- *     short name: <code>shabal224</code>
- *   - SHAvite-256 (nominally "SHAvite-3 with 256-bit output"):
- *     short name: <code>shabal256</code>
- *   - SHAvite-384 (nominally "SHAvite-3 with 384-bit output"):
- *     short name: <code>shabal384</code>
- *   - SHAvite-512 (nominally "SHAvite-3 with 512-bit output"):
- *     short name: <code>shabal512</code>
- * - SIMD family: file <code>sph_simd.h</code>
- *   - SIMD-224: short name: <code>simd224</code>
- *   - SIMD-256: short name: <code>simd256</code>
- *   - SIMD-384: short name: <code>simd384</code>
- *   - SIMD-512: short name: <code>simd512</code>
- * - Skein family: file <code>sph_skein.h</code>
- *   - Skein-224 (nominally specified as Skein-512-224): short name:
- *     <code>skein224</code> (64)
- *   - Skein-256 (nominally specified as Skein-512-256): short name:
- *     <code>skein256</code> (64)
- *   - Skein-384 (nominally specified as Skein-512-384): short name:
- *     <code>skein384</code> (64)
- *   - Skein-512 (nominally specified as Skein-512-512): short name:
- *     <code>skein512</code> (64)
- *
- * For the second-round SHA-3 candidates, the functions are as specified
- * for round 2, i.e. with the "tweaks" that some candidates added
- * between round 1 and round 2. Also, some of the submitted packages for
- * round 2 contained errors, in the specification, reference code, or
- * both. <code>sphlib</code> implements the corrected versions.
- */
-
-/** @hideinitializer
- * Unsigned integer type whose length is at least 32 bits; on most
- * architectures, it will have a width of exactly 32 bits. Unsigned C
- * types implement arithmetics modulo a power of 2; use the
- * <code>SPH_T32()</code> macro to ensure that the value is truncated
- * to exactly 32 bits. Unless otherwise specified, all macros and
- * functions which accept <code>sph_u32</code> values assume that these
- * values fit on 32 bits, i.e. do not exceed 2^32-1, even on architectures
- * where <code>sph_u32</code> is larger than that.
- */
-typedef __arch_dependant__ sph_u32;
-
-/** @hideinitializer
- * Signed integer type corresponding to <code>sph_u32</code>; it has
- * width 32 bits or more.
- */
-typedef __arch_dependant__ sph_s32;
-
-/** @hideinitializer
- * Unsigned integer type whose length is at least 64 bits; on most
- * architectures which feature such a type, it will have a width of
- * exactly 64 bits. C99-compliant platform will have this type; it
- * is also defined when the GNU compiler (gcc) is used, and on
- * platforms where <code>unsigned long</code> is large enough. If this
- * type is not available, then some hash functions which depends on
- * a 64-bit type will not be available (most notably SHA-384, SHA-512,
- * Tiger and WHIRLPOOL).
- */
-typedef __arch_dependant__ sph_u64;
-
-/** @hideinitializer
- * Signed integer type corresponding to <code>sph_u64</code>; it has
- * width 64 bits or more.
- */
-typedef __arch_dependant__ sph_s64;
-
-/**
- * This macro expands the token <code>x</code> into a suitable
- * constant expression of type <code>sph_u32</code>. Depending on
- * how this type is defined, a suffix such as <code>UL</code> may
- * be appended to the argument.
- *
- * @param x   the token to expand into a suitable constant expression
- */
-#define SPH_C32(x)
-
-/**
- * Truncate a 32-bit value to exactly 32 bits. On most systems, this is
- * a no-op, recognized as such by the compiler.
- *
- * @param x   the value to truncate (of type <code>sph_u32</code>)
- */
-#define SPH_T32(x)
-
-/**
- * Rotate a 32-bit value by a number of bits to the left. The rotate
- * count must reside between 1 and 31. This macro assumes that its
- * first argument fits in 32 bits (no extra bit allowed on machines where
- * <code>sph_u32</code> is wider); both arguments may be evaluated
- * several times.
- *
- * @param x   the value to rotate (of type <code>sph_u32</code>)
- * @param n   the rotation count (between 1 and 31, inclusive)
- */
-#define SPH_ROTL32(x, n)
-
-/**
- * Rotate a 32-bit value by a number of bits to the left. The rotate
- * count must reside between 1 and 31. This macro assumes that its
- * first argument fits in 32 bits (no extra bit allowed on machines where
- * <code>sph_u32</code> is wider); both arguments may be evaluated
- * several times.
- *
- * @param x   the value to rotate (of type <code>sph_u32</code>)
- * @param n   the rotation count (between 1 and 31, inclusive)
- */
-#define SPH_ROTR32(x, n)
-
-/**
- * This macro is defined on systems for which a 64-bit type has been
- * detected, and is used for <code>sph_u64</code>.
- */
-#define SPH_64
-
-/**
- * This macro is defined on systems for the "native" integer size is
- * 64 bits (64-bit values fit in one register).
- */
-#define SPH_64_TRUE
-
-/**
- * This macro expands the token <code>x</code> into a suitable
- * constant expression of type <code>sph_u64</code>. Depending on
- * how this type is defined, a suffix such as <code>ULL</code> may
- * be appended to the argument. This macro is defined only if a
- * 64-bit type was detected and used for <code>sph_u64</code>.
- *
- * @param x   the token to expand into a suitable constant expression
- */
-#define SPH_C64(x)
-
-/**
- * Truncate a 64-bit value to exactly 64 bits. On most systems, this is
- * a no-op, recognized as such by the compiler. This macro is defined only
- * if a 64-bit type was detected and used for <code>sph_u64</code>.
- *
- * @param x   the value to truncate (of type <code>sph_u64</code>)
- */
-#define SPH_T64(x)
-
-/**
- * Rotate a 64-bit value by a number of bits to the left. The rotate
- * count must reside between 1 and 63. This macro assumes that its
- * first argument fits in 64 bits (no extra bit allowed on machines where
- * <code>sph_u64</code> is wider); both arguments may be evaluated
- * several times. This macro is defined only if a 64-bit type was detected
- * and used for <code>sph_u64</code>.
- *
- * @param x   the value to rotate (of type <code>sph_u64</code>)
- * @param n   the rotation count (between 1 and 63, inclusive)
- */
-#define SPH_ROTL64(x, n)
-
-/**
- * Rotate a 64-bit value by a number of bits to the left. The rotate
- * count must reside between 1 and 63. This macro assumes that its
- * first argument fits in 64 bits (no extra bit allowed on machines where
- * <code>sph_u64</code> is wider); both arguments may be evaluated
- * several times. This macro is defined only if a 64-bit type was detected
- * and used for <code>sph_u64</code>.
- *
- * @param x   the value to rotate (of type <code>sph_u64</code>)
- * @param n   the rotation count (between 1 and 63, inclusive)
- */
-#define SPH_ROTR64(x, n)
-
-/**
- * This macro evaluates to <code>inline</code> or an equivalent construction,
- * if available on the compilation platform, or to nothing otherwise. This
- * is used to declare inline functions, for which the compiler should
- * endeavour to include the code directly in the caller. Inline functions
- * are typically defined in header files as replacement for macros.
- */
-#define SPH_INLINE
-
-/**
- * This macro is defined if the platform has been detected as using
- * little-endian convention. This implies that the <code>sph_u32</code>
- * type (and the <code>sph_u64</code> type also, if it is defined) has
- * an exact width (i.e. exactly 32-bit, respectively 64-bit).
- */
-#define SPH_LITTLE_ENDIAN
-
-/**
- * This macro is defined if the platform has been detected as using
- * big-endian convention. This implies that the <code>sph_u32</code>
- * type (and the <code>sph_u64</code> type also, if it is defined) has
- * an exact width (i.e. exactly 32-bit, respectively 64-bit).
- */
-#define SPH_BIG_ENDIAN
-
-/**
- * This macro is defined if 32-bit words (and 64-bit words, if defined)
- * can be read from and written to memory efficiently in little-endian
- * convention. This is the case for little-endian platforms, and also
- * for the big-endian platforms which have special little-endian access
- * opcodes (e.g. Ultrasparc).
- */
-#define SPH_LITTLE_FAST
-
-/**
- * This macro is defined if 32-bit words (and 64-bit words, if defined)
- * can be read from and written to memory efficiently in big-endian
- * convention. This is the case for little-endian platforms, and also
- * for the little-endian platforms which have special big-endian access
- * opcodes.
- */
-#define SPH_BIG_FAST
-
-/**
- * On some platforms, this macro is defined to an unsigned integer type
- * into which pointer values may be cast. The resulting value can then
- * be tested for being a multiple of 2, 4 or 8, indicating an aligned
- * pointer for, respectively, 16-bit, 32-bit or 64-bit memory accesses.
- */
-#define SPH_UPTR
-
-/**
- * When defined, this macro indicates that unaligned memory accesses
- * are possible with only a minor penalty, and thus should be prefered
- * over strategies which first copy data to an aligned buffer.
- */
-#define SPH_UNALIGNED
-
-/**
- * Byte-swap a 32-bit word (i.e. <code>0x12345678</code> becomes
- * <code>0x78563412</code>). This is an inline function which resorts
- * to inline assembly on some platforms, for better performance.
- *
- * @param x   the 32-bit value to byte-swap
- * @return  the byte-swapped value
- */
-static inline sph_u32 sph_bswap32(sph_u32 x);
-
-/**
- * Byte-swap a 64-bit word. This is an inline function which resorts
- * to inline assembly on some platforms, for better performance. This
- * function is defined only if a suitable 64-bit type was found for
- * <code>sph_u64</code>
- *
- * @param x   the 64-bit value to byte-swap
- * @return  the byte-swapped value
- */
-static inline sph_u64 sph_bswap64(sph_u64 x);
-
-/**
- * Decode a 16-bit unsigned value from memory, in little-endian convention
- * (least significant byte comes first).
- *
- * @param src   the source address
- * @return  the decoded value
- */
-static inline unsigned sph_dec16le(const void *src);
-
-/**
- * Encode a 16-bit unsigned value into memory, in little-endian convention
- * (least significant byte comes first).
- *
- * @param dst   the destination buffer
- * @param val   the value to encode
- */
-static inline void sph_enc16le(void *dst, unsigned val);
-
-/**
- * Decode a 16-bit unsigned value from memory, in big-endian convention
- * (most significant byte comes first).
- *
- * @param src   the source address
- * @return  the decoded value
- */
-static inline unsigned sph_dec16be(const void *src);
-
-/**
- * Encode a 16-bit unsigned value into memory, in big-endian convention
- * (most significant byte comes first).
- *
- * @param dst   the destination buffer
- * @param val   the value to encode
- */
-static inline void sph_enc16be(void *dst, unsigned val);
-
-/**
- * Decode a 32-bit unsigned value from memory, in little-endian convention
- * (least significant byte comes first).
- *
- * @param src   the source address
- * @return  the decoded value
- */
-static inline sph_u32 sph_dec32le(const void *src);
-
-/**
- * Decode a 32-bit unsigned value from memory, in little-endian convention
- * (least significant byte comes first). This function assumes that the
- * source address is suitably aligned for a direct access, if the platform
- * supports such things; it can thus be marginally faster than the generic
- * <code>sph_dec32le()</code> function.
- *
- * @param src   the source address
- * @return  the decoded value
- */
-static inline sph_u32 sph_dec32le_aligned(const void *src);
-
-/**
- * Encode a 32-bit unsigned value into memory, in little-endian convention
- * (least significant byte comes first).
- *
- * @param dst   the destination buffer
- * @param val   the value to encode
- */
-static inline void sph_enc32le(void *dst, sph_u32 val);
-
-/**
- * Encode a 32-bit unsigned value into memory, in little-endian convention
- * (least significant byte comes first). This function assumes that the
- * destination address is suitably aligned for a direct access, if the
- * platform supports such things; it can thus be marginally faster than
- * the generic <code>sph_enc32le()</code> function.
- *
- * @param dst   the destination buffer
- * @param val   the value to encode
- */
-static inline void sph_enc32le_aligned(void *dst, sph_u32 val);
-
-/**
- * Decode a 32-bit unsigned value from memory, in big-endian convention
- * (most significant byte comes first).
- *
- * @param src   the source address
- * @return  the decoded value
- */
-static inline sph_u32 sph_dec32be(const void *src);
-
-/**
- * Decode a 32-bit unsigned value from memory, in big-endian convention
- * (most significant byte comes first). This function assumes that the
- * source address is suitably aligned for a direct access, if the platform
- * supports such things; it can thus be marginally faster than the generic
- * <code>sph_dec32be()</code> function.
- *
- * @param src   the source address
- * @return  the decoded value
- */
-static inline sph_u32 sph_dec32be_aligned(const void *src);
-
-/**
- * Encode a 32-bit unsigned value into memory, in big-endian convention
- * (most significant byte comes first).
- *
- * @param dst   the destination buffer
- * @param val   the value to encode
- */
-static inline void sph_enc32be(void *dst, sph_u32 val);
-
-/**
- * Encode a 32-bit unsigned value into memory, in big-endian convention
- * (most significant byte comes first). This function assumes that the
- * destination address is suitably aligned for a direct access, if the
- * platform supports such things; it can thus be marginally faster than
- * the generic <code>sph_enc32be()</code> function.
- *
- * @param dst   the destination buffer
- * @param val   the value to encode
- */
-static inline void sph_enc32be_aligned(void *dst, sph_u32 val);
-
-/**
- * Decode a 64-bit unsigned value from memory, in little-endian convention
- * (least significant byte comes first). This function is defined only
- * if a suitable 64-bit type was detected and used for <code>sph_u64</code>.
- *
- * @param src   the source address
- * @return  the decoded value
- */
-static inline sph_u64 sph_dec64le(const void *src);
-
-/**
- * Decode a 64-bit unsigned value from memory, in little-endian convention
- * (least significant byte comes first). This function assumes that the
- * source address is suitably aligned for a direct access, if the platform
- * supports such things; it can thus be marginally faster than the generic
- * <code>sph_dec64le()</code> function. This function is defined only
- * if a suitable 64-bit type was detected and used for <code>sph_u64</code>.
- *
- * @param src   the source address
- * @return  the decoded value
- */
-static inline sph_u64 sph_dec64le_aligned(const void *src);
-
-/**
- * Encode a 64-bit unsigned value into memory, in little-endian convention
- * (least significant byte comes first). This function is defined only
- * if a suitable 64-bit type was detected and used for <code>sph_u64</code>.
- *
- * @param dst   the destination buffer
- * @param val   the value to encode
- */
-static inline void sph_enc64le(void *dst, sph_u64 val);
-
-/**
- * Encode a 64-bit unsigned value into memory, in little-endian convention
- * (least significant byte comes first). This function assumes that the
- * destination address is suitably aligned for a direct access, if the
- * platform supports such things; it can thus be marginally faster than
- * the generic <code>sph_enc64le()</code> function. This function is defined
- * only if a suitable 64-bit type was detected and used for
- * <code>sph_u64</code>.
- *
- * @param dst   the destination buffer
- * @param val   the value to encode
- */
-static inline void sph_enc64le_aligned(void *dst, sph_u64 val);
-
-/**
- * Decode a 64-bit unsigned value from memory, in big-endian convention
- * (most significant byte comes first). This function is defined only
- * if a suitable 64-bit type was detected and used for <code>sph_u64</code>.
- *
- * @param src   the source address
- * @return  the decoded value
- */
-static inline sph_u64 sph_dec64be(const void *src);
-
-/**
- * Decode a 64-bit unsigned value from memory, in big-endian convention
- * (most significant byte comes first). This function assumes that the
- * source address is suitably aligned for a direct access, if the platform
- * supports such things; it can thus be marginally faster than the generic
- * <code>sph_dec64be()</code> function. This function is defined only
- * if a suitable 64-bit type was detected and used for <code>sph_u64</code>.
- *
- * @param src   the source address
- * @return  the decoded value
- */
-static inline sph_u64 sph_dec64be_aligned(const void *src);
-
-/**
- * Encode a 64-bit unsigned value into memory, in big-endian convention
- * (most significant byte comes first). This function is defined only
- * if a suitable 64-bit type was detected and used for <code>sph_u64</code>.
- *
- * @param dst   the destination buffer
- * @param val   the value to encode
- */
-static inline void sph_enc64be(void *dst, sph_u64 val);
-
-/**
- * Encode a 64-bit unsigned value into memory, in big-endian convention
- * (most significant byte comes first). This function assumes that the
- * destination address is suitably aligned for a direct access, if the
- * platform supports such things; it can thus be marginally faster than
- * the generic <code>sph_enc64be()</code> function. This function is defined
- * only if a suitable 64-bit type was detected and used for
- * <code>sph_u64</code>.
- *
- * @param dst   the destination buffer
- * @param val   the value to encode
- */
-static inline void sph_enc64be_aligned(void *dst, sph_u64 val);
-
-#endif
-
-/* ============== END documentation block for Doxygen ============= */
-
-#ifndef DOXYGEN_IGNORE
-
-/*
- * We want to define the types "sph_u32" and "sph_u64" which hold
- * unsigned values of at least, respectively, 32 and 64 bits. These
- * tests should select appropriate types for most platforms. The
- * macro "SPH_64" is defined if the 64-bit is supported.
- */
-
-#undef SPH_64
-#undef SPH_64_TRUE
-
-#if defined __STDC__ && __STDC_VERSION__ >= 199901L
-
-/*
- * On C99 implementations, we can use <stdint.h> to get an exact 64-bit
- * type, if any, or otherwise use a wider type (which must exist, for
- * C99 conformance).
- */
-
-#include <stdint.h>
-
-#ifdef UINT32_MAX
-typedef uint32_t sph_u32;
-typedef int32_t sph_s32;
-#else
-typedef uint_fast32_t sph_u32;
-typedef int_fast32_t sph_s32;
-#endif
-#if !SPH_NO_64
-#ifdef UINT64_MAX
-typedef uint64_t sph_u64;
-typedef int64_t sph_s64;
-#else
-typedef uint_fast64_t sph_u64;
-typedef int_fast64_t sph_s64;
-#endif
-#endif
-
-#define SPH_C32(x)    ((sph_u32)(x))
-#if !SPH_NO_64
-#define SPH_C64(x)    ((sph_u64)(x))
-#define SPH_64  1
-#endif
-
-#else
-
-/*
- * On non-C99 systems, we use "unsigned int" if it is wide enough,
- * "unsigned long" otherwise. This supports all "reasonable" architectures.
- * We have to be cautious: pre-C99 preprocessors handle constants
- * differently in '#if' expressions. Hence the shifts to test UINT_MAX.
- */
-
-#if ((UINT_MAX >> 11) >> 11) >= 0x3FF
-
-typedef unsigned int sph_u32;
-typedef int sph_s32;
-
-#define SPH_C32(x)    ((sph_u32)(x ## U))
-
-#else
-
-typedef unsigned long sph_u32;
-typedef long sph_s32;
-
-#define SPH_C32(x)    ((sph_u32)(x ## UL))
-
-#endif
-
-#if !SPH_NO_64
-
-/*
- * We want a 64-bit type. We use "unsigned long" if it is wide enough (as
- * is common on 64-bit architectures such as AMD64, Alpha or Sparcv9),
- * "unsigned long long" otherwise, if available. We use ULLONG_MAX to
- * test whether "unsigned long long" is available; we also know that
- * gcc features this type, even if the libc header do not know it.
- */
-
-#if ((ULONG_MAX >> 31) >> 31) >= 3
-
-typedef unsigned long sph_u64;
-typedef long sph_s64;
-
-#define SPH_C64(x)    ((sph_u64)(x ## UL))
-
-#define SPH_64  1
-
-#elif ((ULLONG_MAX >> 31) >> 31) >= 3 || defined __GNUC__
-
-typedef unsigned long long sph_u64;
-typedef long long sph_s64;
-
-#define SPH_C64(x)    ((sph_u64)(x ## ULL))
-
-#define SPH_64  1
-
-#else
-
-/*
- * No 64-bit type...
- */
-
-#endif
-
-#endif
-
-#endif
-
-/*
- * If the "unsigned long" type has length 64 bits or more, then this is
- * a "true" 64-bit architectures. This is also true with Visual C on
- * amd64, even though the "long" type is limited to 32 bits.
- */
-#if SPH_64 && (((ULONG_MAX >> 31) >> 31) >= 3 || defined _M_X64)
-#define SPH_64_TRUE   1
-#endif
-
-/*
- * Implementation note: some processors have specific opcodes to perform
- * a rotation. Recent versions of gcc recognize the expression above and
- * use the relevant opcodes, when appropriate.
- */
-
-#define SPH_T32(x)    ((x) & SPH_C32(0xFFFFFFFF))
-#define SPH_ROTL32(x, n)   SPH_T32(((x) << (n)) | ((x) >> (32 - (n))))
-#define SPH_ROTR32(x, n)   SPH_ROTL32(x, (32 - (n)))
-
-#if SPH_64
-
-#define SPH_T64(x)    ((x) & SPH_C64(0xFFFFFFFFFFFFFFFF))
-#define SPH_ROTL64(x, n)   SPH_T64(((x) << (n)) | ((x) >> (64 - (n))))
-#define SPH_ROTR64(x, n)   SPH_ROTL64(x, (64 - (n)))
-
-#endif
-
-#ifndef DOXYGEN_IGNORE
-/*
- * Define SPH_INLINE to be an "inline" qualifier, if available. We define
- * some small macro-like functions which benefit greatly from being inlined.
- */
-#if (defined __STDC__ && __STDC_VERSION__ >= 199901L) || defined __GNUC__
-#define SPH_INLINE inline
-#elif defined _MSC_VER
-#define SPH_INLINE __inline
-#else
-#define SPH_INLINE
-#endif
-#endif
-
-/*
- * We define some macros which qualify the architecture. These macros
- * may be explicit set externally (e.g. as compiler parameters). The
- * code below sets those macros if they are not already defined.
- *
- * Most macros are boolean, thus evaluate to either zero or non-zero.
- * The SPH_UPTR macro is special, in that it evaluates to a C type,
- * or is not defined.
- *
- * SPH_UPTR             if defined: unsigned type to cast pointers into
- *
- * SPH_UNALIGNED        non-zero if unaligned accesses are efficient
- * SPH_LITTLE_ENDIAN    non-zero if architecture is known to be little-endian
- * SPH_BIG_ENDIAN       non-zero if architecture is known to be big-endian
- * SPH_LITTLE_FAST      non-zero if little-endian decoding is fast
- * SPH_BIG_FAST         non-zero if big-endian decoding is fast
- *
- * If SPH_UPTR is defined, then encoding and decoding of 32-bit and 64-bit
- * values will try to be "smart". Either SPH_LITTLE_ENDIAN or SPH_BIG_ENDIAN
- * _must_ be non-zero in those situations. The 32-bit and 64-bit types
- * _must_ also have an exact width.
- *
- * SPH_SPARCV9_GCC_32   UltraSPARC-compatible with gcc, 32-bit mode
- * SPH_SPARCV9_GCC_64   UltraSPARC-compatible with gcc, 64-bit mode
- * SPH_SPARCV9_GCC      UltraSPARC-compatible with gcc
- * SPH_I386_GCC         x86-compatible (32-bit) with gcc
- * SPH_I386_MSVC        x86-compatible (32-bit) with Microsoft Visual C
- * SPH_AMD64_GCC        x86-compatible (64-bit) with gcc
- * SPH_AMD64_MSVC       x86-compatible (64-bit) with Microsoft Visual C
- * SPH_PPC32_GCC        PowerPC, 32-bit, with gcc
- * SPH_PPC64_GCC        PowerPC, 64-bit, with gcc
- *
- * TODO: enhance automatic detection, for more architectures and compilers.
- * Endianness is the most important. SPH_UNALIGNED and SPH_UPTR help with
- * some very fast functions (e.g. MD4) when using unaligned input data.
- * The CPU-specific-with-GCC macros are useful only for inline assembly,
- * normally restrained to this header file.
- */
-
-/*
- * 32-bit x86, aka "i386 compatible".
- */
-#if defined __i386__ || defined _M_IX86
-
-#define SPH_DETECT_UNALIGNED         1
-#define SPH_DETECT_LITTLE_ENDIAN     1
-#define SPH_DETECT_UPTR              sph_u32
-#ifdef __GNUC__
-#define SPH_DETECT_I386_GCC          1
-#endif
-#ifdef _MSC_VER
-#define SPH_DETECT_I386_MSVC         1
-#endif
-
-/*
- * 64-bit x86, hereafter known as "amd64".
- */
-#elif defined __x86_64 || defined _M_X64
-
-#define SPH_DETECT_UNALIGNED         1
-#define SPH_DETECT_LITTLE_ENDIAN     1
-#define SPH_DETECT_UPTR              sph_u64
-#ifdef __GNUC__
-#define SPH_DETECT_AMD64_GCC         1
-#endif
-#ifdef _MSC_VER
-#define SPH_DETECT_AMD64_MSVC        1
-#endif
-
-/*
- * 64-bit Sparc architecture (implies v9).
- */
-#elif ((defined __sparc__ || defined __sparc) && defined __arch64__) \
-    || defined __sparcv9
-
-#define SPH_DETECT_BIG_ENDIAN        1
-#define SPH_DETECT_UPTR              sph_u64
-#ifdef __GNUC__
-#define SPH_DETECT_SPARCV9_GCC_64    1
-#define SPH_DETECT_LITTLE_FAST       1
-#endif
-
-/*
- * 32-bit Sparc.
- */
-#elif (defined __sparc__ || defined __sparc) \
-    && !(defined __sparcv9 || defined __arch64__)
-
-#define SPH_DETECT_BIG_ENDIAN        1
-#define SPH_DETECT_UPTR              sph_u32
-#if defined __GNUC__ && defined __sparc_v9__
-#define SPH_DETECT_SPARCV9_GCC_32    1
-#define SPH_DETECT_LITTLE_FAST       1
-#endif
-
-/*
- * ARM, little-endian.
- */
-#elif defined __arm__ && __ARMEL__
-
-#define SPH_DETECT_LITTLE_ENDIAN     1
-
-/*
- * MIPS, little-endian.
- */
-#elif MIPSEL || _MIPSEL || __MIPSEL || __MIPSEL__
-
-#define SPH_DETECT_LITTLE_ENDIAN     1
-
-/*
- * MIPS, big-endian.
- */
-#elif MIPSEB || _MIPSEB || __MIPSEB || __MIPSEB__
-
-#define SPH_DETECT_BIG_ENDIAN        1
-
-/*
- * PowerPC.
- */
-#elif defined __powerpc__ || defined __POWERPC__ || defined __ppc__ \
-    || defined _ARCH_PPC
-
-/*
- * Note: we do not declare cross-endian access to be "fast": even if
- * using inline assembly, implementation should still assume that
- * keeping the decoded word in a temporary is faster than decoding
- * it again.
- */
-#if defined __GNUC__
-#if SPH_64_TRUE
-#define SPH_DETECT_PPC64_GCC         1
-#else
-#define SPH_DETECT_PPC32_GCC         1
-#endif
-#endif
-
-#if defined __BIG_ENDIAN__ || defined _BIG_ENDIAN
-#define SPH_DETECT_BIG_ENDIAN        1
-#elif defined __LITTLE_ENDIAN__ || defined _LITTLE_ENDIAN
-#define SPH_DETECT_LITTLE_ENDIAN     1
-#endif
-
-/*
- * Itanium, 64-bit.
- */
-#elif defined __ia64 || defined __ia64__ \
-    || defined __itanium__ || defined _M_IA64
-
-#if defined __BIG_ENDIAN__ || defined _BIG_ENDIAN
-#define SPH_DETECT_BIG_ENDIAN        1
-#else
-#define SPH_DETECT_LITTLE_ENDIAN     1
-#endif
-#if defined __LP64__ || defined _LP64
-#define SPH_DETECT_UPTR              sph_u64
-#else
-#define SPH_DETECT_UPTR              sph_u32
-#endif
-
-#endif
-
-#if defined SPH_DETECT_SPARCV9_GCC_32 || defined SPH_DETECT_SPARCV9_GCC_64
-#define SPH_DETECT_SPARCV9_GCC       1
-#endif
-
-#if defined SPH_DETECT_UNALIGNED && !defined SPH_UNALIGNED
-#define SPH_UNALIGNED         SPH_DETECT_UNALIGNED
-#endif
-#if defined SPH_DETECT_UPTR && !defined SPH_UPTR
-#define SPH_UPTR              SPH_DETECT_UPTR
-#endif
-#if defined SPH_DETECT_LITTLE_ENDIAN && !defined SPH_LITTLE_ENDIAN
-#define SPH_LITTLE_ENDIAN     SPH_DETECT_LITTLE_ENDIAN
-#endif
-#if defined SPH_DETECT_BIG_ENDIAN && !defined SPH_BIG_ENDIAN
-#define SPH_BIG_ENDIAN        SPH_DETECT_BIG_ENDIAN
-#endif
-#if defined SPH_DETECT_LITTLE_FAST && !defined SPH_LITTLE_FAST
-#define SPH_LITTLE_FAST       SPH_DETECT_LITTLE_FAST
-#endif
-#if defined SPH_DETECT_BIG_FAST && !defined SPH_BIG_FAST
-#define SPH_BIG_FAST    SPH_DETECT_BIG_FAST
-#endif
-#if defined SPH_DETECT_SPARCV9_GCC_32 && !defined SPH_SPARCV9_GCC_32
-#define SPH_SPARCV9_GCC_32    SPH_DETECT_SPARCV9_GCC_32
-#endif
-#if defined SPH_DETECT_SPARCV9_GCC_64 && !defined SPH_SPARCV9_GCC_64
-#define SPH_SPARCV9_GCC_64    SPH_DETECT_SPARCV9_GCC_64
-#endif
-#if defined SPH_DETECT_SPARCV9_GCC && !defined SPH_SPARCV9_GCC
-#define SPH_SPARCV9_GCC       SPH_DETECT_SPARCV9_GCC
-#endif
-#if defined SPH_DETECT_I386_GCC && !defined SPH_I386_GCC
-#define SPH_I386_GCC          SPH_DETECT_I386_GCC
-#endif
-#if defined SPH_DETECT_I386_MSVC && !defined SPH_I386_MSVC
-#define SPH_I386_MSVC         SPH_DETECT_I386_MSVC
-#endif
-#if defined SPH_DETECT_AMD64_GCC && !defined SPH_AMD64_GCC
-#define SPH_AMD64_GCC         SPH_DETECT_AMD64_GCC
-#endif
-#if defined SPH_DETECT_AMD64_MSVC && !defined SPH_AMD64_MSVC
-#define SPH_AMD64_MSVC        SPH_DETECT_AMD64_MSVC
-#endif
-#if defined SPH_DETECT_PPC32_GCC && !defined SPH_PPC32_GCC
-#define SPH_PPC32_GCC         SPH_DETECT_PPC32_GCC
-#endif
-#if defined SPH_DETECT_PPC64_GCC && !defined SPH_PPC64_GCC
-#define SPH_PPC64_GCC         SPH_DETECT_PPC64_GCC
-#endif
-
-#if SPH_LITTLE_ENDIAN && !defined SPH_LITTLE_FAST
-#define SPH_LITTLE_FAST              1
-#endif
-#if SPH_BIG_ENDIAN && !defined SPH_BIG_FAST
-#define SPH_BIG_FAST                 1
-#endif
-
-#if defined SPH_UPTR && !(SPH_LITTLE_ENDIAN || SPH_BIG_ENDIAN)
-#error SPH_UPTR defined, but endianness is not known.
-#endif
-
-#if SPH_I386_GCC && !SPH_NO_ASM
-
-/*
- * On x86 32-bit, with gcc, we use the bswapl opcode to byte-swap 32-bit
- * values.
- */
-
-static SPH_INLINE sph_u32
-sph_bswap32(sph_u32 x)
-{
-    __asm__ __volatile__ ("bswapl %0" : "=r" (x) : "0" (x));
-    return x;
-}
-
-#if SPH_64
-
-static SPH_INLINE sph_u64
-sph_bswap64(sph_u64 x)
-{
-    return ((sph_u64)sph_bswap32((sph_u32)x) << 32)
-        | (sph_u64)sph_bswap32((sph_u32)(x >> 32));
-}
-
-#endif
-
-#elif SPH_AMD64_GCC && !SPH_NO_ASM
-
-/*
- * On x86 64-bit, with gcc, we use the bswapl opcode to byte-swap 32-bit
- * and 64-bit values.
- */
-
-static SPH_INLINE sph_u32
-sph_bswap32(sph_u32 x)
-{
-    __asm__ __volatile__ ("bswapl %0" : "=r" (x) : "0" (x));
-    return x;
-}
-
-#if SPH_64
-
-static SPH_INLINE sph_u64
-sph_bswap64(sph_u64 x)
-{
-    __asm__ __volatile__ ("bswapq %0" : "=r" (x) : "0" (x));
-    return x;
-}
-
-#endif
-
-/*
- * Disabled code. Apparently, Microsoft Visual C 2005 is smart enough
- * to generate proper opcodes for endianness swapping with the pure C
- * implementation below.
- *
-
-#elif SPH_I386_MSVC && !SPH_NO_ASM
-
-static __inline sph_u32 __declspec(naked) __fastcall
-sph_bswap32(sph_u32 x)
-{
-    __asm {
-        bswap  ecx
-        mov    eax,ecx
-        ret
-    }
-}
-
-#if SPH_64
-
-static SPH_INLINE sph_u64
-sph_bswap64(sph_u64 x)
-{
-    return ((sph_u64)sph_bswap32((sph_u32)x) << 32)
-        | (sph_u64)sph_bswap32((sph_u32)(x >> 32));
-}
-
-#endif
-
- *
- * [end of disabled code]
- */
-
-#else
-
-static SPH_INLINE sph_u32
-sph_bswap32(sph_u32 x)
-{
-    x = SPH_T32((x << 16) | (x >> 16));
-    x = ((x & SPH_C32(0xFF00FF00)) >> 8)
-        | ((x & SPH_C32(0x00FF00FF)) << 8);
-    return x;
-}
-
-#if SPH_64
-
-/**
- * Byte-swap a 64-bit value.
- *
- * @param x   the input value
- * @return  the byte-swapped value
- */
-static SPH_INLINE sph_u64
-sph_bswap64(sph_u64 x)
-{
-    x = SPH_T64((x << 32) | (x >> 32));
-    x = ((x & SPH_C64(0xFFFF0000FFFF0000)) >> 16)
-        | ((x & SPH_C64(0x0000FFFF0000FFFF)) << 16);
-    x = ((x & SPH_C64(0xFF00FF00FF00FF00)) >> 8)
-        | ((x & SPH_C64(0x00FF00FF00FF00FF)) << 8);
-    return x;
-}
-
-#endif
-
-#endif
-
-#if SPH_SPARCV9_GCC && !SPH_NO_ASM
-
-/*
- * On UltraSPARC systems, native ordering is big-endian, but it is
- * possible to perform little-endian read accesses by specifying the
- * address space 0x88 (ASI_PRIMARY_LITTLE). Basically, either we use
- * the opcode "lda [%reg]0x88,%dst", where %reg is the register which
- * contains the source address and %dst is the destination register,
- * or we use "lda [%reg+imm]%asi,%dst", which uses the %asi register
- * to get the address space name. The latter format is better since it
- * combines an addition and the actual access in a single opcode; but
- * it requires the setting (and subsequent resetting) of %asi, which is
- * slow. Some operations (i.e. MD5 compression function) combine many
- * successive little-endian read accesses, which may share the same
- * %asi setting. The macros below contain the appropriate inline
- * assembly.
- */
-
-#define SPH_SPARCV9_SET_ASI   \
-    sph_u32 sph_sparcv9_asi; \
-    __asm__ __volatile__ ( \
-        "rd %%asi,%0\n\twr %%g0,0x88,%%asi" : "=r" (sph_sparcv9_asi));
-
-#define SPH_SPARCV9_RESET_ASI  \
-    __asm__ __volatile__ ("wr %%g0,%0,%%asi" : : "r" (sph_sparcv9_asi));
-
-#define SPH_SPARCV9_DEC32LE(base, idx)   ({ \
-        sph_u32 sph_sparcv9_tmp; \
-        __asm__ __volatile__ ("lda [%1+" #idx "*4]%%asi,%0" \
-            : "=r" (sph_sparcv9_tmp) : "r" (base)); \
-        sph_sparcv9_tmp; \
-    })
-
-#endif
-
-static SPH_INLINE void
-sph_enc16be(void *dst, unsigned val)
-{
-    ((unsigned char *)dst)[0] = (val >> 8);
-    ((unsigned char *)dst)[1] = val;
-}
-
-static SPH_INLINE unsigned
-sph_dec16be(const void *src)
-{
-    return ((unsigned)(((const unsigned char *)src)[0]) << 8)
-        | (unsigned)(((const unsigned char *)src)[1]);
-}
-
-static SPH_INLINE void
-sph_enc16le(void *dst, unsigned val)
-{
-    ((unsigned char *)dst)[0] = val;
-    ((unsigned char *)dst)[1] = val >> 8;
-}
-
-static SPH_INLINE unsigned
-sph_dec16le(const void *src)
-{
-    return (unsigned)(((const unsigned char *)src)[0])
-        | ((unsigned)(((const unsigned char *)src)[1]) << 8);
-}
-
-/**
- * Encode a 32-bit value into the provided buffer (big endian convention).
- *
- * @param dst   the destination buffer
- * @param val   the 32-bit value to encode
- */
-static SPH_INLINE void
-sph_enc32be(void *dst, sph_u32 val)
-{
-#if defined SPH_UPTR
-#if SPH_UNALIGNED
-#if SPH_LITTLE_ENDIAN
-    val = sph_bswap32(val);
-#endif
-    *(sph_u32 *)dst = val;
-#else
-    if (((SPH_UPTR)dst & 3) == 0) {
-#if SPH_LITTLE_ENDIAN
-        val = sph_bswap32(val);
-#endif
-        *(sph_u32 *)dst = val;
-    } else {
-        ((unsigned char *)dst)[0] = (val >> 24);
-        ((unsigned char *)dst)[1] = (val >> 16);
-        ((unsigned char *)dst)[2] = (val >> 8);
-        ((unsigned char *)dst)[3] = val;
-    }
-#endif
-#else
-    ((unsigned char *)dst)[0] = (val >> 24);
-    ((unsigned char *)dst)[1] = (val >> 16);
-    ((unsigned char *)dst)[2] = (val >> 8);
-    ((unsigned char *)dst)[3] = val;
-#endif
-}
-
-/**
- * Encode a 32-bit value into the provided buffer (big endian convention).
- * The destination buffer must be properly aligned.
- *
- * @param dst   the destination buffer (32-bit aligned)
- * @param val   the value to encode
- */
-static SPH_INLINE void
-sph_enc32be_aligned(void *dst, sph_u32 val)
-{
-#if SPH_LITTLE_ENDIAN
-    *(sph_u32 *)dst = sph_bswap32(val);
-#elif SPH_BIG_ENDIAN
-    *(sph_u32 *)dst = val;
-#else
-    ((unsigned char *)dst)[0] = (val >> 24);
-    ((unsigned char *)dst)[1] = (val >> 16);
-    ((unsigned char *)dst)[2] = (val >> 8);
-    ((unsigned char *)dst)[3] = val;
-#endif
-}
-
-/**
- * Decode a 32-bit value from the provided buffer (big endian convention).
- *
- * @param src   the source buffer
- * @return  the decoded value
- */
-static SPH_INLINE sph_u32
-sph_dec32be(const void *src)
-{
-#if defined SPH_UPTR
-#if SPH_UNALIGNED
-#if SPH_LITTLE_ENDIAN
-    return sph_bswap32(*(const sph_u32 *)src);
-#else
-    return *(const sph_u32 *)src;
-#endif
-#else
-    if (((SPH_UPTR)src & 3) == 0) {
-#if SPH_LITTLE_ENDIAN
-        return sph_bswap32(*(const sph_u32 *)src);
-#else
-        return *(const sph_u32 *)src;
-#endif
-    } else {
-        return ((sph_u32)(((const unsigned char *)src)[0]) << 24)
-            | ((sph_u32)(((const unsigned char *)src)[1]) << 16)
-            | ((sph_u32)(((const unsigned char *)src)[2]) << 8)
-            | (sph_u32)(((const unsigned char *)src)[3]);
-    }
-#endif
-#else
-    return ((sph_u32)(((const unsigned char *)src)[0]) << 24)
-        | ((sph_u32)(((const unsigned char *)src)[1]) << 16)
-        | ((sph_u32)(((const unsigned char *)src)[2]) << 8)
-        | (sph_u32)(((const unsigned char *)src)[3]);
-#endif
-}
-
-/**
- * Decode a 32-bit value from the provided buffer (big endian convention).
- * The source buffer must be properly aligned.
- *
- * @param src   the source buffer (32-bit aligned)
- * @return  the decoded value
- */
-static SPH_INLINE sph_u32
-sph_dec32be_aligned(const void *src)
-{
-#if SPH_LITTLE_ENDIAN
-    return sph_bswap32(*(const sph_u32 *)src);
-#elif SPH_BIG_ENDIAN
-    return *(const sph_u32 *)src;
-#else
-    return ((sph_u32)(((const unsigned char *)src)[0]) << 24)
-        | ((sph_u32)(((const unsigned char *)src)[1]) << 16)
-        | ((sph_u32)(((const unsigned char *)src)[2]) << 8)
-        | (sph_u32)(((const unsigned char *)src)[3]);
-#endif
-}
-
-/**
- * Encode a 32-bit value into the provided buffer (little endian convention).
- *
- * @param dst   the destination buffer
- * @param val   the 32-bit value to encode
- */
-static SPH_INLINE void
-sph_enc32le(void *dst, sph_u32 val)
-{
-#if defined SPH_UPTR
-#if SPH_UNALIGNED
-#if SPH_BIG_ENDIAN
-    val = sph_bswap32(val);
-#endif
-    *(sph_u32 *)dst = val;
-#else
-    if (((SPH_UPTR)dst & 3) == 0) {
-#if SPH_BIG_ENDIAN
-        val = sph_bswap32(val);
-#endif
-        *(sph_u32 *)dst = val;
-    } else {
-        ((unsigned char *)dst)[0] = val;
-        ((unsigned char *)dst)[1] = (val >> 8);
-        ((unsigned char *)dst)[2] = (val >> 16);
-        ((unsigned char *)dst)[3] = (val >> 24);
-    }
-#endif
-#else
-    ((unsigned char *)dst)[0] = val;
-    ((unsigned char *)dst)[1] = (val >> 8);
-    ((unsigned char *)dst)[2] = (val >> 16);
-    ((unsigned char *)dst)[3] = (val >> 24);
-#endif
-}
-
-/**
- * Encode a 32-bit value into the provided buffer (little endian convention).
- * The destination buffer must be properly aligned.
- *
- * @param dst   the destination buffer (32-bit aligned)
- * @param val   the value to encode
- */
-static SPH_INLINE void
-sph_enc32le_aligned(void *dst, sph_u32 val)
-{
-#if SPH_LITTLE_ENDIAN
-    *(sph_u32 *)dst = val;
-#elif SPH_BIG_ENDIAN
-    *(sph_u32 *)dst = sph_bswap32(val);
-#else
-    ((unsigned char *)dst)[0] = val;
-    ((unsigned char *)dst)[1] = (val >> 8);
-    ((unsigned char *)dst)[2] = (val >> 16);
-    ((unsigned char *)dst)[3] = (val >> 24);
-#endif
-}
-
-/**
- * Decode a 32-bit value from the provided buffer (little endian convention).
- *
- * @param src   the source buffer
- * @return  the decoded value
- */
-static SPH_INLINE sph_u32
-sph_dec32le(const void *src)
-{
-#if defined SPH_UPTR
-#if SPH_UNALIGNED
-#if SPH_BIG_ENDIAN
-    return sph_bswap32(*(const sph_u32 *)src);
-#else
-    return *(const sph_u32 *)src;
-#endif
-#else
-    if (((SPH_UPTR)src & 3) == 0) {
-#if SPH_BIG_ENDIAN
-#if SPH_SPARCV9_GCC && !SPH_NO_ASM
-        sph_u32 tmp;
-
-        /*
-         * "__volatile__" is needed here because without it,
-         * gcc-3.4.3 miscompiles the code and performs the
-         * access before the test on the address, thus triggering
-         * a bus error...
-         */
-        __asm__ __volatile__ (
-            "lda [%1]0x88,%0" : "=r" (tmp) : "r" (src));
-        return tmp;
-/*
- * On PowerPC, this turns out not to be worth the effort: the inline
- * assembly makes GCC optimizer uncomfortable, which tends to nullify
- * the decoding gains.
- *
- * For most hash functions, using this inline assembly trick changes
- * hashing speed by less than 5% and often _reduces_ it. The biggest
- * gains are for MD4 (+11%) and CubeHash (+30%). For all others, it is
- * less then 10%. The speed gain on CubeHash is probably due to the
- * chronic shortage of registers that CubeHash endures; for the other
- * functions, the generic code appears to be efficient enough already.
- *
-#elif (SPH_PPC32_GCC || SPH_PPC64_GCC) && !SPH_NO_ASM
-        sph_u32 tmp;
-
-        __asm__ __volatile__ (
-            "lwbrx %0,0,%1" : "=r" (tmp) : "r" (src));
-        return tmp;
- */
-#else
-        return sph_bswap32(*(const sph_u32 *)src);
-#endif
-#else
-        return *(const sph_u32 *)src;
-#endif
-    } else {
-        return (sph_u32)(((const unsigned char *)src)[0])
-            | ((sph_u32)(((const unsigned char *)src)[1]) << 8)
-            | ((sph_u32)(((const unsigned char *)src)[2]) << 16)
-            | ((sph_u32)(((const unsigned char *)src)[3]) << 24);
-    }
-#endif
-#else
-    return (sph_u32)(((const unsigned char *)src)[0])
-        | ((sph_u32)(((const unsigned char *)src)[1]) << 8)
-        | ((sph_u32)(((const unsigned char *)src)[2]) << 16)
-        | ((sph_u32)(((const unsigned char *)src)[3]) << 24);
-#endif
-}
-
-/**
- * Decode a 32-bit value from the provided buffer (little endian convention).
- * The source buffer must be properly aligned.
- *
- * @param src   the source buffer (32-bit aligned)
- * @return  the decoded value
- */
-static SPH_INLINE sph_u32
-sph_dec32le_aligned(const void *src)
-{
-#if SPH_LITTLE_ENDIAN
-    return *(const sph_u32 *)src;
-#elif SPH_BIG_ENDIAN
-#if SPH_SPARCV9_GCC && !SPH_NO_ASM
-    sph_u32 tmp;
-
-    __asm__ __volatile__ ("lda [%1]0x88,%0" : "=r" (tmp) : "r" (src));
-    return tmp;
-/*
- * Not worth it generally.
- *
-#elif (SPH_PPC32_GCC || SPH_PPC64_GCC) && !SPH_NO_ASM
-    sph_u32 tmp;
-
-    __asm__ __volatile__ ("lwbrx %0,0,%1" : "=r" (tmp) : "r" (src));
-    return tmp;
- */
-#else
-    return sph_bswap32(*(const sph_u32 *)src);
-#endif
-#else
-    return (sph_u32)(((const unsigned char *)src)[0])
-        | ((sph_u32)(((const unsigned char *)src)[1]) << 8)
-        | ((sph_u32)(((const unsigned char *)src)[2]) << 16)
-        | ((sph_u32)(((const unsigned char *)src)[3]) << 24);
-#endif
-}
-
-#if SPH_64
-
-/**
- * Encode a 64-bit value into the provided buffer (big endian convention).
- *
- * @param dst   the destination buffer
- * @param val   the 64-bit value to encode
- */
-static SPH_INLINE void
-sph_enc64be(void *dst, sph_u64 val)
-{
-#if defined SPH_UPTR
-#if SPH_UNALIGNED
-#if SPH_LITTLE_ENDIAN
-    val = sph_bswap64(val);
-#endif
-    *(sph_u64 *)dst = val;
-#else
-    if (((SPH_UPTR)dst & 7) == 0) {
-#if SPH_LITTLE_ENDIAN
-        val = sph_bswap64(val);
-#endif
-        *(sph_u64 *)dst = val;
-    } else {
-        ((unsigned char *)dst)[0] = (val >> 56);
-        ((unsigned char *)dst)[1] = (val >> 48);
-        ((unsigned char *)dst)[2] = (val >> 40);
-        ((unsigned char *)dst)[3] = (val >> 32);
-        ((unsigned char *)dst)[4] = (val >> 24);
-        ((unsigned char *)dst)[5] = (val >> 16);
-        ((unsigned char *)dst)[6] = (val >> 8);
-        ((unsigned char *)dst)[7] = val;
-    }
-#endif
-#else
-    ((unsigned char *)dst)[0] = (val >> 56);
-    ((unsigned char *)dst)[1] = (val >> 48);
-    ((unsigned char *)dst)[2] = (val >> 40);
-    ((unsigned char *)dst)[3] = (val >> 32);
-    ((unsigned char *)dst)[4] = (val >> 24);
-    ((unsigned char *)dst)[5] = (val >> 16);
-    ((unsigned char *)dst)[6] = (val >> 8);
-    ((unsigned char *)dst)[7] = val;
-#endif
-}
-
-/**
- * Encode a 64-bit value into the provided buffer (big endian convention).
- * The destination buffer must be properly aligned.
- *
- * @param dst   the destination buffer (64-bit aligned)
- * @param val   the value to encode
- */
-static SPH_INLINE void
-sph_enc64be_aligned(void *dst, sph_u64 val)
-{
-#if SPH_LITTLE_ENDIAN
-    *(sph_u64 *)dst = sph_bswap64(val);
-#elif SPH_BIG_ENDIAN
-    *(sph_u64 *)dst = val;
-#else
-    ((unsigned char *)dst)[0] = (val >> 56);
-    ((unsigned char *)dst)[1] = (val >> 48);
-    ((unsigned char *)dst)[2] = (val >> 40);
-    ((unsigned char *)dst)[3] = (val >> 32);
-    ((unsigned char *)dst)[4] = (val >> 24);
-    ((unsigned char *)dst)[5] = (val >> 16);
-    ((unsigned char *)dst)[6] = (val >> 8);
-    ((unsigned char *)dst)[7] = val;
-#endif
-}
-
-/**
- * Decode a 64-bit value from the provided buffer (big endian convention).
- *
- * @param src   the source buffer
- * @return  the decoded value
- */
-static SPH_INLINE sph_u64
-sph_dec64be(const void *src)
-{
-#if defined SPH_UPTR
-#if SPH_UNALIGNED
-#if SPH_LITTLE_ENDIAN
-    return sph_bswap64(*(const sph_u64 *)src);
-#else
-    return *(const sph_u64 *)src;
-#endif
-#else
-    if (((SPH_UPTR)src & 7) == 0) {
-#if SPH_LITTLE_ENDIAN
-        return sph_bswap64(*(const sph_u64 *)src);
-#else
-        return *(const sph_u64 *)src;
-#endif
-    } else {
-        return ((sph_u64)(((const unsigned char *)src)[0]) << 56)
-            | ((sph_u64)(((const unsigned char *)src)[1]) << 48)
-            | ((sph_u64)(((const unsigned char *)src)[2]) << 40)
-            | ((sph_u64)(((const unsigned char *)src)[3]) << 32)
-            | ((sph_u64)(((const unsigned char *)src)[4]) << 24)
-            | ((sph_u64)(((const unsigned char *)src)[5]) << 16)
-            | ((sph_u64)(((const unsigned char *)src)[6]) << 8)
-            | (sph_u64)(((const unsigned char *)src)[7]);
-    }
-#endif
-#else
-    return ((sph_u64)(((const unsigned char *)src)[0]) << 56)
-        | ((sph_u64)(((const unsigned char *)src)[1]) << 48)
-        | ((sph_u64)(((const unsigned char *)src)[2]) << 40)
-        | ((sph_u64)(((const unsigned char *)src)[3]) << 32)
-        | ((sph_u64)(((const unsigned char *)src)[4]) << 24)
-        | ((sph_u64)(((const unsigned char *)src)[5]) << 16)
-        | ((sph_u64)(((const unsigned char *)src)[6]) << 8)
-        | (sph_u64)(((const unsigned char *)src)[7]);
-#endif
-}
-
-/**
- * Decode a 64-bit value from the provided buffer (big endian convention).
- * The source buffer must be properly aligned.
- *
- * @param src   the source buffer (64-bit aligned)
- * @return  the decoded value
- */
-static SPH_INLINE sph_u64
-sph_dec64be_aligned(const void *src)
-{
-#if SPH_LITTLE_ENDIAN
-    return sph_bswap64(*(const sph_u64 *)src);
-#elif SPH_BIG_ENDIAN
-    return *(const sph_u64 *)src;
-#else
-    return ((sph_u64)(((const unsigned char *)src)[0]) << 56)
-        | ((sph_u64)(((const unsigned char *)src)[1]) << 48)
-        | ((sph_u64)(((const unsigned char *)src)[2]) << 40)
-        | ((sph_u64)(((const unsigned char *)src)[3]) << 32)
-        | ((sph_u64)(((const unsigned char *)src)[4]) << 24)
-        | ((sph_u64)(((const unsigned char *)src)[5]) << 16)
-        | ((sph_u64)(((const unsigned char *)src)[6]) << 8)
-        | (sph_u64)(((const unsigned char *)src)[7]);
-#endif
-}
-
-/**
- * Encode a 64-bit value into the provided buffer (little endian convention).
- *
- * @param dst   the destination buffer
- * @param val   the 64-bit value to encode
- */
-static SPH_INLINE void
-sph_enc64le(void *dst, sph_u64 val)
-{
-#if defined SPH_UPTR
-#if SPH_UNALIGNED
-#if SPH_BIG_ENDIAN
-    val = sph_bswap64(val);
-#endif
-    *(sph_u64 *)dst = val;
-#else
-    if (((SPH_UPTR)dst & 7) == 0) {
-#if SPH_BIG_ENDIAN
-        val = sph_bswap64(val);
-#endif
-        *(sph_u64 *)dst = val;
-    } else {
-        ((unsigned char *)dst)[0] = val;
-        ((unsigned char *)dst)[1] = (val >> 8);
-        ((unsigned char *)dst)[2] = (val >> 16);
-        ((unsigned char *)dst)[3] = (val >> 24);
-        ((unsigned char *)dst)[4] = (val >> 32);
-        ((unsigned char *)dst)[5] = (val >> 40);
-        ((unsigned char *)dst)[6] = (val >> 48);
-        ((unsigned char *)dst)[7] = (val >> 56);
-    }
-#endif
-#else
-    ((unsigned char *)dst)[0] = val;
-    ((unsigned char *)dst)[1] = (val >> 8);
-    ((unsigned char *)dst)[2] = (val >> 16);
-    ((unsigned char *)dst)[3] = (val >> 24);
-    ((unsigned char *)dst)[4] = (val >> 32);
-    ((unsigned char *)dst)[5] = (val >> 40);
-    ((unsigned char *)dst)[6] = (val >> 48);
-    ((unsigned char *)dst)[7] = (val >> 56);
-#endif
-}
-
-/**
- * Encode a 64-bit value into the provided buffer (little endian convention).
- * The destination buffer must be properly aligned.
- *
- * @param dst   the destination buffer (64-bit aligned)
- * @param val   the value to encode
- */
-static SPH_INLINE void
-sph_enc64le_aligned(void *dst, sph_u64 val)
-{
-#if SPH_LITTLE_ENDIAN
-    *(sph_u64 *)dst = val;
-#elif SPH_BIG_ENDIAN
-    *(sph_u64 *)dst = sph_bswap64(val);
-#else
-    ((unsigned char *)dst)[0] = val;
-    ((unsigned char *)dst)[1] = (val >> 8);
-    ((unsigned char *)dst)[2] = (val >> 16);
-    ((unsigned char *)dst)[3] = (val >> 24);
-    ((unsigned char *)dst)[4] = (val >> 32);
-    ((unsigned char *)dst)[5] = (val >> 40);
-    ((unsigned char *)dst)[6] = (val >> 48);
-    ((unsigned char *)dst)[7] = (val >> 56);
-#endif
-}
-
-/**
- * Decode a 64-bit value from the provided buffer (little endian convention).
- *
- * @param src   the source buffer
- * @return  the decoded value
- */
-static SPH_INLINE sph_u64
-sph_dec64le(const void *src)
-{
-#if defined SPH_UPTR
-#if SPH_UNALIGNED
-#if SPH_BIG_ENDIAN
-    return sph_bswap64(*(const sph_u64 *)src);
-#else
-    return *(const sph_u64 *)src;
-#endif
-#else
-    if (((SPH_UPTR)src & 7) == 0) {
-#if SPH_BIG_ENDIAN
-#if SPH_SPARCV9_GCC_64 && !SPH_NO_ASM
-        sph_u64 tmp;
-
-        __asm__ __volatile__ (
-            "ldxa [%1]0x88,%0" : "=r" (tmp) : "r" (src));
-        return tmp;
-/*
- * Not worth it generally.
- *
-#elif SPH_PPC32_GCC && !SPH_NO_ASM
-        return (sph_u64)sph_dec32le_aligned(src)
-            | ((sph_u64)sph_dec32le_aligned(
-                (const char *)src + 4) << 32);
-#elif SPH_PPC64_GCC && !SPH_NO_ASM
-        sph_u64 tmp;
-
-        __asm__ __volatile__ (
-            "ldbrx %0,0,%1" : "=r" (tmp) : "r" (src));
-        return tmp;
- */
-#else
-        return sph_bswap64(*(const sph_u64 *)src);
-#endif
-#else
-        return *(const sph_u64 *)src;
-#endif
-    } else {
-        return (sph_u64)(((const unsigned char *)src)[0])
-            | ((sph_u64)(((const unsigned char *)src)[1]) << 8)
-            | ((sph_u64)(((const unsigned char *)src)[2]) << 16)
-            | ((sph_u64)(((const unsigned char *)src)[3]) << 24)
-            | ((sph_u64)(((const unsigned char *)src)[4]) << 32)
-            | ((sph_u64)(((const unsigned char *)src)[5]) << 40)
-            | ((sph_u64)(((const unsigned char *)src)[6]) << 48)
-            | ((sph_u64)(((const unsigned char *)src)[7]) << 56);
-    }
-#endif
-#else
-    return (sph_u64)(((const unsigned char *)src)[0])
-        | ((sph_u64)(((const unsigned char *)src)[1]) << 8)
-        | ((sph_u64)(((const unsigned char *)src)[2]) << 16)
-        | ((sph_u64)(((const unsigned char *)src)[3]) << 24)
-        | ((sph_u64)(((const unsigned char *)src)[4]) << 32)
-        | ((sph_u64)(((const unsigned char *)src)[5]) << 40)
-        | ((sph_u64)(((const unsigned char *)src)[6]) << 48)
-        | ((sph_u64)(((const unsigned char *)src)[7]) << 56);
-#endif
-}
-
-/**
- * Decode a 64-bit value from the provided buffer (little endian convention).
- * The source buffer must be properly aligned.
- *
- * @param src   the source buffer (64-bit aligned)
- * @return  the decoded value
- */
-static SPH_INLINE sph_u64
-sph_dec64le_aligned(const void *src)
-{
-#if SPH_LITTLE_ENDIAN
-    return *(const sph_u64 *)src;
-#elif SPH_BIG_ENDIAN
-#if SPH_SPARCV9_GCC_64 && !SPH_NO_ASM
-    sph_u64 tmp;
-
-    __asm__ __volatile__ ("ldxa [%1]0x88,%0" : "=r" (tmp) : "r" (src));
-    return tmp;
-/*
- * Not worth it generally.
- *
-#elif SPH_PPC32_GCC && !SPH_NO_ASM
-    return (sph_u64)sph_dec32le_aligned(src)
-        | ((sph_u64)sph_dec32le_aligned((const char *)src + 4) << 32);
-#elif SPH_PPC64_GCC && !SPH_NO_ASM
-    sph_u64 tmp;
-
-    __asm__ __volatile__ ("ldbrx %0,0,%1" : "=r" (tmp) : "r" (src));
-    return tmp;
- */
-#else
-    return sph_bswap64(*(const sph_u64 *)src);
-#endif
-#else
-    return (sph_u64)(((const unsigned char *)src)[0])
-        | ((sph_u64)(((const unsigned char *)src)[1]) << 8)
-        | ((sph_u64)(((const unsigned char *)src)[2]) << 16)
-        | ((sph_u64)(((const unsigned char *)src)[3]) << 24)
-        | ((sph_u64)(((const unsigned char *)src)[4]) << 32)
-        | ((sph_u64)(((const unsigned char *)src)[5]) << 40)
-        | ((sph_u64)(((const unsigned char *)src)[6]) << 48)
-        | ((sph_u64)(((const unsigned char *)src)[7]) << 56);
-#endif
-}
-
-#endif
-
-#endif /* Doxygen excluded block */
-
-#endif
diff --git a/algo/yespower/yespower-blake2b.c b/algo/yespower/yespower-blake2b.c
index 8dd85c4f..41dec41b 100644
--- a/algo/yespower/yespower-blake2b.c
+++ b/algo/yespower/yespower-blake2b.c
@@ -95,7 +95,7 @@
 #include <stdint.h>
 #include <stdlib.h>
 #include <string.h>
-#include "crypto/blake2b-yp.h"
+#include "crypto/hmac-blake2b.h"
 #include "yespower.h"
 
 #ifdef __unix__
@@ -1136,6 +1136,7 @@ int yespower_b2b(yespower_local_t *local,
     salsa20_blk_t *V, *XY;
     pwxform_ctx_t ctx;
     uint8_t init_hash[32];
+    sph_blake2b_ctx blake2b_ctx;
 
     /* Sanity-check parameters */
     if ((N < 1024 || N > 512 * 1024 || r < 8 || r > 32 ||
@@ -1167,7 +1168,9 @@ int yespower_b2b(yespower_local_t *local,
     ctx.S0 = S;
     ctx.S1 = S + Swidth_to_Sbytes1(Swidth);
 
-    blake2b_yp_hash(init_hash, src, srclen);
+    sph_blake2b_init( &blake2b_ctx, 32, NULL, 0 );
+    sph_blake2b_update( &blake2b_ctx, src, srclen );
+    sph_blake2b_final( &blake2b_ctx, init_hash );
 
     ctx.S2 = S + 2 * Swidth_to_Sbytes1(Swidth);
     ctx.w = 0;
@@ -1181,7 +1184,7 @@ int yespower_b2b(yespower_local_t *local,
 
     if ( work_restart[thrid].restart ) return false;
     
-    pbkdf2_blake2b_yp(init_hash, sizeof(init_hash), src, srclen, 1, B, 128);
+    pbkdf2_blake2b(init_hash, sizeof(init_hash), src, srclen, 1, B, 128);
 
     if ( work_restart[thrid].restart ) return false;
 
@@ -1190,7 +1193,7 @@ int yespower_b2b(yespower_local_t *local,
 
     if ( work_restart[thrid].restart ) return false;
 
-    hmac_blake2b_yp_hash((uint8_t *)dst, B + B_size - 64, 64, init_hash, sizeof(init_hash));
+    hmac_blake2b_hash((uint8_t *)dst, B + B_size - 64, 64, init_hash, sizeof(init_hash));
 
     /* Success! */
     return 1;
diff --git a/algo/yespower/yespower-gate.c b/algo/yespower/yespower-gate.c
index 89680371..a52dea2c 100644
--- a/algo/yespower/yespower-gate.c
+++ b/algo/yespower/yespower-gate.c
@@ -249,7 +249,7 @@ bool register_power2b_algo( algo_gate_t* gate )
   applog( LOG_NOTICE,"Key= \"%s\"", yespower_params.pers );
   applog( LOG_NOTICE,"Key length= %d\n", yespower_params.perslen );
 
-  gate->optimizations = SSE2_OPT;
+  gate->optimizations = SSE2_OPT | AVX2_OPT;
   gate->scanhash      = (void*)&scanhash_yespower_b2b;
   gate->hash          = (void*)&yespower_b2b_hash;
   opt_target_factor = 65536.0;
diff --git a/configure b/configure
index 8778a4c6..93e54874 100755
--- a/configure
+++ b/configure
@@ -1,6 +1,6 @@
 #! /bin/sh
 # Guess values for system-dependent variables and create Makefiles.
-# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.20.0.
+# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.20.1.
 #
 #
 # Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc.
@@ -577,8 +577,8 @@ MAKEFLAGS=
 # Identity of this package.
 PACKAGE_NAME='cpuminer-opt'
 PACKAGE_TARNAME='cpuminer-opt'
-PACKAGE_VERSION='3.20.0'
-PACKAGE_STRING='cpuminer-opt 3.20.0'
+PACKAGE_VERSION='3.20.1'
+PACKAGE_STRING='cpuminer-opt 3.20.1'
 PACKAGE_BUGREPORT=''
 PACKAGE_URL=''
 
@@ -1332,7 +1332,7 @@ if test "$ac_init_help" = "long"; then
   # Omit some internal or obsolete options to make the list less imposing.
   # This message is too long to be a string in the A/UX 3.1 sh.
   cat <<_ACEOF
-\`configure' configures cpuminer-opt 3.20.0 to adapt to many kinds of systems.
+\`configure' configures cpuminer-opt 3.20.1 to adapt to many kinds of systems.
 
 Usage: $0 [OPTION]... [VAR=VALUE]...
 
@@ -1404,7 +1404,7 @@ fi
 
 if test -n "$ac_init_help"; then
   case $ac_init_help in
-     short | recursive ) echo "Configuration of cpuminer-opt 3.20.0:";;
+     short | recursive ) echo "Configuration of cpuminer-opt 3.20.1:";;
    esac
   cat <<\_ACEOF
 
@@ -1509,7 +1509,7 @@ fi
 test -n "$ac_init_help" && exit $ac_status
 if $ac_init_version; then
   cat <<\_ACEOF
-cpuminer-opt configure 3.20.0
+cpuminer-opt configure 3.20.1
 generated by GNU Autoconf 2.69
 
 Copyright (C) 2012 Free Software Foundation, Inc.
@@ -2012,7 +2012,7 @@ cat >config.log <<_ACEOF
 This file contains any messages produced by compilers while
 running configure, to aid debugging if configure makes a mistake.
 
-It was created by cpuminer-opt $as_me 3.20.0, which was
+It was created by cpuminer-opt $as_me 3.20.1, which was
 generated by GNU Autoconf 2.69.  Invocation command line was
 
   $ $0 $@
@@ -2993,7 +2993,7 @@ fi
 
 # Define the identity of the package.
  PACKAGE='cpuminer-opt'
- VERSION='3.20.0'
+ VERSION='3.20.1'
 
 
 cat >>confdefs.h <<_ACEOF
@@ -5820,6 +5820,34 @@ $as_echo "#define USE_AVX2 1" >>confdefs.h
 
       { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
 $as_echo "yes" >&6; }
+      { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether we can compile AVX512 code" >&5
+$as_echo_n "checking whether we can compile AVX512 code... " >&6; }
+      cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+
+int
+main ()
+{
+asm ("vpaddd %zmm0, %zmm1, %zmm2{%k1}");
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_compile "$LINENO"; then :
+
+$as_echo "#define USE_AVX512 1" >>confdefs.h
+
+        { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
+$as_echo "yes" >&6; }
+
+else
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+        { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: The assembler does not support the AVX512 instruction set." >&5
+$as_echo "$as_me: WARNING: The assembler does not support the AVX512 instruction set." >&2;}
+
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
 
 else
   { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
@@ -6690,7 +6718,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
 # report actual input values of CONFIG_FILES etc. instead of their
 # values after options handling.
 ac_log="
-This file was extended by cpuminer-opt $as_me 3.20.0, which was
+This file was extended by cpuminer-opt $as_me 3.20.1, which was
 generated by GNU Autoconf 2.69.  Invocation command line was
 
   CONFIG_FILES    = $CONFIG_FILES
@@ -6756,7 +6784,7 @@ _ACEOF
 cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
 ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
 ac_cs_version="\\
-cpuminer-opt config.status 3.20.0
+cpuminer-opt config.status 3.20.1
 configured by $0, generated by GNU Autoconf 2.69,
   with options \\"\$ac_cs_config\\"
 
diff --git a/configure.ac b/configure.ac
index 1e0589b0..d0835055 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1,4 +1,4 @@
-AC_INIT([cpuminer-opt], [3.20.0])
+AC_INIT([cpuminer-opt], [3.20.1])
 
 AC_PREREQ([2.59c])
 AC_CANONICAL_SYSTEM
@@ -93,6 +93,14 @@ then
     AC_COMPILE_IFELSE([AC_LANG_PROGRAM(,[asm ("vpaddd %ymm0, %ymm1, %ymm2");])],
       AC_DEFINE(USE_AVX2, 1, [Define to 1 if AVX2 assembly is available.])
       AC_MSG_RESULT(yes)
+      AC_MSG_CHECKING(whether we can compile AVX512 code)
+      AC_COMPILE_IFELSE([AC_LANG_PROGRAM(,[asm ("vpaddd %zmm0, %zmm1, %zmm2{%k1}");])],
+        AC_DEFINE(USE_AVX512, 1, [Define to 1 if AVX512 assembly is available.])
+        AC_MSG_RESULT(yes)
+      ,
+        AC_MSG_RESULT(no)
+        AC_MSG_WARN([The assembler does not support the AVX512 instruction set.])
+      )
     ,
       AC_MSG_RESULT(no)
       AC_MSG_WARN([The assembler does not support the AVX2 instruction set.])
diff --git a/cpu-miner.c b/cpu-miner.c
index 8fc6c7a0..64f30935 100644
--- a/cpu-miner.c
+++ b/cpu-miner.c
@@ -1300,6 +1300,7 @@ static int share_result( int result, struct work *work,
            my_stats.share_count, acol, ares, scol, sres, rcol, rres, bcol,
            bres, CL_N, share_time, latency );
 
+/*   
    if ( unlikely( opt_debug || !result || solved ) )
    {
       if ( have_stratum )
@@ -1309,14 +1310,27 @@ static int share_result( int result, struct work *work,
          applog2( LOG_INFO, "Diff %.5g, Block %d",
                my_stats.share_diff, work ? work->height : last_block_height );
    }
+*/
 
    if ( unlikely( !( opt_quiet || result || stale ) ) )
    {
-      uint32_t str[8];
-      uint32_t *targ;
+//      uint32_t str[8];
+//      uint32_t *targ;
+
+      if ( reason ) applog2( LOG_MINR, "Reject reason: %s", reason );
+      {
+         // The exact hash is not available here, only an imprecise
+         // approximation calculated from the share difficulty, which is
+         // useless for anything other than low diff rejects. Until the real
+         // hash and target are made available, display only the share and
+         // target diff, and only for low difficulty rejects. reason is
+         // NULL-checked above, so it must be tested before strstr as well.
+
+         if ( reason && strstr( reason, "difficulty" ) )
+            applog2( LOG_MINR, "Share diff: %.5g, Target: %.5g",
+                               my_stats.share_diff, my_stats.target_diff );
 
-      if ( reason ) applog( LOG_MINR, "Reject reason: %s", reason );
-         
+/*
       diff_to_hash( str, my_stats.share_diff );
       applog2( LOG_INFO, "Hash:   %08x%08x%08x%08x%08x%08x", str[7], str[6],
                str[5], str[4], str[3],str[2], str[1], str[0] );
@@ -1330,6 +1344,8 @@ static int share_result( int result, struct work *work,
       }
       applog2( LOG_INFO, "Target: %08x%08x%08x%08x%08x%08x", targ[7], targ[6],
                targ[5], targ[4], targ[3], targ[2], targ[1], targ[0] );
+*/
+      }
    }
    return 1;
 }
diff --git a/simd-utils/simd-128.h b/simd-utils/simd-128.h
index c91a35e9..2a52eb2f 100644
--- a/simd-utils/simd-128.h
+++ b/simd-utils/simd-128.h
@@ -546,14 +546,13 @@ static inline void mm128_block_bswap_32( __m128i *d, const __m128i *s )
 
 
 // Two input shuffle-rotate.
-// Concatenate v1 & v2 and rotate as one 256 bit vector.
-// Continue to use vror/vrol for now to avoid confusion with
-// shufl2r/shufl2l function macros available with AVX512.
+// Concatenate v1 & v2 and bit rotate as one 256 bit vector.
 
 #if defined(__SSSE3__)
 
-// Function macro with two inputs and one output, inputs are preserved.
-// Two input functions are not available without SSSE3. Use procedure
+// Function macros with two inputs and one output, inputs are preserved.
+// Returns the high 128 bits, ie updated v1.
+// These two-input functions are not available without SSSE3. Use procedure
 // macros below instead.
 
 #define mm128_shufl2r_64( v1, v2 )     _mm_alignr_epi8( v2, v1, 8 )
@@ -568,12 +567,9 @@ static inline void mm128_block_bswap_32( __m128i *d, const __m128i *s )
 #define mm128_shufl2r_8( v1, v2 )      _mm_alignr_epi8( v2, v1, 8 )
 #define mm128_shufl2l_8( v1, v2 )      _mm_alignr_epi8( v1, v2, 8 )
 
-// Procedure macros with 2 inputs and 2 outputs, inputs args are overwritten.
-
-// These macros retain the vrol/vror name for now to avoid
-// confusion with the shufl2r/shuffle2l function macros above.
-// These may be renamed to something like shufl2r2 for 2 nputs and
-// 2 outputs, ie SHUFfLe 2 inputs Right with 2 outputs.
+// Procedure macros with 2 inputs and 2 outputs, input args are overwritten.
+// Deprecated for SSSE3 and above, they exist for SSSE3 only for compatibility
+// with existing code. The function macros above can be used more efficiently.
 
 #define mm128_vror256_64( v1, v2 ) \
 do { \