From 19b0ac6d5c0f48dd12596c44c6a0b77c1581dfff Mon Sep 17 00:00:00 2001 From: Jay D Dee Date: Thu, 13 Feb 2020 04:25:33 -0500 Subject: [PATCH] v3.12.3 --- RELEASE_NOTES | 11 + algo-gate-api.c | 64 +- algo/cubehash/cube-hash-2way.c | 16 - algo/cubehash/cube-hash-2way.h | 21 + algo/echo/echo-hash-4way.h | 10 +- algo/fugue/sph_fugue.h | 8 + algo/quark/anime-4way.c | 361 ++++++-- algo/quark/anime-gate.c | 8 +- algo/quark/anime-gate.h | 15 +- algo/quark/hmq1725-4way.c | 1379 ++++++++++-------------------- algo/quark/quark-4way.c | 318 +++---- algo/scrypt/scrypt.c | 3 +- algo/shavite/sph-shavite-aesni.c | 2 +- algo/shavite/sph_shavite.c | 5 + algo/shavite/sph_shavite.h | 31 +- algo/skein/skein-4way.c | 23 +- algo/skein/skein-hash-4way.c | 157 ++++ algo/skein/skein-hash-4way.h | 8 + algo/skein/skein2-4way.c | 37 +- algo/whirlpool/sph_whirlpool.h | 7 + algo/x13/skunk-4way.c | 74 +- algo/x16/hex.c | 25 +- algo/x16/x16r-4way.c | 152 +--- algo/x16/x16r-gate.h | 6 +- algo/x16/x16r.c | 17 +- algo/x16/x16rv2-4way.c | 303 +++--- algo/x16/x16rv2.c | 14 +- algo/x17/sonoa-4way.c | 752 +++++----------- algo/x17/x17-4way.c | 152 +--- algo/x17/xevan-4way.c | 256 ++---- configure | 20 +- configure.ac | 2 +- cpu-miner.c | 2 + simd-utils/simd-256.h | 25 +- 34 files changed, 1714 insertions(+), 2570 deletions(-) diff --git a/RELEASE_NOTES b/RELEASE_NOTES index 257fd843..4d0e4476 100644 --- a/RELEASE_NOTES +++ b/RELEASE_NOTES @@ -65,6 +65,17 @@ If not what makes it happen or not happen? Change Log ---------- +v3.12.3 + +Issue #238: Fixed skunk AVX2. + +Issue #239: Faster AVX2 & AVX512 for skein +44%, skein2 +30%, plus marginal +increases for skunk, x16r, x16rv2, x16rt, x16rt-veil, x16s, x21s. + +Faster anime VAES +57%, AVX512 +21%, AVX2 +3%. + +Redesigned code responsible for #236. + v3.12.2 Fixed xevan, skein, skein2 AVX2, #238. 
diff --git a/algo-gate-api.c b/algo-gate-api.c index c8517227..4047d94c 100644 --- a/algo-gate-api.c +++ b/algo-gate-api.c @@ -281,39 +281,37 @@ void exec_hash_function( int algo, void *output, const void *pdata ) const char* const algo_alias_map[][2] = { // alias proper - { "argon2d-crds", "argon2d250" }, - { "argon2d-dyn", "argon2d500" }, - { "argon2d-uis", "argon2d4096" }, - { "bcd", "x13bcd" }, - { "bitcore", "timetravel10" }, - { "bitzeny", "yescryptr8" }, - { "blake256r8", "blakecoin" }, - { "blake256r8vnl", "vanilla" }, - { "blake256r14", "blake" }, - { "blake256r14dcr", "decred" }, - { "cryptonote", "cryptonight" }, - { "cryptonight-light", "cryptolight" }, - { "diamond", "dmd-gr" }, - { "droplp", "drop" }, - { "espers", "hmq1725" }, - { "flax", "c11" }, - { "hsr", "x13sm3" }, - { "jackpot", "jha" }, - { "jane", "scryptjane" }, - { "lyra2", "lyra2re" }, - { "lyra2v2", "lyra2rev2" }, - { "lyra2v3", "lyra2rev3" }, - { "myrgr", "myr-gr" }, - { "myriad", "myr-gr" }, - { "neo", "neoscrypt" }, - { "phi", "phi1612" }, - { "sib", "x11gost" }, - { "timetravel8", "timetravel" }, - { "veil", "x16rt-veil" }, - { "x16r-hex", "hex" }, - { "yenten", "yescryptr16" }, - { "ziftr", "zr5" }, - { NULL, NULL } + { "argon2d-crds", "argon2d250" }, + { "argon2d-dyn", "argon2d500" }, + { "argon2d-uis", "argon2d4096" }, + { "bcd", "x13bcd" }, + { "bitcore", "timetravel10" }, + { "bitzeny", "yescryptr8" }, + { "blake256r8", "blakecoin" }, + { "blake256r8vnl", "vanilla" }, + { "blake256r14", "blake" }, + { "blake256r14dcr", "decred" }, + { "diamond", "dmd-gr" }, + { "espers", "hmq1725" }, + { "flax", "c11" }, + { "hsr", "x13sm3" }, + { "jackpot", "jha" }, + { "jane", "scryptjane" }, + { "lyra2", "lyra2re" }, + { "lyra2v2", "lyra2rev2" }, + { "lyra2v3", "lyra2rev3" }, + { "myrgr", "myr-gr" }, + { "myriad", "myr-gr" }, + { "neo", "neoscrypt" }, + { "phi", "phi1612" }, + { "scryptn2", "scrypt:1048576" }, + { "sib", "x11gost" }, + { "timetravel8", "timetravel" }, + { "veil", "x16rt-veil" 
}, + { "x16r-hex", "hex" }, + { "yenten", "yescryptr16" }, + { "ziftr", "zr5" }, + { NULL, NULL } }; // if arg is a valid alias for a known algo it is updated with the proper diff --git a/algo/cubehash/cube-hash-2way.c b/algo/cubehash/cube-hash-2way.c index 9a9dfc81..1201b8f2 100644 --- a/algo/cubehash/cube-hash-2way.c +++ b/algo/cubehash/cube-hash-2way.c @@ -179,14 +179,6 @@ int cube_4way_full( cube_4way_context *sp, void *output, int hashbitlen, sp->rounds = 16; sp->pos = 0; - h[ 0] = m512_const1_128( iv[0] ); - h[ 1] = m512_const1_128( iv[1] ); - h[ 2] = m512_const1_128( iv[2] ); - h[ 3] = m512_const1_128( iv[3] ); - h[ 4] = m512_const1_128( iv[4] ); - h[ 5] = m512_const1_128( iv[5] ); - h[ 6] = m512_const1_128( iv[6] ); - h[ 7] = m512_const1_128( iv[7] ); h[ 0] = m512_const1_128( iv[0] ); h[ 1] = m512_const1_128( iv[1] ); h[ 2] = m512_const1_128( iv[2] ); @@ -447,14 +439,6 @@ int cube_2way_full( cube_2way_context *sp, void *output, int hashbitlen, sp->rounds = 16; sp->pos = 0; - h[ 0] = m256_const1_128( iv[0] ); - h[ 1] = m256_const1_128( iv[1] ); - h[ 2] = m256_const1_128( iv[2] ); - h[ 3] = m256_const1_128( iv[3] ); - h[ 4] = m256_const1_128( iv[4] ); - h[ 5] = m256_const1_128( iv[5] ); - h[ 6] = m256_const1_128( iv[6] ); - h[ 7] = m256_const1_128( iv[7] ); h[ 0] = m256_const1_128( iv[0] ); h[ 1] = m256_const1_128( iv[1] ); h[ 2] = m256_const1_128( iv[2] ); diff --git a/algo/cubehash/cube-hash-2way.h b/algo/cubehash/cube-hash-2way.h index eddd8130..25df10e8 100644 --- a/algo/cubehash/cube-hash-2way.h +++ b/algo/cubehash/cube-hash-2way.h @@ -28,6 +28,27 @@ int cube_4way_update_close( cube_4way_context *sp, void *output, int cube_4way_full( cube_4way_context *sp, void *output, int hashbitlen, const void *data, size_t size ); +int cube_4x256_full( cube_4way_context *sp, void *output, int hashbitlen, + const void *data, size_t size ); + +#define cube512_4way_init( sp ) cube_4way_update( sp, 512 ) +#define cube512_4way_update cube_4way_update +#define 
cube512_4way_update_close cube_4way_update +#define cube512_4way_close cube_4way_update +#define cube512_4way_full( sp, output, data, size ) \ + cube_4way_full( sp, output, 512, data, size ) +#define cube512_4x256_full( sp, output, data, size ) \ + cube_4x256_full( sp, output, 512, data, size ) + +#define cube256_4way_init( sp ) cube_4way_update( sp, 256 ) +#define cube256_4way_update cube_4way_update +#define cube256_4way_update_close cube_4way_update +#define cube256_4way_close cube_4way_update +#define cube256_4way_full( sp, output, data, size ) \ + cube_4way_full( sp, output, 256, data, size ) +#define cube256_4x256_full( sp, output, data, size ) \ + cube_4x256_full( sp, output, 256, data, size ) + #endif // 2x128, 2 way parallel SSE2 diff --git a/algo/echo/echo-hash-4way.h b/algo/echo/echo-hash-4way.h index 014c789e..f9e906f2 100644 --- a/algo/echo/echo-hash-4way.h +++ b/algo/echo/echo-hash-4way.h @@ -22,18 +22,26 @@ typedef struct } echo_4way_context __attribute__ ((aligned (64))); int echo_4way_init( echo_4way_context *state, int hashbitlen ); - +#define echo512_4way_init( state ) echo_4way_init( state, 512 ) +#define echo256_4way_init( state ) echo_4way_init( state, 256 ) int echo_4way_update( echo_4way_context *state, const void *data, unsigned int databitlen); +#define echo512_4way_update echo_4way_update int echo_close( echo_4way_context *state, void *hashval ); +#define echo512_4way_close echo_4way_close int echo_4way_update_close( echo_4way_context *state, void *hashval, const void *data, int databitlen ); +#define echo512_4way_update_close echo_4way_update_close int echo_4way_full( echo_4way_context *ctx, void *hashval, int nHashSize, const void *data, int datalen ); +#define echo512_4way_full( state, hashval, data, datalen ) \ + echo_4way_full( state, hashval, 512, data, datalen ) +#define echo256_4way_full( state, hashval, data, datalen ) \ + echo_4way_full( state, hashval, 256, data, datalen ) #endif #endif diff --git a/algo/fugue/sph_fugue.h 
b/algo/fugue/sph_fugue.h index d8d0ea04..08d4dde0 100644 --- a/algo/fugue/sph_fugue.h +++ b/algo/fugue/sph_fugue.h @@ -74,6 +74,14 @@ void sph_fugue512_close(void *cc, void *dst); void sph_fugue512_addbits_and_close( void *cc, unsigned ub, unsigned n, void *dst); +#define sph_fugue512_full( cc, dst, data, len ) \ +do{ \ + sph_fugue512_init( cc ); \ + sph_fugue512( cc, data, len ); \ + sph_fugue512_close( cc, dst ); \ +}while(0) + + #ifdef __cplusplus } #endif diff --git a/algo/quark/anime-4way.c b/algo/quark/anime-4way.c index a329d593..994d2909 100644 --- a/algo/quark/anime-4way.c +++ b/algo/quark/anime-4way.c @@ -1,18 +1,241 @@ #include "cpuminer-config.h" #include "anime-gate.h" - -#if defined (ANIME_4WAY) - #include #include #include - #include "algo/blake/blake-hash-4way.h" #include "algo/bmw/bmw-hash-4way.h" #include "algo/skein/skein-hash-4way.h" #include "algo/jh/jh-hash-4way.h" #include "algo/keccak/keccak-hash-4way.h" #include "algo/groestl/aes_ni/hash-groestl.h" +#if defined(__VAES__) + #include "algo/groestl/groestl512-hash-4way.h" +#endif + +#if defined (ANIME_8WAY) + +typedef struct { + blake512_8way_context blake; + bmw512_8way_context bmw; +#if defined(__VAES__) + groestl512_4way_context groestl; +#else + hashState_groestl groestl; +#endif + jh512_8way_context jh; + skein512_8way_context skein; + keccak512_8way_context keccak; +} anime_8way_ctx_holder; + +anime_8way_ctx_holder anime_8way_ctx __attribute__ ((aligned (64))); + +void init_anime_8way_ctx() +{ + blake512_8way_init( &anime_8way_ctx.blake ); + bmw512_8way_init( &anime_8way_ctx.bmw ); +#if defined(__VAES__) + groestl512_4way_init( &anime_8way_ctx.groestl, 64 ); +#else + init_groestl( &anime_8way_ctx.groestl, 64 ); +#endif + skein512_8way_init( &anime_8way_ctx.skein ); + jh512_8way_init( &anime_8way_ctx.jh ); + keccak512_8way_init( &anime_8way_ctx.keccak ); +} + +void anime_8way_hash( void *state, const void *input ) +{ + uint64_t vhash[8*8] __attribute__ ((aligned (128))); + uint64_t 
vhashA[8*8] __attribute__ ((aligned (64))); + uint64_t vhashB[8*8] __attribute__ ((aligned (64))); + uint64_t vhashC[8*8] __attribute__ ((aligned (64))); +#if !defined(__VAES__) + uint64_t hash0[8] __attribute__ ((aligned (64))); + uint64_t hash1[8] __attribute__ ((aligned (64))); + uint64_t hash2[8] __attribute__ ((aligned (64))); + uint64_t hash3[8] __attribute__ ((aligned (64))); + uint64_t hash4[8] __attribute__ ((aligned (64))); + uint64_t hash5[8] __attribute__ ((aligned (64))); + uint64_t hash6[8] __attribute__ ((aligned (64))); + uint64_t hash7[8] __attribute__ ((aligned (64))); +#endif + __m512i* vh = (__m512i*)vhash; + __m512i* vhA = (__m512i*)vhashA; + __m512i* vhB = (__m512i*)vhashB; + __m512i* vhC = (__m512i*)vhashC; + const __m512i bit3_mask = m512_const1_64( 8 ); + const __m512i zero = _mm512_setzero_si512(); + __mmask8 vh_mask; + anime_8way_ctx_holder ctx; + memcpy( &ctx, &anime_8way_ctx, sizeof(anime_8way_ctx) ); + + bmw512_8way_full( &ctx.bmw, vhash, input, 80 ); + + blake512_8way_full( &ctx.blake, vhash, vhash, 64 ); + + vh_mask = _mm512_cmpeq_epi64_mask( _mm512_and_si512( vh[0], bit3_mask ), + zero ); + +#if defined(__VAES__) + + rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 ); + + if ( ( vh_mask & 0x0f ) != 0x0f ) + groestl512_4way_full( &ctx.groestl, vhashA, vhashA, 64 ); + if ( ( vh_mask & 0xf0 ) != 0xf0 ) + groestl512_4way_full( &ctx.groestl, vhashB, vhashB, 64 ); + + rintrlv_4x128_8x64( vhashC, vhashA, vhashB, 512 ); + +#else + + dintrlv_8x64_512( hash0, hash1, hash2, hash3, + hash4, hash5, hash6, hash7, vhash ); + + if ( hash0[0] & 8 ) + groestl512_full( &ctx.groestl, (char*)hash0, (char*)hash0, 512 ); + if ( hash1[0] & 8 ) + groestl512_full( &ctx.groestl, (char*)hash1, (char*)hash1, 512 ); + if ( hash2[0] & 8) + groestl512_full( &ctx.groestl, (char*)hash2, (char*)hash2, 512 ); + if ( hash3[0] & 8 ) + groestl512_full( &ctx.groestl, (char*)hash3, (char*)hash3, 512 ); + if ( hash4[0] & 8 ) + groestl512_full( &ctx.groestl, (char*)hash4, 
(char*)hash4, 512 ); + if ( hash5[0] & 8 ) + groestl512_full( &ctx.groestl, (char*)hash5, (char*)hash5, 512 ); + if ( hash6[0] & 8 ) + groestl512_full( &ctx.groestl, (char*)hash6, (char*)hash6, 512 ); + if ( hash7[0] & 8 ) + groestl512_full( &ctx.groestl, (char*)hash7, (char*)hash7, 512 ); + + intrlv_8x64_512( vhashC, hash0, hash1, hash2, hash3, + hash4, hash5, hash6, hash7 ); + +#endif + + if ( vh_mask & 0xff ) + skein512_8way_full( &ctx.skein, vhashB, vhash, 64 ); + + mm512_blend_hash_8x64( vh, vhC, vhB, vh_mask ); + +#if defined(__VAES__) + + rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 ); + + groestl512_4way_full( &ctx.groestl, vhashA, vhashA, 64 ); + groestl512_4way_full( &ctx.groestl, vhashB, vhashB, 64 ); + + rintrlv_4x128_8x64( vhash, vhashA, vhashB, 512 ); + +#else + + dintrlv_8x64_512( hash0, hash1, hash2, hash3, + hash4, hash5, hash6, hash7, vhash ); + + groestl512_full( &ctx.groestl, (char*)hash0, (char*)hash0, 512 ); + groestl512_full( &ctx.groestl, (char*)hash1, (char*)hash1, 512 ); + groestl512_full( &ctx.groestl, (char*)hash2, (char*)hash2, 512 ); + groestl512_full( &ctx.groestl, (char*)hash3, (char*)hash3, 512 ); + groestl512_full( &ctx.groestl, (char*)hash4, (char*)hash4, 512 ); + groestl512_full( &ctx.groestl, (char*)hash5, (char*)hash5, 512 ); + groestl512_full( &ctx.groestl, (char*)hash6, (char*)hash6, 512 ); + groestl512_full( &ctx.groestl, (char*)hash7, (char*)hash7, 512 ); + + intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, + hash4, hash5, hash6, hash7 ); + +#endif + + jh512_8way_init( &ctx.jh ); + jh512_8way_update( &ctx.jh, vhash, 64 ); + jh512_8way_close( &ctx.jh, vhash ); + + vh_mask = _mm512_cmpeq_epi64_mask( _mm512_and_si512( vh[0], bit3_mask ), + zero ); + + if ( ( vh_mask & 0xff ) != 0xff ) + blake512_8way_full( &ctx.blake, vhashA, vhash, 64 ); + if ( vh_mask & 0xff ) + bmw512_8way_full( &ctx.bmw, vhashB, vhash, 64 ); + + mm512_blend_hash_8x64( vh, vhA, vhB, vh_mask ); + + keccak512_8way_init( &ctx.keccak ); + 
keccak512_8way_update( &ctx.keccak, vhash, 64 ); + keccak512_8way_close( &ctx.keccak, vhash ); + + skein512_8way_full( &ctx.skein, vhash, vhash, 64 ); + + vh_mask = _mm512_cmpeq_epi64_mask( _mm512_and_si512( vh[0], bit3_mask ), + zero ); + + if ( ( vh_mask & 0xff ) != 0xff ) + { + keccak512_8way_init( &ctx.keccak ); + keccak512_8way_update( &ctx.keccak, vhash, 64 ); + keccak512_8way_close( &ctx.keccak, vhashA ); + } + if ( vh_mask & 0xff ) + { + jh512_8way_init( &ctx.jh ); + jh512_8way_update( &ctx.jh, vhash, 64 ); + jh512_8way_close( &ctx.jh, vhashB ); + } + + casti_m512i( state,0 ) = _mm512_mask_blend_epi64( vh_mask, vhA[0], vhB[0] ); + casti_m512i( state,1 ) = _mm512_mask_blend_epi64( vh_mask, vhA[1], vhB[1] ); + casti_m512i( state,2 ) = _mm512_mask_blend_epi64( vh_mask, vhA[2], vhB[2] ); + casti_m512i( state,3 ) = _mm512_mask_blend_epi64( vh_mask, vhA[3], vhB[3] ); +} + +int scanhash_anime_8way( struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ) +{ + uint64_t hash64[4*8] __attribute__ ((aligned (64))); + uint32_t vdata[20*8] __attribute__ ((aligned (64))); + uint32_t lane_hash[8] __attribute__ ((aligned (64))); + uint64_t *hash64_q3 = &(hash64[3*8]); + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + const uint64_t targ64_q3 = ((uint64_t*)ptarget)[3]; + uint32_t n = pdata[19]; + const uint32_t first_nonce = pdata[19]; + const uint32_t last_nonce = max_nonce - 8; + __m512i *noncev = (__m512i*)vdata + 9; + const int thr_id = mythr->id; + const bool bench = opt_benchmark; + + mm512_bswap32_intrlv80_8x64( vdata, pdata ); + *noncev = mm512_intrlv_blend_32( + _mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0, + n+3, 0, n+2, 0, n+1, 0, n , 0 ), *noncev ); + + do + { + anime_8way_hash( hash64, vdata ); + + for ( int lane = 0; lane < 8; lane++ ) + if ( unlikely( hash64_q3[ lane ] <= targ64_q3 && !bench ) ) + { + extr_lane_8x64( lane_hash, hash64, lane, 256 ); + if ( valid_hash( lane_hash, ptarget ) ) + { + 
pdata[19] = bswap_32( n + lane ); + submit_lane_solution( work, lane_hash, mythr, lane ); + } + } + *noncev = _mm512_add_epi32( *noncev, + m512_const1_64( 0x0000000800000000 ) ); + n += 8; + } while ( likely( ( n < last_nonce ) && !work_restart[thr_id].restart ) ); + pdata[19] = n; + *hashes_done = n - first_nonce; + return 0; +} + +#elif defined (ANIME_4WAY) typedef struct { blake512_4way_context blake; @@ -23,18 +246,6 @@ typedef struct { keccak512_4way_context keccak; } anime_4way_ctx_holder; -anime_4way_ctx_holder anime_4way_ctx __attribute__ ((aligned (64))); - -void init_anime_4way_ctx() -{ - blake512_4way_init( &anime_4way_ctx.blake ); - bmw512_4way_init( &anime_4way_ctx.bmw ); - init_groestl( &anime_4way_ctx.groestl, 64 ); - skein512_4way_init( &anime_4way_ctx.skein ); - jh512_4way_init( &anime_4way_ctx.jh ); - keccak512_4way_init( &anime_4way_ctx.keccak ); -} - void anime_4way_hash( void *state, const void *input ) { uint64_t hash0[8] __attribute__ ((aligned (64))); @@ -48,81 +259,61 @@ void anime_4way_hash( void *state, const void *input ) __m256i* vhA = (__m256i*)vhashA; __m256i* vhB = (__m256i*)vhashB; __m256i vh_mask; - const uint32_t mask = 8; + int h_mask; const __m256i bit3_mask = m256_const1_64( 8 ); const __m256i zero = _mm256_setzero_si256(); anime_4way_ctx_holder ctx; - memcpy( &ctx, &anime_4way_ctx, sizeof(anime_4way_ctx) ); + bmw512_4way_init( &ctx.bmw ); bmw512_4way_update( &ctx.bmw, input, 80 ); bmw512_4way_close( &ctx.bmw, vhash ); - blake512_4way_update( &ctx.blake, vhash, 64 ); - blake512_4way_close( &ctx.blake, vhash ); + blake512_4way_full( &ctx.blake, vhash, vhash, 64 ); vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ), zero ); + h_mask = _mm256_movemask_epi8( vh_mask ); dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); - if ( hash0[0] & mask ) - { - update_and_final_groestl( &ctx.groestl, (char*)hash0, - (char*)hash0, 512 ); - } - if ( hash1[0] & mask ) - { - reinit_groestl( &ctx.groestl ); - 
update_and_final_groestl( &ctx.groestl, (char*)hash1, - (char*)hash1, 512 ); - } - if ( hash2[0] & mask ) - { - reinit_groestl( &ctx.groestl ); - update_and_final_groestl( &ctx.groestl, (char*)hash2, - (char*)hash2, 512 ); - } - if ( hash3[0] & mask ) - { - reinit_groestl( &ctx.groestl ); - update_and_final_groestl( &ctx.groestl, (char*)hash3, - (char*)hash3, 512 ); - } + // A + if ( hash0[0] & 8 ) + groestl512_full( &ctx.groestl, (char*)hash0, (char*)hash0, 512 ); + if ( hash1[0] & 8 ) + groestl512_full( &ctx.groestl, (char*)hash1, (char*)hash1, 512 ); + if ( hash2[0] & 8) + groestl512_full( &ctx.groestl, (char*)hash2, (char*)hash2, 512 ); + if ( hash3[0] & 8 ) + groestl512_full( &ctx.groestl, (char*)hash3, (char*)hash3, 512 ); intrlv_4x64( vhashA, hash0, hash1, hash2, hash3, 512 ); - if ( mm256_anybits0( vh_mask ) ) - { - skein512_4way_update( &ctx.skein, vhash, 64 ); - skein512_4way_close( &ctx.skein, vhashB ); - } + // B + if ( h_mask & 0xffffffff ) + skein512_4way_full( &ctx.skein, vhashB, vhash, 64 ); mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask ); dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); - reinit_groestl( &ctx.groestl ); - update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 ); - reinit_groestl( &ctx.groestl ); - update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 ); - reinit_groestl( &ctx.groestl ); - update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 ); - reinit_groestl( &ctx.groestl ); - update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 ); + groestl512_full( &ctx.groestl, (char*)hash0, (char*)hash0, 512 ); + groestl512_full( &ctx.groestl, (char*)hash1, (char*)hash1, 512 ); + groestl512_full( &ctx.groestl, (char*)hash2, (char*)hash2, 512 ); + groestl512_full( &ctx.groestl, (char*)hash3, (char*)hash3, 512 ); intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 ); + jh512_4way_init( &ctx.jh ); jh512_4way_update( &ctx.jh, vhash, 64 ); jh512_4way_close( &ctx.jh, 
vhash ); vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ), zero ); + h_mask = _mm256_movemask_epi8( vh_mask ); - if ( mm256_anybits1( vh_mask ) ) - { - blake512_4way_init( &ctx.blake ); - blake512_4way_update( &ctx.blake, vhash, 64 ); - blake512_4way_close( &ctx.blake, vhashA ); - } - if ( mm256_anybits0( vh_mask ) ) + // A + if ( ( h_mask & 0xffffffff ) != 0xffffffff ) + blake512_4way_full( &ctx.blake, vhashA, vhash, 64 ); + // B + if ( h_mask & 0xffffffff ) { bmw512_4way_init( &ctx.bmw ); bmw512_4way_update( &ctx.bmw, vhash, 64 ); @@ -131,64 +322,74 @@ void anime_4way_hash( void *state, const void *input ) mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask ); + keccak512_4way_init( &ctx.keccak ); keccak512_4way_update( &ctx.keccak, vhash, 64 ); keccak512_4way_close( &ctx.keccak, vhash ); - skein512_4way_init( &ctx.skein ); - skein512_4way_update( &ctx.skein, vhash, 64 ); - skein512_4way_close( &ctx.skein, vhash ); + skein512_4way_full( &ctx.skein, vhash, vhash, 64 ); vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ), zero ); + h_mask = _mm256_movemask_epi8( vh_mask ); - if ( mm256_anybits1( vh_mask ) ) + // A + if ( ( h_mask & 0xffffffff ) != 0xffffffff ) { keccak512_4way_init( &ctx.keccak ); keccak512_4way_update( &ctx.keccak, vhash, 64 ); keccak512_4way_close( &ctx.keccak, vhashA ); } - if ( mm256_anybits0( vh_mask ) ) + // B + if ( h_mask & 0xffffffff ) { jh512_4way_init( &ctx.jh ); jh512_4way_update( &ctx.jh, vhash, 64 ); jh512_4way_close( &ctx.jh, vhashB ); } - mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask ); - - dintrlv_4x64( state, state+32, state+64, state+96, vhash, 256 ); + casti_m256i( state, 0 ) = _mm256_blendv_epi8( vhA[0], vhB[0], vh_mask ); + casti_m256i( state, 1 ) = _mm256_blendv_epi8( vhA[1], vhB[1], vh_mask ); + casti_m256i( state, 2 ) = _mm256_blendv_epi8( vhA[2], vhB[2], vh_mask ); + casti_m256i( state, 3 ) = _mm256_blendv_epi8( vhA[3], vhB[3], vh_mask ); } int scanhash_anime_4way( struct work *work, uint32_t 
max_nonce, uint64_t *hashes_done, struct thr_info *mythr ) { - uint32_t hash[4*8] __attribute__ ((aligned (64))); + uint64_t hash64[4*4] __attribute__ ((aligned (64))); uint32_t vdata[20*4] __attribute__ ((aligned (64))); + uint32_t lane_hash[8] __attribute__ ((aligned (64))); + uint64_t *hash64_q3 = &(hash64[3*4]); uint32_t *pdata = work->data; uint32_t *ptarget = work->target; + const uint64_t targ64_q3 = ((uint64_t*)ptarget)[3]; uint32_t n = pdata[19]; const uint32_t first_nonce = pdata[19]; const uint32_t last_nonce = max_nonce - 4; - __m256i *noncev = (__m256i*)vdata + 9; // aligned + __m256i *noncev = (__m256i*)vdata + 9; const int thr_id = mythr->id; + const bool bench = opt_benchmark; mm256_bswap32_intrlv80_4x64( vdata, pdata ); *noncev = mm256_intrlv_blend_32( _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev ); - do { - anime_4way_hash( hash, vdata ); + anime_4way_hash( hash64, vdata ); - for ( int i = 0; i < 4; i++ ) - if ( valid_hash( hash+(i<<3), ptarget ) && !opt_benchmark ) + for ( int lane = 0; lane < 4; lane++ ) + if ( unlikely( hash64_q3[ lane ] <= targ64_q3 && !bench ) ) { - pdata[19] = bswap_32( n+i ); - submit_solution( work, hash+(i<<3), mythr ); + extr_lane_4x64( lane_hash, hash64, lane, 256 ); + if ( valid_hash( lane_hash, ptarget ) ) + { + pdata[19] = bswap_32( n + lane ); + submit_lane_solution( work, lane_hash, mythr, lane ); + } } *noncev = _mm256_add_epi32( *noncev, m256_const1_64( 0x0000000400000000 ) ); n += 4; - } while ( ( n < last_nonce ) && !work_restart[thr_id].restart ); + } while ( likely( ( n < last_nonce ) && !work_restart[thr_id].restart ) ); pdata[19] = n; *hashes_done = n - first_nonce; return 0; diff --git a/algo/quark/anime-gate.c b/algo/quark/anime-gate.c index 53a06e1d..5e03c4aa 100644 --- a/algo/quark/anime-gate.c +++ b/algo/quark/anime-gate.c @@ -2,8 +2,10 @@ bool register_anime_algo( algo_gate_t* gate ) { -#if defined (ANIME_4WAY) - init_anime_4way_ctx(); +#if defined (ANIME_8WAY) + gate->scanhash = 
(void*)&scanhash_anime_8way; + gate->hash = (void*)&anime_8way_hash; +#elif defined (ANIME_4WAY) gate->scanhash = (void*)&scanhash_anime_4way; gate->hash = (void*)&anime_4way_hash; #else @@ -11,7 +13,7 @@ bool register_anime_algo( algo_gate_t* gate ) gate->scanhash = (void*)&scanhash_anime; gate->hash = (void*)&anime_hash; #endif - gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT; + gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT; return true; }; diff --git a/algo/quark/anime-gate.h b/algo/quark/anime-gate.h index fdf34b4c..a7b08376 100644 --- a/algo/quark/anime-gate.h +++ b/algo/quark/anime-gate.h @@ -4,18 +4,25 @@ #include "algo-gate-api.h" #include -#if defined(__AVX2__) && defined(__AES__) - #define ANIME_4WAY +#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) + #define ANIME_8WAY 1 +#elif defined(__AVX2__) && defined(__AES__) + #define ANIME_4WAY 1 #endif bool register_anime_algo( algo_gate_t* gate ); -#if defined(ANIME_4WAY) +#if defined(ANIME_8WAY) + +void anime_8way_hash( void *state, const void *input ); +int scanhash_anime_8way( struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ); + +#elif defined(ANIME_4WAY) void anime_4way_hash( void *state, const void *input ); int scanhash_anime_4way( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ); -void init_anime_4way_ctx(); #endif diff --git a/algo/quark/hmq1725-4way.c b/algo/quark/hmq1725-4way.c index 22d249f2..fd78d5b4 100644 --- a/algo/quark/hmq1725-4way.c +++ b/algo/quark/hmq1725-4way.c @@ -81,127 +81,68 @@ extern void hmq1725_8way_hash(void *state, const void *input) __m512i* vhB = (__m512i*)vhashB; __m512i* vhC = (__m512i*)vhashC; - bmw512_8way_init( &ctx.bmw ); - bmw512_8way_update( &ctx.bmw, input, 80 ); - bmw512_8way_close( &ctx.bmw, vhash ); - - dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, - vhash ); - - 
sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash0, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash0 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash1, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash1 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash2, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash2 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash3, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash3 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash4, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash4 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash5, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash5 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash6, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash6 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash7, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash7 ); - - intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, - hash7 ); + bmw512_8way_full( &ctx.bmw, vhash, input, 80 ); + + dintrlv_8x64_512( hash0, hash1, hash2, hash3, + hash4, hash5, hash6, hash7, vhash ); + + sph_whirlpool512_full( &ctx.whirlpool, hash0, hash0, 64 ); + sph_whirlpool512_full( &ctx.whirlpool, hash1, hash1, 64 ); + sph_whirlpool512_full( &ctx.whirlpool, hash2, hash2, 64 ); + sph_whirlpool512_full( &ctx.whirlpool, hash3, hash3, 64 ); + sph_whirlpool512_full( &ctx.whirlpool, hash4, hash4, 64 ); + sph_whirlpool512_full( &ctx.whirlpool, hash5, hash5, 64 ); + sph_whirlpool512_full( &ctx.whirlpool, hash6, hash6, 64 ); + sph_whirlpool512_full( &ctx.whirlpool, hash7, hash7, 64 ); + + intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, + hash4, hash5, hash6, hash7 ); + vh_mask = _mm512_cmpeq_epi64_mask( _mm512_and_si512( vh[0], vmask ), m512_zero ); // A - #if defined(__VAES__) 
rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 ); - if ( likely( ( vh_mask & 0x0f ) != 0x0f ) ) - { - groestl512_4way_init( &ctx.groestl, 64 ); - groestl512_4way_update_close( &ctx.groestl, vhashA, vhashA, 512 ); - } - if ( likely( ( vh_mask & 0xf0 ) != 0xf0 ) ) - { - groestl512_4way_init( &ctx.groestl, 64 ); - groestl512_4way_update_close( &ctx.groestl, vhashB, vhashB, 512 ); - } + if ( ( vh_mask & 0x0f ) != 0x0f ) + groestl512_4way_full( &ctx.groestl, vhashA, vhashA, 64 ); + if ( ( vh_mask & 0xf0 ) != 0xf0 ) + groestl512_4way_full( &ctx.groestl, vhashB, vhashB, 64 ); + rintrlv_4x128_8x64( vhashC, vhashA, vhashB, 512 ); #else - dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, - vhash ); + dintrlv_8x64_512( hash0, hash1, hash2, hash3, + hash4, hash5, hash6, hash7, vhash ); if ( hash0[0] & mask ) - { - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash0, - (char*)hash0, 512 ); - } + groestl512_full( &ctx.groestl, (char*)hash0, (char*)hash0, 512 ); if ( hash1[0] & mask ) - { - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash1, - (char*)hash1, 512 ); - } + groestl512_full( &ctx.groestl, (char*)hash1, (char*)hash1, 512 ); if ( hash2[0] & mask ) - { - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash2, - (char*)hash2, 512 ); - } + groestl512_full( &ctx.groestl, (char*)hash2, (char*)hash2, 512 ); if ( hash3[0] & mask ) - { - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash3, - (char*)hash3, 512 ); - } + groestl512_full( &ctx.groestl, (char*)hash3, (char*)hash3, 512 ); if ( hash4[0] & mask ) - { - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash4, - (char*)hash4, 512 ); - } + groestl512_full( &ctx.groestl, (char*)hash4, (char*)hash4, 512 ); if ( hash5[0] & mask ) - { - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash5, - 
(char*)hash5, 512 ); - } + groestl512_full( &ctx.groestl, (char*)hash5, (char*)hash5, 512 ); if ( hash6[0] & mask ) - { - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash6, - (char*)hash6, 512 ); - } + groestl512_full( &ctx.groestl, (char*)hash6, (char*)hash6, 512 ); if ( hash7[0] & mask ) - { - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash7, - (char*)hash7, 512 ); - } + groestl512_full( &ctx.groestl, (char*)hash7, (char*)hash7, 512 ); - intrlv_8x64_512( vhashC, hash0, hash1, hash2, hash3, hash4, hash5, hash6, - hash7 ); + intrlv_8x64_512( vhashC, hash0, hash1, hash2, hash3, + hash4, hash5, hash6, hash7 ); #endif // B if ( likely( vh_mask & 0xff ) ) - { - skein512_8way_init( &ctx.skein ); - skein512_8way_update( &ctx.skein, vhash, 64 ); - skein512_8way_close( &ctx.skein, vhashB ); - } + skein512_8way_full( &ctx.skein, vhashB, vhash, 64 ); mm512_blend_hash_8x64( vh, vhC, vhB, vh_mask ); @@ -216,32 +157,21 @@ extern void hmq1725_8way_hash(void *state, const void *input) vh_mask = _mm512_cmpeq_epi64_mask( _mm512_and_si512( vh[0], vmask ), m512_zero ); - if ( likely( ( vh_mask & 0xff ) != 0xff ) ) - { - blake512_8way_init( &ctx.blake ); - blake512_8way_update( &ctx.blake, vhash, 64 ); - blake512_8way_close( &ctx.blake, vhashA ); - } - - if ( likely( vh_mask & 0xff ) ) - { - bmw512_8way_init( &ctx.bmw ); - bmw512_8way_update( &ctx.bmw, vhash, 64 ); - bmw512_8way_close( &ctx.bmw, vhashB ); - } + // A + if ( ( vh_mask & 0xff ) != 0xff ) + blake512_8way_full( &ctx.blake, vhashA, vhash, 64 ); + // B + if ( vh_mask & 0xff ) + bmw512_8way_full( &ctx.bmw, vhashB, vhash, 64 ); mm512_blend_hash_8x64( vh, vhA, vhB, vh_mask ); rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 ); - luffa_4way_init( &ctx.luffa, 512 ); - luffa_4way_update_close( &ctx.luffa, vhashA, vhashA, 64 ); - luffa_4way_init( &ctx.luffa, 512 ); - luffa_4way_update_close( &ctx.luffa, vhashB, vhashB, 64 ); + luffa512_4way_full( 
&ctx.luffa, vhashA, vhashA, 64 ); + luffa512_4way_full( &ctx.luffa, vhashB, vhashB, 64 ); - cube_4way_init( &ctx.cube, 512, 16, 32 ); - cube_4way_update_close( &ctx.cube, vhashA, vhashA, 64 ); - cube_4way_init( &ctx.cube, 512, 16, 32 ); - cube_4way_update_close( &ctx.cube, vhashB, vhashB, 64 ); + cube_4way_full( &ctx.cube, vhashA, 512, vhashA, 64 ); + cube_4way_full( &ctx.cube, vhashB, 512, vhashB, 64 ); rintrlv_4x128_8x64( vhash, vhashA, vhashB, 512 ); vh_mask = _mm512_cmpeq_epi64_mask( _mm512_and_si512( vh[0], vmask ), @@ -267,114 +197,60 @@ extern void hmq1725_8way_hash(void *state, const void *input) rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 ); - shavite512_4way_init( &ctx.shavite ); - shavite512_4way_update_close( &ctx.shavite, vhashA, vhashA, 64 ); - shavite512_4way_init( &ctx.shavite ); - shavite512_4way_update_close( &ctx.shavite, vhashB, vhashB, 64 ); + shavite512_4way_full( &ctx.shavite, vhashA, vhashA, 64 ); + shavite512_4way_full( &ctx.shavite, vhashB, vhashB, 64 ); - rintrlv_4x128_8x64( vhash, vhashA, vhashB, 512 ); - #else - dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, - vhash ); - - sph_shavite512_init( &ctx.shavite ); - sph_shavite512 ( &ctx.shavite, hash0, 64 ); - sph_shavite512_close( &ctx.shavite, hash0 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512 ( &ctx.shavite, hash1, 64 ); - sph_shavite512_close( &ctx.shavite, hash1 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512 ( &ctx.shavite, hash2, 64 ); - sph_shavite512_close( &ctx.shavite, hash2 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512 ( &ctx.shavite, hash3, 64 ); - sph_shavite512_close( &ctx.shavite, hash3 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512 ( &ctx.shavite, hash4, 64 ); - sph_shavite512_close( &ctx.shavite, hash4 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512 ( &ctx.shavite, hash5, 64 ); - sph_shavite512_close( &ctx.shavite, hash5 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512 ( 
&ctx.shavite, hash6, 64 ); - sph_shavite512_close( &ctx.shavite, hash6 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512 ( &ctx.shavite, hash7, 64 ); - sph_shavite512_close( &ctx.shavite, hash7 ); + dintrlv_8x64_512( hash0, hash1, hash2, hash3, + hash4, hash5, hash6, hash7, vhash ); + + shavite512_full( &ctx.shavite, hash0, hash0, 64 ); + shavite512_full( &ctx.shavite, hash1, hash1, 64 ); + shavite512_full( &ctx.shavite, hash2, hash2, 64 ); + shavite512_full( &ctx.shavite, hash3, hash3, 64 ); + shavite512_full( &ctx.shavite, hash4, hash4, 64 ); + shavite512_full( &ctx.shavite, hash5, hash5, 64 ); + shavite512_full( &ctx.shavite, hash6, hash6, 64 ); + shavite512_full( &ctx.shavite, hash7, hash7, 64 ); intrlv_4x128_512( vhashA, hash0, hash1, hash2, hash3 ); intrlv_4x128_512( vhashB, hash4, hash5, hash6, hash7 ); #endif - simd_4way_init( &ctx.simd, 512 ); - simd_4way_update_close( &ctx.simd, vhashA, vhashA, 512 ); - simd_4way_init( &ctx.simd, 512 ); - simd_4way_update_close( &ctx.simd, vhashB, vhashB, 512 ); + simd512_4way_full( &ctx.simd, vhashA, vhashA, 64 ); + simd512_4way_full( &ctx.simd, vhashB, vhashB, 64 ); rintrlv_4x128_8x64( vhash, vhashA, vhashB, 512 ); vh_mask = _mm512_cmpeq_epi64_mask( _mm512_and_si512( vh[0], vmask ), m512_zero ); - dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, - vhash ); + dintrlv_8x64_512( hash0, hash1, hash2, hash3, + hash4, hash5, hash6, hash7, vhash ); // 4x32 for haval - intrlv_8x32_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, - hash7 ); + intrlv_8x32_512( vhash, hash0, hash1, hash2, hash3, + hash4, hash5, hash6, hash7 ); // A if ( hash0[0] & mask ) - { - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash0, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash0 ); - } + sph_whirlpool512_full( &ctx.whirlpool, hash0, hash0, 64 ); if ( hash1[0] & mask ) - { - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash1, 64 ); - 
sph_whirlpool_close( &ctx.whirlpool, hash1 ); - } + sph_whirlpool512_full( &ctx.whirlpool, hash1, hash1, 64 ); if ( hash2[0] & mask ) - { - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash2, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash2 ); - } + sph_whirlpool512_full( &ctx.whirlpool, hash2, hash2, 64 ); if ( hash3[0] & mask ) - { - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash3, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash3 ); - } + sph_whirlpool512_full( &ctx.whirlpool, hash3, hash3, 64 ); if ( hash4[0] & mask ) - { - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash4, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash4 ); - } + sph_whirlpool512_full( &ctx.whirlpool, hash4, hash4, 64 ); if ( hash5[0] & mask ) - { - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash5, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash5 ); - } + sph_whirlpool512_full( &ctx.whirlpool, hash5, hash5, 64 ); if ( hash6[0] & mask ) - { - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash6, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash6 ); - } + sph_whirlpool512_full( &ctx.whirlpool, hash6, hash6, 64 ); if ( hash7[0] & mask ) - { - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash7, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash7 ); - } + sph_whirlpool512_full( &ctx.whirlpool, hash7, hash7, 64 ); - intrlv_8x64_512( vhashA, hash0, hash1, hash2, hash3, hash4, hash5, hash6, - hash7 ); + intrlv_8x64_512( vhashA, hash0, hash1, hash2, hash3, + hash4, hash5, hash6, hash7 ); // B if ( likely( vh_mask & 0xff ) ) @@ -392,51 +268,39 @@ extern void hmq1725_8way_hash(void *state, const void *input) rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 ); - echo_4way_init( &ctx.echo, 512 ); - echo_4way_update_close( &ctx.echo, vhashA, vhashA, 512 ); - echo_4way_init( &ctx.echo, 512 ); - echo_4way_update_close( &ctx.echo, vhashB, vhashB, 
512 ); + echo_4way_full( &ctx.echo, vhashA, 512, vhashA, 64 ); + echo_4way_full( &ctx.echo, vhashB, 512, vhashB, 64 ); rintrlv_4x128_8x64( vhash, vhashA, vhashB, 512 ); #else - dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, - vhash ); - - init_echo( &ctx.echo, 512 ); - update_final_echo( &ctx.echo, (BitSequence *)hash0, - (const BitSequence *)hash0, 512 ); - init_echo( &ctx.echo, 512 ); - update_final_echo( &ctx.echo, (BitSequence *)hash1, - (const BitSequence *)hash1, 512 ); - init_echo( &ctx.echo, 512 ); - update_final_echo( &ctx.echo, (BitSequence *)hash2, - (const BitSequence *)hash2, 512 ); - init_echo( &ctx.echo, 512 ); - update_final_echo( &ctx.echo, (BitSequence *)hash3, - (const BitSequence *)hash3, 512 ); - init_echo( &ctx.echo, 512 ); - update_final_echo( &ctx.echo, (BitSequence *)hash4, - (const BitSequence *)hash4, 512 ); - init_echo( &ctx.echo, 512 ); - update_final_echo( &ctx.echo, (BitSequence *)hash5, - (const BitSequence *)hash5, 512 ); - init_echo( &ctx.echo, 512 ); - update_final_echo( &ctx.echo, (BitSequence *)hash6, - (const BitSequence *)hash6, 512 ); - init_echo( &ctx.echo, 512 ); - update_final_echo( &ctx.echo, (BitSequence *)hash7, - (const BitSequence *)hash7, 512 ); - - intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, - hash7 ); + dintrlv_8x64_512( hash0, hash1, hash2, hash3, + hash4, hash5, hash6, hash7, vhash ); + + echo_full( &ctx.echo, (BitSequence *)hash0, 512, + (const BitSequence *)hash0, 64 ); + echo_full( &ctx.echo, (BitSequence *)hash1, 512, + (const BitSequence *)hash1, 64 ); + echo_full( &ctx.echo, (BitSequence *)hash2, 512, + (const BitSequence *)hash2, 64 ); + echo_full( &ctx.echo, (BitSequence *)hash3, 512, + (const BitSequence *)hash3, 64 ); + echo_full( &ctx.echo, (BitSequence *)hash4, 512, + (const BitSequence *)hash4, 64 ); + echo_full( &ctx.echo, (BitSequence *)hash5, 512, + (const BitSequence *)hash5, 64 ); + echo_full( &ctx.echo, (BitSequence *)hash6, 512, + (const 
BitSequence *)hash6, 64 ); + echo_full( &ctx.echo, (BitSequence *)hash7, 512, + (const BitSequence *)hash7, 64 ); + + intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, + hash4, hash5, hash6, hash7 ); #endif - blake512_8way_init( &ctx.blake ); - blake512_8way_update( &ctx.blake, vhash, 64 ); - blake512_8way_close( &ctx.blake, vhash ); + blake512_8way_full( &ctx.blake, vhash, vhash, 64 ); vh_mask = _mm512_cmpeq_epi64_mask( _mm512_and_si512( vh[0], vmask ), m512_zero ); @@ -447,74 +311,36 @@ extern void hmq1725_8way_hash(void *state, const void *input) rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 ); if ( likely( ( vh_mask & 0x0f ) != 0x0f ) ) - { - shavite512_4way_init( &ctx.shavite ); - shavite512_4way_update_close( &ctx.shavite, vhashA, vhashA, 64 ); - } + shavite512_4way_full( &ctx.shavite, vhashA, vhashA, 64 ); if ( likely( ( vh_mask & 0xf0 ) != 0xf0 ) ) - { - shavite512_4way_init( &ctx.shavite ); - shavite512_4way_update_close( &ctx.shavite, vhashB, vhashB, 64 ); - } + shavite512_4way_full( &ctx.shavite, vhashB, vhashB, 64 ); rintrlv_4x128_8x64( vhashC, vhashA, vhashB, 512 ); #else - dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, - vhash ); + dintrlv_8x64_512( hash0, hash1, hash2, hash3, + hash4, hash5, hash6, hash7, vhash ); if ( hash0[0] & mask ) - { - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash0, 64 ); // - sph_shavite512_close( &ctx.shavite, hash0 ); //8 - } + shavite512_full( &ctx.shavite, hash0, hash0, 64 ); // if ( hash1[0] & mask ) - { - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash1, 64 ); // - sph_shavite512_close( &ctx.shavite, hash1 ); //8 - } + shavite512_full( &ctx.shavite, hash1, hash1, 64 ); // if ( hash2[0] & mask ) - { - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash2, 64 ); // - sph_shavite512_close( &ctx.shavite, hash2 ); //8 - } + shavite512_full( &ctx.shavite, hash2, hash2, 64 ); // if ( hash3[0] & mask ) - { - 
sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash3, 64 ); // - sph_shavite512_close( &ctx.shavite, hash3 ); //8 - } + shavite512_full( &ctx.shavite, hash3, hash3, 64 ); // if ( hash4[0] & mask ) - { - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash4, 64 ); // - sph_shavite512_close( &ctx.shavite, hash4 ); //8 - } + shavite512_full( &ctx.shavite, hash4, hash4, 64 ); // if ( hash5[0] & mask ) - { - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash5, 64 ); // - sph_shavite512_close( &ctx.shavite, hash5 ); //8 - } + shavite512_full( &ctx.shavite, hash5, hash5, 64 ); // if ( hash6[0] & mask ) - { - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash6, 64 ); // - sph_shavite512_close( &ctx.shavite, hash6 ); //8 - } + shavite512_full( &ctx.shavite, hash6, hash6, 64 ); // if ( hash7[0] & mask ) - { - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash7, 64 ); // - sph_shavite512_close( &ctx.shavite, hash7 ); //8 - } + shavite512_full( &ctx.shavite, hash7, hash7, 64 ); // - intrlv_8x64_512( vhashC, hash0, hash1, hash2, hash3, hash4, hash5, hash6, - hash7 ); + intrlv_8x64_512( vhashC, hash0, hash1, hash2, hash3, + hash4, hash5, hash6, hash7 ); #endif @@ -522,15 +348,9 @@ extern void hmq1725_8way_hash(void *state, const void *input) rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 ); if ( likely( vh_mask & 0x0f ) ) - { - luffa_4way_init( &ctx.luffa, 512 ); - luffa_4way_update_close( &ctx.luffa, vhashA, vhashA, 64 ); - } + luffa512_4way_full( &ctx.luffa, vhashA, vhashA, 64 ); if ( likely( vh_mask & 0xf0 ) ) - { - luffa_4way_init( &ctx.luffa, 512 ); - luffa_4way_update_close( &ctx.luffa, vhash, vhashB, 64 ); - } + luffa512_4way_full( &ctx.luffa, vhash, vhashB, 64 ); rintrlv_4x128_8x64( vhashB, vhashA, vhash, 512 ); @@ -540,110 +360,64 @@ extern void hmq1725_8way_hash(void *state, const void *input) hamsi512_8way_update( &ctx.hamsi, vhash, 64 ); 
hamsi512_8way_close( &ctx.hamsi, vhash ); - dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, - vhash ); - - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash0, 64 ); - sph_fugue512_close( &ctx.fugue, hash0 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash1, 64 ); - sph_fugue512_close( &ctx.fugue, hash1 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash2, 64 ); - sph_fugue512_close( &ctx.fugue, hash2 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash3, 64 ); - sph_fugue512_close( &ctx.fugue, hash3 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash4, 64 ); - sph_fugue512_close( &ctx.fugue, hash4 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash5, 64 ); - sph_fugue512_close( &ctx.fugue, hash5 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash6, 64 ); - sph_fugue512_close( &ctx.fugue, hash6 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash7, 64 ); - sph_fugue512_close( &ctx.fugue, hash7 ); - - intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, - hash7 ); + dintrlv_8x64_512( hash0, hash1, hash2, hash3, + hash4, hash5, hash6, hash7, vhash ); + + sph_fugue512_full( &ctx.fugue, hash0, hash0, 64 ); + sph_fugue512_full( &ctx.fugue, hash1, hash1, 64 ); + sph_fugue512_full( &ctx.fugue, hash2, hash2, 64 ); + sph_fugue512_full( &ctx.fugue, hash3, hash3, 64 ); + sph_fugue512_full( &ctx.fugue, hash4, hash4, 64 ); + sph_fugue512_full( &ctx.fugue, hash5, hash5, 64 ); + sph_fugue512_full( &ctx.fugue, hash6, hash6, 64 ); + sph_fugue512_full( &ctx.fugue, hash7, hash7, 64 ); + + intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, + hash4, hash5, hash6, hash7 ); vh_mask = _mm512_cmpeq_epi64_mask( _mm512_and_si512( vh[0], vmask ), m512_zero ); // A #if defined(__VAES__) - rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 ); + rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 ); if ( likely( ( 
vh_mask & 0x0f ) != 0x0f ) ) - { - echo_4way_init( &ctx.echo, 512 ); - echo_4way_update_close( &ctx.echo, vhashA, vhashA, 512 ); - } + echo_4way_full( &ctx.echo, vhashA, 512, vhashA, 64 ); if ( likely( ( vh_mask & 0xf0 ) != 0xf0 ) ) - { - echo_4way_init( &ctx.echo, 512 ); - echo_4way_update_close( &ctx.echo, vhashB, vhashB, 512 ); - } + echo_4way_full( &ctx.echo, vhashB, 512, vhashB, 64 ); rintrlv_4x128_8x64( vhashC, vhashA, vhashB, 512 ); #else if ( hash0[0] & mask ) //4 - { - init_echo( &ctx.echo, 512 ); - update_final_echo( &ctx.echo, (BitSequence *)hash0, - (const BitSequence *)hash0, 512 ); - } + echo_full( &ctx.echo, (BitSequence *)hash0, 512, + (const BitSequence *)hash0, 64 ); if ( hash1[0] & mask ) //4 - { - init_echo( &ctx.echo, 512 ); - update_final_echo( &ctx.echo, (BitSequence *)hash1, - (const BitSequence *)hash1, 512 ); - } + echo_full( &ctx.echo, (BitSequence *)hash1, 512, + (const BitSequence *)hash1, 64 ); if ( hash2[0] & mask ) //4 - { - init_echo( &ctx.echo, 512 ); - update_final_echo( &ctx.echo, (BitSequence *)hash2, - (const BitSequence *)hash2, 512 ); - } + echo_full( &ctx.echo, (BitSequence *)hash2, 512, + (const BitSequence *)hash2, 64 ); if ( hash3[0] & mask ) //4 - { - init_echo( &ctx.echo, 512 ); - update_final_echo( &ctx.echo, (BitSequence *)hash3, - (const BitSequence *)hash3, 512 ); - } + echo_full( &ctx.echo, (BitSequence *)hash3, 512, + (const BitSequence *)hash3, 64 ); if ( hash4[0] & mask ) //4 - { - init_echo( &ctx.echo, 512 ); - update_final_echo( &ctx.echo, (BitSequence *)hash4, - (const BitSequence *)hash4, 512 ); - } + echo_full( &ctx.echo, (BitSequence *)hash4, 512, + (const BitSequence *)hash4, 64 ); if ( hash5[0] & mask ) //4 - { - init_echo( &ctx.echo, 512 ); - update_final_echo( &ctx.echo, (BitSequence *)hash5, - (const BitSequence *)hash5, 512 ); - } + echo_full( &ctx.echo, (BitSequence *)hash5, 512, + (const BitSequence *)hash5, 64 ); if ( hash6[0] & mask ) //4 - { - init_echo( &ctx.echo, 512 ); - update_final_echo( 
&ctx.echo, (BitSequence *)hash6, - (const BitSequence *)hash6, 512 ); - } + echo_full( &ctx.echo, (BitSequence *)hash6, 512, + (const BitSequence *)hash6, 64 ); if ( hash7[0] & mask ) //4 - { - init_echo( &ctx.echo, 512 ); - update_final_echo( &ctx.echo, (BitSequence *)hash7, - (const BitSequence *)hash7, 512 ); - } + echo_full( &ctx.echo, (BitSequence *)hash7, 512, + (const BitSequence *)hash7, 64 ); - intrlv_8x64_512( vhashC, hash0, hash1, hash2, hash3, hash4, hash5, hash6, - hash7 ); + intrlv_8x64_512( vhashC, hash0, hash1, hash2, hash3, + hash4, hash5, hash6, hash7 ); #endif @@ -651,15 +425,9 @@ extern void hmq1725_8way_hash(void *state, const void *input) rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 ); if ( likely( vh_mask & 0x0f ) ) - { - simd_4way_init( &ctx.simd, 512 ); - simd_4way_update_close( &ctx.simd, vhashA, vhashA, 512 ); - } + simd512_4way_full( &ctx.simd, vhashA, vhashA, 64 ); if ( likely( vh_mask & 0xf0 ) ) - { - simd_4way_init( &ctx.simd, 512 ); - simd_4way_update_close( &ctx.simd, vhash, vhashB, 512 ); - } + simd512_4way_full( &ctx.simd, vhash, vhashB, 64 ); rintrlv_4x128_8x64( vhashB, vhashA, vhash, 512 ); @@ -671,92 +439,44 @@ extern void hmq1725_8way_hash(void *state, const void *input) shabal512_8way_update( &ctx.shabal, vhashA, 64 ); shabal512_8way_close( &ctx.shabal, vhash ); - dintrlv_8x32_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, - vhash ); - - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash0, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash0 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash1, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash1 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash2, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash2 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash3, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash3 ); - sph_whirlpool_init( &ctx.whirlpool ); - 
sph_whirlpool( &ctx.whirlpool, hash4, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash4 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash5, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash5 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash6, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash6 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash7, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash7 ); + dintrlv_8x32_512( hash0, hash1, hash2, hash3, + hash4, hash5, hash6, hash7, vhash ); + + sph_whirlpool512_full( &ctx.whirlpool, hash0, hash0, 64 ); + sph_whirlpool512_full( &ctx.whirlpool, hash1, hash1, 64 ); + sph_whirlpool512_full( &ctx.whirlpool, hash2, hash2, 64 ); + sph_whirlpool512_full( &ctx.whirlpool, hash3, hash3, 64 ); + sph_whirlpool512_full( &ctx.whirlpool, hash4, hash4, 64 ); + sph_whirlpool512_full( &ctx.whirlpool, hash5, hash5, 64 ); + sph_whirlpool512_full( &ctx.whirlpool, hash6, hash6, 64 ); + sph_whirlpool512_full( &ctx.whirlpool, hash7, hash7, 64 ); // A - intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, - hash7 ); + intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, + hash4, hash5, hash6, hash7 ); vh_mask = _mm512_cmpeq_epi64_mask( _mm512_and_si512( vh[0], vmask ), m512_zero ); if ( hash0[0] & mask ) - { - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash0, 64 ); - sph_fugue512_close( &ctx.fugue, hash0 ); - } + sph_fugue512_full( &ctx.fugue, hash0, hash0, 64 ); if ( hash1[0] & mask ) - { - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash1, 64 ); - sph_fugue512_close( &ctx.fugue, hash1 ); - } + sph_fugue512_full( &ctx.fugue, hash1, hash1, 64 ); if ( hash2[0] & mask ) - { - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash2, 64 ); - sph_fugue512_close( &ctx.fugue, hash2 ); - } + sph_fugue512_full( &ctx.fugue, hash2, hash2, 64 ); if ( hash3[0] & mask ) - { - sph_fugue512_init( &ctx.fugue 
); - sph_fugue512( &ctx.fugue, hash3, 64 ); - sph_fugue512_close( &ctx.fugue, hash3 ); - } + sph_fugue512_full( &ctx.fugue, hash3, hash3, 64 ); if ( hash4[0] & mask ) - { - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash4, 64 ); - sph_fugue512_close( &ctx.fugue, hash4 ); - } + sph_fugue512_full( &ctx.fugue, hash4, hash4, 64 ); if ( hash5[0] & mask ) - { - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash5, 64 ); - sph_fugue512_close( &ctx.fugue, hash5 ); - } + sph_fugue512_full( &ctx.fugue, hash5, hash5, 64 ); if ( hash6[0] & mask ) - { - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash6, 64 ); - sph_fugue512_close( &ctx.fugue, hash6 ); - } + sph_fugue512_full( &ctx.fugue, hash6, hash6, 64 ); if ( hash7[0] & mask ) - { - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash7, 64 ); - sph_fugue512_close( &ctx.fugue, hash7 ); - } + sph_fugue512_full( &ctx.fugue, hash7, hash7, 64 ); - intrlv_8x64_512( vhashA, hash0, hash1, hash2, hash3, hash4, hash5, hash6, - hash7 ); + intrlv_8x64_512( vhashA, hash0, hash1, hash2, hash3, + hash4, hash5, hash6, hash7 ); // B if ( likely( vh_mask & 0xff ) ) @@ -770,39 +490,29 @@ extern void hmq1725_8way_hash(void *state, const void *input) #if defined(__VAES__) - rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 ); + rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 ); - groestl512_4way_init( &ctx.groestl, 64 ); - groestl512_4way_update_close( &ctx.groestl, vhashA, vhashA, 512 ); - groestl512_4way_init( &ctx.groestl, 64 ); - groestl512_4way_update_close( &ctx.groestl, vhashB, vhashB, 512 ); + groestl512_4way_full( &ctx.groestl, vhashA, vhashA, 64 ); + groestl512_4way_full( &ctx.groestl, vhashB, vhashB, 64 ); - rintrlv_4x128_8x64( vhash, vhashA, vhashB, 512 ); + rintrlv_4x128_8x64( vhash, vhashA, vhashB, 512 ); #else - dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, - vhash ); - - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, 
(char*)hash0, (char*)hash0, 512 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash4, (char*)hash4, 512 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash5, (char*)hash5, 512 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash6, (char*)hash6, 512 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash7, (char*)hash7, 512 ); - - intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, - hash7 ); + dintrlv_8x64_512( hash0, hash1, hash2, hash3, + hash4, hash5, hash6, hash7, vhash ); + + groestl512_full( &ctx.groestl, (char*)hash0, (char*)hash0, 512 ); + groestl512_full( &ctx.groestl, (char*)hash1, (char*)hash1, 512 ); + groestl512_full( &ctx.groestl, (char*)hash2, (char*)hash2, 512 ); + groestl512_full( &ctx.groestl, (char*)hash3, (char*)hash3, 512 ); + groestl512_full( &ctx.groestl, (char*)hash4, (char*)hash4, 512 ); + groestl512_full( &ctx.groestl, (char*)hash5, (char*)hash5, 512 ); + groestl512_full( &ctx.groestl, (char*)hash6, (char*)hash6, 512 ); + groestl512_full( &ctx.groestl, (char*)hash7, (char*)hash7, 512 ); + + intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, + hash4, hash5, hash6, hash7 ); #endif @@ -812,8 +522,8 @@ extern void hmq1725_8way_hash(void *state, const void *input) vh_mask = _mm512_cmpeq_epi64_mask( _mm512_and_si512( vh[0], vmask ), m512_zero ); - dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, - vhash ); + dintrlv_8x64_512( hash0, hash1, hash2, hash3, + hash4, hash5, hash6, hash7, vhash ); // A if ( likely( ( 
vh_mask & 0xff ) != 0xff ) ) @@ -829,53 +539,21 @@ extern void hmq1725_8way_hash(void *state, const void *input) // B if ( !( hash0[0] & mask ) ) - { - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash0, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash0 ); - } + sph_whirlpool512_full( &ctx.whirlpool, hash0, hash0, 64 ); if ( !( hash1[0] & mask ) ) - { - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash1, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash1 ); - } + sph_whirlpool512_full( &ctx.whirlpool, hash1, hash1, 64 ); if ( !( hash2[0] & mask ) ) - { - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash2, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash2 ); - } + sph_whirlpool512_full( &ctx.whirlpool, hash2, hash2, 64 ); if ( !( hash3[0] & mask ) ) - { - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash3, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash3 ); - } + sph_whirlpool512_full( &ctx.whirlpool, hash3, hash3, 64 ); if ( !( hash4[0] & mask ) ) - { - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash4, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash4 ); - } + sph_whirlpool512_full( &ctx.whirlpool, hash4, hash4, 64 ); if ( !( hash5[0] & mask ) ) - { - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash5, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash5 ); - } + sph_whirlpool512_full( &ctx.whirlpool, hash5, hash5, 64 ); if ( !( hash6[0] & mask ) ) - { - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash6, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash6 ); - } + sph_whirlpool512_full( &ctx.whirlpool, hash6, hash6, 64 ); if ( !( hash7[0] & mask ) ) - { - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash7, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash7 ); - } + sph_whirlpool512_full( &ctx.whirlpool, hash7, hash7, 64 ); intrlv_8x64_512( vhashB, hash0, 
hash1, hash2, hash3, hash4, hash5, hash6, hash7 ); @@ -889,41 +567,44 @@ extern void hmq1725_8way_hash(void *state, const void *input) int scanhash_hmq1725_8way( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ) { - uint32_t hash[16*8] __attribute__ ((aligned (128))); + uint64_t hash64[8*8] __attribute__ ((aligned (128))); uint32_t vdata[20*8] __attribute__ ((aligned (64))); uint32_t lane_hash[8] __attribute__ ((aligned (64))); - uint32_t *hash7 = &(hash[49]); + uint64_t *hash64_q3 = &(hash64[3*8]); uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; - const uint32_t Htarg = ptarget[7]; + uint64_t *ptarget = (uint64_t*)work->target; + const uint64_t targ64_q3 = ptarget[3]; const uint32_t first_nonce = pdata[19]; uint32_t n = first_nonce; - const uint32_t last_nonce = max_nonce - 4; - __m512i *noncev = (__m512i*)vdata + 9; // aligned - int thr_id = mythr->id; + const uint32_t last_nonce = max_nonce - 8; + __m512i *noncev = (__m512i*)vdata + 9; + const int thr_id = mythr->id; + const bool bench = opt_benchmark; mm512_bswap32_intrlv80_8x64( vdata, pdata ); + *noncev = mm512_intrlv_blend_32( + _mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0, + n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev ); do { - *noncev = mm512_intrlv_blend_32( mm512_bswap_32( - _mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0, - n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev ); - - hmq1725_8way_hash( hash, vdata ); + hmq1725_8way_hash( hash64, vdata ); for ( int lane = 0; lane < 8; lane++ ) - if ( hash7[ lane<<1 ] <= Htarg ) + if ( hash64_q3[ lane ] <= targ64_q3 && !bench ) { - extr_lane_8x64( lane_hash, hash, lane, 256 ); - if ( fulltest( lane_hash, ptarget ) && !opt_benchmark ) + extr_lane_8x64( lane_hash, hash64, lane, 256 ); + if ( valid_hash( lane_hash, ptarget ) ) { - pdata[19] = n + lane; + pdata[19] = bswap_32( n + lane ); submit_lane_solution( work, lane_hash, mythr, lane ); } } + *noncev = _mm512_add_epi32( *noncev, + m512_const1_64( 
0x0000000800000000 ) ); n += 8; - } while ( ( n < last_nonce ) && !work_restart[thr_id].restart ); + } while ( likely( ( n < last_nonce ) && !work_restart[thr_id].restart ) ); + pdata[19] = n; *hashes_done = n - first_nonce; return 0; } @@ -939,7 +620,9 @@ union _hmq1725_4way_context_overlay jh512_4way_context jh; keccak512_4way_context keccak; hashState_luffa luffa; + luffa_2way_context luffa2; cubehashParam cube; + cube_2way_context cube2; sph_shavite512_context shavite; hashState_sd sd; simd_2way_context simd; @@ -956,338 +639,217 @@ typedef union _hmq1725_4way_context_overlay hmq1725_4way_context_overlay; extern void hmq1725_4way_hash(void *state, const void *input) { - uint32_t hash0 [16] __attribute__ ((aligned (64))); - uint32_t hash1 [16] __attribute__ ((aligned (64))); - uint32_t hash2 [16] __attribute__ ((aligned (64))); - uint32_t hash3 [16] __attribute__ ((aligned (64))); - uint32_t vhash [16<<2] __attribute__ ((aligned (64))); - uint32_t vhashA[16<<2] __attribute__ ((aligned (64))); - uint32_t vhashB[16<<2] __attribute__ ((aligned (64))); - hmq1725_4way_context_overlay ctx __attribute__ ((aligned (64))); - __m256i vh_mask; - const __m256i vmask = m256_const1_64( 24 ); - const uint32_t mask = 24; - __m256i* vh = (__m256i*)vhash; - __m256i* vhA = (__m256i*)vhashA; - __m256i* vhB = (__m256i*)vhashB; - - bmw512_4way_init( &ctx.bmw ); - bmw512_4way_update( &ctx.bmw, input, 80 ); - bmw512_4way_close( &ctx.bmw, vhash ); - - dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); - - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash0, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash0 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash1, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash1 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash2, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash2 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash3, 64 ); 
- sph_whirlpool_close( &ctx.whirlpool, hash3 ); + uint32_t hash0 [16] __attribute__ ((aligned (64))); + uint32_t hash1 [16] __attribute__ ((aligned (64))); + uint32_t hash2 [16] __attribute__ ((aligned (64))); + uint32_t hash3 [16] __attribute__ ((aligned (64))); + uint32_t vhash [16<<2] __attribute__ ((aligned (64))); + uint32_t vhashA[16<<2] __attribute__ ((aligned (64))); + uint32_t vhashB[16<<2] __attribute__ ((aligned (64))); + hmq1725_4way_context_overlay ctx __attribute__ ((aligned (64))); + __m256i vh_mask; + int h_mask; + const __m256i vmask = m256_const1_64( 24 ); + const uint32_t mask = 24; + __m256i* vh = (__m256i*)vhash; + __m256i* vhA = (__m256i*)vhashA; + __m256i* vhB = (__m256i*)vhashB; + + bmw512_4way_init( &ctx.bmw ); + bmw512_4way_update( &ctx.bmw, input, 80 ); + bmw512_4way_close( &ctx.bmw, vhash ); + + dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); + + sph_whirlpool512_full( &ctx.whirlpool, hash0, hash0, 64 ); + sph_whirlpool512_full( &ctx.whirlpool, hash1, hash1, 64 ); + sph_whirlpool512_full( &ctx.whirlpool, hash2, hash2, 64 ); + sph_whirlpool512_full( &ctx.whirlpool, hash3, hash3, 64 ); // first fork, A is groestl serial, B is skein parallel. 
- intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 ); + intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 ); - vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], vmask ), - m256_zero ); + vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], vmask ), m256_zero ); + h_mask = _mm256_movemask_epi8( vh_mask ); // A - if ( hash0[0] & mask ) - { - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash0, - (char*)hash0, 512 ); - } - if ( hash1[0] & mask ) - { - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash1, - (char*)hash1, 512 ); - } - if ( hash2[0] & mask ) - { - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash2, - (char*)hash2, 512 ); - } - if ( hash3[0] & mask ) - { - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash3, - (char*)hash3, 512 ); - } - - intrlv_4x64( vhashA, hash0, hash1, hash2, hash3, 512 ); + if ( hash0[0] & mask ) + groestl512_full( &ctx.groestl, (char*)hash0, (char*)hash0, 512 ); + if ( hash1[0] & mask ) + groestl512_full( &ctx.groestl, (char*)hash1, (char*)hash1, 512 ); + if ( hash2[0] & mask ) + groestl512_full( &ctx.groestl, (char*)hash2, (char*)hash2, 512 ); + if ( hash3[0] & mask ) + groestl512_full( &ctx.groestl, (char*)hash3, (char*)hash3, 512 ); + + intrlv_4x64( vhashA, hash0, hash1, hash2, hash3, 512 ); // B - if ( mm256_anybits0( vh_mask ) ) - { - skein512_4way_init( &ctx.skein ); - skein512_4way_update( &ctx.skein, vhash, 64 ); - skein512_4way_close( &ctx.skein, vhashB ); - } + if ( h_mask & 0xffffffff ) + skein512_4way_full( &ctx.skein, vhashB, vhash, 64 ); - mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask ); + mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask ); - jh512_4way_init( &ctx.jh ); - jh512_4way_update( &ctx.jh, vhash, 64 ); - jh512_4way_close( &ctx.jh, vhash ); + jh512_4way_init( &ctx.jh ); + jh512_4way_update( &ctx.jh, vhash, 64 ); + jh512_4way_close( &ctx.jh, vhash ); - 
keccak512_4way_init( &ctx.keccak ); - keccak512_4way_update( &ctx.keccak, vhash, 64 ); - keccak512_4way_close( &ctx.keccak, vhash ); + keccak512_4way_init( &ctx.keccak ); + keccak512_4way_update( &ctx.keccak, vhash, 64 ); + keccak512_4way_close( &ctx.keccak, vhash ); // second fork, A = blake parallel, B= bmw parallel. - vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], vmask ), - m256_zero ); + vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], vmask ), m256_zero ); + h_mask = _mm256_movemask_epi8( vh_mask ); - if ( mm256_anybits1( vh_mask ) ) - { - blake512_4way_init( &ctx.blake ); - blake512_4way_update( &ctx.blake, vhash, 64 ); - blake512_4way_close( &ctx.blake, vhashA ); - } + if ( ( h_mask & 0xffffffff ) != 0xffffffff ) + blake512_4way_full( &ctx.blake, vhashA, vhash, 64 ); - if ( mm256_anybits0( vh_mask ) ) - { + if ( h_mask & 0xffffffff ) + { bmw512_4way_init( &ctx.bmw ); bmw512_4way_update( &ctx.bmw, vhash, 64 ); bmw512_4way_close( &ctx.bmw, vhashB ); - } + } - mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask ); + mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask ); - dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); - - init_luffa( &ctx.luffa, 512 ); - update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0, - (const BitSequence*)hash0, 64 ); - init_luffa( &ctx.luffa, 512 ); - update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1, - (const BitSequence*)hash1, 64 ); - init_luffa( &ctx.luffa, 512 ); - update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2, - (const BitSequence*)hash2, 64 ); - init_luffa( &ctx.luffa, 512 ); - update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3, - (const BitSequence*)hash3, 64 ); - - cubehashInit( &ctx.cube, 512, 16, 32 ); - cubehashUpdateDigest( &ctx.cube, (BitSequence *)hash0, - (const BitSequence *)hash0, 64 ); - cubehashInit( &ctx.cube, 512, 16, 32 ); - cubehashUpdateDigest( &ctx.cube, (BitSequence *)hash1, - (const BitSequence *)hash1, 64 ); - cubehashInit( &ctx.cube, 512, 16, 32 ); - cubehashUpdateDigest( 
&ctx.cube, (BitSequence *)hash2, - (const BitSequence *)hash2, 64 ); - cubehashInit( &ctx.cube, 512, 16, 32 ); - cubehashUpdateDigest( &ctx.cube, (BitSequence *)hash3, - (const BitSequence *)hash3, 64 ); - - intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 ); + dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); + rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 ); + + luffa512_2way_full( &ctx.luffa2, vhashA, vhashA, 64 ); + luffa512_2way_full( &ctx.luffa2, vhashB, vhashB, 64 ); + + cube_2way_full( &ctx.cube2, vhashA, 512, vhashA, 64 ); + cube_2way_full( &ctx.cube2, vhashB, 512, vhashB, 64 ); + + rintrlv_2x128_4x64( vhash, vhashA, vhashB, 512 ); // A= keccak parallel, B= jh parallel - vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], vmask ), - m256_zero ); + vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], vmask ), m256_zero ); + h_mask = _mm256_movemask_epi8( vh_mask ); - if ( mm256_anybits1( vh_mask ) ) - { + if ( ( h_mask & 0xffffffff ) != 0xffffffff ) + { keccak512_4way_init( &ctx.keccak ); keccak512_4way_update( &ctx.keccak, vhash, 64 ); keccak512_4way_close( &ctx.keccak, vhashA ); - } + } - if ( mm256_anybits0( vh_mask ) ) - { + if ( h_mask & 0xffffffff ) + { jh512_4way_init( &ctx.jh ); jh512_4way_update( &ctx.jh, vhash, 64 ); jh512_4way_close( &ctx.jh, vhashB ); - } + } - mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask ); + mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask ); - dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); + dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512 ( &ctx.shavite, hash0, 64 ); - sph_shavite512_close( &ctx.shavite, hash0 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512 ( &ctx.shavite, hash1, 64 ); - sph_shavite512_close( &ctx.shavite, hash1 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512 ( &ctx.shavite, hash2, 64 ); - sph_shavite512_close( &ctx.shavite, hash2 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512 ( &ctx.shavite, 
hash3, 64 ); - sph_shavite512_close( &ctx.shavite, hash3 ); + shavite512_full( &ctx.shavite, hash0, hash0, 64 ); + shavite512_full( &ctx.shavite, hash1, hash1, 64 ); + shavite512_full( &ctx.shavite, hash2, hash2, 64 ); + shavite512_full( &ctx.shavite, hash3, hash3, 64 ); - intrlv_2x128_512( vhashA, hash0, hash1 ); - intrlv_2x128_512( vhashB, hash2, hash3 ); + intrlv_2x128_512( vhashA, hash0, hash1 ); + intrlv_2x128_512( vhashB, hash2, hash3 ); - simd_2way_init( &ctx.simd, 512 ); - simd_2way_update_close( &ctx.simd, vhashA, vhashA, 512 ); - simd_2way_init( &ctx.simd, 512 ); - simd_2way_update_close( &ctx.simd, vhashB, vhashB, 512 ); + simd512_2way_full( &ctx.simd, vhashA, vhashA, 64 ); + simd512_2way_full( &ctx.simd, vhashB, vhashB, 64 ); - rintrlv_2x128_4x64( vhash, vhashA, vhashB, 512 ); + rintrlv_2x128_4x64( vhash, vhashA, vhashB, 512 ); - vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], vmask ), - m256_zero ); + vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], vmask ), m256_zero ); + h_mask = _mm256_movemask_epi8( vh_mask ); - dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); + dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); + intrlv_4x32_512( vhash, hash0, hash1, hash2, hash3 ); - // 4x32 for haval - intrlv_4x32_512( vhash, hash0, hash1, hash2, hash3 ); + // A + if ( hash0[0] & mask ) + sph_whirlpool512_full( &ctx.whirlpool, hash0, hash0, 64 ); + if ( hash1[0] & mask ) + sph_whirlpool512_full( &ctx.whirlpool, hash1, hash1, 64 ); + if ( hash2[0] & mask ) + sph_whirlpool512_full( &ctx.whirlpool, hash2, hash2, 64 ); + if ( hash3[0] & mask ) + sph_whirlpool512_full( &ctx.whirlpool, hash3, hash3, 64 ); - // A - - if ( hash0[0] & mask ) - { - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash0, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash0 ); - } - if ( hash1[0] & mask ) - { - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash1, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash1 ); - } - 
if ( hash2[0] & mask ) - { - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash2, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash2 ); - } - if ( hash3[0] & mask ) - { - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash3, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash3 ); - } - - intrlv_4x64( vhashA, hash0, hash1, hash2, hash3, 512 ); + intrlv_4x64( vhashA, hash0, hash1, hash2, hash3, 512 ); -// B - if ( mm256_anybits0( vh_mask ) ) - { - haval256_5_4way_init( &ctx.haval ); - haval256_5_4way_update( &ctx.haval, vhash, 64 ); - haval256_5_4way_close( &ctx.haval, vhash ); - memset( &vhash[8<<2], 0, 32<<2 ); - rintrlv_4x32_4x64( vhashB, vhash, 512 ); - } - - mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask ); - - dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); - - init_echo( &ctx.echo, 512 ); - update_final_echo( &ctx.echo, (BitSequence *)hash0, - (const BitSequence *)hash0, 512 ); - init_echo( &ctx.echo, 512 ); - update_final_echo( &ctx.echo, (BitSequence *)hash1, - (const BitSequence *)hash1, 512 ); - init_echo( &ctx.echo, 512 ); - update_final_echo( &ctx.echo, (BitSequence *)hash2, - (const BitSequence *)hash2, 512 ); - init_echo( &ctx.echo, 512 ); - update_final_echo( &ctx.echo, (BitSequence *)hash3, - (const BitSequence *)hash3, 512 ); - - intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 ); - - blake512_4way_init( &ctx.blake ); - blake512_4way_update( &ctx.blake, vhash, 64 ); - blake512_4way_close( &ctx.blake, vhash ); + // B + if ( h_mask & 0xffffffff ) + { + haval256_5_4way_init( &ctx.haval ); + haval256_5_4way_update( &ctx.haval, vhash, 64 ); + haval256_5_4way_close( &ctx.haval, vhash ); + memset( &vhash[8<<2], 0, 32<<2 ); + rintrlv_4x32_4x64( vhashB, vhash, 512 ); + } - dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); + mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask ); -// shavite & luffa, both serial, select individually. 
+ dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); + + echo_full( &ctx.echo, (BitSequence *)hash0, 512, + (const BitSequence *)hash0, 64 ); + echo_full( &ctx.echo, (BitSequence *)hash1, 512, + (const BitSequence *)hash1, 64 ); + echo_full( &ctx.echo, (BitSequence *)hash2, 512, + (const BitSequence *)hash2, 64 ); + echo_full( &ctx.echo, (BitSequence *)hash3, 512, + (const BitSequence *)hash3, 64 ); - if ( hash0[0] & mask ) - { - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash0, 64 ); // - sph_shavite512_close( &ctx.shavite, hash0 ); //8 - } - else - { - init_luffa( &ctx.luffa, 512 ); - update_and_final_luffa( &ctx.luffa, (BitSequence *)hash0, - (const BitSequence *)hash0, 64 ); - } + intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 ); + + blake512_4way_full( &ctx.blake, vhash, vhash, 64 ); - if ( hash1[0] & mask ) - { - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash1, 64 ); // - sph_shavite512_close( &ctx.shavite, hash1 ); //8 - } - else - { - init_luffa( &ctx.luffa, 512 ); - update_and_final_luffa( &ctx.luffa, (BitSequence *)hash1, - (const BitSequence *)hash1, 64 ); - } + dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); - if ( hash2[0] & mask ) - { - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash2, 64 ); // - sph_shavite512_close( &ctx.shavite, hash2 ); //8 - } - else - { - init_luffa( &ctx.luffa, 512 ); - update_and_final_luffa( &ctx.luffa, (BitSequence *)hash2, - (const BitSequence *)hash2, 64 ); - } +// shavite & luffa, both serial, select individually. 
- if ( hash3[0] & mask ) - { - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash3, 64 ); // - sph_shavite512_close( &ctx.shavite, hash3 ); //8 - } - else - { - init_luffa( &ctx.luffa, 512 ); - update_and_final_luffa( &ctx.luffa, (BitSequence *)hash3, - (const BitSequence *)hash3, 64 ); - } + if ( hash0[0] & mask ) + shavite512_full( &ctx.shavite, hash0, hash0, 64 ); // + else + luffa_full( &ctx.luffa, (BitSequence*)hash0, 512, + (const BitSequence*)hash0, 64 ); + + if ( hash1[0] & mask ) + shavite512_full( &ctx.shavite, hash1, hash1, 64 ); // + else + luffa_full( &ctx.luffa, (BitSequence*)hash1, 512, + (const BitSequence*)hash1, 64 ); + + if ( hash2[0] & mask ) + shavite512_full( &ctx.shavite, hash2, hash2, 64 ); // + else + luffa_full( &ctx.luffa, (BitSequence*)hash2, 512, + (const BitSequence*)hash2, 64 ); + + if ( hash3[0] & mask ) + shavite512_full( &ctx.shavite, hash3, hash3, 64 ); // + else + luffa_full( &ctx.luffa, (BitSequence*)hash3, 512, + (const BitSequence*)hash3, 64 ); - intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 ); + intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 ); - hamsi512_4way_init( &ctx.hamsi ); - hamsi512_4way_update( &ctx.hamsi, vhash, 64 ); - hamsi512_4way_close( &ctx.hamsi, vhash ); + hamsi512_4way_init( &ctx.hamsi ); + hamsi512_4way_update( &ctx.hamsi, vhash, 64 ); + hamsi512_4way_close( &ctx.hamsi, vhash ); - dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); + dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash0, 64 ); - sph_fugue512_close( &ctx.fugue, hash0 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash1, 64 ); - sph_fugue512_close( &ctx.fugue, hash1 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash2, 64 ); - sph_fugue512_close( &ctx.fugue, hash2 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash3, 64 ); - sph_fugue512_close( &ctx.fugue, hash3 ); + 
sph_fugue512_full( &ctx.fugue, hash0, hash0, 64 ); + sph_fugue512_full( &ctx.fugue, hash1, hash1, 64 ); + sph_fugue512_full( &ctx.fugue, hash2, hash2, 64 ); + sph_fugue512_full( &ctx.fugue, hash3, hash3, 64 ); // In this situation serial simd seems to be faster. @@ -1295,61 +857,46 @@ extern void hmq1725_4way_hash(void *state, const void *input) vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], vmask ), m256_zero ); + h_mask = _mm256_movemask_epi8( vh_mask ); - if ( hash0[0] & mask ) //4 - { - init_echo( &ctx.echo, 512 ); - update_final_echo( &ctx.echo, (BitSequence *)hash0, - (const BitSequence *)hash0, 512 ); - } - - else - { + if ( hash0[0] & mask ) //4 + echo_full( &ctx.echo, (BitSequence *)hash0, 512, + (const BitSequence *)hash0, 64 ); + else + { init_sd( &ctx.sd, 512 ); update_final_sd( &ctx.sd, (BitSequence *)hash0, - (const BitSequence *)hash0, 512 ); - } + (const BitSequence *)hash0, 512 ); + } if ( hash1[0] & mask ) //4 - { - init_echo( &ctx.echo, 512 ); - update_final_echo( &ctx.echo, (BitSequence *)hash1, - (const BitSequence *)hash1, 512 ); - } - + echo_full( &ctx.echo, (BitSequence *)hash1, 512, + (const BitSequence *)hash1, 64 ); else { init_sd( &ctx.sd, 512 ); update_final_sd( &ctx.sd, (BitSequence *)hash1, - (const BitSequence *)hash1, 512 ); + (const BitSequence *)hash1, 512 ); } if ( hash2[0] & mask ) //4 - { - init_echo( &ctx.echo, 512 ); - update_final_echo( &ctx.echo, (BitSequence *)hash2, - (const BitSequence *)hash2, 512 ); - } - + echo_full( &ctx.echo, (BitSequence *)hash2, 512, + (const BitSequence *)hash2, 64 ); else { init_sd( &ctx.sd, 512 ); update_final_sd( &ctx.sd, (BitSequence *)hash2, - (const BitSequence *)hash2, 512 ); + (const BitSequence *)hash2, 512 ); } if ( hash3[0] & mask ) //4 - { - init_echo( &ctx.echo, 512 ); - update_final_echo( &ctx.echo, (BitSequence *)hash3, - (const BitSequence *)hash3, 512 ); - } - + echo_full( &ctx.echo, (BitSequence *)hash3, 512, + (const BitSequence *)hash3, 64 ); else { init_sd( &ctx.sd, 
512 ); update_final_sd( &ctx.sd, (BitSequence *)hash3, - (const BitSequence *)hash3, 512 ); + (const BitSequence *)hash3, 512 ); } intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 ); @@ -1360,54 +907,30 @@ extern void hmq1725_4way_hash(void *state, const void *input) dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash0, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash0 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash1, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash1 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash2, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash2 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash3, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash3 ); + sph_whirlpool512_full( &ctx.whirlpool, hash0, hash0, 64 ); + sph_whirlpool512_full( &ctx.whirlpool, hash1, hash1, 64 ); + sph_whirlpool512_full( &ctx.whirlpool, hash2, hash2, 64 ); + sph_whirlpool512_full( &ctx.whirlpool, hash3, hash3, 64 ); // A = fugue serial, B = sha512 prarallel intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 ); - vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], vmask ), - m256_zero ); + vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], vmask ), m256_zero ); + h_mask = _mm256_movemask_epi8( vh_mask ); if ( hash0[0] & mask ) - { - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash0, 64 ); - sph_fugue512_close( &ctx.fugue, hash0 ); - } + sph_fugue512_full( &ctx.fugue, hash0, hash0, 64 ); if ( hash1[0] & mask ) - { - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash1, 64 ); - sph_fugue512_close( &ctx.fugue, hash1 ); - } + sph_fugue512_full( &ctx.fugue, hash1, hash1, 64 ); if ( hash2[0] & mask ) - { - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash2, 64 ); - sph_fugue512_close( &ctx.fugue, hash2 ); - } + sph_fugue512_full( &ctx.fugue, 
hash2, hash2, 64 ); if ( hash3[0] & mask ) - { - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash3, 64 ); - sph_fugue512_close( &ctx.fugue, hash3 ); - } + sph_fugue512_full( &ctx.fugue, hash3, hash3, 64 ); intrlv_4x64( vhashA, hash0, hash1, hash2, hash3, 512 ); - if ( mm256_anybits0( vh_mask ) ) + if ( h_mask & 0xffffffff ) { sha512_4way_init( &ctx.sha512 ); sha512_4way_update( &ctx.sha512, vhash, 64 ); @@ -1418,14 +941,10 @@ extern void hmq1725_4way_hash(void *state, const void *input) dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 ); + groestl512_full( &ctx.groestl, (char*)hash0, (char*)hash0, 512 ); + groestl512_full( &ctx.groestl, (char*)hash1, (char*)hash1, 512 ); + groestl512_full( &ctx.groestl, (char*)hash2, (char*)hash2, 512 ); + groestl512_full( &ctx.groestl, (char*)hash3, (char*)hash3, 512 ); intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 ); @@ -1435,15 +954,13 @@ extern void hmq1725_4way_hash(void *state, const void *input) // A = haval parallel, B = Whirlpool serial - vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], vmask ), - m256_zero ); + vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], vmask ), m256_zero ); + h_mask = _mm256_movemask_epi8( vh_mask ); dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); - - // 4x32 for haval intrlv_4x32_512( vhash, hash0, hash1, hash2, hash3 ); - if ( mm256_anybits1( vh_mask ) ) + if ( ( h_mask & 0xffffffff ) != 0xffffffff ) { haval256_5_4way_init( &ctx.haval ); haval256_5_4way_update( &ctx.haval, vhash, 64 ); @@ -1453,29 +970,13 
@@ extern void hmq1725_4way_hash(void *state, const void *input) } if ( !( hash0[0] & mask ) ) - { - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash0, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash0 ); - } + sph_whirlpool512_full( &ctx.whirlpool, hash0, hash0, 64 ); if ( !( hash1[0] & mask ) ) - { - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash1, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash1 ); - } + sph_whirlpool512_full( &ctx.whirlpool, hash1, hash1, 64 ); if ( !( hash2[0] & mask ) ) - { - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash2, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash2 ); - } + sph_whirlpool512_full( &ctx.whirlpool, hash2, hash2, 64 ); if ( !( hash3[0] & mask ) ) - { - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash3, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash3 ); - } + sph_whirlpool512_full( &ctx.whirlpool, hash3, hash3, 64 ); intrlv_4x64( vhashB, hash0, hash1, hash2, hash3, 512 ); @@ -1483,48 +984,48 @@ extern void hmq1725_4way_hash(void *state, const void *input) bmw512_4way_init( &ctx.bmw ); bmw512_4way_update( &ctx.bmw, vhash, 64 ); - bmw512_4way_close( &ctx.bmw, vhash ); - - memcpy(state, vhash, 32<<2 ); + bmw512_4way_close( &ctx.bmw, state ); } int scanhash_hmq1725_4way( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ) + uint64_t *hashes_done, struct thr_info *mythr ) { - uint32_t hash[16*4] __attribute__ ((aligned (64))); + uint64_t hash64[8*4] __attribute__ ((aligned (64))); uint32_t vdata[20*4] __attribute__ ((aligned (64))); uint32_t lane_hash[8] __attribute__ ((aligned (64))); - uint32_t *hash7 = &(hash[25]); + uint64_t *hash64_q3 = &(hash64[3*4]); uint32_t *pdata = work->data; uint32_t *ptarget = work->target; - const uint32_t Htarg = ptarget[7]; + const uint64_t targ64_q3 = ((uint64_t*)ptarget)[3]; + uint32_t n = pdata[19]; const uint32_t first_nonce = pdata[19]; 
- uint32_t n = first_nonce; const uint32_t last_nonce = max_nonce - 4; - __m256i *noncev = (__m256i*)vdata + 9; // aligned - int thr_id = mythr->id; + __m256i *noncev = (__m256i*)vdata + 9; + const int thr_id = mythr->id; + const bool bench = opt_benchmark; mm256_bswap32_intrlv80_4x64( vdata, pdata ); + *noncev = mm256_intrlv_blend_32( + _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev ); do { - *noncev = mm256_intrlv_blend_32( mm256_bswap_32( - _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev ); - - hmq1725_4way_hash( hash, vdata ); + hmq1725_4way_hash( hash64, vdata ); for ( int lane = 0; lane < 4; lane++ ) - if ( unlikely( hash7[ lane<<1 ] <= Htarg ) ) + if ( unlikely( hash64_q3[ lane ] <= targ64_q3 && !bench ) ) { - extr_lane_4x64( lane_hash, hash, lane, 256 ); - if ( likely( fulltest( lane_hash, ptarget ) && !opt_benchmark ) ) + extr_lane_4x64( lane_hash, hash64, lane, 256 ); + if ( valid_hash( lane_hash, ptarget ) ) { - pdata[19] = n + lane; + pdata[19] = bswap_32( n + lane ); submit_lane_solution( work, lane_hash, mythr, lane ); } } + *noncev = _mm256_add_epi32( *noncev, + m256_const1_64( 0x0000000400000000 ) ); n += 4; - } while ( ( n < last_nonce ) && !work_restart[thr_id].restart ); - + } while ( likely( ( n < last_nonce ) && !work_restart[thr_id].restart ) ); + pdata[19] = n; *hashes_done = n - first_nonce; return 0; } diff --git a/algo/quark/quark-4way.c b/algo/quark/quark-4way.c index c8b96012..5e02c390 100644 --- a/algo/quark/quark-4way.c +++ b/algo/quark/quark-4way.c @@ -72,12 +72,10 @@ void quark_8way_hash( void *state, const void *input ) memcpy( &ctx, &quark_8way_ctx, sizeof(quark_8way_ctx) ); - blake512_8way_update( &ctx.blake, input, 80 ); - blake512_8way_close( &ctx.blake, vhash ); - - bmw512_8way_update( &ctx.bmw, vhash, 64 ); - bmw512_8way_close( &ctx.bmw, vhash ); + blake512_8way_full( &ctx.blake, vhash, input, 80 ); + bmw512_8way_full( &ctx.bmw, vhash, vhash, 64 ); + vh_mask = _mm512_cmpeq_epi64_mask( _mm512_and_si512( 
vh[0], bit3_mask ), zero ); @@ -86,70 +84,34 @@ void quark_8way_hash( void *state, const void *input ) rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 ); - if ( ( vh_mask & 0x0f ) != 0x0f ) - { - groestl512_4way_init( &ctx.groestl, 64 ); - groestl512_4way_update_close( &ctx.groestl, vhashA, vhashA, 512 ); - } - if ( ( vh_mask & 0xf0 ) != 0xf0 ) - { - groestl512_4way_init( &ctx.groestl, 64 ); - groestl512_4way_update_close( &ctx.groestl, vhashB, vhashB, 512 ); - } - rintrlv_4x128_8x64( vhashC, vhashA, vhashB, 512 ); + if ( ( vh_mask & 0x0f ) != 0x0f ) + groestl512_4way_full( &ctx.groestl, vhashA, vhashA, 64 ); + if ( ( vh_mask & 0xf0 ) != 0xf0 ) + groestl512_4way_full( &ctx.groestl, vhashB, vhashB, 64 ); + + rintrlv_4x128_8x64( vhashC, vhashA, vhashB, 512 ); #else dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, vhash, 512 ); - if ( hash0[0] & mask ) - { - update_and_final_groestl( &ctx.groestl, (char*)hash0, - (char*)hash0, 512 ); - } - if ( hash1[0] & mask ) - { - reinit_groestl( &ctx.groestl ); - update_and_final_groestl( &ctx.groestl, (char*)hash1, - (char*)hash1, 512 ); - } - if ( hash2[0] & mask ) - { - reinit_groestl( &ctx.groestl ); - update_and_final_groestl( &ctx.groestl, (char*)hash2, - (char*)hash2, 512 ); - } - if ( hash3[0] & mask ) - { - reinit_groestl( &ctx.groestl ); - update_and_final_groestl( &ctx.groestl, (char*)hash3, - (char*)hash3, 512 ); - } - if ( hash4[0] & mask ) - { - reinit_groestl( &ctx.groestl ); - update_and_final_groestl( &ctx.groestl, (char*)hash4, - (char*)hash4, 512 ); - } - if ( hash5[0] & mask ) - { - reinit_groestl( &ctx.groestl ); - update_and_final_groestl( &ctx.groestl, (char*)hash5, - (char*)hash5, 512 ); - } - if ( hash6[0] & mask ) - { - reinit_groestl( &ctx.groestl ); - update_and_final_groestl( &ctx.groestl, (char*)hash6, - (char*)hash6, 512 ); - } - if ( hash7[0] & mask ) - { - reinit_groestl( &ctx.groestl ); - update_and_final_groestl( &ctx.groestl, (char*)hash7, - (char*)hash7, 512 ); - } + if ( 
hash0[0] & 8 ) + groestl512_full( &ctx.groestl, (char*)hash0, (char*)hash0, 512 ); + if ( hash1[0] & 8 ) + groestl512_full( &ctx.groestl, (char*)hash1, (char*)hash1, 512 ); + if ( hash2[0] & 8) + groestl512_full( &ctx.groestl, (char*)hash2, (char*)hash2, 512 ); + if ( hash3[0] & 8 ) + groestl512_full( &ctx.groestl, (char*)hash3, (char*)hash3, 512 ); + if ( hash4[0] & 8 ) + groestl512_full( &ctx.groestl, (char*)hash4, (char*)hash4, 512 ); + if ( hash5[0] & 8 ) + groestl512_full( &ctx.groestl, (char*)hash5, (char*)hash5, 512 ); + if ( hash6[0] & 8 ) + groestl512_full( &ctx.groestl, (char*)hash6, (char*)hash6, 512 ); + if ( hash7[0] & 8 ) + groestl512_full( &ctx.groestl, (char*)hash7, (char*)hash7, 512 ); intrlv_8x64( vhashC, hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, 512 ); @@ -157,10 +119,7 @@ void quark_8way_hash( void *state, const void *input ) #endif if ( vh_mask & 0xff ) - { - skein512_8way_update( &ctx.skein, vhash, 64 ); - skein512_8way_close( &ctx.skein, vhashB ); - } + skein512_8way_full( &ctx.skein, vhashB, vhash, 64 ); mm512_blend_hash_8x64( vh, vhC, vhB, vh_mask ); @@ -168,10 +127,10 @@ void quark_8way_hash( void *state, const void *input ) rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 ); - groestl512_4way_init( &ctx.groestl, 64 ); - groestl512_4way_update_close( &ctx.groestl, vhashA, vhashA, 512 ); - groestl512_4way_init( &ctx.groestl, 64 ); - groestl512_4way_update_close( &ctx.groestl, vhashB, vhashB, 512 ); + if ( ( vh_mask & 0x0f ) != 0x0f ) + groestl512_4way_full( &ctx.groestl, vhashA, vhashA, 64 ); + if ( ( vh_mask & 0xf0 ) != 0xf0 ) + groestl512_4way_full( &ctx.groestl, vhashB, vhashB, 64 ); rintrlv_4x128_8x64( vhash, vhashA, vhashB, 512 ); @@ -180,22 +139,22 @@ void quark_8way_hash( void *state, const void *input ) dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, vhash, 512 ); - reinit_groestl( &ctx.groestl ); - update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 ); - reinit_groestl( 
&ctx.groestl ); - update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 ); - reinit_groestl( &ctx.groestl ); - update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 ); - reinit_groestl( &ctx.groestl ); - update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 ); - reinit_groestl( &ctx.groestl ); - update_and_final_groestl( &ctx.groestl, (char*)hash4, (char*)hash4, 512 ); - reinit_groestl( &ctx.groestl ); - update_and_final_groestl( &ctx.groestl, (char*)hash5, (char*)hash5, 512 ); - reinit_groestl( &ctx.groestl ); - update_and_final_groestl( &ctx.groestl, (char*)hash6, (char*)hash6, 512 ); - reinit_groestl( &ctx.groestl ); - update_and_final_groestl( &ctx.groestl, (char*)hash7, (char*)hash7, 512 ); + if ( hash0[0] & 8 ) + groestl512_full( &ctx.groestl, (char*)hash0, (char*)hash0, 512 ); + if ( hash1[0] & 8 ) + groestl512_full( &ctx.groestl, (char*)hash1, (char*)hash1, 512 ); + if ( hash2[0] & 8) + groestl512_full( &ctx.groestl, (char*)hash2, (char*)hash2, 512 ); + if ( hash3[0] & 8 ) + groestl512_full( &ctx.groestl, (char*)hash3, (char*)hash3, 512 ); + if ( hash4[0] & 8 ) + groestl512_full( &ctx.groestl, (char*)hash4, (char*)hash4, 512 ); + if ( hash5[0] & 8 ) + groestl512_full( &ctx.groestl, (char*)hash5, (char*)hash5, 512 ); + if ( hash6[0] & 8 ) + groestl512_full( &ctx.groestl, (char*)hash6, (char*)hash6, 512 ); + if ( hash7[0] & 8 ) + groestl512_full( &ctx.groestl, (char*)hash7, (char*)hash7, 512 ); intrlv_8x64( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, 512 ); @@ -209,27 +168,16 @@ void quark_8way_hash( void *state, const void *input ) zero ); if ( ( vh_mask & 0xff ) != 0xff ) - { - blake512_8way_init( &ctx.blake ); - blake512_8way_update( &ctx.blake, vhash, 64 ); - blake512_8way_close( &ctx.blake, vhashA ); - } - + blake512_8way_full( &ctx.blake, vhashA, vhash, 64 ); if ( vh_mask & 0xff ) - { - bmw512_8way_init( &ctx.bmw ); - bmw512_8way_update( &ctx.bmw, vhash, 64 ); - bmw512_8way_close( 
&ctx.bmw, vhashB ); - } + bmw512_8way_full( &ctx.bmw, vhashB, vhash, 64 ); mm512_blend_hash_8x64( vh, vhA, vhB, vh_mask ); keccak512_8way_update( &ctx.keccak, vhash, 64 ); keccak512_8way_close( &ctx.keccak, vhash ); - skein512_8way_init( &ctx.skein ); - skein512_8way_update( &ctx.skein, vhash, 64 ); - skein512_8way_close( &ctx.skein, vhash ); + skein512_8way_full( &ctx.skein, vhash, vhash, 64 ); vh_mask = _mm512_cmpeq_epi64_mask( _mm512_and_si512( vh[0], bit3_mask ), zero ); @@ -258,41 +206,44 @@ void quark_8way_hash( void *state, const void *input ) int scanhash_quark_8way( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ) { - uint32_t hash[8*8] __attribute__ ((aligned (128))); - uint32_t vdata[24*8] __attribute__ ((aligned (64))); + uint64_t hash64[4*8] __attribute__ ((aligned (128))); + uint32_t vdata[20*8] __attribute__ ((aligned (64))); uint32_t lane_hash[8] __attribute__ ((aligned (64))); - uint32_t *hash7 = &(hash[49]); - uint32_t *pdata = work->data; + uint64_t *hash64_q3 = &(hash64[3*8]); uint32_t *ptarget = work->target; + const uint64_t targ64_q3 = ((uint64_t*)ptarget)[3]; + uint32_t *pdata = work->data; uint32_t n = pdata[19]; const uint32_t first_nonce = pdata[19]; - __m512i *noncev = (__m512i*)vdata + 9; // aligned - int thr_id = mythr->id; - const uint32_t Htarg = ptarget[7]; + const uint32_t last_nonce = max_nonce - 8; + __m512i *noncev = (__m512i*)vdata + 9; + const int thr_id = mythr->id; + const bool bench = opt_benchmark; mm512_bswap32_intrlv80_8x64( vdata, pdata ); + *noncev = mm512_intrlv_blend_32( + _mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0, + n+3, 0, n+2, 0, n+1, 0, n , 0 ), *noncev ); do { - *noncev = mm512_intrlv_blend_32( mm512_bswap_32( - _mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0, - n+3, 0, n+2, 0, n+1, 0, n , 0 ) ), *noncev ); - - quark_8way_hash( hash, vdata ); - pdata[19] = n; + quark_8way_hash( hash64, vdata ); - for ( int i = 0; i < 8; i++ ) - if ( unlikely( hash7[ i<<1 ] <= Htarg ) 
) + for ( int lane = 0; lane < 8; lane++ ) + if ( unlikely( hash64_q3[ lane ] <= targ64_q3 && !bench ) ) { - extr_lane_8x64( lane_hash, hash, i, 256 ); - if ( likely( fulltest( lane_hash, ptarget ) && !opt_benchmark ) ) + extr_lane_8x64( lane_hash, hash64, lane, 256 ); + if ( valid_hash( lane_hash, ptarget ) ) { - pdata[19] = n+i; - submit_lane_solution( work, lane_hash, mythr, i ); + pdata[19] = bswap_32( n + lane ); + submit_lane_solution( work, lane_hash, mythr, lane ); } } + *noncev = _mm512_add_epi32( *noncev, + m512_const1_64( 0x0000000800000000 ) ); n += 8; - } while ( ( n < max_nonce-8 ) && !work_restart[thr_id].restart ); + } while ( likely( ( n < last_nonce ) && !work_restart[thr_id].restart ) ); + pdata[19] = n; *hashes_done = n - first_nonce; return 0; } @@ -333,67 +284,47 @@ void quark_4way_hash( void *state, const void *input ) __m256i* vhA = (__m256i*)vhashA; __m256i* vhB = (__m256i*)vhashB; __m256i vh_mask; + int h_mask; quark_4way_ctx_holder ctx; const __m256i bit3_mask = m256_const1_64( 8 ); - const uint32_t mask = 8; const __m256i zero = _mm256_setzero_si256(); memcpy( &ctx, &quark_4way_ctx, sizeof(quark_4way_ctx) ); - blake512_4way_update( &ctx.blake, input, 80 ); - blake512_4way_close( &ctx.blake, vhash ); + blake512_4way_full( &ctx.blake, vhash, input, 80 ); bmw512_4way_update( &ctx.bmw, vhash, 64 ); bmw512_4way_close( &ctx.bmw, vhash ); vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ), zero ); + h_mask = _mm256_movemask_epi8( vh_mask ); dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); - if ( hash0[0] & mask ) - { - update_and_final_groestl( &ctx.groestl, (char*)hash0, - (char*)hash0, 512 ); - } - if ( hash1[0] & mask ) - { - reinit_groestl( &ctx.groestl ); - update_and_final_groestl( &ctx.groestl, (char*)hash1, - (char*)hash1, 512 ); - } - if ( hash2[0] & mask ) - { - reinit_groestl( &ctx.groestl ); - update_and_final_groestl( &ctx.groestl, (char*)hash2, - (char*)hash2, 512 ); - } - if ( hash3[0] & mask ) - { - 
reinit_groestl( &ctx.groestl ); - update_and_final_groestl( &ctx.groestl, (char*)hash3, - (char*)hash3, 512 ); - } + // A + if ( hash0[0] & 8 ) + groestl512_full( &ctx.groestl, (char*)hash0, (char*)hash0, 512 ); + if ( hash1[0] & 8 ) + groestl512_full( &ctx.groestl, (char*)hash1, (char*)hash1, 512 ); + if ( hash2[0] & 8) + groestl512_full( &ctx.groestl, (char*)hash2, (char*)hash2, 512 ); + if ( hash3[0] & 8 ) + groestl512_full( &ctx.groestl, (char*)hash3, (char*)hash3, 512 ); intrlv_4x64( vhashA, hash0, hash1, hash2, hash3, 512 ); - if ( mm256_anybits1( vh_mask ) ) - { - skein512_4way_update( &ctx.skein, vhash, 64 ); - skein512_4way_close( &ctx.skein, vhashB ); - } + // B + if ( likely( h_mask & 0xffffffff ) ) + skein512_4way_full( &ctx.skein, vhashB, vhash, 64 ); mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask ); dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); - reinit_groestl( &ctx.groestl ); - update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 ); - reinit_groestl( &ctx.groestl ); - update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 ); - reinit_groestl( &ctx.groestl ); - update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 ); - reinit_groestl( &ctx.groestl ); - update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 ); + groestl512_full( &ctx.groestl, (char*)hash0, (char*)hash0, 512 ); + groestl512_full( &ctx.groestl, (char*)hash1, (char*)hash1, 512 ); + groestl512_full( &ctx.groestl, (char*)hash2, (char*)hash2, 512 ); + groestl512_full( &ctx.groestl, (char*)hash3, (char*)hash3, 512 ); intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 ); @@ -401,15 +332,13 @@ void quark_4way_hash( void *state, const void *input ) jh512_4way_close( &ctx.jh, vhash ); vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ), zero ); + h_mask = _mm256_movemask_epi8( vh_mask ); - if ( mm256_anybits1( vh_mask ) ) - { - blake512_4way_init( &ctx.blake ); - blake512_4way_update( &ctx.blake, 
vhash, 64 ); - blake512_4way_close( &ctx.blake, vhashA ); - } - - if ( mm256_anybits0( vh_mask ) ) + // A + if ( likely( ( h_mask & 0xffffffff ) != 0xffffffff ) ) + blake512_4way_full( &ctx.blake, vhashA, vhash, 64 ); + // B + if ( likely( h_mask & 0xffffffff ) ) { bmw512_4way_init( &ctx.bmw ); bmw512_4way_update( &ctx.bmw, vhash, 64 ); @@ -421,20 +350,20 @@ void quark_4way_hash( void *state, const void *input ) keccak512_4way_update( &ctx.keccak, vhash, 64 ); keccak512_4way_close( &ctx.keccak, vhash ); - skein512_4way_init( &ctx.skein ); - skein512_4way_update( &ctx.skein, vhash, 64 ); - skein512_4way_close( &ctx.skein, vhash ); + skein512_4way_full( &ctx.skein, vhash, vhash, 64 ); vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ), zero ); + h_mask = _mm256_movemask_epi8( vh_mask ); - if ( mm256_anybits1( vh_mask ) ) + // A + if ( likely( ( h_mask & 0xffffffff ) != 0xffffffff ) ) { keccak512_4way_init( &ctx.keccak ); keccak512_4way_update( &ctx.keccak, vhash, 64 ); keccak512_4way_close( &ctx.keccak, vhashA ); } - - if ( mm256_anybits0( vh_mask ) ) + // B + if ( likely( h_mask & 0xffffffff ) ) { jh512_4way_init( &ctx.jh ); jh512_4way_update( &ctx.jh, vhash, 64 ); @@ -451,41 +380,44 @@ void quark_4way_hash( void *state, const void *input ) int scanhash_quark_4way( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ) { - uint32_t hash[4*8] __attribute__ ((aligned (64))); - uint32_t vdata[24*4] __attribute__ ((aligned (64))); + uint64_t hash64[4*4] __attribute__ ((aligned (64))); + uint32_t vdata[20*4] __attribute__ ((aligned (64))); uint32_t lane_hash[8] __attribute__ ((aligned (64))); - uint32_t *hash7 = &(hash[25]); + uint64_t *hash64_q3 = &(hash64[3*4]); uint32_t *pdata = work->data; uint32_t *ptarget = work->target; + const uint64_t targ64_q3 = ((uint64_t*)ptarget)[3]; uint32_t n = pdata[19]; const uint32_t first_nonce = pdata[19]; - __m256i *noncev = (__m256i*)vdata + 9; // aligned - int thr_id = mythr->id; - 
const uint32_t Htarg = ptarget[7]; + const uint32_t last_nonce = max_nonce - 4; + __m256i *noncev = (__m256i*)vdata + 9; + const int thr_id = mythr->id; + const bool bench = opt_benchmark; mm256_bswap32_intrlv80_4x64( vdata, pdata ); + *noncev = mm256_intrlv_blend_32( + _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev ); do { - *noncev = mm256_intrlv_blend_32( mm256_bswap_32( - _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev ); - - quark_4way_hash( hash, vdata ); - pdata[19] = n; + quark_4way_hash( hash64, vdata ); - for ( int i = 0; i < 4; i++ ) - if ( unlikely( hash7[ i<<1 ] <= Htarg ) ) + for ( int lane = 0; lane < 4; lane++ ) + if ( hash64_q3[ lane ] <= targ64_q3 && !bench ) { - extr_lane_4x64( lane_hash, hash, i, 256 ); - if ( likely( fulltest( lane_hash, ptarget ) && !opt_benchmark ) ) + extr_lane_4x64( lane_hash, hash64, lane, 256 ); + if ( valid_hash( lane_hash, ptarget ) ) { - pdata[19] = n+i; - submit_lane_solution( work, lane_hash, mythr, i ); + pdata[19] = bswap_32( n + lane ); + submit_lane_solution( work, lane_hash, mythr, lane ); } } + *noncev = _mm256_add_epi32( *noncev, + m256_const1_64( 0x0000000400000000 ) ); n += 4; - } while ( ( n < max_nonce ) && !work_restart[thr_id].restart ); + } while ( likely( ( n < last_nonce ) && !work_restart[thr_id].restart ) ); - *hashes_done = n - first_nonce + 1; + pdata[19] = n; + *hashes_done = n - first_nonce; return 0; } diff --git a/algo/scrypt/scrypt.c b/algo/scrypt/scrypt.c index a14821ce..003af36b 100644 --- a/algo/scrypt/scrypt.c +++ b/algo/scrypt/scrypt.c @@ -707,6 +707,7 @@ extern int scanhash_scrypt( struct work *work, uint32_t max_nonce, int thr_id = mythr->id; // thr_id arg is deprecated int throughput = scrypt_best_throughput(); int i; + volatile uint8_t *restart = &(work_restart[thr_id].restart); #ifdef HAVE_SHA256_4WAY if (sha256_use_4way()) @@ -757,7 +758,7 @@ extern int scanhash_scrypt( struct work *work, uint32_t max_nonce, submit_solution( work, hash, mythr ); } } - } while 
(likely(n < max_nonce && !work_restart[thr_id].restart)); + } while ( likely( n < max_nonce && !(*restart) ) ); *hashes_done = n - pdata[19] + 1; pdata[19] = n; diff --git a/algo/shavite/sph-shavite-aesni.c b/algo/shavite/sph-shavite-aesni.c index e9536a0b..e047d778 100644 --- a/algo/shavite/sph-shavite-aesni.c +++ b/algo/shavite/sph-shavite-aesni.c @@ -33,7 +33,7 @@ #include #include -#ifdef __AES__ +#if defined(__AES__) #include "sph_shavite.h" #include "simd-utils.h" diff --git a/algo/shavite/sph_shavite.c b/algo/shavite/sph_shavite.c index ba4384b4..41988f97 100644 --- a/algo/shavite/sph_shavite.c +++ b/algo/shavite/sph_shavite.c @@ -35,6 +35,8 @@ #include "sph_shavite.h" +#if !defined(__AES__) + #ifdef __cplusplus extern "C"{ #endif @@ -1762,3 +1764,6 @@ sph_shavite512_sw_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst #ifdef __cplusplus } #endif + +#endif // !AES + diff --git a/algo/shavite/sph_shavite.h b/algo/shavite/sph_shavite.h index ed06ca69..cca59726 100644 --- a/algo/shavite/sph_shavite.h +++ b/algo/shavite/sph_shavite.h @@ -262,15 +262,9 @@ void sph_shavite384_close(void *cc, void *dst); void sph_shavite384_addbits_and_close( void *cc, unsigned ub, unsigned n, void *dst); -// Always define sw but only define aesni when available -// Define fptrs for aesni or sw, not both. -void sph_shavite512_sw_init(void *cc); -void sph_shavite512_sw(void *cc, const void *data, size_t len); -void sph_shavite512_sw_close(void *cc, void *dst); -void sph_shavite512_sw_addbits_and_close( - void *cc, unsigned ub, unsigned n, void *dst); - +//Don't call these directly from application code, use the macros below. 
#ifdef __AES__ + void sph_shavite512_aesni_init(void *cc); void sph_shavite512_aesni(void *cc, const void *data, size_t len); void sph_shavite512_aesni_close(void *cc, void *dst); @@ -285,6 +279,13 @@ void sph_shavite512_aesni_addbits_and_close( #else +void sph_shavite512_sw_init(void *cc); +void sph_shavite512_sw(void *cc, const void *data, size_t len); +void sph_shavite512_sw_close(void *cc, void *dst); +void sph_shavite512_sw_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); + + #define sph_shavite512_init sph_shavite512_sw_init #define sph_shavite512 sph_shavite512_sw #define sph_shavite512_close sph_shavite512_sw_close @@ -293,6 +294,20 @@ void sph_shavite512_aesni_addbits_and_close( #endif +// Use these macros from application code. +#define shavite512_context sph_shavite512_context + +#define shavite512_init sph_shavite512_init +#define shavite512_update sph_shavite512 +#define shavite512_close sph_shavite512_close + +#define shavite512_full( cc, dst, data, len ) \ +do{ \ + shavite512_init( cc ); \ + shavite512_update( cc, data, len ); \ + shavite512_close( cc, dst ); \ +}while(0) + #ifdef __cplusplus } #endif diff --git a/algo/skein/skein-4way.c b/algo/skein/skein-4way.c index 68467855..14957273 100644 --- a/algo/skein/skein-4way.c +++ b/algo/skein/skein-4way.c @@ -24,11 +24,7 @@ void skeinhash_8way( void *state, const void *input ) uint32_t vhash32[16*8] __attribute__ ((aligned (128))); sha256_8way_context ctx_sha256; - skein512_8way_full( &ctx_skein, vhash64, input, 80 ); - -// skein512_8way_update( &ctx_skein, input + (64*8), 16 ); -// skein512_8way_close( &ctx_skein, vhash64 ); - + skein512_8way_final16( &ctx_skein, vhash64, input + (64*8) ); rintrlv_8x64_8x32( vhash32, vhash64, 512 ); sha256_8way_init( &ctx_sha256 ); @@ -57,8 +53,7 @@ int scanhash_skein_8way( struct work *work, uint32_t max_nonce, *noncev = mm512_intrlv_blend_32( _mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0, n+3, 0, n+2, 0, n+1, 0, n , 0 ), *noncev ); -// 
skein512_8way_init( &skein512_8way_ctx ); -// skein512_8way_update( &skein512_8way_ctx, vdata, 64 ); + skein512_8way_prehash64( &skein512_8way_ctx, vdata ); do { skeinhash_8way( hash, vdata ); @@ -85,14 +80,14 @@ int scanhash_skein_8way( struct work *work, uint32_t max_nonce, #elif defined (SKEIN_4WAY) -//static __thread skein512_4way_context skein512_4way_ctx -// __attribute__ ((aligned (64))); +static __thread skein512_4way_context skein512_4way_ctx + __attribute__ ((aligned (64))); void skeinhash_4way( void *state, const void *input ) { uint64_t vhash64[8*4] __attribute__ ((aligned (128))); skein512_4way_context ctx_skein; -// memcpy( &ctx_skein, &skein512_4way_ctx, sizeof( ctx_skein ) ); + memcpy( &ctx_skein, &skein512_4way_ctx, sizeof( ctx_skein ) ); #if defined(__SHA__) uint32_t hash0[16] __attribute__ ((aligned (64))); uint32_t hash1[16] __attribute__ ((aligned (64))); @@ -104,10 +99,7 @@ void skeinhash_4way( void *state, const void *input ) sha256_4way_context ctx_sha256; #endif - skein512_4way_full( &ctx_skein, vhash64, input, 80 ); - -// skein512_4way_update( &ctx_skein, input + (64*4), 16 ); -// skein512_4way_close( &ctx_skein, vhash64 ); + skein512_4way_final16( &ctx_skein, vhash64, input + (64*4) ); #if defined(__SHA__) dintrlv_4x64( hash0, hash1, hash2, hash3, vhash64, 512 ); @@ -156,8 +148,7 @@ int scanhash_skein_4way( struct work *work, uint32_t max_nonce, const bool bench = opt_benchmark; mm256_bswap32_intrlv80_4x64( vdata, pdata ); -// skein512_4way_init( &skein512_4way_ctx ); -// skein512_4way_update( &skein512_4way_ctx, vdata, 64 ); + skein512_4way_prehash64( &skein512_4way_ctx, vdata ); *noncev = mm256_intrlv_blend_32( _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev ); diff --git a/algo/skein/skein-hash-4way.c b/algo/skein/skein-hash-4way.c index 51b63ddc..2a36d558 100644 --- a/algo/skein/skein-hash-4way.c +++ b/algo/skein/skein-hash-4way.c @@ -728,6 +728,86 @@ void skein512_8way_full( skein512_8way_context *sc, void *out, const void 
*data, casti_m512i( out, 7 ) = h7; } +void +skein512_8way_prehash64( skein512_8way_context *sc, const void *data ) +{ + __m512i *vdata = (__m512i*)data; + __m512i *buf = sc->buf; + buf[0] = vdata[0]; + buf[1] = vdata[1]; + buf[2] = vdata[2]; + buf[3] = vdata[3]; + buf[4] = vdata[4]; + buf[5] = vdata[5]; + buf[6] = vdata[6]; + buf[7] = vdata[7]; + register __m512i h0 = m512_const1_64( 0x4903ADFF749C51CE ); + register __m512i h1 = m512_const1_64( 0x0D95DE399746DF03 ); + register __m512i h2 = m512_const1_64( 0x8FD1934127C79BCE ); + register __m512i h3 = m512_const1_64( 0x9A255629FF352CB1 ); + register __m512i h4 = m512_const1_64( 0x5DB62599DF6CA7B0 ); + register __m512i h5 = m512_const1_64( 0xEABE394CA9D5C3F4 ); + register __m512i h6 = m512_const1_64( 0x991112C71A75B523 ); + register __m512i h7 = m512_const1_64( 0xAE18A40B660FCC33 ); + uint64_t bcount = 1; + + UBI_BIG_8WAY( 224, 0 ); + sc->h0 = h0; + sc->h1 = h1; + sc->h2 = h2; + sc->h3 = h3; + sc->h4 = h4; + sc->h5 = h5; + sc->h6 = h6; + sc->h7 = h7; +} + +void +skein512_8way_final16( skein512_8way_context *sc, void *output, + const void *data ) +{ + __m512i *in = (__m512i*)data; + __m512i *buf = sc->buf; + __m512i *out = (__m512i*)output; + register __m512i h0 = sc->h0; + register __m512i h1 = sc->h1; + register __m512i h2 = sc->h2; + register __m512i h3 = sc->h3; + register __m512i h4 = sc->h4; + register __m512i h5 = sc->h5; + register __m512i h6 = sc->h6; + register __m512i h7 = sc->h7; + + const __m512i zero = m512_zero; + buf[0] = in[0]; + buf[1] = in[1]; + buf[2] = zero; + buf[3] = zero; + buf[4] = zero; + buf[5] = zero; + buf[6] = zero; + buf[7] = zero; + + uint64_t bcount = 1; + UBI_BIG_8WAY( 352, 16 ); + + buf[0] = zero; + buf[1] = zero; + + bcount = 0; + UBI_BIG_8WAY( 510, 8 ); + + out[0] = h0; + out[1] = h1; + out[2] = h2; + out[3] = h3; + out[4] = h4; + out[5] = h5; + out[6] = h6; + out[7] = h7; +} + + void skein256_8way_update(void *cc, const void *data, size_t len) { @@ -942,6 +1022,83 @@ 
skein512_4way_full( skein512_4way_context *sc, void *out, const void *data, casti_m256i( out, 7 ) = h7; } +void +skein512_4way_prehash64( skein512_4way_context *sc, const void *data ) +{ + __m256i *vdata = (__m256i*)data; + __m256i *buf = sc->buf; + buf[0] = vdata[0]; + buf[1] = vdata[1]; + buf[2] = vdata[2]; + buf[3] = vdata[3]; + buf[4] = vdata[4]; + buf[5] = vdata[5]; + buf[6] = vdata[6]; + buf[7] = vdata[7]; + register __m256i h0 = m256_const1_64( 0x4903ADFF749C51CE ); + register __m256i h1 = m256_const1_64( 0x0D95DE399746DF03 ); + register __m256i h2 = m256_const1_64( 0x8FD1934127C79BCE ); + register __m256i h3 = m256_const1_64( 0x9A255629FF352CB1 ); + register __m256i h4 = m256_const1_64( 0x5DB62599DF6CA7B0 ); + register __m256i h5 = m256_const1_64( 0xEABE394CA9D5C3F4 ); + register __m256i h6 = m256_const1_64( 0x991112C71A75B523 ); + register __m256i h7 = m256_const1_64( 0xAE18A40B660FCC33 ); + uint64_t bcount = 1; + + UBI_BIG_4WAY( 224, 0 ); + sc->h0 = h0; + sc->h1 = h1; + sc->h2 = h2; + sc->h3 = h3; + sc->h4 = h4; + sc->h5 = h5; + sc->h6 = h6; + sc->h7 = h7; +} + +void +skein512_4way_final16( skein512_4way_context *sc, void *out, const void *data ) +{ + __m256i *vdata = (__m256i*)data; + __m256i *buf = sc->buf; + register __m256i h0 = sc->h0; + register __m256i h1 = sc->h1; + register __m256i h2 = sc->h2; + register __m256i h3 = sc->h3; + register __m256i h4 = sc->h4; + register __m256i h5 = sc->h5; + register __m256i h6 = sc->h6; + register __m256i h7 = sc->h7; + + const __m256i zero = m256_zero; + buf[0] = vdata[0]; + buf[1] = vdata[1]; + buf[2] = zero; + buf[3] = zero; + buf[4] = zero; + buf[5] = zero; + buf[6] = zero; + buf[7] = zero; + + uint64_t bcount = 1; + UBI_BIG_4WAY( 352, 16 ); + + buf[0] = zero; + buf[1] = zero; + + bcount = 0; + UBI_BIG_4WAY( 510, 8 ); + + casti_m256i( out, 0 ) = h0; + casti_m256i( out, 1 ) = h1; + casti_m256i( out, 2 ) = h2; + casti_m256i( out, 3 ) = h3; + casti_m256i( out, 4 ) = h4; + casti_m256i( out, 5 ) = h5; + 
casti_m256i( out, 6 ) = h6; + casti_m256i( out, 7 ) = h7; +} + void skein256_4way_update(void *cc, const void *data, size_t len) { diff --git a/algo/skein/skein-hash-4way.h b/algo/skein/skein-hash-4way.h index c60ba5d3..eb857d29 100644 --- a/algo/skein/skein-hash-4way.h +++ b/algo/skein/skein-hash-4way.h @@ -69,6 +69,10 @@ void skein512_8way_init( skein512_8way_context *sc ); void skein512_8way_update( void *cc, const void *data, size_t len ); void skein512_8way_close( void *cc, void *dst ); +void skein512_8way_prehash64( skein512_8way_context *sc, const void *data ); +void skein512_8way_final16( skein512_8way_context *sc, void *out, + const void *data ); + void skein256_8way_init( skein256_8way_context *sc ); void skein256_8way_update( void *cc, const void *data, size_t len ); void skein256_8way_close( void *cc, void *dst ); @@ -96,6 +100,10 @@ void skein256_4way_init( skein256_4way_context *sc ); void skein256_4way_update( void *cc, const void *data, size_t len ); void skein256_4way_close( void *cc, void *dst ); +void skein512_4way_prehash64( skein512_4way_context *sc, const void *data ); +void skein512_4way_final16( skein512_4way_context *sc, void *out, + const void *data ); + #ifdef __cplusplus } #endif diff --git a/algo/skein/skein2-4way.c b/algo/skein/skein2-4way.c index 2469271b..6fd1c274 100644 --- a/algo/skein/skein2-4way.c +++ b/algo/skein/skein2-4way.c @@ -5,20 +5,16 @@ #if defined(SKEIN_8WAY) -// static __thread skein512_8way_context skein512_8way_ctx -// __attribute__ ((aligned (64))); + static __thread skein512_8way_context skein512_8way_ctx + __attribute__ ((aligned (64))); void skein2hash_8way( void *output, const void *input ) { uint64_t hash[16*8] __attribute__ ((aligned (128))); skein512_8way_context ctx; -// memcpy( &ctx, &skein512_8way_ctx, sizeof( ctx ) ); - - skein512_8way_full( &ctx, hash, input, 80 ); - -// skein512_8way_update( &ctx, input + (64*8), 16 ); -// skein512_8way_close( &ctx, hash ); + memcpy( &ctx, &skein512_8way_ctx, sizeof( 
ctx ) ); + skein512_8way_final16( &ctx, hash, input + (64*8) ); skein512_8way_full( &ctx, output, hash, 64 ); } @@ -38,16 +34,17 @@ int scanhash_skein2_8way( struct work *work, uint32_t max_nonce, __m512i *noncev = (__m512i*)vdata + 9; const int thr_id = mythr->id; const bool bench = opt_benchmark; + skein512_8way_context ctx; mm512_bswap32_intrlv80_8x64( vdata, pdata ); *noncev = mm512_intrlv_blend_32( _mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0, n+3, 0, n+2, 0, n+1, 0, n , 0 ), *noncev ); -// skein512_8way_init( &skein512_8way_ctx ); -// skein512_8way_update( &skein512_8way_ctx, vdata, 64 ); + skein512_8way_prehash64( &ctx, vdata ); do { - skein2hash_8way( hash, vdata ); + skein512_8way_final16( &ctx, hash, vdata + (16*8) ); + skein512_8way_full( &ctx, hash, hash, 64 ); for ( int lane = 0; lane < 8; lane++ ) if ( unlikely( hashq3[ lane ] <= targq3 && !bench ) ) @@ -71,19 +68,16 @@ int scanhash_skein2_8way( struct work *work, uint32_t max_nonce, #elif defined(SKEIN_4WAY) -//static __thread skein512_4way_context skein512_4way_ctx -// __attribute__ ((aligned (64))); +static __thread skein512_4way_context skein512_4way_ctx + __attribute__ ((aligned (64))); void skein2hash_4way( void *output, const void *input ) { skein512_4way_context ctx; -// memcpy( &ctx, &skein512_4way_ctx, sizeof( ctx ) ); + memcpy( &ctx, &skein512_4way_ctx, sizeof( ctx ) ); uint64_t hash[16*4] __attribute__ ((aligned (64))); -// skein512_4way_update( &ctx, input + (64*4), 16 ); -// skein512_4way_close( &ctx, hash ); - - skein512_4way_full( &ctx, hash, input, 80 ); + skein512_4way_final16( &ctx, hash, input + (64*4) ); skein512_4way_full( &ctx, output, hash, 64 ); } @@ -103,15 +97,16 @@ int scanhash_skein2_4way( struct work *work, uint32_t max_nonce, __m256i *noncev = (__m256i*)vdata + 9; const int thr_id = mythr->id; const bool bench = opt_benchmark; + skein512_4way_context ctx; mm256_bswap32_intrlv80_4x64( vdata, pdata ); -// skein512_4way_init( &skein512_4way_ctx ); -// 
skein512_4way_update( &skein512_4way_ctx, vdata, 64 ); + skein512_4way_prehash64( &ctx, vdata ); *noncev = mm256_intrlv_blend_32( _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev ); do { - skein2hash_4way( hash, vdata ); + skein512_4way_final16( &ctx, hash, vdata + (16*4) ); + skein512_4way_full( &ctx, hash, hash, 64 ); for ( int lane = 0; lane < 4; lane++ ) if ( hash_q3[ lane ] <= targ_q3 ) diff --git a/algo/whirlpool/sph_whirlpool.h b/algo/whirlpool/sph_whirlpool.h index 70dc7fa4..801a9f92 100644 --- a/algo/whirlpool/sph_whirlpool.h +++ b/algo/whirlpool/sph_whirlpool.h @@ -120,6 +120,13 @@ void sph_whirlpool(void *cc, const void *data, size_t len); */ void sph_whirlpool_close(void *cc, void *dst); +#define sph_whirlpool512_full( cc, dst, data, len ) \ +do{ \ + sph_whirlpool_init( cc ); \ + sph_whirlpool( cc, data, len ); \ + sph_whirlpool_close( cc, dst ); \ +}while(0) + /** * WHIRLPOOL-0 uses the same structure than plain WHIRLPOOL. */ diff --git a/algo/x13/skunk-4way.c b/algo/x13/skunk-4way.c index 566f5458..c7f6fd68 100644 --- a/algo/x13/skunk-4way.c +++ b/algo/x13/skunk-4way.c @@ -35,8 +35,7 @@ void skunk_8way_hash( void *output, const void *input ) skunk_8way_ctx_holder ctx __attribute__ ((aligned (64))); memcpy( &ctx, &skunk_8way_ctx, sizeof(skunk_8way_ctx) ); - skein512_8way_update( &ctx.skein, input, 80 ); - skein512_8way_close( &ctx.skein, vhash ); + skein512_8way_final16( &ctx.skein, vhash, input ); dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, vhash, 512 ); @@ -104,35 +103,35 @@ int scanhash_skunk_8way( struct work *work, uint32_t max_nonce, uint32_t *pdata = work->data; uint32_t *ptarget = work->target; const uint32_t first_nonce = pdata[19]; + const uint32_t last_nonce = max_nonce - 8; uint32_t n = first_nonce; - __m512i *noncev = (__m512i*)vdata + 9; // aligned - const uint32_t Htarg = ptarget[7]; - int thr_id = mythr->id; + __m512i *noncev = (__m512i*)vdata + 9; + const int thr_id = mythr->id; volatile uint8_t 
*restart = &(work_restart[thr_id].restart); + const bool bench = opt_benchmark; - if ( opt_benchmark ) - ((uint32_t*)ptarget)[7] = 0x0cff; + if ( bench ) ptarget[7] = 0x0fff; mm512_bswap32_intrlv80_8x64( vdata, pdata ); + skein512_8way_prehash64( &skunk_8way_ctx.skein, vdata ); + *noncev = mm512_intrlv_blend_32( + _mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0, + n+3, 0, n+2, 0, n+1, 0, n , 0 ), *noncev ); do { - *noncev = mm512_intrlv_blend_32( mm512_bswap_32( - _mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0, - n+3, 0, n+2, 0, n+1, 0, n , 0 ) ), *noncev ); - skunk_8way_hash( hash, vdata ); - pdata[19] = n; for ( int i = 0; i < 8; i++ ) - if ( unlikely( (hash+(i<<3))[7] <= Htarg ) ) - if ( likely( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark ) ) + if ( unlikely( valid_hash( hash+(i<<3), ptarget ) && !bench ) ) { - pdata[19] = n+i; + pdata[19] = bswap_32( n+i ); submit_lane_solution( work, hash+(i<<3), mythr, i ); } + *noncev = _mm512_add_epi32( *noncev, + m512_const1_64( 0x0000000800000000 ) ); n +=8; - } while ( likely( ( n < max_nonce-8 ) && !(*restart) ) ); - + } while ( likely( ( n < last_nonce ) && !( *restart ) ) ); + pdata[19] = n; *hashes_done = n - first_nonce; return 0; } @@ -159,17 +158,16 @@ static __thread skunk_4way_ctx_holder skunk_4way_ctx; void skunk_4way_hash( void *output, const void *input ) { + uint64_t vhash[8*4] __attribute__ ((aligned (128))); uint64_t hash0[8] __attribute__ ((aligned (64))); uint64_t hash1[8] __attribute__ ((aligned (64))); uint64_t hash2[8] __attribute__ ((aligned (64))); uint64_t hash3[8] __attribute__ ((aligned (64))); - uint64_t vhash[8*4] __attribute__ ((aligned (64))); skunk_4way_ctx_holder ctx __attribute__ ((aligned (64))); memcpy( &ctx, &skunk_4way_ctx, sizeof(skunk_4way_ctx) ); - skein512_4way_update( &ctx.skein, input, 80 ); - skein512_4way_close( &ctx.skein, vhash ); + skein512_4way_final16( &ctx.skein, vhash, input + (64*4) ); dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); 
cubehashUpdateDigest( &ctx.cube, (byte*) hash0, (const byte*)hash0, 64 ); @@ -213,40 +211,40 @@ void skunk_4way_hash( void *output, const void *input ) int scanhash_skunk_4way( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ) { - uint32_t hash[4*8] __attribute__ ((aligned (64))); - uint32_t vdata[24*4] __attribute__ ((aligned (64))); + uint32_t hash[4*8] __attribute__ ((aligned (128))); + uint32_t vdata[20*4] __attribute__ ((aligned (64))); uint32_t *pdata = work->data; uint32_t *ptarget = work->target; const uint32_t first_nonce = pdata[19]; + const uint32_t last_nonce = max_nonce - 4; uint32_t n = first_nonce; - __m256i *noncev = (__m256i*)vdata + 9; // aligned - const uint32_t Htarg = ptarget[7]; - int thr_id = mythr->id; // thr_id arg is deprecated - volatile uint8_t *restart = &(work_restart[thr_id].restart); + __m256i *noncev = (__m256i*)vdata + 9; + const int thr_id = mythr->id; + volatile uint8_t *restart = &( work_restart[ thr_id ].restart ); + const bool bench = opt_benchmark; - if ( opt_benchmark ) - ((uint32_t*)ptarget)[7] = 0x0cff; + if ( bench ) ptarget[7] = 0x0fff; mm256_bswap32_intrlv80_4x64( vdata, pdata ); + skein512_4way_prehash64( &skunk_4way_ctx.skein, vdata ); + *noncev = mm256_intrlv_blend_32( + _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev ); do { - *noncev = mm256_intrlv_blend_32( mm256_bswap_32( - _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev ); - skunk_4way_hash( hash, vdata ); - pdata[19] = n; for ( int i = 0; i < 4; i++ ) - if ( (hash+(i<<3))[7] <= Htarg ) - if ( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark ) + if ( unlikely( valid_hash( hash+(i<<3), ptarget ) && !bench ) ) { - pdata[19] = n+i; + pdata[19] = bswap_32( n + i ); submit_lane_solution( work, hash+(i<<3), mythr, i ); } + *noncev = _mm256_add_epi32( *noncev, + m256_const1_64( 0x0000000400000000 ) ); n +=4; - } while ( ( n < max_nonce ) && !(*restart) ); - - *hashes_done = n - first_nonce + 1; + } while ( 
likely( ( n < last_nonce ) && !( *restart ) ) ); + pdata[19] = n; + *hashes_done = n - first_nonce; return 0; } diff --git a/algo/x16/hex.c b/algo/x16/hex.c index bd9294e2..2ed56479 100644 --- a/algo/x16/hex.c +++ b/algo/x16/hex.c @@ -47,6 +47,7 @@ static void hex_getAlgoString(const uint32_t* prevblock, char *output) *sptr = '\0'; } +/* union _hex_context_overlay { #if defined(__AES__) @@ -63,7 +64,7 @@ union _hex_context_overlay sph_keccak512_context keccak; hashState_luffa luffa; cubehashParam cube; - sph_shavite512_context shavite; + shavite512_context shavite; hashState_sd simd; sph_hamsi512_context hamsi; sph_fugue512_context fugue; @@ -72,13 +73,14 @@ union _hex_context_overlay SHA512_CTX sha512; }; typedef union _hex_context_overlay hex_context_overlay; +*/ -static __thread hex_context_overlay hex_ctx; +static __thread x16r_context_overlay hex_ctx; void hex_hash( void* output, const void* input ) { uint32_t _ALIGN(128) hash[16]; - hex_context_overlay ctx; + x16r_context_overlay ctx; memcpy( &ctx, &hex_ctx, sizeof(ctx) ); void *in = (void*) input; int size = 80; @@ -157,9 +159,7 @@ void hex_hash( void* output, const void* input ) } break; case SHAVITE: - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, in, size ); - sph_shavite512_close( &ctx.shavite, hash ); + shavite512_full( &ctx.shavite, hash, in, size ); break; case SIMD: init_sd( &ctx.simd, 512 ); @@ -187,9 +187,7 @@ void hex_hash( void* output, const void* input ) sph_hamsi512_close( &ctx.hamsi, hash ); break; case FUGUE: - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, in, size ); - sph_fugue512_close( &ctx.fugue, hash ); + sph_fugue512_full( &ctx.fugue, hash, in, size ); break; case SHABAL: if ( i == 0 ) @@ -203,13 +201,12 @@ void hex_hash( void* output, const void* input ) break; case WHIRLPOOL: if ( i == 0 ) - sph_whirlpool( &ctx.whirlpool, in+64, 16 ); - else { - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, in, size ); + sph_whirlpool( 
&ctx.whirlpool, in+64, 16 ); + sph_whirlpool_close( &ctx.whirlpool, hash ); } - sph_whirlpool_close( &ctx.whirlpool, hash ); + else + sph_whirlpool512_full( &ctx.whirlpool, hash, in, size ); break; case SHA_512: SHA512_Init( &ctx.sha512 ); diff --git a/algo/x16/x16r-4way.c b/algo/x16/x16r-4way.c index 32a39a8f..4f3880ab 100644 --- a/algo/x16/x16r-4way.c +++ b/algo/x16/x16r-4way.c @@ -287,30 +287,14 @@ void x16r_8way_hash_generic( void* output, const void* input ) shavite512_4way_full( &ctx.shavite, vhash, vhash, size ); dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash ); #else - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, in0, size ); - sph_shavite512_close( &ctx.shavite, hash0 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, in1, size ); - sph_shavite512_close( &ctx.shavite, hash1 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, in2, size ); - sph_shavite512_close( &ctx.shavite, hash2 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, in3, size ); - sph_shavite512_close( &ctx.shavite, hash3 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, in4, size ); - sph_shavite512_close( &ctx.shavite, hash4 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, in5, size ); - sph_shavite512_close( &ctx.shavite, hash5 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, in6, size ); - sph_shavite512_close( &ctx.shavite, hash6 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, in7, size ); - sph_shavite512_close( &ctx.shavite, hash7 ); + shavite512_full( &ctx.shavite, hash0, in0, size ); + shavite512_full( &ctx.shavite, hash1, in1, size ); + shavite512_full( &ctx.shavite, hash2, in2, size ); + shavite512_full( &ctx.shavite, hash3, in3, size ); + shavite512_full( &ctx.shavite, hash4, in4, size ); + shavite512_full( &ctx.shavite, hash5, in5, size ); + shavite512_full( &ctx.shavite, hash6, in6, 
size ); + shavite512_full( &ctx.shavite, hash7, in7, size ); #endif break; case SIMD: @@ -363,30 +347,14 @@ void x16r_8way_hash_generic( void* output, const void* input ) hash7, vhash ); break; case FUGUE: - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, in0, size ); - sph_fugue512_close( &ctx.fugue, hash0 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, in1, size ); - sph_fugue512_close( &ctx.fugue, hash1 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, in2, size ); - sph_fugue512_close( &ctx.fugue, hash2 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, in3, size ); - sph_fugue512_close( &ctx.fugue, hash3 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, in4, size ); - sph_fugue512_close( &ctx.fugue, hash4 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, in5, size ); - sph_fugue512_close( &ctx.fugue, hash5 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, in6, size ); - sph_fugue512_close( &ctx.fugue, hash6 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, in7, size ); - sph_fugue512_close( &ctx.fugue, hash7 ); + sph_fugue512_full( &ctx.fugue, hash0, in0, size ); + sph_fugue512_full( &ctx.fugue, hash1, in1, size ); + sph_fugue512_full( &ctx.fugue, hash2, in2, size ); + sph_fugue512_full( &ctx.fugue, hash3, in3, size ); + sph_fugue512_full( &ctx.fugue, hash4, in4, size ); + sph_fugue512_full( &ctx.fugue, hash5, in5, size ); + sph_fugue512_full( &ctx.fugue, hash6, in6, size ); + sph_fugue512_full( &ctx.fugue, hash7, in7, size ); break; case SHABAL: intrlv_8x32( vhash, in0, in1, in2, in3, in4, in5, in6, in7, @@ -431,30 +399,14 @@ void x16r_8way_hash_generic( void* output, const void* input ) } else { - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, in0, size ); - sph_whirlpool_close( &ctx.whirlpool, hash0 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, in1, size ); - sph_whirlpool_close( 
&ctx.whirlpool, hash1 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, in2, size ); - sph_whirlpool_close( &ctx.whirlpool, hash2 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, in3, size ); - sph_whirlpool_close( &ctx.whirlpool, hash3 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, in4, size ); - sph_whirlpool_close( &ctx.whirlpool, hash4 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, in5, size ); - sph_whirlpool_close( &ctx.whirlpool, hash5 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, in6, size ); - sph_whirlpool_close( &ctx.whirlpool, hash6 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, in7, size ); - sph_whirlpool_close( &ctx.whirlpool, hash7 ); + sph_whirlpool512_full( &ctx.whirlpool, hash0, in0, size ); + sph_whirlpool512_full( &ctx.whirlpool, hash1, in1, size ); + sph_whirlpool512_full( &ctx.whirlpool, hash2, in2, size ); + sph_whirlpool512_full( &ctx.whirlpool, hash3, in3, size ); + sph_whirlpool512_full( &ctx.whirlpool, hash4, in4, size ); + sph_whirlpool512_full( &ctx.whirlpool, hash5, in5, size ); + sph_whirlpool512_full( &ctx.whirlpool, hash6, in6, size ); + sph_whirlpool512_full( &ctx.whirlpool, hash7, in7, size ); } break; case SHA_512: @@ -576,8 +528,7 @@ void x16r_4way_prehash( void *vdata, void *pdata ) break; case SKEIN: mm256_bswap32_intrlv80_4x64( vdata, pdata ); - skein512_4way_init( &x16r_ctx.skein ); - skein512_4way_update( &x16r_ctx.skein, vdata, 64 ); + skein512_4way_prehash64( &x16r_ctx.skein, vdata ); break; case LUFFA: mm128_bswap32_80( edata, pdata ); @@ -692,10 +643,7 @@ void x16r_4way_hash_generic( void* output, const void* input ) break; case SKEIN: if ( i == 0 ) - { - skein512_4way_update( &ctx.skein, input + (64<<2), 16 ); - skein512_4way_close( &ctx.skein, vhash ); - } + skein512_4way_final16( &ctx.skein, vhash, input + (64*4) ); else { intrlv_4x64( vhash, in0, 
in1, in2, in3, size<<3 ); @@ -756,18 +704,10 @@ void x16r_4way_hash_generic( void* output, const void* input ) } break; case SHAVITE: - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, in0, size ); - sph_shavite512_close( &ctx.shavite, hash0 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, in1, size ); - sph_shavite512_close( &ctx.shavite, hash1 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, in2, size ); - sph_shavite512_close( &ctx.shavite, hash2 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, in3, size ); - sph_shavite512_close( &ctx.shavite, hash3 ); + shavite512_full( &ctx.shavite, hash0, in0, size ); + shavite512_full( &ctx.shavite, hash1, in1, size ); + shavite512_full( &ctx.shavite, hash2, in2, size ); + shavite512_full( &ctx.shavite, hash3, in3, size ); break; case SIMD: intrlv_2x128( vhash, in0, in1, size<<3 ); @@ -800,18 +740,10 @@ void x16r_4way_hash_generic( void* output, const void* input ) dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); break; case FUGUE: - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, in0, size ); - sph_fugue512_close( &ctx.fugue, hash0 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, in1, size ); - sph_fugue512_close( &ctx.fugue, hash1 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, in2, size ); - sph_fugue512_close( &ctx.fugue, hash2 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, in3, size ); - sph_fugue512_close( &ctx.fugue, hash3 ); + sph_fugue512_full( &ctx.fugue, hash0, in0, size ); + sph_fugue512_full( &ctx.fugue, hash1, in1, size ); + sph_fugue512_full( &ctx.fugue, hash2, in2, size ); + sph_fugue512_full( &ctx.fugue, hash3, in3, size ); break; case SHABAL: intrlv_4x32( vhash, in0, in1, in2, in3, size<<3 ); @@ -842,18 +774,10 @@ void x16r_4way_hash_generic( void* output, const void* input ) } else { - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( 
&ctx.whirlpool, in0, size ); - sph_whirlpool_close( &ctx.whirlpool, hash0 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, in1, size ); - sph_whirlpool_close( &ctx.whirlpool, hash1 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, in2, size ); - sph_whirlpool_close( &ctx.whirlpool, hash2 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, in3, size ); - sph_whirlpool_close( &ctx.whirlpool, hash3 ); + sph_whirlpool512_full( &ctx.whirlpool, hash0, in0, size ); + sph_whirlpool512_full( &ctx.whirlpool, hash1, in1, size ); + sph_whirlpool512_full( &ctx.whirlpool, hash2, in2, size ); + sph_whirlpool512_full( &ctx.whirlpool, hash3, in3, size ); } break; case SHA_512: diff --git a/algo/x16/x16r-gate.h b/algo/x16/x16r-gate.h index d58aab83..a75ca829 100644 --- a/algo/x16/x16r-gate.h +++ b/algo/x16/x16r-gate.h @@ -121,7 +121,7 @@ union _x16r_8way_context_overlay echo_4way_context echo; #else hashState_groestl groestl; - sph_shavite512_context shavite; + shavite512_context shavite; hashState_echo echo; #endif } __attribute__ ((aligned (64))); @@ -152,7 +152,7 @@ union _x16r_4way_context_overlay luffa_2way_context luffa; hashState_luffa luffa1; cubehashParam cube; - sph_shavite512_context shavite; + shavite512_context shavite; simd_2way_context simd; hamsi512_4way_context hamsi; sph_fugue512_context fugue; @@ -191,7 +191,7 @@ union _x16r_context_overlay sph_keccak512_context keccak; hashState_luffa luffa; cubehashParam cube; - sph_shavite512_context shavite; + shavite512_context shavite; hashState_sd simd; sph_hamsi512_context hamsi; sph_fugue512_context fugue; diff --git a/algo/x16/x16r.c b/algo/x16/x16r.c index 08fd5317..09e89665 100644 --- a/algo/x16/x16r.c +++ b/algo/x16/x16r.c @@ -124,9 +124,7 @@ void x16r_hash_generic( void* output, const void* input ) (byte*)in, size ); break; case SHAVITE: - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, in, size ); - 
sph_shavite512_close( &ctx.shavite, hash ); + shavite512_full( &ctx.shavite, hash, in, size ); break; case SIMD: simd_full( &ctx.simd, (BitSequence *)hash, @@ -153,9 +151,7 @@ void x16r_hash_generic( void* output, const void* input ) sph_hamsi512_close( &ctx.hamsi, hash ); break; case FUGUE: - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, in, size ); - sph_fugue512_close( &ctx.fugue, hash ); + sph_fugue512_full( &ctx.fugue, hash, in, size ); break; case SHABAL: if ( i == 0 ) @@ -169,13 +165,12 @@ void x16r_hash_generic( void* output, const void* input ) break; case WHIRLPOOL: if ( i == 0 ) - sph_whirlpool( &ctx.whirlpool, in+64, 16 ); - else { - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, in, size ); + sph_whirlpool( &ctx.whirlpool, in+64, 16 ); + sph_whirlpool_close( &ctx.whirlpool, hash ); } - sph_whirlpool_close( &ctx.whirlpool, hash ); + else + sph_whirlpool512_full( &ctx.whirlpool, hash, in, size ); break; case SHA_512: SHA512_Init( &ctx.sha512 ); diff --git a/algo/x16/x16rv2-4way.c b/algo/x16/x16rv2-4way.c index 33d4a89f..9a7ea443 100644 --- a/algo/x16/x16rv2-4way.c +++ b/algo/x16/x16rv2-4way.c @@ -57,7 +57,7 @@ union _x16rv2_8way_context_overlay echo_4way_context echo; #else hashState_groestl groestl; - sph_shavite512_context shavite; + shavite512_context shavite; hashState_echo echo; #endif } __attribute__ ((aligned (64))); @@ -371,30 +371,14 @@ void x16rv2_8way_hash( void* output, const void* input ) shavite512_4way_full( &ctx.shavite, vhash, vhash, size ); dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash ); #else - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, in0, size ); - sph_shavite512_close( &ctx.shavite, hash0 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, in1, size ); - sph_shavite512_close( &ctx.shavite, hash1 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, in2, size ); - sph_shavite512_close( &ctx.shavite, hash2 ); - 
sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, in3, size ); - sph_shavite512_close( &ctx.shavite, hash3 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, in4, size ); - sph_shavite512_close( &ctx.shavite, hash4 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, in5, size ); - sph_shavite512_close( &ctx.shavite, hash5 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, in6, size ); - sph_shavite512_close( &ctx.shavite, hash6 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, in7, size ); - sph_shavite512_close( &ctx.shavite, hash7 ); + shavite512_full( &ctx.shavite, hash0, in0, size ); + shavite512_full( &ctx.shavite, hash1, in1, size ); + shavite512_full( &ctx.shavite, hash2, in2, size ); + shavite512_full( &ctx.shavite, hash3, in3, size ); + shavite512_full( &ctx.shavite, hash4, in4, size ); + shavite512_full( &ctx.shavite, hash5, in5, size ); + shavite512_full( &ctx.shavite, hash6, in6, size ); + shavite512_full( &ctx.shavite, hash7, in7, size ); #endif break; case SIMD: @@ -448,30 +432,14 @@ void x16rv2_8way_hash( void* output, const void* input ) hash7, vhash ); break; case FUGUE: - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, in0, size ); - sph_fugue512_close( &ctx.fugue, hash0 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, in1, size ); - sph_fugue512_close( &ctx.fugue, hash1 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, in2, size ); - sph_fugue512_close( &ctx.fugue, hash2 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, in3, size ); - sph_fugue512_close( &ctx.fugue, hash3 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, in4, size ); - sph_fugue512_close( &ctx.fugue, hash4 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, in5, size ); - sph_fugue512_close( &ctx.fugue, hash5 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, in6, 
size ); - sph_fugue512_close( &ctx.fugue, hash6 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, in7, size ); - sph_fugue512_close( &ctx.fugue, hash7 ); + sph_fugue512_full( &ctx.fugue, hash0, in0, size ); + sph_fugue512_full( &ctx.fugue, hash1, in1, size ); + sph_fugue512_full( &ctx.fugue, hash2, in2, size ); + sph_fugue512_full( &ctx.fugue, hash3, in3, size ); + sph_fugue512_full( &ctx.fugue, hash4, in4, size ); + sph_fugue512_full( &ctx.fugue, hash5, in5, size ); + sph_fugue512_full( &ctx.fugue, hash6, in6, size ); + sph_fugue512_full( &ctx.fugue, hash7, in7, size ); break; case SHABAL: intrlv_8x32( vhash, in0, in1, in2, in3, in4, in5, in6, in7, @@ -516,30 +484,14 @@ void x16rv2_8way_hash( void* output, const void* input ) } else { - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, in0, size ); - sph_whirlpool_close( &ctx.whirlpool, hash0 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, in1, size ); - sph_whirlpool_close( &ctx.whirlpool, hash1 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, in2, size ); - sph_whirlpool_close( &ctx.whirlpool, hash2 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, in3, size ); - sph_whirlpool_close( &ctx.whirlpool, hash3 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, in4, size ); - sph_whirlpool_close( &ctx.whirlpool, hash4 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, in5, size ); - sph_whirlpool_close( &ctx.whirlpool, hash5 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, in6, size ); - sph_whirlpool_close( &ctx.whirlpool, hash6 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, in7, size ); - sph_whirlpool_close( &ctx.whirlpool, hash7 ); + sph_whirlpool512_full( &ctx.whirlpool, hash0, in0, size ); + sph_whirlpool512_full( &ctx.whirlpool, hash1, in1, size ); + sph_whirlpool512_full( &ctx.whirlpool, 
hash2, in2, size ); + sph_whirlpool512_full( &ctx.whirlpool, hash3, in3, size ); + sph_whirlpool512_full( &ctx.whirlpool, hash4, in4, size ); + sph_whirlpool512_full( &ctx.whirlpool, hash5, in5, size ); + sph_whirlpool512_full( &ctx.whirlpool, hash6, in6, size ); + sph_whirlpool512_full( &ctx.whirlpool, hash7, in7, size ); } break; case SHA_512: @@ -747,7 +699,7 @@ union _x16rv2_4way_context_overlay keccak512_4way_context keccak; luffa_2way_context luffa; cubehashParam cube; - sph_shavite512_context shavite; + shavite512_context shavite; simd_2way_context simd; hamsi512_4way_context hamsi; sph_fugue512_context fugue; @@ -831,47 +783,47 @@ void x16rv2_4way_hash( void* output, const void* input ) dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); break; case KECCAK: - if ( i == 0 ) - { - sph_tiger( &ctx.tiger, in0 + 64, 16 ); - sph_tiger_close( &ctx.tiger, hash0 ); - memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) ); - sph_tiger( &ctx.tiger, in1 + 64, 16 ); - sph_tiger_close( &ctx.tiger, hash1 ); - memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) ); - sph_tiger( &ctx.tiger, in2 + 64, 16 ); - sph_tiger_close( &ctx.tiger, hash2 ); - memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) ); - sph_tiger( &ctx.tiger, in3 + 64, 16 ); - sph_tiger_close( &ctx.tiger, hash3 ); - } - else - { - sph_tiger_init( &ctx.tiger ); - sph_tiger( &ctx.tiger, in0, size ); - sph_tiger_close( &ctx.tiger, hash0 ); - sph_tiger_init( &ctx.tiger ); - sph_tiger( &ctx.tiger, in1, size ); - sph_tiger_close( &ctx.tiger, hash1 ); - sph_tiger_init( &ctx.tiger ); - sph_tiger( &ctx.tiger, in2, size ); - sph_tiger_close( &ctx.tiger, hash2 ); - sph_tiger_init( &ctx.tiger ); - sph_tiger( &ctx.tiger, in3, size ); - sph_tiger_close( &ctx.tiger, hash3 ); - } - for ( int i = (24/4); i < (64/4); i++ ) + if ( i == 0 ) + { + sph_tiger( &ctx.tiger, in0 + 64, 16 ); + sph_tiger_close( &ctx.tiger, hash0 ); + memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) ); + sph_tiger( &ctx.tiger, in1 + 64, 16 ); + sph_tiger_close( &ctx.tiger, hash1 ); + memcpy( &ctx, 
&x16rv2_ctx, sizeof(ctx) ); + sph_tiger( &ctx.tiger, in2 + 64, 16 ); + sph_tiger_close( &ctx.tiger, hash2 ); + memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) ); + sph_tiger( &ctx.tiger, in3 + 64, 16 ); + sph_tiger_close( &ctx.tiger, hash3 ); + } + else + { + sph_tiger_init( &ctx.tiger ); + sph_tiger( &ctx.tiger, in0, size ); + sph_tiger_close( &ctx.tiger, hash0 ); + sph_tiger_init( &ctx.tiger ); + sph_tiger( &ctx.tiger, in1, size ); + sph_tiger_close( &ctx.tiger, hash1 ); + sph_tiger_init( &ctx.tiger ); + sph_tiger( &ctx.tiger, in2, size ); + sph_tiger_close( &ctx.tiger, hash2 ); + sph_tiger_init( &ctx.tiger ); + sph_tiger( &ctx.tiger, in3, size ); + sph_tiger_close( &ctx.tiger, hash3 ); + } + for ( int i = (24/4); i < (64/4); i++ ) hash0[i] = hash1[i] = hash2[i] = hash3[i] = 0; - intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 ); - keccak512_4way_init( &ctx.keccak ); - keccak512_4way_update( &ctx.keccak, vhash, 64 ); - keccak512_4way_close( &ctx.keccak, vhash ); - dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); + intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 ); + keccak512_4way_init( &ctx.keccak ); + keccak512_4way_update( &ctx.keccak, vhash, 64 ); + keccak512_4way_close( &ctx.keccak, vhash ); + dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); break; case SKEIN: if ( i == 0 ) - skein512_4way_update( &ctx.skein, input + (64<<2), 16 ); + skein512_4way_final16( &ctx.skein, vhash, input + (64*4) ); else { intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 ); @@ -882,46 +834,46 @@ void x16rv2_4way_hash( void* output, const void* input ) dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); break; case LUFFA: - if ( i == 0 ) - { - sph_tiger( &ctx.tiger, in0 + 64, 16 ); - sph_tiger_close( &ctx.tiger, hash0 ); - memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) ); - sph_tiger( &ctx.tiger, in1 + 64, 16 ); - sph_tiger_close( &ctx.tiger, hash1 ); - memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) ); - sph_tiger( &ctx.tiger, in2 + 64, 16 ); - sph_tiger_close( &ctx.tiger, hash2 ); - 
memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) ); - sph_tiger( &ctx.tiger, in3 + 64, 16 ); - sph_tiger_close( &ctx.tiger, hash3 ); - } - else - { - sph_tiger_init( &ctx.tiger ); - sph_tiger( &ctx.tiger, in0, size ); - sph_tiger_close( &ctx.tiger, hash0 ); - sph_tiger_init( &ctx.tiger ); - sph_tiger( &ctx.tiger, in1, size ); - sph_tiger_close( &ctx.tiger, hash1 ); - sph_tiger_init( &ctx.tiger ); - sph_tiger( &ctx.tiger, in2, size ); - sph_tiger_close( &ctx.tiger, hash2 ); - sph_tiger_init( &ctx.tiger ); - sph_tiger( &ctx.tiger, in3, size ); - sph_tiger_close( &ctx.tiger, hash3 ); - } - for ( int i = (24/4); i < (64/4); i++ ) + if ( i == 0 ) + { + sph_tiger( &ctx.tiger, in0 + 64, 16 ); + sph_tiger_close( &ctx.tiger, hash0 ); + memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) ); + sph_tiger( &ctx.tiger, in1 + 64, 16 ); + sph_tiger_close( &ctx.tiger, hash1 ); + memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) ); + sph_tiger( &ctx.tiger, in2 + 64, 16 ); + sph_tiger_close( &ctx.tiger, hash2 ); + memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) ); + sph_tiger( &ctx.tiger, in3 + 64, 16 ); + sph_tiger_close( &ctx.tiger, hash3 ); + } + else + { + sph_tiger_init( &ctx.tiger ); + sph_tiger( &ctx.tiger, in0, size ); + sph_tiger_close( &ctx.tiger, hash0 ); + sph_tiger_init( &ctx.tiger ); + sph_tiger( &ctx.tiger, in1, size ); + sph_tiger_close( &ctx.tiger, hash1 ); + sph_tiger_init( &ctx.tiger ); + sph_tiger( &ctx.tiger, in2, size ); + sph_tiger_close( &ctx.tiger, hash2 ); + sph_tiger_init( &ctx.tiger ); + sph_tiger( &ctx.tiger, in3, size ); + sph_tiger_close( &ctx.tiger, hash3 ); + } + for ( int i = (24/4); i < (64/4); i++ ) hash0[i] = hash1[i] = hash2[i] = hash3[i] = 0; - intrlv_2x128( vhash, hash0, hash1, 512 ); - luffa_2way_init( &ctx.luffa, 512 ); - luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 ); - dintrlv_2x128( hash0, hash1, vhash, 512 ); - intrlv_2x128( vhash, hash2, hash3, 512 ); - luffa_2way_init( &ctx.luffa, 512 ); - luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 ); - dintrlv_2x128( hash2, 
hash3, vhash, 512 ); + intrlv_2x128( vhash, hash0, hash1, 512 ); + luffa_2way_init( &ctx.luffa, 512 ); + luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 ); + dintrlv_2x128( hash0, hash1, vhash, 512 ); + intrlv_2x128( vhash, hash2, hash3, 512 ); + luffa_2way_init( &ctx.luffa, 512 ); + luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 ); + dintrlv_2x128( hash2, hash3, vhash, 512 ); break; case CUBEHASH: if ( i == 0 ) @@ -955,18 +907,10 @@ void x16rv2_4way_hash( void* output, const void* input ) } break; case SHAVITE: - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, in0, size ); - sph_shavite512_close( &ctx.shavite, hash0 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, in1, size ); - sph_shavite512_close( &ctx.shavite, hash1 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, in2, size ); - sph_shavite512_close( &ctx.shavite, hash2 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, in3, size ); - sph_shavite512_close( &ctx.shavite, hash3 ); + shavite512_full( &ctx.shavite, hash0, in0, size ); + shavite512_full( &ctx.shavite, hash1, in1, size ); + shavite512_full( &ctx.shavite, hash2, in2, size ); + shavite512_full( &ctx.shavite, hash3, in3, size ); break; case SIMD: intrlv_2x128( vhash, in0, in1, size<<3 ); @@ -999,18 +943,10 @@ void x16rv2_4way_hash( void* output, const void* input ) dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); break; case FUGUE: - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, in0, size ); - sph_fugue512_close( &ctx.fugue, hash0 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, in1, size ); - sph_fugue512_close( &ctx.fugue, hash1 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, in2, size ); - sph_fugue512_close( &ctx.fugue, hash2 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, in3, size ); - sph_fugue512_close( &ctx.fugue, hash3 ); + sph_fugue512_full( &ctx.fugue, hash0, 
in0, size ); + sph_fugue512_full( &ctx.fugue, hash1, in1, size ); + sph_fugue512_full( &ctx.fugue, hash2, in2, size ); + sph_fugue512_full( &ctx.fugue, hash3, in3, size ); break; case SHABAL: intrlv_4x32( vhash, in0, in1, in2, in3, size<<3 ); @@ -1041,18 +977,10 @@ void x16rv2_4way_hash( void* output, const void* input ) } else { - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, in0, size ); - sph_whirlpool_close( &ctx.whirlpool, hash0 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, in1, size ); - sph_whirlpool_close( &ctx.whirlpool, hash1 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, in2, size ); - sph_whirlpool_close( &ctx.whirlpool, hash2 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, in3, size ); - sph_whirlpool_close( &ctx.whirlpool, hash3 ); + sph_whirlpool512_full( &ctx.whirlpool, hash0, in0, size ); + sph_whirlpool512_full( &ctx.whirlpool, hash1, in1, size ); + sph_whirlpool512_full( &ctx.whirlpool, hash2, in2, size ); + sph_whirlpool512_full( &ctx.whirlpool, hash3, in3, size ); } break; case SHA_512: @@ -1117,7 +1045,7 @@ int scanhash_x16rv2_4way( struct work *work, uint32_t max_nonce, const uint32_t last_nonce = max_nonce - 4; uint32_t n = first_nonce; const int thr_id = mythr->id; - __m256i *noncev = (__m256i*)vdata + 9; // aligned + __m256i *noncev = (__m256i*)vdata + 9; volatile uint8_t *restart = &(work_restart[thr_id].restart); const bool bench = opt_benchmark; @@ -1134,7 +1062,7 @@ int scanhash_x16rv2_4way( struct work *work, uint32_t max_nonce, x16_r_s_getAlgoString( (const uint8_t*)bedata1, x16r_hash_order ); s_ntime = ntime; if ( opt_debug && !thr_id ) - applog( LOG_DEBUG, "hash order %s (%08x)", x16r_hash_order, ntime ); + applog( LOG_INFO, "hash order %s (%08x)", x16r_hash_order, ntime ); } // Do midstate prehash on hash functions with block size <= 64 bytes. 
@@ -1157,8 +1085,7 @@ int scanhash_x16rv2_4way( struct work *work, uint32_t max_nonce, break; case SKEIN: mm256_bswap32_intrlv80_4x64( vdata, pdata ); - skein512_4way_init( &x16rv2_ctx.skein ); - skein512_4way_update( &x16rv2_ctx.skein, vdata, 64 ); + skein512_4way_prehash64( &x16r_ctx.skein, vdata ); break; case CUBEHASH: mm128_bswap32_80( edata, pdata ); diff --git a/algo/x16/x16rv2.c b/algo/x16/x16rv2.c index e56dd1db..2c85c885 100644 --- a/algo/x16/x16rv2.c +++ b/algo/x16/x16rv2.c @@ -51,7 +51,7 @@ union _x16rv2_context_overlay sph_keccak512_context keccak; hashState_luffa luffa; cubehashParam cube; - sph_shavite512_context shavite; + shavite512_context shavite; hashState_sd simd; sph_hamsi512_context hamsi; sph_fugue512_context fugue; @@ -136,9 +136,7 @@ void x16rv2_hash( void* output, const void* input ) (const byte*)in, size ); break; case SHAVITE: - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, in, size ); - sph_shavite512_close( &ctx.shavite, hash ); + shavite512_full( &ctx.shavite, hash, in, size ); break; case SIMD: init_sd( &ctx.simd, 512 ); @@ -162,9 +160,7 @@ void x16rv2_hash( void* output, const void* input ) sph_hamsi512_close( &ctx.hamsi, hash ); break; case FUGUE: - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, in, size ); - sph_fugue512_close( &ctx.fugue, hash ); + sph_fugue512_full( &ctx.fugue, hash, in, size ); break; case SHABAL: sph_shabal512_init( &ctx.shabal ); @@ -172,9 +168,7 @@ void x16rv2_hash( void* output, const void* input ) sph_shabal512_close( &ctx.shabal, hash ); break; case WHIRLPOOL: - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, in, size ); - sph_whirlpool_close( &ctx.whirlpool, hash ); + sph_whirlpool512_full( &ctx.whirlpool, hash, in, size ); break; case SHA_512: sph_tiger_init( &ctx.tiger ); diff --git a/algo/x17/sonoa-4way.c b/algo/x17/sonoa-4way.c index ce01f58b..9b9380bc 100644 --- a/algo/x17/sonoa-4way.c +++ b/algo/x17/sonoa-4way.c @@ -127,40 +127,22 @@ void 
sonoa_8way_hash( void *state, const void *input ) #if defined(__VAES__) - shavite512_4way_init( &ctx.shavite ); - shavite512_4way_update_close( &ctx.shavite, vhashA, vhashA, 64 ); - shavite512_4way_init( &ctx.shavite ); - shavite512_4way_update_close( &ctx.shavite, vhashB, vhashB, 64 ); + shavite512_4way_full( &ctx.shavite, vhashA, vhashA, 64 ); + shavite512_4way_full( &ctx.shavite, vhashB, vhashB, 64 ); #else dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA ); dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash0, 64 ); - sph_shavite512_close( &ctx.shavite, hash0 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash1, 64 ); - sph_shavite512_close( &ctx.shavite, hash1 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash2, 64 ); - sph_shavite512_close( &ctx.shavite, hash2 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash3, 64 ); - sph_shavite512_close( &ctx.shavite, hash3 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash4, 64 ); - sph_shavite512_close( &ctx.shavite, hash4 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash5, 64 ); - sph_shavite512_close( &ctx.shavite, hash5 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash6, 64 ); - sph_shavite512_close( &ctx.shavite, hash6 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash7, 64 ); - sph_shavite512_close( &ctx.shavite, hash7 ); + shavite512_full( &ctx.shavite, hash0, hash0, 64 ); + shavite512_full( &ctx.shavite, hash1, hash1, 64 ); + shavite512_full( &ctx.shavite, hash2, hash2, 64 ); + shavite512_full( &ctx.shavite, hash3, hash3, 64 ); + shavite512_full( &ctx.shavite, hash4, hash4, 64 ); + shavite512_full( &ctx.shavite, hash5, hash5, 64 ); + shavite512_full( &ctx.shavite, hash6, hash6, 64 ); + shavite512_full( &ctx.shavite, hash7, hash7, 64 ); 
intrlv_4x128_512( vhashA, hash0, hash1, hash2, hash3 ); intrlv_4x128_512( vhashB, hash4, hash5, hash6, hash7 ); @@ -236,9 +218,7 @@ void sonoa_8way_hash( void *state, const void *input ) #endif - skein512_8way_init( &ctx.skein ); - skein512_8way_update( &ctx.skein, vhash, 64 ); - skein512_8way_close( &ctx.skein, vhash ); + skein512_8way_full( &ctx.skein, vhash, vhash, 64 ); jh512_8way_init( &ctx.jh ); jh512_8way_update( &ctx.jh, vhash, 64 ); @@ -258,40 +238,22 @@ void sonoa_8way_hash( void *state, const void *input ) #if defined(__VAES__) - shavite512_4way_init( &ctx.shavite ); - shavite512_4way_update_close( &ctx.shavite, vhashA, vhashA, 64 ); - shavite512_4way_init( &ctx.shavite ); - shavite512_4way_update_close( &ctx.shavite, vhashB, vhashB, 64 ); + shavite512_4way_full( &ctx.shavite, vhashA, vhashA, 64 ); + shavite512_4way_full( &ctx.shavite, vhashB, vhashB, 64 ); #else dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA ); dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash0, 64 ); - sph_shavite512_close( &ctx.shavite, hash0 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash1, 64 ); - sph_shavite512_close( &ctx.shavite, hash1 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash2, 64 ); - sph_shavite512_close( &ctx.shavite, hash2 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash3, 64 ); - sph_shavite512_close( &ctx.shavite, hash3 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash4, 64 ); - sph_shavite512_close( &ctx.shavite, hash4 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash5, 64 ); - sph_shavite512_close( &ctx.shavite, hash5 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash6, 64 ); - sph_shavite512_close( &ctx.shavite, hash6 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash7, 64 
); - sph_shavite512_close( &ctx.shavite, hash7 ); + shavite512_full( &ctx.shavite, hash0, hash0, 64 ); + shavite512_full( &ctx.shavite, hash1, hash1, 64 ); + shavite512_full( &ctx.shavite, hash2, hash2, 64 ); + shavite512_full( &ctx.shavite, hash3, hash3, 64 ); + shavite512_full( &ctx.shavite, hash4, hash4, 64 ); + shavite512_full( &ctx.shavite, hash5, hash5, 64 ); + shavite512_full( &ctx.shavite, hash6, hash6, 64 ); + shavite512_full( &ctx.shavite, hash7, hash7, 64 ); intrlv_4x128_512( vhashA, hash0, hash1, hash2, hash3 ); intrlv_4x128_512( vhashB, hash4, hash5, hash6, hash7 ); @@ -393,40 +355,22 @@ void sonoa_8way_hash( void *state, const void *input ) #if defined(__VAES__) - shavite512_4way_init( &ctx.shavite ); - shavite512_4way_update_close( &ctx.shavite, vhashA, vhashA, 64 ); - shavite512_4way_init( &ctx.shavite ); - shavite512_4way_update_close( &ctx.shavite, vhashB, vhashB, 64 ); + shavite512_4way_full( &ctx.shavite, vhashA, vhashA, 64 ); + shavite512_4way_full( &ctx.shavite, vhashB, vhashB, 64 ); #else dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA ); dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash0, 64 ); - sph_shavite512_close( &ctx.shavite, hash0 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash1, 64 ); - sph_shavite512_close( &ctx.shavite, hash1 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash2, 64 ); - sph_shavite512_close( &ctx.shavite, hash2 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash3, 64 ); - sph_shavite512_close( &ctx.shavite, hash3 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash4, 64 ); - sph_shavite512_close( &ctx.shavite, hash4 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash5, 64 ); - sph_shavite512_close( &ctx.shavite, hash5 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, 
hash6, 64 ); - sph_shavite512_close( &ctx.shavite, hash6 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash7, 64 ); - sph_shavite512_close( &ctx.shavite, hash7 ); + shavite512_full( &ctx.shavite, hash0, hash0, 64 ); + shavite512_full( &ctx.shavite, hash1, hash1, 64 ); + shavite512_full( &ctx.shavite, hash2, hash2, 64 ); + shavite512_full( &ctx.shavite, hash3, hash3, 64 ); + shavite512_full( &ctx.shavite, hash4, hash4, 64 ); + shavite512_full( &ctx.shavite, hash5, hash5, 64 ); + shavite512_full( &ctx.shavite, hash6, hash6, 64 ); + shavite512_full( &ctx.shavite, hash7, hash7, 64 ); intrlv_4x128_512( vhashA, hash0, hash1, hash2, hash3 ); intrlv_4x128_512( vhashB, hash4, hash5, hash6, hash7 ); @@ -477,30 +421,14 @@ void sonoa_8way_hash( void *state, const void *input ) dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, vhash ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash0, 64 ); - sph_fugue512_close( &ctx.fugue, hash0 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash1, 64 ); - sph_fugue512_close( &ctx.fugue, hash1 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash2, 64 ); - sph_fugue512_close( &ctx.fugue, hash2 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash3, 64 ); - sph_fugue512_close( &ctx.fugue, hash3 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash4, 64 ); - sph_fugue512_close( &ctx.fugue, hash4 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash5, 64 ); - sph_fugue512_close( &ctx.fugue, hash5 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash6, 64 ); - sph_fugue512_close( &ctx.fugue, hash6 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash7, 64 ); - sph_fugue512_close( &ctx.fugue, hash7 ); + sph_fugue512_full( &ctx.fugue, hash0, hash0, 64 ); + sph_fugue512_full( &ctx.fugue, hash1, hash1, 64 ); + sph_fugue512_full( &ctx.fugue, hash2, hash2, 64 ); + 
sph_fugue512_full( &ctx.fugue, hash3, hash3, 64 ); + sph_fugue512_full( &ctx.fugue, hash4, hash4, 64 ); + sph_fugue512_full( &ctx.fugue, hash5, hash5, 64 ); + sph_fugue512_full( &ctx.fugue, hash6, hash6, 64 ); + sph_fugue512_full( &ctx.fugue, hash7, hash7, 64 ); // 4 @@ -537,9 +465,7 @@ void sonoa_8way_hash( void *state, const void *input ) #endif - skein512_8way_init( &ctx.skein ); - skein512_8way_update( &ctx.skein, vhash, 64 ); - skein512_8way_close( &ctx.skein, vhash ); + skein512_8way_full( &ctx.skein, vhash, vhash, 64 ); jh512_8way_init( &ctx.jh ); jh512_8way_update( &ctx.jh, vhash, 64 ); @@ -559,40 +485,22 @@ void sonoa_8way_hash( void *state, const void *input ) #if defined(__VAES__) - shavite512_4way_init( &ctx.shavite ); - shavite512_4way_update_close( &ctx.shavite, vhashA, vhashA, 64 ); - shavite512_4way_init( &ctx.shavite ); - shavite512_4way_update_close( &ctx.shavite, vhashB, vhashB, 64 ); + shavite512_4way_full( &ctx.shavite, vhashA, vhashA, 64 ); + shavite512_4way_full( &ctx.shavite, vhashB, vhashB, 64 ); #else dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA ); dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash0, 64 ); - sph_shavite512_close( &ctx.shavite, hash0 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash1, 64 ); - sph_shavite512_close( &ctx.shavite, hash1 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash2, 64 ); - sph_shavite512_close( &ctx.shavite, hash2 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash3, 64 ); - sph_shavite512_close( &ctx.shavite, hash3 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash4, 64 ); - sph_shavite512_close( &ctx.shavite, hash4 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash5, 64 ); - sph_shavite512_close( &ctx.shavite, hash5 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( 
&ctx.shavite, hash6, 64 ); - sph_shavite512_close( &ctx.shavite, hash6 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash7, 64 ); - sph_shavite512_close( &ctx.shavite, hash7 ); + shavite512_full( &ctx.shavite, hash0, hash0, 64 ); + shavite512_full( &ctx.shavite, hash1, hash1, 64 ); + shavite512_full( &ctx.shavite, hash2, hash2, 64 ); + shavite512_full( &ctx.shavite, hash3, hash3, 64 ); + shavite512_full( &ctx.shavite, hash4, hash4, 64 ); + shavite512_full( &ctx.shavite, hash5, hash5, 64 ); + shavite512_full( &ctx.shavite, hash6, hash6, 64 ); + shavite512_full( &ctx.shavite, hash7, hash7, 64 ); intrlv_4x128_512( vhashA, hash0, hash1, hash2, hash3 ); intrlv_4x128_512( vhashB, hash4, hash5, hash6, hash7 ); @@ -643,30 +551,14 @@ void sonoa_8way_hash( void *state, const void *input ) dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, vhash ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash0, 64 ); - sph_fugue512_close( &ctx.fugue, hash0 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash1, 64 ); - sph_fugue512_close( &ctx.fugue, hash1 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash2, 64 ); - sph_fugue512_close( &ctx.fugue, hash2 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash3, 64 ); - sph_fugue512_close( &ctx.fugue, hash3 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash4, 64 ); - sph_fugue512_close( &ctx.fugue, hash4 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash5, 64 ); - sph_fugue512_close( &ctx.fugue, hash5 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash6, 64 ); - sph_fugue512_close( &ctx.fugue, hash6 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash7, 64 ); - sph_fugue512_close( &ctx.fugue, hash7 ); + sph_fugue512_full( &ctx.fugue, hash0, hash0, 64 ); + sph_fugue512_full( &ctx.fugue, hash1, hash1, 64 ); + sph_fugue512_full( &ctx.fugue, hash2, 
hash2, 64 ); + sph_fugue512_full( &ctx.fugue, hash3, hash3, 64 ); + sph_fugue512_full( &ctx.fugue, hash4, hash4, 64 ); + sph_fugue512_full( &ctx.fugue, hash5, hash5, 64 ); + sph_fugue512_full( &ctx.fugue, hash6, hash6, 64 ); + sph_fugue512_full( &ctx.fugue, hash7, hash7, 64 ); intrlv_8x32_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7 ); @@ -714,39 +606,21 @@ void sonoa_8way_hash( void *state, const void *input ) #if defined(__VAES__) - shavite512_4way_init( &ctx.shavite ); - shavite512_4way_update_close( &ctx.shavite, vhashA, vhashA, 64 ); - shavite512_4way_init( &ctx.shavite ); - shavite512_4way_update_close( &ctx.shavite, vhashB, vhashB, 64 ); + shavite512_4way_full( &ctx.shavite, vhashA, vhashA, 64 ); + shavite512_4way_full( &ctx.shavite, vhashB, vhashB, 64 ); rintrlv_4x128_8x64( vhash, vhashA, vhashB, 512 ); #else - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash0, 64 ); - sph_shavite512_close( &ctx.shavite, hash0 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash1, 64 ); - sph_shavite512_close( &ctx.shavite, hash1 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash2, 64 ); - sph_shavite512_close( &ctx.shavite, hash2 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash3, 64 ); - sph_shavite512_close( &ctx.shavite, hash3 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash4, 64 ); - sph_shavite512_close( &ctx.shavite, hash4 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash5, 64 ); - sph_shavite512_close( &ctx.shavite, hash5 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash6, 64 ); - sph_shavite512_close( &ctx.shavite, hash6 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash7, 64 ); - sph_shavite512_close( &ctx.shavite, hash7 ); + shavite512_full( &ctx.shavite, hash0, hash0, 64 ); + shavite512_full( &ctx.shavite, hash1, hash1, 
64 ); + shavite512_full( &ctx.shavite, hash2, hash2, 64 ); + shavite512_full( &ctx.shavite, hash3, hash3, 64 ); + shavite512_full( &ctx.shavite, hash4, hash4, 64 ); + shavite512_full( &ctx.shavite, hash5, hash5, 64 ); + shavite512_full( &ctx.shavite, hash6, hash6, 64 ); + shavite512_full( &ctx.shavite, hash7, hash7, 64 ); intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7 ); @@ -791,9 +665,7 @@ void sonoa_8way_hash( void *state, const void *input ) #endif - skein512_8way_init( &ctx.skein ); - skein512_8way_update( &ctx.skein, vhash, 64 ); - skein512_8way_close( &ctx.skein, vhash ); + skein512_8way_full( &ctx.skein, vhash, vhash, 64 ); jh512_8way_init( &ctx.jh ); jh512_8way_update( &ctx.jh, vhash, 64 ); @@ -813,40 +685,22 @@ void sonoa_8way_hash( void *state, const void *input ) #if defined(__VAES__) - shavite512_4way_init( &ctx.shavite ); - shavite512_4way_update_close( &ctx.shavite, vhashA, vhashA, 64 ); - shavite512_4way_init( &ctx.shavite ); - shavite512_4way_update_close( &ctx.shavite, vhashB, vhashB, 64 ); + shavite512_4way_full( &ctx.shavite, vhashA, vhashA, 64 ); + shavite512_4way_full( &ctx.shavite, vhashB, vhashB, 64 ); #else dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA ); dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash0, 64 ); - sph_shavite512_close( &ctx.shavite, hash0 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash1, 64 ); - sph_shavite512_close( &ctx.shavite, hash1 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash2, 64 ); - sph_shavite512_close( &ctx.shavite, hash2 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash3, 64 ); - sph_shavite512_close( &ctx.shavite, hash3 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash4, 64 ); - sph_shavite512_close( &ctx.shavite, hash4 ); - sph_shavite512_init( &ctx.shavite ); - 
sph_shavite512( &ctx.shavite, hash5, 64 ); - sph_shavite512_close( &ctx.shavite, hash5 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash6, 64 ); - sph_shavite512_close( &ctx.shavite, hash6 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash7, 64 ); - sph_shavite512_close( &ctx.shavite, hash7 ); + shavite512_full( &ctx.shavite, hash0, hash0, 64 ); + shavite512_full( &ctx.shavite, hash1, hash1, 64 ); + shavite512_full( &ctx.shavite, hash2, hash2, 64 ); + shavite512_full( &ctx.shavite, hash3, hash3, 64 ); + shavite512_full( &ctx.shavite, hash4, hash4, 64 ); + shavite512_full( &ctx.shavite, hash5, hash5, 64 ); + shavite512_full( &ctx.shavite, hash6, hash6, 64 ); + shavite512_full( &ctx.shavite, hash7, hash7, 64 ); intrlv_4x128_512( vhashA, hash0, hash1, hash2, hash3 ); intrlv_4x128_512( vhashB, hash4, hash5, hash6, hash7 ); @@ -897,30 +751,14 @@ void sonoa_8way_hash( void *state, const void *input ) dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, vhash ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash0, 64 ); - sph_fugue512_close( &ctx.fugue, hash0 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash1, 64 ); - sph_fugue512_close( &ctx.fugue, hash1 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash2, 64 ); - sph_fugue512_close( &ctx.fugue, hash2 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash3, 64 ); - sph_fugue512_close( &ctx.fugue, hash3 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash4, 64 ); - sph_fugue512_close( &ctx.fugue, hash4 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash5, 64 ); - sph_fugue512_close( &ctx.fugue, hash5 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash6, 64 ); - sph_fugue512_close( &ctx.fugue, hash6 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash7, 64 ); - sph_fugue512_close( &ctx.fugue, hash7 ); 
+ sph_fugue512_full( &ctx.fugue, hash0, hash0, 64 ); + sph_fugue512_full( &ctx.fugue, hash1, hash1, 64 ); + sph_fugue512_full( &ctx.fugue, hash2, hash2, 64 ); + sph_fugue512_full( &ctx.fugue, hash3, hash3, 64 ); + sph_fugue512_full( &ctx.fugue, hash4, hash4, 64 ); + sph_fugue512_full( &ctx.fugue, hash5, hash5, 64 ); + sph_fugue512_full( &ctx.fugue, hash6, hash6, 64 ); + sph_fugue512_full( &ctx.fugue, hash7, hash7, 64 ); intrlv_8x32_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7 ); @@ -932,30 +770,14 @@ void sonoa_8way_hash( void *state, const void *input ) dintrlv_8x32_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, vhash ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash0, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash0 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash1, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash1 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash2, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash2 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash3, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash3 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash4, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash4 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash5, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash5 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash6, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash6 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash7, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash7 ); + sph_whirlpool512_full( &ctx.whirlpool, hash0, hash0, 64 ); + sph_whirlpool512_full( &ctx.whirlpool, hash1, hash1, 64 ); + sph_whirlpool512_full( &ctx.whirlpool, hash2, hash2, 64 ); + sph_whirlpool512_full( &ctx.whirlpool, hash3, hash3, 64 ); + 
sph_whirlpool512_full( &ctx.whirlpool, hash4, hash4, 64 ); + sph_whirlpool512_full( &ctx.whirlpool, hash5, hash5, 64 ); + sph_whirlpool512_full( &ctx.whirlpool, hash6, hash6, 64 ); + sph_whirlpool512_full( &ctx.whirlpool, hash7, hash7, 64 ); // 6 @@ -992,9 +814,7 @@ void sonoa_8way_hash( void *state, const void *input ) #endif - skein512_8way_init( &ctx.skein ); - skein512_8way_update( &ctx.skein, vhash, 64 ); - skein512_8way_close( &ctx.skein, vhash ); + skein512_8way_full( &ctx.skein, vhash, vhash, 64 ); jh512_8way_init( &ctx.jh ); jh512_8way_update( &ctx.jh, vhash, 64 ); @@ -1014,40 +834,22 @@ void sonoa_8way_hash( void *state, const void *input ) #if defined(__VAES__) - shavite512_4way_init( &ctx.shavite ); - shavite512_4way_update_close( &ctx.shavite, vhashA, vhashA, 64 ); - shavite512_4way_init( &ctx.shavite ); - shavite512_4way_update_close( &ctx.shavite, vhashB, vhashB, 64 ); + shavite512_4way_full( &ctx.shavite, vhashA, vhashA, 64 ); + shavite512_4way_full( &ctx.shavite, vhashB, vhashB, 64 ); #else dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA ); dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash0, 64 ); - sph_shavite512_close( &ctx.shavite, hash0 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash1, 64 ); - sph_shavite512_close( &ctx.shavite, hash1 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash2, 64 ); - sph_shavite512_close( &ctx.shavite, hash2 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash3, 64 ); - sph_shavite512_close( &ctx.shavite, hash3 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash4, 64 ); - sph_shavite512_close( &ctx.shavite, hash4 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash5, 64 ); - sph_shavite512_close( &ctx.shavite, hash5 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, 
hash6, 64 ); - sph_shavite512_close( &ctx.shavite, hash6 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash7, 64 ); - sph_shavite512_close( &ctx.shavite, hash7 ); + shavite512_full( &ctx.shavite, hash0, hash0, 64 ); + shavite512_full( &ctx.shavite, hash1, hash1, 64 ); + shavite512_full( &ctx.shavite, hash2, hash2, 64 ); + shavite512_full( &ctx.shavite, hash3, hash3, 64 ); + shavite512_full( &ctx.shavite, hash4, hash4, 64 ); + shavite512_full( &ctx.shavite, hash5, hash5, 64 ); + shavite512_full( &ctx.shavite, hash6, hash6, 64 ); + shavite512_full( &ctx.shavite, hash7, hash7, 64 ); intrlv_4x128_512( vhashA, hash0, hash1, hash2, hash3 ); intrlv_4x128_512( vhashB, hash4, hash5, hash6, hash7 ); @@ -1098,30 +900,14 @@ void sonoa_8way_hash( void *state, const void *input ) dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, vhash ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash0, 64 ); - sph_fugue512_close( &ctx.fugue, hash0 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash1, 64 ); - sph_fugue512_close( &ctx.fugue, hash1 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash2, 64 ); - sph_fugue512_close( &ctx.fugue, hash2 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash3, 64 ); - sph_fugue512_close( &ctx.fugue, hash3 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash4, 64 ); - sph_fugue512_close( &ctx.fugue, hash4 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash5, 64 ); - sph_fugue512_close( &ctx.fugue, hash5 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash6, 64 ); - sph_fugue512_close( &ctx.fugue, hash6 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash7, 64 ); - sph_fugue512_close( &ctx.fugue, hash7 ); + sph_fugue512_full( &ctx.fugue, hash0, hash0, 64 ); + sph_fugue512_full( &ctx.fugue, hash1, hash1, 64 ); + sph_fugue512_full( &ctx.fugue, hash2, hash2, 64 ); 
+ sph_fugue512_full( &ctx.fugue, hash3, hash3, 64 ); + sph_fugue512_full( &ctx.fugue, hash4, hash4, 64 ); + sph_fugue512_full( &ctx.fugue, hash5, hash5, 64 ); + sph_fugue512_full( &ctx.fugue, hash6, hash6, 64 ); + sph_fugue512_full( &ctx.fugue, hash7, hash7, 64 ); intrlv_8x32_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7 ); @@ -1133,30 +919,14 @@ void sonoa_8way_hash( void *state, const void *input ) dintrlv_8x32_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, vhash ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash0, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash0 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash1, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash1 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash2, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash2 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash3, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash3 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash4, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash4 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash5, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash5 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash6, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash6 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash7, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash7 ); + sph_whirlpool512_full( &ctx.whirlpool, hash0, hash0, 64 ); + sph_whirlpool512_full( &ctx.whirlpool, hash1, hash1, 64 ); + sph_whirlpool512_full( &ctx.whirlpool, hash2, hash2, 64 ); + sph_whirlpool512_full( &ctx.whirlpool, hash3, hash3, 64 ); + sph_whirlpool512_full( &ctx.whirlpool, hash4, hash4, 64 ); + sph_whirlpool512_full( &ctx.whirlpool, hash5, hash5, 64 ); + sph_whirlpool512_full( &ctx.whirlpool, hash6, 
hash6, 64 ); + sph_whirlpool512_full( &ctx.whirlpool, hash7, hash7, 64 ); intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7 ); @@ -1168,30 +938,14 @@ void sonoa_8way_hash( void *state, const void *input ) dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, vhash ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash0, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash0 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash1, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash1 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash2, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash2 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash3, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash3 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash4, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash4 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash5, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash5 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash6, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash6 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash7, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash7 ); + sph_whirlpool512_full( &ctx.whirlpool, hash0, hash0, 64 ); + sph_whirlpool512_full( &ctx.whirlpool, hash1, hash1, 64 ); + sph_whirlpool512_full( &ctx.whirlpool, hash2, hash2, 64 ); + sph_whirlpool512_full( &ctx.whirlpool, hash3, hash3, 64 ); + sph_whirlpool512_full( &ctx.whirlpool, hash4, hash4, 64 ); + sph_whirlpool512_full( &ctx.whirlpool, hash5, hash5, 64 ); + sph_whirlpool512_full( &ctx.whirlpool, hash6, hash6, 64 ); + sph_whirlpool512_full( &ctx.whirlpool, hash7, hash7, 64 ); // 7 @@ -1248,40 +1002,22 @@ void sonoa_8way_hash( void *state, const void *input ) #if defined(__VAES__) - 
shavite512_4way_init( &ctx.shavite ); - shavite512_4way_update_close( &ctx.shavite, vhashA, vhashA, 64 ); - shavite512_4way_init( &ctx.shavite ); - shavite512_4way_update_close( &ctx.shavite, vhashB, vhashB, 64 ); + shavite512_4way_full( &ctx.shavite, vhashA, vhashA, 64 ); + shavite512_4way_full( &ctx.shavite, vhashB, vhashB, 64 ); #else dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA ); dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash0, 64 ); - sph_shavite512_close( &ctx.shavite, hash0 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash1, 64 ); - sph_shavite512_close( &ctx.shavite, hash1 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash2, 64 ); - sph_shavite512_close( &ctx.shavite, hash2 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash3, 64 ); - sph_shavite512_close( &ctx.shavite, hash3 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash4, 64 ); - sph_shavite512_close( &ctx.shavite, hash4 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash5, 64 ); - sph_shavite512_close( &ctx.shavite, hash5 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash6, 64 ); - sph_shavite512_close( &ctx.shavite, hash6 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash7, 64 ); - sph_shavite512_close( &ctx.shavite, hash7 ); + shavite512_full( &ctx.shavite, hash0, hash0, 64 ); + shavite512_full( &ctx.shavite, hash1, hash1, 64 ); + shavite512_full( &ctx.shavite, hash2, hash2, 64 ); + shavite512_full( &ctx.shavite, hash3, hash3, 64 ); + shavite512_full( &ctx.shavite, hash4, hash4, 64 ); + shavite512_full( &ctx.shavite, hash5, hash5, 64 ); + shavite512_full( &ctx.shavite, hash6, hash6, 64 ); + shavite512_full( &ctx.shavite, hash7, hash7, 64 ); intrlv_4x128_512( vhashA, hash0, hash1, hash2, hash3 ); intrlv_4x128_512( 
vhashB, hash4, hash5, hash6, hash7 ); @@ -1332,30 +1068,14 @@ void sonoa_8way_hash( void *state, const void *input ) dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, vhash ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash0, 64 ); - sph_fugue512_close( &ctx.fugue, hash0 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash1, 64 ); - sph_fugue512_close( &ctx.fugue, hash1 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash2, 64 ); - sph_fugue512_close( &ctx.fugue, hash2 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash3, 64 ); - sph_fugue512_close( &ctx.fugue, hash3 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash4, 64 ); - sph_fugue512_close( &ctx.fugue, hash4 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash5, 64 ); - sph_fugue512_close( &ctx.fugue, hash5 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash6, 64 ); - sph_fugue512_close( &ctx.fugue, hash6 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash7, 64 ); - sph_fugue512_close( &ctx.fugue, hash7 ); + sph_fugue512_full( &ctx.fugue, hash0, hash0, 64 ); + sph_fugue512_full( &ctx.fugue, hash1, hash1, 64 ); + sph_fugue512_full( &ctx.fugue, hash2, hash2, 64 ); + sph_fugue512_full( &ctx.fugue, hash3, hash3, 64 ); + sph_fugue512_full( &ctx.fugue, hash4, hash4, 64 ); + sph_fugue512_full( &ctx.fugue, hash5, hash5, 64 ); + sph_fugue512_full( &ctx.fugue, hash6, hash6, 64 ); + sph_fugue512_full( &ctx.fugue, hash7, hash7, 64 ); intrlv_8x32_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7 ); @@ -1367,30 +1087,14 @@ void sonoa_8way_hash( void *state, const void *input ) dintrlv_8x32_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, vhash ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash0, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash0 ); - sph_whirlpool_init( &ctx.whirlpool ); - 
sph_whirlpool( &ctx.whirlpool, hash1, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash1 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash2, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash2 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash3, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash3 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash4, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash4 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash5, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash5 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash6, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash6 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash7, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash7 ); + sph_whirlpool512_full( &ctx.whirlpool, hash0, hash0, 64 ); + sph_whirlpool512_full( &ctx.whirlpool, hash1, hash1, 64 ); + sph_whirlpool512_full( &ctx.whirlpool, hash2, hash2, 64 ); + sph_whirlpool512_full( &ctx.whirlpool, hash3, hash3, 64 ); + sph_whirlpool512_full( &ctx.whirlpool, hash4, hash4, 64 ); + sph_whirlpool512_full( &ctx.whirlpool, hash5, hash5, 64 ); + sph_whirlpool512_full( &ctx.whirlpool, hash6, hash6, 64 ); + sph_whirlpool512_full( &ctx.whirlpool, hash7, hash7, 64 ); intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7 ); @@ -1657,18 +1361,10 @@ void sonoa_4way_hash( void *state, const void *input ) dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash0, 64 ); - sph_fugue512_close( &ctx.fugue, hash0 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash1, 64 ); - sph_fugue512_close( &ctx.fugue, hash1 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash2, 64 ); - sph_fugue512_close( &ctx.fugue, hash2 ); - sph_fugue512_init( &ctx.fugue ); - 
sph_fugue512( &ctx.fugue, hash3, 64 ); - sph_fugue512_close( &ctx.fugue, hash3 ); + sph_fugue512_full( &ctx.fugue, hash0, hash0, 64 ); + sph_fugue512_full( &ctx.fugue, hash1, hash1, 64 ); + sph_fugue512_full( &ctx.fugue, hash2, hash2, 64 ); + sph_fugue512_full( &ctx.fugue, hash3, hash3, 64 ); // 4 intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 ); @@ -1730,18 +1426,10 @@ void sonoa_4way_hash( void *state, const void *input ) dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash0, 64 ); - sph_fugue512_close( &ctx.fugue, hash0 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash1, 64 ); - sph_fugue512_close( &ctx.fugue, hash1 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash2, 64 ); - sph_fugue512_close( &ctx.fugue, hash2 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash3, 64 ); - sph_fugue512_close( &ctx.fugue, hash3 ); + sph_fugue512_full( &ctx.fugue, hash0, hash0, 64 ); + sph_fugue512_full( &ctx.fugue, hash1, hash1, 64 ); + sph_fugue512_full( &ctx.fugue, hash2, hash2, 64 ); + sph_fugue512_full( &ctx.fugue, hash3, hash3, 64 ); intrlv_4x32_512( vhash, hash0, hash1, hash2, hash3 ); @@ -1840,18 +1528,10 @@ void sonoa_4way_hash( void *state, const void *input ) dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash0, 64 ); - sph_fugue512_close( &ctx.fugue, hash0 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash1, 64 ); - sph_fugue512_close( &ctx.fugue, hash1 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash2, 64 ); - sph_fugue512_close( &ctx.fugue, hash2 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash3, 64 ); - sph_fugue512_close( &ctx.fugue, hash3 ); + sph_fugue512_full( &ctx.fugue, hash0, hash0, 64 ); + sph_fugue512_full( &ctx.fugue, hash1, hash1, 64 ); + sph_fugue512_full( &ctx.fugue, hash2, hash2, 64 ); + 
sph_fugue512_full( &ctx.fugue, hash3, hash3, 64 ); intrlv_4x32_512( vhash, hash0, hash1, hash2, hash3 ); @@ -1861,18 +1541,10 @@ void sonoa_4way_hash( void *state, const void *input ) dintrlv_4x32_512( hash0, hash1, hash2, hash3, vhash ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash0, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash0 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash1, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash1 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash2, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash2 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash3, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash3 ); + sph_whirlpool512_full( &ctx.whirlpool, hash0, hash0, 64 ); + sph_whirlpool512_full( &ctx.whirlpool, hash1, hash1, 64 ); + sph_whirlpool512_full( &ctx.whirlpool, hash2, hash2, 64 ); + sph_whirlpool512_full( &ctx.whirlpool, hash3, hash3, 64 ); // 6 @@ -1935,18 +1607,10 @@ void sonoa_4way_hash( void *state, const void *input ) dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash0, 64 ); - sph_fugue512_close( &ctx.fugue, hash0 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash1, 64 ); - sph_fugue512_close( &ctx.fugue, hash1 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash2, 64 ); - sph_fugue512_close( &ctx.fugue, hash2 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash3, 64 ); - sph_fugue512_close( &ctx.fugue, hash3 ); + sph_fugue512_full( &ctx.fugue, hash0, hash0, 64 ); + sph_fugue512_full( &ctx.fugue, hash1, hash1, 64 ); + sph_fugue512_full( &ctx.fugue, hash2, hash2, 64 ); + sph_fugue512_full( &ctx.fugue, hash3, hash3, 64 ); intrlv_4x32_512( vhash, hash0, hash1, hash2, hash3 ); @@ -1956,18 +1620,10 @@ void sonoa_4way_hash( void *state, const void *input ) dintrlv_4x32_512( 
hash0, hash1, hash2, hash3, vhash ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash0, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash0 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash1, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash1 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash2, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash2 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash3, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash3 ); + sph_whirlpool512_full( &ctx.whirlpool, hash0, hash0, 64 ); + sph_whirlpool512_full( &ctx.whirlpool, hash1, hash1, 64 ); + sph_whirlpool512_full( &ctx.whirlpool, hash2, hash2, 64 ); + sph_whirlpool512_full( &ctx.whirlpool, hash3, hash3, 64 ); intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 ); @@ -1977,18 +1633,10 @@ void sonoa_4way_hash( void *state, const void *input ) dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash0, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash0 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash1, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash1 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash2, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash2 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash3, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash3 ); + sph_whirlpool512_full( &ctx.whirlpool, hash0, hash0, 64 ); + sph_whirlpool512_full( &ctx.whirlpool, hash1, hash1, 64 ); + sph_whirlpool512_full( &ctx.whirlpool, hash2, hash2, 64 ); + sph_whirlpool512_full( &ctx.whirlpool, hash3, hash3, 64 ); // 7 @@ -2051,18 +1699,10 @@ void sonoa_4way_hash( void *state, const void *input ) dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash0, 64 ); 
- sph_fugue512_close( &ctx.fugue, hash0 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash1, 64 ); - sph_fugue512_close( &ctx.fugue, hash1 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash2, 64 ); - sph_fugue512_close( &ctx.fugue, hash2 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash3, 64 ); - sph_fugue512_close( &ctx.fugue, hash3 ); + sph_fugue512_full( &ctx.fugue, hash0, hash0, 64 ); + sph_fugue512_full( &ctx.fugue, hash1, hash1, 64 ); + sph_fugue512_full( &ctx.fugue, hash2, hash2, 64 ); + sph_fugue512_full( &ctx.fugue, hash3, hash3, 64 ); intrlv_4x32_512( vhash, hash0, hash1, hash2, hash3 ); @@ -2072,18 +1712,10 @@ void sonoa_4way_hash( void *state, const void *input ) dintrlv_4x32_512( hash0, hash1, hash2, hash3, vhash ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash0, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash0 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash1, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash1 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash2, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash2 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash3, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash3 ); + sph_whirlpool512_full( &ctx.whirlpool, hash0, hash0, 64 ); + sph_whirlpool512_full( &ctx.whirlpool, hash1, hash1, 64 ); + sph_whirlpool512_full( &ctx.whirlpool, hash2, hash2, 64 ); + sph_whirlpool512_full( &ctx.whirlpool, hash3, hash3, 64 ); intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 ); diff --git a/algo/x17/x17-4way.c b/algo/x17/x17-4way.c index 7f1780f8..8846dbd5 100644 --- a/algo/x17/x17-4way.c +++ b/algo/x17/x17-4way.c @@ -132,30 +132,14 @@ void x17_8way_hash( void *state, const void *input ) dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA ); dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB ); - sph_shavite512_init( &ctx.shavite 
); - sph_shavite512( &ctx.shavite, hash0, 64 ); - sph_shavite512_close( &ctx.shavite, hash0 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash1, 64 ); - sph_shavite512_close( &ctx.shavite, hash1 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash2, 64 ); - sph_shavite512_close( &ctx.shavite, hash2 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash3, 64 ); - sph_shavite512_close( &ctx.shavite, hash3 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash4, 64 ); - sph_shavite512_close( &ctx.shavite, hash4 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash5, 64 ); - sph_shavite512_close( &ctx.shavite, hash5 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash6, 64 ); - sph_shavite512_close( &ctx.shavite, hash6 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash7, 64 ); - sph_shavite512_close( &ctx.shavite, hash7 ); + shavite512_full( &ctx.shavite, hash0, hash0, 64 ); + shavite512_full( &ctx.shavite, hash1, hash1, 64 ); + shavite512_full( &ctx.shavite, hash2, hash2, 64 ); + shavite512_full( &ctx.shavite, hash3, hash3, 64 ); + shavite512_full( &ctx.shavite, hash4, hash4, 64 ); + shavite512_full( &ctx.shavite, hash5, hash5, 64 ); + shavite512_full( &ctx.shavite, hash6, hash6, 64 ); + shavite512_full( &ctx.shavite, hash7, hash7, 64 ); intrlv_4x128_512( vhashA, hash0, hash1, hash2, hash3 ); intrlv_4x128_512( vhashB, hash4, hash5, hash6, hash7 ); @@ -206,30 +190,14 @@ void x17_8way_hash( void *state, const void *input ) dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, vhash ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash0, 64 ); - sph_fugue512_close( &ctx.fugue, hash0 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash1, 64 ); - sph_fugue512_close( &ctx.fugue, hash1 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( 
&ctx.fugue, hash2, 64 ); - sph_fugue512_close( &ctx.fugue, hash2 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash3, 64 ); - sph_fugue512_close( &ctx.fugue, hash3 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash4, 64 ); - sph_fugue512_close( &ctx.fugue, hash4 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash5, 64 ); - sph_fugue512_close( &ctx.fugue, hash5 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash6, 64 ); - sph_fugue512_close( &ctx.fugue, hash6 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash7, 64 ); - sph_fugue512_close( &ctx.fugue, hash7 ); + sph_fugue512_full( &ctx.fugue, hash0, hash0, 64 ); + sph_fugue512_full( &ctx.fugue, hash1, hash1, 64 ); + sph_fugue512_full( &ctx.fugue, hash2, hash2, 64 ); + sph_fugue512_full( &ctx.fugue, hash3, hash3, 64 ); + sph_fugue512_full( &ctx.fugue, hash4, hash4, 64 ); + sph_fugue512_full( &ctx.fugue, hash5, hash5, 64 ); + sph_fugue512_full( &ctx.fugue, hash6, hash6, 64 ); + sph_fugue512_full( &ctx.fugue, hash7, hash7, 64 ); intrlv_8x32_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7 ); @@ -241,30 +209,14 @@ void x17_8way_hash( void *state, const void *input ) dintrlv_8x32_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, vhash ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash0, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash0 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash1, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash1 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash2, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash2 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash3, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash3 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash4, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash4 ); - 
sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash5, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash5 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash6, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash6 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash7, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash7 ); + sph_whirlpool512_full( &ctx.whirlpool, hash0, hash0, 64 ); + sph_whirlpool512_full( &ctx.whirlpool, hash1, hash1, 64 ); + sph_whirlpool512_full( &ctx.whirlpool, hash2, hash2, 64 ); + sph_whirlpool512_full( &ctx.whirlpool, hash3, hash3, 64 ); + sph_whirlpool512_full( &ctx.whirlpool, hash4, hash4, 64 ); + sph_whirlpool512_full( &ctx.whirlpool, hash5, hash5, 64 ); + sph_whirlpool512_full( &ctx.whirlpool, hash6, hash6, 64 ); + sph_whirlpool512_full( &ctx.whirlpool, hash7, hash7, 64 ); intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7 ); @@ -283,10 +235,10 @@ void x17_8way_hash( void *state, const void *input ) int scanhash_x17_8way( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ) { - uint32_t hash[8*8] __attribute__ ((aligned (128))); + uint32_t hash32[8*8] __attribute__ ((aligned (128))); uint32_t vdata[20*8] __attribute__ ((aligned (64))); uint32_t lane_hash[8] __attribute__ ((aligned (64))); - uint32_t *hashd7 = &(hash[7*8]); + uint32_t *hash32_d7 = &(hash32[7*8]); uint32_t *pdata = work->data; const uint32_t *ptarget = work->target; const uint32_t first_nonce = pdata[19]; @@ -294,7 +246,7 @@ int scanhash_x17_8way( struct work *work, uint32_t max_nonce, __m512i *noncev = (__m512i*)vdata + 9; uint32_t n = first_nonce; const int thr_id = mythr->id; - const uint32_t targ32 = ptarget[7]; + const uint32_t targ32_d7 = ptarget[7]; const bool bench = opt_benchmark; mm512_bswap32_intrlv80_8x64( vdata, pdata ); @@ -303,12 +255,12 @@ int scanhash_x17_8way( struct work *work, uint32_t max_nonce, n+3, 0, n+2, 0, n+1, 
0, n, 0 ), *noncev ); do { - x17_8way_hash( hash, vdata ); + x17_8way_hash( hash32, vdata ); for ( int lane = 0; lane < 8; lane++ ) - if ( unlikely( ( hashd7[ lane ] <= targ32 ) && !bench ) ) + if ( unlikely( ( hash32_d7[ lane ] <= targ32_d7 ) && !bench ) ) { - extr_lane_8x32( lane_hash, hash, lane, 256 ); + extr_lane_8x32( lane_hash, hash32, lane, 256 ); if ( likely( valid_hash( lane_hash, ptarget ) ) ) { pdata[19] = bswap_32( n + lane ); @@ -418,18 +370,10 @@ void x17_4way_hash( void *state, const void *input ) dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash0, 64 ); - sph_fugue512_close( &ctx.fugue, hash0 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash1, 64 ); - sph_fugue512_close( &ctx.fugue, hash1 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash2, 64 ); - sph_fugue512_close( &ctx.fugue, hash2 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash3, 64 ); - sph_fugue512_close( &ctx.fugue, hash3 ); + sph_fugue512_full( &ctx.fugue, hash0, hash0, 64 ); + sph_fugue512_full( &ctx.fugue, hash1, hash1, 64 ); + sph_fugue512_full( &ctx.fugue, hash2, hash2, 64 ); + sph_fugue512_full( &ctx.fugue, hash3, hash3, 64 ); intrlv_4x32_512( vhash, hash0, hash1, hash2, hash3 ); @@ -439,18 +383,10 @@ void x17_4way_hash( void *state, const void *input ) dintrlv_4x32_512( hash0, hash1, hash2, hash3, vhash ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash0, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash0 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash1, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash1 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash2, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash2 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash3, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash3 ); + 
sph_whirlpool512_full( &ctx.whirlpool, hash0, hash0, 64 ); + sph_whirlpool512_full( &ctx.whirlpool, hash1, hash1, 64 ); + sph_whirlpool512_full( &ctx.whirlpool, hash2, hash2, 64 ); + sph_whirlpool512_full( &ctx.whirlpool, hash3, hash3, 64 ); intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 ); @@ -468,10 +404,10 @@ void x17_4way_hash( void *state, const void *input ) int scanhash_x17_4way( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ) { - uint32_t hash[8*4] __attribute__ ((aligned (64))); + uint32_t hash32[8*4] __attribute__ ((aligned (64))); uint32_t vdata[20*4] __attribute__ ((aligned (64))); uint32_t lane_hash[8] __attribute__ ((aligned (64))); - uint32_t *hashd7 = &(hash[ 7*4 ]); + uint32_t *hash32_d7 = &(hash32[ 7*4 ]); uint32_t *pdata = work->data; const uint32_t *ptarget = work->target; const uint32_t first_nonce = pdata[19]; @@ -479,7 +415,7 @@ int scanhash_x17_4way( struct work *work, uint32_t max_nonce, __m256i *noncev = (__m256i*)vdata + 9; uint32_t n = first_nonce; const int thr_id = mythr->id; - const uint32_t targ32 = ptarget[7]; + const uint32_t targ32_d7 = ptarget[7]; const bool bench = opt_benchmark; mm256_bswap32_intrlv80_4x64( vdata, pdata ); @@ -487,12 +423,12 @@ int scanhash_x17_4way( struct work *work, uint32_t max_nonce, _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev ); do { - x17_4way_hash( hash, vdata ); + x17_4way_hash( hash32, vdata ); for ( int lane = 0; lane < 4; lane++ ) - if ( unlikely( hashd7[ lane ] <= targ32 && !bench ) ) + if ( unlikely( hash32_d7[ lane ] <= targ32_d7 && !bench ) ) { - extr_lane_4x32( lane_hash, hash, lane, 256 ); + extr_lane_4x32( lane_hash, hash32, lane, 256 ); if ( valid_hash( lane_hash, ptarget ) ) { pdata[19] = bswap_32( n + lane ); diff --git a/algo/x17/xevan-4way.c b/algo/x17/xevan-4way.c index 8d40a11d..02b0e13c 100644 --- a/algo/x17/xevan-4way.c +++ b/algo/x17/xevan-4way.c @@ -134,30 +134,14 @@ void xevan_8way_hash( void *output, const void *input ) 
dintrlv_4x128( hash0, hash1, hash2, hash3, vhashA, dataLen<<3 ); dintrlv_4x128( hash4, hash5, hash6, hash7, vhashB, dataLen<<3 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash0, dataLen ); - sph_shavite512_close( &ctx.shavite, hash0 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash1, dataLen ); - sph_shavite512_close( &ctx.shavite, hash1 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash2, dataLen ); - sph_shavite512_close( &ctx.shavite, hash2 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash3, dataLen ); - sph_shavite512_close( &ctx.shavite, hash3 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash4, dataLen ); - sph_shavite512_close( &ctx.shavite, hash4 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash5, dataLen ); - sph_shavite512_close( &ctx.shavite, hash5 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash6, dataLen ); - sph_shavite512_close( &ctx.shavite, hash6 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash7, dataLen ); - sph_shavite512_close( &ctx.shavite, hash7 ); + shavite512_full( &ctx.shavite, hash0, hash0, dataLen ); + shavite512_full( &ctx.shavite, hash1, hash1, dataLen ); + shavite512_full( &ctx.shavite, hash2, hash2, dataLen ); + shavite512_full( &ctx.shavite, hash3, hash3, dataLen ); + shavite512_full( &ctx.shavite, hash4, hash4, dataLen ); + shavite512_full( &ctx.shavite, hash5, hash5, dataLen ); + shavite512_full( &ctx.shavite, hash6, hash6, dataLen ); + shavite512_full( &ctx.shavite, hash7, hash7, dataLen ); intrlv_4x128( vhashA, hash0, hash1, hash2, hash3, dataLen<<3 ); intrlv_4x128( vhashB, hash4, hash5, hash6, hash7, dataLen<<3 ); @@ -208,30 +192,14 @@ void xevan_8way_hash( void *output, const void *input ) dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, vhash, dataLen<<3 ); - sph_fugue512_init( 
&ctx.fugue ); - sph_fugue512( &ctx.fugue, hash0, dataLen ); - sph_fugue512_close( &ctx.fugue, hash0 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash1, dataLen ); - sph_fugue512_close( &ctx.fugue, hash1 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash2, dataLen ); - sph_fugue512_close( &ctx.fugue, hash2 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash3, dataLen ); - sph_fugue512_close( &ctx.fugue, hash3 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash4, dataLen ); - sph_fugue512_close( &ctx.fugue, hash4 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash5, dataLen ); - sph_fugue512_close( &ctx.fugue, hash5 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash6, dataLen ); - sph_fugue512_close( &ctx.fugue, hash6 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash7, dataLen ); - sph_fugue512_close( &ctx.fugue, hash7 ); + sph_fugue512_full( &ctx.fugue, hash0, hash0, dataLen ); + sph_fugue512_full( &ctx.fugue, hash1, hash1, dataLen ); + sph_fugue512_full( &ctx.fugue, hash2, hash2, dataLen ); + sph_fugue512_full( &ctx.fugue, hash3, hash3, dataLen ); + sph_fugue512_full( &ctx.fugue, hash4, hash4, dataLen ); + sph_fugue512_full( &ctx.fugue, hash5, hash5, dataLen ); + sph_fugue512_full( &ctx.fugue, hash6, hash6, dataLen ); + sph_fugue512_full( &ctx.fugue, hash7, hash7, dataLen ); intrlv_8x32( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, dataLen<<3 ); @@ -243,30 +211,14 @@ void xevan_8way_hash( void *output, const void *input ) dintrlv_8x32( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, vhash, dataLen<<3 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash0, dataLen ); - sph_whirlpool_close( &ctx.whirlpool, hash0 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash1, dataLen ); - sph_whirlpool_close( &ctx.whirlpool, hash1 ); - sph_whirlpool_init( 
&ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash2, dataLen ); - sph_whirlpool_close( &ctx.whirlpool, hash2 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash3, dataLen ); - sph_whirlpool_close( &ctx.whirlpool, hash3 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash4, dataLen ); - sph_whirlpool_close( &ctx.whirlpool, hash4 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash5, dataLen ); - sph_whirlpool_close( &ctx.whirlpool, hash5 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash6, dataLen ); - sph_whirlpool_close( &ctx.whirlpool, hash6 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash7, dataLen ); - sph_whirlpool_close( &ctx.whirlpool, hash7 ); + sph_whirlpool512_full( &ctx.whirlpool, hash0, hash0, dataLen ); + sph_whirlpool512_full( &ctx.whirlpool, hash1, hash1, dataLen ); + sph_whirlpool512_full( &ctx.whirlpool, hash2, hash2, dataLen ); + sph_whirlpool512_full( &ctx.whirlpool, hash3, hash3, dataLen ); + sph_whirlpool512_full( &ctx.whirlpool, hash4, hash4, dataLen ); + sph_whirlpool512_full( &ctx.whirlpool, hash5, hash5, dataLen ); + sph_whirlpool512_full( &ctx.whirlpool, hash6, hash6, dataLen ); + sph_whirlpool512_full( &ctx.whirlpool, hash7, hash7, dataLen ); intrlv_8x64( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, dataLen<<3 ); @@ -345,30 +297,14 @@ void xevan_8way_hash( void *output, const void *input ) dintrlv_4x128( hash0, hash1, hash2, hash3, vhashA, dataLen<<3 ); dintrlv_4x128( hash4, hash5, hash6, hash7, vhashB, dataLen<<3 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash0, dataLen ); - sph_shavite512_close( &ctx.shavite, hash0 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash1, dataLen ); - sph_shavite512_close( &ctx.shavite, hash1 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash2, dataLen ); 
- sph_shavite512_close( &ctx.shavite, hash2 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash3, dataLen ); - sph_shavite512_close( &ctx.shavite, hash3 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash4, dataLen ); - sph_shavite512_close( &ctx.shavite, hash4 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash5, dataLen ); - sph_shavite512_close( &ctx.shavite, hash5 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash6, dataLen ); - sph_shavite512_close( &ctx.shavite, hash6 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash7, dataLen ); - sph_shavite512_close( &ctx.shavite, hash7 ); + shavite512_full( &ctx.shavite, hash0, hash0, dataLen ); + shavite512_full( &ctx.shavite, hash1, hash1, dataLen ); + shavite512_full( &ctx.shavite, hash2, hash2, dataLen ); + shavite512_full( &ctx.shavite, hash3, hash3, dataLen ); + shavite512_full( &ctx.shavite, hash4, hash4, dataLen ); + shavite512_full( &ctx.shavite, hash5, hash5, dataLen ); + shavite512_full( &ctx.shavite, hash6, hash6, dataLen ); + shavite512_full( &ctx.shavite, hash7, hash7, dataLen ); intrlv_4x128( vhashA, hash0, hash1, hash2, hash3, dataLen<<3 ); intrlv_4x128( vhashB, hash4, hash5, hash6, hash7, dataLen<<3 ); @@ -419,30 +355,14 @@ void xevan_8way_hash( void *output, const void *input ) dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, vhash, dataLen<<3 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash0, dataLen ); - sph_fugue512_close( &ctx.fugue, hash0 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash1, dataLen ); - sph_fugue512_close( &ctx.fugue, hash1 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash2, dataLen ); - sph_fugue512_close( &ctx.fugue, hash2 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash3, dataLen ); - sph_fugue512_close( &ctx.fugue, hash3 ); - 
sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash4, dataLen ); - sph_fugue512_close( &ctx.fugue, hash4 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash5, dataLen ); - sph_fugue512_close( &ctx.fugue, hash5 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash6, dataLen ); - sph_fugue512_close( &ctx.fugue, hash6 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash7, dataLen ); - sph_fugue512_close( &ctx.fugue, hash7 ); + sph_fugue512_full( &ctx.fugue, hash0, hash0, dataLen ); + sph_fugue512_full( &ctx.fugue, hash1, hash1, dataLen ); + sph_fugue512_full( &ctx.fugue, hash2, hash2, dataLen ); + sph_fugue512_full( &ctx.fugue, hash3, hash3, dataLen ); + sph_fugue512_full( &ctx.fugue, hash4, hash4, dataLen ); + sph_fugue512_full( &ctx.fugue, hash5, hash5, dataLen ); + sph_fugue512_full( &ctx.fugue, hash6, hash6, dataLen ); + sph_fugue512_full( &ctx.fugue, hash7, hash7, dataLen ); intrlv_8x32( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, dataLen<<3 ); @@ -454,30 +374,14 @@ void xevan_8way_hash( void *output, const void *input ) dintrlv_8x32( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, vhash, dataLen<<3 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash0, dataLen ); - sph_whirlpool_close( &ctx.whirlpool, hash0 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash1, dataLen ); - sph_whirlpool_close( &ctx.whirlpool, hash1 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash2, dataLen ); - sph_whirlpool_close( &ctx.whirlpool, hash2 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash3, dataLen ); - sph_whirlpool_close( &ctx.whirlpool, hash3 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash4, dataLen ); - sph_whirlpool_close( &ctx.whirlpool, hash4 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash5, dataLen 
); - sph_whirlpool_close( &ctx.whirlpool, hash5 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash6, dataLen ); - sph_whirlpool_close( &ctx.whirlpool, hash6 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash7, dataLen ); - sph_whirlpool_close( &ctx.whirlpool, hash7 ); + sph_whirlpool512_full( &ctx.whirlpool, hash0, hash0, dataLen ); + sph_whirlpool512_full( &ctx.whirlpool, hash1, hash1, dataLen ); + sph_whirlpool512_full( &ctx.whirlpool, hash2, hash2, dataLen ); + sph_whirlpool512_full( &ctx.whirlpool, hash3, hash3, dataLen ); + sph_whirlpool512_full( &ctx.whirlpool, hash4, hash4, dataLen ); + sph_whirlpool512_full( &ctx.whirlpool, hash5, hash5, dataLen ); + sph_whirlpool512_full( &ctx.whirlpool, hash6, hash6, dataLen ); + sph_whirlpool512_full( &ctx.whirlpool, hash7, hash7, dataLen ); intrlv_8x64( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, dataLen<<3 ); @@ -636,18 +540,10 @@ void xevan_4way_hash( void *output, const void *input ) dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, dataLen<<3 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash0, dataLen ); - sph_fugue512_close( &ctx.fugue, hash0 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash1, dataLen ); - sph_fugue512_close( &ctx.fugue, hash1 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash2, dataLen ); - sph_fugue512_close( &ctx.fugue, hash2 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash3, dataLen ); - sph_fugue512_close( &ctx.fugue, hash3 ); + sph_fugue512_full( &ctx.fugue, hash0, hash0, 64 ); + sph_fugue512_full( &ctx.fugue, hash1, hash1, 64 ); + sph_fugue512_full( &ctx.fugue, hash2, hash2, 64 ); + sph_fugue512_full( &ctx.fugue, hash3, hash3, 64 ); // Parallel 4way 32 bit intrlv_4x32( vhash, hash0, hash1, hash2, hash3, dataLen<<3 ); @@ -659,18 +555,10 @@ void xevan_4way_hash( void *output, const void *input ) dintrlv_4x32( hash0, hash1, hash2, 
hash3, vhash, dataLen<<3 ); // Serial - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash0, dataLen ); - sph_whirlpool_close( &ctx.whirlpool, hash0 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash1, dataLen ); - sph_whirlpool_close( &ctx.whirlpool, hash1 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash2, dataLen ); - sph_whirlpool_close( &ctx.whirlpool, hash2 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash3, dataLen ); - sph_whirlpool_close( &ctx.whirlpool, hash3 ); + sph_whirlpool512_full( &ctx.whirlpool, hash0, hash0, dataLen ); + sph_whirlpool512_full( &ctx.whirlpool, hash1, hash1, dataLen ); + sph_whirlpool512_full( &ctx.whirlpool, hash2, hash2, dataLen ); + sph_whirlpool512_full( &ctx.whirlpool, hash3, hash3, dataLen ); intrlv_4x64( vhash, hash0, hash1, hash2, hash3, dataLen<<3 ); @@ -749,18 +637,10 @@ void xevan_4way_hash( void *output, const void *input ) dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, dataLen<<3 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash0, dataLen ); - sph_fugue512_close( &ctx.fugue, hash0 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash1, dataLen ); - sph_fugue512_close( &ctx.fugue, hash1 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash2, dataLen ); - sph_fugue512_close( &ctx.fugue, hash2 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash3, dataLen ); - sph_fugue512_close( &ctx.fugue, hash3 ); + sph_fugue512_full( &ctx.fugue, hash0, hash0, 64 ); + sph_fugue512_full( &ctx.fugue, hash1, hash1, 64 ); + sph_fugue512_full( &ctx.fugue, hash2, hash2, 64 ); + sph_fugue512_full( &ctx.fugue, hash3, hash3, 64 ); intrlv_4x32( vhash, hash0, hash1, hash2, hash3, dataLen<<3 ); @@ -770,18 +650,10 @@ void xevan_4way_hash( void *output, const void *input ) dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, dataLen<<3 ); - sph_whirlpool_init( 
&ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash0, dataLen ); - sph_whirlpool_close( &ctx.whirlpool, hash0 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash1, dataLen ); - sph_whirlpool_close( &ctx.whirlpool, hash1 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash2, dataLen ); - sph_whirlpool_close( &ctx.whirlpool, hash2 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash3, dataLen ); - sph_whirlpool_close( &ctx.whirlpool, hash3 ); + sph_whirlpool512_full( &ctx.whirlpool, hash0, hash0, dataLen ); + sph_whirlpool512_full( &ctx.whirlpool, hash1, hash1, dataLen ); + sph_whirlpool512_full( &ctx.whirlpool, hash2, hash2, dataLen ); + sph_whirlpool512_full( &ctx.whirlpool, hash3, hash3, dataLen ); intrlv_4x64( vhash, hash0, hash1, hash2, hash3, dataLen<<3 ); diff --git a/configure b/configure index e48b7208..7657c1ef 100755 --- a/configure +++ b/configure @@ -1,6 +1,6 @@ #! /bin/sh # Guess values for system-dependent variables and create Makefiles. -# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.12.2. +# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.12.3. # # # Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc. @@ -577,8 +577,8 @@ MAKEFLAGS= # Identity of this package. PACKAGE_NAME='cpuminer-opt' PACKAGE_TARNAME='cpuminer-opt' -PACKAGE_VERSION='3.12.2' -PACKAGE_STRING='cpuminer-opt 3.12.2' +PACKAGE_VERSION='3.12.3' +PACKAGE_STRING='cpuminer-opt 3.12.3' PACKAGE_BUGREPORT='' PACKAGE_URL='' @@ -1332,7 +1332,7 @@ if test "$ac_init_help" = "long"; then # Omit some internal or obsolete options to make the list less imposing. # This message is too long to be a string in the A/UX 3.1 sh. cat <<_ACEOF -\`configure' configures cpuminer-opt 3.12.2 to adapt to many kinds of systems. +\`configure' configures cpuminer-opt 3.12.3 to adapt to many kinds of systems. Usage: $0 [OPTION]... [VAR=VALUE]... 
@@ -1404,7 +1404,7 @@ fi if test -n "$ac_init_help"; then case $ac_init_help in - short | recursive ) echo "Configuration of cpuminer-opt 3.12.2:";; + short | recursive ) echo "Configuration of cpuminer-opt 3.12.3:";; esac cat <<\_ACEOF @@ -1509,7 +1509,7 @@ fi test -n "$ac_init_help" && exit $ac_status if $ac_init_version; then cat <<\_ACEOF -cpuminer-opt configure 3.12.2 +cpuminer-opt configure 3.12.3 generated by GNU Autoconf 2.69 Copyright (C) 2012 Free Software Foundation, Inc. @@ -2012,7 +2012,7 @@ cat >config.log <<_ACEOF This file contains any messages produced by compilers while running configure, to aid debugging if configure makes a mistake. -It was created by cpuminer-opt $as_me 3.12.2, which was +It was created by cpuminer-opt $as_me 3.12.3, which was generated by GNU Autoconf 2.69. Invocation command line was $ $0 $@ @@ -2993,7 +2993,7 @@ fi # Define the identity of the package. PACKAGE='cpuminer-opt' - VERSION='3.12.2' + VERSION='3.12.3' cat >>confdefs.h <<_ACEOF @@ -6690,7 +6690,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 # report actual input values of CONFIG_FILES etc. instead of their # values after options handling. ac_log=" -This file was extended by cpuminer-opt $as_me 3.12.2, which was +This file was extended by cpuminer-opt $as_me 3.12.3, which was generated by GNU Autoconf 2.69. 
Invocation command line was CONFIG_FILES = $CONFIG_FILES @@ -6756,7 +6756,7 @@ _ACEOF cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`" ac_cs_version="\\ -cpuminer-opt config.status 3.12.2 +cpuminer-opt config.status 3.12.3 configured by $0, generated by GNU Autoconf 2.69, with options \\"\$ac_cs_config\\" diff --git a/configure.ac b/configure.ac index 382d0df8..f8acae9a 100644 --- a/configure.ac +++ b/configure.ac @@ -1,4 +1,4 @@ -AC_INIT([cpuminer-opt], [3.12.2]) +AC_INIT([cpuminer-opt], [3.12.3]) AC_PREREQ([2.59c]) AC_CANONICAL_SYSTEM diff --git a/cpu-miner.c b/cpu-miner.c index 2c89fc19..43cd4f52 100644 --- a/cpu-miner.c +++ b/cpu-miner.c @@ -2458,6 +2458,8 @@ void std_stratum_gen_work( struct stratum_ctx *sctx, struct work *g_work ) pthread_mutex_unlock( &sctx->work_lock ); + restart_threads(); + if ( opt_debug ) { unsigned char *xnonce2str = abin2hex( g_work->xnonce2, diff --git a/simd-utils/simd-256.h b/simd-utils/simd-256.h index 7e8f61b1..e3df4403 100644 --- a/simd-utils/simd-256.h +++ b/simd-utils/simd-256.h @@ -120,11 +120,26 @@ do { \ } while(0) -// Horizontal vector testing -#define mm256_allbits0( a ) _mm256_testc_si256( a, m256_neg1 ) -#define mm256_allbits1( a ) _mm256_testz_si256( a, a ) -#define mm256_anybits0( a ) !mm256_allbits1( a ) -#define mm256_anybits1( a ) !mm256_allbits0( a ) +// Bytewise test of all 256 bits +#define mm256_all0_8( a ) \ + ( _mm256_movemask_epi8( a ) == 0 ) + +#define mm256_all1_8( a ) \ + ( _mm256_movemask_epi8( a ) == -1 ) + + +#define mm256_anybits0( a ) \ + ( _mm256_movemask_epi8( a ) & 0xffffffff ) + +#define mm256_anybits1( a ) \ + ( ( _mm256_movemask_epi8( a ) & 0xffffffff ) != 0xffffffff ) + + +// Bitwise test of all 256 bits +#define mm256_allbits0( a ) _mm256_testc_si256( a, m256_neg1 ) +#define mm256_allbits1( a ) _mm256_testc_si256( m256_zero, a ) +//#define mm256_anybits0( a ) !mm256_allbits1( a ) +//#define mm256_anybits1( a ) 
!mm256_allbits0( a ) // Parallel AES, for when x is expected to be in a 256 bit register.