From dc6b007a18383dbb4270de8ee12b43952bf6ed72 Mon Sep 17 00:00:00 2001 From: Jay D Dee Date: Fri, 12 Feb 2021 15:16:53 -0500 Subject: [PATCH 01/20] v3.15.6 --- README.txt | 2 +- RELEASE_NOTES | 8 +++ algo-gate-api.c | 1 - algo/keccak/keccak-macros.c | 1 + algo/x16/x16r-4way.c | 28 +++++++---- configure | 20 ++++---- configure.ac | 2 +- cpu-miner.c | 99 ++++++++++++++++++------------------- miner.h | 11 ++--- util.c | 36 +++++++------- 10 files changed, 111 insertions(+), 97 deletions(-) diff --git a/README.txt b/README.txt index 36298c00..08c34b9d 100644 --- a/README.txt +++ b/README.txt @@ -59,7 +59,7 @@ Notes about included DLL files: Downloading DLL files from alternative sources presents an inherent security risk if their source is unknown. All DLL files included have -been copied from the Ubuntu-20.04 instalation or compiled by me from +been copied from the Ubuntu-20.04 installation or compiled by me from source code obtained from the author's official repository. The exact procedure is documented in the build instructions for Windows: https://github.com/JayDDee/cpuminer-opt/wiki/Compiling-from-source diff --git a/RELEASE_NOTES b/RELEASE_NOTES index 1e2f27ad..9415c721 100644 --- a/RELEASE_NOTES +++ b/RELEASE_NOTES @@ -65,6 +65,14 @@ If not what makes it happen or not happen? Change Log ---------- +v3.15.6 + +Implement keccak pre-hash optimization for x16* algos. +Move conditional mining test to before get_new_work in miner thread. +Add test for share reject reason when solo mining. +Add support for floating point, as well as integer, "networkhasps" in +RPC getmininginfo method. + v3.15.5 Fix stratum jobs lost if 2 jobs received in less than one second. diff --git a/algo-gate-api.c b/algo-gate-api.c index e407ef76..6f273ccf 100644 --- a/algo-gate-api.c +++ b/algo-gate-api.c @@ -419,7 +419,6 @@ void exec_hash_function( int algo, void *output, const void *pdata ) const char* const algo_alias_map[][2] = { // alias proper - { "argon2d-crds", "argon2d250" }, { "argon2d-dyn", "argon2d500" }, { "argon2d-uis", "argon2d4096" }, { "bcd", "x13bcd" }, diff --git a/algo/keccak/keccak-macros.c b/algo/keccak/keccak-macros.c index 64606c37..9666e7d8 100644 --- a/algo/keccak/keccak-macros.c +++ b/algo/keccak/keccak-macros.c @@ -201,6 +201,7 @@ #define IOTA(r) XOR64_IOTA(a00, a00, r) #ifdef P0 +#undef P0 #undef P1 #undef P2 #undef P3 diff --git a/algo/x16/x16r-4way.c b/algo/x16/x16r-4way.c index 4d12029d..29739525 100644 --- a/algo/x16/x16r-4way.c +++ b/algo/x16/x16r-4way.c @@ -16,8 +16,7 @@ #if defined (X16R_8WAY) -// Perform midstate prehash of hash functions with block size <= 64 bytes -// and interleave 4x64 before nonce insertion for final hash. +// Perform midstate prehash of hash functions with block size <= 72 bytes. 
void x16r_8way_prehash( void *vdata, void *pdata ) { @@ -34,6 +33,11 @@ void x16r_8way_prehash( void *vdata, void *pdata ) jh512_8way_init( &x16r_ctx.jh ); jh512_8way_update( &x16r_ctx.jh, vdata, 64 ); break; + case KECCAK: + mm512_bswap32_intrlv80_8x64( vdata, pdata ); + keccak512_8way_init( &x16r_ctx.keccak ); + keccak512_8way_update( &x16r_ctx.keccak, vdata, 72 ); + break; case SKEIN: mm512_bswap32_intrlv80_8x64( vdata, pdata ); skein512_8way_init( &x16r_ctx.skein ); @@ -173,13 +177,13 @@ int x16r_8way_hash_generic( void* output, const void* input, int thrid ) hash7, vhash ); break; case KECCAK: - keccak512_8way_init( &ctx.keccak ); - if ( i == 0 ) - keccak512_8way_update( &ctx.keccak, input, size ); + if ( i == 0 ) + keccak512_8way_update( &ctx.keccak, input + (72<<3), 8 ); else { intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7, size<<3 ); + keccak512_8way_init( &ctx.keccak ); keccak512_8way_update( &ctx.keccak, vhash, size ); } keccak512_8way_close( &ctx.keccak, vhash ); @@ -490,6 +494,7 @@ int scanhash_x16r_8way( struct work *work, uint32_t max_nonce, { x16_r_s_getAlgoString( (const uint8_t*)bedata1, x16r_hash_order ); s_ntime = ntime; + if ( opt_debug && !thr_id ) applog( LOG_INFO, "hash order %s (%08x)", x16r_hash_order, ntime ); } @@ -533,6 +538,11 @@ void x16r_4way_prehash( void *vdata, void *pdata ) jh512_4way_init( &x16r_ctx.jh ); jh512_4way_update( &x16r_ctx.jh, vdata, 64 ); break; + case KECCAK: + mm256_bswap32_intrlv80_4x64( vdata, pdata ); + keccak512_4way_init( &x16r_ctx.keccak ); + keccak512_4way_update( &x16r_ctx.keccak, vdata, 72 ); + break; case SKEIN: mm256_bswap32_intrlv80_4x64( vdata, pdata ); skein512_4way_prehash64( &x16r_ctx.skein, vdata ); @@ -646,12 +656,12 @@ int x16r_4way_hash_generic( void* output, const void* input, int thrid ) dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); break; case KECCAK: - keccak512_4way_init( &ctx.keccak ); - if ( i == 0 ) - keccak512_4way_update( &ctx.keccak, input, size ); + if ( i == 0 ) + keccak512_4way_update( &ctx.keccak, input + (72<<2), 8 ); else { intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 ); + keccak512_4way_init( &ctx.keccak ); keccak512_4way_update( &ctx.keccak, vhash, size ); } keccak512_4way_close( &ctx.keccak, vhash ); @@ -883,7 +893,7 @@ int scanhash_x16r_4way( struct work *work, uint32_t max_nonce, x16_r_s_getAlgoString( (const uint8_t*)bedata1, x16r_hash_order ); s_ntime = ntime; if ( opt_debug && !thr_id ) - applog( LOG_INFO, "hash order %s (%08x)", x16r_hash_order, ntime ); + applog( LOG_INFO, "hash order %s (%08x)", x16r_hash_order, ntime ); } x16r_4way_prehash( vdata, pdata ); diff --git a/configure b/configure index 80d8d218..e18473f6 100755 --- a/configure +++ b/configure @@ -1,6 +1,6 @@ #! /bin/sh # Guess values for system-dependent variables and create Makefiles. -# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.15.5. +# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.15.6. # # # Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc. @@ -577,8 +577,8 @@ MAKEFLAGS= # Identity of this package. PACKAGE_NAME='cpuminer-opt' PACKAGE_TARNAME='cpuminer-opt' -PACKAGE_VERSION='3.15.5' -PACKAGE_STRING='cpuminer-opt 3.15.5' +PACKAGE_VERSION='3.15.6' +PACKAGE_STRING='cpuminer-opt 3.15.6' PACKAGE_BUGREPORT='' PACKAGE_URL='' @@ -1332,7 +1332,7 @@ if test "$ac_init_help" = "long"; then # Omit some internal or obsolete options to make the list less imposing. # This message is too long to be a string in the A/UX 3.1 sh. 
cat <<_ACEOF -\`configure' configures cpuminer-opt 3.15.5 to adapt to many kinds of systems. +\`configure' configures cpuminer-opt 3.15.6 to adapt to many kinds of systems. Usage: $0 [OPTION]... [VAR=VALUE]... @@ -1404,7 +1404,7 @@ fi if test -n "$ac_init_help"; then case $ac_init_help in - short | recursive ) echo "Configuration of cpuminer-opt 3.15.5:";; + short | recursive ) echo "Configuration of cpuminer-opt 3.15.6:";; esac cat <<\_ACEOF @@ -1509,7 +1509,7 @@ fi test -n "$ac_init_help" && exit $ac_status if $ac_init_version; then cat <<\_ACEOF -cpuminer-opt configure 3.15.5 +cpuminer-opt configure 3.15.6 generated by GNU Autoconf 2.69 Copyright (C) 2012 Free Software Foundation, Inc. @@ -2012,7 +2012,7 @@ cat >config.log <<_ACEOF This file contains any messages produced by compilers while running configure, to aid debugging if configure makes a mistake. -It was created by cpuminer-opt $as_me 3.15.5, which was +It was created by cpuminer-opt $as_me 3.15.6, which was generated by GNU Autoconf 2.69. Invocation command line was $ $0 $@ @@ -2993,7 +2993,7 @@ fi # Define the identity of the package. PACKAGE='cpuminer-opt' - VERSION='3.15.5' + VERSION='3.15.6' cat >>confdefs.h <<_ACEOF @@ -6690,7 +6690,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 # report actual input values of CONFIG_FILES etc. instead of their # values after options handling. ac_log=" -This file was extended by cpuminer-opt $as_me 3.15.5, which was +This file was extended by cpuminer-opt $as_me 3.15.6, which was generated by GNU Autoconf 2.69. Invocation command line was CONFIG_FILES = $CONFIG_FILES @@ -6756,7 +6756,7 @@ _ACEOF cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`" ac_cs_version="\\ -cpuminer-opt config.status 3.15.5 +cpuminer-opt config.status 3.15.6 configured by $0, generated by GNU Autoconf 2.69, with options \\"\$ac_cs_config\\" diff --git a/configure.ac b/configure.ac index 31bdb562..95d825e3 100644 --- a/configure.ac +++ b/configure.ac @@ -1,4 +1,4 @@ -AC_INIT([cpuminer-opt], [3.15.5]) +AC_INIT([cpuminer-opt], [3.15.6]) AC_PREREQ([2.59c]) AC_CANONICAL_SYSTEM diff --git a/cpu-miner.c b/cpu-miner.c index 3761313f..254f38f8 100644 --- a/cpu-miner.c +++ b/cpu-miner.c @@ -490,8 +490,13 @@ static bool get_mininginfo( CURL *curl, struct work *work ) } key = json_object_get( res, "networkhashps" ); - if ( key && json_is_integer( key ) ) - net_hashrate = (double) json_integer_value( key ); + if ( key ) + { + if ( json_is_integer( key ) ) + net_hashrate = (double) json_integer_value( key ); + else if ( json_is_real( key ) ) + net_hashrate = (double) json_real_value( key ); + } key = json_object_get( res, "blocks" ); if ( key && json_is_integer( key ) ) @@ -506,26 +511,7 @@ static bool get_mininginfo( CURL *curl, struct work *work ) // complete missing data from getwork work->height = (uint32_t) net_blocks + 1; if ( work->height > g_work.height ) - { restart_threads(); - -/* redundant with new block log - if ( !opt_quiet ) - { - char netinfo[64] = { 0 }; - char srate[32] = { 0 }; - sprintf( netinfo, "diff %.2f", net_diff ); - if ( net_hashrate ) - { - format_hashrate( net_hashrate, srate ); - strcat( netinfo, ", net " ); - strcat( netinfo, srate ); - } - applog( LOG_BLUE, "%s block %d, %s", - algo_names[opt_algo], work->height, netinfo ); - } -*/ - } } // res } json_decref( val ); @@ -920,12 +906,12 @@ static bool gbt_work_decode( const json_t *val, struct work *work ) tmp = json_object_get( val, "workid" ); if ( tmp ) { - 
if ( !json_is_string( tmp ) ) - { - applog( LOG_ERR, "JSON invalid workid" ); - goto out; - } - work->workid = strdup( json_string_value( tmp ) ); + if ( !json_is_string( tmp ) ) + { + applog( LOG_ERR, "JSON invalid workid" ); + goto out; + } + work->workid = strdup( json_string_value( tmp ) ); } rc = true; @@ -1078,13 +1064,12 @@ void report_summary_log( bool force ) if ( accepted_share_count < submitted_share_count ) { + double ltd = exp32 * last_targetdiff; double lost_ghrate = uptime.tv_sec == 0 ? 0. - : exp32 * last_targetdiff - * (double)(submitted_share_count - accepted_share_count ) - / (double)uptime.tv_sec; + : ltd * (double)(submitted_share_count - accepted_share_count ) + / (double)uptime.tv_sec; double lost_shrate = share_time == 0. ? 0. - : exp32 * last_targetdiff * (double)(submits - accepts ) - / share_time; + : ltd * (double)(submits - accepts ) / share_time; char lshr_units[4] = {0}; char lghr_units[4] = {0}; scale_hash_for_display( &lost_shrate, lshr_units ); @@ -1190,9 +1175,11 @@ static int share_result( int result, struct work *work, { sprintf( ares, "A%d", accepted_share_count ); sprintf( bres, "B%d", solved_block_count ); - stale = work ? work->data[ algo_gate.ntime_index ] - != g_work.data[ algo_gate.ntime_index ] : false; - if ( reason ) stale = stale || strstr( reason, "job" ); + if ( reason ) + stale = strstr( reason, "job" ); + else if ( work ) + stale = work->data[ algo_gate.ntime_index ] + != g_work.data[ algo_gate.ntime_index ]; if ( stale ) { stale_share_count++; @@ -1260,14 +1247,13 @@ static int share_result( int result, struct work *work, if ( unlikely( !( opt_quiet || result || stale ) ) ) { uint32_t str[8]; + uint32_t *targ; - if ( reason ) - applog( LOG_WARNING, "Reject reason: %s", reason ); + if ( reason ) applog( LOG_WARNING, "Reject reason: %s", reason ); - // display share hash and target for troubleshooting diff_to_hash( str, my_stats.share_diff ); applog2( LOG_INFO, "Hash: %08x%08x%08x...", str[7], str[6], str[5] ); - uint32_t *targ; + if ( work ) targ = work->target; else @@ -1580,6 +1566,7 @@ static bool get_upstream_work( CURL *curl, struct work *work ) { double miner_hr = 0.; double net_hr = net_hashrate; + double nd = net_diff * exp32; char net_hr_units[4] = {0}; char miner_hr_units[4] = {0}; char net_ttf[32]; @@ -1594,11 +1581,11 @@ static bool get_upstream_work( CURL *curl, struct work *work ) pthread_mutex_unlock( &stats_lock ); if ( net_hr > 0. ) - sprintf_et( net_ttf, ( net_diff * exp32 ) / net_hr ); + sprintf_et( net_ttf, nd / net_hr ); else sprintf( net_ttf, "NA" ); if ( miner_hr > 0. 
) - sprintf_et( miner_ttf, ( net_diff * exp32 ) / miner_hr ); + sprintf_et( miner_ttf, nd / miner_hr ); else sprintf( miner_ttf, "NA" ); @@ -1848,10 +1835,19 @@ bool submit_solution( struct work *work, const void *hash, work->data[ algo_gate.ntime_index ] ); } - if ( unlikely( lowdiff_debug ) ) + if ( opt_debug ) { uint32_t* h = (uint32_t*)hash; uint32_t* t = (uint32_t*)work->target; + uint32_t* d = (uint32_t*)work->data; + + unsigned char *xnonce2str = abin2hex( work->xnonce2, + work->xnonce2_len ); + applog(LOG_INFO,"Thread %d, Nonce %08x, Xnonce2 %s", thr->id, + work->data[ algo_gate.nonce_index ], xnonce2str ); + free( xnonce2str ); + applog(LOG_INFO,"Data[0:19]: %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x", d[0],d[1],d[2],d[3],d[4],d[5],d[6],d[7],d[8],d[9] ); + applog(LOG_INFO," : %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x", d[10],d[11],d[12],d[13],d[14],d[15],d[16],d[17],d[18],d[19]); applog(LOG_INFO,"Hash[7:0]: %08x %08x %08x %08x %08x %08x %08x %08x", h[7],h[6],h[5],h[4],h[3],h[2],h[1],h[0]); applog(LOG_INFO,"Targ[7:0]: %08x %08x %08x %08x %08x %08x %08x %08x", @@ -2066,11 +2062,12 @@ static void stratum_gen_work( struct stratum_ctx *sctx, struct work *g_work ) if ( likely( hr > 0. ) ) { + double nd = net_diff * exp32; char hr_units[4] = {0}; char block_ttf[32]; char share_ttf[32]; - sprintf_et( block_ttf, ( net_diff * exp32 ) / hr ); + sprintf_et( block_ttf, nd / hr ); sprintf_et( share_ttf, ( g_work->targetdiff * exp32 ) / hr ); scale_hash_for_display ( &hr, hr_units ); applog2( LOG_INFO, "TTF @ %.2f %sh/s: Block %s, Share %s", @@ -2086,7 +2083,7 @@ static void stratum_gen_work( struct stratum_ctx *sctx, struct work *g_work ) : et.tv_sec / ( last_block_height - session_first_block ); if ( net_diff && net_ttf ) { - double net_hr = net_diff * exp32 / net_ttf; + double net_hr = nd / net_ttf; char net_hr_units[4] = {0}; scale_hash_for_display ( &net_hr, net_hr_units ); @@ -2253,12 +2250,6 @@ static void *miner_thread( void *userdata ) if ( unlikely( !algo_gate.ready_to_mine( &work, &stratum, thr_id ) ) ) continue; - // conditional mining - if ( unlikely( !wanna_mine( thr_id ) ) ) - { - sleep(5); - continue; - } // LP_SCANTIME overrides opt_scantime option, is this right? 
@@ -2445,6 +2436,14 @@ static void *miner_thread( void *userdata ) #endif } } // benchmark + + // conditional mining + if ( unlikely( !wanna_mine( thr_id ) ) ) + { + sleep(5); + continue; + } + } // miner_thread loop out: diff --git a/miner.h b/miner.h index 63f17f00..119c8a75 100644 --- a/miner.h +++ b/miner.h @@ -643,7 +643,7 @@ static const char* const algo_names[] = { "lyra2z330", "m7m", "minotaur", - "myr-gr", + "myr-gr", "neoscrypt", "nist5", "pentablake", @@ -771,7 +771,7 @@ Options:\n\ allium Garlicoin (GRLC)\n\ anime Animecoin (ANI)\n\ argon2 Argon2 Coin (AR2)\n\ - argon2d250 argon2d-crds, Credits (CRDS)\n\ + argon2d250\n\ argon2d500 argon2d-dyn, Dynamic (DYN)\n\ argon2d4096 argon2d-uis, Unitus (UIS)\n\ axiom Shabal-256 MemoHash\n\ @@ -796,13 +796,13 @@ Options:\n\ lyra2h Hppcoin\n\ lyra2re lyra2\n\ lyra2rev2 lyrav2\n\ - lyra2rev3 lyrav2v3, Vertcoin\n\ + lyra2rev3 lyrav2v3\n\ lyra2z\n\ lyra2z330 Lyra2 330 rows\n\ m7m Magi (XMG)\n\ myr-gr Myriad-Groestl\n\ minotaur Ringcoin (RNG)\n\ - neoscrypt NeoScrypt(128, 2, 1)\n\ + neoscrypt NeoScrypt(128, 2, 1)\n\ nist5 Nist5\n\ pentablake 5 x blake512\n\ phi1612 phi\n\ @@ -816,7 +816,7 @@ Options:\n\ sha256d Double SHA-256\n\ sha256q Quad SHA-256, Pyrite (PYE)\n\ sha256t Triple SHA-256, Onecoin (OC)\n\ - sha3d Double Keccak256 (BSHA3)\n\ + sha3d Double Keccak256 (BSHA3)\n\ shavite3 Shavite3\n\ skein Skein+Sha (Skeincoin)\n\ skein2 Double Skein (Woodcoin)\n\ @@ -875,7 +875,6 @@ Options:\n\ -s, --scantime=N upper bound on time spent scanning current work when\n\ long polling is unavailable, in seconds (default: 5)\n\ --randomize Randomize scan range start to reduce duplicates\n\ - --reset-on-stale Workaround reset stratum if too many stale shares\n\ -f, --diff-factor Divide req. difficulty by this factor (std is 1.0)\n\ -m, --diff-multiplier Multiply difficulty by this factor (std is 1.0)\n\ --hash-meter Display thread hash rates\n\ diff --git a/util.c b/util.c index 5df1eb93..6a7a0503 100644 --- a/util.c +++ b/util.c @@ -1048,53 +1048,51 @@ bool fulltest( const uint32_t *hash, const uint32_t *target ) return rc; } -// Mathmatically the difficulty is simply the reciprocal of the hash. +// Mathmatically the difficulty is simply the reciprocal of the hash: d = 1/h. // Both are real numbers but the hash (target) is represented as a 256 bit -// number with the upper 32 bits representing the whole integer part and the -// lower 224 bits representing the fractional part: +// fixed point number with the upper 32 bits representing the whole integer +// part and the lower 224 bits representing the fractional part: // target[ 255:224 ] = trunc( 1/diff ) // target[ 223: 0 ] = frac( 1/diff ) // // The 256 bit hash is exact but any floating point representation is not. -// Stratum provides the target difficulty as double precision, inexcact, and +// Stratum provides the target difficulty as double precision, inexcact, // which must be converted to a hash target. The converted hash target will -// likely be less precise to to inexact input and conversion error. -// converted to 256 bit hash which will also be inexact and likelyless -// accurate to to error in conversion. +// likely be less precise due to inexact input and conversion error. // On the other hand getwork provides a 256 bit hash target which is exact. // // How much precision is needed? 
// -// 128 bit types are implemented in software by the compiler using 64 bit +// 128 bit types are implemented in software by the compiler on 64 bit // hardware resulting in lower performance and more error than would be -// expected with a hardware 128 bit implementtaion. +// expected with a hardware 128 bit implementaion. // Float80 exploits the internals of the FP unit which provide a 64 bit // mantissa in an 80 bit register with hardware rounding. When the destination // is double the data is rounded to float64 format. Long double returns all // 80 bits without rounding and including any accumulated computation error. // Float80 does not fit efficiently in memory. // -// 256 bit hash: 76 +// Significant digits: +// 256 bit hash: 76 // float: 7 (float32, 80 bits with rounding to 32 bits) // double: 15 (float64, 80 bits with rounding to 64 bits) -// long double 19 (float80, 80 bits with no rounding) -// __float128 33 (128 bits with no rounding) +// long double: 19 (float80, 80 bits with no rounding) +// __float128: 33 (128 bits with no rounding) // uint32_t: 9 // uint64_t: 19 // uint128_t 38 // // The concept of significant digits doesn't apply to the 256 bit hash -// representation. It's fixed point making leading zeros significant -// Leading zeros count in the 256 bit +// representation. It's fixed point making leading zeros significant, +// limiting its range and precision due to fewer zon-zero significant digits. // // Doing calculations with float128 and uint128 increases precision for // target_to_diff, but doesn't help with stratum diff being limited to // double precision. Is the extra precision really worth the extra cost? -// -// With double the error rate is 1/1e15, or one hash in every Petahash -// with a very low difficulty, not a likely sitiation. Higher difficulty -// increases the effective precision. Due to the floating nature of the -// decimal point leading zeros aren't counted. +// With float128 the error rate is 1/1e33 compared with 1/1e15 for double. +// For double that's 1 error in every petahash with a very low difficulty, +// not a likely situation. With higher difficulty effective precision +// increases. // // Unfortunately I can't get float128 to work so long double (float80) is // as precise as it gets. 
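
As a minimal sketch of the fixed point layout described in the util.c comments above (hedged example: the function and variable names below are hypothetical and this is not the project's diff_to_hash(); it only illustrates how 1/diff maps onto eight 32-bit words, with target[7] holding the 32-bit integer part and long double used for the extra mantissa precision discussed):

#include <stdint.h>
#include <stdio.h>
#include <math.h>

// Sketch: build a 256 bit fixed point target = 1/diff, with a 32 bit
// integer part in target[7] and a 224 bit fractional part in target[6..0].
static void example_diff_to_target( uint32_t target[8], long double diff )
{
   long double r = 1.0L / diff;            // float80 mantissa, ~19 digits
   for ( int i = 7; i >= 0; i-- )
   {
      long double w = floorl( r );         // next 32 bit word, MSW first
      target[i] = (uint32_t)w;
      r = ( r - w ) * 4294967296.0L;       // shift the next 32 bits up
   }
}

int main()
{
   uint32_t t[8];
   example_diff_to_target( t, 2.0L );      // diff 2 -> 00000000 80000000 ...
   printf( "Targ[7:0]: %08x %08x %08x %08x %08x %08x %08x %08x\n",
           t[7], t[6], t[5], t[4], t[3], t[2], t[1], t[0] );
   return 0;
}

(Compile with -lm; a difficulty of 1.0 yields target[7] = 0x00000001 with all lower words zero, higher difficulty pushes the leading non-zero bits further down, which is the "leading zeros are significant" point made above.)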
From 40089428c5fdd72f26bd8749a70b770537fe7043 Mon Sep 17 00:00:00 2001 From: Jay D Dee Date: Mon, 8 Mar 2021 22:44:44 -0500 Subject: [PATCH 02/20] v3.15.7 --- INSTALL_WINDOWS | 141 +++- RELEASE_NOTES | 8 + algo/echo/aes_ni/hash.c | 4 +- algo/echo/echo-hash-4way.c | 17 +- algo/fugue/fugue-aesni.h | 6 +- algo/groestl/groestl256-hash-4way.c | 54 +- algo/groestl/groestl512-hash-4way.c | 20 +- algo/hamsi/hamsi-hash-4way.c | 18 +- algo/keccak/keccak-macros.c | 48 +- algo/luffa/luffa-hash-2way.c | 72 +- algo/luffa/luffa_for_sse2.c | 12 +- algo/sha/sph_sha2.c | 2 + algo/shavite/shavite-hash-2way.c | 27 +- algo/shavite/shavite-hash-4way.c | 10 +- algo/swifftx/stdbool.h | 47 -- algo/swifftx/swifftx.c.bak | 1155 --------------------------- configure | 20 +- configure.ac | 2 +- cpu-miner.c | 130 +-- miner.h | 5 +- simd-utils.h | 2 +- simd-utils/simd-128.h | 160 ++-- simd-utils/simd-256.h | 359 ++------- simd-utils/simd-512.h | 283 +++---- simd-utils/simd-64.h | 83 +- simd-utils/simd-int.h | 74 +- 26 files changed, 598 insertions(+), 2161 deletions(-) delete mode 100644 algo/swifftx/stdbool.h delete mode 100644 algo/swifftx/swifftx.c.bak diff --git a/INSTALL_WINDOWS b/INSTALL_WINDOWS index f2e2c80a..02a829ed 100644 --- a/INSTALL_WINDOWS +++ b/INSTALL_WINDOWS @@ -1,5 +1,9 @@ Instructions for compiling cpuminer-opt for Windows. +Thwaw intructions nay be out of date. Please consult the wiki for +the latest: + +https://github.com/JayDDee/cpuminer-opt/wiki/Compiling-from-source Windows compilation using Visual Studio is not supported. Mingw64 is used on a Linux system (bare metal or virtual machine) to cross-compile @@ -24,79 +28,76 @@ Refer to Linux compile instructions and install required packages. Additionally, install mingw-w64. -sudo apt-get install mingw-w64 +sudo apt-get install mingw-w64 libz-mingw-w64-dev 2. Create a local library directory for packages to be compiled in the next step. Suggested location is $HOME/usr/lib/ +$ mkdir $HOME/usr/lib + 3. Download and build other packages for mingw that don't have a mingw64 version available in the repositories. Download the following source code packages from their respective and respected download locations, copy them to ~/usr/lib/ and uncompress them. -openssl -curl -gmp +openssl: https://github.com/openssl/openssl/releases + +curl: https://github.com/curl/curl/releases + +gmp: https://gmplib.org/download/gmp/ -In most cases the latest vesrion is ok but it's safest to download -the same major and minor version as included in your distribution. +In most cases the latest version is ok but it's safest to download the same major and minor version as included in your distribution. The following uses versions from Ubuntu 20.04. Change version numbers as required. -Run the following commands or follow the supplied instructions. -Do not run "make install" unless you are using ~/usr/lib, which isn't -recommended. +Run the following commands or follow the supplied instructions. Do not run "make install" unless you are using /usr/lib, which isn't recommended. -Some instructions insist on running "make check". If make check fails -it may still work, YMMV. +Some instructions insist on running "make check". If make check fails it may still work, YMMV. -You can speed up "make" by using all CPU cores available with "-j n" where -n is the number of CPU threads you want to use. +You can speed up "make" by using all CPU cores available with "-j n" where n is the number of CPU threads you want to use. 
openssl: -./Configure mingw64 shared --cross-compile-prefix=x86_64-w64-mingw32 -make +$ ./Configure mingw64 shared --cross-compile-prefix=x86_64-w64-mingw32- +$ make + +Make may fail with an ld error, just ensure libcrypto-1_1-x64.dll is created. curl: -./configure --with-winssl --with-winidn --host=x86_64-w64-mingw32 -make +$ ./configure --with-winssl --with-winidn --host=x86_64-w64-mingw32 +$ make gmp: -./configure --host=x86_64-w64-mingw32 -make - - +$ ./configure --host=x86_64-w64-mingw32 +$ make 4. Tweak the environment. -This step is required everytime you login or the commands can be added to -.bashrc. +This step is required everytime you login or the commands can be added to .bashrc. -Define some local variables to point to local library. +Define some local variables to point to local library. -export LOCAL_LIB="$HOME/usr/lib" +$ export LOCAL_LIB="$HOME/usr/lib" -export LDFLAGS="-L$LOCAL_LIB/curl/lib/.libs -L$LOCAL_LIB/gmp/.libs -L$LOCAL_LIB/openssl" +$ export LDFLAGS="-L$LOCAL_LIB/curl/lib/.libs -L$LOCAL_LIB/gmp/.libs -L$LOCAL_LIB/openssl" -export CONFIGURE_ARGS="--with-curl=$LOCAL_LIB/curl --with-crypto=$LOCAL_LIB/openssl --host=x86_64-w64-mingw32" +$ export CONFIGURE_ARGS="--with-curl=$LOCAL_LIB/curl --with-crypto=$LOCAL_LIB/openssl --host=x86_64-w64-mingw32" -Create a release directory and copy some dll files previously built. -This can be done outside of cpuminer-opt and only needs to be done once. -If the release directory is in cpuminer-opt directory it needs to be -recreated every a source package is decompressed. +Adjust for gcc version: -mkdir release -cp /usr/x86_64-w64-mingw32/lib/zlib1.dll release/ -cp /usr/x86_64-w64-mingw32/lib/libwinpthread-1.dll release/ -cp /usr/lib/gcc/x86_64-w64-mingw32/7.3-win32/libstdc++-6.dll release/ -cp /usr/lib/gcc/x86_64-w64-mingw32/7.3-win32/libgcc_s_seh-1.dll release/ -cp $LOCAL_LIB/openssl/libcrypto-1_1-x64.dll release/ -cp $LOCAL_LIB/curl/lib/.libs/libcurl-4.dll release/ +$ export GCC_MINGW_LIB="/usr/lib/gcc/x86_64-w64-mingw32/9.3-win32" +Create a release directory and copy some dll files previously built. This can be done outside of cpuminer-opt and only needs to be done once. If the release directory is in cpuminer-opt directory it needs to be recreated every time a source package is decompressed. +$ mkdir release +$ cp /usr/x86_64-w64-mingw32/lib/zlib1.dll release/ +$ cp /usr/x86_64-w64-mingw32/lib/libwinpthread-1.dll release/ +$ cp $GCC_MINGW_LIB/libstdc++-6.dll release/ +$ cp $GCC_MINGW_LIB/libgcc_s_seh-1.dll release/ +$ cp $LOCAL_LIB/openssl/libcrypto-1_1-x64.dll release/ +$ cp $LOCAL_LIB/curl/lib/.libs/libcurl-4.dll release/ The following steps need to be done every time a new source package is opened. @@ -110,13 +111,73 @@ https://github.com/JayDDee/cpuminer-opt/releases Decompress and change to the cpuminer-opt directory. +6. compile + +Create a link to the locally compiled version of gmp.h + +$ ln -s $LOCAL_LIB/gmp-version/gmp.h ./gmp.h + +$ ./autogen.sh + +Configure the compiler for the CPU architecture of the host machine: + +CFLAGS="-O3 -march=native -Wall" ./configure $CONFIGURE_ARGS + +or cross compile for a specific CPU architecture: + +CFLAGS="-O3 -march=znver1 -Wall" ./configure $CONFIGURE_ARGS + +This will compile for AMD Ryzen. + +You can compile more generically for a set of specific CPU features if you know what features you want: + +CFLAGS="-O3 -maes -msse4.2 -Wall" ./configure $CONFIGURE_ARGS + +This will compile for an older CPU that does not have AVX. 
+ +You can find several examples in README.txt + +If you have a CPU with more than 64 threads and Windows 7 or higher you can enable the CPU Groups feature by adding the following to CFLAGS: + +"-D_WIN32_WINNT=0x0601" + +Once you have run configure successfully run the compiler with n CPU threads: + +$ make -j n + +Copy cpuminer.exe to the release directory, compress and copy the release directory to a Windows system and run cpuminer.exe from the command line. + +Run cpuminer + +In a command windows change directories to the unzipped release folder. to get a list of all options: + +cpuminer.exe --help + +Command options are specific to where you mine. Refer to the pool's instructions on how to set them. + + + + + + + + + + + + + + + + + + -6. Prepare to compile Create a link to the locally compiled version of gmp.h -ln -s $LOCAL_LIB/gmp-version/gmp.h ./gmp.h +$ ln -s $LOCAL_LIB/gmp-version/gmp.h ./gmp.h Edit configure.ac to fix lipthread package name. diff --git a/RELEASE_NOTES b/RELEASE_NOTES index 9415c721..732b5e64 100644 --- a/RELEASE_NOTES +++ b/RELEASE_NOTES @@ -65,6 +65,14 @@ If not what makes it happen or not happen? Change Log ---------- +v3.15.7 + +Added accepted/stale/rejected percentage to summary log report. +Added warning if share counters mismatch which could corrupt stats. +Linux: CPU temperature reporting is more responsive to rising temperature. +A few AVX2 & AVX512 tweaks. +Removed some dead code and other cleanup. + v3.15.6 Implement keccak pre-hash optimization for x16* algos. diff --git a/algo/echo/aes_ni/hash.c b/algo/echo/aes_ni/hash.c index 55b27c60..a4e3958c 100644 --- a/algo/echo/aes_ni/hash.c +++ b/algo/echo/aes_ni/hash.c @@ -55,8 +55,8 @@ MYALIGN const unsigned int mul2ipt[] = {0x728efc00, 0x6894e61a, 0x3fc3b14d, 0x2 #define ECHO_SUBBYTES(state, i, j) \ state[i][j] = _mm_aesenc_si128(state[i][j], k1);\ - state[i][j] = _mm_aesenc_si128(state[i][j], M128(zero));\ - k1 = _mm_add_epi32(k1, M128(const1)) + k1 = _mm_add_epi32(k1, M128(const1));\ + state[i][j] = _mm_aesenc_si128(state[i][j], M128(zero)) #define ECHO_MIXBYTES(state1, state2, j, t1, t2, s2) \ s2 = _mm_add_epi8(state1[0][j], state1[0][j]);\ diff --git a/algo/echo/echo-hash-4way.c b/algo/echo/echo-hash-4way.c index eb3c41c2..51a9f0a8 100644 --- a/algo/echo/echo-hash-4way.c +++ b/algo/echo/echo-hash-4way.c @@ -10,22 +10,20 @@ static const unsigned int mul2ipt[] __attribute__ ((aligned (64))) = 0xfd5ba600, 0x2a8c71d7, 0x1eb845e3, 0xc96f9234 }; */ -// do these need to be reversed? 
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) -#define mul2mask \ - m512_const2_64( 0, 0x00001b00 ) +//#define mul2mask m512_const2_64( 0, 0x00001b00 ) //_mm512_set4_epi32( 0, 0, 0, 0x00001b00 ) -// _mm512_set4_epi32( 0x00001b00, 0, 0, 0 ) +//_mm512_set4_epi32( 0x00001b00, 0, 0, 0 ) -#define lsbmask m512_const1_32( 0x01010101 ) +//#define lsbmask m512_const1_32( 0x01010101 ) #define ECHO_SUBBYTES( state, i, j ) \ state[i][j] = _mm512_aesenc_epi128( state[i][j], k1 ); \ - state[i][j] = _mm512_aesenc_epi128( state[i][j], m512_zero ); \ - k1 = _mm512_add_epi32( k1, m512_one_128 ); + k1 = _mm512_add_epi32( k1, one ); \ + state[i][j] = _mm512_aesenc_epi128( state[i][j], m512_zero ); #define ECHO_MIXBYTES( state1, state2, j, t1, t2, s2 ) do \ { \ @@ -140,6 +138,9 @@ void echo_4way_compress( echo_4way_context *ctx, const __m512i *pmsg, unsigned int r, b, i, j; __m512i t1, t2, s2, k1; __m512i _state[4][4], _state2[4][4], _statebackup[4][4]; + __m512i one = m512_one_128; + __m512i mul2mask = m512_const2_64( 0, 0x00001b00 ); + __m512i lsbmask = m512_const1_32( 0x01010101 ); _state[ 0 ][ 0 ] = ctx->state[ 0 ][ 0 ]; _state[ 0 ][ 1 ] = ctx->state[ 0 ][ 1 ]; @@ -406,8 +407,8 @@ int echo_4way_full( echo_4way_context *ctx, void *hashval, int nHashSize, #define ECHO_SUBBYTES_2WAY( state, i, j ) \ state[i][j] = _mm256_aesenc_epi128( state[i][j], k1 ); \ + k1 = _mm256_add_epi32( k1, m256_one_128 ); \ state[i][j] = _mm256_aesenc_epi128( state[i][j], m256_zero ); \ - k1 = _mm256_add_epi32( k1, m256_one_128 ); #define ECHO_MIXBYTES_2WAY( state1, state2, j, t1, t2, s2 ) do \ { \ diff --git a/algo/fugue/fugue-aesni.h b/algo/fugue/fugue-aesni.h index be9806f4..d1536641 100644 --- a/algo/fugue/fugue-aesni.h +++ b/algo/fugue/fugue-aesni.h @@ -14,7 +14,11 @@ #ifndef FUGUE_HASH_API_H #define FUGUE_HASH_API_H -#if defined(__AES__) +#if defined(__AES__) + +#if !defined(__SSE4_1__) +#error "Unsupported configuration, AES needs SSE4.1. Compile without AES." 
+#endif #include "algo/sha/sha3_common.h" #include "simd-utils.h" diff --git a/algo/groestl/groestl256-hash-4way.c b/algo/groestl/groestl256-hash-4way.c index dd82a867..adbdf664 100644 --- a/algo/groestl/groestl256-hash-4way.c +++ b/algo/groestl/groestl256-hash-4way.c @@ -51,7 +51,7 @@ int groestl256_4way_full( groestl256_4way_context* ctx, void* output, const int hashlen_m128i = 32 >> 4; // bytes to __m128i const int hash_offset = SIZE256 - hashlen_m128i; int rem = ctx->rem_ptr; - int blocks = len / SIZE256; + uint64_t blocks = len / SIZE256; __m512i* in = (__m512i*)input; int i; @@ -89,21 +89,21 @@ int groestl256_4way_full( groestl256_4way_context* ctx, void* output, if ( i == SIZE256 - 1 ) { // only 1 vector left in buffer, all padding at once - ctx->buffer[i] = m512_const2_64( (uint64_t)blocks << 56, 0x80 ); + ctx->buffer[i] = m512_const2_64( blocks << 56, 0x80 ); } else { // add first padding - ctx->buffer[i] = m512_const4_64( 0, 0x80, 0, 0x80 ); + ctx->buffer[i] = m512_const2_64( 0, 0x80 ); // add zero padding for ( i += 1; i < SIZE256 - 1; i++ ) ctx->buffer[i] = m512_zero; // add length padding, second last byte is zero unless blocks > 255 - ctx->buffer[i] = m512_const2_64( (uint64_t)blocks << 56, 0 ); + ctx->buffer[i] = m512_const2_64( blocks << 56, 0 ); } -// digest final padding block and do output transform + // digest final padding block and do output transform TF512_4way( ctx->chaining, ctx->buffer ); OF512_4way( ctx->chaining ); @@ -122,7 +122,7 @@ int groestl256_4way_update_close( groestl256_4way_context* ctx, void* output, const int hashlen_m128i = ctx->hashlen / 16; // bytes to __m128i const int hash_offset = SIZE256 - hashlen_m128i; int rem = ctx->rem_ptr; - int blocks = len / SIZE256; + uint64_t blocks = len / SIZE256; __m512i* in = (__m512i*)input; int i; @@ -146,20 +146,18 @@ int groestl256_4way_update_close( groestl256_4way_context* ctx, void* output, if ( i == SIZE256 - 1 ) { // only 1 vector left in buffer, all padding at once - ctx->buffer[i] = m512_const1_128( _mm_set_epi8( - blocks, blocks>>8,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0x80 ) ); + ctx->buffer[i] = m512_const2_64( blocks << 56, 0x80 ); } else { // add first padding - ctx->buffer[i] = m512_const4_64( 0, 0x80, 0, 0x80 ); + ctx->buffer[i] = m512_const2_64( 0, 0x80 ); // add zero padding for ( i += 1; i < SIZE256 - 1; i++ ) ctx->buffer[i] = m512_zero; // add length padding, second last byte is zero unless blocks > 255 - ctx->buffer[i] = m512_const1_128( _mm_set_epi8( - blocks, blocks>>8, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0 ) ); + ctx->buffer[i] = m512_const2_64( blocks << 56, 0 ); } // digest final padding block and do output transform @@ -209,23 +207,23 @@ int groestl256_2way_full( groestl256_2way_context* ctx, void* output, const int hashlen_m128i = 32 >> 4; // bytes to __m128i const int hash_offset = SIZE256 - hashlen_m128i; int rem = ctx->rem_ptr; - int blocks = len / SIZE256; + uint64_t blocks = len / SIZE256; __m256i* in = (__m256i*)input; int i; - if (ctx->chaining == NULL || ctx->buffer == NULL) - return 1; + if (ctx->chaining == NULL || ctx->buffer == NULL) + return 1; - for ( i = 0; i < SIZE256; i++ ) - { + for ( i = 0; i < SIZE256; i++ ) + { ctx->chaining[i] = m256_zero; ctx->buffer[i] = m256_zero; - } + } - // The only non-zero in the IV is len. It can be hard coded. - ctx->chaining[ 3 ] = m256_const2_64( 0, 0x0100000000000000 ); - ctx->buf_ptr = 0; - ctx->rem_ptr = 0; + // The only non-zero in the IV is len. It can be hard coded. 
+ ctx->chaining[ 3 ] = m256_const2_64( 0, 0x0100000000000000 ); + ctx->buf_ptr = 0; + ctx->rem_ptr = 0; // --- update --- @@ -247,7 +245,7 @@ int groestl256_2way_full( groestl256_2way_context* ctx, void* output, if ( i == SIZE256 - 1 ) { // only 1 vector left in buffer, all padding at once - ctx->buffer[i] = m256_const2_64( (uint64_t)blocks << 56, 0x80 ); + ctx->buffer[i] = m256_const2_64( blocks << 56, 0x80 ); } else { @@ -258,10 +256,10 @@ int groestl256_2way_full( groestl256_2way_context* ctx, void* output, ctx->buffer[i] = m256_zero; // add length padding, second last byte is zero unless blocks > 255 - ctx->buffer[i] = m256_const2_64( (uint64_t)blocks << 56, 0 ); + ctx->buffer[i] = m256_const2_64( blocks << 56, 0 ); } -// digest final padding block and do output transform + // digest final padding block and do output transform TF512_2way( ctx->chaining, ctx->buffer ); OF512_2way( ctx->chaining ); @@ -279,7 +277,7 @@ int groestl256_2way_update_close( groestl256_2way_context* ctx, void* output, const int hashlen_m128i = ctx->hashlen / 16; // bytes to __m128i const int hash_offset = SIZE256 - hashlen_m128i; int rem = ctx->rem_ptr; - int blocks = len / SIZE256; + uint64_t blocks = len / SIZE256; __m256i* in = (__m256i*)input; int i; @@ -303,8 +301,7 @@ int groestl256_2way_update_close( groestl256_2way_context* ctx, void* output, if ( i == SIZE256 - 1 ) { // only 1 vector left in buffer, all padding at once - ctx->buffer[i] = m256_const1_128( _mm_set_epi8( - blocks, blocks>>8,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0x80 ) ); + ctx->buffer[i] = m256_const2_64( blocks << 56, 0x80 ); } else { @@ -315,8 +312,7 @@ int groestl256_2way_update_close( groestl256_2way_context* ctx, void* output, ctx->buffer[i] = m256_zero; // add length padding, second last byte is zero unless blocks > 255 - ctx->buffer[i] = m256_const1_128( _mm_set_epi8( - blocks, blocks>>8, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0 ) ); + ctx->buffer[i] = m256_const2_64( blocks << 56, 0 ); } // digest final padding block and do output transform diff --git a/algo/groestl/groestl512-hash-4way.c b/algo/groestl/groestl512-hash-4way.c index bff6af53..b7547339 100644 --- a/algo/groestl/groestl512-hash-4way.c +++ b/algo/groestl/groestl512-hash-4way.c @@ -43,7 +43,7 @@ int groestl512_4way_update_close( groestl512_4way_context* ctx, void* output, const int hashlen_m128i = 64 / 16; // bytes to __m128i const int hash_offset = SIZE512 - hashlen_m128i; int rem = ctx->rem_ptr; - int blocks = len / SIZE512; + uint64_t blocks = len / SIZE512; __m512i* in = (__m512i*)input; int i; @@ -64,16 +64,14 @@ int groestl512_4way_update_close( groestl512_4way_context* ctx, void* output, if ( i == SIZE512 - 1 ) { // only 1 vector left in buffer, all padding at once - ctx->buffer[i] = m512_const1_128( _mm_set_epi8( - blocks, blocks>>8,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0x80 ) ); + ctx->buffer[i] = m512_const2_64( blocks << 56, 0x80 ); } else { - ctx->buffer[i] = m512_const4_64( 0, 0x80, 0, 0x80 ); + ctx->buffer[i] = m512_const2_64( 0, 0x80 ); for ( i += 1; i < SIZE512 - 1; i++ ) ctx->buffer[i] = m512_zero; - ctx->buffer[i] = m512_const1_128( _mm_set_epi8( - blocks, blocks>>8, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0 ) ); + ctx->buffer[i] = m512_const2_64( blocks << 56, 0 ); } TF1024_4way( ctx->chaining, ctx->buffer ); @@ -124,7 +122,7 @@ int groestl512_4way_full( groestl512_4way_context* ctx, void* output, } else { - ctx->buffer[i] = m512_const4_64( 0, 0x80, 0, 0x80 ); + ctx->buffer[i] = m512_const2_64( 0, 0x80 ); for ( i += 1; i < SIZE512 - 1; i++ ) ctx->buffer[i] = m512_zero; 
ctx->buffer[i] = m512_const2_64( blocks << 56, 0 ); @@ -168,7 +166,7 @@ int groestl512_2way_update_close( groestl512_2way_context* ctx, void* output, const int hashlen_m128i = 64 / 16; // bytes to __m128i const int hash_offset = SIZE512 - hashlen_m128i; int rem = ctx->rem_ptr; - int blocks = len / SIZE512; + uint64_t blocks = len / SIZE512; __m256i* in = (__m256i*)input; int i; @@ -189,16 +187,14 @@ int groestl512_2way_update_close( groestl512_2way_context* ctx, void* output, if ( i == SIZE512 - 1 ) { // only 1 vector left in buffer, all padding at once - ctx->buffer[i] = m256_const1_128( _mm_set_epi8( - blocks, blocks>>8,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0x80 ) ); + ctx->buffer[i] = m256_const2_64( blocks << 56, 0x80 ); } else { ctx->buffer[i] = m256_const2_64( 0, 0x80 ); for ( i += 1; i < SIZE512 - 1; i++ ) ctx->buffer[i] = m256_zero; - ctx->buffer[i] = m256_const1_128( _mm_set_epi8( - blocks, blocks>>8, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0 ) ); + ctx->buffer[i] = m256_const2_64( blocks << 56, 0 ); } TF1024_2way( ctx->chaining, ctx->buffer ); diff --git a/algo/hamsi/hamsi-hash-4way.c b/algo/hamsi/hamsi-hash-4way.c index d86bd42d..2a952a73 100644 --- a/algo/hamsi/hamsi-hash-4way.c +++ b/algo/hamsi/hamsi-hash-4way.c @@ -548,7 +548,7 @@ static const sph_u32 T512[64][16] = { #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) -// Hamsi 8 way +// Hamsi 8 way AVX512 #define INPUT_BIG8 \ do { \ @@ -849,13 +849,11 @@ void hamsi512_8way_update( hamsi_8way_big_context *sc, const void *data, void hamsi512_8way_close( hamsi_8way_big_context *sc, void *dst ) { __m512i pad[1]; - int ch, cl; + uint32_t ch, cl; sph_enc32be( &ch, sc->count_high ); sph_enc32be( &cl, sc->count_low + ( sc->partial_len << 3 ) ); - pad[0] = _mm512_set_epi32( cl, ch, cl, ch, cl, ch, cl, ch, - cl, ch, cl, ch, cl, ch, cl, ch ); -// pad[0] = m512_const2_32( cl, ch ); + pad[0] = _mm512_set1_epi64( ((uint64_t)cl << 32 ) | (uint64_t)ch ); sc->buf[0] = m512_const1_64( 0x80 ); hamsi_8way_big( sc, sc->buf, 1 ); hamsi_8way_big_final( sc, pad ); @@ -863,11 +861,9 @@ void hamsi512_8way_close( hamsi_8way_big_context *sc, void *dst ) mm512_block_bswap_32( (__m512i*)dst, sc->h ); } - #endif // AVX512 - -// Hamsi 4 way +// Hamsi 4 way AVX2 #define INPUT_BIG \ do { \ @@ -1186,14 +1182,12 @@ void hamsi512_4way_update( hamsi_4way_big_context *sc, const void *data, void hamsi512_4way_close( hamsi_4way_big_context *sc, void *dst ) { __m256i pad[1]; - int ch, cl; + uint32_t ch, cl; sph_enc32be( &ch, sc->count_high ); sph_enc32be( &cl, sc->count_low + ( sc->partial_len << 3 ) ); - pad[0] = _mm256_set_epi32( cl, ch, cl, ch, cl, ch, cl, ch ); + pad[0] = _mm256_set1_epi64x( ((uint64_t)cl << 32 ) | (uint64_t)ch ); sc->buf[0] = m256_const1_64( 0x80 ); -// sc->buf[0] = _mm256_set_epi32( 0UL, 0x80UL, 0UL, 0x80UL, -// 0UL, 0x80UL, 0UL, 0x80UL ); hamsi_big( sc, sc->buf, 1 ); hamsi_big_final( sc, pad ); diff --git a/algo/keccak/keccak-macros.c b/algo/keccak/keccak-macros.c index 9666e7d8..8d5197c3 100644 --- a/algo/keccak/keccak-macros.c +++ b/algo/keccak/keccak-macros.c @@ -134,65 +134,47 @@ do { \ DECL64(c0); \ DECL64(c1); \ - DECL64(c2); \ - DECL64(c3); \ - DECL64(c4); \ DECL64(bnn); \ NOT64(bnn, b20); \ KHI_XO(c0, b00, b10, b20); \ KHI_XO(c1, b10, bnn, b30); \ - KHI_XA(c2, b20, b30, b40); \ - KHI_XO(c3, b30, b40, b00); \ - KHI_XA(c4, b40, b00, b10); \ + KHI_XA(b20, b20, b30, b40); \ + KHI_XO(b30, b30, b40, b00); \ + KHI_XA(b40, b40, b00, b10); \ MOV64(b00, c0); \ MOV64(b10, c1); \ - MOV64(b20, c2); \ - MOV64(b30, 
c3); \ - MOV64(b40, c4); \ NOT64(bnn, b41); \ KHI_XO(c0, b01, b11, b21); \ KHI_XA(c1, b11, b21, b31); \ - KHI_XO(c2, b21, b31, bnn); \ - KHI_XO(c3, b31, b41, b01); \ - KHI_XA(c4, b41, b01, b11); \ + KHI_XO(b21, b21, b31, bnn); \ + KHI_XO(b31, b31, b41, b01); \ + KHI_XA(b41, b41, b01, b11); \ MOV64(b01, c0); \ MOV64(b11, c1); \ - MOV64(b21, c2); \ - MOV64(b31, c3); \ - MOV64(b41, c4); \ NOT64(bnn, b32); \ KHI_XO(c0, b02, b12, b22); \ KHI_XA(c1, b12, b22, b32); \ - KHI_XA(c2, b22, bnn, b42); \ - KHI_XO(c3, bnn, b42, b02); \ - KHI_XA(c4, b42, b02, b12); \ + KHI_XA(b22, b22, bnn, b42); \ + KHI_XO(b32, bnn, b42, b02); \ + KHI_XA(b42, b42, b02, b12); \ MOV64(b02, c0); \ MOV64(b12, c1); \ - MOV64(b22, c2); \ - MOV64(b32, c3); \ - MOV64(b42, c4); \ NOT64(bnn, b33); \ KHI_XA(c0, b03, b13, b23); \ KHI_XO(c1, b13, b23, b33); \ - KHI_XO(c2, b23, bnn, b43); \ - KHI_XA(c3, bnn, b43, b03); \ - KHI_XO(c4, b43, b03, b13); \ + KHI_XO(b23, b23, bnn, b43); \ + KHI_XA(b33, bnn, b43, b03); \ + KHI_XO(b43, b43, b03, b13); \ MOV64(b03, c0); \ MOV64(b13, c1); \ - MOV64(b23, c2); \ - MOV64(b33, c3); \ - MOV64(b43, c4); \ NOT64(bnn, b14); \ KHI_XA(c0, b04, bnn, b24); \ KHI_XO(c1, bnn, b24, b34); \ - KHI_XA(c2, b24, b34, b44); \ - KHI_XO(c3, b34, b44, b04); \ - KHI_XA(c4, b44, b04, b14); \ + KHI_XA(b24, b24, b34, b44); \ + KHI_XO(b34, b34, b44, b04); \ + KHI_XA(b44, b44, b04, b14); \ MOV64(b04, c0); \ MOV64(b14, c1); \ - MOV64(b24, c2); \ - MOV64(b34, c3); \ - MOV64(b44, c4); \ } while (0) #ifdef IOTA diff --git a/algo/luffa/luffa-hash-2way.c b/algo/luffa/luffa-hash-2way.c index aad56b63..bbc31b9b 100644 --- a/algo/luffa/luffa-hash-2way.c +++ b/algo/luffa/luffa-hash-2way.c @@ -66,6 +66,17 @@ static const uint32 CNS_INIT[128] __attribute((aligned(64))) = { a = _mm512_xor_si512(a,c0);\ b = _mm512_xor_si512(b,c1); +#define MULT24W( a0, a1 ) \ +do { \ + __m512i b = _mm512_xor_si512( a0, \ + _mm512_maskz_shuffle_epi32( 0xbbbb, a1, 16 ) ); \ + a0 = _mm512_or_si512( _mm512_bsrli_epi128( b, 4 ), \ + _mm512_bslli_epi128( a1,12 ) ); \ + a1 = _mm512_or_si512( _mm512_bsrli_epi128( a1, 4 ), \ + _mm512_bslli_epi128( b,12 ) ); \ +} while(0) + +/* #define MULT24W( a0, a1, mask ) \ do { \ __m512i b = _mm512_xor_si512( a0, \ @@ -73,6 +84,7 @@ do { \ a0 = _mm512_or_si512( _mm512_bsrli_epi128(b,4), _mm512_bslli_epi128(a1,12) );\ a1 = _mm512_or_si512( _mm512_bsrli_epi128(a1,4), _mm512_bslli_epi128(b,12) );\ } while(0) +*/ // confirm pointer arithmetic // ok but use array indexes @@ -235,7 +247,6 @@ void rnd512_4way( luffa_4way_context *state, __m512i *msg ) __m512i msg0, msg1; __m512i tmp[2]; __m512i x[8]; - const __m512i MASK = m512_const2_64( 0, 0x00000000ffffffff ); t0 = chainv[0]; t1 = chainv[1]; @@ -249,7 +260,7 @@ void rnd512_4way( luffa_4way_context *state, __m512i *msg ) t0 = _mm512_xor_si512( t0, chainv[8] ); t1 = _mm512_xor_si512( t1, chainv[9] ); - MULT24W( t0, t1, MASK ); + MULT24W( t0, t1 ); msg0 = _mm512_shuffle_epi32( msg[0], 27 ); msg1 = _mm512_shuffle_epi32( msg[1], 27 ); @@ -268,68 +279,67 @@ void rnd512_4way( luffa_4way_context *state, __m512i *msg ) t0 = chainv[0]; t1 = chainv[1]; - MULT24W( chainv[0], chainv[1], MASK ); + MULT24W( chainv[0], chainv[1] ); chainv[0] = _mm512_xor_si512( chainv[0], chainv[2] ); chainv[1] = _mm512_xor_si512( chainv[1], chainv[3] ); - MULT24W( chainv[2], chainv[3], MASK ); + MULT24W( chainv[2], chainv[3] ); chainv[2] = _mm512_xor_si512(chainv[2], chainv[4]); chainv[3] = _mm512_xor_si512(chainv[3], chainv[5]); - MULT24W( chainv[4], chainv[5], MASK ); + MULT24W( chainv[4], chainv[5] ); 
chainv[4] = _mm512_xor_si512(chainv[4], chainv[6]); chainv[5] = _mm512_xor_si512(chainv[5], chainv[7]); - MULT24W( chainv[6], chainv[7], MASK ); + MULT24W( chainv[6], chainv[7] ); chainv[6] = _mm512_xor_si512(chainv[6], chainv[8]); chainv[7] = _mm512_xor_si512(chainv[7], chainv[9]); - MULT24W( chainv[8], chainv[9], MASK ); + MULT24W( chainv[8], chainv[9] ); chainv[8] = _mm512_xor_si512( chainv[8], t0 ); chainv[9] = _mm512_xor_si512( chainv[9], t1 ); t0 = chainv[8]; t1 = chainv[9]; - MULT24W( chainv[8], chainv[9], MASK ); + MULT24W( chainv[8], chainv[9] ); chainv[8] = _mm512_xor_si512( chainv[8], chainv[6] ); chainv[9] = _mm512_xor_si512( chainv[9], chainv[7] ); - MULT24W( chainv[6], chainv[7], MASK ); + MULT24W( chainv[6], chainv[7] ); chainv[6] = _mm512_xor_si512( chainv[6], chainv[4] ); chainv[7] = _mm512_xor_si512( chainv[7], chainv[5] ); - MULT24W( chainv[4], chainv[5], MASK ); + MULT24W( chainv[4], chainv[5] ); chainv[4] = _mm512_xor_si512( chainv[4], chainv[2] ); chainv[5] = _mm512_xor_si512( chainv[5], chainv[3] ); - MULT24W( chainv[2], chainv[3], MASK ); + MULT24W( chainv[2], chainv[3] ); chainv[2] = _mm512_xor_si512( chainv[2], chainv[0] ); chainv[3] = _mm512_xor_si512( chainv[3], chainv[1] ); - MULT24W( chainv[0], chainv[1], MASK ); + MULT24W( chainv[0], chainv[1] ); chainv[0] = _mm512_xor_si512( _mm512_xor_si512( chainv[0], t0 ), msg0 ); chainv[1] = _mm512_xor_si512( _mm512_xor_si512( chainv[1], t1 ), msg1 ); - MULT24W( msg0, msg1, MASK ); + MULT24W( msg0, msg1 ); chainv[2] = _mm512_xor_si512( chainv[2], msg0 ); chainv[3] = _mm512_xor_si512( chainv[3], msg1 ); - MULT24W( msg0, msg1, MASK ); + MULT24W( msg0, msg1 ); chainv[4] = _mm512_xor_si512( chainv[4], msg0 ); chainv[5] = _mm512_xor_si512( chainv[5], msg1 ); - MULT24W( msg0, msg1, MASK ); + MULT24W( msg0, msg1 ); chainv[6] = _mm512_xor_si512( chainv[6], msg0 ); chainv[7] = _mm512_xor_si512( chainv[7], msg1 ); - MULT24W( msg0, msg1, MASK ); + MULT24W( msg0, msg1); chainv[8] = _mm512_xor_si512( chainv[8], msg0 ); chainv[9] = _mm512_xor_si512( chainv[9], msg1 ); - MULT24W( msg0, msg1, MASK ); + MULT24W( msg0, msg1 ); - // replace with ror chainv[3] = _mm512_rol_epi32( chainv[3], 1 ); chainv[5] = _mm512_rol_epi32( chainv[5], 2 ); chainv[7] = _mm512_rol_epi32( chainv[7], 3 ); @@ -496,7 +506,7 @@ int luffa_4way_update( luffa_4way_context *state, const void *data, { // remaining data bytes buffer[0] = _mm512_shuffle_epi8( vdata[0], shuff_bswap32 ); - buffer[1] = m512_const2_64( 0, 0x0000000080000000 ); + buffer[1] = m512_const1_i128( 0x0000000080000000 ); } return 0; } @@ -520,7 +530,7 @@ int luffa_4way_close( luffa_4way_context *state, void *hashval ) rnd512_4way( state, buffer ); else { // empty pad block, constant data - msg[0] = m512_const2_64( 0, 0x0000000080000000 ); + msg[0] = m512_const1_i128( 0x0000000080000000 ); msg[1] = m512_zero; rnd512_4way( state, msg ); } @@ -583,13 +593,13 @@ int luffa512_4way_full( luffa_4way_context *state, void *output, { // padding of partial block msg[0] = _mm512_shuffle_epi8( vdata[ 0 ], shuff_bswap32 ); - msg[1] = m512_const2_64( 0, 0x0000000080000000 ); + msg[1] = m512_const1_i128( 0x0000000080000000 ); rnd512_4way( state, msg ); } else { // empty pad block - msg[0] = m512_const2_64( 0, 0x0000000080000000 ); + msg[0] = m512_const1_i128( 0x0000000080000000 ); msg[1] = m512_zero; rnd512_4way( state, msg ); } @@ -631,13 +641,13 @@ int luffa_4way_update_close( luffa_4way_context *state, { // padding of partial block msg[0] = _mm512_shuffle_epi8( vdata[ 0 ], shuff_bswap32 ); - msg[1] = 
m512_const2_64( 0, 0x0000000080000000 ); + msg[1] = m512_const1_i128( 0x0000000080000000 ); rnd512_4way( state, msg ); } else { // empty pad block - msg[0] = m512_const2_64( 0, 0x0000000080000000 ); + msg[0] = m512_const1_i128( 0x0000000080000000 ); msg[1] = m512_zero; rnd512_4way( state, msg ); } @@ -832,7 +842,7 @@ void rnd512_2way( luffa_2way_context *state, __m256i *msg ) __m256i msg0, msg1; __m256i tmp[2]; __m256i x[8]; - const __m256i MASK = m256_const2_64( 0, 0x00000000ffffffff ); + const __m256i MASK = m256_const1_i128( 0x00000000ffffffff ); t0 = chainv[0]; t1 = chainv[1]; @@ -1088,7 +1098,7 @@ int luffa_2way_update( luffa_2way_context *state, const void *data, { // remaining data bytes buffer[0] = _mm256_shuffle_epi8( vdata[0], shuff_bswap32 ); - buffer[1] = m256_const2_64( 0, 0x0000000080000000 ); + buffer[1] = m256_const1_i128( 0x0000000080000000 ); } return 0; } @@ -1104,7 +1114,7 @@ int luffa_2way_close( luffa_2way_context *state, void *hashval ) rnd512_2way( state, buffer ); else { // empty pad block, constant data - msg[0] = m256_const2_64( 0, 0x0000000080000000 ); + msg[0] = m256_const1_i128( 0x0000000080000000 ); msg[1] = m256_zero; rnd512_2way( state, msg ); } @@ -1159,13 +1169,13 @@ int luffa512_2way_full( luffa_2way_context *state, void *output, { // padding of partial block msg[0] = _mm256_shuffle_epi8( vdata[ 0 ], shuff_bswap32 ); - msg[1] = m256_const2_64( 0, 0x0000000080000000 ); + msg[1] = m256_const1_i128( 0x0000000080000000 ); rnd512_2way( state, msg ); } else { // empty pad block - msg[0] = m256_const2_64( 0, 0x0000000080000000 ); + msg[0] = m256_const1_i128( 0x0000000080000000 ); msg[1] = m256_zero; rnd512_2way( state, msg ); } @@ -1206,13 +1216,13 @@ int luffa_2way_update_close( luffa_2way_context *state, { // padding of partial block msg[0] = _mm256_shuffle_epi8( vdata[ 0 ], shuff_bswap32 ); - msg[1] = m256_const2_64( 0, 0x0000000080000000 ); + msg[1] = m256_const1_i128( 0x0000000080000000 ); rnd512_2way( state, msg ); } else { // empty pad block - msg[0] = m256_const2_64( 0, 0x0000000080000000 ); + msg[0] = m256_const1_i128( 0x0000000080000000 ); msg[1] = m256_zero; rnd512_2way( state, msg ); } diff --git a/algo/luffa/luffa_for_sse2.c b/algo/luffa/luffa_for_sse2.c index 780e56d7..fee498a6 100644 --- a/algo/luffa/luffa_for_sse2.c +++ b/algo/luffa/luffa_for_sse2.c @@ -23,7 +23,7 @@ #include "simd-utils.h" #include "luffa_for_sse2.h" -#define MULT2(a0,a1) do \ +#define MULT2( a0, a1 ) do \ { \ __m128i b = _mm_xor_si128( a0, _mm_shuffle_epi32( _mm_and_si128(a1,MASK), 16 ) ); \ a0 = _mm_or_si128( _mm_srli_si128(b,4), _mm_slli_si128(a1,12) ); \ @@ -345,11 +345,11 @@ HashReturn update_and_final_luffa( hashState_luffa *state, BitSequence* output, // 16 byte partial block exists for 80 byte len if ( state->rembytes ) // padding of partial block - rnd512( state, m128_const_64( 0, 0x80000000 ), + rnd512( state, m128_const_i128( 0x80000000 ), mm128_bswap_32( cast_m128i( data ) ) ); else // empty pad block - rnd512( state, m128_zero, m128_const_64( 0, 0x80000000 ) ); + rnd512( state, m128_zero, m128_const_i128( 0x80000000 ) ); finalization512( state, (uint32*) output ); if ( state->hashbitlen > 512 ) @@ -394,11 +394,11 @@ int luffa_full( hashState_luffa *state, BitSequence* output, int hashbitlen, // 16 byte partial block exists for 80 byte len if ( state->rembytes ) // padding of partial block - rnd512( state, m128_const_64( 0, 0x80000000 ), + rnd512( state, m128_const_i128( 0x80000000 ), mm128_bswap_32( cast_m128i( data ) ) ); else // empty pad block - rnd512( state, 
m128_zero, m128_const_64( 0, 0x80000000 ) ); + rnd512( state, m128_zero, m128_const_i128( 0x80000000 ) ); finalization512( state, (uint32*) output ); if ( state->hashbitlen > 512 ) @@ -606,7 +606,6 @@ static void finalization512( hashState_luffa *state, uint32 *b ) casti_m256i( b, 0 ) = _mm256_shuffle_epi8( casti_m256i( hash, 0 ), shuff_bswap32 ); -// casti_m256i( b, 0 ) = mm256_bswap_32( casti_m256i( hash, 0 ) ); rnd512( state, zero, zero ); @@ -621,7 +620,6 @@ static void finalization512( hashState_luffa *state, uint32 *b ) casti_m256i( b, 1 ) = _mm256_shuffle_epi8( casti_m256i( hash, 0 ), shuff_bswap32 ); -// casti_m256i( b, 1 ) = mm256_bswap_32( casti_m256i( hash, 0 ) ); } #else diff --git a/algo/sha/sph_sha2.c b/algo/sha/sph_sha2.c index e87936dd..513a29fd 100644 --- a/algo/sha/sph_sha2.c +++ b/algo/sha/sph_sha2.c @@ -77,6 +77,7 @@ static const sph_u32 H256[8] = { #else // no SHA +/* static const sph_u32 K[64] = { SPH_C32(0x428A2F98), SPH_C32(0x71374491), SPH_C32(0xB5C0FBCF), SPH_C32(0xE9B5DBA5), @@ -111,6 +112,7 @@ static const sph_u32 K[64] = { SPH_C32(0x90BEFFFA), SPH_C32(0xA4506CEB), SPH_C32(0xBEF9A3F7), SPH_C32(0xC67178F2) }; +*/ #if SPH_SMALL_FOOTPRINT_SHA2 diff --git a/algo/shavite/shavite-hash-2way.c b/algo/shavite/shavite-hash-2way.c index 83f3e66b..1b774263 100644 --- a/algo/shavite/shavite-hash-2way.c +++ b/algo/shavite/shavite-hash-2way.c @@ -23,14 +23,23 @@ static const uint32_t IV512[] = _mm256_blend_epi32( mm256_ror128_32( a ), \ mm256_ror128_32( b ), 0x88 ) +#if defined(__VAES__) + +#define mm256_aesenc_2x128( x, k ) \ + _mm256_aesenc_epi128( x, _mm256_castsi128_si256( k ) ) + +#else + +#define mm256_aesenc_2x128( x, k ) \ + mm256_concat_128( _mm_aesenc_si128( mm128_extr_hi128_256( x ), k ), \ + _mm_aesenc_si128( mm128_extr_lo128_256( x ), k ) ) + +#endif + static void c512_2way( shavite512_2way_context *ctx, const void *msg ) { -#if defined(__VAES__) - const __m256i zero = _mm256_setzero_si256(); -#else const __m128i zero = _mm_setzero_si128(); -#endif __m256i p0, p1, p2, p3, x; __m256i k00, k01, k02, k03, k10, k11, k12, k13; __m256i *m = (__m256i*)msg; @@ -308,7 +317,7 @@ void shavite512_2way_close( shavite512_2way_context *ctx, void *dst ) uint32_t vp = ctx->ptr>>5; // Terminating byte then zero pad - casti_m256i( buf, vp++ ) = m256_const2_64( 0, 0x0000000000000080 ); + casti_m256i( buf, vp++ ) = m256_const1_i128( 0x0000000000000080 ); // Zero pad full vectors up to count for ( ; vp < 6; vp++ ) @@ -388,13 +397,13 @@ void shavite512_2way_update_close( shavite512_2way_context *ctx, void *dst, if ( vp == 0 ) // empty buf, xevan. { - casti_m256i( buf, 0 ) = m256_const2_64( 0, 0x0000000000000080 ); + casti_m256i( buf, 0 ) = m256_const1_i128( 0x0000000000000080 ); memset_zero_256( (__m256i*)buf + 1, 5 ); ctx->count0 = ctx->count1 = ctx->count2 = ctx->count3 = 0; } else // half full buf, everyone else. { - casti_m256i( buf, vp++ ) = m256_const2_64( 0, 0x0000000000000080 ); + casti_m256i( buf, vp++ ) = m256_const1_i128( 0x0000000000000080 ); memset_zero_256( (__m256i*)buf + vp, 6 - vp ); } @@ -478,13 +487,13 @@ void shavite512_2way_full( shavite512_2way_context *ctx, void *dst, if ( vp == 0 ) // empty buf, xevan. { - casti_m256i( buf, 0 ) = m256_const2_64( 0, 0x0000000000000080 ); + casti_m256i( buf, 0 ) = m256_const1_i128( 0x0000000000000080 ); memset_zero_256( (__m256i*)buf + 1, 5 ); ctx->count0 = ctx->count1 = ctx->count2 = ctx->count3 = 0; } else // half full buf, everyone else. 
{ - casti_m256i( buf, vp++ ) = m256_const2_64( 0, 0x0000000000000080 ); + casti_m256i( buf, vp++ ) = m256_const1_i128( 0x0000000000000080 ); memset_zero_256( (__m256i*)buf + vp, 6 - vp ); } diff --git a/algo/shavite/shavite-hash-4way.c b/algo/shavite/shavite-hash-4way.c index eed4ba14..2b0b7353 100644 --- a/algo/shavite/shavite-hash-4way.c +++ b/algo/shavite/shavite-hash-4way.c @@ -292,7 +292,7 @@ void shavite512_4way_close( shavite512_4way_context *ctx, void *dst ) uint32_t vp = ctx->ptr>>6; // Terminating byte then zero pad - casti_m512i( buf, vp++ ) = m512_const2_64( 0, 0x0000000000000080 ); + casti_m512i( buf, vp++ ) = m512_const1_i128( 0x0000000000000080 ); // Zero pad full vectors up to count for ( ; vp < 6; vp++ ) @@ -372,13 +372,13 @@ void shavite512_4way_update_close( shavite512_4way_context *ctx, void *dst, if ( vp == 0 ) // empty buf, xevan. { - casti_m512i( buf, 0 ) = m512_const2_64( 0, 0x0000000000000080 ); + casti_m512i( buf, 0 ) = m512_const1_i128( 0x0000000000000080 ); memset_zero_512( (__m512i*)buf + 1, 5 ); ctx->count0 = ctx->count1 = ctx->count2 = ctx->count3 = 0; } else // half full buf, everyone else. { - casti_m512i( buf, vp++ ) = m512_const2_64( 0, 0x0000000000000080 ); + casti_m512i( buf, vp++ ) = m512_const1_i128( 0x0000000000000080 ); memset_zero_512( (__m512i*)buf + vp, 6 - vp ); } @@ -463,13 +463,13 @@ void shavite512_4way_full( shavite512_4way_context *ctx, void *dst, if ( vp == 0 ) // empty buf, xevan. { - casti_m512i( buf, 0 ) = m512_const2_64( 0, 0x0000000000000080 ); + casti_m512i( buf, 0 ) = m512_const1_i128( 0x0000000000000080 ); memset_zero_512( (__m512i*)buf + 1, 5 ); ctx->count0 = ctx->count1 = ctx->count2 = ctx->count3 = 0; } else // half full buf, everyone else. { - casti_m512i( buf, vp++ ) = m512_const2_64( 0, 0x0000000000000080 ); + casti_m512i( buf, vp++ ) = m512_const1_i128( 0x0000000000000080 ); memset_zero_512( (__m512i*)buf + vp, 6 - vp ); } diff --git a/algo/swifftx/stdbool.h b/algo/swifftx/stdbool.h deleted file mode 100644 index d6396c35..00000000 --- a/algo/swifftx/stdbool.h +++ /dev/null @@ -1,47 +0,0 @@ -/* - * Copyright (c) 2000 Jeroen Ruigrok van der Werven - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. 
- * - * $FreeBSD: src/include/stdbool.h,v 1.6 2002/08/16 07:33:14 alfred Exp $ - */ - -#ifndef _STDBOOL_H_ -#define _STDBOOL_H_ - -#define __bool_true_false_are_defined 1 - -#ifndef __cplusplus - -#define false 0 -#define true 1 - -//#define bool _Bool -//#if __STDC_VERSION__ < 199901L && __GNUC__ < 3 -//typedef int _Bool; -//#endif -typedef int bool; - -#endif /* !__cplusplus */ - -#endif /* !_STDBOOL_H_ */ diff --git a/algo/swifftx/swifftx.c.bak b/algo/swifftx/swifftx.c.bak deleted file mode 100644 index 24453e21..00000000 --- a/algo/swifftx/swifftx.c.bak +++ /dev/null @@ -1,1155 +0,0 @@ -/////////////////////////////////////////////////////////////////////////////////////////////// -// -// SWIFFTX ANSI C OPTIMIZED 32BIT IMPLEMENTATION FOR NIST SHA-3 COMPETITION -// -// SWIFFTX.c -// -// October 2008 -// -// This is the source file of the OPTIMIZED 32BIT implementation of SWIFFTX hash function. -// SWIFFTX is a candidate function for SHA-3 NIST competition. -// More details about SWIFFTX can be found in the accompanying submission documents. -// -/////////////////////////////////////////////////////////////////////////////////////////////// -#include "swifftx.h" -// See the remarks concerning compatibility issues inside stdint.h. -#include "stdint.h" -// Remove this while using gcc: -//#include "stdbool.h" -#include - -/////////////////////////////////////////////////////////////////////////////////////////////// -// Constants and static tables portion. -/////////////////////////////////////////////////////////////////////////////////////////////// - -// In SWIFFTX we work over Z_257, so this is the modulus and the arithmetic is performed modulo -// this number. -#define FIELD_SIZE 257 - -// The size of FFT we use: -#define N 64 - -#define LOGN 6 - -#define EIGHTH_N (N / 8) - -// The number of FFTS done on the input. -#define M (SWIFFTX_INPUT_BLOCK_SIZE / 8) // 32 - -// Omega is the 128th root of unity in Z_257. -// We choose w = 42. -#define OMEGA 42 - -// The size of the inner FFT lookup table: -#define W 8 - -// Calculates the sum and the difference of two numbers. -// -// Parameters: -// - A: the first operand. After the operation stores the sum of the two operands. -// - B: the second operand. After the operation stores the difference between the first and the -// second operands. -#define ADD_SUB(A, B) {register int temp = (B); B = ((A) - (B)); A = ((A) + (temp));} - -// Quickly reduces an integer modulo 257. -// -// Parameters: -// - A: the input. -#define Q_REDUCE(A) (((A) & 0xff) - ((A) >> 8)) - -// Since we need to do the setup only once, this is the indicator variable: -static bool wasSetupDone = false; - -// This array stores the powers of omegas that correspond to the indices, which are the input -// values. Known also as the "outer FFT twiddle factors". -swift_int16_t multipliers[N]; - -// This array stores the powers of omegas, multiplied by the corresponding values. -// We store this table to save computation time. -// -// To calculate the intermediate value of the compression function (the first out of two -// stages), we multiply the k-th bit of x_i by w^[(2i + 1) * k]. {x_i} is the input to the -// compression function, i is between 0 and 31, x_i is a 64-bit value. -// One can see the formula for this (intermediate) stage in the SWIFFT FSE 2008 paper -- -// formula (2), section 3, page 6. -swift_int16_t fftTable[256 * EIGHTH_N]; - -// The A's we use in SWIFFTX shall be random elements of Z_257. 
-// We generated these A's from the decimal expansion of PI as follows: we converted each -// triple of digits into a decimal number d. If d < (257 * 3) we used (d % 257) for the next A -// element, otherwise move to the next triple of digits in the expansion. This guarntees that -// the A's are random, provided that PI digits are. -const swift_int16_t As[3 * M * N] = -{141, 78, 139, 75, 238, 205, 129, 126, 22, 245, 197, 169, 142, 118, 105, 78, - 50, 149, 29, 208, 114, 34, 85, 117, 67, 148, 86, 256, 25, 49, 133, 93, - 95, 36, 68, 231, 211, 102, 151, 128, 224, 117, 193, 27, 102, 187, 7, 105, - 45, 130, 108, 124, 171, 151, 189, 128, 218, 134, 233, 165, 14, 201, 145, 134, - 52, 203, 91, 96, 197, 69, 134, 213, 136, 93, 3, 249, 141, 16, 210, 73, - 6, 92, 58, 74, 174, 6, 254, 91, 201, 107, 110, 76, 103, 11, 73, 16, - 34, 209, 7, 127, 146, 254, 95, 176, 57, 13, 108, 245, 77, 92, 186, 117, - 124, 97, 105, 118, 34, 74, 205, 122, 235, 53, 94, 238, 210, 227, 183, 11, - 129, 159, 105, 183, 142, 129, 86, 21, 137, 138, 224, 223, 190, 188, 179, 188, - 256, 25, 217, 176, 36, 176, 238, 127, 160, 210, 155, 148, 132, 0, 54, 127, - 145, 6, 46, 85, 243, 95, 173, 123, 178, 207, 211, 183, 224, 173, 146, 35, - 71, 114, 50, 22, 175, 1, 28, 19, 112, 129, 21, 34, 161, 159, 115, 52, - 4, 193, 211, 92, 115, 49, 59, 217, 218, 96, 61, 81, 24, 202, 198, 89, - 45, 128, 8, 51, 253, 87, 171, 35, 4, 188, 171, 10, 3, 137, 238, 73, - 19, 208, 124, 163, 103, 177, 155, 147, 46, 84, 253, 233, 171, 241, 211, 217, - 159, 48, 96, 79, 237, 18, 171, 226, 99, 1, 97, 195, 216, 163, 198, 95, - 0, 201, 65, 228, 21, 153, 124, 230, 44, 35, 44, 108, 85, 156, 249, 207, - 26, 222, 131, 1, 60, 242, 197, 150, 181, 19, 116, 213, 75, 98, 124, 240, - 123, 207, 62, 255, 60, 143, 187, 157, 139, 9, 12, 104, 89, 49, 193, 146, - 104, 196, 181, 82, 198, 253, 192, 191, 255, 122, 212, 104, 47, 20, 132, 208, - 46, 170, 2, 69, 234, 36, 56, 163, 28, 152, 104, 238, 162, 56, 24, 58, - 38, 150, 193, 254, 253, 125, 173, 35, 73, 126, 247, 239, 216, 6, 199, 15, - 90, 12, 97, 122, 9, 84, 207, 127, 219, 72, 58, 30, 29, 182, 41, 192, - 235, 248, 237, 74, 72, 176, 210, 252, 45, 64, 165, 87, 202, 241, 236, 223, - 151, 242, 119, 239, 52, 112, 169, 28, 13, 37, 160, 60, 158, 81, 133, 60, - 16, 145, 249, 192, 173, 217, 214, 93, 141, 184, 54, 34, 161, 104, 157, 95, - 38, 133, 218, 227, 211, 181, 9, 66, 137, 143, 77, 33, 248, 159, 4, 55, - 228, 48, 99, 219, 222, 184, 15, 36, 254, 256, 157, 237, 87, 139, 209, 113, - 232, 85, 126, 167, 197, 100, 103, 166, 64, 225, 125, 205, 117, 135, 84, 128, - 231, 112, 90, 241, 28, 22, 210, 147, 186, 49, 230, 21, 108, 39, 194, 47, - 123, 199, 107, 114, 30, 210, 250, 143, 59, 156, 131, 133, 221, 27, 76, 99, - 208, 250, 78, 12, 211, 141, 95, 81, 195, 106, 8, 232, 150, 212, 205, 221, - 11, 225, 87, 219, 126, 136, 137, 180, 198, 48, 68, 203, 239, 252, 194, 235, - 142, 137, 174, 172, 190, 145, 250, 221, 182, 204, 1, 195, 130, 153, 83, 241, - 161, 239, 211, 138, 11, 169, 155, 245, 174, 49, 10, 166, 16, 130, 181, 139, - 222, 222, 112, 99, 124, 94, 51, 243, 133, 194, 244, 136, 35, 248, 201, 177, - 178, 186, 129, 102, 89, 184, 180, 41, 149, 96, 165, 72, 225, 231, 134, 158, - 199, 28, 249, 16, 225, 195, 10, 210, 164, 252, 138, 8, 35, 152, 213, 199, - 82, 116, 97, 230, 63, 199, 241, 35, 79, 120, 54, 174, 67, 112, 1, 76, - 69, 222, 194, 96, 82, 94, 25, 228, 196, 145, 155, 136, 228, 234, 46, 101, - 246, 51, 103, 166, 246, 75, 9, 200, 161, 4, 108, 35, 129, 168, 208, 144, - 50, 14, 13, 220, 41, 132, 122, 127, 194, 9, 232, 234, 107, 28, 187, 8, - 51, 141, 
97, 221, 225, 9, 113, 170, 166, 102, 135, 22, 231, 185, 227, 187, - 110, 145, 251, 146, 76, 22, 146, 228, 7, 53, 64, 25, 62, 198, 130, 190, - 221, 232, 169, 64, 188, 199, 237, 249, 173, 218, 196, 191, 48, 224, 5, 113, - 100, 166, 160, 21, 191, 197, 61, 162, 149, 171, 240, 183, 129, 231, 123, 204, - 192, 179, 134, 15, 47, 161, 142, 177, 239, 234, 186, 237, 231, 53, 208, 95, - 146, 36, 225, 231, 89, 142, 93, 248, 137, 124, 83, 39, 69, 77, 89, 208, - 182, 48, 85, 147, 244, 164, 246, 68, 38, 190, 220, 35, 202, 91, 157, 151, - 201, 240, 185, 218, 4, 152, 2, 132, 177, 88, 190, 196, 229, 74, 220, 135, - 137, 196, 11, 47, 5, 251, 106, 144, 163, 60, 222, 127, 52, 57, 202, 102, - 64, 140, 110, 206, 23, 182, 39, 245, 1, 163, 157, 186, 163, 80, 7, 230, - 44, 249, 176, 102, 164, 125, 147, 120, 18, 191, 186, 125, 64, 65, 198, 157, - 164, 213, 95, 61, 13, 181, 208, 91, 242, 197, 158, 34, 98, 169, 91, 14, - 17, 93, 157, 17, 65, 30, 183, 6, 139, 58, 255, 108, 100, 136, 209, 144, - 164, 6, 237, 33, 210, 110, 57, 126, 197, 136, 125, 244, 165, 151, 168, 3, - 143, 251, 247, 155, 136, 130, 88, 14, 74, 121, 250, 133, 21, 226, 185, 232, - 118, 132, 89, 64, 204, 161, 2, 70, 224, 159, 35, 204, 123, 180, 13, 52, - 231, 57, 25, 78, 66, 69, 97, 42, 198, 84, 176, 59, 8, 232, 125, 134, - 193, 2, 232, 109, 216, 69, 90, 142, 32, 38, 249, 37, 75, 180, 184, 188, - 19, 47, 120, 87, 146, 70, 232, 120, 191, 45, 33, 38, 19, 248, 110, 110, - 44, 64, 2, 84, 244, 228, 252, 228, 170, 123, 38, 144, 213, 144, 171, 212, - 243, 87, 189, 46, 128, 110, 84, 77, 65, 183, 61, 184, 101, 44, 168, 68, - 14, 106, 105, 8, 227, 211, 166, 39, 152, 43, 52, 254, 197, 55, 119, 89, - 168, 65, 53, 138, 177, 56, 219, 0, 58, 121, 148, 18, 44, 100, 215, 103, - 145, 229, 117, 196, 91, 89, 113, 143, 172, 239, 249, 184, 154, 39, 112, 65, - 204, 42, 84, 38, 155, 151, 151, 16, 100, 87, 174, 162, 145, 147, 149, 186, - 237, 145, 134, 144, 198, 235, 213, 163, 48, 230, 24, 47, 57, 71, 127, 0, - 150, 219, 12, 81, 197, 150, 131, 13, 169, 63, 175, 184, 48, 235, 65, 243, - 149, 200, 163, 254, 202, 114, 247, 67, 143, 250, 126, 228, 80, 130, 216, 214, - 36, 2, 230, 33, 119, 125, 3, 142, 237, 100, 3, 152, 197, 174, 244, 129, - 232, 30, 206, 199, 39, 210, 220, 43, 237, 221, 201, 54, 179, 42, 28, 133, - 246, 203, 198, 177, 0, 28, 194, 85, 223, 109, 155, 147, 221, 60, 133, 108, - 157, 254, 26, 75, 157, 185, 49, 142, 31, 137, 71, 43, 63, 64, 237, 148, - 237, 172, 159, 160, 155, 254, 234, 224, 140, 193, 114, 140, 62, 109, 136, 39, - 255, 8, 158, 146, 128, 49, 222, 96, 57, 209, 180, 249, 202, 127, 113, 231, - 78, 178, 46, 33, 228, 215, 104, 31, 207, 186, 82, 41, 42, 39, 103, 119, - 123, 133, 243, 254, 238, 156, 90, 186, 37, 212, 33, 107, 252, 51, 177, 36, - 237, 76, 159, 245, 93, 214, 97, 56, 190, 38, 160, 94, 105, 222, 220, 158, - 49, 16, 191, 52, 120, 87, 179, 2, 27, 144, 223, 230, 184, 6, 129, 227, - 69, 47, 215, 181, 162, 139, 72, 200, 45, 163, 159, 62, 2, 221, 124, 40, - 159, 242, 35, 208, 179, 166, 98, 67, 178, 68, 143, 225, 178, 146, 187, 159, - 57, 66, 176, 192, 236, 250, 168, 224, 122, 43, 159, 120, 133, 165, 122, 64, - 87, 74, 161, 241, 9, 87, 90, 24, 255, 113, 203, 220, 57, 139, 197, 159, - 31, 151, 27, 140, 77, 162, 7, 27, 84, 228, 187, 220, 53, 126, 162, 242, - 84, 181, 223, 103, 86, 177, 207, 31, 140, 18, 207, 256, 201, 166, 96, 23, - 233, 103, 197, 84, 161, 75, 59, 149, 138, 154, 119, 92, 16, 53, 116, 97, - 220, 114, 35, 45, 77, 209, 40, 196, 71, 22, 81, 178, 110, 14, 3, 180, - 110, 129, 112, 47, 18, 61, 134, 78, 73, 79, 254, 232, 125, 180, 205, 54, - 220, 119, 
63, 89, 181, 52, 77, 109, 151, 77, 80, 207, 144, 25, 20, 6, - 208, 47, 201, 206, 192, 14, 73, 176, 256, 201, 207, 87, 216, 60, 56, 73, - 92, 243, 179, 113, 49, 59, 55, 168, 121, 137, 69, 154, 95, 57, 187, 47, - 129, 4, 15, 92, 6, 116, 69, 196, 48, 134, 84, 81, 111, 56, 38, 176, - 239, 6, 128, 72, 242, 134, 36, 221, 59, 48, 242, 68, 130, 110, 171, 89, - 13, 220, 48, 29, 5, 75, 104, 233, 91, 129, 105, 162, 44, 113, 163, 163, - 85, 147, 190, 111, 197, 80, 213, 153, 81, 68, 203, 33, 161, 165, 10, 61, - 120, 252, 0, 205, 28, 42, 193, 64, 39, 37, 83, 175, 5, 218, 215, 174, - 128, 121, 231, 11, 150, 145, 135, 197, 136, 91, 193, 5, 107, 88, 82, 6, - 4, 188, 256, 70, 40, 2, 167, 57, 169, 203, 115, 254, 215, 172, 84, 80, - 188, 167, 34, 137, 43, 243, 2, 79, 178, 38, 188, 135, 233, 194, 208, 13, - 11, 151, 231, 196, 12, 122, 162, 56, 17, 114, 191, 207, 90, 132, 64, 238, - 187, 6, 198, 176, 240, 88, 118, 236, 15, 226, 166, 22, 193, 229, 82, 246, - 213, 64, 37, 63, 31, 243, 252, 37, 156, 38, 175, 204, 138, 141, 211, 82, - 106, 217, 97, 139, 153, 56, 129, 218, 158, 9, 83, 26, 87, 112, 71, 21, - 250, 5, 65, 141, 68, 116, 231, 113, 10, 218, 99, 205, 201, 92, 157, 4, - 97, 46, 49, 220, 72, 139, 103, 171, 149, 129, 193, 19, 69, 245, 43, 31, - 58, 68, 36, 195, 159, 22, 54, 34, 233, 141, 205, 100, 226, 96, 22, 192, - 41, 231, 24, 79, 234, 138, 30, 120, 117, 216, 172, 197, 172, 107, 86, 29, - 181, 151, 0, 6, 146, 186, 68, 55, 54, 58, 213, 182, 60, 231, 33, 232, - 77, 210, 216, 154, 80, 51, 141, 122, 68, 148, 219, 122, 254, 48, 64, 175, - 41, 115, 62, 243, 141, 81, 119, 121, 5, 68, 121, 88, 239, 29, 230, 90, - 135, 159, 35, 223, 168, 112, 49, 37, 146, 60, 126, 134, 42, 145, 115, 90, - 73, 133, 211, 86, 120, 141, 122, 241, 127, 56, 130, 36, 174, 75, 83, 246, - 112, 45, 136, 194, 201, 115, 1, 156, 114, 167, 208, 12, 176, 147, 32, 170, - 251, 100, 102, 220, 122, 210, 6, 49, 75, 201, 38, 105, 132, 135, 126, 102, - 13, 121, 76, 228, 202, 20, 61, 213, 246, 13, 207, 42, 148, 168, 37, 253, - 34, 94, 141, 185, 18, 234, 157, 109, 104, 64, 250, 125, 49, 236, 86, 48, - 196, 77, 75, 237, 156, 103, 225, 19, 110, 229, 22, 68, 177, 93, 221, 181, - 152, 153, 61, 108, 101, 74, 247, 195, 127, 216, 30, 166, 168, 61, 83, 229, - 120, 156, 96, 120, 201, 124, 43, 27, 253, 250, 120, 143, 89, 235, 189, 243, - 150, 7, 127, 119, 149, 244, 84, 185, 134, 34, 128, 193, 236, 234, 132, 117, - 137, 32, 145, 184, 44, 121, 51, 76, 11, 228, 142, 251, 39, 77, 228, 251, - 41, 58, 246, 107, 125, 187, 9, 240, 35, 8, 11, 162, 242, 220, 158, 163, - 2, 184, 163, 227, 242, 2, 100, 101, 2, 78, 129, 34, 89, 28, 26, 157, - 79, 31, 107, 250, 194, 156, 186, 69, 212, 66, 41, 180, 139, 42, 211, 253, - 256, 239, 29, 129, 104, 248, 182, 68, 1, 189, 48, 226, 36, 229, 3, 158, - 41, 53, 241, 22, 115, 174, 16, 163, 224, 19, 112, 219, 177, 233, 42, 27, - 250, 134, 18, 28, 145, 122, 68, 34, 134, 31, 147, 17, 39, 188, 150, 76, - 45, 42, 167, 249, 12, 16, 23, 182, 13, 79, 121, 3, 70, 197, 239, 44, - 86, 177, 255, 81, 64, 171, 138, 131, 73, 110, 44, 201, 254, 198, 146, 91, - 48, 9, 104, 31, 29, 161, 101, 31, 138, 180, 231, 233, 79, 137, 61, 236, - 140, 15, 249, 218, 234, 119, 99, 195, 110, 137, 237, 207, 8, 31, 45, 24, - 90, 155, 203, 253, 192, 203, 65, 176, 210, 171, 142, 214, 220, 122, 136, 237, - 189, 186, 147, 40, 80, 254, 173, 33, 191, 46, 192, 26, 108, 255, 228, 205, - 61, 76, 39, 107, 225, 126, 228, 182, 140, 251, 143, 134, 252, 168, 221, 8, - 185, 85, 60, 233, 147, 244, 87, 137, 8, 140, 96, 80, 53, 45, 175, 160, - 124, 189, 112, 37, 144, 19, 70, 17, 170, 242, 2, 3, 
28, 95, 120, 199, - 212, 43, 9, 117, 86, 151, 101, 241, 200, 145, 241, 19, 178, 69, 204, 197, - 227, 166, 94, 7, 193, 45, 247, 234, 19, 187, 212, 212, 236, 125, 33, 95, - 198, 121, 122, 103, 77, 155, 235, 49, 25, 237, 249, 11, 162, 7, 238, 24, - 16, 150, 129, 25, 152, 17, 42, 67, 247, 162, 77, 154, 31, 133, 55, 137, - 79, 119, 153, 10, 86, 28, 244, 186, 41, 169, 106, 44, 10, 49, 110, 179, - 32, 133, 155, 244, 61, 70, 131, 168, 170, 39, 231, 252, 32, 69, 92, 238, - 239, 35, 132, 136, 236, 167, 90, 32, 123, 88, 69, 22, 20, 89, 145, 166, - 30, 118, 75, 4, 49, 31, 225, 54, 11, 50, 56, 191, 246, 1, 187, 33, - 119, 107, 139, 68, 19, 240, 131, 55, 94, 113, 31, 252, 12, 179, 121, 2, - 120, 252, 0, 76, 41, 80, 185, 42, 62, 121, 105, 159, 121, 109, 111, 98, - 7, 118, 86, 29, 210, 70, 231, 179, 223, 229, 164, 70, 62, 47, 0, 206, - 204, 178, 168, 120, 224, 166, 99, 25, 103, 63, 246, 224, 117, 204, 75, 124, - 140, 133, 110, 110, 222, 88, 151, 118, 46, 37, 22, 143, 158, 40, 2, 50, - 153, 94, 190, 199, 13, 198, 127, 211, 180, 90, 183, 98, 0, 142, 210, 154, - 100, 187, 67, 231, 202, 100, 198, 235, 252, 160, 247, 124, 247, 14, 121, 221, - 57, 88, 253, 243, 185, 89, 45, 249, 221, 194, 108, 175, 193, 119, 50, 141, - 223, 133, 136, 64, 176, 250, 129, 100, 124, 94, 181, 159, 99, 185, 177, 240, - 135, 42, 103, 52, 202, 208, 143, 186, 193, 103, 154, 237, 102, 88, 225, 161, - 50, 188, 191, 109, 12, 87, 19, 227, 247, 183, 13, 52, 205, 170, 205, 146, - 89, 160, 18, 105, 192, 73, 231, 225, 184, 157, 252, 220, 61, 59, 169, 183, - 221, 20, 141, 20, 158, 101, 245, 7, 245, 225, 118, 137, 84, 55, 19, 27, - 164, 110, 35, 25, 202, 94, 150, 46, 91, 152, 130, 1, 7, 46, 16, 237, - 171, 109, 19, 200, 65, 38, 10, 213, 70, 96, 126, 226, 185, 225, 181, 46, - 10, 165, 11, 123, 53, 158, 22, 147, 64, 22, 227, 69, 182, 237, 197, 37, - 39, 49, 186, 223, 139, 128, 55, 36, 166, 178, 220, 20, 98, 172, 166, 253, - 45, 0, 120, 180, 189, 185, 158, 159, 196, 6, 214, 79, 141, 52, 156, 107, - 5, 109, 142, 159, 33, 64, 190, 133, 95, 132, 95, 202, 160, 63, 186, 23, - 231, 107, 163, 33, 234, 15, 244, 77, 108, 49, 51, 7, 164, 87, 142, 99, - 240, 202, 47, 256, 118, 190, 196, 178, 217, 42, 39, 153, 21, 192, 232, 202, - 14, 82, 179, 64, 233, 4, 219, 10, 133, 78, 43, 144, 146, 216, 202, 81, - 71, 252, 8, 201, 68, 256, 85, 233, 164, 88, 176, 30, 5, 152, 126, 179, - 249, 84, 140, 190, 159, 54, 118, 98, 2, 159, 27, 133, 74, 121, 239, 196, - 71, 149, 119, 135, 102, 20, 87, 112, 44, 75, 221, 3, 151, 158, 5, 98, - 152, 25, 97, 106, 63, 171, 240, 79, 234, 240, 230, 92, 76, 70, 173, 196, - 36, 225, 218, 133, 64, 240, 150, 41, 146, 66, 133, 51, 134, 73, 170, 238, - 140, 90, 45, 89, 46, 147, 96, 169, 174, 174, 244, 151, 90, 40, 32, 74, - 38, 154, 246, 57, 31, 14, 189, 151, 83, 243, 197, 183, 220, 185, 53, 225, - 51, 106, 188, 208, 222, 248, 93, 13, 93, 215, 131, 25, 142, 185, 113, 222, - 131, 215, 149, 50, 159, 85, 32, 5, 205, 192, 2, 227, 42, 214, 197, 42, - 126, 182, 68, 123, 109, 36, 237, 179, 170, 199, 77, 256, 5, 128, 214, 243, - 137, 177, 170, 253, 179, 180, 153, 236, 100, 196, 216, 231, 198, 37, 192, 80, - 121, 221, 246, 1, 16, 246, 29, 78, 64, 148, 124, 38, 96, 125, 28, 20, - 48, 51, 73, 187, 139, 208, 98, 253, 221, 188, 84, 129, 1, 205, 95, 205, - 117, 79, 71, 126, 134, 237, 19, 184, 137, 125, 129, 178, 223, 54, 188, 112, - 30, 7, 225, 228, 205, 184, 233, 87, 117, 22, 58, 10, 8, 42, 2, 114, - 254, 19, 17, 13, 150, 92, 233, 179, 63, 12, 60, 171, 127, 35, 50, 5, - 195, 113, 241, 25, 249, 184, 166, 44, 221, 35, 151, 116, 8, 54, 195, 89, - 218, 186, 132, 
5, 41, 89, 226, 177, 11, 41, 87, 172, 5, 23, 20, 59, - 228, 94, 76, 33, 137, 43, 151, 221, 61, 232, 4, 120, 93, 217, 80, 228, - 228, 6, 58, 25, 62, 84, 91, 48, 209, 20, 247, 243, 55, 106, 80, 79, - 235, 34, 20, 180, 146, 2, 236, 13, 236, 206, 243, 222, 204, 83, 148, 213, - 214, 117, 237, 98, 0, 90, 204, 168, 32, 41, 126, 67, 191, 74, 27, 255, - 26, 75, 240, 113, 185, 105, 167, 154, 112, 67, 151, 63, 161, 134, 239, 176, - 42, 87, 249, 130, 45, 242, 17, 100, 107, 120, 212, 218, 237, 76, 231, 162, - 175, 172, 118, 155, 92, 36, 124, 17, 121, 71, 13, 9, 82, 126, 147, 142, - 218, 148, 138, 80, 163, 106, 164, 123, 140, 129, 35, 42, 186, 154, 228, 214, - 75, 73, 8, 253, 42, 153, 232, 164, 95, 24, 110, 90, 231, 197, 90, 196, - 57, 164, 252, 181, 31, 7, 97, 256, 35, 77, 200, 212, 99, 179, 92, 227, - 17, 180, 49, 176, 9, 188, 13, 182, 93, 44, 128, 219, 134, 92, 151, 6, - 23, 126, 200, 109, 66, 30, 140, 180, 146, 134, 67, 200, 7, 9, 223, 168, - 186, 221, 3, 154, 150, 165, 43, 53, 138, 27, 86, 213, 235, 160, 70, 2, - 240, 20, 89, 212, 84, 141, 168, 246, 183, 227, 30, 167, 138, 185, 253, 83, - 52, 143, 236, 94, 59, 65, 89, 218, 194, 157, 164, 156, 111, 95, 202, 168, - 245, 256, 151, 28, 222, 194, 72, 130, 217, 134, 253, 77, 246, 100, 76, 32, - 254, 174, 182, 193, 14, 237, 74, 1, 74, 26, 135, 216, 152, 208, 112, 38, - 181, 62, 25, 71, 61, 234, 254, 97, 191, 23, 92, 256, 190, 205, 6, 16, - 134, 147, 210, 219, 148, 59, 73, 185, 24, 247, 174, 143, 116, 220, 128, 144, - 111, 126, 101, 98, 130, 136, 101, 102, 69, 127, 24, 168, 146, 226, 226, 207, - 176, 122, 149, 254, 134, 196, 22, 151, 197, 21, 50, 205, 116, 154, 65, 116, - 177, 224, 127, 77, 177, 159, 225, 69, 176, 54, 100, 104, 140, 8, 11, 126, - 11, 188, 185, 159, 107, 16, 254, 142, 80, 28, 5, 157, 104, 57, 109, 82, - 102, 80, 173, 242, 238, 207, 57, 105, 237, 160, 59, 189, 189, 199, 26, 11, - 190, 156, 97, 118, 20, 12, 254, 189, 165, 147, 142, 199, 5, 213, 64, 133, - 108, 217, 133, 60, 94, 28, 116, 136, 47, 165, 125, 42, 183, 143, 14, 129, - 223, 70, 212, 205, 181, 180, 3, 201, 182, 46, 57, 104, 239, 60, 99, 181, - 220, 231, 45, 79, 156, 89, 149, 143, 190, 103, 153, 61, 235, 73, 136, 20, - 89, 243, 16, 130, 247, 141, 134, 93, 80, 68, 85, 84, 8, 72, 194, 4, - 242, 110, 19, 133, 199, 70, 172, 92, 132, 254, 67, 74, 36, 94, 13, 90, - 154, 184, 9, 109, 118, 243, 214, 71, 36, 95, 0, 90, 201, 105, 112, 215, - 69, 196, 224, 210, 236, 242, 155, 211, 37, 134, 69, 113, 157, 97, 68, 26, - 230, 149, 219, 180, 20, 76, 172, 145, 154, 40, 129, 8, 93, 56, 162, 124, - 207, 233, 105, 19, 3, 183, 155, 134, 8, 244, 213, 78, 139, 88, 156, 37, - 51, 152, 111, 102, 112, 250, 114, 252, 201, 241, 133, 24, 136, 153, 5, 90, - 210, 197, 216, 24, 131, 17, 147, 246, 13, 86, 3, 253, 179, 237, 101, 114, - 243, 191, 207, 2, 220, 133, 244, 53, 87, 125, 154, 158, 197, 20, 8, 83, - 32, 191, 38, 241, 204, 22, 168, 59, 217, 123, 162, 82, 21, 50, 130, 89, - 239, 253, 195, 56, 253, 74, 147, 125, 234, 199, 250, 28, 65, 193, 22, 237, - 193, 94, 58, 229, 139, 176, 69, 42, 179, 164, 150, 168, 246, 214, 86, 174, - 59, 117, 15, 19, 76, 37, 214, 238, 153, 226, 154, 45, 109, 114, 198, 107, - 45, 70, 238, 196, 142, 252, 244, 71, 123, 136, 134, 188, 99, 132, 25, 42, - 240, 0, 196, 33, 26, 124, 256, 145, 27, 102, 153, 35, 28, 132, 221, 167, - 138, 133, 41, 170, 95, 224, 40, 139, 239, 153, 1, 106, 255, 106, 170, 163, - 127, 44, 155, 232, 194, 119, 232, 117, 239, 143, 108, 41, 3, 9, 180, 256, - 144, 113, 133, 200, 79, 69, 128, 216, 31, 50, 102, 209, 249, 136, 150, 154, - 182, 51, 228, 39, 127, 142, 87, 15, 
94, 92, 187, 245, 31, 236, 64, 58, - 114, 11, 17, 166, 189, 152, 218, 34, 123, 39, 58, 37, 153, 91, 63, 121, - 31, 34, 12, 254, 106, 96, 171, 14, 155, 247, 214, 69, 24, 98, 3, 204, - 202, 194, 207, 30, 253, 44, 119, 70, 14, 96, 82, 250, 63, 6, 232, 38, - 89, 144, 102, 191, 82, 254, 20, 222, 96, 162, 110, 6, 159, 58, 200, 226, - 98, 128, 42, 70, 84, 247, 128, 211, 136, 54, 143, 166, 60, 118, 99, 218, - 27, 193, 85, 81, 219, 223, 46, 41, 23, 233, 152, 222, 36, 236, 54, 181, - 56, 50, 4, 207, 129, 92, 78, 88, 197, 251, 131, 105, 31, 172, 38, 131, - 19, 204, 129, 47, 227, 106, 202, 183, 23, 6, 77, 224, 102, 147, 11, 218, - 131, 132, 60, 192, 208, 223, 236, 23, 103, 115, 89, 18, 185, 171, 70, 174, - 139, 0, 100, 160, 221, 11, 228, 60, 12, 122, 114, 12, 157, 235, 148, 57, - 83, 62, 173, 131, 169, 126, 85, 99, 93, 243, 81, 80, 29, 245, 206, 82, - 236, 227, 166, 14, 230, 213, 144, 97, 27, 111, 99, 164, 105, 150, 89, 111, - 252, 118, 140, 232, 120, 183, 137, 213, 232, 157, 224, 33, 134, 118, 186, 80, - 159, 2, 186, 193, 54, 242, 25, 237, 232, 249, 226, 213, 90, 149, 90, 160, - 118, 69, 64, 37, 10, 183, 109, 246, 30, 52, 219, 69, 189, 26, 116, 220, - 50, 244, 243, 243, 139, 137, 232, 98, 38, 45, 256, 143, 171, 101, 73, 238, - 123, 45, 194, 167, 250, 123, 12, 29, 136, 237, 141, 21, 89, 96, 199, 44, - 8, 214, 208, 17, 113, 41, 137, 26, 166, 155, 89, 85, 54, 58, 97, 160, - 50, 239, 58, 71, 21, 157, 139, 12, 37, 198, 182, 131, 149, 134, 16, 204, - 164, 181, 248, 166, 52, 216, 136, 201, 37, 255, 187, 240, 5, 101, 147, 231, - 14, 163, 253, 134, 146, 216, 8, 54, 224, 90, 220, 195, 75, 215, 186, 58, - 71, 204, 124, 105, 239, 53, 16, 85, 69, 163, 195, 223, 33, 38, 69, 88, - 88, 203, 99, 55, 176, 13, 156, 204, 236, 99, 194, 134, 75, 247, 126, 129, - 160, 124, 233, 206, 139, 144, 154, 45, 233, 51, 206, 61, 60, 55, 205, 107, - 84, 108, 96, 188, 203, 31, 89, 20, 115, 144, 137, 90, 237, 78, 231, 185, - 120, 217, 1, 176, 169, 30, 155, 176, 100, 113, 53, 42, 193, 108, 14, 121, - 176, 158, 137, 92, 178, 44, 110, 249, 108, 234, 94, 101, 128, 12, 250, 173, - 72, 202, 232, 66, 139, 152, 189, 18, 32, 197, 9, 238, 246, 55, 119, 183, - 196, 119, 113, 247, 191, 100, 200, 245, 46, 16, 234, 112, 136, 116, 232, 48, - 176, 108, 11, 237, 14, 153, 93, 177, 124, 72, 67, 121, 135, 143, 45, 18, - 97, 251, 184, 172, 136, 55, 213, 8, 103, 12, 221, 212, 13, 160, 116, 91, - 237, 127, 218, 190, 103, 131, 77, 82, 36, 100, 22, 252, 79, 69, 54, 26, - 65, 182, 115, 142, 247, 20, 89, 81, 188, 244, 27, 120, 240, 248, 13, 230, - 67, 133, 32, 201, 129, 87, 9, 245, 66, 88, 166, 34, 46, 184, 119, 218, - 144, 235, 163, 40, 138, 134, 127, 217, 64, 227, 116, 67, 55, 202, 130, 48, - 199, 42, 251, 112, 124, 153, 123, 194, 243, 49, 250, 12, 78, 157, 167, 134, - 210, 73, 156, 102, 21, 88, 216, 123, 45, 11, 208, 18, 47, 187, 20, 43, - 3, 180, 124, 2, 136, 176, 77, 111, 138, 139, 91, 225, 126, 8, 74, 255, - 88, 192, 193, 239, 138, 204, 139, 194, 166, 130, 252, 184, 140, 168, 30, 177, - 121, 98, 131, 124, 69, 171, 75, 49, 184, 34, 76, 122, 202, 115, 184, 253, - 120, 182, 33, 251, 1, 74, 216, 217, 243, 168, 70, 162, 119, 158, 197, 198, - 61, 89, 7, 5, 54, 199, 211, 170, 23, 226, 44, 247, 165, 195, 7, 225, - 91, 23, 50, 15, 51, 208, 106, 94, 12, 31, 43, 112, 146, 139, 246, 182, - 113, 1, 97, 15, 66, 2, 51, 76, 164, 184, 237, 200, 218, 176, 72, 98, - 33, 135, 38, 147, 140, 229, 50, 94, 81, 187, 129, 17, 238, 168, 146, 203, - 181, 99, 164, 3, 104, 98, 255, 189, 114, 142, 86, 102, 229, 102, 80, 129, - 64, 84, 79, 161, 81, 156, 128, 111, 164, 197, 18, 15, 55, 
196, 198, 191, - 28, 113, 117, 96, 207, 253, 19, 158, 231, 13, 53, 130, 252, 211, 58, 180, - 212, 142, 7, 219, 38, 81, 62, 109, 167, 113, 33, 56, 97, 185, 157, 130, - 186, 129, 119, 182, 196, 26, 54, 110, 65, 170, 166, 236, 30, 22, 162, 0, - 106, 12, 248, 33, 48, 72, 159, 17, 76, 244, 172, 132, 89, 171, 196, 76, - 254, 166, 76, 218, 226, 3, 52, 220, 238, 181, 179, 144, 225, 23, 3, 166, - 158, 35, 228, 154, 204, 23, 203, 71, 134, 189, 18, 168, 236, 141, 117, 138, - 2, 132, 78, 57, 154, 21, 250, 196, 184, 40, 161, 40, 10, 178, 134, 120, - 132, 123, 101, 82, 205, 121, 55, 140, 231, 56, 231, 71, 206, 246, 198, 150, - 146, 192, 45, 105, 242, 1, 125, 18, 176, 46, 222, 122, 19, 80, 113, 133, - 131, 162, 81, 51, 98, 168, 247, 161, 139, 39, 63, 162, 22, 153, 170, 92, - 91, 130, 174, 200, 45, 112, 99, 164, 132, 184, 191, 186, 200, 167, 86, 145, - 167, 227, 130, 44, 12, 158, 172, 249, 204, 17, 54, 249, 16, 200, 21, 174, - 67, 223, 105, 201, 50, 36, 133, 203, 244, 131, 228, 67, 29, 195, 91, 91, - 55, 107, 167, 154, 170, 137, 218, 183, 169, 61, 99, 175, 128, 23, 142, 183, - 66, 255, 59, 187, 66, 85, 212, 109, 168, 82, 16, 43, 67, 139, 114, 176, - 216, 255, 130, 94, 152, 79, 183, 64, 100, 23, 214, 82, 34, 230, 48, 15, - 242, 130, 50, 241, 81, 32, 5, 125, 183, 182, 184, 99, 248, 109, 159, 210, - 226, 61, 119, 129, 39, 149, 78, 214, 107, 78, 147, 124, 228, 18, 143, 188, - 84, 180, 233, 119, 64, 39, 158, 133, 177, 168, 6, 150, 80, 117, 150, 56, - 49, 72, 49, 37, 30, 242, 49, 142, 33, 156, 34, 44, 44, 72, 58, 22, - 249, 46, 168, 80, 25, 196, 64, 174, 97, 179, 244, 134, 213, 105, 63, 151, - 21, 90, 168, 90, 245, 28, 157, 65, 250, 232, 188, 27, 99, 160, 156, 127, - 68, 193, 10, 80, 205, 36, 138, 229, 12, 223, 70, 169, 251, 41, 48, 94, - 41, 177, 99, 256, 158, 0, 6, 83, 231, 191, 120, 135, 157, 146, 218, 213, - 160, 7, 47, 234, 98, 211, 79, 225, 179, 95, 175, 105, 185, 79, 115, 0, - 104, 14, 65, 124, 15, 188, 52, 9, 253, 27, 132, 137, 13, 127, 75, 238, - 185, 253, 33, 8, 52, 157, 164, 68, 232, 188, 69, 28, 209, 233, 5, 129, - 216, 90, 252, 212, 33, 200, 222, 9, 112, 15, 43, 36, 226, 114, 15, 249, - 217, 8, 148, 22, 147, 23, 143, 67, 222, 116, 235, 250, 212, 210, 39, 142, - 108, 64, 209, 83, 73, 66, 99, 34, 17, 29, 45, 151, 244, 114, 28, 241, - 144, 208, 146, 179, 132, 89, 217, 198, 252, 219, 205, 165, 75, 107, 11, 173, - 76, 6, 196, 247, 152, 216, 248, 91, 209, 178, 57, 250, 174, 60, 79, 123, - 18, 135, 9, 241, 230, 159, 184, 68, 156, 251, 215, 9, 113, 234, 75, 235, - 103, 194, 205, 129, 230, 45, 96, 73, 157, 20, 200, 212, 212, 228, 161, 7, - 231, 228, 108, 43, 198, 87, 140, 140, 4, 182, 164, 3, 53, 104, 250, 213, - 85, 38, 89, 61, 52, 187, 35, 204, 86, 249, 100, 71, 248, 213, 163, 215, - 66, 106, 252, 129, 40, 111, 47, 24, 186, 221, 85, 205, 199, 237, 122, 181, - 32, 46, 182, 135, 33, 251, 142, 34, 208, 242, 128, 255, 4, 234, 15, 33, - 167, 222, 32, 186, 191, 34, 255, 244, 98, 240, 228, 204, 30, 142, 32, 70, - 69, 83, 110, 151, 10, 243, 141, 21, 223, 69, 61, 37, 59, 209, 102, 114, - 223, 33, 129, 254, 255, 103, 86, 247, 235, 72, 126, 177, 102, 226, 102, 30, - 149, 221, 62, 247, 251, 120, 163, 173, 57, 202, 204, 24, 39, 106, 120, 143, - 202, 176, 191, 147, 37, 38, 51, 133, 47, 245, 157, 132, 154, 71, 183, 111, - 30, 180, 18, 202, 82, 96, 170, 91, 157, 181, 212, 140, 256, 8, 196, 121, - 149, 79, 66, 127, 113, 78, 4, 197, 84, 256, 111, 222, 102, 63, 228, 104, - 136, 223, 67, 193, 93, 154, 249, 83, 204, 101, 200, 234, 84, 252, 230, 195, - 43, 140, 120, 242, 89, 63, 166, 233, 209, 94, 43, 170, 126, 5, 205, 78, - 
112, 80, 143, 151, 146, 248, 137, 203, 45, 183, 61, 1, 155, 8, 102, 59, - 68, 212, 230, 61, 254, 191, 128, 223, 176, 123, 229, 27, 146, 120, 96, 165, - 213, 12, 232, 40, 186, 225, 66, 105, 200, 195, 212, 110, 237, 238, 151, 19, - 12, 171, 150, 82, 7, 228, 79, 52, 15, 78, 62, 43, 21, 154, 114, 21, - 12, 212, 256, 232, 125, 127, 5, 51, 37, 252, 136, 13, 47, 195, 168, 191, - 231, 55, 57, 251, 214, 116, 15, 86, 210, 41, 249, 242, 119, 27, 250, 203, - 107, 69, 90, 43, 206, 154, 127, 54, 100, 78, 187, 54, 244, 177, 234, 167, - 202, 136, 209, 171, 69, 114, 133, 173, 26, 139, 78, 141, 128, 32, 124, 39, - 45, 218, 96, 68, 90, 44, 67, 62, 83, 190, 188, 256, 103, 42, 102, 64, - 249, 0, 141, 11, 61, 69, 70, 66, 233, 237, 29, 200, 251, 157, 71, 51, - 64, 133, 113, 76, 35, 125, 76, 137, 217, 145, 35, 69, 226, 180, 56, 249, - 156, 163, 176, 237, 81, 54, 85, 169, 115, 211, 129, 70, 248, 40, 252, 192, - 194, 101, 247, 8, 181, 124, 217, 191, 194, 93, 99, 127, 117, 177, 144, 151, - 228, 121, 32, 11, 89, 81, 26, 29, 183, 76, 249, 132, 179, 70, 34, 102, - 20, 66, 87, 63, 124, 205, 174, 177, 87, 219, 73, 218, 91, 87, 176, 72, - 15, 211, 47, 61, 251, 165, 39, 247, 146, 70, 150, 57, 1, 212, 36, 162, - 39, 38, 16, 216, 3, 50, 116, 200, 32, 234, 77, 181, 155, 19, 90, 188, - 36, 6, 254, 46, 46, 203, 25, 230, 181, 196, 4, 151, 225, 65, 122, 216, - 168, 86, 158, 131, 136, 16, 49, 102, 233, 64, 154, 88, 228, 52, 146, 69, - 93, 157, 243, 121, 70, 209, 126, 213, 88, 145, 236, 65, 70, 96, 204, 47, - 10, 200, 77, 8, 103, 150, 48, 153, 5, 37, 52, 235, 209, 31, 181, 126, - 83, 142, 224, 140, 6, 32, 200, 171, 160, 179, 115, 229, 75, 194, 208, 39, - 59, 223, 52, 247, 38, 197, 135, 1, 6, 189, 106, 114, 168, 5, 211, 222, - 44, 63, 90, 160, 116, 172, 170, 133, 125, 138, 39, 131, 23, 178, 10, 214, - 36, 93, 28, 59, 68, 17, 123, 25, 255, 184, 204, 102, 194, 214, 129, 94, - 159, 245, 112, 141, 62, 11, 61, 197, 124, 221, 205, 11, 79, 71, 201, 54, - 58, 150, 29, 121, 87, 46, 240, 201, 68, 20, 194, 209, 47, 152, 158, 174, - 193, 164, 120, 255, 216, 165, 247, 58, 85, 130, 220, 23, 122, 223, 188, 98, - 21, 70, 72, 170, 150, 237, 76, 143, 112, 238, 206, 146, 215, 110, 4, 250, - 68, 44, 174, 177, 30, 98, 143, 241, 180, 127, 113, 48, 0, 1, 179, 199, - 59, 106, 201, 114, 29, 86, 173, 133, 217, 44, 200, 141, 107, 172, 16, 60, - 82, 58, 239, 94, 141, 234, 186, 235, 109, 173, 249, 139, 141, 59, 100, 248, - 84, 144, 49, 160, 51, 207, 164, 103, 74, 97, 146, 202, 193, 125, 168, 134, - 236, 111, 135, 121, 59, 145, 168, 200, 181, 173, 109, 2, 255, 6, 9, 245, - 90, 202, 214, 143, 121, 65, 85, 232, 132, 77, 228, 84, 26, 54, 184, 15, - 161, 29, 177, 79, 43, 0, 156, 184, 163, 165, 62, 90, 179, 93, 45, 239, - 1, 16, 120, 189, 127, 47, 74, 166, 20, 214, 233, 226, 89, 217, 229, 26, - 156, 53, 162, 60, 21, 3, 192, 72, 111, 51, 53, 101, 181, 208, 88, 82, - 179, 160, 219, 113, 240, 108, 43, 224, 162, 147, 62, 14, 95, 81, 205, 4, - 160, 177, 225, 115, 29, 69, 235, 168, 148, 29, 128, 114, 124, 129, 172, 165, - 215, 231, 214, 86, 160, 44, 157, 91, 248, 183, 73, 164, 56, 181, 162, 92, - 141, 118, 127, 240, 196, 77, 0, 9, 244, 79, 250, 100, 195, 25, 255, 85, - 94, 35, 212, 137, 107, 34, 110, 20, 200, 104, 17, 32, 231, 43, 150, 159, - 231, 216, 223, 190, 226, 109, 162, 197, 87, 92, 224, 11, 111, 73, 60, 225, - 238, 73, 246, 169, 19, 217, 119, 38, 121, 118, 70, 82, 99, 241, 110, 67, - 31, 76, 146, 215, 124, 240, 31, 103, 139, 224, 75, 160, 31, 78, 93, 4, - 64, 9, 103, 223, 6, 227, 119, 85, 116, 81, 21, 43, 46, 206, 234, 132, - 85, 99, 22, 131, 135, 97, 86, 13, 234, 
188, 21, 14, 89, 169, 207, 238, - 219, 177, 190, 72, 157, 41, 114, 140, 92, 141, 186, 1, 63, 107, 225, 184, - 118, 150, 153, 254, 241, 106, 120, 210, 104, 144, 151, 161, 88, 206, 125, 164, - 15, 211, 173, 49, 146, 241, 71, 36, 58, 201, 46, 27, 33, 187, 91, 162, - 117, 19, 210, 213, 187, 97, 193, 50, 190, 114, 217, 60, 61, 167, 207, 213, - 213, 53, 135, 34, 156, 91, 115, 119, 46, 99, 242, 1, 90, 52, 198, 227, - 201, 91, 216, 146, 210, 82, 121, 38, 73, 133, 182, 193, 132, 148, 246, 75, - 109, 157, 179, 113, 176, 134, 205, 159, 148, 58, 103, 171, 132, 156, 133, 147, - 161, 231, 39, 100, 175, 97, 125, 28, 183, 129, 135, 191, 202, 181, 29, 218, - 43, 104, 148, 203, 189, 204, 4, 182, 169, 1, 134, 122, 141, 202, 13, 187, - 177, 112, 162, 35, 231, 6, 8, 241, 99, 6, 191, 45, 113, 113, 101, 104}; - -// The S-Box we use for further linearity breaking. -// We created it by taking the digits of decimal expansion of e. -// The code that created it can be found in 'ProduceRandomSBox.c'. -unsigned char SBox[256] = { -//0 1 2 3 4 5 6 7 8 9 A B C D E F -0x7d, 0xd1, 0x70, 0x0b, 0xfa, 0x39, 0x18, 0xc3, 0xf3, 0xbb, 0xa7, 0xd4, 0x84, 0x25, 0x3b, 0x3c, // 0 -0x2c, 0x15, 0x69, 0x9a, 0xf9, 0x27, 0xfb, 0x02, 0x52, 0xba, 0xa8, 0x4b, 0x20, 0xb5, 0x8b, 0x3a, // 1 -0x88, 0x8e, 0x26, 0xcb, 0x71, 0x5e, 0xaf, 0xad, 0x0c, 0xac, 0xa1, 0x93, 0xc6, 0x78, 0xce, 0xfc, // 2 -0x2a, 0x76, 0x17, 0x1f, 0x62, 0xc2, 0x2e, 0x99, 0x11, 0x37, 0x65, 0x40, 0xfd, 0xa0, 0x03, 0xc1, // 3 -0xca, 0x48, 0xe2, 0x9b, 0x81, 0xe4, 0x1c, 0x01, 0xec, 0x68, 0x7a, 0x5a, 0x50, 0xf8, 0x0e, 0xa3, // 4 -0xe8, 0x61, 0x2b, 0xa2, 0xeb, 0xcf, 0x8c, 0x3d, 0xb4, 0x95, 0x13, 0x08, 0x46, 0xab, 0x91, 0x7b, // 5 -0xea, 0x55, 0x67, 0x9d, 0xdd, 0x29, 0x6a, 0x8f, 0x9f, 0x22, 0x4e, 0xf2, 0x57, 0xd2, 0xa9, 0xbd, // 6 -0x38, 0x16, 0x5f, 0x4c, 0xf7, 0x9e, 0x1b, 0x2f, 0x30, 0xc7, 0x41, 0x24, 0x5c, 0xbf, 0x05, 0xf6, // 7 -0x0a, 0x31, 0xa5, 0x45, 0x21, 0x33, 0x6b, 0x6d, 0x6c, 0x86, 0xe1, 0xa4, 0xe6, 0x92, 0x9c, 0xdf, // 8 -0xe7, 0xbe, 0x28, 0xe3, 0xfe, 0x06, 0x4d, 0x98, 0x80, 0x04, 0x96, 0x36, 0x3e, 0x14, 0x4a, 0x34, // 9 -0xd3, 0xd5, 0xdb, 0x44, 0xcd, 0xf5, 0x54, 0xdc, 0x89, 0x09, 0x90, 0x42, 0x87, 0xff, 0x7e, 0x56, // A -0x5d, 0x59, 0xd7, 0x23, 0x75, 0x19, 0x97, 0x73, 0x83, 0x64, 0x53, 0xa6, 0x1e, 0xd8, 0xb0, 0x49, // B -0x3f, 0xef, 0xbc, 0x7f, 0x43, 0xf0, 0xc9, 0x72, 0x0f, 0x63, 0x79, 0x2d, 0xc0, 0xda, 0x66, 0xc8, // C -0x32, 0xde, 0x47, 0x07, 0xb8, 0xe9, 0x1d, 0xc4, 0x85, 0x74, 0x82, 0xcc, 0x60, 0x51, 0x77, 0x0d, // D -0xaa, 0x35, 0xed, 0x58, 0x7c, 0x5b, 0xb9, 0x94, 0x6e, 0x8d, 0xb1, 0xc5, 0xb7, 0xee, 0xb6, 0xae, // E -0x10, 0xe0, 0xd6, 0xd9, 0xe5, 0x4f, 0xf1, 0x12, 0x00, 0xd0, 0xf4, 0x1a, 0x6f, 0x8a, 0xb3, 0xb2 }; // F - -/////////////////////////////////////////////////////////////////////////////////////////////// -// -// Helper functions definition portion. -// -/////////////////////////////////////////////////////////////////////////////////////////////// - -// Translates an input array with values in base 257 to output array with values in base 256. -// Returns the carry bit. -// -// Parameters: -// - input: the input array of size EIGHTH_N. Each value in the array is a number in Z_257. -// The MSB is assumed to be the last one in the array. -// - output: the input array encoded in base 256. -// -// Returns: -// - The carry bit (MSB). -swift_int16_t TranslateToBase256(swift_int32_t input[EIGHTH_N], unsigned char output[EIGHTH_N]); - -// Translates an input integer into the range (-FIELD_SIZE / 2) <= result <= (FIELD_SIZE / 2). 
-// -// Parameters: -// - x: the input integer. -// -// Returns: -// - The result, which equals (x MOD FIELD_SIZE), such that |result| <= (FIELD_SIZE / 2). -int Center(int x); - -// Calculates bit reversal permutation. -// -// Parameters: -// - input: the input to reverse. -// - numOfBits: the number of bits in the input to reverse. -// -// Returns: -// - The resulting number, which is obtained from the input by reversing its bits. -int ReverseBits(int input, int numOfBits); - -// Initializes the FFT fast lookup table. -// Shall be called only once. -void InitializeSWIFFTX(); - -// Calculates the FFT. -// -// Parameters: -// - input: the input to the FFT. -// - output: the resulting output. -void FFT(const unsigned char input[EIGHTH_N], swift_int32_t *output); - -/////////////////////////////////////////////////////////////////////////////////////////////// -// Helper functions implementation portion. -/////////////////////////////////////////////////////////////////////////////////////////////// - -swift_int16_t TranslateToBase256(swift_int32_t input[EIGHTH_N], unsigned char output[EIGHTH_N]) -{ - swift_int32_t pairs[EIGHTH_N / 2]; - int i; - - for (i = 0; i < EIGHTH_N; i += 2) - { - // input[i] + 257 * input[i + 1] - pairs[i >> 1] = input[i] + input[i + 1] + (input[i + 1] << 8); - } - - for (i = (EIGHTH_N / 2) - 1; i > 0; --i) - { - int j; - - for (j = i - 1; j < (EIGHTH_N / 2) - 1; ++j) - { - // pairs[j + 1] * 513, because 257^2 = 513 % 256^2. - register swift_int32_t temp = pairs[j] + pairs[j + 1] + (pairs[j + 1] << 9); - pairs[j] = temp & 0xffff; - pairs[j + 1] += (temp >> 16); - } - } - - for (i = 0; i < EIGHTH_N; i += 2) - { - output[i] = (unsigned char) (pairs[i >> 1] & 0xff); - output[i + 1] = (unsigned char) ((pairs[i >> 1] >> 8) & 0xff); - } - - return (pairs[EIGHTH_N/2 - 1] >> 16); -} - -int Center(int x) -{ - int result = x % FIELD_SIZE; - - if (result > (FIELD_SIZE / 2)) - result -= FIELD_SIZE; - - if (result < (FIELD_SIZE / -2)) - result += FIELD_SIZE; - - return result; -} - -int ReverseBits(int input, int numOfBits) -{ - register int reversed = 0; - - for (input |= numOfBits; input > 1; input >>= 1) - reversed = (reversed << 1) | (input & 1); - - return reversed; -} - -void InitializeSWIFFTX() -{ - int i, j, k, x; - // The powers of OMEGA - int omegaPowers[2 * N]; - omegaPowers[0] = 1; - - if (wasSetupDone) - return; - - for (i = 1; i < (2 * N); ++i) - { - omegaPowers[i] = Center(omegaPowers[i - 1] * OMEGA); - } - - for (i = 0; i < (N / W); ++i) - { - for (j = 0; j < W; ++j) - { - multipliers[(i << 3) + j] = omegaPowers[ReverseBits(i, N / W) * (2 * j + 1)]; - } - } - - for (x = 0; x < 256; ++x) - { - for (j = 0; j < 8; ++j) - { - register int temp = 0; - for (k = 0; k < 8; ++k) - { - temp += omegaPowers[(EIGHTH_N * (2 * j + 1) * ReverseBits(k, W)) % (2 * N)] - * ((x >> k) & 1); - } - - fftTable[(x << 3) + j] = Center(temp); - } - } - - wasSetupDone = true; -} - -void FFT(const unsigned char input[EIGHTH_N], swift_int32_t *output) -{ - register swift_int16_t *mult = multipliers; - register swift_int32_t F0, F1, F2, F3, F4, F5, F6, F7, F8, F9, - F10, F11, F12, F13, F14, F15, F16, F17, F18, F19, - F20, F21, F22, F23, F24, F25, F26, F27, F28, F29, - F30, F31, F32, F33, F34, F35, F36, F37, F38, F39, - F40, F41, F42, F43, F44, F45, F46, F47, F48, F49, - F50, F51, F52, F53, F54, F55, F56, F57, F58, F59, - F60, F61, F62, F63; - - // First loop unrolling: - register swift_int16_t *table = &(fftTable[input[0] << 3]); - - F0 = mult[0] * table[0]; - F8 = mult[1] * table[1]; - F16 = 
mult[2] * table[2]; - F24 = mult[3] * table[3]; - F32 = mult[4] * table[4]; - F40 = mult[5] * table[5]; - F48 = mult[6] * table[6]; - F56 = mult[7] * table[7]; - - mult += 8; - table = &(fftTable[input[1] << 3]); - - F1 = mult[0] * table[0]; - F9 = mult[1] * table[1]; - F17 = mult[2] * table[2]; - F25 = mult[3] * table[3]; - F33 = mult[4] * table[4]; - F41 = mult[5] * table[5]; - F49 = mult[6] * table[6]; - F57 = mult[7] * table[7]; - - mult += 8; - table = &(fftTable[input[2] << 3]); - - F2 = mult[0] * table[0]; - F10 = mult[1] * table[1]; - F18 = mult[2] * table[2]; - F26 = mult[3] * table[3]; - F34 = mult[4] * table[4]; - F42 = mult[5] * table[5]; - F50 = mult[6] * table[6]; - F58 = mult[7] * table[7]; - - mult += 8; - table = &(fftTable[input[3] << 3]); - - F3 = mult[0] * table[0]; - F11 = mult[1] * table[1]; - F19 = mult[2] * table[2]; - F27 = mult[3] * table[3]; - F35 = mult[4] * table[4]; - F43 = mult[5] * table[5]; - F51 = mult[6] * table[6]; - F59 = mult[7] * table[7]; - - mult += 8; - table = &(fftTable[input[4] << 3]); - - F4 = mult[0] * table[0]; - F12 = mult[1] * table[1]; - F20 = mult[2] * table[2]; - F28 = mult[3] * table[3]; - F36 = mult[4] * table[4]; - F44 = mult[5] * table[5]; - F52 = mult[6] * table[6]; - F60 = mult[7] * table[7]; - - mult += 8; - table = &(fftTable[input[5] << 3]); - - F5 = mult[0] * table[0]; - F13 = mult[1] * table[1]; - F21 = mult[2] * table[2]; - F29 = mult[3] * table[3]; - F37 = mult[4] * table[4]; - F45 = mult[5] * table[5]; - F53 = mult[6] * table[6]; - F61 = mult[7] * table[7]; - - mult += 8; - table = &(fftTable[input[6] << 3]); - - F6 = mult[0] * table[0]; - F14 = mult[1] * table[1]; - F22 = mult[2] * table[2]; - F30 = mult[3] * table[3]; - F38 = mult[4] * table[4]; - F46 = mult[5] * table[5]; - F54 = mult[6] * table[6]; - F62 = mult[7] * table[7]; - - mult += 8; - table = &(fftTable[input[7] << 3]); - - F7 = mult[0] * table[0]; - F15 = mult[1] * table[1]; - F23 = mult[2] * table[2]; - F31 = mult[3] * table[3]; - F39 = mult[4] * table[4]; - F47 = mult[5] * table[5]; - F55 = mult[6] * table[6]; - F63 = mult[7] * table[7]; - - // Second loop unrolling: - // Iteration 0: - ADD_SUB(F0, F1); - ADD_SUB(F2, F3); - ADD_SUB(F4, F5); - ADD_SUB(F6, F7); - - F3 <<= 4; - F7 <<= 4; - - ADD_SUB(F0, F2); - ADD_SUB(F1, F3); - ADD_SUB(F4, F6); - ADD_SUB(F5, F7); - - F5 <<= 2; - F6 <<= 4; - F7 <<= 6; - - ADD_SUB(F0, F4); - ADD_SUB(F1, F5); - ADD_SUB(F2, F6); - ADD_SUB(F3, F7); - - output[0] = Q_REDUCE(F0); - output[8] = Q_REDUCE(F1); - output[16] = Q_REDUCE(F2); - output[24] = Q_REDUCE(F3); - output[32] = Q_REDUCE(F4); - output[40] = Q_REDUCE(F5); - output[48] = Q_REDUCE(F6); - output[56] = Q_REDUCE(F7); - - // Iteration 1: - ADD_SUB(F8, F9); - ADD_SUB(F10, F11); - ADD_SUB(F12, F13); - ADD_SUB(F14, F15); - - F11 <<= 4; - F15 <<= 4; - - ADD_SUB(F8, F10); - ADD_SUB(F9, F11); - ADD_SUB(F12, F14); - ADD_SUB(F13, F15); - - F13 <<= 2; - F14 <<= 4; - F15 <<= 6; - - ADD_SUB(F8, F12); - ADD_SUB(F9, F13); - ADD_SUB(F10, F14); - ADD_SUB(F11, F15); - - output[1] = Q_REDUCE(F8); - output[9] = Q_REDUCE(F9); - output[17] = Q_REDUCE(F10); - output[25] = Q_REDUCE(F11); - output[33] = Q_REDUCE(F12); - output[41] = Q_REDUCE(F13); - output[49] = Q_REDUCE(F14); - output[57] = Q_REDUCE(F15); - - // Iteration 2: - ADD_SUB(F16, F17); - ADD_SUB(F18, F19); - ADD_SUB(F20, F21); - ADD_SUB(F22, F23); - - F19 <<= 4; - F23 <<= 4; - - ADD_SUB(F16, F18); - ADD_SUB(F17, F19); - ADD_SUB(F20, F22); - ADD_SUB(F21, F23); - - F21 <<= 2; - F22 <<= 4; - F23 <<= 6; - - ADD_SUB(F16, F20); - ADD_SUB(F17, 
F21); - ADD_SUB(F18, F22); - ADD_SUB(F19, F23); - - output[2] = Q_REDUCE(F16); - output[10] = Q_REDUCE(F17); - output[18] = Q_REDUCE(F18); - output[26] = Q_REDUCE(F19); - output[34] = Q_REDUCE(F20); - output[42] = Q_REDUCE(F21); - output[50] = Q_REDUCE(F22); - output[58] = Q_REDUCE(F23); - - // Iteration 3: - ADD_SUB(F24, F25); - ADD_SUB(F26, F27); - ADD_SUB(F28, F29); - ADD_SUB(F30, F31); - - F27 <<= 4; - F31 <<= 4; - - ADD_SUB(F24, F26); - ADD_SUB(F25, F27); - ADD_SUB(F28, F30); - ADD_SUB(F29, F31); - - F29 <<= 2; - F30 <<= 4; - F31 <<= 6; - - ADD_SUB(F24, F28); - ADD_SUB(F25, F29); - ADD_SUB(F26, F30); - ADD_SUB(F27, F31); - - output[3] = Q_REDUCE(F24); - output[11] = Q_REDUCE(F25); - output[19] = Q_REDUCE(F26); - output[27] = Q_REDUCE(F27); - output[35] = Q_REDUCE(F28); - output[43] = Q_REDUCE(F29); - output[51] = Q_REDUCE(F30); - output[59] = Q_REDUCE(F31); - - // Iteration 4: - ADD_SUB(F32, F33); - ADD_SUB(F34, F35); - ADD_SUB(F36, F37); - ADD_SUB(F38, F39); - - F35 <<= 4; - F39 <<= 4; - - ADD_SUB(F32, F34); - ADD_SUB(F33, F35); - ADD_SUB(F36, F38); - ADD_SUB(F37, F39); - - F37 <<= 2; - F38 <<= 4; - F39 <<= 6; - - ADD_SUB(F32, F36); - ADD_SUB(F33, F37); - ADD_SUB(F34, F38); - ADD_SUB(F35, F39); - - output[4] = Q_REDUCE(F32); - output[12] = Q_REDUCE(F33); - output[20] = Q_REDUCE(F34); - output[28] = Q_REDUCE(F35); - output[36] = Q_REDUCE(F36); - output[44] = Q_REDUCE(F37); - output[52] = Q_REDUCE(F38); - output[60] = Q_REDUCE(F39); - - // Iteration 5: - ADD_SUB(F40, F41); - ADD_SUB(F42, F43); - ADD_SUB(F44, F45); - ADD_SUB(F46, F47); - - F43 <<= 4; - F47 <<= 4; - - ADD_SUB(F40, F42); - ADD_SUB(F41, F43); - ADD_SUB(F44, F46); - ADD_SUB(F45, F47); - - F45 <<= 2; - F46 <<= 4; - F47 <<= 6; - - ADD_SUB(F40, F44); - ADD_SUB(F41, F45); - ADD_SUB(F42, F46); - ADD_SUB(F43, F47); - - output[5] = Q_REDUCE(F40); - output[13] = Q_REDUCE(F41); - output[21] = Q_REDUCE(F42); - output[29] = Q_REDUCE(F43); - output[37] = Q_REDUCE(F44); - output[45] = Q_REDUCE(F45); - output[53] = Q_REDUCE(F46); - output[61] = Q_REDUCE(F47); - - // Iteration 6: - ADD_SUB(F48, F49); - ADD_SUB(F50, F51); - ADD_SUB(F52, F53); - ADD_SUB(F54, F55); - - F51 <<= 4; - F55 <<= 4; - - ADD_SUB(F48, F50); - ADD_SUB(F49, F51); - ADD_SUB(F52, F54); - ADD_SUB(F53, F55); - - F53 <<= 2; - F54 <<= 4; - F55 <<= 6; - - ADD_SUB(F48, F52); - ADD_SUB(F49, F53); - ADD_SUB(F50, F54); - ADD_SUB(F51, F55); - - output[6] = Q_REDUCE(F48); - output[14] = Q_REDUCE(F49); - output[22] = Q_REDUCE(F50); - output[30] = Q_REDUCE(F51); - output[38] = Q_REDUCE(F52); - output[46] = Q_REDUCE(F53); - output[54] = Q_REDUCE(F54); - output[62] = Q_REDUCE(F55); - - // Iteration 7: - ADD_SUB(F56, F57); - ADD_SUB(F58, F59); - ADD_SUB(F60, F61); - ADD_SUB(F62, F63); - - F59 <<= 4; - F63 <<= 4; - - ADD_SUB(F56, F58); - ADD_SUB(F57, F59); - ADD_SUB(F60, F62); - ADD_SUB(F61, F63); - - F61 <<= 2; - F62 <<= 4; - F63 <<= 6; - - ADD_SUB(F56, F60); - ADD_SUB(F57, F61); - ADD_SUB(F58, F62); - ADD_SUB(F59, F63); - - output[7] = Q_REDUCE(F56); - output[15] = Q_REDUCE(F57); - output[23] = Q_REDUCE(F58); - output[31] = Q_REDUCE(F59); - output[39] = Q_REDUCE(F60); - output[47] = Q_REDUCE(F61); - output[55] = Q_REDUCE(F62); - output[63] = Q_REDUCE(F63); -} - -// Calculates the FFT part of SWIFFT. -// We divided the SWIFFT calculation into two, because that way we could save 2 computations of -// the FFT part, since in the first stage of SWIFFTX the difference between the first 3 SWIFFTs -// is only the A's part. -// -// Parameters: -// - input: the input to FFT. 
-// - m: the input size divided by 8. The function performs m FFTs. -// - output: will store the result. -void SWIFFTFFT(const unsigned char *input, int m, swift_int32_t *output) -{ - int i; - - for (i = 0; - i < m; - i++, input += EIGHTH_N, output += N) - { - FFT(input, output); - } -} - -// Calculates the 'sum' part of SWIFFT, including the base change at the end. -// We divided the SWIFFT calculation into two, because that way we could save 2 computations of -// the FFT part, since in the first stage of SWIFFTX the difference between the first 3 SWIFFTs -// is only the A's part. -// -// Parameters: -// - input: the input. Of size 64 * m. -// - m: the input size divided by 64. -// - output: will store the result. -// - a: the coefficients in the sum. Of size 64 * m. -void SWIFFTSum(const swift_int32_t *input, int m, unsigned char *output, const swift_int16_t *a) -{ - int i, j; - swift_int32_t result[N]; - register swift_int16_t carry = 0; - - for (j = 0; j < N; ++j) - { - register swift_int32_t sum = 0; - const register swift_int32_t *f = input + j; - const register swift_int16_t *k = a + j; - - for (i = 0; i < m; i++, f += N,k += N) - { - sum += (*f) * (*k); - } - - result[j] = sum; - } - - for (j = 0; j < N; ++j) - { - result[j] = ((FIELD_SIZE << 22) + result[j]) % FIELD_SIZE; - } - - for (j = 0; j < 8; ++j) - { - int register carryBit = TranslateToBase256(result + (j << 3), output + (j << 3)); - carry |= carryBit << j; - } - - output[N] = carry; -} - -void ComputeSingleSWIFFTX(unsigned char input[SWIFFTX_INPUT_BLOCK_SIZE], - unsigned char output[SWIFFTX_OUTPUT_BLOCK_SIZE], - bool doSmooth) -{ - int i; - // Will store the result of the FFT parts: - swift_int32_t fftOut[N * M]; - unsigned char intermediate[N * 3 + 8]; - unsigned char carry0,carry1,carry2; - - // Do the three SWIFFTS while remembering the three carry bytes (each carry byte gets - // overriden by the following SWIFFT): - - // 1. Compute the FFT of the input - the common part for the first 3 SWIFFTs: - SWIFFTFFT(input, M, fftOut); - - // 2. Compute the sums of the 3 SWIFFTs, each using a different set of coefficients: - - // 2a. The first SWIFFT: - SWIFFTSum(fftOut, M, intermediate, As); - // Remember the carry byte: - carry0 = intermediate[N]; - - // 2b. The second one: - SWIFFTSum(fftOut, M, intermediate + N, As + (M * N)); - carry1 = intermediate[2 * N]; - - // 2c. The third one: - SWIFFTSum(fftOut, M, intermediate + (2 * N), As + 2 * (M * N)); - carry2 = intermediate[3 * N]; - - //2d. Put three carry bytes in their place - intermediate[3 * N] = carry0; - intermediate[(3 * N) + 1] = carry1; - intermediate[(3 * N) + 2] = carry2; - - // Padding intermediate output with 5 zeroes. - memset(intermediate + (3 * N) + 3, 0, 5); - - // Apply the S-Box: - for (i = 0; i < (3 * N) + 8; ++i) - { - intermediate[i] = SBox[intermediate[i]]; - } - - // 3. 
The final and last SWIFFT: - SWIFFTFFT(intermediate, 3 * (N/8) + 1, fftOut); - SWIFFTSum(fftOut, 3 * (N/8) + 1, output, As); - - if (doSmooth) - { - unsigned char sum[N]; - register int i, j; - memset(sum, 0, N); - - for (i = 0; i < (N + 1) * 8; ++i) - { - register const swift_int16_t *AsRow; - register int AShift; - - if (!(output[i >> 3] & (1 << (i & 7)))) - { - continue; - } - - AsRow = As + N * M + (i & ~(N - 1)) ; - AShift = i & 63; - - for (j = AShift; j < N; ++j) - { - sum[j] += AsRow[j - AShift]; - } - - for(j = 0; j < AShift; ++j) - { - sum[j] -= AsRow[N - AShift + j]; - } - } - - for (i = 0; i < N; ++i) - { - output[i] = sum[i]; - } - - output[N] = 0; - } -} diff --git a/configure b/configure index e18473f6..5ae117c0 100755 --- a/configure +++ b/configure @@ -1,6 +1,6 @@ #! /bin/sh # Guess values for system-dependent variables and create Makefiles. -# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.15.6. +# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.15.7. # # # Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc. @@ -577,8 +577,8 @@ MAKEFLAGS= # Identity of this package. PACKAGE_NAME='cpuminer-opt' PACKAGE_TARNAME='cpuminer-opt' -PACKAGE_VERSION='3.15.6' -PACKAGE_STRING='cpuminer-opt 3.15.6' +PACKAGE_VERSION='3.15.7' +PACKAGE_STRING='cpuminer-opt 3.15.7' PACKAGE_BUGREPORT='' PACKAGE_URL='' @@ -1332,7 +1332,7 @@ if test "$ac_init_help" = "long"; then # Omit some internal or obsolete options to make the list less imposing. # This message is too long to be a string in the A/UX 3.1 sh. cat <<_ACEOF -\`configure' configures cpuminer-opt 3.15.6 to adapt to many kinds of systems. +\`configure' configures cpuminer-opt 3.15.7 to adapt to many kinds of systems. Usage: $0 [OPTION]... [VAR=VALUE]... @@ -1404,7 +1404,7 @@ fi if test -n "$ac_init_help"; then case $ac_init_help in - short | recursive ) echo "Configuration of cpuminer-opt 3.15.6:";; + short | recursive ) echo "Configuration of cpuminer-opt 3.15.7:";; esac cat <<\_ACEOF @@ -1509,7 +1509,7 @@ fi test -n "$ac_init_help" && exit $ac_status if $ac_init_version; then cat <<\_ACEOF -cpuminer-opt configure 3.15.6 +cpuminer-opt configure 3.15.7 generated by GNU Autoconf 2.69 Copyright (C) 2012 Free Software Foundation, Inc. @@ -2012,7 +2012,7 @@ cat >config.log <<_ACEOF This file contains any messages produced by compilers while running configure, to aid debugging if configure makes a mistake. -It was created by cpuminer-opt $as_me 3.15.6, which was +It was created by cpuminer-opt $as_me 3.15.7, which was generated by GNU Autoconf 2.69. Invocation command line was $ $0 $@ @@ -2993,7 +2993,7 @@ fi # Define the identity of the package. PACKAGE='cpuminer-opt' - VERSION='3.15.6' + VERSION='3.15.7' cat >>confdefs.h <<_ACEOF @@ -6690,7 +6690,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 # report actual input values of CONFIG_FILES etc. instead of their # values after options handling. ac_log=" -This file was extended by cpuminer-opt $as_me 3.15.6, which was +This file was extended by cpuminer-opt $as_me 3.15.7, which was generated by GNU Autoconf 2.69. 
Invocation command line was CONFIG_FILES = $CONFIG_FILES @@ -6756,7 +6756,7 @@ _ACEOF cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`" ac_cs_version="\\ -cpuminer-opt config.status 3.15.6 +cpuminer-opt config.status 3.15.7 configured by $0, generated by GNU Autoconf 2.69, with options \\"\$ac_cs_config\\" diff --git a/configure.ac b/configure.ac index 95d825e3..bbe7a18b 100644 --- a/configure.ac +++ b/configure.ac @@ -1,4 +1,4 @@ -AC_INIT([cpuminer-opt], [3.15.6]) +AC_INIT([cpuminer-opt], [3.15.7]) AC_PREREQ([2.59c]) AC_CANONICAL_SYSTEM diff --git a/cpu-miner.c b/cpu-miner.c index 254f38f8..fe2aed0e 100644 --- a/cpu-miner.c +++ b/cpu-miner.c @@ -204,6 +204,7 @@ static double lowest_share = 9e99; // lowest accepted share diff static double last_targetdiff = 0.; #if !(defined(__WINDOWS__) || defined(_WIN64) || defined(_WIN32)) static uint32_t hi_temp = 0; +static uint32_t prev_temp = 0; #endif @@ -998,32 +999,67 @@ static struct timeval last_submit_time = {0}; static inline int stats_ptr_incr( int p ) { - return ++p < s_stats_size ? p : 0; + return ++p % s_stats_size; } void report_summary_log( bool force ) { struct timeval now, et, uptime, start_time; - pthread_mutex_lock( &stats_lock ); - gettimeofday( &now, NULL ); timeval_subtract( &et, &now, &five_min_start ); - if ( !( force && ( submit_sum || ( et.tv_sec > 5 ) ) ) - && ( et.tv_sec < 300 ) ) +#if !(defined(__WINDOWS__) || defined(_WIN64) || defined(_WIN32)) + + // Display CPU temperature and clock rate. + int curr_temp = cpu_temp(0); + static struct timeval cpu_temp_time = {0}; + struct timeval diff; + + if ( !opt_quiet || ( curr_temp >= 80 ) ) { - pthread_mutex_unlock( &stats_lock ); - return; + int wait_time = curr_temp >= 90 ? 5 : curr_temp >= 80 ? 30 : + curr_temp >= 70 ? 60 : 120; + timeval_subtract( &diff, &now, &cpu_temp_time ); + if ( ( diff.tv_sec > wait_time ) + || ( ( curr_temp > prev_temp ) && ( curr_temp >= 75 ) ) ) + { + char tempstr[32]; + float lo_freq = 0., hi_freq = 0.; + + memcpy( &cpu_temp_time, &now, sizeof(cpu_temp_time) ); + linux_cpu_hilo_freq( &lo_freq, &hi_freq ); + if ( use_colors && ( curr_temp >= 70 ) ) + { + if ( curr_temp >= 80 ) + sprintf( tempstr, "%s%d C%s", CL_RED, curr_temp, CL_WHT ); + else + sprintf( tempstr, "%s%d C%s", CL_YLW, curr_temp, CL_WHT ); + } + else + sprintf( tempstr, "%d C", curr_temp ); + + applog( LOG_NOTICE,"CPU temp: curr %s max %d, Freq: %.3f/%.3f GHz", + tempstr, hi_temp, lo_freq / 1e6, hi_freq / 1e6 ); + if ( curr_temp > hi_temp ) hi_temp = curr_temp; + prev_temp = curr_temp; + } } + +#endif + + if ( !( force && ( submit_sum || ( et.tv_sec > 5 ) ) ) + && ( et.tv_sec < 300 ) ) + return; // collect and reset periodic counters + pthread_mutex_lock( &stats_lock ); + uint64_t submits = submit_sum; submit_sum = 0; uint64_t accepts = accept_sum; accept_sum = 0; uint64_t rejects = reject_sum; reject_sum = 0; uint64_t stales = stale_sum; stale_sum = 0; uint64_t solved = solved_sum; solved_sum = 0; - memcpy( &start_time, &five_min_start, sizeof start_time ); memcpy( &five_min_start, &now, sizeof now ); @@ -1080,27 +1116,38 @@ void report_summary_log( bool force ) applog2( LOG_INFO,"Submitted %6d %6d", submits, submitted_share_count ); - applog2( LOG_INFO,"Accepted %6d %6d", - accepts, accepted_share_count ); + applog2( LOG_INFO,"Accepted %6d %6d %5.1f%%", + accepts, accepted_share_count, + 100. 
* accepted_share_count / submitted_share_count ); if ( stale_share_count ) - applog2( LOG_INFO,"Stale %6d %6d", - stales, stale_share_count ); + applog2( LOG_INFO,"Stale %6d %6d %5.1f%%", + stales, stale_share_count, + 100. * stale_share_count / submitted_share_count ); if ( rejected_share_count ) - applog2( LOG_INFO,"Rejected %6d %6d", - rejects, rejected_share_count ); + applog2( LOG_INFO,"Rejected %6d %6d %5.1f%%", + rejects, rejected_share_count, + 100. * rejected_share_count / submitted_share_count ); if ( solved_block_count ) applog2( LOG_INFO,"Blocks Solved %6d %6d", solved, solved_block_count ); applog2( LOG_INFO, "Hi/Lo Share Diff %.5g / %.5g", highest_share, lowest_share ); -} -bool lowdiff_debug = false; + static int64_t no_acks = 0; + if ( no_acks ) + { + no_acks = submitted_share_count + - ( accepted_share_count + stale_share_count + rejected_share_count ); + if ( no_acks ) // 2 consecutive cycles non zero + applog(LOG_WARNING,"Share count mismatch: %d, stats may be incorrect", + no_acks ); + } +} static int share_result( int result, struct work *work, const char *reason ) { - double share_time = 0.; //, share_ratio = 0.; + double share_time = 0.; double hashrate = 0.; int latency = 0; struct share_stats_t my_stats = {0}; @@ -1141,11 +1188,6 @@ static int share_result( int result, struct work *work, sizeof last_submit_time ); } -/* - share_ratio = my_stats.net_diff == 0. ? 0. : my_stats.share_diff / - my_stats.net_diff; -*/ - // check result if ( likely( result ) ) { @@ -2324,6 +2366,8 @@ static void *miner_thread( void *userdata ) pthread_mutex_unlock( &stats_lock ); } + // This code is deprecated, scanhash should never return true. + // This remains as a backup in case some old implementations still exist. // If unsubmiited nonce(s) found, submit now. if ( unlikely( nonce_found && !opt_benchmark ) ) { @@ -2350,48 +2394,6 @@ static void *miner_thread( void *userdata ) } } -#if !(defined(__WINDOWS__) || defined(_WIN64) || defined(_WIN32)) - - // Display CPU temperature and clock rate. - int curr_temp, prev_hi_temp; - static struct timeval cpu_temp_time = {0}; - - pthread_mutex_lock( &stats_lock ); - - prev_hi_temp = hi_temp; - curr_temp = cpu_temp(0); - if ( curr_temp > hi_temp ) hi_temp = curr_temp; - - pthread_mutex_unlock( &stats_lock ); - - if ( !opt_quiet || ( curr_temp >= 80 ) ) - { - int wait_time = curr_temp >= 80 ? 20 : curr_temp >= 70 ? 
60 : 120; - timeval_subtract( &diff, &tv_end, &cpu_temp_time ); - if ( ( diff.tv_sec > wait_time ) || ( curr_temp > prev_hi_temp ) ) - { - char tempstr[32]; - float lo_freq = 0., hi_freq = 0.; - - memcpy( &cpu_temp_time, &tv_end, sizeof(cpu_temp_time) ); - linux_cpu_hilo_freq( &lo_freq, &hi_freq ); - if ( use_colors && ( curr_temp >= 70 ) ) - { - if ( curr_temp >= 80 ) - sprintf( tempstr, "%s%d C%s", CL_RED, curr_temp, CL_WHT ); - else - sprintf( tempstr, "%s%d C%s", CL_YLW, curr_temp, CL_WHT ); - } - else - sprintf( tempstr, "%d C", curr_temp ); - - applog( LOG_NOTICE,"CPU temp: curr %s (max %d), Freq: %.3f/%.3f GHz", - tempstr, prev_hi_temp, lo_freq / 1e6, hi_freq / 1e6 ); - } - } - -#endif - // display hashrate if ( unlikely( opt_hash_meter ) ) { diff --git a/miner.h b/miner.h index 119c8a75..234b1cc0 100644 --- a/miner.h +++ b/miner.h @@ -457,9 +457,6 @@ bool stratum_subscribe(struct stratum_ctx *sctx); bool stratum_authorize(struct stratum_ctx *sctx, const char *user, const char *pass); bool stratum_handle_method(struct stratum_ctx *sctx, const char *s); -extern bool lowdiff_debug; - - extern bool aes_ni_supported; extern char *rpc_user; @@ -549,7 +546,7 @@ enum algos { ALGO_LYRA2REV3, ALGO_LYRA2Z, ALGO_LYRA2Z330, - ALGO_M7M, + ALGO_M7M, ALGO_MINOTAUR, ALGO_MYR_GR, ALGO_NEOSCRYPT, diff --git a/simd-utils.h b/simd-utils.h index f8ee35fd..55cc5529 100644 --- a/simd-utils.h +++ b/simd-utils.h @@ -131,7 +131,7 @@ // If a sequence of constants is to be used it can be more efficient to // use arithmetic with already existing constants to generate new ones. // -// ex: const __m512i one = _mm512_const1_64( 1 ); +// ex: const __m512i one = m512_one_64; // const __m512i two = _mm512_add_epi64( one, one ); // ////////////////////////////////////////////////////////////////////////// diff --git a/simd-utils/simd-128.h b/simd-utils/simd-128.h index 8b1fbeba..35be6109 100644 --- a/simd-utils/simd-128.h +++ b/simd-utils/simd-128.h @@ -27,13 +27,15 @@ // All of the utilities here assume all data is in registers except // in rare cases where arguments are pointers. // +// Some constants are generated using a memory overlay on the stack. +// // Intrinsics automatically promote from REX to VEX when AVX is available // but ASM needs to be done manually. // /////////////////////////////////////////////////////////////////////////// -// Efficient and convenient moving bwtween GP & low bits of XMM. +// Efficient and convenient moving between GP & low bits of XMM. // Use VEX when available to give access to xmm8-15 and zero extend for // larger vectors. @@ -81,6 +83,23 @@ static inline uint32_t mm128_mov128_32( const __m128i a ) return n; } +// Equivalent of set1, broadcast integer to all elements. +#define m128_const_i128( i ) mm128_mov64_128( i ) +#define m128_const1_64( i ) _mm_shuffle_epi32( mm128_mov64_128( i ), 0x44 ) +#define m128_const1_32( i ) _mm_shuffle_epi32( mm128_mov32_128( i ), 0x00 ) + +#if defined(__SSE4_1__) + +// Assign 64 bit integers to respective elements: {hi, lo} +#define m128_const_64( hi, lo ) \ + _mm_insert_epi64( mm128_mov64_128( lo ), hi, 1 ) + +#else // No insert in SSE2 + +#define m128_const_64 _mm_set_epi64x + +#endif + // Pseudo constants #define m128_zero _mm_setzero_si128() @@ -107,27 +126,53 @@ static inline __m128i mm128_neg1_fn() } #define m128_neg1 mm128_neg1_fn() +#if defined(__SSE4_1__) -// const functions work best when arguments are immediate constants or -// are known to be in registers. If data needs to loaded from memory or cache -// use set. 
- -// Equivalent of set1, broadcast 64 bit integer to all elements. -#define m128_const1_64( i ) _mm_shuffle_epi32( mm128_mov64_128( i ), 0x44 ) -#define m128_const1_32( i ) _mm_shuffle_epi32( mm128_mov32_128( i ), 0x00 ) +///////////////////////////// +// +// _mm_insert_ps( _mm128i v1, __m128i v2, imm8 c ) +// +// Fast and powerful but very limited in its application. +// It requires SSE4.1 but only works with 128 bit vectors with 32 bit +// elements. There is no equivalent instruction for 256 bit or 512 bit vectors. +// There's no integer version. There's no 64 bit, 16 bit or byte element +// sizing. It's unique. +// +// It can: +// - zero 32 bit elements of a 128 bit vector. +// - extract any 32 bit element from one 128 bit vector and insert the +// data to any 32 bit element of another 128 bit vector, or the same vector. +// - do both simultaneoulsly. +// +// It can be used as a more efficient replacement for _mm_insert_epi32 +// or _mm_extract_epi32. +// +// Control byte definition: +// c[3:0] zero mask +// c[5:4] destination element selector +// c[7:6] source element selector -#if defined(__SSE4_1__) +// Convert type and abbreviate name: e"x"tract "i"nsert "m"ask +#define mm128_xim_32( v1, v2, c ) \ + _mm_castps_si128( _mm_insert_ps( _mm_castsi128_ps( v1 ), \ + _mm_castsi128_ps( v2 ), c ) ) -// Assign 64 bit integers to respective elements: {hi, lo} -#define m128_const_64( hi, lo ) \ - _mm_insert_epi64( mm128_mov64_128( lo ), hi, 1 ) +// Some examples of simple operations: -#else // No insert in SSE2 +// Insert 32 bit integer into v at element c and return modified v. +static inline __m128i mm128_insert_32( const __m128i v, const uint32_t i, + const int c ) +{ return mm128_xim_32( v, mm128_mov32_128( i ), c<<4 ); } -#define m128_const_64 _mm_set_epi64x +// Extract 32 bit element c from v and return as integer. +static inline uint32_t mm128_extract_32( const __m128i v, const int c ) +{ return mm128_mov128_32( mm128_xim_32( v, v, c<<6 ) ); } -#endif +// Clear (zero) 32 bit elements based on bits set in 4 bit mask. +static inline __m128i mm128_mask_32( const __m128i v, const int m ) +{ return mm128_xim_32( v, v, m ); } +#endif // SSE4_1 // // Basic operations without equivalent SIMD intrinsic @@ -140,11 +185,6 @@ static inline __m128i mm128_neg1_fn() #define mm128_negate_32( v ) _mm_sub_epi32( m128_zero, v ) #define mm128_negate_16( v ) _mm_sub_epi16( m128_zero, v ) -// Clear (zero) 32 bit elements based on bits set in 4 bit mask. -// Fast, avoids using vector mask, but only available for 128 bit vectors. -#define mm128_mask_32( a, mask ) \ - _mm_castps_si128( _mm_insert_ps( _mm_castsi128_ps( a ), \ - _mm_castsi128_ps( a ), mask ) ) // Add 4 values, fewer dependencies than sequential addition. #define mm128_add4_64( a, b, c, d ) \ @@ -162,27 +202,6 @@ static inline __m128i mm128_neg1_fn() #define mm128_xor4( a, b, c, d ) \ _mm_xor_si128( _mm_xor_si128( a, b ), _mm_xor_si128( c, d ) ) -// Horizontal vector testing - -#if defined(__SSE4_1__) - -#define mm128_allbits0( a ) _mm_testz_si128( a, a ) -#define mm128_allbits1( a ) _mm_testc_si128( a, m128_neg1 ) -// probably broken, avx2 is -//#define mm128_allbitsne( a ) _mm_testnzc_si128( a, m128_neg1 ) -#define mm128_anybits0( a ) mm128_allbits1( a ) -#define mm128_anybits1( a ) mm128_allbits0( a ) - -#else // SSE2 - -// Bit-wise test of entire vector, useful to test results of cmp. 
-#define mm128_anybits0( a ) (uint128_t)(a) -#define mm128_anybits1( a ) (((uint128_t)(a))+1) - -#define mm128_allbits0( a ) ( !mm128_anybits1(a) ) -#define mm128_allbits1( a ) ( !mm128_anybits0(a) ) - -#endif // SSE4.1 else SSE2 // // Vector pointer cast @@ -204,11 +223,6 @@ static inline __m128i mm128_neg1_fn() #define casto_m128i(p,o) (((__m128i*)(p))+(o)) -// Memory functions -// Mostly for convenience, avoids calculating bytes. -// Assumes data is alinged and integral. -// n = number of __m128i, bytes/16 - // Memory functions // Mostly for convenience, avoids calculating bytes. // Assumes data is alinged and integral. @@ -256,14 +270,14 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n ) #define mm128_ror_32 _mm_ror_epi32 #define mm128_rol_32 _mm_rol_epi32 -#else +#else // SSE2 #define mm128_ror_64 mm128_ror_var_64 #define mm128_rol_64 mm128_rol_var_64 #define mm128_ror_32 mm128_ror_var_32 #define mm128_rol_32 mm128_rol_var_32 -#endif // AVX512 else +#endif // AVX512 else SSE2 #define mm128_ror_16( v, c ) \ _mm_or_si128( _mm_srli_epi16( v, c ), _mm_slli_epi16( v, 16-(c) ) ) @@ -280,58 +294,19 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n ) //#define mm128_swap_64( v ) _mm_alignr_epi8( v, v, 8 ) //#define mm128_ror_1x32( v ) _mm_alignr_epi8( v, v, 4 ) //#define mm128_rol_1x32( v ) _mm_alignr_epi8( v, v, 12 ) -#define mm128_ror_1x16( v ) _mm_alignr_epi8( v, v, 2 ) -#define mm128_rol_1x16( v ) _mm_alignr_epi8( v, v, 14 ) -#define mm128_ror_1x8( v ) _mm_alignr_epi8( v, v, 1 ) -#define mm128_rol_1x8( v ) _mm_alignr_epi8( v, v, 15 ) - -// Rotate by c bytes -#define mm128_ror_x8( v, c ) _mm_alignr_epi8( v, c ) -#define mm128_rol_x8( v, c ) _mm_alignr_epi8( v, 16-(c) ) - - -// Invert vector: {3,2,1,0} -> {0,1,2,3} -#define mm128_invert_32( v ) _mm_shuffle_epi32( v, 0x1b ) - -#if defined(__SSSE3__) - -#define mm128_invert_16( v ) \ - _mm_shuffle_epi8( v, mm128_const_64( 0x0100030205040706, \ - 0x09080b0a0d0c0f0e ) -#define mm128_invert_8( v ) \ - _mm_shuffle_epi8( v, mm128_const_64( 0x0001020304050607, \ - 0x08090a0b0c0d0e0f ) - -#endif // SSSE3 - - -// -// Rotate elements within lanes. +// Swap 32 bit elements in 64 bit lanes #define mm128_swap64_32( v ) _mm_shuffle_epi32( v, 0xb1 ) -#define mm128_rol64_8( v, c ) \ - _mm_or_si128( _mm_slli_epi64( v, ( ( (c)<<3 ) ), \ - _mm_srli_epi64( v, ( ( 64 - ( (c)<<3 ) ) ) ) - -#define mm128_ror64_8( v, c ) \ - _mm_or_si128( _mm_srli_epi64( v, ( ( (c)<<3 ) ), \ - _mm_slli_epi64( v, ( ( 64 - ( (c)<<3 ) ) ) ) - -#define mm128_rol32_8( v, c ) \ - _mm_or_si128( _mm_slli_epi32( v, ( ( (c)<<3 ) ), \ - _mm_srli_epi32( v, ( ( 32 - ( (c)<<3 ) ) ) ) +#if defined(__SSSE3__) -#define mm128_ror32_8( v, c ) \ - _mm_or_si128( _mm_srli_epi32( v, ( ( (c)<<3 ) ), \ - _mm_slli_epi32( v, ( ( 32 - ( (c)<<3 ) ) ) ) - +// Rotate right by c bytes +static inline __m128i mm128_ror_x8( const __m128i v, const int c ) +{ return _mm_alignr_epi8( v, v, c ); } // // Endian byte swap. -#if defined(__SSSE3__) - #define mm128_bswap_64( v ) \ _mm_shuffle_epi8( v, m128_const_64( 0x08090a0b0c0d0e0f, \ 0x0001020304050607 ) ) @@ -374,7 +349,6 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n ) #else // SSE2 -// Use inline function instead of macro due to multiple statements. 
static inline __m128i mm128_bswap_64( __m128i v ) { v = _mm_or_si128( _mm_slli_epi16( v, 8 ), _mm_srli_epi16( v, 8 ) ); diff --git a/simd-utils/simd-256.h b/simd-utils/simd-256.h index 5f94cbc8..635eb4f2 100644 --- a/simd-utils/simd-256.h +++ b/simd-utils/simd-256.h @@ -15,33 +15,35 @@ // is available. // Move integer to low element of vector, other elements are set to zero. +#define mm256_mov64_256( i ) _mm256_castsi128_si256( mm128_mov64_128( i ) ) +#define mm256_mov32_256( i ) _mm256_castsi128_si256( mm128_mov32_128( i ) ) -#define mm256_mov64_256( n ) _mm256_castsi128_si256( mm128_mov64_128( n ) ) -#define mm256_mov32_256( n ) _mm256_castsi128_si256( mm128_mov32_128( n ) ) - -#define mm256_mov256_64( a ) mm128_mov128_64( _mm256_castsi256_si128( a ) ) -#define mm256_mov256_32( a ) mm128_mov128_32( _mm256_castsi256_si128( a ) ) +// Mo0ve low element of vector to integer. +#define mm256_mov256_64( v ) mm128_mov128_64( _mm256_castsi256_si128( v ) ) +#define mm256_mov256_32( v ) mm128_mov128_32( _mm256_castsi256_si128( v ) ) // concatenate two 128 bit vectors into one 256 bit vector: { hi, lo } #define mm256_concat_128( hi, lo ) \ _mm256_inserti128_si256( _mm256_castsi128_si256( lo ), hi, 1 ) -// Equavalent of set, move 64 bit integer constants to respective 64 bit +// Equivalent of set, move 64 bit integer constants to respective 64 bit // elements. static inline __m256i m256_const_64( const uint64_t i3, const uint64_t i2, const uint64_t i1, const uint64_t i0 ) { - __m128i hi, lo; - lo = mm128_mov64_128( i0 ); - hi = mm128_mov64_128( i2 ); - lo = _mm_insert_epi64( lo, i1, 1 ); - hi = _mm_insert_epi64( hi, i3, 1 ); - return mm256_concat_128( hi, lo ); + union { __m256i m256i; + uint64_t u64[4]; } v; + v.u64[0] = i0; v.u64[1] = i1; v.u64[2] = i2; v.u64[3] = i3; + return v.m256i; } -// Equivalent of set1, broadcast integer constant to all elements. -#define m256_const1_128( v ) _mm256_broadcastsi128_si256( v ) +// Equivalent of set1. +// 128 bit vector argument +#define m256_const1_128( v ) \ + _mm256_permute4x64_epi64( _mm256_castsi128_si256( v ), 0x44 ) +// 64 bit integer argument +#define m256_const1_i128( i ) m256_const1_128( mm128_mov64_128( i ) ) #define m256_const1_64( i ) _mm256_broadcastq_epi64( mm128_mov64_128( i ) ) #define m256_const1_32( i ) _mm256_broadcastd_epi32( mm128_mov32_128( i ) ) #define m256_const1_16( i ) _mm256_broadcastw_epi16( mm128_mov32_128( i ) ) @@ -50,119 +52,29 @@ static inline __m256i m256_const_64( const uint64_t i3, const uint64_t i2, #define m256_const2_64( i1, i0 ) \ m256_const1_128( m128_const_64( i1, i0 ) ) -#define m126_const2_32( i1, i0 ) \ - m256_const1_64( ( (uint64_t)(i1) << 32 ) | ( (uint64_t)(i0) & 0xffffffff ) ) - - // // All SIMD constant macros are actually functions containing executable // code and therefore can't be used as compile time initializers. 
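
The comment above has a practical consequence worth spelling out: because the pseudo-constants expand to instructions, they can only be assigned inside a function at run time. A minimal sketch, not part of the patch, using the 256 bit names from this header; the wrapper name and the umbrella include are illustrative assumptions:

    #include "simd-utils.h"                       /* assumed umbrella include */

    /* Not valid: a pseudo-constant is executable code, so it cannot
       initialize an object with static storage duration; the compiler
       rejects it as a non-constant initializer.
       static const __m256i k_one = m256_one_64;                          */

    /* Valid: assign at run time and derive further values arithmetically,
       as the example comment in simd-utils.h earlier in this patch shows. */
    static inline __m256i two_per_lane( void )
    {
       const __m256i one = m256_one_64;           /* broadcast 1 at run time */
       return _mm256_add_epi64( one, one );       /* 2 in every 64 bit lane  */
    }
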
-#define m256_zero _mm256_setzero_si256() -#define m256_one_256 mm256_mov64_256( 1 ) -#define m256_one_128 \ - _mm256_permute4x64_epi64( _mm256_castsi128_si256( \ - mm128_mov64_128( 1 ) ), 0x44 ) -#define m256_one_64 _mm256_broadcastq_epi64( mm128_mov64_128( 1 ) ) -#define m256_one_32 _mm256_broadcastd_epi32( mm128_mov64_128( 1 ) ) -#define m256_one_16 _mm256_broadcastw_epi16( mm128_mov64_128( 1 ) ) -#define m256_one_8 _mm256_broadcastb_epi8 ( mm128_mov64_128( 1 ) ) +#define m256_zero _mm256_setzero_si256() +#define m256_one_256 mm256_mov64_256( 1 ) +#define m256_one_128 m256_const1_i128( 1 ) +#define m256_one_64 _mm256_broadcastq_epi64( mm128_mov64_128( 1 ) ) +#define m256_one_32 _mm256_broadcastd_epi32( mm128_mov64_128( 1 ) ) +#define m256_one_16 _mm256_broadcastw_epi16( mm128_mov64_128( 1 ) ) +#define m256_one_8 _mm256_broadcastb_epi8 ( mm128_mov64_128( 1 ) ) static inline __m256i mm256_neg1_fn() { - __m256i a; - asm( "vpcmpeqq %0, %0, %0\n\t" : "=x"(a) ); - return a; + __m256i v; + asm( "vpcmpeqq %0, %0, %0\n\t" : "=x"(v) ); + return v; } #define m256_neg1 mm256_neg1_fn() - -// -// Vector size conversion. -// -// Allows operations on either or both halves of a 256 bit vector serially. -// Handy for parallel AES. -// Caveats when writing: -// _mm256_castsi256_si128 is free and without side effects. -// _mm256_castsi128_si256 is also free but leaves the high half -// undefined. That's ok if the hi half will be subseqnently assigned. -// If assigning both, do lo first, If assigning only 1, use -// _mm256_inserti128_si256. -// -#define mm128_extr_lo128_256( a ) _mm256_castsi256_si128( a ) -#define mm128_extr_hi128_256( a ) _mm256_extracti128_si256( a, 1 ) - -// Extract integers from 256 bit vector, ineficient, avoid if possible.. -#define mm256_extr_4x64( a3, a2, a1, a0, src ) \ -do { \ - __m128i hi = _mm256_extracti128_si256( src, 1 ); \ - a0 = mm128_mov128_64( _mm256_castsi256_si128( src) ); \ - a1 = _mm_extract_epi64( _mm256_castsi256_si128( src ), 1 ); \ - a2 = mm128_mov128_64( hi ); \ - a3 = _mm_extract_epi64( hi, 1 ); \ -} while(0) - -#define mm256_extr_8x32( a7, a6, a5, a4, a3, a2, a1, a0, src ) \ -do { \ - uint64_t t = _mm_extract_epi64( _mm256_castsi256_si128( src ), 1 ); \ - __m128i hi = _mm256_extracti128_si256( src, 1 ); \ - a0 = mm256_mov256_32( src ); \ - a1 = _mm_extract_epi32( _mm256_castsi256_si128( src ), 1 ); \ - a2 = (uint32_t)( t ); \ - a3 = (uint32_t)( t<<32 ); \ - t = _mm_extract_epi64( hi, 1 ); \ - a4 = mm128_mov128_32( hi ); \ - a5 = _mm_extract_epi32( hi, 1 ); \ - a6 = (uint32_t)( t ); \ - a7 = (uint32_t)( t<<32 ); \ -} while(0) - - -// Bytewise test of all 256 bits -#define mm256_all0_8( a ) \ - ( _mm256_movemask_epi8( a ) == 0 ) - -#define mm256_all1_8( a ) \ - ( _mm256_movemask_epi8( a ) == -1 ) - - -#define mm256_anybits0( a ) \ - ( _mm256_movemask_epi8( a ) & 0xffffffff ) - -#define mm256_anybits1( a ) \ - ( ( _mm256_movemask_epi8( a ) & 0xffffffff ) != 0xffffffff ) - - -// Bitwise test of all 256 bits -#define mm256_allbits0( a ) _mm256_testc_si256( a, m256_neg1 ) -#define mm256_allbits1( a ) _mm256_testc_si256( m256_zero, a ) -//#define mm256_anybits0( a ) !mm256_allbits1( a ) -//#define mm256_anybits1( a ) !mm256_allbits0( a ) - - -// Parallel AES, for when x is expected to be in a 256 bit register. -// Use same 128 bit key. 
- -#if defined(__VAES__) - -#define mm256_aesenc_2x128( x, k ) \ - _mm256_aesenc_epi128( x, k ) - -#else - -#define mm256_aesenc_2x128( x, k ) \ - mm256_concat_128( _mm_aesenc_si128( mm128_extr_hi128_256( x ), k ), \ - _mm_aesenc_si128( mm128_extr_lo128_256( x ), k ) ) - -#endif - -#define mm256_paesenc_2x128( y, x, k ) do \ -{ \ - __m128i *X = (__m128i*)x; \ - __m128i *Y = (__m128i*)y; \ - Y[0] = _mm_aesenc_si128( X[0], k ); \ - Y[1] = _mm_aesenc_si128( X[1], k ); \ -} while(0); +// Consistent naming for similar operations. +#define mm128_extr_lo128_256( v ) _mm256_castsi256_si128( v ) +#define mm128_extr_hi128_256( v ) _mm256_extracti128_si256( v, 1 ) // // Pointer casting @@ -201,13 +113,13 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n ) // // Basic operations without SIMD equivalent -// Bitwise not ( ~x ) -#define mm256_not( x ) _mm256_xor_si256( (x), m256_neg1 ) \ +// Bitwise not ( ~v ) +#define mm256_not( v ) _mm256_xor_si256( v, m256_neg1 ) \ -// Unary negation of each element ( -a ) -#define mm256_negate_64( a ) _mm256_sub_epi64( m256_zero, a ) -#define mm256_negate_32( a ) _mm256_sub_epi32( m256_zero, a ) -#define mm256_negate_16( a ) _mm256_sub_epi16( m256_zero, a ) +// Unary negation of each element ( -v ) +#define mm256_negate_64( v ) _mm256_sub_epi64( m256_zero, v ) +#define mm256_negate_32( v ) _mm256_sub_epi32( m256_zero, v ) +#define mm256_negate_16( v ) _mm256_sub_epi16( m256_zero, v ) // Add 4 values, fewer dependencies than sequential addition. @@ -265,17 +177,14 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n ) #define mm256_ror_32 _mm256_ror_epi32 #define mm256_rol_32 _mm256_rol_epi32 -#else - - -// No AVX512, use fallback. +#else // AVX2 #define mm256_ror_64 mm256_ror_var_64 #define mm256_rol_64 mm256_rol_var_64 #define mm256_ror_32 mm256_ror_var_32 #define mm256_rol_32 mm256_rol_var_32 -#endif // AVX512 else +#endif // AVX512 else AVX2 #define mm256_ror_16( v, c ) \ _mm256_or_si256( _mm256_srli_epi16( v, c ), \ @@ -285,46 +194,6 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n ) _mm256_or_si256( _mm256_slli_epi16( v, c ), \ _mm256_srli_epi16( v, 16-(c) ) ) -// Rotate bits in each element of v by the amount in corresponding element of -// index vector c -#define mm256_rorv_64( v, c ) \ - _mm256_or_si256( \ - _mm256_srlv_epi64( v, c ), \ - _mm256_sllv_epi64( v, _mm256_sub_epi64( \ - _mm256_set1_epi64x( 64 ), c ) ) ) - -#define mm256_rolv_64( v, c ) \ - _mm256_or_si256( \ - _mm256_sllv_epi64( v, c ), \ - _mm256_srlv_epi64( v, _mm256_sub_epi64( \ - _mm256_set1_epi64x( 64 ), c ) ) ) - -#define mm256_rorv_32( v, c ) \ - _mm256_or_si256( \ - _mm256_srlv_epi32( v, c ), \ - _mm256_sllv_epi32( v, _mm256_sub_epi32( \ - _mm256_set1_epi32( 32 ), c ) ) ) - -#define mm256_rolv_32( v, c ) \ - _mm256_or_si256( \ - _mm256_sllv_epi32( v, c ), \ - _mm256_srlv_epi32( v, _mm256_sub_epi32( \ - _mm256_set1_epi32( 32 ), c ) ) ) - -// AVX512 can do 16 bit elements. -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) - -#define mm256_rorv_16( v, c ) \ - _mm256_or_si256( \ - _mm256_srlv_epi16( v, _mm256_set1_epi16( c ) ), \ - _mm256_sllv_epi16( v, _mm256_set1_epi16( 16-(c) ) ) ) - -#define mm256_rolv_16( v, c ) \ - _mm256_or_si256( \ - _mm256_sllv_epi16( v, _mm256_set1_epi16( c ) ), \ - _mm256_srlv_epi16( v, _mm256_set1_epi16( 16-(c) ) ) ) - -#endif // AVX512 // // Rotate elements accross all lanes. 
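
For reference, the mm256_ror_var_64 family used as the AVX2 fallback above is not shown in this hunk; it is assumed to follow the same or/shift pattern as the mm512_*_var_* functions later in this patch. A sketch only, with an illustrative name, not the header's exact definition:

    /* Rotate each 64 bit lane right by c bits, 0 < c < 64. */
    static inline __m256i ror_var_64_sketch( const __m256i v, const int c )
    {
       return _mm256_or_si256( _mm256_srli_epi64( v, c ),
                               _mm256_slli_epi64( v, 64 - c ) );
    }
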
@@ -336,13 +205,26 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n ) #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) -#define mm256_swap_128( v ) _mm256_alignr_epi64( v, v, 2 ) -#define mm256_ror_1x64( v ) _mm256_alignr_epi64( v, v, 1 ) -#define mm256_rol_1x64( v ) _mm256_alignr_epi64( v, v, 3 ) -#define mm256_ror_1x32( v ) _mm256_alignr_epi32( v, v, 1 ) -#define mm256_rol_1x32( v ) _mm256_alignr_epi32( v, v, 7 ) -#define mm256_ror_3x32( v ) _mm256_alignr_epi32( v, v, 3 ) -#define mm256_rol_3x32( v ) _mm256_alignr_epi32( v, v, 5 ) +static inline __m256i mm256_swap_128( const __m256i v ) +{ return _mm256_alignr_epi64( v, v, 2 ); } + +static inline __m256i mm256_ror_1x64( const __m256i v ) +{ return _mm256_alignr_epi64( v, v, 1 ); } + +static inline __m256i mm256_rol_1x64( const __m256i v ) +{ return _mm256_alignr_epi64( v, v, 3 ); } + +static inline __m256i mm256_ror_1x32( const __m256i v ) +{ return _mm256_alignr_epi32( v, v, 1 ); } + +static inline __m256i mm256_rol_1x32( const __m256i v ) +{ return _mm256_alignr_epi32( v, v, 7 ); } + +static inline __m256i mm256_ror_3x32( const __m256i v ) +{ return _mm256_alignr_epi32( v, v, 3 ); } + +static inline __m256i mm256_rol_3x32( const __m256i v ) +{ return _mm256_alignr_epi32( v, v, 5 ); } #else // AVX2 @@ -377,131 +259,18 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n ) #endif // AVX512 else AVX2 - -// AVX512 can do 16 & 8 bit elements. -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) - -// Rotate 256 bit vector by one 16 bit element. -#define mm256_ror_1x16( v ) \ - _mm256_permutexvar_epi16( m256_const_64( \ - 0x0000000f000e000d, 0x000c000b000a0009, \ - 0x0008000700060005, 0x0004000300020001 ), v ) - -#define mm256_rol_1x16( v ) \ - _mm256_permutexvar_epi16( m256_const_64( \ - 0x000e000d000c000b, 0x000a000900080007, \ - 0x0006000500040003, 0x000200010000000f ), v ) - -#if defined (__AVX512VBMI__) - -// Rotate 256 bit vector by one byte. -#define mm256_ror_1x8( v ) _mm256_permutexvar_epi8( m256_const_64( \ - 0x001f1e1d1c1b1a19, 0x1817161514131211, \ - 0x100f0e0d0c0b0a09, 0x0807060504030201 ), v ) - -#define mm256_rol_1x8( v ) _mm256_permutexvar_epi16( m256_const_64( \ - 0x1e1d1c1b1a191817, 0x161514131211100f, \ - 0x0e0d0c0b0a090807, 0x060504030201001f ), v ) - -#endif // VBMI - -#endif // AVX512 - - -// Invert vector: {3,2,1,0} -> {0,1,2,3} - -#define mm256_invert_64 ( v ) _mm256_permute4x64_epi64( v, 0x1b ) - -#define mm256_invert_32 ( v ) _mm256_permutevar8x32_epi32( v, \ - m256_const_64( 0x0000000000000001, 0x0000000200000003 \ - 0x0000000400000005, 0x0000000600000007 ) - -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) - -// Invert vector: {7,6,5,4,3,2,1,0} -> {0,1,2,3,4,5,6,7} -#define mm256_invert_16 ( v ) \ - _mm256_permutexvar_epi16( m256_const_64( \ - 0x0000000100020003, 0x0004000500060007, \ - 0x00080009000a000b, 0x000c000d000e000f ), v ) - -#if defined(__AVX512VBMI__) - -#define mm256_invert_8( v ) \ - _mm256_permutexvar_epi8( m256_const_64( \ - 0x0001020304050607, 0x08090a0b0c0d0e0f, \ - 0x1011121314151617, 0x18191a1b1c1d1e1f ), v ) -#endif // VBMI -#endif // AVX512 - - // // Rotate elements within each 128 bit lane of 256 bit vector. 
-#define mm256_swap128_64( v ) _mm256_shuffle_epi32( v, 0x4e ) - -#define mm256_ror128_32( v ) _mm256_shuffle_epi32( v, 0x39 ) - -#define mm256_rol128_32( v ) _mm256_shuffle_epi32( v, 0x93 ) - -#define mm256_ror128_x8( v, c ) _mm256_alignr_epi8( v, v, c ) - -/* -// Rotate each 128 bit lane by c elements. -#define mm256_ror128_8( v, c ) \ - _mm256_or_si256( _mm256_bsrli_epi128( v, c ), \ - _mm256_bslli_epi128( v, 16-(c) ) ) -#define mm256_rol128_8( v, c ) \ - _mm256_or_si256( _mm256_bslli_epi128( v, c ), \ - _mm256_bsrli_epi128( v, 16-(c) ) ) -*/ - -// Rotate elements in each 64 bit lane - -#define mm256_swap64_32( v ) _mm256_shuffle_epi32( v, 0xb1 ) - -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) - -#define mm256_rol64_8( v, c ) _mm256_rol_epi64( v, ((c)<<3) ) -#define mm256_ror64_8( v, c ) _mm256_ror_epi64( v, ((c)<<3) ) - -#else - -#define mm256_rol64_8( v, c ) \ - _mm256_or_si256( _mm256_slli_epi64( v, ( ( (c)<<3 ) ), \ - _mm256_srli_epi64( v, ( ( 64 - ( (c)<<3 ) ) ) ) - -#define mm256_ror64_8( v, c ) \ - _mm256_or_si256( _mm256_srli_epi64( v, ( ( (c)<<3 ) ), \ - _mm256_slli_epi64( v, ( ( 64 - ( (c)<<3 ) ) ) ) - -#endif - - -// Rotate elements in each 32 bit lane - -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) - -#define mm256_swap32_16( v ) _mm256_rol_epi32( v, 16 ) - -#define mm256_rol32_8( v ) _mm256_rol_epi32( v, 8 ) -#define mm256_ror32_8( v ) _mm256_ror_epi32( v, 8 ) - -#else - -#define mm256_swap32_16( v ) \ - _mm256_or_si256( _mm256_slli_epi32( v, 16 ), \ - _mm256_srli_epi32( v, 16 ) ) - -#define mm256_rol32_8( v ) \ - _mm256_or_si256( _mm256_slli_epi32( v, 8 ), \ - _mm256_srli_epi32( v, 8 ) ) - -#define mm256_ror32_8( v, c ) \ - _mm256_or_si256( _mm256_srli_epi32( v, 8 ), \ - _mm256_slli_epi32( v, 8 ) ) +#define mm256_swap128_64( v ) _mm256_shuffle_epi32( v, 0x4e ) +#define mm256_ror128_32( v ) _mm256_shuffle_epi32( v, 0x39 ) +#define mm256_rol128_32( v ) _mm256_shuffle_epi32( v, 0x93 ) -#endif +static inline __m256i mm256_ror128_x8( const __m256i v, const int c ) +{ return _mm256_alignr_epi8( v, v, c ); } +// Swap 32 bit elements in each 64 bit lane. +#define mm256_swap64_32( v ) _mm256_shuffle_epi32( v, 0xb1 ) // // Swap bytes in vector elements, endian bswap. diff --git a/simd-utils/simd-512.h b/simd-utils/simd-512.h index a13e88f4..22c5331a 100644 --- a/simd-utils/simd-512.h +++ b/simd-utils/simd-512.h @@ -26,9 +26,6 @@ // _mm512_permutex_epi64 only shuffles within 256 bit lanes. Permute // usually shuffles accross all lanes. // -// Some instructions like cmp and blend use a mask regsiter now instead -// a mask vector. -// // permutexvar has args reversed, index is first arg. Previously all // permutes and shuffles have the index last. // @@ -85,52 +82,43 @@ #define mm512_mov256_64( a ) mm128_mov128_64( _mm256_castsi512_si128( a ) ) #define mm512_mov256_32( a ) mm128_mov128_32( _mm256_castsi512_si128( a ) ) - -// Insert and extract integers is a multistage operation. -// Insert integer into __m128i, then insert __m128i to __m256i, finally -// insert __256i into __m512i. Reverse the order for extract. -// Do not use __m512_insert_epi64 or _mm256_insert_epi64 to perform multiple -// inserts. -// Avoid small integers for multiple inserts. -// Shortcuts: -// Use castsi to reference the low bits of a vector or sub-vector. (free) -// Use mov to insert integer into low bits of vector or sub-vector. (cheap) -// Use _mm_insert only to reference the high bits of __m128i. 
(expensive) -// Sequence instructions to minimize data dependencies. -// Use const or const1 only when integer is either immediate or known to be in -// a GP register. Use set/set1 when data needs to be loaded from memory or -// cache. +// A simple 128 bit permute, using function instead of macro avoids +// problems if the v arg passed as an expression. +static inline __m512i mm512_perm_128( const __m512i v, const int c ) +{ return _mm512_shuffle_i64x2( v, v, c ); } // Concatenate two 256 bit vectors into one 512 bit vector {hi, lo} #define mm512_concat_256( hi, lo ) \ _mm512_inserti64x4( _mm512_castsi256_si512( lo ), hi, 1 ) // Equivalent of set, assign 64 bit integers to respective 64 bit elements. +// Use stack memory overlay static inline __m512i m512_const_64( const uint64_t i7, const uint64_t i6, const uint64_t i5, const uint64_t i4, const uint64_t i3, const uint64_t i2, const uint64_t i1, const uint64_t i0 ) { - __m256i hi, lo; - __m128i hi1, lo1; - lo = mm256_mov64_256( i0 ); - lo1 = mm128_mov64_128( i2 ); - hi = mm256_mov64_256( i4 ); - hi1 = mm128_mov64_128( i6 ); - lo = _mm256_castsi128_si256( - _mm_insert_epi64( _mm256_castsi256_si128( lo ), i1, 1 ) ); - lo1 = _mm_insert_epi64( lo1, i3, 1 ); - hi = _mm256_castsi128_si256( - _mm_insert_epi64( _mm256_castsi256_si128( hi ), i5, 1 ) ); - hi1 = _mm_insert_epi64( hi1, i7, 1 ); - lo = _mm256_inserti128_si256( lo, lo1, 1 ); - hi = _mm256_inserti128_si256( hi, hi1, 1 ); - return mm512_concat_256( hi, lo ); + union { __m512i m512i; + uint64_t u64[8]; } v; + v.u64[0] = i0; v.u64[1] = i1; + v.u64[2] = i2; v.u64[3] = i3; + v.u64[4] = i4; v.u64[5] = i5; + v.u64[6] = i6; v.u64[7] = i7; + return v.m512i; } -// Equivalent of set1, broadcast 64 bit constant to all 64 bit elements. -#define m512_const1_256( v ) _mm512_broadcast_i64x4( v ) -#define m512_const1_128( v ) _mm512_broadcast_i64x2( v ) +// Equivalent of set1, broadcast lo element all elements. 
+static inline __m512i m512_const1_256( const __m256i v ) +{ return _mm512_inserti64x4( _mm512_castsi256_si512( v ), v, 1 ); } + +#define m512_const1_128( v ) \ + mm512_perm_128( _mm512_castsi128_si512( v ), 0 ) +// Integer input argument up to 64 bits +#define m512_const1_i128( i ) \ + mm512_perm_128( _mm512_castsi128_si512( mm128_mov64_128( i ) ), 0 ) + +//#define m512_const1_256( v ) _mm512_broadcast_i64x4( v ) +//#define m512_const1_128( v ) _mm512_broadcast_i64x2( v ) #define m512_const1_64( i ) _mm512_broadcastq_epi64( mm128_mov64_128( i ) ) #define m512_const1_32( i ) _mm512_broadcastd_epi32( mm128_mov32_128( i ) ) #define m512_const1_16( i ) _mm512_broadcastw_epi16( mm128_mov32_128( i ) ) @@ -142,23 +130,17 @@ static inline __m512i m512_const_64( const uint64_t i7, const uint64_t i6, #define m512_const2_64( i1, i0 ) \ m512_const1_128( m128_const_64( i1, i0 ) ) -#define m512_const2_32( i1, i0 ) \ - m512_const1_64( ( (uint64_t)(i1) << 32 ) | ( (uint64_t)(i0) & 0xffffffff ) ) - -// { m128_1, m128_1, m128_0, m128_0 } -#define m512_const_2x128( v1, v0 ) \ - m512_mask_blend_epi64( 0x0f, m512_const1_128( v1 ), m512_const1_128( v0 ) ) static inline __m512i m512_const4_64( const uint64_t i3, const uint64_t i2, const uint64_t i1, const uint64_t i0 ) { - __m256i lo = mm256_mov64_256( i0 ); - __m128i hi = mm128_mov64_128( i2 ); - lo = _mm256_castsi128_si256( - _mm_insert_epi64( _mm256_castsi256_si128( - lo ), i1, 1 ) ); - hi = _mm_insert_epi64( hi, i3, 1 ); - return _mm512_broadcast_i64x4( _mm256_inserti128_si256( lo, hi, 1 ) ); + union { __m512i m512i; + uint64_t u64[8]; } v; + v.u64[0] = v.u64[4] = i0; + v.u64[1] = v.u64[5] = i1; + v.u64[2] = v.u64[6] = i2; + v.u64[3] = v.u64[7] = i3; + return v.m512i; } // @@ -170,14 +152,15 @@ static inline __m512i m512_const4_64( const uint64_t i3, const uint64_t i2, #define m512_zero _mm512_setzero_si512() #define m512_one_512 mm512_mov64_512( 1 ) -#define m512_one_256 _mm512_broadcast_i64x4 ( mm256_mov64_256( 1 ) ) -#define m512_one_128 _mm512_broadcast_i64x2 ( mm128_mov64_128( 1 ) ) -#define m512_one_64 _mm512_broadcastq_epi64( mm128_mov64_128( 1 ) ) -#define m512_one_32 _mm512_broadcastd_epi32( mm128_mov64_128( 1 ) ) -#define m512_one_16 _mm512_broadcastw_epi16( mm128_mov64_128( 1 ) ) -#define m512_one_8 _mm512_broadcastb_epi8 ( mm128_mov64_128( 1 ) ) +#define m512_one_256 _mm512_inserti64x4( m512_one_512, m256_one_256, 1 ) +#define m512_one_128 m512_const1_i128( 1 ) +#define m512_one_64 m512_const1_64( 1 ) +#define m512_one_32 m512_const1_32( 1 ) +#define m512_one_16 m512_const1_16( 1 ) +#define m512_one_8 m512_const1_8( 1 ) -#define m512_neg1 m512_const1_64( 0xffffffffffffffff ) +//#define m512_neg1 m512_const1_64( 0xffffffffffffffff ) +#define m512_neg1 _mm512_movm_epi64( 0xff ) // // Basic operations without SIMD equivalent @@ -242,15 +225,6 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n ) _mm512_xor_si512( _mm512_xor_si512( a, b ), _mm512_xor_si512( c, d ) ) - -// Horizontal vector testing -// Returns bit __mmask8 -#define mm512_allbits0( a ) _mm512_cmpeq_epi64_mask( a, m512_zero ) -#define mm512_allbits1( a ) _mm512_cmpeq_epi64_mask( a, m512_neg1 ) -#define mm512_anybits0( a ) _mm512_cmpneq_epi64_mask( a, m512_neg1 ) -#define mm512_anybits1( a ) _mm512_cmpneq_epi64_mask( a, m512_zero ) - - // // Bit rotations. 
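
A short usage sketch of the constant builders defined in this hunk; the wrapper function and variable names are the editor's illustrations, not part of the patch:

    static inline void const_builder_examples( void )
    {
       /* Overlay assignment of all eight 64 bit elements: {7,6,...,1,0}.  */
       const __m512i lane_ids = m512_const_64( 7, 6, 5, 4, 3, 2, 1, 0 );
       /* Four 64 bit values repeated into both 256 bit halves.            */
       const __m512i rep256   = m512_const4_64( 3, 2, 1, 0 );
       /* 64 bit integer broadcast to every 128 bit lane; with argument 1
          this is the same value as m512_one_128.                          */
       const __m512i one_128  = m512_const1_i128( 1 );
       (void)lane_ids; (void)rep256; (void)one_128;
    }
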
@@ -262,37 +236,47 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n ) // _mm512_rolv_epi64, _mm512_rorv_epi64, _mm512_rolv_epi32, _mm512_rorv_epi32 // +// For convenience and consistency with AVX2 #define mm512_ror_64 _mm512_ror_epi64 #define mm512_rol_64 _mm512_rol_epi64 #define mm512_ror_32 _mm512_ror_epi32 #define mm512_rol_32 _mm512_rol_epi32 -#define mm512_ror_var_64( v, c ) \ - _mm512_or_si512( _mm512_srli_epi64( v, c ), \ - _mm512_slli_epi64( v, 64-(c) ) ) - -#define mm512_rol_var_64( v, c ) \ - _mm512_or_si512( _mm512_slli_epi64( v, c ), \ - _mm512_srli_epi64( v, 64-(c) ) ) - -#define mm512_ror_var_32( v, c ) \ - _mm512_or_si512( _mm512_srli_epi32( v, c ), \ - _mm512_slli_epi32( v, 32-(c) ) ) +static inline __m512i mm512_ror_var_64( const __m512i v, const int c ) +{ + return _mm512_or_si512( _mm512_srli_epi64( v, c ), + _mm512_slli_epi64( v, 64-c ) ); +} -#define mm512_rol_var_32( v, c ) \ - _mm512_or_si512( _mm512_slli_epi32( v, c ), \ - _mm512_srli_epi32( v, 32-(c) ) ) +static inline __m512i mm512_rol_var_64( const __m512i v, const int c ) +{ + return _mm512_or_si512( _mm512_slli_epi64( v, c ), + _mm512_srli_epi64( v, 64-c ) ); +} +static inline __m512i mm512_ror_var_32( const __m512i v, const int c ) +{ + return _mm512_or_si512( _mm512_srli_epi32( v, c ), + _mm512_slli_epi32( v, 32-c ) ); +} -// Here is a fixed bit rotate for 16 bit elements: -#define mm512_ror_16( v, c ) \ - _mm512_or_si512( _mm512_srli_epi16( v, c ), \ - _mm512_slli_epi16( v, 16-(c) ) -#define mm512_rol_16( v, c ) \ - _mm512_or_si512( _mm512_slli_epi16( v, c ), \ - _mm512_srli_epi16( v, 16-(c) ) +static inline __m512i mm512_rol_var_32( const __m512i v, const int c ) +{ + return _mm512_or_si512( _mm512_slli_epi32( v, c ), + _mm512_srli_epi32( v, 32-c ) ); +} +static inline __m512i mm512_ror_16( __m512i const v, const int c ) +{ + return _mm512_or_si512( _mm512_srli_epi16( v, c ), + _mm512_slli_epi16( v, 16-c ) ); +} +static inline __m512i mm512_rol_16( const __m512i v, const int c ) +{ + return _mm512_or_si512( _mm512_slli_epi16( v, c ), + _mm512_srli_epi16( v, 16-c ) ); +} // Rotations using a vector control index are very slow due to overhead // to generate the index vector. Repeated rotations using the same index @@ -363,25 +347,32 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n ) // // Rotate elements in 512 bit vector. +static inline __m512i mm512_swap_256( const __m512i v ) +{ return _mm512_alignr_epi64( v, v, 4 ); } + +static inline __m512i mm512_ror_1x128( const __m512i v ) +{ return _mm512_alignr_epi64( v, v, 2 ); } + +static inline __m512i mm512_rol_1x128( const __m512i v ) +{ return _mm512_alignr_epi64( v, v, 6 ); } -#define mm512_swap_256( v ) _mm512_alignr_epi64( v, v, 4 ) +static inline __m512i mm512_ror_1x64( const __m512i v ) +{ return _mm512_alignr_epi64( v, v, 1 ); } -// 1x64 notation used to disinguish from bit rotation. 
-#define mm512_ror_1x128( v ) _mm512_alignr_epi64( v, v, 2 ) -#define mm512_rol_1x128( v ) _mm512_alignr_epi64( v, v, 6 ) +static inline __m512i mm512_rol_1x64( const __m512i v ) +{ return _mm512_alignr_epi64( v, v, 7 ); } -#define mm512_ror_1x64( v ) _mm512_alignr_epi64( v, v, 1 ) -#define mm512_rol_1x64( v ) _mm512_alignr_epi64( v, v, 7 ) +static inline __m512i mm512_ror_1x32( const __m512i v ) +{ return _mm512_alignr_epi32( v, v, 1 ); } -#define mm512_ror_1x32( v ) _mm512_alignr_epi32( v, v, 1 ) -#define mm512_rol_1x32( v ) _mm512_alignr_epi32( v, v, 15 ) +static inline __m512i mm512_rol_1x32( const __m512i v ) +{ return _mm512_alignr_epi32( v, v, 15 ); } -// Generic for odd rotations -#define mm512_ror_x64( v, n ) _mm512_alignr_epi64( v, v, n ) -#define mm512_rol_x64( v, n ) _mm512_alignr_epi64( v, v, 8-(n) ) +static inline __m512i mm512_ror_x64( const __m512i v, const int n ) +{ return _mm512_alignr_epi64( v, v, n ); } -#define mm512_ror_x32( v, n ) _mm512_alignr_epi32( v, v, n ) -#define mm512_rol_x32( v, n ) _mm512_alignr_epi32( v, v, 16-(n) ) +static inline __m512i mm512_ror_x32( const __m512i v, const int n ) +{ return _mm512_alignr_epi32( v, v, n ); } #define mm512_ror_1x16( v ) \ _mm512_permutexvar_epi16( m512_const_64( \ @@ -411,38 +402,6 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n ) 0x1E1D1C1B1A191817, 0x161514131211100F, \ 0x0E0D0C0B0A090807, 0x060504030201003F ) ) - -// Invert vector: {3,2,1,0} -> {0,1,2,3} -#define mm512_invert_256( v ) \ - _mm512_permutexvar_epi64( v, m512_const_64( 3,2,1,0,7,6,5,4 ) ) - -#define mm512_invert_128( v ) \ - _mm512_permutexvar_epi64( v, m512_const_64( 1,0,3,2,5,4,7,6 ) ) - -#define mm512_invert_64( v ) \ - _mm512_permutexvar_epi64( v, m512_const_64( 0,1,2,3,4,5,6,7 ) ) - -#define mm512_invert_32( v ) \ - _mm512_permutexvar_epi32( m512_const_64( \ - 0x0000000000000001,0x0000000200000003, \ - 0x0000000400000005,0x0000000600000007, \ - 0x0000000800000009,0x0000000a0000000b, \ - 0x0000000c0000000d,0x0000000e0000000f ), v ) - -#define mm512_invert_16( v ) \ - _mm512_permutexvar_epi16( m512_const_64( \ - 0x0000000100020003, 0x0004000500060007, \ - 0x00080009000A000B, 0x000C000D000E000F, \ - 0x0010001100120013, 0x0014001500160017, \ - 0x00180019001A001B, 0x001C001D001E001F ), v ) - -#define mm512_invert_8( v ) \ - _mm512_shuffle_epi8( v, m512_const_64( \ - 0x0001020304050607, 0x08090A0B0C0D0E0F, \ - 0x1011121314151617, 0x18191A1B1C1D1E1F, \ - 0x2021222324252627, 0x28292A2B2C2D2E2F, \ - 0x3031323334353637, 0x38393A3B3C3D3E3F ) ) - // // Rotate elements within 256 bit lanes of 512 bit vector. 
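
The conversions from macros to static inline functions above follow the rationale already stated at mm512_perm_128: a plain macro expands its argument twice, so any side effect in the argument also runs twice. A hypothetical before/after sketch; OLD_SWAP_256 and swap_both are invented names for illustration only:

    #define OLD_SWAP_256( v ) _mm512_alignr_epi64( v, v, 4 )   /* old macro style */

    static inline __m512i swap_both( const __m512i *p )
    {
       __m512i a = OLD_SWAP_256( *p++ );     /* p advances twice, unsequenced  */
       __m512i b = mm512_swap_256( *p++ );   /* function form: evaluated once  */
       return _mm512_xor_si512( a, b );
    }
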
@@ -450,11 +409,10 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n ) #define mm512_swap256_128( v ) _mm512_permutex_epi64( v, 0x4e ) // Rotate 256 bit lanes by one 64 bit element -#define mm512_ror256_64( v ) _mm512_permutex_epi64( v, 0x39 ) -#define mm512_rol256_64( v ) _mm512_permutex_epi64( v, 0x93 ) +#define mm512_ror256_64( v ) _mm512_permutex_epi64( v, 0x39 ) +#define mm512_rol256_64( v ) _mm512_permutex_epi64( v, 0x93 ) // Rotate 256 bit lanes by one 32 bit element - #define mm512_ror256_32( v ) \ _mm512_permutexvar_epi32( m512_const_64( \ 0x000000080000000f, 0x0000000e0000000d, \ @@ -488,68 +446,41 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n ) 0x203f3e3d3c3b3a39, 0x3837363534333231, \ 0x302f2e2d2c2b2a29, 0x2827262524232221, \ 0x001f1e1d1c1b1a19, 0x1817161514131211, \ - 0x100f0e0d0c0b0a09, 0x0807060504030201 ), v ) + 0x100f0e0d0c0b0a09, 0x0807060504030201 ) ) #define mm512_rol256_8( v ) \ _mm512_shuffle_epi8( v, m512_const_64( \ 0x3e3d3c3b3a393837, 0x363534333231302f, \ 0x2e2d2c2b2a292827, 0x262524232221203f, \ 0x1e1d1c1b1a191817, 0x161514131211100f, \ - 0x0e0d0c0b0a090807, 0x060504030201001f ), v ) + 0x0e0d0c0b0a090807, 0x060504030201001f ) ) // // Rotate elements within 128 bit lanes of 512 bit vector. -// Swap hi & lo 64 bits in each 128 bit lane -#define mm512_swap128_64( v ) _mm512_shuffle_epi32( v, 0x4e ) +// Swap 64 bits in each 128 bit lane +#define mm512_swap128_64( v ) _mm512_shuffle_epi32( v, 0x4e ) // Rotate 128 bit lanes by one 32 bit element -#define mm512_ror128_32( v ) _mm512_shuffle_epi32( v, 0x39 ) -#define mm512_rol128_32( v ) _mm512_shuffle_epi32( v, 0x93 ) - -#define mm512_ror128_x8( v, c ) _mm512_alignr_epi8( v, v, c ) - -/* -// Rotate 128 bit lanes by c bytes, faster than building that monstrous -// constant above. -#define mm512_ror128_8( v, c ) \ - _mm512_or_si512( _mm512_bsrli_epi128( v, c ), \ - _mm512_bslli_epi128( v, 16-(c) ) ) -#define mm512_rol128_8( v, c ) \ - _mm512_or_si512( _mm512_bslli_epi128( v, c ), \ - _mm512_bsrli_epi128( v, 16-(c) ) ) -*/ - -// -// Rotate elements within 64 bit lanes. - -#define mm512_rol64_x8( v, c ) _mm512_rol_epi64( v, ((c)<<3) ) -#define mm512_ror64_x8( v, c ) _mm512_ror_epi64( v, ((c)<<3) ) - -// Swap 32 bit elements in each 64 bit lane -#define mm512_swap64_32( v ) _mm512_shuffle_epi32( v, 0xb1 ) +#define mm512_ror128_32( v ) _mm512_shuffle_epi32( v, 0x39 ) +#define mm512_rol128_32( v ) _mm512_shuffle_epi32( v, 0x93 ) -// Rotate each 64 bit lane by one 16 bit element. -#define mm512_ror64_16( v ) _mm512_ror_epi64( v, 16 ) -#define mm512_rol64_16( v ) _mm512_rol_epi64( v, 16 ) -#define mm512_ror64_8( v ) _mm512_ror_epi64( v, 8 ) -#define mm512_rol64_8( v ) _mm512_rol_epi64( v, 8 ) - -// -// Rotate elements within 32 bit lanes. +// Rotate right 128 bit lanes by c bytes +static inline __m512i mm512_ror128_x8( const __m512i v, const int c ) +{ return _mm512_alignr_epi8( v, v, c ); } -#define mm512_rol32_x8( v, c ) _mm512_rol_epi32( v, ((c)<<2) ) -#define mm512_ror32_x8( v, c ) _mm512_ror_epi32( v, ((c)<<2) ) +// Swap 32 bits in each 64 bit lane. +#define mm512_swap64_32( v ) _mm512_shuffle_epi32( v, 0xb1 ) // // Rotate elements from 2 512 bit vectors in place, source arguments // are overwritten. 
-#define mm512_swap1024_512(v1, v2) \ - v1 = _mm512_xor_si512(v1, v2); \ - v2 = _mm512_xor_si512(v1, v2); \ - v1 = _mm512_xor_si512(v1, v2); +#define mm512_swap1024_512( v1, v2 ) \ + v1 = _mm512_xor_si512( v1, v2 ); \ + v2 = _mm512_xor_si512( v1, v2 ); \ + v1 = _mm512_xor_si512( v1, v2 ); #define mm512_ror1024_256( v1, v2 ) \ do { \ diff --git a/simd-utils/simd-64.h b/simd-utils/simd-64.h index 2f50ec1a..e74066b6 100644 --- a/simd-utils/simd-64.h +++ b/simd-utils/simd-64.h @@ -1,18 +1,18 @@ #if !defined(SIMD_64_H__) #define SIMD_64_H__ 1 -#if defined(__MMX__) +#if defined(__MMX__) && defined(__SSE__) //////////////////////////////////////////////////////////////// // // 64 bit MMX vectors. // -// There are rumours MMX wil be removed. Although casting with int64 -// works there is likely some overhead to move the data to An MMX register -// and back. - +// This code is not used anywhere annd likely never will. It's intent was +// to support 2 way parallel hashing using SSE2 for 64 bit, and MMX for 32 +// bit hash functions, but was never implemented. // Pseudo constants + /* #define m64_zero _mm_setzero_si64() #define m64_one_64 _mm_set_pi32( 0UL, 1UL ) @@ -30,79 +30,67 @@ #define casti_m64(p,i) (((__m64*)(p))[(i)]) -// cast all arguments as the're likely to be uint64_t - // Bitwise not: ~(a) //#define mm64_not( a ) _mm_xor_si64( (__m64)a, m64_neg1 ) #define mm64_not( a ) ( (__m64)( ~( (uint64_t)(a) ) ) // Unary negate elements -#define mm64_negate_32( v ) _mm_sub_pi32( m64_zero, (__m64)v ) -#define mm64_negate_16( v ) _mm_sub_pi16( m64_zero, (__m64)v ) -#define mm64_negate_8( v ) _mm_sub_pi8( m64_zero, (__m64)v ) +#define mm64_negate_32( v ) _mm_sub_pi32( m64_zero, v ) +#define mm64_negate_16( v ) _mm_sub_pi16( m64_zero, v ) +#define mm64_negate_8( v ) _mm_sub_pi8( m64_zero, v ) // Rotate bits in packed elements of 64 bit vector #define mm64_rol_64( a, n ) \ - _mm_or_si64( _mm_slli_si64( (__m64)(a), n ), \ - _mm_srli_si64( (__m64)(a), 64-(n) ) ) + _mm_or_si64( _mm_slli_si64( a, n ), \ + _mm_srli_si64( a, 64-(n) ) ) #define mm64_ror_64( a, n ) \ - _mm_or_si64( _mm_srli_si64( (__m64)(a), n ), \ - _mm_slli_si64( (__m64)(a), 64-(n) ) ) + _mm_or_si64( _mm_srli_si64( a, n ), \ + _mm_slli_si64( a, 64-(n) ) ) #define mm64_rol_32( a, n ) \ - _mm_or_si64( _mm_slli_pi32( (__m64)(a), n ), \ - _mm_srli_pi32( (__m64)(a), 32-(n) ) ) + _mm_or_si64( _mm_slli_pi32( a, n ), \ + _mm_srli_pi32( a, 32-(n) ) ) #define mm64_ror_32( a, n ) \ - _mm_or_si64( _mm_srli_pi32( (__m64)(a), n ), \ - _mm_slli_pi32( (__m64)(a), 32-(n) ) ) + _mm_or_si64( _mm_srli_pi32( a, n ), \ + _mm_slli_pi32( a, 32-(n) ) ) #define mm64_rol_16( a, n ) \ - _mm_or_si64( _mm_slli_pi16( (__m64)(a), n ), \ - _mm_srli_pi16( (__m64)(a), 16-(n) ) ) + _mm_or_si64( _mm_slli_pi16( a, n ), \ + _mm_srli_pi16( a, 16-(n) ) ) #define mm64_ror_16( a, n ) \ - _mm_or_si64( _mm_srli_pi16( (__m64)(a), n ), \ - _mm_slli_pi16( (__m64)(a), 16-(n) ) ) + _mm_or_si64( _mm_srli_pi16( a, n ), \ + _mm_slli_pi16( a, 16-(n) ) ) // Rotate packed elements accross lanes. Useful for byte swap and byte // rotation. -// _mm_shuffle_pi8 requires SSSE3 while _mm_shuffle_pi16 requires SSE -// even though these are MMX instructions. - // Swap hi & lo 32 bits. 
-#define mm64_swap32( a ) _mm_shuffle_pi16( (__m64)(a), 0x4e ) +#define mm64_swap_32( a ) _mm_shuffle_pi16( a, 0x4e ) -#define mm64_ror1x16_64( a ) _mm_shuffle_pi16( (__m64)(a), 0x39 ) -#define mm64_rol1x16_64( a ) _mm_shuffle_pi16( (__m64)(a), 0x93 ) +#define mm64_ror64_1x16( a ) _mm_shuffle_pi16( a, 0x39 ) +#define mm64_rol64_1x16( a ) _mm_shuffle_pi16( a, 0x93 ) // Swap hi & lo 16 bits of each 32 bit element -#define mm64_swap16_32( a ) _mm_shuffle_pi16( (__m64)(a), 0xb1 ) +#define mm64_swap32_16( a ) _mm_shuffle_pi16( a, 0xb1 ) #if defined(__SSSE3__) // Endian byte swap packed elements -// A vectorized version of the u64 bswap, use when data already in MMX reg. -#define mm64_bswap_64( v ) \ - _mm_shuffle_pi8( (__m64)v, (__m64)0x0001020304050607 ) - #define mm64_bswap_32( v ) \ - _mm_shuffle_pi8( (__m64)v, (__m64)0x0405060700010203 ) + _mm_shuffle_pi8( v, (__m64)0x0405060700010203 ) #define mm64_bswap_16( v ) \ - _mm_shuffle_pi8( (__m64)v, (__m64)0x0607040502030001 ); + _mm_shuffle_pi8( v, (__m64)0x0607040502030001 ); -#else +// Rotate right by c bytes +static inline __m64 mm64_ror_x8( __m64 v, const int c ) +{ return _mm_alignr_pi8( v, v, c ); } -#define mm64_bswap_64( v ) \ - (__m64)__builtin_bswap64( (uint64_t)v ) +#else -// These exist only for compatibility with CPUs without SSSE3. MMX doesn't -// have extract 32 instruction so pointers are needed to access elements. -// It' more efficient for the caller to use scalar variables and call -// bswap_32 directly. #define mm64_bswap_32( v ) \ _mm_set_pi32( __builtin_bswap32( ((uint32_t*)&v)[1] ), \ __builtin_bswap32( ((uint32_t*)&v)[0] ) ) @@ -115,17 +103,6 @@ #endif -// 64 bit mem functions use integral sizes instead of bytes, data must -// be aligned to 64 bits. -static inline void memcpy_m64( __m64 *dst, const __m64 *src, int n ) -{ for ( int i = 0; i < n; i++ ) dst[i] = src[i]; } - -static inline void memset_zero_m64( __m64 *src, int n ) -{ for ( int i = 0; i < n; i++ ) src[i] = (__m64)0ULL; } - -static inline void memset_m64( __m64 *dst, const __m64 a, int n ) -{ for ( int i = 0; i < n; i++ ) dst[i] = a; } - #endif // MMX #endif // SIMD_64_H__ diff --git a/simd-utils/simd-int.h b/simd-utils/simd-int.h index 711134c8..5fff450f 100644 --- a/simd-utils/simd-int.h +++ b/simd-utils/simd-int.h @@ -1,69 +1,16 @@ #if !defined(SIMD_INT_H__) #define SIMD_INT_H__ 1 -/////////////////////////////////// -// -// Integers up to 128 bits. -// -// These utilities enhance support for integers up to 128 bits. -// All standard operations are supported on 128 bit integers except -// numeric constant representation and IO. 128 bit integers must be built -// and displayed as 2 64 bit halves, just like the old times. -// -// Some utilities are also provided for smaller integers, most notably -// bit rotation. - - - -// MMX has no extract instruction for 32 bit elements so this: -// Lo is trivial, high is a simple shift. -// Input may be uint64_t or __m64, returns uint32_t. -#define u64_extr_lo32(a) ( (uint32_t)( (uint64_t)(a) ) ) -#define u64_extr_hi32(a) ( (uint32_t)( ((uint64_t)(a)) >> 32) ) - -#define u64_extr_32( a, n ) ( (uint32_t)( (a) >> ( ( 2-(n)) <<5 ) ) ) -#define u64_extr_16( a, n ) ( (uint16_t)( (a) >> ( ( 4-(n)) <<4 ) ) ) -#define u64_extr_8( a, n ) ( (uint8_t) ( (a) >> ( ( 8-(n)) <<3 ) ) ) - -// Rotate bits in various sized integers. 
-#define u64_ror_64( x, c ) \ - (uint64_t)( ( (uint64_t)(x) >> (c) ) | ( (uint64_t)(x) << (64-(c)) ) ) -#define u64_rol_64( x, c ) \ - (uint64_t)( ( (uint64_t)(x) << (c) ) | ( (uint64_t)(x) >> (64-(c)) ) ) -#define u32_ror_32( x, c ) \ - (uint32_t)( ( (uint32_t)(x) >> (c) ) | ( (uint32_t)(x) << (32-(c)) ) ) -#define u32_rol_32( x, c ) \ - (uint32_t)( ( (uint32_t)(x) << (c) ) | ( (uint32_t)(x) >> (32-(c)) ) ) -#define u16_ror_16( x, c ) \ - (uint16_t)( ( (uint16_t)(x) >> (c) ) | ( (uint16_t)(x) << (16-(c)) ) ) -#define u16_rol_16( x, c ) \ - (uint16_t)( ( (uint16_t)(x) << (c) ) | ( (uint16_t)(x) >> (16-(c)) ) ) -#define u8_ror_8( x, c ) \ - (uint8_t) ( ( (uint8_t) (x) >> (c) ) | ( (uint8_t) (x) << ( 8-(c)) ) ) -#define u8_rol_8( x, c ) \ - (uint8_t) ( ( (uint8_t) (x) << (c) ) | ( (uint8_t) (x) >> ( 8-(c)) ) ) - // Endian byte swap #define bswap_64( a ) __builtin_bswap64( a ) #define bswap_32( a ) __builtin_bswap32( a ) -// 64 bit mem functions use integral sizes instead of bytes, data must -// be aligned to 64 bits. Mostly for scaled indexing convenience. -static inline void memcpy_64( uint64_t *dst, const uint64_t *src, int n ) -{ for ( int i = 0; i < n; i++ ) dst[i] = src[i]; } - -static inline void memset_zero_64( uint64_t *src, int n ) -{ for ( int i = 0; i < n; i++ ) src[i] = 0ull; } - -static inline void memset_64( uint64_t *dst, const uint64_t a, int n ) -{ for ( int i = 0; i < n; i++ ) dst[i] = a; } - /////////////////////////////////////// // // 128 bit integers // -// 128 bit integers are inneficient and not a shortcut for __m128i. +// 128 bit integers are inneficient and not a shortcut for __m128i. // Native type __int128 supported starting with GCC-4.8. // // __int128 uses two 64 bit GPRs to hold the data. The main benefits are @@ -94,31 +41,12 @@ static inline void memset_64( uint64_t *dst, const uint64_t a, int n ) typedef __int128 int128_t; typedef unsigned __int128 uint128_t; - - -// Maybe usefull for making constants. -#define mk_uint128( hi, lo ) \ - ( ( (uint128_t)(hi) << 64 ) | ( (uint128_t)(lo) ) ) - - // Extracting the low bits is a trivial cast. // These specialized functions are optimized while providing a // consistent interface. #define u128_hi64( x ) ( (uint64_t)( (uint128_t)(x) >> 64 ) ) #define u128_lo64( x ) ( (uint64_t)(x) ) -// Generic extract, don't use for extracting low bits, cast instead. -#define u128_extr_64( a, n ) ( (uint64_t)( (a) >> ( ( 2-(n)) <<6 ) ) ) -#define u128_extr_32( a, n ) ( (uint32_t)( (a) >> ( ( 4-(n)) <<5 ) ) ) -#define u128_extr_16( a, n ) ( (uint16_t)( (a) >> ( ( 8-(n)) <<4 ) ) ) -#define u128_extr_8( a, n ) ( (uint8_t) ( (a) >> ( (16-(n)) <<3 ) ) ) - -// Not much need for this but it fills a gap. 
-#define u128_ror_128( x, c ) \ - ( ( (uint128_t)(x) >> (c) ) | ( (uint128_t)(x) << (128-(c)) ) ) -#define u128_rol_128( x, c ) \ - ( ( (uint128_t)(x) << (c) ) | ( (uint128_t)(x) >> (128-(c)) ) ) - #endif // GCC_INT128 #endif // SIMD_INT_H__ From d0b494132164c681999b9b61a81f56a1e5f3fe4e Mon Sep 17 00:00:00 2001 From: Jay D Dee Date: Fri, 19 Mar 2021 15:45:32 -0400 Subject: [PATCH 03/20] v3.16.0 --- Makefile.am | 6 +- RELEASE_NOTES | 4 + algo-gate-api.c | 1 + algo/sha/sph_sha2.c | 8 + algo/sha/sph_sha2.h | 4 + algo/verthash/.verthash-gate.c.swp | Bin 0 -> 16384 bytes algo/verthash/Verthash.c | 621 +++++++++++++++++++++++++++++ algo/verthash/Verthash.h | 61 +++ algo/verthash/fopen_utf8.c | 181 +++++++++ algo/verthash/fopen_utf8.h | 25 ++ algo/verthash/tiny_sha3/sha3.c | 191 +++++++++ algo/verthash/tiny_sha3/sha3.h | 55 +++ algo/verthash/verthash-gate.c | 96 +++++ configure | 20 +- configure.ac | 2 +- cpu-miner.c | 12 +- miner.h | 3 + verthash-help.txt | 17 + winbuild-cross.sh | 1 + 19 files changed, 1290 insertions(+), 18 deletions(-) create mode 100644 algo/verthash/.verthash-gate.c.swp create mode 100644 algo/verthash/Verthash.c create mode 100644 algo/verthash/Verthash.h create mode 100644 algo/verthash/fopen_utf8.c create mode 100644 algo/verthash/fopen_utf8.h create mode 100644 algo/verthash/tiny_sha3/sha3.c create mode 100644 algo/verthash/tiny_sha3/sha3.h create mode 100644 algo/verthash/verthash-gate.c create mode 100644 verthash-help.txt diff --git a/Makefile.am b/Makefile.am index c3a999d2..f4163820 100644 --- a/Makefile.am +++ b/Makefile.am @@ -129,7 +129,7 @@ cpuminer_SOURCES = \ algo/lyra2/allium.c \ algo/lyra2/phi2-4way.c \ algo/lyra2/phi2.c \ - algo//m7m/m7m.c \ + algo/m7m/m7m.c \ algo/m7m/magimath.cpp \ algo/nist5/nist5-gate.c \ algo/nist5/nist5-4way.c \ @@ -192,6 +192,10 @@ cpuminer_SOURCES = \ algo/sm3/sm3-hash-4way.c \ algo/swifftx/swifftx.c \ algo/tiger/sph_tiger.c \ + algo/verthash/verthash-gate.c \ + algo/verthash/Verthash.c \ + algo/verthash/fopen_utf8.c \ + algo/verthash/tiny_sha3/sha3.c \ algo/whirlpool/sph_whirlpool.c \ algo/whirlpool/whirlpool-hash-4way.c \ algo/whirlpool/whirlpool-gate.c \ diff --git a/RELEASE_NOTES b/RELEASE_NOTES index 732b5e64..e6c7f14a 100644 --- a/RELEASE_NOTES +++ b/RELEASE_NOTES @@ -65,6 +65,10 @@ If not what makes it happen or not happen? Change Log ---------- +v3.16.0 + +Added verthash algo. + v3.15.7 Added accepted/stale/rejected percentage to summary log report. 
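
The diffs below wire verthash into the algo gate and add a one-shot SHA-256 helper, sph_sha256_full, wrapping the usual init/update/close sequence. A minimal usage sketch, assuming the header path shown in the diff; the wrapper and buffer names are illustrative:

    #include "algo/sha/sph_sha2.h"

    static void sha256_once( uint8_t digest[32], const uint8_t *data, size_t len )
    {
       /* One call replaces sph_sha256_init / sph_sha256 / sph_sha256_close. */
       sph_sha256_full( digest, data, len );
    }
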
diff --git a/algo-gate-api.c b/algo-gate-api.c index 6f273ccf..2fea7afe 100644 --- a/algo-gate-api.c +++ b/algo-gate-api.c @@ -349,6 +349,7 @@ bool register_algo_gate( int algo, algo_gate_t *gate ) case ALGO_TRIBUS: register_tribus_algo ( gate ); break; case ALGO_VANILLA: register_vanilla_algo ( gate ); break; case ALGO_VELTOR: register_veltor_algo ( gate ); break; + case ALGO_VERTHASH: register_verthash_algo ( gate ); break; case ALGO_WHIRLPOOL: register_whirlpool_algo ( gate ); break; case ALGO_WHIRLPOOLX: register_whirlpoolx_algo ( gate ); break; case ALGO_X11: register_x11_algo ( gate ); break; diff --git a/algo/sha/sph_sha2.c b/algo/sha/sph_sha2.c index 513a29fd..e96a2d1c 100644 --- a/algo/sha/sph_sha2.c +++ b/algo/sha/sph_sha2.c @@ -691,6 +691,14 @@ sph_sha256_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) // sph_sha256_init(cc); } +void sph_sha256_full( void *dst, const void *data, size_t len ) +{ + sph_sha256_context cc; + sph_sha256_init( &cc ); + sph_sha256( &cc, data, len ); + sph_sha256_close( &cc, dst ); +} + /* see sph_sha2.h */ //void //sph_sha224_comp(const sph_u32 msg[16], sph_u32 val[8]) diff --git a/algo/sha/sph_sha2.h b/algo/sha/sph_sha2.h index df0e8369..e3a83eb8 100644 --- a/algo/sha/sph_sha2.h +++ b/algo/sha/sph_sha2.h @@ -205,6 +205,10 @@ void sph_sha256_comp(const sph_u32 msg[16], sph_u32 val[8]); #define sph_sha256_comp sph_sha224_comp #endif +void sph_sha256_full( void *dst, const void *data, size_t len ); + + + #if SPH_64 /** diff --git a/algo/verthash/.verthash-gate.c.swp b/algo/verthash/.verthash-gate.c.swp new file mode 100644 index 0000000000000000000000000000000000000000..4c59aa9e76706b9dc6b924ebfa9e701366767038 GIT binary patch literal 16384 zcmeHN%a0tz881v?NC@VUa6kyDycVWsnVFr(&g?8^SF!-ZDrPMiVc_5^m6KBxh!i2ji35pph{!3R905fULY$D`hTwpRP$bAd5JgcWeqZ%-cE_xJLvgkA ztDdf|`o5~)_o(UW9=6VJJjDmiHp6izW4HbL-1Dy=`iQ;sK#N&{>r?eXN3;`lRt{=} z`@bZCOSasS-jX#@)_qvu>x|A_|Tz9OXZ~`=?Y0vrQ?D|L9Ie*;F+hx`Iq#4i*Xa+O`ngPv#W5q>^-_uqp9(;4i=*fj2Or7r6L<>P!lud#fC-!fHh`1BD)2RI@+aVL-?hQQ=Nw`>^@5Rc`pgsN<7dy- z`NApTdeY``z=h`p7QU6>6FCXO9qt4nKc9VD!EF(Xh1v-=N946teYfJhGQ@Om&2?PQ zQFR@?RnvVUu2;a-+l%tnfj`rx`v!9ro0WpU|G%zvP6zm*^diY!hZ|et(}A$h$|&(- zzP83YHCB?aPuYxHa|aQ?eKCQgh`aul@Lap%>+{673-L=!{7K>4kWdJfVTjPN0$WBj zh+QgZpme@MMS4_5_3X*Bn;V-?ARG%J6maRtwq)o!E+W$nd>+MOq{eBxkr*O!kCr3N zbwGSY4X!Gh(x5*`3qGgfj!Fy_@eoweux}0ie$8M0^UCdJ>;{nZun}Mm1g|Q@40w46b~E5|}la8hmSvJ|gc_ zC;js!)ft=7Jaduv+VGvANoIK(d~>ZE|oFn z`fltZP+u&1=7PfNrV9h2aAV_m)uyFim}Fg((t)YcfwD1l=dDX_h&q5i zEo`humhkBVL=J-)TY+mY){f_OX5N>Kh>sb~ScD@Pn~tzzBnc#{UccEo!PqVv27$*z zIdY>|hDAS{HH$Pf8JTwA zORD4zST~Gf(+_-0swP+X_Ly>@fxC}zPvkfG@#FmHb`V}NLy5!_#uwsoXu9^rW_Ddh zf&b7$N)(1r$vsgroVc+W1ztkg%itue&S^w-KG}h^Qo^3W52I}{HM<=eQxR*sG({ZB ziB+C9NtTgGoTlU=EzrP+(znKim!}LEYD!S4uV|kBFs#6j zV!prYpoOlIsLumCH+wd|{IW~{g;Nz(lve8GRdgs}Gj>#Sw}UhXBPegc0*)?rM(UrOYJW1KF zvF_Tc0i`i0iD9m_eGZoKmN%(*7{rLnsVl-U+ZA1fz>2p*bm8E$S{HNMuPGmEtf#K9 zw-K2LLdXpp>vS>OfLOdRxX4XY#Bt~j6U3Wo7|1i^Dn@^~R!e=c7+h+nPV1mPyTtUI zg9P=LL0dIU>saQ)#LBov*5ZpKHoR7J_3QD;=$ZcvunDXKj{vHD)>fZ11DXNNfM!55pc&8%Xa+O`ngPv#Wj=CVQoB&IyF=AY>|+0unpF1Q=e^9?voW6dm;7Mc n=jA)8hxr5cLcUF@-ssTwI=$?*fileName = NULL; + info->data = NULL; + info->dataSize = 0; + info->bitmask = 0; + + // get name + if (file_name == NULL) { return 1; } + size_t fileNameLen = strlen(file_name); + if (fileNameLen == 0) { return 1; } + + info->fileName = (char*)malloc(fileNameLen+1); + if (!info->fileName) + { + // Memory allocation fatal error. 
+ return 2; + } + + memset(info->fileName, 0, fileNameLen+1); + memcpy(info->fileName, file_name, fileNameLen); + + // Load data + FILE *fileMiningData = fopen_utf8(info->fileName, "rb"); + // Failed to open file for reading + if (!fileMiningData) { return 1; } + + // Get file size + fseek(fileMiningData, 0, SEEK_END); + uint64_t fileSize = (uint64_t)ftell(fileMiningData); + fseek(fileMiningData, 0, SEEK_SET); + + // Allocate data + info->data = (uint8_t *)malloc(fileSize); + if (!info->data) + { + fclose(fileMiningData); + // Memory allocation fatal error. + return 2; + } + + // Load data + fread(info->data, fileSize, 1, fileMiningData); + fclose(fileMiningData); + + // Update fields + info->bitmask = ((fileSize - VH_HASH_OUT_SIZE)/VH_BYTE_ALIGNMENT) + 1; + info->dataSize = fileSize; + + return 0; +} + +//----------------------------------------------------------------------------- +void verthash_info_free(verthash_info_t* info) +{ + free(info->fileName); + free(info->data); + info->dataSize = 0; + info->bitmask = 0; +} + + +//----------------------------------------------------------------------------- +// Verthash hash +#define VH_P0_SIZE 64 +#define VH_N_ITER 8 +#define VH_N_SUBSET VH_P0_SIZE*VH_N_ITER +#define VH_N_ROT 32 +#define VH_N_INDEXES 4096 +#define VH_BYTE_ALIGNMENT 16 + +static __thread sha3_ctx_t sha3_midstate_ctx; + +void verthash_sha3_prehash_72( const void *data ) +{ + sha3_init( &sha3_midstate_ctx, 256 ); + sha3_update( &sha3_midstate_ctx, data, 72 ); +} + +void verthash_sha3_final_8( sha3_ctx_t *ctx, void *out, const void *data ) +{ + sha3_update( ctx, data, 8 ); + sha3_final( out, ctx ); +} + +static inline uint32_t fnv1a(const uint32_t a, const uint32_t b) +{ + return (a ^ b) * 0x1000193; +} + +void verthash_hash(const unsigned char* blob_bytes, + const size_t blob_size, + const unsigned char(*input)[VH_HEADER_SIZE], + unsigned char(*output)[VH_HASH_OUT_SIZE]) +{ + unsigned char p1[VH_HASH_OUT_SIZE]; +// sha3_ctx_t sha3_ctx; +// memcpy ( &sha3_ctx, &sha3_midstate_ctx, sizeof sha3_ctx ); +// verthash_sha3_final_8( &sha3_ctx, &p1[0], &input[72] ); + + sha3(&input[0], VH_HEADER_SIZE, &p1[0], VH_HASH_OUT_SIZE); + + unsigned char p0[VH_N_SUBSET]; + + unsigned char input_header[VH_HEADER_SIZE]; + memcpy(input_header, input, VH_HEADER_SIZE); + + for (size_t i = 0; i < VH_N_ITER; ++i) + { + input_header[0] += 1; + sha3(&input_header[0], VH_HEADER_SIZE, p0 + i * VH_P0_SIZE, VH_P0_SIZE); + } + + uint32_t* p0_index = (uint32_t*)p0; + uint32_t seek_indexes[VH_N_INDEXES]; + + for (size_t x = 0; x < VH_N_ROT; ++x) + { + memcpy( seek_indexes + x * (VH_N_SUBSET / sizeof(uint32_t)), + p0, VH_N_SUBSET); + for (size_t y = 0; y < VH_N_SUBSET / sizeof(uint32_t); ++y) + { + *(p0_index + y) = ( *(p0_index + y) << 1 ) + | ( 1 & (*(p0_index + y) >> 31) ); + } + } + + uint32_t* p1_32 = (uint32_t*)p1; + uint32_t* blob_bytes_32 = (uint32_t*)blob_bytes; + uint32_t value_accumulator = 0x811c9dc5; + const uint32_t mdiv = ((blob_size - VH_HASH_OUT_SIZE) / VH_BYTE_ALIGNMENT) + 1; + for (size_t i = 0; i < VH_N_INDEXES; i++) + { + const uint32_t offset = (fnv1a(seek_indexes[i], value_accumulator) % mdiv) * VH_BYTE_ALIGNMENT / sizeof(uint32_t); + for (size_t i2 = 0; i2 < VH_HASH_OUT_SIZE / sizeof(uint32_t); i2++) + { + const uint32_t value = *(blob_bytes_32 + offset + i2); + uint32_t* p1_ptr = p1_32 + i2; + *p1_ptr = fnv1a(*p1_ptr, value); + + value_accumulator = fnv1a(value_accumulator, value); + } + } + + memcpy(output, p1, VH_HASH_OUT_SIZE); +} + 
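
For readers of verthash_hash() above, the magic numbers and the offset modulus are not arbitrary:

    fnv1a( h, x ) = (h ^ x) * 0x01000193     /* 16777619, the 32 bit FNV-1a prime */
    value_accumulator seed 0x811c9dc5        /* the 32 bit FNV-1a offset basis    */
    mdiv = ((blob_size - VH_HASH_OUT_SIZE) / VH_BYTE_ALIGNMENT) + 1
         = the same expression verthash_info_load() stores in info->bitmask

so each fnv1a-selected, 16 byte aligned read stays inside the mining data file.
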
+//----------------------------------------------------------------------------- +// Verthash data file generator + +#define NODE_SIZE 32 + +struct Graph +{ + FILE *db; + int64_t log2; + int64_t pow2; + uint8_t *pk; + int64_t index; +}; + +int64_t Log2(int64_t x) +{ + int64_t r = 0; + for (; x > 1; x >>= 1) + { + r++; + } + + return r; +} + +int64_t bfsToPost(struct Graph *g, const int64_t node) +{ + return node & ~g->pow2; +} + +int64_t numXi(int64_t index) +{ + return (1 << ((uint64_t)index)) * (index + 1) * index; +} + +void WriteId(struct Graph *g, uint8_t *Node, const int64_t id) +{ + fseek(g->db, id * NODE_SIZE, SEEK_SET); + fwrite(Node, 1, NODE_SIZE, g->db); +} + +void WriteNode(struct Graph *g, uint8_t *Node, const int64_t id) +{ + const int64_t idx = bfsToPost(g, id); + WriteId(g, Node, idx); +} + +void NewNode(struct Graph *g, const int64_t id, uint8_t *hash) +{ + WriteNode(g, hash, id); +} + +uint8_t *GetId(struct Graph *g, const int64_t id) +{ + fseek(g->db, id * NODE_SIZE, SEEK_SET); + uint8_t *node = (uint8_t *)malloc(NODE_SIZE); + const size_t bytes_read = fread(node, 1, NODE_SIZE, g->db); + if(bytes_read != NODE_SIZE) { + return NULL; + } + return node; +} + +uint8_t *GetNode(struct Graph *g, const int64_t id) +{ + const int64_t idx = bfsToPost(g, id); + return GetId(g, idx); +} + +uint32_t WriteVarInt(uint8_t *buffer, int64_t val) +{ + memset(buffer, 0, NODE_SIZE); + uint64_t uval = ((uint64_t)(val)) << 1; + if (val < 0) + { + uval = ~uval; + } + uint32_t i = 0; + while (uval >= 0x80) + { + buffer[i] = (uint8_t)uval | 0x80; + uval >>= 7; + i++; + } + buffer[i] = (uint8_t)uval; + return i; +} + +void ButterflyGraph(struct Graph *g, int64_t index, int64_t *count) +{ + if (index == 0) + { + index = 1; + } + + int64_t numLevel = 2 * index; + int64_t perLevel = (int64_t)(1 << (uint64_t)index); + int64_t begin = *count - perLevel; + int64_t level, i; + + for (level = 1; level < numLevel; level++) + { + for (i = 0; i < perLevel; i++) + { + int64_t prev; + int64_t shift = index - level; + if (level > numLevel / 2) + { + shift = level - numLevel / 2; + } + if (((i >> (uint64_t)shift) & 1) == 0) + { + prev = i + (1 << (uint64_t)shift); + } + else + { + prev = i - (1 << (uint64_t)shift); + } + + uint8_t *parent0 = GetNode(g, begin + (level - 1) * perLevel + prev); + uint8_t *parent1 = GetNode(g, *count - perLevel); + uint8_t *buf = (uint8_t *)malloc(NODE_SIZE); + WriteVarInt(buf, *count); + uint8_t *hashInput = (uint8_t *)malloc(NODE_SIZE * 4); + memcpy(hashInput, g->pk, NODE_SIZE); + memcpy(hashInput + NODE_SIZE, buf, NODE_SIZE); + memcpy(hashInput + (NODE_SIZE * 2), parent0, NODE_SIZE); + memcpy(hashInput + (NODE_SIZE * 3), parent1, NODE_SIZE); + + uint8_t *hashOutput = (uint8_t *)malloc(NODE_SIZE); + sha3(hashInput, NODE_SIZE * 4, hashOutput, NODE_SIZE); + + NewNode(g, *count, hashOutput); + (*count)++; + + free(hashOutput); + free(hashInput); + free(parent0); + free(parent1); + free(buf); + } + } +} + +void XiGraphIter(struct Graph *g, int64_t index) +{ + int64_t count = g->pow2; + + int8_t stackSize = 5; + int64_t *stack = (int64_t *)malloc(sizeof(int64_t) * stackSize); + for (int i = 0; i < 5; i++) + stack[i] = index; + + int8_t graphStackSize = 5; + int32_t *graphStack = (int32_t *)malloc(sizeof(int32_t) * graphStackSize); + for (int i = 0; i < 5; i++) + graphStack[i] = graphStackSize - i - 1; + + int64_t i = 0; + int64_t graph = 0; + int64_t pow2index = 1 << ((uint64_t)index); + + for (i = 0; i < pow2index; i++) + { + uint8_t *buf = (uint8_t *)malloc(NODE_SIZE); + 
WriteVarInt(buf, count); + uint8_t *hashInput = (uint8_t *)malloc(NODE_SIZE * 2); + memcpy(hashInput, g->pk, NODE_SIZE); + memcpy(hashInput + NODE_SIZE, buf, NODE_SIZE); + uint8_t *hashOutput = (uint8_t *)malloc(NODE_SIZE); + + sha3(hashInput, NODE_SIZE * 2, hashOutput, NODE_SIZE); + NewNode(g, count, hashOutput); + count++; + + free(hashOutput); + free(hashInput); + free(buf); + } + + if (index == 1) + { + ButterflyGraph(g, index, &count); + return; + } + + while (stackSize != 0 && graphStackSize != 0) + { + + index = stack[stackSize - 1]; + graph = graphStack[graphStackSize - 1]; + + stackSize--; + if (stackSize > 0) + { + int64_t *tempStack = (int64_t *)malloc(sizeof(int64_t) * (stackSize)); + memcpy(tempStack, stack, sizeof(int64_t) * (stackSize)); + free(stack); + stack = tempStack; + } + + graphStackSize--; + if (graphStackSize > 0) + { + int32_t *tempGraphStack = (int32_t *)malloc(sizeof(int32_t) * (graphStackSize)); + memcpy(tempGraphStack, graphStack, sizeof(int32_t) * (graphStackSize)); + free(graphStack); + graphStack = tempGraphStack; + } + + int8_t indicesSize = 5; + int64_t *indices = (int64_t *)malloc(sizeof(int64_t) * indicesSize); + for (int i = 0; i < indicesSize; i++) + indices[i] = index - 1; + + int8_t graphsSize = 5; + int32_t *graphs = (int32_t *)malloc(sizeof(int32_t) * graphsSize); + for (int i = 0; i < graphsSize; i++) + graphs[i] = graphsSize - i - 1; + + int64_t pow2indexInner = 1 << ((uint64_t)index); + int64_t pow2indexInner_1 = 1 << ((uint64_t)index - 1); + + if (graph == 0) + { + uint64_t sources = count - pow2indexInner; + for (i = 0; i < pow2indexInner_1; i++) + { + uint8_t *parent0 = GetNode(g, sources + i); + uint8_t *parent1 = GetNode(g, sources + i + pow2indexInner_1); + + uint8_t *buf = (uint8_t *)malloc(NODE_SIZE); + WriteVarInt(buf, count); + + uint8_t *hashInput = (uint8_t *)malloc(NODE_SIZE * 4); + memcpy(hashInput, g->pk, NODE_SIZE); + memcpy(hashInput + NODE_SIZE, buf, NODE_SIZE); + memcpy(hashInput + (NODE_SIZE * 2), parent0, NODE_SIZE); + memcpy(hashInput + (NODE_SIZE * 3), parent1, NODE_SIZE); + + uint8_t *hashOutput = (uint8_t *)malloc(NODE_SIZE); + sha3(hashInput, NODE_SIZE * 4, hashOutput, NODE_SIZE); + + NewNode(g, count, hashOutput); + count++; + + free(hashOutput); + free(hashInput); + free(parent0); + free(parent1); + free(buf); + } + } + else if (graph == 1) + { + uint64_t firstXi = count; + for (i = 0; i < pow2indexInner_1; i++) + { + uint64_t nodeId = firstXi + i; + uint8_t *parent = GetNode(g, firstXi - pow2indexInner_1 + i); + + uint8_t *buf = (uint8_t *)malloc(NODE_SIZE); + WriteVarInt(buf, nodeId); + + uint8_t *hashInput = (uint8_t *)malloc(NODE_SIZE * 3); + memcpy(hashInput, g->pk, NODE_SIZE); + memcpy(hashInput + NODE_SIZE, buf, NODE_SIZE); + memcpy(hashInput + (NODE_SIZE * 2), parent, NODE_SIZE); + + uint8_t *hashOutput = (uint8_t *)malloc(NODE_SIZE); + sha3(hashInput, NODE_SIZE * 3, hashOutput, NODE_SIZE); + + NewNode(g, count, hashOutput); + count++; + + free(hashOutput); + free(hashInput); + free(parent); + free(buf); + } + } + else if (graph == 2) + { + uint64_t secondXi = count; + for (i = 0; i < pow2indexInner_1; i++) + { + uint64_t nodeId = secondXi + i; + uint8_t *parent = GetNode(g, secondXi - pow2indexInner_1 + i); + + uint8_t *buf = (uint8_t *)malloc(NODE_SIZE); + WriteVarInt(buf, nodeId); + + uint8_t *hashInput = (uint8_t *)malloc(NODE_SIZE * 3); + memcpy(hashInput, g->pk, NODE_SIZE); + memcpy(hashInput + NODE_SIZE, buf, NODE_SIZE); + memcpy(hashInput + (NODE_SIZE * 2), parent, NODE_SIZE); + + uint8_t *hashOutput 
= (uint8_t *)malloc(NODE_SIZE); + sha3(hashInput, NODE_SIZE * 3, hashOutput, NODE_SIZE); + + NewNode(g, count, hashOutput); + count++; + + free(hashOutput); + free(hashInput); + free(parent); + free(buf); + } + } + else if (graph == 3) + { + uint64_t secondButter = count; + for (i = 0; i < pow2indexInner_1; i++) + { + uint64_t nodeId = secondButter + i; + uint8_t *parent = GetNode(g, secondButter - pow2indexInner_1 + i); + + uint8_t *buf = (uint8_t *)malloc(NODE_SIZE); + WriteVarInt(buf, nodeId); + + uint8_t *hashInput = (uint8_t *)malloc(NODE_SIZE * 3); + memcpy(hashInput, g->pk, NODE_SIZE); + memcpy(hashInput + NODE_SIZE, buf, NODE_SIZE); + memcpy(hashInput + (NODE_SIZE * 2), parent, NODE_SIZE); + + uint8_t *hashOutput = (uint8_t *)malloc(NODE_SIZE); + sha3(hashInput, NODE_SIZE * 3, hashOutput, NODE_SIZE); + + NewNode(g, count, hashOutput); + count++; + + free(hashOutput); + free(hashInput); + free(parent); + free(buf); + } + } + else + { + uint64_t sinks = count; + uint64_t sources = sinks + pow2indexInner - numXi(index); + for (i = 0; i < pow2indexInner_1; i++) + { + uint64_t nodeId0 = sinks + i; + uint64_t nodeId1 = sinks + i + pow2indexInner_1; + uint8_t *parent0 = GetNode(g, sinks - pow2indexInner_1 + i); + uint8_t *parent1_0 = GetNode(g, sources + i); + uint8_t *parent1_1 = GetNode(g, sources + i + pow2indexInner_1); + + uint8_t *buf = (uint8_t *)malloc(NODE_SIZE); + WriteVarInt(buf, nodeId0); + + uint8_t *hashInput = (uint8_t *)malloc(NODE_SIZE * 4); + memcpy(hashInput, g->pk, NODE_SIZE); + memcpy(hashInput + NODE_SIZE, buf, NODE_SIZE); + memcpy(hashInput + (NODE_SIZE * 2), parent0, NODE_SIZE); + memcpy(hashInput + (NODE_SIZE * 3), parent1_0, NODE_SIZE); + + uint8_t *hashOutput0 = (uint8_t *)malloc(NODE_SIZE); + sha3(hashInput, NODE_SIZE * 4, hashOutput0, NODE_SIZE); + + WriteVarInt(buf, nodeId1); + + memcpy(hashInput, g->pk, NODE_SIZE); + memcpy(hashInput + NODE_SIZE, buf, NODE_SIZE); + memcpy(hashInput + (NODE_SIZE * 2), parent0, NODE_SIZE); + memcpy(hashInput + (NODE_SIZE * 3), parent1_1, NODE_SIZE); + + uint8_t *hashOutput1 = (uint8_t *)malloc(NODE_SIZE); + sha3(hashInput, NODE_SIZE * 4, hashOutput1, NODE_SIZE); + + NewNode(g, nodeId0, hashOutput0); + NewNode(g, nodeId1, hashOutput1); + count += 2; + + free(parent0); + free(parent1_0); + free(parent1_1); + free(buf); + free(hashInput); + free(hashOutput0); + free(hashOutput1); + } + } + + if ((graph == 0 || graph == 3) || + ((graph == 1 || graph == 2) && index == 2)) + { + ButterflyGraph(g, index - 1, &count); + } + else if (graph == 1 || graph == 2) + { + + int64_t *tempStack = (int64_t *)malloc(sizeof(int64_t) * (stackSize + indicesSize)); + memcpy(tempStack, stack, stackSize * sizeof(int64_t)); + memcpy(tempStack + stackSize, indices, indicesSize * sizeof(int64_t)); + stackSize += indicesSize; + free(stack); + stack = tempStack; + + int32_t *tempGraphStack = (int32_t *)malloc(sizeof(int32_t) * (graphStackSize + graphsSize)); + memcpy(tempGraphStack, graphStack, graphStackSize * sizeof(int32_t)); + memcpy(tempGraphStack + graphStackSize, graphs, graphsSize * sizeof(int32_t)); + graphStackSize += graphsSize; + free(graphStack); + graphStack = tempGraphStack; + } + + free(indices); + free(graphs); + } + + free(stack); + free(graphStack); +} + +struct Graph *NewGraph(int64_t index, const char* targetFile, uint8_t *pk) +{ + uint8_t exists = 0; + FILE *db; + if ((db = fopen_utf8(targetFile, "r")) != NULL) + { + fclose(db); + exists = 1; + } + + db = fopen_utf8(targetFile, "wb+"); + int64_t size = numXi(index); + int64_t log2 = 
Log2(size) + 1; + int64_t pow2 = 1 << ((uint64_t)log2); + + struct Graph *g = (struct Graph *)malloc(sizeof(struct Graph)); + g->db = db; + g->log2 = log2; + g->pow2 = pow2; + g->pk = pk; + g->index = index; + + if (exists == 0) + { + XiGraphIter(g, index); + } + + fclose(db); + return g; +} + +//----------------------------------------------------------------------------- +int verthash_generate_data_file(const char* output_file_name) +{ + const char *hashInput = "Verthash Proof-of-Space Datafile"; + uint8_t *pk = (uint8_t*)malloc(NODE_SIZE); + sha3(hashInput, 32, pk, NODE_SIZE); + + int64_t index = 17; + NewGraph(index, output_file_name, pk); + + return 0; +} + diff --git a/algo/verthash/Verthash.h b/algo/verthash/Verthash.h new file mode 100644 index 00000000..5eac0a4c --- /dev/null +++ b/algo/verthash/Verthash.h @@ -0,0 +1,61 @@ +/* + * Copyright 2018-2021 CryptoGraphics + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. See LICENSE for more details. + */ + +#ifndef Verthash_INCLUDE_ONCE +#define Verthash_INCLUDE_ONCE + +#include "tiny_sha3/sha3.h" +#include "fopen_utf8.h" + +#include +#include +#include +#include + +// Verthash constants used to compute bitmask, used inside kernel during IO pass +#define VH_HASH_OUT_SIZE 32 +#define VH_BYTE_ALIGNMENT 16 +#define VH_HEADER_SIZE 80 + +//----------------------------------------------------------------------------- +// Verthash data +//! Verthash C api for data maniputation. +typedef struct VerthashInfo +{ + char* fileName; + uint8_t* data; + uint64_t dataSize; + uint32_t bitmask; +} verthash_info_t; + +//! Must be called before usage. Reset all fields and set a mining data file name. +//! Error codes +//! 0 - Success(No error). +//! 1 - File name is invalid. +//! 2 - Memory allocation error +int verthash_info_init(verthash_info_t* info, const char* file_name); + +//! Reset all fields and free allocated data. +void verthash_info_free(verthash_info_t* info); + +//! Generate verthash data file and save it to specified location. 
+int verthash_generate_data_file(const char* output_file_name); + +void verthash_sha3_prehash_72( const void *data ); + +void verthash_sha3_final_8( sha3_ctx_t *ctx, void *out, const void *data ); + +void verthash_hash(const unsigned char* blob_bytes, + const size_t blob_size, + const unsigned char(*input)[VH_HEADER_SIZE], + unsigned char(*output)[VH_HASH_OUT_SIZE]); + + +#endif // !Verthash_INCLUDE_ONCE + diff --git a/algo/verthash/fopen_utf8.c b/algo/verthash/fopen_utf8.c new file mode 100644 index 00000000..e2bd4b1d --- /dev/null +++ b/algo/verthash/fopen_utf8.c @@ -0,0 +1,181 @@ +#ifndef H_FOPEN_UTF8 +#define H_FOPEN_UTF8 + +#include "fopen_utf8.h" +#include +#include +#include +#include + +int utf8_char_size(const uint8_t *c) +{ + const uint8_t m0x = 0x80, c0x = 0x00, + m10x = 0xC0, c10x = 0x80, + m110x = 0xE0, c110x = 0xC0, + m1110x = 0xF0, c1110x = 0xE0, + m11110x = 0xF8, c11110x = 0xF0; + + if ((c[0] & m0x) == c0x) + return 1; + + if ((c[0] & m110x) == c110x) + if ((c[1] & m10x) == c10x) + return 2; + + if ((c[0] & m1110x) == c1110x) + if ((c[1] & m10x) == c10x) + if ((c[2] & m10x) == c10x) + return 3; + + if ((c[0] & m11110x) == c11110x) + if ((c[1] & m10x) == c10x) + if ((c[2] & m10x) == c10x) + if ((c[3] & m10x) == c10x) + return 4; + + if ((c[0] & m10x) == c10x) // not a first UTF-8 byte + return 0; + + return -1; // if c[0] is a first byte but the other bytes don't match +} + +uint32_t utf8_to_unicode32(const uint8_t *c, size_t *index) +{ + uint32_t v; + int size; + const uint8_t m6 = 63, m5 = 31, m4 = 15, m3 = 7; + + if (c==NULL) + return 0; + + size = utf8_char_size(c); + + if (size > 0 && index) + *index += size-1; + + switch (size) + { + case 1: + v = c[0]; + break; + case 2: + v = c[0] & m5; + v = v << 6 | (c[1] & m6); + break; + case 3: + v = c[0] & m4; + v = v << 6 | (c[1] & m6); + v = v << 6 | (c[2] & m6); + break; + case 4: + v = c[0] & m3; + v = v << 6 | (c[1] & m6); + v = v << 6 | (c[2] & m6); + v = v << 6 | (c[3] & m6); + break; + case 0: // not a first UTF-8 byte + case -1: // corrupt UTF-8 letter + default: + v = -1; + break; + } + + return v; +} + +int codepoint_utf16_size(uint32_t c) +{ + if (c < 0x10000) return 1; + if (c < 0x110000) return 2; + + return 0; +} + +uint16_t *sprint_utf16(uint16_t *str, uint32_t c) // str must be able to hold 1 to 3 entries and will be null-terminated by this function +{ + int c_size; + + if (str==NULL) + return NULL; + + c_size = codepoint_utf16_size(c); + + switch (c_size) + { + case 1: + str[0] = c; + if (c > 0) + str[1] = '\0'; + break; + + case 2: + c -= 0x10000; + str[0] = 0xD800 + (c >> 10); + str[1] = 0xDC00 + (c & 0x3FF); + str[2] = '\0'; + break; + + default: + str[0] = '\0'; + } + + return str; +} + +size_t strlen_utf8_to_utf16(const uint8_t *str) +{ + size_t i, count; + uint32_t c; + + for (i=0, count=0; ; i++) + { + if (str[i]==0) + return count; + + c = utf8_to_unicode32(&str[i], &i); + count += codepoint_utf16_size(c); + } +} + +uint16_t *utf8_to_utf16(const uint8_t *utf8, uint16_t *utf16) +{ + size_t i, j; + uint32_t c; + + if (utf8==NULL) + return NULL; + + if (utf16==NULL) + utf16 = (uint16_t *) calloc(strlen_utf8_to_utf16(utf8) + 1, sizeof(uint16_t)); + + for (i=0, j=0, c=1; c; i++) + { + c = utf8_to_unicode32(&utf8[i], &i); + sprint_utf16(&utf16[j], c); + j += codepoint_utf16_size(c); + } + + return utf16; +} + +FILE *fopen_utf8(const char *path, const char *mode) +{ + #ifdef _WIN32 + wchar_t *wpath, wmode[8]; + FILE *file; + + if (utf8_to_utf16((const uint8_t *) mode, (uint16_t *) wmode)==NULL) + return NULL; 
+ + wpath = (wchar_t *) utf8_to_utf16((const uint8_t *) path, NULL); + if (wpath==NULL) + return NULL; + + file = _wfopen(wpath, wmode); + free(wpath); + return file; + #else + return fopen(path, mode); + #endif +} +#endif diff --git a/algo/verthash/fopen_utf8.h b/algo/verthash/fopen_utf8.h new file mode 100644 index 00000000..0547313f --- /dev/null +++ b/algo/verthash/fopen_utf8.h @@ -0,0 +1,25 @@ +#ifndef H_FOPEN_UTF8 +#define H_FOPEN_UTF8 +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include +#include +#include + +int utf8_char_size(const uint8_t *c); +uint32_t utf8_to_unicode32(const uint8_t *c, size_t *index); +int codepoint_utf16_size(uint32_t c); +uint16_t *sprint_utf16(uint16_t *str, uint32_t c); +size_t strlen_utf8_to_utf16(const uint8_t *str); +uint16_t *utf8_to_utf16(const uint8_t *utf8, uint16_t *utf16); + +FILE *fopen_utf8(const char *path, const char *mode); + +#ifdef __cplusplus +} +#endif +#endif + diff --git a/algo/verthash/tiny_sha3/sha3.c b/algo/verthash/tiny_sha3/sha3.c new file mode 100644 index 00000000..931ae020 --- /dev/null +++ b/algo/verthash/tiny_sha3/sha3.c @@ -0,0 +1,191 @@ +// sha3.c +// 19-Nov-11 Markku-Juhani O. Saarinen + +// Revised 07-Aug-15 to match with official release of FIPS PUB 202 "SHA3" +// Revised 03-Sep-15 for portability + OpenSSL - style API + +#include "sha3.h" + +// update the state with given number of rounds + +void sha3_keccakf(uint64_t st[25]) +{ + // constants + const uint64_t keccakf_rndc[24] = { + 0x0000000000000001, 0x0000000000008082, 0x800000000000808a, + 0x8000000080008000, 0x000000000000808b, 0x0000000080000001, + 0x8000000080008081, 0x8000000000008009, 0x000000000000008a, + 0x0000000000000088, 0x0000000080008009, 0x000000008000000a, + 0x000000008000808b, 0x800000000000008b, 0x8000000000008089, + 0x8000000000008003, 0x8000000000008002, 0x8000000000000080, + 0x000000000000800a, 0x800000008000000a, 0x8000000080008081, + 0x8000000000008080, 0x0000000080000001, 0x8000000080008008 + }; + const int keccakf_rotc[24] = { + 1, 3, 6, 10, 15, 21, 28, 36, 45, 55, 2, 14, + 27, 41, 56, 8, 25, 43, 62, 18, 39, 61, 20, 44 + }; + const int keccakf_piln[24] = { + 10, 7, 11, 17, 18, 3, 5, 16, 8, 21, 24, 4, + 15, 23, 19, 13, 12, 2, 20, 14, 22, 9, 6, 1 + }; + + // variables + int i, j, r; + uint64_t t, bc[5]; + +#if __BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__ + uint8_t *v; + + // endianess conversion. this is redundant on little-endian targets + for (i = 0; i < 25; i++) { + v = (uint8_t *) &st[i]; + st[i] = ((uint64_t) v[0]) | (((uint64_t) v[1]) << 8) | + (((uint64_t) v[2]) << 16) | (((uint64_t) v[3]) << 24) | + (((uint64_t) v[4]) << 32) | (((uint64_t) v[5]) << 40) | + (((uint64_t) v[6]) << 48) | (((uint64_t) v[7]) << 56); + } +#endif + + // actual iteration + for (r = 0; r < KECCAKF_ROUNDS; r++) { + + // Theta + for (i = 0; i < 5; i++) + bc[i] = st[i] ^ st[i + 5] ^ st[i + 10] ^ st[i + 15] ^ st[i + 20]; + + for (i = 0; i < 5; i++) { + t = bc[(i + 4) % 5] ^ ROTL64(bc[(i + 1) % 5], 1); + for (j = 0; j < 25; j += 5) + st[j + i] ^= t; + } + + // Rho Pi + t = st[1]; + for (i = 0; i < 24; i++) { + j = keccakf_piln[i]; + bc[0] = st[j]; + st[j] = ROTL64(t, keccakf_rotc[i]); + t = bc[0]; + } + + // Chi + for (j = 0; j < 25; j += 5) { + for (i = 0; i < 5; i++) + bc[i] = st[j + i]; + for (i = 0; i < 5; i++) + st[j + i] ^= (~bc[(i + 1) % 5]) & bc[(i + 2) % 5]; + } + + // Iota + st[0] ^= keccakf_rndc[r]; + } + +#if __BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__ + // endianess conversion. 
this is redundant on little-endian targets + for (i = 0; i < 25; i++) { + v = (uint8_t *) &st[i]; + t = st[i]; + v[0] = t & 0xFF; + v[1] = (t >> 8) & 0xFF; + v[2] = (t >> 16) & 0xFF; + v[3] = (t >> 24) & 0xFF; + v[4] = (t >> 32) & 0xFF; + v[5] = (t >> 40) & 0xFF; + v[6] = (t >> 48) & 0xFF; + v[7] = (t >> 56) & 0xFF; + } +#endif +} + +// Initialize the context for SHA3 + +int sha3_init(sha3_ctx_t *c, int mdlen) +{ + int i; + + for (i = 0; i < 25; i++) + c->st.q[i] = 0; + c->mdlen = mdlen; + c->rsiz = 200 - 2 * mdlen; + c->pt = 0; + + return 1; +} + +// update state with more data + +int sha3_update(sha3_ctx_t *c, const void *data, size_t len) +{ + size_t i; + int j; + + j = c->pt; + for (i = 0; i < len; i++) { + c->st.b[j++] ^= ((const uint8_t *) data)[i]; + if (j >= c->rsiz) { + sha3_keccakf(c->st.q); + j = 0; + } + } + c->pt = j; + + return 1; +} + +// finalize and output a hash + +int sha3_final(void *md, sha3_ctx_t *c) +{ + int i; + + c->st.b[c->pt] ^= 0x06; + c->st.b[c->rsiz - 1] ^= 0x80; + sha3_keccakf(c->st.q); + + for (i = 0; i < c->mdlen; i++) { + ((uint8_t *) md)[i] = c->st.b[i]; + } + + return 1; +} + +// compute a SHA-3 hash (md) of given byte length from "in" + +void *sha3(const void *in, size_t inlen, void *md, int mdlen) +{ + sha3_ctx_t sha3; + + sha3_init(&sha3, mdlen); + sha3_update(&sha3, in, inlen); + sha3_final(md, &sha3); + + return md; +} + +// SHAKE128 and SHAKE256 extensible-output functionality + +void shake_xof(sha3_ctx_t *c) +{ + c->st.b[c->pt] ^= 0x1F; + c->st.b[c->rsiz - 1] ^= 0x80; + sha3_keccakf(c->st.q); + c->pt = 0; +} + +void shake_out(sha3_ctx_t *c, void *out, size_t len) +{ + size_t i; + int j; + + j = c->pt; + for (i = 0; i < len; i++) { + if (j >= c->rsiz) { + sha3_keccakf(c->st.q); + j = 0; + } + ((uint8_t *) out)[i] = c->st.b[j++]; + } + c->pt = j; +} + diff --git a/algo/verthash/tiny_sha3/sha3.h b/algo/verthash/tiny_sha3/sha3.h new file mode 100644 index 00000000..2d7bf8d2 --- /dev/null +++ b/algo/verthash/tiny_sha3/sha3.h @@ -0,0 +1,55 @@ +// sha3.h +// 19-Nov-11 Markku-Juhani O. Saarinen + +#ifndef SHA3_H +#define SHA3_H + +#include +#include + + +#if defined(__cplusplus) +extern "C" { +#endif + +#ifndef KECCAKF_ROUNDS +#define KECCAKF_ROUNDS 24 +#endif + +#ifndef ROTL64 +#define ROTL64(x, y) (((x) << (y)) | ((x) >> (64 - (y)))) +#endif + +// state context +typedef struct { + union { // state: + uint8_t b[200]; // 8-bit bytes + uint64_t q[25]; // 64-bit words + } st; + int pt, rsiz, mdlen; // these don't overflow +} sha3_ctx_t; + +// Compression function. 
+void sha3_keccakf(uint64_t st[25]); + +// OpenSSL - like interfece +int sha3_init(sha3_ctx_t *c, int mdlen); // mdlen = hash output in bytes +int sha3_update(sha3_ctx_t *c, const void *data, size_t len); +int sha3_final(void *md, sha3_ctx_t *c); // digest goes to md + +// compute a sha3 hash (md) of given byte length from "in" +void *sha3(const void *in, size_t inlen, void *md, int mdlen); + +// SHAKE128 and SHAKE256 extensible-output functions +#define shake128_init(c) sha3_init(c, 16) +#define shake256_init(c) sha3_init(c, 32) +#define shake_update sha3_update + +void shake_xof(sha3_ctx_t *c); +void shake_out(sha3_ctx_t *c, void *out, size_t len); + +#if defined(__cplusplus) +} +#endif + +#endif diff --git a/algo/verthash/verthash-gate.c b/algo/verthash/verthash-gate.c new file mode 100644 index 00000000..a3e0bc34 --- /dev/null +++ b/algo/verthash/verthash-gate.c @@ -0,0 +1,96 @@ +#include "algo-gate-api.h" +#include "algo/sha/sph_sha2.h" +#include "Verthash.h" + +static verthash_info_t verthashInfo; + +// Verthash data file hash in bytes for verification +// 0x48aa21d7afededb63976d48a8ff8ec29d5b02563af4a1110b056cd43e83155a5 +static const uint8_t verthashDatFileHash_bytes[32] = +{ 0xa5, 0x55, 0x31, 0xe8, 0x43, 0xcd, 0x56, 0xb0, + 0x10, 0x11, 0x4a, 0xaf, 0x63, 0x25, 0xb0, 0xd5, + 0x29, 0xec, 0xf8, 0x8f, 0x8a, 0xd4, 0x76, 0x39, + 0xb6, 0xed, 0xed, 0xaf, 0xd7, 0x21, 0xaa, 0x48 }; + +static const char* verthash_data_file_name = "verthash.dat"; + +int scanhash_verthash( struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ) +{ + uint32_t edata[20] __attribute__((aligned(64))); + uint32_t hash[8] __attribute__((aligned(64))); + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + const uint32_t first_nonce = pdata[19]; + const uint32_t last_nonce = max_nonce - 1; + uint32_t n = first_nonce; + const int thr_id = mythr->id; + const bool bench = opt_benchmark; + + mm128_bswap32_80( edata, pdata ); +// verthash_sha3_prehash_72( edata ); + do + { + edata[19] = n; + verthash_hash( verthashInfo.data, verthashInfo.dataSize, + (const unsigned char (*)[80]) edata, + (unsigned char (*)[32]) hash ); + if ( valid_hash( hash, ptarget ) && !bench ) + { + pdata[19] = bswap_32( n ); + submit_solution( work, hash, mythr ); + } + n++; + } while ( n < last_nonce && !work_restart[thr_id].restart ); + *hashes_done = n - first_nonce; + pdata[19] = n; + return 0; +} + + +bool register_verthash_algo( algo_gate_t* gate ) +{ + + opt_target_factor = 256.0; + gate->scanhash = (void*)&scanhash_verthash; + + // verthash data file + int vhLoadResult = verthash_info_init(&verthashInfo, verthash_data_file_name ); + // Check Verthash initialization status + if (vhLoadResult == 0) // No Error + { + applog(LOG_INFO, "Verthash data file has been loaded succesfully!"); + + // and verify data file(if it was enabled) + if ( true ) +// if (!cmdr.disableVerthashDataFileVerification) + { + uint8_t vhDataFileHash[32] = { 0 }; + sph_sha256_full( vhDataFileHash, verthashInfo.data, + verthashInfo.dataSize ); + + if ( memcmp( vhDataFileHash, verthashDatFileHash_bytes, + sizeof(verthashDatFileHash_bytes) ) == 0 ) + applog(LOG_INFO, "Verthash data file has been verified succesfully!"); + else + applog(LOG_ERR, "Verthash data file verification has failed!"); + } + else + applog(LOG_WARNING, "Verthash data file verification stage is disabled!"); + } + else + { + // Handle Verthash error codes + if (vhLoadResult == 1) + applog(LOG_ERR, "Verthash data file name is invalid"); + else if (vhLoadResult 
== 2) + applog(LOG_ERR, "Failed to allocate memory for Verthash data"); + else // for debugging purposes + applog(LOG_ERR, "Verthash data initialization unknown error code: %d", + vhLoadResult); + return false; + } + + return true; +} + diff --git a/configure b/configure index 5ae117c0..b1b74084 100755 --- a/configure +++ b/configure @@ -1,6 +1,6 @@ #! /bin/sh # Guess values for system-dependent variables and create Makefiles. -# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.15.7. +# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.15.8. # # # Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc. @@ -577,8 +577,8 @@ MAKEFLAGS= # Identity of this package. PACKAGE_NAME='cpuminer-opt' PACKAGE_TARNAME='cpuminer-opt' -PACKAGE_VERSION='3.15.7' -PACKAGE_STRING='cpuminer-opt 3.15.7' +PACKAGE_VERSION='3.15.8' +PACKAGE_STRING='cpuminer-opt 3.15.8' PACKAGE_BUGREPORT='' PACKAGE_URL='' @@ -1332,7 +1332,7 @@ if test "$ac_init_help" = "long"; then # Omit some internal or obsolete options to make the list less imposing. # This message is too long to be a string in the A/UX 3.1 sh. cat <<_ACEOF -\`configure' configures cpuminer-opt 3.15.7 to adapt to many kinds of systems. +\`configure' configures cpuminer-opt 3.15.8 to adapt to many kinds of systems. Usage: $0 [OPTION]... [VAR=VALUE]... @@ -1404,7 +1404,7 @@ fi if test -n "$ac_init_help"; then case $ac_init_help in - short | recursive ) echo "Configuration of cpuminer-opt 3.15.7:";; + short | recursive ) echo "Configuration of cpuminer-opt 3.15.8:";; esac cat <<\_ACEOF @@ -1509,7 +1509,7 @@ fi test -n "$ac_init_help" && exit $ac_status if $ac_init_version; then cat <<\_ACEOF -cpuminer-opt configure 3.15.7 +cpuminer-opt configure 3.15.8 generated by GNU Autoconf 2.69 Copyright (C) 2012 Free Software Foundation, Inc. @@ -2012,7 +2012,7 @@ cat >config.log <<_ACEOF This file contains any messages produced by compilers while running configure, to aid debugging if configure makes a mistake. -It was created by cpuminer-opt $as_me 3.15.7, which was +It was created by cpuminer-opt $as_me 3.15.8, which was generated by GNU Autoconf 2.69. Invocation command line was $ $0 $@ @@ -2993,7 +2993,7 @@ fi # Define the identity of the package. PACKAGE='cpuminer-opt' - VERSION='3.15.7' + VERSION='3.15.8' cat >>confdefs.h <<_ACEOF @@ -6690,7 +6690,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 # report actual input values of CONFIG_FILES etc. instead of their # values after options handling. ac_log=" -This file was extended by cpuminer-opt $as_me 3.15.7, which was +This file was extended by cpuminer-opt $as_me 3.15.8, which was generated by GNU Autoconf 2.69. Invocation command line was CONFIG_FILES = $CONFIG_FILES @@ -6756,7 +6756,7 @@ _ACEOF cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`" ac_cs_version="\\ -cpuminer-opt config.status 3.15.7 +cpuminer-opt config.status 3.15.8 configured by $0, generated by GNU Autoconf 2.69, with options \\"\$ac_cs_config\\" diff --git a/configure.ac b/configure.ac index bbe7a18b..3923c436 100644 --- a/configure.ac +++ b/configure.ac @@ -1,4 +1,4 @@ -AC_INIT([cpuminer-opt], [3.15.7]) +AC_INIT([cpuminer-opt], [3.16.0]) AC_PREREQ([2.59c]) AC_CANONICAL_SYSTEM diff --git a/cpu-miner.c b/cpu-miner.c index fe2aed0e..3b4839ed 100644 --- a/cpu-miner.c +++ b/cpu-miner.c @@ -119,14 +119,14 @@ bool opt_sapling = false; // Need compile time and run time test. 
#if defined(__linux) && defined(GCC_INT128) #define AFFINITY_USES_UINT128 1 -uint128_t opt_affinity = -1; +static uint128_t opt_affinity = -1; static bool affinity_uses_uint128 = true; #else -uint64_t opt_affinity = -1; +static uint64_t opt_affinity = -1; static bool affinity_uses_uint128 = false; #endif -int opt_priority = 0; +int opt_priority = 0; // deprecated int num_cpus = 1; int num_cpugroups = 1; char *rpc_url = NULL;; @@ -3186,14 +3186,12 @@ void parse_arg(int key, char *arg ) ul = strtoull( p, NULL, 16 ); else ul = atoll( arg ); -// if ( ul > ( 1ULL << num_cpus ) - 1ULL ) -// ul = -1LL; #if AFFINITY_USES_UINT128 // replicate the low 64 bits to make a full 128 bit mask if there are more // than 64 CPUs, otherwise zero extend the upper half. opt_affinity = (uint128_t)ul; if ( num_cpus > 64 ) - opt_affinity = (opt_affinity << 64 ) | opt_affinity; + opt_affinity |= opt_affinity << 64; #else opt_affinity = ul; #endif @@ -3202,6 +3200,8 @@ void parse_arg(int key, char *arg ) v = atoi(arg); if (v < 0 || v > 5) /* sanity check */ show_usage_and_exit(1); + // option is deprecated, show warning + applog( LOG_WARNING, "High priority mining threads may cause system instability"); opt_priority = v; break; case 'N': // N parameter for various scrypt algos diff --git a/miner.h b/miner.h index 234b1cc0..e43012da 100644 --- a/miner.h +++ b/miner.h @@ -573,6 +573,7 @@ enum algos { ALGO_TRIBUS, ALGO_VANILLA, ALGO_VELTOR, + ALGO_VERTHASH, ALGO_WHIRLPOOL, ALGO_WHIRLPOOLX, ALGO_X11, @@ -665,6 +666,7 @@ static const char* const algo_names[] = { "tribus", "vanilla", "veltor", + "verthash", "whirlpool", "whirlpoolx", "x11", @@ -824,6 +826,7 @@ Options:\n\ tribus Denarius (DNR)\n\ vanilla blake256r8vnl (VCash)\n\ veltor\n\ + verthash\n\ whirlpool\n\ whirlpoolx\n\ x11 Dash\n\ diff --git a/verthash-help.txt b/verthash-help.txt new file mode 100644 index 00000000..b055ec31 --- /dev/null +++ b/verthash-help.txt @@ -0,0 +1,17 @@ + +The verthash data file must be named verthash.dat and located in the same +directory as the cpuminer executable. A Linux symlink works. + +The verthash data file must be obtained separately. If you already use +VerthashMiner you can simply copy or link the existing data file to the +cpuminer directory, using the required name. + +Otherwise it may be created using +https://github.com/CryptoGraphics/VerthashMiner/releases +following the instructions. A GPU is not necessary to create the file. + +The same data file can be used by both cpuminer and VerthashMiner +simultaneously. + +Launching cpuminer to mine verthash is the same as any other algorithm, +no extra options are required. 
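As an editor's illustration only (the pool URL, port, and wallet address below
are placeholders, not part of the original help text), a typical launch
therefore looks the same as for any other algo:

    cpuminer -a verthash -o stratum+tcp://pool.example.com:3300 -u <wallet_address> -p x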
diff --git a/winbuild-cross.sh b/winbuild-cross.sh index 58503fe3..f6402bad 100755 --- a/winbuild-cross.sh +++ b/winbuild-cross.sh @@ -31,6 +31,7 @@ mkdir release cp README.txt release/ cp README.md release/ cp RELEASE_NOTES release/ +cp verthash-help.txt release/ cp $MINGW_LIB/zlib1.dll release/ cp $MINGW_LIB/libwinpthread-1.dll release/ cp $GCC_MINGW_LIB/libstdc++-6.dll release/ From 902ec046dda53158304a13fce2d82ac85c63891f Mon Sep 17 00:00:00 2001 From: Jay D Dee Date: Wed, 24 Mar 2021 18:24:20 -0400 Subject: [PATCH 04/20] v3.16.1 --- README.md | 3 +- RELEASE_NOTES | 19 ++ algo-gate-api.c | 192 ++++++------ algo-gate-api.h | 12 +- algo/verthash/.verthash-gate.c.swp | Bin 16384 -> 0 bytes algo/verthash/Verthash.c | 174 +++++++---- algo/verthash/Verthash.h | 4 - algo/verthash/verthash-gate.c | 55 ++-- configure | 20 +- configure.ac | 2 +- cpu-miner.c | 459 +++++++++++++++-------------- miner.h | 8 +- simd-utils/simd-int.h | 4 + verthash-help.txt | 87 +++++- 14 files changed, 609 insertions(+), 430 deletions(-) delete mode 100644 algo/verthash/.verthash-gate.c.swp diff --git a/README.md b/README.md index d66f1b0c..65b3f0e7 100644 --- a/README.md +++ b/README.md @@ -89,7 +89,7 @@ Supported Algorithms lyra2h Hppcoin lyra2re lyra2 lyra2rev2 lyra2v2 - lyra2rev3 lyrav2v3, Vertcoin + lyra2rev3 lyrav2v3 lyra2z lyra2z330 Lyra2 330 rows, Zoin (ZOI) m7m Magi (XMG) @@ -122,6 +122,7 @@ Supported Algorithms tribus Denarius (DNR) vanilla blake256r8vnl (VCash) veltor (VLT) + verthash Vertcoin whirlpool whirlpoolx x11 Dash diff --git a/RELEASE_NOTES b/RELEASE_NOTES index e6c7f14a..1c7aca5e 100644 --- a/RELEASE_NOTES +++ b/RELEASE_NOTES @@ -65,6 +65,25 @@ If not what makes it happen or not happen? Change Log ---------- +v3.16.1 + +New options for verthash: + --data-file to specify the name, and optionally the path, of the verthash + data file, default is "verthash.dat" in the current directory. + --verify to perform the data file integrity check at startup, default is + not to verify data file integrity. + +Support for creation of default verthash data file if: + 1) --data-file option is not used, + 2) no default data file is found in the current directory, and, + 3) --verify option is used. + +More detailed logs related to verthash data file. + +Small verthash performance improvement. + +Fixed detection of corrupt stats caused by networking issues. + v3.16.0 Added verthash algo. diff --git a/algo-gate-api.c b/algo-gate-api.c index 2fea7afe..f34f5ac0 100644 --- a/algo-gate-api.c +++ b/algo-gate-api.c @@ -15,8 +15,6 @@ #include #include #include -#include -//#include "miner.h" #include "algo-gate-api.h" // Define null and standard functions. 
@@ -279,9 +277,11 @@ void init_algo_gate( algo_gate_t* gate ) #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wimplicit-function-declaration" -// called by each thread that uses the gate +// Called once by main bool register_algo_gate( int algo, algo_gate_t *gate ) { + bool rc = false; + if ( NULL == gate ) { applog(LOG_ERR,"FAIL: algo_gate registration failed, NULL gate\n"); @@ -290,109 +290,108 @@ bool register_algo_gate( int algo, algo_gate_t *gate ) init_algo_gate( gate ); - switch (algo) + switch ( algo ) { - case ALGO_ALLIUM: register_allium_algo ( gate ); break; - case ALGO_ANIME: register_anime_algo ( gate ); break; - case ALGO_ARGON2: register_argon2_algo ( gate ); break; - case ALGO_ARGON2D250: register_argon2d_crds_algo ( gate ); break; - case ALGO_ARGON2D500: register_argon2d_dyn_algo ( gate ); break; - case ALGO_ARGON2D4096: register_argon2d4096_algo ( gate ); break; - case ALGO_AXIOM: register_axiom_algo ( gate ); break; - case ALGO_BLAKE: register_blake_algo ( gate ); break; - case ALGO_BLAKE2B: register_blake2b_algo ( gate ); break; - case ALGO_BLAKE2S: register_blake2s_algo ( gate ); break; - case ALGO_BLAKECOIN: register_blakecoin_algo ( gate ); break; - case ALGO_BMW512: register_bmw512_algo ( gate ); break; - case ALGO_C11: register_c11_algo ( gate ); break; - case ALGO_DECRED: register_decred_algo ( gate ); break; - case ALGO_DEEP: register_deep_algo ( gate ); break; - case ALGO_DMD_GR: register_dmd_gr_algo ( gate ); break; - case ALGO_GROESTL: register_groestl_algo ( gate ); break; - case ALGO_HEX: register_hex_algo ( gate ); break; - case ALGO_HMQ1725: register_hmq1725_algo ( gate ); break; - case ALGO_HODL: register_hodl_algo ( gate ); break; - case ALGO_JHA: register_jha_algo ( gate ); break; - case ALGO_KECCAK: register_keccak_algo ( gate ); break; - case ALGO_KECCAKC: register_keccakc_algo ( gate ); break; - case ALGO_LBRY: register_lbry_algo ( gate ); break; - case ALGO_LYRA2H: register_lyra2h_algo ( gate ); break; - case ALGO_LYRA2RE: register_lyra2re_algo ( gate ); break; - case ALGO_LYRA2REV2: register_lyra2rev2_algo ( gate ); break; - case ALGO_LYRA2REV3: register_lyra2rev3_algo ( gate ); break; - case ALGO_LYRA2Z: register_lyra2z_algo ( gate ); break; - case ALGO_LYRA2Z330: register_lyra2z330_algo ( gate ); break; - case ALGO_M7M: register_m7m_algo ( gate ); break; - case ALGO_MINOTAUR: register_minotaur_algo ( gate ); break; - case ALGO_MYR_GR: register_myriad_algo ( gate ); break; - case ALGO_NEOSCRYPT: register_neoscrypt_algo ( gate ); break; - case ALGO_NIST5: register_nist5_algo ( gate ); break; - case ALGO_PENTABLAKE: register_pentablake_algo ( gate ); break; - case ALGO_PHI1612: register_phi1612_algo ( gate ); break; - case ALGO_PHI2: register_phi2_algo ( gate ); break; - case ALGO_POLYTIMOS: register_polytimos_algo ( gate ); break; - case ALGO_POWER2B: register_power2b_algo ( gate ); break; - case ALGO_QUARK: register_quark_algo ( gate ); break; - case ALGO_QUBIT: register_qubit_algo ( gate ); break; - case ALGO_SCRYPT: register_scrypt_algo ( gate ); break; - case ALGO_SHA256D: register_sha256d_algo ( gate ); break; - case ALGO_SHA256Q: register_sha256q_algo ( gate ); break; - case ALGO_SHA256T: register_sha256t_algo ( gate ); break; - case ALGO_SHA3D: register_sha3d_algo ( gate ); break; - case ALGO_SHAVITE3: register_shavite_algo ( gate ); break; - case ALGO_SKEIN: register_skein_algo ( gate ); break; - case ALGO_SKEIN2: register_skein2_algo ( gate ); break; - case ALGO_SKUNK: register_skunk_algo ( gate ); break; - case ALGO_SONOA: 
register_sonoa_algo ( gate ); break; - case ALGO_TIMETRAVEL: register_timetravel_algo ( gate ); break; - case ALGO_TIMETRAVEL10: register_timetravel10_algo ( gate ); break; - case ALGO_TRIBUS: register_tribus_algo ( gate ); break; - case ALGO_VANILLA: register_vanilla_algo ( gate ); break; - case ALGO_VELTOR: register_veltor_algo ( gate ); break; - case ALGO_VERTHASH: register_verthash_algo ( gate ); break; - case ALGO_WHIRLPOOL: register_whirlpool_algo ( gate ); break; - case ALGO_WHIRLPOOLX: register_whirlpoolx_algo ( gate ); break; - case ALGO_X11: register_x11_algo ( gate ); break; - case ALGO_X11EVO: register_x11evo_algo ( gate ); break; - case ALGO_X11GOST: register_x11gost_algo ( gate ); break; - case ALGO_X12: register_x12_algo ( gate ); break; - case ALGO_X13: register_x13_algo ( gate ); break; - case ALGO_X13BCD: register_x13bcd_algo ( gate ); break; - case ALGO_X13SM3: register_x13sm3_algo ( gate ); break; - case ALGO_X14: register_x14_algo ( gate ); break; - case ALGO_X15: register_x15_algo ( gate ); break; - case ALGO_X16R: register_x16r_algo ( gate ); break; - case ALGO_X16RV2: register_x16rv2_algo ( gate ); break; - case ALGO_X16RT: register_x16rt_algo ( gate ); break; - case ALGO_X16RT_VEIL: register_x16rt_veil_algo ( gate ); break; - case ALGO_X16S: register_x16s_algo ( gate ); break; - case ALGO_X17: register_x17_algo ( gate ); break; - case ALGO_X21S: register_x21s_algo ( gate ); break; - case ALGO_X22I: register_x22i_algo ( gate ); break; - case ALGO_X25X: register_x25x_algo ( gate ); break; - case ALGO_XEVAN: register_xevan_algo ( gate ); break; - case ALGO_YESCRYPT: register_yescrypt_05_algo ( gate ); break; + case ALGO_ALLIUM: rc = register_allium_algo ( gate ); break; + case ALGO_ANIME: rc = register_anime_algo ( gate ); break; + case ALGO_ARGON2: rc = register_argon2_algo ( gate ); break; + case ALGO_ARGON2D250: rc = register_argon2d_crds_algo ( gate ); break; + case ALGO_ARGON2D500: rc = register_argon2d_dyn_algo ( gate ); break; + case ALGO_ARGON2D4096: rc = register_argon2d4096_algo ( gate ); break; + case ALGO_AXIOM: rc = register_axiom_algo ( gate ); break; + case ALGO_BLAKE: rc = register_blake_algo ( gate ); break; + case ALGO_BLAKE2B: rc = register_blake2b_algo ( gate ); break; + case ALGO_BLAKE2S: rc = register_blake2s_algo ( gate ); break; + case ALGO_BLAKECOIN: rc = register_blakecoin_algo ( gate ); break; + case ALGO_BMW512: rc = register_bmw512_algo ( gate ); break; + case ALGO_C11: rc = register_c11_algo ( gate ); break; + case ALGO_DECRED: rc = register_decred_algo ( gate ); break; + case ALGO_DEEP: rc = register_deep_algo ( gate ); break; + case ALGO_DMD_GR: rc = register_dmd_gr_algo ( gate ); break; + case ALGO_GROESTL: rc = register_groestl_algo ( gate ); break; + case ALGO_HEX: rc = register_hex_algo ( gate ); break; + case ALGO_HMQ1725: rc = register_hmq1725_algo ( gate ); break; + case ALGO_HODL: rc = register_hodl_algo ( gate ); break; + case ALGO_JHA: rc = register_jha_algo ( gate ); break; + case ALGO_KECCAK: rc = register_keccak_algo ( gate ); break; + case ALGO_KECCAKC: rc = register_keccakc_algo ( gate ); break; + case ALGO_LBRY: rc = register_lbry_algo ( gate ); break; + case ALGO_LYRA2H: rc = register_lyra2h_algo ( gate ); break; + case ALGO_LYRA2RE: rc = register_lyra2re_algo ( gate ); break; + case ALGO_LYRA2REV2: rc = register_lyra2rev2_algo ( gate ); break; + case ALGO_LYRA2REV3: rc = register_lyra2rev3_algo ( gate ); break; + case ALGO_LYRA2Z: rc = register_lyra2z_algo ( gate ); break; + case ALGO_LYRA2Z330: rc = 
register_lyra2z330_algo ( gate ); break; + case ALGO_M7M: rc = register_m7m_algo ( gate ); break; + case ALGO_MINOTAUR: rc = register_minotaur_algo ( gate ); break; + case ALGO_MYR_GR: rc = register_myriad_algo ( gate ); break; + case ALGO_NEOSCRYPT: rc = register_neoscrypt_algo ( gate ); break; + case ALGO_NIST5: rc = register_nist5_algo ( gate ); break; + case ALGO_PENTABLAKE: rc = register_pentablake_algo ( gate ); break; + case ALGO_PHI1612: rc = register_phi1612_algo ( gate ); break; + case ALGO_PHI2: rc = register_phi2_algo ( gate ); break; + case ALGO_POLYTIMOS: rc = register_polytimos_algo ( gate ); break; + case ALGO_POWER2B: rc = register_power2b_algo ( gate ); break; + case ALGO_QUARK: rc = register_quark_algo ( gate ); break; + case ALGO_QUBIT: rc = register_qubit_algo ( gate ); break; + case ALGO_SCRYPT: rc = register_scrypt_algo ( gate ); break; + case ALGO_SHA256D: rc = register_sha256d_algo ( gate ); break; + case ALGO_SHA256Q: rc = register_sha256q_algo ( gate ); break; + case ALGO_SHA256T: rc = register_sha256t_algo ( gate ); break; + case ALGO_SHA3D: rc = register_sha3d_algo ( gate ); break; + case ALGO_SHAVITE3: rc = register_shavite_algo ( gate ); break; + case ALGO_SKEIN: rc = register_skein_algo ( gate ); break; + case ALGO_SKEIN2: rc = register_skein2_algo ( gate ); break; + case ALGO_SKUNK: rc = register_skunk_algo ( gate ); break; + case ALGO_SONOA: rc = register_sonoa_algo ( gate ); break; + case ALGO_TIMETRAVEL: rc = register_timetravel_algo ( gate ); break; + case ALGO_TIMETRAVEL10: rc = register_timetravel10_algo ( gate ); break; + case ALGO_TRIBUS: rc = register_tribus_algo ( gate ); break; + case ALGO_VANILLA: rc = register_vanilla_algo ( gate ); break; + case ALGO_VELTOR: rc = register_veltor_algo ( gate ); break; + case ALGO_VERTHASH: rc = register_verthash_algo ( gate ); break; + case ALGO_WHIRLPOOL: rc = register_whirlpool_algo ( gate ); break; + case ALGO_WHIRLPOOLX: rc = register_whirlpoolx_algo ( gate ); break; + case ALGO_X11: rc = register_x11_algo ( gate ); break; + case ALGO_X11EVO: rc = register_x11evo_algo ( gate ); break; + case ALGO_X11GOST: rc = register_x11gost_algo ( gate ); break; + case ALGO_X12: rc = register_x12_algo ( gate ); break; + case ALGO_X13: rc = register_x13_algo ( gate ); break; + case ALGO_X13BCD: rc = register_x13bcd_algo ( gate ); break; + case ALGO_X13SM3: rc = register_x13sm3_algo ( gate ); break; + case ALGO_X14: rc = register_x14_algo ( gate ); break; + case ALGO_X15: rc = register_x15_algo ( gate ); break; + case ALGO_X16R: rc = register_x16r_algo ( gate ); break; + case ALGO_X16RV2: rc = register_x16rv2_algo ( gate ); break; + case ALGO_X16RT: rc = register_x16rt_algo ( gate ); break; + case ALGO_X16RT_VEIL: rc = register_x16rt_veil_algo ( gate ); break; + case ALGO_X16S: rc = register_x16s_algo ( gate ); break; + case ALGO_X17: rc = register_x17_algo ( gate ); break; + case ALGO_X21S: rc = register_x21s_algo ( gate ); break; + case ALGO_X22I: rc = register_x22i_algo ( gate ); break; + case ALGO_X25X: rc = register_x25x_algo ( gate ); break; + case ALGO_XEVAN: rc = register_xevan_algo ( gate ); break; + case ALGO_YESCRYPT: rc = register_yescrypt_05_algo ( gate ); break; // case ALGO_YESCRYPT: register_yescrypt_algo ( gate ); break; - case ALGO_YESCRYPTR8: register_yescryptr8_05_algo ( gate ); break; + case ALGO_YESCRYPTR8: rc = register_yescryptr8_05_algo ( gate ); break; // case ALGO_YESCRYPTR8: register_yescryptr8_algo ( gate ); break; - case ALGO_YESCRYPTR8G: register_yescryptr8g_algo ( gate ); break; - case 
ALGO_YESCRYPTR16: register_yescryptr16_05_algo( gate ); break; + case ALGO_YESCRYPTR8G: rc = register_yescryptr8g_algo ( gate ); break; + case ALGO_YESCRYPTR16: rc = register_yescryptr16_05_algo( gate ); break; // case ALGO_YESCRYPTR16: register_yescryptr16_algo ( gate ); break; - case ALGO_YESCRYPTR32: register_yescryptr32_05_algo( gate ); break; + case ALGO_YESCRYPTR32: rc = register_yescryptr32_05_algo( gate ); break; // case ALGO_YESCRYPTR32: register_yescryptr32_algo ( gate ); break; - case ALGO_YESPOWER: register_yespower_algo ( gate ); break; - case ALGO_YESPOWERR16: register_yespowerr16_algo ( gate ); break; - case ALGO_YESPOWER_B2B: register_yespower_b2b_algo ( gate ); break; - case ALGO_ZR5: register_zr5_algo ( gate ); break; + case ALGO_YESPOWER: rc = register_yespower_algo ( gate ); break; + case ALGO_YESPOWERR16: rc = register_yespowerr16_algo ( gate ); break; + case ALGO_YESPOWER_B2B: rc = register_yespower_b2b_algo ( gate ); break; + case ALGO_ZR5: rc = register_zr5_algo ( gate ); break; default: - applog(LOG_ERR,"FAIL: algo_gate registration failed, unknown algo %s.\n", algo_names[opt_algo] ); + applog(LOG_ERR,"BUG: unregistered algorithm %s.\n", algo_names[opt_algo] ); return false; } // switch - // ensure required functions were defined. - if ( gate->scanhash == (void*)&null_scanhash ) + if ( !rc ) { - applog(LOG_ERR, "FAIL: Required algo_gate functions undefined\n"); + applog(LOG_ERR, "FAIL: %s algorithm failed to initialize\n", algo_names[opt_algo] ); return false; } return true; @@ -434,7 +433,6 @@ const char* const algo_alias_map[][2] = { "flax", "c11" }, { "hsr", "x13sm3" }, { "jackpot", "jha" }, - { "jane", "scryptjane" }, { "lyra2", "lyra2re" }, { "lyra2v2", "lyra2rev2" }, { "lyra2v3", "lyra2rev3" }, diff --git a/algo-gate-api.h b/algo-gate-api.h index c578f85a..8d61d266 100644 --- a/algo-gate-api.h +++ b/algo-gate-api.h @@ -114,15 +114,15 @@ typedef struct // Mandatory functions, one of these is mandatory. If a generic scanhash // is used a custom target hash function must be registered, with a custom // scanhash the target hash function can be called directly and doesn't need -// to be registered in the gate. +// to be registered with the gate. int ( *scanhash ) ( struct work*, uint32_t, uint64_t*, struct thr_info* ); int ( *hash ) ( void*, const void*, int ); //optional, safe to use default in most cases -// Allocate thread local buffers and other initialization specific to miner -// threads. +// Called once by each miner thread to allocate thread local buffers and +// other initialization specific to miner threads. bool ( *miner_thread_init ) ( int ); // Get thread local copy of blockheader with unique nonce. @@ -150,7 +150,7 @@ void ( *build_stratum_request ) ( char*, struct work*, struct stratum_ctx* ); char* ( *malloc_txs_request ) ( struct work* ); -// Big or little +// Big endian or little endian void ( *set_work_data_endian ) ( struct work* ); double ( *calc_network_diff ) ( struct work* ); @@ -260,7 +260,7 @@ int scanhash_8way_64in_32out( struct work *work, uint32_t max_nonce, #endif // displays warning -int null_hash (); +int null_hash(); // optional safe targets, default listed first unless noted. 
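/* Editor's sketch, not part of the patch: the gate comments above describe the
   contract in prose, and register_verthash_algo() earlier in this series is a
   concrete instance. With a custom scanhash, a registration function (all
   names below are hypothetical) reduces to setting the function pointer plus
   any algo-specific globals, then returning true so the rc check added to
   register_algo_gate() passes: */

   bool register_myalgo_algo( algo_gate_t *gate )   // hypothetical example algo
   {
      gate->scanhash = (void*)&scanhash_myalgo;     // hypothetical custom scanhash
      opt_target_factor = 256.0;                    // only if the algo scales the target
      return true;
   }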
@@ -281,7 +281,7 @@ void std_be_build_stratum_request( char *req, struct work *work ); char* std_malloc_txs_request( struct work *work ); -// Default is do_nothing (assumed LE) +// Default is do_nothing, little endian is assumed void set_work_data_big_endian( struct work *work ); double std_calc_network_diff( struct work *work ); diff --git a/algo/verthash/.verthash-gate.c.swp b/algo/verthash/.verthash-gate.c.swp deleted file mode 100644 index 4c59aa9e76706b9dc6b924ebfa9e701366767038..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 16384 zcmeHN%a0tz881v?NC@VUa6kyDycVWsnVFr(&g?8^SF!-ZDrPMiVc_5^m6KBxh!i2ji35pph{!3R905fULY$D`hTwpRP$bAd5JgcWeqZ%-cE_xJLvgkA ztDdf|`o5~)_o(UW9=6VJJjDmiHp6izW4HbL-1Dy=`iQ;sK#N&{>r?eXN3;`lRt{=} z`@bZCOSasS-jX#@)_qvu>x|A_|Tz9OXZ~`=?Y0vrQ?D|L9Ie*;F+hx`Iq#4i*Xa+O`ngPv#W5q>^-_uqp9(;4i=*fj2Or7r6L<>P!lud#fC-!fHh`1BD)2RI@+aVL-?hQQ=Nw`>^@5Rc`pgsN<7dy- z`NApTdeY``z=h`p7QU6>6FCXO9qt4nKc9VD!EF(Xh1v-=N946teYfJhGQ@Om&2?PQ zQFR@?RnvVUu2;a-+l%tnfj`rx`v!9ro0WpU|G%zvP6zm*^diY!hZ|et(}A$h$|&(- zzP83YHCB?aPuYxHa|aQ?eKCQgh`aul@Lap%>+{673-L=!{7K>4kWdJfVTjPN0$WBj zh+QgZpme@MMS4_5_3X*Bn;V-?ARG%J6maRtwq)o!E+W$nd>+MOq{eBxkr*O!kCr3N zbwGSY4X!Gh(x5*`3qGgfj!Fy_@eoweux}0ie$8M0^UCdJ>;{nZun}Mm1g|Q@40w46b~E5|}la8hmSvJ|gc_ zC;js!)ft=7Jaduv+VGvANoIK(d~>ZE|oFn z`fltZP+u&1=7PfNrV9h2aAV_m)uyFim}Fg((t)YcfwD1l=dDX_h&q5i zEo`humhkBVL=J-)TY+mY){f_OX5N>Kh>sb~ScD@Pn~tzzBnc#{UccEo!PqVv27$*z zIdY>|hDAS{HH$Pf8JTwA zORD4zST~Gf(+_-0swP+X_Ly>@fxC}zPvkfG@#FmHb`V}NLy5!_#uwsoXu9^rW_Ddh zf&b7$N)(1r$vsgroVc+W1ztkg%itue&S^w-KG}h^Qo^3W52I}{HM<=eQxR*sG({ZB ziB+C9NtTgGoTlU=EzrP+(znKim!}LEYD!S4uV|kBFs#6j zV!prYpoOlIsLumCH+wd|{IW~{g;Nz(lve8GRdgs}Gj>#Sw}UhXBPegc0*)?rM(UrOYJW1KF zvF_Tc0i`i0iD9m_eGZoKmN%(*7{rLnsVl-U+ZA1fz>2p*bm8E$S{HNMuPGmEtf#K9 zw-K2LLdXpp>vS>OfLOdRxX4XY#Bt~j6U3Wo7|1i^Dn@^~R!e=c7+h+nPV1mPyTtUI zg9P=LL0dIU>saQ)#LBov*5ZpKHoR7J_3QD;=$ZcvunDXKj{vHD)>fZ11DXNNfM!55pc&8%Xa+O`ngPv#Wj=CVQoB&IyF=AY>|+0unpF1Q=e^9?voW6dm;7Mc n=jA)8hxr5cLcUF@-ssTwI=$?*data = NULL; info->dataSize = 0; info->bitmask = 0; + size_t fileNameLen; - // get name - if (file_name == NULL) { return 1; } - size_t fileNameLen = strlen(file_name); - if (fileNameLen == 0) { return 1; } - - info->fileName = (char*)malloc(fileNameLen+1); - if (!info->fileName) + if ( !file_name || !( fileNameLen = strlen( file_name ) ) ) + { + applog( LOG_ERR, "Invalid file specification" ); + return -1; + } + + info->fileName = (char*)malloc( fileNameLen + 1 ); + if ( !info->fileName ) { - // Memory allocation fatal error. 
- return 2; + applog( LOG_ERR, "Failed to allocate memory for Verthash data" ); + return -1; } - memset(info->fileName, 0, fileNameLen+1); - memcpy(info->fileName, file_name, fileNameLen); + memset( info->fileName, 0, fileNameLen + 1 ); + memcpy( info->fileName, file_name, fileNameLen ); - // Load data - FILE *fileMiningData = fopen_utf8(info->fileName, "rb"); - // Failed to open file for reading - if (!fileMiningData) { return 1; } + FILE *fileMiningData = fopen_utf8( info->fileName, "rb" ); + if ( !fileMiningData ) + { + if ( opt_data_file || !opt_verify ) + { + if ( opt_data_file ) + applog( LOG_ERR, + "Verthash data file not found or invalid: %s", info->fileName ); + else + { + applog( LOG_ERR, + "No Verthash data file specified and default not found"); + applog( LOG_NOTICE, + "Add '--verify' to create default 'verthash.dat'"); + } + return -1; + } + else + { + applog( LOG_NOTICE, "Creating default 'verthash.dat' in current directory, this will take several minutes"); + if ( verthash_generate_data_file( info->fileName ) ) + return -1; + + fileMiningData = fopen_utf8( info->fileName, "rb" ); + if ( !fileMiningData ) + { + applog( LOG_ERR, "File system error opening %s", info->fileName ); + return -1; + } + + applog( LOG_NOTICE, "Verthash data file created successfully" ); + } + } // Get file size fseek(fileMiningData, 0, SEEK_END); - uint64_t fileSize = (uint64_t)ftell(fileMiningData); + int fileSize = ftell(fileMiningData); fseek(fileMiningData, 0, SEEK_SET); + if ( fileSize < 0 ) + { + fclose(fileMiningData); + return 1; + } + // Allocate data - info->data = (uint8_t *)malloc(fileSize); + info->data = (uint8_t *)_mm_malloc( fileSize, 64 ); if (!info->data) { fclose(fileMiningData); @@ -54,13 +93,20 @@ int verthash_info_init(verthash_info_t* info, const char* file_name) } // Load data - fread(info->data, fileSize, 1, fileMiningData); + if ( !fread( info->data, fileSize, 1, fileMiningData ) ) + { + applog( LOG_ERR, "File system error reading %s", info->fileName ); + fclose(fileMiningData); + return -1; + } + fclose(fileMiningData); // Update fields info->bitmask = ((fileSize - VH_HASH_OUT_SIZE)/VH_BYTE_ALIGNMENT) + 1; info->dataSize = fileSize; + applog( LOG_NOTICE, "Using Verthash data file '%s'", info->fileName ); return 0; } @@ -83,20 +129,6 @@ void verthash_info_free(verthash_info_t* info) #define VH_N_INDEXES 4096 #define VH_BYTE_ALIGNMENT 16 -static __thread sha3_ctx_t sha3_midstate_ctx; - -void verthash_sha3_prehash_72( const void *data ) -{ - sha3_init( &sha3_midstate_ctx, 256 ); - sha3_update( &sha3_midstate_ctx, data, 72 ); -} - -void verthash_sha3_final_8( sha3_ctx_t *ctx, void *out, const void *data ) -{ - sha3_update( ctx, data, 8 ); - sha3_final( out, ctx ); -} - static inline uint32_t fnv1a(const uint32_t a, const uint32_t b) { return (a ^ b) * 0x1000193; @@ -107,16 +139,12 @@ void verthash_hash(const unsigned char* blob_bytes, const unsigned char(*input)[VH_HEADER_SIZE], unsigned char(*output)[VH_HASH_OUT_SIZE]) { - unsigned char p1[VH_HASH_OUT_SIZE]; -// sha3_ctx_t sha3_ctx; -// memcpy ( &sha3_ctx, &sha3_midstate_ctx, sizeof sha3_ctx ); -// verthash_sha3_final_8( &sha3_ctx, &p1[0], &input[72] ); - + unsigned char p1[VH_HASH_OUT_SIZE] __attribute__ ((aligned (64))); sha3(&input[0], VH_HEADER_SIZE, &p1[0], VH_HASH_OUT_SIZE); unsigned char p0[VH_N_SUBSET]; - unsigned char input_header[VH_HEADER_SIZE]; + unsigned char input_header[VH_HEADER_SIZE] __attribute__ ((aligned (64))); memcpy(input_header, input, VH_HEADER_SIZE); for (size_t i = 0; i < VH_N_ITER; ++i) @@ -126,17 
+154,47 @@ void verthash_hash(const unsigned char* blob_bytes, } uint32_t* p0_index = (uint32_t*)p0; - uint32_t seek_indexes[VH_N_INDEXES]; + uint32_t seek_indexes[VH_N_INDEXES] __attribute__ ((aligned (64))); - for (size_t x = 0; x < VH_N_ROT; ++x) + for ( size_t x = 0; x < VH_N_ROT; ++x ) { memcpy( seek_indexes + x * (VH_N_SUBSET / sizeof(uint32_t)), p0, VH_N_SUBSET); - for (size_t y = 0; y < VH_N_SUBSET / sizeof(uint32_t); ++y) + +//#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +// 512 bit vector processing is actually slower because it reduces the CPU +// clock significantly, which also slows mem access. The AVX512 rol instruction +// is still available for smaller vectors. + +// for ( size_t y = 0; y < VH_N_SUBSET / sizeof(uint32_t); y += 16 ) +// { +// __m512i *p0_v = (__m512i*)( p0_index + y ); +// *p0_v = mm512_rol_32( *p0_v, 1 ); +// } + +#if defined(__AVX2__) + + for ( size_t y = 0; y < VH_N_SUBSET / sizeof(uint32_t); y += 8 ) { - *(p0_index + y) = ( *(p0_index + y) << 1 ) - | ( 1 & (*(p0_index + y) >> 31) ); + __m256i *p0_v = (__m256i*)( p0_index + y ); + *p0_v = mm256_rol_32( *p0_v, 1 ); } + +#else + + for ( size_t y = 0; y < VH_N_SUBSET / sizeof(uint32_t); y += 4 ) + { + __m128i *p0_v = (__m128i*)( p0_index + y ); + *p0_v = mm128_rol_32( *p0_v, 1 ); + } + +#endif + +// for (size_t y = 0; y < VH_N_SUBSET / sizeof(uint32_t); ++y) +// { +// *(p0_index + y) = ( *(p0_index + y) << 1 ) +// | ( 1 & (*(p0_index + y) >> 31) ); +// } } uint32_t* p1_32 = (uint32_t*)p1; @@ -146,13 +204,13 @@ void verthash_hash(const unsigned char* blob_bytes, for (size_t i = 0; i < VH_N_INDEXES; i++) { const uint32_t offset = (fnv1a(seek_indexes[i], value_accumulator) % mdiv) * VH_BYTE_ALIGNMENT / sizeof(uint32_t); + const uint32_t *blob_off = blob_bytes_32 + offset; for (size_t i2 = 0; i2 < VH_HASH_OUT_SIZE / sizeof(uint32_t); i2++) { - const uint32_t value = *(blob_bytes_32 + offset + i2); + const uint32_t value = *( blob_off + i2 ); uint32_t* p1_ptr = p1_32 + i2; - *p1_ptr = fnv1a(*p1_ptr, value); - - value_accumulator = fnv1a(value_accumulator, value); + *p1_ptr = fnv1a( *p1_ptr, value ); + value_accumulator = fnv1a( value_accumulator, value ); } } @@ -591,6 +649,9 @@ struct Graph *NewGraph(int64_t index, const char* targetFile, uint8_t *pk) int64_t pow2 = 1 << ((uint64_t)log2); struct Graph *g = (struct Graph *)malloc(sizeof(struct Graph)); + + if ( !g ) return NULL; + g->db = db; g->log2 = log2; g->pow2 = pow2; @@ -607,14 +668,27 @@ struct Graph *NewGraph(int64_t index, const char* targetFile, uint8_t *pk) } //----------------------------------------------------------------------------- + +// use info for _mm_malloc, then verify file int verthash_generate_data_file(const char* output_file_name) { const char *hashInput = "Verthash Proof-of-Space Datafile"; - uint8_t *pk = (uint8_t*)malloc(NODE_SIZE); - sha3(hashInput, 32, pk, NODE_SIZE); + uint8_t *pk = (uint8_t*)malloc( NODE_SIZE ); + + if ( !pk ) + { + applog( LOG_ERR, "Verthash data memory allocation failed"); + return -1; + } + + sha3( hashInput, 32, pk, NODE_SIZE ); int64_t index = 17; - NewGraph(index, output_file_name, pk); + if ( !NewGraph( index, output_file_name, pk ) ) + { + applog( LOG_ERR, "Verthash file creation failed"); + return -1; + } return 0; } diff --git a/algo/verthash/Verthash.h b/algo/verthash/Verthash.h index 5eac0a4c..f81c9750 100644 --- a/algo/verthash/Verthash.h +++ b/algo/verthash/Verthash.h @@ -47,10 +47,6 @@ void verthash_info_free(verthash_info_t* info); //! 
Generate verthash data file and save it to specified location. int verthash_generate_data_file(const char* output_file_name); -void verthash_sha3_prehash_72( const void *data ); - -void verthash_sha3_final_8( sha3_ctx_t *ctx, void *out, const void *data ); - void verthash_hash(const unsigned char* blob_bytes, const size_t blob_size, const unsigned char(*input)[VH_HEADER_SIZE], diff --git a/algo/verthash/verthash-gate.c b/algo/verthash/verthash-gate.c index a3e0bc34..00c137fc 100644 --- a/algo/verthash/verthash-gate.c +++ b/algo/verthash/verthash-gate.c @@ -12,8 +12,6 @@ static const uint8_t verthashDatFileHash_bytes[32] = 0x29, 0xec, 0xf8, 0x8f, 0x8a, 0xd4, 0x76, 0x39, 0xb6, 0xed, 0xed, 0xaf, 0xd7, 0x21, 0xaa, 0x48 }; -static const char* verthash_data_file_name = "verthash.dat"; - int scanhash_verthash( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ) { @@ -28,7 +26,6 @@ int scanhash_verthash( struct work *work, uint32_t max_nonce, const bool bench = opt_benchmark; mm128_bswap32_80( edata, pdata ); -// verthash_sha3_prehash_72( edata ); do { edata[19] = n; @@ -47,6 +44,7 @@ int scanhash_verthash( struct work *work, uint32_t max_nonce, return 0; } +const char *default_verthash_data_file = "verthash.dat"; bool register_verthash_algo( algo_gate_t* gate ) { @@ -55,42 +53,49 @@ bool register_verthash_algo( algo_gate_t* gate ) gate->scanhash = (void*)&scanhash_verthash; // verthash data file - int vhLoadResult = verthash_info_init(&verthashInfo, verthash_data_file_name ); - // Check Verthash initialization status - if (vhLoadResult == 0) // No Error - { - applog(LOG_INFO, "Verthash data file has been loaded succesfully!"); - + char *verthash_data_file = opt_data_file ? opt_data_file + : default_verthash_data_file; + + int vhLoadResult = verthash_info_init( &verthashInfo, verthash_data_file ); + if (vhLoadResult == 0) // No Error + { // and verify data file(if it was enabled) - if ( true ) -// if (!cmdr.disableVerthashDataFileVerification) + if ( opt_verify ) { uint8_t vhDataFileHash[32] = { 0 }; + + applog( LOG_NOTICE, "Verifying Verthash data" ); sph_sha256_full( vhDataFileHash, verthashInfo.data, verthashInfo.dataSize ); - if ( memcmp( vhDataFileHash, verthashDatFileHash_bytes, sizeof(verthashDatFileHash_bytes) ) == 0 ) - applog(LOG_INFO, "Verthash data file has been verified succesfully!"); + applog( LOG_NOTICE, "Verthash data has been verified" ); else - applog(LOG_ERR, "Verthash data file verification has failed!"); + { + applog( LOG_ERR, "Verthash data verification has failed" ); + return false; + } } - else - applog(LOG_WARNING, "Verthash data file verification stage is disabled!"); } else + { - // Handle Verthash error codes - if (vhLoadResult == 1) - applog(LOG_ERR, "Verthash data file name is invalid"); - else if (vhLoadResult == 2) - applog(LOG_ERR, "Failed to allocate memory for Verthash data"); - else // for debugging purposes - applog(LOG_ERR, "Verthash data initialization unknown error code: %d", - vhLoadResult); - return false; + // Handle Verthash error codes + if ( vhLoadResult == 1 ) + { + applog( LOG_ERR, "Verthash data file not found: %s", verthash_data_file ); + if ( !opt_data_file ) + applog( LOG_NOTICE, "Add '--verify' to create verthash.dat"); + } + else if ( vhLoadResult == 2 ) + applog( LOG_ERR, "Failed to allocate memory for Verthash data" ); +// else // for debugging purposes +// applog( LOG_ERR, "Verthash data initialization unknown error code: %d", +// vhLoadResult ); + return false; } + printf("\n"); return true; } diff --git 
a/configure b/configure index b1b74084..00998223 100755 --- a/configure +++ b/configure @@ -1,6 +1,6 @@ #! /bin/sh # Guess values for system-dependent variables and create Makefiles. -# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.15.8. +# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.16.1. # # # Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc. @@ -577,8 +577,8 @@ MAKEFLAGS= # Identity of this package. PACKAGE_NAME='cpuminer-opt' PACKAGE_TARNAME='cpuminer-opt' -PACKAGE_VERSION='3.15.8' -PACKAGE_STRING='cpuminer-opt 3.15.8' +PACKAGE_VERSION='3.16.1' +PACKAGE_STRING='cpuminer-opt 3.16.1' PACKAGE_BUGREPORT='' PACKAGE_URL='' @@ -1332,7 +1332,7 @@ if test "$ac_init_help" = "long"; then # Omit some internal or obsolete options to make the list less imposing. # This message is too long to be a string in the A/UX 3.1 sh. cat <<_ACEOF -\`configure' configures cpuminer-opt 3.15.8 to adapt to many kinds of systems. +\`configure' configures cpuminer-opt 3.16.1 to adapt to many kinds of systems. Usage: $0 [OPTION]... [VAR=VALUE]... @@ -1404,7 +1404,7 @@ fi if test -n "$ac_init_help"; then case $ac_init_help in - short | recursive ) echo "Configuration of cpuminer-opt 3.15.8:";; + short | recursive ) echo "Configuration of cpuminer-opt 3.16.1:";; esac cat <<\_ACEOF @@ -1509,7 +1509,7 @@ fi test -n "$ac_init_help" && exit $ac_status if $ac_init_version; then cat <<\_ACEOF -cpuminer-opt configure 3.15.8 +cpuminer-opt configure 3.16.1 generated by GNU Autoconf 2.69 Copyright (C) 2012 Free Software Foundation, Inc. @@ -2012,7 +2012,7 @@ cat >config.log <<_ACEOF This file contains any messages produced by compilers while running configure, to aid debugging if configure makes a mistake. -It was created by cpuminer-opt $as_me 3.15.8, which was +It was created by cpuminer-opt $as_me 3.16.1, which was generated by GNU Autoconf 2.69. Invocation command line was $ $0 $@ @@ -2993,7 +2993,7 @@ fi # Define the identity of the package. PACKAGE='cpuminer-opt' - VERSION='3.15.8' + VERSION='3.16.1' cat >>confdefs.h <<_ACEOF @@ -6690,7 +6690,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 # report actual input values of CONFIG_FILES etc. instead of their # values after options handling. ac_log=" -This file was extended by cpuminer-opt $as_me 3.15.8, which was +This file was extended by cpuminer-opt $as_me 3.16.1, which was generated by GNU Autoconf 2.69. Invocation command line was CONFIG_FILES = $CONFIG_FILES @@ -6756,7 +6756,7 @@ _ACEOF cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`" ac_cs_version="\\ -cpuminer-opt config.status 3.15.8 +cpuminer-opt config.status 3.16.1 configured by $0, generated by GNU Autoconf 2.69, with options \\"\$ac_cs_config\\" diff --git a/configure.ac b/configure.ac index 3923c436..29b3b9a7 100644 --- a/configure.ac +++ b/configure.ac @@ -1,4 +1,4 @@ -AC_INIT([cpuminer-opt], [3.16.0]) +AC_INIT([cpuminer-opt], [3.16.1]) AC_PREREQ([2.59c]) AC_CANONICAL_SYSTEM diff --git a/cpu-miner.c b/cpu-miner.c index 3b4839ed..0ed29b92 100644 --- a/cpu-miner.c +++ b/cpu-miner.c @@ -112,7 +112,6 @@ char* opt_param_key = NULL; int opt_param_n = 0; int opt_param_r = 0; int opt_n_threads = 0; -bool opt_reset_on_stale = false; bool opt_sapling = false; // Windows doesn't support 128 bit affinity mask. 
@@ -134,6 +133,8 @@ char *rpc_userpass = NULL; char *rpc_user, *rpc_pass; char *short_url = NULL; char *coinbase_address; +char *opt_data_file = NULL; +bool opt_verify = false; // pk_buffer_size is used as a version selector by b58 code, therefore // it must be set correctly to work. @@ -1070,12 +1071,11 @@ void report_summary_log( bool force ) double share_time = (double)et.tv_sec + (double)et.tv_usec / 1e6; double ghrate = global_hashrate; - double shrate = share_time == 0. ? 0. : exp32 * last_targetdiff - * (double)(accepts) / share_time; - double sess_hrate = uptime.tv_sec == 0. ? 0. : exp32 * norm_diff_sum - / (double)uptime.tv_sec; - double submit_rate = share_time == 0. ? 0. : (double)submits*60. - / share_time; + double shrate = safe_div( exp32 * last_targetdiff * (double)(accepts), + share_time, 0. ); + double sess_hrate = safe_div( exp32 * norm_diff_sum, + (double)uptime.tv_sec, 0. ); + double submit_rate = safe_div( (double)submits * 60., share_time, 0. ); char shr_units[4] = {0}; char ghr_units[4] = {0}; char sess_hr_units[4] = {0}; @@ -1092,11 +1092,10 @@ void report_summary_log( bool force ) applog( LOG_BLUE, "%s: %s", algo_names[ opt_algo ], short_url ); applog2( LOG_NOTICE, "Periodic Report %s %s", et_str, upt_str ); applog2( LOG_INFO, "Share rate %.2f/min %.2f/min", - submit_rate, (double)submitted_share_count*60. / - ( (double)uptime.tv_sec + (double)uptime.tv_usec / 1e6 ) ); + submit_rate, (double)submitted_share_count*60. / + ( (double)uptime.tv_sec + (double)uptime.tv_usec / 1e6 ) ); applog2( LOG_INFO, "Hash rate %7.2f%sh/s %7.2f%sh/s (%.2f%sh/s)", - shrate, shr_units, sess_hrate, sess_hr_units, - ghrate, ghr_units ); + shrate, shr_units, sess_hrate, sess_hr_units, ghrate, ghr_units ); if ( accepted_share_count < submitted_share_count ) { @@ -1110,37 +1109,40 @@ void report_summary_log( bool force ) char lghr_units[4] = {0}; scale_hash_for_display( &lost_shrate, lshr_units ); scale_hash_for_display( &lost_ghrate, lghr_units ); - applog2( LOG_INFO, "Lost hash rate %7.2f%sh/s %7.2f%sh/s", - lost_shrate, lshr_units, lost_ghrate, lghr_units ); + applog2( LOG_INFO, "Lost hash rate %7.2f%sh/s %7.2f%sh/s", + lost_shrate, lshr_units, lost_ghrate, lghr_units ); } - applog2( LOG_INFO,"Submitted %6d %6d", - submits, submitted_share_count ); - applog2( LOG_INFO,"Accepted %6d %6d %5.1f%%", - accepts, accepted_share_count, - 100. * accepted_share_count / submitted_share_count ); + applog2( LOG_INFO,"Submitted %7d %7d", + submits, submitted_share_count ); + applog2( LOG_INFO, "Accepted %7d %7d %5.1f%%", + accepts, accepted_share_count, + 100. * safe_div( (double)accepted_share_count, + (double)submitted_share_count, 0. ) ); if ( stale_share_count ) - applog2( LOG_INFO,"Stale %6d %6d %5.1f%%", - stales, stale_share_count, - 100. * stale_share_count / submitted_share_count ); + applog2( LOG_INFO, "Stale %7d %7d %5.1f%%", + stales, stale_share_count, + 100. * safe_div( (double)stale_share_count, + (double)submitted_share_count, 0. ) ); if ( rejected_share_count ) - applog2( LOG_INFO,"Rejected %6d %6d %5.1f%%", - rejects, rejected_share_count, - 100. * rejected_share_count / submitted_share_count ); + applog2( LOG_INFO, "Rejected %7d %7d %5.1f%%", + rejects, rejected_share_count, + 100. * safe_div( (double)rejected_share_count, + (double)submitted_share_count, 0. 
) ); if ( solved_block_count ) - applog2( LOG_INFO,"Blocks Solved %6d %6d", - solved, solved_block_count ); + applog2( LOG_INFO,"Blocks Solved %7d %7d", + solved, solved_block_count ); applog2( LOG_INFO, "Hi/Lo Share Diff %.5g / %.5g", - highest_share, lowest_share ); + highest_share, lowest_share ); - static int64_t no_acks = 0; - if ( no_acks ) - { - no_acks = submitted_share_count + int mismatch = submitted_share_count - ( accepted_share_count + stale_share_count + rejected_share_count ); - if ( no_acks ) // 2 consecutive cycles non zero - applog(LOG_WARNING,"Share count mismatch: %d, stats may be incorrect", - no_acks ); + if ( mismatch ) + { + if ( mismatch != 1 ) + applog(LOG_WARNING,"Share count mismatch: %d, stats may be incorrect", mismatch ); + else + applog(LOG_INFO,"Share count mismatch, submitted share may still be pending" ); } } @@ -1294,7 +1296,8 @@ static int share_result( int result, struct work *work, if ( reason ) applog( LOG_WARNING, "Reject reason: %s", reason ); diff_to_hash( str, my_stats.share_diff ); - applog2( LOG_INFO, "Hash: %08x%08x%08x...", str[7], str[6], str[5] ); + applog2( LOG_INFO, "Hash: %08x%08x%08x%08x%08x%08x", str[7], str[6], + str[5], str[4], str[3],str[2], str[1], str[0] ); if ( work ) targ = work->target; @@ -1303,7 +1306,8 @@ static int share_result( int result, struct work *work, diff_to_hash( str, my_stats.target_diff ); targ = &str[0]; } - applog2( LOG_INFO, "Target: %08x%08x%08x...", targ[7], targ[6], targ[5] ); + applog2( LOG_INFO, "Target: %08x%08x%08x%08x%08x%08x", targ[7], targ[6], + targ[5], targ[4], targ[3], targ[2], targ[1], targ[0] ); } return 1; } @@ -2790,6 +2794,189 @@ static void *stratum_thread(void *userdata ) return NULL; } +static void show_credits() +{ + printf("\n ********** "PACKAGE_NAME" "PACKAGE_VERSION" *********** \n"); + printf(" A CPU miner with multi algo support and optimized for CPUs\n"); + printf(" with AVX512, SHA and VAES extensions by JayDDee.\n"); + printf(" BTC donation address: 12tdvfF7KmAsihBXQXynT6E6th2c2pByTT\n\n"); +} + +#define check_cpu_capability() cpu_capability( false ) +#define display_cpu_capability() cpu_capability( true ) +static bool cpu_capability( bool display_only ) +{ + char cpu_brand[0x40]; + bool cpu_has_sse2 = has_sse2(); + bool cpu_has_aes = has_aes_ni(); + bool cpu_has_sse42 = has_sse42(); + bool cpu_has_avx = has_avx(); + bool cpu_has_avx2 = has_avx2(); + bool cpu_has_sha = has_sha(); + bool cpu_has_avx512 = has_avx512(); + bool cpu_has_vaes = has_vaes(); + bool sw_has_aes = false; + bool sw_has_sse2 = false; + bool sw_has_sse42 = false; + bool sw_has_avx = false; + bool sw_has_avx2 = false; + bool sw_has_avx512 = false; + bool sw_has_sha = false; + bool sw_has_vaes = false; + set_t algo_features = algo_gate.optimizations; + bool algo_has_sse2 = set_incl( SSE2_OPT, algo_features ); + bool algo_has_aes = set_incl( AES_OPT, algo_features ); + bool algo_has_sse42 = set_incl( SSE42_OPT, algo_features ); + bool algo_has_avx2 = set_incl( AVX2_OPT, algo_features ); + bool algo_has_avx512 = set_incl( AVX512_OPT, algo_features ); + bool algo_has_sha = set_incl( SHA_OPT, algo_features ); + bool algo_has_vaes = set_incl( VAES_OPT, algo_features ); + bool algo_has_vaes256 = set_incl( VAES256_OPT, algo_features ); + bool use_aes; + bool use_sse2; + bool use_sse42; + bool use_avx2; + bool use_avx512; + bool use_sha; + bool use_vaes; + bool use_none; + + #ifdef __AES__ + sw_has_aes = true; + #endif + #ifdef __SSE2__ + sw_has_sse2 = true; + #endif + #ifdef __SSE4_2__ + sw_has_sse42 = true; + #endif 
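/*
 * A minimal sketch of the pattern cpu_capability() implements: compare what
 * the CPU reports at run time against what the build was compiled for, and
 * refuse to start on a mismatch.  __builtin_cpu_supports() is a GCC/Clang
 * builtin used here only as a stand-in for the miner's has_avx2()-style cpuid
 * helpers, which are not shown in this patch.
 */
#include <stdbool.h>
#include <stdio.h>

int main(void)
{
    __builtin_cpu_init();
    bool cpu_avx2 = __builtin_cpu_supports( "avx2" );   /* run time: what the CPU can do */
#if defined(__AVX2__)
    bool sw_avx2 = true;                                 /* compile time: what the build assumes */
#else
    bool sw_avx2 = false;
#endif

    if ( sw_avx2 && !cpu_avx2 )
    {   /* same outcome as cpu_capability() returning false */
        printf( "The SW build requires a CPU with AVX2!\n" );
        return 1;
    }
    /* the algo gate's feature set would be AND-ed in here as well */
    printf( "AVX2: cpu=%d build=%d\n", cpu_avx2, sw_avx2 );
    return 0;
}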
+ #ifdef __AVX__ + sw_has_avx = true; + #endif + #ifdef __AVX2__ + sw_has_avx2 = true; + #endif + #if (defined(__AVX512F__) && defined(__AVX512DQ__) && defined(__AVX512BW__) && defined(__AVX512VL__)) + sw_has_avx512 = true; + #endif + #ifdef __SHA__ + sw_has_sha = true; + #endif + #ifdef __VAES__ + sw_has_vaes = true; + #endif + + +// #if !((__AES__) || (__SSE2__)) +// printf("Neither __AES__ nor __SSE2__ defined.\n"); +// #endif + + cpu_brand_string( cpu_brand ); + printf( "CPU: %s\n", cpu_brand ); + + printf("SW built on " __DATE__ + #ifdef _MSC_VER + " with VC++ 2013\n"); + #elif defined(__GNUC__) + " with GCC"); + printf(" %d.%d.%d\n", __GNUC__, __GNUC_MINOR__, __GNUC_PATCHLEVEL__); + #else + printf("\n"); + #endif + + printf("CPU features: "); + if ( cpu_has_avx512 ) printf( " AVX512" ); + else if ( cpu_has_avx2 ) printf( " AVX2 " ); + else if ( cpu_has_avx ) printf( " AVX " ); + else if ( cpu_has_sse42 ) printf( " SSE4.2" ); + else if ( cpu_has_sse2 ) printf( " SSE2 " ); + if ( cpu_has_vaes ) printf( " VAES" ); + else if ( cpu_has_aes ) printf( " AES" ); + if ( cpu_has_sha ) printf( " SHA" ); + + printf("\nSW features: "); + if ( sw_has_avx512 ) printf( " AVX512" ); + else if ( sw_has_avx2 ) printf( " AVX2 " ); + else if ( sw_has_avx ) printf( " AVX " ); + else if ( sw_has_sse42 ) printf( " SSE4.2" ); + else if ( sw_has_sse2 ) printf( " SSE2 " ); + if ( sw_has_vaes ) printf( " VAES" ); + else if ( sw_has_aes ) printf( " AES" ); + if ( sw_has_sha ) printf( " SHA" ); + + printf("\nAlgo features:"); + if ( algo_features == EMPTY_SET ) printf( " None" ); + else + { + if ( algo_has_avx512 ) printf( " AVX512" ); + else if ( algo_has_avx2 ) printf( " AVX2 " ); + else if ( algo_has_sse42 ) printf( " SSE4.2" ); + else if ( algo_has_sse2 ) printf( " SSE2 " ); + if ( algo_has_vaes ) printf( " VAES" ); + else if ( algo_has_aes ) printf( " AES" ); + if ( algo_has_sha ) printf( " SHA" ); + } + printf("\n"); + + if ( display_only ) return true; + + // Check for CPU and build incompatibilities + if ( !cpu_has_sse2 ) + { + printf( "A CPU with SSE2 is required to use cpuminer-opt\n" ); + return false; + } + if ( sw_has_avx2 && !( cpu_has_avx2 && cpu_has_aes ) ) + { + printf( "The SW build requires a CPU with AES and AVX2!\n" ); + return false; + } + if ( sw_has_sse42 && !cpu_has_sse42 ) + { + printf( "The SW build requires a CPU with SSE4.2!\n" ); + return false; + } + if ( sw_has_aes && !cpu_has_aes ) + { + printf( "The SW build requires a CPU with AES!\n" ); + return false; + } + if ( sw_has_sha && !cpu_has_sha ) + { + printf( "The SW build requires a CPU with SHA!\n" ); + return false; + } + + // Determine mining options + use_sse2 = cpu_has_sse2 && algo_has_sse2; + use_aes = cpu_has_aes && sw_has_aes && algo_has_aes; + use_sse42 = cpu_has_sse42 && sw_has_sse42 && algo_has_sse42; + use_avx2 = cpu_has_avx2 && sw_has_avx2 && algo_has_avx2; + use_avx512 = cpu_has_avx512 && sw_has_avx512 && algo_has_avx512; + use_sha = cpu_has_sha && sw_has_sha && algo_has_sha; + use_vaes = cpu_has_vaes && sw_has_vaes && algo_has_vaes + && ( use_avx512 || algo_has_vaes256 ); + use_none = !( use_sse2 || use_aes || use_sse42 || use_avx512 || use_avx2 || + use_sha || use_vaes ); + + // Display best options + printf( "\nStarting miner with" ); + if ( use_none ) printf( " no optimizations" ); + else + { + if ( use_avx512 ) printf( " AVX512" ); + else if ( use_avx2 ) printf( " AVX2" ); + else if ( use_sse42 ) printf( " SSE4.2" ); + else if ( use_sse2 ) printf( " SSE2" ); + if ( use_vaes ) printf( " VAES" ); + else if ( 
use_aes ) printf( " AES" ); + if ( use_sha ) printf( " SHA" ); + } + printf( "...\n\n" ); + + return true; +} + void show_version_and_exit(void) { printf("\n built on " __DATE__ @@ -2837,7 +3024,6 @@ void show_version_and_exit(void) #endif "\n\n"); - /* dependencies versions */ printf("%s\n", curl_version()); #ifdef JANSSON_VERSION printf("jansson/%s ", JANSSON_VERSION); @@ -2849,7 +3035,6 @@ void show_version_and_exit(void) exit(0); } - void show_usage_and_exit(int status) { if (status) @@ -3237,11 +3422,15 @@ void parse_arg(int key, char *arg ) case 1024: opt_randomize = true; break; - case 1026: - opt_reset_on_stale = true; + case 1027: // data-file + opt_data_file = strdup( arg ); + break; + case 1028: // verify + opt_verify = true; break; case 'V': - show_version_and_exit(); + display_cpu_capability(); + exit(0); case 'h': show_usage_and_exit(0); @@ -3358,185 +3547,6 @@ static int thread_create(struct thr_info *thr, void* func) return err; } -static void show_credits() -{ - printf("\n ********** "PACKAGE_NAME" "PACKAGE_VERSION" *********** \n"); - printf(" A CPU miner with multi algo support and optimized for CPUs\n"); - printf(" with AVX512, SHA and VAES extensions by JayDDee.\n"); - printf(" BTC donation address: 12tdvfF7KmAsihBXQXynT6E6th2c2pByTT\n\n"); -} - -bool check_cpu_capability () -{ - char cpu_brand[0x40]; - bool cpu_has_sse2 = has_sse2(); - bool cpu_has_aes = has_aes_ni(); - bool cpu_has_sse42 = has_sse42(); - bool cpu_has_avx = has_avx(); - bool cpu_has_avx2 = has_avx2(); - bool cpu_has_sha = has_sha(); - bool cpu_has_avx512 = has_avx512(); - bool cpu_has_vaes = has_vaes(); - bool sw_has_aes = false; - bool sw_has_sse2 = false; - bool sw_has_sse42 = false; - bool sw_has_avx = false; - bool sw_has_avx2 = false; - bool sw_has_avx512 = false; - bool sw_has_sha = false; - bool sw_has_vaes = false; - set_t algo_features = algo_gate.optimizations; - bool algo_has_sse2 = set_incl( SSE2_OPT, algo_features ); - bool algo_has_aes = set_incl( AES_OPT, algo_features ); - bool algo_has_sse42 = set_incl( SSE42_OPT, algo_features ); - bool algo_has_avx2 = set_incl( AVX2_OPT, algo_features ); - bool algo_has_avx512 = set_incl( AVX512_OPT, algo_features ); - bool algo_has_sha = set_incl( SHA_OPT, algo_features ); - bool algo_has_vaes = set_incl( VAES_OPT, algo_features ); - bool algo_has_vaes256 = set_incl( VAES256_OPT, algo_features ); - bool use_aes; - bool use_sse2; - bool use_sse42; - bool use_avx2; - bool use_avx512; - bool use_sha; - bool use_vaes; - bool use_none; - - #ifdef __AES__ - sw_has_aes = true; - #endif - #ifdef __SSE2__ - sw_has_sse2 = true; - #endif - #ifdef __SSE4_2__ - sw_has_sse42 = true; - #endif - #ifdef __AVX__ - sw_has_avx = true; - #endif - #ifdef __AVX2__ - sw_has_avx2 = true; - #endif - #if (defined(__AVX512F__) && defined(__AVX512DQ__) && defined(__AVX512BW__) && defined(__AVX512VL__)) - sw_has_avx512 = true; - #endif - #ifdef __SHA__ - sw_has_sha = true; - #endif - #ifdef __VAES__ - sw_has_vaes = true; - #endif - - -// #if !((__AES__) || (__SSE2__)) -// printf("Neither __AES__ nor __SSE2__ defined.\n"); -// #endif - - cpu_brand_string( cpu_brand ); - printf( "CPU: %s\n", cpu_brand ); - - printf("SW built on " __DATE__ - #ifdef _MSC_VER - " with VC++ 2013\n"); - #elif defined(__GNUC__) - " with GCC"); - printf(" %d.%d.%d\n", __GNUC__, __GNUC_MINOR__, __GNUC_PATCHLEVEL__); - #else - printf("\n"); - #endif - - printf("CPU features: "); - if ( cpu_has_avx512 ) printf( " AVX512" ); - else if ( cpu_has_avx2 ) printf( " AVX2 " ); - else if ( cpu_has_avx ) printf( " 
AVX " ); - else if ( cpu_has_sse42 ) printf( " SSE4.2" ); - else if ( cpu_has_sse2 ) printf( " SSE2 " ); - if ( cpu_has_vaes ) printf( " VAES" ); - else if ( cpu_has_aes ) printf( " AES" ); - if ( cpu_has_sha ) printf( " SHA" ); - - printf("\nSW features: "); - if ( sw_has_avx512 ) printf( " AVX512" ); - else if ( sw_has_avx2 ) printf( " AVX2 " ); - else if ( sw_has_avx ) printf( " AVX " ); - else if ( sw_has_sse42 ) printf( " SSE4.2" ); - else if ( sw_has_sse2 ) printf( " SSE2 " ); - if ( sw_has_vaes ) printf( " VAES" ); - else if ( sw_has_aes ) printf( " AES" ); - if ( sw_has_sha ) printf( " SHA" ); - - printf("\nAlgo features:"); - if ( algo_features == EMPTY_SET ) printf( " None" ); - else - { - if ( algo_has_avx512 ) printf( " AVX512" ); - else if ( algo_has_avx2 ) printf( " AVX2 " ); - else if ( algo_has_sse42 ) printf( " SSE4.2" ); - else if ( algo_has_sse2 ) printf( " SSE2 " ); - if ( algo_has_vaes ) printf( " VAES" ); - else if ( algo_has_aes ) printf( " AES" ); - if ( algo_has_sha ) printf( " SHA" ); - } - printf("\n"); - - // Check for CPU and build incompatibilities - if ( !cpu_has_sse2 ) - { - printf( "A CPU with SSE2 is required to use cpuminer-opt\n" ); - return false; - } - if ( sw_has_avx2 && !( cpu_has_avx2 && cpu_has_aes ) ) - { - printf( "The SW build requires a CPU with AES and AVX2!\n" ); - return false; - } - if ( sw_has_sse42 && !cpu_has_sse42 ) - { - printf( "The SW build requires a CPU with SSE4.2!\n" ); - return false; - } - if ( sw_has_aes && !cpu_has_aes ) - { - printf( "The SW build requires a CPU with AES!\n" ); - return false; - } - if ( sw_has_sha && !cpu_has_sha ) - { - printf( "The SW build requires a CPU with SHA!\n" ); - return false; - } - - // Determine mining options - use_sse2 = cpu_has_sse2 && algo_has_sse2; - use_aes = cpu_has_aes && sw_has_aes && algo_has_aes; - use_sse42 = cpu_has_sse42 && sw_has_sse42 && algo_has_sse42; - use_avx2 = cpu_has_avx2 && sw_has_avx2 && algo_has_avx2; - use_avx512 = cpu_has_avx512 && sw_has_avx512 && algo_has_avx512; - use_sha = cpu_has_sha && sw_has_sha && algo_has_sha; - use_vaes = cpu_has_vaes && sw_has_vaes && algo_has_vaes - && ( use_avx512 || algo_has_vaes256 ); - use_none = !( use_sse2 || use_aes || use_sse42 || use_avx512 || use_avx2 || - use_sha || use_vaes ); - - // Display best options - printf( "\nStarting miner with" ); - if ( use_none ) printf( " no optimizations" ); - else - { - if ( use_avx512 ) printf( " AVX512" ); - else if ( use_avx2 ) printf( " AVX2" ); - else if ( use_sse42 ) printf( " SSE4.2" ); - else if ( use_sse2 ) printf( " SSE2" ); - if ( use_vaes ) printf( " VAES" ); - else if ( use_aes ) printf( " AES" ); - if ( use_sha ) printf( " SHA" ); - } - printf( "...\n\n" ); - - return true; -} - void get_defconfig_path(char *out, size_t bufsize, char *argv0); int main(int argc, char *argv[]) @@ -3598,6 +3608,11 @@ int main(int argc, char *argv[]) fprintf(stderr, "%s: no algo supplied\n", argv[0]); show_usage_and_exit(1); } + + if ( !register_algo_gate( opt_algo, &algo_gate ) ) exit(1); + + if ( !check_cpu_capability() ) exit(1); + if ( !opt_benchmark ) { if ( !short_url ) @@ -3637,7 +3652,7 @@ int main(int argc, char *argv[]) } // All options must be set before starting the gate - if ( !register_algo_gate( opt_algo, &algo_gate ) ) exit(1); +// if ( !register_algo_gate( opt_algo, &algo_gate ) ) exit(1); if ( coinbase_address ) { @@ -3656,7 +3671,7 @@ int main(int argc, char *argv[]) memcpy( &five_min_start, &last_submit_time, sizeof (struct timeval) ); memcpy( &session_start, &last_submit_time, 
sizeof (struct timeval) ); - if ( !check_cpu_capability() ) exit(1); +// if ( !check_cpu_capability() ) exit(1); pthread_mutex_init( &stats_lock, NULL ); pthread_rwlock_init( &g_work_lock, NULL ); diff --git a/miner.h b/miner.h index e43012da..9e2749a3 100644 --- a/miner.h +++ b/miner.h @@ -737,7 +737,6 @@ extern uint32_t opt_work_size; extern double *thr_hashrates; extern double global_hashrate; extern double stratum_diff; -extern bool opt_reset_on_stale; extern double net_diff; extern double net_hashrate; extern int opt_param_n; @@ -762,6 +761,8 @@ extern pthread_mutex_t stats_lock; extern bool opt_sapling; extern const int pk_buffer_size_max; extern int pk_buffer_size; +extern char *opt_data_file; +extern bool opt_verify; static char const usage[] = "\ Usage: cpuminer [OPTIONS]\n\ @@ -905,6 +906,8 @@ Options:\n\ --max-rate=N[KMG] Only mine if net hashrate is less than specified value\n\ --max-diff=N Only mine if net difficulty is less than specified value\n\ -c, --config=FILE load a JSON-format configuration file\n\ + --data-file path and name of data file\n\ + --verify enable additional time consuming start up tests\n\ -V, --version display version information and exit\n\ -h, --help display this help text and exit\n\ "; @@ -962,7 +965,6 @@ static struct option const options[] = { { "retries", 1, NULL, 'r' }, { "retry-pause", 1, NULL, 1025 }, { "randomize", 0, NULL, 1024 }, - { "reset-on-stale", 0, NULL, 1026 }, { "scantime", 1, NULL, 's' }, #ifdef HAVE_SYSLOG_H { "syslog", 0, NULL, 'S' }, @@ -973,6 +975,8 @@ static struct option const options[] = { { "url", 1, NULL, 'o' }, { "user", 1, NULL, 'u' }, { "userpass", 1, NULL, 'O' }, + { "data-file", 1, NULL, 1027 }, + { "verify", 0, NULL, 1028 }, { "version", 0, NULL, 'V' }, { 0, 0, 0, 0 } }; diff --git a/simd-utils/simd-int.h b/simd-utils/simd-int.h index 5fff450f..4a7188e5 100644 --- a/simd-utils/simd-int.h +++ b/simd-utils/simd-int.h @@ -5,6 +5,10 @@ #define bswap_64( a ) __builtin_bswap64( a ) #define bswap_32( a ) __builtin_bswap32( a ) +// safe division, integer or floating point +#define safe_div( dividend, divisor, safe_result ) \ + ( (divisor) == 0 ? safe_result : ( (dividend) / (divisor) ) ) + /////////////////////////////////////// // diff --git a/verthash-help.txt b/verthash-help.txt index b055ec31..f8e02db4 100644 --- a/verthash-help.txt +++ b/verthash-help.txt @@ -1,17 +1,80 @@ +Quickstart: +---------- -The verthash data file must be named verthash.dat and located in the same -directory as the cpuminer executable. A Linux symlink works. +First time mining verthash or don't have a Verthash data file: -The verthash data file must be obtained seperately. If you already use -VerthashMiner you can simply copy or link the existing data file to the -cpuminer directory, using the required name. +--algo verthash --verify --url ... -Otherwise it may be created using -https://github.com/CryptoGraphics/VerthashMiner/releases -following the instructions. A GPU is not necessary to create the file. +Verthash data file already exists: -The same data file can be used by both cpuminer and VerthashMiner -simultaneously. +--algo verthash --data-file /path/to/verthash.dat --url ... + + +Background: +---------- + +Verthash algorithm requires a data file for hashing. This file is +static, portable, and only needs to be created once. + +A Verthash data file created by VerthashMiner can also be used by cpuminer-opt +and used simultaneously by both miners. 
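The hashing loop treats this file as a flat read-only array and derives the
number of 16-byte aligned positions it can seek to from the file size alone.
A rough sketch of the bitmask/mdiv calculation used in Verthash.c (the file
size below is hypothetical, and VH_HASH_OUT_SIZE is assumed from Verthash.h,
which this patch does not show in full):

    #include <stdint.h>
    #include <stdio.h>

    /* assumed values; VH_BYTE_ALIGNMENT matches the define in Verthash.c */
    #define VH_HASH_OUT_SIZE   32
    #define VH_BYTE_ALIGNMENT  16

    int main(void)
    {
        uint64_t file_size = 1200000000ULL;   /* hypothetical ~1.2GB verthash.dat */
        /* same formula as info->bitmask / mdiv in Verthash.c */
        uint64_t positions = ( file_size - VH_HASH_OUT_SIZE ) / VH_BYTE_ALIGNMENT + 1;
        printf( "seekable 16-byte positions: %llu\n", (unsigned long long)positions );
        return 0;
    }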
+ +Due to its size (over 1GB) it is recommended that one data file be created and +stored in a permanent location accessible to any miner that wants to use it. + +New command line options: +------------------------ + +cpuminer-opt adds two new command line options for verthash. The names +and some behaviour are changed from VerthashMiner. + +--data-file /path/to/verthash.dat + the default when not used is 'verthash.dat' in the current working directory. + +--verify + verify the integrity of the file specified by --data-file, or if not specified + the default data file if it exists, or create a default file and verify it + if one does not yet exist. Data file verification is disabled by default. + +Detailed usage: +-------------- + +If a data file already exists it can be selected using the --data-file +option to specify the path and name of the file. + +--algo verthash --data-file /path/to/verthash.dat --url ... + +If the --data-file option is not used the default is to use 'verthash.dat' +from the current working directory. + +If no data file exists it can be created by using the --verify option +without the --data-file option. If the default data file is not found in +the current directory it will be created. + +--algo verthash --verify --url ... + +Data file creation can take up to 30 minutes on a spinning hard drive. +Once created the new data file will be verified and used immediately +if a valid url and user were included on the command line. + +A default data file can be created by omitting the url option. That will +either verify an existing default data file or create one and verify it, +then exit. + +--algo verthash --verify + +A data file will never be created if --data-file is specified. The miner +will exit with an error if the file is not found. This is to avoid accidentally +creating an unwanted data file due to a typo. + +After creation the data file can be moved to a more convenient location and +referenced by --data-file, or left where it is and used by default without the +--data-file option. + +Data file verification takes a few seconds and is disabled by default. +VerthashMiner enables data file verification by default and has an option to +disable it. + +The --verify option is intended primarily to create a new file. It's +not necessary or useful to verify a file every time the miner is started. -Launching cpuminer to mine verthash is the same as any other algorithm, -no extra options are required.
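For reference, a condensed C sketch of the check that --verify performs, based
on register_verthash_algo in this patch. sph_sha256_full() is the call the
patch itself uses; expected_digest is a placeholder for the hard-coded
verthashDatFileHash_bytes table in verthash-gate.c, whose full 32-byte value
is not repeated here:

    #include <stdint.h>
    #include <string.h>
    #include "algo/sha/sph_sha2.h"     /* sph_sha256_full(), as used by the patch */

    /* returns 0 if the loaded data matches the known digest */
    static int verify_verthash_data( const void *data, size_t data_size )
    {
        /* fill in the 32 bytes of verthashDatFileHash_bytes from verthash-gate.c */
        static const uint8_t expected_digest[32] = { 0 };

        uint8_t digest[32] = { 0 };
        sph_sha256_full( digest, data, data_size );     /* SHA-256 of the whole file */
        return memcmp( digest, expected_digest, sizeof digest );
    }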
From f3333b0070f56fe5850da592720ac26c1b26aee5 Mon Sep 17 00:00:00 2001 From: Jay D Dee Date: Thu, 8 Apr 2021 18:09:31 -0400 Subject: [PATCH 05/20] v3.16.2 --- Makefile.am | 1 + README.md | 2 +- RELEASE_NOTES | 12 +- algo/verthash/Verthash.c | 136 ++++++++----- algo/verthash/Verthash.h | 2 + algo/verthash/tiny_sha3/sha3-4way.c | 301 ++++++++++++++++++++++++++++ algo/verthash/tiny_sha3/sha3-4way.h | 67 +++++++ algo/verthash/tiny_sha3/sha3.c | 71 +++++-- algo/verthash/verthash-gate.c | 83 +++++++- configure | 20 +- configure.ac | 2 +- cpu-miner.c | 49 +++-- miner.h | 2 +- simd-utils/intrlv.h | 206 ++++--------------- simd-utils/simd-128.h | 10 +- simd-utils/simd-256.h | 52 ++--- util.c | 148 +++++++++++++- 17 files changed, 827 insertions(+), 337 deletions(-) create mode 100644 algo/verthash/tiny_sha3/sha3-4way.c create mode 100644 algo/verthash/tiny_sha3/sha3-4way.h diff --git a/Makefile.am b/Makefile.am index f4163820..d5398c00 100644 --- a/Makefile.am +++ b/Makefile.am @@ -196,6 +196,7 @@ cpuminer_SOURCES = \ algo/verthash/Verthash.c \ algo/verthash/fopen_utf8.c \ algo/verthash/tiny_sha3/sha3.c \ + algo/verthash/tiny_sha3/sha3-4way.c \ algo/whirlpool/sph_whirlpool.c \ algo/whirlpool/whirlpool-hash-4way.c \ algo/whirlpool/whirlpool-gate.c \ diff --git a/README.md b/README.md index 65b3f0e7..d740fd7b 100644 --- a/README.md +++ b/README.md @@ -135,7 +135,7 @@ Supported Algorithms x14 X14 x15 X15 x16r - x16rv2 Ravencoin (RVN) + x16rv2 x16rt Gincoin (GIN) x16rt-veil Veil (VEIL) x16s Pigeoncoin (PGN) diff --git a/RELEASE_NOTES b/RELEASE_NOTES index 1c7aca5e..a1133db1 100644 --- a/RELEASE_NOTES +++ b/RELEASE_NOTES @@ -65,6 +65,14 @@ If not what makes it happen or not happen? Change Log ---------- +v3.16.2 + +Verthash: midstate prehash optimization for all architectures. +Verthash: AVX2 optimization. +GBT: added support for Bech32 addresses, untested. +Linux: added CPU frequency to benchmark log. +Fixed integer overflow in time calculations. + v3.16.1 New options for verthash: @@ -72,16 +80,12 @@ New options for verthash: data file, default is "verthash.dat" in the current directory. --verify to perform the data file integrity check at startup, default is not to verify data file integrity. - Support for creation of default verthash data file if: 1) --data-file option is not used, 2) no default data file is found in the current directory, and, 3) --verify option is used. - More detailed logs related to verthash data file. - Small verthash performance improvement. - Fixed detection of corrupt stats caused by networking issues. 
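The midstate prehash listed above works by absorbing the 72 constant bytes of
the 80-byte header into the SHA3-512 state once per job and re-using that saved
state for every nonce, since only the final 8 bytes change. A scalar sketch
using the tiny_sha3 calls from this patch (the real
verthash_sha3_512_prehash_72()/verthash_sha3_512_final_8() additionally
pre-compute the eight incremented first-byte variants and have a 4-way AVX2
path via sha3_4way):

    #include <stdint.h>
    #include <string.h>
    #include "sha3.h"                  /* tiny_sha3, algo/verthash/tiny_sha3/ */

    static sha3_ctx_t sha3_mid;        /* saved midstate, one per job */

    void prehash_72( const void *header80 )
    {
        sha3_init( &sha3_mid, 64 );                /* SHA3-512, 64-byte digest */
        sha3_update( &sha3_mid, header80, 72 );    /* constant part of the header */
    }

    void final_8( void *hash64, uint64_t nonce )
    {
        sha3_ctx_t ctx = sha3_mid;                 /* copy the midstate */
        sha3_update( &ctx, &nonce, 8 );            /* only the nonce differs per call */
        sha3_final( hash64, &ctx );
    }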
v3.16.0 diff --git a/algo/verthash/Verthash.c b/algo/verthash/Verthash.c index 072df372..475c79a7 100644 --- a/algo/verthash/Verthash.c +++ b/algo/verthash/Verthash.c @@ -134,87 +134,117 @@ static inline uint32_t fnv1a(const uint32_t a, const uint32_t b) return (a ^ b) * 0x1000193; } -void verthash_hash(const unsigned char* blob_bytes, - const size_t blob_size, - const unsigned char(*input)[VH_HEADER_SIZE], - unsigned char(*output)[VH_HASH_OUT_SIZE]) +void verthash_hash( const unsigned char* blob_bytes, + const size_t blob_size, + const unsigned char(*input)[VH_HEADER_SIZE], + unsigned char(*output)[VH_HASH_OUT_SIZE] ) { - unsigned char p1[VH_HASH_OUT_SIZE] __attribute__ ((aligned (64))); - sha3(&input[0], VH_HEADER_SIZE, &p1[0], VH_HASH_OUT_SIZE); - - unsigned char p0[VH_N_SUBSET]; - - unsigned char input_header[VH_HEADER_SIZE] __attribute__ ((aligned (64))); - memcpy(input_header, input, VH_HEADER_SIZE); - - for (size_t i = 0; i < VH_N_ITER; ++i) - { - input_header[0] += 1; - sha3(&input_header[0], VH_HEADER_SIZE, p0 + i * VH_P0_SIZE, VH_P0_SIZE); - } - - uint32_t* p0_index = (uint32_t*)p0; + unsigned char p1[ VH_HASH_OUT_SIZE ] __attribute__ ((aligned (64))); + unsigned char p0[ VH_N_SUBSET ] __attribute__ ((aligned (64))); uint32_t seek_indexes[VH_N_INDEXES] __attribute__ ((aligned (64))); + uint32_t* p0_index = (uint32_t*)p0; + verthash_sha3_512_final_8( p0, ( (uint64_t*)input )[ 9 ] ); + for ( size_t x = 0; x < VH_N_ROT; ++x ) { memcpy( seek_indexes + x * (VH_N_SUBSET / sizeof(uint32_t)), p0, VH_N_SUBSET); -//#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) -// 512 bit vector processing is actually slower because it reduces the CPU -// clock significantly, which also slows mem access. The AVX512 rol instruction -// is still available for smaller vectors. 
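/*
 * Scalar reference for the vectorized loops in this hunk, as a sketch only:
 * each 32-bit word of p0 is rotated left by one bit (what mm256_rol_32 /
 * mm128_rol_32 do per lane below), and the inner loop folds data-file words
 * into p1 with the same FNV-1a step as fnv1a() above.
 */
#include <stdint.h>

static inline uint32_t rotl32_by1( uint32_t x )
{
    return ( x << 1 ) | ( x >> 31 );
}

static inline uint32_t fnv1a_step( uint32_t a, uint32_t b )
{
    return ( a ^ b ) * 0x1000193;
}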
- -// for ( size_t y = 0; y < VH_N_SUBSET / sizeof(uint32_t); y += 16 ) -// { -// __m512i *p0_v = (__m512i*)( p0_index + y ); -// *p0_v = mm512_rol_32( *p0_v, 1 ); -// } - #if defined(__AVX2__) - for ( size_t y = 0; y < VH_N_SUBSET / sizeof(uint32_t); y += 8 ) + for ( size_t y = 0; y < VH_N_SUBSET / sizeof(__m256i); y += 8) { - __m256i *p0_v = (__m256i*)( p0_index + y ); - *p0_v = mm256_rol_32( *p0_v, 1 ); + casti_m256i( p0_index, y ) = mm256_rol_32( + casti_m256i( p0_index, y ), 1 ); + casti_m256i( p0_index, y+1 ) = mm256_rol_32( + casti_m256i( p0_index, y+1 ), 1 ); + casti_m256i( p0_index, y+2 ) = mm256_rol_32( + casti_m256i( p0_index, y+2 ), 1 ); + casti_m256i( p0_index, y+3 ) = mm256_rol_32( + casti_m256i( p0_index, y+3 ), 1 ); + casti_m256i( p0_index, y+4 ) = mm256_rol_32( + casti_m256i( p0_index, y+4 ), 1 ); + casti_m256i( p0_index, y+5 ) = mm256_rol_32( + casti_m256i( p0_index, y+5 ), 1 ); + casti_m256i( p0_index, y+6 ) = mm256_rol_32( + casti_m256i( p0_index, y+6 ), 1 ); + casti_m256i( p0_index, y+7 ) = mm256_rol_32( + casti_m256i( p0_index, y+7 ), 1 ); } #else - for ( size_t y = 0; y < VH_N_SUBSET / sizeof(uint32_t); y += 4 ) + for ( size_t y = 0; y < VH_N_SUBSET / sizeof(__m128i); y += 8) { - __m128i *p0_v = (__m128i*)( p0_index + y ); - *p0_v = mm128_rol_32( *p0_v, 1 ); + casti_m128i( p0_index, y ) = mm128_rol_32( + casti_m128i( p0_index, y ), 1 ); + casti_m128i( p0_index, y+1 ) = mm128_rol_32( + casti_m128i( p0_index, y+1 ), 1 ); + casti_m128i( p0_index, y+2 ) = mm128_rol_32( + casti_m128i( p0_index, y+2 ), 1 ); + casti_m128i( p0_index, y+3 ) = mm128_rol_32( + casti_m128i( p0_index, y+3 ), 1 ); + casti_m128i( p0_index, y+4 ) = mm128_rol_32( + casti_m128i( p0_index, y+4 ), 1 ); + casti_m128i( p0_index, y+5 ) = mm128_rol_32( + casti_m128i( p0_index, y+5 ), 1 ); + casti_m128i( p0_index, y+6 ) = mm128_rol_32( + casti_m128i( p0_index, y+6 ), 1 ); + casti_m128i( p0_index, y+7 ) = mm128_rol_32( + casti_m128i( p0_index, y+7 ), 1 ); } - + #endif -// for (size_t y = 0; y < VH_N_SUBSET / sizeof(uint32_t); ++y) -// { -// *(p0_index + y) = ( *(p0_index + y) << 1 ) -// | ( 1 & (*(p0_index + y) >> 31) ); -// } } + sha3( &input[0], VH_HEADER_SIZE, &p1[0], VH_HASH_OUT_SIZE ); + uint32_t* p1_32 = (uint32_t*)p1; uint32_t* blob_bytes_32 = (uint32_t*)blob_bytes; uint32_t value_accumulator = 0x811c9dc5; - const uint32_t mdiv = ((blob_size - VH_HASH_OUT_SIZE) / VH_BYTE_ALIGNMENT) + 1; - for (size_t i = 0; i < VH_N_INDEXES; i++) + const uint32_t mdiv = ( ( blob_size - VH_HASH_OUT_SIZE ) + / VH_BYTE_ALIGNMENT ) + 1; +#if defined (__AVX2__) + const __m256i k = _mm256_set1_epi32( 0x1000193 ); +#elif defined(__SSE41__) + const __m128i k = _mm_set1_epi32( 0x1000193 ); +#endif + + for ( size_t i = 0; i < VH_N_INDEXES; i++ ) { - const uint32_t offset = (fnv1a(seek_indexes[i], value_accumulator) % mdiv) * VH_BYTE_ALIGNMENT / sizeof(uint32_t); + const uint32_t offset = + ( fnv1a( seek_indexes[i], value_accumulator) % mdiv ) + * ( VH_BYTE_ALIGNMENT / sizeof(uint32_t) ); const uint32_t *blob_off = blob_bytes_32 + offset; - for (size_t i2 = 0; i2 < VH_HASH_OUT_SIZE / sizeof(uint32_t); i2++) - { - const uint32_t value = *( blob_off + i2 ); - uint32_t* p1_ptr = p1_32 + i2; - *p1_ptr = fnv1a( *p1_ptr, value ); - value_accumulator = fnv1a( value_accumulator, value ); - } + + // update value accumulator for next seek index + value_accumulator = fnv1a( value_accumulator, blob_off[0] ); + value_accumulator = fnv1a( value_accumulator, blob_off[1] ); + value_accumulator = fnv1a( value_accumulator, blob_off[2] ); + 
value_accumulator = fnv1a( value_accumulator, blob_off[3] ); + value_accumulator = fnv1a( value_accumulator, blob_off[4] ); + value_accumulator = fnv1a( value_accumulator, blob_off[5] ); + value_accumulator = fnv1a( value_accumulator, blob_off[6] ); + value_accumulator = fnv1a( value_accumulator, blob_off[7] ); + +#if defined (__AVX2__) + *(__m256i*)p1_32 = _mm256_mullo_epi32( _mm256_xor_si256( + *(__m256i*)p1_32, *(__m256i*)blob_off ), k ); +#elif defined(__SSE41__) + casti_m128i( p1_32, 0 ) = _mm_mullo_epi32( _mm_xor_si128( + casti_m128i( p1_32, 0 ), casti_m128i( blob_off, 0 ) ), k ); + casti_m128i( p1_32, 1 ) = _mm_mullo_epi32( _mm_xor_si128( + casti_m128i( p1_32, 1 ), casti_m128i( blob_off, 1 ) ), k ); +#else + for ( size_t i2 = 0; i2 < VH_HASH_OUT_SIZE / sizeof(uint32_t); i2++ ) + p1_32[i2] = fnv1a( p1_32[i2], blob_off[i2] ); +#endif + } - memcpy(output, p1, VH_HASH_OUT_SIZE); + memcpy( output, p1, VH_HASH_OUT_SIZE ); } //----------------------------------------------------------------------------- diff --git a/algo/verthash/Verthash.h b/algo/verthash/Verthash.h index f81c9750..5cce653a 100644 --- a/algo/verthash/Verthash.h +++ b/algo/verthash/Verthash.h @@ -52,6 +52,8 @@ void verthash_hash(const unsigned char* blob_bytes, const unsigned char(*input)[VH_HEADER_SIZE], unsigned char(*output)[VH_HASH_OUT_SIZE]); +void verthash_sha3_512_prehash_72( const void *input ); +void verthash_sha3_512_final_8( void *hash, const uint64_t nonce ); #endif // !Verthash_INCLUDE_ONCE diff --git a/algo/verthash/tiny_sha3/sha3-4way.c b/algo/verthash/tiny_sha3/sha3-4way.c new file mode 100644 index 00000000..abbc8483 --- /dev/null +++ b/algo/verthash/tiny_sha3/sha3-4way.c @@ -0,0 +1,301 @@ +#if defined(__AVX2__) + +// sha3-4way.c +// 19-Nov-11 Markku-Juhani O. Saarinen +// vectorization by JayDDee 2021-03-27 +// +// Revised 07-Aug-15 to match with official release of FIPS PUB 202 "SHA3" +// Revised 03-Sep-15 for portability + OpenSSL - style API + +#include "sha3-4way.h" + +// constants +static const uint64_t keccakf_rndc[24] = { + 0x0000000000000001, 0x0000000000008082, 0x800000000000808a, + 0x8000000080008000, 0x000000000000808b, 0x0000000080000001, + 0x8000000080008081, 0x8000000000008009, 0x000000000000008a, + 0x0000000000000088, 0x0000000080008009, 0x000000008000000a, + 0x000000008000808b, 0x800000000000008b, 0x8000000000008089, + 0x8000000000008003, 0x8000000000008002, 0x8000000000000080, + 0x000000000000800a, 0x800000008000000a, 0x8000000080008081, + 0x8000000000008080, 0x0000000080000001, 0x8000000080008008 + }; + +void sha3_4way_keccakf( __m256i st[25] ) +{ + int i, j, r; + __m256i t, bc[5]; + + for ( r = 0; r < KECCAKF_ROUNDS; r++ ) + { + // Theta + bc[0] = _mm256_xor_si256( st[0], + mm256_xor4( st[5], st[10], st[15], st[20] ) ); + bc[1] = _mm256_xor_si256( st[1], + mm256_xor4( st[6], st[11], st[16], st[21] ) ); + bc[2] = _mm256_xor_si256( st[2], + mm256_xor4( st[7], st[12], st[17], st[22] ) ); + bc[3] = _mm256_xor_si256( st[3], + mm256_xor4( st[8], st[13], st[18], st[23] ) ); + bc[4] = _mm256_xor_si256( st[4], + mm256_xor4( st[9], st[14], st[19], st[24] ) ); + + for ( i = 0; i < 5; i++ ) + { + t = _mm256_xor_si256( bc[ (i+4) % 5 ], + mm256_rol_64( bc[ (i+1) % 5 ], 1 ) ); + st[ i ] = _mm256_xor_si256( st[ i ], t ); + st[ i+5 ] = _mm256_xor_si256( st[ i+ 5 ], t ); + st[ i+10 ] = _mm256_xor_si256( st[ i+10 ], t ); + st[ i+15 ] = _mm256_xor_si256( st[ i+15 ], t ); + st[ i+20 ] = _mm256_xor_si256( st[ i+20 ], t ); + } + + // Rho Pi +#define RHO_PI( i, c ) \ + bc[0] = st[ i ]; \ + st[ i ] = 
mm256_rol_64( t, c ); \ + t = bc[0] + + t = st[1]; + + RHO_PI( 10, 1 ); + RHO_PI( 7, 3 ); + RHO_PI( 11, 6 ); + RHO_PI( 17, 10 ); + RHO_PI( 18, 15 ); + RHO_PI( 3, 21 ); + RHO_PI( 5, 28 ); + RHO_PI( 16, 36 ); + RHO_PI( 8, 45 ); + RHO_PI( 21, 55 ); + RHO_PI( 24, 2 ); + RHO_PI( 4, 14 ); + RHO_PI( 15, 27 ); + RHO_PI( 23, 41 ); + RHO_PI( 19, 56 ); + RHO_PI( 13, 8 ); + RHO_PI( 12, 25 ); + RHO_PI( 2, 43 ); + RHO_PI( 20, 62 ); + RHO_PI( 14, 18 ); + RHO_PI( 22, 39 ); + RHO_PI( 9, 61 ); + RHO_PI( 6, 20 ); + RHO_PI( 1, 44 ); + +#undef RHO_PI + + // Chi + for ( j = 0; j < 25; j += 5 ) + { + memcpy( bc, &st[ j ], 5*32 ); + st[ j ] = _mm256_xor_si256( st[ j ], + _mm256_andnot_si256( bc[1], bc[2] ) ); + st[ j+1 ] = _mm256_xor_si256( st[ j+1 ], + _mm256_andnot_si256( bc[2], bc[3] ) ); + st[ j+2 ] = _mm256_xor_si256( st[ j+2 ], + _mm256_andnot_si256( bc[3], bc[4] ) ); + st[ j+3 ] = _mm256_xor_si256( st[ j+3 ], + _mm256_andnot_si256( bc[4], bc[0] ) ); + st[ j+4 ] = _mm256_xor_si256( st[ j+4 ], + _mm256_andnot_si256( bc[0], bc[1] ) ); + } + + // Iota + st[0] = _mm256_xor_si256( st[0], + _mm256_set1_epi64x( keccakf_rndc[ r ] ) ); + } +} + +int sha3_4way_init( sha3_4way_ctx_t *c, int mdlen ) +{ + for ( int i = 0; i < 25; i++ ) c->st[ i ] = m256_zero; + c->mdlen = mdlen; + c->rsiz = 200 - 2 * mdlen; + c->pt = 0; + return 1; +} + +int sha3_4way_update( sha3_4way_ctx_t *c, const void *data, size_t len ) +{ + size_t i; + int j = c->pt; + const int rsiz = c->rsiz / 8; + const int l = len / 8; + + for ( i = 0; i < l; i++ ) + { + c->st[ j ] = _mm256_xor_si256( c->st[ j ], + ( (const __m256i*)data )[i] ); + j++; + if ( j >= rsiz ) + { + sha3_4way_keccakf( c->st ); + j = 0; + } + } + c->pt = j; + + return 1; +} + +int sha3_4way_final( void *md, sha3_4way_ctx_t *c ) +{ + c->st[ c->pt ] = _mm256_xor_si256( c->st[ c->pt ], + m256_const1_64( 6 ) ); + c->st[ c->rsiz / 8 - 1 ] = + _mm256_xor_si256( c->st[ c->rsiz / 8 - 1 ], + m256_const1_64( 0x8000000000000000 ) ); + sha3_4way_keccakf( c->st ); + memcpy( md, c->st, c->mdlen * 4 ); + return 1; +} + +void *sha3_4way( const void *in, size_t inlen, void *md, int mdlen ) +{ + sha3_4way_ctx_t ctx; + sha3_4way_init( &ctx, mdlen); + sha3_4way_update( &ctx, in, inlen ); + sha3_4way_final( md, &ctx ); + return md; +} + +#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) + +void sha3_8way_keccakf( __m512i st[25] ) +{ + int i, j, r; + __m512i t, bc[5]; + + // actual iteration + for ( r = 0; r < KECCAKF_ROUNDS; r++ ) + { + + // Theta + for ( i = 0; i < 5; i++ ) + bc[i] = _mm512_xor_si512( st[i], + mm512_xor4( st[ i+5 ], st[ i+10 ], st[ i+15 ], st[i+20 ] ) ); + + for ( i = 0; i < 5; i++ ) + { + t = _mm512_xor_si512( bc[(i + 4) % 5], + _mm512_rol_epi64( bc[(i + 1) % 5], 1 ) ); + for ( j = 0; j < 25; j += 5 ) + st[j + i] = _mm512_xor_si512( st[j + i], t ); + } + + // Rho Pi +#define RHO_PI( i, c ) \ + bc[0] = st[ i ]; \ + st[ i ] = _mm512_rol_epi64( t, c ); \ + t = bc[0] + + t = st[1]; + + RHO_PI( 10, 1 ); + RHO_PI( 7, 3 ); + RHO_PI( 11, 6 ); + RHO_PI( 17, 10 ); + RHO_PI( 18, 15 ); + RHO_PI( 3, 21 ); + RHO_PI( 5, 28 ); + RHO_PI( 16, 36 ); + RHO_PI( 8, 45 ); + RHO_PI( 21, 55 ); + RHO_PI( 24, 2 ); + RHO_PI( 4, 14 ); + RHO_PI( 15, 27 ); + RHO_PI( 23, 41 ); + RHO_PI( 19, 56 ); + RHO_PI( 13, 8 ); + RHO_PI( 12, 25 ); + RHO_PI( 2, 43 ); + RHO_PI( 20, 62 ); + RHO_PI( 14, 18 ); + RHO_PI( 22, 39 ); + RHO_PI( 9, 61 ); + RHO_PI( 6, 20 ); + RHO_PI( 1, 44 ); + +#undef RHO_PI + + // Chi + for ( j = 0; j < 25; j += 5 ) + { + for ( i = 0; i < 5; i++ ) + bc[i] 
= st[j + i]; + for ( i = 0; i < 5; i++ ) + st[ j+i ] = _mm512_xor_si512( st[ j+i ], _mm512_andnot_si512( + bc[ (i+1) % 5 ], bc[ (i+2) % 5 ] ) ); + } + + // Iota + st[0] = _mm512_xor_si512( st[0], _mm512_set1_epi64( keccakf_rndc[r] ) ); + } +} + +// Initialize the context for SHA3 + +int sha3_8way_init( sha3_8way_ctx_t *c, int mdlen ) +{ + for ( int i = 0; i < 25; i++ ) c->st[ i ] = m512_zero; + c->mdlen = mdlen; + c->rsiz = 200 - 2 * mdlen; + c->pt = 0; + return 1; +} + +// update state with more data + +int sha3_8way_update( sha3_8way_ctx_t *c, const void *data, size_t len ) +{ + size_t i; + int j = c->pt; + const int rsiz = c->rsiz / 8; + const int l = len / 8; + + for ( i = 0; i < l; i++ ) + { + c->st[ j ] = _mm512_xor_si512( c->st[ j ], + ( (const __m512i*)data )[i] ); + j++; + if ( j >= rsiz ) + { + sha3_8way_keccakf( c->st ); + j = 0; + } + } + c->pt = j; + + return 1; +} + +// finalize and output a hash + +int sha3_8way_final( void *md, sha3_8way_ctx_t *c ) +{ + c->st[ c->pt ] = + _mm512_xor_si512( c->st[ c->pt ], + m512_const1_64( 6 ) ); + c->st[ c->rsiz / 8 - 1 ] = + _mm512_xor_si512( c->st[ c->rsiz / 8 - 1 ], + m512_const1_64( 0x8000000000000000 ) ); + sha3_8way_keccakf( c->st ); + memcpy( md, c->st, c->mdlen * 8 ); + return 1; +} + +// compute a SHA-3 hash (md) of given byte length from "in" + +void *sha3_8way( const void *in, size_t inlen, void *md, int mdlen ) +{ + sha3_8way_ctx_t sha3; + sha3_8way_init( &sha3, mdlen); + sha3_8way_update( &sha3, in, inlen ); + sha3_8way_final( md, &sha3 ); + return md; +} + +#endif // AVX512 +#endif // AVX2 diff --git a/algo/verthash/tiny_sha3/sha3-4way.h b/algo/verthash/tiny_sha3/sha3-4way.h new file mode 100644 index 00000000..6723b73b --- /dev/null +++ b/algo/verthash/tiny_sha3/sha3-4way.h @@ -0,0 +1,67 @@ +// sha3.h +// 19-Nov-11 Markku-Juhani O. Saarinen +// 2021-03-27 JayDDee +// +#ifndef SHA3_4WAY_H +#define SHA3_4WAY_H + +#include +#include +#include "simd-utils.h" + +#if defined(__cplusplus) +extern "C" { +#endif + +#ifndef KECCAKF_ROUNDS +#define KECCAKF_ROUNDS 24 +#endif + +#if defined(__AVX2__) + +typedef struct +{ + __m256i st[25]; // 64-bit words * 4 lanes + int pt, rsiz, mdlen; // these don't overflow +} sha3_4way_ctx_t __attribute__ ((aligned (64)));; + +// Compression function. +void sha3_4way_keccakf( __m256i st[25] ); + +// OpenSSL - like interfece +int sha3_4way_init( sha3_4way_ctx_t *c, int mdlen ); // mdlen = hash output in bytes +int sha3_4way_update( sha3_4way_ctx_t *c, const void *data, size_t len ); +int sha3_4way_final( void *md, sha3_4way_ctx_t *c ); // digest goes to md + +// compute a sha3 hash (md) of given byte length from "in" +void *sha3_4way( const void *in, size_t inlen, void *md, int mdlen ); + + +#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) + +// state context +typedef struct +{ + __m512i st[25]; // 64-bit words * 8 lanes + int pt, rsiz, mdlen; // these don't overflow +} sha3_8way_ctx_t __attribute__ ((aligned (64)));; + +// Compression function. 
+void sha3_8way_keccakf( __m512i st[25] ); + +// OpenSSL - like interfece +int sha3_8way_init( sha3_8way_ctx_t *c, int mdlen ); // mdlen = hash output in bytes +int sha3_8way_update( sha3_8way_ctx_t *c, const void *data, size_t len ); +int sha3_8way_final( void *md, sha3_8way_ctx_t *c ); // digest goes to md + +// compute a sha3 hash (md) of given byte length from "in" +void *sha3_8way( const void *in, size_t inlen, void *md, int mdlen ); + +#endif // AVX512 +#endif // AVX2 + +#if defined(__cplusplus) +} +#endif + +#endif diff --git a/algo/verthash/tiny_sha3/sha3.c b/algo/verthash/tiny_sha3/sha3.c index 931ae020..94b06602 100644 --- a/algo/verthash/tiny_sha3/sha3.c +++ b/algo/verthash/tiny_sha3/sha3.c @@ -5,6 +5,7 @@ // Revised 03-Sep-15 for portability + OpenSSL - style API #include "sha3.h" +#include // update the state with given number of rounds @@ -21,6 +22,7 @@ void sha3_keccakf(uint64_t st[25]) 0x000000000000800a, 0x800000008000000a, 0x8000000080008081, 0x8000000000008080, 0x0000000080000001, 0x8000000080008008 }; +/* const int keccakf_rotc[24] = { 1, 3, 6, 10, 15, 21, 28, 36, 45, 55, 2, 14, 27, 41, 56, 8, 25, 43, 62, 18, 39, 61, 20, 44 @@ -29,6 +31,7 @@ void sha3_keccakf(uint64_t st[25]) 10, 7, 11, 17, 18, 3, 5, 16, 8, 21, 24, 4, 15, 23, 19, 13, 12, 2, 20, 14, 22, 9, 6, 1 }; +*/ // variables int i, j, r; @@ -60,14 +63,50 @@ void sha3_keccakf(uint64_t st[25]) st[j + i] ^= t; } + // Rho Pi +#define RHO_PI( i, c ) \ + bc[0] = st[ i ]; \ + st[ i ] = ROTL64( t, c ); \ + t = bc[0] + t = st[1]; + + RHO_PI( 10, 1 ); + RHO_PI( 7, 3 ); + RHO_PI( 11, 6 ); + RHO_PI( 17, 10 ); + RHO_PI( 18, 15 ); + RHO_PI( 3, 21 ); + RHO_PI( 5, 28 ); + RHO_PI( 16, 36 ); + RHO_PI( 8, 45 ); + RHO_PI( 21, 55 ); + RHO_PI( 24, 2 ); + RHO_PI( 4, 14 ); + RHO_PI( 15, 27 ); + RHO_PI( 23, 41 ); + RHO_PI( 19, 56 ); + RHO_PI( 13, 8 ); + RHO_PI( 12, 25 ); + RHO_PI( 2, 43 ); + RHO_PI( 20, 62 ); + RHO_PI( 14, 18 ); + RHO_PI( 22, 39 ); + RHO_PI( 9, 61 ); + RHO_PI( 6, 20 ); + RHO_PI( 1, 44 ); + +#undef RHO_PI + +/* for (i = 0; i < 24; i++) { j = keccakf_piln[i]; bc[0] = st[j]; st[j] = ROTL64(t, keccakf_rotc[i]); t = bc[0]; } +*/ // Chi for (j = 0; j < 25; j += 5) { @@ -118,17 +157,20 @@ int sha3_init(sha3_ctx_t *c, int mdlen) int sha3_update(sha3_ctx_t *c, const void *data, size_t len) { size_t i; - int j; - - j = c->pt; - for (i = 0; i < len; i++) { - c->st.b[j++] ^= ((const uint8_t *) data)[i]; - if (j >= c->rsiz) { - sha3_keccakf(c->st.q); + int j = c->pt / 8; + const int rsiz = c->rsiz / 8; + const int l = len / 8; + + for ( i = 0; i < l; i++ ) + { + c->st.q[ j++ ] ^= ( ((const uint64_t *) data) [i] ); + if ( j >= rsiz ) + { + sha3_keccakf( c->st.q ); j = 0; } } - c->pt = j; + c->pt = j*8; return 1; } @@ -137,16 +179,10 @@ int sha3_update(sha3_ctx_t *c, const void *data, size_t len) int sha3_final(void *md, sha3_ctx_t *c) { - int i; - - c->st.b[c->pt] ^= 0x06; - c->st.b[c->rsiz - 1] ^= 0x80; + c->st.q[ c->pt / 8 ] ^= 6; + c->st.q[ c->rsiz / 8 - 1 ] ^= 0x8000000000000000; sha3_keccakf(c->st.q); - - for (i = 0; i < c->mdlen; i++) { - ((uint8_t *) md)[i] = c->st.b[i]; - } - + memcpy( md, c->st.q, c->mdlen ); return 1; } @@ -155,7 +191,6 @@ int sha3_final(void *md, sha3_ctx_t *c) void *sha3(const void *in, size_t inlen, void *md, int mdlen) { sha3_ctx_t sha3; - sha3_init(&sha3, mdlen); sha3_update(&sha3, in, inlen); sha3_final(md, &sha3); diff --git a/algo/verthash/verthash-gate.c b/algo/verthash/verthash-gate.c index 00c137fc..029ce462 100644 --- a/algo/verthash/verthash-gate.c +++ b/algo/verthash/verthash-gate.c @@ -1,6 +1,7 
@@ #include "algo-gate-api.h" #include "algo/sha/sph_sha2.h" #include "Verthash.h" +#include "tiny_sha3/sha3-4way.h" static verthash_info_t verthashInfo; @@ -12,6 +13,82 @@ static const uint8_t verthashDatFileHash_bytes[32] = 0x29, 0xec, 0xf8, 0x8f, 0x8a, 0xd4, 0x76, 0x39, 0xb6, 0xed, 0xed, 0xaf, 0xd7, 0x21, 0xaa, 0x48 }; +#if defined(__AVX2__) + +static __thread sha3_4way_ctx_t sha3_mid_ctxA; +static __thread sha3_4way_ctx_t sha3_mid_ctxB; + +#else + +static __thread sha3_ctx_t sha3_mid_ctx[8]; + +#endif + +void verthash_sha3_512_prehash_72( const void *input ) +{ +#if defined(__AVX2__) + + __m256i vin[10]; + mm256_intrlv80_4x64( vin, input ); + + sha3_4way_init( &sha3_mid_ctxA, 64 ); + sha3_4way_init( &sha3_mid_ctxB, 64 ); + + vin[0] = _mm256_add_epi8( vin[0], _mm256_set_epi64x( 4,3,2,1 ) ); + sha3_4way_update( &sha3_mid_ctxA, vin, 72 ); + + vin[0] = _mm256_add_epi8( vin[0], _mm256_set1_epi64x( 4 ) ); + sha3_4way_update( &sha3_mid_ctxB, vin, 72 ); + +#else + + char in[80] __attribute__ ((aligned (64))); + memcpy( in, input, 80 ); + for ( int i = 0; i < 8; i++ ) + { + in[0] += 1; + sha3_init( &sha3_mid_ctx[i], 64 ); + sha3_update( &sha3_mid_ctx[i], in, 72 ); + } + +#endif +} + +void verthash_sha3_512_final_8( void *hash, const uint64_t nonce ) +{ +#if defined(__AVX2__) + + __m256i vhashA[ 10 ] __attribute__ ((aligned (64))); + __m256i vhashB[ 10 ] __attribute__ ((aligned (64))); + + sha3_4way_ctx_t ctx; + __m256i vnonce = _mm256_set1_epi64x( nonce ); + + memcpy( &ctx, &sha3_mid_ctxA, sizeof ctx ); + sha3_4way_update( &ctx, &vnonce, 8 ); + sha3_4way_final( vhashA, &ctx ); + + memcpy( &ctx, &sha3_mid_ctxB, sizeof ctx ); + sha3_4way_update( &ctx, &vnonce, 8 ); + sha3_4way_final( vhashB, &ctx ); + + dintrlv_4x64( hash, hash+64, hash+128, hash+192, vhashA, 512 ); + dintrlv_4x64( hash+256, hash+320, hash+384, hash+448, vhashB, 512 ); + +#else + + for ( int i = 0; i < 8; i++ ) + { + sha3_ctx_t ctx; + memcpy( &ctx, &sha3_mid_ctx[i], sizeof ctx ); + sha3_update( &ctx, &nonce, 8 ); + sha3_final( hash + i*64, &ctx ); + } + +#endif +} + + int scanhash_verthash( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ) { @@ -26,6 +103,8 @@ int scanhash_verthash( struct work *work, uint32_t max_nonce, const bool bench = opt_benchmark; mm128_bswap32_80( edata, pdata ); + verthash_sha3_512_prehash_72( edata ); + do { edata[19] = n; @@ -51,15 +130,14 @@ bool register_verthash_algo( algo_gate_t* gate ) opt_target_factor = 256.0; gate->scanhash = (void*)&scanhash_verthash; + gate->optimizations = AVX2_OPT; - // verthash data file char *verthash_data_file = opt_data_file ? opt_data_file : default_verthash_data_file; int vhLoadResult = verthash_info_init( &verthashInfo, verthash_data_file ); if (vhLoadResult == 0) // No Error { - // and verify data file(if it was enabled) if ( opt_verify ) { uint8_t vhDataFileHash[32] = { 0 }; @@ -78,7 +156,6 @@ bool register_verthash_algo( algo_gate_t* gate ) } } else - { // Handle Verthash error codes if ( vhLoadResult == 1 ) diff --git a/configure b/configure index 00998223..e18537aa 100755 --- a/configure +++ b/configure @@ -1,6 +1,6 @@ #! /bin/sh # Guess values for system-dependent variables and create Makefiles. -# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.16.1. +# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.16.2. # # # Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc. @@ -577,8 +577,8 @@ MAKEFLAGS= # Identity of this package. 
PACKAGE_NAME='cpuminer-opt' PACKAGE_TARNAME='cpuminer-opt' -PACKAGE_VERSION='3.16.1' -PACKAGE_STRING='cpuminer-opt 3.16.1' +PACKAGE_VERSION='3.16.2' +PACKAGE_STRING='cpuminer-opt 3.16.2' PACKAGE_BUGREPORT='' PACKAGE_URL='' @@ -1332,7 +1332,7 @@ if test "$ac_init_help" = "long"; then # Omit some internal or obsolete options to make the list less imposing. # This message is too long to be a string in the A/UX 3.1 sh. cat <<_ACEOF -\`configure' configures cpuminer-opt 3.16.1 to adapt to many kinds of systems. +\`configure' configures cpuminer-opt 3.16.2 to adapt to many kinds of systems. Usage: $0 [OPTION]... [VAR=VALUE]... @@ -1404,7 +1404,7 @@ fi if test -n "$ac_init_help"; then case $ac_init_help in - short | recursive ) echo "Configuration of cpuminer-opt 3.16.1:";; + short | recursive ) echo "Configuration of cpuminer-opt 3.16.2:";; esac cat <<\_ACEOF @@ -1509,7 +1509,7 @@ fi test -n "$ac_init_help" && exit $ac_status if $ac_init_version; then cat <<\_ACEOF -cpuminer-opt configure 3.16.1 +cpuminer-opt configure 3.16.2 generated by GNU Autoconf 2.69 Copyright (C) 2012 Free Software Foundation, Inc. @@ -2012,7 +2012,7 @@ cat >config.log <<_ACEOF This file contains any messages produced by compilers while running configure, to aid debugging if configure makes a mistake. -It was created by cpuminer-opt $as_me 3.16.1, which was +It was created by cpuminer-opt $as_me 3.16.2, which was generated by GNU Autoconf 2.69. Invocation command line was $ $0 $@ @@ -2993,7 +2993,7 @@ fi # Define the identity of the package. PACKAGE='cpuminer-opt' - VERSION='3.16.1' + VERSION='3.16.2' cat >>confdefs.h <<_ACEOF @@ -6690,7 +6690,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 # report actual input values of CONFIG_FILES etc. instead of their # values after options handling. ac_log=" -This file was extended by cpuminer-opt $as_me 3.16.1, which was +This file was extended by cpuminer-opt $as_me 3.16.2, which was generated by GNU Autoconf 2.69. Invocation command line was CONFIG_FILES = $CONFIG_FILES @@ -6756,7 +6756,7 @@ _ACEOF cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`" ac_cs_version="\\ -cpuminer-opt config.status 3.16.1 +cpuminer-opt config.status 3.16.2 configured by $0, generated by GNU Autoconf 2.69, with options \\"\$ac_cs_config\\" diff --git a/configure.ac b/configure.ac index 29b3b9a7..5ee7b2f7 100644 --- a/configure.ac +++ b/configure.ac @@ -1,4 +1,4 @@ -AC_INIT([cpuminer-opt], [3.16.1]) +AC_INIT([cpuminer-opt], [3.16.2]) AC_PREREQ([2.59c]) AC_CANONICAL_SYSTEM diff --git a/cpu-miner.c b/cpu-miner.c index 0ed29b92..e52168bd 100644 --- a/cpu-miner.c +++ b/cpu-miner.c @@ -555,7 +555,11 @@ static bool gbt_work_decode( const json_t *val, struct work *work ) if ( !s ) continue; if ( !strcmp( s, "segwit" ) || !strcmp( s, "!segwit" ) ) + { segwit = true; + if ( opt_debug ) + applog( LOG_INFO, "GBT: SegWit is enabled" ); + } } } // Segwit END @@ -954,25 +958,25 @@ void scale_hash_for_display ( double* hashrate, char* prefix ) else { *prefix = 'Y'; *hashrate /= 1e24; } } -static inline void sprintf_et( char *str, int seconds ) +static inline void sprintf_et( char *str, long unsigned int seconds ) { - // sprintf doesn't like uint64_t, Linux thinks it's long, Windows long long. 
- unsigned int min = seconds / 60; - unsigned int sec = seconds % 60; - unsigned int hrs = min / 60; + long unsigned int min = seconds / 60; + long unsigned int sec = seconds % 60; + long unsigned int hrs = min / 60; + if ( unlikely( hrs ) ) { - unsigned int years = hrs / (24*365); - unsigned int days = hrs / 24; - if ( years ) - sprintf( str, "%uy%ud", years, years % 365 ); - else if ( days ) //0d00h - sprintf( str, "%ud%02uh", days, hrs % 24 ); + long unsigned int days = hrs / 24; + long unsigned int years = days / 365; + if ( years ) // 0y000d + sprintf( str, "%luy%lud", years, years % 365 ); + else if ( days ) // 0d00h + sprintf( str, "%lud%02luh", days, hrs % 24 ); else // 0h00m - sprintf( str, "%uh%02um", hrs, min % 60 ); + sprintf( str, "%luh%02lum", hrs, min % 60 ); } else // 0m00s - sprintf( str, "%um%02us", min, sec ); + sprintf( str, "%lum%02lus", min, sec ); } const long double exp32 = EXP32; // 2**32 @@ -1071,7 +1075,8 @@ void report_summary_log( bool force ) double share_time = (double)et.tv_sec + (double)et.tv_usec / 1e6; double ghrate = global_hashrate; - double shrate = safe_div( exp32 * last_targetdiff * (double)(accepts), + double target_diff = exp32 * last_targetdiff; + double shrate = safe_div( target_diff * (double)(accepts), share_time, 0. ); double sess_hrate = safe_div( exp32 * norm_diff_sum, (double)uptime.tv_sec, 0. ); @@ -1099,12 +1104,12 @@ void report_summary_log( bool force ) if ( accepted_share_count < submitted_share_count ) { - double ltd = exp32 * last_targetdiff; double lost_ghrate = uptime.tv_sec == 0 ? 0. - : ltd * (double)(submitted_share_count - accepted_share_count ) + : target_diff + * (double)(submitted_share_count - accepted_share_count ) / (double)uptime.tv_sec; double lost_shrate = share_time == 0. ? 0. 
- : ltd * (double)(submits - accepts ) / share_time; + : target_diff * (double)(submits - accepts ) / share_time; char lshr_units[4] = {0}; char lghr_units[4] = {0}; scale_hash_for_display( &lost_shrate, lshr_units ); @@ -2437,10 +2442,14 @@ static void *miner_thread( void *userdata ) #if ((defined(_WIN64) || defined(__WINDOWS__)) || defined(_WIN32)) applog( LOG_NOTICE, "Total: %s %sH/s", hr, hr_units ); #else - applog( LOG_NOTICE, "Total: %s %sH/s, CPU temp: %dC", - hr, hr_units, (uint32_t)cpu_temp(0) ); + float lo_freq = 0., hi_freq = 0.; + linux_cpu_hilo_freq( &lo_freq, &hi_freq ); + applog( LOG_NOTICE, + "Total: %s %sH/s, Temp: %dC, Freq: %.3f/%.3f GHz", + hr, hr_units, (uint32_t)cpu_temp(0), lo_freq / 1e6, + hi_freq / 1e6 ); #endif - } + } } // benchmark // conditional mining diff --git a/miner.h b/miner.h index 9e2749a3..bea4f68a 100644 --- a/miner.h +++ b/miner.h @@ -900,7 +900,7 @@ Options:\n\ --benchmark run in offline benchmark mode\n\ --cpu-affinity set process affinity to cpu core(s), mask 0x3 for cores 0 and 1\n\ --cpu-priority set process priority (default: 0 idle, 2 normal to 5 highest)\n\ - -b, --api-bind IP/Port for the miner API (default: 127.0.0.1:4048)\n\ + -b, --api-bind=address[:port] IP address for the miner API, default port is 4048)\n\ --api-remote Allow remote control\n\ --max-temp=N Only mine if cpu temp is less than specified value (linux)\n\ --max-rate=N[KMG] Only mine if net hashrate is less than specified value\n\ diff --git a/simd-utils/intrlv.h b/simd-utils/intrlv.h index 93a5e19b..cedcae34 100644 --- a/simd-utils/intrlv.h +++ b/simd-utils/intrlv.h @@ -1225,37 +1225,6 @@ static inline void intrlv_4x64( void *dst, const void *src0, d[31] = _mm_unpackhi_epi64( s2[7], s3[7] ); } -/* -static inline void intrlv_4x64( void *dst, void *src0, - void *src1, void *src2, void *src3, int bit_len ) -{ - uint64_t *d = (uint64_t*)dst; - uint64_t *s0 = (uint64_t*)src0; - uint64_t *s1 = (uint64_t*)src1; - uint64_t *s2 = (uint64_t*)src2; - uint64_t *s3 = (uint64_t*)src3; - d[ 0] = s0[ 0]; d[ 1] = s1[ 0]; d[ 2] = s2[ 0]; d[ 3] = s3[ 0]; - d[ 4] = s0[ 1]; d[ 5] = s1[ 1]; d[ 6] = s2[ 1]; d[ 7] = s3[ 1]; - d[ 8] = s0[ 2]; d[ 9] = s1[ 2]; d[ 10] = s2[ 2]; d[ 11] = s3[ 2]; - d[ 12] = s0[ 3]; d[ 13] = s1[ 3]; d[ 14] = s2[ 3]; d[ 15] = s3[ 3]; - if ( bit_len <= 256 ) return; - d[ 16] = s0[ 4]; d[ 17] = s1[ 4]; d[ 18] = s2[ 4]; d[ 19] = s3[ 4]; - d[ 20] = s0[ 5]; d[ 21] = s1[ 5]; d[ 22] = s2[ 5]; d[ 23] = s3[ 5]; - d[ 24] = s0[ 6]; d[ 25] = s1[ 6]; d[ 26] = s2[ 6]; d[ 27] = s3[ 6]; - d[ 28] = s0[ 7]; d[ 29] = s1[ 7]; d[ 30] = s2[ 7]; d[ 31] = s3[ 7]; - if ( bit_len <= 512 ) return; - d[ 32] = s0[ 8]; d[ 33] = s1[ 8]; d[ 34] = s2[ 8]; d[ 35] = s3[ 8]; - d[ 36] = s0[ 9]; d[ 37] = s1[ 9]; d[ 38] = s2[ 9]; d[ 39] = s3[ 9]; - if ( bit_len <= 640 ) return; - d[ 40] = s0[10]; d[ 41] = s1[10]; d[ 42] = s2[10]; d[ 43] = s3[10]; - d[ 44] = s0[11]; d[ 45] = s1[11]; d[ 46] = s2[11]; d[ 47] = s3[11]; - d[ 48] = s0[12]; d[ 49] = s1[12]; d[ 50] = s2[12]; d[ 51] = s3[12]; - d[ 52] = s0[13]; d[ 53] = s1[13]; d[ 54] = s2[13]; d[ 55] = s3[13]; - d[ 56] = s0[14]; d[ 57] = s1[14]; d[ 58] = s2[14]; d[ 59] = s3[14]; - d[ 60] = s0[15]; d[ 61] = s1[15]; d[ 62] = s2[15]; d[ 63] = s3[15]; -} -*/ - static inline void intrlv_4x64_512( void *dst, const void *src0, const void *src1, const void *src2, const void *src3 ) { @@ -1282,26 +1251,6 @@ static inline void intrlv_4x64_512( void *dst, const void *src0, d[15] = _mm_unpackhi_epi64( s2[3], s3[3] ); } -/* -static inline void intrlv_4x64_512( void *dst, const void 
*src0, - const void *src1, const void *src2, const void *src3 ) -{ - uint64_t *d = (uint64_t*)dst; - const uint64_t *s0 = (const uint64_t*)src0; - const uint64_t *s1 = (const uint64_t*)src1; - const uint64_t *s2 = (const uint64_t*)src2; - const uint64_t *s3 = (const uint64_t*)src3; - d[ 0] = s0[ 0]; d[ 1] = s1[ 0]; d[ 2] = s2[ 0]; d[ 3] = s3[ 0]; - d[ 4] = s0[ 1]; d[ 5] = s1[ 1]; d[ 6] = s2[ 1]; d[ 7] = s3[ 1]; - d[ 8] = s0[ 2]; d[ 9] = s1[ 2]; d[ 10] = s2[ 2]; d[ 11] = s3[ 2]; - d[ 12] = s0[ 3]; d[ 13] = s1[ 3]; d[ 14] = s2[ 3]; d[ 15] = s3[ 3]; - d[ 16] = s0[ 4]; d[ 17] = s1[ 4]; d[ 18] = s2[ 4]; d[ 19] = s3[ 4]; - d[ 20] = s0[ 5]; d[ 21] = s1[ 5]; d[ 22] = s2[ 5]; d[ 23] = s3[ 5]; - d[ 24] = s0[ 6]; d[ 25] = s1[ 6]; d[ 26] = s2[ 6]; d[ 27] = s3[ 6]; - d[ 28] = s0[ 7]; d[ 29] = s1[ 7]; d[ 30] = s2[ 7]; d[ 31] = s3[ 7]; -} -*/ - static inline void dintrlv_4x64( void *dst0, void *dst1, void *dst2, void *dst3, const void *src, const int bit_len ) { @@ -1347,38 +1296,6 @@ static inline void dintrlv_4x64( void *dst0, void *dst1, void *dst2, d3[7] = _mm_unpackhi_epi64( s[29], s[31] ); } - -/* -static inline void dintrlv_4x64( void *dst0, void *dst1, void *dst2, - void *dst3, const void *src, int bit_len ) -{ - uint64_t *d0 = (uint64_t*)dst0; - uint64_t *d1 = (uint64_t*)dst1; - uint64_t *d2 = (uint64_t*)dst2; - uint64_t *d3 = (uint64_t*)dst3; - const uint64_t *s = (const uint64_t*)src; - d0[ 0] = s[ 0]; d1[ 0] = s[ 1]; d2[ 0] = s[ 2]; d3[ 0] = s[ 3]; - d0[ 1] = s[ 4]; d1[ 1] = s[ 5]; d2[ 1] = s[ 6]; d3[ 1] = s[ 7]; - d0[ 2] = s[ 8]; d1[ 2] = s[ 9]; d2[ 2] = s[10]; d3[ 2] = s[11]; - d0[ 3] = s[12]; d1[ 3] = s[13]; d2[ 3] = s[14]; d3[ 3] = s[15]; - if ( bit_len <= 256 ) return; - d0[ 4] = s[16]; d1[ 4] = s[17]; d2[ 4] = s[18]; d3[ 4] = s[19]; - d0[ 5] = s[20]; d1[ 5] = s[21]; d2[ 5] = s[22]; d3[ 5] = s[23]; - d0[ 6] = s[24]; d1[ 6] = s[25]; d2[ 6] = s[26]; d3[ 6] = s[27]; - d0[ 7] = s[28]; d1[ 7] = s[29]; d2[ 7] = s[30]; d3[ 7] = s[31]; - if ( bit_len <= 512 ) return; - d0[ 8] = s[32]; d1[ 8] = s[33]; d2[ 8] = s[34]; d3[ 8] = s[35]; - d0[ 9] = s[36]; d1[ 9] = s[37]; d2[ 9] = s[38]; d3[ 9] = s[39]; - if ( bit_len <= 640 ) return; - d0[10] = s[40]; d1[10] = s[41]; d2[10] = s[42]; d3[10] = s[43]; - d0[11] = s[44]; d1[11] = s[45]; d2[11] = s[46]; d3[11] = s[47]; - d0[12] = s[48]; d1[12] = s[49]; d2[12] = s[50]; d3[12] = s[51]; - d0[13] = s[52]; d1[13] = s[53]; d2[13] = s[54]; d3[13] = s[55]; - d0[14] = s[56]; d1[14] = s[57]; d2[14] = s[58]; d3[14] = s[59]; - d0[15] = s[60]; d1[15] = s[61]; d2[15] = s[62]; d3[15] = s[63]; -} -*/ - static inline void dintrlv_4x64_512( void *dst0, void *dst1, void *dst2, void *dst3, const void *src ) { @@ -1405,26 +1322,6 @@ static inline void dintrlv_4x64_512( void *dst0, void *dst1, void *dst2, d3[3] = _mm_unpackhi_epi64( s[13], s[15] ); } -/* -static inline void dintrlv_4x64_512( void *dst0, void *dst1, void *dst2, - void *dst3, const void *src ) -{ - uint64_t *d0 = (uint64_t*)dst0; - uint64_t *d1 = (uint64_t*)dst1; - uint64_t *d2 = (uint64_t*)dst2; - uint64_t *d3 = (uint64_t*)dst3; - const uint64_t *s = (const uint64_t*)src; - d0[ 0] = s[ 0]; d1[ 0] = s[ 1]; d2[ 0] = s[ 2]; d3[ 0] = s[ 3]; - d0[ 1] = s[ 4]; d1[ 1] = s[ 5]; d2[ 1] = s[ 6]; d3[ 1] = s[ 7]; - d0[ 2] = s[ 8]; d1[ 2] = s[ 9]; d2[ 2] = s[10]; d3[ 2] = s[11]; - d0[ 3] = s[12]; d1[ 3] = s[13]; d2[ 3] = s[14]; d3[ 3] = s[15]; - d0[ 4] = s[16]; d1[ 4] = s[17]; d2[ 4] = s[18]; d3[ 4] = s[19]; - d0[ 5] = s[20]; d1[ 5] = s[21]; d2[ 5] = s[22]; d3[ 5] = s[23]; - d0[ 6] = s[24]; d1[ 6] = s[25]; d2[ 6] = s[26]; d3[ 
6] = s[27]; - d0[ 7] = s[28]; d1[ 7] = s[29]; d2[ 7] = s[30]; d3[ 7] = s[31]; -} -*/ - static inline void extr_lane_4x64( void *d, const void *s, const int lane, const int bit_len ) { @@ -1440,9 +1337,41 @@ static inline void extr_lane_4x64( void *d, const void *s, } #if defined(__AVX2__) +// Doesn't really need AVX2, just SSSE3, but is only used with AVX2 code. + +static inline void mm256_intrlv80_4x64( void *d, const void *src ) +{ + __m128i s0 = casti_m128i( src,0 ); + __m128i s1 = casti_m128i( src,1 ); + __m128i s2 = casti_m128i( src,2 ); + __m128i s3 = casti_m128i( src,3 ); + __m128i s4 = casti_m128i( src,4 ); + + casti_m128i( d, 0 ) = + casti_m128i( d, 1 ) = _mm_shuffle_epi32( s0, 0x44 ); + casti_m128i( d, 2 ) = + casti_m128i( d, 3 ) = _mm_shuffle_epi32( s0, 0xee ); + + casti_m128i( d, 4 ) = + casti_m128i( d, 5 ) = _mm_shuffle_epi32( s1, 0x44 ); + casti_m128i( d, 6 ) = + casti_m128i( d, 7 ) = _mm_shuffle_epi32( s1, 0xee ); + + casti_m128i( d, 8 ) = + casti_m128i( d, 9 ) = _mm_shuffle_epi32( s2, 0x44 ); + casti_m128i( d, 10 ) = + casti_m128i( d, 11 ) = _mm_shuffle_epi32( s2, 0xee ); + + casti_m128i( d, 12 ) = + casti_m128i( d, 13 ) = _mm_shuffle_epi32( s3, 0x44 ); + casti_m128i( d, 14 ) = + casti_m128i( d, 15 ) = _mm_shuffle_epi32( s3, 0xee ); -// There a alignment problems with the source buffer on Wwindows, -// can't use 256 bit bswap. + casti_m128i( d, 16 ) = + casti_m128i( d, 17 ) = _mm_shuffle_epi32( s4, 0x44 ); + casti_m128i( d, 18 ) = + casti_m128i( d, 19 ) = _mm_shuffle_epi32( s4, 0xee ); +} static inline void mm256_bswap32_intrlv80_4x64( void *d, const void *src ) { @@ -1636,40 +1565,6 @@ static inline void intrlv_8x64_512( void *dst, const void *src0, d[31] = _mm_unpackhi_epi64( s6[3], s7[3] ); } -/* -#define ILEAVE_8x64( i ) do \ -{ \ - uint64_t *d = (uint64_t*)(dst) + ( (i) << 3 ); \ - d[0] = *( (const uint64_t*)(s0) +(i) ); \ - d[1] = *( (const uint64_t*)(s1) +(i) ); \ - d[2] = *( (const uint64_t*)(s2) +(i) ); \ - d[3] = *( (const uint64_t*)(s3) +(i) ); \ - d[4] = *( (const uint64_t*)(s4) +(i) ); \ - d[5] = *( (const uint64_t*)(s5) +(i) ); \ - d[6] = *( (const uint64_t*)(s6) +(i) ); \ - d[7] = *( (const uint64_t*)(s7) +(i) ); \ -} while(0) - -static inline void intrlv_8x64( void *dst, const void *s0, - const void *s1, const void *s2, const void *s3, const void *s4, - const void *s5, const void *s6, const void *s7, int bit_len ) -{ - ILEAVE_8x64( 0 ); ILEAVE_8x64( 1 ); - ILEAVE_8x64( 2 ); ILEAVE_8x64( 3 ); - if ( bit_len <= 256 ) return; - ILEAVE_8x64( 4 ); ILEAVE_8x64( 5 ); - ILEAVE_8x64( 6 ); ILEAVE_8x64( 7 ); - if ( bit_len <= 512 ) return; - ILEAVE_8x64( 8 ); ILEAVE_8x64( 9 ); - if ( bit_len <= 640 ) return; - ILEAVE_8x64( 10 ); ILEAVE_8x64( 11 ); - ILEAVE_8x64( 12 ); ILEAVE_8x64( 13 ); - ILEAVE_8x64( 14 ); ILEAVE_8x64( 15 ); -} - -#undef ILEAVE_8x64 -*/ - static inline void dintrlv_8x64( void *dst0, void *dst1, void *dst2, void *dst3, void *dst4, void *dst5, void *dst6, void *dst7, @@ -1815,39 +1710,6 @@ static inline void dintrlv_8x64_512( void *dst0, void *dst1, void *dst2, d7[3] = _mm_unpackhi_epi64( s[27], s[31] ); } -/* -#define DLEAVE_8x64( i ) do \ -{ \ - const uint64_t *s = (const uint64_t*)(src) + ( (i) << 3 ); \ - *( (uint64_t*)(d0) +(i) ) = s[0]; \ - *( (uint64_t*)(d1) +(i) ) = s[1]; \ - *( (uint64_t*)(d2) +(i) ) = s[2]; \ - *( (uint64_t*)(d3) +(i) ) = s[3]; \ - *( (uint64_t*)(d4) +(i) ) = s[4]; \ - *( (uint64_t*)(d5) +(i) ) = s[5]; \ - *( (uint64_t*)(d6) +(i) ) = s[6]; \ - *( (uint64_t*)(d7) +(i) ) = s[7]; \ -} while(0) - -static inline void dintrlv_8x64( 
void *d0, void *d1, void *d2, void *d3, - void *d4, void *d5, void *d6, void *d7, const void *src, int bit_len ) -{ - DLEAVE_8x64( 0 ); DLEAVE_8x64( 1 ); - DLEAVE_8x64( 2 ); DLEAVE_8x64( 3 ); - if ( bit_len <= 256 ) return; - DLEAVE_8x64( 4 ); DLEAVE_8x64( 5 ); - DLEAVE_8x64( 6 ); DLEAVE_8x64( 7 ); - if ( bit_len <= 512 ) return; - DLEAVE_8x64( 8 ); DLEAVE_8x64( 9 ); - if ( bit_len <= 640 ) return; - DLEAVE_8x64( 10 ); DLEAVE_8x64( 11 ); - DLEAVE_8x64( 12 ); DLEAVE_8x64( 13 ); - DLEAVE_8x64( 14 ); DLEAVE_8x64( 15 ); -} - -#undef DLEAVE_8x64 -*/ - static inline void extr_lane_8x64( void *d, const void *s, const int lane, const int bit_len ) { diff --git a/simd-utils/simd-128.h b/simd-utils/simd-128.h index 35be6109..e166b14d 100644 --- a/simd-utils/simd-128.h +++ b/simd-utils/simd-128.h @@ -178,7 +178,7 @@ static inline __m128i mm128_mask_32( const __m128i v, const int m ) // Basic operations without equivalent SIMD intrinsic // Bitwise not (~v) -#define mm128_not( v ) _mm_xor_si128( (v), m128_neg1 ) +#define mm128_not( v ) _mm_xor_si128( v, m128_neg1 ) // Unary negation of elements (-v) #define mm128_negate_64( v ) _mm_sub_epi64( m128_zero, v ) @@ -263,7 +263,8 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n ) _mm_or_si128( _mm_slli_epi32( v, c ), _mm_srli_epi32( v, 32-(c) ) ) -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(__AVX512VL__) +//#if defined(__AVX512F__) && defined(__AVX512VL__) #define mm128_ror_64 _mm_ror_epi64 #define mm128_rol_64 _mm_rol_epi64 @@ -291,16 +292,13 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n ) #define mm128_swap_64( v ) _mm_shuffle_epi32( v, 0x4e ) #define mm128_ror_1x32( v ) _mm_shuffle_epi32( v, 0x39 ) #define mm128_rol_1x32( v ) _mm_shuffle_epi32( v, 0x93 ) -//#define mm128_swap_64( v ) _mm_alignr_epi8( v, v, 8 ) -//#define mm128_ror_1x32( v ) _mm_alignr_epi8( v, v, 4 ) -//#define mm128_rol_1x32( v ) _mm_alignr_epi8( v, v, 12 ) // Swap 32 bit elements in 64 bit lanes #define mm128_swap64_32( v ) _mm_shuffle_epi32( v, 0xb1 ) #if defined(__SSSE3__) -// Rotate right by c bytes +// Rotate right by c bytes, no SSE2 equivalent. static inline __m128i mm128_ror_x8( const __m128i v, const int c ) { return _mm_alignr_epi8( v, v, c ); } diff --git a/simd-utils/simd-256.h b/simd-utils/simd-256.h index 635eb4f2..7a37012a 100644 --- a/simd-utils/simd-256.h +++ b/simd-utils/simd-256.h @@ -18,7 +18,7 @@ #define mm256_mov64_256( i ) _mm256_castsi128_si256( mm128_mov64_128( i ) ) #define mm256_mov32_256( i ) _mm256_castsi128_si256( mm128_mov32_128( i ) ) -// Mo0ve low element of vector to integer. +// Move low element of vector to integer. #define mm256_mov256_64( v ) mm128_mov128_64( _mm256_castsi256_si128( v ) ) #define mm256_mov256_32( v ) mm128_mov128_32( _mm256_castsi256_si128( v ) ) @@ -42,7 +42,7 @@ static inline __m256i m256_const_64( const uint64_t i3, const uint64_t i2, // 128 bit vector argument #define m256_const1_128( v ) \ _mm256_permute4x64_epi64( _mm256_castsi128_si256( v ), 0x44 ) -// 64 bit integer argument +// 64 bit integer argument zero extended to 128 bits. 
#define m256_const1_i128( i ) m256_const1_128( mm128_mov64_128( i ) ) #define m256_const1_64( i ) _mm256_broadcastq_epi64( mm128_mov64_128( i ) ) #define m256_const1_32( i ) _mm256_broadcastd_epi32( mm128_mov32_128( i ) ) @@ -168,7 +168,10 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n ) _mm256_srli_epi32( v, 32-(c) ) ) -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +// The spec says both F & VL are required, but just in case AMD +// decides to implement ROL/R without AVX512F. +#if defined(__AVX512VL__) +//#if defined(__AVX512F__) && defined(__AVX512VL__) // AVX512, control must be 8 bit immediate. @@ -198,21 +201,14 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n ) // // Rotate elements accross all lanes. // -// AVX2 has no full vector permute for elements less than 32 bits. -// AVX512 has finer granularity full vector permutes. -// AVX512 has full vector alignr which might be faster, especially for 32 bit - - -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) - -static inline __m256i mm256_swap_128( const __m256i v ) -{ return _mm256_alignr_epi64( v, v, 2 ); } +// Swap 128 bit elements in 256 bit vector. +#define mm256_swap_128( v ) _mm256_permute4x64_epi64( v, 0x4e ) -static inline __m256i mm256_ror_1x64( const __m256i v ) -{ return _mm256_alignr_epi64( v, v, 1 ); } +// Rotate 256 bit vector by one 64 bit element +#define mm256_ror_1x64( v ) _mm256_permute4x64_epi64( v, 0x39 ) +#define mm256_rol_1x64( v ) _mm256_permute4x64_epi64( v, 0x93 ) -static inline __m256i mm256_rol_1x64( const __m256i v ) -{ return _mm256_alignr_epi64( v, v, 3 ); } +#if defined(__AVX512F__) && defined(__AVX512VL__) static inline __m256i mm256_ror_1x32( const __m256i v ) { return _mm256_alignr_epi32( v, v, 1 ); } @@ -220,21 +216,8 @@ static inline __m256i mm256_ror_1x32( const __m256i v ) static inline __m256i mm256_rol_1x32( const __m256i v ) { return _mm256_alignr_epi32( v, v, 7 ); } -static inline __m256i mm256_ror_3x32( const __m256i v ) -{ return _mm256_alignr_epi32( v, v, 3 ); } - -static inline __m256i mm256_rol_3x32( const __m256i v ) -{ return _mm256_alignr_epi32( v, v, 5 ); } - #else // AVX2 -// Swap 128 bit elements in 256 bit vector. -#define mm256_swap_128( v ) _mm256_permute4x64_epi64( v, 0x4e ) - -// Rotate 256 bit vector by one 64 bit element -#define mm256_ror_1x64( v ) _mm256_permute4x64_epi64( v, 0x39 ) -#define mm256_rol_1x64( v ) _mm256_permute4x64_epi64( v, 0x93 ) - // Rotate 256 bit vector by one 32 bit element. #define mm256_ror_1x32( v ) \ _mm256_permutevar8x32_epi32( v, \ @@ -246,17 +229,6 @@ static inline __m256i mm256_rol_3x32( const __m256i v ) m256_const_64( 0x0000000600000005, 0x0000000400000003, \ 0x0000000200000001, 0x0000000000000007 ) -// Rotate 256 bit vector by three 32 bit elements (96 bits). 
-#define mm256_ror_3x32( v ) \ - _mm256_permutevar8x32_epi32( v, \ - m256_const_64( 0x0000000200000001, 0x0000000000000007, \ - 0x0000000600000005, 0x0000000400000003 ) - -#define mm256_rol_3x32( v ) \ - _mm256_permutevar8x32_epi32( v, \ - m256_const_64( 0x0000000400000003, 0x0000000200000001, \ - 0x0000000000000007, 0x0000000600000005 ) - #endif // AVX512 else AVX2 // diff --git a/util.c b/util.c index 6a7a0503..a3b764e2 100644 --- a/util.c +++ b/util.c @@ -943,6 +943,140 @@ bool jobj_binary(const json_t *obj, const char *key, void *buf, size_t buflen) return true; } +static uint32_t bech32_polymod_step(uint32_t pre) { + uint8_t b = pre >> 25; + return ((pre & 0x1FFFFFF) << 5) ^ + (-((b >> 0) & 1) & 0x3b6a57b2UL) ^ + (-((b >> 1) & 1) & 0x26508e6dUL) ^ + (-((b >> 2) & 1) & 0x1ea119faUL) ^ + (-((b >> 3) & 1) & 0x3d4233ddUL) ^ + (-((b >> 4) & 1) & 0x2a1462b3UL); +} + +static const int8_t bech32_charset_rev[128] = { + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + 15, -1, 10, 17, 21, 20, 26, 30, 7, 5, -1, -1, -1, -1, -1, -1, + -1, 29, -1, 24, 13, 25, 9, 8, 23, -1, 18, 22, 31, 27, 19, -1, + 1, 0, 3, 16, 11, 28, 12, 14, 6, 4, 2, -1, -1, -1, -1, -1, + -1, 29, -1, 24, 13, 25, 9, 8, 23, -1, 18, 22, 31, 27, 19, -1, + 1, 0, 3, 16, 11, 28, 12, 14, 6, 4, 2, -1, -1, -1, -1, -1 +}; + +static bool bech32_decode(char *hrp, uint8_t *data, size_t *data_len, const char *input) { + uint32_t chk = 1; + size_t i; + size_t input_len = strlen(input); + size_t hrp_len; + int have_lower = 0, have_upper = 0; + if (input_len < 8 || input_len > 90) { + return false; + } + *data_len = 0; + while (*data_len < input_len && input[(input_len - 1) - *data_len] != '1') { + ++(*data_len); + } + hrp_len = input_len - (1 + *data_len); + if (1 + *data_len >= input_len || *data_len < 6) { + return false; + } + *(data_len) -= 6; + for (i = 0; i < hrp_len; ++i) { + int ch = input[i]; + if (ch < 33 || ch > 126) { + return false; + } + if (ch >= 'a' && ch <= 'z') { + have_lower = 1; + } else if (ch >= 'A' && ch <= 'Z') { + have_upper = 1; + ch = (ch - 'A') + 'a'; + } + hrp[i] = ch; + chk = bech32_polymod_step(chk) ^ (ch >> 5); + } + hrp[i] = 0; + chk = bech32_polymod_step(chk); + for (i = 0; i < hrp_len; ++i) { + chk = bech32_polymod_step(chk) ^ (input[i] & 0x1f); + } + ++i; + while (i < input_len) { + int v = (input[i] & 0x80) ? 
-1 : bech32_charset_rev[(int)input[i]]; + if (input[i] >= 'a' && input[i] <= 'z') have_lower = 1; + if (input[i] >= 'A' && input[i] <= 'Z') have_upper = 1; + if (v == -1) { + return false; + } + chk = bech32_polymod_step(chk) ^ v; + if (i + 6 < input_len) { + data[i - (1 + hrp_len)] = v; + } + ++i; + } + if (have_lower && have_upper) { + return false; + } + return chk == 1; +} + +static bool convert_bits(uint8_t *out, size_t *outlen, int outbits, const uint8_t *in, size_t inlen, int inbits, int pad) { + uint32_t val = 0; + int bits = 0; + uint32_t maxv = (((uint32_t)1) << outbits) - 1; + while (inlen--) { + val = (val << inbits) | *(in++); + bits += inbits; + while (bits >= outbits) { + bits -= outbits; + out[(*outlen)++] = (val >> bits) & maxv; + } + } + if (pad) { + if (bits) { + out[(*outlen)++] = (val << (outbits - bits)) & maxv; + } + } else if (((val << (outbits - bits)) & maxv) || bits >= inbits) { + return false; + } + return true; +} + +static bool segwit_addr_decode(int *witver, uint8_t *witdata, size_t *witdata_len, const char *addr) { + uint8_t data[84]; + char hrp_actual[84]; + size_t data_len; + if (!bech32_decode(hrp_actual, data, &data_len, addr)) return false; + if (data_len == 0 || data_len > 65) return false; + if (data[0] > 16) return false; + *witdata_len = 0; + if (!convert_bits(witdata, witdata_len, 8, data + 1, data_len - 1, 5, 0)) return false; + if (*witdata_len < 2 || *witdata_len > 40) return false; + if (data[0] == 0 && *witdata_len != 20 && *witdata_len != 32) return false; + *witver = data[0]; + return true; +} + +static size_t bech32_to_script(uint8_t *out, size_t outsz, const char *addr) { + uint8_t witprog[40]; + size_t witprog_len; + int witver; + + if (!segwit_addr_decode(&witver, witprog, &witprog_len, addr)) + return 0; + if (outsz < witprog_len + 2) + return 0; + out[0] = witver ? (0x50 + witver) : 0; + out[1] = witprog_len; + memcpy(out + 2, witprog, witprog_len); + + if ( opt_debug ) + applog( LOG_INFO, "Coinbase address uses Bech32 coding"); + + return witprog_len + 2; +} + size_t address_to_script( unsigned char *out, size_t outsz, const char *addr ) { unsigned char addrbin[ pk_buffer_size_max ]; @@ -950,12 +1084,15 @@ size_t address_to_script( unsigned char *out, size_t outsz, const char *addr ) size_t rv; if ( !b58dec( addrbin, outsz, addr ) ) - return 0; + return bech32_to_script( out, outsz, addr ); addrver = b58check( addrbin, outsz, addr ); if ( addrver < 0 ) return 0; + if ( opt_debug ) + applog( LOG_INFO, "Coinbase address uses B58 coding"); + switch ( addrver ) { case 5: /* Bitcoin script hash */ @@ -1486,9 +1623,6 @@ static bool stratum_parse_extranonce(struct stratum_ctx *sctx, json_t *params, i if ( !opt_quiet ) /* pool dynamic change */ applog( LOG_INFO, "Stratum extranonce1= %s, extranonce2 size= %d", xnonce1, xn2_size); -// if (pndx == 0 && opt_debug) -// applog(LOG_DEBUG, "Stratum set nonce %s with extranonce2 size=%d", -// xnonce1, xn2_size); return true; out: @@ -1638,8 +1772,6 @@ bool stratum_authorize(struct stratum_ctx *sctx, const char *user, const char *p opt_extranonce = false; goto out; } - if ( !opt_quiet ) - applog( LOG_INFO, "Extranonce subscription enabled" ); sret = stratum_recv_line( sctx ); if ( sret ) @@ -1658,8 +1790,8 @@ bool stratum_authorize(struct stratum_ctx *sctx, const char *user, const char *p applog( LOG_WARNING, "Stratum answer id is not correct!" 
); } res_val = json_object_get( extra, "result" ); -// if (opt_debug && (!res_val || json_is_false(res_val))) -// applog(LOG_DEBUG, "extranonce subscribe not supported"); + if (opt_debug && (!res_val || json_is_false(res_val))) + applog(LOG_DEBUG, "Method extranonce.subscribe is not supported"); json_decref( extra ); } free(sret); From 3c5e8921b764e03ef09c57b2207b9960d45123b0 Mon Sep 17 00:00:00 2001 From: Jay D Dee Date: Thu, 6 May 2021 14:55:03 -0400 Subject: [PATCH 06/20] v3.16.3 --- RELEASE_NOTES | 8 +- algo/blake/blake2b-hash-4way.h | 4 +- algo/blake/blake2s-hash-4way.h | 6 +- algo/blake/sph-blake2s.c | 10 +- algo/blake/sph-blake2s.h | 2 +- algo/blake/sph_blake2b.h | 2 +- algo/quark/quark-4way.c | 30 ++--- algo/verthash/Verthash.c | 204 ++++++++++++++++++--------------- algo/verthash/Verthash.h | 6 +- algo/verthash/verthash-gate.c | 18 ++- configure | 20 ++-- configure.ac | 2 +- cpu-miner.c | 29 ++--- util.c | 12 +- 14 files changed, 183 insertions(+), 170 deletions(-) diff --git a/RELEASE_NOTES b/RELEASE_NOTES index a1133db1..e1bd547c 100644 --- a/RELEASE_NOTES +++ b/RELEASE_NOTES @@ -65,11 +65,16 @@ If not what makes it happen or not happen? Change Log ---------- +v3.16.3 + +#313 Fix compile error with GCC 11. +Incremental improvements to verthash. + v3.16.2 Verthash: midstate prehash optimization for all architectures. Verthash: AVX2 optimization. -GBT: added support for Bech32 addresses, untested. +GBT: added support for Bech32 addresses. Linux: added CPU frequency to benchmark log. Fixed integer overflow in time calculations. @@ -111,7 +116,6 @@ RPC getmininginfo method. v3.15.5 Fix stratum jobs lost if 2 jobs received in less than one second. - v3.15.4 diff --git a/algo/blake/blake2b-hash-4way.h b/algo/blake/blake2b-hash-4way.h index 979e4b22..1256fb18 100644 --- a/algo/blake/blake2b-hash-4way.h +++ b/algo/blake/blake2b-hash-4way.h @@ -17,7 +17,7 @@ #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) -ALIGN(128) typedef struct { +typedef struct ALIGN( 64 ) { __m512i b[16]; // input buffer __m512i h[8]; // chained state uint64_t t[2]; // total number of bytes @@ -35,7 +35,7 @@ void blake2b_8way_final( blake2b_8way_ctx *ctx, void *out ); #if defined(__AVX2__) // state context -ALIGN(128) typedef struct { +typedef struct ALIGN( 64 ) { __m256i b[16]; // input buffer __m256i h[8]; // chained state uint64_t t[2]; // total number of bytes diff --git a/algo/blake/blake2s-hash-4way.h b/algo/blake/blake2s-hash-4way.h index baf28656..fc86c4fc 100644 --- a/algo/blake/blake2s-hash-4way.h +++ b/algo/blake/blake2s-hash-4way.h @@ -60,7 +60,7 @@ typedef struct __blake2s_nway_param } blake2s_nway_param; #pragma pack(pop) -ALIGN( 64 ) typedef struct __blake2s_4way_state +typedef struct ALIGN( 64 ) __blake2s_4way_state { __m128i h[8]; uint8_t buf[ BLAKE2S_BLOCKBYTES * 4 ]; @@ -80,7 +80,7 @@ int blake2s_4way_full_blocks( blake2s_4way_state *S, void *out, #if defined(__AVX2__) -ALIGN( 64 ) typedef struct __blake2s_8way_state +typedef struct ALIGN( 64 ) __blake2s_8way_state { __m256i h[8]; uint8_t buf[ BLAKE2S_BLOCKBYTES * 8 ]; @@ -101,7 +101,7 @@ int blake2s_8way_full_blocks( blake2s_8way_state *S, void *out, #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) -ALIGN( 128 ) typedef struct __blake2s_16way_state +typedef struct ALIGN( 64 ) __blake2s_16way_state { __m512i h[8]; uint8_t buf[ BLAKE2S_BLOCKBYTES * 16 ]; diff --git a/algo/blake/sph-blake2s.c b/algo/blake/sph-blake2s.c index a732910d..0ebe547b 
100644 --- a/algo/blake/sph-blake2s.c +++ b/algo/blake/sph-blake2s.c @@ -323,7 +323,7 @@ int blake2s_final( blake2s_state *S, uint8_t *out, uint8_t outlen ) int blake2s( uint8_t *out, const void *in, const void *key, const uint8_t outlen, const uint64_t inlen, uint8_t keylen ) { - blake2s_state S[1]; + blake2s_state S; /* Verify parameters */ if ( NULL == in ) return -1; @@ -334,15 +334,15 @@ int blake2s( uint8_t *out, const void *in, const void *key, const uint8_t outlen if( keylen > 0 ) { - if( blake2s_init_key( S, outlen, key, keylen ) < 0 ) return -1; + if( blake2s_init_key( &S, outlen, key, keylen ) < 0 ) return -1; } else { - if( blake2s_init( S, outlen ) < 0 ) return -1; + if( blake2s_init( &S, outlen ) < 0 ) return -1; } - blake2s_update( S, ( uint8_t * )in, inlen ); - blake2s_final( S, out, outlen ); + blake2s_update( &S, ( uint8_t * )in, inlen ); + blake2s_final( &S, out, outlen ); return 0; } diff --git a/algo/blake/sph-blake2s.h b/algo/blake/sph-blake2s.h index 2949fa62..eb66b7a5 100644 --- a/algo/blake/sph-blake2s.h +++ b/algo/blake/sph-blake2s.h @@ -116,7 +116,7 @@ extern "C" { uint8_t personal[BLAKE2S_PERSONALBYTES]; // 32 } blake2s_param; - ALIGN( 64 ) typedef struct __blake2s_state + typedef struct ALIGN( 64 ) __blake2s_state { uint32_t h[8]; uint32_t t[2]; diff --git a/algo/blake/sph_blake2b.h b/algo/blake/sph_blake2b.h index eaae071d..17f4381c 100644 --- a/algo/blake/sph_blake2b.h +++ b/algo/blake/sph_blake2b.h @@ -18,7 +18,7 @@ #endif // state context -ALIGN(64) typedef struct { +typedef ALIGN(64) struct { uint8_t b[128]; // input buffer uint64_t h[8]; // chained state uint64_t t[2]; // total number of bytes diff --git a/algo/quark/quark-4way.c b/algo/quark/quark-4way.c index d15e6bd8..f29b951d 100644 --- a/algo/quark/quark-4way.c +++ b/algo/quark/quark-4way.c @@ -127,10 +127,8 @@ void quark_8way_hash( void *state, const void *input ) rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 ); - if ( ( vh_mask & 0x0f ) != 0x0f ) - groestl512_4way_full( &ctx.groestl, vhashA, vhashA, 64 ); - if ( ( vh_mask & 0xf0 ) != 0xf0 ) - groestl512_4way_full( &ctx.groestl, vhashB, vhashB, 64 ); + groestl512_4way_full( &ctx.groestl, vhashA, vhashA, 64 ); + groestl512_4way_full( &ctx.groestl, vhashB, vhashB, 64 ); rintrlv_4x128_8x64( vhash, vhashA, vhashB, 512 ); @@ -139,22 +137,14 @@ void quark_8way_hash( void *state, const void *input ) dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, vhash, 512 ); - if ( hash0[0] & 8 ) - groestl512_full( &ctx.groestl, (char*)hash0, (char*)hash0, 512 ); - if ( hash1[0] & 8 ) - groestl512_full( &ctx.groestl, (char*)hash1, (char*)hash1, 512 ); - if ( hash2[0] & 8) - groestl512_full( &ctx.groestl, (char*)hash2, (char*)hash2, 512 ); - if ( hash3[0] & 8 ) - groestl512_full( &ctx.groestl, (char*)hash3, (char*)hash3, 512 ); - if ( hash4[0] & 8 ) - groestl512_full( &ctx.groestl, (char*)hash4, (char*)hash4, 512 ); - if ( hash5[0] & 8 ) - groestl512_full( &ctx.groestl, (char*)hash5, (char*)hash5, 512 ); - if ( hash6[0] & 8 ) - groestl512_full( &ctx.groestl, (char*)hash6, (char*)hash6, 512 ); - if ( hash7[0] & 8 ) - groestl512_full( &ctx.groestl, (char*)hash7, (char*)hash7, 512 ); + groestl512_full( &ctx.groestl, (char*)hash0, (char*)hash0, 512 ); + groestl512_full( &ctx.groestl, (char*)hash1, (char*)hash1, 512 ); + groestl512_full( &ctx.groestl, (char*)hash2, (char*)hash2, 512 ); + groestl512_full( &ctx.groestl, (char*)hash3, (char*)hash3, 512 ); + groestl512_full( &ctx.groestl, (char*)hash4, (char*)hash4, 512 ); + groestl512_full( &ctx.groestl, 
(char*)hash5, (char*)hash5, 512 ); + groestl512_full( &ctx.groestl, (char*)hash6, (char*)hash6, 512 ); + groestl512_full( &ctx.groestl, (char*)hash7, (char*)hash7, 512 ); intrlv_8x64( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, 512 ); diff --git a/algo/verthash/Verthash.c b/algo/verthash/Verthash.c index 475c79a7..0d971f2e 100644 --- a/algo/verthash/Verthash.c +++ b/algo/verthash/Verthash.c @@ -44,8 +44,8 @@ int verthash_info_init(verthash_info_t* info, const char* file_name) if ( opt_data_file || !opt_verify ) { if ( opt_data_file ) - applog( LOG_ERR, - "Verthash data file not found or invalid: %s", info->fileName ); + applog( LOG_ERR, "Verthash data file not found or invalid: %s", + info->fileName ); else { applog( LOG_ERR, @@ -134,76 +134,117 @@ static inline uint32_t fnv1a(const uint32_t a, const uint32_t b) return (a ^ b) * 0x1000193; } -void verthash_hash( const unsigned char* blob_bytes, - const size_t blob_size, - const unsigned char(*input)[VH_HEADER_SIZE], - unsigned char(*output)[VH_HASH_OUT_SIZE] ) +#if 0 +static void rotate_indexes( uint32_t *p ) { - unsigned char p1[ VH_HASH_OUT_SIZE ] __attribute__ ((aligned (64))); - unsigned char p0[ VH_N_SUBSET ] __attribute__ ((aligned (64))); - uint32_t seek_indexes[VH_N_INDEXES] __attribute__ ((aligned (64))); - uint32_t* p0_index = (uint32_t*)p0; +#if defined(__AVX2__) - verthash_sha3_512_final_8( p0, ( (uint64_t*)input )[ 9 ] ); - - for ( size_t x = 0; x < VH_N_ROT; ++x ) - { - memcpy( seek_indexes + x * (VH_N_SUBSET / sizeof(uint32_t)), - p0, VH_N_SUBSET); + for ( size_t x = 0; x < VH_N_SUBSET / sizeof(__m256i); x += 8 ) + { + __m256i *px = (__m256i*)p + x; -#if defined(__AVX2__) + px[0] = mm256_rol_32( px[0], 1 ); + px[1] = mm256_rol_32( px[1], 1 ); + px[2] = mm256_rol_32( px[2], 1 ); + px[3] = mm256_rol_32( px[3], 1 ); + px[4] = mm256_rol_32( px[4], 1 ); + px[5] = mm256_rol_32( px[5], 1 ); + px[6] = mm256_rol_32( px[6], 1 ); + px[7] = mm256_rol_32( px[7], 1 ); + } - for ( size_t y = 0; y < VH_N_SUBSET / sizeof(__m256i); y += 8) - { - casti_m256i( p0_index, y ) = mm256_rol_32( - casti_m256i( p0_index, y ), 1 ); - casti_m256i( p0_index, y+1 ) = mm256_rol_32( - casti_m256i( p0_index, y+1 ), 1 ); - casti_m256i( p0_index, y+2 ) = mm256_rol_32( - casti_m256i( p0_index, y+2 ), 1 ); - casti_m256i( p0_index, y+3 ) = mm256_rol_32( - casti_m256i( p0_index, y+3 ), 1 ); - casti_m256i( p0_index, y+4 ) = mm256_rol_32( - casti_m256i( p0_index, y+4 ), 1 ); - casti_m256i( p0_index, y+5 ) = mm256_rol_32( - casti_m256i( p0_index, y+5 ), 1 ); - casti_m256i( p0_index, y+6 ) = mm256_rol_32( - casti_m256i( p0_index, y+6 ), 1 ); - casti_m256i( p0_index, y+7 ) = mm256_rol_32( - casti_m256i( p0_index, y+7 ), 1 ); - } +#else + + for ( size_t x = 0; x < VH_N_SUBSET / sizeof(__m128i); x += 8 ) + { + __m128i *px = (__m128i*)p0_index + x; + + px[0] = mm128_rol_32( px[0], 1 ); + px[1] = mm128_rol_32( px[1], 1 ); + px[2] = mm128_rol_32( px[2], 1 ); + px[3] = mm128_rol_32( px[3], 1 ); + px[4] = mm128_rol_32( px[4], 1 ); + px[5] = mm128_rol_32( px[5], 1 ); + px[6] = mm128_rol_32( px[6], 1 ); + px[7] = mm128_rol_32( px[7], 1 ); + } + +#endif +/* + for ( size_t x = 0; x < VH_N_SUBSET / sizeof(uint32_t); ++x ) + p[x] = ( p[x] << 1 ) | ( p[x] >> 31 ); +*/ +} +#endif + +static inline uint32_t rotl32( uint32_t a, size_t r ) +{ + return ( a << r ) | ( a >> (32-r) ); +} + +// Vectorized and targetted version of fnv1a +#if defined (__AVX2__) + +#define MULXOR \ + *(__m256i*)hash = _mm256_mullo_epi32( _mm256_xor_si256( \ + *(__m256i*)hash, 
*(__m256i*)blob_off ), k ); + +#elif defined(__SSE41__) + +#define MULXOR \ + casti_m128i( hash, 0 ) = _mm_mullo_epi32( _mm_xor_si128( \ + casti_m128i( hash, 0 ), casti_m128i( blob_off, 0 ) ), k ); \ + casti_m128i( hash, 1 ) = _mm_mullo_epi32( _mm_xor_si128( \ + casti_m128i( hash, 1 ), casti_m128i( blob_off, 1 ) ), k ); #else - for ( size_t y = 0; y < VH_N_SUBSET / sizeof(__m128i); y += 8) - { - casti_m128i( p0_index, y ) = mm128_rol_32( - casti_m128i( p0_index, y ), 1 ); - casti_m128i( p0_index, y+1 ) = mm128_rol_32( - casti_m128i( p0_index, y+1 ), 1 ); - casti_m128i( p0_index, y+2 ) = mm128_rol_32( - casti_m128i( p0_index, y+2 ), 1 ); - casti_m128i( p0_index, y+3 ) = mm128_rol_32( - casti_m128i( p0_index, y+3 ), 1 ); - casti_m128i( p0_index, y+4 ) = mm128_rol_32( - casti_m128i( p0_index, y+4 ), 1 ); - casti_m128i( p0_index, y+5 ) = mm128_rol_32( - casti_m128i( p0_index, y+5 ), 1 ); - casti_m128i( p0_index, y+6 ) = mm128_rol_32( - casti_m128i( p0_index, y+6 ), 1 ); - casti_m128i( p0_index, y+7 ) = mm128_rol_32( - casti_m128i( p0_index, y+7 ), 1 ); - } - +#define MULXOR \ + for ( size_t j = 0; j < VH_HASH_OUT_SIZE / sizeof(uint32_t); j++ ) \ + hash[j] = fnv1a( hash[j], blob_off[j] ); \ + #endif - } +#define UPDATE_ACCUMULATOR \ + accumulator = fnv1a( accumulator, blob_off[0] ); \ + accumulator = fnv1a( accumulator, blob_off[1] ); \ + accumulator = fnv1a( accumulator, blob_off[2] ); \ + accumulator = fnv1a( accumulator, blob_off[3] ); \ + accumulator = fnv1a( accumulator, blob_off[4] ); \ + accumulator = fnv1a( accumulator, blob_off[5] ); \ + accumulator = fnv1a( accumulator, blob_off[6] ); \ + accumulator = fnv1a( accumulator, blob_off[7] ) + + +// first pass no rotate +#define ROUND_0 \ +for ( size_t i = 0; i < VH_N_SUBSET / sizeof(uint32_t); i++ ) \ +{ \ + const uint32_t *blob_off = blob + \ + ( ( fnv1a( subset[i], accumulator ) % mdiv ) \ + * ( VH_BYTE_ALIGNMENT / sizeof(uint32_t) ) ); \ + UPDATE_ACCUMULATOR; \ + MULXOR; \ +} - sha3( &input[0], VH_HEADER_SIZE, &p1[0], VH_HASH_OUT_SIZE ); - - uint32_t* p1_32 = (uint32_t*)p1; - uint32_t* blob_bytes_32 = (uint32_t*)blob_bytes; - uint32_t value_accumulator = 0x811c9dc5; +// subsequent passes rotate by r on demand, no need for mass rotate +#define ROUND_r( r ) \ +for ( size_t i = 0; i < VH_N_SUBSET / sizeof(uint32_t); i++ ) \ +{ \ + const uint32_t *blob_off = blob + \ + ( ( fnv1a( rotl32( subset[i], r ), accumulator ) % mdiv ) \ + * ( VH_BYTE_ALIGNMENT / sizeof(uint32_t) ) ); \ + UPDATE_ACCUMULATOR; \ + MULXOR; \ +} + +void verthash_hash( const void *blob_bytes, const size_t blob_size, + const void *input, void *output ) +{ + uint32_t hash[ VH_HASH_OUT_SIZE / 4 ] __attribute__ ((aligned (64))); + uint32_t subset[ VH_N_SUBSET / 4 ] __attribute__ ((aligned (64))); + const uint32_t *blob = (const uint32_t*)blob_bytes; + uint32_t accumulator = 0x811c9dc5; const uint32_t mdiv = ( ( blob_size - VH_HASH_OUT_SIZE ) / VH_BYTE_ALIGNMENT ) + 1; #if defined (__AVX2__) @@ -211,40 +252,15 @@ void verthash_hash( const unsigned char* blob_bytes, #elif defined(__SSE41__) const __m128i k = _mm_set1_epi32( 0x1000193 ); #endif + + sha3( input, VH_HEADER_SIZE, hash, VH_HASH_OUT_SIZE ); + verthash_sha3_512_final_8( subset, ( (uint64_t*)input )[ 9 ] ); - for ( size_t i = 0; i < VH_N_INDEXES; i++ ) - { - const uint32_t offset = - ( fnv1a( seek_indexes[i], value_accumulator) % mdiv ) - * ( VH_BYTE_ALIGNMENT / sizeof(uint32_t) ); - const uint32_t *blob_off = blob_bytes_32 + offset; - - // update value accumulator for next seek index - value_accumulator = fnv1a( 
value_accumulator, blob_off[0] ); - value_accumulator = fnv1a( value_accumulator, blob_off[1] ); - value_accumulator = fnv1a( value_accumulator, blob_off[2] ); - value_accumulator = fnv1a( value_accumulator, blob_off[3] ); - value_accumulator = fnv1a( value_accumulator, blob_off[4] ); - value_accumulator = fnv1a( value_accumulator, blob_off[5] ); - value_accumulator = fnv1a( value_accumulator, blob_off[6] ); - value_accumulator = fnv1a( value_accumulator, blob_off[7] ); - -#if defined (__AVX2__) - *(__m256i*)p1_32 = _mm256_mullo_epi32( _mm256_xor_si256( - *(__m256i*)p1_32, *(__m256i*)blob_off ), k ); -#elif defined(__SSE41__) - casti_m128i( p1_32, 0 ) = _mm_mullo_epi32( _mm_xor_si128( - casti_m128i( p1_32, 0 ), casti_m128i( blob_off, 0 ) ), k ); - casti_m128i( p1_32, 1 ) = _mm_mullo_epi32( _mm_xor_si128( - casti_m128i( p1_32, 1 ), casti_m128i( blob_off, 1 ) ), k ); -#else - for ( size_t i2 = 0; i2 < VH_HASH_OUT_SIZE / sizeof(uint32_t); i2++ ) - p1_32[i2] = fnv1a( p1_32[i2], blob_off[i2] ); -#endif - - } + ROUND_0; + for ( size_t r = 1; r < VH_N_ROT; ++r ) + ROUND_r( r ); - memcpy( output, p1, VH_HASH_OUT_SIZE ); + memcpy( output, hash, VH_HASH_OUT_SIZE ); } //----------------------------------------------------------------------------- diff --git a/algo/verthash/Verthash.h b/algo/verthash/Verthash.h index 5cce653a..e3e4029b 100644 --- a/algo/verthash/Verthash.h +++ b/algo/verthash/Verthash.h @@ -47,10 +47,8 @@ void verthash_info_free(verthash_info_t* info); //! Generate verthash data file and save it to specified location. int verthash_generate_data_file(const char* output_file_name); -void verthash_hash(const unsigned char* blob_bytes, - const size_t blob_size, - const unsigned char(*input)[VH_HEADER_SIZE], - unsigned char(*output)[VH_HASH_OUT_SIZE]); +void verthash_hash( const void *blob_bytes, const size_t blob_size, + const void *input, void *output ); void verthash_sha3_512_prehash_72( const void *input ); void verthash_sha3_512_final_8( void *hash, const uint64_t nonce ); diff --git a/algo/verthash/verthash-gate.c b/algo/verthash/verthash-gate.c index 029ce462..a0103444 100644 --- a/algo/verthash/verthash-gate.c +++ b/algo/verthash/verthash-gate.c @@ -62,7 +62,7 @@ void verthash_sha3_512_final_8( void *hash, const uint64_t nonce ) __m256i vhashB[ 10 ] __attribute__ ((aligned (64))); sha3_4way_ctx_t ctx; - __m256i vnonce = _mm256_set1_epi64x( nonce ); + const __m256i vnonce = _mm256_set1_epi64x( nonce ); memcpy( &ctx, &sha3_mid_ctxA, sizeof ctx ); sha3_4way_update( &ctx, &vnonce, 8 ); @@ -88,14 +88,13 @@ void verthash_sha3_512_final_8( void *hash, const uint64_t nonce ) #endif } - int scanhash_verthash( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ) { uint32_t edata[20] __attribute__((aligned(64))); uint32_t hash[8] __attribute__((aligned(64))); uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; + const uint32_t *ptarget = work->target; const uint32_t first_nonce = pdata[19]; const uint32_t last_nonce = max_nonce - 1; uint32_t n = first_nonce; @@ -109,8 +108,7 @@ int scanhash_verthash( struct work *work, uint32_t max_nonce, { edata[19] = n; verthash_hash( verthashInfo.data, verthashInfo.dataSize, - (const unsigned char (*)[80]) edata, - (unsigned char (*)[32]) hash ); + edata, hash ); if ( valid_hash( hash, ptarget ) && !bench ) { pdata[19] = bswap_32( n ); @@ -123,17 +121,16 @@ int scanhash_verthash( struct work *work, uint32_t max_nonce, return 0; } -const char *default_verthash_data_file = "verthash.dat"; +static const char 
*default_verthash_data_file = "verthash.dat"; bool register_verthash_algo( algo_gate_t* gate ) { - opt_target_factor = 256.0; gate->scanhash = (void*)&scanhash_verthash; gate->optimizations = AVX2_OPT; - char *verthash_data_file = opt_data_file ? opt_data_file - : default_verthash_data_file; + const char *verthash_data_file = opt_data_file ? opt_data_file + : default_verthash_data_file; int vhLoadResult = verthash_info_init( &verthashInfo, verthash_data_file ); if (vhLoadResult == 0) // No Error @@ -160,7 +157,8 @@ bool register_verthash_algo( algo_gate_t* gate ) // Handle Verthash error codes if ( vhLoadResult == 1 ) { - applog( LOG_ERR, "Verthash data file not found: %s", verthash_data_file ); + applog( LOG_ERR, "Verthash data file not found: %s", + verthash_data_file ); if ( !opt_data_file ) applog( LOG_NOTICE, "Add '--verify' to create verthash.dat"); } diff --git a/configure b/configure index e18537aa..1d15c406 100755 --- a/configure +++ b/configure @@ -1,6 +1,6 @@ #! /bin/sh # Guess values for system-dependent variables and create Makefiles. -# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.16.2. +# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.16.3. # # # Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc. @@ -577,8 +577,8 @@ MAKEFLAGS= # Identity of this package. PACKAGE_NAME='cpuminer-opt' PACKAGE_TARNAME='cpuminer-opt' -PACKAGE_VERSION='3.16.2' -PACKAGE_STRING='cpuminer-opt 3.16.2' +PACKAGE_VERSION='3.16.3' +PACKAGE_STRING='cpuminer-opt 3.16.3' PACKAGE_BUGREPORT='' PACKAGE_URL='' @@ -1332,7 +1332,7 @@ if test "$ac_init_help" = "long"; then # Omit some internal or obsolete options to make the list less imposing. # This message is too long to be a string in the A/UX 3.1 sh. cat <<_ACEOF -\`configure' configures cpuminer-opt 3.16.2 to adapt to many kinds of systems. +\`configure' configures cpuminer-opt 3.16.3 to adapt to many kinds of systems. Usage: $0 [OPTION]... [VAR=VALUE]... @@ -1404,7 +1404,7 @@ fi if test -n "$ac_init_help"; then case $ac_init_help in - short | recursive ) echo "Configuration of cpuminer-opt 3.16.2:";; + short | recursive ) echo "Configuration of cpuminer-opt 3.16.3:";; esac cat <<\_ACEOF @@ -1509,7 +1509,7 @@ fi test -n "$ac_init_help" && exit $ac_status if $ac_init_version; then cat <<\_ACEOF -cpuminer-opt configure 3.16.2 +cpuminer-opt configure 3.16.3 generated by GNU Autoconf 2.69 Copyright (C) 2012 Free Software Foundation, Inc. @@ -2012,7 +2012,7 @@ cat >config.log <<_ACEOF This file contains any messages produced by compilers while running configure, to aid debugging if configure makes a mistake. -It was created by cpuminer-opt $as_me 3.16.2, which was +It was created by cpuminer-opt $as_me 3.16.3, which was generated by GNU Autoconf 2.69. Invocation command line was $ $0 $@ @@ -2993,7 +2993,7 @@ fi # Define the identity of the package. PACKAGE='cpuminer-opt' - VERSION='3.16.2' + VERSION='3.16.3' cat >>confdefs.h <<_ACEOF @@ -6690,7 +6690,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 # report actual input values of CONFIG_FILES etc. instead of their # values after options handling. ac_log=" -This file was extended by cpuminer-opt $as_me 3.16.2, which was +This file was extended by cpuminer-opt $as_me 3.16.3, which was generated by GNU Autoconf 2.69. 
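A note on the Verthash.c indexing change above: the removed code copied the SHA3 subset into seek_indexes and bit-rotated the whole buffer by one position before every pass (the equivalent helper rotate_indexes() is now kept only under #if 0), while the new ROUND_0 / ROUND_r macros leave the buffer untouched, use each word as-is on pass 0 and rotate only the word being consumed by r bits on pass r. A minimal scalar sketch of that indexing, assuming subset[] holds the words written by verthash_sha3_512_final_8(); the helper name seek_index_sketch is illustrative and not part of the patch:

    /* Illustrative only: the patch implements this inside the vectorized
       ROUND_0 / ROUND_r and MULXOR macros. */
    static inline uint32_t seek_index_sketch( const uint32_t *subset,
                                              size_t i, unsigned r )
    {
        uint32_t w = subset[i];
        /* r == 0 is the first pass (ROUND_0): no rotation.
           r >= 1 matches ROUND_r: rotate one word on demand instead of
           bit-rotating the whole VH_N_SUBSET buffer once per pass. */
        return r ? ( w << r ) | ( w >> ( 32 - r ) ) : w;
    }

The on-demand rotate adds one shift/or pair in the inner loop but removes a full-buffer rewrite per pass, and the mass-rotation code is retained only inside #if 0.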
Invocation command line was CONFIG_FILES = $CONFIG_FILES @@ -6756,7 +6756,7 @@ _ACEOF cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`" ac_cs_version="\\ -cpuminer-opt config.status 3.16.2 +cpuminer-opt config.status 3.16.3 configured by $0, generated by GNU Autoconf 2.69, with options \\"\$ac_cs_config\\" diff --git a/configure.ac b/configure.ac index 5ee7b2f7..82a90496 100644 --- a/configure.ac +++ b/configure.ac @@ -1,4 +1,4 @@ -AC_INIT([cpuminer-opt], [3.16.2]) +AC_INIT([cpuminer-opt], [3.16.3]) AC_PREREQ([2.59c]) AC_CANONICAL_SYSTEM diff --git a/cpu-miner.c b/cpu-miner.c index e52168bd..6b62a3c6 100644 --- a/cpu-miner.c +++ b/cpu-miner.c @@ -1145,7 +1145,7 @@ void report_summary_log( bool force ) if ( mismatch ) { if ( mismatch != 1 ) - applog(LOG_WARNING,"Share count mismatch: %d, stats may be incorrect", mismatch ); + applog(LOG_WARNING,"Share count mismatch: %d, stats may be inaccurate", mismatch ); else applog(LOG_INFO,"Share count mismatch, submitted share may still be pending" ); } @@ -2171,11 +2171,11 @@ static void *miner_thread( void *userdata ) /* Set worker threads to nice 19 and then preferentially to SCHED_IDLE * and if that fails, then SCHED_BATCH. No need for this to be an * error if it fails */ - if (!opt_benchmark && opt_priority == 0) + if ( !opt_priority ) { setpriority(PRIO_PROCESS, 0, 19); - if ( !thr_id && !opt_quiet ) - applog(LOG_INFO, "Miner thread priority %d (nice 19)", opt_priority ); + if ( !thr_id && opt_debug ) + applog(LOG_INFO, "Default miner thread priority %d (nice 19)", opt_priority ); drop_policy(); } else @@ -2192,9 +2192,12 @@ static void *miner_thread( void *userdata ) case 4: prio = -10; break; case 5: prio = -15; } - if ( !( thr_id || opt_quiet ) ) - applog( LOG_INFO, "Miner thread priority %d (nice %d)", + if ( !thr_id ) + { + applog( LOG_INFO, "User set miner thread priority %d (nice %d)", opt_priority, prio ); + applog( LOG_WARNING, "High priority mining threads may cause system instability"); + } #endif setpriority(PRIO_PROCESS, 0, prio); if ( opt_priority == 0 ) @@ -2439,7 +2442,7 @@ static void *miner_thread( void *userdata ) char hr_units[2] = {0,0}; scale_hash_for_display( &hashrate, hr_units ); sprintf( hr, "%.2f", hashrate ); -#if ((defined(_WIN64) || defined(__WINDOWS__)) || defined(_WIN32)) +#if (defined(_WIN64) || defined(__WINDOWS__) || defined(_WIN32)) applog( LOG_NOTICE, "Total: %s %sH/s", hr, hr_units ); #else float lo_freq = 0., hi_freq = 0.; @@ -2739,10 +2742,10 @@ static void *stratum_thread(void *userdata ) stratum.url = strdup( rpc_url ); applog(LOG_BLUE, "Connection changed to %s", short_url); } - else // if ( !opt_quiet ) + else applog(LOG_WARNING, "Stratum connection reset"); // reset stats queue as well - s_get_ptr = s_put_ptr = 0; + if ( s_get_ptr != s_put_ptr ) s_get_ptr = s_put_ptr = 0; } while ( !stratum.curl ) @@ -2789,13 +2792,15 @@ static void *stratum_thread(void *userdata ) else { applog(LOG_WARNING, "Stratum connection interrupted"); - stratum_disconnect( &stratum ); +// stratum_disconnect( &stratum ); + stratum_need_reset = true; } } else { applog(LOG_ERR, "Stratum connection timeout"); - stratum_disconnect( &stratum ); + stratum_need_reset = true; +// stratum_disconnect( &stratum ); } } // loop @@ -3394,8 +3399,6 @@ void parse_arg(int key, char *arg ) v = atoi(arg); if (v < 0 || v > 5) /* sanity check */ show_usage_and_exit(1); - // option is deprecated, show warning - applog( LOG_WARNING, "High priority mining threads may 
cause system instability"); opt_priority = v; break; case 'N': // N parameter for various scrypt algos diff --git a/util.c b/util.c index a3b764e2..2bfc8095 100644 --- a/util.c +++ b/util.c @@ -1789,10 +1789,14 @@ bool stratum_authorize(struct stratum_ctx *sctx, const char *user, const char *p if ( !stratum_handle_method( sctx, sret ) ) applog( LOG_WARNING, "Stratum answer id is not correct!" ); } - res_val = json_object_get( extra, "result" ); - if (opt_debug && (!res_val || json_is_false(res_val))) - applog(LOG_DEBUG, "Method extranonce.subscribe is not supported"); - json_decref( extra ); + else + { + res_val = json_object_get( extra, "result" ); + if ( opt_debug && ( !res_val || json_is_false( res_val ) ) ) + applog( LOG_DEBUG, + "Method extranonce.subscribe is not supported" ); + } + json_decref( extra ); } free(sret); } From a053690170cf3c3b0232cb27ee0cf3463330055a Mon Sep 17 00:00:00 2001 From: Jay D Dee Date: Wed, 23 Jun 2021 21:52:42 -0400 Subject: [PATCH 07/20] v3.16.4 --- RELEASE_NOTES | 5 +++++ algo/sha/sha256-hash-4way.c | 38 +++++++++++++++++++++++++++++++++-- algo/sha/sha512-hash-4way.c | 40 +++++++++++++++++++++++++++++-------- algo/sha/sph_sha2.c | 4 ++-- algo/sha/sph_sha2big.c | 3 ++- configure | 20 +++++++++---------- configure.ac | 2 +- cpu-miner.c | 7 +++++-- 8 files changed, 93 insertions(+), 26 deletions(-) diff --git a/RELEASE_NOTES b/RELEASE_NOTES index e1bd547c..bf9aec58 100644 --- a/RELEASE_NOTES +++ b/RELEASE_NOTES @@ -65,6 +65,11 @@ If not what makes it happen or not happen? Change Log ---------- +v3.16.4 + +Faster sha512 and sha256 when not using SHA CPU extension. +#329: Fixed GBT incorrect target diff in stats. + v3.16.3 #313 Fix compile error with GCC 11. diff --git a/algo/sha/sha256-hash-4way.c b/algo/sha/sha256-hash-4way.c index d9fb503c..a1f657e1 100644 --- a/algo/sha/sha256-hash-4way.c +++ b/algo/sha/sha256-hash-4way.c @@ -74,9 +74,15 @@ static const uint32_t K256[64] = #define CHs(X, Y, Z) \ _mm_xor_si128( _mm_and_si128( _mm_xor_si128( Y, Z ), X ), Z ) +/* #define MAJs(X, Y, Z) \ _mm_or_si128( _mm_and_si128( X, Y ), \ _mm_and_si128( _mm_or_si128( X, Y ), Z ) ) +*/ + +#define MAJs(X, Y, Z) \ + _mm_xor_si128( Y, _mm_and_si128( _mm_xor_si128( X, Y ), \ + _mm_xor_si128( Y, Z ) ) ) #define BSG2_0(x) \ _mm_xor_si128( _mm_xor_si128( \ @@ -345,9 +351,20 @@ void sha256_4way_full( void *dst, const void *data, size_t len ) #define CHx(X, Y, Z) \ _mm256_xor_si256( _mm256_and_si256( _mm256_xor_si256( Y, Z ), X ), Z ) +/* #define MAJx(X, Y, Z) \ _mm256_or_si256( _mm256_and_si256( X, Y ), \ _mm256_and_si256( _mm256_or_si256( X, Y ), Z ) ) +*/ +/* +#define MAJx(X, Y, Z) \ + _mm256_xor_si256( Y, _mm256_and_si256( _mm256_xor_si256( X, Y ), \ + _mm256_xor_si256( Y, Z ) ) ) +*/ + +#define MAJx(X, Y, Z) \ + _mm256_xor_si256( Y, _mm256_and_si256( X_xor_Y = _mm256_xor_si256( X, Y ), \ + Y_xor_Z ) ) #define BSG2_0x(x) \ _mm256_xor_si256( _mm256_xor_si256( \ @@ -375,6 +392,7 @@ do { \ T1 = _mm256_add_epi32( H, mm256_add4_32( BSG2_1x(E), CHx(E, F, G), \ K, W[i] ) ); \ T2 = _mm256_add_epi32( BSG2_0x(A), MAJx(A, B, C) ); \ + Y_xor_Z = X_xor_Y; \ D = _mm256_add_epi32( D, T1 ); \ H = _mm256_add_epi32( T1, T2 ); \ } while (0) @@ -382,7 +400,7 @@ do { \ static void sha256_8way_round( sha256_8way_context *ctx, __m256i *in, __m256i r[8] ) { - register __m256i A, B, C, D, E, F, G, H; + register __m256i A, B, C, D, E, F, G, H, X_xor_Y, Y_xor_Z; __m256i W[16]; mm256_block_bswap_32( W , in ); @@ -411,6 +429,8 @@ sha256_8way_round( sha256_8way_context *ctx, __m256i *in, __m256i r[8] ) H = 
m256_const1_64( 0x5BE0CD195BE0CD19 ); } + Y_xor_Z = _mm256_xor_si256( B, C ); + SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H, 0, 0 ); SHA2s_8WAY_STEP( H, A, B, C, D, E, F, G, 1, 0 ); SHA2s_8WAY_STEP( G, H, A, B, C, D, E, F, 2, 0 ); @@ -591,9 +611,20 @@ void sha256_8way_full( void *dst, const void *data, size_t len ) #define CHx16(X, Y, Z) \ _mm512_xor_si512( _mm512_and_si512( _mm512_xor_si512( Y, Z ), X ), Z ) +/* #define MAJx16(X, Y, Z) \ _mm512_or_si512( _mm512_and_si512( X, Y ), \ _mm512_and_si512( _mm512_or_si512( X, Y ), Z ) ) +*/ +/* +#define MAJx16(X, Y, Z) \ + _mm512_xor_si512( Y, _mm512_and_si512( _mm512_xor_si512( X, Y ), \ + _mm512_xor_si512( Y, Z ) ) ) +*/ + +#define MAJx16(X, Y, Z) \ + _mm512_xor_si512( Y, _mm512_and_si512( X_xor_Y = _mm512_xor_si512( X, Y ), \ + Y_xor_Z ) ) #define BSG2_0x16(x) \ _mm512_xor_si512( _mm512_xor_si512( \ @@ -621,6 +652,7 @@ do { \ T1 = _mm512_add_epi32( H, mm512_add4_32( BSG2_1x16(E), CHx16(E, F, G), \ K, W[i] ) ); \ T2 = _mm512_add_epi32( BSG2_0x16(A), MAJx16(A, B, C) ); \ + Y_xor_Z = X_xor_Y; \ D = _mm512_add_epi32( D, T1 ); \ H = _mm512_add_epi32( T1, T2 ); \ } while (0) @@ -628,7 +660,7 @@ do { \ static void sha256_16way_round( sha256_16way_context *ctx, __m512i *in, __m512i r[8] ) { - register __m512i A, B, C, D, E, F, G, H; + register __m512i A, B, C, D, E, F, G, H, X_xor_Y, Y_xor_Z; __m512i W[16]; mm512_block_bswap_32( W , in ); @@ -657,6 +689,8 @@ sha256_16way_round( sha256_16way_context *ctx, __m512i *in, __m512i r[8] ) H = m512_const1_64( 0x5BE0CD195BE0CD19 ); } + Y_xor_Z = _mm512_xor_si512( B, C ); + SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H, 0, 0 ); SHA2s_16WAY_STEP( H, A, B, C, D, E, F, G, 1, 0 ); SHA2s_16WAY_STEP( G, H, A, B, C, D, E, F, 2, 0 ); diff --git a/algo/sha/sha512-hash-4way.c b/algo/sha/sha512-hash-4way.c index 9f5349b0..803c42f5 100644 --- a/algo/sha/sha512-hash-4way.c +++ b/algo/sha/sha512-hash-4way.c @@ -98,9 +98,21 @@ static const uint64_t K512[80] = #define CH8W(X, Y, Z) \ _mm512_xor_si512( _mm512_and_si512( _mm512_xor_si512( Y, Z ), X ), Z ) +/* #define MAJ8W(X, Y, Z) \ _mm512_or_si512( _mm512_and_si512( X, Y ), \ _mm512_and_si512( _mm512_or_si512( X, Y ), Z ) ) +*/ +/* Functionally identical to original but optimizable, + * subexpression X^Y from one step can be reused in the next step as Y^Z +#define MAJ8W(X, Y, Z) \ + _mm512_xor_si512( Y, _mm512_and_si512( _mm512_xor_si512( X, Y ), \ + _mm512_xor_si512( Y, Z ) ) ) +*/ + +#define MAJ8W(X, Y, Z) \ + _mm512_xor_si512( Y, _mm512_and_si512( X_xor_Y = _mm512_xor_si512( X, Y ), \ + Y_xor_Z ) ) #define BSG8W_5_0(x) \ _mm512_xor_si512( _mm512_xor_si512( \ @@ -172,6 +184,7 @@ do { \ T1 = _mm512_add_epi64( H, mm512_add4_64( BSG8W_5_1(E), CH8W(E, F, G), \ K, W[i] ) ); \ T2 = _mm512_add_epi64( BSG8W_5_0(A), MAJ8W(A, B, C) ); \ + Y_xor_Z = X_xor_Y; \ D = _mm512_add_epi64( D, T1 ); \ H = _mm512_add_epi64( T1, T2 ); \ } while (0) @@ -180,7 +193,7 @@ static void sha512_8way_round( sha512_8way_context *ctx, __m512i *in, __m512i r[8] ) { int i; - register __m512i A, B, C, D, E, F, G, H; + register __m512i A, B, C, D, E, F, G, H, X_xor_Y, Y_xor_Z; __m512i W[80]; mm512_block_bswap_64( W , in ); @@ -213,6 +226,8 @@ sha512_8way_round( sha512_8way_context *ctx, __m512i *in, __m512i r[8] ) H = m512_const1_64( 0x5BE0CD19137E2179 ); } + Y_xor_Z = _mm512_xor_si512( B, C ); + for ( i = 0; i < 80; i += 8 ) { SHA3_8WAY_STEP( A, B, C, D, E, F, G, H, i + 0 ); @@ -319,14 +334,20 @@ void sha512_8way_close( sha512_8way_context *sc, void *dst ) // SHA-512 4 way 64 bit -/* + #define CH(X, Y, Z) \ 
_mm256_xor_si256( _mm256_and_si256( _mm256_xor_si256( Y, Z ), X ), Z ) +/* #define MAJ(X, Y, Z) \ _mm256_or_si256( _mm256_and_si256( X, Y ), \ _mm256_and_si256( _mm256_or_si256( X, Y ), Z ) ) +*/ +#define MAJ(X, Y, Z) \ + _mm256_xor_si256( Y, _mm256_and_si256( X_xor_Y = _mm256_xor_si256( X, Y ), \ + Y_xor_Z ) ) + #define BSG5_0(x) \ mm256_ror_64( _mm256_xor_si256( mm256_ror_64( \ _mm256_xor_si256( mm256_ror_64( x, 5 ), x ), 6 ), x ), 28 ) @@ -334,7 +355,7 @@ void sha512_8way_close( sha512_8way_context *sc, void *dst ) #define BSG5_1(x) \ mm256_ror_64( _mm256_xor_si256( mm256_ror_64( \ _mm256_xor_si256( mm256_ror_64( x, 23 ), x ), 4 ), x ), 14 ) -*/ + /* #define BSG5_0(x) \ _mm256_xor_si256( _mm256_xor_si256( \ @@ -402,7 +423,7 @@ static inline __m256i ssg512_add( __m256i w0, __m256i w1 ) w1 = _mm256_xor_si256( X1a, X1b ); \ } while(0) */ - +/* #define SHA3_4WAY_STEP(A, B, C, D, E, F, G, H, i) \ do { \ __m256i K = _mm256_set1_epi64x( K512[ i ] ); \ @@ -431,7 +452,7 @@ do { \ H = _mm256_add_epi64( T1, T2 ); \ D = _mm256_add_epi64( D, T1 ); \ } while (0) - +*/ /* #define SHA3_4WAY_STEP(A, B, C, D, E, F, G, H, i) \ do { \ @@ -445,7 +466,7 @@ do { \ } while (0) */ -/* + #define SHA3_4WAY_STEP(A, B, C, D, E, F, G, H, i) \ do { \ __m256i T1, T2; \ @@ -453,16 +474,17 @@ do { \ T1 = _mm256_add_epi64( H, mm256_add4_64( BSG5_1(E), CH(E, F, G), \ K, W[i] ) ); \ T2 = _mm256_add_epi64( BSG5_0(A), MAJ(A, B, C) ); \ + Y_xor_Z = X_xor_Y; \ D = _mm256_add_epi64( D, T1 ); \ H = _mm256_add_epi64( T1, T2 ); \ } while (0) -*/ + static void sha512_4way_round( sha512_4way_context *ctx, __m256i *in, __m256i r[8] ) { int i; - register __m256i A, B, C, D, E, F, G, H; + register __m256i A, B, C, D, E, F, G, H, X_xor_Y, Y_xor_Z; __m256i W[80]; mm256_block_bswap_64( W , in ); @@ -495,6 +517,8 @@ sha512_4way_round( sha512_4way_context *ctx, __m256i *in, __m256i r[8] ) H = m256_const1_64( 0x5BE0CD19137E2179 ); } + Y_xor_Z = _mm256_xor_si256( B, C ); + for ( i = 0; i < 80; i += 8 ) { SHA3_4WAY_STEP( A, B, C, D, E, F, G, H, i + 0 ); diff --git a/algo/sha/sph_sha2.c b/algo/sha/sph_sha2.c index e96a2d1c..b67b0143 100644 --- a/algo/sha/sph_sha2.c +++ b/algo/sha/sph_sha2.c @@ -40,8 +40,8 @@ #endif #define CH(X, Y, Z) ((((Y) ^ (Z)) & (X)) ^ (Z)) -#define MAJ(X, Y, Z) (((Y) & (Z)) | (((Y) | (Z)) & (X))) - +//#define MAJ(X, Y, Z) (((Y) & (Z)) | (((Y) | (Z)) & (X))) +#define MAJ( X, Y, Z ) ( Y ^ ( ( X ^ Y ) & ( Y ^ Z ) ) ) #define ROTR SPH_ROTR32 #define BSG2_0(x) (ROTR(x, 2) ^ ROTR(x, 13) ^ ROTR(x, 22)) diff --git a/algo/sha/sph_sha2big.c b/algo/sha/sph_sha2big.c index 8ea292f6..06d2d16e 100644 --- a/algo/sha/sph_sha2big.c +++ b/algo/sha/sph_sha2big.c @@ -38,7 +38,8 @@ #if SPH_64 #define CH(X, Y, Z) ((((Y) ^ (Z)) & (X)) ^ (Z)) -#define MAJ(X, Y, Z) (((X) & (Y)) | (((X) | (Y)) & (Z))) +//#define MAJ(X, Y, Z) (((X) & (Y)) | (((X) | (Y)) & (Z))) +#define MAJ( X, Y, Z ) ( Y ^ ( ( X ^ Y ) & ( Y ^ Z ) ) ) #define ROTR64 SPH_ROTR64 diff --git a/configure b/configure index 1d15c406..00e7ac37 100755 --- a/configure +++ b/configure @@ -1,6 +1,6 @@ #! /bin/sh # Guess values for system-dependent variables and create Makefiles. -# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.16.3. +# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.16.4. # # # Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc. @@ -577,8 +577,8 @@ MAKEFLAGS= # Identity of this package. 
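A note on the MAJ rewrite used throughout the sha256/sha512 4-, 8- and 16-way rounds and in sph_sha2.c / sph_sha2big.c above: it relies on the identity MAJ(x,y,z) = (x & y) | ((x | y) & z) = y ^ ((x ^ y) & (y ^ z)). Because the SHA-2 working variables rotate from one round to the next, the x ^ y computed in a round is exactly the y ^ z the following round needs, which is what the X_xor_Y / Y_xor_Z registers and the "Y_xor_Z = X_xor_Y;" assignment in each STEP macro carry forward. A minimal scalar sketch of the two equivalent forms; the function names are illustrative, not from the patch:

    #include <stdint.h>

    /* Textbook majority, as previously coded: 4 logic ops per call. */
    static inline uint32_t maj_ref( uint32_t x, uint32_t y, uint32_t z )
    {   return ( x & y ) | ( ( x | y ) & z );   }

    /* Rewritten form: bitwise-identical, and ( x ^ y ) can be saved and
       reused as the next round's ( y ^ z ), leaving 3 ops per round. */
    static inline uint32_t maj_opt( uint32_t x, uint32_t y, uint32_t z )
    {   return y ^ ( ( x ^ y ) & ( y ^ z ) );   }

Both forms agree for all inputs; the saving comes only from hoisting the shared xor across rounds, as the step macros do.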
PACKAGE_NAME='cpuminer-opt' PACKAGE_TARNAME='cpuminer-opt' -PACKAGE_VERSION='3.16.3' -PACKAGE_STRING='cpuminer-opt 3.16.3' +PACKAGE_VERSION='3.16.4' +PACKAGE_STRING='cpuminer-opt 3.16.4' PACKAGE_BUGREPORT='' PACKAGE_URL='' @@ -1332,7 +1332,7 @@ if test "$ac_init_help" = "long"; then # Omit some internal or obsolete options to make the list less imposing. # This message is too long to be a string in the A/UX 3.1 sh. cat <<_ACEOF -\`configure' configures cpuminer-opt 3.16.3 to adapt to many kinds of systems. +\`configure' configures cpuminer-opt 3.16.4 to adapt to many kinds of systems. Usage: $0 [OPTION]... [VAR=VALUE]... @@ -1404,7 +1404,7 @@ fi if test -n "$ac_init_help"; then case $ac_init_help in - short | recursive ) echo "Configuration of cpuminer-opt 3.16.3:";; + short | recursive ) echo "Configuration of cpuminer-opt 3.16.4:";; esac cat <<\_ACEOF @@ -1509,7 +1509,7 @@ fi test -n "$ac_init_help" && exit $ac_status if $ac_init_version; then cat <<\_ACEOF -cpuminer-opt configure 3.16.3 +cpuminer-opt configure 3.16.4 generated by GNU Autoconf 2.69 Copyright (C) 2012 Free Software Foundation, Inc. @@ -2012,7 +2012,7 @@ cat >config.log <<_ACEOF This file contains any messages produced by compilers while running configure, to aid debugging if configure makes a mistake. -It was created by cpuminer-opt $as_me 3.16.3, which was +It was created by cpuminer-opt $as_me 3.16.4, which was generated by GNU Autoconf 2.69. Invocation command line was $ $0 $@ @@ -2993,7 +2993,7 @@ fi # Define the identity of the package. PACKAGE='cpuminer-opt' - VERSION='3.16.3' + VERSION='3.16.4' cat >>confdefs.h <<_ACEOF @@ -6690,7 +6690,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 # report actual input values of CONFIG_FILES etc. instead of their # values after options handling. ac_log=" -This file was extended by cpuminer-opt $as_me 3.16.3, which was +This file was extended by cpuminer-opt $as_me 3.16.4, which was generated by GNU Autoconf 2.69. 
Invocation command line was CONFIG_FILES = $CONFIG_FILES @@ -6756,7 +6756,7 @@ _ACEOF cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`" ac_cs_version="\\ -cpuminer-opt config.status 3.16.3 +cpuminer-opt config.status 3.16.4 configured by $0, generated by GNU Autoconf 2.69, with options \\"\$ac_cs_config\\" diff --git a/configure.ac b/configure.ac index 82a90496..6a4059da 100644 --- a/configure.ac +++ b/configure.ac @@ -1,4 +1,4 @@ -AC_INIT([cpuminer-opt], [3.16.3]) +AC_INIT([cpuminer-opt], [3.16.4]) AC_PREREQ([2.59c]) AC_CANONICAL_SYSTEM diff --git a/cpu-miner.c b/cpu-miner.c index 6b62a3c6..26e48b96 100644 --- a/cpu-miner.c +++ b/cpu-miner.c @@ -447,8 +447,10 @@ static bool work_decode( const json_t *val, struct work *work ) if ( !allow_mininginfo ) net_diff = algo_gate.calc_network_diff( work ); + else + net_diff = hash_to_diff( work->target ); - work->targetdiff = hash_to_diff( work->target ); + work->targetdiff = net_diff; stratum_diff = last_targetdiff = work->targetdiff; work->sharediff = 0; algo_gate.decode_extra_data( work, &net_blocks ); @@ -908,7 +910,8 @@ static bool gbt_work_decode( const json_t *val, struct work *work ) } for ( i = 0; i < ARRAY_SIZE( work->target ); i++ ) work->target[7 - i] = be32dec( target + i ); - + net_diff = work->targetdiff = hash_to_diff( work->target ); + tmp = json_object_get( val, "workid" ); if ( tmp ) { From 19cc88d10256afd1df67b184e2f926f66eae9f61 Mon Sep 17 00:00:00 2001 From: Jay D Dee Date: Sat, 26 Jun 2021 12:27:44 -0400 Subject: [PATCH 08/20] v3.16.5 --- RELEASE_NOTES | 5 +++++ configure | 20 ++++++++++---------- configure.ac | 2 +- cpu-miner.c | 11 ++++++++--- 4 files changed, 24 insertions(+), 14 deletions(-) diff --git a/RELEASE_NOTES b/RELEASE_NOTES index bf9aec58..a500a8dc 100644 --- a/RELEASE_NOTES +++ b/RELEASE_NOTES @@ -65,6 +65,11 @@ If not what makes it happen or not happen? Change Log ---------- +v3.16.5 + +#329: Fixed GBT incorrect target diff in stats, second attempt. +Fixed formatting error in share result log when --no-color option is used. + v3.16.4 Faster sha512 and sha256 when not using SHA CPU extension. diff --git a/configure b/configure index 00e7ac37..403892fc 100755 --- a/configure +++ b/configure @@ -1,6 +1,6 @@ #! /bin/sh # Guess values for system-dependent variables and create Makefiles. -# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.16.4. +# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.16.5. # # # Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc. @@ -577,8 +577,8 @@ MAKEFLAGS= # Identity of this package. PACKAGE_NAME='cpuminer-opt' PACKAGE_TARNAME='cpuminer-opt' -PACKAGE_VERSION='3.16.4' -PACKAGE_STRING='cpuminer-opt 3.16.4' +PACKAGE_VERSION='3.16.5' +PACKAGE_STRING='cpuminer-opt 3.16.5' PACKAGE_BUGREPORT='' PACKAGE_URL='' @@ -1332,7 +1332,7 @@ if test "$ac_init_help" = "long"; then # Omit some internal or obsolete options to make the list less imposing. # This message is too long to be a string in the A/UX 3.1 sh. cat <<_ACEOF -\`configure' configures cpuminer-opt 3.16.4 to adapt to many kinds of systems. +\`configure' configures cpuminer-opt 3.16.5 to adapt to many kinds of systems. Usage: $0 [OPTION]... [VAR=VALUE]... 
@@ -1404,7 +1404,7 @@ fi if test -n "$ac_init_help"; then case $ac_init_help in - short | recursive ) echo "Configuration of cpuminer-opt 3.16.4:";; + short | recursive ) echo "Configuration of cpuminer-opt 3.16.5:";; esac cat <<\_ACEOF @@ -1509,7 +1509,7 @@ fi test -n "$ac_init_help" && exit $ac_status if $ac_init_version; then cat <<\_ACEOF -cpuminer-opt configure 3.16.4 +cpuminer-opt configure 3.16.5 generated by GNU Autoconf 2.69 Copyright (C) 2012 Free Software Foundation, Inc. @@ -2012,7 +2012,7 @@ cat >config.log <<_ACEOF This file contains any messages produced by compilers while running configure, to aid debugging if configure makes a mistake. -It was created by cpuminer-opt $as_me 3.16.4, which was +It was created by cpuminer-opt $as_me 3.16.5, which was generated by GNU Autoconf 2.69. Invocation command line was $ $0 $@ @@ -2993,7 +2993,7 @@ fi # Define the identity of the package. PACKAGE='cpuminer-opt' - VERSION='3.16.4' + VERSION='3.16.5' cat >>confdefs.h <<_ACEOF @@ -6690,7 +6690,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 # report actual input values of CONFIG_FILES etc. instead of their # values after options handling. ac_log=" -This file was extended by cpuminer-opt $as_me 3.16.4, which was +This file was extended by cpuminer-opt $as_me 3.16.5, which was generated by GNU Autoconf 2.69. Invocation command line was CONFIG_FILES = $CONFIG_FILES @@ -6756,7 +6756,7 @@ _ACEOF cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`" ac_cs_version="\\ -cpuminer-opt config.status 3.16.4 +cpuminer-opt config.status 3.16.5 configured by $0, generated by GNU Autoconf 2.69, with options \\"\$ac_cs_config\\" diff --git a/configure.ac b/configure.ac index 6a4059da..c0aca337 100644 --- a/configure.ac +++ b/configure.ac @@ -1,4 +1,4 @@ -AC_INIT([cpuminer-opt], [3.16.4]) +AC_INIT([cpuminer-opt], [3.16.5]) AC_PREREQ([2.59c]) AC_CANONICAL_SYSTEM diff --git a/cpu-miner.c b/cpu-miner.c index 26e48b96..e46d920a 100644 --- a/cpu-miner.c +++ b/cpu-miner.c @@ -484,13 +484,17 @@ static bool get_mininginfo( CURL *curl, struct work *work ) // "networkhashps": 56475980 if ( res ) { + // net_diff is a global that is set from the work hash target by + // both getwork and GBT. Don't overwrite it, define a local to override + // the global. + double net_diff = 0.; json_t *key = json_object_get( res, "difficulty" ); if ( key ) { if ( json_is_object( key ) ) key = json_object_get( key, "proof-of-work" ); if ( json_is_real( key ) ) - net_diff = work->targetdiff = json_real_value( key ); + net_diff = json_real_value( key ); } key = json_object_get( res, "networkhashps" ); @@ -1168,7 +1172,8 @@ static int share_result( int result, struct work *work, char bres[48]; bool solved = false; bool stale = false; - char *acol = NULL, *bcol = NULL, *scol = NULL, *rcol = NULL; + char *acol, *bcol, *scol, *rcol; + acol = bcol = scol = rcol = "\0"; pthread_mutex_lock( &stats_lock ); @@ -1210,7 +1215,7 @@ static int share_result( int result, struct work *work, sprintf( sres, "S%d", stale_share_count ); sprintf( rres, "R%d", rejected_share_count ); if unlikely( ( my_stats.net_diff > 0. 
) - && ( my_stats.share_diff >= net_diff ) ) + && ( my_stats.share_diff >= my_stats.net_diff ) ) { solved = true; solved_block_count++; From 92b37339254568be31ac27dbc3dc31fa7ae29d5c Mon Sep 17 00:00:00 2001 From: Jay D Dee Date: Thu, 15 Jul 2021 20:30:44 -0400 Subject: [PATCH 09/20] v3.17.0 --- Makefile.am | 2 + README.txt | 5 + RELEASE_NOTES | 7 + algo/bmw/bmw512-hash-4way.c | 32 +-- algo/hamsi/hamsi-hash-4way.c | 53 ++-- algo/haval/haval-hash-4way.c | 71 ++--- algo/jh/jh-hash-4way.c | 28 +- algo/keccak/keccak-hash-4way.c | 9 + algo/keccak/keccak-macros.c | 14 + algo/luffa/luffa-hash-2way.c | 81 +++--- algo/panama/panama-hash-4way.c | 19 +- algo/sha/sha-hash-4way.h | 10 + algo/sha/sha2.c | 24 +- algo/sha/sha256-hash-2way-ni.c | 345 ++++++++++++++++++++++++ algo/sha/sha256-hash-4way.c | 476 +++++++++++++++++++++++++++++---- algo/sha/sha256-hash-opt.c | 30 +-- algo/sha/sha256-hash-opt.h | 18 ++ algo/sha/sha256d-4way.c | 252 +++++++++++++++++ algo/sha/sha256t-4way.c | 338 +++++++++++++---------- algo/sha/sha256t-gate.c | 4 - algo/sha/sha256t-gate.h | 8 +- algo/sha/sha256t.c | 163 ++++++++++- algo/sha/sha512-hash-4way.c | 85 +----- algo/sha/sph_sha2.c | 189 ++++++++++++- algo/shabal/shabal-hash-4way.c | 11 +- algo/skein/skein-hash-4way.c | 13 +- configure | 20 +- configure.ac | 2 +- cpu-miner.c | 8 +- miner.h | 1 + simd-utils/simd-256.h | 75 ++++++ simd-utils/simd-512.h | 62 ++++- util.c | 9 + 33 files changed, 1976 insertions(+), 488 deletions(-) create mode 100644 algo/sha/sha256-hash-2way-ni.c create mode 100644 algo/sha/sha256-hash-opt.h create mode 100644 algo/sha/sha256d-4way.c diff --git a/Makefile.am b/Makefile.am index d5398c00..a4adc3b7 100644 --- a/Makefile.am +++ b/Makefile.am @@ -163,6 +163,8 @@ cpuminer_SOURCES = \ algo/sha/sph_sha2big.c \ algo/sha/sha256-hash-4way.c \ algo/sha/sha512-hash-4way.c \ + algo/sha/sha256-hash-opt.c \ + algo/sha/sha256-hash-2way-ni.c \ algo/sha/hmac-sha256-hash.c \ algo/sha/hmac-sha256-hash-4way.c \ algo/sha/sha2.c \ diff --git a/README.txt b/README.txt index 08c34b9d..22428ec1 100644 --- a/README.txt +++ b/README.txt @@ -64,6 +64,11 @@ source code obtained from the author's official repository. The exact procedure is documented in the build instructions for Windows: https://github.com/JayDDee/cpuminer-opt/wiki/Compiling-from-source +Some DLL filess may already be installed on the system by Windows or third +party packages. They often will work and may be used instead of the included +file. Without a compelling reason to do so it's recommended to use the included +files as they are packaged. + If you like this software feel free to donate: BTC: 12tdvfF7KmAsihBXQXynT6E6th2c2pByTT diff --git a/RELEASE_NOTES b/RELEASE_NOTES index a500a8dc..cdacd323 100644 --- a/RELEASE_NOTES +++ b/RELEASE_NOTES @@ -65,6 +65,13 @@ If not what makes it happen or not happen? Change Log ---------- +v3.17.0 + +AVX512 optimized using ternary logic instructions. +Faster sha256t on all CPU architectures: AVX512 +30%, SHA +30%, AVX2 +9%. +Use SHA on supported CPUs to produce merkle hash. +Fixed byte order in Extranonce2 log & replaced Block height with Job ID. + v3.16.5 #329: Fixed GBT incorrect target diff in stats, second attempt. 
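The v3.17.0 note above ("AVX512 optimized using ternary logic instructions") refers to helpers such as mm512_xor3, mm512_xorand, mm512_xoror and mm512_xorandnot that the following hunks call but whose definitions, in simd-utils/simd-512.h, are not part of this excerpt. Below is a sketch of how such wrappers can be built on _mm512_ternarylogic_epi64; the names and semantics match the call sites in the hunks, but the immediates are derived here from the truth tables and the actual definitions in simd-512.h may differ in detail.

// Sketch only: VPTERNLOG computes any 3-input Boolean function. Bit n of the
// result is imm8[ (a_n << 2) | (b_n << 1) | c_n ], so the immediate is simply
// the 8-entry truth table of the desired expression.
#include <immintrin.h>

#define mm512_xor3( a, b, c ) \
   _mm512_ternarylogic_epi64( a, b, c, 0x96 )   // a ^ b ^ c
#define mm512_xorand( a, b, c ) \
   _mm512_ternarylogic_epi64( a, b, c, 0x78 )   // a ^ ( b & c )
#define mm512_xoror( a, b, c ) \
   _mm512_ternarylogic_epi64( a, b, c, 0x1e )   // a ^ ( b | c )
#define mm512_xorandnot( a, b, c ) \
   _mm512_ternarylogic_epi64( a, b, c, 0xd2 )   // a ^ ( ~b & c )
#define mm512_orand( a, b, c ) \
   _mm512_ternarylogic_epi64( a, b, c, 0xf8 )   // a | ( b & c )

Each such call folds what was previously two or three XOR/AND/OR instructions into a single VPTERNLOG, which is the instruction-count reduction visible in the bmw, hamsi, haval, jh, keccak and luffa hunks that follow.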
diff --git a/algo/bmw/bmw512-hash-4way.c b/algo/bmw/bmw512-hash-4way.c index ae97b942..4778914e 100644 --- a/algo/bmw/bmw512-hash-4way.c +++ b/algo/bmw/bmw512-hash-4way.c @@ -1293,32 +1293,26 @@ void compress_big_8way( const __m512i *M, const __m512i H[16], mm512_xor4( qt[28], qt[29], qt[30], qt[31] ) ) ); #define DH1L( m, sl, sr, a, b, c ) \ - _mm512_add_epi64( \ - _mm512_xor_si512( M[m], \ - _mm512_xor_si512( _mm512_slli_epi64( xh, sl ), \ - _mm512_srli_epi64( qt[a], sr ) ) ), \ - _mm512_xor_si512( _mm512_xor_si512( xl, qt[b] ), qt[c] ) ) + _mm512_add_epi64( mm512_xor3( M[m], _mm512_slli_epi64( xh, sl ), \ + _mm512_srli_epi64( qt[a], sr ) ), \ + mm512_xor3( xl, qt[b], qt[c] ) ) #define DH1R( m, sl, sr, a, b, c ) \ - _mm512_add_epi64( \ - _mm512_xor_si512( M[m], \ - _mm512_xor_si512( _mm512_srli_epi64( xh, sl ), \ - _mm512_slli_epi64( qt[a], sr ) ) ), \ - _mm512_xor_si512( _mm512_xor_si512( xl, qt[b] ), qt[c] ) ) + _mm512_add_epi64( mm512_xor3( M[m], _mm512_srli_epi64( xh, sl ), \ + _mm512_slli_epi64( qt[a], sr ) ), \ + mm512_xor3( xl, qt[b], qt[c] ) ) #define DH2L( m, rl, sl, h, a, b, c ) \ _mm512_add_epi64( _mm512_add_epi64( \ - mm512_rol_64( dH[h], rl ), \ - _mm512_xor_si512( _mm512_xor_si512( xh, qt[a] ), M[m] )), \ - _mm512_xor_si512( _mm512_slli_epi64( xl, sl ), \ - _mm512_xor_si512( qt[b], qt[c] ) ) ); - + mm512_rol_64( dH[h], rl ), \ + mm512_xor3( xh, qt[a], M[m] ) ), \ + mm512_xor3( _mm512_slli_epi64( xl, sl ), qt[b], qt[c] ) ) + #define DH2R( m, rl, sr, h, a, b, c ) \ _mm512_add_epi64( _mm512_add_epi64( \ - mm512_rol_64( dH[h], rl ), \ - _mm512_xor_si512( _mm512_xor_si512( xh, qt[a] ), M[m] )), \ - _mm512_xor_si512( _mm512_srli_epi64( xl, sr ), \ - _mm512_xor_si512( qt[b], qt[c] ) ) ); + mm512_rol_64( dH[h], rl ), \ + mm512_xor3( xh, qt[a], M[m] ) ), \ + mm512_xor3( _mm512_srli_epi64( xl, sr ), qt[b], qt[c] ) ) dH[ 0] = DH1L( 0, 5, 5, 16, 24, 0 ); diff --git a/algo/hamsi/hamsi-hash-4way.c b/algo/hamsi/hamsi-hash-4way.c index 2a952a73..9944ebe4 100644 --- a/algo/hamsi/hamsi-hash-4way.c +++ b/algo/hamsi/hamsi-hash-4way.c @@ -560,22 +560,14 @@ do { \ __m512i dm = _mm512_and_si512( db, m512_one_64 ) ; \ dm = mm512_negate_32( _mm512_or_si512( dm, \ _mm512_slli_epi64( dm, 32 ) ) ); \ - m0 = _mm512_xor_si512( m0, _mm512_and_si512( dm, \ - m512_const1_64( tp[0] ) ) ); \ - m1 = _mm512_xor_si512( m1, _mm512_and_si512( dm, \ - m512_const1_64( tp[1] ) ) ); \ - m2 = _mm512_xor_si512( m2, _mm512_and_si512( dm, \ - m512_const1_64( tp[2] ) ) ); \ - m3 = _mm512_xor_si512( m3, _mm512_and_si512( dm, \ - m512_const1_64( tp[3] ) ) ); \ - m4 = _mm512_xor_si512( m4, _mm512_and_si512( dm, \ - m512_const1_64( tp[4] ) ) ); \ - m5 = _mm512_xor_si512( m5, _mm512_and_si512( dm, \ - m512_const1_64( tp[5] ) ) ); \ - m6 = _mm512_xor_si512( m6, _mm512_and_si512( dm, \ - m512_const1_64( tp[6] ) ) ); \ - m7 = _mm512_xor_si512( m7, _mm512_and_si512( dm, \ - m512_const1_64( tp[7] ) ) ); \ + m0 = mm512_xorand( m0, dm, m512_const1_64( tp[0] ) ); \ + m1 = mm512_xorand( m1, dm, m512_const1_64( tp[1] ) ); \ + m2 = mm512_xorand( m2, dm, m512_const1_64( tp[2] ) ); \ + m3 = mm512_xorand( m3, dm, m512_const1_64( tp[3] ) ); \ + m4 = mm512_xorand( m4, dm, m512_const1_64( tp[4] ) ); \ + m5 = mm512_xorand( m5, dm, m512_const1_64( tp[5] ) ); \ + m6 = mm512_xorand( m6, dm, m512_const1_64( tp[6] ) ); \ + m7 = mm512_xorand( m7, dm, m512_const1_64( tp[7] ) ); \ tp += 8; \ db = _mm512_srli_epi64( db, 1 ); \ } \ @@ -585,20 +577,13 @@ do { \ do { \ __m512i t; \ t = a; \ - a = _mm512_and_si512( a, c ); \ - a = _mm512_xor_si512( a, d ); 
\ - c = _mm512_xor_si512( c, b ); \ - c = _mm512_xor_si512( c, a ); \ - d = _mm512_or_si512( d, t ); \ - d = _mm512_xor_si512( d, b ); \ + a = mm512_xorand( d, a, c ); \ + c = mm512_xor3( a, b, c ); \ + b = mm512_xoror( b, d, t ); \ t = _mm512_xor_si512( t, c ); \ - b = d; \ - d = _mm512_or_si512( d, t ); \ - d = _mm512_xor_si512( d, a ); \ - a = _mm512_and_si512( a, b ); \ - t = _mm512_xor_si512( t, a ); \ - b = _mm512_xor_si512( b, d ); \ - b = _mm512_xor_si512( b, t ); \ + d = mm512_xoror( a, b, t ); \ + t = mm512_xorand( t, a, b ); \ + b = mm512_xor3( b, d, t ); \ a = c; \ c = b; \ b = d; \ @@ -609,14 +594,12 @@ do { \ do { \ a = mm512_rol_32( a, 13 ); \ c = mm512_rol_32( c, 3 ); \ - b = _mm512_xor_si512( b, _mm512_xor_si512( a, c ) ); \ - d = _mm512_xor_si512( d, _mm512_xor_si512( c, \ - _mm512_slli_epi32( a, 3 ) ) ); \ + b = mm512_xor3( a, b, c ); \ + d = mm512_xor3( d, c, _mm512_slli_epi32( a, 3 ) ); \ b = mm512_rol_32( b, 1 ); \ d = mm512_rol_32( d, 7 ); \ - a = _mm512_xor_si512( a, _mm512_xor_si512( b, d ) ); \ - c = _mm512_xor_si512( c, _mm512_xor_si512( d, \ - _mm512_slli_epi32( b, 7 ) ) ); \ + a = mm512_xor3( a, b, d ); \ + c = mm512_xor3( c, d, _mm512_slli_epi32( b, 7 ) ); \ a = mm512_rol_32( a, 5 ); \ c = mm512_rol_32( c, 22 ); \ } while (0) diff --git a/algo/haval/haval-hash-4way.c b/algo/haval/haval-hash-4way.c index 6b45e10b..20c9755f 100644 --- a/algo/haval/haval-hash-4way.c +++ b/algo/haval/haval-hash-4way.c @@ -522,50 +522,53 @@ do { \ // Haval-256 8 way 32 bit avx2 +#if defined (__AVX512VL__) + +// ( ~( a ^ b ) ) & c +#define mm256_andnotxor( a, b, c ) \ + _mm256_ternarylogic_epi32( a, b, c, 0x82 ) + +#else + +#define mm256_andnotxor( a, b, c ) \ + _mm256_andnot_si256( _mm256_xor_si256( a, b ), c ) + +#endif + #define F1_8W(x6, x5, x4, x3, x2, x1, x0) \ - _mm256_xor_si256( x0, \ - _mm256_xor_si256( _mm256_and_si256(_mm256_xor_si256( x0, x4 ), x1 ), \ - _mm256_xor_si256( _mm256_and_si256( x2, x5 ), \ - _mm256_and_si256( x3, x6 ) ) ) ) \ + mm256_xor3( x0, mm256_andxor( x1, x0, x4 ), \ + _mm256_xor_si256( _mm256_and_si256( x2, x5 ), \ + _mm256_and_si256( x3, x6 ) ) ) \ #define F2_8W(x6, x5, x4, x3, x2, x1, x0) \ - _mm256_xor_si256( \ - _mm256_and_si256( x2, \ - _mm256_xor_si256( _mm256_andnot_si256( x3, x1 ), \ - _mm256_xor_si256( _mm256_and_si256( x4, x5 ), \ - _mm256_xor_si256( x6, x0 ) ) ) ), \ - _mm256_xor_si256( \ - _mm256_and_si256( x4, _mm256_xor_si256( x1, x5 ) ), \ - _mm256_xor_si256( _mm256_and_si256( x3, x5 ), x0 ) ) ) \ + mm256_xor3( mm256_andxor( x2, _mm256_andnot_si256( x3, x1 ), \ + mm256_xor3( _mm256_and_si256( x4, x5 ), x6, x0 ) ), \ + mm256_andxor( x4, x1, x5 ), \ + mm256_xorand( x0, x3, x5 ) ) \ #define F3_8W(x6, x5, x4, x3, x2, x1, x0) \ - _mm256_xor_si256( \ - _mm256_and_si256( x3, \ - _mm256_xor_si256( _mm256_and_si256( x1, x2 ), \ - _mm256_xor_si256( x6, x0 ) ) ), \ - _mm256_xor_si256( _mm256_xor_si256(_mm256_and_si256( x1, x4 ), \ - _mm256_and_si256( x2, x5 ) ), x0 ) ) + mm256_xor3( x0, \ + _mm256_and_si256( x3, \ + mm256_xor3( _mm256_and_si256( x1, x2 ), x6, x0 ) ), \ + _mm256_xor_si256( _mm256_and_si256( x1, x4 ), \ + _mm256_and_si256( x2, x5 ) ) ) #define F4_8W(x6, x5, x4, x3, x2, x1, x0) \ - _mm256_xor_si256( \ - _mm256_xor_si256( \ - _mm256_and_si256( x3, \ - _mm256_xor_si256( _mm256_xor_si256( _mm256_and_si256( x1, x2 ), \ - _mm256_or_si256( x4, x6 ) ), x5 ) ), \ - _mm256_and_si256( x4, \ - _mm256_xor_si256( _mm256_xor_si256( _mm256_and_si256( mm256_not(x2), x5 ), \ - _mm256_xor_si256( x1, x6 ) ), x0 ) ) ), \ - _mm256_xor_si256( 
_mm256_and_si256( x2, x6 ), x0 ) ) - + mm256_xor3( \ + mm256_andxor( x3, x5, \ + _mm256_xor_si256( _mm256_and_si256( x1, x2 ), \ + _mm256_or_si256( x4, x6 ) ) ), \ + _mm256_and_si256( x4, \ + mm256_xor3( x0, _mm256_andnot_si256( x2, x5 ), \ + _mm256_xor_si256( x1, x6 ) ) ), \ + mm256_xorand( x0, x2, x6 ) ) #define F5_8W(x6, x5, x4, x3, x2, x1, x0) \ _mm256_xor_si256( \ - _mm256_and_si256( x0, \ - mm256_not( _mm256_xor_si256( \ - _mm256_and_si256( _mm256_and_si256( x1, x2 ), x3 ), x5 ) ) ), \ - _mm256_xor_si256( _mm256_xor_si256( _mm256_and_si256( x1, x4 ), \ - _mm256_and_si256( x2, x5 ) ), \ - _mm256_and_si256( x3, x6 ) ) ) + mm256_andnotxor( mm256_and3( x1, x2, x3 ), x5, x0 ), \ + mm256_xor3( _mm256_and_si256( x1, x4 ), \ + _mm256_and_si256( x2, x5 ), \ + _mm256_and_si256( x3, x6 ) ) ) #define FP3_1_8W(x6, x5, x4, x3, x2, x1, x0) \ F1_8W(x1, x0, x3, x5, x6, x2, x4) diff --git a/algo/jh/jh-hash-4way.c b/algo/jh/jh-hash-4way.c index 452bc8a6..98a9da01 100644 --- a/algo/jh/jh-hash-4way.c +++ b/algo/jh/jh-hash-4way.c @@ -51,15 +51,15 @@ extern "C"{ do { \ __m512i cc = _mm512_set1_epi64( c ); \ x3 = mm512_not( x3 ); \ - x0 = _mm512_xor_si512( x0, _mm512_andnot_si512( x2, cc ) ); \ - tmp = _mm512_xor_si512( cc, _mm512_and_si512( x0, x1 ) ); \ - x0 = _mm512_xor_si512( x0, _mm512_and_si512( x2, x3 ) ); \ - x3 = _mm512_xor_si512( x3, _mm512_andnot_si512( x1, x2 ) ); \ - x1 = _mm512_xor_si512( x1, _mm512_and_si512( x0, x2 ) ); \ - x2 = _mm512_xor_si512( x2, _mm512_andnot_si512( x3, x0 ) ); \ - x0 = _mm512_xor_si512( x0, _mm512_or_si512( x1, x3 ) ); \ - x3 = _mm512_xor_si512( x3, _mm512_and_si512( x1, x2 ) ); \ - x1 = _mm512_xor_si512( x1, _mm512_and_si512( tmp, x0 ) ); \ + x0 = mm512_xorandnot( x0, x2, cc ); \ + tmp = mm512_xorand( cc, x0, x1 ); \ + x0 = mm512_xorand( x0, x2, x3 ); \ + x3 = mm512_xorandnot( x3, x1, x2 ); \ + x1 = mm512_xorand( x1, x0, x2 ); \ + x2 = mm512_xorandnot( x2, x3, x0 ); \ + x0 = mm512_xoror( x0, x1, x3 ); \ + x3 = mm512_xorand( x3, x1, x2 ); \ + x1 = mm512_xorand( x1, tmp, x0 ); \ x2 = _mm512_xor_si512( x2, tmp ); \ } while (0) @@ -67,11 +67,11 @@ do { \ do { \ x4 = _mm512_xor_si512( x4, x1 ); \ x5 = _mm512_xor_si512( x5, x2 ); \ - x6 = _mm512_xor_si512( x6, _mm512_xor_si512( x3, x0 ) ); \ + x6 = mm512_xor3( x6, x3, x0 ); \ x7 = _mm512_xor_si512( x7, x0 ); \ x0 = _mm512_xor_si512( x0, x5 ); \ x1 = _mm512_xor_si512( x1, x6 ); \ - x2 = _mm512_xor_si512( x2, _mm512_xor_si512( x7, x4 ) ); \ + x2 = mm512_xor3( x2, x7, x4 ); \ x3 = _mm512_xor_si512( x3, x4 ); \ } while (0) @@ -318,12 +318,12 @@ static const sph_u64 C[] = { #define Wz_8W(x, c, n) \ do { \ __m512i t = _mm512_slli_epi64( _mm512_and_si512(x ## h, (c)), (n) ); \ - x ## h = _mm512_or_si512( _mm512_and_si512( \ - _mm512_srli_epi64(x ## h, (n)), (c)), t ); \ + x ## h = mm512_orand( t, _mm512_srli_epi64( x ## h, (n) ), (c) ); \ t = _mm512_slli_epi64( _mm512_and_si512(x ## l, (c)), (n) ); \ - x ## l = _mm512_or_si512( _mm512_and_si512((x ## l >> (n)), (c)), t ); \ + x ## l = mm512_orand( t, (x ## l >> (n)), (c) ); \ } while (0) + #define W80(x) Wz_8W(x, m512_const1_64( 0x5555555555555555 ), 1 ) #define W81(x) Wz_8W(x, m512_const1_64( 0x3333333333333333 ), 2 ) #define W82(x) Wz_8W(x, m512_const1_64( 0x0F0F0F0F0F0F0F0F ), 4 ) diff --git a/algo/keccak/keccak-hash-4way.c b/algo/keccak/keccak-hash-4way.c index cc883322..e2545b4d 100644 --- a/algo/keccak/keccak-hash-4way.c +++ b/algo/keccak/keccak-hash-4way.c @@ -76,6 +76,9 @@ static const uint64_t RC[] = { #define OR64(d, a, b) (d = _mm512_or_si512(a,b)) #define NOT64(d, s) 
(d = _mm512_xor_si512(s,m512_neg1)) #define ROL64(d, v, n) (d = mm512_rol_64(v, n)) +#define XOROR(d, a, b, c) (d = mm512_xoror(a, b, c)) +#define XORAND(d, a, b, c) (d = mm512_xorand(a, b, c)) + #include "keccak-macros.c" @@ -238,6 +241,8 @@ keccak512_8way_close(void *cc, void *dst) #undef NOT64 #undef ROL64 #undef KECCAK_F_1600 +#undef XOROR +#undef XORAND #endif // AVX512 @@ -255,6 +260,8 @@ keccak512_8way_close(void *cc, void *dst) #define OR64(d, a, b) (d = _mm256_or_si256(a,b)) #define NOT64(d, s) (d = _mm256_xor_si256(s,m256_neg1)) #define ROL64(d, v, n) (d = mm256_rol_64(v, n)) +#define XOROR(d, a, b, c) (d = _mm256_xor_si256(a, _mm256_or_si256(b, c))) +#define XORAND(d, a, b, c) (d = _mm256_xor_si256(a, _mm256_and_si256(b, c))) #include "keccak-macros.c" @@ -419,5 +426,7 @@ keccak512_4way_close(void *cc, void *dst) #undef NOT64 #undef ROL64 #undef KECCAK_F_1600 +#undef XOROR +#undef XORAND #endif // AVX2 diff --git a/algo/keccak/keccak-macros.c b/algo/keccak/keccak-macros.c index 8d5197c3..436d1ca3 100644 --- a/algo/keccak/keccak-macros.c +++ b/algo/keccak/keccak-macros.c @@ -110,20 +110,34 @@ #ifdef KHI_XO #undef KHI_XO #endif + +#define KHI_XO(d, a, b, c) do { \ + XOROR(d, a, b, c); \ + } while (0) + +/* #define KHI_XO(d, a, b, c) do { \ DECL64(kt); \ OR64(kt, b, c); \ XOR64(d, a, kt); \ } while (0) +*/ #ifdef KHI_XA #undef KHI_XA #endif + +#define KHI_XA(d, a, b, c) do { \ + XORAND(d, a, b, c); \ + } while (0) + +/* #define KHI_XA(d, a, b, c) do { \ DECL64(kt); \ AND64(kt, b, c); \ XOR64(d, a, kt); \ } while (0) +*/ #ifdef KHI #undef KHI diff --git a/algo/luffa/luffa-hash-2way.c b/algo/luffa/luffa-hash-2way.c index bbc31b9b..3d1ce0d9 100644 --- a/algo/luffa/luffa-hash-2way.c +++ b/algo/luffa/luffa-hash-2way.c @@ -97,6 +97,21 @@ do { \ MIXWORD4W(*(x+3),*(x+7),*t,*(t+1));\ ADD_CONSTANT4W(*x, *(x+4), c0, c1); +#define SUBCRUMB4W(a0,a1,a2,a3,t)\ + t = a0;\ + a0 = mm512_xoror( a3, a0, a1 ); \ + a2 = _mm512_xor_si512(a2,a3);\ + a1 = _mm512_ternarylogic_epi64( a1, a3, t, 0x87 ); /* a1 xnor (a3 & t) */ \ + a3 = mm512_xorand( a2, a3, t ); \ + a2 = mm512_xorand( a1, a2, a0);\ + a1 = _mm512_or_si512(a1,a3);\ + a3 = _mm512_xor_si512(a3,a2);\ + t = _mm512_xor_si512(t,a1);\ + a2 = _mm512_and_si512(a2,a1);\ + a1 = mm512_xnor(a1,a0);\ + a0 = t; + +/* #define SUBCRUMB4W(a0,a1,a2,a3,t)\ t = _mm512_load_si512(&a0);\ a0 = _mm512_or_si512(a0,a1);\ @@ -115,7 +130,25 @@ do { \ a2 = _mm512_and_si512(a2,a1);\ a1 = _mm512_xor_si512(a1,a0);\ a0 = _mm512_load_si512(&t); +*/ +#define MIXWORD4W(a,b,t1,t2)\ + b = _mm512_xor_si512(a,b);\ + t1 = _mm512_slli_epi32(a,2);\ + t2 = _mm512_srli_epi32(a,30);\ + a = mm512_xoror( b, t1, t2 ); \ + t1 = _mm512_slli_epi32(b,14);\ + t2 = _mm512_srli_epi32(b,18);\ + b = _mm512_or_si512(t1,t2);\ + b = mm512_xoror( a, t1, t2 ); \ + t1 = _mm512_slli_epi32(a,10);\ + t2 = _mm512_srli_epi32(a,22);\ + a = mm512_xoror( b, t1, t2 ); \ + t1 = _mm512_slli_epi32(b,1);\ + t2 = _mm512_srli_epi32(b,31);\ + b = _mm512_or_si512(t1,t2); + +/* #define MIXWORD4W(a,b,t1,t2)\ b = _mm512_xor_si512(a,b);\ t1 = _mm512_slli_epi32(a,2);\ @@ -133,6 +166,7 @@ do { \ t1 = _mm512_slli_epi32(b,1);\ t2 = _mm512_srli_epi32(b,31);\ b = _mm512_or_si512(t1,t2); +*/ #define STEP_PART24W(a0,a1,t0,t1,c0,c1,tmp0,tmp1)\ a1 = _mm512_shuffle_epi32(a1,147);\ @@ -248,17 +282,10 @@ void rnd512_4way( luffa_4way_context *state, __m512i *msg ) __m512i tmp[2]; __m512i x[8]; - t0 = chainv[0]; - t1 = chainv[1]; - - t0 = _mm512_xor_si512( t0, chainv[2] ); - t1 = _mm512_xor_si512( t1, chainv[3] ); - t0 = _mm512_xor_si512( t0, 
chainv[4] ); - t1 = _mm512_xor_si512( t1, chainv[5] ); - t0 = _mm512_xor_si512( t0, chainv[6] ); - t1 = _mm512_xor_si512( t1, chainv[7] ); - t0 = _mm512_xor_si512( t0, chainv[8] ); - t1 = _mm512_xor_si512( t1, chainv[9] ); + t0 = mm512_xor3( chainv[0], chainv[2], chainv[4] ); + t1 = mm512_xor3( chainv[1], chainv[3], chainv[5] ); + t0 = mm512_xor3( t0, chainv[6], chainv[8] ); + t1 = mm512_xor3( t1, chainv[7], chainv[9] ); MULT24W( t0, t1 ); @@ -319,8 +346,8 @@ void rnd512_4way( luffa_4way_context *state, __m512i *msg ) chainv[3] = _mm512_xor_si512( chainv[3], chainv[1] ); MULT24W( chainv[0], chainv[1] ); - chainv[0] = _mm512_xor_si512( _mm512_xor_si512( chainv[0], t0 ), msg0 ); - chainv[1] = _mm512_xor_si512( _mm512_xor_si512( chainv[1], t1 ), msg1 ); + chainv[0] = mm512_xor3( chainv[0], t0, msg0 ); + chainv[1] = mm512_xor3( chainv[1], t1, msg1 ); MULT24W( msg0, msg1 ); chainv[2] = _mm512_xor_si512( chainv[2], msg0 ); @@ -398,19 +425,11 @@ void finalization512_4way( luffa_4way_context *state, uint32 *b ) /*---- blank round with m=0 ----*/ rnd512_4way( state, zero ); - - t[0] = chainv[0]; - t[1] = chainv[1]; - - t[0] = _mm512_xor_si512( t[0], chainv[2] ); - t[1] = _mm512_xor_si512( t[1], chainv[3] ); - t[0] = _mm512_xor_si512( t[0], chainv[4] ); - t[1] = _mm512_xor_si512( t[1], chainv[5] ); - t[0] = _mm512_xor_si512( t[0], chainv[6] ); - t[1] = _mm512_xor_si512( t[1], chainv[7] ); - t[0] = _mm512_xor_si512( t[0], chainv[8] ); - t[1] = _mm512_xor_si512( t[1], chainv[9] ); - + + t[0] = mm512_xor3( chainv[0], chainv[2], chainv[4] ); + t[1] = mm512_xor3( chainv[1], chainv[3], chainv[5] ); + t[0] = mm512_xor3( t[0], chainv[6], chainv[8] ); + t[1] = mm512_xor3( t[1], chainv[7], chainv[9] ); t[0] = _mm512_shuffle_epi32( t[0], 27 ); t[1] = _mm512_shuffle_epi32( t[1], 27 ); @@ -676,8 +695,6 @@ do { \ a1 = _mm256_or_si256( _mm256_srli_si256(a1,4), _mm256_slli_si256(b,12) ); \ } while(0) -// confirm pointer arithmetic -// ok but use array indexes #define STEP_PART(x,c0,c1,t)\ SUBCRUMB(*x,*(x+1),*(x+2),*(x+3),*t);\ SUBCRUMB(*(x+5),*(x+6),*(x+7),*(x+4),*t);\ @@ -688,23 +705,23 @@ do { \ ADD_CONSTANT(*x, *(x+4), c0, c1); #define SUBCRUMB(a0,a1,a2,a3,t)\ - t = _mm256_load_si256(&a0);\ + t = a0;\ a0 = _mm256_or_si256(a0,a1);\ a2 = _mm256_xor_si256(a2,a3);\ - a1 = _mm256_andnot_si256(a1, m256_neg1 );\ + a1 = mm256_not( a1 );\ a0 = _mm256_xor_si256(a0,a3);\ a3 = _mm256_and_si256(a3,t);\ a1 = _mm256_xor_si256(a1,a3);\ a3 = _mm256_xor_si256(a3,a2);\ a2 = _mm256_and_si256(a2,a0);\ - a0 = _mm256_andnot_si256(a0, m256_neg1 );\ + a0 = mm256_not( a0 );\ a2 = _mm256_xor_si256(a2,a1);\ a1 = _mm256_or_si256(a1,a3);\ t = _mm256_xor_si256(t,a1);\ a3 = _mm256_xor_si256(a3,a2);\ a2 = _mm256_and_si256(a2,a1);\ a1 = _mm256_xor_si256(a1,a0);\ - a0 = _mm256_load_si256(&t);\ + a0 = t;\ #define MIXWORD(a,b,t1,t2)\ b = _mm256_xor_si256(a,b);\ diff --git a/algo/panama/panama-hash-4way.c b/algo/panama/panama-hash-4way.c index d0bc1868..912fb2ec 100644 --- a/algo/panama/panama-hash-4way.c +++ b/algo/panama/panama-hash-4way.c @@ -312,10 +312,26 @@ do { \ BUPDATE1_8W( 7, 1 ); \ } while (0) +#if defined(__AVX512VL__) + +#define GAMMA_8W(n0, n1, n2, n4) \ + ( g ## n0 = _mm256_ternarylogic_epi32( a ## n0, a ## n2, a ## n1, 0x4b ) ) + +#define THETA_8W(n0, n1, n2, n4) \ + ( g ## n0 = mm256_xor3( a ## n0, a ## n1, a ## n4 ) ) + +#else + #define GAMMA_8W(n0, n1, n2, n4) \ (g ## n0 = _mm256_xor_si256( a ## n0, \ _mm256_or_si256( a ## n1, mm256_not( a ## n2 ) ) ) ) +#define THETA_8W(n0, n1, n2, n4) \ + ( g ## n0 = _mm256_xor_si256( a ## n0, 
_mm256_xor_si256( a ## n1, \ + a ## n4 ) ) ) + +#endif + #define PI_ALL_8W do { \ a0 = g0; \ a1 = mm256_rol_32( g7, 1 ); \ @@ -336,9 +352,6 @@ do { \ a16 = mm256_rol_32( g10, 8 ); \ } while (0) -#define THETA_8W(n0, n1, n2, n4) \ - ( g ## n0 = _mm256_xor_si256( a ## n0, _mm256_xor_si256( a ## n1, \ - a ## n4 ) ) ) #define SIGMA_ALL_8W do { \ a0 = _mm256_xor_si256( g0, m256_one_32 ); \ diff --git a/algo/sha/sha-hash-4way.h b/algo/sha/sha-hash-4way.h index 3a0c61b0..f9505d12 100644 --- a/algo/sha/sha-hash-4way.h +++ b/algo/sha/sha-hash-4way.h @@ -59,6 +59,8 @@ void sha256_4way_update( sha256_4way_context *sc, const void *data, size_t len ); void sha256_4way_close( sha256_4way_context *sc, void *dst ); void sha256_4way_full( void *dst, const void *data, size_t len ); +void sha256_4way_transform( __m128i *state_out, const __m128i *data, + const __m128i *state_in ); #endif // SSE2 @@ -77,6 +79,8 @@ void sha256_8way_init( sha256_8way_context *sc ); void sha256_8way_update( sha256_8way_context *sc, const void *data, size_t len ); void sha256_8way_close( sha256_8way_context *sc, void *dst ); void sha256_8way_full( void *dst, const void *data, size_t len ); +void sha256_8way_transform( __m256i *state_out, const __m256i *data, + const __m256i *state_in ); #endif // AVX2 @@ -95,6 +99,12 @@ void sha256_16way_init( sha256_16way_context *sc ); void sha256_16way_update( sha256_16way_context *sc, const void *data, size_t len ); void sha256_16way_close( sha256_16way_context *sc, void *dst ); void sha256_16way_full( void *dst, const void *data, size_t len ); +void sha256_16way_transform( __m512i *state_out, const __m512i *data, + const __m512i *state_in ); +void sha256_16way_prehash_3rounds( __m512i *state_mid, const __m512i *W, + const __m512i *state_in ); +void sha256_16way_final_rounds( __m512i *state_out, const __m512i *data, + const __m512i *state_in, const __m512i *state_mid ); #endif // AVX512 diff --git a/algo/sha/sha2.c b/algo/sha/sha2.c index 33cc6c12..7eb40673 100644 --- a/algo/sha/sha2.c +++ b/algo/sha/sha2.c @@ -195,8 +195,28 @@ static void sha256d_80_swap(uint32_t *hash, const uint32_t *data) hash[i] = swab32(hash[i]); } -extern void sha256d(unsigned char *hash, const unsigned char *data, int len) +#if defined (__SHA__) + +#include "algo/sha/sph_sha2.h" + +void sha256d(unsigned char *hash, const unsigned char *data, int len) +{ + sph_sha256_context ctx __attribute__ ((aligned (64))); + + sph_sha256_init( &ctx ); + sph_sha256( &ctx, data, len ); + sph_sha256_close( &ctx, hash ); + + sph_sha256_init( &ctx ); + sph_sha256( &ctx, hash, 32 ); + sph_sha256_close( &ctx, hash ); +} + +#else + +void sha256d(unsigned char *hash, const unsigned char *data, int len) { + uint32_t S[16], T[16]; int i, r; @@ -220,6 +240,8 @@ extern void sha256d(unsigned char *hash, const unsigned char *data, int len) be32enc((uint32_t *)hash + i, T[i]); } +#endif + static inline void sha256d_preextend(uint32_t *W) { W[16] = s1(W[14]) + W[ 9] + s0(W[ 1]) + W[ 0]; diff --git a/algo/sha/sha256-hash-2way-ni.c b/algo/sha/sha256-hash-2way-ni.c new file mode 100644 index 00000000..f169b63f --- /dev/null +++ b/algo/sha/sha256-hash-2way-ni.c @@ -0,0 +1,345 @@ +/* Intel SHA extensions using C intrinsics */ +/* Written and place in public domain by Jeffrey Walton */ +/* Based on code from Intel, and by Sean Gulley for */ +/* the miTLS project. */ + +// A stripped down version with byte swapping removed. 
+ +#if defined(__SHA__) + +#include "sha256-hash-opt.h" + +void sha256_ni2way_transform( uint32_t *out_X, uint32_t*out_Y, + const void *msg_X, const void *msg_Y, + const uint32_t *in_X, const uint32_t *in_Y ) +{ + __m128i STATE0_X, STATE1_X, STATE0_Y, STATE1_Y; + __m128i MSG_X, MSG_Y, TMP_X, TMP_Y; + __m128i TMSG0_X, TMSG1_X, TMSG2_X, TMSG3_X; + __m128i TMSG0_Y, TMSG1_Y, TMSG2_Y, TMSG3_Y; + __m128i ABEF_SAVE_X, CDGH_SAVE_X,ABEF_SAVE_Y, CDGH_SAVE_Y; + + // Load initial values + TMP_X = _mm_load_si128((__m128i*) &in_X[0]); + STATE1_X = _mm_load_si128((__m128i*) &in_X[4]); + TMP_Y = _mm_load_si128((__m128i*) &in_Y[0]); + STATE1_Y = _mm_load_si128((__m128i*) &in_Y[4]); + + TMP_X = _mm_shuffle_epi32(TMP_X, 0xB1); // CDAB + TMP_Y = _mm_shuffle_epi32(TMP_Y, 0xB1); // CDAB + STATE1_X = _mm_shuffle_epi32(STATE1_X, 0x1B); // EFGH + STATE1_Y = _mm_shuffle_epi32(STATE1_Y, 0x1B); // EFGH + STATE0_X = _mm_alignr_epi8(TMP_X, STATE1_X, 8); // ABEF + STATE0_Y = _mm_alignr_epi8(TMP_Y, STATE1_Y, 8); // ABEF + STATE1_X = _mm_blend_epi16(STATE1_X, TMP_X, 0xF0); // CDGH + STATE1_Y = _mm_blend_epi16(STATE1_Y, TMP_Y, 0xF0); // CDGH + + // Save current hash + ABEF_SAVE_X = STATE0_X; + ABEF_SAVE_Y = STATE0_Y; + CDGH_SAVE_X = STATE1_X; + CDGH_SAVE_Y = STATE1_Y; + + // Rounds 0-3 + TMSG0_X = _mm_load_si128((const __m128i*) (msg_X)); + TMSG0_Y = _mm_load_si128((const __m128i*) (msg_Y)); + TMP_X = _mm_set_epi64x(0xE9B5DBA5B5C0FBCFULL, 0x71374491428A2F98ULL); + MSG_X = _mm_add_epi32( TMSG0_X, TMP_X ); + MSG_Y = _mm_add_epi32( TMSG0_Y, TMP_X ); + STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); + STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); + MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); + MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); + STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); + STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); + + // Rounds 4-7 + TMSG1_X = _mm_load_si128((const __m128i*) (msg_X+16)); + TMSG1_Y = _mm_load_si128((const __m128i*) (msg_Y+16)); + TMP_X = _mm_set_epi64x(0xAB1C5ED5923F82A4ULL, 0x59F111F13956C25BULL); + MSG_X = _mm_add_epi32(TMSG1_X, TMP_X ); + MSG_Y = _mm_add_epi32(TMSG1_Y, TMP_X ); + STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); + STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); + MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); + MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); + STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); + STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); + TMSG0_X = _mm_sha256msg1_epu32(TMSG0_X, TMSG1_X); + TMSG0_Y = _mm_sha256msg1_epu32(TMSG0_Y, TMSG1_Y); + + // Rounds 8-11 + TMSG2_X = _mm_load_si128((const __m128i*) (msg_X+32)); + TMSG2_Y = _mm_load_si128((const __m128i*) (msg_Y+32)); + TMP_X = _mm_set_epi64x(0x550C7DC3243185BEULL, 0x12835B01D807AA98ULL); + MSG_X = _mm_add_epi32(TMSG2_X, TMP_X ); + MSG_Y = _mm_add_epi32(TMSG2_Y, TMP_X ); + STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); + STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); + MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); + MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); + STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); + STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); + TMSG1_X = _mm_sha256msg1_epu32(TMSG1_X, TMSG2_X); + TMSG1_Y = _mm_sha256msg1_epu32(TMSG1_Y, TMSG2_Y); + + // Rounds 12-15 + TMSG3_X = _mm_load_si128((const __m128i*) (msg_X+48)); + TMSG3_Y = _mm_load_si128((const __m128i*) (msg_Y+48)); + TMP_X = _mm_set_epi64x(0xC19BF1749BDC06A7ULL, 0x80DEB1FE72BE5D74ULL); + MSG_X = 
_mm_add_epi32(TMSG3_X, TMP_X ); + MSG_Y = _mm_add_epi32(TMSG3_Y, TMP_X ); + STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); + STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); + TMP_X = _mm_alignr_epi8(TMSG3_X, TMSG2_X, 4); + TMP_Y = _mm_alignr_epi8(TMSG3_Y, TMSG2_Y, 4); + TMSG0_X = _mm_add_epi32(TMSG0_X, TMP_X); + TMSG0_Y = _mm_add_epi32(TMSG0_Y, TMP_Y); + TMSG0_X = _mm_sha256msg2_epu32(TMSG0_X, TMSG3_X); + TMSG0_Y = _mm_sha256msg2_epu32(TMSG0_Y, TMSG3_Y); + MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); + MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); + STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); + STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); + TMSG2_X = _mm_sha256msg1_epu32(TMSG2_X, TMSG3_X); + TMSG2_Y = _mm_sha256msg1_epu32(TMSG2_Y, TMSG3_Y); + + // Rounds 16-19 + TMP_X = _mm_set_epi64x(0x240CA1CC0FC19DC6ULL, 0xEFBE4786E49B69C1ULL); + MSG_X = _mm_add_epi32(TMSG0_X, TMP_X ); + MSG_Y = _mm_add_epi32(TMSG0_Y, TMP_X ); + STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); + STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); + TMP_X = _mm_alignr_epi8(TMSG0_X, TMSG3_X, 4); + TMP_Y = _mm_alignr_epi8(TMSG0_Y, TMSG3_Y, 4); + TMSG1_X = _mm_add_epi32(TMSG1_X, TMP_X); + TMSG1_Y = _mm_add_epi32(TMSG1_Y, TMP_Y); + TMSG1_X = _mm_sha256msg2_epu32(TMSG1_X, TMSG0_X); + TMSG1_Y = _mm_sha256msg2_epu32(TMSG1_Y, TMSG0_Y); + MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); + MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); + STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); + STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); + TMSG3_X = _mm_sha256msg1_epu32(TMSG3_X, TMSG0_X); + TMSG3_Y = _mm_sha256msg1_epu32(TMSG3_Y, TMSG0_Y); + + // Rounds 20-23 + TMP_X = _mm_set_epi64x(0x76F988DA5CB0A9DCULL, 0x4A7484AA2DE92C6FULL); + MSG_X = _mm_add_epi32(TMSG1_X, TMP_X ); + MSG_Y = _mm_add_epi32(TMSG1_Y, TMP_X ); + STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); + STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); + TMP_X = _mm_alignr_epi8(TMSG1_X, TMSG0_X, 4); + TMP_Y = _mm_alignr_epi8(TMSG1_Y, TMSG0_Y, 4); + TMSG2_X = _mm_add_epi32(TMSG2_X, TMP_X); + TMSG2_Y = _mm_add_epi32(TMSG2_Y, TMP_Y); + TMSG2_X = _mm_sha256msg2_epu32(TMSG2_X, TMSG1_X); + TMSG2_Y = _mm_sha256msg2_epu32(TMSG2_Y, TMSG1_Y); + MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); + MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); + STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); + STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); + TMSG0_X = _mm_sha256msg1_epu32(TMSG0_X, TMSG1_X); + TMSG0_Y = _mm_sha256msg1_epu32(TMSG0_Y, TMSG1_Y); + + // Rounds 24-27 + TMP_X = _mm_set_epi64x(0xBF597FC7B00327C8ULL, 0xA831C66D983E5152ULL); + MSG_X = _mm_add_epi32(TMSG2_X, TMP_X ); + MSG_Y = _mm_add_epi32(TMSG2_Y, TMP_X ); + STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); + STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); + TMP_X = _mm_alignr_epi8(TMSG2_X, TMSG1_X, 4); + TMP_Y = _mm_alignr_epi8(TMSG2_Y, TMSG1_Y, 4); + TMSG3_X = _mm_add_epi32(TMSG3_X, TMP_X); + TMSG3_Y = _mm_add_epi32(TMSG3_Y, TMP_Y); + TMSG3_X = _mm_sha256msg2_epu32(TMSG3_X, TMSG2_X); + TMSG3_Y = _mm_sha256msg2_epu32(TMSG3_Y, TMSG2_Y); + MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); + MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); + STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); + STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); + TMSG1_X = _mm_sha256msg1_epu32(TMSG1_X, TMSG2_X); + TMSG1_Y = _mm_sha256msg1_epu32(TMSG1_Y, TMSG2_Y); + + // Rounds 28-31 + TMP_X = 
_mm_set_epi64x(0x1429296706CA6351ULL, 0xD5A79147C6E00BF3ULL); + MSG_X = _mm_add_epi32(TMSG3_X, TMP_X ); + MSG_Y = _mm_add_epi32(TMSG3_Y, TMP_X ); + STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); + STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); + TMP_X = _mm_alignr_epi8(TMSG3_X, TMSG2_X, 4); + TMP_Y = _mm_alignr_epi8(TMSG3_Y, TMSG2_Y, 4); + TMSG0_X = _mm_add_epi32(TMSG0_X, TMP_X); + TMSG0_Y = _mm_add_epi32(TMSG0_Y, TMP_Y); + TMSG0_X = _mm_sha256msg2_epu32(TMSG0_X, TMSG3_X); + TMSG0_Y = _mm_sha256msg2_epu32(TMSG0_Y, TMSG3_Y); + MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); + MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); + STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); + STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); + TMSG2_X = _mm_sha256msg1_epu32(TMSG2_X, TMSG3_X); + TMSG2_Y = _mm_sha256msg1_epu32(TMSG2_Y, TMSG3_Y); + + // Rounds 32-35 + TMP_X = _mm_set_epi64x(0x53380D134D2C6DFCULL, 0x2E1B213827B70A85ULL); + MSG_X = _mm_add_epi32(TMSG0_X, TMP_X ); + MSG_Y = _mm_add_epi32(TMSG0_Y, TMP_X ); + STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); + STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); + TMP_X = _mm_alignr_epi8(TMSG0_X, TMSG3_X, 4); + TMP_Y = _mm_alignr_epi8(TMSG0_Y, TMSG3_Y, 4); + TMSG1_X = _mm_add_epi32(TMSG1_X, TMP_X); + TMSG1_Y = _mm_add_epi32(TMSG1_Y, TMP_Y); + TMSG1_X = _mm_sha256msg2_epu32(TMSG1_X, TMSG0_X); + TMSG1_Y = _mm_sha256msg2_epu32(TMSG1_Y, TMSG0_Y); + MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); + MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); + STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); + STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); + TMSG3_X = _mm_sha256msg1_epu32(TMSG3_X, TMSG0_X); + TMSG3_Y = _mm_sha256msg1_epu32(TMSG3_Y, TMSG0_Y); + + // Rounds 36-39 + TMP_X = _mm_set_epi64x(0x92722C8581C2C92EULL, 0x766A0ABB650A7354ULL); + MSG_X = _mm_add_epi32(TMSG1_X, TMP_X); + MSG_Y = _mm_add_epi32(TMSG1_Y, TMP_X); + STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); + STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); + TMP_X = _mm_alignr_epi8(TMSG1_X, TMSG0_X, 4); + TMP_Y = _mm_alignr_epi8(TMSG1_Y, TMSG0_Y, 4); + TMSG2_X = _mm_add_epi32(TMSG2_X, TMP_X); + TMSG2_Y = _mm_add_epi32(TMSG2_Y, TMP_Y); + TMSG2_X = _mm_sha256msg2_epu32(TMSG2_X, TMSG1_X); + TMSG2_Y = _mm_sha256msg2_epu32(TMSG2_Y, TMSG1_Y); + MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); + MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); + STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); + STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); + TMSG0_X = _mm_sha256msg1_epu32(TMSG0_X, TMSG1_X); + TMSG0_Y = _mm_sha256msg1_epu32(TMSG0_Y, TMSG1_Y); + + // Rounds 40-43 + TMP_X = _mm_set_epi64x(0xC76C51A3C24B8B70ULL, 0xA81A664BA2BFE8A1ULL); + MSG_X = _mm_add_epi32(TMSG2_X, TMP_X); + MSG_Y = _mm_add_epi32(TMSG2_Y, TMP_X); + STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); + STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); + TMP_X = _mm_alignr_epi8(TMSG2_X, TMSG1_X, 4); + TMP_Y = _mm_alignr_epi8(TMSG2_Y, TMSG1_Y, 4); + TMSG3_X = _mm_add_epi32(TMSG3_X, TMP_X); + TMSG3_Y = _mm_add_epi32(TMSG3_Y, TMP_Y); + TMSG3_X = _mm_sha256msg2_epu32(TMSG3_X, TMSG2_X); + TMSG3_Y = _mm_sha256msg2_epu32(TMSG3_Y, TMSG2_Y); + MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); + MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); + STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); + STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); + TMSG1_X = _mm_sha256msg1_epu32(TMSG1_X, TMSG2_X); + TMSG1_Y = 
_mm_sha256msg1_epu32(TMSG1_Y, TMSG2_Y); + + // Rounds 44-47 + TMP_X = _mm_set_epi64x(0x106AA070F40E3585ULL, 0xD6990624D192E819ULL); + MSG_X = _mm_add_epi32(TMSG3_X, TMP_X); + MSG_Y = _mm_add_epi32(TMSG3_Y, TMP_X); + STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); + STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); + TMP_X = _mm_alignr_epi8(TMSG3_X, TMSG2_X, 4); + TMP_Y = _mm_alignr_epi8(TMSG3_Y, TMSG2_Y, 4); + TMSG0_X = _mm_add_epi32(TMSG0_X, TMP_X); + TMSG0_Y = _mm_add_epi32(TMSG0_Y, TMP_Y); + TMSG0_X = _mm_sha256msg2_epu32(TMSG0_X, TMSG3_X); + TMSG0_Y = _mm_sha256msg2_epu32(TMSG0_Y, TMSG3_Y); + MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); + MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); + STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); + STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); + TMSG2_X = _mm_sha256msg1_epu32(TMSG2_X, TMSG3_X); + TMSG2_Y = _mm_sha256msg1_epu32(TMSG2_Y, TMSG3_Y); + + // Rounds 48-51 + TMP_X = _mm_set_epi64x(0x34B0BCB52748774CULL, 0x1E376C0819A4C116ULL); + MSG_X = _mm_add_epi32(TMSG0_X, TMP_X ); + MSG_Y = _mm_add_epi32(TMSG0_Y, TMP_X ); + STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); + STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); + TMP_X = _mm_alignr_epi8(TMSG0_X, TMSG3_X, 4); + TMP_Y = _mm_alignr_epi8(TMSG0_Y, TMSG3_Y, 4); + TMSG1_X = _mm_add_epi32(TMSG1_X, TMP_X); + TMSG1_Y = _mm_add_epi32(TMSG1_Y, TMP_Y); + TMSG1_X = _mm_sha256msg2_epu32(TMSG1_X, TMSG0_X); + TMSG1_Y = _mm_sha256msg2_epu32(TMSG1_Y, TMSG0_Y); + MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); + MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); + STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); + STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); + TMSG3_X = _mm_sha256msg1_epu32(TMSG3_X, TMSG0_X); + TMSG3_Y = _mm_sha256msg1_epu32(TMSG3_Y, TMSG0_Y); + + // Rounds 52-55 + TMP_X = _mm_set_epi64x(0x682E6FF35B9CCA4FULL, 0x4ED8AA4A391C0CB3ULL); + MSG_X = _mm_add_epi32(TMSG1_X, TMP_X ); + MSG_Y = _mm_add_epi32(TMSG1_Y, TMP_X ); + STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); + STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); + TMP_X = _mm_alignr_epi8(TMSG1_X, TMSG0_X, 4); + TMP_Y = _mm_alignr_epi8(TMSG1_Y, TMSG0_Y, 4); + TMSG2_X = _mm_add_epi32(TMSG2_X, TMP_X); + TMSG2_Y = _mm_add_epi32(TMSG2_Y, TMP_Y); + TMSG2_X = _mm_sha256msg2_epu32(TMSG2_X, TMSG1_X); + TMSG2_Y = _mm_sha256msg2_epu32(TMSG2_Y, TMSG1_Y); + MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); + MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); + STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); + STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); + + // Rounds 56-59 + TMP_X = _mm_set_epi64x(0x8CC7020884C87814ULL, 0x78A5636F748F82EEULL); + MSG_X = _mm_add_epi32(TMSG2_X, TMP_X); + MSG_Y = _mm_add_epi32(TMSG2_Y, TMP_X); + STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); + STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); + TMP_X = _mm_alignr_epi8(TMSG2_X, TMSG1_X, 4); + TMP_Y = _mm_alignr_epi8(TMSG2_Y, TMSG1_Y, 4); + TMSG3_X = _mm_add_epi32(TMSG3_X, TMP_X); + TMSG3_Y = _mm_add_epi32(TMSG3_Y, TMP_Y); + TMSG3_X = _mm_sha256msg2_epu32(TMSG3_X, TMSG2_X); + TMSG3_Y = _mm_sha256msg2_epu32(TMSG3_Y, TMSG2_Y); + MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); + MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); + STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); + STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); + + // Rounds 60-63 + TMP_X = _mm_set_epi64x(0xC67178F2BEF9A3F7ULL, 0xA4506CEB90BEFFFAULL); + MSG_X = 
_mm_add_epi32(TMSG3_X, TMP_X); + MSG_Y = _mm_add_epi32(TMSG3_Y, TMP_X); + STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); + STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); + MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); + MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); + STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); + STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); + + // Add values back to state + STATE0_X = _mm_add_epi32(STATE0_X, ABEF_SAVE_X); + STATE1_X = _mm_add_epi32(STATE1_X, CDGH_SAVE_X); + STATE0_Y = _mm_add_epi32(STATE0_Y, ABEF_SAVE_Y); + STATE1_Y = _mm_add_epi32(STATE1_Y, CDGH_SAVE_Y); + + TMP_X = _mm_shuffle_epi32(STATE0_X, 0x1B); // FEBA + TMP_Y = _mm_shuffle_epi32(STATE0_Y, 0x1B); // FEBA + STATE1_X = _mm_shuffle_epi32(STATE1_X, 0xB1); // DCHG + STATE1_Y = _mm_shuffle_epi32(STATE1_Y, 0xB1); // DCHG + STATE0_X = _mm_blend_epi16(TMP_X, STATE1_X, 0xF0); // DCBA + STATE0_Y = _mm_blend_epi16(TMP_Y, STATE1_Y, 0xF0); // DCBA + STATE1_X = _mm_alignr_epi8(STATE1_X, TMP_X, 8); // ABEF + STATE1_Y = _mm_alignr_epi8(STATE1_Y, TMP_Y, 8); // ABEF + + // Save state + _mm_store_si128((__m128i*) &out_X[0], STATE0_X); + _mm_store_si128((__m128i*) &out_X[4], STATE1_X); + _mm_store_si128((__m128i*) &out_Y[0], STATE0_Y); + _mm_store_si128((__m128i*) &out_Y[4], STATE1_Y); +} + +#endif diff --git a/algo/sha/sha256-hash-4way.c b/algo/sha/sha256-hash-4way.c index a1f657e1..c5f60481 100644 --- a/algo/sha/sha256-hash-4way.c +++ b/algo/sha/sha256-hash-4way.c @@ -79,10 +79,15 @@ static const uint32_t K256[64] = _mm_or_si128( _mm_and_si128( X, Y ), \ _mm_and_si128( _mm_or_si128( X, Y ), Z ) ) */ - +/* #define MAJs(X, Y, Z) \ _mm_xor_si128( Y, _mm_and_si128( _mm_xor_si128( X, Y ), \ _mm_xor_si128( Y, Z ) ) ) +*/ + +#define MAJs(X, Y, Z) \ + _mm_xor_si128( Y, _mm_and_si128( X_xor_Y = _mm_xor_si128( X, Y ), \ + Y_xor_Z ) ) #define BSG2_0(x) \ _mm_xor_si128( _mm_xor_si128( \ @@ -100,6 +105,7 @@ static const uint32_t K256[64] = _mm_xor_si128( _mm_xor_si128( \ mm128_ror_32(x, 17), mm128_ror_32(x, 19) ), _mm_srli_epi32(x, 10) ) +/* #define SHA2s_4WAY_STEP(A, B, C, D, E, F, G, H, i, j) \ do { \ __m128i K = _mm_set1_epi32( K256[( (j)+(i) )] ); \ @@ -128,9 +134,9 @@ do { \ H = _mm_add_epi32( T1, T2 ); \ D = _mm_add_epi32( D, T1 ); \ } while (0) +*/ -/* #define SHA2s_4WAY_STEP(A, B, C, D, E, F, G, H, i, j) \ do { \ __m128i T1, T2; \ @@ -138,16 +144,98 @@ do { \ T1 = _mm_add_epi32( H, mm128_add4_32( BSG2_1(E), CHs(E, F, G), \ K, W[i] ) ); \ T2 = _mm_add_epi32( BSG2_0(A), MAJs(A, B, C) ); \ + Y_xor_Z = X_xor_Y; \ D = _mm_add_epi32( D, T1 ); \ H = _mm_add_epi32( T1, T2 ); \ } while (0) -*/ +void sha256_4way_transform( __m128i *state_out, const __m128i *data, + const __m128i *state_in ) +{ + __m128i A, B, C, D, E, F, G, H, X_xor_Y, Y_xor_Z; + __m128i W[16]; + + memcpy_128( W, data, 16 ); + + A = state_in[0]; + B = state_in[1]; + C = state_in[2]; + D = state_in[3]; + E = state_in[4]; + F = state_in[5]; + G = state_in[6]; + H = state_in[7]; + Y_xor_Z = _mm_xor_si128( B, C ); + + SHA2s_4WAY_STEP( A, B, C, D, E, F, G, H, 0, 0 ); + SHA2s_4WAY_STEP( H, A, B, C, D, E, F, G, 1, 0 ); + SHA2s_4WAY_STEP( G, H, A, B, C, D, E, F, 2, 0 ); + SHA2s_4WAY_STEP( F, G, H, A, B, C, D, E, 3, 0 ); + SHA2s_4WAY_STEP( E, F, G, H, A, B, C, D, 4, 0 ); + SHA2s_4WAY_STEP( D, E, F, G, H, A, B, C, 5, 0 ); + SHA2s_4WAY_STEP( C, D, E, F, G, H, A, B, 6, 0 ); + SHA2s_4WAY_STEP( B, C, D, E, F, G, H, A, 7, 0 ); + SHA2s_4WAY_STEP( A, B, C, D, E, F, G, H, 8, 0 ); + SHA2s_4WAY_STEP( H, A, B, C, D, E, F, G, 9, 0 ); + 
SHA2s_4WAY_STEP( G, H, A, B, C, D, E, F, 10, 0 ); + SHA2s_4WAY_STEP( F, G, H, A, B, C, D, E, 11, 0 ); + SHA2s_4WAY_STEP( E, F, G, H, A, B, C, D, 12, 0 ); + SHA2s_4WAY_STEP( D, E, F, G, H, A, B, C, 13, 0 ); + SHA2s_4WAY_STEP( C, D, E, F, G, H, A, B, 14, 0 ); + SHA2s_4WAY_STEP( B, C, D, E, F, G, H, A, 15, 0 ); + + for ( int j = 16; j < 64; j += 16 ) + { + W[ 0] = SHA2s_MEXP( 14, 9, 1, 0 ); + W[ 1] = SHA2s_MEXP( 15, 10, 2, 1 ); + W[ 2] = SHA2s_MEXP( 0, 11, 3, 2 ); + W[ 3] = SHA2s_MEXP( 1, 12, 4, 3 ); + W[ 4] = SHA2s_MEXP( 2, 13, 5, 4 ); + W[ 5] = SHA2s_MEXP( 3, 14, 6, 5 ); + W[ 6] = SHA2s_MEXP( 4, 15, 7, 6 ); + W[ 7] = SHA2s_MEXP( 5, 0, 8, 7 ); + W[ 8] = SHA2s_MEXP( 6, 1, 9, 8 ); + W[ 9] = SHA2s_MEXP( 7, 2, 10, 9 ); + W[10] = SHA2s_MEXP( 8, 3, 11, 10 ); + W[11] = SHA2s_MEXP( 9, 4, 12, 11 ); + W[12] = SHA2s_MEXP( 10, 5, 13, 12 ); + W[13] = SHA2s_MEXP( 11, 6, 14, 13 ); + W[14] = SHA2s_MEXP( 12, 7, 15, 14 ); + W[15] = SHA2s_MEXP( 13, 8, 0, 15 ); + + SHA2s_4WAY_STEP( A, B, C, D, E, F, G, H, 0, j ); + SHA2s_4WAY_STEP( H, A, B, C, D, E, F, G, 1, j ); + SHA2s_4WAY_STEP( G, H, A, B, C, D, E, F, 2, j ); + SHA2s_4WAY_STEP( F, G, H, A, B, C, D, E, 3, j ); + SHA2s_4WAY_STEP( E, F, G, H, A, B, C, D, 4, j ); + SHA2s_4WAY_STEP( D, E, F, G, H, A, B, C, 5, j ); + SHA2s_4WAY_STEP( C, D, E, F, G, H, A, B, 6, j ); + SHA2s_4WAY_STEP( B, C, D, E, F, G, H, A, 7, j ); + SHA2s_4WAY_STEP( A, B, C, D, E, F, G, H, 8, j ); + SHA2s_4WAY_STEP( H, A, B, C, D, E, F, G, 9, j ); + SHA2s_4WAY_STEP( G, H, A, B, C, D, E, F, 10, j ); + SHA2s_4WAY_STEP( F, G, H, A, B, C, D, E, 11, j ); + SHA2s_4WAY_STEP( E, F, G, H, A, B, C, D, 12, j ); + SHA2s_4WAY_STEP( D, E, F, G, H, A, B, C, 13, j ); + SHA2s_4WAY_STEP( C, D, E, F, G, H, A, B, 14, j ); + SHA2s_4WAY_STEP( B, C, D, E, F, G, H, A, 15, j ); + } + + state_out[0] = _mm_add_epi32( state_in[0], A ); + state_out[1] = _mm_add_epi32( state_in[1], B ); + state_out[2] = _mm_add_epi32( state_in[2], C ); + state_out[3] = _mm_add_epi32( state_in[3], D ); + state_out[4] = _mm_add_epi32( state_in[4], E ); + state_out[5] = _mm_add_epi32( state_in[5], F ); + state_out[6] = _mm_add_epi32( state_in[6], G ); + state_out[7] = _mm_add_epi32( state_in[7], H ); +} + static void sha256_4way_round( sha256_4way_context *ctx, __m128i *in, __m128i r[8] ) { - register __m128i A, B, C, D, E, F, G, H; + register __m128i A, B, C, D, E, F, G, H, X_xor_Y, Y_xor_Z; __m128i W[16]; mm128_block_bswap_32( W, in ); @@ -176,6 +264,8 @@ sha256_4way_round( sha256_4way_context *ctx, __m128i *in, __m128i r[8] ) H = m128_const1_64( 0x5BE0CD195BE0CD19 ); } + Y_xor_Z = _mm_xor_si128( B, C ); + SHA2s_4WAY_STEP( A, B, C, D, E, F, G, H, 0, 0 ); SHA2s_4WAY_STEP( H, A, B, C, D, E, F, G, 1, 0 ); SHA2s_4WAY_STEP( G, H, A, B, C, D, E, F, 2, 0 ); @@ -327,10 +417,8 @@ void sha256_4way_close( sha256_4way_context *sc, void *dst ) high = (sc->count_high << 3) | (low >> 29); low = low << 3; - sc->buf[ pad >> 2 ] = - mm128_bswap_32( m128_const1_32( high ) ); - sc->buf[ ( pad+4 ) >> 2 ] = - mm128_bswap_32( m128_const1_32( low ) ); + sc->buf[ pad >> 2 ] = m128_const1_32( bswap_32( high ) ); + sc->buf[( pad+4 ) >> 2 ] = m128_const1_32( bswap_32( low ) ); sha256_4way_round( sc, sc->buf, sc->val ); mm128_block_bswap_32( dst, sc->val ); @@ -348,23 +436,39 @@ void sha256_4way_full( void *dst, const void *data, size_t len ) // SHA-256 8 way +#if defined(__AVX512VL__) + #define CHx(X, Y, Z) \ - _mm256_xor_si256( _mm256_and_si256( _mm256_xor_si256( Y, Z ), X ), Z ) + _mm256_ternarylogic_epi32( X, Y, Z, 0xca ) -/* #define MAJx(X, Y, Z) \ - 
_mm256_or_si256( _mm256_and_si256( X, Y ), \ - _mm256_and_si256( _mm256_or_si256( X, Y ), Z ) ) -*/ -/* + _mm256_ternarylogic_epi32( X, Y, Z, 0xe8 ) + +#define BSG2_0x(x) \ + mm256_xor3( mm256_ror_32(x, 2), mm256_ror_32(x, 13), mm256_ror_32(x, 22) ) + +#define BSG2_1x(x) \ + mm256_xor3( mm256_ror_32(x, 6), mm256_ror_32(x, 11), mm256_ror_32(x, 25) ) + +#define SSG2_0x(x) \ + mm256_xor3( mm256_ror_32(x, 7), mm256_ror_32(x, 18), _mm256_srli_epi32(x, 3) ) + +#define SSG2_1x(x) \ + mm256_xor3( mm256_ror_32(x, 17), mm256_ror_32(x, 19), _mm256_srli_epi32(x, 10) ) + +#else // AVX2 + +#define CHx(X, Y, Z) \ + _mm256_xor_si256( _mm256_and_si256( _mm256_xor_si256( Y, Z ), X ), Z ) + #define MAJx(X, Y, Z) \ _mm256_xor_si256( Y, _mm256_and_si256( _mm256_xor_si256( X, Y ), \ _mm256_xor_si256( Y, Z ) ) ) -*/ - +/* #define MAJx(X, Y, Z) \ _mm256_xor_si256( Y, _mm256_and_si256( X_xor_Y = _mm256_xor_si256( X, Y ), \ Y_xor_Z ) ) +*/ #define BSG2_0x(x) \ _mm256_xor_si256( _mm256_xor_si256( \ @@ -382,6 +486,8 @@ void sha256_4way_full( void *dst, const void *data, size_t len ) _mm256_xor_si256( _mm256_xor_si256( \ mm256_ror_32(x, 17), mm256_ror_32(x, 19) ), _mm256_srli_epi32(x, 10) ) +#endif // AVX512 else AVX2 + #define SHA2x_MEXP( a, b, c, d ) \ mm256_add4_32( SSG2_1x( W[a] ), W[b], SSG2_0x( W[c] ), W[d] ); @@ -392,15 +498,95 @@ do { \ T1 = _mm256_add_epi32( H, mm256_add4_32( BSG2_1x(E), CHx(E, F, G), \ K, W[i] ) ); \ T2 = _mm256_add_epi32( BSG2_0x(A), MAJx(A, B, C) ); \ - Y_xor_Z = X_xor_Y; \ D = _mm256_add_epi32( D, T1 ); \ H = _mm256_add_epi32( T1, T2 ); \ } while (0) +void sha256_8way_transform( __m256i *state_out, const __m256i *data, + const __m256i *state_in ) +{ + __m256i A, B, C, D, E, F, G, H; + __m256i W[16]; + + memcpy_256( W, data, 16 ); + + A = state_in[0]; + B = state_in[1]; + C = state_in[2]; + D = state_in[3]; + E = state_in[4]; + F = state_in[5]; + G = state_in[6]; + H = state_in[7]; + + SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H, 0, 0 ); + SHA2s_8WAY_STEP( H, A, B, C, D, E, F, G, 1, 0 ); + SHA2s_8WAY_STEP( G, H, A, B, C, D, E, F, 2, 0 ); + SHA2s_8WAY_STEP( F, G, H, A, B, C, D, E, 3, 0 ); + SHA2s_8WAY_STEP( E, F, G, H, A, B, C, D, 4, 0 ); + SHA2s_8WAY_STEP( D, E, F, G, H, A, B, C, 5, 0 ); + SHA2s_8WAY_STEP( C, D, E, F, G, H, A, B, 6, 0 ); + SHA2s_8WAY_STEP( B, C, D, E, F, G, H, A, 7, 0 ); + SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H, 8, 0 ); + SHA2s_8WAY_STEP( H, A, B, C, D, E, F, G, 9, 0 ); + SHA2s_8WAY_STEP( G, H, A, B, C, D, E, F, 10, 0 ); + SHA2s_8WAY_STEP( F, G, H, A, B, C, D, E, 11, 0 ); + SHA2s_8WAY_STEP( E, F, G, H, A, B, C, D, 12, 0 ); + SHA2s_8WAY_STEP( D, E, F, G, H, A, B, C, 13, 0 ); + SHA2s_8WAY_STEP( C, D, E, F, G, H, A, B, 14, 0 ); + SHA2s_8WAY_STEP( B, C, D, E, F, G, H, A, 15, 0 ); + + for ( int j = 16; j < 64; j += 16 ) + { + W[ 0] = SHA2x_MEXP( 14, 9, 1, 0 ); + W[ 1] = SHA2x_MEXP( 15, 10, 2, 1 ); + W[ 2] = SHA2x_MEXP( 0, 11, 3, 2 ); + W[ 3] = SHA2x_MEXP( 1, 12, 4, 3 ); + W[ 4] = SHA2x_MEXP( 2, 13, 5, 4 ); + W[ 5] = SHA2x_MEXP( 3, 14, 6, 5 ); + W[ 6] = SHA2x_MEXP( 4, 15, 7, 6 ); + W[ 7] = SHA2x_MEXP( 5, 0, 8, 7 ); + W[ 8] = SHA2x_MEXP( 6, 1, 9, 8 ); + W[ 9] = SHA2x_MEXP( 7, 2, 10, 9 ); + W[10] = SHA2x_MEXP( 8, 3, 11, 10 ); + W[11] = SHA2x_MEXP( 9, 4, 12, 11 ); + W[12] = SHA2x_MEXP( 10, 5, 13, 12 ); + W[13] = SHA2x_MEXP( 11, 6, 14, 13 ); + W[14] = SHA2x_MEXP( 12, 7, 15, 14 ); + W[15] = SHA2x_MEXP( 13, 8, 0, 15 ); + + SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H, 0, j ); + SHA2s_8WAY_STEP( H, A, B, C, D, E, F, G, 1, j ); + SHA2s_8WAY_STEP( G, H, A, B, C, D, E, F, 2, j ); + 
SHA2s_8WAY_STEP( F, G, H, A, B, C, D, E, 3, j ); + SHA2s_8WAY_STEP( E, F, G, H, A, B, C, D, 4, j ); + SHA2s_8WAY_STEP( D, E, F, G, H, A, B, C, 5, j ); + SHA2s_8WAY_STEP( C, D, E, F, G, H, A, B, 6, j ); + SHA2s_8WAY_STEP( B, C, D, E, F, G, H, A, 7, j ); + SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H, 8, j ); + SHA2s_8WAY_STEP( H, A, B, C, D, E, F, G, 9, j ); + SHA2s_8WAY_STEP( G, H, A, B, C, D, E, F, 10, j ); + SHA2s_8WAY_STEP( F, G, H, A, B, C, D, E, 11, j ); + SHA2s_8WAY_STEP( E, F, G, H, A, B, C, D, 12, j ); + SHA2s_8WAY_STEP( D, E, F, G, H, A, B, C, 13, j ); + SHA2s_8WAY_STEP( C, D, E, F, G, H, A, B, 14, j ); + SHA2s_8WAY_STEP( B, C, D, E, F, G, H, A, 15, j ); + } + + state_out[0] = _mm256_add_epi32( state_in[0], A ); + state_out[1] = _mm256_add_epi32( state_in[1], B ); + state_out[2] = _mm256_add_epi32( state_in[2], C ); + state_out[3] = _mm256_add_epi32( state_in[3], D ); + state_out[4] = _mm256_add_epi32( state_in[4], E ); + state_out[5] = _mm256_add_epi32( state_in[5], F ); + state_out[6] = _mm256_add_epi32( state_in[6], G ); + state_out[7] = _mm256_add_epi32( state_in[7], H ); +} + static void -sha256_8way_round( sha256_8way_context *ctx, __m256i *in, __m256i r[8] ) +sha256_8way_round( sha256_8way_context *ctx, __m256i *in, __m256i r[8] ) { - register __m256i A, B, C, D, E, F, G, H, X_xor_Y, Y_xor_Z; + register __m256i A, B, C, D, E, F, G, H; __m256i W[16]; mm256_block_bswap_32( W , in ); @@ -429,8 +615,6 @@ sha256_8way_round( sha256_8way_context *ctx, __m256i *in, __m256i r[8] ) H = m256_const1_64( 0x5BE0CD195BE0CD19 ); } - Y_xor_Z = _mm256_xor_si256( B, C ); - SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H, 0, 0 ); SHA2s_8WAY_STEP( H, A, B, C, D, E, F, G, 1, 0 ); SHA2s_8WAY_STEP( G, H, A, B, C, D, E, F, 2, 0 ); @@ -586,10 +770,8 @@ void sha256_8way_close( sha256_8way_context *sc, void *dst ) high = (sc->count_high << 3) | (low >> 29); low = low << 3; - sc->buf[ pad >> 2 ] = - mm256_bswap_32( m256_const1_32( high ) ); - sc->buf[ ( pad+4 ) >> 2 ] = - mm256_bswap_32( m256_const1_32( low ) ); + sc->buf[ pad >> 2 ] = m256_const1_32( bswap_32( high ) ); + sc->buf[ ( pad+4 ) >> 2 ] = m256_const1_32( bswap_32( low ) ); sha256_8way_round( sc, sc->buf, sc->val ); @@ -609,38 +791,22 @@ void sha256_8way_full( void *dst, const void *data, size_t len ) // SHA-256 16 way #define CHx16(X, Y, Z) \ - _mm512_xor_si512( _mm512_and_si512( _mm512_xor_si512( Y, Z ), X ), Z ) + _mm512_ternarylogic_epi32( X, Y, Z, 0xca ) -/* -#define MAJx16(X, Y, Z) \ - _mm512_or_si512( _mm512_and_si512( X, Y ), \ - _mm512_and_si512( _mm512_or_si512( X, Y ), Z ) ) -*/ -/* #define MAJx16(X, Y, Z) \ - _mm512_xor_si512( Y, _mm512_and_si512( _mm512_xor_si512( X, Y ), \ - _mm512_xor_si512( Y, Z ) ) ) -*/ - -#define MAJx16(X, Y, Z) \ - _mm512_xor_si512( Y, _mm512_and_si512( X_xor_Y = _mm512_xor_si512( X, Y ), \ - Y_xor_Z ) ) + _mm512_ternarylogic_epi32( X, Y, Z, 0xe8 ) #define BSG2_0x16(x) \ - _mm512_xor_si512( _mm512_xor_si512( \ - mm512_ror_32(x, 2), mm512_ror_32(x, 13) ), mm512_ror_32( x, 22) ) + mm512_xor3( mm512_ror_32(x, 2), mm512_ror_32(x, 13), mm512_ror_32(x, 22) ) #define BSG2_1x16(x) \ - _mm512_xor_si512( _mm512_xor_si512( \ - mm512_ror_32(x, 6), mm512_ror_32(x, 11) ), mm512_ror_32( x, 25) ) + mm512_xor3( mm512_ror_32(x, 6), mm512_ror_32(x, 11), mm512_ror_32(x, 25) ) #define SSG2_0x16(x) \ - _mm512_xor_si512( _mm512_xor_si512( \ - mm512_ror_32(x, 7), mm512_ror_32(x, 18) ), _mm512_srli_epi32(x, 3) ) + mm512_xor3( mm512_ror_32(x, 7), mm512_ror_32(x, 18), _mm512_srli_epi32(x, 3) ) #define SSG2_1x16(x) \ - _mm512_xor_si512( 
_mm512_xor_si512( \ - mm512_ror_32(x, 17), mm512_ror_32(x, 19) ), _mm512_srli_epi32(x, 10) ) + mm512_xor3( mm512_ror_32(x, 17), mm512_ror_32(x, 19), _mm512_srli_epi32(x, 10) ) #define SHA2x16_MEXP( a, b, c, d ) \ mm512_add4_32( SSG2_1x16( W[a] ), W[b], SSG2_0x16( W[c] ), W[d] ); @@ -652,15 +818,220 @@ do { \ T1 = _mm512_add_epi32( H, mm512_add4_32( BSG2_1x16(E), CHx16(E, F, G), \ K, W[i] ) ); \ T2 = _mm512_add_epi32( BSG2_0x16(A), MAJx16(A, B, C) ); \ - Y_xor_Z = X_xor_Y; \ D = _mm512_add_epi32( D, T1 ); \ H = _mm512_add_epi32( T1, T2 ); \ } while (0) +// Tranform one 16 lane by 64 byte message block and update state. +// Calling function is responsible for initializing the state, setting +// correct byte order, counting bits and padding of the final block. +// It's faster for multiple rounds of sha256 (sha256d/t/q) by eliminating +// redundant byte swapping. +// +void sha256_16way_transform( __m512i *state_out, const __m512i *data, + const __m512i *state_in ) +{ + __m512i A, B, C, D, E, F, G, H; + __m512i W[16]; + + memcpy_512( W, data, 16 ); + + A = state_in[0]; + B = state_in[1]; + C = state_in[2]; + D = state_in[3]; + E = state_in[4]; + F = state_in[5]; + G = state_in[6]; + H = state_in[7]; + + SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H, 0, 0 ); + SHA2s_16WAY_STEP( H, A, B, C, D, E, F, G, 1, 0 ); + SHA2s_16WAY_STEP( G, H, A, B, C, D, E, F, 2, 0 ); + SHA2s_16WAY_STEP( F, G, H, A, B, C, D, E, 3, 0 ); + SHA2s_16WAY_STEP( E, F, G, H, A, B, C, D, 4, 0 ); + SHA2s_16WAY_STEP( D, E, F, G, H, A, B, C, 5, 0 ); + SHA2s_16WAY_STEP( C, D, E, F, G, H, A, B, 6, 0 ); + SHA2s_16WAY_STEP( B, C, D, E, F, G, H, A, 7, 0 ); + SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H, 8, 0 ); + SHA2s_16WAY_STEP( H, A, B, C, D, E, F, G, 9, 0 ); + SHA2s_16WAY_STEP( G, H, A, B, C, D, E, F, 10, 0 ); + SHA2s_16WAY_STEP( F, G, H, A, B, C, D, E, 11, 0 ); + SHA2s_16WAY_STEP( E, F, G, H, A, B, C, D, 12, 0 ); + SHA2s_16WAY_STEP( D, E, F, G, H, A, B, C, 13, 0 ); + SHA2s_16WAY_STEP( C, D, E, F, G, H, A, B, 14, 0 ); + SHA2s_16WAY_STEP( B, C, D, E, F, G, H, A, 15, 0 ); + + for ( int j = 16; j < 64; j += 16 ) + { + W[ 0] = SHA2x16_MEXP( 14, 9, 1, 0 ); + W[ 1] = SHA2x16_MEXP( 15, 10, 2, 1 ); + W[ 2] = SHA2x16_MEXP( 0, 11, 3, 2 ); + W[ 3] = SHA2x16_MEXP( 1, 12, 4, 3 ); + W[ 4] = SHA2x16_MEXP( 2, 13, 5, 4 ); + W[ 5] = SHA2x16_MEXP( 3, 14, 6, 5 ); + W[ 6] = SHA2x16_MEXP( 4, 15, 7, 6 ); + W[ 7] = SHA2x16_MEXP( 5, 0, 8, 7 ); + W[ 8] = SHA2x16_MEXP( 6, 1, 9, 8 ); + W[ 9] = SHA2x16_MEXP( 7, 2, 10, 9 ); + W[10] = SHA2x16_MEXP( 8, 3, 11, 10 ); + W[11] = SHA2x16_MEXP( 9, 4, 12, 11 ); + W[12] = SHA2x16_MEXP( 10, 5, 13, 12 ); + W[13] = SHA2x16_MEXP( 11, 6, 14, 13 ); + W[14] = SHA2x16_MEXP( 12, 7, 15, 14 ); + W[15] = SHA2x16_MEXP( 13, 8, 0, 15 ); + + SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H, 0, j ); + SHA2s_16WAY_STEP( H, A, B, C, D, E, F, G, 1, j ); + SHA2s_16WAY_STEP( G, H, A, B, C, D, E, F, 2, j ); + SHA2s_16WAY_STEP( F, G, H, A, B, C, D, E, 3, j ); + SHA2s_16WAY_STEP( E, F, G, H, A, B, C, D, 4, j ); + SHA2s_16WAY_STEP( D, E, F, G, H, A, B, C, 5, j ); + SHA2s_16WAY_STEP( C, D, E, F, G, H, A, B, 6, j ); + SHA2s_16WAY_STEP( B, C, D, E, F, G, H, A, 7, j ); + SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H, 8, j ); + SHA2s_16WAY_STEP( H, A, B, C, D, E, F, G, 9, j ); + SHA2s_16WAY_STEP( G, H, A, B, C, D, E, F, 10, j ); + SHA2s_16WAY_STEP( F, G, H, A, B, C, D, E, 11, j ); + SHA2s_16WAY_STEP( E, F, G, H, A, B, C, D, 12, j ); + SHA2s_16WAY_STEP( D, E, F, G, H, A, B, C, 13, j ); + SHA2s_16WAY_STEP( C, D, E, F, G, H, A, B, 14, j ); + SHA2s_16WAY_STEP( B, C, D, E, F, G, 
H, A, 15, j ); + } + + state_out[0] = _mm512_add_epi32( state_in[0], A ); + state_out[1] = _mm512_add_epi32( state_in[1], B ); + state_out[2] = _mm512_add_epi32( state_in[2], C ); + state_out[3] = _mm512_add_epi32( state_in[3], D ); + state_out[4] = _mm512_add_epi32( state_in[4], E ); + state_out[5] = _mm512_add_epi32( state_in[5], F ); + state_out[6] = _mm512_add_epi32( state_in[6], G ); + state_out[7] = _mm512_add_epi32( state_in[7], H ); +} + +// Aggresive prehashing +void sha256_16way_prehash_3rounds( __m512i *state_mid, const __m512i *W, + const __m512i *state_in ) +{ + __m512i A, B, C, D, E, F, G, H; + + A = _mm512_load_si512( state_in ); + B = _mm512_load_si512( state_in + 1 ); + C = _mm512_load_si512( state_in + 2 ); + D = _mm512_load_si512( state_in + 3 ); + E = _mm512_load_si512( state_in + 4 ); + F = _mm512_load_si512( state_in + 5 ); + G = _mm512_load_si512( state_in + 6 ); + H = _mm512_load_si512( state_in + 7 ); + + SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H, 0, 0 ); + SHA2s_16WAY_STEP( H, A, B, C, D, E, F, G, 1, 0 ); + SHA2s_16WAY_STEP( G, H, A, B, C, D, E, F, 2, 0 ); + + _mm512_store_si512( state_mid , A ); + _mm512_store_si512( state_mid + 1, B ); + _mm512_store_si512( state_mid + 2, C ); + _mm512_store_si512( state_mid + 3, D ); + _mm512_store_si512( state_mid + 4, E ); + _mm512_store_si512( state_mid + 5, F ); + _mm512_store_si512( state_mid + 6, G ); + _mm512_store_si512( state_mid + 7, H ); +} + +void sha256_16way_final_rounds( __m512i *state_out, const __m512i *data, + const __m512i *state_in, const __m512i *state_mid ) +{ + __m512i A, B, C, D, E, F, G, H; + __m512i W[16]; + + memcpy_512( W, data, 16 ); + + A = _mm512_load_si512( state_mid ); + B = _mm512_load_si512( state_mid + 1 ); + C = _mm512_load_si512( state_mid + 2 ); + D = _mm512_load_si512( state_mid + 3 ); + E = _mm512_load_si512( state_mid + 4 ); + F = _mm512_load_si512( state_mid + 5 ); + G = _mm512_load_si512( state_mid + 6 ); + H = _mm512_load_si512( state_mid + 7 ); + +// SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H, 0, 0 ); +// SHA2s_16WAY_STEP( H, A, B, C, D, E, F, G, 1, 0 ); +// SHA2s_16WAY_STEP( G, H, A, B, C, D, E, F, 2, 0 ); + SHA2s_16WAY_STEP( F, G, H, A, B, C, D, E, 3, 0 ); + SHA2s_16WAY_STEP( E, F, G, H, A, B, C, D, 4, 0 ); + SHA2s_16WAY_STEP( D, E, F, G, H, A, B, C, 5, 0 ); + SHA2s_16WAY_STEP( C, D, E, F, G, H, A, B, 6, 0 ); + SHA2s_16WAY_STEP( B, C, D, E, F, G, H, A, 7, 0 ); + SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H, 8, 0 ); + SHA2s_16WAY_STEP( H, A, B, C, D, E, F, G, 9, 0 ); + SHA2s_16WAY_STEP( G, H, A, B, C, D, E, F, 10, 0 ); + SHA2s_16WAY_STEP( F, G, H, A, B, C, D, E, 11, 0 ); + SHA2s_16WAY_STEP( E, F, G, H, A, B, C, D, 12, 0 ); + SHA2s_16WAY_STEP( D, E, F, G, H, A, B, C, 13, 0 ); + SHA2s_16WAY_STEP( C, D, E, F, G, H, A, B, 14, 0 ); + SHA2s_16WAY_STEP( B, C, D, E, F, G, H, A, 15, 0 ); + + for ( int j = 16; j < 64; j += 16 ) + { + W[ 0] = SHA2x16_MEXP( 14, 9, 1, 0 ); + W[ 1] = SHA2x16_MEXP( 15, 10, 2, 1 ); + W[ 2] = SHA2x16_MEXP( 0, 11, 3, 2 ); + W[ 3] = SHA2x16_MEXP( 1, 12, 4, 3 ); + W[ 4] = SHA2x16_MEXP( 2, 13, 5, 4 ); + W[ 5] = SHA2x16_MEXP( 3, 14, 6, 5 ); + W[ 6] = SHA2x16_MEXP( 4, 15, 7, 6 ); + W[ 7] = SHA2x16_MEXP( 5, 0, 8, 7 ); + W[ 8] = SHA2x16_MEXP( 6, 1, 9, 8 ); + W[ 9] = SHA2x16_MEXP( 7, 2, 10, 9 ); + W[10] = SHA2x16_MEXP( 8, 3, 11, 10 ); + W[11] = SHA2x16_MEXP( 9, 4, 12, 11 ); + W[12] = SHA2x16_MEXP( 10, 5, 13, 12 ); + W[13] = SHA2x16_MEXP( 11, 6, 14, 13 ); + W[14] = SHA2x16_MEXP( 12, 7, 15, 14 ); + W[15] = SHA2x16_MEXP( 13, 8, 0, 15 ); + + SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H, 
0, j ); + SHA2s_16WAY_STEP( H, A, B, C, D, E, F, G, 1, j ); + SHA2s_16WAY_STEP( G, H, A, B, C, D, E, F, 2, j ); + SHA2s_16WAY_STEP( F, G, H, A, B, C, D, E, 3, j ); + SHA2s_16WAY_STEP( E, F, G, H, A, B, C, D, 4, j ); + SHA2s_16WAY_STEP( D, E, F, G, H, A, B, C, 5, j ); + SHA2s_16WAY_STEP( C, D, E, F, G, H, A, B, 6, j ); + SHA2s_16WAY_STEP( B, C, D, E, F, G, H, A, 7, j ); + SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H, 8, j ); + SHA2s_16WAY_STEP( H, A, B, C, D, E, F, G, 9, j ); + SHA2s_16WAY_STEP( G, H, A, B, C, D, E, F, 10, j ); + SHA2s_16WAY_STEP( F, G, H, A, B, C, D, E, 11, j ); + SHA2s_16WAY_STEP( E, F, G, H, A, B, C, D, 12, j ); + SHA2s_16WAY_STEP( D, E, F, G, H, A, B, C, 13, j ); + SHA2s_16WAY_STEP( C, D, E, F, G, H, A, B, 14, j ); + SHA2s_16WAY_STEP( B, C, D, E, F, G, H, A, 15, j ); + } + + A = _mm512_add_epi32( A, _mm512_load_si512( state_in ) ); + B = _mm512_add_epi32( B, _mm512_load_si512( state_in + 1 ) ); + C = _mm512_add_epi32( C, _mm512_load_si512( state_in + 2 ) ); + D = _mm512_add_epi32( D, _mm512_load_si512( state_in + 3 ) ); + E = _mm512_add_epi32( E, _mm512_load_si512( state_in + 4 ) ); + F = _mm512_add_epi32( F, _mm512_load_si512( state_in + 5 ) ); + G = _mm512_add_epi32( G, _mm512_load_si512( state_in + 6 ) ); + H = _mm512_add_epi32( H, _mm512_load_si512( state_in + 7 ) ); + + _mm512_store_si512( state_out , A ); + _mm512_store_si512( state_out + 1, B ); + _mm512_store_si512( state_out + 2, C ); + _mm512_store_si512( state_out + 3, D ); + _mm512_store_si512( state_out + 4, E ); + _mm512_store_si512( state_out + 5, F ); + _mm512_store_si512( state_out + 6, G ); + _mm512_store_si512( state_out + 7, H ); +} + static void sha256_16way_round( sha256_16way_context *ctx, __m512i *in, __m512i r[8] ) { - register __m512i A, B, C, D, E, F, G, H, X_xor_Y, Y_xor_Z; + register __m512i A, B, C, D, E, F, G, H; __m512i W[16]; mm512_block_bswap_32( W , in ); @@ -689,7 +1060,6 @@ sha256_16way_round( sha256_16way_context *ctx, __m512i *in, __m512i r[8] ) H = m512_const1_64( 0x5BE0CD195BE0CD19 ); } - Y_xor_Z = _mm512_xor_si512( B, C ); SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H, 0, 0 ); SHA2s_16WAY_STEP( H, A, B, C, D, E, F, G, 1, 0 ); @@ -834,10 +1204,8 @@ void sha256_16way_close( sha256_16way_context *sc, void *dst ) high = (sc->count_high << 3) | (low >> 29); low = low << 3; - sc->buf[ pad >> 2 ] = - mm512_bswap_32( m512_const1_32( high ) ); - sc->buf[ ( pad+4 ) >> 2 ] = - mm512_bswap_32( m512_const1_32( low ) ); + sc->buf[ pad >> 2 ] = m512_const1_32( bswap_32( high ) ); + sc->buf[ ( pad+4 ) >> 2 ] = m512_const1_32( bswap_32( low ) ); sha256_16way_round( sc, sc->buf, sc->val ); diff --git a/algo/sha/sha256-hash-opt.c b/algo/sha/sha256-hash-opt.c index fb049b1f..78bda652 100644 --- a/algo/sha/sha256-hash-opt.c +++ b/algo/sha/sha256-hash-opt.c @@ -3,23 +3,24 @@ /* Based on code from Intel, and by Sean Gulley for */ /* the miTLS project. */ -// A drop in replacement for the function of the same name in sph_sha2.c. +// A stripped down version with byte swapping removed. 
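+// Note: with the MASK / _mm_shuffle_epi8 byte swap of the message words
+// commented out, byte ordering is now the caller's responsibility.  The
+// sha256t scanhash code later in this patch feeds its block data in
+// unswapped and byte swaps only the final hash before target testing.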
#if defined(__SHA__) -#include "simd-utils.h" +#include "sha256-hash-opt.h" -static void sha2_round( const uint8_t input[], uint32_t state[8] ) +void sha256_opt_transform( uint32_t *state_out, const void *input, + const uint32_t *state_in ) { __m128i STATE0, STATE1; - __m128i MSG, TMP, MASK; + __m128i MSG, TMP; __m128i TMSG0, TMSG1, TMSG2, TMSG3; __m128i ABEF_SAVE, CDGH_SAVE; // Load initial values - TMP = _mm_load_si128((__m128i*) &state[0]); - STATE1 = _mm_load_si128((__m128i*) &state[4]); - MASK = _mm_set_epi64x(0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL); + TMP = _mm_load_si128((__m128i*) &state_in[0]); + STATE1 = _mm_load_si128((__m128i*) &state_in[4]); +// MASK = _mm_set_epi64x(0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL); TMP = _mm_shuffle_epi32(TMP, 0xB1); // CDAB STATE1 = _mm_shuffle_epi32(STATE1, 0x1B); // EFGH @@ -31,8 +32,8 @@ static void sha2_round( const uint8_t input[], uint32_t state[8] ) CDGH_SAVE = STATE1; // Rounds 0-3 - MSG = _mm_load_si128((const __m128i*) (input+0)); - TMSG0 = _mm_shuffle_epi8(MSG, MASK); + TMSG0 = _mm_load_si128((const __m128i*) (input+0)); +// TMSG0 = _mm_shuffle_epi8(MSG, MASK); MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(0xE9B5DBA5B5C0FBCFULL, 0x71374491428A2F98ULL)); STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); MSG = _mm_shuffle_epi32(MSG, 0x0E); @@ -40,7 +41,7 @@ static void sha2_round( const uint8_t input[], uint32_t state[8] ) // Rounds 4-7 TMSG1 = _mm_load_si128((const __m128i*) (input+16)); - TMSG1 = _mm_shuffle_epi8(TMSG1, MASK); +// TMSG1 = _mm_shuffle_epi8(TMSG1, MASK); MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(0xAB1C5ED5923F82A4ULL, 0x59F111F13956C25BULL)); STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); MSG = _mm_shuffle_epi32(MSG, 0x0E); @@ -49,7 +50,7 @@ static void sha2_round( const uint8_t input[], uint32_t state[8] ) // Rounds 8-11 TMSG2 = _mm_load_si128((const __m128i*) (input+32)); - TMSG2 = _mm_shuffle_epi8(TMSG2, MASK); +// TMSG2 = _mm_shuffle_epi8(TMSG2, MASK); MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(0x550C7DC3243185BEULL, 0x12835B01D807AA98ULL)); STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); MSG = _mm_shuffle_epi32(MSG, 0x0E); @@ -58,7 +59,7 @@ static void sha2_round( const uint8_t input[], uint32_t state[8] ) // Rounds 12-15 TMSG3 = _mm_load_si128((const __m128i*) (input+48)); - TMSG3 = _mm_shuffle_epi8(TMSG3, MASK); +// TMSG3 = _mm_shuffle_epi8(TMSG3, MASK); MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(0xC19BF1749BDC06A7ULL, 0x80DEB1FE72BE5D74ULL)); STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4); @@ -192,9 +193,8 @@ static void sha2_round( const uint8_t input[], uint32_t state[8] ) STATE1 = _mm_alignr_epi8(STATE1, TMP, 8); // ABEF // Save state - _mm_store_si128((__m128i*) &state[0], STATE0); - _mm_store_si128((__m128i*) &state[4], STATE1); + _mm_store_si128((__m128i*) &state_out[0], STATE0); + _mm_store_si128((__m128i*) &state_out[4], STATE1); } - #endif diff --git a/algo/sha/sha256-hash-opt.h b/algo/sha/sha256-hash-opt.h new file mode 100644 index 00000000..9ceacf43 --- /dev/null +++ b/algo/sha/sha256-hash-opt.h @@ -0,0 +1,18 @@ +#ifndef SHA2_HASH_OPT_H__ +#define SHA2_HASH_OPT_H__ 1 + +#include +#include "simd-utils.h" + +#if defined(__SHA__) + +void sha256_opt_transform( uint32_t *state_out, const void *input, + const uint32_t *state_in ); + +// 2 way with interleaved instructions +void sha256_ni2way_transform( uint32_t *out_X, uint32_t*out_Y, + const void *msg_X, const void *msg_Y, + const uint32_t *in_X, const uint32_t *in_Y ); + +#endif +#endif diff 
--git a/algo/sha/sha256d-4way.c b/algo/sha/sha256d-4way.c new file mode 100644 index 00000000..9bbc5c8d --- /dev/null +++ b/algo/sha/sha256d-4way.c @@ -0,0 +1,252 @@ +#include "sha256t-gate.h" +#include +#include +#include +#include +#include "sha-hash-4way.h" + +#if defined(SHA256D_16WAY) + +int scanhash_sha256d_16way( struct work *work, const uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ) +{ + __m512i block[16] __attribute__ ((aligned (64))); + __m512i hash32[8] __attribute__ ((aligned (32))); + __m512i initstate[8] __attribute__ ((aligned (32))); + __m512i midstate[8] __attribute__ ((aligned (32))); + uint32_t lane_hash[8] __attribute__ ((aligned (32))); + __m512i vdata[20] __attribute__ ((aligned (32))); + uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] ); + uint32_t *pdata = work->data; + const uint32_t *ptarget = work->target; + const uint32_t targ32_d7 = ptarget[7]; + const uint32_t first_nonce = pdata[19]; + const uint32_t last_nonce = max_nonce - 16; + uint32_t n = first_nonce; + __m512i *noncev = vdata + 19; + const int thr_id = mythr->id; + const bool bench = opt_benchmark; + const __m512i last_byte = m512_const1_32( 0x80000000 ); + const __m512i sixteen = m512_const1_32( 16 ); + + for ( int i = 0; i < 19; i++ ) + vdata[i] = m512_const1_32( pdata[i] ); + + *noncev = _mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+9, n+8, + n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+1, n ); + + // initialize state + initstate[0] = m512_const1_64( 0x6A09E6676A09E667 ); + initstate[1] = m512_const1_64( 0xBB67AE85BB67AE85 ); + initstate[2] = m512_const1_64( 0x3C6EF3723C6EF372 ); + initstate[3] = m512_const1_64( 0xA54FF53AA54FF53A ); + initstate[4] = m512_const1_64( 0x510E527F510E527F ); + initstate[5] = m512_const1_64( 0x9B05688C9B05688C ); + initstate[6] = m512_const1_64( 0x1F83D9AB1F83D9AB ); + initstate[7] = m512_const1_64( 0x5BE0CD195BE0CD19 ); + + // hash first 64 bytes of data + sha256_16way_transform( midstate, vdata, initstate ); + + do + { + // 1. final 16 bytes of data, with padding + memcpy_512( block, vdata + 16, 4 ); + block[ 4] = last_byte; + memset_zero_512( block + 5, 10 ); + block[15] = m512_const1_32( 80*8 ); // bit count + sha256_16way_transform( hash32, block, midstate ); + + // 2. 32 byte hash from 1. 
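+      // The 32 byte digest from step 1 only part-fills a 64 byte block, so it
+      // is padded in place: last_byte in block[8], zero fill through block[14]
+      // and the bit count in block[15].  It is hashed from initstate (a fresh
+      // IV) rather than midstate because the message is now the digest, not
+      // the block header.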
+ memcpy_512( block, hash32, 8 ); + block[ 8] = last_byte; + memset_zero_512( block + 9, 6 ); + block[15] = m512_const1_32( 32*8 ); // bit count + sha256_16way_transform( hash32, block, initstate ); + + // byte swap final hash for testing + mm512_block_bswap_32( hash32, hash32 ); + + for ( int lane = 0; lane < 16; lane++ ) + if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) ) + { + extr_lane_16x32( lane_hash, hash32, lane, 256 ); + if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) ) + { + pdata[19] = n + lane; + submit_solution( work, lane_hash, mythr ); + } + } + *noncev = _mm512_add_epi32( *noncev, sixteen ); + n += 16; + } while ( (n < last_nonce) && !work_restart[thr_id].restart ); + pdata[19] = n; + *hashes_done = n - first_nonce; + return 0; +} + + +#endif + +#if defined(SHA256D_8WAY) + +int scanhash_sha256d_8way( struct work *work, const uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ) +{ + __m256i block[16] __attribute__ ((aligned (64))); + __m256i hash32[8] __attribute__ ((aligned (32))); + __m256i initstate[8] __attribute__ ((aligned (32))); + __m256i midstate[8] __attribute__ ((aligned (32))); + uint32_t lane_hash[8] __attribute__ ((aligned (32))); + __m256i vdata[20] __attribute__ ((aligned (32))); + uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] ); + uint32_t *pdata = work->data; + const uint32_t *ptarget = work->target; + const uint32_t targ32_d7 = ptarget[7]; + const uint32_t first_nonce = pdata[19]; + const uint32_t last_nonce = max_nonce - 8; + uint32_t n = first_nonce; + __m256i *noncev = vdata + 19; + const int thr_id = mythr->id; + const bool bench = opt_benchmark; + const __m256i last_byte = m256_const1_32( 0x80000000 ); + const __m256i eight = m256_const1_32( 8 ); + + for ( int i = 0; i < 19; i++ ) + vdata[i] = m256_const1_32( pdata[i] ); + + *noncev = _mm256_set_epi32( n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+1, n ); + + // initialize state + initstate[0] = m256_const1_64( 0x6A09E6676A09E667 ); + initstate[1] = m256_const1_64( 0xBB67AE85BB67AE85 ); + initstate[2] = m256_const1_64( 0x3C6EF3723C6EF372 ); + initstate[3] = m256_const1_64( 0xA54FF53AA54FF53A ); + initstate[4] = m256_const1_64( 0x510E527F510E527F ); + initstate[5] = m256_const1_64( 0x9B05688C9B05688C ); + initstate[6] = m256_const1_64( 0x1F83D9AB1F83D9AB ); + initstate[7] = m256_const1_64( 0x5BE0CD195BE0CD19 ); + + // hash first 64 bytes of data + sha256_8way_transform( midstate, vdata, initstate ); + + do + { + // 1. final 16 bytes of data, with padding + memcpy_256( block, vdata + 16, 4 ); + block[ 4] = last_byte; + memset_zero_256( block + 5, 10 ); + block[15] = m256_const1_32( 80*8 ); // bit count + sha256_8way_transform( hash32, block, midstate ); + + // 2. 32 byte hash from 1. 
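+      // hash32 stays in working order between the two passes; the single
+      // mm256_block_bswap_32 after the final transform converts it to the
+      // byte order used by extr_lane_8x32 / valid_hash for target testing.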
+ memcpy_256( block, hash32, 8 ); + block[ 8] = last_byte; + memset_zero_256( block + 9, 6 ); + block[15] = m256_const1_32( 32*8 ); // bit count + sha256_8way_transform( hash32, block, initstate ); + + // byte swap final hash for testing + mm256_block_bswap_32( hash32, hash32 ); + + for ( int lane = 0; lane < 8; lane++ ) + if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) ) + { + extr_lane_8x32( lane_hash, hash32, lane, 256 ); + if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) ) + { + pdata[19] = n + lane; + submit_solution( work, lane_hash, mythr ); + } + } + *noncev = _mm256_add_epi32( *noncev, eight ); + n += 8; + } while ( (n < last_nonce) && !work_restart[thr_id].restart ); + pdata[19] = n; + *hashes_done = n - first_nonce; + return 0; +} + +#endif + +#if defined(SHA256D_4WAY) + +int scanhash_sha256d_4way( struct work *work, const uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ) +{ + __m128i block[16] __attribute__ ((aligned (64))); + __m128i hash32[8] __attribute__ ((aligned (32))); + __m128i initstate[8] __attribute__ ((aligned (32))); + __m128i midstate[8] __attribute__ ((aligned (32))); + uint32_t lane_hash[8] __attribute__ ((aligned (32))); + __m128i vdata[20] __attribute__ ((aligned (32))); + uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] ); + uint32_t *pdata = work->data; + const uint32_t *ptarget = work->target; + const uint32_t targ32_d7 = ptarget[7]; + const uint32_t first_nonce = pdata[19]; + const uint32_t last_nonce = max_nonce - 4; + uint32_t n = first_nonce; + __m128i *noncev = vdata + 19; + const int thr_id = mythr->id; + const bool bench = opt_benchmark; + const __m128i last_byte = m128_const1_32( 0x80000000 ); + const __m128i four = m128_const1_32( 4 ); + + for ( int i = 0; i < 19; i++ ) + vdata[i] = m128_const1_32( pdata[i] ); + + *noncev = _mm_set_epi32( n+ 3, n+ 2, n+1, n ); + + // initialize state + initstate[0] = m128_const1_64( 0x6A09E6676A09E667 ); + initstate[1] = m128_const1_64( 0xBB67AE85BB67AE85 ); + initstate[2] = m128_const1_64( 0x3C6EF3723C6EF372 ); + initstate[3] = m128_const1_64( 0xA54FF53AA54FF53A ); + initstate[4] = m128_const1_64( 0x510E527F510E527F ); + initstate[5] = m128_const1_64( 0x9B05688C9B05688C ); + initstate[6] = m128_const1_64( 0x1F83D9AB1F83D9AB ); + initstate[7] = m128_const1_64( 0x5BE0CD195BE0CD19 ); + + // hash first 64 bytes of data + sha256_4way_transform( midstate, vdata, initstate ); + + do + { + // 1. final 16 bytes of data, with padding + memcpy_128( block, vdata + 16, 4 ); + block[ 4] = last_byte; + memset_zero_128( block + 5, 10 ); + block[15] = m128_const1_32( 80*8 ); // bit count + sha256_4way_transform( hash32, block, midstate ); + + // 2. 32 byte hash from 1. 
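+      // Second pass of the double SHA256: the 32 byte digest of the 80 byte
+      // header (midstate for the first 64 bytes plus the final 16 bytes above)
+      // is hashed once more from initstate to give the sha256d result.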
+ memcpy_128( block, hash32, 8 ); + block[ 8] = last_byte; + memset_zero_128( block + 9, 6 ); + block[15] = m128_const1_32( 32*8 ); // bit count + sha256_4way_transform( hash32, block, initstate ); + + // byte swap final hash for testing + mm128_block_bswap_32( hash32, hash32 ); + + for ( int lane = 0; lane < 4; lane++ ) + if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) ) + { + extr_lane_4x32( lane_hash, hash32, lane, 256 ); + if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) ) + { + pdata[19] = n + lane; + submit_solution( work, lane_hash, mythr ); + } + } + *noncev = _mm_add_epi32( *noncev, four ); + n += 4; + } while ( (n < last_nonce) && !work_restart[thr_id].restart ); + pdata[19] = n; + *hashes_done = n - first_nonce; + return 0; +} + +#endif + diff --git a/algo/sha/sha256t-4way.c b/algo/sha/sha256t-4way.c index eb11744f..0f4fb58d 100644 --- a/algo/sha/sha256t-4way.c +++ b/algo/sha/sha256t-4way.c @@ -7,133 +7,173 @@ #if defined(SHA256T_16WAY) -static __thread sha256_16way_context sha256_ctx16 __attribute__ ((aligned (64))); - -void sha256t_16way_hash( void* output, const void* input ) -{ - uint32_t vhash[8*16] __attribute__ ((aligned (64))); - sha256_16way_context ctx; - memcpy( &ctx, &sha256_ctx16, sizeof ctx ); - - sha256_16way_update( &ctx, input + (64<<4), 16 ); - sha256_16way_close( &ctx, vhash ); - - sha256_16way_init( &ctx ); - sha256_16way_update( &ctx, vhash, 32 ); - sha256_16way_close( &ctx, vhash ); - - sha256_16way_init( &ctx ); - sha256_16way_update( &ctx, vhash, 32 ); - sha256_16way_close( &ctx, output ); -} - int scanhash_sha256t_16way( struct work *work, const uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ) { - uint32_t vdata[20*16] __attribute__ ((aligned (64))); - uint32_t hash32[8*16] __attribute__ ((aligned (32))); + __m512i block[16] __attribute__ ((aligned (64))); + __m512i hash32[8] __attribute__ ((aligned (32))); + __m512i initstate[8] __attribute__ ((aligned (32))); + __m512i midstate[8] __attribute__ ((aligned (32))); + __m512i midstate2[8] __attribute__ ((aligned (32))); uint32_t lane_hash[8] __attribute__ ((aligned (32))); - uint32_t *hash32_d7 = &(hash32[7<<4]); + __m512i vdata[20] __attribute__ ((aligned (32))); + uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] ); uint32_t *pdata = work->data; const uint32_t *ptarget = work->target; const uint32_t targ32_d7 = ptarget[7]; const uint32_t first_nonce = pdata[19]; const uint32_t last_nonce = max_nonce - 16; uint32_t n = first_nonce; - __m512i *noncev = (__m512i*)vdata + 19; // aligned + __m512i *noncev = vdata + 19; const int thr_id = mythr->id; const bool bench = opt_benchmark; + const __m512i last_byte = m512_const1_32( 0x80000000 ); + const __m512i sixteen = m512_const1_32( 16 ); + + for ( int i = 0; i < 19; i++ ) + vdata[i] = m512_const1_32( pdata[i] ); - mm512_bswap32_intrlv80_16x32( vdata, pdata ); *noncev = _mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+9, n+8, n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+1, n ); - sha256_16way_init( &sha256_ctx16 ); - sha256_16way_update( &sha256_ctx16, vdata, 64 ); + + // initialize state + initstate[0] = m512_const1_64( 0x6A09E6676A09E667 ); + initstate[1] = m512_const1_64( 0xBB67AE85BB67AE85 ); + initstate[2] = m512_const1_64( 0x3C6EF3723C6EF372 ); + initstate[3] = m512_const1_64( 0xA54FF53AA54FF53A ); + initstate[4] = m512_const1_64( 0x510E527F510E527F ); + initstate[5] = m512_const1_64( 0x9B05688C9B05688C ); + initstate[6] = m512_const1_64( 0x1F83D9AB1F83D9AB ); + initstate[7] = m512_const1_64( 0x5BE0CD195BE0CD19 ); + + // hash first 
64 byte block of data + sha256_16way_transform( midstate, vdata, initstate ); + + // Do 3 rounds on the first 12 bytes of the next block + sha256_16way_prehash_3rounds( midstate2, vdata + 16, midstate ); do { - pdata[19] = n; - sha256t_16way_hash( hash32, vdata ); - for ( int lane = 0; lane < 16; lane++ ) - if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) ) - { - extr_lane_16x32( lane_hash, hash32, lane, 256 ); - if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) ) - { - pdata[19] = bswap_32( n + lane ); - submit_solution( work, lane_hash, mythr ); - } - } - *noncev = _mm512_add_epi32( *noncev, m512_const1_32( 16 ) ); - n += 16; + // 1. final 16 bytes of data, with padding + memcpy_512( block, vdata + 16, 4 ); + block[ 4] = last_byte; + memset_zero_512( block + 5, 10 ); + block[15] = m512_const1_32( 80*8 ); // bit count + sha256_16way_final_rounds( hash32, block, midstate, midstate2 ); +// sha256_16way_transform( hash32, block, midstate ); + + // 2. 32 byte hash from 1. + memcpy_512( block, hash32, 8 ); + block[ 8] = last_byte; + memset_zero_512( block + 9, 6 ); + block[15] = m512_const1_32( 32*8 ); // bit count + sha256_16way_transform( hash32, block, initstate ); + + // 3. 32 byte hash from 2. + memcpy_512( block, hash32, 8 ); + sha256_16way_transform( hash32, block, initstate ); + + // byte swap final hash for testing + mm512_block_bswap_32( hash32, hash32 ); + + for ( int lane = 0; lane < 16; lane++ ) + if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) ) + { + extr_lane_16x32( lane_hash, hash32, lane, 256 ); + if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) ) + { + pdata[19] = n + lane; + submit_solution( work, lane_hash, mythr ); + } + } + *noncev = _mm512_add_epi32( *noncev, sixteen ); + n += 16; } while ( (n < last_nonce) && !work_restart[thr_id].restart ); pdata[19] = n; *hashes_done = n - first_nonce; return 0; } + #endif #if defined(SHA256T_8WAY) -static __thread sha256_8way_context sha256_ctx8 __attribute__ ((aligned (64))); - -void sha256t_8way_hash( void* output, const void* input ) -{ - uint32_t vhash[8*8] __attribute__ ((aligned (64))); - sha256_8way_context ctx; - memcpy( &ctx, &sha256_ctx8, sizeof ctx ); - - sha256_8way_update( &ctx, input + (64<<3), 16 ); - sha256_8way_close( &ctx, vhash ); - - sha256_8way_init( &ctx ); - sha256_8way_update( &ctx, vhash, 32 ); - sha256_8way_close( &ctx, vhash ); - - sha256_8way_init( &ctx ); - sha256_8way_update( &ctx, vhash, 32 ); - sha256_8way_close( &ctx, output ); -} - int scanhash_sha256t_8way( struct work *work, const uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ) { - uint32_t vdata[20*8] __attribute__ ((aligned (64))); - uint32_t hash32[8*8] __attribute__ ((aligned (32))); + __m256i block[16] __attribute__ ((aligned (64))); + __m256i hash32[8] __attribute__ ((aligned (32))); + __m256i initstate[8] __attribute__ ((aligned (32))); + __m256i midstate[8] __attribute__ ((aligned (32))); uint32_t lane_hash[8] __attribute__ ((aligned (32))); - uint32_t *hash32_d7 = &(hash32[7<<3]); + __m256i vdata[20] __attribute__ ((aligned (32))); + uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] ); uint32_t *pdata = work->data; const uint32_t *ptarget = work->target; const uint32_t targ32_d7 = ptarget[7]; const uint32_t first_nonce = pdata[19]; const uint32_t last_nonce = max_nonce - 8; uint32_t n = first_nonce; - __m256i *noncev = (__m256i*)vdata + 19; // aligned + __m256i *noncev = vdata + 19; const int thr_id = mythr->id; const bool bench = opt_benchmark; + const __m256i last_byte = m256_const1_32( 0x80000000 ); + 
const __m256i eight = m256_const1_32( 8 ); + + for ( int i = 0; i < 19; i++ ) + vdata[i] = m256_const1_32( pdata[i] ); + + *noncev = _mm256_set_epi32( n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+1, n ); - mm256_bswap32_intrlv80_8x32( vdata, pdata ); - *noncev = _mm256_set_epi32( n+7, n+6, n+5, n+4, n+3, n+2, n+1, n ); - sha256_8way_init( &sha256_ctx8 ); - sha256_8way_update( &sha256_ctx8, vdata, 64 ); + // initialize state + initstate[0] = m256_const1_64( 0x6A09E6676A09E667 ); + initstate[1] = m256_const1_64( 0xBB67AE85BB67AE85 ); + initstate[2] = m256_const1_64( 0x3C6EF3723C6EF372 ); + initstate[3] = m256_const1_64( 0xA54FF53AA54FF53A ); + initstate[4] = m256_const1_64( 0x510E527F510E527F ); + initstate[5] = m256_const1_64( 0x9B05688C9B05688C ); + initstate[6] = m256_const1_64( 0x1F83D9AB1F83D9AB ); + initstate[7] = m256_const1_64( 0x5BE0CD195BE0CD19 ); + + // hash first 64 bytes of data + sha256_8way_transform( midstate, vdata, initstate ); do { - pdata[19] = n; - sha256t_8way_hash( hash32, vdata ); - for ( int lane = 0; lane < 8; lane++ ) - if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) ) - { - extr_lane_8x32( lane_hash, hash32, lane, 256 ); - if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) ) - { - pdata[19] = bswap_32( n + lane ); - submit_solution( work, lane_hash, mythr ); - } - } - *noncev = _mm256_add_epi32( *noncev, m256_const1_32( 8 ) ); - n += 8; + // 1. final 16 bytes of data, with padding + memcpy_256( block, vdata + 16, 4 ); + block[ 4] = last_byte; + memset_zero_256( block + 5, 10 ); + block[15] = m256_const1_32( 80*8 ); // bit count + sha256_8way_transform( hash32, block, midstate ); + + // 2. 32 byte hash from 1. + memcpy_256( block, hash32, 8 ); + block[ 8] = last_byte; + memset_zero_256( block + 9, 6 ); + block[15] = m256_const1_32( 32*8 ); // bit count + sha256_8way_transform( hash32, block, initstate ); + + // 3. 32 byte hash from 2. 
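+      // Third pass for triple SHA256: only block[0..7] change, the padding
+      // word and bit count written for step 2 are still valid in block[8..15]
+      // so they are not rewritten here.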
+ memcpy_256( block, hash32, 8 ); + sha256_8way_transform( hash32, block, initstate ); + + // byte swap final hash for testing + mm256_block_bswap_32( hash32, hash32 ); + + for ( int lane = 0; lane < 8; lane++ ) + if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) ) + { + extr_lane_8x32( lane_hash, hash32, lane, 256 ); + if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) ) + { + pdata[19] = n + lane; + submit_solution( work, lane_hash, mythr ); + } + } + *noncev = _mm256_add_epi32( *noncev, eight ); + n += 8; } while ( (n < last_nonce) && !work_restart[thr_id].restart ); pdata[19] = n; *hashes_done = n - first_nonce; @@ -144,82 +184,84 @@ int scanhash_sha256t_8way( struct work *work, const uint32_t max_nonce, #if defined(SHA256T_4WAY) -static __thread sha256_4way_context sha256_ctx4 __attribute__ ((aligned (64))); - -void sha256t_4way_hash( void* output, const void* input ) -{ - uint32_t vhash[8*4] __attribute__ ((aligned (64))); - sha256_4way_context ctx; - memcpy( &ctx, &sha256_ctx4, sizeof ctx ); - - sha256_4way_update( &ctx, input + (64<<2), 16 ); - sha256_4way_close( &ctx, vhash ); - - sha256_4way_init( &ctx ); - sha256_4way_update( &ctx, vhash, 32 ); - sha256_4way_close( &ctx, vhash ); - - sha256_4way_init( &ctx ); - sha256_4way_update( &ctx, vhash, 32 ); - sha256_4way_close( &ctx, output ); -} - int scanhash_sha256t_4way( struct work *work, const uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ) + uint64_t *hashes_done, struct thr_info *mythr ) { - uint32_t vdata[20*4] __attribute__ ((aligned (64))); - uint32_t hash[8*4] __attribute__ ((aligned (32))); - uint32_t lane_hash[8] __attribute__ ((aligned (64))); - uint32_t *hash7 = &(hash[7<<2]); + __m128i block[16] __attribute__ ((aligned (64))); + __m128i hash32[8] __attribute__ ((aligned (32))); + __m128i initstate[8] __attribute__ ((aligned (32))); + __m128i midstate[8] __attribute__ ((aligned (32))); + uint32_t lane_hash[8] __attribute__ ((aligned (32))); + __m128i vdata[20] __attribute__ ((aligned (32))); + uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] ); uint32_t *pdata = work->data; const uint32_t *ptarget = work->target; - const uint32_t Htarg = ptarget[7]; + const uint32_t targ32_d7 = ptarget[7]; const uint32_t first_nonce = pdata[19]; + const uint32_t last_nonce = max_nonce - 4; uint32_t n = first_nonce; - __m128i *noncev = (__m128i*)vdata + 19; // aligned + __m128i *noncev = vdata + 19; const int thr_id = mythr->id; + const bool bench = opt_benchmark; + const __m128i last_byte = m128_const1_32( 0x80000000 ); + const __m128i four = m128_const1_32( 4 ); - const uint64_t htmax[] = { 0, - 0xF, - 0xFF, - 0xFFF, - 0xFFFF, - 0x10000000 }; - const uint32_t masks[] = { 0xFFFFFFFF, - 0xFFFFFFF0, - 0xFFFFFF00, - 0xFFFFF000, - 0xFFFF0000, - 0 }; - - mm128_bswap32_intrlv80_4x32( vdata, pdata ); - sha256_4way_init( &sha256_ctx4 ); - sha256_4way_update( &sha256_ctx4, vdata, 64 ); - - for ( int m = 0; m < 6; m++ ) if ( Htarg <= htmax[m] ) - { - const uint32_t mask = masks[m]; - do { - *noncev = mm128_bswap_32( _mm_set_epi32( n+3,n+2,n+1,n ) ); - pdata[19] = n; + for ( int i = 0; i < 19; i++ ) + vdata[i] = m128_const1_32( pdata[i] ); + + *noncev = _mm_set_epi32( n+ 3, n+ 2, n+1, n ); + + // initialize state + initstate[0] = m128_const1_64( 0x6A09E6676A09E667 ); + initstate[1] = m128_const1_64( 0xBB67AE85BB67AE85 ); + initstate[2] = m128_const1_64( 0x3C6EF3723C6EF372 ); + initstate[3] = m128_const1_64( 0xA54FF53AA54FF53A ); + initstate[4] = m128_const1_64( 0x510E527F510E527F ); + initstate[5] = m128_const1_64( 
0x9B05688C9B05688C ); + initstate[6] = m128_const1_64( 0x1F83D9AB1F83D9AB ); + initstate[7] = m128_const1_64( 0x5BE0CD195BE0CD19 ); - sha256t_4way_hash( hash, vdata ); + // hash first 64 bytes of data + sha256_4way_transform( midstate, vdata, initstate ); - for ( int lane = 0; lane < 4; lane++ ) - if ( !( hash7[ lane ] & mask ) ) + do + { + // 1. final 16 bytes of data, with padding + memcpy_128( block, vdata + 16, 4 ); + block[ 4] = last_byte; + memset_zero_128( block + 5, 10 ); + block[15] = m128_const1_32( 80*8 ); // bit count + sha256_4way_transform( hash32, block, midstate ); + + // 2. 32 byte hash from 1. + memcpy_128( block, hash32, 8 ); + block[ 8] = last_byte; + memset_zero_128( block + 9, 6 ); + block[15] = m128_const1_32( 32*8 ); // bit count + sha256_4way_transform( hash32, block, initstate ); + + // 3. 32 byte hash from 2. + memcpy_128( block, hash32, 8 ); + sha256_4way_transform( hash32, block, initstate ); + + // byte swap final hash for testing + mm128_block_bswap_32( hash32, hash32 ); + + for ( int lane = 0; lane < 4; lane++ ) + if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) ) + { + extr_lane_4x32( lane_hash, hash32, lane, 256 ); + if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) ) { - extr_lane_4x32( lane_hash, hash, lane, 256 ); - if ( fulltest( lane_hash, ptarget ) && !opt_benchmark ) - { - pdata[19] = n + lane; - submit_solution( work, lane_hash, mythr ); - } - } - n += 4; - } while ( (n < max_nonce - 4) && !work_restart[thr_id].restart ); - break; - } - *hashes_done = n - first_nonce + 1; + pdata[19] = n + lane; + submit_solution( work, lane_hash, mythr ); + } + } + *noncev = _mm_add_epi32( *noncev, four ); + n += 4; + } while ( (n < last_nonce) && !work_restart[thr_id].restart ); + pdata[19] = n; + *hashes_done = n - first_nonce; return 0; } diff --git a/algo/sha/sha256t-gate.c b/algo/sha/sha256t-gate.c index 166efe22..e05c7060 100644 --- a/algo/sha/sha256t-gate.c +++ b/algo/sha/sha256t-gate.c @@ -5,17 +5,13 @@ bool register_sha256t_algo( algo_gate_t* gate ) gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT; #if defined(SHA256T_16WAY) gate->scanhash = (void*)&scanhash_sha256t_16way; - gate->hash = (void*)&sha256t_16way_hash; #elif defined(__SHA__) gate->optimizations = SHA_OPT; gate->scanhash = (void*)&scanhash_sha256t; - gate->hash = (void*)&sha256t_hash; #elif defined(SHA256T_8WAY) gate->scanhash = (void*)&scanhash_sha256t_8way; - gate->hash = (void*)&sha256t_8way_hash; #else gate->scanhash = (void*)&scanhash_sha256t_4way; - gate->hash = (void*)&sha256t_4way_hash; #endif return true; } diff --git a/algo/sha/sha256t-gate.h b/algo/sha/sha256t-gate.h index 46266f2b..e74cfd1d 100644 --- a/algo/sha/sha256t-gate.h +++ b/algo/sha/sha256t-gate.h @@ -17,7 +17,6 @@ bool register_sha256q_algo( algo_gate_t* gate ); #if defined(SHA256T_16WAY) -void sha256t_16way_hash( void *output, const void *input ); int scanhash_sha256t_16way( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ); void sha256q_16way_hash( void *output, const void *input ); @@ -27,7 +26,6 @@ int scanhash_sha256q_16way( struct work *work, uint32_t max_nonce, #if defined(SHA256T_8WAY) -void sha256t_8way_hash( void *output, const void *input ); int scanhash_sha256t_8way( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ); void sha256q_8way_hash( void *output, const void *input ); @@ -37,7 +35,6 @@ int scanhash_sha256q_8way( struct work *work, uint32_t max_nonce, #if defined(SHA256T_4WAY) -void sha256t_4way_hash( void *output, 
const void *input ); int scanhash_sha256t_4way( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ); void sha256q_4way_hash( void *output, const void *input ); @@ -45,10 +42,13 @@ int scanhash_sha256q_4way( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ); #endif +#if defined(__SHA__) -int sha256t_hash( void *output, const void *input ); int scanhash_sha256t( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ); + +#endif + int sha256q_hash( void *output, const void *input ); int scanhash_sha256q( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ); diff --git a/algo/sha/sha256t.c b/algo/sha/sha256t.c index bd4edf0f..90d2754b 100644 --- a/algo/sha/sha256t.c +++ b/algo/sha/sha256t.c @@ -3,10 +3,14 @@ #include #include #include -#include "algo/sha/sph_sha2.h" +//#include "algo/sha/sph_sha2.h" +#include "sha256-hash-opt.h" + +#if defined(__SHA__) // Only used on CPUs with SHA +/* static __thread sph_sha256_context sha256t_ctx __attribute__ ((aligned (64))); void sha256t_midstate( const void* input ) @@ -37,12 +41,21 @@ int sha256t_hash( void* output, const void* input ) return 1; } +*/ +/* int scanhash_sha256t( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ) { - uint32_t edata[20] __attribute__((aligned(64))); - uint32_t hash[8] __attribute__((aligned(64))); + uint32_t block[16] __attribute__ ((aligned (64))); + uint32_t hash32[8] __attribute__ ((aligned (32))); + uint32_t initstate[8] __attribute__ ((aligned (32))); + uint32_t midstate[8] __attribute__ ((aligned (32))); + + + +// uint32_t edata[20] __attribute__((aligned(64))); +// uint32_t hash[8] __attribute__((aligned(64))); uint32_t *pdata = work->data; uint32_t *ptarget = work->target; const uint32_t first_nonce = pdata[19]; @@ -50,24 +63,148 @@ int scanhash_sha256t( struct work *work, uint32_t max_nonce, uint32_t n = first_nonce; const int thr_id = mythr->id; const bool bench = opt_benchmark; + __m128i shuf_bswap32 = + _mm_set_epi64x( 0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL ); + +// mm128_bswap32_80( edata, pdata ); +// sha256t_midstate( edata ); + + // initialize state + initstate[0] = 0x6A09E667; + initstate[1] = 0xBB67AE85; + initstate[2] = 0x3C6EF372; + initstate[3] = 0xA54FF53A; + initstate[4] = 0x510E527F; + initstate[5] = 0x9B05688C; + initstate[6] = 0x1F83D9AB; + initstate[7] = 0x5BE0CD19; - mm128_bswap32_80( edata, pdata ); - sha256t_midstate( edata ); + // hash first 64 bytes of data + sha256_opt_transform( midstate, pdata, initstate ); do { - edata[19] = n; - if ( likely( sha256t_hash( hash, edata ) ) ) - if ( unlikely( valid_hash( hash, ptarget ) && !bench ) ) - { - pdata[19] = bswap_32( n ); - submit_solution( work, hash, mythr ); - } + // 1. final 16 bytes of data, with padding + memcpy( block, pdata + 16, 16 ); + block[ 4] = 0x80000000; + memset( block + 5, 0, 40 ); + block[15] = 80*8; // bit count + sha256_opt_transform( hash32, block, midstate ); + + // 2. 32 byte hash from 1. + memcpy( block, hash32, 32 ); + block[ 8] = 0x80000000; + memset( block + 9, 0, 24 ); + block[15] = 32*8; // bit count + sha256_opt_transform( hash32, block, initstate ); + + // 3. 32 byte hash from 2. 
+ memcpy( block, hash32, 32 ); + sha256_opt_transform( hash32, block, initstate ); + + // byte swap final hash for testing + casti_m128i( hash32, 0 ) = + _mm_shuffle_epi8( casti_m128i( hash32, 0 ), shuf_bswap32 ); + casti_m128i( hash32, 1 ) = + _mm_shuffle_epi8( casti_m128i( hash32, 1 ), shuf_bswap32 ); + + if ( unlikely( valid_hash( hash32, ptarget ) && !bench ) ) + submit_solution( work, hash32, mythr ); n++; - } while ( n < last_nonce && !work_restart[thr_id].restart ); + pdata[19] = n; + } while ( (n < last_nonce) && !work_restart[thr_id].restart ); + *hashes_done = n - first_nonce; + return 0; +} +*/ + +int scanhash_sha256t( struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ) +{ + uint32_t block0[16] __attribute__ ((aligned (64))); + uint32_t block1[16] __attribute__ ((aligned (64))); + uint32_t hash0[8] __attribute__ ((aligned (32))); + uint32_t hash1[8] __attribute__ ((aligned (32))); + uint32_t initstate[8] __attribute__ ((aligned (32))); + uint32_t midstate[8] __attribute__ ((aligned (32))); + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + const uint32_t first_nonce = pdata[19]; + const uint32_t last_nonce = max_nonce - 1; + uint32_t n = first_nonce; + const int thr_id = mythr->id; + const bool bench = opt_benchmark; + __m128i shuf_bswap32 = + _mm_set_epi64x( 0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL ); + + // initialize state + initstate[0] = 0x6A09E667; + initstate[1] = 0xBB67AE85; + initstate[2] = 0x3C6EF372; + initstate[3] = 0xA54FF53A; + initstate[4] = 0x510E527F; + initstate[5] = 0x9B05688C; + initstate[6] = 0x1F83D9AB; + initstate[7] = 0x5BE0CD19; + + // hash first 64 bytes of data + sha256_opt_transform( midstate, pdata, initstate ); + + do + { + // 1. final 16 bytes of data, with padding + memcpy( block0, pdata + 16, 16 ); + memcpy( block1, pdata + 16, 16 ); + block0[ 3] = n; + block1[ 3] = n+1; + block0[ 4] = block1[ 4] = 0x80000000; + memset( block0 + 5, 0, 40 ); + memset( block1 + 5, 0, 40 ); + block0[15] = block1[15] = 80*8; // bit count + sha256_ni2way_transform( hash0, hash1, block0, block1, midstate, midstate ); + + // 2. 32 byte hash from 1. + memcpy( block0, hash0, 32 ); + memcpy( block1, hash1, 32 ); + block0[ 8] = block1[ 8] = 0x80000000; + memset( block0 + 9, 0, 24 ); + memset( block1 + 9, 0, 24 ); + block0[15] = block1[15] = 32*8; // bit count + sha256_ni2way_transform( hash0, hash1, block0, block1, initstate, initstate ); + + // 3. 32 byte hash from 2. 
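+      // As in the earlier steps, two independent nonces (n and n+1) are pushed
+      // through sha256_ni2way_transform together; interleaving the two message
+      // streams presumably helps hide the latency of the SHA extension
+      // instructions.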
+ memcpy( block0, hash0, 32 ); + memcpy( block1, hash1, 32 ); + sha256_ni2way_transform( hash0, hash1, block0, block1, initstate, initstate ); + + // byte swap final hash for testing + casti_m128i( hash0, 0 ) = + _mm_shuffle_epi8( casti_m128i( hash0, 0 ), shuf_bswap32 ); + casti_m128i( hash0, 1 ) = + _mm_shuffle_epi8( casti_m128i( hash0, 1 ), shuf_bswap32 ); + casti_m128i( hash1, 0 ) = + _mm_shuffle_epi8( casti_m128i( hash1, 0 ), shuf_bswap32 ); + casti_m128i( hash1, 1 ) = + _mm_shuffle_epi8( casti_m128i( hash1, 1 ), shuf_bswap32 ); + + if ( unlikely( valid_hash( hash0, ptarget ) && !bench ) ) + { + pdata[19] = n; + submit_solution( work, hash0, mythr ); + } + if ( unlikely( valid_hash( hash1, ptarget ) && !bench ) ) + { + pdata[19] = n+1; + submit_solution( work, hash1, mythr ); + } + n += 2; + } while ( (n < last_nonce) && !work_restart[thr_id].restart ); + pdata[19] = n; + *hashes_done = n - first_nonce; return 0; } +#endif diff --git a/algo/sha/sha512-hash-4way.c b/algo/sha/sha512-hash-4way.c index 803c42f5..e41a92ba 100644 --- a/algo/sha/sha512-hash-4way.c +++ b/algo/sha/sha512-hash-4way.c @@ -96,86 +96,22 @@ static const uint64_t K512[80] = // SHA-512 8 way 64 bit #define CH8W(X, Y, Z) \ - _mm512_xor_si512( _mm512_and_si512( _mm512_xor_si512( Y, Z ), X ), Z ) - -/* -#define MAJ8W(X, Y, Z) \ - _mm512_or_si512( _mm512_and_si512( X, Y ), \ - _mm512_and_si512( _mm512_or_si512( X, Y ), Z ) ) -*/ -/* Functionally identical to original but optimizable, - * subexpression X^Y from one step can be reused in the next step as Y^Z -#define MAJ8W(X, Y, Z) \ - _mm512_xor_si512( Y, _mm512_and_si512( _mm512_xor_si512( X, Y ), \ - _mm512_xor_si512( Y, Z ) ) ) -*/ + _mm512_ternarylogic_epi64( X, Y, Z, 0xca ) #define MAJ8W(X, Y, Z) \ - _mm512_xor_si512( Y, _mm512_and_si512( X_xor_Y = _mm512_xor_si512( X, Y ), \ - Y_xor_Z ) ) + _mm512_ternarylogic_epi64( X, Y, Z, 0xe8 ) #define BSG8W_5_0(x) \ - _mm512_xor_si512( _mm512_xor_si512( \ - mm512_ror_64(x, 28), mm512_ror_64(x, 34) ), mm512_ror_64(x, 39) ) + mm512_xor3( mm512_ror_64(x, 28), mm512_ror_64(x, 34), mm512_ror_64(x, 39) ) #define BSG8W_5_1(x) \ - _mm512_xor_si512( _mm512_xor_si512( \ - mm512_ror_64(x, 14), mm512_ror_64(x, 18) ), mm512_ror_64(x, 41) ) + mm512_xor3( mm512_ror_64(x, 14), mm512_ror_64(x, 18), mm512_ror_64(x, 41) ) #define SSG8W_5_0(x) \ - _mm512_xor_si512( _mm512_xor_si512( \ - mm512_ror_64(x, 1), mm512_ror_64(x, 8) ), _mm512_srli_epi64(x, 7) ) + mm512_xor3( mm512_ror_64(x, 1), mm512_ror_64(x, 8), _mm512_srli_epi64(x, 7) ) #define SSG8W_5_1(x) \ - _mm512_xor_si512( _mm512_xor_si512( \ - mm512_ror_64(x, 19), mm512_ror_64(x, 61) ), _mm512_srli_epi64(x, 6) ) - -static inline __m512i ssg8w_512_add( __m512i w0, __m512i w1 ) -{ - __m512i w0a, w1a, w0b, w1b; - w0a = mm512_ror_64( w0, 1 ); - w1a = mm512_ror_64( w1,19 ); - w0b = mm512_ror_64( w0, 8 ); - w1b = mm512_ror_64( w1,61 ); - w0a = _mm512_xor_si512( w0a, w0b ); - w1a = _mm512_xor_si512( w1a, w1b ); - w0b = _mm512_srli_epi64( w0, 7 ); - w1b = _mm512_srli_epi64( w1, 6 ); - w0a = _mm512_xor_si512( w0a, w0b ); - w1a = _mm512_xor_si512( w1a, w1b ); - return _mm512_add_epi64( w0a, w1a ); -} - - -#define SSG8W_512x2_0( w0, w1, i ) do \ -{ \ - __m512i X0a, X1a, X0b, X1b; \ - X0a = mm512_ror_64( W[i-15], 1 ); \ - X1a = mm512_ror_64( W[i-14], 1 ); \ - X0b = mm512_ror_64( W[i-15], 8 ); \ - X1b = mm512_ror_64( W[i-14], 8 ); \ - X0a = _mm512_xor_si512( X0a, X0b ); \ - X1a = _mm512_xor_si512( X1a, X1b ); \ - X0b = _mm512_srli_epi64( W[i-15], 7 ); \ - X1b = _mm512_srli_epi64( W[i-14], 7 ); \ - w0 = 
_mm512_xor_si512( X0a, X0b ); \ - w1 = _mm512_xor_si512( X1a, X1b ); \ -} while(0) - -#define SSG8W_512x2_1( w0, w1, i ) do \ -{ \ - __m512i X0a, X1a, X0b, X1b; \ - X0a = mm512_ror_64( W[i-2],19 ); \ - X1a = mm512_ror_64( W[i-1],19 ); \ - X0b = mm512_ror_64( W[i-2],61 ); \ - X1b = mm512_ror_64( W[i-1],61 ); \ - X0a = _mm512_xor_si512( X0a, X0b ); \ - X1a = _mm512_xor_si512( X1a, X1b ); \ - X0b = _mm512_srli_epi64( W[i-2], 6 ); \ - X1b = _mm512_srli_epi64( W[i-1], 6 ); \ - w0 = _mm512_xor_si512( X0a, X0b ); \ - w1 = _mm512_xor_si512( X1a, X1b ); \ -} while(0) + mm512_xor3( mm512_ror_64(x, 19), mm512_ror_64(x, 61), _mm512_srli_epi64(x, 6) ) #define SHA3_8WAY_STEP(A, B, C, D, E, F, G, H, i) \ do { \ @@ -184,7 +120,6 @@ do { \ T1 = _mm512_add_epi64( H, mm512_add4_64( BSG8W_5_1(E), CH8W(E, F, G), \ K, W[i] ) ); \ T2 = _mm512_add_epi64( BSG8W_5_0(A), MAJ8W(A, B, C) ); \ - Y_xor_Z = X_xor_Y; \ D = _mm512_add_epi64( D, T1 ); \ H = _mm512_add_epi64( T1, T2 ); \ } while (0) @@ -193,15 +128,15 @@ static void sha512_8way_round( sha512_8way_context *ctx, __m512i *in, __m512i r[8] ) { int i; - register __m512i A, B, C, D, E, F, G, H, X_xor_Y, Y_xor_Z; + register __m512i A, B, C, D, E, F, G, H; __m512i W[80]; mm512_block_bswap_64( W , in ); mm512_block_bswap_64( W+8, in+8 ); for ( i = 16; i < 80; i++ ) - W[i] = _mm512_add_epi64( ssg8w_512_add( W[i-15], W[i-2] ), - _mm512_add_epi64( W[ i- 7 ], W[ i-16 ] ) ); + W[i] = mm512_add4_64( SSG8W_5_0( W[i-15] ), SSG8W_5_1( W[i-2] ), + W[ i- 7 ], W[ i-16 ] ); if ( ctx->initialized ) { @@ -226,8 +161,6 @@ sha512_8way_round( sha512_8way_context *ctx, __m512i *in, __m512i r[8] ) H = m512_const1_64( 0x5BE0CD19137E2179 ); } - Y_xor_Z = _mm512_xor_si512( B, C ); - for ( i = 0; i < 80; i += 8 ) { SHA3_8WAY_STEP( A, B, C, D, E, F, G, H, i + 0 ); diff --git a/algo/sha/sph_sha2.c b/algo/sha/sph_sha2.c index b67b0143..5e70c3e8 100644 --- a/algo/sha/sph_sha2.c +++ b/algo/sha/sph_sha2.c @@ -73,7 +73,194 @@ static const sph_u32 H256[8] = { #if defined(__SHA__) -#include "sha256-hash-opt.c" +#include "simd-utils.h" + +static void sha2_round( const uint8_t input[], uint32_t state[8] ) +{ + __m128i STATE0, STATE1; + __m128i MSG, TMP, MASK; + __m128i TMSG0, TMSG1, TMSG2, TMSG3; + __m128i ABEF_SAVE, CDGH_SAVE; + + // Load initial values + TMP = _mm_load_si128((__m128i*) &state[0]); + STATE1 = _mm_load_si128((__m128i*) &state[4]); + MASK = _mm_set_epi64x(0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL); + + TMP = _mm_shuffle_epi32(TMP, 0xB1); // CDAB + STATE1 = _mm_shuffle_epi32(STATE1, 0x1B); // EFGH + STATE0 = _mm_alignr_epi8(TMP, STATE1, 8); // ABEF + STATE1 = _mm_blend_epi16(STATE1, TMP, 0xF0); // CDGH + + // Save current hash + ABEF_SAVE = STATE0; + CDGH_SAVE = STATE1; + + // Rounds 0-3 + MSG = _mm_load_si128((const __m128i*) (input+0)); + TMSG0 = _mm_shuffle_epi8(MSG, MASK); + MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(0xE9B5DBA5B5C0FBCFULL, 0x71374491428A2F98ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + + // Rounds 4-7 + TMSG1 = _mm_load_si128((const __m128i*) (input+16)); + TMSG1 = _mm_shuffle_epi8(TMSG1, MASK); + MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(0xAB1C5ED5923F82A4ULL, 0x59F111F13956C25BULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1); + + // Rounds 8-11 + TMSG2 = _mm_load_si128((const __m128i*) (input+32)); + 
TMSG2 = _mm_shuffle_epi8(TMSG2, MASK); + MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(0x550C7DC3243185BEULL, 0x12835B01D807AA98ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2); + + // Rounds 12-15 + TMSG3 = _mm_load_si128((const __m128i*) (input+48)); + TMSG3 = _mm_shuffle_epi8(TMSG3, MASK); + MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(0xC19BF1749BDC06A7ULL, 0x80DEB1FE72BE5D74ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4); + TMSG0 = _mm_add_epi32(TMSG0, TMP); + TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3); + + // Rounds 16-19 + MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(0x240CA1CC0FC19DC6ULL, 0xEFBE4786E49B69C1ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4); + TMSG1 = _mm_add_epi32(TMSG1, TMP); + TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0); + + // Rounds 20-23 + MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(0x76F988DA5CB0A9DCULL, 0x4A7484AA2DE92C6FULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4); + TMSG2 = _mm_add_epi32(TMSG2, TMP); + TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1); + + // Rounds 24-27 + MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(0xBF597FC7B00327C8ULL, 0xA831C66D983E5152ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4); + TMSG3 = _mm_add_epi32(TMSG3, TMP); + TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2); + + // Rounds 28-31 + MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(0x1429296706CA6351ULL, 0xD5A79147C6E00BF3ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4); + TMSG0 = _mm_add_epi32(TMSG0, TMP); + TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3); + + // Rounds 32-35 + MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(0x53380D134D2C6DFCULL, 0x2E1B213827B70A85ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4); + TMSG1 = _mm_add_epi32(TMSG1, TMP); + TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0); + + // Rounds 36-39 + MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(0x92722C8581C2C92EULL, 0x766A0ABB650A7354ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4); + TMSG2 = _mm_add_epi32(TMSG2, TMP); + TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1); + + // Rounds 40-43 + MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(0xC76C51A3C24B8B70ULL, 0xA81A664BA2BFE8A1ULL)); + 
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4); + TMSG3 = _mm_add_epi32(TMSG3, TMP); + TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2); + + // Rounds 44-47 + MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(0x106AA070F40E3585ULL, 0xD6990624D192E819ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4); + TMSG0 = _mm_add_epi32(TMSG0, TMP); + TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3); + + // Rounds 48-51 + MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(0x34B0BCB52748774CULL, 0x1E376C0819A4C116ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4); + TMSG1 = _mm_add_epi32(TMSG1, TMP); + TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0); + + // Rounds 52-55 + MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(0x682E6FF35B9CCA4FULL, 0x4ED8AA4A391C0CB3ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4); + TMSG2 = _mm_add_epi32(TMSG2, TMP); + TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + + // Rounds 56-59 + MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(0x8CC7020884C87814ULL, 0x78A5636F748F82EEULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4); + TMSG3 = _mm_add_epi32(TMSG3, TMP); + TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + + // Rounds 60-63 + MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(0xC67178F2BEF9A3F7ULL, 0xA4506CEB90BEFFFAULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + + // Add values back to state + STATE0 = _mm_add_epi32(STATE0, ABEF_SAVE); + STATE1 = _mm_add_epi32(STATE1, CDGH_SAVE); + + TMP = _mm_shuffle_epi32(STATE0, 0x1B); // FEBA + STATE1 = _mm_shuffle_epi32(STATE1, 0xB1); // DCHG + STATE0 = _mm_blend_epi16(TMP, STATE1, 0xF0); // DCBA + STATE1 = _mm_alignr_epi8(STATE1, TMP, 8); // ABEF + + // Save state + _mm_store_si128((__m128i*) &state[0], STATE0); + _mm_store_si128((__m128i*) &state[4], STATE1); +} #else // no SHA diff --git a/algo/shabal/shabal-hash-4way.c b/algo/shabal/shabal-hash-4way.c index dffa18d1..c53cb39f 100644 --- a/algo/shabal/shabal-hash-4way.c +++ b/algo/shabal/shabal-hash-4way.c @@ -310,12 +310,13 @@ do { \ #define PERM_ELT8(xa0, xa1, xb0, xb1, xb2, xb3, xc, xm) \ do { \ - xa0 = _mm256_xor_si256( xm, _mm256_xor_si256( xb1, _mm256_xor_si256( \ + xa0 = mm256_xor3( xm, xb1, _mm256_xor_si256( \ _mm256_andnot_si256( xb3, xb2 ), \ - _mm256_mullo_epi32( _mm256_xor_si256( xa0, _mm256_xor_si256( xc, \ - _mm256_mullo_epi32( mm256_rol_32( xa1, 15 ), _mm256_set1_epi32(5UL) ) \ - ) ), _mm256_set1_epi32(3UL) ) ) ) ); \ - xb0 = mm256_not( _mm256_xor_si256( xa0, mm256_rol_32( xb0, 1 ) ) ); \ + _mm256_mullo_epi32( mm256_xor3( xa0, xc, \ + _mm256_mullo_epi32( mm256_rol_32( xa1, 15 ), \ + _mm256_set1_epi32(5UL) ) ), \ + _mm256_set1_epi32(3UL) ) ) ); \ + xb0 = mm256_xnor( xa0, 
mm256_rol_32( xb0, 1 ) ); \ } while (0) #define PERM_STEP_0_8 do { \ diff --git a/algo/skein/skein-hash-4way.c b/algo/skein/skein-hash-4way.c index d7cd4705..711d8ac2 100644 --- a/algo/skein/skein-hash-4way.c +++ b/algo/skein/skein-hash-4way.c @@ -309,22 +309,16 @@ static const uint64_t IV512[] = { sc->bcount = bcount; \ } while (0) -// AVX2 all scalar vars are now vectors representing 4 nonces in parallel - #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) #define TFBIG_KINIT_8WAY( k0, k1, k2, k3, k4, k5, k6, k7, k8, t0, t1, t2 ) \ do { \ - k8 = _mm512_xor_si512( _mm512_xor_si512( \ - _mm512_xor_si512( _mm512_xor_si512( k0, k1 ), \ - _mm512_xor_si512( k2, k3 ) ), \ - _mm512_xor_si512( _mm512_xor_si512( k4, k5 ), \ - _mm512_xor_si512( k6, k7 ) ) ), \ - m512_const1_64( 0x1BD11BDAA9FC1A22) ); \ + k8 = mm512_xor3( mm512_xor3( k0, k1, k2 ), mm512_xor3( k3, k4, k5 ), \ + mm512_xor3( k6, k7, m512_const1_64( 0x1BD11BDAA9FC1A22) ));\ t2 = t0 ^ t1; \ } while (0) - + #define TFBIG_ADDKEY_8WAY(w0, w1, w2, w3, w4, w5, w6, w7, k, t, s) \ do { \ w0 = _mm512_add_epi64( w0, SKBI(k,s,0) ); \ @@ -340,7 +334,6 @@ do { \ m512_const1_64( s ) ) ); \ } while (0) - #define TFBIG_MIX_8WAY(x0, x1, rc) \ do { \ x0 = _mm512_add_epi64( x0, x1 ); \ diff --git a/configure b/configure index 403892fc..8382a1bb 100755 --- a/configure +++ b/configure @@ -1,6 +1,6 @@ #! /bin/sh # Guess values for system-dependent variables and create Makefiles. -# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.16.5. +# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.17.0. # # # Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc. @@ -577,8 +577,8 @@ MAKEFLAGS= # Identity of this package. PACKAGE_NAME='cpuminer-opt' PACKAGE_TARNAME='cpuminer-opt' -PACKAGE_VERSION='3.16.5' -PACKAGE_STRING='cpuminer-opt 3.16.5' +PACKAGE_VERSION='3.17.0' +PACKAGE_STRING='cpuminer-opt 3.17.0' PACKAGE_BUGREPORT='' PACKAGE_URL='' @@ -1332,7 +1332,7 @@ if test "$ac_init_help" = "long"; then # Omit some internal or obsolete options to make the list less imposing. # This message is too long to be a string in the A/UX 3.1 sh. cat <<_ACEOF -\`configure' configures cpuminer-opt 3.16.5 to adapt to many kinds of systems. +\`configure' configures cpuminer-opt 3.17.0 to adapt to many kinds of systems. Usage: $0 [OPTION]... [VAR=VALUE]... @@ -1404,7 +1404,7 @@ fi if test -n "$ac_init_help"; then case $ac_init_help in - short | recursive ) echo "Configuration of cpuminer-opt 3.16.5:";; + short | recursive ) echo "Configuration of cpuminer-opt 3.17.0:";; esac cat <<\_ACEOF @@ -1509,7 +1509,7 @@ fi test -n "$ac_init_help" && exit $ac_status if $ac_init_version; then cat <<\_ACEOF -cpuminer-opt configure 3.16.5 +cpuminer-opt configure 3.17.0 generated by GNU Autoconf 2.69 Copyright (C) 2012 Free Software Foundation, Inc. @@ -2012,7 +2012,7 @@ cat >config.log <<_ACEOF This file contains any messages produced by compilers while running configure, to aid debugging if configure makes a mistake. -It was created by cpuminer-opt $as_me 3.16.5, which was +It was created by cpuminer-opt $as_me 3.17.0, which was generated by GNU Autoconf 2.69. Invocation command line was $ $0 $@ @@ -2993,7 +2993,7 @@ fi # Define the identity of the package. PACKAGE='cpuminer-opt' - VERSION='3.16.5' + VERSION='3.17.0' cat >>confdefs.h <<_ACEOF @@ -6690,7 +6690,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 # report actual input values of CONFIG_FILES etc. instead of their # values after options handling. 
ac_log=" -This file was extended by cpuminer-opt $as_me 3.16.5, which was +This file was extended by cpuminer-opt $as_me 3.17.0, which was generated by GNU Autoconf 2.69. Invocation command line was CONFIG_FILES = $CONFIG_FILES @@ -6756,7 +6756,7 @@ _ACEOF cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`" ac_cs_version="\\ -cpuminer-opt config.status 3.16.5 +cpuminer-opt config.status 3.17.0 configured by $0, generated by GNU Autoconf 2.69, with options \\"\$ac_cs_config\\" diff --git a/configure.ac b/configure.ac index c0aca337..f5612ef9 100644 --- a/configure.ac +++ b/configure.ac @@ -1,4 +1,4 @@ -AC_INIT([cpuminer-opt], [3.16.5]) +AC_INIT([cpuminer-opt], [3.17.0]) AC_PREREQ([2.59c]) AC_CANONICAL_SYSTEM diff --git a/cpu-miner.c b/cpu-miner.c index e46d920a..9b723766 100644 --- a/cpu-miner.c +++ b/cpu-miner.c @@ -2093,10 +2093,10 @@ static void stratum_gen_work( struct stratum_ctx *sctx, struct work *g_work ) sctx->block_height, net_diff, g_work->job_id ); else if ( !opt_quiet ) { - unsigned char *xnonce2str = abin2hex( g_work->xnonce2, - g_work->xnonce2_len ); - applog( LOG_INFO, "Extranonce2 %s, Block %d, Net Diff %.5g", - xnonce2str, sctx->block_height, net_diff ); + unsigned char *xnonce2str = bebin2hex( g_work->xnonce2, + g_work->xnonce2_len ); + applog( LOG_INFO, "Extranonce2 %s, Block %d, Job %s", + xnonce2str, sctx->block_height, g_work->job_id ); free( xnonce2str ); } diff --git a/miner.h b/miner.h index bea4f68a..9ca56b83 100644 --- a/miner.h +++ b/miner.h @@ -307,6 +307,7 @@ extern json_t *json_rpc_call( CURL *curl, const char *url, const char *userpass, extern void cbin2hex(char *out, const char *in, size_t len); void bin2hex( char *s, const unsigned char *p, size_t len ); char *abin2hex( const unsigned char *p, size_t len ); +char *bebin2hex( const unsigned char *p, size_t len ); bool hex2bin( unsigned char *p, const char *hexstr, size_t len ); bool jobj_binary( const json_t *obj, const char *key, void *buf, size_t buflen ); diff --git a/simd-utils/simd-256.h b/simd-utils/simd-256.h index 7a37012a..1b9fca80 100644 --- a/simd-utils/simd-256.h +++ b/simd-utils/simd-256.h @@ -136,9 +136,84 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n ) #define mm256_add4_8( a, b, c, d ) \ _mm256_add_epi8( _mm256_add_epi8( a, b ), _mm256_add_epi8( c, d ) ) +#if defined(__AVX512VL__) + +// AVX512 has ternary logic that supports any 3 input boolean expression. 
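// Illustrative sketch, not from the source tree: the imm8 argument to the
// ternarylogic intrinsics below is the 8 bit truth table of the desired
// 3 input function, indexed as (a << 2) | (b << 1) | c with the first
// operand as the most significant bit. The constants that follow can be
// derived, or double checked, with a small scalar helper (hypothetical
// name, for illustration only):

static inline unsigned char ternlog_imm8( int (*f)( int, int, int ) )
{
   unsigned char imm = 0;
   for ( int i = 0; i < 8; i++ )              // enumerate all (a,b,c) inputs
      if ( f( (i >> 2) & 1, (i >> 1) & 1, i & 1 ) )
         imm |= (unsigned char)( 1 << i );    // set the matching table bit
   return imm;
}

// Example checks of the constants used by the macros below:
//   f(a,b,c) = a ^ b ^ c       ->  0x96   (xor3)
//   f(a,b,c) = a ^ ( b & c )   ->  0x78   (xorand)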
+ +// a ^ b ^ c +#define mm256_xor3( a, b, c ) \ + _mm256_ternarylogic_epi64( a, b, c, 0x96 ) + +// legacy convenience only +#define mm256_xor4( a, b, c, d ) \ + _mm256_xor_si256( a, mm256_xor3( b, c, d ) ) + +// a & b & c +#define mm256_and3( a, b, c ) \ + _mm256_ternarylogic_epi64( a, b, c, 0x80 ) + +// a | b | c +#define mm256_or3( a, b, c ) \ + _mm256_ternarylogic_epi64( a, b, c, 0xfe ) + +// a ^ ( b & c ) +#define mm256_xorand( a, b, c ) \ + _mm256_ternarylogic_epi64( a, b, c, 0x78 ) + +// a & ( b ^ c ) +#define mm256_andxor( a, b, c ) \ + _mm256_ternarylogic_epi64( a, b, c, 0x60 ) + +// a ^ ( b | c ) +#define mm256_xoror( a, b, c ) \ + _mm256_ternarylogic_epi64( a, b, c, 0x1e ) + +// a ^ ( ~b & c ) +#define mm256_xorandnot( a, b, c ) \ + _mm256_ternarylogic_epi64( a, b, c, 0xd2 ) + +// a | ( b & c ) +#define mm256_orand( a, b, c ) \ + _mm256_ternarylogic_epi64( a, b, c, 0xf8 ) + +// ~( a ^ b ), same as (~a) ^ b +#define mm256_xnor( a, b ) \ + _mm256_ternarylogic_epi64( a, b, b, 0x81 ) + +#else + +#define mm256_xor3( a, b, c ) \ + _mm256_xor_si256( a, _mm256_xor_si256( b, c ) ) + #define mm256_xor4( a, b, c, d ) \ _mm256_xor_si256( _mm256_xor_si256( a, b ), _mm256_xor_si256( c, d ) ) +#define mm256_and3( a, b, c ) \ + _mm256_and_si256( a, _mm256_and_si256( b, c ) ) + +#define mm256_or3( a, b, c ) \ + _mm256_or_si256( a, _mm256_or_si256( b, c ) ) + +#define mm256_xorand( a, b, c ) \ + _mm256_xor_si256( a, _mm256_and_si256( b, c ) ) + +#define mm256_andxor( a, b, c ) \ + _mm256_and_si256( a, _mm256_xor_si256( b, c )) + +#define mm256_xoror( a, b, c ) \ + _mm256_xor_si256( a, _mm256_or_si256( b, c ) ) + +#define mm256_xorandnot( a, b, c ) \ + _mm256_xor_si256( a, _mm256_andnot_si256( b, c ) ) + +#define mm256_orand( a, b, c ) \ + _mm256_or_si256( a, _mm256_and_si256( b, c ) ) + +#define mm256_xnor( a, b ) \ + mm256_not( _mm256_xor_si256( a, b ) ) + +#endif + // // Bit rotations. // diff --git a/simd-utils/simd-512.h b/simd-utils/simd-512.h index 22c5331a..e6b7ac22 100644 --- a/simd-utils/simd-512.h +++ b/simd-utils/simd-512.h @@ -61,7 +61,7 @@ // // Additionally, permutations using smaller vectors can be more efficient // if the permutation doesn't cross lane boundaries, typically 128 bits, -// and the smnaller vector can use an imm comtrol. +// and the smaller vector can use an imm comtrol. // // If the permutation doesn't cross lane boundaries a shuffle instructions // can be used with imm control instead of permute. @@ -107,7 +107,7 @@ static inline __m512i m512_const_64( const uint64_t i7, const uint64_t i6, return v.m512i; } -// Equivalent of set1, broadcast lo element all elements. +// Equivalent of set1, broadcast lo element to all elements. 
 static inline __m512i m512_const1_256( const __m256i v )
 {  return _mm512_inserti64x4( _mm512_castsi256_si512( v ), v, 1 ); }
@@ -166,7 +166,9 @@ static inline __m512i m512_const4_64( const uint64_t i3, const uint64_t i2,
 // Basic operations without SIMD equivalent
 
 // ~x
-#define mm512_not( x )       _mm512_xor_si512( x, m512_neg1 )
+// #define mm512_not( x )       _mm512_xor_si512( x, m512_neg1 )
+static inline __m512i mm512_not( const __m512i x )
+{  return _mm512_ternarylogic_epi64( x, x, x, 1 ); }
 
 // -x
 #define mm512_negate_64( x ) _mm512_sub_epi64( m512_zero, x )
@@ -221,11 +223,61 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n )
 #define mm512_add4_8( a, b, c, d ) \
    _mm512_add_epi8( _mm512_add_epi8( a, b ), _mm512_add_epi8( c, d ) )
 
+//
+// Ternary logic uses an 8 bit truth table to define any 3 input logical
+// operation using any combination of AND, OR, XOR, NOT.
+
+// a ^ b ^ c
+#define mm512_xor3( a, b, c ) \
+   _mm512_ternarylogic_epi64( a, b, c, 0x96 )
+
+// legacy convenience only
 #define mm512_xor4( a, b, c, d ) \
-   _mm512_xor_si512( _mm512_xor_si512( a, b ), _mm512_xor_si512( c, d ) )
+   _mm512_xor_si512( a, mm512_xor3( b, c, d ) )
+
+// a & b & c
+#define mm512_and3( a, b, c ) \
+   _mm512_ternarylogic_epi64( a, b, c, 0x80 )
+
+// a | b | c
+#define mm512_or3( a, b, c ) \
+   _mm512_ternarylogic_epi64( a, b, c, 0xfe )
+
+// a ^ ( b & c )
+#define mm512_xorand( a, b, c ) \
+   _mm512_ternarylogic_epi64( a, b, c, 0x78 )
+
+// a & ( b ^ c )
+#define mm512_andxor( a, b, c ) \
+   _mm512_ternarylogic_epi64( a, b, c, 0x60 )
+
+// a ^ ( b | c )
+#define mm512_xoror( a, b, c ) \
+   _mm512_ternarylogic_epi64( a, b, c, 0x1e )
+
+// a ^ ( ~b & c )    [ xor( a, andnot( b, c ) ) ]
+#define mm512_xorandnot( a, b, c ) \
+   _mm512_ternarylogic_epi64( a, b, c, 0xd2 )
+
+// a | ( b & c )
+#define mm512_orand( a, b, c ) \
+   _mm512_ternarylogic_epi64( a, b, c, 0xf8 )
+
+// Some 2 input operations that don't have their own instruction mnemonic.
+
+// ~( a | b )
+#define mm512_nor( a, b ) \
+   _mm512_ternarylogic_epi64( a, b, b, 0x01 )
+
+// ~( a ^ b ),  same as (~a) ^ b
+#define mm512_xnor( a, b ) \
+   _mm512_ternarylogic_epi64( a, b, b, 0x81 )
+
+// ~( a & b )
#define mm512_nand( a, b ) \
+   _mm512_ternarylogic_epi64( a, b, b, 0x3f )
 
-// 
 // Bit rotations. 
// AVX512F has built-in fixed and variable bit rotation for 64 & 32 bit diff --git a/util.c b/util.c index 2bfc8095..b96c4fe0 100644 --- a/util.c +++ b/util.c @@ -795,6 +795,15 @@ char *abin2hex(const unsigned char *p, size_t len) return s; } +char *bebin2hex(const unsigned char *p, size_t len) +{ + char *s = (char*) malloc((len * 2) + 1); + if (!s) return NULL; + for ( size_t i = 0, j = len - 1; i < len; i++, j-- ) + sprintf( s + ( i*2 ), "%02x", (unsigned int) p[ j ] ); + return s; +} + bool hex2bin(unsigned char *p, const char *hexstr, size_t len) { char hex_byte[3]; From 9b905fccc87884ec028102c70e7f454999d5a80d Mon Sep 17 00:00:00 2001 From: Jay D Dee Date: Mon, 26 Jul 2021 15:01:37 -0400 Subject: [PATCH 10/20] v3.17.1 --- RELEASE_NOTES | 6 + algo/blake/blake-hash-4way.h | 1 + algo/blake/blake256-hash-4way.c | 32 ++--- algo/blake/blake2b-hash-4way.c | 16 +-- algo/blake/blake2s-gate.h | 3 - algo/blake/blake2s-hash-4way.c | 4 +- algo/blake/blake512-hash-4way.c | 128 +++++++---------- algo/bmw/bmw256-hash-4way.c | 168 +++++----------------- algo/bmw/bmw512-hash-4way.c | 13 +- algo/echo/aes_ni/hash.c | 41 +++++- algo/echo/echo-hash-4way.c | 82 +++++++++-- algo/fugue/fugue-aesni.c | 155 +++++++++++--------- algo/groestl/aes_ni/groestl-intr-aes.h | 98 ++++++++++++- algo/groestl/aes_ni/groestl256-intr-aes.h | 98 ++++++++++++- algo/groestl/groestl256-intr-4way.h | 97 ++++++++++++- algo/groestl/groestl512-intr-4way.h | 160 ++++++++++----------- algo/groestl/myrgr-4way.c | 25 ++-- algo/sha/hmac-sha256-hash.c | 50 +------ algo/sha/hmac-sha256-hash.h | 10 -- algo/sha/sph_sha2.c | 73 +++++++++- algo/shavite/shavite-hash-4way.c | 14 +- algo/shavite/sph-shavite-aesni.c | 9 -- algo/simd/simd-hash-2way.c | 10 +- algo/simd/vector.c | 4 - algo/yespower/yespower-opt.c | 61 ++++++-- algo/yespower/yespower.h | 1 - build-allarch.sh | 2 +- configure | 20 +-- configure.ac | 2 +- cpu-miner.c | 26 ++-- simd-utils/simd-128.h | 19 +++ simd-utils/simd-256.h | 24 +++- winbuild-cross.sh | 2 +- 33 files changed, 889 insertions(+), 565 deletions(-) diff --git a/RELEASE_NOTES b/RELEASE_NOTES index cdacd323..3f6b080d 100644 --- a/RELEASE_NOTES +++ b/RELEASE_NOTES @@ -65,6 +65,12 @@ If not what makes it happen or not happen? Change Log ---------- +v3.17.1 + +Fixed Windows build for AES+SSE4.2 (Westmere), was missing AES. +More ternary logic optimizations for AVX512, AVX512+VAES, and AVX512+AES. +Fixed my-gr algo for VAES. + v3.17.0 AVX512 optimized using ternary logic instructions. 
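The ternary logic change noted above follows one pattern throughout the hunks
below: a chain of two 512-bit (or, with AVX512VL, 256-bit) XORs is collapsed
into a single VPTERNLOG instruction. A minimal sketch of the before and after
shapes, assuming an AVX512F target (function names are illustrative, not from
the tree):

#include <immintrin.h>

// Before: two vpxorq instructions forming a dependency chain.
static inline __m512i xor3_chained( __m512i a, __m512i b, __m512i c )
{  return _mm512_xor_si512( a, _mm512_xor_si512( b, c ) );  }

// After: one vpternlogq; 0x96 is the truth table of a ^ b ^ c.
static inline __m512i xor3_ternlog( __m512i a, __m512i b, __m512i c )
{  return _mm512_ternarylogic_epi64( a, b, c, 0x96 );  }
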
diff --git a/algo/blake/blake-hash-4way.h b/algo/blake/blake-hash-4way.h index fc64583d..a5d74e0a 100644 --- a/algo/blake/blake-hash-4way.h +++ b/algo/blake/blake-hash-4way.h @@ -180,6 +180,7 @@ void blake512_8way_update( void *cc, const void *data, size_t len ); void blake512_8way_close( void *cc, void *dst ); void blake512_8way_full( blake_8way_big_context *sc, void * dst, const void *data, size_t len ); +void blake512_8way_hash_le80( void *hash, const void *data ); #endif // AVX512 #endif // AVX2 diff --git a/algo/blake/blake256-hash-4way.c b/algo/blake/blake256-hash-4way.c index 3de03633..65fbe1fa 100644 --- a/algo/blake/blake256-hash-4way.c +++ b/algo/blake/blake256-hash-4way.c @@ -669,14 +669,14 @@ do { \ ROUND_S_8WAY(2); \ ROUND_S_8WAY(3); \ } \ - H0 = _mm256_xor_si256( _mm256_xor_si256( V8, V0 ), H0 ); \ - H1 = _mm256_xor_si256( _mm256_xor_si256( V9, V1 ), H1 ); \ - H2 = _mm256_xor_si256( _mm256_xor_si256( VA, V2 ), H2 ); \ - H3 = _mm256_xor_si256( _mm256_xor_si256( VB, V3 ), H3 ); \ - H4 = _mm256_xor_si256( _mm256_xor_si256( VC, V4 ), H4 ); \ - H5 = _mm256_xor_si256( _mm256_xor_si256( VD, V5 ), H5 ); \ - H6 = _mm256_xor_si256( _mm256_xor_si256( VE, V6 ), H6 ); \ - H7 = _mm256_xor_si256( _mm256_xor_si256( VF, V7 ), H7 ); \ + H0 = mm256_xor3( V8, V0, H0 ); \ + H1 = mm256_xor3( V9, V1, H1 ); \ + H2 = mm256_xor3( VA, V2, H2 ); \ + H3 = mm256_xor3( VB, V3, H3 ); \ + H4 = mm256_xor3( VC, V4, H4 ); \ + H5 = mm256_xor3( VD, V5, H5 ); \ + H6 = mm256_xor3( VE, V6, H6 ); \ + H7 = mm256_xor3( VF, V7, H7 ); \ } while (0) @@ -808,14 +808,14 @@ do { \ ROUND_S_16WAY(2); \ ROUND_S_16WAY(3); \ } \ - H0 = _mm512_xor_si512( _mm512_xor_si512( V8, V0 ), H0 ); \ - H1 = _mm512_xor_si512( _mm512_xor_si512( V9, V1 ), H1 ); \ - H2 = _mm512_xor_si512( _mm512_xor_si512( VA, V2 ), H2 ); \ - H3 = _mm512_xor_si512( _mm512_xor_si512( VB, V3 ), H3 ); \ - H4 = _mm512_xor_si512( _mm512_xor_si512( VC, V4 ), H4 ); \ - H5 = _mm512_xor_si512( _mm512_xor_si512( VD, V5 ), H5 ); \ - H6 = _mm512_xor_si512( _mm512_xor_si512( VE, V6 ), H6 ); \ - H7 = _mm512_xor_si512( _mm512_xor_si512( VF, V7 ), H7 ); \ + H0 = mm512_xor3( V8, V0, H0 ); \ + H1 = mm512_xor3( V9, V1, H1 ); \ + H2 = mm512_xor3( VA, V2, H2 ); \ + H3 = mm512_xor3( VB, V3, H3 ); \ + H4 = mm512_xor3( VC, V4, H4 ); \ + H5 = mm512_xor3( VD, V5, H5 ); \ + H6 = mm512_xor3( VE, V6, H6 ); \ + H7 = mm512_xor3( VF, V7, H7 ); \ } while (0) #endif diff --git a/algo/blake/blake2b-hash-4way.c b/algo/blake/blake2b-hash-4way.c index d9853c2e..f4824434 100644 --- a/algo/blake/blake2b-hash-4way.c +++ b/algo/blake/blake2b-hash-4way.c @@ -122,14 +122,14 @@ static void blake2b_8way_compress( blake2b_8way_ctx *ctx, int last ) B2B8W_G( 3, 4, 9, 14, m[ sigma[i][14] ], m[ sigma[i][15] ] ); } - ctx->h[0] = _mm512_xor_si512( _mm512_xor_si512( ctx->h[0], v[0] ), v[ 8] ); - ctx->h[1] = _mm512_xor_si512( _mm512_xor_si512( ctx->h[1], v[1] ), v[ 9] ); - ctx->h[2] = _mm512_xor_si512( _mm512_xor_si512( ctx->h[2], v[2] ), v[10] ); - ctx->h[3] = _mm512_xor_si512( _mm512_xor_si512( ctx->h[3], v[3] ), v[11] ); - ctx->h[4] = _mm512_xor_si512( _mm512_xor_si512( ctx->h[4], v[4] ), v[12] ); - ctx->h[5] = _mm512_xor_si512( _mm512_xor_si512( ctx->h[5], v[5] ), v[13] ); - ctx->h[6] = _mm512_xor_si512( _mm512_xor_si512( ctx->h[6], v[6] ), v[14] ); - ctx->h[7] = _mm512_xor_si512( _mm512_xor_si512( ctx->h[7], v[7] ), v[15] ); + ctx->h[0] = mm512_xor3( ctx->h[0], v[0], v[ 8] ); + ctx->h[1] = mm512_xor3( ctx->h[1], v[1], v[ 9] ); + ctx->h[2] = mm512_xor3( ctx->h[2], v[2], v[10] ); + ctx->h[3] = mm512_xor3( 
ctx->h[3], v[3], v[11] ); + ctx->h[4] = mm512_xor3( ctx->h[4], v[4], v[12] ); + ctx->h[5] = mm512_xor3( ctx->h[5], v[5], v[13] ); + ctx->h[6] = mm512_xor3( ctx->h[6], v[6], v[14] ); + ctx->h[7] = mm512_xor3( ctx->h[7], v[7], v[15] ); } int blake2b_8way_init( blake2b_8way_ctx *ctx ) diff --git a/algo/blake/blake2s-gate.h b/algo/blake/blake2s-gate.h index 4c621b40..4a7942c3 100644 --- a/algo/blake/blake2s-gate.h +++ b/algo/blake/blake2s-gate.h @@ -4,7 +4,6 @@ #include #include "algo-gate-api.h" -//#if defined(__SSE4_2__) #if defined(__SSE2__) #define BLAKE2S_4WAY #endif @@ -27,8 +26,6 @@ int scanhash_blake2s_16way( struct work *work, uint32_t max_nonce, #elif defined (BLAKE2S_8WAY) -//#if defined(BLAKE2S_8WAY) - void blake2s_8way_hash( void *state, const void *input ); int scanhash_blake2s_8way( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ); diff --git a/algo/blake/blake2s-hash-4way.c b/algo/blake/blake2s-hash-4way.c index 094edd0b..190ad0b7 100644 --- a/algo/blake/blake2s-hash-4way.c +++ b/algo/blake/blake2s-hash-4way.c @@ -368,7 +368,7 @@ do { \ ROUND8W( 9 ); for( size_t i = 0; i < 8; ++i ) - S->h[i] = _mm256_xor_si256( _mm256_xor_si256( S->h[i], v[i] ), v[i + 8] ); + S->h[i] = mm256_xor3( S->h[i], v[i], v[i + 8] ); #undef G8W #undef ROUND8W @@ -566,7 +566,7 @@ do { \ ROUND16W( 9 ); for( size_t i = 0; i < 8; ++i ) - S->h[i] = _mm512_xor_si512( _mm512_xor_si512( S->h[i], v[i] ), v[i + 8] ); + S->h[i] = mm512_xor3( S->h[i], v[i], v[i + 8] ); #undef G16W #undef ROUND16W diff --git a/algo/blake/blake512-hash-4way.c b/algo/blake/blake512-hash-4way.c index a5d53948..d1b5d2bf 100644 --- a/algo/blake/blake512-hash-4way.c +++ b/algo/blake/blake512-hash-4way.c @@ -293,10 +293,6 @@ static const sph_u64 CB[16] = { H5 = (state)->H[5]; \ H6 = (state)->H[6]; \ H7 = (state)->H[7]; \ - S0 = (state)->S[0]; \ - S1 = (state)->S[1]; \ - S2 = (state)->S[2]; \ - S3 = (state)->S[3]; \ T0 = (state)->T0; \ T1 = (state)->T1; \ } while (0) @@ -310,10 +306,6 @@ static const sph_u64 CB[16] = { (state)->H[5] = H5; \ (state)->H[6] = H6; \ (state)->H[7] = H7; \ - (state)->S[0] = S0; \ - (state)->S[1] = S1; \ - (state)->S[2] = S2; \ - (state)->S[3] = S3; \ (state)->T0 = T0; \ (state)->T1 = T1; \ } while (0) @@ -348,7 +340,6 @@ static const sph_u64 CB[16] = { #define DECL_STATE64_8WAY \ __m512i H0, H1, H2, H3, H4, H5, H6, H7; \ - __m512i S0, S1, S2, S3; \ uint64_t T0, T1; #define COMPRESS64_8WAY( buf ) do \ @@ -366,10 +357,10 @@ static const sph_u64 CB[16] = { V5 = H5; \ V6 = H6; \ V7 = H7; \ - V8 = _mm512_xor_si512( S0, m512_const1_64( CB0 ) ); \ - V9 = _mm512_xor_si512( S1, m512_const1_64( CB1 ) ); \ - VA = _mm512_xor_si512( S2, m512_const1_64( CB2 ) ); \ - VB = _mm512_xor_si512( S3, m512_const1_64( CB3 ) ); \ + V8 = m512_const1_64( CB0 ); \ + V9 = m512_const1_64( CB1 ); \ + VA = m512_const1_64( CB2 ); \ + VB = m512_const1_64( CB3 ); \ VC = _mm512_xor_si512( _mm512_set1_epi64( T0 ), \ m512_const1_64( CB4 ) ); \ VD = _mm512_xor_si512( _mm512_set1_epi64( T0 ), \ @@ -414,14 +405,14 @@ static const sph_u64 CB[16] = { ROUND_B_8WAY(3); \ ROUND_B_8WAY(4); \ ROUND_B_8WAY(5); \ - H0 = mm512_xor4( V8, V0, S0, H0 ); \ - H1 = mm512_xor4( V9, V1, S1, H1 ); \ - H2 = mm512_xor4( VA, V2, S2, H2 ); \ - H3 = mm512_xor4( VB, V3, S3, H3 ); \ - H4 = mm512_xor4( VC, V4, S0, H4 ); \ - H5 = mm512_xor4( VD, V5, S1, H5 ); \ - H6 = mm512_xor4( VE, V6, S2, H6 ); \ - H7 = mm512_xor4( VF, V7, S3, H7 ); \ + H0 = mm512_xor3( V8, V0, H0 ); \ + H1 = mm512_xor3( V9, V1, H1 ); \ + H2 = mm512_xor3( VA, V2, H2 ); \ + 
H3 = mm512_xor3( VB, V3, H3 ); \ + H4 = mm512_xor3( VC, V4, H4 ); \ + H5 = mm512_xor3( VD, V5, H5 ); \ + H6 = mm512_xor3( VE, V6, H6 ); \ + H7 = mm512_xor3( VF, V7, H7 ); \ } while (0) void blake512_8way_compress( blake_8way_big_context *sc ) @@ -440,10 +431,10 @@ void blake512_8way_compress( blake_8way_big_context *sc ) V5 = sc->H[5]; V6 = sc->H[6]; V7 = sc->H[7]; - V8 = _mm512_xor_si512( sc->S[0], m512_const1_64( CB0 ) ); - V9 = _mm512_xor_si512( sc->S[1], m512_const1_64( CB1 ) ); - VA = _mm512_xor_si512( sc->S[2], m512_const1_64( CB2 ) ); - VB = _mm512_xor_si512( sc->S[3], m512_const1_64( CB3 ) ); + V8 = m512_const1_64( CB0 ); + V9 = m512_const1_64( CB1 ); + VA = m512_const1_64( CB2 ); + VB = m512_const1_64( CB3 ); VC = _mm512_xor_si512( _mm512_set1_epi64( sc->T0 ), m512_const1_64( CB4 ) ); VD = _mm512_xor_si512( _mm512_set1_epi64( sc->T0 ), @@ -492,19 +483,18 @@ void blake512_8way_compress( blake_8way_big_context *sc ) ROUND_B_8WAY(4); ROUND_B_8WAY(5); - sc->H[0] = mm512_xor4( V8, V0, sc->S[0], sc->H[0] ); - sc->H[1] = mm512_xor4( V9, V1, sc->S[1], sc->H[1] ); - sc->H[2] = mm512_xor4( VA, V2, sc->S[2], sc->H[2] ); - sc->H[3] = mm512_xor4( VB, V3, sc->S[3], sc->H[3] ); - sc->H[4] = mm512_xor4( VC, V4, sc->S[0], sc->H[4] ); - sc->H[5] = mm512_xor4( VD, V5, sc->S[1], sc->H[5] ); - sc->H[6] = mm512_xor4( VE, V6, sc->S[2], sc->H[6] ); - sc->H[7] = mm512_xor4( VF, V7, sc->S[3], sc->H[7] ); + sc->H[0] = mm512_xor3( V8, V0, sc->H[0] ); + sc->H[1] = mm512_xor3( V9, V1, sc->H[1] ); + sc->H[2] = mm512_xor3( VA, V2, sc->H[2] ); + sc->H[3] = mm512_xor3( VB, V3, sc->H[3] ); + sc->H[4] = mm512_xor3( VC, V4, sc->H[4] ); + sc->H[5] = mm512_xor3( VD, V5, sc->H[5] ); + sc->H[6] = mm512_xor3( VE, V6, sc->H[6] ); + sc->H[7] = mm512_xor3( VF, V7, sc->H[7] ); } void blake512_8way_init( blake_8way_big_context *sc ) { - __m512i zero = m512_zero; casti_m512i( sc->H, 0 ) = m512_const1_64( 0x6A09E667F3BCC908 ); casti_m512i( sc->H, 1 ) = m512_const1_64( 0xBB67AE8584CAA73B ); casti_m512i( sc->H, 2 ) = m512_const1_64( 0x3C6EF372FE94F82B ); @@ -514,11 +504,6 @@ void blake512_8way_init( blake_8way_big_context *sc ) casti_m512i( sc->H, 6 ) = m512_const1_64( 0x1F83D9ABFB41BD6B ); casti_m512i( sc->H, 7 ) = m512_const1_64( 0x5BE0CD19137E2179 ); - casti_m512i( sc->S, 0 ) = zero; - casti_m512i( sc->S, 1 ) = zero; - casti_m512i( sc->S, 2 ) = zero; - casti_m512i( sc->S, 3 ) = zero; - sc->T0 = sc->T1 = 0; sc->ptr = 0; } @@ -641,11 +626,6 @@ void blake512_8way_full( blake_8way_big_context *sc, void * dst, casti_m512i( sc->H, 6 ) = m512_const1_64( 0x1F83D9ABFB41BD6B ); casti_m512i( sc->H, 7 ) = m512_const1_64( 0x5BE0CD19137E2179 ); - casti_m512i( sc->S, 0 ) = m512_zero; - casti_m512i( sc->S, 1 ) = m512_zero; - casti_m512i( sc->S, 2 ) = m512_zero; - casti_m512i( sc->S, 3 ) = m512_zero; - sc->T0 = sc->T1 = 0; sc->ptr = 0; @@ -740,7 +720,6 @@ blake512_8way_close(void *cc, void *dst) #define DECL_STATE64_4WAY \ __m256i H0, H1, H2, H3, H4, H5, H6, H7; \ - __m256i S0, S1, S2, S3; \ uint64_t T0, T1; #define COMPRESS64_4WAY do \ @@ -758,10 +737,10 @@ blake512_8way_close(void *cc, void *dst) V5 = H5; \ V6 = H6; \ V7 = H7; \ - V8 = _mm256_xor_si256( S0, m256_const1_64( CB0 ) ); \ - V9 = _mm256_xor_si256( S1, m256_const1_64( CB1 ) ); \ - VA = _mm256_xor_si256( S2, m256_const1_64( CB2 ) ); \ - VB = _mm256_xor_si256( S3, m256_const1_64( CB3 ) ); \ + V8 = m256_const1_64( CB0 ); \ + V9 = m256_const1_64( CB1 ); \ + VA = m256_const1_64( CB2 ); \ + VB = m256_const1_64( CB3 ); \ VC = _mm256_xor_si256( _mm256_set1_epi64x( T0 ), \ m256_const1_64( 
CB4 ) ); \ VD = _mm256_xor_si256( _mm256_set1_epi64x( T0 ), \ @@ -804,14 +783,14 @@ blake512_8way_close(void *cc, void *dst) ROUND_B_4WAY(3); \ ROUND_B_4WAY(4); \ ROUND_B_4WAY(5); \ - H0 = mm256_xor4( V8, V0, S0, H0 ); \ - H1 = mm256_xor4( V9, V1, S1, H1 ); \ - H2 = mm256_xor4( VA, V2, S2, H2 ); \ - H3 = mm256_xor4( VB, V3, S3, H3 ); \ - H4 = mm256_xor4( VC, V4, S0, H4 ); \ - H5 = mm256_xor4( VD, V5, S1, H5 ); \ - H6 = mm256_xor4( VE, V6, S2, H6 ); \ - H7 = mm256_xor4( VF, V7, S3, H7 ); \ + H0 = mm256_xor3( V8, V0, H0 ); \ + H1 = mm256_xor3( V9, V1, H1 ); \ + H2 = mm256_xor3( VA, V2, H2 ); \ + H3 = mm256_xor3( VB, V3, H3 ); \ + H4 = mm256_xor3( VC, V4, H4 ); \ + H5 = mm256_xor3( VD, V5, H5 ); \ + H6 = mm256_xor3( VE, V6, H6 ); \ + H7 = mm256_xor3( VF, V7, H7 ); \ } while (0) @@ -831,10 +810,10 @@ void blake512_4way_compress( blake_4way_big_context *sc ) V5 = sc->H[5]; V6 = sc->H[6]; V7 = sc->H[7]; - V8 = _mm256_xor_si256( sc->S[0], m256_const1_64( CB0 ) ); - V9 = _mm256_xor_si256( sc->S[1], m256_const1_64( CB1 ) ); - VA = _mm256_xor_si256( sc->S[2], m256_const1_64( CB2 ) ); - VB = _mm256_xor_si256( sc->S[3], m256_const1_64( CB3 ) ); + V8 = m256_const1_64( CB0 ); + V9 = m256_const1_64( CB1 ); + VA = m256_const1_64( CB2 ); + VB = m256_const1_64( CB3 ); VC = _mm256_xor_si256( _mm256_set1_epi64x( sc->T0 ), m256_const1_64( CB4 ) ); VD = _mm256_xor_si256( _mm256_set1_epi64x( sc->T0 ), @@ -880,19 +859,18 @@ void blake512_4way_compress( blake_4way_big_context *sc ) ROUND_B_4WAY(4); ROUND_B_4WAY(5); - sc->H[0] = mm256_xor4( V8, V0, sc->S[0], sc->H[0] ); - sc->H[1] = mm256_xor4( V9, V1, sc->S[1], sc->H[1] ); - sc->H[2] = mm256_xor4( VA, V2, sc->S[2], sc->H[2] ); - sc->H[3] = mm256_xor4( VB, V3, sc->S[3], sc->H[3] ); - sc->H[4] = mm256_xor4( VC, V4, sc->S[0], sc->H[4] ); - sc->H[5] = mm256_xor4( VD, V5, sc->S[1], sc->H[5] ); - sc->H[6] = mm256_xor4( VE, V6, sc->S[2], sc->H[6] ); - sc->H[7] = mm256_xor4( VF, V7, sc->S[3], sc->H[7] ); + sc->H[0] = mm256_xor3( V8, V0, sc->H[0] ); + sc->H[1] = mm256_xor3( V9, V1, sc->H[1] ); + sc->H[2] = mm256_xor3( VA, V2, sc->H[2] ); + sc->H[3] = mm256_xor3( VB, V3, sc->H[3] ); + sc->H[4] = mm256_xor3( VC, V4, sc->H[4] ); + sc->H[5] = mm256_xor3( VD, V5, sc->H[5] ); + sc->H[6] = mm256_xor3( VE, V6, sc->H[6] ); + sc->H[7] = mm256_xor3( VF, V7, sc->H[7] ); } void blake512_4way_init( blake_4way_big_context *sc ) { - __m256i zero = m256_zero; casti_m256i( sc->H, 0 ) = m256_const1_64( 0x6A09E667F3BCC908 ); casti_m256i( sc->H, 1 ) = m256_const1_64( 0xBB67AE8584CAA73B ); casti_m256i( sc->H, 2 ) = m256_const1_64( 0x3C6EF372FE94F82B ); @@ -902,11 +880,6 @@ void blake512_4way_init( blake_4way_big_context *sc ) casti_m256i( sc->H, 6 ) = m256_const1_64( 0x1F83D9ABFB41BD6B ); casti_m256i( sc->H, 7 ) = m256_const1_64( 0x5BE0CD19137E2179 ); - casti_m256i( sc->S, 0 ) = zero; - casti_m256i( sc->S, 1 ) = zero; - casti_m256i( sc->S, 2 ) = zero; - casti_m256i( sc->S, 3 ) = zero; - sc->T0 = sc->T1 = 0; sc->ptr = 0; } @@ -1026,11 +999,6 @@ void blake512_4way_full( blake_4way_big_context *sc, void * dst, casti_m256i( sc->H, 6 ) = m256_const1_64( 0x1F83D9ABFB41BD6B ); casti_m256i( sc->H, 7 ) = m256_const1_64( 0x5BE0CD19137E2179 ); - casti_m256i( sc->S, 0 ) = m256_zero; - casti_m256i( sc->S, 1 ) = m256_zero; - casti_m256i( sc->S, 2 ) = m256_zero; - casti_m256i( sc->S, 3 ) = m256_zero; - sc->T0 = sc->T1 = 0; sc->ptr = 0; diff --git a/algo/bmw/bmw256-hash-4way.c b/algo/bmw/bmw256-hash-4way.c index 92e71836..8b9de767 100644 --- a/algo/bmw/bmw256-hash-4way.c +++ b/algo/bmw/bmw256-hash-4way.c @@ 
-867,40 +867,35 @@ void compress_small_8way( const __m256i *M, const __m256i H[16], qt[30] = expand2s8( qt, M, H, 30 ); qt[31] = expand2s8( qt, M, H, 31 ); - xl = _mm256_xor_si256( - mm256_xor4( qt[16], qt[17], qt[18], qt[19] ), - mm256_xor4( qt[20], qt[21], qt[22], qt[23] ) ); - xh = _mm256_xor_si256( xl, _mm256_xor_si256( - mm256_xor4( qt[24], qt[25], qt[26], qt[27] ), - mm256_xor4( qt[28], qt[29], qt[30], qt[31] ) ) ); + xl = mm256_xor3( mm256_xor3( qt[16], qt[17], qt[18] ), + mm256_xor3( qt[19], qt[20], qt[21] ), + _mm256_xor_si256( qt[22], qt[23] ) ); + + xh = mm256_xor3( mm256_xor3( xl, qt[24], qt[25] ), + mm256_xor3( qt[26], qt[27], qt[28] ), + mm256_xor3( qt[29], qt[30], qt[31] ) ); #define DH1L( m, sl, sr, a, b, c ) \ - _mm256_add_epi32( \ - _mm256_xor_si256( M[m], \ - _mm256_xor_si256( _mm256_slli_epi32( xh, sl ), \ - _mm256_srli_epi32( qt[a], sr ) ) ), \ - _mm256_xor_si256( _mm256_xor_si256( xl, qt[b] ), qt[c] ) ) + _mm256_add_epi32( mm256_xor3( M[m], _mm256_slli_epi32( xh, sl ), \ + _mm256_srli_epi32( qt[a], sr ) ), \ + mm256_xor3( xl, qt[b], qt[c] ) ) #define DH1R( m, sl, sr, a, b, c ) \ - _mm256_add_epi32( \ - _mm256_xor_si256( M[m], \ - _mm256_xor_si256( _mm256_srli_epi32( xh, sl ), \ - _mm256_slli_epi32( qt[a], sr ) ) ), \ - _mm256_xor_si256( _mm256_xor_si256( xl, qt[b] ), qt[c] ) ) + _mm256_add_epi32( mm256_xor3( M[m], _mm256_srli_epi32( xh, sl ), \ + _mm256_slli_epi32( qt[a], sr ) ), \ + mm256_xor3( xl, qt[b], qt[c] ) ) #define DH2L( m, rl, sl, h, a, b, c ) \ _mm256_add_epi32( _mm256_add_epi32( \ - mm256_rol_32( dH[h], rl ), \ - _mm256_xor_si256( _mm256_xor_si256( xh, qt[a] ), M[m] )), \ - _mm256_xor_si256( _mm256_slli_epi32( xl, sl ), \ - _mm256_xor_si256( qt[b], qt[c] ) ) ); + mm256_rol_32( dH[h], rl ), \ + mm256_xor3( xh, qt[a], M[m] ) ), \ + mm256_xor3( _mm256_slli_epi32( xl, sl ), qt[b], qt[c] ) ) #define DH2R( m, rl, sr, h, a, b, c ) \ _mm256_add_epi32( _mm256_add_epi32( \ - mm256_rol_32( dH[h], rl ), \ - _mm256_xor_si256( _mm256_xor_si256( xh, qt[a] ), M[m] )), \ - _mm256_xor_si256( _mm256_srli_epi32( xl, sr ), \ - _mm256_xor_si256( qt[b], qt[c] ) ) ); + mm256_rol_32( dH[h], rl ), \ + mm256_xor3( xh, qt[a], M[m] ) ), \ + mm256_xor3( _mm256_srli_epi32( xl, sr ), qt[b], qt[c] ) ) dH[ 0] = DH1L( 0, 5, 5, 16, 24, 0 ); dH[ 1] = DH1R( 1, 7, 8, 17, 25, 1 ); @@ -924,88 +919,6 @@ void compress_small_8way( const __m256i *M, const __m256i H[16], #undef DH2L #undef DH2R -/* - dH[ 0] = _mm256_add_epi32( - _mm256_xor_si256( M[0], - _mm256_xor_si256( _mm256_slli_epi32( xh, 5 ), - _mm256_srli_epi32( qt[16], 5 ) ) ), - _mm256_xor_si256( _mm256_xor_si256( xl, qt[24] ), qt[ 0] )); - dH[ 1] = _mm256_add_epi32( - _mm256_xor_si256( M[1], - _mm256_xor_si256( _mm256_srli_epi32( xh, 7 ), - _mm256_slli_epi32( qt[17], 8 ) ) ), - _mm256_xor_si256( _mm256_xor_si256( xl, qt[25] ), qt[ 1] )); - dH[ 2] = _mm256_add_epi32( - _mm256_xor_si256( M[2], - _mm256_xor_si256( _mm256_srli_epi32( xh, 5 ), - _mm256_slli_epi32( qt[18], 5 ) ) ), - _mm256_xor_si256( _mm256_xor_si256( xl, qt[26] ), qt[ 2] )); - dH[ 3] = _mm256_add_epi32( - _mm256_xor_si256( M[3], - _mm256_xor_si256( _mm256_srli_epi32( xh, 1 ), - _mm256_slli_epi32( qt[19], 5 ) ) ), - _mm256_xor_si256( _mm256_xor_si256( xl, qt[27] ), qt[ 3] )); - dH[ 4] = _mm256_add_epi32( - _mm256_xor_si256( M[4], - _mm256_xor_si256( _mm256_srli_epi32( xh, 3 ), - _mm256_slli_epi32( qt[20], 0 ) ) ), - _mm256_xor_si256( _mm256_xor_si256( xl, qt[28] ), qt[ 4] )); - dH[ 5] = _mm256_add_epi32( - _mm256_xor_si256( M[5], - _mm256_xor_si256( _mm256_slli_epi32( xh, 6 ), - 
_mm256_srli_epi32( qt[21], 6 ) ) ), - _mm256_xor_si256( _mm256_xor_si256( xl, qt[29] ), qt[ 5] )); - dH[ 6] = _mm256_add_epi32( - _mm256_xor_si256( M[6], - _mm256_xor_si256( _mm256_srli_epi32( xh, 4 ), - _mm256_slli_epi32( qt[22], 6 ) ) ), - _mm256_xor_si256( _mm256_xor_si256( xl, qt[30] ), qt[ 6] )); - dH[ 7] = _mm256_add_epi32( - _mm256_xor_si256( M[7], - _mm256_xor_si256( _mm256_srli_epi32( xh, 11 ), - _mm256_slli_epi32( qt[23], 2 ) ) ), - _mm256_xor_si256( _mm256_xor_si256( xl, qt[31] ), qt[ 7] )); - dH[ 8] = _mm256_add_epi32( _mm256_add_epi32( - mm256_rol_32( dH[4], 9 ), - _mm256_xor_si256( _mm256_xor_si256( xh, qt[24] ), M[ 8] )), - _mm256_xor_si256( _mm256_slli_epi32( xl, 8 ), - _mm256_xor_si256( qt[23], qt[ 8] ) ) ); - dH[ 9] = _mm256_add_epi32( _mm256_add_epi32( - mm256_rol_32( dH[5], 10 ), - _mm256_xor_si256( _mm256_xor_si256( xh, qt[25] ), M[ 9] )), - _mm256_xor_si256( _mm256_srli_epi32( xl, 6 ), - _mm256_xor_si256( qt[16], qt[ 9] ) ) ); - dH[10] = _mm256_add_epi32( _mm256_add_epi32( - mm256_rol_32( dH[6], 11 ), - _mm256_xor_si256( _mm256_xor_si256( xh, qt[26] ), M[10] )), - _mm256_xor_si256( _mm256_slli_epi32( xl, 6 ), - _mm256_xor_si256( qt[17], qt[10] ) ) ); - dH[11] = _mm256_add_epi32( _mm256_add_epi32( - mm256_rol_32( dH[7], 12 ), - _mm256_xor_si256( _mm256_xor_si256( xh, qt[27] ), M[11] )), - _mm256_xor_si256( _mm256_slli_epi32( xl, 4 ), - _mm256_xor_si256( qt[18], qt[11] ) ) ); - dH[12] = _mm256_add_epi32( _mm256_add_epi32( - mm256_rol_32( dH[0], 13 ), - _mm256_xor_si256( _mm256_xor_si256( xh, qt[28] ), M[12] )), - _mm256_xor_si256( _mm256_srli_epi32( xl, 3 ), - _mm256_xor_si256( qt[19], qt[12] ) ) ); - dH[13] = _mm256_add_epi32( _mm256_add_epi32( - mm256_rol_32( dH[1], 14 ), - _mm256_xor_si256( _mm256_xor_si256( xh, qt[29] ), M[13] )), - _mm256_xor_si256( _mm256_srli_epi32( xl, 4 ), - _mm256_xor_si256( qt[20], qt[13] ) ) ); - dH[14] = _mm256_add_epi32( _mm256_add_epi32( - mm256_rol_32( dH[2], 15 ), - _mm256_xor_si256( _mm256_xor_si256( xh, qt[30] ), M[14] )), - _mm256_xor_si256( _mm256_srli_epi32( xl, 7 ), - _mm256_xor_si256( qt[21], qt[14] ) ) ); - dH[15] = _mm256_add_epi32( _mm256_add_epi32( - mm256_rol_32( dH[3], 16 ), - _mm256_xor_si256( _mm256_xor_si256( xh, qt[31] ), M[15] )), - _mm256_xor_si256( _mm256_srli_epi32( xl, 2 ), - _mm256_xor_si256( qt[22], qt[15] ) ) ); -*/ } static const __m256i final_s8[16] = @@ -1422,40 +1335,35 @@ void compress_small_16way( const __m512i *M, const __m512i H[16], qt[30] = expand2s16( qt, M, H, 30 ); qt[31] = expand2s16( qt, M, H, 31 ); - xl = _mm512_xor_si512( - mm512_xor4( qt[16], qt[17], qt[18], qt[19] ), - mm512_xor4( qt[20], qt[21], qt[22], qt[23] ) ); - xh = _mm512_xor_si512( xl, _mm512_xor_si512( - mm512_xor4( qt[24], qt[25], qt[26], qt[27] ), - mm512_xor4( qt[28], qt[29], qt[30], qt[31] ) ) ); + xl = mm512_xor3( mm512_xor3( qt[16], qt[17], qt[18] ), + mm512_xor3( qt[19], qt[20], qt[21] ), + _mm512_xor_si512( qt[22], qt[23] ) ); + + xh = mm512_xor3( mm512_xor3( xl, qt[24], qt[25] ), + mm512_xor3( qt[26], qt[27], qt[28] ), + mm512_xor3( qt[29], qt[30], qt[31] ) ); #define DH1L( m, sl, sr, a, b, c ) \ - _mm512_add_epi32( \ - _mm512_xor_si512( M[m], \ - _mm512_xor_si512( _mm512_slli_epi32( xh, sl ), \ - _mm512_srli_epi32( qt[a], sr ) ) ), \ - _mm512_xor_si512( _mm512_xor_si512( xl, qt[b] ), qt[c] ) ) + _mm512_add_epi32( mm512_xor3( M[m], _mm512_slli_epi32( xh, sl ), \ + _mm512_srli_epi32( qt[a], sr ) ), \ + mm512_xor3( xl, qt[b], qt[c] ) ) #define DH1R( m, sl, sr, a, b, c ) \ - _mm512_add_epi32( \ - _mm512_xor_si512( M[m], \ - 
_mm512_xor_si512( _mm512_srli_epi32( xh, sl ), \ - _mm512_slli_epi32( qt[a], sr ) ) ), \ - _mm512_xor_si512( _mm512_xor_si512( xl, qt[b] ), qt[c] ) ) + _mm512_add_epi32( mm512_xor3( M[m], _mm512_srli_epi32( xh, sl ), \ + _mm512_slli_epi32( qt[a], sr ) ), \ + mm512_xor3( xl, qt[b], qt[c] ) ) #define DH2L( m, rl, sl, h, a, b, c ) \ _mm512_add_epi32( _mm512_add_epi32( \ - mm512_rol_32( dH[h], rl ), \ - _mm512_xor_si512( _mm512_xor_si512( xh, qt[a] ), M[m] )), \ - _mm512_xor_si512( _mm512_slli_epi32( xl, sl ), \ - _mm512_xor_si512( qt[b], qt[c] ) ) ); + mm512_rol_32( dH[h], rl ), \ + mm512_xor3( xh, qt[a], M[m] ) ), \ + mm512_xor3( _mm512_slli_epi32( xl, sl ), qt[b], qt[c] ) ) #define DH2R( m, rl, sr, h, a, b, c ) \ _mm512_add_epi32( _mm512_add_epi32( \ - mm512_rol_32( dH[h], rl ), \ - _mm512_xor_si512( _mm512_xor_si512( xh, qt[a] ), M[m] )), \ - _mm512_xor_si512( _mm512_srli_epi32( xl, sr ), \ - _mm512_xor_si512( qt[b], qt[c] ) ) ); + mm512_rol_32( dH[h], rl ), \ + mm512_xor3( xh, qt[a], M[m] ) ), \ + mm512_xor3( _mm512_srli_epi32( xl, sr ), qt[b], qt[c] ) ) dH[ 0] = DH1L( 0, 5, 5, 16, 24, 0 ); dH[ 1] = DH1R( 1, 7, 8, 17, 25, 1 ); diff --git a/algo/bmw/bmw512-hash-4way.c b/algo/bmw/bmw512-hash-4way.c index 4778914e..3587cc4d 100644 --- a/algo/bmw/bmw512-hash-4way.c +++ b/algo/bmw/bmw512-hash-4way.c @@ -1285,12 +1285,13 @@ void compress_big_8way( const __m512i *M, const __m512i H[16], qt[30] = expand2b8( qt, M, H, 30 ); qt[31] = expand2b8( qt, M, H, 31 ); - xl = _mm512_xor_si512( - mm512_xor4( qt[16], qt[17], qt[18], qt[19] ), - mm512_xor4( qt[20], qt[21], qt[22], qt[23] ) ); - xh = _mm512_xor_si512( xl, _mm512_xor_si512( - mm512_xor4( qt[24], qt[25], qt[26], qt[27] ), - mm512_xor4( qt[28], qt[29], qt[30], qt[31] ) ) ); + xl = mm512_xor3( mm512_xor3( qt[16], qt[17], qt[18] ), + mm512_xor3( qt[19], qt[20], qt[21] ), + _mm512_xor_si512( qt[22], qt[23] ) ); + + xh = mm512_xor3( mm512_xor3( xl, qt[24], qt[25] ), + mm512_xor3( qt[26], qt[27], qt[28] ), + mm512_xor3( qt[29], qt[30], qt[31] ) ); #define DH1L( m, sl, sr, a, b, c ) \ _mm512_add_epi64( mm512_xor3( M[m], _mm512_slli_epi64( xh, sl ), \ diff --git a/algo/echo/aes_ni/hash.c b/algo/echo/aes_ni/hash.c index a4e3958c..ca1688a4 100644 --- a/algo/echo/aes_ni/hash.c +++ b/algo/echo/aes_ni/hash.c @@ -53,6 +53,20 @@ MYALIGN const unsigned int zero[] = {0x00000000, 0x00000000, 0x00000000, 0x000 MYALIGN const unsigned int mul2ipt[] = {0x728efc00, 0x6894e61a, 0x3fc3b14d, 0x25d9ab57, 0xfd5ba600, 0x2a8c71d7, 0x1eb845e3, 0xc96f9234}; +#define ECHO_SUBBYTES4(state, j) \ + state[0][j] = _mm_aesenc_si128(state[0][j], k1);\ + k1 = _mm_add_epi32(k1, M128(const1));\ + state[1][j] = _mm_aesenc_si128(state[1][j], k1);\ + k1 = _mm_add_epi32(k1, M128(const1));\ + state[2][j] = _mm_aesenc_si128(state[2][j], k1);\ + k1 = _mm_add_epi32(k1, M128(const1));\ + state[3][j] = _mm_aesenc_si128(state[3][j], k1);\ + k1 = _mm_add_epi32(k1, M128(const1));\ + state[0][j] = _mm_aesenc_si128(state[0][j], m128_zero ); \ + state[1][j] = _mm_aesenc_si128(state[1][j], m128_zero ); \ + state[2][j] = _mm_aesenc_si128(state[2][j], m128_zero ); \ + state[3][j] = _mm_aesenc_si128(state[3][j], m128_zero ) + #define ECHO_SUBBYTES(state, i, j) \ state[i][j] = _mm_aesenc_si128(state[i][j], k1);\ k1 = _mm_add_epi32(k1, M128(const1));\ @@ -73,7 +87,7 @@ MYALIGN const unsigned int mul2ipt[] = {0x728efc00, 0x6894e61a, 0x3fc3b14d, 0x2 t1 = _mm_and_si128(t1, M128(lsbmask));\ t2 = _mm_shuffle_epi8(M128(mul2mask), t1);\ s2 = _mm_xor_si128(s2, t2);\ - state2[0][j] = _mm_xor_si128(state2[0][j], 
_mm_xor_si128(s2, state1[1][(j + 1) & 3]));\ + state2[0][j] = mm128_xor3(state2[0][j], s2, state1[1][(j + 1) & 3] );\ state2[1][j] = _mm_xor_si128(state2[1][j], s2);\ state2[2][j] = _mm_xor_si128(state2[2][j], state1[1][(j + 1) & 3]);\ state2[3][j] = _mm_xor_si128(state2[3][j], state1[1][(j + 1) & 3]);\ @@ -83,7 +97,7 @@ MYALIGN const unsigned int mul2ipt[] = {0x728efc00, 0x6894e61a, 0x3fc3b14d, 0x2 t2 = _mm_shuffle_epi8(M128(mul2mask), t1);\ s2 = _mm_xor_si128(s2, t2);\ state2[0][j] = _mm_xor_si128(state2[0][j], state1[2][(j + 2) & 3]);\ - state2[1][j] = _mm_xor_si128(state2[1][j], _mm_xor_si128(s2, state1[2][(j + 2) & 3]));\ + state2[1][j] = mm128_xor3(state2[1][j], s2, state1[2][(j + 2) & 3] );\ state2[2][j] = _mm_xor_si128(state2[2][j], s2);\ state2[3][j] = _mm_xor_si128(state2[3][j], state1[2][(j + 2) & 3]);\ s2 = _mm_add_epi8(state1[3][(j + 3) & 3], state1[3][(j + 3) & 3]);\ @@ -93,10 +107,29 @@ MYALIGN const unsigned int mul2ipt[] = {0x728efc00, 0x6894e61a, 0x3fc3b14d, 0x2 s2 = _mm_xor_si128(s2, t2);\ state2[0][j] = _mm_xor_si128(state2[0][j], state1[3][(j + 3) & 3]);\ state2[1][j] = _mm_xor_si128(state2[1][j], state1[3][(j + 3) & 3]);\ - state2[2][j] = _mm_xor_si128(state2[2][j], _mm_xor_si128(s2, state1[3][(j + 3) & 3]));\ + state2[2][j] = mm128_xor3(state2[2][j], s2, state1[3][(j + 3) & 3] );\ state2[3][j] = _mm_xor_si128(state2[3][j], s2) +#define ECHO_ROUND_UNROLL2 \ + ECHO_SUBBYTES4(_state, 0);\ + ECHO_SUBBYTES4(_state, 1);\ + ECHO_SUBBYTES4(_state, 2);\ + ECHO_SUBBYTES4(_state, 3);\ + ECHO_MIXBYTES(_state, _state2, 0, t1, t2, s2);\ + ECHO_MIXBYTES(_state, _state2, 1, t1, t2, s2);\ + ECHO_MIXBYTES(_state, _state2, 2, t1, t2, s2);\ + ECHO_MIXBYTES(_state, _state2, 3, t1, t2, s2);\ + ECHO_SUBBYTES4(_state2, 0);\ + ECHO_SUBBYTES4(_state2, 1);\ + ECHO_SUBBYTES4(_state2, 2);\ + ECHO_SUBBYTES4(_state2, 3);\ + ECHO_MIXBYTES(_state2, _state, 0, t1, t2, s2);\ + ECHO_MIXBYTES(_state2, _state, 1, t1, t2, s2);\ + ECHO_MIXBYTES(_state2, _state, 2, t1, t2, s2);\ + ECHO_MIXBYTES(_state2, _state, 3, t1, t2, s2) + +/* #define ECHO_ROUND_UNROLL2 \ ECHO_SUBBYTES(_state, 0, 0);\ ECHO_SUBBYTES(_state, 1, 0);\ @@ -138,7 +171,7 @@ MYALIGN const unsigned int mul2ipt[] = {0x728efc00, 0x6894e61a, 0x3fc3b14d, 0x2 ECHO_MIXBYTES(_state2, _state, 1, t1, t2, s2);\ ECHO_MIXBYTES(_state2, _state, 2, t1, t2, s2);\ ECHO_MIXBYTES(_state2, _state, 3, t1, t2, s2) - +*/ #define SAVESTATE(dst, src)\ diff --git a/algo/echo/echo-hash-4way.c b/algo/echo/echo-hash-4way.c index 51a9f0a8..c8e52cae 100644 --- a/algo/echo/echo-hash-4way.c +++ b/algo/echo/echo-hash-4way.c @@ -13,12 +13,19 @@ static const unsigned int mul2ipt[] __attribute__ ((aligned (64))) = #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) - -//#define mul2mask m512_const2_64( 0, 0x00001b00 ) -//_mm512_set4_epi32( 0, 0, 0, 0x00001b00 ) -//_mm512_set4_epi32( 0x00001b00, 0, 0, 0 ) - -//#define lsbmask m512_const1_32( 0x01010101 ) +#define ECHO_SUBBYTES4(state, j) \ + state[0][j] = _mm512_aesenc_epi128( state[0][j], k1 ); \ + k1 = _mm512_add_epi32( k1, one ); \ + state[1][j] = _mm512_aesenc_epi128( state[1][j], k1 ); \ + k1 = _mm512_add_epi32( k1, one ); \ + state[2][j] = _mm512_aesenc_epi128( state[2][j], k1 ); \ + k1 = _mm512_add_epi32( k1, one ); \ + state[3][j] = _mm512_aesenc_epi128( state[3][j], k1 ); \ + k1 = _mm512_add_epi32( k1, one ); \ + state[0][j] = _mm512_aesenc_epi128( state[0][j], m512_zero ); \ + state[1][j] = _mm512_aesenc_epi128( state[1][j], m512_zero ); \ + state[2][j] = 
_mm512_aesenc_epi128( state[2][j], m512_zero ); \ + state[3][j] = _mm512_aesenc_epi128( state[3][j], m512_zero ) #define ECHO_SUBBYTES( state, i, j ) \ state[i][j] = _mm512_aesenc_epi128( state[i][j], k1 ); \ @@ -44,8 +51,7 @@ static const unsigned int mul2ipt[] __attribute__ ((aligned (64))) = t1 = _mm512_and_si512( t1, lsbmask ); \ t2 = _mm512_shuffle_epi8( mul2mask, t1 ); \ s2 = _mm512_xor_si512( s2, t2 );\ - state2[ 0 ][ j ] = _mm512_xor_si512( state2[ 0 ][ j ], \ - _mm512_xor_si512( s2, state1[ 1 ][ j1 ] ) ); \ + state2[ 0 ][ j ] = mm512_xor3( state2[ 0 ][ j ], s2, state1[ 1 ][ j1 ] ); \ state2[ 1 ][ j ] = _mm512_xor_si512( state2[ 1 ][ j ], s2 ); \ state2[ 2 ][ j ] = _mm512_xor_si512( state2[ 2 ][ j ], state1[ 1 ][ j1 ] ); \ state2[ 3 ][ j ] = _mm512_xor_si512( state2[ 3 ][ j ], state1[ 1 ][ j1 ] ); \ @@ -55,8 +61,7 @@ static const unsigned int mul2ipt[] __attribute__ ((aligned (64))) = t2 = _mm512_shuffle_epi8( mul2mask, t1 ); \ s2 = _mm512_xor_si512( s2, t2 ); \ state2[ 0 ][ j ] = _mm512_xor_si512( state2[ 0 ][ j ], state1[ 2 ][ j2 ] ); \ - state2[ 1 ][ j ] = _mm512_xor_si512( state2[ 1 ][ j ], \ - _mm512_xor_si512( s2, state1[ 2 ][ j2 ] ) ); \ + state2[ 1 ][ j ] = mm512_xor3( state2[ 1 ][ j ], s2, state1[ 2 ][ j2 ] ); \ state2[ 2 ][ j ] = _mm512_xor_si512( state2[ 2 ][ j ], s2 ); \ state2[ 3 ][ j ] = _mm512_xor_si512( state2[ 3][ j ], state1[ 2 ][ j2 ] ); \ s2 = _mm512_add_epi8( state1[ 3 ][ j3 ], state1[ 3 ][ j3 ] ); \ @@ -66,11 +71,29 @@ static const unsigned int mul2ipt[] __attribute__ ((aligned (64))) = s2 = _mm512_xor_si512( s2, t2 ); \ state2[ 0 ][ j ] = _mm512_xor_si512( state2[ 0 ][ j ], state1[ 3 ][ j3 ] ); \ state2[ 1 ][ j ] = _mm512_xor_si512( state2[ 1 ][ j ], state1[ 3 ][ j3 ] ); \ - state2[ 2 ][ j ] = _mm512_xor_si512( state2[ 2 ][ j ], \ - _mm512_xor_si512( s2, state1[ 3 ][ j3] ) ); \ + state2[ 2 ][ j ] = mm512_xor3( state2[ 2 ][ j ], s2, state1[ 3 ][ j3] ); \ state2[ 3 ][ j ] = _mm512_xor_si512( state2[ 3 ][ j ], s2 ); \ } while(0) +#define ECHO_ROUND_UNROLL2 \ + ECHO_SUBBYTES4(_state, 0);\ + ECHO_SUBBYTES4(_state, 1);\ + ECHO_SUBBYTES4(_state, 2);\ + ECHO_SUBBYTES4(_state, 3);\ + ECHO_MIXBYTES(_state, _state2, 0, t1, t2, s2);\ + ECHO_MIXBYTES(_state, _state2, 1, t1, t2, s2);\ + ECHO_MIXBYTES(_state, _state2, 2, t1, t2, s2);\ + ECHO_MIXBYTES(_state, _state2, 3, t1, t2, s2);\ + ECHO_SUBBYTES4(_state2, 0);\ + ECHO_SUBBYTES4(_state2, 1);\ + ECHO_SUBBYTES4(_state2, 2);\ + ECHO_SUBBYTES4(_state2, 3);\ + ECHO_MIXBYTES(_state2, _state, 0, t1, t2, s2);\ + ECHO_MIXBYTES(_state2, _state, 1, t1, t2, s2);\ + ECHO_MIXBYTES(_state2, _state, 2, t1, t2, s2);\ + ECHO_MIXBYTES(_state2, _state, 3, t1, t2, s2) + +/* #define ECHO_ROUND_UNROLL2 \ ECHO_SUBBYTES(_state, 0, 0);\ ECHO_SUBBYTES(_state, 1, 0);\ @@ -112,6 +135,7 @@ static const unsigned int mul2ipt[] __attribute__ ((aligned (64))) = ECHO_MIXBYTES(_state2, _state, 1, t1, t2, s2);\ ECHO_MIXBYTES(_state2, _state, 2, t1, t2, s2);\ ECHO_MIXBYTES(_state2, _state, 3, t1, t2, s2) +*/ #define SAVESTATE(dst, src)\ dst[0][0] = src[0][0];\ @@ -405,6 +429,20 @@ int echo_4way_full( echo_4way_context *ctx, void *hashval, int nHashSize, #define lsbmask_2way m256_const1_32( 0x01010101 ) +#define ECHO_SUBBYTES4_2WAY( state, j ) \ + state[0][j] = _mm256_aesenc_epi128( state[0][j], k1 ); \ + k1 = _mm256_add_epi32( k1, m256_one_128 ); \ + state[1][j] = _mm256_aesenc_epi128( state[1][j], k1 ); \ + k1 = _mm256_add_epi32( k1, m256_one_128 ); \ + state[2][j] = _mm256_aesenc_epi128( state[2][j], k1 ); \ + k1 = _mm256_add_epi32( k1, m256_one_128 ); \ + 
state[3][j] = _mm256_aesenc_epi128( state[3][j], k1 ); \ + k1 = _mm256_add_epi32( k1, m256_one_128 ); \ + state[0][j] = _mm256_aesenc_epi128( state[0][j], m256_zero ); \ + state[1][j] = _mm256_aesenc_epi128( state[1][j], m256_zero ); \ + state[2][j] = _mm256_aesenc_epi128( state[2][j], m256_zero ); \ + state[3][j] = _mm256_aesenc_epi128( state[3][j], m256_zero ) + #define ECHO_SUBBYTES_2WAY( state, i, j ) \ state[i][j] = _mm256_aesenc_epi128( state[i][j], k1 ); \ k1 = _mm256_add_epi32( k1, m256_one_128 ); \ @@ -456,6 +494,25 @@ int echo_4way_full( echo_4way_context *ctx, void *hashval, int nHashSize, state2[ 3 ][ j ] = _mm256_xor_si256( state2[ 3 ][ j ], s2 ); \ } while(0) +#define ECHO_ROUND_UNROLL2_2WAY \ + ECHO_SUBBYTES4_2WAY(_state, 0);\ + ECHO_SUBBYTES4_2WAY(_state, 1);\ + ECHO_SUBBYTES4_2WAY(_state, 2);\ + ECHO_SUBBYTES4_2WAY(_state, 3);\ + ECHO_MIXBYTES_2WAY(_state, _state2, 0, t1, t2, s2);\ + ECHO_MIXBYTES_2WAY(_state, _state2, 1, t1, t2, s2);\ + ECHO_MIXBYTES_2WAY(_state, _state2, 2, t1, t2, s2);\ + ECHO_MIXBYTES_2WAY(_state, _state2, 3, t1, t2, s2);\ + ECHO_SUBBYTES4_2WAY(_state2, 0);\ + ECHO_SUBBYTES4_2WAY(_state2, 1);\ + ECHO_SUBBYTES4_2WAY(_state2, 2);\ + ECHO_SUBBYTES4_2WAY(_state2, 3);\ + ECHO_MIXBYTES_2WAY(_state2, _state, 0, t1, t2, s2);\ + ECHO_MIXBYTES_2WAY(_state2, _state, 1, t1, t2, s2);\ + ECHO_MIXBYTES_2WAY(_state2, _state, 2, t1, t2, s2);\ + ECHO_MIXBYTES_2WAY(_state2, _state, 3, t1, t2, s2) + +/* #define ECHO_ROUND_UNROLL2_2WAY \ ECHO_SUBBYTES_2WAY(_state, 0, 0);\ ECHO_SUBBYTES_2WAY(_state, 1, 0);\ @@ -497,6 +554,7 @@ int echo_4way_full( echo_4way_context *ctx, void *hashval, int nHashSize, ECHO_MIXBYTES_2WAY(_state2, _state, 1, t1, t2, s2);\ ECHO_MIXBYTES_2WAY(_state2, _state, 2, t1, t2, s2);\ ECHO_MIXBYTES_2WAY(_state2, _state, 3, t1, t2, s2) +*/ #define SAVESTATE_2WAY(dst, src)\ dst[0][0] = src[0][0];\ diff --git a/algo/fugue/fugue-aesni.c b/algo/fugue/fugue-aesni.c index 2dd253a7..8f0af139 100644 --- a/algo/fugue/fugue-aesni.c +++ b/algo/fugue/fugue-aesni.c @@ -124,7 +124,16 @@ MYALIGN const unsigned int _IV512[] = { t1 = _mm_shuffle_epi32(s30, _MM_SHUFFLE(3, 3, 0, 3));\ s7 = _mm_xor_si128(s7, t1) +#define PRESUPERMIX(t0, t1, t2, t3, t4)\ + t2 = t0;\ + t3 = _mm_add_epi8(t0, t0);\ + t4 = _mm_add_epi8(t3, t3);\ + t1 = _mm_srli_epi16(t0, 6);\ + t1 = _mm_and_si128(t1, M128(_lsbmask2));\ + t3 = _mm_xor_si128(t3, _mm_shuffle_epi8(M128(_mul2mask), t1));\ + t0 = _mm_xor_si128(t4, _mm_shuffle_epi8(M128(_mul4mask), t1)) +/* #define PRESUPERMIX(x, t1, s1, s2, t2)\ s1 = x;\ s2 = _mm_add_epi8(x, x);\ @@ -133,37 +142,59 @@ MYALIGN const unsigned int _IV512[] = { t1 = _mm_and_si128(t1, M128(_lsbmask2));\ s2 = _mm_xor_si128(s2, _mm_shuffle_epi8(M128(_mul2mask), t1));\ x = _mm_xor_si128(t2, _mm_shuffle_epi8(M128(_mul4mask), t1)) +*/ -#define SUBSTITUTE(r0, _t1, _t2, _t3, _t0)\ +#define SUBSTITUTE(r0, _t2 )\ _t2 = _mm_shuffle_epi8(r0, M128(_inv_shift_rows));\ _t2 = _mm_aesenclast_si128( _t2, m128_zero ) - + +#define SUPERMIX(t0, t1, t2, t3, t4)\ + t2 = t0;\ + t3 = _mm_add_epi8(t0, t0);\ + t4 = _mm_add_epi8(t3, t3);\ + t1 = _mm_srli_epi16(t0, 6);\ + t1 = _mm_and_si128(t1, M128(_lsbmask2));\ + t0 = _mm_xor_si128(t4, _mm_shuffle_epi8(M128(_mul4mask), t1)); \ + t4 = _mm_shuffle_epi8(t2, M128(_supermix1b));\ + t3 = _mm_xor_si128(t3, _mm_shuffle_epi8(M128(_mul2mask), t1));\ + t1 = _mm_shuffle_epi8(t4, M128(_supermix1c));\ + t4 = _mm_xor_si128(t4, t1);\ + t1 = _mm_shuffle_epi8(t4, M128(_supermix1d));\ + t4 = _mm_xor_si128(t4, t1);\ + t1 = _mm_shuffle_epi8(t2, M128(_supermix1a));\ + t2 
= mm128_xor3(t2, t3, t0 );\ + t2 = _mm_shuffle_epi8(t2, M128(_supermix7a));\ + t4 = mm128_xor3( t4, t1, t2 ); \ + t2 = _mm_shuffle_epi8(t2, M128(_supermix7b));\ + t3 = _mm_shuffle_epi8(t3, M128(_supermix2a));\ + t1 = _mm_shuffle_epi8(t0, M128(_supermix4a));\ + t0 = _mm_shuffle_epi8(t0, M128(_supermix4b));\ + t4 = mm128_xor3( t4, t2, t1 ); \ + t0 = _mm_xor_si128(t0, t3);\ + t4 = mm128_xor3(t4, t0, _mm_shuffle_epi8(t0, M128(_supermix4c))); + +/* #define SUPERMIX(t0, t1, t2, t3, t4)\ PRESUPERMIX(t0, t1, t2, t3, t4);\ POSTSUPERMIX(t0, t1, t2, t3, t4) - +*/ #define POSTSUPERMIX(t0, t1, t2, t3, t4)\ - t1 = t2;\ - t1 = _mm_shuffle_epi8(t1, M128(_supermix1b));\ + t1 = _mm_shuffle_epi8(t2, M128(_supermix1b));\ t4 = t1;\ t1 = _mm_shuffle_epi8(t1, M128(_supermix1c));\ t4 = _mm_xor_si128(t4, t1);\ - t1 = t4;\ - t1 = _mm_shuffle_epi8(t1, M128(_supermix1d));\ + t1 = _mm_shuffle_epi8(t4, M128(_supermix1d));\ t4 = _mm_xor_si128(t4, t1);\ - t1 = t2;\ - t1 = _mm_shuffle_epi8(t1, M128(_supermix1a));\ + t1 = _mm_shuffle_epi8(t2, M128(_supermix1a));\ t4 = _mm_xor_si128(t4, t1);\ - t2 = _mm_xor_si128(t2, t3);\ - t2 = _mm_xor_si128(t2, t0);\ + t2 = mm128_xor3(t2, t3, t0 );\ t2 = _mm_shuffle_epi8(t2, M128(_supermix7a));\ t4 = _mm_xor_si128(t4, t2);\ t2 = _mm_shuffle_epi8(t2, M128(_supermix7b));\ t4 = _mm_xor_si128(t4, t2);\ t3 = _mm_shuffle_epi8(t3, M128(_supermix2a));\ - t1 = t0;\ - t1 = _mm_shuffle_epi8(t1, M128(_supermix4a));\ + t1 = _mm_shuffle_epi8(t0, M128(_supermix4a));\ t4 = _mm_xor_si128(t4, t1);\ t0 = _mm_shuffle_epi8(t0, M128(_supermix4b));\ t0 = _mm_xor_si128(t0, t3);\ @@ -171,59 +202,55 @@ MYALIGN const unsigned int _IV512[] = { t0 = _mm_shuffle_epi8(t0, M128(_supermix4c));\ t4 = _mm_xor_si128(t4, t0) - #define SUBROUND512_3(r1a, r1b, r1c, r1d, r2a, r2b, r2c, r2d, r3a, r3b, r3c, r3d)\ CMIX(r1a, r1b, r1c, r1d, _t0, _t1);\ PACK_S0(r1c, r1a, _t0);\ - SUBSTITUTE(r1c, _t1, _t2, _t3, _t0);\ + SUBSTITUTE(r1c, _t2 );\ SUPERMIX(_t2, _t3, _t0, _t1, r1c);\ _t0 = _mm_shuffle_epi32(r1c, 0x39);\ r2c = _mm_xor_si128(r2c, _t0);\ _t0 = mm128_mask_32( _t0, 8 ); \ r2d = _mm_xor_si128(r2d, _t0);\ UNPACK_S0(r1c, r1a, _t3);\ - SUBSTITUTE(r2c, _t1, _t2, _t3, _t0);\ + SUBSTITUTE(r2c, _t2 );\ SUPERMIX(_t2, _t3, _t0, _t1, r2c);\ _t0 = _mm_shuffle_epi32(r2c, 0x39);\ r3c = _mm_xor_si128(r3c, _t0);\ _t0 = mm128_mask_32( _t0, 8 ); \ r3d = _mm_xor_si128(r3d, _t0);\ UNPACK_S0(r2c, r2a, _t3);\ - SUBSTITUTE(r3c, _t1, _t2, _t3, _t0);\ + SUBSTITUTE(r3c, _t2 );\ SUPERMIX(_t2, _t3, _t0, _t1, r3c);\ UNPACK_S0(r3c, r3a, _t3) - #define SUBROUND512_4(r1a, r1b, r1c, r1d, r2a, r2b, r2c, r2d, r3a, r3b, r3c, r3d, r4a, r4b, r4c, r4d)\ CMIX(r1a, r1b, r1c, r1d, _t0, _t1);\ PACK_S0(r1c, r1a, _t0);\ - SUBSTITUTE(r1c, _t1, _t2, _t3, _t0);\ + SUBSTITUTE( r1c, _t2 );\ SUPERMIX(_t2, _t3, _t0, _t1, r1c);\ _t0 = _mm_shuffle_epi32(r1c, 0x39);\ r2c = _mm_xor_si128(r2c, _t0);\ _t0 = mm128_mask_32( _t0, 8 ); \ r2d = _mm_xor_si128(r2d, _t0);\ UNPACK_S0(r1c, r1a, _t3);\ - SUBSTITUTE(r2c, _t1, _t2, _t3, _t0);\ + SUBSTITUTE(r2c, _t2 );\ SUPERMIX(_t2, _t3, _t0, _t1, r2c);\ _t0 = _mm_shuffle_epi32(r2c, 0x39);\ r3c = _mm_xor_si128(r3c, _t0);\ _t0 = mm128_mask_32( _t0, 8 ); \ r3d = _mm_xor_si128(r3d, _t0);\ UNPACK_S0(r2c, r2a, _t3);\ - SUBSTITUTE(r3c, _t1, _t2, _t3, _t0);\ + SUBSTITUTE( r3c, _t2 );\ SUPERMIX(_t2, _t3, _t0, _t1, r3c);\ _t0 = _mm_shuffle_epi32(r3c, 0x39);\ r4c = _mm_xor_si128(r4c, _t0);\ _t0 = mm128_mask_32( _t0, 8 ); \ r4d = _mm_xor_si128(r4d, _t0);\ UNPACK_S0(r3c, r3a, _t3);\ - SUBSTITUTE(r4c, _t1, _t2, _t3, _t0);\ + SUBSTITUTE( r4c, _t2 );\ 
SUPERMIX(_t2, _t3, _t0, _t1, r4c);\ UNPACK_S0(r4c, r4a, _t3) - - #define LOADCOLUMN(x, s, a)\ block[0] = col[(base + a + 0) % s];\ block[1] = col[(base + a + 1) % s];\ @@ -247,14 +274,14 @@ void Compress512(hashState_fugue *ctx, const unsigned char *pmsg, unsigned int u case 1: TIX512( pmsg, ctx->state[3], ctx->state[10], ctx->state[4], ctx->state[5], ctx->state[ 6], ctx->state[8], - ctx->state[9], ctx->state[10], _t0, _t1, _t2 ); + ctx->state[9], ctx->state[10], _t0, _t1, _t2 ); - SUBROUND512_4( ctx->state[8], ctx->state[9], ctx->state[7], + SUBROUND512_4( ctx->state[8], ctx->state[9], ctx->state[7], ctx->state[1], ctx->state[7], ctx->state[8], - ctx->state[6], ctx->state[0], ctx->state[6], - ctx->state[7], ctx->state[5], ctx->state[11], - ctx->state[5], ctx->state[6], ctx->state[4], - ctx->state[10] ); + ctx->state[6], ctx->state[0], ctx->state[6], + ctx->state[7], ctx->state[5], ctx->state[11], + ctx->state[5], ctx->state[6], ctx->state[4], + ctx->state[10] ); ctx->base++; pmsg += 4; uBlockCount--; @@ -263,14 +290,14 @@ void Compress512(hashState_fugue *ctx, const unsigned char *pmsg, unsigned int u case 2: TIX512( pmsg, ctx->state[11], ctx->state[6], ctx->state[0], ctx->state[ 1], ctx->state[2], ctx->state[4], - ctx->state[ 5], ctx->state[6], _t0, _t1, _t2); + ctx->state[ 5], ctx->state[6], _t0, _t1, _t2); SUBROUND512_4( ctx->state[4], ctx->state[5], ctx->state[3], ctx->state[9], ctx->state[3], ctx->state[4], - ctx->state[2], ctx->state[8], ctx->state[2], - ctx->state[3], ctx->state[1], ctx->state[7], - ctx->state[1], ctx->state[2], ctx->state[0], - ctx->state[6]); + ctx->state[2], ctx->state[8], ctx->state[2], + ctx->state[3], ctx->state[1], ctx->state[7], + ctx->state[1], ctx->state[2], ctx->state[0], + ctx->state[6]); ctx->base = 0; pmsg += 4; @@ -278,44 +305,42 @@ void Compress512(hashState_fugue *ctx, const unsigned char *pmsg, unsigned int u break; } - while( uBlockCount > 0 ) { - TIX512( pmsg, ctx->state[ 7], ctx->state[2], ctx->state[8], ctx->state[9], - ctx->state[10], ctx->state[0], ctx->state[1], ctx->state[2], - _t0, _t1, _t2 ); - SUBROUND512_4( ctx->state[0], ctx->state[1], ctx->state[11], - ctx->state[5], ctx->state[11], ctx->state[0], - ctx->state[10], ctx->state[4], ctx->state[10], - ctx->state[11], ctx->state[9], ctx->state[3], - ctx->state[9], ctx->state[10], ctx->state[8], - ctx->state[2] ); + TIX512( pmsg, ctx->state[ 7],ctx->state[2],ctx->state[8],ctx->state[9], + ctx->state[10],ctx->state[0],ctx->state[1],ctx->state[2], + _t0, _t1, _t2 ); + SUBROUND512_4( ctx->state[0], ctx->state[1],ctx->state[11],ctx->state[5], + ctx->state[11],ctx->state[0],ctx->state[10],ctx->state[4], + ctx->state[10],ctx->state[11],ctx->state[9],ctx->state[3], + ctx->state[9],ctx->state[10],ctx->state[8],ctx->state[2] ); ctx->base++; pmsg += 4; uBlockCount--; if( uBlockCount == 0 ) break; - TIX512( pmsg, ctx->state[3], ctx->state[10], ctx->state[4], ctx->state[5], - ctx->state[6], ctx->state[8], ctx->state[9], ctx->state[10], - _t0, _t1, _t2 ); + TIX512( pmsg, ctx->state[3],ctx->state[10],ctx->state[4],ctx->state[5], + ctx->state[6],ctx->state[8], ctx->state[9],ctx->state[10], + _t0, _t1, _t2 ); - SUBROUND512_4( ctx->state[8], ctx->state[9], ctx->state[7], ctx->state[1], ctx->state[7], ctx->state[8], ctx->state[6], ctx->state[0], - ctx->state[6], ctx->state[7], ctx->state[5], ctx->state[11], - ctx->state[5], ctx->state[6, ctx->state[4], ctx->state[10]); + SUBROUND512_4( ctx->state[8],ctx->state[9],ctx->state[7],ctx->state[1], + ctx->state[7],ctx->state[8],ctx->state[6],ctx->state[0], + 
ctx->state[6],ctx->state[7],ctx->state[5],ctx->state[11], + ctx->state[5],ctx->state[6],ctx->state[4],ctx->state[10] ); ctx->base++; pmsg += 4; uBlockCount--; if( uBlockCount == 0 ) break; - TIX512( pmsg, ctx->state[11], ctx->state[6], ctx->state[0], ctx->state[1], - ctx->state[2], ctx->state[4], ctx->state[5], ctx->state[6], - _t0, _t1, _t2); - SUBROUND512_4( ctx->state[4], ctx->state[5], ctx->state[3], ctx->state[9], - ctx->state[3], ctx->state[4], ctx->state[2], ctx->state[8], - ctx->state[2], ctx->state[3], ctx->state[1], ctx->state[7], - ctx->state[1], ctx->state[2], ctx->state[0], ctx->state[6]); + TIX512( pmsg, ctx->state[11],ctx->state[6],ctx->state[0],ctx->state[1], + ctx->state[2], ctx->state[4],ctx->state[5],ctx->state[6], + _t0, _t1, _t2); + SUBROUND512_4( ctx->state[4],ctx->state[5],ctx->state[3],ctx->state[9], + ctx->state[3],ctx->state[4],ctx->state[2],ctx->state[8], + ctx->state[2],ctx->state[3],ctx->state[1],ctx->state[7], + ctx->state[1],ctx->state[2],ctx->state[0],ctx->state[6]); ctx->base = 0; pmsg += 4; @@ -326,8 +351,8 @@ void Compress512(hashState_fugue *ctx, const unsigned char *pmsg, unsigned int u void Final512(hashState_fugue *ctx, BitSequence *hashval) { - unsigned int block[4] __attribute__ ((aligned (32))); - unsigned int col[36] __attribute__ ((aligned (16))); + unsigned int block[4] __attribute__ ((aligned (32))); + unsigned int col[36] __attribute__ ((aligned (16))); unsigned int i, base; __m128i r0, _t0, _t1, _t2, _t3; @@ -357,7 +382,7 @@ void Final512(hashState_fugue *ctx, BitSequence *hashval) // SMIX LOADCOLUMN(r0, 36, 0); - SUBSTITUTE(r0, _t1, _t2, _t3, _t0); + SUBSTITUTE(r0, _t2); SUPERMIX(_t2, _t3, _t0, _t1, r0); STORECOLUMN(r0, 36); } @@ -375,7 +400,7 @@ void Final512(hashState_fugue *ctx, BitSequence *hashval) // SMIX LOADCOLUMN(r0, 36, 0); - SUBSTITUTE(r0, _t1, _t2, _t3, _t0); + SUBSTITUTE(r0, _t2); SUPERMIX(_t2, _t3, _t0, _t1, r0); STORECOLUMN(r0, 36); @@ -390,7 +415,7 @@ void Final512(hashState_fugue *ctx, BitSequence *hashval) // SMIX LOADCOLUMN(r0, 36, 0); - SUBSTITUTE(r0, _t1, _t2, _t3, _t0); + SUBSTITUTE(r0, _t2); SUPERMIX(_t2, _t3, _t0, _t1, r0); STORECOLUMN(r0, 36); @@ -405,7 +430,7 @@ void Final512(hashState_fugue *ctx, BitSequence *hashval) // SMIX LOADCOLUMN(r0, 36, 0); - SUBSTITUTE(r0, _t1, _t2, _t3, _t0); + SUBSTITUTE(r0, _t2); SUPERMIX(_t2, _t3, _t0, _t1, r0); STORECOLUMN(r0, 36); @@ -420,7 +445,7 @@ void Final512(hashState_fugue *ctx, BitSequence *hashval) // SMIX LOADCOLUMN(r0, 36, 0); - SUBSTITUTE(r0, _t1, _t2, _t3, _t0); + SUBSTITUTE(r0, _t2); SUPERMIX(_t2, _t3, _t0, _t1, r0); STORECOLUMN(r0, 36); } diff --git a/algo/groestl/aes_ni/groestl-intr-aes.h b/algo/groestl/aes_ni/groestl-intr-aes.h index e09e8dea..f2d376e9 100644 --- a/algo/groestl/aes_ni/groestl-intr-aes.h +++ b/algo/groestl/aes_ni/groestl-intr-aes.h @@ -67,11 +67,9 @@ static const __m128i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003 }; * xmm[j] will be lost * xmm[k] has to be all 0x1b */ #define MUL2(i, j, k){\ - j = _mm_xor_si128(j, j);\ - j = _mm_cmpgt_epi8(j, i);\ + j = _mm_cmpgt_epi8( m128_zero, i);\ i = _mm_add_epi8(i, i);\ - j = _mm_and_si128(j, k);\ - i = _mm_xor_si128(i, j);\ + i = mm128_xorand(i, j, k );\ } /**/ @@ -93,6 +91,96 @@ static const __m128i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003 }; We almost fit into 16 registers, need only 3 spills to memory. This implementation costs 7.7 c/b giving total speed on SNB: 10.7c/b. K. 
Matusiewicz, 2011/05/29 */ + +#if defined(__AVX512VL__) + +#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\ + /* t_i = a_i + a_{i+1} */\ + b6 = a0;\ + b7 = a1;\ + a0 = _mm_xor_si128(a0, a1);\ + b0 = a2;\ + a1 = _mm_xor_si128(a1, a2);\ + b1 = a3;\ + TEMP2 = _mm_xor_si128(a2, a3);\ + b2 = a4;\ + a3 = _mm_xor_si128(a3, a4);\ + b3 = a5;\ + a4 = _mm_xor_si128(a4, a5);\ + b4 = a6;\ + a5 = _mm_xor_si128(a5, a6);\ + b5 = a7;\ + a6 = _mm_xor_si128(a6, a7);\ + a7 = _mm_xor_si128(a7, b6);\ + \ + /* build y4 y5 y6 ... in regs xmm8, xmm9, xmm10 by adding t_i*/\ + TEMP0 = mm128_xor3( b0, a4, a6 ); \ + /* spill values y_4, y_5 to memory */\ + TEMP1 = mm128_xor3( b1, a5, a7 );\ + b2 = mm128_xor3( b2, a6, a0 ); \ + /* save values t0, t1, t2 to xmm8, xmm9 and memory */\ + b0 = a0;\ + b3 = mm128_xor3( b3, a7, a1 ); \ + b1 = a1;\ + b6 = mm128_xor3( b6, a4, TEMP2 ); \ + b4 = mm128_xor3( b4, a0, TEMP2 ); \ + b7 = mm128_xor3( b7, a5, a3 ); \ + b5 = mm128_xor3( b5, a1, a3 ); \ + \ + /* compute x_i = t_i + t_{i+3} */\ + a0 = _mm_xor_si128(a0, a3);\ + a1 = _mm_xor_si128(a1, a4);\ + a2 = _mm_xor_si128(TEMP2, a5);\ + a3 = _mm_xor_si128(a3, a6);\ + a4 = _mm_xor_si128(a4, a7);\ + a5 = _mm_xor_si128(a5, b0);\ + a6 = _mm_xor_si128(a6, b1);\ + a7 = _mm_xor_si128(a7, TEMP2);\ + \ + /* compute z_i : double x_i using temp xmm8 and 1B xmm9 */\ + /* compute w_i : add y_{i+4} */\ + b1 = m128_const1_64( 0x1b1b1b1b1b1b1b1b );\ + MUL2(a0, b0, b1);\ + a0 = _mm_xor_si128(a0, TEMP0);\ + MUL2(a1, b0, b1);\ + a1 = _mm_xor_si128(a1, TEMP1);\ + MUL2(a2, b0, b1);\ + a2 = _mm_xor_si128(a2, b2);\ + MUL2(a3, b0, b1);\ + a3 = _mm_xor_si128(a3, b3);\ + MUL2(a4, b0, b1);\ + a4 = _mm_xor_si128(a4, b4);\ + MUL2(a5, b0, b1);\ + a5 = _mm_xor_si128(a5, b5);\ + MUL2(a6, b0, b1);\ + a6 = _mm_xor_si128(a6, b6);\ + MUL2(a7, b0, b1);\ + a7 = _mm_xor_si128(a7, b7);\ + \ + /* compute v_i : double w_i */\ + /* add to y_4 y_5 .. v3, v4, ... */\ + MUL2(a0, b0, b1);\ + b5 = _mm_xor_si128(b5, a0);\ + MUL2(a1, b0, b1);\ + b6 = _mm_xor_si128(b6, a1);\ + MUL2(a2, b0, b1);\ + b7 = _mm_xor_si128(b7, a2);\ + MUL2(a5, b0, b1);\ + b2 = _mm_xor_si128(b2, a5);\ + MUL2(a6, b0, b1);\ + b3 = _mm_xor_si128(b3, a6);\ + MUL2(a7, b0, b1);\ + b4 = _mm_xor_si128(b4, a7);\ + MUL2(a3, b0, b1);\ + MUL2(a4, b0, b1);\ + b0 = TEMP0;\ + b1 = TEMP1;\ + b0 = _mm_xor_si128(b0, a3);\ + b1 = _mm_xor_si128(b1, a4);\ +}/*MixBytes*/ + +#else + #define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\ /* t_i = a_i + a_{i+1} */\ b6 = a0;\ @@ -189,6 +277,8 @@ static const __m128i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003 }; b1 = _mm_xor_si128(b1, a4);\ }/*MixBytes*/ +#endif + /* one round * a0-a7 = input rows diff --git a/algo/groestl/aes_ni/groestl256-intr-aes.h b/algo/groestl/aes_ni/groestl256-intr-aes.h index 61c1b7b0..a8e76747 100644 --- a/algo/groestl/aes_ni/groestl256-intr-aes.h +++ b/algo/groestl/aes_ni/groestl256-intr-aes.h @@ -58,11 +58,9 @@ static const __m128i SUBSH_MASK7 = { 0x090c000306080b07, 0x02050f0a0d01040e }; * xmm[j] will be lost * xmm[k] has to be all 0x1b */ #define MUL2(i, j, k){\ - j = _mm_xor_si128(j, j);\ - j = _mm_cmpgt_epi8(j, i);\ + j = _mm_cmpgt_epi8( m128_zero, i);\ i = _mm_add_epi8(i, i);\ - j = _mm_and_si128(j, k);\ - i = _mm_xor_si128(i, j);\ + i = mm128_xorand(i, j, k );\ } /* Yet another implementation of MixBytes. @@ -82,6 +80,96 @@ static const __m128i SUBSH_MASK7 = { 0x090c000306080b07, 0x02050f0a0d01040e }; We almost fit into 16 registers, need only 3 spills to memory. 
This implementation costs 7.7 c/b giving total speed on SNB: 10.7c/b. K. Matusiewicz, 2011/05/29 */ + +#if defined(__AVX512VL__) + +#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\ + /* t_i = a_i + a_{i+1} */\ + b6 = a0;\ + b7 = a1;\ + a0 = _mm_xor_si128(a0, a1);\ + b0 = a2;\ + a1 = _mm_xor_si128(a1, a2);\ + b1 = a3;\ + TEMP2 = _mm_xor_si128(a2, a3);\ + b2 = a4;\ + a3 = _mm_xor_si128(a3, a4);\ + b3 = a5;\ + a4 = _mm_xor_si128(a4, a5);\ + b4 = a6;\ + a5 = _mm_xor_si128(a5, a6);\ + b5 = a7;\ + a6 = _mm_xor_si128(a6, a7);\ + a7 = _mm_xor_si128(a7, b6);\ + \ + /* build y4 y5 y6 ... in regs xmm8, xmm9, xmm10 by adding t_i*/\ + TEMP0 = mm128_xor3( b0, a4, a6 ); \ + /* spill values y_4, y_5 to memory */\ + TEMP1 = mm128_xor3( b1, a5, a7 );\ + b2 = mm128_xor3( b2, a6, a0 ); \ + /* save values t0, t1, t2 to xmm8, xmm9 and memory */\ + b0 = a0;\ + b3 = mm128_xor3( b3, a7, a1 ); \ + b1 = a1;\ + b6 = mm128_xor3( b6, a4, TEMP2 ); \ + b4 = mm128_xor3( b4, a0, TEMP2 ); \ + b7 = mm128_xor3( b7, a5, a3 ); \ + b5 = mm128_xor3( b5, a1, a3 ); \ + \ + /* compute x_i = t_i + t_{i+3} */\ + a0 = _mm_xor_si128(a0, a3);\ + a1 = _mm_xor_si128(a1, a4);\ + a2 = _mm_xor_si128(TEMP2, a5);\ + a3 = _mm_xor_si128(a3, a6);\ + a4 = _mm_xor_si128(a4, a7);\ + a5 = _mm_xor_si128(a5, b0);\ + a6 = _mm_xor_si128(a6, b1);\ + a7 = _mm_xor_si128(a7, TEMP2);\ + \ + /* compute z_i : double x_i using temp xmm8 and 1B xmm9 */\ + /* compute w_i : add y_{i+4} */\ + b1 = m128_const1_64( 0x1b1b1b1b1b1b1b1b );\ + MUL2(a0, b0, b1);\ + a0 = _mm_xor_si128(a0, TEMP0);\ + MUL2(a1, b0, b1);\ + a1 = _mm_xor_si128(a1, TEMP1);\ + MUL2(a2, b0, b1);\ + a2 = _mm_xor_si128(a2, b2);\ + MUL2(a3, b0, b1);\ + a3 = _mm_xor_si128(a3, b3);\ + MUL2(a4, b0, b1);\ + a4 = _mm_xor_si128(a4, b4);\ + MUL2(a5, b0, b1);\ + a5 = _mm_xor_si128(a5, b5);\ + MUL2(a6, b0, b1);\ + a6 = _mm_xor_si128(a6, b6);\ + MUL2(a7, b0, b1);\ + a7 = _mm_xor_si128(a7, b7);\ + \ + /* compute v_i : double w_i */\ + /* add to y_4 y_5 .. v3, v4, ... */\ + MUL2(a0, b0, b1);\ + b5 = _mm_xor_si128(b5, a0);\ + MUL2(a1, b0, b1);\ + b6 = _mm_xor_si128(b6, a1);\ + MUL2(a2, b0, b1);\ + b7 = _mm_xor_si128(b7, a2);\ + MUL2(a5, b0, b1);\ + b2 = _mm_xor_si128(b2, a5);\ + MUL2(a6, b0, b1);\ + b3 = _mm_xor_si128(b3, a6);\ + MUL2(a7, b0, b1);\ + b4 = _mm_xor_si128(b4, a7);\ + MUL2(a3, b0, b1);\ + MUL2(a4, b0, b1);\ + b0 = TEMP0;\ + b1 = TEMP1;\ + b0 = _mm_xor_si128(b0, a3);\ + b1 = _mm_xor_si128(b1, a4);\ +}/*MixBytes*/ + +#else + #define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\ /* t_i = a_i + a_{i+1} */\ b6 = a0;\ @@ -178,6 +266,8 @@ static const __m128i SUBSH_MASK7 = { 0x090c000306080b07, 0x02050f0a0d01040e }; b1 = _mm_xor_si128(b1, a4);\ }/*MixBytes*/ +#endif + /* one round * i = round number * a0-a7 = input rows diff --git a/algo/groestl/groestl256-intr-4way.h b/algo/groestl/groestl256-intr-4way.h index 25d91713..ff62a1c3 100644 --- a/algo/groestl/groestl256-intr-4way.h +++ b/algo/groestl/groestl256-intr-4way.h @@ -96,11 +96,9 @@ static const __m512i SUBSH_MASK7 = { 0x090c000306080b07, 0x02050f0a0d01040e, * xmm[j] will be lost * xmm[k] has to be all 0x1b */ #define MUL2(i, j, k){\ - j = _mm512_xor_si512(j, j);\ - j = _mm512_movm_epi8( _mm512_cmpgt_epi8_mask(j, i) );\ + j = _mm512_movm_epi8( _mm512_cmpgt_epi8_mask( m512_zero, i) );\ i = _mm512_add_epi8(i, i);\ - j = _mm512_and_si512(j, k);\ - i = _mm512_xor_si512(i, j);\ + i = mm512_xorand( i, j, k );\ } /* Yet another implementation of MixBytes. 
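// A minimal scalar reference for what MUL2 above computes in each byte lane:
// doubling in GF(2^8) with the AES/Groestl reduction polynomial 0x1b. The
// mask j marks the bytes whose top bit is set, i+i is the left shift, and
// xor-ing (j & 0x1b) applies the reduction. An illustrative sketch assuming
// only that field arithmetic; the vector code processes 16/32/64 bytes at
// once, and the rewrite fuses the trailing and+xor into a single
// ternary-logic operation via mm512_xorand.
static inline unsigned char gf256_mul2_ref( unsigned char x )
{
   unsigned char mask = ( x & 0x80 ) ? 0x1b : 0x00;   // reduce only if the MSB is set
   return (unsigned char)( ( x << 1 ) ^ mask );       // double, then reduce
}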
@@ -120,6 +118,95 @@ static const __m512i SUBSH_MASK7 = { 0x090c000306080b07, 0x02050f0a0d01040e, We almost fit into 16 registers, need only 3 spills to memory. This implementation costs 7.7 c/b giving total speed on SNB: 10.7c/b. K. Matusiewicz, 2011/05/29 */ + +#define MixBytes( a0, a1, a2, a3, a4, a5, a6, a7, \ + b0, b1, b2, b3, b4, b5, b6, b7) { \ + /* t_i = a_i + a_{i+1} */\ + b6 = a0; \ + b7 = a1; \ + a0 = _mm512_xor_si512( a0, a1 ); \ + b0 = a2; \ + a1 = _mm512_xor_si512( a1, a2 ); \ + b1 = a3; \ + TEMP2 = _mm512_xor_si512( a2, a3 ); \ + b2 = a4; \ + a3 = _mm512_xor_si512( a3, a4 ); \ + b3 = a5; \ + a4 = _mm512_xor_si512( a4, a5 );\ + b4 = a6; \ + a5 = _mm512_xor_si512( a5, a6 ); \ + b5 = a7; \ + a6 = _mm512_xor_si512( a6, a7 ); \ + a7 = _mm512_xor_si512( a7, b6 ); \ + \ + /* build y4 y5 y6 ... in regs xmm8, xmm9, xmm10 by adding t_i*/\ + TEMP0 = mm512_xor3( b0, a4, a6 ); \ + /* spill values y_4, y_5 to memory */\ + TEMP1 = mm512_xor3( b1, a5, a7 ); \ + b2 = mm512_xor3( b2, a6, a0 ); \ + /* save values t0, t1, t2 to xmm8, xmm9 and memory */\ + b0 = a0; \ + b3 = mm512_xor3( b3, a7, a1 ); \ + b1 = a1; \ + b6 = mm512_xor3( b6, a4, TEMP2 ); \ + b4 = mm512_xor3( b4, a0, TEMP2 ); \ + b7 = mm512_xor3( b7, a5, a3 ); \ + b5 = mm512_xor3( b5, a1, a3 ); \ + \ + /* compute x_i = t_i + t_{i+3} */\ + a0 = _mm512_xor_si512( a0, a3 ); \ + a1 = _mm512_xor_si512( a1, a4 ); \ + a2 = _mm512_xor_si512( TEMP2, a5 ); \ + a3 = _mm512_xor_si512( a3, a6 ); \ + a4 = _mm512_xor_si512( a4, a7 ); \ + a5 = _mm512_xor_si512( a5, b0 ); \ + a6 = _mm512_xor_si512( a6, b1 ); \ + a7 = _mm512_xor_si512( a7, TEMP2 ); \ + \ + /* compute z_i : double x_i using temp xmm8 and 1B xmm9 */\ + /* compute w_i : add y_{i+4} */\ + b1 = m512_const1_64( 0x1b1b1b1b1b1b1b1b ); \ + MUL2( a0, b0, b1 ); \ + a0 = _mm512_xor_si512( a0, TEMP0 ); \ + MUL2( a1, b0, b1 ); \ + a1 = _mm512_xor_si512( a1, TEMP1 ); \ + MUL2( a2, b0, b1 ); \ + a2 = _mm512_xor_si512( a2, b2 ); \ + MUL2( a3, b0, b1 ); \ + a3 = _mm512_xor_si512( a3, b3 ); \ + MUL2( a4, b0, b1 ); \ + a4 = _mm512_xor_si512( a4, b4 ); \ + MUL2( a5, b0, b1 ); \ + a5 = _mm512_xor_si512( a5, b5 ); \ + MUL2( a6, b0, b1 ); \ + a6 = _mm512_xor_si512( a6, b6 ); \ + MUL2( a7, b0, b1 ); \ + a7 = _mm512_xor_si512( a7, b7 ); \ + \ + /* compute v_i : double w_i */\ + /* add to y_4 y_5 .. v3, v4, ... 
*/\ + MUL2( a0, b0, b1 ); \ + b5 = _mm512_xor_si512( b5, a0 ); \ + MUL2( a1, b0, b1 ); \ + b6 = _mm512_xor_si512( b6, a1 ); \ + MUL2( a2, b0, b1 ); \ + b7 = _mm512_xor_si512( b7, a2 ); \ + MUL2( a5, b0, b1 ); \ + b2 = _mm512_xor_si512( b2, a5 ); \ + MUL2( a6, b0, b1 ); \ + b3 = _mm512_xor_si512( b3, a6 ); \ + MUL2( a7, b0, b1 ); \ + b4 = _mm512_xor_si512( b4, a7 ); \ + MUL2( a3, b0, b1 ); \ + MUL2( a4, b0, b1 ); \ + b0 = TEMP0;\ + b1 = TEMP1;\ + b0 = _mm512_xor_si512( b0, a3 ); \ + b1 = _mm512_xor_si512( b1, a4 ); \ +}/*MixBytes*/ + + +#if 0 #define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\ /* t_i = a_i + a_{i+1} */\ b6 = a0;\ @@ -215,7 +302,7 @@ static const __m512i SUBSH_MASK7 = { 0x090c000306080b07, 0x02050f0a0d01040e, b0 = _mm512_xor_si512(b0, a3);\ b1 = _mm512_xor_si512(b1, a4);\ }/*MixBytes*/ - +#endif #define ROUND(i, a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\ /* AddRoundConstant */\ diff --git a/algo/groestl/groestl512-intr-4way.h b/algo/groestl/groestl512-intr-4way.h index 5d8d7155..354e0187 100644 --- a/algo/groestl/groestl512-intr-4way.h +++ b/algo/groestl/groestl512-intr-4way.h @@ -104,11 +104,9 @@ static const __m512i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003, * xmm[j] will be lost * xmm[k] has to be all 0x1b */ #define MUL2(i, j, k){\ - j = _mm512_xor_si512(j, j);\ - j = _mm512_movm_epi8( _mm512_cmpgt_epi8_mask(j, i) );\ + j = _mm512_movm_epi8( _mm512_cmpgt_epi8_mask( m512_zero, i) );\ i = _mm512_add_epi8(i, i);\ - j = _mm512_and_si512(j, k);\ - i = _mm512_xor_si512(i, j);\ + i = mm512_xorand( i, j, k );\ } /**/ @@ -130,100 +128,90 @@ static const __m512i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003, We almost fit into 16 registers, need only 3 spills to memory. This implementation costs 7.7 c/b giving total speed on SNB: 10.7c/b. K. Matusiewicz, 2011/05/29 */ -#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\ +#define MixBytes( a0, a1, a2, a3, a4, a5, a6, a7, \ + b0, b1, b2, b3, b4, b5, b6, b7) { \ /* t_i = a_i + a_{i+1} */\ - b6 = a0;\ - b7 = a1;\ - a0 = _mm512_xor_si512(a0, a1);\ - b0 = a2;\ - a1 = _mm512_xor_si512(a1, a2);\ - b1 = a3;\ - a2 = _mm512_xor_si512(a2, a3);\ - b2 = a4;\ - a3 = _mm512_xor_si512(a3, a4);\ - b3 = a5;\ - a4 = _mm512_xor_si512(a4, a5);\ - b4 = a6;\ - a5 = _mm512_xor_si512(a5, a6);\ - b5 = a7;\ - a6 = _mm512_xor_si512(a6, a7);\ - a7 = _mm512_xor_si512(a7, b6);\ + b6 = a0; \ + b7 = a1; \ + a0 = _mm512_xor_si512( a0, a1 ); \ + b0 = a2; \ + a1 = _mm512_xor_si512( a1, a2 ); \ + b1 = a3; \ + TEMP2 = _mm512_xor_si512( a2, a3 ); \ + b2 = a4; \ + a3 = _mm512_xor_si512( a3, a4 ); \ + b3 = a5; \ + a4 = _mm512_xor_si512( a4, a5 );\ + b4 = a6; \ + a5 = _mm512_xor_si512( a5, a6 ); \ + b5 = a7; \ + a6 = _mm512_xor_si512( a6, a7 ); \ + a7 = _mm512_xor_si512( a7, b6 ); \ \ /* build y4 y5 y6 ... 
in regs xmm8, xmm9, xmm10 by adding t_i*/\ - b0 = _mm512_xor_si512(b0, a4);\ - b6 = _mm512_xor_si512(b6, a4);\ - b1 = _mm512_xor_si512(b1, a5);\ - b7 = _mm512_xor_si512(b7, a5);\ - b2 = _mm512_xor_si512(b2, a6);\ - b0 = _mm512_xor_si512(b0, a6);\ + TEMP0 = mm512_xor3( b0, a4, a6 ); \ /* spill values y_4, y_5 to memory */\ - TEMP0 = b0;\ - b3 = _mm512_xor_si512(b3, a7);\ - b1 = _mm512_xor_si512(b1, a7);\ - TEMP1 = b1;\ - b4 = _mm512_xor_si512(b4, a0);\ - b2 = _mm512_xor_si512(b2, a0);\ + TEMP1 = mm512_xor3( b1, a5, a7 ); \ + b2 = mm512_xor3( b2, a6, a0 ); \ /* save values t0, t1, t2 to xmm8, xmm9 and memory */\ - b0 = a0;\ - b5 = _mm512_xor_si512(b5, a1);\ - b3 = _mm512_xor_si512(b3, a1);\ - b1 = a1;\ - b6 = _mm512_xor_si512(b6, a2);\ - b4 = _mm512_xor_si512(b4, a2);\ - TEMP2 = a2;\ - b7 = _mm512_xor_si512(b7, a3);\ - b5 = _mm512_xor_si512(b5, a3);\ + b0 = a0; \ + b3 = mm512_xor3( b3, a7, a1 ); \ + b1 = a1; \ + b6 = mm512_xor3( b6, a4, TEMP2 ); \ + b4 = mm512_xor3( b4, a0, TEMP2 ); \ + b7 = mm512_xor3( b7, a5, a3 ); \ + b5 = mm512_xor3( b5, a1, a3 ); \ \ /* compute x_i = t_i + t_{i+3} */\ - a0 = _mm512_xor_si512(a0, a3);\ - a1 = _mm512_xor_si512(a1, a4);\ - a2 = _mm512_xor_si512(a2, a5);\ - a3 = _mm512_xor_si512(a3, a6);\ - a4 = _mm512_xor_si512(a4, a7);\ - a5 = _mm512_xor_si512(a5, b0);\ - a6 = _mm512_xor_si512(a6, b1);\ - a7 = _mm512_xor_si512(a7, TEMP2);\ + a0 = _mm512_xor_si512( a0, a3 ); \ + a1 = _mm512_xor_si512( a1, a4 ); \ + a2 = _mm512_xor_si512( TEMP2, a5 ); \ + a3 = _mm512_xor_si512( a3, a6 ); \ + a4 = _mm512_xor_si512( a4, a7 ); \ + a5 = _mm512_xor_si512( a5, b0 ); \ + a6 = _mm512_xor_si512( a6, b1 ); \ + a7 = _mm512_xor_si512( a7, TEMP2 ); \ \ /* compute z_i : double x_i using temp xmm8 and 1B xmm9 */\ /* compute w_i : add y_{i+4} */\ - b1 = m512_const1_64( 0x1b1b1b1b1b1b1b1b );\ - MUL2(a0, b0, b1);\ - a0 = _mm512_xor_si512(a0, TEMP0);\ - MUL2(a1, b0, b1);\ - a1 = _mm512_xor_si512(a1, TEMP1);\ - MUL2(a2, b0, b1);\ - a2 = _mm512_xor_si512(a2, b2);\ - MUL2(a3, b0, b1);\ - a3 = _mm512_xor_si512(a3, b3);\ - MUL2(a4, b0, b1);\ - a4 = _mm512_xor_si512(a4, b4);\ - MUL2(a5, b0, b1);\ - a5 = _mm512_xor_si512(a5, b5);\ - MUL2(a6, b0, b1);\ - a6 = _mm512_xor_si512(a6, b6);\ - MUL2(a7, b0, b1);\ - a7 = _mm512_xor_si512(a7, b7);\ + b1 = m512_const1_64( 0x1b1b1b1b1b1b1b1b ); \ + MUL2( a0, b0, b1 ); \ + a0 = _mm512_xor_si512( a0, TEMP0 ); \ + MUL2( a1, b0, b1 ); \ + a1 = _mm512_xor_si512( a1, TEMP1 ); \ + MUL2( a2, b0, b1 ); \ + a2 = _mm512_xor_si512( a2, b2 ); \ + MUL2( a3, b0, b1 ); \ + a3 = _mm512_xor_si512( a3, b3 ); \ + MUL2( a4, b0, b1 ); \ + a4 = _mm512_xor_si512( a4, b4 ); \ + MUL2( a5, b0, b1 ); \ + a5 = _mm512_xor_si512( a5, b5 ); \ + MUL2( a6, b0, b1 ); \ + a6 = _mm512_xor_si512( a6, b6 ); \ + MUL2( a7, b0, b1 ); \ + a7 = _mm512_xor_si512( a7, b7 ); \ \ /* compute v_i : double w_i */\ /* add to y_4 y_5 .. v3, v4, ... 
*/\ - MUL2(a0, b0, b1);\ - b5 = _mm512_xor_si512(b5, a0);\ - MUL2(a1, b0, b1);\ - b6 = _mm512_xor_si512(b6, a1);\ - MUL2(a2, b0, b1);\ - b7 = _mm512_xor_si512(b7, a2);\ - MUL2(a5, b0, b1);\ - b2 = _mm512_xor_si512(b2, a5);\ - MUL2(a6, b0, b1);\ - b3 = _mm512_xor_si512(b3, a6);\ - MUL2(a7, b0, b1);\ - b4 = _mm512_xor_si512(b4, a7);\ - MUL2(a3, b0, b1);\ - MUL2(a4, b0, b1);\ + MUL2( a0, b0, b1 ); \ + b5 = _mm512_xor_si512( b5, a0 ); \ + MUL2( a1, b0, b1 ); \ + b6 = _mm512_xor_si512( b6, a1 ); \ + MUL2( a2, b0, b1 ); \ + b7 = _mm512_xor_si512( b7, a2 ); \ + MUL2( a5, b0, b1 ); \ + b2 = _mm512_xor_si512( b2, a5 ); \ + MUL2( a6, b0, b1 ); \ + b3 = _mm512_xor_si512( b3, a6 ); \ + MUL2( a7, b0, b1 ); \ + b4 = _mm512_xor_si512( b4, a7 ); \ + MUL2( a3, b0, b1 ); \ + MUL2( a4, b0, b1 ); \ b0 = TEMP0;\ b1 = TEMP1;\ - b0 = _mm512_xor_si512(b0, a3);\ - b1 = _mm512_xor_si512(b1, a4);\ + b0 = _mm512_xor_si512( b0, a3 ); \ + b1 = _mm512_xor_si512( b1, a4 ); \ }/*MixBytes*/ /* one round @@ -709,11 +697,9 @@ static const __m256i SUBSH_MASK7_2WAY = * xmm[j] will be lost * xmm[k] has to be all 0x1b */ #define MUL2_2WAY(i, j, k){\ - j = _mm256_xor_si256(j, j);\ - j = _mm256_cmpgt_epi8(j, i );\ + j = _mm256_cmpgt_epi8( m256_zero, i );\ i = _mm256_add_epi8(i, i);\ - j = _mm256_and_si256(j, k);\ - i = _mm256_xor_si256(i, j);\ + i = mm256_xorand( i, j, k );\ } #define MixBytes_2way(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\ diff --git a/algo/groestl/myrgr-4way.c b/algo/groestl/myrgr-4way.c index 9fca48bf..c9f558cc 100644 --- a/algo/groestl/myrgr-4way.c +++ b/algo/groestl/myrgr-4way.c @@ -44,6 +44,7 @@ void myriad_8way_hash( void *output, const void *input ) rintrlv_8x64_4x128( vhashA, vhashB, input, 640 ); groestl512_4way_update_close( &ctx.groestl, vhashA, vhashA, 640 ); + memcpy( &ctx.groestl, &myrgr_8way_ctx.groestl, sizeof(groestl512_4way_context) ); groestl512_4way_update_close( &ctx.groestl, vhashB, vhashB, 640 ); uint32_t hash0[20] __attribute__ ((aligned (64))); @@ -58,8 +59,6 @@ void myriad_8way_hash( void *output, const void *input ) // rintrlv_4x128_8x32( vhash, vhashA, vhashB, 512 ); dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA ); dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB ); - intrlv_8x32_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, - hash6, hash7 ); #else @@ -76,27 +75,27 @@ void myriad_8way_hash( void *output, const void *input ) hash4, hash5, hash6, hash7, input, 640 ); update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 640 ); - memcpy( &ctx.groestl, &myrgr_4way_ctx.groestl, sizeof(hashState_groestl) ); + memcpy( &ctx.groestl, &myrgr_8way_ctx.groestl, sizeof(hashState_groestl) ); update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 640 ); - memcpy( &ctx.groestl, &myrgr_4way_ctx.groestl, sizeof(hashState_groestl) ); + memcpy( &ctx.groestl, &myrgr_8way_ctx.groestl, sizeof(hashState_groestl) ); update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 640 ); - memcpy( &ctx.groestl, &myrgr_4way_ctx.groestl, sizeof(hashState_groestl) ); + memcpy( &ctx.groestl, &myrgr_8way_ctx.groestl, sizeof(hashState_groestl) ); update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 640 ); - memcpy( &ctx.groestl, &myrgr_4way_ctx.groestl, sizeof(hashState_groestl) ); + memcpy( &ctx.groestl, &myrgr_8way_ctx.groestl, sizeof(hashState_groestl) ); update_and_final_groestl( &ctx.groestl, (char*)hash4, (char*)hash4, 640 ); - memcpy( &ctx.groestl, &myrgr_4way_ctx.groestl, sizeof(hashState_groestl) ); + memcpy( 
&ctx.groestl, &myrgr_8way_ctx.groestl, sizeof(hashState_groestl) ); update_and_final_groestl( &ctx.groestl, (char*)hash5, (char*)hash5, 640 ); - memcpy( &ctx.groestl, &myrgr_4way_ctx.groestl, sizeof(hashState_groestl) ); + memcpy( &ctx.groestl, &myrgr_8way_ctx.groestl, sizeof(hashState_groestl) ); update_and_final_groestl( &ctx.groestl, (char*)hash6, (char*)hash6, 640 ); - memcpy( &ctx.groestl, &myrgr_4way_ctx.groestl, sizeof(hashState_groestl) ); + memcpy( &ctx.groestl, &myrgr_8way_ctx.groestl, sizeof(hashState_groestl) ); update_and_final_groestl( &ctx.groestl, (char*)hash7, (char*)hash7, 640 ); - memcpy( &ctx.groestl, &myrgr_4way_ctx.groestl, sizeof(hashState_groestl) ); - - intrlv_8x32( vhash, hash0, hash1, hash2, hash3, - hash4, hash5, hash6, hash7, 512 ); + memcpy( &ctx.groestl, &myrgr_8way_ctx.groestl, sizeof(hashState_groestl) ); #endif + intrlv_8x32_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, + hash6, hash7 ); + sha256_8way_update( &ctx.sha, vhash, 64 ); sha256_8way_close( &ctx.sha, output ); } diff --git a/algo/sha/hmac-sha256-hash.c b/algo/sha/hmac-sha256-hash.c index 3c2f4d20..e09a4c2a 100644 --- a/algo/sha/hmac-sha256-hash.c +++ b/algo/sha/hmac-sha256-hash.c @@ -39,17 +39,10 @@ void SHA256_Buf( const void * in, size_t len, uint8_t digest[32] ) { -#if defined(HMAC_SPH_SHA) sph_sha256_context ctx; sph_sha256_init( &ctx ); sph_sha256( &ctx, in, len ); sph_sha256_close( &ctx, digest ); -#else - SHA256_CTX ctx; - SHA256_Init( &ctx ); - SHA256_Update( &ctx, in, len ); - SHA256_Final( digest, &ctx ); -#endif } /** @@ -79,51 +72,29 @@ HMAC_SHA256_Init( HMAC_SHA256_CTX *ctx, const void *_K, size_t Klen ) /* If Klen > 64, the key is really SHA256(K). */ if ( Klen > 64 ) { - -#if defined(HMAC_SPH_SHA) sph_sha256_init( &ctx->ictx ); sph_sha256( &ctx->ictx, K, Klen ); sph_sha256_close( &ctx->ictx, khash ); -#else - SHA256_Init( &ctx->ictx ); - SHA256_Update( &ctx->ictx, K, Klen ); - SHA256_Final( khash, &ctx->ictx ); -#endif - K = khash; - Klen = 32; + + K = khash; + Klen = 32; } /* Inner SHA256 operation is SHA256(K xor [block of 0x36] || data). */ -#if defined(HMAC_SPH_SHA) sph_sha256_init( &ctx->ictx ); -#else - SHA256_Init( &ctx->ictx ); -#endif for ( i = 0; i < Klen; i++ ) pad[i] = K[i] ^ 0x36; memset( pad + Klen, 0x36, 64 - Klen ); -#if defined(HMAC_SPH_SHA) sph_sha256( &ctx->ictx, pad, 64 ); -#else - SHA256_Update( &ctx->ictx, pad, 64 ); -#endif /* Outer SHA256 operation is SHA256(K xor [block of 0x5c] || hash). */ -#if defined(HMAC_SPH_SHA) sph_sha256_init( &ctx->octx ); -#else - SHA256_Init( &ctx->octx ); -#endif for ( i = 0; i < Klen; i++ ) pad[i] = K[i] ^ 0x5c; memset( pad + Klen, 0x5c, 64 - Klen ); -#if defined(HMAC_SPH_SHA) sph_sha256( &ctx->octx, pad, 64 ); -#else - SHA256_Update( &ctx->octx, pad, 64 ); -#endif } /* Add bytes to the HMAC-SHA256 operation. */ @@ -131,11 +102,7 @@ void HMAC_SHA256_Update( HMAC_SHA256_CTX *ctx, const void *in, size_t len ) { /* Feed data to the inner SHA256 operation. */ -#if defined(HMAC_SPH_SHA) sph_sha256( &ctx->ictx, in, len ); -#else - SHA256_Update( &ctx->ictx, in, len ); -#endif } /* Finish an HMAC-SHA256 operation. */ @@ -144,20 +111,9 @@ HMAC_SHA256_Final( unsigned char digest[32], HMAC_SHA256_CTX *ctx ) { unsigned char ihash[32]; -#if defined(HMAC_SPH_SHA) sph_sha256_close( &ctx->ictx, ihash ); sph_sha256( &ctx->octx, ihash, 32 ); sph_sha256_close( &ctx->octx, digest ); -#else - /* Finish the inner SHA256 operation. */ - SHA256_Final( ihash, &ctx->ictx ); - - /* Feed the inner hash to the outer SHA256 operation. 
*/ - SHA256_Update( &ctx->octx, ihash, 32 ); - - /* Finish the outer SHA256 operation. */ - SHA256_Final( digest, &ctx->octx ); -#endif } /** diff --git a/algo/sha/hmac-sha256-hash.h b/algo/sha/hmac-sha256-hash.h index 41e5673a..a735c53a 100644 --- a/algo/sha/hmac-sha256-hash.h +++ b/algo/sha/hmac-sha256-hash.h @@ -29,24 +29,14 @@ #ifndef HMAC_SHA256_H__ #define HMAC_SHA256_H__ -//#define HMAC_SSL_SHA 1 -#define HMAC_SPH_SHA 1 - #include #include #include "sph_sha2.h" -#include - typedef struct HMAC_SHA256Context { -#if defined(HMAC_SPH_SHA) sph_sha256_context ictx; sph_sha256_context octx; -#else - SHA256_CTX ictx; - SHA256_CTX octx; -#endif } HMAC_SHA256_CTX; void SHA256_Buf( const void *, size_t len, uint8_t digest[32] ); diff --git a/algo/sha/sph_sha2.c b/algo/sha/sph_sha2.c index 5e70c3e8..7e399548 100644 --- a/algo/sha/sph_sha2.c +++ b/algo/sha/sph_sha2.c @@ -41,7 +41,7 @@ #define CH(X, Y, Z) ((((Y) ^ (Z)) & (X)) ^ (Z)) //#define MAJ(X, Y, Z) (((Y) & (Z)) | (((Y) | (Z)) & (X))) -#define MAJ( X, Y, Z ) ( Y ^ ( ( X ^ Y ) & ( Y ^ Z ) ) ) +#define MAJ( X, Y, Z ) ( Y ^ ( ( X_xor_Y = X ^ Y ) & ( Y_xor_Z ) ) ) #define ROTR SPH_ROTR32 #define BSG2_0(x) (ROTR(x, 2) ^ ROTR(x, 13) ^ ROTR(x, 22)) @@ -319,6 +319,7 @@ static const sph_u32 K[64] = { t1 = SPH_T32(h + BSG2_1(e) + CH(e, f, g) \ + K[pcount + (pc)] + W[(pc) & 0x0F]); \ t2 = SPH_T32(BSG2_0(a) + MAJ(a, b, c)); \ + Y_xor_Z = X_xor_Y; \ d = SPH_T32(d + t1); \ h = SPH_T32(t1 + t2); \ } while (0) @@ -329,7 +330,7 @@ static const sph_u32 K[64] = { SHA2_STEPn(2, a, b, c, d, e, f, g, h, in, pc) #define SHA2_ROUND_BODY(in, r) do { \ - sph_u32 A, B, C, D, E, F, G, H; \ + sph_u32 A, B, C, D, E, F, G, H, X_xor_Y, Y_xor_Z; \ sph_u32 W[16]; \ unsigned pcount; \ \ @@ -342,6 +343,7 @@ static const sph_u32 K[64] = { G = (r)[6]; \ H = (r)[7]; \ pcount = 0; \ + Y_xor_Z = B ^ C; \ SHA2_STEP1(A, B, C, D, E, F, G, H, in, 0); \ SHA2_STEP1(H, A, B, C, D, E, F, G, in, 1); \ SHA2_STEP1(G, H, A, B, C, D, E, F, in, 2); \ @@ -389,7 +391,7 @@ static const sph_u32 K[64] = { #else // large footprint (default) #define SHA2_ROUND_BODY(in, r) do { \ - sph_u32 A, B, C, D, E, F, G, H, T1, T2; \ + sph_u32 A, B, C, D, E, F, G, H, T1, T2, X_xor_Y, Y_xor_Z;; \ sph_u32 W00, W01, W02, W03, W04, W05, W06, W07; \ sph_u32 W08, W09, W10, W11, W12, W13, W14, W15; \ \ @@ -401,388 +403,453 @@ static const sph_u32 K[64] = { F = (r)[5]; \ G = (r)[6]; \ H = (r)[7]; \ + Y_xor_Z = B ^ C; \ W00 = in(0); \ T1 = SPH_T32(H + BSG2_1(E) + CH(E, F, G) \ + SPH_C32(0x428A2F98) + W00); \ T2 = SPH_T32(BSG2_0(A) + MAJ(A, B, C)); \ + Y_xor_Z = X_xor_Y; \ D = SPH_T32(D + T1); \ H = SPH_T32(T1 + T2); \ W01 = in(1); \ T1 = SPH_T32(G + BSG2_1(D) + CH(D, E, F) \ + SPH_C32(0x71374491) + W01); \ T2 = SPH_T32(BSG2_0(H) + MAJ(H, A, B)); \ + Y_xor_Z = X_xor_Y; \ C = SPH_T32(C + T1); \ G = SPH_T32(T1 + T2); \ W02 = in(2); \ T1 = SPH_T32(F + BSG2_1(C) + CH(C, D, E) \ + SPH_C32(0xB5C0FBCF) + W02); \ T2 = SPH_T32(BSG2_0(G) + MAJ(G, H, A)); \ + Y_xor_Z = X_xor_Y; \ B = SPH_T32(B + T1); \ F = SPH_T32(T1 + T2); \ W03 = in(3); \ T1 = SPH_T32(E + BSG2_1(B) + CH(B, C, D) \ + SPH_C32(0xE9B5DBA5) + W03); \ T2 = SPH_T32(BSG2_0(F) + MAJ(F, G, H)); \ + Y_xor_Z = X_xor_Y; \ A = SPH_T32(A + T1); \ E = SPH_T32(T1 + T2); \ W04 = in(4); \ T1 = SPH_T32(D + BSG2_1(A) + CH(A, B, C) \ + SPH_C32(0x3956C25B) + W04); \ T2 = SPH_T32(BSG2_0(E) + MAJ(E, F, G)); \ + Y_xor_Z = X_xor_Y; \ H = SPH_T32(H + T1); \ D = SPH_T32(T1 + T2); \ W05 = in(5); \ T1 = SPH_T32(C + BSG2_1(H) + CH(H, A, B) \ + SPH_C32(0x59F111F1) + W05); \ T2 = SPH_T32(BSG2_0(D) 
+ MAJ(D, E, F)); \ + Y_xor_Z = X_xor_Y; \ G = SPH_T32(G + T1); \ C = SPH_T32(T1 + T2); \ W06 = in(6); \ T1 = SPH_T32(B + BSG2_1(G) + CH(G, H, A) \ + SPH_C32(0x923F82A4) + W06); \ T2 = SPH_T32(BSG2_0(C) + MAJ(C, D, E)); \ + Y_xor_Z = X_xor_Y; \ F = SPH_T32(F + T1); \ B = SPH_T32(T1 + T2); \ W07 = in(7); \ T1 = SPH_T32(A + BSG2_1(F) + CH(F, G, H) \ + SPH_C32(0xAB1C5ED5) + W07); \ T2 = SPH_T32(BSG2_0(B) + MAJ(B, C, D)); \ + Y_xor_Z = X_xor_Y; \ E = SPH_T32(E + T1); \ A = SPH_T32(T1 + T2); \ W08 = in(8); \ T1 = SPH_T32(H + BSG2_1(E) + CH(E, F, G) \ + SPH_C32(0xD807AA98) + W08); \ T2 = SPH_T32(BSG2_0(A) + MAJ(A, B, C)); \ + Y_xor_Z = X_xor_Y; \ D = SPH_T32(D + T1); \ H = SPH_T32(T1 + T2); \ W09 = in(9); \ T1 = SPH_T32(G + BSG2_1(D) + CH(D, E, F) \ + SPH_C32(0x12835B01) + W09); \ T2 = SPH_T32(BSG2_0(H) + MAJ(H, A, B)); \ + Y_xor_Z = X_xor_Y; \ C = SPH_T32(C + T1); \ G = SPH_T32(T1 + T2); \ W10 = in(10); \ T1 = SPH_T32(F + BSG2_1(C) + CH(C, D, E) \ + SPH_C32(0x243185BE) + W10); \ T2 = SPH_T32(BSG2_0(G) + MAJ(G, H, A)); \ + Y_xor_Z = X_xor_Y; \ B = SPH_T32(B + T1); \ F = SPH_T32(T1 + T2); \ W11 = in(11); \ T1 = SPH_T32(E + BSG2_1(B) + CH(B, C, D) \ + SPH_C32(0x550C7DC3) + W11); \ T2 = SPH_T32(BSG2_0(F) + MAJ(F, G, H)); \ + Y_xor_Z = X_xor_Y; \ A = SPH_T32(A + T1); \ E = SPH_T32(T1 + T2); \ W12 = in(12); \ T1 = SPH_T32(D + BSG2_1(A) + CH(A, B, C) \ + SPH_C32(0x72BE5D74) + W12); \ T2 = SPH_T32(BSG2_0(E) + MAJ(E, F, G)); \ + Y_xor_Z = X_xor_Y; \ H = SPH_T32(H + T1); \ D = SPH_T32(T1 + T2); \ W13 = in(13); \ T1 = SPH_T32(C + BSG2_1(H) + CH(H, A, B) \ + SPH_C32(0x80DEB1FE) + W13); \ T2 = SPH_T32(BSG2_0(D) + MAJ(D, E, F)); \ + Y_xor_Z = X_xor_Y; \ G = SPH_T32(G + T1); \ C = SPH_T32(T1 + T2); \ W14 = in(14); \ T1 = SPH_T32(B + BSG2_1(G) + CH(G, H, A) \ + SPH_C32(0x9BDC06A7) + W14); \ T2 = SPH_T32(BSG2_0(C) + MAJ(C, D, E)); \ + Y_xor_Z = X_xor_Y; \ F = SPH_T32(F + T1); \ B = SPH_T32(T1 + T2); \ W15 = in(15); \ T1 = SPH_T32(A + BSG2_1(F) + CH(F, G, H) \ + SPH_C32(0xC19BF174) + W15); \ T2 = SPH_T32(BSG2_0(B) + MAJ(B, C, D)); \ + Y_xor_Z = X_xor_Y; \ E = SPH_T32(E + T1); \ A = SPH_T32(T1 + T2); \ W00 = SPH_T32(SSG2_1(W14) + W09 + SSG2_0(W01) + W00); \ T1 = SPH_T32(H + BSG2_1(E) + CH(E, F, G) \ + SPH_C32(0xE49B69C1) + W00); \ T2 = SPH_T32(BSG2_0(A) + MAJ(A, B, C)); \ + Y_xor_Z = X_xor_Y; \ D = SPH_T32(D + T1); \ H = SPH_T32(T1 + T2); \ W01 = SPH_T32(SSG2_1(W15) + W10 + SSG2_0(W02) + W01); \ T1 = SPH_T32(G + BSG2_1(D) + CH(D, E, F) \ + SPH_C32(0xEFBE4786) + W01); \ T2 = SPH_T32(BSG2_0(H) + MAJ(H, A, B)); \ + Y_xor_Z = X_xor_Y; \ C = SPH_T32(C + T1); \ G = SPH_T32(T1 + T2); \ W02 = SPH_T32(SSG2_1(W00) + W11 + SSG2_0(W03) + W02); \ T1 = SPH_T32(F + BSG2_1(C) + CH(C, D, E) \ + SPH_C32(0x0FC19DC6) + W02); \ T2 = SPH_T32(BSG2_0(G) + MAJ(G, H, A)); \ + Y_xor_Z = X_xor_Y; \ B = SPH_T32(B + T1); \ F = SPH_T32(T1 + T2); \ W03 = SPH_T32(SSG2_1(W01) + W12 + SSG2_0(W04) + W03); \ T1 = SPH_T32(E + BSG2_1(B) + CH(B, C, D) \ + SPH_C32(0x240CA1CC) + W03); \ T2 = SPH_T32(BSG2_0(F) + MAJ(F, G, H)); \ + Y_xor_Z = X_xor_Y; \ A = SPH_T32(A + T1); \ E = SPH_T32(T1 + T2); \ W04 = SPH_T32(SSG2_1(W02) + W13 + SSG2_0(W05) + W04); \ T1 = SPH_T32(D + BSG2_1(A) + CH(A, B, C) \ + SPH_C32(0x2DE92C6F) + W04); \ T2 = SPH_T32(BSG2_0(E) + MAJ(E, F, G)); \ + Y_xor_Z = X_xor_Y; \ H = SPH_T32(H + T1); \ D = SPH_T32(T1 + T2); \ W05 = SPH_T32(SSG2_1(W03) + W14 + SSG2_0(W06) + W05); \ T1 = SPH_T32(C + BSG2_1(H) + CH(H, A, B) \ + SPH_C32(0x4A7484AA) + W05); \ T2 = SPH_T32(BSG2_0(D) + MAJ(D, E, F)); \ + Y_xor_Z = X_xor_Y; \ G = SPH_T32(G + T1); \ C = 
SPH_T32(T1 + T2); \ W06 = SPH_T32(SSG2_1(W04) + W15 + SSG2_0(W07) + W06); \ T1 = SPH_T32(B + BSG2_1(G) + CH(G, H, A) \ + SPH_C32(0x5CB0A9DC) + W06); \ T2 = SPH_T32(BSG2_0(C) + MAJ(C, D, E)); \ + Y_xor_Z = X_xor_Y; \ F = SPH_T32(F + T1); \ B = SPH_T32(T1 + T2); \ W07 = SPH_T32(SSG2_1(W05) + W00 + SSG2_0(W08) + W07); \ T1 = SPH_T32(A + BSG2_1(F) + CH(F, G, H) \ + SPH_C32(0x76F988DA) + W07); \ T2 = SPH_T32(BSG2_0(B) + MAJ(B, C, D)); \ + Y_xor_Z = X_xor_Y; \ E = SPH_T32(E + T1); \ A = SPH_T32(T1 + T2); \ W08 = SPH_T32(SSG2_1(W06) + W01 + SSG2_0(W09) + W08); \ T1 = SPH_T32(H + BSG2_1(E) + CH(E, F, G) \ + SPH_C32(0x983E5152) + W08); \ T2 = SPH_T32(BSG2_0(A) + MAJ(A, B, C)); \ + Y_xor_Z = X_xor_Y; \ D = SPH_T32(D + T1); \ H = SPH_T32(T1 + T2); \ W09 = SPH_T32(SSG2_1(W07) + W02 + SSG2_0(W10) + W09); \ T1 = SPH_T32(G + BSG2_1(D) + CH(D, E, F) \ + SPH_C32(0xA831C66D) + W09); \ T2 = SPH_T32(BSG2_0(H) + MAJ(H, A, B)); \ + Y_xor_Z = X_xor_Y; \ C = SPH_T32(C + T1); \ G = SPH_T32(T1 + T2); \ W10 = SPH_T32(SSG2_1(W08) + W03 + SSG2_0(W11) + W10); \ T1 = SPH_T32(F + BSG2_1(C) + CH(C, D, E) \ + SPH_C32(0xB00327C8) + W10); \ T2 = SPH_T32(BSG2_0(G) + MAJ(G, H, A)); \ + Y_xor_Z = X_xor_Y; \ B = SPH_T32(B + T1); \ F = SPH_T32(T1 + T2); \ W11 = SPH_T32(SSG2_1(W09) + W04 + SSG2_0(W12) + W11); \ T1 = SPH_T32(E + BSG2_1(B) + CH(B, C, D) \ + SPH_C32(0xBF597FC7) + W11); \ T2 = SPH_T32(BSG2_0(F) + MAJ(F, G, H)); \ + Y_xor_Z = X_xor_Y; \ A = SPH_T32(A + T1); \ E = SPH_T32(T1 + T2); \ W12 = SPH_T32(SSG2_1(W10) + W05 + SSG2_0(W13) + W12); \ T1 = SPH_T32(D + BSG2_1(A) + CH(A, B, C) \ + SPH_C32(0xC6E00BF3) + W12); \ T2 = SPH_T32(BSG2_0(E) + MAJ(E, F, G)); \ + Y_xor_Z = X_xor_Y; \ H = SPH_T32(H + T1); \ D = SPH_T32(T1 + T2); \ W13 = SPH_T32(SSG2_1(W11) + W06 + SSG2_0(W14) + W13); \ T1 = SPH_T32(C + BSG2_1(H) + CH(H, A, B) \ + SPH_C32(0xD5A79147) + W13); \ T2 = SPH_T32(BSG2_0(D) + MAJ(D, E, F)); \ + Y_xor_Z = X_xor_Y; \ G = SPH_T32(G + T1); \ C = SPH_T32(T1 + T2); \ W14 = SPH_T32(SSG2_1(W12) + W07 + SSG2_0(W15) + W14); \ T1 = SPH_T32(B + BSG2_1(G) + CH(G, H, A) \ + SPH_C32(0x06CA6351) + W14); \ T2 = SPH_T32(BSG2_0(C) + MAJ(C, D, E)); \ + Y_xor_Z = X_xor_Y; \ F = SPH_T32(F + T1); \ B = SPH_T32(T1 + T2); \ W15 = SPH_T32(SSG2_1(W13) + W08 + SSG2_0(W00) + W15); \ T1 = SPH_T32(A + BSG2_1(F) + CH(F, G, H) \ + SPH_C32(0x14292967) + W15); \ T2 = SPH_T32(BSG2_0(B) + MAJ(B, C, D)); \ + Y_xor_Z = X_xor_Y; \ E = SPH_T32(E + T1); \ A = SPH_T32(T1 + T2); \ W00 = SPH_T32(SSG2_1(W14) + W09 + SSG2_0(W01) + W00); \ T1 = SPH_T32(H + BSG2_1(E) + CH(E, F, G) \ + SPH_C32(0x27B70A85) + W00); \ T2 = SPH_T32(BSG2_0(A) + MAJ(A, B, C)); \ + Y_xor_Z = X_xor_Y; \ D = SPH_T32(D + T1); \ H = SPH_T32(T1 + T2); \ W01 = SPH_T32(SSG2_1(W15) + W10 + SSG2_0(W02) + W01); \ T1 = SPH_T32(G + BSG2_1(D) + CH(D, E, F) \ + SPH_C32(0x2E1B2138) + W01); \ T2 = SPH_T32(BSG2_0(H) + MAJ(H, A, B)); \ + Y_xor_Z = X_xor_Y; \ C = SPH_T32(C + T1); \ G = SPH_T32(T1 + T2); \ W02 = SPH_T32(SSG2_1(W00) + W11 + SSG2_0(W03) + W02); \ T1 = SPH_T32(F + BSG2_1(C) + CH(C, D, E) \ + SPH_C32(0x4D2C6DFC) + W02); \ T2 = SPH_T32(BSG2_0(G) + MAJ(G, H, A)); \ + Y_xor_Z = X_xor_Y; \ B = SPH_T32(B + T1); \ F = SPH_T32(T1 + T2); \ W03 = SPH_T32(SSG2_1(W01) + W12 + SSG2_0(W04) + W03); \ T1 = SPH_T32(E + BSG2_1(B) + CH(B, C, D) \ + SPH_C32(0x53380D13) + W03); \ T2 = SPH_T32(BSG2_0(F) + MAJ(F, G, H)); \ + Y_xor_Z = X_xor_Y; \ A = SPH_T32(A + T1); \ E = SPH_T32(T1 + T2); \ W04 = SPH_T32(SSG2_1(W02) + W13 + SSG2_0(W05) + W04); \ T1 = SPH_T32(D + BSG2_1(A) + CH(A, B, C) \ + SPH_C32(0x650A7354) + W04); \ 
T2 = SPH_T32(BSG2_0(E) + MAJ(E, F, G)); \ + Y_xor_Z = X_xor_Y; \ H = SPH_T32(H + T1); \ D = SPH_T32(T1 + T2); \ W05 = SPH_T32(SSG2_1(W03) + W14 + SSG2_0(W06) + W05); \ T1 = SPH_T32(C + BSG2_1(H) + CH(H, A, B) \ + SPH_C32(0x766A0ABB) + W05); \ T2 = SPH_T32(BSG2_0(D) + MAJ(D, E, F)); \ + Y_xor_Z = X_xor_Y; \ G = SPH_T32(G + T1); \ C = SPH_T32(T1 + T2); \ W06 = SPH_T32(SSG2_1(W04) + W15 + SSG2_0(W07) + W06); \ T1 = SPH_T32(B + BSG2_1(G) + CH(G, H, A) \ + SPH_C32(0x81C2C92E) + W06); \ T2 = SPH_T32(BSG2_0(C) + MAJ(C, D, E)); \ + Y_xor_Z = X_xor_Y; \ F = SPH_T32(F + T1); \ B = SPH_T32(T1 + T2); \ W07 = SPH_T32(SSG2_1(W05) + W00 + SSG2_0(W08) + W07); \ T1 = SPH_T32(A + BSG2_1(F) + CH(F, G, H) \ + SPH_C32(0x92722C85) + W07); \ T2 = SPH_T32(BSG2_0(B) + MAJ(B, C, D)); \ + Y_xor_Z = X_xor_Y; \ E = SPH_T32(E + T1); \ A = SPH_T32(T1 + T2); \ W08 = SPH_T32(SSG2_1(W06) + W01 + SSG2_0(W09) + W08); \ T1 = SPH_T32(H + BSG2_1(E) + CH(E, F, G) \ + SPH_C32(0xA2BFE8A1) + W08); \ T2 = SPH_T32(BSG2_0(A) + MAJ(A, B, C)); \ + Y_xor_Z = X_xor_Y; \ D = SPH_T32(D + T1); \ H = SPH_T32(T1 + T2); \ W09 = SPH_T32(SSG2_1(W07) + W02 + SSG2_0(W10) + W09); \ T1 = SPH_T32(G + BSG2_1(D) + CH(D, E, F) \ + SPH_C32(0xA81A664B) + W09); \ T2 = SPH_T32(BSG2_0(H) + MAJ(H, A, B)); \ + Y_xor_Z = X_xor_Y; \ C = SPH_T32(C + T1); \ G = SPH_T32(T1 + T2); \ W10 = SPH_T32(SSG2_1(W08) + W03 + SSG2_0(W11) + W10); \ T1 = SPH_T32(F + BSG2_1(C) + CH(C, D, E) \ + SPH_C32(0xC24B8B70) + W10); \ T2 = SPH_T32(BSG2_0(G) + MAJ(G, H, A)); \ + Y_xor_Z = X_xor_Y; \ B = SPH_T32(B + T1); \ F = SPH_T32(T1 + T2); \ W11 = SPH_T32(SSG2_1(W09) + W04 + SSG2_0(W12) + W11); \ T1 = SPH_T32(E + BSG2_1(B) + CH(B, C, D) \ + SPH_C32(0xC76C51A3) + W11); \ T2 = SPH_T32(BSG2_0(F) + MAJ(F, G, H)); \ + Y_xor_Z = X_xor_Y; \ A = SPH_T32(A + T1); \ E = SPH_T32(T1 + T2); \ W12 = SPH_T32(SSG2_1(W10) + W05 + SSG2_0(W13) + W12); \ T1 = SPH_T32(D + BSG2_1(A) + CH(A, B, C) \ + SPH_C32(0xD192E819) + W12); \ T2 = SPH_T32(BSG2_0(E) + MAJ(E, F, G)); \ + Y_xor_Z = X_xor_Y; \ H = SPH_T32(H + T1); \ D = SPH_T32(T1 + T2); \ W13 = SPH_T32(SSG2_1(W11) + W06 + SSG2_0(W14) + W13); \ T1 = SPH_T32(C + BSG2_1(H) + CH(H, A, B) \ + SPH_C32(0xD6990624) + W13); \ T2 = SPH_T32(BSG2_0(D) + MAJ(D, E, F)); \ + Y_xor_Z = X_xor_Y; \ G = SPH_T32(G + T1); \ C = SPH_T32(T1 + T2); \ W14 = SPH_T32(SSG2_1(W12) + W07 + SSG2_0(W15) + W14); \ T1 = SPH_T32(B + BSG2_1(G) + CH(G, H, A) \ + SPH_C32(0xF40E3585) + W14); \ T2 = SPH_T32(BSG2_0(C) + MAJ(C, D, E)); \ + Y_xor_Z = X_xor_Y; \ F = SPH_T32(F + T1); \ B = SPH_T32(T1 + T2); \ W15 = SPH_T32(SSG2_1(W13) + W08 + SSG2_0(W00) + W15); \ T1 = SPH_T32(A + BSG2_1(F) + CH(F, G, H) \ + SPH_C32(0x106AA070) + W15); \ T2 = SPH_T32(BSG2_0(B) + MAJ(B, C, D)); \ + Y_xor_Z = X_xor_Y; \ E = SPH_T32(E + T1); \ A = SPH_T32(T1 + T2); \ W00 = SPH_T32(SSG2_1(W14) + W09 + SSG2_0(W01) + W00); \ T1 = SPH_T32(H + BSG2_1(E) + CH(E, F, G) \ + SPH_C32(0x19A4C116) + W00); \ T2 = SPH_T32(BSG2_0(A) + MAJ(A, B, C)); \ + Y_xor_Z = X_xor_Y; \ D = SPH_T32(D + T1); \ H = SPH_T32(T1 + T2); \ W01 = SPH_T32(SSG2_1(W15) + W10 + SSG2_0(W02) + W01); \ T1 = SPH_T32(G + BSG2_1(D) + CH(D, E, F) \ + SPH_C32(0x1E376C08) + W01); \ T2 = SPH_T32(BSG2_0(H) + MAJ(H, A, B)); \ + Y_xor_Z = X_xor_Y; \ C = SPH_T32(C + T1); \ G = SPH_T32(T1 + T2); \ W02 = SPH_T32(SSG2_1(W00) + W11 + SSG2_0(W03) + W02); \ T1 = SPH_T32(F + BSG2_1(C) + CH(C, D, E) \ + SPH_C32(0x2748774C) + W02); \ T2 = SPH_T32(BSG2_0(G) + MAJ(G, H, A)); \ + Y_xor_Z = X_xor_Y; \ B = SPH_T32(B + T1); \ F = SPH_T32(T1 + T2); \ W03 = SPH_T32(SSG2_1(W01) + W12 + 
SSG2_0(W04) + W03); \ T1 = SPH_T32(E + BSG2_1(B) + CH(B, C, D) \ + SPH_C32(0x34B0BCB5) + W03); \ T2 = SPH_T32(BSG2_0(F) + MAJ(F, G, H)); \ + Y_xor_Z = X_xor_Y; \ A = SPH_T32(A + T1); \ E = SPH_T32(T1 + T2); \ W04 = SPH_T32(SSG2_1(W02) + W13 + SSG2_0(W05) + W04); \ T1 = SPH_T32(D + BSG2_1(A) + CH(A, B, C) \ + SPH_C32(0x391C0CB3) + W04); \ T2 = SPH_T32(BSG2_0(E) + MAJ(E, F, G)); \ + Y_xor_Z = X_xor_Y; \ H = SPH_T32(H + T1); \ D = SPH_T32(T1 + T2); \ W05 = SPH_T32(SSG2_1(W03) + W14 + SSG2_0(W06) + W05); \ T1 = SPH_T32(C + BSG2_1(H) + CH(H, A, B) \ + SPH_C32(0x4ED8AA4A) + W05); \ T2 = SPH_T32(BSG2_0(D) + MAJ(D, E, F)); \ + Y_xor_Z = X_xor_Y; \ G = SPH_T32(G + T1); \ C = SPH_T32(T1 + T2); \ W06 = SPH_T32(SSG2_1(W04) + W15 + SSG2_0(W07) + W06); \ T1 = SPH_T32(B + BSG2_1(G) + CH(G, H, A) \ + SPH_C32(0x5B9CCA4F) + W06); \ T2 = SPH_T32(BSG2_0(C) + MAJ(C, D, E)); \ + Y_xor_Z = X_xor_Y; \ F = SPH_T32(F + T1); \ B = SPH_T32(T1 + T2); \ W07 = SPH_T32(SSG2_1(W05) + W00 + SSG2_0(W08) + W07); \ T1 = SPH_T32(A + BSG2_1(F) + CH(F, G, H) \ + SPH_C32(0x682E6FF3) + W07); \ T2 = SPH_T32(BSG2_0(B) + MAJ(B, C, D)); \ + Y_xor_Z = X_xor_Y; \ E = SPH_T32(E + T1); \ A = SPH_T32(T1 + T2); \ W08 = SPH_T32(SSG2_1(W06) + W01 + SSG2_0(W09) + W08); \ T1 = SPH_T32(H + BSG2_1(E) + CH(E, F, G) \ + SPH_C32(0x748F82EE) + W08); \ T2 = SPH_T32(BSG2_0(A) + MAJ(A, B, C)); \ + Y_xor_Z = X_xor_Y; \ D = SPH_T32(D + T1); \ H = SPH_T32(T1 + T2); \ W09 = SPH_T32(SSG2_1(W07) + W02 + SSG2_0(W10) + W09); \ T1 = SPH_T32(G + BSG2_1(D) + CH(D, E, F) \ + SPH_C32(0x78A5636F) + W09); \ T2 = SPH_T32(BSG2_0(H) + MAJ(H, A, B)); \ + Y_xor_Z = X_xor_Y; \ C = SPH_T32(C + T1); \ G = SPH_T32(T1 + T2); \ W10 = SPH_T32(SSG2_1(W08) + W03 + SSG2_0(W11) + W10); \ T1 = SPH_T32(F + BSG2_1(C) + CH(C, D, E) \ + SPH_C32(0x84C87814) + W10); \ T2 = SPH_T32(BSG2_0(G) + MAJ(G, H, A)); \ + Y_xor_Z = X_xor_Y; \ B = SPH_T32(B + T1); \ F = SPH_T32(T1 + T2); \ W11 = SPH_T32(SSG2_1(W09) + W04 + SSG2_0(W12) + W11); \ T1 = SPH_T32(E + BSG2_1(B) + CH(B, C, D) \ + SPH_C32(0x8CC70208) + W11); \ T2 = SPH_T32(BSG2_0(F) + MAJ(F, G, H)); \ + Y_xor_Z = X_xor_Y; \ A = SPH_T32(A + T1); \ E = SPH_T32(T1 + T2); \ W12 = SPH_T32(SSG2_1(W10) + W05 + SSG2_0(W13) + W12); \ T1 = SPH_T32(D + BSG2_1(A) + CH(A, B, C) \ + SPH_C32(0x90BEFFFA) + W12); \ T2 = SPH_T32(BSG2_0(E) + MAJ(E, F, G)); \ + Y_xor_Z = X_xor_Y; \ H = SPH_T32(H + T1); \ D = SPH_T32(T1 + T2); \ W13 = SPH_T32(SSG2_1(W11) + W06 + SSG2_0(W14) + W13); \ T1 = SPH_T32(C + BSG2_1(H) + CH(H, A, B) \ + SPH_C32(0xA4506CEB) + W13); \ T2 = SPH_T32(BSG2_0(D) + MAJ(D, E, F)); \ + Y_xor_Z = X_xor_Y; \ G = SPH_T32(G + T1); \ C = SPH_T32(T1 + T2); \ W14 = SPH_T32(SSG2_1(W12) + W07 + SSG2_0(W15) + W14); \ T1 = SPH_T32(B + BSG2_1(G) + CH(G, H, A) \ + SPH_C32(0xBEF9A3F7) + W14); \ T2 = SPH_T32(BSG2_0(C) + MAJ(C, D, E)); \ + Y_xor_Z = X_xor_Y; \ F = SPH_T32(F + T1); \ B = SPH_T32(T1 + T2); \ W15 = SPH_T32(SSG2_1(W13) + W08 + SSG2_0(W00) + W15); \ T1 = SPH_T32(A + BSG2_1(F) + CH(F, G, H) \ + SPH_C32(0xC67178F2) + W15); \ T2 = SPH_T32(BSG2_0(B) + MAJ(B, C, D)); \ + Y_xor_Z = X_xor_Y; \ E = SPH_T32(E + T1); \ A = SPH_T32(T1 + T2); \ (r)[0] = SPH_T32((r)[0] + A); \ diff --git a/algo/shavite/shavite-hash-4way.c b/algo/shavite/shavite-hash-4way.c index 2b0b7353..2c93df96 100644 --- a/algo/shavite/shavite-hash-4way.c +++ b/algo/shavite/shavite-hash-4way.c @@ -23,6 +23,8 @@ c512_4way( shavite512_4way_context *ctx, const void *msg ) register __m512i K0, K1, K2, K3, K4, K5, K6, K7; __m512i *M = (__m512i*)msg; __m512i *H = (__m512i*)ctx->h; + const __m512i count 
= _mm512_set4_epi32( ctx->count3, ctx->count2, + ctx->count1, ctx->count0 ); int r; P0 = H[0]; @@ -62,16 +64,16 @@ c512_4way( shavite512_4way_context *ctx, const void *msg ) _mm512_aesenc_epi128( K0, m512_zero ) ) ); if ( r == 0 ) - K0 = _mm512_xor_si512( K0, _mm512_set4_epi32( - ~ctx->count3, ctx->count2, ctx->count1, ctx->count0 ) ); + K0 = _mm512_xor_si512( K0, + _mm512_mask_xor_epi32( count, 0x8888, count, m512_neg1 ) ); X = _mm512_aesenc_epi128( _mm512_xor_si512( P0, K0 ), m512_zero ); K1 = _mm512_xor_si512( K0, mm512_ror128_32( _mm512_aesenc_epi128( K1, m512_zero ) ) ); if ( r == 1 ) - K1 = _mm512_xor_si512( K1, _mm512_set4_epi32( - ~ctx->count0, ctx->count1, ctx->count2, ctx->count3 ) ); + K1 = _mm512_xor_si512( K1, mm512_ror128_32( + _mm512_mask_xor_epi32( count, 0x1111, count, m512_neg1 ) ) ); X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K1 ), m512_zero ); K2 = _mm512_xor_si512( K1, @@ -96,8 +98,8 @@ c512_4way( shavite512_4way_context *ctx, const void *msg ) mm512_ror128_32( _mm512_aesenc_epi128( K7, m512_zero ) ) ); if ( r == 2 ) - K7 = _mm512_xor_si512( K7, _mm512_set4_epi32( - ~ctx->count1, ctx->count0, ctx->count3, ctx->count2 ) ); + K7 = _mm512_xor_si512( K7, mm512_swap128_64( + _mm512_mask_xor_epi32( count, 0x2222, count, m512_neg1 ) ) ); X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K7 ), m512_zero ); P1 = _mm512_xor_si512( P1, X ); diff --git a/algo/shavite/sph-shavite-aesni.c b/algo/shavite/sph-shavite-aesni.c index e047d778..a593cf55 100644 --- a/algo/shavite/sph-shavite-aesni.c +++ b/algo/shavite/sph-shavite-aesni.c @@ -101,15 +101,6 @@ c512( sph_shavite_big_context *sc, const void *msg ) // round -// working proof of concept -/* - __m512i K = m512_const1_128( m[0] ); - __m512i X = _mm512_xor_si512( m512_const1_128( p1 ), K ); - X = _mm512_aesenc_epi128( X, m512_zero ); - k00 = _mm512_castsi512_si128( K ); - x = _mm512_castsi512_si128( X ); -*/ - k00 = m[0]; x = _mm_xor_si128( p1, k00 ); x = _mm_aesenc_si128( x, zero ); diff --git a/algo/simd/simd-hash-2way.c b/algo/simd/simd-hash-2way.c index f2652f35..856a07f7 100644 --- a/algo/simd/simd-hash-2way.c +++ b/algo/simd/simd-hash-2way.c @@ -747,11 +747,6 @@ void rounds512_4way( uint32_t *state, const uint8_t *msg, uint16_t *fft ) static const m512_v16 code[] = { c1_16_512(185), c1_16_512(233) }; - -// static const m512_v16 code[] = { c1_16(185), c1_16(233), -// c1_16(185), c1_16(233) }; - - S0l = _mm512_xor_si512( S[0], M[0] ); S0h = _mm512_xor_si512( S[1], M[1] ); S1l = _mm512_xor_si512( S[2], M[2] ); @@ -764,11 +759,16 @@ void rounds512_4way( uint32_t *state, const uint8_t *msg, uint16_t *fft ) // targetted, local macros don't need a unique name #define S(i) S##i +#define F_0( B, C, D ) _mm512_ternarylogic_epi32( B, C, D, 0xca ) +#define F_1( B, C, D ) _mm512_ternarylogic_epi32( B, C, D, 0xe8 ) + +/* #define F_0(B, C, D) \ _mm512_xor_si512( _mm512_and_si512( _mm512_xor_si512( C,D ), B ), D ) #define F_1(B, C, D) \ _mm512_or_si512( _mm512_and_si512( D, C ),\ _mm512_and_si512( _mm512_or_si512( D,C ), B ) ) +*/ #define Fl(a,b,c,fun) F_##fun (a##l,b##l,c##l) #define Fh(a,b,c,fun) F_##fun (a##h,b##h,c##h) diff --git a/algo/simd/vector.c b/algo/simd/vector.c index 12692db6..60f0cc76 100644 --- a/algo/simd/vector.c +++ b/algo/simd/vector.c @@ -6,10 +6,6 @@ #define PRINT_SOME 0 -/* JDD all ocurrances of macro X in this file renamed to XX - * due to name conflict - */ - int SupportedLength(int hashbitlen) { if (hashbitlen <= 0 || hashbitlen > 512) return 0; diff --git a/algo/yespower/yespower-opt.c 
b/algo/yespower/yespower-opt.c index e21e4f17..fd16c241 100644 --- a/algo/yespower/yespower-opt.c +++ b/algo/yespower/yespower-opt.c @@ -373,6 +373,45 @@ static inline void salsa20(salsa20_blk_t *restrict B, #define INTEGERIFY (uint32_t)X.d[0] #endif +// AVX512 ternary logic optimization +#if defined(__AVX512VL__) + +#define XOR_X_XOR_X( in1, in2 ) \ + X0 = _mm_ternarylogic_epi32( X0, (in1).q[0], (in2).q[0], 0x96 ); \ + X1 = _mm_ternarylogic_epi32( X1, (in1).q[1], (in2).q[1], 0x96 ); \ + X2 = _mm_ternarylogic_epi32( X2, (in1).q[2], (in2).q[2], 0x96 ); \ + X3 = _mm_ternarylogic_epi32( X3, (in1).q[3], (in2).q[3], 0x96 ); + +#define XOR_X_2_XOR_X( in1, in2, in3 ) \ + X0 = _mm_ternarylogic_epi32( (in1).q[0], (in2).q[0], (in3).q[0], 0x96 ); \ + X1 = _mm_ternarylogic_epi32( (in1).q[1], (in2).q[1], (in3).q[1], 0x96 ); \ + X2 = _mm_ternarylogic_epi32( (in1).q[2], (in2).q[2], (in3).q[2], 0x96 ); \ + X3 = _mm_ternarylogic_epi32( (in1).q[3], (in2).q[3], (in3).q[3], 0x96 ); + +#define XOR_X_SALSA20_XOR_MEM( in1, in2, out) \ + X0 = _mm_ternarylogic_epi32( X0, (in1).q[0], (in2).q[0], 0x96 ); \ + X1 = _mm_ternarylogic_epi32( X1, (in1).q[1], (in2).q[1], 0x96 ); \ + X2 = _mm_ternarylogic_epi32( X2, (in1).q[2], (in2).q[2], 0x96 ); \ + X3 = _mm_ternarylogic_epi32( X3, (in1).q[3], (in2).q[3], 0x96 ); \ + SALSA20(out) + +#else + +#define XOR_X_XOR_X( in1, in2 ) \ + XOR_X( in1 ) \ + XOR_X( in2 ) + +#define XOR_X_2_XOR_X( in1, in2, in3 ) \ + XOR_X_2( in1, in2 ) \ + XOR_X( in3 ) + +#define XOR_X_SALSA20_XOR_MEM( in1, in2, out) \ + XOR_X(in1) \ + XOR_X(in2) \ + SALSA20( out ) + +#endif + /** * Apply the Salsa20 core to the block provided in X ^ in. */ @@ -406,11 +445,15 @@ static inline uint32_t blockmix_salsa_xor(const salsa20_blk_t *restrict Bin1, { DECL_X - XOR_X_2(Bin1[1], Bin2[1]) - XOR_X(Bin1[0]) + XOR_X_2_XOR_X( Bin1[1], Bin2[1], Bin1[0] ) +// XOR_X_2(Bin1[1], Bin2[1]) +// XOR_X(Bin1[0]) SALSA20_XOR_MEM(Bin2[0], Bout[0]) - XOR_X(Bin1[1]) - SALSA20_XOR_MEM(Bin2[1], Bout[1]) + +// Factor out the XOR from salsa20 to do a xor3 + XOR_X_SALSA20_XOR_MEM( Bin1[1], Bin2[1], Bout[1] ) +// XOR_X(Bin1[1]) +// SALSA20_XOR_MEM(Bin2[1], Bout[1]) return INTEGERIFY; } @@ -745,13 +788,15 @@ static uint32_t blockmix_xor(const salsa20_blk_t *restrict Bin1, i = 0; r--; do { - XOR_X(Bin1[i]) - XOR_X(Bin2[i]) + XOR_X_XOR_X( Bin1[i], Bin2[i] ) +// XOR_X(Bin1[i]) +// XOR_X(Bin2[i]) PWXFORM WRITE_X(Bout[i]) - XOR_X(Bin1[i + 1]) - XOR_X(Bin2[i + 1]) + XOR_X_XOR_X( Bin1[ i+1 ], Bin2[ i+1 ] ) +// XOR_X(Bin1[i + 1]) +// XOR_X(Bin2[i + 1]) PWXFORM if (unlikely(i >= r)) diff --git a/algo/yespower/yespower.h b/algo/yespower/yespower.h index c5b6d78a..260322a7 100644 --- a/algo/yespower/yespower.h +++ b/algo/yespower/yespower.h @@ -35,7 +35,6 @@ #include "miner.h" #include "simd-utils.h" #include "algo/sha/sph_sha2.h" -#include #ifdef __cplusplus extern "C" { diff --git a/build-allarch.sh b/build-allarch.sh index fa1d8666..c4d9ffd4 100755 --- a/build-allarch.sh +++ b/build-allarch.sh @@ -63,7 +63,7 @@ mv cpuminer cpuminer-avx # Westmere SSE4.2 AES make clean || echo clean rm -f config.status -CFLAGS="-O3 -march=westmere -Wall -fno-common" ./configure --with-curl +CFLAGS="-O3 -march=westmere -maes -Wall -fno-common" ./configure --with-curl make -j 8 strip -s cpuminer.exe mv cpuminer.exe cpuminer-aes-sse42.exe diff --git a/configure b/configure index 8382a1bb..7430186f 100755 --- a/configure +++ b/configure @@ -1,6 +1,6 @@ #! /bin/sh # Guess values for system-dependent variables and create Makefiles. 
-# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.17.0. +# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.17.1. # # # Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc. @@ -577,8 +577,8 @@ MAKEFLAGS= # Identity of this package. PACKAGE_NAME='cpuminer-opt' PACKAGE_TARNAME='cpuminer-opt' -PACKAGE_VERSION='3.17.0' -PACKAGE_STRING='cpuminer-opt 3.17.0' +PACKAGE_VERSION='3.17.1' +PACKAGE_STRING='cpuminer-opt 3.17.1' PACKAGE_BUGREPORT='' PACKAGE_URL='' @@ -1332,7 +1332,7 @@ if test "$ac_init_help" = "long"; then # Omit some internal or obsolete options to make the list less imposing. # This message is too long to be a string in the A/UX 3.1 sh. cat <<_ACEOF -\`configure' configures cpuminer-opt 3.17.0 to adapt to many kinds of systems. +\`configure' configures cpuminer-opt 3.17.1 to adapt to many kinds of systems. Usage: $0 [OPTION]... [VAR=VALUE]... @@ -1404,7 +1404,7 @@ fi if test -n "$ac_init_help"; then case $ac_init_help in - short | recursive ) echo "Configuration of cpuminer-opt 3.17.0:";; + short | recursive ) echo "Configuration of cpuminer-opt 3.17.1:";; esac cat <<\_ACEOF @@ -1509,7 +1509,7 @@ fi test -n "$ac_init_help" && exit $ac_status if $ac_init_version; then cat <<\_ACEOF -cpuminer-opt configure 3.17.0 +cpuminer-opt configure 3.17.1 generated by GNU Autoconf 2.69 Copyright (C) 2012 Free Software Foundation, Inc. @@ -2012,7 +2012,7 @@ cat >config.log <<_ACEOF This file contains any messages produced by compilers while running configure, to aid debugging if configure makes a mistake. -It was created by cpuminer-opt $as_me 3.17.0, which was +It was created by cpuminer-opt $as_me 3.17.1, which was generated by GNU Autoconf 2.69. Invocation command line was $ $0 $@ @@ -2993,7 +2993,7 @@ fi # Define the identity of the package. PACKAGE='cpuminer-opt' - VERSION='3.17.0' + VERSION='3.17.1' cat >>confdefs.h <<_ACEOF @@ -6690,7 +6690,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 # report actual input values of CONFIG_FILES etc. instead of their # values after options handling. ac_log=" -This file was extended by cpuminer-opt $as_me 3.17.0, which was +This file was extended by cpuminer-opt $as_me 3.17.1, which was generated by GNU Autoconf 2.69. 
Invocation command line was CONFIG_FILES = $CONFIG_FILES @@ -6756,7 +6756,7 @@ _ACEOF cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`" ac_cs_version="\\ -cpuminer-opt config.status 3.17.0 +cpuminer-opt config.status 3.17.1 configured by $0, generated by GNU Autoconf 2.69, with options \\"\$ac_cs_config\\" diff --git a/configure.ac b/configure.ac index f5612ef9..332d1e68 100644 --- a/configure.ac +++ b/configure.ac @@ -1,4 +1,4 @@ -AC_INIT([cpuminer-opt], [3.17.0]) +AC_INIT([cpuminer-opt], [3.17.1]) AC_PREREQ([2.59c]) AC_CANONICAL_SYSTEM diff --git a/cpu-miner.c b/cpu-miner.c index 9b723766..e9c01fe6 100644 --- a/cpu-miner.c +++ b/cpu-miner.c @@ -1054,6 +1054,8 @@ void report_summary_log( bool force ) applog( LOG_NOTICE,"CPU temp: curr %s max %d, Freq: %.3f/%.3f GHz", tempstr, hi_temp, lo_freq / 1e6, hi_freq / 1e6 ); if ( curr_temp > hi_temp ) hi_temp = curr_temp; + if ( ( opt_max_temp > 0.0 ) && ( curr_temp > opt_max_temp ) ) + restart_threads(); prev_temp = curr_temp; } } @@ -2856,7 +2858,6 @@ static bool cpu_capability( bool display_only ) bool algo_has_vaes256 = set_incl( VAES256_OPT, algo_features ); bool use_aes; bool use_sse2; - bool use_sse42; bool use_avx2; bool use_avx512; bool use_sha; @@ -2930,13 +2931,14 @@ static bool cpu_capability( bool display_only ) if ( algo_features == EMPTY_SET ) printf( " None" ); else { - if ( algo_has_avx512 ) printf( " AVX512" ); - else if ( algo_has_avx2 ) printf( " AVX2 " ); - else if ( algo_has_sse42 ) printf( " SSE4.2" ); - else if ( algo_has_sse2 ) printf( " SSE2 " ); - if ( algo_has_vaes ) printf( " VAES" ); - else if ( algo_has_aes ) printf( " AES" ); - if ( algo_has_sha ) printf( " SHA" ); + if ( algo_has_avx512 ) printf( " AVX512" ); + else if ( algo_has_avx2 ) printf( " AVX2 " ); + else if ( algo_has_sse42 ) printf( " SSE4.2" ); + else if ( algo_has_sse2 ) printf( " SSE2 " ); + if ( algo_has_vaes || + algo_has_vaes256 ) printf( " VAES" ); + else if ( algo_has_aes ) printf( " AES" ); + if ( algo_has_sha ) printf( " SHA" ); } printf("\n"); @@ -2972,13 +2974,12 @@ static bool cpu_capability( bool display_only ) // Determine mining options use_sse2 = cpu_has_sse2 && algo_has_sse2; use_aes = cpu_has_aes && sw_has_aes && algo_has_aes; - use_sse42 = cpu_has_sse42 && sw_has_sse42 && algo_has_sse42; use_avx2 = cpu_has_avx2 && sw_has_avx2 && algo_has_avx2; use_avx512 = cpu_has_avx512 && sw_has_avx512 && algo_has_avx512; use_sha = cpu_has_sha && sw_has_sha && algo_has_sha; - use_vaes = cpu_has_vaes && sw_has_vaes && algo_has_vaes - && ( use_avx512 || algo_has_vaes256 ); - use_none = !( use_sse2 || use_aes || use_sse42 || use_avx512 || use_avx2 || + use_vaes = cpu_has_vaes && sw_has_vaes && ( algo_has_vaes + || algo_has_vaes256 ); + use_none = !( use_sse2 || use_aes || use_avx512 || use_avx2 || use_sha || use_vaes ); // Display best options @@ -2988,7 +2989,6 @@ static bool cpu_capability( bool display_only ) { if ( use_avx512 ) printf( " AVX512" ); else if ( use_avx2 ) printf( " AVX2" ); - else if ( use_sse42 ) printf( " SSE4.2" ); else if ( use_sse2 ) printf( " SSE2" ); if ( use_vaes ) printf( " VAES" ); else if ( use_aes ) printf( " AES" ); diff --git a/simd-utils/simd-128.h b/simd-utils/simd-128.h index e166b14d..90066f09 100644 --- a/simd-utils/simd-128.h +++ b/simd-utils/simd-128.h @@ -237,6 +237,25 @@ static inline void memset_128( __m128i *dst, const __m128i a, const int n ) static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n ) { for ( 
int i = 0; i < n; i ++ ) dst[i] = src[i]; } +#if defined(__AVX512VL__) + +// a ^ b ^ c +#define mm128_xor3( a, b, c ) \ + _mm_ternarylogic_epi64( a, b, c, 0x96 ) + +// a ^ ( b & c ) +#define mm128_xorand( a, b, c ) \ + _mm_ternarylogic_epi64( a, b, c, 0x78 ) + +#else + +#define mm128_xor3( a, b, c ) \ + _mm_xor_si128( a, _mm_xor_si128( b, c ) ) + +#define mm128_xorand( a, b, c ) \ + _mm_xor_si128( a, _mm_and_si128( b, c ) ) + +#endif // // Bit rotations diff --git a/simd-utils/simd-256.h b/simd-utils/simd-256.h index 1b9fca80..125e2c82 100644 --- a/simd-utils/simd-256.h +++ b/simd-utils/simd-256.h @@ -275,15 +275,17 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n ) // // Rotate elements accross all lanes. -// -// Swap 128 bit elements in 256 bit vector. -#define mm256_swap_128( v ) _mm256_permute4x64_epi64( v, 0x4e ) -// Rotate 256 bit vector by one 64 bit element -#define mm256_ror_1x64( v ) _mm256_permute4x64_epi64( v, 0x39 ) -#define mm256_rol_1x64( v ) _mm256_permute4x64_epi64( v, 0x93 ) +#if defined(__AVX512VL__) + +static inline __m256i mm256_swap_128( const __m256i v ) +{ return _mm256_alignr_epi64( v, v, 2 ); } -#if defined(__AVX512F__) && defined(__AVX512VL__) +static inline __m256i mm256_ror_1x64( const __m256i v ) +{ return _mm256_alignr_epi64( v, v, 1 ); } + +static inline __m256i mm256_rol_1x64( const __m256i v ) +{ return _mm256_alignr_epi64( v, v, 3 ); } static inline __m256i mm256_ror_1x32( const __m256i v ) { return _mm256_alignr_epi32( v, v, 1 ); } @@ -293,6 +295,13 @@ static inline __m256i mm256_rol_1x32( const __m256i v ) #else // AVX2 +// Swap 128 bit elements in 256 bit vector. +#define mm256_swap_128( v ) _mm256_permute4x64_epi64( v, 0x4e ) + +// Rotate 256 bit vector by one 64 bit element +#define mm256_ror_1x64( v ) _mm256_permute4x64_epi64( v, 0x39 ) +#define mm256_rol_1x64( v ) _mm256_permute4x64_epi64( v, 0x93 ) + // Rotate 256 bit vector by one 32 bit element. 
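// How the ternary-logic immediates used throughout this patch are derived
// (0x96 for a^b^c and 0x78 for a^(b&c) above, 0xca and 0xe8 for the SIMD
// F_0/F_1 select and majority functions earlier): bit i of the immediate is
// the desired output for the input pattern a = bit2(i), b = bit1(i),
// c = bit0(i). A small illustrative sketch, independent of the code above:
static unsigned char xor3_bit( unsigned a, unsigned b, unsigned c )
{  return (unsigned char)( a ^ b ^ c );  }        // yields 0x96
static unsigned char xorand_bit( unsigned a, unsigned b, unsigned c )
{  return (unsigned char)( a ^ ( b & c ) );  }    // yields 0x78

static unsigned char ternlog_imm8( unsigned char (*f)( unsigned, unsigned, unsigned ) )
{
   unsigned char imm = 0;
   for ( unsigned i = 0; i < 8; i++ )
      imm |= (unsigned char)( f( ( i >> 2 ) & 1, ( i >> 1 ) & 1, i & 1 ) << i );
   return imm;
}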
#define mm256_ror_1x32( v ) \ _mm256_permutevar8x32_epi32( v, \ @@ -304,6 +313,7 @@ static inline __m256i mm256_rol_1x32( const __m256i v ) m256_const_64( 0x0000000600000005, 0x0000000400000003, \ 0x0000000200000001, 0x0000000000000007 ) + #endif // AVX512 else AVX2 // diff --git a/winbuild-cross.sh b/winbuild-cross.sh index f6402bad..4953cec2 100755 --- a/winbuild-cross.sh +++ b/winbuild-cross.sh @@ -107,7 +107,7 @@ mv cpuminer.exe release/cpuminer-avx.exe # Westmere SSE4.2 AES make clean || echo clean rm -f config.status -CFLAGS="-O3 -march=westmere -Wall" ./configure $CONFIGURE_ARGS +CFLAGS="-O3 -march=westmere -maes -Wall" ./configure $CONFIGURE_ARGS #CFLAGS="-O3 -maes -msse4.2 -Wall" ./configure $CONFIGURE_ARGS make -j 8 strip -s cpuminer.exe From 2cd1507c2e59c592f40be02d723a974644357808 Mon Sep 17 00:00:00 2001 From: Jay D Dee Date: Wed, 29 Sep 2021 17:31:16 -0400 Subject: [PATCH 11/20] v3.7.4 --- Makefile.am | 3 + RELEASE_NOTES | 31 + algo-gate-api.h | 4 + algo/argon2/argon2d/blake2/blamka-round-opt.h | 18 +- algo/bmw/bmw512-hash-4way.c | 471 +- algo/cubehash/cube-hash-2way.c | 238 +- algo/cubehash/cube-hash-2way.h | 44 +- algo/cubehash/cubehash_sse2.c | 12 +- algo/groestl/aes_ni/hash-groestl.h | 3 +- algo/groestl/aes_ni/hash-groestl256.h | 3 +- algo/groestl/myr-groestl.c | 7 +- algo/hamsi/hamsi-hash-4way.c | 218 +- algo/hodl/hodl-wolf.c | 1 + algo/keccak/keccak-gate.c | 1 + algo/keccak/keccak-hash-4way.c | 14 +- algo/lyra2/allium-4way.c | 53 +- algo/lyra2/sponge.h | 30 +- algo/m7m/m7m.c | 24 +- algo/ripemd/lbry.c | 27 +- algo/scrypt/neoscrypt.c | 8 +- algo/scrypt/scrypt-core-4way.c | 3981 +++++++++++++++++ algo/scrypt/scrypt-core-4way.h | 70 + algo/scrypt/scrypt-core-ref.c | 206 + algo/scrypt/scrypt.c | 1398 ++++-- algo/sha/hmac-sha256-hash.c | 50 +- algo/sha/hmac-sha256-hash.h | 8 +- algo/sha/sha-hash-4way.h | 14 +- algo/sha/sha2.c | 8 +- algo/sha/sha256-hash-2way-ni.c | 348 +- algo/sha/sha256-hash-4way.c | 473 +- algo/sha/sha256-hash-opt.c | 192 +- algo/sha/sha256-hash-opt.h | 18 - algo/sha/sha256-hash.c | 142 + algo/sha/sha256-hash.h | 56 + algo/sha/sha256d-4way.c | 31 +- algo/sha/sha256d.c | 8 + algo/sha/sha256d.h | 7 + algo/sha/sha256q.c | 30 +- algo/sha/sha256t-4way.c | 23 +- algo/sha/sha256t.c | 118 +- algo/sha/sha512-hash-4way.c | 150 +- algo/sha/sph_sha2.c | 210 +- algo/sha/sph_sha2.h | 7 + algo/shavite/shavite-hash-2way.c | 52 +- algo/shavite/shavite-hash-4way.c | 54 +- algo/shavite/sph-shavite-aesni.c | 52 +- algo/skein/skein-4way.c | 21 +- algo/skein/skein.c | 13 +- algo/verthash/Verthash.c | 8 +- algo/verthash/verthash-gate.c | 4 +- algo/whirlpool/whirlpool.c | 2 +- algo/x16/x16r-4way.c | 143 +- algo/x16/x16r-gate.c | 1 + algo/x16/x16r-gate.h | 5 +- algo/x16/x21s-4way.c | 22 +- algo/x16/x21s.c | 8 +- algo/x17/x17-4way.c | 9 +- algo/x22/x22i-4way.c | 58 +- algo/x22/x22i.c | 6 +- algo/x22/x25x-4way.c | 56 +- algo/x22/x25x.c | 8 +- algo/yespower/crypto/blake2b-yp.c | 8 +- algo/yespower/yescrypt-r8g.c | 4 +- algo/yespower/yespower-gate.c | 13 +- algo/yespower/yespower-opt.c | 19 +- algo/yespower/yespower.h | 6 +- build-allarch.sh | 2 +- configure | 20 +- configure.ac | 2 +- cpu-miner.c | 136 +- miner.h | 37 +- simd-utils.h | 2 + simd-utils/intrlv.h | 2 +- simd-utils/simd-128.h | 177 +- simd-utils/simd-256.h | 159 +- simd-utils/simd-512.h | 204 +- simd-utils/simd-64.h | 10 +- simd-utils/simd-int.h | 13 +- sysinfos.c | 2 +- util.c | 60 +- 80 files changed, 8102 insertions(+), 2054 deletions(-) create mode 100644 algo/scrypt/scrypt-core-4way.c create mode 100644 
algo/scrypt/scrypt-core-4way.h create mode 100644 algo/scrypt/scrypt-core-ref.c delete mode 100644 algo/sha/sha256-hash-opt.h create mode 100644 algo/sha/sha256-hash.c create mode 100644 algo/sha/sha256-hash.h create mode 100644 algo/sha/sha256d.c create mode 100644 algo/sha/sha256d.h diff --git a/Makefile.am b/Makefile.am index a4adc3b7..a4163b33 100644 --- a/Makefile.am +++ b/Makefile.am @@ -158,7 +158,9 @@ cpuminer_SOURCES = \ algo/ripemd/lbry.c \ algo/ripemd/lbry-4way.c \ algo/scrypt/scrypt.c \ + algo/scrypt/scrypt-core-4way.c \ algo/scrypt/neoscrypt.c \ + algo/sha/sha256-hash.c \ algo/sha/sph_sha2.c \ algo/sha/sph_sha2big.c \ algo/sha/sha256-hash-4way.c \ @@ -167,6 +169,7 @@ cpuminer_SOURCES = \ algo/sha/sha256-hash-2way-ni.c \ algo/sha/hmac-sha256-hash.c \ algo/sha/hmac-sha256-hash-4way.c \ + algo/sha/sha256d.c \ algo/sha/sha2.c \ algo/sha/sha256t-gate.c \ algo/sha/sha256t-4way.c \ diff --git a/RELEASE_NOTES b/RELEASE_NOTES index 3f6b080d..056491f7 100644 --- a/RELEASE_NOTES +++ b/RELEASE_NOTES @@ -65,6 +65,37 @@ If not what makes it happen or not happen? Change Log ---------- +v3.18.0 + +Complete rewrite of Scrypt code, optimized for large N factor (scryptn2): + - AVX512 & SHA support for SHA256, AVX512 has priority, + - up to 50% increase in hashrate, + - memory requirements reduced 30-60% depending on CPU architecture, + - memory usage displayed at startup, + - scrypt, default N=1024 (LTC), will likely perform slower. + +Improved stale share detection and handling for Scrypt with large N factor: + - abort and discard partially computed hash when new work is detected, + - quicker response to new job, less time wasted mining stale job. + +Improved stale share handling for all algorithms: + - report possible stale share when new work received with a previously + submitted share still pending, + - when new work is detected report the submission of an already completed, + otherwise valid, but likely stale, share, + - fixed incorrect block height in stale share log. + +Small performance improvements to sha, bmw, cube & hamsi for AVX512 & AVX2. + +When stratum disconnects miner threads go to idle until reconnected. + +Colour changes to some logs. + +Some low level function name changes for clarity and consistency. + +The reference hashrate in the summary log and the benchmark total hashrate +are now the mean hashrate for the session. + v3.17.1 Fixed Windows build for AES+SSE4.2 (Westmere), was missing AES. diff --git a/algo-gate-api.h b/algo-gate-api.h index 8d61d266..56594d59 100644 --- a/algo-gate-api.h +++ b/algo-gate-api.h @@ -1,3 +1,6 @@ +#ifndef __ALGO_GATE_API_H__ +#define __ALGO_GATE_API_H__ 1 + #include #include #include @@ -319,3 +322,4 @@ void exec_hash_function( int algo, void *output, const void *pdata ); // algo name if valid alias, NULL if invalid alias or algo. 
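The "mean hashrate for the session" noted in the release notes above amounts to total hashes divided by total elapsed time, rather than a rate over the last reporting interval. A minimal sketch with hypothetical names, not the miner's actual bookkeeping:

    #include <sys/time.h>

    struct session_stats
    {
       double total_hashes;        /* accumulated over the whole session */
       struct timeval start_time;  /* recorded once when mining starts   */
    };

    static double session_mean_hashrate( const struct session_stats *s,
                                         const struct timeval *now )
    {
       double elapsed = (double)( now->tv_sec  - s->start_time.tv_sec )
                      + (double)( now->tv_usec - s->start_time.tv_usec ) * 1e-6;
       return elapsed > 0.0 ? s->total_hashes / elapsed : 0.0;
    }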
void get_algo_alias( char **algo_or_alias ); +#endif diff --git a/algo/argon2/argon2d/blake2/blamka-round-opt.h b/algo/argon2/argon2d/blake2/blamka-round-opt.h index 81563314..809961c3 100644 --- a/algo/argon2/argon2d/blake2/blamka-round-opt.h +++ b/algo/argon2/argon2d/blake2/blamka-round-opt.h @@ -328,7 +328,7 @@ static BLAKE2_INLINE __m128i fBlaMka(__m128i x, __m128i y) { #include -#define ror64(x, n) _mm512_ror_epi64((x), (n)) +#define ROR64(x, n) _mm512_ror_epi64((x), (n)) static __m512i muladd(__m512i x, __m512i y) { @@ -344,8 +344,8 @@ static __m512i muladd(__m512i x, __m512i y) D0 = _mm512_xor_si512(D0, A0); \ D1 = _mm512_xor_si512(D1, A1); \ \ - D0 = ror64(D0, 32); \ - D1 = ror64(D1, 32); \ + D0 = ROR64(D0, 32); \ + D1 = ROR64(D1, 32); \ \ C0 = muladd(C0, D0); \ C1 = muladd(C1, D1); \ @@ -353,8 +353,8 @@ static __m512i muladd(__m512i x, __m512i y) B0 = _mm512_xor_si512(B0, C0); \ B1 = _mm512_xor_si512(B1, C1); \ \ - B0 = ror64(B0, 24); \ - B1 = ror64(B1, 24); \ + B0 = ROR64(B0, 24); \ + B1 = ROR64(B1, 24); \ } while ((void)0, 0) #define G2(A0, B0, C0, D0, A1, B1, C1, D1) \ @@ -365,8 +365,8 @@ static __m512i muladd(__m512i x, __m512i y) D0 = _mm512_xor_si512(D0, A0); \ D1 = _mm512_xor_si512(D1, A1); \ \ - D0 = ror64(D0, 16); \ - D1 = ror64(D1, 16); \ + D0 = ROR64(D0, 16); \ + D1 = ROR64(D1, 16); \ \ C0 = muladd(C0, D0); \ C1 = muladd(C1, D1); \ @@ -374,8 +374,8 @@ static __m512i muladd(__m512i x, __m512i y) B0 = _mm512_xor_si512(B0, C0); \ B1 = _mm512_xor_si512(B1, C1); \ \ - B0 = ror64(B0, 63); \ - B1 = ror64(B1, 63); \ + B0 = ROR64(B0, 63); \ + B1 = ROR64(B1, 63); \ } while ((void)0, 0) #define DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1) \ diff --git a/algo/bmw/bmw512-hash-4way.c b/algo/bmw/bmw512-hash-4way.c index 3587cc4d..9ab4f897 100644 --- a/algo/bmw/bmw512-hash-4way.c +++ b/algo/bmw/bmw512-hash-4way.c @@ -594,22 +594,15 @@ void bmw512_2way_close( bmw_2way_big_context *ctx, void *dst ) #define rb6(x) mm256_rol_64( x, 43 ) #define rb7(x) mm256_rol_64( x, 53 ) -#define rol_off_64( M, j, off ) \ - mm256_rol_64( M[ ( (j) + (off) ) & 0xF ] , \ - ( ( (j) + (off) ) & 0xF ) + 1 ) - -#define add_elt_b( M, H, j ) \ - _mm256_xor_si256( \ - _mm256_add_epi64( \ - _mm256_sub_epi64( _mm256_add_epi64( rol_off_64( M, j, 0 ), \ - rol_off_64( M, j, 3 ) ), \ - rol_off_64( M, j, 10 ) ), \ - _mm256_set1_epi64x( ( (j) + 16 ) * 0x0555555555555555ULL ) ), \ - H[ ( (j)+7 ) & 0xF ] ) +#define rol_off_64( M, j ) \ + mm256_rol_64( M[ (j) & 0xF ], ( (j) & 0xF ) + 1 ) +#define add_elt_b( mj0, mj3, mj10, h, K ) \ + _mm256_xor_si256( h, _mm256_add_epi64( K, \ + _mm256_sub_epi64( _mm256_add_epi64( mj0, mj3 ), mj10 ) ) ) -#define expand1b( qt, M, H, i ) \ - _mm256_add_epi64( mm256_add4_64( \ +#define expand1_b( qt, i ) \ + mm256_add4_64( \ mm256_add4_64( sb1( qt[ (i)-16 ] ), sb2( qt[ (i)-15 ] ), \ sb3( qt[ (i)-14 ] ), sb0( qt[ (i)-13 ] )), \ mm256_add4_64( sb1( qt[ (i)-12 ] ), sb2( qt[ (i)-11 ] ), \ @@ -617,11 +610,10 @@ void bmw512_2way_close( bmw_2way_big_context *ctx, void *dst ) mm256_add4_64( sb1( qt[ (i)- 8 ] ), sb2( qt[ (i)- 7 ] ), \ sb3( qt[ (i)- 6 ] ), sb0( qt[ (i)- 5 ] )), \ mm256_add4_64( sb1( qt[ (i)- 4 ] ), sb2( qt[ (i)- 3 ] ), \ - sb3( qt[ (i)- 2 ] ), sb0( qt[ (i)- 1 ] ) ) ), \ - add_elt_b( M, H, (i)-16 ) ) + sb3( qt[ (i)- 2 ] ), sb0( qt[ (i)- 1 ] ) ) ) -#define expand2b( qt, M, H, i) \ - _mm256_add_epi64( mm256_add4_64( \ +#define expand2_b( qt, i) \ + mm256_add4_64( \ mm256_add4_64( qt[ (i)-16 ], rb1( qt[ (i)-15 ] ), \ qt[ (i)-14 ], rb2( qt[ (i)-13 ] ) ), \ mm256_add4_64( qt[ (i)-12 ], rb3( qt[ 
(i)-11 ] ), \ @@ -629,159 +621,98 @@ void bmw512_2way_close( bmw_2way_big_context *ctx, void *dst ) mm256_add4_64( qt[ (i)- 8 ], rb5( qt[ (i)- 7 ] ), \ qt[ (i)- 6 ], rb6( qt[ (i)- 5 ] ) ), \ mm256_add4_64( qt[ (i)- 4 ], rb7( qt[ (i)- 3 ] ), \ - sb4( qt[ (i)- 2 ] ), sb5( qt[ (i)- 1 ] ) ) ), \ - add_elt_b( M, H, (i)-16 ) ) - - + sb4( qt[ (i)- 2 ] ), sb5( qt[ (i)- 1 ] ) ) ) #define Wb0 \ _mm256_add_epi64( \ - _mm256_add_epi64( \ - _mm256_sub_epi64( _mm256_xor_si256( M[ 5], H[ 5] ), \ - _mm256_xor_si256( M[ 7], H[ 7] ) ), \ - _mm256_xor_si256( M[10], H[10] ) ), \ - _mm256_add_epi64( _mm256_xor_si256( M[13], H[13] ), \ - _mm256_xor_si256( M[14], H[14] ) ) ) + _mm256_add_epi64( _mm256_sub_epi64( mh[ 5], mh[ 7] ), mh[10] ), \ + _mm256_add_epi64( mh[13], mh[14] ) ) #define Wb1 \ _mm256_add_epi64( \ - _mm256_add_epi64( \ - _mm256_sub_epi64( _mm256_xor_si256( M[ 6], H[ 6] ), \ - _mm256_xor_si256( M[ 8], H[ 8] ) ), \ - _mm256_xor_si256( M[11], H[11] ) ), \ - _mm256_sub_epi64( _mm256_xor_si256( M[14], H[14] ), \ - _mm256_xor_si256( M[15], H[15] ) ) ) + _mm256_add_epi64( _mm256_sub_epi64( mh[ 6], mh[ 8] ), mh[11] ), \ + _mm256_sub_epi64( mh[14], mh[15] ) ) #define Wb2 \ _mm256_sub_epi64( \ - _mm256_add_epi64( \ - _mm256_add_epi64( _mm256_xor_si256( M[ 0], H[ 0] ), \ - _mm256_xor_si256( M[ 7], H[ 7] ) ), \ - _mm256_xor_si256( M[ 9], H[ 9] ) ), \ - _mm256_sub_epi64( _mm256_xor_si256( M[12], H[12] ), \ - _mm256_xor_si256( M[15], H[15] ) ) ) + _mm256_add_epi64( _mm256_add_epi64( mh[ 0], mh[ 7] ), mh[ 9] ), \ + _mm256_sub_epi64( mh[12], mh[15] ) ) #define Wb3 \ _mm256_sub_epi64( \ - _mm256_add_epi64( \ - _mm256_sub_epi64( _mm256_xor_si256( M[ 0], H[ 0] ), \ - _mm256_xor_si256( M[ 1], H[ 1] ) ), \ - _mm256_xor_si256( M[ 8], H[ 8] ) ), \ - _mm256_sub_epi64( _mm256_xor_si256( M[10], H[10] ), \ - _mm256_xor_si256( M[13], H[13] ) ) ) + _mm256_add_epi64( _mm256_sub_epi64( mh[ 0], mh[ 1] ), mh[ 8] ), \ + _mm256_sub_epi64( mh[10], \ + mh[13] ) ) #define Wb4 \ _mm256_sub_epi64( \ - _mm256_add_epi64( \ - _mm256_add_epi64( _mm256_xor_si256( M[ 1], H[ 1] ), \ - _mm256_xor_si256( M[ 2], H[ 2] ) ), \ - _mm256_xor_si256( M[ 9], H[ 9] ) ), \ - _mm256_add_epi64( _mm256_xor_si256( M[11], H[11] ), \ - _mm256_xor_si256( M[14], H[14] ) ) ) + _mm256_add_epi64( _mm256_add_epi64( mh[ 1], mh[ 2] ), mh[ 9] ), \ + _mm256_add_epi64( mh[11], mh[14] ) ) #define Wb5 \ _mm256_sub_epi64( \ - _mm256_add_epi64( \ - _mm256_sub_epi64( _mm256_xor_si256( M[ 3], H[ 3] ), \ - _mm256_xor_si256( M[ 2], H[ 2] ) ), \ - _mm256_xor_si256( M[10], H[10] ) ), \ - _mm256_sub_epi64( _mm256_xor_si256( M[12], H[12] ), \ - _mm256_xor_si256( M[15], H[15] ) ) ) + _mm256_add_epi64( _mm256_sub_epi64( mh[ 3], mh[ 2] ), mh[10] ), \ + _mm256_sub_epi64( mh[12], mh[15] ) ) #define Wb6 \ _mm256_sub_epi64( \ - _mm256_sub_epi64( \ - _mm256_sub_epi64( _mm256_xor_si256( M[ 4], H[ 4] ), \ - _mm256_xor_si256( M[ 0], H[ 0] ) ), \ - _mm256_xor_si256( M[ 3], H[ 3] ) ), \ - _mm256_sub_epi64( _mm256_xor_si256( M[11], H[11] ), \ - _mm256_xor_si256( M[13], H[13] ) ) ) + _mm256_sub_epi64( _mm256_sub_epi64( mh[ 4], mh[ 0] ), mh[ 3] ), \ + _mm256_sub_epi64( mh[11], mh[13] ) ) #define Wb7 \ _mm256_sub_epi64( \ - _mm256_sub_epi64( \ - _mm256_sub_epi64( _mm256_xor_si256( M[ 1], H[ 1] ), \ - _mm256_xor_si256( M[ 4], H[ 4] ) ), \ - _mm256_xor_si256( M[ 5], H[ 5] ) ), \ - _mm256_add_epi64( _mm256_xor_si256( M[12], H[12] ), \ - _mm256_xor_si256( M[14], H[14] ) ) ) + _mm256_sub_epi64( _mm256_sub_epi64( mh[ 1], mh[ 4] ), mh[ 5] ), \ + _mm256_add_epi64( mh[12], mh[14] ) ) #define Wb8 \ 
_mm256_add_epi64( \ - _mm256_sub_epi64( \ - _mm256_sub_epi64( _mm256_xor_si256( M[ 2], H[ 2] ), \ - _mm256_xor_si256( M[ 5], H[ 5] ) ), \ - _mm256_xor_si256( M[ 6], H[ 6] ) ), \ - _mm256_sub_epi64( _mm256_xor_si256( M[13], H[13] ), \ - _mm256_xor_si256( M[15], H[15] ) ) ) + _mm256_sub_epi64( _mm256_sub_epi64( mh[ 2], mh[ 5] ), mh[ 6] ), \ + _mm256_sub_epi64( mh[13], mh[15] ) ) #define Wb9 \ _mm256_sub_epi64( \ - _mm256_add_epi64( \ - _mm256_sub_epi64( _mm256_xor_si256( M[ 0], H[ 0] ), \ - _mm256_xor_si256( M[ 3], H[ 3] ) ), \ - _mm256_xor_si256( M[ 6], H[ 6] ) ), \ - _mm256_sub_epi64( _mm256_xor_si256( M[ 7], H[ 7] ), \ - _mm256_xor_si256( M[14], H[14] ) ) ) + _mm256_add_epi64( _mm256_sub_epi64( mh[ 0], mh[ 3] ), mh[ 6] ), \ + _mm256_sub_epi64( mh[ 7], mh[14] ) ) #define Wb10 \ _mm256_sub_epi64( \ - _mm256_sub_epi64( \ - _mm256_sub_epi64( _mm256_xor_si256( M[ 8], H[ 8] ), \ - _mm256_xor_si256( M[ 1], H[ 1] ) ), \ - _mm256_xor_si256( M[ 4], H[ 4] ) ), \ - _mm256_sub_epi64( _mm256_xor_si256( M[ 7], H[ 7] ), \ - _mm256_xor_si256( M[15], H[15] ) ) ) + _mm256_sub_epi64( _mm256_sub_epi64( mh[ 8], mh[ 1] ), mh[ 4] ), \ + _mm256_sub_epi64( mh[ 7], mh[15] ) ) #define Wb11 \ _mm256_sub_epi64( \ - _mm256_sub_epi64( \ - _mm256_sub_epi64( _mm256_xor_si256( M[ 8], H[ 8] ), \ - _mm256_xor_si256( M[ 0], H[ 0] ) ), \ - _mm256_xor_si256( M[ 2], H[ 2] ) ), \ - _mm256_sub_epi64( _mm256_xor_si256( M[ 5], H[ 5] ), \ - _mm256_xor_si256( M[ 9], H[ 9] ) ) ) + _mm256_sub_epi64( _mm256_sub_epi64( mh[ 8], mh[ 0] ), mh[ 2] ), \ + _mm256_sub_epi64( mh[ 5], mh[ 9] ) ) #define Wb12 \ _mm256_sub_epi64( \ - _mm256_sub_epi64( \ - _mm256_add_epi64( _mm256_xor_si256( M[ 1], H[ 1] ), \ - _mm256_xor_si256( M[ 3], H[ 3] ) ), \ - _mm256_xor_si256( M[ 6], H[ 6] ) ), \ - _mm256_sub_epi64( _mm256_xor_si256( M[ 9], H[ 9] ), \ - _mm256_xor_si256( M[10], H[10] ) ) ) + _mm256_sub_epi64( _mm256_add_epi64( mh[ 1], mh[ 3] ), mh[ 6] ), \ + _mm256_sub_epi64( mh[ 9], mh[10] ) ) #define Wb13 \ _mm256_add_epi64( \ - _mm256_add_epi64( \ - _mm256_add_epi64( _mm256_xor_si256( M[ 2], H[ 2] ), \ - _mm256_xor_si256( M[ 4], H[ 4] ) ), \ - _mm256_xor_si256( M[ 7], H[ 7] ) ), \ - _mm256_add_epi64( _mm256_xor_si256( M[10], H[10] ), \ - _mm256_xor_si256( M[11], H[11] ) ) ) + _mm256_add_epi64( _mm256_add_epi64( mh[ 2], mh[ 4] ), mh[ 7] ), \ + _mm256_add_epi64( mh[10], mh[11] ) ) #define Wb14 \ _mm256_sub_epi64( \ - _mm256_add_epi64( \ - _mm256_sub_epi64( _mm256_xor_si256( M[ 3], H[ 3] ), \ - _mm256_xor_si256( M[ 5], H[ 5] ) ), \ - _mm256_xor_si256( M[ 8], H[ 8] ) ), \ - _mm256_add_epi64( _mm256_xor_si256( M[11], H[11] ), \ - _mm256_xor_si256( M[12], H[12] ) ) ) + _mm256_add_epi64( _mm256_sub_epi64( mh[ 3], mh[ 5] ), mh[ 8] ), \ + _mm256_add_epi64( mh[11], mh[12] ) ) #define Wb15 \ _mm256_sub_epi64( \ - _mm256_sub_epi64( \ - _mm256_sub_epi64( _mm256_xor_si256( M[12], H[12] ), \ - _mm256_xor_si256( M[ 4], H[4] ) ), \ - _mm256_xor_si256( M[ 6], H[ 6] ) ), \ - _mm256_sub_epi64( _mm256_xor_si256( M[ 9], H[ 9] ), \ - _mm256_xor_si256( M[13], H[13] ) ) ) + _mm256_sub_epi64( _mm256_sub_epi64( mh[12], mh[ 4] ), mh[ 6] ), \ + _mm256_sub_epi64( mh[ 9], mh[13] ) ) void compress_big( const __m256i *M, const __m256i H[16], __m256i dH[16] ) { __m256i qt[32], xl, xh; + __m256i mh[16]; + int i; + + for ( i = 0; i < 16; i++ ) + mh[i] = _mm256_xor_si256( M[i], H[i] ); qt[ 0] = _mm256_add_epi64( sb0( Wb0 ), H[ 1] ); qt[ 1] = _mm256_add_epi64( sb1( Wb1 ), H[ 2] ); @@ -799,22 +730,60 @@ void compress_big( const __m256i *M, const __m256i H[16], __m256i dH[16] ) qt[13] = 
_mm256_add_epi64( sb3( Wb13), H[14] ); qt[14] = _mm256_add_epi64( sb4( Wb14), H[15] ); qt[15] = _mm256_add_epi64( sb0( Wb15), H[ 0] ); - qt[16] = expand1b( qt, M, H, 16 ); - qt[17] = expand1b( qt, M, H, 17 ); - qt[18] = expand2b( qt, M, H, 18 ); - qt[19] = expand2b( qt, M, H, 19 ); - qt[20] = expand2b( qt, M, H, 20 ); - qt[21] = expand2b( qt, M, H, 21 ); - qt[22] = expand2b( qt, M, H, 22 ); - qt[23] = expand2b( qt, M, H, 23 ); - qt[24] = expand2b( qt, M, H, 24 ); - qt[25] = expand2b( qt, M, H, 25 ); - qt[26] = expand2b( qt, M, H, 26 ); - qt[27] = expand2b( qt, M, H, 27 ); - qt[28] = expand2b( qt, M, H, 28 ); - qt[29] = expand2b( qt, M, H, 29 ); - qt[30] = expand2b( qt, M, H, 30 ); - qt[31] = expand2b( qt, M, H, 31 ); + + __m256i mj[16]; + for ( i = 0; i < 16; i++ ) + mj[i] = rol_off_64( M, i ); + + qt[16] = add_elt_b( mj[ 0], mj[ 3], mj[10], H[ 7], + (const __m256i)_mm256_set1_epi64x( 16 * 0x0555555555555555ULL ) ); + qt[17] = add_elt_b( mj[ 1], mj[ 4], mj[11], H[ 8], + (const __m256i)_mm256_set1_epi64x( 17 * 0x0555555555555555ULL ) ); + qt[18] = add_elt_b( mj[ 2], mj[ 5], mj[12], H[ 9], + (const __m256i)_mm256_set1_epi64x( 18 * 0x0555555555555555ULL ) ); + qt[19] = add_elt_b( mj[ 3], mj[ 6], mj[13], H[10], + (const __m256i)_mm256_set1_epi64x( 19 * 0x0555555555555555ULL ) ); + qt[20] = add_elt_b( mj[ 4], mj[ 7], mj[14], H[11], + (const __m256i)_mm256_set1_epi64x( 20 * 0x0555555555555555ULL ) ); + qt[21] = add_elt_b( mj[ 5], mj[ 8], mj[15], H[12], + (const __m256i)_mm256_set1_epi64x( 21 * 0x0555555555555555ULL ) ); + qt[22] = add_elt_b( mj[ 6], mj[ 9], mj[ 0], H[13], + (const __m256i)_mm256_set1_epi64x( 22 * 0x0555555555555555ULL ) ); + qt[23] = add_elt_b( mj[ 7], mj[10], mj[ 1], H[14], + (const __m256i)_mm256_set1_epi64x( 23 * 0x0555555555555555ULL ) ); + qt[24] = add_elt_b( mj[ 8], mj[11], mj[ 2], H[15], + (const __m256i)_mm256_set1_epi64x( 24 * 0x0555555555555555ULL ) ); + qt[25] = add_elt_b( mj[ 9], mj[12], mj[ 3], H[ 0], + (const __m256i)_mm256_set1_epi64x( 25 * 0x0555555555555555ULL ) ); + qt[26] = add_elt_b( mj[10], mj[13], mj[ 4], H[ 1], + (const __m256i)_mm256_set1_epi64x( 26 * 0x0555555555555555ULL ) ); + qt[27] = add_elt_b( mj[11], mj[14], mj[ 5], H[ 2], + (const __m256i)_mm256_set1_epi64x( 27 * 0x0555555555555555ULL ) ); + qt[28] = add_elt_b( mj[12], mj[15], mj[ 6], H[ 3], + (const __m256i)_mm256_set1_epi64x( 28 * 0x0555555555555555ULL ) ); + qt[29] = add_elt_b( mj[13], mj[ 0], mj[ 7], H[ 4], + (const __m256i)_mm256_set1_epi64x( 29 * 0x0555555555555555ULL ) ); + qt[30] = add_elt_b( mj[14], mj[ 1], mj[ 8], H[ 5], + (const __m256i)_mm256_set1_epi64x( 30 * 0x0555555555555555ULL ) ); + qt[31] = add_elt_b( mj[15], mj[ 2], mj[ 9], H[ 6], + (const __m256i)_mm256_set1_epi64x( 31 * 0x0555555555555555ULL ) ); + + qt[16] = _mm256_add_epi64( qt[16], expand1_b( qt, 16 ) ); + qt[17] = _mm256_add_epi64( qt[17], expand1_b( qt, 17 ) ); + qt[18] = _mm256_add_epi64( qt[18], expand2_b( qt, 18 ) ); + qt[19] = _mm256_add_epi64( qt[19], expand2_b( qt, 19 ) ); + qt[20] = _mm256_add_epi64( qt[20], expand2_b( qt, 20 ) ); + qt[21] = _mm256_add_epi64( qt[21], expand2_b( qt, 21 ) ); + qt[22] = _mm256_add_epi64( qt[22], expand2_b( qt, 22 ) ); + qt[23] = _mm256_add_epi64( qt[23], expand2_b( qt, 23 ) ); + qt[24] = _mm256_add_epi64( qt[24], expand2_b( qt, 24 ) ); + qt[25] = _mm256_add_epi64( qt[25], expand2_b( qt, 25 ) ); + qt[26] = _mm256_add_epi64( qt[26], expand2_b( qt, 26 ) ); + qt[27] = _mm256_add_epi64( qt[27], expand2_b( qt, 27 ) ); + qt[28] = _mm256_add_epi64( qt[28], expand2_b( qt, 28 ) ); + qt[29] = 
_mm256_add_epi64( qt[29], expand2_b( qt, 29 ) ); + qt[30] = _mm256_add_epi64( qt[30], expand2_b( qt, 30 ) ); + qt[31] = _mm256_add_epi64( qt[31], expand2_b( qt, 31 ) ); xl = _mm256_xor_si256( mm256_xor4( qt[16], qt[17], qt[18], qt[19] ), @@ -823,7 +792,6 @@ void compress_big( const __m256i *M, const __m256i H[16], __m256i dH[16] ) mm256_xor4( qt[24], qt[25], qt[26], qt[27] ), mm256_xor4( qt[28], qt[29], qt[30], qt[31] ) ) ); - #define DH1L( m, sl, sr, a, b, c ) \ _mm256_add_epi64( \ _mm256_xor_si256( M[m], \ @@ -1066,21 +1034,15 @@ bmw512_4way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) #define r8b6(x) mm512_rol_64( x, 43 ) #define r8b7(x) mm512_rol_64( x, 53 ) -#define rol8w_off_64( M, j, off ) \ - mm512_rol_64( M[ ( (j) + (off) ) & 0xF ] , \ - ( ( (j) + (off) ) & 0xF ) + 1 ) +#define rol8w_off_64( M, j ) \ + mm512_rol_64( M[ (j) & 0xF ], ( (j) & 0xF ) + 1 ) -#define add_elt_b8( M, H, j ) \ - _mm512_xor_si512( \ - _mm512_add_epi64( \ - _mm512_sub_epi64( _mm512_add_epi64( rol8w_off_64( M, j, 0 ), \ - rol8w_off_64( M, j, 3 ) ), \ - rol8w_off_64( M, j, 10 ) ), \ - _mm512_set1_epi64( ( (j) + 16 ) * 0x0555555555555555ULL ) ), \ - H[ ( (j)+7 ) & 0xF ] ) +#define add_elt_b8( mj0, mj3, mj10, h, K ) \ + _mm512_xor_si512( h, _mm512_add_epi64( K, \ + _mm512_sub_epi64( _mm512_add_epi64( mj0, mj3 ), mj10 ) ) ) -#define expand1b8( qt, M, H, i ) \ - _mm512_add_epi64( mm512_add4_64( \ +#define expand1_b8( qt, i ) \ + mm512_add4_64( \ mm512_add4_64( s8b1( qt[ (i)-16 ] ), s8b2( qt[ (i)-15 ] ), \ s8b3( qt[ (i)-14 ] ), s8b0( qt[ (i)-13 ] )), \ mm512_add4_64( s8b1( qt[ (i)-12 ] ), s8b2( qt[ (i)-11 ] ), \ @@ -1088,11 +1050,10 @@ bmw512_4way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) mm512_add4_64( s8b1( qt[ (i)- 8 ] ), s8b2( qt[ (i)- 7 ] ), \ s8b3( qt[ (i)- 6 ] ), s8b0( qt[ (i)- 5 ] )), \ mm512_add4_64( s8b1( qt[ (i)- 4 ] ), s8b2( qt[ (i)- 3 ] ), \ - s8b3( qt[ (i)- 2 ] ), s8b0( qt[ (i)- 1 ] ) ) ), \ - add_elt_b8( M, H, (i)-16 ) ) + s8b3( qt[ (i)- 2 ] ), s8b0( qt[ (i)- 1 ] ) ) ) -#define expand2b8( qt, M, H, i) \ - _mm512_add_epi64( mm512_add4_64( \ +#define expand2_b8( qt, i) \ + mm512_add4_64( \ mm512_add4_64( qt[ (i)-16 ], r8b1( qt[ (i)-15 ] ), \ qt[ (i)-14 ], r8b2( qt[ (i)-13 ] ) ), \ mm512_add4_64( qt[ (i)-12 ], r8b3( qt[ (i)-11 ] ), \ @@ -1100,157 +1061,97 @@ bmw512_4way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) mm512_add4_64( qt[ (i)- 8 ], r8b5( qt[ (i)- 7 ] ), \ qt[ (i)- 6 ], r8b6( qt[ (i)- 5 ] ) ), \ mm512_add4_64( qt[ (i)- 4 ], r8b7( qt[ (i)- 3 ] ), \ - s8b4( qt[ (i)- 2 ] ), s8b5( qt[ (i)- 1 ] ) ) ), \ - add_elt_b8( M, H, (i)-16 ) ) + s8b4( qt[ (i)- 2 ] ), s8b5( qt[ (i)- 1 ] ) ) ) #define W8b0 \ _mm512_add_epi64( \ - _mm512_add_epi64( \ - _mm512_sub_epi64( _mm512_xor_si512( M[ 5], H[ 5] ), \ - _mm512_xor_si512( M[ 7], H[ 7] ) ), \ - _mm512_xor_si512( M[10], H[10] ) ), \ - _mm512_add_epi64( _mm512_xor_si512( M[13], H[13] ), \ - _mm512_xor_si512( M[14], H[14] ) ) ) + _mm512_add_epi64( _mm512_sub_epi64( mh[ 5], mh[ 7] ), mh[10] ), \ + _mm512_add_epi64( mh[13], mh[14] ) ) #define W8b1 \ _mm512_add_epi64( \ - _mm512_add_epi64( \ - _mm512_sub_epi64( _mm512_xor_si512( M[ 6], H[ 6] ), \ - _mm512_xor_si512( M[ 8], H[ 8] ) ), \ - _mm512_xor_si512( M[11], H[11] ) ), \ - _mm512_sub_epi64( _mm512_xor_si512( M[14], H[14] ), \ - _mm512_xor_si512( M[15], H[15] ) ) ) + _mm512_add_epi64( _mm512_sub_epi64( mh[ 6], mh[ 8] ), mh[11] ), \ + _mm512_sub_epi64( mh[14], mh[15] ) ) #define W8b2 \ _mm512_sub_epi64( \ - _mm512_add_epi64( \ - _mm512_add_epi64( 
_mm512_xor_si512( M[ 0], H[ 0] ), \ - _mm512_xor_si512( M[ 7], H[ 7] ) ), \ - _mm512_xor_si512( M[ 9], H[ 9] ) ), \ - _mm512_sub_epi64( _mm512_xor_si512( M[12], H[12] ), \ - _mm512_xor_si512( M[15], H[15] ) ) ) + _mm512_add_epi64( _mm512_add_epi64( mh[ 0], mh[ 7] ), mh[ 9] ), \ + _mm512_sub_epi64( mh[12], mh[15] ) ) #define W8b3 \ _mm512_sub_epi64( \ - _mm512_add_epi64( \ - _mm512_sub_epi64( _mm512_xor_si512( M[ 0], H[ 0] ), \ - _mm512_xor_si512( M[ 1], H[ 1] ) ), \ - _mm512_xor_si512( M[ 8], H[ 8] ) ), \ - _mm512_sub_epi64( _mm512_xor_si512( M[10], H[10] ), \ - _mm512_xor_si512( M[13], H[13] ) ) ) + _mm512_add_epi64( _mm512_sub_epi64( mh[ 0], mh[ 1] ), mh[ 8] ), \ + _mm512_sub_epi64( mh[10], mh[13] ) ) #define W8b4 \ _mm512_sub_epi64( \ - _mm512_add_epi64( \ - _mm512_add_epi64( _mm512_xor_si512( M[ 1], H[ 1] ), \ - _mm512_xor_si512( M[ 2], H[ 2] ) ), \ - _mm512_xor_si512( M[ 9], H[ 9] ) ), \ - _mm512_add_epi64( _mm512_xor_si512( M[11], H[11] ), \ - _mm512_xor_si512( M[14], H[14] ) ) ) + _mm512_add_epi64( _mm512_add_epi64( mh[ 1], mh[ 2] ), mh[ 9] ), \ + _mm512_add_epi64( mh[11], mh[14] ) ) #define W8b5 \ _mm512_sub_epi64( \ - _mm512_add_epi64( \ - _mm512_sub_epi64( _mm512_xor_si512( M[ 3], H[ 3] ), \ - _mm512_xor_si512( M[ 2], H[ 2] ) ), \ - _mm512_xor_si512( M[10], H[10] ) ), \ - _mm512_sub_epi64( _mm512_xor_si512( M[12], H[12] ), \ - _mm512_xor_si512( M[15], H[15] ) ) ) + _mm512_add_epi64( _mm512_sub_epi64( mh[ 3], mh[ 2] ), mh[10] ), \ + _mm512_sub_epi64( mh[12], mh[15] ) ) #define W8b6 \ _mm512_sub_epi64( \ - _mm512_sub_epi64( \ - _mm512_sub_epi64( _mm512_xor_si512( M[ 4], H[ 4] ), \ - _mm512_xor_si512( M[ 0], H[ 0] ) ), \ - _mm512_xor_si512( M[ 3], H[ 3] ) ), \ - _mm512_sub_epi64( _mm512_xor_si512( M[11], H[11] ), \ - _mm512_xor_si512( M[13], H[13] ) ) ) + _mm512_sub_epi64( _mm512_sub_epi64( mh[ 4], mh[ 0] ), mh[ 3] ), \ + _mm512_sub_epi64( mh[11], mh[13] ) ) #define W8b7 \ _mm512_sub_epi64( \ - _mm512_sub_epi64( \ - _mm512_sub_epi64( _mm512_xor_si512( M[ 1], H[ 1] ), \ - _mm512_xor_si512( M[ 4], H[ 4] ) ), \ - _mm512_xor_si512( M[ 5], H[ 5] ) ), \ - _mm512_add_epi64( _mm512_xor_si512( M[12], H[12] ), \ - _mm512_xor_si512( M[14], H[14] ) ) ) + _mm512_sub_epi64( _mm512_sub_epi64( mh[ 1], mh[ 4] ), mh[ 5] ), \ + _mm512_add_epi64( mh[12], mh[14] ) ) #define W8b8 \ _mm512_add_epi64( \ - _mm512_sub_epi64( \ - _mm512_sub_epi64( _mm512_xor_si512( M[ 2], H[ 2] ), \ - _mm512_xor_si512( M[ 5], H[ 5] ) ), \ - _mm512_xor_si512( M[ 6], H[ 6] ) ), \ - _mm512_sub_epi64( _mm512_xor_si512( M[13], H[13] ), \ - _mm512_xor_si512( M[15], H[15] ) ) ) + _mm512_sub_epi64( _mm512_sub_epi64( mh[ 2], mh[ 5] ), mh[ 6] ), \ + _mm512_sub_epi64( mh[13], mh[15] ) ) #define W8b9 \ _mm512_sub_epi64( \ - _mm512_add_epi64( \ - _mm512_sub_epi64( _mm512_xor_si512( M[ 0], H[ 0] ), \ - _mm512_xor_si512( M[ 3], H[ 3] ) ), \ - _mm512_xor_si512( M[ 6], H[ 6] ) ), \ - _mm512_sub_epi64( _mm512_xor_si512( M[ 7], H[ 7] ), \ - _mm512_xor_si512( M[14], H[14] ) ) ) + _mm512_add_epi64( _mm512_sub_epi64( mh[ 0], mh[ 3] ), mh[ 6] ), \ + _mm512_sub_epi64( mh[ 7], mh[14] ) ) #define W8b10 \ _mm512_sub_epi64( \ - _mm512_sub_epi64( \ - _mm512_sub_epi64( _mm512_xor_si512( M[ 8], H[ 8] ), \ - _mm512_xor_si512( M[ 1], H[ 1] ) ), \ - _mm512_xor_si512( M[ 4], H[ 4] ) ), \ - _mm512_sub_epi64( _mm512_xor_si512( M[ 7], H[ 7] ), \ - _mm512_xor_si512( M[15], H[15] ) ) ) + _mm512_sub_epi64( _mm512_sub_epi64( mh[ 8], mh[ 1] ), mh[ 4] ), \ + _mm512_sub_epi64( mh[ 7], mh[15] ) ) #define W8b11 \ _mm512_sub_epi64( \ - _mm512_sub_epi64( \ - 
_mm512_sub_epi64( _mm512_xor_si512( M[ 8], H[ 8] ), \ - _mm512_xor_si512( M[ 0], H[ 0] ) ), \ - _mm512_xor_si512( M[ 2], H[ 2] ) ), \ - _mm512_sub_epi64( _mm512_xor_si512( M[ 5], H[ 5] ), \ - _mm512_xor_si512( M[ 9], H[ 9] ) ) ) + _mm512_sub_epi64( _mm512_sub_epi64( mh[ 8], mh[ 0] ), mh[ 2] ), \ + _mm512_sub_epi64( mh[ 5], mh[ 9] ) ) #define W8b12 \ _mm512_sub_epi64( \ - _mm512_sub_epi64( \ - _mm512_add_epi64( _mm512_xor_si512( M[ 1], H[ 1] ), \ - _mm512_xor_si512( M[ 3], H[ 3] ) ), \ - _mm512_xor_si512( M[ 6], H[ 6] ) ), \ - _mm512_sub_epi64( _mm512_xor_si512( M[ 9], H[ 9] ), \ - _mm512_xor_si512( M[10], H[10] ) ) ) + _mm512_sub_epi64( _mm512_add_epi64( mh[ 1], mh[ 3] ), mh[ 6] ), \ + _mm512_sub_epi64( mh[ 9], mh[10] ) ) #define W8b13 \ _mm512_add_epi64( \ - _mm512_add_epi64( \ - _mm512_add_epi64( _mm512_xor_si512( M[ 2], H[ 2] ), \ - _mm512_xor_si512( M[ 4], H[ 4] ) ), \ - _mm512_xor_si512( M[ 7], H[ 7] ) ), \ - _mm512_add_epi64( _mm512_xor_si512( M[10], H[10] ), \ - _mm512_xor_si512( M[11], H[11] ) ) ) + _mm512_add_epi64( _mm512_add_epi64( mh[ 2], mh[ 4] ), mh[ 7] ), \ + _mm512_add_epi64( mh[10], mh[11] ) ) #define W8b14 \ _mm512_sub_epi64( \ - _mm512_add_epi64( \ - _mm512_sub_epi64( _mm512_xor_si512( M[ 3], H[ 3] ), \ - _mm512_xor_si512( M[ 5], H[ 5] ) ), \ - _mm512_xor_si512( M[ 8], H[ 8] ) ), \ - _mm512_add_epi64( _mm512_xor_si512( M[11], H[11] ), \ - _mm512_xor_si512( M[12], H[12] ) ) ) + _mm512_add_epi64( _mm512_sub_epi64( mh[ 3], mh[ 5] ), mh[ 8] ), \ + _mm512_add_epi64( mh[11], mh[12] ) ) #define W8b15 \ _mm512_sub_epi64( \ - _mm512_sub_epi64( \ - _mm512_sub_epi64( _mm512_xor_si512( M[12], H[12] ), \ - _mm512_xor_si512( M[ 4], H[4] ) ), \ - _mm512_xor_si512( M[ 6], H[ 6] ) ), \ - _mm512_sub_epi64( _mm512_xor_si512( M[ 9], H[ 9] ), \ - _mm512_xor_si512( M[13], H[13] ) ) ) + _mm512_sub_epi64( _mm512_sub_epi64( mh[12], mh[ 4] ), mh[ 6] ), \ + _mm512_sub_epi64( mh[ 9], mh[13] ) ) void compress_big_8way( const __m512i *M, const __m512i H[16], __m512i dH[16] ) { __m512i qt[32], xl, xh; + __m512i mh[16]; + int i; + + for ( i = 0; i < 16; i++ ) + mh[i] = _mm512_xor_si512( M[i], H[i] ); qt[ 0] = _mm512_add_epi64( s8b0( W8b0 ), H[ 1] ); qt[ 1] = _mm512_add_epi64( s8b1( W8b1 ), H[ 2] ); @@ -1268,22 +1169,60 @@ void compress_big_8way( const __m512i *M, const __m512i H[16], qt[13] = _mm512_add_epi64( s8b3( W8b13), H[14] ); qt[14] = _mm512_add_epi64( s8b4( W8b14), H[15] ); qt[15] = _mm512_add_epi64( s8b0( W8b15), H[ 0] ); - qt[16] = expand1b8( qt, M, H, 16 ); - qt[17] = expand1b8( qt, M, H, 17 ); - qt[18] = expand2b8( qt, M, H, 18 ); - qt[19] = expand2b8( qt, M, H, 19 ); - qt[20] = expand2b8( qt, M, H, 20 ); - qt[21] = expand2b8( qt, M, H, 21 ); - qt[22] = expand2b8( qt, M, H, 22 ); - qt[23] = expand2b8( qt, M, H, 23 ); - qt[24] = expand2b8( qt, M, H, 24 ); - qt[25] = expand2b8( qt, M, H, 25 ); - qt[26] = expand2b8( qt, M, H, 26 ); - qt[27] = expand2b8( qt, M, H, 27 ); - qt[28] = expand2b8( qt, M, H, 28 ); - qt[29] = expand2b8( qt, M, H, 29 ); - qt[30] = expand2b8( qt, M, H, 30 ); - qt[31] = expand2b8( qt, M, H, 31 ); + + __m512i mj[16]; + for ( i = 0; i < 16; i++ ) + mj[i] = rol8w_off_64( M, i ); + + qt[16] = add_elt_b8( mj[ 0], mj[ 3], mj[10], H[ 7], + (const __m512i)_mm512_set1_epi64( 16 * 0x0555555555555555ULL ) ); + qt[17] = add_elt_b8( mj[ 1], mj[ 4], mj[11], H[ 8], + (const __m512i)_mm512_set1_epi64( 17 * 0x0555555555555555ULL ) ); + qt[18] = add_elt_b8( mj[ 2], mj[ 5], mj[12], H[ 9], + (const __m512i)_mm512_set1_epi64( 18 * 0x0555555555555555ULL ) ); + qt[19] = add_elt_b8( mj[ 3], mj[ 
6], mj[13], H[10], + (const __m512i)_mm512_set1_epi64( 19 * 0x0555555555555555ULL ) ); + qt[20] = add_elt_b8( mj[ 4], mj[ 7], mj[14], H[11], + (const __m512i)_mm512_set1_epi64( 20 * 0x0555555555555555ULL ) ); + qt[21] = add_elt_b8( mj[ 5], mj[ 8], mj[15], H[12], + (const __m512i)_mm512_set1_epi64( 21 * 0x0555555555555555ULL ) ); + qt[22] = add_elt_b8( mj[ 6], mj[ 9], mj[ 0], H[13], + (const __m512i)_mm512_set1_epi64( 22 * 0x0555555555555555ULL ) ); + qt[23] = add_elt_b8( mj[ 7], mj[10], mj[ 1], H[14], + (const __m512i)_mm512_set1_epi64( 23 * 0x0555555555555555ULL ) ); + qt[24] = add_elt_b8( mj[ 8], mj[11], mj[ 2], H[15], + (const __m512i)_mm512_set1_epi64( 24 * 0x0555555555555555ULL ) ); + qt[25] = add_elt_b8( mj[ 9], mj[12], mj[ 3], H[ 0], + (const __m512i)_mm512_set1_epi64( 25 * 0x0555555555555555ULL ) ); + qt[26] = add_elt_b8( mj[10], mj[13], mj[ 4], H[ 1], + (const __m512i)_mm512_set1_epi64( 26 * 0x0555555555555555ULL ) ); + qt[27] = add_elt_b8( mj[11], mj[14], mj[ 5], H[ 2], + (const __m512i)_mm512_set1_epi64( 27 * 0x0555555555555555ULL ) ); + qt[28] = add_elt_b8( mj[12], mj[15], mj[ 6], H[ 3], + (const __m512i)_mm512_set1_epi64( 28 * 0x0555555555555555ULL ) ); + qt[29] = add_elt_b8( mj[13], mj[ 0], mj[ 7], H[ 4], + (const __m512i)_mm512_set1_epi64( 29 * 0x0555555555555555ULL ) ); + qt[30] = add_elt_b8( mj[14], mj[ 1], mj[ 8], H[ 5], + (const __m512i)_mm512_set1_epi64( 30 * 0x0555555555555555ULL ) ); + qt[31] = add_elt_b8( mj[15], mj[ 2], mj[ 9], H[ 6], + (const __m512i)_mm512_set1_epi64( 31 * 0x0555555555555555ULL ) ); + + qt[16] = _mm512_add_epi64( qt[16], expand1_b8( qt, 16 ) ); + qt[17] = _mm512_add_epi64( qt[17], expand1_b8( qt, 17 ) ); + qt[18] = _mm512_add_epi64( qt[18], expand2_b8( qt, 18 ) ); + qt[19] = _mm512_add_epi64( qt[19], expand2_b8( qt, 19 ) ); + qt[20] = _mm512_add_epi64( qt[20], expand2_b8( qt, 20 ) ); + qt[21] = _mm512_add_epi64( qt[21], expand2_b8( qt, 21 ) ); + qt[22] = _mm512_add_epi64( qt[22], expand2_b8( qt, 22 ) ); + qt[23] = _mm512_add_epi64( qt[23], expand2_b8( qt, 23 ) ); + qt[24] = _mm512_add_epi64( qt[24], expand2_b8( qt, 24 ) ); + qt[25] = _mm512_add_epi64( qt[25], expand2_b8( qt, 25 ) ); + qt[26] = _mm512_add_epi64( qt[26], expand2_b8( qt, 26 ) ); + qt[27] = _mm512_add_epi64( qt[27], expand2_b8( qt, 27 ) ); + qt[28] = _mm512_add_epi64( qt[28], expand2_b8( qt, 28 ) ); + qt[29] = _mm512_add_epi64( qt[29], expand2_b8( qt, 29 ) ); + qt[30] = _mm512_add_epi64( qt[30], expand2_b8( qt, 30 ) ); + qt[31] = _mm512_add_epi64( qt[31], expand2_b8( qt, 31 ) ); xl = mm512_xor3( mm512_xor3( qt[16], qt[17], qt[18] ), mm512_xor3( qt[19], qt[20], qt[21] ), diff --git a/algo/cubehash/cube-hash-2way.c b/algo/cubehash/cube-hash-2way.c index 1201b8f2..06f7e095 100644 --- a/algo/cubehash/cube-hash-2way.c +++ b/algo/cubehash/cube-hash-2way.c @@ -98,6 +98,138 @@ static void transform_4way( cube_4way_context *sp ) _mm512_store_si512( (__m512i*)sp->h + 7, x7 ); } +// 8 ways, 4 way parallel double buffered +static void transform_4way_2buf( cube_4way_2buf_context *sp ) +{ + int r; + const int rounds = sp->rounds; + + __m512i x0, x1, x2, x3, x4, x5, x6, x7; + __m512i y0, y1, y2, y3, y4, y5, y6, y7; + __m512i tx0, tx1, ty0, ty1; + + x0 = _mm512_load_si512( (__m512i*)sp->h0 ); + x1 = _mm512_load_si512( (__m512i*)sp->h0 + 1 ); + x2 = _mm512_load_si512( (__m512i*)sp->h0 + 2 ); + x3 = _mm512_load_si512( (__m512i*)sp->h0 + 3 ); + x4 = _mm512_load_si512( (__m512i*)sp->h0 + 4 ); + x5 = _mm512_load_si512( (__m512i*)sp->h0 + 5 ); + x6 = _mm512_load_si512( (__m512i*)sp->h0 + 6 ); + x7 = 
_mm512_load_si512( (__m512i*)sp->h0 + 7 ); + + y0 = _mm512_load_si512( (__m512i*)sp->h1 ); + y1 = _mm512_load_si512( (__m512i*)sp->h1 + 1 ); + y2 = _mm512_load_si512( (__m512i*)sp->h1 + 2 ); + y3 = _mm512_load_si512( (__m512i*)sp->h1 + 3 ); + y4 = _mm512_load_si512( (__m512i*)sp->h1 + 4 ); + y5 = _mm512_load_si512( (__m512i*)sp->h1 + 5 ); + y6 = _mm512_load_si512( (__m512i*)sp->h1 + 6 ); + y7 = _mm512_load_si512( (__m512i*)sp->h1 + 7 ); + + + for ( r = 0; r < rounds; ++r ) + { + x4 = _mm512_add_epi32( x0, x4 ); + y4 = _mm512_add_epi32( y0, y4 ); + tx0 = x0; + ty0 = y0; + x5 = _mm512_add_epi32( x1, x5 ); + y5 = _mm512_add_epi32( y1, y5 ); + tx1 = x1; + ty1 = y1; + x0 = mm512_rol_32( x2, 7 ); + y0 = mm512_rol_32( y2, 7 ); + x6 = _mm512_add_epi32( x2, x6 ); + y6 = _mm512_add_epi32( y2, y6 ); + x1 = mm512_rol_32( x3, 7 ); + y1 = mm512_rol_32( y3, 7 ); + x7 = _mm512_add_epi32( x3, x7 ); + y7 = _mm512_add_epi32( y3, y7 ); + + + x2 = mm512_rol_32( tx0, 7 ); + y2 = mm512_rol_32( ty0, 7 ); + x0 = _mm512_xor_si512( x0, x4 ); + y0 = _mm512_xor_si512( y0, y4 ); + x4 = mm512_swap128_64( x4 ); + x3 = mm512_rol_32( tx1, 7 ); + y3 = mm512_rol_32( ty1, 7 ); + y4 = mm512_swap128_64( y4 ); + + x1 = _mm512_xor_si512( x1, x5 ); + y1 = _mm512_xor_si512( y1, y5 ); + x5 = mm512_swap128_64( x5 ); + x2 = _mm512_xor_si512( x2, x6 ); + y2 = _mm512_xor_si512( y2, y6 ); + y5 = mm512_swap128_64( y5 ); + x3 = _mm512_xor_si512( x3, x7 ); + y3 = _mm512_xor_si512( y3, y7 ); + + x6 = mm512_swap128_64( x6 ); + x4 = _mm512_add_epi32( x0, x4 ); + y4 = _mm512_add_epi32( y0, y4 ); + y6 = mm512_swap128_64( y6 ); + x5 = _mm512_add_epi32( x1, x5 ); + y5 = _mm512_add_epi32( y1, y5 ); + x7 = mm512_swap128_64( x7 ); + x6 = _mm512_add_epi32( x2, x6 ); + y6 = _mm512_add_epi32( y2, y6 ); + tx0 = x0; + ty0 = y0; + y7 = mm512_swap128_64( y7 ); + tx1 = x2; + ty1 = y2; + x0 = mm512_rol_32( x1, 11 ); + y0 = mm512_rol_32( y1, 11 ); + + x7 = _mm512_add_epi32( x3, x7 ); + y7 = _mm512_add_epi32( y3, y7 ); + + x1 = mm512_rol_32( tx0, 11 ); + y1 = mm512_rol_32( ty0, 11 ); + x0 = _mm512_xor_si512( x0, x4 ); + x4 = mm512_swap64_32( x4 ); + y0 = _mm512_xor_si512( y0, y4 ); + x2 = mm512_rol_32( x3, 11 ); + y4 = mm512_swap64_32( y4 ); + y2 = mm512_rol_32( y3, 11 ); + x1 = _mm512_xor_si512( x1, x5 ); + x5 = mm512_swap64_32( x5 ); + y1 = _mm512_xor_si512( y1, y5 ); + x3 = mm512_rol_32( tx1, 11 ); + y5 = mm512_swap64_32( y5 ); + y3 = mm512_rol_32( ty1, 11 ); + + x2 = _mm512_xor_si512( x2, x6 ); + x6 = mm512_swap64_32( x6 ); + y2 = _mm512_xor_si512( y2, y6 ); + y6 = mm512_swap64_32( y6 ); + x3 = _mm512_xor_si512( x3, x7 ); + x7 = mm512_swap64_32( x7 ); + y3 = _mm512_xor_si512( y3, y7 ); + + y7 = mm512_swap64_32( y7 ); + } + + _mm512_store_si512( (__m512i*)sp->h0, x0 ); + _mm512_store_si512( (__m512i*)sp->h0 + 1, x1 ); + _mm512_store_si512( (__m512i*)sp->h0 + 2, x2 ); + _mm512_store_si512( (__m512i*)sp->h0 + 3, x3 ); + _mm512_store_si512( (__m512i*)sp->h0 + 4, x4 ); + _mm512_store_si512( (__m512i*)sp->h0 + 5, x5 ); + _mm512_store_si512( (__m512i*)sp->h0 + 6, x6 ); + _mm512_store_si512( (__m512i*)sp->h0 + 7, x7 ); + + _mm512_store_si512( (__m512i*)sp->h1, y0 ); + _mm512_store_si512( (__m512i*)sp->h1 + 1, y1 ); + _mm512_store_si512( (__m512i*)sp->h1 + 2, y2 ); + _mm512_store_si512( (__m512i*)sp->h1 + 3, y3 ); + _mm512_store_si512( (__m512i*)sp->h1 + 4, y4 ); + _mm512_store_si512( (__m512i*)sp->h1 + 5, y5 ); + _mm512_store_si512( (__m512i*)sp->h1 + 6, y6 ); + _mm512_store_si512( (__m512i*)sp->h1 + 7, y7 ); +} + int cube_4way_init( cube_4way_context *sp, int 
hashbitlen, int rounds, int blockbytes ) { @@ -219,6 +351,67 @@ int cube_4way_full( cube_4way_context *sp, void *output, int hashbitlen, return 0; } +int cube_4way_2buf_full( cube_4way_2buf_context *sp, + void *output0, void *output1, int hashbitlen, + const void *data0, const void *data1, size_t size ) +{ + __m512i *h0 = (__m512i*)sp->h0; + __m512i *h1 = (__m512i*)sp->h1; + __m128i *iv = (__m128i*)( hashbitlen == 512 ? (__m128i*)IV512 + : (__m128i*)IV256 ); + sp->hashlen = hashbitlen/128; + sp->blocksize = 32/16; + sp->rounds = 16; + sp->pos = 0; + + h1[0] = h0[0] = m512_const1_128( iv[0] ); + h1[1] = h0[1] = m512_const1_128( iv[1] ); + h1[2] = h0[2] = m512_const1_128( iv[2] ); + h1[3] = h0[3] = m512_const1_128( iv[3] ); + h1[4] = h0[4] = m512_const1_128( iv[4] ); + h1[5] = h0[5] = m512_const1_128( iv[5] ); + h1[6] = h0[6] = m512_const1_128( iv[6] ); + h1[7] = h0[7] = m512_const1_128( iv[7] ); + + const int len = size >> 4; + const __m512i *in0 = (__m512i*)data0; + const __m512i *in1 = (__m512i*)data1; + __m512i *hash0 = (__m512i*)output0; + __m512i *hash1 = (__m512i*)output1; + int i; + + for ( i = 0; i < len; i++ ) + { + sp->h0[ sp->pos ] = _mm512_xor_si512( sp->h0[ sp->pos ], in0[i] ); + sp->h1[ sp->pos ] = _mm512_xor_si512( sp->h1[ sp->pos ], in1[i] ); + sp->pos++; + if ( sp->pos == sp->blocksize ) + { + transform_4way_2buf( sp ); + sp->pos = 0; + } + } + + // pos is zero for 64 byte data, 1 for 80 byte data. + __m512i tmp = m512_const2_64( 0, 0x0000000000000080 ); + sp->h0[ sp->pos ] = _mm512_xor_si512( sp->h0[ sp->pos ], tmp ); + sp->h1[ sp->pos ] = _mm512_xor_si512( sp->h1[ sp->pos ], tmp ); + + transform_4way_2buf( sp ); + + tmp = m512_const2_64( 0x0000000100000000, 0 ); + sp->h0[7] = _mm512_xor_si512( sp->h0[7], tmp ); + sp->h1[7] = _mm512_xor_si512( sp->h1[7], tmp ); + + for ( i = 0; i < 10; ++i ) + transform_4way_2buf( sp ); + + memcpy( hash0, sp->h0, sp->hashlen<<6); + memcpy( hash1, sp->h1, sp->hashlen<<6); + + return 0; +} + int cube_4way_update_close( cube_4way_context *sp, void *output, const void *data, size_t size ) @@ -259,6 +452,21 @@ int cube_4way_update_close( cube_4way_context *sp, void *output, // 2 way 128 +// This isn't expected to be used with AVX512 so HW rotate intruction +// is assumed not avaiable. +// Use double buffering to optimize serial bit rotations. Full double +// buffering isn't practical because it needs twice as many registers +// with AVX2 having only half as many as AVX512. 
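The double buffering described above is what the ROL2 macro below implements. For reference, a scalar analogue of the same shift/shift/or pairing, illustrative only and assuming plain uint32_t lanes with 0 < c < 32:

    #include <stdint.h>

    /* Two independent 32-bit rotates expanded the same way as ROL2:
     * shift left, shift right, OR.  Interleaving them keeps both
     * dependency chains in flight since neither result feeds the other. */
    static inline void rol2_u32( uint32_t *out0, uint32_t *out1,
                                 uint32_t in0, uint32_t in1, int c )
    {
       uint32_t t0 = in0 << c;
       uint32_t t1 = in1 << c;
       *out0 = ( in0 >> ( 32 - c ) ) | t0;
       *out1 = ( in1 >> ( 32 - c ) ) | t1;
    }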
+#define ROL2( out0, out1, in0, in1, c ) \ +{ \ + __m256i t0 = _mm256_slli_epi32( in0, c ); \ + __m256i t1 = _mm256_slli_epi32( in1, c ); \ + out0 = _mm256_srli_epi32( in0, 32-(c) ); \ + out1 = _mm256_srli_epi32( in1, 32-(c) ); \ + out0 = _mm256_or_si256( out0, t0 ); \ + out1 = _mm256_or_si256( out1, t1 ); \ +} + static void transform_2way( cube_2way_context *sp ) { int r; @@ -283,35 +491,31 @@ static void transform_2way( cube_2way_context *sp ) x7 = _mm256_add_epi32( x3, x7 ); y0 = x0; y1 = x1; - x0 = mm256_rol_32( x2, 7 ); - x1 = mm256_rol_32( x3, 7 ); - x2 = mm256_rol_32( y0, 7 ); - x3 = mm256_rol_32( y1, 7 ); + ROL2( x0, x1, x2, x3, 7 ); + ROL2( x2, x3, y0, y1, 7 ); x0 = _mm256_xor_si256( x0, x4 ); + x4 = mm256_swap128_64( x4 ); x1 = _mm256_xor_si256( x1, x5 ); x2 = _mm256_xor_si256( x2, x6 ); - x3 = _mm256_xor_si256( x3, x7 ); - x4 = mm256_swap128_64( x4 ); x5 = mm256_swap128_64( x5 ); - x6 = mm256_swap128_64( x6 ); - x7 = mm256_swap128_64( x7 ); + x3 = _mm256_xor_si256( x3, x7 ); x4 = _mm256_add_epi32( x0, x4 ); + x6 = mm256_swap128_64( x6 ); + y0 = x0; x5 = _mm256_add_epi32( x1, x5 ); + x7 = mm256_swap128_64( x7 ); x6 = _mm256_add_epi32( x2, x6 ); - x7 = _mm256_add_epi32( x3, x7 ); - y0 = x0; y1 = x2; - x0 = mm256_rol_32( x1, 11 ); - x1 = mm256_rol_32( y0, 11 ); - x2 = mm256_rol_32( x3, 11 ); - x3 = mm256_rol_32( y1, 11 ); + ROL2( x0, x1, x1, y0, 11 ); + x7 = _mm256_add_epi32( x3, x7 ); + ROL2( x2, x3, x3, y1, 11 ); x0 = _mm256_xor_si256( x0, x4 ); - x1 = _mm256_xor_si256( x1, x5 ); - x2 = _mm256_xor_si256( x2, x6 ); - x3 = _mm256_xor_si256( x3, x7 ); x4 = mm256_swap64_32( x4 ); + x1 = _mm256_xor_si256( x1, x5 ); x5 = mm256_swap64_32( x5 ); + x2 = _mm256_xor_si256( x2, x6 ); x6 = mm256_swap64_32( x6 ); + x3 = _mm256_xor_si256( x3, x7 ); x7 = mm256_swap64_32( x7 ); } diff --git a/algo/cubehash/cube-hash-2way.h b/algo/cubehash/cube-hash-2way.h index 25df10e8..a31ffde0 100644 --- a/algo/cubehash/cube-hash-2way.h +++ b/algo/cubehash/cube-hash-2way.h @@ -17,41 +17,41 @@ struct _cube_4way_context int pos; } __attribute__ ((aligned (128))); +struct _cube_4way_2buf_context +{ + __m512i h0[8]; + __m512i h1[8]; + int hashlen; + int rounds; + int blocksize; + int pos; +} __attribute__ ((aligned (128))); + + typedef struct _cube_4way_context cube_4way_context; +typedef struct _cube_4way_2buf_context cube_4way_2buf_context; + int cube_4way_init( cube_4way_context* sp, int hashbitlen, int rounds, - int blockbytes ); + int blockbytes ); + int cube_4way_update( cube_4way_context *sp, const void *data, size_t size ); + int cube_4way_close( cube_4way_context *sp, void *output ); + int cube_4way_update_close( cube_4way_context *sp, void *output, const void *data, size_t size ); + int cube_4way_full( cube_4way_context *sp, void *output, int hashbitlen, const void *data, size_t size ); -int cube_4x256_full( cube_4way_context *sp, void *output, int hashbitlen, - const void *data, size_t size ); - -#define cube512_4way_init( sp ) cube_4way_update( sp, 512 ) -#define cube512_4way_update cube_4way_update -#define cube512_4way_update_close cube_4way_update -#define cube512_4way_close cube_4way_update -#define cube512_4way_full( sp, output, data, size ) \ - cube_4way_full( sp, output, 512, data, size ) -#define cube512_4x256_full( sp, output, data, size ) \ - cube_4x256_full( sp, output, 512, data, size ) - -#define cube256_4way_init( sp ) cube_4way_update( sp, 256 ) -#define cube256_4way_update cube_4way_update -#define cube256_4way_update_close cube_4way_update -#define cube256_4way_close cube_4way_update 
-#define cube256_4way_full( sp, output, data, size ) \ - cube_4way_full( sp, output, 256, data, size ) -#define cube256_4x256_full( sp, output, data, size ) \ - cube_4x256_full( sp, output, 256, data, size ) +int cube_4way_2buf_full( cube_4way_2buf_context *sp, + void *output0, void *output1, int hashbitlen, + const void *data0, const void *data1, size_t size ); #endif -// 2x128, 2 way parallel SSE2 +// 2x128, 2 way parallel AVX2 struct _cube_2way_context { diff --git a/algo/cubehash/cubehash_sse2.c b/algo/cubehash/cubehash_sse2.c index c87829db..5ea1b6f6 100644 --- a/algo/cubehash/cubehash_sse2.c +++ b/algo/cubehash/cubehash_sse2.c @@ -31,10 +31,14 @@ static void transform( cubehashParam *sp ) for ( r = 0; r < rounds; ++r ) { x1 = _mm512_add_epi32( x0, x1 ); - x0 = _mm512_xor_si512( mm512_rol_32( mm512_swap_256( x0 ), 7 ), x1 ); - x1 = _mm512_add_epi32( x0, mm512_swap128_64( x1 ) ); - x0 = _mm512_xor_si512( mm512_rol_32( - mm512_swap256_128( x0 ), 11 ), x1 ); + x0 = mm512_swap_256( x0 ); + x0 = mm512_rol_32( x0, 7 ); + x0 = _mm512_xor_si512( x0, x1 ); + x1 = mm512_swap128_64( x1 ); + x1 = _mm512_add_epi32( x0, x1 ); + x0 = mm512_swap256_128( x0 ); + x0 = mm512_rol_32( x0, 11 ); + x0 = _mm512_xor_si512( x0, x1 ); x1 = mm512_swap64_32( x1 ); } diff --git a/algo/groestl/aes_ni/hash-groestl.h b/algo/groestl/aes_ni/hash-groestl.h index 595dc3df..b76d8098 100644 --- a/algo/groestl/aes_ni/hash-groestl.h +++ b/algo/groestl/aes_ni/hash-groestl.h @@ -43,7 +43,8 @@ #define ROUNDS (ROUNDS1024) //#endif -#define ROTL64(a,n) ((((a)<<(n))|((a)>>(64-(n))))&li_64(ffffffffffffffff)) +//#define ROTL64(a,n) ((((a)<<(n))|((a)>>(64-(n))))&li_64(ffffffffffffffff)) +#define ROTL64(a,n) rol64( a, n ) #if (PLATFORM_BYTE_ORDER == IS_BIG_ENDIAN) #define EXT_BYTE(var,n) ((u8)((u64)(var) >> (8*(7-(n))))) diff --git a/algo/groestl/aes_ni/hash-groestl256.h b/algo/groestl/aes_ni/hash-groestl256.h index 9410266c..32ce1a5f 100644 --- a/algo/groestl/aes_ni/hash-groestl256.h +++ b/algo/groestl/aes_ni/hash-groestl256.h @@ -63,7 +63,8 @@ typedef crypto_uint64 u64; //#define ROUNDS (ROUNDS1024) //#endif -#define ROTL64(a,n) ((((a)<<(n))|((a)>>(64-(n))))&li_64(ffffffffffffffff)) +//#define ROTL64(a,n) ((((a)<<(n))|((a)>>(64-(n))))&li_64(ffffffffffffffff)) +#define ROTL64(a,n) rol64( a, n ) #if (PLATFORM_BYTE_ORDER == IS_BIG_ENDIAN) #define EXT_BYTE(var,n) ((u8)((u64)(var) >> (8*(7-(n))))) diff --git a/algo/groestl/myr-groestl.c b/algo/groestl/myr-groestl.c index 5a673034..4f17c644 100644 --- a/algo/groestl/myr-groestl.c +++ b/algo/groestl/myr-groestl.c @@ -11,7 +11,7 @@ #else #include "sph_groestl.h" #endif -#include "algo/sha/sph_sha2.h" +#include "algo/sha/sha256-hash.h" typedef struct { #ifdef __AES__ @@ -19,7 +19,6 @@ typedef struct { #else sph_groestl512_context groestl; #endif - sph_sha256_context sha; } myrgr_ctx_holder; myrgr_ctx_holder myrgr_ctx; @@ -31,7 +30,6 @@ void init_myrgr_ctx() #else sph_groestl512_init( &myrgr_ctx.groestl ); #endif - sph_sha256_init( &myrgr_ctx.sha ); } void myriad_hash(void *output, const void *input) @@ -49,8 +47,7 @@ void myriad_hash(void *output, const void *input) sph_groestl512_close(&ctx.groestl, hash); #endif - sph_sha256( &ctx.sha, hash, 64 ); - sph_sha256_close( &ctx.sha, hash ); + sha256_full( hash, hash, 64 ); memcpy(output, hash, 32); } diff --git a/algo/hamsi/hamsi-hash-4way.c b/algo/hamsi/hamsi-hash-4way.c index 9944ebe4..26e133c9 100644 --- a/algo/hamsi/hamsi-hash-4way.c +++ b/algo/hamsi/hamsi-hash-4way.c @@ -632,26 +632,25 @@ do { \ } while (0) -#define ROUND_BIG8(rc, alpha) \ 
+#define ROUND_BIG8( alpha ) \ do { \ __m512i t0, t1, t2, t3; \ - s0 = _mm512_xor_si512( s0, m512_const1_64( \ - ( (uint64_t)(rc) << 32 ) ^ ( (uint64_t*)(alpha) )[ 0] ) ); \ - s1 = _mm512_xor_si512( s1, m512_const1_64( ( (uint64_t*)(alpha) )[ 1] ) ); \ - s2 = _mm512_xor_si512( s2, m512_const1_64( ( (uint64_t*)(alpha) )[ 2] ) ); \ - s3 = _mm512_xor_si512( s3, m512_const1_64( ( (uint64_t*)(alpha) )[ 3] ) ); \ - s4 = _mm512_xor_si512( s4, m512_const1_64( ( (uint64_t*)(alpha) )[ 4] ) ); \ - s5 = _mm512_xor_si512( s5, m512_const1_64( ( (uint64_t*)(alpha) )[ 5] ) ); \ - s6 = _mm512_xor_si512( s6, m512_const1_64( ( (uint64_t*)(alpha) )[ 6] ) ); \ - s7 = _mm512_xor_si512( s7, m512_const1_64( ( (uint64_t*)(alpha) )[ 7] ) ); \ - s8 = _mm512_xor_si512( s8, m512_const1_64( ( (uint64_t*)(alpha) )[ 8] ) ); \ - s9 = _mm512_xor_si512( s9, m512_const1_64( ( (uint64_t*)(alpha) )[ 9] ) ); \ - sA = _mm512_xor_si512( sA, m512_const1_64( ( (uint64_t*)(alpha) )[10] ) ); \ - sB = _mm512_xor_si512( sB, m512_const1_64( ( (uint64_t*)(alpha) )[11] ) ); \ - sC = _mm512_xor_si512( sC, m512_const1_64( ( (uint64_t*)(alpha) )[12] ) ); \ - sD = _mm512_xor_si512( sD, m512_const1_64( ( (uint64_t*)(alpha) )[13] ) ); \ - sE = _mm512_xor_si512( sE, m512_const1_64( ( (uint64_t*)(alpha) )[14] ) ); \ - sF = _mm512_xor_si512( sF, m512_const1_64( ( (uint64_t*)(alpha) )[15] ) ); \ + s0 = _mm512_xor_si512( s0, alpha[ 0] ); \ + s1 = _mm512_xor_si512( s1, alpha[ 1] ); \ + s2 = _mm512_xor_si512( s2, alpha[ 2] ); \ + s3 = _mm512_xor_si512( s3, alpha[ 3] ); \ + s4 = _mm512_xor_si512( s4, alpha[ 4] ); \ + s5 = _mm512_xor_si512( s5, alpha[ 5] ); \ + s6 = _mm512_xor_si512( s6, alpha[ 6] ); \ + s7 = _mm512_xor_si512( s7, alpha[ 7] ); \ + s8 = _mm512_xor_si512( s8, alpha[ 8] ); \ + s9 = _mm512_xor_si512( s9, alpha[ 9] ); \ + sA = _mm512_xor_si512( sA, alpha[10] ); \ + sB = _mm512_xor_si512( sB, alpha[11] ); \ + sC = _mm512_xor_si512( sC, alpha[12] ); \ + sD = _mm512_xor_si512( sD, alpha[13] ); \ + sE = _mm512_xor_si512( sE, alpha[14] ); \ + sF = _mm512_xor_si512( sF, alpha[15] ); \ \ SBOX8( s0, s4, s8, sC ); \ SBOX8( s1, s5, s9, sD ); \ @@ -731,28 +730,66 @@ do { \ #define P_BIG8 \ do { \ - ROUND_BIG8(0, alpha_n); \ - ROUND_BIG8(1, alpha_n); \ - ROUND_BIG8(2, alpha_n); \ - ROUND_BIG8(3, alpha_n); \ - ROUND_BIG8(4, alpha_n); \ - ROUND_BIG8(5, alpha_n); \ + __m512i alpha[16]; \ + for( int i = 0; i < 16; i++ ) \ + alpha[i] = m512_const1_64( ( (uint64_t*)alpha_n )[i] ); \ + ROUND_BIG8( alpha ); \ + alpha[0] = m512_const1_64( ( (uint64_t)1 << 32 ) \ + ^ ( (uint64_t*)alpha_n )[0] ); \ + ROUND_BIG8( alpha ); \ + alpha[0] = m512_const1_64( ( (uint64_t)2 << 32 ) \ + ^ ( (uint64_t*)alpha_n )[0] ); \ + ROUND_BIG8( alpha ); \ + alpha[0] = m512_const1_64( ( (uint64_t)3 << 32 ) \ + ^ ( (uint64_t*)alpha_n )[0] ); \ + ROUND_BIG8( alpha ); \ + alpha[0] = m512_const1_64( ( (uint64_t)4 << 32 ) \ + ^ ( (uint64_t*)alpha_n )[0] ); \ + ROUND_BIG8( alpha ); \ + alpha[0] = m512_const1_64( ( (uint64_t)5 << 32 ) \ + ^ ( (uint64_t*)alpha_n )[0] ); \ + ROUND_BIG8( alpha ); \ } while (0) #define PF_BIG8 \ do { \ - ROUND_BIG8( 0, alpha_f); \ - ROUND_BIG8( 1, alpha_f); \ - ROUND_BIG8( 2, alpha_f); \ - ROUND_BIG8( 3, alpha_f); \ - ROUND_BIG8( 4, alpha_f); \ - ROUND_BIG8( 5, alpha_f); \ - ROUND_BIG8( 6, alpha_f); \ - ROUND_BIG8( 7, alpha_f); \ - ROUND_BIG8( 8, alpha_f); \ - ROUND_BIG8( 9, alpha_f); \ - ROUND_BIG8(10, alpha_f); \ - ROUND_BIG8(11, alpha_f); \ + __m512i alpha[16]; \ + for( int i = 0; i < 16; i++ ) \ + alpha[i] = m512_const1_64( ( (uint64_t*)alpha_f )[i] ); \ + 
ROUND_BIG8( alpha ); \ + alpha[0] = m512_const1_64( ( (uint64_t)1 << 32 ) \ + ^ ( (uint64_t*)alpha_f )[0] ); \ + ROUND_BIG8( alpha ); \ + alpha[0] = m512_const1_64( ( (uint64_t)2 << 32 ) \ + ^ ( (uint64_t*)alpha_f )[0] ); \ + ROUND_BIG8( alpha ); \ + alpha[0] = m512_const1_64( ( (uint64_t)3 << 32 ) \ + ^ ( (uint64_t*)alpha_f )[0] ); \ + ROUND_BIG8( alpha ); \ + alpha[0] = m512_const1_64( ( (uint64_t)4 << 32 ) \ + ^ ( (uint64_t*)alpha_f )[0] ); \ + ROUND_BIG8( alpha ); \ + alpha[0] = m512_const1_64( ( (uint64_t)5 << 32 ) \ + ^ ( (uint64_t*)alpha_f )[0] ); \ + ROUND_BIG8( alpha ); \ + alpha[0] = m512_const1_64( ( (uint64_t)6 << 32 ) \ + ^ ( (uint64_t*)alpha_f )[0] ); \ + ROUND_BIG8( alpha ); \ + alpha[0] = m512_const1_64( ( (uint64_t)7 << 32 ) \ + ^ ( (uint64_t*)alpha_f )[0] ); \ + ROUND_BIG8( alpha ); \ + alpha[0] = m512_const1_64( ( (uint64_t)8 << 32 ) \ + ^ ( (uint64_t*)alpha_f )[0] ); \ + ROUND_BIG8( alpha ); \ + alpha[0] = m512_const1_64( ( (uint64_t)9 << 32 ) \ + ^ ( (uint64_t*)alpha_f )[0] ); \ + ROUND_BIG8( alpha ); \ + alpha[0] = m512_const1_64( ( (uint64_t)10 << 32 ) \ + ^ ( (uint64_t*)alpha_f )[0] ); \ + ROUND_BIG8( alpha ); \ + alpha[0] = m512_const1_64( ( (uint64_t)11 << 32 ) \ + ^ ( (uint64_t*)alpha_f )[0] ); \ + ROUND_BIG8( alpha ); \ } while (0) #define T_BIG8 \ @@ -965,26 +1002,25 @@ do { \ #define sF m7 */ -#define ROUND_BIG(rc, alpha) \ +#define ROUND_BIG( alpha ) \ do { \ __m256i t0, t1, t2, t3; \ - s0 = _mm256_xor_si256( s0, m256_const1_64( \ - ( (uint64_t)(rc) << 32 ) ^ ( (uint64_t*)(alpha) )[ 0] ) ); \ - s1 = _mm256_xor_si256( s1, m256_const1_64( ( (uint64_t*)(alpha) )[ 1] ) ); \ - s2 = _mm256_xor_si256( s2, m256_const1_64( ( (uint64_t*)(alpha) )[ 2] ) ); \ - s3 = _mm256_xor_si256( s3, m256_const1_64( ( (uint64_t*)(alpha) )[ 3] ) ); \ - s4 = _mm256_xor_si256( s4, m256_const1_64( ( (uint64_t*)(alpha) )[ 4] ) ); \ - s5 = _mm256_xor_si256( s5, m256_const1_64( ( (uint64_t*)(alpha) )[ 5] ) ); \ - s6 = _mm256_xor_si256( s6, m256_const1_64( ( (uint64_t*)(alpha) )[ 6] ) ); \ - s7 = _mm256_xor_si256( s7, m256_const1_64( ( (uint64_t*)(alpha) )[ 7] ) ); \ - s8 = _mm256_xor_si256( s8, m256_const1_64( ( (uint64_t*)(alpha) )[ 8] ) ); \ - s9 = _mm256_xor_si256( s9, m256_const1_64( ( (uint64_t*)(alpha) )[ 9] ) ); \ - sA = _mm256_xor_si256( sA, m256_const1_64( ( (uint64_t*)(alpha) )[10] ) ); \ - sB = _mm256_xor_si256( sB, m256_const1_64( ( (uint64_t*)(alpha) )[11] ) ); \ - sC = _mm256_xor_si256( sC, m256_const1_64( ( (uint64_t*)(alpha) )[12] ) ); \ - sD = _mm256_xor_si256( sD, m256_const1_64( ( (uint64_t*)(alpha) )[13] ) ); \ - sE = _mm256_xor_si256( sE, m256_const1_64( ( (uint64_t*)(alpha) )[14] ) ); \ - sF = _mm256_xor_si256( sF, m256_const1_64( ( (uint64_t*)(alpha) )[15] ) ); \ + s0 = _mm256_xor_si256( s0, alpha[ 0] ); \ + s1 = _mm256_xor_si256( s1, alpha[ 1] ); \ + s2 = _mm256_xor_si256( s2, alpha[ 2] ); \ + s3 = _mm256_xor_si256( s3, alpha[ 3] ); \ + s4 = _mm256_xor_si256( s4, alpha[ 4] ); \ + s5 = _mm256_xor_si256( s5, alpha[ 5] ); \ + s6 = _mm256_xor_si256( s6, alpha[ 6] ); \ + s7 = _mm256_xor_si256( s7, alpha[ 7] ); \ + s8 = _mm256_xor_si256( s8, alpha[ 8] ); \ + s9 = _mm256_xor_si256( s9, alpha[ 9] ); \ + sA = _mm256_xor_si256( sA, alpha[10] ); \ + sB = _mm256_xor_si256( sB, alpha[11] ); \ + sC = _mm256_xor_si256( sC, alpha[12] ); \ + sD = _mm256_xor_si256( sD, alpha[13] ); \ + sE = _mm256_xor_si256( sE, alpha[14] ); \ + sF = _mm256_xor_si256( sF, alpha[15] ); \ \ SBOX( s0, s4, s8, sC ); \ SBOX( s1, s5, s9, sD ); \ @@ -1064,28 +1100,66 @@ do { \ #define P_BIG \ do { \ - 
ROUND_BIG(0, alpha_n); \ - ROUND_BIG(1, alpha_n); \ - ROUND_BIG(2, alpha_n); \ - ROUND_BIG(3, alpha_n); \ - ROUND_BIG(4, alpha_n); \ - ROUND_BIG(5, alpha_n); \ + __m256i alpha[16]; \ + for( int i = 0; i < 16; i++ ) \ + alpha[i] = m256_const1_64( ( (uint64_t*)alpha_n )[i] ); \ + ROUND_BIG( alpha ); \ + alpha[0] = m256_const1_64( ( (uint64_t)1 << 32 ) \ + ^ ( (uint64_t*)alpha_n )[0] ); \ + ROUND_BIG( alpha ); \ + alpha[0] = m256_const1_64( ( (uint64_t)2 << 32 ) \ + ^ ( (uint64_t*)alpha_n )[0] ); \ + ROUND_BIG( alpha ); \ + alpha[0] = m256_const1_64( ( (uint64_t)3 << 32 ) \ + ^ ( (uint64_t*)alpha_n )[0] ); \ + ROUND_BIG( alpha ); \ + alpha[0] = m256_const1_64( ( (uint64_t)4 << 32 ) \ + ^ ( (uint64_t*)alpha_n )[0] ); \ + ROUND_BIG( alpha ); \ + alpha[0] = m256_const1_64( ( (uint64_t)5 << 32 ) \ + ^ ( (uint64_t*)alpha_n )[0] ); \ + ROUND_BIG( alpha ); \ } while (0) #define PF_BIG \ do { \ - ROUND_BIG( 0, alpha_f); \ - ROUND_BIG( 1, alpha_f); \ - ROUND_BIG( 2, alpha_f); \ - ROUND_BIG( 3, alpha_f); \ - ROUND_BIG( 4, alpha_f); \ - ROUND_BIG( 5, alpha_f); \ - ROUND_BIG( 6, alpha_f); \ - ROUND_BIG( 7, alpha_f); \ - ROUND_BIG( 8, alpha_f); \ - ROUND_BIG( 9, alpha_f); \ - ROUND_BIG(10, alpha_f); \ - ROUND_BIG(11, alpha_f); \ + __m256i alpha[16]; \ + for( int i = 0; i < 16; i++ ) \ + alpha[i] = m256_const1_64( ( (uint64_t*)alpha_f )[i] ); \ + ROUND_BIG( alpha ); \ + alpha[0] = m256_const1_64( ( (uint64_t)1 << 32 ) \ + ^ ( (uint64_t*)alpha_f )[0] ); \ + ROUND_BIG( alpha ); \ + alpha[0] = m256_const1_64( ( (uint64_t)2 << 32 ) \ + ^ ( (uint64_t*)alpha_f )[0] ); \ + ROUND_BIG( alpha ); \ + alpha[0] = m256_const1_64( ( (uint64_t)3 << 32 ) \ + ^ ( (uint64_t*)alpha_f )[0] ); \ + ROUND_BIG( alpha ); \ + alpha[0] = m256_const1_64( ( (uint64_t)4 << 32 ) \ + ^ ( (uint64_t*)alpha_f )[0] ); \ + ROUND_BIG( alpha ); \ + alpha[0] = m256_const1_64( ( (uint64_t)5 << 32 ) \ + ^ ( (uint64_t*)alpha_f )[0] ); \ + ROUND_BIG( alpha ); \ + alpha[0] = m256_const1_64( ( (uint64_t)6 << 32 ) \ + ^ ( (uint64_t*)alpha_f )[0] ); \ + ROUND_BIG( alpha ); \ + alpha[0] = m256_const1_64( ( (uint64_t)7 << 32 ) \ + ^ ( (uint64_t*)alpha_f )[0] ); \ + ROUND_BIG( alpha ); \ + alpha[0] = m256_const1_64( ( (uint64_t)8 << 32 ) \ + ^ ( (uint64_t*)alpha_f )[0] ); \ + ROUND_BIG( alpha ); \ + alpha[0] = m256_const1_64( ( (uint64_t)9 << 32 ) \ + ^ ( (uint64_t*)alpha_f )[0] ); \ + ROUND_BIG( alpha ); \ + alpha[0] = m256_const1_64( ( (uint64_t)10 << 32 ) \ + ^ ( (uint64_t*)alpha_f )[0] ); \ + ROUND_BIG( alpha ); \ + alpha[0] = m256_const1_64( ( (uint64_t)11 << 32 ) \ + ^ ( (uint64_t*)alpha_f )[0] ); \ + ROUND_BIG( alpha ); \ } while (0) #define T_BIG \ diff --git a/algo/hodl/hodl-wolf.c b/algo/hodl/hodl-wolf.c index 6ff61757..7ce79da8 100644 --- a/algo/hodl/hodl-wolf.c +++ b/algo/hodl/hodl-wolf.c @@ -7,6 +7,7 @@ #include "hodl-gate.h" #include "hodl-wolf.h" #include "miner.h" +#include "algo/sha/sha256d.h" #if defined(__AES__) diff --git a/algo/keccak/keccak-gate.c b/algo/keccak/keccak-gate.c index 282ae91a..c710836b 100644 --- a/algo/keccak/keccak-gate.c +++ b/algo/keccak/keccak-gate.c @@ -1,5 +1,6 @@ #include "keccak-gate.h" #include "sph_keccak.h" +#include "algo/sha/sha256d.h" int hard_coded_eb = 1; diff --git a/algo/keccak/keccak-hash-4way.c b/algo/keccak/keccak-hash-4way.c index e2545b4d..af37d6f6 100644 --- a/algo/keccak/keccak-hash-4way.c +++ b/algo/keccak/keccak-hash-4way.c @@ -70,13 +70,13 @@ static const uint64_t RC[] = { // Targetted macros, keccak-macros.h is included for each target. 
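For reference, scalar stand-ins for the same target macros ahead of the AVX512 definitions below. Illustrative only; the forms assumed for XOROR and XORAND are a ^ (b | c) and a ^ (b & c), matching the vector helpers they wrap, and ROL64 assumes 0 < n < 64:

    #include <stdint.h>

    /* One copy of the shared round code serves every SIMD width because it
     * is written purely in terms of these macros; a scalar build would
     * define them like this before including the shared file. */
    #define DECL64(x)          uint64_t x
    #define XOR64(d, a, b)     ( (d) = (a) ^ (b) )
    #define AND64(d, a, b)     ( (d) = (a) & (b) )
    #define OR64(d, a, b)      ( (d) = (a) | (b) )
    #define NOT64(d, s)        ( (d) = ~(s) )
    #define ROL64(d, v, n)     ( (d) = ( (v) << (n) ) | ( (v) >> ( 64 - (n) ) ) )
    #define XOROR(d, a, b, c)  ( (d) = (a) ^ ( (b) | (c) ) )   /* assumed form */
    #define XORAND(d, a, b, c) ( (d) = (a) ^ ( (b) & (c) ) )   /* assumed form */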
-#define DECL64(x) __m512i x -#define XOR64(d, a, b) (d = _mm512_xor_si512(a,b)) -#define AND64(d, a, b) (d = _mm512_and_si512(a,b)) -#define OR64(d, a, b) (d = _mm512_or_si512(a,b)) -#define NOT64(d, s) (d = _mm512_xor_si512(s,m512_neg1)) -#define ROL64(d, v, n) (d = mm512_rol_64(v, n)) -#define XOROR(d, a, b, c) (d = mm512_xoror(a, b, c)) +#define DECL64(x) __m512i x +#define XOR64(d, a, b) (d = _mm512_xor_si512(a,b)) +#define AND64(d, a, b) (d = _mm512_and_si512(a,b)) +#define OR64(d, a, b) (d = _mm512_or_si512(a,b)) +#define NOT64(d, s) (d = _mm512_xor_si512(s,m512_neg1)) +#define ROL64(d, v, n) (d = mm512_rol_64(v, n)) +#define XOROR(d, a, b, c) (d = mm512_xoror(a, b, c)) #define XORAND(d, a, b, c) (d = mm512_xorand(a, b, c)) diff --git a/algo/lyra2/allium-4way.c b/algo/lyra2/allium-4way.c index 833b87ec..f15648ae 100644 --- a/algo/lyra2/allium-4way.c +++ b/algo/lyra2/allium-4way.c @@ -16,7 +16,7 @@ typedef struct { blake256_16way_context blake; keccak256_8way_context keccak; - cube_4way_context cube; + cube_4way_2buf_context cube; skein256_8way_context skein; #if defined(__VAES__) groestl256_4way_context groestl; @@ -30,13 +30,7 @@ static __thread allium_16way_ctx_holder allium_16way_ctx; bool init_allium_16way_ctx() { keccak256_8way_init( &allium_16way_ctx.keccak ); - cube_4way_init( &allium_16way_ctx.cube, 256, 16, 32 ); skein256_8way_init( &allium_16way_ctx.skein ); -#if defined(__VAES__) - groestl256_4way_init( &allium_16way_ctx.groestl, 32 ); -#else - init_groestl256( &allium_16way_ctx.groestl, 32 ); -#endif return true; } @@ -111,12 +105,11 @@ void allium_16way_hash( void *state, const void *input ) intrlv_2x256( vhash, hash14, hash15, 256 ); LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 8, 8 ); dintrlv_2x256( hash14, hash15, vhash, 256 ); - + intrlv_4x128( vhashA, hash0, hash1, hash2, hash3, 256 ); intrlv_4x128( vhashB, hash4, hash5, hash6, hash7, 256 ); - cube_4way_full( &ctx.cube, vhashA, 256, vhashA, 32 ); - cube_4way_full( &ctx.cube, vhashB, 256, vhashB, 32 ); + cube_4way_2buf_full( &ctx.cube, vhashA, vhashB, 256, vhashA, vhashB, 32 ); dintrlv_4x128( hash0, hash1, hash2, hash3, vhashA, 256 ); dintrlv_4x128( hash4, hash5, hash6, hash7, vhashB, 256 ); @@ -124,8 +117,7 @@ void allium_16way_hash( void *state, const void *input ) intrlv_4x128( vhashA, hash8, hash9, hash10, hash11, 256 ); intrlv_4x128( vhashB, hash12, hash13, hash14, hash15, 256 ); - cube_4way_full( &ctx.cube, vhashA, 256, vhashA, 32 ); - cube_4way_full( &ctx.cube, vhashB, 256, vhashB, 32 ); + cube_4way_2buf_full( &ctx.cube, vhashA, vhashB, 256, vhashA, vhashB, 32 ); dintrlv_4x128( hash8, hash9, hash10, hash11, vhashA, 256 ); dintrlv_4x128( hash12, hash13, hash14, hash15, vhashB, 256 ); @@ -255,7 +247,7 @@ int scanhash_allium_16way( struct work *work, uint32_t max_nonce, typedef struct { blake256_8way_context blake; keccak256_4way_context keccak; - cubehashParam cube; + cube_2way_context cube; skein256_4way_context skein; #if defined(__VAES__) groestl256_2way_context groestl; @@ -269,13 +261,7 @@ static __thread allium_8way_ctx_holder allium_8way_ctx; bool init_allium_8way_ctx() { keccak256_4way_init( &allium_8way_ctx.keccak ); - cubehashInit( &allium_8way_ctx.cube, 256, 16, 32 ); skein256_4way_init( &allium_8way_ctx.skein ); -#if defined(__VAES__) - groestl256_2way_init( &allium_8way_ctx.groestl, 32 ); -#else - init_groestl256( &allium_8way_ctx.groestl, 32 ); -#endif return true; } @@ -320,21 +306,20 @@ void allium_8way_hash( void *hash, const void *input ) LYRA2RE( hash6, 32, hash6, 32, hash6, 32, 1, 8, 8 ); LYRA2RE( 
hash7, 32, hash7, 32, hash7, 32, 1, 8, 8 ); - cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*)hash0, 32 ); - cubehashInit( &ctx.cube, 256, 16, 32 ); - cubehashUpdateDigest( &ctx.cube, (byte*)hash1, (const byte*)hash1, 32 ); - cubehashInit( &ctx.cube, 256, 16, 32 ); - cubehashUpdateDigest( &ctx.cube, (byte*)hash2, (const byte*)hash2, 32 ); - cubehashInit( &ctx.cube, 256, 16, 32 ); - cubehashUpdateDigest( &ctx.cube, (byte*)hash3, (const byte*)hash3, 32 ); - cubehashInit( &ctx.cube, 256, 16, 32 ); - cubehashUpdateDigest( &ctx.cube, (byte*)hash4, (const byte*)hash4, 32 ); - cubehashInit( &ctx.cube, 256, 16, 32 ); - cubehashUpdateDigest( &ctx.cube, (byte*)hash5, (const byte*)hash5, 32 ); - cubehashInit( &ctx.cube, 256, 16, 32 ); - cubehashUpdateDigest( &ctx.cube, (byte*)hash6, (const byte*)hash6, 32 ); - cubehashInit( &ctx.cube, 256, 16, 32 ); - cubehashUpdateDigest( &ctx.cube, (byte*)hash7, (const byte*)hash7, 32 ); + + intrlv_2x128( vhashA, hash0, hash1, 256 ); + intrlv_2x128( vhashB, hash2, hash3, 256 ); + cube_2way_full( &ctx.cube, vhashA, 256, vhashA, 32 ); + cube_2way_full( &ctx.cube, vhashB, 256, vhashB, 32 ); + dintrlv_2x128( hash0, hash1, vhashA, 256 ); + dintrlv_2x128( hash2, hash3, vhashB, 256 ); + + intrlv_2x128( vhashA, hash4, hash5, 256 ); + intrlv_2x128( vhashB, hash6, hash7, 256 ); + cube_2way_full( &ctx.cube, vhashA, 256, vhashA, 32 ); + cube_2way_full( &ctx.cube, vhashB, 256, vhashB, 32 ); + dintrlv_2x128( hash4, hash5, vhashA, 256 ); + dintrlv_2x128( hash6, hash7, vhashB, 256 ); LYRA2RE( hash0, 32, hash0, 32, hash0, 32, 1, 8, 8 ); LYRA2RE( hash1, 32, hash1, 32, hash1, 32, 1, 8, 8 ); diff --git a/algo/lyra2/sponge.h b/algo/lyra2/sponge.h index b24b1730..1c904447 100644 --- a/algo/lyra2/sponge.h +++ b/algo/lyra2/sponge.h @@ -66,13 +66,13 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){ #define LYRA_ROUND_2WAY_AVX512( s0, s1, s2, s3 ) \ G2W_4X64( s0, s1, s2, s3 ); \ - s1 = mm512_ror256_64( s1); \ + s3 = mm512_shufll256_64( s3 ); \ + s1 = mm512_shuflr256_64( s1); \ s2 = mm512_swap256_128( s2 ); \ - s3 = mm512_rol256_64( s3 ); \ G2W_4X64( s0, s1, s2, s3 ); \ - s1 = mm512_rol256_64( s1 ); \ - s2 = mm512_swap256_128( s2 ); \ - s3 = mm512_ror256_64( s3 ); + s3 = mm512_shuflr256_64( s3 ); \ + s1 = mm512_shufll256_64( s1 ); \ + s2 = mm512_swap256_128( s2 ); #define LYRA_12_ROUNDS_2WAY_AVX512( s0, s1, s2, s3 ) \ LYRA_ROUND_2WAY_AVX512( s0, s1, s2, s3 ) \ @@ -107,13 +107,13 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){ #define LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \ G_4X64( s0, s1, s2, s3 ); \ - s1 = mm256_ror_1x64( s1); \ + s3 = mm256_shufll_64( s3 ); \ + s1 = mm256_shuflr_64( s1); \ s2 = mm256_swap_128( s2 ); \ - s3 = mm256_rol_1x64( s3 ); \ G_4X64( s0, s1, s2, s3 ); \ - s1 = mm256_rol_1x64( s1 ); \ - s2 = mm256_swap_128( s2 ); \ - s3 = mm256_ror_1x64( s3 ); + s3 = mm256_shuflr_64( s3 ); \ + s1 = mm256_shufll_64( s1 ); \ + s2 = mm256_swap_128( s2 ); #define LYRA_12_ROUNDS_AVX2( s0, s1, s2, s3 ) \ LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \ @@ -148,14 +148,14 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){ #define LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \ G_2X64( s0, s2, s4, s6 ); \ G_2X64( s1, s3, s5, s7 ); \ - mm128_ror256_64( s2, s3 ); \ + mm128_vrol256_64( s6, s7 ); \ + mm128_vror256_64( s2, s3 ); \ mm128_swap256_128( s4, s5 ); \ - mm128_rol256_64( s6, s7 ); \ G_2X64( s0, s2, s4, s6 ); \ G_2X64( s1, s3, s5, s7 ); \ - mm128_rol256_64( s2, s3 ); \ - mm128_swap256_128( s4, s5 ); \ - mm128_ror256_64( s6, s7 ); + 
mm128_vror256_64( s6, s7 ); \ + mm128_vrol256_64( s2, s3 ); \ + mm128_swap256_128( s4, s5 ); #define LYRA_12_ROUNDS_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \ LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \ diff --git a/algo/m7m/m7m.c b/algo/m7m/m7m.c index ab13a7e3..2bf4a11f 100644 --- a/algo/m7m/m7m.c +++ b/algo/m7m/m7m.c @@ -13,6 +13,7 @@ #include "algo/whirlpool/sph_whirlpool.h" #include "algo/ripemd/sph_ripemd.h" #include "algo/sha/sph_sha2.h" +#include "algo/sha/sha256-hash.h" #define EPSa DBL_EPSILON #define EPS1 DBL_EPSILON @@ -104,8 +105,8 @@ uint32_t sw2_( int nnounce ) } typedef struct { - sph_sha256_context sha256; - sph_sha512_context sha512; + sha256_context sha256; + sph_sha512_context sha512; sph_keccak512_context keccak; sph_whirlpool_context whirlpool; sph_haval256_5_context haval; @@ -117,7 +118,7 @@ m7m_ctx_holder m7m_ctx; void init_m7m_ctx() { - sph_sha256_init( &m7m_ctx ); + sha256_ctx_init( &m7m_ctx.sha256 ); sph_sha512_init( &m7m_ctx.sha512 ); sph_keccak512_init( &m7m_ctx.keccak ); sph_whirlpool_init( &m7m_ctx.whirlpool ); @@ -153,11 +154,10 @@ int scanhash_m7m_hash( struct work* work, uint64_t max_nonce, m7m_ctx_holder ctx1, ctx2 __attribute__ ((aligned (64))); memcpy( &ctx1, &m7m_ctx, sizeof(m7m_ctx) ); - sph_sha256_context ctxf_sha256; memcpy(data, pdata, 80); - sph_sha256( &ctx1.sha256, data, M7_MIDSTATE_LEN ); + sha256_update( &ctx1.sha256, data, M7_MIDSTATE_LEN ); sph_sha512( &ctx1.sha512, data, M7_MIDSTATE_LEN ); sph_keccak512( &ctx1.keccak, data, M7_MIDSTATE_LEN ); sph_whirlpool( &ctx1.whirlpool, data, M7_MIDSTATE_LEN ); @@ -189,8 +189,8 @@ int scanhash_m7m_hash( struct work* work, uint64_t max_nonce, memcpy( &ctx2, &ctx1, sizeof(m7m_ctx) ); - sph_sha256( &ctx2.sha256, data_p64, 80 - M7_MIDSTATE_LEN ); - sph_sha256_close( &ctx2.sha256, bhash[0] ); + sha256_update( &ctx2.sha256, data_p64, 80 - M7_MIDSTATE_LEN ); + sha256_final( &ctx2.sha256, bhash[0] ); sph_sha512( &ctx2.sha512, data_p64, 80 - M7_MIDSTATE_LEN ); sph_sha512_close( &ctx2.sha512, bhash[1] ); @@ -225,9 +225,7 @@ int scanhash_m7m_hash( struct work* work, uint64_t max_nonce, bytes = mpz_sizeinbase(product, 256); mpz_export((void *)bdata, NULL, -1, 1, 0, 0, product); - sph_sha256_init( &ctxf_sha256 ); - sph_sha256( &ctxf_sha256, bdata, bytes ); - sph_sha256_close( &ctxf_sha256, hash ); + sha256_full( hash, bdata, bytes ); digits=(int)((sqrt((double)(n/2))*(1.+EPS))/9000+75); mp_bitcnt_t prec = (long int)(digits*BITS_PER_DIGIT+16); @@ -260,10 +258,8 @@ int scanhash_m7m_hash( struct work* work, uint64_t max_nonce, mpzscale=bytes; mpz_export(bdata, NULL, -1, 1, 0, 0, product); - sph_sha256_init( &ctxf_sha256 ); - sph_sha256( &ctxf_sha256, bdata, bytes ); - sph_sha256_close( &ctxf_sha256, hash ); - } + sha256_full( hash, bdata, bytes ); + } if ( unlikely( valid_hash( (uint64_t*)hash, (uint64_t*)ptarget ) && !opt_benchmark ) ) diff --git a/algo/ripemd/lbry.c b/algo/ripemd/lbry.c index 94f34171..e91b287c 100644 --- a/algo/ripemd/lbry.c +++ b/algo/ripemd/lbry.c @@ -7,24 +7,19 @@ #include #include #include "sph_ripemd.h" -#include "algo/sha/sph_sha2.h" +#include "algo/sha/sha256-hash.h" void lbry_hash(void* output, const void* input) { - sph_sha256_context ctx_sha256 __attribute__ ((aligned (64))); + sha256_context ctx_sha256 __attribute__ ((aligned (64))); sph_sha512_context ctx_sha512 __attribute__ ((aligned (64))); sph_ripemd160_context ctx_ripemd __attribute__ ((aligned (64))); uint32_t _ALIGN(64) hashA[16]; uint32_t _ALIGN(64) hashB[16]; uint32_t _ALIGN(64) hashC[16]; - sph_sha256_init( &ctx_sha256 ); - sph_sha256( 
&ctx_sha256, input, 112 );
-   sph_sha256_close( &ctx_sha256, hashA );
-
-   sph_sha256_init( &ctx_sha256 );
-   sph_sha256( &ctx_sha256, hashA, 32 );
-   sph_sha256_close( &ctx_sha256, hashA );
+   sha256_full( hashA, input, 112 );
+   sha256_full( hashA, hashA, 32 );
 
    sph_sha512_init( &ctx_sha512 );
    sph_sha512( &ctx_sha512, hashA, 32 );
@@ -38,15 +33,13 @@ void lbry_hash(void* output, const void* input)
    sph_ripemd160 ( &ctx_ripemd, hashA+8, 32 );
    sph_ripemd160_close( &ctx_ripemd, hashC );
 
-   sph_sha256_init( &ctx_sha256 );
-   sph_sha256( &ctx_sha256, hashB, 20 );
-   sph_sha256( &ctx_sha256, hashC, 20 );
-   sph_sha256_close( &ctx_sha256, hashA );
-
-   sph_sha256_init( &ctx_sha256 );
-   sph_sha256( &ctx_sha256, hashA, 32 );
-   sph_sha256_close( &ctx_sha256, hashA );
+   sha256_ctx_init( &ctx_sha256 );
+   sha256_update( &ctx_sha256, hashB, 20 );
+   sha256_update( &ctx_sha256, hashC, 20 );
+   sha256_final( &ctx_sha256, hashA );
+   sha256_full( hashA, hashA, 32 );
+
    memcpy( output, hashA, 32 );
 }
 
diff --git a/algo/scrypt/neoscrypt.c b/algo/scrypt/neoscrypt.c
index 7cb4c828..709b2688 100644
--- a/algo/scrypt/neoscrypt.c
+++ b/algo/scrypt/neoscrypt.c
@@ -69,8 +69,12 @@ typedef unsigned int uint;
 #define SCRYPT_HASH_BLOCK_SIZE 64U
 #define SCRYPT_HASH_DIGEST_SIZE 32U
 
-#define ROTL32(a,b) (((a) << (b)) | ((a) >> (32 - b)))
-#define ROTR32(a,b) (((a) >> (b)) | ((a) << (32 - b)))
+//#define ROTL32(a,b) (((a) << (b)) | ((a) >> (32 - b)))
+//#define ROTR32(a,b) (((a) >> (b)) | ((a) << (32 - b)))
+
+#define ROTL32(a,b) rol32(a,b)
+#define ROTR32(a,b) ror32(a,b)
+
 
 #define U8TO32_BE(p) \
   (((uint32_t)((p)[0]) << 24) | ((uint32_t)((p)[1]) << 16) | \
 
diff --git a/algo/scrypt/scrypt-core-4way.c b/algo/scrypt/scrypt-core-4way.c
new file mode 100644
index 00000000..19ff9cdd
--- /dev/null
+++ b/algo/scrypt/scrypt-core-4way.c
@@ -0,0 +1,3981 @@
+#include "scrypt-core-4way.h"
+
+//////////////////////////////////////////////////////////////////////////
+//
+// Optimized Salsa implementation inspired by Pooler.
+// Any similarities are not a coincidence.
+//
+// Implementations include reference X64, SSE2, AVX2 & AVX512
+// using both serial and parallel vectoring with SIMD instructions.
+//
+// Generic macros are provided and invoked with different targets depending
+// on level of parallelism and data organization. Targets for any macros
+// needed must be defined by the calling function. XOR, ROL32 and ADD32
+// are needed in all cases. Additionally ROL_1X32, SWAP_64 and ROR_1X32
+// shuffles are needed for serial SIMD.
+//
+// SALSA_8ROUNDS_SIMD uses vectors on serial data rather than traditional
+// n-way parallel hashing.
+// The SIMD version has different implied arguments {X0:X3}, representing
+// an array of 4 vectors of 4 32 bit words, while the version used for
+// regular parallel hashing has {x0:xf} representing an array of 16
+// 32 bit words.
+// These arguments must be defined by the calling function.
+// The calling function must also define targets for all macros used for
+// arithmetic, logic and shuffling: XOR, ROL32, ADD32 for all targets and
+// ROL_1X32, SWAP_64, ROR_1X32 for serial SIMD targets.
+//
+// Serial and parallel SIMD will be combined with AVX2 doing 2 way
+// parallel over 4 way linear for 8 way throughput, and AVX512 doing
+// 4 way parallel over 4 way linear for 16 way throughput.
+//
+// The term SIMD128 here refers to vectors that contain multiple contiguous
+// data from a single stream (lane) as opposed to parallel vectors that
+// contain interleaved words of data from multiple streams.
+//
+// The sequencing of techniques in the naming convention is a little
+// mixed up. The logical hierarchy top down is to put Nbuffs at the top
+// where each buffer then performs another technique.
+//
+// Although Nway and SIMD128 are listed in top down order, Nbuffs is
+// always listed last:
+//
+// scrypt_core_simd128_2way means a linear simd operation on 2 parallel
+// streams of data while
+// scrypt_core_2way_simd128 is 2 parallel streams of linear SIMD vectors.
+//
+///////////////////////////////////////////////////////////////////////////
+
+
+// Used by all targets, needs XOR, ROL32 & ADD32 macros defined.
+// Function-like, the return value typically overwrites in1.
+//
+#define ARX( in1, in2, in3, n ) \
+   XOR( in1, ROL32( ADD32( in2, in3 ), n ) )
+
+// Multi buffering has 2 main benefits and one drawback.
+// Traditionally double buffering has been used to empty one bucket
+// while another is filling. This requires a second (or 3rd, etc)
+// bucket. The computing analogy is to use 2 registers, 1 to read
+// and 1 to write, and switch back and forth.
+//
+// The second benefit in computing is using multiple registers to
+// provide data independence that improves multiple instruction issue and
+// pipelining in the CPU. The number of buffers is limited by the number
+// of registers available. Three seems to be a sweet spot as a 4 variable
+// data set uses 12 registers triple buffered, leaving 4 of 16 as temps.
+// Many pipelined instructions require 3 clocks to complete and triple
+// buffering keeps the pipeline full. Many execution units are also 3 wide
+// allowing up to 3 similar instructions to be issued per clock.
+// However, execution units are shared by hyperthreading which reduces
+// the effect on a single thread.
+//
+// The drawback is the increased size of the data. Although multi buffering
+// also improves memory throughput this is offset by the amount of
+// memory required and its effect on cache performance and will eventually
+// hit memory bus saturation.
+//
+// For example scryptn2 struggles with more than 4 buffers, multi
+// buffered and parallel SIMD combined, and performance drops. This can
+// be mitigated somewhat by reducing the number of CPU threads but
+// ultimately excessive multi buffering has a negative impact.
+//
+// Unlike parallel SIMD, increasing multi buffering does not require a
+// CPU technology increase, ie SSE2 to AVX2 or AVX2 to AVX512.
+// SSE2 is limited to 4 way SIMD but there is no theoretical limit to
+// multi buffering. Multi buffering also does not suffer the clock penalty
+// of increasing parallelism.
+//
+// Multi buffering implementations here focus on powers of 2,
+// to match sha256 without re-interleaving the data.
+//
+// A decision will have to be made at run time, based on the N factor,
+// whether to use multi buffering or serial execution.
+
+// Need TYPE macro defined.
+#define ARX_2BUF( a1, a2, a3, b1, b2, b3, n ) \
+do{ \
+   TYPE ta = ADD32( a2, a3 ); \
+   TYPE tb = ADD32( b2, b3 ); \
+   ta = ROL32( ta, n ); \
+   tb = ROL32( tb, n ); \
+   a1 = XOR( a1, ta ); \
+   b1 = XOR( b1, tb ); \
+} while (0);
+
+#define ARX_3BUF( a1, a2, a3, b1, b2, b3, c1, c2, c3, n ) \
+do{ \
+   TYPE ta = ADD32( a2, a3 ); \
+   TYPE tb = ADD32( b2, b3 ); \
+   TYPE tc = ADD32( c2, c3 ); \
+   ta = ROL32( ta, n ); \
+   tb = ROL32( tb, n ); \
+   tc = ROL32( tc, n ); \
+   a1 = XOR( a1, ta ); \
+   b1 = XOR( b1, tb ); \
+   c1 = XOR( c1, tc ); \
+} while (0);
+
+// use 16 regs AVX, AVX2, 8 buf for AVX512?
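To make the ARX building block above concrete: with scalar targets (XOR as ^, ADD32 as +, ROL32 as a 32-bit left rotate), ARX( in1, in2, in3, n ) is the familiar Salsa20 add-rotate-xor step. A minimal scalar sketch for illustration only; the function name here is hypothetical and not part of the patch.

#include <stdint.h>

/* Scalar equivalent of ARX( in1, in2, in3, n ): rotate-left of a sum,
   XORed into in1. Assumes 0 < n < 32. */
static inline uint32_t arx_scalar( uint32_t in1, uint32_t in2,
                                   uint32_t in3, unsigned n )
{
   uint32_t t = in2 + in3;
   return in1 ^ ( ( t << n ) | ( t >> ( 32 - n ) ) );
}

/* Example use, mirroring the first column step of a Salsa round:
   x4 = arx_scalar( x4, x0, xc, 7 ); */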
+#define ARX_4BUF( a1, a2, a3, b1, b2, b3, c1, c2, c3, d1, d2, d3, n ) \ +do{ \ + TYPE ta = ADD32( a2, a3 ); \ + TYPE tb = ADD32( b2, b3 ); \ + TYPE tc = ADD32( c2, c3 ); \ + TYPE td = ADD32( d2, d3 ); \ + ta = ROL32( ta, n ); \ + tb = ROL32( tb, n ); \ + tc = ROL32( tc, n ); \ + td = ROL32( td, n ); \ + a1 = XOR( a1, ta ); \ + b1 = XOR( b1, tb ); \ + c1 = XOR( c1, tc ); \ + d1 = XOR( d1, td ); \ +} while (0); + + +// Used by SIMD128 and hybrid targets, needs also ROL_1X32, SWAP_64 & +// ROR_1X32 defined. +// +// Implied arguments ( X0 = { x3, x2, x1, x0 }, +// X1 = { x7, x6, x5, x4 }, +// X3 = { xb, xa, x9, x8 }, +// X3 = { xf, xe, xd, xc } ) +// +#define SALSA_2ROUNDS_SIMD128( X0, X1, X2, X3 ) \ + /* Operate on columns */ \ + X1 = ARX( X1, X0, X3, 7 ); /* ( x4, x0, xc, 7 ) */ \ + X2 = ARX( X2, X1, X0, 9 ); /* ( x8, x4, x0, 9 ) */ \ + X3 = ARX( X3, X2, X1, 13 ); /* ( xc, x8, x4, 13 ) */ \ + X0 = ARX( X0, X3, X2, 18 ); /* ( x0, xc, x8, 18 ) */ \ + /* Rearrange data */ \ + X1 = ROL_1X32( X1 ); \ + X3 = ROR_1X32( X3 ); \ + X2 = SWAP_64( X2 ); \ + /* Operate on rows */ \ + X3 = ARX( X3, X0, X1, 7 ); \ + X2 = ARX( X2, X3, X0, 9 ); \ + X1 = ARX( X1, X2, X3, 13 ); \ + X0 = ARX( X0, X1, X2, 18 ); \ + /* Rearrange data */ \ + X3 = ROL_1X32( X3 ); \ + X1 = ROR_1X32( X1 ); \ + X2 = SWAP_64( X2 ); \ + +// Final round optimization, don't rearange data back to original order on exit +// Used only on pre-AVX2 CPUs where blend instruction is not avaiable. +// It saves a few redundant shuffles. +#define SALSA_2ROUNDS_FINAL_SIMD128( X0, X1, X2, X3 ) \ + /* Operate on columns */ \ + X1 = ARX( X1, X0, X3, 7 ); /* ( x4, x0, xc, 7 ) */ \ + X2 = ARX( X2, X1, X0, 9 ); /* ( x8, x4, x0, 9 ) */ \ + X3 = ARX( X3, X2, X1, 13 ); /* ( xc, x8, x4, 13 ) */ \ + X0 = ARX( X0, X3, X2, 18 ); /* ( x0, xc, x8, 18 ) */ \ + /* Rearrange data */ \ + X1 = ROL_1X32( X1 ); \ + X3 = ROR_1X32( X3 ); \ + X2 = SWAP_64( X2 ); \ + /* Operate on rows */ \ + X3 = ARX( X3, X0, X1, 7 ); \ + X2 = ARX( X2, X3, X0, 9 ); \ + X1 = ARX( X1, X2, X3, 13 ); \ + X0 = ARX( X0, X1, X2, 18 ); \ + /* Final round, don't rearrange data + X1 = ROR_1X32( X1 ); \ + X2 = SWAP_64( X2 ); \ + X3 = ROL_1X32( X3 ); */ + +// Implied args ( XA0, XA1, XA2, XA3, XB0, XB1, XB2, XB3 ) +#define SALSA_2ROUNDS_SIMD128_2BUF \ + ARX_2BUF( XA1, XA0, XA3, XB1, XB0, XB3, 7 ); \ + ARX_2BUF( XA2, XA1, XA0, XB2, XB1, XB0, 9 ); \ + ARX_2BUF( XA3, XA2, XA1, XB3, XB2, XB1, 13 ); \ + ARX_2BUF( XA0, XA3, XA2, XB0, XB3, XB2, 18 ); \ + XA1 = ROL_1X32( XA1 ); \ + XB1 = ROL_1X32( XB1 ); \ + XA3 = ROR_1X32( XA3 ); \ + XB3 = ROR_1X32( XB3 ); \ + XA2 = SWAP_64( XA2 ); \ + XB2 = SWAP_64( XB2 ); \ + ARX_2BUF( XA3, XA0, XA1, XB3, XB0, XB1, 7 ); \ + ARX_2BUF( XA2, XA3, XA0, XB2, XB3, XB0, 9 ); \ + ARX_2BUF( XA1, XA2, XA3, XB1, XB2, XB3, 13 ); \ + ARX_2BUF( XA0, XA1, XA2, XB0, XB1, XB2, 18 ); \ + XA3 = ROL_1X32( XA3 ); \ + XB3 = ROL_1X32( XB3 ); \ + XA1 = ROR_1X32( XA1 ); \ + XB1 = ROR_1X32( XB1 ); \ + XA2 = SWAP_64( XA2 ); \ + XB2 = SWAP_64( XB2 ); + +#define SALSA_2ROUNDS_FINAL_SIMD128_2BUF \ + ARX_2BUF( XA1, XA0, XA3, XB1, XB0, XB3, 7 ); \ + ARX_2BUF( XA2, XA1, XA0, XB2, XB1, XB0, 9 ); \ + ARX_2BUF( XA3, XA2, XA1, XB3, XB2, XB1, 13 ); \ + ARX_2BUF( XA0, XA3, XA2, XB0, XB3, XB2, 18 ); \ + XA1 = ROL_1X32( XA1 ); \ + XB1 = ROL_1X32( XB1 ); \ + XA3 = ROR_1X32( XA3 ); \ + XB3 = ROR_1X32( XB3 ); \ + XA2 = SWAP_64( XA2 ); \ + XB2 = SWAP_64( XB2 ); \ + ARX_2BUF( XA3, XA0, XA1, XB3, XB0, XB1, 7 ); \ + ARX_2BUF( XA2, XA3, XA0, XB2, XB3, XB0, 9 ); \ + ARX_2BUF( XA1, XA2, XA3, XB1, XB2, XB3, 13 ); \ + 
ARX_2BUF( XA0, XA1, XA2, XB0, XB1, XB2, 18 ); + +// Implied args ( XA0, XA1, XA2, XA3, XB0, XB1, XB2, XB3, +// XC0, XC1, XC2, XC3, XD0, XD1, XD2, XD3 ) +#define SALSA_2ROUNDS_SIMD128_4BUF \ + ARX_4BUF( XA1, XA0, XA3, XB1, XB0, XB3, \ + XC1, XC0, XC3, XD1, XD0, XD3, 7 ); \ + ARX_4BUF( XA2, XA1, XA0, XB2, XB1, XB0, \ + XC2, XC1, XC0, XD2, XD1, XD0, 9 ); \ + ARX_4BUF( XA3, XA2, XA1, XB3, XB2, XB1, \ + XC3, XC2, XC1, XD3, XD2, XD1, 13 ); \ + ARX_4BUF( XA0, XA3, XA2, XB0, XB3, XB2, \ + XC0, XC3, XC2, XD0, XD3, XD2, 18 ); \ + XA1 = ROL_1X32( XA1 ); \ + XB1 = ROL_1X32( XB1 ); \ + XC1 = ROL_1X32( XC1 ); \ + XD1 = ROL_1X32( XD1 ); \ + XA3 = ROR_1X32( XA3 ); \ + XB3 = ROR_1X32( XB3 ); \ + XC3 = ROR_1X32( XC3 ); \ + XD3 = ROR_1X32( XD3 ); \ + XA2 = SWAP_64( XA2 ); \ + XB2 = SWAP_64( XB2 ); \ + XC2 = SWAP_64( XC2 ); \ + XD2 = SWAP_64( XD2 ); \ + ARX_4BUF( XA3, XA0, XA1, XB3, XB0, XB1, \ + XC3, XC0, XC1, XD3, XD0, XD1, 7 ); \ + ARX_4BUF( XA2, XA3, XA0, XB2, XB3, XB0, \ + XC2, XC3, XC0, XD2, XD3, XD0, 9 ); \ + ARX_4BUF( XA1, XA2, XA3, XB1, XB2, XB3, \ + XC1, XC2, XC3, XD1, XD2, XD3, 13 ); \ + ARX_4BUF( XA0, XA1, XA2, XB0, XB1, XB2, \ + XC0, XC1, XC2, XD0, XD1, XD2, 18 ); \ + XA3 = ROL_1X32( XA3 ); \ + XB3 = ROL_1X32( XB3 ); \ + XC3 = ROL_1X32( XC3 ); \ + XD3 = ROL_1X32( XD3 ); \ + XA1 = ROR_1X32( XA1 ); \ + XB1 = ROR_1X32( XB1 ); \ + XC1 = ROR_1X32( XC1 ); \ + XD1 = ROR_1X32( XD1 ); \ + XA2 = SWAP_64( XA2 ); \ + XB2 = SWAP_64( XB2 ); \ + XC2 = SWAP_64( XC2 ); \ + XD2 = SWAP_64( XD2 ); + +#define SALSA_2ROUNDS_FINAL_SIMD128_4BUF \ + ARX_4BUF( XA1, XA0, XA3, XB1, XB0, XB3, \ + XC1, XC0, XC3, XD1, XD0, XD3, 7 ); \ + ARX_4BUF( XA2, XA1, XA0, XB2, XB1, XB0, \ + XC2, XC1, XC0, XD2, XD1, XD0, 9 ); \ + ARX_4BUF( XA3, XA2, XA1, XB3, XB2, XB1, \ + XC3, XC2, XC1, XD3, XD2, XD1, 13 ); \ + ARX_4BUF( XA0, XA3, XA2, XB0, XB3, XB2, \ + XC0, XC3, XC2, XD0, XD3, XD2, 18 ); \ + XA1 = ROL_1X32( XA1 ); \ + XB1 = ROL_1X32( XB1 ); \ + XC1 = ROL_1X32( XC1 ); \ + XD1 = ROL_1X32( XD1 ); \ + XA3 = ROR_1X32( XA3 ); \ + XB3 = ROR_1X32( XB3 ); \ + XC3 = ROR_1X32( XC3 ); \ + XD3 = ROR_1X32( XD3 ); \ + XA2 = SWAP_64( XA2 ); \ + XB2 = SWAP_64( XB2 ); \ + XC2 = SWAP_64( XC2 ); \ + XD2 = SWAP_64( XD2 ); \ + ARX_4BUF( XA3, XA0, XA1, XB3, XB0, XB1, \ + XC3, XC0, XC1, XD3, XD0, XD1, 7 ); \ + ARX_4BUF( XA2, XA3, XA0, XB2, XB3, XB0, \ + XC2, XC3, XC0, XD2, XD3, XD0, 9 ); \ + ARX_4BUF( XA1, XA2, XA3, XB1, XB2, XB3, \ + XC1, XC2, XC3, XD1, XD2, XD3, 13 ); \ + ARX_4BUF( XA0, XA1, XA2, XB0, XB1, XB2, \ + XC0, XC1, XC2, XD0, XD1, XD2, 18 ); + +// Inlined ARX +#define SALSA_2ROUNDS_SIMD128_3BUF \ +do{ \ + TYPE TA = ADD32( XA0, XA3 ); \ + TYPE TB = ADD32( XB0, XB3 ); \ + TYPE TC = ADD32( XC0, XC3 ); \ + TA = ROL32( TA, 7 ); \ + TB = ROL32( TB, 7 ); \ + TC = ROL32( TC, 7 ); \ + XA1 = XOR( XA1, TA ); \ + XB1 = XOR( XB1, TB ); \ + XC1 = XOR( XC1, TC ); \ +\ + TA = ADD32( XA1, XA0 ); \ + TB = ADD32( XB1, XB0 ); \ + TC = ADD32( XC1, XC0 ); \ + TA = ROL32( TA, 9 ); \ + TB = ROL32( TB, 9 ); \ + TC = ROL32( TC, 9 ); \ + XA2 = XOR( XA2, TA ); \ + XB2 = XOR( XB2, TB ); \ + XC2 = XOR( XC2, TC ); \ +\ + TA = ADD32( XA2, XA1 ); \ + TB = ADD32( XB2, XB1 ); \ + TC = ADD32( XC2, XC1 ); \ + TA = ROL32( TA, 13 ); \ + XA1 = ROL_1X32( XA1 ); \ + XB1 = ROL_1X32( XB1 ); \ + XC1 = ROL_1X32( XC1 ); \ + XA3 = XOR( XA3, TA ); \ + TB = ROL32( TB, 13 ); \ + XB3 = XOR( XB3, TB ); \ + TC = ROL32( TC, 13 ); \ + XC3 = XOR( XC3, TC ); \ +\ + TA = ADD32( XA3, XA2 ); \ + TB = ADD32( XB3, XB2 ); \ + TC = ADD32( XC3, XC2 ); \ + TA = ROL32( TA, 18 ); \ + XA2 = SWAP_64( XA2 ); \ + XB2 = 
SWAP_64( XB2 ); \ + XC2 = SWAP_64( XC2 ); \ + XA0 = XOR( XA0, TA ); \ + TB = ROL32( TB, 18 ); \ + XB0 = XOR( XB0, TB ); \ + TC = ROL32( TC, 18 ); \ + XC0 = XOR( XC0, TC ); \ +\ + TA = ADD32( XA0, XA1 ); \ + TB = ADD32( XB0, XB1 ); \ + TC = ADD32( XC0, XC1 ); \ + TA = ROL32( TA, 7 ); \ + XA3 = ROR_1X32( XA3 ); \ + XA3 = XOR( XA3, TA ); \ + TB = ROL32( TB, 7 ); \ + XB3 = ROR_1X32( XB3 ); \ + XB3 = XOR( XB3, TB ); \ + TC = ROL32( TC, 7 ); \ + XC3 = ROR_1X32( XC3 ); \ + XC3 = XOR( XC3, TC ); \ +\ + TA = ADD32( XA3, XA0 ); \ + TB = ADD32( XB3, XB0 ); \ + TC = ADD32( XC3, XC0 ); \ + TA = ROL32( TA, 9 ); \ + TB = ROL32( TB, 9 ); \ + TC = ROL32( TC, 9 ); \ + XA2 = XOR( XA2, TA ); \ + XB2 = XOR( XB2, TB ); \ + XC2 = XOR( XC2, TC ); \ +\ + TA = ADD32( XA2, XA3 ); \ + TB = ADD32( XB2, XB3 ); \ + TA = ROL32( TA, 13 ); \ + TC = ADD32( XC2, XC3 ); \ + XA3 = ROL_1X32( XA3 ); \ + TB = ROL32( TB, 13 ); \ + XB3 = ROL_1X32( XB3 ); \ + XA1 = XOR( XA1, TA ); \ + TC = ROL32( TC, 13 ); \ + XC3 = ROL_1X32( XC3 ); \ + XB1 = XOR( XB1, TB ); \ + XC1 = XOR( XC1, TC ); \ +\ + TA = ADD32( XA1, XA2 ); \ + TB = ADD32( XB1, XB2 ); \ + TA = ROL32( TA, 18); \ + TC = ADD32( XC1, XC2 ); \ + XA2 = SWAP_64( XA2 ); \ + TB = ROL32( TB, 18); \ + XA0 = XOR( XA0, TA ); \ + XB2 = SWAP_64( XB2 ); \ + TC = ROL32( TC, 18); \ + XB0 = XOR( XB0, TB ); \ + XC2 = SWAP_64( XC2 ); \ + XA1 = ROR_1X32( XA1 ); \ + XB1 = ROR_1X32( XB1 ); \ + XC0 = XOR( XC0, TC ); \ + XC1 = ROR_1X32( XC1 ); \ +} while (0); + + +// slow rol, an attempt to optimze non-avx512 bit rotations +#define SALSA_2ROUNDS_SIMD128_3BUF_SLOROL \ +do{ \ + TYPE TA = ADD32( XA0, XA3 ); \ + TYPE TB = ADD32( XB0, XB3 ); \ + TYPE TC = ADD32( XC0, XC3 ); \ + TYPE T = _mm_slli_epi32( TA, 7 ); \ + TA = _mm_srli_epi32( TA, 25 ); \ + XA1 = XOR( XA1, T ); \ + T = _mm_slli_epi32( TB, 7 );\ + XA1 = XOR( XA1, TA ); \ + TB = _mm_srli_epi32( TB, 25 ); \ + XB1 = XOR( XB1, T ); \ + T = _mm_slli_epi32( TC, 7 );\ + XB1 = XOR( XB1, TB ); \ + XC1 = XOR( XC1, T ); \ + TC = _mm_srli_epi32( TC, 25 );\ + XC1 = XOR( XC1, TC ); \ +\ + TA = ADD32( XA1, XA0 ); \ + TB = ADD32( XB1, XB0 ); \ + TC = ADD32( XC1, XC0 ); \ + T = _mm_slli_epi32( TA, 9 ); \ + TA = _mm_srli_epi32( TA, 23 ); \ + XA2 = XOR( XA2, T ); \ + T = _mm_slli_epi32( TB, 9 );\ + TB = _mm_srli_epi32( TB, 23 );\ + XA2 = XOR( XA2, TA ); \ + XB2 = XOR( XB2, T ); \ + T = _mm_slli_epi32( TC, 9 );\ + XB2 = XOR( XB2, TB ); \ + XC2 = XOR( XC2, T ); \ + TC = _mm_srli_epi32( TC, 23 );\ + XC2 = XOR( XC2, TC ); \ +\ + TA = ADD32( XA2, XA1 ); \ + TB = ADD32( XB2, XB1 ); \ + TC = ADD32( XC2, XC1 ); \ + T = _mm_slli_epi32( TA, 13); \ + TA = _mm_srli_epi32( TA, 19 ); \ + XA1 = ROL_1X32( XA1 ); \ + XA3 = XOR( XA3, T ); \ + XB1 = ROL_1X32( XB1 ); \ + T = _mm_slli_epi32( TB, 13); \ + TB = _mm_srli_epi32( TB, 19 ); \ + XA3 = XOR( XA3, TA ); \ + XB3 = XOR( XB3, T ); \ + T = _mm_slli_epi32( TC, 13); \ + TC = _mm_srli_epi32( TC, 19 ); \ + XB3 = XOR( XB3, TB ); \ + XC3 = XOR( XC3, T ); \ + XC1 = ROL_1X32( XC1 ); \ + XC3 = XOR( XC3, TC ); \ +\ + TA = ADD32( XA3, XA2 ); \ + TB = ADD32( XB3, XB2 ); \ + TC = ADD32( XC3, XC2 ); \ + T = _mm_slli_epi32( TA, 18 ); \ + TA = _mm_srli_epi32( TA, 14 ); \ + XA2 = SWAP_64( XA2 ); \ + XA0 = XOR( XA0, T ); \ + T = _mm_slli_epi32( TB, 18 ); \ + XB2 = SWAP_64( XB2 ); \ + TB = _mm_srli_epi32( TB, 14 ); \ + XB0 = XOR( XB0, T ); \ + T = _mm_slli_epi32( TC, 18 ); \ + XA0 = XOR( XA0, TA ); \ + TC = _mm_srli_epi32( TC, 14 ); \ + XC0 = XOR( XC0, T ); \ + XB0 = XOR( XB0, TB ); \ + XC2 = SWAP_64( XC2 ); \ + XC0 = XOR( XC0, TC ); \ +\ + TA = ADD32( 
XA0, XA1 ); \ + TB = ADD32( XB0, XB1 ); \ + TC = ADD32( XC0, XC1 ); \ + TA = ROL32( TA, 7 ); \ + XA3 = ROR_1X32( XA3 ); \ + XA3 = XOR( XA3, TA ); \ + TB = ROL32( TB, 7 ); \ + XB3 = ROR_1X32( XB3 ); \ + XB3 = XOR( XB3, TB ); \ + TC = ROL32( TC, 7 ); \ + XC3 = ROR_1X32( XC3 ); \ + XC3 = XOR( XC3, TC ); \ +\ + TA = ADD32( XA3, XA0 ); \ + TB = ADD32( XB3, XB0 ); \ + TC = ADD32( XC3, XC0 ); \ + TA = ROL32( TA, 9 ); \ + TB = ROL32( TB, 9 ); \ + TC = ROL32( TC, 9 ); \ + XA2 = XOR( XA2, TA ); \ + XB2 = XOR( XB2, TB ); \ + XC2 = XOR( XC2, TC ); \ +\ + TA = ADD32( XA2, XA3 ); \ + TB = ADD32( XB2, XB3 ); \ + TA = ROL32( TA, 13 ); \ + TC = ADD32( XC2, XC3 ); \ + XA3 = ROL_1X32( XA3 ); \ + TB = ROL32( TB, 13 ); \ + XB3 = ROL_1X32( XB3 ); \ + XA1 = XOR( XA1, TA ); \ + TC = ROL32( TC, 13 ); \ + XC3 = ROL_1X32( XC3 ); \ + XB1 = XOR( XB1, TB ); \ + XC1 = XOR( XC1, TC ); \ +\ + TA = ADD32( XA1, XA2 ); \ + TB = ADD32( XB1, XB2 ); \ + TA = ROL32( TA, 18); \ + TC = ADD32( XC1, XC2 ); \ + XA2 = SWAP_64( XA2 ); \ + TB = ROL32( TB, 18); \ + XA0 = XOR( XA0, TA ); \ + XB2 = SWAP_64( XB2 ); \ + TC = ROL32( TC, 18); \ + XB0 = XOR( XB0, TB ); \ + XC2 = SWAP_64( XC2 ); \ + XA1 = ROR_1X32( XA1 ); \ + XB1 = ROR_1X32( XB1 ); \ + XC0 = XOR( XC0, TC ); \ + XC1 = ROR_1X32( XC1 ); \ +} while (0); + + +/* +// Standard version using ARX +#define SALSA_2ROUNDS_SIMD128_3BUF \ + ARX_3BUF( XA1, XA0, XA3, XB1, XB0, XB3, \ + XC1, XC0, XC3, 7 ); \ + ARX_3BUF( XA2, XA1, XA0, XB2, XB1, XB0, \ + XC2, XC1, XC0, 9 ); \ + ARX_3BUF( XA3, XA2, XA1, XB3, XB2, XB1, \ + XC3, XC2, XC1, 13 ); \ + ARX_3BUF( XA0, XA3, XA2, XB0, XB3, XB2, \ + XC0, XC3, XC2, 18 ); \ + XA1 = ROL_1X32( XA1 ); \ + XB1 = ROL_1X32( XB1 ); \ + XC1 = ROL_1X32( XC1 ); \ + XA3 = ROR_1X32( XA3 ); \ + XB3 = ROR_1X32( XB3 ); \ + XC3 = ROR_1X32( XC3 ); \ + XA2 = SWAP_64( XA2 ); \ + XB2 = SWAP_64( XB2 ); \ + XC2 = SWAP_64( XC2 ); \ + ARX_3BUF( XA3, XA0, XA1, XB3, XB0, XB1, \ + XC3, XC0, XC1, 7 ); \ + ARX_3BUF( XA2, XA3, XA0, XB2, XB3, XB0, \ + XC2, XC3, XC0, 9 ); \ + ARX_3BUF( XA1, XA2, XA3, XB1, XB2, XB3, \ + XC1, XC2, XC3, 13 ); \ + ARX_3BUF( XA0, XA1, XA2, XB0, XB1, XB2, \ + XC0, XC1, XC2, 18 ); \ + XA3 = ROL_1X32( XA3 ); \ + XB3 = ROL_1X32( XB3 ); \ + XC3 = ROL_1X32( XC3 ); \ + XA1 = ROR_1X32( XA1 ); \ + XB1 = ROR_1X32( XB1 ); \ + XC1 = ROR_1X32( XC1 ); \ + XA2 = SWAP_64( XA2 ); \ + XB2 = SWAP_64( XB2 ); \ + XC2 = SWAP_64( XC2 ); +*/ + +#define SALSA_2ROUNDS_FINAL_SIMD128_3BUF \ + ARX_3BUF( XA1, XA0, XA3, XB1, XB0, XB3, \ + XC1, XC0, XC3, 7 ); \ + ARX_3BUF( XA2, XA1, XA0, XB2, XB1, XB0, \ + XC2, XC1, XC0, 9 ); \ + ARX_3BUF( XA3, XA2, XA1, XB3, XB2, XB1, \ + XC3, XC2, XC1, 13 ); \ + ARX_3BUF( XA0, XA3, XA2, XB0, XB3, XB2, \ + XC0, XC3, XC2, 18 ); \ + XA1 = ROL_1X32( XA1 ); \ + XB1 = ROL_1X32( XB1 ); \ + XC1 = ROL_1X32( XC1 ); \ + XA3 = ROR_1X32( XA3 ); \ + XB3 = ROR_1X32( XB3 ); \ + XC3 = ROR_1X32( XC3 ); \ + XA2 = SWAP_64( XA2 ); \ + XB2 = SWAP_64( XB2 ); \ + XC2 = SWAP_64( XC2 ); \ + ARX_3BUF( XA3, XA0, XA1, XB3, XB0, XB1, \ + XC3, XC0, XC1, 7 ); \ + ARX_3BUF( XA2, XA3, XA0, XB2, XB3, XB0, \ + XC2, XC3, XC0, 9 ); \ + ARX_3BUF( XA1, XA2, XA3, XB1, XB2, XB3, \ + XC1, XC2, XC3, 13 ); \ + ARX_3BUF( XA0, XA1, XA2, XB0, XB1, XB2, \ + XC0, XC1, XC2, 18 ); + + +#define SALSA_8ROUNDS_SIMD128 \ + SALSA_2ROUNDS_SIMD128( X0, X1, X2, X3 ); \ + SALSA_2ROUNDS_SIMD128( X0, X1, X2, X3 ); \ + SALSA_2ROUNDS_SIMD128( X0, X1, X2, X3 ); \ + SALSA_2ROUNDS_SIMD128( X0, X1, X2, X3 ); + +#define SALSA_8ROUNDS_FINAL_SIMD128 \ + SALSA_2ROUNDS_SIMD128( X0, X1, X2, X3 ); \ + SALSA_2ROUNDS_SIMD128( X0, X1, 
X2, X3 ); \ + SALSA_2ROUNDS_SIMD128( X0, X1, X2, X3 ); \ + SALSA_2ROUNDS_FINAL_SIMD128( X0, X1, X2, X3 ); + +// Implied args ( XA0, XA1, XA2, XA3, XB0, XB1, XB2, XB3 ) +#define SALSA_8ROUNDS_SIMD128_2BUF \ + SALSA_2ROUNDS_SIMD128_2BUF; \ + SALSA_2ROUNDS_SIMD128_2BUF; \ + SALSA_2ROUNDS_SIMD128_2BUF; \ + SALSA_2ROUNDS_SIMD128_2BUF; + +#define SALSA_8ROUNDS_FINAL_SIMD128_2BUF \ + SALSA_2ROUNDS_SIMD128_2BUF; \ + SALSA_2ROUNDS_SIMD128_2BUF; \ + SALSA_2ROUNDS_SIMD128_2BUF; \ + SALSA_2ROUNDS_FINAL_SIMD128_2BUF; + +#define SALSA_8ROUNDS_SIMD128_3BUF \ + SALSA_2ROUNDS_SIMD128_3BUF; \ + SALSA_2ROUNDS_SIMD128_3BUF; \ + SALSA_2ROUNDS_SIMD128_3BUF; \ + SALSA_2ROUNDS_SIMD128_3BUF; + +#define SALSA_8ROUNDS_FINAL_SIMD128_3BUF \ + SALSA_2ROUNDS_SIMD128_3BUF; \ + SALSA_2ROUNDS_SIMD128_3BUF; \ + SALSA_2ROUNDS_SIMD128_3BUF; \ + SALSA_2ROUNDS_FINAL_SIMD128_3BUF; + +// Implied args ( XA0, XA1, XA2, XA3, XB0, XB1, XB2, XB3, +// XC0, XC1, XC2, XC3, XD0, XD1, XD2, XD3, ) +#define SALSA_8ROUNDS_SIMD128_4BUF \ + SALSA_2ROUNDS_SIMD128_4BUF; \ + SALSA_2ROUNDS_SIMD128_4BUF; \ + SALSA_2ROUNDS_SIMD128_4BUF; \ + SALSA_2ROUNDS_SIMD128_4BUF; + +#define SALSA_8ROUNDS_FINAL_SIMD128_4BUF \ + SALSA_2ROUNDS_SIMD128_4BUF; \ + SALSA_2ROUNDS_SIMD128_4BUF; \ + SALSA_2ROUNDS_SIMD128_4BUF; \ + SALSA_2ROUNDS_FINAL_SIMD128_4BUF; + +// Used by reference code and pure parallel implementations +// +// Implied arguments ( x0, x1, x2, x3, x4, x5, x6, x7, +// x8, x9, xa, xb, xc, xd, xe, xf ) +// +#define SALSA_COLUMN \ + x4 = ARX( x4, x0, xc, 7 ); \ + x9 = ARX( x9, x5, x1, 7 ); \ + xe = ARX( xe, xa, x6, 7 ); \ + x3 = ARX( x3, xf, xb, 7 ); \ + x8 = ARX( x8, x4, x0, 9 ); \ + xd = ARX( xd, x9, x5, 9 ); \ + x2 = ARX( x2, xe, xa, 9 ); \ + x7 = ARX( x7, x3, xf, 9 ); \ + xc = ARX( xc, x8, x4, 13 ); \ + x1 = ARX( x1, xd, x9, 13 ); \ + x6 = ARX( x6, x2, xe, 13 ); \ + xb = ARX( xb, x7, x3, 13 ); \ + x0 = ARX( x0, xc, x8, 18 ); \ + x5 = ARX( x5, x1, xd, 18 ); \ + xa = ARX( xa, x6, x2, 18 ); \ + xf = ARX( xf, xb, x7, 18 ) + +#define SALSA_ROW \ + x1 = ARX( x1, x0, x3, 7 ); \ + x6 = ARX( x6, x5, x4, 7 ); \ + xb = ARX( xb, xa, x9, 7 ); \ + xc = ARX( xc, xf, xe, 7 ); \ + x2 = ARX( x2, x1, x0, 9 ); \ + x7 = ARX( x7, x6, x5, 9 ); \ + x8 = ARX( x8, xb, xa, 9 ); \ + xd = ARX( xd, xc, xf, 9 ); \ + x3 = ARX( x3, x2, x1, 13 ); \ + x4 = ARX( x4, x7, x6, 13 ); \ + x9 = ARX( x9, x8, xb, 13 ); \ + xe = ARX( xe, xd, xc, 13 ); \ + x0 = ARX( x0, x3, x2, 18 ); \ + x5 = ARX( x5, x4, x7, 18 ); \ + xa = ARX( xa, x9, x8, 18 ); \ + xf = ARX( xf, xe, xd, 18 ); + +#define SALSA_2ROUNDS SALSA_COLUMN; SALSA_ROW; + +#define SALSA_8ROUNDS \ + SALSA_2ROUNDS; SALSA_2ROUNDS; SALSA_2ROUNDS; SALSA_2ROUNDS; + + +#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) + +// Tested OK but very slow +// 16 way parallel, requires 16x32 interleaving +static void xor_salsa8_16way( __m512i * const B, const __m512i * const C) +{ + __m512i x0 = B[ 0] = _mm512_xor_si512( B[ 0], C[ 0] ); + __m512i x1 = B[ 1] = _mm512_xor_si512( B[ 1], C[ 1] ); + __m512i x2 = B[ 2] = _mm512_xor_si512( B[ 2], C[ 2] ); + __m512i x3 = B[ 3] = _mm512_xor_si512( B[ 3], C[ 3] ); + __m512i x4 = B[ 4] = _mm512_xor_si512( B[ 4], C[ 4] ); + __m512i x5 = B[ 5] = _mm512_xor_si512( B[ 5], C[ 5] ); + __m512i x6 = B[ 6] = _mm512_xor_si512( B[ 6], C[ 6] ); + __m512i x7 = B[ 7] = _mm512_xor_si512( B[ 7], C[ 7] ); + __m512i x8 = B[ 8] = _mm512_xor_si512( B[ 8], C[ 8] ); + __m512i x9 = B[ 9] = _mm512_xor_si512( B[ 9], C[ 9] ); + __m512i xa = B[10] = _mm512_xor_si512( B[10], C[10] ); + __m512i 
xb = B[11] = _mm512_xor_si512( B[11], C[11] ); + __m512i xc = B[12] = _mm512_xor_si512( B[12], C[12] ); + __m512i xd = B[13] = _mm512_xor_si512( B[13], C[13] ); + __m512i xe = B[14] = _mm512_xor_si512( B[14], C[14] ); + __m512i xf = B[15] = _mm512_xor_si512( B[15], C[15] ); + + #define ROL32 _mm512_rol_epi32 + #define ADD32 _mm512_add_epi32 + #define XOR _mm512_xor_si512 + + SALSA_8ROUNDS; + + #undef ROL32 + #undef ADD32 + #undef XOR + + B[ 0] = _mm512_add_epi32( B[ 0], x0 ); + B[ 1] = _mm512_add_epi32( B[ 1], x1 ); + B[ 2] = _mm512_add_epi32( B[ 2], x2 ); + B[ 3] = _mm512_add_epi32( B[ 3], x3 ); + B[ 4] = _mm512_add_epi32( B[ 4], x4 ); + B[ 5] = _mm512_add_epi32( B[ 5], x5 ); + B[ 6] = _mm512_add_epi32( B[ 6], x6 ); + B[ 7] = _mm512_add_epi32( B[ 7], x7 ); + B[ 8] = _mm512_add_epi32( B[ 8], x8 ); + B[ 9] = _mm512_add_epi32( B[ 9], x9 ); + B[10] = _mm512_add_epi32( B[10], xa ); + B[11] = _mm512_add_epi32( B[11], xb ); + B[12] = _mm512_add_epi32( B[12], xc ); + B[13] = _mm512_add_epi32( B[13], xd ); + B[14] = _mm512_add_epi32( B[14], xe ); + B[15] = _mm512_add_epi32( B[15], xf ); +} + +void scrypt_core_16way( __m512i *X, __m512i *V, const uint32_t N ) +{ + for ( int i = 0; i < N; i++ ) + { + memcpy( &V[i * 32], X, 128*16 ); + xor_salsa8_16way( &X[ 0], &X[16] ); + xor_salsa8_16way( &X[16], &X[ 0] ); + } + for ( int i = 0; i < N; i++ ) + { + m512_ovly *vptr[16]; // pointer to V offset for each lane + m512_ovly *x16 = (m512_ovly*)(&X[16]); + + // create pointers to V for each lane using data from each lane of X[16] + // as index. + for ( int l = 0; l < 16; l++ ) + { + uint32_t xl = (*x16).u32[l]; + vptr[l] = (m512_ovly*)( &V[ 32 * ( xl & ( N-1 ) ) ] ); + } + + for ( int k = 0; k < 32; k++ ) + { + m512_ovly v; // V value assembled from different indexes + for ( int l = 0; l < 8; l++ ) + v.u32[l] = ( *(vptr[ l ] + k ) ) .u32[l]; + X[ k ] = _mm512_xor_si512( X[ k ], v.m512 ); + } + + xor_salsa8_16way( &X[ 0], &X[16] ); + xor_salsa8_16way( &X[16], &X[ 0] ); + } +} + +// Working, not up to date, needs stream optimization. 
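For orientation, the per-lane logic that scrypt_core_16way above (and the narrower cores that follow) vectorizes is the standard scrypt ROMix loop. The scalar sketch below is illustrative only: scrypt_core_ref is a hypothetical name, xor_salsa8() is the usual scalar Salsa20/8 mixer and is assumed rather than shown, and N is assumed to be a power of 2 (consistent with the & (N-1) masking used throughout).

#include <stdint.h>
#include <string.h>

void xor_salsa8( uint32_t B[16], const uint32_t C[16] );   /* assumed, defined elsewhere */

/* Scalar reference for the loop structure used by the vectorized cores.
   X is 32 words (128 bytes), V is the N*32-word scratchpad. */
static void scrypt_core_ref( uint32_t X[32], uint32_t *V, const uint32_t N )
{
   for ( uint32_t i = 0; i < N; i++ )
   {
      memcpy( &V[ i*32 ], X, 128 );            // sequential write of the scratchpad
      xor_salsa8( &X[ 0], &X[16] );
      xor_salsa8( &X[16], &X[ 0] );
   }
   for ( uint32_t i = 0; i < N; i++ )
   {
      const uint32_t j = X[16] & ( N - 1 );    // data-dependent index, N a power of 2
      for ( int k = 0; k < 32; k++ )
         X[k] ^= V[ j*32 + k ];                // random read back from the scratchpad
      xor_salsa8( &X[ 0], &X[16] );
      xor_salsa8( &X[16], &X[ 0] );
   }
}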
+// 4x32 interleaving +static void salsa8_simd128_4way( __m128i *b, const __m128i *c ) +{ + __m512i X0, X1, X2, X3, Y0, Y1, Y2, Y3; + __m512i *B = (__m512i*)b; + const __m512i *C = (const __m512i*)c; + + // mix C into B then shuffle B into X + B[0] = _mm512_xor_si512( B[0], C[0] ); + B[1] = _mm512_xor_si512( B[1], C[1] ); + B[2] = _mm512_xor_si512( B[2], C[2] ); + B[3] = _mm512_xor_si512( B[3], C[3] ); + + Y0 = _mm512_mask_blend_epi64( 0x03, B[1], B[0] ); + X0 = _mm512_mask_blend_epi64( 0x30, B[3], B[2] ); + X0 = _mm512_mask_blend_epi64( 0x0f, X0, Y0 ); + + Y0 = _mm512_mask_blend_epi64( 0x03, B[2], B[1] ); + X1 = _mm512_mask_blend_epi64( 0x30, B[0], B[3] ); + X1 = _mm512_mask_blend_epi64( 0x0f, X1, Y0 ); + + Y0 = _mm512_mask_blend_epi64( 0x03, B[3], B[2] ); + X2 = _mm512_mask_blend_epi64( 0x30, B[1], B[0] ); + X2 = _mm512_mask_blend_epi64( 0x0f, X2, Y0 ); + + Y0 = _mm512_mask_blend_epi64( 0x03, B[0], B[3] ); + X3 = _mm512_mask_blend_epi64( 0x30, B[2], B[1] ); + X3 = _mm512_mask_blend_epi64( 0x0f, X3, Y0 ); + + // define targets for macros used in round function template + #define ROL_1X32 mm512_shufll_128 + #define ROR_1X32 mm512_shuflr_128 + #define SWAP_64 mm512_swap_256 + #define ROL32 _mm512_rol_epi32 + #define ADD32 _mm512_add_epi32 + #define XOR _mm512_xor_si512 + + SALSA_8ROUNDS_SIMD128; + + #undef ROL_1X32 + #undef ROR_1X32 + #undef SWAP_64 + #undef ROL32 + #undef ADD32 + #undef XOR + + Y0 = _mm512_mask_blend_epi64( 0xc0, X0, X1 ); + Y1 = _mm512_mask_blend_epi64( 0x03, X0, X1 ); + Y2 = _mm512_mask_blend_epi64( 0x0c, X0, X1 ); + Y3 = _mm512_mask_blend_epi64( 0x30, X0, X1 ); + + Y0 = _mm512_mask_blend_epi64( 0x30, Y0, X2 ); + Y1 = _mm512_mask_blend_epi64( 0xc0, Y1, X2 ); + Y2 = _mm512_mask_blend_epi64( 0x03, Y2, X2 ); + Y3 = _mm512_mask_blend_epi64( 0x0c, Y3, X2 ); + + Y0 = _mm512_mask_blend_epi64( 0x0c, Y0, X3 ); + Y1 = _mm512_mask_blend_epi64( 0x30, Y1, X3 ); + Y2 = _mm512_mask_blend_epi64( 0xc0, Y2, X3 ); + Y3 = _mm512_mask_blend_epi64( 0x03, Y3, X3 ); + + B[0] = _mm512_add_epi32( B[0], Y0 ); + B[1] = _mm512_add_epi32( B[1], Y1 ); + B[2] = _mm512_add_epi32( B[2], Y2 ); + B[3] = _mm512_add_epi32( B[3], Y3 ); +} + +// data format for 512 bits: 4 * ( 4 way 32 ) +// { l3d3, l2d3, l1d3, l0d3, l3d2, l2d2, l1d2, l0d2, +// l3d1, l2d1, l1d1, l0d1, l3d0, l2d0, l1d0, l0d0 } + +void scrypt_core_simd128_4way( __m128i *X, __m128i *V, const uint32_t N ) +{ + for ( int i = 0; i < N; i++ ) + { + memcpy( &V[i * 32], X, 4*128 ); + salsa8_simd128_4way( &X[ 0], &X[16] ); + salsa8_simd128_4way( &X[16], &X[ 0] ); + } + + for ( int i = 0; i < N; i++ ) + { + uint32_t x16[4]; // index into V for each lane + memcpy( x16, &X[16], 16 ); + x16[0] = 32 * ( x16[0] & ( N-1) ); + x16[1] = 32 * ( x16[1] & ( N-1) ); + x16[2] = 32 * ( x16[2] & ( N-1) ); + x16[3] = 32 * ( x16[3] & ( N-1) ); + m128_ovly *v = (m128_ovly*)V; + + for( int k = 0; k < 32; k++ ) + { + X[k] = _mm_xor_si128( X[k], _mm_set_epi32( v[ x16[3] + k ].u32[3], + v[ x16[2] + k ].u32[2], + v[ x16[1] + k ].u32[1], + v[ x16[0] + k ].u32[0] ) ); + } + + salsa8_simd128_4way( &X[ 0], &X[16] ); + salsa8_simd128_4way( &X[16], &X[ 0] ); + } +} + +// not working, occasional accepted shares, not up to date. 
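The "4x32 interleaving" and the "4 * ( 4 way 32 )" data format described above place word d of lane l in 32-bit element 4*d + l, so each vector row holds the same word from all four lanes. A small interleave routine under that assumption is sketched below; the name is hypothetical and the tree's real interleave helpers are defined elsewhere.

#include <stdint.h>

/* Sketch of 4x32 interleaving: the d'th word of lane l lands in
   32-bit slot l of row d. Illustrative only. */
static void intrlv_4x32_sketch( uint32_t *dst,
                                const uint32_t *l0, const uint32_t *l1,
                                const uint32_t *l2, const uint32_t *l3,
                                const int nwords )
{
   for ( int d = 0; d < nwords; d++ )
   {
      dst[ 4*d + 0 ] = l0[d];
      dst[ 4*d + 1 ] = l1[d];
      dst[ 4*d + 2 ] = l2[d];
      dst[ 4*d + 3 ] = l3[d];
   }
}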
+// 4x128 interleaving +static void salsa8_4way_simd128( __m512i * const B, const __m512i * const C) +{ + __m512i X0, X1, X2, X3; + uint32_t *b = (uint32_t*)B; + m512_ovly y[4], z[4]; + + // mix C into B then shuffle B into X + B[0] = _mm512_xor_si512( B[0], C[0] ); + B[1] = _mm512_xor_si512( B[1], C[1] ); + B[2] = _mm512_xor_si512( B[2], C[2] ); + B[3] = _mm512_xor_si512( B[3], C[3] ); + + // { l3u15, l3u10, l3u5, l3u0, l2u15, l2u10, l2u5, l2u0, + // l1u15, l1u10, l1u5, l1u0, l0u15, l0u10, l0u5, l0u0 } + + // b index = row index + lane index + unit index + // = ( 8 * (u/4) ) + ( 4*l ) + ( u%4 ) + + X0 = _mm512_set_epi32( b[63], b[46], b[29], b[12], // lane 3[3:0] + b[59], b[42], b[25], b[ 8], // lane 2[3:0] + b[55], b[38], b[21], b[ 4], // lane 1[3:0] + b[51], b[34], b[17], b[ 0] ); // lane 0[3:0] + + X1 = _mm512_set_epi32( b[15], b[62], b[45], b[28], + b[11], b[58], b[41], b[24], + b[ 7], b[54], b[37], b[20], + b[ 3], b[50], b[33], b[16] ); // lane 0[7:4] + + X2 = _mm512_set_epi32( b[31], b[14], b[61], b[44], + b[27], b[10], b[57], b[40], + b[23], b[ 6], b[53], b[36], + b[19], b[ 2], b[49], b[32] ); + + X3 = _mm512_set_epi32( b[47], b[30], b[13], b[60], + b[43], b[26], b[ 9], b[56], + b[39], b[22], b[ 5], b[52], + b[35], b[18], b[ 1], b[48] ); + + + + // define targets for macros used in round function template + #define ROL_1X32 mm512_shufll128_32 // shuffle within 128 bit lanes + #define ROR_1X32 mm512_shuflr128_32 + #define SWAP_64 mm512_swap128_64 + #define ROL32 _mm512_rol_epi32 + #define ADD32 _mm512_add_epi32 + #define XOR _mm512_xor_si512 + + SALSA_8ROUNDS_FINAL_SIMD128; + + #undef ROL_1X32 + #undef ROR_1X32 + #undef SWAP_64 + #undef ROL32 + #undef ADD32 + #undef XOR + + y[0].m512 = X0; + y[1].m512 = X1; + y[2].m512 = X2; + y[3].m512 = X3; + + // lane 0 + z[0].u32[ 0 ] = y[0].u32[ 0]; + z[0].u32[ 3 ] = y[1].u32[ 0]; + z[0].u32[ 2 ] = y[2].u32[ 0]; + z[0].u32[ 1 ] = y[3].u32[ 0]; + + // lane 1 + z[0].u32[ 0+ 4 ] = y[0].u32[ 4]; + z[0].u32[ 3+ 4 ] = y[1].u32[ 4]; + z[0].u32[ 2+ 4 ] = y[2].u32[ 4]; + z[0].u32[ 1+ 4 ] = y[3].u32[ 4]; + + // lane 2 + z[0].u32[ 0+ 8 ] = y[0].u32[ 8]; + z[0].u32[ 3+ 8 ] = y[1].u32[ 8]; + z[0].u32[ 2+ 8 ] = y[2].u32[ 8]; + z[0].u32[ 1+ 8 ] = y[3].u32[ 8]; + + // lane 3 + z[0].u32[ 0+12 ] = y[0].u32[12]; + z[0].u32[ 3+12 ] = y[1].u32[12]; + z[0].u32[ 2+12 ] = y[2].u32[12]; + z[0].u32[ 1+12 ] = y[3].u32[12]; + + // lane 0 + z[1].u32[ 1 ] = y[0].u32[ 1]; + z[1].u32[ 0 ] = y[1].u32[ 1]; + z[1].u32[ 3 ] = y[2].u32[ 1]; + z[1].u32[ 2 ] = y[3].u32[ 1]; + + //lane 1 + z[1].u32[ 1+ 4 ] = y[0].u32[ 5]; + z[1].u32[ 0+ 4 ] = y[1].u32[ 5]; + z[1].u32[ 3+ 4 ] = y[2].u32[ 5]; + z[1].u32[ 2+ 4 ] = y[3].u32[ 5]; + + // lane 2 + z[1].u32[ 1+ 8 ] = y[0].u32[ 9]; + z[1].u32[ 0+ 8 ] = y[1].u32[ 9]; + z[1].u32[ 3+ 8 ] = y[2].u32[ 9]; + z[1].u32[ 2+ 8 ] = y[3].u32[ 9]; + + // lane 3 + z[1].u32[ 1+12 ] = y[0].u32[13]; + z[1].u32[ 0+12 ] = y[1].u32[13]; + z[1].u32[ 3+12 ] = y[2].u32[13]; + z[1].u32[ 2+12 ] = y[3].u32[13]; + + // lane 0 + z[2].u32[ 2 ] = y[0].u32[2]; + z[2].u32[ 1 ] = y[1].u32[2]; + z[2].u32[ 0 ] = y[2].u32[2]; + z[2].u32[ 3 ] = y[3].u32[2]; + + // lane 1 + z[2].u32[ 2+ 4 ] = y[0].u32[6]; + z[2].u32[ 1+ 4 ] = y[1].u32[6]; + z[2].u32[ 0+ 4 ] = y[2].u32[6]; + z[2].u32[ 3+ 4 ] = y[3].u32[6]; + + // lane 2 + z[2].u32[ 2+ 8 ] = y[0].u32[10]; + z[2].u32[ 1+ 8 ] = y[1].u32[10]; + z[2].u32[ 0+ 8 ] = y[2].u32[10]; + z[2].u32[ 3+ 8 ] = y[3].u32[10]; + + // lane 3 + z[2].u32[ 2+12 ] = y[0].u32[14]; + z[2].u32[ 1+12 ] = y[1].u32[14]; + z[2].u32[ 0+12 ] = y[2].u32[14]; + 
z[2].u32[ 3+12 ] = y[3].u32[14]; + + // lane 0 + z[3].u32[ 3 ] = y[0].u32[ 3]; + z[3].u32[ 2 ] = y[1].u32[ 3]; + z[3].u32[ 1 ] = y[2].u32[ 3]; + z[3].u32[ 0 ] = y[3].u32[ 3]; + + // lane 1 + z[3].u32[ 3+ 4 ] = y[0].u32[ 7]; + z[3].u32[ 2+ 4 ] = y[1].u32[ 7]; + z[3].u32[ 1+ 4 ] = y[2].u32[ 7]; + z[3].u32[ 0+ 4 ] = y[3].u32[ 7]; + + // lane 2 + z[3].u32[ 3+ 8 ] = y[0].u32[11]; + z[3].u32[ 2+ 8 ] = y[1].u32[11]; + z[3].u32[ 1+ 8 ] = y[2].u32[11]; + z[3].u32[ 0+ 8 ] = y[3].u32[11]; + + // lane 1 + z[3].u32[ 3+12 ] = y[0].u32[15]; + z[3].u32[ 2+12 ] = y[1].u32[15]; + z[3].u32[ 1+12 ] = y[2].u32[15]; + z[3].u32[ 0+12 ] = y[3].u32[15]; + + B[0] = _mm512_add_epi32( B[0], z[0].m512 ); + B[1] = _mm512_add_epi32( B[1], z[1].m512 ); + B[2] = _mm512_add_epi32( B[2], z[2].m512 ); + B[3] = _mm512_add_epi32( B[3], z[3].m512 ); +} + +void scrypt_core_4way_simd128( __m512i *X, __m512i *V, const uint32_t N ) +{ + for ( int i = 0; i < N; i++ ) + { + memcpy( &V[i * 8], X, 128*4 ); + salsa8_4way_simd128( &X[0], &X[4] ); + salsa8_4way_simd128( &X[4], &X[0] ); + } + + for ( int i = 0; i < N; i++ ) + { + m512_ovly x16; + x16 = ( (m512_ovly*)X )[4]; + uint32_t j0 = 8 * ( x16.u32[ 0] & ( N-1 ) ); + uint32_t j1 = 8 * ( x16.u32[ 4] & ( N-1 ) ); + uint32_t j2 = 8 * ( x16.u32[ 8] & ( N-1 ) ); + uint32_t j3 = 8 * ( x16.u32[12] & ( N-1 ) ); + + for ( int k = 0; k < 8; k++ ) + X[k] = _mm512_xor_si512( X[k], m512_const_128( + ( (m512_ovly*)V )[ j3+k ].m128[3], + ( (m512_ovly*)V )[ j2+k ].m128[2], + ( (m512_ovly*)V )[ j1+k ].m128[1], + ( (m512_ovly*)V )[ j0+k ].m128[0] ) ); + +/* + for ( int k = 0; k < 8; k++ ) + X[k] = _mm512_xor_si512( X[k], m512_diagonal128_32( + V[ j3+k ], V[ j2+k ], V[ j1+k ], V[ j0+k ] ) ); +*/ + salsa8_4way_simd128( &X[0], &X[4] ); + salsa8_4way_simd128( &X[4], &X[0] ); + } +} + + + +#endif // AVX512 + +#if defined(__AVX2__) + +// 8x memory usage +// Tested OK but slow scrypt, very slow scryptn2, 2x4way is faster +// Crashes with large N & many threads, OOM? 
Use only for scrypt +// 8x32 interleaving +static void salsa8_8way( __m256i * const B, const __m256i * const C ) +{ + __m256i x0 = B[ 0] = _mm256_xor_si256( B[ 0], C[ 0] ); + __m256i x1 = B[ 1] = _mm256_xor_si256( B[ 1], C[ 1] ); + __m256i x2 = B[ 2] = _mm256_xor_si256( B[ 2], C[ 2] ); + __m256i x3 = B[ 3] = _mm256_xor_si256( B[ 3], C[ 3] ); + __m256i x4 = B[ 4] = _mm256_xor_si256( B[ 4], C[ 4] ); + __m256i x5 = B[ 5] = _mm256_xor_si256( B[ 5], C[ 5] ); + __m256i x6 = B[ 6] = _mm256_xor_si256( B[ 6], C[ 6] ); + __m256i x7 = B[ 7] = _mm256_xor_si256( B[ 7], C[ 7] ); + __m256i x8 = B[ 8] = _mm256_xor_si256( B[ 8], C[ 8] ); + __m256i x9 = B[ 9] = _mm256_xor_si256( B[ 9], C[ 9] ); + __m256i xa = B[10] = _mm256_xor_si256( B[10], C[10] ); + __m256i xb = B[11] = _mm256_xor_si256( B[11], C[11] ); + __m256i xc = B[12] = _mm256_xor_si256( B[12], C[12] ); + __m256i xd = B[13] = _mm256_xor_si256( B[13], C[13] ); + __m256i xe = B[14] = _mm256_xor_si256( B[14], C[14] ); + __m256i xf = B[15] = _mm256_xor_si256( B[15], C[15] ); + + #define ROL32 mm256_rol_32 + #define ADD32 _mm256_add_epi32 + #define XOR _mm256_xor_si256 + + SALSA_8ROUNDS; + + #undef ROL32 + #undef ADD32 + #undef XOR + + B[ 0] = _mm256_add_epi32( B[ 0], x0 ); + B[ 1] = _mm256_add_epi32( B[ 1], x1 ); + B[ 2] = _mm256_add_epi32( B[ 2], x2 ); + B[ 3] = _mm256_add_epi32( B[ 3], x3 ); + B[ 4] = _mm256_add_epi32( B[ 4], x4 ); + B[ 5] = _mm256_add_epi32( B[ 5], x5 ); + B[ 6] = _mm256_add_epi32( B[ 6], x6 ); + B[ 7] = _mm256_add_epi32( B[ 7], x7 ); + B[ 8] = _mm256_add_epi32( B[ 8], x8 ); + B[ 9] = _mm256_add_epi32( B[ 9], x9 ); + B[10] = _mm256_add_epi32( B[10], xa ); + B[11] = _mm256_add_epi32( B[11], xb ); + B[12] = _mm256_add_epi32( B[12], xc ); + B[13] = _mm256_add_epi32( B[13], xd ); + B[14] = _mm256_add_epi32( B[14], xe ); + B[15] = _mm256_add_epi32( B[15], xf ); +} + +void scrypt_core_8way( __m256i *X, __m256i *V, const uint32_t N ) +{ + for ( int i = 0; i < N; i++ ) + { + memcpy( &V[i * 32], X, 128*8 ); + salsa8_8way( &X[ 0], &X[16] ); + salsa8_8way( &X[16], &X[ 0] ); + } + + for ( int i = 0; i < N; i++ ) + { + m256_ovly *vptr[8]; // pointer to V offset for each lane + m256_ovly *x16 = (m256_ovly*)(&X[16]); + + // create pointers to V for each lane using data from each lane of X[16] + // as index. + for ( int l = 0; l < 8; l++ ) + { + uint32_t xl = (*x16).u32[l]; + vptr[l] = (m256_ovly*)( &V[ 32 * ( xl & ( N-1 ) ) ] ); + } + + for ( int k = 0; k < 32; k++ ) + { + m256_ovly v; // V value assembled from different indexes + for ( int l = 0; l < 8; l++ ) + v.u32[l] = ( *(vptr[ l ] + k ) ) .u32[l]; + X[ k ] = _mm256_xor_si256( X[ k ], v.m256 ); + } + + salsa8_8way( &X[ 0], &X[16] ); + salsa8_8way( &X[16], &X[ 0] ); + } +} + +// 2x memory usage +// Working, not up to date, needs stream optimization. 
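scrypt_core_8way above assembles and inspects individual 32-bit lanes of __m256i values through the m256_ovly overlay type, which is defined elsewhere in the tree. A minimal equivalent is sketched here purely for readability; it is an assumption, not the project's actual definition.

#include <immintrin.h>
#include <stdint.h>

/* Minimal overlay union in the spirit of m256_ovly (sketch only):
   lets vector code read or build per-lane 32-bit words in place. */
typedef union
{
   __m256i  m256;
   __m128i  m128[2];
   uint32_t u32[8];
} ovly256_sketch;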
+// Essentially Pooler 6way +// 2x128 interleaved simd128 +// ------- lane 1 ------- ------- lane 0 ------- +// { l1x3, l1x2, l1x1, l1x0, l0x3, l0x2, l0x1, l0x0 } b[3] B[ 7: 0] +// { l1x7, l1x6, l1x5, l1x4, l0x7, l0x6, l0x5, l0x4 } b[2] B[15: 8] +// { l1xb, l1xa, l1c9, l1x8, l0xb, l0xa, l0x9, l0x8 } b[1] B[23:16] +// { l1xf, l1xe, l1xd, l1xc, l0xf, l0xe, l0xd, l0xc } b[0] B[31:24] + +static void salsa8_2way_simd128( __m256i * const B, const __m256i * const C) +{ + __m256i X0, X1, X2, X3, Y0, Y1, Y2, Y3; + + // mix C into B then shuffle B into X + B[0] = _mm256_xor_si256( B[0], C[0] ); + B[1] = _mm256_xor_si256( B[1], C[1] ); + B[2] = _mm256_xor_si256( B[2], C[2] ); + B[3] = _mm256_xor_si256( B[3], C[3] ); + + Y0 = _mm256_blend_epi32( B[1], B[0], 0x11 ); + X0 = _mm256_blend_epi32( B[3], B[2], 0x44 ); + X0 = _mm256_blend_epi32( X0, Y0, 0x33); + + Y1 = _mm256_blend_epi32( B[2], B[1], 0x11 ); + X1 = _mm256_blend_epi32( B[0], B[3], 0x44 ); + X1 = _mm256_blend_epi32( X1, Y1, 0x33 ); + + Y2 = _mm256_blend_epi32( B[3], B[2], 0x11 ); + X2 = _mm256_blend_epi32( B[1], B[0], 0x44 ); + X2 = _mm256_blend_epi32( X2, Y2, 0x33 ); + + Y3 = _mm256_blend_epi32( B[0], B[3], 0x11 ); + X3 = _mm256_blend_epi32( B[2], B[1], 0x44 ); + X3 = _mm256_blend_epi32( X3, Y3, 0x33 ); + + // define targets for macros used in round function template + #define ROL_1X32 mm256_shufll128_32 // shuffle within 128 bit lanes + #define ROR_1X32 mm256_shuflr128_32 + #define SWAP_64 mm256_swap128_64 + #define ROL32 mm256_rol_32 + #define ADD32 _mm256_add_epi32 + #define XOR _mm256_xor_si256 + + SALSA_8ROUNDS_SIMD128; + + #undef ROL_1X32 + #undef ROR_1X32 + #undef SWAP_64 + #undef ROL32 + #undef ADD32 + #undef XOR + + // init with X0 then blend in the other elements + + Y0 = _mm256_blend_epi32( X0, X1, 0x88 ); + Y1 = _mm256_blend_epi32( X0, X1, 0x11 ); + Y2 = _mm256_blend_epi32( X0, X1, 0x22 ); + Y3 = _mm256_blend_epi32( X0, X1, 0x44 ); + + Y0 = _mm256_blend_epi32( Y0, X2, 0x44 ); + Y1 = _mm256_blend_epi32( Y1, X2, 0x88 ); + Y2 = _mm256_blend_epi32( Y2, X2, 0x11 ); + Y3 = _mm256_blend_epi32( Y3, X2, 0x22 ); + + Y0 = _mm256_blend_epi32( Y0, X3, 0x22 ); + Y1 = _mm256_blend_epi32( Y1, X3, 0x44 ); + Y2 = _mm256_blend_epi32( Y2, X3, 0x88 ); + Y3 = _mm256_blend_epi32( Y3, X3, 0x11 ); + + B[0] = _mm256_add_epi32( B[0], Y0 ); + B[1] = _mm256_add_epi32( B[1], Y1 ); + B[2] = _mm256_add_epi32( B[2], Y2 ); + B[3] = _mm256_add_epi32( B[3], Y3 ); +} + +void scrypt_core_2way_simd128( __m256i *X, __m256i *V, const uint32_t N ) +{ + for ( int i = 0; i < N; i++ ) + { + memcpy( &V[i * 8], X, 128*2 ); + salsa8_2way_simd128( &X[0], &X[4] ); + salsa8_2way_simd128( &X[4], &X[0] ); + } + + for ( int i = 0; i < N; i++ ) + { + m256_ovly x16; + x16 = ( (m256_ovly*)X )[4]; + uint32_t j0 = 8 * ( x16.u32[0] & ( N-1 ) ); + uint32_t j1 = 8 * ( x16.u32[4] & ( N-1 ) ); + + for ( int k = 0; k < 8; k++ ) + X[k] = _mm256_xor_si256( X[k], _mm256_blend_epi32( V[ j1+k ], + V[ j0+k ], 0x0f ) ); + + salsa8_2way_simd128( &X[0], &X[4] ); + salsa8_2way_simd128( &X[4], &X[0] ); + } +} + +// Working +// 2x128 interleaving +static void salsa8_2way_simd128_2buf( __m256i * const BA, __m256i * const BB, + const __m256i * const CA, const __m256i * const CB ) +{ + __m256i XA0, XA1, XA2, XA3, XB0, XB1, XB2, XB3; + __m256i YA0, YA1, YA2, YA3, YB0, YB1, YB2, YB3; + + // mix C into B then shuffle B into X + BA[0] = _mm256_xor_si256( BA[0], CA[0] ); + BB[0] = _mm256_xor_si256( BB[0], CB[0] ); + BA[1] = _mm256_xor_si256( BA[1], CA[1] ); + BB[1] = _mm256_xor_si256( BB[1], CB[1] ); + BA[2] = 
_mm256_xor_si256( BA[2], CA[2] ); + BB[2] = _mm256_xor_si256( BB[2], CB[2] ); + BA[3] = _mm256_xor_si256( BA[3], CA[3] ); + BB[3] = _mm256_xor_si256( BB[3], CB[3] ); + + YA0 = _mm256_blend_epi32( BA[1], BA[0], 0x11 ); + YB0 = _mm256_blend_epi32( BB[1], BB[0], 0x11 ); + XA0 = _mm256_blend_epi32( BA[3], BA[2], 0x44 ); + XB0 = _mm256_blend_epi32( BB[3], BB[2], 0x44 ); + XA0 = _mm256_blend_epi32( XA0, YA0, 0x33); + XB0 = _mm256_blend_epi32( XB0, YB0, 0x33); + + YA0 = _mm256_blend_epi32( BA[2], BA[1], 0x11 ); + YB0 = _mm256_blend_epi32( BB[2], BB[1], 0x11 ); + XA1 = _mm256_blend_epi32( BA[0], BA[3], 0x44 ); + XB1 = _mm256_blend_epi32( BB[0], BB[3], 0x44 ); + XA1 = _mm256_blend_epi32( XA1, YA0, 0x33 ); + XB1 = _mm256_blend_epi32( XB1, YB0, 0x33 ); + + YA0 = _mm256_blend_epi32( BA[3], BA[2], 0x11 ); + YB0 = _mm256_blend_epi32( BB[3], BB[2], 0x11 ); + XA2 = _mm256_blend_epi32( BA[1], BA[0], 0x44 ); + XB2 = _mm256_blend_epi32( BB[1], BB[0], 0x44 ); + XA2 = _mm256_blend_epi32( XA2, YA0, 0x33 ); + XB2 = _mm256_blend_epi32( XB2, YB0, 0x33 ); + + YA0 = _mm256_blend_epi32( BA[0], BA[3], 0x11 ); + YB0 = _mm256_blend_epi32( BB[0], BB[3], 0x11 ); + XA3 = _mm256_blend_epi32( BA[2], BA[1], 0x44 ); + XB3 = _mm256_blend_epi32( BB[2], BB[1], 0x44 ); + XA3 = _mm256_blend_epi32( XA3, YA0, 0x33 ); + XB3 = _mm256_blend_epi32( XB3, YB0, 0x33 ); + + // define targets for macros used in round function template + #define ROL_1X32 mm256_shufll128_32 // shuffle within 128 bit lanes + #define ROR_1X32 mm256_shuflr128_32 + #define SWAP_64 mm256_swap128_64 + #define ROL32 mm256_rol_32 + #define ADD32 _mm256_add_epi32 + #define XOR _mm256_xor_si256 + #define TYPE __m256i + + SALSA_8ROUNDS_SIMD128_2BUF; + + #undef ROL_1X32 + #undef ROR_1X32 + #undef SWAP_64 + #undef ROL32 + #undef ADD32 + #undef XOR + #undef TYPE + + YA0 = _mm256_blend_epi32( XA0, XA1, 0x88 ); + YB0 = _mm256_blend_epi32( XB0, XB1, 0x88 ); + YA1 = _mm256_blend_epi32( XA0, XA1, 0x11 ); + YB1 = _mm256_blend_epi32( XB0, XB1, 0x11 ); + YA2 = _mm256_blend_epi32( XA0, XA1, 0x22 ); + YB2 = _mm256_blend_epi32( XB0, XB1, 0x22 ); + YA3 = _mm256_blend_epi32( XA0, XA1, 0x44 ); + YB3 = _mm256_blend_epi32( XB0, XB1, 0x44 ); + + YA0 = _mm256_blend_epi32( YA0, XA2, 0x44 ); + YB0 = _mm256_blend_epi32( YB0, XB2, 0x44 ); + YA1 = _mm256_blend_epi32( YA1, XA2, 0x88 ); + YB1 = _mm256_blend_epi32( YB1, XB2, 0x88 ); + YA2 = _mm256_blend_epi32( YA2, XA2, 0x11 ); + YB2 = _mm256_blend_epi32( YB2, XB2, 0x11 ); + YA3 = _mm256_blend_epi32( YA3, XA2, 0x22 ); + YB3 = _mm256_blend_epi32( YB3, XB2, 0x22 ); + + YA0 = _mm256_blend_epi32( YA0, XA3, 0x22 ); + YB0 = _mm256_blend_epi32( YB0, XB3, 0x22 ); + YA1 = _mm256_blend_epi32( YA1, XA3, 0x44 ); + YB1 = _mm256_blend_epi32( YB1, XB3, 0x44 ); + YA2 = _mm256_blend_epi32( YA2, XA3, 0x88 ); + YB2 = _mm256_blend_epi32( YB2, XB3, 0x88 ); + YA3 = _mm256_blend_epi32( YA3, XA3, 0x11 ); + YB3 = _mm256_blend_epi32( YB3, XB3, 0x11 ); + + BA[0] = _mm256_add_epi32( BA[0], YA0 ); + BB[0] = _mm256_add_epi32( BB[0], YB0 ); + BA[1] = _mm256_add_epi32( BA[1], YA1 ); + BB[1] = _mm256_add_epi32( BB[1], YB1 ); + BA[2] = _mm256_add_epi32( BA[2], YA2 ); + BB[2] = _mm256_add_epi32( BB[2], YB2 ); + BA[3] = _mm256_add_epi32( BA[3], YA3 ); + BB[3] = _mm256_add_epi32( BB[3], YB3 ); + +} + +void scrypt_core_2way_simd128_2buf( __m256i *X, __m256i *V, const uint32_t N ) +{ + __m256i *X0 = X; + __m256i *X1 = X + 8; + __m256i *V0 = V; + __m256i *V1 = V + 8*N; + + for ( int i = 0; i < N; i++ ) + { + for ( int k = 0; k < 8; k++ ) + { + _mm256_stream_si256( V0 + i*8 + k, X0[k] ); + 
_mm256_stream_si256( V1 + i*8 + k, X1[k] ); + } + salsa8_2way_simd128_2buf( &X0[0], &X1[0], &X0[4], &X1[4] ); + salsa8_2way_simd128_2buf( &X0[4], &X1[4], &X0[0], &X1[0] ); + } + for ( int i = 0; i < N; i++ ) + { + const m256_ovly x16a = ( (m256_ovly*)X0 )[4]; + const m256_ovly x16b = ( (m256_ovly*)X1 )[4]; + + const uint32_t j0a = 8 * ( x16a.u32[0] & ( N-1 ) ); + const uint32_t j0b = 8 * ( x16b.u32[0] & ( N-1 ) ); + const uint32_t j1a = 8 * ( x16a.u32[4] & ( N-1 ) ); + const uint32_t j1b = 8 * ( x16b.u32[4] & ( N-1 ) ); + + for ( int k = 0; k < 8; k++ ) + { + const __m256i V0j0a = _mm256_stream_load_si256( V0 + j0a + k ); + const __m256i V0j1a = _mm256_stream_load_si256( V0 + j1a + k ); + const __m256i V1j0b = _mm256_stream_load_si256( V1 + j0b + k ); + const __m256i V1j1b = _mm256_stream_load_si256( V1 + j1b + k ); + X0[k] = _mm256_xor_si256( X0[k], + _mm256_blend_epi32( V0j1a, V0j0a, 0x0f ) ); + X1[k] = _mm256_xor_si256( X1[k], + _mm256_blend_epi32( V1j1b, V1j0b, 0x0f ) ); + + +/* + X0[k] = _mm256_xor_si256( X0[k], + _mm256_blend_epi32( V0[ j1a+k ], V0[ j0a+k ], 0x0f ) ); + X1[k] = _mm256_xor_si256( X1[k], + _mm256_blend_epi32( V1[ j1b+k ], V1[ j0b+k ], 0x0f ) ); +*/ + + } + + salsa8_2way_simd128_2buf( &X0[0], &X1[0], &X0[4], &X1[4] ); + salsa8_2way_simd128_2buf( &X0[4], &X1[4], &X0[0], &X1[0] ); + } +} + +// Triple buffered, not up to date, needs stream optimization +// 2x128 interleaving +static void salsa8_2way_simd128_3buf( __m256i * const BA, __m256i * const BB, + __m256i * const BC, const __m256i * const CA, const __m256i * const CB, + const __m256i * const CC ) +{ + __m256i XA0, XA1, XA2, XA3, XB0, XB1, XB2, XB3, XC0, XC1, XC2, XC3; + __m256i YA0, YA1, YA2, YA3, YB0, YB1, YB2, YB3, YC0, YC1, YC2, YC3; + + // mix C into B then shuffle B into X + BA[0] = _mm256_xor_si256( BA[0], CA[0] ); + BB[0] = _mm256_xor_si256( BB[0], CB[0] ); + BC[0] = _mm256_xor_si256( BC[0], CC[0] ); + BA[1] = _mm256_xor_si256( BA[1], CA[1] ); + BB[1] = _mm256_xor_si256( BB[1], CB[1] ); + BC[1] = _mm256_xor_si256( BC[1], CC[1] ); + BA[2] = _mm256_xor_si256( BA[2], CA[2] ); + BB[2] = _mm256_xor_si256( BB[2], CB[2] ); + BC[2] = _mm256_xor_si256( BC[2], CC[2] ); + BA[3] = _mm256_xor_si256( BA[3], CA[3] ); + BB[3] = _mm256_xor_si256( BB[3], CB[3] ); + BC[3] = _mm256_xor_si256( BC[3], CC[3] ); + + YA0 = _mm256_blend_epi32( BA[1], BA[0], 0x11 ); + YB0 = _mm256_blend_epi32( BB[1], BB[0], 0x11 ); + YC0 = _mm256_blend_epi32( BC[1], BC[0], 0x11 ); + XA0 = _mm256_blend_epi32( BA[3], BA[2], 0x44 ); + XB0 = _mm256_blend_epi32( BB[3], BB[2], 0x44 ); + XC0 = _mm256_blend_epi32( BC[3], BC[2], 0x44 ); + XA0 = _mm256_blend_epi32( XA0, YA0, 0x33); + XB0 = _mm256_blend_epi32( XB0, YB0, 0x33); + XC0 = _mm256_blend_epi32( XC0, YC0, 0x33); + + YA0 = _mm256_blend_epi32( BA[2], BA[1], 0x11 ); + YB0 = _mm256_blend_epi32( BB[2], BB[1], 0x11 ); + YC0 = _mm256_blend_epi32( BC[2], BC[1], 0x11 ); + XA1 = _mm256_blend_epi32( BA[0], BA[3], 0x44 ); + XB1 = _mm256_blend_epi32( BB[0], BB[3], 0x44 ); + XC1 = _mm256_blend_epi32( BC[0], BC[3], 0x44 ); + XA1 = _mm256_blend_epi32( XA1, YA0, 0x33 ); + XB1 = _mm256_blend_epi32( XB1, YB0, 0x33 ); + XC1 = _mm256_blend_epi32( XC1, YC0, 0x33 ); + + YA0 = _mm256_blend_epi32( BA[3], BA[2], 0x11 ); + YB0 = _mm256_blend_epi32( BB[3], BB[2], 0x11 ); + YC0 = _mm256_blend_epi32( BC[3], BC[2], 0x11 ); + XA2 = _mm256_blend_epi32( BA[1], BA[0], 0x44 ); + XB2 = _mm256_blend_epi32( BB[1], BB[0], 0x44 ); + XC2 = _mm256_blend_epi32( BC[1], BC[0], 0x44 ); + XA2 = _mm256_blend_epi32( XA2, YA0, 0x33 ); + XB2 = 
_mm256_blend_epi32( XB2, YB0, 0x33 ); + XC2 = _mm256_blend_epi32( XC2, YC0, 0x33 ); + + YA0 = _mm256_blend_epi32( BA[0], BA[3], 0x11 ); + YB0 = _mm256_blend_epi32( BB[0], BB[3], 0x11 ); + YC0 = _mm256_blend_epi32( BC[0], BC[3], 0x11 ); + XA3 = _mm256_blend_epi32( BA[2], BA[1], 0x44 ); + XB3 = _mm256_blend_epi32( BB[2], BB[1], 0x44 ); + XC3 = _mm256_blend_epi32( BC[2], BC[1], 0x44 ); + XA3 = _mm256_blend_epi32( XA3, YA0, 0x33 ); + XB3 = _mm256_blend_epi32( XB3, YB0, 0x33 ); + XC3 = _mm256_blend_epi32( XC3, YC0, 0x33 ); + + // define targets for macros used in round function template + #define ROL_1X32 mm256_shufll128_32 // shuffle within 128 bit lanes + #define ROR_1X32 mm256_shuflr128_32 + #define SWAP_64 mm256_swap128_64 + #define ROL32 mm256_rol_32 + #define ADD32 _mm256_add_epi32 + #define XOR _mm256_xor_si256 + #define TYPE __m256i + + SALSA_8ROUNDS_SIMD128_3BUF; + + #undef ROL_1X32 + #undef ROR_1X32 + #undef SWAP_64 + #undef ROL32 + #undef ADD32 + #undef XOR + #undef TYPE + + YA0 = _mm256_blend_epi32( XA0, XA1, 0x88 ); + YB0 = _mm256_blend_epi32( XB0, XB1, 0x88 ); + YC0 = _mm256_blend_epi32( XC0, XC1, 0x88 ); + YA1 = _mm256_blend_epi32( XA0, XA1, 0x11 ); + YB1 = _mm256_blend_epi32( XB0, XB1, 0x11 ); + YC1 = _mm256_blend_epi32( XC0, XC1, 0x11 ); + YA2 = _mm256_blend_epi32( XA0, XA1, 0x22 ); + YB2 = _mm256_blend_epi32( XB0, XB1, 0x22 ); + YC2 = _mm256_blend_epi32( XC0, XC1, 0x22 ); + YA3 = _mm256_blend_epi32( XA0, XA1, 0x44 ); + YB3 = _mm256_blend_epi32( XB0, XB1, 0x44 ); + YC3 = _mm256_blend_epi32( XC0, XC1, 0x44 ); + + YA0 = _mm256_blend_epi32( YA0, XA2, 0x44 ); + YB0 = _mm256_blend_epi32( YB0, XB2, 0x44 ); + YC0 = _mm256_blend_epi32( YC0, XC2, 0x44 ); + YA1 = _mm256_blend_epi32( YA1, XA2, 0x88 ); + YB1 = _mm256_blend_epi32( YB1, XB2, 0x88 ); + YC1 = _mm256_blend_epi32( YC1, XC2, 0x88 ); + YA2 = _mm256_blend_epi32( YA2, XA2, 0x11 ); + YB2 = _mm256_blend_epi32( YB2, XB2, 0x11 ); + YC2 = _mm256_blend_epi32( YC2, XC2, 0x11 ); + YA3 = _mm256_blend_epi32( YA3, XA2, 0x22 ); + YB3 = _mm256_blend_epi32( YB3, XB2, 0x22 ); + YC3 = _mm256_blend_epi32( YC3, XC2, 0x22 ); + + YA0 = _mm256_blend_epi32( YA0, XA3, 0x22 ); + YB0 = _mm256_blend_epi32( YB0, XB3, 0x22 ); + YC0 = _mm256_blend_epi32( YC0, XC3, 0x22 ); + YA1 = _mm256_blend_epi32( YA1, XA3, 0x44 ); + YB1 = _mm256_blend_epi32( YB1, XB3, 0x44 ); + YC1 = _mm256_blend_epi32( YC1, XC3, 0x44 ); + YA2 = _mm256_blend_epi32( YA2, XA3, 0x88 ); + YB2 = _mm256_blend_epi32( YB2, XB3, 0x88 ); + YC2 = _mm256_blend_epi32( YC2, XC3, 0x88 ); + YA3 = _mm256_blend_epi32( YA3, XA3, 0x11 ); + YB3 = _mm256_blend_epi32( YB3, XB3, 0x11 ); + YC3 = _mm256_blend_epi32( YC3, XC3, 0x11 ); + + BA[0] = _mm256_add_epi32( BA[0], YA0 ); + BB[0] = _mm256_add_epi32( BB[0], YB0 ); + BC[0] = _mm256_add_epi32( BC[0], YC0 ); + BA[1] = _mm256_add_epi32( BA[1], YA1 ); + BB[1] = _mm256_add_epi32( BB[1], YB1 ); + BC[1] = _mm256_add_epi32( BC[1], YC1 ); + BA[2] = _mm256_add_epi32( BA[2], YA2 ); + BB[2] = _mm256_add_epi32( BB[2], YB2 ); + BC[2] = _mm256_add_epi32( BC[2], YC2 ); + BA[3] = _mm256_add_epi32( BA[3], YA3 ); + BB[3] = _mm256_add_epi32( BB[3], YB3 ); + BC[3] = _mm256_add_epi32( BC[3], YC3 ); + +} + +void scrypt_core_2way_simd128_3buf( __m256i *X, __m256i *V, const uint32_t N ) +{ + __m256i *X0 = X; + __m256i *X1 = X+8; + __m256i *X2 = X+16; + __m256i *V0 = V; + __m256i *V1 = V + 8*N; + __m256i *V2 = V + 16*N; + + for ( int i = 0; i < N; i++ ) + { + memcpy( &V0[i * 8], X0, 128*2 ); + memcpy( &V1[i * 8], X1, 128*2 ); + memcpy( &V2[i * 8], X2, 128*2 ); + salsa8_2way_simd128_3buf( 
&X0[0], &X1[0], &X2[0], + &X0[4], &X1[4], &X2[4] ); + salsa8_2way_simd128_3buf( &X0[4], &X1[4], &X2[4], + &X0[0], &X1[0], &X2[0] ); + } + for ( int i = 0; i < N; i++ ) + { + m256_ovly x16a, x16b, x16c; + x16a = ( (m256_ovly*)X0 )[4]; + x16b = ( (m256_ovly*)X1 )[4]; + x16c = ( (m256_ovly*)X2 )[4]; + + uint32_t j0a = 8 * ( x16a.u32[0] & ( N-1 ) ); + uint32_t j0b = 8 * ( x16b.u32[0] & ( N-1 ) ); + uint32_t j0c = 8 * ( x16c.u32[0] & ( N-1 ) ); + uint32_t j1a = 8 * ( x16a.u32[4] & ( N-1 ) ); + uint32_t j1b = 8 * ( x16b.u32[4] & ( N-1 ) ); + uint32_t j1c = 8 * ( x16c.u32[4] & ( N-1 ) ); + + for ( int k = 0; k < 8; k++ ) + { + X0[k] = _mm256_xor_si256( X0[k], + _mm256_blend_epi32( V0[ j1a+k ], V0[ j0a+k ], 0x0f ) ); + X1[k] = _mm256_xor_si256( X1[k], + _mm256_blend_epi32( V1[ j1b+k ], V1[ j0b+k ], 0x0f ) ); + X2[k] = _mm256_xor_si256( X2[k], + _mm256_blend_epi32( V2[ j1c+k ], V2[ j0c+k ], 0x0f ) ); + } + + salsa8_2way_simd128_3buf( &X0[0], &X1[0], &X2[0], + &X0[4], &X1[4], &X2[4] ); + salsa8_2way_simd128_3buf( &X0[4], &X1[4], &X2[4], + &X0[0], &X1[0], &X2[0] ); + } +} + + +// 2x memory usage + +// Tested OK, good speed +// +// Serial SIMD over 2 way parallel + +// Uses uint64_t as a poorman's vector then applying linear SIMD to the +// pairs of data. +// +// Interleaving is standard 2 way. +// Use 64 bit shuffles but 32 bit arithmetic. + +// B = { lane1, lane0 } +// b[i] = { B[4*i+3], B[4*i+2], B[4*i+1], B[4*i] } + +// 2x32 interleaving +static void salsa8_simd128_2way( uint64_t *b, const uint64_t *c ) +{ + __m256i X0, X1, X2, X3, Y0, Y1, Y2, Y3; + __m256i *B = (__m256i*)b; + const __m256i *C = (const __m256i*)c; + + // mix C into B then shuffle B into X + B[0] = _mm256_xor_si256( B[0], C[0] ); + B[1] = _mm256_xor_si256( B[1], C[1] ); + B[2] = _mm256_xor_si256( B[2], C[2] ); + B[3] = _mm256_xor_si256( B[3], C[3] ); + + Y0 = _mm256_blend_epi32( B[1], B[0], 0x03 ); + X0 = _mm256_blend_epi32( B[3], B[2], 0x30 ); + X0 = _mm256_blend_epi32( X0, Y0, 0x0f); + + Y0 = _mm256_blend_epi32( B[2], B[1], 0x03 ); + X1 = _mm256_blend_epi32( B[0], B[3], 0x30 ); + X1 = _mm256_blend_epi32( X1, Y0, 0x0f ); + + Y0 = _mm256_blend_epi32( B[3], B[2], 0x03 ); + X2 = _mm256_blend_epi32( B[1], B[0], 0x30 ); + X2 = _mm256_blend_epi32( X2, Y0, 0x0f ); + + Y0 = _mm256_blend_epi32( B[0], B[3], 0x03 ); + X3 = _mm256_blend_epi32( B[2], B[1], 0x30 ); + X3 = _mm256_blend_epi32( X3, Y0, 0x0f ); + + // define targets for macros used in round function template + #define ROL_1X32 mm256_shufll_64 + #define ROR_1X32 mm256_shuflr_64 + #define SWAP_64 mm256_swap_128 + #define ROL32 mm256_rol_32 + #define ADD32 _mm256_add_epi32 + #define XOR _mm256_xor_si256 + + SALSA_8ROUNDS_SIMD128; + + #undef ROL_1X32 + #undef ROR_1X32 + #undef SWAP_64 + #undef ROL32 + #undef ADD32 + #undef XOR + + Y0 = _mm256_blend_epi32( X0, X1, 0xc0 ); + Y1 = _mm256_blend_epi32( X0, X1, 0x03 ); + Y2 = _mm256_blend_epi32( X0, X1, 0x0c ); + Y3 = _mm256_blend_epi32( X0, X1, 0x30 ); + + Y0 = _mm256_blend_epi32( Y0, X2, 0x30 ); + Y1 = _mm256_blend_epi32( Y1, X2, 0xc0 ); + Y2 = _mm256_blend_epi32( Y2, X2, 0x03 ); + Y3 = _mm256_blend_epi32( Y3, X2, 0x0c ); + + Y0 = _mm256_blend_epi32( Y0, X3, 0x0c ); + Y1 = _mm256_blend_epi32( Y1, X3, 0x30 ); + Y2 = _mm256_blend_epi32( Y2, X3, 0xc0 ); + Y3 = _mm256_blend_epi32( Y3, X3, 0x03 ); + + B[0] = _mm256_add_epi32( B[0], Y0 ); + B[1] = _mm256_add_epi32( B[1], Y1 ); + B[2] = _mm256_add_epi32( B[2], Y2 ); + B[3] = _mm256_add_epi32( B[3], Y3 ); + +} + +// data format for 256 bits: 4 * ( 2 way 32 ) +// { l1d3, l0d3, l1d2, l0d2, l1d1, 
l0d1, l1d0, l0d0 } + +void scrypt_core_simd128_2way( uint64_t *X, uint64_t *V, const uint32_t N ) +{ + for ( int i = 0; i < N; i++ ) + { + for ( int k = 0; k < 8; k++ ) + _mm256_stream_si256( (__m256i*)V + i*8 + k, casti_m256i( X, k ) ); + salsa8_simd128_2way( &X[ 0], &X[16] ); + salsa8_simd128_2way( &X[16], &X[ 0] ); + } + + for ( int i = 0; i < N; i++ ) + { + // need 2 J's + const uint32_t j0 = 32 * ( (uint32_t)( X[16] ) & ( N-1 ) ); + const uint32_t j1 = 32 * ( (uint32_t)( X[16] >> 32 ) & ( N-1 ) ); + + for ( int k = 0; k < 32; k++ ) + X[k] ^= ( ( V[ j1 + k ] & 0xffffffff00000000 ) + | ( V[ j0 + k ] & 0x00000000ffffffff ) ); + + salsa8_simd128_2way( &X[ 0], &X[16] ); + salsa8_simd128_2way( &X[16], &X[ 0] ); + } +} + +// Double buffered, 4x memory usage +// 2x32 interleaving +static void salsa8_simd128_2way_2buf( uint64_t *ba, uint64_t *bb, + const uint64_t *ca, const uint64_t *cb ) +{ + __m256i XA0, XA1, XA2, XA3, XB0, XB1, XB2, XB3; + __m256i YA0, YA1, YA2, YA3, YB0, YB1, YB2, YB3; + __m256i *BA = (__m256i*)ba; + __m256i *BB = (__m256i*)bb; + const __m256i *CA = (const __m256i*)ca; + const __m256i *CB = (const __m256i*)cb; + + // mix C into B then shuffle B into X + BA[0] = _mm256_xor_si256( BA[0], CA[0] ); + BB[0] = _mm256_xor_si256( BB[0], CB[0] ); + BA[1] = _mm256_xor_si256( BA[1], CA[1] ); + BB[1] = _mm256_xor_si256( BB[1], CB[1] ); + BA[2] = _mm256_xor_si256( BA[2], CA[2] ); + BB[2] = _mm256_xor_si256( BB[2], CB[2] ); + BA[3] = _mm256_xor_si256( BA[3], CA[3] ); + BB[3] = _mm256_xor_si256( BB[3], CB[3] ); + + YA0 = _mm256_blend_epi32( BA[1], BA[0], 0x03 ); + YB0 = _mm256_blend_epi32( BB[1], BB[0], 0x03 ); + XA0 = _mm256_blend_epi32( BA[3], BA[2], 0x30 ); + XB0 = _mm256_blend_epi32( BB[3], BB[2], 0x30 ); + XA0 = _mm256_blend_epi32( XA0, YA0, 0x0f); + XB0 = _mm256_blend_epi32( XB0, YB0, 0x0f); + + YA0 = _mm256_blend_epi32( BA[2], BA[1], 0x03 ); + YB0 = _mm256_blend_epi32( BB[2], BB[1], 0x03 ); + XA1 = _mm256_blend_epi32( BA[0], BA[3], 0x30 ); + XB1 = _mm256_blend_epi32( BB[0], BB[3], 0x30 ); + XA1 = _mm256_blend_epi32( XA1, YA0, 0x0f ); + XB1 = _mm256_blend_epi32( XB1, YB0, 0x0f ); + + YA0 = _mm256_blend_epi32( BA[3], BA[2], 0x03 ); + YB0 = _mm256_blend_epi32( BB[3], BB[2], 0x03 ); + XA2 = _mm256_blend_epi32( BA[1], BA[0], 0x30 ); + XB2 = _mm256_blend_epi32( BB[1], BB[0], 0x30 ); + XA2 = _mm256_blend_epi32( XA2, YA0, 0x0f ); + XB2 = _mm256_blend_epi32( XB2, YB0, 0x0f ); + + YA0 = _mm256_blend_epi32( BA[0], BA[3], 0x03 ); + YB0 = _mm256_blend_epi32( BB[0], BB[3], 0x03 ); + XA3 = _mm256_blend_epi32( BA[2], BA[1], 0x30 ); + XB3 = _mm256_blend_epi32( BB[2], BB[1], 0x30 ); + XA3 = _mm256_blend_epi32( XA3, YA0, 0x0f ); + XB3 = _mm256_blend_epi32( XB3, YB0, 0x0f ); + + // define targets for macros used in round function template + #define ROL_1X32 mm256_shufll_64 + #define ROR_1X32 mm256_shuflr_64 + #define SWAP_64 mm256_swap_128 + #define ROL32 mm256_rol_32 + #define ADD32 _mm256_add_epi32 + #define XOR _mm256_xor_si256 + #define TYPE __m256i + + SALSA_8ROUNDS_SIMD128_2BUF; + + #undef ROL_1X32 + #undef ROR_1X32 + #undef SWAP_64 + #undef ROL32 + #undef ADD32 + #undef XOR + #undef TYPE + + YA0 = _mm256_blend_epi32( XA0, XA1, 0xc0 ); + YB0 = _mm256_blend_epi32( XB0, XB1, 0xc0 ); + YA1 = _mm256_blend_epi32( XA0, XA1, 0x03 ); + YB1 = _mm256_blend_epi32( XB0, XB1, 0x03 ); + YA2 = _mm256_blend_epi32( XA0, XA1, 0x0c ); + YB2 = _mm256_blend_epi32( XB0, XB1, 0x0c ); + YA3 = _mm256_blend_epi32( XA0, XA1, 0x30 ); + YB3 = _mm256_blend_epi32( XB0, XB1, 0x30 ); + + YA0 = _mm256_blend_epi32( YA0, XA2, 
0x30 ); + YB0 = _mm256_blend_epi32( YB0, XB2, 0x30 ); + YA1 = _mm256_blend_epi32( YA1, XA2, 0xc0 ); + YB1 = _mm256_blend_epi32( YB1, XB2, 0xc0 ); + YA2 = _mm256_blend_epi32( YA2, XA2, 0x03 ); + YB2 = _mm256_blend_epi32( YB2, XB2, 0x03 ); + YA3 = _mm256_blend_epi32( YA3, XA2, 0x0c ); + YB3 = _mm256_blend_epi32( YB3, XB2, 0x0c ); + + YA0 = _mm256_blend_epi32( YA0, XA3, 0x0c ); + YB0 = _mm256_blend_epi32( YB0, XB3, 0x0c ); + YA1 = _mm256_blend_epi32( YA1, XA3, 0x30 ); + YB1 = _mm256_blend_epi32( YB1, XB3, 0x30 ); + YA2 = _mm256_blend_epi32( YA2, XA3, 0xc0 ); + YB2 = _mm256_blend_epi32( YB2, XB3, 0xc0 ); + YA3 = _mm256_blend_epi32( YA3, XA3, 0x03 ); + YB3 = _mm256_blend_epi32( YB3, XB3, 0x03 ); + + BA[0] = _mm256_add_epi32( BA[0], YA0 ); + BB[0] = _mm256_add_epi32( BB[0], YB0 ); + BA[1] = _mm256_add_epi32( BA[1], YA1 ); + BB[1] = _mm256_add_epi32( BB[1], YB1 ); + BA[2] = _mm256_add_epi32( BA[2], YA2 ); + BB[2] = _mm256_add_epi32( BB[2], YB2 ); + BA[3] = _mm256_add_epi32( BA[3], YA3 ); + BB[3] = _mm256_add_epi32( BB[3], YB3 ); + +} + +void scrypt_core_simd128_2way_2buf( uint64_t *X, uint64_t *V, const uint32_t N ) + +{ + uint64_t *X0 = X; + uint64_t *X1 = X+32; + uint64_t *V0 = V; + uint64_t *V1 = V + 32*N; + + for ( int i = 0; i < N; i++ ) + { + for ( int k = 0; k < 8; k++ ) + { + _mm256_stream_si256( (__m256i*)V0 + i*8 + k, casti_m256i( X0, k ) ); + _mm256_stream_si256( (__m256i*)V1 + i*8 + k, casti_m256i( X1, k ) ); + } + salsa8_simd128_2way_2buf( &X0[ 0], &X1[ 0], &X0[16], &X1[16] ); + salsa8_simd128_2way_2buf( &X0[16], &X1[16], &X0[ 0], &X1[ 0] ); + } + + for ( int i = 0; i < N; i++ ) + { + // need 4 J's + const uint32_t j0l = 32 * ( (const uint32_t)( X0[16] ) & ( N-1 ) ); + const uint32_t j0h = 32 * ( (const uint32_t)( X0[16] >> 32 ) & ( N-1 ) ); + const uint32_t j1l = 32 * ( (const uint32_t)( X1[16] ) & ( N-1 ) ); + const uint32_t j1h = 32 * ( (const uint32_t)( X1[16] >> 32 ) & ( N-1 ) ); + + for ( int k = 0; k < 32; k++ ) + { + X0[k] ^= ( ( V0[ j0h + k ] & 0xffffffff00000000 ) + | ( V0[ j0l + k ] & 0x00000000ffffffff ) ); + X1[k] ^= ( ( V1[ j1h + k ] & 0xffffffff00000000 ) + | ( V1[ j1l + k ] & 0x00000000ffffffff ) ); + } + salsa8_simd128_2way_2buf( &X0[ 0], &X1[ 0], &X0[16], &X1[16] ); + salsa8_simd128_2way_2buf( &X0[16], &X1[16], &X0[ 0], &X1[ 0] ); + } +} + +// Working, deprecated, not up to date +// Triple buffered 2 way, 6x memory usage +// 2x32 interleaving +static void salsa8_simd128_2way_3buf( uint64_t *BA, uint64_t *BB, + uint64_t *BC, const uint64_t *CA, const uint64_t *CB, + const uint64_t *CC ) +{ + __m256i XA0, XA1, XA2, XA3, XB0, XB1, XB2, XB3, + XC0, XC1, XC2, XC3; + __m256i *ba = (__m256i*)BA; + __m256i *bb = (__m256i*)BB; + __m256i *bc = (__m256i*)BC; + const __m256i *ca = (const __m256i*)CA; + const __m256i *cb = (const __m256i*)CB; + const __m256i *cc = (const __m256i*)CC; + m256_ovly ya[4], yb[4], yc[4], + za[4], zb[4], zc[4]; + + // mix C into B then shuffle B into X + ba[0] = _mm256_xor_si256( ba[0], ca[0] ); + bb[0] = _mm256_xor_si256( bb[0], cb[0] ); + bc[0] = _mm256_xor_si256( bc[0], cc[0] ); + ba[1] = _mm256_xor_si256( ba[1], ca[1] ); + bb[1] = _mm256_xor_si256( bb[1], cb[1] ); + bc[1] = _mm256_xor_si256( bc[1], cc[1] ); + ba[2] = _mm256_xor_si256( ba[2], ca[2] ); + bb[2] = _mm256_xor_si256( bb[2], cb[2] ); + bc[2] = _mm256_xor_si256( bc[2], cc[2] ); + ba[3] = _mm256_xor_si256( ba[3], ca[3] ); + bb[3] = _mm256_xor_si256( bb[3], cb[3] ); + bc[3] = _mm256_xor_si256( bc[3], cc[3] ); + + XA0 = _mm256_set_epi64x( BA[15], BA[10], BA[ 5], BA[ 0] ); + XB0 = 
_mm256_set_epi64x( BB[15], BB[10], BB[ 5], BB[ 0] ); + XC0 = _mm256_set_epi64x( BC[15], BC[10], BC[ 5], BC[ 0] ); + XA1 = _mm256_set_epi64x( BA[ 3], BA[14], BA[ 9], BA[ 4] ); + XB1 = _mm256_set_epi64x( BB[ 3], BB[14], BB[ 9], BB[ 4] ); + XC1 = _mm256_set_epi64x( BC[ 3], BC[14], BC[ 9], BC[ 4] ); + XA2 = _mm256_set_epi64x( BA[ 7], BA[ 2], BA[13], BA[ 8] ); + XB2 = _mm256_set_epi64x( BB[ 7], BB[ 2], BB[13], BB[ 8] ); + XC2 = _mm256_set_epi64x( BC[ 7], BC[ 2], BC[13], BC[ 8] ); + XA3 = _mm256_set_epi64x( BA[11], BA[ 6], BA[ 1], BA[12] ); + XB3 = _mm256_set_epi64x( BB[11], BB[ 6], BB[ 1], BB[12] ); + XC3 = _mm256_set_epi64x( BC[11], BC[ 6], BC[ 1], BC[12] ); + + // define targets for macros used in round function template + #define ROL_1X32 mm256_shufll_64 + #define ROR_1X32 mm256_shuflr_64 + #define SWAP_64 mm256_swap_128 + #define ROL32 mm256_rol_32 + #define ADD32 _mm256_add_epi32 + #define XOR _mm256_xor_si256 + #define TYPE __m256i + + SALSA_8ROUNDS_FINAL_SIMD128_3BUF; + + #undef ROL_1X32 + #undef ROR_1X32 + #undef SWAP_64 + #undef ROL32 + #undef ADD32 + #undef XOR + #undef TYPE + + ya[0].m256 = XA0; yb[0].m256 = XB0; + yc[0].m256 = XC0; + ya[1].m256 = XA1; yb[1].m256 = XB1; + yc[1].m256 = XC1; + ya[2].m256 = XA2; yb[2].m256 = XB2; + yc[2].m256 = XC2; + ya[3].m256 = XA3; yb[3].m256 = XB3; + yc[3].m256 = XC3; + + za[0].u64[0] = ya[0].u64[0]; + zb[0].u64[0] = yb[0].u64[0]; + zc[0].u64[0] = yc[0].u64[0]; + za[0].u64[3] = ya[1].u64[0]; + zb[0].u64[3] = yb[1].u64[0]; + zc[0].u64[3] = yc[1].u64[0]; + za[0].u64[2] = ya[2].u64[0]; + zb[0].u64[2] = yb[2].u64[0]; + zc[0].u64[2] = yc[2].u64[0]; + za[0].u64[1] = ya[3].u64[0]; + zb[0].u64[1] = yb[3].u64[0]; + zc[0].u64[1] = yc[3].u64[0]; + + za[1].u64[1] = ya[0].u64[1]; + zb[1].u64[1] = yb[0].u64[1]; + zc[1].u64[1] = yc[0].u64[1]; + za[1].u64[0] = ya[1].u64[1]; + zb[1].u64[0] = yb[1].u64[1]; + zc[1].u64[0] = yc[1].u64[1]; + za[1].u64[3] = ya[2].u64[1]; + zb[1].u64[3] = yb[2].u64[1]; + zc[1].u64[3] = yc[2].u64[1]; + za[1].u64[2] = ya[3].u64[1]; + zb[1].u64[2] = yb[3].u64[1]; + zc[1].u64[2] = yc[3].u64[1]; + + za[2].u64[2] = ya[0].u64[2]; + zb[2].u64[2] = yb[0].u64[2]; + zc[2].u64[2] = yc[0].u64[2]; + za[2].u64[1] = ya[1].u64[2]; + zb[2].u64[1] = yb[1].u64[2]; + zc[2].u64[1] = yc[1].u64[2]; + za[2].u64[0] = ya[2].u64[2]; + zb[2].u64[0] = yb[2].u64[2]; + zc[2].u64[0] = yc[2].u64[2]; + za[2].u64[3] = ya[3].u64[2]; + zb[2].u64[3] = yb[3].u64[2]; + zc[2].u64[3] = yc[3].u64[2]; + + za[3].u64[3] = ya[0].u64[3]; + zb[3].u64[3] = yb[0].u64[3]; + zc[3].u64[3] = yc[0].u64[3]; + za[3].u64[2] = ya[1].u64[3]; + zb[3].u64[2] = yb[1].u64[3]; + zc[3].u64[2] = yc[1].u64[3]; + za[3].u64[1] = ya[2].u64[3]; + zb[3].u64[1] = yb[2].u64[3]; + zc[3].u64[1] = yc[2].u64[3]; + za[3].u64[0] = ya[3].u64[3]; + zb[3].u64[0] = yb[3].u64[3]; + zc[3].u64[0] = yc[3].u64[3]; + + ba[0] = _mm256_add_epi32( ba[0], za[0].m256 ); + bb[0] = _mm256_add_epi32( bb[0], zb[0].m256 ); + bc[0] = _mm256_add_epi32( bc[0], zc[0].m256 ); + ba[1] = _mm256_add_epi32( ba[1], za[1].m256 ); + bb[1] = _mm256_add_epi32( bb[1], zb[1].m256 ); + bc[1] = _mm256_add_epi32( bc[1], zc[1].m256 ); + ba[2] = _mm256_add_epi32( ba[2], za[2].m256 ); + bb[2] = _mm256_add_epi32( bb[2], zb[2].m256 ); + bc[2] = _mm256_add_epi32( bc[2], zc[2].m256 ); + ba[3] = _mm256_add_epi32( ba[3], za[3].m256 ); + bb[3] = _mm256_add_epi32( bb[3], zb[3].m256 ); + bc[3] = _mm256_add_epi32( bc[3], zc[3].m256 ); +} + +void scrypt_core_simd128_2way_3buf( uint64_t *X, uint64_t *V, + const uint32_t N ) +{ + uint64_t *X0 = X; + uint64_t *X1 = X+32; + 
uint64_t *X2 = X+64; + uint64_t *V0 = V; + uint64_t *V1 = V + 32*N; + uint64_t *V2 = V + 64*N; + + for ( int i = 0; i < N; i++ ) + { + memcpy( &V0[i * 32], X0, 2*128 ); + memcpy( &V1[i * 32], X1, 2*128 ); + memcpy( &V2[i * 32], X2, 2*128 ); + salsa8_simd128_2way_3buf( &X0[ 0], &X1[ 0], &X2[ 0], + &X0[16], &X1[16], &X2[16] ); + salsa8_simd128_2way_3buf( &X0[16], &X1[16], &X2[16], + &X0[ 0], &X1[ 0], &X2[ 0] ); + } + + for ( int i = 0; i < N; i++ ) + { + uint32_t j0l = 32 * ( (uint32_t)( X0[16] ) & ( N-1 ) ); + uint32_t j0h = 32 * ( (uint32_t)( X0[16] >> 32 ) & ( N-1 ) ); + uint32_t j1l = 32 * ( (uint32_t)( X1[16] ) & ( N-1 ) ); + uint32_t j1h = 32 * ( (uint32_t)( X1[16] >> 32 ) & ( N-1 ) ); + uint32_t j2l = 32 * ( (uint32_t)( X2[16] ) & ( N-1 ) ); + uint32_t j2h = 32 * ( (uint32_t)( X2[16] >> 32 ) & ( N-1 ) ); + + for ( int k = 0; k < 32; k++ ) + { + X0[k] ^= ( ( V0[ j0h + k ] & 0xffffffff00000000 ) + | ( V0[ j0l + k ] & 0x00000000ffffffff ) ); + X1[k] ^= ( ( V1[ j1h + k ] & 0xffffffff00000000 ) + | ( V1[ j1l + k ] & 0x00000000ffffffff ) ); + X2[k] ^= ( ( V2[ j2h + k ] & 0xffffffff00000000 ) + | ( V2[ j2l + k ] & 0x00000000ffffffff ) ); + } + salsa8_simd128_2way_3buf( &X0[ 0], &X1[ 0], &X2[ 0], + &X0[16], &X1[16], &X2[16] ); + salsa8_simd128_2way_3buf( &X0[16], &X1[16], &X2[16], + &X0[ 0], &X1[ 0], &X2[ 0] ); + } +} + +// Working, deprecated +// 8x memory usage +// 2x32 interleaving +static void salsa8_simd128_2way_4buf( uint64_t *BA, uint64_t *BB, + uint64_t *BC, uint64_t *BD, const uint64_t *CA, const uint64_t *CB, + const uint64_t *CC, const uint64_t *CD ) +{ + __m256i XA0, XA1, XA2, XA3, XB0, XB1, XB2, XB3, + XC0, XC1, XC2, XC3, XD0, XD1, XD2, XD3; + __m256i *ba = (__m256i*)BA; + __m256i *bb = (__m256i*)BB; + __m256i *bc = (__m256i*)BC; + __m256i *bd = (__m256i*)BD; + const __m256i *ca = (const __m256i*)CA; + const __m256i *cb = (const __m256i*)CB; + const __m256i *cc = (const __m256i*)CC; + const __m256i *cd = (const __m256i*)CD; + m256_ovly ya[4], yb[4], yc[4], yd[4], + za[4], zb[4], zc[4], zd[4]; + + // mix C into B then shuffle B into X + ba[0] = _mm256_xor_si256( ba[0], ca[0] ); + bb[0] = _mm256_xor_si256( bb[0], cb[0] ); + bc[0] = _mm256_xor_si256( bc[0], cc[0] ); + bd[0] = _mm256_xor_si256( bd[0], cd[0] ); + ba[1] = _mm256_xor_si256( ba[1], ca[1] ); + bb[1] = _mm256_xor_si256( bb[1], cb[1] ); + bc[1] = _mm256_xor_si256( bc[1], cc[1] ); + bd[1] = _mm256_xor_si256( bd[1], cd[1] ); + ba[2] = _mm256_xor_si256( ba[2], ca[2] ); + bb[2] = _mm256_xor_si256( bb[2], cb[2] ); + bc[2] = _mm256_xor_si256( bc[2], cc[2] ); + bd[2] = _mm256_xor_si256( bd[2], cd[2] ); + ba[3] = _mm256_xor_si256( ba[3], ca[3] ); + bb[3] = _mm256_xor_si256( bb[3], cb[3] ); + bc[3] = _mm256_xor_si256( bc[3], cc[3] ); + bd[3] = _mm256_xor_si256( bd[3], cd[3] ); + + XA0 = _mm256_set_epi64x( BA[15], BA[10], BA[ 5], BA[ 0] ); + XB0 = _mm256_set_epi64x( BB[15], BB[10], BB[ 5], BB[ 0] ); + XC0 = _mm256_set_epi64x( BC[15], BC[10], BC[ 5], BC[ 0] ); + XD0 = _mm256_set_epi64x( BD[15], BD[10], BD[ 5], BD[ 0] ); + XA1 = _mm256_set_epi64x( BA[ 3], BA[14], BA[ 9], BA[ 4] ); + XB1 = _mm256_set_epi64x( BB[ 3], BB[14], BB[ 9], BB[ 4] ); + XC1 = _mm256_set_epi64x( BC[ 3], BC[14], BC[ 9], BC[ 4] ); + XD1 = _mm256_set_epi64x( BD[ 3], BD[14], BD[ 9], BD[ 4] ); + XA2 = _mm256_set_epi64x( BA[ 7], BA[ 2], BA[13], BA[ 8] ); + XB2 = _mm256_set_epi64x( BB[ 7], BB[ 2], BB[13], BB[ 8] ); + XC2 = _mm256_set_epi64x( BC[ 7], BC[ 2], BC[13], BC[ 8] ); + XD2 = _mm256_set_epi64x( BD[ 7], BD[ 2], BD[13], BD[ 8] ); + XA3 = _mm256_set_epi64x( BA[11], 
BA[ 6], BA[ 1], BA[12] ); + XB3 = _mm256_set_epi64x( BB[11], BB[ 6], BB[ 1], BB[12] ); + XC3 = _mm256_set_epi64x( BC[11], BC[ 6], BC[ 1], BC[12] ); + XD3 = _mm256_set_epi64x( BD[11], BD[ 6], BD[ 1], BD[12] ); + + // define targets for macros used in round function template + #define ROL_1X32 mm256_shufll_64 + #define ROR_1X32 mm256_shuflr_64 + #define SWAP_64 mm256_swap_128 + #define ROL32 mm256_rol_32 + #define ADD32 _mm256_add_epi32 + #define XOR _mm256_xor_si256 + #define TYPE __m256i + + SALSA_8ROUNDS_FINAL_SIMD128_4BUF; + + #undef ROL_1X32 + #undef ROR_1X32 + #undef SWAP_64 + #undef ROL32 + #undef ADD32 + #undef XOR + #undef TYPE + + ya[0].m256 = XA0; yb[0].m256 = XB0; + yc[0].m256 = XC0; yd[0].m256 = XD0; + ya[1].m256 = XA1; yb[1].m256 = XB1; + yc[1].m256 = XC1; yd[1].m256 = XD1; + ya[2].m256 = XA2; yb[2].m256 = XB2; + yc[2].m256 = XC2; yd[2].m256 = XD2; + ya[3].m256 = XA3; yb[3].m256 = XB3; + yc[3].m256 = XC3; yd[3].m256 = XD3; + + za[0].u64[0] = ya[0].u64[0]; + zb[0].u64[0] = yb[0].u64[0]; + zc[0].u64[0] = yc[0].u64[0]; + zd[0].u64[0] = yd[0].u64[0]; + za[0].u64[3] = ya[1].u64[0]; + zb[0].u64[3] = yb[1].u64[0]; + zc[0].u64[3] = yc[1].u64[0]; + zd[0].u64[3] = yd[1].u64[0]; + za[0].u64[2] = ya[2].u64[0]; + zb[0].u64[2] = yb[2].u64[0]; + zc[0].u64[2] = yc[2].u64[0]; + zd[0].u64[2] = yd[2].u64[0]; + za[0].u64[1] = ya[3].u64[0]; + zb[0].u64[1] = yb[3].u64[0]; + zc[0].u64[1] = yc[3].u64[0]; + zd[0].u64[1] = yd[3].u64[0]; + + za[1].u64[1] = ya[0].u64[1]; + zb[1].u64[1] = yb[0].u64[1]; + zc[1].u64[1] = yc[0].u64[1]; + zd[1].u64[1] = yd[0].u64[1]; + za[1].u64[0] = ya[1].u64[1]; + zb[1].u64[0] = yb[1].u64[1]; + zc[1].u64[0] = yc[1].u64[1]; + zd[1].u64[0] = yd[1].u64[1]; + za[1].u64[3] = ya[2].u64[1]; + zb[1].u64[3] = yb[2].u64[1]; + zc[1].u64[3] = yc[2].u64[1]; + zd[1].u64[3] = yd[2].u64[1]; + za[1].u64[2] = ya[3].u64[1]; + zb[1].u64[2] = yb[3].u64[1]; + zc[1].u64[2] = yc[3].u64[1]; + zd[1].u64[2] = yd[3].u64[1]; + + za[2].u64[2] = ya[0].u64[2]; + zb[2].u64[2] = yb[0].u64[2]; + zc[2].u64[2] = yc[0].u64[2]; + zd[2].u64[2] = yd[0].u64[2]; + za[2].u64[1] = ya[1].u64[2]; + zb[2].u64[1] = yb[1].u64[2]; + zc[2].u64[1] = yc[1].u64[2]; + zd[2].u64[1] = yd[1].u64[2]; + za[2].u64[0] = ya[2].u64[2]; + zb[2].u64[0] = yb[2].u64[2]; + zc[2].u64[0] = yc[2].u64[2]; + zd[2].u64[0] = yd[2].u64[2]; + za[2].u64[3] = ya[3].u64[2]; + zb[2].u64[3] = yb[3].u64[2]; + zc[2].u64[3] = yc[3].u64[2]; + zd[2].u64[3] = yd[3].u64[2]; + + za[3].u64[3] = ya[0].u64[3]; + zb[3].u64[3] = yb[0].u64[3]; + zc[3].u64[3] = yc[0].u64[3]; + zd[3].u64[3] = yd[0].u64[3]; + za[3].u64[2] = ya[1].u64[3]; + zb[3].u64[2] = yb[1].u64[3]; + zc[3].u64[2] = yc[1].u64[3]; + zd[3].u64[2] = yd[1].u64[3]; + za[3].u64[1] = ya[2].u64[3]; + zb[3].u64[1] = yb[2].u64[3]; + zc[3].u64[1] = yc[2].u64[3]; + zd[3].u64[1] = yd[2].u64[3]; + za[3].u64[0] = ya[3].u64[3]; + zb[3].u64[0] = yb[3].u64[3]; + zc[3].u64[0] = yc[3].u64[3]; + zd[3].u64[0] = yd[3].u64[3]; + + ba[0] = _mm256_add_epi32( ba[0], za[0].m256 ); + bb[0] = _mm256_add_epi32( bb[0], zb[0].m256 ); + bc[0] = _mm256_add_epi32( bc[0], zc[0].m256 ); + bd[0] = _mm256_add_epi32( bd[0], zd[0].m256 ); + ba[1] = _mm256_add_epi32( ba[1], za[1].m256 ); + bb[1] = _mm256_add_epi32( bb[1], zb[1].m256 ); + bc[1] = _mm256_add_epi32( bc[1], zc[1].m256 ); + bd[1] = _mm256_add_epi32( bd[1], zd[1].m256 ); + ba[2] = _mm256_add_epi32( ba[2], za[2].m256 ); + bb[2] = _mm256_add_epi32( bb[2], zb[2].m256 ); + bc[2] = _mm256_add_epi32( bc[2], zc[2].m256 ); + bd[2] = _mm256_add_epi32( bd[2], zd[2].m256 ); + ba[3] = 
_mm256_add_epi32( ba[3], za[3].m256 ); + bb[3] = _mm256_add_epi32( bb[3], zb[3].m256 ); + bc[3] = _mm256_add_epi32( bc[3], zc[3].m256 ); + bd[3] = _mm256_add_epi32( bd[3], zd[3].m256 ); +} + +void scrypt_core_simd128_2way_4buf( uint64_t *X, uint64_t *V, const uint32_t N ) + +{ + uint64_t *X0 = X; + uint64_t *X1 = X+32; + uint64_t *X2 = X+64; + uint64_t *X3 = X+96; + uint64_t *V0 = V; + uint64_t *V1 = V + 32*N; + uint64_t *V2 = V + 64*N; + uint64_t *V3 = V + 96*N; + + for ( int i = 0; i < N; i++ ) + { + memcpy( &V0[i * 32], X0, 2*128 ); + memcpy( &V1[i * 32], X1, 2*128 ); + memcpy( &V2[i * 32], X2, 2*128 ); + memcpy( &V3[i * 32], X3, 2*128 ); + salsa8_simd128_2way_4buf( &X0[ 0], &X1[ 0], &X2[ 0], &X3[ 0], + &X0[16], &X1[16], &X2[16], &X3[16] ); + salsa8_simd128_2way_4buf( &X0[16], &X1[16], &X2[16], &X3[16], + &X0[ 0], &X1[ 0], &X2[ 0], &X3[ 0] ); + } + + for ( int i = 0; i < N; i++ ) + { + // need 4 J's + uint32_t j0l = 32 * ( (uint32_t)( X0[16] ) & ( N-1 ) ); + uint32_t j0h = 32 * ( (uint32_t)( X0[16] >> 32 ) & ( N-1 ) ); + uint32_t j1l = 32 * ( (uint32_t)( X1[16] ) & ( N-1 ) ); + uint32_t j1h = 32 * ( (uint32_t)( X1[16] >> 32 ) & ( N-1 ) ); + uint32_t j2l = 32 * ( (uint32_t)( X2[16] ) & ( N-1 ) ); + uint32_t j2h = 32 * ( (uint32_t)( X2[16] >> 32 ) & ( N-1 ) ); + uint32_t j3l = 32 * ( (uint32_t)( X3[16] ) & ( N-1 ) ); + uint32_t j3h = 32 * ( (uint32_t)( X3[16] >> 32 ) & ( N-1 ) ); + + for ( int k = 0; k < 32; k++ ) + { + X0[k] ^= ( ( V0[ j0h + k ] & 0xffffffff00000000 ) + | ( V0[ j0l + k ] & 0x00000000ffffffff ) ); + X1[k] ^= ( ( V1[ j1h + k ] & 0xffffffff00000000 ) + | ( V1[ j1l + k ] & 0x00000000ffffffff ) ); + X2[k] ^= ( ( V2[ j2h + k ] & 0xffffffff00000000 ) + | ( V2[ j2l + k ] & 0x00000000ffffffff ) ); + X3[k] ^= ( ( V3[ j3h + k ] & 0xffffffff00000000 ) + | ( V3[ j3l + k ] & 0x00000000ffffffff ) ); + } + salsa8_simd128_2way_4buf( &X0[ 0], &X1[ 0], &X2[ 0], &X3[ 0], + &X0[16], &X1[16], &X2[16], &X3[16] ); + salsa8_simd128_2way_4buf( &X0[16], &X1[16], &X2[16], &X3[16], + &X0[ 0], &X1[ 0], &X2[ 0], &X3[ 0] ); + } +} + + +#endif // AVX2 + +#if defined(__SSE2__) // required and assumed + +// Simple 4 way parallel. 
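+// For reference, the scalar salsa20/8 core that each 32-bit lane of the
+// vectored code below computes.  This is an illustrative sketch only and is
+// compiled out; the name xor_salsa8_ref is hypothetical and not called
+// anywhere in the miner.
+#if 0
+static void xor_salsa8_ref( uint32_t B[16], const uint32_t C[16] )
+{
+   uint32_t x[16];
+   for ( int i = 0; i < 16; i++ )  x[i] = B[i] ^= C[i];
+#define R( a, b )  ( ( (a) << (b) ) | ( (a) >> ( 32 - (b) ) ) )
+   for ( int i = 0; i < 8; i += 2 )
+   {
+      // column round
+      x[ 4] ^= R( x[ 0]+x[12],  7 );   x[ 8] ^= R( x[ 4]+x[ 0],  9 );
+      x[12] ^= R( x[ 8]+x[ 4], 13 );   x[ 0] ^= R( x[12]+x[ 8], 18 );
+      x[ 9] ^= R( x[ 5]+x[ 1],  7 );   x[13] ^= R( x[ 9]+x[ 5],  9 );
+      x[ 1] ^= R( x[13]+x[ 9], 13 );   x[ 5] ^= R( x[ 1]+x[13], 18 );
+      x[14] ^= R( x[10]+x[ 6],  7 );   x[ 2] ^= R( x[14]+x[10],  9 );
+      x[ 6] ^= R( x[ 2]+x[14], 13 );   x[10] ^= R( x[ 6]+x[ 2], 18 );
+      x[ 3] ^= R( x[15]+x[11],  7 );   x[ 7] ^= R( x[ 3]+x[15],  9 );
+      x[11] ^= R( x[ 7]+x[ 3], 13 );   x[15] ^= R( x[11]+x[ 7], 18 );
+      // row round
+      x[ 1] ^= R( x[ 0]+x[ 3],  7 );   x[ 2] ^= R( x[ 1]+x[ 0],  9 );
+      x[ 3] ^= R( x[ 2]+x[ 1], 13 );   x[ 0] ^= R( x[ 3]+x[ 2], 18 );
+      x[ 6] ^= R( x[ 5]+x[ 4],  7 );   x[ 7] ^= R( x[ 6]+x[ 5],  9 );
+      x[ 4] ^= R( x[ 7]+x[ 6], 13 );   x[ 5] ^= R( x[ 4]+x[ 7], 18 );
+      x[11] ^= R( x[10]+x[ 9],  7 );   x[ 8] ^= R( x[11]+x[10],  9 );
+      x[ 9] ^= R( x[ 8]+x[11], 13 );   x[10] ^= R( x[ 9]+x[ 8], 18 );
+      x[12] ^= R( x[15]+x[14],  7 );   x[13] ^= R( x[12]+x[15],  9 );
+      x[14] ^= R( x[13]+x[12], 13 );   x[15] ^= R( x[14]+x[13], 18 );
+   }
+#undef R
+   for ( int i = 0; i < 16; i++ )  B[i] += x[i];
+}
+#endif
+// In the 4 way version below each lane of a __m128i holds the same word of
+// 4 independent states, so one round updates all 4 threads at once.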
+// Tested OK
+// Scryptn2 a little slower than pooler
+// Scrypt 2x faster than pooler
+// 4x memory usage
+// 4x32 interleaving
+static void xor_salsa8_4way( __m128i * const B, const __m128i * const C )
+{
+   __m128i x0 = B[ 0] = _mm_xor_si128( B[ 0], C[ 0] );
+   __m128i x1 = B[ 1] = _mm_xor_si128( B[ 1], C[ 1] );
+   __m128i x2 = B[ 2] = _mm_xor_si128( B[ 2], C[ 2] );
+   __m128i x3 = B[ 3] = _mm_xor_si128( B[ 3], C[ 3] );
+   __m128i x4 = B[ 4] = _mm_xor_si128( B[ 4], C[ 4] );
+   __m128i x5 = B[ 5] = _mm_xor_si128( B[ 5], C[ 5] );
+   __m128i x6 = B[ 6] = _mm_xor_si128( B[ 6], C[ 6] );
+   __m128i x7 = B[ 7] = _mm_xor_si128( B[ 7], C[ 7] );
+   __m128i x8 = B[ 8] = _mm_xor_si128( B[ 8], C[ 8] );
+   __m128i x9 = B[ 9] = _mm_xor_si128( B[ 9], C[ 9] );
+   __m128i xa = B[10] = _mm_xor_si128( B[10], C[10] );
+   __m128i xb = B[11] = _mm_xor_si128( B[11], C[11] );
+   __m128i xc = B[12] = _mm_xor_si128( B[12], C[12] );
+   __m128i xd = B[13] = _mm_xor_si128( B[13], C[13] );
+   __m128i xe = B[14] = _mm_xor_si128( B[14], C[14] );
+   __m128i xf = B[15] = _mm_xor_si128( B[15], C[15] );
+
+   #define ROL32   mm128_rol_32
+   #define ADD32   _mm_add_epi32
+   #define XOR     _mm_xor_si128
+
+   SALSA_8ROUNDS;
+
+   #undef ROL32
+   #undef ADD32
+   #undef XOR
+
+   B[ 0] = _mm_add_epi32( B[ 0], x0 );
+   B[ 1] = _mm_add_epi32( B[ 1], x1 );
+   B[ 2] = _mm_add_epi32( B[ 2], x2 );
+   B[ 3] = _mm_add_epi32( B[ 3], x3 );
+   B[ 4] = _mm_add_epi32( B[ 4], x4 );
+   B[ 5] = _mm_add_epi32( B[ 5], x5 );
+   B[ 6] = _mm_add_epi32( B[ 6], x6 );
+   B[ 7] = _mm_add_epi32( B[ 7], x7 );
+   B[ 8] = _mm_add_epi32( B[ 8], x8 );
+   B[ 9] = _mm_add_epi32( B[ 9], x9 );
+   B[10] = _mm_add_epi32( B[10], xa );
+   B[11] = _mm_add_epi32( B[11], xb );
+   B[12] = _mm_add_epi32( B[12], xc );
+   B[13] = _mm_add_epi32( B[13], xd );
+   B[14] = _mm_add_epi32( B[14], xe );
+   B[15] = _mm_add_epi32( B[15], xf );
+}
+
+void scrypt_core_4way( __m128i *X, __m128i *V, const uint32_t N )
+{
+   for ( int i = 0; i < N; i++ )
+   {
+      memcpy( &V[i * 32], X, 128*4 );
+      xor_salsa8_4way( &X[ 0], &X[16] );
+      xor_salsa8_4way( &X[16], &X[ 0] );
+   }
+   for ( int i = 0; i < N; i++ )
+   {
+      m128_ovly *vptr[4];
+      m128_ovly *x16 = (m128_ovly*)(&X[16]);
+
+      for ( int l = 0; l < 4; l++ )
+      {
+         uint32_t xl = (*x16).u32[l];
+         vptr[l] = (m128_ovly*)( &V[ 32 * ( xl & ( N-1 ) ) ] );
+      }
+
+      for ( int k = 0; k < 32; k++ )
+      {
+         m128_ovly v;
+         for ( int l = 0; l < 4; l++ )
+            v.u32[l] = ( *(vptr[ l ] + k ) ) .u32[l];
+         X[ k ] = _mm_xor_si128( X[ k ], v.m128 );
+      }
+
+      xor_salsa8_4way( &X[ 0], &X[16] );
+      xor_salsa8_4way( &X[16], &X[ 0] );
+   }
+}
+
+
+// Linear SIMD single thread. No memory increase, but some shuffling overhead
+// is required.
+
+// 4 way 32 bit interleaved single 32 bit thread, interleave while loading,
+// deinterleave while storing, do 2 way 128 & 4 way 128 parallel on top.
+//
+//    SALSA_2ROUNDS( {x0,x5,xa,xf}, {x4,x9,xe,x3}, {x8,xd,x2,x7}, {xc,x1,x6,xb})
+
+// Tested OK.
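+// The load-time shuffle below regroups the 4x4 salsa state along its
+// diagonals, X0 = {x0,x5,xa,xf} etc, so a quarter round updates four words
+// with whole-register ops.  Roughly (a sketch of what the SALSA_*_SIMD128
+// macros expand to, not the literal macro text):
+//
+//    X1 = XOR( X1, ROL32( ADD32( X0, X3 ),  7 ) );
+//    X2 = XOR( X2, ROL32( ADD32( X1, X0 ),  9 ) );
+//    X3 = XOR( X3, ROL32( ADD32( X2, X1 ), 13 ) );
+//    X0 = XOR( X0, ROL32( ADD32( X3, X2 ), 18 ) );
+//
+// with ROR_1X32 / SWAP_64 / ROL_1X32 realigning the lanes between the
+// column and row half rounds.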
+// No interleaving +static void salsa8_simd128( uint32_t *b, const uint32_t * const c) +{ + __m128i X0, X1, X2, X3; + __m128i *B = (__m128i*)b; + const __m128i *C = (const __m128i*)c; + + // define targets for macros used in round function template + #define ROL_1X32 mm128_shufll_32 + #define ROR_1X32 mm128_shuflr_32 + #define SWAP_64 mm128_swap_64 + #define ROL32 mm128_rol_32 + #define ADD32 _mm_add_epi32 + #define XOR _mm_xor_si128 + + // mix C into B then shuffle B into X + B[0] = _mm_xor_si128( B[0], C[0] ); + B[1] = _mm_xor_si128( B[1], C[1] ); + B[2] = _mm_xor_si128( B[2], C[2] ); + B[3] = _mm_xor_si128( B[3], C[3] ); + +#if defined(__SSE4_1__) + + __m128i Y0, Y1, Y2, Y3; + +#if defined(__AVX2__) + + Y0 = _mm_blend_epi32( B[1], B[0], 0x1 ); + X0 = _mm_blend_epi32( B[3], B[2], 0x4 ); + Y1 = _mm_blend_epi32( B[2], B[1], 0x1 ); + X1 = _mm_blend_epi32( B[0], B[3], 0x4 ); + Y2 = _mm_blend_epi32( B[3], B[2], 0x1 ); + X2 = _mm_blend_epi32( B[1], B[0], 0x4 ); + Y3 = _mm_blend_epi32( B[0], B[3], 0x1 ); + X3 = _mm_blend_epi32( B[2], B[1], 0x4 ); + X0 = _mm_blend_epi32( X0, Y0, 0x3); + X1 = _mm_blend_epi32( X1, Y1, 0x3 ); + X2 = _mm_blend_epi32( X2, Y2, 0x3 ); + X3 = _mm_blend_epi32( X3, Y3, 0x3 ); + +#else // SSE4_1 + + Y0 = _mm_blend_epi16( B[1], B[0], 0x03 ); + X0 = _mm_blend_epi16( B[3], B[2], 0x30 ); + Y1 = _mm_blend_epi16( B[2], B[1], 0x03 ); + X1 = _mm_blend_epi16( B[0], B[3], 0x30 ); + Y2 = _mm_blend_epi16( B[3], B[2], 0x03 ); + X2 = _mm_blend_epi16( B[1], B[0], 0x30 ); + Y3 = _mm_blend_epi16( B[0], B[3], 0x03 ); + X3 = _mm_blend_epi16( B[2], B[1], 0x30 ); + + X0 = _mm_blend_epi16( X0, Y0, 0x0f ); + X1 = _mm_blend_epi16( X1, Y1, 0x0f ); + X2 = _mm_blend_epi16( X2, Y2, 0x0f ); + X3 = _mm_blend_epi16( X3, Y3, 0x0f ); + +#endif // AVX2 else SSE4_1 + + SALSA_8ROUNDS_SIMD128; + +#if defined(__AVX2__) + + Y0 = _mm_blend_epi32( X0, X1, 0x8 ); + Y1 = _mm_blend_epi32( X0, X1, 0x1 ); + Y2 = _mm_blend_epi32( X0, X1, 0x2 ); + Y3 = _mm_blend_epi32( X0, X1, 0x4 ); + + Y0 = _mm_blend_epi32( Y0, X2, 0x4 ); + Y1 = _mm_blend_epi32( Y1, X2, 0x8 ); + Y2 = _mm_blend_epi32( Y2, X2, 0x1 ); + Y3 = _mm_blend_epi32( Y3, X2, 0x2 ); + + Y0 = _mm_blend_epi32( Y0, X3, 0x2 ); + Y1 = _mm_blend_epi32( Y1, X3, 0x4 ); + Y2 = _mm_blend_epi32( Y2, X3, 0x8 ); + Y3 = _mm_blend_epi32( Y3, X3, 0x1 ); + +#else // SSE4_1 + + Y0 = _mm_blend_epi16( X0, X1, 0xc0 ); + Y1 = _mm_blend_epi16( X0, X1, 0x03 ); + Y2 = _mm_blend_epi16( X0, X1, 0x0c ); + Y3 = _mm_blend_epi16( X0, X1, 0x30 ); + + Y0 = _mm_blend_epi16( Y0, X2, 0x30 ); + Y1 = _mm_blend_epi16( Y1, X2, 0xc0 ); + Y2 = _mm_blend_epi16( Y2, X2, 0x03 ); + Y3 = _mm_blend_epi16( Y3, X2, 0x0c ); + + Y0 = _mm_blend_epi16( Y0, X3, 0x0c ); + Y1 = _mm_blend_epi16( Y1, X3, 0x30 ); + Y2 = _mm_blend_epi16( Y2, X3, 0xc0 ); + Y3 = _mm_blend_epi16( Y3, X3, 0x03 ); + +#endif // AVX2 else SSE4_1 + + B[0] = _mm_add_epi32( B[0], Y0 ); + B[1] = _mm_add_epi32( B[1], Y1 ); + B[2] = _mm_add_epi32( B[2], Y2 ); + B[3] = _mm_add_epi32( B[3], Y3 ); + +#else // SSE2 + + m128_ovly y[4], z[4]; + + X0 = _mm_set_epi32( b[15], b[10], b[ 5], b[ 0] ); + X1 = _mm_set_epi32( b[ 3], b[14], b[ 9], b[ 4] ); + X2 = _mm_set_epi32( b[ 7], b[ 2], b[13], b[ 8] ); + X3 = _mm_set_epi32( b[11], b[ 6], b[ 1], b[12] ); + + SALSA_8ROUNDS_FINAL_SIMD128; + + // Final round doesn't shuffle data back to original input order, + // process it as is. 
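+   // The scalar scatter that follows obeys z[c].u32[ (c-r) & 3 ] = y[r].u32[c];
+   // it undoes both the load-time diagonal shuffle and the rotation left over
+   // from the final round, so z[c] holds words 4c..4c+3 in original order for
+   // the final add.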
+ // X0 is unchanged { xf, xa, x5, x0 } + // X1 is shuffled left 1 (rol_1x32) { xe, x9, x4, x3 } + // X2 is shuffled left 2 (swap_64) { xd, x8, x7, x2 } + // X3 is shuffled left 3 (ror_1x32) { xc, xb, x6, x1 } + + y[0].m128 = X0; + y[1].m128 = X1; + y[2].m128 = X2; + y[3].m128 = X3; + + z[0].u32[0] = y[0].u32[0]; + z[0].u32[3] = y[1].u32[0]; + z[0].u32[2] = y[2].u32[0]; + z[0].u32[1] = y[3].u32[0]; + + z[1].u32[1] = y[0].u32[1]; + z[1].u32[0] = y[1].u32[1]; + z[1].u32[3] = y[2].u32[1]; + z[1].u32[2] = y[3].u32[1]; + + z[2].u32[2] = y[0].u32[2]; + z[2].u32[1] = y[1].u32[2]; + z[2].u32[0] = y[2].u32[2]; + z[2].u32[3] = y[3].u32[2]; + + z[3].u32[3] = y[0].u32[3]; + z[3].u32[2] = y[1].u32[3]; + z[3].u32[1] = y[2].u32[3]; + z[3].u32[0] = y[3].u32[3]; + + B[0] = _mm_add_epi32( B[0], z[0].m128 ); + B[1] = _mm_add_epi32( B[1], z[1].m128 ); + B[2] = _mm_add_epi32( B[2], z[2].m128 ); + B[3] = _mm_add_epi32( B[3], z[3].m128 ); + +#endif + + #undef ROL_1X32 + #undef ROR_1X32 + #undef SWAP_64 + #undef ROL32 + #undef ADD32 + #undef XOR + +} + +void scrypt_core_simd128( uint32_t *X, uint32_t *V, const uint32_t N ) +{ + for ( int i = 0; i < N; i++ ) + { + for ( int k = 0; k < 8; k++ ) + _mm_stream_si128( (__m128i*)V + i*8 + k, casti_m128i( X, k ) ); + + salsa8_simd128( &X[ 0], &X[16] ); + salsa8_simd128( &X[16], &X[ 0] ); + } + for ( int i = 0; i < N; i++ ) + { + const int j = 32 * ( X[16] & ( N - 1 ) ); + for ( int k = 0; k < 32; k++ ) + X[k] ^= V[j + k]; + salsa8_simd128( &X[ 0], &X[16] ); + salsa8_simd128( &X[16], &X[ 0] ); + } +} + +// Double buffered, 2x memory usage +// No interleaving +static void salsa8_simd128_2buf( uint32_t * const ba, uint32_t * const bb, + const uint32_t * const ca, const uint32_t * const cb ) +{ + __m128i XA0, XA1, XA2, XA3, XB0, XB1, XB2, XB3; + __m128i *BA = (__m128i*)ba; + __m128i *BB = (__m128i*)bb; + const __m128i *CA = (const __m128i*)ca; + const __m128i *CB = (const __m128i*)cb; + + // define targets for macros used in round function template + #define ROL_1X32 mm128_shufll_32 + #define ROR_1X32 mm128_shuflr_32 + #define SWAP_64 mm128_swap_64 + #define ROL32 mm128_rol_32 + #define ADD32 _mm_add_epi32 + #define XOR _mm_xor_si128 + #define TYPE __m128i + + // mix C into B then shuffle B into X + BA[0] = _mm_xor_si128( BA[0], CA[0] ); + BB[0] = _mm_xor_si128( BB[0], CB[0] ); + BA[1] = _mm_xor_si128( BA[1], CA[1] ); + BB[1] = _mm_xor_si128( BB[1], CB[1] ); + BA[2] = _mm_xor_si128( BA[2], CA[2] ); + BB[2] = _mm_xor_si128( BB[2], CB[2] ); + BA[3] = _mm_xor_si128( BA[3], CA[3] ); + BB[3] = _mm_xor_si128( BB[3], CB[3] ); + +#if defined(__SSE4_1__) + + __m128i YA0, YA1, YA2, YA3, YB0, YB1, YB2, YB3; + +#if defined(__AVX2__) + + YA0 = _mm_blend_epi32( BA[1], BA[0], 0x1 ); + YB0 = _mm_blend_epi32( BB[1], BB[0], 0x1 ); + XA0 = _mm_blend_epi32( BA[3], BA[2], 0x4 ); + XB0 = _mm_blend_epi32( BB[3], BB[2], 0x4 ); + + YA1 = _mm_blend_epi32( BA[2], BA[1], 0x1 ); + YB1 = _mm_blend_epi32( BB[2], BB[1], 0x1 ); + XA1 = _mm_blend_epi32( BA[0], BA[3], 0x4 ); + XB1 = _mm_blend_epi32( BB[0], BB[3], 0x4 ); + + YA2 = _mm_blend_epi32( BA[3], BA[2], 0x1 ); + YB2 = _mm_blend_epi32( BB[3], BB[2], 0x1 ); + XA2 = _mm_blend_epi32( BA[1], BA[0], 0x4 ); + XB2 = _mm_blend_epi32( BB[1], BB[0], 0x4 ); + + YA3 = _mm_blend_epi32( BA[0], BA[3], 0x1 ); + YB3 = _mm_blend_epi32( BB[0], BB[3], 0x1 ); + XA3 = _mm_blend_epi32( BA[2], BA[1], 0x4 ); + XB3 = _mm_blend_epi32( BB[2], BB[1], 0x4 ); + + XA0 = _mm_blend_epi32( XA0, YA0, 0x3 ); + XB0 = _mm_blend_epi32( XB0, YB0, 0x3 ); + + XA1 = _mm_blend_epi32( XA1, YA1, 0x3 
); + XB1 = _mm_blend_epi32( XB1, YB1, 0x3 ); + + XA2 = _mm_blend_epi32( XA2, YA2, 0x3 ); + XB2 = _mm_blend_epi32( XB2, YB2, 0x3 ); + + XA3 = _mm_blend_epi32( XA3, YA3, 0x3 ); + XB3 = _mm_blend_epi32( XB3, YB3, 0x3 ); + +#else // SSE4_1 + + YA0 = _mm_blend_epi16( BA[1], BA[0], 0x03 ); + YB0 = _mm_blend_epi16( BB[1], BB[0], 0x03 ); + XA0 = _mm_blend_epi16( BA[3], BA[2], 0x30 ); + XB0 = _mm_blend_epi16( BB[3], BB[2], 0x30 ); + + YA1 = _mm_blend_epi16( BA[2], BA[1], 0x03 ); + YB1 = _mm_blend_epi16( BB[2], BB[1], 0x03 ); + XA1 = _mm_blend_epi16( BA[0], BA[3], 0x30 ); + XB1 = _mm_blend_epi16( BB[0], BB[3], 0x30 ); + + YA2 = _mm_blend_epi16( BA[3], BA[2], 0x03 ); + YB2 = _mm_blend_epi16( BB[3], BB[2], 0x03 ); + XA2 = _mm_blend_epi16( BA[1], BA[0], 0x30 ); + XB2 = _mm_blend_epi16( BB[1], BB[0], 0x30 ); + + YA3 = _mm_blend_epi16( BA[0], BA[3], 0x03 ); + YB3 = _mm_blend_epi16( BB[0], BB[3], 0x03 ); + XA3 = _mm_blend_epi16( BA[2], BA[1], 0x30 ); + XB3 = _mm_blend_epi16( BB[2], BB[1], 0x30 ); + + XA0 = _mm_blend_epi16( XA0, YA0, 0x0f ); + XB0 = _mm_blend_epi16( XB0, YB0, 0x0f ); + + XA1 = _mm_blend_epi16( XA1, YA1, 0x0f ); + XB1 = _mm_blend_epi16( XB1, YB1, 0x0f ); + + XA2 = _mm_blend_epi16( XA2, YA2, 0x0f ); + XB2 = _mm_blend_epi16( XB2, YB2, 0x0f ); + + XA3 = _mm_blend_epi16( XA3, YA3, 0x0f ); + XB3 = _mm_blend_epi16( XB3, YB3, 0x0f ); + +#endif // AVX2 else SSE4_1 + + SALSA_8ROUNDS_SIMD128_2BUF; + +#if defined(__AVX2__) + + YA0 = _mm_blend_epi32( XA0, XA1, 0x8 ); + YB0 = _mm_blend_epi32( XB0, XB1, 0x8 ); + YA1 = _mm_blend_epi32( XA0, XA1, 0x1 ); + YB1 = _mm_blend_epi32( XB0, XB1, 0x1 ); + YA2 = _mm_blend_epi32( XA0, XA1, 0x2 ); + YB2 = _mm_blend_epi32( XB0, XB1, 0x2 ); + YA3 = _mm_blend_epi32( XA0, XA1, 0x4 ); + YB3 = _mm_blend_epi32( XB0, XB1, 0x4 ); + + YA0 = _mm_blend_epi32( YA0, XA2, 0x4 ); + YB0 = _mm_blend_epi32( YB0, XB2, 0x4 ); + YA1 = _mm_blend_epi32( YA1, XA2, 0x8 ); + YB1 = _mm_blend_epi32( YB1, XB2, 0x8 ); + YA2 = _mm_blend_epi32( YA2, XA2, 0x1 ); + YB2 = _mm_blend_epi32( YB2, XB2, 0x1 ); + YA3 = _mm_blend_epi32( YA3, XA2, 0x2 ); + YB3 = _mm_blend_epi32( YB3, XB2, 0x2 ); + + YA0 = _mm_blend_epi32( YA0, XA3, 0x2 ); + YB0 = _mm_blend_epi32( YB0, XB3, 0x2 ); + YA1 = _mm_blend_epi32( YA1, XA3, 0x4 ); + YB1 = _mm_blend_epi32( YB1, XB3, 0x4 ); + YA2 = _mm_blend_epi32( YA2, XA3, 0x8 ); + YB2 = _mm_blend_epi32( YB2, XB3, 0x8 ); + YA3 = _mm_blend_epi32( YA3, XA3, 0x1 ); + YB3 = _mm_blend_epi32( YB3, XB3, 0x1 ); + +#else // SSE4_1 + + YA0 = _mm_blend_epi16( XA0, XA1, 0xc0 ); + YB0 = _mm_blend_epi16( XB0, XB1, 0xc0 ); + YA1 = _mm_blend_epi16( XA0, XA1, 0x03 ); + YB1 = _mm_blend_epi16( XB0, XB1, 0x03 ); + YA2 = _mm_blend_epi16( XA0, XA1, 0x0c ); + YB2 = _mm_blend_epi16( XB0, XB1, 0x0c ); + YA3 = _mm_blend_epi16( XA0, XA1, 0x30 ); + YB3 = _mm_blend_epi16( XB0, XB1, 0x30 ); + + YA0 = _mm_blend_epi16( YA0, XA2, 0x30 ); + YB0 = _mm_blend_epi16( YB0, XB2, 0x30 ); + YA1 = _mm_blend_epi16( YA1, XA2, 0xc0 ); + YB1 = _mm_blend_epi16( YB1, XB2, 0xc0 ); + YA2 = _mm_blend_epi16( YA2, XA2, 0x03 ); + YB2 = _mm_blend_epi16( YB2, XB2, 0x03 ); + YA3 = _mm_blend_epi16( YA3, XA2, 0x0c ); + YB3 = _mm_blend_epi16( YB3, XB2, 0x0c ); + + YA0 = _mm_blend_epi16( YA0, XA3, 0x0c ); + YB0 = _mm_blend_epi16( YB0, XB3, 0x0c ); + YA1 = _mm_blend_epi16( YA1, XA3, 0x30 ); + YB1 = _mm_blend_epi16( YB1, XB3, 0x30 ); + YA2 = _mm_blend_epi16( YA2, XA3, 0xc0 ); + YB2 = _mm_blend_epi16( YB2, XB3, 0xc0 ); + YA3 = _mm_blend_epi16( YA3, XA3, 0x03 ); + YB3 = _mm_blend_epi16( YB3, XB3, 0x03 ); + +#endif // AVX2 else SSE4_1 + + BA[0] = 
_mm_add_epi32( BA[0], YA0 ); + BB[0] = _mm_add_epi32( BB[0], YB0 ); + BA[1] = _mm_add_epi32( BA[1], YA1 ); + BB[1] = _mm_add_epi32( BB[1], YB1 ); + BA[2] = _mm_add_epi32( BA[2], YA2 ); + BB[2] = _mm_add_epi32( BB[2], YB2 ); + BA[3] = _mm_add_epi32( BA[3], YA3 ); + BB[3] = _mm_add_epi32( BB[3], YB3 ); + +#else // SSE2 + + m128_ovly ya[4], za[4], yb[4], zb[4]; + + XA0 = _mm_set_epi32( ba[15], ba[10], ba[ 5], ba[ 0] ); + XB0 = _mm_set_epi32( bb[15], bb[10], bb[ 5], bb[ 0] ); + XA1 = _mm_set_epi32( ba[ 3], ba[14], ba[ 9], ba[ 4] ); + XB1 = _mm_set_epi32( bb[ 3], bb[14], bb[ 9], bb[ 4] ); + XA2 = _mm_set_epi32( ba[ 7], ba[ 2], ba[13], ba[ 8] ); + XB2 = _mm_set_epi32( bb[ 7], bb[ 2], bb[13], bb[ 8] ); + XA3 = _mm_set_epi32( ba[11], ba[ 6], ba[ 1], ba[12] ); + XB3 = _mm_set_epi32( bb[11], bb[ 6], bb[ 1], bb[12] ); + + SALSA_8ROUNDS_FINAL_SIMD128_2BUF; + + // Final round doesn't shuffle data back to original input order, + // process it as is. + + ya[0].m128 = XA0; + yb[0].m128 = XB0; + ya[1].m128 = XA1; + yb[1].m128 = XB1; + ya[2].m128 = XA2; + yb[2].m128 = XB2; + ya[3].m128 = XA3; + yb[3].m128 = XB3; + + za[0].u32[0] = ya[0].u32[0]; + zb[0].u32[0] = yb[0].u32[0]; + za[0].u32[3] = ya[1].u32[0]; + zb[0].u32[3] = yb[1].u32[0]; + za[0].u32[2] = ya[2].u32[0]; + zb[0].u32[2] = yb[2].u32[0]; + za[0].u32[1] = ya[3].u32[0]; + zb[0].u32[1] = yb[3].u32[0]; + + za[1].u32[1] = ya[0].u32[1]; + zb[1].u32[1] = yb[0].u32[1]; + za[1].u32[0] = ya[1].u32[1]; + zb[1].u32[0] = yb[1].u32[1]; + za[1].u32[3] = ya[2].u32[1]; + zb[1].u32[3] = yb[2].u32[1]; + za[1].u32[2] = ya[3].u32[1]; + zb[1].u32[2] = yb[3].u32[1]; + + za[2].u32[2] = ya[0].u32[2]; + zb[2].u32[2] = yb[0].u32[2]; + za[2].u32[1] = ya[1].u32[2]; + zb[2].u32[1] = yb[1].u32[2]; + za[2].u32[0] = ya[2].u32[2]; + zb[2].u32[0] = yb[2].u32[2]; + za[2].u32[3] = ya[3].u32[2]; + zb[2].u32[3] = yb[3].u32[2]; + + za[3].u32[3] = ya[0].u32[3]; + zb[3].u32[3] = yb[0].u32[3]; + za[3].u32[2] = ya[1].u32[3]; + zb[3].u32[2] = yb[1].u32[3]; + za[3].u32[1] = ya[2].u32[3]; + zb[3].u32[1] = yb[2].u32[3]; + za[3].u32[0] = ya[3].u32[3]; + zb[3].u32[0] = yb[3].u32[3]; + + BA[0] = _mm_add_epi32( BA[0], za[0].m128 ); + BB[0] = _mm_add_epi32( BB[0], zb[0].m128 ); + BA[1] = _mm_add_epi32( BA[1], za[1].m128 ); + BB[1] = _mm_add_epi32( BB[1], zb[1].m128 ); + BA[2] = _mm_add_epi32( BA[2], za[2].m128 ); + BB[2] = _mm_add_epi32( BB[2], zb[2].m128 ); + BA[3] = _mm_add_epi32( BA[3], za[3].m128 ); + BB[3] = _mm_add_epi32( BB[3], zb[3].m128 ); + +#endif + + #undef ROL_1X32 + #undef ROR_1X32 + #undef SWAP_64 + #undef ROL32 + #undef ADD32 + #undef XOR + #undef TYPE +} + + +// X: 2 sequential buffers +// V: 2 sequential buffers interleaved by the size of N +// interleaved buffers { v00, v01, v10, v11, v20... 
} +// +void scrypt_core_simd128_2buf( uint32_t *X, uint32_t *V, const uint32_t N ) +{ + uint32_t *X0 = X; + uint32_t *X1 = X+32; + uint32_t *V0 = V; + uint32_t *V1 = V + 32*N; + + for ( int i = 0; i < N; i++ ) + { + #if defined(__AVX2__) + + for ( int k = 0; k < 4; k++ ) + { + _mm256_stream_si256( (__m256i*)V0 + i*4 + k, casti_m256i( X0, k ) ); + _mm256_stream_si256( (__m256i*)V1 + i*4 + k, casti_m256i( X1, k ) ); + } + + #else + + memcpy( &V0[ i*32 ], X0, 128 ); + memcpy( &V1[ i*32 ], X1, 128 ); + + #endif + + salsa8_simd128_2buf( &X0[ 0], &X1[ 0], &X0[16], &X1[16] ); + salsa8_simd128_2buf( &X0[16], &X1[16], &X0[ 0], &X1[ 0] ); + } + + for ( int i = 0; i < N; i++ ) + { + #if defined(__AVX2__) + + const int j0 = 4 * ( X0[16] & ( N-1 ) ); + const int j1 = 4 * ( X1[16] & ( N-1 ) ); + for ( int k = 0; k < 4; k++ ) + { + const __m256i v0 = _mm256_stream_load_si256( ( (__m256i*)V0 ) +j0+k ); + const __m256i v1 = _mm256_stream_load_si256( ( (__m256i*)V1 ) +j1+k ); +// const __m256i v0 = _mm256_load_si256( ( (__m256i*)V0 ) +j0+k ); +// const __m256i v1 = _mm256_load_si256( ( (__m256i*)V1 ) +j1+k ); + casti_m256i( X0, k ) = _mm256_xor_si256( casti_m256i( X0, k ), v0 ); + casti_m256i( X1, k ) = _mm256_xor_si256( casti_m256i( X1, k ), v1 ); + } + + #else + + const int j0 = 8 * ( X0[16] & ( N-1 ) ); + const int j1 = 8 * ( X1[16] & ( N-1 ) ); + for ( int k = 0; k < 8; k++ ) + { + const __m128i v0 = _mm_load_si128( ( (__m128i*)V0 ) +j0+k ); + const __m128i v1 = _mm_load_si128( ( (__m128i*)V1 ) +j1+k ); + casti_m128i( X0, k ) = _mm_xor_si128( casti_m128i( X0, k ), v0 ); + casti_m128i( X1, k ) = _mm_xor_si128( casti_m128i( X1, k ), v1 ); + } + + #endif + +/* + const int j0 = 16 * ( X0[16] & ( N - 1 ) ); + const int j1 = 16 * ( X1[16] & ( N - 1 ) ); + + for ( int k = 0; k < 16; k++ ) + { + const uint64_t v0 = ( (uint64_t*)V0 )[ j0+k ]; + const uint64_t v1 = ( (uint64_t*)V1 )[ j1+k ]; + ( (uint64_t*)X0 )[k] ^= v0; + ( (uint64_t*)X1 )[k] ^= v1; + } +*/ + +/* + const int j0 = 32 * ( X0[16] & ( N-1 ) ); + const int j1 = 32 * ( X1[16] & ( N-1 ) ); + + for ( int k = 0; k < 32; k++ ) + { + const uint32_t v0 = V0[ j0+k ]; + const uint32_t v1 = V1[ j1+k ]; + X0[k] ^= v0; + X1[k] ^= v1; + } +*/ + + salsa8_simd128_2buf( &X0[ 0], &X1[ 0], &X0[16], &X1[16] ); + salsa8_simd128_2buf( &X0[16], &X1[16], &X0[ 0], &X1[ 0] ); + } +} + + +// Triple buffered, 3x memory usage +// No interleaving +static void salsa8_simd128_3buf( uint32_t *ba, uint32_t *bb, uint32_t *bc, + const uint32_t *ca, const uint32_t *cb, const uint32_t *cc ) +{ + __m128i XA0, XA1, XA2, XA3, XB0, XB1, XB2, XB3, + XC0, XC1, XC2, XC3; + __m128i *BA = (__m128i*)ba; + __m128i *BB = (__m128i*)bb; + __m128i *BC = (__m128i*)bc; + const __m128i *CA = (const __m128i*)ca; + const __m128i *CB = (const __m128i*)cb; + const __m128i *CC = (const __m128i*)cc; + + // define targets for macros used in round function template + #define ROL_1X32 mm128_shufll_32 + #define ROR_1X32 mm128_shuflr_32 + #define SWAP_64 mm128_swap_64 + #define ROL32 mm128_rol_32 + #define ADD32 _mm_add_epi32 + #define XOR _mm_xor_si128 + #define TYPE __m128i + + // mix C into B then shuffle B into X + BA[0] = _mm_xor_si128( BA[0], CA[0] ); + BB[0] = _mm_xor_si128( BB[0], CB[0] ); + BC[0] = _mm_xor_si128( BC[0], CC[0] ); + BA[1] = _mm_xor_si128( BA[1], CA[1] ); + BB[1] = _mm_xor_si128( BB[1], CB[1] ); + BC[1] = _mm_xor_si128( BC[1], CC[1] ); + BA[2] = _mm_xor_si128( BA[2], CA[2] ); + BB[2] = _mm_xor_si128( BB[2], CB[2] ); + BC[2] = _mm_xor_si128( BC[2], CC[2] ); + BA[3] = _mm_xor_si128( BA[3], 
CA[3] ); + BB[3] = _mm_xor_si128( BB[3], CB[3] ); + BC[3] = _mm_xor_si128( BC[3], CC[3] ); + +#if defined(__SSE4_1__) + + __m128i YA0, YA1, YA2, YA3, YB0, YB1, YB2, YB3, YC0, YC1, YC2, YC3; + +#if defined(__AVX2__) + + YA0 = _mm_blend_epi32( BA[1], BA[0], 0x1 ); + YB0 = _mm_blend_epi32( BB[1], BB[0], 0x1 ); + YC0 = _mm_blend_epi32( BC[1], BC[0], 0x1 ); + XA0 = _mm_blend_epi32( BA[3], BA[2], 0x4 ); + XB0 = _mm_blend_epi32( BB[3], BB[2], 0x4 ); + XC0 = _mm_blend_epi32( BC[3], BC[2], 0x4 ); + + YA1 = _mm_blend_epi32( BA[2], BA[1], 0x1 ); + YB1 = _mm_blend_epi32( BB[2], BB[1], 0x1 ); + YC1 = _mm_blend_epi32( BC[2], BC[1], 0x1 ); + XA1 = _mm_blend_epi32( BA[0], BA[3], 0x4 ); + XB1 = _mm_blend_epi32( BB[0], BB[3], 0x4 ); + XC1 = _mm_blend_epi32( BC[0], BC[3], 0x4 ); + + YA2 = _mm_blend_epi32( BA[3], BA[2], 0x1 ); + YB2 = _mm_blend_epi32( BB[3], BB[2], 0x1 ); + YC2 = _mm_blend_epi32( BC[3], BC[2], 0x1 ); + XA2 = _mm_blend_epi32( BA[1], BA[0], 0x4 ); + XB2 = _mm_blend_epi32( BB[1], BB[0], 0x4 ); + XC2 = _mm_blend_epi32( BC[1], BC[0], 0x4 ); + + YA3 = _mm_blend_epi32( BA[0], BA[3], 0x1 ); + YB3 = _mm_blend_epi32( BB[0], BB[3], 0x1 ); + YC3 = _mm_blend_epi32( BC[0], BC[3], 0x1 ); + XA3 = _mm_blend_epi32( BA[2], BA[1], 0x4 ); + XB3 = _mm_blend_epi32( BB[2], BB[1], 0x4 ); + XC3 = _mm_blend_epi32( BC[2], BC[1], 0x4 ); + + XA0 = _mm_blend_epi32( XA0, YA0, 0x3 ); + XB0 = _mm_blend_epi32( XB0, YB0, 0x3 ); + XC0 = _mm_blend_epi32( XC0, YC0, 0x3 ); + + XA1 = _mm_blend_epi32( XA1, YA1, 0x3 ); + XB1 = _mm_blend_epi32( XB1, YB1, 0x3 ); + XC1 = _mm_blend_epi32( XC1, YC1, 0x3 ); + + XA2 = _mm_blend_epi32( XA2, YA2, 0x3 ); + XB2 = _mm_blend_epi32( XB2, YB2, 0x3 ); + XC2 = _mm_blend_epi32( XC2, YC2, 0x3 ); + + XA3 = _mm_blend_epi32( XA3, YA3, 0x3 ); + XB3 = _mm_blend_epi32( XB3, YB3, 0x3 ); + XC3 = _mm_blend_epi32( XC3, YC3, 0x3 ); + +#else // SSE4_1 + + YA0 = _mm_blend_epi16( BA[1], BA[0], 0x03 ); + YB0 = _mm_blend_epi16( BB[1], BB[0], 0x03 ); + YC0 = _mm_blend_epi16( BC[1], BC[0], 0x03 ); + XA0 = _mm_blend_epi16( BA[3], BA[2], 0x30 ); + XB0 = _mm_blend_epi16( BB[3], BB[2], 0x30 ); + XC0 = _mm_blend_epi16( BC[3], BC[2], 0x30 ); + XA0 = _mm_blend_epi16( XA0, YA0, 0x0f ); + XB0 = _mm_blend_epi16( XB0, YB0, 0x0f ); + XC0 = _mm_blend_epi16( XC0, YC0, 0x0f ); + + YA1 = _mm_blend_epi16( BA[2], BA[1], 0x03 ); + YB1 = _mm_blend_epi16( BB[2], BB[1], 0x03 ); + YC1 = _mm_blend_epi16( BC[2], BC[1], 0x03 ); + XA1 = _mm_blend_epi16( BA[0], BA[3], 0x30 ); + XB1 = _mm_blend_epi16( BB[0], BB[3], 0x30 ); + XC1 = _mm_blend_epi16( BC[0], BC[3], 0x30 ); + XA1 = _mm_blend_epi16( XA1, YA1, 0x0f ); + XB1 = _mm_blend_epi16( XB1, YB1, 0x0f ); + XC1 = _mm_blend_epi16( XC1, YC1, 0x0f ); + + YA2 = _mm_blend_epi16( BA[3], BA[2], 0x03 ); + YB2 = _mm_blend_epi16( BB[3], BB[2], 0x03 ); + YC2 = _mm_blend_epi16( BC[3], BC[2], 0x03 ); + XA2 = _mm_blend_epi16( BA[1], BA[0], 0x30 ); + XB2 = _mm_blend_epi16( BB[1], BB[0], 0x30 ); + XC2 = _mm_blend_epi16( BC[1], BC[0], 0x30 ); + XA2 = _mm_blend_epi16( XA2, YA2, 0x0f ); + XB2 = _mm_blend_epi16( XB2, YB2, 0x0f ); + XC2 = _mm_blend_epi16( XC2, YC2, 0x0f ); + + YA3 = _mm_blend_epi16( BA[0], BA[3], 0x03 ); + YB3 = _mm_blend_epi16( BB[0], BB[3], 0x03 ); + YC3 = _mm_blend_epi16( BC[0], BC[3], 0x03 ); + XA3 = _mm_blend_epi16( BA[2], BA[1], 0x30 ); + XB3 = _mm_blend_epi16( BB[2], BB[1], 0x30 ); + XC3 = _mm_blend_epi16( BC[2], BC[1], 0x30 ); + XA3 = _mm_blend_epi16( XA3, YA3, 0x0f ); + XB3 = _mm_blend_epi16( XB3, YB3, 0x0f ); + XC3 = _mm_blend_epi16( XC3, YC3, 0x0f ); + +#endif // AVX2 else SSE3_1 + + 
SALSA_8ROUNDS_SIMD128_3BUF; + +#if defined(__AVX2__) + + YA0 = _mm_blend_epi32( XA0, XA1, 0x8 ); + YB0 = _mm_blend_epi32( XB0, XB1, 0x8 ); + YC0 = _mm_blend_epi32( XC0, XC1, 0x8 ); + YA1 = _mm_blend_epi32( XA0, XA1, 0x1 ); + YB1 = _mm_blend_epi32( XB0, XB1, 0x1 ); + YC1 = _mm_blend_epi32( XC0, XC1, 0x1 ); + YA2 = _mm_blend_epi32( XA0, XA1, 0x2 ); + YB2 = _mm_blend_epi32( XB0, XB1, 0x2 ); + YC2 = _mm_blend_epi32( XC0, XC1, 0x2 ); + YA3 = _mm_blend_epi32( XA0, XA1, 0x4 ); + YB3 = _mm_blend_epi32( XB0, XB1, 0x4 ); + YC3 = _mm_blend_epi32( XC0, XC1, 0x4 ); + + YA0 = _mm_blend_epi32( YA0, XA2, 0x4 ); + YB0 = _mm_blend_epi32( YB0, XB2, 0x4 ); + YC0 = _mm_blend_epi32( YC0, XC2, 0x4 ); + YA1 = _mm_blend_epi32( YA1, XA2, 0x8 ); + YB1 = _mm_blend_epi32( YB1, XB2, 0x8 ); + YC1 = _mm_blend_epi32( YC1, XC2, 0x8 ); + YA2 = _mm_blend_epi32( YA2, XA2, 0x1 ); + YB2 = _mm_blend_epi32( YB2, XB2, 0x1 ); + YC2 = _mm_blend_epi32( YC2, XC2, 0x1 ); + YA3 = _mm_blend_epi32( YA3, XA2, 0x2 ); + YB3 = _mm_blend_epi32( YB3, XB2, 0x2 ); + YC3 = _mm_blend_epi32( YC3, XC2, 0x2 ); + + YA0 = _mm_blend_epi32( YA0, XA3, 0x2 ); + YB0 = _mm_blend_epi32( YB0, XB3, 0x2 ); + YC0 = _mm_blend_epi32( YC0, XC3, 0x2 ); + YA1 = _mm_blend_epi32( YA1, XA3, 0x4 ); + YB1 = _mm_blend_epi32( YB1, XB3, 0x4 ); + YC1 = _mm_blend_epi32( YC1, XC3, 0x4 ); + YA2 = _mm_blend_epi32( YA2, XA3, 0x8 ); + YB2 = _mm_blend_epi32( YB2, XB3, 0x8 ); + YC2 = _mm_blend_epi32( YC2, XC3, 0x8 ); + YA3 = _mm_blend_epi32( YA3, XA3, 0x1 ); + YB3 = _mm_blend_epi32( YB3, XB3, 0x1 ); + YC3 = _mm_blend_epi32( YC3, XC3, 0x1 ); + +#else // SSE4_1 + + YA0 = _mm_blend_epi16( XA0, XA1, 0xc0 ); + YB0 = _mm_blend_epi16( XB0, XB1, 0xc0 ); + YC0 = _mm_blend_epi16( XC0, XC1, 0xc0 ); + YA1 = _mm_blend_epi16( XA0, XA1, 0x03 ); + YB1 = _mm_blend_epi16( XB0, XB1, 0x03 ); + YC1 = _mm_blend_epi16( XC0, XC1, 0x03 ); + YA2 = _mm_blend_epi16( XA0, XA1, 0x0c ); + YB2 = _mm_blend_epi16( XB0, XB1, 0x0c ); + YC2 = _mm_blend_epi16( XC0, XC1, 0x0c ); + YA3 = _mm_blend_epi16( XA0, XA1, 0x30 ); + YB3 = _mm_blend_epi16( XB0, XB1, 0x30 ); + YC3 = _mm_blend_epi16( XC0, XC1, 0x30 ); + + YA0 = _mm_blend_epi16( YA0, XA2, 0x30 ); + YB0 = _mm_blend_epi16( YB0, XB2, 0x30 ); + YC0 = _mm_blend_epi16( YC0, XC2, 0x30 ); + YA1 = _mm_blend_epi16( YA1, XA2, 0xc0 ); + YB1 = _mm_blend_epi16( YB1, XB2, 0xc0 ); + YC1 = _mm_blend_epi16( YC1, XC2, 0xc0 ); + YA2 = _mm_blend_epi16( YA2, XA2, 0x03 ); + YB2 = _mm_blend_epi16( YB2, XB2, 0x03 ); + YC2 = _mm_blend_epi16( YC2, XC2, 0x03 ); + YA3 = _mm_blend_epi16( YA3, XA2, 0x0c ); + YB3 = _mm_blend_epi16( YB3, XB2, 0x0c ); + YC3 = _mm_blend_epi16( YC3, XC2, 0x0c ); + + YA0 = _mm_blend_epi16( YA0, XA3, 0x0c ); + YB0 = _mm_blend_epi16( YB0, XB3, 0x0c ); + YC0 = _mm_blend_epi16( YC0, XC3, 0x0c ); + YA1 = _mm_blend_epi16( YA1, XA3, 0x30 ); + YB1 = _mm_blend_epi16( YB1, XB3, 0x30 ); + YC1 = _mm_blend_epi16( YC1, XC3, 0x30 ); + YA2 = _mm_blend_epi16( YA2, XA3, 0xc0 ); + YB2 = _mm_blend_epi16( YB2, XB3, 0xc0 ); + YC2 = _mm_blend_epi16( YC2, XC3, 0xc0 ); + YA3 = _mm_blend_epi16( YA3, XA3, 0x03 ); + YB3 = _mm_blend_epi16( YB3, XB3, 0x03 ); + YC3 = _mm_blend_epi16( YC3, XC3, 0x03 ); + +#endif // AVX2 else SSE4_1 + + BA[0] = _mm_add_epi32( BA[0], YA0 ); + BB[0] = _mm_add_epi32( BB[0], YB0 ); + BC[0] = _mm_add_epi32( BC[0], YC0 ); + BA[1] = _mm_add_epi32( BA[1], YA1 ); + BB[1] = _mm_add_epi32( BB[1], YB1 ); + BC[1] = _mm_add_epi32( BC[1], YC1 ); + BA[2] = _mm_add_epi32( BA[2], YA2 ); + BB[2] = _mm_add_epi32( BB[2], YB2 ); + BC[2] = _mm_add_epi32( BC[2], YC2 ); + BA[3] = _mm_add_epi32( 
BA[3], YA3 ); + BB[3] = _mm_add_epi32( BB[3], YB3 ); + BC[3] = _mm_add_epi32( BC[3], YC3 ); + +#else // SSE2 + + m128_ovly ya[4], yb[4], za[4], zb[4], yc[4], zc[4]; + + XA0 = _mm_set_epi32( ba[15], ba[10], ba[ 5], ba[ 0] ); + XB0 = _mm_set_epi32( bb[15], bb[10], bb[ 5], bb[ 0] ); + XC0 = _mm_set_epi32( bc[15], bc[10], bc[ 5], bc[ 0] ); + XA1 = _mm_set_epi32( ba[ 3], ba[14], ba[ 9], ba[ 4] ); + XB1 = _mm_set_epi32( bb[ 3], bb[14], bb[ 9], bb[ 4] ); + XC1 = _mm_set_epi32( bc[ 3], bc[14], bc[ 9], bc[ 4] ); + XA2 = _mm_set_epi32( ba[ 7], ba[ 2], ba[13], ba[ 8] ); + XB2 = _mm_set_epi32( bb[ 7], bb[ 2], bb[13], bb[ 8] ); + XC2 = _mm_set_epi32( bc[ 7], bc[ 2], bc[13], bc[ 8] ); + XA3 = _mm_set_epi32( ba[11], ba[ 6], ba[ 1], ba[12] ); + XB3 = _mm_set_epi32( bb[11], bb[ 6], bb[ 1], bb[12] ); + XC3 = _mm_set_epi32( bc[11], bc[ 6], bc[ 1], bc[12] ); + + SALSA_8ROUNDS_FINAL_SIMD128_3BUF; + + // Final round doesn't shuffle data back to original input order, + // process it as is. + + ya[0].m128 = XA0; + yb[0].m128 = XB0; + yc[0].m128 = XC0; + ya[1].m128 = XA1; + yb[1].m128 = XB1; + yc[1].m128 = XC1; + ya[2].m128 = XA2; + yb[2].m128 = XB2; + yc[2].m128 = XC2; + ya[3].m128 = XA3; + yb[3].m128 = XB3; + yc[3].m128 = XC3; + + za[0].u32[0] = ya[0].u32[0]; + zb[0].u32[0] = yb[0].u32[0]; + zc[0].u32[0] = yc[0].u32[0]; + za[0].u32[3] = ya[1].u32[0]; + zb[0].u32[3] = yb[1].u32[0]; + zc[0].u32[3] = yc[1].u32[0]; + za[0].u32[2] = ya[2].u32[0]; + zb[0].u32[2] = yb[2].u32[0]; + zc[0].u32[2] = yc[2].u32[0]; + za[0].u32[1] = ya[3].u32[0]; + zb[0].u32[1] = yb[3].u32[0]; + zc[0].u32[1] = yc[3].u32[0]; + + za[1].u32[1] = ya[0].u32[1]; + zb[1].u32[1] = yb[0].u32[1]; + zc[1].u32[1] = yc[0].u32[1]; + za[1].u32[0] = ya[1].u32[1]; + zb[1].u32[0] = yb[1].u32[1]; + zc[1].u32[0] = yc[1].u32[1]; + za[1].u32[3] = ya[2].u32[1]; + zb[1].u32[3] = yb[2].u32[1]; + zc[1].u32[3] = yc[2].u32[1]; + za[1].u32[2] = ya[3].u32[1]; + zb[1].u32[2] = yb[3].u32[1]; + zc[1].u32[2] = yc[3].u32[1]; + + za[2].u32[2] = ya[0].u32[2]; + zb[2].u32[2] = yb[0].u32[2]; + zc[2].u32[2] = yc[0].u32[2]; + za[2].u32[1] = ya[1].u32[2]; + zb[2].u32[1] = yb[1].u32[2]; + zc[2].u32[1] = yc[1].u32[2]; + za[2].u32[0] = ya[2].u32[2]; + zb[2].u32[0] = yb[2].u32[2]; + zc[2].u32[0] = yc[2].u32[2]; + za[2].u32[3] = ya[3].u32[2]; + zb[2].u32[3] = yb[3].u32[2]; + zc[2].u32[3] = yc[3].u32[2]; + + za[3].u32[3] = ya[0].u32[3]; + zb[3].u32[3] = yb[0].u32[3]; + zc[3].u32[3] = yc[0].u32[3]; + za[3].u32[2] = ya[1].u32[3]; + zb[3].u32[2] = yb[1].u32[3]; + zc[3].u32[2] = yc[1].u32[3]; + za[3].u32[1] = ya[2].u32[3]; + zb[3].u32[1] = yb[2].u32[3]; + zc[3].u32[1] = yc[2].u32[3]; + za[3].u32[0] = ya[3].u32[3]; + zb[3].u32[0] = yb[3].u32[3]; + zc[3].u32[0] = yc[3].u32[3]; + + BA[0] = _mm_add_epi32( BA[0], za[0].m128 ); + BB[0] = _mm_add_epi32( BB[0], zb[0].m128 ); + BC[0] = _mm_add_epi32( BC[0], zc[0].m128 ); + BA[1] = _mm_add_epi32( BA[1], za[1].m128 ); + BB[1] = _mm_add_epi32( BB[1], zb[1].m128 ); + BC[1] = _mm_add_epi32( BC[1], zc[1].m128 ); + BA[2] = _mm_add_epi32( BA[2], za[2].m128 ); + BB[2] = _mm_add_epi32( BB[2], zb[2].m128 ); + BC[2] = _mm_add_epi32( BC[2], zc[2].m128 ); + BA[3] = _mm_add_epi32( BA[3], za[3].m128 ); + BB[3] = _mm_add_epi32( BB[3], zb[3].m128 ); + BC[3] = _mm_add_epi32( BC[3], zc[3].m128 ); + +#endif + + #undef ROL_1X32 + #undef ROR_1X32 + #undef SWAP_64 + #undef ROL32 + #undef ADD32 + #undef XOR + #undef TYPE +} + +void scrypt_core_simd128_3buf( uint32_t *X, uint32_t *V, const uint32_t N ) +{ + uint32_t *X0 = X; + uint32_t *X1 = X+32; + uint32_t *X2 = X+64; + 
uint32_t *V0 = V; + uint32_t *V1 = V + 32*N; + uint32_t *V2 = V + 64*N; + + for ( int i = 0; i < N; i++ ) + { + #if defined(__AVX2__) + + for ( int k = 0; k < 4; k++ ) + { + _mm256_stream_si256( (__m256i*)V0 + i*4 + k, casti_m256i( X0, k ) ); + _mm256_stream_si256( (__m256i*)V1 + i*4 + k, casti_m256i( X1, k ) ); + _mm256_stream_si256( (__m256i*)V2 + i*4 + k, casti_m256i( X2, k ) ); + } + + #else + + memcpy( &V0[ i*32 ], X0, 128 ); + memcpy( &V1[ i*32 ], X1, 128 ); + memcpy( &V2[ i*32 ], X2, 128 ); + + #endif + + salsa8_simd128_3buf( &X0[ 0], &X1[ 0], &X2[ 0], + &X0[16], &X1[16], &X2[16] ); + salsa8_simd128_3buf( &X0[16], &X1[16], &X2[16], + &X0[ 0], &X1[ 0], &X2[ 0] ); + } + + for ( int i = 0; i < N; i++ ) + { + #if defined(__AVX2__) + + const int j0 = 4 * ( X0[16] & ( N - 1 ) ); + const int j1 = 4 * ( X1[16] & ( N - 1 ) ); + const int j2 = 4 * ( X2[16] & ( N - 1 ) ); + + for ( int k = 0; k < 4; k++ ) + { + const __m256i v0 = _mm256_stream_load_si256( ( (__m256i*)V0 ) +j0+k ); + const __m256i v1 = _mm256_stream_load_si256( ( (__m256i*)V1 ) +j1+k ); + const __m256i v2 = _mm256_stream_load_si256( ( (__m256i*)V2 ) +j2+k ); +// const __m256i v0 = _mm256_load_si256( ( (__m256i*)V0 ) +j0+k ); +// const __m256i v1 = _mm256_load_si256( ( (__m256i*)V1 ) +j1+k ); +// const __m256i v2 = _mm256_load_si256( ( (__m256i*)V2 ) +j2+k ); + casti_m256i( X0, k ) = _mm256_xor_si256( casti_m256i( X0, k ), v0 ); + casti_m256i( X1, k ) = _mm256_xor_si256( casti_m256i( X1, k ), v1 ); + casti_m256i( X2, k ) = _mm256_xor_si256( casti_m256i( X2, k ), v2 ); + } + + #else + + const int j0 = 8 * ( X0[16] & ( N - 1 ) ); + const int j1 = 8 * ( X1[16] & ( N - 1 ) ); + const int j2 = 8 * ( X2[16] & ( N - 1 ) ); + for ( int k = 0; k < 8; k++ ) + { + const __m128i v0 = _mm_load_si128( ( (__m128i*)V0 ) +j0+k ); + const __m128i v1 = _mm_load_si128( ( (__m128i*)V1 ) +j1+k ); + const __m128i v2 = _mm_load_si128( ( (__m128i*)V2 ) +j2+k ); + casti_m128i( X0, k ) = _mm_xor_si128( casti_m128i( X0, k ), v0 ); + casti_m128i( X1, k ) = _mm_xor_si128( casti_m128i( X1, k ), v1 ); + casti_m128i( X2, k ) = _mm_xor_si128( casti_m128i( X2, k ), v2 ); + } + + #endif + +/* + const int j0 = 16 * ( X0[16] & ( N - 1 ) ); + const int j1 = 16 * ( X1[16] & ( N - 1 ) ); + const int j2 = 16 * ( X2[16] & ( N - 1 ) ); + + for ( int k = 0; k < 16; k++ ) + { + const uint64_t v0 = ( (uint64_t*)V0 )[ j0+k ]; + const uint64_t v1 = ( (uint64_t*)V1 )[ j1+k ]; + const uint64_t v2 = ( (uint64_t*)V2 )[ j2+k ]; + ( (uint64_t*)X0 )[k] ^= v0; + ( (uint64_t*)X1 )[k] ^= v1; + ( (uint64_t*)X2 )[k] ^= v2; + } +*/ + +/* + const int j0 = 32 * ( X0[16] & ( N - 1 ) ); + const int j1 = 32 * ( X1[16] & ( N - 1 ) ); + const int j2 = 32 * ( X2[16] & ( N - 1 ) ); + + for ( int k = 0; k < 32; k++ ) + { + const uint32_t v0 = V0[ j0+k ]; + const uint32_t v1 = V1[ j1+k ]; + const uint32_t v2 = V2[ j2+k ]; + X0[k] ^= v0; + X1[k] ^= v1; + X2[k] ^= v2; + } +*/ + + salsa8_simd128_3buf( &X0[ 0], &X1[ 0], &X2[ 0], + &X0[16], &X1[16], &X2[16] ); + salsa8_simd128_3buf( &X0[16], &X1[16], &X2[16], + &X0[ 0], &X1[ 0], &X2[ 0] ); + } +} + +// Working. 
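// The 2-, 3- and 4-buffer scrypt cores in this file trade memory for latency
// hiding: every additional lane needs its own 32*N-word (128*N byte) slice of
// the scratch area V.  The helper below is only an illustrative sizing sketch
// (the name is not an existing API in this codebase); it shows how a caller
// would dimension the shared scratch area for a given buffer count,
// e.g. nbuf = 3, N = 1024 -> 384 KiB.
static inline size_t scrypt_simd128_nbuf_scratch_size( int nbuf, uint32_t N )
{
   // one 32-word (128-byte) block per lane for each of the N iterations
   return (size_t)nbuf * (size_t)N * 128;
}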
+// Quadruple buffered, 4x memory usage +// No interleaving +static void salsa8_simd128_4buf( uint32_t *ba, uint32_t *bb, uint32_t *bc, + uint32_t *bd, const uint32_t *ca, const uint32_t *cb, + const uint32_t *cc, const uint32_t *cd ) +{ + __m128i XA0, XA1, XA2, XA3, XB0, XB1, XB2, XB3, + XC0, XC1, XC2, XC3, XD0, XD1, XD2, XD3; + __m128i *BA = (__m128i*)ba; + __m128i *BB = (__m128i*)bb; + __m128i *BC = (__m128i*)bc; + __m128i *BD = (__m128i*)bd; + const __m128i *CA = (const __m128i*)ca; + const __m128i *CB = (const __m128i*)cb; + const __m128i *CC = (const __m128i*)cc; + const __m128i *CD = (const __m128i*)cd; + + // define targets for macros used in round function template + #define ROL_1X32 mm128_shufll_32 + #define ROR_1X32 mm128_shuflr_32 + #define SWAP_64 mm128_swap_64 + #define ROL32 mm128_rol_32 + #define ADD32 _mm_add_epi32 + #define XOR _mm_xor_si128 + #define TYPE __m128i + + // mix C into B then shuffle B into X + BA[0] = _mm_xor_si128( BA[0], CA[0] ); + BB[0] = _mm_xor_si128( BB[0], CB[0] ); + BC[0] = _mm_xor_si128( BC[0], CC[0] ); + BD[0] = _mm_xor_si128( BD[0], CD[0] ); + BA[1] = _mm_xor_si128( BA[1], CA[1] ); + BB[1] = _mm_xor_si128( BB[1], CB[1] ); + BC[1] = _mm_xor_si128( BC[1], CC[1] ); + BD[1] = _mm_xor_si128( BD[1], CD[1] ); + BA[2] = _mm_xor_si128( BA[2], CA[2] ); + BB[2] = _mm_xor_si128( BB[2], CB[2] ); + BC[2] = _mm_xor_si128( BC[2], CC[2] ); + BD[2] = _mm_xor_si128( BD[2], CD[2] ); + BA[3] = _mm_xor_si128( BA[3], CA[3] ); + BB[3] = _mm_xor_si128( BB[3], CB[3] ); + BC[3] = _mm_xor_si128( BC[3], CC[3] ); + BD[3] = _mm_xor_si128( BD[3], CD[3] ); + +#if defined(__SSE4_1__) + + __m128i YA0, YA1, YA2, YA3, YB0, YB1, YB2, YB3, + YC0, YC1, YC2, YC3, YD0, YD1, YD2, YD3; + +#if defined(__AVX2__) + + YA0 = _mm_blend_epi32( BA[1], BA[0], 0x1 ); + YB0 = _mm_blend_epi32( BB[1], BB[0], 0x1 ); + YC0 = _mm_blend_epi32( BC[1], BC[0], 0x1 ); + YD0 = _mm_blend_epi32( BD[1], BD[0], 0x1 ); + XA0 = _mm_blend_epi32( BA[3], BA[2], 0x4 ); + XB0 = _mm_blend_epi32( BB[3], BB[2], 0x4 ); + XC0 = _mm_blend_epi32( BC[3], BC[2], 0x4 ); + XD0 = _mm_blend_epi32( BD[3], BD[2], 0x4 ); + + YA1 = _mm_blend_epi32( BA[2], BA[1], 0x1 ); + YB1 = _mm_blend_epi32( BB[2], BB[1], 0x1 ); + YC1 = _mm_blend_epi32( BC[2], BC[1], 0x1 ); + YD1 = _mm_blend_epi32( BD[2], BD[1], 0x1 ); + XA1 = _mm_blend_epi32( BA[0], BA[3], 0x4 ); + XB1 = _mm_blend_epi32( BB[0], BB[3], 0x4 ); + XC1 = _mm_blend_epi32( BC[0], BC[3], 0x4 ); + XD1 = _mm_blend_epi32( BD[0], BD[3], 0x4 ); + + YA2 = _mm_blend_epi32( BA[3], BA[2], 0x1 ); + YB2 = _mm_blend_epi32( BB[3], BB[2], 0x1 ); + YC2 = _mm_blend_epi32( BC[3], BC[2], 0x1 ); + YD2 = _mm_blend_epi32( BD[3], BD[2], 0x1 ); + XA2 = _mm_blend_epi32( BA[1], BA[0], 0x4 ); + XB2 = _mm_blend_epi32( BB[1], BB[0], 0x4 ); + XC2 = _mm_blend_epi32( BC[1], BC[0], 0x4 ); + XD2 = _mm_blend_epi32( BD[1], BD[0], 0x4 ); + + YA3 = _mm_blend_epi32( BA[0], BA[3], 0x1 ); + YB3 = _mm_blend_epi32( BB[0], BB[3], 0x1 ); + YC3 = _mm_blend_epi32( BC[0], BC[3], 0x1 ); + YD3 = _mm_blend_epi32( BD[0], BD[3], 0x1 ); + XA3 = _mm_blend_epi32( BA[2], BA[1], 0x4 ); + XB3 = _mm_blend_epi32( BB[2], BB[1], 0x4 ); + XC3 = _mm_blend_epi32( BC[2], BC[1], 0x4 ); + XD3 = _mm_blend_epi32( BD[2], BD[1], 0x4 ); + + XA0 = _mm_blend_epi32( XA0, YA0, 0x3 ); + XB0 = _mm_blend_epi32( XB0, YB0, 0x3 ); + XC0 = _mm_blend_epi32( XC0, YC0, 0x3 ); + XD0 = _mm_blend_epi32( XD0, YD0, 0x3 ); + + XA1 = _mm_blend_epi32( XA1, YA1, 0x3 ); + XB1 = _mm_blend_epi32( XB1, YB1, 0x3 ); + XC1 = _mm_blend_epi32( XC1, YC1, 0x3 ); + XD1 = _mm_blend_epi32( XD1, YD1, 
0x3 ); + + XA2 = _mm_blend_epi32( XA2, YA2, 0x3 ); + XB2 = _mm_blend_epi32( XB2, YB2, 0x3 ); + XC2 = _mm_blend_epi32( XC2, YC2, 0x3 ); + XD2 = _mm_blend_epi32( XD2, YD2, 0x3 ); + + XA3 = _mm_blend_epi32( XA3, YA3, 0x3 ); + XB3 = _mm_blend_epi32( XB3, YB3, 0x3 ); + XC3 = _mm_blend_epi32( XC3, YC3, 0x3 ); + XD3 = _mm_blend_epi32( XD3, YD3, 0x3 ); + +#else // SSE4_1 + + YA0 = _mm_blend_epi16( BA[1], BA[0], 0x03 ); + YB0 = _mm_blend_epi16( BB[1], BB[0], 0x03 ); + YC0 = _mm_blend_epi16( BC[1], BC[0], 0x03 ); + YD0 = _mm_blend_epi16( BD[1], BD[0], 0x03 ); + XA0 = _mm_blend_epi16( BA[3], BA[2], 0x30 ); + XB0 = _mm_blend_epi16( BB[3], BB[2], 0x30 ); + XC0 = _mm_blend_epi16( BC[3], BC[2], 0x30 ); + XD0 = _mm_blend_epi16( BD[3], BD[2], 0x30 ); + XA0 = _mm_blend_epi16( XA0, YA0, 0x0f ); + XB0 = _mm_blend_epi16( XB0, YB0, 0x0f ); + XC0 = _mm_blend_epi16( XC0, YC0, 0x0f ); + XD0 = _mm_blend_epi16( XD0, YD0, 0x0f ); + + YA1 = _mm_blend_epi16( BA[2], BA[1], 0x03 ); + YB1 = _mm_blend_epi16( BB[2], BB[1], 0x03 ); + YC1 = _mm_blend_epi16( BC[2], BC[1], 0x03 ); + YD1 = _mm_blend_epi16( BD[2], BD[1], 0x03 ); + XA1 = _mm_blend_epi16( BA[0], BA[3], 0x30 ); + XB1 = _mm_blend_epi16( BB[0], BB[3], 0x30 ); + XC1 = _mm_blend_epi16( BC[0], BC[3], 0x30 ); + XD1 = _mm_blend_epi16( BD[0], BD[3], 0x30 ); + XA1 = _mm_blend_epi16( XA1, YA1, 0x0f ); + XB1 = _mm_blend_epi16( XB1, YB1, 0x0f ); + XC1 = _mm_blend_epi16( XC1, YC1, 0x0f ); + XD1 = _mm_blend_epi16( XD1, YD1, 0x0f ); + + YA2 = _mm_blend_epi16( BA[3], BA[2], 0x03 ); + YB2 = _mm_blend_epi16( BB[3], BB[2], 0x03 ); + YC2 = _mm_blend_epi16( BC[3], BC[2], 0x03 ); + YD2 = _mm_blend_epi16( BD[3], BD[2], 0x03 ); + XA2 = _mm_blend_epi16( BA[1], BA[0], 0x30 ); + XB2 = _mm_blend_epi16( BB[1], BB[0], 0x30 ); + XC2 = _mm_blend_epi16( BC[1], BC[0], 0x30 ); + XD2 = _mm_blend_epi16( BD[1], BD[0], 0x30 ); + XA2 = _mm_blend_epi16( XA2, YA2, 0x0f ); + XB2 = _mm_blend_epi16( XB2, YB2, 0x0f ); + XC2 = _mm_blend_epi16( XC2, YC2, 0x0f ); + XD2 = _mm_blend_epi16( XD2, YD2, 0x0f ); + + YA3 = _mm_blend_epi16( BA[0], BA[3], 0x03 ); + YB3 = _mm_blend_epi16( BB[0], BB[3], 0x03 ); + YC3 = _mm_blend_epi16( BC[0], BC[3], 0x03 ); + YD3 = _mm_blend_epi16( BD[0], BD[3], 0x03 ); + XA3 = _mm_blend_epi16( BA[2], BA[1], 0x30 ); + XB3 = _mm_blend_epi16( BB[2], BB[1], 0x30 ); + XC3 = _mm_blend_epi16( BC[2], BC[1], 0x30 ); + XD3 = _mm_blend_epi16( BD[2], BD[1], 0x30 ); + XA3 = _mm_blend_epi16( XA3, YA3, 0x0f ); + XB3 = _mm_blend_epi16( XB3, YB3, 0x0f ); + XC3 = _mm_blend_epi16( XC3, YC3, 0x0f ); + XD3 = _mm_blend_epi16( XD3, YD3, 0x0f ); + +#endif // AVX2 else SSE3_1 + + SALSA_8ROUNDS_SIMD128_4BUF; + +#if defined(__AVX2__) + + YA0 = _mm_blend_epi32( XA0, XA1, 0x8 ); + YB0 = _mm_blend_epi32( XB0, XB1, 0x8 ); + YC0 = _mm_blend_epi32( XC0, XC1, 0x8 ); + YD0 = _mm_blend_epi32( XD0, XD1, 0x8 ); + YA1 = _mm_blend_epi32( XA0, XA1, 0x1 ); + YB1 = _mm_blend_epi32( XB0, XB1, 0x1 ); + YC1 = _mm_blend_epi32( XC0, XC1, 0x1 ); + YD1 = _mm_blend_epi32( XD0, XD1, 0x1 ); + YA2 = _mm_blend_epi32( XA0, XA1, 0x2 ); + YB2 = _mm_blend_epi32( XB0, XB1, 0x2 ); + YC2 = _mm_blend_epi32( XC0, XC1, 0x2 ); + YD2 = _mm_blend_epi32( XD0, XD1, 0x2 ); + YA3 = _mm_blend_epi32( XA0, XA1, 0x4 ); + YB3 = _mm_blend_epi32( XB0, XB1, 0x4 ); + YC3 = _mm_blend_epi32( XC0, XC1, 0x4 ); + YD3 = _mm_blend_epi32( XD0, XD1, 0x4 ); + + YA0 = _mm_blend_epi32( YA0, XA2, 0x4 ); + YB0 = _mm_blend_epi32( YB0, XB2, 0x4 ); + YC0 = _mm_blend_epi32( YC0, XC2, 0x4 ); + YD0 = _mm_blend_epi32( YD0, XD2, 0x4 ); + YA1 = _mm_blend_epi32( YA1, XA2, 0x8 ); + YB1 = 
_mm_blend_epi32( YB1, XB2, 0x8 ); + YC1 = _mm_blend_epi32( YC1, XC2, 0x8 ); + YD1 = _mm_blend_epi32( YD1, XD2, 0x8 ); + YA2 = _mm_blend_epi32( YA2, XA2, 0x1 ); + YB2 = _mm_blend_epi32( YB2, XB2, 0x1 ); + YC2 = _mm_blend_epi32( YC2, XC2, 0x1 ); + YD2 = _mm_blend_epi32( YD2, XD2, 0x1 ); + YA3 = _mm_blend_epi32( YA3, XA2, 0x2 ); + YB3 = _mm_blend_epi32( YB3, XB2, 0x2 ); + YC3 = _mm_blend_epi32( YC3, XC2, 0x2 ); + YD3 = _mm_blend_epi32( YD3, XD2, 0x2 ); + + YA0 = _mm_blend_epi32( YA0, XA3, 0x2 ); + YB0 = _mm_blend_epi32( YB0, XB3, 0x2 ); + YC0 = _mm_blend_epi32( YC0, XC3, 0x2 ); + YD0 = _mm_blend_epi32( YD0, XD3, 0x2 ); + YA1 = _mm_blend_epi32( YA1, XA3, 0x4 ); + YB1 = _mm_blend_epi32( YB1, XB3, 0x4 ); + YC1 = _mm_blend_epi32( YC1, XC3, 0x4 ); + YD1 = _mm_blend_epi32( YD1, XD3, 0x4 ); + YA2 = _mm_blend_epi32( YA2, XA3, 0x8 ); + YB2 = _mm_blend_epi32( YB2, XB3, 0x8 ); + YC2 = _mm_blend_epi32( YC2, XC3, 0x8 ); + YD2 = _mm_blend_epi32( YD2, XD3, 0x8 ); + YA3 = _mm_blend_epi32( YA3, XA3, 0x1 ); + YB3 = _mm_blend_epi32( YB3, XB3, 0x1 ); + YC3 = _mm_blend_epi32( YC3, XC3, 0x1 ); + YD3 = _mm_blend_epi32( YD3, XD3, 0x1 ); + +#else // SSE4_1 + + YA0 = _mm_blend_epi16( XA0, XA1, 0xc0 ); + YB0 = _mm_blend_epi16( XB0, XB1, 0xc0 ); + YC0 = _mm_blend_epi16( XC0, XC1, 0xc0 ); + YD0 = _mm_blend_epi16( XD0, XD1, 0xc0 ); + YA1 = _mm_blend_epi16( XA0, XA1, 0x03 ); + YB1 = _mm_blend_epi16( XB0, XB1, 0x03 ); + YC1 = _mm_blend_epi16( XC0, XC1, 0x03 ); + YD1 = _mm_blend_epi16( XD0, XD1, 0x03 ); + YA2 = _mm_blend_epi16( XA0, XA1, 0x0c ); + YB2 = _mm_blend_epi16( XB0, XB1, 0x0c ); + YC2 = _mm_blend_epi16( XC0, XC1, 0x0c ); + YD2 = _mm_blend_epi16( XD0, XD1, 0x0c ); + YA3 = _mm_blend_epi16( XA0, XA1, 0x30 ); + YB3 = _mm_blend_epi16( XB0, XB1, 0x30 ); + YC3 = _mm_blend_epi16( XC0, XC1, 0x30 ); + YD3 = _mm_blend_epi16( XD0, XD1, 0x30 ); + + YA0 = _mm_blend_epi16( YA0, XA2, 0x30 ); + YB0 = _mm_blend_epi16( YB0, XB2, 0x30 ); + YC0 = _mm_blend_epi16( YC0, XC2, 0x30 ); + YD0 = _mm_blend_epi16( YD0, XD2, 0x30 ); + YA1 = _mm_blend_epi16( YA1, XA2, 0xc0 ); + YB1 = _mm_blend_epi16( YB1, XB2, 0xc0 ); + YC1 = _mm_blend_epi16( YC1, XC2, 0xc0 ); + YD1 = _mm_blend_epi16( YD1, XD2, 0xc0 ); + YA2 = _mm_blend_epi16( YA2, XA2, 0x03 ); + YB2 = _mm_blend_epi16( YB2, XB2, 0x03 ); + YC2 = _mm_blend_epi16( YC2, XC2, 0x03 ); + YD2 = _mm_blend_epi16( YD2, XD2, 0x03 ); + YA3 = _mm_blend_epi16( YA3, XA2, 0x0c ); + YB3 = _mm_blend_epi16( YB3, XB2, 0x0c ); + YC3 = _mm_blend_epi16( YC3, XC2, 0x0c ); + YD3 = _mm_blend_epi16( YD3, XD2, 0x0c ); + + YA0 = _mm_blend_epi16( YA0, XA3, 0x0c ); + YB0 = _mm_blend_epi16( YB0, XB3, 0x0c ); + YC0 = _mm_blend_epi16( YC0, XC3, 0x0c ); + YD0 = _mm_blend_epi16( YD0, XD3, 0x0c ); + YA1 = _mm_blend_epi16( YA1, XA3, 0x30 ); + YB1 = _mm_blend_epi16( YB1, XB3, 0x30 ); + YC1 = _mm_blend_epi16( YC1, XC3, 0x30 ); + YD1 = _mm_blend_epi16( YD1, XD3, 0x30 ); + YA2 = _mm_blend_epi16( YA2, XA3, 0xc0 ); + YB2 = _mm_blend_epi16( YB2, XB3, 0xc0 ); + YC2 = _mm_blend_epi16( YC2, XC3, 0xc0 ); + YD2 = _mm_blend_epi16( YD2, XD3, 0xc0 ); + YA3 = _mm_blend_epi16( YA3, XA3, 0x03 ); + YB3 = _mm_blend_epi16( YB3, XB3, 0x03 ); + YC3 = _mm_blend_epi16( YC3, XC3, 0x03 ); + YD3 = _mm_blend_epi16( YD3, XD3, 0x03 ); + +#endif // AVX2 else SSE4_1 + + BA[0] = _mm_add_epi32( BA[0], YA0 ); + BB[0] = _mm_add_epi32( BB[0], YB0 ); + BC[0] = _mm_add_epi32( BC[0], YC0 ); + BD[0] = _mm_add_epi32( BD[0], YD0 ); + BA[1] = _mm_add_epi32( BA[1], YA1 ); + BB[1] = _mm_add_epi32( BB[1], YB1 ); + BC[1] = _mm_add_epi32( BC[1], YC1 ); + BD[1] = _mm_add_epi32( BD[1], 
YD1 ); + BA[2] = _mm_add_epi32( BA[2], YA2 ); + BB[2] = _mm_add_epi32( BB[2], YB2 ); + BC[2] = _mm_add_epi32( BC[2], YC2 ); + BD[2] = _mm_add_epi32( BD[2], YD2 ); + BA[3] = _mm_add_epi32( BA[3], YA3 ); + BB[3] = _mm_add_epi32( BB[3], YB3 ); + BC[3] = _mm_add_epi32( BC[3], YC3 ); + BD[3] = _mm_add_epi32( BD[3], YD3 ); + +#else // SSE2 + + m128_ovly ya[4], yb[4], za[4], zb[4], yc[4], zc[4], yd[4], zd[4]; + + XA0 = _mm_set_epi32( ba[15], ba[10], ba[ 5], ba[ 0] ); + XB0 = _mm_set_epi32( bb[15], bb[10], bb[ 5], bb[ 0] ); + XC0 = _mm_set_epi32( bc[15], bc[10], bc[ 5], bc[ 0] ); + XD0 = _mm_set_epi32( bd[15], bd[10], bd[ 5], bd[ 0] ); + XA1 = _mm_set_epi32( ba[ 3], ba[14], ba[ 9], ba[ 4] ); + XB1 = _mm_set_epi32( bb[ 3], bb[14], bb[ 9], bb[ 4] ); + XC1 = _mm_set_epi32( bc[ 3], bc[14], bc[ 9], bc[ 4] ); + XD1 = _mm_set_epi32( bd[ 3], bd[14], bd[ 9], bd[ 4] ); + XA2 = _mm_set_epi32( ba[ 7], ba[ 2], ba[13], ba[ 8] ); + XB2 = _mm_set_epi32( bb[ 7], bb[ 2], bb[13], bb[ 8] ); + XC2 = _mm_set_epi32( bc[ 7], bc[ 2], bc[13], bc[ 8] ); + XD2 = _mm_set_epi32( bd[ 7], bd[ 2], bd[13], bd[ 8] ); + XA3 = _mm_set_epi32( ba[11], ba[ 6], ba[ 1], ba[12] ); + XB3 = _mm_set_epi32( bb[11], bb[ 6], bb[ 1], bb[12] ); + XC3 = _mm_set_epi32( bc[11], bc[ 6], bc[ 1], bc[12] ); + XD3 = _mm_set_epi32( bd[11], bd[ 6], bd[ 1], bd[12] ); + + SALSA_8ROUNDS_FINAL_SIMD128_4BUF; + + ya[0].m128 = XA0; + yb[0].m128 = XB0; + yc[0].m128 = XC0; + yd[0].m128 = XD0; + ya[1].m128 = XA1; + yb[1].m128 = XB1; + yc[1].m128 = XC1; + yd[1].m128 = XD1; + ya[2].m128 = XA2; + yb[2].m128 = XB2; + yc[2].m128 = XC2; + yd[2].m128 = XD2; + ya[3].m128 = XA3; + yb[3].m128 = XB3; + yc[3].m128 = XC3; + yd[3].m128 = XD3; + + za[0].u32[0] = ya[0].u32[0]; + zb[0].u32[0] = yb[0].u32[0]; + zc[0].u32[0] = yc[0].u32[0]; + zd[0].u32[0] = yd[0].u32[0]; + za[0].u32[3] = ya[1].u32[0]; + zb[0].u32[3] = yb[1].u32[0]; + zc[0].u32[3] = yc[1].u32[0]; + zd[0].u32[3] = yd[1].u32[0]; + za[0].u32[2] = ya[2].u32[0]; + zb[0].u32[2] = yb[2].u32[0]; + zc[0].u32[2] = yc[2].u32[0]; + zd[0].u32[2] = yd[2].u32[0]; + za[0].u32[1] = ya[3].u32[0]; + zb[0].u32[1] = yb[3].u32[0]; + zc[0].u32[1] = yc[3].u32[0]; + zd[0].u32[1] = yd[3].u32[0]; + + za[1].u32[1] = ya[0].u32[1]; + zb[1].u32[1] = yb[0].u32[1]; + zc[1].u32[1] = yc[0].u32[1]; + zd[1].u32[1] = yd[0].u32[1]; + za[1].u32[0] = ya[1].u32[1]; + zb[1].u32[0] = yb[1].u32[1]; + zc[1].u32[0] = yc[1].u32[1]; + zd[1].u32[0] = yd[1].u32[1]; + za[1].u32[3] = ya[2].u32[1]; + zb[1].u32[3] = yb[2].u32[1]; + zc[1].u32[3] = yc[2].u32[1]; + zd[1].u32[3] = yd[2].u32[1]; + za[1].u32[2] = ya[3].u32[1]; + zb[1].u32[2] = yb[3].u32[1]; + zc[1].u32[2] = yc[3].u32[1]; + zd[1].u32[2] = yd[3].u32[1]; + + za[2].u32[2] = ya[0].u32[2]; + zb[2].u32[2] = yb[0].u32[2]; + zc[2].u32[2] = yc[0].u32[2]; + zd[2].u32[2] = yd[0].u32[2]; + za[2].u32[1] = ya[1].u32[2]; + zb[2].u32[1] = yb[1].u32[2]; + zc[2].u32[1] = yc[1].u32[2]; + zd[2].u32[1] = yd[1].u32[2]; + za[2].u32[0] = ya[2].u32[2]; + zb[2].u32[0] = yb[2].u32[2]; + zc[2].u32[0] = yc[2].u32[2]; + zd[2].u32[0] = yd[2].u32[2]; + za[2].u32[3] = ya[3].u32[2]; + zb[2].u32[3] = yb[3].u32[2]; + zc[2].u32[3] = yc[3].u32[2]; + zd[2].u32[3] = yd[3].u32[2]; + + za[3].u32[3] = ya[0].u32[3]; + zb[3].u32[3] = yb[0].u32[3]; + zc[3].u32[3] = yc[0].u32[3]; + zd[3].u32[3] = yd[0].u32[3]; + za[3].u32[2] = ya[1].u32[3]; + zb[3].u32[2] = yb[1].u32[3]; + zc[3].u32[2] = yc[1].u32[3]; + zd[3].u32[2] = yd[1].u32[3]; + za[3].u32[1] = ya[2].u32[3]; + zb[3].u32[1] = yb[2].u32[3]; + zc[3].u32[1] = yc[2].u32[3]; + zd[3].u32[1] = yd[2].u32[3]; + 
za[3].u32[0] = ya[3].u32[3]; + zb[3].u32[0] = yb[3].u32[3]; + zc[3].u32[0] = yc[3].u32[3]; + zd[3].u32[0] = yd[3].u32[3]; + + BA[0] = _mm_add_epi32( BA[0], za[0].m128 ); + BB[0] = _mm_add_epi32( BB[0], zb[0].m128 ); + BC[0] = _mm_add_epi32( BC[0], zc[0].m128 ); + BD[0] = _mm_add_epi32( BD[0], zd[0].m128 ); + BA[1] = _mm_add_epi32( BA[1], za[1].m128 ); + BB[1] = _mm_add_epi32( BB[1], zb[1].m128 ); + BC[1] = _mm_add_epi32( BC[1], zc[1].m128 ); + BD[1] = _mm_add_epi32( BD[1], zd[1].m128 ); + BA[2] = _mm_add_epi32( BA[2], za[2].m128 ); + BB[2] = _mm_add_epi32( BB[2], zb[2].m128 ); + BC[2] = _mm_add_epi32( BC[2], zc[2].m128 ); + BD[2] = _mm_add_epi32( BD[2], zd[2].m128 ); + BA[3] = _mm_add_epi32( BA[3], za[3].m128 ); + BB[3] = _mm_add_epi32( BB[3], zb[3].m128 ); + BC[3] = _mm_add_epi32( BC[3], zc[3].m128 ); + BD[3] = _mm_add_epi32( BD[3], zd[3].m128 ); + +#endif + + #undef ROL_1X32 + #undef ROR_1X32 + #undef SWAP_64 + #undef ROL32 + #undef ADD32 + #undef XOR + #undef TYPE +} + +void scrypt_core_simd128_4buf( uint32_t *X, uint32_t *V, const uint32_t N ) +{ + uint32_t *X0 = X; + uint32_t *X1 = X+32; + uint32_t *X2 = X+64; + uint32_t *X3 = X+96; + uint32_t *V0 = V; + uint32_t *V1 = V + 32*N; + uint32_t *V2 = V + 64*N; + uint32_t *V3 = V + 96*N; + + for ( int i = 0; i < N; i++ ) + { + for ( int k = 0; k < 8; k++ ) + { + _mm_stream_si128( (__m128i*)V0 + i*8 + k, casti_m128i( X0, k ) ); + _mm_stream_si128( (__m128i*)V1 + i*8 + k, casti_m128i( X1, k ) ); + _mm_stream_si128( (__m128i*)V2 + i*8 + k, casti_m128i( X2, k ) ); + _mm_stream_si128( (__m128i*)V3 + i*8 + k, casti_m128i( X3, k ) ); + } + + salsa8_simd128_4buf( &X0[ 0], &X1[ 0], &X2[ 0], &X3[ 0], + &X0[16], &X1[16], &X2[16], &X3[16] ); + salsa8_simd128_4buf( &X0[16], &X1[16], &X2[16], &X3[16], + &X0[ 0], &X1[ 0], &X2[ 0], &X3[ 0] ); + } + for ( int i = 0; i < N; i++ ) + { + #if defined(__AVX2__) + + const int j0 = 4 * ( X0[16] & ( N - 1 ) ); + const int j1 = 4 * ( X1[16] & ( N - 1 ) ); + const int j2 = 4 * ( X2[16] & ( N - 1 ) ); + const int j3 = 4 * ( X3[16] & ( N - 1 ) ); + + for ( int k = 0; k < 4; k++ ) + { + const __m256i v0 = _mm256_stream_load_si256( ( (__m256i*)V0 ) +j0+k ); + const __m256i v1 = _mm256_stream_load_si256( ( (__m256i*)V1 ) +j1+k ); + const __m256i v2 = _mm256_stream_load_si256( ( (__m256i*)V2 ) +j2+k ); + const __m256i v3 = _mm256_stream_load_si256( ( (__m256i*)V3 ) +j3+k ); + casti_m256i( X0, k ) = _mm256_xor_si256( casti_m256i( X0, k ), v0 ); + casti_m256i( X1, k ) = _mm256_xor_si256( casti_m256i( X1, k ), v1 ); + casti_m256i( X2, k ) = _mm256_xor_si256( casti_m256i( X2, k ), v2 ); + casti_m256i( X3, k ) = _mm256_xor_si256( casti_m256i( X3, k ), v3 ); + } + + #else + + const int j0 = 8 * ( X0[16] & ( N - 1 ) ); + const int j1 = 8 * ( X1[16] & ( N - 1 ) ); + const int j2 = 8 * ( X2[16] & ( N - 1 ) ); + const int j3 = 8 * ( X3[16] & ( N - 1 ) ); + + for ( int k = 0; k < 8; k++ ) + { + #if defined(__SSE4_1__) + const __m128i v0 = _mm_stream_load_si128( ( (__m128i*)V0 ) +j0+k ); + const __m128i v1 = _mm_stream_load_si128( ( (__m128i*)V1 ) +j1+k ); + const __m128i v2 = _mm_stream_load_si128( ( (__m128i*)V2 ) +j2+k ); + const __m128i v3 = _mm_stream_load_si128( ( (__m128i*)V3 ) +j3+k ); + #else + const __m128i v0 = _mm_load_si128( ( (__m128i*)V0 ) +j0+k ); + const __m128i v1 = _mm_load_si128( ( (__m128i*)V1 ) +j1+k ); + const __m128i v2 = _mm_load_si128( ( (__m128i*)V2 ) +j2+k ); + const __m128i v3 = _mm_load_si128( ( (__m128i*)V3 ) +j3+k ); + #endif + casti_m128i( X0, k ) = _mm_xor_si128( casti_m128i( X0, k ), v0 ); + 
casti_m128i( X1, k ) = _mm_xor_si128( casti_m128i( X1, k ), v1 ); + casti_m128i( X2, k ) = _mm_xor_si128( casti_m128i( X2, k ), v2 ); + casti_m128i( X3, k ) = _mm_xor_si128( casti_m128i( X3, k ), v3 ); + } + + #endif + +/* + const int j0 = 16 * ( X0[16] & ( N - 1 ) ); + const int j1 = 16 * ( X1[16] & ( N - 1 ) ); + const int j2 = 16 * ( X2[16] & ( N - 1 ) ); + const int j3 = 16 * ( X3[16] & ( N - 1 ) ); + + for ( int k = 0; k < 16; k++ ) + { + const uint64_t v0 = ( (uint64_t*)V0 )[ j0+k ]; + const uint64_t v1 = ( (uint64_t*)V1 )[ j1+k ]; + const uint64_t v2 = ( (uint64_t*)V2 )[ j2+k ]; + const uint64_t v3 = ( (uint64_t*)V3 )[ j3+k ]; + ( (uint64_t*)X0 )[k] ^= v0; + ( (uint64_t*)X1 )[k] ^= v1; + ( (uint64_t*)X2 )[k] ^= v2; + ( (uint64_t*)X3 )[k] ^= v3; + } +*/ + + salsa8_simd128_4buf( &X0[ 0], &X1[ 0], &X2[ 0], &X3[ 0], + &X0[16], &X1[16], &X2[16], &X3[16] ); + salsa8_simd128_4buf( &X0[16], &X1[16], &X2[16], &X3[16], + &X0[ 0], &X1[ 0], &X2[ 0], &X3[ 0] ); + } +} + + +#endif // SSE2 + + +// Reference, used only for testing. +// Tested OK. + +static void xor_salsa8(uint32_t * const B, const uint32_t * const C) +{ + uint32_t x0 = (B[ 0] ^= C[ 0]), + x1 = (B[ 1] ^= C[ 1]), + x2 = (B[ 2] ^= C[ 2]), + x3 = (B[ 3] ^= C[ 3]); + uint32_t x4 = (B[ 4] ^= C[ 4]), + x5 = (B[ 5] ^= C[ 5]), + x6 = (B[ 6] ^= C[ 6]), + x7 = (B[ 7] ^= C[ 7]); + uint32_t x8 = (B[ 8] ^= C[ 8]), + x9 = (B[ 9] ^= C[ 9]), + xa = (B[10] ^= C[10]), + xb = (B[11] ^= C[11]); + uint32_t xc = (B[12] ^= C[12]), + xd = (B[13] ^= C[13]), + xe = (B[14] ^= C[14]), + xf = (B[15] ^= C[15]); + + + #define ROL32( a, c ) ror32( a, c ) + #define ADD32( a, b ) ( (a)+(b) ) + #define XOR( a, b ) ( (a)^(b) ) + + SALSA_8ROUNDS; + + #undef ROL32 + #undef ADD32 + #undef XOR + + B[ 0] += x0; + B[ 1] += x1; + B[ 2] += x2; + B[ 3] += x3; + B[ 4] += x4; + B[ 5] += x5; + B[ 6] += x6; + B[ 7] += x7; + B[ 8] += x8; + B[ 9] += x9; + B[10] += xa; + B[11] += xb; + B[12] += xc; + B[13] += xd; + B[14] += xe; + B[15] += xf; +} + +/** + * @param X input/ouput + * @param V scratch buffer + * @param N factor (def. 
1024) + */ + + +void scrypt_core_1way( uint32_t *X, uint32_t *V, const uint32_t N ) +{ + for ( int i = 0; i < N; i++ ) + { + memcpy( &V[i * 32], X, 128 ); + xor_salsa8( &X[ 0], &X[16] ); + xor_salsa8( &X[16], &X[ 0] ); + } + for ( int i = 0; i < N; i++ ) + { + int j = 32 * ( X[16] & ( N - 1 ) ); + for ( int k = 0; k < 32; k++ ) + X[k] ^= V[j + k]; + xor_salsa8( &X[ 0], &X[16] ); + xor_salsa8( &X[16], &X[ 0] ); + } +} + + + diff --git a/algo/scrypt/scrypt-core-4way.h b/algo/scrypt/scrypt-core-4way.h new file mode 100644 index 00000000..6567733b --- /dev/null +++ b/algo/scrypt/scrypt-core-4way.h @@ -0,0 +1,70 @@ +#ifndef SCRYPT_CORE_4WAY_H__ +#define SCRYPT_CORE_4WAY_H__ + +#include "simd-utils.h" +#include +#include + +#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) + +void scrypt_core_16way( __m512i *X, __m512i *V, const uint32_t N ); + +// Serial SIMD over 4 way parallel +void scrypt_core_simd128_4way( __m128i *X, __m128i *V, const uint32_t N ); + +// 4 way parallel over serial SIMD +void scrypt_core_4way_simd128( __m512i *X, __m512i *V, const uint32_t N ); + +#endif + +#if defined(__AVX2__) + +void scrypt_core_8way( __m256i *X, __m256i *V, uint32_t N ); + +// 2 way parallel over SIMD128 +void scrypt_core_2way_simd128( __m256i *X, __m256i *V, const uint32_t N ); + +// Double buffered 2 way parallel over SIMD128 +void scrypt_core_2way_simd128_2buf( __m256i *X, __m256i *V, const uint32_t N ); + +// Triplee buffered 2 way parallel over SIMD128 +void scrypt_core_2way_simd128_3buf( __m256i *X, __m256i *V, const uint32_t N ); + +// Serial SIMD128 over 2 way parallel +void scrypt_core_simd128_2way( uint64_t *X, uint64_t *V, const uint32_t N ); + +// Double buffered simd over parallel +void scrypt_core_simd128_2way_2buf( uint64_t *X, uint64_t *V, const uint32_t N ); + +// Triple buffered 2 way +void scrypt_core_simd128_2way_3buf( uint64_t *X, uint64_t *V, const uint32_t N ); + +// Quadruple buffered +void scrypt_core_simd128_2way_4buf( uint64_t *X, uint64_t *V, const uint32_t N ); + +#endif + +#if defined(__SSE2__) + +// Parallel 4 way, 4x memory +void scrypt_core_4way( __m128i *X, __m128i *V, const uint32_t N ); + +// Linear SIMD 1 way, 1x memory, lowest +void scrypt_core_simd128( uint32_t *X, uint32_t *V, const uint32_t N ); + +// Double buffered, 2x memory +void scrypt_core_simd128_2buf( uint32_t *X, uint32_t *V, const uint32_t N ); + +// Triple buffered +void scrypt_core_simd128_3buf( uint32_t *X, uint32_t *V, const uint32_t N ); + +// Quadruple buffered, 4x memory +void scrypt_core_simd128_4buf( uint32_t *X, uint32_t *V, const uint32_t N ); + +#endif + +// For reference only +void scrypt_core_1way( uint32_t *X, uint32_t *V, const uint32_t N ); + +#endif + diff --git a/algo/scrypt/scrypt-core-ref.c b/algo/scrypt/scrypt-core-ref.c new file mode 100644 index 00000000..ec564ed2 --- /dev/null +++ b/algo/scrypt/scrypt-core-ref.c @@ -0,0 +1,206 @@ +#include "scrypt-core-ref.h" + +#define ROTL(a, b) (((a) << (b)) | ((a) >> (32 - (b)))) + +static void xor_salsa8(uint32_t * const B, const uint32_t * const C) +{ + uint32_t x0 = (B[ 0] ^= C[ 0]), + x1 = (B[ 1] ^= C[ 1]), + x2 = (B[ 2] ^= C[ 2]), + x3 = (B[ 3] ^= C[ 3]); + uint32_t x4 = (B[ 4] ^= C[ 4]), + x5 = (B[ 5] ^= C[ 5]), + x6 = (B[ 6] ^= C[ 6]), + x7 = (B[ 7] ^= C[ 7]); + uint32_t x8 = (B[ 8] ^= C[ 8]), + x9 = (B[ 9] ^= C[ 9]), + xa = (B[10] ^= C[10]), + xb = (B[11] ^= C[11]); + uint32_t xc = (B[12] ^= C[12]), + xd = (B[13] ^= C[13]), + xe = (B[14] ^= C[14]), + xf = (B[15] ^= C[15]); + + /* 
Operate on columns. */ + x4 ^= ROTL(x0 + xc, 7); + x9 ^= ROTL(x5 + x1, 7); + xe ^= ROTL(xa + x6, 7); + x3 ^= ROTL(xf + xb, 7); + x8 ^= ROTL(x4 + x0, 9); + xd ^= ROTL(x9 + x5, 9); + x2 ^= ROTL(xe + xa, 9); + x7 ^= ROTL(x3 + xf, 9); + xc ^= ROTL(x8 + x4, 13); + x1 ^= ROTL(xd + x9, 13); + x6 ^= ROTL(x2 + xe, 13); + xb ^= ROTL(x7 + x3, 13); + x0 ^= ROTL(xc + x8, 18); + x5 ^= ROTL(x1 + xd, 18); + xa ^= ROTL(x6 + x2, 18); + xf ^= ROTL(xb + x7, 18); + + /* Operate on rows. */ + x1 ^= ROTL(x0 + x3, 7); + x6 ^= ROTL(x5 + x4, 7); + xb ^= ROTL(xa + x9, 7); + xc ^= ROTL(xf + xe, 7); + x2 ^= ROTL(x1 + x0, 9); + x7 ^= ROTL(x6 + x5, 9); + x8 ^= ROTL(xb + xa, 9); + xd ^= ROTL(xc + xf, 9); + x3 ^= ROTL(x2 + x1, 13); + x4 ^= ROTL(x7 + x6, 13); + x9 ^= ROTL(x8 + xb, 13); + xe ^= ROTL(xd + xc, 13); + x0 ^= ROTL(x3 + x2, 18); + x5 ^= ROTL(x4 + x7, 18); + xa ^= ROTL(x9 + x8, 18); + xf ^= ROTL(xe + xd, 18); + + /* Operate on columns. */ + x4 ^= ROTL(x0 + xc, 7); + x9 ^= ROTL(x5 + x1, 7); + xe ^= ROTL(xa + x6, 7); + x3 ^= ROTL(xf + xb, 7); + x8 ^= ROTL(x4 + x0, 9); + xd ^= ROTL(x9 + x5, 9); + x2 ^= ROTL(xe + xa, 9); + x7 ^= ROTL(x3 + xf, 9); + xc ^= ROTL(x8 + x4, 13); + x1 ^= ROTL(xd + x9, 13); + x6 ^= ROTL(x2 + xe, 13); + xb ^= ROTL(x7 + x3, 13); + x0 ^= ROTL(xc + x8, 18); + x5 ^= ROTL(x1 + xd, 18); + xa ^= ROTL(x6 + x2, 18); + xf ^= ROTL(xb + x7, 18); + + /* Operate on rows. */ + x1 ^= ROTL(x0 + x3, 7); + x6 ^= ROTL(x5 + x4, 7); + xb ^= ROTL(xa + x9, 7); + xc ^= ROTL(xf + xe, 7); + x2 ^= ROTL(x1 + x0, 9); + x7 ^= ROTL(x6 + x5, 9); + x8 ^= ROTL(xb + xa, 9); + xd ^= ROTL(xc + xf, 9); + x3 ^= ROTL(x2 + x1, 13); + x4 ^= ROTL(x7 + x6, 13); + x9 ^= ROTL(x8 + xb, 13); + xe ^= ROTL(xd + xc, 13); + x0 ^= ROTL(x3 + x2, 18); + x5 ^= ROTL(x4 + x7, 18); + xa ^= ROTL(x9 + x8, 18); + xf ^= ROTL(xe + xd, 18); + + /* Operate on columns. */ + x4 ^= ROTL(x0 + xc, 7); + x9 ^= ROTL(x5 + x1, 7); + xe ^= ROTL(xa + x6, 7); + x3 ^= ROTL(xf + xb, 7); + x8 ^= ROTL(x4 + x0, 9); + xd ^= ROTL(x9 + x5, 9); + x2 ^= ROTL(xe + xa, 9); + x7 ^= ROTL(x3 + xf, 9); + xc ^= ROTL(x8 + x4, 13); + x1 ^= ROTL(xd + x9, 13); + x6 ^= ROTL(x2 + xe, 13); + xb ^= ROTL(x7 + x3, 13); + x0 ^= ROTL(xc + x8, 18); + x5 ^= ROTL(x1 + xd, 18); + xa ^= ROTL(x6 + x2, 18); + xf ^= ROTL(xb + x7, 18); + + /* Operate on rows. */ + x1 ^= ROTL(x0 + x3, 7); + x6 ^= ROTL(x5 + x4, 7); + xb ^= ROTL(xa + x9, 7); + xc ^= ROTL(xf + xe, 7); + x2 ^= ROTL(x1 + x0, 9); + x7 ^= ROTL(x6 + x5, 9); + x8 ^= ROTL(xb + xa, 9); + xd ^= ROTL(xc + xf, 9); + x3 ^= ROTL(x2 + x1, 13); + x4 ^= ROTL(x7 + x6, 13); + x9 ^= ROTL(x8 + xb, 13); + xe ^= ROTL(xd + xc, 13); + x0 ^= ROTL(x3 + x2, 18); + x5 ^= ROTL(x4 + x7, 18); + xa ^= ROTL(x9 + x8, 18); + xf ^= ROTL(xe + xd, 18); + + /* Operate on columns. */ + x4 ^= ROTL(x0 + xc, 7); + x9 ^= ROTL(x5 + x1, 7); + xe ^= ROTL(xa + x6, 7); + x3 ^= ROTL(xf + xb, 7); + x8 ^= ROTL(x4 + x0, 9); + xd ^= ROTL(x9 + x5, 9); + x2 ^= ROTL(xe + xa, 9); + x7 ^= ROTL(x3 + xf, 9); + xc ^= ROTL(x8 + x4, 13); + x1 ^= ROTL(xd + x9, 13); + x6 ^= ROTL(x2 + xe, 13); + xb ^= ROTL(x7 + x3, 13); + x0 ^= ROTL(xc + x8, 18); + x5 ^= ROTL(x1 + xd, 18); + xa ^= ROTL(x6 + x2, 18); + xf ^= ROTL(xb + x7, 18); + + /* Operate on rows. 
*/ + x1 ^= ROTL(x0 + x3, 7); + x6 ^= ROTL(x5 + x4, 7); + xb ^= ROTL(xa + x9, 7); + xc ^= ROTL(xf + xe, 7); + x2 ^= ROTL(x1 + x0, 9); + x7 ^= ROTL(x6 + x5, 9); + x8 ^= ROTL(xb + xa, 9); + xd ^= ROTL(xc + xf, 9); + x3 ^= ROTL(x2 + x1, 13); + x4 ^= ROTL(x7 + x6, 13); + x9 ^= ROTL(x8 + xb, 13); + xe ^= ROTL(xd + xc, 13); + x0 ^= ROTL(x3 + x2, 18); + x5 ^= ROTL(x4 + x7, 18); + xa ^= ROTL(x9 + x8, 18); + xf ^= ROTL(xe + xd, 18); + + B[ 0] += x0; + B[ 1] += x1; + B[ 2] += x2; + B[ 3] += x3; + B[ 4] += x4; + B[ 5] += x5; + B[ 6] += x6; + B[ 7] += x7; + B[ 8] += x8; + B[ 9] += x9; + B[10] += xa; + B[11] += xb; + B[12] += xc; + B[13] += xd; + B[14] += xe; + B[15] += xf; +} + +/** + * @param X input/ouput + * @param V scratch buffer + * @param N factor (def. 1024) + */ +void scrypt_core_ref(uint32_t *X, uint32_t *V, uint32_t N) +{ + for (uint32_t i = 0; i < N; i++) { + memcpy(&V[i * 32], X, 128); + xor_salsa8(&X[0], &X[16]); + xor_salsa8(&X[16], &X[0]); + } + for (uint32_t i = 0; i < N; i++) { + uint32_t j = 32 * (X[16] & (N - 1)); + for (uint8_t k = 0; k < 32; k++) + X[k] ^= V[j + k]; + xor_salsa8(&X[0], &X[16]); + xor_salsa8(&X[16], &X[0]); + } +} + diff --git a/algo/scrypt/scrypt.c b/algo/scrypt/scrypt.c index e35adbf5..a15b5cb1 100644 --- a/algo/scrypt/scrypt.c +++ b/algo/scrypt/scrypt.c @@ -32,6 +32,9 @@ #include #include #include +#include "algo/sha/sha-hash-4way.h" +#include "algo/sha/sha256-hash.h" +#include static const uint32_t keypad[12] = { 0x80000000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x00000280 @@ -46,80 +49,102 @@ static const uint32_t finalblk[16] = { 0x00000001, 0x80000000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x00000620 }; -static __thread char *scratchbuf; -int scratchbuf_size = 0; +static const uint32_t sha256_initial_state[8] = +{ + 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A, + 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19 +}; + +static int scrypt_throughput = 0; + +static int scratchbuf_size = 0; + +static __thread char *scratchbuf = NULL; + +// change this to a constant to be used directly as input state arg +// vectors still need an init function. 
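// The calls below all use the three-argument transform convention
// sha256_transform_le/be( state_out, block, state_in ), where state_in is
// const, so the fixed IV sha256_initial_state above can be fed straight in
// without the old init-then-copy step.  A minimal sketch of producing the
// midstate of an 80-byte block header with that convention (the helper name
// is illustrative only, not part of this codebase):
static inline void sha256_midstate_80( uint32_t *midstate, const uint32_t *data )
{
   // one transform over the first 64 bytes (16 little-endian words) of the
   // header, starting from the standard SHA-256 initial state
   sha256_transform_le( midstate, data, sha256_initial_state );
}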
+static inline void sha256_init_state( uint32_t *state ) +{ + state[ 0 ] = 0x6A09E667; + state[ 1 ] = 0xBB67AE85; + state[ 2 ] = 0x3C6EF372; + state[ 3 ] = 0xA54FF53A; + state[ 4 ] = 0x510E527F; + state[ 5 ] = 0x9B05688C; + state[ 6 ] = 0x1F83D9AB; + state[ 7 ] = 0x5BE0CD19; +} static inline void HMAC_SHA256_80_init(const uint32_t *key, uint32_t *tstate, uint32_t *ostate) { - uint32_t ihash[8]; - uint32_t pad[16]; - int i; + uint32_t ihash[8]; + uint32_t pad[16]; + int i; - /* tstate is assumed to contain the midstate of key */ - memcpy(pad, key + 16, 16); - memcpy(pad + 4, keypad, 48); - sha256_transform(tstate, pad, 0); - memcpy(ihash, tstate, 32); - - sha256_init(ostate); - for (i = 0; i < 8; i++) - pad[i] = ihash[i] ^ 0x5c5c5c5c; - for (; i < 16; i++) - pad[i] = 0x5c5c5c5c; - sha256_transform(ostate, pad, 0); - - sha256_init(tstate); - for (i = 0; i < 8; i++) - pad[i] = ihash[i] ^ 0x36363636; - for (; i < 16; i++) - pad[i] = 0x36363636; - sha256_transform(tstate, pad, 0); + /* tstate is assumed to contain the midstate of key */ + memcpy(pad, key + 16, 16); + memcpy(pad + 4, keypad, 48); + + sha256_transform_le( tstate, pad, tstate ); + + memcpy( ihash, tstate, 32 ); + + for ( i = 0; i < 8; i++ ) pad[i] = ihash[i] ^ 0x5c5c5c5c; + for ( ; i < 16; i++ ) pad[i] = 0x5c5c5c5c; + + sha256_transform_le( ostate, pad, sha256_initial_state ); + + for ( i = 0; i < 8; i++ ) pad[i] = ihash[i] ^ 0x36363636; + for ( ; i < 16; i++ ) pad[i] = 0x36363636; + + sha256_transform_le( tstate, pad, sha256_initial_state ); } static inline void PBKDF2_SHA256_80_128(const uint32_t *tstate, const uint32_t *ostate, const uint32_t *salt, uint32_t *output) { - uint32_t istate[8], ostate2[8]; - uint32_t ibuf[16], obuf[16]; - int i, j; + uint32_t istate[8], ostate2[8]; + uint32_t ibuf[16], obuf[16]; + int i, j; - memcpy(istate, tstate, 32); - sha256_transform(istate, salt, 0); - - memcpy(ibuf, salt + 16, 16); - memcpy(ibuf + 5, innerpad, 44); - memcpy(obuf + 8, outerpad, 32); - - for (i = 0; i < 4; i++) { - memcpy(obuf, istate, 32); - ibuf[4] = i + 1; - sha256_transform(obuf, ibuf, 0); - - memcpy(ostate2, ostate, 32); - sha256_transform(ostate2, obuf, 0); - for (j = 0; j < 8; j++) - output[8 * i + j] = swab32(ostate2[j]); - } + sha256_transform_le( istate, salt, tstate ); + + memcpy(ibuf, salt + 16, 16); + memcpy(ibuf + 5, innerpad, 44); + memcpy(obuf + 8, outerpad, 32); + + for (i = 0; i < 4; i++) + { + memcpy(obuf, istate, 32); + ibuf[4] = i + 1; + + sha256_transform_le( obuf, ibuf, obuf ); + sha256_transform_le( ostate2, obuf, ostate ); + + for (j = 0; j < 8; j++) + output[8 * i + j] = bswap_32( ostate2[j] ); + } } static inline void PBKDF2_SHA256_128_32(uint32_t *tstate, uint32_t *ostate, const uint32_t *salt, uint32_t *output) { - uint32_t buf[16]; - int i; - - sha256_transform(tstate, salt, 1); - sha256_transform(tstate, salt + 16, 1); - sha256_transform(tstate, finalblk, 0); - memcpy(buf, tstate, 32); - memcpy(buf + 8, outerpad, 32); - - sha256_transform(ostate, buf, 0); - for (i = 0; i < 8; i++) - output[i] = swab32(ostate[i]); -} + uint32_t buf[16]; + int i; + sha256_transform_be( tstate, salt, tstate ); + sha256_transform_be( tstate, salt+16, tstate ); + sha256_transform_le( tstate, finalblk, tstate ); + + memcpy(buf, tstate, 32); + memcpy(buf + 8, outerpad, 32); + + sha256_transform_le( ostate, buf, ostate ); + + for (i = 0; i < 8; i++) + output[i] = bswap_32( ostate[i] ); +} #ifdef HAVE_SHA256_4WAY @@ -160,6 +185,8 @@ static const uint32_t outerpad_4way[4 * 8] = { 0x00000000, 0x00000000, 0x00000000, 
0x00000000, 0x00000300, 0x00000300, 0x00000300, 0x00000300 }; + +/* static const uint32_t _ALIGN(16) finalblk_4way[4 * 16] = { 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x80000000, 0x80000000, 0x80000000, 0x80000000, @@ -178,37 +205,51 @@ static const uint32_t _ALIGN(16) finalblk_4way[4 * 16] = { 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000620, 0x00000620, 0x00000620, 0x00000620 }; +*/ -static inline void HMAC_SHA256_80_init_4way(const uint32_t *key, - uint32_t *tstate, uint32_t *ostate) +static inline void sha256_4way_init_state( void *state ) +{ + casti_m128i( state, 0 ) = _mm_set1_epi32( 0x6A09E667 ); + casti_m128i( state, 1 ) = _mm_set1_epi32( 0xBB67AE85 ); + casti_m128i( state, 2 ) = _mm_set1_epi32( 0x3C6EF372 ); + casti_m128i( state, 3 ) = _mm_set1_epi32( 0xA54FF53A ); + casti_m128i( state, 4 ) = _mm_set1_epi32( 0x510E527F ); + casti_m128i( state, 5 ) = _mm_set1_epi32( 0x9B05688C ); + casti_m128i( state, 6 ) = _mm_set1_epi32( 0x1F83D9AB ); + casti_m128i( state, 7 ) = _mm_set1_epi32( 0x5BE0CD19 ); +} + +static inline void HMAC_SHA256_80_init_4way( const uint32_t *key, + uint32_t *tstate, uint32_t *ostate ) { uint32_t _ALIGN(16) ihash[4 * 8]; uint32_t _ALIGN(16) pad[4 * 16]; int i; /* tstate is assumed to contain the midstate of key */ - memcpy(pad, key + 4 * 16, 4 * 16); - memcpy(pad + 4 * 4, keypad_4way, 4 * 48); - sha256_transform_4way(tstate, pad, 0); - memcpy(ihash, tstate, 4 * 32); - - sha256_init_4way(ostate); - for (i = 0; i < 4 * 8; i++) - pad[i] = ihash[i] ^ 0x5c5c5c5c; - for (; i < 4 * 16; i++) - pad[i] = 0x5c5c5c5c; - sha256_transform_4way(ostate, pad, 0); - - sha256_init_4way(tstate); - for (i = 0; i < 4 * 8; i++) - pad[i] = ihash[i] ^ 0x36363636; - for (; i < 4 * 16; i++) - pad[i] = 0x36363636; - sha256_transform_4way(tstate, pad, 0); + memcpy( pad, key + 4*16, 4*16 ); + memcpy( pad + 4*4, keypad_4way, 4*48 ); + + sha256_4way_transform_le( (__m128i*)ihash, (__m128i*)pad, + (const __m128i*)tstate ); + + sha256_4way_init_state( tstate ); + + for ( i = 0; i < 4*8; i++ ) pad[i] = ihash[i] ^ 0x5c5c5c5c; + for ( ; i < 4*16; i++ ) pad[i] = 0x5c5c5c5c; + + sha256_4way_transform_le( (__m128i*)ostate, (__m128i*)pad, + (const __m128i*)tstate ); + + for ( i = 0; i < 4*8; i++ ) pad[i] = ihash[i] ^ 0x36363636; + for ( ; i < 4*16; i++ ) pad[i] = 0x36363636; + + sha256_4way_transform_le( (__m128i*)tstate, (__m128i*)pad, + (const __m128i*)tstate ); } -static inline void PBKDF2_SHA256_80_128_4way(const uint32_t *tstate, - const uint32_t *ostate, const uint32_t *salt, uint32_t *output) +static inline void PBKDF2_SHA256_80_128_4way( const uint32_t *tstate, + const uint32_t *ostate, const uint32_t *salt, uint32_t *output ) { uint32_t _ALIGN(16) istate[4 * 8]; uint32_t _ALIGN(16) ostate2[4 * 8]; @@ -216,43 +257,62 @@ static inline void PBKDF2_SHA256_80_128_4way(const uint32_t *tstate, uint32_t _ALIGN(16) obuf[4 * 16]; int i, j; - memcpy(istate, tstate, 4 * 32); - sha256_transform_4way(istate, salt, 0); + sha256_4way_transform_le( (__m128i*)istate, (__m128i*)salt, + (const __m128i*)tstate ); memcpy(ibuf, salt + 4 * 16, 4 * 16); memcpy(ibuf + 4 * 5, innerpad_4way, 4 * 44); memcpy(obuf + 4 * 8, outerpad_4way, 4 * 32); - for (i = 0; i < 4; i++) { - memcpy(obuf, istate, 4 * 32); + for ( i = 0; i < 4; i++ ) + { ibuf[4 * 4 + 0] = i + 1; ibuf[4 * 4 + 1] = i + 1; ibuf[4 * 4 + 2] = i + 1; ibuf[4 * 4 + 3] = i + 1; - sha256_transform_4way(obuf, ibuf, 0); - memcpy(ostate2, ostate, 4 * 32); - sha256_transform_4way(ostate2, obuf, 0); - for (j = 0; j < 4 * 8; j++) - output[4 * 8 * i + j] = 
swab32(ostate2[j]); + sha256_4way_transform_le( (__m128i*)obuf, (__m128i*)ibuf, + (const __m128i*)istate ); + + sha256_4way_transform_le( (__m128i*)ostate2, (__m128i*)obuf, + (const __m128i*)ostate ); + + for ( j = 0; j < 4 * 8; j++ ) + output[4 * 8 * i + j] = bswap_32( ostate2[j] ); } } -static inline void PBKDF2_SHA256_128_32_4way(uint32_t *tstate, - uint32_t *ostate, const uint32_t *salt, uint32_t *output) +static inline void PBKDF2_SHA256_128_32_4way( uint32_t *tstate, + uint32_t *ostate, const uint32_t *salt, uint32_t *output ) { - uint32_t _ALIGN(16) buf[4 * 16]; + __m128i _ALIGN(64) final[ 8*16 ]; + uint32_t _ALIGN(64) buf[4 * 16]; int i; - sha256_transform_4way(tstate, salt, 1); - sha256_transform_4way(tstate, salt + 4 * 16, 1); - sha256_transform_4way(tstate, finalblk_4way, 0); - memcpy(buf, tstate, 4 * 32); + sha256_4way_transform_be( (__m128i*)tstate, (__m128i*)salt, + (const __m128i*)tstate ); + sha256_4way_transform_be( (__m128i*)tstate, (__m128i*)( salt + 4*16), + (const __m128i*)tstate ); + + final[ 0] = _mm_set1_epi32( 0x00000001 ); + final[ 1] = _mm_set1_epi32( 0x80000000 ); + final[ 2] = final[ 3] = final[ 4] = final[ 5] = final[ 6] + = final[ 7] = final[ 8] = final[ 9] = final[10] + = final[11] = final[12] = final[13] = final[14] + = _mm_setzero_si128(); + final[15] = _mm_set1_epi32 ( 0x00000620 ); + + sha256_4way_transform_le( (__m128i*)tstate, (__m128i*)final, + (const __m128i*)tstate ); + + memcpy(buf, tstate, 4 * 32); memcpy(buf + 4 * 8, outerpad_4way, 4 * 32); - sha256_transform_4way(ostate, buf, 0); - for (i = 0; i < 4 * 8; i++) - output[i] = swab32(ostate[i]); + sha256_4way_transform_le( (__m128i*)ostate, (__m128i*)buf, + (const __m128i*)ostate ); + + for ( i = 0; i < 4 * 8; i++ ) + output[i] = bswap_32( ostate[i] ); } #endif /* HAVE_SHA256_4WAY */ @@ -260,6 +320,7 @@ static inline void PBKDF2_SHA256_128_32_4way(uint32_t *tstate, #ifdef HAVE_SHA256_8WAY +/* static const uint32_t _ALIGN(32) finalblk_8way[8 * 16] = { 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, @@ -278,41 +339,52 @@ static const uint32_t _ALIGN(32) finalblk_8way[8 * 16] = { 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000620, 0x00000620, 0x00000620, 0x00000620, 0x00000620, 0x00000620, 0x00000620, 0x00000620 }; +*/ -static inline void HMAC_SHA256_80_init_8way(const uint32_t *key, - uint32_t *tstate, uint32_t *ostate) +static inline void sha256_8way_init_state( void *state ) +{ + casti_m256i( state, 0 ) = _mm256_set1_epi32( 0x6A09E667 ); + casti_m256i( state, 1 ) = _mm256_set1_epi32( 0xBB67AE85 ); + casti_m256i( state, 2 ) = _mm256_set1_epi32( 0x3C6EF372 ); + casti_m256i( state, 3 ) = _mm256_set1_epi32( 0xA54FF53A ); + casti_m256i( state, 4 ) = _mm256_set1_epi32( 0x510E527F ); + casti_m256i( state, 5 ) = _mm256_set1_epi32( 0x9B05688C ); + casti_m256i( state, 6 ) = _mm256_set1_epi32( 0x1F83D9AB ); + casti_m256i( state, 7 ) = _mm256_set1_epi32( 0x5BE0CD19 ); +} + +static inline void HMAC_SHA256_80_init_8way( const uint32_t *key, + uint32_t *tstate, uint32_t *ostate ) { uint32_t _ALIGN(32) ihash[8 * 8]; uint32_t _ALIGN(32) pad[8 * 16]; int i; - /* tstate is assumed to contain the midstate of key */ - memcpy(pad, key + 8 * 16, 8 * 16); - for (i = 0; i < 8; i++) - pad[8 * 4 + i] = 0x80000000; - memset(pad + 8 * 5, 0x00, 8 * 40); - for (i = 0; i < 8; i++) - pad[8 * 15 + i] = 0x00000280; - 
sha256_transform_8way(tstate, pad, 0); - memcpy(ihash, tstate, 8 * 32); - - sha256_init_8way(ostate); - for (i = 0; i < 8 * 8; i++) - pad[i] = ihash[i] ^ 0x5c5c5c5c; - for (; i < 8 * 16; i++) - pad[i] = 0x5c5c5c5c; - sha256_transform_8way(ostate, pad, 0); - - sha256_init_8way(tstate); - for (i = 0; i < 8 * 8; i++) - pad[i] = ihash[i] ^ 0x36363636; - for (; i < 8 * 16; i++) - pad[i] = 0x36363636; - sha256_transform_8way(tstate, pad, 0); + memcpy( pad, key + 8*16, 8*16 ); + for ( i = 0; i < 8; i++ ) pad[ 8*4 + i ] = 0x80000000; + memset( pad + 8*5, 0x00, 8*40 ); + for ( i = 0; i < 8; i++ ) pad[ 8*15 + i ] = 0x00000280; + + sha256_8way_transform_le( (__m256i*)ihash, (__m256i*)pad, + (const __m256i*)tstate ); + + sha256_8way_init_state( tstate ); + + for ( i = 0; i < 8*8; i++ ) pad[i] = ihash[i] ^ 0x5c5c5c5c; + for ( ; i < 8*16; i++ ) pad[i] = 0x5c5c5c5c; + + sha256_8way_transform_le( (__m256i*)ostate, (__m256i*)pad, + (const __m256i*)tstate ); + + for ( i = 0; i < 8*8; i++ ) pad[i] = ihash[i] ^ 0x36363636; + for ( ; i < 8*16; i++ ) pad[i] = 0x36363636; + + sha256_8way_transform_le( (__m256i*)tstate, (__m256i*)pad, + (const __m256i*)tstate ); } -static inline void PBKDF2_SHA256_80_128_8way(const uint32_t *tstate, - const uint32_t *ostate, const uint32_t *salt, uint32_t *output) +static inline void PBKDF2_SHA256_80_128_8way( const uint32_t *tstate, + const uint32_t *ostate, const uint32_t *salt, uint32_t *output ) { uint32_t _ALIGN(32) istate[8 * 8]; uint32_t _ALIGN(32) ostate2[8 * 8]; @@ -320,24 +392,20 @@ static inline void PBKDF2_SHA256_80_128_8way(const uint32_t *tstate, uint32_t _ALIGN(32) obuf[8 * 16]; int i, j; - memcpy(istate, tstate, 8 * 32); - sha256_transform_8way(istate, salt, 0); - - memcpy(ibuf, salt + 8 * 16, 8 * 16); - for (i = 0; i < 8; i++) - ibuf[8 * 5 + i] = 0x80000000; - memset(ibuf + 8 * 6, 0x00, 8 * 36); - for (i = 0; i < 8; i++) - ibuf[8 * 15 + i] = 0x000004a0; + sha256_8way_transform_le( (__m256i*)istate, (__m256i*)salt, + (const __m256i*)tstate ); + + memcpy( ibuf, salt + 8*16, 8*16 ); + for ( i = 0; i < 8; i++ ) ibuf[ 8*5 + i ] = 0x80000000; + memset( ibuf + 8*6, 0x00, 8*36 ); + for ( i = 0; i < 8; i++ ) ibuf[ 8*15 + i ] = 0x000004a0; - for (i = 0; i < 8; i++) - obuf[8 * 8 + i] = 0x80000000; - memset(obuf + 8 * 9, 0x00, 8 * 24); - for (i = 0; i < 8; i++) - obuf[8 * 15 + i] = 0x00000300; + for ( i = 0; i < 8; i++ ) obuf[ 8*8 + i ] = 0x80000000; + memset( obuf + 8*9, 0x00, 8*24 ); + for ( i = 0; i < 8; i++ ) obuf[ 8*15 + i ] = 0x00000300; - for (i = 0; i < 4; i++) { - memcpy(obuf, istate, 8 * 32); + for ( i = 0; i < 4; i++ ) + { ibuf[8 * 4 + 0] = i + 1; ibuf[8 * 4 + 1] = i + 1; ibuf[8 * 4 + 2] = i + 1; @@ -346,48 +414,198 @@ static inline void PBKDF2_SHA256_80_128_8way(const uint32_t *tstate, ibuf[8 * 4 + 5] = i + 1; ibuf[8 * 4 + 6] = i + 1; ibuf[8 * 4 + 7] = i + 1; - sha256_transform_8way(obuf, ibuf, 0); - - memcpy(ostate2, ostate, 8 * 32); - sha256_transform_8way(ostate2, obuf, 0); - for (j = 0; j < 8 * 8; j++) - output[8 * 8 * i + j] = swab32(ostate2[j]); + + sha256_8way_transform_le( (__m256i*)obuf, (__m256i*)ibuf, + (const __m256i*)istate ); + + sha256_8way_transform_le( (__m256i*)ostate2, (__m256i*)obuf, + (const __m256i*)ostate ); + + for ( j = 0; j < 8*8; j++ ) + output[ 8*8*i + j ] = bswap_32( ostate2[j] ); } } -static inline void PBKDF2_SHA256_128_32_8way(uint32_t *tstate, - uint32_t *ostate, const uint32_t *salt, uint32_t *output) +static inline void PBKDF2_SHA256_128_32_8way( uint32_t *tstate, + uint32_t *ostate, const uint32_t *salt, uint32_t *output ) { - 
uint32_t _ALIGN(32) buf[8 * 16]; + __m256i _ALIGN(128) final[ 8*16 ]; + uint32_t _ALIGN(128) buf[ 8*16 ]; int i; - sha256_transform_8way(tstate, salt, 1); - sha256_transform_8way(tstate, salt + 8 * 16, 1); - sha256_transform_8way(tstate, finalblk_8way, 0); - - memcpy(buf, tstate, 8 * 32); - for (i = 0; i < 8; i++) - buf[8 * 8 + i] = 0x80000000; - memset(buf + 8 * 9, 0x00, 8 * 24); - for (i = 0; i < 8; i++) - buf[8 * 15 + i] = 0x00000300; - sha256_transform_8way(ostate, buf, 0); - + sha256_8way_transform_be( (__m256i*)tstate, (__m256i*)salt, + (const __m256i*)tstate ); + sha256_8way_transform_be( (__m256i*)tstate, (__m256i*)( salt + 8*16), + (const __m256i*)tstate ); + + final[ 0] = _mm256_set1_epi32( 0x00000001 ); + final[ 1] = _mm256_set1_epi32( 0x80000000 ); + final[ 2] = final[ 3] = final[ 4] = final[ 5] = final[ 6] + = final[ 7] = final[ 8] = final[ 9] = final[10] + = final[11] = final[12] = final[13] = final[14] + = _mm256_setzero_si256(); + final[15] = _mm256_set1_epi32 ( 0x00000620 ); + + sha256_8way_transform_le( (__m256i*)tstate, final, + (const __m256i*)tstate ); + + memcpy( buf, tstate, 8*32 ); + for ( i = 0; i < 8; i++ ) buf[ 8*8 + i ] = 0x80000000; + memset( buf + 8*9, 0x00, 8*24 ); + for ( i = 0; i < 8; i++ ) buf[ 8*15 + i ] = 0x00000300; + + sha256_8way_transform_le( (__m256i*)ostate, (__m256i*)buf, + (const __m256i*)ostate ); + for (i = 0; i < 8 * 8; i++) - output[i] = swab32(ostate[i]); + output[i] = bswap_32(ostate[i]); } #endif /* HAVE_SHA256_8WAY */ +#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) + +static inline void sha256_16way_init_state( void *state ) +{ + casti_m512i( state, 0 ) = _mm512_set1_epi32( 0x6A09E667 ); + casti_m512i( state, 1 ) = _mm512_set1_epi32( 0xBB67AE85 ); + casti_m512i( state, 2 ) = _mm512_set1_epi32( 0x3C6EF372 ); + casti_m512i( state, 3 ) = _mm512_set1_epi32( 0xA54FF53A ); + casti_m512i( state, 4 ) = _mm512_set1_epi32( 0x510E527F ); + casti_m512i( state, 5 ) = _mm512_set1_epi32( 0x9B05688C ); + casti_m512i( state, 6 ) = _mm512_set1_epi32( 0x1F83D9AB ); + casti_m512i( state, 7 ) = _mm512_set1_epi32( 0x5BE0CD19 ); +} + +static inline void HMAC_SHA256_80_init_16way( const uint32_t *key, + uint32_t *tstate, uint32_t *ostate ) +{ + uint32_t _ALIGN(128) pad[16*16]; + uint32_t _ALIGN(128) ihash[16* 8]; + int i; + + memcpy( pad, key + 16*16, 16*16 ); + for ( i = 0; i < 16; i++ ) pad[ 16*4 + i ] = 0x80000000; + memset( pad + 16*5, 0x00, 16*40 ); + for ( i = 0; i < 16; i++ ) pad[ 16*15 + i ] = 0x00000280; + + sha256_16way_transform_le( (__m512i*)ihash, (__m512i*)pad, + (const __m512i*)tstate ); + + sha256_16way_init_state( tstate ); + + for ( i = 0; i < 16*8; i++ ) pad[i] = ihash[i] ^ 0x5c5c5c5c; + for ( ; i < 16*16; i++ ) pad[i] = 0x5c5c5c5c; + + sha256_16way_transform_le( (__m512i*)ostate, (__m512i*)pad, + (const __m512i*)tstate ); + + for ( i = 0; i < 16*8; i++ ) pad[i] = ihash[i] ^ 0x36363636; + for ( ; i < 16*16; i++ ) pad[i] = 0x36363636; + + sha256_16way_transform_le( (__m512i*)tstate, (__m512i*)pad, + (const __m512i*)tstate ); +} + + +static inline void PBKDF2_SHA256_80_128_16way( const uint32_t *tstate, + const uint32_t *ostate, const uint32_t *salt, uint32_t *output ) +{ + uint32_t _ALIGN(128) ibuf[ 16*16 ]; + uint32_t _ALIGN(128) obuf[ 16*16 ]; + uint32_t _ALIGN(128) istate[ 16*8 ]; + uint32_t _ALIGN(128) ostate2[ 16*8 ]; + int i, j; + + sha256_16way_transform_le( (__m512i*)istate, (__m512i*)salt, + (const __m512i*)tstate ); + + memcpy( ibuf, salt + 16*16, 16*16 ); + for ( i = 0; i < 16; i++ 
) ibuf[ 16*5 + i ] = 0x80000000; + memset( ibuf + 16*6, 0x00, 16*36 ); + for ( i = 0; i < 16; i++ ) ibuf[ 16*15 + i ] = 0x000004a0; + + for ( i = 0; i < 16; i++ ) obuf[ 16*8 + i ] = 0x80000000; + memset( obuf + 16*9, 0x00, 16*24 ); + for ( i = 0; i < 16; i++ ) obuf[ 16*15 + i ] = 0x00000300; + + for ( i = 0; i < 4; i++ ) + { + ibuf[ 16*4 + 0 ] = i + 1; + ibuf[ 16*4 + 1 ] = i + 1; + ibuf[ 16*4 + 2 ] = i + 1; + ibuf[ 16*4 + 3 ] = i + 1; + ibuf[ 16*4 + 4 ] = i + 1; + ibuf[ 16*4 + 5 ] = i + 1; + ibuf[ 16*4 + 6 ] = i + 1; + ibuf[ 16*4 + 7 ] = i + 1; + ibuf[ 16*4 + 8 ] = i + 1; + ibuf[ 16*4 + 9 ] = i + 1; + ibuf[ 16*4 + 10 ] = i + 1; + ibuf[ 16*4 + 11 ] = i + 1; + ibuf[ 16*4 + 12 ] = i + 1; + ibuf[ 16*4 + 13 ] = i + 1; + ibuf[ 16*4 + 14 ] = i + 1; + ibuf[ 16*4 + 15 ] = i + 1; + + sha256_16way_transform_le( (__m512i*)obuf, (__m512i*)ibuf, + (const __m512i*)istate ); + + sha256_16way_transform_le( (__m512i*)ostate2, (__m512i*)obuf, + (const __m512i*)ostate ); + + for ( j = 0; j < 16*8; j++ ) + output[ 16*8*i + j ] = bswap_32( ostate2[j] ); + } +} + +static inline void PBKDF2_SHA256_128_32_16way( uint32_t *tstate, + uint32_t *ostate, const uint32_t *salt, uint32_t *output ) +{ + __m512i _ALIGN(128) final[ 16*16 ]; + uint32_t _ALIGN(128) buf[ 16*16 ]; + int i; + + sha256_16way_transform_be( (__m512i*)tstate, (__m512i*)salt, + (const __m512i*)tstate ); + sha256_16way_transform_be( (__m512i*)tstate, (__m512i*)( salt + 16*16), + (const __m512i*)tstate ); + + final[ 0] = _mm512_set1_epi32( 0x00000001 ); + final[ 1] = _mm512_set1_epi32( 0x80000000 ); + final[ 2] = final[ 3] = final[ 4] = final[ 5] = final[ 6] + = final[ 7] = final[ 8] = final[ 9] = final[10] + = final[11] = final[12] = final[13] = final[14] + = _mm512_setzero_si512(); + final[15] = _mm512_set1_epi32 ( 0x00000620 ); + + sha256_16way_transform_le( (__m512i*)tstate, final, + (const __m512i*)tstate ); + + memcpy( buf, tstate, 16*32 ); + for ( i = 0; i < 16; i++ ) buf[ 16*8 + i ] = 0x80000000; + memset( buf + 16*9, 0x00, 16*24 ); + for ( i = 0; i < 16; i++ ) buf[ 16*15 + i ] = 0x00000300; + + sha256_16way_transform_le( (__m512i*)ostate, (__m512i*)buf, + (const __m512i*)ostate ); + + for ( i = 0; i < 16*8; i++ ) + output[i] = bswap_32( ostate[i] ); +} + +#endif // AVX512 //#if defined(USE_ASM) && defined(__x86_64__) #define SCRYPT_MAX_WAYS 12 #define HAVE_SCRYPT_3WAY 1 -int scrypt_best_throughput(); +//int scrypt_best_throughput(); void scrypt_core(uint32_t *X, uint32_t *V, int N); void scrypt_core_3way(uint32_t *X, uint32_t *V, int N); -#if defined(USE_AVX2) + +//#if defined(USE_AVX2) +#if defined(__AVX2__) #undef SCRYPT_MAX_WAYS #define SCRYPT_MAX_WAYS 24 #define HAVE_SCRYPT_6WAY 1 @@ -396,261 +614,633 @@ void scrypt_core_6way(uint32_t *X, uint32_t *V, int N); #ifndef SCRYPT_MAX_WAYS #define SCRYPT_MAX_WAYS 1 -#define scrypt_best_throughput() 1 +//#define scrypt_best_throughput() 1 #endif -unsigned char *scrypt_buffer_alloc(int N) -{ - return (uchar*) malloc((size_t)N * SCRYPT_MAX_WAYS * 128 + 63); -} +#include "scrypt-core-4way.h" -static bool scrypt_1024_1_1_256(const uint32_t *input, uint32_t *output, +static bool scrypt_N_1_1_256(const uint32_t *input, uint32_t *output, uint32_t *midstate, unsigned char *scratchpad, int N, int thr_id ) { uint32_t tstate[8], ostate[8]; uint32_t X[32]; - uint32_t *V; + uint32_t *V = (uint32_t*)scratchpad; - V = (uint32_t *)(((uintptr_t)(scratchpad) + 63) & ~ (uintptr_t)(63)); - memcpy(tstate, midstate, 32); HMAC_SHA256_80_init(input, tstate, ostate); PBKDF2_SHA256_80_128(tstate, ostate, input, X); - 
scrypt_core(X, V, N); + scrypt_core_simd128( X, V, N ); // woring +// scrypt_core_1way( X, V, N ); // working +// scrypt_core(X, V, N); PBKDF2_SHA256_128_32(tstate, ostate, X, output); return true; } -#ifdef HAVE_SHA256_4WAY -static int scrypt_1024_1_1_256_4way(const uint32_t *input, - uint32_t *output, uint32_t *midstate, unsigned char *scratchpad, int N, - int thrid ) +#if defined(__AVX2__) + +static int scrypt_N_1_1_256_8way( const uint32_t *input, uint32_t *output, + uint32_t *midstate, unsigned char *scratchpad, int N, int thrid ) { - uint32_t _ALIGN(128) tstate[4 * 8]; - uint32_t _ALIGN(128) ostate[4 * 8]; - uint32_t _ALIGN(128) W[4 * 32]; - uint32_t _ALIGN(128) X[4 * 32]; - uint32_t *V; - int i, k; - - V = (uint32_t *)(((uintptr_t)(scratchpad) + 63) & ~ (uintptr_t)(63)); + uint32_t _ALIGN(128) tstate[8 * 8]; + uint32_t _ALIGN(128) ostate[8 * 8]; + uint32_t _ALIGN(128) W[8 * 32]; + uint32_t _ALIGN(128) X[8 * 32]; + uint32_t *V = (uint32_t*)scratchpad; - for (i = 0; i < 20; i++) - for (k = 0; k < 4; k++) - W[4 * i + k] = input[k * 20 + i]; + intrlv_8x32( W, input, input+ 20, input+ 40, input+ 60, + input+80, input+100, input+120, input+140, 640 ); + for ( int i = 0; i < 8; i++ ) + casti_m256i( tstate, i ) = _mm256_set1_epi32( midstate[i] ); - for (i = 0; i < 8; i++) - for (k = 0; k < 4; k++) - tstate[4 * i + k] = midstate[i]; + HMAC_SHA256_80_init_8way( W, tstate, ostate ); + PBKDF2_SHA256_80_128_8way( tstate, ostate, W, W ); - HMAC_SHA256_80_init_4way(W, tstate, ostate); + dintrlv_8x32( X, X+32, X+64, X+96, X+128, X+160, X+192, X+224, W, 1024 ); - PBKDF2_SHA256_80_128_4way(tstate, ostate, W, W); + // SCRYPT CORE + + + // AVX512 + +/* + // AVX512 16 way working + intrlv_16x32( W, X, X+32, X+64, X+96, X+128, X+160, X+192, X+224, + X+256, X+256+32, X+256+64, X+256+96, X+256+128, + X+256+160, X+256+192, X+256+224, 1024 ); + + scrypt_core_16way( (__m512i*)W , (__m512i*)V, N ); + + dintrlv_16x32( X, X+32, X+64, X+96, X+128, X+160, X+192, X+224, + X+256, X+256+32, X+256+64, X+256+96, X+256+128, + X+256+160, X+256+192, X+256+224, W, 1024 ); +*/ +/* + // AVX512 working + intrlv_4x32( W, X, X+ 32, X+ 64, X+ 96, 1024 ); + intrlv_4x32( W+128, X+128, X+160, X+192, X+224, 1024 ); + scrypt_core_simd128_4way( (__m128i*)W, (__m128i*)V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128_4way( (__m128i*)(W+128), (__m128i*)V, N ); + dintrlv_4x32( X, X+ 32, X+ 64, X+ 96, W, 1024 ); + dintrlv_4x32( X+128, X+160, X+192, X+224, W+128, 1024 ); +*/ +/* + // AVX512, not working, very slow + intrlv_4x128( W, X, X+ 32, X+ 64, X+ 96, 1024 ); + intrlv_4x128( W+128, X+128, X+160, X+192, X+224, 1024 ); + scrypt_core_4way_simd128( (__m512i*)W, (__m512i*)V, N ); if ( work_restart[thrid].restart ) return 0; + scrypt_core_4way_simd128( (__m512i*)(W+128), (__m512i*)V, N ); + dintrlv_4x128( X, X+ 32, X+ 64, X+ 96, W, 1024 ); + dintrlv_4x128( X+128, X+160, X+192, X+224, W+128, 1024 ); +*/ + + // AVX2 + +/* + // AVX2 + // disable de/interleave for testing. 
+ scrypt_core_8way( (__m256i*)W , (__m256i*)V, N ); +*/ + +/* + // AVX2 working + intrlv_2x128( W, X, X+ 32, 1024 ); + intrlv_2x128( W+ 64, X+ 64, X+ 96, 1024 ); + intrlv_2x128( W+128, X+128, X+160, 1024 ); + intrlv_2x128( W+192, X+192, X+224, 1024 ); + + // working +// scrypt_core_2way_simd128_3buf( (__m256i*) W, (__m256i*)V, N ); +// if ( work_restart[thrid].restart ) return 0; +// scrypt_core_2way_simd128( (__m256i*)(W+192), (__m256i*)V, N ); + + // working + scrypt_core_2way_simd128_2buf( (__m256i*) W, (__m256i*)V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_2way_simd128_2buf( (__m256i*)(W+128), (__m256i*)V, N ); + + // working +// scrypt_core_2way_simd128( (__m256i*) W, (__m256i*)V, N ); +// if ( work_restart[thrid].restart ) return 0; +// scrypt_core_2way_simd128( (__m256i*)(W+ 64), (__m256i*)V, N ); +// if ( work_restart[thrid].restart ) return 0; +// scrypt_core_2way_simd128( (__m256i*)(W+128), (__m256i*)V, N ); +// if ( work_restart[thrid].restart ) return 0; +// scrypt_core_2way_simd128( (__m256i*)(W+192), (__m256i*)V, N ); + + dintrlv_2x128( X, X+ 32, W, 1024 ); + dintrlv_2x128( X+ 64, X+ 96, W+ 64, 1024 ); + dintrlv_2x128( X+128, X+160, W+128, 1024 ); + dintrlv_2x128( X+192, X+224, W+192, 1024 ); +*/ + +/* + // AVX2 + intrlv_2x32( W, X , X+ 32, 1024 ); + intrlv_2x32( W+64, X+ 64, X+ 96, 1024 ); + intrlv_2x32( W+128, X+128, X+160, 1024 ); + intrlv_2x32( W+192, X+192, X+224, 1024 ); + + // working, deprecated, not up to data +// scrypt_core_simd128_2way_4buf( (uint64_t*)W, (uint64_t*)V, N ); + + // deprecated, not up to date +// scrypt_core_simd128_2way_3buf( (uint64_t*) W, (uint64_t*)V, N ); +// if ( work_restart[thrid].restart ) return 0; +// scrypt_core_simd128_2way( (uint64_t*)( W+192 ), (uint64_t*)V, N ); + + // working +// scrypt_core_simd128_2way_2buf( (uint64_t*) W, (uint64_t*)V, N ); +// if ( work_restart[thrid].restart ) return 0; +// scrypt_core_simd128_2way_2buf( (uint64_t*)( W+128 ), (uint64_t*)V, N ); + +// scrypt_core_simd128_2way( (uint64_t*) W, (uint64_t*)V, N ); +// if ( work_restart[thrid].restart ) return 0; +// scrypt_core_simd128_2way( (uint64_t*)( W+ 64 ), (uint64_t*)V, N ); +// if ( work_restart[thrid].restart ) return 0; +// scrypt_core_simd128_2way( (uint64_t*)( W+128 ), (uint64_t*)V, N ); +// if ( work_restart[thrid].restart ) return 0; +// scrypt_core_simd128_2way( (uint64_t*)( W+192 ), (uint64_t*)V, N ); - for (i = 0; i < 32; i++) - for (k = 0; k < 4; k++) - X[k * 32 + i] = W[4 * i + k]; + dintrlv_2x32( X, X+ 32, W, 1024 ); + dintrlv_2x32( X+ 64, X+ 96, W+ 64, 1024 ); + dintrlv_2x32( X+128, X+160, W+128, 1024 ); + dintrlv_2x32( X+192, X+224, W+192, 1024 ); +*/ + + // SSE2 + +/* + // SSE2 working + intrlv_4x32( W, X, X+ 32, X+ 64, X+ 96, 1024 ); + intrlv_4x32( W+128, X+128 , X+160, X+192, X+224, 1024 ); + scrypt_core_4way( (__m128i*) W, (__m128i*)V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_4way( (__m128i*)(W+128), (__m128i*)V, N ); + dintrlv_4x32( X, X+ 32, X+ 64, X+ 96, W, 1024 ); + dintrlv_4x32( X+128, X+160, X+192, X+224, W+128, 1024 ); +*/ - scrypt_core(X + 0 * 32, V, N); - scrypt_core(X + 1 * 32, V, N); - scrypt_core(X + 2 * 32, V, N); - scrypt_core(X + 3 * 32, V, N); +/* + // SSE2 + scrypt_core_simd128( X, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128( X+ 32, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128( X+ 64, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128( X+ 96, V, N ); + if ( work_restart[thrid].restart ) 
return 0; + scrypt_core_simd128( X+128, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128( X+160, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128( X+192, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128( X+224, V, N ); +*/ +/* + // SSE2 working + scrypt_core_simd128_2buf( X, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128_2buf( X+64, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128_2buf( X+128, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128_2buf( X+192, V, N ); +*/ + scrypt_core_simd128_3buf( X, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128_3buf( X+ 96, V, N ); if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128_2buf( X+192, V, N ); - for (i = 0; i < 32; i++) - for (k = 0; k < 4; k++) - W[4 * i + k] = X[k * 32 + i]; +/* + // SSE2 working + scrypt_core_simd128_4buf( X, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128_4buf( X+128, V, N ); +*/ - PBKDF2_SHA256_128_32_4way(tstate, ostate, W, W); - for (i = 0; i < 8; i++) - for (k = 0; k < 4; k++) - output[k * 8 + i] = W[4 * i + k]; + if ( work_restart[thrid].restart ) return 0; + + intrlv_8x32( W, X, X+32, X+64, X+96, X+128, X+160, X+192, X+224, 1024 ); + + PBKDF2_SHA256_128_32_8way( tstate, ostate, W, W ); + + dintrlv_8x32( output, output+ 8, output+16, output+24, + output+32, output+40, output+48, output+56, W, 256 ); return 1; } -#endif /* HAVE_SHA256_4WAY */ -#ifdef HAVE_SCRYPT_3WAY +#endif // AVX2 + +#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) -static int scrypt_1024_1_1_256_3way(const uint32_t *input, - uint32_t *output, uint32_t *midstate, unsigned char *scratchpad, int N, - int thrid ) +static int scrypt_N_1_1_256_16way( const uint32_t *input, uint32_t *output, + uint32_t *midstate, unsigned char *scratchpad, int N, int thrid ) { - uint32_t _ALIGN(64) tstate[3 * 8], ostate[3 * 8]; - uint32_t _ALIGN(64) X[3 * 32]; - uint32_t *V; - - V = (uint32_t *)(((uintptr_t)(scratchpad) + 63) & ~ (uintptr_t)(63)); + uint32_t _ALIGN(128) tstate[ 16*8 ]; + uint32_t _ALIGN(128) ostate[ 16*8 ]; + uint32_t _ALIGN(128) W[ 16*32 ]; + uint32_t _ALIGN(128) X[ 16*32 ]; + uint32_t *V = (uint32_t*)scratchpad; + + intrlv_16x32( W, input, input+ 20, input+ 40, input+ 60, + input+ 80, input+100, input+120, input+140, + input+160, input+180, input+200, input+220, + input+240, input+260, input+280, input+300, 640 ); + for ( int i = 0; i < 8; i++ ) + casti_m512i( tstate, i ) = _mm512_set1_epi32( midstate[i] ); - memcpy(tstate + 0, midstate, 32); - memcpy(tstate + 8, midstate, 32); - memcpy(tstate + 16, midstate, 32); + HMAC_SHA256_80_init_16way( W, tstate, ostate ); + PBKDF2_SHA256_80_128_16way( tstate, ostate, W, W ); - HMAC_SHA256_80_init(input + 0, tstate + 0, ostate + 0); - HMAC_SHA256_80_init(input + 20, tstate + 8, ostate + 8); - HMAC_SHA256_80_init(input + 40, tstate + 16, ostate + 16); + dintrlv_16x32( X, X+ 32, X+ 64, X+ 96, X+128, X+160, X+192, X+224, + X+256, X+288, X+320, X+352, X+384, X+416, X+448, X+480, + W, 1024 ); + + // SCRYPT CORE + + + // AVX512 +/* + // AVX512 16 way working + intrlv_16x32( W, X, X+32, X+64, X+96, X+128, X+160, X+192, X+224, + X+256, X+256+32, X+256+64, X+256+96, X+256+128, + X+256+160, X+256+192, X+256+224, 1024 ); + + scrypt_core_16way( (__m512i*)W , (__m512i*)V, N ); + + dintrlv_16x32( X, X+32, X+64, X+96, X+128, X+160, X+192, 
X+224, + X+256, X+256+32, X+256+64, X+256+96, X+256+128, + X+256+160, X+256+192, X+256+224, W, 1024 ); +*/ +/* + // AVX512 working + intrlv_4x32( W, X, X+ 32, X+ 64, X+ 96, 1024 ); + intrlv_4x32( W+128, X+128, X+160, X+192, X+224, 1024 ); + scrypt_core_simd128_4way( (__m128i*)W, (__m128i*)V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128_4way( (__m128i*)(W+128), (__m128i*)V, N ); + dintrlv_4x32( X, X+ 32, X+ 64, X+ 96, W, 1024 ); + dintrlv_4x32( X+128, X+160, X+192, X+224, W+128, 1024 ); +*/ +/* + // AVX512, not working, very slow + intrlv_4x128( W, X, X+ 32, X+ 64, X+ 96, 1024 ); + intrlv_4x128( W+128, X+128, X+160, X+192, X+224, 1024 ); + scrypt_core_4way_simd128( (__m512i*)W, (__m512i*)V, N ); if ( work_restart[thrid].restart ) return 0; + scrypt_core_4way_simd128( (__m512i*)(W+128), (__m512i*)V, N ); + dintrlv_4x128( X, X+ 32, X+ 64, X+ 96, W, 1024 ); + dintrlv_4x128( X+128, X+160, X+192, X+224, W+128, 1024 ); +*/ - PBKDF2_SHA256_80_128(tstate + 0, ostate + 0, input + 0, X + 0); - PBKDF2_SHA256_80_128(tstate + 8, ostate + 8, input + 20, X + 32); - PBKDF2_SHA256_80_128(tstate + 16, ostate + 16, input + 40, X + 64); + // AVX2 +/* + // AVX2 + // disable de/interleave for testing. + scrypt_core_8way( (__m256i*)W , (__m256i*)V, N ); +*/ + +/* + // AVX2 working + intrlv_2x128( W, X, X+ 32, 1024 ); + intrlv_2x128( W+ 64, X+ 64, X+ 96, 1024 ); + intrlv_2x128( W+128, X+128, X+160, 1024 ); + intrlv_2x128( W+192, X+192, X+224, 1024 ); + + // working +// scrypt_core_2way_simd128_3buf( (__m256i*) W, (__m256i*)V, N ); +// if ( work_restart[thrid].restart ) return 0; +// scrypt_core_2way_simd128( (__m256i*)(W+192), (__m256i*)V, N ); + + // working + scrypt_core_2way_simd128_2buf( (__m256i*) W, (__m256i*)V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_2way_simd128_2buf( (__m256i*)(W+128), (__m256i*)V, N ); + + // working +// scrypt_core_2way_simd128( (__m256i*) W, (__m256i*)V, N ); +// if ( work_restart[thrid].restart ) return 0; +// scrypt_core_2way_simd128( (__m256i*)(W+ 64), (__m256i*)V, N ); +// if ( work_restart[thrid].restart ) return 0; +// scrypt_core_2way_simd128( (__m256i*)(W+128), (__m256i*)V, N ); +// if ( work_restart[thrid].restart ) return 0; +// scrypt_core_2way_simd128( (__m256i*)(W+192), (__m256i*)V, N ); + + dintrlv_2x128( X, X+ 32, W, 1024 ); + dintrlv_2x128( X+ 64, X+ 96, W+ 64, 1024 ); + dintrlv_2x128( X+128, X+160, W+128, 1024 ); + dintrlv_2x128( X+192, X+224, W+192, 1024 ); +*/ + +/* + // AVX2 + intrlv_2x32( W, X , X+ 32, 1024 ); + intrlv_2x32( W+64, X+ 64, X+ 96, 1024 ); + intrlv_2x32( W+128, X+128, X+160, 1024 ); + intrlv_2x32( W+192, X+192, X+224, 1024 ); + + // working, deprecated, not up to data +// scrypt_core_simd128_2way_4buf( (uint64_t*)W, (uint64_t*)V, N ); + + // deprecated, not up to date +// scrypt_core_simd128_2way_3buf( (uint64_t*) W, (uint64_t*)V, N ); +// if ( work_restart[thrid].restart ) return 0; +// scrypt_core_simd128_2way( (uint64_t*)( W+192 ), (uint64_t*)V, N ); + + // working +// scrypt_core_simd128_2way_2buf( (uint64_t*) W, (uint64_t*)V, N ); +// if ( work_restart[thrid].restart ) return 0; +// scrypt_core_simd128_2way_2buf( (uint64_t*)( W+128 ), (uint64_t*)V, N ); + +// scrypt_core_simd128_2way( (uint64_t*) W, (uint64_t*)V, N ); +// if ( work_restart[thrid].restart ) return 0; +// scrypt_core_simd128_2way( (uint64_t*)( W+ 64 ), (uint64_t*)V, N ); +// if ( work_restart[thrid].restart ) return 0; +// scrypt_core_simd128_2way( (uint64_t*)( W+128 ), (uint64_t*)V, N ); +// if ( work_restart[thrid].restart ) 
return 0; +// scrypt_core_simd128_2way( (uint64_t*)( W+192 ), (uint64_t*)V, N ); + + dintrlv_2x32( X, X+ 32, W, 1024 ); + dintrlv_2x32( X+ 64, X+ 96, W+ 64, 1024 ); + dintrlv_2x32( X+128, X+160, W+128, 1024 ); + dintrlv_2x32( X+192, X+224, W+192, 1024 ); +*/ + + // SSE2 + +/* + // SSE2 working + intrlv_4x32( W, X, X+ 32, X+ 64, X+ 96, 1024 ); + intrlv_4x32( W+128, X+128 , X+160, X+192, X+224, 1024 ); + scrypt_core_4way( (__m128i*) W, (__m128i*)V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_4way( (__m128i*)(W+128), (__m128i*)V, N ); + dintrlv_4x32( X, X+ 32, X+ 64, X+ 96, W, 1024 ); + dintrlv_4x32( X+128, X+160, X+192, X+224, W+128, 1024 ); +*/ +/* + // SSE2 + scrypt_core_simd128( X, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128( X+ 32, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128( X+ 64, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128( X+ 96, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128( X+128, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128( X+160, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128( X+192, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128( X+224, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128( X+256, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128( X+288, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128( X+320, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128( X+352, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128( X+384, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128( X+416, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128( X+448, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128( X+480, V, N ); +*/ +/* + // SSE2 working + scrypt_core_simd128_2buf( X, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128_2buf( X+64, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128_2buf( X+128, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128_2buf( X+192, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128_2buf( X+256, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128_2buf( X+320, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128_2buf( X+384, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128_2buf( X+448, V, N ); +*/ + + scrypt_core_simd128_3buf( X, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128_3buf( X+ 96, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128_2buf( X+192, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128_3buf( X+256, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128_3buf( X+352, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128_2buf( X+448, V, N ); + +/* + // SSE2 working + scrypt_core_simd128_4buf( X, V, N ); if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128_4buf( X+128, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128_4buf( X+256, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128_4buf( X+384, V, N ); +*/ +/* + scrypt_core_3way( X, V, N 
); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_3way( X+ 96, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128_2buf( X+192, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_3way( X+256, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_3way( X+352, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128_2buf( X+448, V, N ); +*/ - scrypt_core_3way(X, V, N); if ( work_restart[thrid].restart ) return 0; - PBKDF2_SHA256_128_32(tstate + 0, ostate + 0, X + 0, output + 0); - PBKDF2_SHA256_128_32(tstate + 8, ostate + 8, X + 32, output + 8); - PBKDF2_SHA256_128_32(tstate + 16, ostate + 16, X + 64, output + 16); + intrlv_16x32( W, X, X+ 32, X+ 64, X+ 96, X+128, X+160, X+192, X+224, + X+256, X+288, X+320, X+352, X+384, X+416, X+448, X+480, + 1024 ); + + PBKDF2_SHA256_128_32_16way( tstate, ostate, W, W ); + + dintrlv_16x32( output, output+ 8, output+ 16, output+ 24, + output+ 32, output+ 40, output+ 48, output+ 56, + output+ 64, output+ 72, output+ 80, output+ 88, + output+ 96, output+104, output+112, output+120, W, 256 ); return 1; } -#ifdef HAVE_SHA256_4WAY -static bool scrypt_1024_1_1_256_12way(const uint32_t *input, - uint32_t *output, uint32_t *midstate, unsigned char *scratchpad, int N, - int thrid ) -{ - uint32_t _ALIGN(128) tstate[12 * 8]; - uint32_t _ALIGN(128) ostate[12 * 8]; - uint32_t _ALIGN(128) W[12 * 32]; - uint32_t _ALIGN(128) X[12 * 32]; - uint32_t *V; - int i, j, k; - - V = (uint32_t *)(((uintptr_t)(scratchpad) + 63) & ~ (uintptr_t)(63)); - - for (j = 0; j < 3; j++) - for (i = 0; i < 20; i++) - for (k = 0; k < 4; k++) - W[128 * j + 4 * i + k] = input[80 * j + k * 20 + i]; - for (j = 0; j < 3; j++) - for (i = 0; i < 8; i++) - for (k = 0; k < 4; k++) - tstate[32 * j + 4 * i + k] = midstate[i]; +#endif // AVX512 - HMAC_SHA256_80_init_4way(W + 0, tstate + 0, ostate + 0); - HMAC_SHA256_80_init_4way(W + 128, tstate + 32, ostate + 32); - HMAC_SHA256_80_init_4way(W + 256, tstate + 64, ostate + 64); +#if defined(__SHA__) +static int scrypt_N_1_1_256_4way_sha( const uint32_t *input, uint32_t *output, + uint32_t *midstate, unsigned char *scratchpad, int N, int thrid ) +{ + uint32_t _ALIGN(128) tstate[4 * 8]; + uint32_t _ALIGN(128) ostate[4 * 8]; + uint32_t _ALIGN(128) W[4 * 32]; + uint32_t *V = (uint32_t*)scratchpad; + + memcpy( tstate, midstate, 32 ); + memcpy( tstate+ 8, midstate, 32 ); + memcpy( tstate+16, midstate, 32 ); + memcpy( tstate+24, midstate, 32 ); + + HMAC_SHA256_80_init( input, tstate, ostate ); + PBKDF2_SHA256_80_128( tstate, ostate, input, W ); + + HMAC_SHA256_80_init( input +20, tstate+ 8, ostate+ 8 ); + PBKDF2_SHA256_80_128( tstate+ 8, ostate+ 8, input +20, W+32 ); + + HMAC_SHA256_80_init( input +40, tstate+16, ostate+16 ); + PBKDF2_SHA256_80_128( tstate+16, ostate+16, input +40, W+64 ); + + HMAC_SHA256_80_init( input +60, tstate+24, ostate+24 ); + PBKDF2_SHA256_80_128( tstate+24, ostate+24, input +60, W+96 ); + +/* + // Working Linear single threaded SIMD + scrypt_core_simd128( W, V, N ); if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128( W+32, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128( W+64, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128( W+96, V, N ); +*/ - PBKDF2_SHA256_80_128_4way(tstate + 0, ostate + 0, W + 0, W + 0); - PBKDF2_SHA256_80_128_4way(tstate + 32, ostate + 32, W + 128, W + 128); - PBKDF2_SHA256_80_128_4way(tstate + 64, ostate + 64, W + 256, W + 256); - + // working, double 
buffered linear simd + scrypt_core_simd128_2buf( W, V, N ); if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128_2buf( W+64, V, N ); - for (j = 0; j < 3; j++) - for (i = 0; i < 32; i++) - for (k = 0; k < 4; k++) - X[128 * j + k * 32 + i] = W[128 * j + 4 * i + k]; +/* + scrypt_core_simd128_3buf( W, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128( W+96, V, N ); +*/ - scrypt_core_3way(X + 0 * 96, V, N); - scrypt_core_3way(X + 1 * 96, V, N); - scrypt_core_3way(X + 2 * 96, V, N); - scrypt_core_3way(X + 3 * 96, V, N); + // working +// scrypt_core_simd128_4buf( W, V, N ); if ( work_restart[thrid].restart ) return 0; - for (j = 0; j < 3; j++) - for (i = 0; i < 32; i++) - for (k = 0; k < 4; k++) - W[128 * j + 4 * i + k] = X[128 * j + k * 32 + i]; + PBKDF2_SHA256_128_32( tstate, ostate, W, output ); - PBKDF2_SHA256_128_32_4way(tstate + 0, ostate + 0, W + 0, W + 0); - PBKDF2_SHA256_128_32_4way(tstate + 32, ostate + 32, W + 128, W + 128); - PBKDF2_SHA256_128_32_4way(tstate + 64, ostate + 64, W + 256, W + 256); + PBKDF2_SHA256_128_32( tstate+ 8, ostate+ 8, W+32, output+ 8 ); - for (j = 0; j < 3; j++) - for (i = 0; i < 8; i++) - for (k = 0; k < 4; k++) - output[32 * j + k * 8 + i] = W[128 * j + 4 * i + k]; + PBKDF2_SHA256_128_32( tstate+16, ostate+16, W+64, output+16 ); + + PBKDF2_SHA256_128_32( tstate+24, ostate+24, W+96, output+24 ); return 1; } -#endif /* HAVE_SHA256_4WAY */ -#endif /* HAVE_SCRYPT_3WAY */ +#else -#ifdef HAVE_SCRYPT_6WAY -static int scrypt_1024_1_1_256_24way( const uint32_t *input, - uint32_t *output, uint32_t *midstate, - unsigned char *scratchpad, int N, int thrid ) +#ifdef HAVE_SHA256_4WAY +static int scrypt_N_1_1_256_4way( const uint32_t *input, uint32_t *output, + uint32_t *midstate, unsigned char *scratchpad, int N, int thrid ) { - uint32_t _ALIGN(128) tstate[24 * 8]; - uint32_t _ALIGN(128) ostate[24 * 8]; - uint32_t _ALIGN(128) W[24 * 32]; - uint32_t _ALIGN(128) X[24 * 32]; - uint32_t *V; - int i, j, k; - - V = (uint32_t *)( ( (uintptr_t)(scratchpad) + 63 ) & ~ (uintptr_t)(63) ); - - for ( j = 0; j < 3; j++ ) - for ( i = 0; i < 20; i++ ) - for ( k = 0; k < 8; k++ ) - W[8 * 32 * j + 8 * i + k] = input[8 * 20 * j + k * 20 + i]; + uint32_t _ALIGN(128) tstate[4 * 8]; + uint32_t _ALIGN(128) ostate[4 * 8]; + uint32_t _ALIGN(128) W[4 * 32]; + uint32_t _ALIGN(128) X[4 * 32]; + uint32_t *V = (uint32_t*)scratchpad; - for ( j = 0; j < 3; j++ ) - for ( i = 0; i < 8; i++ ) - for ( k = 0; k < 8; k++ ) - tstate[8 * 8 * j + 8 * i + k] = midstate[i]; + intrlv_4x32( W, input, input+20, input+40, input+60, 640 ); + for ( int i = 0; i < 8; i++ ) + casti_m128i( tstate, i ) = _mm_set1_epi32( midstate[i] ); - HMAC_SHA256_80_init_8way( W + 0, tstate + 0, ostate + 0 ); - HMAC_SHA256_80_init_8way( W + 256, tstate + 64, ostate + 64 ); - HMAC_SHA256_80_init_8way( W + 512, tstate + 128, ostate + 128 ); + HMAC_SHA256_80_init_4way(W, tstate, ostate); + PBKDF2_SHA256_80_128_4way(tstate, ostate, W, W); + + dintrlv_4x32( X, X+32, X+64, X+96, W, 1024 ); + +////// SCRYPT_CORE - if ( work_restart[thrid].restart ) return 0; - PBKDF2_SHA256_80_128_8way( tstate + 0, ostate + 0, W + 0, W + 0 ); - PBKDF2_SHA256_80_128_8way( tstate + 64, ostate + 64, W + 256, W + 256 ); - PBKDF2_SHA256_80_128_8way( tstate + 128, ostate + 128, W + 512, W + 512 ); + // working, simple 4 way parallel, best for scrypt +// scrypt_core_4way( (__m128i*)W, (__m128i*)V, N ); +/* + // Working Linear single threaded SIMD + scrypt_core_simd128( X, V, N ); if ( work_restart[thrid].restart ) return 0; + 
scrypt_core_simd128( X+32, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128( X+64, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128( X+96, V, N ); +*/ + + // working, double buffered linear simd, best for n2 + scrypt_core_simd128_2buf( X, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128_2buf( X+64, V, N ); + +/* + scrypt_core_simd128_3buf( X, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128( X+96, V, N ); +*/ + + // working +// scrypt_core_simd128_4buf( X, V, N ); - for ( j = 0; j < 3; j++ ) - for ( i = 0; i < 32; i++ ) - for ( k = 0; k < 8; k++ ) - X[8 * 32 * j + k * 32 + i] = W[8 * 32 * j + 8 * i + k]; - - scrypt_core_6way( X + 0 * 32, V, N ); - scrypt_core_6way( X + 6 * 32, V, N ); - if ( work_restart[thrid].restart ) return 0; +/* + // original + scrypt_core(X + 0 * 32, V, N); + scrypt_core(X + 1 * 32, V, N); + scrypt_core(X + 2 * 32, V, N); + scrypt_core(X + 3 * 32, V, N); +*/ - scrypt_core_6way( X + 12 * 32, V, N ); - scrypt_core_6way( X + 18 * 32, V, N ); +//////////////////////////////// if ( work_restart[thrid].restart ) return 0; - - for ( j = 0; j < 3; j++ ) - for ( i = 0; i < 32; i++ ) - for ( k = 0; k < 8; k++ ) - W[8 * 32 * j + 8 * i + k] = X[8 * 32 * j + k * 32 + i]; - PBKDF2_SHA256_128_32_8way( tstate + 0, ostate + 0, W + 0, W + 0 ); - PBKDF2_SHA256_128_32_8way( tstate + 64, ostate + 64, W + 256, W + 256 ); - PBKDF2_SHA256_128_32_8way( tstate + 128, ostate + 128, W + 512, W + 512 ); + intrlv_4x32( W, X, X+32, X+64, X+96, 1024 ); + + PBKDF2_SHA256_128_32_4way(tstate, ostate, W, W); - for ( j = 0; j < 3; j++ ) - for ( i = 0; i < 8; i++ ) - for ( k = 0; k < 8; k++ ) - output[8 * 8 * j + k * 8 + i] = W[8 * 32 * j + 8 * i + k]; + dintrlv_4x32( output, output+8, output+16, output+24, W, 256 ); return 1; } -#endif /* HAVE_SCRYPT_6WAY */ +#endif /* HAVE_SHA256_4WAY */ + +#endif // SHA extern int scanhash_scrypt( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ) @@ -660,67 +1250,58 @@ extern int scanhash_scrypt( struct work *work, uint32_t max_nonce, uint32_t data[SCRYPT_MAX_WAYS * 20], hash[SCRYPT_MAX_WAYS * 8]; uint32_t midstate[8]; uint32_t n = pdata[19] - 1; - int thr_id = mythr->id; // thr_id arg is deprecated - int throughput = scrypt_best_throughput(); + int thr_id = mythr->id; + int throughput = scrypt_throughput; int i; volatile uint8_t *restart = &(work_restart[thr_id].restart); -#ifdef HAVE_SHA256_4WAY - if (sha256_use_4way()) - throughput *= 4; -#endif - -// applog(LOG_INFO,"Scrypt thoughput %d",throughput); + for ( i = 0; i < throughput; i++ ) + memcpy( data + i * 20, pdata, 80 ); + + sha256_transform_le( midstate, data, sha256_initial_state ); - for (i = 0; i < throughput; i++) - memcpy(data + i * 20, pdata, 80); - - sha256_init(midstate); - sha256_transform(midstate, data, 0); - do { bool rc = true; - for (i = 0; i < throughput; i++) - data[i * 20 + 19] = ++n; - -#if defined(HAVE_SHA256_4WAY) - if (throughput == 4) - rc = scrypt_1024_1_1_256_4way(data, hash, midstate, - scratchbuf, scratchbuf_size, thr_id ); - else -#endif -#if defined(HAVE_SCRYPT_3WAY) && defined(HAVE_SHA256_4WAY) - if (throughput == 12) - rc = scrypt_1024_1_1_256_12way(data, hash, midstate, - scratchbuf, scratchbuf_size, thr_id ); - else + for ( i = 0; i < throughput; i++ ) data[ i*20 + 19 ] = ++n; + +#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) + if ( throughput == 16 ) + rc = 
scrypt_N_1_1_256_16way( data, hash, midstate, scratchbuf, + opt_param_n, thr_id ); + else #endif -#if defined(HAVE_SCRYPT_6WAY) - if (throughput == 24) - rc = scrypt_1024_1_1_256_24way(data, hash, midstate, - scratchbuf, scratchbuf_size, thr_id ); - else +#if defined(__AVX2__) + if ( throughput == 8 ) + rc = scrypt_N_1_1_256_8way( data, hash, midstate, scratchbuf, + opt_param_n, thr_id ); + else #endif -#if defined(HAVE_SCRYPT_3WAY) - if (throughput == 3) - rc = scrypt_1024_1_1_256_3way(data, hash, midstate, - scratchbuf, scratchbuf_size, thr_id ); - else + if ( throughput == 4 ) +#if defined(__SHA__) + rc = scrypt_N_1_1_256_4way_sha( data, hash, midstate, scratchbuf, + opt_param_n, thr_id ); +#else + rc = scrypt_N_1_1_256_4way( data, hash, midstate, scratchbuf, + opt_param_n, thr_id ); #endif - rc = scrypt_1024_1_1_256(data, hash, midstate, scratchbuf, - scratchbuf_size, thr_id ); - + else + rc = scrypt_N_1_1_256( data, hash, midstate, scratchbuf, + opt_param_n, thr_id ); + if ( rc ) for ( i = 0; i < throughput; i++ ) { - if ( unlikely( valid_hash( hash + i * 8, ptarget ) ) ) + if ( unlikely( valid_hash( hash + i*8, ptarget ) && !opt_benchmark ) ) { - pdata[19] = data[i * 20 + 19]; +// applog( LOG_INFO, "Thread %d, Lane %d", thr_id,i ); + pdata[19] = data[i * 20 + 19]; submit_solution( work, hash + i * 8, mythr ); - } + } } - } while ( likely( ( n < ( max_nonce - throughput ) ) && !(*restart) ) ); + + + } while ( likely( ( n < ( max_nonce - throughput ) ) && !(*restart) ) ); *hashes_done = n - pdata[19]; pdata[19] = n; @@ -729,28 +1310,51 @@ extern int scanhash_scrypt( struct work *work, uint32_t max_nonce, bool scrypt_miner_thread_init( int thr_id ) { - scratchbuf = scrypt_buffer_alloc( scratchbuf_size ); - if ( scratchbuf ) - return true; - applog( LOG_ERR, "Thread %u: Scrypt buffer allocation failed", thr_id ); - return false; + scratchbuf = _mm_malloc( scratchbuf_size, 128 ); + if ( scratchbuf ) + return true; + applog( LOG_ERR, "Thread %u: Scrypt buffer allocation failed", thr_id ); + return false; } bool register_scrypt_algo( algo_gate_t* gate ) { - gate->optimizations = SSE2_OPT | AVX2_OPT; - gate->miner_thread_init =(void*)&scrypt_miner_thread_init; - gate->scanhash = (void*)&scanhash_scrypt; - opt_target_factor = 65536.0; - - if ( !opt_param_n ) - { - opt_param_n = 1024; - scratchbuf_size = 1024; - } - else - scratchbuf_size = opt_param_n; - applog(LOG_INFO,"Scrypt paramaters: N= %d, R= 1.", opt_param_n ); - return true; +#if defined(__SHA__) + gate->optimizations = SSE2_OPT | SHA_OPT; +#else + gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT; +#endif + gate->miner_thread_init =(void*)&scrypt_miner_thread_init; + gate->scanhash = (void*)&scanhash_scrypt; + opt_target_factor = 65536.0; + opt_param_n = opt_param_n ? 
opt_param_n : 1024; + applog( LOG_INFO,"Scrypt paramaters: N= %d, R= 1", opt_param_n ); + +#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) + scrypt_throughput = 16; + scratchbuf_size = opt_param_n * 3 * 128; // 3 buf +#elif defined(__SHA__) + scrypt_throughput = 4; + scratchbuf_size = opt_param_n * 2 * 128; // 2 buf +#elif defined(__AVX2__) + scrypt_throughput = 8; + scratchbuf_size = opt_param_n * 3 * 128; // 3 buf +#else + scrypt_throughput = 4; + scratchbuf_size = opt_param_n * 2 * 128; // 2 buf +#endif + + char t_units[4] = {0}; + char d_units[4] = {0}; + double t_size = (double)scratchbuf_size; + double d_size = (double)scratchbuf_size * opt_n_threads; + + format_number_si( &t_size, t_units ); + format_number_si( &d_size, d_units ); + + applog( LOG_INFO,"Throughput %d/thr, Buffer %.0f %siB/thr, Total %.0f %siB\n", + scrypt_throughput, t_size, t_units, d_size, d_units ); + + return true; }; diff --git a/algo/sha/hmac-sha256-hash.c b/algo/sha/hmac-sha256-hash.c index e09a4c2a..2cdf9c82 100644 --- a/algo/sha/hmac-sha256-hash.c +++ b/algo/sha/hmac-sha256-hash.c @@ -39,10 +39,10 @@ void SHA256_Buf( const void * in, size_t len, uint8_t digest[32] ) { - sph_sha256_context ctx; - sph_sha256_init( &ctx ); - sph_sha256( &ctx, in, len ); - sph_sha256_close( &ctx, digest ); + sha256_context ctx; + sha256_ctx_init( &ctx ); + sha256_update( &ctx, in, len ); + sha256_final( &ctx, digest ); } /** @@ -64,7 +64,7 @@ HMAC_SHA256_Buf( const void *K, size_t Klen, const void *in, size_t len, void HMAC_SHA256_Init( HMAC_SHA256_CTX *ctx, const void *_K, size_t Klen ) { - unsigned char pad[64]; + unsigned char pad[64] __attribute__ ((aligned (64))); unsigned char khash[32]; const unsigned char * K = _K; size_t i; @@ -72,29 +72,28 @@ HMAC_SHA256_Init( HMAC_SHA256_CTX *ctx, const void *_K, size_t Klen ) /* If Klen > 64, the key is really SHA256(K). */ if ( Klen > 64 ) { - sph_sha256_init( &ctx->ictx ); - sph_sha256( &ctx->ictx, K, Klen ); - sph_sha256_close( &ctx->ictx, khash ); - + sha256_ctx_init( &ctx->ictx ); + sha256_update( &ctx->ictx, K, Klen ); + sha256_final( &ctx->ictx, khash ); K = khash; Klen = 32; } /* Inner SHA256 operation is SHA256(K xor [block of 0x36] || data). */ - sph_sha256_init( &ctx->ictx ); + sha256_ctx_init( &ctx->ictx ); for ( i = 0; i < Klen; i++ ) pad[i] = K[i] ^ 0x36; memset( pad + Klen, 0x36, 64 - Klen ); - sph_sha256( &ctx->ictx, pad, 64 ); + sha256_update( &ctx->ictx, pad, 64 ); /* Outer SHA256 operation is SHA256(K xor [block of 0x5c] || hash). */ - sph_sha256_init( &ctx->octx ); + sha256_ctx_init( &ctx->octx ); for ( i = 0; i < Klen; i++ ) pad[i] = K[i] ^ 0x5c; memset( pad + Klen, 0x5c, 64 - Klen ); - sph_sha256( &ctx->octx, pad, 64 ); + sha256_update( &ctx->octx, pad, 64 ); } /* Add bytes to the HMAC-SHA256 operation. */ @@ -102,18 +101,17 @@ void HMAC_SHA256_Update( HMAC_SHA256_CTX *ctx, const void *in, size_t len ) { /* Feed data to the inner SHA256 operation. */ - sph_sha256( &ctx->ictx, in, len ); + sha256_update( &ctx->ictx, in, len ); } /* Finish an HMAC-SHA256 operation. 
*/ void -HMAC_SHA256_Final( unsigned char digest[32], HMAC_SHA256_CTX *ctx ) +HMAC_SHA256_Final( void *digest, HMAC_SHA256_CTX *ctx ) { - unsigned char ihash[32]; - - sph_sha256_close( &ctx->ictx, ihash ); - sph_sha256( &ctx->octx, ihash, 32 ); - sph_sha256_close( &ctx->octx, digest ); + uint32_t ihash[8] __attribute__ ((aligned (32))); + sha256_final( &ctx->ictx, ihash ); + sha256_update( &ctx->octx, ihash, 32 ); + sha256_final( &ctx->octx, digest ); } /** @@ -126,8 +124,10 @@ PBKDF2_SHA256( const uint8_t *passwd, size_t passwdlen, const uint8_t *salt, size_t saltlen, uint64_t c, uint8_t *buf, size_t dkLen ) { HMAC_SHA256_CTX PShctx, hctx; - uint8_t _ALIGN(128) T[32]; - uint8_t _ALIGN(128) U[32]; + uint64_t _ALIGN(128) T[4]; + uint64_t _ALIGN(128) U[4]; +// uint8_t _ALIGN(128) T[32]; +// uint8_t _ALIGN(128) U[32]; uint32_t ivec; size_t i, clen; uint64_t j; @@ -163,10 +163,10 @@ PBKDF2_SHA256( const uint8_t *passwd, size_t passwdlen, const uint8_t *salt, // _mm_xor_si128( ((__m128i*)T)[0], ((__m128i*)U)[0] ); // _mm_xor_si128( ((__m128i*)T)[1], ((__m128i*)U)[1] ); -// for ( k = 0; k < 4; k++ ) T[k] ^= U[k]; + for ( k = 0; k < 4; k++ ) T[k] ^= U[k]; - for ( k = 0; k < 32; k++ ) - T[k] ^= U[k]; +// for ( k = 0; k < 32; k++ ) +// T[k] ^= U[k]; } /* Copy as many bytes as necessary into buf. */ diff --git a/algo/sha/hmac-sha256-hash.h b/algo/sha/hmac-sha256-hash.h index a735c53a..7a281df8 100644 --- a/algo/sha/hmac-sha256-hash.h +++ b/algo/sha/hmac-sha256-hash.h @@ -31,18 +31,18 @@ #include #include -#include "sph_sha2.h" +#include "sha256-hash.h" typedef struct HMAC_SHA256Context { - sph_sha256_context ictx; - sph_sha256_context octx; + sha256_context ictx; + sha256_context octx; } HMAC_SHA256_CTX; void SHA256_Buf( const void *, size_t len, uint8_t digest[32] ); void HMAC_SHA256_Init( HMAC_SHA256_CTX *, const void *, size_t ); void HMAC_SHA256_Update( HMAC_SHA256_CTX *, const void *, size_t ); -void HMAC_SHA256_Final( unsigned char [32], HMAC_SHA256_CTX * ); +void HMAC_SHA256_Final( void*, HMAC_SHA256_CTX * ); void HMAC_SHA256_Buf( const void *, size_t Klen, const void *, size_t len, uint8_t digest[32] ); diff --git a/algo/sha/sha-hash-4way.h b/algo/sha/sha-hash-4way.h index f9505d12..7b6618c4 100644 --- a/algo/sha/sha-hash-4way.h +++ b/algo/sha/sha-hash-4way.h @@ -59,7 +59,9 @@ void sha256_4way_update( sha256_4way_context *sc, const void *data, size_t len ); void sha256_4way_close( sha256_4way_context *sc, void *dst ); void sha256_4way_full( void *dst, const void *data, size_t len ); -void sha256_4way_transform( __m128i *state_out, const __m128i *data, +void sha256_4way_transform_le( __m128i *state_out, const __m128i *data, + const __m128i *state_in ); +void sha256_4way_transform_be( __m128i *state_out, const __m128i *data, const __m128i *state_in ); #endif // SSE2 @@ -79,8 +81,10 @@ void sha256_8way_init( sha256_8way_context *sc ); void sha256_8way_update( sha256_8way_context *sc, const void *data, size_t len ); void sha256_8way_close( sha256_8way_context *sc, void *dst ); void sha256_8way_full( void *dst, const void *data, size_t len ); -void sha256_8way_transform( __m256i *state_out, const __m256i *data, - const __m256i *state_in ); +void sha256_8way_transform_le( __m256i *state_out, const __m256i *data, + const __m256i *state_in ); +void sha256_8way_transform_be( __m256i *state_out, const __m256i *data, + const __m256i *state_in ); #endif // AVX2 @@ -99,7 +103,9 @@ void sha256_16way_init( sha256_16way_context *sc ); void sha256_16way_update( sha256_16way_context *sc, const void *data, 
size_t len ); void sha256_16way_close( sha256_16way_context *sc, void *dst ); void sha256_16way_full( void *dst, const void *data, size_t len ); -void sha256_16way_transform( __m512i *state_out, const __m512i *data, +void sha256_16way_transform_le( __m512i *state_out, const __m512i *data, + const __m512i *state_in ); +void sha256_16way_transform_be( __m512i *state_out, const __m512i *data, const __m512i *state_in ); void sha256_16way_prehash_3rounds( __m512i *state_mid, const __m512i *W, const __m512i *state_in ); diff --git a/algo/sha/sha2.c b/algo/sha/sha2.c index 7eb40673..2a229bf6 100644 --- a/algo/sha/sha2.c +++ b/algo/sha/sha2.c @@ -180,6 +180,7 @@ static const uint32_t sha256d_hash1[16] = { 0x00000000, 0x00000000, 0x00000000, 0x00000100 }; +// this performs the entire hash all over again, why? static void sha256d_80_swap(uint32_t *hash, const uint32_t *data) { uint32_t S[16]; @@ -195,6 +196,7 @@ static void sha256d_80_swap(uint32_t *hash, const uint32_t *data) hash[i] = swab32(hash[i]); } +/* #if defined (__SHA__) #include "algo/sha/sph_sha2.h" @@ -241,6 +243,7 @@ void sha256d(unsigned char *hash, const unsigned char *data, int len) } #endif +*/ static inline void sha256d_preextend(uint32_t *W) { @@ -653,6 +656,7 @@ int scanhash_sha256d( struct work *work, return 0; } +/* int scanhash_SHA256d( struct work *work, const uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ) { @@ -682,13 +686,13 @@ int scanhash_SHA256d( struct work *work, const uint32_t max_nonce, pdata[19] = n; return 0; } - +*/ bool register_sha256d_algo( algo_gate_t* gate ) { gate->optimizations = SSE2_OPT | AVX2_OPT; gate->scanhash = (void*)&scanhash_sha256d; - gate->hash = (void*)&sha256d; +// gate->hash = (void*)&sha256d; return true; }; diff --git a/algo/sha/sha256-hash-2way-ni.c b/algo/sha/sha256-hash-2way-ni.c index f169b63f..7fc64ca3 100644 --- a/algo/sha/sha256-hash-2way-ni.c +++ b/algo/sha/sha256-hash-2way-ni.c @@ -7,9 +7,9 @@ #if defined(__SHA__) -#include "sha256-hash-opt.h" +#include "sha256-hash.h" -void sha256_ni2way_transform( uint32_t *out_X, uint32_t*out_Y, +void sha256_ni2way_transform_le( uint32_t *out_X, uint32_t*out_Y, const void *msg_X, const void *msg_Y, const uint32_t *in_X, const uint32_t *in_Y ) { @@ -342,4 +342,348 @@ void sha256_ni2way_transform( uint32_t *out_X, uint32_t*out_Y, _mm_store_si128((__m128i*) &out_Y[4], STATE1_Y); } +void sha256_ni2way_transform_be( uint32_t *out_X, uint32_t*out_Y, + const void *msg_X, const void *msg_Y, + const uint32_t *in_X, const uint32_t *in_Y ) +{ + __m128i STATE0_X, STATE1_X, STATE0_Y, STATE1_Y; + __m128i MSG_X, MSG_Y, TMP_X, TMP_Y, MASK; + __m128i TMSG0_X, TMSG1_X, TMSG2_X, TMSG3_X; + __m128i TMSG0_Y, TMSG1_Y, TMSG2_Y, TMSG3_Y; + __m128i ABEF_SAVE_X, CDGH_SAVE_X, ABEF_SAVE_Y, CDGH_SAVE_Y; + + // Load initial values + TMP_X = _mm_load_si128((__m128i*) &in_X[0]); + STATE1_X = _mm_load_si128((__m128i*) &in_X[4]); + TMP_Y = _mm_load_si128((__m128i*) &in_Y[0]); + STATE1_Y = _mm_load_si128((__m128i*) &in_Y[4]); + MASK = _mm_set_epi64x(0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL); + + TMP_X = _mm_shuffle_epi32(TMP_X, 0xB1); // CDAB + TMP_Y = _mm_shuffle_epi32(TMP_Y, 0xB1); // CDAB + STATE1_X = _mm_shuffle_epi32(STATE1_X, 0x1B); // EFGH + STATE1_Y = _mm_shuffle_epi32(STATE1_Y, 0x1B); // EFGH + STATE0_X = _mm_alignr_epi8(TMP_X, STATE1_X, 8); // ABEF + STATE0_Y = _mm_alignr_epi8(TMP_Y, STATE1_Y, 8); // ABEF + STATE1_X = _mm_blend_epi16(STATE1_X, TMP_X, 0xF0); // CDGH + STATE1_Y = _mm_blend_epi16(STATE1_Y, TMP_Y, 0xF0); // CDGH + + // Save current 
hash + ABEF_SAVE_X = STATE0_X; + ABEF_SAVE_Y = STATE0_Y; + CDGH_SAVE_X = STATE1_X; + CDGH_SAVE_Y = STATE1_Y; + + // Rounds 0-3 + TMSG0_X = _mm_load_si128((const __m128i*) (msg_X)); + TMSG0_Y = _mm_load_si128((const __m128i*) (msg_Y)); + TMP_X = _mm_set_epi64x(0xE9B5DBA5B5C0FBCFULL, 0x71374491428A2F98ULL); + TMSG0_X = _mm_shuffle_epi8( TMSG0_X, MASK ); + TMSG0_Y = _mm_shuffle_epi8( TMSG0_Y, MASK ); + MSG_X = _mm_add_epi32( TMSG0_X, TMP_X ); + MSG_Y = _mm_add_epi32( TMSG0_Y, TMP_X ); + STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); + STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); + MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); + MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); + STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); + STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); + + // Rounds 4-7 + TMSG1_X = _mm_load_si128((const __m128i*) (msg_X+16)); + TMSG1_Y = _mm_load_si128((const __m128i*) (msg_Y+16)); + TMP_X = _mm_set_epi64x(0xAB1C5ED5923F82A4ULL, 0x59F111F13956C25BULL); + TMSG1_X = _mm_shuffle_epi8( TMSG1_X, MASK ); + TMSG1_Y = _mm_shuffle_epi8( TMSG1_Y, MASK ); + MSG_X = _mm_add_epi32(TMSG1_X, TMP_X ); + MSG_Y = _mm_add_epi32(TMSG1_Y, TMP_X ); + STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); + STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); + MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); + MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); + STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); + STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); + TMSG0_X = _mm_sha256msg1_epu32(TMSG0_X, TMSG1_X); + TMSG0_Y = _mm_sha256msg1_epu32(TMSG0_Y, TMSG1_Y); + + // Rounds 8-11 + TMSG2_X = _mm_load_si128((const __m128i*) (msg_X+32)); + TMSG2_Y = _mm_load_si128((const __m128i*) (msg_Y+32)); + TMP_X = _mm_set_epi64x(0x550C7DC3243185BEULL, 0x12835B01D807AA98ULL); + TMSG2_X = _mm_shuffle_epi8( TMSG2_X, MASK ); + TMSG2_Y = _mm_shuffle_epi8( TMSG2_Y, MASK ); + MSG_X = _mm_add_epi32(TMSG2_X, TMP_X ); + MSG_Y = _mm_add_epi32(TMSG2_Y, TMP_X ); + STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); + STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); + MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); + MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); + STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); + STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); + TMSG1_X = _mm_sha256msg1_epu32(TMSG1_X, TMSG2_X); + TMSG1_Y = _mm_sha256msg1_epu32(TMSG1_Y, TMSG2_Y); + + // Rounds 12-15 + TMSG3_X = _mm_load_si128((const __m128i*) (msg_X+48)); + TMSG3_Y = _mm_load_si128((const __m128i*) (msg_Y+48)); + TMP_X = _mm_set_epi64x(0xC19BF1749BDC06A7ULL, 0x80DEB1FE72BE5D74ULL); + TMSG3_X = _mm_shuffle_epi8( TMSG3_X, MASK ); + TMSG3_Y = _mm_shuffle_epi8( TMSG3_Y, MASK ); + MSG_X = _mm_add_epi32(TMSG3_X, TMP_X ); + MSG_Y = _mm_add_epi32(TMSG3_Y, TMP_X ); + STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); + STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); + TMP_X = _mm_alignr_epi8(TMSG3_X, TMSG2_X, 4); + TMP_Y = _mm_alignr_epi8(TMSG3_Y, TMSG2_Y, 4); + TMSG0_X = _mm_add_epi32(TMSG0_X, TMP_X); + TMSG0_Y = _mm_add_epi32(TMSG0_Y, TMP_Y); + TMSG0_X = _mm_sha256msg2_epu32(TMSG0_X, TMSG3_X); + TMSG0_Y = _mm_sha256msg2_epu32(TMSG0_Y, TMSG3_Y); + MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); + MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); + STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); + STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); + TMSG2_X = _mm_sha256msg1_epu32(TMSG2_X, TMSG3_X); + TMSG2_Y = 
_mm_sha256msg1_epu32(TMSG2_Y, TMSG3_Y); + + // Rounds 16-19 + TMP_X = _mm_set_epi64x(0x240CA1CC0FC19DC6ULL, 0xEFBE4786E49B69C1ULL); + MSG_X = _mm_add_epi32(TMSG0_X, TMP_X ); + MSG_Y = _mm_add_epi32(TMSG0_Y, TMP_X ); + STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); + STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); + TMP_X = _mm_alignr_epi8(TMSG0_X, TMSG3_X, 4); + TMP_Y = _mm_alignr_epi8(TMSG0_Y, TMSG3_Y, 4); + TMSG1_X = _mm_add_epi32(TMSG1_X, TMP_X); + TMSG1_Y = _mm_add_epi32(TMSG1_Y, TMP_Y); + TMSG1_X = _mm_sha256msg2_epu32(TMSG1_X, TMSG0_X); + TMSG1_Y = _mm_sha256msg2_epu32(TMSG1_Y, TMSG0_Y); + MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); + MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); + STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); + STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); + TMSG3_X = _mm_sha256msg1_epu32(TMSG3_X, TMSG0_X); + TMSG3_Y = _mm_sha256msg1_epu32(TMSG3_Y, TMSG0_Y); + + // Rounds 20-23 + TMP_X = _mm_set_epi64x(0x76F988DA5CB0A9DCULL, 0x4A7484AA2DE92C6FULL); + MSG_X = _mm_add_epi32(TMSG1_X, TMP_X ); + MSG_Y = _mm_add_epi32(TMSG1_Y, TMP_X ); + STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); + STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); + TMP_X = _mm_alignr_epi8(TMSG1_X, TMSG0_X, 4); + TMP_Y = _mm_alignr_epi8(TMSG1_Y, TMSG0_Y, 4); + TMSG2_X = _mm_add_epi32(TMSG2_X, TMP_X); + TMSG2_Y = _mm_add_epi32(TMSG2_Y, TMP_Y); + TMSG2_X = _mm_sha256msg2_epu32(TMSG2_X, TMSG1_X); + TMSG2_Y = _mm_sha256msg2_epu32(TMSG2_Y, TMSG1_Y); + MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); + MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); + STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); + STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); + TMSG0_X = _mm_sha256msg1_epu32(TMSG0_X, TMSG1_X); + TMSG0_Y = _mm_sha256msg1_epu32(TMSG0_Y, TMSG1_Y); + + // Rounds 24-27 + TMP_X = _mm_set_epi64x(0xBF597FC7B00327C8ULL, 0xA831C66D983E5152ULL); + MSG_X = _mm_add_epi32(TMSG2_X, TMP_X ); + MSG_Y = _mm_add_epi32(TMSG2_Y, TMP_X ); + STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); + STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); + TMP_X = _mm_alignr_epi8(TMSG2_X, TMSG1_X, 4); + TMP_Y = _mm_alignr_epi8(TMSG2_Y, TMSG1_Y, 4); + TMSG3_X = _mm_add_epi32(TMSG3_X, TMP_X); + TMSG3_Y = _mm_add_epi32(TMSG3_Y, TMP_Y); + TMSG3_X = _mm_sha256msg2_epu32(TMSG3_X, TMSG2_X); + TMSG3_Y = _mm_sha256msg2_epu32(TMSG3_Y, TMSG2_Y); + MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); + MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); + STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); + STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); + TMSG1_X = _mm_sha256msg1_epu32(TMSG1_X, TMSG2_X); + TMSG1_Y = _mm_sha256msg1_epu32(TMSG1_Y, TMSG2_Y); + + // Rounds 28-31 + TMP_X = _mm_set_epi64x(0x1429296706CA6351ULL, 0xD5A79147C6E00BF3ULL); + MSG_X = _mm_add_epi32(TMSG3_X, TMP_X ); + MSG_Y = _mm_add_epi32(TMSG3_Y, TMP_X ); + STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); + STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); + TMP_X = _mm_alignr_epi8(TMSG3_X, TMSG2_X, 4); + TMP_Y = _mm_alignr_epi8(TMSG3_Y, TMSG2_Y, 4); + TMSG0_X = _mm_add_epi32(TMSG0_X, TMP_X); + TMSG0_Y = _mm_add_epi32(TMSG0_Y, TMP_Y); + TMSG0_X = _mm_sha256msg2_epu32(TMSG0_X, TMSG3_X); + TMSG0_Y = _mm_sha256msg2_epu32(TMSG0_Y, TMSG3_Y); + MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); + MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); + STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); + STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); + TMSG2_X 
= _mm_sha256msg1_epu32(TMSG2_X, TMSG3_X); + TMSG2_Y = _mm_sha256msg1_epu32(TMSG2_Y, TMSG3_Y); + + // Rounds 32-35 + TMP_X = _mm_set_epi64x(0x53380D134D2C6DFCULL, 0x2E1B213827B70A85ULL); + MSG_X = _mm_add_epi32(TMSG0_X, TMP_X ); + MSG_Y = _mm_add_epi32(TMSG0_Y, TMP_X ); + STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); + STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); + TMP_X = _mm_alignr_epi8(TMSG0_X, TMSG3_X, 4); + TMP_Y = _mm_alignr_epi8(TMSG0_Y, TMSG3_Y, 4); + TMSG1_X = _mm_add_epi32(TMSG1_X, TMP_X); + TMSG1_Y = _mm_add_epi32(TMSG1_Y, TMP_Y); + TMSG1_X = _mm_sha256msg2_epu32(TMSG1_X, TMSG0_X); + TMSG1_Y = _mm_sha256msg2_epu32(TMSG1_Y, TMSG0_Y); + MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); + MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); + STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); + STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); + TMSG3_X = _mm_sha256msg1_epu32(TMSG3_X, TMSG0_X); + TMSG3_Y = _mm_sha256msg1_epu32(TMSG3_Y, TMSG0_Y); + + // Rounds 36-39 + TMP_X = _mm_set_epi64x(0x92722C8581C2C92EULL, 0x766A0ABB650A7354ULL); + MSG_X = _mm_add_epi32(TMSG1_X, TMP_X); + MSG_Y = _mm_add_epi32(TMSG1_Y, TMP_X); + STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); + STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); + TMP_X = _mm_alignr_epi8(TMSG1_X, TMSG0_X, 4); + TMP_Y = _mm_alignr_epi8(TMSG1_Y, TMSG0_Y, 4); + TMSG2_X = _mm_add_epi32(TMSG2_X, TMP_X); + TMSG2_Y = _mm_add_epi32(TMSG2_Y, TMP_Y); + TMSG2_X = _mm_sha256msg2_epu32(TMSG2_X, TMSG1_X); + TMSG2_Y = _mm_sha256msg2_epu32(TMSG2_Y, TMSG1_Y); + MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); + MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); + STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); + STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); + TMSG0_X = _mm_sha256msg1_epu32(TMSG0_X, TMSG1_X); + TMSG0_Y = _mm_sha256msg1_epu32(TMSG0_Y, TMSG1_Y); + + // Rounds 40-43 + TMP_X = _mm_set_epi64x(0xC76C51A3C24B8B70ULL, 0xA81A664BA2BFE8A1ULL); + MSG_X = _mm_add_epi32(TMSG2_X, TMP_X); + MSG_Y = _mm_add_epi32(TMSG2_Y, TMP_X); + STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); + STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); + TMP_X = _mm_alignr_epi8(TMSG2_X, TMSG1_X, 4); + TMP_Y = _mm_alignr_epi8(TMSG2_Y, TMSG1_Y, 4); + TMSG3_X = _mm_add_epi32(TMSG3_X, TMP_X); + TMSG3_Y = _mm_add_epi32(TMSG3_Y, TMP_Y); + TMSG3_X = _mm_sha256msg2_epu32(TMSG3_X, TMSG2_X); + TMSG3_Y = _mm_sha256msg2_epu32(TMSG3_Y, TMSG2_Y); + MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); + MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); + STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); + STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); + TMSG1_X = _mm_sha256msg1_epu32(TMSG1_X, TMSG2_X); + TMSG1_Y = _mm_sha256msg1_epu32(TMSG1_Y, TMSG2_Y); + + // Rounds 44-47 + TMP_X = _mm_set_epi64x(0x106AA070F40E3585ULL, 0xD6990624D192E819ULL); + MSG_X = _mm_add_epi32(TMSG3_X, TMP_X); + MSG_Y = _mm_add_epi32(TMSG3_Y, TMP_X); + STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); + STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); + TMP_X = _mm_alignr_epi8(TMSG3_X, TMSG2_X, 4); + TMP_Y = _mm_alignr_epi8(TMSG3_Y, TMSG2_Y, 4); + TMSG0_X = _mm_add_epi32(TMSG0_X, TMP_X); + TMSG0_Y = _mm_add_epi32(TMSG0_Y, TMP_Y); + TMSG0_X = _mm_sha256msg2_epu32(TMSG0_X, TMSG3_X); + TMSG0_Y = _mm_sha256msg2_epu32(TMSG0_Y, TMSG3_Y); + MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); + MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); + STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); + STATE0_Y = 
_mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); + TMSG2_X = _mm_sha256msg1_epu32(TMSG2_X, TMSG3_X); + TMSG2_Y = _mm_sha256msg1_epu32(TMSG2_Y, TMSG3_Y); + + // Rounds 48-51 + TMP_X = _mm_set_epi64x(0x34B0BCB52748774CULL, 0x1E376C0819A4C116ULL); + MSG_X = _mm_add_epi32(TMSG0_X, TMP_X ); + MSG_Y = _mm_add_epi32(TMSG0_Y, TMP_X ); + STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); + STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); + TMP_X = _mm_alignr_epi8(TMSG0_X, TMSG3_X, 4); + TMP_Y = _mm_alignr_epi8(TMSG0_Y, TMSG3_Y, 4); + TMSG1_X = _mm_add_epi32(TMSG1_X, TMP_X); + TMSG1_Y = _mm_add_epi32(TMSG1_Y, TMP_Y); + TMSG1_X = _mm_sha256msg2_epu32(TMSG1_X, TMSG0_X); + TMSG1_Y = _mm_sha256msg2_epu32(TMSG1_Y, TMSG0_Y); + MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); + MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); + STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); + STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); + TMSG3_X = _mm_sha256msg1_epu32(TMSG3_X, TMSG0_X); + TMSG3_Y = _mm_sha256msg1_epu32(TMSG3_Y, TMSG0_Y); + + // Rounds 52-55 + TMP_X = _mm_set_epi64x(0x682E6FF35B9CCA4FULL, 0x4ED8AA4A391C0CB3ULL); + MSG_X = _mm_add_epi32(TMSG1_X, TMP_X ); + MSG_Y = _mm_add_epi32(TMSG1_Y, TMP_X ); + STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); + STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); + TMP_X = _mm_alignr_epi8(TMSG1_X, TMSG0_X, 4); + TMP_Y = _mm_alignr_epi8(TMSG1_Y, TMSG0_Y, 4); + TMSG2_X = _mm_add_epi32(TMSG2_X, TMP_X); + TMSG2_Y = _mm_add_epi32(TMSG2_Y, TMP_Y); + TMSG2_X = _mm_sha256msg2_epu32(TMSG2_X, TMSG1_X); + TMSG2_Y = _mm_sha256msg2_epu32(TMSG2_Y, TMSG1_Y); + MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); + MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); + STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); + STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); + + // Rounds 56-59 + TMP_X = _mm_set_epi64x(0x8CC7020884C87814ULL, 0x78A5636F748F82EEULL); + MSG_X = _mm_add_epi32(TMSG2_X, TMP_X); + MSG_Y = _mm_add_epi32(TMSG2_Y, TMP_X); + STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); + STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); + TMP_X = _mm_alignr_epi8(TMSG2_X, TMSG1_X, 4); + TMP_Y = _mm_alignr_epi8(TMSG2_Y, TMSG1_Y, 4); + TMSG3_X = _mm_add_epi32(TMSG3_X, TMP_X); + TMSG3_Y = _mm_add_epi32(TMSG3_Y, TMP_Y); + TMSG3_X = _mm_sha256msg2_epu32(TMSG3_X, TMSG2_X); + TMSG3_Y = _mm_sha256msg2_epu32(TMSG3_Y, TMSG2_Y); + MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); + MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); + STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); + STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); + + // Rounds 60-63 + TMP_X = _mm_set_epi64x(0xC67178F2BEF9A3F7ULL, 0xA4506CEB90BEFFFAULL); + MSG_X = _mm_add_epi32(TMSG3_X, TMP_X); + MSG_Y = _mm_add_epi32(TMSG3_Y, TMP_X); + STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); + STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); + MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); + MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); + STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); + STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); + + // Add values back to state + STATE0_X = _mm_add_epi32(STATE0_X, ABEF_SAVE_X); + STATE1_X = _mm_add_epi32(STATE1_X, CDGH_SAVE_X); + STATE0_Y = _mm_add_epi32(STATE0_Y, ABEF_SAVE_Y); + STATE1_Y = _mm_add_epi32(STATE1_Y, CDGH_SAVE_Y); + + TMP_X = _mm_shuffle_epi32(STATE0_X, 0x1B); // FEBA + TMP_Y = _mm_shuffle_epi32(STATE0_Y, 0x1B); // FEBA + STATE1_X = _mm_shuffle_epi32(STATE1_X, 0xB1); // 
DCHG + STATE1_Y = _mm_shuffle_epi32(STATE1_Y, 0xB1); // DCHG + STATE0_X = _mm_blend_epi16(TMP_X, STATE1_X, 0xF0); // DCBA + STATE0_Y = _mm_blend_epi16(TMP_Y, STATE1_Y, 0xF0); // DCBA + STATE1_X = _mm_alignr_epi8(STATE1_X, TMP_X, 8); // ABEF + STATE1_Y = _mm_alignr_epi8(STATE1_Y, TMP_Y, 8); // ABEF + + // Save state + _mm_store_si128((__m128i*) &out_X[0], STATE0_X); + _mm_store_si128((__m128i*) &out_X[4], STATE1_X); + _mm_store_si128((__m128i*) &out_Y[0], STATE0_Y); + _mm_store_si128((__m128i*) &out_Y[4], STATE1_Y); +} + + #endif + diff --git a/algo/sha/sha256-hash-4way.c b/algo/sha/sha256-hash-4way.c index c5f60481..beac702c 100644 --- a/algo/sha/sha256-hash-4way.c +++ b/algo/sha/sha256-hash-4way.c @@ -74,17 +74,6 @@ static const uint32_t K256[64] = #define CHs(X, Y, Z) \ _mm_xor_si128( _mm_and_si128( _mm_xor_si128( Y, Z ), X ), Z ) -/* -#define MAJs(X, Y, Z) \ - _mm_or_si128( _mm_and_si128( X, Y ), \ - _mm_and_si128( _mm_or_si128( X, Y ), Z ) ) -*/ -/* -#define MAJs(X, Y, Z) \ - _mm_xor_si128( Y, _mm_and_si128( _mm_xor_si128( X, Y ), \ - _mm_xor_si128( Y, Z ) ) ) -*/ - #define MAJs(X, Y, Z) \ _mm_xor_si128( Y, _mm_and_si128( X_xor_Y = _mm_xor_si128( X, Y ), \ Y_xor_Z ) ) @@ -105,38 +94,6 @@ static const uint32_t K256[64] = _mm_xor_si128( _mm_xor_si128( \ mm128_ror_32(x, 17), mm128_ror_32(x, 19) ), _mm_srli_epi32(x, 10) ) -/* -#define SHA2s_4WAY_STEP(A, B, C, D, E, F, G, H, i, j) \ -do { \ - __m128i K = _mm_set1_epi32( K256[( (j)+(i) )] ); \ - __m128i T1 = mm128_ror_32( E, 14 ); \ - __m128i T2 = mm128_ror_32( A, 9 ); \ - __m128i T3 = _mm_xor_si128( F, G ); \ - __m128i T4 = _mm_or_si128( A, B ); \ - __m128i T5 = _mm_and_si128( A, B ); \ - K = _mm_add_epi32( K, W[i] ); \ - T1 = _mm_xor_si128( T1, E ); \ - T2 = _mm_xor_si128( T2, A ); \ - T3 = _mm_and_si128( T3, E ); \ - T4 = _mm_and_si128( T4, C ); \ - K = _mm_add_epi32( H, K ); \ - T1 = mm128_ror_32( T1, 5 ); \ - T2 = mm128_ror_32( T2, 11 ); \ - T3 = _mm_xor_si128( T3, G ); \ - T4 = _mm_or_si128( T4, T5 ); \ - T1 = _mm_xor_si128( T1, E ); \ - T2 = _mm_xor_si128( T2, A ); \ - T1 = mm128_ror_32( T1, 6 ); \ - T2 = mm128_ror_32( T2, 2 ); \ - T1 = _mm_add_epi32( T1, T3 ); \ - T2 = _mm_add_epi32( T2, T4 ); \ - T1 = _mm_add_epi32( T1, K ); \ - H = _mm_add_epi32( T1, T2 ); \ - D = _mm_add_epi32( D, T1 ); \ -} while (0) -*/ - - #define SHA2s_4WAY_STEP(A, B, C, D, E, F, G, H, i, j) \ do { \ __m128i T1, T2; \ @@ -149,8 +106,8 @@ do { \ H = _mm_add_epi32( T1, T2 ); \ } while (0) - -void sha256_4way_transform( __m128i *state_out, const __m128i *data, +// LE data, no need to byte swap +void sha256_4way_transform_le( __m128i *state_out, const __m128i *data, const __m128i *state_in ) { __m128i A, B, C, D, E, F, G, H, X_xor_Y, Y_xor_Z; @@ -232,6 +189,91 @@ void sha256_4way_transform( __m128i *state_out, const __m128i *data, state_out[7] = _mm_add_epi32( state_in[7], H ); } +// BE data, need to byte swap +void sha256_4way_transform_be( __m128i *state_out, const __m128i *data, + const __m128i *state_in ) +{ + __m128i A, B, C, D, E, F, G, H, X_xor_Y, Y_xor_Z; + __m128i W[16]; + + mm128_block_bswap_32( W, data ); + mm128_block_bswap_32( W+8, data+8 ); + + A = state_in[0]; + B = state_in[1]; + C = state_in[2]; + D = state_in[3]; + E = state_in[4]; + F = state_in[5]; + G = state_in[6]; + H = state_in[7]; + Y_xor_Z = _mm_xor_si128( B, C ); + + SHA2s_4WAY_STEP( A, B, C, D, E, F, G, H, 0, 0 ); + SHA2s_4WAY_STEP( H, A, B, C, D, E, F, G, 1, 0 ); + SHA2s_4WAY_STEP( G, H, A, B, C, D, E, F, 2, 0 ); + SHA2s_4WAY_STEP( F, G, H, A, B, C, D, E, 3, 0 ); + SHA2s_4WAY_STEP( E, 
F, G, H, A, B, C, D, 4, 0 ); + SHA2s_4WAY_STEP( D, E, F, G, H, A, B, C, 5, 0 ); + SHA2s_4WAY_STEP( C, D, E, F, G, H, A, B, 6, 0 ); + SHA2s_4WAY_STEP( B, C, D, E, F, G, H, A, 7, 0 ); + SHA2s_4WAY_STEP( A, B, C, D, E, F, G, H, 8, 0 ); + SHA2s_4WAY_STEP( H, A, B, C, D, E, F, G, 9, 0 ); + SHA2s_4WAY_STEP( G, H, A, B, C, D, E, F, 10, 0 ); + SHA2s_4WAY_STEP( F, G, H, A, B, C, D, E, 11, 0 ); + SHA2s_4WAY_STEP( E, F, G, H, A, B, C, D, 12, 0 ); + SHA2s_4WAY_STEP( D, E, F, G, H, A, B, C, 13, 0 ); + SHA2s_4WAY_STEP( C, D, E, F, G, H, A, B, 14, 0 ); + SHA2s_4WAY_STEP( B, C, D, E, F, G, H, A, 15, 0 ); + + for ( int j = 16; j < 64; j += 16 ) + { + W[ 0] = SHA2s_MEXP( 14, 9, 1, 0 ); + W[ 1] = SHA2s_MEXP( 15, 10, 2, 1 ); + W[ 2] = SHA2s_MEXP( 0, 11, 3, 2 ); + W[ 3] = SHA2s_MEXP( 1, 12, 4, 3 ); + W[ 4] = SHA2s_MEXP( 2, 13, 5, 4 ); + W[ 5] = SHA2s_MEXP( 3, 14, 6, 5 ); + W[ 6] = SHA2s_MEXP( 4, 15, 7, 6 ); + W[ 7] = SHA2s_MEXP( 5, 0, 8, 7 ); + W[ 8] = SHA2s_MEXP( 6, 1, 9, 8 ); + W[ 9] = SHA2s_MEXP( 7, 2, 10, 9 ); + W[10] = SHA2s_MEXP( 8, 3, 11, 10 ); + W[11] = SHA2s_MEXP( 9, 4, 12, 11 ); + W[12] = SHA2s_MEXP( 10, 5, 13, 12 ); + W[13] = SHA2s_MEXP( 11, 6, 14, 13 ); + W[14] = SHA2s_MEXP( 12, 7, 15, 14 ); + W[15] = SHA2s_MEXP( 13, 8, 0, 15 ); + + SHA2s_4WAY_STEP( A, B, C, D, E, F, G, H, 0, j ); + SHA2s_4WAY_STEP( H, A, B, C, D, E, F, G, 1, j ); + SHA2s_4WAY_STEP( G, H, A, B, C, D, E, F, 2, j ); + SHA2s_4WAY_STEP( F, G, H, A, B, C, D, E, 3, j ); + SHA2s_4WAY_STEP( E, F, G, H, A, B, C, D, 4, j ); + SHA2s_4WAY_STEP( D, E, F, G, H, A, B, C, 5, j ); + SHA2s_4WAY_STEP( C, D, E, F, G, H, A, B, 6, j ); + SHA2s_4WAY_STEP( B, C, D, E, F, G, H, A, 7, j ); + SHA2s_4WAY_STEP( A, B, C, D, E, F, G, H, 8, j ); + SHA2s_4WAY_STEP( H, A, B, C, D, E, F, G, 9, j ); + SHA2s_4WAY_STEP( G, H, A, B, C, D, E, F, 10, j ); + SHA2s_4WAY_STEP( F, G, H, A, B, C, D, E, 11, j ); + SHA2s_4WAY_STEP( E, F, G, H, A, B, C, D, 12, j ); + SHA2s_4WAY_STEP( D, E, F, G, H, A, B, C, 13, j ); + SHA2s_4WAY_STEP( C, D, E, F, G, H, A, B, 14, j ); + SHA2s_4WAY_STEP( B, C, D, E, F, G, H, A, 15, j ); + } + + state_out[0] = _mm_add_epi32( state_in[0], A ); + state_out[1] = _mm_add_epi32( state_in[1], B ); + state_out[2] = _mm_add_epi32( state_in[2], C ); + state_out[3] = _mm_add_epi32( state_in[3], D ); + state_out[4] = _mm_add_epi32( state_in[4], E ); + state_out[5] = _mm_add_epi32( state_in[5], F ); + state_out[6] = _mm_add_epi32( state_in[6], G ); + state_out[7] = _mm_add_epi32( state_in[7], H ); +} + + static void sha256_4way_round( sha256_4way_context *ctx, __m128i *in, __m128i r[8] ) { @@ -436,61 +478,81 @@ void sha256_4way_full( void *dst, const void *data, size_t len ) // SHA-256 8 way -#if defined(__AVX512VL__) - -#define CHx(X, Y, Z) \ - _mm256_ternarylogic_epi32( X, Y, Z, 0xca ) - -#define MAJx(X, Y, Z) \ - _mm256_ternarylogic_epi32( X, Y, Z, 0xe8 ) - #define BSG2_0x(x) \ - mm256_xor3( mm256_ror_32(x, 2), mm256_ror_32(x, 13), mm256_ror_32(x, 22) ) + _mm256_xor_si256( _mm256_xor_si256( mm256_ror_32( x, 2 ), \ + mm256_ror_32( x, 13 ) ), \ + mm256_ror_32( x, 22 ) ) #define BSG2_1x(x) \ - mm256_xor3( mm256_ror_32(x, 6), mm256_ror_32(x, 11), mm256_ror_32(x, 25) ) + _mm256_xor_si256( _mm256_xor_si256( mm256_ror_32( x, 6 ), \ + mm256_ror_32( x, 11 ) ), \ + mm256_ror_32( x, 25 ) ) #define SSG2_0x(x) \ - mm256_xor3( mm256_ror_32(x, 7), mm256_ror_32(x, 18), _mm256_srli_epi32(x, 3) ) + _mm256_xor_si256( _mm256_xor_si256( mm256_ror_32( x, 7 ), \ + mm256_ror_32( x, 18 ) ), \ + _mm256_srli_epi32( x, 3 ) ) #define SSG2_1x(x) \ - mm256_xor3( mm256_ror_32(x, 17), 
mm256_ror_32(x, 19), _mm256_srli_epi32(x, 10) ) + _mm256_xor_si256( _mm256_xor_si256( mm256_ror_32( x, 17 ), \ + mm256_ror_32( x, 19 ) ), \ + _mm256_srli_epi32( x, 10 ) ) + +#define SHA2x_MEXP( a, b, c, d ) \ + mm256_add4_32( SSG2_1x( W[a] ), W[b], SSG2_0x( W[c] ), W[d] ); + +// With AVX512VL ternary logic optimizations are available. +// If not optimize by forwarding the result of X^Y in MAJ to the next round +// to avoid recalculating it as Y^Z. This optimization is not applicable +// when MAJ is optimized with ternary logic. + +#if defined(__AVX512VL__) + +#define CHx(X, Y, Z) _mm256_ternarylogic_epi32( X, Y, Z, 0xca ) + +#define MAJx(X, Y, Z) _mm256_ternarylogic_epi32( X, Y, Z, 0xe8 ) + +#define SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H, i, j ) \ +do { \ + __m256i T0 = _mm256_add_epi32( _mm256_set1_epi32( K256[ (j)+(i) ] ), \ + W[ i ] ); \ + __m256i T1 = BSG2_1x( E ); \ + __m256i T2 = BSG2_0x( A ); \ + T0 = _mm256_add_epi32( T0, CHx( E, F, G ) ); \ + T1 = _mm256_add_epi32( T1, H ); \ + T2 = _mm256_add_epi32( T2, MAJx( A, B, C ) ); \ + T1 = _mm256_add_epi32( T1, T0 ); \ + D = _mm256_add_epi32( D, T1 ); \ + H = _mm256_add_epi32( T1, T2 ); \ +} while (0) #else // AVX2 #define CHx(X, Y, Z) \ _mm256_xor_si256( _mm256_and_si256( _mm256_xor_si256( Y, Z ), X ), Z ) -#define MAJx(X, Y, Z) \ - _mm256_xor_si256( Y, _mm256_and_si256( _mm256_xor_si256( X, Y ), \ - _mm256_xor_si256( Y, Z ) ) ) -/* +// Use saved X_xor_Y from previous round, now called Y_xor_Z, +// and save new X_xor_Y, for next round. #define MAJx(X, Y, Z) \ _mm256_xor_si256( Y, _mm256_and_si256( X_xor_Y = _mm256_xor_si256( X, Y ), \ Y_xor_Z ) ) -*/ - -#define BSG2_0x(x) \ - _mm256_xor_si256( _mm256_xor_si256( \ - mm256_ror_32(x, 2), mm256_ror_32(x, 13) ), mm256_ror_32( x, 22) ) - -#define BSG2_1x(x) \ - _mm256_xor_si256( _mm256_xor_si256( \ - mm256_ror_32(x, 6), mm256_ror_32(x, 11) ), mm256_ror_32( x, 25) ) - -#define SSG2_0x(x) \ - _mm256_xor_si256( _mm256_xor_si256( \ - mm256_ror_32(x, 7), mm256_ror_32(x, 18) ), _mm256_srli_epi32(x, 3) ) -#define SSG2_1x(x) \ - _mm256_xor_si256( _mm256_xor_si256( \ - mm256_ror_32(x, 17), mm256_ror_32(x, 19) ), _mm256_srli_epi32(x, 10) ) - -#endif // AVX512 else AVX2 - -#define SHA2x_MEXP( a, b, c, d ) \ - mm256_add4_32( SSG2_1x( W[a] ), W[b], SSG2_0x( W[c] ), W[d] ); +#define SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H, i, j ) \ +do { \ + __m256i T0 = _mm256_add_epi32( _mm256_set1_epi32( K256[ (j)+(i) ] ), \ + W[ i ] ); \ + __m256i T1 = BSG2_1x( E ); \ + __m256i T2 = BSG2_0x( A ); \ + T0 = _mm256_add_epi32( T0, CHx( E, F, G ) ); \ + T1 = _mm256_add_epi32( T1, H ); \ + T2 = _mm256_add_epi32( T2, MAJx( A, B, C ) ); \ + T1 = _mm256_add_epi32( T1, T0 ); \ + Y_xor_Z = X_xor_Y; \ + D = _mm256_add_epi32( D, T1 ); \ + H = _mm256_add_epi32( T1, T2 ); \ +} while (0) +/* #define SHA2s_8WAY_STEP(A, B, C, D, E, F, G, H, i, j) \ do { \ __m256i T1, T2; \ @@ -498,16 +560,23 @@ do { \ T1 = _mm256_add_epi32( H, mm256_add4_32( BSG2_1x(E), CHx(E, F, G), \ K, W[i] ) ); \ T2 = _mm256_add_epi32( BSG2_0x(A), MAJx(A, B, C) ); \ + Y_xor_Z = X_xor_Y; \ D = _mm256_add_epi32( D, T1 ); \ H = _mm256_add_epi32( T1, T2 ); \ } while (0) +*/ + +#endif // AVX512VL else AVX2 -void sha256_8way_transform( __m256i *state_out, const __m256i *data, +// accepts LE byte ordered data, skip the byte swap +void sha256_8way_transform_le( __m256i *state_out, const __m256i *data, const __m256i *state_in ) { __m256i A, B, C, D, E, F, G, H; +#if !defined(__AVX512VL__) + __m256i X_xor_Y, Y_xor_Z; +#endif __m256i W[16]; - memcpy_256( W, data, 16 ); A = 
state_in[0]; @@ -519,6 +588,101 @@ void sha256_8way_transform( __m256i *state_out, const __m256i *data, G = state_in[6]; H = state_in[7]; +#if !defined(__AVX512VL__) + Y_xor_Z = _mm256_xor_si256( B, C ); +#endif + + SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H, 0, 0 ); + SHA2s_8WAY_STEP( H, A, B, C, D, E, F, G, 1, 0 ); + SHA2s_8WAY_STEP( G, H, A, B, C, D, E, F, 2, 0 ); + SHA2s_8WAY_STEP( F, G, H, A, B, C, D, E, 3, 0 ); + SHA2s_8WAY_STEP( E, F, G, H, A, B, C, D, 4, 0 ); + SHA2s_8WAY_STEP( D, E, F, G, H, A, B, C, 5, 0 ); + SHA2s_8WAY_STEP( C, D, E, F, G, H, A, B, 6, 0 ); + SHA2s_8WAY_STEP( B, C, D, E, F, G, H, A, 7, 0 ); + SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H, 8, 0 ); + SHA2s_8WAY_STEP( H, A, B, C, D, E, F, G, 9, 0 ); + SHA2s_8WAY_STEP( G, H, A, B, C, D, E, F, 10, 0 ); + SHA2s_8WAY_STEP( F, G, H, A, B, C, D, E, 11, 0 ); + SHA2s_8WAY_STEP( E, F, G, H, A, B, C, D, 12, 0 ); + SHA2s_8WAY_STEP( D, E, F, G, H, A, B, C, 13, 0 ); + SHA2s_8WAY_STEP( C, D, E, F, G, H, A, B, 14, 0 ); + SHA2s_8WAY_STEP( B, C, D, E, F, G, H, A, 15, 0 ); + + for ( int j = 16; j < 64; j += 16 ) + { + W[ 0] = SHA2x_MEXP( 14, 9, 1, 0 ); + W[ 1] = SHA2x_MEXP( 15, 10, 2, 1 ); + W[ 2] = SHA2x_MEXP( 0, 11, 3, 2 ); + W[ 3] = SHA2x_MEXP( 1, 12, 4, 3 ); + W[ 4] = SHA2x_MEXP( 2, 13, 5, 4 ); + W[ 5] = SHA2x_MEXP( 3, 14, 6, 5 ); + W[ 6] = SHA2x_MEXP( 4, 15, 7, 6 ); + W[ 7] = SHA2x_MEXP( 5, 0, 8, 7 ); + W[ 8] = SHA2x_MEXP( 6, 1, 9, 8 ); + W[ 9] = SHA2x_MEXP( 7, 2, 10, 9 ); + W[10] = SHA2x_MEXP( 8, 3, 11, 10 ); + W[11] = SHA2x_MEXP( 9, 4, 12, 11 ); + W[12] = SHA2x_MEXP( 10, 5, 13, 12 ); + W[13] = SHA2x_MEXP( 11, 6, 14, 13 ); + W[14] = SHA2x_MEXP( 12, 7, 15, 14 ); + W[15] = SHA2x_MEXP( 13, 8, 0, 15 ); + + SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H, 0, j ); + SHA2s_8WAY_STEP( H, A, B, C, D, E, F, G, 1, j ); + SHA2s_8WAY_STEP( G, H, A, B, C, D, E, F, 2, j ); + SHA2s_8WAY_STEP( F, G, H, A, B, C, D, E, 3, j ); + SHA2s_8WAY_STEP( E, F, G, H, A, B, C, D, 4, j ); + SHA2s_8WAY_STEP( D, E, F, G, H, A, B, C, 5, j ); + SHA2s_8WAY_STEP( C, D, E, F, G, H, A, B, 6, j ); + SHA2s_8WAY_STEP( B, C, D, E, F, G, H, A, 7, j ); + SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H, 8, j ); + SHA2s_8WAY_STEP( H, A, B, C, D, E, F, G, 9, j ); + SHA2s_8WAY_STEP( G, H, A, B, C, D, E, F, 10, j ); + SHA2s_8WAY_STEP( F, G, H, A, B, C, D, E, 11, j ); + SHA2s_8WAY_STEP( E, F, G, H, A, B, C, D, 12, j ); + SHA2s_8WAY_STEP( D, E, F, G, H, A, B, C, 13, j ); + SHA2s_8WAY_STEP( C, D, E, F, G, H, A, B, 14, j ); + SHA2s_8WAY_STEP( B, C, D, E, F, G, H, A, 15, j ); + } + + state_out[0] = _mm256_add_epi32( state_in[0], A ); + state_out[1] = _mm256_add_epi32( state_in[1], B ); + state_out[2] = _mm256_add_epi32( state_in[2], C ); + state_out[3] = _mm256_add_epi32( state_in[3], D ); + state_out[4] = _mm256_add_epi32( state_in[4], E ); + state_out[5] = _mm256_add_epi32( state_in[5], F ); + state_out[6] = _mm256_add_epi32( state_in[6], G ); + state_out[7] = _mm256_add_epi32( state_in[7], H ); +} + + +// Accepts BE byte ordered data, need to byte swap +void sha256_8way_transform_be( __m256i *state_out, const __m256i *data, + const __m256i *state_in ) +{ + __m256i A, B, C, D, E, F, G, H; +#if !defined(__AVX512VL__) + __m256i X_xor_Y, Y_xor_Z; +#endif + __m256i W[16]; + + mm256_block_bswap_32( W , data ); + mm256_block_bswap_32( W+8, data+8 ); + + A = state_in[0]; + B = state_in[1]; + C = state_in[2]; + D = state_in[3]; + E = state_in[4]; + F = state_in[5]; + G = state_in[6]; + H = state_in[7]; + +#if !defined(__AVX512VL__) + Y_xor_Z = _mm256_xor_si256( B, C ); +#endif + SHA2s_8WAY_STEP( A, B, C, D, E, 
F, G, H, 0, 0 ); SHA2s_8WAY_STEP( H, A, B, C, D, E, F, G, 1, 0 ); SHA2s_8WAY_STEP( G, H, A, B, C, D, E, F, 2, 0 ); @@ -587,6 +751,9 @@ static void sha256_8way_round( sha256_8way_context *ctx, __m256i *in, __m256i r[8] ) { register __m256i A, B, C, D, E, F, G, H; +#if !defined(__AVX512VL__) + __m256i X_xor_Y, Y_xor_Z; +#endif __m256i W[16]; mm256_block_bswap_32( W , in ); @@ -615,6 +782,10 @@ sha256_8way_round( sha256_8way_context *ctx, __m256i *in, __m256i r[8] ) H = m256_const1_64( 0x5BE0CD195BE0CD19 ); } +#if !defined(__AVX512VL__) + Y_xor_Z = _mm256_xor_si256( B, C ); +#endif + SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H, 0, 0 ); SHA2s_8WAY_STEP( H, A, B, C, D, E, F, G, 1, 0 ); SHA2s_8WAY_STEP( G, H, A, B, C, D, E, F, 2, 0 ); @@ -790,27 +961,44 @@ void sha256_8way_full( void *dst, const void *data, size_t len ) // SHA-256 16 way -#define CHx16(X, Y, Z) \ - _mm512_ternarylogic_epi32( X, Y, Z, 0xca ) +#define CHx16(X, Y, Z) _mm512_ternarylogic_epi32( X, Y, Z, 0xca ) -#define MAJx16(X, Y, Z) \ - _mm512_ternarylogic_epi32( X, Y, Z, 0xe8 ) +#define MAJx16(X, Y, Z) _mm512_ternarylogic_epi32( X, Y, Z, 0xe8 ) -#define BSG2_0x16(x) \ - mm512_xor3( mm512_ror_32(x, 2), mm512_ror_32(x, 13), mm512_ror_32(x, 22) ) +#define BSG2_0x16(x) mm512_xor3( _mm512_ror_epi32( x, 2 ), \ + _mm512_ror_epi32( x, 13 ), \ + _mm512_ror_epi32( x, 22 ) ) -#define BSG2_1x16(x) \ - mm512_xor3( mm512_ror_32(x, 6), mm512_ror_32(x, 11), mm512_ror_32(x, 25) ) +#define BSG2_1x16(x) mm512_xor3( _mm512_ror_epi32( x, 6 ), \ + _mm512_ror_epi32( x, 11 ), \ + _mm512_ror_epi32( x, 25 ) ) -#define SSG2_0x16(x) \ - mm512_xor3( mm512_ror_32(x, 7), mm512_ror_32(x, 18), _mm512_srli_epi32(x, 3) ) +#define SSG2_0x16(x) mm512_xor3( _mm512_ror_epi32( x, 7 ), \ + _mm512_ror_epi32( x, 18 ), \ + _mm512_srli_epi32( x, 3 ) ) -#define SSG2_1x16(x) \ - mm512_xor3( mm512_ror_32(x, 17), mm512_ror_32(x, 19), _mm512_srli_epi32(x, 10) ) +#define SSG2_1x16(x) mm512_xor3( _mm512_ror_epi32( x, 17 ), \ + _mm512_ror_epi32( x, 19 ), \ + _mm512_srli_epi32( x, 10 ) ) #define SHA2x16_MEXP( a, b, c, d ) \ mm512_add4_32( SSG2_1x16( W[a] ), W[b], SSG2_0x16( W[c] ), W[d] ); +#define SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H, i, j ) \ +do { \ + __m512i T0 = _mm512_add_epi32( _mm512_set1_epi32( K256[ (j)+(i) ] ), \ + W[ i ] ); \ + __m512i T1 = BSG2_1x16( E ); \ + __m512i T2 = BSG2_0x16( A ); \ + T0 = _mm512_add_epi32( T0, CHx16( E, F, G ) ); \ + T1 = _mm512_add_epi32( T1, H ); \ + T2 = _mm512_add_epi32( T2, MAJx16( A, B, C ) ); \ + T1 = _mm512_add_epi32( T1, T0 ); \ + D = _mm512_add_epi32( D, T1 ); \ + H = _mm512_add_epi32( T1, T2 ); \ +} while (0) + +/* #define SHA2s_16WAY_STEP(A, B, C, D, E, F, G, H, i, j) \ do { \ __m512i T1, T2; \ @@ -821,14 +1009,10 @@ do { \ D = _mm512_add_epi32( D, T1 ); \ H = _mm512_add_epi32( T1, T2 ); \ } while (0) +*/ -// Tranform one 16 lane by 64 byte message block and update state. -// Calling function is responsible for initializing the state, setting -// correct byte order, counting bits and padding of the final block. -// It's faster for multiple rounds of sha256 (sha256d/t/q) by eliminating -// redundant byte swapping. 
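// Editor's note: illustrative scalar sketch of the MAJ x^y forwarding used
// by the non-AVX512VL paths above (the X_xor_Y / Y_xor_Z variables); it is
// not part of this patch.  MAJ(a,b,c) can be rewritten as b ^ ((a^b) & (b^c));
// after the register rotation of one SHA-256 round this round's a^b is exactly
// the next round's b^c, so the value is saved and reused instead of being
// recomputed.  Y_xor_Z is seeded with B^C before round 0, as in the code above.
#include <stdint.h>
static inline uint32_t maj_forwarded( uint32_t a, uint32_t b, uint32_t c,
                                      uint32_t *x_xor_y,  // saved for the next round
                                      uint32_t y_xor_z )  // previous round's a^b
{
   *x_xor_y = a ^ b;                   // becomes y_xor_z in the next round
   return b ^ ( *x_xor_y & y_xor_z );  // equals (a&b) ^ (a&c) ^ (b&c)
}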
-// -void sha256_16way_transform( __m512i *state_out, const __m512i *data, +// accepts LE input data +void sha256_16way_transform_le( __m512i *state_out, const __m512i *data, const __m512i *state_in ) { __m512i A, B, C, D, E, F, G, H; @@ -909,6 +1093,89 @@ void sha256_16way_transform( __m512i *state_out, const __m512i *data, state_out[7] = _mm512_add_epi32( state_in[7], H ); } +// Accepts BE input data, need to bswap +void sha256_16way_transform_be( __m512i *state_out, const __m512i *data, + const __m512i *state_in ) +{ + __m512i A, B, C, D, E, F, G, H; + __m512i W[16]; + + mm512_block_bswap_32( W , data ); + mm512_block_bswap_32( W+8, data+8 ); + + A = state_in[0]; + B = state_in[1]; + C = state_in[2]; + D = state_in[3]; + E = state_in[4]; + F = state_in[5]; + G = state_in[6]; + H = state_in[7]; + + SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H, 0, 0 ); + SHA2s_16WAY_STEP( H, A, B, C, D, E, F, G, 1, 0 ); + SHA2s_16WAY_STEP( G, H, A, B, C, D, E, F, 2, 0 ); + SHA2s_16WAY_STEP( F, G, H, A, B, C, D, E, 3, 0 ); + SHA2s_16WAY_STEP( E, F, G, H, A, B, C, D, 4, 0 ); + SHA2s_16WAY_STEP( D, E, F, G, H, A, B, C, 5, 0 ); + SHA2s_16WAY_STEP( C, D, E, F, G, H, A, B, 6, 0 ); + SHA2s_16WAY_STEP( B, C, D, E, F, G, H, A, 7, 0 ); + SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H, 8, 0 ); + SHA2s_16WAY_STEP( H, A, B, C, D, E, F, G, 9, 0 ); + SHA2s_16WAY_STEP( G, H, A, B, C, D, E, F, 10, 0 ); + SHA2s_16WAY_STEP( F, G, H, A, B, C, D, E, 11, 0 ); + SHA2s_16WAY_STEP( E, F, G, H, A, B, C, D, 12, 0 ); + SHA2s_16WAY_STEP( D, E, F, G, H, A, B, C, 13, 0 ); + SHA2s_16WAY_STEP( C, D, E, F, G, H, A, B, 14, 0 ); + SHA2s_16WAY_STEP( B, C, D, E, F, G, H, A, 15, 0 ); + + for ( int j = 16; j < 64; j += 16 ) + { + W[ 0] = SHA2x16_MEXP( 14, 9, 1, 0 ); + W[ 1] = SHA2x16_MEXP( 15, 10, 2, 1 ); + W[ 2] = SHA2x16_MEXP( 0, 11, 3, 2 ); + W[ 3] = SHA2x16_MEXP( 1, 12, 4, 3 ); + W[ 4] = SHA2x16_MEXP( 2, 13, 5, 4 ); + W[ 5] = SHA2x16_MEXP( 3, 14, 6, 5 ); + W[ 6] = SHA2x16_MEXP( 4, 15, 7, 6 ); + W[ 7] = SHA2x16_MEXP( 5, 0, 8, 7 ); + W[ 8] = SHA2x16_MEXP( 6, 1, 9, 8 ); + W[ 9] = SHA2x16_MEXP( 7, 2, 10, 9 ); + W[10] = SHA2x16_MEXP( 8, 3, 11, 10 ); + W[11] = SHA2x16_MEXP( 9, 4, 12, 11 ); + W[12] = SHA2x16_MEXP( 10, 5, 13, 12 ); + W[13] = SHA2x16_MEXP( 11, 6, 14, 13 ); + W[14] = SHA2x16_MEXP( 12, 7, 15, 14 ); + W[15] = SHA2x16_MEXP( 13, 8, 0, 15 ); + + SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H, 0, j ); + SHA2s_16WAY_STEP( H, A, B, C, D, E, F, G, 1, j ); + SHA2s_16WAY_STEP( G, H, A, B, C, D, E, F, 2, j ); + SHA2s_16WAY_STEP( F, G, H, A, B, C, D, E, 3, j ); + SHA2s_16WAY_STEP( E, F, G, H, A, B, C, D, 4, j ); + SHA2s_16WAY_STEP( D, E, F, G, H, A, B, C, 5, j ); + SHA2s_16WAY_STEP( C, D, E, F, G, H, A, B, 6, j ); + SHA2s_16WAY_STEP( B, C, D, E, F, G, H, A, 7, j ); + SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H, 8, j ); + SHA2s_16WAY_STEP( H, A, B, C, D, E, F, G, 9, j ); + SHA2s_16WAY_STEP( G, H, A, B, C, D, E, F, 10, j ); + SHA2s_16WAY_STEP( F, G, H, A, B, C, D, E, 11, j ); + SHA2s_16WAY_STEP( E, F, G, H, A, B, C, D, 12, j ); + SHA2s_16WAY_STEP( D, E, F, G, H, A, B, C, 13, j ); + SHA2s_16WAY_STEP( C, D, E, F, G, H, A, B, 14, j ); + SHA2s_16WAY_STEP( B, C, D, E, F, G, H, A, 15, j ); + } + + state_out[0] = _mm512_add_epi32( state_in[0], A ); + state_out[1] = _mm512_add_epi32( state_in[1], B ); + state_out[2] = _mm512_add_epi32( state_in[2], C ); + state_out[3] = _mm512_add_epi32( state_in[3], D ); + state_out[4] = _mm512_add_epi32( state_in[4], E ); + state_out[5] = _mm512_add_epi32( state_in[5], F ); + state_out[6] = _mm512_add_epi32( state_in[6], G ); + 
state_out[7] = _mm512_add_epi32( state_in[7], H ); +} + // Aggresive prehashing void sha256_16way_prehash_3rounds( __m512i *state_mid, const __m512i *W, const __m512i *state_in ) diff --git a/algo/sha/sha256-hash-opt.c b/algo/sha/sha256-hash-opt.c index 78bda652..e08dd60b 100644 --- a/algo/sha/sha256-hash-opt.c +++ b/algo/sha/sha256-hash-opt.c @@ -7,9 +7,9 @@ #if defined(__SHA__) -#include "sha256-hash-opt.h" +#include "sha256-hash.h" -void sha256_opt_transform( uint32_t *state_out, const void *input, +void sha256_opt_transform_le( uint32_t *state_out, const void *input, const uint32_t *state_in ) { __m128i STATE0, STATE1; @@ -197,4 +197,192 @@ void sha256_opt_transform( uint32_t *state_out, const void *input, _mm_store_si128((__m128i*) &state_out[4], STATE1); } + +void sha256_opt_transform_be( uint32_t *state_out, const void *input, + const uint32_t *state_in ) +{ + __m128i STATE0, STATE1; + __m128i MSG, TMP, MASK; + __m128i TMSG0, TMSG1, TMSG2, TMSG3; + __m128i ABEF_SAVE, CDGH_SAVE; + + // Load initial values + TMP = _mm_load_si128((__m128i*) &state_in[0]); + STATE1 = _mm_load_si128((__m128i*) &state_in[4]); + MASK = _mm_set_epi64x(0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL); + + TMP = _mm_shuffle_epi32(TMP, 0xB1); // CDAB + STATE1 = _mm_shuffle_epi32(STATE1, 0x1B); // EFGH + STATE0 = _mm_alignr_epi8(TMP, STATE1, 8); // ABEF + STATE1 = _mm_blend_epi16(STATE1, TMP, 0xF0); // CDGH + + // Save current hash + ABEF_SAVE = STATE0; + CDGH_SAVE = STATE1; + + // Rounds 0-3 + TMSG0 = _mm_load_si128((const __m128i*) (input+0)); + TMSG0 = _mm_shuffle_epi8( TMSG0, MASK ); + MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(0xE9B5DBA5B5C0FBCFULL, 0x71374491428A2F98ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + + // Rounds 4-7 + TMSG1 = _mm_load_si128((const __m128i*) (input+16)); + TMSG1 = _mm_shuffle_epi8(TMSG1, MASK); + MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(0xAB1C5ED5923F82A4ULL, 0x59F111F13956C25BULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1); + // Rounds 8-11 + TMSG2 = _mm_load_si128((const __m128i*) (input+32)); + TMSG2 = _mm_shuffle_epi8(TMSG2, MASK); + MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(0x550C7DC3243185BEULL, 0x12835B01D807AA98ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2); + + // Rounds 12-15 + TMSG3 = _mm_load_si128((const __m128i*) (input+48)); + TMSG3 = _mm_shuffle_epi8(TMSG3, MASK); + MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(0xC19BF1749BDC06A7ULL, 0x80DEB1FE72BE5D74ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4); + TMSG0 = _mm_add_epi32(TMSG0, TMP); + TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3); + + // Rounds 16-19 + MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(0x240CA1CC0FC19DC6ULL, 0xEFBE4786E49B69C1ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4); + TMSG1 = _mm_add_epi32(TMSG1, TMP); + TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + TMSG3 = 
_mm_sha256msg1_epu32(TMSG3, TMSG0); + + // Rounds 20-23 + MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(0x76F988DA5CB0A9DCULL, 0x4A7484AA2DE92C6FULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4); + TMSG2 = _mm_add_epi32(TMSG2, TMP); + TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1); + + // Rounds 24-27 + MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(0xBF597FC7B00327C8ULL, 0xA831C66D983E5152ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4); + TMSG3 = _mm_add_epi32(TMSG3, TMP); + TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2); + + // Rounds 28-31 + MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(0x1429296706CA6351ULL, 0xD5A79147C6E00BF3ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4); + TMSG0 = _mm_add_epi32(TMSG0, TMP); + TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3); + + // Rounds 32-35 + MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(0x53380D134D2C6DFCULL, 0x2E1B213827B70A85ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4); + TMSG1 = _mm_add_epi32(TMSG1, TMP); + TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0); + + // Rounds 36-39 + MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(0x92722C8581C2C92EULL, 0x766A0ABB650A7354ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4); + TMSG2 = _mm_add_epi32(TMSG2, TMP); + TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1); + + // Rounds 40-43 + MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(0xC76C51A3C24B8B70ULL, 0xA81A664BA2BFE8A1ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4); + TMSG3 = _mm_add_epi32(TMSG3, TMP); + TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2); + + // Rounds 44-47 + MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(0x106AA070F40E3585ULL, 0xD6990624D192E819ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4); + TMSG0 = _mm_add_epi32(TMSG0, TMP); + TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3); + + // Rounds 48-51 + MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(0x34B0BCB52748774CULL, 0x1E376C0819A4C116ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4); + TMSG1 = _mm_add_epi32(TMSG1, TMP); + TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0); + + // Rounds 52-55 + MSG = _mm_add_epi32(TMSG1, 
_mm_set_epi64x(0x682E6FF35B9CCA4FULL, 0x4ED8AA4A391C0CB3ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4); + TMSG2 = _mm_add_epi32(TMSG2, TMP); + TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + + // Rounds 56-59 + MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(0x8CC7020884C87814ULL, 0x78A5636F748F82EEULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4); + TMSG3 = _mm_add_epi32(TMSG3, TMP); + TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + + // Rounds 60-63 + MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(0xC67178F2BEF9A3F7ULL, 0xA4506CEB90BEFFFAULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + + // Add values back to state + STATE0 = _mm_add_epi32(STATE0, ABEF_SAVE); + STATE1 = _mm_add_epi32(STATE1, CDGH_SAVE); + + TMP = _mm_shuffle_epi32(STATE0, 0x1B); // FEBA + STATE1 = _mm_shuffle_epi32(STATE1, 0xB1); // DCHG + STATE0 = _mm_blend_epi16(TMP, STATE1, 0xF0); // DCBA + STATE1 = _mm_alignr_epi8(STATE1, TMP, 8); // ABEF + + // Save state + _mm_store_si128((__m128i*) &state_out[0], STATE0); + _mm_store_si128((__m128i*) &state_out[4], STATE1); +} + #endif diff --git a/algo/sha/sha256-hash-opt.h b/algo/sha/sha256-hash-opt.h deleted file mode 100644 index 9ceacf43..00000000 --- a/algo/sha/sha256-hash-opt.h +++ /dev/null @@ -1,18 +0,0 @@ -#ifndef SHA2_HASH_OPT_H__ -#define SHA2_HASH_OPT_H__ 1 - -#include -#include "simd-utils.h" - -#if defined(__SHA__) - -void sha256_opt_transform( uint32_t *state_out, const void *input, - const uint32_t *state_in ); - -// 2 way with interleaved instructions -void sha256_ni2way_transform( uint32_t *out_X, uint32_t*out_Y, - const void *msg_X, const void *msg_Y, - const uint32_t *in_X, const uint32_t *in_Y ); - -#endif -#endif diff --git a/algo/sha/sha256-hash.c b/algo/sha/sha256-hash.c new file mode 100644 index 00000000..ddbaacc9 --- /dev/null +++ b/algo/sha/sha256-hash.c @@ -0,0 +1,142 @@ +#include "sha256-hash.h" + +static const uint32_t SHA256_IV[8] = +{ + 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A, + 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19 +}; + +/* +static const uint8_t SHA256_PAD[64] = +{ + 0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +}; +*/ + +void sha256_ctx_init( sha256_context *ctx ) +{ + memcpy( ctx->state, SHA256_IV, sizeof SHA256_IV ); + ctx->count = 0; +} + +void sha256_update( sha256_context *ctx, const void *data, size_t len ) +{ + int ptr = ctx->count & 0x3f; + const uint8_t *src = data; + + ctx->count += (uint64_t)len; + + if ( len < 64 - ptr ) + { + memcpy( ctx->buf + ptr, src, len ); + return; + } + + memcpy( ctx->buf + ptr, src, 64 - ptr ); + sha256_transform_be( ctx->state, (uint32_t*)ctx->buf, ctx->state ); + src += 64 - ptr; + len -= 64 - ptr; + + while ( len >= 64 ) + { + sha256_transform_be( ctx->state, (uint32_t*)src, ctx->state ); + src += 64; + len -= 64; + } + + memcpy( ctx->buf, src, len ); +} + +#if 0 +void sha256_final( sha256_context *ctx, uint32_t *hash ) +{ + size_t r; + + + /* Figure out how many bytes we have buffered. 
*/ + r = ctx->count & 0x3f; +// r = ( ctx->count >> 3 ) & 0x3f; + +//printf("final: count= %d, r= %d\n", ctx->count, r ); + + /* Pad to 56 mod 64, transforming if we finish a block en route. */ + if ( r < 56 ) + { + /* Pad to 56 mod 64. */ + memcpy( &ctx->buf[r], SHA256_PAD, 56 - r ); + } + else + { + /* Finish the current block and mix. */ + memcpy( &ctx->buf[r], SHA256_PAD, 64 - r ); + sha256_transform_be( ctx->state, (uint32_t*)ctx->buf, ctx->state ); + +// SHA256_Transform(ctx->state, ctx->buf, &tmp32[0], &tmp32[64]); + + /* The start of the final block is all zeroes. */ + memset( &ctx->buf[0], 0, 56 ); + } + + /* Add the terminating bit-count. */ + ctx->buf[56] = bswap_64( ctx->count << 3 ); +// ctx->buf[56] = bswap_64( ctx->count ); +// be64enc( &ctx->buf[56], ctx->count ); + + /* Mix in the final block. */ + sha256_transform_be( ctx->state, (uint32_t*)ctx->buf, ctx->state ); + +// SHA256_Transform(ctx->state, ctx->buf, &tmp32[0], &tmp32[64]); + + for ( int i = 0; i < 8; i++ ) hash[i] = bswap_32( ctx->state[i] ); + +// for ( int i = 0; i < 8; i++ ) be32enc( hash + 4*i, ctx->state + i ); + +/* +// be32enc_vect(digest, ctx->state, 4); +// be32enc_vect(uint8_t * dst, const uint32_t * src, size_t len) + // Encode vector, two words at a time. + do { + be32enc(&dst[0], src[0]); + be32enc(&dst[4], src[1]); + src += 2; + dst += 8; + } while (--len); +*/ + +} +#endif + +void sha256_final( sha256_context *ctx, void *hash ) +{ + int ptr = ctx->count & 0x3f; + + ctx->buf[ ptr++ ] = 0x80; + + if ( ptr > 56 ) + { + memset( ctx->buf + ptr, 0, 64 - ptr ); + sha256_transform_be( ctx->state, (uint32_t*)ctx->buf, ctx->state ); + memset( ctx->buf, 0, 56 ); + } + else + memset( ctx->buf + ptr, 0, 56 - ptr ); + + *(uint64_t*)(&ctx->buf[56]) = bswap_64( ctx->count << 3 ); + + sha256_transform_be( ctx->state, (uint32_t*)ctx->buf, ctx->state ); + + for ( int i = 0; i < 8; i++ ) + ( (uint32_t*)hash )[i] = bswap_32( ctx->state[i] ); +} + +void sha256_full( void *hash, const void *data, size_t len ) +{ + sha256_context ctx; + sha256_ctx_init( &ctx ); + sha256_update( &ctx, data, len ); + sha256_final( &ctx, hash ); +} + diff --git a/algo/sha/sha256-hash.h b/algo/sha/sha256-hash.h new file mode 100644 index 00000000..c6d61d8f --- /dev/null +++ b/algo/sha/sha256-hash.h @@ -0,0 +1,56 @@ +#ifndef SHA256_HASH_H__ +#define SHA256_HASH_H__ 1 + +#include +#include "simd-utils.h" +#include "cpuminer-config.h" +#include "sph_sha2.h" + + +// generic interface + +typedef struct { + unsigned char buf[64]; /* first field, for alignment */ + uint32_t state[8]; + uint64_t count; +} sha256_context __attribute__((aligned(64))); + +void sha256_full( void *hash, const void *data, size_t len ); +void sha256_update( sha256_context *ctx, const void *data, size_t len ); +void sha256_final( sha256_context *ctx, void *hash ); +void sha256_ctx_init( sha256_context *ctx ); +void sha256_transform_le( uint32_t *state_out, const uint32_t *data, + const uint32_t *state_in ); +void sha256_transform_be( uint32_t *state_out, const uint32_t *data, + const uint32_t *state_in ); + +#if defined(__SHA__) + +void sha256_opt_transform_le( uint32_t *state_out, const void *input, + const uint32_t *state_in ); + +void sha256_opt_transform_be( uint32_t *state_out, const void *input, + const uint32_t *state_in ); + +// 2 way with interleaved instructions +void sha256_ni2way_transform_le( uint32_t *out_X, uint32_t*out_Y, + const void *msg_X, const void *msg_Y, + const uint32_t *in_X, const uint32_t *in_Y ); + +void sha256_ni2way_transform_be( uint32_t 
*out_X, uint32_t*out_Y, + const void *msg_X, const void *msg_Y, + const uint32_t *in_X, const uint32_t *in_Y ); + +// Select target +// with SHA... +#define sha256_transform_le sha256_opt_transform_le +#define sha256_transform_be sha256_opt_transform_be + +#else + +// without SHA... +#define sha256_transform_le sph_sha256_transform_le +#define sha256_transform_be sph_sha256_transform_be + +#endif +#endif diff --git a/algo/sha/sha256d-4way.c b/algo/sha/sha256d-4way.c index 9bbc5c8d..fd3ae2f1 100644 --- a/algo/sha/sha256d-4way.c +++ b/algo/sha/sha256d-4way.c @@ -14,6 +14,7 @@ int scanhash_sha256d_16way( struct work *work, const uint32_t max_nonce, __m512i hash32[8] __attribute__ ((aligned (32))); __m512i initstate[8] __attribute__ ((aligned (32))); __m512i midstate[8] __attribute__ ((aligned (32))); + __m512i midstate2[8] __attribute__ ((aligned (32))); uint32_t lane_hash[8] __attribute__ ((aligned (32))); __m512i vdata[20] __attribute__ ((aligned (32))); uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] ); @@ -23,7 +24,7 @@ int scanhash_sha256d_16way( struct work *work, const uint32_t max_nonce, const uint32_t first_nonce = pdata[19]; const uint32_t last_nonce = max_nonce - 16; uint32_t n = first_nonce; - __m512i *noncev = vdata + 19; + __m512i *noncev = vdata + 19; const int thr_id = mythr->id; const bool bench = opt_benchmark; const __m512i last_byte = m512_const1_32( 0x80000000 ); @@ -45,27 +46,30 @@ int scanhash_sha256d_16way( struct work *work, const uint32_t max_nonce, initstate[6] = m512_const1_64( 0x1F83D9AB1F83D9AB ); initstate[7] = m512_const1_64( 0x5BE0CD195BE0CD19 ); - // hash first 64 bytes of data - sha256_16way_transform( midstate, vdata, initstate ); + // hash first 64 byte block of data + sha256_16way_transform_le( midstate, vdata, initstate ); + + // Do 3 rounds on the first 12 bytes of the next block + sha256_16way_prehash_3rounds( midstate2, vdata + 16, midstate ); do { // 1. final 16 bytes of data, with padding memcpy_512( block, vdata + 16, 4 ); block[ 4] = last_byte; - memset_zero_512( block + 5, 10 ); + memset_zero_512( block + 5, 10 ); block[15] = m512_const1_32( 80*8 ); // bit count - sha256_16way_transform( hash32, block, midstate ); + sha256_16way_final_rounds( hash32, block, midstate, midstate2 ); // 2. 32 byte hash from 1. memcpy_512( block, hash32, 8 ); block[ 8] = last_byte; memset_zero_512( block + 9, 6 ); block[15] = m512_const1_32( 32*8 ); // bit count - sha256_16way_transform( hash32, block, initstate ); + sha256_16way_transform_le( hash32, block, initstate ); // byte swap final hash for testing - mm512_block_bswap_32( hash32, hash32 ); + mm512_block_bswap_32( hash32, hash32 ); for ( int lane = 0; lane < 16; lane++ ) if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) ) @@ -85,7 +89,6 @@ int scanhash_sha256d_16way( struct work *work, const uint32_t max_nonce, return 0; } - #endif #if defined(SHA256D_8WAY) @@ -128,7 +131,7 @@ int scanhash_sha256d_8way( struct work *work, const uint32_t max_nonce, initstate[7] = m256_const1_64( 0x5BE0CD195BE0CD19 ); // hash first 64 bytes of data - sha256_8way_transform( midstate, vdata, initstate ); + sha256_8way_transform_le( midstate, vdata, initstate ); do { @@ -137,14 +140,14 @@ int scanhash_sha256d_8way( struct work *work, const uint32_t max_nonce, block[ 4] = last_byte; memset_zero_256( block + 5, 10 ); block[15] = m256_const1_32( 80*8 ); // bit count - sha256_8way_transform( hash32, block, midstate ); + sha256_8way_transform_le( hash32, block, midstate ); // 2. 32 byte hash from 1. 
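// Editor's note: an illustrative scalar sketch of the sha256d flow used by
// the vectorised scanhash routines here; it is not part of this patch.  The
// 80-byte header is byte swapped once up front, the first 64-byte block is
// hashed with the _le transform, and every later block (including the second
// sha256 over the 32-byte result) is fed to the _le transform directly, so no
// per-nonce byte swapping is needed until the final target test.
// sha256_transform_le comes from the new sha256-hash.h; the IV is copied
// locally because SHA256_IV is file-static in sha256-hash.c.
#include <stdint.h>
#include <string.h>
#include "sha256-hash.h"
static void sha256d_le_sketch( uint32_t hash32[8], const uint32_t data_le[20] )
{
   static const uint32_t iv[8] = { 0x6A09E667, 0xBB67AE85, 0x3C6EF372,
       0xA54FF53A, 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19 };
   uint32_t midstate[8], block[16];

   // first 64 bytes of the 80 byte header, done once per job
   sha256_transform_le( midstate, data_le, iv );

   // final 16 bytes of the header plus padding and bit count
   memcpy( block, data_le + 16, 16 );
   block[ 4] = 0x80000000;
   memset( block + 5, 0, 40 );
   block[15] = 80*8;
   sha256_transform_le( hash32, block, midstate );

   // second sha256 over the 32 byte LE result, again without byte swapping
   memcpy( block, hash32, 32 );
   block[ 8] = 0x80000000;
   memset( block + 9, 0, 24 );
   block[15] = 32*8;
   sha256_transform_le( hash32, block, iv );
   // hash32 is little endian here; byte swap only for the target comparison.
}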
memcpy_256( block, hash32, 8 ); block[ 8] = last_byte; memset_zero_256( block + 9, 6 ); block[15] = m256_const1_32( 32*8 ); // bit count - sha256_8way_transform( hash32, block, initstate ); + sha256_8way_transform_le( hash32, block, initstate ); // byte swap final hash for testing mm256_block_bswap_32( hash32, hash32 ); @@ -209,7 +212,7 @@ int scanhash_sha256d_4way( struct work *work, const uint32_t max_nonce, initstate[7] = m128_const1_64( 0x5BE0CD195BE0CD19 ); // hash first 64 bytes of data - sha256_4way_transform( midstate, vdata, initstate ); + sha256_4way_transform_le( midstate, vdata, initstate ); do { @@ -218,14 +221,14 @@ int scanhash_sha256d_4way( struct work *work, const uint32_t max_nonce, block[ 4] = last_byte; memset_zero_128( block + 5, 10 ); block[15] = m128_const1_32( 80*8 ); // bit count - sha256_4way_transform( hash32, block, midstate ); + sha256_4way_transform_le( hash32, block, midstate ); // 2. 32 byte hash from 1. memcpy_128( block, hash32, 8 ); block[ 8] = last_byte; memset_zero_128( block + 9, 6 ); block[15] = m128_const1_32( 32*8 ); // bit count - sha256_4way_transform( hash32, block, initstate ); + sha256_4way_transform_le( hash32, block, initstate ); // byte swap final hash for testing mm128_block_bswap_32( hash32, hash32 ); diff --git a/algo/sha/sha256d.c b/algo/sha/sha256d.c new file mode 100644 index 00000000..ed4bd60d --- /dev/null +++ b/algo/sha/sha256d.c @@ -0,0 +1,8 @@ +#include "sha256d.h" + +void sha256d( void *hash, const void *data, int len ) +{ + sha256_full( hash, data, len ); + sha256_full( hash, hash, 32 ); +} + diff --git a/algo/sha/sha256d.h b/algo/sha/sha256d.h new file mode 100644 index 00000000..71f78eeb --- /dev/null +++ b/algo/sha/sha256d.h @@ -0,0 +1,7 @@ +#include "algo-gate-api.h" +#include +#include +#include "sha256-hash.h" + +void sha256d( void *hash, const void *data, int len ); + diff --git a/algo/sha/sha256q.c b/algo/sha/sha256q.c index cf9890e7..90a2b7bb 100644 --- a/algo/sha/sha256q.c +++ b/algo/sha/sha256q.c @@ -3,14 +3,14 @@ #include #include #include -#include "algo/sha/sph_sha2.h" +#include "algo/sha/sha256-hash.h" -static __thread sph_sha256_context sha256q_ctx __attribute__ ((aligned (64))); +static __thread sha256_context sha256q_ctx __attribute__ ((aligned (64))); void sha256q_midstate( const void* input ) { - sph_sha256_init( &sha256q_ctx ); - sph_sha256( &sha256q_ctx, input, 64 ); + sha256_ctx_init( &sha256q_ctx ); + sha256_update( &sha256q_ctx, input, 64 ); } int sha256q_hash( void* output, const void* input ) @@ -19,24 +19,16 @@ int sha256q_hash( void* output, const void* input ) const int midlen = 64; // bytes const int tail = 80 - midlen; // 16 - sph_sha256_context ctx __attribute__ ((aligned (64))); + sha256_context ctx __attribute__ ((aligned (64))); memcpy( &ctx, &sha256q_ctx, sizeof sha256q_ctx ); - sph_sha256( &ctx, input + midlen, tail ); - sph_sha256_close( &ctx, hash ); - - sph_sha256_init( &ctx ); - sph_sha256( &ctx, hash, 32 ); - sph_sha256_close( &ctx, hash ); - - sph_sha256_init( &ctx ); - sph_sha256( &ctx, hash, 32 ); - sph_sha256_close( &ctx, hash ); - - sph_sha256_init( &ctx ); - sph_sha256( &ctx, hash, 32 ); - sph_sha256_close( &ctx, output ); + sha256_update( &ctx, input + midlen, tail ); + sha256_final( &ctx, hash ); + sha256_full( hash, hash, 32 ); + sha256_full( hash, hash, 32 ); + sha256_full( output, hash, 32 ); + return 1; } diff --git a/algo/sha/sha256t-4way.c b/algo/sha/sha256t-4way.c index 0f4fb58d..12cbcde2 100644 --- a/algo/sha/sha256t-4way.c +++ b/algo/sha/sha256t-4way.c @@ -47,7 +47,7 @@ 
int scanhash_sha256t_16way( struct work *work, const uint32_t max_nonce, initstate[7] = m512_const1_64( 0x5BE0CD195BE0CD19 ); // hash first 64 byte block of data - sha256_16way_transform( midstate, vdata, initstate ); + sha256_16way_transform_le( midstate, vdata, initstate ); // Do 3 rounds on the first 12 bytes of the next block sha256_16way_prehash_3rounds( midstate2, vdata + 16, midstate ); @@ -60,18 +60,17 @@ int scanhash_sha256t_16way( struct work *work, const uint32_t max_nonce, memset_zero_512( block + 5, 10 ); block[15] = m512_const1_32( 80*8 ); // bit count sha256_16way_final_rounds( hash32, block, midstate, midstate2 ); -// sha256_16way_transform( hash32, block, midstate ); // 2. 32 byte hash from 1. memcpy_512( block, hash32, 8 ); block[ 8] = last_byte; memset_zero_512( block + 9, 6 ); block[15] = m512_const1_32( 32*8 ); // bit count - sha256_16way_transform( hash32, block, initstate ); + sha256_16way_transform_le( hash32, block, initstate ); // 3. 32 byte hash from 2. memcpy_512( block, hash32, 8 ); - sha256_16way_transform( hash32, block, initstate ); + sha256_16way_transform_le( hash32, block, initstate ); // byte swap final hash for testing mm512_block_bswap_32( hash32, hash32 ); @@ -137,7 +136,7 @@ int scanhash_sha256t_8way( struct work *work, const uint32_t max_nonce, initstate[7] = m256_const1_64( 0x5BE0CD195BE0CD19 ); // hash first 64 bytes of data - sha256_8way_transform( midstate, vdata, initstate ); + sha256_8way_transform_le( midstate, vdata, initstate ); do { @@ -146,18 +145,18 @@ int scanhash_sha256t_8way( struct work *work, const uint32_t max_nonce, block[ 4] = last_byte; memset_zero_256( block + 5, 10 ); block[15] = m256_const1_32( 80*8 ); // bit count - sha256_8way_transform( hash32, block, midstate ); + sha256_8way_transform_le( hash32, block, midstate ); // 2. 32 byte hash from 1. memcpy_256( block, hash32, 8 ); block[ 8] = last_byte; memset_zero_256( block + 9, 6 ); block[15] = m256_const1_32( 32*8 ); // bit count - sha256_8way_transform( hash32, block, initstate ); + sha256_8way_transform_le( hash32, block, initstate ); // 3. 32 byte hash from 2. memcpy_256( block, hash32, 8 ); - sha256_8way_transform( hash32, block, initstate ); + sha256_8way_transform_le( hash32, block, initstate ); // byte swap final hash for testing mm256_block_bswap_32( hash32, hash32 ); @@ -222,7 +221,7 @@ int scanhash_sha256t_4way( struct work *work, const uint32_t max_nonce, initstate[7] = m128_const1_64( 0x5BE0CD195BE0CD19 ); // hash first 64 bytes of data - sha256_4way_transform( midstate, vdata, initstate ); + sha256_4way_transform_le( midstate, vdata, initstate ); do { @@ -231,18 +230,18 @@ int scanhash_sha256t_4way( struct work *work, const uint32_t max_nonce, block[ 4] = last_byte; memset_zero_128( block + 5, 10 ); block[15] = m128_const1_32( 80*8 ); // bit count - sha256_4way_transform( hash32, block, midstate ); + sha256_4way_transform_le( hash32, block, midstate ); // 2. 32 byte hash from 1. memcpy_128( block, hash32, 8 ); block[ 8] = last_byte; memset_zero_128( block + 9, 6 ); block[15] = m128_const1_32( 32*8 ); // bit count - sha256_4way_transform( hash32, block, initstate ); + sha256_4way_transform_le( hash32, block, initstate ); // 3. 32 byte hash from 2. 
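// Editor's note: illustrative scalar sketch of why three rounds of the final
// block can be prehashed (sha256_16way_prehash_3rounds above); it is not part
// of this patch and does not claim to match the real implementation's layout.
// The nonce is message word 3 of that block (pdata[19]); words 0-2
// (pdata[16..18], the "first 12 bytes") are fixed for the whole scan, so
// rounds 0-2 depend only on the midstate and can be computed once per job,
// with per-nonce work resuming at round 3.
#include <stdint.h>
#define ROTR32(x,n)  ( ((x) >> (n)) | ((x) << (32 - (n))) )
#define CH_S(e,f,g)  ( (g) ^ ( (e) & ( (f) ^ (g) ) ) )
#define MAJ_S(a,b,c) ( ( (a) & (b) ) | ( (c) & ( (a) | (b) ) ) )
#define BSG0_S(a)    ( ROTR32(a,2) ^ ROTR32(a,13) ^ ROTR32(a,22) )
#define BSG1_S(e)    ( ROTR32(e,6) ^ ROTR32(e,11) ^ ROTR32(e,25) )
// One scalar round with the same data flow as the SHA2s_*WAY_STEP macros.
#define STEP_S( A, B, C, D, E, F, G, H, k, w ) do { \
   uint32_t T1 = (H) + BSG1_S(E) + CH_S(E,F,G) + (k) + (w); \
   uint32_t T2 = BSG0_S(A) + MAJ_S(A,B,C); \
   (D) += T1;  (H) = T1 + T2; \
} while (0)

static void prehash_3rounds_sketch( uint32_t state_mid[8],
                   const uint32_t W[3],        // pdata[16..18], nonce excluded
                   const uint32_t midstate[8] )
{
   // First three SHA-256 round constants.
   static const uint32_t K0_2[3] = { 0x428A2F98, 0x71374491, 0xB5C0FBCF };
   uint32_t A = midstate[0], B = midstate[1], C = midstate[2],
            D = midstate[3], E = midstate[4], F = midstate[5],
            G = midstate[6], H = midstate[7];

   STEP_S( A, B, C, D, E, F, G, H, K0_2[0], W[0] );
   STEP_S( H, A, B, C, D, E, F, G, K0_2[1], W[1] );
   STEP_S( G, H, A, B, C, D, E, F, K0_2[2], W[2] );

   state_mid[0] = A; state_mid[1] = B; state_mid[2] = C; state_mid[3] = D;
   state_mid[4] = E; state_mid[5] = F; state_mid[6] = G; state_mid[7] = H;
   // The per-nonce final rounds then set W[3] = nonce and run rounds 3-63.
}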
memcpy_128( block, hash32, 8 ); - sha256_4way_transform( hash32, block, initstate ); + sha256_4way_transform_le( hash32, block, initstate ); // byte swap final hash for testing mm128_block_bswap_32( hash32, hash32 ); diff --git a/algo/sha/sha256t.c b/algo/sha/sha256t.c index 90d2754b..c528d279 100644 --- a/algo/sha/sha256t.c +++ b/algo/sha/sha256t.c @@ -4,120 +4,12 @@ #include #include //#include "algo/sha/sph_sha2.h" -#include "sha256-hash-opt.h" +#include "sha256-hash.h" #if defined(__SHA__) // Only used on CPUs with SHA -/* -static __thread sph_sha256_context sha256t_ctx __attribute__ ((aligned (64))); - -void sha256t_midstate( const void* input ) -{ - sph_sha256_init( &sha256t_ctx ); - sph_sha256( &sha256t_ctx, input, 64 ); -} - -int sha256t_hash( void* output, const void* input ) -{ - uint32_t _ALIGN(64) hash[16]; - const int midlen = 64; // bytes - const int tail = 80 - midlen; // 16 - - sph_sha256_context ctx __attribute__ ((aligned (64))); - memcpy( &ctx, &sha256t_ctx, sizeof sha256t_ctx ); - - sph_sha256( &ctx, input + midlen, tail ); - sph_sha256_close( &ctx, hash ); - - sph_sha256_init( &ctx ); - sph_sha256( &ctx, hash, 32 ); - sph_sha256_close( &ctx, hash ); - - sph_sha256_init( &ctx ); - sph_sha256( &ctx, hash, 32 ); - sph_sha256_close( &ctx, output ); - - return 1; -} -*/ - -/* -int scanhash_sha256t( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ) -{ - uint32_t block[16] __attribute__ ((aligned (64))); - uint32_t hash32[8] __attribute__ ((aligned (32))); - uint32_t initstate[8] __attribute__ ((aligned (32))); - uint32_t midstate[8] __attribute__ ((aligned (32))); - - - -// uint32_t edata[20] __attribute__((aligned(64))); -// uint32_t hash[8] __attribute__((aligned(64))); - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; - const uint32_t first_nonce = pdata[19]; - const uint32_t last_nonce = max_nonce - 1; - uint32_t n = first_nonce; - const int thr_id = mythr->id; - const bool bench = opt_benchmark; - __m128i shuf_bswap32 = - _mm_set_epi64x( 0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL ); - -// mm128_bswap32_80( edata, pdata ); -// sha256t_midstate( edata ); - - // initialize state - initstate[0] = 0x6A09E667; - initstate[1] = 0xBB67AE85; - initstate[2] = 0x3C6EF372; - initstate[3] = 0xA54FF53A; - initstate[4] = 0x510E527F; - initstate[5] = 0x9B05688C; - initstate[6] = 0x1F83D9AB; - initstate[7] = 0x5BE0CD19; - - // hash first 64 bytes of data - sha256_opt_transform( midstate, pdata, initstate ); - - do - { - // 1. final 16 bytes of data, with padding - memcpy( block, pdata + 16, 16 ); - block[ 4] = 0x80000000; - memset( block + 5, 0, 40 ); - block[15] = 80*8; // bit count - sha256_opt_transform( hash32, block, midstate ); - - // 2. 32 byte hash from 1. - memcpy( block, hash32, 32 ); - block[ 8] = 0x80000000; - memset( block + 9, 0, 24 ); - block[15] = 32*8; // bit count - sha256_opt_transform( hash32, block, initstate ); - - // 3. 32 byte hash from 2. 
- memcpy( block, hash32, 32 ); - sha256_opt_transform( hash32, block, initstate ); - - // byte swap final hash for testing - casti_m128i( hash32, 0 ) = - _mm_shuffle_epi8( casti_m128i( hash32, 0 ), shuf_bswap32 ); - casti_m128i( hash32, 1 ) = - _mm_shuffle_epi8( casti_m128i( hash32, 1 ), shuf_bswap32 ); - - if ( unlikely( valid_hash( hash32, ptarget ) && !bench ) ) - submit_solution( work, hash32, mythr ); - n++; - pdata[19] = n; - } while ( (n < last_nonce) && !work_restart[thr_id].restart ); - - *hashes_done = n - first_nonce; - return 0; -} -*/ int scanhash_sha256t( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ) @@ -149,7 +41,7 @@ int scanhash_sha256t( struct work *work, uint32_t max_nonce, initstate[7] = 0x5BE0CD19; // hash first 64 bytes of data - sha256_opt_transform( midstate, pdata, initstate ); + sha256_opt_transform_le( midstate, pdata, initstate ); do { @@ -162,7 +54,7 @@ int scanhash_sha256t( struct work *work, uint32_t max_nonce, memset( block0 + 5, 0, 40 ); memset( block1 + 5, 0, 40 ); block0[15] = block1[15] = 80*8; // bit count - sha256_ni2way_transform( hash0, hash1, block0, block1, midstate, midstate ); + sha256_ni2way_transform_le( hash0, hash1, block0, block1, midstate, midstate ); // 2. 32 byte hash from 1. memcpy( block0, hash0, 32 ); @@ -171,12 +63,12 @@ int scanhash_sha256t( struct work *work, uint32_t max_nonce, memset( block0 + 9, 0, 24 ); memset( block1 + 9, 0, 24 ); block0[15] = block1[15] = 32*8; // bit count - sha256_ni2way_transform( hash0, hash1, block0, block1, initstate, initstate ); + sha256_ni2way_transform_le( hash0, hash1, block0, block1, initstate, initstate ); // 3. 32 byte hash from 2. memcpy( block0, hash0, 32 ); memcpy( block1, hash1, 32 ); - sha256_ni2way_transform( hash0, hash1, block0, block1, initstate, initstate ); + sha256_ni2way_transform_le( hash0, hash1, block0, block1, initstate, initstate ); // byte swap final hash for testing casti_m128i( hash0, 0 ) = diff --git a/algo/sha/sha512-hash-4way.c b/algo/sha/sha512-hash-4way.c index e41a92ba..7c96d2eb 100644 --- a/algo/sha/sha512-hash-4way.c +++ b/algo/sha/sha512-hash-4way.c @@ -95,32 +95,36 @@ static const uint64_t K512[80] = // SHA-512 8 way 64 bit -#define CH8W(X, Y, Z) \ - _mm512_ternarylogic_epi64( X, Y, Z, 0xca ) +#define CH8W( X, Y, Z ) _mm512_ternarylogic_epi64( X, Y, Z, 0xca ) -#define MAJ8W(X, Y, Z) \ - _mm512_ternarylogic_epi64( X, Y, Z, 0xe8 ) +#define MAJ8W( X, Y, Z ) _mm512_ternarylogic_epi64( X, Y, Z, 0xe8 ) -#define BSG8W_5_0(x) \ - mm512_xor3( mm512_ror_64(x, 28), mm512_ror_64(x, 34), mm512_ror_64(x, 39) ) +#define BSG8W_5_0( x ) mm512_xor3( _mm512_ror_epi64( x, 28 ), \ + _mm512_ror_epi64( x, 34 ), \ + _mm512_ror_epi64( x, 39 ) ) -#define BSG8W_5_1(x) \ - mm512_xor3( mm512_ror_64(x, 14), mm512_ror_64(x, 18), mm512_ror_64(x, 41) ) +#define BSG8W_5_1( x ) mm512_xor3( _mm512_ror_epi64( x, 14 ), \ + _mm512_ror_epi64( x, 18 ), \ + _mm512_ror_epi64( x, 41 ) ) -#define SSG8W_5_0(x) \ - mm512_xor3( mm512_ror_64(x, 1), mm512_ror_64(x, 8), _mm512_srli_epi64(x, 7) ) +#define SSG8W_5_0( x ) mm512_xor3( _mm512_ror_epi64( x, 1 ), \ + _mm512_ror_epi64( x, 8 ), \ + _mm512_srli_epi64( x, 7 ) ) -#define SSG8W_5_1(x) \ - mm512_xor3( mm512_ror_64(x, 19), mm512_ror_64(x, 61), _mm512_srli_epi64(x, 6) ) +#define SSG8W_5_1( x ) mm512_xor3( _mm512_ror_epi64( x, 19 ), \ + _mm512_ror_epi64( x, 61 ), \ + _mm512_srli_epi64( x, 6 ) ) -#define SHA3_8WAY_STEP(A, B, C, D, E, F, G, H, i) \ +#define SHA3_8WAY_STEP( A, B, C, D, E, F, G, H, i ) \ do { \ - __m512i T1, T2; 
\ - __m512i K = _mm512_set1_epi64( K512[ i ] ); \ - T1 = _mm512_add_epi64( H, mm512_add4_64( BSG8W_5_1(E), CH8W(E, F, G), \ - K, W[i] ) ); \ - T2 = _mm512_add_epi64( BSG8W_5_0(A), MAJ8W(A, B, C) ); \ - D = _mm512_add_epi64( D, T1 ); \ + __m512i T0 = _mm512_add_epi64( _mm512_set1_epi64( K512[i] ), W[ i ] ); \ + __m512i T1 = BSG8W_5_1( E ); \ + __m512i T2 = BSG8W_5_0( A ); \ + T0 = _mm512_add_epi64( T0, CH8W( E, F, G ) ); \ + T1 = _mm512_add_epi64( T1, H ); \ + T2 = _mm512_add_epi64( T2, MAJ8W( A, B, C ) ); \ + T1 = _mm512_add_epi64( T1, T0 ); \ + D = _mm512_add_epi64( D, T1 ); \ H = _mm512_add_epi64( T1, T2 ); \ } while (0) @@ -267,16 +271,9 @@ void sha512_8way_close( sha512_8way_context *sc, void *dst ) // SHA-512 4 way 64 bit - #define CH(X, Y, Z) \ _mm256_xor_si256( _mm256_and_si256( _mm256_xor_si256( Y, Z ), X ), Z ) -/* -#define MAJ(X, Y, Z) \ - _mm256_or_si256( _mm256_and_si256( X, Y ), \ - _mm256_and_si256( _mm256_or_si256( X, Y ), Z ) ) -*/ - #define MAJ(X, Y, Z) \ _mm256_xor_si256( Y, _mm256_and_si256( X_xor_Y = _mm256_xor_si256( X, Y ), \ Y_xor_Z ) ) @@ -289,15 +286,6 @@ void sha512_8way_close( sha512_8way_context *sc, void *dst ) mm256_ror_64( _mm256_xor_si256( mm256_ror_64( \ _mm256_xor_si256( mm256_ror_64( x, 23 ), x ), 4 ), x ), 14 ) -/* -#define BSG5_0(x) \ - _mm256_xor_si256( _mm256_xor_si256( \ - mm256_ror_64(x, 28), mm256_ror_64(x, 34) ), mm256_ror_64(x, 39) ) - -#define BSG5_1(x) \ - _mm256_xor_si256( _mm256_xor_si256( \ - mm256_ror_64(x, 14), mm256_ror_64(x, 18) ), mm256_ror_64(x, 41) ) -*/ /* #define SSG5_0(x) \ _mm256_xor_si256( _mm256_xor_si256( \ @@ -325,94 +313,20 @@ static inline __m256i ssg512_add( __m256i w0, __m256i w1 ) return _mm256_add_epi64( w0a, w1a ); } -/* -#define SSG512x2_0( w0, w1, i ) do \ -{ \ - __m256i X0a, X1a, X0b, X1b; \ - X0a = mm256_ror_64( W[i-15], 1 ); \ - X1a = mm256_ror_64( W[i-14], 1 ); \ - X0b = mm256_ror_64( W[i-15], 8 ); \ - X1b = mm256_ror_64( W[i-14], 8 ); \ - X0a = _mm256_xor_si256( X0a, X0b ); \ - X1a = _mm256_xor_si256( X1a, X1b ); \ - X0b = _mm256_srli_epi64( W[i-15], 7 ); \ - X1b = _mm256_srli_epi64( W[i-14], 7 ); \ - w0 = _mm256_xor_si256( X0a, X0b ); \ - w1 = _mm256_xor_si256( X1a, X1b ); \ -} while(0) - -#define SSG512x2_1( w0, w1, i ) do \ -{ \ - __m256i X0a, X1a, X0b, X1b; \ - X0a = mm256_ror_64( W[i-2],19 ); \ - X1a = mm256_ror_64( W[i-1],19 ); \ - X0b = mm256_ror_64( W[i-2],61 ); \ - X1b = mm256_ror_64( W[i-1],61 ); \ - X0a = _mm256_xor_si256( X0a, X0b ); \ - X1a = _mm256_xor_si256( X1a, X1b ); \ - X0b = _mm256_srli_epi64( W[i-2], 6 ); \ - X1b = _mm256_srli_epi64( W[i-1], 6 ); \ - w0 = _mm256_xor_si256( X0a, X0b ); \ - w1 = _mm256_xor_si256( X1a, X1b ); \ -} while(0) -*/ -/* -#define SHA3_4WAY_STEP(A, B, C, D, E, F, G, H, i) \ +#define SHA3_4WAY_STEP( A, B, C, D, E, F, G, H, i ) \ do { \ - __m256i K = _mm256_set1_epi64x( K512[ i ] ); \ - __m256i T1 = mm256_ror_64( E, 23 ); \ - __m256i T2 = mm256_ror_64( A, 5 ); \ - __m256i T3 = _mm256_xor_si256( F, G ); \ - __m256i T4 = _mm256_or_si256( A, B ); \ - __m256i T5 = _mm256_and_si256( A, B ); \ - K = _mm256_add_epi64( K, W[i] ); \ - T1 = _mm256_xor_si256( T1, E ); \ - T2 = _mm256_xor_si256( T2, A ); \ - T3 = _mm256_and_si256( T3, E ); \ - T4 = _mm256_and_si256( T4, C ); \ - K = _mm256_add_epi64( H, K ); \ - T1 = mm256_ror_64( T1, 4 ); \ - T2 = mm256_ror_64( T2, 6 ); \ - T3 = _mm256_xor_si256( T3, G ); \ - T4 = _mm256_or_si256( T4, T5 ); \ - T1 = _mm256_xor_si256( T1, E ); \ - T2 = _mm256_xor_si256( T2, A ); \ - T1 = mm256_ror_64( T1, 14 ); \ - T2 = mm256_ror_64( T2, 28 ); \ - T1 
= _mm256_add_epi64( T1, T3 ); \ - T2 = _mm256_add_epi64( T2, T4 ); \ - T1 = _mm256_add_epi64( T1, K ); \ - H = _mm256_add_epi64( T1, T2 ); \ - D = _mm256_add_epi64( D, T1 ); \ -} while (0) -*/ -/* -#define SHA3_4WAY_STEP(A, B, C, D, E, F, G, H, i) \ -do { \ - __m256i K = _mm256_add_epi64( W[i], _mm256_set1_epi64x( K512[ i ] ) ); \ - __m256i T1 = BSG5_1(E); \ - __m256i T2 = BSG5_0(A); \ - T1 = mm256_add4_64( T1, H, CH(E, F, G), K ); \ - T2 = _mm256_add_epi64( T2, MAJ(A, B, C) ); \ - D = _mm256_add_epi64( D, T1 ); \ - H = _mm256_add_epi64( T1, T2 ); \ -} while (0) -*/ - - -#define SHA3_4WAY_STEP(A, B, C, D, E, F, G, H, i) \ -do { \ - __m256i T1, T2; \ - __m256i K = _mm256_set1_epi64x( K512[ i ] ); \ - T1 = _mm256_add_epi64( H, mm256_add4_64( BSG5_1(E), CH(E, F, G), \ - K, W[i] ) ); \ - T2 = _mm256_add_epi64( BSG5_0(A), MAJ(A, B, C) ); \ + __m256i T0 = _mm256_add_epi64( _mm256_set1_epi64x( K512[i] ), W[ i ] ); \ + __m256i T1 = BSG5_1( E ); \ + __m256i T2 = BSG5_0( A ); \ + T0 = _mm256_add_epi64( T0, CH( E, F, G ) ); \ + T1 = _mm256_add_epi64( T1, H ); \ + T2 = _mm256_add_epi64( T2, MAJ( A, B, C ) ); \ + T1 = _mm256_add_epi64( T1, T0 ); \ Y_xor_Z = X_xor_Y; \ - D = _mm256_add_epi64( D, T1 ); \ + D = _mm256_add_epi64( D, T1 ); \ H = _mm256_add_epi64( T1, T2 ); \ } while (0) - static void sha512_4way_round( sha512_4way_context *ctx, __m256i *in, __m256i r[8] ) { diff --git a/algo/sha/sph_sha2.c b/algo/sha/sph_sha2.c index 7e399548..a89fc8d7 100644 --- a/algo/sha/sph_sha2.c +++ b/algo/sha/sph_sha2.c @@ -71,198 +71,6 @@ static const sph_u32 H256[8] = { * of the compression function. */ -#if defined(__SHA__) - -#include "simd-utils.h" - -static void sha2_round( const uint8_t input[], uint32_t state[8] ) -{ - __m128i STATE0, STATE1; - __m128i MSG, TMP, MASK; - __m128i TMSG0, TMSG1, TMSG2, TMSG3; - __m128i ABEF_SAVE, CDGH_SAVE; - - // Load initial values - TMP = _mm_load_si128((__m128i*) &state[0]); - STATE1 = _mm_load_si128((__m128i*) &state[4]); - MASK = _mm_set_epi64x(0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL); - - TMP = _mm_shuffle_epi32(TMP, 0xB1); // CDAB - STATE1 = _mm_shuffle_epi32(STATE1, 0x1B); // EFGH - STATE0 = _mm_alignr_epi8(TMP, STATE1, 8); // ABEF - STATE1 = _mm_blend_epi16(STATE1, TMP, 0xF0); // CDGH - - // Save current hash - ABEF_SAVE = STATE0; - CDGH_SAVE = STATE1; - - // Rounds 0-3 - MSG = _mm_load_si128((const __m128i*) (input+0)); - TMSG0 = _mm_shuffle_epi8(MSG, MASK); - MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(0xE9B5DBA5B5C0FBCFULL, 0x71374491428A2F98ULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - - // Rounds 4-7 - TMSG1 = _mm_load_si128((const __m128i*) (input+16)); - TMSG1 = _mm_shuffle_epi8(TMSG1, MASK); - MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(0xAB1C5ED5923F82A4ULL, 0x59F111F13956C25BULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1); - - // Rounds 8-11 - TMSG2 = _mm_load_si128((const __m128i*) (input+32)); - TMSG2 = _mm_shuffle_epi8(TMSG2, MASK); - MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(0x550C7DC3243185BEULL, 0x12835B01D807AA98ULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2); - - // Rounds 12-15 - TMSG3 = _mm_load_si128((const __m128i*) (input+48)); - TMSG3 = 
_mm_shuffle_epi8(TMSG3, MASK); - MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(0xC19BF1749BDC06A7ULL, 0x80DEB1FE72BE5D74ULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4); - TMSG0 = _mm_add_epi32(TMSG0, TMP); - TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3); - - // Rounds 16-19 - MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(0x240CA1CC0FC19DC6ULL, 0xEFBE4786E49B69C1ULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4); - TMSG1 = _mm_add_epi32(TMSG1, TMP); - TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0); - - // Rounds 20-23 - MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(0x76F988DA5CB0A9DCULL, 0x4A7484AA2DE92C6FULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4); - TMSG2 = _mm_add_epi32(TMSG2, TMP); - TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1); - - // Rounds 24-27 - MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(0xBF597FC7B00327C8ULL, 0xA831C66D983E5152ULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4); - TMSG3 = _mm_add_epi32(TMSG3, TMP); - TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2); - - // Rounds 28-31 - MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(0x1429296706CA6351ULL, 0xD5A79147C6E00BF3ULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4); - TMSG0 = _mm_add_epi32(TMSG0, TMP); - TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3); - - // Rounds 32-35 - MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(0x53380D134D2C6DFCULL, 0x2E1B213827B70A85ULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4); - TMSG1 = _mm_add_epi32(TMSG1, TMP); - TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0); - - // Rounds 36-39 - MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(0x92722C8581C2C92EULL, 0x766A0ABB650A7354ULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4); - TMSG2 = _mm_add_epi32(TMSG2, TMP); - TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1); - - // Rounds 40-43 - MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(0xC76C51A3C24B8B70ULL, 0xA81A664BA2BFE8A1ULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4); - TMSG3 = _mm_add_epi32(TMSG3, TMP); - TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2); - - // Rounds 44-47 - MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(0x106AA070F40E3585ULL, 
0xD6990624D192E819ULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4); - TMSG0 = _mm_add_epi32(TMSG0, TMP); - TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3); - - // Rounds 48-51 - MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(0x34B0BCB52748774CULL, 0x1E376C0819A4C116ULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4); - TMSG1 = _mm_add_epi32(TMSG1, TMP); - TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0); - - // Rounds 52-55 - MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(0x682E6FF35B9CCA4FULL, 0x4ED8AA4A391C0CB3ULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4); - TMSG2 = _mm_add_epi32(TMSG2, TMP); - TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - - // Rounds 56-59 - MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(0x8CC7020884C87814ULL, 0x78A5636F748F82EEULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4); - TMSG3 = _mm_add_epi32(TMSG3, TMP); - TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - - // Rounds 60-63 - MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(0xC67178F2BEF9A3F7ULL, 0xA4506CEB90BEFFFAULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - - // Add values back to state - STATE0 = _mm_add_epi32(STATE0, ABEF_SAVE); - STATE1 = _mm_add_epi32(STATE1, CDGH_SAVE); - - TMP = _mm_shuffle_epi32(STATE0, 0x1B); // FEBA - STATE1 = _mm_shuffle_epi32(STATE1, 0xB1); // DCHG - STATE0 = _mm_blend_epi16(TMP, STATE1, 0xF0); // DCBA - STATE1 = _mm_alignr_epi8(STATE1, TMP, 8); // ABEF - - // Save state - _mm_store_si128((__m128i*) &state[0], STATE0); - _mm_store_si128((__m128i*) &state[4], STATE1); -} - -#else // no SHA /* static const sph_u32 K[64] = { @@ -875,8 +683,24 @@ sha2_round(const unsigned char *data, sph_u32 r[8]) #undef SHA2_IN } -#endif // SHA else +void sph_sha256_transform_le( uint32_t *state_out, const uint32_t *data, + const uint32_t *state_in ) +{ +memcpy( state_out, state_in, 32 ); +#define SHA2_IN(x) (data[x]) + SHA2_ROUND_BODY( SHA2_IN, state_out ); +#undef SHA2_IN +} +void sph_sha256_transform_be( uint32_t *state_out, const uint32_t *data, + const uint32_t *state_in ) +{ +memcpy( state_out, state_in, 32 ); +#define SHA2_IN(x) sph_dec32be_aligned( data+(x) ) + SHA2_ROUND_BODY( SHA2_IN, state_out ); +#undef SHA2_IN + +} /* see sph_sha2.h */ void diff --git a/algo/sha/sph_sha2.h b/algo/sha/sph_sha2.h index e3a83eb8..b76c3f4b 100644 --- a/algo/sha/sph_sha2.h +++ b/algo/sha/sph_sha2.h @@ -207,6 +207,13 @@ void sph_sha256_comp(const sph_u32 msg[16], sph_u32 val[8]); void sph_sha256_full( void *dst, const void *data, size_t len ); +// These shouldn't be called directly, use sha256-hash.h generic functions +// sha256_transform_le & sha256_transform_be instead. 
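// Editor's note: a minimal usage sketch of the generic dispatch, not part of
// this patch.  Algorithm code includes sha256-hash.h and calls
// sha256_transform_le / sha256_transform_be; with __SHA__ those macros
// resolve to the SHA-NI routines (sha256_opt_transform_*), otherwise to the
// sph_ fallbacks declared just below, so callers never name a backend.
#include <stdint.h>
#include "sha256-hash.h"

void hash_one_be_block_sketch( uint32_t out_state[8],
                               const uint32_t block_be[16],
                               const uint32_t in_state[8] )
{
   // Compiles to sha256_opt_transform_be on SHA-capable builds and to
   // sph_sha256_transform_be elsewhere.
   sha256_transform_be( out_state, block_be, in_state );
}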
+void sph_sha256_transform_le( uint32_t *state_out, const uint32_t *data, + const uint32_t *state_in ); + +void sph_sha256_transform_be( uint32_t *state_out, const uint32_t *data, + const uint32_t *state_in ); #if SPH_64 diff --git a/algo/shavite/shavite-hash-2way.c b/algo/shavite/shavite-hash-2way.c index 1b774263..9c71459a 100644 --- a/algo/shavite/shavite-hash-2way.c +++ b/algo/shavite/shavite-hash-2way.c @@ -20,8 +20,8 @@ static const uint32_t IV512[] = #define mm256_ror2x256hi_1x32( a, b ) \ - _mm256_blend_epi32( mm256_ror128_32( a ), \ - mm256_ror128_32( b ), 0x88 ) + _mm256_blend_epi32( mm256_shuflr128_32( a ), \ + mm256_shuflr128_32( b ), 0x88 ) #if defined(__VAES__) @@ -78,7 +78,7 @@ c512_2way( shavite512_2way_context *ctx, const void *msg ) { // round 1, 5, 9 - k00 = _mm256_xor_si256( k13, mm256_ror128_32( + k00 = _mm256_xor_si256( k13, mm256_shuflr128_32( mm256_aesenc_2x128( k00, zero ) ) ); if ( r == 0 ) @@ -88,7 +88,7 @@ c512_2way( shavite512_2way_context *ctx, const void *msg ) x = mm256_aesenc_2x128( _mm256_xor_si256( p0, k00 ), zero ); k01 = _mm256_xor_si256( k00, - mm256_ror128_32( mm256_aesenc_2x128( k01, zero ) ) ); + mm256_shuflr128_32( mm256_aesenc_2x128( k01, zero ) ) ); if ( r == 1 ) k01 = _mm256_xor_si256( k01, _mm256_set_epi32( @@ -97,25 +97,25 @@ c512_2way( shavite512_2way_context *ctx, const void *msg ) x = mm256_aesenc_2x128( _mm256_xor_si256( x, k01 ), zero ); k02 = _mm256_xor_si256( k01, - mm256_ror128_32( mm256_aesenc_2x128( k02, zero ) ) ); + mm256_shuflr128_32( mm256_aesenc_2x128( k02, zero ) ) ); x = mm256_aesenc_2x128( _mm256_xor_si256( x, k02 ), zero ); k03 = _mm256_xor_si256( k02, - mm256_ror128_32( mm256_aesenc_2x128( k03, zero ) ) ); + mm256_shuflr128_32( mm256_aesenc_2x128( k03, zero ) ) ); x = mm256_aesenc_2x128( _mm256_xor_si256( x, k03 ), zero ); p3 = _mm256_xor_si256( p3, x ); k10 = _mm256_xor_si256( k03, - mm256_ror128_32( mm256_aesenc_2x128( k10, zero ) ) ); + mm256_shuflr128_32( mm256_aesenc_2x128( k10, zero ) ) ); x = mm256_aesenc_2x128( _mm256_xor_si256( p2, k10 ), zero ); k11 = _mm256_xor_si256( k10, - mm256_ror128_32( mm256_aesenc_2x128( k11, zero ) ) ); + mm256_shuflr128_32( mm256_aesenc_2x128( k11, zero ) ) ); x = mm256_aesenc_2x128( _mm256_xor_si256( x, k11 ), zero ); k12 = _mm256_xor_si256( k11, - mm256_ror128_32( mm256_aesenc_2x128( k12, zero ) ) ); + mm256_shuflr128_32( mm256_aesenc_2x128( k12, zero ) ) ); x = mm256_aesenc_2x128( _mm256_xor_si256( x, k12 ), zero ); k13 = _mm256_xor_si256( k12, - mm256_ror128_32( mm256_aesenc_2x128( k13, zero ) ) ); + mm256_shuflr128_32( mm256_aesenc_2x128( k13, zero ) ) ); if ( r == 2 ) k13 = _mm256_xor_si256( k13, _mm256_set_epi32( @@ -151,31 +151,31 @@ c512_2way( shavite512_2way_context *ctx, const void *msg ) // round 3, 7, 11 - k00 = _mm256_xor_si256( mm256_ror128_32( + k00 = _mm256_xor_si256( mm256_shuflr128_32( mm256_aesenc_2x128( k00, zero ) ), k13 ); x = mm256_aesenc_2x128( _mm256_xor_si256( p2, k00 ), zero ); - k01 = _mm256_xor_si256( mm256_ror128_32( + k01 = _mm256_xor_si256( mm256_shuflr128_32( mm256_aesenc_2x128( k01, zero ) ), k00 ); x = mm256_aesenc_2x128( _mm256_xor_si256( x, k01 ), zero ); - k02 = _mm256_xor_si256( mm256_ror128_32( + k02 = _mm256_xor_si256( mm256_shuflr128_32( mm256_aesenc_2x128( k02, zero ) ), k01 ); x = mm256_aesenc_2x128( _mm256_xor_si256( x, k02 ), zero ); - k03 = _mm256_xor_si256( mm256_ror128_32( + k03 = _mm256_xor_si256( mm256_shuflr128_32( mm256_aesenc_2x128( k03, zero ) ), k02 ); x = mm256_aesenc_2x128( _mm256_xor_si256( x, k03 ), zero ); p1 = 
_mm256_xor_si256( p1, x ); - k10 = _mm256_xor_si256( mm256_ror128_32( + k10 = _mm256_xor_si256( mm256_shuflr128_32( mm256_aesenc_2x128( k10, zero ) ), k03 ); x = mm256_aesenc_2x128( _mm256_xor_si256( p0, k10 ), zero ); - k11 = _mm256_xor_si256( mm256_ror128_32( + k11 = _mm256_xor_si256( mm256_shuflr128_32( mm256_aesenc_2x128( k11, zero ) ), k10 ); x = mm256_aesenc_2x128( _mm256_xor_si256( x, k11 ), zero ); - k12 = _mm256_xor_si256( mm256_ror128_32( + k12 = _mm256_xor_si256( mm256_shuflr128_32( mm256_aesenc_2x128( k12, zero ) ), k11 ); x = mm256_aesenc_2x128( _mm256_xor_si256( x, k12 ), zero ); - k13 = _mm256_xor_si256( mm256_ror128_32( + k13 = _mm256_xor_si256( mm256_shuflr128_32( mm256_aesenc_2x128( k13, zero ) ), k12 ); x = mm256_aesenc_2x128( _mm256_xor_si256( x, k13 ), zero ); @@ -209,35 +209,35 @@ c512_2way( shavite512_2way_context *ctx, const void *msg ) // round 13 - k00 = _mm256_xor_si256( mm256_ror128_32( + k00 = _mm256_xor_si256( mm256_shuflr128_32( mm256_aesenc_2x128( k00, zero ) ), k13 ); x = mm256_aesenc_2x128( _mm256_xor_si256( p0, k00 ), zero ); - k01 = _mm256_xor_si256( mm256_ror128_32( + k01 = _mm256_xor_si256( mm256_shuflr128_32( mm256_aesenc_2x128( k01, zero ) ), k00 ); x = mm256_aesenc_2x128( _mm256_xor_si256( x, k01 ), zero ); - k02 = _mm256_xor_si256( mm256_ror128_32( + k02 = _mm256_xor_si256( mm256_shuflr128_32( mm256_aesenc_2x128( k02, zero ) ), k01 ); x = mm256_aesenc_2x128( _mm256_xor_si256( x, k02 ), zero ); - k03 = _mm256_xor_si256( mm256_ror128_32( + k03 = _mm256_xor_si256( mm256_shuflr128_32( mm256_aesenc_2x128( k03, zero ) ), k02 ); x = mm256_aesenc_2x128( _mm256_xor_si256( x, k03 ), zero ); p3 = _mm256_xor_si256( p3, x ); - k10 = _mm256_xor_si256( mm256_ror128_32( + k10 = _mm256_xor_si256( mm256_shuflr128_32( mm256_aesenc_2x128( k10, zero ) ), k03 ); x = mm256_aesenc_2x128( _mm256_xor_si256( p2, k10 ), zero ); - k11 = _mm256_xor_si256( mm256_ror128_32( + k11 = _mm256_xor_si256( mm256_shuflr128_32( mm256_aesenc_2x128( k11, zero ) ), k10 ); x = mm256_aesenc_2x128( _mm256_xor_si256( x, k11 ), zero ); - k12 = mm256_ror128_32( mm256_aesenc_2x128( k12, zero ) ); + k12 = mm256_shuflr128_32( mm256_aesenc_2x128( k12, zero ) ); k12 = _mm256_xor_si256( k12, _mm256_xor_si256( k11, _mm256_set_epi32( ~ctx->count2, ctx->count3, ctx->count0, ctx->count1, ~ctx->count2, ctx->count3, ctx->count0, ctx->count1 ) ) ); x = mm256_aesenc_2x128( _mm256_xor_si256( x, k12 ), zero ); - k13 = _mm256_xor_si256( mm256_ror128_32( + k13 = _mm256_xor_si256( mm256_shuflr128_32( mm256_aesenc_2x128( k13, zero ) ), k12 ); x = mm256_aesenc_2x128( _mm256_xor_si256( x, k13 ), zero ); diff --git a/algo/shavite/shavite-hash-4way.c b/algo/shavite/shavite-hash-4way.c index 2c93df96..0184ee8c 100644 --- a/algo/shavite/shavite-hash-4way.c +++ b/algo/shavite/shavite-hash-4way.c @@ -12,8 +12,8 @@ static const uint32_t IV512[] = }; #define mm512_ror2x512hi_1x32( a, b ) \ - _mm512_mask_blend_epi32( 0x8888, mm512_ror128_32( a ), \ - mm512_ror128_32( b ) ) + _mm512_mask_blend_epi32( 0x8888, mm512_shuflr128_32( a ), \ + mm512_shuflr128_32( b ) ) static void c512_4way( shavite512_4way_context *ctx, const void *msg ) @@ -60,7 +60,7 @@ c512_4way( shavite512_4way_context *ctx, const void *msg ) { // round 1, 5, 9 - K0 = _mm512_xor_si512( K7, mm512_ror128_32( + K0 = _mm512_xor_si512( K7, mm512_shuflr128_32( _mm512_aesenc_epi128( K0, m512_zero ) ) ); if ( r == 0 ) @@ -69,33 +69,33 @@ c512_4way( shavite512_4way_context *ctx, const void *msg ) X = _mm512_aesenc_epi128( _mm512_xor_si512( P0, K0 ), m512_zero ); K1 = 
_mm512_xor_si512( K0, - mm512_ror128_32( _mm512_aesenc_epi128( K1, m512_zero ) ) ); + mm512_shuflr128_32( _mm512_aesenc_epi128( K1, m512_zero ) ) ); if ( r == 1 ) - K1 = _mm512_xor_si512( K1, mm512_ror128_32( + K1 = _mm512_xor_si512( K1, mm512_shuflr128_32( _mm512_mask_xor_epi32( count, 0x1111, count, m512_neg1 ) ) ); X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K1 ), m512_zero ); K2 = _mm512_xor_si512( K1, - mm512_ror128_32( _mm512_aesenc_epi128( K2, m512_zero ) ) ); + mm512_shuflr128_32( _mm512_aesenc_epi128( K2, m512_zero ) ) ); X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K2 ), m512_zero ); K3 = _mm512_xor_si512( K2, - mm512_ror128_32( _mm512_aesenc_epi128( K3, m512_zero ) ) ); + mm512_shuflr128_32( _mm512_aesenc_epi128( K3, m512_zero ) ) ); X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K3 ), m512_zero ); P3 = _mm512_xor_si512( P3, X ); K4 = _mm512_xor_si512( K3, - mm512_ror128_32( _mm512_aesenc_epi128( K4, m512_zero ) ) ); + mm512_shuflr128_32( _mm512_aesenc_epi128( K4, m512_zero ) ) ); X = _mm512_aesenc_epi128( _mm512_xor_si512( P2, K4 ), m512_zero ); K5 = _mm512_xor_si512( K4, - mm512_ror128_32( _mm512_aesenc_epi128( K5, m512_zero ) ) ); + mm512_shuflr128_32( _mm512_aesenc_epi128( K5, m512_zero ) ) ); X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K5 ), m512_zero ); K6 = _mm512_xor_si512( K5, - mm512_ror128_32( _mm512_aesenc_epi128( K6, m512_zero ) ) ); + mm512_shuflr128_32( _mm512_aesenc_epi128( K6, m512_zero ) ) ); X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K6 ), m512_zero ); K7 = _mm512_xor_si512( K6, - mm512_ror128_32( _mm512_aesenc_epi128( K7, m512_zero ) ) ); + mm512_shuflr128_32( _mm512_aesenc_epi128( K7, m512_zero ) ) ); if ( r == 2 ) K7 = _mm512_xor_si512( K7, mm512_swap128_64( @@ -130,31 +130,31 @@ c512_4way( shavite512_4way_context *ctx, const void *msg ) // round 3, 7, 11 - K0 = _mm512_xor_si512( mm512_ror128_32( + K0 = _mm512_xor_si512( mm512_shuflr128_32( _mm512_aesenc_epi128( K0, m512_zero ) ), K7 ); X = _mm512_aesenc_epi128( _mm512_xor_si512( P2, K0 ), m512_zero ); - K1 = _mm512_xor_si512( mm512_ror128_32( + K1 = _mm512_xor_si512( mm512_shuflr128_32( _mm512_aesenc_epi128( K1, m512_zero ) ), K0 ); X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K1 ), m512_zero ); - K2 = _mm512_xor_si512( mm512_ror128_32( + K2 = _mm512_xor_si512( mm512_shuflr128_32( _mm512_aesenc_epi128( K2, m512_zero ) ), K1 ); X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K2 ), m512_zero ); - K3 = _mm512_xor_si512( mm512_ror128_32( + K3 = _mm512_xor_si512( mm512_shuflr128_32( _mm512_aesenc_epi128( K3, m512_zero ) ), K2 ); X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K3 ), m512_zero ); P1 = _mm512_xor_si512( P1, X ); - K4 = _mm512_xor_si512( mm512_ror128_32( + K4 = _mm512_xor_si512( mm512_shuflr128_32( _mm512_aesenc_epi128( K4, m512_zero ) ), K3 ); X = _mm512_aesenc_epi128( _mm512_xor_si512( P0, K4 ), m512_zero ); - K5 = _mm512_xor_si512( mm512_ror128_32( + K5 = _mm512_xor_si512( mm512_shuflr128_32( _mm512_aesenc_epi128( K5, m512_zero ) ), K4 ); X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K5 ), m512_zero ); - K6 = _mm512_xor_si512( mm512_ror128_32( + K6 = _mm512_xor_si512( mm512_shuflr128_32( _mm512_aesenc_epi128( K6, m512_zero ) ), K5 ); X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K6 ), m512_zero ); - K7 = _mm512_xor_si512( mm512_ror128_32( + K7 = _mm512_xor_si512( mm512_shuflr128_32( _mm512_aesenc_epi128( K7, m512_zero ) ), K6 ); X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K7 ), m512_zero ); @@ -187,34 +187,34 @@ c512_4way( shavite512_4way_context *ctx, const void *msg ) // round 
13 - K0 = _mm512_xor_si512( mm512_ror128_32( + K0 = _mm512_xor_si512( mm512_shuflr128_32( _mm512_aesenc_epi128( K0, m512_zero ) ), K7 ); X = _mm512_aesenc_epi128( _mm512_xor_si512( P0, K0 ), m512_zero ); - K1 = _mm512_xor_si512( mm512_ror128_32( + K1 = _mm512_xor_si512( mm512_shuflr128_32( _mm512_aesenc_epi128( K1, m512_zero ) ), K0 ); X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K1 ), m512_zero ); - K2 = _mm512_xor_si512( mm512_ror128_32( + K2 = _mm512_xor_si512( mm512_shuflr128_32( _mm512_aesenc_epi128( K2, m512_zero ) ), K1 ); X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K2 ), m512_zero ); - K3 = _mm512_xor_si512( mm512_ror128_32( + K3 = _mm512_xor_si512( mm512_shuflr128_32( _mm512_aesenc_epi128( K3, m512_zero ) ), K2 ); X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K3 ), m512_zero ); P3 = _mm512_xor_si512( P3, X ); - K4 = _mm512_xor_si512( mm512_ror128_32( + K4 = _mm512_xor_si512( mm512_shuflr128_32( _mm512_aesenc_epi128( K4, m512_zero ) ), K3 ); X = _mm512_aesenc_epi128( _mm512_xor_si512( P2, K4 ), m512_zero ); - K5 = _mm512_xor_si512( mm512_ror128_32( + K5 = _mm512_xor_si512( mm512_shuflr128_32( _mm512_aesenc_epi128( K5, m512_zero ) ), K4 ); X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K5 ), m512_zero ); - K6 = mm512_ror128_32( _mm512_aesenc_epi128( K6, m512_zero ) ); + K6 = mm512_shuflr128_32( _mm512_aesenc_epi128( K6, m512_zero ) ); K6 = _mm512_xor_si512( K6, _mm512_xor_si512( K5, _mm512_set4_epi32( ~ctx->count2, ctx->count3, ctx->count0, ctx->count1 ) ) ); X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K6 ), m512_zero ); - K7= _mm512_xor_si512( mm512_ror128_32( + K7= _mm512_xor_si512( mm512_shuflr128_32( _mm512_aesenc_epi128( K7, m512_zero ) ), K6 ); X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K7 ), m512_zero ); diff --git a/algo/shavite/sph-shavite-aesni.c b/algo/shavite/sph-shavite-aesni.c index a593cf55..d8f6febd 100644 --- a/algo/shavite/sph-shavite-aesni.c +++ b/algo/shavite/sph-shavite-aesni.c @@ -74,15 +74,15 @@ static const sph_u32 IV512[] = { #endif +/* #if defined(__AVX2__) // 2 way version of above // a[7:0] = { b[4], a[7], a[6], a[5], b[0], a[3], a[2], a[1] } - #define mm256_ror2x256hi_1x32( a, b ) \ _mm256_blend_epi32( mm256_ror256_1x32( a ), \ mm256_rol256_3x32( b ), 0x88 ) - #endif +*/ static void c512( sph_shavite_big_context *sc, const void *msg ) @@ -135,7 +135,7 @@ c512( sph_shavite_big_context *sc, const void *msg ) for ( r = 0; r < 3; r ++ ) { // round 1, 5, 9 - k00 = mm128_ror_1x32( _mm_aesenc_si128( k00, zero ) ); + k00 = mm128_shuflr_32( _mm_aesenc_si128( k00, zero ) ); k00 = _mm_xor_si128( k00, k13 ); if ( r == 0 ) @@ -144,7 +144,7 @@ c512( sph_shavite_big_context *sc, const void *msg ) x = _mm_xor_si128( p0, k00 ); x = _mm_aesenc_si128( x, zero ); - k01 = mm128_ror_1x32( _mm_aesenc_si128( k01, zero ) ); + k01 = mm128_shuflr_32( _mm_aesenc_si128( k01, zero ) ); k01 = _mm_xor_si128( k01, k00 ); if ( r == 1 ) @@ -153,31 +153,31 @@ c512( sph_shavite_big_context *sc, const void *msg ) x = _mm_xor_si128( x, k01 ); x = _mm_aesenc_si128( x, zero ); - k02 = mm128_ror_1x32( _mm_aesenc_si128( k02, zero ) ); + k02 = mm128_shuflr_32( _mm_aesenc_si128( k02, zero ) ); k02 = _mm_xor_si128( k02, k01 ); x = _mm_xor_si128( x, k02 ); x = _mm_aesenc_si128( x, zero ); - k03 = mm128_ror_1x32( _mm_aesenc_si128( k03, zero ) ); + k03 = mm128_shuflr_32( _mm_aesenc_si128( k03, zero ) ); k03 = _mm_xor_si128( k03, k02 ); x = _mm_xor_si128( x, k03 ); x = _mm_aesenc_si128( x, zero ); p3 = _mm_xor_si128( p3, x ); - k10 = mm128_ror_1x32( _mm_aesenc_si128( k10, zero ) ); + k10 = 
mm128_shuflr_32( _mm_aesenc_si128( k10, zero ) ); k10 = _mm_xor_si128( k10, k03 ); x = _mm_xor_si128( p2, k10 ); x = _mm_aesenc_si128( x, zero ); - k11 = mm128_ror_1x32( _mm_aesenc_si128( k11, zero ) ); + k11 = mm128_shuflr_32( _mm_aesenc_si128( k11, zero ) ); k11 = _mm_xor_si128( k11, k10 ); x = _mm_xor_si128( x, k11 ); x = _mm_aesenc_si128( x, zero ); - k12 = mm128_ror_1x32( _mm_aesenc_si128( k12, zero ) ); + k12 = mm128_shuflr_32( _mm_aesenc_si128( k12, zero ) ); k12 = _mm_xor_si128( k12, k11 ); x = _mm_xor_si128( x, k12 ); x = _mm_aesenc_si128( x, zero ); - k13 = mm128_ror_1x32( _mm_aesenc_si128( k13, zero ) ); + k13 = mm128_shuflr_32( _mm_aesenc_si128( k13, zero ) ); k13 = _mm_xor_si128( k13, k12 ); if ( r == 2 ) @@ -222,38 +222,38 @@ c512( sph_shavite_big_context *sc, const void *msg ) // round 3, 7, 11 - k00 = mm128_ror_1x32( _mm_aesenc_si128( k00, zero ) ); + k00 = mm128_shuflr_32( _mm_aesenc_si128( k00, zero ) ); k00 = _mm_xor_si128( k00, k13 ); x = _mm_xor_si128( p2, k00 ); x = _mm_aesenc_si128( x, zero ); - k01 = mm128_ror_1x32( _mm_aesenc_si128( k01, zero ) ); + k01 = mm128_shuflr_32( _mm_aesenc_si128( k01, zero ) ); k01 = _mm_xor_si128( k01, k00 ); x = _mm_xor_si128( x, k01 ); x = _mm_aesenc_si128( x, zero ); - k02 = mm128_ror_1x32( _mm_aesenc_si128( k02, zero ) ); + k02 = mm128_shuflr_32( _mm_aesenc_si128( k02, zero ) ); k02 = _mm_xor_si128( k02, k01 ); x = _mm_xor_si128( x, k02 ); x = _mm_aesenc_si128( x, zero ); - k03 = mm128_ror_1x32( _mm_aesenc_si128( k03, zero ) ); + k03 = mm128_shuflr_32( _mm_aesenc_si128( k03, zero ) ); k03 = _mm_xor_si128( k03, k02 ); x = _mm_xor_si128( x, k03 ); x = _mm_aesenc_si128( x, zero ); p1 = _mm_xor_si128( p1, x ); - k10 = mm128_ror_1x32( _mm_aesenc_si128( k10, zero ) ); + k10 = mm128_shuflr_32( _mm_aesenc_si128( k10, zero ) ); k10 = _mm_xor_si128( k10, k03 ); x = _mm_xor_si128( p0, k10 ); x = _mm_aesenc_si128( x, zero ); - k11 = mm128_ror_1x32( _mm_aesenc_si128( k11, zero ) ); + k11 = mm128_shuflr_32( _mm_aesenc_si128( k11, zero ) ); k11 = _mm_xor_si128( k11, k10 ); x = _mm_xor_si128( x, k11 ); x = _mm_aesenc_si128( x, zero ); - k12 = mm128_ror_1x32( _mm_aesenc_si128( k12, zero ) ); + k12 = mm128_shuflr_32( _mm_aesenc_si128( k12, zero ) ); k12 = _mm_xor_si128( k12, k11 ); x = _mm_xor_si128( x, k12 ); x = _mm_aesenc_si128( x, zero ); - k13 = mm128_ror_1x32( _mm_aesenc_si128( k13, zero ) ); + k13 = mm128_shuflr_32( _mm_aesenc_si128( k13, zero ) ); k13 = _mm_xor_si128( k13, k12 ); x = _mm_xor_si128( x, k13 ); x = _mm_aesenc_si128( x, zero ); @@ -295,39 +295,39 @@ c512( sph_shavite_big_context *sc, const void *msg ) // round 13 - k00 = mm128_ror_1x32( _mm_aesenc_si128( k00, zero ) ); + k00 = mm128_shuflr_32( _mm_aesenc_si128( k00, zero ) ); k00 = _mm_xor_si128( k00, k13 ); x = _mm_xor_si128( p0, k00 ); x = _mm_aesenc_si128( x, zero ); - k01 = mm128_ror_1x32( _mm_aesenc_si128( k01, zero ) ); + k01 = mm128_shuflr_32( _mm_aesenc_si128( k01, zero ) ); k01 = _mm_xor_si128( k01, k00 ); x = _mm_xor_si128( x, k01 ); x = _mm_aesenc_si128( x, zero ); - k02 = mm128_ror_1x32( _mm_aesenc_si128( k02, zero ) ); + k02 = mm128_shuflr_32( _mm_aesenc_si128( k02, zero ) ); k02 = _mm_xor_si128( k02, k01 ); x = _mm_xor_si128( x, k02 ); x = _mm_aesenc_si128( x, zero ); - k03 = mm128_ror_1x32( _mm_aesenc_si128( k03, zero ) ); + k03 = mm128_shuflr_32( _mm_aesenc_si128( k03, zero ) ); k03 = _mm_xor_si128( k03, k02 ); x = _mm_xor_si128( x, k03 ); x = _mm_aesenc_si128( x, zero ); p3 = _mm_xor_si128( p3, x ); - k10 = mm128_ror_1x32( _mm_aesenc_si128( k10, zero ) ); + k10 = 
mm128_shuflr_32( _mm_aesenc_si128( k10, zero ) ); k10 = _mm_xor_si128( k10, k03 ); x = _mm_xor_si128( p2, k10 ); x = _mm_aesenc_si128( x, zero ); - k11 = mm128_ror_1x32( _mm_aesenc_si128( k11, zero ) ); + k11 = mm128_shuflr_32( _mm_aesenc_si128( k11, zero ) ); k11 = _mm_xor_si128( k11, k10 ); x = _mm_xor_si128( x, k11 ); x = _mm_aesenc_si128( x, zero ); - k12 = mm128_ror_1x32( _mm_aesenc_si128( k12, zero ) ); + k12 = mm128_shuflr_32( _mm_aesenc_si128( k12, zero ) ); k12 = _mm_xor_si128( k12, _mm_xor_si128( k11, _mm_set_epi32( ~sc->count2, sc->count3, sc->count0, sc->count1 ) ) ); x = _mm_xor_si128( x, k12 ); x = _mm_aesenc_si128( x, zero ); - k13 = mm128_ror_1x32( _mm_aesenc_si128( k13, zero ) ); + k13 = mm128_shuflr_32( _mm_aesenc_si128( k13, zero ) ); k13 = _mm_xor_si128( k13, k12 ); x = _mm_xor_si128( x, k13 ); x = _mm_aesenc_si128( x, zero ); diff --git a/algo/skein/skein-4way.c b/algo/skein/skein-4way.c index a12af435..5a7cdbda 100644 --- a/algo/skein/skein-4way.c +++ b/algo/skein/skein-4way.c @@ -3,7 +3,7 @@ #include #include "skein-hash-4way.h" #include "algo/sha/sha-hash-4way.h" -#include "algo/sha/sph_sha2.h" +#include "algo/sha/sha256-hash.h" #if defined (SKEIN_8WAY) @@ -87,7 +87,6 @@ void skeinhash_4way( void *state, const void *input ) uint32_t hash1[16] __attribute__ ((aligned (64))); uint32_t hash2[16] __attribute__ ((aligned (64))); uint32_t hash3[16] __attribute__ ((aligned (64))); - sph_sha256_context ctx_sha256; #else uint32_t vhash32[16*4] __attribute__ ((aligned (64))); sha256_4way_context ctx_sha256; @@ -98,18 +97,12 @@ void skeinhash_4way( void *state, const void *input ) #if defined(__SHA__) dintrlv_4x64( hash0, hash1, hash2, hash3, vhash64, 512 ); - sph_sha256_init( &ctx_sha256 ); - sph_sha256( &ctx_sha256, hash0, 64 ); - sph_sha256_close( &ctx_sha256, hash0 ); - sph_sha256_init( &ctx_sha256 ); - sph_sha256( &ctx_sha256, hash1, 64 ); - sph_sha256_close( &ctx_sha256, hash1 ); - sph_sha256_init( &ctx_sha256 ); - sph_sha256( &ctx_sha256, hash2, 64 ); - sph_sha256_close( &ctx_sha256, hash2 ); - sph_sha256_init( &ctx_sha256 ); - sph_sha256( &ctx_sha256, hash3, 64 ); - sph_sha256_close( &ctx_sha256, hash3 ); + + sha256_full( hash0, hash0, 64 ); + sha256_full( hash1, hash1, 64 ); + sha256_full( hash2, hash2, 64 ); + sha256_full( hash3, hash3, 64 ); + intrlv_4x32( state, hash0, hash1, hash2, hash3, 256 ); #else diff --git a/algo/skein/skein.c b/algo/skein/skein.c index 91eb3252..be9bb82b 100644 --- a/algo/skein/skein.c +++ b/algo/skein/skein.c @@ -5,21 +5,18 @@ #include #include #include "sph_skein.h" -#include "algo/sha/sph_sha2.h" +#include "algo/sha/sha256-hash.h" void skeinhash(void *state, const void *input) { uint32_t hash[16] __attribute__ ((aligned (64))); sph_skein512_context ctx_skein; - sph_sha256_context ctx_sha256; sph_skein512_init( &ctx_skein ); sph_skein512( &ctx_skein, input, 80 ); sph_skein512_close( &ctx_skein, hash ); - sph_sha256_init( &ctx_sha256 ); - sph_sha256( &ctx_sha256, hash, 64 ); - sph_sha256_close( &ctx_sha256, hash ); + sha256_full( hash, hash, 64 ); memcpy(state, hash, 32); } @@ -27,8 +24,8 @@ void skeinhash(void *state, const void *input) int scanhash_skein( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ) { - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; uint32_t hash64[8] __attribute__ ((aligned (64))); uint32_t endiandata[20] __attribute__ ((aligned (64))); const uint32_t Htarg = ptarget[7]; @@ -36,7 +33,7 @@ int 
scanhash_skein( struct work *work, uint32_t max_nonce, uint32_t n = first_nonce; int thr_id = mythr->id; // thr_id arg is deprecated - swab32_array( endiandata, pdata, 20 ); + swab32_array( endiandata, pdata, 20 ); do { be32enc(&endiandata[19], n); diff --git a/algo/verthash/Verthash.c b/algo/verthash/Verthash.c index 0d971f2e..8880b45f 100644 --- a/algo/verthash/Verthash.c +++ b/algo/verthash/Verthash.c @@ -176,12 +176,6 @@ static void rotate_indexes( uint32_t *p ) */ } #endif - -static inline uint32_t rotl32( uint32_t a, size_t r ) -{ - return ( a << r ) | ( a >> (32-r) ); -} - // Vectorized and targetted version of fnv1a #if defined (__AVX2__) @@ -232,7 +226,7 @@ for ( size_t i = 0; i < VH_N_SUBSET / sizeof(uint32_t); i++ ) \ for ( size_t i = 0; i < VH_N_SUBSET / sizeof(uint32_t); i++ ) \ { \ const uint32_t *blob_off = blob + \ - ( ( fnv1a( rotl32( subset[i], r ), accumulator ) % mdiv ) \ + ( ( fnv1a( rol32( subset[i], r ), accumulator ) % mdiv ) \ * ( VH_BYTE_ALIGNMENT / sizeof(uint32_t) ) ); \ UPDATE_ACCUMULATOR; \ MULXOR; \ diff --git a/algo/verthash/verthash-gate.c b/algo/verthash/verthash-gate.c index a0103444..ec808f6b 100644 --- a/algo/verthash/verthash-gate.c +++ b/algo/verthash/verthash-gate.c @@ -1,5 +1,5 @@ #include "algo-gate-api.h" -#include "algo/sha/sph_sha2.h" +#include "algo/sha/sha256-hash.h" #include "Verthash.h" #include "tiny_sha3/sha3-4way.h" @@ -140,7 +140,7 @@ bool register_verthash_algo( algo_gate_t* gate ) uint8_t vhDataFileHash[32] = { 0 }; applog( LOG_NOTICE, "Verifying Verthash data" ); - sph_sha256_full( vhDataFileHash, verthashInfo.data, + sha256_full( vhDataFileHash, verthashInfo.data, verthashInfo.dataSize ); if ( memcmp( vhDataFileHash, verthashDatFileHash_bytes, sizeof(verthashDatFileHash_bytes) ) == 0 ) diff --git a/algo/whirlpool/whirlpool.c b/algo/whirlpool/whirlpool.c index 59fcf71c..1c6b6883 100644 --- a/algo/whirlpool/whirlpool.c +++ b/algo/whirlpool/whirlpool.c @@ -82,7 +82,7 @@ int scanhash_whirlpool( struct work* work, uint32_t max_nonce, be32enc(&endiandata[19], n ); whirlpool_hash(vhash, endiandata); - if (vhash[7] <= Htarg && fulltest(vhash, ptarget)) + if (vhash[7] <= Htarg && fulltest(vhash, ptarget) & ! 
opt_benchmark ) submit_solution( work, vhash, mythr ); } while ( n < max_nonce && !work_restart[thr_id].restart); diff --git a/algo/x16/x16r-4way.c b/algo/x16/x16r-4way.c index 29739525..8d4fb058 100644 --- a/algo/x16/x16r-4way.c +++ b/algo/x16/x16r-4way.c @@ -52,10 +52,10 @@ void x16r_8way_prehash( void *vdata, void *pdata ) break; case CUBEHASH: mm128_bswap32_80( edata, pdata ); - cubehashInit( &x16r_ctx.cube, 512, 16, 32 ); - cubehashUpdate( &x16r_ctx.cube, (const byte*)edata, 64 ); - intrlv_8x64( vdata, edata, edata, edata, edata, - edata, edata, edata, edata, 640 ); + intrlv_4x128( vdata2, edata, edata, edata, edata, 640 ); + cube_4way_init( &x16r_ctx.cube, 512, 16, 32 ); + cube_4way_update( &x16r_ctx.cube, vdata2, 64 ); + rintrlv_4x128_8x64( vdata, vdata2, vdata2, 640 ); break; case HAMSI: mm512_bswap32_intrlv80_8x64( vdata, pdata ); @@ -207,15 +207,15 @@ int x16r_8way_hash_generic( void* output, const void* input, int thrid ) case LUFFA: if ( i == 0 ) { - intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 ); - luffa_4way_update_close( &ctx.luffa, vhash, - vhash + (16<<2), 16 ); - dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash ); - memcpy( &ctx, &x16r_ctx, sizeof(ctx) ); - intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 ); - luffa_4way_update_close( &ctx.luffa, vhash, - vhash + (16<<2), 16 ); - dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash ); + intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 ); + luffa_4way_update_close( &ctx.luffa, vhash, + vhash + (16<<2), 16 ); + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash ); + memcpy( &ctx, &x16r_ctx, sizeof(ctx) ); + intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 ); + luffa_4way_update_close( &ctx.luffa, vhash, + vhash + (16<<2), 16 ); + dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash ); } else { @@ -230,56 +230,24 @@ int x16r_8way_hash_generic( void* output, const void* input, int thrid ) case CUBEHASH: if ( i == 0 ) { - cubehashUpdateDigest( &ctx.cube, (byte*)hash0, - (const byte*)in0 + 64, 16 ); - memcpy( &ctx, &x16r_ctx, sizeof(ctx) ); - cubehashUpdateDigest( &ctx.cube, (byte*)hash1, - (const byte*)in1 + 64, 16 ); - memcpy( &ctx, &x16r_ctx, sizeof(ctx) ); - cubehashUpdateDigest( &ctx.cube, (byte*)hash2, - (const byte*)in2 + 64, 16 ); - memcpy( &ctx, &x16r_ctx, sizeof(ctx) ); - cubehashUpdateDigest( &ctx.cube, (byte*)hash3, - (const byte*)in3 + 64, 16 ); - memcpy( &ctx, &x16r_ctx, sizeof(ctx) ); - cubehashUpdateDigest( &ctx.cube, (byte*)hash4, - (const byte*)in4 + 64, 16 ); - memcpy( &ctx, &x16r_ctx, sizeof(ctx) ); - cubehashUpdateDigest( &ctx.cube, (byte*)hash5, - (const byte*)in5 + 64, 16 ); - memcpy( &ctx, &x16r_ctx, sizeof(ctx) ); - cubehashUpdateDigest( &ctx.cube, (byte*)hash6, - (const byte*)in6 + 64, 16 ); + intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 ); + cube_4way_update_close( &ctx.cube, vhash, + vhash + (16<<2), 16 ); + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash ); memcpy( &ctx, &x16r_ctx, sizeof(ctx) ); - cubehashUpdateDigest( &ctx.cube, (byte*)hash7, - (const byte*)in7 + 64, 16 ); + intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 ); + cube_4way_update_close( &ctx.cube, vhash, + vhash + (16<<2), 16 ); + dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash ); } else { - cubehashInit( &ctx.cube, 512, 16, 32 ); - cubehashUpdateDigest( &ctx.cube, (byte*) hash0, - (const byte*)in0, size ); - cubehashInit( &ctx.cube, 512, 16, 32 ); - cubehashUpdateDigest( &ctx.cube, (byte*) hash1, - (const byte*)in1, size ); - cubehashInit( &ctx.cube, 512, 16, 32 ); - cubehashUpdateDigest( &ctx.cube, (byte*) 
hash2, - (const byte*)in2, size ); - cubehashInit( &ctx.cube, 512, 16, 32 ); - cubehashUpdateDigest( &ctx.cube, (byte*) hash3, - (const byte*)in3, size ); - cubehashInit( &ctx.cube, 512, 16, 32 ); - cubehashUpdateDigest( &ctx.cube, (byte*) hash4, - (const byte*)in4, size ); - cubehashInit( &ctx.cube, 512, 16, 32 ); - cubehashUpdateDigest( &ctx.cube, (byte*) hash5, - (const byte*)in5, size ); - cubehashInit( &ctx.cube, 512, 16, 32 ); - cubehashUpdateDigest( &ctx.cube, (byte*) hash6, - (const byte*)in6, size ); - cubehashInit( &ctx.cube, 512, 16, 32 ); - cubehashUpdateDigest( &ctx.cube, (byte*) hash7, - (const byte*)in7, size ); + intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 ); + cube_4way_full( &ctx.cube, vhash, 512, vhash, size ); + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash ); + intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 ); + cube_4way_full( &ctx.cube, vhash, 512, vhash, size ); + dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash ); } break; case SHAVITE: @@ -556,9 +524,10 @@ void x16r_4way_prehash( void *vdata, void *pdata ) break; case CUBEHASH: mm128_bswap32_80( edata, pdata ); - cubehashInit( &x16r_ctx.cube, 512, 16, 32 ); - cubehashUpdate( &x16r_ctx.cube, (const byte*)edata, 64 ); - intrlv_4x64( vdata, edata, edata, edata, edata, 640 ); + intrlv_2x128( vdata2, edata, edata, 640 ); + cube_2way_init( &x16r_ctx.cube, 512, 16, 32 ); + cube_2way_update( &x16r_ctx.cube, vdata2, 64 ); + rintrlv_2x128_4x64( vdata, vdata2, vdata2, 640 ); break; case HAMSI: mm256_bswap32_intrlv80_4x64( vdata, pdata ); @@ -680,13 +649,13 @@ int x16r_4way_hash_generic( void* output, const void* input, int thrid ) case LUFFA: if ( i == 0 ) { - intrlv_2x128( vhash, hash0, hash1, 640 ); - luffa_2way_update_close( &ctx.luffa, vhash, vhash + (16<<1), 16 ); - dintrlv_2x128_512( hash0, hash1, vhash ); - intrlv_2x128( vhash, hash2, hash3, 640 ); - memcpy( &ctx, &x16r_ctx, sizeof(ctx) ); - luffa_2way_update_close( &ctx.luffa, vhash, vhash + (16<<1), 16 ); - dintrlv_2x128_512( hash2, hash3, vhash ); + intrlv_2x128( vhash, hash0, hash1, 640 ); + luffa_2way_update_close( &ctx.luffa, vhash, vhash + (16<<1), 16 ); + dintrlv_2x128_512( hash0, hash1, vhash ); + intrlv_2x128( vhash, hash2, hash3, 640 ); + memcpy( &ctx, &x16r_ctx, sizeof(ctx) ); + luffa_2way_update_close( &ctx.luffa, vhash, vhash + (16<<1), 16 ); + dintrlv_2x128_512( hash2, hash3, vhash ); } else { @@ -701,32 +670,24 @@ int x16r_4way_hash_generic( void* output, const void* input, int thrid ) case CUBEHASH: if ( i == 0 ) { - cubehashUpdateDigest( &ctx.cube, (byte*)hash0, - (const byte*)in0 + 64, 16 ); - memcpy( &ctx, &x16r_ctx, sizeof(ctx) ); - cubehashUpdateDigest( &ctx.cube, (byte*) hash1, - (const byte*)in1 + 64, 16 ); - memcpy( &ctx, &x16r_ctx, sizeof(ctx) ); - cubehashUpdateDigest( &ctx.cube, (byte*) hash2, - (const byte*)in2 + 64, 16 ); + intrlv_2x128( vhash, in0, in1, size<<3 ); + cube_2way_update_close( &ctx.cube, vhash, + vhash + (16<<1), 16 ); + dintrlv_2x128_512( hash0, hash1, vhash ); memcpy( &ctx, &x16r_ctx, sizeof(ctx) ); - cubehashUpdateDigest( &ctx.cube, (byte*) hash3, - (const byte*)in3 + 64, 16 ); + intrlv_2x128( vhash, in2, in3, size<<3 ); + cube_2way_update_close( &ctx.cube, vhash, + vhash + (16<<1), 16 ); + dintrlv_2x128_512( hash2, hash3, vhash ); } else { - cubehashInit( &ctx.cube, 512, 16, 32 ); - cubehashUpdateDigest( &ctx.cube, (byte*) hash0, - (const byte*)in0, size ); - cubehashInit( &ctx.cube, 512, 16, 32 ); - cubehashUpdateDigest( &ctx.cube, (byte*) hash1, - (const byte*)in1, size ); - cubehashInit( &ctx.cube, 
512, 16, 32 ); - cubehashUpdateDigest( &ctx.cube, (byte*) hash2, - (const byte*)in2, size ); - cubehashInit( &ctx.cube, 512, 16, 32 ); - cubehashUpdateDigest( &ctx.cube, (byte*) hash3, - (const byte*)in3, size ); + intrlv_2x128( vhash, in0, in1, size<<3 ); + cube_2way_full( &ctx.cube, vhash, 512, vhash, size ); + dintrlv_2x128_512( hash0, hash1, vhash ); + intrlv_2x128( vhash, in2, in3, size<<3 ); + cube_2way_full( &ctx.cube, vhash, 512, vhash, size ); + dintrlv_2x128_512( hash2, hash3, vhash ); } break; case SHAVITE: diff --git a/algo/x16/x16r-gate.c b/algo/x16/x16r-gate.c index 09315f6a..3a94344b 100644 --- a/algo/x16/x16r-gate.c +++ b/algo/x16/x16r-gate.c @@ -1,4 +1,5 @@ #include "x16r-gate.h" +#include "algo/sha/sha256d.h" __thread char x16r_hash_order[ X16R_HASH_FUNC_COUNT + 1 ] = { 0 }; diff --git a/algo/x16/x16r-gate.h b/algo/x16/x16r-gate.h index 748b7fa3..76ca5e7e 100644 --- a/algo/x16/x16r-gate.h +++ b/algo/x16/x16r-gate.h @@ -37,6 +37,7 @@ #include "algo/jh/jh-hash-4way.h" #include "algo/keccak/keccak-hash-4way.h" #include "algo/luffa/luffa-hash-2way.h" +#include "algo/cubehash/cube-hash-2way.h" #include "algo/simd/simd-hash-2way.h" #include "algo/echo/aes_ni/hash_api.h" #include "algo/hamsi/hamsi-hash-4way.h" @@ -115,7 +116,7 @@ union _x16r_8way_context_overlay jh512_8way_context jh; keccak512_8way_context keccak; luffa_4way_context luffa; - cubehashParam cube; + cube_4way_context cube; simd_4way_context simd; hamsi512_8way_context hamsi; hashState_fugue fugue; @@ -164,8 +165,8 @@ union _x16r_4way_context_overlay jh512_4way_context jh; keccak512_4way_context keccak; luffa_2way_context luffa; + cube_2way_context cube; hashState_luffa luffa1; - cubehashParam cube; simd_2way_context simd; hamsi512_4way_context hamsi; hashState_fugue fugue; diff --git a/algo/x16/x21s-4way.c b/algo/x16/x21s-4way.c index de2dbe68..2f27116f 100644 --- a/algo/x16/x21s-4way.c +++ b/algo/x16/x21s-4way.c @@ -13,7 +13,7 @@ #include "algo/gost/sph_gost.h" #include "algo/lyra2/lyra2.h" #if defined(__SHA__) - #include "algo/sha/sph_sha2.h" + #include "algo/sha/sha256-hash.h" #endif #if defined (X21S_8WAY) @@ -208,9 +208,7 @@ union _x21s_4way_context_overlay haval256_5_4way_context haval; sph_tiger_context tiger; sph_gost512_context gost; -#if defined(__SHA__) - sph_sha256_context sha256; -#else +#if !defined(__SHA__) sha256_4way_context sha256; #endif } __attribute__ ((aligned (64))); @@ -275,18 +273,10 @@ int x21s_4way_hash( void* output, const void* input, int thrid ) #if defined(__SHA__) - sph_sha256_init( &ctx.sha256 ); - sph_sha256( &ctx.sha256, hash0, 64 ); - sph_sha256_close( &ctx.sha256, output ); - sph_sha256_init( &ctx.sha256 ); - sph_sha256( &ctx.sha256, hash1, 64 ); - sph_sha256_close( &ctx.sha256, output+32 ); - sph_sha256_init( &ctx.sha256 ); - sph_sha256( &ctx.sha256, hash2, 64 ); - sph_sha256_close( &ctx.sha256, output+64 ); - sph_sha256_init( &ctx.sha256 ); - sph_sha256( &ctx.sha256, hash3, 64 ); - sph_sha256_close( &ctx.sha256, output+96 ); + sha256_full( output, hash0, 64 ); + sha256_full( output+32, hash1, 64 ); + sha256_full( output+64, hash2, 64 ); + sha256_full( output+96, hash3, 64 ); #else diff --git a/algo/x16/x21s.c b/algo/x16/x21s.c index b81c07ec..96782e22 100644 --- a/algo/x16/x21s.c +++ b/algo/x16/x21s.c @@ -8,7 +8,7 @@ #include #include #include -#include "algo/sha/sph_sha2.h" +#include "algo/sha/sha256-hash.h" #include "algo/haval/sph-haval.h" #include "algo/tiger/sph_tiger.h" #include "algo/gost/sph_gost.h" @@ -23,7 +23,7 @@ union _x21s_context_overlay sph_haval256_5_context 
haval; sph_tiger_context tiger; sph_gost512_context gost; - sph_sha256_context sha256; + sha256_context sha256; }; typedef union _x21s_context_overlay x21s_context_overlay; @@ -50,9 +50,7 @@ int x21s_hash( void* output, const void* input, int thrid ) sph_gost512 ( &ctx.gost, (const void*) hash, 64 ); sph_gost512_close( &ctx.gost, (void*) hash ); - sph_sha256_init( &ctx.sha256 ); - sph_sha256( &ctx.sha256, hash, 64 ); - sph_sha256_close( &ctx.sha256, hash ); + sha256_full( hash, hash, 64 ); memcpy( output, hash, 32 ); diff --git a/algo/x17/x17-4way.c b/algo/x17/x17-4way.c index fcff0b6e..1902a2de 100644 --- a/algo/x17/x17-4way.c +++ b/algo/x17/x17-4way.c @@ -37,7 +37,8 @@ union _x17_8way_context_overlay jh512_8way_context jh; keccak512_8way_context keccak; luffa_4way_context luffa; - cube_4way_context cube; +// cube_4way_context cube; + cube_4way_2buf_context cube; #if defined(__VAES__) groestl512_4way_context groestl; shavite512_4way_context shavite; @@ -119,8 +120,10 @@ int x17_8way_hash( void *state, const void *input, int thr_id ) luffa512_4way_full( &ctx.luffa, vhashA, vhashA, 64 ); luffa512_4way_full( &ctx.luffa, vhashB, vhashB, 64 ); - cube_4way_full( &ctx.cube, vhashA, 512, vhashA, 64 ); - cube_4way_full( &ctx.cube, vhashB, 512, vhashB, 64 ); + cube_4way_2buf_full( &ctx.cube, vhashA, vhashB, 512, vhashA, vhashB, 64 ); + +// cube_4way_full( &ctx.cube, vhashA, 512, vhashA, 64 ); +// cube_4way_full( &ctx.cube, vhashB, 512, vhashB, 64 ); #if defined(__VAES__) diff --git a/algo/x22/x22i-4way.c b/algo/x22/x22i-4way.c index 94b34cc5..5acf3de5 100644 --- a/algo/x22/x22i-4way.c +++ b/algo/x22/x22i-4way.c @@ -28,7 +28,7 @@ #include "algo/echo/echo-hash-4way.h" #endif #if defined(__SHA__) - #include "algo/sha/sph_sha2.h" + #include "algo/sha/sha256-hash.h" #endif #if defined(X22I_8WAY) @@ -51,9 +51,7 @@ union _x22i_8way_ctx_overlay haval256_5_8way_context haval; sph_tiger_context tiger; sph_gost512_context gost; -#if defined(X22I_8WAY_SHA) - sph_sha256_context sha256; -#else +#if !defined(X22I_8WAY_SHA) sha256_8way_context sha256; #endif #if defined(__VAES__) @@ -391,30 +389,14 @@ int x22i_8way_hash( void *output, const void *input, int thrid ) #if defined(X22I_8WAY_SHA) - sph_sha256_init( &ctx.sha256 ); - sph_sha256( &ctx.sha256, hash0, 64 ); - sph_sha256_close( &ctx.sha256, output ); - sph_sha256_init( &ctx.sha256 ); - sph_sha256( &ctx.sha256, hash1, 64 ); - sph_sha256_close( &ctx.sha256, output+32 ); - sph_sha256_init( &ctx.sha256 ); - sph_sha256( &ctx.sha256, hash2, 64 ); - sph_sha256_close( &ctx.sha256, output+64 ); - sph_sha256_init( &ctx.sha256 ); - sph_sha256( &ctx.sha256, hash3, 64 ); - sph_sha256_close( &ctx.sha256, output+96 ); - sph_sha256_init( &ctx.sha256 ); - sph_sha256( &ctx.sha256, hash4, 64 ); - sph_sha256_close( &ctx.sha256, output+128 ); - sph_sha256_init( &ctx.sha256 ); - sph_sha256( &ctx.sha256, hash5, 64 ); - sph_sha256_close( &ctx.sha256, output+160 ); - sph_sha256_init( &ctx.sha256 ); - sph_sha256( &ctx.sha256, hash6, 64 ); - sph_sha256_close( &ctx.sha256, output+192 ); - sph_sha256_init( &ctx.sha256 ); - sph_sha256( &ctx.sha256, hash7, 64 ); - sph_sha256_close( &ctx.sha256, output+224 ); + sha256_full( hash0, hash0, 64 ); + sha256_full( hash1, hash1, 64 ); + sha256_full( hash2, hash2, 64 ); + sha256_full( hash3, hash3, 64 ); + sha256_full( hash4, hash4, 64 ); + sha256_full( hash5, hash5, 64 ); + sha256_full( hash6, hash6, 64 ); + sha256_full( hash7, hash7, 64 ); #else @@ -551,9 +533,7 @@ union _x22i_4way_ctx_overlay haval256_5_4way_context haval; sph_tiger_context 
tiger; sph_gost512_context gost; -#if defined(X22I_4WAY_SHA) - sph_sha256_context sha256; -#else +#if !defined(X22I_4WAY_SHA) sha256_4way_context sha256; #endif }; @@ -757,18 +737,10 @@ int x22i_4way_hash( void *output, const void *input, int thrid ) #if defined(X22I_4WAY_SHA) - sph_sha256_init( &ctx.sha256 ); - sph_sha256( &ctx.sha256, hash0, 64 ); - sph_sha256_close( &ctx.sha256, output ); - sph_sha256_init( &ctx.sha256 ); - sph_sha256( &ctx.sha256, hash1, 64 ); - sph_sha256_close( &ctx.sha256, output+32 ); - sph_sha256_init( &ctx.sha256 ); - sph_sha256( &ctx.sha256, hash2, 64 ); - sph_sha256_close( &ctx.sha256, output+64 ); - sph_sha256_init( &ctx.sha256 ); - sph_sha256( &ctx.sha256, hash3, 64 ); - sph_sha256_close( &ctx.sha256, output+96 ); + sha256_full( hash0, hash0, 64 ); + sha256_full( hash1, hash1, 64 ); + sha256_full( hash2, hash2, 64 ); + sha256_full( hash3, hash3, 64 ); #else diff --git a/algo/x22/x22i.c b/algo/x22/x22i.c index 759e44c4..d63ddf24 100644 --- a/algo/x22/x22i.c +++ b/algo/x22/x22i.c @@ -24,6 +24,7 @@ #include "algo/shabal/sph_shabal.h" #include "algo/whirlpool/sph_whirlpool.h" #include "algo/sha/sph_sha2.h" +#include "algo/sha/sha256-hash.h" #include "algo/haval/sph-haval.h" #include "algo/tiger/sph_tiger.h" #include "algo/lyra2/lyra2.h" @@ -57,7 +58,6 @@ union _x22i_context_overlay sph_haval256_5_context haval; sph_tiger_context tiger; sph_gost512_context gost; - sph_sha256_context sha256; }; typedef union _x22i_context_overlay x22i_context_overlay; @@ -172,9 +172,7 @@ int x22i_hash( void *output, const void *input, int thrid ) sph_gost512 (&ctx.gost, (const void*) hash, 64); sph_gost512_close(&ctx.gost, (void*) hash); - sph_sha256_init( &ctx.sha256 ); - sph_sha256( &ctx.sha256, hash, 64 ); - sph_sha256_close( &ctx.sha256, hash ); + sha256_full( hash, hash, 64 ); memcpy(output, hash, 32); diff --git a/algo/x22/x25x-4way.c b/algo/x22/x25x-4way.c index 86f56997..ff2888ec 100644 --- a/algo/x22/x25x-4way.c +++ b/algo/x22/x25x-4way.c @@ -33,7 +33,7 @@ #include "algo/echo/echo-hash-4way.h" #endif #if defined(__SHA__) - #include "algo/sha/sph_sha2.h" + #include "algo/sha/sha256-hash.h" #endif void x25x_shuffle( void *hash ) @@ -84,7 +84,7 @@ union _x25x_8way_ctx_overlay sph_tiger_context tiger; sph_gost512_context gost; #if defined(X25X_8WAY_SHA) - sph_sha256_context sha256; + sha256_context sha256; #else sha256_8way_context sha256; #endif @@ -447,31 +447,15 @@ int x25x_8way_hash( void *output, const void *input, int thrid ) #if defined(X25X_8WAY_SHA) - sph_sha256_init( &ctx.sha256 ); - sph_sha256( &ctx.sha256, hash0[20], 64 ); - sph_sha256_close( &ctx.sha256, hash0[21] ); - sph_sha256_init( &ctx.sha256 ); - sph_sha256( &ctx.sha256, hash1[20], 64 ); - sph_sha256_close( &ctx.sha256, hash1[21] ); - sph_sha256_init( &ctx.sha256 ); - sph_sha256( &ctx.sha256, hash2[20], 64 ); - sph_sha256_close( &ctx.sha256, hash2[21] ); - sph_sha256_init( &ctx.sha256 ); - sph_sha256( &ctx.sha256, hash3[20], 64 ); - sph_sha256_close( &ctx.sha256, hash3[21] ); - sph_sha256_init( &ctx.sha256 ); - sph_sha256( &ctx.sha256, hash4[20], 64 ); - sph_sha256_close( &ctx.sha256, hash4[21] ); - sph_sha256_init( &ctx.sha256 ); - sph_sha256( &ctx.sha256, hash5[20], 64 ); - sph_sha256_close( &ctx.sha256, hash5[21] ); - sph_sha256_init( &ctx.sha256 ); - sph_sha256( &ctx.sha256, hash6[20], 64 ); - sph_sha256_close( &ctx.sha256, hash6[21] ); - sph_sha256_init( &ctx.sha256 ); - sph_sha256( &ctx.sha256, hash7[20], 64 ); - sph_sha256_close( &ctx.sha256, hash7[21] ); - + sha256_full( hash0[21], hash0[20], 64 ); + 
sha256_full( hash1[21], hash1[20], 64 ); + sha256_full( hash2[21], hash2[20], 64 ); + sha256_full( hash3[21], hash3[20], 64 ); + sha256_full( hash4[21], hash4[20], 64 ); + sha256_full( hash5[21], hash5[20], 64 ); + sha256_full( hash6[21], hash6[20], 64 ); + sha256_full( hash7[21], hash7[20], 64 ); + intrlv_8x32_512( vhash, hash0[21], hash1[21], hash2[21], hash3[21], hash4[21], hash5[21], hash6[21], hash7[21] ); @@ -646,7 +630,7 @@ union _x25x_4way_ctx_overlay sph_tiger_context tiger; sph_gost512_context gost; #if defined(X25X_4WAY_SHA) - sph_sha256_context sha256; + sha256_context sha256; #else sha256_4way_context sha256; #endif @@ -848,18 +832,10 @@ int x25x_4way_hash( void *output, const void *input, int thrid ) #if defined(X25X_4WAY_SHA) - sph_sha256_init( &ctx.sha256 ); - sph_sha256( &ctx.sha256, hash0[20], 64 ); - sph_sha256_close( &ctx.sha256, hash0[21] ); - sph_sha256_init( &ctx.sha256 ); - sph_sha256( &ctx.sha256, hash1[20], 64 ); - sph_sha256_close( &ctx.sha256, hash1[21] ); - sph_sha256_init( &ctx.sha256 ); - sph_sha256( &ctx.sha256, hash2[20], 64 ); - sph_sha256_close( &ctx.sha256, hash2[21] ); - sph_sha256_init( &ctx.sha256 ); - sph_sha256( &ctx.sha256, hash3[20], 64 ); - sph_sha256_close( &ctx.sha256, hash3[21] ); + sha256_full( hash0[21], hash0[20], 64 ); + sha256_full( hash1[21], hash1[20], 64 ); + sha256_full( hash2[21], hash2[20], 64 ); + sha256_full( hash3[21], hash3[20], 64 ); intrlv_4x32_512( vhash, hash0[21], hash1[21], hash2[21], hash3[21] ); diff --git a/algo/x22/x25x.c b/algo/x22/x25x.c index 42e7eda0..aade6e2b 100644 --- a/algo/x22/x25x.c +++ b/algo/x22/x25x.c @@ -23,7 +23,7 @@ #include "algo/hamsi/sph_hamsi.h" #include "algo/shabal/sph_shabal.h" #include "algo/whirlpool/sph_whirlpool.h" -#include "algo/sha/sph_sha2.h" +#include "algo/sha/sha256-hash.h" #include "algo/haval/sph-haval.h" #include "algo/tiger/sph_tiger.h" #include "algo/lyra2/lyra2.h" @@ -60,7 +60,7 @@ union _x25x_context_overlay sph_haval256_5_context haval; sph_tiger_context tiger; sph_gost512_context gost; - sph_sha256_context sha256; + sha256_context sha256; sph_panama_context panama; blake2s_state blake2s; }; @@ -174,9 +174,7 @@ int x25x_hash( void *output, const void *input, int thrid ) sph_gost512 (&ctx.gost, (const void*) &hash[19], 64); sph_gost512_close(&ctx.gost, (void*) &hash[20]); - sph_sha256_init( &ctx.sha256 ); - sph_sha256( &ctx.sha256, &hash[20], 64 ); - sph_sha256_close( &ctx.sha256, &hash[21] ); + sha256_full( &hash[21], &hash[20], 64 ); sph_panama_init(&ctx.panama); sph_panama (&ctx.panama, (const void*) &hash[21], 64 ); diff --git a/algo/yespower/crypto/blake2b-yp.c b/algo/yespower/crypto/blake2b-yp.c index 407d2dde..dc6eee6a 100644 --- a/algo/yespower/crypto/blake2b-yp.c +++ b/algo/yespower/crypto/blake2b-yp.c @@ -35,9 +35,11 @@ #include "blake2b-yp.h" // Cyclic right rotation. -#ifndef ROTR64 -#define ROTR64(x, y) (((x) >> (y)) ^ ((x) << (64 - (y)))) -#endif +//#ifndef ROTR64 +//#define ROTR64(x, y) (((x) >> (y)) ^ ((x) << (64 - (y)))) +//#endif + +#define ROTR64(x, y) ror64( x, y ) // Little-endian byte access. 
#define B2B_GET64(p) \ diff --git a/algo/yespower/yescrypt-r8g.c b/algo/yespower/yescrypt-r8g.c index 27d1fd85..b278c36b 100644 --- a/algo/yespower/yescrypt-r8g.c +++ b/algo/yespower/yescrypt-r8g.c @@ -52,8 +52,8 @@ int scanhash_yespower_r8g( struct work *work, uint32_t max_nonce, endiandata[19] = n; // do sha256 prehash - sph_sha256_init( &sha256_prehash_ctx ); - sph_sha256( &sha256_prehash_ctx, endiandata, 64 ); + sha256_ctx_init( &sha256_prehash_ctx ); + sha256_update( &sha256_prehash_ctx, endiandata, 64 ); do { yespower_tls( (unsigned char *)endiandata, params.perslen, diff --git a/algo/yespower/yespower-gate.c b/algo/yespower/yespower-gate.c index 8c9a9447..89680371 100644 --- a/algo/yespower/yespower-gate.c +++ b/algo/yespower/yespower-gate.c @@ -27,14 +27,11 @@ * coin. */ #include "yespower.h" - #include "algo-gate-api.h" yespower_params_t yespower_params; -//SHA256_CTX sha256_prehash_ctx; -__thread sph_sha256_context sha256_prehash_ctx; -//__thread SHA256_CTX sha256_prehash_ctx; +__thread sha256_context sha256_prehash_ctx; // YESPOWER @@ -61,8 +58,8 @@ int scanhash_yespower( struct work *work, uint32_t max_nonce, endiandata[19] = n; // do sha256 prehash - sph_sha256_init( &sha256_prehash_ctx ); - sph_sha256( &sha256_prehash_ctx, endiandata, 64 ); + sha256_ctx_init( &sha256_prehash_ctx ); + sha256_update( &sha256_prehash_ctx, endiandata, 64 ); do { if ( yespower_hash( (char*)endiandata, (char*)vhash, 80, thr_id ) ) @@ -101,10 +98,6 @@ int scanhash_yespower_b2b( struct work *work, uint32_t max_nonce, be32enc( &endiandata[k], pdata[k] ); endiandata[19] = n; - // do sha256 prehash - sph_sha256_init( &sha256_prehash_ctx ); - sph_sha256( &sha256_prehash_ctx, endiandata, 64 ); - do { if (yespower_b2b_hash( (char*) endiandata, (char*) vhash, 80, thr_id ) ) if unlikely( valid_hash( vhash, ptarget ) && !opt_benchmark ) diff --git a/algo/yespower/yespower-opt.c b/algo/yespower/yespower-opt.c index fd16c241..5e725af7 100644 --- a/algo/yespower/yespower-opt.c +++ b/algo/yespower/yespower-opt.c @@ -203,17 +203,17 @@ static inline void salsa20_simd_unshuffle(const salsa20_blk_t *Bin, ARX(X0, X3, X2, 18) \ /* Rearrange data */ \ X1 = _mm_shuffle_epi32(X1, 0x93); \ + X3 = _mm_shuffle_epi32(X3, 0x39); \ X2 = _mm_shuffle_epi32(X2, 0x4E); \ - X3 = _mm_shuffle_epi32(X3, 0x39); \ /* Operate on "rows" */ \ ARX(X3, X0, X1, 7) \ ARX(X2, X3, X0, 9) \ ARX(X1, X2, X3, 13) \ ARX(X0, X1, X2, 18) \ /* Rearrange data */ \ + X3 = _mm_shuffle_epi32(X3, 0x93); \ X1 = _mm_shuffle_epi32(X1, 0x39); \ - X2 = _mm_shuffle_epi32(X2, 0x4E); \ - X3 = _mm_shuffle_epi32(X3, 0x93); + X2 = _mm_shuffle_epi32(X2, 0x4E); /** * Apply the Salsa20 core to the block provided in (X0 ... X3). 
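In algo/yespower/crypto/blake2b-yp.c above, the file-local ROTR64 macro is redirected to the shared ror64 helper from simd-utils. Its exact definition is not part of this patch; a functionally equivalent reference form is sketched below (the removed macro combined the two shifted halves with ^ rather than |, which is identical here because the halves have no overlapping bits):

#include <stdint.h>

// Reference form only; the real ror64 may compile down to a rotate instruction.
#define ror64( x, c )  ( ( (uint64_t)(x) >> (c) ) | ( (uint64_t)(x) << ( 64 - (c) ) ) )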
@@ -1095,7 +1095,7 @@ int yespower(yespower_local_t *local, salsa20_blk_t *V, *XY; pwxform_ctx_t ctx; uint8_t sha256[32]; - sph_sha256_context sha256_ctx; + sha256_context sha256_ctx; /* Sanity-check parameters */ if ( (version != YESPOWER_0_5 && version != YESPOWER_1_0) @@ -1138,10 +1138,9 @@ int yespower(yespower_local_t *local, // copy prehash, do tail memcpy( &sha256_ctx, &sha256_prehash_ctx, sizeof sha256_ctx ); - - sph_sha256( &sha256_ctx, src+64, srclen-64 ); - sph_sha256_close( &sha256_ctx, sha256 ); - + sha256_update( &sha256_ctx, src+64, srclen-64 ); + sha256_final( &sha256_ctx, sha256 ); + if ( version == YESPOWER_0_5 ) { PBKDF2_SHA256( sha256, sizeof(sha256), src, srclen, 1, B, B_size ); @@ -1186,7 +1185,9 @@ int yespower(yespower_local_t *local, if ( work_restart[thrid].restart ) return 0; smix_1_0( B, r, N, V, XY, &ctx ); - + + if ( work_restart[thrid].restart ) return 0; + HMAC_SHA256_Buf( B + B_size - 64, 64, sha256, sizeof(sha256), (uint8_t *)dst ); } diff --git a/algo/yespower/yespower.h b/algo/yespower/yespower.h index 260322a7..aa190049 100644 --- a/algo/yespower/yespower.h +++ b/algo/yespower/yespower.h @@ -34,7 +34,7 @@ #include /* for size_t */ #include "miner.h" #include "simd-utils.h" -#include "algo/sha/sph_sha2.h" +#include "algo/sha/sha256-hash.h" #ifdef __cplusplus extern "C" { @@ -78,9 +78,7 @@ typedef struct { extern yespower_params_t yespower_params; -//SHA256_CTX sha256_prehash_ctx; -extern __thread sph_sha256_context sha256_prehash_ctx; -//extern __thread SHA256_CTX sha256_prehash_ctx; +extern __thread sha256_context sha256_prehash_ctx; /** * yespower_init_local(local): diff --git a/build-allarch.sh b/build-allarch.sh index c4d9ffd4..5fa38f6c 100755 --- a/build-allarch.sh +++ b/build-allarch.sh @@ -4,7 +4,7 @@ # during develpment. However the information contained may provide compilation # tips to users. -rm cpuminer-avx512-sha-vaes cpuminer-avx512-sha cpuminer-avx512 cpuminer-avx2 cpuminer-aes-avx cpuminer-aes-sse42 cpuminer-sse42 cpuminer-ssse3 cpuminer-sse2 cpuminer-zen cpuminer-zen3 > /dev/null +rm cpuminer-avx512-sha-vaes cpuminer-avx512-sha cpuminer-avx512 cpuminer-avx2 cpuminer-avx cpuminer-aes-sse42 cpuminer-sse42 cpuminer-ssse3 cpuminer-sse2 cpuminer-zen cpuminer-zen3 > /dev/null # Icelake AVX512 SHA VAES make distclean || echo clean diff --git a/configure b/configure index 7430186f..db3efc9f 100755 --- a/configure +++ b/configure @@ -1,6 +1,6 @@ #! /bin/sh # Guess values for system-dependent variables and create Makefiles. -# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.17.1. +# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.18.0. # # # Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc. @@ -577,8 +577,8 @@ MAKEFLAGS= # Identity of this package. PACKAGE_NAME='cpuminer-opt' PACKAGE_TARNAME='cpuminer-opt' -PACKAGE_VERSION='3.17.1' -PACKAGE_STRING='cpuminer-opt 3.17.1' +PACKAGE_VERSION='3.18.0' +PACKAGE_STRING='cpuminer-opt 3.18.0' PACKAGE_BUGREPORT='' PACKAGE_URL='' @@ -1332,7 +1332,7 @@ if test "$ac_init_help" = "long"; then # Omit some internal or obsolete options to make the list less imposing. # This message is too long to be a string in the A/UX 3.1 sh. cat <<_ACEOF -\`configure' configures cpuminer-opt 3.17.1 to adapt to many kinds of systems. +\`configure' configures cpuminer-opt 3.18.0 to adapt to many kinds of systems. Usage: $0 [OPTION]... [VAR=VALUE]... 
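The yespower hunks above all serve the same SHA-256 prehash optimization: the first 64 bytes of the 80-byte block header are constant for a given job, so the scanhash functions absorb them once into the thread-local sha256_prehash_ctx and yespower() only finishes the nonce-bearing tail for each attempt. Condensed, using only the calls that appear in the hunks:

/* Once per job, in scanhash_yespower() / scanhash_yespower_r8g(): */
sha256_ctx_init( &sha256_prehash_ctx );
sha256_update( &sha256_prehash_ctx, endiandata, 64 );      /* constant header prefix */

/* Once per nonce, inside yespower(): */
sha256_context sha256_ctx;
memcpy( &sha256_ctx, &sha256_prehash_ctx, sizeof sha256_ctx );
sha256_update( &sha256_ctx, src + 64, srclen - 64 );       /* last 16 bytes incl. nonce */
sha256_final( &sha256_ctx, sha256 );                       /* 32-byte seed for PBKDF2 */

scanhash_yespower_b2b drops the prehash calls entirely, presumably because the blake2b variant never consumes the SHA-256 prehash context.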
@@ -1404,7 +1404,7 @@ fi if test -n "$ac_init_help"; then case $ac_init_help in - short | recursive ) echo "Configuration of cpuminer-opt 3.17.1:";; + short | recursive ) echo "Configuration of cpuminer-opt 3.18.0:";; esac cat <<\_ACEOF @@ -1509,7 +1509,7 @@ fi test -n "$ac_init_help" && exit $ac_status if $ac_init_version; then cat <<\_ACEOF -cpuminer-opt configure 3.17.1 +cpuminer-opt configure 3.18.0 generated by GNU Autoconf 2.69 Copyright (C) 2012 Free Software Foundation, Inc. @@ -2012,7 +2012,7 @@ cat >config.log <<_ACEOF This file contains any messages produced by compilers while running configure, to aid debugging if configure makes a mistake. -It was created by cpuminer-opt $as_me 3.17.1, which was +It was created by cpuminer-opt $as_me 3.18.0, which was generated by GNU Autoconf 2.69. Invocation command line was $ $0 $@ @@ -2993,7 +2993,7 @@ fi # Define the identity of the package. PACKAGE='cpuminer-opt' - VERSION='3.17.1' + VERSION='3.18.0' cat >>confdefs.h <<_ACEOF @@ -6690,7 +6690,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 # report actual input values of CONFIG_FILES etc. instead of their # values after options handling. ac_log=" -This file was extended by cpuminer-opt $as_me 3.17.1, which was +This file was extended by cpuminer-opt $as_me 3.18.0, which was generated by GNU Autoconf 2.69. Invocation command line was CONFIG_FILES = $CONFIG_FILES @@ -6756,7 +6756,7 @@ _ACEOF cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`" ac_cs_version="\\ -cpuminer-opt config.status 3.17.1 +cpuminer-opt config.status 3.18.0 configured by $0, generated by GNU Autoconf 2.69, with options \\"\$ac_cs_config\\" diff --git a/configure.ac b/configure.ac index 332d1e68..fbe5a9b0 100644 --- a/configure.ac +++ b/configure.ac @@ -1,4 +1,4 @@ -AC_INIT([cpuminer-opt], [3.17.1]) +AC_INIT([cpuminer-opt], [3.18.0]) AC_PREREQ([2.59c]) AC_CANONICAL_SYSTEM diff --git a/cpu-miner.c b/cpu-miner.c index e9c01fe6..c8895381 100644 --- a/cpu-miner.c +++ b/cpu-miner.c @@ -38,6 +38,7 @@ #include #include #include "sysinfos.c" +#include "algo/sha/sha256d.h" #ifdef WIN32 #include @@ -94,6 +95,7 @@ bool have_gbt = true; bool allow_getwork = true; bool want_stratum = true; // pretty useless bool have_stratum = false; +bool stratum_down = true; bool allow_mininginfo = true; bool use_syslog = false; bool use_colors = true; @@ -166,6 +168,8 @@ uint32_t stale_share_count = 0; uint32_t solved_block_count = 0; double *thr_hashrates; double global_hashrate = 0.; +double total_hashes = 0.; +struct timeval total_hashes_time = {0,0}; double stratum_diff = 0.; double net_diff = 0.; double net_hashrate = 0.; @@ -1001,6 +1005,7 @@ struct share_stats_t double share_diff; double stratum_diff; double target_diff; + uint32_t height; char job_id[32]; }; @@ -1080,13 +1085,14 @@ void report_summary_log( bool force ) pthread_mutex_unlock( &stats_lock ); timeval_subtract( &et, &now, &start_time ); - timeval_subtract( &uptime, &now, &session_start ); + timeval_subtract( &uptime, &total_hashes_time, &session_start ); double share_time = (double)et.tv_sec + (double)et.tv_usec / 1e6; - double ghrate = global_hashrate; + double ghrate = safe_div( total_hashes, (double)uptime.tv_sec, 0. ); double target_diff = exp32 * last_targetdiff; double shrate = safe_div( target_diff * (double)(accepts), share_time, 0. ); +// global_hashrate = ghrate; double sess_hrate = safe_div( exp32 * norm_diff_sum, (double)uptime.tv_sec, 0. 
); double submit_rate = safe_div( (double)submits * 60., share_time, 0. ); @@ -1134,29 +1140,38 @@ void report_summary_log( bool force ) 100. * safe_div( (double)accepted_share_count, (double)submitted_share_count, 0. ) ); if ( stale_share_count ) - applog2( LOG_INFO, "Stale %7d %7d %5.1f%%", + { + int prio = stales ? LOG_MINR : LOG_INFO; + applog2( prio, "Stale %7d %7d %5.1f%%", stales, stale_share_count, 100. * safe_div( (double)stale_share_count, (double)submitted_share_count, 0. ) ); + } if ( rejected_share_count ) - applog2( LOG_INFO, "Rejected %7d %7d %5.1f%%", + { + int prio = rejects ? LOG_ERR : LOG_INFO; + applog2( prio, "Rejected %7d %7d %5.1f%%", rejects, rejected_share_count, 100. * safe_div( (double)rejected_share_count, (double)submitted_share_count, 0. ) ); + } if ( solved_block_count ) - applog2( LOG_INFO,"Blocks Solved %7d %7d", + { + int prio = solved ? LOG_PINK : LOG_INFO; + applog2( prio, "Blocks Solved %7d %7d", solved, solved_block_count ); + } applog2( LOG_INFO, "Hi/Lo Share Diff %.5g / %.5g", - highest_share, lowest_share ); + highest_share, lowest_share ); int mismatch = submitted_share_count - ( accepted_share_count + stale_share_count + rejected_share_count ); if ( mismatch ) { if ( mismatch != 1 ) - applog(LOG_WARNING,"Share count mismatch: %d, stats may be inaccurate", mismatch ); + applog2(LOG_MINR, "Count mismatch: %d, stats may be inaccurate", mismatch ); else - applog(LOG_INFO,"Share count mismatch, submitted share may still be pending" ); + applog2(LOG_INFO, CL_LBL "Count mismatch, submitted share may still be pending" CL_N ); } } @@ -1278,17 +1293,17 @@ static int share_result( int result, struct work *work, if ( use_colors ) { - bcol = acol = scol = rcol = CL_WHT; + bcol = acol = scol = rcol = CL_N; if ( likely( result ) ) { - acol = CL_WHT CL_GRN; - if ( unlikely( solved ) ) bcol = CL_WHT CL_MAG; + acol = CL_LGR; + if ( unlikely( solved ) ) bcol = CL_LMA; } - else if ( stale ) scol = CL_WHT CL_YL2; - else rcol = CL_WHT CL_RED; + else if ( stale ) scol = CL_YL2; + else rcol = CL_LRD; } - applog( LOG_NOTICE, "%d %s%s %s%s %s%s %s%s" CL_WHT ", %.3f sec (%dms)", + applog( LOG_INFO, "%d %s%s %s%s %s%s %s%s" CL_WHT ", %.3f sec (%dms)", my_stats.share_count, acol, ares, scol, sres, rcol, rres, bcol, bres, share_time, latency ); @@ -1296,8 +1311,7 @@ static int share_result( int result, struct work *work, { if ( have_stratum ) applog2( LOG_INFO, "Diff %.5g, Block %d, Job %s", - my_stats.share_diff, stratum.block_height, - my_stats.job_id ); + my_stats.share_diff, my_stats.height, my_stats.job_id ); else applog2( LOG_INFO, "Diff %.5g, Block %d", my_stats.share_diff, work ? 
work->height : last_block_height ); @@ -1308,7 +1322,7 @@ static int share_result( int result, struct work *work, uint32_t str[8]; uint32_t *targ; - if ( reason ) applog( LOG_WARNING, "Reject reason: %s", reason ); + if ( reason ) applog( LOG_MINR, "Reject reason: %s", reason ); diff_to_hash( str, my_stats.share_diff ); applog2( LOG_INFO, "Hash: %08x%08x%08x%08x%08x%08x", str[7], str[6], @@ -1861,6 +1875,7 @@ static void update_submit_stats( struct work *work, const void *hash ) share_stats[ s_put_ptr ].net_diff = net_diff; share_stats[ s_put_ptr ].stratum_diff = stratum_diff; share_stats[ s_put_ptr ].target_diff = work->targetdiff; + share_stats[ s_put_ptr ].height = work->height; if ( have_stratum ) strncpy( share_stats[ s_put_ptr ].job_id, work->job_id, 30 ); s_put_ptr = stats_ptr_incr( s_put_ptr ); @@ -1871,6 +1886,10 @@ static void update_submit_stats( struct work *work, const void *hash ) bool submit_solution( struct work *work, const void *hash, struct thr_info *thr ) { + // Job went stale during hashing of a valid share. + if ( !opt_quiet && work_restart[ thr->id ].restart ) + applog( LOG_INFO, CL_LBL "Share may be stale, submitting anyway..." CL_N ); + work->sharediff = hash_to_diff( hash ); if ( likely( submit_work( thr, work ) ) ) { @@ -1887,11 +1906,11 @@ bool submit_solution( struct work *work, const void *hash, if ( !opt_quiet ) { if ( have_stratum ) - applog( LOG_NOTICE, "%d Submitted Diff %.5g, Block %d, Job %s", + applog( LOG_INFO, "%d Submitted Diff %.5g, Block %d, Job %s", submitted_share_count, work->sharediff, work->height, work->job_id ); else - applog( LOG_NOTICE, "%d Submitted Diff %.5g, Block %d, Ntime %08x", + applog( LOG_INFO, "%d Submitted Diff %.5g, Block %d, Ntime %08x", submitted_share_count, work->sharediff, work->height, work->data[ algo_gate.ntime_index ] ); } @@ -2048,7 +2067,7 @@ static void stratum_gen_work( struct stratum_ctx *sctx, struct work *g_work ) pthread_rwlock_wrlock( &g_work_lock ); pthread_mutex_lock( &sctx->work_lock ); - new_job = sctx->new_job; + new_job = sctx->new_job; // otherwise just increment extranonce2 sctx->new_job = false; free( g_work->job_id ); @@ -2084,6 +2103,14 @@ static void stratum_gen_work( struct stratum_ctx *sctx, struct work *g_work ) pthread_mutex_unlock( &stats_lock ); + if ( !opt_quiet ) + { + int mismatch = submitted_share_count + - ( accepted_share_count + stale_share_count + rejected_share_count ); + if ( mismatch ) + applog(LOG_INFO, CL_LBL "%d Submitted share pending, maybe stale" CL_N, submitted_share_count ); + } + if ( stratum_diff != sctx->job.diff ) applog( LOG_BLUE, "New Stratum Diff %g, Block %d, Job %s", sctx->job.diff, sctx->block_height, g_work->job_id ); @@ -2264,19 +2291,29 @@ static void *miner_thread( void *userdata ) } // wait for stratum to send first job - if ( have_stratum ) while ( unlikely( !g_work.job_id ) ) sleep(1); + if ( have_stratum ) while ( unlikely( stratum_down ) ) + { + if ( opt_debug ) + applog( LOG_INFO, "Thread %d waiting for first job", thr_id ); + sleep(1); + } + // nominal startng values + int64_t max64 = 20; + thr_hashrates[thr_id] = 20; while (1) { uint64_t hashes_done; struct timeval tv_start, tv_end, diff; - int64_t max64 = 1000; +// int64_t max64 = 1000; int nonce_found = 0; if ( likely( algo_gate.do_this_thread( thr_id ) ) ) { - if ( have_stratum ) + if ( have_stratum ) { + while ( unlikely( stratum_down ) ) + sleep( 1 ); if ( *nonceptr >= end_nonce ) stratum_gen_work( &stratum, &g_work ); } @@ -2383,6 +2420,8 @@ static void *miner_thread( void *userdata ) if ( 
diff.tv_usec || diff.tv_sec ) { pthread_mutex_lock( &stats_lock ); + total_hashes += hashes_done; + total_hashes_time = tv_end; thr_hashrates[thr_id] = hashes_done / ( diff.tv_sec + diff.tv_usec * 1e-6 ); pthread_mutex_unlock( &stats_lock ); @@ -2439,7 +2478,6 @@ static void *miner_thread( void *userdata ) && thr_id == opt_n_threads - 1 ) ) { double hashrate = 0.; - pthread_mutex_lock( &stats_lock ); for ( i = 0; i < opt_n_threads; i++ ) hashrate += thr_hashrates[i]; @@ -2448,8 +2486,12 @@ static void *miner_thread( void *userdata ) if ( opt_benchmark ) { + struct timeval uptime; char hr[16]; char hr_units[2] = {0,0}; + timeval_subtract( &uptime, &total_hashes_time, &session_start ); + double hashrate = safe_div( total_hashes, uptime.tv_sec, 0. ); + scale_hash_for_display( &hashrate, hr_units ); sprintf( hr, "%.2f", hashrate ); #if (defined(_WIN64) || defined(__WINDOWS__) || defined(_WIN32)) @@ -2745,6 +2787,7 @@ static void *stratum_thread(void *userdata ) if ( unlikely( stratum_need_reset ) ) { stratum_need_reset = false; + stratum_down = true; stratum_disconnect( &stratum ); if ( strcmp( stratum.url, rpc_url ) ) { @@ -2755,11 +2798,13 @@ static void *stratum_thread(void *userdata ) else applog(LOG_WARNING, "Stratum connection reset"); // reset stats queue as well + restart_threads(); if ( s_get_ptr != s_put_ptr ) s_get_ptr = s_put_ptr = 0; } while ( !stratum.curl ) { + stratum_down = true; pthread_rwlock_wrlock( &g_work_lock ); g_work_time = 0; pthread_rwlock_unlock( &g_work_lock ); @@ -2780,6 +2825,7 @@ static void *stratum_thread(void *userdata ) } else { + stratum_down = false; restart_threads(); applog(LOG_BLUE,"Stratum connection established" ); } @@ -2801,7 +2847,7 @@ static void *stratum_thread(void *userdata ) } else { - applog(LOG_WARNING, "Stratum connection interrupted"); +// applog(LOG_WARNING, "Stratum connection interrupted"); // stratum_disconnect( &stratum ); stratum_need_reset = true; } @@ -3629,6 +3675,10 @@ int main(int argc, char *argv[]) show_usage_and_exit(1); } + // need to register to get algo optimizations for cpu capabilities + // but that causes register logs before cpu capabilities is output. + // Would need to split register into 2 parts. First part sets algo + // optimizations but no logging, second part does any logging. 
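The comment above records why register_algo_gate() must run before check_cpu_capability() (the gate supplies the per-algo optimization flags the capability check reports against) and why that ordering puts the gate's log lines ahead of the capability report. The split it proposes could look roughly like this; both function names are hypothetical:

/* Hypothetical two-phase registration suggested by the comment above. */
if ( !register_algo_gate_quiet( opt_algo, &algo_gate ) )   /* set optimization flags, no logging */
   exit(1);
if ( !check_cpu_capability() )                             /* print CPU/SW capability report */
   exit(1);
register_algo_gate_report( opt_algo, &algo_gate );         /* deferred per-algo log messages */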
if ( !register_algo_gate( opt_algo, &algo_gate ) ) exit(1); if ( !check_cpu_capability() ) exit(1); @@ -3685,12 +3735,6 @@ int main(int argc, char *argv[]) } } - // Initialize stats times and counters - memset( share_stats, 0, s_stats_size * sizeof (struct share_stats_t) ); - gettimeofday( &last_submit_time, NULL ); - memcpy( &five_min_start, &last_submit_time, sizeof (struct timeval) ); - memcpy( &session_start, &last_submit_time, sizeof (struct timeval) ); - // if ( !check_cpu_capability() ) exit(1); pthread_mutex_init( &stats_lock, NULL ); @@ -3854,7 +3898,8 @@ int main(int argc, char *argv[]) return 1; } } - if ( have_stratum ) + + if ( have_stratum ) { if ( opt_debug ) applog(LOG_INFO,"Creating stratum thread"); @@ -3900,24 +3945,35 @@ int main(int argc, char *argv[]) opt_api_listen ); } + // hold the stats lock while starting miner threads + pthread_mutex_lock( &stats_lock ); + /* start mining threads */ - for (i = 0; i < opt_n_threads; i++) + for ( i = 0; i < opt_n_threads; i++ ) { - usleep( 5000 ); +// usleep( 5000 ); thr = &thr_info[i]; thr->id = i; thr->q = tq_new(); - if (!thr->q) + if ( !thr->q ) return 1; - err = thread_create(thr, miner_thread); - if (err) { - applog(LOG_ERR, "Miner thread %d create failed", i); + err = thread_create( thr, miner_thread ); + if ( err ) + { + applog( LOG_ERR, "Miner thread %d create failed", i ); return 1; } } - applog( LOG_INFO, "%d of %d miner threads started using '%s' algorithm", - opt_n_threads, num_cpus, algo_names[opt_algo] ); + // Initialize stats times and counters + memset( share_stats, 0, s_stats_size * sizeof (struct share_stats_t) ); + gettimeofday( &last_submit_time, NULL ); + memcpy( &five_min_start, &last_submit_time, sizeof (struct timeval) ); + memcpy( &session_start, &last_submit_time, sizeof (struct timeval) ); + pthread_mutex_unlock( &stats_lock ); + + applog( LOG_INFO, "%d of %d miner threads started using '%s' algorithm", + opt_n_threads, num_cpus, algo_names[opt_algo] ); /* main loop - simply wait for workio thread to exit */ pthread_join( thr_info[work_thr_id].pth, NULL ); diff --git a/miner.h b/miner.h index 9ca56b83..5592d4ac 100644 --- a/miner.h +++ b/miner.h @@ -70,17 +70,25 @@ void *alloca (size_t); #ifdef HAVE_SYSLOG_H #include -#define LOG_BLUE 0x10 /* unique value */ +#define LOG_BLUE 0x10 /* unique value */ +#define LOG_MAJR 0x11 /* unique value */ +#define LOG_MINR 0x12 /* unique value */ +#define LOG_GREEN 0x13 /* unique value */ +#define LOG_PINK 0x14 /* unique value */ #else enum { - LOG_ERR, + LOG_CRIT, + LOG_ERR, LOG_WARNING, LOG_NOTICE, LOG_INFO, LOG_DEBUG, - /* custom notices */ - LOG_BLUE = 0x10, -}; + /* custom notices */ + LOG_BLUE = 0x10, + LOG_MAJR = 0x11, + LOG_MINR = 0x12, + LOG_GREEN = 0x13, + LOG_PINK = 0x14 }; #endif extern bool is_power_of_2( int n ); @@ -216,7 +224,7 @@ json_t* json_load_url(char* cfg_url, json_error_t *err); void sha256_init(uint32_t *state); void sha256_transform(uint32_t *state, const uint32_t *block, int swap); -void sha256d(unsigned char *hash, const unsigned char *data, int len); +//void sha256d(unsigned char *hash, const unsigned char *data, int len); #ifdef USE_ASM #if defined(__ARM_NEON__) || defined(__i386__) || defined(__x86_64__) @@ -225,7 +233,8 @@ int sha256_use_4way(); void sha256_init_4way(uint32_t *state); void sha256_transform_4way(uint32_t *state, const uint32_t *block, int swap); #endif -#if defined(__x86_64__) && defined(USE_AVX2) +//#if defined(__x86_64__) && defined(USE_AVX2) +#if defined(__x86_64__) && defined(__AVX2__) #define HAVE_SHA256_8WAY 1 int 
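// A minimal sketch, not from the patch, of how the m128_ovly overlay union
// defined above can be used in place of a pointer cast or an extract
// intrinsic: store the vector once, then index any 32 bit lane directly.

static inline uint32_t m128_ovly_lane32_sketch( const __m128i v, const int n )
{
   m128_ovly o;
   o.m128 = v;              // write the whole vector once
   return o.u32[ n & 3 ];   // read any 32 bit lane with plain indexing
}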
sha256_use_8way(); void sha256_init_8way(uint32_t *state); @@ -271,9 +280,9 @@ struct thr_api { #define CL_N "\x1B[0m" #define CL_RED "\x1B[31m" #define CL_GRN "\x1B[32m" -#define CL_YLW "\x1B[33m" +#define CL_YLW "\x1B[33m" // dark yellow #define CL_BLU "\x1B[34m" -#define CL_MAG "\x1B[35m" +#define CL_MAG "\x1B[35m" // purple #define CL_CYN "\x1B[36m" #define CL_BLK "\x1B[22;30m" /* black */ @@ -281,7 +290,7 @@ struct thr_api { #define CL_GR2 "\x1B[22;32m" /* green */ #define CL_BRW "\x1B[22;33m" /* brown */ #define CL_BL2 "\x1B[22;34m" /* blue */ -#define CL_MA2 "\x1B[22;35m" /* magenta */ +#define CL_MA2 "\x1B[22;35m" /* purple */ #define CL_CY2 "\x1B[22;36m" /* cyan */ #define CL_SIL "\x1B[22;37m" /* gray */ @@ -290,9 +299,9 @@ struct thr_api { #else #define CL_GRY "\x1B[90m" /* dark gray selectable in putty */ #endif -#define CL_LRD "\x1B[01;31m" /* light red */ -#define CL_LGR "\x1B[01;32m" /* light green */ -#define CL_YL2 "\x1B[01;33m" /* yellow */ +#define CL_LRD "\x1B[01;31m" /* bright red */ +#define CL_LGR "\x1B[01;32m" /* bright green */ +#define CL_YL2 "\x1B[01;33m" /* bright yellow */ #define CL_LBL "\x1B[01;34m" /* light blue */ #define CL_LMA "\x1B[01;35m" /* light magenta */ #define CL_LCY "\x1B[01;36m" /* light cyan */ @@ -481,7 +490,7 @@ void format_hashrate(double hashrate, char *output); void print_hash_tests(void); void scale_hash_for_display ( double* hashrate, char* units ); - +void format_number_si( double* hashrate, char* si_units ); void report_summary_log( bool force ); /* diff --git a/simd-utils.h b/simd-utils.h index 55cc5529..f2e201d6 100644 --- a/simd-utils.h +++ b/simd-utils.h @@ -78,6 +78,8 @@ // - specialized shift and rotate functions that move elements around // use the notation "1x32" to indicate the distance moved as units of // the element size. +// Vector shuffle rotations are being renamed to "vrol" and "vror" +// to avoid confusion with bit rotations. // - there is a subset of some functions for scalar data. They may have // no prefix nor vec-size, just one size, the size of the data. // - Some integer functions are also defined which use a similar notation. diff --git a/simd-utils/intrlv.h b/simd-utils/intrlv.h index cedcae34..956f3e37 100644 --- a/simd-utils/intrlv.h +++ b/simd-utils/intrlv.h @@ -65,7 +65,7 @@ static inline void dintrlv_2x32( void *dst0, void *dst1, d0[24] = s[48]; d1[24] = s[49]; d0[25] = s[50]; d1[25] = s[51]; d0[26] = s[52]; d1[26] = s[53]; d0[27] = s[54]; d1[27] = s[55]; d0[28] = s[56]; d1[28] = s[57]; d0[29] = s[58]; d1[29] = s[59]; - d0[30] = s[60]; d1[30] = s[61]; d0[31] = s[61]; d1[31] = s[63]; + d0[30] = s[60]; d1[30] = s[61]; d0[31] = s[62]; d1[31] = s[63]; } static inline void extr_lane_2x32( void *dst, const void *src, diff --git a/simd-utils/simd-128.h b/simd-utils/simd-128.h index 90066f09..765d8479 100644 --- a/simd-utils/simd-128.h +++ b/simd-utils/simd-128.h @@ -35,6 +35,13 @@ /////////////////////////////////////////////////////////////////////////// +// Used instead if casting. +typedef union +{ + __m128i m128; + uint32_t u32[4]; +} __attribute__ ((aligned (16))) m128_ovly; + // Efficient and convenient moving between GP & low bits of XMM. // Use VEX when available to give access to xmm8-15 and zero extend for // larger vectors. 
@@ -61,7 +68,10 @@ static inline __m128i mm128_mov32_128( const uint32_t n ) return a; } -static inline uint64_t mm128_mov128_64( const __m128i a ) +// Inconstant naming, prefix should reflect return value: +// u64_mov128_64 + +static inline uint64_t u64_mov128_64( const __m128i a ) { uint64_t n; #if defined(__AVX__) @@ -72,7 +82,7 @@ static inline uint64_t mm128_mov128_64( const __m128i a ) return n; } -static inline uint32_t mm128_mov128_32( const __m128i a ) +static inline uint32_t u32_mov128_32( const __m128i a ) { uint32_t n; #if defined(__AVX__) @@ -166,12 +176,17 @@ static inline __m128i mm128_insert_32( const __m128i v, const uint32_t i, // Extract 32 bit element c from v and return as integer. static inline uint32_t mm128_extract_32( const __m128i v, const int c ) -{ return mm128_mov128_32( mm128_xim_32( v, v, c<<6 ) ); } +{ return u32_mov128_32( mm128_xim_32( v, v, c<<6 ) ); } // Clear (zero) 32 bit elements based on bits set in 4 bit mask. static inline __m128i mm128_mask_32( const __m128i v, const int m ) { return mm128_xim_32( v, v, m ); } +// Move element i2 of v2 to element i1 of v1. For reference and convenience, +// it's faster to precalculate the index. +#define mm128_shuflmov_32( v1, i1, v2, i2 ) \ + mm128_xim_32( v1, v2, ( (i1)<<4 ) | ( (i2)<<6 ) ) + #endif // SSE4_1 // @@ -257,12 +272,37 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n ) #endif + + +// Diagonal blend: d = s3[3], s2[2], s1[1], s0[0] || + +// Blend 4 32 bit elements from 4 vectors + +#if defined (__AVX2__) + +#define mm128_diagonal_32( v3, v2, v1, v0 ) \ + mm_blend_epi32( _mm_blend_epi32( s3, s2, 0x4 ), \ + _mm_blend_epi32( s1, s0, 0x1 ), 0x3 ) + +#elif defined(__SSE4_1) + +#define mm128_diagonal_32( v3, v2, v1, v0 ) \ + mm_blend_epi16( _mm_blend_epi16( s3, s2, 0x30 ), \ + _mm_blend_epi16( s1, s0, 0x03 ), 0x0f ) + +#endif + + // // Bit rotations // AVX512VL has implemented bit rotation for 128 bit vectors with // 64 and 32 bit elements. +// x2 rotates elements in 2 individual vectors in a double buffered +// optimization for SSE2, does nothing for AVX512 but is there for +// transparency. + // compiler doesn't like when a variable is used for the last arg of // _mm_rol_epi32, must be "8 bit immediate". Oddly _mm_slli has the same // specification but works with a variable. 
Therefore use rol_var where @@ -290,6 +330,22 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n ) #define mm128_ror_32 _mm_ror_epi32 #define mm128_rol_32 _mm_rol_epi32 +#define mm128_rorx2_64( v1, v0, c ) \ + _mm_ror_epi64( v0, c ); \ + _mm_ror_epi64( v1, c ) + +#define mm128_rolx2_64( v1, v0, c ) \ + _mm_rol_epi64( v0, c ); \ + _mm_rol_epi64( v1, c ) + +#define mm128_rorx2_32( v1, v0, c ) \ + _mm_ror_epi32( v0, c ); \ + _mm_ror_epi32( v1, c ) + +#define mm128_rolx2_32( v1, v0, c ) \ + _mm_rol_epi32( v0, c ); \ + _mm_rol_epi32( v1, c ) + #else // SSE2 #define mm128_ror_64 mm128_ror_var_64 @@ -297,6 +353,46 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n ) #define mm128_ror_32 mm128_ror_var_32 #define mm128_rol_32 mm128_rol_var_32 +#define mm128_rorx2_64( v1, v0, c ) \ +{ \ + __m128i t0 = _mm_srli_epi64( v0, c ); \ + __m128i t1 = _mm_srli_epi64( v1, c ); \ + v0 = _mm_slli_epi64( v0, 64-(c) ); \ + v1 = _mm_slli_epi64( v1, 64-(c) ); \ + v0 = _mm_or_si256( v0, t0 ); \ + v1 = _mm_or_si256( v1, t1 ); \ +} + +#define mm128_rolx2_64( v1, v0, c ) \ +{ \ + __m128i t0 = _mm_slli_epi64( v0, c ); \ + __m128i t1 = _mm_slli_epi64( v1, c ); \ + v0 = _mm_srli_epi64( v0, 64-(c) ); \ + v1 = _mm_srli_epi64( v1, 64-(c) ); \ + v0 = _mm_or_si256( v0, t0 ); \ + v1 = _mm_or_si256( v1, t1 ); \ +} + +#define mm128_rorx2_32( v1, v0, c ) \ +{ \ + __m128i t0 = _mm_srli_epi32( v0, c ); \ + __m128i t1 = _mm_srli_epi32( v1, c ); \ + v0 = _mm_slli_epi32( v0, 32-(c) ); \ + v1 = _mm_slli_epi32( v1, 32-(c) ); \ + v0 = _mm_or_si256( v0, t0 ); \ + v1 = _mm_or_si256( v1, t1 ); \ +} + +#define mm128_rolx2_32( v1, v0, c ) \ +{ \ + __m128i t0 = _mm_slli_epi32( v0, c ); \ + __m128i t1 = _mm_slli_epi32( v1, c ); \ + v0 = _mm_srli_epi32( v0, 32-(c) ); \ + v1 = _mm_srli_epi32( v1, 32-(c) ); \ + v0 = _mm_or_si256( v0, t0 ); \ + v1 = _mm_or_si256( v1, t1 ); \ +} + #endif // AVX512 else SSE2 #define mm128_ror_16( v, c ) \ @@ -309,16 +405,22 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n ) // Rotate vector elements accross all lanes #define mm128_swap_64( v ) _mm_shuffle_epi32( v, 0x4e ) -#define mm128_ror_1x32( v ) _mm_shuffle_epi32( v, 0x39 ) -#define mm128_rol_1x32( v ) _mm_shuffle_epi32( v, 0x93 ) +#define mm128_shuflr_64 mm128_swap_64 +#define mm128_shufll_64 mm128_swap_64 + +#define mm128_shuflr_32( v ) _mm_shuffle_epi32( v, 0x39 ) +#define mm128_shufll_32( v ) _mm_shuffle_epi32( v, 0x93 ) + // Swap 32 bit elements in 64 bit lanes #define mm128_swap64_32( v ) _mm_shuffle_epi32( v, 0xb1 ) +#define mm128_shuflr64_32 mm128_swap64_32 +#define mm128_shufll64_32 mm128_swap64_32 #if defined(__SSSE3__) // Rotate right by c bytes, no SSE2 equivalent. -static inline __m128i mm128_ror_x8( const __m128i v, const int c ) +static inline __m128i mm128_shuflr_x8( const __m128i v, const int c ) { return _mm_alignr_epi8( v, v, c ); } // @@ -422,59 +524,88 @@ static inline void mm128_block_bswap_32( __m128i *d, const __m128i *s ) v1 = _mm_xor_si128( v1, v2 ); +// Two input shuffle-rotate. // Concatenate v1 & v2 and rotate as one 256 bit vector. -#if defined(__SSE4_1__) +// Continue to use vror/vrol for now to avoid confusion with +// shufl2r/shufl2l function macros available with AVX512. + +#if defined(__SSSE3__) + +// Function macro with two inputs and one output, inputs are preserved. +// Returns modified first arg. +// Two input functions are not available without SSSE3. Use procedure +// belowe instead. 
+ +#define mm128_shufl2r_64( v1, v2 ) _mm_alignr_epi8( v2, v1, 8 ) +#define mm128_shufl2l_64( v1, v2 ) _mm_alignr_epi8( v1, v2, 8 ) + +#define mm128_shufl2r_32( v1, v2 ) _mm_alignr_epi8( v2, v1, 4 ) +#define mm128_shufl2l_32( v1, v2 ) _mm_alignr_epi8( v1, v2, 4 ) + +#define mm128_shufl2r_16( v1, v2 ) _mm_alignr_epi8( v2, v1, 2 ) +#define mm128_shufl2l_16( v1, v2 ) _mm_alignr_epi8( v1, v2, 2 ) + +#define mm128_shufl2r_8( v1, v2 ) _mm_alignr_epi8( v2, v1, 8 ) +#define mm128_shufl2l_8( v1, v2 ) _mm_alignr_epi8( v1, v2, 8 ) + +// Procedure macroswith 2 inputs and 2 outputs, inputs are destroyed. +// Returns both modified args in place. + +// These macros retain the vrol/vror name for now to avoid +// confusion with the shufl2r/shuffle2l function macros above. +// These may be renamed to something like shufl2r2 for 2 1nputs and +// 2 outputs, ie SHUFfLe 2 inputs Right with 2 outputs. -#define mm128_ror256_64( v1, v2 ) \ +#define mm128_vror256_64( v1, v2 ) \ do { \ __m128i t = _mm_alignr_epi8( v1, v2, 8 ); \ v1 = _mm_alignr_epi8( v2, v1, 8 ); \ v2 = t; \ } while(0) -#define mm128_rol256_64( v1, v2 ) \ +#define mm128_vrol256_64( v1, v2 ) \ do { \ __m128i t = _mm_alignr_epi8( v1, v2, 8 ); \ v2 = _mm_alignr_epi8( v2, v1, 8 ); \ v1 = t; \ } while(0) -#define mm128_ror256_32( v1, v2 ) \ +#define mm128_vror256_32( v1, v2 ) \ do { \ __m128i t = _mm_alignr_epi8( v1, v2, 4 ); \ v1 = _mm_alignr_epi8( v2, v1, 4 ); \ v2 = t; \ } while(0) -#define mm128_rol256_32( v1, v2 ) \ +#define mm128_vrol256_32( v1, v2 ) \ do { \ __m128i t = _mm_alignr_epi8( v1, v2, 12 ); \ v2 = _mm_alignr_epi8( v2, v1, 12 ); \ v1 = t; \ } while(0) -#define mm128_ror256_16( v1, v2 ) \ +#define mm128_vror256_16( v1, v2 ) \ do { \ __m128i t = _mm_alignr_epi8( v1, v2, 2 ); \ v1 = _mm_alignr_epi8( v2, v1, 2 ); \ v2 = t; \ } while(0) -#define mm128_rol256_16( v1, v2 ) \ +#define mm128_vrol256_16( v1, v2 ) \ do { \ __m128i t = _mm_alignr_epi8( v1, v2, 14 ); \ v2 = _mm_alignr_epi8( v2, v1, 14 ); \ v1 = t; \ } while(0) -#define mm128_ror256_8( v1, v2 ) \ +#define mm128_vror256_8( v1, v2 ) \ do { \ __m128i t = _mm_alignr_epi8( v1, v2, 1 ); \ v1 = _mm_alignr_epi8( v2, v1, 1 ); \ v2 = t; \ } while(0) -#define mm128_rol256_8( v1, v2 ) \ +#define mm128_vrol256_8( v1, v2 ) \ do { \ __m128i t = _mm_alignr_epi8( v1, v2, 15 ); \ v2 = _mm_alignr_epi8( v2, v1, 15 ); \ @@ -483,7 +614,7 @@ do { \ #else // SSE2 -#define mm128_ror256_64( v1, v2 ) \ +#define mm128_vror256_64( v1, v2 ) \ do { \ __m128i t = _mm_or_si128( _mm_srli_si128( v1, 8 ), \ _mm_slli_si128( v2, 8 ) ); \ @@ -492,7 +623,7 @@ do { \ v1 = t; \ } while(0) -#define mm128_rol256_64( v1, v2 ) \ +#define mm128_vrol256_64( v1, v2 ) \ do { \ __m128i t = _mm_or_si128( _mm_slli_si128( v1, 8 ), \ _mm_srli_si128( v2, 8 ) ); \ @@ -501,7 +632,7 @@ do { \ v1 = t; \ } while(0) -#define mm128_ror256_32( v1, v2 ) \ +#define mm128_vror256_32( v1, v2 ) \ do { \ __m128i t = _mm_or_si128( _mm_srli_si128( v1, 4 ), \ _mm_slli_si128( v2, 12 ) ); \ @@ -510,7 +641,7 @@ do { \ v1 = t; \ } while(0) -#define mm128_rol256_32( v1, v2 ) \ +#define mm128_vrol256_32( v1, v2 ) \ do { \ __m128i t = _mm_or_si128( _mm_slli_si128( v1, 4 ), \ _mm_srli_si128( v2, 12 ) ); \ @@ -519,7 +650,7 @@ do { \ v1 = t; \ } while(0) -#define mm128_ror256_16( v1, v2 ) \ +#define mm128_vror256_16( v1, v2 ) \ do { \ __m128i t = _mm_or_si128( _mm_srli_si128( v1, 2 ), \ _mm_slli_si128( v2, 14 ) ); \ @@ -528,7 +659,7 @@ do { \ v1 = t; \ } while(0) -#define mm128_rol256_16( v1, v2 ) \ +#define mm128_vrol256_16( v1, v2 ) \ do { \ __m128i t = 
_mm_or_si128( _mm_slli_si128( v1, 2 ), \ _mm_srli_si128( v2, 14 ) ); \ @@ -537,7 +668,7 @@ do { \ v1 = t; \ } while(0) -#define mm128_ror256_8( v1, v2 ) \ +#define mm128_vror256_8( v1, v2 ) \ do { \ __m128i t = _mm_or_si128( _mm_srli_si128( v1, 1 ), \ _mm_slli_si128( v2, 15 ) ); \ @@ -546,7 +677,7 @@ do { \ v1 = t; \ } while(0) -#define mm128_rol256_8( v1, v2 ) \ +#define mm128_vrol256_8( v1, v2 ) \ do { \ __m128i t = _mm_or_si128( _mm_slli_si128( v1, 1 ), \ _mm_srli_si128( v2, 15 ) ); \ diff --git a/simd-utils/simd-256.h b/simd-utils/simd-256.h index 125e2c82..3d840107 100644 --- a/simd-utils/simd-256.h +++ b/simd-utils/simd-256.h @@ -14,13 +14,28 @@ // is limited because 256 bit vectors are less likely to be used when 512 // is available. +// Used instead if casting. +typedef union +{ + __m256i m256; + __m128i m128[2]; + uint64_t u64[4]; + uint32_t u32[8]; +} __attribute__ ((aligned (32))) m256_ovly; + + // Move integer to low element of vector, other elements are set to zero. #define mm256_mov64_256( i ) _mm256_castsi128_si256( mm128_mov64_128( i ) ) #define mm256_mov32_256( i ) _mm256_castsi128_si256( mm128_mov32_128( i ) ) // Move low element of vector to integer. -#define mm256_mov256_64( v ) mm128_mov128_64( _mm256_castsi256_si128( v ) ) -#define mm256_mov256_32( v ) mm128_mov128_32( _mm256_castsi256_si128( v ) ) +#define u64_mov256_64( v ) u64_mov128_64( _mm256_castsi256_si128( v ) ) +#define u32_mov256_32( v ) u32_mov128_32( _mm256_castsi256_si128( v ) ) + +// deprecated +//#define mm256_mov256_64 u64_mov256_64 +//#define mm256_mov256_32 u32_mov256_32 + // concatenate two 128 bit vectors into one 256 bit vector: { hi, lo } #define mm256_concat_128( hi, lo ) \ @@ -214,12 +229,41 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n ) #endif +// Diagonal blending + +// Blend 4 64 bit elements from 4 vectors +#define mm256_diagonal_64( v3, v2, v1, v0 ) \ + mm256_blend_epi32( _mm256_blend_epi32( v3, v2, 0x30 ), \ + _mm256_blend_epi32( v1, v0, 0x03 ), 0x0f ) + +// Blend 8 32 bit elements from 8 vectors +#define mm256_diagonal_32( v7, v6, v5, v4, v3, v2, v1, v0 ) \ + _mm256_blend_epi32( \ + _mm256_blend_epi32( \ + _mm256_blend_epi32( v7, v6, 0x40 ), \ + _mm256_blend_epi32( v5, v4, 0x10 ) 0x30 ), \ + _mm256_blend_epi32( \ + _mm256_blend_epi32( v3, v2, 0x04) \ + _mm256_blend_epi32( v1, v0, 0x01 ), 0x03 ), 0x0f ) + + +// Blend 4 32 bit elements from each 128 bit lane. +#define mm256_diagonal128_32( v3, v2, v1, v0 ) \ + _mm256_blend_epi32( \ + _mm256_blend_epi32( v3, v2, 0x44) \ + _mm256_blend_epi32( v1, v0, 0x11 ) ) + + // // Bit rotations. // -// The only bit shift for more than 64 bits is with __int128. +// The only bit shift for more than 64 bits is with __int128 which is slow. // // AVX512 has bit rotate for 256 bit vectors with 64 or 32 bit elements +// +// x2 rotates elements in 2 individual vectors in a double buffered +// optimization for SSE2, does nothing for AVX512 but is there for +// transparency. 
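// A minimal scalar sketch, not from the patch, of the double buffered idea
// behind the x2 macros: the two rotations are independent, so their shifts
// can be interleaved and overlap in the pipeline. Helper name is
// illustrative only; assumes <stdint.h> and 0 < c < 64.

#include <stdint.h>

static inline void ror64_x2_sketch( uint64_t *v1, uint64_t *v0, const int c )
{
   uint64_t t0 = *v0 >> c;              // both right shifts issue back to back
   uint64_t t1 = *v1 >> c;
   *v0 = ( *v0 << ( 64 - c ) ) | t0;    // then both left shifts and merges
   *v1 = ( *v1 << ( 64 - c ) ) | t1;
}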
// compiler doesn't like when a variable is used for the last arg of @@ -255,6 +299,22 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n ) #define mm256_ror_32 _mm256_ror_epi32 #define mm256_rol_32 _mm256_rol_epi32 +#define mm256_rorx2_64( v1, v0, c ) \ + _mm256_ror_epi64( v0, c ); \ + _mm256_ror_epi64( v1, c ) + +#define mm256_rolx2_64( v1, v0, c ) \ + _mm256_rol_epi64( v0, c ); \ + _mm256_rol_epi64( v1, c ) + +#define mm256_rorx2_32( v1, v0, c ) \ + _mm256_ror_epi32( v0, c ); \ + _mm256_ror_epi32( v1, c ) + +#define mm256_rolx2_32( v1, v0, c ) \ + _mm256_rol_epi32( v0, c ); \ + _mm256_rol_epi32( v1, c ) + #else // AVX2 #define mm256_ror_64 mm256_ror_var_64 @@ -262,6 +322,46 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n ) #define mm256_ror_32 mm256_ror_var_32 #define mm256_rol_32 mm256_rol_var_32 +#define mm256_rorx2_64( v1, v0, c ) \ +{ \ + __m256i t0 = _mm256_srli_epi64( v0, c ); \ + __m256i t1 = _mm256_srli_epi64( v1, c ); \ + v0 = _mm256_slli_epi64( v0, 64-(c) ); \ + v1 = _mm256_slli_epi64( v1, 64-(c) ); \ + v0 = _mm256_or_si256( v0, t0 ); \ + v1 = _mm256_or_si256( v1, t1 ); \ +} + +#define mm256_rolx2_64( v1, v0, c ) \ +{ \ + __m256i t0 = _mm256_slli_epi64( v0, c ); \ + __m256i t1 = _mm256_slli_epi64( v1, c ); \ + v0 = _mm256_srli_epi64( v0, 64-(c) ); \ + v1 = _mm256_srli_epi64( v1, 64-(c) ); \ + v0 = _mm256_or_si256( v0, t0 ); \ + v1 = _mm256_or_si256( v1, t1 ); \ +} + +#define mm256_rorx2_32( v1, v0, c ) \ +{ \ + __m256i t0 = _mm256_srli_epi32( v0, c ); \ + __m256i t1 = _mm256_srli_epi32( v1, c ); \ + v0 = _mm256_slli_epi32( v0, 32-(c) ); \ + v1 = _mm256_slli_epi32( v1, 32-(c) ); \ + v0 = _mm256_or_si256( v0, t0 ); \ + v1 = _mm256_or_si256( v1, t1 ); \ +} + +#define mm256_rolx2_32( v1, v0, c ) \ +{ \ + __m256i t0 = _mm256_slli_epi32( v0, c ); \ + __m256i t1 = _mm256_slli_epi32( v1, c ); \ + v0 = _mm256_srli_epi32( v0, 32-(c) ); \ + v1 = _mm256_srli_epi32( v1, 32-(c) ); \ + v0 = _mm256_or_si256( v0, t0 ); \ + v1 = _mm256_or_si256( v1, t1 ); \ +} + #endif // AVX512 else AVX2 #define mm256_ror_16( v, c ) \ @@ -276,58 +376,45 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n ) // // Rotate elements accross all lanes. -#if defined(__AVX512VL__) - -static inline __m256i mm256_swap_128( const __m256i v ) -{ return _mm256_alignr_epi64( v, v, 2 ); } - -static inline __m256i mm256_ror_1x64( const __m256i v ) -{ return _mm256_alignr_epi64( v, v, 1 ); } - -static inline __m256i mm256_rol_1x64( const __m256i v ) -{ return _mm256_alignr_epi64( v, v, 3 ); } - -static inline __m256i mm256_ror_1x32( const __m256i v ) -{ return _mm256_alignr_epi32( v, v, 1 ); } - -static inline __m256i mm256_rol_1x32( const __m256i v ) -{ return _mm256_alignr_epi32( v, v, 7 ); } - -#else // AVX2 - // Swap 128 bit elements in 256 bit vector. #define mm256_swap_128( v ) _mm256_permute4x64_epi64( v, 0x4e ) +#define mm256_shuflr_128 mm256_swap_128 +#define mm256_shufll_128 mm256_swap_128 // Rotate 256 bit vector by one 64 bit element -#define mm256_ror_1x64( v ) _mm256_permute4x64_epi64( v, 0x39 ) -#define mm256_rol_1x64( v ) _mm256_permute4x64_epi64( v, 0x93 ) +#define mm256_shuflr_64( v ) _mm256_permute4x64_epi64( v, 0x39 ) + +#define mm256_shufll_64( v ) _mm256_permute4x64_epi64( v, 0x93 ) // Rotate 256 bit vector by one 32 bit element. 
-#define mm256_ror_1x32( v ) \ +#define mm256_shuflr_32( v ) \ _mm256_permutevar8x32_epi32( v, \ m256_const_64( 0x0000000000000007, 0x0000000600000005, \ - 0x0000000400000003, 0x0000000200000001 ) + 0x0000000400000003, 0x0000000200000001 ) ) -#define mm256_rol_1x32( v ) \ +#define mm256_shufll_32( v ) \ _mm256_permutevar8x32_epi32( v, \ m256_const_64( 0x0000000600000005, 0x0000000400000003, \ - 0x0000000200000001, 0x0000000000000007 ) + 0x0000000200000001, 0x0000000000000007 ) ) -#endif // AVX512 else AVX2 - // // Rotate elements within each 128 bit lane of 256 bit vector. #define mm256_swap128_64( v ) _mm256_shuffle_epi32( v, 0x4e ) -#define mm256_ror128_32( v ) _mm256_shuffle_epi32( v, 0x39 ) -#define mm256_rol128_32( v ) _mm256_shuffle_epi32( v, 0x93 ) +#define mm256_shuflr128_64 mm256_swap128_64 +#define mm256_shufll128_64 mm256_swap128_64 + +#define mm256_shuflr128_32( v ) _mm256_shuffle_epi32( v, 0x39 ) +#define mm256_shufll128_32( v ) _mm256_shuffle_epi32( v, 0x93 ) -static inline __m256i mm256_ror128_x8( const __m256i v, const int c ) +static inline __m256i mm256_shuflr128_x8( const __m256i v, const int c ) { return _mm256_alignr_epi8( v, v, c ); } // Swap 32 bit elements in each 64 bit lane. #define mm256_swap64_32( v ) _mm256_shuffle_epi32( v, 0xb1 ) +#define mm256_shuflr64_32 mm256_swap64_32 +#define mm256_shufll64_32 mm256_swap64_32 // // Swap bytes in vector elements, endian bswap. @@ -387,19 +474,21 @@ static inline __m256i mm256_ror128_x8( const __m256i v, const int c ) // _mm256_alignr_epi 64/32 are only available with AVX512 but AVX512 also // makes these macros unnecessary. +// continue using vror/vrol notation for now to avoid confusion with +// shufl2r/shufl2l macro functions available with AVX512. #define mm256_swap512_256( v1, v2 ) \ v1 = _mm256_xor_si256( v1, v2 ); \ v2 = _mm256_xor_si256( v1, v2 ); \ v1 = _mm256_xor_si256( v1, v2 ); -#define mm256_ror512_128( v1, v2 ) \ +#define mm256_vror512_128( v1, v2 ) \ do { \ __m256i t = _mm256_permute2x128( v1, v2, 0x03 ); \ v1 = _mm256_permute2x128( v2, v1, 0x21 ); \ v2 = t; \ } while(0) -#define mm256_rol512_128( v1, v2 ) \ +#define mm256_vrol512_128( v1, v2 ) \ do { \ __m256i t = _mm256_permute2x128( v1, v2, 0x03 ); \ v2 = _mm256_permute2x128( v2, v1, 0x21 ); \ diff --git a/simd-utils/simd-512.h b/simd-utils/simd-512.h index e6b7ac22..de948cc4 100644 --- a/simd-utils/simd-512.h +++ b/simd-utils/simd-512.h @@ -74,13 +74,22 @@ // __AVX512VBMI__ __AVX512VAES__ // +// Used instead if casting. +typedef union +{ + __m512i m512; + __m128i m128[4]; + uint32_t u32[16]; + uint64_t u64[8]; +} __attribute__ ((aligned (64))) m512_ovly; + // Move integer to/from element 0 of vector. #define mm512_mov64_512( n ) _mm512_castsi128_si512( mm128_mov64_128( n ) ) #define mm512_mov32_512( n ) _mm512_castsi128_si512( mm128_mov32_128( n ) ) -#define mm512_mov256_64( a ) mm128_mov128_64( _mm256_castsi512_si128( a ) ) -#define mm512_mov256_32( a ) mm128_mov128_32( _mm256_castsi512_si128( a ) ) +#define u64_mov512_64( a ) u64_mov128_64( _mm256_castsi512_si128( a ) ) +#define u32_mov512_32( a ) u32_mov128_32( _mm256_castsi512_si128( a ) ) // A simple 128 bit permute, using function instead of macro avoids // problems if the v arg passed as an expression. 
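// A hedged illustration of that point, not from the patch; the macro below
// is hypothetical and exists only to show the hazard the inline function
// avoids: a macro repeats its argument, so an expression argument is
// evaluated once per appearance, while a function parameter is evaluated
// exactly once.

#define perm_128_macro_sketch( v, c )  _mm512_shuffle_i64x2( v, v, c )

static inline __m512i perm_128_twice_sketch( const __m512i a, const __m512i b )
{  // the addition below is expanded, and executed, twice
   return perm_128_macro_sketch( _mm512_add_epi64( a, b ), 0x4e );
}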
@@ -91,6 +100,10 @@ static inline __m512i mm512_perm_128( const __m512i v, const int c ) #define mm512_concat_256( hi, lo ) \ _mm512_inserti64x4( _mm512_castsi256_si512( lo ), hi, 1 ) +#define m512_const_128( v3, v2, v1, v0 ) \ + mm512_concat_256( mm256_concat_128( v3, v2 ), \ + mm256_concat_128( v1, v0 ) ) + // Equivalent of set, assign 64 bit integers to respective 64 bit elements. // Use stack memory overlay static inline __m512i m512_const_64( const uint64_t i7, const uint64_t i6, @@ -225,7 +238,7 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n ) // // Ternary logic uses 8 bit truth table to define any 3 input logical -// operation using any number or combinations of AND, OR XOR, NOT. +// expression using any number or combinations of AND, OR, XOR, NOT. // a ^ b ^ c #define mm512_xor3( a, b, c ) \ @@ -251,11 +264,11 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n ) #define mm512_andxor( a, b, c ) \ _mm512_ternarylogic_epi64( a, b, c, 0x60 ) -// a ^ ( b & c ) +// a ^ ( b | c ) #define mm512_xoror( a, b, c ) \ _mm512_ternarylogic_epi64( a, b, c, 0x1e ) -// a ^ ( ~b & c ) [ xor( a, andnot( b, c ) ] +// a ^ ( ~b & c ) xor( a, andnot( b, c ) ) #define mm512_xorandnot( a, b, c ) \ _mm512_ternarylogic_epi64( a, b, c, 0xd2 ) @@ -265,11 +278,11 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n ) // Some 2 input operations that don't have their own instruction mnemonic. -// ~( a | b ) +// ~( a | b ), (~a) & (~b) #define mm512_nor( a, b ) \ _mm512_ternarylogic_epi64( a, b, b, 0x01 ) -// ~( a ^ b ), same as (~a) ^ b +// ~( a ^ b ), (~a) ^ b #define mm512_xnor( a, b ) \ _mm512_ternarylogic_epi64( a, b, b, 0x81 ) @@ -278,6 +291,27 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n ) _mm512_ternarylogic_epi64( a, b, b, 0xef ) +// Diagonal blending +// Blend 8 64 bit elements from 8 vectors +#define mm512_diagonal_64( v7, v6, v5, v4, v3, v2, v1, v0 ) \ + _mm512_mask_blend_epi64( 0x0f, \ + _mm512_mask_blend_epi64( 0x30, \ + _mm512_mask_blend_epi64( 0x40, v7, v6 ), \ + _mm512_mask_blend_epi64( 0x40, v5, v4 ) ), \ + _mm512_mask_blend_epi64( 0x03, \ + _mm512_mask_blend_epi64( 0x04, v3, v2 ) \ + _mm512_mask_blend_epi64( 0x01, v1, v0 ) ) ) + + +// Blend 4 32 bit elements from each 128 bit lane. +#define mm512_diagonal128_32( v3, v2, v1, v0 ) \ + _mm512_mask_blend_epi32( 0x3333, \ + _mm512_mask_blend_epi32( 0x4444, v3, v2 ), \ + _mm512_mask_blend_epi32( 0x1111, v1, v0 ) ) + + + + // Bit rotations. // AVX512F has built-in fixed and variable bit rotation for 64 & 32 bit @@ -395,59 +429,95 @@ static inline __m512i mm512_rol_16( const __m512i v, const int c ) casti_m512i( d, 7 ) = _mm512_shuffle_epi8( casti_m512i( s, 7 ), ctl ); \ } while(0) - // -// Rotate elements in 512 bit vector. +// Shift with zero fill & shuffle-rotate elements in 512 bit vector. +// + +// rename plan change ror to vror for Vector ROtate Right, +// and vrol for Vector ROtate Left, not to be confused with +//variable rotate rorv, rolv, +// Plan changed, use shuflr & shufll instead symbolizing a shuffle-rotate +// operation. 1xNN notaion ia also removed and replaced with simpler NN. +// Swap will still have its own mnemonic and will be aliased as both +// left and right shuffles. + +// Shift elements right or left in 512 bit vector, filling with zeros. +// Multiple element shifts can be combined into a single larger +// element shift. 
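// A small sketch of that combining rule, not from the patch, written with
// raw intrinsics rather than the macros below: two one element right shifts
// with zero fill equal a single two element shift, so chained shifts can be
// collapsed at the call site.

static inline __m512i shiftr_2x64_sketch( const __m512i v )
{
   const __m512i z = _mm512_setzero_si512();
   __m512i t = _mm512_alignr_epi64( z, v, 1 );   // shift right one element
   t         = _mm512_alignr_epi64( z, t, 1 );   // and again
   return t;   // identical to _mm512_alignr_epi64( z, v, 2 )
}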
+ +#define mm512_shiftr_256( v ) \ + _mm512_alignr_epi64( _mm512_setzero, v, 4 ) +#define mm512_shiftl_256( v ) mm512_shifr_256 + +#define mm512_shiftr_128( v ) \ + _mm512_alignr_epi64( _mm512_setzero, v, 2 ) +#define mm512_shiftl_128( v ) \ + _mm512_alignr_epi64( v, _mm512_setzero, 6 ) + +#define mm512_shiftr_64( v ) \ + _mm512_alignr_epi64( _mm512_setzero, v, 1 ) +#define mm512_shiftl_64( v ) \ + _mm512_alignr_epi64( v, _mm512_setzero, 7 ) + +#define mm512_shiftr_32( v ) \ + _mm512_alignr_epi32( _mm512_setzero, v, 1 ) +#define mm512_shiftl_32( v ) \ + _mm512_alignr_epi32( v, _mm512_setzero, 15 ) + +// Shuffle-rotate elements left or right in 512 bit vector. static inline __m512i mm512_swap_256( const __m512i v ) { return _mm512_alignr_epi64( v, v, 4 ); } +#define mm512_shuflr_256( v ) mm512_swap_256 +#define mm512_shufll_256( v ) mm512_swap_256 -static inline __m512i mm512_ror_1x128( const __m512i v ) +static inline __m512i mm512_shuflr_128( const __m512i v ) { return _mm512_alignr_epi64( v, v, 2 ); } -static inline __m512i mm512_rol_1x128( const __m512i v ) +static inline __m512i mm512_shufll_128( const __m512i v ) { return _mm512_alignr_epi64( v, v, 6 ); } -static inline __m512i mm512_ror_1x64( const __m512i v ) +static inline __m512i mm512_shuflr_64( const __m512i v ) { return _mm512_alignr_epi64( v, v, 1 ); } -static inline __m512i mm512_rol_1x64( const __m512i v ) +static inline __m512i mm512_shufll_64( const __m512i v ) { return _mm512_alignr_epi64( v, v, 7 ); } -static inline __m512i mm512_ror_1x32( const __m512i v ) +static inline __m512i mm512_shuflr_32( const __m512i v ) { return _mm512_alignr_epi32( v, v, 1 ); } -static inline __m512i mm512_rol_1x32( const __m512i v ) +static inline __m512i mm512_shufll_32( const __m512i v ) { return _mm512_alignr_epi32( v, v, 15 ); } -static inline __m512i mm512_ror_x64( const __m512i v, const int n ) +// Generic +static inline __m512i mm512_shuflr_x64( const __m512i v, const int n ) { return _mm512_alignr_epi64( v, v, n ); } -static inline __m512i mm512_ror_x32( const __m512i v, const int n ) +static inline __m512i mm512_shufll_x32( const __m512i v, const int n ) { return _mm512_alignr_epi32( v, v, n ); } -#define mm512_ror_1x16( v ) \ +#define mm512_shuflr_16( v ) \ _mm512_permutexvar_epi16( m512_const_64( \ 0x0000001F001E001D, 0x001C001B001A0019, \ 0X0018001700160015, 0X0014001300120011, \ 0X0010000F000E000D, 0X000C000B000A0009, \ 0X0008000700060005, 0X0004000300020001 ), v ) -#define mm512_rol_1x16( v ) \ +#define mm512_shufll_16( v ) \ _mm512_permutexvar_epi16( m512_const_64( \ 0x001E001D001C001B, 0x001A001900180017, \ 0X0016001500140013, 0X001200110010000F, \ 0X000E000D000C000B, 0X000A000900080007, \ 0X0006000500040003, 0X000200010000001F ), v ) -#define mm512_ror_1x8( v ) \ +#define mm512_shuflr_8( v ) \ _mm512_shuffle_epi8( v, m512_const_64( \ 0x003F3E3D3C3B3A39, 0x3837363534333231, \ 0x302F2E2D2C2B2A29, 0x2827262524232221, \ 0x201F1E1D1C1B1A19. 0x1817161514131211, \ 0x100F0E0D0C0B0A09, 0x0807060504030201 ) ) -#define mm512_rol_1x8( v ) \ +#define mm512_shufll_8( v ) \ _mm512_shuffle_epi8( v, m512_const_64( \ 0x3E3D3C3B3A393837, 0x363534333231302F. \ 0x2E2D2C2B2A292827, 0x262524232221201F, \ @@ -456,51 +526,55 @@ static inline __m512i mm512_ror_x32( const __m512i v, const int n ) // // Rotate elements within 256 bit lanes of 512 bit vector. +// 128 bit lane shift is handled by bslli bsrli. 
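// A brief sketch of that remark, not from the patch: the byte shift
// intrinsics already work per 128 bit lane, so no dedicated macro is needed
// here. Assumes AVX512BW, which this header appears to rely on elsewhere.

static inline __m512i lane128_shiftl_32_sketch( const __m512i v )
{  // shift each 128 bit lane left by one 32 bit element, zero fill
   return _mm512_bslli_epi128( v, 4 );
}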
// Swap hi & lo 128 bits in each 256 bit lane #define mm512_swap256_128( v ) _mm512_permutex_epi64( v, 0x4e ) +#define mm512_shuflr256_128 mm512_swap256_128 +#define mm512_shufll256_128 mm512_swap256_128 // Rotate 256 bit lanes by one 64 bit element -#define mm512_ror256_64( v ) _mm512_permutex_epi64( v, 0x39 ) -#define mm512_rol256_64( v ) _mm512_permutex_epi64( v, 0x93 ) +#define mm512_shuflr256_64( v ) _mm512_permutex_epi64( v, 0x39 ) + +#define mm512_shufll256_64( v ) _mm512_permutex_epi64( v, 0x93 ) // Rotate 256 bit lanes by one 32 bit element -#define mm512_ror256_32( v ) \ +#define mm512_shuflr256_32( v ) \ _mm512_permutexvar_epi32( m512_const_64( \ 0x000000080000000f, 0x0000000e0000000d, \ 0x0000000c0000000b, 0x0000000a00000009, \ 0x0000000000000007, 0x0000000600000005, \ 0x0000000400000003, 0x0000000200000001 ), v ) -#define mm512_rol256_32( v ) \ +#define mm512_shufll256_32( v ) \ _mm512_permutexvar_epi32( m512_const_64( \ 0x0000000e0000000d, 0x0000000c0000000b, \ 0x0000000a00000009, 0x000000080000000f, \ 0x0000000600000005, 0x0000000400000003, \ 0x0000000200000001, 0x0000000000000007 ), v ) -#define mm512_ror256_16( v ) \ +#define mm512_shuflr256_16( v ) \ _mm512_permutexvar_epi16( m512_const_64( \ 0x00100001001e001d, 0x001c001b001a0019, \ 0x0018001700160015, 0x0014001300120011, \ 0x0000000f000e000d, 0x000c000b000a0009, \ 0x0008000700060005, 0x0004000300020001 ), v ) -#define mm512_rol256_16( v ) \ +#define mm512_shufll256_16( v ) \ _mm512_permutexvar_epi16( m512_const_64( \ 0x001e001d001c001b, 0x001a001900180017, \ 0x0016001500140013, 0x001200110010001f, \ 0x000e000d000c000b, 0x000a000900080007, \ 0x0006000500040003, 0x000200010000000f ), v ) -#define mm512_ror256_8( v ) \ +#define mm512_shuflr256_8( v ) \ _mm512_shuffle_epi8( v, m512_const_64( \ 0x203f3e3d3c3b3a39, 0x3837363534333231, \ 0x302f2e2d2c2b2a29, 0x2827262524232221, \ 0x001f1e1d1c1b1a19, 0x1817161514131211, \ 0x100f0e0d0c0b0a09, 0x0807060504030201 ) ) -#define mm512_rol256_8( v ) \ +#define mm512_shufll256_8( v ) \ _mm512_shuffle_epi8( v, m512_const_64( \ 0x3e3d3c3b3a393837, 0x363534333231302f, \ 0x2e2d2c2b2a292827, 0x262524232221203f, \ @@ -508,82 +582,120 @@ static inline __m512i mm512_ror_x32( const __m512i v, const int n ) 0x0e0d0c0b0a090807, 0x060504030201001f ) ) // -// Rotate elements within 128 bit lanes of 512 bit vector. - +// Shuffle-roate elements within 128 bit lanes of 512 bit vector. + // Swap 64 bits in each 128 bit lane #define mm512_swap128_64( v ) _mm512_shuffle_epi32( v, 0x4e ) +#define mm512_shuflr128_64 mm512_swap128_64 +#define mm512_shufll128_64 mm512_swap128_64 // Rotate 128 bit lanes by one 32 bit element -#define mm512_ror128_32( v ) _mm512_shuffle_epi32( v, 0x39 ) -#define mm512_rol128_32( v ) _mm512_shuffle_epi32( v, 0x93 ) +#define mm512_shuflr128_32( v ) _mm512_shuffle_epi32( v, 0x39 ) +#define mm512_shufll128_32( v ) _mm512_shuffle_epi32( v, 0x93 ) -// Rotate right 128 bit lanes by c bytes -static inline __m512i mm512_ror128_x8( const __m512i v, const int c ) +// Rotate right 128 bit lanes by c bytes, versatile and just as fast +static inline __m512i mm512_shuflr128_8( const __m512i v, const int c ) { return _mm512_alignr_epi8( v, v, c ); } -// Swap 32 bits in each 64 bit lane. +// Swap 32 bits in each 64 bit lane. Can be done with rotate instruction +// but only with AVX512. Shuffle is just as fast and availble with AVX2 +// & SSE2. 
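// A minimal sketch of that equivalence, not from the patch: rotating each
// 64 bit element by 32 bits and shuffling its 32 bit halves give the same
// result, the shuffle simply has wider availability.

static inline __m512i swap64_32_by_rotate_sketch( const __m512i v )
{  // AVX512 only
   return _mm512_rol_epi64( v, 32 );
}

static inline __m512i swap64_32_by_shuffle_sketch( const __m512i v )
{  // same result; the 0xb1 pattern is also usable with the 256 and 128 bit
   // _mm256_shuffle_epi32 / _mm_shuffle_epi32 forms
   return _mm512_shuffle_epi32( v, 0xb1 );
}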
#define mm512_swap64_32( v ) _mm512_shuffle_epi32( v, 0xb1 ) +#define mm512_shuflr64_32 mm512_swap64_32 +#define mm512_shufll64_32 mm512_swap64_32 - +// Need good way to distinguish 1 input shuffles, 2 input shuffle functions, +// and 2 input 2 output shuffle macros. // -// Rotate elements from 2 512 bit vectors in place, source arguments +// shuflr is 1 input +// shufl2r is 2 input ... +// Drop macros? They can easilly be rebuilt using shufl2 functions + +// add shuflr shufll functions performing rotate, returning first arg +// They're faster than doing both, when both not needed. + +// Shuffle concatenated { v1, v2 ) right or left by 256 bits and return +// rotated v1 +// visually confusing for shif2r because of arg order. First arg is always +// the target for modification, either update by reference or by function +// return. +#define mm512_shufl2r_256( v1, v2 ) _mm512_alignr_epi64( v2, v1, 4 ) +#define mm512_shufl2l_256( v1, v2 ) _mm512_alignr_epi64( v1, v2, 4 ) + +#define mm512_shufl2r_128( v1, v2 ) _mm512_alignr_epi64( v2, v1, 2 ) +#define mm512_shufl2l_128( v1, v2 ) _mm512_alignr_epi64( v1, v2, 2 ) + +#define mm512_shufl2r_64( v1, v2 ) _mm512_alignr_epi64( v2, v1, 1 ) +#define mm512_shufl2l_64( v1, v2 ) _mm512_alignr_epi64( v1, v2, 1 ) + +#define mm512_shufl2r_32( v1, v2 ) _mm512_alignr_epi32( v2, v1, 1 ) +#define mm512_shufl2l_32( v1, v2 ) _mm512_alignr_epi32( v1, v2, 1 ) + +// Rotate elements from 2 512 bit vectors in place, source arguments // are overwritten. #define mm512_swap1024_512( v1, v2 ) \ v1 = _mm512_xor_si512( v1, v2 ); \ v2 = _mm512_xor_si512( v1, v2 ); \ v1 = _mm512_xor_si512( v1, v2 ); +#define mm512_shufl2l_512 mm512_swap1024_512 \ +#define mm512_shufl2r_512 mm512_swap1024_512 \ + +// Deprecated, will be removed. Use shufl2 functions instead. Leave them as is +// for now. +// Rotate elements from 2 512 bit vectors in place, both source arguments +// are updated. 
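// An element level sketch, not from the patch, of what the shufl2 functions
// referred to above return; the scalar helper is illustrative only.
// mm512_shufl2r_64( v1, v2 ) yields v1 shifted right by one 64 bit element
// with the vacated top element taken from the bottom of v2:

#include <stdint.h>

static inline void shufl2r_64_scalar_sketch( uint64_t r[8],
                           const uint64_t v1[8], const uint64_t v2[8] )
{
   for ( int i = 0; i < 7; i++ )  r[i] = v1[ i + 1 ];
   r[7] = v2[0];
}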
-#define mm512_ror1024_256( v1, v2 ) \ +#define mm512_vror1024_256( v1, v2 ) \ do { \ __m512i t = _mm512_alignr_epi64( v1, v2, 4 ); \ v1 = _mm512_alignr_epi64( v2, v1, 4 ); \ v2 = t; \ } while(0) -#define mm512_rol1024_256( v1, v2 ) \ +#define mm512_vrol1024_256( v1, v2 ) \ do { \ __m512i t = _mm512_alignr_epi64( v1, v2, 4 ); \ v2 = _mm512_alignr_epi64( v2, v1, 4 ); \ v1 = t; \ } while(0) -#define mm512_ror1024_128( v1, v2 ) \ +#define mm512_vror1024_128( v1, v2 ) \ do { \ __m512i t = _mm512_alignr_epi64( v1, v2, 2 ); \ v1 = _mm512_alignr_epi64( v2, v1, 2 ); \ v2 = t; \ } while(0) -#define mm512_rol1024_128( v1, v2 ) \ +#define mm512_vrol1024_128( v1, v2 ) \ do { \ __m512i t = _mm512_alignr_epi64( v1, v2, 6 ); \ v2 = _mm512_alignr_epi64( v2, v1, 6 ); \ v1 = t; \ } while(0) -#define mm512_ror1024_64( v1, v2 ) \ +#define mm512_vror1024_64( v1, v2 ) \ do { \ __m512i t = _mm512_alignr_epi64( v1, v2, 1 ); \ v1 = _mm512_alignr_epi64( v2, v1, 1 ); \ v2 = t; \ } while(0) -#define mm512_rol1024_64( v1, v2 ) \ +#define mm512_vrol1024_64( v1, v2 ) \ do { \ __m512i t = _mm512_alignr_epi64( v1, v2, 7 ); \ v2 = _mm512_alignr_epi64( v2, v1, 7 ); \ v1 = t; \ } while(0) -#define mm512_ror1024_32( v1, v2 ) \ +#define mm512_vror1024_32( v1, v2 ) \ do { \ __m512i t = _mm512_alignr_epi32( v1, v2, 1 ); \ v1 = _mm512_alignr_epi32( v2, v1, 1 ); \ v2 = t; \ } while(0) -#define mm512_rol1024_32( v1, v2 ) \ +#define mm512_vrol1024_32( v1, v2 ) \ do { \ __m512i t = _mm512_alignr_epi32( v1, v2, 15 ); \ v2 = _mm512_alignr_epi32( v2, v1, 15 ); \ diff --git a/simd-utils/simd-64.h b/simd-utils/simd-64.h index e74066b6..31b0b89a 100644 --- a/simd-utils/simd-64.h +++ b/simd-utils/simd-64.h @@ -68,13 +68,13 @@ // rotation. // Swap hi & lo 32 bits. -#define mm64_swap_32( a ) _mm_shuffle_pi16( a, 0x4e ) +#define mm64_swap_32( a ) _mm_shuffle_pi16( a, 0x4e ) -#define mm64_ror64_1x16( a ) _mm_shuffle_pi16( a, 0x39 ) -#define mm64_rol64_1x16( a ) _mm_shuffle_pi16( a, 0x93 ) +#define mm64_shulfr_16( a ) _mm_shuffle_pi16( a, 0x39 ) +#define mm64_shufll_16( a ) _mm_shuffle_pi16( a, 0x93 ) // Swap hi & lo 16 bits of each 32 bit element -#define mm64_swap32_16( a ) _mm_shuffle_pi16( a, 0xb1 ) +#define mm64_swap32_16( a ) _mm_shuffle_pi16( a, 0xb1 ) #if defined(__SSSE3__) @@ -86,7 +86,7 @@ _mm_shuffle_pi8( v, (__m64)0x0607040502030001 ); // Rotate right by c bytes -static inline __m64 mm64_ror_x8( __m64 v, const int c ) +static inline __m64 mm64_vror_x8( __m64 v, const int c ) { return _mm_alignr_pi8( v, v, c ); } #else diff --git a/simd-utils/simd-int.h b/simd-utils/simd-int.h index 4a7188e5..601c7508 100644 --- a/simd-utils/simd-int.h +++ b/simd-utils/simd-int.h @@ -5,10 +5,19 @@ #define bswap_64( a ) __builtin_bswap64( a ) #define bswap_32( a ) __builtin_bswap32( a ) -// safe division, integer or floating point +// Safe division, integer or floating point. For floating point it's as +// safe as 0. is precisely zero. +// Returns safe_result if division by zero. #define safe_div( dividend, divisor, safe_result ) \ ( (divisor) == 0 ? 
safe_result : ( (dividend) / (divisor) ) ) +// Aliases with familiar names for built in bit rotate instructions +#define rol64( a, n ) _lrotl( a, n ) +#define ror64( a, n ) _lrotr( a, n ) +#define rol32( a, n ) _rotl( a, n ) +#define ror32( a, n ) _rotr( a, n ) +#define rol16( a, n ) _rotwl( a, n ) +#define ror16( a, n ) _rotwr( a, n ) /////////////////////////////////////// // @@ -29,12 +38,14 @@ // __m256i v256 = _mm256_set_m128i( (__m128i)my_int128, (__m128i)my_int128 ); // my_int128 = (uint128_t)_mm256_extracti128_si256( v256, 1 ); +// obsolete test // Compiler check for __int128 support // Configure also has a test for int128. #if ( __GNUC__ > 4 ) || ( ( __GNUC__ == 4 ) && ( __GNUC_MINOR__ >= 8 ) ) #define GCC_INT128 1 #endif +// obsolte test #if !defined(GCC_INT128) #warning "__int128 not supported, requires GCC-4.8 or newer." #endif diff --git a/sysinfos.c b/sysinfos.c index 010c78f4..ed453e2f 100644 --- a/sysinfos.c +++ b/sysinfos.c @@ -218,7 +218,7 @@ static inline void cpu_getname(char *outbuf, size_t maxsz) for (int i = 2; i <= (ext & 0xF); i++) { cpuid(0x80000000+i, output); - memcpy(&brand[(i-2) * 4*sizeof(int)], output, 4*sizeof(int)); + memcpy(&brand[(i-2) * 4*sizeof(int)], output, 4*sizeof(int)); } snprintf(outbuf, maxsz, "%s", brand); } diff --git a/util.c b/util.c index b96c4fe0..31b92703 100644 --- a/util.c +++ b/util.c @@ -47,6 +47,7 @@ //#include "miner.h" #include "elist.h" #include "algo-gate-api.h" +#include "algo/sha/sha256d.h" //extern pthread_mutex_t stats_lock; @@ -129,17 +130,19 @@ void applog2( int prio, const char *fmt, ... ) // localtime_r(&now, &tm); - switch (prio) { + switch ( prio ) + { + case LOG_CRIT: color = CL_LRD; break; case LOG_ERR: color = CL_RED; break; - case LOG_WARNING: color = CL_YLW; break; + case LOG_WARNING: color = CL_YL2; break; + case LOG_MAJR: color = CL_YL2; break; case LOG_NOTICE: color = CL_WHT; break; case LOG_INFO: color = ""; break; case LOG_DEBUG: color = CL_GRY; break; - - case LOG_BLUE: - prio = LOG_NOTICE; - color = CL_CYN; - break; + case LOG_MINR: color = CL_YLW; break; + case LOG_GREEN: color = CL_GRN; prio = LOG_INFO; break; + case LOG_BLUE: color = CL_CYN; prio = LOG_NOTICE; break; + case LOG_PINK: color = CL_LMA; prio = LOG_NOTICE; break; } if (!use_colors) color = ""; @@ -206,17 +209,19 @@ void applog(int prio, const char *fmt, ...) 
localtime_r(&now, &tm); - switch (prio) { - case LOG_ERR: color = CL_RED; break; - case LOG_WARNING: color = CL_YLW; break; + switch ( prio ) + { + case LOG_CRIT: color = CL_LRD; break; + case LOG_ERR: color = CL_RED; break; + case LOG_WARNING: color = CL_YL2; break; + case LOG_MAJR: color = CL_YL2; break; case LOG_NOTICE: color = CL_WHT; break; - case LOG_INFO: color = ""; break; + case LOG_INFO: color = ""; break; case LOG_DEBUG: color = CL_GRY; break; - - case LOG_BLUE: - prio = LOG_NOTICE; - color = CL_CYN; - break; + case LOG_MINR: color = CL_YLW; break; + case LOG_GREEN: color = CL_GRN; prio = LOG_INFO; break; + case LOG_BLUE: color = CL_CYN; prio = LOG_NOTICE; break; + case LOG_PINK: color = CL_LMA; prio = LOG_NOTICE; break; } if (!use_colors) color = ""; @@ -303,6 +308,29 @@ void format_hashrate(double hashrate, char *output) ); } +// For use with MiB etc +void format_number_si( double* n, char* si_units ) +{ + if ( *n < 1024*10 ) { *si_units = 0; return; } + *n /= 1024; + if ( *n < 1024*10 ) { *si_units = 'k'; return; } + *n /= 1024; + if ( *n < 1024*10 ) { *si_units = 'M'; return; } + *n /= 1024; + if ( *n < 1024*10 ) { *si_units = 'G'; return; } + *n /= 1024; + if ( *n < 1024*10 ) { *si_units = 'T'; return; } + *n /= 1024; + if ( *n < 1024*10 ) { *si_units = 'P'; return; } + *n /= 1024; + if ( *n < 1024*10 ) { *si_units = 'E'; return; } + *n /= 1024; + if ( *n < 1024*10 ) { *si_units = 'Z'; return; } + *n /= 1024; + *si_units = 'Y'; +} + + /* Modify the representation of integer numbers which would cause an overflow * so that they are treated as floating-point numbers. * This is a hack to overcome the limitations of some versions of Jansson. */ From 47cc5dcff519d0be1e206bfdc52121a44d345e98 Mon Sep 17 00:00:00 2001 From: Jay D Dee Date: Sun, 10 Oct 2021 22:50:19 -0400 Subject: [PATCH 12/20] v3.18.1 --- INSTALL_LINUX | 22 +- RELEASE_NOTES | 16 +- algo/scrypt/scrypt-core-4way.c | 2886 ++++++++++++-------------------- algo/scrypt/scrypt.c | 456 +++-- algo/sha/sha-hash-4way.h | 3 - algo/sha/sha256-hash-4way.c | 881 ++-------- algo/shabal/shabal-hash-4way.c | 13 +- algo/swifftx/inttypes.h | 8 +- algo/swifftx/swifftx.c | 409 ++++- configure | 20 +- configure.ac | 2 +- cpu-miner.c | 28 +- simd-utils/simd-256.h | 48 +- simd-utils/simd-512.h | 4 - 14 files changed, 2013 insertions(+), 2783 deletions(-) diff --git a/INSTALL_LINUX b/INSTALL_LINUX index a88f888c..24927b46 100644 --- a/INSTALL_LINUX +++ b/INSTALL_LINUX @@ -32,14 +32,26 @@ but different package names. $ sudo apt-get install build-essential automake libssl-dev libcurl4-openssl-dev libjansson-dev libgmp-dev zlib1g-dev git SHA support on AMD Ryzen CPUs requires gcc version 5 or higher and -openssl 1.1.0e or higher. Add one of the following to CFLAGS for SHA -support depending on your CPU and compiler version: +openssl 1.1.0e or higher. -"-march=native" is always the best choice +znver1 and znver2 should be recognized on most recent version of GCC and +znver3 is expected with GCC 11. GCC 11 also includes rocketlake support. +In the meantime here are some suggestions to compile with new CPUs: -"-march=znver1" for Ryzen 1000 & 2000 series, znver2 for 3000. +"-march=native" is usually the best choice, used by build.sh. -"-msha" Add SHA to other tuning options +"-march=znver2 -mvaes" can be used for Ryzen 5000 if znver3 is not recongized. + +"-mcascadelake -msha" or +"-mcometlake -mavx512 -msha" can be used for Rocket Lake. + +Features can also be added individually: + +"-msha" adds support for HW accelerated sha256. 
+ +"-mavx512" adds support for 512 bit vectors + +"-mvaes" add support for parallel AES Additional instructions for static compilalation can be found here: https://lxadm.com/Static_compilation_of_cpuminer diff --git a/RELEASE_NOTES b/RELEASE_NOTES index 056491f7..ef3f912f 100644 --- a/RELEASE_NOTES +++ b/RELEASE_NOTES @@ -65,10 +65,24 @@ If not what makes it happen or not happen? Change Log ---------- +v3.18.1 + +More speed for scrypt: + - additional scryptn2 optimizations for all CPU architectures, + - AVX2 is now used by default on CPUS with SHA but not AVX512, + - scrypt:1024 performance lost in v3.18.0 is restored, + - AVX512 & AVX2 improvements to scrypt:1024. + +Big speedup for SwiFFTx AVX2 & SSE4.1: x22i +55%, x25x +22%. + +Issue #337: fixed a problem that could display negative stats values in the +first summary report if the report was forced prematurely due to a stratum +diff change. The stats will still be invalid but should display zeros. + v3.18.0 Complete rewrite of Scrypt code, optimized for large N factor (scryptn2): - - AVX512 & SHA support for SHA256, AVX512 has priority, + - AVX512 & SHA support for sha256, AVX512 has priority, - up to 50% increase in hashrate, - memory requirements reduced 30-60% depending on CPU architecture, - memory usage displayed at startup, diff --git a/algo/scrypt/scrypt-core-4way.c b/algo/scrypt/scrypt-core-4way.c index 19ff9cdd..1039c3fc 100644 --- a/algo/scrypt/scrypt-core-4way.c +++ b/algo/scrypt/scrypt-core-4way.c @@ -116,23 +116,6 @@ do{ \ c1 = XOR( c1, tc ); \ } while (0); -// use 16 regs AVX, AVX2, 8 buf for AVX512? -#define ARX_4BUF( a1, a2, a3, b1, b2, b3, c1, c2, c3, d1, d2, d3, n ) \ -do{ \ - TYPE ta = ADD32( a2, a3 ); \ - TYPE tb = ADD32( b2, b3 ); \ - TYPE tc = ADD32( c2, c3 ); \ - TYPE td = ADD32( d2, d3 ); \ - ta = ROL32( ta, n ); \ - tb = ROL32( tb, n ); \ - tc = ROL32( tc, n ); \ - td = ROL32( td, n ); \ - a1 = XOR( a1, ta ); \ - b1 = XOR( b1, tb ); \ - c1 = XOR( c1, tc ); \ - d1 = XOR( d1, td ); \ -} while (0); - // Used by SIMD128 and hybrid targets, needs also ROL_1X32, SWAP_64 & // ROR_1X32 defined. @@ -208,95 +191,127 @@ do{ \ XA2 = SWAP_64( XA2 ); \ XB2 = SWAP_64( XB2 ); -#define SALSA_2ROUNDS_FINAL_SIMD128_2BUF \ - ARX_2BUF( XA1, XA0, XA3, XB1, XB0, XB3, 7 ); \ - ARX_2BUF( XA2, XA1, XA0, XB2, XB1, XB0, 9 ); \ - ARX_2BUF( XA3, XA2, XA1, XB3, XB2, XB1, 13 ); \ - ARX_2BUF( XA0, XA3, XA2, XB0, XB3, XB2, 18 ); \ +// For use when fast bit rotate is not available. +// contains target specif instructions, only use with 128 bit vectrors. 
+#define SALSA_2ROUNDS_SIMD128_2BUF_SLOROT \ +do{ \ + TYPE TA = ADD32( XA0, XA3 ); \ + TYPE TB = ADD32( XB0, XB3 ); \ + TYPE T = _mm_slli_epi32( TA, 7 ); \ + TA = _mm_srli_epi32( TA, 25 ); \ + XA1 = XOR( XA1, T ); \ + XA1 = XOR( XA1, TA ); \ + T = _mm_slli_epi32( TB, 7 );\ + TB = _mm_srli_epi32( TB, 25 ); \ + XB1 = XOR( XB1, T ); \ + XB1 = XOR( XB1, TB ); \ +\ + TA = ADD32( XA1, XA0 ); \ + TB = ADD32( XB1, XB0 ); \ + T = _mm_slli_epi32( TA, 9 ); \ + TA = _mm_srli_epi32( TA, 23 ); \ + XA2 = XOR( XA2, T ); \ + XA2 = XOR( XA2, TA ); \ + T = _mm_slli_epi32( TB, 9 );\ + TB = _mm_srli_epi32( TB, 23 );\ + XB2 = XOR( XB2, T ); \ + XB2 = XOR( XB2, TB ); \ +\ + TA = ADD32( XA2, XA1 ); \ + TB = ADD32( XB2, XB1 ); \ + T = _mm_slli_epi32( TA, 13); \ + TA = _mm_srli_epi32( TA, 19 ); \ XA1 = ROL_1X32( XA1 ); \ XB1 = ROL_1X32( XB1 ); \ - XA3 = ROR_1X32( XA3 ); \ - XB3 = ROR_1X32( XB3 ); \ + XA3 = XOR( XA3, T ); \ + XA3 = XOR( XA3, TA ); \ + T = _mm_slli_epi32( TB, 13); \ + TB = _mm_srli_epi32( TB, 19 ); \ + XB3 = XOR( XB3, T ); \ + XB3 = XOR( XB3, TB ); \ +\ + TA = ADD32( XA3, XA2 ); \ + TB = ADD32( XB3, XB2 ); \ + T = _mm_slli_epi32( TA, 18 ); \ + TA = _mm_srli_epi32( TA, 14 ); \ XA2 = SWAP_64( XA2 ); \ XB2 = SWAP_64( XB2 ); \ - ARX_2BUF( XA3, XA0, XA1, XB3, XB0, XB1, 7 ); \ - ARX_2BUF( XA2, XA3, XA0, XB2, XB3, XB0, 9 ); \ - ARX_2BUF( XA1, XA2, XA3, XB1, XB2, XB3, 13 ); \ - ARX_2BUF( XA0, XA1, XA2, XB0, XB1, XB2, 18 ); - -// Implied args ( XA0, XA1, XA2, XA3, XB0, XB1, XB2, XB3, -// XC0, XC1, XC2, XC3, XD0, XD1, XD2, XD3 ) -#define SALSA_2ROUNDS_SIMD128_4BUF \ - ARX_4BUF( XA1, XA0, XA3, XB1, XB0, XB3, \ - XC1, XC0, XC3, XD1, XD0, XD3, 7 ); \ - ARX_4BUF( XA2, XA1, XA0, XB2, XB1, XB0, \ - XC2, XC1, XC0, XD2, XD1, XD0, 9 ); \ - ARX_4BUF( XA3, XA2, XA1, XB3, XB2, XB1, \ - XC3, XC2, XC1, XD3, XD2, XD1, 13 ); \ - ARX_4BUF( XA0, XA3, XA2, XB0, XB3, XB2, \ - XC0, XC3, XC2, XD0, XD3, XD2, 18 ); \ - XA1 = ROL_1X32( XA1 ); \ - XB1 = ROL_1X32( XB1 ); \ - XC1 = ROL_1X32( XC1 ); \ - XD1 = ROL_1X32( XD1 ); \ + XA0 = XOR( XA0, T ); \ + XA0 = XOR( XA0, TA ); \ + T = _mm_slli_epi32( TB, 18 ); \ + TB = _mm_srli_epi32( TB, 14 ); \ + XB0 = XOR( XB0, T ); \ + XB0 = XOR( XB0, TB ); \ +\ + TA = ADD32( XA0, XA1 ); \ + TB = ADD32( XB0, XB1 ); \ + T = _mm_slli_epi32( TA, 7 ); \ + TA = _mm_srli_epi32( TA, 25 ); \ XA3 = ROR_1X32( XA3 ); \ + XA3 = XOR( XA3, T ); \ + XA3 = XOR( XA3, TA ); \ + T = _mm_slli_epi32( TB, 7 ); \ + TB = _mm_srli_epi32( TB, 25 ); \ XB3 = ROR_1X32( XB3 ); \ - XC3 = ROR_1X32( XC3 ); \ - XD3 = ROR_1X32( XD3 ); \ - XA2 = SWAP_64( XA2 ); \ - XB2 = SWAP_64( XB2 ); \ - XC2 = SWAP_64( XC2 ); \ - XD2 = SWAP_64( XD2 ); \ - ARX_4BUF( XA3, XA0, XA1, XB3, XB0, XB1, \ - XC3, XC0, XC1, XD3, XD0, XD1, 7 ); \ - ARX_4BUF( XA2, XA3, XA0, XB2, XB3, XB0, \ - XC2, XC3, XC0, XD2, XD3, XD0, 9 ); \ - ARX_4BUF( XA1, XA2, XA3, XB1, XB2, XB3, \ - XC1, XC2, XC3, XD1, XD2, XD3, 13 ); \ - ARX_4BUF( XA0, XA1, XA2, XB0, XB1, XB2, \ - XC0, XC1, XC2, XD0, XD1, XD2, 18 ); \ + XB3 = XOR( XB3, T ); \ + XB3 = XOR( XB3, TB ); \ +\ + TA = ADD32( XA3, XA0 ); \ + TB = ADD32( XB3, XB0 ); \ + T = _mm_slli_epi32( TA, 9 ); \ + TA = _mm_srli_epi32( TA, 23 ); \ + XA2 = XOR( XA2, T ); \ + XA2 = XOR( XA2, TA ); \ + T = _mm_slli_epi32( TB, 9 ); \ + TB = _mm_srli_epi32( TB, 23 ); \ + XB2 = XOR( XB2, T ); \ + XB2 = XOR( XB2, TB ); \ +\ + TA = ADD32( XA2, XA3 ); \ + TB = ADD32( XB2, XB3 ); \ + T = _mm_slli_epi32( TA, 13 ); \ + TA = _mm_srli_epi32( TA, 19 ); \ XA3 = ROL_1X32( XA3 ); \ XB3 = ROL_1X32( XB3 ); \ - XC3 = ROL_1X32( XC3 ); \ - XD3 = ROL_1X32( XD3 ); \ - 
XA1 = ROR_1X32( XA1 ); \ - XB1 = ROR_1X32( XB1 ); \ - XC1 = ROR_1X32( XC1 ); \ - XD1 = ROR_1X32( XD1 ); \ + XA1 = XOR( XA1, T ); \ + XA1 = XOR( XA1, TA ); \ + T = _mm_slli_epi32( TB, 13 ); \ + TB = _mm_srli_epi32( TB, 19 ); \ + XB1 = XOR( XB1, T ); \ + XB1 = XOR( XB1, TB ); \ +\ + TA = ADD32( XA1, XA2 ); \ + TB = ADD32( XB1, XB2 ); \ + T = _mm_slli_epi32( TA, 18 ); \ + TA = _mm_srli_epi32( TA, 14 ); \ XA2 = SWAP_64( XA2 ); \ XB2 = SWAP_64( XB2 ); \ - XC2 = SWAP_64( XC2 ); \ - XD2 = SWAP_64( XD2 ); - -#define SALSA_2ROUNDS_FINAL_SIMD128_4BUF \ - ARX_4BUF( XA1, XA0, XA3, XB1, XB0, XB3, \ - XC1, XC0, XC3, XD1, XD0, XD3, 7 ); \ - ARX_4BUF( XA2, XA1, XA0, XB2, XB1, XB0, \ - XC2, XC1, XC0, XD2, XD1, XD0, 9 ); \ - ARX_4BUF( XA3, XA2, XA1, XB3, XB2, XB1, \ - XC3, XC2, XC1, XD3, XD2, XD1, 13 ); \ - ARX_4BUF( XA0, XA3, XA2, XB0, XB3, XB2, \ - XC0, XC3, XC2, XD0, XD3, XD2, 18 ); \ + XA0 = XOR( XA0, T ); \ + XA0 = XOR( XA0, TA ); \ + T = _mm_slli_epi32( TB, 18 ); \ + TB = _mm_srli_epi32( TB, 14 ); \ + XA1 = ROR_1X32( XA1 ); \ + XB0 = XOR( XB0, T ); \ + XB0 = XOR( XB0, TB ); \ + XB1 = ROR_1X32( XB1 ); \ +} while (0); + +#define SALSA_2ROUNDS_FINAL_SIMD128_2BUF \ + ARX_2BUF( XA1, XA0, XA3, XB1, XB0, XB3, 7 ); \ + ARX_2BUF( XA2, XA1, XA0, XB2, XB1, XB0, 9 ); \ + ARX_2BUF( XA3, XA2, XA1, XB3, XB2, XB1, 13 ); \ + ARX_2BUF( XA0, XA3, XA2, XB0, XB3, XB2, 18 ); \ XA1 = ROL_1X32( XA1 ); \ XB1 = ROL_1X32( XB1 ); \ - XC1 = ROL_1X32( XC1 ); \ - XD1 = ROL_1X32( XD1 ); \ XA3 = ROR_1X32( XA3 ); \ XB3 = ROR_1X32( XB3 ); \ - XC3 = ROR_1X32( XC3 ); \ - XD3 = ROR_1X32( XD3 ); \ XA2 = SWAP_64( XA2 ); \ XB2 = SWAP_64( XB2 ); \ - XC2 = SWAP_64( XC2 ); \ - XD2 = SWAP_64( XD2 ); \ - ARX_4BUF( XA3, XA0, XA1, XB3, XB0, XB1, \ - XC3, XC0, XC1, XD3, XD0, XD1, 7 ); \ - ARX_4BUF( XA2, XA3, XA0, XB2, XB3, XB0, \ - XC2, XC3, XC0, XD2, XD3, XD0, 9 ); \ - ARX_4BUF( XA1, XA2, XA3, XB1, XB2, XB3, \ - XC1, XC2, XC3, XD1, XD2, XD3, 13 ); \ - ARX_4BUF( XA0, XA1, XA2, XB0, XB1, XB2, \ - XC0, XC1, XC2, XD0, XD1, XD2, 18 ); + ARX_2BUF( XA3, XA0, XA1, XB3, XB0, XB1, 7 ); \ + ARX_2BUF( XA2, XA3, XA0, XB2, XB3, XB0, 9 ); \ + ARX_2BUF( XA1, XA2, XA3, XB1, XB2, XB3, 13 ); \ + ARX_2BUF( XA0, XA1, XA2, XB0, XB1, XB2, 18 ); + // Inlined ARX #define SALSA_2ROUNDS_SIMD128_3BUF \ @@ -402,7 +417,8 @@ do{ \ // slow rol, an attempt to optimze non-avx512 bit rotations -#define SALSA_2ROUNDS_SIMD128_3BUF_SLOROL \ +// Contains target specific instructions, only for use with 128 bit vectors +#define SALSA_2ROUNDS_SIMD128_3BUF_SLOROT \ do{ \ TYPE TA = ADD32( XA0, XA3 ); \ TYPE TB = ADD32( XB0, XB3 ); \ @@ -410,14 +426,14 @@ do{ \ TYPE T = _mm_slli_epi32( TA, 7 ); \ TA = _mm_srli_epi32( TA, 25 ); \ XA1 = XOR( XA1, T ); \ - T = _mm_slli_epi32( TB, 7 );\ XA1 = XOR( XA1, TA ); \ + T = _mm_slli_epi32( TB, 7 );\ TB = _mm_srli_epi32( TB, 25 ); \ XB1 = XOR( XB1, T ); \ - T = _mm_slli_epi32( TC, 7 );\ XB1 = XOR( XB1, TB ); \ - XC1 = XOR( XC1, T ); \ + T = _mm_slli_epi32( TC, 7 );\ TC = _mm_srli_epi32( TC, 25 );\ + XC1 = XOR( XC1, T ); \ XC1 = XOR( XC1, TC ); \ \ TA = ADD32( XA1, XA0 ); \ @@ -426,14 +442,14 @@ do{ \ T = _mm_slli_epi32( TA, 9 ); \ TA = _mm_srli_epi32( TA, 23 ); \ XA2 = XOR( XA2, T ); \ + XA2 = XOR( XA2, TA ); \ T = _mm_slli_epi32( TB, 9 );\ TB = _mm_srli_epi32( TB, 23 );\ - XA2 = XOR( XA2, TA ); \ XB2 = XOR( XB2, T ); \ - T = _mm_slli_epi32( TC, 9 );\ XB2 = XOR( XB2, TB ); \ - XC2 = XOR( XC2, T ); \ + T = _mm_slli_epi32( TC, 9 );\ TC = _mm_srli_epi32( TC, 23 );\ + XC2 = XOR( XC2, T ); \ XC2 = XOR( XC2, TC ); \ \ TA = ADD32( XA2, XA1 ); \ @@ -442,17 
+458,17 @@ do{ \ T = _mm_slli_epi32( TA, 13); \ TA = _mm_srli_epi32( TA, 19 ); \ XA1 = ROL_1X32( XA1 ); \ - XA3 = XOR( XA3, T ); \ XB1 = ROL_1X32( XB1 ); \ + XC1 = ROL_1X32( XC1 ); \ + XA3 = XOR( XA3, T ); \ + XA3 = XOR( XA3, TA ); \ T = _mm_slli_epi32( TB, 13); \ TB = _mm_srli_epi32( TB, 19 ); \ - XA3 = XOR( XA3, TA ); \ XB3 = XOR( XB3, T ); \ + XB3 = XOR( XB3, TB ); \ T = _mm_slli_epi32( TC, 13); \ TC = _mm_srli_epi32( TC, 19 ); \ - XB3 = XOR( XB3, TB ); \ XC3 = XOR( XC3, T ); \ - XC1 = ROL_1X32( XC1 ); \ XC3 = XOR( XC3, TC ); \ \ TA = ADD32( XA3, XA2 ); \ @@ -461,70 +477,94 @@ do{ \ T = _mm_slli_epi32( TA, 18 ); \ TA = _mm_srli_epi32( TA, 14 ); \ XA2 = SWAP_64( XA2 ); \ + XB2 = SWAP_64( XB2 ); \ + XC2 = SWAP_64( XC2 ); \ XA0 = XOR( XA0, T ); \ + XA0 = XOR( XA0, TA ); \ T = _mm_slli_epi32( TB, 18 ); \ - XB2 = SWAP_64( XB2 ); \ TB = _mm_srli_epi32( TB, 14 ); \ XB0 = XOR( XB0, T ); \ + XB0 = XOR( XB0, TB ); \ T = _mm_slli_epi32( TC, 18 ); \ - XA0 = XOR( XA0, TA ); \ TC = _mm_srli_epi32( TC, 14 ); \ XC0 = XOR( XC0, T ); \ - XB0 = XOR( XB0, TB ); \ - XC2 = SWAP_64( XC2 ); \ XC0 = XOR( XC0, TC ); \ \ TA = ADD32( XA0, XA1 ); \ TB = ADD32( XB0, XB1 ); \ TC = ADD32( XC0, XC1 ); \ - TA = ROL32( TA, 7 ); \ + T = _mm_slli_epi32( TA, 7 ); \ + TA = _mm_srli_epi32( TA, 25 ); \ XA3 = ROR_1X32( XA3 ); \ + XA3 = XOR( XA3, T ); \ XA3 = XOR( XA3, TA ); \ - TB = ROL32( TB, 7 ); \ + T = _mm_slli_epi32( TB, 7 ); \ + TB = _mm_srli_epi32( TB, 25 ); \ XB3 = ROR_1X32( XB3 ); \ + XB3 = XOR( XB3, T ); \ XB3 = XOR( XB3, TB ); \ - TC = ROL32( TC, 7 ); \ + T = _mm_slli_epi32( TC, 7 ); \ + TC = _mm_srli_epi32( TC, 25 ); \ XC3 = ROR_1X32( XC3 ); \ + XC3 = XOR( XC3, T ); \ XC3 = XOR( XC3, TC ); \ \ TA = ADD32( XA3, XA0 ); \ TB = ADD32( XB3, XB0 ); \ TC = ADD32( XC3, XC0 ); \ - TA = ROL32( TA, 9 ); \ - TB = ROL32( TB, 9 ); \ - TC = ROL32( TC, 9 ); \ + T = _mm_slli_epi32( TA, 9 ); \ + TA = _mm_srli_epi32( TA, 23 ); \ + XA2 = XOR( XA2, T ); \ XA2 = XOR( XA2, TA ); \ + T = _mm_slli_epi32( TB, 9 ); \ + TB = _mm_srli_epi32( TB, 23 ); \ + XB2 = XOR( XB2, T ); \ XB2 = XOR( XB2, TB ); \ + T = _mm_slli_epi32( TC, 9 ); \ + TC = _mm_srli_epi32( TC, 23 ); \ + XC2 = XOR( XC2, T ); \ XC2 = XOR( XC2, TC ); \ \ TA = ADD32( XA2, XA3 ); \ TB = ADD32( XB2, XB3 ); \ - TA = ROL32( TA, 13 ); \ TC = ADD32( XC2, XC3 ); \ + T = _mm_slli_epi32( TA, 13 ); \ + TA = _mm_srli_epi32( TA, 19 ); \ XA3 = ROL_1X32( XA3 ); \ - TB = ROL32( TB, 13 ); \ XB3 = ROL_1X32( XB3 ); \ - XA1 = XOR( XA1, TA ); \ - TC = ROL32( TC, 13 ); \ XC3 = ROL_1X32( XC3 ); \ + XA1 = XOR( XA1, T ); \ + XA1 = XOR( XA1, TA ); \ + T = _mm_slli_epi32( TB, 13 ); \ + TB = _mm_srli_epi32( TB, 19 ); \ + XB1 = XOR( XB1, T ); \ XB1 = XOR( XB1, TB ); \ + T = _mm_slli_epi32( TC, 13 ); \ + TC = _mm_srli_epi32( TC, 19 ); \ + XC1 = XOR( XC1, T ); \ XC1 = XOR( XC1, TC ); \ \ TA = ADD32( XA1, XA2 ); \ TB = ADD32( XB1, XB2 ); \ - TA = ROL32( TA, 18); \ TC = ADD32( XC1, XC2 ); \ + T = _mm_slli_epi32( TA, 18 ); \ + TA = _mm_srli_epi32( TA, 14 ); \ XA2 = SWAP_64( XA2 ); \ - TB = ROL32( TB, 18); \ - XA0 = XOR( XA0, TA ); \ XB2 = SWAP_64( XB2 ); \ - TC = ROL32( TC, 18); \ - XB0 = XOR( XB0, TB ); \ + XA0 = XOR( XA0, T ); \ + XA0 = XOR( XA0, TA ); \ + T = _mm_slli_epi32( TB, 18 ); \ + TB = _mm_srli_epi32( TB, 14 ); \ XC2 = SWAP_64( XC2 ); \ XA1 = ROR_1X32( XA1 ); \ + XB0 = XOR( XB0, T ); \ + XB0 = XOR( XB0, TB ); \ + T = _mm_slli_epi32( TC, 18 ); \ + TC = _mm_srli_epi32( TC, 14 ); \ XB1 = ROR_1X32( XB1 ); \ - XC0 = XOR( XC0, TC ); \ XC1 = ROR_1X32( XC1 ); \ + XC0 = XOR( XC0, T ); \ + XC0 = XOR( XC0, TC ); \ 
} while (0); @@ -614,6 +654,12 @@ do{ \ SALSA_2ROUNDS_SIMD128_2BUF; \ SALSA_2ROUNDS_SIMD128_2BUF; +#define SALSA_8ROUNDS_SIMD128_2BUF_SLOROT \ + SALSA_2ROUNDS_SIMD128_2BUF_SLOROT; \ + SALSA_2ROUNDS_SIMD128_2BUF_SLOROT; \ + SALSA_2ROUNDS_SIMD128_2BUF_SLOROT; \ + SALSA_2ROUNDS_SIMD128_2BUF_SLOROT; + #define SALSA_8ROUNDS_FINAL_SIMD128_2BUF \ SALSA_2ROUNDS_SIMD128_2BUF; \ SALSA_2ROUNDS_SIMD128_2BUF; \ @@ -626,6 +672,12 @@ do{ \ SALSA_2ROUNDS_SIMD128_3BUF; \ SALSA_2ROUNDS_SIMD128_3BUF; +#define SALSA_8ROUNDS_SIMD128_3BUF_SLOROT \ + SALSA_2ROUNDS_SIMD128_3BUF_SLOROT; \ + SALSA_2ROUNDS_SIMD128_3BUF_SLOROT; \ + SALSA_2ROUNDS_SIMD128_3BUF_SLOROT; \ + SALSA_2ROUNDS_SIMD128_3BUF_SLOROT; + #define SALSA_8ROUNDS_FINAL_SIMD128_3BUF \ SALSA_2ROUNDS_SIMD128_3BUF; \ SALSA_2ROUNDS_SIMD128_3BUF; \ @@ -746,13 +798,13 @@ static void xor_salsa8_16way( __m512i * const B, const __m512i * const C) void scrypt_core_16way( __m512i *X, __m512i *V, const uint32_t N ) { - for ( int i = 0; i < N; i++ ) + for ( int n = 0; n < N; n++ ) { - memcpy( &V[i * 32], X, 128*16 ); + memcpy( &V[n * 32], X, 128*16 ); xor_salsa8_16way( &X[ 0], &X[16] ); xor_salsa8_16way( &X[16], &X[ 0] ); } - for ( int i = 0; i < N; i++ ) + for ( int n = 0; n < N; n++ ) { m512_ovly *vptr[16]; // pointer to V offset for each lane m512_ovly *x16 = (m512_ovly*)(&X[16]); @@ -765,12 +817,12 @@ void scrypt_core_16way( __m512i *X, __m512i *V, const uint32_t N ) vptr[l] = (m512_ovly*)( &V[ 32 * ( xl & ( N-1 ) ) ] ); } - for ( int k = 0; k < 32; k++ ) + for ( int i = 0; i < 32; i++ ) { m512_ovly v; // V value assembled from different indexes for ( int l = 0; l < 8; l++ ) - v.u32[l] = ( *(vptr[ l ] + k ) ) .u32[l]; - X[ k ] = _mm512_xor_si512( X[ k ], v.m512 ); + v.u32[l] = ( *(vptr[l] +i ) ) .u32[l]; + X[i] = _mm512_xor_si512( X[i], v.m512 ); } xor_salsa8_16way( &X[ 0], &X[16] ); @@ -852,14 +904,14 @@ static void salsa8_simd128_4way( __m128i *b, const __m128i *c ) void scrypt_core_simd128_4way( __m128i *X, __m128i *V, const uint32_t N ) { - for ( int i = 0; i < N; i++ ) + for ( int n = 0; n < N; n++ ) { - memcpy( &V[i * 32], X, 4*128 ); + memcpy( &V[n * 32], X, 4*128 ); salsa8_simd128_4way( &X[ 0], &X[16] ); salsa8_simd128_4way( &X[16], &X[ 0] ); } - for ( int i = 0; i < N; i++ ) + for ( int n = 0; n < N; n++ ) { uint32_t x16[4]; // index into V for each lane memcpy( x16, &X[16], 16 ); @@ -869,12 +921,12 @@ void scrypt_core_simd128_4way( __m128i *X, __m128i *V, const uint32_t N ) x16[3] = 32 * ( x16[3] & ( N-1) ); m128_ovly *v = (m128_ovly*)V; - for( int k = 0; k < 32; k++ ) + for( int i = 0; i < 32; i++ ) { - X[k] = _mm_xor_si128( X[k], _mm_set_epi32( v[ x16[3] + k ].u32[3], - v[ x16[2] + k ].u32[2], - v[ x16[1] + k ].u32[1], - v[ x16[0] + k ].u32[0] ) ); + X[i] = _mm_xor_si128( X[i], _mm_set_epi32( v[ x16[3] + i ].u32[3], + v[ x16[2] + i ].u32[2], + v[ x16[1] + i ].u32[1], + v[ x16[0] + i ].u32[0] ) ); } salsa8_simd128_4way( &X[ 0], &X[16] ); @@ -882,49 +934,60 @@ void scrypt_core_simd128_4way( __m128i *X, __m128i *V, const uint32_t N ) } } -// not working, occasional accepted shares, not up to date. 
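The new *_SLOROT (presumably "slow rotate") macro variants above replace each single ROL32 with an explicit shift-left/shift-right pair and XOR both shifted halves into the destination word, the usual fallback where the target has no native 32-bit vector rotate (plain SSE2/AVX2; AVX-512VL appears to supply one for the non-SLOROT path). A minimal stand-alone sketch of the same shift-pair emulation, using an assumed helper name rather than anything defined in this patch:

/* Illustration only, not part of the patch: classic shift-pair emulation
 * of a 32-bit vector rotate. The _SLOROT macros above go one step further
 * and XOR each shifted half into the destination separately, skipping the
 * OR, because the destination is XOR-ed with the rotated value anyway. */
#include <emmintrin.h>   /* SSE2 */
#include <stdint.h>
#include <stdio.h>

#define ROL32_SSE2( x, c ) \
   _mm_or_si128( _mm_slli_epi32( x, c ), _mm_srli_epi32( x, 32-(c) ) )

int main(void)
{
   uint32_t out[4];
   const __m128i v = _mm_set1_epi32( 0x80000001 );
   _mm_storeu_si128( (__m128i*)out, ROL32_SSE2( v, 7 ) );
   printf( "%08x\n", out[0] );      /* prints 000000c0 */
   return 0;
}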
+// 4x memory usage +// Working // 4x128 interleaving -static void salsa8_4way_simd128( __m512i * const B, const __m512i * const C) +static void salsa_shuffle_4way_simd128( __m512i *X ) { - __m512i X0, X1, X2, X3; - uint32_t *b = (uint32_t*)B; - m512_ovly y[4], z[4]; - - // mix C into B then shuffle B into X - B[0] = _mm512_xor_si512( B[0], C[0] ); - B[1] = _mm512_xor_si512( B[1], C[1] ); - B[2] = _mm512_xor_si512( B[2], C[2] ); - B[3] = _mm512_xor_si512( B[3], C[3] ); + __m512i Y0, Y1, Y2, Y3, Z0, Z1, Z2, Z3; - // { l3u15, l3u10, l3u5, l3u0, l2u15, l2u10, l2u5, l2u0, - // l1u15, l1u10, l1u5, l1u0, l0u15, l0u10, l0u5, l0u0 } + Y0 = _mm512_mask_blend_epi32( 0x1111, X[1], X[0] ); + Z0 = _mm512_mask_blend_epi32( 0x4444, X[3], X[2] ); - // b index = row index + lane index + unit index - // = ( 8 * (u/4) ) + ( 4*l ) + ( u%4 ) + Y1 = _mm512_mask_blend_epi32( 0x1111, X[2], X[1] ); + Z1 = _mm512_mask_blend_epi32( 0x4444, X[0], X[3] ); - X0 = _mm512_set_epi32( b[63], b[46], b[29], b[12], // lane 3[3:0] - b[59], b[42], b[25], b[ 8], // lane 2[3:0] - b[55], b[38], b[21], b[ 4], // lane 1[3:0] - b[51], b[34], b[17], b[ 0] ); // lane 0[3:0] + Y2 = _mm512_mask_blend_epi32( 0x1111, X[3], X[2] ); + Z2 = _mm512_mask_blend_epi32( 0x4444, X[1], X[0] ); - X1 = _mm512_set_epi32( b[15], b[62], b[45], b[28], - b[11], b[58], b[41], b[24], - b[ 7], b[54], b[37], b[20], - b[ 3], b[50], b[33], b[16] ); // lane 0[7:4] + Y3 = _mm512_mask_blend_epi32( 0x1111, X[0], X[3] ); + Z3 = _mm512_mask_blend_epi32( 0x4444, X[2], X[1] ); - X2 = _mm512_set_epi32( b[31], b[14], b[61], b[44], - b[27], b[10], b[57], b[40], - b[23], b[ 6], b[53], b[36], - b[19], b[ 2], b[49], b[32] ); + X[0] = _mm512_mask_blend_epi32( 0x3333, Z0, Y0 ); + X[1] = _mm512_mask_blend_epi32( 0x3333, Z1, Y1 ); + X[2] = _mm512_mask_blend_epi32( 0x3333, Z2, Y2 ); + X[3] = _mm512_mask_blend_epi32( 0x3333, Z3, Y3 ); +} - X3 = _mm512_set_epi32( b[47], b[30], b[13], b[60], - b[43], b[26], b[ 9], b[56], - b[39], b[22], b[ 5], b[52], - b[35], b[18], b[ 1], b[48] ); +static void salsa_unshuffle_4way_simd128( __m512i *X ) +{ + __m512i Y0, Y1, Y2, Y3; + + Y0 = _mm512_mask_blend_epi32( 0x8888, X[0], X[1] ); + Y1 = _mm512_mask_blend_epi32( 0x1111, X[0], X[1] ); + Y2 = _mm512_mask_blend_epi32( 0x2222, X[0], X[1] ); + Y3 = _mm512_mask_blend_epi32( 0x4444, X[0], X[1] ); + + Y0 = _mm512_mask_blend_epi32( 0x4444, Y0, X[2] ); + Y1 = _mm512_mask_blend_epi32( 0x8888, Y1, X[2] ); + Y2 = _mm512_mask_blend_epi32( 0x1111, Y2, X[2] ); + Y3 = _mm512_mask_blend_epi32( 0x2222, Y3, X[2] ); + + X[0] = _mm512_mask_blend_epi32( 0x2222, Y0, X[3] ); + X[1] = _mm512_mask_blend_epi32( 0x4444, Y1, X[3] ); + X[2] = _mm512_mask_blend_epi32( 0x8888, Y2, X[3] ); + X[3] = _mm512_mask_blend_epi32( 0x1111, Y3, X[3] ); +} +static void salsa8_4way_simd128( __m512i * const B, const __m512i * const C) +{ + __m512i X0, X1, X2, X3; + X0 = B[0] = _mm512_xor_si512( B[0], C[0] ); + X1 = B[1] = _mm512_xor_si512( B[1], C[1] ); + X2 = B[2] = _mm512_xor_si512( B[2], C[2] ); + X3 = B[3] = _mm512_xor_si512( B[3], C[3] ); - // define targets for macros used in round function template #define ROL_1X32 mm512_shufll128_32 // shuffle within 128 bit lanes #define ROR_1X32 mm512_shuflr128_32 #define SWAP_64 mm512_swap128_64 @@ -932,7 +995,7 @@ static void salsa8_4way_simd128( __m512i * const B, const __m512i * const C) #define ADD32 _mm512_add_epi32 #define XOR _mm512_xor_si512 - SALSA_8ROUNDS_FINAL_SIMD128; + SALSA_8ROUNDS_SIMD128; #undef ROL_1X32 #undef ROR_1X32 @@ -941,123 +1004,25 @@ static void salsa8_4way_simd128( 
__m512i * const B, const __m512i * const C) #undef ADD32 #undef XOR - y[0].m512 = X0; - y[1].m512 = X1; - y[2].m512 = X2; - y[3].m512 = X3; - - // lane 0 - z[0].u32[ 0 ] = y[0].u32[ 0]; - z[0].u32[ 3 ] = y[1].u32[ 0]; - z[0].u32[ 2 ] = y[2].u32[ 0]; - z[0].u32[ 1 ] = y[3].u32[ 0]; - - // lane 1 - z[0].u32[ 0+ 4 ] = y[0].u32[ 4]; - z[0].u32[ 3+ 4 ] = y[1].u32[ 4]; - z[0].u32[ 2+ 4 ] = y[2].u32[ 4]; - z[0].u32[ 1+ 4 ] = y[3].u32[ 4]; - - // lane 2 - z[0].u32[ 0+ 8 ] = y[0].u32[ 8]; - z[0].u32[ 3+ 8 ] = y[1].u32[ 8]; - z[0].u32[ 2+ 8 ] = y[2].u32[ 8]; - z[0].u32[ 1+ 8 ] = y[3].u32[ 8]; - - // lane 3 - z[0].u32[ 0+12 ] = y[0].u32[12]; - z[0].u32[ 3+12 ] = y[1].u32[12]; - z[0].u32[ 2+12 ] = y[2].u32[12]; - z[0].u32[ 1+12 ] = y[3].u32[12]; - - // lane 0 - z[1].u32[ 1 ] = y[0].u32[ 1]; - z[1].u32[ 0 ] = y[1].u32[ 1]; - z[1].u32[ 3 ] = y[2].u32[ 1]; - z[1].u32[ 2 ] = y[3].u32[ 1]; - - //lane 1 - z[1].u32[ 1+ 4 ] = y[0].u32[ 5]; - z[1].u32[ 0+ 4 ] = y[1].u32[ 5]; - z[1].u32[ 3+ 4 ] = y[2].u32[ 5]; - z[1].u32[ 2+ 4 ] = y[3].u32[ 5]; - - // lane 2 - z[1].u32[ 1+ 8 ] = y[0].u32[ 9]; - z[1].u32[ 0+ 8 ] = y[1].u32[ 9]; - z[1].u32[ 3+ 8 ] = y[2].u32[ 9]; - z[1].u32[ 2+ 8 ] = y[3].u32[ 9]; - - // lane 3 - z[1].u32[ 1+12 ] = y[0].u32[13]; - z[1].u32[ 0+12 ] = y[1].u32[13]; - z[1].u32[ 3+12 ] = y[2].u32[13]; - z[1].u32[ 2+12 ] = y[3].u32[13]; - - // lane 0 - z[2].u32[ 2 ] = y[0].u32[2]; - z[2].u32[ 1 ] = y[1].u32[2]; - z[2].u32[ 0 ] = y[2].u32[2]; - z[2].u32[ 3 ] = y[3].u32[2]; - - // lane 1 - z[2].u32[ 2+ 4 ] = y[0].u32[6]; - z[2].u32[ 1+ 4 ] = y[1].u32[6]; - z[2].u32[ 0+ 4 ] = y[2].u32[6]; - z[2].u32[ 3+ 4 ] = y[3].u32[6]; - - // lane 2 - z[2].u32[ 2+ 8 ] = y[0].u32[10]; - z[2].u32[ 1+ 8 ] = y[1].u32[10]; - z[2].u32[ 0+ 8 ] = y[2].u32[10]; - z[2].u32[ 3+ 8 ] = y[3].u32[10]; - - // lane 3 - z[2].u32[ 2+12 ] = y[0].u32[14]; - z[2].u32[ 1+12 ] = y[1].u32[14]; - z[2].u32[ 0+12 ] = y[2].u32[14]; - z[2].u32[ 3+12 ] = y[3].u32[14]; - - // lane 0 - z[3].u32[ 3 ] = y[0].u32[ 3]; - z[3].u32[ 2 ] = y[1].u32[ 3]; - z[3].u32[ 1 ] = y[2].u32[ 3]; - z[3].u32[ 0 ] = y[3].u32[ 3]; - - // lane 1 - z[3].u32[ 3+ 4 ] = y[0].u32[ 7]; - z[3].u32[ 2+ 4 ] = y[1].u32[ 7]; - z[3].u32[ 1+ 4 ] = y[2].u32[ 7]; - z[3].u32[ 0+ 4 ] = y[3].u32[ 7]; - - // lane 2 - z[3].u32[ 3+ 8 ] = y[0].u32[11]; - z[3].u32[ 2+ 8 ] = y[1].u32[11]; - z[3].u32[ 1+ 8 ] = y[2].u32[11]; - z[3].u32[ 0+ 8 ] = y[3].u32[11]; - - // lane 1 - z[3].u32[ 3+12 ] = y[0].u32[15]; - z[3].u32[ 2+12 ] = y[1].u32[15]; - z[3].u32[ 1+12 ] = y[2].u32[15]; - z[3].u32[ 0+12 ] = y[3].u32[15]; - - B[0] = _mm512_add_epi32( B[0], z[0].m512 ); - B[1] = _mm512_add_epi32( B[1], z[1].m512 ); - B[2] = _mm512_add_epi32( B[2], z[2].m512 ); - B[3] = _mm512_add_epi32( B[3], z[3].m512 ); + B[0] = _mm512_add_epi32( B[0], X0 ); + B[1] = _mm512_add_epi32( B[1], X1 ); + B[2] = _mm512_add_epi32( B[2], X2 ); + B[3] = _mm512_add_epi32( B[3], X3 ); } void scrypt_core_4way_simd128( __m512i *X, __m512i *V, const uint32_t N ) { - for ( int i = 0; i < N; i++ ) + salsa_shuffle_4way_simd128( X ); + salsa_shuffle_4way_simd128( X+4 ); + + for ( int n = 0; n < N; n++ ) { - memcpy( &V[i * 8], X, 128*4 ); + memcpy( &V[n * 8], X, 128*4 ); salsa8_4way_simd128( &X[0], &X[4] ); salsa8_4way_simd128( &X[4], &X[0] ); } - for ( int i = 0; i < N; i++ ) + for ( int n = 0; n < N; n++ ) { m512_ovly x16; x16 = ( (m512_ovly*)X )[4]; @@ -1066,25 +1031,22 @@ void scrypt_core_4way_simd128( __m512i *X, __m512i *V, const uint32_t N ) uint32_t j2 = 8 * ( x16.u32[ 8] & ( N-1 ) ); uint32_t j3 = 8 * ( x16.u32[12] & ( N-1 ) ); - for ( 
int k = 0; k < 8; k++ ) - X[k] = _mm512_xor_si512( X[k], m512_const_128( - ( (m512_ovly*)V )[ j3+k ].m128[3], - ( (m512_ovly*)V )[ j2+k ].m128[2], - ( (m512_ovly*)V )[ j1+k ].m128[1], - ( (m512_ovly*)V )[ j0+k ].m128[0] ) ); + for ( int i = 0; i < 8; i++ ) + { + __m512i v10 = _mm512_mask_blend_epi32( 0x000f, V[ j1+i ], V[ j0+i ] ); + __m512i v32 = _mm512_mask_blend_epi32( 0x0f00, V[ j3+i ], V[ j2+i ] ); + X[i] = _mm512_xor_si512( X[i], _mm512_mask_blend_epi32( 0x00ff, + v32, v10 ) ); + } -/* - for ( int k = 0; k < 8; k++ ) - X[k] = _mm512_xor_si512( X[k], m512_diagonal128_32( - V[ j3+k ], V[ j2+k ], V[ j1+k ], V[ j0+k ] ) ); -*/ salsa8_4way_simd128( &X[0], &X[4] ); salsa8_4way_simd128( &X[4], &X[0] ); } + + salsa_unshuffle_4way_simd128( X ); + salsa_unshuffle_4way_simd128( X+4 ); } - - #endif // AVX512 #if defined(__AVX2__) @@ -1142,14 +1104,14 @@ static void salsa8_8way( __m256i * const B, const __m256i * const C ) void scrypt_core_8way( __m256i *X, __m256i *V, const uint32_t N ) { - for ( int i = 0; i < N; i++ ) + for ( int n = 0; n < N; n++ ) { - memcpy( &V[i * 32], X, 128*8 ); + memcpy( &V[n * 32], X, 128*8 ); salsa8_8way( &X[ 0], &X[16] ); salsa8_8way( &X[16], &X[ 0] ); } - for ( int i = 0; i < N; i++ ) + for ( int n = 0; n < N; n++ ) { m256_ovly *vptr[8]; // pointer to V offset for each lane m256_ovly *x16 = (m256_ovly*)(&X[16]); @@ -1162,12 +1124,12 @@ void scrypt_core_8way( __m256i *X, __m256i *V, const uint32_t N ) vptr[l] = (m256_ovly*)( &V[ 32 * ( xl & ( N-1 ) ) ] ); } - for ( int k = 0; k < 32; k++ ) + for ( int i = 0; i < 32; i++ ) { m256_ovly v; // V value assembled from different indexes for ( int l = 0; l < 8; l++ ) - v.u32[l] = ( *(vptr[ l ] + k ) ) .u32[l]; - X[ k ] = _mm256_xor_si256( X[ k ], v.m256 ); + v.u32[l] = ( *(vptr[l] +i ) ) .u32[l]; + X[i] = _mm256_xor_si256( X[i], v.m256 ); } salsa8_8way( &X[ 0], &X[16] ); @@ -1176,7 +1138,7 @@ void scrypt_core_8way( __m256i *X, __m256i *V, const uint32_t N ) } // 2x memory usage -// Working, not up to date, needs stream optimization. 
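The rewritten 4-way code above, and the 2-way and 2-buf variants that follow, hoist the lane shuffle out of the inner salsa8 calls: X is converted to the SIMD-friendly order once before the two N-loops, V is written and read back in that same shuffled layout, and the original order is restored once at the end. A rough skeleton of that structure, with stand-in names and a dummy salsa8 (none of these identifiers come from the patch):

/* Skeleton only: shows where the one-time shuffle/unshuffle sits relative
 * to the two scrypt loops. The shuffle stand-ins are no-ops and salsa8 is
 * a dummy mixer, so this compiles and runs but does not compute scrypt. */
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <stdio.h>

static void shuffle_once( uint32_t *x )   { (void)x; }   /* reorder lanes once */
static void unshuffle_once( uint32_t *x ) { (void)x; }   /* restore order once */

static void salsa8_stub( uint32_t *b, const uint32_t *c )
{
   for ( int i = 0; i < 16; i++ ) b[i] += c[i] ^ 0x9e3779b9u;
}

/* X: 32 words (two 16-word halves), V: 32*N words of scratch. */
void scrypt_core_skeleton( uint32_t *X, uint32_t *V, const uint32_t N )
{
   shuffle_once( X );  shuffle_once( X+16 );          /* once, not per round */

   for ( uint32_t n = 0; n < N; n++ )
   {
      memcpy( &V[ n*32 ], X, 32*sizeof(uint32_t) );   /* V kept in shuffled order */
      salsa8_stub( X,    X+16 );
      salsa8_stub( X+16, X    );
   }
   for ( uint32_t n = 0; n < N; n++ )
   {
      const uint32_t j = 32 * ( X[16] & (N-1) );      /* V read back, still shuffled */
      for ( int i = 0; i < 32; i++ ) X[i] ^= V[ j+i ];
      salsa8_stub( X,    X+16 );
      salsa8_stub( X+16, X    );
   }

   unshuffle_once( X );  unshuffle_once( X+16 );
}

int main(void)
{
   enum { N = 1024 };
   uint32_t X[32] = { 1, 2, 3 };
   uint32_t *V = malloc( (size_t)32 * N * sizeof(uint32_t) );
   if ( !V ) return 1;
   scrypt_core_skeleton( X, V, N );
   printf( "%08x\n", X[0] );
   free( V );
   return 0;
}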
+// Working // Essentially Pooler 6way // 2x128 interleaved simd128 // ------- lane 1 ------- ------- lane 0 ------- @@ -1185,31 +1147,56 @@ void scrypt_core_8way( __m256i *X, __m256i *V, const uint32_t N ) // { l1xb, l1xa, l1c9, l1x8, l0xb, l0xa, l0x9, l0x8 } b[1] B[23:16] // { l1xf, l1xe, l1xd, l1xc, l0xf, l0xe, l0xd, l0xc } b[0] B[31:24] -static void salsa8_2way_simd128( __m256i * const B, const __m256i * const C) +static void salsa_shuffle_2way_simd128( __m256i *X ) { - __m256i X0, X1, X2, X3, Y0, Y1, Y2, Y3; + __m256i Y0, Y1, Y2, Y3, Z0, Z1, Z2, Z3; - // mix C into B then shuffle B into X - B[0] = _mm256_xor_si256( B[0], C[0] ); - B[1] = _mm256_xor_si256( B[1], C[1] ); - B[2] = _mm256_xor_si256( B[2], C[2] ); - B[3] = _mm256_xor_si256( B[3], C[3] ); + Y0 = _mm256_blend_epi32( X[1], X[0], 0x11 ); + Z0 = _mm256_blend_epi32( X[3], X[2], 0x44 ); + + Y1 = _mm256_blend_epi32( X[2], X[1], 0x11 ); + Z1 = _mm256_blend_epi32( X[0], X[3], 0x44 ); + + Y2 = _mm256_blend_epi32( X[3], X[2], 0x11 ); + Z2 = _mm256_blend_epi32( X[1], X[0], 0x44 ); + + Y3 = _mm256_blend_epi32( X[0], X[3], 0x11 ); + Z3 = _mm256_blend_epi32( X[2], X[1], 0x44 ); - Y0 = _mm256_blend_epi32( B[1], B[0], 0x11 ); - X0 = _mm256_blend_epi32( B[3], B[2], 0x44 ); - X0 = _mm256_blend_epi32( X0, Y0, 0x33); + X[0] = _mm256_blend_epi32( Z0, Y0, 0x33 ); + X[1] = _mm256_blend_epi32( Z1, Y1, 0x33 ); + X[2] = _mm256_blend_epi32( Z2, Y2, 0x33 ); + X[3] = _mm256_blend_epi32( Z3, Y3, 0x33 ); +} - Y1 = _mm256_blend_epi32( B[2], B[1], 0x11 ); - X1 = _mm256_blend_epi32( B[0], B[3], 0x44 ); - X1 = _mm256_blend_epi32( X1, Y1, 0x33 ); +static void salsa_unshuffle_2way_simd128( __m256i *X ) +{ + __m256i Y0, Y1, Y2, Y3; + + Y0 = _mm256_blend_epi32( X[0], X[1], 0x88 ); + Y1 = _mm256_blend_epi32( X[0], X[1], 0x11 ); + Y2 = _mm256_blend_epi32( X[0], X[1], 0x22 ); + Y3 = _mm256_blend_epi32( X[0], X[1], 0x44 ); + + Y0 = _mm256_blend_epi32( Y0, X[2], 0x44 ); + Y1 = _mm256_blend_epi32( Y1, X[2], 0x88 ); + Y2 = _mm256_blend_epi32( Y2, X[2], 0x11 ); + Y3 = _mm256_blend_epi32( Y3, X[2], 0x22 ); + + X[0] = _mm256_blend_epi32( Y0, X[3], 0x22 ); + X[1] = _mm256_blend_epi32( Y1, X[3], 0x44 ); + X[2] = _mm256_blend_epi32( Y2, X[3], 0x88 ); + X[3] = _mm256_blend_epi32( Y3, X[3], 0x11 ); +} - Y2 = _mm256_blend_epi32( B[3], B[2], 0x11 ); - X2 = _mm256_blend_epi32( B[1], B[0], 0x44 ); - X2 = _mm256_blend_epi32( X2, Y2, 0x33 ); +static void salsa8_2way_simd128( __m256i * const B, const __m256i * const C) +{ + __m256i X0, X1, X2, X3; - Y3 = _mm256_blend_epi32( B[0], B[3], 0x11 ); - X3 = _mm256_blend_epi32( B[2], B[1], 0x44 ); - X3 = _mm256_blend_epi32( X3, Y3, 0x33 ); + X0 = B[0] = _mm256_xor_si256( B[0], C[0] ); + X1 = B[1] = _mm256_xor_si256( B[1], C[1] ); + X2 = B[2] = _mm256_xor_si256( B[2], C[2] ); + X3 = B[3] = _mm256_xor_si256( B[3], C[3] ); // define targets for macros used in round function template #define ROL_1X32 mm256_shufll128_32 // shuffle within 128 bit lanes @@ -1228,52 +1215,41 @@ static void salsa8_2way_simd128( __m256i * const B, const __m256i * const C) #undef ADD32 #undef XOR - // init with X0 then blend in the other elements + B[0] = _mm256_add_epi32( B[0], X0 ); + B[1] = _mm256_add_epi32( B[1], X1 ); + B[2] = _mm256_add_epi32( B[2], X2 ); + B[3] = _mm256_add_epi32( B[3], X3 ); +} - Y0 = _mm256_blend_epi32( X0, X1, 0x88 ); - Y1 = _mm256_blend_epi32( X0, X1, 0x11 ); - Y2 = _mm256_blend_epi32( X0, X1, 0x22 ); - Y3 = _mm256_blend_epi32( X0, X1, 0x44 ); +void scrypt_core_2way_simd128( __m256i *X, __m256i *V, const uint32_t N ) +{ + 
salsa_shuffle_2way_simd128( X ); + salsa_shuffle_2way_simd128( X+4 ); - Y0 = _mm256_blend_epi32( Y0, X2, 0x44 ); - Y1 = _mm256_blend_epi32( Y1, X2, 0x88 ); - Y2 = _mm256_blend_epi32( Y2, X2, 0x11 ); - Y3 = _mm256_blend_epi32( Y3, X2, 0x22 ); - - Y0 = _mm256_blend_epi32( Y0, X3, 0x22 ); - Y1 = _mm256_blend_epi32( Y1, X3, 0x44 ); - Y2 = _mm256_blend_epi32( Y2, X3, 0x88 ); - Y3 = _mm256_blend_epi32( Y3, X3, 0x11 ); - - B[0] = _mm256_add_epi32( B[0], Y0 ); - B[1] = _mm256_add_epi32( B[1], Y1 ); - B[2] = _mm256_add_epi32( B[2], Y2 ); - B[3] = _mm256_add_epi32( B[3], Y3 ); -} - -void scrypt_core_2way_simd128( __m256i *X, __m256i *V, const uint32_t N ) -{ - for ( int i = 0; i < N; i++ ) + for ( int n = 0; n < N; n++ ) { - memcpy( &V[i * 8], X, 128*2 ); + memcpy( &V[n * 8], X, 128*2 ); salsa8_2way_simd128( &X[0], &X[4] ); salsa8_2way_simd128( &X[4], &X[0] ); } - for ( int i = 0; i < N; i++ ) + for ( int n = 0; n < N; n++ ) { m256_ovly x16; x16 = ( (m256_ovly*)X )[4]; uint32_t j0 = 8 * ( x16.u32[0] & ( N-1 ) ); uint32_t j1 = 8 * ( x16.u32[4] & ( N-1 ) ); - for ( int k = 0; k < 8; k++ ) - X[k] = _mm256_xor_si256( X[k], _mm256_blend_epi32( V[ j1+k ], - V[ j0+k ], 0x0f ) ); + for ( int i = 0; i < 8; i++ ) + X[i] = _mm256_xor_si256( X[i], _mm256_blend_epi32( V[ j1+i ], + V[ j0+i ], 0x0f ) ); salsa8_2way_simd128( &X[0], &X[4] ); salsa8_2way_simd128( &X[4], &X[0] ); } + + salsa_unshuffle_2way_simd128( X ); + salsa_unshuffle_2way_simd128( X+4 ); } // Working @@ -1386,17 +1362,17 @@ void scrypt_core_2way_simd128_2buf( __m256i *X, __m256i *V, const uint32_t N ) __m256i *V0 = V; __m256i *V1 = V + 8*N; - for ( int i = 0; i < N; i++ ) + for ( int n = 0; n < N; n++ ) { - for ( int k = 0; k < 8; k++ ) + for ( int i = 0; i < 8; i++ ) { - _mm256_stream_si256( V0 + i*8 + k, X0[k] ); - _mm256_stream_si256( V1 + i*8 + k, X1[k] ); + _mm256_stream_si256( V0 + n*8 + i, X0[i] ); + _mm256_stream_si256( V1 + n*8 + i, X1[i] ); } salsa8_2way_simd128_2buf( &X0[0], &X1[0], &X0[4], &X1[4] ); salsa8_2way_simd128_2buf( &X0[4], &X1[4], &X0[0], &X1[0] ); } - for ( int i = 0; i < N; i++ ) + for ( int n = 0; n < N; n++ ) { const m256_ovly x16a = ( (m256_ovly*)X0 )[4]; const m256_ovly x16b = ( (m256_ovly*)X1 )[4]; @@ -1406,25 +1382,16 @@ void scrypt_core_2way_simd128_2buf( __m256i *X, __m256i *V, const uint32_t N ) const uint32_t j1a = 8 * ( x16a.u32[4] & ( N-1 ) ); const uint32_t j1b = 8 * ( x16b.u32[4] & ( N-1 ) ); - for ( int k = 0; k < 8; k++ ) + for ( int i = 0; i < 8; i++ ) { - const __m256i V0j0a = _mm256_stream_load_si256( V0 + j0a + k ); - const __m256i V0j1a = _mm256_stream_load_si256( V0 + j1a + k ); - const __m256i V1j0b = _mm256_stream_load_si256( V1 + j0b + k ); - const __m256i V1j1b = _mm256_stream_load_si256( V1 + j1b + k ); - X0[k] = _mm256_xor_si256( X0[k], + const __m256i V0j0a = _mm256_stream_load_si256( V0 + j0a + i ); + const __m256i V0j1a = _mm256_stream_load_si256( V0 + j1a + i ); + const __m256i V1j0b = _mm256_stream_load_si256( V1 + j0b + i ); + const __m256i V1j1b = _mm256_stream_load_si256( V1 + j1b + i ); + X0[i] = _mm256_xor_si256( X0[i], _mm256_blend_epi32( V0j1a, V0j0a, 0x0f ) ); - X1[k] = _mm256_xor_si256( X1[k], + X1[i] = _mm256_xor_si256( X1[i], _mm256_blend_epi32( V1j1b, V1j0b, 0x0f ) ); - - -/* - X0[k] = _mm256_xor_si256( X0[k], - _mm256_blend_epi32( V0[ j1a+k ], V0[ j0a+k ], 0x0f ) ); - X1[k] = _mm256_xor_si256( X1[k], - _mm256_blend_epi32( V1[ j1b+k ], V1[ j0b+k ], 0x0f ) ); -*/ - } salsa8_2way_simd128_2buf( &X0[0], &X1[0], &X0[4], &X1[4] ); @@ -1577,17 +1544,17 @@ void 
scrypt_core_2way_simd128_3buf( __m256i *X, __m256i *V, const uint32_t N ) __m256i *V1 = V + 8*N; __m256i *V2 = V + 16*N; - for ( int i = 0; i < N; i++ ) + for ( int n = 0; n < N; n++ ) { - memcpy( &V0[i * 8], X0, 128*2 ); - memcpy( &V1[i * 8], X1, 128*2 ); - memcpy( &V2[i * 8], X2, 128*2 ); + memcpy( &V0[n * 8], X0, 128*2 ); + memcpy( &V1[n * 8], X1, 128*2 ); + memcpy( &V2[n * 8], X2, 128*2 ); salsa8_2way_simd128_3buf( &X0[0], &X1[0], &X2[0], &X0[4], &X1[4], &X2[4] ); salsa8_2way_simd128_3buf( &X0[4], &X1[4], &X2[4], &X0[0], &X1[0], &X2[0] ); } - for ( int i = 0; i < N; i++ ) + for ( int n = 0; n < N; n++ ) { m256_ovly x16a, x16b, x16c; x16a = ( (m256_ovly*)X0 )[4]; @@ -1601,14 +1568,14 @@ void scrypt_core_2way_simd128_3buf( __m256i *X, __m256i *V, const uint32_t N ) uint32_t j1b = 8 * ( x16b.u32[4] & ( N-1 ) ); uint32_t j1c = 8 * ( x16c.u32[4] & ( N-1 ) ); - for ( int k = 0; k < 8; k++ ) + for ( int i = 0; i < 8; i++ ) { - X0[k] = _mm256_xor_si256( X0[k], - _mm256_blend_epi32( V0[ j1a+k ], V0[ j0a+k ], 0x0f ) ); - X1[k] = _mm256_xor_si256( X1[k], - _mm256_blend_epi32( V1[ j1b+k ], V1[ j0b+k ], 0x0f ) ); - X2[k] = _mm256_xor_si256( X2[k], - _mm256_blend_epi32( V2[ j1c+k ], V2[ j0c+k ], 0x0f ) ); + X0[i] = _mm256_xor_si256( X0[i], + _mm256_blend_epi32( V0[ j1a+i ], V0[ j0a+i ], 0x0f ) ); + X1[i] = _mm256_xor_si256( X1[i], + _mm256_blend_epi32( V1[ j1b+i ], V1[ j0b+i ], 0x0f ) ); + X2[i] = _mm256_xor_si256( X2[i], + _mm256_blend_epi32( V2[ j1c+i ], V2[ j0c+i ], 0x0f ) ); } salsa8_2way_simd128_3buf( &X0[0], &X1[0], &X2[0], @@ -1707,23 +1674,23 @@ static void salsa8_simd128_2way( uint64_t *b, const uint64_t *c ) void scrypt_core_simd128_2way( uint64_t *X, uint64_t *V, const uint32_t N ) { - for ( int i = 0; i < N; i++ ) + for ( int n = 0; n < N; n++ ) { - for ( int k = 0; k < 8; k++ ) - _mm256_stream_si256( (__m256i*)V + i*8 + k, casti_m256i( X, k ) ); + for ( int i = 0; i < 8; i++ ) + _mm256_stream_si256( (__m256i*)V + n*8 + i, casti_m256i( X, i ) ); salsa8_simd128_2way( &X[ 0], &X[16] ); salsa8_simd128_2way( &X[16], &X[ 0] ); } - for ( int i = 0; i < N; i++ ) + for ( int n = 0; n < N; n++ ) { // need 2 J's const uint32_t j0 = 32 * ( (uint32_t)( X[16] ) & ( N-1 ) ); const uint32_t j1 = 32 * ( (uint32_t)( X[16] >> 32 ) & ( N-1 ) ); - for ( int k = 0; k < 32; k++ ) - X[k] ^= ( ( V[ j1 + k ] & 0xffffffff00000000 ) - | ( V[ j0 + k ] & 0x00000000ffffffff ) ); + for ( int i = 0; i < 32; i++ ) + X[i] ^= ( ( V[ j1 + i ] & 0xffffffff00000000 ) + | ( V[ j0 + i ] & 0x00000000ffffffff ) ); salsa8_simd128_2way( &X[ 0], &X[16] ); salsa8_simd128_2way( &X[16], &X[ 0] ); @@ -1845,18 +1812,18 @@ void scrypt_core_simd128_2way_2buf( uint64_t *X, uint64_t *V, const uint32_t N ) uint64_t *V0 = V; uint64_t *V1 = V + 32*N; - for ( int i = 0; i < N; i++ ) + for ( int n = 0; n < N; n++ ) { - for ( int k = 0; k < 8; k++ ) + for ( int i = 0; i < 8; i++ ) { - _mm256_stream_si256( (__m256i*)V0 + i*8 + k, casti_m256i( X0, k ) ); - _mm256_stream_si256( (__m256i*)V1 + i*8 + k, casti_m256i( X1, k ) ); + _mm256_stream_si256( (__m256i*)V0 + n*8 + i, casti_m256i( X0, i ) ); + _mm256_stream_si256( (__m256i*)V1 + n*8 + i, casti_m256i( X1, i ) ); } salsa8_simd128_2way_2buf( &X0[ 0], &X1[ 0], &X0[16], &X1[16] ); salsa8_simd128_2way_2buf( &X0[16], &X1[16], &X0[ 0], &X1[ 0] ); } - for ( int i = 0; i < N; i++ ) + for ( int n = 0; n < N; n++ ) { // need 4 J's const uint32_t j0l = 32 * ( (const uint32_t)( X0[16] ) & ( N-1 ) ); @@ -1864,12 +1831,12 @@ void scrypt_core_simd128_2way_2buf( uint64_t *X, uint64_t *V, const uint32_t N ) const 
uint32_t j1l = 32 * ( (const uint32_t)( X1[16] ) & ( N-1 ) ); const uint32_t j1h = 32 * ( (const uint32_t)( X1[16] >> 32 ) & ( N-1 ) ); - for ( int k = 0; k < 32; k++ ) + for ( int i = 0; i < 32; i++ ) { - X0[k] ^= ( ( V0[ j0h + k ] & 0xffffffff00000000 ) - | ( V0[ j0l + k ] & 0x00000000ffffffff ) ); - X1[k] ^= ( ( V1[ j1h + k ] & 0xffffffff00000000 ) - | ( V1[ j1l + k ] & 0x00000000ffffffff ) ); + X0[i] ^= ( ( V0[ j0h + i ] & 0xffffffff00000000 ) + | ( V0[ j0l + i ] & 0x00000000ffffffff ) ); + X1[i] ^= ( ( V1[ j1h + i ] & 0xffffffff00000000 ) + | ( V1[ j1l + i ] & 0x00000000ffffffff ) ); } salsa8_simd128_2way_2buf( &X0[ 0], &X1[ 0], &X0[16], &X1[16] ); salsa8_simd128_2way_2buf( &X0[16], &X1[16], &X0[ 0], &X1[ 0] ); @@ -2025,18 +1992,18 @@ void scrypt_core_simd128_2way_3buf( uint64_t *X, uint64_t *V, uint64_t *V1 = V + 32*N; uint64_t *V2 = V + 64*N; - for ( int i = 0; i < N; i++ ) + for ( int n = 0; n < N; n++ ) { - memcpy( &V0[i * 32], X0, 2*128 ); - memcpy( &V1[i * 32], X1, 2*128 ); - memcpy( &V2[i * 32], X2, 2*128 ); + memcpy( &V0[ n*32 ], X0, 2*128 ); + memcpy( &V1[ n*32 ], X1, 2*128 ); + memcpy( &V2[ n*32 ], X2, 2*128 ); salsa8_simd128_2way_3buf( &X0[ 0], &X1[ 0], &X2[ 0], &X0[16], &X1[16], &X2[16] ); salsa8_simd128_2way_3buf( &X0[16], &X1[16], &X2[16], &X0[ 0], &X1[ 0], &X2[ 0] ); } - for ( int i = 0; i < N; i++ ) + for ( int n = 0; n < N; n++ ) { uint32_t j0l = 32 * ( (uint32_t)( X0[16] ) & ( N-1 ) ); uint32_t j0h = 32 * ( (uint32_t)( X0[16] >> 32 ) & ( N-1 ) ); @@ -2045,14 +2012,14 @@ void scrypt_core_simd128_2way_3buf( uint64_t *X, uint64_t *V, uint32_t j2l = 32 * ( (uint32_t)( X2[16] ) & ( N-1 ) ); uint32_t j2h = 32 * ( (uint32_t)( X2[16] >> 32 ) & ( N-1 ) ); - for ( int k = 0; k < 32; k++ ) + for ( int i = 0; i < 32; i++ ) { - X0[k] ^= ( ( V0[ j0h + k ] & 0xffffffff00000000 ) - | ( V0[ j0l + k ] & 0x00000000ffffffff ) ); - X1[k] ^= ( ( V1[ j1h + k ] & 0xffffffff00000000 ) - | ( V1[ j1l + k ] & 0x00000000ffffffff ) ); - X2[k] ^= ( ( V2[ j2h + k ] & 0xffffffff00000000 ) - | ( V2[ j2l + k ] & 0x00000000ffffffff ) ); + X0[i] ^= ( ( V0[ j0h + i ] & 0xffffffff00000000 ) + | ( V0[ j0l + i ] & 0x00000000ffffffff ) ); + X1[i] ^= ( ( V1[ j1h + i ] & 0xffffffff00000000 ) + | ( V1[ j1l + i ] & 0x00000000ffffffff ) ); + X2[i] ^= ( ( V2[ j2h + i ] & 0xffffffff00000000 ) + | ( V2[ j2l + i ] & 0x00000000ffffffff ) ); } salsa8_simd128_2way_3buf( &X0[ 0], &X1[ 0], &X2[ 0], &X0[16], &X1[16], &X2[16] ); @@ -2061,229 +2028,6 @@ void scrypt_core_simd128_2way_3buf( uint64_t *X, uint64_t *V, } } -// Working, deprecated -// 8x memory usage -// 2x32 interleaving -static void salsa8_simd128_2way_4buf( uint64_t *BA, uint64_t *BB, - uint64_t *BC, uint64_t *BD, const uint64_t *CA, const uint64_t *CB, - const uint64_t *CC, const uint64_t *CD ) -{ - __m256i XA0, XA1, XA2, XA3, XB0, XB1, XB2, XB3, - XC0, XC1, XC2, XC3, XD0, XD1, XD2, XD3; - __m256i *ba = (__m256i*)BA; - __m256i *bb = (__m256i*)BB; - __m256i *bc = (__m256i*)BC; - __m256i *bd = (__m256i*)BD; - const __m256i *ca = (const __m256i*)CA; - const __m256i *cb = (const __m256i*)CB; - const __m256i *cc = (const __m256i*)CC; - const __m256i *cd = (const __m256i*)CD; - m256_ovly ya[4], yb[4], yc[4], yd[4], - za[4], zb[4], zc[4], zd[4]; - - // mix C into B then shuffle B into X - ba[0] = _mm256_xor_si256( ba[0], ca[0] ); - bb[0] = _mm256_xor_si256( bb[0], cb[0] ); - bc[0] = _mm256_xor_si256( bc[0], cc[0] ); - bd[0] = _mm256_xor_si256( bd[0], cd[0] ); - ba[1] = _mm256_xor_si256( ba[1], ca[1] ); - bb[1] = _mm256_xor_si256( bb[1], cb[1] ); - bc[1] = 
_mm256_xor_si256( bc[1], cc[1] ); - bd[1] = _mm256_xor_si256( bd[1], cd[1] ); - ba[2] = _mm256_xor_si256( ba[2], ca[2] ); - bb[2] = _mm256_xor_si256( bb[2], cb[2] ); - bc[2] = _mm256_xor_si256( bc[2], cc[2] ); - bd[2] = _mm256_xor_si256( bd[2], cd[2] ); - ba[3] = _mm256_xor_si256( ba[3], ca[3] ); - bb[3] = _mm256_xor_si256( bb[3], cb[3] ); - bc[3] = _mm256_xor_si256( bc[3], cc[3] ); - bd[3] = _mm256_xor_si256( bd[3], cd[3] ); - - XA0 = _mm256_set_epi64x( BA[15], BA[10], BA[ 5], BA[ 0] ); - XB0 = _mm256_set_epi64x( BB[15], BB[10], BB[ 5], BB[ 0] ); - XC0 = _mm256_set_epi64x( BC[15], BC[10], BC[ 5], BC[ 0] ); - XD0 = _mm256_set_epi64x( BD[15], BD[10], BD[ 5], BD[ 0] ); - XA1 = _mm256_set_epi64x( BA[ 3], BA[14], BA[ 9], BA[ 4] ); - XB1 = _mm256_set_epi64x( BB[ 3], BB[14], BB[ 9], BB[ 4] ); - XC1 = _mm256_set_epi64x( BC[ 3], BC[14], BC[ 9], BC[ 4] ); - XD1 = _mm256_set_epi64x( BD[ 3], BD[14], BD[ 9], BD[ 4] ); - XA2 = _mm256_set_epi64x( BA[ 7], BA[ 2], BA[13], BA[ 8] ); - XB2 = _mm256_set_epi64x( BB[ 7], BB[ 2], BB[13], BB[ 8] ); - XC2 = _mm256_set_epi64x( BC[ 7], BC[ 2], BC[13], BC[ 8] ); - XD2 = _mm256_set_epi64x( BD[ 7], BD[ 2], BD[13], BD[ 8] ); - XA3 = _mm256_set_epi64x( BA[11], BA[ 6], BA[ 1], BA[12] ); - XB3 = _mm256_set_epi64x( BB[11], BB[ 6], BB[ 1], BB[12] ); - XC3 = _mm256_set_epi64x( BC[11], BC[ 6], BC[ 1], BC[12] ); - XD3 = _mm256_set_epi64x( BD[11], BD[ 6], BD[ 1], BD[12] ); - - // define targets for macros used in round function template - #define ROL_1X32 mm256_shufll_64 - #define ROR_1X32 mm256_shuflr_64 - #define SWAP_64 mm256_swap_128 - #define ROL32 mm256_rol_32 - #define ADD32 _mm256_add_epi32 - #define XOR _mm256_xor_si256 - #define TYPE __m256i - - SALSA_8ROUNDS_FINAL_SIMD128_4BUF; - - #undef ROL_1X32 - #undef ROR_1X32 - #undef SWAP_64 - #undef ROL32 - #undef ADD32 - #undef XOR - #undef TYPE - - ya[0].m256 = XA0; yb[0].m256 = XB0; - yc[0].m256 = XC0; yd[0].m256 = XD0; - ya[1].m256 = XA1; yb[1].m256 = XB1; - yc[1].m256 = XC1; yd[1].m256 = XD1; - ya[2].m256 = XA2; yb[2].m256 = XB2; - yc[2].m256 = XC2; yd[2].m256 = XD2; - ya[3].m256 = XA3; yb[3].m256 = XB3; - yc[3].m256 = XC3; yd[3].m256 = XD3; - - za[0].u64[0] = ya[0].u64[0]; - zb[0].u64[0] = yb[0].u64[0]; - zc[0].u64[0] = yc[0].u64[0]; - zd[0].u64[0] = yd[0].u64[0]; - za[0].u64[3] = ya[1].u64[0]; - zb[0].u64[3] = yb[1].u64[0]; - zc[0].u64[3] = yc[1].u64[0]; - zd[0].u64[3] = yd[1].u64[0]; - za[0].u64[2] = ya[2].u64[0]; - zb[0].u64[2] = yb[2].u64[0]; - zc[0].u64[2] = yc[2].u64[0]; - zd[0].u64[2] = yd[2].u64[0]; - za[0].u64[1] = ya[3].u64[0]; - zb[0].u64[1] = yb[3].u64[0]; - zc[0].u64[1] = yc[3].u64[0]; - zd[0].u64[1] = yd[3].u64[0]; - - za[1].u64[1] = ya[0].u64[1]; - zb[1].u64[1] = yb[0].u64[1]; - zc[1].u64[1] = yc[0].u64[1]; - zd[1].u64[1] = yd[0].u64[1]; - za[1].u64[0] = ya[1].u64[1]; - zb[1].u64[0] = yb[1].u64[1]; - zc[1].u64[0] = yc[1].u64[1]; - zd[1].u64[0] = yd[1].u64[1]; - za[1].u64[3] = ya[2].u64[1]; - zb[1].u64[3] = yb[2].u64[1]; - zc[1].u64[3] = yc[2].u64[1]; - zd[1].u64[3] = yd[2].u64[1]; - za[1].u64[2] = ya[3].u64[1]; - zb[1].u64[2] = yb[3].u64[1]; - zc[1].u64[2] = yc[3].u64[1]; - zd[1].u64[2] = yd[3].u64[1]; - - za[2].u64[2] = ya[0].u64[2]; - zb[2].u64[2] = yb[0].u64[2]; - zc[2].u64[2] = yc[0].u64[2]; - zd[2].u64[2] = yd[0].u64[2]; - za[2].u64[1] = ya[1].u64[2]; - zb[2].u64[1] = yb[1].u64[2]; - zc[2].u64[1] = yc[1].u64[2]; - zd[2].u64[1] = yd[1].u64[2]; - za[2].u64[0] = ya[2].u64[2]; - zb[2].u64[0] = yb[2].u64[2]; - zc[2].u64[0] = yc[2].u64[2]; - zd[2].u64[0] = yd[2].u64[2]; - za[2].u64[3] = ya[3].u64[2]; - 
zb[2].u64[3] = yb[3].u64[2]; - zc[2].u64[3] = yc[3].u64[2]; - zd[2].u64[3] = yd[3].u64[2]; - - za[3].u64[3] = ya[0].u64[3]; - zb[3].u64[3] = yb[0].u64[3]; - zc[3].u64[3] = yc[0].u64[3]; - zd[3].u64[3] = yd[0].u64[3]; - za[3].u64[2] = ya[1].u64[3]; - zb[3].u64[2] = yb[1].u64[3]; - zc[3].u64[2] = yc[1].u64[3]; - zd[3].u64[2] = yd[1].u64[3]; - za[3].u64[1] = ya[2].u64[3]; - zb[3].u64[1] = yb[2].u64[3]; - zc[3].u64[1] = yc[2].u64[3]; - zd[3].u64[1] = yd[2].u64[3]; - za[3].u64[0] = ya[3].u64[3]; - zb[3].u64[0] = yb[3].u64[3]; - zc[3].u64[0] = yc[3].u64[3]; - zd[3].u64[0] = yd[3].u64[3]; - - ba[0] = _mm256_add_epi32( ba[0], za[0].m256 ); - bb[0] = _mm256_add_epi32( bb[0], zb[0].m256 ); - bc[0] = _mm256_add_epi32( bc[0], zc[0].m256 ); - bd[0] = _mm256_add_epi32( bd[0], zd[0].m256 ); - ba[1] = _mm256_add_epi32( ba[1], za[1].m256 ); - bb[1] = _mm256_add_epi32( bb[1], zb[1].m256 ); - bc[1] = _mm256_add_epi32( bc[1], zc[1].m256 ); - bd[1] = _mm256_add_epi32( bd[1], zd[1].m256 ); - ba[2] = _mm256_add_epi32( ba[2], za[2].m256 ); - bb[2] = _mm256_add_epi32( bb[2], zb[2].m256 ); - bc[2] = _mm256_add_epi32( bc[2], zc[2].m256 ); - bd[2] = _mm256_add_epi32( bd[2], zd[2].m256 ); - ba[3] = _mm256_add_epi32( ba[3], za[3].m256 ); - bb[3] = _mm256_add_epi32( bb[3], zb[3].m256 ); - bc[3] = _mm256_add_epi32( bc[3], zc[3].m256 ); - bd[3] = _mm256_add_epi32( bd[3], zd[3].m256 ); -} - -void scrypt_core_simd128_2way_4buf( uint64_t *X, uint64_t *V, const uint32_t N ) - -{ - uint64_t *X0 = X; - uint64_t *X1 = X+32; - uint64_t *X2 = X+64; - uint64_t *X3 = X+96; - uint64_t *V0 = V; - uint64_t *V1 = V + 32*N; - uint64_t *V2 = V + 64*N; - uint64_t *V3 = V + 96*N; - - for ( int i = 0; i < N; i++ ) - { - memcpy( &V0[i * 32], X0, 2*128 ); - memcpy( &V1[i * 32], X1, 2*128 ); - memcpy( &V2[i * 32], X2, 2*128 ); - memcpy( &V3[i * 32], X3, 2*128 ); - salsa8_simd128_2way_4buf( &X0[ 0], &X1[ 0], &X2[ 0], &X3[ 0], - &X0[16], &X1[16], &X2[16], &X3[16] ); - salsa8_simd128_2way_4buf( &X0[16], &X1[16], &X2[16], &X3[16], - &X0[ 0], &X1[ 0], &X2[ 0], &X3[ 0] ); - } - - for ( int i = 0; i < N; i++ ) - { - // need 4 J's - uint32_t j0l = 32 * ( (uint32_t)( X0[16] ) & ( N-1 ) ); - uint32_t j0h = 32 * ( (uint32_t)( X0[16] >> 32 ) & ( N-1 ) ); - uint32_t j1l = 32 * ( (uint32_t)( X1[16] ) & ( N-1 ) ); - uint32_t j1h = 32 * ( (uint32_t)( X1[16] >> 32 ) & ( N-1 ) ); - uint32_t j2l = 32 * ( (uint32_t)( X2[16] ) & ( N-1 ) ); - uint32_t j2h = 32 * ( (uint32_t)( X2[16] >> 32 ) & ( N-1 ) ); - uint32_t j3l = 32 * ( (uint32_t)( X3[16] ) & ( N-1 ) ); - uint32_t j3h = 32 * ( (uint32_t)( X3[16] >> 32 ) & ( N-1 ) ); - - for ( int k = 0; k < 32; k++ ) - { - X0[k] ^= ( ( V0[ j0h + k ] & 0xffffffff00000000 ) - | ( V0[ j0l + k ] & 0x00000000ffffffff ) ); - X1[k] ^= ( ( V1[ j1h + k ] & 0xffffffff00000000 ) - | ( V1[ j1l + k ] & 0x00000000ffffffff ) ); - X2[k] ^= ( ( V2[ j2h + k ] & 0xffffffff00000000 ) - | ( V2[ j2l + k ] & 0x00000000ffffffff ) ); - X3[k] ^= ( ( V3[ j3h + k ] & 0xffffffff00000000 ) - | ( V3[ j3l + k ] & 0x00000000ffffffff ) ); - } - salsa8_simd128_2way_4buf( &X0[ 0], &X1[ 0], &X2[ 0], &X3[ 0], - &X0[16], &X1[16], &X2[16], &X3[16] ); - salsa8_simd128_2way_4buf( &X0[16], &X1[16], &X2[16], &X3[16], - &X0[ 0], &X1[ 0], &X2[ 0], &X3[ 0] ); - } -} - #endif // AVX2 @@ -2344,13 +2088,13 @@ static void xor_salsa8_4way( __m128i * const B, const __m128i * const C ) void scrypt_core_4way( __m128i *X, __m128i *V, const uint32_t N ) { - for ( int i = 0; i < N; i++ ) + for ( int n = 0; n < N; n++ ) { - memcpy( &V[i * 32], X, 128*4 ); + memcpy( &V[ n*32 ], X, 
128*4 ); xor_salsa8_4way( &X[ 0], &X[16] ); xor_salsa8_4way( &X[16], &X[ 0] ); } - for ( int i = 0; i < N; i++ ) + for ( int n = 0; n < N; n++ ) { m128_ovly *vptr[4]; m128_ovly *x16 = (m128_ovly*)(&X[16]); @@ -2361,12 +2105,12 @@ void scrypt_core_4way( __m128i *X, __m128i *V, const uint32_t N ) vptr[l] = (m128_ovly*)( &V[ 32 * ( xl & ( N-1 ) ) ] ); } - for ( int k = 0; k < 32; k++ ) + for ( int i = 0; i < 32; i++ ) { m128_ovly v; for ( int l = 0; l < 4; l++ ) - v.u32[l] = ( *(vptr[ l ] + k ) ) .u32[l]; - X[ k ] = _mm_xor_si128( X[ k ], v.m128 ); + v.u32[l] = ( *(vptr[l] +i ) ) .u32[l]; + X[i] = _mm_xor_si128( X[i], v.m128 ); } xor_salsa8_4way( &X[ 0], &X[16] ); @@ -2546,19 +2290,19 @@ static void salsa8_simd128( uint32_t *b, const uint32_t * const c) void scrypt_core_simd128( uint32_t *X, uint32_t *V, const uint32_t N ) { - for ( int i = 0; i < N; i++ ) + for ( int n = 0; n < N; n++ ) { - for ( int k = 0; k < 8; k++ ) - _mm_stream_si128( (__m128i*)V + i*8 + k, casti_m128i( X, k ) ); + for ( int i = 0; i < 8; i++ ) + _mm_stream_si128( (__m128i*)V + n*8 + i, casti_m128i( X, i ) ); salsa8_simd128( &X[ 0], &X[16] ); salsa8_simd128( &X[16], &X[ 0] ); } - for ( int i = 0; i < N; i++ ) + for ( int n = 0; n < N; n++ ) { const int j = 32 * ( X[16] & ( N - 1 ) ); - for ( int k = 0; k < 32; k++ ) - X[k] ^= V[j + k]; + for ( int i = 0; i < 32; i++ ) + X[i] ^= V[ j+i ]; salsa8_simd128( &X[ 0], &X[16] ); salsa8_simd128( &X[16], &X[ 0] ); } @@ -2566,253 +2310,290 @@ void scrypt_core_simd128( uint32_t *X, uint32_t *V, const uint32_t N ) // Double buffered, 2x memory usage // No interleaving -static void salsa8_simd128_2buf( uint32_t * const ba, uint32_t * const bb, - const uint32_t * const ca, const uint32_t * const cb ) -{ - __m128i XA0, XA1, XA2, XA3, XB0, XB1, XB2, XB3; - __m128i *BA = (__m128i*)ba; - __m128i *BB = (__m128i*)bb; - const __m128i *CA = (const __m128i*)ca; - const __m128i *CB = (const __m128i*)cb; - - // define targets for macros used in round function template - #define ROL_1X32 mm128_shufll_32 - #define ROR_1X32 mm128_shuflr_32 - #define SWAP_64 mm128_swap_64 - #define ROL32 mm128_rol_32 - #define ADD32 _mm_add_epi32 - #define XOR _mm_xor_si128 - #define TYPE __m128i - // mix C into B then shuffle B into X - BA[0] = _mm_xor_si128( BA[0], CA[0] ); - BB[0] = _mm_xor_si128( BB[0], CB[0] ); - BA[1] = _mm_xor_si128( BA[1], CA[1] ); - BB[1] = _mm_xor_si128( BB[1], CB[1] ); - BA[2] = _mm_xor_si128( BA[2], CA[2] ); - BB[2] = _mm_xor_si128( BB[2], CB[2] ); - BA[3] = _mm_xor_si128( BA[3], CA[3] ); - BB[3] = _mm_xor_si128( BB[3], CB[3] ); +static void salsa_simd128_shuffle_2buf( uint32_t *xa, uint32_t *xb ) +{ + __m128i *XA = (__m128i*)xa; + __m128i *XB = (__m128i*)xb; + __m128i YA0, YA1, YA2, YA3, YB0, YB1, YB2, YB3; #if defined(__SSE4_1__) - __m128i YA0, YA1, YA2, YA3, YB0, YB1, YB2, YB3; +// __m128i YA0, YA1, YA2, YA3, YB0, YB1, YB2, YB3; + __m128i ZA0, ZA1, ZA2, ZA3, ZB0, ZB1, ZB2, ZB3; #if defined(__AVX2__) - - YA0 = _mm_blend_epi32( BA[1], BA[0], 0x1 ); - YB0 = _mm_blend_epi32( BB[1], BB[0], 0x1 ); - XA0 = _mm_blend_epi32( BA[3], BA[2], 0x4 ); - XB0 = _mm_blend_epi32( BB[3], BB[2], 0x4 ); - - YA1 = _mm_blend_epi32( BA[2], BA[1], 0x1 ); - YB1 = _mm_blend_epi32( BB[2], BB[1], 0x1 ); - XA1 = _mm_blend_epi32( BA[0], BA[3], 0x4 ); - XB1 = _mm_blend_epi32( BB[0], BB[3], 0x4 ); - - YA2 = _mm_blend_epi32( BA[3], BA[2], 0x1 ); - YB2 = _mm_blend_epi32( BB[3], BB[2], 0x1 ); - XA2 = _mm_blend_epi32( BA[1], BA[0], 0x4 ); - XB2 = _mm_blend_epi32( BB[1], BB[0], 0x4 ); - - YA3 = _mm_blend_epi32( BA[0], 
BA[3], 0x1 ); - YB3 = _mm_blend_epi32( BB[0], BB[3], 0x1 ); - XA3 = _mm_blend_epi32( BA[2], BA[1], 0x4 ); - XB3 = _mm_blend_epi32( BB[2], BB[1], 0x4 ); - - XA0 = _mm_blend_epi32( XA0, YA0, 0x3 ); - XB0 = _mm_blend_epi32( XB0, YB0, 0x3 ); - XA1 = _mm_blend_epi32( XA1, YA1, 0x3 ); - XB1 = _mm_blend_epi32( XB1, YB1, 0x3 ); + YA0 = _mm_blend_epi32( XA[1], XA[0], 0x1 ); + YB0 = _mm_blend_epi32( XB[1], XB[0], 0x1 ); + ZA0 = _mm_blend_epi32( XA[3], XA[2], 0x4 ); + ZB0 = _mm_blend_epi32( XB[3], XB[2], 0x4 ); - XA2 = _mm_blend_epi32( XA2, YA2, 0x3 ); - XB2 = _mm_blend_epi32( XB2, YB2, 0x3 ); + YA1 = _mm_blend_epi32( XA[2], XA[1], 0x1 ); + YB1 = _mm_blend_epi32( XB[2], XB[1], 0x1 ); + ZA1 = _mm_blend_epi32( XA[0], XA[3], 0x4 ); + ZB1 = _mm_blend_epi32( XB[0], XB[3], 0x4 ); - XA3 = _mm_blend_epi32( XA3, YA3, 0x3 ); - XB3 = _mm_blend_epi32( XB3, YB3, 0x3 ); + YA2 = _mm_blend_epi32( XA[3], XA[2], 0x1 ); + YB2 = _mm_blend_epi32( XB[3], XB[2], 0x1 ); + ZA2 = _mm_blend_epi32( XA[1], XA[0], 0x4 ); + ZB2 = _mm_blend_epi32( XB[1], XB[0], 0x4 ); -#else // SSE4_1 + YA3 = _mm_blend_epi32( XA[0], XA[3], 0x1 ); + YB3 = _mm_blend_epi32( XB[0], XB[3], 0x1 ); + ZA3 = _mm_blend_epi32( XA[2], XA[1], 0x4 ); + ZB3 = _mm_blend_epi32( XB[2], XB[1], 0x4 ); + + XA[0] = _mm_blend_epi32( ZA0, YA0, 0x3 ); + XB[0] = _mm_blend_epi32( ZB0, YB0, 0x3 ); + + XA[1] = _mm_blend_epi32( ZA1, YA1, 0x3 ); + XB[1] = _mm_blend_epi32( ZB1, YB1, 0x3 ); + + XA[2] = _mm_blend_epi32( ZA2, YA2, 0x3 ); + XB[2] = _mm_blend_epi32( ZB2, YB2, 0x3 ); + + XA[3] = _mm_blend_epi32( ZA3, YA3, 0x3 ); + XB[3] = _mm_blend_epi32( ZB3, YB3, 0x3 ); + +#else - YA0 = _mm_blend_epi16( BA[1], BA[0], 0x03 ); - YB0 = _mm_blend_epi16( BB[1], BB[0], 0x03 ); - XA0 = _mm_blend_epi16( BA[3], BA[2], 0x30 ); - XB0 = _mm_blend_epi16( BB[3], BB[2], 0x30 ); +// SSE4.1 - YA1 = _mm_blend_epi16( BA[2], BA[1], 0x03 ); - YB1 = _mm_blend_epi16( BB[2], BB[1], 0x03 ); - XA1 = _mm_blend_epi16( BA[0], BA[3], 0x30 ); - XB1 = _mm_blend_epi16( BB[0], BB[3], 0x30 ); + YA0 = _mm_blend_epi16( XA[1], XA[0], 0x03 ); + YB0 = _mm_blend_epi16( XB[1], XB[0], 0x03 ); + ZA0 = _mm_blend_epi16( XA[3], XA[2], 0x30 ); + ZB0 = _mm_blend_epi16( XB[3], XB[2], 0x30 ); - YA2 = _mm_blend_epi16( BA[3], BA[2], 0x03 ); - YB2 = _mm_blend_epi16( BB[3], BB[2], 0x03 ); - XA2 = _mm_blend_epi16( BA[1], BA[0], 0x30 ); - XB2 = _mm_blend_epi16( BB[1], BB[0], 0x30 ); + YA1 = _mm_blend_epi16( XA[2], XA[1], 0x03 ); + YB1 = _mm_blend_epi16( XB[2], XB[1], 0x03 ); + ZA1 = _mm_blend_epi16( XA[0], XA[3], 0x30 ); + ZB1 = _mm_blend_epi16( XB[0], XB[3], 0x30 ); - YA3 = _mm_blend_epi16( BA[0], BA[3], 0x03 ); - YB3 = _mm_blend_epi16( BB[0], BB[3], 0x03 ); - XA3 = _mm_blend_epi16( BA[2], BA[1], 0x30 ); - XB3 = _mm_blend_epi16( BB[2], BB[1], 0x30 ); + YA2 = _mm_blend_epi16( XA[3], XA[2], 0x03 ); + YB2 = _mm_blend_epi16( XB[3], XB[2], 0x03 ); + ZA2 = _mm_blend_epi16( XA[1], XA[0], 0x30 ); + ZB2 = _mm_blend_epi16( XB[1], XB[0], 0x30 ); - XA0 = _mm_blend_epi16( XA0, YA0, 0x0f ); - XB0 = _mm_blend_epi16( XB0, YB0, 0x0f ); + YA3 = _mm_blend_epi16( XA[0], XA[3], 0x03 ); + YB3 = _mm_blend_epi16( XB[0], XB[3], 0x03 ); + ZA3 = _mm_blend_epi16( XA[2], XA[1], 0x30 ); + ZB3 = _mm_blend_epi16( XB[2], XB[1], 0x30 ); - XA1 = _mm_blend_epi16( XA1, YA1, 0x0f ); - XB1 = _mm_blend_epi16( XB1, YB1, 0x0f ); + XA[0] = _mm_blend_epi16( ZA0, YA0, 0x0f ); + XB[0] = _mm_blend_epi16( ZB0, YB0, 0x0f ); - XA2 = _mm_blend_epi16( XA2, YA2, 0x0f ); - XB2 = _mm_blend_epi16( XB2, YB2, 0x0f ); + XA[1] = _mm_blend_epi16( ZA1, YA1, 0x0f ); + XB[1] = _mm_blend_epi16( ZB1, 
YB1, 0x0f ); - XA3 = _mm_blend_epi16( XA3, YA3, 0x0f ); - XB3 = _mm_blend_epi16( XB3, YB3, 0x0f ); + XA[2] = _mm_blend_epi16( ZA2, YA2, 0x0f ); + XB[2] = _mm_blend_epi16( ZB2, YB2, 0x0f ); + + XA[3] = _mm_blend_epi16( ZA3, YA3, 0x0f ); + XB[3] = _mm_blend_epi16( ZB3, YB3, 0x0f ); #endif // AVX2 else SSE4_1 - SALSA_8ROUNDS_SIMD128_2BUF; +#else // SSE2 + + YA0 = _mm_set_epi32( xa[15], xa[10], xa[ 5], xa[ 0] ); + YB0 = _mm_set_epi32( xb[15], xb[10], xb[ 5], xb[ 0] ); + YA1 = _mm_set_epi32( xa[ 3], xa[14], xa[ 9], xa[ 4] ); + YB1 = _mm_set_epi32( xb[ 3], xb[14], xb[ 9], xb[ 4] ); + YA2 = _mm_set_epi32( xa[ 7], xa[ 2], xa[13], xa[ 8] ); + YB2 = _mm_set_epi32( xb[ 7], xb[ 2], xb[13], xb[ 8] ); + YA3 = _mm_set_epi32( xa[11], xa[ 6], xa[ 1], xa[12] ); + YB3 = _mm_set_epi32( xb[11], xb[ 6], xb[ 1], xb[12] ); + + XA[0] = YA0; + XB[0] = YB0; + XA[1] = YA1; + XB[1] = YB1; + XA[2] = YA2; + XB[2] = YB2; + XA[3] = YA3; + XB[3] = YB3; + +#endif +} + +static void salsa_simd128_unshuffle_2buf( uint32_t* xa, uint32_t* xb ) +{ + + __m128i *XA = (__m128i*)xa; + __m128i *XB = (__m128i*)xb; + +#if defined(__SSE4_1__) + + __m128i YA0, YA1, YA2, YA3, YB0, YB1, YB2, YB3; #if defined(__AVX2__) - YA0 = _mm_blend_epi32( XA0, XA1, 0x8 ); - YB0 = _mm_blend_epi32( XB0, XB1, 0x8 ); - YA1 = _mm_blend_epi32( XA0, XA1, 0x1 ); - YB1 = _mm_blend_epi32( XB0, XB1, 0x1 ); - YA2 = _mm_blend_epi32( XA0, XA1, 0x2 ); - YB2 = _mm_blend_epi32( XB0, XB1, 0x2 ); - YA3 = _mm_blend_epi32( XA0, XA1, 0x4 ); - YB3 = _mm_blend_epi32( XB0, XB1, 0x4 ); - - YA0 = _mm_blend_epi32( YA0, XA2, 0x4 ); - YB0 = _mm_blend_epi32( YB0, XB2, 0x4 ); - YA1 = _mm_blend_epi32( YA1, XA2, 0x8 ); - YB1 = _mm_blend_epi32( YB1, XB2, 0x8 ); - YA2 = _mm_blend_epi32( YA2, XA2, 0x1 ); - YB2 = _mm_blend_epi32( YB2, XB2, 0x1 ); - YA3 = _mm_blend_epi32( YA3, XA2, 0x2 ); - YB3 = _mm_blend_epi32( YB3, XB2, 0x2 ); - - YA0 = _mm_blend_epi32( YA0, XA3, 0x2 ); - YB0 = _mm_blend_epi32( YB0, XB3, 0x2 ); - YA1 = _mm_blend_epi32( YA1, XA3, 0x4 ); - YB1 = _mm_blend_epi32( YB1, XB3, 0x4 ); - YA2 = _mm_blend_epi32( YA2, XA3, 0x8 ); - YB2 = _mm_blend_epi32( YB2, XB3, 0x8 ); - YA3 = _mm_blend_epi32( YA3, XA3, 0x1 ); - YB3 = _mm_blend_epi32( YB3, XB3, 0x1 ); + YA0 = _mm_blend_epi32( XA[0], XA[1], 0x8 ); + YB0 = _mm_blend_epi32( XB[0], XB[1], 0x8 ); + YA1 = _mm_blend_epi32( XA[0], XA[1], 0x1 ); + YB1 = _mm_blend_epi32( XB[0], XB[1], 0x1 ); + YA2 = _mm_blend_epi32( XA[0], XA[1], 0x2 ); + YB2 = _mm_blend_epi32( XB[0], XB[1], 0x2 ); + YA3 = _mm_blend_epi32( XA[0], XA[1], 0x4 ); + YB3 = _mm_blend_epi32( XB[0], XB[1], 0x4 ); + + YA0 = _mm_blend_epi32( YA0, XA[2], 0x4 ); + YB0 = _mm_blend_epi32( YB0, XB[2], 0x4 ); + YA1 = _mm_blend_epi32( YA1, XA[2], 0x8 ); + YB1 = _mm_blend_epi32( YB1, XB[2], 0x8 ); + YA2 = _mm_blend_epi32( YA2, XA[2], 0x1 ); + YB2 = _mm_blend_epi32( YB2, XB[2], 0x1 ); + YA3 = _mm_blend_epi32( YA3, XA[2], 0x2 ); + YB3 = _mm_blend_epi32( YB3, XB[2], 0x2 ); + + XA[0] = _mm_blend_epi32( YA0, XA[3], 0x2 ); + XB[0] = _mm_blend_epi32( YB0, XB[3], 0x2 ); + XA[1] = _mm_blend_epi32( YA1, XA[3], 0x4 ); + XB[1] = _mm_blend_epi32( YB1, XB[3], 0x4 ); + XA[2] = _mm_blend_epi32( YA2, XA[3], 0x8 ); + XB[2] = _mm_blend_epi32( YB2, XB[3], 0x8 ); + XA[3] = _mm_blend_epi32( YA3, XA[3], 0x1 ); + XB[3] = _mm_blend_epi32( YB3, XB[3], 0x1 ); -#else // SSE4_1 +#else // SSE4_1 - YA0 = _mm_blend_epi16( XA0, XA1, 0xc0 ); - YB0 = _mm_blend_epi16( XB0, XB1, 0xc0 ); - YA1 = _mm_blend_epi16( XA0, XA1, 0x03 ); - YB1 = _mm_blend_epi16( XB0, XB1, 0x03 ); - YA2 = _mm_blend_epi16( XA0, XA1, 0x0c ); - YB2 = 
_mm_blend_epi16( XB0, XB1, 0x0c ); - YA3 = _mm_blend_epi16( XA0, XA1, 0x30 ); - YB3 = _mm_blend_epi16( XB0, XB1, 0x30 ); - - YA0 = _mm_blend_epi16( YA0, XA2, 0x30 ); - YB0 = _mm_blend_epi16( YB0, XB2, 0x30 ); - YA1 = _mm_blend_epi16( YA1, XA2, 0xc0 ); - YB1 = _mm_blend_epi16( YB1, XB2, 0xc0 ); - YA2 = _mm_blend_epi16( YA2, XA2, 0x03 ); - YB2 = _mm_blend_epi16( YB2, XB2, 0x03 ); - YA3 = _mm_blend_epi16( YA3, XA2, 0x0c ); - YB3 = _mm_blend_epi16( YB3, XB2, 0x0c ); - - YA0 = _mm_blend_epi16( YA0, XA3, 0x0c ); - YB0 = _mm_blend_epi16( YB0, XB3, 0x0c ); - YA1 = _mm_blend_epi16( YA1, XA3, 0x30 ); - YB1 = _mm_blend_epi16( YB1, XB3, 0x30 ); - YA2 = _mm_blend_epi16( YA2, XA3, 0xc0 ); - YB2 = _mm_blend_epi16( YB2, XB3, 0xc0 ); - YA3 = _mm_blend_epi16( YA3, XA3, 0x03 ); - YB3 = _mm_blend_epi16( YB3, XB3, 0x03 ); + YA0 = _mm_blend_epi16( XA[0], XA[1], 0xc0 ); + YB0 = _mm_blend_epi16( XB[0], XB[1], 0xc0 ); + YA1 = _mm_blend_epi16( XA[0], XA[1], 0x03 ); + YB1 = _mm_blend_epi16( XB[0], XB[1], 0x03 ); + YA2 = _mm_blend_epi16( XA[0], XA[1], 0x0c ); + YB2 = _mm_blend_epi16( XB[0], XB[1], 0x0c ); + YA3 = _mm_blend_epi16( XA[0], XA[1], 0x30 ); + YB3 = _mm_blend_epi16( XB[0], XB[1], 0x30 ); + + YA0 = _mm_blend_epi16( YA0, XA[2], 0x30 ); + YB0 = _mm_blend_epi16( YB0, XB[2], 0x30 ); + YA1 = _mm_blend_epi16( YA1, XA[2], 0xc0 ); + YB1 = _mm_blend_epi16( YB1, XB[2], 0xc0 ); + YA2 = _mm_blend_epi16( YA2, XA[2], 0x03 ); + YB2 = _mm_blend_epi16( YB2, XB[2], 0x03 ); + YA3 = _mm_blend_epi16( YA3, XA[2], 0x0c ); + YB3 = _mm_blend_epi16( YB3, XB[2], 0x0c ); + + XA[0] = _mm_blend_epi16( YA0, XA[3], 0x0c ); + XB[0] = _mm_blend_epi16( YB0, XB[3], 0x0c ); + XA[1] = _mm_blend_epi16( YA1, XA[3], 0x30 ); + XB[1] = _mm_blend_epi16( YB1, XB[3], 0x30 ); + XA[2] = _mm_blend_epi16( YA2, XA[3], 0xc0 ); + XB[2] = _mm_blend_epi16( YB2, XB[3], 0xc0 ); + XA[3] = _mm_blend_epi16( YA3, XA[3], 0x03 ); + XB[3] = _mm_blend_epi16( YB3, XB[3], 0x03 ); -#endif // AVX2 else SSE4_1 - - BA[0] = _mm_add_epi32( BA[0], YA0 ); - BB[0] = _mm_add_epi32( BB[0], YB0 ); - BA[1] = _mm_add_epi32( BA[1], YA1 ); - BB[1] = _mm_add_epi32( BB[1], YB1 ); - BA[2] = _mm_add_epi32( BA[2], YA2 ); - BB[2] = _mm_add_epi32( BB[2], YB2 ); - BA[3] = _mm_add_epi32( BA[3], YA3 ); - BB[3] = _mm_add_epi32( BB[3], YB3 ); +#endif // AVX2 else SSE4_1 #else // SSE2 m128_ovly ya[4], za[4], yb[4], zb[4]; - XA0 = _mm_set_epi32( ba[15], ba[10], ba[ 5], ba[ 0] ); - XB0 = _mm_set_epi32( bb[15], bb[10], bb[ 5], bb[ 0] ); - XA1 = _mm_set_epi32( ba[ 3], ba[14], ba[ 9], ba[ 4] ); - XB1 = _mm_set_epi32( bb[ 3], bb[14], bb[ 9], bb[ 4] ); - XA2 = _mm_set_epi32( ba[ 7], ba[ 2], ba[13], ba[ 8] ); - XB2 = _mm_set_epi32( bb[ 7], bb[ 2], bb[13], bb[ 8] ); - XA3 = _mm_set_epi32( ba[11], ba[ 6], ba[ 1], ba[12] ); - XB3 = _mm_set_epi32( bb[11], bb[ 6], bb[ 1], bb[12] ); - - SALSA_8ROUNDS_FINAL_SIMD128_2BUF; - - // Final round doesn't shuffle data back to original input order, - // process it as is. 
- - ya[0].m128 = XA0; - yb[0].m128 = XB0; - ya[1].m128 = XA1; - yb[1].m128 = XB1; - ya[2].m128 = XA2; - yb[2].m128 = XB2; - ya[3].m128 = XA3; - yb[3].m128 = XB3; + ya[0].m128 = XA[0]; + yb[0].m128 = XB[0]; + ya[1].m128 = XA[1]; + yb[1].m128 = XB[1]; + ya[2].m128 = XA[2]; + yb[2].m128 = XB[2]; + ya[3].m128 = XA[3]; + yb[3].m128 = XB[3]; za[0].u32[0] = ya[0].u32[0]; zb[0].u32[0] = yb[0].u32[0]; - za[0].u32[3] = ya[1].u32[0]; - zb[0].u32[3] = yb[1].u32[0]; - za[0].u32[2] = ya[2].u32[0]; - zb[0].u32[2] = yb[2].u32[0]; - za[0].u32[1] = ya[3].u32[0]; - zb[0].u32[1] = yb[3].u32[0]; - + za[0].u32[1] = ya[3].u32[1]; + zb[0].u32[1] = yb[3].u32[1]; + za[0].u32[2] = ya[2].u32[2]; + zb[0].u32[2] = yb[2].u32[2]; + za[0].u32[3] = ya[1].u32[3]; + zb[0].u32[3] = yb[1].u32[3]; + + za[1].u32[0] = ya[1].u32[0]; + zb[1].u32[0] = yb[1].u32[0]; za[1].u32[1] = ya[0].u32[1]; zb[1].u32[1] = yb[0].u32[1]; - za[1].u32[0] = ya[1].u32[1]; - zb[1].u32[0] = yb[1].u32[1]; - za[1].u32[3] = ya[2].u32[1]; - zb[1].u32[3] = yb[2].u32[1]; - za[1].u32[2] = ya[3].u32[1]; - zb[1].u32[2] = yb[3].u32[1]; - + za[1].u32[2] = ya[3].u32[2]; + zb[1].u32[2] = yb[3].u32[2]; + za[1].u32[3] = ya[2].u32[3]; + zb[1].u32[3] = yb[2].u32[3]; + + za[2].u32[0] = ya[2].u32[0]; + zb[2].u32[0] = yb[2].u32[0]; + za[2].u32[1] = ya[1].u32[1]; + zb[2].u32[1] = yb[1].u32[1]; za[2].u32[2] = ya[0].u32[2]; zb[2].u32[2] = yb[0].u32[2]; - za[2].u32[1] = ya[1].u32[2]; - zb[2].u32[1] = yb[1].u32[2]; - za[2].u32[0] = ya[2].u32[2]; - zb[2].u32[0] = yb[2].u32[2]; - za[2].u32[3] = ya[3].u32[2]; - zb[2].u32[3] = yb[3].u32[2]; - + za[2].u32[3] = ya[3].u32[3]; + zb[2].u32[3] = yb[3].u32[3]; + + za[3].u32[0] = ya[3].u32[0]; + zb[3].u32[0] = yb[3].u32[0]; + za[3].u32[1] = ya[2].u32[1]; + zb[3].u32[1] = yb[2].u32[1]; + za[3].u32[2] = ya[1].u32[2]; + zb[3].u32[2] = yb[1].u32[2]; za[3].u32[3] = ya[0].u32[3]; zb[3].u32[3] = yb[0].u32[3]; - za[3].u32[2] = ya[1].u32[3]; - zb[3].u32[2] = yb[1].u32[3]; - za[3].u32[1] = ya[2].u32[3]; - zb[3].u32[1] = yb[2].u32[3]; - za[3].u32[0] = ya[3].u32[3]; - zb[3].u32[0] = yb[3].u32[3]; - - BA[0] = _mm_add_epi32( BA[0], za[0].m128 ); - BB[0] = _mm_add_epi32( BB[0], zb[0].m128 ); - BA[1] = _mm_add_epi32( BA[1], za[1].m128 ); - BB[1] = _mm_add_epi32( BB[1], zb[1].m128 ); - BA[2] = _mm_add_epi32( BA[2], za[2].m128 ); - BB[2] = _mm_add_epi32( BB[2], zb[2].m128 ); - BA[3] = _mm_add_epi32( BA[3], za[3].m128 ); - BB[3] = _mm_add_epi32( BB[3], zb[3].m128 ); + XA[0] = za[0].m128; + XB[0] = zb[0].m128; + XA[1] = za[1].m128; + XB[1] = zb[1].m128; + XA[2] = za[2].m128; + XB[2] = zb[2].m128; + XA[3] = za[3].m128; + XB[3] = zb[3].m128; + +#endif +} + +static void salsa8_simd128_2buf( uint32_t * const ba, uint32_t * const bb, + const uint32_t * const ca, const uint32_t * const cb ) +{ + __m128i XA0, XA1, XA2, XA3, XB0, XB1, XB2, XB3; + __m128i *BA = (__m128i*)ba; + __m128i *BB = (__m128i*)bb; + const __m128i *CA = (const __m128i*)ca; + const __m128i *CB = (const __m128i*)cb; + + // define targets for macros used in round function template + #define ROL_1X32 mm128_shufll_32 + #define ROR_1X32 mm128_shuflr_32 + #define SWAP_64 mm128_swap_64 + #define ROL32 mm128_rol_32 + #define ADD32 _mm_add_epi32 + #define XOR _mm_xor_si128 + #define TYPE __m128i + + XA0 = BA[0] = _mm_xor_si128( BA[0], CA[0] ); + XB0 = BB[0] = _mm_xor_si128( BB[0], CB[0] ); + XA1 = BA[1] = _mm_xor_si128( BA[1], CA[1] ); + XB1 = BB[1] = _mm_xor_si128( BB[1], CB[1] ); + XA2 = BA[2] = _mm_xor_si128( BA[2], CA[2] ); + XB2 = BB[2] = _mm_xor_si128( BB[2], CB[2] ); + XA3 = BA[3] = _mm_xor_si128( 
BA[3], CA[3] ); + XB3 = BB[3] = _mm_xor_si128( BB[3], CB[3] ); + +#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) + + SALSA_8ROUNDS_SIMD128_2BUF; + +#else + + SALSA_8ROUNDS_SIMD128_2BUF_SLOROT; + #endif + BA[0] = _mm_add_epi32( BA[0], XA0 ); + BB[0] = _mm_add_epi32( BB[0], XB0 ); + BA[1] = _mm_add_epi32( BA[1], XA1 ); + BB[1] = _mm_add_epi32( BB[1], XB1 ); + BA[2] = _mm_add_epi32( BA[2], XA2 ); + BB[2] = _mm_add_epi32( BB[2], XB2 ); + BA[3] = _mm_add_epi32( BA[3], XA3 ); + BB[3] = _mm_add_epi32( BB[3], XB3 ); + #undef ROL_1X32 #undef ROR_1X32 #undef SWAP_64 @@ -2822,570 +2603,425 @@ static void salsa8_simd128_2buf( uint32_t * const ba, uint32_t * const bb, #undef TYPE } - -// X: 2 sequential buffers -// V: 2 sequential buffers interleaved by the size of N -// interleaved buffers { v00, v01, v10, v11, v20... } -// void scrypt_core_simd128_2buf( uint32_t *X, uint32_t *V, const uint32_t N ) { - uint32_t *X0 = X; - uint32_t *X1 = X+32; - uint32_t *V0 = V; - uint32_t *V1 = V + 32*N; + uint32_t *X0 = X; + uint32_t *X1 = X+32; + uint32_t *V0 = V; + uint32_t *V1 = V + 32*N; - for ( int i = 0; i < N; i++ ) + salsa_simd128_shuffle_2buf( X0, X1 ); + salsa_simd128_shuffle_2buf( X0+16, X1+16 ); + + for ( int n = 0; n < N; n++ ) { - #if defined(__AVX2__) + #if defined(__AVX__) + + for ( int i = 0; i < 4; i++ ) + { + _mm256_stream_si256( (__m256i*)V0 + n*4 + i, casti_m256i( X0, i ) ); + _mm256_stream_si256( (__m256i*)V1 + n*4 + i, casti_m256i( X1, i ) ); + } - for ( int k = 0; k < 4; k++ ) + #elif defined(__SSE4_1__) + + for ( int i = 0; i < 8; i++ ) { - _mm256_stream_si256( (__m256i*)V0 + i*4 + k, casti_m256i( X0, k ) ); - _mm256_stream_si256( (__m256i*)V1 + i*4 + k, casti_m256i( X1, k ) ); + _mm_stream_si128( (__m128i*)V0 + n*8 + i, casti_m128i( X0, i ) ); + _mm_stream_si128( (__m128i*)V1 + n*8 + i, casti_m128i( X1, i ) ); } #else - memcpy( &V0[ i*32 ], X0, 128 ); - memcpy( &V1[ i*32 ], X1, 128 ); + memcpy( &V0[ n*32 ], X0, 128 ); + memcpy( &V1[ n*32 ], X1, 128 ); #endif - salsa8_simd128_2buf( &X0[ 0], &X1[ 0], &X0[16], &X1[16] ); - salsa8_simd128_2buf( &X0[16], &X1[16], &X0[ 0], &X1[ 0] ); + salsa8_simd128_2buf( X0, X1, X0+16, X1+16 ); + salsa8_simd128_2buf( X0+16, X1+16, X0 , X1 ); } - for ( int i = 0; i < N; i++ ) + for ( int n = 0; n < N; n++ ) { #if defined(__AVX2__) const int j0 = 4 * ( X0[16] & ( N-1 ) ); const int j1 = 4 * ( X1[16] & ( N-1 ) ); - for ( int k = 0; k < 4; k++ ) - { - const __m256i v0 = _mm256_stream_load_si256( ( (__m256i*)V0 ) +j0+k ); - const __m256i v1 = _mm256_stream_load_si256( ( (__m256i*)V1 ) +j1+k ); -// const __m256i v0 = _mm256_load_si256( ( (__m256i*)V0 ) +j0+k ); -// const __m256i v1 = _mm256_load_si256( ( (__m256i*)V1 ) +j1+k ); - casti_m256i( X0, k ) = _mm256_xor_si256( casti_m256i( X0, k ), v0 ); - casti_m256i( X1, k ) = _mm256_xor_si256( casti_m256i( X1, k ), v1 ); - } + + const __m256i v00 = _mm256_stream_load_si256( ( (__m256i*)V0 ) +j0 ); + const __m256i v10 = _mm256_stream_load_si256( ( (__m256i*)V1 ) +j1 ); + const __m256i v01 = _mm256_stream_load_si256( ( (__m256i*)V0 ) +j0+1 ); + const __m256i v11 = _mm256_stream_load_si256( ( (__m256i*)V1 ) +j1+1 ); + const __m256i v02 = _mm256_stream_load_si256( ( (__m256i*)V0 ) +j0+2 ); + const __m256i v12 = _mm256_stream_load_si256( ( (__m256i*)V1 ) +j1+2 ); + const __m256i v03 = _mm256_stream_load_si256( ( (__m256i*)V0 ) +j0+3 ); + const __m256i v13 = _mm256_stream_load_si256( ( (__m256i*)V1 ) +j1+3 ); + + casti_m256i( X0, 0 ) = _mm256_xor_si256( casti_m256i( X0, 0 ), 
v00 ); + casti_m256i( X1, 0 ) = _mm256_xor_si256( casti_m256i( X1, 0 ), v10 ); + casti_m256i( X0, 1 ) = _mm256_xor_si256( casti_m256i( X0, 1 ), v01 ); + casti_m256i( X1, 1 ) = _mm256_xor_si256( casti_m256i( X1, 1 ), v11 ); + casti_m256i( X0, 2 ) = _mm256_xor_si256( casti_m256i( X0, 2 ), v02 ); + casti_m256i( X1, 2 ) = _mm256_xor_si256( casti_m256i( X1, 2 ), v12 ); + casti_m256i( X0, 3 ) = _mm256_xor_si256( casti_m256i( X0, 3 ), v03 ); + casti_m256i( X1, 3 ) = _mm256_xor_si256( casti_m256i( X1, 3 ), v13 ); #else const int j0 = 8 * ( X0[16] & ( N-1 ) ); const int j1 = 8 * ( X1[16] & ( N-1 ) ); - for ( int k = 0; k < 8; k++ ) + for ( int i = 0; i < 8; i++ ) { - const __m128i v0 = _mm_load_si128( ( (__m128i*)V0 ) +j0+k ); - const __m128i v1 = _mm_load_si128( ( (__m128i*)V1 ) +j1+k ); - casti_m128i( X0, k ) = _mm_xor_si128( casti_m128i( X0, k ), v0 ); - casti_m128i( X1, k ) = _mm_xor_si128( casti_m128i( X1, k ), v1 ); + const __m128i v0 = _mm_load_si128( ( (__m128i*)V0 ) +j0+i ); + const __m128i v1 = _mm_load_si128( ( (__m128i*)V1 ) +j1+i ); + casti_m128i( X0, i ) = _mm_xor_si128( casti_m128i( X0, i ), v0 ); + casti_m128i( X1, i ) = _mm_xor_si128( casti_m128i( X1, i ), v1 ); } #endif -/* - const int j0 = 16 * ( X0[16] & ( N - 1 ) ); - const int j1 = 16 * ( X1[16] & ( N - 1 ) ); - - for ( int k = 0; k < 16; k++ ) - { - const uint64_t v0 = ( (uint64_t*)V0 )[ j0+k ]; - const uint64_t v1 = ( (uint64_t*)V1 )[ j1+k ]; - ( (uint64_t*)X0 )[k] ^= v0; - ( (uint64_t*)X1 )[k] ^= v1; - } -*/ - -/* - const int j0 = 32 * ( X0[16] & ( N-1 ) ); - const int j1 = 32 * ( X1[16] & ( N-1 ) ); - - for ( int k = 0; k < 32; k++ ) - { - const uint32_t v0 = V0[ j0+k ]; - const uint32_t v1 = V1[ j1+k ]; - X0[k] ^= v0; - X1[k] ^= v1; - } -*/ - - salsa8_simd128_2buf( &X0[ 0], &X1[ 0], &X0[16], &X1[16] ); - salsa8_simd128_2buf( &X0[16], &X1[16], &X0[ 0], &X1[ 0] ); + salsa8_simd128_2buf( X0, X1, X0+16, X1+16 ); + salsa8_simd128_2buf( X0+16, X1+16, X0 , X1 ); } + + salsa_simd128_unshuffle_2buf( X0, X1 ); + salsa_simd128_unshuffle_2buf( X0+16, X1+16 ); } -// Triple buffered, 3x memory usage -// No interleaving -static void salsa8_simd128_3buf( uint32_t *ba, uint32_t *bb, uint32_t *bc, - const uint32_t *ca, const uint32_t *cb, const uint32_t *cc ) +static void salsa_simd128_shuffle_3buf( uint32_t *xa, uint32_t *xb, + uint32_t *xc ) { - __m128i XA0, XA1, XA2, XA3, XB0, XB1, XB2, XB3, - XC0, XC1, XC2, XC3; - __m128i *BA = (__m128i*)ba; - __m128i *BB = (__m128i*)bb; - __m128i *BC = (__m128i*)bc; - const __m128i *CA = (const __m128i*)ca; - const __m128i *CB = (const __m128i*)cb; - const __m128i *CC = (const __m128i*)cc; - - // define targets for macros used in round function template - #define ROL_1X32 mm128_shufll_32 - #define ROR_1X32 mm128_shuflr_32 - #define SWAP_64 mm128_swap_64 - #define ROL32 mm128_rol_32 - #define ADD32 _mm_add_epi32 - #define XOR _mm_xor_si128 - #define TYPE __m128i - - // mix C into B then shuffle B into X - BA[0] = _mm_xor_si128( BA[0], CA[0] ); - BB[0] = _mm_xor_si128( BB[0], CB[0] ); - BC[0] = _mm_xor_si128( BC[0], CC[0] ); - BA[1] = _mm_xor_si128( BA[1], CA[1] ); - BB[1] = _mm_xor_si128( BB[1], CB[1] ); - BC[1] = _mm_xor_si128( BC[1], CC[1] ); - BA[2] = _mm_xor_si128( BA[2], CA[2] ); - BB[2] = _mm_xor_si128( BB[2], CB[2] ); - BC[2] = _mm_xor_si128( BC[2], CC[2] ); - BA[3] = _mm_xor_si128( BA[3], CA[3] ); - BB[3] = _mm_xor_si128( BB[3], CB[3] ); - BC[3] = _mm_xor_si128( BC[3], CC[3] ); + __m128i *XA = (__m128i*)xa; + __m128i *XB = (__m128i*)xb; + __m128i *XC = (__m128i*)xc; + __m128i YA0, YA1, YA2, 
YA3, YB0, YB1, YB2, YB3, YC0, YC1, YC2, YC3; #if defined(__SSE4_1__) - __m128i YA0, YA1, YA2, YA3, YB0, YB1, YB2, YB3, YC0, YC1, YC2, YC3; + __m128i ZA0, ZA1, ZA2, ZA3, ZB0, ZB1, ZB2, ZB3, ZC0, ZC1, ZC2, ZC3; #if defined(__AVX2__) - YA0 = _mm_blend_epi32( BA[1], BA[0], 0x1 ); - YB0 = _mm_blend_epi32( BB[1], BB[0], 0x1 ); - YC0 = _mm_blend_epi32( BC[1], BC[0], 0x1 ); - XA0 = _mm_blend_epi32( BA[3], BA[2], 0x4 ); - XB0 = _mm_blend_epi32( BB[3], BB[2], 0x4 ); - XC0 = _mm_blend_epi32( BC[3], BC[2], 0x4 ); - - YA1 = _mm_blend_epi32( BA[2], BA[1], 0x1 ); - YB1 = _mm_blend_epi32( BB[2], BB[1], 0x1 ); - YC1 = _mm_blend_epi32( BC[2], BC[1], 0x1 ); - XA1 = _mm_blend_epi32( BA[0], BA[3], 0x4 ); - XB1 = _mm_blend_epi32( BB[0], BB[3], 0x4 ); - XC1 = _mm_blend_epi32( BC[0], BC[3], 0x4 ); - - YA2 = _mm_blend_epi32( BA[3], BA[2], 0x1 ); - YB2 = _mm_blend_epi32( BB[3], BB[2], 0x1 ); - YC2 = _mm_blend_epi32( BC[3], BC[2], 0x1 ); - XA2 = _mm_blend_epi32( BA[1], BA[0], 0x4 ); - XB2 = _mm_blend_epi32( BB[1], BB[0], 0x4 ); - XC2 = _mm_blend_epi32( BC[1], BC[0], 0x4 ); - - YA3 = _mm_blend_epi32( BA[0], BA[3], 0x1 ); - YB3 = _mm_blend_epi32( BB[0], BB[3], 0x1 ); - YC3 = _mm_blend_epi32( BC[0], BC[3], 0x1 ); - XA3 = _mm_blend_epi32( BA[2], BA[1], 0x4 ); - XB3 = _mm_blend_epi32( BB[2], BB[1], 0x4 ); - XC3 = _mm_blend_epi32( BC[2], BC[1], 0x4 ); - - XA0 = _mm_blend_epi32( XA0, YA0, 0x3 ); - XB0 = _mm_blend_epi32( XB0, YB0, 0x3 ); - XC0 = _mm_blend_epi32( XC0, YC0, 0x3 ); - - XA1 = _mm_blend_epi32( XA1, YA1, 0x3 ); - XB1 = _mm_blend_epi32( XB1, YB1, 0x3 ); - XC1 = _mm_blend_epi32( XC1, YC1, 0x3 ); - - XA2 = _mm_blend_epi32( XA2, YA2, 0x3 ); - XB2 = _mm_blend_epi32( XB2, YB2, 0x3 ); - XC2 = _mm_blend_epi32( XC2, YC2, 0x3 ); - - XA3 = _mm_blend_epi32( XA3, YA3, 0x3 ); - XB3 = _mm_blend_epi32( XB3, YB3, 0x3 ); - XC3 = _mm_blend_epi32( XC3, YC3, 0x3 ); + YA0 = _mm_blend_epi32( XA[1], XA[0], 0x1 ); + YB0 = _mm_blend_epi32( XB[1], XB[0], 0x1 ); + YC0 = _mm_blend_epi32( XC[1], XC[0], 0x1 ); + ZA0 = _mm_blend_epi32( XA[3], XA[2], 0x4 ); + ZB0 = _mm_blend_epi32( XB[3], XB[2], 0x4 ); + ZC0 = _mm_blend_epi32( XC[3], XC[2], 0x4 ); + + YA1 = _mm_blend_epi32( XA[2], XA[1], 0x1 ); + YB1 = _mm_blend_epi32( XB[2], XB[1], 0x1 ); + YC1 = _mm_blend_epi32( XC[2], XC[1], 0x1 ); + ZA1 = _mm_blend_epi32( XA[0], XA[3], 0x4 ); + ZB1 = _mm_blend_epi32( XB[0], XB[3], 0x4 ); + ZC1 = _mm_blend_epi32( XC[0], XC[3], 0x4 ); + + YA2 = _mm_blend_epi32( XA[3], XA[2], 0x1 ); + YB2 = _mm_blend_epi32( XB[3], XB[2], 0x1 ); + YC2 = _mm_blend_epi32( XC[3], XC[2], 0x1 ); + ZA2 = _mm_blend_epi32( XA[1], XA[0], 0x4 ); + ZB2 = _mm_blend_epi32( XB[1], XB[0], 0x4 ); + ZC2 = _mm_blend_epi32( XC[1], XC[0], 0x4 ); + + YA3 = _mm_blend_epi32( XA[0], XA[3], 0x1 ); + YB3 = _mm_blend_epi32( XB[0], XB[3], 0x1 ); + YC3 = _mm_blend_epi32( XC[0], XC[3], 0x1 ); + ZA3 = _mm_blend_epi32( XA[2], XA[1], 0x4 ); + ZB3 = _mm_blend_epi32( XB[2], XB[1], 0x4 ); + ZC3 = _mm_blend_epi32( XC[2], XC[1], 0x4 ); + + XA[0] = _mm_blend_epi32( ZA0, YA0, 0x3 ); + XB[0] = _mm_blend_epi32( ZB0, YB0, 0x3 ); + XC[0] = _mm_blend_epi32( ZC0, YC0, 0x3 ); + + XA[1] = _mm_blend_epi32( ZA1, YA1, 0x3 ); + XB[1] = _mm_blend_epi32( ZB1, YB1, 0x3 ); + XC[1] = _mm_blend_epi32( ZC1, YC1, 0x3 ); + + XA[2] = _mm_blend_epi32( ZA2, YA2, 0x3 ); + XB[2] = _mm_blend_epi32( ZB2, YB2, 0x3 ); + XC[2] = _mm_blend_epi32( ZC2, YC2, 0x3 ); + + XA[3] = _mm_blend_epi32( ZA3, YA3, 0x3 ); + XB[3] = _mm_blend_epi32( ZB3, YB3, 0x3 ); + XC[3] = _mm_blend_epi32( ZC3, YC3, 0x3 ); + +#else + +// SSE4.1 + + YA0 = _mm_blend_epi16( XA[1], 
XA[0], 0x03 ); + YB0 = _mm_blend_epi16( XB[1], XB[0], 0x03 ); + YC0 = _mm_blend_epi16( XC[1], XC[0], 0x03 ); + ZA0 = _mm_blend_epi16( XA[3], XA[2], 0x30 ); + ZB0 = _mm_blend_epi16( XB[3], XB[2], 0x30 ); + ZC0 = _mm_blend_epi16( XC[3], XC[2], 0x30 ); + + YA1 = _mm_blend_epi16( XA[2], XA[1], 0x03 ); + YB1 = _mm_blend_epi16( XB[2], XB[1], 0x03 ); + YC1 = _mm_blend_epi16( XC[2], XC[1], 0x03 ); + ZA1 = _mm_blend_epi16( XA[0], XA[3], 0x30 ); + ZB1 = _mm_blend_epi16( XB[0], XB[3], 0x30 ); + ZC1 = _mm_blend_epi16( XC[0], XC[3], 0x30 ); + + YA2 = _mm_blend_epi16( XA[3], XA[2], 0x03 ); + YB2 = _mm_blend_epi16( XB[3], XB[2], 0x03 ); + YC2 = _mm_blend_epi16( XC[3], XC[2], 0x03 ); + ZA2 = _mm_blend_epi16( XA[1], XA[0], 0x30 ); + ZB2 = _mm_blend_epi16( XB[1], XB[0], 0x30 ); + ZC2 = _mm_blend_epi16( XC[1], XC[0], 0x30 ); + + YA3 = _mm_blend_epi16( XA[0], XA[3], 0x03 ); + YB3 = _mm_blend_epi16( XB[0], XB[3], 0x03 ); + YC3 = _mm_blend_epi16( XC[0], XC[3], 0x03 ); + ZA3 = _mm_blend_epi16( XA[2], XA[1], 0x30 ); + ZB3 = _mm_blend_epi16( XB[2], XB[1], 0x30 ); + ZC3 = _mm_blend_epi16( XC[2], XC[1], 0x30 ); + + XA[0] = _mm_blend_epi16( ZA0, YA0, 0x0f ); + XB[0] = _mm_blend_epi16( ZB0, YB0, 0x0f ); + XC[0] = _mm_blend_epi16( ZC0, YC0, 0x0f ); + + XA[1] = _mm_blend_epi16( ZA1, YA1, 0x0f ); + XB[1] = _mm_blend_epi16( ZB1, YB1, 0x0f ); + XC[1] = _mm_blend_epi16( ZC1, YC1, 0x0f ); + + XA[2] = _mm_blend_epi16( ZA2, YA2, 0x0f ); + XB[2] = _mm_blend_epi16( ZB2, YB2, 0x0f ); + XC[2] = _mm_blend_epi16( ZC2, YC2, 0x0f ); + + XA[3] = _mm_blend_epi16( ZA3, YA3, 0x0f ); + XB[3] = _mm_blend_epi16( ZB3, YB3, 0x0f ); + XC[3] = _mm_blend_epi16( ZC3, YC3, 0x0f ); -#else // SSE4_1 +#endif // AVX2 else SSE4_1 - YA0 = _mm_blend_epi16( BA[1], BA[0], 0x03 ); - YB0 = _mm_blend_epi16( BB[1], BB[0], 0x03 ); - YC0 = _mm_blend_epi16( BC[1], BC[0], 0x03 ); - XA0 = _mm_blend_epi16( BA[3], BA[2], 0x30 ); - XB0 = _mm_blend_epi16( BB[3], BB[2], 0x30 ); - XC0 = _mm_blend_epi16( BC[3], BC[2], 0x30 ); - XA0 = _mm_blend_epi16( XA0, YA0, 0x0f ); - XB0 = _mm_blend_epi16( XB0, YB0, 0x0f ); - XC0 = _mm_blend_epi16( XC0, YC0, 0x0f ); - - YA1 = _mm_blend_epi16( BA[2], BA[1], 0x03 ); - YB1 = _mm_blend_epi16( BB[2], BB[1], 0x03 ); - YC1 = _mm_blend_epi16( BC[2], BC[1], 0x03 ); - XA1 = _mm_blend_epi16( BA[0], BA[3], 0x30 ); - XB1 = _mm_blend_epi16( BB[0], BB[3], 0x30 ); - XC1 = _mm_blend_epi16( BC[0], BC[3], 0x30 ); - XA1 = _mm_blend_epi16( XA1, YA1, 0x0f ); - XB1 = _mm_blend_epi16( XB1, YB1, 0x0f ); - XC1 = _mm_blend_epi16( XC1, YC1, 0x0f ); - - YA2 = _mm_blend_epi16( BA[3], BA[2], 0x03 ); - YB2 = _mm_blend_epi16( BB[3], BB[2], 0x03 ); - YC2 = _mm_blend_epi16( BC[3], BC[2], 0x03 ); - XA2 = _mm_blend_epi16( BA[1], BA[0], 0x30 ); - XB2 = _mm_blend_epi16( BB[1], BB[0], 0x30 ); - XC2 = _mm_blend_epi16( BC[1], BC[0], 0x30 ); - XA2 = _mm_blend_epi16( XA2, YA2, 0x0f ); - XB2 = _mm_blend_epi16( XB2, YB2, 0x0f ); - XC2 = _mm_blend_epi16( XC2, YC2, 0x0f ); - - YA3 = _mm_blend_epi16( BA[0], BA[3], 0x03 ); - YB3 = _mm_blend_epi16( BB[0], BB[3], 0x03 ); - YC3 = _mm_blend_epi16( BC[0], BC[3], 0x03 ); - XA3 = _mm_blend_epi16( BA[2], BA[1], 0x30 ); - XB3 = _mm_blend_epi16( BB[2], BB[1], 0x30 ); - XC3 = _mm_blend_epi16( BC[2], BC[1], 0x30 ); - XA3 = _mm_blend_epi16( XA3, YA3, 0x0f ); - XB3 = _mm_blend_epi16( XB3, YB3, 0x0f ); - XC3 = _mm_blend_epi16( XC3, YC3, 0x0f ); - -#endif // AVX2 else SSE3_1 +#else // SSE2 + + YA0 = _mm_set_epi32( xa[15], xa[10], xa[ 5], xa[ 0] ); + YB0 = _mm_set_epi32( xb[15], xb[10], xb[ 5], xb[ 0] ); + YC0 = _mm_set_epi32( xc[15], xc[10], xc[ 5], 
xc[ 0] ); + YA1 = _mm_set_epi32( xa[ 3], xa[14], xa[ 9], xa[ 4] ); + YB1 = _mm_set_epi32( xb[ 3], xb[14], xb[ 9], xb[ 4] ); + YC1 = _mm_set_epi32( xc[ 3], xc[14], xc[ 9], xc[ 4] ); + YA2 = _mm_set_epi32( xa[ 7], xa[ 2], xa[13], xa[ 8] ); + YB2 = _mm_set_epi32( xb[ 7], xb[ 2], xb[13], xb[ 8] ); + YC2 = _mm_set_epi32( xc[ 7], xc[ 2], xc[13], xc[ 8] ); + YA3 = _mm_set_epi32( xa[11], xa[ 6], xa[ 1], xa[12] ); + YB3 = _mm_set_epi32( xb[11], xb[ 6], xb[ 1], xb[12] ); + YC3 = _mm_set_epi32( xc[11], xc[ 6], xc[ 1], xc[12] ); + + XA[0] = YA0; + XB[0] = YB0; + XC[0] = YC0; + XA[1] = YA1; + XB[1] = YB1; + XC[1] = YC1; + XA[2] = YA2; + XB[2] = YB2; + XC[2] = YC2; + XA[3] = YA3; + XB[3] = YB3; + XC[3] = YC3; - SALSA_8ROUNDS_SIMD128_3BUF; +#endif +} + +static void salsa_simd128_unshuffle_3buf( uint32_t* xa, uint32_t* xb, + uint32_t* xc ) +{ + __m128i *XA = (__m128i*)xa; + __m128i *XB = (__m128i*)xb; + __m128i *XC = (__m128i*)xc; + +#if defined(__SSE4_1__) + + __m128i YA0, YA1, YA2, YA3, YB0, YB1, YB2, YB3, YC0, YC1, YC2, YC3; #if defined(__AVX2__) - YA0 = _mm_blend_epi32( XA0, XA1, 0x8 ); - YB0 = _mm_blend_epi32( XB0, XB1, 0x8 ); - YC0 = _mm_blend_epi32( XC0, XC1, 0x8 ); - YA1 = _mm_blend_epi32( XA0, XA1, 0x1 ); - YB1 = _mm_blend_epi32( XB0, XB1, 0x1 ); - YC1 = _mm_blend_epi32( XC0, XC1, 0x1 ); - YA2 = _mm_blend_epi32( XA0, XA1, 0x2 ); - YB2 = _mm_blend_epi32( XB0, XB1, 0x2 ); - YC2 = _mm_blend_epi32( XC0, XC1, 0x2 ); - YA3 = _mm_blend_epi32( XA0, XA1, 0x4 ); - YB3 = _mm_blend_epi32( XB0, XB1, 0x4 ); - YC3 = _mm_blend_epi32( XC0, XC1, 0x4 ); - - YA0 = _mm_blend_epi32( YA0, XA2, 0x4 ); - YB0 = _mm_blend_epi32( YB0, XB2, 0x4 ); - YC0 = _mm_blend_epi32( YC0, XC2, 0x4 ); - YA1 = _mm_blend_epi32( YA1, XA2, 0x8 ); - YB1 = _mm_blend_epi32( YB1, XB2, 0x8 ); - YC1 = _mm_blend_epi32( YC1, XC2, 0x8 ); - YA2 = _mm_blend_epi32( YA2, XA2, 0x1 ); - YB2 = _mm_blend_epi32( YB2, XB2, 0x1 ); - YC2 = _mm_blend_epi32( YC2, XC2, 0x1 ); - YA3 = _mm_blend_epi32( YA3, XA2, 0x2 ); - YB3 = _mm_blend_epi32( YB3, XB2, 0x2 ); - YC3 = _mm_blend_epi32( YC3, XC2, 0x2 ); - - YA0 = _mm_blend_epi32( YA0, XA3, 0x2 ); - YB0 = _mm_blend_epi32( YB0, XB3, 0x2 ); - YC0 = _mm_blend_epi32( YC0, XC3, 0x2 ); - YA1 = _mm_blend_epi32( YA1, XA3, 0x4 ); - YB1 = _mm_blend_epi32( YB1, XB3, 0x4 ); - YC1 = _mm_blend_epi32( YC1, XC3, 0x4 ); - YA2 = _mm_blend_epi32( YA2, XA3, 0x8 ); - YB2 = _mm_blend_epi32( YB2, XB3, 0x8 ); - YC2 = _mm_blend_epi32( YC2, XC3, 0x8 ); - YA3 = _mm_blend_epi32( YA3, XA3, 0x1 ); - YB3 = _mm_blend_epi32( YB3, XB3, 0x1 ); - YC3 = _mm_blend_epi32( YC3, XC3, 0x1 ); + YA0 = _mm_blend_epi32( XA[0], XA[1], 0x8 ); + YB0 = _mm_blend_epi32( XB[0], XB[1], 0x8 ); + YC0 = _mm_blend_epi32( XC[0], XC[1], 0x8 ); + YA1 = _mm_blend_epi32( XA[0], XA[1], 0x1 ); + YB1 = _mm_blend_epi32( XB[0], XB[1], 0x1 ); + YC1 = _mm_blend_epi32( XC[0], XC[1], 0x1 ); + YA2 = _mm_blend_epi32( XA[0], XA[1], 0x2 ); + YB2 = _mm_blend_epi32( XB[0], XB[1], 0x2 ); + YC2 = _mm_blend_epi32( XC[0], XC[1], 0x2 ); + YA3 = _mm_blend_epi32( XA[0], XA[1], 0x4 ); + YB3 = _mm_blend_epi32( XB[0], XB[1], 0x4 ); + YC3 = _mm_blend_epi32( XC[0], XC[1], 0x4 ); + + YA0 = _mm_blend_epi32( YA0, XA[2], 0x4 ); + YB0 = _mm_blend_epi32( YB0, XB[2], 0x4 ); + YC0 = _mm_blend_epi32( YC0, XC[2], 0x4 ); + YA1 = _mm_blend_epi32( YA1, XA[2], 0x8 ); + YB1 = _mm_blend_epi32( YB1, XB[2], 0x8 ); + YC1 = _mm_blend_epi32( YC1, XC[2], 0x8 ); + YA2 = _mm_blend_epi32( YA2, XA[2], 0x1 ); + YB2 = _mm_blend_epi32( YB2, XB[2], 0x1 ); + YC2 = _mm_blend_epi32( YC2, XC[2], 0x1 ); + YA3 = _mm_blend_epi32( YA3, XA[2], 
0x2 ); + YB3 = _mm_blend_epi32( YB3, XB[2], 0x2 ); + YC3 = _mm_blend_epi32( YC3, XC[2], 0x2 ); + + XA[0] = _mm_blend_epi32( YA0, XA[3], 0x2 ); + XB[0] = _mm_blend_epi32( YB0, XB[3], 0x2 ); + XC[0] = _mm_blend_epi32( YC0, XC[3], 0x2 ); + XA[1] = _mm_blend_epi32( YA1, XA[3], 0x4 ); + XB[1] = _mm_blend_epi32( YB1, XB[3], 0x4 ); + XC[1] = _mm_blend_epi32( YC1, XC[3], 0x4 ); + XA[2] = _mm_blend_epi32( YA2, XA[3], 0x8 ); + XB[2] = _mm_blend_epi32( YB2, XB[3], 0x8 ); + XC[2] = _mm_blend_epi32( YC2, XC[3], 0x8 ); + XA[3] = _mm_blend_epi32( YA3, XA[3], 0x1 ); + XB[3] = _mm_blend_epi32( YB3, XB[3], 0x1 ); + XC[3] = _mm_blend_epi32( YC3, XC[3], 0x1 ); #else // SSE4_1 - YA0 = _mm_blend_epi16( XA0, XA1, 0xc0 ); - YB0 = _mm_blend_epi16( XB0, XB1, 0xc0 ); - YC0 = _mm_blend_epi16( XC0, XC1, 0xc0 ); - YA1 = _mm_blend_epi16( XA0, XA1, 0x03 ); - YB1 = _mm_blend_epi16( XB0, XB1, 0x03 ); - YC1 = _mm_blend_epi16( XC0, XC1, 0x03 ); - YA2 = _mm_blend_epi16( XA0, XA1, 0x0c ); - YB2 = _mm_blend_epi16( XB0, XB1, 0x0c ); - YC2 = _mm_blend_epi16( XC0, XC1, 0x0c ); - YA3 = _mm_blend_epi16( XA0, XA1, 0x30 ); - YB3 = _mm_blend_epi16( XB0, XB1, 0x30 ); - YC3 = _mm_blend_epi16( XC0, XC1, 0x30 ); - - YA0 = _mm_blend_epi16( YA0, XA2, 0x30 ); - YB0 = _mm_blend_epi16( YB0, XB2, 0x30 ); - YC0 = _mm_blend_epi16( YC0, XC2, 0x30 ); - YA1 = _mm_blend_epi16( YA1, XA2, 0xc0 ); - YB1 = _mm_blend_epi16( YB1, XB2, 0xc0 ); - YC1 = _mm_blend_epi16( YC1, XC2, 0xc0 ); - YA2 = _mm_blend_epi16( YA2, XA2, 0x03 ); - YB2 = _mm_blend_epi16( YB2, XB2, 0x03 ); - YC2 = _mm_blend_epi16( YC2, XC2, 0x03 ); - YA3 = _mm_blend_epi16( YA3, XA2, 0x0c ); - YB3 = _mm_blend_epi16( YB3, XB2, 0x0c ); - YC3 = _mm_blend_epi16( YC3, XC2, 0x0c ); - - YA0 = _mm_blend_epi16( YA0, XA3, 0x0c ); - YB0 = _mm_blend_epi16( YB0, XB3, 0x0c ); - YC0 = _mm_blend_epi16( YC0, XC3, 0x0c ); - YA1 = _mm_blend_epi16( YA1, XA3, 0x30 ); - YB1 = _mm_blend_epi16( YB1, XB3, 0x30 ); - YC1 = _mm_blend_epi16( YC1, XC3, 0x30 ); - YA2 = _mm_blend_epi16( YA2, XA3, 0xc0 ); - YB2 = _mm_blend_epi16( YB2, XB3, 0xc0 ); - YC2 = _mm_blend_epi16( YC2, XC3, 0xc0 ); - YA3 = _mm_blend_epi16( YA3, XA3, 0x03 ); - YB3 = _mm_blend_epi16( YB3, XB3, 0x03 ); - YC3 = _mm_blend_epi16( YC3, XC3, 0x03 ); + YA0 = _mm_blend_epi16( XA[0], XA[1], 0xc0 ); + YB0 = _mm_blend_epi16( XB[0], XB[1], 0xc0 ); + YC0 = _mm_blend_epi16( XC[0], XC[1], 0xc0 ); + YA1 = _mm_blend_epi16( XA[0], XA[1], 0x03 ); + YB1 = _mm_blend_epi16( XB[0], XB[1], 0x03 ); + YC1 = _mm_blend_epi16( XC[0], XC[1], 0x03 ); + YA2 = _mm_blend_epi16( XA[0], XA[1], 0x0c ); + YB2 = _mm_blend_epi16( XB[0], XB[1], 0x0c ); + YC2 = _mm_blend_epi16( XC[0], XC[1], 0x0c ); + YA3 = _mm_blend_epi16( XA[0], XA[1], 0x30 ); + YB3 = _mm_blend_epi16( XB[0], XB[1], 0x30 ); + YC3 = _mm_blend_epi16( XC[0], XC[1], 0x30 ); + + YA0 = _mm_blend_epi16( YA0, XA[2], 0x30 ); + YB0 = _mm_blend_epi16( YB0, XB[2], 0x30 ); + YC0 = _mm_blend_epi16( YC0, XC[2], 0x30 ); + YA1 = _mm_blend_epi16( YA1, XA[2], 0xc0 ); + YB1 = _mm_blend_epi16( YB1, XB[2], 0xc0 ); + YC1 = _mm_blend_epi16( YC1, XC[2], 0xc0 ); + YA2 = _mm_blend_epi16( YA2, XA[2], 0x03 ); + YB2 = _mm_blend_epi16( YB2, XB[2], 0x03 ); + YC2 = _mm_blend_epi16( YC2, XC[2], 0x03 ); + YA3 = _mm_blend_epi16( YA3, XA[2], 0x0c ); + YB3 = _mm_blend_epi16( YB3, XB[2], 0x0c ); + YC3 = _mm_blend_epi16( YC3, XC[2], 0x0c ); + + XA[0] = _mm_blend_epi16( YA0, XA[3], 0x0c ); + XB[0] = _mm_blend_epi16( YB0, XB[3], 0x0c ); + XC[0] = _mm_blend_epi16( YC0, XC[3], 0x0c ); + XA[1] = _mm_blend_epi16( YA1, XA[3], 0x30 ); + XB[1] = _mm_blend_epi16( YB1, XB[3], 
0x30 ); + XC[1] = _mm_blend_epi16( YC1, XC[3], 0x30 ); + XA[2] = _mm_blend_epi16( YA2, XA[3], 0xc0 ); + XB[2] = _mm_blend_epi16( YB2, XB[3], 0xc0 ); + XC[2] = _mm_blend_epi16( YC2, XC[3], 0xc0 ); + XA[3] = _mm_blend_epi16( YA3, XA[3], 0x03 ); + XB[3] = _mm_blend_epi16( YB3, XB[3], 0x03 ); + XC[3] = _mm_blend_epi16( YC3, XC[3], 0x03 ); #endif // AVX2 else SSE4_1 - BA[0] = _mm_add_epi32( BA[0], YA0 ); - BB[0] = _mm_add_epi32( BB[0], YB0 ); - BC[0] = _mm_add_epi32( BC[0], YC0 ); - BA[1] = _mm_add_epi32( BA[1], YA1 ); - BB[1] = _mm_add_epi32( BB[1], YB1 ); - BC[1] = _mm_add_epi32( BC[1], YC1 ); - BA[2] = _mm_add_epi32( BA[2], YA2 ); - BB[2] = _mm_add_epi32( BB[2], YB2 ); - BC[2] = _mm_add_epi32( BC[2], YC2 ); - BA[3] = _mm_add_epi32( BA[3], YA3 ); - BB[3] = _mm_add_epi32( BB[3], YB3 ); - BC[3] = _mm_add_epi32( BC[3], YC3 ); - #else // SSE2 - m128_ovly ya[4], yb[4], za[4], zb[4], yc[4], zc[4]; - - XA0 = _mm_set_epi32( ba[15], ba[10], ba[ 5], ba[ 0] ); - XB0 = _mm_set_epi32( bb[15], bb[10], bb[ 5], bb[ 0] ); - XC0 = _mm_set_epi32( bc[15], bc[10], bc[ 5], bc[ 0] ); - XA1 = _mm_set_epi32( ba[ 3], ba[14], ba[ 9], ba[ 4] ); - XB1 = _mm_set_epi32( bb[ 3], bb[14], bb[ 9], bb[ 4] ); - XC1 = _mm_set_epi32( bc[ 3], bc[14], bc[ 9], bc[ 4] ); - XA2 = _mm_set_epi32( ba[ 7], ba[ 2], ba[13], ba[ 8] ); - XB2 = _mm_set_epi32( bb[ 7], bb[ 2], bb[13], bb[ 8] ); - XC2 = _mm_set_epi32( bc[ 7], bc[ 2], bc[13], bc[ 8] ); - XA3 = _mm_set_epi32( ba[11], ba[ 6], ba[ 1], ba[12] ); - XB3 = _mm_set_epi32( bb[11], bb[ 6], bb[ 1], bb[12] ); - XC3 = _mm_set_epi32( bc[11], bc[ 6], bc[ 1], bc[12] ); - - SALSA_8ROUNDS_FINAL_SIMD128_3BUF; - - // Final round doesn't shuffle data back to original input order, - // process it as is. - - ya[0].m128 = XA0; - yb[0].m128 = XB0; - yc[0].m128 = XC0; - ya[1].m128 = XA1; - yb[1].m128 = XB1; - yc[1].m128 = XC1; - ya[2].m128 = XA2; - yb[2].m128 = XB2; - yc[2].m128 = XC2; - ya[3].m128 = XA3; - yb[3].m128 = XB3; - yc[3].m128 = XC3; + m128_ovly ya[4], za[4], yb[4], zb[4], yc[4], zc[4]; + + ya[0].m128 = XA[0]; + yb[0].m128 = XB[0]; + yc[0].m128 = XC[0]; + ya[1].m128 = XA[1]; + yb[1].m128 = XB[1]; + yc[1].m128 = XC[1]; + ya[2].m128 = XA[2]; + yb[2].m128 = XB[2]; + yc[2].m128 = XC[2]; + ya[3].m128 = XA[3]; + yb[3].m128 = XB[3]; + yc[3].m128 = XC[3]; za[0].u32[0] = ya[0].u32[0]; zb[0].u32[0] = yb[0].u32[0]; zc[0].u32[0] = yc[0].u32[0]; - za[0].u32[3] = ya[1].u32[0]; - zb[0].u32[3] = yb[1].u32[0]; - zc[0].u32[3] = yc[1].u32[0]; - za[0].u32[2] = ya[2].u32[0]; - zb[0].u32[2] = yb[2].u32[0]; - zc[0].u32[2] = yc[2].u32[0]; - za[0].u32[1] = ya[3].u32[0]; - zb[0].u32[1] = yb[3].u32[0]; - zc[0].u32[1] = yc[3].u32[0]; - + za[0].u32[1] = ya[3].u32[1]; + zb[0].u32[1] = yb[3].u32[1]; + zc[0].u32[1] = yc[3].u32[1]; + za[0].u32[2] = ya[2].u32[2]; + zb[0].u32[2] = yb[2].u32[2]; + zc[0].u32[2] = yc[2].u32[2]; + za[0].u32[3] = ya[1].u32[3]; + zb[0].u32[3] = yb[1].u32[3]; + zc[0].u32[3] = yc[1].u32[3]; + + za[1].u32[0] = ya[1].u32[0]; + zb[1].u32[0] = yb[1].u32[0]; + zc[1].u32[0] = yc[1].u32[0]; za[1].u32[1] = ya[0].u32[1]; zb[1].u32[1] = yb[0].u32[1]; zc[1].u32[1] = yc[0].u32[1]; - za[1].u32[0] = ya[1].u32[1]; - zb[1].u32[0] = yb[1].u32[1]; - zc[1].u32[0] = yc[1].u32[1]; - za[1].u32[3] = ya[2].u32[1]; - zb[1].u32[3] = yb[2].u32[1]; - zc[1].u32[3] = yc[2].u32[1]; - za[1].u32[2] = ya[3].u32[1]; - zb[1].u32[2] = yb[3].u32[1]; - zc[1].u32[2] = yc[3].u32[1]; - + za[1].u32[2] = ya[3].u32[2]; + zb[1].u32[2] = yb[3].u32[2]; + zc[1].u32[2] = yc[3].u32[2]; + za[1].u32[3] = ya[2].u32[3]; + zb[1].u32[3] = yb[2].u32[3]; + 
zc[1].u32[3] = yc[2].u32[3]; + + za[2].u32[0] = ya[2].u32[0]; + zb[2].u32[0] = yb[2].u32[0]; + zc[2].u32[0] = yc[2].u32[0]; + za[2].u32[1] = ya[1].u32[1]; + zb[2].u32[1] = yb[1].u32[1]; + zc[2].u32[1] = yc[1].u32[1]; za[2].u32[2] = ya[0].u32[2]; zb[2].u32[2] = yb[0].u32[2]; zc[2].u32[2] = yc[0].u32[2]; - za[2].u32[1] = ya[1].u32[2]; - zb[2].u32[1] = yb[1].u32[2]; - zc[2].u32[1] = yc[1].u32[2]; - za[2].u32[0] = ya[2].u32[2]; - zb[2].u32[0] = yb[2].u32[2]; - zc[2].u32[0] = yc[2].u32[2]; - za[2].u32[3] = ya[3].u32[2]; - zb[2].u32[3] = yb[3].u32[2]; - zc[2].u32[3] = yc[3].u32[2]; - + za[2].u32[3] = ya[3].u32[3]; + zb[2].u32[3] = yb[3].u32[3]; + zc[2].u32[3] = yc[3].u32[3]; + + za[3].u32[0] = ya[3].u32[0]; + zb[3].u32[0] = yb[3].u32[0]; + zc[3].u32[0] = yc[3].u32[0]; + za[3].u32[1] = ya[2].u32[1]; + zb[3].u32[1] = yb[2].u32[1]; + zc[3].u32[1] = yc[2].u32[1]; + za[3].u32[2] = ya[1].u32[2]; + zb[3].u32[2] = yb[1].u32[2]; + zc[3].u32[2] = yc[1].u32[2]; za[3].u32[3] = ya[0].u32[3]; zb[3].u32[3] = yb[0].u32[3]; zc[3].u32[3] = yc[0].u32[3]; - za[3].u32[2] = ya[1].u32[3]; - zb[3].u32[2] = yb[1].u32[3]; - zc[3].u32[2] = yc[1].u32[3]; - za[3].u32[1] = ya[2].u32[3]; - zb[3].u32[1] = yb[2].u32[3]; - zc[3].u32[1] = yc[2].u32[3]; - za[3].u32[0] = ya[3].u32[3]; - zb[3].u32[0] = yb[3].u32[3]; - zc[3].u32[0] = yc[3].u32[3]; - - BA[0] = _mm_add_epi32( BA[0], za[0].m128 ); - BB[0] = _mm_add_epi32( BB[0], zb[0].m128 ); - BC[0] = _mm_add_epi32( BC[0], zc[0].m128 ); - BA[1] = _mm_add_epi32( BA[1], za[1].m128 ); - BB[1] = _mm_add_epi32( BB[1], zb[1].m128 ); - BC[1] = _mm_add_epi32( BC[1], zc[1].m128 ); - BA[2] = _mm_add_epi32( BA[2], za[2].m128 ); - BB[2] = _mm_add_epi32( BB[2], zb[2].m128 ); - BC[2] = _mm_add_epi32( BC[2], zc[2].m128 ); - BA[3] = _mm_add_epi32( BA[3], za[3].m128 ); - BB[3] = _mm_add_epi32( BB[3], zb[3].m128 ); - BC[3] = _mm_add_epi32( BC[3], zc[3].m128 ); - -#endif - - #undef ROL_1X32 - #undef ROR_1X32 - #undef SWAP_64 - #undef ROL32 - #undef ADD32 - #undef XOR - #undef TYPE -} - -void scrypt_core_simd128_3buf( uint32_t *X, uint32_t *V, const uint32_t N ) -{ - uint32_t *X0 = X; - uint32_t *X1 = X+32; - uint32_t *X2 = X+64; - uint32_t *V0 = V; - uint32_t *V1 = V + 32*N; - uint32_t *V2 = V + 64*N; - - for ( int i = 0; i < N; i++ ) - { - #if defined(__AVX2__) - - for ( int k = 0; k < 4; k++ ) - { - _mm256_stream_si256( (__m256i*)V0 + i*4 + k, casti_m256i( X0, k ) ); - _mm256_stream_si256( (__m256i*)V1 + i*4 + k, casti_m256i( X1, k ) ); - _mm256_stream_si256( (__m256i*)V2 + i*4 + k, casti_m256i( X2, k ) ); - } - - #else - - memcpy( &V0[ i*32 ], X0, 128 ); - memcpy( &V1[ i*32 ], X1, 128 ); - memcpy( &V2[ i*32 ], X2, 128 ); - - #endif - - salsa8_simd128_3buf( &X0[ 0], &X1[ 0], &X2[ 0], - &X0[16], &X1[16], &X2[16] ); - salsa8_simd128_3buf( &X0[16], &X1[16], &X2[16], - &X0[ 0], &X1[ 0], &X2[ 0] ); - } - - for ( int i = 0; i < N; i++ ) - { - #if defined(__AVX2__) - - const int j0 = 4 * ( X0[16] & ( N - 1 ) ); - const int j1 = 4 * ( X1[16] & ( N - 1 ) ); - const int j2 = 4 * ( X2[16] & ( N - 1 ) ); - - for ( int k = 0; k < 4; k++ ) - { - const __m256i v0 = _mm256_stream_load_si256( ( (__m256i*)V0 ) +j0+k ); - const __m256i v1 = _mm256_stream_load_si256( ( (__m256i*)V1 ) +j1+k ); - const __m256i v2 = _mm256_stream_load_si256( ( (__m256i*)V2 ) +j2+k ); -// const __m256i v0 = _mm256_load_si256( ( (__m256i*)V0 ) +j0+k ); -// const __m256i v1 = _mm256_load_si256( ( (__m256i*)V1 ) +j1+k ); -// const __m256i v2 = _mm256_load_si256( ( (__m256i*)V2 ) +j2+k ); - casti_m256i( X0, k ) = _mm256_xor_si256( casti_m256i( 
X0, k ), v0 ); - casti_m256i( X1, k ) = _mm256_xor_si256( casti_m256i( X1, k ), v1 ); - casti_m256i( X2, k ) = _mm256_xor_si256( casti_m256i( X2, k ), v2 ); - } - - #else - - const int j0 = 8 * ( X0[16] & ( N - 1 ) ); - const int j1 = 8 * ( X1[16] & ( N - 1 ) ); - const int j2 = 8 * ( X2[16] & ( N - 1 ) ); - for ( int k = 0; k < 8; k++ ) - { - const __m128i v0 = _mm_load_si128( ( (__m128i*)V0 ) +j0+k ); - const __m128i v1 = _mm_load_si128( ( (__m128i*)V1 ) +j1+k ); - const __m128i v2 = _mm_load_si128( ( (__m128i*)V2 ) +j2+k ); - casti_m128i( X0, k ) = _mm_xor_si128( casti_m128i( X0, k ), v0 ); - casti_m128i( X1, k ) = _mm_xor_si128( casti_m128i( X1, k ), v1 ); - casti_m128i( X2, k ) = _mm_xor_si128( casti_m128i( X2, k ), v2 ); - } - - #endif - -/* - const int j0 = 16 * ( X0[16] & ( N - 1 ) ); - const int j1 = 16 * ( X1[16] & ( N - 1 ) ); - const int j2 = 16 * ( X2[16] & ( N - 1 ) ); - - for ( int k = 0; k < 16; k++ ) - { - const uint64_t v0 = ( (uint64_t*)V0 )[ j0+k ]; - const uint64_t v1 = ( (uint64_t*)V1 )[ j1+k ]; - const uint64_t v2 = ( (uint64_t*)V2 )[ j2+k ]; - ( (uint64_t*)X0 )[k] ^= v0; - ( (uint64_t*)X1 )[k] ^= v1; - ( (uint64_t*)X2 )[k] ^= v2; - } -*/ - -/* - const int j0 = 32 * ( X0[16] & ( N - 1 ) ); - const int j1 = 32 * ( X1[16] & ( N - 1 ) ); - const int j2 = 32 * ( X2[16] & ( N - 1 ) ); - for ( int k = 0; k < 32; k++ ) - { - const uint32_t v0 = V0[ j0+k ]; - const uint32_t v1 = V1[ j1+k ]; - const uint32_t v2 = V2[ j2+k ]; - X0[k] ^= v0; - X1[k] ^= v1; - X2[k] ^= v2; - } -*/ - - salsa8_simd128_3buf( &X0[ 0], &X1[ 0], &X2[ 0], - &X0[16], &X1[16], &X2[16] ); - salsa8_simd128_3buf( &X0[16], &X1[16], &X2[16], - &X0[ 0], &X1[ 0], &X2[ 0] ); - } -} + XA[0] = za[0].m128; + XB[0] = zb[0].m128; + XC[0] = zc[0].m128; + XA[1] = za[1].m128; + XB[1] = zb[1].m128; + XC[1] = zc[1].m128; + XA[2] = za[2].m128; + XB[2] = zb[2].m128; + XC[2] = zc[2].m128; + XA[3] = za[3].m128; + XB[3] = zb[3].m128; + XC[3] = zc[3].m128; + +#endif +} -// Working. 
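/* A minimal scalar sketch of what the new shuffle/unshuffle helpers do: they
 * only permute the 16 words of each salsa block into the layout the
 * ROL_1X32 / SWAP_64 round macros expect, and back again, matching the index
 * pattern the SSE2 fallback above spells out with _mm_set_epi32.  The tables
 * and function names below are illustrative only, not part of the patch.
 */
static const int salsa_shuf_ref[16] =
   {  0,  5, 10, 15,  4,  9, 14,  3,  8, 13,  2,  7, 12,  1,  6, 11 };
static const int salsa_unshuf_ref[16] =      // inverse of salsa_shuf_ref
   {  0, 13, 10,  7,  4,  1, 14, 11,  8,  5,  2, 15, 12,  9,  6,  3 };

static void salsa_shuffle_ref( uint32_t *d, const uint32_t *s )
{  for ( int i = 0; i < 16; i++ )  d[i] = s[ salsa_shuf_ref[i] ];  }

static void salsa_unshuffle_ref( uint32_t *d, const uint32_t *s )
{  for ( int i = 0; i < 16; i++ )  d[i] = s[ salsa_unshuf_ref[i] ];  }

/* Because the two permutations are exact inverses, shuffling X once before
 * the N loop and unshuffling once after it leaves the final state identical
 * to the unshuffled kernels, while keeping the per-call shuffles out of the
 * inner scrypt loop.
 */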
-// Quadruple buffered, 4x memory usage +// Triple buffered, 3x memory usage // No interleaving -static void salsa8_simd128_4buf( uint32_t *ba, uint32_t *bb, uint32_t *bc, - uint32_t *bd, const uint32_t *ca, const uint32_t *cb, - const uint32_t *cc, const uint32_t *cd ) +static void salsa8_simd128_3buf( uint32_t *ba, uint32_t *bb, uint32_t *bc, + const uint32_t *ca, const uint32_t *cb, const uint32_t *cc ) { __m128i XA0, XA1, XA2, XA3, XB0, XB1, XB2, XB3, - XC0, XC1, XC2, XC3, XD0, XD1, XD2, XD3; + XC0, XC1, XC2, XC3; __m128i *BA = (__m128i*)ba; __m128i *BB = (__m128i*)bb; __m128i *BC = (__m128i*)bc; - __m128i *BD = (__m128i*)bd; const __m128i *CA = (const __m128i*)ca; const __m128i *CB = (const __m128i*)cb; const __m128i *CC = (const __m128i*)cc; - const __m128i *CD = (const __m128i*)cd; // define targets for macros used in round function template #define ROL_1X32 mm128_shufll_32 @@ -3396,397 +3032,42 @@ static void salsa8_simd128_4buf( uint32_t *ba, uint32_t *bb, uint32_t *bc, #define XOR _mm_xor_si128 #define TYPE __m128i - // mix C into B then shuffle B into X - BA[0] = _mm_xor_si128( BA[0], CA[0] ); - BB[0] = _mm_xor_si128( BB[0], CB[0] ); - BC[0] = _mm_xor_si128( BC[0], CC[0] ); - BD[0] = _mm_xor_si128( BD[0], CD[0] ); - BA[1] = _mm_xor_si128( BA[1], CA[1] ); - BB[1] = _mm_xor_si128( BB[1], CB[1] ); - BC[1] = _mm_xor_si128( BC[1], CC[1] ); - BD[1] = _mm_xor_si128( BD[1], CD[1] ); - BA[2] = _mm_xor_si128( BA[2], CA[2] ); - BB[2] = _mm_xor_si128( BB[2], CB[2] ); - BC[2] = _mm_xor_si128( BC[2], CC[2] ); - BD[2] = _mm_xor_si128( BD[2], CD[2] ); - BA[3] = _mm_xor_si128( BA[3], CA[3] ); - BB[3] = _mm_xor_si128( BB[3], CB[3] ); - BC[3] = _mm_xor_si128( BC[3], CC[3] ); - BD[3] = _mm_xor_si128( BD[3], CD[3] ); - -#if defined(__SSE4_1__) - - __m128i YA0, YA1, YA2, YA3, YB0, YB1, YB2, YB3, - YC0, YC1, YC2, YC3, YD0, YD1, YD2, YD3; - -#if defined(__AVX2__) - - YA0 = _mm_blend_epi32( BA[1], BA[0], 0x1 ); - YB0 = _mm_blend_epi32( BB[1], BB[0], 0x1 ); - YC0 = _mm_blend_epi32( BC[1], BC[0], 0x1 ); - YD0 = _mm_blend_epi32( BD[1], BD[0], 0x1 ); - XA0 = _mm_blend_epi32( BA[3], BA[2], 0x4 ); - XB0 = _mm_blend_epi32( BB[3], BB[2], 0x4 ); - XC0 = _mm_blend_epi32( BC[3], BC[2], 0x4 ); - XD0 = _mm_blend_epi32( BD[3], BD[2], 0x4 ); - - YA1 = _mm_blend_epi32( BA[2], BA[1], 0x1 ); - YB1 = _mm_blend_epi32( BB[2], BB[1], 0x1 ); - YC1 = _mm_blend_epi32( BC[2], BC[1], 0x1 ); - YD1 = _mm_blend_epi32( BD[2], BD[1], 0x1 ); - XA1 = _mm_blend_epi32( BA[0], BA[3], 0x4 ); - XB1 = _mm_blend_epi32( BB[0], BB[3], 0x4 ); - XC1 = _mm_blend_epi32( BC[0], BC[3], 0x4 ); - XD1 = _mm_blend_epi32( BD[0], BD[3], 0x4 ); - - YA2 = _mm_blend_epi32( BA[3], BA[2], 0x1 ); - YB2 = _mm_blend_epi32( BB[3], BB[2], 0x1 ); - YC2 = _mm_blend_epi32( BC[3], BC[2], 0x1 ); - YD2 = _mm_blend_epi32( BD[3], BD[2], 0x1 ); - XA2 = _mm_blend_epi32( BA[1], BA[0], 0x4 ); - XB2 = _mm_blend_epi32( BB[1], BB[0], 0x4 ); - XC2 = _mm_blend_epi32( BC[1], BC[0], 0x4 ); - XD2 = _mm_blend_epi32( BD[1], BD[0], 0x4 ); - - YA3 = _mm_blend_epi32( BA[0], BA[3], 0x1 ); - YB3 = _mm_blend_epi32( BB[0], BB[3], 0x1 ); - YC3 = _mm_blend_epi32( BC[0], BC[3], 0x1 ); - YD3 = _mm_blend_epi32( BD[0], BD[3], 0x1 ); - XA3 = _mm_blend_epi32( BA[2], BA[1], 0x4 ); - XB3 = _mm_blend_epi32( BB[2], BB[1], 0x4 ); - XC3 = _mm_blend_epi32( BC[2], BC[1], 0x4 ); - XD3 = _mm_blend_epi32( BD[2], BD[1], 0x4 ); - - XA0 = _mm_blend_epi32( XA0, YA0, 0x3 ); - XB0 = _mm_blend_epi32( XB0, YB0, 0x3 ); - XC0 = _mm_blend_epi32( XC0, YC0, 0x3 ); - XD0 = _mm_blend_epi32( XD0, YD0, 0x3 ); - - XA1 = 
_mm_blend_epi32( XA1, YA1, 0x3 ); - XB1 = _mm_blend_epi32( XB1, YB1, 0x3 ); - XC1 = _mm_blend_epi32( XC1, YC1, 0x3 ); - XD1 = _mm_blend_epi32( XD1, YD1, 0x3 ); - - XA2 = _mm_blend_epi32( XA2, YA2, 0x3 ); - XB2 = _mm_blend_epi32( XB2, YB2, 0x3 ); - XC2 = _mm_blend_epi32( XC2, YC2, 0x3 ); - XD2 = _mm_blend_epi32( XD2, YD2, 0x3 ); - - XA3 = _mm_blend_epi32( XA3, YA3, 0x3 ); - XB3 = _mm_blend_epi32( XB3, YB3, 0x3 ); - XC3 = _mm_blend_epi32( XC3, YC3, 0x3 ); - XD3 = _mm_blend_epi32( XD3, YD3, 0x3 ); - -#else // SSE4_1 - - YA0 = _mm_blend_epi16( BA[1], BA[0], 0x03 ); - YB0 = _mm_blend_epi16( BB[1], BB[0], 0x03 ); - YC0 = _mm_blend_epi16( BC[1], BC[0], 0x03 ); - YD0 = _mm_blend_epi16( BD[1], BD[0], 0x03 ); - XA0 = _mm_blend_epi16( BA[3], BA[2], 0x30 ); - XB0 = _mm_blend_epi16( BB[3], BB[2], 0x30 ); - XC0 = _mm_blend_epi16( BC[3], BC[2], 0x30 ); - XD0 = _mm_blend_epi16( BD[3], BD[2], 0x30 ); - XA0 = _mm_blend_epi16( XA0, YA0, 0x0f ); - XB0 = _mm_blend_epi16( XB0, YB0, 0x0f ); - XC0 = _mm_blend_epi16( XC0, YC0, 0x0f ); - XD0 = _mm_blend_epi16( XD0, YD0, 0x0f ); - - YA1 = _mm_blend_epi16( BA[2], BA[1], 0x03 ); - YB1 = _mm_blend_epi16( BB[2], BB[1], 0x03 ); - YC1 = _mm_blend_epi16( BC[2], BC[1], 0x03 ); - YD1 = _mm_blend_epi16( BD[2], BD[1], 0x03 ); - XA1 = _mm_blend_epi16( BA[0], BA[3], 0x30 ); - XB1 = _mm_blend_epi16( BB[0], BB[3], 0x30 ); - XC1 = _mm_blend_epi16( BC[0], BC[3], 0x30 ); - XD1 = _mm_blend_epi16( BD[0], BD[3], 0x30 ); - XA1 = _mm_blend_epi16( XA1, YA1, 0x0f ); - XB1 = _mm_blend_epi16( XB1, YB1, 0x0f ); - XC1 = _mm_blend_epi16( XC1, YC1, 0x0f ); - XD1 = _mm_blend_epi16( XD1, YD1, 0x0f ); - - YA2 = _mm_blend_epi16( BA[3], BA[2], 0x03 ); - YB2 = _mm_blend_epi16( BB[3], BB[2], 0x03 ); - YC2 = _mm_blend_epi16( BC[3], BC[2], 0x03 ); - YD2 = _mm_blend_epi16( BD[3], BD[2], 0x03 ); - XA2 = _mm_blend_epi16( BA[1], BA[0], 0x30 ); - XB2 = _mm_blend_epi16( BB[1], BB[0], 0x30 ); - XC2 = _mm_blend_epi16( BC[1], BC[0], 0x30 ); - XD2 = _mm_blend_epi16( BD[1], BD[0], 0x30 ); - XA2 = _mm_blend_epi16( XA2, YA2, 0x0f ); - XB2 = _mm_blend_epi16( XB2, YB2, 0x0f ); - XC2 = _mm_blend_epi16( XC2, YC2, 0x0f ); - XD2 = _mm_blend_epi16( XD2, YD2, 0x0f ); - - YA3 = _mm_blend_epi16( BA[0], BA[3], 0x03 ); - YB3 = _mm_blend_epi16( BB[0], BB[3], 0x03 ); - YC3 = _mm_blend_epi16( BC[0], BC[3], 0x03 ); - YD3 = _mm_blend_epi16( BD[0], BD[3], 0x03 ); - XA3 = _mm_blend_epi16( BA[2], BA[1], 0x30 ); - XB3 = _mm_blend_epi16( BB[2], BB[1], 0x30 ); - XC3 = _mm_blend_epi16( BC[2], BC[1], 0x30 ); - XD3 = _mm_blend_epi16( BD[2], BD[1], 0x30 ); - XA3 = _mm_blend_epi16( XA3, YA3, 0x0f ); - XB3 = _mm_blend_epi16( XB3, YB3, 0x0f ); - XC3 = _mm_blend_epi16( XC3, YC3, 0x0f ); - XD3 = _mm_blend_epi16( XD3, YD3, 0x0f ); - -#endif // AVX2 else SSE3_1 - - SALSA_8ROUNDS_SIMD128_4BUF; - -#if defined(__AVX2__) - - YA0 = _mm_blend_epi32( XA0, XA1, 0x8 ); - YB0 = _mm_blend_epi32( XB0, XB1, 0x8 ); - YC0 = _mm_blend_epi32( XC0, XC1, 0x8 ); - YD0 = _mm_blend_epi32( XD0, XD1, 0x8 ); - YA1 = _mm_blend_epi32( XA0, XA1, 0x1 ); - YB1 = _mm_blend_epi32( XB0, XB1, 0x1 ); - YC1 = _mm_blend_epi32( XC0, XC1, 0x1 ); - YD1 = _mm_blend_epi32( XD0, XD1, 0x1 ); - YA2 = _mm_blend_epi32( XA0, XA1, 0x2 ); - YB2 = _mm_blend_epi32( XB0, XB1, 0x2 ); - YC2 = _mm_blend_epi32( XC0, XC1, 0x2 ); - YD2 = _mm_blend_epi32( XD0, XD1, 0x2 ); - YA3 = _mm_blend_epi32( XA0, XA1, 0x4 ); - YB3 = _mm_blend_epi32( XB0, XB1, 0x4 ); - YC3 = _mm_blend_epi32( XC0, XC1, 0x4 ); - YD3 = _mm_blend_epi32( XD0, XD1, 0x4 ); - - YA0 = _mm_blend_epi32( YA0, XA2, 0x4 ); - YB0 = _mm_blend_epi32( YB0, 
XB2, 0x4 ); - YC0 = _mm_blend_epi32( YC0, XC2, 0x4 ); - YD0 = _mm_blend_epi32( YD0, XD2, 0x4 ); - YA1 = _mm_blend_epi32( YA1, XA2, 0x8 ); - YB1 = _mm_blend_epi32( YB1, XB2, 0x8 ); - YC1 = _mm_blend_epi32( YC1, XC2, 0x8 ); - YD1 = _mm_blend_epi32( YD1, XD2, 0x8 ); - YA2 = _mm_blend_epi32( YA2, XA2, 0x1 ); - YB2 = _mm_blend_epi32( YB2, XB2, 0x1 ); - YC2 = _mm_blend_epi32( YC2, XC2, 0x1 ); - YD2 = _mm_blend_epi32( YD2, XD2, 0x1 ); - YA3 = _mm_blend_epi32( YA3, XA2, 0x2 ); - YB3 = _mm_blend_epi32( YB3, XB2, 0x2 ); - YC3 = _mm_blend_epi32( YC3, XC2, 0x2 ); - YD3 = _mm_blend_epi32( YD3, XD2, 0x2 ); - - YA0 = _mm_blend_epi32( YA0, XA3, 0x2 ); - YB0 = _mm_blend_epi32( YB0, XB3, 0x2 ); - YC0 = _mm_blend_epi32( YC0, XC3, 0x2 ); - YD0 = _mm_blend_epi32( YD0, XD3, 0x2 ); - YA1 = _mm_blend_epi32( YA1, XA3, 0x4 ); - YB1 = _mm_blend_epi32( YB1, XB3, 0x4 ); - YC1 = _mm_blend_epi32( YC1, XC3, 0x4 ); - YD1 = _mm_blend_epi32( YD1, XD3, 0x4 ); - YA2 = _mm_blend_epi32( YA2, XA3, 0x8 ); - YB2 = _mm_blend_epi32( YB2, XB3, 0x8 ); - YC2 = _mm_blend_epi32( YC2, XC3, 0x8 ); - YD2 = _mm_blend_epi32( YD2, XD3, 0x8 ); - YA3 = _mm_blend_epi32( YA3, XA3, 0x1 ); - YB3 = _mm_blend_epi32( YB3, XB3, 0x1 ); - YC3 = _mm_blend_epi32( YC3, XC3, 0x1 ); - YD3 = _mm_blend_epi32( YD3, XD3, 0x1 ); - -#else // SSE4_1 - - YA0 = _mm_blend_epi16( XA0, XA1, 0xc0 ); - YB0 = _mm_blend_epi16( XB0, XB1, 0xc0 ); - YC0 = _mm_blend_epi16( XC0, XC1, 0xc0 ); - YD0 = _mm_blend_epi16( XD0, XD1, 0xc0 ); - YA1 = _mm_blend_epi16( XA0, XA1, 0x03 ); - YB1 = _mm_blend_epi16( XB0, XB1, 0x03 ); - YC1 = _mm_blend_epi16( XC0, XC1, 0x03 ); - YD1 = _mm_blend_epi16( XD0, XD1, 0x03 ); - YA2 = _mm_blend_epi16( XA0, XA1, 0x0c ); - YB2 = _mm_blend_epi16( XB0, XB1, 0x0c ); - YC2 = _mm_blend_epi16( XC0, XC1, 0x0c ); - YD2 = _mm_blend_epi16( XD0, XD1, 0x0c ); - YA3 = _mm_blend_epi16( XA0, XA1, 0x30 ); - YB3 = _mm_blend_epi16( XB0, XB1, 0x30 ); - YC3 = _mm_blend_epi16( XC0, XC1, 0x30 ); - YD3 = _mm_blend_epi16( XD0, XD1, 0x30 ); - - YA0 = _mm_blend_epi16( YA0, XA2, 0x30 ); - YB0 = _mm_blend_epi16( YB0, XB2, 0x30 ); - YC0 = _mm_blend_epi16( YC0, XC2, 0x30 ); - YD0 = _mm_blend_epi16( YD0, XD2, 0x30 ); - YA1 = _mm_blend_epi16( YA1, XA2, 0xc0 ); - YB1 = _mm_blend_epi16( YB1, XB2, 0xc0 ); - YC1 = _mm_blend_epi16( YC1, XC2, 0xc0 ); - YD1 = _mm_blend_epi16( YD1, XD2, 0xc0 ); - YA2 = _mm_blend_epi16( YA2, XA2, 0x03 ); - YB2 = _mm_blend_epi16( YB2, XB2, 0x03 ); - YC2 = _mm_blend_epi16( YC2, XC2, 0x03 ); - YD2 = _mm_blend_epi16( YD2, XD2, 0x03 ); - YA3 = _mm_blend_epi16( YA3, XA2, 0x0c ); - YB3 = _mm_blend_epi16( YB3, XB2, 0x0c ); - YC3 = _mm_blend_epi16( YC3, XC2, 0x0c ); - YD3 = _mm_blend_epi16( YD3, XD2, 0x0c ); - - YA0 = _mm_blend_epi16( YA0, XA3, 0x0c ); - YB0 = _mm_blend_epi16( YB0, XB3, 0x0c ); - YC0 = _mm_blend_epi16( YC0, XC3, 0x0c ); - YD0 = _mm_blend_epi16( YD0, XD3, 0x0c ); - YA1 = _mm_blend_epi16( YA1, XA3, 0x30 ); - YB1 = _mm_blend_epi16( YB1, XB3, 0x30 ); - YC1 = _mm_blend_epi16( YC1, XC3, 0x30 ); - YD1 = _mm_blend_epi16( YD1, XD3, 0x30 ); - YA2 = _mm_blend_epi16( YA2, XA3, 0xc0 ); - YB2 = _mm_blend_epi16( YB2, XB3, 0xc0 ); - YC2 = _mm_blend_epi16( YC2, XC3, 0xc0 ); - YD2 = _mm_blend_epi16( YD2, XD3, 0xc0 ); - YA3 = _mm_blend_epi16( YA3, XA3, 0x03 ); - YB3 = _mm_blend_epi16( YB3, XB3, 0x03 ); - YC3 = _mm_blend_epi16( YC3, XC3, 0x03 ); - YD3 = _mm_blend_epi16( YD3, XD3, 0x03 ); - -#endif // AVX2 else SSE4_1 - - BA[0] = _mm_add_epi32( BA[0], YA0 ); - BB[0] = _mm_add_epi32( BB[0], YB0 ); - BC[0] = _mm_add_epi32( BC[0], YC0 ); - BD[0] = _mm_add_epi32( BD[0], YD0 ); - 
BA[1] = _mm_add_epi32( BA[1], YA1 ); - BB[1] = _mm_add_epi32( BB[1], YB1 ); - BC[1] = _mm_add_epi32( BC[1], YC1 ); - BD[1] = _mm_add_epi32( BD[1], YD1 ); - BA[2] = _mm_add_epi32( BA[2], YA2 ); - BB[2] = _mm_add_epi32( BB[2], YB2 ); - BC[2] = _mm_add_epi32( BC[2], YC2 ); - BD[2] = _mm_add_epi32( BD[2], YD2 ); - BA[3] = _mm_add_epi32( BA[3], YA3 ); - BB[3] = _mm_add_epi32( BB[3], YB3 ); - BC[3] = _mm_add_epi32( BC[3], YC3 ); - BD[3] = _mm_add_epi32( BD[3], YD3 ); - -#else // SSE2 - - m128_ovly ya[4], yb[4], za[4], zb[4], yc[4], zc[4], yd[4], zd[4]; - - XA0 = _mm_set_epi32( ba[15], ba[10], ba[ 5], ba[ 0] ); - XB0 = _mm_set_epi32( bb[15], bb[10], bb[ 5], bb[ 0] ); - XC0 = _mm_set_epi32( bc[15], bc[10], bc[ 5], bc[ 0] ); - XD0 = _mm_set_epi32( bd[15], bd[10], bd[ 5], bd[ 0] ); - XA1 = _mm_set_epi32( ba[ 3], ba[14], ba[ 9], ba[ 4] ); - XB1 = _mm_set_epi32( bb[ 3], bb[14], bb[ 9], bb[ 4] ); - XC1 = _mm_set_epi32( bc[ 3], bc[14], bc[ 9], bc[ 4] ); - XD1 = _mm_set_epi32( bd[ 3], bd[14], bd[ 9], bd[ 4] ); - XA2 = _mm_set_epi32( ba[ 7], ba[ 2], ba[13], ba[ 8] ); - XB2 = _mm_set_epi32( bb[ 7], bb[ 2], bb[13], bb[ 8] ); - XC2 = _mm_set_epi32( bc[ 7], bc[ 2], bc[13], bc[ 8] ); - XD2 = _mm_set_epi32( bd[ 7], bd[ 2], bd[13], bd[ 8] ); - XA3 = _mm_set_epi32( ba[11], ba[ 6], ba[ 1], ba[12] ); - XB3 = _mm_set_epi32( bb[11], bb[ 6], bb[ 1], bb[12] ); - XC3 = _mm_set_epi32( bc[11], bc[ 6], bc[ 1], bc[12] ); - XD3 = _mm_set_epi32( bd[11], bd[ 6], bd[ 1], bd[12] ); - - SALSA_8ROUNDS_FINAL_SIMD128_4BUF; - - ya[0].m128 = XA0; - yb[0].m128 = XB0; - yc[0].m128 = XC0; - yd[0].m128 = XD0; - ya[1].m128 = XA1; - yb[1].m128 = XB1; - yc[1].m128 = XC1; - yd[1].m128 = XD1; - ya[2].m128 = XA2; - yb[2].m128 = XB2; - yc[2].m128 = XC2; - yd[2].m128 = XD2; - ya[3].m128 = XA3; - yb[3].m128 = XB3; - yc[3].m128 = XC3; - yd[3].m128 = XD3; - - za[0].u32[0] = ya[0].u32[0]; - zb[0].u32[0] = yb[0].u32[0]; - zc[0].u32[0] = yc[0].u32[0]; - zd[0].u32[0] = yd[0].u32[0]; - za[0].u32[3] = ya[1].u32[0]; - zb[0].u32[3] = yb[1].u32[0]; - zc[0].u32[3] = yc[1].u32[0]; - zd[0].u32[3] = yd[1].u32[0]; - za[0].u32[2] = ya[2].u32[0]; - zb[0].u32[2] = yb[2].u32[0]; - zc[0].u32[2] = yc[2].u32[0]; - zd[0].u32[2] = yd[2].u32[0]; - za[0].u32[1] = ya[3].u32[0]; - zb[0].u32[1] = yb[3].u32[0]; - zc[0].u32[1] = yc[3].u32[0]; - zd[0].u32[1] = yd[3].u32[0]; - - za[1].u32[1] = ya[0].u32[1]; - zb[1].u32[1] = yb[0].u32[1]; - zc[1].u32[1] = yc[0].u32[1]; - zd[1].u32[1] = yd[0].u32[1]; - za[1].u32[0] = ya[1].u32[1]; - zb[1].u32[0] = yb[1].u32[1]; - zc[1].u32[0] = yc[1].u32[1]; - zd[1].u32[0] = yd[1].u32[1]; - za[1].u32[3] = ya[2].u32[1]; - zb[1].u32[3] = yb[2].u32[1]; - zc[1].u32[3] = yc[2].u32[1]; - zd[1].u32[3] = yd[2].u32[1]; - za[1].u32[2] = ya[3].u32[1]; - zb[1].u32[2] = yb[3].u32[1]; - zc[1].u32[2] = yc[3].u32[1]; - zd[1].u32[2] = yd[3].u32[1]; + XA0 = BA[0] = _mm_xor_si128( BA[0], CA[0] ); + XB0 = BB[0] = _mm_xor_si128( BB[0], CB[0] ); + XC0 = BC[0] = _mm_xor_si128( BC[0], CC[0] ); + XA1 = BA[1] = _mm_xor_si128( BA[1], CA[1] ); + XB1 = BB[1] = _mm_xor_si128( BB[1], CB[1] ); + XC1 = BC[1] = _mm_xor_si128( BC[1], CC[1] ); + XA2 = BA[2] = _mm_xor_si128( BA[2], CA[2] ); + XB2 = BB[2] = _mm_xor_si128( BB[2], CB[2] ); + XC2 = BC[2] = _mm_xor_si128( BC[2], CC[2] ); + XA3 = BA[3] = _mm_xor_si128( BA[3], CA[3] ); + XB3 = BB[3] = _mm_xor_si128( BB[3], CB[3] ); + XC3 = BC[3] = _mm_xor_si128( BC[3], CC[3] ); + +#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) + + SALSA_8ROUNDS_SIMD128_3BUF; - za[2].u32[2] = ya[0].u32[2]; - 
zb[2].u32[2] = yb[0].u32[2]; - zc[2].u32[2] = yc[0].u32[2]; - zd[2].u32[2] = yd[0].u32[2]; - za[2].u32[1] = ya[1].u32[2]; - zb[2].u32[1] = yb[1].u32[2]; - zc[2].u32[1] = yc[1].u32[2]; - zd[2].u32[1] = yd[1].u32[2]; - za[2].u32[0] = ya[2].u32[2]; - zb[2].u32[0] = yb[2].u32[2]; - zc[2].u32[0] = yc[2].u32[2]; - zd[2].u32[0] = yd[2].u32[2]; - za[2].u32[3] = ya[3].u32[2]; - zb[2].u32[3] = yb[3].u32[2]; - zc[2].u32[3] = yc[3].u32[2]; - zd[2].u32[3] = yd[3].u32[2]; +#else - za[3].u32[3] = ya[0].u32[3]; - zb[3].u32[3] = yb[0].u32[3]; - zc[3].u32[3] = yc[0].u32[3]; - zd[3].u32[3] = yd[0].u32[3]; - za[3].u32[2] = ya[1].u32[3]; - zb[3].u32[2] = yb[1].u32[3]; - zc[3].u32[2] = yc[1].u32[3]; - zd[3].u32[2] = yd[1].u32[3]; - za[3].u32[1] = ya[2].u32[3]; - zb[3].u32[1] = yb[2].u32[3]; - zc[3].u32[1] = yc[2].u32[3]; - zd[3].u32[1] = yd[2].u32[3]; - za[3].u32[0] = ya[3].u32[3]; - zb[3].u32[0] = yb[3].u32[3]; - zc[3].u32[0] = yc[3].u32[3]; - zd[3].u32[0] = yd[3].u32[3]; - - BA[0] = _mm_add_epi32( BA[0], za[0].m128 ); - BB[0] = _mm_add_epi32( BB[0], zb[0].m128 ); - BC[0] = _mm_add_epi32( BC[0], zc[0].m128 ); - BD[0] = _mm_add_epi32( BD[0], zd[0].m128 ); - BA[1] = _mm_add_epi32( BA[1], za[1].m128 ); - BB[1] = _mm_add_epi32( BB[1], zb[1].m128 ); - BC[1] = _mm_add_epi32( BC[1], zc[1].m128 ); - BD[1] = _mm_add_epi32( BD[1], zd[1].m128 ); - BA[2] = _mm_add_epi32( BA[2], za[2].m128 ); - BB[2] = _mm_add_epi32( BB[2], zb[2].m128 ); - BC[2] = _mm_add_epi32( BC[2], zc[2].m128 ); - BD[2] = _mm_add_epi32( BD[2], zd[2].m128 ); - BA[3] = _mm_add_epi32( BA[3], za[3].m128 ); - BB[3] = _mm_add_epi32( BB[3], zb[3].m128 ); - BC[3] = _mm_add_epi32( BC[3], zc[3].m128 ); - BD[3] = _mm_add_epi32( BD[3], zd[3].m128 ); + SALSA_8ROUNDS_SIMD128_3BUF_SLOROT; #endif + BA[0] = _mm_add_epi32( BA[0], XA0 ); + BB[0] = _mm_add_epi32( BB[0], XB0 ); + BC[0] = _mm_add_epi32( BC[0], XC0 ); + BA[1] = _mm_add_epi32( BA[1], XA1 ); + BB[1] = _mm_add_epi32( BB[1], XB1 ); + BC[1] = _mm_add_epi32( BC[1], XC1 ); + BA[2] = _mm_add_epi32( BA[2], XA2 ); + BB[2] = _mm_add_epi32( BB[2], XB2 ); + BC[2] = _mm_add_epi32( BC[2], XC2 ); + BA[3] = _mm_add_epi32( BA[3], XA3 ); + BB[3] = _mm_add_epi32( BB[3], XB3 ); + BC[3] = _mm_add_epi32( BC[3], XC3 ); + #undef ROL_1X32 #undef ROR_1X32 #undef SWAP_64 @@ -3796,105 +3077,108 @@ static void salsa8_simd128_4buf( uint32_t *ba, uint32_t *bb, uint32_t *bc, #undef TYPE } -void scrypt_core_simd128_4buf( uint32_t *X, uint32_t *V, const uint32_t N ) +void scrypt_core_simd128_3buf( uint32_t *X, uint32_t *V, const uint32_t N ) { - uint32_t *X0 = X; - uint32_t *X1 = X+32; - uint32_t *X2 = X+64; - uint32_t *X3 = X+96; - uint32_t *V0 = V; - uint32_t *V1 = V + 32*N; - uint32_t *V2 = V + 64*N; - uint32_t *V3 = V + 96*N; - - for ( int i = 0; i < N; i++ ) + uint32_t *X0 = X; + uint32_t *X1 = X+32; + uint32_t *X2 = X+64; + uint32_t *V0 = V; + uint32_t *V1 = V + 32*N; + uint32_t *V2 = V + 64*N; + + salsa_simd128_shuffle_3buf( X0, X1, X2 ); + salsa_simd128_shuffle_3buf( X0+16, X1+16, X2+16 ); + + for ( int n = 0; n < N; n++ ) { - for ( int k = 0; k < 8; k++ ) + #if defined(__AVX__) + + for ( int i = 0; i < 4; i++ ) { - _mm_stream_si128( (__m128i*)V0 + i*8 + k, casti_m128i( X0, k ) ); - _mm_stream_si128( (__m128i*)V1 + i*8 + k, casti_m128i( X1, k ) ); - _mm_stream_si128( (__m128i*)V2 + i*8 + k, casti_m128i( X2, k ) ); - _mm_stream_si128( (__m128i*)V3 + i*8 + k, casti_m128i( X3, k ) ); + _mm256_stream_si256( (__m256i*)V0 + n*4 + i, casti_m256i( X0, i ) ); + _mm256_stream_si256( (__m256i*)V1 + n*4 + i, casti_m256i( X1, i ) ); + 
_mm256_stream_si256( (__m256i*)V2 + n*4 + i, casti_m256i( X2, i ) ); } - salsa8_simd128_4buf( &X0[ 0], &X1[ 0], &X2[ 0], &X3[ 0], - &X0[16], &X1[16], &X2[16], &X3[16] ); - salsa8_simd128_4buf( &X0[16], &X1[16], &X2[16], &X3[16], - &X0[ 0], &X1[ 0], &X2[ 0], &X3[ 0] ); - } - for ( int i = 0; i < N; i++ ) - { - #if defined(__AVX2__) - - const int j0 = 4 * ( X0[16] & ( N - 1 ) ); - const int j1 = 4 * ( X1[16] & ( N - 1 ) ); - const int j2 = 4 * ( X2[16] & ( N - 1 ) ); - const int j3 = 4 * ( X3[16] & ( N - 1 ) ); + #elif defined(__SSE4_1__) - for ( int k = 0; k < 4; k++ ) + for ( int i = 0; i < 8; i++ ) { - const __m256i v0 = _mm256_stream_load_si256( ( (__m256i*)V0 ) +j0+k ); - const __m256i v1 = _mm256_stream_load_si256( ( (__m256i*)V1 ) +j1+k ); - const __m256i v2 = _mm256_stream_load_si256( ( (__m256i*)V2 ) +j2+k ); - const __m256i v3 = _mm256_stream_load_si256( ( (__m256i*)V3 ) +j3+k ); - casti_m256i( X0, k ) = _mm256_xor_si256( casti_m256i( X0, k ), v0 ); - casti_m256i( X1, k ) = _mm256_xor_si256( casti_m256i( X1, k ), v1 ); - casti_m256i( X2, k ) = _mm256_xor_si256( casti_m256i( X2, k ), v2 ); - casti_m256i( X3, k ) = _mm256_xor_si256( casti_m256i( X3, k ), v3 ); + _mm_stream_si128( (__m128i*)V0 + n*8 + i, casti_m128i( X0, i ) ); + _mm_stream_si128( (__m128i*)V1 + n*8 + i, casti_m128i( X1, i ) ); + _mm_stream_si128( (__m128i*)V2 + n*8 + i, casti_m128i( X2, i ) ); } #else - - const int j0 = 8 * ( X0[16] & ( N - 1 ) ); - const int j1 = 8 * ( X1[16] & ( N - 1 ) ); - const int j2 = 8 * ( X2[16] & ( N - 1 ) ); - const int j3 = 8 * ( X3[16] & ( N - 1 ) ); - for ( int k = 0; k < 8; k++ ) - { - #if defined(__SSE4_1__) - const __m128i v0 = _mm_stream_load_si128( ( (__m128i*)V0 ) +j0+k ); - const __m128i v1 = _mm_stream_load_si128( ( (__m128i*)V1 ) +j1+k ); - const __m128i v2 = _mm_stream_load_si128( ( (__m128i*)V2 ) +j2+k ); - const __m128i v3 = _mm_stream_load_si128( ( (__m128i*)V3 ) +j3+k ); - #else - const __m128i v0 = _mm_load_si128( ( (__m128i*)V0 ) +j0+k ); - const __m128i v1 = _mm_load_si128( ( (__m128i*)V1 ) +j1+k ); - const __m128i v2 = _mm_load_si128( ( (__m128i*)V2 ) +j2+k ); - const __m128i v3 = _mm_load_si128( ( (__m128i*)V3 ) +j3+k ); - #endif - casti_m128i( X0, k ) = _mm_xor_si128( casti_m128i( X0, k ), v0 ); - casti_m128i( X1, k ) = _mm_xor_si128( casti_m128i( X1, k ), v1 ); - casti_m128i( X2, k ) = _mm_xor_si128( casti_m128i( X2, k ), v2 ); - casti_m128i( X3, k ) = _mm_xor_si128( casti_m128i( X3, k ), v3 ); - } + memcpy( &V0[ n*32 ], X0, 128 ); + memcpy( &V1[ n*32 ], X1, 128 ); + memcpy( &V2[ n*32 ], X2, 128 ); - #endif + #endif -/* - const int j0 = 16 * ( X0[16] & ( N - 1 ) ); - const int j1 = 16 * ( X1[16] & ( N - 1 ) ); - const int j2 = 16 * ( X2[16] & ( N - 1 ) ); - const int j3 = 16 * ( X3[16] & ( N - 1 ) ); + salsa8_simd128_3buf( X0, X1, X2 , X0+16, X1+16, X2+16 ); + salsa8_simd128_3buf( X0+16, X1+16, X2+16, X0, X1, X2 ); + } - for ( int k = 0; k < 16; k++ ) + for ( int n = 0; n < N; n++ ) + { + #if defined(__AVX2__) + + const int j0 = 4 * ( X0[16] & ( N-1 ) ); + const int j1 = 4 * ( X1[16] & ( N-1 ) ); + const int j2 = 4 * ( X2[16] & ( N-1 ) ); + + const __m256i v00 = _mm256_stream_load_si256( ( (__m256i*)V0 ) +j0 ); + const __m256i v10 = _mm256_stream_load_si256( ( (__m256i*)V1 ) +j1 ); + const __m256i v20 = _mm256_stream_load_si256( ( (__m256i*)V2 ) +j2 ); + const __m256i v01 = _mm256_stream_load_si256( ( (__m256i*)V0 ) +j0+1 ); + const __m256i v11 = _mm256_stream_load_si256( ( (__m256i*)V1 ) +j1+1 ); + const __m256i v21 = _mm256_stream_load_si256( ( (__m256i*)V2 ) 
+j2+1 ); + const __m256i v02 = _mm256_stream_load_si256( ( (__m256i*)V0 ) +j0+2 ); + const __m256i v12 = _mm256_stream_load_si256( ( (__m256i*)V1 ) +j1+2 ); + const __m256i v22 = _mm256_stream_load_si256( ( (__m256i*)V2 ) +j2+2 ); + const __m256i v03 = _mm256_stream_load_si256( ( (__m256i*)V0 ) +j0+3 ); + const __m256i v13 = _mm256_stream_load_si256( ( (__m256i*)V1 ) +j1+3 ); + const __m256i v23 = _mm256_stream_load_si256( ( (__m256i*)V2 ) +j2+3 ); + + casti_m256i( X0, 0 ) = _mm256_xor_si256( casti_m256i( X0, 0 ), v00 ); + casti_m256i( X1, 0 ) = _mm256_xor_si256( casti_m256i( X1, 0 ), v10 ); + casti_m256i( X2, 0 ) = _mm256_xor_si256( casti_m256i( X2, 0 ), v20 ); + casti_m256i( X0, 1 ) = _mm256_xor_si256( casti_m256i( X0, 1 ), v01 ); + casti_m256i( X1, 1 ) = _mm256_xor_si256( casti_m256i( X1, 1 ), v11 ); + casti_m256i( X2, 1 ) = _mm256_xor_si256( casti_m256i( X2, 1 ), v21 ); + casti_m256i( X0, 2 ) = _mm256_xor_si256( casti_m256i( X0, 2 ), v02 ); + casti_m256i( X1, 2 ) = _mm256_xor_si256( casti_m256i( X1, 2 ), v12 ); + casti_m256i( X2, 2 ) = _mm256_xor_si256( casti_m256i( X2, 2 ), v22 ); + casti_m256i( X0, 3 ) = _mm256_xor_si256( casti_m256i( X0, 3 ), v03 ); + casti_m256i( X1, 3 ) = _mm256_xor_si256( casti_m256i( X1, 3 ), v13 ); + casti_m256i( X2, 3 ) = _mm256_xor_si256( casti_m256i( X2, 3 ), v23 ); + + #else + + const int j0 = 8 * ( X0[16] & ( N-1 ) ); + const int j1 = 8 * ( X1[16] & ( N-1 ) ); + const int j2 = 8 * ( X2[16] & ( N-1 ) ); + for ( int i = 0; i < 8; i++ ) { - const uint64_t v0 = ( (uint64_t*)V0 )[ j0+k ]; - const uint64_t v1 = ( (uint64_t*)V1 )[ j1+k ]; - const uint64_t v2 = ( (uint64_t*)V2 )[ j2+k ]; - const uint64_t v3 = ( (uint64_t*)V3 )[ j3+k ]; - ( (uint64_t*)X0 )[k] ^= v0; - ( (uint64_t*)X1 )[k] ^= v1; - ( (uint64_t*)X2 )[k] ^= v2; - ( (uint64_t*)X3 )[k] ^= v3; + const __m128i v0 = _mm_load_si128( ( (__m128i*)V0 ) +j0+i ); + const __m128i v1 = _mm_load_si128( ( (__m128i*)V1 ) +j1+i ); + const __m128i v2 = _mm_load_si128( ( (__m128i*)V2 ) +j2+i ); + casti_m128i( X0, i ) = _mm_xor_si128( casti_m128i( X0, i ), v0 ); + casti_m128i( X1, i ) = _mm_xor_si128( casti_m128i( X1, i ), v1 ); + casti_m128i( X2, i ) = _mm_xor_si128( casti_m128i( X2, i ), v2 ); } -*/ - salsa8_simd128_4buf( &X0[ 0], &X1[ 0], &X2[ 0], &X3[ 0], - &X0[16], &X1[16], &X2[16], &X3[16] ); - salsa8_simd128_4buf( &X0[16], &X1[16], &X2[16], &X3[16], - &X0[ 0], &X1[ 0], &X2[ 0], &X3[ 0] ); + #endif + + salsa8_simd128_3buf( X0, X1, X2 , X0+16, X1+16, X2+16 ); + salsa8_simd128_3buf( X0+16, X1+16, X2+16, X0, X1, X2 ); } + + salsa_simd128_unshuffle_3buf( X0, X1, X2 ); + salsa_simd128_unshuffle_3buf( X0+16, X1+16, X2+16 ); + } @@ -3961,17 +3245,17 @@ static void xor_salsa8(uint32_t * const B, const uint32_t * const C) void scrypt_core_1way( uint32_t *X, uint32_t *V, const uint32_t N ) { - for ( int i = 0; i < N; i++ ) + for ( int n = 0; n < N; n++ ) { - memcpy( &V[i * 32], X, 128 ); + memcpy( &V[ n*32 ], X, 128 ); xor_salsa8( &X[ 0], &X[16] ); xor_salsa8( &X[16], &X[ 0] ); } - for ( int i = 0; i < N; i++ ) + for ( int n = 0; n < N; n++ ) { int j = 32 * ( X[16] & ( N - 1 ) ); - for ( int k = 0; k < 32; k++ ) - X[k] ^= V[j + k]; + for ( int i = 0; i < 32; i++ ) + X[i] ^= V[ j+i ]; xor_salsa8( &X[ 0], &X[16] ); xor_salsa8( &X[16], &X[ 0] ); } diff --git a/algo/scrypt/scrypt.c b/algo/scrypt/scrypt.c index a15b5cb1..e919ccb3 100644 --- a/algo/scrypt/scrypt.c +++ b/algo/scrypt/scrypt.c @@ -146,6 +146,119 @@ static inline void PBKDF2_SHA256_128_32(uint32_t *tstate, uint32_t *ostate, output[i] = bswap_32( ostate[i] ); } +#if 
defined(__SHA__) + +static inline void HMAC_SHA256_80_init_SHA_2BUF( const uint32_t *key0, + const uint32_t *key1, uint32_t *tstate0, uint32_t *tstate1, + uint32_t *ostate0, uint32_t *ostate1 ) +{ + uint32_t ihash0[8], ihash1[8], pad0[16], pad1[16]; + int i; + + memcpy( pad0, key0 + 16, 16 ); + memcpy( pad0 + 4, keypad, 48 ); + memcpy( pad1, key1 + 16, 16 ); + memcpy( pad1 + 4, keypad, 48 ); + + sha256_ni2way_transform_le( tstate0, tstate1, pad0, pad1, + tstate0, tstate1 ); + + memcpy( ihash0, tstate0, 32 ); + memcpy( ihash1, tstate1, 32 ); + + for ( i = 0; i < 8; i++ ) + { + pad0[i] = ihash0[i] ^ 0x5c5c5c5c; + pad1[i] = ihash1[i] ^ 0x5c5c5c5c; + } + for ( ; i < 16; i++ ) pad0[i] = pad1[i] = 0x5c5c5c5c; + + sha256_ni2way_transform_le( ostate0, ostate1, pad0, pad1, + sha256_initial_state, sha256_initial_state ); + + for ( i = 0; i < 8; i++ ) + { + pad0[i] = ihash0[i] ^ 0x36363636; + pad1[i] = ihash1[i] ^ 0x36363636; + } + for ( ; i < 16; i++ ) pad0[i] = pad1[i] = 0x36363636; + + sha256_ni2way_transform_le( tstate0, tstate1, pad0, pad1, + sha256_initial_state, sha256_initial_state ); +} + +static inline void PBKDF2_SHA256_80_128_SHA_2BUF( const uint32_t *tstate0, + const uint32_t *tstate1, uint32_t *ostate0, uint32_t *ostate1, + const uint32_t *salt0, const uint32_t *salt1, uint32_t *output0, + uint32_t *output1 ) +{ + uint32_t istate0[8], istate1[8], ostateb0[8], ostateb1[8]; + uint32_t ibuf0[16], obuf0[16], ibuf1[16], obuf1[16]; + int i, j; + + sha256_ni2way_transform_le( istate0, istate1, salt0, salt1, + tstate0, tstate1 ); + + memcpy( ibuf0, salt0 + 16, 16 ); + memcpy( ibuf0 + 5, innerpad, 44 ); + memcpy( obuf0 + 8, outerpad, 32 ); + memcpy( ibuf1, salt1 + 16, 16 ); + memcpy( ibuf1 + 5, innerpad, 44 ); + memcpy( obuf1 + 8, outerpad, 32 ); + + for ( i = 0; i < 4; i++ ) + { + memcpy( obuf0, istate0, 32 ); + memcpy( obuf1, istate1, 32 ); + ibuf0[4] = ibuf1[4] = i + 1; + + sha256_ni2way_transform_le( obuf0, obuf1, ibuf0, ibuf1, + obuf0, obuf1 ); + sha256_ni2way_transform_le( ostateb0, ostateb1, obuf0, obuf1, + ostate0, ostate1 ); + + for ( j = 0; j < 8; j++ ) + { + output0[ 8*i + j ] = bswap_32( ostateb0[j] ); + output1[ 8*i + j ] = bswap_32( ostateb1[j] ); + } + } +} + +static inline void PBKDF2_SHA256_128_32_SHA_2BUF( uint32_t *tstate0, + uint32_t *tstate1, uint32_t *ostate0, uint32_t *ostate1, + const uint32_t *salt0, const uint32_t *salt1, + uint32_t *output0, uint32_t *output1 ) +{ + uint32_t buf0[16], buf1[16]; + int i; + + sha256_ni2way_transform_be( tstate0, tstate1, salt0, salt1, + tstate0, tstate1 ); + sha256_ni2way_transform_be( tstate0, tstate1, salt0+16, salt1+16, + tstate0, tstate1 ); + sha256_ni2way_transform_le( tstate0, tstate1, finalblk, finalblk, + tstate0, tstate1 ); + + memcpy( buf0, tstate0, 32 ); + memcpy( buf0 + 8, outerpad, 32 ); + memcpy( buf1, tstate1, 32 ); + memcpy( buf1 + 8, outerpad, 32 ); + + sha256_ni2way_transform_le( ostate0, ostate1, buf0, buf1, + ostate0, ostate1 ); + + for ( i = 0; i < 8; i++ ) + { + output0[i] = bswap_32( ostate0[i] ); + output1[i] = bswap_32( ostate1[i] ); + } +} + + + +#endif + #ifdef HAVE_SHA256_4WAY static const uint32_t keypad_4way[4 * 12] = { @@ -643,10 +756,10 @@ static bool scrypt_N_1_1_256(const uint32_t *input, uint32_t *output, static int scrypt_N_1_1_256_8way( const uint32_t *input, uint32_t *output, uint32_t *midstate, unsigned char *scratchpad, int N, int thrid ) { - uint32_t _ALIGN(128) tstate[8 * 8]; - uint32_t _ALIGN(128) ostate[8 * 8]; - uint32_t _ALIGN(128) W[8 * 32]; - uint32_t _ALIGN(128) X[8 * 32]; + uint32_t 
_ALIGN(128) tstate[ 8*8 ]; + uint32_t _ALIGN(128) ostate[ 8*8 ]; + uint32_t _ALIGN(128) W[ 8*32 ]; + uint32_t _ALIGN(128) X[ 8*32 ]; uint32_t *V = (uint32_t*)scratchpad; intrlv_8x32( W, input, input+ 20, input+ 40, input+ 60, @@ -658,53 +771,45 @@ static int scrypt_N_1_1_256_8way( const uint32_t *input, uint32_t *output, PBKDF2_SHA256_80_128_8way( tstate, ostate, W, W ); dintrlv_8x32( X, X+32, X+64, X+96, X+128, X+160, X+192, X+224, W, 1024 ); + + if ( opt_param_n > 0x4000 ) + { + scrypt_core_simd128_3buf( X, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128_3buf( X+ 96, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128_2buf( X+192, V, N ); + } + else + { + intrlv_2x128( W, X, X+ 32, 1024 ); + intrlv_2x128( W+ 64, X+ 64, X+ 96, 1024 ); + intrlv_2x128( W+128, X+128, X+160, 1024 ); + intrlv_2x128( W+192, X+192, X+224, 1024 ); + scrypt_core_2way_simd128( (__m256i*) W, (__m256i*)V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_2way_simd128( (__m256i*)(W+ 64), (__m256i*)V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_2way_simd128( (__m256i*)(W+128), (__m256i*)V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_2way_simd128( (__m256i*)(W+192), (__m256i*)V, N ); + dintrlv_2x128( X, X+ 32, W, 1024 ); + dintrlv_2x128( X+ 64, X+ 96, W+ 64, 1024 ); + dintrlv_2x128( X+128, X+160, W+128, 1024 ); + dintrlv_2x128( X+192, X+224, W+192, 1024 ); + } + // SCRYPT CORE - - // AVX512 - -/* - // AVX512 16 way working - intrlv_16x32( W, X, X+32, X+64, X+96, X+128, X+160, X+192, X+224, - X+256, X+256+32, X+256+64, X+256+96, X+256+128, - X+256+160, X+256+192, X+256+224, 1024 ); - - scrypt_core_16way( (__m512i*)W , (__m512i*)V, N ); - - dintrlv_16x32( X, X+32, X+64, X+96, X+128, X+160, X+192, X+224, - X+256, X+256+32, X+256+64, X+256+96, X+256+128, - X+256+160, X+256+192, X+256+224, W, 1024 ); -*/ -/* - // AVX512 working - intrlv_4x32( W, X, X+ 32, X+ 64, X+ 96, 1024 ); - intrlv_4x32( W+128, X+128, X+160, X+192, X+224, 1024 ); - scrypt_core_simd128_4way( (__m128i*)W, (__m128i*)V, N ); - if ( work_restart[thrid].restart ) return 0; - scrypt_core_simd128_4way( (__m128i*)(W+128), (__m128i*)V, N ); - dintrlv_4x32( X, X+ 32, X+ 64, X+ 96, W, 1024 ); - dintrlv_4x32( X+128, X+160, X+192, X+224, W+128, 1024 ); -*/ -/* - // AVX512, not working, very slow - intrlv_4x128( W, X, X+ 32, X+ 64, X+ 96, 1024 ); - intrlv_4x128( W+128, X+128, X+160, X+192, X+224, 1024 ); - scrypt_core_4way_simd128( (__m512i*)W, (__m512i*)V, N ); - if ( work_restart[thrid].restart ) return 0; - scrypt_core_4way_simd128( (__m512i*)(W+128), (__m512i*)V, N ); - dintrlv_4x128( X, X+ 32, X+ 64, X+ 96, W, 1024 ); - dintrlv_4x128( X+128, X+160, X+192, X+224, W+128, 1024 ); -*/ - // AVX2 -/* + // AVX2 // disable de/interleave for testing. 
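/* The active path above now selects its scrypt core by N: for
 * opt_param_n > 0x4000 the lanes are run through the sequential 3-buffer /
 * 2-buffer kernels, otherwise pairs of lanes are interleaved 2x128 for the
 * 2-way SIMD128 kernel (presumably a scratchpad-footprint trade-off).  A
 * minimal sketch of the same dispatch for a single pair of lanes, using only
 * helpers that appear in this patch; the wrapper name is illustrative only.
 */
static inline void scrypt_core_pair_sketch( uint32_t *X, uint32_t *W,
                                            uint32_t *V, const uint32_t N )
{
   if ( N > 0x4000 )
   {
      // two sequential 32-word lanes side by side: X = lane 0, X+32 = lane 1
      scrypt_core_simd128_2buf( X, V, N );
   }
   else
   {
      // interleave the pair 2x128, run the 2-way kernel, de-interleave
      intrlv_2x128( W, X, X+32, 1024 );
      scrypt_core_2way_simd128( (__m256i*)W, (__m256i*)V, N );
      dintrlv_2x128( X, X+32, W, 1024 );
   }
}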
- scrypt_core_8way( (__m256i*)W , (__m256i*)V, N ); -*/ +// scrypt_core_8way( (__m256i*)W , (__m256i*)V, N ); + /* // AVX2 working @@ -714,23 +819,18 @@ static int scrypt_N_1_1_256_8way( const uint32_t *input, uint32_t *output, intrlv_2x128( W+192, X+192, X+224, 1024 ); // working -// scrypt_core_2way_simd128_3buf( (__m256i*) W, (__m256i*)V, N ); +// scrypt_core_2way_simd128_2buf( (__m256i*) W, (__m256i*)V, N ); // if ( work_restart[thrid].restart ) return 0; -// scrypt_core_2way_simd128( (__m256i*)(W+192), (__m256i*)V, N ); +// scrypt_core_2way_simd128_2buf( (__m256i*)(W+128), (__m256i*)V, N ); // working - scrypt_core_2way_simd128_2buf( (__m256i*) W, (__m256i*)V, N ); + scrypt_core_2way_simd128( (__m256i*) W, (__m256i*)V, N ); if ( work_restart[thrid].restart ) return 0; - scrypt_core_2way_simd128_2buf( (__m256i*)(W+128), (__m256i*)V, N ); - - // working -// scrypt_core_2way_simd128( (__m256i*) W, (__m256i*)V, N ); -// if ( work_restart[thrid].restart ) return 0; -// scrypt_core_2way_simd128( (__m256i*)(W+ 64), (__m256i*)V, N ); -// if ( work_restart[thrid].restart ) return 0; -// scrypt_core_2way_simd128( (__m256i*)(W+128), (__m256i*)V, N ); -// if ( work_restart[thrid].restart ) return 0; -// scrypt_core_2way_simd128( (__m256i*)(W+192), (__m256i*)V, N ); + scrypt_core_2way_simd128( (__m256i*)(W+ 64), (__m256i*)V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_2way_simd128( (__m256i*)(W+128), (__m256i*)V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_2way_simd128( (__m256i*)(W+192), (__m256i*)V, N ); dintrlv_2x128( X, X+ 32, W, 1024 ); dintrlv_2x128( X+ 64, X+ 96, W+ 64, 1024 ); @@ -745,18 +845,10 @@ static int scrypt_N_1_1_256_8way( const uint32_t *input, uint32_t *output, intrlv_2x32( W+128, X+128, X+160, 1024 ); intrlv_2x32( W+192, X+192, X+224, 1024 ); - // working, deprecated, not up to data -// scrypt_core_simd128_2way_4buf( (uint64_t*)W, (uint64_t*)V, N ); - - // deprecated, not up to date -// scrypt_core_simd128_2way_3buf( (uint64_t*) W, (uint64_t*)V, N ); -// if ( work_restart[thrid].restart ) return 0; -// scrypt_core_simd128_2way( (uint64_t*)( W+192 ), (uint64_t*)V, N ); - // working -// scrypt_core_simd128_2way_2buf( (uint64_t*) W, (uint64_t*)V, N ); -// if ( work_restart[thrid].restart ) return 0; -// scrypt_core_simd128_2way_2buf( (uint64_t*)( W+128 ), (uint64_t*)V, N ); + scrypt_core_simd128_2way_2buf( (uint64_t*) W, (uint64_t*)V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128_2way_2buf( (uint64_t*)( W+128 ), (uint64_t*)V, N ); // scrypt_core_simd128_2way( (uint64_t*) W, (uint64_t*)V, N ); // if ( work_restart[thrid].restart ) return 0; @@ -813,19 +905,13 @@ static int scrypt_N_1_1_256_8way( const uint32_t *input, uint32_t *output, if ( work_restart[thrid].restart ) return 0; scrypt_core_simd128_2buf( X+192, V, N ); */ - +/************** scrypt_core_simd128_3buf( X, V, N ); if ( work_restart[thrid].restart ) return 0; scrypt_core_simd128_3buf( X+ 96, V, N ); if ( work_restart[thrid].restart ) return 0; scrypt_core_simd128_2buf( X+192, V, N ); - -/* - // SSE2 working - scrypt_core_simd128_4buf( X, V, N ); - if ( work_restart[thrid].restart ) return 0; - scrypt_core_simd128_4buf( X+128, V, N ); -*/ +*************/ if ( work_restart[thrid].restart ) return 0; @@ -868,6 +954,39 @@ static int scrypt_N_1_1_256_16way( const uint32_t *input, uint32_t *output, W, 1024 ); + if ( opt_param_n > 0x4000 ) + { + scrypt_core_simd128_3buf( X, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128_3buf( 
X+ 96, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128_2buf( X+192, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128_3buf( X+256, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128_3buf( X+352, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128_2buf( X+448, V, N ); + } + else + { + intrlv_4x128( W, X, X+ 32, X+ 64, X+ 96, 1024 ); + intrlv_4x128( W+128, X+128, X+160, X+192, X+224, 1024 ); + intrlv_4x128( W+256, X+256, X+288, X+320, X+352, 1024 ); + intrlv_4x128( W+384, X+384, X+416, X+448, X+480, 1024 ); + scrypt_core_4way_simd128( (__m512i*) W, (__m512i*)V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_4way_simd128( (__m512i*)(W+128), (__m512i*)V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_4way_simd128( (__m512i*)(W+256), (__m512i*)V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_4way_simd128( (__m512i*)(W+384), (__m512i*)V, N ); + dintrlv_4x128( X, X+ 32, X+ 64, X+ 96, W, 1024 ); + dintrlv_4x128( X+128, X+160, X+192, X+224, W+128, 1024 ); + dintrlv_4x128( X+256, X+288, X+320, X+352, W+256, 1024 ); + dintrlv_4x128( X+384, X+416, X+448, X+480, W+384, 1024 ); + } + // SCRYPT CORE @@ -888,23 +1007,40 @@ static int scrypt_N_1_1_256_16way( const uint32_t *input, uint32_t *output, // AVX512 working intrlv_4x32( W, X, X+ 32, X+ 64, X+ 96, 1024 ); intrlv_4x32( W+128, X+128, X+160, X+192, X+224, 1024 ); + intrlv_4x32( W+256, X+256, X+256+ 32, X+256+ 64, X+256+ 96, 1024 ); + intrlv_4x32( W+256+128, X+256+128, X+256+160, X+256+192, X+256+224, 1024 ); scrypt_core_simd128_4way( (__m128i*)W, (__m128i*)V, N ); if ( work_restart[thrid].restart ) return 0; scrypt_core_simd128_4way( (__m128i*)(W+128), (__m128i*)V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128_4way( (__m128i*)(W+256), (__m128i*)V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128_4way( (__m128i*)(W+256+128), (__m128i*)V, N ); dintrlv_4x32( X, X+ 32, X+ 64, X+ 96, W, 1024 ); dintrlv_4x32( X+128, X+160, X+192, X+224, W+128, 1024 ); + dintrlv_4x32( X+256, X+256+ 32, X+256+ 64, X+256+ 96, W+256, 1024 ); + dintrlv_4x32( X+256+128, X+256+160, X+256+192, X+256+224, W+256+128, 1024 ); */ /* - // AVX512, not working, very slow + // AVX512, working intrlv_4x128( W, X, X+ 32, X+ 64, X+ 96, 1024 ); intrlv_4x128( W+128, X+128, X+160, X+192, X+224, 1024 ); + intrlv_4x128( W+256, X+256, X+256+ 32, X+256+ 64, X+256+ 96, 1024 ); + intrlv_4x128( W+256+128, X+256+128, X+256+160, X+256+192, X+256+224, 1024 ); scrypt_core_4way_simd128( (__m512i*)W, (__m512i*)V, N ); if ( work_restart[thrid].restart ) return 0; scrypt_core_4way_simd128( (__m512i*)(W+128), (__m512i*)V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_4way_simd128( (__m512i*)(W+256), (__m512i*)V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_4way_simd128( (__m512i*)(W+256+128), (__m512i*)V, N ); dintrlv_4x128( X, X+ 32, X+ 64, X+ 96, W, 1024 ); dintrlv_4x128( X+128, X+160, X+192, X+224, W+128, 1024 ); + dintrlv_4x128( X+256, X+256+ 32, X+256+ 64, X+256+ 96, W+256, 1024 ); + dintrlv_4x128( X+256+128, X+256+160, X+256+192, X+256+224, W+256+128, 1024 ); */ + // AVX2 /* @@ -919,16 +1055,19 @@ static int scrypt_N_1_1_256_16way( const uint32_t *input, uint32_t *output, intrlv_2x128( W+ 64, X+ 64, X+ 96, 1024 ); intrlv_2x128( W+128, X+128, X+160, 1024 ); intrlv_2x128( W+192, X+192, X+224, 1024 ); - - // working -// scrypt_core_2way_simd128_3buf( 
(__m256i*) W, (__m256i*)V, N ); -// if ( work_restart[thrid].restart ) return 0; -// scrypt_core_2way_simd128( (__m256i*)(W+192), (__m256i*)V, N ); + intrlv_2x128( W+256, X+256, X+256+ 32, 1024 ); + intrlv_2x128( W+256+ 64, X+256+ 64, X+256+ 96, 1024 ); + intrlv_2x128( W+256+128, X+256+128, X+256+160, 1024 ); + intrlv_2x128( W+256+192, X+256+192, X+256+224, 1024 ); // working scrypt_core_2way_simd128_2buf( (__m256i*) W, (__m256i*)V, N ); if ( work_restart[thrid].restart ) return 0; scrypt_core_2way_simd128_2buf( (__m256i*)(W+128), (__m256i*)V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_2way_simd128_2buf( (__m256i*)(W+256), (__m256i*)V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_2way_simd128_2buf( (__m256i*)(W+256+128), (__m256i*)V, N ); // working // scrypt_core_2way_simd128( (__m256i*) W, (__m256i*)V, N ); @@ -938,11 +1077,23 @@ static int scrypt_N_1_1_256_16way( const uint32_t *input, uint32_t *output, // scrypt_core_2way_simd128( (__m256i*)(W+128), (__m256i*)V, N ); // if ( work_restart[thrid].restart ) return 0; // scrypt_core_2way_simd128( (__m256i*)(W+192), (__m256i*)V, N ); +// if ( work_restart[thrid].restart ) return 0; +// scrypt_core_2way_simd128( (__m256i*)(W+256), (__m256i*)V, N ); +// if ( work_restart[thrid].restart ) return 0; +// scrypt_core_2way_simd128( (__m256i*)(W+256+ 64), (__m256i*)V, N ); +// if ( work_restart[thrid].restart ) return 0; +// scrypt_core_2way_simd128( (__m256i*)(W+256+128), (__m256i*)V, N ); +// if ( work_restart[thrid].restart ) return 0; +// scrypt_core_2way_simd128( (__m256i*)(W+256+192), (__m256i*)V, N ); dintrlv_2x128( X, X+ 32, W, 1024 ); dintrlv_2x128( X+ 64, X+ 96, W+ 64, 1024 ); dintrlv_2x128( X+128, X+160, W+128, 1024 ); dintrlv_2x128( X+192, X+224, W+192, 1024 ); + dintrlv_2x128( X+256, X+256+ 32, W+256, 1024 ); + dintrlv_2x128( X+256+ 64, X+256+ 96, W+256+ 64, 1024 ); + dintrlv_2x128( X+256+128, X+256+160, W+256+128, 1024 ); + dintrlv_2x128( X+256+192, X+256+224, W+256+192, 1024 ); */ /* @@ -952,18 +1103,13 @@ static int scrypt_N_1_1_256_16way( const uint32_t *input, uint32_t *output, intrlv_2x32( W+128, X+128, X+160, 1024 ); intrlv_2x32( W+192, X+192, X+224, 1024 ); - // working, deprecated, not up to data -// scrypt_core_simd128_2way_4buf( (uint64_t*)W, (uint64_t*)V, N ); - - // deprecated, not up to date -// scrypt_core_simd128_2way_3buf( (uint64_t*) W, (uint64_t*)V, N ); -// if ( work_restart[thrid].restart ) return 0; -// scrypt_core_simd128_2way( (uint64_t*)( W+192 ), (uint64_t*)V, N ); - // working // scrypt_core_simd128_2way_2buf( (uint64_t*) W, (uint64_t*)V, N ); // if ( work_restart[thrid].restart ) return 0; // scrypt_core_simd128_2way_2buf( (uint64_t*)( W+128 ), (uint64_t*)V, N ); +// scrypt_core_simd128_2way_2buf( (uint64_t*) W, (uint64_t*)V, N ); +// if ( work_restart[thrid].restart ) return 0; +// scrypt_core_simd128_2way_2buf( (uint64_t*)( W+128 ), (uint64_t*)V, N ); // scrypt_core_simd128_2way( (uint64_t*) W, (uint64_t*)V, N ); // if ( work_restart[thrid].restart ) return 0; @@ -1043,7 +1189,7 @@ static int scrypt_N_1_1_256_16way( const uint32_t *input, uint32_t *output, if ( work_restart[thrid].restart ) return 0; scrypt_core_simd128_2buf( X+448, V, N ); */ - +/*************** scrypt_core_simd128_3buf( X, V, N ); if ( work_restart[thrid].restart ) return 0; scrypt_core_simd128_3buf( X+ 96, V, N ); @@ -1055,17 +1201,7 @@ static int scrypt_N_1_1_256_16way( const uint32_t *input, uint32_t *output, scrypt_core_simd128_3buf( X+352, V, N ); if ( work_restart[thrid].restart ) return 
0; scrypt_core_simd128_2buf( X+448, V, N ); - -/* - // SSE2 working - scrypt_core_simd128_4buf( X, V, N ); - if ( work_restart[thrid].restart ) return 0; - scrypt_core_simd128_4buf( X+128, V, N ); - if ( work_restart[thrid].restart ) return 0; - scrypt_core_simd128_4buf( X+256, V, N ); - if ( work_restart[thrid].restart ) return 0; - scrypt_core_simd128_4buf( X+384, V, N ); -*/ +********************/ /* scrypt_core_3way( X, V, N ); if ( work_restart[thrid].restart ) return 0; @@ -1102,6 +1238,31 @@ static int scrypt_N_1_1_256_16way( const uint32_t *input, uint32_t *output, #if defined(__SHA__) +static int scrypt_N_1_1_256_sha_2buf( const uint32_t *input, uint32_t *output, + uint32_t *midstate, unsigned char *scratchpad, int N, int thrid ) +{ + uint32_t _ALIGN(128) tstate[ 2*8 ]; + uint32_t _ALIGN(128) ostate[ 2*8 ]; + uint32_t _ALIGN(128) W[ 2*32 ]; + uint32_t *V = (uint32_t*)scratchpad; + + memcpy( tstate, midstate, 32 ); + memcpy( tstate+ 8, midstate, 32 ); + + HMAC_SHA256_80_init_SHA_2BUF( input, input+20, tstate, tstate+8, + ostate, ostate+8 ); + PBKDF2_SHA256_80_128_SHA_2BUF( tstate, tstate+8, ostate, ostate+8, + input, input+20, W, W+32 ); + + scrypt_core_simd128_2buf( W, V, N ); + if ( work_restart[thrid].restart ) return 0; + + PBKDF2_SHA256_128_32_SHA_2BUF( tstate, tstate+8, ostate, ostate+8, W, W+32, + output, output+8 ); + + return 1; +} + static int scrypt_N_1_1_256_4way_sha( const uint32_t *input, uint32_t *output, uint32_t *midstate, unsigned char *scratchpad, int N, int thrid ) { @@ -1149,8 +1310,6 @@ static int scrypt_N_1_1_256_4way_sha( const uint32_t *input, uint32_t *output, scrypt_core_simd128( W+96, V, N ); */ - // working -// scrypt_core_simd128_4buf( W, V, N ); if ( work_restart[thrid].restart ) return 0; @@ -1171,10 +1330,9 @@ static int scrypt_N_1_1_256_4way_sha( const uint32_t *input, uint32_t *output, static int scrypt_N_1_1_256_4way( const uint32_t *input, uint32_t *output, uint32_t *midstate, unsigned char *scratchpad, int N, int thrid ) { - uint32_t _ALIGN(128) tstate[4 * 8]; - uint32_t _ALIGN(128) ostate[4 * 8]; - uint32_t _ALIGN(128) W[4 * 32]; - uint32_t _ALIGN(128) X[4 * 32]; + uint32_t _ALIGN(128) tstate[ 4*8 ]; + uint32_t _ALIGN(128) ostate[ 4*8 ]; + uint32_t _ALIGN(128) W[ 4*32 ]; uint32_t *V = (uint32_t*)scratchpad; intrlv_4x32( W, input, input+20, input+40, input+60, 640 ); @@ -1184,7 +1342,21 @@ static int scrypt_N_1_1_256_4way( const uint32_t *input, uint32_t *output, HMAC_SHA256_80_init_4way(W, tstate, ostate); PBKDF2_SHA256_80_128_4way(tstate, ostate, W, W); - dintrlv_4x32( X, X+32, X+64, X+96, W, 1024 ); + if ( opt_param_n > 0x4000 ) + { + uint32_t _ALIGN(128) X[ 4*32 ]; + dintrlv_4x32( X, X+32, X+64, X+96, W, 1024 ); + scrypt_core_simd128_2buf( X, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128_2buf( X+64, V, N ); + intrlv_4x32( W, X, X+32, X+64, X+96, 1024 ); + } + else + scrypt_core_4way( (__m128i*)W, (__m128i*)V, N ); + + + +// dintrlv_4x32( X, X+32, X+64, X+96, W, 1024 ); ////// SCRYPT_CORE @@ -1202,35 +1374,23 @@ static int scrypt_N_1_1_256_4way( const uint32_t *input, uint32_t *output, if ( work_restart[thrid].restart ) return 0; scrypt_core_simd128( X+96, V, N ); */ - +/* // working, double buffered linear simd, best for n2 scrypt_core_simd128_2buf( X, V, N ); if ( work_restart[thrid].restart ) return 0; scrypt_core_simd128_2buf( X+64, V, N ); - +*/ /* scrypt_core_simd128_3buf( X, V, N ); if ( work_restart[thrid].restart ) return 0; scrypt_core_simd128( X+96, V, N ); */ - // working -// 
scrypt_core_simd128_4buf( X, V, N ); - - -/* - // original - scrypt_core(X + 0 * 32, V, N); - scrypt_core(X + 1 * 32, V, N); - scrypt_core(X + 2 * 32, V, N); - scrypt_core(X + 3 * 32, V, N); -*/ - //////////////////////////////// if ( work_restart[thrid].restart ) return 0; - intrlv_4x32( W, X, X+32, X+64, X+96, 1024 ); +// intrlv_4x32( W, X, X+32, X+64, X+96, 1024 ); PBKDF2_SHA256_128_32_4way(tstate, ostate, W, W); @@ -1247,22 +1407,22 @@ extern int scanhash_scrypt( struct work *work, uint32_t max_nonce, { uint32_t *pdata = work->data; uint32_t *ptarget = work->target; - uint32_t data[SCRYPT_MAX_WAYS * 20], hash[SCRYPT_MAX_WAYS * 8]; - uint32_t midstate[8]; - uint32_t n = pdata[19] - 1; + uint32_t data[SCRYPT_MAX_WAYS * 20], hash[SCRYPT_MAX_WAYS * 8]; + uint32_t midstate[8]; + uint32_t n = pdata[19] - 1; int thr_id = mythr->id; int throughput = scrypt_throughput; - int i; + int i; volatile uint8_t *restart = &(work_restart[thr_id].restart); - for ( i = 0; i < throughput; i++ ) - memcpy( data + i * 20, pdata, 80 ); + for ( i = 0; i < throughput; i++ ) + memcpy( data + i * 20, pdata, 80 ); sha256_transform_le( midstate, data, sha256_initial_state ); - do { + do { bool rc = true; - for ( i = 0; i < throughput; i++ ) data[ i*20 + 19 ] = ++n; + for ( i = 0; i < throughput; i++ ) data[ i*20 + 19 ] = ++n; #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) if ( throughput == 16 ) @@ -1276,7 +1436,7 @@ extern int scanhash_scrypt( struct work *work, uint32_t max_nonce, opt_param_n, thr_id ); else #endif - if ( throughput == 4 ) + if ( throughput == 4 ) // slower on Ryzen than 8way #if defined(__SHA__) rc = scrypt_N_1_1_256_4way_sha( data, hash, midstate, scratchbuf, opt_param_n, thr_id ); @@ -1284,10 +1444,17 @@ extern int scanhash_scrypt( struct work *work, uint32_t max_nonce, rc = scrypt_N_1_1_256_4way( data, hash, midstate, scratchbuf, opt_param_n, thr_id ); #endif +#if defined(__SHA__) else + if (throughput == 2 ) // slower on Ryzen than 4way_sha & 8way + rc = scrypt_N_1_1_256_sha_2buf( data, hash, midstate, scratchbuf, + opt_param_n, thr_id ); +#endif + else // should never get here rc = scrypt_N_1_1_256( data, hash, midstate, scratchbuf, opt_param_n, thr_id ); + // test the hash if ( rc ) for ( i = 0; i < throughput; i++ ) { @@ -1319,11 +1486,11 @@ bool scrypt_miner_thread_init( int thr_id ) bool register_scrypt_algo( algo_gate_t* gate ) { -#if defined(__SHA__) - gate->optimizations = SSE2_OPT | SHA_OPT; -#else +//#if defined(__SHA__) +// gate->optimizations = SSE2_OPT | SHA_OPT; +//#else gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT; -#endif +//#endif gate->miner_thread_init =(void*)&scrypt_miner_thread_init; gate->scanhash = (void*)&scanhash_scrypt; opt_target_factor = 65536.0; @@ -1332,16 +1499,29 @@ bool register_scrypt_algo( algo_gate_t* gate ) #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) scrypt_throughput = 16; - scratchbuf_size = opt_param_n * 3 * 128; // 3 buf + if ( opt_param_n > 0x4000 ) + scratchbuf_size = opt_param_n * 3 * 128; // 3 buf + else + scratchbuf_size = opt_param_n * 4 * 128; // 4 way + +/* SHA is slower than AVX2 on Ryzen #elif defined(__SHA__) scrypt_throughput = 4; scratchbuf_size = opt_param_n * 2 * 128; // 2 buf +*/ + #elif defined(__AVX2__) scrypt_throughput = 8; - scratchbuf_size = opt_param_n * 3 * 128; // 3 buf + if ( opt_param_n > 0x4000 ) + scratchbuf_size = opt_param_n * 3 * 128; // 3 buf + else + scratchbuf_size = opt_param_n * 2 * 128; // 2 way 
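/* Editorial note, not part of this patch: the scratch sizing above worked
   out.  Each buffer or interleaved way uses opt_param_n * 128 bytes of V,
   so on this AVX2 path, for example:
      opt_param_n = 1024  (<= 0x4000):  1024 * 2 * 128 =  256 KiB  (2 way)
      opt_param_n = 32768 ( > 0x4000): 32768 * 3 * 128 =   12 MiB  (3 buf) */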
#else scrypt_throughput = 4; + if ( opt_param_n > 0x4000 ) scratchbuf_size = opt_param_n * 2 * 128; // 2 buf + else + scratchbuf_size = opt_param_n * 4 * 128; // 4 way #endif char t_units[4] = {0}; diff --git a/algo/sha/sha-hash-4way.h b/algo/sha/sha-hash-4way.h index 7b6618c4..de3f1d43 100644 --- a/algo/sha/sha-hash-4way.h +++ b/algo/sha/sha-hash-4way.h @@ -51,7 +51,6 @@ typedef struct { __m128i buf[64>>2]; __m128i val[8]; uint32_t count_high, count_low; - bool initialized; } sha256_4way_context __attribute__ ((aligned (64))); void sha256_4way_init( sha256_4way_context *sc ); @@ -74,7 +73,6 @@ typedef struct { __m256i buf[64>>2]; __m256i val[8]; uint32_t count_high, count_low; - bool initialized; } sha256_8way_context __attribute__ ((aligned (128))); void sha256_8way_init( sha256_8way_context *sc ); @@ -96,7 +94,6 @@ typedef struct { __m512i buf[64>>2]; __m512i val[8]; uint32_t count_high, count_low; - bool initialized; } sha256_16way_context __attribute__ ((aligned (128))); void sha256_16way_init( sha256_16way_context *sc ); diff --git a/algo/sha/sha256-hash-4way.c b/algo/sha/sha256-hash-4way.c index beac702c..1c630cc8 100644 --- a/algo/sha/sha256-hash-4way.c +++ b/algo/sha/sha256-hash-4way.c @@ -107,22 +107,19 @@ do { \ } while (0) // LE data, no need to byte swap -void sha256_4way_transform_le( __m128i *state_out, const __m128i *data, - const __m128i *state_in ) +static inline void SHA256_4WAY_TRANSFORM( __m128i *out, __m128i *W, + const __m128i *in ) { __m128i A, B, C, D, E, F, G, H, X_xor_Y, Y_xor_Z; - __m128i W[16]; - - memcpy_128( W, data, 16 ); - A = state_in[0]; - B = state_in[1]; - C = state_in[2]; - D = state_in[3]; - E = state_in[4]; - F = state_in[5]; - G = state_in[6]; - H = state_in[7]; + A = in[0]; + B = in[1]; + C = in[2]; + D = in[3]; + E = in[4]; + F = in[5]; + G = in[6]; + H = in[7]; Y_xor_Z = _mm_xor_si128( B, C ); SHA2s_4WAY_STEP( A, B, C, D, E, F, G, H, 0, 0 ); @@ -179,228 +176,46 @@ void sha256_4way_transform_le( __m128i *state_out, const __m128i *data, SHA2s_4WAY_STEP( B, C, D, E, F, G, H, A, 15, j ); } - state_out[0] = _mm_add_epi32( state_in[0], A ); - state_out[1] = _mm_add_epi32( state_in[1], B ); - state_out[2] = _mm_add_epi32( state_in[2], C ); - state_out[3] = _mm_add_epi32( state_in[3], D ); - state_out[4] = _mm_add_epi32( state_in[4], E ); - state_out[5] = _mm_add_epi32( state_in[5], F ); - state_out[6] = _mm_add_epi32( state_in[6], G ); - state_out[7] = _mm_add_epi32( state_in[7], H ); + out[0] = _mm_add_epi32( in[0], A ); + out[1] = _mm_add_epi32( in[1], B ); + out[2] = _mm_add_epi32( in[2], C ); + out[3] = _mm_add_epi32( in[3], D ); + out[4] = _mm_add_epi32( in[4], E ); + out[5] = _mm_add_epi32( in[5], F ); + out[6] = _mm_add_epi32( in[6], G ); + out[7] = _mm_add_epi32( in[7], H ); } -// BE data, need to byte swap -void sha256_4way_transform_be( __m128i *state_out, const __m128i *data, - const __m128i *state_in ) +// LE data, no need to byte swap +void sha256_4way_transform_le( __m128i *state_out, const __m128i *data, + const __m128i *state_in ) { - __m128i A, B, C, D, E, F, G, H, X_xor_Y, Y_xor_Z; __m128i W[16]; - - mm128_block_bswap_32( W, data ); - mm128_block_bswap_32( W+8, data+8 ); - - A = state_in[0]; - B = state_in[1]; - C = state_in[2]; - D = state_in[3]; - E = state_in[4]; - F = state_in[5]; - G = state_in[6]; - H = state_in[7]; - Y_xor_Z = _mm_xor_si128( B, C ); - - SHA2s_4WAY_STEP( A, B, C, D, E, F, G, H, 0, 0 ); - SHA2s_4WAY_STEP( H, A, B, C, D, E, F, G, 1, 0 ); - SHA2s_4WAY_STEP( G, H, A, B, C, D, E, F, 2, 0 ); - SHA2s_4WAY_STEP( F, G, H, 
A, B, C, D, E, 3, 0 ); - SHA2s_4WAY_STEP( E, F, G, H, A, B, C, D, 4, 0 ); - SHA2s_4WAY_STEP( D, E, F, G, H, A, B, C, 5, 0 ); - SHA2s_4WAY_STEP( C, D, E, F, G, H, A, B, 6, 0 ); - SHA2s_4WAY_STEP( B, C, D, E, F, G, H, A, 7, 0 ); - SHA2s_4WAY_STEP( A, B, C, D, E, F, G, H, 8, 0 ); - SHA2s_4WAY_STEP( H, A, B, C, D, E, F, G, 9, 0 ); - SHA2s_4WAY_STEP( G, H, A, B, C, D, E, F, 10, 0 ); - SHA2s_4WAY_STEP( F, G, H, A, B, C, D, E, 11, 0 ); - SHA2s_4WAY_STEP( E, F, G, H, A, B, C, D, 12, 0 ); - SHA2s_4WAY_STEP( D, E, F, G, H, A, B, C, 13, 0 ); - SHA2s_4WAY_STEP( C, D, E, F, G, H, A, B, 14, 0 ); - SHA2s_4WAY_STEP( B, C, D, E, F, G, H, A, 15, 0 ); - - for ( int j = 16; j < 64; j += 16 ) - { - W[ 0] = SHA2s_MEXP( 14, 9, 1, 0 ); - W[ 1] = SHA2s_MEXP( 15, 10, 2, 1 ); - W[ 2] = SHA2s_MEXP( 0, 11, 3, 2 ); - W[ 3] = SHA2s_MEXP( 1, 12, 4, 3 ); - W[ 4] = SHA2s_MEXP( 2, 13, 5, 4 ); - W[ 5] = SHA2s_MEXP( 3, 14, 6, 5 ); - W[ 6] = SHA2s_MEXP( 4, 15, 7, 6 ); - W[ 7] = SHA2s_MEXP( 5, 0, 8, 7 ); - W[ 8] = SHA2s_MEXP( 6, 1, 9, 8 ); - W[ 9] = SHA2s_MEXP( 7, 2, 10, 9 ); - W[10] = SHA2s_MEXP( 8, 3, 11, 10 ); - W[11] = SHA2s_MEXP( 9, 4, 12, 11 ); - W[12] = SHA2s_MEXP( 10, 5, 13, 12 ); - W[13] = SHA2s_MEXP( 11, 6, 14, 13 ); - W[14] = SHA2s_MEXP( 12, 7, 15, 14 ); - W[15] = SHA2s_MEXP( 13, 8, 0, 15 ); - - SHA2s_4WAY_STEP( A, B, C, D, E, F, G, H, 0, j ); - SHA2s_4WAY_STEP( H, A, B, C, D, E, F, G, 1, j ); - SHA2s_4WAY_STEP( G, H, A, B, C, D, E, F, 2, j ); - SHA2s_4WAY_STEP( F, G, H, A, B, C, D, E, 3, j ); - SHA2s_4WAY_STEP( E, F, G, H, A, B, C, D, 4, j ); - SHA2s_4WAY_STEP( D, E, F, G, H, A, B, C, 5, j ); - SHA2s_4WAY_STEP( C, D, E, F, G, H, A, B, 6, j ); - SHA2s_4WAY_STEP( B, C, D, E, F, G, H, A, 7, j ); - SHA2s_4WAY_STEP( A, B, C, D, E, F, G, H, 8, j ); - SHA2s_4WAY_STEP( H, A, B, C, D, E, F, G, 9, j ); - SHA2s_4WAY_STEP( G, H, A, B, C, D, E, F, 10, j ); - SHA2s_4WAY_STEP( F, G, H, A, B, C, D, E, 11, j ); - SHA2s_4WAY_STEP( E, F, G, H, A, B, C, D, 12, j ); - SHA2s_4WAY_STEP( D, E, F, G, H, A, B, C, 13, j ); - SHA2s_4WAY_STEP( C, D, E, F, G, H, A, B, 14, j ); - SHA2s_4WAY_STEP( B, C, D, E, F, G, H, A, 15, j ); - } - - state_out[0] = _mm_add_epi32( state_in[0], A ); - state_out[1] = _mm_add_epi32( state_in[1], B ); - state_out[2] = _mm_add_epi32( state_in[2], C ); - state_out[3] = _mm_add_epi32( state_in[3], D ); - state_out[4] = _mm_add_epi32( state_in[4], E ); - state_out[5] = _mm_add_epi32( state_in[5], F ); - state_out[6] = _mm_add_epi32( state_in[6], G ); - state_out[7] = _mm_add_epi32( state_in[7], H ); + memcpy_128( W, data, 16 ); + SHA256_4WAY_TRANSFORM( state_out, W, state_in ); } - -static void -sha256_4way_round( sha256_4way_context *ctx, __m128i *in, __m128i r[8] ) +// BE data, need to byte swap input data +void sha256_4way_transform_be( __m128i *state_out, const __m128i *data, + const __m128i *state_in ) { - register __m128i A, B, C, D, E, F, G, H, X_xor_Y, Y_xor_Z; __m128i W[16]; - - mm128_block_bswap_32( W, in ); - mm128_block_bswap_32( W+8, in+8 ); - - if ( ctx->initialized ) - { - A = r[0]; - B = r[1]; - C = r[2]; - D = r[3]; - E = r[4]; - F = r[5]; - G = r[6]; - H = r[7]; - } - else - { - A = m128_const1_64( 0x6A09E6676A09E667 ); - B = m128_const1_64( 0xBB67AE85BB67AE85 ); - C = m128_const1_64( 0x3C6EF3723C6EF372 ); - D = m128_const1_64( 0xA54FF53AA54FF53A ); - E = m128_const1_64( 0x510E527F510E527F ); - F = m128_const1_64( 0x9B05688C9B05688C ); - G = m128_const1_64( 0x1F83D9AB1F83D9AB ); - H = m128_const1_64( 0x5BE0CD195BE0CD19 ); - } - - Y_xor_Z = _mm_xor_si128( B, C ); - - SHA2s_4WAY_STEP( A, B, C, D, E, F, 
G, H, 0, 0 ); - SHA2s_4WAY_STEP( H, A, B, C, D, E, F, G, 1, 0 ); - SHA2s_4WAY_STEP( G, H, A, B, C, D, E, F, 2, 0 ); - SHA2s_4WAY_STEP( F, G, H, A, B, C, D, E, 3, 0 ); - SHA2s_4WAY_STEP( E, F, G, H, A, B, C, D, 4, 0 ); - SHA2s_4WAY_STEP( D, E, F, G, H, A, B, C, 5, 0 ); - SHA2s_4WAY_STEP( C, D, E, F, G, H, A, B, 6, 0 ); - SHA2s_4WAY_STEP( B, C, D, E, F, G, H, A, 7, 0 ); - SHA2s_4WAY_STEP( A, B, C, D, E, F, G, H, 8, 0 ); - SHA2s_4WAY_STEP( H, A, B, C, D, E, F, G, 9, 0 ); - SHA2s_4WAY_STEP( G, H, A, B, C, D, E, F, 10, 0 ); - SHA2s_4WAY_STEP( F, G, H, A, B, C, D, E, 11, 0 ); - SHA2s_4WAY_STEP( E, F, G, H, A, B, C, D, 12, 0 ); - SHA2s_4WAY_STEP( D, E, F, G, H, A, B, C, 13, 0 ); - SHA2s_4WAY_STEP( C, D, E, F, G, H, A, B, 14, 0 ); - SHA2s_4WAY_STEP( B, C, D, E, F, G, H, A, 15, 0 ); - - for ( int j = 16; j < 64; j += 16 ) - { - W[ 0] = SHA2s_MEXP( 14, 9, 1, 0 ); - W[ 1] = SHA2s_MEXP( 15, 10, 2, 1 ); - W[ 2] = SHA2s_MEXP( 0, 11, 3, 2 ); - W[ 3] = SHA2s_MEXP( 1, 12, 4, 3 ); - W[ 4] = SHA2s_MEXP( 2, 13, 5, 4 ); - W[ 5] = SHA2s_MEXP( 3, 14, 6, 5 ); - W[ 6] = SHA2s_MEXP( 4, 15, 7, 6 ); - W[ 7] = SHA2s_MEXP( 5, 0, 8, 7 ); - W[ 8] = SHA2s_MEXP( 6, 1, 9, 8 ); - W[ 9] = SHA2s_MEXP( 7, 2, 10, 9 ); - W[10] = SHA2s_MEXP( 8, 3, 11, 10 ); - W[11] = SHA2s_MEXP( 9, 4, 12, 11 ); - W[12] = SHA2s_MEXP( 10, 5, 13, 12 ); - W[13] = SHA2s_MEXP( 11, 6, 14, 13 ); - W[14] = SHA2s_MEXP( 12, 7, 15, 14 ); - W[15] = SHA2s_MEXP( 13, 8, 0, 15 ); - - SHA2s_4WAY_STEP( A, B, C, D, E, F, G, H, 0, j ); - SHA2s_4WAY_STEP( H, A, B, C, D, E, F, G, 1, j ); - SHA2s_4WAY_STEP( G, H, A, B, C, D, E, F, 2, j ); - SHA2s_4WAY_STEP( F, G, H, A, B, C, D, E, 3, j ); - SHA2s_4WAY_STEP( E, F, G, H, A, B, C, D, 4, j ); - SHA2s_4WAY_STEP( D, E, F, G, H, A, B, C, 5, j ); - SHA2s_4WAY_STEP( C, D, E, F, G, H, A, B, 6, j ); - SHA2s_4WAY_STEP( B, C, D, E, F, G, H, A, 7, j ); - SHA2s_4WAY_STEP( A, B, C, D, E, F, G, H, 8, j ); - SHA2s_4WAY_STEP( H, A, B, C, D, E, F, G, 9, j ); - SHA2s_4WAY_STEP( G, H, A, B, C, D, E, F, 10, j ); - SHA2s_4WAY_STEP( F, G, H, A, B, C, D, E, 11, j ); - SHA2s_4WAY_STEP( E, F, G, H, A, B, C, D, 12, j ); - SHA2s_4WAY_STEP( D, E, F, G, H, A, B, C, 13, j ); - SHA2s_4WAY_STEP( C, D, E, F, G, H, A, B, 14, j ); - SHA2s_4WAY_STEP( B, C, D, E, F, G, H, A, 15, j ); - } - - if ( ctx->initialized ) - { - r[0] = _mm_add_epi32( r[0], A ); - r[1] = _mm_add_epi32( r[1], B ); - r[2] = _mm_add_epi32( r[2], C ); - r[3] = _mm_add_epi32( r[3], D ); - r[4] = _mm_add_epi32( r[4], E ); - r[5] = _mm_add_epi32( r[5], F ); - r[6] = _mm_add_epi32( r[6], G ); - r[7] = _mm_add_epi32( r[7], H ); - } - else - { - ctx->initialized = true; - r[0] = _mm_add_epi32( A, m128_const1_64( 0x6A09E6676A09E667 ) ); - r[1] = _mm_add_epi32( B, m128_const1_64( 0xBB67AE85BB67AE85 ) ); - r[2] = _mm_add_epi32( C, m128_const1_64( 0x3C6EF3723C6EF372 ) ); - r[3] = _mm_add_epi32( D, m128_const1_64( 0xA54FF53AA54FF53A ) ); - r[4] = _mm_add_epi32( E, m128_const1_64( 0x510E527F510E527F ) ); - r[5] = _mm_add_epi32( F, m128_const1_64( 0x9B05688C9B05688C ) ); - r[6] = _mm_add_epi32( G, m128_const1_64( 0x1F83D9AB1F83D9AB ) ); - r[7] = _mm_add_epi32( H, m128_const1_64( 0x5BE0CD195BE0CD19 ) ); - } + mm128_block_bswap_32( W, data ); + mm128_block_bswap_32( W+8, data+8 ); + SHA256_4WAY_TRANSFORM( state_out, W, state_in ); } void sha256_4way_init( sha256_4way_context *sc ) { - sc->initialized = false; sc->count_high = sc->count_low = 0; -/* - sc->val[0] = _mm_set1_epi32( H256[0] ); - sc->val[1] = _mm_set1_epi32( H256[1] ); - sc->val[2] = _mm_set1_epi32( H256[2] ); - sc->val[3] = _mm_set1_epi32( 
H256[3] ); - sc->val[4] = _mm_set1_epi32( H256[4] ); - sc->val[5] = _mm_set1_epi32( H256[5] ); - sc->val[6] = _mm_set1_epi32( H256[6] ); - sc->val[7] = _mm_set1_epi32( H256[7] ); -*/ + sc->val[0] = m128_const1_64( 0x6A09E6676A09E667 ); + sc->val[1] = m128_const1_64( 0xBB67AE85BB67AE85 ); + sc->val[2] = m128_const1_64( 0x3C6EF3723C6EF372 ); + sc->val[3] = m128_const1_64( 0xA54FF53AA54FF53A ); + sc->val[4] = m128_const1_64( 0x510E527F510E527F ); + sc->val[5] = m128_const1_64( 0x9B05688C9B05688C ); + sc->val[6] = m128_const1_64( 0x1F83D9AB1F83D9AB ); + sc->val[7] = m128_const1_64( 0x5BE0CD195BE0CD19 ); } void sha256_4way_update( sha256_4way_context *sc, const void *data, size_t len ) @@ -424,7 +239,7 @@ void sha256_4way_update( sha256_4way_context *sc, const void *data, size_t len ) len -= clen; if ( ptr == buf_size ) { - sha256_4way_round( sc, sc->buf, sc->val ); + sha256_4way_transform_be( sc->val, sc->buf, sc->val ); ptr = 0; } clow = sc->count_low; @@ -449,7 +264,7 @@ void sha256_4way_close( sha256_4way_context *sc, void *dst ) if ( ptr > pad ) { memset_zero_128( sc->buf + (ptr>>2), (buf_size - ptr) >> 2 ); - sha256_4way_round( sc, sc->buf, sc->val ); + sha256_4way_transform_be( sc->val, sc->buf, sc->val ); memset_zero_128( sc->buf, pad >> 2 ); } else @@ -461,7 +276,7 @@ void sha256_4way_close( sha256_4way_context *sc, void *dst ) sc->buf[ pad >> 2 ] = m128_const1_32( bswap_32( high ) ); sc->buf[( pad+4 ) >> 2 ] = m128_const1_32( bswap_32( low ) ); - sha256_4way_round( sc, sc->buf, sc->val ); + sha256_4way_transform_be( sc->val, sc->buf, sc->val ); mm128_block_bswap_32( dst, sc->val ); } @@ -539,8 +354,7 @@ do { \ #define SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H, i, j ) \ do { \ - __m256i T0 = _mm256_add_epi32( _mm256_set1_epi32( K256[ (j)+(i) ] ), \ - W[ i ] ); \ + __m256i T0 = _mm256_add_epi32( _mm256_set1_epi32( K256[(j)+(i)] ), W[i] ); \ __m256i T1 = BSG2_1x( E ); \ __m256i T2 = BSG2_0x( A ); \ T0 = _mm256_add_epi32( T0, CHx( E, F, G ) ); \ @@ -552,45 +366,74 @@ do { \ H = _mm256_add_epi32( T1, T2 ); \ } while (0) -/* -#define SHA2s_8WAY_STEP(A, B, C, D, E, F, G, H, i, j) \ +// the X_xor_y technique can be extended to eliminate the mov instruction. +// Perform double rounds and alternate each round. Doesn't apply to AVX512 +// and isn't suitable for running 3 round prehash. +// +// read Y_xor_Z, update X_xor_Y +#define MAJ_2step(X, Y, Z, X_xor_Y, Y_xor_Z ) \ + _mm256_xor_si256( Y, _mm256_and_si256( X_xor_Y = _mm256_xor_si256( X, Y ), \ + Y_xor_Z ) ) + +// start with toc initialized to y^z: toc = B ^ C +// First round reads toc as Y_xor_Z and saves X_xor_Y as tic. +// Second round reads tic as Y_xor_Z and saves X_xor_Y as toc. 
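/* Editorial sketch, not part of this patch: the same xor-caching trick in
   scalar form, to show why alternating the two operands removes the extra
   mov.  It relies only on the identity maj(x,y,z) == y ^ ((x ^ y) & (y ^ z));
   maj_ref() is the textbook majority function, included for comparison. */
#include <stdint.h>

static inline uint32_t maj_ref( uint32_t x, uint32_t y, uint32_t z )
{
   return ( x & y ) | ( z & ( x | y ) );
}

/* Round i computes x^y and hands it back through *x_xor_y; round i+1 passes
   that same value in as y_xor_z, so each round evaluates only one new XOR.
   Mirroring MAJ_2step below: call maj_cached( A, B, &tic, toc ) in one round
   and maj_cached( H, A, &toc, tic ) in the next, starting from toc = B ^ C. */
static inline uint32_t maj_cached( uint32_t x, uint32_t y,
                                   uint32_t *x_xor_y, uint32_t y_xor_z )
{
   *x_xor_y = x ^ y;
   return y ^ ( *x_xor_y & y_xor_z );   /* equals maj_ref( x, y, z ) */
}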
+ +#define SHA256_8WAY_2STEP( A, B, C, D, E, F, G, H, i0, i1, j ) \ do { \ - __m256i T1, T2; \ - __m256i K = _mm256_set1_epi32( K256[( (j)+(i) )] ); \ - T1 = _mm256_add_epi32( H, mm256_add4_32( BSG2_1x(E), CHx(E, F, G), \ - K, W[i] ) ); \ - T2 = _mm256_add_epi32( BSG2_0x(A), MAJx(A, B, C) ); \ - Y_xor_Z = X_xor_Y; \ + __m256i T0 = _mm256_add_epi32( _mm256_set1_epi32( K256[ (j)+(i0) ] ), \ + W[ i0 ] ); \ + __m256i T1 = BSG2_1x( E ); \ + __m256i T2 = BSG2_0x( A ); \ + T0 = _mm256_add_epi32( T0, CHx( E, F, G ) ); \ + T1 = _mm256_add_epi32( T1, H ); \ + T2 = _mm256_add_epi32( T2, MAJ_2step( A, B, C, tic, toc ) ); \ + T1 = _mm256_add_epi32( T1, T0 ); \ D = _mm256_add_epi32( D, T1 ); \ H = _mm256_add_epi32( T1, T2 ); \ +\ + T0 = _mm256_add_epi32( _mm256_set1_epi32( K256[ (j)+(i1) ] ), \ + W[ (i1) ] ); \ + T1 = BSG2_1x( D ); \ + T2 = BSG2_0x( H ); \ + T0 = _mm256_add_epi32( T0, CHx( D, E, F ) ); \ + T1 = _mm256_add_epi32( T1, G ); \ + T2 = _mm256_add_epi32( T2, MAJ_2step( H, A, B, toc, tic ) ); \ + T1 = _mm256_add_epi32( T1, T0 ); \ + C = _mm256_add_epi32( C, T1 ); \ + G = _mm256_add_epi32( T1, T2 ); \ } while (0) -*/ #endif // AVX512VL else AVX2 -// accepts LE byte ordered data, skip the byte swap -void sha256_8way_transform_le( __m256i *state_out, const __m256i *data, - const __m256i *state_in ) +static inline void SHA256_8WAY_TRANSFORM( __m256i *out, __m256i *W, + const __m256i *in ) \ { __m256i A, B, C, D, E, F, G, H; -#if !defined(__AVX512VL__) - __m256i X_xor_Y, Y_xor_Z; -#endif - __m256i W[16]; - memcpy_256( W, data, 16 ); - A = state_in[0]; - B = state_in[1]; - C = state_in[2]; - D = state_in[3]; - E = state_in[4]; - F = state_in[5]; - G = state_in[6]; - H = state_in[7]; + A = _mm256_load_si256( in ); + B = _mm256_load_si256( in+1 ); + C = _mm256_load_si256( in+2 ); + D = _mm256_load_si256( in+3 ); + E = _mm256_load_si256( in+4 ); + F = _mm256_load_si256( in+5 ); + G = _mm256_load_si256( in+6 ); + H = _mm256_load_si256( in+7 ); #if !defined(__AVX512VL__) - Y_xor_Z = _mm256_xor_si256( B, C ); -#endif + + __m256i tic, toc = _mm256_xor_si256( B, C ); + + SHA256_8WAY_2STEP( A, B, C, D, E, F, G, H, 0, 1, 0 ); + SHA256_8WAY_2STEP( G, H, A, B, C, D, E, F, 2, 3, 0 ); + SHA256_8WAY_2STEP( E, F, G, H, A, B, C, D, 4, 5, 0 ); + SHA256_8WAY_2STEP( C, D, E, F, G, H, A, B, 6, 7, 0 ); + SHA256_8WAY_2STEP( A, B, C, D, E, F, G, H, 8, 9, 0 ); + SHA256_8WAY_2STEP( G, H, A, B, C, D, E, F, 10, 11, 0 ); + SHA256_8WAY_2STEP( E, F, G, H, A, B, C, D, 12, 13, 0 ); + SHA256_8WAY_2STEP( C, D, E, F, G, H, A, B, 14, 15, 0 ); + +#else SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H, 0, 0 ); SHA2s_8WAY_STEP( H, A, B, C, D, E, F, G, 1, 0 ); @@ -609,6 +452,8 @@ void sha256_8way_transform_le( __m256i *state_out, const __m256i *data, SHA2s_8WAY_STEP( C, D, E, F, G, H, A, B, 14, 0 ); SHA2s_8WAY_STEP( B, C, D, E, F, G, H, A, 15, 0 ); +#endif + for ( int j = 16; j < 64; j += 16 ) { W[ 0] = SHA2x_MEXP( 14, 9, 1, 0 ); @@ -628,6 +473,19 @@ void sha256_8way_transform_le( __m256i *state_out, const __m256i *data, W[14] = SHA2x_MEXP( 12, 7, 15, 14 ); W[15] = SHA2x_MEXP( 13, 8, 0, 15 ); +#if !defined(__AVX512VL__) + + SHA256_8WAY_2STEP( A, B, C, D, E, F, G, H, 0, 1, j ); + SHA256_8WAY_2STEP( G, H, A, B, C, D, E, F, 2, 3, j ); + SHA256_8WAY_2STEP( E, F, G, H, A, B, C, D, 4, 5, j ); + SHA256_8WAY_2STEP( C, D, E, F, G, H, A, B, 6, 7, j ); + SHA256_8WAY_2STEP( A, B, C, D, E, F, G, H, 8, 9, j ); + SHA256_8WAY_2STEP( G, H, A, B, C, D, E, F, 10, 11, j ); + SHA256_8WAY_2STEP( E, F, G, H, A, B, C, D, 12, 13, j ); + SHA256_8WAY_2STEP( C, D, E, F, G, H, 
A, B, 14, 15, j ); + +#else + SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H, 0, j ); SHA2s_8WAY_STEP( H, A, B, C, D, E, F, G, 1, j ); SHA2s_8WAY_STEP( G, H, A, B, C, D, E, F, 2, j ); @@ -644,244 +502,52 @@ void sha256_8way_transform_le( __m256i *state_out, const __m256i *data, SHA2s_8WAY_STEP( D, E, F, G, H, A, B, C, 13, j ); SHA2s_8WAY_STEP( C, D, E, F, G, H, A, B, 14, j ); SHA2s_8WAY_STEP( B, C, D, E, F, G, H, A, 15, j ); + +#endif } - state_out[0] = _mm256_add_epi32( state_in[0], A ); - state_out[1] = _mm256_add_epi32( state_in[1], B ); - state_out[2] = _mm256_add_epi32( state_in[2], C ); - state_out[3] = _mm256_add_epi32( state_in[3], D ); - state_out[4] = _mm256_add_epi32( state_in[4], E ); - state_out[5] = _mm256_add_epi32( state_in[5], F ); - state_out[6] = _mm256_add_epi32( state_in[6], G ); - state_out[7] = _mm256_add_epi32( state_in[7], H ); + out[0] = _mm256_add_epi32( in[0], A ); + out[1] = _mm256_add_epi32( in[1], B ); + out[2] = _mm256_add_epi32( in[2], C ); + out[3] = _mm256_add_epi32( in[3], D ); + out[4] = _mm256_add_epi32( in[4], E ); + out[5] = _mm256_add_epi32( in[5], F ); + out[6] = _mm256_add_epi32( in[6], G ); + out[7] = _mm256_add_epi32( in[7], H ); } - -// Accepts BE byte ordered data, need to byte swap -void sha256_8way_transform_be( __m256i *state_out, const __m256i *data, +// accepts LE input data +void sha256_8way_transform_le( __m256i *state_out, const __m256i *data, const __m256i *state_in ) { - __m256i A, B, C, D, E, F, G, H; -#if !defined(__AVX512VL__) - __m256i X_xor_Y, Y_xor_Z; -#endif __m256i W[16]; - - mm256_block_bswap_32( W , data ); - mm256_block_bswap_32( W+8, data+8 ); - - A = state_in[0]; - B = state_in[1]; - C = state_in[2]; - D = state_in[3]; - E = state_in[4]; - F = state_in[5]; - G = state_in[6]; - H = state_in[7]; - -#if !defined(__AVX512VL__) - Y_xor_Z = _mm256_xor_si256( B, C ); -#endif - - SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H, 0, 0 ); - SHA2s_8WAY_STEP( H, A, B, C, D, E, F, G, 1, 0 ); - SHA2s_8WAY_STEP( G, H, A, B, C, D, E, F, 2, 0 ); - SHA2s_8WAY_STEP( F, G, H, A, B, C, D, E, 3, 0 ); - SHA2s_8WAY_STEP( E, F, G, H, A, B, C, D, 4, 0 ); - SHA2s_8WAY_STEP( D, E, F, G, H, A, B, C, 5, 0 ); - SHA2s_8WAY_STEP( C, D, E, F, G, H, A, B, 6, 0 ); - SHA2s_8WAY_STEP( B, C, D, E, F, G, H, A, 7, 0 ); - SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H, 8, 0 ); - SHA2s_8WAY_STEP( H, A, B, C, D, E, F, G, 9, 0 ); - SHA2s_8WAY_STEP( G, H, A, B, C, D, E, F, 10, 0 ); - SHA2s_8WAY_STEP( F, G, H, A, B, C, D, E, 11, 0 ); - SHA2s_8WAY_STEP( E, F, G, H, A, B, C, D, 12, 0 ); - SHA2s_8WAY_STEP( D, E, F, G, H, A, B, C, 13, 0 ); - SHA2s_8WAY_STEP( C, D, E, F, G, H, A, B, 14, 0 ); - SHA2s_8WAY_STEP( B, C, D, E, F, G, H, A, 15, 0 ); - - for ( int j = 16; j < 64; j += 16 ) - { - W[ 0] = SHA2x_MEXP( 14, 9, 1, 0 ); - W[ 1] = SHA2x_MEXP( 15, 10, 2, 1 ); - W[ 2] = SHA2x_MEXP( 0, 11, 3, 2 ); - W[ 3] = SHA2x_MEXP( 1, 12, 4, 3 ); - W[ 4] = SHA2x_MEXP( 2, 13, 5, 4 ); - W[ 5] = SHA2x_MEXP( 3, 14, 6, 5 ); - W[ 6] = SHA2x_MEXP( 4, 15, 7, 6 ); - W[ 7] = SHA2x_MEXP( 5, 0, 8, 7 ); - W[ 8] = SHA2x_MEXP( 6, 1, 9, 8 ); - W[ 9] = SHA2x_MEXP( 7, 2, 10, 9 ); - W[10] = SHA2x_MEXP( 8, 3, 11, 10 ); - W[11] = SHA2x_MEXP( 9, 4, 12, 11 ); - W[12] = SHA2x_MEXP( 10, 5, 13, 12 ); - W[13] = SHA2x_MEXP( 11, 6, 14, 13 ); - W[14] = SHA2x_MEXP( 12, 7, 15, 14 ); - W[15] = SHA2x_MEXP( 13, 8, 0, 15 ); - - SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H, 0, j ); - SHA2s_8WAY_STEP( H, A, B, C, D, E, F, G, 1, j ); - SHA2s_8WAY_STEP( G, H, A, B, C, D, E, F, 2, j ); - SHA2s_8WAY_STEP( F, G, H, A, B, C, D, E, 3, j ); - SHA2s_8WAY_STEP( 
E, F, G, H, A, B, C, D, 4, j ); - SHA2s_8WAY_STEP( D, E, F, G, H, A, B, C, 5, j ); - SHA2s_8WAY_STEP( C, D, E, F, G, H, A, B, 6, j ); - SHA2s_8WAY_STEP( B, C, D, E, F, G, H, A, 7, j ); - SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H, 8, j ); - SHA2s_8WAY_STEP( H, A, B, C, D, E, F, G, 9, j ); - SHA2s_8WAY_STEP( G, H, A, B, C, D, E, F, 10, j ); - SHA2s_8WAY_STEP( F, G, H, A, B, C, D, E, 11, j ); - SHA2s_8WAY_STEP( E, F, G, H, A, B, C, D, 12, j ); - SHA2s_8WAY_STEP( D, E, F, G, H, A, B, C, 13, j ); - SHA2s_8WAY_STEP( C, D, E, F, G, H, A, B, 14, j ); - SHA2s_8WAY_STEP( B, C, D, E, F, G, H, A, 15, j ); - } - - state_out[0] = _mm256_add_epi32( state_in[0], A ); - state_out[1] = _mm256_add_epi32( state_in[1], B ); - state_out[2] = _mm256_add_epi32( state_in[2], C ); - state_out[3] = _mm256_add_epi32( state_in[3], D ); - state_out[4] = _mm256_add_epi32( state_in[4], E ); - state_out[5] = _mm256_add_epi32( state_in[5], F ); - state_out[6] = _mm256_add_epi32( state_in[6], G ); - state_out[7] = _mm256_add_epi32( state_in[7], H ); + memcpy_256( W, data, 16 ); + SHA256_8WAY_TRANSFORM( state_out, W, state_in ); } -static void -sha256_8way_round( sha256_8way_context *ctx, __m256i *in, __m256i r[8] ) +// Accepts BE input data, need to bswap +void sha256_8way_transform_be( __m256i *state_out, const __m256i *data, + const __m256i *state_in ) { - register __m256i A, B, C, D, E, F, G, H; -#if !defined(__AVX512VL__) - __m256i X_xor_Y, Y_xor_Z; -#endif __m256i W[16]; - - mm256_block_bswap_32( W , in ); - mm256_block_bswap_32( W+8, in+8 ); - - if ( ctx->initialized ) - { - A = r[0]; - B = r[1]; - C = r[2]; - D = r[3]; - E = r[4]; - F = r[5]; - G = r[6]; - H = r[7]; - } - else - { - A = m256_const1_64( 0x6A09E6676A09E667 ); - B = m256_const1_64( 0xBB67AE85BB67AE85 ); - C = m256_const1_64( 0x3C6EF3723C6EF372 ); - D = m256_const1_64( 0xA54FF53AA54FF53A ); - E = m256_const1_64( 0x510E527F510E527F ); - F = m256_const1_64( 0x9B05688C9B05688C ); - G = m256_const1_64( 0x1F83D9AB1F83D9AB ); - H = m256_const1_64( 0x5BE0CD195BE0CD19 ); - } - -#if !defined(__AVX512VL__) - Y_xor_Z = _mm256_xor_si256( B, C ); -#endif - - SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H, 0, 0 ); - SHA2s_8WAY_STEP( H, A, B, C, D, E, F, G, 1, 0 ); - SHA2s_8WAY_STEP( G, H, A, B, C, D, E, F, 2, 0 ); - SHA2s_8WAY_STEP( F, G, H, A, B, C, D, E, 3, 0 ); - SHA2s_8WAY_STEP( E, F, G, H, A, B, C, D, 4, 0 ); - SHA2s_8WAY_STEP( D, E, F, G, H, A, B, C, 5, 0 ); - SHA2s_8WAY_STEP( C, D, E, F, G, H, A, B, 6, 0 ); - SHA2s_8WAY_STEP( B, C, D, E, F, G, H, A, 7, 0 ); - SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H, 8, 0 ); - SHA2s_8WAY_STEP( H, A, B, C, D, E, F, G, 9, 0 ); - SHA2s_8WAY_STEP( G, H, A, B, C, D, E, F, 10, 0 ); - SHA2s_8WAY_STEP( F, G, H, A, B, C, D, E, 11, 0 ); - SHA2s_8WAY_STEP( E, F, G, H, A, B, C, D, 12, 0 ); - SHA2s_8WAY_STEP( D, E, F, G, H, A, B, C, 13, 0 ); - SHA2s_8WAY_STEP( C, D, E, F, G, H, A, B, 14, 0 ); - SHA2s_8WAY_STEP( B, C, D, E, F, G, H, A, 15, 0 ); - - for ( int j = 16; j < 64; j += 16 ) - { - W[ 0] = SHA2x_MEXP( 14, 9, 1, 0 ); - W[ 1] = SHA2x_MEXP( 15, 10, 2, 1 ); - W[ 2] = SHA2x_MEXP( 0, 11, 3, 2 ); - W[ 3] = SHA2x_MEXP( 1, 12, 4, 3 ); - W[ 4] = SHA2x_MEXP( 2, 13, 5, 4 ); - W[ 5] = SHA2x_MEXP( 3, 14, 6, 5 ); - W[ 6] = SHA2x_MEXP( 4, 15, 7, 6 ); - W[ 7] = SHA2x_MEXP( 5, 0, 8, 7 ); - W[ 8] = SHA2x_MEXP( 6, 1, 9, 8 ); - W[ 9] = SHA2x_MEXP( 7, 2, 10, 9 ); - W[10] = SHA2x_MEXP( 8, 3, 11, 10 ); - W[11] = SHA2x_MEXP( 9, 4, 12, 11 ); - W[12] = SHA2x_MEXP( 10, 5, 13, 12 ); - W[13] = SHA2x_MEXP( 11, 6, 14, 13 ); - W[14] = SHA2x_MEXP( 12, 7, 15, 14 ); - W[15] = 
SHA2x_MEXP( 13, 8, 0, 15 ); - - SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H, 0, j ); - SHA2s_8WAY_STEP( H, A, B, C, D, E, F, G, 1, j ); - SHA2s_8WAY_STEP( G, H, A, B, C, D, E, F, 2, j ); - SHA2s_8WAY_STEP( F, G, H, A, B, C, D, E, 3, j ); - SHA2s_8WAY_STEP( E, F, G, H, A, B, C, D, 4, j ); - SHA2s_8WAY_STEP( D, E, F, G, H, A, B, C, 5, j ); - SHA2s_8WAY_STEP( C, D, E, F, G, H, A, B, 6, j ); - SHA2s_8WAY_STEP( B, C, D, E, F, G, H, A, 7, j ); - SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H, 8, j ); - SHA2s_8WAY_STEP( H, A, B, C, D, E, F, G, 9, j ); - SHA2s_8WAY_STEP( G, H, A, B, C, D, E, F, 10, j ); - SHA2s_8WAY_STEP( F, G, H, A, B, C, D, E, 11, j ); - SHA2s_8WAY_STEP( E, F, G, H, A, B, C, D, 12, j ); - SHA2s_8WAY_STEP( D, E, F, G, H, A, B, C, 13, j ); - SHA2s_8WAY_STEP( C, D, E, F, G, H, A, B, 14, j ); - SHA2s_8WAY_STEP( B, C, D, E, F, G, H, A, 15, j ); - } - - if ( ctx->initialized ) - { - r[0] = _mm256_add_epi32( r[0], A ); - r[1] = _mm256_add_epi32( r[1], B ); - r[2] = _mm256_add_epi32( r[2], C ); - r[3] = _mm256_add_epi32( r[3], D ); - r[4] = _mm256_add_epi32( r[4], E ); - r[5] = _mm256_add_epi32( r[5], F ); - r[6] = _mm256_add_epi32( r[6], G ); - r[7] = _mm256_add_epi32( r[7], H ); - } - else - { - ctx->initialized = true; - r[0] = _mm256_add_epi32( A, m256_const1_64( 0x6A09E6676A09E667 ) ); - r[1] = _mm256_add_epi32( B, m256_const1_64( 0xBB67AE85BB67AE85 ) ); - r[2] = _mm256_add_epi32( C, m256_const1_64( 0x3C6EF3723C6EF372 ) ); - r[3] = _mm256_add_epi32( D, m256_const1_64( 0xA54FF53AA54FF53A ) ); - r[4] = _mm256_add_epi32( E, m256_const1_64( 0x510E527F510E527F ) ); - r[5] = _mm256_add_epi32( F, m256_const1_64( 0x9B05688C9B05688C ) ); - r[6] = _mm256_add_epi32( G, m256_const1_64( 0x1F83D9AB1F83D9AB ) ); - r[7] = _mm256_add_epi32( H, m256_const1_64( 0x5BE0CD195BE0CD19 ) ); - } + mm256_block_bswap_32( W , data ); + mm256_block_bswap_32( W+8, data+8 ); + SHA256_8WAY_TRANSFORM( state_out, W, state_in ); } void sha256_8way_init( sha256_8way_context *sc ) { - sc->initialized = false; sc->count_high = sc->count_low = 0; -/* - sc->val[0] = _mm256_set1_epi32( H256[0] ); - sc->val[1] = _mm256_set1_epi32( H256[1] ); - sc->val[2] = _mm256_set1_epi32( H256[2] ); - sc->val[3] = _mm256_set1_epi32( H256[3] ); - sc->val[4] = _mm256_set1_epi32( H256[4] ); - sc->val[5] = _mm256_set1_epi32( H256[5] ); - sc->val[6] = _mm256_set1_epi32( H256[6] ); - sc->val[7] = _mm256_set1_epi32( H256[7] ); -*/ + sc->val[0] = m256_const1_64( 0x6A09E6676A09E667 ); + sc->val[1] = m256_const1_64( 0xBB67AE85BB67AE85 ); + sc->val[2] = m256_const1_64( 0x3C6EF3723C6EF372 ); + sc->val[3] = m256_const1_64( 0xA54FF53AA54FF53A ); + sc->val[4] = m256_const1_64( 0x510E527F510E527F ); + sc->val[5] = m256_const1_64( 0x9B05688C9B05688C ); + sc->val[6] = m256_const1_64( 0x1F83D9AB1F83D9AB ); + sc->val[7] = m256_const1_64( 0x5BE0CD195BE0CD19 ); } - // need to handle odd byte length for yespower. // Assume only last update is odd. 
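Editorial sketch, not part of this patch: each 64-bit literal passed to
m256_const1_64() above is simply the corresponding 32-bit SHA-256 IV word
written twice, so broadcasting it fills every 32-bit lane with the IV,
equivalent to a per-lane _mm256_set1_epi32() of that word.  The standalone
check below uses only standard AVX2 intrinsics, with _mm256_set1_epi64x as a
stand-in for the project's m256_const1_64 macro.

#include <immintrin.h>
#include <stdio.h>

int main(void)
{
   __m256i a = _mm256_set1_epi64x( 0x6A09E6676A09E667LL ); /* 64-bit bcast */
   __m256i b = _mm256_set1_epi32( 0x6A09E667 );            /* 32-bit bcast */
   __m256i d = _mm256_xor_si256( a, b );
   /* _mm256_testz_si256 returns 1 when the AND of its operands is all zero,
      so this prints 1: both broadcasts produce identical vectors. */
   printf( "identical: %d\n", _mm256_testz_si256( d, d ) );
   return 0;
}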
@@ -906,7 +572,7 @@ void sha256_8way_update( sha256_8way_context *sc, const void *data, size_t len ) len -= clen; if ( ptr == buf_size ) { - sha256_8way_round( sc, sc->buf, sc->val ); + sha256_8way_transform_be( sc->val, sc->buf, sc->val ); ptr = 0; } clow = sc->count_low; @@ -931,7 +597,7 @@ void sha256_8way_close( sha256_8way_context *sc, void *dst ) if ( ptr > pad ) { memset_zero_256( sc->buf + (ptr>>2), (buf_size - ptr) >> 2 ); - sha256_8way_round( sc, sc->buf, sc->val ); + sha256_8way_transform_be( sc->val, sc->buf, sc->val ); memset_zero_256( sc->buf, pad >> 2 ); } else @@ -944,7 +610,7 @@ void sha256_8way_close( sha256_8way_context *sc, void *dst ) sc->buf[ pad >> 2 ] = m256_const1_32( bswap_32( high ) ); sc->buf[ ( pad+4 ) >> 2 ] = m256_const1_32( bswap_32( low ) ); - sha256_8way_round( sc, sc->buf, sc->val ); + sha256_8way_transform_be( sc->val, sc->buf, sc->val ); mm256_block_bswap_32( dst, sc->val ); } @@ -986,8 +652,7 @@ void sha256_8way_full( void *dst, const void *data, size_t len ) #define SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H, i, j ) \ do { \ - __m512i T0 = _mm512_add_epi32( _mm512_set1_epi32( K256[ (j)+(i) ] ), \ - W[ i ] ); \ + __m512i T0 = _mm512_add_epi32( _mm512_set1_epi32( K256[(j)+(i)] ), W[i] ); \ __m512i T1 = BSG2_1x16( E ); \ __m512i T2 = BSG2_0x16( A ); \ T0 = _mm512_add_epi32( T0, CHx16( E, F, G ) ); \ @@ -1011,23 +676,19 @@ do { \ } while (0) */ -// accepts LE input data -void sha256_16way_transform_le( __m512i *state_out, const __m512i *data, - const __m512i *state_in ) + +static inline void SHA256_16WAY_TRANSFORM( __m512i *out, __m512i *W, + const __m512i *in ) \ { __m512i A, B, C, D, E, F, G, H; - __m512i W[16]; - - memcpy_512( W, data, 16 ); - - A = state_in[0]; - B = state_in[1]; - C = state_in[2]; - D = state_in[3]; - E = state_in[4]; - F = state_in[5]; - G = state_in[6]; - H = state_in[7]; + A = _mm512_load_si512( in ); + B = _mm512_load_si512( in+1 ); + C = _mm512_load_si512( in+2 ); + D = _mm512_load_si512( in+3 ); + E = _mm512_load_si512( in+4 ); + F = _mm512_load_si512( in+5 ); + G = _mm512_load_si512( in+6 ); + H = _mm512_load_si512( in+7 ); SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H, 0, 0 ); SHA2s_16WAY_STEP( H, A, B, C, D, E, F, G, 1, 0 ); @@ -1083,100 +744,36 @@ void sha256_16way_transform_le( __m512i *state_out, const __m512i *data, SHA2s_16WAY_STEP( B, C, D, E, F, G, H, A, 15, j ); } - state_out[0] = _mm512_add_epi32( state_in[0], A ); - state_out[1] = _mm512_add_epi32( state_in[1], B ); - state_out[2] = _mm512_add_epi32( state_in[2], C ); - state_out[3] = _mm512_add_epi32( state_in[3], D ); - state_out[4] = _mm512_add_epi32( state_in[4], E ); - state_out[5] = _mm512_add_epi32( state_in[5], F ); - state_out[6] = _mm512_add_epi32( state_in[6], G ); - state_out[7] = _mm512_add_epi32( state_in[7], H ); + out[0] = _mm512_add_epi32( in[0], A ); + out[1] = _mm512_add_epi32( in[1], B ); + out[2] = _mm512_add_epi32( in[2], C ); + out[3] = _mm512_add_epi32( in[3], D ); + out[4] = _mm512_add_epi32( in[4], E ); + out[5] = _mm512_add_epi32( in[5], F ); + out[6] = _mm512_add_epi32( in[6], G ); + out[7] = _mm512_add_epi32( in[7], H ); +} + +// accepts LE input data +void sha256_16way_transform_le( __m512i *state_out, const __m512i *data, + const __m512i *state_in ) +{ + __m512i W[16]; + memcpy_512( W, data, 16 ); + SHA256_16WAY_TRANSFORM( state_out, W, state_in ); } // Accepts BE input data, need to bswap void sha256_16way_transform_be( __m512i *state_out, const __m512i *data, const __m512i *state_in ) { - __m512i A, B, C, D, E, F, G, H; __m512i W[16]; - 
mm512_block_bswap_32( W , data ); mm512_block_bswap_32( W+8, data+8 ); - - A = state_in[0]; - B = state_in[1]; - C = state_in[2]; - D = state_in[3]; - E = state_in[4]; - F = state_in[5]; - G = state_in[6]; - H = state_in[7]; - - SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H, 0, 0 ); - SHA2s_16WAY_STEP( H, A, B, C, D, E, F, G, 1, 0 ); - SHA2s_16WAY_STEP( G, H, A, B, C, D, E, F, 2, 0 ); - SHA2s_16WAY_STEP( F, G, H, A, B, C, D, E, 3, 0 ); - SHA2s_16WAY_STEP( E, F, G, H, A, B, C, D, 4, 0 ); - SHA2s_16WAY_STEP( D, E, F, G, H, A, B, C, 5, 0 ); - SHA2s_16WAY_STEP( C, D, E, F, G, H, A, B, 6, 0 ); - SHA2s_16WAY_STEP( B, C, D, E, F, G, H, A, 7, 0 ); - SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H, 8, 0 ); - SHA2s_16WAY_STEP( H, A, B, C, D, E, F, G, 9, 0 ); - SHA2s_16WAY_STEP( G, H, A, B, C, D, E, F, 10, 0 ); - SHA2s_16WAY_STEP( F, G, H, A, B, C, D, E, 11, 0 ); - SHA2s_16WAY_STEP( E, F, G, H, A, B, C, D, 12, 0 ); - SHA2s_16WAY_STEP( D, E, F, G, H, A, B, C, 13, 0 ); - SHA2s_16WAY_STEP( C, D, E, F, G, H, A, B, 14, 0 ); - SHA2s_16WAY_STEP( B, C, D, E, F, G, H, A, 15, 0 ); - - for ( int j = 16; j < 64; j += 16 ) - { - W[ 0] = SHA2x16_MEXP( 14, 9, 1, 0 ); - W[ 1] = SHA2x16_MEXP( 15, 10, 2, 1 ); - W[ 2] = SHA2x16_MEXP( 0, 11, 3, 2 ); - W[ 3] = SHA2x16_MEXP( 1, 12, 4, 3 ); - W[ 4] = SHA2x16_MEXP( 2, 13, 5, 4 ); - W[ 5] = SHA2x16_MEXP( 3, 14, 6, 5 ); - W[ 6] = SHA2x16_MEXP( 4, 15, 7, 6 ); - W[ 7] = SHA2x16_MEXP( 5, 0, 8, 7 ); - W[ 8] = SHA2x16_MEXP( 6, 1, 9, 8 ); - W[ 9] = SHA2x16_MEXP( 7, 2, 10, 9 ); - W[10] = SHA2x16_MEXP( 8, 3, 11, 10 ); - W[11] = SHA2x16_MEXP( 9, 4, 12, 11 ); - W[12] = SHA2x16_MEXP( 10, 5, 13, 12 ); - W[13] = SHA2x16_MEXP( 11, 6, 14, 13 ); - W[14] = SHA2x16_MEXP( 12, 7, 15, 14 ); - W[15] = SHA2x16_MEXP( 13, 8, 0, 15 ); - - SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H, 0, j ); - SHA2s_16WAY_STEP( H, A, B, C, D, E, F, G, 1, j ); - SHA2s_16WAY_STEP( G, H, A, B, C, D, E, F, 2, j ); - SHA2s_16WAY_STEP( F, G, H, A, B, C, D, E, 3, j ); - SHA2s_16WAY_STEP( E, F, G, H, A, B, C, D, 4, j ); - SHA2s_16WAY_STEP( D, E, F, G, H, A, B, C, 5, j ); - SHA2s_16WAY_STEP( C, D, E, F, G, H, A, B, 6, j ); - SHA2s_16WAY_STEP( B, C, D, E, F, G, H, A, 7, j ); - SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H, 8, j ); - SHA2s_16WAY_STEP( H, A, B, C, D, E, F, G, 9, j ); - SHA2s_16WAY_STEP( G, H, A, B, C, D, E, F, 10, j ); - SHA2s_16WAY_STEP( F, G, H, A, B, C, D, E, 11, j ); - SHA2s_16WAY_STEP( E, F, G, H, A, B, C, D, 12, j ); - SHA2s_16WAY_STEP( D, E, F, G, H, A, B, C, 13, j ); - SHA2s_16WAY_STEP( C, D, E, F, G, H, A, B, 14, j ); - SHA2s_16WAY_STEP( B, C, D, E, F, G, H, A, 15, j ); - } - - state_out[0] = _mm512_add_epi32( state_in[0], A ); - state_out[1] = _mm512_add_epi32( state_in[1], B ); - state_out[2] = _mm512_add_epi32( state_in[2], C ); - state_out[3] = _mm512_add_epi32( state_in[3], D ); - state_out[4] = _mm512_add_epi32( state_in[4], E ); - state_out[5] = _mm512_add_epi32( state_in[5], F ); - state_out[6] = _mm512_add_epi32( state_in[6], G ); - state_out[7] = _mm512_add_epi32( state_in[7], H ); + SHA256_16WAY_TRANSFORM( state_out, W, state_in ); } - -// Aggresive prehashing + +// Aggresive prehashing, LE byte order void sha256_16way_prehash_3rounds( __m512i *state_mid, const __m512i *W, const __m512i *state_in ) { @@ -1295,125 +892,19 @@ void sha256_16way_final_rounds( __m512i *state_out, const __m512i *data, _mm512_store_si512( state_out + 7, H ); } -static void -sha256_16way_round( sha256_16way_context *ctx, __m512i *in, __m512i r[8] ) -{ - register __m512i A, B, C, D, E, F, G, H; - __m512i W[16]; - - mm512_block_bswap_32( W 
, in ); - mm512_block_bswap_32( W+8, in+8 ); - - if ( ctx->initialized ) - { - A = r[0]; - B = r[1]; - C = r[2]; - D = r[3]; - E = r[4]; - F = r[5]; - G = r[6]; - H = r[7]; - } - else - { - A = m512_const1_64( 0x6A09E6676A09E667 ); - B = m512_const1_64( 0xBB67AE85BB67AE85 ); - C = m512_const1_64( 0x3C6EF3723C6EF372 ); - D = m512_const1_64( 0xA54FF53AA54FF53A ); - E = m512_const1_64( 0x510E527F510E527F ); - F = m512_const1_64( 0x9B05688C9B05688C ); - G = m512_const1_64( 0x1F83D9AB1F83D9AB ); - H = m512_const1_64( 0x5BE0CD195BE0CD19 ); - } - - - SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H, 0, 0 ); - SHA2s_16WAY_STEP( H, A, B, C, D, E, F, G, 1, 0 ); - SHA2s_16WAY_STEP( G, H, A, B, C, D, E, F, 2, 0 ); - SHA2s_16WAY_STEP( F, G, H, A, B, C, D, E, 3, 0 ); - SHA2s_16WAY_STEP( E, F, G, H, A, B, C, D, 4, 0 ); - SHA2s_16WAY_STEP( D, E, F, G, H, A, B, C, 5, 0 ); - SHA2s_16WAY_STEP( C, D, E, F, G, H, A, B, 6, 0 ); - SHA2s_16WAY_STEP( B, C, D, E, F, G, H, A, 7, 0 ); - SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H, 8, 0 ); - SHA2s_16WAY_STEP( H, A, B, C, D, E, F, G, 9, 0 ); - SHA2s_16WAY_STEP( G, H, A, B, C, D, E, F, 10, 0 ); - SHA2s_16WAY_STEP( F, G, H, A, B, C, D, E, 11, 0 ); - SHA2s_16WAY_STEP( E, F, G, H, A, B, C, D, 12, 0 ); - SHA2s_16WAY_STEP( D, E, F, G, H, A, B, C, 13, 0 ); - SHA2s_16WAY_STEP( C, D, E, F, G, H, A, B, 14, 0 ); - SHA2s_16WAY_STEP( B, C, D, E, F, G, H, A, 15, 0 ); - - for ( int j = 16; j < 64; j += 16 ) - { - W[ 0] = SHA2x16_MEXP( 14, 9, 1, 0 ); - W[ 1] = SHA2x16_MEXP( 15, 10, 2, 1 ); - W[ 2] = SHA2x16_MEXP( 0, 11, 3, 2 ); - W[ 3] = SHA2x16_MEXP( 1, 12, 4, 3 ); - W[ 4] = SHA2x16_MEXP( 2, 13, 5, 4 ); - W[ 5] = SHA2x16_MEXP( 3, 14, 6, 5 ); - W[ 6] = SHA2x16_MEXP( 4, 15, 7, 6 ); - W[ 7] = SHA2x16_MEXP( 5, 0, 8, 7 ); - W[ 8] = SHA2x16_MEXP( 6, 1, 9, 8 ); - W[ 9] = SHA2x16_MEXP( 7, 2, 10, 9 ); - W[10] = SHA2x16_MEXP( 8, 3, 11, 10 ); - W[11] = SHA2x16_MEXP( 9, 4, 12, 11 ); - W[12] = SHA2x16_MEXP( 10, 5, 13, 12 ); - W[13] = SHA2x16_MEXP( 11, 6, 14, 13 ); - W[14] = SHA2x16_MEXP( 12, 7, 15, 14 ); - W[15] = SHA2x16_MEXP( 13, 8, 0, 15 ); - - SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H, 0, j ); - SHA2s_16WAY_STEP( H, A, B, C, D, E, F, G, 1, j ); - SHA2s_16WAY_STEP( G, H, A, B, C, D, E, F, 2, j ); - SHA2s_16WAY_STEP( F, G, H, A, B, C, D, E, 3, j ); - SHA2s_16WAY_STEP( E, F, G, H, A, B, C, D, 4, j ); - SHA2s_16WAY_STEP( D, E, F, G, H, A, B, C, 5, j ); - SHA2s_16WAY_STEP( C, D, E, F, G, H, A, B, 6, j ); - SHA2s_16WAY_STEP( B, C, D, E, F, G, H, A, 7, j ); - SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H, 8, j ); - SHA2s_16WAY_STEP( H, A, B, C, D, E, F, G, 9, j ); - SHA2s_16WAY_STEP( G, H, A, B, C, D, E, F, 10, j ); - SHA2s_16WAY_STEP( F, G, H, A, B, C, D, E, 11, j ); - SHA2s_16WAY_STEP( E, F, G, H, A, B, C, D, 12, j ); - SHA2s_16WAY_STEP( D, E, F, G, H, A, B, C, 13, j ); - SHA2s_16WAY_STEP( C, D, E, F, G, H, A, B, 14, j ); - SHA2s_16WAY_STEP( B, C, D, E, F, G, H, A, 15, j ); - } - - if ( ctx->initialized ) - { - r[0] = _mm512_add_epi32( r[0], A ); - r[1] = _mm512_add_epi32( r[1], B ); - r[2] = _mm512_add_epi32( r[2], C ); - r[3] = _mm512_add_epi32( r[3], D ); - r[4] = _mm512_add_epi32( r[4], E ); - r[5] = _mm512_add_epi32( r[5], F ); - r[6] = _mm512_add_epi32( r[6], G ); - r[7] = _mm512_add_epi32( r[7], H ); - } - else - { - ctx->initialized = true; - r[0] = _mm512_add_epi32( A, m512_const1_64( 0x6A09E6676A09E667 ) ); - r[1] = _mm512_add_epi32( B, m512_const1_64( 0xBB67AE85BB67AE85 ) ); - r[2] = _mm512_add_epi32( C, m512_const1_64( 0x3C6EF3723C6EF372 ) ); - r[3] = _mm512_add_epi32( D, m512_const1_64( 
0xA54FF53AA54FF53A ) ); - r[4] = _mm512_add_epi32( E, m512_const1_64( 0x510E527F510E527F ) ); - r[5] = _mm512_add_epi32( F, m512_const1_64( 0x9B05688C9B05688C ) ); - r[6] = _mm512_add_epi32( G, m512_const1_64( 0x1F83D9AB1F83D9AB ) ); - r[7] = _mm512_add_epi32( H, m512_const1_64( 0x5BE0CD195BE0CD19 ) ); - } -} - void sha256_16way_init( sha256_16way_context *sc ) { - sc->initialized = false; sc->count_high = sc->count_low = 0; + sc->val[0] = m512_const1_64( 0x6A09E6676A09E667 ); + sc->val[1] = m512_const1_64( 0xBB67AE85BB67AE85 ); + sc->val[2] = m512_const1_64( 0x3C6EF3723C6EF372 ); + sc->val[3] = m512_const1_64( 0xA54FF53AA54FF53A ); + sc->val[4] = m512_const1_64( 0x510E527F510E527F ); + sc->val[5] = m512_const1_64( 0x9B05688C9B05688C ); + sc->val[6] = m512_const1_64( 0x1F83D9AB1F83D9AB ); + sc->val[7] = m512_const1_64( 0x5BE0CD195BE0CD19 ); } - void sha256_16way_update( sha256_16way_context *sc, const void *data, size_t len ) { @@ -1436,7 +927,7 @@ void sha256_16way_update( sha256_16way_context *sc, const void *data, len -= clen; if ( ptr == buf_size ) { - sha256_16way_round( sc, sc->buf, sc->val ); + sha256_16way_transform_be( sc->val, sc->buf, sc->val ); ptr = 0; } clow = sc->count_low; @@ -1461,7 +952,7 @@ void sha256_16way_close( sha256_16way_context *sc, void *dst ) if ( ptr > pad ) { memset_zero_512( sc->buf + (ptr>>2), (buf_size - ptr) >> 2 ); - sha256_16way_round( sc, sc->buf, sc->val ); + sha256_16way_transform_be( sc->val, sc->buf, sc->val ); memset_zero_512( sc->buf, pad >> 2 ); } else @@ -1474,7 +965,7 @@ void sha256_16way_close( sha256_16way_context *sc, void *dst ) sc->buf[ pad >> 2 ] = m512_const1_32( bswap_32( high ) ); sc->buf[ ( pad+4 ) >> 2 ] = m512_const1_32( bswap_32( low ) ); - sha256_16way_round( sc, sc->buf, sc->val ); + sha256_16way_transform_be( sc->val, sc->buf, sc->val ); mm512_block_bswap_32( dst, sc->val ); } diff --git a/algo/shabal/shabal-hash-4way.c b/algo/shabal/shabal-hash-4way.c index c53cb39f..8225595b 100644 --- a/algo/shabal/shabal-hash-4way.c +++ b/algo/shabal/shabal-hash-4way.c @@ -70,6 +70,8 @@ extern "C"{ C8, C9, CA, CB, CC, CD, CE, CF; \ __m256i M0, M1, M2, M3, M4, M5, M6, M7, \ M8, M9, MA, MB, MC, MD, ME, MF; \ + const __m256i FIVE = _mm256_set1_epi32( 5 ); \ + const __m256i THREE = _mm256_set1_epi32( 3 ); \ sph_u32 Wlow, Whigh; #define READ_STATE8(state) do \ @@ -314,8 +316,7 @@ do { \ _mm256_andnot_si256( xb3, xb2 ), \ _mm256_mullo_epi32( mm256_xor3( xa0, xc, \ _mm256_mullo_epi32( mm256_rol_32( xa1, 15 ), \ - _mm256_set1_epi32(5UL) ) ), \ - _mm256_set1_epi32(3UL) ) ) ); \ + FIVE ) ), THREE ) ) ); \ xb0 = mm256_xnor( xa0, mm256_rol_32( xb0, 1 ) ); \ } while (0) @@ -667,7 +668,9 @@ shabal512_8way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) C8, C9, CA, CB, CC, CD, CE, CF; \ __m128i M0, M1, M2, M3, M4, M5, M6, M7, \ M8, M9, MA, MB, MC, MD, ME, MF; \ - sph_u32 Wlow, Whigh; + const __m128i FIVE = _mm_set1_epi32( 5 ); \ + const __m128i THREE = _mm_set1_epi32( 3 ); \ + sph_u32 Wlow, Whigh; #define READ_STATE(state) do \ { \ @@ -931,8 +934,8 @@ do { \ xa0 = _mm_xor_si128( xm, _mm_xor_si128( xb1, _mm_xor_si128( \ _mm_andnot_si128( xb3, xb2 ), \ _mm_mullo_epi32( _mm_xor_si128( xa0, _mm_xor_si128( xc, \ - _mm_mullo_epi32( mm128_rol_32( xa1, 15 ), _mm_set1_epi32(5UL) ) \ - ) ), _mm_set1_epi32(3UL) ) ) ) ); \ + _mm_mullo_epi32( mm128_rol_32( xa1, 15 ), FIVE ) \ + ) ), THREE ) ) ) ); \ xb0 = mm128_not( _mm_xor_si128( xa0, mm128_rol_32( xb0, 1 ) ) ); \ } while (0) diff --git a/algo/swifftx/inttypes.h b/algo/swifftx/inttypes.h index 
2b6b941b..9f74eee2 100644 --- a/algo/swifftx/inttypes.h +++ b/algo/swifftx/inttypes.h @@ -18,16 +18,20 @@ #ifndef __INTTYPES_H_ #define __INTTYPES_H_ +#include + /* Use [u]intN_t if you need exactly N bits. XXX - doesn't handle the -mint8 option. */ typedef signed char swift_int8_t; typedef unsigned char swift_uint8_t; - typedef int swift_int16_t; + typedef int32_t swift_int16_t; +// typedef int swift_int16_t; typedef unsigned int swift_uint16_t; - typedef long swift_int32_t; + typedef int32_t swift_int32_t; +// typedef long swift_int32_t; typedef unsigned long swift_uint32_t; typedef long long swift_int64_t; diff --git a/algo/swifftx/swifftx.c b/algo/swifftx/swifftx.c index f38ea854..d3ecd15c 100644 --- a/algo/swifftx/swifftx.c +++ b/algo/swifftx/swifftx.c @@ -18,6 +18,8 @@ //#include "stdbool.h" #include +#include "simd-utils.h" + /////////////////////////////////////////////////////////////////////////////////////////////// // Constants and static tables portion. /////////////////////////////////////////////////////////////////////////////////////////////// @@ -49,20 +51,20 @@ // - A: the first operand. After the operation stores the sum of the two operands. // - B: the second operand. After the operation stores the difference between the first and the // second operands. -#define ADD_SUB(A, B) {register int temp = (B); B = ((A) - (B)); A = ((A) + (temp));} +//#define ADD_SUB(A, B) {register int temp = (B); B = ((A) - (B)); A = ((A) + (temp));} // Quickly reduces an integer modulo 257. // // Parameters: // - A: the input. -#define Q_REDUCE(A) (((A) & 0xff) - ((A) >> 8)) +//#define Q_REDUCE(A) (((A) & 0xff) - ((A) >> 8)) // Since we need to do the setup only once, this is the indicator variable: static bool wasSetupDone = false; // This array stores the powers of omegas that correspond to the indices, which are the input // values. Known also as the "outer FFT twiddle factors". -swift_int16_t multipliers[N]; +swift_int16_t multipliers[N] __attribute__ ((aligned (64))); // This array stores the powers of omegas, multiplied by the corresponding values. // We store this table to save computation time. @@ -72,14 +74,14 @@ swift_int16_t multipliers[N]; // compression function, i is between 0 and 31, x_i is a 64-bit value. // One can see the formula for this (intermediate) stage in the SWIFFT FSE 2008 paper -- // formula (2), section 3, page 6. -swift_int16_t fftTable[256 * EIGHTH_N]; +swift_int16_t fftTable[256 * EIGHTH_N] __attribute__ ((aligned (64))); // The A's we use in SWIFFTX shall be random elements of Z_257. // We generated these A's from the decimal expansion of PI as follows: we converted each // triple of digits into a decimal number d. If d < (257 * 3) we used (d % 257) for the next A // element, otherwise move to the next triple of digits in the expansion. This guarntees that // the A's are random, provided that PI digits are. -const swift_int16_t As[3 * M * N] = +const swift_int16_t As[3 * M * N] __attribute__ ((aligned (64))) = {141, 78, 139, 75, 238, 205, 129, 126, 22, 245, 197, 169, 142, 118, 105, 78, 50, 149, 29, 208, 114, 34, 85, 117, 67, 148, 86, 256, 25, 49, 133, 93, 95, 36, 68, 231, 211, 102, 151, 128, 224, 117, 193, 27, 102, 187, 7, 105, @@ -636,9 +638,202 @@ void InitializeSWIFFTX() wasSetupDone = true; } +// In the original code the F matrix is rotated so it was not aranged +// the same as all the other data. 
Rearanging F to match all the other +// data made vectorizing possible, the compiler probably could have been +// able to auto-vectorize with proper data organisation. +// Also in the original code the custom 16 bit data types are all now 32 +// bit int32_t regardless of the type name. +// void FFT(const unsigned char input[EIGHTH_N], swift_int32_t *output) { - swift_int16_t *mult = multipliers; +#if defined(__AVX2__) + + __m256i F[8] __attribute__ ((aligned (64))); + __m256i *mul = (__m256i*)multipliers; + __m256i *out = (__m256i*)output; + __m256i *tbl = (__m256i*)&( fftTable[ input[0] << 3 ] ); + + F[0] = _mm256_mullo_epi32( mul[0], *tbl ); + tbl = (__m256i*)&( fftTable[ input[1] << 3 ] ); + F[1] = _mm256_mullo_epi32( mul[1], *tbl ); + tbl = (__m256i*)&( fftTable[ input[2] << 3 ] ); + F[2] = _mm256_mullo_epi32( mul[2], *tbl ); + tbl = (__m256i*)&( fftTable[ input[3] << 3 ] ); + F[3] = _mm256_mullo_epi32( mul[3], *tbl ); + tbl = (__m256i*)&( fftTable[ input[4] << 3 ] ); + F[4] = _mm256_mullo_epi32( mul[4], *tbl ); + tbl = (__m256i*)&( fftTable[ input[5] << 3 ] ); + F[5] = _mm256_mullo_epi32( mul[5], *tbl ); + tbl = (__m256i*)&( fftTable[ input[6] << 3 ] ); + F[6] = _mm256_mullo_epi32( mul[6], *tbl ); + tbl = (__m256i*)&( fftTable[ input[7] << 3 ] ); + F[7] = _mm256_mullo_epi32( mul[7], *tbl ); + + #define ADD_SUB( a, b ) \ + { \ + __m256i tmp = b; \ + b = _mm256_sub_epi32( a, b ); \ + a = _mm256_add_epi32( a, tmp ); \ + } + + ADD_SUB( F[0], F[1] ); + ADD_SUB( F[2], F[3] ); + ADD_SUB( F[4], F[5] ); + ADD_SUB( F[6], F[7] ); + + F[3] = _mm256_slli_epi32( F[3], 4 ); + F[7] = _mm256_slli_epi32( F[7], 4 ); + + ADD_SUB( F[0], F[2] ); + ADD_SUB( F[1], F[3] ); + ADD_SUB( F[4], F[6] ); + ADD_SUB( F[5], F[7] ); + + F[5] = _mm256_slli_epi32( F[5], 2 ); + F[6] = _mm256_slli_epi32( F[6], 4 ); + F[7] = _mm256_slli_epi32( F[7], 6 ); + + ADD_SUB( F[0], F[4] ); + ADD_SUB( F[1], F[5] ); + ADD_SUB( F[2], F[6] ); + ADD_SUB( F[3], F[7] ); + + #undef ADD_SUB + +#if defined (__AVX512VL__) && defined(__AVX512BW__) + + #define Q_REDUCE( a ) \ + _mm256_sub_epi32( _mm256_and_si256( a, \ + _mm256_movm_epi8( 0x11111111 ) ), _mm256_srai_epi32( a, 8 ) ) + +#else + + #define Q_REDUCE( a ) \ + _mm256_sub_epi32( _mm256_and_si256( a, \ + m256_const1_32( 0x000000ff ) ), _mm256_srai_epi32( a, 8 ) ) + +#endif + + out[0] = Q_REDUCE( F[0] ); + out[1] = Q_REDUCE( F[1] ); + out[2] = Q_REDUCE( F[2] ); + out[3] = Q_REDUCE( F[3] ); + out[4] = Q_REDUCE( F[4] ); + out[5] = Q_REDUCE( F[5] ); + out[6] = Q_REDUCE( F[6] ); + out[7] = Q_REDUCE( F[7] ); + + #undef Q_REDUCE + +#elif defined(__SSE4_1__) + + __m128i F[16] __attribute__ ((aligned (64))); + __m128i *mul = (__m128i*)multipliers; + __m128i *out = (__m128i*)output; + __m128i *tbl = (__m128i*)&( fftTable[ input[0] << 3 ] ); + + F[ 0] = _mm_mullo_epi32( mul[ 0], tbl[0] ); + F[ 1] = _mm_mullo_epi32( mul[ 1], tbl[1] ); + tbl = (__m128i*)&( fftTable[ input[1] << 3 ] ); + F[ 2] = _mm_mullo_epi32( mul[ 2], tbl[0] ); + F[ 3] = _mm_mullo_epi32( mul[ 3], tbl[1] ); + tbl = (__m128i*)&( fftTable[ input[2] << 3 ] ); + F[ 4] = _mm_mullo_epi32( mul[ 4], tbl[0] ); + F[ 5] = _mm_mullo_epi32( mul[ 5], tbl[1] ); + tbl = (__m128i*)&( fftTable[ input[3] << 3 ] ); + F[ 6] = _mm_mullo_epi32( mul[ 6], tbl[0] ); + F[ 7] = _mm_mullo_epi32( mul[ 7], tbl[1] ); + tbl = (__m128i*)&( fftTable[ input[4] << 3 ] ); + F[ 8] = _mm_mullo_epi32( mul[ 8], tbl[0] ); + F[ 9] = _mm_mullo_epi32( mul[ 9], tbl[1] ); + tbl = (__m128i*)&( fftTable[ input[5] << 3 ] ); + F[10] = _mm_mullo_epi32( mul[10], tbl[0] ); + F[11] = 
_mm_mullo_epi32( mul[11], tbl[1] ); + tbl = (__m128i*)&( fftTable[ input[6] << 3 ] ); + F[12] = _mm_mullo_epi32( mul[12], tbl[0] ); + F[13] = _mm_mullo_epi32( mul[13], tbl[1] ); + tbl = (__m128i*)&( fftTable[ input[7] << 3 ] ); + F[14] = _mm_mullo_epi32( mul[14], tbl[0] ); + F[15] = _mm_mullo_epi32( mul[15], tbl[1] ); + + #define ADD_SUB( a, b ) \ + { \ + __m128i tmp = b; \ + b = _mm_sub_epi32( a, b ); \ + a = _mm_add_epi32( a, tmp ); \ + } + + ADD_SUB( F[ 0], F[ 2] ); + ADD_SUB( F[ 1], F[ 3] ); + ADD_SUB( F[ 4], F[ 6] ); + ADD_SUB( F[ 5], F[ 7] ); + ADD_SUB( F[ 8], F[10] ); + ADD_SUB( F[ 9], F[11] ); + ADD_SUB( F[12], F[14] ); + ADD_SUB( F[13], F[15] ); + + F[ 6] = _mm_slli_epi32( F[ 6], 4 ); + F[ 7] = _mm_slli_epi32( F[ 7], 4 ); + F[14] = _mm_slli_epi32( F[14], 4 ); + F[15] = _mm_slli_epi32( F[15], 4 ); + + ADD_SUB( F[ 0], F[ 4] ); + ADD_SUB( F[ 1], F[ 5] ); + ADD_SUB( F[ 2], F[ 6] ); + ADD_SUB( F[ 3], F[ 7] ); + ADD_SUB( F[ 8], F[12] ); + ADD_SUB( F[ 9], F[13] ); + ADD_SUB( F[10], F[14] ); + ADD_SUB( F[11], F[15] ); + + F[10] = _mm_slli_epi32( F[10], 2 ); + F[11] = _mm_slli_epi32( F[11], 2 ); + F[12] = _mm_slli_epi32( F[12], 4 ); + F[13] = _mm_slli_epi32( F[13], 4 ); + F[14] = _mm_slli_epi32( F[14], 6 ); + F[15] = _mm_slli_epi32( F[15], 6 ); + + ADD_SUB( F[ 0], F[ 8] ); + ADD_SUB( F[ 1], F[ 9] ); + ADD_SUB( F[ 2], F[10] ); + ADD_SUB( F[ 3], F[11] ); + ADD_SUB( F[ 4], F[12] ); + ADD_SUB( F[ 5], F[13] ); + ADD_SUB( F[ 6], F[14] ); + ADD_SUB( F[ 7], F[15] ); + + #undef ADD_SUB + + #define Q_REDUCE( a ) \ + _mm_sub_epi32( _mm_and_si128( a, \ + m128_const1_32( 0x000000ff ) ), _mm_srai_epi32( a, 8 ) ) + + out[ 0] = Q_REDUCE( F[ 0] ); + out[ 1] = Q_REDUCE( F[ 1] ); + out[ 2] = Q_REDUCE( F[ 2] ); + out[ 3] = Q_REDUCE( F[ 3] ); + out[ 4] = Q_REDUCE( F[ 4] ); + out[ 5] = Q_REDUCE( F[ 5] ); + out[ 6] = Q_REDUCE( F[ 6] ); + out[ 7] = Q_REDUCE( F[ 7] ); + out[ 8] = Q_REDUCE( F[ 8] ); + out[ 9] = Q_REDUCE( F[ 9] ); + out[10] = Q_REDUCE( F[10] ); + out[11] = Q_REDUCE( F[11] ); + out[12] = Q_REDUCE( F[12] ); + out[13] = Q_REDUCE( F[13] ); + out[14] = Q_REDUCE( F[14] ); + out[15] = Q_REDUCE( F[15] ); + + #undef Q_REDUCE + +#else // < SSE4.1 + + swift_int16_t *mult = multipliers; + + // First loop unrolling: + register swift_int16_t *table = &(fftTable[input[0] << 3]); /* swift_int32_t F[64]; @@ -666,11 +861,8 @@ void FFT(const unsigned char input[EIGHTH_N], swift_int32_t *output) F50, F51, F52, F53, F54, F55, F56, F57, F58, F59, F60, F61, F62, F63; - // First loop unrolling: - register swift_int16_t *table = &(fftTable[input[0] << 3]); - - F0 = mult[0] * table[0]; - F8 = mult[1] * table[1]; + F0 = mult[0] * table[0]; + F8 = mult[1] * table[1]; F16 = mult[2] * table[2]; F24 = mult[3] * table[3]; F32 = mult[4] * table[4]; @@ -678,90 +870,93 @@ void FFT(const unsigned char input[EIGHTH_N], swift_int32_t *output) F48 = mult[6] * table[6]; F56 = mult[7] * table[7]; - mult += 8; table = &(fftTable[input[1] << 3]); - F1 = mult[0] * table[0]; - F9 = mult[1] * table[1]; - F17 = mult[2] * table[2]; - F25 = mult[3] * table[3]; - F33 = mult[4] * table[4]; - F41 = mult[5] * table[5]; - F49 = mult[6] * table[6]; - F57 = mult[7] * table[7]; + F1 = mult[ 8] * table[0]; + F9 = mult[ 9] * table[1]; + F17 = mult[10] * table[2]; + F25 = mult[11] * table[3]; + F33 = mult[12] * table[4]; + F41 = mult[13] * table[5]; + F49 = mult[14] * table[6]; + F57 = mult[15] * table[7]; - mult += 8; table = &(fftTable[input[2] << 3]); - F2 = mult[0] * table[0]; - F10 = mult[1] * table[1]; - F18 = mult[2] * table[2]; - F26 = mult[3] * 
table[3]; - F34 = mult[4] * table[4]; - F42 = mult[5] * table[5]; - F50 = mult[6] * table[6]; - F58 = mult[7] * table[7]; + F2 = mult[16] * table[0]; + F10 = mult[17] * table[1]; + F18 = mult[18] * table[2]; + F26 = mult[19] * table[3]; + F34 = mult[20] * table[4]; + F42 = mult[21] * table[5]; + F50 = mult[22] * table[6]; + F58 = mult[23] * table[7]; - mult += 8; table = &(fftTable[input[3] << 3]); - F3 = mult[0] * table[0]; - F11 = mult[1] * table[1]; - F19 = mult[2] * table[2]; - F27 = mult[3] * table[3]; - F35 = mult[4] * table[4]; - F43 = mult[5] * table[5]; - F51 = mult[6] * table[6]; - F59 = mult[7] * table[7]; + F3 = mult[24] * table[0]; + F11 = mult[25] * table[1]; + F19 = mult[26] * table[2]; + F27 = mult[27] * table[3]; + F35 = mult[28] * table[4]; + F43 = mult[29] * table[5]; + F51 = mult[30] * table[6]; + F59 = mult[31] * table[7]; - mult += 8; table = &(fftTable[input[4] << 3]); - F4 = mult[0] * table[0]; - F12 = mult[1] * table[1]; - F20 = mult[2] * table[2]; - F28 = mult[3] * table[3]; - F36 = mult[4] * table[4]; - F44 = mult[5] * table[5]; - F52 = mult[6] * table[6]; - F60 = mult[7] * table[7]; + F4 = mult[32] * table[0]; + F12 = mult[33] * table[1]; + F20 = mult[34] * table[2]; + F28 = mult[35] * table[3]; + F36 = mult[36] * table[4]; + F44 = mult[37] * table[5]; + F52 = mult[38] * table[6]; + F60 = mult[39] * table[7]; - mult += 8; table = &(fftTable[input[5] << 3]); - F5 = mult[0] * table[0]; - F13 = mult[1] * table[1]; - F21 = mult[2] * table[2]; - F29 = mult[3] * table[3]; - F37 = mult[4] * table[4]; - F45 = mult[5] * table[5]; - F53 = mult[6] * table[6]; - F61 = mult[7] * table[7]; + F5 = mult[40] * table[0]; + F13 = mult[41] * table[1]; + F21 = mult[42] * table[2]; + F29 = mult[43] * table[3]; + F37 = mult[44] * table[4]; + F45 = mult[45] * table[5]; + F53 = mult[46] * table[6]; + F61 = mult[47] * table[7]; - mult += 8; table = &(fftTable[input[6] << 3]); - F6 = mult[0] * table[0]; - F14 = mult[1] * table[1]; - F22 = mult[2] * table[2]; - F30 = mult[3] * table[3]; - F38 = mult[4] * table[4]; - F46 = mult[5] * table[5]; - F54 = mult[6] * table[6]; - F62 = mult[7] * table[7]; + F6 = mult[48] * table[0]; + F14 = mult[49] * table[1]; + F22 = mult[50] * table[2]; + F30 = mult[51] * table[3]; + F38 = mult[52] * table[4]; + F46 = mult[53] * table[5]; + F54 = mult[54] * table[6]; + F62 = mult[55] * table[7]; - mult += 8; table = &(fftTable[input[7] << 3]); - F7 = mult[0] * table[0]; - F15 = mult[1] * table[1]; - F23 = mult[2] * table[2]; - F31 = mult[3] * table[3]; - F39 = mult[4] * table[4]; - F47 = mult[5] * table[5]; - F55 = mult[6] * table[6]; - F63 = mult[7] * table[7]; - + F7 = mult[56] * table[0]; + F15 = mult[57] * table[1]; + F23 = mult[58] * table[2]; + F31 = mult[59] * table[3]; + F39 = mult[60] * table[4]; + F47 = mult[61] * table[5]; + F55 = mult[62] * table[6]; + F63 = mult[63] * table[7]; + + #define ADD_SUB( a, b ) \ + { \ + int temp = b; \ + b = a - b; \ + a = a + temp; \ + } + + #define Q_REDUCE( a ) \ + ( ( (a) & 0xff ) - ( (a) >> 8 ) ) + /* for ( int i = 0; i < 8; i++ ) @@ -800,7 +995,6 @@ void FFT(const unsigned char input[EIGHTH_N], swift_int32_t *output) } */ - // Second loop unrolling: // Iteration 0: ADD_SUB(F0, F1); @@ -1057,6 +1251,11 @@ void FFT(const unsigned char input[EIGHTH_N], swift_int32_t *output) output[47] = Q_REDUCE(F61); output[55] = Q_REDUCE(F62); output[63] = Q_REDUCE(F63); + + #undef ADD_SUB + #undef Q_REDUCE + +#endif // AVX2 elif SSE4.1 else } // Calculates the FFT part of SWIFFT. 
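The Q_REDUCE macro used throughout FFT() relies on 256 being congruent to -1 mod 257: writing a = 256*h + l, the value (a & 0xff) - (a >> 8) = l - h is congruent to a mod 257, so the inner FFT avoids a full modulo. A minimal standalone sketch (plain C, not part of the patch) that checks the identity over all 16-bit inputs:

#include <stdio.h>
#include <stdint.h>

/* Same reduction as the scalar Q_REDUCE macro in FFT(). */
static int32_t q_reduce( int32_t a )
{
   return ( a & 0xff ) - ( a >> 8 );
}

int main(void)
{
   for ( int32_t a = 0; a < (1 << 16); a++ )
   {
      /* q_reduce may return a negative representative; fold it into
         [0,256] before comparing against the true remainder. */
      int32_t canon = ( ( q_reduce( a ) % 257 ) + 257 ) % 257;
      if ( canon != a % 257 )
      {
         printf( "mismatch at %d\n", a );
         return 1;
      }
   }
   printf( "Q_REDUCE matches a mod 257 for all 16-bit inputs\n" );
   return 0;
}

The partially reduced (possibly negative) values are only folded into the field later, in SWIFFTSum's ((FIELD_SIZE << 22) + result[j]) % FIELD_SIZE step.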
@@ -1086,24 +1285,66 @@ void SWIFFTFFT(const unsigned char *input, int m, swift_int32_t *output) // - m: the input size divided by 64. // - output: will store the result. // - a: the coefficients in the sum. Of size 64 * m. -void SWIFFTSum(const swift_int32_t *input, int m, unsigned char *output, const swift_int16_t *a) +void SWIFFTSum( const swift_int32_t *input, int m, unsigned char *output, + const swift_int16_t *a ) { int i, j; - swift_int32_t result[N]; + swift_int32_t result[N] __attribute__ ((aligned (64))); register swift_int16_t carry = 0; +#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) + + __m512i *res = (__m512i*)result; + for ( j = 0; j < N/16; ++j ) + { + __m512i sum = _mm512_setzero_si512(); + const __m512i *f = (__m512i*)input + j; + const __m512i *k = (__m512i*)a + j; + for ( i = 0; i < m; i++, f += N/16, k += N/16 ) + sum = _mm512_add_epi32( sum, _mm512_mullo_epi32( *f, *k ) ); + res[j] = sum; + } + +#elif defined(__AVX2__) + + __m256i *res = (__m256i*)result; + for ( j = 0; j < N/8; ++j ) + { + __m256i sum = _mm256_setzero_si256(); + const __m256i *f = (__m256i*)input + j; + const __m256i *k = (__m256i*)a + j; + for ( i = 0; i < m; i++, f += N/8, k += N/8 ) + sum = _mm256_add_epi32( sum, _mm256_mullo_epi32( *f, *k ) ); + res[j] = sum; + } + +#elif defined(__SSE4_1__) + + __m128i *res = (__m128i*)result; + for ( j = 0; j < N/4; ++j ) + { + __m128i sum = _mm_setzero_si128(); + const __m128i *f = (__m128i*)input + j; + const __m128i *k = (__m128i*)a + j; + for ( i = 0; i < m; i++, f += N/4, k += N/4 ) + sum = _mm_add_epi32( sum, _mm_mullo_epi32( *f, *k ) ); + res[j] = sum; + } + +#else + for (j = 0; j < N; ++j) { register swift_int32_t sum = 0; const register swift_int32_t *f = input + j; const register swift_int16_t *k = a + j; - for (i = 0; i < m; i++, f += N,k += N) sum += (*f) * (*k); - result[j] = sum; } +#endif + for (j = 0; j < N; ++j) result[j] = ((FIELD_SIZE << 22) + result[j]) % FIELD_SIZE; @@ -1122,8 +1363,8 @@ void ComputeSingleSWIFFTX_smooth(unsigned char input[SWIFFTX_INPUT_BLOCK_SIZE], { int i; // Will store the result of the FFT parts: - swift_int32_t fftOut[N * M]; - unsigned char intermediate[N * 3 + 8]; + swift_int32_t fftOut[N * M] __attribute__ ((aligned (64))); + unsigned char intermediate[N * 3 + 8] __attribute__ ((aligned (64))); unsigned char carry0,carry1,carry2; // Do the three SWIFFTS while remembering the three carry bytes (each carry byte gets @@ -1199,8 +1440,8 @@ void ComputeSingleSWIFFTX( unsigned char input[SWIFFTX_INPUT_BLOCK_SIZE], { int i; // Will store the result of the FFT parts: - swift_int32_t fftOut[N * M]; - unsigned char intermediate[N * 3 + 8]; + swift_int32_t fftOut[N * M] __attribute__ ((aligned (64))); + unsigned char intermediate[N * 3 + 8] __attribute__ ((aligned (64))); unsigned char carry0,carry1,carry2; // Do the three SWIFFTS while remembering the three carry bytes (each carry byte gets diff --git a/configure b/configure index db3efc9f..ae0d7bec 100755 --- a/configure +++ b/configure @@ -1,6 +1,6 @@ #! /bin/sh # Guess values for system-dependent variables and create Makefiles. -# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.18.0. +# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.18.1. # # # Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc. @@ -577,8 +577,8 @@ MAKEFLAGS= # Identity of this package. 
PACKAGE_NAME='cpuminer-opt' PACKAGE_TARNAME='cpuminer-opt' -PACKAGE_VERSION='3.18.0' -PACKAGE_STRING='cpuminer-opt 3.18.0' +PACKAGE_VERSION='3.18.1' +PACKAGE_STRING='cpuminer-opt 3.18.1' PACKAGE_BUGREPORT='' PACKAGE_URL='' @@ -1332,7 +1332,7 @@ if test "$ac_init_help" = "long"; then # Omit some internal or obsolete options to make the list less imposing. # This message is too long to be a string in the A/UX 3.1 sh. cat <<_ACEOF -\`configure' configures cpuminer-opt 3.18.0 to adapt to many kinds of systems. +\`configure' configures cpuminer-opt 3.18.1 to adapt to many kinds of systems. Usage: $0 [OPTION]... [VAR=VALUE]... @@ -1404,7 +1404,7 @@ fi if test -n "$ac_init_help"; then case $ac_init_help in - short | recursive ) echo "Configuration of cpuminer-opt 3.18.0:";; + short | recursive ) echo "Configuration of cpuminer-opt 3.18.1:";; esac cat <<\_ACEOF @@ -1509,7 +1509,7 @@ fi test -n "$ac_init_help" && exit $ac_status if $ac_init_version; then cat <<\_ACEOF -cpuminer-opt configure 3.18.0 +cpuminer-opt configure 3.18.1 generated by GNU Autoconf 2.69 Copyright (C) 2012 Free Software Foundation, Inc. @@ -2012,7 +2012,7 @@ cat >config.log <<_ACEOF This file contains any messages produced by compilers while running configure, to aid debugging if configure makes a mistake. -It was created by cpuminer-opt $as_me 3.18.0, which was +It was created by cpuminer-opt $as_me 3.18.1, which was generated by GNU Autoconf 2.69. Invocation command line was $ $0 $@ @@ -2993,7 +2993,7 @@ fi # Define the identity of the package. PACKAGE='cpuminer-opt' - VERSION='3.18.0' + VERSION='3.18.1' cat >>confdefs.h <<_ACEOF @@ -6690,7 +6690,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 # report actual input values of CONFIG_FILES etc. instead of their # values after options handling. ac_log=" -This file was extended by cpuminer-opt $as_me 3.18.0, which was +This file was extended by cpuminer-opt $as_me 3.18.1, which was generated by GNU Autoconf 2.69. 
Invocation command line was CONFIG_FILES = $CONFIG_FILES @@ -6756,7 +6756,7 @@ _ACEOF cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`" ac_cs_version="\\ -cpuminer-opt config.status 3.18.0 +cpuminer-opt config.status 3.18.1 configured by $0, generated by GNU Autoconf 2.69, with options \\"\$ac_cs_config\\" diff --git a/configure.ac b/configure.ac index fbe5a9b0..869b3669 100644 --- a/configure.ac +++ b/configure.ac @@ -1,4 +1,4 @@ -AC_INIT([cpuminer-opt], [3.18.0]) +AC_INIT([cpuminer-opt], [3.18.1]) AC_PREREQ([2.59c]) AC_CANONICAL_SYSTEM diff --git a/cpu-miner.c b/cpu-miner.c index c8895381..2a63729e 100644 --- a/cpu-miner.c +++ b/cpu-miner.c @@ -2083,7 +2083,8 @@ static void stratum_gen_work( struct stratum_ctx *sctx, struct work *g_work ) / ( opt_target_factor * opt_diff_factor ); diff_to_hash( g_work->target, g_work->targetdiff ); - // Increment extranonce2 + // Pre increment extranonce2 in case of being called again before receiving + // a new job for ( int t = 0; t < sctx->xnonce2_size && !( ++sctx->job.xnonce2[t] ); t++ ); @@ -2103,20 +2104,12 @@ static void stratum_gen_work( struct stratum_ctx *sctx, struct work *g_work ) pthread_mutex_unlock( &stats_lock ); - if ( !opt_quiet ) - { - int mismatch = submitted_share_count - - ( accepted_share_count + stale_share_count + rejected_share_count ); - if ( mismatch ) - applog(LOG_INFO, CL_LBL "%d Submitted share pending, maybe stale" CL_N, submitted_share_count ); - } - if ( stratum_diff != sctx->job.diff ) applog( LOG_BLUE, "New Stratum Diff %g, Block %d, Job %s", sctx->job.diff, sctx->block_height, g_work->job_id ); else if ( last_block_height != sctx->block_height ) - applog( LOG_BLUE, "New Block %d, Job %s", - sctx->block_height, g_work->job_id ); + applog( LOG_BLUE, "New Block %d, Net diff %.5g, Job %s", + sctx->block_height, net_diff, g_work->job_id ); else if ( g_work->job_id && new_job ) applog( LOG_BLUE, "New Work: Block %d, Net diff %.5g, Job %s", sctx->block_height, net_diff, g_work->job_id ); @@ -2173,7 +2166,6 @@ static void stratum_gen_work( struct stratum_ctx *sctx, struct work *g_work ) { double net_hr = nd / net_ttf; char net_hr_units[4] = {0}; - scale_hash_for_display ( &net_hr, net_hr_units ); applog2( LOG_INFO, "Net hash rate (est) %.2f %sh/s", net_hr, net_hr_units ); @@ -2182,6 +2174,17 @@ static void stratum_gen_work( struct stratum_ctx *sctx, struct work *g_work ) } // hr > 0 } // !quiet } // new diff/block + + if ( new_job && !opt_quiet ) + { + int mismatch = submitted_share_count - ( accepted_share_count + + stale_share_count + + rejected_share_count ); + if ( mismatch ) + applog( LOG_INFO, + CL_LBL "%d Submitted share pending, maybe stale" CL_N, + submitted_share_count ); + } } static void *miner_thread( void *userdata ) @@ -3970,6 +3973,7 @@ int main(int argc, char *argv[]) gettimeofday( &last_submit_time, NULL ); memcpy( &five_min_start, &last_submit_time, sizeof (struct timeval) ); memcpy( &session_start, &last_submit_time, sizeof (struct timeval) ); + memcpy( &total_hashes_time, &last_submit_time, sizeof (struct timeval) ); pthread_mutex_unlock( &stats_lock ); applog( LOG_INFO, "%d of %d miner threads started using '%s' algorithm", diff --git a/simd-utils/simd-256.h b/simd-utils/simd-256.h index 3d840107..1116976f 100644 --- a/simd-utils/simd-256.h +++ b/simd-utils/simd-256.h @@ -1,7 +1,7 @@ #if !defined(SIMD_256_H__) #define SIMD_256_H__ 1 -#if defined(__AVX2__) +//#if defined(__AVX2__) 
///////////////////////////////////////////////////////////////////// // @@ -14,7 +14,9 @@ // is limited because 256 bit vectors are less likely to be used when 512 // is available. -// Used instead if casting. +#if defined(__AVX__) + +// Used instead of casting. typedef union { __m256i m256; @@ -23,6 +25,28 @@ typedef union uint32_t u32[8]; } __attribute__ ((aligned (32))) m256_ovly; +// +// Pointer casting + +// p = any aligned pointer +// returns p as pointer to vector type, not very useful +#define castp_m256i(p) ((__m256i*)(p)) + +// p = any aligned pointer +// returns *p, watch your pointer arithmetic +#define cast_m256i(p) (*((__m256i*)(p))) + +// p = any aligned pointer, i = scaled array index +// returns value p[i] +#define casti_m256i(p,i) (((__m256i*)(p))[(i)]) + +// p = any aligned pointer, o = scaled offset +// returns pointer p+o +#define casto_m256i(p,o) (((__m256i*)(p))+(o)) + +#endif +#if defined(__AVX2__) + // Move integer to low element of vector, other elements are set to zero. #define mm256_mov64_256( i ) _mm256_castsi128_si256( mm128_mov64_128( i ) ) @@ -91,26 +115,6 @@ static inline __m256i mm256_neg1_fn() #define mm128_extr_lo128_256( v ) _mm256_castsi256_si128( v ) #define mm128_extr_hi128_256( v ) _mm256_extracti128_si256( v, 1 ) -// -// Pointer casting - -// p = any aligned pointer -// returns p as pointer to vector type, not very useful -#define castp_m256i(p) ((__m256i*)(p)) - -// p = any aligned pointer -// returns *p, watch your pointer arithmetic -#define cast_m256i(p) (*((__m256i*)(p))) - -// p = any aligned pointer, i = scaled array index -// returns value p[i] -#define casti_m256i(p,i) (((__m256i*)(p))[(i)]) - -// p = any aligned pointer, o = scaled offset -// returns pointer p+o -#define casto_m256i(p,o) (((__m256i*)(p))+(o)) - - // // Memory functions // n = number of 256 bit (32 byte) vectors diff --git a/simd-utils/simd-512.h b/simd-utils/simd-512.h index de948cc4..3cc090a4 100644 --- a/simd-utils/simd-512.h +++ b/simd-utils/simd-512.h @@ -535,7 +535,6 @@ static inline __m512i mm512_shufll_x32( const __m512i v, const int n ) // Rotate 256 bit lanes by one 64 bit element #define mm512_shuflr256_64( v ) _mm512_permutex_epi64( v, 0x39 ) - #define mm512_shufll256_64( v ) _mm512_permutex_epi64( v, 0x93 ) // Rotate 256 bit lanes by one 32 bit element @@ -611,9 +610,6 @@ static inline __m512i mm512_shuflr128_8( const __m512i v, const int c ) // shufl2r is 2 input ... // Drop macros? They can easilly be rebuilt using shufl2 functions -// add shuflr shufll functions performing rotate, returning first arg -// They're faster than doing both, when both not needed. - // Shuffle concatenated { v1, v2 ) right or left by 256 bits and return // rotated v1 // visually confusing for shif2r because of arg order. 
First arg is always From 1a234cbe534925518fed5270d109b5118f102163 Mon Sep 17 00:00:00 2001 From: Jay D Dee Date: Tue, 19 Oct 2021 22:35:36 -0400 Subject: [PATCH 13/20] v3.18.2 --- Makefile.am | 1 + RELEASE_NOTES | 14 +++ algo/scrypt/scrypt-core-4way.c | 26 ++--- algo/scrypt/scrypt.c | 191 ++++++++++++++++++--------------- algo/sha/sha-hash-4way.h | 5 + algo/sha/sha2.c | 24 +++-- algo/sha/sha256-hash-4way.c | 130 ++++++++++++++++++++++ algo/sha/sha256-hash.h | 4 + algo/sha/sha256d-4way.c | 41 +++++-- algo/sha/sha256d-4way.h | 48 +++++++++ algo/sha/sha256t-4way.c | 26 ++--- algo/sha/sph_sha2.c | 30 ++++++ algo/sha/sph_sha2.h | 3 + configure | 20 ++-- configure.ac | 2 +- cpu-miner.c | 61 ++++++----- miner.h | 18 ++-- simd-utils/simd-int.h | 21 ++-- 18 files changed, 475 insertions(+), 190 deletions(-) create mode 100644 algo/sha/sha256d-4way.h diff --git a/Makefile.am b/Makefile.am index a4163b33..36e208ae 100644 --- a/Makefile.am +++ b/Makefile.am @@ -171,6 +171,7 @@ cpuminer_SOURCES = \ algo/sha/hmac-sha256-hash-4way.c \ algo/sha/sha256d.c \ algo/sha/sha2.c \ + algo/sha/sha256d-4way.c \ algo/sha/sha256t-gate.c \ algo/sha/sha256t-4way.c \ algo/sha/sha256t.c \ diff --git a/RELEASE_NOTES b/RELEASE_NOTES index ef3f912f..b3b48785 100644 --- a/RELEASE_NOTES +++ b/RELEASE_NOTES @@ -65,6 +65,20 @@ If not what makes it happen or not happen? Change Log ---------- +v3.18.2 + +Issue #342, fixed Groestl AES on Windows, broken in v3.18.0. + +AVX512 for sha256d. + +SSE42 and AVX may now be displayed as mining features at startup. +This is hard coded for each algo, and is only implemented for scrypt +at this time as it is the only algo with significant performance differences +with those features. + +Fixed an issue where a high hashrate algo could cause excessive invalid hash +rate log reports when starting up in benchmark mode. 
+ v3.18.1 More speed for scrypt: diff --git a/algo/scrypt/scrypt-core-4way.c b/algo/scrypt/scrypt-core-4way.c index 1039c3fc..23ad4e62 100644 --- a/algo/scrypt/scrypt-core-4way.c +++ b/algo/scrypt/scrypt-core-4way.c @@ -337,42 +337,42 @@ do{ \ XC2 = XOR( XC2, TC ); \ \ TA = ADD32( XA2, XA1 ); \ + XA1 = ROL_1X32( XA1 ); \ TB = ADD32( XB2, XB1 ); \ TC = ADD32( XC2, XC1 ); \ - TA = ROL32( TA, 13 ); \ - XA1 = ROL_1X32( XA1 ); \ XB1 = ROL_1X32( XB1 ); \ - XC1 = ROL_1X32( XC1 ); \ + TA = ROL32( TA, 13 ); \ XA3 = XOR( XA3, TA ); \ + XC1 = ROL_1X32( XC1 ); \ TB = ROL32( TB, 13 ); \ XB3 = XOR( XB3, TB ); \ TC = ROL32( TC, 13 ); \ XC3 = XOR( XC3, TC ); \ \ TA = ADD32( XA3, XA2 ); \ + XA2 = SWAP_64( XA2 ); \ TB = ADD32( XB3, XB2 ); \ TC = ADD32( XC3, XC2 ); \ TA = ROL32( TA, 18 ); \ - XA2 = SWAP_64( XA2 ); \ XB2 = SWAP_64( XB2 ); \ - XC2 = SWAP_64( XC2 ); \ XA0 = XOR( XA0, TA ); \ TB = ROL32( TB, 18 ); \ XB0 = XOR( XB0, TB ); \ + XC2 = SWAP_64( XC2 ); \ TC = ROL32( TC, 18 ); \ XC0 = XOR( XC0, TC ); \ \ TA = ADD32( XA0, XA1 ); \ + XA3 = ROR_1X32( XA3 ); \ TB = ADD32( XB0, XB1 ); \ TC = ADD32( XC0, XC1 ); \ TA = ROL32( TA, 7 ); \ - XA3 = ROR_1X32( XA3 ); \ + XB3 = ROR_1X32( XB3 ); \ XA3 = XOR( XA3, TA ); \ TB = ROL32( TB, 7 ); \ - XB3 = ROR_1X32( XB3 ); \ + XC3 = ROR_1X32( XC3 ); \ XB3 = XOR( XB3, TB ); \ TC = ROL32( TC, 7 ); \ - XC3 = ROR_1X32( XC3 ); \ XC3 = XOR( XC3, TC ); \ \ TA = ADD32( XA3, XA0 ); \ @@ -399,24 +399,24 @@ do{ \ XC1 = XOR( XC1, TC ); \ \ TA = ADD32( XA1, XA2 ); \ + XA2 = SWAP_64( XA2 ); \ TB = ADD32( XB1, XB2 ); \ + XB2 = SWAP_64( XB2 ); \ TA = ROL32( TA, 18); \ TC = ADD32( XC1, XC2 ); \ - XA2 = SWAP_64( XA2 ); \ + XC2 = SWAP_64( XC2 ); \ TB = ROL32( TB, 18); \ XA0 = XOR( XA0, TA ); \ - XB2 = SWAP_64( XB2 ); \ + XA1 = ROR_1X32( XA1 ); \ TC = ROL32( TC, 18); \ XB0 = XOR( XB0, TB ); \ - XC2 = SWAP_64( XC2 ); \ - XA1 = ROR_1X32( XA1 ); \ XB1 = ROR_1X32( XB1 ); \ XC0 = XOR( XC0, TC ); \ XC1 = ROR_1X32( XC1 ); \ } while (0); -// slow rol, an attempt to optimze non-avx512 bit rotations +// slow rot, an attempt to optimze non-avx512 bit rotations // Contains target specific instructions, only for use with 128 bit vectors #define SALSA_2ROUNDS_SIMD128_3BUF_SLOROT \ do{ \ diff --git a/algo/scrypt/scrypt.c b/algo/scrypt/scrypt.c index e919ccb3..95639691 100644 --- a/algo/scrypt/scrypt.c +++ b/algo/scrypt/scrypt.c @@ -28,7 +28,6 @@ */ #include "algo-gate-api.h" - #include #include #include @@ -55,11 +54,25 @@ static const uint32_t sha256_initial_state[8] = 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19 }; -static int scrypt_throughput = 0; +#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) + +#define SCRYPT_THROUGHPUT 16 + +#elif defined(__AVX2__) + +#define SCRYPT_THROUGHPUT 8 + +#else + +#define SCRYPT_THROUGHPUT 4 + +#endif + +// static int scrypt_throughput = 0; static int scratchbuf_size = 0; -static __thread char *scratchbuf = NULL; +static __thread uint32_t *scratchbuf = NULL; // change this to a constant to be used directly as input state arg // vectors still need an init function. 
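Making the lane count a compile-time SCRYPT_THROUGHPUT constant, instead of the old runtime scrypt_throughput variable, lets the data and hash arrays in scanhash_scrypt be sized statically and the per-lane loops be resolved at build time. A rough sketch of the pattern, using hypothetical names (LANES, fill_lanes) rather than the miner's own:

#include <stdint.h>
#include <string.h>

// Widest available vector extension decides the lane count at compile time.
#if defined(__AVX512F__)
  #define LANES 16
#elif defined(__AVX2__)
  #define LANES 8
#else
  #define LANES 4
#endif

// One 80-byte block header per lane; the nonce is word 19 of each copy.
static void fill_lanes( uint32_t data[ LANES * 20 ], const uint32_t *pdata,
                        uint32_t first_nonce )
{
   for ( int i = 0; i < LANES; i++ )
   {
      memcpy( data + i*20, pdata, 80 );
      data[ i*20 + 19 ] = first_nonce + i;
   }
}

With the count fixed at compile time, an unsupported configuration can also be rejected at build time, which is what the #error "Invalid SCRYPT_THROUGHPUT" branch in scanhash_scrypt does.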
@@ -709,15 +722,11 @@ static inline void PBKDF2_SHA256_128_32_16way( uint32_t *tstate, #endif // AVX512 -//#if defined(USE_ASM) && defined(__x86_64__) - #define SCRYPT_MAX_WAYS 12 #define HAVE_SCRYPT_3WAY 1 -//int scrypt_best_throughput(); void scrypt_core(uint32_t *X, uint32_t *V, int N); void scrypt_core_3way(uint32_t *X, uint32_t *V, int N); -//#if defined(USE_AVX2) #if defined(__AVX2__) #undef SCRYPT_MAX_WAYS #define SCRYPT_MAX_WAYS 24 @@ -727,40 +736,39 @@ void scrypt_core_6way(uint32_t *X, uint32_t *V, int N); #ifndef SCRYPT_MAX_WAYS #define SCRYPT_MAX_WAYS 1 -//#define scrypt_best_throughput() 1 #endif #include "scrypt-core-4way.h" -static bool scrypt_N_1_1_256(const uint32_t *input, uint32_t *output, - uint32_t *midstate, unsigned char *scratchpad, int N, int thr_id ) +/* +static bool scrypt_N_1_1_256( const uint32_t *input, uint32_t *output, + uint32_t *midstate, int N, int thr_id ) { uint32_t tstate[8], ostate[8]; uint32_t X[32]; - uint32_t *V = (uint32_t*)scratchpad; memcpy(tstate, midstate, 32); HMAC_SHA256_80_init(input, tstate, ostate); PBKDF2_SHA256_80_128(tstate, ostate, input, X); - scrypt_core_simd128( X, V, N ); // woring + scrypt_core_simd128( X, scratchbuf, N ); // woring // scrypt_core_1way( X, V, N ); // working // scrypt_core(X, V, N); PBKDF2_SHA256_128_32(tstate, ostate, X, output); return true; } +*/ -#if defined(__AVX2__) +#if ( SCRYPT_THROUGHPUT == 8 ) static int scrypt_N_1_1_256_8way( const uint32_t *input, uint32_t *output, - uint32_t *midstate, unsigned char *scratchpad, int N, int thrid ) + uint32_t *midstate, int N, int thrid ) { uint32_t _ALIGN(128) tstate[ 8*8 ]; uint32_t _ALIGN(128) ostate[ 8*8 ]; uint32_t _ALIGN(128) W[ 8*32 ]; uint32_t _ALIGN(128) X[ 8*32 ]; - uint32_t *V = (uint32_t*)scratchpad; intrlv_8x32( W, input, input+ 20, input+ 40, input+ 60, input+80, input+100, input+120, input+140, 640 ); @@ -774,11 +782,11 @@ static int scrypt_N_1_1_256_8way( const uint32_t *input, uint32_t *output, if ( opt_param_n > 0x4000 ) { - scrypt_core_simd128_3buf( X, V, N ); + scrypt_core_simd128_3buf( X, scratchbuf, N ); if ( work_restart[thrid].restart ) return 0; - scrypt_core_simd128_3buf( X+ 96, V, N ); + scrypt_core_simd128_3buf( X+ 96, scratchbuf, N ); if ( work_restart[thrid].restart ) return 0; - scrypt_core_simd128_2buf( X+192, V, N ); + scrypt_core_simd128_2buf( X+192, scratchbuf, N ); } else { @@ -786,13 +794,13 @@ static int scrypt_N_1_1_256_8way( const uint32_t *input, uint32_t *output, intrlv_2x128( W+ 64, X+ 64, X+ 96, 1024 ); intrlv_2x128( W+128, X+128, X+160, 1024 ); intrlv_2x128( W+192, X+192, X+224, 1024 ); - scrypt_core_2way_simd128( (__m256i*) W, (__m256i*)V, N ); + scrypt_core_2way_simd128( (__m256i*) W, (__m256i*)scratchbuf, N ); if ( work_restart[thrid].restart ) return 0; - scrypt_core_2way_simd128( (__m256i*)(W+ 64), (__m256i*)V, N ); + scrypt_core_2way_simd128( (__m256i*)(W+ 64), (__m256i*)scratchbuf, N ); if ( work_restart[thrid].restart ) return 0; - scrypt_core_2way_simd128( (__m256i*)(W+128), (__m256i*)V, N ); + scrypt_core_2way_simd128( (__m256i*)(W+128), (__m256i*)scratchbuf, N ); if ( work_restart[thrid].restart ) return 0; - scrypt_core_2way_simd128( (__m256i*)(W+192), (__m256i*)V, N ); + scrypt_core_2way_simd128( (__m256i*)(W+192), (__m256i*)scratchbuf, N ); dintrlv_2x128( X, X+ 32, W, 1024 ); dintrlv_2x128( X+ 64, X+ 96, W+ 64, 1024 ); dintrlv_2x128( X+128, X+160, W+128, 1024 ); @@ -928,16 +936,15 @@ static int scrypt_N_1_1_256_8way( const uint32_t *input, uint32_t *output, #endif // AVX2 -#if defined(__AVX512F__) && 
defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if ( SCRYPT_THROUGHPUT == 16 ) static int scrypt_N_1_1_256_16way( const uint32_t *input, uint32_t *output, - uint32_t *midstate, unsigned char *scratchpad, int N, int thrid ) + uint32_t *midstate, int N, int thrid ) { uint32_t _ALIGN(128) tstate[ 16*8 ]; uint32_t _ALIGN(128) ostate[ 16*8 ]; uint32_t _ALIGN(128) W[ 16*32 ]; uint32_t _ALIGN(128) X[ 16*32 ]; - uint32_t *V = (uint32_t*)scratchpad; intrlv_16x32( W, input, input+ 20, input+ 40, input+ 60, input+ 80, input+100, input+120, input+140, @@ -956,17 +963,17 @@ static int scrypt_N_1_1_256_16way( const uint32_t *input, uint32_t *output, if ( opt_param_n > 0x4000 ) { - scrypt_core_simd128_3buf( X, V, N ); + scrypt_core_simd128_3buf( X, scratchbuf, N ); if ( work_restart[thrid].restart ) return 0; - scrypt_core_simd128_3buf( X+ 96, V, N ); + scrypt_core_simd128_3buf( X+ 96, scratchbuf, N ); if ( work_restart[thrid].restart ) return 0; - scrypt_core_simd128_2buf( X+192, V, N ); + scrypt_core_simd128_2buf( X+192, scratchbuf, N ); if ( work_restart[thrid].restart ) return 0; - scrypt_core_simd128_3buf( X+256, V, N ); + scrypt_core_simd128_3buf( X+256, scratchbuf, N ); if ( work_restart[thrid].restart ) return 0; - scrypt_core_simd128_3buf( X+352, V, N ); + scrypt_core_simd128_3buf( X+352, scratchbuf, N ); if ( work_restart[thrid].restart ) return 0; - scrypt_core_simd128_2buf( X+448, V, N ); + scrypt_core_simd128_2buf( X+448, scratchbuf, N ); } else { @@ -974,13 +981,13 @@ static int scrypt_N_1_1_256_16way( const uint32_t *input, uint32_t *output, intrlv_4x128( W+128, X+128, X+160, X+192, X+224, 1024 ); intrlv_4x128( W+256, X+256, X+288, X+320, X+352, 1024 ); intrlv_4x128( W+384, X+384, X+416, X+448, X+480, 1024 ); - scrypt_core_4way_simd128( (__m512i*) W, (__m512i*)V, N ); + scrypt_core_4way_simd128( (__m512i*) W, (__m512i*)scratchbuf, N ); if ( work_restart[thrid].restart ) return 0; - scrypt_core_4way_simd128( (__m512i*)(W+128), (__m512i*)V, N ); + scrypt_core_4way_simd128( (__m512i*)(W+128), (__m512i*)scratchbuf, N ); if ( work_restart[thrid].restart ) return 0; - scrypt_core_4way_simd128( (__m512i*)(W+256), (__m512i*)V, N ); + scrypt_core_4way_simd128( (__m512i*)(W+256), (__m512i*)scratchbuf, N ); if ( work_restart[thrid].restart ) return 0; - scrypt_core_4way_simd128( (__m512i*)(W+384), (__m512i*)V, N ); + scrypt_core_4way_simd128( (__m512i*)(W+384), (__m512i*)scratchbuf, N ); dintrlv_4x128( X, X+ 32, X+ 64, X+ 96, W, 1024 ); dintrlv_4x128( X+128, X+160, X+192, X+224, W+128, 1024 ); dintrlv_4x128( X+256, X+288, X+320, X+352, W+256, 1024 ); @@ -1236,15 +1243,13 @@ static int scrypt_N_1_1_256_16way( const uint32_t *input, uint32_t *output, #endif // AVX512 -#if defined(__SHA__) - +#if 0 static int scrypt_N_1_1_256_sha_2buf( const uint32_t *input, uint32_t *output, - uint32_t *midstate, unsigned char *scratchpad, int N, int thrid ) + uint32_t *midstate, int N, int thrid ) { uint32_t _ALIGN(128) tstate[ 2*8 ]; uint32_t _ALIGN(128) ostate[ 2*8 ]; uint32_t _ALIGN(128) W[ 2*32 ]; - uint32_t *V = (uint32_t*)scratchpad; memcpy( tstate, midstate, 32 ); memcpy( tstate+ 8, midstate, 32 ); @@ -1254,7 +1259,7 @@ static int scrypt_N_1_1_256_sha_2buf( const uint32_t *input, uint32_t *output, PBKDF2_SHA256_80_128_SHA_2BUF( tstate, tstate+8, ostate, ostate+8, input, input+20, W, W+32 ); - scrypt_core_simd128_2buf( W, V, N ); + scrypt_core_simd128_2buf( W, scratchbuf, N ); if ( work_restart[thrid].restart ) return 0; PBKDF2_SHA256_128_32_SHA_2BUF( tstate, tstate+8, ostate, ostate+8, 
W, W+32, @@ -1264,12 +1269,11 @@ static int scrypt_N_1_1_256_sha_2buf( const uint32_t *input, uint32_t *output, } static int scrypt_N_1_1_256_4way_sha( const uint32_t *input, uint32_t *output, - uint32_t *midstate, unsigned char *scratchpad, int N, int thrid ) + uint32_t *midstate, int N, int thrid ) { uint32_t _ALIGN(128) tstate[4 * 8]; uint32_t _ALIGN(128) ostate[4 * 8]; uint32_t _ALIGN(128) W[4 * 32]; - uint32_t *V = (uint32_t*)scratchpad; memcpy( tstate, midstate, 32 ); memcpy( tstate+ 8, midstate, 32 ); @@ -1300,9 +1304,9 @@ static int scrypt_N_1_1_256_4way_sha( const uint32_t *input, uint32_t *output, */ // working, double buffered linear simd - scrypt_core_simd128_2buf( W, V, N ); + scrypt_core_simd128_2buf( W, scratchbuf, N ); if ( work_restart[thrid].restart ) return 0; - scrypt_core_simd128_2buf( W+64, V, N ); + scrypt_core_simd128_2buf( W+64, scratchbuf, N ); /* scrypt_core_simd128_3buf( W, V, N ); @@ -1323,17 +1327,15 @@ static int scrypt_N_1_1_256_4way_sha( const uint32_t *input, uint32_t *output, return 1; } +#endif -#else - -#ifdef HAVE_SHA256_4WAY +#if ( SCRYPT_THROUGHPUT == 4 ) static int scrypt_N_1_1_256_4way( const uint32_t *input, uint32_t *output, - uint32_t *midstate, unsigned char *scratchpad, int N, int thrid ) + uint32_t *midstate, int N, int thrid ) { uint32_t _ALIGN(128) tstate[ 4*8 ]; uint32_t _ALIGN(128) ostate[ 4*8 ]; uint32_t _ALIGN(128) W[ 4*32 ]; - uint32_t *V = (uint32_t*)scratchpad; intrlv_4x32( W, input, input+20, input+40, input+60, 640 ); for ( int i = 0; i < 8; i++ ) @@ -1346,13 +1348,13 @@ static int scrypt_N_1_1_256_4way( const uint32_t *input, uint32_t *output, { uint32_t _ALIGN(128) X[ 4*32 ]; dintrlv_4x32( X, X+32, X+64, X+96, W, 1024 ); - scrypt_core_simd128_2buf( X, V, N ); + scrypt_core_simd128_2buf( X, scratchbuf, N ); if ( work_restart[thrid].restart ) return 0; - scrypt_core_simd128_2buf( X+64, V, N ); + scrypt_core_simd128_2buf( X+64, scratchbuf, N ); intrlv_4x32( W, X, X+32, X+64, X+96, 1024 ); } else - scrypt_core_4way( (__m128i*)W, (__m128i*)V, N ); + scrypt_core_4way( (__m128i*)W, (__m128i*)scratchbuf, N ); @@ -1398,65 +1400,73 @@ static int scrypt_N_1_1_256_4way( const uint32_t *input, uint32_t *output, return 1; } -#endif /* HAVE_SHA256_4WAY */ +#endif // SCRYPT_THROUGHPUT == 4 -#endif // SHA +//#endif // SHA extern int scanhash_scrypt( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ) { + uint32_t _ALIGN(64) hash[ 8*SCRYPT_THROUGHPUT ]; + uint32_t _ALIGN(64) data[ 20*SCRYPT_THROUGHPUT ]; uint32_t *pdata = work->data; uint32_t *ptarget = work->target; - uint32_t data[SCRYPT_MAX_WAYS * 20], hash[SCRYPT_MAX_WAYS * 8]; uint32_t midstate[8]; uint32_t n = pdata[19] - 1; int thr_id = mythr->id; - int throughput = scrypt_throughput; int i; volatile uint8_t *restart = &(work_restart[thr_id].restart); - for ( i = 0; i < throughput; i++ ) + for ( i = 0; i < SCRYPT_THROUGHPUT; i++ ) memcpy( data + i * 20, pdata, 80 ); sha256_transform_le( midstate, data, sha256_initial_state ); do { bool rc = true; - for ( i = 0; i < throughput; i++ ) data[ i*20 + 19 ] = ++n; - -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) - if ( throughput == 16 ) - rc = scrypt_N_1_1_256_16way( data, hash, midstate, scratchbuf, - opt_param_n, thr_id ); - else -#endif -#if defined(__AVX2__) - if ( throughput == 8 ) - rc = scrypt_N_1_1_256_8way( data, hash, midstate, scratchbuf, - opt_param_n, thr_id ); - else -#endif - if ( throughput == 4 ) // slower on Ryzen than 8way -#if 
defined(__SHA__) - rc = scrypt_N_1_1_256_4way_sha( data, hash, midstate, scratchbuf, - opt_param_n, thr_id ); + for ( i = 0; i < SCRYPT_THROUGHPUT; i++ ) data[ i*20 + 19 ] = ++n; + +//#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if ( SCRYPT_THROUGHPUT == 16 ) +// if ( SCRYPT_THROUGHPUT == 16 ) + rc = scrypt_N_1_1_256_16way( data, hash, midstate, opt_param_n, + thr_id ); +// else +//#endif +//#if defined(__AVX2__) +#elif ( SCRYPT_THROUGHPUT == 8 ) +// if ( SCRYPT_THROUGHPUT == 8 ) + rc = scrypt_N_1_1_256_8way( data, hash, midstate, opt_param_n, + thr_id ); +// else +//#endif +#elif ( SCRYPT_THROUGHPUT == 4 ) +// if ( SCRYPT_THROUGHPUT == 4 ) // slower on Ryzen than 8way +//#if defined(__SHA__) +// rc = scrypt_N_1_1_256_4way_sha( data, hash, midstate, opt_param_n, +// thr_id ); +//#else + rc = scrypt_N_1_1_256_4way( data, hash, midstate, opt_param_n, + thr_id ); #else - rc = scrypt_N_1_1_256_4way( data, hash, midstate, scratchbuf, - opt_param_n, thr_id ); + +#error "Invalid SCRYPT_THROUGHPUT" + #endif +/* #if defined(__SHA__) else - if (throughput == 2 ) // slower on Ryzen than 4way_sha & 8way - rc = scrypt_N_1_1_256_sha_2buf( data, hash, midstate, scratchbuf, - opt_param_n, thr_id ); + if ( SCRYPT_THROUGHPUT == 2 ) // slower on Ryzen than 4way_sha & 8way + rc = scrypt_N_1_1_256_sha_2buf( data, hash, midstate, opt_param_n, + thr_id ); #endif else // should never get here - rc = scrypt_N_1_1_256( data, hash, midstate, scratchbuf, - opt_param_n, thr_id ); + rc = scrypt_N_1_1_256( data, hash, midstate, opt_param_n, thr_id ); +*/ // test the hash if ( rc ) - for ( i = 0; i < throughput; i++ ) + for ( i = 0; i < SCRYPT_THROUGHPUT; i++ ) { if ( unlikely( valid_hash( hash + i*8, ptarget ) && !opt_benchmark ) ) { @@ -1468,7 +1478,7 @@ extern int scanhash_scrypt( struct work *work, uint32_t max_nonce, } - } while ( likely( ( n < ( max_nonce - throughput ) ) && !(*restart) ) ); + } while ( likely( ( n < ( max_nonce - SCRYPT_THROUGHPUT ) ) && !(*restart) ) ); *hashes_done = n - pdata[19]; pdata[19] = n; @@ -1489,7 +1499,7 @@ bool register_scrypt_algo( algo_gate_t* gate ) //#if defined(__SHA__) // gate->optimizations = SSE2_OPT | SHA_OPT; //#else - gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT; + gate->optimizations = SSE2_OPT | SSE42_OPT | AVX_OPT | AVX2_OPT | AVX512_OPT; //#endif gate->miner_thread_init =(void*)&scrypt_miner_thread_init; gate->scanhash = (void*)&scanhash_scrypt; @@ -1497,8 +1507,11 @@ bool register_scrypt_algo( algo_gate_t* gate ) opt_param_n = opt_param_n ? opt_param_n : 1024; applog( LOG_INFO,"Scrypt paramaters: N= %d, R= 1", opt_param_n ); +// scrypt_throughput can be defined at compile time and used to replace +// MAX_WAYS to reduce memory usage. 
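As a rough guide to the sizes set in the #if ladder that follows: the scrypt V array needs N entries of 128 bytes per buffer, and two or three buffers are processed together depending on opt_param_n and the vector width, so the per-thread scratch works out to roughly N * 128 * nbufs bytes. A small sketch with a hypothetical helper (not in the patch):

#include <stddef.h>

// Hypothetical helper: per-thread scratch for nbufs interleaved scrypt
// buffers, each holding N 128-byte entries.
static size_t scrypt_scratch_bytes( int n, int nbufs )
{
   return (size_t)n * 128 * nbufs;
}

// e.g. N = 1024 with 3 buffers -> 393216 bytes (384 KiB) per thread.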
+ #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) - scrypt_throughput = 16; +// scrypt_throughput = 16; if ( opt_param_n > 0x4000 ) scratchbuf_size = opt_param_n * 3 * 128; // 3 buf else @@ -1511,13 +1524,13 @@ bool register_scrypt_algo( algo_gate_t* gate ) */ #elif defined(__AVX2__) - scrypt_throughput = 8; +// scrypt_throughput = 8; if ( opt_param_n > 0x4000 ) scratchbuf_size = opt_param_n * 3 * 128; // 3 buf else scratchbuf_size = opt_param_n * 2 * 128; // 2 way #else - scrypt_throughput = 4; +// scrypt_throughput = 4; if ( opt_param_n > 0x4000 ) scratchbuf_size = opt_param_n * 2 * 128; // 2 buf else @@ -1533,7 +1546,7 @@ bool register_scrypt_algo( algo_gate_t* gate ) format_number_si( &d_size, d_units ); applog( LOG_INFO,"Throughput %d/thr, Buffer %.0f %siB/thr, Total %.0f %siB\n", - scrypt_throughput, t_size, t_units, d_size, d_units ); + SCRYPT_THROUGHPUT, t_size, t_units, d_size, d_units ); return true; }; diff --git a/algo/sha/sha-hash-4way.h b/algo/sha/sha-hash-4way.h index de3f1d43..63a8f927 100644 --- a/algo/sha/sha-hash-4way.h +++ b/algo/sha/sha-hash-4way.h @@ -84,6 +84,11 @@ void sha256_8way_transform_le( __m256i *state_out, const __m256i *data, void sha256_8way_transform_be( __m256i *state_out, const __m256i *data, const __m256i *state_in ); +void sha256_8way_prehash_3rounds( __m256i *state_mid, const __m256i *W, + const __m256i *state_in ); +void sha256_8way_final_rounds( __m256i *state_out, const __m256i *data, + const __m256i *state_in, const __m256i *state_mid ); + #endif // AVX2 #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) diff --git a/algo/sha/sha2.c b/algo/sha/sha2.c index 2a229bf6..63651c3d 100644 --- a/algo/sha/sha2.c +++ b/algo/sha/sha2.c @@ -8,7 +8,7 @@ * any later version. See COPYING for more details. */ -#include "algo-gate-api.h" +#include "sha256d-4way.h" #include #include @@ -181,6 +181,8 @@ static const uint32_t sha256d_hash1[16] = { }; // this performs the entire hash all over again, why? +// because main function only does 56 rounds. 
+ static void sha256d_80_swap(uint32_t *hash, const uint32_t *data) { uint32_t S[16]; @@ -492,7 +494,7 @@ static inline void sha256d_ms(uint32_t *hash, uint32_t *W, void sha256d_ms_4way(uint32_t *hash, uint32_t *data, const uint32_t *midstate, const uint32_t *prehash); -static inline int scanhash_sha256d_4way( struct work *work, +static inline int scanhash_sha256d_4way_pooler( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ) { uint32_t *pdata = work->data; @@ -553,7 +555,7 @@ static inline int scanhash_sha256d_4way( struct work *work, void sha256d_ms_8way(uint32_t *hash, uint32_t *data, const uint32_t *midstate, const uint32_t *prehash); -static inline int scanhash_sha256d_8way( struct work *work, +static inline int scanhash_sha256d_8way_pooler( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ) { uint32_t *pdata = work->data; @@ -609,7 +611,7 @@ static inline int scanhash_sha256d_8way( struct work *work, #endif /* HAVE_SHA256_8WAY */ -int scanhash_sha256d( struct work *work, +int scanhash_sha256d_pooler( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ) { uint32_t *pdata = work->data; @@ -625,11 +627,11 @@ int scanhash_sha256d( struct work *work, #ifdef HAVE_SHA256_8WAY if (sha256_use_8way()) - return scanhash_sha256d_8way( work, max_nonce, hashes_done, mythr ); + return scanhash_sha256d_8way_pooler( work, max_nonce, hashes_done, mythr ); #endif #ifdef HAVE_SHA256_4WAY if (sha256_use_4way()) - return scanhash_sha256d_4way( work, max_nonce, hashes_done, mythr ); + return scanhash_sha256d_4way_pooler( work, max_nonce, hashes_done, mythr ); #endif memcpy(data, pdata + 16, 64); @@ -690,9 +692,13 @@ int scanhash_SHA256d( struct work *work, const uint32_t max_nonce, bool register_sha256d_algo( algo_gate_t* gate ) { - gate->optimizations = SSE2_OPT | AVX2_OPT; - gate->scanhash = (void*)&scanhash_sha256d; -// gate->hash = (void*)&sha256d; + gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT; +#if defined(SHA256D_16WAY) + gate->scanhash = (void*)&scanhash_sha256d_16way; +#else + gate->scanhash = (void*)&scanhash_sha256d_pooler; +#endif + // gate->hash = (void*)&sha256d; return true; }; diff --git a/algo/sha/sha256-hash-4way.c b/algo/sha/sha256-hash-4way.c index 1c630cc8..b520746e 100644 --- a/algo/sha/sha256-hash-4way.c +++ b/algo/sha/sha256-hash-4way.c @@ -548,6 +548,136 @@ void sha256_8way_init( sha256_8way_context *sc ) sc->val[7] = m256_const1_64( 0x5BE0CD195BE0CD19 ); } +// Aggresive prehashing, LE byte order +void sha256_8way_prehash_3rounds( __m256i *state_mid, const __m256i *W, + const __m256i *state_in ) +{ + __m256i A, B, C, D, E, F, G, H; + + A = _mm256_load_si256( state_in ); + B = _mm256_load_si256( state_in + 1 ); + C = _mm256_load_si256( state_in + 2 ); + D = _mm256_load_si256( state_in + 3 ); + E = _mm256_load_si256( state_in + 4 ); + F = _mm256_load_si256( state_in + 5 ); + G = _mm256_load_si256( state_in + 6 ); + H = _mm256_load_si256( state_in + 7 ); + +#if !defined(__AVX512VL__) + __m256i X_xor_Y, Y_xor_Z = _mm256_xor_si256( B, C ); +#endif + + SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H, 0, 0 ); + SHA2s_8WAY_STEP( H, A, B, C, D, E, F, G, 1, 0 ); + SHA2s_8WAY_STEP( G, H, A, B, C, D, E, F, 2, 0 ); + + _mm256_store_si256( state_mid , A ); + _mm256_store_si256( state_mid + 1, B ); + _mm256_store_si256( state_mid + 2, C ); + _mm256_store_si256( state_mid + 3, D ); + _mm256_store_si256( state_mid + 4, E ); + _mm256_store_si256( state_mid + 5, F ); + _mm256_store_si256( 
state_mid + 6, G ); + _mm256_store_si256( state_mid + 7, H ); +} + +void sha256_8way_final_rounds( __m256i *state_out, const __m256i *data, + const __m256i *state_in, const __m256i *state_mid ) +{ + __m256i A, B, C, D, E, F, G, H; + __m256i W[16]; + + memcpy_256( W, data, 16 ); + + A = _mm256_load_si256( state_mid ); + B = _mm256_load_si256( state_mid + 1 ); + C = _mm256_load_si256( state_mid + 2 ); + D = _mm256_load_si256( state_mid + 3 ); + E = _mm256_load_si256( state_mid + 4 ); + F = _mm256_load_si256( state_mid + 5 ); + G = _mm256_load_si256( state_mid + 6 ); + H = _mm256_load_si256( state_mid + 7 ); + +// SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H, 0, 0 ); +// SHA2s_8WAY_STEP( H, A, B, C, D, E, F, G, 1, 0 ); +// SHA2s_8WAY_STEP( G, H, A, B, C, D, E, F, 2, 0 ); + +#if !defined(__AVX512VL__) + __m256i X_xor_Y, Y_xor_Z = _mm256_xor_si256( G, H ); +#endif + + SHA2s_8WAY_STEP( F, G, H, A, B, C, D, E, 3, 0 ); + SHA2s_8WAY_STEP( E, F, G, H, A, B, C, D, 4, 0 ); + SHA2s_8WAY_STEP( D, E, F, G, H, A, B, C, 5, 0 ); + SHA2s_8WAY_STEP( C, D, E, F, G, H, A, B, 6, 0 ); + SHA2s_8WAY_STEP( B, C, D, E, F, G, H, A, 7, 0 ); + SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H, 8, 0 ); + SHA2s_8WAY_STEP( H, A, B, C, D, E, F, G, 9, 0 ); + SHA2s_8WAY_STEP( G, H, A, B, C, D, E, F, 10, 0 ); + SHA2s_8WAY_STEP( F, G, H, A, B, C, D, E, 11, 0 ); + SHA2s_8WAY_STEP( E, F, G, H, A, B, C, D, 12, 0 ); + SHA2s_8WAY_STEP( D, E, F, G, H, A, B, C, 13, 0 ); + SHA2s_8WAY_STEP( C, D, E, F, G, H, A, B, 14, 0 ); + SHA2s_8WAY_STEP( B, C, D, E, F, G, H, A, 15, 0 ); + + for ( int j = 16; j < 64; j += 16 ) + { + W[ 0] = SHA2x_MEXP( 14, 9, 1, 0 ); + W[ 1] = SHA2x_MEXP( 15, 10, 2, 1 ); + W[ 2] = SHA2x_MEXP( 0, 11, 3, 2 ); + W[ 3] = SHA2x_MEXP( 1, 12, 4, 3 ); + W[ 4] = SHA2x_MEXP( 2, 13, 5, 4 ); + W[ 5] = SHA2x_MEXP( 3, 14, 6, 5 ); + W[ 6] = SHA2x_MEXP( 4, 15, 7, 6 ); + W[ 7] = SHA2x_MEXP( 5, 0, 8, 7 ); + W[ 8] = SHA2x_MEXP( 6, 1, 9, 8 ); + W[ 9] = SHA2x_MEXP( 7, 2, 10, 9 ); + W[10] = SHA2x_MEXP( 8, 3, 11, 10 ); + W[11] = SHA2x_MEXP( 9, 4, 12, 11 ); + W[12] = SHA2x_MEXP( 10, 5, 13, 12 ); + W[13] = SHA2x_MEXP( 11, 6, 14, 13 ); + W[14] = SHA2x_MEXP( 12, 7, 15, 14 ); + W[15] = SHA2x_MEXP( 13, 8, 0, 15 ); + + SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H, 0, j ); + SHA2s_8WAY_STEP( H, A, B, C, D, E, F, G, 1, j ); + SHA2s_8WAY_STEP( G, H, A, B, C, D, E, F, 2, j ); + SHA2s_8WAY_STEP( F, G, H, A, B, C, D, E, 3, j ); + SHA2s_8WAY_STEP( E, F, G, H, A, B, C, D, 4, j ); + SHA2s_8WAY_STEP( D, E, F, G, H, A, B, C, 5, j ); + SHA2s_8WAY_STEP( C, D, E, F, G, H, A, B, 6, j ); + SHA2s_8WAY_STEP( B, C, D, E, F, G, H, A, 7, j ); + SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H, 8, j ); + SHA2s_8WAY_STEP( H, A, B, C, D, E, F, G, 9, j ); + SHA2s_8WAY_STEP( G, H, A, B, C, D, E, F, 10, j ); + SHA2s_8WAY_STEP( F, G, H, A, B, C, D, E, 11, j ); + SHA2s_8WAY_STEP( E, F, G, H, A, B, C, D, 12, j ); + SHA2s_8WAY_STEP( D, E, F, G, H, A, B, C, 13, j ); + SHA2s_8WAY_STEP( C, D, E, F, G, H, A, B, 14, j ); + SHA2s_8WAY_STEP( B, C, D, E, F, G, H, A, 15, j ); + } + + A = _mm256_add_epi32( A, _mm256_load_si256( state_in ) ); + B = _mm256_add_epi32( B, _mm256_load_si256( state_in + 1 ) ); + C = _mm256_add_epi32( C, _mm256_load_si256( state_in + 2 ) ); + D = _mm256_add_epi32( D, _mm256_load_si256( state_in + 3 ) ); + E = _mm256_add_epi32( E, _mm256_load_si256( state_in + 4 ) ); + F = _mm256_add_epi32( F, _mm256_load_si256( state_in + 5 ) ); + G = _mm256_add_epi32( G, _mm256_load_si256( state_in + 6 ) ); + H = _mm256_add_epi32( H, _mm256_load_si256( state_in + 7 ) ); + + _mm256_store_si256( state_out 
, A ); + _mm256_store_si256( state_out + 1, B ); + _mm256_store_si256( state_out + 2, C ); + _mm256_store_si256( state_out + 3, D ); + _mm256_store_si256( state_out + 4, E ); + _mm256_store_si256( state_out + 5, F ); + _mm256_store_si256( state_out + 6, G ); + _mm256_store_si256( state_out + 7, H ); +} + + + // need to handle odd byte length for yespower. // Assume only last update is odd. diff --git a/algo/sha/sha256-hash.h b/algo/sha/sha256-hash.h index c6d61d8f..410ca90f 100644 --- a/algo/sha/sha256-hash.h +++ b/algo/sha/sha256-hash.h @@ -53,4 +53,8 @@ void sha256_ni2way_transform_be( uint32_t *out_X, uint32_t*out_Y, #define sha256_transform_be sph_sha256_transform_be #endif + +// SHA can't do only 3 rounds +#define sha256_prehash_3rounds sph_sha256_prehash_3rounds + #endif diff --git a/algo/sha/sha256d-4way.c b/algo/sha/sha256d-4way.c index fd3ae2f1..c69ad582 100644 --- a/algo/sha/sha256d-4way.c +++ b/algo/sha/sha256d-4way.c @@ -1,4 +1,4 @@ -#include "sha256t-gate.h" +#include "sha256d-4way.h" #include #include #include @@ -13,7 +13,7 @@ int scanhash_sha256d_16way( struct work *work, const uint32_t max_nonce, __m512i block[16] __attribute__ ((aligned (64))); __m512i hash32[8] __attribute__ ((aligned (32))); __m512i initstate[8] __attribute__ ((aligned (32))); - __m512i midstate[8] __attribute__ ((aligned (32))); + __m512i midstate1[8] __attribute__ ((aligned (32))); __m512i midstate2[8] __attribute__ ((aligned (32))); uint32_t lane_hash[8] __attribute__ ((aligned (32))); __m512i vdata[20] __attribute__ ((aligned (32))); @@ -46,11 +46,10 @@ int scanhash_sha256d_16way( struct work *work, const uint32_t max_nonce, initstate[6] = m512_const1_64( 0x1F83D9AB1F83D9AB ); initstate[7] = m512_const1_64( 0x5BE0CD195BE0CD19 ); - // hash first 64 byte block of data - sha256_16way_transform_le( midstate, vdata, initstate ); + sha256_16way_transform_le( midstate1, vdata, initstate ); // Do 3 rounds on the first 12 bytes of the next block - sha256_16way_prehash_3rounds( midstate2, vdata + 16, midstate ); + sha256_16way_prehash_3rounds( midstate2, vdata + 16, midstate1 ); do { @@ -59,7 +58,7 @@ int scanhash_sha256d_16way( struct work *work, const uint32_t max_nonce, block[ 4] = last_byte; memset_zero_512( block + 5, 10 ); block[15] = m512_const1_32( 80*8 ); // bit count - sha256_16way_final_rounds( hash32, block, midstate, midstate2 ); + sha256_16way_final_rounds( hash32, block, midstate1, midstate2 ); // 2. 32 byte hash from 1. 
memcpy_512( block, hash32, 8 ); @@ -99,7 +98,8 @@ int scanhash_sha256d_8way( struct work *work, const uint32_t max_nonce, __m256i block[16] __attribute__ ((aligned (64))); __m256i hash32[8] __attribute__ ((aligned (32))); __m256i initstate[8] __attribute__ ((aligned (32))); - __m256i midstate[8] __attribute__ ((aligned (32))); + __m256i midstate1[8] __attribute__ ((aligned (32))); + __m256i midstate2[8] __attribute__ ((aligned (32))); uint32_t lane_hash[8] __attribute__ ((aligned (32))); __m256i vdata[20] __attribute__ ((aligned (32))); uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] ); @@ -116,7 +116,7 @@ int scanhash_sha256d_8way( struct work *work, const uint32_t max_nonce, const __m256i eight = m256_const1_32( 8 ); for ( int i = 0; i < 19; i++ ) - vdata[i] = m256_const1_32( pdata[i] ); + vdata[i] = m256_const1_32( pdata[i] ); *noncev = _mm256_set_epi32( n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+1, n ); @@ -130,8 +130,10 @@ int scanhash_sha256d_8way( struct work *work, const uint32_t max_nonce, initstate[6] = m256_const1_64( 0x1F83D9AB1F83D9AB ); initstate[7] = m256_const1_64( 0x5BE0CD195BE0CD19 ); - // hash first 64 bytes of data - sha256_8way_transform_le( midstate, vdata, initstate ); + sha256_8way_transform_le( midstate1, vdata, initstate ); + + // Do 3 rounds on the first 12 bytes of the next block + sha256_8way_prehash_3rounds( midstate2, vdata + 16, midstate1 ); do { @@ -140,7 +142,7 @@ int scanhash_sha256d_8way( struct work *work, const uint32_t max_nonce, block[ 4] = last_byte; memset_zero_256( block + 5, 10 ); block[15] = m256_const1_32( 80*8 ); // bit count - sha256_8way_transform_le( hash32, block, midstate ); + sha256_8way_final_rounds( hash32, block, midstate1, midstate2 ); // 2. 32 byte hash from 1. memcpy_256( block, hash32, 8 ); @@ -253,3 +255,20 @@ int scanhash_sha256d_4way( struct work *work, const uint32_t max_nonce, #endif +/* +bool register_sha256d_algo( algo_gate_t* gate ) +{ + gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT; +#if defined(SHA256D_16WAY) + gate->scanhash = (void*)&scanhash_sha256d_16way; +#elif defined(SHA256D_8WAY) + gate->scanhash = (void*)&scanhash_sha256d_8way; +#elif defined(SHA256D_4WAY) + gate->scanhash = (void*)&scanhash_sha256d_4way; +#endif + +// gate->hash = (void*)&sha256d; + return true; +}; +*/ + diff --git a/algo/sha/sha256d-4way.h b/algo/sha/sha256d-4way.h new file mode 100644 index 00000000..9051ec4b --- /dev/null +++ b/algo/sha/sha256d-4way.h @@ -0,0 +1,48 @@ +#ifndef __SHA256D_4WAY_H__ +#define __SHA256D_4WAY_H__ 1 + +#include +#include "algo-gate-api.h" + +#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) + #define SHA256D_16WAY 1 +/* +#elif defined(__AVX2__) + #define SHA256D_8WAY 1 +#else + #define SHA256D_4WAY 1 +*/ +#endif + +bool register_sha256d_algo( algo_gate_t* gate ); + +#if defined(SHA256D_16WAY) + +int scanhash_sha256d_16way( struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ); +#endif +/* +#if defined(SHA256D_8WAY) + +int scanhash_sha256d_8way( struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ); +#endif + +#if defined(SHA256D_4WAY) + +int scanhash_sha256d_4way( struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ); +#endif +*/ + +/* +#if defined(__SHA__) + +int scanhash_sha256d( struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ); + +#endif +*/ + +#endif + diff --git a/algo/sha/sha256t-4way.c b/algo/sha/sha256t-4way.c 
index 12cbcde2..9cd3a227 100644 --- a/algo/sha/sha256t-4way.c +++ b/algo/sha/sha256t-4way.c @@ -13,7 +13,7 @@ int scanhash_sha256t_16way( struct work *work, const uint32_t max_nonce, __m512i block[16] __attribute__ ((aligned (64))); __m512i hash32[8] __attribute__ ((aligned (32))); __m512i initstate[8] __attribute__ ((aligned (32))); - __m512i midstate[8] __attribute__ ((aligned (32))); + __m512i midstate1[8] __attribute__ ((aligned (32))); __m512i midstate2[8] __attribute__ ((aligned (32))); uint32_t lane_hash[8] __attribute__ ((aligned (32))); __m512i vdata[20] __attribute__ ((aligned (32))); @@ -31,7 +31,7 @@ int scanhash_sha256t_16way( struct work *work, const uint32_t max_nonce, const __m512i sixteen = m512_const1_32( 16 ); for ( int i = 0; i < 19; i++ ) - vdata[i] = m512_const1_32( pdata[i] ); + vdata[i] = m512_const1_32( pdata[i] ); *noncev = _mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+9, n+8, n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+1, n ); @@ -46,11 +46,10 @@ int scanhash_sha256t_16way( struct work *work, const uint32_t max_nonce, initstate[6] = m512_const1_64( 0x1F83D9AB1F83D9AB ); initstate[7] = m512_const1_64( 0x5BE0CD195BE0CD19 ); - // hash first 64 byte block of data - sha256_16way_transform_le( midstate, vdata, initstate ); - + sha256_16way_transform_le( midstate1, vdata, initstate ); + // Do 3 rounds on the first 12 bytes of the next block - sha256_16way_prehash_3rounds( midstate2, vdata + 16, midstate ); + sha256_16way_prehash_3rounds( midstate2, vdata + 16, midstate1 ); do { @@ -59,7 +58,7 @@ int scanhash_sha256t_16way( struct work *work, const uint32_t max_nonce, block[ 4] = last_byte; memset_zero_512( block + 5, 10 ); block[15] = m512_const1_32( 80*8 ); // bit count - sha256_16way_final_rounds( hash32, block, midstate, midstate2 ); + sha256_16way_final_rounds( hash32, block, midstate1, midstate2 ); // 2. 32 byte hash from 1. memcpy_512( block, hash32, 8 ); @@ -104,7 +103,8 @@ int scanhash_sha256t_8way( struct work *work, const uint32_t max_nonce, __m256i block[16] __attribute__ ((aligned (64))); __m256i hash32[8] __attribute__ ((aligned (32))); __m256i initstate[8] __attribute__ ((aligned (32))); - __m256i midstate[8] __attribute__ ((aligned (32))); + __m256i midstate1[8] __attribute__ ((aligned (32))); + __m256i midstate2[8] __attribute__ ((aligned (32))); uint32_t lane_hash[8] __attribute__ ((aligned (32))); __m256i vdata[20] __attribute__ ((aligned (32))); uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] ); @@ -121,7 +121,7 @@ int scanhash_sha256t_8way( struct work *work, const uint32_t max_nonce, const __m256i eight = m256_const1_32( 8 ); for ( int i = 0; i < 19; i++ ) - vdata[i] = m256_const1_32( pdata[i] ); + vdata[i] = m256_const1_32( pdata[i] ); *noncev = _mm256_set_epi32( n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+1, n ); @@ -135,9 +135,11 @@ int scanhash_sha256t_8way( struct work *work, const uint32_t max_nonce, initstate[6] = m256_const1_64( 0x1F83D9AB1F83D9AB ); initstate[7] = m256_const1_64( 0x5BE0CD195BE0CD19 ); - // hash first 64 bytes of data - sha256_8way_transform_le( midstate, vdata, initstate ); + sha256_8way_transform_le( midstate1, vdata, initstate ); + // Do 3 rounds on the first 12 bytes of the next block + sha256_8way_prehash_3rounds( midstate2, vdata + 16, midstate1 ); + do { // 1. 
final 16 bytes of data, with padding @@ -145,7 +147,7 @@ int scanhash_sha256t_8way( struct work *work, const uint32_t max_nonce, block[ 4] = last_byte; memset_zero_256( block + 5, 10 ); block[15] = m256_const1_32( 80*8 ); // bit count - sha256_8way_transform_le( hash32, block, midstate ); + sha256_8way_final_rounds( hash32, block, midstate1, midstate2 ); // 2. 32 byte hash from 1. memcpy_256( block, hash32, 8 ); diff --git a/algo/sha/sph_sha2.c b/algo/sha/sph_sha2.c index a89fc8d7..cab78589 100644 --- a/algo/sha/sph_sha2.c +++ b/algo/sha/sph_sha2.c @@ -702,6 +702,36 @@ memcpy( state_out, state_in, 32 ); } +void sph_sha256_prehash_3rounds( uint32_t *state_out, const uint32_t *data, + const uint32_t *state_in ) +{ + uint32_t t1, t2, X_xor_Y, Y_xor_Z = state_in[1] ^ state_in[2]; + memcpy( state_out, state_in, 32 ); + + t1 = state_out[7] + BSG2_1( state_out[4] ) + + CH( state_out[4], state_out[5], state_out[6] ) + 0x428A2F98 + data[0]; + t2 = BSG2_0( state_out[0] ) + + MAJ( state_out[0], state_out[1], state_out[2] ); + Y_xor_Z = X_xor_Y; + state_out[3] += t1; + state_out[7] = t1 + t2; + + t1 = state_out[6] + BSG2_1( state_out[3] ) + + CH( state_out[3], state_out[4], state_out[5] ) + 0x71374491 + data[1]; + t2 = BSG2_0( state_out[7] ) + + MAJ( state_out[7], state_out[0], state_out[1] ); + Y_xor_Z = X_xor_Y; + state_out[2] += t1; + state_out[6] = t1 + t2; + + t1 = state_out[5] + BSG2_1( state_out[2] ) + + CH( state_out[2], state_out[3], state_out[4] ) + 0xB5C0FBCF + data[2]; + t2 = BSG2_0( state_out[6] ) + + MAJ( state_out[6], state_out[7], state_out[0] ); + state_out[1] += t1; + state_out[5] = t1 + t2; +} + /* see sph_sha2.h */ void sph_sha224_init(void *cc) diff --git a/algo/sha/sph_sha2.h b/algo/sha/sph_sha2.h index b76c3f4b..ab05423e 100644 --- a/algo/sha/sph_sha2.h +++ b/algo/sha/sph_sha2.h @@ -215,6 +215,9 @@ void sph_sha256_transform_le( uint32_t *state_out, const uint32_t *data, void sph_sha256_transform_be( uint32_t *state_out, const uint32_t *data, const uint32_t *state_in ); +void sph_sha256_prehash_3rounds( uint32_t *state_out, const uint32_t *data, + const uint32_t *state_in ); + #if SPH_64 diff --git a/configure b/configure index ae0d7bec..18825971 100755 --- a/configure +++ b/configure @@ -1,6 +1,6 @@ #! /bin/sh # Guess values for system-dependent variables and create Makefiles. -# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.18.1. +# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.18.2. # # # Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc. @@ -577,8 +577,8 @@ MAKEFLAGS= # Identity of this package. PACKAGE_NAME='cpuminer-opt' PACKAGE_TARNAME='cpuminer-opt' -PACKAGE_VERSION='3.18.1' -PACKAGE_STRING='cpuminer-opt 3.18.1' +PACKAGE_VERSION='3.18.2' +PACKAGE_STRING='cpuminer-opt 3.18.2' PACKAGE_BUGREPORT='' PACKAGE_URL='' @@ -1332,7 +1332,7 @@ if test "$ac_init_help" = "long"; then # Omit some internal or obsolete options to make the list less imposing. # This message is too long to be a string in the A/UX 3.1 sh. cat <<_ACEOF -\`configure' configures cpuminer-opt 3.18.1 to adapt to many kinds of systems. +\`configure' configures cpuminer-opt 3.18.2 to adapt to many kinds of systems. Usage: $0 [OPTION]... [VAR=VALUE]... 
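The sph_sha256_prehash_3rounds() addition above exploits the fact that for
round t < 16 SHA-256 consumes message word W[t] directly: in the second
64-byte block of an 80-byte header, W[0..2] come from pdata[16..18] (merkle
tail, ntime, nbits) while the nonce is pdata[19] = W[3], so the first three
rounds are identical for every nonce and can be computed once per work item.
A minimal scalar sketch of the intended per-job setup, assuming the helpers
declared in sph_sha2.h above and a caller-supplied sha256_iv initial state
(the wrapper name is illustrative, not part of the patch):

#include <stdint.h>
#include "sph_sha2.h"

static void sha256_prehash_sketch( const uint32_t *pdata,
                                   const uint32_t *sha256_iv,
                                   uint32_t midstate1[8],
                                   uint32_t midstate2[8] )
{
   // Block 1 (header bytes 0..63): completely nonce independent.
   sph_sha256_transform_le( midstate1, pdata, sha256_iv );

   // Rounds 0-2 of block 2 read only pdata[16..18]; the nonce (pdata[19])
   // first enters at round 3, so these rounds are hoisted out of the scan
   // loop and midstate2 is reused for every nonce.
   sph_sha256_prehash_3rounds( midstate2, pdata + 16, midstate1 );
}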
@@ -1404,7 +1404,7 @@ fi if test -n "$ac_init_help"; then case $ac_init_help in - short | recursive ) echo "Configuration of cpuminer-opt 3.18.1:";; + short | recursive ) echo "Configuration of cpuminer-opt 3.18.2:";; esac cat <<\_ACEOF @@ -1509,7 +1509,7 @@ fi test -n "$ac_init_help" && exit $ac_status if $ac_init_version; then cat <<\_ACEOF -cpuminer-opt configure 3.18.1 +cpuminer-opt configure 3.18.2 generated by GNU Autoconf 2.69 Copyright (C) 2012 Free Software Foundation, Inc. @@ -2012,7 +2012,7 @@ cat >config.log <<_ACEOF This file contains any messages produced by compilers while running configure, to aid debugging if configure makes a mistake. -It was created by cpuminer-opt $as_me 3.18.1, which was +It was created by cpuminer-opt $as_me 3.18.2, which was generated by GNU Autoconf 2.69. Invocation command line was $ $0 $@ @@ -2993,7 +2993,7 @@ fi # Define the identity of the package. PACKAGE='cpuminer-opt' - VERSION='3.18.1' + VERSION='3.18.2' cat >>confdefs.h <<_ACEOF @@ -6690,7 +6690,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 # report actual input values of CONFIG_FILES etc. instead of their # values after options handling. ac_log=" -This file was extended by cpuminer-opt $as_me 3.18.1, which was +This file was extended by cpuminer-opt $as_me 3.18.2, which was generated by GNU Autoconf 2.69. Invocation command line was CONFIG_FILES = $CONFIG_FILES @@ -6756,7 +6756,7 @@ _ACEOF cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`" ac_cs_version="\\ -cpuminer-opt config.status 3.18.1 +cpuminer-opt config.status 3.18.2 configured by $0, generated by GNU Autoconf 2.69, with options \\"\$ac_cs_config\\" diff --git a/configure.ac b/configure.ac index 869b3669..bc5329c2 100644 --- a/configure.ac +++ b/configure.ac @@ -1,4 +1,4 @@ -AC_INIT([cpuminer-opt], [3.18.1]) +AC_INIT([cpuminer-opt], [3.18.2]) AC_PREREQ([2.59c]) AC_CANONICAL_SYSTEM diff --git a/cpu-miner.c b/cpu-miner.c index 2a63729e..061bbb96 100644 --- a/cpu-miner.c +++ b/cpu-miner.c @@ -1112,19 +1112,17 @@ void report_summary_log( bool force ) applog( LOG_BLUE, "%s: %s", algo_names[ opt_algo ], short_url ); applog2( LOG_NOTICE, "Periodic Report %s %s", et_str, upt_str ); applog2( LOG_INFO, "Share rate %.2f/min %.2f/min", - submit_rate, (double)submitted_share_count*60. / - ( (double)uptime.tv_sec + (double)uptime.tv_usec / 1e6 ) ); + submit_rate, safe_div( (double)submitted_share_count*60., + ( (double)uptime.tv_sec + (double)uptime.tv_usec / 1e6 ), 0. ) ); applog2( LOG_INFO, "Hash rate %7.2f%sh/s %7.2f%sh/s (%.2f%sh/s)", shrate, shr_units, sess_hrate, sess_hr_units, ghrate, ghr_units ); if ( accepted_share_count < submitted_share_count ) { - double lost_ghrate = uptime.tv_sec == 0 ? 0. - : target_diff - * (double)(submitted_share_count - accepted_share_count ) - / (double)uptime.tv_sec; - double lost_shrate = share_time == 0. ? 0. - : target_diff * (double)(submits - accepts ) / share_time; + double lost_ghrate = safe_div( target_diff + * (double)(submitted_share_count - accepted_share_count ), + (double)uptime.tv_sec, 0. ); + double lost_shrate = safe_div( target_diff * (double)(submits - accepts ), share_time, 0. ); char lshr_units[4] = {0}; char lghr_units[4] = {0}; scale_hash_for_display( &lost_shrate, lshr_units ); @@ -2495,18 +2493,21 @@ static void *miner_thread( void *userdata ) timeval_subtract( &uptime, &total_hashes_time, &session_start ); double hashrate = safe_div( total_hashes, uptime.tv_sec, 0. 
); - scale_hash_for_display( &hashrate, hr_units ); - sprintf( hr, "%.2f", hashrate ); + if ( hashrate > 0. ) + { + scale_hash_for_display( &hashrate, hr_units ); + sprintf( hr, "%.2f", hashrate ); #if (defined(_WIN64) || defined(__WINDOWS__) || defined(_WIN32)) - applog( LOG_NOTICE, "Total: %s %sH/s", hr, hr_units ); + applog( LOG_NOTICE, "Total: %s %sH/s", hr, hr_units ); #else - float lo_freq = 0., hi_freq = 0.; - linux_cpu_hilo_freq( &lo_freq, &hi_freq ); - applog( LOG_NOTICE, + float lo_freq = 0., hi_freq = 0.; + linux_cpu_hilo_freq( &lo_freq, &hi_freq ); + applog( LOG_NOTICE, "Total: %s %sH/s, Temp: %dC, Freq: %.3f/%.3f GHz", hr, hr_units, (uint32_t)cpu_temp(0), lo_freq / 1e6, hi_freq / 1e6 ); #endif + } } } // benchmark @@ -2900,6 +2901,7 @@ static bool cpu_capability( bool display_only ) bool algo_has_sse2 = set_incl( SSE2_OPT, algo_features ); bool algo_has_aes = set_incl( AES_OPT, algo_features ); bool algo_has_sse42 = set_incl( SSE42_OPT, algo_features ); + bool algo_has_avx = set_incl( AVX_OPT, algo_features ); bool algo_has_avx2 = set_incl( AVX2_OPT, algo_features ); bool algo_has_avx512 = set_incl( AVX512_OPT, algo_features ); bool algo_has_sha = set_incl( SHA_OPT, algo_features ); @@ -2907,6 +2909,8 @@ static bool cpu_capability( bool display_only ) bool algo_has_vaes256 = set_incl( VAES256_OPT, algo_features ); bool use_aes; bool use_sse2; + bool use_sse42; + bool use_avx; bool use_avx2; bool use_avx512; bool use_sha; @@ -2976,18 +2980,21 @@ static bool cpu_capability( bool display_only ) else if ( sw_has_aes ) printf( " AES" ); if ( sw_has_sha ) printf( " SHA" ); - printf("\nAlgo features:"); - if ( algo_features == EMPTY_SET ) printf( " None" ); - else + if ( !display_only ) { - if ( algo_has_avx512 ) printf( " AVX512" ); - else if ( algo_has_avx2 ) printf( " AVX2 " ); - else if ( algo_has_sse42 ) printf( " SSE4.2" ); - else if ( algo_has_sse2 ) printf( " SSE2 " ); - if ( algo_has_vaes || - algo_has_vaes256 ) printf( " VAES" ); - else if ( algo_has_aes ) printf( " AES" ); - if ( algo_has_sha ) printf( " SHA" ); + printf("\nAlgo features:"); + if ( algo_features == EMPTY_SET ) printf( " None" ); + else + { + if ( algo_has_avx512 ) printf( " AVX512" ); + else if ( algo_has_avx2 ) printf( " AVX2 " ); + else if ( algo_has_sse42 ) printf( " SSE4.2" ); + else if ( algo_has_sse2 ) printf( " SSE2 " ); + if ( algo_has_vaes || + algo_has_vaes256 ) printf( " VAES" ); + else if ( algo_has_aes ) printf( " AES" ); + if ( algo_has_sha ) printf( " SHA" ); + } } printf("\n"); @@ -3022,6 +3029,8 @@ static bool cpu_capability( bool display_only ) // Determine mining options use_sse2 = cpu_has_sse2 && algo_has_sse2; + use_sse42 = cpu_has_sse42 && sw_has_sse42 && algo_has_sse42; + use_avx = cpu_has_avx && sw_has_avx && algo_has_avx; use_aes = cpu_has_aes && sw_has_aes && algo_has_aes; use_avx2 = cpu_has_avx2 && sw_has_avx2 && algo_has_avx2; use_avx512 = cpu_has_avx512 && sw_has_avx512 && algo_has_avx512; @@ -3038,6 +3047,8 @@ static bool cpu_capability( bool display_only ) { if ( use_avx512 ) printf( " AVX512" ); else if ( use_avx2 ) printf( " AVX2" ); + else if ( use_avx ) printf( " AVX" ); + else if ( use_sse42 ) printf( " SSE42" ); else if ( use_sse2 ) printf( " SSE2" ); if ( use_vaes ) printf( " VAES" ); else if ( use_aes ) printf( " AES" ); diff --git a/miner.h b/miner.h index 5592d4ac..99124111 100644 --- a/miner.h +++ b/miner.h @@ -868,9 +868,9 @@ Options:\n\ yespowerr16 Yenten (YTN)\n\ yespower-b2b generic yespower + blake2b\n\ zr5 Ziftr\n\ - -N, --param-n N parameter for scrypt based 
algos\n\ - -R, --param-r R parameter for scrypt based algos\n\ - -K, --param-key Key (pers) parameter for algos that use it\n\ + -N, --param-n=N N parameter for scrypt based algos\n\ + -R, --param-r=N R parameter for scrypt based algos\n\ + -K, --param-key=STRING Key (pers) parameter for algos that use it\n\ -o, --url=URL URL of mining server\n\ -O, --userpass=U:P username:password pair for mining server\n\ -u, --user=USERNAME username for mining server\n\ @@ -886,8 +886,8 @@ Options:\n\ -s, --scantime=N upper bound on time spent scanning current work when\n\ long polling is unavailable, in seconds (default: 5)\n\ --randomize Randomize scan range start to reduce duplicates\n\ - -f, --diff-factor Divide req. difficulty by this factor (std is 1.0)\n\ - -m, --diff-multiplier Multiply difficulty by this factor (std is 1.0)\n\ + -f, --diff-factor=N Divide req. difficulty by this factor (std is 1.0)\n\ + -m, --diff-multiplier=N Multiply difficulty by this factor (std is 1.0)\n\ --hash-meter Display thread hash rates\n\ --coinbase-addr=ADDR payout address for solo mining\n\ --coinbase-sig=TEXT data to insert in the coinbase when possible\n\ @@ -895,9 +895,9 @@ Options:\n\ --no-getwork disable getwork support\n\ --no-gbt disable getblocktemplate support\n\ --no-stratum disable X-Stratum support\n\ - --no-extranonce disable Stratum extranonce support\n\ + --no-extranonce disable Stratum extranonce subscribe\n\ --no-redirect ignore requests to change the URL of the mining server\n\ - -q, --quiet disable per-thread hashmeter output\n\ + -q, --quiet reduce log verbosity\n\ --no-color disable colored output\n\ -D, --debug enable debug output\n\ -P, --protocol-dump verbose dump of protocol-level activities\n" @@ -916,9 +916,9 @@ Options:\n\ --max-rate=N[KMG] Only mine if net hashrate is less than specified value\n\ --max-diff=N Only mine if net difficulty is less than specified value\n\ -c, --config=FILE load a JSON-format configuration file\n\ - --data-file path and name of data file\n\ + --data-file=FILE path and name of data file\n\ --verify enable additional time consuming start up tests\n\ - -V, --version display version information and exit\n\ + -V, --version display version and CPU information and exit\n\ -h, --help display this help text and exit\n\ "; diff --git a/simd-utils/simd-int.h b/simd-utils/simd-int.h index 601c7508..58caa3e5 100644 --- a/simd-utils/simd-int.h +++ b/simd-utils/simd-int.h @@ -2,22 +2,21 @@ #define SIMD_INT_H__ 1 // Endian byte swap -#define bswap_64( a ) __builtin_bswap64( a ) -#define bswap_32( a ) __builtin_bswap32( a ) +#define bswap_64 __builtin_bswap64 +#define bswap_32 __builtin_bswap32 + +// Bit rotation +#define rol64 __rolq +#define ror64 __rorq +#define rol32 __rold +#define ror32 __rord // Safe division, integer or floating point. For floating point it's as -// safe as 0. is precisely zero. -// Returns safe_result if division by zero. +// safe as 0 is precisely zero. +// Returns safe_result if division by zero, typically zero. #define safe_div( dividend, divisor, safe_result ) \ ( (divisor) == 0 ? 
safe_result : ( (dividend) / (divisor) ) ) -// Aliases with familiar names for built in bit rotate instructions -#define rol64( a, n ) _lrotl( a, n ) -#define ror64( a, n ) _lrotr( a, n ) -#define rol32( a, n ) _rotl( a, n ) -#define ror32( a, n ) _rotr( a, n ) -#define rol16( a, n ) _rotwl( a, n ) -#define ror16( a, n ) _rotwr( a, n ) /////////////////////////////////////// // From e6fd9b1d69acf503c8d47975ccf18fcbef28e0b3 Mon Sep 17 00:00:00 2001 From: Jay D Dee Date: Wed, 10 Nov 2021 21:33:44 -0500 Subject: [PATCH 14/20] v3.19.0 --- INSTALL_WINDOWS | 79 +-- RELEASE_NOTES | 17 +- algo/sha/sha-hash-4way.h | 23 +- algo/sha/sha2.c | 19 +- algo/sha/sha256-hash-4way.c | 1061 ++++++++++++++++++++++++----------- algo/sha/sha256d-4way.c | 180 +++--- algo/sha/sha256d-4way.h | 6 +- algo/sha/sha256t-4way.c | 257 ++++++--- compat.h | 4 + configure | 20 +- configure.ac | 2 +- cpu-miner.c | 327 ++++------- winbuild-cross.sh | 32 +- 13 files changed, 1198 insertions(+), 829 deletions(-) diff --git a/INSTALL_WINDOWS b/INSTALL_WINDOWS index 02a829ed..b61f0915 100644 --- a/INSTALL_WINDOWS +++ b/INSTALL_WINDOWS @@ -40,7 +40,7 @@ $ mkdir $HOME/usr/lib version available in the repositories. Download the following source code packages from their respective and -respected download locations, copy them to ~/usr/lib/ and uncompress them. +respected download locations, copy them to $HOME/usr/lib/ and uncompress them. openssl: https://github.com/openssl/openssl/releases @@ -149,85 +149,10 @@ Copy cpuminer.exe to the release directory, compress and copy the release direct Run cpuminer -In a command windows change directories to the unzipped release folder. to get a list of all options: +In a command windows change directories to the unzipped release folder. To get a list of all options: cpuminer.exe --help Command options are specific to where you mine. Refer to the pool's instructions on how to set them. - - - - - - - - - - - - - - - - - - - -Create a link to the locally compiled version of gmp.h - -$ ln -s $LOCAL_LIB/gmp-version/gmp.h ./gmp.h - -Edit configure.ac to fix lipthread package name. - -sed -i 's/"-lpthread"/"-lpthreadGC2"/g' configure.ac - - -7. Compile - -you can use the default compile if you intend to use cpuminer-opt on the -same CPU and the virtual machine supports that architecture. - -./build.sh - -Otherwise you can compile manually while setting options in CFLAGS. - -Some common options: - -To compile for a specific CPU architecture: - -CFLAGS="-O3 -march=znver1 -Wall" ./configure --with-curl - -This will compile for AMD Ryzen. - -You can compile more generically for a set of specific CPU features -if you know what features you want: - -CFLAGS="-O3 -maes -msse4.2 -Wall" ./configure --with-curl - -This will compile for an older CPU that does not have AVX. - -You can find several examples in build-allarch.sh - -If you have a CPU with more than 64 threads and Windows 7 or higher you -can enable the CPU Groups feature: - --D_WIN32_WINNT==0x0601 - -Once you have run configure successfully run make with n CPU threads: - -make -j n - -Copy cpuminer.exe to the release directory, compress and copy the release -directory to a Windows system and run cpuminer.exe from the command line. - -Run cpuminer - -In a command windows change directories to the unzipped release folder. -to get a list of all options: - -cpuminer.exe --help - -Command options are specific to where you mine. Refer to the pool's -instructions on how to set them. 
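The safe_div() macro defined in simd-utils/simd-int.h above is what the
report_summary_log() changes use in place of the open-coded divide-by-zero
guards. A minimal standalone sketch of the usage pattern (example_share_rate
is a hypothetical wrapper, not taken from cpu-miner.c):

#include <sys/time.h>

#define safe_div( dividend, divisor, safe_result ) \
   ( (divisor) == 0 ? safe_result : ( (dividend) / (divisor) ) )

double example_share_rate( const struct timeval *uptime,
                           int submitted_share_count )
{
   // Shares per minute, or 0. while the session uptime is still exactly
   // zero, mirroring the share-rate line of the periodic report.
   double uptime_s = (double)uptime->tv_sec + (double)uptime->tv_usec / 1e6;
   return safe_div( (double)submitted_share_count * 60., uptime_s, 0. );
}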
diff --git a/RELEASE_NOTES b/RELEASE_NOTES index b3b48785..ce7752b4 100644 --- a/RELEASE_NOTES +++ b/RELEASE_NOTES @@ -65,7 +65,22 @@ If not what makes it happen or not happen? Change Log ---------- -v3.8.2 +v3.19.0 + +Windows binaries now built with support for CPU groups, requires Windows 7. + +Changes to cpu-affinity: + - PR#346: Fixed incorrect CPU affinity on Windows built for CPU groups, + - added support for CPU affinity for up to 256 threads or CPUs, + - streamlined code for more efficient initialization of miner threads, + - precise affining of each miner thread to a specific CPU, + - added an option to disable CPU affinity with "--cpu-affinity 0" + +Faster sha256t with AVX512 & AVX2. + +Added stratum error count to stats log, reported only when non-zero. + +v3.18.2 Issue #342, fixed Groestl AES on Windows, broken in v3.18.0. diff --git a/algo/sha/sha-hash-4way.h b/algo/sha/sha-hash-4way.h index 63a8f927..6428e2ba 100644 --- a/algo/sha/sha-hash-4way.h +++ b/algo/sha/sha-hash-4way.h @@ -62,6 +62,12 @@ void sha256_4way_transform_le( __m128i *state_out, const __m128i *data, const __m128i *state_in ); void sha256_4way_transform_be( __m128i *state_out, const __m128i *data, const __m128i *state_in ); +void sha256_4way_prehash_3rounds( __m128i *state_mid, __m128i *X, + const __m128i *W, const __m128i *state_in ); +void sha256_4way_final_rounds( __m128i *state_out, const __m128i *data, + const __m128i *state_in, const __m128i *state_mid, const __m128i *X ); +int sha256_4way_transform_le_short( __m128i *state_out, const __m128i *data, + const __m128i *state_in ); #endif // SSE2 @@ -84,10 +90,12 @@ void sha256_8way_transform_le( __m256i *state_out, const __m256i *data, void sha256_8way_transform_be( __m256i *state_out, const __m256i *data, const __m256i *state_in ); -void sha256_8way_prehash_3rounds( __m256i *state_mid, const __m256i *W, - const __m256i *state_in ); +void sha256_8way_prehash_3rounds( __m256i *state_mid, __m256i *X, + const __m256i *W, const __m256i *state_in ); void sha256_8way_final_rounds( __m256i *state_out, const __m256i *data, - const __m256i *state_in, const __m256i *state_mid ); + const __m256i *state_in, const __m256i *state_mid, const __m256i *X ); +int sha256_8way_transform_le_short( __m256i *state_out, const __m256i *data, + const __m256i *state_in ); #endif // AVX2 @@ -109,10 +117,13 @@ void sha256_16way_transform_le( __m512i *state_out, const __m512i *data, const __m512i *state_in ); void sha256_16way_transform_be( __m512i *state_out, const __m512i *data, const __m512i *state_in ); -void sha256_16way_prehash_3rounds( __m512i *state_mid, const __m512i *W, - const __m512i *state_in ); +void sha256_16way_prehash_3rounds( __m512i *state_mid, __m512i *X, + const __m512i *W, const __m512i *state_in ); void sha256_16way_final_rounds( __m512i *state_out, const __m512i *data, - const __m512i *state_in, const __m512i *state_mid ); + const __m512i *state_in, const __m512i *state_mid, const __m512i *X ); + +int sha256_16way_transform_le_short( __m512i *state_out, const __m512i *data, + const __m512i *state_in ); #endif // AVX512 diff --git a/algo/sha/sha2.c b/algo/sha/sha2.c index 63651c3d..ef152738 100644 --- a/algo/sha/sha2.c +++ b/algo/sha/sha2.c @@ -611,11 +611,11 @@ static inline int scanhash_sha256d_8way_pooler( struct work *work, #endif /* HAVE_SHA256_8WAY */ -int scanhash_sha256d_pooler( struct work *work, - uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ) +int scanhash_sha256d_pooler( struct work *work, uint32_t max_nonce, + uint64_t 
*hashes_done, struct thr_info *mythr ) { - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; uint32_t _ALIGN(128) data[64]; uint32_t _ALIGN(32) hash[8]; uint32_t _ALIGN(32) midstate[8]; @@ -626,12 +626,12 @@ int scanhash_sha256d_pooler( struct work *work, int thr_id = mythr->id; // thr_id arg is deprecated #ifdef HAVE_SHA256_8WAY - if (sha256_use_8way()) - return scanhash_sha256d_8way_pooler( work, max_nonce, hashes_done, mythr ); + if ( sha256_use_8way() ) + return scanhash_sha256d_8way_pooler( work, max_nonce, hashes_done, mythr ); #endif #ifdef HAVE_SHA256_4WAY - if (sha256_use_4way()) - return scanhash_sha256d_4way_pooler( work, max_nonce, hashes_done, mythr ); + if ( sha256_use_4way() ) + return scanhash_sha256d_4way_pooler( work, max_nonce, hashes_done, mythr ); #endif memcpy(data, pdata + 16, 64); @@ -695,8 +695,11 @@ bool register_sha256d_algo( algo_gate_t* gate ) gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT; #if defined(SHA256D_16WAY) gate->scanhash = (void*)&scanhash_sha256d_16way; +//#elif defined(SHA256D_8WAY) +// gate->scanhash = (void*)&scanhash_sha256d_8way; #else gate->scanhash = (void*)&scanhash_sha256d_pooler; +// gate->scanhash = (void*)&scanhash_sha256d_4way; #endif // gate->hash = (void*)&sha256d; return true; diff --git a/algo/sha/sha256-hash-4way.c b/algo/sha/sha256-hash-4way.c index b520746e..dd96d79d 100644 --- a/algo/sha/sha256-hash-4way.c +++ b/algo/sha/sha256-hash-4way.c @@ -1,34 +1,3 @@ -/* $Id: sha2big.c 216 2010-06-08 09:46:57Z tp $ */ -/* - * SHA-384 / SHA-512 implementation. - * - * ==========================(LICENSE BEGIN)============================ - * - * Copyright (c) 2007-2010 Projet RNRT SAPHIR - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- * - * ===========================(LICENSE END)============================= - * - * @author Thomas Pornin - */ #if defined(__SSE2__) @@ -66,10 +35,7 @@ static const uint32_t K256[64] = 0x90BEFFFA, 0xA4506CEB, 0xBEF9A3F7, 0xC67178F2 }; -// SHA-256 4 way - -#define SHA2s_MEXP( a, b, c, d ) \ - mm128_add4_32( SSG2_1( W[a] ), W[b], SSG2_0( W[c] ), W[d] ); +// SHA-256 4 way SSE2 #define CHs(X, Y, Z) \ _mm_xor_si128( _mm_and_si128( _mm_xor_si128( Y, Z ), X ), Z ) @@ -94,6 +60,27 @@ static const uint32_t K256[64] = _mm_xor_si128( _mm_xor_si128( \ mm128_ror_32(x, 17), mm128_ror_32(x, 19) ), _mm_srli_epi32(x, 10) ) +#define SHA2s_MEXP( a, b, c, d ) \ + mm128_add4_32( SSG2_1( a ), b, SSG2_0( c ), d ); + +#define SHA256x4_MSG_EXPANSION( W ) \ + W[ 0] = SHA2s_MEXP( W[14], W[ 9], W[ 1], W[ 0] ); \ + W[ 1] = SHA2s_MEXP( W[15], W[10], W[ 2], W[ 1] ); \ + W[ 2] = SHA2s_MEXP( W[ 0], W[11], W[ 3], W[ 2] ); \ + W[ 3] = SHA2s_MEXP( W[ 1], W[12], W[ 4], W[ 3] ); \ + W[ 4] = SHA2s_MEXP( W[ 2], W[13], W[ 5], W[ 4] ); \ + W[ 5] = SHA2s_MEXP( W[ 3], W[14], W[ 6], W[ 5] ); \ + W[ 6] = SHA2s_MEXP( W[ 4], W[15], W[ 7], W[ 6] ); \ + W[ 7] = SHA2s_MEXP( W[ 5], W[ 0], W[ 8], W[ 7] ); \ + W[ 8] = SHA2s_MEXP( W[ 6], W[ 1], W[ 9], W[ 8] ); \ + W[ 9] = SHA2s_MEXP( W[ 7], W[ 2], W[10], W[ 9] ); \ + W[10] = SHA2s_MEXP( W[ 8], W[ 3], W[11], W[10] ); \ + W[11] = SHA2s_MEXP( W[ 9], W[ 4], W[12], W[11] ); \ + W[12] = SHA2s_MEXP( W[10], W[ 5], W[13], W[12] ); \ + W[13] = SHA2s_MEXP( W[11], W[ 6], W[14], W[13] ); \ + W[14] = SHA2s_MEXP( W[12], W[ 7], W[15], W[14] ); \ + W[15] = SHA2s_MEXP( W[13], W[ 8], W[ 0], W[15] ); + #define SHA2s_4WAY_STEP(A, B, C, D, E, F, G, H, i, j) \ do { \ __m128i T1, T2; \ @@ -106,11 +93,32 @@ do { \ H = _mm_add_epi32( T1, T2 ); \ } while (0) +#define SHA256x4_16ROUNDS( A, B, C, D, E, F, G, H, j ) \ +{ \ + __m128i X_xor_Y, Y_xor_Z = _mm_xor_si128( B, C ); \ + SHA2s_4WAY_STEP( A, B, C, D, E, F, G, H, 0, j ); \ + SHA2s_4WAY_STEP( H, A, B, C, D, E, F, G, 1, j ); \ + SHA2s_4WAY_STEP( G, H, A, B, C, D, E, F, 2, j ); \ + SHA2s_4WAY_STEP( F, G, H, A, B, C, D, E, 3, j ); \ + SHA2s_4WAY_STEP( E, F, G, H, A, B, C, D, 4, j ); \ + SHA2s_4WAY_STEP( D, E, F, G, H, A, B, C, 5, j ); \ + SHA2s_4WAY_STEP( C, D, E, F, G, H, A, B, 6, j ); \ + SHA2s_4WAY_STEP( B, C, D, E, F, G, H, A, 7, j ); \ + SHA2s_4WAY_STEP( A, B, C, D, E, F, G, H, 8, j ); \ + SHA2s_4WAY_STEP( H, A, B, C, D, E, F, G, 9, j ); \ + SHA2s_4WAY_STEP( G, H, A, B, C, D, E, F, 10, j ); \ + SHA2s_4WAY_STEP( F, G, H, A, B, C, D, E, 11, j ); \ + SHA2s_4WAY_STEP( E, F, G, H, A, B, C, D, 12, j ); \ + SHA2s_4WAY_STEP( D, E, F, G, H, A, B, C, 13, j ); \ + SHA2s_4WAY_STEP( C, D, E, F, G, H, A, B, 14, j ); \ + SHA2s_4WAY_STEP( B, C, D, E, F, G, H, A, 15, j ); \ +} + // LE data, no need to byte swap static inline void SHA256_4WAY_TRANSFORM( __m128i *out, __m128i *W, const __m128i *in ) { - __m128i A, B, C, D, E, F, G, H, X_xor_Y, Y_xor_Z; + __m128i A, B, C, D, E, F, G, H; A = in[0]; B = in[1]; @@ -120,61 +128,14 @@ static inline void SHA256_4WAY_TRANSFORM( __m128i *out, __m128i *W, F = in[5]; G = in[6]; H = in[7]; - Y_xor_Z = _mm_xor_si128( B, C ); - - SHA2s_4WAY_STEP( A, B, C, D, E, F, G, H, 0, 0 ); - SHA2s_4WAY_STEP( H, A, B, C, D, E, F, G, 1, 0 ); - SHA2s_4WAY_STEP( G, H, A, B, C, D, E, F, 2, 0 ); - SHA2s_4WAY_STEP( F, G, H, A, B, C, D, E, 3, 0 ); - SHA2s_4WAY_STEP( E, F, G, H, A, B, C, D, 4, 0 ); - SHA2s_4WAY_STEP( D, E, F, G, H, A, B, C, 5, 0 ); - SHA2s_4WAY_STEP( C, D, E, F, G, H, A, B, 6, 0 ); - SHA2s_4WAY_STEP( B, C, D, E, F, G, H, A, 7, 0 ); - SHA2s_4WAY_STEP( 
A, B, C, D, E, F, G, H, 8, 0 ); - SHA2s_4WAY_STEP( H, A, B, C, D, E, F, G, 9, 0 ); - SHA2s_4WAY_STEP( G, H, A, B, C, D, E, F, 10, 0 ); - SHA2s_4WAY_STEP( F, G, H, A, B, C, D, E, 11, 0 ); - SHA2s_4WAY_STEP( E, F, G, H, A, B, C, D, 12, 0 ); - SHA2s_4WAY_STEP( D, E, F, G, H, A, B, C, 13, 0 ); - SHA2s_4WAY_STEP( C, D, E, F, G, H, A, B, 14, 0 ); - SHA2s_4WAY_STEP( B, C, D, E, F, G, H, A, 15, 0 ); - for ( int j = 16; j < 64; j += 16 ) - { - W[ 0] = SHA2s_MEXP( 14, 9, 1, 0 ); - W[ 1] = SHA2s_MEXP( 15, 10, 2, 1 ); - W[ 2] = SHA2s_MEXP( 0, 11, 3, 2 ); - W[ 3] = SHA2s_MEXP( 1, 12, 4, 3 ); - W[ 4] = SHA2s_MEXP( 2, 13, 5, 4 ); - W[ 5] = SHA2s_MEXP( 3, 14, 6, 5 ); - W[ 6] = SHA2s_MEXP( 4, 15, 7, 6 ); - W[ 7] = SHA2s_MEXP( 5, 0, 8, 7 ); - W[ 8] = SHA2s_MEXP( 6, 1, 9, 8 ); - W[ 9] = SHA2s_MEXP( 7, 2, 10, 9 ); - W[10] = SHA2s_MEXP( 8, 3, 11, 10 ); - W[11] = SHA2s_MEXP( 9, 4, 12, 11 ); - W[12] = SHA2s_MEXP( 10, 5, 13, 12 ); - W[13] = SHA2s_MEXP( 11, 6, 14, 13 ); - W[14] = SHA2s_MEXP( 12, 7, 15, 14 ); - W[15] = SHA2s_MEXP( 13, 8, 0, 15 ); - - SHA2s_4WAY_STEP( A, B, C, D, E, F, G, H, 0, j ); - SHA2s_4WAY_STEP( H, A, B, C, D, E, F, G, 1, j ); - SHA2s_4WAY_STEP( G, H, A, B, C, D, E, F, 2, j ); - SHA2s_4WAY_STEP( F, G, H, A, B, C, D, E, 3, j ); - SHA2s_4WAY_STEP( E, F, G, H, A, B, C, D, 4, j ); - SHA2s_4WAY_STEP( D, E, F, G, H, A, B, C, 5, j ); - SHA2s_4WAY_STEP( C, D, E, F, G, H, A, B, 6, j ); - SHA2s_4WAY_STEP( B, C, D, E, F, G, H, A, 7, j ); - SHA2s_4WAY_STEP( A, B, C, D, E, F, G, H, 8, j ); - SHA2s_4WAY_STEP( H, A, B, C, D, E, F, G, 9, j ); - SHA2s_4WAY_STEP( G, H, A, B, C, D, E, F, 10, j ); - SHA2s_4WAY_STEP( F, G, H, A, B, C, D, E, 11, j ); - SHA2s_4WAY_STEP( E, F, G, H, A, B, C, D, 12, j ); - SHA2s_4WAY_STEP( D, E, F, G, H, A, B, C, 13, j ); - SHA2s_4WAY_STEP( C, D, E, F, G, H, A, B, 14, j ); - SHA2s_4WAY_STEP( B, C, D, E, F, G, H, A, 15, j ); - } + SHA256x4_16ROUNDS( A, B, C, D, E, F, G, H, 0 ); + SHA256x4_MSG_EXPANSION( W ); + SHA256x4_16ROUNDS( A, B, C, D, E, F, G, H, 16 ); + SHA256x4_MSG_EXPANSION( W ); + SHA256x4_16ROUNDS( A, B, C, D, E, F, G, H, 32 ); + SHA256x4_MSG_EXPANSION( W ); + SHA256x4_16ROUNDS( A, B, C, D, E, F, G, H, 48 ); out[0] = _mm_add_epi32( in[0], A ); out[1] = _mm_add_epi32( in[1], B ); @@ -205,6 +166,245 @@ void sha256_4way_transform_be( __m128i *state_out, const __m128i *data, SHA256_4WAY_TRANSFORM( state_out, W, state_in ); } +void sha256_4way_prehash_3rounds( __m128i *state_mid, __m128i *X, + const __m128i *W, const __m128i *state_in ) +{ + __m128i A, B, C, D, E, F, G, H; + + // precalculate constant part msg expansion for second iteration. 
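+   // X[t] caches the terms of the expanded message word W[16+t] that do not
+   // depend on W[3], the nonce word of the second block; the remaining
+   // nonce-dependent terms are folded in by sha256_4way_final_rounds().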
+ X[ 0] = SHA2s_MEXP( W[14], W[ 9], W[ 1], W[ 0] ); + X[ 1] = SHA2s_MEXP( W[15], W[10], W[ 2], W[ 1] ); + X[ 2] = _mm_add_epi32( _mm_add_epi32( SSG2_1( X[ 0] ), W[11] ), + W[ 2] ); + X[ 3] = _mm_add_epi32( _mm_add_epi32( SSG2_1( X[ 1] ), W[12] ), + SSG2_0( W[ 4] ) ); + X[ 4] = _mm_add_epi32( _mm_add_epi32( W[13], SSG2_0( W[ 5] ) ), + W[ 4] ); + X[ 5] = _mm_add_epi32( _mm_add_epi32( W[14], SSG2_0( W[ 6] ) ), + W[ 5] ); + X [6] = _mm_add_epi32( _mm_add_epi32( W[15], SSG2_0( W[ 7] ) ), + W[ 6] ); + X[ 7] = _mm_add_epi32( _mm_add_epi32( X[ 0], SSG2_0( W[ 8] ) ), + W[ 7] ); + X[ 8] = _mm_add_epi32( _mm_add_epi32( X[ 1], SSG2_0( W[ 9] ) ), + W[ 8] ); + X[ 9] = _mm_add_epi32( SSG2_0( W[10] ), W[ 9] ); + X[10] = _mm_add_epi32( SSG2_0( W[11] ), W[10] ); + X[11] = _mm_add_epi32( SSG2_0( W[12] ), W[11] ); + X[12] = _mm_add_epi32( SSG2_0( W[13] ), W[12] ); + X[13] = _mm_add_epi32( SSG2_0( W[14] ), W[13] ); + X[14] = _mm_add_epi32( SSG2_0( W[15] ), W[14] ); + X[15] = _mm_add_epi32( SSG2_0( X[ 0] ), W[15] ); + + A = _mm_load_si128( state_in ); + B = _mm_load_si128( state_in + 1 ); + C = _mm_load_si128( state_in + 2 ); + D = _mm_load_si128( state_in + 3 ); + E = _mm_load_si128( state_in + 4 ); + F = _mm_load_si128( state_in + 5 ); + G = _mm_load_si128( state_in + 6 ); + H = _mm_load_si128( state_in + 7 ); + + __m128i X_xor_Y, Y_xor_Z = _mm_xor_si128( B, C ); + + SHA2s_4WAY_STEP( A, B, C, D, E, F, G, H, 0, 0 ); + SHA2s_4WAY_STEP( H, A, B, C, D, E, F, G, 1, 0 ); + SHA2s_4WAY_STEP( G, H, A, B, C, D, E, F, 2, 0 ); + + _mm_store_si128( state_mid , A ); + _mm_store_si128( state_mid + 1, B ); + _mm_store_si128( state_mid + 2, C ); + _mm_store_si128( state_mid + 3, D ); + _mm_store_si128( state_mid + 4, E ); + _mm_store_si128( state_mid + 5, F ); + _mm_store_si128( state_mid + 6, G ); + _mm_store_si128( state_mid + 7, H ); +} + +void sha256_4way_final_rounds( __m128i *state_out, const __m128i *data, + const __m128i *state_in, const __m128i *state_mid, const __m128i *X ) +{ + __m128i A, B, C, D, E, F, G, H; + __m128i W[16]; + + memcpy_128( W, data, 16 ); + + A = _mm_load_si128( state_mid ); + B = _mm_load_si128( state_mid + 1 ); + C = _mm_load_si128( state_mid + 2 ); + D = _mm_load_si128( state_mid + 3 ); + E = _mm_load_si128( state_mid + 4 ); + F = _mm_load_si128( state_mid + 5 ); + G = _mm_load_si128( state_mid + 6 ); + H = _mm_load_si128( state_mid + 7 ); + + __m128i X_xor_Y, Y_xor_Z = _mm_xor_si128( G, H ); + + SHA2s_4WAY_STEP( F, G, H, A, B, C, D, E, 3, 0 ); + SHA2s_4WAY_STEP( E, F, G, H, A, B, C, D, 4, 0 ); + SHA2s_4WAY_STEP( D, E, F, G, H, A, B, C, 5, 0 ); + SHA2s_4WAY_STEP( C, D, E, F, G, H, A, B, 6, 0 ); + SHA2s_4WAY_STEP( B, C, D, E, F, G, H, A, 7, 0 ); + SHA2s_4WAY_STEP( A, B, C, D, E, F, G, H, 8, 0 ); + SHA2s_4WAY_STEP( H, A, B, C, D, E, F, G, 9, 0 ); + SHA2s_4WAY_STEP( G, H, A, B, C, D, E, F, 10, 0 ); + SHA2s_4WAY_STEP( F, G, H, A, B, C, D, E, 11, 0 ); + SHA2s_4WAY_STEP( E, F, G, H, A, B, C, D, 12, 0 ); + SHA2s_4WAY_STEP( D, E, F, G, H, A, B, C, 13, 0 ); + SHA2s_4WAY_STEP( C, D, E, F, G, H, A, B, 14, 0 ); + SHA2s_4WAY_STEP( B, C, D, E, F, G, H, A, 15, 0 ); + + // update precalculated msg expansion with new nonce: W[3]. 
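+   // Only W[3] has changed since the prehash: W[0] and W[1] are taken from
+   // the cache unchanged, while W[2]..W[15] add back the terms that depend
+   // on W[3] directly or through freshly recomputed message words.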
+ W[ 0] = X[ 0]; + W[ 1] = X[ 1]; + W[ 2] = _mm_add_epi32( X[ 2], SSG2_0( W[ 3] ) ); + W[ 3] = _mm_add_epi32( X[ 3], W[ 3] ); + W[ 4] = _mm_add_epi32( X[ 4], SSG2_1( W[ 2] ) ); + W[ 5] = _mm_add_epi32( X[ 5], SSG2_1( W[ 3] ) ); + W[ 6] = _mm_add_epi32( X[ 6], SSG2_1( W[ 4] ) ); + W[ 7] = _mm_add_epi32( X[ 7], SSG2_1( W[ 5] ) ); + W[ 8] = _mm_add_epi32( X[ 8], SSG2_1( W[ 6] ) ); + W[ 9] = _mm_add_epi32( X[ 9], _mm_add_epi32( SSG2_1( W[ 7] ), + W[ 2] ) ); + W[10] = _mm_add_epi32( X[10], _mm_add_epi32( SSG2_1( W[ 8] ), + W[ 3] ) ); + W[11] = _mm_add_epi32( X[11], _mm_add_epi32( SSG2_1( W[ 9] ), + W[ 4] ) ); + W[12] = _mm_add_epi32( X[12], _mm_add_epi32( SSG2_1( W[10] ), + W[ 5] ) ); + W[13] = _mm_add_epi32( X[13], _mm_add_epi32( SSG2_1( W[11] ), + W[ 6] ) ); + W[14] = _mm_add_epi32( X[14], _mm_add_epi32( SSG2_1( W[12] ), + W[ 7] ) ); + W[15] = _mm_add_epi32( X[15], _mm_add_epi32( SSG2_1( W[13] ), + W[ 8] ) ); + + SHA256x4_16ROUNDS( A, B, C, D, E, F, G, H, 16 ); + SHA256x4_MSG_EXPANSION( W ); + SHA256x4_16ROUNDS( A, B, C, D, E, F, G, H, 32 ); + SHA256x4_MSG_EXPANSION( W ); + SHA256x4_16ROUNDS( A, B, C, D, E, F, G, H, 48 ); + + A = _mm_add_epi32( A, _mm_load_si128( state_in ) ); + B = _mm_add_epi32( B, _mm_load_si128( state_in + 1 ) ); + C = _mm_add_epi32( C, _mm_load_si128( state_in + 2 ) ); + D = _mm_add_epi32( D, _mm_load_si128( state_in + 3 ) ); + E = _mm_add_epi32( E, _mm_load_si128( state_in + 4 ) ); + F = _mm_add_epi32( F, _mm_load_si128( state_in + 5 ) ); + G = _mm_add_epi32( G, _mm_load_si128( state_in + 6 ) ); + H = _mm_add_epi32( H, _mm_load_si128( state_in + 7 ) ); + + _mm_store_si128( state_out , A ); + _mm_store_si128( state_out + 1, B ); + _mm_store_si128( state_out + 2, C ); + _mm_store_si128( state_out + 3, D ); + _mm_store_si128( state_out + 4, E ); + _mm_store_si128( state_out + 5, F ); + _mm_store_si128( state_out + 6, G ); + _mm_store_si128( state_out + 7, H ); +} + +// returns 0 if hash aborted early and invalid. 
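+// The early exit is possible because rounds 61..63 never write register H,
+// and output word 7 is state_in[7] + H. With the standard IV as state_in
+// (the second hash of sha256d/sha256t), final hash word 7 (hash32_d7 in the
+// scanhash code) can only be zero -- the usual quick share test -- when H
+// after round 60 equals H_ (K256[60] is deferred until after the test).
+// Lanes are compared in parallel; if none match, the remaining rounds and
+// the output additions are skipped.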
+int sha256_4way_transform_le_short( __m128i *state_out, const __m128i *data, + const __m128i *state_in ) +{ + __m128i A, B, C, D, E, F, G, H; + __m128i W[16]; memcpy_128( W, data, 16 ); + // Value required by H after round 60 to produce valid final hash + const __m128i H_ = m128_const1_32( 0x136032ED ); + + A = _mm_load_si128( state_in ); + B = _mm_load_si128( state_in+1 ); + C = _mm_load_si128( state_in+2 ); + D = _mm_load_si128( state_in+3 ); + E = _mm_load_si128( state_in+4 ); + F = _mm_load_si128( state_in+5 ); + G = _mm_load_si128( state_in+6 ); + H = _mm_load_si128( state_in+7 ); + + SHA256x4_16ROUNDS( A, B, C, D, E, F, G, H, 0 ); + SHA256x4_MSG_EXPANSION( W ); + SHA256x4_16ROUNDS( A, B, C, D, E, F, G, H, 16 ); + SHA256x4_MSG_EXPANSION( W ); + SHA256x4_16ROUNDS( A, B, C, D, E, F, G, H, 32 ); + + W[ 0] = SHA2s_MEXP( W[14], W[ 9], W[ 1], W[ 0] ); + W[ 1] = SHA2s_MEXP( W[15], W[10], W[ 2], W[ 1] ); + W[ 2] = SHA2s_MEXP( W[ 0], W[11], W[ 3], W[ 2] ); + W[ 3] = SHA2s_MEXP( W[ 1], W[12], W[ 4], W[ 3] ); + W[ 4] = SHA2s_MEXP( W[ 2], W[13], W[ 5], W[ 4] ); + W[ 5] = SHA2s_MEXP( W[ 3], W[14], W[ 6], W[ 5] ); + W[ 6] = SHA2s_MEXP( W[ 4], W[15], W[ 7], W[ 6] ); + W[ 7] = SHA2s_MEXP( W[ 5], W[ 0], W[ 8], W[ 7] ); + W[ 8] = SHA2s_MEXP( W[ 6], W[ 1], W[ 9], W[ 8] ); + W[ 9] = SHA2s_MEXP( W[ 7], W[ 2], W[10], W[ 9] ); + W[10] = SHA2s_MEXP( W[ 8], W[ 3], W[11], W[10] ); + W[11] = SHA2s_MEXP( W[ 9], W[ 4], W[12], W[11] ); + W[12] = SHA2s_MEXP( W[10], W[ 5], W[13], W[12] ); + + __m128i X_xor_Y, Y_xor_Z = _mm_xor_si128( B, C ); + + SHA2s_4WAY_STEP( A, B, C, D, E, F, G, H, 0, 48 ); + SHA2s_4WAY_STEP( H, A, B, C, D, E, F, G, 1, 48 ); + SHA2s_4WAY_STEP( G, H, A, B, C, D, E, F, 2, 48 ); + SHA2s_4WAY_STEP( F, G, H, A, B, C, D, E, 3, 48 ); + SHA2s_4WAY_STEP( E, F, G, H, A, B, C, D, 4, 48 ); + SHA2s_4WAY_STEP( D, E, F, G, H, A, B, C, 5, 48 ); + SHA2s_4WAY_STEP( C, D, E, F, G, H, A, B, 6, 48 ); + SHA2s_4WAY_STEP( B, C, D, E, F, G, H, A, 7, 48 ); + SHA2s_4WAY_STEP( A, B, C, D, E, F, G, H, 8, 48 ); + + __m128i T1_57 = _mm_add_epi32( G, + mm128_add4_32( BSG2_1( D ), CHs( D, E, F ), + _mm_set1_epi32( K256[57] ), W[ 9] ) ); + C = _mm_add_epi32( C, T1_57 ); + + __m128i T1_58 = _mm_add_epi32( F, + mm128_add4_32( BSG2_1( C ), CHs( C, D, E ), + _mm_set1_epi32( K256[58] ), W[10] ) ); + B = _mm_add_epi32( B, T1_58 ); + + __m128i T1_59 = _mm_add_epi32( E, + mm128_add4_32( BSG2_1( B ), CHs( B, C, D ), + _mm_set1_epi32( K256[59] ), W[11] ) ); + A = _mm_add_epi32( A, T1_59 ); + + __m128i T1_60 = mm128_add4_32( D, BSG2_1( A ), CHs( A, B, C ), W[12] ); + H = _mm_add_epi32( H, T1_60 ); + + if ( _mm_movemask_ps( (__m128)_mm_cmpeq_epi32( H, H_ ) ) == 0 ) + return 0; + + __m128i K60 = _mm_set1_epi32( K256[60] ); + H = _mm_add_epi32( H, K60 ); + + G = _mm_add_epi32( T1_57, _mm_add_epi32( BSG2_0( H ), + MAJs( H, A, B ) ) ); + F = _mm_add_epi32( T1_58, _mm_add_epi32( BSG2_0( G ), + MAJs( G, H, A ) ) ); + E = _mm_add_epi32( T1_59, _mm_add_epi32( BSG2_0( F ), + MAJs( F, G, H ) ) ); + D = mm128_add4_32( T1_60, BSG2_0( E ), MAJs( E, F, G ), K60 ); + + W[13] = SHA2s_MEXP( W[11], W[ 6], W[14], W[13] ); + W[14] = SHA2s_MEXP( W[12], W[ 7], W[15], W[14] ); + W[15] = SHA2s_MEXP( W[13], W[ 8], W[ 0], W[15] ); + + SHA2s_4WAY_STEP( D, E, F, G, H, A, B, C, 13, 48 ); + SHA2s_4WAY_STEP( C, D, E, F, G, H, A, B, 14, 48 ); + SHA2s_4WAY_STEP( B, C, D, E, F, G, H, A, 15, 48 ); + + state_out[0] = _mm_add_epi32( state_in[0], A ); + state_out[1] = _mm_add_epi32( state_in[1], B ); + state_out[2] = _mm_add_epi32( state_in[2], C ); + state_out[3] = 
_mm_add_epi32( state_in[3], D ); + state_out[4] = _mm_add_epi32( state_in[4], E ); + state_out[5] = _mm_add_epi32( state_in[5], F ); + state_out[6] = _mm_add_epi32( state_in[6], G ); + state_out[7] = _mm_add_epi32( state_in[7], H ); + return 1; +} + void sha256_4way_init( sha256_4way_context *sc ) { sc->count_high = sc->count_low = 0; @@ -314,7 +514,26 @@ void sha256_4way_full( void *dst, const void *data, size_t len ) _mm256_srli_epi32( x, 10 ) ) #define SHA2x_MEXP( a, b, c, d ) \ - mm256_add4_32( SSG2_1x( W[a] ), W[b], SSG2_0x( W[c] ), W[d] ); + mm256_add4_32( SSG2_1x( a ), b, SSG2_0x( c ), d ); + +#define SHA256x8_MSG_EXPANSION( W ) \ + W[ 0] = SHA2x_MEXP( W[14], W[ 9], W[ 1], W[ 0] ); \ + W[ 1] = SHA2x_MEXP( W[15], W[10], W[ 2], W[ 1] ); \ + W[ 2] = SHA2x_MEXP( W[ 0], W[11], W[ 3], W[ 2] ); \ + W[ 3] = SHA2x_MEXP( W[ 1], W[12], W[ 4], W[ 3] ); \ + W[ 4] = SHA2x_MEXP( W[ 2], W[13], W[ 5], W[ 4] ); \ + W[ 5] = SHA2x_MEXP( W[ 3], W[14], W[ 6], W[ 5] ); \ + W[ 6] = SHA2x_MEXP( W[ 4], W[15], W[ 7], W[ 6] ); \ + W[ 7] = SHA2x_MEXP( W[ 5], W[ 0], W[ 8], W[ 7] ); \ + W[ 8] = SHA2x_MEXP( W[ 6], W[ 1], W[ 9], W[ 8] ); \ + W[ 9] = SHA2x_MEXP( W[ 7], W[ 2], W[10], W[ 9] ); \ + W[10] = SHA2x_MEXP( W[ 8], W[ 3], W[11], W[10] ); \ + W[11] = SHA2x_MEXP( W[ 9], W[ 4], W[12], W[11] ); \ + W[12] = SHA2x_MEXP( W[10], W[ 5], W[13], W[12] ); \ + W[13] = SHA2x_MEXP( W[11], W[ 6], W[14], W[13] ); \ + W[14] = SHA2x_MEXP( W[12], W[ 7], W[15], W[14] ); \ + W[15] = SHA2x_MEXP( W[13], W[ 8], W[ 0], W[15] ); + // With AVX512VL ternary logic optimizations are available. // If not optimize by forwarding the result of X^Y in MAJ to the next round @@ -341,6 +560,24 @@ do { \ H = _mm256_add_epi32( T1, T2 ); \ } while (0) +#define SHA256x8_16ROUNDS( A, B, C, D, E, F, G, H, j ) \ + SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H, 0, j ); \ + SHA2s_8WAY_STEP( H, A, B, C, D, E, F, G, 1, j ); \ + SHA2s_8WAY_STEP( G, H, A, B, C, D, E, F, 2, j ); \ + SHA2s_8WAY_STEP( F, G, H, A, B, C, D, E, 3, j ); \ + SHA2s_8WAY_STEP( E, F, G, H, A, B, C, D, 4, j ); \ + SHA2s_8WAY_STEP( D, E, F, G, H, A, B, C, 5, j ); \ + SHA2s_8WAY_STEP( C, D, E, F, G, H, A, B, 6, j ); \ + SHA2s_8WAY_STEP( B, C, D, E, F, G, H, A, 7, j ); \ + SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H, 8, j ); \ + SHA2s_8WAY_STEP( H, A, B, C, D, E, F, G, 9, j ); \ + SHA2s_8WAY_STEP( G, H, A, B, C, D, E, F, 10, j ); \ + SHA2s_8WAY_STEP( F, G, H, A, B, C, D, E, 11, j ); \ + SHA2s_8WAY_STEP( E, F, G, H, A, B, C, D, 12, j ); \ + SHA2s_8WAY_STEP( D, E, F, G, H, A, B, C, 13, j ); \ + SHA2s_8WAY_STEP( C, D, E, F, G, H, A, B, 14, j ); \ + SHA2s_8WAY_STEP( B, C, D, E, F, G, H, A, 15, j ); + #else // AVX2 #define CHx(X, Y, Z) \ @@ -352,6 +589,7 @@ do { \ _mm256_xor_si256( Y, _mm256_and_si256( X_xor_Y = _mm256_xor_si256( X, Y ), \ Y_xor_Z ) ) + #define SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H, i, j ) \ do { \ __m256i T0 = _mm256_add_epi32( _mm256_set1_epi32( K256[(j)+(i)] ), W[i] ); \ @@ -366,10 +604,7 @@ do { \ H = _mm256_add_epi32( T1, T2 ); \ } while (0) -// the X_xor_y technique can be extended to eliminate the mov instruction. -// Perform double rounds and alternate each round. Doesn't apply to AVX512 -// and isn't suitable for running 3 round prehash. 
-// + // read Y_xor_Z, update X_xor_Y #define MAJ_2step(X, Y, Z, X_xor_Y, Y_xor_Z ) \ _mm256_xor_si256( Y, _mm256_and_si256( X_xor_Y = _mm256_xor_si256( X, Y ), \ @@ -404,6 +639,19 @@ do { \ G = _mm256_add_epi32( T1, T2 ); \ } while (0) +#define SHA256x8_16ROUNDS( A, B, C, D, E, F, G, H, j ) \ +{ \ + __m256i tic, toc = _mm256_xor_si256( B, C ); \ + SHA256_8WAY_2STEP( A, B, C, D, E, F, G, H, 0, 1, j ); \ + SHA256_8WAY_2STEP( G, H, A, B, C, D, E, F, 2, 3, j ); \ + SHA256_8WAY_2STEP( E, F, G, H, A, B, C, D, 4, 5, j ); \ + SHA256_8WAY_2STEP( C, D, E, F, G, H, A, B, 6, 7, j ); \ + SHA256_8WAY_2STEP( A, B, C, D, E, F, G, H, 8, 9, j ); \ + SHA256_8WAY_2STEP( G, H, A, B, C, D, E, F, 10, 11, j ); \ + SHA256_8WAY_2STEP( E, F, G, H, A, B, C, D, 12, 13, j ); \ + SHA256_8WAY_2STEP( C, D, E, F, G, H, A, B, 14, 15, j ); \ +} + #endif // AVX512VL else AVX2 static inline void SHA256_8WAY_TRANSFORM( __m256i *out, __m256i *W, @@ -420,90 +668,12 @@ static inline void SHA256_8WAY_TRANSFORM( __m256i *out, __m256i *W, G = _mm256_load_si256( in+6 ); H = _mm256_load_si256( in+7 ); -#if !defined(__AVX512VL__) - - __m256i tic, toc = _mm256_xor_si256( B, C ); - - SHA256_8WAY_2STEP( A, B, C, D, E, F, G, H, 0, 1, 0 ); - SHA256_8WAY_2STEP( G, H, A, B, C, D, E, F, 2, 3, 0 ); - SHA256_8WAY_2STEP( E, F, G, H, A, B, C, D, 4, 5, 0 ); - SHA256_8WAY_2STEP( C, D, E, F, G, H, A, B, 6, 7, 0 ); - SHA256_8WAY_2STEP( A, B, C, D, E, F, G, H, 8, 9, 0 ); - SHA256_8WAY_2STEP( G, H, A, B, C, D, E, F, 10, 11, 0 ); - SHA256_8WAY_2STEP( E, F, G, H, A, B, C, D, 12, 13, 0 ); - SHA256_8WAY_2STEP( C, D, E, F, G, H, A, B, 14, 15, 0 ); - -#else - - SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H, 0, 0 ); - SHA2s_8WAY_STEP( H, A, B, C, D, E, F, G, 1, 0 ); - SHA2s_8WAY_STEP( G, H, A, B, C, D, E, F, 2, 0 ); - SHA2s_8WAY_STEP( F, G, H, A, B, C, D, E, 3, 0 ); - SHA2s_8WAY_STEP( E, F, G, H, A, B, C, D, 4, 0 ); - SHA2s_8WAY_STEP( D, E, F, G, H, A, B, C, 5, 0 ); - SHA2s_8WAY_STEP( C, D, E, F, G, H, A, B, 6, 0 ); - SHA2s_8WAY_STEP( B, C, D, E, F, G, H, A, 7, 0 ); - SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H, 8, 0 ); - SHA2s_8WAY_STEP( H, A, B, C, D, E, F, G, 9, 0 ); - SHA2s_8WAY_STEP( G, H, A, B, C, D, E, F, 10, 0 ); - SHA2s_8WAY_STEP( F, G, H, A, B, C, D, E, 11, 0 ); - SHA2s_8WAY_STEP( E, F, G, H, A, B, C, D, 12, 0 ); - SHA2s_8WAY_STEP( D, E, F, G, H, A, B, C, 13, 0 ); - SHA2s_8WAY_STEP( C, D, E, F, G, H, A, B, 14, 0 ); - SHA2s_8WAY_STEP( B, C, D, E, F, G, H, A, 15, 0 ); - -#endif + SHA256x8_16ROUNDS( A, B, C, D, E, F, G, H, 0 ); for ( int j = 16; j < 64; j += 16 ) { - W[ 0] = SHA2x_MEXP( 14, 9, 1, 0 ); - W[ 1] = SHA2x_MEXP( 15, 10, 2, 1 ); - W[ 2] = SHA2x_MEXP( 0, 11, 3, 2 ); - W[ 3] = SHA2x_MEXP( 1, 12, 4, 3 ); - W[ 4] = SHA2x_MEXP( 2, 13, 5, 4 ); - W[ 5] = SHA2x_MEXP( 3, 14, 6, 5 ); - W[ 6] = SHA2x_MEXP( 4, 15, 7, 6 ); - W[ 7] = SHA2x_MEXP( 5, 0, 8, 7 ); - W[ 8] = SHA2x_MEXP( 6, 1, 9, 8 ); - W[ 9] = SHA2x_MEXP( 7, 2, 10, 9 ); - W[10] = SHA2x_MEXP( 8, 3, 11, 10 ); - W[11] = SHA2x_MEXP( 9, 4, 12, 11 ); - W[12] = SHA2x_MEXP( 10, 5, 13, 12 ); - W[13] = SHA2x_MEXP( 11, 6, 14, 13 ); - W[14] = SHA2x_MEXP( 12, 7, 15, 14 ); - W[15] = SHA2x_MEXP( 13, 8, 0, 15 ); - -#if !defined(__AVX512VL__) - - SHA256_8WAY_2STEP( A, B, C, D, E, F, G, H, 0, 1, j ); - SHA256_8WAY_2STEP( G, H, A, B, C, D, E, F, 2, 3, j ); - SHA256_8WAY_2STEP( E, F, G, H, A, B, C, D, 4, 5, j ); - SHA256_8WAY_2STEP( C, D, E, F, G, H, A, B, 6, 7, j ); - SHA256_8WAY_2STEP( A, B, C, D, E, F, G, H, 8, 9, j ); - SHA256_8WAY_2STEP( G, H, A, B, C, D, E, F, 10, 11, j ); - SHA256_8WAY_2STEP( E, F, G, H, A, B, C, D, 
12, 13, j ); - SHA256_8WAY_2STEP( C, D, E, F, G, H, A, B, 14, 15, j ); - -#else - - SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H, 0, j ); - SHA2s_8WAY_STEP( H, A, B, C, D, E, F, G, 1, j ); - SHA2s_8WAY_STEP( G, H, A, B, C, D, E, F, 2, j ); - SHA2s_8WAY_STEP( F, G, H, A, B, C, D, E, 3, j ); - SHA2s_8WAY_STEP( E, F, G, H, A, B, C, D, 4, j ); - SHA2s_8WAY_STEP( D, E, F, G, H, A, B, C, 5, j ); - SHA2s_8WAY_STEP( C, D, E, F, G, H, A, B, 6, j ); - SHA2s_8WAY_STEP( B, C, D, E, F, G, H, A, 7, j ); - SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H, 8, j ); - SHA2s_8WAY_STEP( H, A, B, C, D, E, F, G, 9, j ); - SHA2s_8WAY_STEP( G, H, A, B, C, D, E, F, 10, j ); - SHA2s_8WAY_STEP( F, G, H, A, B, C, D, E, 11, j ); - SHA2s_8WAY_STEP( E, F, G, H, A, B, C, D, 12, j ); - SHA2s_8WAY_STEP( D, E, F, G, H, A, B, C, 13, j ); - SHA2s_8WAY_STEP( C, D, E, F, G, H, A, B, 14, j ); - SHA2s_8WAY_STEP( B, C, D, E, F, G, H, A, 15, j ); - -#endif + SHA256x8_MSG_EXPANSION( W ); + SHA256x8_16ROUNDS( A, B, C, D, E, F, G, H, j ); } out[0] = _mm256_add_epi32( in[0], A ); @@ -535,25 +705,36 @@ void sha256_8way_transform_be( __m256i *state_out, const __m256i *data, SHA256_8WAY_TRANSFORM( state_out, W, state_in ); } -void sha256_8way_init( sha256_8way_context *sc ) -{ - sc->count_high = sc->count_low = 0; - sc->val[0] = m256_const1_64( 0x6A09E6676A09E667 ); - sc->val[1] = m256_const1_64( 0xBB67AE85BB67AE85 ); - sc->val[2] = m256_const1_64( 0x3C6EF3723C6EF372 ); - sc->val[3] = m256_const1_64( 0xA54FF53AA54FF53A ); - sc->val[4] = m256_const1_64( 0x510E527F510E527F ); - sc->val[5] = m256_const1_64( 0x9B05688C9B05688C ); - sc->val[6] = m256_const1_64( 0x1F83D9AB1F83D9AB ); - sc->val[7] = m256_const1_64( 0x5BE0CD195BE0CD19 ); -} - -// Aggresive prehashing, LE byte order -void sha256_8way_prehash_3rounds( __m256i *state_mid, const __m256i *W, - const __m256i *state_in ) +// Aggressive prehashing, LE byte order +void sha256_8way_prehash_3rounds( __m256i *state_mid, __m256i *X, + const __m256i *W, const __m256i *state_in ) { __m256i A, B, C, D, E, F, G, H; + X[ 0] = SHA2x_MEXP( W[14], W[ 9], W[ 1], W[ 0] ); + X[ 1] = SHA2x_MEXP( W[15], W[10], W[ 2], W[ 1] ); + X[ 2] = _mm256_add_epi32( _mm256_add_epi32( SSG2_1x( X[ 0] ), W[11] ), + W[ 2] ); + X[ 3] = _mm256_add_epi32( _mm256_add_epi32( SSG2_1x( X[ 1] ), W[12] ), + SSG2_0x( W[ 4] ) ); + X[ 4] = _mm256_add_epi32( _mm256_add_epi32( W[13], SSG2_0x( W[ 5] ) ), + W[ 4] ); + X[ 5] = _mm256_add_epi32( _mm256_add_epi32( W[14], SSG2_0x( W[ 6] ) ), + W[ 5] ); + X [6] = _mm256_add_epi32( _mm256_add_epi32( W[15], SSG2_0x( W[ 7] ) ), + W[ 6] ); + X[ 7] = _mm256_add_epi32( _mm256_add_epi32( X[ 0], SSG2_0x( W[ 8] ) ), + W[ 7] ); + X[ 8] = _mm256_add_epi32( _mm256_add_epi32( X[ 1], SSG2_0x( W[ 9] ) ), + W[ 8] ); + X[ 9] = _mm256_add_epi32( SSG2_0x( W[10] ), W[ 9] ); + X[10] = _mm256_add_epi32( SSG2_0x( W[11] ), W[10] ); + X[11] = _mm256_add_epi32( SSG2_0x( W[12] ), W[11] ); + X[12] = _mm256_add_epi32( SSG2_0x( W[13] ), W[12] ); + X[13] = _mm256_add_epi32( SSG2_0x( W[14] ), W[13] ); + X[14] = _mm256_add_epi32( SSG2_0x( W[15] ), W[14] ); + X[15] = _mm256_add_epi32( SSG2_0x( X[ 0] ), W[15] ); + A = _mm256_load_si256( state_in ); B = _mm256_load_si256( state_in + 1 ); C = _mm256_load_si256( state_in + 2 ); @@ -582,7 +763,7 @@ void sha256_8way_prehash_3rounds( __m256i *state_mid, const __m256i *W, } void sha256_8way_final_rounds( __m256i *state_out, const __m256i *data, - const __m256i *state_in, const __m256i *state_mid ) + const __m256i *state_in, const __m256i *state_mid, const __m256i *X ) { __m256i A, B, C, D, E, F, 
G, H; __m256i W[16]; @@ -620,43 +801,36 @@ void sha256_8way_final_rounds( __m256i *state_out, const __m256i *data, SHA2s_8WAY_STEP( C, D, E, F, G, H, A, B, 14, 0 ); SHA2s_8WAY_STEP( B, C, D, E, F, G, H, A, 15, 0 ); - for ( int j = 16; j < 64; j += 16 ) - { - W[ 0] = SHA2x_MEXP( 14, 9, 1, 0 ); - W[ 1] = SHA2x_MEXP( 15, 10, 2, 1 ); - W[ 2] = SHA2x_MEXP( 0, 11, 3, 2 ); - W[ 3] = SHA2x_MEXP( 1, 12, 4, 3 ); - W[ 4] = SHA2x_MEXP( 2, 13, 5, 4 ); - W[ 5] = SHA2x_MEXP( 3, 14, 6, 5 ); - W[ 6] = SHA2x_MEXP( 4, 15, 7, 6 ); - W[ 7] = SHA2x_MEXP( 5, 0, 8, 7 ); - W[ 8] = SHA2x_MEXP( 6, 1, 9, 8 ); - W[ 9] = SHA2x_MEXP( 7, 2, 10, 9 ); - W[10] = SHA2x_MEXP( 8, 3, 11, 10 ); - W[11] = SHA2x_MEXP( 9, 4, 12, 11 ); - W[12] = SHA2x_MEXP( 10, 5, 13, 12 ); - W[13] = SHA2x_MEXP( 11, 6, 14, 13 ); - W[14] = SHA2x_MEXP( 12, 7, 15, 14 ); - W[15] = SHA2x_MEXP( 13, 8, 0, 15 ); - - SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H, 0, j ); - SHA2s_8WAY_STEP( H, A, B, C, D, E, F, G, 1, j ); - SHA2s_8WAY_STEP( G, H, A, B, C, D, E, F, 2, j ); - SHA2s_8WAY_STEP( F, G, H, A, B, C, D, E, 3, j ); - SHA2s_8WAY_STEP( E, F, G, H, A, B, C, D, 4, j ); - SHA2s_8WAY_STEP( D, E, F, G, H, A, B, C, 5, j ); - SHA2s_8WAY_STEP( C, D, E, F, G, H, A, B, 6, j ); - SHA2s_8WAY_STEP( B, C, D, E, F, G, H, A, 7, j ); - SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H, 8, j ); - SHA2s_8WAY_STEP( H, A, B, C, D, E, F, G, 9, j ); - SHA2s_8WAY_STEP( G, H, A, B, C, D, E, F, 10, j ); - SHA2s_8WAY_STEP( F, G, H, A, B, C, D, E, 11, j ); - SHA2s_8WAY_STEP( E, F, G, H, A, B, C, D, 12, j ); - SHA2s_8WAY_STEP( D, E, F, G, H, A, B, C, 13, j ); - SHA2s_8WAY_STEP( C, D, E, F, G, H, A, B, 14, j ); - SHA2s_8WAY_STEP( B, C, D, E, F, G, H, A, 15, j ); - } - + W[ 0] = X[ 0]; + W[ 1] = X[ 1]; + W[ 2] = _mm256_add_epi32( X[ 2], SSG2_0x( W[ 3] ) ); + W[ 3] = _mm256_add_epi32( X[ 3], W[ 3] ); + W[ 4] = _mm256_add_epi32( X[ 4], SSG2_1x( W[ 2] ) ); + W[ 5] = _mm256_add_epi32( X[ 5], SSG2_1x( W[ 3] ) ); + W[ 6] = _mm256_add_epi32( X[ 6], SSG2_1x( W[ 4] ) ); + W[ 7] = _mm256_add_epi32( X[ 7], SSG2_1x( W[ 5] ) ); + W[ 8] = _mm256_add_epi32( X[ 8], SSG2_1x( W[ 6] ) ); + W[ 9] = _mm256_add_epi32( X[ 9], _mm256_add_epi32( SSG2_1x( W[ 7] ), + W[ 2] ) ); + W[10] = _mm256_add_epi32( X[10], _mm256_add_epi32( SSG2_1x( W[ 8] ), + W[ 3] ) ); + W[11] = _mm256_add_epi32( X[11], _mm256_add_epi32( SSG2_1x( W[ 9] ), + W[ 4] ) ); + W[12] = _mm256_add_epi32( X[12], _mm256_add_epi32( SSG2_1x( W[10] ), + W[ 5] ) ); + W[13] = _mm256_add_epi32( X[13], _mm256_add_epi32( SSG2_1x( W[11] ), + W[ 6] ) ); + W[14] = _mm256_add_epi32( X[14], _mm256_add_epi32( SSG2_1x( W[12] ), + W[ 7] ) ); + W[15] = _mm256_add_epi32( X[15], _mm256_add_epi32( SSG2_1x( W[13] ), + W[ 8] ) ); + + SHA256x8_16ROUNDS( A, B, C, D, E, F, G, H, 16 ); + SHA256x8_MSG_EXPANSION( W ); + SHA256x8_16ROUNDS( A, B, C, D, E, F, G, H, 32 ); + SHA256x8_MSG_EXPANSION( W ); + SHA256x8_16ROUNDS( A, B, C, D, E, F, G, H, 48 ); + A = _mm256_add_epi32( A, _mm256_load_si256( state_in ) ); B = _mm256_add_epi32( B, _mm256_load_si256( state_in + 1 ) ); C = _mm256_add_epi32( C, _mm256_load_si256( state_in + 2 ) ); @@ -676,7 +850,136 @@ void sha256_8way_final_rounds( __m256i *state_out, const __m256i *data, _mm256_store_si256( state_out + 7, H ); } +int sha256_8way_transform_le_short( __m256i *state_out, const __m256i *data, + const __m256i *state_in ) +{ + __m256i A, B, C, D, E, F, G, H; + __m256i W[16]; memcpy_256( W, data, 16 ); + const __m256i H_ = m256_const1_32( 0x136032ED ); + + A = _mm256_load_si256( state_in ); + B = _mm256_load_si256( state_in+1 ); + C = 
_mm256_load_si256( state_in+2 ); + D = _mm256_load_si256( state_in+3 ); + E = _mm256_load_si256( state_in+4 ); + F = _mm256_load_si256( state_in+5 ); + G = _mm256_load_si256( state_in+6 ); + H = _mm256_load_si256( state_in+7 ); + SHA256x8_16ROUNDS( A, B, C, D, E, F, G, H, 0 ); + + for ( int j = 16; j < 48; j += 16 ) + { + SHA256x8_MSG_EXPANSION( W ); + SHA256x8_16ROUNDS( A, B, C, D, E, F, G, H, j ); + } + + W[ 0] = SHA2x_MEXP( W[14], W[ 9], W[ 1], W[ 0] ); + W[ 1] = SHA2x_MEXP( W[15], W[10], W[ 2], W[ 1] ); + W[ 2] = SHA2x_MEXP( W[ 0], W[11], W[ 3], W[ 2] ); + W[ 3] = SHA2x_MEXP( W[ 1], W[12], W[ 4], W[ 3] ); + W[ 4] = SHA2x_MEXP( W[ 2], W[13], W[ 5], W[ 4] ); + W[ 5] = SHA2x_MEXP( W[ 3], W[14], W[ 6], W[ 5] ); + W[ 6] = SHA2x_MEXP( W[ 4], W[15], W[ 7], W[ 6] ); + W[ 7] = SHA2x_MEXP( W[ 5], W[ 0], W[ 8], W[ 7] ); + W[ 8] = SHA2x_MEXP( W[ 6], W[ 1], W[ 9], W[ 8] ); + W[ 9] = SHA2x_MEXP( W[ 7], W[ 2], W[10], W[ 9] ); + W[10] = SHA2x_MEXP( W[ 8], W[ 3], W[11], W[10] ); + W[11] = SHA2x_MEXP( W[ 9], W[ 4], W[12], W[11] ); + W[12] = SHA2x_MEXP( W[10], W[ 5], W[13], W[12] ); + +#if !defined(__AVX512VL__) + __m256i X_xor_Y, Y_xor_Z = _mm256_xor_si256( B, C ); +#endif + + SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H, 0, 48 ); + SHA2s_8WAY_STEP( H, A, B, C, D, E, F, G, 1, 48 ); + SHA2s_8WAY_STEP( G, H, A, B, C, D, E, F, 2, 48 ); + SHA2s_8WAY_STEP( F, G, H, A, B, C, D, E, 3, 48 ); + SHA2s_8WAY_STEP( E, F, G, H, A, B, C, D, 4, 48 ); + SHA2s_8WAY_STEP( D, E, F, G, H, A, B, C, 5, 48 ); + SHA2s_8WAY_STEP( C, D, E, F, G, H, A, B, 6, 48 ); + SHA2s_8WAY_STEP( B, C, D, E, F, G, H, A, 7, 48 ); + SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H, 8, 48 ); + + __m256i T1_57 = _mm256_add_epi32( G, + mm256_add4_32( BSG2_1x( D ), CHx( D, E, F ), + _mm256_set1_epi32( K256[57] ), W[ 9] ) ); + C = _mm256_add_epi32( C, T1_57 ); + + __m256i T1_58 = _mm256_add_epi32( F, + mm256_add4_32( BSG2_1x( C ), CHx( C, D, E ), + _mm256_set1_epi32( K256[58] ), W[10] ) ); + B = _mm256_add_epi32( B, T1_58 ); + + __m256i T1_59 = _mm256_add_epi32( E, + mm256_add4_32( BSG2_1x( B ), CHx( B, C, D ), + _mm256_set1_epi32( K256[59] ), W[11] ) ); + A = _mm256_add_epi32( A, T1_59 ); + + __m256i T1_60 = mm256_add4_32( D, BSG2_1x( A ), CHx( A, B, C ), W[12] ); + H = _mm256_add_epi32( H, T1_60 ); + + if ( _mm256_movemask_ps( (__m256)_mm256_cmpeq_epi32( H, H_ ) ) == 0 ) + return 0; + + __m256i K60 = _mm256_set1_epi32( K256[60] ); + H = _mm256_add_epi32( H, K60 ); + + G = _mm256_add_epi32( T1_57, _mm256_add_epi32( BSG2_0x( H ), + MAJx( H, A, B ) ) ); +#if !defined(__AVX512VL__) + Y_xor_Z = X_xor_Y; +#endif + + F = _mm256_add_epi32( T1_58, _mm256_add_epi32( BSG2_0x( G ), + MAJx( G, H, A ) ) ); +#if !defined(__AVX512VL__) + Y_xor_Z = X_xor_Y; +#endif + + E = _mm256_add_epi32( T1_59, _mm256_add_epi32( BSG2_0x( F ), + MAJx( F, G, H ) ) ); +#if !defined(__AVX512VL__) + Y_xor_Z = X_xor_Y; +#endif + + D = mm256_add4_32( T1_60, BSG2_0x( E ), MAJx( E, F, G ), K60 ); +#if !defined(__AVX512VL__) + Y_xor_Z = X_xor_Y; +#endif + + W[13] = SHA2x_MEXP( W[11], W[6], W[14], W[13] ); + W[14] = SHA2x_MEXP( W[12], W[7], W[15], W[14] ); + W[15] = SHA2x_MEXP( W[13], W[8], W[ 0], W[15] ); + + SHA2s_8WAY_STEP( D, E, F, G, H, A, B, C, 13, 48 ); + SHA2s_8WAY_STEP( C, D, E, F, G, H, A, B, 14, 48 ); + SHA2s_8WAY_STEP( B, C, D, E, F, G, H, A, 15, 48 ); + + state_out[0] = _mm256_add_epi32( state_in[0], A ); + state_out[1] = _mm256_add_epi32( state_in[1], B ); + state_out[2] = _mm256_add_epi32( state_in[2], C ); + state_out[3] = _mm256_add_epi32( state_in[3], D ); + state_out[4] = 
_mm256_add_epi32( state_in[4], E ); + state_out[5] = _mm256_add_epi32( state_in[5], F ); + state_out[6] = _mm256_add_epi32( state_in[6], G ); + state_out[7] = _mm256_add_epi32( state_in[7], H ); + return 1; +} + +void sha256_8way_init( sha256_8way_context *sc ) +{ + sc->count_high = sc->count_low = 0; + sc->val[0] = m256_const1_64( 0x6A09E6676A09E667 ); + sc->val[1] = m256_const1_64( 0xBB67AE85BB67AE85 ); + sc->val[2] = m256_const1_64( 0x3C6EF3723C6EF372 ); + sc->val[3] = m256_const1_64( 0xA54FF53AA54FF53A ); + sc->val[4] = m256_const1_64( 0x510E527F510E527F ); + sc->val[5] = m256_const1_64( 0x9B05688C9B05688C ); + sc->val[6] = m256_const1_64( 0x1F83D9AB1F83D9AB ); + sc->val[7] = m256_const1_64( 0x5BE0CD195BE0CD19 ); +} // need to handle odd byte length for yespower. // Assume only last update is odd. @@ -778,7 +1081,25 @@ void sha256_8way_full( void *dst, const void *data, size_t len ) _mm512_srli_epi32( x, 10 ) ) #define SHA2x16_MEXP( a, b, c, d ) \ - mm512_add4_32( SSG2_1x16( W[a] ), W[b], SSG2_0x16( W[c] ), W[d] ); + mm512_add4_32( SSG2_1x16( a ), b, SSG2_0x16( c ), d ); + +#define SHA256x16_MSG_EXPANSION( W ) \ + W[ 0] = SHA2x16_MEXP( W[14], W[ 9], W[ 1], W[ 0] ); \ + W[ 1] = SHA2x16_MEXP( W[15], W[10], W[ 2], W[ 1] ); \ + W[ 2] = SHA2x16_MEXP( W[ 0], W[11], W[ 3], W[ 2] ); \ + W[ 3] = SHA2x16_MEXP( W[ 1], W[12], W[ 4], W[ 3] ); \ + W[ 4] = SHA2x16_MEXP( W[ 2], W[13], W[ 5], W[ 4] ); \ + W[ 5] = SHA2x16_MEXP( W[ 3], W[14], W[ 6], W[ 5] ); \ + W[ 6] = SHA2x16_MEXP( W[ 4], W[15], W[ 7], W[ 6] ); \ + W[ 7] = SHA2x16_MEXP( W[ 5], W[ 0], W[ 8], W[ 7] ); \ + W[ 8] = SHA2x16_MEXP( W[ 6], W[ 1], W[ 9], W[ 8] ); \ + W[ 9] = SHA2x16_MEXP( W[ 7], W[ 2], W[10], W[ 9] ); \ + W[10] = SHA2x16_MEXP( W[ 8], W[ 3], W[11], W[10] ); \ + W[11] = SHA2x16_MEXP( W[ 9], W[ 4], W[12], W[11] ); \ + W[12] = SHA2x16_MEXP( W[10], W[ 5], W[13], W[12] ); \ + W[13] = SHA2x16_MEXP( W[11], W[ 6], W[14], W[13] ); \ + W[14] = SHA2x16_MEXP( W[12], W[ 7], W[15], W[14] ); \ + W[15] = SHA2x16_MEXP( W[13], W[ 8], W[ 0], W[15] ); #define SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H, i, j ) \ do { \ @@ -806,6 +1127,23 @@ do { \ } while (0) */ +#define SHA256x16_16ROUNDS( A, B, C, D, E, F, G, H, j ) \ + SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H, 0, j ); \ + SHA2s_16WAY_STEP( H, A, B, C, D, E, F, G, 1, j ); \ + SHA2s_16WAY_STEP( G, H, A, B, C, D, E, F, 2, j ); \ + SHA2s_16WAY_STEP( F, G, H, A, B, C, D, E, 3, j ); \ + SHA2s_16WAY_STEP( E, F, G, H, A, B, C, D, 4, j ); \ + SHA2s_16WAY_STEP( D, E, F, G, H, A, B, C, 5, j ); \ + SHA2s_16WAY_STEP( C, D, E, F, G, H, A, B, 6, j ); \ + SHA2s_16WAY_STEP( B, C, D, E, F, G, H, A, 7, j ); \ + SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H, 8, j ); \ + SHA2s_16WAY_STEP( H, A, B, C, D, E, F, G, 9, j ); \ + SHA2s_16WAY_STEP( G, H, A, B, C, D, E, F, 10, j ); \ + SHA2s_16WAY_STEP( F, G, H, A, B, C, D, E, 11, j ); \ + SHA2s_16WAY_STEP( E, F, G, H, A, B, C, D, 12, j ); \ + SHA2s_16WAY_STEP( D, E, F, G, H, A, B, C, 13, j ); \ + SHA2s_16WAY_STEP( C, D, E, F, G, H, A, B, 14, j ); \ + SHA2s_16WAY_STEP( B, C, D, E, F, G, H, A, 15, j ); static inline void SHA256_16WAY_TRANSFORM( __m512i *out, __m512i *W, const __m512i *in ) \ @@ -820,59 +1158,13 @@ static inline void SHA256_16WAY_TRANSFORM( __m512i *out, __m512i *W, G = _mm512_load_si512( in+6 ); H = _mm512_load_si512( in+7 ); - SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H, 0, 0 ); - SHA2s_16WAY_STEP( H, A, B, C, D, E, F, G, 1, 0 ); - SHA2s_16WAY_STEP( G, H, A, B, C, D, E, F, 2, 0 ); - SHA2s_16WAY_STEP( F, G, H, A, B, C, D, E, 3, 0 ); - SHA2s_16WAY_STEP( E, F, G, H, A, 
B, C, D, 4, 0 ); - SHA2s_16WAY_STEP( D, E, F, G, H, A, B, C, 5, 0 ); - SHA2s_16WAY_STEP( C, D, E, F, G, H, A, B, 6, 0 ); - SHA2s_16WAY_STEP( B, C, D, E, F, G, H, A, 7, 0 ); - SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H, 8, 0 ); - SHA2s_16WAY_STEP( H, A, B, C, D, E, F, G, 9, 0 ); - SHA2s_16WAY_STEP( G, H, A, B, C, D, E, F, 10, 0 ); - SHA2s_16WAY_STEP( F, G, H, A, B, C, D, E, 11, 0 ); - SHA2s_16WAY_STEP( E, F, G, H, A, B, C, D, 12, 0 ); - SHA2s_16WAY_STEP( D, E, F, G, H, A, B, C, 13, 0 ); - SHA2s_16WAY_STEP( C, D, E, F, G, H, A, B, 14, 0 ); - SHA2s_16WAY_STEP( B, C, D, E, F, G, H, A, 15, 0 ); - - for ( int j = 16; j < 64; j += 16 ) - { - W[ 0] = SHA2x16_MEXP( 14, 9, 1, 0 ); - W[ 1] = SHA2x16_MEXP( 15, 10, 2, 1 ); - W[ 2] = SHA2x16_MEXP( 0, 11, 3, 2 ); - W[ 3] = SHA2x16_MEXP( 1, 12, 4, 3 ); - W[ 4] = SHA2x16_MEXP( 2, 13, 5, 4 ); - W[ 5] = SHA2x16_MEXP( 3, 14, 6, 5 ); - W[ 6] = SHA2x16_MEXP( 4, 15, 7, 6 ); - W[ 7] = SHA2x16_MEXP( 5, 0, 8, 7 ); - W[ 8] = SHA2x16_MEXP( 6, 1, 9, 8 ); - W[ 9] = SHA2x16_MEXP( 7, 2, 10, 9 ); - W[10] = SHA2x16_MEXP( 8, 3, 11, 10 ); - W[11] = SHA2x16_MEXP( 9, 4, 12, 11 ); - W[12] = SHA2x16_MEXP( 10, 5, 13, 12 ); - W[13] = SHA2x16_MEXP( 11, 6, 14, 13 ); - W[14] = SHA2x16_MEXP( 12, 7, 15, 14 ); - W[15] = SHA2x16_MEXP( 13, 8, 0, 15 ); - - SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H, 0, j ); - SHA2s_16WAY_STEP( H, A, B, C, D, E, F, G, 1, j ); - SHA2s_16WAY_STEP( G, H, A, B, C, D, E, F, 2, j ); - SHA2s_16WAY_STEP( F, G, H, A, B, C, D, E, 3, j ); - SHA2s_16WAY_STEP( E, F, G, H, A, B, C, D, 4, j ); - SHA2s_16WAY_STEP( D, E, F, G, H, A, B, C, 5, j ); - SHA2s_16WAY_STEP( C, D, E, F, G, H, A, B, 6, j ); - SHA2s_16WAY_STEP( B, C, D, E, F, G, H, A, 7, j ); - SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H, 8, j ); - SHA2s_16WAY_STEP( H, A, B, C, D, E, F, G, 9, j ); - SHA2s_16WAY_STEP( G, H, A, B, C, D, E, F, 10, j ); - SHA2s_16WAY_STEP( F, G, H, A, B, C, D, E, 11, j ); - SHA2s_16WAY_STEP( E, F, G, H, A, B, C, D, 12, j ); - SHA2s_16WAY_STEP( D, E, F, G, H, A, B, C, 13, j ); - SHA2s_16WAY_STEP( C, D, E, F, G, H, A, B, 14, j ); - SHA2s_16WAY_STEP( B, C, D, E, F, G, H, A, 15, j ); - } + SHA256x16_16ROUNDS( A, B, C, D, E, F, G, H, 0 ); + SHA256x16_MSG_EXPANSION( W ); + SHA256x16_16ROUNDS( A, B, C, D, E, F, G, H, 16 ); + SHA256x16_MSG_EXPANSION( W ); + SHA256x16_16ROUNDS( A, B, C, D, E, F, G, H, 32 ); + SHA256x16_MSG_EXPANSION( W ); + SHA256x16_16ROUNDS( A, B, C, D, E, F, G, H, 48 ); out[0] = _mm512_add_epi32( in[0], A ); out[1] = _mm512_add_epi32( in[1], B ); @@ -903,11 +1195,36 @@ void sha256_16way_transform_be( __m512i *state_out, const __m512i *data, SHA256_16WAY_TRANSFORM( state_out, W, state_in ); } -// Aggresive prehashing, LE byte order -void sha256_16way_prehash_3rounds( __m512i *state_mid, const __m512i *W, - const __m512i *state_in ) +// Aggressive prehashing, LE byte order +void sha256_16way_prehash_3rounds( __m512i *state_mid, __m512i *X, + const __m512i *W, const __m512i *state_in ) { __m512i A, B, C, D, E, F, G, H; + + // precalculate constant part msg expansion for second iteration. 
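+   // Illustrative note (inferred from the code that follows): in the second
+   // block of the 80-byte header only W[3], the nonce, changes per call, so
+   // every expansion term that does not depend on W[3] is computed once per
+   // job here and cached in X[]. sha256_16way_final_rounds() then folds in
+   // only the nonce-dependent terms for each batch of nonces.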
+ X[ 0] = SHA2x16_MEXP( W[14], W[ 9], W[ 1], W[ 0] ); + X[ 1] = SHA2x16_MEXP( W[15], W[10], W[ 2], W[ 1] ); + X[ 2] = _mm512_add_epi32( _mm512_add_epi32( SSG2_1x16( X[ 0] ), W[11] ), + W[ 2] ); + X[ 3] = _mm512_add_epi32( _mm512_add_epi32( SSG2_1x16( X[ 1] ), W[12] ), + SSG2_0x16( W[ 4] ) ); + X[ 4] = _mm512_add_epi32( _mm512_add_epi32( W[13], SSG2_0x16( W[ 5] ) ), + W[ 4] ); + X[ 5] = _mm512_add_epi32( _mm512_add_epi32( W[14], SSG2_0x16( W[ 6] ) ), + W[ 5] ); + X [6] = _mm512_add_epi32( _mm512_add_epi32( W[15], SSG2_0x16( W[ 7] ) ), + W[ 6] ); + X[ 7] = _mm512_add_epi32( _mm512_add_epi32( X[ 0], SSG2_0x16( W[ 8] ) ), + W[ 7] ); + X[ 8] = _mm512_add_epi32( _mm512_add_epi32( X[ 1], SSG2_0x16( W[ 9] ) ), + W[ 8] ); + X[ 9] = _mm512_add_epi32( SSG2_0x16( W[10] ), W[ 9] ); + X[10] = _mm512_add_epi32( SSG2_0x16( W[11] ), W[10] ); + X[11] = _mm512_add_epi32( SSG2_0x16( W[12] ), W[11] ); + X[12] = _mm512_add_epi32( SSG2_0x16( W[13] ), W[12] ); + X[13] = _mm512_add_epi32( SSG2_0x16( W[14] ), W[13] ); + X[14] = _mm512_add_epi32( SSG2_0x16( W[15] ), W[14] ); + X[15] = _mm512_add_epi32( SSG2_0x16( X[ 0] ), W[15] ); A = _mm512_load_si512( state_in ); B = _mm512_load_si512( state_in + 1 ); @@ -933,7 +1250,7 @@ void sha256_16way_prehash_3rounds( __m512i *state_mid, const __m512i *W, } void sha256_16way_final_rounds( __m512i *state_out, const __m512i *data, - const __m512i *state_in, const __m512i *state_mid ) + const __m512i *state_in, const __m512i *state_mid, const __m512i *X ) { __m512i A, B, C, D, E, F, G, H; __m512i W[16]; @@ -949,9 +1266,6 @@ void sha256_16way_final_rounds( __m512i *state_out, const __m512i *data, G = _mm512_load_si512( state_mid + 6 ); H = _mm512_load_si512( state_mid + 7 ); -// SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H, 0, 0 ); -// SHA2s_16WAY_STEP( H, A, B, C, D, E, F, G, 1, 0 ); -// SHA2s_16WAY_STEP( G, H, A, B, C, D, E, F, 2, 0 ); SHA2s_16WAY_STEP( F, G, H, A, B, C, D, E, 3, 0 ); SHA2s_16WAY_STEP( E, F, G, H, A, B, C, D, 4, 0 ); SHA2s_16WAY_STEP( D, E, F, G, H, A, B, C, 5, 0 ); @@ -966,42 +1280,36 @@ void sha256_16way_final_rounds( __m512i *state_out, const __m512i *data, SHA2s_16WAY_STEP( C, D, E, F, G, H, A, B, 14, 0 ); SHA2s_16WAY_STEP( B, C, D, E, F, G, H, A, 15, 0 ); - for ( int j = 16; j < 64; j += 16 ) - { - W[ 0] = SHA2x16_MEXP( 14, 9, 1, 0 ); - W[ 1] = SHA2x16_MEXP( 15, 10, 2, 1 ); - W[ 2] = SHA2x16_MEXP( 0, 11, 3, 2 ); - W[ 3] = SHA2x16_MEXP( 1, 12, 4, 3 ); - W[ 4] = SHA2x16_MEXP( 2, 13, 5, 4 ); - W[ 5] = SHA2x16_MEXP( 3, 14, 6, 5 ); - W[ 6] = SHA2x16_MEXP( 4, 15, 7, 6 ); - W[ 7] = SHA2x16_MEXP( 5, 0, 8, 7 ); - W[ 8] = SHA2x16_MEXP( 6, 1, 9, 8 ); - W[ 9] = SHA2x16_MEXP( 7, 2, 10, 9 ); - W[10] = SHA2x16_MEXP( 8, 3, 11, 10 ); - W[11] = SHA2x16_MEXP( 9, 4, 12, 11 ); - W[12] = SHA2x16_MEXP( 10, 5, 13, 12 ); - W[13] = SHA2x16_MEXP( 11, 6, 14, 13 ); - W[14] = SHA2x16_MEXP( 12, 7, 15, 14 ); - W[15] = SHA2x16_MEXP( 13, 8, 0, 15 ); - - SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H, 0, j ); - SHA2s_16WAY_STEP( H, A, B, C, D, E, F, G, 1, j ); - SHA2s_16WAY_STEP( G, H, A, B, C, D, E, F, 2, j ); - SHA2s_16WAY_STEP( F, G, H, A, B, C, D, E, 3, j ); - SHA2s_16WAY_STEP( E, F, G, H, A, B, C, D, 4, j ); - SHA2s_16WAY_STEP( D, E, F, G, H, A, B, C, 5, j ); - SHA2s_16WAY_STEP( C, D, E, F, G, H, A, B, 6, j ); - SHA2s_16WAY_STEP( B, C, D, E, F, G, H, A, 7, j ); - SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H, 8, j ); - SHA2s_16WAY_STEP( H, A, B, C, D, E, F, G, 9, j ); - SHA2s_16WAY_STEP( G, H, A, B, C, D, E, F, 10, j ); - SHA2s_16WAY_STEP( F, G, H, A, B, C, D, E, 11, j ); - SHA2s_16WAY_STEP( E, F, G, 
H, A, B, C, D, 12, j ); - SHA2s_16WAY_STEP( D, E, F, G, H, A, B, C, 13, j ); - SHA2s_16WAY_STEP( C, D, E, F, G, H, A, B, 14, j ); - SHA2s_16WAY_STEP( B, C, D, E, F, G, H, A, 15, j ); - } + // update precalculated msg expansion with new nonce: W[3]. + W[ 0] = X[ 0]; + W[ 1] = X[ 1]; + W[ 2] = _mm512_add_epi32( X[ 2], SSG2_0x16( W[ 3] ) ); + W[ 3] = _mm512_add_epi32( X[ 3], W[ 3] ); + W[ 4] = _mm512_add_epi32( X[ 4], SSG2_1x16( W[ 2] ) ); + W[ 5] = _mm512_add_epi32( X[ 5], SSG2_1x16( W[ 3] ) ); + W[ 6] = _mm512_add_epi32( X[ 6], SSG2_1x16( W[ 4] ) ); + W[ 7] = _mm512_add_epi32( X[ 7], SSG2_1x16( W[ 5] ) ); + W[ 8] = _mm512_add_epi32( X[ 8], SSG2_1x16( W[ 6] ) ); + W[ 9] = _mm512_add_epi32( X[ 9], _mm512_add_epi32( SSG2_1x16( W[ 7] ), + W[ 2] ) ); + W[10] = _mm512_add_epi32( X[10], _mm512_add_epi32( SSG2_1x16( W[ 8] ), + W[ 3] ) ); + W[11] = _mm512_add_epi32( X[11], _mm512_add_epi32( SSG2_1x16( W[ 9] ), + W[ 4] ) ); + W[12] = _mm512_add_epi32( X[12], _mm512_add_epi32( SSG2_1x16( W[10] ), + W[ 5] ) ); + W[13] = _mm512_add_epi32( X[13], _mm512_add_epi32( SSG2_1x16( W[11] ), + W[ 6] ) ); + W[14] = _mm512_add_epi32( X[14], _mm512_add_epi32( SSG2_1x16( W[12] ), + W[ 7] ) ); + W[15] = _mm512_add_epi32( X[15], _mm512_add_epi32( SSG2_1x16( W[13] ), + W[ 8] ) ); + + SHA256x16_16ROUNDS( A, B, C, D, E, F, G, H, 16 ); + SHA256x16_MSG_EXPANSION( W ); + SHA256x16_16ROUNDS( A, B, C, D, E, F, G, H, 32 ); + SHA256x16_MSG_EXPANSION( W ); + SHA256x16_16ROUNDS( A, B, C, D, E, F, G, H, 48 ); A = _mm512_add_epi32( A, _mm512_load_si512( state_in ) ); B = _mm512_add_epi32( B, _mm512_load_si512( state_in + 1 ) ); @@ -1022,6 +1330,105 @@ void sha256_16way_final_rounds( __m512i *state_out, const __m512i *data, _mm512_store_si512( state_out + 7, H ); } +// returns 0 if hash aborted early and invalid. +int sha256_16way_transform_le_short( __m512i *state_out, const __m512i *data, + const __m512i *state_in ) +{ + __m512i A, B, C, D, E, F, G, H; + __m512i W[16]; memcpy_512( W, data, 16 ); + // Value for H at round 60, before adding K, to produce valid final hash + //where H == 0. 
+ // H_ = -( H256[7] + K256[60] ); + const __m512i H_ = m512_const1_32( 0x136032ED ); + + A = _mm512_load_si512( state_in ); + B = _mm512_load_si512( state_in+1 ); + C = _mm512_load_si512( state_in+2 ); + D = _mm512_load_si512( state_in+3 ); + E = _mm512_load_si512( state_in+4 ); + F = _mm512_load_si512( state_in+5 ); + G = _mm512_load_si512( state_in+6 ); + H = _mm512_load_si512( state_in+7 ); + + SHA256x16_16ROUNDS( A, B, C, D, E, F, G, H, 0 ); + SHA256x16_MSG_EXPANSION( W ); + SHA256x16_16ROUNDS( A, B, C, D, E, F, G, H, 16 ); + SHA256x16_MSG_EXPANSION( W ); + SHA256x16_16ROUNDS( A, B, C, D, E, F, G, H, 32 ); + + W[ 0] = SHA2x16_MEXP( W[14], W[ 9], W[ 1], W[ 0] ); + W[ 1] = SHA2x16_MEXP( W[15], W[10], W[ 2], W[ 1] ); + W[ 2] = SHA2x16_MEXP( W[ 0], W[11], W[ 3], W[ 2] ); + W[ 3] = SHA2x16_MEXP( W[ 1], W[12], W[ 4], W[ 3] ); + W[ 4] = SHA2x16_MEXP( W[ 2], W[13], W[ 5], W[ 4] ); + W[ 5] = SHA2x16_MEXP( W[ 3], W[14], W[ 6], W[ 5] ); + W[ 6] = SHA2x16_MEXP( W[ 4], W[15], W[ 7], W[ 6] ); + W[ 7] = SHA2x16_MEXP( W[ 5], W[ 0], W[ 8], W[ 7] ); + W[ 8] = SHA2x16_MEXP( W[ 6], W[ 1], W[ 9], W[ 8] ); + W[ 9] = SHA2x16_MEXP( W[ 7], W[ 2], W[10], W[ 9] ); + W[10] = SHA2x16_MEXP( W[ 8], W[ 3], W[11], W[10] ); + W[11] = SHA2x16_MEXP( W[ 9], W[ 4], W[12], W[11] ); + W[12] = SHA2x16_MEXP( W[10], W[ 5], W[13], W[12] ); + + // Rounds 48 to 56 + SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H, 0, 48 ); + SHA2s_16WAY_STEP( H, A, B, C, D, E, F, G, 1, 48 ); + SHA2s_16WAY_STEP( G, H, A, B, C, D, E, F, 2, 48 ); + SHA2s_16WAY_STEP( F, G, H, A, B, C, D, E, 3, 48 ); + SHA2s_16WAY_STEP( E, F, G, H, A, B, C, D, 4, 48 ); + SHA2s_16WAY_STEP( D, E, F, G, H, A, B, C, 5, 48 ); + SHA2s_16WAY_STEP( C, D, E, F, G, H, A, B, 6, 48 ); + SHA2s_16WAY_STEP( B, C, D, E, F, G, H, A, 7, 48 ); + SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H, 8, 48 ); + + // Rounds 57 to 60 part 1 + __m512i T1_57 = _mm512_add_epi32( _mm512_set1_epi32( K256[57] ), + mm512_add4_32( BSG2_1x16( D ), CHx16( D, E, F ), W[ 9], G ) ); + C = _mm512_add_epi32( C, T1_57 ); + __m512i T1_58 = _mm512_add_epi32( _mm512_set1_epi32( K256[58] ), + mm512_add4_32( BSG2_1x16( C ), CHx16( C, D, E ), W[10], F ) ); + B = _mm512_add_epi32( B, T1_58 ); + __m512i T1_59 = _mm512_add_epi32( _mm512_set1_epi32( K256[59] ), + mm512_add4_32( BSG2_1x16( B ), CHx16( B, C, D ), W[11], E ) ); + A = _mm512_add_epi32( A, T1_59 ); + __m512i T1_60 = mm512_add4_32( BSG2_1x16( A ), CHx16( A, B, C ), W[12], D ); + H = _mm512_add_epi32( H, T1_60 ); + + // give up? 
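+   // Reasoning sketch (consistent with the H_ constant above): after K256[60]
+   // is added back, output word 7 becomes state_in[7] + H + K256[60], with
+   // state_in being the SHA-256 IV here, so a lane can only finish with hash
+   // word 7 == 0 when
+   //    H == -( H256[7] + K256[60] )
+   //      == -( 0x5BE0CD19 + 0x90BEFFFA ) mod 2^32 == 0x136032ED == H_.
+   // If no lane matches, the hash can't meet a target whose top 32 bits are
+   // zero, so the remaining rounds are skipped.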
+ if ( _mm512_cmpeq_epi32_mask( H, H_ ) == 0 ) return 0; + + // Rounds 57 to 60 part 2 + __m512i K60 = _mm512_set1_epi32( K256[60] ); + H = _mm512_add_epi32( H, K60 ); + + G = _mm512_add_epi32( T1_57, _mm512_add_epi32( BSG2_0x16( H ), + MAJx16( H, A, B ) ) ); + F = _mm512_add_epi32( T1_58, _mm512_add_epi32( BSG2_0x16( G ), + MAJx16( G, H, A ) ) ); + E = _mm512_add_epi32( T1_59, _mm512_add_epi32( BSG2_0x16( F ), + MAJx16( F, G, H ) ) ); + D = mm512_add4_32( T1_60, BSG2_0x16( E ), MAJx16( E, F, G ), K60 ); + + // Rounds 61 to 63 + W[13] = SHA2x16_MEXP( W[11], W[ 6], W[14], W[13] ); + W[14] = SHA2x16_MEXP( W[12], W[ 7], W[15], W[14] ); + W[15] = SHA2x16_MEXP( W[13], W[ 8], W[ 0], W[15] ); + + SHA2s_16WAY_STEP( D, E, F, G, H, A, B, C, 13, 48 ); + SHA2s_16WAY_STEP( C, D, E, F, G, H, A, B, 14, 48 ); + SHA2s_16WAY_STEP( B, C, D, E, F, G, H, A, 15, 48 ); + + state_out[0] = _mm512_add_epi32( state_in[0], A ); + state_out[1] = _mm512_add_epi32( state_in[1], B ); + state_out[2] = _mm512_add_epi32( state_in[2], C ); + state_out[3] = _mm512_add_epi32( state_in[3], D ); + state_out[4] = _mm512_add_epi32( state_in[4], E ); + state_out[5] = _mm512_add_epi32( state_in[5], F ); + state_out[6] = _mm512_add_epi32( state_in[6], G ); + state_out[7] = _mm512_add_epi32( state_in[7], H ); + return 1; +} + void sha256_16way_init( sha256_16way_context *sc ) { sc->count_high = sc->count_low = 0; diff --git a/algo/sha/sha256d-4way.c b/algo/sha/sha256d-4way.c index c69ad582..18eceffe 100644 --- a/algo/sha/sha256d-4way.c +++ b/algo/sha/sha256d-4way.c @@ -10,13 +10,14 @@ int scanhash_sha256d_16way( struct work *work, const uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ) { + __m512i vdata[32] __attribute__ ((aligned (128))); __m512i block[16] __attribute__ ((aligned (64))); - __m512i hash32[8] __attribute__ ((aligned (32))); - __m512i initstate[8] __attribute__ ((aligned (32))); - __m512i midstate1[8] __attribute__ ((aligned (32))); - __m512i midstate2[8] __attribute__ ((aligned (32))); - uint32_t lane_hash[8] __attribute__ ((aligned (32))); - __m512i vdata[20] __attribute__ ((aligned (32))); + __m512i hash32[8] __attribute__ ((aligned (64))); + __m512i initstate[8] __attribute__ ((aligned (64))); + __m512i midstate1[8] __attribute__ ((aligned (64))); + __m512i midstate2[8] __attribute__ ((aligned (64))); + __m512i mexp_pre[16] __attribute__ ((aligned (64))); + uint32_t lane_hash[8] __attribute__ ((aligned (64))); uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] ); uint32_t *pdata = work->data; const uint32_t *ptarget = work->target; @@ -36,6 +37,14 @@ int scanhash_sha256d_16way( struct work *work, const uint32_t max_nonce, *noncev = _mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+9, n+8, n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+1, n ); + vdata[16+4] = last_byte; + memset_zero_512( vdata+16 + 5, 10 ); + vdata[16+15] = m512_const1_32( 80*8 ); // bit count + + block[ 8] = last_byte; + memset_zero_512( block + 9, 6 ); + block[15] = m512_const1_32( 32*8 ); // bit count + // initialize state initstate[0] = m512_const1_64( 0x6A09E6676A09E667 ); initstate[1] = m512_const1_64( 0xBB67AE85BB67AE85 ); @@ -49,39 +58,33 @@ int scanhash_sha256d_16way( struct work *work, const uint32_t max_nonce, sha256_16way_transform_le( midstate1, vdata, initstate ); // Do 3 rounds on the first 12 bytes of the next block - sha256_16way_prehash_3rounds( midstate2, vdata + 16, midstate1 ); + sha256_16way_prehash_3rounds( midstate2, mexp_pre, vdata+16, midstate1 ); do { // 1. 
final 16 bytes of data, with padding - memcpy_512( block, vdata + 16, 4 ); - block[ 4] = last_byte; - memset_zero_512( block + 5, 10 ); - block[15] = m512_const1_32( 80*8 ); // bit count - sha256_16way_final_rounds( hash32, block, midstate1, midstate2 ); + sha256_16way_final_rounds( block, vdata+16, midstate1, midstate2, + mexp_pre ); // 2. 32 byte hash from 1. - memcpy_512( block, hash32, 8 ); - block[ 8] = last_byte; - memset_zero_512( block + 9, 6 ); - block[15] = m512_const1_32( 32*8 ); // bit count - sha256_16way_transform_le( hash32, block, initstate ); - - // byte swap final hash for testing - mm512_block_bswap_32( hash32, hash32 ); - - for ( int lane = 0; lane < 16; lane++ ) - if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) ) + if ( sha256_16way_transform_le_short( hash32, block, initstate ) ) { - extr_lane_16x32( lane_hash, hash32, lane, 256 ); - if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) ) + // byte swap final hash for testing + mm512_block_bswap_32( hash32, hash32 ); + + for ( int lane = 0; lane < 16; lane++ ) + if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) ) { - pdata[19] = n + lane; - submit_solution( work, lane_hash, mythr ); + extr_lane_16x32( lane_hash, hash32, lane, 256 ); + if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) ) + { + pdata[19] = n + lane; + submit_solution( work, lane_hash, mythr ); + } } - } - *noncev = _mm512_add_epi32( *noncev, sixteen ); - n += 16; + } + *noncev = _mm512_add_epi32( *noncev, sixteen ); + n += 16; } while ( (n < last_nonce) && !work_restart[thr_id].restart ); pdata[19] = n; *hashes_done = n - first_nonce; @@ -95,13 +98,14 @@ int scanhash_sha256d_16way( struct work *work, const uint32_t max_nonce, int scanhash_sha256d_8way( struct work *work, const uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ) { - __m256i block[16] __attribute__ ((aligned (64))); + __m256i vdata[32] __attribute__ ((aligned (64))); + __m256i block[16] __attribute__ ((aligned (32))); __m256i hash32[8] __attribute__ ((aligned (32))); __m256i initstate[8] __attribute__ ((aligned (32))); __m256i midstate1[8] __attribute__ ((aligned (32))); __m256i midstate2[8] __attribute__ ((aligned (32))); + __m256i mexp_pre[16] __attribute__ ((aligned (32))); uint32_t lane_hash[8] __attribute__ ((aligned (32))); - __m256i vdata[20] __attribute__ ((aligned (32))); uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] ); uint32_t *pdata = work->data; const uint32_t *ptarget = work->target; @@ -120,6 +124,14 @@ int scanhash_sha256d_8way( struct work *work, const uint32_t max_nonce, *noncev = _mm256_set_epi32( n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+1, n ); + vdata[16+4] = last_byte; + memset_zero_256( vdata+16 + 5, 10 ); + vdata[16+15] = m256_const1_32( 80*8 ); // bit count + + block[ 8] = last_byte; + memset_zero_256( block + 9, 6 ); + block[15] = m256_const1_32( 32*8 ); // bit count + // initialize state initstate[0] = m256_const1_64( 0x6A09E6676A09E667 ); initstate[1] = m256_const1_64( 0xBB67AE85BB67AE85 ); @@ -133,35 +145,30 @@ int scanhash_sha256d_8way( struct work *work, const uint32_t max_nonce, sha256_8way_transform_le( midstate1, vdata, initstate ); // Do 3 rounds on the first 12 bytes of the next block - sha256_8way_prehash_3rounds( midstate2, vdata + 16, midstate1 ); + sha256_8way_prehash_3rounds( midstate2, mexp_pre, vdata + 16, midstate1 ); do { // 1. 
final 16 bytes of data, with padding - memcpy_256( block, vdata + 16, 4 ); - block[ 4] = last_byte; - memset_zero_256( block + 5, 10 ); - block[15] = m256_const1_32( 80*8 ); // bit count - sha256_8way_final_rounds( hash32, block, midstate1, midstate2 ); + sha256_8way_final_rounds( block, vdata+16, midstate1, midstate2, + mexp_pre ); // 2. 32 byte hash from 1. - memcpy_256( block, hash32, 8 ); - block[ 8] = last_byte; - memset_zero_256( block + 9, 6 ); - block[15] = m256_const1_32( 32*8 ); // bit count - sha256_8way_transform_le( hash32, block, initstate ); - - // byte swap final hash for testing - mm256_block_bswap_32( hash32, hash32 ); - - for ( int lane = 0; lane < 8; lane++ ) - if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) ) + if ( unlikely( + sha256_8way_transform_le_short( hash32, block, initstate ) ) ) { - extr_lane_8x32( lane_hash, hash32, lane, 256 ); - if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) ) + // byte swap final hash for testing + mm256_block_bswap_32( hash32, hash32 ); + + for ( int lane = 0; lane < 8; lane++ ) + if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) ) { - pdata[19] = n + lane; - submit_solution( work, lane_hash, mythr ); + extr_lane_8x32( lane_hash, hash32, lane, 256 ); + if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) ) + { + pdata[19] = n + lane; + submit_solution( work, lane_hash, mythr ); + } } } *noncev = _mm256_add_epi32( *noncev, eight ); @@ -179,12 +186,14 @@ int scanhash_sha256d_8way( struct work *work, const uint32_t max_nonce, int scanhash_sha256d_4way( struct work *work, const uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ) { - __m128i block[16] __attribute__ ((aligned (64))); - __m128i hash32[8] __attribute__ ((aligned (32))); - __m128i initstate[8] __attribute__ ((aligned (32))); - __m128i midstate[8] __attribute__ ((aligned (32))); - uint32_t lane_hash[8] __attribute__ ((aligned (32))); - __m128i vdata[20] __attribute__ ((aligned (32))); + __m128i vdata[32] __attribute__ ((aligned (64))); + __m128i block[16] __attribute__ ((aligned (32))); + __m128i hash32[8] __attribute__ ((aligned (32))); + __m128i initstate[8] __attribute__ ((aligned (32))); + __m128i midstate1[8] __attribute__ ((aligned (32))); + __m128i midstate2[8] __attribute__ ((aligned (32))); + __m128i mexp_pre[16] __attribute__ ((aligned (32))); + uint32_t lane_hash[8] __attribute__ ((aligned (32))); uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] ); uint32_t *pdata = work->data; const uint32_t *ptarget = work->target; @@ -203,6 +212,14 @@ int scanhash_sha256d_4way( struct work *work, const uint32_t max_nonce, *noncev = _mm_set_epi32( n+ 3, n+ 2, n+1, n ); + vdata[16+4] = last_byte; + memset_zero_128( vdata+16 + 5, 10 ); + vdata[16+15] = m128_const1_32( 80*8 ); // bit count + + block[ 8] = last_byte; + memset_zero_128( block + 9, 6 ); + block[15] = m128_const1_32( 32*8 ); // bit count + // initialize state initstate[0] = m128_const1_64( 0x6A09E6676A09E667 ); initstate[1] = m128_const1_64( 0xBB67AE85BB67AE85 ); @@ -214,39 +231,36 @@ int scanhash_sha256d_4way( struct work *work, const uint32_t max_nonce, initstate[7] = m128_const1_64( 0x5BE0CD195BE0CD19 ); // hash first 64 bytes of data - sha256_4way_transform_le( midstate, vdata, initstate ); + sha256_4way_transform_le( midstate1, vdata, initstate ); + // Do 3 rounds on the first 12 bytes of the next block + sha256_4way_prehash_3rounds( midstate2, mexp_pre, vdata + 16, midstate1 ); do { // 1. 
final 16 bytes of data, with padding - memcpy_128( block, vdata + 16, 4 ); - block[ 4] = last_byte; - memset_zero_128( block + 5, 10 ); - block[15] = m128_const1_32( 80*8 ); // bit count - sha256_4way_transform_le( hash32, block, midstate ); + sha256_4way_final_rounds( block, vdata+16, midstate1, midstate2, + mexp_pre ); // 2. 32 byte hash from 1. - memcpy_128( block, hash32, 8 ); - block[ 8] = last_byte; - memset_zero_128( block + 9, 6 ); - block[15] = m128_const1_32( 32*8 ); // bit count - sha256_4way_transform_le( hash32, block, initstate ); - - // byte swap final hash for testing - mm128_block_bswap_32( hash32, hash32 ); - - for ( int lane = 0; lane < 4; lane++ ) - if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) ) + if ( unlikely( + sha256_4way_transform_le_short( hash32, block, initstate ) ) ) { - extr_lane_4x32( lane_hash, hash32, lane, 256 ); - if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) ) + // byte swap final hash for testing + mm128_block_bswap_32( hash32, hash32 ); + + for ( int lane = 0; lane < 4; lane++ ) + if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) ) { - pdata[19] = n + lane; - submit_solution( work, lane_hash, mythr ); + extr_lane_4x32( lane_hash, hash32, lane, 256 ); + if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) ) + { + pdata[19] = n + lane; + submit_solution( work, lane_hash, mythr ); + } } - } - *noncev = _mm_add_epi32( *noncev, four ); - n += 4; + } + *noncev = _mm_add_epi32( *noncev, four ); + n += 4; } while ( (n < last_nonce) && !work_restart[thr_id].restart ); pdata[19] = n; *hashes_done = n - first_nonce; diff --git a/algo/sha/sha256d-4way.h b/algo/sha/sha256d-4way.h index 9051ec4b..bae02148 100644 --- a/algo/sha/sha256d-4way.h +++ b/algo/sha/sha256d-4way.h @@ -6,12 +6,10 @@ #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) #define SHA256D_16WAY 1 -/* #elif defined(__AVX2__) #define SHA256D_8WAY 1 #else #define SHA256D_4WAY 1 -*/ #endif bool register_sha256d_algo( algo_gate_t* gate ); @@ -21,7 +19,7 @@ bool register_sha256d_algo( algo_gate_t* gate ); int scanhash_sha256d_16way( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ); #endif -/* + #if defined(SHA256D_8WAY) int scanhash_sha256d_8way( struct work *work, uint32_t max_nonce, @@ -33,7 +31,7 @@ int scanhash_sha256d_8way( struct work *work, uint32_t max_nonce, int scanhash_sha256d_4way( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ); #endif -*/ + /* #if defined(__SHA__) diff --git a/algo/sha/sha256t-4way.c b/algo/sha/sha256t-4way.c index 9cd3a227..9c1677b1 100644 --- a/algo/sha/sha256t-4way.c +++ b/algo/sha/sha256t-4way.c @@ -10,13 +10,14 @@ int scanhash_sha256t_16way( struct work *work, const uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ) { + __m512i vdata[32] __attribute__ ((aligned (128))); __m512i block[16] __attribute__ ((aligned (64))); - __m512i hash32[8] __attribute__ ((aligned (32))); - __m512i initstate[8] __attribute__ ((aligned (32))); - __m512i midstate1[8] __attribute__ ((aligned (32))); - __m512i midstate2[8] __attribute__ ((aligned (32))); - uint32_t lane_hash[8] __attribute__ ((aligned (32))); - __m512i vdata[20] __attribute__ ((aligned (32))); + __m512i hash32[8] __attribute__ ((aligned (64))); + __m512i initstate[8] __attribute__ ((aligned (64))); + __m512i midstate1[8] __attribute__ ((aligned (64))); + __m512i midstate2[8] __attribute__ ((aligned (64))); + __m512i mexp_pre[16] __attribute__ ((aligned (64))); + 
uint32_t lane_hash[8] __attribute__ ((aligned (64))); uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] ); uint32_t *pdata = work->data; const uint32_t *ptarget = work->target; @@ -36,7 +37,14 @@ int scanhash_sha256t_16way( struct work *work, const uint32_t max_nonce, *noncev = _mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+9, n+8, n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+1, n ); - // initialize state + vdata[16+4] = last_byte; + memset_zero_512( vdata+16 + 5, 10 ); + vdata[16+15] = m512_const1_32( 80*8 ); // bit count + + block[ 8] = last_byte; + memset_zero_512( block + 9, 6 ); + block[15] = m512_const1_32( 32*8 ); // bit count + initstate[0] = m512_const1_64( 0x6A09E6676A09E667 ); initstate[1] = m512_const1_64( 0xBB67AE85BB67AE85 ); initstate[2] = m512_const1_64( 0x3C6EF3723C6EF372 ); @@ -49,43 +57,37 @@ int scanhash_sha256t_16way( struct work *work, const uint32_t max_nonce, sha256_16way_transform_le( midstate1, vdata, initstate ); // Do 3 rounds on the first 12 bytes of the next block - sha256_16way_prehash_3rounds( midstate2, vdata + 16, midstate1 ); + sha256_16way_prehash_3rounds( midstate2, mexp_pre, vdata+16, midstate1 ); do { - // 1. final 16 bytes of data, with padding - memcpy_512( block, vdata + 16, 4 ); - block[ 4] = last_byte; - memset_zero_512( block + 5, 10 ); - block[15] = m512_const1_32( 80*8 ); // bit count - sha256_16way_final_rounds( hash32, block, midstate1, midstate2 ); + // 1. final 16 bytes of data, pre-padded + sha256_16way_final_rounds( block, vdata+16, midstate1, midstate2, + mexp_pre ); // 2. 32 byte hash from 1. - memcpy_512( block, hash32, 8 ); - block[ 8] = last_byte; - memset_zero_512( block + 9, 6 ); - block[15] = m512_const1_32( 32*8 ); // bit count - sha256_16way_transform_le( hash32, block, initstate ); + sha256_16way_transform_le( block, block, initstate ); // 3. 32 byte hash from 2. 
- memcpy_512( block, hash32, 8 ); - sha256_16way_transform_le( hash32, block, initstate ); - - // byte swap final hash for testing - mm512_block_bswap_32( hash32, hash32 ); - - for ( int lane = 0; lane < 16; lane++ ) - if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) ) + if ( unlikely( + sha256_16way_transform_le_short( hash32, block, initstate ) ) ) { - extr_lane_16x32( lane_hash, hash32, lane, 256 ); - if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) ) + // byte swap final hash for testing + mm512_block_bswap_32( hash32, hash32 ); + + for ( int lane = 0; lane < 16; lane++ ) + if ( hash32_d7[ lane ] <= targ32_d7 ) { - pdata[19] = n + lane; - submit_solution( work, lane_hash, mythr ); + extr_lane_16x32( lane_hash, hash32, lane, 256 ); + if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) ) + { + pdata[19] = n + lane; + submit_solution( work, lane_hash, mythr ); + } } - } - *noncev = _mm512_add_epi32( *noncev, sixteen ); - n += 16; + } + *noncev = _mm512_add_epi32( *noncev, sixteen ); + n += 16; } while ( (n < last_nonce) && !work_restart[thr_id].restart ); pdata[19] = n; *hashes_done = n - first_nonce; @@ -100,13 +102,14 @@ int scanhash_sha256t_16way( struct work *work, const uint32_t max_nonce, int scanhash_sha256t_8way( struct work *work, const uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ) { - __m256i block[16] __attribute__ ((aligned (64))); + __m256i vdata[32] __attribute__ ((aligned (64))); + __m256i block[16] __attribute__ ((aligned (32))); __m256i hash32[8] __attribute__ ((aligned (32))); __m256i initstate[8] __attribute__ ((aligned (32))); __m256i midstate1[8] __attribute__ ((aligned (32))); __m256i midstate2[8] __attribute__ ((aligned (32))); + __m256i mexp_pre[16] __attribute__ ((aligned (32))); uint32_t lane_hash[8] __attribute__ ((aligned (32))); - __m256i vdata[20] __attribute__ ((aligned (32))); uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] ); uint32_t *pdata = work->data; const uint32_t *ptarget = work->target; @@ -125,6 +128,14 @@ int scanhash_sha256t_8way( struct work *work, const uint32_t max_nonce, *noncev = _mm256_set_epi32( n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+1, n ); + vdata[16+4] = last_byte; + memset_zero_256( vdata+16 + 5, 10 ); + vdata[16+15] = m256_const1_32( 80*8 ); // bit count + + block[ 8] = last_byte; + memset_zero_256( block + 9, 6 ); + block[15] = m256_const1_32( 32*8 ); // bit count + // initialize state initstate[0] = m256_const1_64( 0x6A09E6676A09E667 ); initstate[1] = m256_const1_64( 0xBB67AE85BB67AE85 ); @@ -138,43 +149,37 @@ int scanhash_sha256t_8way( struct work *work, const uint32_t max_nonce, sha256_8way_transform_le( midstate1, vdata, initstate ); // Do 3 rounds on the first 12 bytes of the next block - sha256_8way_prehash_3rounds( midstate2, vdata + 16, midstate1 ); + sha256_8way_prehash_3rounds( midstate2, mexp_pre, vdata + 16, midstate1 ); do { // 1. final 16 bytes of data, with padding - memcpy_256( block, vdata + 16, 4 ); - block[ 4] = last_byte; - memset_zero_256( block + 5, 10 ); - block[15] = m256_const1_32( 80*8 ); // bit count - sha256_8way_final_rounds( hash32, block, midstate1, midstate2 ); + sha256_8way_final_rounds( block, vdata+16, midstate1, midstate2, + mexp_pre ); // 2. 32 byte hash from 1. - memcpy_256( block, hash32, 8 ); - block[ 8] = last_byte; - memset_zero_256( block + 9, 6 ); - block[15] = m256_const1_32( 32*8 ); // bit count - sha256_8way_transform_le( hash32, block, initstate ); + sha256_8way_transform_le( block, block, initstate ); // 3. 32 byte hash from 2. 
- memcpy_256( block, hash32, 8 ); - sha256_8way_transform_le( hash32, block, initstate ); - - // byte swap final hash for testing - mm256_block_bswap_32( hash32, hash32 ); - - for ( int lane = 0; lane < 8; lane++ ) - if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) ) + if ( unlikely( + sha256_8way_transform_le_short( hash32, block, initstate ) ) ) { - extr_lane_8x32( lane_hash, hash32, lane, 256 ); - if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) ) + // byte swap final hash for testing + mm256_block_bswap_32( hash32, hash32 ); + + for ( int lane = 0; lane < 8; lane++ ) + if ( hash32_d7[ lane ] <= targ32_d7 ) { - pdata[19] = n + lane; - submit_solution( work, lane_hash, mythr ); + extr_lane_8x32( lane_hash, hash32, lane, 256 ); + if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) ) + { + pdata[19] = n + lane; + submit_solution( work, lane_hash, mythr ); + } } - } - *noncev = _mm256_add_epi32( *noncev, eight ); - n += 8; + } + *noncev = _mm256_add_epi32( *noncev, eight ); + n += 8; } while ( (n < last_nonce) && !work_restart[thr_id].restart ); pdata[19] = n; *hashes_done = n - first_nonce; @@ -183,18 +188,24 @@ int scanhash_sha256t_8way( struct work *work, const uint32_t max_nonce, #endif + #if defined(SHA256T_4WAY) +// Optimizations are slower with AVX/SSE2 +// https://github.com/JayDDee/cpuminer-opt/issues/344 +/* int scanhash_sha256t_4way( struct work *work, const uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ) { - __m128i block[16] __attribute__ ((aligned (64))); - __m128i hash32[8] __attribute__ ((aligned (32))); - __m128i initstate[8] __attribute__ ((aligned (32))); - __m128i midstate[8] __attribute__ ((aligned (32))); - uint32_t lane_hash[8] __attribute__ ((aligned (32))); - __m128i vdata[20] __attribute__ ((aligned (32))); - uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] ); + __m128i vdata[32] __attribute__ ((aligned (64))); + __m128i block[16] __attribute__ ((aligned (32))); + __m128i hash32[8] __attribute__ ((aligned (32))); + __m128i initstate[8] __attribute__ ((aligned (32))); + __m128i midstate1[8] __attribute__ ((aligned (32))); + __m128i midstate2[8] __attribute__ ((aligned (32))); + __m128i mexp_pre[16] __attribute__ ((aligned (32))); + uint32_t lane_hash[8] __attribute__ ((aligned (32))); + uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] ); uint32_t *pdata = work->data; const uint32_t *ptarget = work->target; const uint32_t targ32_d7 = ptarget[7]; @@ -212,6 +223,14 @@ int scanhash_sha256t_4way( struct work *work, const uint32_t max_nonce, *noncev = _mm_set_epi32( n+ 3, n+ 2, n+1, n ); + vdata[16+4] = last_byte; + memset_zero_128( vdata+16 + 5, 10 ); + vdata[16+15] = m128_const1_32( 80*8 ); // bit count + + block[ 8] = last_byte; + memset_zero_128( block + 9, 6 ); + block[15] = m128_const1_32( 32*8 ); // bit count + // initialize state initstate[0] = m128_const1_64( 0x6A09E6676A09E667 ); initstate[1] = m128_const1_64( 0xBB67AE85BB67AE85 ); @@ -223,29 +242,100 @@ int scanhash_sha256t_4way( struct work *work, const uint32_t max_nonce, initstate[7] = m128_const1_64( 0x5BE0CD195BE0CD19 ); // hash first 64 bytes of data - sha256_4way_transform_le( midstate, vdata, initstate ); + sha256_4way_transform_le( midstate1, vdata, initstate ); + + // Do 3 rounds on the first 12 bytes of the next block + sha256_4way_prehash_3rounds( midstate2, mexp_pre, vdata + 16, midstate1 ); do { // 1. 
final 16 bytes of data, with padding - memcpy_128( block, vdata + 16, 4 ); - block[ 4] = last_byte; - memset_zero_128( block + 5, 10 ); - block[15] = m128_const1_32( 80*8 ); // bit count - sha256_4way_transform_le( hash32, block, midstate ); + sha256_4way_final_rounds( block, vdata+16, midstate1, midstate2, + mexp_pre ); // 2. 32 byte hash from 1. - memcpy_128( block, hash32, 8 ); - block[ 8] = last_byte; - memset_zero_128( block + 9, 6 ); - block[15] = m128_const1_32( 32*8 ); // bit count - sha256_4way_transform_le( hash32, block, initstate ); + sha256_4way_transform_le( block, block, initstate ); // 3. 32 byte hash from 2. - memcpy_128( block, hash32, 8 ); - sha256_4way_transform_le( hash32, block, initstate ); + if ( unlikely( + sha256_4way_transform_le_short( hash32, block, initstate ) ) ) + { + // byte swap final hash for testing + mm128_block_bswap_32( hash32, hash32 ); + + for ( int lane = 0; lane < 4; lane++ ) + if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) ) + { + extr_lane_4x32( lane_hash, hash32, lane, 256 ); + if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) ) + { + pdata[19] = n + lane; + submit_solution( work, lane_hash, mythr ); + } + } + } + *noncev = _mm_add_epi32( *noncev, four ); + n += 4; + } while ( (n < last_nonce) && !work_restart[thr_id].restart ); + pdata[19] = n; + *hashes_done = n - first_nonce; + return 0; +} +*/ + +int scanhash_sha256t_4way( struct work *work, const uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ) +{ + __m128i vdata[32] __attribute__ ((aligned (64))); + __m128i block[16] __attribute__ ((aligned (32))); + __m128i hash32[8] __attribute__ ((aligned (32))); + __m128i initstate[8] __attribute__ ((aligned (32))); + __m128i midstate[8] __attribute__ ((aligned (32))); + uint32_t lane_hash[8] __attribute__ ((aligned (32))); + uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] ); + uint32_t *pdata = work->data; + const uint32_t *ptarget = work->target; + const uint32_t targ32_d7 = ptarget[7]; + const uint32_t first_nonce = pdata[19]; + const uint32_t last_nonce = max_nonce - 4; + uint32_t n = first_nonce; + __m128i *noncev = vdata + 19; + const int thr_id = mythr->id; + const bool bench = opt_benchmark; + const __m128i last_byte = m128_const1_32( 0x80000000 ); + const __m128i four = m128_const1_32( 4 ); + + for ( int i = 0; i < 19; i++ ) + vdata[i] = m128_const1_32( pdata[i] ); + + *noncev = _mm_set_epi32( n+ 3, n+ 2, n+1, n ); - // byte swap final hash for testing + vdata[16+4] = last_byte; + memset_zero_128( vdata+16 + 5, 10 ); + vdata[16+15] = m128_const1_32( 80*8 ); // bit count + + block[ 8] = last_byte; + memset_zero_128( block + 9, 6 ); + block[15] = m128_const1_32( 32*8 ); // bit count + + // initialize state + initstate[0] = m128_const1_64( 0x6A09E6676A09E667 ); + initstate[1] = m128_const1_64( 0xBB67AE85BB67AE85 ); + initstate[2] = m128_const1_64( 0x3C6EF3723C6EF372 ); + initstate[3] = m128_const1_64( 0xA54FF53AA54FF53A ); + initstate[4] = m128_const1_64( 0x510E527F510E527F ); + initstate[5] = m128_const1_64( 0x9B05688C9B05688C ); + initstate[6] = m128_const1_64( 0x1F83D9AB1F83D9AB ); + initstate[7] = m128_const1_64( 0x5BE0CD195BE0CD19 ); + + // hash first 64 bytes of data + sha256_4way_transform_le( midstate, vdata, initstate ); + + do + { + sha256_4way_transform_le( block, vdata+16, midstate ); + sha256_4way_transform_le( block, block, initstate ); + sha256_4way_transform_le( hash32, block, initstate ); mm128_block_bswap_32( hash32, hash32 ); for ( int lane = 0; lane < 4; lane++ ) @@ -266,5 +356,6 @@ int 
scanhash_sha256t_4way( struct work *work, const uint32_t max_nonce, return 0; } + #endif diff --git a/compat.h b/compat.h index 124bc40a..bd23f9c5 100644 --- a/compat.h +++ b/compat.h @@ -3,6 +3,10 @@ #ifdef WIN32 +#if _WIN32_WINNT==0x0601 // Windows 7 + #define WINDOWS_CPU_GROUPS_ENABLED 1 +#endif + #include #include diff --git a/configure b/configure index 18825971..b93191f8 100755 --- a/configure +++ b/configure @@ -1,6 +1,6 @@ #! /bin/sh # Guess values for system-dependent variables and create Makefiles. -# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.18.2. +# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.18.3. # # # Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc. @@ -577,8 +577,8 @@ MAKEFLAGS= # Identity of this package. PACKAGE_NAME='cpuminer-opt' PACKAGE_TARNAME='cpuminer-opt' -PACKAGE_VERSION='3.18.2' -PACKAGE_STRING='cpuminer-opt 3.18.2' +PACKAGE_VERSION='3.18.3' +PACKAGE_STRING='cpuminer-opt 3.18.3' PACKAGE_BUGREPORT='' PACKAGE_URL='' @@ -1332,7 +1332,7 @@ if test "$ac_init_help" = "long"; then # Omit some internal or obsolete options to make the list less imposing. # This message is too long to be a string in the A/UX 3.1 sh. cat <<_ACEOF -\`configure' configures cpuminer-opt 3.18.2 to adapt to many kinds of systems. +\`configure' configures cpuminer-opt 3.18.3 to adapt to many kinds of systems. Usage: $0 [OPTION]... [VAR=VALUE]... @@ -1404,7 +1404,7 @@ fi if test -n "$ac_init_help"; then case $ac_init_help in - short | recursive ) echo "Configuration of cpuminer-opt 3.18.2:";; + short | recursive ) echo "Configuration of cpuminer-opt 3.18.3:";; esac cat <<\_ACEOF @@ -1509,7 +1509,7 @@ fi test -n "$ac_init_help" && exit $ac_status if $ac_init_version; then cat <<\_ACEOF -cpuminer-opt configure 3.18.2 +cpuminer-opt configure 3.18.3 generated by GNU Autoconf 2.69 Copyright (C) 2012 Free Software Foundation, Inc. @@ -2012,7 +2012,7 @@ cat >config.log <<_ACEOF This file contains any messages produced by compilers while running configure, to aid debugging if configure makes a mistake. -It was created by cpuminer-opt $as_me 3.18.2, which was +It was created by cpuminer-opt $as_me 3.18.3, which was generated by GNU Autoconf 2.69. Invocation command line was $ $0 $@ @@ -2993,7 +2993,7 @@ fi # Define the identity of the package. PACKAGE='cpuminer-opt' - VERSION='3.18.2' + VERSION='3.18.3' cat >>confdefs.h <<_ACEOF @@ -6690,7 +6690,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 # report actual input values of CONFIG_FILES etc. instead of their # values after options handling. ac_log=" -This file was extended by cpuminer-opt $as_me 3.18.2, which was +This file was extended by cpuminer-opt $as_me 3.18.3, which was generated by GNU Autoconf 2.69. 
Invocation command line was CONFIG_FILES = $CONFIG_FILES @@ -6756,7 +6756,7 @@ _ACEOF cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`" ac_cs_version="\\ -cpuminer-opt config.status 3.18.2 +cpuminer-opt config.status 3.18.3 configured by $0, generated by GNU Autoconf 2.69, with options \\"\$ac_cs_config\\" diff --git a/configure.ac b/configure.ac index bc5329c2..8b80c385 100644 --- a/configure.ac +++ b/configure.ac @@ -1,4 +1,4 @@ -AC_INIT([cpuminer-opt], [3.18.2]) +AC_INIT([cpuminer-opt], [3.19.0]) AC_PREREQ([2.59c]) AC_CANONICAL_SYSTEM diff --git a/cpu-miner.c b/cpu-miner.c index 061bbb96..179881c6 100644 --- a/cpu-miner.c +++ b/cpu-miner.c @@ -3,7 +3,7 @@ * Copyright 2012-2014 pooler * Copyright 2014 Lucas Jones * Copyright 2014-2016 Tanguy Pruvot - * Copyright 2016-2020 Jay D Dee + * Copyright 2016-2021 Jay D Dee * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the Free @@ -115,22 +115,12 @@ int opt_param_n = 0; int opt_param_r = 0; int opt_n_threads = 0; bool opt_sapling = false; - -// Windows doesn't support 128 bit affinity mask. -// Need compile time and run time test. -#if defined(__linux) && defined(GCC_INT128) -#define AFFINITY_USES_UINT128 1 -static uint128_t opt_affinity = -1; -static bool affinity_uses_uint128 = true; -#else -static uint64_t opt_affinity = -1; -static bool affinity_uses_uint128 = false; -#endif - +static uint64_t opt_affinity = 0xFFFFFFFFFFFFFFFFULL; // default, use all cores int opt_priority = 0; // deprecated int num_cpus = 1; -int num_cpugroups = 1; -char *rpc_url = NULL;; +int num_cpugroups = 1; // For Windows +#define max_cpus 256 // max for affinity +char *rpc_url = NULL; char *rpc_userpass = NULL; char *rpc_user, *rpc_pass; char *short_url = NULL; @@ -166,6 +156,7 @@ uint32_t accepted_share_count = 0; uint32_t rejected_share_count = 0; uint32_t stale_share_count = 0; uint32_t solved_block_count = 0; +uint32_t stratum_errors = 0; double *thr_hashrates; double global_hashrate = 0.; double total_hashes = 0.; @@ -227,18 +218,21 @@ char* lp_id; static void workio_cmd_free(struct workio_cmd *wc); -static void format_affinity_map( char *map_str, uint64_t map ) +// array mapping thread to cpu +static uint8_t thread_affinity_map[ max_cpus ]; + +// display affinity mask graphically +static void format_affinity_mask( char *mask_str, uint64_t mask ) { int n = num_cpus < 64 ? num_cpus : 64; int i; - for ( i = 0; i < n; i++ ) { - if ( map & 1 ) map_str[i] = '!'; - else map_str[i] = '.'; - map >>= 1; + if ( mask & 1 ) mask_str[i] = '!'; + else mask_str[i] = '.'; + mask >>= 1; } - memset( &map_str[i], 0, 64 - i ); + memset( &mask_str[i], 0, 64 - i ); } #ifdef __linux /* Linux specific policy and affinity management */ @@ -260,93 +254,70 @@ static inline void drop_policy(void) #define pthread_setaffinity_np(tid,sz,s) {} /* only do process affinity */ #endif -// Linux affinity can use int128. -#if AFFINITY_USES_UINT128 -static void affine_to_cpu_mask( int id, uint128_t mask ) -#else -static void affine_to_cpu_mask( int id, uint64_t mask ) -#endif +static void affine_to_cpu( struct thr_info *thr ) { + int thread = thr->id; cpu_set_t set; CPU_ZERO( &set ); - uint8_t ncpus = (num_cpus > 256) ? 
256 : num_cpus; - - for ( uint8_t i = 0; i < ncpus; i++ ) - { - // cpu mask -#if AFFINITY_USES_UINT128 - if( ( mask & ( (uint128_t)1 << i ) ) ) CPU_SET( i, &set ); -#else - if( (ncpus > 64) || ( mask & (1 << i) ) ) CPU_SET( i, &set ); -#endif - } - if ( id == -1 ) - { - // process affinity - sched_setaffinity(0, sizeof(&set), &set); - } - else - { - // thread only - pthread_setaffinity_np(thr_info[id].pth, sizeof(&set), &set); - } + CPU_SET( thread_affinity_map[ thread ], &set ); + if ( opt_debug ) + applog( LOG_INFO, "Binding thread %d to cpu %d", + thread, thread_affinity_map[ thread ] ); + pthread_setaffinity_np( thr->pth, sizeof(set), &set ); } #elif defined(WIN32) /* Windows */ + static inline void drop_policy(void) { } // Windows CPU groups to manage more than 64 CPUs. -static void affine_to_cpu_mask( int id, uint64_t mask ) +// mask arg is ignored +static void affine_to_cpu( struct thr_info *thr ) { - bool success; + int thread = thr->id; unsigned long last_error; -// BOOL success; -// DWORD last_error; + bool ok; - if ( id == -1 ) - success = SetProcessAffinityMask( GetCurrentProcess(), mask ); +#if defined(WINDOWS_CPU_GROUPS_ENABLED) + unsigned long group_size = GetActiveProcessorCount( 0 ); + unsigned long group = thread / group_size; + unsigned long cpu = thread_affinity_map[ thread % group_size ]; -// Are Windows CPU Groups supported? -#if _WIN32_WINNT==0x0601 - else if ( num_cpugroups == 1 ) - success = SetThreadAffinityMask( GetCurrentThread(), mask ); - else - { - // Find the correct cpu group - int cpu = id % num_cpus; - int group; - for( group = 0; group < num_cpugroups; group++ ) - { - int cpus = GetActiveProcessorCount( group ); - if ( cpu < cpus ) break; - cpu -= cpus; - } + GROUP_AFFINITY affinity; + affinity.Group = group; + affinity.Mask = 1ULL << cpu; - if (opt_debug) - applog(LOG_DEBUG, "Binding thread %d to cpu %d on cpu group %d (mask %x)", - id, cpu, group, (1ULL << cpu)); + if ( opt_debug ) + applog( LOG_INFO, "Binding thread %d to cpu %d in cpu group %d", + thread, cpu, group ); + + ok = SetThreadGroupAffinity( GetCurrentThread(), &affinity, NULL ); - GROUP_AFFINITY affinity; - affinity.Group = group; - affinity.Mask = 1ULL << cpu; - success = SetThreadGroupAffinity( GetCurrentThread(), &affinity, NULL ); - } #else - else - success = SetThreadAffinityMask( GetCurrentThread(), mask ); + + unsigned long cpu = thread_affinity_map[ thread ]; + uint64_t mask = 1ULL << cpu; + + if ( opt_debug ) + applog( LOG_INFO, "Binding thread %d to cpu %d", thread, cpu ); + + ok = SetThreadAffinityMask( GetCurrentThread(), mask ); + #endif - if (!success) + if ( !ok ) { - last_error = GetLastError(); - applog(LOG_WARNING, "affine_to_cpu_mask for %u returned %x", - id, last_error); + last_error = GetLastError(); + applog( LOG_WARNING, "affine_to_cpu_mask for %u returned 0x%x", + thread, last_error ); } -} +} #else + static inline void drop_policy(void) { } -static void affine_to_cpu_mask(int id, unsigned long mask) { } +static void affine_to_cpu( struct thr_info *thr ) { } + #endif // not very useful, just index the arrray directly. 
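With this change each miner thread is pinned to exactly one CPU taken from thread_affinity_map rather than to a 64/128-bit mask. A minimal scalar sketch of how that map is meant to be filled from the 64-bit affinity mask, mirroring the loop added to main() later in this patch (the wrapper function and its name are illustrative only):

#include <stdint.h>

// Walk the mask and give each thread the next set bit, wrapping the bit
// index with (cpu & 63) and the CPU number modulo the CPU count.
static void build_thread_affinity_map( uint8_t *map, uint64_t mask,
                                        int n_threads, int n_cpus )
{
   int cpu = 0;
   for ( int thr = 0; thr < n_threads; thr++, cpu++ )
   {
      while ( !( ( mask >> ( cpu & 63 ) ) & 1ULL ) ) cpu++;
      map[ thr ] = cpu % n_cpus;
   }
}

For example, mask 0x55 with 4 threads on an 8-CPU system maps threads 0..3 to CPUs 0, 2, 4 and 6.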
@@ -1159,17 +1130,23 @@ void report_summary_log( bool force ) applog2( prio, "Blocks Solved %7d %7d", solved, solved_block_count ); } + if ( stratum_errors ) + applog2( LOG_INFO, "Stratum errors %7d", stratum_errors ); + applog2( LOG_INFO, "Hi/Lo Share Diff %.5g / %.5g", highest_share, lowest_share ); int mismatch = submitted_share_count - ( accepted_share_count + stale_share_count + rejected_share_count ); + if ( mismatch ) { - if ( mismatch != 1 ) - applog2(LOG_MINR, "Count mismatch: %d, stats may be inaccurate", mismatch ); - else - applog2(LOG_INFO, CL_LBL "Count mismatch, submitted share may still be pending" CL_N ); + if ( stratum_errors ) + applog2( LOG_MINR, "Count mismatch: %d, stats may be inaccurate", + mismatch ); + else if ( !opt_quiet ) + applog2( LOG_INFO, CL_LBL + "Count mismatch, submitted share may still be pending" CL_N ); } } @@ -2241,49 +2218,9 @@ static void *miner_thread( void *userdata ) if ( opt_priority == 0 ) drop_policy(); } + // CPU thread affinity - if ( num_cpus > 1 ) - { -#if AFFINITY_USES_UINT128 - // Default affinity - if ( (opt_affinity == (uint128_t)(-1) ) && opt_n_threads > 1 ) - { - affine_to_cpu_mask( thr_id, (uint128_t)1 << (thr_id % num_cpus) ); - if ( opt_debug ) - applog( LOG_INFO, "Binding thread %d to cpu %d.", - thr_id, thr_id % num_cpus, - u128_hi64( (uint128_t)1 << (thr_id % num_cpus) ), - u128_lo64( (uint128_t)1 << (thr_id % num_cpus) ) ); - } -#else - if ( ( opt_affinity == -1 ) && ( opt_n_threads > 1 ) ) - { - affine_to_cpu_mask( thr_id, 1 << (thr_id % num_cpus) ); - if (opt_debug) - applog( LOG_DEBUG, "Binding thread %d to cpu %d.", - thr_id, thr_id % num_cpus, 1 << (thr_id % num_cpus)) ; - } -#endif - else // Custom affinity - { - affine_to_cpu_mask( thr_id, opt_affinity ); - if ( opt_debug ) - { -#if AFFINITY_USES_UINT128 - if ( num_cpus > 64 ) - applog( LOG_INFO, "Binding thread %d to mask %016llx %016llx", - thr_id, u128_hi64( opt_affinity ), - u128_lo64( opt_affinity ) ); - else - applog( LOG_INFO, "Binding thread %d to mask %016llx", - thr_id, opt_affinity ); -#else - applog( LOG_INFO, "Binding thread %d to mask %016llx", - thr_id, opt_affinity ); -#endif - } - } - } // num_cpus > 1 + if ( opt_affinity && num_cpus > 1 ) affine_to_cpu( mythr ); if ( !algo_gate.miner_thread_init( thr_id ) ) { @@ -2792,6 +2729,7 @@ static void *stratum_thread(void *userdata ) { stratum_need_reset = false; stratum_down = true; + stratum_errors++; stratum_disconnect( &stratum ); if ( strcmp( stratum.url, rpc_url ) ) { @@ -2809,6 +2747,7 @@ static void *stratum_thread(void *userdata ) while ( !stratum.curl ) { stratum_down = true; + restart_threads(); pthread_rwlock_wrlock( &g_work_lock ); g_work_time = 0; pthread_rwlock_unlock( &g_work_lock ); @@ -2830,7 +2769,6 @@ static void *stratum_thread(void *userdata ) else { stratum_down = false; - restart_threads(); applog(LOG_BLUE,"Stratum connection established" ); } } @@ -3137,7 +3075,7 @@ void parse_arg(int key, char *arg ) { char *p; int v, i; - uint64_t ul; +// uint64_t ul; double d; switch( key ) @@ -3448,21 +3386,10 @@ void parse_arg(int key, char *arg ) break; #endif case 1020: // cpu-affinity - p = strstr(arg, "0x"); - if ( p ) - ul = strtoull( p, NULL, 16 ); - else - ul = atoll( arg ); -#if AFFINITY_USES_UINT128 -// replicate the low 64 bits to make a full 128 bit mask if there are more -// than 64 CPUs, otherwise zero extend the upper half. 
- opt_affinity = (uint128_t)ul; - if ( num_cpus > 64 ) - opt_affinity |= opt_affinity << 64; -#else - opt_affinity = ul; -#endif - break; + p = strstr( arg, "0x" ); + opt_affinity = p ? strtoull( p, NULL, 16 ) + : atoll( arg ); + break; case 1021: // cpu-priority v = atoi(arg); if (v < 0 || v > 5) /* sanity check */ @@ -3565,20 +3492,18 @@ static void parse_cmdline(int argc, char *argv[]) while (1) { #if HAVE_GETOPT_LONG - key = getopt_long(argc, argv, short_options, options, NULL); + key = getopt_long(argc, argv, short_options, options, NULL); #else - key = getopt(argc, argv, short_options); + key = getopt(argc, argv, short_options); #endif - if (key < 0) - break; - - parse_arg(key, optarg); + if ( key < 0 ) break; + parse_arg( key, optarg ); } - if (optind < argc) + if ( optind < argc ) { - fprintf(stderr, "%s: unsupported non-option argument -- '%s'\n", - argv[0], argv[optind]); - show_usage_and_exit(1); + fprintf( stderr, "%s: unsupported non-option argument -- '%s'\n", + argv[0], argv[optind]); + show_usage_and_exit(1); } } @@ -3642,26 +3567,21 @@ int main(int argc, char *argv[]) rpc_user = strdup(""); rpc_pass = strdup(""); - parse_cmdline(argc, argv); - #if defined(WIN32) -// SYSTEM_INFO sysinfo; -// GetSystemInfo(&sysinfo); -// num_cpus = sysinfo.dwNumberOfProcessors; -// What happens if GetActiveProcessorGroupCount called if groups not enabled? // Are Windows CPU Groups supported? -#if _WIN32_WINNT==0x0601 +#if defined(WINDOWS_CPU_GROUPS_ENABLED) num_cpus = 0; num_cpugroups = GetActiveProcessorGroupCount(); - for( i = 0; i < num_cpugroups; i++ ) + for( i = 0; i < num_cpugroups; i++ ) { - int cpus = GetActiveProcessorCount(i); + int cpus = GetActiveProcessorCount( i ); num_cpus += cpus; if (opt_debug) - applog(LOG_DEBUG, "Found %d cpus on cpu group %d", cpus, i); + applog( LOG_INFO, "Found %d CPUs in CPU group %d", cpus, i ); } + #else SYSTEM_INFO sysinfo; GetSystemInfo(&sysinfo); @@ -3677,21 +3597,20 @@ int main(int argc, char *argv[]) #else num_cpus = 1; #endif - if (num_cpus < 1) - num_cpus = 1; - if (!opt_n_threads) - opt_n_threads = num_cpus; + if ( num_cpus < 1 ) num_cpus = 1; + + parse_cmdline( argc, argv ); if ( opt_algo == ALGO_NULL ) { - fprintf(stderr, "%s: no algo supplied\n", argv[0]); + fprintf( stderr, "%s: No algo parameter specified\n", argv[0] ); show_usage_and_exit(1); } // need to register to get algo optimizations for cpu capabilities - // but that causes register logs before cpu capabilities is output. - // Would need to split register into 2 parts. First part sets algo + // but that causes registration logs before cpu capabilities is output. + // Would need to split register function into 2 parts. First part sets algo // optimizations but no logging, second part does any logging. 
if ( !register_algo_gate( opt_algo, &algo_gate ) ) exit(1); @@ -3735,9 +3654,6 @@ int main(int argc, char *argv[]) return 1; } - // All options must be set before starting the gate -// if ( !register_algo_gate( opt_algo, &algo_gate ) ) exit(1); - if ( coinbase_address ) { pk_script_size = address_to_script( pk_script, pk_buffer_size, @@ -3749,8 +3665,6 @@ int main(int argc, char *argv[]) } } -// if ( !check_cpu_capability() ) exit(1); - pthread_mutex_init( &stats_lock, NULL ); pthread_rwlock_init( &g_work_lock, NULL ); pthread_mutex_init( &stratum.sock_lock, NULL ); @@ -3820,44 +3734,31 @@ int main(int argc, char *argv[]) } #endif -// To be confirmed with more than 64 cpus - if ( opt_affinity != -1 ) + if ( ( opt_n_threads == 0 ) || ( opt_n_threads > num_cpus ) ) + opt_n_threads = num_cpus; + + if ( opt_affinity && num_cpus > max_cpus ) { - if ( !affinity_uses_uint128 && num_cpus > 64 ) + applog( LOG_WARNING, "More than %d CPUs, CPU affinity is disabled", + max_cpus ); + opt_affinity = 0ULL; + } + + if ( opt_affinity ) + { + for ( int thr = 0, cpu = 0; thr < opt_n_threads; thr++, cpu++ ) { - applog(LOG_WARNING,"Setting CPU affinity with more than 64 CPUs is only"); - applog(LOG_WARNING,"available on Linux. Using default affinity."); - opt_affinity = -1; + while ( !( ( opt_affinity >> ( cpu&63 ) ) & 1ULL ) ) cpu++; + thread_affinity_map[ thr ] = cpu % num_cpus; } -/* - else + if ( !opt_quiet ) { - affine_to_cpu_mask( -1, opt_affinity ); - if ( !opt_quiet ) - { -#if AFFINITY_USES_UINT128 - if ( num_cpus > 64 ) - applog(LOG_DEBUG, "Binding process to cpu mask %x", - u128_hi64( opt_affinity ), u128_lo64( opt_affinity ) ); - else - applog(LOG_DEBUG, "Binding process to cpu mask %x", - opt_affinity ); -#else - applog(LOG_DEBUG, "Binding process to cpu mask %x", - opt_affinity ); -#endif - } + char affinity_mask[64]; + format_affinity_mask( affinity_mask, opt_affinity ); + applog( LOG_INFO, "CPU affinity [%s]", affinity_mask ); } -*/ } - - if ( !opt_quiet && ( opt_n_threads < num_cpus ) ) - { - char affinity_map[64]; - format_affinity_map( affinity_map, opt_affinity ); - applog( LOG_INFO, "CPU affinity [%s]", affinity_map ); - } - + #ifdef HAVE_SYSLOG_H if (use_syslog) openlog("cpuminer", LOG_PID, LOG_USER); @@ -3955,7 +3856,7 @@ int main(int argc, char *argv[]) return 1; } if ( !opt_quiet ) - applog( LOG_INFO,"API listnening to %s:%d", opt_api_allow, + applog( LOG_INFO,"API listening to %s:%d", opt_api_allow, opt_api_listen ); } diff --git a/winbuild-cross.sh b/winbuild-cross.sh index 4953cec2..71e42981 100755 --- a/winbuild-cross.sh +++ b/winbuild-cross.sh @@ -16,13 +16,13 @@ export MINGW_LIB="/usr/x86_64-w64-mingw32/lib" export GCC_MINGW_LIB="/usr/lib/gcc/x86_64-w64-mingw32/9.3-win32" # used by GCC export LDFLAGS="-L$LOCAL_LIB/curl/lib/.libs -L$LOCAL_LIB/gmp/.libs -L$LOCAL_LIB/openssl" +# support for Windows CPU groups +export DEFAULT_CFLAGS="-O3 -Wall -D_WIN32_WINNT=0x0601" +#export DEFAULT_CFLAGS="-O3 -Wall" # make link to local gmp header file. ln -s $LOCAL_LIB/gmp/gmp.h ./gmp.h -# edit configure to fix pthread lib name for Windows. -#sed -i 's/"-lpthread"/"-lpthreadGC2"/g' configure.ac - # make release directory and copy selected DLLs. 
rm -rf release > /dev/null @@ -45,7 +45,7 @@ cp $LOCAL_LIB/curl/lib/.libs/libcurl-4.dll release/ ./clean-all.sh || echo clean rm -f config.status ./autogen.sh || echo done -CFLAGS="-O3 -march=icelake-client -Wall" ./configure $CONFIGURE_ARGS +CFLAGS="$DEFAULT_CFLAGS -march=icelake-client" ./configure $CONFIGURE_ARGS make -j 8 strip -s cpuminer.exe mv cpuminer.exe release/cpuminer-avx512-sha-vaes.exe @@ -53,8 +53,8 @@ mv cpuminer.exe release/cpuminer-avx512-sha-vaes.exe # Rocketlake AVX512 SHA AES make clean || echo clean rm -f config.status -CFLAGS="-O3 -march=cascadelake -msha -Wall" ./configure $CONFIGURE_ARGS -#CFLAGS="-O3 -march=rocketlake -Wall" ./configure $CONFIGURE_ARGS +CFLAGS="$DEFAULT_CFLAGS -march=cascadelake -msha" ./configure $CONFIGURE_ARGS +#CFLAGS="$DEFAULT_CFLAGS -march=rocketlake" ./configure $CONFIGURE_ARGS make -j 8 strip -s cpuminer.exe mv cpuminer.exe release/cpuminer-avx512-sha.exe @@ -62,7 +62,7 @@ mv cpuminer.exe release/cpuminer-avx512-sha.exe # Zen1 AVX2 AES SHA make clean || echo clean rm -f config.status -CFLAGS="-O3 -march=znver1 -Wall" ./configure $CONFIGURE_ARGS +CFLAGS="$DEFAULT_CFLAGS -march=znver1" ./configure $CONFIGURE_ARGS make -j 8 strip -s cpuminer.exe mv cpuminer.exe release/cpuminer-zen.exe @@ -70,8 +70,8 @@ mv cpuminer.exe release/cpuminer-zen.exe # Zen3 AVX2 SHA VAES make clean || echo clean rm -f config.status -CFLAGS="-O3 -march=znver2 -mvaes -Wall" ./configure $CONFIGURE_ARGS -# CFLAGS="-O3 -march=znver3 -Wall" ./configure $CONFIGURE_ARGS +CFLAGS="$DEFAULT_CFLAGS -march=znver2 -mvaes" ./configure $CONFIGURE_ARGS +# CFLAGS="$DEFAULT_CFLAGS -march=znver3" ./configure $CONFIGURE_ARGS make -j 8 strip -s cpuminer.exe mv cpuminer.exe release/cpuminer-zen3.exe @@ -80,7 +80,7 @@ mv cpuminer.exe release/cpuminer-zen3.exe # mingw won't compile avx512 without -fno-asynchronous-unwind-tables make clean || echo clean rm -f config.status -CFLAGS="-O3 -march=skylake-avx512 -Wall" ./configure $CONFIGURE_ARGS +CFLAGS="$DEFAULT_CFLAGS -march=skylake-avx512" ./configure $CONFIGURE_ARGS #CFLAGS="-O3 -march=skylake-avx512 -Wall -fno-asynchronous-unwind-tables" ./configure $CONFIGURE_ARGS make -j 8 strip -s cpuminer.exe @@ -90,7 +90,7 @@ mv cpuminer.exe release/cpuminer-avx512.exe make clean || echo clean rm -f config.status # GCC 9 doesn't include AES in -march=core-avx2 -CFLAGS="-O3 -march=core-avx2 -maes -Wall" ./configure $CONFIGURE_ARGS +CFLAGS="$DEFAULT_CFLAGS -march=core-avx2 -maes" ./configure $CONFIGURE_ARGS make -j 8 strip -s cpuminer.exe mv cpuminer.exe release/cpuminer-avx2.exe @@ -99,7 +99,7 @@ mv cpuminer.exe release/cpuminer-avx2.exe make clean || echo clean rm -f config.status # -march=corei7-avx still includes aes, but just in case -CFLAGS="-O3 -march=corei7-avx -maes -Wall" ./configure $CONFIGURE_ARGS +CFLAGS="$DEFAULT_CFLAGS -march=corei7-avx -maes" ./configure $CONFIGURE_ARGS make -j 8 strip -s cpuminer.exe mv cpuminer.exe release/cpuminer-avx.exe @@ -107,7 +107,7 @@ mv cpuminer.exe release/cpuminer-avx.exe # Westmere SSE4.2 AES make clean || echo clean rm -f config.status -CFLAGS="-O3 -march=westmere -maes -Wall" ./configure $CONFIGURE_ARGS +CFLAGS="$DEFAULT_CFLAGS -march=westmere -maes" ./configure $CONFIGURE_ARGS #CFLAGS="-O3 -maes -msse4.2 -Wall" ./configure $CONFIGURE_ARGS make -j 8 strip -s cpuminer.exe @@ -116,7 +116,7 @@ mv cpuminer.exe release/cpuminer-aes-sse42.exe # Nehalem SSE4.2 #make clean || echo clean #rm -f config.status -#CFLAGS="-O3 -march=corei7 -Wall" ./configure $CONFIGURE_ARGS +#CFLAGS="$DEFAULT_CFLAGS -march=corei7" 
./configure $CONFIGURE_ARGS #make #strip -s cpuminer.exe #mv cpuminer.exe release/cpuminer-sse42.exe @@ -124,7 +124,7 @@ mv cpuminer.exe release/cpuminer-aes-sse42.exe # Core2 SSSE3 #make clean || echo clean #rm -f config.status -#CFLAGS="-O3 -march=core2 -Wall" ./configure $CONFIGURE_ARGS +#CFLAGS="$DEFAULT_CFLAGS -march=core2" ./configure $CONFIGURE_ARGS #make #strip -s cpuminer.exe #mv cpuminer.exe release/cpuminer-ssse3.exe @@ -133,7 +133,7 @@ mv cpuminer.exe release/cpuminer-aes-sse42.exe # Generic SSE2 make clean || echo clean rm -f config.status -CFLAGS="-O3 -msse2 -Wall" ./configure $CONFIGURE_ARGS +CFLAGS="$DEFAULT_CFLAGS -msse2" ./configure $CONFIGURE_ARGS make -j 8 strip -s cpuminer.exe mv cpuminer.exe release/cpuminer-sse2.exe From 7d2ef7973dab836abe2465c24bea97bdec09e06c Mon Sep 17 00:00:00 2001 From: Jay D Dee Date: Sat, 20 Nov 2021 00:46:01 -0500 Subject: [PATCH 15/20] v3.19.1 --- README.txt | 48 +- RELEASE_NOTES | 12 + algo-gate-api.h | 1 - algo/blake/decred-gate.c | 6 +- algo/lyra2/lyra2-gate.c | 2 +- algo/ripemd/lbry-gate.c | 4 +- algo/ripemd/sph_ripemd.c | 2 +- algo/swifftx/swifftx-4way.c | 912 ------------------------------------ algo/swifftx/swifftx.c | 70 ++- algo/swifftx/swifftx.h | 7 +- algo/x16/x16r-gate.c | 18 +- algo/x17/sonoa-gate.c | 2 +- algo/x17/x17-gate.c | 2 +- algo/x17/xevan-gate.c | 2 +- algo/x22/x22i-gate.c | 8 +- build-allarch.sh | 85 ++-- clean-all.sh | 4 +- configure | 20 +- configure.ac | 2 +- cpu-miner.c | 49 +- winbuild-cross.sh | 61 +-- 21 files changed, 180 insertions(+), 1137 deletions(-) delete mode 100644 algo/swifftx/swifftx-4way.c diff --git a/README.txt b/README.txt index 22428ec1..3776d85e 100644 --- a/README.txt +++ b/README.txt @@ -18,14 +18,14 @@ error to find the fastest one that works. Pay attention to the features listed at cpuminer startup to ensure you are mining at optimum speed using the best available features. -Architecture names and compile options used are only provided for Intel -Core series. Budget CPUs like Pentium and Celeron are often missing some -features. +Architecture names and compile options used are only provided for +mainstream desktop CPUs. Budget CPUs like Pentium and Celeron are often +missing some features. Check your CPU. -AMD CPUs older than Piledriver, including Athlon x2 and Phenom II x4, are not -supported by cpuminer-opt due to an incompatible implementation of SSE2 on -these CPUs. Some algos may crash the miner with an invalid instruction. -Users are recommended to use an unoptimized miner such as cpuminer-multi. +Support for AMD CPUs older than Ryzen is incomplete and without specific +recommendations. Find the best fit. CPUs older than Piledriver, including +Athlon x2 and Phenom II x4, are not supported by cpuminer-opt due to an +incompatible implementation of SSE2 on these CPUs. More information for Intel and AMD CPU architectures and their features can be found on Wikipedia. 
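cpuminer already lists the detected CPU features at startup; to check a CPU independently of the miner before picking one of the builds in the table that follows, the compiler's built-in detection can be queried. This is a minimal sketch, not part of the patch; it is GCC/Clang and x86 specific, and features such as SHA or VAES may need a separate CPUID check depending on compiler version.

/* Illustrative only: report a few of the ISA features used to choose a build. */
#include <stdio.h>

int main(void)
{
    __builtin_cpu_init();
    printf( "sse2:    %d\n", !!__builtin_cpu_supports( "sse2" ) );
    printf( "sse4.2:  %d\n", !!__builtin_cpu_supports( "sse4.2" ) );
    printf( "avx:     %d\n", !!__builtin_cpu_supports( "avx" ) );
    printf( "avx2:    %d\n", !!__builtin_cpu_supports( "avx2" ) );
    printf( "aes:     %d\n", !!__builtin_cpu_supports( "aes" ) );
    printf( "avx512f: %d\n", !!__builtin_cpu_supports( "avx512f" ) );
    return 0;
}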
@@ -34,26 +34,21 @@ https://en.wikipedia.org/wiki/List_of_Intel_CPU_microarchitectures
 https://en.wikipedia.org/wiki/List_of_AMD_CPU_microarchitectures
 
+File name                    Architecture name
-Exe file name                Compile flags                Arch name
+cpuminer-sse2.exe            Core2, Nehalem, generic x86_64 with SSE2
+cpuminer-aes-sse42.exe       Westmere
+cpuminer-avx.exe             Sandybridge, Ivybridge
+cpuminer-avx2.exe            Haswell, Skylake, Kabylake, Coffeelake, Cometlake
+cpuminer-avx2-sha.exe        AMD Zen1, Zen2
+cpuminer-avx2-sha-vaes.exe   Intel Alderlake*, AMD Zen3
+cpuminer-avx512.exe          Intel HEDT Skylake-X, Cascadelake
+cpuminer-avx512-sha-vaes.exe Icelake, Tigerlake, Rocketlake
-cpuminer-sse2.exe            "-msse2"                     Core2, Nehalem
-cpuminer-aes-sse42.exe       "-march=westmere"            Westmere
-cpuminer-avx.exe             "-march=corei7-avx"          Sandybridge, Ivybridge
-cpuminer-avx2.exe            "-march=core-avx2 -maes"     Haswell(1)
-cpuminer-avx512.exe          "-march=skylake-avx512"      Skylake-X, Cascadelake
-cpuminer-avx512-sha.exe      "-march=cascadelake -msha"   Rocketlake(2)
-cpuminer-avx512-sha-vaes.exe "-march=icelake-client"      Icelake, Tigerlake(3)
-cpuminer-zen.exe             "-march=znver1"              AMD Zen1, Zen2
-cpuminer-zen3.exe            "-march=znver2 -mvaes"       Zen3(4)
-
-(1) Haswell includes Broadwell, Skylake, Kabylake, Coffeelake & Cometlake.
-(2) Rocketlake build uses cascadelake+sha as a workaround until Rocketlake
-    compiler support is avalable.
-(3) Icelake & Tigerlake are only available on some laptops. Mining with a
-    laptop is not recommended.
-(4) Zen3 build uses zen2+vaes as a workaround until Zen3 compiler support is
-    available. Zen2 CPUs should use Zen1 build.
+* Alderlake is a hybrid architecture. With the E-cores disabled it may be
+  possible to enable AVX512 on the P-cores and use the avx512-sha-vaes
+  build. This is not officially supported by Intel at time of writing.
+  Check for current information.
 
 Notes about included DLL files:
 
@@ -66,8 +61,7 @@ https://github.com/JayDDee/cpuminer-opt/wiki/Compiling-from-source
 
 Some DLL files may already be installed on the system by Windows or third
 party packages. They often will work and may be used instead of the included
-file. Without a compelling reason to do so it's recommended to use the included
-files as they are packaged.
+file.
 
 If you like this software feel free to donate:
diff --git a/RELEASE_NOTES b/RELEASE_NOTES
index ce7752b4..e22c2baa 100644
--- a/RELEASE_NOTES
+++ b/RELEASE_NOTES
@@ -65,6 +65,18 @@ If not what makes it happen or not happen?
 Change Log
 ----------
 
+v3.19.1
+
+Changes to Windows binaries package:
+ - builds for CPUs with AVX or lower have CPU groups disabled,
+ - zen3 build renamed to avx2-sha-vaes to support Alderlake as well as Zen3,
+ - zen build renamed to avx2-sha, supports Zen1 & Zen2,
+ - avx512-sha build removed, Rocketlake CPUs can use avx512-sha-vaes,
+ - see README.txt for compatibility details.
+
+Fixed a few compiler warnings that are new in GCC 11.
+Other minor fixes.
+
 v3.19.0
 
 Windows binaries now built with support for CPU groups, requires Windows 7.
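Processor group support is what lets a single Windows process address more than 64 logical CPUs; it needs the Windows 7 API level, which is why the build script adds -D_WIN32_WINNT=0x0601. The sketch below shows the Win32 calls involved; it is illustrative only and is not code from this patch.

/* Illustrative sketch: enumerate processor groups and pin the current thread
   to logical CPU 0 of group 1. Requires Windows 7+ (_WIN32_WINNT >= 0x0601). */
#define _WIN32_WINNT 0x0601
#include <windows.h>
#include <stdio.h>

int main(void)
{
    WORD groups = GetActiveProcessorGroupCount();
    printf( "processor groups: %u\n", (unsigned)groups );

    if ( groups > 1 )
    {
        GROUP_AFFINITY ga = { 0 };
        ga.Group = 1;        /* second group */
        ga.Mask  = 1;        /* logical CPU 0 within that group */
        if ( !SetThreadGroupAffinity( GetCurrentThread(), &ga, NULL ) )
            printf( "SetThreadGroupAffinity failed: %lu\n", GetLastError() );
    }
    return 0;
}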
diff --git a/algo-gate-api.h b/algo-gate-api.h index 56594d59..07108021 100644 --- a/algo-gate-api.h +++ b/algo-gate-api.h @@ -97,7 +97,6 @@ typedef uint32_t set_t; #define SHA_OPT 0x20 // Zen1, Icelake (sha256) #define AVX512_OPT 0x40 // Skylake-X (AVX512[F,VL,DQ,BW]) #define VAES_OPT 0x80 // Icelake (VAES & AVX512) -#define VAES256_OPT 0x100 // Zen3 (VAES without AVX512) // return set containing all elements from sets a & b diff --git a/algo/blake/decred-gate.c b/algo/blake/decred-gate.c index 0a90de7f..9c58b21b 100644 --- a/algo/blake/decred-gate.c +++ b/algo/blake/decred-gate.c @@ -8,7 +8,7 @@ uint32_t *decred_get_nonceptr( uint32_t *work_data ) return &work_data[ DECRED_NONCE_INDEX ]; } -double decred_calc_network_diff( struct work* work ) +long double decred_calc_network_diff( struct work* work ) { // sample for diff 43.281 : 1c05ea29 // todo: endian reversed on longpoll could be zr5 specific... @@ -16,7 +16,7 @@ double decred_calc_network_diff( struct work* work ) uint32_t bits = ( nbits & 0xffffff ); int16_t shift = ( swab32(nbits) & 0xff ); // 0x1c = 28 int m; - double d = (double)0x0000ffff / (double)bits; + long double d = (long double)0x0000ffff / (long double)bits; for ( m = shift; m < 29; m++ ) d *= 256.0; @@ -25,7 +25,7 @@ double decred_calc_network_diff( struct work* work ) if ( shift == 28 ) d *= 256.0; // testnet if ( opt_debug_diff ) - applog( LOG_DEBUG, "net diff: %f -> shift %u, bits %08x", d, + applog( LOG_DEBUG, "net diff: %f -> shift %u, bits %08x", (double)d, shift, bits ); return net_diff; } diff --git a/algo/lyra2/lyra2-gate.c b/algo/lyra2/lyra2-gate.c index c1d70e7d..8804c41c 100644 --- a/algo/lyra2/lyra2-gate.c +++ b/algo/lyra2/lyra2-gate.c @@ -188,7 +188,7 @@ bool register_allium_algo( algo_gate_t* gate ) gate->hash = (void*)&allium_hash; #endif gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT - | VAES_OPT | VAES256_OPT; + | VAES_OPT; opt_target_factor = 256.0; return true; }; diff --git a/algo/ripemd/lbry-gate.c b/algo/ripemd/lbry-gate.c index ba38c651..d962f501 100644 --- a/algo/ripemd/lbry-gate.c +++ b/algo/ripemd/lbry-gate.c @@ -4,7 +4,7 @@ #include #include -double lbry_calc_network_diff( struct work *work ) +long double lbry_calc_network_diff( struct work *work ) { // sample for diff 43.281 : 1c05ea29 // todo: endian reversed on longpoll could be zr5 specific... @@ -12,7 +12,7 @@ double lbry_calc_network_diff( struct work *work ) uint32_t nbits = swab32( work->data[ LBRY_NBITS_INDEX ] ); uint32_t bits = (nbits & 0xffffff); int16_t shift = (swab32(nbits) & 0xff); // 0x1c = 28 - double d = (double)0x0000ffff / (double)bits; + long double d = (long double)0x0000ffff / (long double)bits; for (int m=shift; m < 29; m++) d *= 256.0; for (int m=29; m < shift; m++) d /= 256.0; diff --git a/algo/ripemd/sph_ripemd.c b/algo/ripemd/sph_ripemd.c index f2954971..9273fb8d 100644 --- a/algo/ripemd/sph_ripemd.c +++ b/algo/ripemd/sph_ripemd.c @@ -479,7 +479,7 @@ sph_ripemd_comp(const sph_u32 msg[16], sph_u32 val[4]) * One round of RIPEMD-128. The data must be aligned for 32-bit access. 
*/ static void -ripemd128_round(const unsigned char *data, sph_u32 r[5]) +ripemd128_round(const unsigned char *data, sph_u32 r[4]) { #if SPH_LITTLE_FAST diff --git a/algo/swifftx/swifftx-4way.c b/algo/swifftx/swifftx-4way.c deleted file mode 100644 index cc003f02..00000000 --- a/algo/swifftx/swifftx-4way.c +++ /dev/null @@ -1,912 +0,0 @@ -/////////////////////////////////////////////////////////////////////////////////////////////// -// -// SWIFFTX ANSI C OPTIMIZED 32BIT IMPLEMENTATION FOR NIST SHA-3 COMPETITION -// -// SWIFFTX.c -// -// October 2008 -// -// This is the source file of the OPTIMIZED 32BIT implementation of SWIFFTX hash function. -// SWIFFTX is a candidate function for SHA-3 NIST competition. -// More details about SWIFFTX can be found in the accompanying submission documents. -// -/////////////////////////////////////////////////////////////////////////////////////////////// -#include "swifftx.h" -// See the remarks concerning compatibility issues inside stdint.h. -#include "stdint.h" -// Remove this while using gcc: -//#include "stdbool.h" -#include - -/////////////////////////////////////////////////////////////////////////////////////////////// -// Constants and static tables portion. -/////////////////////////////////////////////////////////////////////////////////////////////// - -// In SWIFFTX we work over Z_257, so this is the modulus and the arithmetic is performed modulo -// this number. -#define FIELD_SIZE 257 - -// The size of FFT we use: -#define N 64 - -#define LOGN 6 - -#define EIGHTH_N (N / 8) - -// The number of FFTS done on the input. -#define M (SWIFFTX_INPUT_BLOCK_SIZE / 8) // 32 - -// Omega is the 128th root of unity in Z_257. -// We choose w = 42. -#define OMEGA 42 - -// The size of the inner FFT lookup table: -#define W 8 - -// Calculates the sum and the difference of two numbers. -// -// Parameters: -// - A: the first operand. After the operation stores the sum of the two operands. -// - B: the second operand. After the operation stores the difference between the first and the -// second operands. -#define ADD_SUB_4WAY( A, B ) \ -{ \ - __m128i temp = B; \ - B = _mm_sub_epi32( A, B ); \ - A = _mm_add_epi32( A, temp ); \ -} - - -//#define ADD_SUB(A, B) {register int temp = (B); B = ((A) - (B)); A = ((A) + (temp));} - -// Quickly reduces an integer modulo 257. -// -// Parameters: -// - A: the input. - -#define Q_REDUCE( A ) ( _mm_sub_epi32( \ - _mm_and_epi32( A, m128_const1_32( 0xff ) ), \ - _mm_srli_epi32( A, 8 ) ) ) - -//#define Q_REDUCE(A) (((A) & 0xff) - ((A) >> 8)) - -// Since we need to do the setup only once, this is the indicator variable: -static bool wasSetupDone = false; - -// This array stores the powers of omegas that correspond to the indices, which are the input -// values. Known also as the "outer FFT twiddle factors". -swift_int16_t multipliers[N]; - -// This array stores the powers of omegas, multiplied by the corresponding values. -// We store this table to save computation time. -// -// To calculate the intermediate value of the compression function (the first out of two -// stages), we multiply the k-th bit of x_i by w^[(2i + 1) * k]. {x_i} is the input to the -// compression function, i is between 0 and 31, x_i is a 64-bit value. -// One can see the formula for this (intermediate) stage in the SWIFFT FSE 2008 paper -- -// formula (2), section 3, page 6. -swift_int16_t fftTable[256 * EIGHTH_N]; - -// The A's we use in SWIFFTX shall be random elements of Z_257. 
-// We generated these A's from the decimal expansion of PI as follows: we converted each -// triple of digits into a decimal number d. If d < (257 * 3) we used (d % 257) for the next A -// element, otherwise move to the next triple of digits in the expansion. This guarntees that -// the A's are random, provided that PI digits are. -const swift_int16_t As[3 * M * N] = -{141, 78, 139, 75, 238, 205, 129, 126, 22, 245, 197, 169, 142, 118, 105, 78, - 50, 149, 29, 208, 114, 34, 85, 117, 67, 148, 86, 256, 25, 49, 133, 93, - 95, 36, 68, 231, 211, 102, 151, 128, 224, 117, 193, 27, 102, 187, 7, 105, - 45, 130, 108, 124, 171, 151, 189, 128, 218, 134, 233, 165, 14, 201, 145, 134, - 52, 203, 91, 96, 197, 69, 134, 213, 136, 93, 3, 249, 141, 16, 210, 73, - 6, 92, 58, 74, 174, 6, 254, 91, 201, 107, 110, 76, 103, 11, 73, 16, - 34, 209, 7, 127, 146, 254, 95, 176, 57, 13, 108, 245, 77, 92, 186, 117, - 124, 97, 105, 118, 34, 74, 205, 122, 235, 53, 94, 238, 210, 227, 183, 11, - 129, 159, 105, 183, 142, 129, 86, 21, 137, 138, 224, 223, 190, 188, 179, 188, - 256, 25, 217, 176, 36, 176, 238, 127, 160, 210, 155, 148, 132, 0, 54, 127, - 145, 6, 46, 85, 243, 95, 173, 123, 178, 207, 211, 183, 224, 173, 146, 35, - 71, 114, 50, 22, 175, 1, 28, 19, 112, 129, 21, 34, 161, 159, 115, 52, - 4, 193, 211, 92, 115, 49, 59, 217, 218, 96, 61, 81, 24, 202, 198, 89, - 45, 128, 8, 51, 253, 87, 171, 35, 4, 188, 171, 10, 3, 137, 238, 73, - 19, 208, 124, 163, 103, 177, 155, 147, 46, 84, 253, 233, 171, 241, 211, 217, - 159, 48, 96, 79, 237, 18, 171, 226, 99, 1, 97, 195, 216, 163, 198, 95, - 0, 201, 65, 228, 21, 153, 124, 230, 44, 35, 44, 108, 85, 156, 249, 207, - 26, 222, 131, 1, 60, 242, 197, 150, 181, 19, 116, 213, 75, 98, 124, 240, - 123, 207, 62, 255, 60, 143, 187, 157, 139, 9, 12, 104, 89, 49, 193, 146, - 104, 196, 181, 82, 198, 253, 192, 191, 255, 122, 212, 104, 47, 20, 132, 208, - 46, 170, 2, 69, 234, 36, 56, 163, 28, 152, 104, 238, 162, 56, 24, 58, - 38, 150, 193, 254, 253, 125, 173, 35, 73, 126, 247, 239, 216, 6, 199, 15, - 90, 12, 97, 122, 9, 84, 207, 127, 219, 72, 58, 30, 29, 182, 41, 192, - 235, 248, 237, 74, 72, 176, 210, 252, 45, 64, 165, 87, 202, 241, 236, 223, - 151, 242, 119, 239, 52, 112, 169, 28, 13, 37, 160, 60, 158, 81, 133, 60, - 16, 145, 249, 192, 173, 217, 214, 93, 141, 184, 54, 34, 161, 104, 157, 95, - 38, 133, 218, 227, 211, 181, 9, 66, 137, 143, 77, 33, 248, 159, 4, 55, - 228, 48, 99, 219, 222, 184, 15, 36, 254, 256, 157, 237, 87, 139, 209, 113, - 232, 85, 126, 167, 197, 100, 103, 166, 64, 225, 125, 205, 117, 135, 84, 128, - 231, 112, 90, 241, 28, 22, 210, 147, 186, 49, 230, 21, 108, 39, 194, 47, - 123, 199, 107, 114, 30, 210, 250, 143, 59, 156, 131, 133, 221, 27, 76, 99, - 208, 250, 78, 12, 211, 141, 95, 81, 195, 106, 8, 232, 150, 212, 205, 221, - 11, 225, 87, 219, 126, 136, 137, 180, 198, 48, 68, 203, 239, 252, 194, 235, - 142, 137, 174, 172, 190, 145, 250, 221, 182, 204, 1, 195, 130, 153, 83, 241, - 161, 239, 211, 138, 11, 169, 155, 245, 174, 49, 10, 166, 16, 130, 181, 139, - 222, 222, 112, 99, 124, 94, 51, 243, 133, 194, 244, 136, 35, 248, 201, 177, - 178, 186, 129, 102, 89, 184, 180, 41, 149, 96, 165, 72, 225, 231, 134, 158, - 199, 28, 249, 16, 225, 195, 10, 210, 164, 252, 138, 8, 35, 152, 213, 199, - 82, 116, 97, 230, 63, 199, 241, 35, 79, 120, 54, 174, 67, 112, 1, 76, - 69, 222, 194, 96, 82, 94, 25, 228, 196, 145, 155, 136, 228, 234, 46, 101, - 246, 51, 103, 166, 246, 75, 9, 200, 161, 4, 108, 35, 129, 168, 208, 144, - 50, 14, 13, 220, 41, 132, 122, 127, 194, 9, 232, 234, 107, 28, 187, 8, - 51, 141, 
97, 221, 225, 9, 113, 170, 166, 102, 135, 22, 231, 185, 227, 187, - 110, 145, 251, 146, 76, 22, 146, 228, 7, 53, 64, 25, 62, 198, 130, 190, - 221, 232, 169, 64, 188, 199, 237, 249, 173, 218, 196, 191, 48, 224, 5, 113, - 100, 166, 160, 21, 191, 197, 61, 162, 149, 171, 240, 183, 129, 231, 123, 204, - 192, 179, 134, 15, 47, 161, 142, 177, 239, 234, 186, 237, 231, 53, 208, 95, - 146, 36, 225, 231, 89, 142, 93, 248, 137, 124, 83, 39, 69, 77, 89, 208, - 182, 48, 85, 147, 244, 164, 246, 68, 38, 190, 220, 35, 202, 91, 157, 151, - 201, 240, 185, 218, 4, 152, 2, 132, 177, 88, 190, 196, 229, 74, 220, 135, - 137, 196, 11, 47, 5, 251, 106, 144, 163, 60, 222, 127, 52, 57, 202, 102, - 64, 140, 110, 206, 23, 182, 39, 245, 1, 163, 157, 186, 163, 80, 7, 230, - 44, 249, 176, 102, 164, 125, 147, 120, 18, 191, 186, 125, 64, 65, 198, 157, - 164, 213, 95, 61, 13, 181, 208, 91, 242, 197, 158, 34, 98, 169, 91, 14, - 17, 93, 157, 17, 65, 30, 183, 6, 139, 58, 255, 108, 100, 136, 209, 144, - 164, 6, 237, 33, 210, 110, 57, 126, 197, 136, 125, 244, 165, 151, 168, 3, - 143, 251, 247, 155, 136, 130, 88, 14, 74, 121, 250, 133, 21, 226, 185, 232, - 118, 132, 89, 64, 204, 161, 2, 70, 224, 159, 35, 204, 123, 180, 13, 52, - 231, 57, 25, 78, 66, 69, 97, 42, 198, 84, 176, 59, 8, 232, 125, 134, - 193, 2, 232, 109, 216, 69, 90, 142, 32, 38, 249, 37, 75, 180, 184, 188, - 19, 47, 120, 87, 146, 70, 232, 120, 191, 45, 33, 38, 19, 248, 110, 110, - 44, 64, 2, 84, 244, 228, 252, 228, 170, 123, 38, 144, 213, 144, 171, 212, - 243, 87, 189, 46, 128, 110, 84, 77, 65, 183, 61, 184, 101, 44, 168, 68, - 14, 106, 105, 8, 227, 211, 166, 39, 152, 43, 52, 254, 197, 55, 119, 89, - 168, 65, 53, 138, 177, 56, 219, 0, 58, 121, 148, 18, 44, 100, 215, 103, - 145, 229, 117, 196, 91, 89, 113, 143, 172, 239, 249, 184, 154, 39, 112, 65, - 204, 42, 84, 38, 155, 151, 151, 16, 100, 87, 174, 162, 145, 147, 149, 186, - 237, 145, 134, 144, 198, 235, 213, 163, 48, 230, 24, 47, 57, 71, 127, 0, - 150, 219, 12, 81, 197, 150, 131, 13, 169, 63, 175, 184, 48, 235, 65, 243, - 149, 200, 163, 254, 202, 114, 247, 67, 143, 250, 126, 228, 80, 130, 216, 214, - 36, 2, 230, 33, 119, 125, 3, 142, 237, 100, 3, 152, 197, 174, 244, 129, - 232, 30, 206, 199, 39, 210, 220, 43, 237, 221, 201, 54, 179, 42, 28, 133, - 246, 203, 198, 177, 0, 28, 194, 85, 223, 109, 155, 147, 221, 60, 133, 108, - 157, 254, 26, 75, 157, 185, 49, 142, 31, 137, 71, 43, 63, 64, 237, 148, - 237, 172, 159, 160, 155, 254, 234, 224, 140, 193, 114, 140, 62, 109, 136, 39, - 255, 8, 158, 146, 128, 49, 222, 96, 57, 209, 180, 249, 202, 127, 113, 231, - 78, 178, 46, 33, 228, 215, 104, 31, 207, 186, 82, 41, 42, 39, 103, 119, - 123, 133, 243, 254, 238, 156, 90, 186, 37, 212, 33, 107, 252, 51, 177, 36, - 237, 76, 159, 245, 93, 214, 97, 56, 190, 38, 160, 94, 105, 222, 220, 158, - 49, 16, 191, 52, 120, 87, 179, 2, 27, 144, 223, 230, 184, 6, 129, 227, - 69, 47, 215, 181, 162, 139, 72, 200, 45, 163, 159, 62, 2, 221, 124, 40, - 159, 242, 35, 208, 179, 166, 98, 67, 178, 68, 143, 225, 178, 146, 187, 159, - 57, 66, 176, 192, 236, 250, 168, 224, 122, 43, 159, 120, 133, 165, 122, 64, - 87, 74, 161, 241, 9, 87, 90, 24, 255, 113, 203, 220, 57, 139, 197, 159, - 31, 151, 27, 140, 77, 162, 7, 27, 84, 228, 187, 220, 53, 126, 162, 242, - 84, 181, 223, 103, 86, 177, 207, 31, 140, 18, 207, 256, 201, 166, 96, 23, - 233, 103, 197, 84, 161, 75, 59, 149, 138, 154, 119, 92, 16, 53, 116, 97, - 220, 114, 35, 45, 77, 209, 40, 196, 71, 22, 81, 178, 110, 14, 3, 180, - 110, 129, 112, 47, 18, 61, 134, 78, 73, 79, 254, 232, 125, 180, 205, 54, - 220, 119, 
63, 89, 181, 52, 77, 109, 151, 77, 80, 207, 144, 25, 20, 6, - 208, 47, 201, 206, 192, 14, 73, 176, 256, 201, 207, 87, 216, 60, 56, 73, - 92, 243, 179, 113, 49, 59, 55, 168, 121, 137, 69, 154, 95, 57, 187, 47, - 129, 4, 15, 92, 6, 116, 69, 196, 48, 134, 84, 81, 111, 56, 38, 176, - 239, 6, 128, 72, 242, 134, 36, 221, 59, 48, 242, 68, 130, 110, 171, 89, - 13, 220, 48, 29, 5, 75, 104, 233, 91, 129, 105, 162, 44, 113, 163, 163, - 85, 147, 190, 111, 197, 80, 213, 153, 81, 68, 203, 33, 161, 165, 10, 61, - 120, 252, 0, 205, 28, 42, 193, 64, 39, 37, 83, 175, 5, 218, 215, 174, - 128, 121, 231, 11, 150, 145, 135, 197, 136, 91, 193, 5, 107, 88, 82, 6, - 4, 188, 256, 70, 40, 2, 167, 57, 169, 203, 115, 254, 215, 172, 84, 80, - 188, 167, 34, 137, 43, 243, 2, 79, 178, 38, 188, 135, 233, 194, 208, 13, - 11, 151, 231, 196, 12, 122, 162, 56, 17, 114, 191, 207, 90, 132, 64, 238, - 187, 6, 198, 176, 240, 88, 118, 236, 15, 226, 166, 22, 193, 229, 82, 246, - 213, 64, 37, 63, 31, 243, 252, 37, 156, 38, 175, 204, 138, 141, 211, 82, - 106, 217, 97, 139, 153, 56, 129, 218, 158, 9, 83, 26, 87, 112, 71, 21, - 250, 5, 65, 141, 68, 116, 231, 113, 10, 218, 99, 205, 201, 92, 157, 4, - 97, 46, 49, 220, 72, 139, 103, 171, 149, 129, 193, 19, 69, 245, 43, 31, - 58, 68, 36, 195, 159, 22, 54, 34, 233, 141, 205, 100, 226, 96, 22, 192, - 41, 231, 24, 79, 234, 138, 30, 120, 117, 216, 172, 197, 172, 107, 86, 29, - 181, 151, 0, 6, 146, 186, 68, 55, 54, 58, 213, 182, 60, 231, 33, 232, - 77, 210, 216, 154, 80, 51, 141, 122, 68, 148, 219, 122, 254, 48, 64, 175, - 41, 115, 62, 243, 141, 81, 119, 121, 5, 68, 121, 88, 239, 29, 230, 90, - 135, 159, 35, 223, 168, 112, 49, 37, 146, 60, 126, 134, 42, 145, 115, 90, - 73, 133, 211, 86, 120, 141, 122, 241, 127, 56, 130, 36, 174, 75, 83, 246, - 112, 45, 136, 194, 201, 115, 1, 156, 114, 167, 208, 12, 176, 147, 32, 170, - 251, 100, 102, 220, 122, 210, 6, 49, 75, 201, 38, 105, 132, 135, 126, 102, - 13, 121, 76, 228, 202, 20, 61, 213, 246, 13, 207, 42, 148, 168, 37, 253, - 34, 94, 141, 185, 18, 234, 157, 109, 104, 64, 250, 125, 49, 236, 86, 48, - 196, 77, 75, 237, 156, 103, 225, 19, 110, 229, 22, 68, 177, 93, 221, 181, - 152, 153, 61, 108, 101, 74, 247, 195, 127, 216, 30, 166, 168, 61, 83, 229, - 120, 156, 96, 120, 201, 124, 43, 27, 253, 250, 120, 143, 89, 235, 189, 243, - 150, 7, 127, 119, 149, 244, 84, 185, 134, 34, 128, 193, 236, 234, 132, 117, - 137, 32, 145, 184, 44, 121, 51, 76, 11, 228, 142, 251, 39, 77, 228, 251, - 41, 58, 246, 107, 125, 187, 9, 240, 35, 8, 11, 162, 242, 220, 158, 163, - 2, 184, 163, 227, 242, 2, 100, 101, 2, 78, 129, 34, 89, 28, 26, 157, - 79, 31, 107, 250, 194, 156, 186, 69, 212, 66, 41, 180, 139, 42, 211, 253, - 256, 239, 29, 129, 104, 248, 182, 68, 1, 189, 48, 226, 36, 229, 3, 158, - 41, 53, 241, 22, 115, 174, 16, 163, 224, 19, 112, 219, 177, 233, 42, 27, - 250, 134, 18, 28, 145, 122, 68, 34, 134, 31, 147, 17, 39, 188, 150, 76, - 45, 42, 167, 249, 12, 16, 23, 182, 13, 79, 121, 3, 70, 197, 239, 44, - 86, 177, 255, 81, 64, 171, 138, 131, 73, 110, 44, 201, 254, 198, 146, 91, - 48, 9, 104, 31, 29, 161, 101, 31, 138, 180, 231, 233, 79, 137, 61, 236, - 140, 15, 249, 218, 234, 119, 99, 195, 110, 137, 237, 207, 8, 31, 45, 24, - 90, 155, 203, 253, 192, 203, 65, 176, 210, 171, 142, 214, 220, 122, 136, 237, - 189, 186, 147, 40, 80, 254, 173, 33, 191, 46, 192, 26, 108, 255, 228, 205, - 61, 76, 39, 107, 225, 126, 228, 182, 140, 251, 143, 134, 252, 168, 221, 8, - 185, 85, 60, 233, 147, 244, 87, 137, 8, 140, 96, 80, 53, 45, 175, 160, - 124, 189, 112, 37, 144, 19, 70, 17, 170, 242, 2, 3, 
28, 95, 120, 199, - 212, 43, 9, 117, 86, 151, 101, 241, 200, 145, 241, 19, 178, 69, 204, 197, - 227, 166, 94, 7, 193, 45, 247, 234, 19, 187, 212, 212, 236, 125, 33, 95, - 198, 121, 122, 103, 77, 155, 235, 49, 25, 237, 249, 11, 162, 7, 238, 24, - 16, 150, 129, 25, 152, 17, 42, 67, 247, 162, 77, 154, 31, 133, 55, 137, - 79, 119, 153, 10, 86, 28, 244, 186, 41, 169, 106, 44, 10, 49, 110, 179, - 32, 133, 155, 244, 61, 70, 131, 168, 170, 39, 231, 252, 32, 69, 92, 238, - 239, 35, 132, 136, 236, 167, 90, 32, 123, 88, 69, 22, 20, 89, 145, 166, - 30, 118, 75, 4, 49, 31, 225, 54, 11, 50, 56, 191, 246, 1, 187, 33, - 119, 107, 139, 68, 19, 240, 131, 55, 94, 113, 31, 252, 12, 179, 121, 2, - 120, 252, 0, 76, 41, 80, 185, 42, 62, 121, 105, 159, 121, 109, 111, 98, - 7, 118, 86, 29, 210, 70, 231, 179, 223, 229, 164, 70, 62, 47, 0, 206, - 204, 178, 168, 120, 224, 166, 99, 25, 103, 63, 246, 224, 117, 204, 75, 124, - 140, 133, 110, 110, 222, 88, 151, 118, 46, 37, 22, 143, 158, 40, 2, 50, - 153, 94, 190, 199, 13, 198, 127, 211, 180, 90, 183, 98, 0, 142, 210, 154, - 100, 187, 67, 231, 202, 100, 198, 235, 252, 160, 247, 124, 247, 14, 121, 221, - 57, 88, 253, 243, 185, 89, 45, 249, 221, 194, 108, 175, 193, 119, 50, 141, - 223, 133, 136, 64, 176, 250, 129, 100, 124, 94, 181, 159, 99, 185, 177, 240, - 135, 42, 103, 52, 202, 208, 143, 186, 193, 103, 154, 237, 102, 88, 225, 161, - 50, 188, 191, 109, 12, 87, 19, 227, 247, 183, 13, 52, 205, 170, 205, 146, - 89, 160, 18, 105, 192, 73, 231, 225, 184, 157, 252, 220, 61, 59, 169, 183, - 221, 20, 141, 20, 158, 101, 245, 7, 245, 225, 118, 137, 84, 55, 19, 27, - 164, 110, 35, 25, 202, 94, 150, 46, 91, 152, 130, 1, 7, 46, 16, 237, - 171, 109, 19, 200, 65, 38, 10, 213, 70, 96, 126, 226, 185, 225, 181, 46, - 10, 165, 11, 123, 53, 158, 22, 147, 64, 22, 227, 69, 182, 237, 197, 37, - 39, 49, 186, 223, 139, 128, 55, 36, 166, 178, 220, 20, 98, 172, 166, 253, - 45, 0, 120, 180, 189, 185, 158, 159, 196, 6, 214, 79, 141, 52, 156, 107, - 5, 109, 142, 159, 33, 64, 190, 133, 95, 132, 95, 202, 160, 63, 186, 23, - 231, 107, 163, 33, 234, 15, 244, 77, 108, 49, 51, 7, 164, 87, 142, 99, - 240, 202, 47, 256, 118, 190, 196, 178, 217, 42, 39, 153, 21, 192, 232, 202, - 14, 82, 179, 64, 233, 4, 219, 10, 133, 78, 43, 144, 146, 216, 202, 81, - 71, 252, 8, 201, 68, 256, 85, 233, 164, 88, 176, 30, 5, 152, 126, 179, - 249, 84, 140, 190, 159, 54, 118, 98, 2, 159, 27, 133, 74, 121, 239, 196, - 71, 149, 119, 135, 102, 20, 87, 112, 44, 75, 221, 3, 151, 158, 5, 98, - 152, 25, 97, 106, 63, 171, 240, 79, 234, 240, 230, 92, 76, 70, 173, 196, - 36, 225, 218, 133, 64, 240, 150, 41, 146, 66, 133, 51, 134, 73, 170, 238, - 140, 90, 45, 89, 46, 147, 96, 169, 174, 174, 244, 151, 90, 40, 32, 74, - 38, 154, 246, 57, 31, 14, 189, 151, 83, 243, 197, 183, 220, 185, 53, 225, - 51, 106, 188, 208, 222, 248, 93, 13, 93, 215, 131, 25, 142, 185, 113, 222, - 131, 215, 149, 50, 159, 85, 32, 5, 205, 192, 2, 227, 42, 214, 197, 42, - 126, 182, 68, 123, 109, 36, 237, 179, 170, 199, 77, 256, 5, 128, 214, 243, - 137, 177, 170, 253, 179, 180, 153, 236, 100, 196, 216, 231, 198, 37, 192, 80, - 121, 221, 246, 1, 16, 246, 29, 78, 64, 148, 124, 38, 96, 125, 28, 20, - 48, 51, 73, 187, 139, 208, 98, 253, 221, 188, 84, 129, 1, 205, 95, 205, - 117, 79, 71, 126, 134, 237, 19, 184, 137, 125, 129, 178, 223, 54, 188, 112, - 30, 7, 225, 228, 205, 184, 233, 87, 117, 22, 58, 10, 8, 42, 2, 114, - 254, 19, 17, 13, 150, 92, 233, 179, 63, 12, 60, 171, 127, 35, 50, 5, - 195, 113, 241, 25, 249, 184, 166, 44, 221, 35, 151, 116, 8, 54, 195, 89, - 218, 186, 132, 
5, 41, 89, 226, 177, 11, 41, 87, 172, 5, 23, 20, 59, - 228, 94, 76, 33, 137, 43, 151, 221, 61, 232, 4, 120, 93, 217, 80, 228, - 228, 6, 58, 25, 62, 84, 91, 48, 209, 20, 247, 243, 55, 106, 80, 79, - 235, 34, 20, 180, 146, 2, 236, 13, 236, 206, 243, 222, 204, 83, 148, 213, - 214, 117, 237, 98, 0, 90, 204, 168, 32, 41, 126, 67, 191, 74, 27, 255, - 26, 75, 240, 113, 185, 105, 167, 154, 112, 67, 151, 63, 161, 134, 239, 176, - 42, 87, 249, 130, 45, 242, 17, 100, 107, 120, 212, 218, 237, 76, 231, 162, - 175, 172, 118, 155, 92, 36, 124, 17, 121, 71, 13, 9, 82, 126, 147, 142, - 218, 148, 138, 80, 163, 106, 164, 123, 140, 129, 35, 42, 186, 154, 228, 214, - 75, 73, 8, 253, 42, 153, 232, 164, 95, 24, 110, 90, 231, 197, 90, 196, - 57, 164, 252, 181, 31, 7, 97, 256, 35, 77, 200, 212, 99, 179, 92, 227, - 17, 180, 49, 176, 9, 188, 13, 182, 93, 44, 128, 219, 134, 92, 151, 6, - 23, 126, 200, 109, 66, 30, 140, 180, 146, 134, 67, 200, 7, 9, 223, 168, - 186, 221, 3, 154, 150, 165, 43, 53, 138, 27, 86, 213, 235, 160, 70, 2, - 240, 20, 89, 212, 84, 141, 168, 246, 183, 227, 30, 167, 138, 185, 253, 83, - 52, 143, 236, 94, 59, 65, 89, 218, 194, 157, 164, 156, 111, 95, 202, 168, - 245, 256, 151, 28, 222, 194, 72, 130, 217, 134, 253, 77, 246, 100, 76, 32, - 254, 174, 182, 193, 14, 237, 74, 1, 74, 26, 135, 216, 152, 208, 112, 38, - 181, 62, 25, 71, 61, 234, 254, 97, 191, 23, 92, 256, 190, 205, 6, 16, - 134, 147, 210, 219, 148, 59, 73, 185, 24, 247, 174, 143, 116, 220, 128, 144, - 111, 126, 101, 98, 130, 136, 101, 102, 69, 127, 24, 168, 146, 226, 226, 207, - 176, 122, 149, 254, 134, 196, 22, 151, 197, 21, 50, 205, 116, 154, 65, 116, - 177, 224, 127, 77, 177, 159, 225, 69, 176, 54, 100, 104, 140, 8, 11, 126, - 11, 188, 185, 159, 107, 16, 254, 142, 80, 28, 5, 157, 104, 57, 109, 82, - 102, 80, 173, 242, 238, 207, 57, 105, 237, 160, 59, 189, 189, 199, 26, 11, - 190, 156, 97, 118, 20, 12, 254, 189, 165, 147, 142, 199, 5, 213, 64, 133, - 108, 217, 133, 60, 94, 28, 116, 136, 47, 165, 125, 42, 183, 143, 14, 129, - 223, 70, 212, 205, 181, 180, 3, 201, 182, 46, 57, 104, 239, 60, 99, 181, - 220, 231, 45, 79, 156, 89, 149, 143, 190, 103, 153, 61, 235, 73, 136, 20, - 89, 243, 16, 130, 247, 141, 134, 93, 80, 68, 85, 84, 8, 72, 194, 4, - 242, 110, 19, 133, 199, 70, 172, 92, 132, 254, 67, 74, 36, 94, 13, 90, - 154, 184, 9, 109, 118, 243, 214, 71, 36, 95, 0, 90, 201, 105, 112, 215, - 69, 196, 224, 210, 236, 242, 155, 211, 37, 134, 69, 113, 157, 97, 68, 26, - 230, 149, 219, 180, 20, 76, 172, 145, 154, 40, 129, 8, 93, 56, 162, 124, - 207, 233, 105, 19, 3, 183, 155, 134, 8, 244, 213, 78, 139, 88, 156, 37, - 51, 152, 111, 102, 112, 250, 114, 252, 201, 241, 133, 24, 136, 153, 5, 90, - 210, 197, 216, 24, 131, 17, 147, 246, 13, 86, 3, 253, 179, 237, 101, 114, - 243, 191, 207, 2, 220, 133, 244, 53, 87, 125, 154, 158, 197, 20, 8, 83, - 32, 191, 38, 241, 204, 22, 168, 59, 217, 123, 162, 82, 21, 50, 130, 89, - 239, 253, 195, 56, 253, 74, 147, 125, 234, 199, 250, 28, 65, 193, 22, 237, - 193, 94, 58, 229, 139, 176, 69, 42, 179, 164, 150, 168, 246, 214, 86, 174, - 59, 117, 15, 19, 76, 37, 214, 238, 153, 226, 154, 45, 109, 114, 198, 107, - 45, 70, 238, 196, 142, 252, 244, 71, 123, 136, 134, 188, 99, 132, 25, 42, - 240, 0, 196, 33, 26, 124, 256, 145, 27, 102, 153, 35, 28, 132, 221, 167, - 138, 133, 41, 170, 95, 224, 40, 139, 239, 153, 1, 106, 255, 106, 170, 163, - 127, 44, 155, 232, 194, 119, 232, 117, 239, 143, 108, 41, 3, 9, 180, 256, - 144, 113, 133, 200, 79, 69, 128, 216, 31, 50, 102, 209, 249, 136, 150, 154, - 182, 51, 228, 39, 127, 142, 87, 15, 
94, 92, 187, 245, 31, 236, 64, 58, - 114, 11, 17, 166, 189, 152, 218, 34, 123, 39, 58, 37, 153, 91, 63, 121, - 31, 34, 12, 254, 106, 96, 171, 14, 155, 247, 214, 69, 24, 98, 3, 204, - 202, 194, 207, 30, 253, 44, 119, 70, 14, 96, 82, 250, 63, 6, 232, 38, - 89, 144, 102, 191, 82, 254, 20, 222, 96, 162, 110, 6, 159, 58, 200, 226, - 98, 128, 42, 70, 84, 247, 128, 211, 136, 54, 143, 166, 60, 118, 99, 218, - 27, 193, 85, 81, 219, 223, 46, 41, 23, 233, 152, 222, 36, 236, 54, 181, - 56, 50, 4, 207, 129, 92, 78, 88, 197, 251, 131, 105, 31, 172, 38, 131, - 19, 204, 129, 47, 227, 106, 202, 183, 23, 6, 77, 224, 102, 147, 11, 218, - 131, 132, 60, 192, 208, 223, 236, 23, 103, 115, 89, 18, 185, 171, 70, 174, - 139, 0, 100, 160, 221, 11, 228, 60, 12, 122, 114, 12, 157, 235, 148, 57, - 83, 62, 173, 131, 169, 126, 85, 99, 93, 243, 81, 80, 29, 245, 206, 82, - 236, 227, 166, 14, 230, 213, 144, 97, 27, 111, 99, 164, 105, 150, 89, 111, - 252, 118, 140, 232, 120, 183, 137, 213, 232, 157, 224, 33, 134, 118, 186, 80, - 159, 2, 186, 193, 54, 242, 25, 237, 232, 249, 226, 213, 90, 149, 90, 160, - 118, 69, 64, 37, 10, 183, 109, 246, 30, 52, 219, 69, 189, 26, 116, 220, - 50, 244, 243, 243, 139, 137, 232, 98, 38, 45, 256, 143, 171, 101, 73, 238, - 123, 45, 194, 167, 250, 123, 12, 29, 136, 237, 141, 21, 89, 96, 199, 44, - 8, 214, 208, 17, 113, 41, 137, 26, 166, 155, 89, 85, 54, 58, 97, 160, - 50, 239, 58, 71, 21, 157, 139, 12, 37, 198, 182, 131, 149, 134, 16, 204, - 164, 181, 248, 166, 52, 216, 136, 201, 37, 255, 187, 240, 5, 101, 147, 231, - 14, 163, 253, 134, 146, 216, 8, 54, 224, 90, 220, 195, 75, 215, 186, 58, - 71, 204, 124, 105, 239, 53, 16, 85, 69, 163, 195, 223, 33, 38, 69, 88, - 88, 203, 99, 55, 176, 13, 156, 204, 236, 99, 194, 134, 75, 247, 126, 129, - 160, 124, 233, 206, 139, 144, 154, 45, 233, 51, 206, 61, 60, 55, 205, 107, - 84, 108, 96, 188, 203, 31, 89, 20, 115, 144, 137, 90, 237, 78, 231, 185, - 120, 217, 1, 176, 169, 30, 155, 176, 100, 113, 53, 42, 193, 108, 14, 121, - 176, 158, 137, 92, 178, 44, 110, 249, 108, 234, 94, 101, 128, 12, 250, 173, - 72, 202, 232, 66, 139, 152, 189, 18, 32, 197, 9, 238, 246, 55, 119, 183, - 196, 119, 113, 247, 191, 100, 200, 245, 46, 16, 234, 112, 136, 116, 232, 48, - 176, 108, 11, 237, 14, 153, 93, 177, 124, 72, 67, 121, 135, 143, 45, 18, - 97, 251, 184, 172, 136, 55, 213, 8, 103, 12, 221, 212, 13, 160, 116, 91, - 237, 127, 218, 190, 103, 131, 77, 82, 36, 100, 22, 252, 79, 69, 54, 26, - 65, 182, 115, 142, 247, 20, 89, 81, 188, 244, 27, 120, 240, 248, 13, 230, - 67, 133, 32, 201, 129, 87, 9, 245, 66, 88, 166, 34, 46, 184, 119, 218, - 144, 235, 163, 40, 138, 134, 127, 217, 64, 227, 116, 67, 55, 202, 130, 48, - 199, 42, 251, 112, 124, 153, 123, 194, 243, 49, 250, 12, 78, 157, 167, 134, - 210, 73, 156, 102, 21, 88, 216, 123, 45, 11, 208, 18, 47, 187, 20, 43, - 3, 180, 124, 2, 136, 176, 77, 111, 138, 139, 91, 225, 126, 8, 74, 255, - 88, 192, 193, 239, 138, 204, 139, 194, 166, 130, 252, 184, 140, 168, 30, 177, - 121, 98, 131, 124, 69, 171, 75, 49, 184, 34, 76, 122, 202, 115, 184, 253, - 120, 182, 33, 251, 1, 74, 216, 217, 243, 168, 70, 162, 119, 158, 197, 198, - 61, 89, 7, 5, 54, 199, 211, 170, 23, 226, 44, 247, 165, 195, 7, 225, - 91, 23, 50, 15, 51, 208, 106, 94, 12, 31, 43, 112, 146, 139, 246, 182, - 113, 1, 97, 15, 66, 2, 51, 76, 164, 184, 237, 200, 218, 176, 72, 98, - 33, 135, 38, 147, 140, 229, 50, 94, 81, 187, 129, 17, 238, 168, 146, 203, - 181, 99, 164, 3, 104, 98, 255, 189, 114, 142, 86, 102, 229, 102, 80, 129, - 64, 84, 79, 161, 81, 156, 128, 111, 164, 197, 18, 15, 55, 
196, 198, 191, - 28, 113, 117, 96, 207, 253, 19, 158, 231, 13, 53, 130, 252, 211, 58, 180, - 212, 142, 7, 219, 38, 81, 62, 109, 167, 113, 33, 56, 97, 185, 157, 130, - 186, 129, 119, 182, 196, 26, 54, 110, 65, 170, 166, 236, 30, 22, 162, 0, - 106, 12, 248, 33, 48, 72, 159, 17, 76, 244, 172, 132, 89, 171, 196, 76, - 254, 166, 76, 218, 226, 3, 52, 220, 238, 181, 179, 144, 225, 23, 3, 166, - 158, 35, 228, 154, 204, 23, 203, 71, 134, 189, 18, 168, 236, 141, 117, 138, - 2, 132, 78, 57, 154, 21, 250, 196, 184, 40, 161, 40, 10, 178, 134, 120, - 132, 123, 101, 82, 205, 121, 55, 140, 231, 56, 231, 71, 206, 246, 198, 150, - 146, 192, 45, 105, 242, 1, 125, 18, 176, 46, 222, 122, 19, 80, 113, 133, - 131, 162, 81, 51, 98, 168, 247, 161, 139, 39, 63, 162, 22, 153, 170, 92, - 91, 130, 174, 200, 45, 112, 99, 164, 132, 184, 191, 186, 200, 167, 86, 145, - 167, 227, 130, 44, 12, 158, 172, 249, 204, 17, 54, 249, 16, 200, 21, 174, - 67, 223, 105, 201, 50, 36, 133, 203, 244, 131, 228, 67, 29, 195, 91, 91, - 55, 107, 167, 154, 170, 137, 218, 183, 169, 61, 99, 175, 128, 23, 142, 183, - 66, 255, 59, 187, 66, 85, 212, 109, 168, 82, 16, 43, 67, 139, 114, 176, - 216, 255, 130, 94, 152, 79, 183, 64, 100, 23, 214, 82, 34, 230, 48, 15, - 242, 130, 50, 241, 81, 32, 5, 125, 183, 182, 184, 99, 248, 109, 159, 210, - 226, 61, 119, 129, 39, 149, 78, 214, 107, 78, 147, 124, 228, 18, 143, 188, - 84, 180, 233, 119, 64, 39, 158, 133, 177, 168, 6, 150, 80, 117, 150, 56, - 49, 72, 49, 37, 30, 242, 49, 142, 33, 156, 34, 44, 44, 72, 58, 22, - 249, 46, 168, 80, 25, 196, 64, 174, 97, 179, 244, 134, 213, 105, 63, 151, - 21, 90, 168, 90, 245, 28, 157, 65, 250, 232, 188, 27, 99, 160, 156, 127, - 68, 193, 10, 80, 205, 36, 138, 229, 12, 223, 70, 169, 251, 41, 48, 94, - 41, 177, 99, 256, 158, 0, 6, 83, 231, 191, 120, 135, 157, 146, 218, 213, - 160, 7, 47, 234, 98, 211, 79, 225, 179, 95, 175, 105, 185, 79, 115, 0, - 104, 14, 65, 124, 15, 188, 52, 9, 253, 27, 132, 137, 13, 127, 75, 238, - 185, 253, 33, 8, 52, 157, 164, 68, 232, 188, 69, 28, 209, 233, 5, 129, - 216, 90, 252, 212, 33, 200, 222, 9, 112, 15, 43, 36, 226, 114, 15, 249, - 217, 8, 148, 22, 147, 23, 143, 67, 222, 116, 235, 250, 212, 210, 39, 142, - 108, 64, 209, 83, 73, 66, 99, 34, 17, 29, 45, 151, 244, 114, 28, 241, - 144, 208, 146, 179, 132, 89, 217, 198, 252, 219, 205, 165, 75, 107, 11, 173, - 76, 6, 196, 247, 152, 216, 248, 91, 209, 178, 57, 250, 174, 60, 79, 123, - 18, 135, 9, 241, 230, 159, 184, 68, 156, 251, 215, 9, 113, 234, 75, 235, - 103, 194, 205, 129, 230, 45, 96, 73, 157, 20, 200, 212, 212, 228, 161, 7, - 231, 228, 108, 43, 198, 87, 140, 140, 4, 182, 164, 3, 53, 104, 250, 213, - 85, 38, 89, 61, 52, 187, 35, 204, 86, 249, 100, 71, 248, 213, 163, 215, - 66, 106, 252, 129, 40, 111, 47, 24, 186, 221, 85, 205, 199, 237, 122, 181, - 32, 46, 182, 135, 33, 251, 142, 34, 208, 242, 128, 255, 4, 234, 15, 33, - 167, 222, 32, 186, 191, 34, 255, 244, 98, 240, 228, 204, 30, 142, 32, 70, - 69, 83, 110, 151, 10, 243, 141, 21, 223, 69, 61, 37, 59, 209, 102, 114, - 223, 33, 129, 254, 255, 103, 86, 247, 235, 72, 126, 177, 102, 226, 102, 30, - 149, 221, 62, 247, 251, 120, 163, 173, 57, 202, 204, 24, 39, 106, 120, 143, - 202, 176, 191, 147, 37, 38, 51, 133, 47, 245, 157, 132, 154, 71, 183, 111, - 30, 180, 18, 202, 82, 96, 170, 91, 157, 181, 212, 140, 256, 8, 196, 121, - 149, 79, 66, 127, 113, 78, 4, 197, 84, 256, 111, 222, 102, 63, 228, 104, - 136, 223, 67, 193, 93, 154, 249, 83, 204, 101, 200, 234, 84, 252, 230, 195, - 43, 140, 120, 242, 89, 63, 166, 233, 209, 94, 43, 170, 126, 5, 205, 78, - 
112, 80, 143, 151, 146, 248, 137, 203, 45, 183, 61, 1, 155, 8, 102, 59, - 68, 212, 230, 61, 254, 191, 128, 223, 176, 123, 229, 27, 146, 120, 96, 165, - 213, 12, 232, 40, 186, 225, 66, 105, 200, 195, 212, 110, 237, 238, 151, 19, - 12, 171, 150, 82, 7, 228, 79, 52, 15, 78, 62, 43, 21, 154, 114, 21, - 12, 212, 256, 232, 125, 127, 5, 51, 37, 252, 136, 13, 47, 195, 168, 191, - 231, 55, 57, 251, 214, 116, 15, 86, 210, 41, 249, 242, 119, 27, 250, 203, - 107, 69, 90, 43, 206, 154, 127, 54, 100, 78, 187, 54, 244, 177, 234, 167, - 202, 136, 209, 171, 69, 114, 133, 173, 26, 139, 78, 141, 128, 32, 124, 39, - 45, 218, 96, 68, 90, 44, 67, 62, 83, 190, 188, 256, 103, 42, 102, 64, - 249, 0, 141, 11, 61, 69, 70, 66, 233, 237, 29, 200, 251, 157, 71, 51, - 64, 133, 113, 76, 35, 125, 76, 137, 217, 145, 35, 69, 226, 180, 56, 249, - 156, 163, 176, 237, 81, 54, 85, 169, 115, 211, 129, 70, 248, 40, 252, 192, - 194, 101, 247, 8, 181, 124, 217, 191, 194, 93, 99, 127, 117, 177, 144, 151, - 228, 121, 32, 11, 89, 81, 26, 29, 183, 76, 249, 132, 179, 70, 34, 102, - 20, 66, 87, 63, 124, 205, 174, 177, 87, 219, 73, 218, 91, 87, 176, 72, - 15, 211, 47, 61, 251, 165, 39, 247, 146, 70, 150, 57, 1, 212, 36, 162, - 39, 38, 16, 216, 3, 50, 116, 200, 32, 234, 77, 181, 155, 19, 90, 188, - 36, 6, 254, 46, 46, 203, 25, 230, 181, 196, 4, 151, 225, 65, 122, 216, - 168, 86, 158, 131, 136, 16, 49, 102, 233, 64, 154, 88, 228, 52, 146, 69, - 93, 157, 243, 121, 70, 209, 126, 213, 88, 145, 236, 65, 70, 96, 204, 47, - 10, 200, 77, 8, 103, 150, 48, 153, 5, 37, 52, 235, 209, 31, 181, 126, - 83, 142, 224, 140, 6, 32, 200, 171, 160, 179, 115, 229, 75, 194, 208, 39, - 59, 223, 52, 247, 38, 197, 135, 1, 6, 189, 106, 114, 168, 5, 211, 222, - 44, 63, 90, 160, 116, 172, 170, 133, 125, 138, 39, 131, 23, 178, 10, 214, - 36, 93, 28, 59, 68, 17, 123, 25, 255, 184, 204, 102, 194, 214, 129, 94, - 159, 245, 112, 141, 62, 11, 61, 197, 124, 221, 205, 11, 79, 71, 201, 54, - 58, 150, 29, 121, 87, 46, 240, 201, 68, 20, 194, 209, 47, 152, 158, 174, - 193, 164, 120, 255, 216, 165, 247, 58, 85, 130, 220, 23, 122, 223, 188, 98, - 21, 70, 72, 170, 150, 237, 76, 143, 112, 238, 206, 146, 215, 110, 4, 250, - 68, 44, 174, 177, 30, 98, 143, 241, 180, 127, 113, 48, 0, 1, 179, 199, - 59, 106, 201, 114, 29, 86, 173, 133, 217, 44, 200, 141, 107, 172, 16, 60, - 82, 58, 239, 94, 141, 234, 186, 235, 109, 173, 249, 139, 141, 59, 100, 248, - 84, 144, 49, 160, 51, 207, 164, 103, 74, 97, 146, 202, 193, 125, 168, 134, - 236, 111, 135, 121, 59, 145, 168, 200, 181, 173, 109, 2, 255, 6, 9, 245, - 90, 202, 214, 143, 121, 65, 85, 232, 132, 77, 228, 84, 26, 54, 184, 15, - 161, 29, 177, 79, 43, 0, 156, 184, 163, 165, 62, 90, 179, 93, 45, 239, - 1, 16, 120, 189, 127, 47, 74, 166, 20, 214, 233, 226, 89, 217, 229, 26, - 156, 53, 162, 60, 21, 3, 192, 72, 111, 51, 53, 101, 181, 208, 88, 82, - 179, 160, 219, 113, 240, 108, 43, 224, 162, 147, 62, 14, 95, 81, 205, 4, - 160, 177, 225, 115, 29, 69, 235, 168, 148, 29, 128, 114, 124, 129, 172, 165, - 215, 231, 214, 86, 160, 44, 157, 91, 248, 183, 73, 164, 56, 181, 162, 92, - 141, 118, 127, 240, 196, 77, 0, 9, 244, 79, 250, 100, 195, 25, 255, 85, - 94, 35, 212, 137, 107, 34, 110, 20, 200, 104, 17, 32, 231, 43, 150, 159, - 231, 216, 223, 190, 226, 109, 162, 197, 87, 92, 224, 11, 111, 73, 60, 225, - 238, 73, 246, 169, 19, 217, 119, 38, 121, 118, 70, 82, 99, 241, 110, 67, - 31, 76, 146, 215, 124, 240, 31, 103, 139, 224, 75, 160, 31, 78, 93, 4, - 64, 9, 103, 223, 6, 227, 119, 85, 116, 81, 21, 43, 46, 206, 234, 132, - 85, 99, 22, 131, 135, 97, 86, 13, 234, 
188, 21, 14, 89, 169, 207, 238, - 219, 177, 190, 72, 157, 41, 114, 140, 92, 141, 186, 1, 63, 107, 225, 184, - 118, 150, 153, 254, 241, 106, 120, 210, 104, 144, 151, 161, 88, 206, 125, 164, - 15, 211, 173, 49, 146, 241, 71, 36, 58, 201, 46, 27, 33, 187, 91, 162, - 117, 19, 210, 213, 187, 97, 193, 50, 190, 114, 217, 60, 61, 167, 207, 213, - 213, 53, 135, 34, 156, 91, 115, 119, 46, 99, 242, 1, 90, 52, 198, 227, - 201, 91, 216, 146, 210, 82, 121, 38, 73, 133, 182, 193, 132, 148, 246, 75, - 109, 157, 179, 113, 176, 134, 205, 159, 148, 58, 103, 171, 132, 156, 133, 147, - 161, 231, 39, 100, 175, 97, 125, 28, 183, 129, 135, 191, 202, 181, 29, 218, - 43, 104, 148, 203, 189, 204, 4, 182, 169, 1, 134, 122, 141, 202, 13, 187, - 177, 112, 162, 35, 231, 6, 8, 241, 99, 6, 191, 45, 113, 113, 101, 104}; - -// The S-Box we use for further linearity breaking. -// We created it by taking the digits of decimal expansion of e. -// The code that created it can be found in 'ProduceRandomSBox.c'. -unsigned char SBox[256] = { -//0 1 2 3 4 5 6 7 8 9 A B C D E F -0x7d, 0xd1, 0x70, 0x0b, 0xfa, 0x39, 0x18, 0xc3, 0xf3, 0xbb, 0xa7, 0xd4, 0x84, 0x25, 0x3b, 0x3c, // 0 -0x2c, 0x15, 0x69, 0x9a, 0xf9, 0x27, 0xfb, 0x02, 0x52, 0xba, 0xa8, 0x4b, 0x20, 0xb5, 0x8b, 0x3a, // 1 -0x88, 0x8e, 0x26, 0xcb, 0x71, 0x5e, 0xaf, 0xad, 0x0c, 0xac, 0xa1, 0x93, 0xc6, 0x78, 0xce, 0xfc, // 2 -0x2a, 0x76, 0x17, 0x1f, 0x62, 0xc2, 0x2e, 0x99, 0x11, 0x37, 0x65, 0x40, 0xfd, 0xa0, 0x03, 0xc1, // 3 -0xca, 0x48, 0xe2, 0x9b, 0x81, 0xe4, 0x1c, 0x01, 0xec, 0x68, 0x7a, 0x5a, 0x50, 0xf8, 0x0e, 0xa3, // 4 -0xe8, 0x61, 0x2b, 0xa2, 0xeb, 0xcf, 0x8c, 0x3d, 0xb4, 0x95, 0x13, 0x08, 0x46, 0xab, 0x91, 0x7b, // 5 -0xea, 0x55, 0x67, 0x9d, 0xdd, 0x29, 0x6a, 0x8f, 0x9f, 0x22, 0x4e, 0xf2, 0x57, 0xd2, 0xa9, 0xbd, // 6 -0x38, 0x16, 0x5f, 0x4c, 0xf7, 0x9e, 0x1b, 0x2f, 0x30, 0xc7, 0x41, 0x24, 0x5c, 0xbf, 0x05, 0xf6, // 7 -0x0a, 0x31, 0xa5, 0x45, 0x21, 0x33, 0x6b, 0x6d, 0x6c, 0x86, 0xe1, 0xa4, 0xe6, 0x92, 0x9c, 0xdf, // 8 -0xe7, 0xbe, 0x28, 0xe3, 0xfe, 0x06, 0x4d, 0x98, 0x80, 0x04, 0x96, 0x36, 0x3e, 0x14, 0x4a, 0x34, // 9 -0xd3, 0xd5, 0xdb, 0x44, 0xcd, 0xf5, 0x54, 0xdc, 0x89, 0x09, 0x90, 0x42, 0x87, 0xff, 0x7e, 0x56, // A -0x5d, 0x59, 0xd7, 0x23, 0x75, 0x19, 0x97, 0x73, 0x83, 0x64, 0x53, 0xa6, 0x1e, 0xd8, 0xb0, 0x49, // B -0x3f, 0xef, 0xbc, 0x7f, 0x43, 0xf0, 0xc9, 0x72, 0x0f, 0x63, 0x79, 0x2d, 0xc0, 0xda, 0x66, 0xc8, // C -0x32, 0xde, 0x47, 0x07, 0xb8, 0xe9, 0x1d, 0xc4, 0x85, 0x74, 0x82, 0xcc, 0x60, 0x51, 0x77, 0x0d, // D -0xaa, 0x35, 0xed, 0x58, 0x7c, 0x5b, 0xb9, 0x94, 0x6e, 0x8d, 0xb1, 0xc5, 0xb7, 0xee, 0xb6, 0xae, // E -0x10, 0xe0, 0xd6, 0xd9, 0xe5, 0x4f, 0xf1, 0x12, 0x00, 0xd0, 0xf4, 0x1a, 0x6f, 0x8a, 0xb3, 0xb2 }; // F - -/////////////////////////////////////////////////////////////////////////////////////////////// -// -// Helper functions definition portion. -// -/////////////////////////////////////////////////////////////////////////////////////////////// - -// Don't vectorize, move decl to header file - -// Translates an input array with values in base 257 to output array with values in base 256. -// Returns the carry bit. -// -// Parameters: -// - input: the input array of size EIGHTH_N. Each value in the array is a number in Z_257. -// The MSB is assumed to be the last one in the array. -// - output: the input array encoded in base 256. -// -// Returns: -// - The carry bit (MSB). 
-swift_int16_t TranslateToBase256(swift_int32_t input[EIGHTH_N], unsigned char output[EIGHTH_N]); - -// Translates an input integer into the range (-FIELD_SIZE / 2) <= result <= (FIELD_SIZE / 2). -// -// Parameters: -// - x: the input integer. -// -// Returns: -// - The result, which equals (x MOD FIELD_SIZE), such that |result| <= (FIELD_SIZE / 2). -int Center(int x); - -// Calculates bit reversal permutation. -// -// Parameters: -// - input: the input to reverse. -// - numOfBits: the number of bits in the input to reverse. -// -// Returns: -// - The resulting number, which is obtained from the input by reversing its bits. -int ReverseBits(int input, int numOfBits); - -// Initializes the FFT fast lookup table. -// Shall be called only once. -void InitializeSWIFFTX(); - -// Calculates the FFT. -// -// Parameters: -// - input: the input to the FFT. -// - output: the resulting output. -void FFT(const unsigned char input[EIGHTH_N], swift_int32_t *output); - -/////////////////////////////////////////////////////////////////////////////////////////////// -// Helper functions implementation portion. -/////////////////////////////////////////////////////////////////////////////////////////////// - -// Don't vectorize, delete this copy. - -swift_int16_t TranslateToBase256(swift_int32_t input[EIGHTH_N], unsigned char output[EIGHTH_N]) -{ - swift_int32_t pairs[EIGHTH_N / 2]; - int i; - - for (i = 0; i < EIGHTH_N; i += 2) - { - // input[i] + 257 * input[i + 1] - pairs[i >> 1] = input[i] + input[i + 1] + (input[i + 1] << 8); - } - - for (i = (EIGHTH_N / 2) - 1; i > 0; --i) - { - int j; - - for (j = i - 1; j < (EIGHTH_N / 2) - 1; ++j) - { - // pairs[j + 1] * 513, because 257^2 = 513 % 256^2. - register swift_int32_t temp = pairs[j] + pairs[j + 1] + (pairs[j + 1] << 9); - pairs[j] = temp & 0xffff; - pairs[j + 1] += (temp >> 16); - } - } - - for (i = 0; i < EIGHTH_N; i += 2) - { - output[i] = (unsigned char) (pairs[i >> 1] & 0xff); - output[i + 1] = (unsigned char) ((pairs[i >> 1] >> 8) & 0xff); - } - - return (pairs[EIGHTH_N/2 - 1] >> 16); -} - -int Center(int x) -{ - int result = x % FIELD_SIZE; - - if (result > (FIELD_SIZE / 2)) - result -= FIELD_SIZE; - - if (result < (FIELD_SIZE / -2)) - result += FIELD_SIZE; - - return result; -} - -int ReverseBits(int input, int numOfBits) -{ - register int reversed = 0; - - for (input |= numOfBits; input > 1; input >>= 1) - reversed = (reversed << 1) | (input & 1); - - return reversed; -} - -void InitializeSWIFFTX() -{ - int i, j, k, x; - // The powers of OMEGA - int omegaPowers[2 * N]; - omegaPowers[0] = 1; - - if (wasSetupDone) - return; - - for (i = 1; i < (2 * N); ++i) - { - omegaPowers[i] = Center(omegaPowers[i - 1] * OMEGA); - } - - for (i = 0; i < (N / W); ++i) - { - for (j = 0; j < W; ++j) - { - multipliers[(i << 3) + j] = omegaPowers[ReverseBits(i, N / W) * (2 * j + 1)]; - } - } - - for (x = 0; x < 256; ++x) - { - for (j = 0; j < 8; ++j) - { - register int temp = 0; - for (k = 0; k < 8; ++k) - { - temp += omegaPowers[(EIGHTH_N * (2 * j + 1) * ReverseBits(k, W)) % (2 * N)] - * ((x >> k) & 1); - } - - fftTable[(x << 3) + j] = Center(temp); - } - } - - wasSetupDone = true; -} - -// input should be deinterleaved in contiguos memory -// output and F are 4x32 -// multipliers & fftTable are scalar 16 - - -void FFT_4way(const unsigned char input[EIGHTH_N], swift_int32_t *output) -{ - swift_int16_t *mult = multipliers; - m128_swift_int32_t F[64]; - - for (int i = 0; i < 8; i++) - { - int j = i<<3; - -// Need to isolate bytes in input, 8 bytes per lane. 
-// Each iteration of the loop process one input vector -// Each lane reads a different index to ffttable. - -// deinterleave the input! - -// load table with 4 lanes from different indexes into fftTable -// extract bytes into m128 4x16 -// mutiply by vectorized mult - -// input[lane][byte] - - __m128i table; - table = _mm_set_epi32( fftTable[ input[3][i] ], - fftTable[ input[2][i] ], - fftTable[ input[1][i] ], - fftTable[ input[0][i] ] ); - - F[i ] = _mm_mullo_epi32( mm128_const1_32( mult[j+0] ), table ); - - table = _mm_set_epi32( fftTable[ input[3][i+1] ] - fftTable[ input[2][i+1] ] - fftTable[ input[1][i+1] ] - fftTable[ input[0][i+1] ] ); - - F[i+8] = _mm_mullo_epi32( mm128_const1_32( mult[j+0] ), table ); - - - m128_swift_int16_t *table = &( fftTable[input[i] << 3] ); - - F[i ] = _mm_mullo_epi32( mm128_const1_32( mult[j+0] ), - mm128_const1_32( table[0] ) ); - F[i+ 8] = _mm_mullo_epi32( mm128_const1_32( mult[j+1] ), - mm128_const1_32( table[1] ) ); - F[i+16] = _mm_mullo_epi32( mm128_const1_32( mult[j+2] ), - mm128_const1_32( table[2] ) ); - F[i+24] = _mm_mullo_epi32( mm128_const1_32( mult[j+3] ), - mm128_const1_32( table[3] ) ); - F[i+32] = _mm_mullo_epi32( mm128_const1_32( mult[j+4] ), - mm128_const1_32( table[4] ) ); - F[i+40] = _mm_mullo_epi32( mm128_const1_32( mult[j+5] ), - mm128_const1_32( table[5] ) ); - F[i+48] = _mm_mullo_epi32( mm128_const1_32( mult[j+6] ), - mm128_const1_32( table[6] ) ); - F[i+56] = _mm_mullo_epi32( mm128_const1_32( mult[j+7] ), - mm128_const1_32( table[7] ) ); - } - - - for ( int i = 0; i < 8; i++ ) - { - int j = i<<3; - ADD_SUB_4WAY( F[j ], F[j+1] ); - ADD_SUB_4WAY( F[j+2], F[j+3] ); - ADD_SUB_4WAY( F[j+4], F[j+5] ); - ADD_SUB_4WAY( F[j+6], F[j+7] ); - - F[j+3] = _mm_slli_epi32( F[j+3], 4 ); - F[j+7] = _mm_slli_epi32( F[j+7], 4 ); - - ADD_SUB_4WAY( F[j ], F[j+2] ); - ADD_SUB_4WAY( F[j+1], F[j+3] ); - ADD_SUB_4WAY( F[j+4], F[j+6] ); - ADD_SUB_4WAY( F[j+5], F[j+7] ); - - F[j+5] = _mm_slli_epi32( F[j+5], 2 ); - F[j+6] = _mm_slli_epi32( F[j+6], 4 ); - F[j+7] = _mm_slli_epi32( F[j+7], 6 ); - - ADD_SUB_4WAY( F[j ], F[j+4] ); - ADD_SUB_4WAY( F[j+1], F[j+5] ); - ADD_SUB_4WAY( F[j+2], F[j+6] ); - ADD_SUB_4WAY( F[j+3], F[j+7] ); - - output[i ] = Q_REDUCE_4WAY( F[j ] ); - output[i+ 8] = Q_REDUCE_4WAY( F[j+1] ); - output[i+16] = Q_REDUCE_4WAY( F[j+2] ); - output[i+24] = Q_REDUCE_4WAY( F[j+3] ); - output[i+32] = Q_REDUCE_4WAY( F[j+4] ); - output[i+40] = Q_REDUCE_4WAY( F[j+5] ); - output[i+48] = Q_REDUCE_4WAY( F[j+6] ); - output[i+56] = Q_REDUCE_4WAY( F[j+7] ); - } -} - -// Calculates the FFT part of SWIFFT. -// We divided the SWIFFT calculation into two, because that way we could save 2 computations of -// the FFT part, since in the first stage of SWIFFTX the difference between the first 3 SWIFFTs -// is only the A's part. -// -// Parameters: -// - input: the input to FFT. -// - m: the input size divided by 8. The function performs m FFTs. -// - output: will store the result. -void SWIFFTFFT(const unsigned char *input, int m, swift_int32_t *output) -{ - int i; - - for (i = 0; - i < m; - i++, input += EIGHTH_N, output += N) - { - FFT(input, output); - } -} - -// Calculates the 'sum' part of SWIFFT, including the base change at the end. -// We divided the SWIFFT calculation into two, because that way we could save 2 computations of -// the FFT part, since in the first stage of SWIFFTX the difference between the first 3 SWIFFTs -// is only the A's part. -// -// Parameters: -// - input: the input. Of size 64 * m. -// - m: the input size divided by 64. 
-// - output: will store the result. -// - a: the coefficients in the sum. Of size 64 * m. -void SWIFFTSum(const swift_int32_t *input, int m, unsigned char *output, const swift_int16_t *a) -{ - int i, j; - swift_int32_t result[N]; - register swift_int16_t carry = 0; - - for (j = 0; j < N; ++j) - { - register swift_int32_t sum = 0; - const register swift_int32_t *f = input + j; - const register swift_int16_t *k = a + j; - - for (i = 0; i < m; i++, f += N,k += N) - { - sum += (*f) * (*k); - } - - result[j] = sum; - } - - for (j = 0; j < N; ++j) - { - result[j] = ((FIELD_SIZE << 22) + result[j]) % FIELD_SIZE; - } - - for (j = 0; j < 8; ++j) - { - int register carryBit = TranslateToBase256(result + (j << 3), output + (j << 3)); - carry |= carryBit << j; - } - - output[N] = carry; -} - - -// On entry input is interleaved 4x64. SIZE is *4 lanes / 8 bytes, -// multiply by 2. - - -void ComputeSingleSWIFFTX_4way( unsigned char input[SWIFFTX_INPUT_BLOCK_SIZE], - unsigned char output[SWIFFTX_OUTPUT_BLOCK_SIZE], - bool doSmooth) -{ - int i; - // Will store the result of the FFT parts: - m128_swift_int32_t fftOut[N * M]; -// swift_int32_t fftOut[N * M]; - unsigned char intermediate[N * 3 + 8]; - unsigned char carry0,carry1,carry2; - - // Do the three SWIFFTS while remembering the three carry bytes (each carry byte gets - // overriden by the following SWIFFT): - - // 1. Compute the FFT of the input - the common part for the first 3 SWIFFTs: - SWIFFTFFT(input, M, fftOut); - - // 2. Compute the sums of the 3 SWIFFTs, each using a different set of coefficients: - - // 2a. The first SWIFFT: - SWIFFTSum(fftOut, M, intermediate, As); - // Remember the carry byte: - carry0 = intermediate[N]; - - // 2b. The second one: - SWIFFTSum(fftOut, M, intermediate + N, As + (M * N)); - carry1 = intermediate[2 * N]; - - // 2c. The third one: - SWIFFTSum(fftOut, M, intermediate + (2 * N), As + 2 * (M * N)); - carry2 = intermediate[3 * N]; - - //2d. Put three carry bytes in their place - intermediate[3 * N] = carry0; - intermediate[(3 * N) + 1] = carry1; - intermediate[(3 * N) + 2] = carry2; - - // Padding intermediate output with 5 zeroes. - memset(intermediate + (3 * N) + 3, 0, 5); - - // Apply the S-Box: - for (i = 0; i < (3 * N) + 8; ++i) - { - intermediate[i] = SBox[intermediate[i]]; - } - - // 3. 
The final and last SWIFFT: - SWIFFTFFT(intermediate, 3 * (N/8) + 1, fftOut); - SWIFFTSum(fftOut, 3 * (N/8) + 1, output, As); - - if (doSmooth) - { - unsigned char sum[N]; - register int i, j; - memset(sum, 0, N); - - for (i = 0; i < (N + 1) * 8; ++i) - { - register const swift_int16_t *AsRow; - register int AShift; - - if (!(output[i >> 3] & (1 << (i & 7)))) - { - continue; - } - - AsRow = As + N * M + (i & ~(N - 1)) ; - AShift = i & 63; - - for (j = AShift; j < N; ++j) - { - sum[j] += AsRow[j - AShift]; - } - - for(j = 0; j < AShift; ++j) - { - sum[j] -= AsRow[N - AShift + j]; - } - } - - for (i = 0; i < N; ++i) - { - output[i] = sum[i]; - } - - output[N] = 0; - } -} diff --git a/algo/swifftx/swifftx.c b/algo/swifftx/swifftx.c index d3ecd15c..c7d8c727 100644 --- a/algo/swifftx/swifftx.c +++ b/algo/swifftx/swifftx.c @@ -604,21 +604,14 @@ void InitializeSWIFFTX() int omegaPowers[2 * N]; omegaPowers[0] = 1; - if (wasSetupDone) - return; + if (wasSetupDone) return; for (i = 1; i < (2 * N); ++i) - { omegaPowers[i] = Center(omegaPowers[i - 1] * OMEGA); - } for (i = 0; i < (N / W); ++i) - { for (j = 0; j < W; ++j) - { multipliers[(i << 3) + j] = omegaPowers[ReverseBits(i, N / W) * (2 * j + 1)]; - } - } for (x = 0; x < 256; ++x) { @@ -626,10 +619,8 @@ void InitializeSWIFFTX() { register int temp = 0; for (k = 0; k < 8; ++k) - { temp += omegaPowers[(EIGHTH_N * (2 * j + 1) * ReverseBits(k, W)) % (2 * N)] * ((x >> k) & 1); - } fftTable[(x << 3) + j] = Center(temp); } @@ -703,18 +694,18 @@ void FFT(const unsigned char input[EIGHTH_N], swift_int32_t *output) #if defined (__AVX512VL__) && defined(__AVX512BW__) - #define Q_REDUCE( a ) \ - _mm256_sub_epi32( _mm256_and_si256( a, \ - _mm256_movm_epi8( 0x11111111 ) ), _mm256_srai_epi32( a, 8 ) ) + const __m256i mask = _mm256_movm_epi8( 0x11111111 ); + +#else + + const __m256i mask = m256_const1_32( 0x000000ff ); -#else +#endif #define Q_REDUCE( a ) \ - _mm256_sub_epi32( _mm256_and_si256( a, \ - m256_const1_32( 0x000000ff ) ), _mm256_srai_epi32( a, 8 ) ) + _mm256_sub_epi32( _mm256_and_si256( a, mask ), \ + _mm256_srai_epi32( a, 8 ) ) -#endif - out[0] = Q_REDUCE( F[0] ); out[1] = Q_REDUCE( F[1] ); out[2] = Q_REDUCE( F[2] ); @@ -805,9 +796,10 @@ void FFT(const unsigned char input[EIGHTH_N], swift_int32_t *output) #undef ADD_SUB + const __m128i mask = m128_const1_32( 0x000000ff ); + #define Q_REDUCE( a ) \ - _mm_sub_epi32( _mm_and_si128( a, \ - m128_const1_32( 0x000000ff ) ), _mm_srai_epi32( a, 8 ) ) + _mm_sub_epi32( _mm_and_si128( a, mask ), _mm_srai_epi32( a, 8 ) ) out[ 0] = Q_REDUCE( F[ 0] ); out[ 1] = Q_REDUCE( F[ 1] ); @@ -1357,6 +1349,7 @@ void SWIFFTSum( const swift_int32_t *input, int m, unsigned char *output, output[N] = carry; } +/* void ComputeSingleSWIFFTX_smooth(unsigned char input[SWIFFTX_INPUT_BLOCK_SIZE], unsigned char output[SWIFFTX_OUTPUT_BLOCK_SIZE], bool doSmooth) @@ -1434,51 +1427,50 @@ void ComputeSingleSWIFFTX_smooth(unsigned char input[SWIFFTX_INPUT_BLOCK_SIZE], output[N] = 0; } } +*/ -void ComputeSingleSWIFFTX( unsigned char input[SWIFFTX_INPUT_BLOCK_SIZE], - unsigned char output[SWIFFTX_OUTPUT_BLOCK_SIZE] ) +void ComputeSingleSWIFFTX( unsigned char *input, unsigned char *output ) { int i; // Will store the result of the FFT parts: swift_int32_t fftOut[N * M] __attribute__ ((aligned (64))); - unsigned char intermediate[N * 3 + 8] __attribute__ ((aligned (64))); + unsigned char sum[ N*3 + 8 ] __attribute__ ((aligned (64))); unsigned char carry0,carry1,carry2; // Do the three SWIFFTS while remembering the three carry bytes (each carry 
byte gets // overriden by the following SWIFFT): // 1. Compute the FFT of the input - the common part for the first 3 SWIFFTs: - SWIFFTFFT(input, M, fftOut); + SWIFFTFFT( input, M, fftOut ); // 2. Compute the sums of the 3 SWIFFTs, each using a different set of coefficients: // 2a. The first SWIFFT: - SWIFFTSum(fftOut, M, intermediate, As); - // Remember the carry byte: - carry0 = intermediate[N]; + SWIFFTSum( fftOut, M, sum, As ); + carry0 = sum[N]; // 2b. The second one: - SWIFFTSum(fftOut, M, intermediate + N, As + (M * N)); - carry1 = intermediate[2 * N]; + SWIFFTSum( fftOut, M, sum + N, As + M*N ); + carry1 = sum[ 2*N ]; // 2c. The third one: - SWIFFTSum(fftOut, M, intermediate + (2 * N), As + 2 * (M * N)); - carry2 = intermediate[3 * N]; + SWIFFTSum( fftOut, M, sum + 2*N, As + 2*M*N ); + carry2 = sum[ 3*N ]; //2d. Put three carry bytes in their place - intermediate[3 * N] = carry0; - intermediate[(3 * N) + 1] = carry1; - intermediate[(3 * N) + 2] = carry2; + sum[ 3*N ] = carry0; + sum[ 3*N + 1 ] = carry1; + sum[ 3*N + 2 ] = carry2; // Padding intermediate output with 5 zeroes. - memset(intermediate + (3 * N) + 3, 0, 5); + memset( sum + 3*N + 3, 0, 5 ); // Apply the S-Box: for ( i = 0; i < (3 * N) + 8; ++i ) - intermediate[i] = SBox[intermediate[i]]; + sum[i] = SBox[ sum[i] ]; // 3. The final and last SWIFFT: - SWIFFTFFT(intermediate, 3 * (N/8) + 1, fftOut); - SWIFFTSum(fftOut, 3 * (N/8) + 1, output, As); - + SWIFFTFFT( sum, 3 * (N/8) + 1, fftOut ); + SWIFFTSum( fftOut, 3 * (N/8) + 1, sum, As ); + memcpy( output, sum, SWIFFTX_OUTPUT_BLOCK_SIZE - 1 ); } diff --git a/algo/swifftx/swifftx.h b/algo/swifftx/swifftx.h index eedbc8f0..ad2214a6 100644 --- a/algo/swifftx/swifftx.h +++ b/algo/swifftx/swifftx.h @@ -61,11 +61,10 @@ void ComputeSingleSWIFFT(unsigned char *input, unsigned short m, // // Returns: // - Success value. -void ComputeSingleSWIFFTX( unsigned char input[SWIFFTX_INPUT_BLOCK_SIZE], - unsigned char output[SWIFFTX_OUTPUT_BLOCK_SIZE] ); +void ComputeSingleSWIFFTX( unsigned char *input, unsigned char *output ); -void ComputeSingleSWIFFTX_smooth( unsigned char input[SWIFFTX_INPUT_BLOCK_SIZE], - unsigned char output[SWIFFTX_OUTPUT_BLOCK_SIZE], bool doSmooth); +//void ComputeSingleSWIFFTX_smooth( unsigned char input[SWIFFTX_INPUT_BLOCK_SIZE], +// unsigned char output[SWIFFTX_OUTPUT_BLOCK_SIZE], bool doSmooth); // Calculates the powers of OMEGA and generates the bit reversal permutation. // You must call this function before doing SWIFFT/X, otherwise you will get zeroes everywhere. 
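An editorial aside (not part of the patch): the header comment above fixes the required call order, so a minimal usage sketch looks like the following. It assumes the repository root is on the include path and uses a zeroed buffer as a stand-in for real block data.

#include <string.h>
#include "algo/swifftx/swifftx.h"

void swifftx_block_example( void )
{
   unsigned char in[ SWIFFTX_INPUT_BLOCK_SIZE ];
   unsigned char out[ SWIFFTX_OUTPUT_BLOCK_SIZE ];

   InitializeSWIFFTX();              // one-time table setup; skipping it yields all zeroes
   memset( in, 0, sizeof in );       // stand-in for the actual input block
   ComputeSingleSWIFFTX( in, out );  // new pointer-based signature from this patch
}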
diff --git a/algo/x16/x16r-gate.c b/algo/x16/x16r-gate.c index 3a94344b..88401062 100644 --- a/algo/x16/x16r-gate.c +++ b/algo/x16/x16r-gate.c @@ -62,8 +62,7 @@ bool register_x16r_algo( algo_gate_t* gate ) gate->scanhash = (void*)&scanhash_x16r; gate->hash = (void*)&x16r_hash; #endif - gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | - VAES_OPT | VAES256_OPT; + gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT; x16_r_s_getAlgoString = (void*)&x16r_getAlgoString; opt_target_factor = 256.0; return true; @@ -81,8 +80,7 @@ bool register_x16rv2_algo( algo_gate_t* gate ) gate->scanhash = (void*)&scanhash_x16rv2; gate->hash = (void*)&x16rv2_hash; #endif - gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | - VAES_OPT | VAES256_OPT; + gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT; x16_r_s_getAlgoString = (void*)&x16r_getAlgoString; opt_target_factor = 256.0; return true; @@ -100,8 +98,7 @@ bool register_x16s_algo( algo_gate_t* gate ) gate->scanhash = (void*)&scanhash_x16r; gate->hash = (void*)&x16r_hash; #endif - gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | - VAES_OPT | VAES256_OPT; + gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT; x16_r_s_getAlgoString = (void*)&x16s_getAlgoString; opt_target_factor = 256.0; return true; @@ -234,8 +231,7 @@ bool register_x16rt_algo( algo_gate_t* gate ) gate->scanhash = (void*)&scanhash_x16rt; gate->hash = (void*)&x16r_hash; #endif - gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | - VAES_OPT | VAES256_OPT; + gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT; opt_target_factor = 256.0; return true; }; @@ -252,8 +248,7 @@ bool register_x16rt_veil_algo( algo_gate_t* gate ) gate->scanhash = (void*)&scanhash_x16rt; gate->hash = (void*)&x16r_hash; #endif - gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | - VAES_OPT | VAES256_OPT; + gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT; gate->build_extraheader = (void*)&veil_build_extraheader; opt_target_factor = 256.0; return true; @@ -292,8 +287,7 @@ bool register_x21s_algo( algo_gate_t* gate ) gate->hash = (void*)&x21s_hash; gate->miner_thread_init = (void*)&x21s_thread_init; #endif - gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | - VAES_OPT | VAES256_OPT; + gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT; x16_r_s_getAlgoString = (void*)&x16s_getAlgoString; opt_target_factor = 256.0; return true; diff --git a/algo/x17/sonoa-gate.c b/algo/x17/sonoa-gate.c index d192b0df..926beb4c 100644 --- a/algo/x17/sonoa-gate.c +++ b/algo/x17/sonoa-gate.c @@ -12,7 +12,7 @@ bool register_sonoa_algo( algo_gate_t* gate ) init_sonoa_ctx(); gate->hash = (void*)&sonoa_hash; #endif - gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT | VAES256_OPT; + gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT; return true; }; diff --git a/algo/x17/x17-gate.c b/algo/x17/x17-gate.c index 6ab09ff0..eee3d60d 100644 --- a/algo/x17/x17-gate.c +++ b/algo/x17/x17-gate.c @@ -11,7 +11,7 @@ bool register_x17_algo( algo_gate_t* gate ) #else gate->hash = (void*)&x17_hash; #endif - gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT | VAES256_OPT; + gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT; return true; }; diff --git a/algo/x17/xevan-gate.c b/algo/x17/xevan-gate.c index 
545a0aa6..184ed2df 100644 --- a/algo/x17/xevan-gate.c +++ b/algo/x17/xevan-gate.c @@ -12,7 +12,7 @@ bool register_xevan_algo( algo_gate_t* gate ) init_xevan_ctx(); gate->hash = (void*)&xevan_hash; #endif - gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT | VAES256_OPT; + gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT; opt_target_factor = 256.0; return true; }; diff --git a/algo/x22/x22i-gate.c b/algo/x22/x22i-gate.c index ff0cc805..826f0f88 100644 --- a/algo/x22/x22i-gate.c +++ b/algo/x22/x22i-gate.c @@ -31,8 +31,8 @@ bool register_x22i_algo( algo_gate_t* gate ) #endif - gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | SHA_OPT - | AVX512_OPT | VAES_OPT | VAES256_OPT; + gate->optimizations = SSE2_OPT | SSE42_OPT | AES_OPT | AVX2_OPT | SHA_OPT + | AVX512_OPT | VAES_OPT; return true; }; @@ -48,8 +48,8 @@ bool register_x25x_algo( algo_gate_t* gate ) gate->scanhash = (void*)&scanhash_x25x; gate->hash = (void*)&x25x_hash; #endif - gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | SHA_OPT | - AVX512_OPT | VAES_OPT | VAES256_OPT; + gate->optimizations = SSE2_OPT | SSE42_OPT | AES_OPT | AVX2_OPT | SHA_OPT | + AVX512_OPT | VAES_OPT; return true; }; diff --git a/build-allarch.sh b/build-allarch.sh index 5fa38f6c..4a80588e 100755 --- a/build-allarch.sh +++ b/build-allarch.sh @@ -4,128 +4,97 @@ # during develpment. However the information contained may provide compilation # tips to users. -rm cpuminer-avx512-sha-vaes cpuminer-avx512-sha cpuminer-avx512 cpuminer-avx2 cpuminer-avx cpuminer-aes-sse42 cpuminer-sse42 cpuminer-ssse3 cpuminer-sse2 cpuminer-zen cpuminer-zen3 > /dev/null +rm cpuminer-avx512-sha-vaes cpuminer-avx512 cpuminer-avx2 cpuminer-avx cpuminer-aes-sse42 cpuminer-sse42 cpuminer-ssse3 cpuminer-sse2 cpuminer-zen cpuminer-zen3 > /dev/null -# Icelake AVX512 SHA VAES +# AVX512 SHA VAES: Intel Core Icelake, Rocketlake make distclean || echo clean rm -f config.status ./autogen.sh || echo done CFLAGS="-O3 -march=icelake-client -Wall -fno-common" ./configure --with-curl +#CFLAGS="-O3 -march=rocketlake -Wall -fno-common" ./configure --with-curl make -j 8 -strip -s cpuminer.exe -mv cpuminer.exe cpuminer-avx512-sha-vaes.exe strip -s cpuminer mv cpuminer cpuminer-avx512-sha-vaes -# Rocketlake AVX512 SHA AES +# AVX512 AES: Intel Core HEDT Sylake-X, Cascadelake make clean || echo clean rm -f config.status -CFLAGS="-O3 -march=cascadelake -msha -Wall -fno-common" ./configure --with-curl -#CFLAGS="-O3 -march=skylake-avx512 -msha -Wall -fno-common" ./configure --with-curl -# CFLAGS="-O3 -march=rocketlake -Wall -fno-common" ./configure --with-curl +CFLAGS="-O3 -march=skylake-avx512 -maes -Wall -fno-common" ./configure --with-curl make -j 8 -strip -s cpuminer.exe -mv cpuminer.exe cpuminer-avx512-sha.exe strip -s cpuminer -mv cpuminer cpuminer-avx512-sha +mv cpuminer cpuminer-avx512 -# Slylake-X AVX512 AES -make clean || echo clean +# AVX2 SHA VAES: Intel Alderlake, AMD Zen3 +make clean || echo done rm -f config.status -CFLAGS="-O3 -march=skylake-avx512 -Wall -fno-common" ./configure --with-curl +# vaes doesn't include aes +CFLAGS="-O3 -maes -mavx2 -msha -mvaes -Wall -fno-common" ./configure --with-curl make -j 8 -strip -s cpuminer.exe -mv cpuminer.exe cpuminer-avx512.exe strip -s cpuminer -mv cpuminer cpuminer-avx512 +mv cpuminer cpuminer-avx2-sha-vaes + +# AVX2 SHA AES: AMD Zen1 +make clean || echo done +rm -f config.status +CFLAGS="-O3 -march=znver1 -maes -Wall -fno-common" ./configure --with-curl +#CFLAGS="-O3 -maes -mavx2 -msha -Wall 
-fno-common" ./configure --with-curl +make -j 8 +strip -s cpuminer +mv cpuminer cpuminer-avx2-sha -# Haswell AVX2 AES +# AVX2 AES: Intel Haswell..Cometlake make clean || echo clean rm -f config.status # GCC 9 doesn't include AES with core-avx2 CFLAGS="-O3 -march=core-avx2 -maes -Wall -fno-common" ./configure --with-curl make -j 8 -strip -s cpuminer.exe -mv cpuminer.exe cpuminer-avx2.exe strip -s cpuminer mv cpuminer cpuminer-avx2 -# Sandybridge AVX AES +# AVX AES: Intel Sandybridge, Ivybridge make clean || echo clean rm -f config.status CFLAGS="-O3 -march=corei7-avx -maes -Wall -fno-common" ./configure --with-curl make -j 8 -strip -s cpuminer.exe -mv cpuminer.exe cpuminer-avx.exe strip -s cpuminer mv cpuminer cpuminer-avx -# Westmere SSE4.2 AES +# SSE4.2 AES: Intel Westmere make clean || echo clean rm -f config.status CFLAGS="-O3 -march=westmere -maes -Wall -fno-common" ./configure --with-curl make -j 8 -strip -s cpuminer.exe -mv cpuminer.exe cpuminer-aes-sse42.exe strip -s cpuminer mv cpuminer cpuminer-aes-sse42 -# Nehalem SSE4.2 +# SSE4.2: Intel Nehalem make clean || echo clean rm -f config.status CFLAGS="-O3 -march=corei7 -Wall -fno-common" ./configure --with-curl make -j 8 -strip -s cpuminer.exe -mv cpuminer.exe cpuminer-sse42.exe strip -s cpuminer mv cpuminer cpuminer-sse42 -# Core2 SSSE3 +# SSSE3: Intel Core2 make clean || echo clean rm -f config.status CFLAGS="-O3 -march=core2 -Wall -fno-common" ./configure --with-curl make -j 8 -strip -s cpuminer.exe -mv cpuminer.exe cpuminer-ssse3.exe strip -s cpuminer mv cpuminer cpuminer-ssse3 -# Generic SSE2 +# SSE2 make clean || echo clean rm -f config.status CFLAGS="-O3 -msse2 -Wall -fno-common" ./configure --with-curl make -j 8 -strip -s cpuminer.exe -mv cpuminer.exe cpuminer-sse2.exe strip -s cpuminer mv cpuminer cpuminer-sse2 -# AMD Zen1 AVX2 SHA -make clean || echo done -rm -f config.status -CFLAGS="-O3 -march=znver1 -Wall -fno-common" ./configure --with-curl -make -j 8 -strip -s cpuminer.exe -mv cpuminer.exe cpuminer-zen.exe -strip -s cpuminer -mv cpuminer cpuminer-zen - -# AMD Zen3 AVX2 SHA VAES -make clean || echo done -rm -f config.status -CFLAGS="-O3 -march=znver2 -mvaes -Wall -fno-common" ./configure --with-curl -# CFLAGS="-O3 -march=znver3 -Wall -fno-common" ./configure --with-curl -make -j 8 -strip -s cpuminer.exe -mv cpuminer.exe cpuminer-zen3.exe -strip -s cpuminer -mv cpuminer cpuminer-zen3 - -# Native to current CPU +# Native to host CPU make clean || echo done rm -f config.status CFLAGS="-O3 -march=native -Wall -fno-common" ./configure --with-curl make -j 8 -strip -s cpuminer.exe strip -s cpuminer diff --git a/clean-all.sh b/clean-all.sh index e91bbb5b..87183d5e 100755 --- a/clean-all.sh +++ b/clean-all.sh @@ -2,8 +2,8 @@ # # make clean and rm all the targetted executables. 
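Stepping back from the per-architecture recipes in build-allarch.sh above: choosing which of the cpuminer-* binaries to run comes down to which instruction sets the host CPU reports. As a hedged illustration only (not part of the build script), the GCC/Clang builtin __builtin_cpu_supports() can answer that from C; the feature strings below are the documented ones, and SHA/VAES detection is omitted because older compilers do not expose those names.

#include <stdio.h>

int main( void )
{
   __builtin_cpu_init();   // initialize CPU feature detection before querying
   printf( "sse2    %d\n", !!__builtin_cpu_supports( "sse2" ) );
   printf( "ssse3   %d\n", !!__builtin_cpu_supports( "ssse3" ) );
   printf( "sse4.2  %d\n", !!__builtin_cpu_supports( "sse4.2" ) );
   printf( "avx     %d\n", !!__builtin_cpu_supports( "avx" ) );
   printf( "avx2    %d\n", !!__builtin_cpu_supports( "avx2" ) );
   printf( "aes     %d\n", !!__builtin_cpu_supports( "aes" ) );
   printf( "avx512f %d\n", !!__builtin_cpu_supports( "avx512f" ) );
   return 0;
}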
-rm cpuminer-avx512-sha-vaes cpuminer-avx512-sha cpuminer-avx512 cpuminer-avx2 cpuminer-avx cpuminer-aes-sse42 cpuminer-sse2 cpuminer-zen cpuminer-sse42 cpuminer-ssse3 cpuminer-zen3 > /dev/null +rm cpuminer-avx512-sha-vaes cpuminer-avx512 cpuminer-avx2 cpuminer-avx cpuminer-aes-sse42 cpuminer-sse2 cpuminer-avx2-sha cpuminer-sse42 cpuminer-ssse3 cpuminer-avx2-sha-vaes > /dev/null -rm cpuminer-avx512-sha-vaes.exe cpuminer-avx512-sha.exe cpuminer-avx512.exe cpuminer-avx2.exe cpuminer-avx.exe cpuminer-aes-sse42.exe cpuminer-sse2.exe cpuminer-zen.exe cpuminer-sse42.exe cpuminer-ssse3.exe cpuminer-zen3.exe > /dev/null +rm cpuminer-avx512-sha-vaes.exe cpuminer-avx512-sha.exe cpuminer-avx512.exe cpuminer-avx2.exe cpuminer-avx.exe cpuminer-aes-sse42.exe cpuminer-sse2.exe cpuminer-avx2-sha.exe cpuminer-sse42.exe cpuminer-ssse3.exe cpuminer-avx2-sha-vaes.exe > /dev/null make distclean > /dev/null diff --git a/configure b/configure index b93191f8..eca6ff1f 100755 --- a/configure +++ b/configure @@ -1,6 +1,6 @@ #! /bin/sh # Guess values for system-dependent variables and create Makefiles. -# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.18.3. +# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.19.1. # # # Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc. @@ -577,8 +577,8 @@ MAKEFLAGS= # Identity of this package. PACKAGE_NAME='cpuminer-opt' PACKAGE_TARNAME='cpuminer-opt' -PACKAGE_VERSION='3.18.3' -PACKAGE_STRING='cpuminer-opt 3.18.3' +PACKAGE_VERSION='3.19.1' +PACKAGE_STRING='cpuminer-opt 3.19.1' PACKAGE_BUGREPORT='' PACKAGE_URL='' @@ -1332,7 +1332,7 @@ if test "$ac_init_help" = "long"; then # Omit some internal or obsolete options to make the list less imposing. # This message is too long to be a string in the A/UX 3.1 sh. cat <<_ACEOF -\`configure' configures cpuminer-opt 3.18.3 to adapt to many kinds of systems. +\`configure' configures cpuminer-opt 3.19.1 to adapt to many kinds of systems. Usage: $0 [OPTION]... [VAR=VALUE]... @@ -1404,7 +1404,7 @@ fi if test -n "$ac_init_help"; then case $ac_init_help in - short | recursive ) echo "Configuration of cpuminer-opt 3.18.3:";; + short | recursive ) echo "Configuration of cpuminer-opt 3.19.1:";; esac cat <<\_ACEOF @@ -1509,7 +1509,7 @@ fi test -n "$ac_init_help" && exit $ac_status if $ac_init_version; then cat <<\_ACEOF -cpuminer-opt configure 3.18.3 +cpuminer-opt configure 3.19.1 generated by GNU Autoconf 2.69 Copyright (C) 2012 Free Software Foundation, Inc. @@ -2012,7 +2012,7 @@ cat >config.log <<_ACEOF This file contains any messages produced by compilers while running configure, to aid debugging if configure makes a mistake. -It was created by cpuminer-opt $as_me 3.18.3, which was +It was created by cpuminer-opt $as_me 3.19.1, which was generated by GNU Autoconf 2.69. Invocation command line was $ $0 $@ @@ -2993,7 +2993,7 @@ fi # Define the identity of the package. PACKAGE='cpuminer-opt' - VERSION='3.18.3' + VERSION='3.19.1' cat >>confdefs.h <<_ACEOF @@ -6690,7 +6690,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 # report actual input values of CONFIG_FILES etc. instead of their # values after options handling. ac_log=" -This file was extended by cpuminer-opt $as_me 3.18.3, which was +This file was extended by cpuminer-opt $as_me 3.19.1, which was generated by GNU Autoconf 2.69. 
Invocation command line was CONFIG_FILES = $CONFIG_FILES @@ -6756,7 +6756,7 @@ _ACEOF cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`" ac_cs_version="\\ -cpuminer-opt config.status 3.18.3 +cpuminer-opt config.status 3.19.1 configured by $0, generated by GNU Autoconf 2.69, with options \\"\$ac_cs_config\\" diff --git a/configure.ac b/configure.ac index 8b80c385..11d4e595 100644 --- a/configure.ac +++ b/configure.ac @@ -1,4 +1,4 @@ -AC_INIT([cpuminer-opt], [3.19.0]) +AC_INIT([cpuminer-opt], [3.19.1]) AC_PREREQ([2.59c]) AC_CANONICAL_SYSTEM diff --git a/cpu-miner.c b/cpu-miner.c index 179881c6..ee31ae58 100644 --- a/cpu-miner.c +++ b/cpu-miner.c @@ -1038,9 +1038,17 @@ void report_summary_log( bool force ) #endif - if ( !( force && ( submit_sum || ( et.tv_sec > 5 ) ) ) - && ( et.tv_sec < 300 ) ) - return; + if ( !( force && ( submit_sum || ( et.tv_sec > 5 ) ) ) ) + { + if ( et.tv_sec < 300 ) + return; + if ( ( s_get_ptr != s_put_ptr ) && ( et.tv_sec < 360 ) ) + return; + } + +// if ( !( force && ( submit_sum || ( et.tv_sec > 5 ) ) ) +// && ( et.tv_sec < 300 ) ) +// return; // collect and reset periodic counters pthread_mutex_lock( &stats_lock ); @@ -1983,6 +1991,10 @@ void set_work_data_big_endian( struct work *work ) // calculate net diff from nbits. double std_calc_network_diff( struct work* work ) { + uint32_t nbits = work->data[ algo_gate.nbits_index ]; + uint32_t shift = nbits & 0xff; + uint32_t bits = bswap_32( nbits ) & 0x00ffffff; +/* // sample for diff 43.281 : 1c05ea29 // todo: endian reversed on longpoll could be zr5 specific... int nbits_index = algo_gate.nbits_index; @@ -1990,15 +2002,17 @@ double std_calc_network_diff( struct work* work ) : swab32( work->data[ nbits_index ] ); uint32_t bits = ( nbits & 0xffffff ); int16_t shift = ( swab32(nbits) & 0xff ); // 0x1c = 28 +*/ + int m; - double d = (double)0x0000ffff / (double)bits; + long double d = (long double)0x0000ffff / (long double)bits; for ( m = shift; m < 29; m++ ) d *= 256.0; for ( m = 29; m < shift; m++ ) d /= 256.0; if ( opt_debug_diff ) - applog(LOG_DEBUG, "net diff: %f -> shift %u, bits %08x", d, shift, bits); - return d; + applog(LOG_DEBUG, "net diff: %8f -> shift %u, bits %08x", (double)d, shift, bits); + return (double)d; } void std_get_new_work( struct work* work, struct work* g_work, int thr_id, @@ -2137,7 +2151,7 @@ static void stratum_gen_work( struct stratum_ctx *sctx, struct work *g_work ) uint64_t net_ttf = ( last_block_height - session_first_block ) == 0 ? 0 : et.tv_sec / ( last_block_height - session_first_block ); - if ( net_diff && net_ttf ) + if ( net_diff > 0. && net_ttf ) { double net_hr = nd / net_ttf; char net_hr_units[4] = {0}; @@ -2572,7 +2586,7 @@ static void *longpoll_thread(void *userdata) if (!opt_quiet) { char netinfo[64] = { 0 }; - if (net_diff > 0.) + if ( net_diff > 0. 
) { sprintf(netinfo, ", diff %.3f", net_diff); } @@ -2844,7 +2858,6 @@ static bool cpu_capability( bool display_only ) bool algo_has_avx512 = set_incl( AVX512_OPT, algo_features ); bool algo_has_sha = set_incl( SHA_OPT, algo_features ); bool algo_has_vaes = set_incl( VAES_OPT, algo_features ); - bool algo_has_vaes256 = set_incl( VAES256_OPT, algo_features ); bool use_aes; bool use_sse2; bool use_sse42; @@ -2924,14 +2937,13 @@ static bool cpu_capability( bool display_only ) if ( algo_features == EMPTY_SET ) printf( " None" ); else { - if ( algo_has_avx512 ) printf( " AVX512" ); - else if ( algo_has_avx2 ) printf( " AVX2 " ); - else if ( algo_has_sse42 ) printf( " SSE4.2" ); - else if ( algo_has_sse2 ) printf( " SSE2 " ); - if ( algo_has_vaes || - algo_has_vaes256 ) printf( " VAES" ); - else if ( algo_has_aes ) printf( " AES" ); - if ( algo_has_sha ) printf( " SHA" ); + if ( algo_has_avx512 ) printf( " AVX512" ); + else if ( algo_has_avx2 ) printf( " AVX2 " ); + else if ( algo_has_sse42 ) printf( " SSE4.2" ); + else if ( algo_has_sse2 ) printf( " SSE2 " ); + if ( algo_has_vaes ) printf( " VAES" ); + else if ( algo_has_aes ) printf( " AES" ); + if ( algo_has_sha ) printf( " SHA" ); } } printf("\n"); @@ -2973,8 +2985,7 @@ static bool cpu_capability( bool display_only ) use_avx2 = cpu_has_avx2 && sw_has_avx2 && algo_has_avx2; use_avx512 = cpu_has_avx512 && sw_has_avx512 && algo_has_avx512; use_sha = cpu_has_sha && sw_has_sha && algo_has_sha; - use_vaes = cpu_has_vaes && sw_has_vaes && ( algo_has_vaes - || algo_has_vaes256 ); + use_vaes = cpu_has_vaes && sw_has_vaes && algo_has_vaes; use_none = !( use_sse2 || use_aes || use_avx512 || use_avx2 || use_sha || use_vaes ); diff --git a/winbuild-cross.sh b/winbuild-cross.sh index 71e42981..ec738593 100755 --- a/winbuild-cross.sh +++ b/winbuild-cross.sh @@ -16,9 +16,9 @@ export MINGW_LIB="/usr/x86_64-w64-mingw32/lib" export GCC_MINGW_LIB="/usr/lib/gcc/x86_64-w64-mingw32/9.3-win32" # used by GCC export LDFLAGS="-L$LOCAL_LIB/curl/lib/.libs -L$LOCAL_LIB/gmp/.libs -L$LOCAL_LIB/openssl" -# support for Windows CPU groups -export DEFAULT_CFLAGS="-O3 -Wall -D_WIN32_WINNT=0x0601" -#export DEFAULT_CFLAGS="-O3 -Wall" +# support for Windows CPU groups, AES sometimes not included in -march +export DEFAULT_CFLAGS="-O3 -maes -Wall -D_WIN32_WINNT=0x0601" +export DEFAULT_CFLAGS_OLD="-O3 -Wall" # make link to local gmp header file. ln -s $LOCAL_LIB/gmp/gmp.h ./gmp.h @@ -41,7 +41,7 @@ cp $LOCAL_LIB/curl/lib/.libs/libcurl-4.dll release/ # Start building... 
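Before the per-target Windows build steps continue, a brief worked example for the std_calc_network_diff() rewrite in cpu-miner.c earlier in this patch. The old comment cited "diff 43.281 : 1c05ea29"; the sketch below (an editorial restatement, not the patched function, with the compact target already split into exponent and mantissa) reproduces that number with the same loop structure and long double arithmetic.

#include <stdio.h>

static double compact_to_diff( unsigned exponent, unsigned mantissa )
{
   long double d = (long double)0x0000ffff / (long double)mantissa;
   for ( unsigned m = exponent; m < 29; m++ ) d *= 256.0;   // exponent below 29: scale up
   for ( unsigned m = 29; m < exponent; m++ ) d /= 256.0;   // exponent above 29: scale down
   return (double)d;
}

int main( void )
{
   // 0x1c05ea29 -> exponent 0x1c (28), mantissa 0x05ea29
   printf( "net diff %.3f\n", compact_to_diff( 0x1c, 0x05ea29 ) );   // ~43.281
   return 0;
}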
-# Icelake AVX512 SHA VAES +# AVX512 SHA VAES: Intel Core Icelake, Rocketlake ./clean-all.sh || echo clean rm -f config.status ./autogen.sh || echo done @@ -50,65 +50,50 @@ make -j 8 strip -s cpuminer.exe mv cpuminer.exe release/cpuminer-avx512-sha-vaes.exe -# Rocketlake AVX512 SHA AES +# AVX512 AES: Intel Core HEDT Slylake-X, Cascadelake make clean || echo clean rm -f config.status -CFLAGS="$DEFAULT_CFLAGS -march=cascadelake -msha" ./configure $CONFIGURE_ARGS -#CFLAGS="$DEFAULT_CFLAGS -march=rocketlake" ./configure $CONFIGURE_ARGS -make -j 8 -strip -s cpuminer.exe -mv cpuminer.exe release/cpuminer-avx512-sha.exe - -# Zen1 AVX2 AES SHA -make clean || echo clean -rm -f config.status -CFLAGS="$DEFAULT_CFLAGS -march=znver1" ./configure $CONFIGURE_ARGS +CFLAGS="$DEFAULT_CFLAGS -march=skylake-avx512" ./configure $CONFIGURE_ARGS make -j 8 strip -s cpuminer.exe -mv cpuminer.exe release/cpuminer-zen.exe +mv cpuminer.exe release/cpuminer-avx512.exe -# Zen3 AVX2 SHA VAES -make clean || echo clean +# AVX2 SHA VAES: Intel Alderlake, AMD Zen3 +make clean || echo done rm -f config.status -CFLAGS="$DEFAULT_CFLAGS -march=znver2 -mvaes" ./configure $CONFIGURE_ARGS -# CFLAGS="$DEFAULT_CFLAGS -march=znver3" ./configure $CONFIGURE_ARGS +CFLAGS="$DEFAULT_CFLAGS -mavx2 -msha -mvaes" ./configure $CONFIGURE_ARGS make -j 8 strip -s cpuminer.exe -mv cpuminer.exe release/cpuminer-zen3.exe +mv cpuminer.exe release/cpuminer-avx2-sha-vaes.exe -# Slylake-X AVX512 AES -# mingw won't compile avx512 without -fno-asynchronous-unwind-tables +# AVX2 AES SHA: AMD Zen1 make clean || echo clean rm -f config.status -CFLAGS="$DEFAULT_CFLAGS -march=skylake-avx512" ./configure $CONFIGURE_ARGS -#CFLAGS="-O3 -march=skylake-avx512 -Wall -fno-asynchronous-unwind-tables" ./configure $CONFIGURE_ARGS +CFLAGS="$DEFAULT_CFLAGS -march=znver1" ./configure $CONFIGURE_ARGS make -j 8 strip -s cpuminer.exe -mv cpuminer.exe release/cpuminer-avx512.exe +mv cpuminer.exe release/cpuminer-avx2-sha.exe -# Haswell AVX2 AES +# AVX2 AES: Intel Core Haswell, Skylake, Kabylake, Coffeelake, Cometlake make clean || echo clean rm -f config.status -# GCC 9 doesn't include AES in -march=core-avx2 -CFLAGS="$DEFAULT_CFLAGS -march=core-avx2 -maes" ./configure $CONFIGURE_ARGS +CFLAGS="$DEFAULT_CFLAGS -march=core-avx2" ./configure $CONFIGURE_ARGS make -j 8 strip -s cpuminer.exe mv cpuminer.exe release/cpuminer-avx2.exe -# Sandybridge AVX AES +# AVX AES: Intel Sandybridge, Ivybridge make clean || echo clean rm -f config.status -# -march=corei7-avx still includes aes, but just in case -CFLAGS="$DEFAULT_CFLAGS -march=corei7-avx -maes" ./configure $CONFIGURE_ARGS +CFLAGS="$DEFAULT_CFLAGS_OLD -march=corei7-avx -maes" ./configure $CONFIGURE_ARGS make -j 8 strip -s cpuminer.exe mv cpuminer.exe release/cpuminer-avx.exe -# Westmere SSE4.2 AES +# SSE4.2 AES: Intel Westmere make clean || echo clean rm -f config.status -CFLAGS="$DEFAULT_CFLAGS -march=westmere -maes" ./configure $CONFIGURE_ARGS -#CFLAGS="-O3 -maes -msse4.2 -Wall" ./configure $CONFIGURE_ARGS +CFLAGS="$DEFAULT_CFLAGS_OLD -march=westmere -maes" ./configure $CONFIGURE_ARGS make -j 8 strip -s cpuminer.exe mv cpuminer.exe release/cpuminer-aes-sse42.exe @@ -116,7 +101,7 @@ mv cpuminer.exe release/cpuminer-aes-sse42.exe # Nehalem SSE4.2 #make clean || echo clean #rm -f config.status -#CFLAGS="$DEFAULT_CFLAGS -march=corei7" ./configure $CONFIGURE_ARGS +#CFLAGS="$DEFAULT_CFLAGS_OLD -march=corei7" ./configure $CONFIGURE_ARGS #make #strip -s cpuminer.exe #mv cpuminer.exe release/cpuminer-sse42.exe @@ -124,7 +109,7 @@ mv 
cpuminer.exe release/cpuminer-aes-sse42.exe # Core2 SSSE3 #make clean || echo clean #rm -f config.status -#CFLAGS="$DEFAULT_CFLAGS -march=core2" ./configure $CONFIGURE_ARGS +#CFLAGS="$DEFAULT_CFLAGS_OLD -march=core2" ./configure $CONFIGURE_ARGS #make #strip -s cpuminer.exe #mv cpuminer.exe release/cpuminer-ssse3.exe @@ -133,7 +118,7 @@ mv cpuminer.exe release/cpuminer-aes-sse42.exe # Generic SSE2 make clean || echo clean rm -f config.status -CFLAGS="$DEFAULT_CFLAGS -msse2" ./configure $CONFIGURE_ARGS +CFLAGS="$DEFAULT_CFLAGS_OLD -msse2" ./configure $CONFIGURE_ARGS make -j 8 strip -s cpuminer.exe mv cpuminer.exe release/cpuminer-sse2.exe From 0e3945ddb5c647b2d01ab3854cbffb5881b55b44 Mon Sep 17 00:00:00 2001 From: Jay D Dee Date: Thu, 30 Dec 2021 16:28:24 -0500 Subject: [PATCH 16/20] v3.19.2 --- RELEASE_NOTES | 13 +++++ algo/fugue/fugue-aesni.h | 11 ++++ algo/hamsi/hamsi-hash-4way.c | 37 +++++++++--- algo/scrypt/scrypt.c | 1 - algo/x16/x16r-4way.c | 105 +++++++++++++++++++++++++++-------- configure | 20 +++---- configure.ac | 2 +- cpu-miner.c | 13 +++-- 8 files changed, 154 insertions(+), 48 deletions(-) diff --git a/RELEASE_NOTES b/RELEASE_NOTES index e22c2baa..a8a5e1a8 100644 --- a/RELEASE_NOTES +++ b/RELEASE_NOTES @@ -65,6 +65,19 @@ If not what makes it happen or not happen? Change Log ---------- +v3.19.2 + +Fixed log displaying incorrect memory usage for scrypt, broken in v3.19.1. + +Reduce log noise when replies to submitted shares are lost due to stratum errors. + +Fugue prehash optimization for X16r family AVX2 & AVX512. + +Small speed improvement for Hamsi AVX2 & AVX512. + +Win: With CPU groups enabled the number of CPUs displayed in the ASCII art +affinity map is the number of CPUs in a CPU group, was number of CPUs up to 64. + v3.19.1 Changes to Windows binaries package: diff --git a/algo/fugue/fugue-aesni.h b/algo/fugue/fugue-aesni.h index d1536641..13fd8f87 100644 --- a/algo/fugue/fugue-aesni.h +++ b/algo/fugue/fugue-aesni.h @@ -37,12 +37,23 @@ typedef struct } hashState_fugue __attribute__ ((aligned (64))); + +// These functions are deprecated, use the lower case macro aliases that use +// the standard interface. This will be cleaned up at a later date. HashReturn fugue512_Init(hashState_fugue *state, int hashbitlen); HashReturn fugue512_Update(hashState_fugue *state, const void *data, DataLength databitlen); HashReturn fugue512_Final(hashState_fugue *state, void *hashval); +#define fugue512_init( state ) \ + fugue512_Init( state, 512 ) +#define fugue512_update( state, data, len ) \ + fugue512_Update( state, data, (len)<<3 ) +#define fugue512_final \ + fugue512_Final + + HashReturn fugue512_full(hashState_fugue *hs, void *hashval, const void *data, DataLength databitlen); #endif // AES diff --git a/algo/hamsi/hamsi-hash-4way.c b/algo/hamsi/hamsi-hash-4way.c index 26e133c9..b7b7c705 100644 --- a/algo/hamsi/hamsi-hash-4way.c +++ b/algo/hamsi/hamsi-hash-4way.c @@ -550,16 +550,38 @@ static const sph_u32 T512[64][16] = { // Hamsi 8 way AVX512 +// Tested on i9-9940x movepi64_mask is slow, cmple_epi64_mask with zero +// produces the same result but is faster. 
+#define INPUT_BIG8 \ +do { \ + __m512i db = _mm512_ror_epi64( *buf, 1 ); \ + const uint64_t *tp = (const uint64_t*)T512; \ + m0 = m1 = m2 = m3 = m4 = m5 = m6 = m7 = m512_zero; \ + for ( int u = 0; u < 64; u++ ) \ + { \ + __mmask8 dm = _mm512_cmplt_epi64_mask( db, m512_zero ); \ + m0 = _mm512_mask_xor_epi64( m0, dm, m0, m512_const1_64( tp[0] ) ); \ + m1 = _mm512_mask_xor_epi64( m1, dm, m1, m512_const1_64( tp[1] ) ); \ + m2 = _mm512_mask_xor_epi64( m2, dm, m2, m512_const1_64( tp[2] ) ); \ + m3 = _mm512_mask_xor_epi64( m3, dm, m3, m512_const1_64( tp[3] ) ); \ + m4 = _mm512_mask_xor_epi64( m4, dm, m4, m512_const1_64( tp[4] ) ); \ + m5 = _mm512_mask_xor_epi64( m5, dm, m5, m512_const1_64( tp[5] ) ); \ + m6 = _mm512_mask_xor_epi64( m6, dm, m6, m512_const1_64( tp[6] ) ); \ + m7 = _mm512_mask_xor_epi64( m7, dm, m7, m512_const1_64( tp[7] ) ); \ + db = _mm512_ror_epi64( db, 1 ); \ + tp += 8; \ + } \ +} while (0) + +/* #define INPUT_BIG8 \ do { \ __m512i db = *buf; \ - const uint64_t *tp = (uint64_t*)&T512[0][0]; \ + const uint64_t *tp = (const uint64_t*)T512; \ m0 = m1 = m2 = m3 = m4 = m5 = m6 = m7 = m512_zero; \ for ( int u = 0; u < 64; u++ ) \ { \ - __m512i dm = _mm512_and_si512( db, m512_one_64 ) ; \ - dm = mm512_negate_32( _mm512_or_si512( dm, \ - _mm512_slli_epi64( dm, 32 ) ) ); \ + __m512i dm = mm512_negate_64( _mm512_and_si512( db, m512_one_64 ) ); \ m0 = mm512_xorand( m0, dm, m512_const1_64( tp[0] ) ); \ m1 = mm512_xorand( m1, dm, m512_const1_64( tp[1] ) ); \ m2 = mm512_xorand( m2, dm, m512_const1_64( tp[2] ) ); \ @@ -572,6 +594,7 @@ do { \ db = _mm512_srli_epi64( db, 1 ); \ } \ } while (0) +*/ #define SBOX8( a, b, c, d ) \ do { \ @@ -888,13 +911,11 @@ void hamsi512_8way_close( hamsi_8way_big_context *sc, void *dst ) #define INPUT_BIG \ do { \ __m256i db = *buf; \ - const uint64_t *tp = (uint64_t*)&T512[0][0]; \ + const uint64_t *tp = (const uint64_t*)T512; \ m0 = m1 = m2 = m3 = m4 = m5 = m6 = m7 = m256_zero; \ for ( int u = 0; u < 64; u++ ) \ { \ - __m256i dm = _mm256_and_si256( db, m256_one_64 ) ; \ - dm = mm256_negate_32( _mm256_or_si256( dm, \ - _mm256_slli_epi64( dm, 32 ) ) ); \ + __m256i dm = mm256_negate_64( _mm256_and_si256( db, m256_one_64 ) ); \ m0 = _mm256_xor_si256( m0, _mm256_and_si256( dm, \ m256_const1_64( tp[0] ) ) ); \ m1 = _mm256_xor_si256( m1, _mm256_and_si256( dm, \ diff --git a/algo/scrypt/scrypt.c b/algo/scrypt/scrypt.c index 95639691..5557ca33 100644 --- a/algo/scrypt/scrypt.c +++ b/algo/scrypt/scrypt.c @@ -1544,7 +1544,6 @@ bool register_scrypt_algo( algo_gate_t* gate ) format_number_si( &t_size, t_units ); format_number_si( &d_size, d_units ); - applog( LOG_INFO,"Throughput %d/thr, Buffer %.0f %siB/thr, Total %.0f %siB\n", SCRYPT_THROUGHPUT, t_size, t_units, d_size, d_units ); diff --git a/algo/x16/x16r-4way.c b/algo/x16/x16r-4way.c index 8d4fb058..39efd257 100644 --- a/algo/x16/x16r-4way.c +++ b/algo/x16/x16r-4way.c @@ -60,7 +60,14 @@ void x16r_8way_prehash( void *vdata, void *pdata ) case HAMSI: mm512_bswap32_intrlv80_8x64( vdata, pdata ); hamsi512_8way_init( &x16r_ctx.hamsi ); - hamsi512_8way_update( &x16r_ctx.hamsi, vdata, 64 ); + hamsi512_8way_update( &x16r_ctx.hamsi, vdata, 72 ); + break; + case FUGUE: + mm128_bswap32_80( edata, pdata ); + fugue512_init( &x16r_ctx.fugue ); + fugue512_update( &x16r_ctx.fugue, edata, 76 ); + intrlv_8x64( vdata, edata, edata, edata, edata, + edata, edata, edata, edata, 640 ); break; case SHABAL: mm256_bswap32_intrlv80_8x32( vdata2, pdata ); @@ -306,7 +313,7 @@ int x16r_8way_hash_generic( void* output, const void* input, int thrid 
) break; case HAMSI: if ( i == 0 ) - hamsi512_8way_update( &ctx.hamsi, input + (64<<3), 16 ); + hamsi512_8way_update( &ctx.hamsi, input + (72<<3), 8 ); else { intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7, @@ -319,14 +326,43 @@ int x16r_8way_hash_generic( void* output, const void* input, int thrid ) hash7, vhash ); break; case FUGUE: - fugue512_full( &ctx.fugue, hash0, in0, size ); - fugue512_full( &ctx.fugue, hash1, in1, size ); - fugue512_full( &ctx.fugue, hash2, in2, size ); - fugue512_full( &ctx.fugue, hash3, in3, size ); - fugue512_full( &ctx.fugue, hash4, in4, size ); - fugue512_full( &ctx.fugue, hash5, in5, size ); - fugue512_full( &ctx.fugue, hash6, in6, size ); - fugue512_full( &ctx.fugue, hash7, in7, size ); + if ( i == 0 ) + { + fugue512_update( &ctx.fugue, in0 + 76, 4 ); + fugue512_final( &ctx.fugue, hash0 ); + memcpy( &ctx, &x16r_ctx, sizeof(hashState_fugue) ); + fugue512_update( &ctx.fugue, in1 + 76, 4 ); + fugue512_final( &ctx.fugue, hash1 ); + memcpy( &ctx, &x16r_ctx, sizeof(hashState_fugue) ); + fugue512_update( &ctx.fugue, in2 + 76, 4 ); + fugue512_final( &ctx.fugue, hash2 ); + memcpy( &ctx, &x16r_ctx, sizeof(hashState_fugue) ); + fugue512_update( &ctx.fugue, in3 + 76, 4 ); + fugue512_final( &ctx.fugue, hash3 ); + memcpy( &ctx, &x16r_ctx, sizeof(hashState_fugue) ); + fugue512_update( &ctx.fugue, in4 + 76, 4 ); + fugue512_final( &ctx.fugue, hash4 ); + memcpy( &ctx, &x16r_ctx, sizeof(hashState_fugue) ); + fugue512_update( &ctx.fugue, in5 + 76, 4 ); + fugue512_final( &ctx.fugue, hash5 ); + memcpy( &ctx, &x16r_ctx, sizeof(hashState_fugue) ); + fugue512_update( &ctx.fugue, in6 + 76, 4 ); + fugue512_final( &ctx.fugue, hash6 ); + memcpy( &ctx, &x16r_ctx, sizeof(hashState_fugue) ); + fugue512_update( &ctx.fugue, in7 + 76, 4 ); + fugue512_final( &ctx.fugue, hash7 ); + } + else + { + fugue512_full( &ctx.fugue, hash0, in0, size ); + fugue512_full( &ctx.fugue, hash1, in1, size ); + fugue512_full( &ctx.fugue, hash2, in2, size ); + fugue512_full( &ctx.fugue, hash3, in3, size ); + fugue512_full( &ctx.fugue, hash4, in4, size ); + fugue512_full( &ctx.fugue, hash5, in5, size ); + fugue512_full( &ctx.fugue, hash6, in6, size ); + fugue512_full( &ctx.fugue, hash7, in7, size ); + } break; case SHABAL: intrlv_8x32( vhash, in0, in1, in2, in3, in4, in5, in6, in7, @@ -347,25 +383,25 @@ int x16r_8way_hash_generic( void* output, const void* input, int thrid ) { sph_whirlpool( &ctx.whirlpool, in0 + 64, 16 ); sph_whirlpool_close( &ctx.whirlpool, hash0 ); - memcpy( &ctx, &x16r_ctx, sizeof(ctx) ); + memcpy( &ctx, &x16r_ctx, sizeof(sph_whirlpool_context) ); sph_whirlpool( &ctx.whirlpool, in1 + 64, 16 ); sph_whirlpool_close( &ctx.whirlpool, hash1 ); - memcpy( &ctx, &x16r_ctx, sizeof(ctx) ); + memcpy( &ctx, &x16r_ctx, sizeof(sph_whirlpool_context) ); sph_whirlpool( &ctx.whirlpool, in2 + 64, 16 ); sph_whirlpool_close( &ctx.whirlpool, hash2 ); - memcpy( &ctx, &x16r_ctx, sizeof(ctx) ); + memcpy( &ctx, &x16r_ctx, sizeof(sph_whirlpool_context) ); sph_whirlpool( &ctx.whirlpool, in3 + 64, 16 ); sph_whirlpool_close( &ctx.whirlpool, hash3 ); - memcpy( &ctx, &x16r_ctx, sizeof(ctx) ); + memcpy( &ctx, &x16r_ctx, sizeof(sph_whirlpool_context) ); sph_whirlpool( &ctx.whirlpool, in4 + 64, 16 ); sph_whirlpool_close( &ctx.whirlpool, hash4 ); - memcpy( &ctx, &x16r_ctx, sizeof(ctx) ); + memcpy( &ctx, &x16r_ctx, sizeof(sph_whirlpool_context) ); sph_whirlpool( &ctx.whirlpool, in5 + 64, 16 ); sph_whirlpool_close( &ctx.whirlpool, hash5 ); - memcpy( &ctx, &x16r_ctx, sizeof(ctx) ); + memcpy( &ctx, &x16r_ctx, 
sizeof(sph_whirlpool_context) ); sph_whirlpool( &ctx.whirlpool, in6 + 64, 16 ); sph_whirlpool_close( &ctx.whirlpool, hash6 ); - memcpy( &ctx, &x16r_ctx, sizeof(ctx) ); + memcpy( &ctx, &x16r_ctx, sizeof(sph_whirlpool_context) ); sph_whirlpool( &ctx.whirlpool, in7 + 64, 16 ); sph_whirlpool_close( &ctx.whirlpool, hash7 ); } @@ -532,7 +568,13 @@ void x16r_4way_prehash( void *vdata, void *pdata ) case HAMSI: mm256_bswap32_intrlv80_4x64( vdata, pdata ); hamsi512_4way_init( &x16r_ctx.hamsi ); - hamsi512_4way_update( &x16r_ctx.hamsi, vdata, 64 ); + hamsi512_4way_update( &x16r_ctx.hamsi, vdata, 72 ); + break; + case FUGUE: + mm128_bswap32_80( edata, pdata ); + fugue512_init( &x16r_ctx.fugue ); + fugue512_update( &x16r_ctx.fugue, edata, 76 ); + intrlv_4x64( vdata, edata, edata, edata, edata, 640 ); break; case SHABAL: mm128_bswap32_intrlv80_4x32( vdata2, pdata ); @@ -734,7 +776,7 @@ int x16r_4way_hash_generic( void* output, const void* input, int thrid ) break; case HAMSI: if ( i == 0 ) - hamsi512_4way_update( &ctx.hamsi, input + (64<<2), 16 ); + hamsi512_4way_update( &ctx.hamsi, input + (72<<2), 8 ); else { intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 ); @@ -745,10 +787,27 @@ int x16r_4way_hash_generic( void* output, const void* input, int thrid ) dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); break; case FUGUE: - fugue512_full( &ctx.fugue, hash0, in0, size ); - fugue512_full( &ctx.fugue, hash1, in1, size ); - fugue512_full( &ctx.fugue, hash2, in2, size ); - fugue512_full( &ctx.fugue, hash3, in3, size ); + if ( i == 0 ) + { + fugue512_update( &ctx.fugue, in0 + 76, 4 ); + fugue512_final( &ctx.fugue, hash0 ); + memcpy( &ctx, &x16r_ctx, sizeof(hashState_fugue) ); + fugue512_update( &ctx.fugue, in1 + 76, 4 ); + fugue512_final( &ctx.fugue, hash1 ); + memcpy( &ctx, &x16r_ctx, sizeof(hashState_fugue) ); + fugue512_update( &ctx.fugue, in2 + 76, 4 ); + fugue512_final( &ctx.fugue, hash2 ); + memcpy( &ctx, &x16r_ctx, sizeof(hashState_fugue) ); + fugue512_update( &ctx.fugue, in3 + 76, 4 ); + fugue512_final( &ctx.fugue, hash3 ); + } + else + { + fugue512_full( &ctx.fugue, hash0, in0, size ); + fugue512_full( &ctx.fugue, hash1, in1, size ); + fugue512_full( &ctx.fugue, hash2, in2, size ); + fugue512_full( &ctx.fugue, hash3, in3, size ); + } break; case SHABAL: intrlv_4x32( vhash, in0, in1, in2, in3, size<<3 ); diff --git a/configure b/configure index eca6ff1f..f678bda4 100755 --- a/configure +++ b/configure @@ -1,6 +1,6 @@ #! /bin/sh # Guess values for system-dependent variables and create Makefiles. -# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.19.1. +# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.19.2. # # # Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc. @@ -577,8 +577,8 @@ MAKEFLAGS= # Identity of this package. PACKAGE_NAME='cpuminer-opt' PACKAGE_TARNAME='cpuminer-opt' -PACKAGE_VERSION='3.19.1' -PACKAGE_STRING='cpuminer-opt 3.19.1' +PACKAGE_VERSION='3.19.2' +PACKAGE_STRING='cpuminer-opt 3.19.2' PACKAGE_BUGREPORT='' PACKAGE_URL='' @@ -1332,7 +1332,7 @@ if test "$ac_init_help" = "long"; then # Omit some internal or obsolete options to make the list less imposing. # This message is too long to be a string in the A/UX 3.1 sh. cat <<_ACEOF -\`configure' configures cpuminer-opt 3.19.1 to adapt to many kinds of systems. +\`configure' configures cpuminer-opt 3.19.2 to adapt to many kinds of systems. Usage: $0 [OPTION]... [VAR=VALUE]... 
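Returning to the Fugue prehash added to x16r-4way.c above: when Fugue is the first function in the X16R order, the first 76 bytes of the (already byte-swapped) 80-byte header are nonce-independent, so they are hashed once and only the final 4 bytes are hashed per nonce from a copy of the saved state. A single-lane sketch of that pattern, using the lower-case aliases introduced in algo/fugue/fugue-aesni.h above and assuming an AES-capable build where that header is available (an illustration, not the patched code):

#include <string.h>
#include "algo/fugue/fugue-aesni.h"

static hashState_fugue saved_midstate;

void fugue_prehash_76( const unsigned char header[80] )
{
   fugue512_init( &saved_midstate );
   fugue512_update( &saved_midstate, header, 76 );   // everything except the nonce
}

void fugue_finish_nonce( const unsigned char header[80], void *hash )
{
   hashState_fugue ctx;
   memcpy( &ctx, &saved_midstate, sizeof(hashState_fugue) );   // restore midstate
   fugue512_update( &ctx, header + 76, 4 );                    // the 4-byte nonce
   fugue512_final( &ctx, hash );
}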
@@ -1404,7 +1404,7 @@ fi if test -n "$ac_init_help"; then case $ac_init_help in - short | recursive ) echo "Configuration of cpuminer-opt 3.19.1:";; + short | recursive ) echo "Configuration of cpuminer-opt 3.19.2:";; esac cat <<\_ACEOF @@ -1509,7 +1509,7 @@ fi test -n "$ac_init_help" && exit $ac_status if $ac_init_version; then cat <<\_ACEOF -cpuminer-opt configure 3.19.1 +cpuminer-opt configure 3.19.2 generated by GNU Autoconf 2.69 Copyright (C) 2012 Free Software Foundation, Inc. @@ -2012,7 +2012,7 @@ cat >config.log <<_ACEOF This file contains any messages produced by compilers while running configure, to aid debugging if configure makes a mistake. -It was created by cpuminer-opt $as_me 3.19.1, which was +It was created by cpuminer-opt $as_me 3.19.2, which was generated by GNU Autoconf 2.69. Invocation command line was $ $0 $@ @@ -2993,7 +2993,7 @@ fi # Define the identity of the package. PACKAGE='cpuminer-opt' - VERSION='3.19.1' + VERSION='3.19.2' cat >>confdefs.h <<_ACEOF @@ -6690,7 +6690,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 # report actual input values of CONFIG_FILES etc. instead of their # values after options handling. ac_log=" -This file was extended by cpuminer-opt $as_me 3.19.1, which was +This file was extended by cpuminer-opt $as_me 3.19.2, which was generated by GNU Autoconf 2.69. Invocation command line was CONFIG_FILES = $CONFIG_FILES @@ -6756,7 +6756,7 @@ _ACEOF cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`" ac_cs_version="\\ -cpuminer-opt config.status 3.19.1 +cpuminer-opt config.status 3.19.2 configured by $0, generated by GNU Autoconf 2.69, with options \\"\$ac_cs_config\\" diff --git a/configure.ac b/configure.ac index 11d4e595..314b0d5b 100644 --- a/configure.ac +++ b/configure.ac @@ -1,4 +1,4 @@ -AC_INIT([cpuminer-opt], [3.19.1]) +AC_INIT([cpuminer-opt], [3.19.2]) AC_PREREQ([2.59c]) AC_CANONICAL_SYSTEM diff --git a/cpu-miner.c b/cpu-miner.c index ee31ae58..7a70f9f1 100644 --- a/cpu-miner.c +++ b/cpu-miner.c @@ -224,7 +224,11 @@ static uint8_t thread_affinity_map[ max_cpus ]; // display affinity mask graphically static void format_affinity_mask( char *mask_str, uint64_t mask ) { +#if defined(WINDOWS_CPU_GROUPS_ENABLED) + int n = num_cpus / num_cpugroups; +#else int n = num_cpus < 64 ? 
num_cpus : 64; +#endif int i; for ( i = 0; i < n; i++ ) { @@ -2164,7 +2168,7 @@ static void stratum_gen_work( struct stratum_ctx *sctx, struct work *g_work ) } // !quiet } // new diff/block - if ( new_job && !opt_quiet ) + if ( new_job && !( opt_quiet || stratum_errors ) ) { int mismatch = submitted_share_count - ( accepted_share_count + stale_share_count @@ -3609,7 +3613,9 @@ int main(int argc, char *argv[]) num_cpus = 1; #endif - if ( num_cpus < 1 ) num_cpus = 1; + if ( num_cpus < 1 ) + num_cpus = 1; + opt_n_threads = num_cpus; parse_cmdline( argc, argv ); @@ -3745,9 +3751,6 @@ int main(int argc, char *argv[]) } #endif - if ( ( opt_n_threads == 0 ) || ( opt_n_threads > num_cpus ) ) - opt_n_threads = num_cpus; - if ( opt_affinity && num_cpus > max_cpus ) { applog( LOG_WARNING, "More than %d CPUs, CPU affinity is disabled", From 17ccbc328fce6af2d9afb407df04a259c3fab945 Mon Sep 17 00:00:00 2001 From: Jay D Dee Date: Fri, 7 Jan 2022 12:07:38 -0500 Subject: [PATCH 17/20] v3.19.3 --- Makefile.am | 1 + RELEASE_NOTES | 6 + algo/argon2/argon2d/argon2d/opt.c | 7 + algo/argon2/argon2d/blake2/blamka-round-opt.h | 29 +- algo/hamsi/hamsi-hash-4way.c | 520 ++++++++---------- algo/keccak/keccak-hash-4way.c | 17 +- algo/keccak/keccak-macros.c | 15 +- algo/scrypt/scrypt.c | 15 +- algo/shabal/shabal-hash-4way.c | 496 ++++++++--------- algo/verthash/Verthash.c | 17 +- algo/verthash/tiny_sha3/sha3-4way.c | 33 +- algo/verthash/verthash-gate.c | 2 +- configure | 20 +- configure.ac | 2 +- malloc-huge.c | 36 ++ malloc-huge.h | 24 + winbuild-cross.sh | 23 +- 17 files changed, 657 insertions(+), 606 deletions(-) create mode 100644 malloc-huge.c create mode 100644 malloc-huge.h diff --git a/Makefile.am b/Makefile.am index 36e208ae..db71cf12 100644 --- a/Makefile.am +++ b/Makefile.am @@ -21,6 +21,7 @@ cpuminer_SOURCES = \ api.c \ sysinfos.c \ algo-gate-api.c\ + malloc-huge.c \ algo/argon2/argon2a/argon2a.c \ algo/argon2/argon2a/ar2/argon2.c \ algo/argon2/argon2a/ar2/opt.c \ diff --git a/RELEASE_NOTES b/RELEASE_NOTES index a8a5e1a8..9f3fb6a3 100644 --- a/RELEASE_NOTES +++ b/RELEASE_NOTES @@ -65,6 +65,12 @@ If not what makes it happen or not happen? Change Log ---------- +v3.19.3 + +Linux: Faster verthash (+25%), scryptn2 (+2%) when huge pages are available. + +Small speed up for Hamsi AVX2 & AVX512, Keccak AVX512. + v3.19.2 Fixed log displaying incorrect memory usage for scrypt, broken in v3.19.1. 
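The new malloc-huge.c/.h listed above are not shown in this hunk; as a hedged illustration only, the Linux mechanism such a helper typically wraps is an anonymous MAP_HUGETLB mapping, with the caller falling back to an ordinary allocation when no huge pages are reserved (hence "when huge pages are available" in the release note).

#define _GNU_SOURCE
#include <stddef.h>
#include <sys/mman.h>

#define HUGEPAGE_2M  (2UL * 1024 * 1024)

// Returns a 2 MiB huge-page backed buffer, or NULL so the caller can fall
// back to malloc(). Free with munmap() using the same rounded size.
static void *alloc_huge_2m( size_t size )
{
   size_t rounded = ( size + HUGEPAGE_2M - 1 ) & ~( HUGEPAGE_2M - 1 );
   void *p = mmap( NULL, rounded, PROT_READ | PROT_WRITE,
                   MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0 );
   return p == MAP_FAILED ? NULL : p;
}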
diff --git a/algo/argon2/argon2d/argon2d/opt.c b/algo/argon2/argon2d/argon2d/opt.c index 31829304..5164a1e9 100644 --- a/algo/argon2/argon2d/argon2d/opt.c +++ b/algo/argon2/argon2d/argon2d/opt.c @@ -37,6 +37,13 @@ #if defined(__AVX512F__) +static inline __m512i blamka( __m512i x, __m512i y ) +{ + __m512i xy = _mm512_mul_epu32( x, y ); + return _mm512_add_epi64( _mm512_add_epi64( x, y ), + _mm512_add_epi64( xy, xy ) ); +} + static void fill_block( __m512i *state, const block *ref_block, block *next_block, int with_xor ) { diff --git a/algo/argon2/argon2d/blake2/blamka-round-opt.h b/algo/argon2/argon2d/blake2/blamka-round-opt.h index 809961c3..4cb8bdad 100644 --- a/algo/argon2/argon2d/blake2/blamka-round-opt.h +++ b/algo/argon2/argon2d/blake2/blamka-round-opt.h @@ -328,9 +328,7 @@ static BLAKE2_INLINE __m128i fBlaMka(__m128i x, __m128i y) { #include -#define ROR64(x, n) _mm512_ror_epi64((x), (n)) - -static __m512i muladd(__m512i x, __m512i y) +static inline __m512i muladd(__m512i x, __m512i y) { __m512i z = _mm512_mul_epu32(x, y); return _mm512_add_epi64(_mm512_add_epi64(x, y), _mm512_add_epi64(z, z)); @@ -344,8 +342,8 @@ static __m512i muladd(__m512i x, __m512i y) D0 = _mm512_xor_si512(D0, A0); \ D1 = _mm512_xor_si512(D1, A1); \ \ - D0 = ROR64(D0, 32); \ - D1 = ROR64(D1, 32); \ + D0 = _mm512_ror_epi64(D0, 32); \ + D1 = _mm512_ror_epi64(D1, 32); \ \ C0 = muladd(C0, D0); \ C1 = muladd(C1, D1); \ @@ -353,8 +351,8 @@ static __m512i muladd(__m512i x, __m512i y) B0 = _mm512_xor_si512(B0, C0); \ B1 = _mm512_xor_si512(B1, C1); \ \ - B0 = ROR64(B0, 24); \ - B1 = ROR64(B1, 24); \ + B0 = _mm512_ror_epi64(B0, 24); \ + B1 = _mm512_ror_epi64(B1, 24); \ } while ((void)0, 0) #define G2(A0, B0, C0, D0, A1, B1, C1, D1) \ @@ -365,8 +363,8 @@ static __m512i muladd(__m512i x, __m512i y) D0 = _mm512_xor_si512(D0, A0); \ D1 = _mm512_xor_si512(D1, A1); \ \ - D0 = ROR64(D0, 16); \ - D1 = ROR64(D1, 16); \ + D0 = _mm512_ror_epi64(D0, 16); \ + D1 = _mm512_ror_epi64(D1, 16); \ \ C0 = muladd(C0, D0); \ C1 = muladd(C1, D1); \ @@ -374,8 +372,8 @@ static __m512i muladd(__m512i x, __m512i y) B0 = _mm512_xor_si512(B0, C0); \ B1 = _mm512_xor_si512(B1, C1); \ \ - B0 = ROR64(B0, 63); \ - B1 = ROR64(B1, 63); \ + B0 = _mm512_ror_epi64(B0, 63); \ + B1 = _mm512_ror_epi64(B1, 63); \ } while ((void)0, 0) #define DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1) \ @@ -417,11 +415,10 @@ static __m512i muladd(__m512i x, __m512i y) #define SWAP_HALVES(A0, A1) \ do { \ - __m512i t0, t1; \ - t0 = _mm512_shuffle_i64x2(A0, A1, _MM_SHUFFLE(1, 0, 1, 0)); \ - t1 = _mm512_shuffle_i64x2(A0, A1, _MM_SHUFFLE(3, 2, 3, 2)); \ - A0 = t0; \ - A1 = t1; \ + __m512i t; \ + t = _mm512_shuffle_i64x2(A0, A1, _MM_SHUFFLE(1, 0, 1, 0)); \ + A1 = _mm512_shuffle_i64x2(A0, A1, _MM_SHUFFLE(3, 2, 3, 2)); \ + A0 = t; \ } while((void)0, 0) #define SWAP_QUARTERS(A0, A1) \ diff --git a/algo/hamsi/hamsi-hash-4way.c b/algo/hamsi/hamsi-hash-4way.c index b7b7c705..38bf0763 100644 --- a/algo/hamsi/hamsi-hash-4way.c +++ b/algo/hamsi/hamsi-hash-4way.c @@ -545,21 +545,23 @@ static const sph_u32 T512[64][16] = { #define sE c7 #define sF m7 - #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) // Hamsi 8 way AVX512 -// Tested on i9-9940x movepi64_mask is slow, cmple_epi64_mask with zero -// produces the same result but is faster. +// Intel says _mm512_movepi64_mask has (1L/1T) timimg while +// _mm512_cmplt_epi64_mask as (3L/1T) timing, however, when tested hashing X13 +// on i9-9940x cmplt with zero was 3% faster than movepi. 
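For reference (an editorial sketch, not part of the patch), the per-lane work that INPUT_BIG8 below vectorizes is the Hamsi message expansion: each of the 64 message bits conditionally XORs one eight-word row of T512 into m0..m7, and the initial ror-by-1 simply parks the bit under test in the sign position so the masked compare can read it. The scalar equivalent, assuming T512 is viewed as 64 rows of eight 64-bit words (matching the (const uint64_t*)T512 cast and the tp += 8 stride):

#include <stdint.h>

static void hamsi_input_ref( const uint64_t *t512, uint64_t db, uint64_t m[8] )
{
   for ( int k = 0; k < 8; k++ ) m[k] = 0;
   for ( int u = 0; u < 64; u++, t512 += 8 )
      if ( ( db >> u ) & 1 )                 // bit u selects row u of the table
         for ( int k = 0; k < 8; k++ )
            m[k] ^= t512[k];
}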
+ #define INPUT_BIG8 \ do { \ __m512i db = _mm512_ror_epi64( *buf, 1 ); \ + const __m512i zero = m512_zero; \ const uint64_t *tp = (const uint64_t*)T512; \ - m0 = m1 = m2 = m3 = m4 = m5 = m6 = m7 = m512_zero; \ + m0 = m1 = m2 = m3 = m4 = m5 = m6 = m7 = zero; \ for ( int u = 0; u < 64; u++ ) \ { \ - __mmask8 dm = _mm512_cmplt_epi64_mask( db, m512_zero ); \ + const __mmask8 dm = _mm512_cmplt_epi64_mask( db, zero ); \ m0 = _mm512_mask_xor_epi64( m0, dm, m0, m512_const1_64( tp[0] ) ); \ m1 = _mm512_mask_xor_epi64( m1, dm, m1, m512_const1_64( tp[1] ) ); \ m2 = _mm512_mask_xor_epi64( m2, dm, m2, m512_const1_64( tp[2] ) ); \ @@ -573,29 +575,6 @@ do { \ } \ } while (0) -/* -#define INPUT_BIG8 \ -do { \ - __m512i db = *buf; \ - const uint64_t *tp = (const uint64_t*)T512; \ - m0 = m1 = m2 = m3 = m4 = m5 = m6 = m7 = m512_zero; \ - for ( int u = 0; u < 64; u++ ) \ - { \ - __m512i dm = mm512_negate_64( _mm512_and_si512( db, m512_one_64 ) ); \ - m0 = mm512_xorand( m0, dm, m512_const1_64( tp[0] ) ); \ - m1 = mm512_xorand( m1, dm, m512_const1_64( tp[1] ) ); \ - m2 = mm512_xorand( m2, dm, m512_const1_64( tp[2] ) ); \ - m3 = mm512_xorand( m3, dm, m512_const1_64( tp[3] ) ); \ - m4 = mm512_xorand( m4, dm, m512_const1_64( tp[4] ) ); \ - m5 = mm512_xorand( m5, dm, m512_const1_64( tp[5] ) ); \ - m6 = mm512_xorand( m6, dm, m512_const1_64( tp[6] ) ); \ - m7 = mm512_xorand( m7, dm, m512_const1_64( tp[7] ) ); \ - tp += 8; \ - db = _mm512_srli_epi64( db, 1 ); \ - } \ -} while (0) -*/ - #define SBOX8( a, b, c, d ) \ do { \ __m512i t; \ @@ -632,199 +611,192 @@ do { \ #define READ_STATE_BIG8(sc) \ do { \ - c0 = sc->h[0x0]; \ - c1 = sc->h[0x1]; \ - c2 = sc->h[0x2]; \ - c3 = sc->h[0x3]; \ - c4 = sc->h[0x4]; \ - c5 = sc->h[0x5]; \ - c6 = sc->h[0x6]; \ - c7 = sc->h[0x7]; \ + c0 = sc->h[0]; \ + c1 = sc->h[1]; \ + c2 = sc->h[2]; \ + c3 = sc->h[3]; \ + c4 = sc->h[4]; \ + c5 = sc->h[5]; \ + c6 = sc->h[6]; \ + c7 = sc->h[7]; \ } while (0) #define WRITE_STATE_BIG8(sc) \ do { \ - sc->h[0x0] = c0; \ - sc->h[0x1] = c1; \ - sc->h[0x2] = c2; \ - sc->h[0x3] = c3; \ - sc->h[0x4] = c4; \ - sc->h[0x5] = c5; \ - sc->h[0x6] = c6; \ - sc->h[0x7] = c7; \ + sc->h[0] = c0; \ + sc->h[1] = c1; \ + sc->h[2] = c2; \ + sc->h[3] = c3; \ + sc->h[4] = c4; \ + sc->h[5] = c5; \ + sc->h[6] = c6; \ + sc->h[7] = c7; \ } while (0) - #define ROUND_BIG8( alpha ) \ do { \ __m512i t0, t1, t2, t3; \ - s0 = _mm512_xor_si512( s0, alpha[ 0] ); \ - s1 = _mm512_xor_si512( s1, alpha[ 1] ); \ - s2 = _mm512_xor_si512( s2, alpha[ 2] ); \ - s3 = _mm512_xor_si512( s3, alpha[ 3] ); \ - s4 = _mm512_xor_si512( s4, alpha[ 4] ); \ - s5 = _mm512_xor_si512( s5, alpha[ 5] ); \ - s6 = _mm512_xor_si512( s6, alpha[ 6] ); \ - s7 = _mm512_xor_si512( s7, alpha[ 7] ); \ - s8 = _mm512_xor_si512( s8, alpha[ 8] ); \ - s9 = _mm512_xor_si512( s9, alpha[ 9] ); \ - sA = _mm512_xor_si512( sA, alpha[10] ); \ - sB = _mm512_xor_si512( sB, alpha[11] ); \ - sC = _mm512_xor_si512( sC, alpha[12] ); \ - sD = _mm512_xor_si512( sD, alpha[13] ); \ - sE = _mm512_xor_si512( sE, alpha[14] ); \ - sF = _mm512_xor_si512( sF, alpha[15] ); \ + s0 = _mm512_xor_si512( s0, alpha[ 0] ); /* m0 */ \ + s1 = _mm512_xor_si512( s1, alpha[ 1] ); /* c0 */ \ + s2 = _mm512_xor_si512( s2, alpha[ 2] ); /* m1 */ \ + s3 = _mm512_xor_si512( s3, alpha[ 3] ); /* c1 */ \ + s4 = _mm512_xor_si512( s4, alpha[ 4] ); /* c2 */ \ + s5 = _mm512_xor_si512( s5, alpha[ 5] ); /* m2 */ \ + s6 = _mm512_xor_si512( s6, alpha[ 6] ); /* c3 */ \ + s7 = _mm512_xor_si512( s7, alpha[ 7] ); /* m3 */ \ + s8 = _mm512_xor_si512( s8, alpha[ 8] ); /* m4 */ \ + s9 
= _mm512_xor_si512( s9, alpha[ 9] ); /* c4 */ \ + sA = _mm512_xor_si512( sA, alpha[10] ); /* m5 */ \ + sB = _mm512_xor_si512( sB, alpha[11] ); /* c5 */ \ + sC = _mm512_xor_si512( sC, alpha[12] ); /* c6 */ \ + sD = _mm512_xor_si512( sD, alpha[13] ); /* m6 */ \ + sE = _mm512_xor_si512( sE, alpha[14] ); /* c7 */ \ + sF = _mm512_xor_si512( sF, alpha[15] ); /* m7 */ \ \ - SBOX8( s0, s4, s8, sC ); \ - SBOX8( s1, s5, s9, sD ); \ - SBOX8( s2, s6, sA, sE ); \ - SBOX8( s3, s7, sB, sF ); \ + SBOX8( s0, s4, s8, sC ); /* ( m0, c2, m4, c6 ) */ \ + SBOX8( s1, s5, s9, sD ); /* ( c0, m2, c4, m6 ) */ \ + SBOX8( s2, s6, sA, sE ); /* ( m1, c3, m5, c7 ) */ \ + SBOX8( s3, s7, sB, sF ); /* ( c1, m3, c5, m7 ) */ \ \ - t1 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( s4, 4 ), \ - _mm512_bslli_epi128( s5, 4 ) ); \ - t3 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( sD, 4 ), \ - _mm512_bslli_epi128( sE, 4 ) ); \ + s4 = mm512_swap64_32( s4 ); \ + s5 = mm512_swap64_32( s5 ); \ + sD = mm512_swap64_32( sD ); \ + sE = mm512_swap64_32( sE ); \ + t1 = _mm512_mask_blend_epi32( 0xaaaa, s4, s5 ); \ + t3 = _mm512_mask_blend_epi32( 0xaaaa, sD, sE ); \ L8( s0, t1, s9, t3 ); \ - s4 = _mm512_mask_blend_epi32( 0xaaaa, s4, _mm512_bslli_epi128( t1, 4 ) ); \ - s5 = _mm512_mask_blend_epi32( 0x5555, s5, _mm512_bsrli_epi128( t1, 4 ) ); \ - sD = _mm512_mask_blend_epi32( 0xaaaa, sD, _mm512_bslli_epi128( t3, 4 ) ); \ - sE = _mm512_mask_blend_epi32( 0x5555, sE, _mm512_bsrli_epi128( t3, 4 ) ); \ + s4 = _mm512_mask_blend_epi32( 0x5555, s4, t1 ); \ + s5 = _mm512_mask_blend_epi32( 0xaaaa, s5, t1 ); \ + sD = _mm512_mask_blend_epi32( 0x5555, sD, t3 ); \ + sE = _mm512_mask_blend_epi32( 0xaaaa, sE, t3 ); \ \ - t1 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( s5, 4 ), \ - _mm512_bslli_epi128( s6, 4 ) ); \ - t3 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( sE, 4 ), \ - _mm512_bslli_epi128( sF, 4 ) ); \ + s6 = mm512_swap64_32( s6 ); \ + sF = mm512_swap64_32( sF ); \ + t1 = _mm512_mask_blend_epi32( 0xaaaa, s5, s6 ); \ + t3 = _mm512_mask_blend_epi32( 0xaaaa, sE, sF ); \ L8( s1, t1, sA, t3 ); \ - s5 = _mm512_mask_blend_epi32( 0xaaaa, s5, _mm512_bslli_epi128( t1, 4 ) ); \ - s6 = _mm512_mask_blend_epi32( 0x5555, s6, _mm512_bsrli_epi128( t1, 4 ) ); \ - sE = _mm512_mask_blend_epi32( 0xaaaa, sE, _mm512_bslli_epi128( t3, 4 ) ); \ - sF = _mm512_mask_blend_epi32( 0x5555, sF, _mm512_bsrli_epi128( t3, 4 ) ); \ + s5 = _mm512_mask_blend_epi32( 0x5555, s5, t1 ); \ + s6 = _mm512_mask_blend_epi32( 0xaaaa, s6, t1 ); \ + sE = _mm512_mask_blend_epi32( 0x5555, sE, t3 ); \ + sF = _mm512_mask_blend_epi32( 0xaaaa, sF, t3 ); \ \ - t1 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( s6, 4 ), \ - _mm512_bslli_epi128( s7, 4 ) ); \ - t3 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( sF, 4 ), \ - _mm512_bslli_epi128( sC, 4 ) ); \ + s7 = mm512_swap64_32( s7 ); \ + sC = mm512_swap64_32( sC ); \ + t1 = _mm512_mask_blend_epi32( 0xaaaa, s6, s7 ); \ + t3 = _mm512_mask_blend_epi32( 0xaaaa, sF, sC ); \ L8( s2, t1, sB, t3 ); \ - s6 = _mm512_mask_blend_epi32( 0xaaaa, s6, _mm512_bslli_epi128( t1, 4 ) ); \ - s7 = _mm512_mask_blend_epi32( 0x5555, s7, _mm512_bsrli_epi128( t1, 4 ) ); \ - sF = _mm512_mask_blend_epi32( 0xaaaa, sF, _mm512_bslli_epi128( t3, 4 ) ); \ - sC = _mm512_mask_blend_epi32( 0x5555, sC, _mm512_bsrli_epi128( t3, 4 ) ); \ + s6 = _mm512_mask_blend_epi32( 0x5555, s6, t1 ); \ + s7 = _mm512_mask_blend_epi32( 0xaaaa, s7, t1 ); \ + sF = _mm512_mask_blend_epi32( 0x5555, sF, t3 ); \ + sC = _mm512_mask_blend_epi32( 0xaaaa, sC, t3 ); \ 
+ s6 = mm512_swap64_32( s6 ); \ + sF = mm512_swap64_32( sF ); \ \ - t1 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( s7, 4 ), \ - _mm512_bslli_epi128( s4, 4 ) ); \ - t3 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( sC, 4 ), \ - _mm512_bslli_epi128( sD, 4 ) ); \ + t1 = _mm512_mask_blend_epi32( 0xaaaa, s7, s4 ); \ + t3 = _mm512_mask_blend_epi32( 0xaaaa, sC, sD ); \ L8( s3, t1, s8, t3 ); \ - s7 = _mm512_mask_blend_epi32( 0xaaaa, s7, _mm512_bslli_epi128( t1, 4 ) ); \ - s4 = _mm512_mask_blend_epi32( 0x5555, s4, _mm512_bsrli_epi128( t1, 4 ) ); \ - sC = _mm512_mask_blend_epi32( 0xaaaa, sC, _mm512_bslli_epi128( t3, 4 ) ); \ - sD = _mm512_mask_blend_epi32( 0x5555, sD, _mm512_bsrli_epi128( t3, 4 ) ); \ + s7 = _mm512_mask_blend_epi32( 0x5555, s7, t1 ); \ + s4 = _mm512_mask_blend_epi32( 0xaaaa, s4, t1 ); \ + sC = _mm512_mask_blend_epi32( 0x5555, sC, t3 ); \ + sD = _mm512_mask_blend_epi32( 0xaaaa, sD, t3 ); \ + s7 = mm512_swap64_32( s7 ); \ + sC = mm512_swap64_32( sC ); \ \ - t0 = _mm512_mask_blend_epi32( 0xaaaa, s0, _mm512_bslli_epi128( s8, 4 ) ); \ + t0 = _mm512_mask_blend_epi32( 0xaaaa, s0, mm512_swap64_32( s8 ) ); \ t1 = _mm512_mask_blend_epi32( 0xaaaa, s1, s9 ); \ - t2 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( s2, 4 ), sA ); \ - t3 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( s3, 4 ), \ - _mm512_bslli_epi128( sB, 4 ) ); \ + t2 = _mm512_mask_blend_epi32( 0xaaaa, mm512_swap64_32( s2 ), sA ); \ + t3 = _mm512_mask_blend_epi32( 0x5555, s3, sB ); \ + t3 = mm512_swap64_32( t3 ); \ L8( t0, t1, t2, t3 ); \ + t3 = mm512_swap64_32( t3 ); \ s0 = _mm512_mask_blend_epi32( 0x5555, s0, t0 ); \ - s8 = _mm512_mask_blend_epi32( 0x5555, s8, _mm512_bsrli_epi128( t0, 4 ) ); \ + s8 = _mm512_mask_blend_epi32( 0x5555, s8, mm512_swap64_32( t0 ) ); \ s1 = _mm512_mask_blend_epi32( 0x5555, s1, t1 ); \ s9 = _mm512_mask_blend_epi32( 0xaaaa, s9, t1 ); \ - s2 = _mm512_mask_blend_epi32( 0xaaaa, s2, _mm512_bslli_epi128( t2, 4 ) ); \ + s2 = _mm512_mask_blend_epi32( 0xaaaa, s2, mm512_swap64_32( t2 ) ); \ sA = _mm512_mask_blend_epi32( 0xaaaa, sA, t2 ); \ - s3 = _mm512_mask_blend_epi32( 0xaaaa, s3, _mm512_bslli_epi128( t3, 4 ) ); \ - sB = _mm512_mask_blend_epi32( 0x5555, sB, _mm512_bsrli_epi128( t3, 4 ) ); \ + s3 = _mm512_mask_blend_epi32( 0xaaaa, s3, t3 ); \ + sB = _mm512_mask_blend_epi32( 0x5555, sB, t3 ); \ \ - t0 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( s4, 4 ), sC ); \ - t1 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( s5, 4 ), \ - _mm512_bslli_epi128( sD, 4 ) ); \ - t2 = _mm512_mask_blend_epi32( 0xaaaa, s6, _mm512_bslli_epi128( sE, 4 ) ); \ + t0 = _mm512_mask_blend_epi32( 0xaaaa, s4, sC ); \ + t1 = _mm512_mask_blend_epi32( 0xaaaa, s5, sD ); \ + t2 = _mm512_mask_blend_epi32( 0xaaaa, s6, sE ); \ t3 = _mm512_mask_blend_epi32( 0xaaaa, s7, sF ); \ L8( t0, t1, t2, t3 ); \ - s4 = _mm512_mask_blend_epi32( 0xaaaa, s4, _mm512_bslli_epi128( t0, 4 ) ); \ + s4 = _mm512_mask_blend_epi32( 0x5555, s4, t0 ); \ sC = _mm512_mask_blend_epi32( 0xaaaa, sC, t0 ); \ - s5 = _mm512_mask_blend_epi32( 0xaaaa, s5, _mm512_bslli_epi128( t1, 4 ) ); \ - sD = _mm512_mask_blend_epi32( 0x5555, sD, _mm512_bsrli_epi128( t1, 4 ) ); \ + s5 = _mm512_mask_blend_epi32( 0x5555, s5, t1 ); \ + sD = _mm512_mask_blend_epi32( 0xaaaa, sD, t1 ); \ s6 = _mm512_mask_blend_epi32( 0x5555, s6, t2 ); \ - sE = _mm512_mask_blend_epi32( 0x5555, sE, _mm512_bsrli_epi128( t2, 4 ) ); \ + sE = _mm512_mask_blend_epi32( 0xaaaa, sE, t2 ); \ s7 = _mm512_mask_blend_epi32( 0x5555, s7, t3 ); \ sF = _mm512_mask_blend_epi32( 0xaaaa, 
sF, t3 ); \ + s4 = mm512_swap64_32( s4 ); \ + s5 = mm512_swap64_32( s5 ); \ + sD = mm512_swap64_32( sD ); \ + sE = mm512_swap64_32( sE ); \ } while (0) #define P_BIG8 \ do { \ __m512i alpha[16]; \ + const uint64_t A0 = ( (uint64_t*)alpha_n )[0]; \ for( int i = 0; i < 16; i++ ) \ alpha[i] = m512_const1_64( ( (uint64_t*)alpha_n )[i] ); \ ROUND_BIG8( alpha ); \ - alpha[0] = m512_const1_64( ( (uint64_t)1 << 32 ) \ - ^ ( (uint64_t*)alpha_n )[0] ); \ + alpha[0] = m512_const1_64( (1ULL << 32) ^ A0 ); \ ROUND_BIG8( alpha ); \ - alpha[0] = m512_const1_64( ( (uint64_t)2 << 32 ) \ - ^ ( (uint64_t*)alpha_n )[0] ); \ + alpha[0] = m512_const1_64( (2ULL << 32) ^ A0 ); \ ROUND_BIG8( alpha ); \ - alpha[0] = m512_const1_64( ( (uint64_t)3 << 32 ) \ - ^ ( (uint64_t*)alpha_n )[0] ); \ + alpha[0] = m512_const1_64( (3ULL << 32) ^ A0 ); \ ROUND_BIG8( alpha ); \ - alpha[0] = m512_const1_64( ( (uint64_t)4 << 32 ) \ - ^ ( (uint64_t*)alpha_n )[0] ); \ + alpha[0] = m512_const1_64( (4ULL << 32) ^ A0 ); \ ROUND_BIG8( alpha ); \ - alpha[0] = m512_const1_64( ( (uint64_t)5 << 32 ) \ - ^ ( (uint64_t*)alpha_n )[0] ); \ + alpha[0] = m512_const1_64( (5ULL << 32) ^ A0 ); \ ROUND_BIG8( alpha ); \ } while (0) #define PF_BIG8 \ do { \ __m512i alpha[16]; \ + const uint64_t A0 = ( (uint64_t*)alpha_f )[0]; \ for( int i = 0; i < 16; i++ ) \ alpha[i] = m512_const1_64( ( (uint64_t*)alpha_f )[i] ); \ ROUND_BIG8( alpha ); \ - alpha[0] = m512_const1_64( ( (uint64_t)1 << 32 ) \ - ^ ( (uint64_t*)alpha_f )[0] ); \ + alpha[0] = m512_const1_64( ( 1ULL << 32) ^ A0 ); \ ROUND_BIG8( alpha ); \ - alpha[0] = m512_const1_64( ( (uint64_t)2 << 32 ) \ - ^ ( (uint64_t*)alpha_f )[0] ); \ + alpha[0] = m512_const1_64( ( 2ULL << 32) ^ A0 ); \ ROUND_BIG8( alpha ); \ - alpha[0] = m512_const1_64( ( (uint64_t)3 << 32 ) \ - ^ ( (uint64_t*)alpha_f )[0] ); \ + alpha[0] = m512_const1_64( ( 3ULL << 32) ^ A0 ); \ ROUND_BIG8( alpha ); \ - alpha[0] = m512_const1_64( ( (uint64_t)4 << 32 ) \ - ^ ( (uint64_t*)alpha_f )[0] ); \ + alpha[0] = m512_const1_64( ( 4ULL << 32) ^ A0 ); \ ROUND_BIG8( alpha ); \ - alpha[0] = m512_const1_64( ( (uint64_t)5 << 32 ) \ - ^ ( (uint64_t*)alpha_f )[0] ); \ + alpha[0] = m512_const1_64( ( 5ULL << 32) ^ A0 ); \ ROUND_BIG8( alpha ); \ - alpha[0] = m512_const1_64( ( (uint64_t)6 << 32 ) \ - ^ ( (uint64_t*)alpha_f )[0] ); \ + alpha[0] = m512_const1_64( ( 6ULL << 32) ^ A0 ); \ ROUND_BIG8( alpha ); \ - alpha[0] = m512_const1_64( ( (uint64_t)7 << 32 ) \ - ^ ( (uint64_t*)alpha_f )[0] ); \ + alpha[0] = m512_const1_64( ( 7ULL << 32) ^ A0 ); \ ROUND_BIG8( alpha ); \ - alpha[0] = m512_const1_64( ( (uint64_t)8 << 32 ) \ - ^ ( (uint64_t*)alpha_f )[0] ); \ + alpha[0] = m512_const1_64( ( 8ULL << 32) ^ A0 ); \ ROUND_BIG8( alpha ); \ - alpha[0] = m512_const1_64( ( (uint64_t)9 << 32 ) \ - ^ ( (uint64_t*)alpha_f )[0] ); \ + alpha[0] = m512_const1_64( ( 9ULL << 32) ^ A0 ); \ ROUND_BIG8( alpha ); \ - alpha[0] = m512_const1_64( ( (uint64_t)10 << 32 ) \ - ^ ( (uint64_t*)alpha_f )[0] ); \ + alpha[0] = m512_const1_64( (10ULL << 32) ^ A0 ); \ ROUND_BIG8( alpha ); \ - alpha[0] = m512_const1_64( ( (uint64_t)11 << 32 ) \ - ^ ( (uint64_t*)alpha_f )[0] ); \ + alpha[0] = m512_const1_64( (11ULL << 32) ^ A0 ); \ ROUND_BIG8( alpha ); \ } while (0) #define T_BIG8 \ do { /* order is important */ \ - c7 = sc->h[ 0x7 ] = _mm512_xor_si512( sc->h[ 0x7 ], sB ); \ - c6 = sc->h[ 0x6 ] = _mm512_xor_si512( sc->h[ 0x6 ], sA ); \ - c5 = sc->h[ 0x5 ] = _mm512_xor_si512( sc->h[ 0x5 ], s9 ); \ - c4 = sc->h[ 0x4 ] = _mm512_xor_si512( sc->h[ 0x4 ], s8 ); \ - c3 = sc->h[ 0x3 ] = _mm512_xor_si512( 
sc->h[ 0x3 ], s3 ); \ - c2 = sc->h[ 0x2 ] = _mm512_xor_si512( sc->h[ 0x2 ], s2 ); \ - c1 = sc->h[ 0x1 ] = _mm512_xor_si512( sc->h[ 0x1 ], s1 ); \ - c0 = sc->h[ 0x0 ] = _mm512_xor_si512( sc->h[ 0x0 ], s0 ); \ + c7 = sc->h[ 7 ] = _mm512_xor_si512( sc->h[ 7 ], sB ); /* c5 */ \ + c6 = sc->h[ 6 ] = _mm512_xor_si512( sc->h[ 6 ], sA ); /* m5 */ \ + c5 = sc->h[ 5 ] = _mm512_xor_si512( sc->h[ 5 ], s9 ); /* c4 */ \ + c4 = sc->h[ 4 ] = _mm512_xor_si512( sc->h[ 4 ], s8 ); /* m4 */ \ + c3 = sc->h[ 3 ] = _mm512_xor_si512( sc->h[ 3 ], s3 ); /* c1 */ \ + c2 = sc->h[ 2 ] = _mm512_xor_si512( sc->h[ 2 ], s2 ); /* m1 */ \ + c1 = sc->h[ 1 ] = _mm512_xor_si512( sc->h[ 1 ], s1 ); /* c0 */ \ + c0 = sc->h[ 0 ] = _mm512_xor_si512( sc->h[ 0 ], s0 ); /* m0 */ \ } while (0) void hamsi_8way_big( hamsi_8way_big_context *sc, __m512i *buf, size_t num ) @@ -861,7 +833,6 @@ void hamsi_8way_big_final( hamsi_8way_big_context *sc, __m512i *buf ) WRITE_STATE_BIG8( sc ); } - void hamsi512_8way_init( hamsi_8way_big_context *sc ) { sc->partial_len = 0; @@ -911,11 +882,12 @@ void hamsi512_8way_close( hamsi_8way_big_context *sc, void *dst ) #define INPUT_BIG \ do { \ __m256i db = *buf; \ + const __m256i zero = m256_zero; \ const uint64_t *tp = (const uint64_t*)T512; \ - m0 = m1 = m2 = m3 = m4 = m5 = m6 = m7 = m256_zero; \ - for ( int u = 0; u < 64; u++ ) \ + m0 = m1 = m2 = m3 = m4 = m5 = m6 = m7 = zero; \ + for ( int u = 63; u >= 0; u-- ) \ { \ - __m256i dm = mm256_negate_64( _mm256_and_si256( db, m256_one_64 ) ); \ + __m256i dm = _mm256_cmpgt_epi64( zero, _mm256_slli_epi64( db, u ) ); \ m0 = _mm256_xor_si256( m0, _mm256_and_si256( dm, \ m256_const1_64( tp[0] ) ) ); \ m1 = _mm256_xor_si256( m1, _mm256_and_si256( dm, \ @@ -933,7 +905,6 @@ do { \ m7 = _mm256_xor_si256( m7, _mm256_and_si256( dm, \ m256_const1_64( tp[7] ) ) ); \ tp += 8; \ - db = _mm256_srli_epi64( db, 1 ); \ } \ } while (0) @@ -982,47 +953,28 @@ do { \ #define READ_STATE_BIG(sc) \ do { \ - c0 = sc->h[0x0]; \ - c1 = sc->h[0x1]; \ - c2 = sc->h[0x2]; \ - c3 = sc->h[0x3]; \ - c4 = sc->h[0x4]; \ - c5 = sc->h[0x5]; \ - c6 = sc->h[0x6]; \ - c7 = sc->h[0x7]; \ + c0 = sc->h[0]; \ + c1 = sc->h[1]; \ + c2 = sc->h[2]; \ + c3 = sc->h[3]; \ + c4 = sc->h[4]; \ + c5 = sc->h[5]; \ + c6 = sc->h[6]; \ + c7 = sc->h[7]; \ } while (0) #define WRITE_STATE_BIG(sc) \ do { \ - sc->h[0x0] = c0; \ - sc->h[0x1] = c1; \ - sc->h[0x2] = c2; \ - sc->h[0x3] = c3; \ - sc->h[0x4] = c4; \ - sc->h[0x5] = c5; \ - sc->h[0x6] = c6; \ - sc->h[0x7] = c7; \ + sc->h[0] = c0; \ + sc->h[1] = c1; \ + sc->h[2] = c2; \ + sc->h[3] = c3; \ + sc->h[4] = c4; \ + sc->h[5] = c5; \ + sc->h[6] = c6; \ + sc->h[7] = c7; \ } while (0) -/* -#define s0 m0 -#define s1 c0 -#define s2 m1 -#define s3 c1 -#define s4 c2 -#define s5 m2 -#define s6 c3 -#define s7 m3 -#define s8 m4 -#define s9 c4 -#define sA m5 -#define sB c5 -#define sC c6 -#define sD m6 -#define sE c7 -#define sF m7 -*/ - #define ROUND_BIG( alpha ) \ do { \ __m256i t0, t1, t2, t3; \ @@ -1048,151 +1000,145 @@ do { \ SBOX( s2, s6, sA, sE ); \ SBOX( s3, s7, sB, sF ); \ \ - t1 = _mm256_blend_epi32( _mm256_bsrli_epi128( s4, 4 ), \ - _mm256_bslli_epi128( s5, 4 ), 0xAA ); \ - t3 = _mm256_blend_epi32( _mm256_bsrli_epi128( sD, 4 ), \ - _mm256_bslli_epi128( sE, 4 ), 0xAA ); \ + s4 = mm256_swap64_32( s4 ); \ + s5 = mm256_swap64_32( s5 ); \ + sD = mm256_swap64_32( sD ); \ + sE = mm256_swap64_32( sE ); \ + t1 = _mm256_blend_epi32( s4, s5, 0xaa ); \ + t3 = _mm256_blend_epi32( sD, sE, 0xaa ); \ L( s0, t1, s9, t3 ); \ - s4 = _mm256_blend_epi32( s4, _mm256_bslli_epi128( t1, 4 ), 0xAA );\ 
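/*
 * A minimal sketch of the bit-expansion trick used by the new INPUT_BIG
 * above.  Left-shifting the 64-bit lanes by u moves bit (63 - u) into the
 * sign position, and a signed compare against zero broadcasts that bit to a
 * full-lane mask, replacing the old and-with-one / negate / shift-right
 * sequence.  The helper name is illustrative; assumes AVX2.
 */
#include <immintrin.h>

static inline __m256i bit_to_mask64( const __m256i db, const int u )
{
    // u runs 63..0 in the loop above, so u == 63 expands bit 0 and
    // u == 0 expands bit 63, preserving the original LSB-first order.
    return _mm256_cmpgt_epi64( _mm256_setzero_si256(),
                               _mm256_slli_epi64( db, u ) );
}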
- s5 = _mm256_blend_epi32( s5, _mm256_bsrli_epi128( t1, 4 ), 0x55 );\ - sD = _mm256_blend_epi32( sD, _mm256_bslli_epi128( t3, 4 ), 0xAA );\ - sE = _mm256_blend_epi32( sE, _mm256_bsrli_epi128( t3, 4 ), 0x55 );\ + s4 = _mm256_blend_epi32( s4, t1, 0x55 ); \ + s5 = _mm256_blend_epi32( s5, t1, 0xaa ); \ + sD = _mm256_blend_epi32( sD, t3, 0x55 ); \ + sE = _mm256_blend_epi32( sE, t3, 0xaa ); \ \ - t1 = _mm256_blend_epi32( _mm256_bsrli_epi128( s5, 4 ), \ - _mm256_bslli_epi128( s6, 4 ), 0xAA ); \ - t3 = _mm256_blend_epi32( _mm256_bsrli_epi128( sE, 4 ), \ - _mm256_bslli_epi128( sF, 4 ), 0xAA ); \ + s6 = mm256_swap64_32( s6 ); \ + sF = mm256_swap64_32( sF ); \ + t1 = _mm256_blend_epi32( s5, s6, 0xaa ); \ + t3 = _mm256_blend_epi32( sE, sF, 0xaa ); \ L( s1, t1, sA, t3 ); \ - s5 = _mm256_blend_epi32( s5, _mm256_bslli_epi128( t1, 4 ), 0xAA );\ - s6 = _mm256_blend_epi32( s6, _mm256_bsrli_epi128( t1, 4 ), 0x55 );\ - sE = _mm256_blend_epi32( sE, _mm256_bslli_epi128( t3, 4 ), 0xAA );\ - sF = _mm256_blend_epi32( sF, _mm256_bsrli_epi128( t3, 4 ), 0x55 );\ + s5 = _mm256_blend_epi32( s5, t1, 0x55 ); \ + s6 = _mm256_blend_epi32( s6, t1, 0xaa ); \ + sE = _mm256_blend_epi32( sE, t3, 0x55 ); \ + sF = _mm256_blend_epi32( sF, t3, 0xaa ); \ \ - t1 = _mm256_blend_epi32( _mm256_bsrli_epi128( s6, 4 ), \ - _mm256_bslli_epi128( s7, 4 ), 0xAA ); \ - t3 = _mm256_blend_epi32( _mm256_bsrli_epi128( sF, 4 ), \ - _mm256_bslli_epi128( sC, 4 ), 0xAA ); \ + s7 = mm256_swap64_32( s7 ); \ + sC = mm256_swap64_32( sC ); \ + t1 = _mm256_blend_epi32( s6, s7, 0xaa ); \ + t3 = _mm256_blend_epi32( sF, sC, 0xaa ); \ L( s2, t1, sB, t3 ); \ - s6 = _mm256_blend_epi32( s6, _mm256_bslli_epi128( t1, 4 ), 0xAA );\ - s7 = _mm256_blend_epi32( s7, _mm256_bsrli_epi128( t1, 4 ), 0x55 );\ - sF = _mm256_blend_epi32( sF, _mm256_bslli_epi128( t3, 4 ), 0xAA );\ - sC = _mm256_blend_epi32( sC, _mm256_bsrli_epi128( t3, 4 ), 0x55 );\ + s6 = _mm256_blend_epi32( s6, t1, 0x55 ); \ + s7 = _mm256_blend_epi32( s7, t1, 0xaa ); \ + sF = _mm256_blend_epi32( sF, t3, 0x55 ); \ + sC = _mm256_blend_epi32( sC, t3, 0xaa ); \ + s6 = mm256_swap64_32( s6 ); \ + sF = mm256_swap64_32( sF ); \ \ - t1 = _mm256_blend_epi32( _mm256_bsrli_epi128( s7, 4 ), \ - _mm256_bslli_epi128( s4, 4 ), 0xAA ); \ - t3 = _mm256_blend_epi32( _mm256_bsrli_epi128( sC, 4 ), \ - _mm256_bslli_epi128( sD, 4 ), 0xAA ); \ + t1 = _mm256_blend_epi32( s7, s4, 0xaa ); \ + t3 = _mm256_blend_epi32( sC, sD, 0xaa ); \ L( s3, t1, s8, t3 ); \ - s7 = _mm256_blend_epi32( s7, _mm256_bslli_epi128( t1, 4 ), 0xAA );\ - s4 = _mm256_blend_epi32( s4, _mm256_bsrli_epi128( t1, 4 ), 0x55 );\ - sC = _mm256_blend_epi32( sC, _mm256_bslli_epi128( t3, 4 ), 0xAA );\ - sD = _mm256_blend_epi32( sD, _mm256_bsrli_epi128( t3, 4 ), 0x55 );\ + s7 = _mm256_blend_epi32( s7, t1, 0x55 ); \ + s4 = _mm256_blend_epi32( s4, t1, 0xaa ); \ + sC = _mm256_blend_epi32( sC, t3, 0x55 ); \ + sD = _mm256_blend_epi32( sD, t3, 0xaa ); \ + s7 = mm256_swap64_32( s7 ); \ + sC = mm256_swap64_32( sC ); \ \ - t0 = _mm256_blend_epi32( s0, _mm256_bslli_epi128( s8, 4 ), 0xAA ); \ - t1 = _mm256_blend_epi32( s1, s9, 0xAA ); \ - t2 = _mm256_blend_epi32( _mm256_bsrli_epi128( s2, 4 ), sA, 0xAA ); \ - t3 = _mm256_blend_epi32( _mm256_bsrli_epi128( s3, 4 ), \ - _mm256_bslli_epi128( sB, 4 ), 0xAA ); \ + t0 = _mm256_blend_epi32( s0, mm256_swap64_32( s8 ), 0xaa ); \ + t1 = _mm256_blend_epi32( s1, s9, 0xaa ); \ + t2 = _mm256_blend_epi32( mm256_swap64_32( s2 ), sA, 0xaa ); \ + t3 = _mm256_blend_epi32( s3, sB, 0x55 ); \ + t3 = mm256_swap64_32( t3 ); \ L( t0, t1, t2, t3 ); \ + t3 = 
mm256_swap64_32( t3 ); \ s0 = _mm256_blend_epi32( s0, t0, 0x55 ); \ - s8 = _mm256_blend_epi32( s8, _mm256_bsrli_epi128( t0, 4 ), 0x55 ); \ + s8 = _mm256_blend_epi32( s8, mm256_swap64_32( t0 ), 0x55 ); \ s1 = _mm256_blend_epi32( s1, t1, 0x55 ); \ - s9 = _mm256_blend_epi32( s9, t1, 0xAA ); \ - s2 = _mm256_blend_epi32( s2, _mm256_bslli_epi128( t2, 4 ), 0xAA ); \ - sA = _mm256_blend_epi32( sA, t2, 0xAA ); \ - s3 = _mm256_blend_epi32( s3, _mm256_bslli_epi128( t3, 4 ), 0xAA ); \ - sB = _mm256_blend_epi32( sB, _mm256_bsrli_epi128( t3, 4 ), 0x55 ); \ + s9 = _mm256_blend_epi32( s9, t1, 0xaa ); \ + s2 = _mm256_blend_epi32( s2, mm256_swap64_32( t2 ), 0xaa ); \ + sA = _mm256_blend_epi32( sA, t2, 0xaa ); \ + s3 = _mm256_blend_epi32( s3, t3, 0xaa ); \ + sB = _mm256_blend_epi32( sB, t3, 0x55 ); \ \ - t0 = _mm256_blend_epi32( _mm256_bsrli_epi128( s4, 4 ), sC, 0xAA ); \ - t1 = _mm256_blend_epi32( _mm256_bsrli_epi128( s5, 4 ), \ - _mm256_bslli_epi128( sD, 4 ), 0xAA ); \ - t2 = _mm256_blend_epi32( s6, _mm256_bslli_epi128( sE, 4 ), 0xAA ); \ - t3 = _mm256_blend_epi32( s7, sF, 0xAA ); \ + t0 = _mm256_blend_epi32( s4, sC, 0xaa ); \ + t1 = _mm256_blend_epi32( s5, sD, 0xaa ); \ + t2 = _mm256_blend_epi32( s6, sE, 0xaa ); \ + t3 = _mm256_blend_epi32( s7, sF, 0xaa ); \ L( t0, t1, t2, t3 ); \ - s4 = _mm256_blend_epi32( s4, _mm256_bslli_epi128( t0, 4 ), 0xAA ); \ - sC = _mm256_blend_epi32( sC, t0, 0xAA ); \ - s5 = _mm256_blend_epi32( s5, _mm256_bslli_epi128( t1, 4 ), 0xAA ); \ - sD = _mm256_blend_epi32( sD, _mm256_bsrli_epi128( t1, 4 ), 0x55 ); \ + s4 = _mm256_blend_epi32( s4, t0, 0x55 ); \ + sC = _mm256_blend_epi32( sC, t0, 0xaa ); \ + s5 = _mm256_blend_epi32( s5, t1, 0x55 ); \ + sD = _mm256_blend_epi32( sD, t1, 0xaa ); \ s6 = _mm256_blend_epi32( s6, t2, 0x55 ); \ - sE = _mm256_blend_epi32( sE, _mm256_bsrli_epi128( t2, 4 ), 0x55 ); \ + sE = _mm256_blend_epi32( sE, t2, 0xaa ); \ s7 = _mm256_blend_epi32( s7, t3, 0x55 ); \ - sF = _mm256_blend_epi32( sF, t3, 0xAA ); \ + sF = _mm256_blend_epi32( sF, t3, 0xaa ); \ + s4 = mm256_swap64_32( s4 ); \ + s5 = mm256_swap64_32( s5 ); \ + sD = mm256_swap64_32( sD ); \ + sE = mm256_swap64_32( sE ); \ } while (0) #define P_BIG \ do { \ __m256i alpha[16]; \ + const uint64_t A0 = ( (uint64_t*)alpha_n )[0]; \ for( int i = 0; i < 16; i++ ) \ alpha[i] = m256_const1_64( ( (uint64_t*)alpha_n )[i] ); \ ROUND_BIG( alpha ); \ - alpha[0] = m256_const1_64( ( (uint64_t)1 << 32 ) \ - ^ ( (uint64_t*)alpha_n )[0] ); \ + alpha[0] = m256_const1_64( (1ULL << 32) ^ A0 ); \ ROUND_BIG( alpha ); \ - alpha[0] = m256_const1_64( ( (uint64_t)2 << 32 ) \ - ^ ( (uint64_t*)alpha_n )[0] ); \ + alpha[0] = m256_const1_64( (2ULL << 32) ^ A0 ); \ ROUND_BIG( alpha ); \ - alpha[0] = m256_const1_64( ( (uint64_t)3 << 32 ) \ - ^ ( (uint64_t*)alpha_n )[0] ); \ + alpha[0] = m256_const1_64( (3ULL << 32) ^ A0 ); \ ROUND_BIG( alpha ); \ - alpha[0] = m256_const1_64( ( (uint64_t)4 << 32 ) \ - ^ ( (uint64_t*)alpha_n )[0] ); \ + alpha[0] = m256_const1_64( (4ULL << 32) ^ A0 ); \ ROUND_BIG( alpha ); \ - alpha[0] = m256_const1_64( ( (uint64_t)5 << 32 ) \ - ^ ( (uint64_t*)alpha_n )[0] ); \ + alpha[0] = m256_const1_64( (5ULL << 32) ^ A0 ); \ ROUND_BIG( alpha ); \ } while (0) #define PF_BIG \ do { \ __m256i alpha[16]; \ + const uint64_t A0 = ( (uint64_t*)alpha_f )[0]; \ for( int i = 0; i < 16; i++ ) \ alpha[i] = m256_const1_64( ( (uint64_t*)alpha_f )[i] ); \ ROUND_BIG( alpha ); \ - alpha[0] = m256_const1_64( ( (uint64_t)1 << 32 ) \ - ^ ( (uint64_t*)alpha_f )[0] ); \ + alpha[0] = m256_const1_64( ( 1ULL << 32) ^ A0 ); \ ROUND_BIG( alpha ); 
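/*
 * A minimal sketch of the data movement behind the ROUND_BIG8 / ROUND_BIG
 * rewrites above.  mm256_swap64_32 is assumed to exchange the two 32-bit
 * halves of every 64-bit lane (a dword shuffle); one such swap feeding a
 * constant-mask blend produces the same half-lane interleave that the
 * removed bslli/bsrli-by-4 plus blend pairs computed, and the swapped value
 * can be reused across several blends.  Helper names are illustrative.
 */
#include <immintrin.h>

static inline __m256i swap64_32_sketch( __m256i v )
{
    return _mm256_shuffle_epi32( v, 0xb1 );   // 0xb1: dword order 1,0,3,2
}

// Odd dwords (mask 0xaa) come from b, even dwords from a.
static inline __m256i blend_odd_even( __m256i a, __m256i b )
{
    return _mm256_blend_epi32( a, b, 0xaa );
}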
\ - alpha[0] = m256_const1_64( ( (uint64_t)2 << 32 ) \ - ^ ( (uint64_t*)alpha_f )[0] ); \ + alpha[0] = m256_const1_64( ( 2ULL << 32) ^ A0 ); \ ROUND_BIG( alpha ); \ - alpha[0] = m256_const1_64( ( (uint64_t)3 << 32 ) \ - ^ ( (uint64_t*)alpha_f )[0] ); \ + alpha[0] = m256_const1_64( ( 3ULL << 32) ^ A0 ); \ ROUND_BIG( alpha ); \ - alpha[0] = m256_const1_64( ( (uint64_t)4 << 32 ) \ - ^ ( (uint64_t*)alpha_f )[0] ); \ + alpha[0] = m256_const1_64( ( 4ULL << 32) ^ A0 ); \ ROUND_BIG( alpha ); \ - alpha[0] = m256_const1_64( ( (uint64_t)5 << 32 ) \ - ^ ( (uint64_t*)alpha_f )[0] ); \ + alpha[0] = m256_const1_64( ( 5ULL << 32) ^ A0 ); \ ROUND_BIG( alpha ); \ - alpha[0] = m256_const1_64( ( (uint64_t)6 << 32 ) \ - ^ ( (uint64_t*)alpha_f )[0] ); \ + alpha[0] = m256_const1_64( ( 6ULL << 32) ^ A0 ); \ ROUND_BIG( alpha ); \ - alpha[0] = m256_const1_64( ( (uint64_t)7 << 32 ) \ - ^ ( (uint64_t*)alpha_f )[0] ); \ + alpha[0] = m256_const1_64( ( 7ULL << 32) ^ A0 ); \ ROUND_BIG( alpha ); \ - alpha[0] = m256_const1_64( ( (uint64_t)8 << 32 ) \ - ^ ( (uint64_t*)alpha_f )[0] ); \ + alpha[0] = m256_const1_64( ( 8ULL << 32) ^ A0 ); \ ROUND_BIG( alpha ); \ - alpha[0] = m256_const1_64( ( (uint64_t)9 << 32 ) \ - ^ ( (uint64_t*)alpha_f )[0] ); \ + alpha[0] = m256_const1_64( ( 9ULL << 32) ^ A0 ); \ ROUND_BIG( alpha ); \ - alpha[0] = m256_const1_64( ( (uint64_t)10 << 32 ) \ - ^ ( (uint64_t*)alpha_f )[0] ); \ + alpha[0] = m256_const1_64( (10ULL << 32) ^ A0 ); \ ROUND_BIG( alpha ); \ - alpha[0] = m256_const1_64( ( (uint64_t)11 << 32 ) \ - ^ ( (uint64_t*)alpha_f )[0] ); \ + alpha[0] = m256_const1_64( (11ULL << 32) ^ A0 ); \ ROUND_BIG( alpha ); \ } while (0) #define T_BIG \ do { /* order is important */ \ - c7 = sc->h[ 0x7 ] = _mm256_xor_si256( sc->h[ 0x7 ], sB ); \ - c6 = sc->h[ 0x6 ] = _mm256_xor_si256( sc->h[ 0x6 ], sA ); \ - c5 = sc->h[ 0x5 ] = _mm256_xor_si256( sc->h[ 0x5 ], s9 ); \ - c4 = sc->h[ 0x4 ] = _mm256_xor_si256( sc->h[ 0x4 ], s8 ); \ - c3 = sc->h[ 0x3 ] = _mm256_xor_si256( sc->h[ 0x3 ], s3 ); \ - c2 = sc->h[ 0x2 ] = _mm256_xor_si256( sc->h[ 0x2 ], s2 ); \ - c1 = sc->h[ 0x1 ] = _mm256_xor_si256( sc->h[ 0x1 ], s1 ); \ - c0 = sc->h[ 0x0 ] = _mm256_xor_si256( sc->h[ 0x0 ], s0 ); \ + c7 = sc->h[ 7 ] = _mm256_xor_si256( sc->h[ 7 ], sB ); \ + c6 = sc->h[ 6 ] = _mm256_xor_si256( sc->h[ 6 ], sA ); \ + c5 = sc->h[ 5 ] = _mm256_xor_si256( sc->h[ 5 ], s9 ); \ + c4 = sc->h[ 4 ] = _mm256_xor_si256( sc->h[ 4 ], s8 ); \ + c3 = sc->h[ 3 ] = _mm256_xor_si256( sc->h[ 3 ], s3 ); \ + c2 = sc->h[ 2 ] = _mm256_xor_si256( sc->h[ 2 ], s2 ); \ + c1 = sc->h[ 1 ] = _mm256_xor_si256( sc->h[ 1 ], s1 ); \ + c0 = sc->h[ 0 ] = _mm256_xor_si256( sc->h[ 0 ], s0 ); \ } while (0) void hamsi_big( hamsi_4way_big_context *sc, __m256i *buf, size_t num ) diff --git a/algo/keccak/keccak-hash-4way.c b/algo/keccak/keccak-hash-4way.c index af37d6f6..458201cb 100644 --- a/algo/keccak/keccak-hash-4way.c +++ b/algo/keccak/keccak-hash-4way.c @@ -53,7 +53,8 @@ static const uint64_t RC[] = { #define WRITE_STATE(sc) #define MOV64(d, s) (d = s) -#define XOR64_IOTA XOR64 +#define XOR64_IOTA XOR + #define LPAR ( #define RPAR ) @@ -71,14 +72,15 @@ static const uint64_t RC[] = { // Targetted macros, keccak-macros.h is included for each target. 
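/*
 * A minimal sketch of the XOR3 primitive the per-target macros below map
 * onto.  With AVX-512 a three-input XOR collapses into one ternary-logic
 * instruction (truth table 0x96 == a ^ b ^ c); the AVX2 form is assumed to
 * fall back to two ordinary XORs.  Function names are illustrative.
 */
#include <immintrin.h>

static inline __m512i xor3_512_sketch( __m512i a, __m512i b, __m512i c )
{
    return _mm512_ternarylogic_epi64( a, b, c, 0x96 );
}

static inline __m256i xor3_256_sketch( __m256i a, __m256i b, __m256i c )
{
    return _mm256_xor_si256( a, _mm256_xor_si256( b, c ) );
}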
#define DECL64(x) __m512i x -#define XOR64(d, a, b) (d = _mm512_xor_si512(a,b)) +#define XOR(d, a, b) (d = _mm512_xor_si512(a,b)) +#define XOR64 XOR #define AND64(d, a, b) (d = _mm512_and_si512(a,b)) #define OR64(d, a, b) (d = _mm512_or_si512(a,b)) #define NOT64(d, s) (d = _mm512_xor_si512(s,m512_neg1)) #define ROL64(d, v, n) (d = mm512_rol_64(v, n)) #define XOROR(d, a, b, c) (d = mm512_xoror(a, b, c)) #define XORAND(d, a, b, c) (d = mm512_xorand(a, b, c)) - +#define XOR3( d, a, b, c ) (d = mm512_xor3( a, b, c )) #include "keccak-macros.c" @@ -236,6 +238,7 @@ keccak512_8way_close(void *cc, void *dst) #undef INPUT_BUF #undef DECL64 #undef XOR64 +#undef XOR #undef AND64 #undef OR64 #undef NOT64 @@ -243,7 +246,7 @@ keccak512_8way_close(void *cc, void *dst) #undef KECCAK_F_1600 #undef XOROR #undef XORAND - +#undef XOR3 #endif // AVX512 // AVX2 @@ -255,13 +258,15 @@ keccak512_8way_close(void *cc, void *dst) } while (0) #define DECL64(x) __m256i x -#define XOR64(d, a, b) (d = _mm256_xor_si256(a,b)) +#define XOR(d, a, b) (d = _mm256_xor_si256(a,b)) +#define XOR64 XOR #define AND64(d, a, b) (d = _mm256_and_si256(a,b)) #define OR64(d, a, b) (d = _mm256_or_si256(a,b)) #define NOT64(d, s) (d = _mm256_xor_si256(s,m256_neg1)) #define ROL64(d, v, n) (d = mm256_rol_64(v, n)) #define XOROR(d, a, b, c) (d = _mm256_xor_si256(a, _mm256_or_si256(b, c))) #define XORAND(d, a, b, c) (d = _mm256_xor_si256(a, _mm256_and_si256(b, c))) +#define XOR3( d, a, b, c ) (d = mm256_xor3( a, b, c )) #include "keccak-macros.c" @@ -421,6 +426,7 @@ keccak512_4way_close(void *cc, void *dst) #undef INPUT_BUF #undef DECL64 #undef XOR64 +#undef XOR #undef AND64 #undef OR64 #undef NOT64 @@ -428,5 +434,6 @@ keccak512_4way_close(void *cc, void *dst) #undef KECCAK_F_1600 #undef XOROR #undef XORAND +#undef XOR3 #endif // AVX2 diff --git a/algo/keccak/keccak-macros.c b/algo/keccak/keccak-macros.c index 436d1ca3..6b7776d3 100644 --- a/algo/keccak/keccak-macros.c +++ b/algo/keccak/keccak-macros.c @@ -1,6 +1,19 @@ #ifdef TH_ELT #undef TH_ELT #endif + +#define TH_ELT(t, c0, c1, c2, c3, c4, d0, d1, d2, d3, d4) do { \ + DECL64(tt0); \ + DECL64(tt1); \ + XOR3( tt0, d0, d1, d4 ); \ + XOR( tt1, d2, d3 ); \ + XOR( tt0, tt0, tt1 ); \ + ROL64( tt0, tt0, 1 ); \ + XOR3( tt1, c0, c1, c4 ); \ + XOR3( tt0, tt0, c2, c3 ); \ + XOR( t, tt0, tt1 ); \ +} while (0) +/* #define TH_ELT(t, c0, c1, c2, c3, c4, d0, d1, d2, d3, d4) do { \ DECL64(tt0); \ DECL64(tt1); \ @@ -17,7 +30,7 @@ XOR64(tt2, tt2, tt3); \ XOR64(t, tt0, tt2); \ } while (0) - +*/ #ifdef THETA #undef THETA #endif diff --git a/algo/scrypt/scrypt.c b/algo/scrypt/scrypt.c index 5557ca33..c36411bd 100644 --- a/algo/scrypt/scrypt.c +++ b/algo/scrypt/scrypt.c @@ -34,6 +34,7 @@ #include "algo/sha/sha-hash-4way.h" #include "algo/sha/sha256-hash.h" #include +#include "malloc-huge.h" static const uint32_t keypad[12] = { 0x80000000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x00000280 @@ -1487,11 +1488,19 @@ extern int scanhash_scrypt( struct work *work, uint32_t max_nonce, bool scrypt_miner_thread_init( int thr_id ) { - scratchbuf = _mm_malloc( scratchbuf_size, 128 ); + scratchbuf = malloc_hugepages( scratchbuf_size ); if ( scratchbuf ) - return true; + { + if ( opt_debug ) + applog( LOG_NOTICE, "Thread %u is using huge pages", thr_id ); + } + else + scratchbuf = _mm_malloc( scratchbuf_size, 128 ); + + if ( scratchbuf ) return true; + applog( LOG_ERR, "Thread %u: Scrypt buffer allocation failed", thr_id ); - return false; + return false; } bool register_scrypt_algo( algo_gate_t* gate ) diff --git 
a/algo/shabal/shabal-hash-4way.c b/algo/shabal/shabal-hash-4way.c index 8225595b..06116ff7 100644 --- a/algo/shabal/shabal-hash-4way.c +++ b/algo/shabal/shabal-hash-4way.c @@ -62,8 +62,8 @@ extern "C"{ #if defined(__AVX2__) #define DECL_STATE8 \ - __m256i A00, A01, A02, A03, A04, A05, A06, A07, \ - A08, A09, A0A, A0B; \ + __m256i A0, A1, A2, A3, A4, A5, A6, A7, \ + A8, A9, AA, AB; \ __m256i B0, B1, B2, B3, B4, B5, B6, B7, \ B8, B9, BA, BB, BC, BD, BE, BF; \ __m256i C0, C1, C2, C3, C4, C5, C6, C7, \ @@ -78,18 +78,18 @@ extern "C"{ { \ if ( (state)->state_loaded ) \ { \ - A00 = (state)->A[0]; \ - A01 = (state)->A[1]; \ - A02 = (state)->A[2]; \ - A03 = (state)->A[3]; \ - A04 = (state)->A[4]; \ - A05 = (state)->A[5]; \ - A06 = (state)->A[6]; \ - A07 = (state)->A[7]; \ - A08 = (state)->A[8]; \ - A09 = (state)->A[9]; \ - A0A = (state)->A[10]; \ - A0B = (state)->A[11]; \ + A0 = (state)->A[0]; \ + A1 = (state)->A[1]; \ + A2 = (state)->A[2]; \ + A3 = (state)->A[3]; \ + A4 = (state)->A[4]; \ + A5 = (state)->A[5]; \ + A6 = (state)->A[6]; \ + A7 = (state)->A[7]; \ + A8 = (state)->A[8]; \ + A9 = (state)->A[9]; \ + AA = (state)->A[10]; \ + AB = (state)->A[11]; \ B0 = (state)->B[0]; \ B1 = (state)->B[1]; \ B2 = (state)->B[2]; \ @@ -126,18 +126,18 @@ extern "C"{ else \ { \ (state)->state_loaded = true; \ - A00 = m256_const1_64( 0x20728DFD20728DFD ); \ - A01 = m256_const1_64( 0x46C0BD5346C0BD53 ); \ - A02 = m256_const1_64( 0xE782B699E782B699 ); \ - A03 = m256_const1_64( 0x5530463255304632 ); \ - A04 = m256_const1_64( 0x71B4EF9071B4EF90 ); \ - A05 = m256_const1_64( 0x0EA9E82C0EA9E82C ); \ - A06 = m256_const1_64( 0xDBB930F1DBB930F1 ); \ - A07 = m256_const1_64( 0xFAD06B8BFAD06B8B ); \ - A08 = m256_const1_64( 0xBE0CAE40BE0CAE40 ); \ - A09 = m256_const1_64( 0x8BD144108BD14410 ); \ - A0A = m256_const1_64( 0x76D2ADAC76D2ADAC ); \ - A0B = m256_const1_64( 0x28ACAB7F28ACAB7F ); \ + A0 = m256_const1_64( 0x20728DFD20728DFD ); \ + A1 = m256_const1_64( 0x46C0BD5346C0BD53 ); \ + A2 = m256_const1_64( 0xE782B699E782B699 ); \ + A3 = m256_const1_64( 0x5530463255304632 ); \ + A4 = m256_const1_64( 0x71B4EF9071B4EF90 ); \ + A5 = m256_const1_64( 0x0EA9E82C0EA9E82C ); \ + A6 = m256_const1_64( 0xDBB930F1DBB930F1 ); \ + A7 = m256_const1_64( 0xFAD06B8BFAD06B8B ); \ + A8 = m256_const1_64( 0xBE0CAE40BE0CAE40 ); \ + A9 = m256_const1_64( 0x8BD144108BD14410 ); \ + AA = m256_const1_64( 0x76D2ADAC76D2ADAC ); \ + AB = m256_const1_64( 0x28ACAB7F28ACAB7F ); \ B0 = m256_const1_64( 0xC1099CB7C1099CB7 ); \ B1 = m256_const1_64( 0x07B385F307B385F3 ); \ B2 = m256_const1_64( 0xE7442C26E7442C26 ); \ @@ -176,18 +176,18 @@ extern "C"{ } while (0) #define WRITE_STATE8(state) do { \ - (state)->A[0] = A00; \ - (state)->A[1] = A01; \ - (state)->A[2] = A02; \ - (state)->A[3] = A03; \ - (state)->A[4] = A04; \ - (state)->A[5] = A05; \ - (state)->A[6] = A06; \ - (state)->A[7] = A07; \ - (state)->A[8] = A08; \ - (state)->A[9] = A09; \ - (state)->A[10] = A0A; \ - (state)->A[11] = A0B; \ + (state)->A[0] = A0; \ + (state)->A[1] = A1; \ + (state)->A[2] = A2; \ + (state)->A[3] = A3; \ + (state)->A[4] = A4; \ + (state)->A[5] = A5; \ + (state)->A[6] = A6; \ + (state)->A[7] = A7; \ + (state)->A[8] = A8; \ + (state)->A[9] = A9; \ + (state)->A[10] = AA; \ + (state)->A[11] = AB; \ (state)->B[0] = B0; \ (state)->B[1] = B1; \ (state)->B[2] = B2; \ @@ -286,8 +286,8 @@ do { \ #define XOR_W8 \ do { \ - A00 = _mm256_xor_si256( A00, _mm256_set1_epi32( Wlow ) ); \ - A01 = _mm256_xor_si256( A01, _mm256_set1_epi32( Whigh ) ); \ + A0 = _mm256_xor_si256( A0, _mm256_set1_epi32( Wlow ) 
); \ + A1 = _mm256_xor_si256( A1, _mm256_set1_epi32( Whigh ) ); \ } while (0) #define SWAP_BC8 \ @@ -321,60 +321,60 @@ do { \ } while (0) #define PERM_STEP_0_8 do { \ - PERM_ELT8(A00, A0B, B0, BD, B9, B6, C8, M0); \ - PERM_ELT8(A01, A00, B1, BE, BA, B7, C7, M1); \ - PERM_ELT8(A02, A01, B2, BF, BB, B8, C6, M2); \ - PERM_ELT8(A03, A02, B3, B0, BC, B9, C5, M3); \ - PERM_ELT8(A04, A03, B4, B1, BD, BA, C4, M4); \ - PERM_ELT8(A05, A04, B5, B2, BE, BB, C3, M5); \ - PERM_ELT8(A06, A05, B6, B3, BF, BC, C2, M6); \ - PERM_ELT8(A07, A06, B7, B4, B0, BD, C1, M7); \ - PERM_ELT8(A08, A07, B8, B5, B1, BE, C0, M8); \ - PERM_ELT8(A09, A08, B9, B6, B2, BF, CF, M9); \ - PERM_ELT8(A0A, A09, BA, B7, B3, B0, CE, MA); \ - PERM_ELT8(A0B, A0A, BB, B8, B4, B1, CD, MB); \ - PERM_ELT8(A00, A0B, BC, B9, B5, B2, CC, MC); \ - PERM_ELT8(A01, A00, BD, BA, B6, B3, CB, MD); \ - PERM_ELT8(A02, A01, BE, BB, B7, B4, CA, ME); \ - PERM_ELT8(A03, A02, BF, BC, B8, B5, C9, MF); \ + PERM_ELT8(A0, AB, B0, BD, B9, B6, C8, M0); \ + PERM_ELT8(A1, A0, B1, BE, BA, B7, C7, M1); \ + PERM_ELT8(A2, A1, B2, BF, BB, B8, C6, M2); \ + PERM_ELT8(A3, A2, B3, B0, BC, B9, C5, M3); \ + PERM_ELT8(A4, A3, B4, B1, BD, BA, C4, M4); \ + PERM_ELT8(A5, A4, B5, B2, BE, BB, C3, M5); \ + PERM_ELT8(A6, A5, B6, B3, BF, BC, C2, M6); \ + PERM_ELT8(A7, A6, B7, B4, B0, BD, C1, M7); \ + PERM_ELT8(A8, A7, B8, B5, B1, BE, C0, M8); \ + PERM_ELT8(A9, A8, B9, B6, B2, BF, CF, M9); \ + PERM_ELT8(AA, A9, BA, B7, B3, B0, CE, MA); \ + PERM_ELT8(AB, AA, BB, B8, B4, B1, CD, MB); \ + PERM_ELT8(A0, AB, BC, B9, B5, B2, CC, MC); \ + PERM_ELT8(A1, A0, BD, BA, B6, B3, CB, MD); \ + PERM_ELT8(A2, A1, BE, BB, B7, B4, CA, ME); \ + PERM_ELT8(A3, A2, BF, BC, B8, B5, C9, MF); \ } while (0) #define PERM_STEP_1_8 do { \ - PERM_ELT8(A04, A03, B0, BD, B9, B6, C8, M0); \ - PERM_ELT8(A05, A04, B1, BE, BA, B7, C7, M1); \ - PERM_ELT8(A06, A05, B2, BF, BB, B8, C6, M2); \ - PERM_ELT8(A07, A06, B3, B0, BC, B9, C5, M3); \ - PERM_ELT8(A08, A07, B4, B1, BD, BA, C4, M4); \ - PERM_ELT8(A09, A08, B5, B2, BE, BB, C3, M5); \ - PERM_ELT8(A0A, A09, B6, B3, BF, BC, C2, M6); \ - PERM_ELT8(A0B, A0A, B7, B4, B0, BD, C1, M7); \ - PERM_ELT8(A00, A0B, B8, B5, B1, BE, C0, M8); \ - PERM_ELT8(A01, A00, B9, B6, B2, BF, CF, M9); \ - PERM_ELT8(A02, A01, BA, B7, B3, B0, CE, MA); \ - PERM_ELT8(A03, A02, BB, B8, B4, B1, CD, MB); \ - PERM_ELT8(A04, A03, BC, B9, B5, B2, CC, MC); \ - PERM_ELT8(A05, A04, BD, BA, B6, B3, CB, MD); \ - PERM_ELT8(A06, A05, BE, BB, B7, B4, CA, ME); \ - PERM_ELT8(A07, A06, BF, BC, B8, B5, C9, MF); \ + PERM_ELT8(A4, A3, B0, BD, B9, B6, C8, M0); \ + PERM_ELT8(A5, A4, B1, BE, BA, B7, C7, M1); \ + PERM_ELT8(A6, A5, B2, BF, BB, B8, C6, M2); \ + PERM_ELT8(A7, A6, B3, B0, BC, B9, C5, M3); \ + PERM_ELT8(A8, A7, B4, B1, BD, BA, C4, M4); \ + PERM_ELT8(A9, A8, B5, B2, BE, BB, C3, M5); \ + PERM_ELT8(AA, A9, B6, B3, BF, BC, C2, M6); \ + PERM_ELT8(AB, AA, B7, B4, B0, BD, C1, M7); \ + PERM_ELT8(A0, AB, B8, B5, B1, BE, C0, M8); \ + PERM_ELT8(A1, A0, B9, B6, B2, BF, CF, M9); \ + PERM_ELT8(A2, A1, BA, B7, B3, B0, CE, MA); \ + PERM_ELT8(A3, A2, BB, B8, B4, B1, CD, MB); \ + PERM_ELT8(A4, A3, BC, B9, B5, B2, CC, MC); \ + PERM_ELT8(A5, A4, BD, BA, B6, B3, CB, MD); \ + PERM_ELT8(A6, A5, BE, BB, B7, B4, CA, ME); \ + PERM_ELT8(A7, A6, BF, BC, B8, B5, C9, MF); \ } while (0) #define PERM_STEP_2_8 do { \ - PERM_ELT8(A08, A07, B0, BD, B9, B6, C8, M0); \ - PERM_ELT8(A09, A08, B1, BE, BA, B7, C7, M1); \ - PERM_ELT8(A0A, A09, B2, BF, BB, B8, C6, M2); \ - PERM_ELT8(A0B, A0A, B3, B0, BC, B9, C5, M3); \ - PERM_ELT8(A00, A0B, B4, B1, BD, BA, 
C4, M4); \ - PERM_ELT8(A01, A00, B5, B2, BE, BB, C3, M5); \ - PERM_ELT8(A02, A01, B6, B3, BF, BC, C2, M6); \ - PERM_ELT8(A03, A02, B7, B4, B0, BD, C1, M7); \ - PERM_ELT8(A04, A03, B8, B5, B1, BE, C0, M8); \ - PERM_ELT8(A05, A04, B9, B6, B2, BF, CF, M9); \ - PERM_ELT8(A06, A05, BA, B7, B3, B0, CE, MA); \ - PERM_ELT8(A07, A06, BB, B8, B4, B1, CD, MB); \ - PERM_ELT8(A08, A07, BC, B9, B5, B2, CC, MC); \ - PERM_ELT8(A09, A08, BD, BA, B6, B3, CB, MD); \ - PERM_ELT8(A0A, A09, BE, BB, B7, B4, CA, ME); \ - PERM_ELT8(A0B, A0A, BF, BC, B8, B5, C9, MF); \ + PERM_ELT8(A8, A7, B0, BD, B9, B6, C8, M0); \ + PERM_ELT8(A9, A8, B1, BE, BA, B7, C7, M1); \ + PERM_ELT8(AA, A9, B2, BF, BB, B8, C6, M2); \ + PERM_ELT8(AB, AA, B3, B0, BC, B9, C5, M3); \ + PERM_ELT8(A0, AB, B4, B1, BD, BA, C4, M4); \ + PERM_ELT8(A1, A0, B5, B2, BE, BB, C3, M5); \ + PERM_ELT8(A2, A1, B6, B3, BF, BC, C2, M6); \ + PERM_ELT8(A3, A2, B7, B4, B0, BD, C1, M7); \ + PERM_ELT8(A4, A3, B8, B5, B1, BE, C0, M8); \ + PERM_ELT8(A5, A4, B9, B6, B2, BF, CF, M9); \ + PERM_ELT8(A6, A5, BA, B7, B3, B0, CE, MA); \ + PERM_ELT8(A7, A6, BB, B8, B4, B1, CD, MB); \ + PERM_ELT8(A8, A7, BC, B9, B5, B2, CC, MC); \ + PERM_ELT8(A9, A8, BD, BA, B6, B3, CB, MD); \ + PERM_ELT8(AA, A9, BE, BB, B7, B4, CA, ME); \ + PERM_ELT8(AB, AA, BF, BC, B8, B5, C9, MF); \ } while (0) #define APPLY_P8 \ @@ -398,42 +398,42 @@ do { \ PERM_STEP_0_8; \ PERM_STEP_1_8; \ PERM_STEP_2_8; \ - A0B = _mm256_add_epi32( A0B, C6 ); \ - A0A = _mm256_add_epi32( A0A, C5 ); \ - A09 = _mm256_add_epi32( A09, C4 ); \ - A08 = _mm256_add_epi32( A08, C3 ); \ - A07 = _mm256_add_epi32( A07, C2 ); \ - A06 = _mm256_add_epi32( A06, C1 ); \ - A05 = _mm256_add_epi32( A05, C0 ); \ - A04 = _mm256_add_epi32( A04, CF ); \ - A03 = _mm256_add_epi32( A03, CE ); \ - A02 = _mm256_add_epi32( A02, CD ); \ - A01 = _mm256_add_epi32( A01, CC ); \ - A00 = _mm256_add_epi32( A00, CB ); \ - A0B = _mm256_add_epi32( A0B, CA ); \ - A0A = _mm256_add_epi32( A0A, C9 ); \ - A09 = _mm256_add_epi32( A09, C8 ); \ - A08 = _mm256_add_epi32( A08, C7 ); \ - A07 = _mm256_add_epi32( A07, C6 ); \ - A06 = _mm256_add_epi32( A06, C5 ); \ - A05 = _mm256_add_epi32( A05, C4 ); \ - A04 = _mm256_add_epi32( A04, C3 ); \ - A03 = _mm256_add_epi32( A03, C2 ); \ - A02 = _mm256_add_epi32( A02, C1 ); \ - A01 = _mm256_add_epi32( A01, C0 ); \ - A00 = _mm256_add_epi32( A00, CF ); \ - A0B = _mm256_add_epi32( A0B, CE ); \ - A0A = _mm256_add_epi32( A0A, CD ); \ - A09 = _mm256_add_epi32( A09, CC ); \ - A08 = _mm256_add_epi32( A08, CB ); \ - A07 = _mm256_add_epi32( A07, CA ); \ - A06 = _mm256_add_epi32( A06, C9 ); \ - A05 = _mm256_add_epi32( A05, C8 ); \ - A04 = _mm256_add_epi32( A04, C7 ); \ - A03 = _mm256_add_epi32( A03, C6 ); \ - A02 = _mm256_add_epi32( A02, C5 ); \ - A01 = _mm256_add_epi32( A01, C4 ); \ - A00 = _mm256_add_epi32( A00, C3 ); \ + AB = _mm256_add_epi32( AB, C6 ); \ + AA = _mm256_add_epi32( AA, C5 ); \ + A9 = _mm256_add_epi32( A9, C4 ); \ + A8 = _mm256_add_epi32( A8, C3 ); \ + A7 = _mm256_add_epi32( A7, C2 ); \ + A6 = _mm256_add_epi32( A6, C1 ); \ + A5 = _mm256_add_epi32( A5, C0 ); \ + A4 = _mm256_add_epi32( A4, CF ); \ + A3 = _mm256_add_epi32( A3, CE ); \ + A2 = _mm256_add_epi32( A2, CD ); \ + A1 = _mm256_add_epi32( A1, CC ); \ + A0 = _mm256_add_epi32( A0, CB ); \ + AB = _mm256_add_epi32( AB, CA ); \ + AA = _mm256_add_epi32( AA, C9 ); \ + A9 = _mm256_add_epi32( A9, C8 ); \ + A8 = _mm256_add_epi32( A8, C7 ); \ + A7 = _mm256_add_epi32( A7, C6 ); \ + A6 = _mm256_add_epi32( A6, C5 ); \ + A5 = _mm256_add_epi32( A5, C4 ); \ + A4 = _mm256_add_epi32( A4, C3 ); 
\ + A3 = _mm256_add_epi32( A3, C2 ); \ + A2 = _mm256_add_epi32( A2, C1 ); \ + A1 = _mm256_add_epi32( A1, C0 ); \ + A0 = _mm256_add_epi32( A0, CF ); \ + AB = _mm256_add_epi32( AB, CE ); \ + AA = _mm256_add_epi32( AA, CD ); \ + A9 = _mm256_add_epi32( A9, CC ); \ + A8 = _mm256_add_epi32( A8, CB ); \ + A7 = _mm256_add_epi32( A7, CA ); \ + A6 = _mm256_add_epi32( A6, C9 ); \ + A5 = _mm256_add_epi32( A5, C8 ); \ + A4 = _mm256_add_epi32( A4, C7 ); \ + A3 = _mm256_add_epi32( A3, C6 ); \ + A2 = _mm256_add_epi32( A2, C5 ); \ + A1 = _mm256_add_epi32( A1, C4 ); \ + A0 = _mm256_add_epi32( A0, C3 ); \ } while (0) #define INCR_W8 do { \ @@ -660,8 +660,8 @@ shabal512_8way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) #define DECL_STATE \ - __m128i A00, A01, A02, A03, A04, A05, A06, A07, \ - A08, A09, A0A, A0B; \ + __m128i A0, A1, A2, A3, A4, A5, A6, A7, \ + A8, A9, AA, AB; \ __m128i B0, B1, B2, B3, B4, B5, B6, B7, \ B8, B9, BA, BB, BC, BD, BE, BF; \ __m128i C0, C1, C2, C3, C4, C5, C6, C7, \ @@ -676,18 +676,18 @@ shabal512_8way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) { \ if ( (state)->state_loaded ) \ { \ - A00 = (state)->A[0]; \ - A01 = (state)->A[1]; \ - A02 = (state)->A[2]; \ - A03 = (state)->A[3]; \ - A04 = (state)->A[4]; \ - A05 = (state)->A[5]; \ - A06 = (state)->A[6]; \ - A07 = (state)->A[7]; \ - A08 = (state)->A[8]; \ - A09 = (state)->A[9]; \ - A0A = (state)->A[10]; \ - A0B = (state)->A[11]; \ + A0 = (state)->A[0]; \ + A1 = (state)->A[1]; \ + A2 = (state)->A[2]; \ + A3 = (state)->A[3]; \ + A4 = (state)->A[4]; \ + A5 = (state)->A[5]; \ + A6 = (state)->A[6]; \ + A7 = (state)->A[7]; \ + A8 = (state)->A[8]; \ + A9 = (state)->A[9]; \ + AA = (state)->A[10]; \ + AB = (state)->A[11]; \ B0 = (state)->B[0]; \ B1 = (state)->B[1]; \ B2 = (state)->B[2]; \ @@ -724,18 +724,18 @@ shabal512_8way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) else \ { \ (state)->state_loaded = true; \ - A00 = m128_const1_64( 0x20728DFD20728DFD ); \ - A01 = m128_const1_64( 0x46C0BD5346C0BD53 ); \ - A02 = m128_const1_64( 0xE782B699E782B699 ); \ - A03 = m128_const1_64( 0x5530463255304632 ); \ - A04 = m128_const1_64( 0x71B4EF9071B4EF90 ); \ - A05 = m128_const1_64( 0x0EA9E82C0EA9E82C ); \ - A06 = m128_const1_64( 0xDBB930F1DBB930F1 ); \ - A07 = m128_const1_64( 0xFAD06B8BFAD06B8B ); \ - A08 = m128_const1_64( 0xBE0CAE40BE0CAE40 ); \ - A09 = m128_const1_64( 0x8BD144108BD14410 ); \ - A0A = m128_const1_64( 0x76D2ADAC76D2ADAC ); \ - A0B = m128_const1_64( 0x28ACAB7F28ACAB7F ); \ + A0 = m128_const1_64( 0x20728DFD20728DFD ); \ + A1 = m128_const1_64( 0x46C0BD5346C0BD53 ); \ + A2 = m128_const1_64( 0xE782B699E782B699 ); \ + A3 = m128_const1_64( 0x5530463255304632 ); \ + A4 = m128_const1_64( 0x71B4EF9071B4EF90 ); \ + A5 = m128_const1_64( 0x0EA9E82C0EA9E82C ); \ + A6 = m128_const1_64( 0xDBB930F1DBB930F1 ); \ + A7 = m128_const1_64( 0xFAD06B8BFAD06B8B ); \ + A8 = m128_const1_64( 0xBE0CAE40BE0CAE40 ); \ + A9 = m128_const1_64( 0x8BD144108BD14410 ); \ + AA = m128_const1_64( 0x76D2ADAC76D2ADAC ); \ + AB = m128_const1_64( 0x28ACAB7F28ACAB7F ); \ B0 = m128_const1_64( 0xC1099CB7C1099CB7 ); \ B1 = m128_const1_64( 0x07B385F307B385F3 ); \ B2 = m128_const1_64( 0xE7442C26E7442C26 ); \ @@ -774,18 +774,18 @@ shabal512_8way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) } while (0) #define WRITE_STATE(state) do { \ - (state)->A[0] = A00; \ - (state)->A[1] = A01; \ - (state)->A[2] = A02; \ - (state)->A[3] = A03; \ - (state)->A[4] = A04; \ - (state)->A[5] = A05; \ - (state)->A[6] = A06; \ - 
(state)->A[7] = A07; \ - (state)->A[8] = A08; \ - (state)->A[9] = A09; \ - (state)->A[10] = A0A; \ - (state)->A[11] = A0B; \ + (state)->A[0] = A0; \ + (state)->A[1] = A1; \ + (state)->A[2] = A2; \ + (state)->A[3] = A3; \ + (state)->A[4] = A4; \ + (state)->A[5] = A5; \ + (state)->A[6] = A6; \ + (state)->A[7] = A7; \ + (state)->A[8] = A8; \ + (state)->A[9] = A9; \ + (state)->A[10] = AA; \ + (state)->A[11] = AB; \ (state)->B[0] = B0; \ (state)->B[1] = B1; \ (state)->B[2] = B2; \ @@ -884,8 +884,8 @@ do { \ #define XOR_W \ do { \ - A00 = _mm_xor_si128( A00, _mm_set1_epi32( Wlow ) ); \ - A01 = _mm_xor_si128( A01, _mm_set1_epi32( Whigh ) ); \ + A0 = _mm_xor_si128( A0, _mm_set1_epi32( Wlow ) ); \ + A1 = _mm_xor_si128( A1, _mm_set1_epi32( Whigh ) ); \ } while (0) @@ -940,60 +940,60 @@ do { \ } while (0) #define PERM_STEP_0 do { \ - PERM_ELT(A00, A0B, B0, BD, B9, B6, C8, M0); \ - PERM_ELT(A01, A00, B1, BE, BA, B7, C7, M1); \ - PERM_ELT(A02, A01, B2, BF, BB, B8, C6, M2); \ - PERM_ELT(A03, A02, B3, B0, BC, B9, C5, M3); \ - PERM_ELT(A04, A03, B4, B1, BD, BA, C4, M4); \ - PERM_ELT(A05, A04, B5, B2, BE, BB, C3, M5); \ - PERM_ELT(A06, A05, B6, B3, BF, BC, C2, M6); \ - PERM_ELT(A07, A06, B7, B4, B0, BD, C1, M7); \ - PERM_ELT(A08, A07, B8, B5, B1, BE, C0, M8); \ - PERM_ELT(A09, A08, B9, B6, B2, BF, CF, M9); \ - PERM_ELT(A0A, A09, BA, B7, B3, B0, CE, MA); \ - PERM_ELT(A0B, A0A, BB, B8, B4, B1, CD, MB); \ - PERM_ELT(A00, A0B, BC, B9, B5, B2, CC, MC); \ - PERM_ELT(A01, A00, BD, BA, B6, B3, CB, MD); \ - PERM_ELT(A02, A01, BE, BB, B7, B4, CA, ME); \ - PERM_ELT(A03, A02, BF, BC, B8, B5, C9, MF); \ + PERM_ELT(A0, AB, B0, BD, B9, B6, C8, M0); \ + PERM_ELT(A1, A0, B1, BE, BA, B7, C7, M1); \ + PERM_ELT(A2, A1, B2, BF, BB, B8, C6, M2); \ + PERM_ELT(A3, A2, B3, B0, BC, B9, C5, M3); \ + PERM_ELT(A4, A3, B4, B1, BD, BA, C4, M4); \ + PERM_ELT(A5, A4, B5, B2, BE, BB, C3, M5); \ + PERM_ELT(A6, A5, B6, B3, BF, BC, C2, M6); \ + PERM_ELT(A7, A6, B7, B4, B0, BD, C1, M7); \ + PERM_ELT(A8, A7, B8, B5, B1, BE, C0, M8); \ + PERM_ELT(A9, A8, B9, B6, B2, BF, CF, M9); \ + PERM_ELT(AA, A9, BA, B7, B3, B0, CE, MA); \ + PERM_ELT(AB, AA, BB, B8, B4, B1, CD, MB); \ + PERM_ELT(A0, AB, BC, B9, B5, B2, CC, MC); \ + PERM_ELT(A1, A0, BD, BA, B6, B3, CB, MD); \ + PERM_ELT(A2, A1, BE, BB, B7, B4, CA, ME); \ + PERM_ELT(A3, A2, BF, BC, B8, B5, C9, MF); \ } while (0) #define PERM_STEP_1 do { \ - PERM_ELT(A04, A03, B0, BD, B9, B6, C8, M0); \ - PERM_ELT(A05, A04, B1, BE, BA, B7, C7, M1); \ - PERM_ELT(A06, A05, B2, BF, BB, B8, C6, M2); \ - PERM_ELT(A07, A06, B3, B0, BC, B9, C5, M3); \ - PERM_ELT(A08, A07, B4, B1, BD, BA, C4, M4); \ - PERM_ELT(A09, A08, B5, B2, BE, BB, C3, M5); \ - PERM_ELT(A0A, A09, B6, B3, BF, BC, C2, M6); \ - PERM_ELT(A0B, A0A, B7, B4, B0, BD, C1, M7); \ - PERM_ELT(A00, A0B, B8, B5, B1, BE, C0, M8); \ - PERM_ELT(A01, A00, B9, B6, B2, BF, CF, M9); \ - PERM_ELT(A02, A01, BA, B7, B3, B0, CE, MA); \ - PERM_ELT(A03, A02, BB, B8, B4, B1, CD, MB); \ - PERM_ELT(A04, A03, BC, B9, B5, B2, CC, MC); \ - PERM_ELT(A05, A04, BD, BA, B6, B3, CB, MD); \ - PERM_ELT(A06, A05, BE, BB, B7, B4, CA, ME); \ - PERM_ELT(A07, A06, BF, BC, B8, B5, C9, MF); \ + PERM_ELT(A4, A3, B0, BD, B9, B6, C8, M0); \ + PERM_ELT(A5, A4, B1, BE, BA, B7, C7, M1); \ + PERM_ELT(A6, A5, B2, BF, BB, B8, C6, M2); \ + PERM_ELT(A7, A6, B3, B0, BC, B9, C5, M3); \ + PERM_ELT(A8, A7, B4, B1, BD, BA, C4, M4); \ + PERM_ELT(A9, A8, B5, B2, BE, BB, C3, M5); \ + PERM_ELT(AA, A9, B6, B3, BF, BC, C2, M6); \ + PERM_ELT(AB, AA, B7, B4, B0, BD, C1, M7); \ + PERM_ELT(A0, AB, B8, B5, B1, BE, C0, 
M8); \ + PERM_ELT(A1, A0, B9, B6, B2, BF, CF, M9); \ + PERM_ELT(A2, A1, BA, B7, B3, B0, CE, MA); \ + PERM_ELT(A3, A2, BB, B8, B4, B1, CD, MB); \ + PERM_ELT(A4, A3, BC, B9, B5, B2, CC, MC); \ + PERM_ELT(A5, A4, BD, BA, B6, B3, CB, MD); \ + PERM_ELT(A6, A5, BE, BB, B7, B4, CA, ME); \ + PERM_ELT(A7, A6, BF, BC, B8, B5, C9, MF); \ } while (0) #define PERM_STEP_2 do { \ - PERM_ELT(A08, A07, B0, BD, B9, B6, C8, M0); \ - PERM_ELT(A09, A08, B1, BE, BA, B7, C7, M1); \ - PERM_ELT(A0A, A09, B2, BF, BB, B8, C6, M2); \ - PERM_ELT(A0B, A0A, B3, B0, BC, B9, C5, M3); \ - PERM_ELT(A00, A0B, B4, B1, BD, BA, C4, M4); \ - PERM_ELT(A01, A00, B5, B2, BE, BB, C3, M5); \ - PERM_ELT(A02, A01, B6, B3, BF, BC, C2, M6); \ - PERM_ELT(A03, A02, B7, B4, B0, BD, C1, M7); \ - PERM_ELT(A04, A03, B8, B5, B1, BE, C0, M8); \ - PERM_ELT(A05, A04, B9, B6, B2, BF, CF, M9); \ - PERM_ELT(A06, A05, BA, B7, B3, B0, CE, MA); \ - PERM_ELT(A07, A06, BB, B8, B4, B1, CD, MB); \ - PERM_ELT(A08, A07, BC, B9, B5, B2, CC, MC); \ - PERM_ELT(A09, A08, BD, BA, B6, B3, CB, MD); \ - PERM_ELT(A0A, A09, BE, BB, B7, B4, CA, ME); \ - PERM_ELT(A0B, A0A, BF, BC, B8, B5, C9, MF); \ + PERM_ELT(A8, A7, B0, BD, B9, B6, C8, M0); \ + PERM_ELT(A9, A8, B1, BE, BA, B7, C7, M1); \ + PERM_ELT(AA, A9, B2, BF, BB, B8, C6, M2); \ + PERM_ELT(AB, AA, B3, B0, BC, B9, C5, M3); \ + PERM_ELT(A0, AB, B4, B1, BD, BA, C4, M4); \ + PERM_ELT(A1, A0, B5, B2, BE, BB, C3, M5); \ + PERM_ELT(A2, A1, B6, B3, BF, BC, C2, M6); \ + PERM_ELT(A3, A2, B7, B4, B0, BD, C1, M7); \ + PERM_ELT(A4, A3, B8, B5, B1, BE, C0, M8); \ + PERM_ELT(A5, A4, B9, B6, B2, BF, CF, M9); \ + PERM_ELT(A6, A5, BA, B7, B3, B0, CE, MA); \ + PERM_ELT(A7, A6, BB, B8, B4, B1, CD, MB); \ + PERM_ELT(A8, A7, BC, B9, B5, B2, CC, MC); \ + PERM_ELT(A9, A8, BD, BA, B6, B3, CB, MD); \ + PERM_ELT(AA, A9, BE, BB, B7, B4, CA, ME); \ + PERM_ELT(AB, AA, BF, BC, B8, B5, C9, MF); \ } while (0) #define APPLY_P \ @@ -1017,42 +1017,42 @@ do { \ PERM_STEP_0; \ PERM_STEP_1; \ PERM_STEP_2; \ - A0B = _mm_add_epi32( A0B, C6 ); \ - A0A = _mm_add_epi32( A0A, C5 ); \ - A09 = _mm_add_epi32( A09, C4 ); \ - A08 = _mm_add_epi32( A08, C3 ); \ - A07 = _mm_add_epi32( A07, C2 ); \ - A06 = _mm_add_epi32( A06, C1 ); \ - A05 = _mm_add_epi32( A05, C0 ); \ - A04 = _mm_add_epi32( A04, CF ); \ - A03 = _mm_add_epi32( A03, CE ); \ - A02 = _mm_add_epi32( A02, CD ); \ - A01 = _mm_add_epi32( A01, CC ); \ - A00 = _mm_add_epi32( A00, CB ); \ - A0B = _mm_add_epi32( A0B, CA ); \ - A0A = _mm_add_epi32( A0A, C9 ); \ - A09 = _mm_add_epi32( A09, C8 ); \ - A08 = _mm_add_epi32( A08, C7 ); \ - A07 = _mm_add_epi32( A07, C6 ); \ - A06 = _mm_add_epi32( A06, C5 ); \ - A05 = _mm_add_epi32( A05, C4 ); \ - A04 = _mm_add_epi32( A04, C3 ); \ - A03 = _mm_add_epi32( A03, C2 ); \ - A02 = _mm_add_epi32( A02, C1 ); \ - A01 = _mm_add_epi32( A01, C0 ); \ - A00 = _mm_add_epi32( A00, CF ); \ - A0B = _mm_add_epi32( A0B, CE ); \ - A0A = _mm_add_epi32( A0A, CD ); \ - A09 = _mm_add_epi32( A09, CC ); \ - A08 = _mm_add_epi32( A08, CB ); \ - A07 = _mm_add_epi32( A07, CA ); \ - A06 = _mm_add_epi32( A06, C9 ); \ - A05 = _mm_add_epi32( A05, C8 ); \ - A04 = _mm_add_epi32( A04, C7 ); \ - A03 = _mm_add_epi32( A03, C6 ); \ - A02 = _mm_add_epi32( A02, C5 ); \ - A01 = _mm_add_epi32( A01, C4 ); \ - A00 = _mm_add_epi32( A00, C3 ); \ + AB = _mm_add_epi32( AB, C6 ); \ + AA = _mm_add_epi32( AA, C5 ); \ + A9 = _mm_add_epi32( A9, C4 ); \ + A8 = _mm_add_epi32( A8, C3 ); \ + A7 = _mm_add_epi32( A7, C2 ); \ + A6 = _mm_add_epi32( A6, C1 ); \ + A5 = _mm_add_epi32( A5, C0 ); \ + A4 = _mm_add_epi32( A4, CF ); \ + A3 = 
_mm_add_epi32( A3, CE ); \ + A2 = _mm_add_epi32( A2, CD ); \ + A1 = _mm_add_epi32( A1, CC ); \ + A0 = _mm_add_epi32( A0, CB ); \ + AB = _mm_add_epi32( AB, CA ); \ + AA = _mm_add_epi32( AA, C9 ); \ + A9 = _mm_add_epi32( A9, C8 ); \ + A8 = _mm_add_epi32( A8, C7 ); \ + A7 = _mm_add_epi32( A7, C6 ); \ + A6 = _mm_add_epi32( A6, C5 ); \ + A5 = _mm_add_epi32( A5, C4 ); \ + A4 = _mm_add_epi32( A4, C3 ); \ + A3 = _mm_add_epi32( A3, C2 ); \ + A2 = _mm_add_epi32( A2, C1 ); \ + A1 = _mm_add_epi32( A1, C0 ); \ + A0 = _mm_add_epi32( A0, CF ); \ + AB = _mm_add_epi32( AB, CE ); \ + AA = _mm_add_epi32( AA, CD ); \ + A9 = _mm_add_epi32( A9, CC ); \ + A8 = _mm_add_epi32( A8, CB ); \ + A7 = _mm_add_epi32( A7, CA ); \ + A6 = _mm_add_epi32( A6, C9 ); \ + A5 = _mm_add_epi32( A5, C8 ); \ + A4 = _mm_add_epi32( A4, C7 ); \ + A3 = _mm_add_epi32( A3, C6 ); \ + A2 = _mm_add_epi32( A2, C5 ); \ + A1 = _mm_add_epi32( A1, C4 ); \ + A0 = _mm_add_epi32( A0, C3 ); \ } while (0) #define INCR_W do { \ diff --git a/algo/verthash/Verthash.c b/algo/verthash/Verthash.c index 8880b45f..29521120 100644 --- a/algo/verthash/Verthash.c +++ b/algo/verthash/Verthash.c @@ -10,6 +10,7 @@ #include "algo-gate-api.h" #include "Verthash.h" #include "mm_malloc.h" +#include "malloc-huge.h" //----------------------------------------------------------------------------- // Verthash info management @@ -84,12 +85,18 @@ int verthash_info_init(verthash_info_t* info, const char* file_name) } // Allocate data - info->data = (uint8_t *)_mm_malloc( fileSize, 64 ); - if (!info->data) + info->data = (uint8_t *)malloc_hugepages( fileSize ); + if ( info->data ) + if ( !opt_quiet ) applog( LOG_INFO, "Verthash data is using huge pages"); + else { - fclose(fileMiningData); - // Memory allocation fatal error. - return 2; + info->data = (uint8_t *)_mm_malloc( fileSize, 64 ); + if (!info->data) + { + fclose(fileMiningData); + // Memory allocation fatal error. 
+ return 2; + } } // Load data diff --git a/algo/verthash/tiny_sha3/sha3-4way.c b/algo/verthash/tiny_sha3/sha3-4way.c index abbc8483..debbd775 100644 --- a/algo/verthash/tiny_sha3/sha3-4way.c +++ b/algo/verthash/tiny_sha3/sha3-4way.c @@ -29,16 +29,11 @@ void sha3_4way_keccakf( __m256i st[25] ) for ( r = 0; r < KECCAKF_ROUNDS; r++ ) { // Theta - bc[0] = _mm256_xor_si256( st[0], - mm256_xor4( st[5], st[10], st[15], st[20] ) ); - bc[1] = _mm256_xor_si256( st[1], - mm256_xor4( st[6], st[11], st[16], st[21] ) ); - bc[2] = _mm256_xor_si256( st[2], - mm256_xor4( st[7], st[12], st[17], st[22] ) ); - bc[3] = _mm256_xor_si256( st[3], - mm256_xor4( st[8], st[13], st[18], st[23] ) ); - bc[4] = _mm256_xor_si256( st[4], - mm256_xor4( st[9], st[14], st[19], st[24] ) ); + bc[0] = mm256_xor3( st[0], st[5], mm256_xor3( st[10], st[15], st[20] ) ); + bc[1] = mm256_xor3( st[1], st[6], mm256_xor3( st[11], st[16], st[21] ) ); + bc[2] = mm256_xor3( st[2], st[7], mm256_xor3( st[12], st[17], st[22] ) ); + bc[3] = mm256_xor3( st[3], st[8], mm256_xor3( st[13], st[18], st[23] ) ); + bc[4] = mm256_xor3( st[4], st[9], mm256_xor3( st[14], st[19], st[24] ) ); for ( i = 0; i < 5; i++ ) { @@ -89,17 +84,13 @@ void sha3_4way_keccakf( __m256i st[25] ) // Chi for ( j = 0; j < 25; j += 5 ) { - memcpy( bc, &st[ j ], 5*32 ); - st[ j ] = _mm256_xor_si256( st[ j ], - _mm256_andnot_si256( bc[1], bc[2] ) ); - st[ j+1 ] = _mm256_xor_si256( st[ j+1 ], - _mm256_andnot_si256( bc[2], bc[3] ) ); - st[ j+2 ] = _mm256_xor_si256( st[ j+2 ], - _mm256_andnot_si256( bc[3], bc[4] ) ); - st[ j+3 ] = _mm256_xor_si256( st[ j+3 ], - _mm256_andnot_si256( bc[4], bc[0] ) ); - st[ j+4 ] = _mm256_xor_si256( st[ j+4 ], - _mm256_andnot_si256( bc[0], bc[1] ) ); + bc[0] = st[j]; + bc[1] = st[j+1]; + st[ j ] = mm256_xorandnot( st[ j ], st[j+1], st[j+2] ); + st[ j+1 ] = mm256_xorandnot( st[ j+1 ], st[j+2], st[j+3] ); + st[ j+2 ] = mm256_xorandnot( st[ j+2 ], st[j+3], st[j+4] ); + st[ j+3 ] = mm256_xorandnot( st[ j+3 ], st[j+4], bc[0] ); + st[ j+4 ] = mm256_xorandnot( st[ j+4 ], bc[0], bc[1] ); } // Iota diff --git a/algo/verthash/verthash-gate.c b/algo/verthash/verthash-gate.c index ec808f6b..eeb2e5dd 100644 --- a/algo/verthash/verthash-gate.c +++ b/algo/verthash/verthash-gate.c @@ -127,7 +127,7 @@ bool register_verthash_algo( algo_gate_t* gate ) { opt_target_factor = 256.0; gate->scanhash = (void*)&scanhash_verthash; - gate->optimizations = AVX2_OPT; + gate->optimizations = SSE42_OPT | AVX2_OPT; const char *verthash_data_file = opt_data_file ? opt_data_file : default_verthash_data_file; diff --git a/configure b/configure index f678bda4..e76f1398 100755 --- a/configure +++ b/configure @@ -1,6 +1,6 @@ #! /bin/sh # Guess values for system-dependent variables and create Makefiles. -# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.19.2. +# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.19.3. # # # Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc. @@ -577,8 +577,8 @@ MAKEFLAGS= # Identity of this package. PACKAGE_NAME='cpuminer-opt' PACKAGE_TARNAME='cpuminer-opt' -PACKAGE_VERSION='3.19.2' -PACKAGE_STRING='cpuminer-opt 3.19.2' +PACKAGE_VERSION='3.19.3' +PACKAGE_STRING='cpuminer-opt 3.19.3' PACKAGE_BUGREPORT='' PACKAGE_URL='' @@ -1332,7 +1332,7 @@ if test "$ac_init_help" = "long"; then # Omit some internal or obsolete options to make the list less imposing. # This message is too long to be a string in the A/UX 3.1 sh. cat <<_ACEOF -\`configure' configures cpuminer-opt 3.19.2 to adapt to many kinds of systems. 
+\`configure' configures cpuminer-opt 3.19.3 to adapt to many kinds of systems. Usage: $0 [OPTION]... [VAR=VALUE]... @@ -1404,7 +1404,7 @@ fi if test -n "$ac_init_help"; then case $ac_init_help in - short | recursive ) echo "Configuration of cpuminer-opt 3.19.2:";; + short | recursive ) echo "Configuration of cpuminer-opt 3.19.3:";; esac cat <<\_ACEOF @@ -1509,7 +1509,7 @@ fi test -n "$ac_init_help" && exit $ac_status if $ac_init_version; then cat <<\_ACEOF -cpuminer-opt configure 3.19.2 +cpuminer-opt configure 3.19.3 generated by GNU Autoconf 2.69 Copyright (C) 2012 Free Software Foundation, Inc. @@ -2012,7 +2012,7 @@ cat >config.log <<_ACEOF This file contains any messages produced by compilers while running configure, to aid debugging if configure makes a mistake. -It was created by cpuminer-opt $as_me 3.19.2, which was +It was created by cpuminer-opt $as_me 3.19.3, which was generated by GNU Autoconf 2.69. Invocation command line was $ $0 $@ @@ -2993,7 +2993,7 @@ fi # Define the identity of the package. PACKAGE='cpuminer-opt' - VERSION='3.19.2' + VERSION='3.19.3' cat >>confdefs.h <<_ACEOF @@ -6690,7 +6690,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 # report actual input values of CONFIG_FILES etc. instead of their # values after options handling. ac_log=" -This file was extended by cpuminer-opt $as_me 3.19.2, which was +This file was extended by cpuminer-opt $as_me 3.19.3, which was generated by GNU Autoconf 2.69. Invocation command line was CONFIG_FILES = $CONFIG_FILES @@ -6756,7 +6756,7 @@ _ACEOF cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`" ac_cs_version="\\ -cpuminer-opt config.status 3.19.2 +cpuminer-opt config.status 3.19.3 configured by $0, generated by GNU Autoconf 2.69, with options \\"\$ac_cs_config\\" diff --git a/configure.ac b/configure.ac index 314b0d5b..2b17493a 100644 --- a/configure.ac +++ b/configure.ac @@ -1,4 +1,4 @@ -AC_INIT([cpuminer-opt], [3.19.2]) +AC_INIT([cpuminer-opt], [3.19.3]) AC_PREREQ([2.59c]) AC_CANONICAL_SYSTEM diff --git a/malloc-huge.c b/malloc-huge.c new file mode 100644 index 00000000..75c0165d --- /dev/null +++ b/malloc-huge.c @@ -0,0 +1,36 @@ +#include "malloc-huge.h" +#include "miner.h" + +#define HUGEPAGE_SIZE_2M (2 * 1024 * 1024) + +void *malloc_hugepages( size_t size ) +{ +#if !(defined(MAP_HUGETLB) && defined(MAP_ANON)) +// applog( LOG_WARNING, "Huge pages not available",size); + return NULL; +#else + + if ( size < HUGEPAGE_MIN_ALLOC ) + { +// applog( LOG_WARNING, "Block too small for huge pages: %lu bytes",size); + return NULL; + } + + const size_t hugepage_mask = (size_t)HUGEPAGE_SIZE_2M - 1; + void *p = NULL; + int flags = + #ifdef MAP_NOCORE + MAP_NOCORE | + #endif + MAP_HUGETLB | MAP_ANON | MAP_PRIVATE; + + // round size up to next page boundary + size = ( size + hugepage_mask ) & (~hugepage_mask); + + p = mmap( NULL, size, PROT_READ | PROT_WRITE, flags, -1, 0 ); + if ( p == MAP_FAILED ) + p = NULL; + return p; +#endif +} + diff --git a/malloc-huge.h b/malloc-huge.h new file mode 100644 index 00000000..371e09a5 --- /dev/null +++ b/malloc-huge.h @@ -0,0 +1,24 @@ +#if !(defined(MALLOC_HUGE__)) +#define MALLOC_HUGE__ + +#include +#include +#include +#include + +#ifdef __unix__ +#include +#endif + +#if defined(MAP_HUGETLB) + +// Minimum block size 6 MiB to use huge pages +#define HUGEPAGE_MIN_ALLOC (6 * 1024 * 1024) + +#endif + +// Attempt to allocate memory backed by 2 MiB pages, returns NULL on failure. 
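/*
 * A minimal sketch of how a caller might use this allocator, mirroring the
 * scrypt and verthash call sites in this patch: try huge pages first and
 * fall back to an ordinary aligned allocation.  A cautious caller also
 * tracks which path succeeded, since a mapping is released with munmap
 * rather than _mm_free.  The wrapper name and the is_huge flag are
 * illustrative, not part of the patch.
 */
#include <stddef.h>
#include <mm_malloc.h>
#include "malloc-huge.h"

static void *alloc_big_buffer( size_t size, int *is_huge )
{
    void *p = malloc_hugepages( size );      // NULL when huge pages unavailable
    *is_huge = ( p != NULL );
    if ( !p )
        p = _mm_malloc( size, 64 );          // plain 64-byte aligned fallback
    return p;
}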
+void *malloc_hugepages( size_t size ); + +#endif + diff --git a/winbuild-cross.sh b/winbuild-cross.sh index ec738593..5774430f 100755 --- a/winbuild-cross.sh +++ b/winbuild-cross.sh @@ -45,7 +45,7 @@ cp $LOCAL_LIB/curl/lib/.libs/libcurl-4.dll release/ ./clean-all.sh || echo clean rm -f config.status ./autogen.sh || echo done -CFLAGS="$DEFAULT_CFLAGS -march=icelake-client" ./configure $CONFIGURE_ARGS +CFLAGS="-march=icelake-client $DEFAULT_CFLAGS" ./configure $CONFIGURE_ARGS make -j 8 strip -s cpuminer.exe mv cpuminer.exe release/cpuminer-avx512-sha-vaes.exe @@ -53,7 +53,7 @@ mv cpuminer.exe release/cpuminer-avx512-sha-vaes.exe # AVX512 AES: Intel Core HEDT Slylake-X, Cascadelake make clean || echo clean rm -f config.status -CFLAGS="$DEFAULT_CFLAGS -march=skylake-avx512" ./configure $CONFIGURE_ARGS +CFLAGS="-march=skylake-avx512 $DEFAULT_CFLAGS" ./configure $CONFIGURE_ARGS make -j 8 strip -s cpuminer.exe mv cpuminer.exe release/cpuminer-avx512.exe @@ -61,7 +61,7 @@ mv cpuminer.exe release/cpuminer-avx512.exe # AVX2 SHA VAES: Intel Alderlake, AMD Zen3 make clean || echo done rm -f config.status -CFLAGS="$DEFAULT_CFLAGS -mavx2 -msha -mvaes" ./configure $CONFIGURE_ARGS +CFLAGS="-mavx2 -msha -mvaes $DEFAULT_CFLAGS" ./configure $CONFIGURE_ARGS make -j 8 strip -s cpuminer.exe mv cpuminer.exe release/cpuminer-avx2-sha-vaes.exe @@ -69,7 +69,7 @@ mv cpuminer.exe release/cpuminer-avx2-sha-vaes.exe # AVX2 AES SHA: AMD Zen1 make clean || echo clean rm -f config.status -CFLAGS="$DEFAULT_CFLAGS -march=znver1" ./configure $CONFIGURE_ARGS +CFLAGS="-march=znver1 $DEFAULT_CFLAGS" ./configure $CONFIGURE_ARGS make -j 8 strip -s cpuminer.exe mv cpuminer.exe release/cpuminer-avx2-sha.exe @@ -77,7 +77,7 @@ mv cpuminer.exe release/cpuminer-avx2-sha.exe # AVX2 AES: Intel Core Haswell, Skylake, Kabylake, Coffeelake, Cometlake make clean || echo clean rm -f config.status -CFLAGS="$DEFAULT_CFLAGS -march=core-avx2" ./configure $CONFIGURE_ARGS +CFLAGS="-march=core-avx2 $DEFAULT_CFLAGS" ./configure $CONFIGURE_ARGS make -j 8 strip -s cpuminer.exe mv cpuminer.exe release/cpuminer-avx2.exe @@ -85,7 +85,7 @@ mv cpuminer.exe release/cpuminer-avx2.exe # AVX AES: Intel Sandybridge, Ivybridge make clean || echo clean rm -f config.status -CFLAGS="$DEFAULT_CFLAGS_OLD -march=corei7-avx -maes" ./configure $CONFIGURE_ARGS +CFLAGS="-march=corei7-avx -maes $DEFAULT_CFLAGS_OLD" ./configure $CONFIGURE_ARGS make -j 8 strip -s cpuminer.exe mv cpuminer.exe release/cpuminer-avx.exe @@ -93,7 +93,7 @@ mv cpuminer.exe release/cpuminer-avx.exe # SSE4.2 AES: Intel Westmere make clean || echo clean rm -f config.status -CFLAGS="$DEFAULT_CFLAGS_OLD -march=westmere -maes" ./configure $CONFIGURE_ARGS +CFLAGS="-march=westmere -maes $DEFAULT_CFLAGS_OLD" ./configure $CONFIGURE_ARGS make -j 8 strip -s cpuminer.exe mv cpuminer.exe release/cpuminer-aes-sse42.exe @@ -118,9 +118,16 @@ mv cpuminer.exe release/cpuminer-aes-sse42.exe # Generic SSE2 make clean || echo clean rm -f config.status -CFLAGS="$DEFAULT_CFLAGS_OLD -msse2" ./configure $CONFIGURE_ARGS +CFLAGS="-msse2 $DEFAULT_CFLAGS_OLD" ./configure $CONFIGURE_ARGS make -j 8 strip -s cpuminer.exe mv cpuminer.exe release/cpuminer-sse2.exe make clean || echo clean +# Native with CPU groups ennabled +make clean || echo clean +rm -f config.status +CFLAGS="-march=native $DEFAULT_CFLAGS" ./configure $CONFIGURE_ARGS +make -j 8 +strip -s cpuminer.exe + From 8727d79182c99f2163a245bd576edfd7c4c328e0 Mon Sep 17 00:00:00 2001 From: Jay D Dee Date: Wed, 12 Jan 2022 21:08:25 -0500 Subject: [PATCH 18/20] v3.19.4 --- 
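The verthash allocation failure fixed below (#359) comes from a dangling
else: without braces, the else in the v3.19.3 hunk binds to the inner
if ( !opt_quiet ), so the _mm_malloc fallback never runs when
malloc_hugepages() returns NULL.  A minimal sketch of the pitfall, with
illustrative variable names:

#include <stdio.h>

int main( void )
{
    int huge_pages_ok = 0;      // huge page allocation failed
    int quiet = 0;

    if ( huge_pages_ok )
        if ( !quiet )
            puts( "using huge pages" );
    else
        puts( "fallback allocation" );   // binds to the inner if, so it is
                                         // never reached when huge_pages_ok == 0
    return 0;
}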
RELEASE_NOTES | 9 +++++++++ algo/verthash/Verthash.c | 15 ++++++++------- configure | 20 ++++++++++---------- configure.ac | 2 +- cpu-miner.c | 39 +++++++++++++++++++++++++++++++++++---- miner.h | 3 +++ util.c | 19 +++++++++++++++++++ 7 files changed, 85 insertions(+), 22 deletions(-) diff --git a/RELEASE_NOTES b/RELEASE_NOTES index 9f3fb6a3..441c15dd 100644 --- a/RELEASE_NOTES +++ b/RELEASE_NOTES @@ -65,6 +65,15 @@ If not what makes it happen or not happen? Change Log ---------- +v3.19.4 + +#359: Fix verthash memory allocation for non-hugepages, broken in v3.19.3. + +New option stratum-keepalive prevents stratum timeouts when no shares are +submitted for several minutes due to high difficulty. + +Fixed a bug displaying optimizations for some algos. + v3.19.3 Linux: Faster verthash (+25%), scryptn2 (+2%) when huge pages are available. diff --git a/algo/verthash/Verthash.c b/algo/verthash/Verthash.c index 29521120..553bb6a6 100644 --- a/algo/verthash/Verthash.c +++ b/algo/verthash/Verthash.c @@ -87,16 +87,17 @@ int verthash_info_init(verthash_info_t* info, const char* file_name) // Allocate data info->data = (uint8_t *)malloc_hugepages( fileSize ); if ( info->data ) + { if ( !opt_quiet ) applog( LOG_INFO, "Verthash data is using huge pages"); + } else - { info->data = (uint8_t *)_mm_malloc( fileSize, 64 ); - if (!info->data) - { - fclose(fileMiningData); - // Memory allocation fatal error. - return 2; - } + + if ( !info->data ) + { + fclose( fileMiningData ); + // Memory allocation fatal error. + return 2; } // Load data diff --git a/configure b/configure index e76f1398..6c1104c0 100755 --- a/configure +++ b/configure @@ -1,6 +1,6 @@ #! /bin/sh # Guess values for system-dependent variables and create Makefiles. -# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.19.3. +# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.19.4. # # # Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc. @@ -577,8 +577,8 @@ MAKEFLAGS= # Identity of this package. PACKAGE_NAME='cpuminer-opt' PACKAGE_TARNAME='cpuminer-opt' -PACKAGE_VERSION='3.19.3' -PACKAGE_STRING='cpuminer-opt 3.19.3' +PACKAGE_VERSION='3.19.4' +PACKAGE_STRING='cpuminer-opt 3.19.4' PACKAGE_BUGREPORT='' PACKAGE_URL='' @@ -1332,7 +1332,7 @@ if test "$ac_init_help" = "long"; then # Omit some internal or obsolete options to make the list less imposing. # This message is too long to be a string in the A/UX 3.1 sh. cat <<_ACEOF -\`configure' configures cpuminer-opt 3.19.3 to adapt to many kinds of systems. +\`configure' configures cpuminer-opt 3.19.4 to adapt to many kinds of systems. Usage: $0 [OPTION]... [VAR=VALUE]... @@ -1404,7 +1404,7 @@ fi if test -n "$ac_init_help"; then case $ac_init_help in - short | recursive ) echo "Configuration of cpuminer-opt 3.19.3:";; + short | recursive ) echo "Configuration of cpuminer-opt 3.19.4:";; esac cat <<\_ACEOF @@ -1509,7 +1509,7 @@ fi test -n "$ac_init_help" && exit $ac_status if $ac_init_version; then cat <<\_ACEOF -cpuminer-opt configure 3.19.3 +cpuminer-opt configure 3.19.4 generated by GNU Autoconf 2.69 Copyright (C) 2012 Free Software Foundation, Inc. @@ -2012,7 +2012,7 @@ cat >config.log <<_ACEOF This file contains any messages produced by compilers while running configure, to aid debugging if configure makes a mistake. -It was created by cpuminer-opt $as_me 3.19.3, which was +It was created by cpuminer-opt $as_me 3.19.4, which was generated by GNU Autoconf 2.69. Invocation command line was $ $0 $@ @@ -2993,7 +2993,7 @@ fi # Define the identity of the package. 
PACKAGE='cpuminer-opt' - VERSION='3.19.3' + VERSION='3.19.4' cat >>confdefs.h <<_ACEOF @@ -6690,7 +6690,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 # report actual input values of CONFIG_FILES etc. instead of their # values after options handling. ac_log=" -This file was extended by cpuminer-opt $as_me 3.19.3, which was +This file was extended by cpuminer-opt $as_me 3.19.4, which was generated by GNU Autoconf 2.69. Invocation command line was CONFIG_FILES = $CONFIG_FILES @@ -6756,7 +6756,7 @@ _ACEOF cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`" ac_cs_version="\\ -cpuminer-opt config.status 3.19.3 +cpuminer-opt config.status 3.19.4 configured by $0, generated by GNU Autoconf 2.69, with options \\"\$ac_cs_config\\" diff --git a/configure.ac b/configure.ac index 2b17493a..35567589 100644 --- a/configure.ac +++ b/configure.ac @@ -1,4 +1,4 @@ -AC_INIT([cpuminer-opt], [3.19.3]) +AC_INIT([cpuminer-opt], [3.19.4]) AC_PREREQ([2.59c]) AC_CANONICAL_SYSTEM diff --git a/cpu-miner.c b/cpu-miner.c index 7a70f9f1..abda606e 100644 --- a/cpu-miner.c +++ b/cpu-miner.c @@ -127,6 +127,10 @@ char *short_url = NULL; char *coinbase_address; char *opt_data_file = NULL; bool opt_verify = false; +static bool opt_stratum_keepalive = false; +static struct timeval stratum_keepalive_timer; +// Stratum typically times out in 5 minutes or 300 seconds +#define stratum_keepalive_timeout 180 // 3 minutes // pk_buffer_size is used as a version selector by b58 code, therefore // it must be set correctly to work. @@ -2797,6 +2801,30 @@ static void *stratum_thread(void *userdata ) if ( stratum.new_job ) stratum_gen_work( &stratum, &g_work ); + // is keepalive needed? + if ( opt_stratum_keepalive ) + { + struct timeval now, et; + gettimeofday( &now, NULL ); + // any shares submitted since last keepalive? 
+ if ( last_submit_time.tv_sec > stratum_keepalive_timer.tv_sec ) + memcpy( &stratum_keepalive_timer, &last_submit_time, + sizeof (struct timeval) ); + + timeval_subtract( &et, &now, &stratum_keepalive_timer ); + + if ( et.tv_sec > stratum_keepalive_timeout ) + { + double diff = stratum.job.diff * 0.5; + stratum_keepalive_timer = now; + if ( !opt_quiet ) + applog( LOG_BLUE, + "Stratum keepalive requesting lower difficulty" ); + stratum_suggest_difficulty( &stratum, diff ); + } + } + + // Wait for new message from server if ( likely( stratum_socket_full( &stratum, opt_timeout ) ) ) { if ( likely( s = stratum_recv_line( &stratum ) ) ) @@ -2818,7 +2846,6 @@ static void *stratum_thread(void *userdata ) stratum_need_reset = true; // stratum_disconnect( &stratum ); } - } // loop out: return NULL; @@ -2990,8 +3017,8 @@ static bool cpu_capability( bool display_only ) use_avx512 = cpu_has_avx512 && sw_has_avx512 && algo_has_avx512; use_sha = cpu_has_sha && sw_has_sha && algo_has_sha; use_vaes = cpu_has_vaes && sw_has_vaes && algo_has_vaes; - use_none = !( use_sse2 || use_aes || use_avx512 || use_avx2 || - use_sha || use_vaes ); + use_none = !( use_sse2 || use_sse42 || use_avx || use_aes || use_avx512 + || use_avx2 || use_sha || use_vaes ); // Display best options printf( "\nStarting miner with" ); @@ -3450,7 +3477,10 @@ void parse_arg(int key, char *arg ) case 1028: // verify opt_verify = true; break; - case 'V': + case 1029: // stratum-keepalive + opt_stratum_keepalive = true; + break; + case 'V': display_cpu_capability(); exit(0); case 'h': @@ -3899,6 +3929,7 @@ int main(int argc, char *argv[]) gettimeofday( &last_submit_time, NULL ); memcpy( &five_min_start, &last_submit_time, sizeof (struct timeval) ); memcpy( &session_start, &last_submit_time, sizeof (struct timeval) ); + memcpy( &stratum_keepalive_timer, &last_submit_time, sizeof (struct timeval) ); memcpy( &total_hashes_time, &last_submit_time, sizeof (struct timeval) ); pthread_mutex_unlock( &stats_lock ); diff --git a/miner.h b/miner.h index 99124111..903ddc18 100644 --- a/miner.h +++ b/miner.h @@ -466,6 +466,7 @@ void stratum_disconnect(struct stratum_ctx *sctx); bool stratum_subscribe(struct stratum_ctx *sctx); bool stratum_authorize(struct stratum_ctx *sctx, const char *user, const char *pass); bool stratum_handle_method(struct stratum_ctx *sctx, const char *s); +bool stratum_suggest_difficulty( struct stratum_ctx *sctx, double diff ); extern bool aes_ni_supported; @@ -918,6 +919,7 @@ Options:\n\ -c, --config=FILE load a JSON-format configuration file\n\ --data-file=FILE path and name of data file\n\ --verify enable additional time consuming start up tests\n\ + --stratum-keepalive Prevent disconnects when difficulty is too high\n\ -V, --version display version and CPU information and exit\n\ -h, --help display this help text and exit\n\ "; @@ -987,6 +989,7 @@ static struct option const options[] = { { "userpass", 1, NULL, 'O' }, { "data-file", 1, NULL, 1027 }, { "verify", 0, NULL, 1028 }, + { "stratum-keepalive", 0, NULL, 1029 }, { "version", 0, NULL, 'V' }, { 0, 0, 0, 0 } }; diff --git a/util.c b/util.c index 31b92703..a462bccc 100644 --- a/util.c +++ b/util.c @@ -1846,6 +1846,25 @@ bool stratum_authorize(struct stratum_ctx *sctx, const char *user, const char *p return ret; } +bool stratum_suggest_difficulty( struct stratum_ctx *sctx, double diff ) +{ + char *s; + s = (char*) malloc( 80 ); + bool rc = true; + + // response is handled seperately, what ID? 
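/* For reference only (not part of the patch): with diff = 0.5 the sprintf
   below produces the request
      {"id": 1, "method": "mining.suggest_difficulty", "params": ["0.500000"]}
   mining.suggest_difficulty is a non-standard stratum extension; pools that
   honour it typically adjust the share target via a later
   mining.set_difficulty notification rather than a result matched to this
   id, which is presumably why no reply is awaited here. */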
+ sprintf( s, "{\"id\": 1, \"method\": \"mining.suggest_difficulty\", \"params\": [\"%f\"]}", diff ); + if ( !stratum_send_line( sctx, s ) ) + { + applog(LOG_WARNING,"stratum.suggest_difficulty send failed"); + rc = false; + } + free ( s ); + return rc; +} + + + /** * Extract bloc height L H... here len=3, height=0x1333e8 * "...0000000000ffffffff2703e83313062f503253482f043d61105408" From 90137b391e090785461027fc4c0ddc3efff509ec Mon Sep 17 00:00:00 2001 From: Jay D Dee Date: Sun, 30 Jan 2022 20:59:54 -0500 Subject: [PATCH 19/20] v3.19.5 --- RELEASE_NOTES | 13 ++ .../argon2a/ar2/sj/scrypt-jane-portable-x86.h | 4 +- .../argon2a/ar2/sj/scrypt-jane-romix-basic.h | 3 +- algo/blake/decred-gate.c | 3 + algo/hodl/sha512-avx.h | 2 +- algo/ripemd/sph_ripemd.c | 6 + algo/ripemd/sph_ripemd.h | 3 + algo/sm3/sph_sm3.h | 2 +- algo/x16/x16rt-4way.c | 20 ++-- algo/x16/x16rt.c | 10 +- build-msys2.sh | 10 ++ configure | 20 ++-- configure.ac | 2 +- cpu-miner.c | 113 ++++++++++++------ miner.h | 19 +-- sysinfos.c | 2 +- util.c | 2 +- winbuild-cross.sh | 6 +- 18 files changed, 159 insertions(+), 81 deletions(-) create mode 100755 build-msys2.sh diff --git a/RELEASE_NOTES b/RELEASE_NOTES index 441c15dd..54d18080 100644 --- a/RELEASE_NOTES +++ b/RELEASE_NOTES @@ -65,6 +65,19 @@ If not what makes it happen or not happen? Change Log ---------- +v3.19.5 + +Enhanced stratum-keepalive preemptively resets the stratum connection +before the server to avoid lost shares. + +Added build-msys2.sh scrypt for easier compiling on Windows, see Wiki for details. + +X16RT: eliminate unnecessary recalculations of the hash order. + +Fix a few compiler warnings. + +Fixed log colour error when a block is solved. + v3.19.4 #359: Fix verthash memory allocation for non-hugepages, broken in v3.19.3. 
diff --git a/algo/argon2/argon2a/ar2/sj/scrypt-jane-portable-x86.h b/algo/argon2/argon2a/ar2/sj/scrypt-jane-portable-x86.h index fb457945..c2f9edcc 100644 --- a/algo/argon2/argon2a/ar2/sj/scrypt-jane-portable-x86.h +++ b/algo/argon2/argon2a/ar2/sj/scrypt-jane-portable-x86.h @@ -344,7 +344,7 @@ static size_t detect_cpu(void) { //union { uint8_t s[12]; uint32_t i[3]; } vendor_string; //cpu_vendors_x86 vendor = cpu_nobody; - x86_regs regs; + x86_regs regs; regs.eax = regs.ebx = regs.ecx = 0; uint32_t max_level, max_ext_level; size_t cpu_flags = 0; #if defined(X86ASM_AVX) || defined(X86_64ASM_AVX) @@ -460,4 +460,4 @@ get_top_cpuflag_desc(size_t flag) { #endif #endif -#endif /* defined(CPU_X86) || defined(CPU_X86_64) */ \ No newline at end of file +#endif /* defined(CPU_X86) || defined(CPU_X86_64) */ diff --git a/algo/argon2/argon2a/ar2/sj/scrypt-jane-romix-basic.h b/algo/argon2/argon2a/ar2/sj/scrypt-jane-romix-basic.h index 57ba649f..3124c847 100644 --- a/algo/argon2/argon2a/ar2/sj/scrypt-jane-romix-basic.h +++ b/algo/argon2/argon2a/ar2/sj/scrypt-jane-romix-basic.h @@ -4,11 +4,12 @@ typedef void (FASTCALL *scrypt_ROMixfn)(scrypt_mix_word_t *X/*[chunkWords]*/, sc #endif /* romix pre/post nop function */ +/* static void asm_calling_convention scrypt_romix_nop(scrypt_mix_word_t *blocks, size_t nblocks) { (void)blocks; (void)nblocks; } - +*/ /* romix pre/post endian conversion function */ static void asm_calling_convention scrypt_romix_convert_endian(scrypt_mix_word_t *blocks, size_t nblocks) { diff --git a/algo/blake/decred-gate.c b/algo/blake/decred-gate.c index 9c58b21b..bee00dd1 100644 --- a/algo/blake/decred-gate.c +++ b/algo/blake/decred-gate.c @@ -70,7 +70,10 @@ void decred_be_build_stratum_request( char *req, struct work *work, rpc_user, work->job_id, xnonce2str, ntimestr, noncestr ); free(xnonce2str); } + +#if !defined(min) #define min(a,b) (a>b ? (b) :(a)) +#endif void decred_build_extraheader( struct work* g_work, struct stratum_ctx* sctx ) { diff --git a/algo/hodl/sha512-avx.h b/algo/hodl/sha512-avx.h index eb7f094a..6fbb5bf7 100644 --- a/algo/hodl/sha512-avx.h +++ b/algo/hodl/sha512-avx.h @@ -45,6 +45,6 @@ void sha512Compute32b_parallel( uint64_t *data[SHA512_PARALLEL_N], uint64_t *digest[SHA512_PARALLEL_N]); -void sha512ProcessBlock(Sha512Context *context); +void sha512ProcessBlock(Sha512Context contexti[2] ); #endif diff --git a/algo/ripemd/sph_ripemd.c b/algo/ripemd/sph_ripemd.c index 9273fb8d..dd610966 100644 --- a/algo/ripemd/sph_ripemd.c +++ b/algo/ripemd/sph_ripemd.c @@ -35,6 +35,7 @@ #include "sph_ripemd.h" +#if 0 /* * Round functions for RIPEMD (original). */ @@ -46,6 +47,7 @@ static const sph_u32 oIV[5] = { SPH_C32(0x67452301), SPH_C32(0xEFCDAB89), SPH_C32(0x98BADCFE), SPH_C32(0x10325476) }; +#endif /* * Round functions for RIPEMD-128 and RIPEMD-160. @@ -63,6 +65,8 @@ static const sph_u32 IV[5] = { #define ROTL SPH_ROTL32 +#if 0 + /* ===================================================================== */ /* * RIPEMD (original hash, deprecated). @@ -539,6 +543,8 @@ sph_ripemd128_comp(const sph_u32 msg[16], sph_u32 val[4]) #undef RIPEMD128_IN } +#endif + /* ===================================================================== */ /* * RIPEMD-160. diff --git a/algo/ripemd/sph_ripemd.h b/algo/ripemd/sph_ripemd.h index 39fe5d1a..b677bd54 100644 --- a/algo/ripemd/sph_ripemd.h +++ b/algo/ripemd/sph_ripemd.h @@ -84,6 +84,7 @@ * can be cloned by copying the context (e.g. with a simple * memcpy()). 
*/ +#if 0 typedef struct { #ifndef DOXYGEN_IGNORE unsigned char buf[64]; /* first field, for alignment */ @@ -204,6 +205,8 @@ void sph_ripemd128_close(void *cc, void *dst); */ void sph_ripemd128_comp(const sph_u32 msg[16], sph_u32 val[4]); +#endif + /* ===================================================================== */ /** diff --git a/algo/sm3/sph_sm3.h b/algo/sm3/sph_sm3.h index eab61d36..3d69e55a 100644 --- a/algo/sm3/sph_sm3.h +++ b/algo/sm3/sph_sm3.h @@ -74,7 +74,7 @@ typedef struct { void sm3_init(sm3_ctx_t *ctx); void sm3_update(sm3_ctx_t *ctx, const unsigned char* data, size_t data_len); -void sm3_final(sm3_ctx_t *ctx, unsigned char digest[SM3_DIGEST_LENGTH]); +void sm3_final(sm3_ctx_t *ctx, unsigned char *digest); void sm3_compress(uint32_t digest[8], const unsigned char block[SM3_BLOCK_SIZE]); void sm3(const unsigned char *data, size_t datalen, unsigned char digest[SM3_DIGEST_LENGTH]); diff --git a/algo/x16/x16rt-4way.c b/algo/x16/x16rt-4way.c index fcd56af6..fee47ffa 100644 --- a/algo/x16/x16rt-4way.c +++ b/algo/x16/x16rt-4way.c @@ -24,15 +24,15 @@ int scanhash_x16rt_8way( struct work *work, uint32_t max_nonce, if ( bench ) ptarget[7] = 0x0cff; static __thread uint32_t s_ntime = UINT32_MAX; - uint32_t ntime = bswap_32( pdata[17] ); - if ( s_ntime != ntime ) + uint32_t masked_ntime = bswap_32( pdata[17] ) & 0xffffff80; + if ( s_ntime != masked_ntime ) { - x16rt_getTimeHash( ntime, &timeHash ); + x16rt_getTimeHash( masked_ntime, &timeHash ); x16rt_getAlgoString( &timeHash[0], x16r_hash_order ); - s_ntime = ntime; + s_ntime = masked_ntime; if ( opt_debug && !thr_id ) applog( LOG_INFO, "hash order: %s time: (%08x) time hash: (%08x)", - x16r_hash_order, ntime, timeHash ); + x16r_hash_order, bswap_32( pdata[17] ), timeHash ); } x16r_8way_prehash( vdata, pdata ); @@ -78,15 +78,15 @@ int scanhash_x16rt_4way( struct work *work, uint32_t max_nonce, if ( bench ) ptarget[7] = 0x0cff; static __thread uint32_t s_ntime = UINT32_MAX; - uint32_t ntime = bswap_32( pdata[17] ); - if ( s_ntime != ntime ) + uint32_t masked_ntime = bswap_32( pdata[17] ) & 0xffffff80; + if ( s_ntime != masked_ntime ) { - x16rt_getTimeHash( ntime, &timeHash ); + x16rt_getTimeHash( masked_ntime, &timeHash ); x16rt_getAlgoString( &timeHash[0], x16r_hash_order ); - s_ntime = ntime; + s_ntime = masked_ntime; if ( opt_debug && !thr_id ) applog( LOG_INFO, "hash order: %s time: (%08x) time hash: (%08x)", - x16r_hash_order, ntime, timeHash ); + x16r_hash_order, bswap_32( pdata[17] ), timeHash ); } x16r_4way_prehash( vdata, pdata ); diff --git a/algo/x16/x16rt.c b/algo/x16/x16rt.c index 62507098..7ff8dc5d 100644 --- a/algo/x16/x16rt.c +++ b/algo/x16/x16rt.c @@ -20,15 +20,15 @@ int scanhash_x16rt( struct work *work, uint32_t max_nonce, mm128_bswap32_80( edata, pdata ); static __thread uint32_t s_ntime = UINT32_MAX; - uint32_t ntime = swab32( pdata[17] ); - if ( s_ntime != ntime ) + uint32_t masked_ntime = swab32( pdata[17] ) & 0xffffff80; + if ( s_ntime != masked_ntime ) { - x16rt_getTimeHash( ntime, &timeHash ); + x16rt_getTimeHash( masked_ntime, &timeHash ); x16rt_getAlgoString( &timeHash[0], x16r_hash_order ); - s_ntime = ntime; + s_ntime = masked_ntime; if ( opt_debug && !thr_id ) applog( LOG_INFO, "hash order: %s time: (%08x) time hash: (%08x)", - x16r_hash_order, ntime, timeHash ); + x16r_hash_order, swab32( pdata[17] ), timeHash ); } x16r_prehash( edata, pdata ); diff --git a/build-msys2.sh b/build-msys2.sh new file mode 100755 index 00000000..8f257d40 --- /dev/null +++ b/build-msys2.sh @@ -0,0 +1,10 @@ +#!/bin/bash 
+# +# Compile on Windows using MSYS2 and MinGW. + +make distclean || echo clean +rm -f config.status +./autogen.sh || echo done +CFLAGS="-O3 --param=evrp-mode=legacy -march=native -Wall -D_WIN32_WINNT=0x0601" ./configure --with-curl +make -j 4 +strip -s cpuminer diff --git a/configure b/configure index 6c1104c0..9b150f2d 100755 --- a/configure +++ b/configure @@ -1,6 +1,6 @@ #! /bin/sh # Guess values for system-dependent variables and create Makefiles. -# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.19.4. +# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.19.5. # # # Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc. @@ -577,8 +577,8 @@ MAKEFLAGS= # Identity of this package. PACKAGE_NAME='cpuminer-opt' PACKAGE_TARNAME='cpuminer-opt' -PACKAGE_VERSION='3.19.4' -PACKAGE_STRING='cpuminer-opt 3.19.4' +PACKAGE_VERSION='3.19.5' +PACKAGE_STRING='cpuminer-opt 3.19.5' PACKAGE_BUGREPORT='' PACKAGE_URL='' @@ -1332,7 +1332,7 @@ if test "$ac_init_help" = "long"; then # Omit some internal or obsolete options to make the list less imposing. # This message is too long to be a string in the A/UX 3.1 sh. cat <<_ACEOF -\`configure' configures cpuminer-opt 3.19.4 to adapt to many kinds of systems. +\`configure' configures cpuminer-opt 3.19.5 to adapt to many kinds of systems. Usage: $0 [OPTION]... [VAR=VALUE]... @@ -1404,7 +1404,7 @@ fi if test -n "$ac_init_help"; then case $ac_init_help in - short | recursive ) echo "Configuration of cpuminer-opt 3.19.4:";; + short | recursive ) echo "Configuration of cpuminer-opt 3.19.5:";; esac cat <<\_ACEOF @@ -1509,7 +1509,7 @@ fi test -n "$ac_init_help" && exit $ac_status if $ac_init_version; then cat <<\_ACEOF -cpuminer-opt configure 3.19.4 +cpuminer-opt configure 3.19.5 generated by GNU Autoconf 2.69 Copyright (C) 2012 Free Software Foundation, Inc. @@ -2012,7 +2012,7 @@ cat >config.log <<_ACEOF This file contains any messages produced by compilers while running configure, to aid debugging if configure makes a mistake. -It was created by cpuminer-opt $as_me 3.19.4, which was +It was created by cpuminer-opt $as_me 3.19.5, which was generated by GNU Autoconf 2.69. Invocation command line was $ $0 $@ @@ -2993,7 +2993,7 @@ fi # Define the identity of the package. PACKAGE='cpuminer-opt' - VERSION='3.19.4' + VERSION='3.19.5' cat >>confdefs.h <<_ACEOF @@ -6690,7 +6690,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 # report actual input values of CONFIG_FILES etc. instead of their # values after options handling. ac_log=" -This file was extended by cpuminer-opt $as_me 3.19.4, which was +This file was extended by cpuminer-opt $as_me 3.19.5, which was generated by GNU Autoconf 2.69. 
Invocation command line was CONFIG_FILES = $CONFIG_FILES @@ -6756,7 +6756,7 @@ _ACEOF cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`" ac_cs_version="\\ -cpuminer-opt config.status 3.19.4 +cpuminer-opt config.status 3.19.5 configured by $0, generated by GNU Autoconf 2.69, with options \\"\$ac_cs_config\\" diff --git a/configure.ac b/configure.ac index 35567589..bf8f9991 100644 --- a/configure.ac +++ b/configure.ac @@ -1,4 +1,4 @@ -AC_INIT([cpuminer-opt], [3.19.4]) +AC_INIT([cpuminer-opt], [3.19.5]) AC_PREREQ([2.59c]) AC_CANONICAL_SYSTEM diff --git a/cpu-miner.c b/cpu-miner.c index abda606e..992842a3 100644 --- a/cpu-miner.c +++ b/cpu-miner.c @@ -131,6 +131,8 @@ static bool opt_stratum_keepalive = false; static struct timeval stratum_keepalive_timer; // Stratum typically times out in 5 minutes or 300 seconds #define stratum_keepalive_timeout 180 // 3 minutes +static struct timeval stratum_reset_time; + // pk_buffer_size is used as a version selector by b58 code, therefore // it must be set correctly to work. @@ -191,7 +193,6 @@ int default_api_listen = 4048; static struct timeval session_start; static struct timeval five_min_start; static uint64_t session_first_block = 0; -static double latency_sum = 0.; static uint64_t submit_sum = 0; static uint64_t accept_sum = 0; static uint64_t stale_sum = 0; @@ -1147,7 +1148,7 @@ void report_summary_log( bool force ) solved, solved_block_count ); } if ( stratum_errors ) - applog2( LOG_INFO, "Stratum errors %7d", stratum_errors ); + applog2( LOG_INFO, "Stratum resets %7d", stratum_errors ); applog2( LOG_INFO, "Hi/Lo Share Diff %.5g / %.5g", highest_share, lowest_share ); @@ -1278,7 +1279,6 @@ static int share_result( int result, struct work *work, else reject_sum++; } submit_sum++; - latency_sum += latency; pthread_mutex_unlock( &stats_lock ); @@ -1294,9 +1294,9 @@ static int share_result( int result, struct work *work, else rcol = CL_LRD; } - applog( LOG_INFO, "%d %s%s %s%s %s%s %s%s" CL_WHT ", %.3f sec (%dms)", + applog( LOG_INFO, "%d %s%s %s%s %s%s %s%s%s, %.3f sec (%dms)", my_stats.share_count, acol, ares, scol, sres, rcol, rres, bcol, - bres, share_time, latency ); + bres, CL_N, share_time, latency ); if ( unlikely( opt_debug || !result || solved ) ) { @@ -2114,7 +2114,7 @@ static void stratum_gen_work( struct stratum_ctx *sctx, struct work *g_work ) { unsigned char *xnonce2str = bebin2hex( g_work->xnonce2, g_work->xnonce2_len ); - applog( LOG_INFO, "Extranonce2 %s, Block %d, Job %s", + applog( LOG_INFO, "Extranonce2 0x%s, Block %d, Job %s", xnonce2str, sctx->block_height, g_work->job_id ); free( xnonce2str ); } @@ -2733,6 +2733,18 @@ void std_build_extraheader( struct work* g_work, struct stratum_ctx* sctx ) sctx->job.final_sapling_hash ); } +// Loop is out of order: +// +// connect/reconnect +// handle message +// get new message +// +// change to +// connect/reconnect +// get new message +// handle message + + static void *stratum_thread(void *userdata ) { struct thr_info *mythr = (struct thr_info *) userdata; @@ -2750,6 +2762,7 @@ static void *stratum_thread(void *userdata ) if ( unlikely( stratum_need_reset ) ) { stratum_need_reset = false; + gettimeofday( &stratum_reset_time, NULL ); stratum_down = true; stratum_errors++; stratum_disconnect( &stratum ); @@ -2760,7 +2773,7 @@ static void *stratum_thread(void *userdata ) applog(LOG_BLUE, "Connection changed to %s", short_url); } else - applog(LOG_WARNING, "Stratum connection reset"); + applog(LOG_BLUE, "Stratum 
connection reset"); // reset stats queue as well restart_threads(); if ( s_get_ptr != s_put_ptr ) s_get_ptr = s_put_ptr = 0; @@ -2795,34 +2808,11 @@ static void *stratum_thread(void *userdata ) } } - report_summary_log( ( stratum_diff != stratum.job.diff ) - && ( stratum_diff != 0. ) ); +// report_summary_log( ( stratum_diff != stratum.job.diff ) +// && ( stratum_diff != 0. ) ); - if ( stratum.new_job ) - stratum_gen_work( &stratum, &g_work ); - - // is keepalive needed? - if ( opt_stratum_keepalive ) - { - struct timeval now, et; - gettimeofday( &now, NULL ); - // any shares submitted since last keepalive? - if ( last_submit_time.tv_sec > stratum_keepalive_timer.tv_sec ) - memcpy( &stratum_keepalive_timer, &last_submit_time, - sizeof (struct timeval) ); - - timeval_subtract( &et, &now, &stratum_keepalive_timer ); - - if ( et.tv_sec > stratum_keepalive_timeout ) - { - double diff = stratum.job.diff * 0.5; - stratum_keepalive_timer = now; - if ( !opt_quiet ) - applog( LOG_BLUE, - "Stratum keepalive requesting lower difficulty" ); - stratum_suggest_difficulty( &stratum, diff ); - } - } +// if ( stratum.new_job ) +// stratum_gen_work( &stratum, &g_work ); // Wait for new message from server if ( likely( stratum_socket_full( &stratum, opt_timeout ) ) ) @@ -2846,6 +2836,54 @@ static void *stratum_thread(void *userdata ) stratum_need_reset = true; // stratum_disconnect( &stratum ); } + + report_summary_log( ( stratum_diff != stratum.job.diff ) + && ( stratum_diff != 0. ) ); + + if ( !stratum_need_reset ) + { + // Is keepalive needed? Mutex would normally be required but that + // would block any attempt to submit a share. A share is more + // important even if it messes up the keepalive. + + if ( opt_stratum_keepalive ) + { + struct timeval now, et; + gettimeofday( &now, NULL ); + // any shares submitted since last keepalive? 
+ if ( last_submit_time.tv_sec > stratum_keepalive_timer.tv_sec ) + memcpy( &stratum_keepalive_timer, &last_submit_time, + sizeof (struct timeval) ); + + timeval_subtract( &et, &now, &stratum_keepalive_timer ); + + if ( et.tv_sec > stratum_keepalive_timeout ) + { + double diff = stratum.job.diff * 0.5; + stratum_keepalive_timer = now; + if ( !opt_quiet ) + applog( LOG_BLUE, + "Stratum keepalive requesting lower difficulty" ); + stratum_suggest_difficulty( &stratum, diff ); + } + + if ( last_submit_time.tv_sec > stratum_reset_time.tv_sec ) + timeval_subtract( &et, &now, &last_submit_time ); + else + timeval_subtract( &et, &now, &stratum_reset_time ); + + if ( et.tv_sec > stratum_keepalive_timeout + 60 ) + { + applog( LOG_NOTICE, "No shares submitted, resetting stratum connection" ); + stratum_need_reset = true; + stratum_keepalive_timer = now; + } + } // stratum_keepalive + + if ( stratum.new_job && !stratum_need_reset ) + stratum_gen_work( &stratum, &g_work ); + + } // stratum_need_reset } // loop out: return NULL; @@ -3434,7 +3472,8 @@ void parse_arg(int key, char *arg ) break; case 1021: // cpu-priority v = atoi(arg); - if (v < 0 || v > 5) /* sanity check */ + applog(LOG_NOTICE,"--cpu-priority is deprecated and will be removed from a future release"); + if (v < 0 || v > 5) /* sanity check */ show_usage_and_exit(1); opt_priority = v; break; @@ -3470,7 +3509,8 @@ void parse_arg(int key, char *arg ) break; case 1024: opt_randomize = true; - break; + applog(LOG_NOTICE,"--randomize is deprecated and will be removed from a future release"); + break; case 1027: // data-file opt_data_file = strdup( arg ); break; @@ -3930,6 +3970,7 @@ int main(int argc, char *argv[]) memcpy( &five_min_start, &last_submit_time, sizeof (struct timeval) ); memcpy( &session_start, &last_submit_time, sizeof (struct timeval) ); memcpy( &stratum_keepalive_timer, &last_submit_time, sizeof (struct timeval) ); + memcpy( &stratum_reset_time, &last_submit_time, sizeof (struct timeval) ); memcpy( &total_hashes_time, &last_submit_time, sizeof (struct timeval) ); pthread_mutex_unlock( &stats_lock ); diff --git a/miner.h b/miner.h index 903ddc18..407a25db 100644 --- a/miner.h +++ b/miner.h @@ -824,6 +824,7 @@ Options:\n\ qubit Qubit\n\ scrypt scrypt(1024, 1, 1) (default)\n\ scrypt:N scrypt(N, 1, 1)\n\ + scryptn2 scrypt(1048576, 1,1)\n\ sha256d Double SHA-256\n\ sha256q Quad SHA-256, Pyrite (PYE)\n\ sha256t Triple SHA-256, Onecoin (OC)\n\ @@ -886,10 +887,10 @@ Options:\n\ -T, --timeout=N timeout for long poll and stratum (default: 300 seconds)\n\ -s, --scantime=N upper bound on time spent scanning current work when\n\ long polling is unavailable, in seconds (default: 5)\n\ - --randomize Randomize scan range start to reduce duplicates\n\ - -f, --diff-factor=N Divide req. difficulty by this factor (std is 1.0)\n\ + --randomize randomize scan range (deprecated)\n\ + -f, --diff-factor=N divide req. 
difficulty by this factor (std is 1.0)\n\ -m, --diff-multiplier=N Multiply difficulty by this factor (std is 1.0)\n\ - --hash-meter Display thread hash rates\n\ + --hash-meter display thread hash rates\n\ --coinbase-addr=ADDR payout address for solo mining\n\ --coinbase-sig=TEXT data to insert in the coinbase when possible\n\ --no-longpoll disable long polling support\n\ @@ -910,16 +911,16 @@ Options:\n\ -B, --background run the miner in the background\n\ --benchmark run in offline benchmark mode\n\ --cpu-affinity set process affinity to cpu core(s), mask 0x3 for cores 0 and 1\n\ - --cpu-priority set process priority (default: 0 idle, 2 normal to 5 highest)\n\ + --cpu-priority set process priority (default: 0 idle, 2 normal to 5 highest) (deprecated)\n\ -b, --api-bind=address[:port] IP address for the miner API, default port is 4048)\n\ - --api-remote Allow remote control\n\ - --max-temp=N Only mine if cpu temp is less than specified value (linux)\n\ - --max-rate=N[KMG] Only mine if net hashrate is less than specified value\n\ - --max-diff=N Only mine if net difficulty is less than specified value\n\ + --api-remote allow remote control\n\ + --max-temp=N only mine if cpu temp is less than specified value (linux)\n\ + --max-rate=N[KMG] only mine if net hashrate is less than specified value\n\ + --max-diff=N only mine if net difficulty is less than specified value\n\ -c, --config=FILE load a JSON-format configuration file\n\ --data-file=FILE path and name of data file\n\ --verify enable additional time consuming start up tests\n\ - --stratum-keepalive Prevent disconnects when difficulty is too high\n\ + --stratum-keepalive prevent disconnects when difficulty is too high\n\ -V, --version display version and CPU information and exit\n\ -h, --help display this help text and exit\n\ "; diff --git a/sysinfos.c b/sysinfos.c index ed453e2f..999df9fe 100644 --- a/sysinfos.c +++ b/sysinfos.c @@ -209,7 +209,7 @@ static inline void cpu_getname(char *outbuf, size_t maxsz) { memset(outbuf, 0, maxsz); #ifdef WIN32 - char brand[0xC0] = { 0 }; + char brand[256] = { 0 }; int output[4] = { 0 }, ext; cpuid(0x80000000, output); ext = output[0]; diff --git a/util.c b/util.c index a462bccc..b746ef9a 100644 --- a/util.c +++ b/util.c @@ -1658,7 +1658,7 @@ static bool stratum_parse_extranonce(struct stratum_ctx *sctx, json_t *params, i pthread_mutex_unlock(&sctx->work_lock); if ( !opt_quiet ) /* pool dynamic change */ - applog( LOG_INFO, "Stratum extranonce1= %s, extranonce2 size= %d", + applog( LOG_INFO, "Stratum extranonce1 0x%s, extranonce2 size %d", xnonce1, xn2_size); return true; diff --git a/winbuild-cross.sh b/winbuild-cross.sh index 5774430f..26d10769 100755 --- a/winbuild-cross.sh +++ b/winbuild-cross.sh @@ -16,8 +16,8 @@ export MINGW_LIB="/usr/x86_64-w64-mingw32/lib" export GCC_MINGW_LIB="/usr/lib/gcc/x86_64-w64-mingw32/9.3-win32" # used by GCC export LDFLAGS="-L$LOCAL_LIB/curl/lib/.libs -L$LOCAL_LIB/gmp/.libs -L$LOCAL_LIB/openssl" -# support for Windows CPU groups, AES sometimes not included in -march -export DEFAULT_CFLAGS="-O3 -maes -Wall -D_WIN32_WINNT=0x0601" +# Support for Windows 7 CPU groups, AES sometimes not included in -march +export DEFAULT_CFLAGS="-maes -O3 -Wall -D_WIN32_WINNT=0x0601" export DEFAULT_CFLAGS_OLD="-O3 -Wall" # make link to local gmp header file. @@ -26,8 +26,8 @@ ln -s $LOCAL_LIB/gmp/gmp.h ./gmp.h # make release directory and copy selected DLLs. 
rm -rf release > /dev/null - mkdir release + cp README.txt release/ cp README.md release/ cp RELEASE_NOTES release/ From 5b678d24816b386d2abd7de981202b03f2429a55 Mon Sep 17 00:00:00 2001 From: Jay D Dee Date: Mon, 21 Feb 2022 23:14:24 -0500 Subject: [PATCH 20/20] v3.19.6 --- RELEASE_NOTES | 6 + algo/lyra2/allium-4way.c | 3 +- algo/lyra2/lyra2z-4way.c | 2 +- algo/shavite/shavite-hash-2way.c | 37 +- algo/shavite/shavite-hash-4way.c | 36 +- algo/shavite/sph-shavite-aesni.c | 56 +- algo/shavite/sph_shavite.c | 2 +- algo/shavite/sph_shavite.h | 2 +- build-allarch.sh | 4 +- configure | 20 +- configure.ac | 2 +- cpu-miner.c | 35 +- simd-utils/intrlv.h | 1258 ++++++++++++++++++++++++------ simd-utils/simd-128.h | 32 +- simd-utils/simd-256.h | 36 +- simd-utils/simd-512.h | 85 +- 16 files changed, 1158 insertions(+), 458 deletions(-) diff --git a/RELEASE_NOTES b/RELEASE_NOTES index 54d18080..fd8d114a 100644 --- a/RELEASE_NOTES +++ b/RELEASE_NOTES @@ -65,6 +65,12 @@ If not what makes it happen or not happen? Change Log ---------- +v3.19.6 + +#363 Fixed a stratum bug where the first job may be ignored delaying start of hashing +Fixed handling of nonce exhaust when hashing a fast algo with extranonce disabled +Small optimization to Shavite. + v3.19.5 Enhanced stratum-keepalive preemptively resets the stratum connection diff --git a/algo/lyra2/allium-4way.c b/algo/lyra2/allium-4way.c index f15648ae..f16047e9 100644 --- a/algo/lyra2/allium-4way.c +++ b/algo/lyra2/allium-4way.c @@ -69,7 +69,6 @@ void allium_16way_hash( void *state, const void *input ) intrlv_8x64( vhashB, hash8, hash9, hash10, hash11, hash12, hash13, hash14, hash15, 256 ); -// rintrlv_8x32_8x64( vhashA, vhash, 256 ); keccak256_8way_update( &ctx.keccak, vhashA, 32 ); keccak256_8way_close( &ctx.keccak, vhashA); keccak256_8way_init( &ctx.keccak ); @@ -284,7 +283,7 @@ void allium_8way_hash( void *hash, const void *input ) blake256_8way_close( &ctx.blake, vhashA ); dintrlv_8x32( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, - vhashA, 256 ); + vhashA, 256 ); intrlv_4x64( vhashA, hash0, hash1, hash2, hash3, 256 ); intrlv_4x64( vhashB, hash4, hash5, hash6, hash7, 256 ); diff --git a/algo/lyra2/lyra2z-4way.c b/algo/lyra2/lyra2z-4way.c index a5f8c9a4..531ce5d5 100644 --- a/algo/lyra2/lyra2z-4way.c +++ b/algo/lyra2/lyra2z-4way.c @@ -49,7 +49,7 @@ void lyra2z_16way_hash( void *state, const void *input ) dintrlv_16x32( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, hash8, hash9, hash10, hash11 ,hash12, hash13, hash14, hash15, - vhash, 256 ); + vhash, 256 ); intrlv_2x256( vhash, hash0, hash1, 256 ); LYRA2Z_2WAY( lyra2z_16way_matrix, vhash, 32, vhash, 32, 8, 8, 8 ); diff --git a/algo/shavite/shavite-hash-2way.c b/algo/shavite/shavite-hash-2way.c index 9c71459a..7bf01d14 100644 --- a/algo/shavite/shavite-hash-2way.c +++ b/algo/shavite/shavite-hash-2way.c @@ -18,10 +18,13 @@ static const uint32_t IV512[] = 0xE275EADE, 0x502D9FCD, 0xB9357178, 0x022A4B9A }; - +/* #define mm256_ror2x256hi_1x32( a, b ) \ _mm256_blend_epi32( mm256_shuflr128_32( a ), \ mm256_shuflr128_32( b ), 0x88 ) +*/ + +//#define mm256_ror2x256hi_1x32( a, b ) _mm256_alignr_epi8( b, a, 4 ) #if defined(__VAES__) @@ -127,24 +130,24 @@ c512_2way( shavite512_2way_context *ctx, const void *msg ) // round 2, 6, 10 - k00 = _mm256_xor_si256( k00, mm256_ror2x256hi_1x32( k12, k13 ) ); + k00 = _mm256_xor_si256( k00, _mm256_alignr_epi8( k13, k12, 4 ) ); x = mm256_aesenc_2x128( _mm256_xor_si256( p3, k00 ), zero ); - k01 = _mm256_xor_si256( k01, mm256_ror2x256hi_1x32( k13, k00 ) ); + 
k01 = _mm256_xor_si256( k01, _mm256_alignr_epi8( k00, k13, 4 ) ); x = mm256_aesenc_2x128( _mm256_xor_si256( x, k01 ), zero ); - k02 = _mm256_xor_si256( k02, mm256_ror2x256hi_1x32( k00, k01 ) ); + k02 = _mm256_xor_si256( k02, _mm256_alignr_epi8( k01, k00, 4 ) ); x = mm256_aesenc_2x128( _mm256_xor_si256( x, k02 ), zero ); - k03 = _mm256_xor_si256( k03, mm256_ror2x256hi_1x32( k01, k02 ) ); + k03 = _mm256_xor_si256( k03, _mm256_alignr_epi8( k02, k01, 4 ) ); x = mm256_aesenc_2x128( _mm256_xor_si256( x, k03 ), zero ); p2 = _mm256_xor_si256( p2, x ); - k10 = _mm256_xor_si256( k10, mm256_ror2x256hi_1x32( k02, k03 ) ); + k10 = _mm256_xor_si256( k10, _mm256_alignr_epi8( k03, k02, 4 ) ); x = mm256_aesenc_2x128( _mm256_xor_si256( p1, k10 ), zero ); - k11 = _mm256_xor_si256( k11, mm256_ror2x256hi_1x32( k03, k10 ) ); + k11 = _mm256_xor_si256( k11, _mm256_alignr_epi8( k10, k03, 4 ) ); x = mm256_aesenc_2x128( _mm256_xor_si256( x, k11 ), zero ); - k12 = _mm256_xor_si256( k12, mm256_ror2x256hi_1x32( k10, k11 ) ); + k12 = _mm256_xor_si256( k12, _mm256_alignr_epi8( k11, k10, 4 ) ); x = mm256_aesenc_2x128( _mm256_xor_si256( x, k12 ), zero ); - k13 = _mm256_xor_si256( k13, mm256_ror2x256hi_1x32( k11, k12 ) ); + k13 = _mm256_xor_si256( k13, _mm256_alignr_epi8( k12, k11, 4 ) ); x = mm256_aesenc_2x128( _mm256_xor_si256( x, k13 ), zero ); p0 = _mm256_xor_si256( p0, x ); @@ -183,24 +186,24 @@ c512_2way( shavite512_2way_context *ctx, const void *msg ) // round 4, 8, 12 - k00 = _mm256_xor_si256( k00, mm256_ror2x256hi_1x32( k12, k13 ) ); + k00 = _mm256_xor_si256( k00, _mm256_alignr_epi8( k13, k12, 4 ) ); x = mm256_aesenc_2x128( _mm256_xor_si256( p1, k00 ), zero ); - k01 = _mm256_xor_si256( k01, mm256_ror2x256hi_1x32( k13, k00 ) ); + k01 = _mm256_xor_si256( k01, _mm256_alignr_epi8( k00, k13, 4 ) ); x = mm256_aesenc_2x128( _mm256_xor_si256( x, k01 ), zero ); - k02 = _mm256_xor_si256( k02, mm256_ror2x256hi_1x32( k00, k01 ) ); + k02 = _mm256_xor_si256( k02, _mm256_alignr_epi8( k01, k00, 4 ) ); x = mm256_aesenc_2x128( _mm256_xor_si256( x, k02 ), zero ); - k03 = _mm256_xor_si256( k03, mm256_ror2x256hi_1x32( k01, k02 ) ); + k03 = _mm256_xor_si256( k03, _mm256_alignr_epi8( k02, k01, 4 ) ); x = mm256_aesenc_2x128( _mm256_xor_si256( x, k03 ), zero ); p0 = _mm256_xor_si256( p0, x ); - k10 = _mm256_xor_si256( k10, mm256_ror2x256hi_1x32( k02, k03 ) ); + k10 = _mm256_xor_si256( k10, _mm256_alignr_epi8( k03, k02, 4 ) ); x = mm256_aesenc_2x128( _mm256_xor_si256( p3, k10 ), zero ); - k11 = _mm256_xor_si256( k11, mm256_ror2x256hi_1x32( k03, k10 ) ); + k11 = _mm256_xor_si256( k11, _mm256_alignr_epi8( k10, k03, 4 ) ); x = mm256_aesenc_2x128( _mm256_xor_si256( x, k11 ), zero ); - k12 = _mm256_xor_si256( k12, mm256_ror2x256hi_1x32( k10, k11 ) ); + k12 = _mm256_xor_si256( k12, _mm256_alignr_epi8( k11, k10, 4 ) ); x = mm256_aesenc_2x128( _mm256_xor_si256( x, k12 ), zero ); - k13 = _mm256_xor_si256( k13, mm256_ror2x256hi_1x32( k11, k12 ) ); + k13 = _mm256_xor_si256( k13, _mm256_alignr_epi8( k12, k11, 4 ) ); x = mm256_aesenc_2x128( _mm256_xor_si256( x, k13 ), zero ); p2 = _mm256_xor_si256( p2, x ); diff --git a/algo/shavite/shavite-hash-4way.c b/algo/shavite/shavite-hash-4way.c index 0184ee8c..4dd9b490 100644 --- a/algo/shavite/shavite-hash-4way.c +++ b/algo/shavite/shavite-hash-4way.c @@ -11,10 +11,6 @@ static const uint32_t IV512[] = 0xE275EADE, 0x502D9FCD, 0xB9357178, 0x022A4B9A }; -#define mm512_ror2x512hi_1x32( a, b ) \ - _mm512_mask_blend_epi32( 0x8888, mm512_shuflr128_32( a ), \ - mm512_shuflr128_32( b ) ) - static void c512_4way( 
shavite512_4way_context *ctx, const void *msg ) { @@ -106,24 +102,24 @@ c512_4way( shavite512_4way_context *ctx, const void *msg ) // round 2, 6, 10 - K0 = _mm512_xor_si512( K0, mm512_ror2x512hi_1x32( K6, K7 ) ); + K0 = _mm512_xor_si512( K0, _mm512_alignr_epi8( K7, K6, 4 ) ); X = _mm512_aesenc_epi128( _mm512_xor_si512( P3, K0 ), m512_zero ); - K1 = _mm512_xor_si512( K1, mm512_ror2x512hi_1x32( K7, K0 ) ); + K1 = _mm512_xor_si512( K1, _mm512_alignr_epi8( K0, K7, 4 ) ); X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K1 ), m512_zero ); - K2 = _mm512_xor_si512( K2, mm512_ror2x512hi_1x32( K0, K1 ) ); + K2 = _mm512_xor_si512( K2, _mm512_alignr_epi8( K1, K0, 4 ) ); X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K2 ), m512_zero ); - K3 = _mm512_xor_si512( K3, mm512_ror2x512hi_1x32( K1, K2 ) ); + K3 = _mm512_xor_si512( K3, _mm512_alignr_epi8( K2, K1, 4 ) ); X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K3 ), m512_zero ); P2 = _mm512_xor_si512( P2, X ); - K4 = _mm512_xor_si512( K4, mm512_ror2x512hi_1x32( K2, K3 ) ); + K4 = _mm512_xor_si512( K4, _mm512_alignr_epi8( K3, K2, 4 ) ); X = _mm512_aesenc_epi128( _mm512_xor_si512( P1, K4 ), m512_zero ); - K5 = _mm512_xor_si512( K5, mm512_ror2x512hi_1x32( K3, K4 ) ); + K5 = _mm512_xor_si512( K5, _mm512_alignr_epi8( K4, K3, 4 ) ); X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K5 ), m512_zero ); - K6 = _mm512_xor_si512( K6, mm512_ror2x512hi_1x32( K4, K5 ) ); + K6 = _mm512_xor_si512( K6, _mm512_alignr_epi8( K5, K4, 4 ) ); X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K6 ), m512_zero ); - K7 = _mm512_xor_si512( K7, mm512_ror2x512hi_1x32( K5, K6 ) ); + K7 = _mm512_xor_si512( K7, _mm512_alignr_epi8( K6, K5, 4 ) ); X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K7 ), m512_zero ); P0 = _mm512_xor_si512( P0, X ); @@ -162,24 +158,24 @@ c512_4way( shavite512_4way_context *ctx, const void *msg ) // round 4, 8, 12 - K0 = _mm512_xor_si512( K0, mm512_ror2x512hi_1x32( K6, K7 ) ); + K0 = _mm512_xor_si512( K0, _mm512_alignr_epi8( K7, K6, 4 ) ); X = _mm512_aesenc_epi128( _mm512_xor_si512( P1, K0 ), m512_zero ); - K1 = _mm512_xor_si512( K1, mm512_ror2x512hi_1x32( K7, K0 ) ); + K1 = _mm512_xor_si512( K1, _mm512_alignr_epi8( K0, K7, 4 ) ); X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K1 ), m512_zero ); - K2 = _mm512_xor_si512( K2, mm512_ror2x512hi_1x32( K0, K1 ) ); + K2 = _mm512_xor_si512( K2, _mm512_alignr_epi8( K1, K0, 4 ) ); X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K2 ), m512_zero ); - K3 = _mm512_xor_si512( K3, mm512_ror2x512hi_1x32( K1, K2 ) ); + K3 = _mm512_xor_si512( K3, _mm512_alignr_epi8( K2, K1, 4 ) ); X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K3 ), m512_zero ); P0 = _mm512_xor_si512( P0, X ); - K4 = _mm512_xor_si512( K4, mm512_ror2x512hi_1x32( K2, K3 ) ); + K4 = _mm512_xor_si512( K4, _mm512_alignr_epi8( K3, K2, 4 ) ); X = _mm512_aesenc_epi128( _mm512_xor_si512( P3, K4 ), m512_zero ); - K5 = _mm512_xor_si512( K5, mm512_ror2x512hi_1x32( K3, K4 ) ); + K5 = _mm512_xor_si512( K5, _mm512_alignr_epi8( K4, K3, 4 ) ); X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K5 ), m512_zero ); - K6 = _mm512_xor_si512( K6, mm512_ror2x512hi_1x32( K4, K5 ) ); + K6 = _mm512_xor_si512( K6, _mm512_alignr_epi8( K5, K4, 4 ) ); X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K6 ), m512_zero ); - K7 = _mm512_xor_si512( K7, mm512_ror2x512hi_1x32( K5, K6 ) ); + K7 = _mm512_xor_si512( K7, _mm512_alignr_epi8( K6, K5, 4 ) ); X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K7 ), m512_zero ); P2 = _mm512_xor_si512( P2, X ); diff --git a/algo/shavite/sph-shavite-aesni.c 
b/algo/shavite/sph-shavite-aesni.c index d8f6febd..eaa63067 100644 --- a/algo/shavite/sph-shavite-aesni.c +++ b/algo/shavite/sph-shavite-aesni.c @@ -59,30 +59,6 @@ static const sph_u32 IV512[] = { C32(0xE275EADE), C32(0x502D9FCD), C32(0xB9357178), C32(0x022A4B9A) }; -// Partially rotate elements in two 128 bit vectors a & b as one 256 bit vector -// and return the rotated 128 bit vector a. -// a[3:0] = { b[0], a[3], a[2], a[1] } -#if defined(__SSSE3__) - -#define mm128_ror256hi_1x32( a, b ) _mm_alignr_epi8( b, a, 4 ) - -#else // SSE2 - -#define mm128_ror256hi_1x32( a, b ) \ - _mm_or_si128( _mm_srli_si128( a, 4 ), \ - _mm_slli_si128( b, 12 ) ) - -#endif - -/* -#if defined(__AVX2__) -// 2 way version of above -// a[7:0] = { b[4], a[7], a[6], a[5], b[0], a[3], a[2], a[1] } -#define mm256_ror2x256hi_1x32( a, b ) \ - _mm256_blend_epi32( mm256_ror256_1x32( a ), \ - mm256_rol256_3x32( b ), 0x88 ) -#endif -*/ static void c512( sph_shavite_big_context *sc, const void *msg ) @@ -190,31 +166,31 @@ c512( sph_shavite_big_context *sc, const void *msg ) // round 2, 6, 10 - k00 = _mm_xor_si128( k00, mm128_ror256hi_1x32( k12, k13 ) ); + k00 = _mm_xor_si128( k00, _mm_alignr_epi8( k13, k12, 4 ) ); x = _mm_xor_si128( p3, k00 ); x = _mm_aesenc_si128( x, zero ); - k01 = _mm_xor_si128( k01, mm128_ror256hi_1x32( k13, k00 ) ); + k01 = _mm_xor_si128( k01, _mm_alignr_epi8( k00, k13, 4 ) ); x = _mm_xor_si128( x, k01 ); x = _mm_aesenc_si128( x, zero ); - k02 = _mm_xor_si128( k02, mm128_ror256hi_1x32( k00, k01 ) ); + k02 = _mm_xor_si128( k02, _mm_alignr_epi8( k01, k00, 4 ) ); x = _mm_xor_si128( x, k02 ); x = _mm_aesenc_si128( x, zero ); - k03 = _mm_xor_si128( k03, mm128_ror256hi_1x32( k01, k02 ) ); + k03 = _mm_xor_si128( k03, _mm_alignr_epi8( k02, k01, 4 ) ); x = _mm_xor_si128( x, k03 ); x = _mm_aesenc_si128( x, zero ); p2 = _mm_xor_si128( p2, x ); - k10 = _mm_xor_si128( k10, mm128_ror256hi_1x32( k02, k03 ) ); + k10 = _mm_xor_si128( k10, _mm_alignr_epi8( k03, k02, 4 ) ); x = _mm_xor_si128( p1, k10 ); x = _mm_aesenc_si128( x, zero ); - k11 = _mm_xor_si128( k11, mm128_ror256hi_1x32( k03, k10 ) ); + k11 = _mm_xor_si128( k11, _mm_alignr_epi8( k10, k03, 4 ) ); x = _mm_xor_si128( x, k11 ); x = _mm_aesenc_si128( x, zero ); - k12 = _mm_xor_si128( k12, mm128_ror256hi_1x32( k10, k11 ) ); + k12 = _mm_xor_si128( k12, _mm_alignr_epi8( k11, k10, 4 ) ); x = _mm_xor_si128( x, k12 ); x = _mm_aesenc_si128( x, zero ); - k13 = _mm_xor_si128( k13, mm128_ror256hi_1x32( k11, k12 ) ); + k13 = _mm_xor_si128( k13, _mm_alignr_epi8( k12, k11, 4 ) ); x = _mm_xor_si128( x, k13 ); x = _mm_aesenc_si128( x, zero ); @@ -262,31 +238,31 @@ c512( sph_shavite_big_context *sc, const void *msg ) // round 4, 8, 12 - k00 = _mm_xor_si128( k00, mm128_ror256hi_1x32( k12, k13 ) ); + k00 = _mm_xor_si128( k00, _mm_alignr_epi8( k13, k12, 4 ) ); x = _mm_xor_si128( p1, k00 ); x = _mm_aesenc_si128( x, zero ); - k01 = _mm_xor_si128( k01, mm128_ror256hi_1x32( k13, k00 ) ); + k01 = _mm_xor_si128( k01, _mm_alignr_epi8( k00, k13, 4 ) ); x = _mm_xor_si128( x, k01 ); x = _mm_aesenc_si128( x, zero ); - k02 = _mm_xor_si128( k02, mm128_ror256hi_1x32( k00, k01 ) ); + k02 = _mm_xor_si128( k02, _mm_alignr_epi8( k01, k00, 4 ) ); x = _mm_xor_si128( x, k02 ); x = _mm_aesenc_si128( x, zero ); - k03 = _mm_xor_si128( k03, mm128_ror256hi_1x32( k01, k02 ) ); + k03 = _mm_xor_si128( k03, _mm_alignr_epi8( k02, k01, 4 ) ); x = _mm_xor_si128( x, k03 ); x = _mm_aesenc_si128( x, zero ); p0 = _mm_xor_si128( p0, x ); - k10 = _mm_xor_si128( k10, mm128_ror256hi_1x32( k02, k03 ) ); + k10 = 
_mm_xor_si128( k10, _mm_alignr_epi8( k03, k02, 4 ) ); x = _mm_xor_si128( p3, k10 ); x = _mm_aesenc_si128( x, zero ); - k11 = _mm_xor_si128( k11, mm128_ror256hi_1x32( k03, k10 ) ); + k11 = _mm_xor_si128( k11, _mm_alignr_epi8( k10, k03, 4 ) ); x = _mm_xor_si128( x, k11 ); x = _mm_aesenc_si128( x, zero ); - k12 = _mm_xor_si128( k12, mm128_ror256hi_1x32( k10, k11 ) ); + k12 = _mm_xor_si128( k12, _mm_alignr_epi8( k11, k10, 4 ) ); x = _mm_xor_si128( x, k12 ); x = _mm_aesenc_si128( x, zero ); - k13 = _mm_xor_si128( k13, mm128_ror256hi_1x32( k11, k12 ) ); + k13 = _mm_xor_si128( k13, _mm_alignr_epi8( k12, k11, 4 ) ); x = _mm_xor_si128( x, k13 ); x = _mm_aesenc_si128( x, zero ); diff --git a/algo/shavite/sph_shavite.c b/algo/shavite/sph_shavite.c index 41988f97..3d7c8286 100644 --- a/algo/shavite/sph_shavite.c +++ b/algo/shavite/sph_shavite.c @@ -35,7 +35,7 @@ #include "sph_shavite.h" -#if !defined(__AES__) +#if !(defined(__AES__) && defined(__SSSE3__)) #ifdef __cplusplus extern "C"{ diff --git a/algo/shavite/sph_shavite.h b/algo/shavite/sph_shavite.h index cca59726..f30f4dfb 100644 --- a/algo/shavite/sph_shavite.h +++ b/algo/shavite/sph_shavite.h @@ -263,7 +263,7 @@ void sph_shavite384_addbits_and_close( void *cc, unsigned ub, unsigned n, void *dst); //Don't call these directly from application code, use the macros below. -#ifdef __AES__ +#if defined(__AES__) && defined(__SSSE3__) void sph_shavite512_aesni_init(void *cc); void sph_shavite512_aesni(void *cc, const void *data, size_t len); diff --git a/build-allarch.sh b/build-allarch.sh index 4a80588e..836c42a1 100755 --- a/build-allarch.sh +++ b/build-allarch.sh @@ -36,8 +36,8 @@ mv cpuminer cpuminer-avx2-sha-vaes # AVX2 SHA AES: AMD Zen1 make clean || echo done rm -f config.status -CFLAGS="-O3 -march=znver1 -maes -Wall -fno-common" ./configure --with-curl -#CFLAGS="-O3 -maes -mavx2 -msha -Wall -fno-common" ./configure --with-curl +#CFLAGS="-O3 -march=znver1 -maes -Wall -fno-common" ./configure --with-curl +CFLAGS="-O3 -maes -mavx2 -msha -Wall -fno-common" ./configure --with-curl make -j 8 strip -s cpuminer mv cpuminer cpuminer-avx2-sha diff --git a/configure b/configure index 9b150f2d..bc6f6e47 100755 --- a/configure +++ b/configure @@ -1,6 +1,6 @@ #! /bin/sh # Guess values for system-dependent variables and create Makefiles. -# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.19.5. +# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.19.6. # # # Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc. @@ -577,8 +577,8 @@ MAKEFLAGS= # Identity of this package. PACKAGE_NAME='cpuminer-opt' PACKAGE_TARNAME='cpuminer-opt' -PACKAGE_VERSION='3.19.5' -PACKAGE_STRING='cpuminer-opt 3.19.5' +PACKAGE_VERSION='3.19.6' +PACKAGE_STRING='cpuminer-opt 3.19.6' PACKAGE_BUGREPORT='' PACKAGE_URL='' @@ -1332,7 +1332,7 @@ if test "$ac_init_help" = "long"; then # Omit some internal or obsolete options to make the list less imposing. # This message is too long to be a string in the A/UX 3.1 sh. cat <<_ACEOF -\`configure' configures cpuminer-opt 3.19.5 to adapt to many kinds of systems. +\`configure' configures cpuminer-opt 3.19.6 to adapt to many kinds of systems. Usage: $0 [OPTION]... [VAR=VALUE]... 
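The Shavite changes above replace the mm*_ror2x*hi_1x32 helper macros with direct byte-alignment instructions: _mm_alignr_epi8( hi, lo, 4 ) concatenates hi:lo, shifts right by one 32-bit lane and keeps the low 128 bits, whose lanes (low to high) are lo[1], lo[2], lo[3], hi[0] — the same rotation the removed macros computed — and the _mm256/_mm512 forms perform it independently in each 128-bit lane. A small standalone check, illustrative only and not part of the patch (compile with -mssse3):

#include <stdint.h>
#include <stdio.h>
#include <tmmintrin.h>   /* SSSE3 _mm_alignr_epi8 */

int main(void)
{
   const __m128i lo = _mm_setr_epi32( 0xa0, 0xa1, 0xa2, 0xa3 );
   const __m128i hi = _mm_setr_epi32( 0xb0, 0xb1, 0xb2, 0xb3 );

   /* Same lane rotation the key schedule uses: k = alignr( k_next, k_prev, 4 ). */
   __m128i r = _mm_alignr_epi8( hi, lo, 4 );

   uint32_t out[4];
   _mm_storeu_si128( (__m128i*)out, r );
   printf( "%x %x %x %x\n", out[0], out[1], out[2], out[3] );  /* a1 a2 a3 b0 */
   return 0;
}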
@@ -1404,7 +1404,7 @@ fi if test -n "$ac_init_help"; then case $ac_init_help in - short | recursive ) echo "Configuration of cpuminer-opt 3.19.5:";; + short | recursive ) echo "Configuration of cpuminer-opt 3.19.6:";; esac cat <<\_ACEOF @@ -1509,7 +1509,7 @@ fi test -n "$ac_init_help" && exit $ac_status if $ac_init_version; then cat <<\_ACEOF -cpuminer-opt configure 3.19.5 +cpuminer-opt configure 3.19.6 generated by GNU Autoconf 2.69 Copyright (C) 2012 Free Software Foundation, Inc. @@ -2012,7 +2012,7 @@ cat >config.log <<_ACEOF This file contains any messages produced by compilers while running configure, to aid debugging if configure makes a mistake. -It was created by cpuminer-opt $as_me 3.19.5, which was +It was created by cpuminer-opt $as_me 3.19.6, which was generated by GNU Autoconf 2.69. Invocation command line was $ $0 $@ @@ -2993,7 +2993,7 @@ fi # Define the identity of the package. PACKAGE='cpuminer-opt' - VERSION='3.19.5' + VERSION='3.19.6' cat >>confdefs.h <<_ACEOF @@ -6690,7 +6690,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 # report actual input values of CONFIG_FILES etc. instead of their # values after options handling. ac_log=" -This file was extended by cpuminer-opt $as_me 3.19.5, which was +This file was extended by cpuminer-opt $as_me 3.19.6, which was generated by GNU Autoconf 2.69. Invocation command line was CONFIG_FILES = $CONFIG_FILES @@ -6756,7 +6756,7 @@ _ACEOF cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`" ac_cs_version="\\ -cpuminer-opt config.status 3.19.5 +cpuminer-opt config.status 3.19.6 configured by $0, generated by GNU Autoconf 2.69, with options \\"\$ac_cs_config\\" diff --git a/configure.ac b/configure.ac index bf8f9991..39f25b13 100644 --- a/configure.ac +++ b/configure.ac @@ -1,4 +1,4 @@ -AC_INIT([cpuminer-opt], [3.19.5]) +AC_INIT([cpuminer-opt], [3.19.6]) AC_PREREQ([2.59c]) AC_CANONICAL_SYSTEM diff --git a/cpu-miner.c b/cpu-miner.c index 992842a3..5677c9c0 100644 --- a/cpu-miner.c +++ b/cpu-miner.c @@ -2246,7 +2246,7 @@ static void *miner_thread( void *userdata ) if ( !algo_gate.miner_thread_init( thr_id ) ) { - applog( LOG_ERR, "FAIL: thread %u failed to initialize", thr_id ); + applog( LOG_ERR, "FAIL: thread %d failed to initialize", thr_id ); exit (1); } @@ -2274,10 +2274,24 @@ static void *miner_thread( void *userdata ) { while ( unlikely( stratum_down ) ) sleep( 1 ); - if ( *nonceptr >= end_nonce ) - stratum_gen_work( &stratum, &g_work ); + if ( unlikely( ( *nonceptr >= end_nonce ) + && !work_restart[thr_id].restart ) ) + { + if ( opt_extranonce ) + stratum_gen_work( &stratum, &g_work ); + else + { + if ( !thr_id ) + { + applog( LOG_WARNING, "nonce range exhausted, extranonce not subscribed" ); + applog( LOG_WARNING, "waiting for new work..."); + } + while ( !work_restart[thr_id].restart ) + sleep ( 1 ); + } + } } - else + else // GBT or getwork { pthread_rwlock_wrlock( &g_work_lock ); @@ -2288,8 +2302,7 @@ static void *miner_thread( void *userdata ) if ( unlikely( !get_work( mythr, &g_work ) ) ) { pthread_rwlock_unlock( &g_work_lock ); - applog( LOG_ERR, "work retrieval failed, exiting " - "mining thread %d", thr_id ); + applog( LOG_ERR, "work retrieval failed, exiting miner thread %d", thr_id ); goto out; } g_work_time = time(NULL); @@ -2805,15 +2818,11 @@ static void *stratum_thread(void *userdata ) { stratum_down = false; applog(LOG_BLUE,"Stratum connection established" ); + if ( stratum.new_job ) // prime first job + stratum_gen_work( &stratum, 
&g_work ); } } -// report_summary_log( ( stratum_diff != stratum.job.diff ) -// && ( stratum_diff != 0. ) ); - -// if ( stratum.new_job ) -// stratum_gen_work( &stratum, &g_work ); - // Wait for new message from server if ( likely( stratum_socket_full( &stratum, opt_timeout ) ) ) { @@ -3903,6 +3912,8 @@ int main(int argc, char *argv[]) if ( opt_debug ) applog(LOG_INFO,"Creating stratum thread"); + stratum.new_job = false; // just to make sure + /* init stratum thread info */ stratum_thr_id = opt_n_threads + 2; thr = &thr_info[stratum_thr_id]; diff --git a/simd-utils/intrlv.h b/simd-utils/intrlv.h index 956f3e37..00fb1516 100644 --- a/simd-utils/intrlv.h +++ b/simd-utils/intrlv.h @@ -11,6 +11,53 @@ // // 32 bit data +// Transpose 1 block consisting of 4x4x32 bit integers. +#define MM128_ILEAVE32( d0, d1, d2, d3, s0, s1, s2, s3 ) \ +{ \ + __m128i t0 = mm128_shuffle2_32( s0, s1, 0x44 ); \ + __m128i t1 = mm128_shuffle2_32( s0, s1, 0xee ); \ + __m128i t2 = mm128_shuffle2_32( s2, s3, 0x44 ); \ + __m128i t3 = mm128_shuffle2_32( s2, s3, 0xee ); \ + d0 = mm128_shuffle2_32( t0, t2, 0x88 ); \ + d1 = mm128_shuffle2_32( t0, t2, 0xdd ); \ + d2 = mm128_shuffle2_32( t1, t3, 0x88 ); \ + d3 = mm128_shuffle2_32( t1, t3, 0xdd ); \ +} + +#if defined(__AVX2__) + +// Transpose 2 contiguous blocks +#define MM256_ILEAVE32( d0, d1, d2, d3, s0, s1, s2, s3 ) \ +{ \ + __m256i t0 = mm256_shuffle2_32( s0, s1, 0x44 ); \ + __m256i t1 = mm256_shuffle2_32( s0, s1, 0xee ); \ + __m256i t2 = mm256_shuffle2_32( s2, s3, 0x44 ); \ + __m256i t3 = mm256_shuffle2_32( s2, s3, 0xee ); \ + d0 = mm256_shuffle2_32( t0, t2, 0x88 ); \ + d1 = mm256_shuffle2_32( t0, t2, 0xdd ); \ + d2 = mm256_shuffle2_32( t1, t3, 0x88 ); \ + d3 = mm256_shuffle2_32( t1, t3, 0xdd ); \ +} + +#endif + +#if defined(__AVX512F__) + +// Transpose 4 contiguous blocks. 
+#define MM512_ILEAVE32( d0, d1, d2, d3, s0, s1, s2, s3 ) \ +{ \ + __m512i t0 = mm512_shuffle2_32( s0, s1, 0x44 ); \ + __m512i t1 = mm512_shuffle2_32( s0, s1, 0xee ); \ + __m512i t2 = mm512_shuffle2_32( s2, s3, 0x44 ); \ + __m512i t3 = mm512_shuffle2_32( s2, s3, 0xee ); \ + d0 = mm512_shuffle2_32( t0, t2, 0x88 ); \ + d1 = mm512_shuffle2_32( t0, t2, 0xdd ); \ + d2 = mm512_shuffle2_32( t1, t3, 0x88 ); \ + d3 = mm512_shuffle2_32( t1, t3, 0xdd ); \ +} + +#endif + // 2x32 static inline void intrlv_2x32( void *dst, const void *src0, @@ -86,104 +133,37 @@ static inline void extr_lane_2x32( void *dst, const void *src, // 4x32 /* -static inline void intrlv_4x32( void *dst, const void *src0, - const void *src1, const void *src2, const void *src3, int bit_len ) -{ - __m64 *d = (__m64*)dst; - const __m64 *s0 = (const __m64*)src0; - const __m64 *s1 = (const __m64*)src1; - const __m64 *s2 = (const __m64*)src2; - const __m64 *s3 = (const __m64*)src3; - - d[ 0] = _mm_unpacklo_pi32( s0[0], s1[0] ); - d[ 1] = _mm_unpacklo_pi32( s2[0], s3[0] ); - d[ 2] = _mm_unpackhi_pi32( s0[0], s1[0] ); - d[ 3] = _mm_unpackhi_pi32( s2[0], s3[0] ); - - d[ 4] = _mm_unpacklo_pi32( s0[1], s1[1] ); - d[ 5] = _mm_unpacklo_pi32( s2[1], s3[1] ); - d[ 6] = _mm_unpackhi_pi32( s0[1], s1[1] ); - d[ 7] = _mm_unpackhi_pi32( s2[1], s3[1] ); - - d[ 8] = _mm_unpacklo_pi32( s0[2], s1[2] ); - d[ 9] = _mm_unpacklo_pi32( s2[2], s3[2] ); - d[10] = _mm_unpackhi_pi32( s0[2], s1[2] ); - d[11] = _mm_unpackhi_pi32( s2[2], s3[2] ); - - d[12] = _mm_unpacklo_pi32( s0[3], s1[3] ); - d[13] = _mm_unpacklo_pi32( s2[3], s3[3] ); - d[14] = _mm_unpackhi_pi32( s0[3], s1[3] ); - d[15] = _mm_unpackhi_pi32( s2[3], s3[3] ); - - if ( bit_len <= 256 ) return; - - d[16] = _mm_unpacklo_pi32( s0[4], s1[4] ); - d[17] = _mm_unpacklo_pi32( s2[4], s3[4] ); - d[18] = _mm_unpackhi_pi32( s0[4], s1[4] ); - d[19] = _mm_unpackhi_pi32( s2[4], s3[4] ); +static inline void intrlv_4x32( void *dst, const void *src0, const void *src1, + const void *src2, const void *src3, const int bit_len ) +{ + __m128i *d = (__m128i*)dst; + const __m128i *s0 = (const __m128i*)src0; + const __m128i *s1 = (const __m128i*)src1; + const __m128i *s2 = (const __m128i*)src2; + const __m128i *s3 = (const __m128i*)src3; - d[20] = _mm_unpacklo_pi32( s0[5], s1[5] ); - d[21] = _mm_unpacklo_pi32( s2[5], s3[5] ); - d[22] = _mm_unpackhi_pi32( s0[5], s1[5] ); - d[23] = _mm_unpackhi_pi32( s2[5], s3[5] ); + MM128_ILEAVE32( d[ 0], d[ 1], d[ 2], d[ 3], s0[0], s1[0], s2[0], s3[0] ); + MM128_ILEAVE32( d[ 4], d[ 5], d[ 6], d[ 7], s0[1], s1[1], s2[1], s3[1] ); - d[24] = _mm_unpacklo_pi32( s0[6], s1[6] ); - d[25] = _mm_unpacklo_pi32( s2[6], s3[6] ); - d[26] = _mm_unpackhi_pi32( s0[6], s1[6] ); - d[27] = _mm_unpackhi_pi32( s2[6], s3[6] ); + if ( bit_len <= 256 ) return; - d[28] = _mm_unpacklo_pi32( s0[7], s1[7] ); - d[29] = _mm_unpacklo_pi32( s2[7], s3[7] ); - d[30] = _mm_unpackhi_pi32( s0[7], s1[7] ); - d[31] = _mm_unpackhi_pi32( s2[7], s3[7] ); + MM128_ILEAVE32( d[ 8], d[ 9], d[10], d[11], s0[2], s1[2], s2[2], s3[2] ); + MM128_ILEAVE32( d[12], d[13], d[14], d[15], s0[3], s1[3], s2[3], s3[3] ); if ( bit_len <= 512 ) return; - d[32] = _mm_unpacklo_pi32( s0[8], s1[8] ); - d[33] = _mm_unpacklo_pi32( s2[8], s3[8] ); - d[34] = _mm_unpackhi_pi32( s0[8], s1[8] ); - d[35] = _mm_unpackhi_pi32( s2[8], s3[8] ); + MM128_ILEAVE32( d[16], d[17], d[18], d[19], s0[4], s1[4], s2[4], s3[4] ); - d[36] = _mm_unpacklo_pi32( s0[9], s1[9] ); - d[37] = _mm_unpacklo_pi32( s2[9], s3[9] ); - d[38] = _mm_unpackhi_pi32( s0[9], s1[9] ); - d[39] = 
_mm_unpackhi_pi32( s2[9], s3[9] ); - if ( bit_len <= 640 ) return; - d[40] = _mm_unpacklo_pi32( s0[10], s1[10] ); - d[41] = _mm_unpacklo_pi32( s2[10], s3[10] ); - d[42] = _mm_unpackhi_pi32( s0[10], s1[10] ); - d[43] = _mm_unpackhi_pi32( s2[10], s3[10] ); - - d[44] = _mm_unpacklo_pi32( s0[11], s1[11] ); - d[45] = _mm_unpacklo_pi32( s2[11], s3[11] ); - d[46] = _mm_unpackhi_pi32( s0[11], s1[11] ); - d[47] = _mm_unpackhi_pi32( s2[11], s3[11] ); - - d[48] = _mm_unpacklo_pi32( s0[12], s1[12] ); - d[49] = _mm_unpacklo_pi32( s2[12], s3[12] ); - d[50] = _mm_unpackhi_pi32( s0[12], s1[12] ); - d[51] = _mm_unpackhi_pi32( s2[12], s3[12] ); - - d[52] = _mm_unpacklo_pi32( s0[13], s1[13] ); - d[53] = _mm_unpacklo_pi32( s2[13], s3[13] ); - d[54] = _mm_unpackhi_pi32( s0[13], s1[13] ); - d[55] = _mm_unpackhi_pi32( s2[13], s3[13] ); - - d[56] = _mm_unpacklo_pi32( s0[14], s1[14] ); - d[57] = _mm_unpacklo_pi32( s2[14], s3[14] ); - d[58] = _mm_unpackhi_pi32( s0[14], s1[14] ); - d[59] = _mm_unpackhi_pi32( s2[14], s3[14] ); - - d[60] = _mm_unpacklo_pi32( s0[15], s1[15] ); - d[61] = _mm_unpacklo_pi32( s2[15], s3[15] ); - d[62] = _mm_unpackhi_pi32( s0[15], s1[15] ); - d[63] = _mm_unpackhi_pi32( s2[15], s3[15] ); -} + MM128_ILEAVE32( d[20], d[21], d[22], d[23], s0[5], s1[5], s2[5], s3[5] ); + MM128_ILEAVE32( d[24], d[25], d[26], d[27], s0[6], s1[6], s2[6], s3[6] ); + MM128_ILEAVE32( d[28], d[29], d[30], d[31], s0[4], s1[4], s2[4], s3[4] ); +} */ -static inline void intrlv_4x32( void *dst, const void *src0, const void *src1, +static inline void intrlv_4x32( void * + dst, const void *src0, const void *src1, const void *src2, const void *src3, const int bit_len ) { uint32_t *d = (uint32_t*)dst; @@ -230,53 +210,45 @@ static inline void intrlv_4x32( void *dst, const void *src0, const void *src1, /* static inline void intrlv_4x32_512( void *dst, const void *src0, - const void *src1, const void *src2, const void *src3 ) + const void *src1, const void *src2, const void *src3 ) { - __m64 *d = (__m64*)dst; - const __m64 *s0 = (const __m64*)src0; - const __m64 *s1 = (const __m64*)src1; - const __m64 *s2 = (const __m64*)src2; - const __m64 *s3 = (const __m64*)src3; - - d[ 0] = _mm_unpacklo_pi32( s0[0], s1[0] ); - d[ 1] = _mm_unpacklo_pi32( s2[0], s3[0] ); - d[ 2] = _mm_unpackhi_pi32( s0[0], s1[0] ); - d[ 3] = _mm_unpackhi_pi32( s2[0], s3[0] ); - - d[ 4] = _mm_unpacklo_pi32( s0[1], s1[1] ); - d[ 5] = _mm_unpacklo_pi32( s2[1], s3[1] ); - d[ 6] = _mm_unpackhi_pi32( s0[1], s1[1] ); - d[ 7] = _mm_unpackhi_pi32( s2[1], s3[1] ); - - d[ 8] = _mm_unpacklo_pi32( s0[2], s1[2] ); - d[ 9] = _mm_unpacklo_pi32( s2[2], s3[2] ); - d[10] = _mm_unpackhi_pi32( s0[2], s1[2] ); - d[11] = _mm_unpackhi_pi32( s2[2], s3[2] ); - - d[12] = _mm_unpacklo_pi32( s0[3], s1[3] ); - d[13] = _mm_unpacklo_pi32( s2[3], s3[3] ); - d[14] = _mm_unpackhi_pi32( s0[3], s1[3] ); - d[15] = _mm_unpackhi_pi32( s2[3], s3[3] ); - - d[16] = _mm_unpacklo_pi32( s0[4], s1[4] ); - d[17] = _mm_unpacklo_pi32( s2[4], s3[4] ); - d[18] = _mm_unpackhi_pi32( s0[4], s1[4] ); - d[19] = _mm_unpackhi_pi32( s2[4], s3[4] ); - - d[20] = _mm_unpacklo_pi32( s0[5], s1[5] ); - d[21] = _mm_unpacklo_pi32( s2[5], s3[5] ); - d[22] = _mm_unpackhi_pi32( s0[5], s1[5] ); - d[23] = _mm_unpackhi_pi32( s2[5], s3[5] ); - - d[24] = _mm_unpacklo_pi32( s0[6], s1[6] ); - d[25] = _mm_unpacklo_pi32( s2[6], s3[6] ); - d[26] = _mm_unpackhi_pi32( s0[6], s1[6] ); - d[27] = _mm_unpackhi_pi32( s2[6], s3[6] ); - - d[28] = _mm_unpacklo_pi32( s0[7], s1[7] ); - d[29] = _mm_unpacklo_pi32( s2[7], s3[7] ); - d[30] = 
_mm_unpackhi_pi32( s0[7], s1[7] ); - d[31] = _mm_unpackhi_pi32( s2[7], s3[7] ); +#if defined(__AVX2__) + + __m256i *d = (__m256i*)dst; + const __m256i *s0 = (const __m256i*)src0; + const __m256i *s1 = (const __m256i*)src1; + const __m256i *s2 = (const __m256i*)src2; + const __m256i *s3 = (const __m256i*)src3; + __m256i dt0, dt1, dt2, dt3; + + MM256_ILEAVE32( dt0, dt1, dt2, dt3, s0[0], s1[0], s2[0], s3[0] ); + + d[0] = _mm256_permute2x128_si256( dt0, dt1, 0x20 ); + d[1] = _mm256_permute2x128_si256( dt2, dt3, 0x20 ); + d[2] = _mm256_permute2x128_si256( dt0, dt1, 0x31 ); + d[3] = _mm256_permute2x128_si256( dt2, dt3, 0x31 ); + + MM256_ILEAVE32( dt0, dt1, dt2, dt3, s0[1], s1[1], s2[1], s3[1] ); + + d[4] = _mm256_permute2x128_si256( dt0, dt1, 0x20 ); + d[5] = _mm256_permute2x128_si256( dt2, dt3, 0x20 ); + d[6] = _mm256_permute2x128_si256( dt0, dt1, 0x31 ); + d[7] = _mm256_permute2x128_si256( dt2, dt3, 0x31 ); + +#else + + __m128i *d = (__m128i*)dst; + const __m128i *s0 = (const __m128i*)src0; + const __m128i *s1 = (const __m128i*)src1; + const __m128i *s2 = (const __m128i*)src2; + const __m128i *s3 = (const __m128i*)src3; + + MM128_ILEAVE32( d[ 0], d[ 1], d[ 2], d[ 3], s0[0], s1[0], s2[0], s3[0] ); + MM128_ILEAVE32( d[ 4], d[ 5], d[ 6], d[ 7], s0[1], s1[1], s2[1], s3[1] ); + MM128_ILEAVE32( d[ 8], d[ 9], d[10], d[11], s0[2], s1[2], s2[2], s3[2] ); + MM128_ILEAVE32( d[12], d[13], d[14], d[15], s0[3], s1[3], s2[3], s3[3] ); + +#endif } */ @@ -306,100 +278,34 @@ static inline void intrlv_4x32_512( void *dst, const void *src0, d[ 60] = s0[15]; d[ 61] = s1[15]; d[ 62] = s2[15]; d[ 63] = s3[15]; } + /* static inline void dintrlv_4x32( void *dst0, void *dst1, void *dst2, - void *dst3, const void *src, int bit_len ) -{ - __m64 *d0 = (__m64*)dst0; - __m64 *d1 = (__m64*)dst1; - __m64 *d2 = (__m64*)dst2; - __m64 *d3 = (__m64*)dst3; - const __m64 *s = (const __m64*)src; - d0[0] = _mm_unpacklo_pi32( s[ 0], s[ 2] ); - d1[0] = _mm_unpackhi_pi32( s[ 0], s[ 2] ); - d2[0] = _mm_unpacklo_pi32( s[ 1], s[ 3] ); - d3[0] = _mm_unpackhi_pi32( s[ 1], s[ 3] ); - - d0[1] = _mm_unpacklo_pi32( s[ 4], s[ 6] ); - d1[1] = _mm_unpackhi_pi32( s[ 4], s[ 6] ); - d2[1] = _mm_unpacklo_pi32( s[ 5], s[ 7] ); - d3[1] = _mm_unpackhi_pi32( s[ 5], s[ 7] ); - - d0[2] = _mm_unpacklo_pi32( s[ 8], s[10] ); - d1[2] = _mm_unpackhi_pi32( s[ 8], s[10] ); - d2[2] = _mm_unpacklo_pi32( s[ 9], s[11] ); - d3[2] = _mm_unpackhi_pi32( s[ 9], s[11] ); - - d0[3] = _mm_unpacklo_pi32( s[12], s[14] ); - d1[3] = _mm_unpackhi_pi32( s[12], s[14] ); - d2[3] = _mm_unpacklo_pi32( s[13], s[15] ); - d3[3] = _mm_unpackhi_pi32( s[13], s[15] ); - - if ( bit_len <= 256 ) return; - - d0[4] = _mm_unpacklo_pi32( s[16], s[18] ); - d1[4] = _mm_unpackhi_pi32( s[16], s[18] ); - d2[4] = _mm_unpacklo_pi32( s[17], s[19] ); - d3[4] = _mm_unpackhi_pi32( s[17], s[19] ); + void *dst3, const void *src, const int bit_len ) +{ + __m128i *d0 = (__m128i*)dst0; + __m128i *d1 = (__m128i*)dst1; + __m128i *d2 = (__m128i*)dst2; + __m128i *d3 = (__m128i*)dst3; + const __m128i *s = (const __m128i*)src; - d0[5] = _mm_unpacklo_pi32( s[20], s[22] ); - d1[5] = _mm_unpackhi_pi32( s[20], s[22] ); - d2[5] = _mm_unpacklo_pi32( s[21], s[23] ); - d3[5] = _mm_unpackhi_pi32( s[21], s[23] ); + MM128_ILEAVE32( d0[0], d1[0], d2[0], d3[0], s[ 0], s[ 1], s[ 2], s[ 3] ); + MM128_ILEAVE32( d0[1], d1[1], d2[1], d3[1], s[ 4], s[ 5], s[ 6], s[ 7] ); - d0[6] = _mm_unpacklo_pi32( s[24], s[26] ); - d1[6] = _mm_unpackhi_pi32( s[24], s[26] ); - d2[6] = _mm_unpacklo_pi32( s[25], s[27] ); - d3[6] = _mm_unpackhi_pi32( s[25], 
s[27] ); + if ( bit_len <= 256 ) return; - d0[7] = _mm_unpacklo_pi32( s[28], s[30] ); - d1[7] = _mm_unpackhi_pi32( s[28], s[30] ); - d2[7] = _mm_unpacklo_pi32( s[29], s[31] ); - d3[7] = _mm_unpackhi_pi32( s[29], s[31] ); + MM128_ILEAVE32( d0[2], d1[2], d2[2], d3[2], s[ 8], s[ 9], s[10], s[11] ); + MM128_ILEAVE32( d0[3], d1[3], d2[3], d3[3], s[12], s[13], s[14], s[15] ); if ( bit_len <= 512 ) return; - d0[8] = _mm_unpacklo_pi32( s[32], s[34] ); - d1[8] = _mm_unpackhi_pi32( s[32], s[34] ); - d2[8] = _mm_unpacklo_pi32( s[33], s[35] ); - d3[8] = _mm_unpackhi_pi32( s[33], s[35] ); - - d0[9] = _mm_unpacklo_pi32( s[36], s[38] ); - d1[9] = _mm_unpackhi_pi32( s[36], s[38] ); - d2[9] = _mm_unpacklo_pi32( s[37], s[39] ); - d3[9] = _mm_unpackhi_pi32( s[37], s[39] ); + MM128_ILEAVE32( d0[4], d1[4], d2[4], d3[4], s[16], s[17], s[18], s[19] ); if ( bit_len <= 640 ) return; - d0[10] = _mm_unpacklo_pi32( s[40], s[42] ); - d1[10] = _mm_unpackhi_pi32( s[40], s[42] ); - d2[10] = _mm_unpacklo_pi32( s[41], s[43] ); - d3[10] = _mm_unpackhi_pi32( s[41], s[43] ); - - d0[11] = _mm_unpacklo_pi32( s[44], s[46] ); - d1[11] = _mm_unpackhi_pi32( s[44], s[46] ); - d2[11] = _mm_unpacklo_pi32( s[45], s[47] ); - d3[11] = _mm_unpackhi_pi32( s[45], s[47] ); - - d0[12] = _mm_unpacklo_pi32( s[48], s[50] ); - d1[12] = _mm_unpackhi_pi32( s[48], s[50] ); - d2[12] = _mm_unpacklo_pi32( s[49], s[51] ); - d3[12] = _mm_unpackhi_pi32( s[49], s[51] ); - - d0[13] = _mm_unpacklo_pi32( s[52], s[54] ); - d1[13] = _mm_unpackhi_pi32( s[52], s[54] ); - d2[13] = _mm_unpacklo_pi32( s[53], s[55] ); - d3[13] = _mm_unpackhi_pi32( s[53], s[55] ); - - d0[14] = _mm_unpacklo_pi32( s[56], s[58] ); - d1[14] = _mm_unpackhi_pi32( s[56], s[58] ); - d2[14] = _mm_unpacklo_pi32( s[57], s[59] ); - d3[14] = _mm_unpackhi_pi32( s[57], s[59] ); - - d0[15] = _mm_unpacklo_pi32( s[60], s[62] ); - d1[15] = _mm_unpackhi_pi32( s[60], s[62] ); - d2[15] = _mm_unpacklo_pi32( s[61], s[62] ); - d3[15] = _mm_unpackhi_pi32( s[61], s[62] ); + MM128_ILEAVE32( d0[5], d1[5], d2[5], d3[5], s[20], s[21], s[22], s[23] ); + MM128_ILEAVE32( d0[6], d1[6], d2[6], d3[6], s[24], s[25], s[26], s[27] ); + MM128_ILEAVE32( d0[7], d1[7], d2[7], d3[7], s[28], s[29], s[30], s[31] ); } */ @@ -452,47 +358,42 @@ static inline void dintrlv_4x32( void *dst0, void *dst1, void *dst2, static inline void dintrlv_4x32_512( void *dst0, void *dst1, void *dst2, void *dst3, const void *src ) { - __m64 *d0 = (__m64*)dst0; - __m64 *d1 = (__m64*)dst1; - __m64 *d2 = (__m64*)dst2; - __m64 *d3 = (__m64*)dst3; - const __m64 *s = (const __m64*)src; - - d0[0] = _mm_unpacklo_pi32( s[ 0], s[ 2] ); - d1[0] = _mm_unpackhi_pi32( s[ 0], s[ 2] ); - d2[0] = _mm_unpacklo_pi32( s[ 1], s[ 3] ); - d3[0] = _mm_unpackhi_pi32( s[ 1], s[ 3] ); - d0[1] = _mm_unpacklo_pi32( s[ 4], s[ 6] ); - d1[1] = _mm_unpackhi_pi32( s[ 4], s[ 6] ); - d2[1] = _mm_unpacklo_pi32( s[ 5], s[ 7] ); - d3[1] = _mm_unpackhi_pi32( s[ 5], s[ 7] ); - - d0[2] = _mm_unpacklo_pi32( s[ 8], s[10] ); - d1[2] = _mm_unpackhi_pi32( s[ 8], s[10] ); - d2[2] = _mm_unpacklo_pi32( s[ 9], s[11] ); - d3[2] = _mm_unpackhi_pi32( s[ 9], s[11] ); - d0[3] = _mm_unpacklo_pi32( s[12], s[14] ); - d1[3] = _mm_unpackhi_pi32( s[12], s[14] ); - d2[3] = _mm_unpacklo_pi32( s[13], s[15] ); - d3[3] = _mm_unpackhi_pi32( s[13], s[15] ); - - d0[4] = _mm_unpacklo_pi32( s[16], s[18] ); - d1[4] = _mm_unpackhi_pi32( s[16], s[18] ); - d2[4] = _mm_unpacklo_pi32( s[17], s[19] ); - d3[4] = _mm_unpackhi_pi32( s[17], s[19] ); - d0[5] = _mm_unpacklo_pi32( s[20], s[22] ); - d1[5] = _mm_unpackhi_pi32( s[20], 
s[22] ); - d2[5] = _mm_unpacklo_pi32( s[21], s[23] ); - d3[5] = _mm_unpackhi_pi32( s[21], s[23] ); - - d0[6] = _mm_unpacklo_pi32( s[24], s[26] ); - d1[6] = _mm_unpackhi_pi32( s[24], s[26] ); - d2[6] = _mm_unpacklo_pi32( s[25], s[27] ); - d3[6] = _mm_unpackhi_pi32( s[25], s[27] ); - d0[7] = _mm_unpacklo_pi32( s[28], s[30] ); - d1[7] = _mm_unpackhi_pi32( s[28], s[30] ); - d2[7] = _mm_unpacklo_pi32( s[29], s[31] ); - d3[7] = _mm_unpackhi_pi32( s[29], s[31] ); +#if defined(__AVX2__) + + __m256i *d0 = (__m256i*)dst0; + __m256i *d1 = (__m256i*)dst1; + __m256i *d2 = (__m256i*)dst2; + __m256i *d3 = (__m256i*)dst3; + const __m256i *s = (const __m256i*)src; + + __m256i st0 = _mm256_permute2x128_si256( s[0], s[2], 0x20 ); + __m256i st2 = _mm256_permute2x128_si256( s[1], s[3], 0x20 ); + __m256i st1 = _mm256_permute2x128_si256( s[0], s[2], 0x31 ); + __m256i st3 = _mm256_permute2x128_si256( s[1], s[3], 0x31 ); + + MM256_ILEAVE32( d0[0], d1[0], d2[0], d3[0], st0, st1, st2, st3 ); + + st0 = _mm256_permute2x128_si256( s[4], s[6], 0x20 ); + st2 = _mm256_permute2x128_si256( s[5], s[7], 0x20 ); + st1 = _mm256_permute2x128_si256( s[4], s[6], 0x31 ); + st3 = _mm256_permute2x128_si256( s[5], s[7], 0x31 ); + + MM256_ILEAVE32( d0[1], d1[1], d2[1], d3[1], st0, st1, st2, st3 ); + +#else + + __m128i *d0 = (__m128i*)dst0; + __m128i *d1 = (__m128i*)dst1; + __m128i *d2 = (__m128i*)dst2; + __m128i *d3 = (__m128i*)dst3; + const __m128i *s = (const __m128i*)src; + + MM128_ILEAVE32( d0[0], d1[0], d2[0], d3[0], s[ 0], s[ 1], s[ 2], s[ 3] ); + MM128_ILEAVE32( d0[1], d1[1], d2[1], d3[1], s[ 4], s[ 5], s[ 6], s[ 7] ); + MM128_ILEAVE32( d0[2], d1[2], d2[2], d3[2], s[ 8], s[ 9], s[10], s[11] ); + MM128_ILEAVE32( d0[3], d1[3], d2[3], d3[3], s[12], s[13], s[14], s[15] ); + +#endif } */ @@ -662,6 +563,204 @@ static inline void mm128_bswap32_intrlv80_4x32( void *d, const void *src ) } // 8x32 +/* +static inline void intrlv_8x32( void *dst, const void *src0, + const void *src1, const void *src2, const void *src3, const void *src4, + const void *src5, const void *src6, const void *src7, const int bit_len ) +{ + __m128i *d = (__m128i*)dst; + const __m128i *s0 = (const __m128i*)src0; + const __m128i *s1 = (const __m128i*)src1; + const __m128i *s2 = (const __m128i*)src2; + const __m128i *s3 = (const __m128i*)src3; + const __m128i *s4 = (const __m128i*)src4; + const __m128i *s5 = (const __m128i*)src5; + const __m128i *s6 = (const __m128i*)src6; + const __m128i *s7 = (const __m128i*)src7; + + MM128_ILEAVE32( d[ 0], d[ 2], d[ 4], d[ 6], s0[0], s1[0], s2[0], s3[0] ); + MM128_ILEAVE32( d[ 1], d[ 3], d[ 5], d[ 7], s4[0], s5[0], s6[0], s7[0] ); + MM128_ILEAVE32( d[ 8], d[10], d[12], d[14], s0[1], s1[1], s2[1], s3[1] ); + MM128_ILEAVE32( d[ 9], d[11], d[13], d[15], s4[1], s5[1], s6[1], s7[1] ); + + if ( bit_len <= 256 ) return; + + MM128_ILEAVE32( d[16], d[18], d[20], d[22], s0[2], s1[2], s2[2], s3[2] ); + MM128_ILEAVE32( d[17], d[19], d[21], d[23], s4[2], s5[2], s6[2], s7[2] ); + MM128_ILEAVE32( d[24], d[26], d[28], d[30], s0[3], s1[3], s2[3], s3[3] ); + MM128_ILEAVE32( d[25], d[27], d[29], d[31], s4[3], s5[3], s6[3], s7[3] ); + + if ( bit_len <= 512 ) return; + + MM128_ILEAVE32( d[32], d[34], d[36], d[38], s0[4], s1[4], s2[4], s3[4] ); + MM128_ILEAVE32( d[33], d[35], d[37], d[39], s4[4], s5[4], s6[4], s7[4] ); + + if ( bit_len <= 640 ) return; + + MM128_ILEAVE32( d[40], d[42], d[44], d[46], s0[5], s1[5], s2[5], s3[5] ); + MM128_ILEAVE32( d[41], d[43], d[45], d[47], s4[5], s5[5], s6[5], s7[5] ); + + MM128_ILEAVE32( d[48], d[50], d[52], d[54], 
s0[6], s1[6], s2[6], s3[6] ); + MM128_ILEAVE32( d[49], d[51], d[53], d[55], s4[6], s5[6], s6[6], s7[6] ); + MM128_ILEAVE32( d[56], d[58], d[60], d[62], s0[7], s1[7], s2[7], s3[7] ); + MM128_ILEAVE32( d[57], d[59], d[61], d[63], s4[7], s5[7], s6[7], s7[7] ); +} + +// Not used +static inline void intrlv_8x32_256( void *dst, const void *src0, + const void *src1, const void *src2, const void *src3, const void *src4, + const void *src5, const void *src6, const void *src7 ) +{ +#if defined(__AVX2__) + + __m256i *d = (__m256i*)dst; + const __m256i *s0 = (const __m256i*)src0; + const __m256i *s1 = (const __m256i*)src1; + const __m256i *s2 = (const __m256i*)src2; + const __m256i *s3 = (const __m256i*)src3; + const __m256i *s4 = (const __m256i*)src4; + const __m256i *s5 = (const __m256i*)src5; + const __m256i *s6 = (const __m256i*)src6; + const __m256i *s7 = (const __m256i*)src7; + __m256i dt0, dt1, dt2, dt3, dt4, dt5, dt6, dt7; + + MM256_ILEAVE32( dt0, dt1, dt2, dt3, s0[0], s1[0], s2[0], s3[0] ); + MM256_ILEAVE32( dt4, dt5, dt6, dt7, s4[0], s5[0], s6[0], s7[0] ); + + d[0] = _mm256_permute2x128_si256( dt0, dt4, 0x20 ); + d[4] = _mm256_permute2x128_si256( dt0, dt4, 0x31 ); + d[1] = _mm256_permute2x128_si256( dt1, dt5, 0x20 ); + d[5] = _mm256_permute2x128_si256( dt1, dt5, 0x31 ); + d[2] = _mm256_permute2x128_si256( dt2, dt6, 0x20 ); + d[6] = _mm256_permute2x128_si256( dt2, dt6, 0x31 ); + d[3] = _mm256_permute2x128_si256( dt3, dt7, 0x20 ); + d[7] = _mm256_permute2x128_si256( dt3, dt7, 0x31 ); + +#else +// Shouldn't get here, 8x32 used only with AVX2 + + __m128i *d = (__m128i*)dst; + const __m128i *s0 = (const __m128i*)src0; + const __m128i *s1 = (const __m128i*)src1; + const __m128i *s2 = (const __m128i*)src2; + const __m128i *s3 = (const __m128i*)src3; + const __m128i *s4 = (const __m128i*)src4; + const __m128i *s5 = (const __m128i*)src5; + const __m128i *s6 = (const __m128i*)src6; + const __m128i *s7 = (const __m128i*)src7; + + MM128_ILEAVE32( d[ 0], d[ 2], d[ 4], d[ 6], s0[0], s1[0], s2[0], s3[0] ); + MM128_ILEAVE32( d[ 1], d[ 3], d[ 5], d[ 7], s4[0], s5[0], s6[0], s7[0] ); + MM128_ILEAVE32( d[ 8], d[10], d[12], d[14], s0[1], s1[1], s2[1], s3[1] ); + MM128_ILEAVE32( d[ 9], d[11], d[13], d[15], s4[1], s5[1], s6[1], s7[1] ); + +#endif +} + +static inline void intrlv_8x32_512( void *dst, const void *src0, + const void *src1, const void *src2, const void *src3, const void *src4, + const void *src5, const void *src6, const void *src7 ) +{ +#if 0 //defined(__AVX512F__) + + __m512i *d = (__m512i*)dst; + const __m512i *s0 = (const __m512i*)src0; + const __m512i *s1 = (const __m512i*)src1; + const __m512i *s2 = (const __m512i*)src2; + const __m512i *s3 = (const __m512i*)src3; + const __m512i *s4 = (const __m512i*)src4; + const __m512i *s5 = (const __m512i*)src5; + const __m512i *s6 = (const __m512i*)src6; + const __m512i *s7 = (const __m512i*)src7; + + __m512i dt0, dt1, dt2, dt3, dt4, dt5, dt6, dt7, t0, t1, t2, t3; + + MM512_ILEAVE32( dt0, dt1, dt2, dt3, s0[0], s1[0], s2[0], s3[0] ); + MM512_ILEAVE32( dt4, dt5, dt6, dt7, s4[0], s5[0], s6[0], s7[0] ); + + t0 = _mm512_shuffle_i32x4( dt0, dt4, 0x44 ); + t2 = _mm512_shuffle_i32x4( dt1, dt5, 0x44 ); + t1 = _mm512_shuffle_i32x4( dt0, dt4, 0xee ); + t3 = _mm512_shuffle_i32x4( dt1, dt5, 0xee ); + + d[0] = _mm512_shuffle_i32x4( t0, t2, 0x88 ); + d[2] = _mm512_shuffle_i32x4( t0, t2, 0xdd ); + d[4] = _mm512_shuffle_i32x4( t1, t3, 0x88 ); + d[6] = _mm512_shuffle_i32x4( t1, t3, 0xdd ); + + t0 = _mm512_shuffle_i32x4( dt2, dt6, 0x44 ); + t2 = _mm512_shuffle_i32x4( dt3, 
dt7, 0x44 ); + t1 = _mm512_shuffle_i32x4( dt2, dt6, 0xee ); + t3 = _mm512_shuffle_i32x4( dt3, dt7, 0xee ); + + d[1] = _mm512_shuffle_i32x4( t0, t2, 0x88 ); + d[3] = _mm512_shuffle_i32x4( t0, t2, 0xdd ); + d[5] = _mm512_shuffle_i32x4( t1, t3, 0x88 ); + d[7] = _mm512_shuffle_i32x4( t1, t3, 0xdd ); + +#elif defined(__AVX2__) + + __m256i *d = (__m256i*)dst; + const __m256i *s0 = (const __m256i*)src0; + const __m256i *s1 = (const __m256i*)src1; + const __m256i *s2 = (const __m256i*)src2; + const __m256i *s3 = (const __m256i*)src3; + const __m256i *s4 = (const __m256i*)src4; + const __m256i *s5 = (const __m256i*)src5; + const __m256i *s6 = (const __m256i*)src6; + const __m256i *s7 = (const __m256i*)src7; + __m256i dt0, dt1, dt2, dt3, dt4, dt5, dt6, dt7; + + MM256_ILEAVE32( dt0, dt1, dt2, dt3, s0[0], s1[0], s2[0], s3[0] ); + MM256_ILEAVE32( dt4, dt5, dt6, dt7, s4[0], s5[0], s6[0], s7[0] ); + + d[0] = _mm256_permute2x128_si256( dt0, dt4, 0x20 ); + d[1] = _mm256_permute2x128_si256( dt1, dt5, 0x20 ); + d[4] = _mm256_permute2x128_si256( dt0, dt4, 0x31 ); + d[5] = _mm256_permute2x128_si256( dt1, dt5, 0x31 ); + d[2] = _mm256_permute2x128_si256( dt2, dt6, 0x20 ); + d[3] = _mm256_permute2x128_si256( dt3, dt7, 0x20 ); + d[6] = _mm256_permute2x128_si256( dt2, dt6, 0x31 ); + d[7] = _mm256_permute2x128_si256( dt3, dt7, 0x31 ); + + MM256_ILEAVE32( dt0, dt1, dt2, dt3, s0[1], s1[1], s2[1], s3[1] ); + MM256_ILEAVE32( dt4, dt5, dt6, dt7, s4[1], s5[1], s6[1], s7[1] ); + + d[ 8] = _mm256_permute2x128_si256( dt0, dt4, 0x20 ); + d[ 9] = _mm256_permute2x128_si256( dt1, dt5, 0x20 ); + d[12] = _mm256_permute2x128_si256( dt0, dt4, 0x31 ); + d[13] = _mm256_permute2x128_si256( dt1, dt5, 0x31 ); + d[10] = _mm256_permute2x128_si256( dt2, dt6, 0x20 ); + d[11] = _mm256_permute2x128_si256( dt3, dt7, 0x20 ); + d[14] = _mm256_permute2x128_si256( dt2, dt6, 0x31 ); + d[15] = _mm256_permute2x128_si256( dt3, dt7, 0x31 ); + +#else +// Shouldn't get here, 8x32 only used with AVX2 or AVX512 + + __m128i *d = (__m128i*)dst; + const __m128i *s0 = (const __m128i*)src0; + const __m128i *s1 = (const __m128i*)src1; + const __m128i *s2 = (const __m128i*)src2; + const __m128i *s3 = (const __m128i*)src3; + const __m128i *s4 = (const __m128i*)src4; + const __m128i *s5 = (const __m128i*)src5; + const __m128i *s6 = (const __m128i*)src6; + const __m128i *s7 = (const __m128i*)src7; + + MM128_ILEAVE32( d[ 0], d[ 2], d[ 4], d[ 6], s0[0], s1[0], s2[0], s3[0] ); + MM128_ILEAVE32( d[ 1], d[ 3], d[ 5], d[ 7], s4[0], s5[0], s6[0], s7[0] ); + MM128_ILEAVE32( d[ 8], d[10], d[12], d[14], s0[1], s1[1], s2[1], s3[1] ); + MM128_ILEAVE32( d[ 9], d[11], d[13], d[15], s4[1], s5[1], s6[1], s7[1] ); + + MM128_ILEAVE32( d[16], d[18], d[20], d[22], s0[2], s1[2], s2[2], s3[2] ); + MM128_ILEAVE32( d[17], d[19], d[21], d[23], s4[2], s5[2], s6[2], s7[2] ); + MM128_ILEAVE32( d[24], d[26], d[28], d[30], s0[3], s1[3], s2[3], s3[3] ); + MM128_ILEAVE32( d[25], d[27], d[29], d[31], s4[3], s5[3], s6[3], s7[3] ); + +#endif +} +*/ #define ILEAVE_8x32( i ) do \ { \ @@ -684,6 +783,7 @@ static inline void intrlv_8x32b( void *dst, const void *s0, const void *s1, ILEAVE_8x32( i ); } + static inline void intrlv_8x32( void *dst, const void *s0, const void *s1, const void *s2, const void *s3, const void *s4, const void *s5, const void *s6, const void *s7, const int bit_len ) @@ -709,6 +809,8 @@ static inline void intrlv_8x32( void *dst, const void *s0, const void *s1, ILEAVE_8x32( 30 ); ILEAVE_8x32( 31 ); } + + static inline void intrlv_8x32_512( void *dst, const void *s0, const void *s1, 
const void *s2, const void *s3, const void *s4, const void *s5, const void *s6, const void *s7 ) @@ -723,8 +825,205 @@ static inline void intrlv_8x32_512( void *dst, const void *s0, const void *s1, ILEAVE_8x32( 14 ); ILEAVE_8x32( 15 ); } + #undef ILEAVE_8x32 +/* +static inline void dintrlv_8x32( void *dst0, void *dst1, void *dst2, void *dst3, + void *dst4, void *dst5, void *dst6, void *dst7, const void *src, + const int bit_len ) +{ + __m128i *d0 = (__m128i*)dst0; + __m128i *d1 = (__m128i*)dst1; + __m128i *d2 = (__m128i*)dst2; + __m128i *d3 = (__m128i*)dst3; + __m128i *d4 = (__m128i*)dst4; + __m128i *d5 = (__m128i*)dst5; + __m128i *d6 = (__m128i*)dst6; + __m128i *d7 = (__m128i*)dst7; + const __m128i *s = (const __m128i*)src; + + MM128_ILEAVE32( d0[0], d1[0], d2[0], d3[0], s[ 0], s[ 2], s[ 4], s[ 6] ); + MM128_ILEAVE32( d4[0], d5[0], d6[0], d7[0], s[ 1], s[ 3], s[ 5], s[ 7] ); + MM128_ILEAVE32( d0[1], d1[1], d2[1], d3[1], s[ 8], s[10], s[12], s[14] ); + MM128_ILEAVE32( d4[1], d5[1], d6[1], d7[1], s[ 9], s[11], s[13], s[15] ); + + if ( bit_len <= 256 ) return; + + MM128_ILEAVE32( d0[2], d1[2], d2[2], d3[2], s[16], s[18], s[20], s[22] ); + MM128_ILEAVE32( d4[2], d5[2], d6[2], d7[2], s[17], s[19], s[21], s[23] ); + MM128_ILEAVE32( d0[3], d1[3], d2[3], d3[3], s[24], s[26], s[28], s[30] ); + MM128_ILEAVE32( d4[3], d5[3], d6[3], d7[3], s[25], s[27], s[29], s[31] ); + + if ( bit_len <= 512 ) return; + + MM128_ILEAVE32( d0[4], d1[4], d2[4], d3[4], s[32], s[34], s[36], s[38] ); + MM128_ILEAVE32( d4[4], d5[4], d6[4], d7[4], s[33], s[35], s[37], s[39] ); + + if ( bit_len <= 640 ) return; + + MM128_ILEAVE32( d0[5], d1[5], d2[5], d3[5], s[40], s[42], s[44], s[46] ); + MM128_ILEAVE32( d4[5], d5[5], d6[5], d7[5], s[41], s[43], s[45], s[47] ); + MM128_ILEAVE32( d0[6], d1[6], d2[6], d3[6], s[48], s[50], s[52], s[54] ); + MM128_ILEAVE32( d4[6], d5[6], d6[6], d7[6], s[49], s[51], s[53], s[55] ); + MM128_ILEAVE32( d0[7], d1[7], d2[7], d3[7], s[56], s[58], s[60], s[62] ); + MM128_ILEAVE32( d4[7], d5[7], d6[7], d7[7], s[57], s[59], s[61], s[63] ); +} + +static inline void dintrlv_8x32_256( void *dst0, void *dst1, void *dst2, + void *dst3, void *dst4, void *dst5, void *dst6, void *dst7, + const void *src ) +{ +#if defined(__AVX2__) + + __m256i *d0 = (__m256i*)dst0; + __m256i *d1 = (__m256i*)dst1; + __m256i *d2 = (__m256i*)dst2; + __m256i *d3 = (__m256i*)dst3; + __m256i *d4 = (__m256i*)dst4; + __m256i *d5 = (__m256i*)dst5; + __m256i *d6 = (__m256i*)dst6; + __m256i *d7 = (__m256i*)dst7; + const __m256i *s = (const __m256i*)src; + + __m256i st0 = _mm256_permute2x128_si256( s[0], s[4], 0x20 ); + __m256i st1 = _mm256_permute2x128_si256( s[0], s[4], 0x31 ); + __m256i st2 = _mm256_permute2x128_si256( s[1], s[5], 0x20 ); + __m256i st3 = _mm256_permute2x128_si256( s[1], s[5], 0x31 ); + __m256i st4 = _mm256_permute2x128_si256( s[2], s[6], 0x20 ); + __m256i st5 = _mm256_permute2x128_si256( s[2], s[6], 0x31 ); + __m256i st6 = _mm256_permute2x128_si256( s[3], s[7], 0x20 ); + __m256i st7 = _mm256_permute2x128_si256( s[3], s[7], 0x31 ); + + MM256_ILEAVE32( d0[0], d1[0], d2[0], d3[0], st0, st2, st4, st6 ); + MM256_ILEAVE32( d4[0], d5[0], d6[0], d7[0], st1, st3, st5, st7 ); + +#else +// Not needed, 8x32 used only with AVX2, AVX512 + + __m128i *d0 = (__m128i*)dst0; + __m128i *d1 = (__m128i*)dst1; + __m128i *d2 = (__m128i*)dst2; + __m128i *d3 = (__m128i*)dst3; + __m128i *d4 = (__m128i*)dst4; + __m128i *d5 = (__m128i*)dst5; + __m128i *d6 = (__m128i*)dst6; + __m128i *d7 = (__m128i*)dst7; + const __m128i *s = (const __m128i*)src; + + 
MM128_ILEAVE32( d0[0], d1[0], d2[0], d3[0], s[ 0], s[ 2], s[ 4], s[ 6] ); + MM128_ILEAVE32( d4[0], d5[0], d6[0], d7[0], s[ 1], s[ 3], s[ 5], s[ 7] ); + MM128_ILEAVE32( d0[1], d1[1], d2[1], d3[1], s[ 8], s[10], s[12], s[14] ); + MM128_ILEAVE32( d4[1], d5[1], d6[1], d7[1], s[ 9], s[11], s[13], s[15] ); + +#endif +} + +static inline void dintrlv_8x32_512( void *dst0, void *dst1, void *dst2, + void *dst3, void *dst4, void *dst5, void *dst6, void *dst7, + const void *src ) +{ +#if 0 // defined(__AVX512F__) + + __m512i *d0 = (__m512i*)dst0; + __m512i *d1 = (__m512i*)dst1; + __m512i *d2 = (__m512i*)dst2; + __m512i *d3 = (__m512i*)dst3; + __m512i *d4 = (__m512i*)dst4; + __m512i *d5 = (__m512i*)dst5; + __m512i *d6 = (__m512i*)dst6; + __m512i *d7 = (__m512i*)dst7; + + + const __m512i *s = (const __m512i*)src; + + __m512i st0, st1, st2, st3, st4, st5, st6, st7, t0, t1, t2, t3; + + t0 = _mm512_shuffle_i32x4( s[0], s[2], 0x44 ); + t2 = _mm512_shuffle_i32x4( s[4], s[6], 0x44 ); + t1 = _mm512_shuffle_i32x4( s[0], s[2], 0xee ); + t3 = _mm512_shuffle_i32x4( s[4], s[6], 0xee ); + + st0 = _mm512_shuffle_i32x4( t0, t2, 0x88 ); + st4 = _mm512_shuffle_i32x4( t0, t2, 0xdd ); + st1 = _mm512_shuffle_i32x4( t1, t3, 0x88 ); + st5 = _mm512_shuffle_i32x4( t1, t3, 0xdd ); + + t0 = _mm512_shuffle_i32x4( s[1], s[3], 0x44 ); + t2 = _mm512_shuffle_i32x4( s[5], s[7], 0x44 ); + t1 = _mm512_shuffle_i32x4( s[1], s[3], 0xee ); + t3 = _mm512_shuffle_i32x4( s[5], s[7], 0xee ); + + st2 = _mm512_shuffle_i32x4( t0, t2, 0x88 ); + st6 = _mm512_shuffle_i32x4( t0, t2, 0xdd ); + st3 = _mm512_shuffle_i32x4( t1, t3, 0x88 ); + st7 = _mm512_shuffle_i32x4( t1, t3, 0xdd ); + + MM512_ILEAVE32( d0[0], d1[0], d2[0], d3[0], st0, st1, st2, st3 ); + MM512_ILEAVE32( d4[0], d5[0], d6[0], d7[0], st4, st5, st6, st7 ); + +#elif defined(__AVX2__) + + __m256i *d0 = (__m256i*)dst0; + __m256i *d1 = (__m256i*)dst1; + __m256i *d2 = (__m256i*)dst2; + __m256i *d3 = (__m256i*)dst3; + __m256i *d4 = (__m256i*)dst4; + __m256i *d5 = (__m256i*)dst5; + __m256i *d6 = (__m256i*)dst6; + __m256i *d7 = (__m256i*)dst7; + const __m256i *s = (const __m256i*)src; + + __m256i st0 = _mm256_permute2x128_si256( s[0], s[4], 0x20 ); + __m256i st2 = _mm256_permute2x128_si256( s[1], s[5], 0x20 ); + __m256i st1 = _mm256_permute2x128_si256( s[0], s[4], 0x31 ); + __m256i st3 = _mm256_permute2x128_si256( s[1], s[5], 0x31 ); + __m256i st4 = _mm256_permute2x128_si256( s[2], s[6], 0x20 ); + __m256i st6 = _mm256_permute2x128_si256( s[3], s[7], 0x20 ); + __m256i st5 = _mm256_permute2x128_si256( s[2], s[6], 0x31 ); + __m256i st7 = _mm256_permute2x128_si256( s[3], s[7], 0x31 ); + + MM256_ILEAVE32( d0[0], d1[0], d2[0], d3[0], st0, st2, st4, st6 ); + MM256_ILEAVE32( d4[0], d5[0], d6[0], d7[0], st1, st3, st5, st7 ); + + st0 = _mm256_permute2x128_si256( s[ 8], s[12], 0x20 ); + st2 = _mm256_permute2x128_si256( s[ 9], s[13], 0x20 ); + st1 = _mm256_permute2x128_si256( s[ 8], s[12], 0x31 ); + st3 = _mm256_permute2x128_si256( s[ 9], s[13], 0x31 ); + st4 = _mm256_permute2x128_si256( s[10], s[14], 0x20 ); + st6 = _mm256_permute2x128_si256( s[11], s[15], 0x20 ); + st5 = _mm256_permute2x128_si256( s[10], s[14], 0x31 ); + st7 = _mm256_permute2x128_si256( s[11], s[15], 0x31 ); + + MM256_ILEAVE32( d0[1], d1[1], d2[1], d3[1], st0, st2, st4, st6 ); + MM256_ILEAVE32( d4[1], d5[1], d6[1], d7[1], st1, st3, st5, st7 ); + +#else + + __m128i *d0 = (__m128i*)dst0; + __m128i *d1 = (__m128i*)dst1; + __m128i *d2 = (__m128i*)dst2; + __m128i *d3 = (__m128i*)dst3; + __m128i *d4 = (__m128i*)dst4; + __m128i *d5 = (__m128i*)dst5; 
+ __m128i *d6 = (__m128i*)dst6; + __m128i *d7 = (__m128i*)dst7; + const __m128i *s = (const __m128i*)src; + + MM128_ILEAVE32( d0[0], d1[0], d2[0], d3[0], s[ 0], s[ 2], s[ 4], s[ 6] ); + MM128_ILEAVE32( d4[0], d5[0], d6[0], d7[0], s[ 1], s[ 3], s[ 5], s[ 7] ); + MM128_ILEAVE32( d0[1], d1[1], d2[1], d3[1], s[ 8], s[10], s[12], s[14] ); + MM128_ILEAVE32( d4[1], d5[1], d6[1], d7[1], s[ 9], s[11], s[13], s[15] ); + + MM128_ILEAVE32( d0[2], d1[2], d2[2], d3[2], s[16], s[18], s[20], s[22] ); + MM128_ILEAVE32( d4[2], d5[2], d6[2], d7[2], s[17], s[19], s[21], s[23] ); + MM128_ILEAVE32( d0[3], d1[3], d2[3], d3[3], s[24], s[26], s[28], s[30] ); + MM128_ILEAVE32( d4[3], d5[3], d6[3], d7[3], s[25], s[27], s[29], s[31] ); + +#endif +} +*/ + #define DLEAVE_8x32( i ) do \ { \ const uint32_t *s = (const uint32_t*)(src) + ( (i) << 3 ); \ @@ -771,6 +1070,7 @@ static inline void dintrlv_8x32( void *d0, void *d1, void *d2, void *d3, DLEAVE_8x32( 30 ); DLEAVE_8x32( 31 ); } + static inline void dintrlv_8x32_512( void *d0, void *d1, void *d2, void *d3, void *d4, void *d5, void *d6, void *d7, const void *src ) { @@ -874,6 +1174,210 @@ static inline void mm256_bswap32_intrlv80_8x32( void *d, const void *src ) #endif // AVX2 // 16x32 +/* +static inline void intrlv_16x32( void *dst, const void *src00, + const void *src01, const void *src02, const void *src03, const void *src04, + const void *src05, const void *src06, const void *src07, const void *src08, + const void *src09, const void *src10, const void *src11, const void *src12, + const void *src13, const void *src14, const void *src15, const int bit_len ) +{ + __m128i *d = (__m128i*)dst; + const __m128i *s00 = (const __m128i*)src00; + const __m128i *s01 = (const __m128i*)src01; + const __m128i *s02 = (const __m128i*)src02; + const __m128i *s03 = (const __m128i*)src03; + const __m128i *s04 = (const __m128i*)src04; + const __m128i *s05 = (const __m128i*)src05; + const __m128i *s06 = (const __m128i*)src06; + const __m128i *s07 = (const __m128i*)src07; + const __m128i *s08 = (const __m128i*)src08; + const __m128i *s09 = (const __m128i*)src09; + const __m128i *s10 = (const __m128i*)src10; + const __m128i *s11 = (const __m128i*)src11; + const __m128i *s12 = (const __m128i*)src12; + const __m128i *s13 = (const __m128i*)src13; + const __m128i *s14 = (const __m128i*)src14; + const __m128i *s15 = (const __m128i*)src15; + + MM128_ILEAVE32( d[ 0], d[ 4], d[ 8], d[12], s00[0], s01[0], s02[0], s03[0] ); + MM128_ILEAVE32( d[ 1], d[ 5], d[ 9], d[13], s04[0], s05[0], s06[0], s07[0] ); + MM128_ILEAVE32( d[ 2], d[ 6], d[10], d[14], s08[0], s09[0], s10[0], s11[0] ); + MM128_ILEAVE32( d[ 3], d[ 7], d[11], d[15], s12[0], s13[0], s14[0], s15[0] ); + + MM128_ILEAVE32( d[16], d[20], d[24], d[28], s00[1], s01[1], s02[1], s03[1] ); + MM128_ILEAVE32( d[17], d[21], d[25], d[29], s04[1], s05[1], s06[1], s07[1] ); + MM128_ILEAVE32( d[18], d[22], d[26], d[30], s08[1], s09[1], s10[1], s11[1] ); + MM128_ILEAVE32( d[19], d[23], d[27], d[31], s12[1], s13[1], s14[1], s15[1] ); + + if ( bit_len <= 256 ) return; + + MM128_ILEAVE32( d[32], d[36], d[40], d[44], s00[2], s01[2], s02[2], s03[2] ); + MM128_ILEAVE32( d[33], d[37], d[41], d[45], s04[2], s05[2], s06[2], s07[2] ); + MM128_ILEAVE32( d[34], d[38], d[42], d[46], s08[2], s09[2], s10[2], s11[2] ); + MM128_ILEAVE32( d[35], d[39], d[43], d[47], s12[2], s13[2], s14[2], s15[2] ); + + MM128_ILEAVE32( d[48], d[52], d[56], d[60], s00[3], s01[3], s02[3], s03[3] ); + MM128_ILEAVE32( d[49], d[53], d[57], d[61], s04[3], s05[3], s06[3], s07[3] ); + 
MM128_ILEAVE32( d[50], d[54], d[58], d[62], s08[3], s09[3], s10[3], s11[3] ); + MM128_ILEAVE32( d[51], d[55], d[59], d[63], s12[3], s13[3], s14[3], s15[3] ); + + if ( bit_len <= 512 ) return; + + MM128_ILEAVE32( d[64], d[68], d[72], d[76], s00[4], s01[4], s02[4], s03[4] ); + MM128_ILEAVE32( d[65], d[69], d[73], d[77], s04[4], s05[4], s06[4], s07[4] ); + MM128_ILEAVE32( d[66], d[70], d[74], d[78], s08[4], s09[4], s10[4], s11[4] ); + MM128_ILEAVE32( d[67], d[71], d[75], d[79], s12[4], s13[4], s14[4], s15[4] ); + + if ( bit_len <= 640 ) return; + + MM128_ILEAVE32( d[80], d[84], d[88], d[92], s00[5], s01[5], s02[5], s03[5] ); + MM128_ILEAVE32( d[81], d[85], d[89], d[93], s04[5], s05[5], s06[5], s07[5] ); + MM128_ILEAVE32( d[82], d[86], d[90], d[94], s08[5], s09[5], s10[5], s11[5] ); + MM128_ILEAVE32( d[83], d[87], d[91], d[95], s12[5], s13[5], s14[5], s15[5] ); + + MM128_ILEAVE32( d[ 96], d[100], d[104], d[108], s00[6], s01[6], s02[6], s03[6] ); + MM128_ILEAVE32( d[ 97], d[101], d[105], d[109], s04[6], s05[6], s06[6], s07[6] ); + MM128_ILEAVE32( d[ 98], d[102], d[106], d[110], s08[6], s09[6], s10[6], s11[6] ); + MM128_ILEAVE32( d[ 99], d[103], d[107], d[111], s12[6], s13[6], s14[6], s15[6] ); + + MM128_ILEAVE32( d[112], d[116], d[120], d[124], s00[7], s01[7], s02[7], s03[7] ); + MM128_ILEAVE32( d[113], d[117], d[121], d[125], s04[7], s05[7], s06[7], s07[7] ); + MM128_ILEAVE32( d[114], d[118], d[122], d[126], s08[7], s09[7], s10[7], s11[7] ); + MM128_ILEAVE32( d[115], d[119], d[123], d[127], s12[7], s13[7], s14[7], s15[7] ); +} + +// Not used, only potential use is with AVX512 +#if defined(__AVX2__) + +static inline void intrlv_16x32_256( void *dst, const void *src00, + const void *src01, const void *src02, const void *src03, const void *src04, + const void *src05, const void *src06, const void *src07, const void *src08, + const void *src09, const void *src10, const void *src11, const void *src12, + const void *src13, const void *src14, const void *src15 ) +{ + __m256i *d = (__m256i*)dst; + const __m256i *s00 = (const __m256i*)src00; + const __m256i *s01 = (const __m256i*)src01; + const __m256i *s02 = (const __m256i*)src02; + const __m256i *s03 = (const __m256i*)src03; + const __m256i *s04 = (const __m256i*)src04; + const __m256i *s05 = (const __m256i*)src05; + const __m256i *s06 = (const __m256i*)src06; + const __m256i *s07 = (const __m256i*)src07; + const __m256i *s08 = (const __m256i*)src08; + const __m256i *s09 = (const __m256i*)src09; + const __m256i *s10 = (const __m256i*)src10; + const __m256i *s11 = (const __m256i*)src11; + const __m256i *s12 = (const __m256i*)src12; + const __m256i *s13 = (const __m256i*)src13; + const __m256i *s14 = (const __m256i*)src14; + const __m256i *s15 = (const __m256i*)src15; + __m256i dt0, dt1, dt2, dt3, dt4, dt5, dt6, dt7; + + MM256_ILEAVE32( dt0, dt1, dt2, dt3, s00[0], s01[0], s02[0], s03[0] ); + MM256_ILEAVE32( dt4, dt5, dt6, dt7, s04[0], s05[0], s06[0], s07[0] ); + + d[ 0] = _mm256_permute2x128_si256( dt0, dt4, 0x20 ); + d[ 8] = _mm256_permute2x128_si256( dt0, dt4, 0x31 ); + d[ 2] = _mm256_permute2x128_si256( dt1, dt5, 0x20 ); + d[10] = _mm256_permute2x128_si256( dt1, dt5, 0x31 ); + d[ 4] = _mm256_permute2x128_si256( dt2, dt6, 0x20 ); + d[12] = _mm256_permute2x128_si256( dt2, dt6, 0x31 ); + d[ 6] = _mm256_permute2x128_si256( dt3, dt7, 0x20 ); + d[14] = _mm256_permute2x128_si256( dt3, dt7, 0x31 ); + + MM256_ILEAVE32( dt0, dt1, dt1, dt3, s08[0], s09[0], s10[0], s11[0] ); + MM256_ILEAVE32( dt4, dt5, dt6, dt7, s12[0], s13[0], s14[0], s15[0] ); + + d[ 1] = 
_mm256_permute2x128_si256( dt0, dt4, 0x20 ); + d[ 9] = _mm256_permute2x128_si256( dt0, dt4, 0x31 ); + d[ 3] = _mm256_permute2x128_si256( dt1, dt5, 0x20 ); + d[11] = _mm256_permute2x128_si256( dt1, dt5, 0x31 ); + d[ 5] = _mm256_permute2x128_si256( dt2, dt6, 0x20 ); + d[13] = _mm256_permute2x128_si256( dt2, dt6, 0x31 ); + d[ 7] = _mm256_permute2x128_si256( dt3, dt7, 0x20 ); + d[15] = _mm256_permute2x128_si256( dt3, dt7, 0x31 ); +} +#endif + +// Not used +static inline void intrlv_16x32_512( void *dst, const void *src00, + const void *src01, const void *src02, const void *src03, const void *src04, + const void *src05, const void *src06, const void *src07, const void *src08, + const void *src09, const void *src10, const void *src11, const void *src12, + const void *src13, const void *src14, const void *src15 ) +{ +#if defined(__AVX512F__) + + __m512i *d = (__m512i*)dst; + const __m512i *s00 = (const __m512i*)src00; + const __m512i *s01 = (const __m512i*)src01; + const __m512i *s02 = (const __m512i*)src02; + const __m512i *s03 = (const __m512i*)src03; + const __m512i *s04 = (const __m512i*)src04; + const __m512i *s05 = (const __m512i*)src05; + const __m512i *s06 = (const __m512i*)src06; + const __m512i *s07 = (const __m512i*)src07; + const __m512i *s08 = (const __m512i*)src08; + const __m512i *s09 = (const __m512i*)src09; + const __m512i *s10 = (const __m512i*)src10; + const __m512i *s11 = (const __m512i*)src11; + const __m512i *s12 = (const __m512i*)src12; + const __m512i *s13 = (const __m512i*)src13; + const __m512i *s14 = (const __m512i*)src14; + const __m512i *s15 = (const __m512i*)src15; + __m512i st00, st01, st02, st03, st04, st05, st06, st07, + st08, st09, st10, st11, st12, st13, st14, st15, + t0, t1, t2, t3; + + MM512_ILEAVE32( st00, st01, st02, st03, s00[0], s01[0], s02[0], s03[0] ); + MM512_ILEAVE32( st04, st05, st06, st07, s04[0], s05[0], s06[0], s07[0] ); + MM512_ILEAVE32( st08, st09, st10, st11, s08[0], s09[0], s10[0], s11[0] ); + MM512_ILEAVE32( st12, st13, st14, st15, s12[0], s13[0], s14[0], s15[0] ); + + t0 = _mm512_shuffle_i32x4( st00, st04, 0x88 ); + t1 = _mm512_shuffle_i32x4( st00, st04, 0xdd ); + t2 = _mm512_shuffle_i32x4( st08, st12, 0x88 ); + t3 = _mm512_shuffle_i32x4( st08, st12, 0xdd ); + + d[ 0] = _mm512_shuffle_i32x4( t0, t2, 0x88 ); + d[ 8] = _mm512_shuffle_i32x4( t0, t2, 0xdd ); + d[ 4] = _mm512_shuffle_i32x4( t1, t3, 0x88 ); + d[12] = _mm512_shuffle_i32x4( t1, t3, 0xdd ); + + t0 = _mm512_shuffle_i32x4( st01, st05, 0x88 ); + t1 = _mm512_shuffle_i32x4( st01, st05, 0xdd ); + t2 = _mm512_shuffle_i32x4( st09, st13, 0x88 ); + t3 = _mm512_shuffle_i32x4( st09, st13, 0xdd ); + + d[ 1] = _mm512_shuffle_i32x4( t0, t2, 0x88 ); + d[ 9] = _mm512_shuffle_i32x4( t0, t2, 0xdd ); + d[ 5] = _mm512_shuffle_i32x4( t1, t3, 0x88 ); + d[13] = _mm512_shuffle_i32x4( t1, t3, 0xdd ); + + t0 = _mm512_shuffle_i32x4( st02, st06, 0x88 ); + t1 = _mm512_shuffle_i32x4( st02, st06, 0xdd ); + t2 = _mm512_shuffle_i32x4( st10, st14, 0x88 ); + t3 = _mm512_shuffle_i32x4( st10, st14, 0xdd ); + + d[ 2] = _mm512_shuffle_i32x4( t0, t2, 0x88 ); + d[10] = _mm512_shuffle_i32x4( t0, t2, 0xdd ); + d[ 6] = _mm512_shuffle_i32x4( t1, t3, 0x88 ); + d[14] = _mm512_shuffle_i32x4( t1, t3, 0xdd ); + + t0 = _mm512_shuffle_i32x4( st03, st07, 0x88 ); + t1 = _mm512_shuffle_i32x4( st03, st07, 0xdd ); + t2 = _mm512_shuffle_i32x4( st11, st15, 0x88 ); + t3 = _mm512_shuffle_i32x4( st11, st15, 0xdd ); + + d[ 3] = _mm512_shuffle_i32x4( t0, t2, 0x88 ); + d[11] = _mm512_shuffle_i32x4( t0, t2, 0xdd ); + d[ 7] = _mm512_shuffle_i32x4( t1, 
t3, 0x88 ); + d[15] = _mm512_shuffle_i32x4( t1, t3, 0xdd ); + +#endif +} +*/ #define ILEAVE_16x32( i ) do \ { \ @@ -923,6 +1427,7 @@ static inline void intrlv_16x32( void *dst, const void *s00, ILEAVE_16x32( 30 ); ILEAVE_16x32( 31 ); } + static inline void intrlv_16x32_512( void *dst, const void *s00, const void *s01, const void *s02, const void *s03, const void *s04, const void *s05, const void *s06, const void *s07, const void *s08, @@ -941,6 +1446,187 @@ static inline void intrlv_16x32_512( void *dst, const void *s00, #undef ILEAVE_16x32 +/* +static inline void dintrlv_16x32( void *dst00, void *dst01, void *dst02, + void *dst03, void *dst04, void *dst05, void *dst06, void *dst07, + void *dst08, void *dst09, void *dst10, void *dst11, void *dst12, + void *dst13, void *dst14, void *dst15, const void *src, + const int bit_len ) +{ + __m128i *d00 = (__m128i*)dst00; + __m128i *d01 = (__m128i*)dst01; + __m128i *d02 = (__m128i*)dst02; + __m128i *d03 = (__m128i*)dst03; + __m128i *d04 = (__m128i*)dst04; + __m128i *d05 = (__m128i*)dst05; + __m128i *d06 = (__m128i*)dst06; + __m128i *d07 = (__m128i*)dst07; + __m128i *d08 = (__m128i*)dst08; + __m128i *d09 = (__m128i*)dst09; + __m128i *d10 = (__m128i*)dst10; + __m128i *d11 = (__m128i*)dst11; + __m128i *d12 = (__m128i*)dst12; + __m128i *d13 = (__m128i*)dst13; + __m128i *d14 = (__m128i*)dst14; + __m128i *d15 = (__m128i*)dst15; + const __m128i *s = (const __m128i*)src; + + MM128_ILEAVE32( d00[0], d01[0], d02[0], d03[0], s[ 0], s[ 4], s[ 8], s[12] ); + MM128_ILEAVE32( d04[0], d05[0], d06[0], d07[0], s[ 1], s[ 5], s[ 9], s[13] ); + MM128_ILEAVE32( d08[0], d09[0], d10[0], d11[0], s[ 2], s[ 6], s[10], s[14] ); + MM128_ILEAVE32( d12[0], d13[0], d14[0], d15[0], s[ 3], s[ 7], s[11], s[15] ); + + MM128_ILEAVE32( d00[1], d01[1], d02[1], d03[1], s[16], s[20], s[24], s[28] ); + MM128_ILEAVE32( d04[1], d05[1], d06[1], d07[1], s[17], s[21], s[25], s[29] ); + MM128_ILEAVE32( d08[1], d09[1], d10[1], d11[1], s[18], s[22], s[26], s[30] ); + MM128_ILEAVE32( d12[1], d13[1], d14[1], d15[1], s[19], s[23], s[27], s[31] ); + + if ( bit_len <= 256 ) return; + + MM128_ILEAVE32( d00[2], d01[2], d02[2], d03[2], s[32], s[36], s[40], s[44] ); + MM128_ILEAVE32( d04[2], d05[2], d06[2], d07[2], s[33], s[37], s[41], s[45] ); + MM128_ILEAVE32( d08[2], d09[2], d10[2], d11[2], s[34], s[38], s[42], s[46] ); + MM128_ILEAVE32( d12[2], d13[2], d14[2], d15[2], s[35], s[39], s[43], s[47] ); + + MM128_ILEAVE32( d00[3], d01[3], d02[3], d03[3], s[48], s[52], s[56], s[60] ); + MM128_ILEAVE32( d04[3], d05[3], d06[3], d07[3], s[49], s[53], s[57], s[61] ); + MM128_ILEAVE32( d08[3], d09[3], d10[3], d11[3], s[50], s[54], s[58], s[62] ); + MM128_ILEAVE32( d12[3], d13[3], d14[3], d15[3], s[51], s[55], s[59], s[63] ); + + if ( bit_len <= 512 ) return; + + MM128_ILEAVE32( d00[4], d01[4], d02[4], d03[4], s[64], s[68], s[72], s[76] ); + MM128_ILEAVE32( d04[4], d05[4], d06[4], d07[4], s[65], s[69], s[73], s[77] ); + MM128_ILEAVE32( d08[4], d09[4], d10[4], d11[4], s[66], s[70], s[74], s[78] ); + MM128_ILEAVE32( d12[4], d13[4], d14[4], d15[4], s[67], s[71], s[75], s[79] ); + + if ( bit_len <= 640 ) return; + + MM128_ILEAVE32( d00[5], d01[5], d02[5], d03[5], s[80], s[84], s[88], s[92] ); + MM128_ILEAVE32( d04[5], d05[5], d06[5], d07[5], s[81], s[85], s[89], s[93] ); + MM128_ILEAVE32( d08[5], d09[5], d10[5], d11[5], s[82], s[86], s[90], s[94] ); + MM128_ILEAVE32( d12[5], d13[5], d14[5], d15[5], s[83], s[87], s[91], s[95] ); + + MM128_ILEAVE32( d00[6], d01[6], d02[6], d03[6], s[ 96], s[100], s[104], s[108] ); + 
MM128_ILEAVE32( d04[6], d05[6], d06[6], d07[6], s[ 97], s[101], s[105], s[109] ); + MM128_ILEAVE32( d08[6], d09[6], d10[6], d11[6], s[ 98], s[102], s[106], s[110] ); + MM128_ILEAVE32( d12[6], d13[6], d14[6], d15[6], s[ 99], s[103], s[107], s[111] ); + + MM128_ILEAVE32( d00[7], d01[7], d02[7], d03[7], s[112], s[116], s[120], s[124] ); + MM128_ILEAVE32( d04[7], d05[7], d06[7], d07[7], s[113], s[117], s[121], s[125] ); + MM128_ILEAVE32( d08[7], d09[7], d10[7], d11[7], s[114], s[118], s[122], s[126] ); + MM128_ILEAVE32( d12[7], d13[7], d14[7], d15[7], s[115], s[119], s[123], s[127] ); +} + +// 4 interleave algorithms same memory footprint: +// +// 1. 32 bit integer move +// +// Most instructions, all 32 bit loads & stores, use general purpose regs +// +// 2. SSE2 128 bit shuffle +// +// 128 bit loads and stores + fast shuffles, fewer total instructions: .75, +// uses 128 bit simd regs +// +// 3. AVX2 2x128 bit shuffle with 256 bit permute +// +// 256 bit loads and stores + slow 256 bit permutes, even fewer instructions: +// additional .5, uses 256 bit simd regs +// +// 4. AVX2 2x128 bit shuffle with union +// +// 128 bit loads, 256 bit stores + 128 bit moves using union + overhead +// converting from mm128 to mm256, compiler may choose mem ovly or + +static inline void dintrlv_16x32_256( void *dst00, void *dst01, void *dst02, + void *dst03, void *dst04, void *dst05, void *dst06, void *dst07, + void *dst08, void *dst09, void *dst10, void *dst11, void *dst12, + void *dst13, void *dst14, void *dst15, const void *src ) +{ +#if defined(__AVX2__) +// Can't use AVX512, min bit_len is 512 unless a single contiguous +// output buffer is used. + + const __m256i *s = (const __m256i*)src; + __m256i *d00 = (__m256i*)dst00; + __m256i *d01 = (__m256i*)dst01; + __m256i *d02 = (__m256i*)dst02; + __m256i *d03 = (__m256i*)dst03; + __m256i *d04 = (__m256i*)dst04; + __m256i *d05 = (__m256i*)dst05; + __m256i *d06 = (__m256i*)dst06; + __m256i *d07 = (__m256i*)dst07; + __m256i *d08 = (__m256i*)dst08; + __m256i *d09 = (__m256i*)dst09; + __m256i *d10 = (__m256i*)dst10; + __m256i *d11 = (__m256i*)dst11; + __m256i *d12 = (__m256i*)dst12; + __m256i *d13 = (__m256i*)dst13; + __m256i *d14 = (__m256i*)dst14; + __m256i *d15 = (__m256i*)dst15; + __m256i st0, st1, st2, st3, st4, st5, st6, st7; + + st0 = _mm256_permute2x128_si256( s[ 0], s[ 8], 0x20 ); + st4 = _mm256_permute2x128_si256( s[ 0], s[ 8], 0x31 ); + st1 = _mm256_permute2x128_si256( s[ 2], s[10], 0x20 ); + st5 = _mm256_permute2x128_si256( s[ 2], s[10], 0x31 ); + st2 = _mm256_permute2x128_si256( s[ 4], s[12], 0x20 ); + st6 = _mm256_permute2x128_si256( s[ 4], s[12], 0x31 ); + st3 = _mm256_permute2x128_si256( s[ 6], s[14], 0x20 ); + st7 = _mm256_permute2x128_si256( s[ 6], s[14], 0x31 ); + + MM256_ILEAVE32( d00[0], d01[0], d02[0], d03[0], st0, st1, st2, st3 ); + MM256_ILEAVE32( d04[0], d05[0], d06[0], d07[0], st4, st5, st6, st7 ); + + st0 = _mm256_permute2x128_si256( s[ 1], s[ 9], 0x20 ); + st4 = _mm256_permute2x128_si256( s[ 1], s[ 9], 0x31 ); + st1 = _mm256_permute2x128_si256( s[ 3], s[11], 0x20 ); + st5 = _mm256_permute2x128_si256( s[ 3], s[11], 0x31 ); + st2 = _mm256_permute2x128_si256( s[ 5], s[13], 0x20 ); + st6 = _mm256_permute2x128_si256( s[ 5], s[13], 0x31 ); + st3 = _mm256_permute2x128_si256( s[ 7], s[15], 0x20 ); + st7 = _mm256_permute2x128_si256( s[ 7], s[15], 0x31 ); + + MM256_ILEAVE32( d08[0], d09[0], d10[0], d11[0], st0, st1, st2, st3 ); + MM256_ILEAVE32( d12[0], d13[0], d14[0], d15[0], st4, st5, st6, st7 ); + + +#else +// not needed, 16x32 is only used with 
AVX512 + + __m128i *d00 = (__m128i*)dst00; + __m128i *d01 = (__m128i*)dst01; + __m128i *d02 = (__m128i*)dst02; + __m128i *d03 = (__m128i*)dst03; + __m128i *d04 = (__m128i*)dst04; + __m128i *d05 = (__m128i*)dst05; + __m128i *d06 = (__m128i*)dst06; + __m128i *d07 = (__m128i*)dst07; + __m128i *d08 = (__m128i*)dst08; + __m128i *d09 = (__m128i*)dst09; + __m128i *d10 = (__m128i*)dst10; + __m128i *d11 = (__m128i*)dst11; + __m128i *d12 = (__m128i*)dst12; + __m128i *d13 = (__m128i*)dst13; + __m128i *d14 = (__m128i*)dst14; + __m128i *d15 = (__m128i*)dst15; + const __m128i *s = (const __m128i*)src; + + MM128_ILEAVE32( d00[0], d01[0], d02[0], d03[0], s[ 0], s[ 4], s[ 8], s[12] ); + MM128_ILEAVE32( d04[0], d05[0], d06[0], d07[0], s[ 1], s[ 5], s[ 9], s[13] ); + MM128_ILEAVE32( d08[0], d09[0], d10[0], d11[0], s[ 2], s[ 6], s[10], s[14] ); + MM128_ILEAVE32( d12[0], d13[0], d14[0], d15[0], s[ 3], s[ 7], s[11], s[15] ); + + MM128_ILEAVE32( d00[1], d01[1], d02[1], d03[1], s[16], s[20], s[24], s[28] ); + MM128_ILEAVE32( d04[1], d05[1], d06[1], d07[1], s[17], s[21], s[25], s[29] ); + MM128_ILEAVE32( d08[1], d09[1], d10[1], d11[1], s[18], s[22], s[26], s[30] ); + MM128_ILEAVE32( d12[1], d13[1], d14[1], d15[1], s[19], s[23], s[27], s[31] ); + +#endif +} +*/ + #define DLEAVE_16x32( i ) do \ { \ const uint32_t *s = (const uint32_t*)(src) + ( (i) << 4 ); \ @@ -962,6 +1648,7 @@ static inline void intrlv_16x32_512( void *dst, const void *s00, *( (uint32_t*)(d15) +(i) ) = s[15]; \ } while(0) + static inline void dintrlv_16x32( void *d00, void *d01, void *d02, void *d03, void *d04, void *d05, void *d06, void *d07, void *d08, void *d09, void *d10, void *d11, void *d12, void *d13, void *d14, void *d15, @@ -988,6 +1675,7 @@ static inline void dintrlv_16x32( void *d00, void *d01, void *d02, void *d03, DLEAVE_16x32( 30 ); DLEAVE_16x32( 31 ); } + static inline void dintrlv_16x32_512( void *d00, void *d01, void *d02, void *d03, void *d04, void *d05, void *d06, void *d07, void *d08, void *d09, void *d10, void *d11, void *d12, @@ -1005,6 +1693,7 @@ static inline void dintrlv_16x32_512( void *d00, void *d01, void *d02, #undef DLEAVE_16x32 + static inline void extr_lane_16x32( void *d, const void *s, const int lane, const int bit_len ) { @@ -1322,6 +2011,33 @@ static inline void dintrlv_4x64_512( void *dst0, void *dst1, void *dst2, d3[3] = _mm_unpackhi_epi64( s[13], s[15] ); } + +static inline void extr_lane_4x64( void *dst, const void *src, const int lane, + const int bit_len ) +{ + __m128i *d = (__m128i*)dst; + const __m128i *s = (const __m128i*)src; + int i = lane / 2; + if ( lane % 2 ) // odd lanes + { + d[0] = _mm_unpackhi_epi64( s[ i+ 0 ], s[ i+ 2 ] ); + d[1] = _mm_unpackhi_epi64( s[ i+ 4 ], s[ i+ 6 ] ); + if ( bit_len <= 256 ) return; + d[2] = _mm_unpackhi_epi64( s[ i+ 8 ], s[ i+10 ] ); + d[3] = _mm_unpackhi_epi64( s[ i+12 ], s[ i+14 ] ); + } + else // even lanes + { + d[0] = _mm_unpacklo_epi64( s[ i+ 0 ], s[ i+ 2 ] ); + d[1] = _mm_unpacklo_epi64( s[ i+ 4 ], s[ i+ 6 ] ); + if ( bit_len <= 256 ) return; + d[2] = _mm_unpacklo_epi64( s[ i+ 8 ], s[ i+10 ] ); + d[3] = _mm_unpacklo_epi64( s[ i+12 ], s[ i+14 ] ); + } + return; // bit_len == 512 +} + +/* static inline void extr_lane_4x64( void *d, const void *s, const int lane, const int bit_len ) { @@ -1335,6 +2051,7 @@ static inline void extr_lane_4x64( void *d, const void *s, ((uint64_t*)d)[ 6] = ((const uint64_t*)s)[ lane+24 ]; ((uint64_t*)d)[ 7] = ((const uint64_t*)s)[ lane+28 ]; } +*/ #if defined(__AVX2__) // Doesn't really need AVX2, just SSSE3, but is only used with AVX2 
code. @@ -1710,6 +2427,32 @@ static inline void dintrlv_8x64_512( void *dst0, void *dst1, void *dst2, d7[3] = _mm_unpackhi_epi64( s[27], s[31] ); } +static inline void extr_lane_8x64( void *dst, const void *src, const int lane, + const int bit_len ) +{ + __m128i *d = (__m128i*)dst; + const __m128i *s = (const __m128i*)src; + int i = lane / 2; + if ( lane % 2 ) // odd lanes + { + d[0] = _mm_unpackhi_epi64( s[ i+ 0], s[ i+ 4] ); + d[1] = _mm_unpackhi_epi64( s[ i+ 8], s[ i+12] ); + if ( bit_len <= 256 ) return; + d[2] = _mm_unpackhi_epi64( s[ i+16], s[ i+20] ); + d[3] = _mm_unpackhi_epi64( s[ i+24], s[ i+28] ); + } + else // even lanes + { + d[0] = _mm_unpacklo_epi64( s[ i+ 0], s[ i+ 4] ); + d[1] = _mm_unpacklo_epi64( s[ i+ 8], s[ i+12] ); + if ( bit_len <= 256 ) return; + d[2] = _mm_unpacklo_epi64( s[ i+16], s[ i+20] ); + d[3] = _mm_unpacklo_epi64( s[ i+24], s[ i+28] ); + } + return; +} + +/* static inline void extr_lane_8x64( void *d, const void *s, const int lane, const int bit_len ) { @@ -1723,6 +2466,7 @@ static inline void extr_lane_8x64( void *d, const void *s, ((uint64_t*)d)[ 6] = ((const uint64_t*)s)[ lane+ 48 ]; ((uint64_t*)d)[ 7] = ((const uint64_t*)s)[ lane+ 56 ]; } +*/ #if defined(__AVX512F__) && defined(__AVX512VL__) diff --git a/simd-utils/simd-128.h b/simd-utils/simd-128.h index 765d8479..b5a36ab4 100644 --- a/simd-utils/simd-128.h +++ b/simd-utils/simd-128.h @@ -272,9 +272,19 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n ) #endif +// Mask making +// Equivalent of AVX512 _mm_movepi64_mask & _mm_movepi32_mask. +// Returns 2 or 4 bit integer mask from MSB of 64 or 32 bit elements. -// Diagonal blend: d = s3[3], s2[2], s1[1], s0[0] || +#define mm_movmask_64( v ) \ + _mm_castpd_si128( _mm_movmask_pd( _mm_castsi128_pd( v ) ) ) + +#define mm_movmask_32( v ) \ + _mm_castps_si128( _mm_movmask_ps( _mm_castsi128_ps( v ) ) ) + + +// Diagonal blend // Blend 4 32 bit elements from 4 vectors @@ -284,7 +294,7 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n ) mm_blend_epi32( _mm_blend_epi32( s3, s2, 0x4 ), \ _mm_blend_epi32( s1, s0, 0x1 ), 0x3 ) -#elif defined(__SSE4_1) +#elif defined(__SSE4_1__) #define mm128_diagonal_32( v3, v2, v1, v0 ) \ mm_blend_epi16( _mm_blend_epi16( s3, s2, 0x30 ), \ @@ -401,6 +411,16 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n ) #define mm128_rol_16( v, c ) \ _mm_or_si128( _mm_slli_epi16( v, c ), _mm_srli_epi16( v, 16-(c) ) ) +// Limited 2 input shuffle +#define mm128_shuffle2_64( a, b, c ) \ + _mm_castpd_si128( _mm_shuffle_pd( _mm_castsi128_pd( a ), \ + _mm_castsi128_pd( b ), c ) ); + +#define mm128_shuffle2_32( a, b, c ) \ + _mm_castps_si128( _mm_shuffle_ps( _mm_castsi128_ps( a ), \ + _mm_castsi128_ps( b ), c ) ); + + // // Rotate vector elements accross all lanes @@ -532,9 +552,8 @@ static inline void mm128_block_bswap_32( __m128i *d, const __m128i *s ) #if defined(__SSSE3__) // Function macro with two inputs and one output, inputs are preserved. -// Returns modified first arg. // Two input functions are not available without SSSE3. Use procedure -// belowe instead. +// macros below instead. 
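As an aside: the limited 2-input 32-bit shuffle added to simd-128.h above is the primitive behind the 4x4 32-bit transpose used by the MM*_ILEAVE32-based interleave code staged (largely still commented out) elsewhere in this patch. The sketch below shows that transpose at 128-bit width, assuming the MM128_ILEAVE32 macro follows the same pattern and immediates as the MM512_ILEAVE32 definition shown earlier in this diff; SHUFFLE2_32 and ileave32_sketch are illustrative names, not code from the repository.

#include <immintrin.h>   // _mm_shuffle_ps reused as a 2-input 32-bit integer shuffle

// Same immediates as MM512_ILEAVE32, narrowed to single 128-bit vectors:
// a 4x4 transpose that gathers element i of s0..s3 into d[i].
#define SHUFFLE2_32( a, b, c ) \
   _mm_castps_si128( _mm_shuffle_ps( _mm_castsi128_ps( a ), \
                                     _mm_castsi128_ps( b ), c ) )

static inline void ileave32_sketch( __m128i d[4], const __m128i s0,
                 const __m128i s1, const __m128i s2, const __m128i s3 )
{
   // Pair 64-bit halves: t0 = { s0[0], s0[1], s1[0], s1[1] },
   //                     t1 = { s0[2], s0[3], s1[2], s1[3] }, etc.
   __m128i t0 = SHUFFLE2_32( s0, s1, 0x44 );
   __m128i t1 = SHUFFLE2_32( s0, s1, 0xee );
   __m128i t2 = SHUFFLE2_32( s2, s3, 0x44 );
   __m128i t3 = SHUFFLE2_32( s2, s3, 0xee );
   // Select even/odd 32-bit elements: d[i] = { s0[i], s1[i], s2[i], s3[i] }.
   d[0] = SHUFFLE2_32( t0, t2, 0x88 );
   d[1] = SHUFFLE2_32( t0, t2, 0xdd );
   d[2] = SHUFFLE2_32( t1, t3, 0x88 );
   d[3] = SHUFFLE2_32( t1, t3, 0xdd );
}

The 256-bit and 512-bit paths in this patch apply the same four shuffles per 128-bit lane, then move whole lanes between vectors with _mm256_permute2x128_si256 or _mm512_shuffle_i32x4.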
#define mm128_shufl2r_64( v1, v2 ) _mm_alignr_epi8( v2, v1, 8 ) #define mm128_shufl2l_64( v1, v2 ) _mm_alignr_epi8( v1, v2, 8 ) @@ -548,12 +567,11 @@ static inline void mm128_block_bswap_32( __m128i *d, const __m128i *s ) #define mm128_shufl2r_8( v1, v2 ) _mm_alignr_epi8( v2, v1, 8 ) #define mm128_shufl2l_8( v1, v2 ) _mm_alignr_epi8( v1, v2, 8 ) -// Procedure macroswith 2 inputs and 2 outputs, inputs are destroyed. -// Returns both modified args in place. +// Procedure macros with 2 inputs and 2 outputs, inputs args are overwritten. // These macros retain the vrol/vror name for now to avoid // confusion with the shufl2r/shuffle2l function macros above. -// These may be renamed to something like shufl2r2 for 2 1nputs and +// These may be renamed to something like shufl2r2 for 2 nputs and // 2 outputs, ie SHUFfLe 2 inputs Right with 2 outputs. #define mm128_vror256_64( v1, v2 ) \ diff --git a/simd-utils/simd-256.h b/simd-utils/simd-256.h index 1116976f..bede65c7 100644 --- a/simd-utils/simd-256.h +++ b/simd-utils/simd-256.h @@ -233,6 +233,18 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n ) #endif +// Mask making + +// Equivalent of AVX512 _mm256_movepi64_mask & _mm256_movepi32_mask. +// Returns 4 or 8 bit integer mask from MSB of 64 or 32 bit elements. + +#define mm256_movmask_64( v ) \ + _mm256_castpd_si256( _mm256_movmask_pd( _mm256_castsi256_pd( v ) ) ) + +#define mm256_movmask_32( v ) \ + _mm256_castps_si256( _mm256_movmask_ps( _mm256_castsi256_ps( v ) ) ) + + // Diagonal blending // Blend 4 64 bit elements from 4 vectors @@ -405,6 +417,16 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n ) // // Rotate elements within each 128 bit lane of 256 bit vector. +// Limited 2 input shuffle +#define mm256_shuffle2_64( a, b, c ) \ + _mm256_castpd_si256( _mm256_shuffle_pd( _mm256_castsi256_pd( a ), \ + _mm256_castsi256_pd( b ), c ) ); + +#define mm256_shuffle2_32( a, b, c ) \ + _mm256_castps_si256( _mm256_shuffle_ps( _mm256_castsi256_ps( a ), \ + _mm256_castsi256_ps( b ), c ) ); + + #define mm256_swap128_64( v ) _mm256_shuffle_epi32( v, 0x4e ) #define mm256_shuflr128_64 mm256_swap128_64 #define mm256_shufll128_64 mm256_swap128_64 @@ -485,20 +507,6 @@ static inline __m256i mm256_shuflr128_x8( const __m256i v, const int c ) v2 = _mm256_xor_si256( v1, v2 ); \ v1 = _mm256_xor_si256( v1, v2 ); -#define mm256_vror512_128( v1, v2 ) \ -do { \ - __m256i t = _mm256_permute2x128( v1, v2, 0x03 ); \ - v1 = _mm256_permute2x128( v2, v1, 0x21 ); \ - v2 = t; \ -} while(0) - -#define mm256_vrol512_128( v1, v2 ) \ -do { \ - __m256i t = _mm256_permute2x128( v1, v2, 0x03 ); \ - v2 = _mm256_permute2x128( v2, v1, 0x21 ); \ - v1 = t; \ -} while(0) - #endif // __AVX2__ #endif // SIMD_256_H__ diff --git a/simd-utils/simd-512.h b/simd-utils/simd-512.h index 3cc090a4..6867a3d9 100644 --- a/simd-utils/simd-512.h +++ b/simd-utils/simd-512.h @@ -493,7 +493,7 @@ static inline __m512i mm512_shufll_32( const __m512i v ) static inline __m512i mm512_shuflr_x64( const __m512i v, const int n ) { return _mm512_alignr_epi64( v, v, n ); } -static inline __m512i mm512_shufll_x32( const __m512i v, const int n ) +static inline __m512i mm512_shuflr_x32( const __m512i v, const int n ) { return _mm512_alignr_epi32( v, v, n ); } #define mm512_shuflr_16( v ) \ @@ -581,8 +581,17 @@ static inline __m512i mm512_shufll_x32( const __m512i v, const int n ) 0x0e0d0c0b0a090807, 0x060504030201001f ) ) // -// Shuffle-roate elements within 128 bit lanes of 512 bit vector. 
+// Shuffle/rotate elements within 128 bit lanes of 512 bit vector. +// Limited 2 input, 1 output shuffle within 128 bit lanes. +#define mm512_shuffle2_64( a, b, c ) \ + _mm512_castpd_si512( _mm512_shuffle_pd( _mm512_castsi512_pd( a ), \ + _mm512_castsi512_pd( b ), c ) ); + +#define mm512_shuffle2_32( a, b, c ) \ + _mm512_castps_si512( _mm512_shuffle_ps( _mm512_castsi512_ps( a ), \ + _mm512_castsi512_ps( b ), c ) ); + // Swap 64 bits in each 128 bit lane #define mm512_swap128_64( v ) _mm512_shuffle_epi32( v, 0x4e ) #define mm512_shuflr128_64 mm512_swap128_64 @@ -610,6 +619,7 @@ static inline __m512i mm512_shuflr128_8( const __m512i v, const int c ) // shufl2r is 2 input ... // Drop macros? They can easilly be rebuilt using shufl2 functions +// 2 input, 1 output // Shuffle concatenated { v1, v2 ) right or left by 256 bits and return // rotated v1 // visually confusing for shif2r because of arg order. First arg is always @@ -627,76 +637,5 @@ static inline __m512i mm512_shuflr128_8( const __m512i v, const int c ) #define mm512_shufl2r_32( v1, v2 ) _mm512_alignr_epi32( v2, v1, 1 ) #define mm512_shufl2l_32( v1, v2 ) _mm512_alignr_epi32( v1, v2, 1 ) -// Rotate elements from 2 512 bit vectors in place, source arguments -// are overwritten. - -#define mm512_swap1024_512( v1, v2 ) \ - v1 = _mm512_xor_si512( v1, v2 ); \ - v2 = _mm512_xor_si512( v1, v2 ); \ - v1 = _mm512_xor_si512( v1, v2 ); -#define mm512_shufl2l_512 mm512_swap1024_512 \ -#define mm512_shufl2r_512 mm512_swap1024_512 \ - -// Deprecated, will be removed. Use shufl2 functions instead. Leave them as is -// for now. -// Rotate elements from 2 512 bit vectors in place, both source arguments -// are updated. - -#define mm512_vror1024_256( v1, v2 ) \ -do { \ - __m512i t = _mm512_alignr_epi64( v1, v2, 4 ); \ - v1 = _mm512_alignr_epi64( v2, v1, 4 ); \ - v2 = t; \ -} while(0) - -#define mm512_vrol1024_256( v1, v2 ) \ -do { \ - __m512i t = _mm512_alignr_epi64( v1, v2, 4 ); \ - v2 = _mm512_alignr_epi64( v2, v1, 4 ); \ - v1 = t; \ -} while(0) - -#define mm512_vror1024_128( v1, v2 ) \ -do { \ - __m512i t = _mm512_alignr_epi64( v1, v2, 2 ); \ - v1 = _mm512_alignr_epi64( v2, v1, 2 ); \ - v2 = t; \ -} while(0) - -#define mm512_vrol1024_128( v1, v2 ) \ -do { \ - __m512i t = _mm512_alignr_epi64( v1, v2, 6 ); \ - v2 = _mm512_alignr_epi64( v2, v1, 6 ); \ - v1 = t; \ -} while(0) - -#define mm512_vror1024_64( v1, v2 ) \ -do { \ - __m512i t = _mm512_alignr_epi64( v1, v2, 1 ); \ - v1 = _mm512_alignr_epi64( v2, v1, 1 ); \ - v2 = t; \ -} while(0) - -#define mm512_vrol1024_64( v1, v2 ) \ -do { \ - __m512i t = _mm512_alignr_epi64( v1, v2, 7 ); \ - v2 = _mm512_alignr_epi64( v2, v1, 7 ); \ - v1 = t; \ -} while(0) - -#define mm512_vror1024_32( v1, v2 ) \ -do { \ - __m512i t = _mm512_alignr_epi32( v1, v2, 1 ); \ - v1 = _mm512_alignr_epi32( v2, v1, 1 ); \ - v2 = t; \ -} while(0) - -#define mm512_vrol1024_32( v1, v2 ) \ -do { \ - __m512i t = _mm512_alignr_epi32( v1, v2, 15 ); \ - v2 = _mm512_alignr_epi32( v2, v1, 15 ); \ - v1 = t; \ -} while(0) - #endif // AVX512 #endif // SIMD_512_H__
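The new unpack-based lane extractors assume the same memory layout as the scalar loops they replace: in a 4x64 interleave, element i of lane L occupies 64-bit word L + 4*i. The standalone sketch below cross-checks that equivalence for a 512-bit lane; extr_lane_4x64_ref, extr_lane_4x64_sse2 and the test harness are ad-hoc names written for illustration (the SSE2 body is a transcription of the extr_lane_4x64 added above), not code from the repository.

#include <emmintrin.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

// Scalar reference: element i of lane L is 64-bit word L + 4*i of the
// interleaved buffer (matches the commented-out original extr_lane_4x64).
static void extr_lane_4x64_ref( uint64_t *d, const uint64_t *s, int lane )
{
   for ( int i = 0; i < 8; i++ )  d[i] = s[ lane + 4*i ];
}

// Transcription of the new SSE2 version, bit_len == 512 case only.
static void extr_lane_4x64_sse2( void *dst, const void *src, int lane )
{
   __m128i *d = (__m128i*)dst;
   const __m128i *s = (const __m128i*)src;
   const int i = lane / 2;
   if ( lane % 2 )   // odd lanes
   {
      d[0] = _mm_unpackhi_epi64( s[ i+ 0 ], s[ i+ 2 ] );
      d[1] = _mm_unpackhi_epi64( s[ i+ 4 ], s[ i+ 6 ] );
      d[2] = _mm_unpackhi_epi64( s[ i+ 8 ], s[ i+10 ] );
      d[3] = _mm_unpackhi_epi64( s[ i+12 ], s[ i+14 ] );
   }
   else              // even lanes
   {
      d[0] = _mm_unpacklo_epi64( s[ i+ 0 ], s[ i+ 2 ] );
      d[1] = _mm_unpacklo_epi64( s[ i+ 4 ], s[ i+ 6 ] );
      d[2] = _mm_unpacklo_epi64( s[ i+ 8 ], s[ i+10 ] );
      d[3] = _mm_unpacklo_epi64( s[ i+12 ], s[ i+14 ] );
   }
}

int main(void)
{
   // C11 _Alignas keeps the SSE2 loads and stores 16-byte aligned.
   _Alignas(16) uint64_t src[32], want[8], got[8];   // 4 lanes x 512 bits
   for ( int j = 0; j < 32; j++ ) src[j] = 0x0101010101010101ULL * j;

   for ( int lane = 0; lane < 4; lane++ )
   {
      extr_lane_4x64_ref( want, src, lane );
      extr_lane_4x64_sse2( got, src, lane );
      printf( "lane %d %s\n", lane,
              memcmp( got, want, sizeof got ) ? "MISMATCH" : "ok" );
   }
   return 0;
}

The same indexing argument carries over to the 8x64 extractor, where the scalar stride is 8 words per element and the SSE2 version steps its __m128i source pointer by 4 per element.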