From be88afc349ba38c5c25da733b0139a6c2207fd4b Mon Sep 17 00:00:00 2001
From: Jay D Dee
Date: Thu, 21 Sep 2023 12:34:06 -0400
Subject: [PATCH] v3.23.2

---
 Makefile.am | 7 +-
 RELEASE_NOTES | 5 +
 algo-gate-api.c | 2 +-
 algo-gate-api.h | 2 +-
 algo/argon2/argon2a/argon2a.c | 2 +-
 algo/blake/sph-blake2s.c | 2 +-
 algo/blake/sph_blake.h | 2 +-
 algo/blake/sph_blake2b.c | 2 +-
 algo/bmw/bmw-hash-4way.h | 6 +-
 algo/bmw/bmw256-hash-4way.c | 4 +-
 algo/bmw/bmw512-hash-4way.c | 24 +-
 algo/bmw/sph_bmw.h | 2 +-
 algo/cubehash/cubehash_sse2.c | 1 -
 algo/cubehash/cubehash_sse2.h | 2 +-
 algo/cubehash/sph_cubehash.h | 2 +-
 algo/echo/aes_ni/hash_api.h | 2 +-
 algo/echo/sph_echo.c | 2 +-
 algo/echo/sph_echo.h | 2 +-
 algo/fugue/fugue-aesni.h | 2 +-
 algo/fugue/sph_fugue.h | 2 +-
 algo/gost/sph_gost.h | 2 +-
 algo/groestl/aes_ni/hash-groestl.h | 4 +-
 algo/groestl/aes_ni/hash-groestl256.h | 3 +-
 algo/groestl/groestl-gate.c | 2 +-
 algo/groestl/groestl256-hash-4way.h | 4 -
 algo/groestl/myrgr-4way.c | 2 +-
 algo/groestl/sph_groestl.h | 2 +-
 algo/hamsi/hamsi-hash-4way.c | 1686 ++++++++++++++++---
 algo/hamsi/hamsi-hash-4way.h | 70 +-
 algo/hamsi/sph_hamsi.h | 2 +-
 algo/haval/haval-4way-helper.c | 4 +-
 algo/haval/haval-hash-4way.c | 10 +-
 algo/haval/haval-hash-4way.h | 3 +-
 algo/haval/sph-haval.h | 2 +-
 algo/jh/sph_jh.h | 2 +-
 algo/keccak/keccak-4way.c | 1 -
 algo/keccak/keccak-gate.c | 2 +-
 algo/keccak/keccak-hash-4way.h | 43 -
 algo/keccak/sha3d-4way.c | 1 -
 algo/keccak/sph_keccak.h | 2 +-
 algo/lanehash/lane.h | 1 -
 algo/luffa/luffa-hash-2way.c | 6 +-
 algo/luffa/luffa-hash-2way.h | 6 +-
 algo/luffa/luffa_for_sse2.h | 2 +-
 algo/luffa/sph_luffa.h | 2 +-
 algo/lyra2/lyra2.h | 3 +-
 algo/lyra2/lyra2rev2.c | 1 -
 algo/lyra2/lyra2rev3.c | 1 -
 algo/panama/sph_panama.h | 2 +-
 algo/quark/hmq1725-4way.c | 2 +-
 algo/ripemd/lbry-4way.c | 3 +-
 algo/ripemd/ripemd-hash-4way.h | 1 -
 algo/ripemd/sph_ripemd.h | 2 +-
 algo/scrypt/scrypt.c | 1 -
 algo/sha/hmac-sha256-hash-4way.h | 2 +-
 algo/sha/sha-hash-4way.h | 168 ---
 algo/sha/sha256-hash-2way-ni.c | 689 ----------
 algo/sha/sha256-hash-4way.c | 269 ++--
 algo/sha/sha256-hash-opt.c | 388 ------
 algo/sha/sha256-hash.c | 1386 +++++++++++++++++++-
 algo/sha/sha256-hash.h | 106 +-
 algo/sha/sha256d-4way.c | 327 ++---
 algo/sha/sha256dt.c | 198 +--
 algo/sha/sha256q-4way.c | 2 +-
 algo/sha/sha256t-4way.c | 121 +-
 algo/sha/sha256t-gate.c | 6 +-
 algo/sha/sha256t-gate.h | 6 +-
 algo/sha/sha256t.c | 102 --
 algo/sha/sha512-hash-4way.c | 2 +-
 algo/sha/sha512-hash.h | 46 +
 algo/sha/sha512256d-4way.c | 3 +-
 algo/sha/sph_sha2.h | 2 +-
 algo/shabal/shabal-hash-4way.c | 4 +-
 algo/shabal/shabal-hash-4way.h | 48 +-
 algo/shabal/sph_shabal.h | 2 +-
 algo/shavite/shavite-hash-2way.c | 2 -
 algo/shavite/sph_shavite.c | 2 +-
 algo/shavite/sph_shavite.h | 2 +-
 algo/simd/nist.h | 2 +-
 algo/simd/simd-compat.h | 2 +-
 algo/simd/sph_simd.h | 2 +-
 algo/skein/skein-4way.c | 1 -
 algo/skein/skein-gate.c | 1 -
 algo/skein/sph_skein.h | 2 +-
 algo/tiger/sph_tiger.h | 2 +-
 algo/whirlpool/sph_whirlpool.h | 2 +-
 algo/x11/x11-4way.c | 80 +-
 algo/x16/x16r-gate.c | 2 +-
 algo/x16/x16r-gate.h | 3 +-
 algo/x16/x21s-4way.c | 4 +-
 algo/x17/sonoa-4way.c | 2 +-
 algo/x17/x17-4way.c | 4 +-
 algo/x17/xevan-4way.c | 2 +-
 algo/x22/x22i-4way.c | 6 +-
 algo/x22/x25x-4way.c | 6 +-
 asm/aesb-x64.S | 72 --
 asm/aesb-x86.S | 21 -
 comp.log | 50 -
 {algo/sha => compat}/aes_helper.c | 19 +-
 {algo/sha => compat}/brg_types.h | 0
 {algo/sha => compat}/sha3-defs.h | 0
 {algo/sha => compat}/sha3_common.h | 0
 {algo/sha => compat}/sph_types.h | 0
 config-template.json | 22 +
 configure | 20 +-
 configure.ac | 2 +-
 configure~ | 20 +-
 cpu-miner.c | 58 +-
 cpuminer-conf.json | 20 -
 simd-utils/simd-128.h | 3 +
 simd-utils/simd-256.h | 4 +
 simd-utils/simd-512.h | 3 +
 verthash-help.txt | 4 +-
 113 files changed, 3354 insertions(+), 2925 deletions(-)
 delete mode 100644 algo/sha/sha-hash-4way.h
 delete mode 100644 algo/sha/sha256-hash-2way-ni.c
 delete mode 100644 algo/sha/sha256-hash-opt.c
 delete mode 100644 algo/sha/sha256t.c
 create mode 100644 algo/sha/sha512-hash.h
 delete mode 100644 asm/aesb-x64.S
 delete mode 100644 asm/aesb-x86.S
 delete mode 100644 comp.log
 rename {algo/sha => compat}/aes_helper.c (98%)
 rename {algo/sha => compat}/brg_types.h (100%)
 rename {algo/sha => compat}/sha3-defs.h (100%)
 rename {algo/sha => compat}/sha3_common.h (100%)
 rename {algo/sha => compat}/sph_types.h (100%)
 create mode 100644 config-template.json
 delete mode 100644 cpuminer-conf.json

diff --git a/Makefile.am b/Makefile.am
index c7a051f9..92a18eb2 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -163,8 +163,6 @@ cpuminer_SOURCES = \
   algo/sha/sph_sha2big.c \
   algo/sha/sha256-hash-4way.c \
   algo/sha/sha512-hash-4way.c \
-  algo/sha/sha256-hash-opt.c \
-  algo/sha/sha256-hash-2way-ni.c \
   algo/sha/hmac-sha256-hash.c \
   algo/sha/hmac-sha256-hash-4way.c \
   algo/sha/sha256d.c \
@@ -172,7 +170,6 @@ cpuminer_SOURCES = \
   algo/sha/sha256d-4way.c \
   algo/sha/sha256t-gate.c \
   algo/sha/sha256t-4way.c \
-  algo/sha/sha256t.c \
   algo/sha/sha256q-4way.c \
   algo/sha/sha256q.c \
   algo/sha/sha512256d-4way.c \
@@ -294,10 +291,10 @@ disable_flags =
 if USE_ASM
   cpuminer_SOURCES += asm/neoscrypt_asm.S
 if ARCH_x86
-  cpuminer_SOURCES += asm/sha2-x86.S asm/scrypt-x86.S asm/aesb-x86.S
+  cpuminer_SOURCES += asm/sha2-x86.S asm/scrypt-x86.S
 endif
 if ARCH_x86_64
-  cpuminer_SOURCES += asm/sha2-x64.S asm/scrypt-x64.S asm/aesb-x64.S
+  cpuminer_SOURCES += asm/sha2-x64.S asm/scrypt-x64.S
 endif
 if ARCH_ARM
   cpuminer_SOURCES += asm/sha2-arm.S asm/scrypt-arm.S
diff --git a/RELEASE_NOTES b/RELEASE_NOTES
index 2e49122e..e561f497 100644
--- a/RELEASE_NOTES
+++ b/RELEASE_NOTES
@@ -65,6 +65,11 @@ If not what makes it happen or not happen?
 Change Log
 ----------
 
+v3.23.2
+
+sha256dt, sha256t & sha256d +10% with SHA, small improvement with AVX2.
+Other small improvements and code cleanup.
+
 v3.23.1
 
 #349: Fix sha256t low difficulty shares and low effective hash rate.
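Note on the "+10% with SHA" entry above: "SHA" here refers to the x86 SHA
extensions (SHA-NI), not the algorithm itself. As an illustration only (this is
not code from this patch; the helper name and variable layout are hypothetical,
following the standard Intel usage pattern), four SHA-256 rounds map onto the
SHA-NI intrinsics from <immintrin.h> roughly like this:

    #include <immintrin.h>

    /* Illustrative sketch: four SHA-256 rounds via x86 SHA-NI.
       state0 holds working state words A,B,E,F; state1 holds C,D,G,H;
       msg holds message words W[t..t+3], k the round constants K[t..t+3]. */
    static inline void sha256_rounds_x4( __m128i *state0, __m128i *state1,
                                         __m128i msg, __m128i k )
    {
        __m128i m = _mm_add_epi32( msg, k );                    // W + K
        *state1 = _mm_sha256rnds2_epu32( *state1, *state0, m ); // rounds t, t+1
        m = _mm_shuffle_epi32( m, 0x0e );                       // expose upper two W+K
        *state0 = _mm_sha256rnds2_epu32( *state0, *state1, m ); // rounds t+2, t+3
    }

Compiled with -msha, each such step retires four rounds in a handful of
instructions, which is why a SHA-NI path can outrun a generic SIMD path on a
single hash.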
diff --git a/algo-gate-api.c b/algo-gate-api.c
index 7f971bd9..e86b304f 100644
--- a/algo-gate-api.c
+++ b/algo-gate-api.c
@@ -248,7 +248,7 @@ int null_hash()
    return 0;
 };
 
-void init_algo_gate( algo_gate_t* gate )
+static void init_algo_gate( algo_gate_t* gate )
 {
    gate->miner_thread_init = (void*)&return_true;
    gate->scanhash = (void*)&scanhash_generic;
diff --git a/algo-gate-api.h b/algo-gate-api.h
index 12b606a7..5cd31826 100644
--- a/algo-gate-api.h
+++ b/algo-gate-api.h
@@ -269,7 +269,7 @@ void std_get_new_work( struct work *work, struct work *g_work, int thr_id,
 void sha256d_gen_merkle_root( char *merkle_root, struct stratum_ctx *sctx );
 void sha256_gen_merkle_root ( char *merkle_root, struct stratum_ctx *sctx );
 // OpenSSL sha256 deprecated
-void SHA256_gen_merkle_root ( char *merkle_root, struct stratum_ctx *sctx );
+//void SHA256_gen_merkle_root ( char *merkle_root, struct stratum_ctx *sctx );
 
 bool std_le_work_decode( struct work *work );
 bool std_be_work_decode( struct work *work );
diff --git a/algo/argon2/argon2a/argon2a.c b/algo/argon2/argon2a/argon2a.c
index 5a7c54d1..51a34aae 100644
--- a/algo/argon2/argon2a/argon2a.c
+++ b/algo/argon2/argon2a/argon2a.c
@@ -77,7 +77,7 @@ bool register_argon2_algo( algo_gate_t* gate )
   gate->optimizations = SSE2_OPT | AVX_OPT | AVX2_OPT;
   gate->scanhash = (void*)&scanhash_argon2;
   gate->hash = (void*)&argon2hash;
-  gate->gen_merkle_root = (void*)&SHA256_gen_merkle_root;
+  gate->gen_merkle_root = (void*)&sha256_gen_merkle_root;
   opt_target_factor = 65536.0;
 
   return true;
diff --git a/algo/blake/sph-blake2s.c b/algo/blake/sph-blake2s.c
index 0ebe547b..32aad562 100644
--- a/algo/blake/sph-blake2s.c
+++ b/algo/blake/sph-blake2s.c
@@ -15,7 +15,7 @@
 #include
 #include
 
-#include "algo/sha/sph_types.h"
+#include "compat/sph_types.h"
 #include "sph-blake2s.h"
 
 static const uint32_t blake2s_IV[8] =
diff --git a/algo/blake/sph_blake.h b/algo/blake/sph_blake.h
index 37fb6516..087c23d5 100644
--- a/algo/blake/sph_blake.h
+++ b/algo/blake/sph_blake.h
@@ -42,7 +42,7 @@ extern "C"{
 #endif
 
 #include
-#include "algo/sha/sph_types.h"
+#include "compat/sph_types.h"
 
 /**
  * Output size (in bits) for BLAKE-224.
diff --git a/algo/blake/sph_blake2b.c b/algo/blake/sph_blake2b.c
index 50a97586..19c73196 100644
--- a/algo/blake/sph_blake2b.c
+++ b/algo/blake/sph_blake2b.c
@@ -31,7 +31,7 @@
 #include
 #include
 #include "simd-utils.h"
-#include "algo/sha/sph_types.h"
+#include "compat/sph_types.h"
 #include "sph_blake2b.h"
 
 // Little-endian byte access.
diff --git a/algo/bmw/bmw-hash-4way.h b/algo/bmw/bmw-hash-4way.h index afeecfe1..2befb994 100644 --- a/algo/bmw/bmw-hash-4way.h +++ b/algo/bmw/bmw-hash-4way.h @@ -41,8 +41,6 @@ extern "C"{ #endif #include - -#include "algo/sha/sph_types.h" #include "simd-utils.h" #define SPH_SIZE_bmw256 256 @@ -57,7 +55,7 @@ typedef struct { __m128i buf[64]; __m128i H[16]; size_t ptr; - sph_u32 bit_count; // assume bit_count fits in 32 bits + uint32_t bit_count; // assume bit_count fits in 32 bits } bmw_4way_small_context; typedef bmw_4way_small_context bmw256_4way_context; @@ -144,7 +142,7 @@ typedef struct { __m256i buf[16]; __m256i H[16]; size_t ptr; - sph_u64 bit_count; + uint64_t bit_count; } bmw_4way_big_context __attribute__((aligned(128))); typedef bmw_4way_big_context bmw512_4way_context; diff --git a/algo/bmw/bmw256-hash-4way.c b/algo/bmw/bmw256-hash-4way.c index 08f7621f..d15890b0 100644 --- a/algo/bmw/bmw256-hash-4way.c +++ b/algo/bmw/bmw256-hash-4way.c @@ -109,7 +109,7 @@ static const uint32_t IV256[] = { _mm_sub_epi32( _mm_add_epi32( rol_off_32( M, j, 0 ), \ rol_off_32( M, j, 3 ) ), \ rol_off_32( M, j, 10 ) ), \ - _mm_set1_epi32( ( (j)+16 ) * SPH_C32(0x05555555UL) ) ), \ + _mm_set1_epi32( ( (j)+16 ) * 0x05555555UL ) ), \ H[ ( (j)+7 ) & 0xF ] ) @@ -485,7 +485,7 @@ bmw32_4way(bmw_4way_small_context *sc, const void *data, size_t len) size_t ptr; const int buf_size = 64; // bytes of one lane, compatible with len - sc->bit_count += (sph_u32)len << 3; + sc->bit_count += (uint32_t)len << 3; buf = sc->buf; ptr = sc->ptr; h1 = sc->H; diff --git a/algo/bmw/bmw512-hash-4way.c b/algo/bmw/bmw512-hash-4way.c index 81378a0c..6773bd07 100644 --- a/algo/bmw/bmw512-hash-4way.c +++ b/algo/bmw/bmw512-hash-4way.c @@ -45,15 +45,15 @@ extern "C"{ #define LPAR ( -static const sph_u64 IV512[] = { - SPH_C64(0x8081828384858687), SPH_C64(0x88898A8B8C8D8E8F), - SPH_C64(0x9091929394959697), SPH_C64(0x98999A9B9C9D9E9F), - SPH_C64(0xA0A1A2A3A4A5A6A7), SPH_C64(0xA8A9AAABACADAEAF), - SPH_C64(0xB0B1B2B3B4B5B6B7), SPH_C64(0xB8B9BABBBCBDBEBF), - SPH_C64(0xC0C1C2C3C4C5C6C7), SPH_C64(0xC8C9CACBCCCDCECF), - SPH_C64(0xD0D1D2D3D4D5D6D7), SPH_C64(0xD8D9DADBDCDDDEDF), - SPH_C64(0xE0E1E2E3E4E5E6E7), SPH_C64(0xE8E9EAEBECEDEEEF), - SPH_C64(0xF0F1F2F3F4F5F6F7), SPH_C64(0xF8F9FAFBFCFDFEFF) +static const uint64_t IV512[] = { + 0x8081828384858687, 0x88898A8B8C8D8E8F, + 0x9091929394959697, 0x98999A9B9C9D9E9F, + 0xA0A1A2A3A4A5A6A7, 0xA8A9AAABACADAEAF, + 0xB0B1B2B3B4B5B6B7, 0xB8B9BABBBCBDBEBF, + 0xC0C1C2C3C4C5C6C7, 0xC8C9CACBCCCDCECF, + 0xD0D1D2D3D4D5D6D7, 0xD8D9DADBDCDDDEDF, + 0xE0E1E2E3E4E5E6E7, 0xE8E9EAEBECEDEEEF, + 0xF0F1F2F3F4F5F6F7, 0xF8F9FAFBFCFDFEFF }; #if defined(__SSE2__) @@ -894,7 +894,7 @@ static const __m256i final_b[16] = }; static void -bmw64_4way_init( bmw_4way_big_context *sc, const sph_u64 *iv ) +bmw64_4way_init( bmw_4way_big_context *sc, const uint64_t *iv ) { sc->H[ 0] = _mm256_set1_epi64x( 0x8081828384858687 ); sc->H[ 1] = _mm256_set1_epi64x( 0x88898A8B8C8D8E8F ); @@ -926,7 +926,7 @@ bmw64_4way( bmw_4way_big_context *sc, const void *data, size_t len ) size_t ptr; const int buf_size = 128; // bytes of one lane, compatible with len - sc->bit_count += (sph_u64)len << 3; + sc->bit_count += (uint64_t)len << 3; buf = sc->buf; ptr = sc->ptr; h1 = sc->H; @@ -1377,7 +1377,7 @@ static const __m512i final_b8[16] = void bmw512_8way_init( bmw512_8way_context *ctx ) -//bmw64_4way_init( bmw_4way_big_context *sc, const sph_u64 *iv ) +//bmw64_4way_init( bmw_4way_big_context *sc, const uint64_t *iv ) { ctx->H[ 0] = _mm512_set1_epi64( 
0x8081828384858687 ); ctx->H[ 1] = _mm512_set1_epi64( 0x88898A8B8C8D8E8F ); diff --git a/algo/bmw/sph_bmw.h b/algo/bmw/sph_bmw.h index f53dd27f..e1d06838 100644 --- a/algo/bmw/sph_bmw.h +++ b/algo/bmw/sph_bmw.h @@ -41,7 +41,7 @@ extern "C"{ #endif #include -#include "algo/sha/sph_types.h" +#include "compat/sph_types.h" /** * Output size (in bits) for BMW-224. diff --git a/algo/cubehash/cubehash_sse2.c b/algo/cubehash/cubehash_sse2.c index 20967fbf..7f620993 100644 --- a/algo/cubehash/cubehash_sse2.c +++ b/algo/cubehash/cubehash_sse2.c @@ -9,7 +9,6 @@ #include #endif #include "cubehash_sse2.h" -#include "algo/sha/sha3-defs.h" #include #include #include diff --git a/algo/cubehash/cubehash_sse2.h b/algo/cubehash/cubehash_sse2.h index 1f06ebae..5b69ac77 100644 --- a/algo/cubehash/cubehash_sse2.h +++ b/algo/cubehash/cubehash_sse2.h @@ -3,7 +3,7 @@ #include "compat.h" #include -#include "algo/sha/sha3-defs.h" +#include "compat/sha3-defs.h" #define OPTIMIZE_SSE2 diff --git a/algo/cubehash/sph_cubehash.h b/algo/cubehash/sph_cubehash.h index 4ef6794f..08e96ddc 100644 --- a/algo/cubehash/sph_cubehash.h +++ b/algo/cubehash/sph_cubehash.h @@ -42,7 +42,7 @@ extern "C"{ #endif #include -#include "algo/sha/sph_types.h" +#include "compat/sph_types.h" /** * Output size (in bits) for CubeHash-224. diff --git a/algo/echo/aes_ni/hash_api.h b/algo/echo/aes_ni/hash_api.h index a5500885..816d4579 100644 --- a/algo/echo/aes_ni/hash_api.h +++ b/algo/echo/aes_ni/hash_api.h @@ -22,7 +22,7 @@ #endif -#include "algo/sha/sha3_common.h" +#include "compat/sha3_common.h" #include diff --git a/algo/echo/sph_echo.c b/algo/echo/sph_echo.c index 99e7dacd..b7b3c065 100644 --- a/algo/echo/sph_echo.c +++ b/algo/echo/sph_echo.c @@ -73,7 +73,7 @@ extern "C"{ #endif #define AES_BIG_ENDIAN 0 -#include "algo/sha/aes_helper.c" +#include "compat/aes_helper.c" #if SPH_ECHO_64 diff --git a/algo/echo/sph_echo.h b/algo/echo/sph_echo.h index ae5a3507..8165f7b0 100644 --- a/algo/echo/sph_echo.h +++ b/algo/echo/sph_echo.h @@ -43,7 +43,7 @@ extern "C"{ #endif #include -#include "algo/sha/sph_types.h" +#include "compat/sph_types.h" /** * Output size (in bits) for ECHO-224. diff --git a/algo/fugue/fugue-aesni.h b/algo/fugue/fugue-aesni.h index 13fd8f87..389e5793 100644 --- a/algo/fugue/fugue-aesni.h +++ b/algo/fugue/fugue-aesni.h @@ -20,7 +20,7 @@ #error "Unsupported configuration, AES needs SSE4.1. Compile without AES." #endif -#include "algo/sha/sha3_common.h" +#include "compat/sha3_common.h" #include "simd-utils.h" diff --git a/algo/fugue/sph_fugue.h b/algo/fugue/sph_fugue.h index 08d4dde0..6a73d5c5 100644 --- a/algo/fugue/sph_fugue.h +++ b/algo/fugue/sph_fugue.h @@ -2,7 +2,7 @@ #define SPH_FUGUE_H__ #include -#include "algo/sha/sph_types.h" +#include "compat/sph_types.h" #ifdef __cplusplus extern "C"{ diff --git a/algo/gost/sph_gost.h b/algo/gost/sph_gost.h index 5f8f3491..3467ae9a 100644 --- a/algo/gost/sph_gost.h +++ b/algo/gost/sph_gost.h @@ -41,7 +41,7 @@ extern "C"{ #endif #include -#include "algo/sha/sph_types.h" +#include "compat/sph_types.h" /** * Output size (in bits) for GOST-256. 
diff --git a/algo/groestl/aes_ni/hash-groestl.h b/algo/groestl/aes_ni/hash-groestl.h index b76d8098..558215a7 100644 --- a/algo/groestl/aes_ni/hash-groestl.h +++ b/algo/groestl/aes_ni/hash-groestl.h @@ -20,8 +20,8 @@ #define LENGTH (512) #include "brg_endian.h" -#define NEED_UINT_64T -#include "algo/sha/brg_types.h" +//#define NEED_UINT_64T +#include "compat/brg_types.h" /* some sizes (number of bytes) */ #define ROWS (8) diff --git a/algo/groestl/aes_ni/hash-groestl256.h b/algo/groestl/aes_ni/hash-groestl256.h index 32ce1a5f..24544a50 100644 --- a/algo/groestl/aes_ni/hash-groestl256.h +++ b/algo/groestl/aes_ni/hash-groestl256.h @@ -34,8 +34,7 @@ typedef crypto_uint64 u64; //#define LENGTH (512) #include "brg_endian.h" -#define NEED_UINT_64T -#include "algo/sha/brg_types.h" +#include "compat/brg_types.h" #ifdef IACA_TRACE #include IACA_MARKS diff --git a/algo/groestl/groestl-gate.c b/algo/groestl/groestl-gate.c index 92c79bce..eb2d4988 100644 --- a/algo/groestl/groestl-gate.c +++ b/algo/groestl/groestl-gate.c @@ -17,7 +17,7 @@ bool register_dmd_gr_algo( algo_gate_t *gate ) bool register_groestl_algo( algo_gate_t* gate ) { register_dmd_gr_algo( gate ); - gate->gen_merkle_root = (void*)&SHA256_gen_merkle_root; + gate->gen_merkle_root = (void*)&sha256_gen_merkle_root; return true; }; diff --git a/algo/groestl/groestl256-hash-4way.h b/algo/groestl/groestl256-hash-4way.h index 59c62708..05ddccb9 100644 --- a/algo/groestl/groestl256-hash-4way.h +++ b/algo/groestl/groestl256-hash-4way.h @@ -22,10 +22,6 @@ #define LENGTH (256) -//#include "brg_endian.h" -//#define NEED_UINT_64T -//#include "algo/sha/brg_types.h" - /* some sizes (number of bytes) */ #define ROWS (8) #define LENGTHFIELDLEN (ROWS) diff --git a/algo/groestl/myrgr-4way.c b/algo/groestl/myrgr-4way.c index c9f558cc..0b13ad21 100644 --- a/algo/groestl/myrgr-4way.c +++ b/algo/groestl/myrgr-4way.c @@ -4,7 +4,7 @@ #include #include #include "aes_ni/hash-groestl.h" -#include "algo/sha/sha-hash-4way.h" +#include "algo/sha/sha256-hash.h" #if defined(__VAES__) #include "groestl512-hash-4way.h" #endif diff --git a/algo/groestl/sph_groestl.h b/algo/groestl/sph_groestl.h index 02465e3c..899d716e 100644 --- a/algo/groestl/sph_groestl.h +++ b/algo/groestl/sph_groestl.h @@ -40,7 +40,7 @@ extern "C"{ #endif #include -#include "algo/sha/sph_types.h" +#include "compat/sph_types.h" #if !defined(__AES__) /** diff --git a/algo/hamsi/hamsi-hash-4way.c b/algo/hamsi/hamsi-hash-4way.c index 89f8646c..3e61cc6d 100644 --- a/algo/hamsi/hamsi-hash-4way.c +++ b/algo/hamsi/hamsi-hash-4way.c @@ -34,498 +34,303 @@ #include #include "hamsi-hash-4way.h" -#if defined(__AVX2__) - -#ifdef __cplusplus -extern "C"{ -#endif - -/* - * The SPH_HAMSI_EXPAND_* define how many input bits we handle in one - * table lookup during message expansion (1 to 8, inclusive). If we note - * w the number of bits per message word (w=32 for Hamsi-224/256, w=64 - * for Hamsi-384/512), r the size of a "row" in 32-bit words (r=8 for - * Hamsi-224/256, r=16 for Hamsi-384/512), and n the expansion level, - * then we will get t tables (where t=ceil(w/n)) of individual size - * 2^n*r*4 (in bytes). The last table may be shorter (e.g. with w=32 and - * n=5, there are 7 tables, but the last one uses only two bits on - * input, not five). - * - * Also, we read t rows of r words from RAM. 
Words in a given row are - * concatenated in RAM in that order, so most of the cost is about - * reading the first row word; comparatively, cache misses are thus - * less expensive with Hamsi-512 (r=16) than with Hamsi-256 (r=8). - * - * When n=1, tables are "special" in that we omit the first entry of - * each table (which always contains 0), so that total table size is - * halved. - * - * We thus have the following (size1 is the cumulative table size of - * Hamsi-224/256; size2 is for Hamsi-384/512; similarly, t1 and t2 - * are for Hamsi-224/256 and Hamsi-384/512, respectively). - * - * n size1 size2 t1 t2 - * --------------------------------------- - * 1 1024 4096 32 64 - * 2 2048 8192 16 32 - * 3 2688 10880 11 22 - * 4 4096 16384 8 16 - * 5 6272 25600 7 13 - * 6 10368 41984 6 11 - * 7 16896 73856 5 10 - * 8 32768 131072 4 8 - * - * So there is a trade-off: a lower n makes the tables fit better in - * L1 cache, but increases the number of memory accesses. The optimal - * value depends on the amount of available L1 cache and the relative - * impact of a cache miss. - * - * Experimentally, in ideal benchmark conditions (which are not necessarily - * realistic with regards to L1 cache contention), it seems that n=8 is - * the best value on "big" architectures (those with 32 kB or more of L1 - * cache), while n=4 is better on "small" architectures. This was tested - * on an Intel Core2 Q6600 (both 32-bit and 64-bit mode), a PowerPC G3 - * (32 kB L1 cache, hence "big"), and a MIPS-compatible Broadcom BCM3302 - * (8 kB L1 cache). - * - * Note: with n=1, the 32 tables (actually implemented as one big table) - * are read entirely and sequentially, regardless of the input data, - * thus avoiding any data-dependent table access pattern. - */ +#include -#ifdef _MSC_VER -#pragma warning (disable: 4146) -#endif +#if defined(__AVX2__) //#include "hamsi-helper-4way.c" -/* -static const sph_u32 IV512[] = { - SPH_C32(0x73746565), SPH_C32(0x6c706172), SPH_C32(0x6b204172), - SPH_C32(0x656e6265), SPH_C32(0x72672031), SPH_C32(0x302c2062), - SPH_C32(0x75732032), SPH_C32(0x3434362c), SPH_C32(0x20422d33), - SPH_C32(0x30303120), SPH_C32(0x4c657576), SPH_C32(0x656e2d48), - SPH_C32(0x65766572), SPH_C32(0x6c65652c), SPH_C32(0x2042656c), - SPH_C32(0x6769756d) -}; -*/ -static const sph_u32 alpha_n[] = { - SPH_C32(0xff00f0f0), SPH_C32(0xccccaaaa), SPH_C32(0xf0f0cccc), - SPH_C32(0xff00aaaa), SPH_C32(0xccccaaaa), SPH_C32(0xf0f0ff00), - SPH_C32(0xaaaacccc), SPH_C32(0xf0f0ff00), SPH_C32(0xf0f0cccc), - SPH_C32(0xaaaaff00), SPH_C32(0xccccff00), SPH_C32(0xaaaaf0f0), - SPH_C32(0xaaaaf0f0), SPH_C32(0xff00cccc), SPH_C32(0xccccf0f0), - SPH_C32(0xff00aaaa), SPH_C32(0xccccaaaa), SPH_C32(0xff00f0f0), - SPH_C32(0xff00aaaa), SPH_C32(0xf0f0cccc), SPH_C32(0xf0f0ff00), - SPH_C32(0xccccaaaa), SPH_C32(0xf0f0ff00), SPH_C32(0xaaaacccc), - SPH_C32(0xaaaaff00), SPH_C32(0xf0f0cccc), SPH_C32(0xaaaaf0f0), - SPH_C32(0xccccff00), SPH_C32(0xff00cccc), SPH_C32(0xaaaaf0f0), - SPH_C32(0xff00aaaa), SPH_C32(0xccccf0f0) + +static const uint32_t HAMSI_IV512[] = +{ + 0x73746565, 0x6c706172, 0x6b204172, 0x656e6265, + 0x72672031, 0x302c2062, 0x75732032, 0x3434362c, + 0x20422d33, 0x30303120, 0x4c657576, 0x656e2d48, + 0x65766572, 0x6c65652c, 0x2042656c, 0x6769756d }; -static const sph_u32 alpha_f[] = { - SPH_C32(0xcaf9639c), SPH_C32(0x0ff0f9c0), SPH_C32(0x639c0ff0), - SPH_C32(0xcaf9f9c0), SPH_C32(0x0ff0f9c0), SPH_C32(0x639ccaf9), - SPH_C32(0xf9c00ff0), SPH_C32(0x639ccaf9), SPH_C32(0x639c0ff0), - SPH_C32(0xf9c0caf9), SPH_C32(0x0ff0caf9), SPH_C32(0xf9c0639c), - 
SPH_C32(0xf9c0639c), SPH_C32(0xcaf90ff0), SPH_C32(0x0ff0639c), - SPH_C32(0xcaf9f9c0), SPH_C32(0x0ff0f9c0), SPH_C32(0xcaf9639c), - SPH_C32(0xcaf9f9c0), SPH_C32(0x639c0ff0), SPH_C32(0x639ccaf9), - SPH_C32(0x0ff0f9c0), SPH_C32(0x639ccaf9), SPH_C32(0xf9c00ff0), - SPH_C32(0xf9c0caf9), SPH_C32(0x639c0ff0), SPH_C32(0xf9c0639c), - SPH_C32(0x0ff0caf9), SPH_C32(0xcaf90ff0), SPH_C32(0xf9c0639c), - SPH_C32(0xcaf9f9c0), SPH_C32(0x0ff0639c) +static const uint32_t alpha_n[] = { + 0xff00f0f0, 0xccccaaaa, 0xf0f0cccc, 0xff00aaaa, + 0xccccaaaa, 0xf0f0ff00, 0xaaaacccc, 0xf0f0ff00, + 0xf0f0cccc, 0xaaaaff00, 0xccccff00, 0xaaaaf0f0, + 0xaaaaf0f0, 0xff00cccc, 0xccccf0f0, 0xff00aaaa, + 0xccccaaaa, 0xff00f0f0, 0xff00aaaa, 0xf0f0cccc, + 0xf0f0ff00, 0xccccaaaa, 0xf0f0ff00, 0xaaaacccc, + 0xaaaaff00, 0xf0f0cccc, 0xaaaaf0f0, 0xccccff00, + 0xff00cccc, 0xaaaaf0f0, 0xff00aaaa, 0xccccf0f0 }; +static const uint32_t alpha_f[] = { + 0xcaf9639c, 0x0ff0f9c0, 0x639c0ff0, 0xcaf9f9c0, + 0x0ff0f9c0, 0x639ccaf9, 0xf9c00ff0, 0x639ccaf9, + 0x639c0ff0, 0xf9c0caf9, 0x0ff0caf9, 0xf9c0639c, + 0xf9c0639c, 0xcaf90ff0, 0x0ff0639c, 0xcaf9f9c0, + 0x0ff0f9c0, 0xcaf9639c, 0xcaf9f9c0, 0x639c0ff0, + 0x639ccaf9, 0x0ff0f9c0, 0x639ccaf9, 0xf9c00ff0, + 0xf9c0caf9, 0x639c0ff0, 0xf9c0639c, 0x0ff0caf9, + 0xcaf90ff0, 0xf9c0639c, 0xcaf9f9c0, 0x0ff0639c +}; // imported from hamsi helper /* Note: this table lists bits within each byte from least siginificant to most significant. */ -static const sph_u32 T512[64][16] = { - { SPH_C32(0xef0b0270), SPH_C32(0x3afd0000), SPH_C32(0x5dae0000), - SPH_C32(0x69490000), SPH_C32(0x9b0f3c06), SPH_C32(0x4405b5f9), - SPH_C32(0x66140a51), SPH_C32(0x924f5d0a), SPH_C32(0xc96b0030), - SPH_C32(0xe7250000), SPH_C32(0x2f840000), SPH_C32(0x264f0000), - SPH_C32(0x08695bf9), SPH_C32(0x6dfcf137), SPH_C32(0x509f6984), - SPH_C32(0x9e69af68) }, - { SPH_C32(0xc96b0030), SPH_C32(0xe7250000), SPH_C32(0x2f840000), - SPH_C32(0x264f0000), SPH_C32(0x08695bf9), SPH_C32(0x6dfcf137), - SPH_C32(0x509f6984), SPH_C32(0x9e69af68), SPH_C32(0x26600240), - SPH_C32(0xddd80000), SPH_C32(0x722a0000), SPH_C32(0x4f060000), - SPH_C32(0x936667ff), SPH_C32(0x29f944ce), SPH_C32(0x368b63d5), - SPH_C32(0x0c26f262) }, - { SPH_C32(0x145a3c00), SPH_C32(0xb9e90000), SPH_C32(0x61270000), - SPH_C32(0xf1610000), SPH_C32(0xce613d6c), SPH_C32(0xb0493d78), - SPH_C32(0x47a96720), SPH_C32(0xe18e24c5), SPH_C32(0x23671400), - SPH_C32(0xc8b90000), SPH_C32(0xf4c70000), SPH_C32(0xfb750000), - SPH_C32(0x73cd2465), SPH_C32(0xf8a6a549), SPH_C32(0x02c40a3f), - SPH_C32(0xdc24e61f) }, - { SPH_C32(0x23671400), SPH_C32(0xc8b90000), SPH_C32(0xf4c70000), - SPH_C32(0xfb750000), SPH_C32(0x73cd2465), SPH_C32(0xf8a6a549), - SPH_C32(0x02c40a3f), SPH_C32(0xdc24e61f), SPH_C32(0x373d2800), - SPH_C32(0x71500000), SPH_C32(0x95e00000), SPH_C32(0x0a140000), - SPH_C32(0xbdac1909), SPH_C32(0x48ef9831), SPH_C32(0x456d6d1f), - SPH_C32(0x3daac2da) }, - { SPH_C32(0x54285c00), SPH_C32(0xeaed0000), SPH_C32(0xc5d60000), - SPH_C32(0xa1c50000), SPH_C32(0xb3a26770), SPH_C32(0x94a5c4e1), - SPH_C32(0x6bb0419d), SPH_C32(0x551b3782), SPH_C32(0x9cbb1800), - SPH_C32(0xb0d30000), SPH_C32(0x92510000), SPH_C32(0xed930000), - SPH_C32(0x593a4345), SPH_C32(0xe114d5f4), SPH_C32(0x430633da), - SPH_C32(0x78cace29) }, - { SPH_C32(0x9cbb1800), SPH_C32(0xb0d30000), SPH_C32(0x92510000), - SPH_C32(0xed930000), SPH_C32(0x593a4345), SPH_C32(0xe114d5f4), - SPH_C32(0x430633da), SPH_C32(0x78cace29), SPH_C32(0xc8934400), - SPH_C32(0x5a3e0000), SPH_C32(0x57870000), SPH_C32(0x4c560000), - SPH_C32(0xea982435), SPH_C32(0x75b11115), 
SPH_C32(0x28b67247), - SPH_C32(0x2dd1f9ab) }, - { SPH_C32(0x29449c00), SPH_C32(0x64e70000), SPH_C32(0xf24b0000), - SPH_C32(0xc2f30000), SPH_C32(0x0ede4e8f), SPH_C32(0x56c23745), - SPH_C32(0xf3e04259), SPH_C32(0x8d0d9ec4), SPH_C32(0x466d0c00), - SPH_C32(0x08620000), SPH_C32(0xdd5d0000), SPH_C32(0xbadd0000), - SPH_C32(0x6a927942), SPH_C32(0x441f2b93), SPH_C32(0x218ace6f), - SPH_C32(0xbf2c0be2) }, - { SPH_C32(0x466d0c00), SPH_C32(0x08620000), SPH_C32(0xdd5d0000), - SPH_C32(0xbadd0000), SPH_C32(0x6a927942), SPH_C32(0x441f2b93), - SPH_C32(0x218ace6f), SPH_C32(0xbf2c0be2), SPH_C32(0x6f299000), - SPH_C32(0x6c850000), SPH_C32(0x2f160000), SPH_C32(0x782e0000), - SPH_C32(0x644c37cd), SPH_C32(0x12dd1cd6), SPH_C32(0xd26a8c36), - SPH_C32(0x32219526) }, - { SPH_C32(0xf6800005), SPH_C32(0x3443c000), SPH_C32(0x24070000), - SPH_C32(0x8f3d0000), SPH_C32(0x21373bfb), SPH_C32(0x0ab8d5ae), - SPH_C32(0xcdc58b19), SPH_C32(0xd795ba31), SPH_C32(0xa67f0001), - SPH_C32(0x71378000), SPH_C32(0x19fc0000), SPH_C32(0x96db0000), - SPH_C32(0x3a8b6dfd), SPH_C32(0xebcaaef3), SPH_C32(0x2c6d478f), - SPH_C32(0xac8e6c88) }, - { SPH_C32(0xa67f0001), SPH_C32(0x71378000), SPH_C32(0x19fc0000), - SPH_C32(0x96db0000), SPH_C32(0x3a8b6dfd), SPH_C32(0xebcaaef3), - SPH_C32(0x2c6d478f), SPH_C32(0xac8e6c88), SPH_C32(0x50ff0004), - SPH_C32(0x45744000), SPH_C32(0x3dfb0000), SPH_C32(0x19e60000), - SPH_C32(0x1bbc5606), SPH_C32(0xe1727b5d), SPH_C32(0xe1a8cc96), - SPH_C32(0x7b1bd6b9) }, - { SPH_C32(0xf7750009), SPH_C32(0xcf3cc000), SPH_C32(0xc3d60000), - SPH_C32(0x04920000), SPH_C32(0x029519a9), SPH_C32(0xf8e836ba), - SPH_C32(0x7a87f14e), SPH_C32(0x9e16981a), SPH_C32(0xd46a0000), - SPH_C32(0x8dc8c000), SPH_C32(0xa5af0000), SPH_C32(0x4a290000), - SPH_C32(0xfc4e427a), SPH_C32(0xc9b4866c), SPH_C32(0x98369604), - SPH_C32(0xf746c320) }, - { SPH_C32(0xd46a0000), SPH_C32(0x8dc8c000), SPH_C32(0xa5af0000), - SPH_C32(0x4a290000), SPH_C32(0xfc4e427a), SPH_C32(0xc9b4866c), - SPH_C32(0x98369604), SPH_C32(0xf746c320), SPH_C32(0x231f0009), - SPH_C32(0x42f40000), SPH_C32(0x66790000), SPH_C32(0x4ebb0000), - SPH_C32(0xfedb5bd3), SPH_C32(0x315cb0d6), SPH_C32(0xe2b1674a), - SPH_C32(0x69505b3a) }, - { SPH_C32(0x774400f0), SPH_C32(0xf15a0000), SPH_C32(0xf5b20000), - SPH_C32(0x34140000), SPH_C32(0x89377e8c), SPH_C32(0x5a8bec25), - SPH_C32(0x0bc3cd1e), SPH_C32(0xcf3775cb), SPH_C32(0xf46c0050), - SPH_C32(0x96180000), SPH_C32(0x14a50000), SPH_C32(0x031f0000), - SPH_C32(0x42947eb8), SPH_C32(0x66bf7e19), SPH_C32(0x9ca470d2), - SPH_C32(0x8a341574) }, - { SPH_C32(0xf46c0050), SPH_C32(0x96180000), SPH_C32(0x14a50000), - SPH_C32(0x031f0000), SPH_C32(0x42947eb8), SPH_C32(0x66bf7e19), - SPH_C32(0x9ca470d2), SPH_C32(0x8a341574), SPH_C32(0x832800a0), - SPH_C32(0x67420000), SPH_C32(0xe1170000), SPH_C32(0x370b0000), - SPH_C32(0xcba30034), SPH_C32(0x3c34923c), SPH_C32(0x9767bdcc), - SPH_C32(0x450360bf) }, - { SPH_C32(0xe8870170), SPH_C32(0x9d720000), SPH_C32(0x12db0000), - SPH_C32(0xd4220000), SPH_C32(0xf2886b27), SPH_C32(0xa921e543), - SPH_C32(0x4ef8b518), SPH_C32(0x618813b1), SPH_C32(0xb4370060), - SPH_C32(0x0c4c0000), SPH_C32(0x56c20000), SPH_C32(0x5cae0000), - SPH_C32(0x94541f3f), SPH_C32(0x3b3ef825), SPH_C32(0x1b365f3d), - SPH_C32(0xf3d45758) }, - { SPH_C32(0xb4370060), SPH_C32(0x0c4c0000), SPH_C32(0x56c20000), - SPH_C32(0x5cae0000), SPH_C32(0x94541f3f), SPH_C32(0x3b3ef825), - SPH_C32(0x1b365f3d), SPH_C32(0xf3d45758), SPH_C32(0x5cb00110), - SPH_C32(0x913e0000), SPH_C32(0x44190000), SPH_C32(0x888c0000), - SPH_C32(0x66dc7418), SPH_C32(0x921f1d66), SPH_C32(0x55ceea25), - 
SPH_C32(0x925c44e9) }, - { SPH_C32(0x0c720000), SPH_C32(0x49e50f00), SPH_C32(0x42790000), - SPH_C32(0x5cea0000), SPH_C32(0x33aa301a), SPH_C32(0x15822514), - SPH_C32(0x95a34b7b), SPH_C32(0xb44b0090), SPH_C32(0xfe220000), - SPH_C32(0xa7580500), SPH_C32(0x25d10000), SPH_C32(0xf7600000), - SPH_C32(0x893178da), SPH_C32(0x1fd4f860), SPH_C32(0x4ed0a315), - SPH_C32(0xa123ff9f) }, - { SPH_C32(0xfe220000), SPH_C32(0xa7580500), SPH_C32(0x25d10000), - SPH_C32(0xf7600000), SPH_C32(0x893178da), SPH_C32(0x1fd4f860), - SPH_C32(0x4ed0a315), SPH_C32(0xa123ff9f), SPH_C32(0xf2500000), - SPH_C32(0xeebd0a00), SPH_C32(0x67a80000), SPH_C32(0xab8a0000), - SPH_C32(0xba9b48c0), SPH_C32(0x0a56dd74), SPH_C32(0xdb73e86e), - SPH_C32(0x1568ff0f) }, - { SPH_C32(0x45180000), SPH_C32(0xa5b51700), SPH_C32(0xf96a0000), - SPH_C32(0x3b480000), SPH_C32(0x1ecc142c), SPH_C32(0x231395d6), - SPH_C32(0x16bca6b0), SPH_C32(0xdf33f4df), SPH_C32(0xb83d0000), - SPH_C32(0x16710600), SPH_C32(0x379a0000), SPH_C32(0xf5b10000), - SPH_C32(0x228161ac), SPH_C32(0xae48f145), SPH_C32(0x66241616), - SPH_C32(0xc5c1eb3e) }, - { SPH_C32(0xb83d0000), SPH_C32(0x16710600), SPH_C32(0x379a0000), - SPH_C32(0xf5b10000), SPH_C32(0x228161ac), SPH_C32(0xae48f145), - SPH_C32(0x66241616), SPH_C32(0xc5c1eb3e), SPH_C32(0xfd250000), - SPH_C32(0xb3c41100), SPH_C32(0xcef00000), SPH_C32(0xcef90000), - SPH_C32(0x3c4d7580), SPH_C32(0x8d5b6493), SPH_C32(0x7098b0a6), - SPH_C32(0x1af21fe1) }, - { SPH_C32(0x75a40000), SPH_C32(0xc28b2700), SPH_C32(0x94a40000), - SPH_C32(0x90f50000), SPH_C32(0xfb7857e0), SPH_C32(0x49ce0bae), - SPH_C32(0x1767c483), SPH_C32(0xaedf667e), SPH_C32(0xd1660000), - SPH_C32(0x1bbc0300), SPH_C32(0x9eec0000), SPH_C32(0xf6940000), - SPH_C32(0x03024527), SPH_C32(0xcf70fcf2), SPH_C32(0xb4431b17), - SPH_C32(0x857f3c2b) }, - { SPH_C32(0xd1660000), SPH_C32(0x1bbc0300), SPH_C32(0x9eec0000), - SPH_C32(0xf6940000), SPH_C32(0x03024527), SPH_C32(0xcf70fcf2), - SPH_C32(0xb4431b17), SPH_C32(0x857f3c2b), SPH_C32(0xa4c20000), - SPH_C32(0xd9372400), SPH_C32(0x0a480000), SPH_C32(0x66610000), - SPH_C32(0xf87a12c7), SPH_C32(0x86bef75c), SPH_C32(0xa324df94), - SPH_C32(0x2ba05a55) }, - { SPH_C32(0x75c90003), SPH_C32(0x0e10c000), SPH_C32(0xd1200000), - SPH_C32(0xbaea0000), SPH_C32(0x8bc42f3e), SPH_C32(0x8758b757), - SPH_C32(0xbb28761d), SPH_C32(0x00b72e2b), SPH_C32(0xeecf0001), - SPH_C32(0x6f564000), SPH_C32(0xf33e0000), SPH_C32(0xa79e0000), - SPH_C32(0xbdb57219), SPH_C32(0xb711ebc5), SPH_C32(0x4a3b40ba), - SPH_C32(0xfeabf254) }, - { SPH_C32(0xeecf0001), SPH_C32(0x6f564000), SPH_C32(0xf33e0000), - SPH_C32(0xa79e0000), SPH_C32(0xbdb57219), SPH_C32(0xb711ebc5), - SPH_C32(0x4a3b40ba), SPH_C32(0xfeabf254), SPH_C32(0x9b060002), - SPH_C32(0x61468000), SPH_C32(0x221e0000), SPH_C32(0x1d740000), - SPH_C32(0x36715d27), SPH_C32(0x30495c92), SPH_C32(0xf11336a7), - SPH_C32(0xfe1cdc7f) }, - { SPH_C32(0x86790000), SPH_C32(0x3f390002), SPH_C32(0xe19ae000), - SPH_C32(0x98560000), SPH_C32(0x9565670e), SPH_C32(0x4e88c8ea), - SPH_C32(0xd3dd4944), SPH_C32(0x161ddab9), SPH_C32(0x30b70000), - SPH_C32(0xe5d00000), SPH_C32(0xf4f46000), SPH_C32(0x42c40000), - SPH_C32(0x63b83d6a), SPH_C32(0x78ba9460), SPH_C32(0x21afa1ea), - SPH_C32(0xb0a51834) }, - { SPH_C32(0x30b70000), SPH_C32(0xe5d00000), SPH_C32(0xf4f46000), - SPH_C32(0x42c40000), SPH_C32(0x63b83d6a), SPH_C32(0x78ba9460), - SPH_C32(0x21afa1ea), SPH_C32(0xb0a51834), SPH_C32(0xb6ce0000), - SPH_C32(0xdae90002), SPH_C32(0x156e8000), SPH_C32(0xda920000), - SPH_C32(0xf6dd5a64), SPH_C32(0x36325c8a), SPH_C32(0xf272e8ae), - SPH_C32(0xa6b8c28d) }, - { 
SPH_C32(0x14190000), SPH_C32(0x23ca003c), SPH_C32(0x50df0000), - SPH_C32(0x44b60000), SPH_C32(0x1b6c67b0), SPH_C32(0x3cf3ac75), - SPH_C32(0x61e610b0), SPH_C32(0xdbcadb80), SPH_C32(0xe3430000), - SPH_C32(0x3a4e0014), SPH_C32(0xf2c60000), SPH_C32(0xaa4e0000), - SPH_C32(0xdb1e42a6), SPH_C32(0x256bbe15), SPH_C32(0x123db156), - SPH_C32(0x3a4e99d7) }, - { SPH_C32(0xe3430000), SPH_C32(0x3a4e0014), SPH_C32(0xf2c60000), - SPH_C32(0xaa4e0000), SPH_C32(0xdb1e42a6), SPH_C32(0x256bbe15), - SPH_C32(0x123db156), SPH_C32(0x3a4e99d7), SPH_C32(0xf75a0000), - SPH_C32(0x19840028), SPH_C32(0xa2190000), SPH_C32(0xeef80000), - SPH_C32(0xc0722516), SPH_C32(0x19981260), SPH_C32(0x73dba1e6), - SPH_C32(0xe1844257) }, - { SPH_C32(0x54500000), SPH_C32(0x0671005c), SPH_C32(0x25ae0000), - SPH_C32(0x6a1e0000), SPH_C32(0x2ea54edf), SPH_C32(0x664e8512), - SPH_C32(0xbfba18c3), SPH_C32(0x7e715d17), SPH_C32(0xbc8d0000), - SPH_C32(0xfc3b0018), SPH_C32(0x19830000), SPH_C32(0xd10b0000), - SPH_C32(0xae1878c4), SPH_C32(0x42a69856), SPH_C32(0x0012da37), - SPH_C32(0x2c3b504e) }, - { SPH_C32(0xbc8d0000), SPH_C32(0xfc3b0018), SPH_C32(0x19830000), - SPH_C32(0xd10b0000), SPH_C32(0xae1878c4), SPH_C32(0x42a69856), - SPH_C32(0x0012da37), SPH_C32(0x2c3b504e), SPH_C32(0xe8dd0000), - SPH_C32(0xfa4a0044), SPH_C32(0x3c2d0000), SPH_C32(0xbb150000), - SPH_C32(0x80bd361b), SPH_C32(0x24e81d44), SPH_C32(0xbfa8c2f4), - SPH_C32(0x524a0d59) }, - { SPH_C32(0x69510000), SPH_C32(0xd4e1009c), SPH_C32(0xc3230000), - SPH_C32(0xac2f0000), SPH_C32(0xe4950bae), SPH_C32(0xcea415dc), - SPH_C32(0x87ec287c), SPH_C32(0xbce1a3ce), SPH_C32(0xc6730000), - SPH_C32(0xaf8d000c), SPH_C32(0xa4c10000), SPH_C32(0x218d0000), - SPH_C32(0x23111587), SPH_C32(0x7913512f), SPH_C32(0x1d28ac88), - SPH_C32(0x378dd173) }, - { SPH_C32(0xc6730000), SPH_C32(0xaf8d000c), SPH_C32(0xa4c10000), - SPH_C32(0x218d0000), SPH_C32(0x23111587), SPH_C32(0x7913512f), - SPH_C32(0x1d28ac88), SPH_C32(0x378dd173), SPH_C32(0xaf220000), - SPH_C32(0x7b6c0090), SPH_C32(0x67e20000), SPH_C32(0x8da20000), - SPH_C32(0xc7841e29), SPH_C32(0xb7b744f3), SPH_C32(0x9ac484f4), - SPH_C32(0x8b6c72bd) }, - { SPH_C32(0xcc140000), SPH_C32(0xa5630000), SPH_C32(0x5ab90780), - SPH_C32(0x3b500000), SPH_C32(0x4bd013ff), SPH_C32(0x879b3418), - SPH_C32(0x694348c1), SPH_C32(0xca5a87fe), SPH_C32(0x819e0000), - SPH_C32(0xec570000), SPH_C32(0x66320280), SPH_C32(0x95f30000), - SPH_C32(0x5da92802), SPH_C32(0x48f43cbc), SPH_C32(0xe65aa22d), - SPH_C32(0x8e67b7fa) }, - { SPH_C32(0x819e0000), SPH_C32(0xec570000), SPH_C32(0x66320280), - SPH_C32(0x95f30000), SPH_C32(0x5da92802), SPH_C32(0x48f43cbc), - SPH_C32(0xe65aa22d), SPH_C32(0x8e67b7fa), SPH_C32(0x4d8a0000), - SPH_C32(0x49340000), SPH_C32(0x3c8b0500), SPH_C32(0xaea30000), - SPH_C32(0x16793bfd), SPH_C32(0xcf6f08a4), SPH_C32(0x8f19eaec), - SPH_C32(0x443d3004) }, - { SPH_C32(0x78230000), SPH_C32(0x12fc0000), SPH_C32(0xa93a0b80), - SPH_C32(0x90a50000), SPH_C32(0x713e2879), SPH_C32(0x7ee98924), - SPH_C32(0xf08ca062), SPH_C32(0x636f8bab), SPH_C32(0x02af0000), - SPH_C32(0xb7280000), SPH_C32(0xba1c0300), SPH_C32(0x56980000), - SPH_C32(0xba8d45d3), SPH_C32(0x8048c667), SPH_C32(0xa95c149a), - SPH_C32(0xf4f6ea7b) }, - { SPH_C32(0x02af0000), SPH_C32(0xb7280000), SPH_C32(0xba1c0300), - SPH_C32(0x56980000), SPH_C32(0xba8d45d3), SPH_C32(0x8048c667), - SPH_C32(0xa95c149a), SPH_C32(0xf4f6ea7b), SPH_C32(0x7a8c0000), - SPH_C32(0xa5d40000), SPH_C32(0x13260880), SPH_C32(0xc63d0000), - SPH_C32(0xcbb36daa), SPH_C32(0xfea14f43), SPH_C32(0x59d0b4f8), - SPH_C32(0x979961d0) }, - { SPH_C32(0xac480000), 
SPH_C32(0x1ba60000), SPH_C32(0x45fb1380), - SPH_C32(0x03430000), SPH_C32(0x5a85316a), SPH_C32(0x1fb250b6), - SPH_C32(0xfe72c7fe), SPH_C32(0x91e478f6), SPH_C32(0x1e4e0000), - SPH_C32(0xdecf0000), SPH_C32(0x6df80180), SPH_C32(0x77240000), - SPH_C32(0xec47079e), SPH_C32(0xf4a0694e), SPH_C32(0xcda31812), - SPH_C32(0x98aa496e) }, - { SPH_C32(0x1e4e0000), SPH_C32(0xdecf0000), SPH_C32(0x6df80180), - SPH_C32(0x77240000), SPH_C32(0xec47079e), SPH_C32(0xf4a0694e), - SPH_C32(0xcda31812), SPH_C32(0x98aa496e), SPH_C32(0xb2060000), - SPH_C32(0xc5690000), SPH_C32(0x28031200), SPH_C32(0x74670000), - SPH_C32(0xb6c236f4), SPH_C32(0xeb1239f8), SPH_C32(0x33d1dfec), - SPH_C32(0x094e3198) }, - { SPH_C32(0xaec30000), SPH_C32(0x9c4f0001), SPH_C32(0x79d1e000), - SPH_C32(0x2c150000), SPH_C32(0x45cc75b3), SPH_C32(0x6650b736), - SPH_C32(0xab92f78f), SPH_C32(0xa312567b), SPH_C32(0xdb250000), - SPH_C32(0x09290000), SPH_C32(0x49aac000), SPH_C32(0x81e10000), - SPH_C32(0xcafe6b59), SPH_C32(0x42793431), SPH_C32(0x43566b76), - SPH_C32(0xe86cba2e) }, - { SPH_C32(0xdb250000), SPH_C32(0x09290000), SPH_C32(0x49aac000), - SPH_C32(0x81e10000), SPH_C32(0xcafe6b59), SPH_C32(0x42793431), - SPH_C32(0x43566b76), SPH_C32(0xe86cba2e), SPH_C32(0x75e60000), - SPH_C32(0x95660001), SPH_C32(0x307b2000), SPH_C32(0xadf40000), - SPH_C32(0x8f321eea), SPH_C32(0x24298307), SPH_C32(0xe8c49cf9), - SPH_C32(0x4b7eec55) }, - { SPH_C32(0x58430000), SPH_C32(0x807e0000), SPH_C32(0x78330001), - SPH_C32(0xc66b3800), SPH_C32(0xe7375cdc), SPH_C32(0x79ad3fdd), - SPH_C32(0xac73fe6f), SPH_C32(0x3a4479b1), SPH_C32(0x1d5a0000), - SPH_C32(0x2b720000), SPH_C32(0x488d0000), SPH_C32(0xaf611800), - SPH_C32(0x25cb2ec5), SPH_C32(0xc879bfd0), SPH_C32(0x81a20429), - SPH_C32(0x1e7536a6) }, - { SPH_C32(0x1d5a0000), SPH_C32(0x2b720000), SPH_C32(0x488d0000), - SPH_C32(0xaf611800), SPH_C32(0x25cb2ec5), SPH_C32(0xc879bfd0), - SPH_C32(0x81a20429), SPH_C32(0x1e7536a6), SPH_C32(0x45190000), - SPH_C32(0xab0c0000), SPH_C32(0x30be0001), SPH_C32(0x690a2000), - SPH_C32(0xc2fc7219), SPH_C32(0xb1d4800d), SPH_C32(0x2dd1fa46), - SPH_C32(0x24314f17) }, - { SPH_C32(0xa53b0000), SPH_C32(0x14260000), SPH_C32(0x4e30001e), - SPH_C32(0x7cae0000), SPH_C32(0x8f9e0dd5), SPH_C32(0x78dfaa3d), - SPH_C32(0xf73168d8), SPH_C32(0x0b1b4946), SPH_C32(0x07ed0000), - SPH_C32(0xb2500000), SPH_C32(0x8774000a), SPH_C32(0x970d0000), - SPH_C32(0x437223ae), SPH_C32(0x48c76ea4), SPH_C32(0xf4786222), - SPH_C32(0x9075b1ce) }, - { SPH_C32(0x07ed0000), SPH_C32(0xb2500000), SPH_C32(0x8774000a), - SPH_C32(0x970d0000), SPH_C32(0x437223ae), SPH_C32(0x48c76ea4), - SPH_C32(0xf4786222), SPH_C32(0x9075b1ce), SPH_C32(0xa2d60000), - SPH_C32(0xa6760000), SPH_C32(0xc9440014), SPH_C32(0xeba30000), - SPH_C32(0xccec2e7b), SPH_C32(0x3018c499), SPH_C32(0x03490afa), - SPH_C32(0x9b6ef888) }, - { SPH_C32(0x88980000), SPH_C32(0x1f940000), SPH_C32(0x7fcf002e), - SPH_C32(0xfb4e0000), SPH_C32(0xf158079a), SPH_C32(0x61ae9167), - SPH_C32(0xa895706c), SPH_C32(0xe6107494), SPH_C32(0x0bc20000), - SPH_C32(0xdb630000), SPH_C32(0x7e88000c), SPH_C32(0x15860000), - SPH_C32(0x91fd48f3), SPH_C32(0x7581bb43), SPH_C32(0xf460449e), - SPH_C32(0xd8b61463) }, - { SPH_C32(0x0bc20000), SPH_C32(0xdb630000), SPH_C32(0x7e88000c), - SPH_C32(0x15860000), SPH_C32(0x91fd48f3), SPH_C32(0x7581bb43), - SPH_C32(0xf460449e), SPH_C32(0xd8b61463), SPH_C32(0x835a0000), - SPH_C32(0xc4f70000), SPH_C32(0x01470022), SPH_C32(0xeec80000), - SPH_C32(0x60a54f69), SPH_C32(0x142f2a24), SPH_C32(0x5cf534f2), - SPH_C32(0x3ea660f7) }, - { SPH_C32(0x52500000), SPH_C32(0x29540000), 
SPH_C32(0x6a61004e), - SPH_C32(0xf0ff0000), SPH_C32(0x9a317eec), SPH_C32(0x452341ce), - SPH_C32(0xcf568fe5), SPH_C32(0x5303130f), SPH_C32(0x538d0000), - SPH_C32(0xa9fc0000), SPH_C32(0x9ef70006), SPH_C32(0x56ff0000), - SPH_C32(0x0ae4004e), SPH_C32(0x92c5cdf9), SPH_C32(0xa9444018), - SPH_C32(0x7f975691) }, - { SPH_C32(0x538d0000), SPH_C32(0xa9fc0000), SPH_C32(0x9ef70006), - SPH_C32(0x56ff0000), SPH_C32(0x0ae4004e), SPH_C32(0x92c5cdf9), - SPH_C32(0xa9444018), SPH_C32(0x7f975691), SPH_C32(0x01dd0000), - SPH_C32(0x80a80000), SPH_C32(0xf4960048), SPH_C32(0xa6000000), - SPH_C32(0x90d57ea2), SPH_C32(0xd7e68c37), SPH_C32(0x6612cffd), - SPH_C32(0x2c94459e) }, - { SPH_C32(0xe6280000), SPH_C32(0x4c4b0000), SPH_C32(0xa8550000), - SPH_C32(0xd3d002e0), SPH_C32(0xd86130b8), SPH_C32(0x98a7b0da), - SPH_C32(0x289506b4), SPH_C32(0xd75a4897), SPH_C32(0xf0c50000), - SPH_C32(0x59230000), SPH_C32(0x45820000), SPH_C32(0xe18d00c0), - SPH_C32(0x3b6d0631), SPH_C32(0xc2ed5699), SPH_C32(0xcbe0fe1c), - SPH_C32(0x56a7b19f) }, - { SPH_C32(0xf0c50000), SPH_C32(0x59230000), SPH_C32(0x45820000), - SPH_C32(0xe18d00c0), SPH_C32(0x3b6d0631), SPH_C32(0xc2ed5699), - SPH_C32(0xcbe0fe1c), SPH_C32(0x56a7b19f), SPH_C32(0x16ed0000), - SPH_C32(0x15680000), SPH_C32(0xedd70000), SPH_C32(0x325d0220), - SPH_C32(0xe30c3689), SPH_C32(0x5a4ae643), SPH_C32(0xe375f8a8), - SPH_C32(0x81fdf908) }, - { SPH_C32(0xb4310000), SPH_C32(0x77330000), SPH_C32(0xb15d0000), - SPH_C32(0x7fd004e0), SPH_C32(0x78a26138), SPH_C32(0xd116c35d), - SPH_C32(0xd256d489), SPH_C32(0x4e6f74de), SPH_C32(0xe3060000), - SPH_C32(0xbdc10000), SPH_C32(0x87130000), SPH_C32(0xbff20060), - SPH_C32(0x2eba0a1a), SPH_C32(0x8db53751), SPH_C32(0x73c5ab06), - SPH_C32(0x5bd61539) }, - { SPH_C32(0xe3060000), SPH_C32(0xbdc10000), SPH_C32(0x87130000), - SPH_C32(0xbff20060), SPH_C32(0x2eba0a1a), SPH_C32(0x8db53751), - SPH_C32(0x73c5ab06), SPH_C32(0x5bd61539), SPH_C32(0x57370000), - SPH_C32(0xcaf20000), SPH_C32(0x364e0000), SPH_C32(0xc0220480), - SPH_C32(0x56186b22), SPH_C32(0x5ca3f40c), SPH_C32(0xa1937f8f), - SPH_C32(0x15b961e7) }, - { SPH_C32(0x02f20000), SPH_C32(0xa2810000), SPH_C32(0x873f0000), - SPH_C32(0xe36c7800), SPH_C32(0x1e1d74ef), SPH_C32(0x073d2bd6), - SPH_C32(0xc4c23237), SPH_C32(0x7f32259e), SPH_C32(0xbadd0000), - SPH_C32(0x13ad0000), SPH_C32(0xb7e70000), SPH_C32(0xf7282800), - SPH_C32(0xdf45144d), SPH_C32(0x361ac33a), SPH_C32(0xea5a8d14), - SPH_C32(0x2a2c18f0) }, - { SPH_C32(0xbadd0000), SPH_C32(0x13ad0000), SPH_C32(0xb7e70000), - SPH_C32(0xf7282800), SPH_C32(0xdf45144d), SPH_C32(0x361ac33a), - SPH_C32(0xea5a8d14), SPH_C32(0x2a2c18f0), SPH_C32(0xb82f0000), - SPH_C32(0xb12c0000), SPH_C32(0x30d80000), SPH_C32(0x14445000), - SPH_C32(0xc15860a2), SPH_C32(0x3127e8ec), SPH_C32(0x2e98bf23), - SPH_C32(0x551e3d6e) }, - { SPH_C32(0x1e6c0000), SPH_C32(0xc4420000), SPH_C32(0x8a2e0000), - SPH_C32(0xbcb6b800), SPH_C32(0x2c4413b6), SPH_C32(0x8bfdd3da), - SPH_C32(0x6a0c1bc8), SPH_C32(0xb99dc2eb), SPH_C32(0x92560000), - SPH_C32(0x1eda0000), SPH_C32(0xea510000), SPH_C32(0xe8b13000), - SPH_C32(0xa93556a5), SPH_C32(0xebfb6199), SPH_C32(0xb15c2254), - SPH_C32(0x33c5244f) }, - { SPH_C32(0x92560000), SPH_C32(0x1eda0000), SPH_C32(0xea510000), - SPH_C32(0xe8b13000), SPH_C32(0xa93556a5), SPH_C32(0xebfb6199), - SPH_C32(0xb15c2254), SPH_C32(0x33c5244f), SPH_C32(0x8c3a0000), - SPH_C32(0xda980000), SPH_C32(0x607f0000), SPH_C32(0x54078800), - SPH_C32(0x85714513), SPH_C32(0x6006b243), SPH_C32(0xdb50399c), - SPH_C32(0x8a58e6a4) }, - { SPH_C32(0x033d0000), SPH_C32(0x08b30000), SPH_C32(0xf33a0000), - 
SPH_C32(0x3ac20007), SPH_C32(0x51298a50), SPH_C32(0x6b6e661f), - SPH_C32(0x0ea5cfe3), SPH_C32(0xe6da7ffe), SPH_C32(0xa8da0000), - SPH_C32(0x96be0000), SPH_C32(0x5c1d0000), SPH_C32(0x07da0002), - SPH_C32(0x7d669583), SPH_C32(0x1f98708a), SPH_C32(0xbb668808), - SPH_C32(0xda878000) }, - { SPH_C32(0xa8da0000), SPH_C32(0x96be0000), SPH_C32(0x5c1d0000), - SPH_C32(0x07da0002), SPH_C32(0x7d669583), SPH_C32(0x1f98708a), - SPH_C32(0xbb668808), SPH_C32(0xda878000), SPH_C32(0xabe70000), - SPH_C32(0x9e0d0000), SPH_C32(0xaf270000), SPH_C32(0x3d180005), - SPH_C32(0x2c4f1fd3), SPH_C32(0x74f61695), SPH_C32(0xb5c347eb), - SPH_C32(0x3c5dfffe) }, - { SPH_C32(0x01930000), SPH_C32(0xe7820000), SPH_C32(0xedfb0000), - SPH_C32(0xcf0c000b), SPH_C32(0x8dd08d58), SPH_C32(0xbca3b42e), - SPH_C32(0x063661e1), SPH_C32(0x536f9e7b), SPH_C32(0x92280000), - SPH_C32(0xdc850000), SPH_C32(0x57fa0000), SPH_C32(0x56dc0003), - SPH_C32(0xbae92316), SPH_C32(0x5aefa30c), SPH_C32(0x90cef752), - SPH_C32(0x7b1675d7) }, - { SPH_C32(0x92280000), SPH_C32(0xdc850000), SPH_C32(0x57fa0000), - SPH_C32(0x56dc0003), SPH_C32(0xbae92316), SPH_C32(0x5aefa30c), - SPH_C32(0x90cef752), SPH_C32(0x7b1675d7), SPH_C32(0x93bb0000), - SPH_C32(0x3b070000), SPH_C32(0xba010000), SPH_C32(0x99d00008), - SPH_C32(0x3739ae4e), SPH_C32(0xe64c1722), SPH_C32(0x96f896b3), - SPH_C32(0x2879ebac) }, - { SPH_C32(0x5fa80000), SPH_C32(0x56030000), SPH_C32(0x43ae0000), - SPH_C32(0x64f30013), SPH_C32(0x257e86bf), SPH_C32(0x1311944e), - SPH_C32(0x541e95bf), SPH_C32(0x8ea4db69), SPH_C32(0x00440000), - SPH_C32(0x7f480000), SPH_C32(0xda7c0000), SPH_C32(0x2a230001), - SPH_C32(0x3badc9cc), SPH_C32(0xa9b69c87), SPH_C32(0x030a9e60), - SPH_C32(0xbe0a679e) }, - { SPH_C32(0x00440000), SPH_C32(0x7f480000), SPH_C32(0xda7c0000), - SPH_C32(0x2a230001), SPH_C32(0x3badc9cc), SPH_C32(0xa9b69c87), - SPH_C32(0x030a9e60), SPH_C32(0xbe0a679e), SPH_C32(0x5fec0000), - SPH_C32(0x294b0000), SPH_C32(0x99d20000), SPH_C32(0x4ed00012), - SPH_C32(0x1ed34f73), SPH_C32(0xbaa708c9), SPH_C32(0x57140bdf), - SPH_C32(0x30aebcf7) }, - { SPH_C32(0xee930000), SPH_C32(0xd6070000), SPH_C32(0x92c10000), - SPH_C32(0x2b9801e0), SPH_C32(0x9451287c), SPH_C32(0x3b6cfb57), - SPH_C32(0x45312374), SPH_C32(0x201f6a64), SPH_C32(0x7b280000), - SPH_C32(0x57420000), SPH_C32(0xa9e50000), SPH_C32(0x634300a0), - SPH_C32(0x9edb442f), SPH_C32(0x6d9995bb), SPH_C32(0x27f83b03), - SPH_C32(0xc7ff60f0) }, - { SPH_C32(0x7b280000), SPH_C32(0x57420000), SPH_C32(0xa9e50000), - SPH_C32(0x634300a0), SPH_C32(0x9edb442f), SPH_C32(0x6d9995bb), - SPH_C32(0x27f83b03), SPH_C32(0xc7ff60f0), SPH_C32(0x95bb0000), - SPH_C32(0x81450000), SPH_C32(0x3b240000), SPH_C32(0x48db0140), - SPH_C32(0x0a8a6c53), SPH_C32(0x56f56eec), SPH_C32(0x62c91877), - SPH_C32(0xe7e00a94) } +static const uint32_t T512[64][16] = { + { 0xef0b0270, 0x3afd0000, 0x5dae0000, 0x69490000, + 0x9b0f3c06, 0x4405b5f9, 0x66140a51, 0x924f5d0a, + 0xc96b0030, 0xe7250000, 0x2f840000, 0x264f0000, + 0x08695bf9, 0x6dfcf137, 0x509f6984, 0x9e69af68 }, + { 0xc96b0030, 0xe7250000, 0x2f840000, 0x264f0000, + 0x08695bf9, 0x6dfcf137, 0x509f6984, 0x9e69af68, + 0x26600240, 0xddd80000, 0x722a0000, 0x4f060000, + 0x936667ff, 0x29f944ce, 0x368b63d5, 0x0c26f262 }, + { 0x145a3c00, 0xb9e90000, 0x61270000, 0xf1610000, + 0xce613d6c, 0xb0493d78, 0x47a96720, 0xe18e24c5, + 0x23671400, 0xc8b90000, 0xf4c70000, 0xfb750000, + 0x73cd2465, 0xf8a6a549, 0x02c40a3f, 0xdc24e61f }, + { 0x23671400, 0xc8b90000, 0xf4c70000, 0xfb750000, + 0x73cd2465, 0xf8a6a549, 0x02c40a3f, 0xdc24e61f, + 0x373d2800, 0x71500000, 0x95e00000, 0x0a140000, + 
0xbdac1909, 0x48ef9831, 0x456d6d1f, 0x3daac2da }, + { 0x54285c00, 0xeaed0000, 0xc5d60000, 0xa1c50000, + 0xb3a26770, 0x94a5c4e1, 0x6bb0419d, 0x551b3782, + 0x9cbb1800, 0xb0d30000, 0x92510000, 0xed930000, + 0x593a4345, 0xe114d5f4, 0x430633da, 0x78cace29 }, + { 0x9cbb1800, 0xb0d30000, 0x92510000, 0xed930000, + 0x593a4345, 0xe114d5f4, 0x430633da, 0x78cace29, + 0xc8934400, 0x5a3e0000, 0x57870000, 0x4c560000, + 0xea982435, 0x75b11115, 0x28b67247, 0x2dd1f9ab }, + { 0x29449c00, 0x64e70000, 0xf24b0000, 0xc2f30000, + 0x0ede4e8f, 0x56c23745, 0xf3e04259, 0x8d0d9ec4, + 0x466d0c00, 0x08620000, 0xdd5d0000, 0xbadd0000, + 0x6a927942, 0x441f2b93, 0x218ace6f, 0xbf2c0be2 }, + { 0x466d0c00, 0x08620000, 0xdd5d0000, 0xbadd0000, + 0x6a927942, 0x441f2b93, 0x218ace6f, 0xbf2c0be2, + 0x6f299000, 0x6c850000, 0x2f160000, 0x782e0000, + 0x644c37cd, 0x12dd1cd6, 0xd26a8c36, 0x32219526 }, + { 0xf6800005, 0x3443c000, 0x24070000, 0x8f3d0000, + 0x21373bfb, 0x0ab8d5ae, 0xcdc58b19, 0xd795ba31, + 0xa67f0001, 0x71378000, 0x19fc0000, 0x96db0000, + 0x3a8b6dfd, 0xebcaaef3, 0x2c6d478f, 0xac8e6c88 }, + { 0xa67f0001, 0x71378000, 0x19fc0000, 0x96db0000, + 0x3a8b6dfd, 0xebcaaef3, 0x2c6d478f, 0xac8e6c88, + 0x50ff0004, 0x45744000, 0x3dfb0000, 0x19e60000, + 0x1bbc5606, 0xe1727b5d, 0xe1a8cc96, 0x7b1bd6b9 }, + { 0xf7750009, 0xcf3cc000, 0xc3d60000, 0x04920000, + 0x029519a9, 0xf8e836ba, 0x7a87f14e, 0x9e16981a, + 0xd46a0000, 0x8dc8c000, 0xa5af0000, 0x4a290000, + 0xfc4e427a, 0xc9b4866c, 0x98369604, 0xf746c320 }, + { 0xd46a0000, 0x8dc8c000, 0xa5af0000, 0x4a290000, + 0xfc4e427a, 0xc9b4866c, 0x98369604, 0xf746c320, + 0x231f0009, 0x42f40000, 0x66790000, 0x4ebb0000, + 0xfedb5bd3, 0x315cb0d6, 0xe2b1674a, 0x69505b3a }, + { 0x774400f0, 0xf15a0000, 0xf5b20000, 0x34140000, + 0x89377e8c, 0x5a8bec25, 0x0bc3cd1e, 0xcf3775cb, + 0xf46c0050, 0x96180000, 0x14a50000, 0x031f0000, + 0x42947eb8, 0x66bf7e19, 0x9ca470d2, 0x8a341574 }, + { 0xf46c0050, 0x96180000, 0x14a50000, 0x031f0000, + 0x42947eb8, 0x66bf7e19, 0x9ca470d2, 0x8a341574, + 0x832800a0, 0x67420000, 0xe1170000, 0x370b0000, + 0xcba30034, 0x3c34923c, 0x9767bdcc, 0x450360bf }, + { 0xe8870170, 0x9d720000, 0x12db0000, 0xd4220000, + 0xf2886b27, 0xa921e543, 0x4ef8b518, 0x618813b1, + 0xb4370060, 0x0c4c0000, 0x56c20000, 0x5cae0000, + 0x94541f3f, 0x3b3ef825, 0x1b365f3d, 0xf3d45758 }, + { 0xb4370060, 0x0c4c0000, 0x56c20000, 0x5cae0000, + 0x94541f3f, 0x3b3ef825, 0x1b365f3d, 0xf3d45758, + 0x5cb00110, 0x913e0000, 0x44190000, 0x888c0000, + 0x66dc7418, 0x921f1d66, 0x55ceea25, 0x925c44e9 }, + { 0x0c720000, 0x49e50f00, 0x42790000, 0x5cea0000, + 0x33aa301a, 0x15822514, 0x95a34b7b, 0xb44b0090, + 0xfe220000, 0xa7580500, 0x25d10000, 0xf7600000, + 0x893178da, 0x1fd4f860, 0x4ed0a315, 0xa123ff9f }, + { 0xfe220000, 0xa7580500, 0x25d10000, 0xf7600000, + 0x893178da, 0x1fd4f860, 0x4ed0a315, 0xa123ff9f, + 0xf2500000, 0xeebd0a00, 0x67a80000, 0xab8a0000, + 0xba9b48c0, 0x0a56dd74, 0xdb73e86e, 0x1568ff0f }, + { 0x45180000, 0xa5b51700, 0xf96a0000, 0x3b480000, + 0x1ecc142c, 0x231395d6, 0x16bca6b0, 0xdf33f4df, + 0xb83d0000, 0x16710600, 0x379a0000, 0xf5b10000, + 0x228161ac, 0xae48f145, 0x66241616, 0xc5c1eb3e }, + { 0xb83d0000, 0x16710600, 0x379a0000, 0xf5b10000, + 0x228161ac, 0xae48f145, 0x66241616, 0xc5c1eb3e, + 0xfd250000, 0xb3c41100, 0xcef00000, 0xcef90000, + 0x3c4d7580, 0x8d5b6493, 0x7098b0a6, 0x1af21fe1 }, + { 0x75a40000, 0xc28b2700, 0x94a40000, 0x90f50000, + 0xfb7857e0, 0x49ce0bae, 0x1767c483, 0xaedf667e, + 0xd1660000, 0x1bbc0300, 0x9eec0000, 0xf6940000, + 0x03024527, 0xcf70fcf2, 0xb4431b17, 0x857f3c2b }, + { 0xd1660000, 0x1bbc0300, 
0x9eec0000, 0xf6940000, + 0x03024527, 0xcf70fcf2, 0xb4431b17, 0x857f3c2b, + 0xa4c20000, 0xd9372400, 0x0a480000, 0x66610000, + 0xf87a12c7, 0x86bef75c, 0xa324df94, 0x2ba05a55 }, + { 0x75c90003, 0x0e10c000, 0xd1200000, 0xbaea0000, + 0x8bc42f3e, 0x8758b757, 0xbb28761d, 0x00b72e2b, + 0xeecf0001, 0x6f564000, 0xf33e0000, 0xa79e0000, + 0xbdb57219, 0xb711ebc5, 0x4a3b40ba, 0xfeabf254 }, + { 0xeecf0001, 0x6f564000, 0xf33e0000, 0xa79e0000, + 0xbdb57219, 0xb711ebc5, 0x4a3b40ba, 0xfeabf254, + 0x9b060002, 0x61468000, 0x221e0000, 0x1d740000, + 0x36715d27, 0x30495c92, 0xf11336a7, 0xfe1cdc7f }, + { 0x86790000, 0x3f390002, 0xe19ae000, 0x98560000, + 0x9565670e, 0x4e88c8ea, 0xd3dd4944, 0x161ddab9, + 0x30b70000, 0xe5d00000, 0xf4f46000, 0x42c40000, + 0x63b83d6a, 0x78ba9460, 0x21afa1ea, 0xb0a51834 }, + { 0x30b70000, 0xe5d00000, 0xf4f46000, 0x42c40000, + 0x63b83d6a, 0x78ba9460, 0x21afa1ea, 0xb0a51834, + 0xb6ce0000, 0xdae90002, 0x156e8000, 0xda920000, + 0xf6dd5a64, 0x36325c8a, 0xf272e8ae, 0xa6b8c28d }, + { 0x14190000, 0x23ca003c, 0x50df0000, 0x44b60000, + 0x1b6c67b0, 0x3cf3ac75, 0x61e610b0, 0xdbcadb80, + 0xe3430000, 0x3a4e0014, 0xf2c60000, 0xaa4e0000, + 0xdb1e42a6, 0x256bbe15, 0x123db156, 0x3a4e99d7 }, + { 0xe3430000, 0x3a4e0014, 0xf2c60000, 0xaa4e0000, + 0xdb1e42a6, 0x256bbe15, 0x123db156, 0x3a4e99d7, + 0xf75a0000, 0x19840028, 0xa2190000, 0xeef80000, + 0xc0722516, 0x19981260, 0x73dba1e6, 0xe1844257 }, + { 0x54500000, 0x0671005c, 0x25ae0000, 0x6a1e0000, + 0x2ea54edf, 0x664e8512, 0xbfba18c3, 0x7e715d17, + 0xbc8d0000, 0xfc3b0018, 0x19830000, 0xd10b0000, + 0xae1878c4, 0x42a69856, 0x0012da37, 0x2c3b504e }, + { 0xbc8d0000, 0xfc3b0018, 0x19830000, 0xd10b0000, + 0xae1878c4, 0x42a69856, 0x0012da37, 0x2c3b504e, + 0xe8dd0000, 0xfa4a0044, 0x3c2d0000, 0xbb150000, + 0x80bd361b, 0x24e81d44, 0xbfa8c2f4, 0x524a0d59 }, + { 0x69510000, 0xd4e1009c, 0xc3230000, 0xac2f0000, + 0xe4950bae, 0xcea415dc, 0x87ec287c, 0xbce1a3ce, + 0xc6730000, 0xaf8d000c, 0xa4c10000, 0x218d0000, + 0x23111587, 0x7913512f, 0x1d28ac88, 0x378dd173 }, + { 0xc6730000, 0xaf8d000c, 0xa4c10000, 0x218d0000, + 0x23111587, 0x7913512f, 0x1d28ac88, 0x378dd173, + 0xaf220000, 0x7b6c0090, 0x67e20000, 0x8da20000, + 0xc7841e29, 0xb7b744f3, 0x9ac484f4, 0x8b6c72bd }, + { 0xcc140000, 0xa5630000, 0x5ab90780, 0x3b500000, + 0x4bd013ff, 0x879b3418, 0x694348c1, 0xca5a87fe, + 0x819e0000, 0xec570000, 0x66320280, 0x95f30000, + 0x5da92802, 0x48f43cbc, 0xe65aa22d, 0x8e67b7fa }, + { 0x819e0000, 0xec570000, 0x66320280, 0x95f30000, + 0x5da92802, 0x48f43cbc, 0xe65aa22d, 0x8e67b7fa, + 0x4d8a0000, 0x49340000, 0x3c8b0500, 0xaea30000, + 0x16793bfd, 0xcf6f08a4, 0x8f19eaec, 0x443d3004 }, + { 0x78230000, 0x12fc0000, 0xa93a0b80, 0x90a50000, + 0x713e2879, 0x7ee98924, 0xf08ca062, 0x636f8bab, + 0x02af0000, 0xb7280000, 0xba1c0300, 0x56980000, + 0xba8d45d3, 0x8048c667, 0xa95c149a, 0xf4f6ea7b }, + { 0x02af0000, 0xb7280000, 0xba1c0300, 0x56980000, + 0xba8d45d3, 0x8048c667, 0xa95c149a, 0xf4f6ea7b, + 0x7a8c0000, 0xa5d40000, 0x13260880, 0xc63d0000, + 0xcbb36daa, 0xfea14f43, 0x59d0b4f8, 0x979961d0 }, + { 0xac480000, 0x1ba60000, 0x45fb1380, 0x03430000, + 0x5a85316a, 0x1fb250b6, 0xfe72c7fe, 0x91e478f6, + 0x1e4e0000, 0xdecf0000, 0x6df80180, 0x77240000, + 0xec47079e, 0xf4a0694e, 0xcda31812, 0x98aa496e }, + { 0x1e4e0000, 0xdecf0000, 0x6df80180, 0x77240000, + 0xec47079e, 0xf4a0694e, 0xcda31812, 0x98aa496e, + 0xb2060000, 0xc5690000, 0x28031200, 0x74670000, + 0xb6c236f4, 0xeb1239f8, 0x33d1dfec, 0x094e3198 }, + { 0xaec30000, 0x9c4f0001, 0x79d1e000, 0x2c150000, + 0x45cc75b3, 0x6650b736, 0xab92f78f, 0xa312567b, + 
0xdb250000, 0x09290000, 0x49aac000, 0x81e10000, + 0xcafe6b59, 0x42793431, 0x43566b76, 0xe86cba2e }, + { 0xdb250000, 0x09290000, 0x49aac000, 0x81e10000, + 0xcafe6b59, 0x42793431, 0x43566b76, 0xe86cba2e, + 0x75e60000, 0x95660001, 0x307b2000, 0xadf40000, + 0x8f321eea, 0x24298307, 0xe8c49cf9, 0x4b7eec55 }, + { 0x58430000, 0x807e0000, 0x78330001, 0xc66b3800, + 0xe7375cdc, 0x79ad3fdd, 0xac73fe6f, 0x3a4479b1, + 0x1d5a0000, 0x2b720000, 0x488d0000, 0xaf611800, + 0x25cb2ec5, 0xc879bfd0, 0x81a20429, 0x1e7536a6 }, + { 0x1d5a0000, 0x2b720000, 0x488d0000, 0xaf611800, + 0x25cb2ec5, 0xc879bfd0, 0x81a20429, 0x1e7536a6, + 0x45190000, 0xab0c0000, 0x30be0001, 0x690a2000, + 0xc2fc7219, 0xb1d4800d, 0x2dd1fa46, 0x24314f17 }, + { 0xa53b0000, 0x14260000, 0x4e30001e, 0x7cae0000, + 0x8f9e0dd5, 0x78dfaa3d, 0xf73168d8, 0x0b1b4946, + 0x07ed0000, 0xb2500000, 0x8774000a, 0x970d0000, + 0x437223ae, 0x48c76ea4, 0xf4786222, 0x9075b1ce }, + { 0x07ed0000, 0xb2500000, 0x8774000a, 0x970d0000, + 0x437223ae, 0x48c76ea4, 0xf4786222, 0x9075b1ce, + 0xa2d60000, 0xa6760000, 0xc9440014, 0xeba30000, + 0xccec2e7b, 0x3018c499, 0x03490afa, 0x9b6ef888 }, + { 0x88980000, 0x1f940000, 0x7fcf002e, 0xfb4e0000, + 0xf158079a, 0x61ae9167, 0xa895706c, 0xe6107494, + 0x0bc20000, 0xdb630000, 0x7e88000c, 0x15860000, + 0x91fd48f3, 0x7581bb43, 0xf460449e, 0xd8b61463 }, + { 0x0bc20000, 0xdb630000, 0x7e88000c, 0x15860000, + 0x91fd48f3, 0x7581bb43, 0xf460449e, 0xd8b61463, + 0x835a0000, 0xc4f70000, 0x01470022, 0xeec80000, + 0x60a54f69, 0x142f2a24, 0x5cf534f2, 0x3ea660f7 }, + { 0x52500000, 0x29540000, 0x6a61004e, 0xf0ff0000, + 0x9a317eec, 0x452341ce, 0xcf568fe5, 0x5303130f, + 0x538d0000, 0xa9fc0000, 0x9ef70006, 0x56ff0000, + 0x0ae4004e, 0x92c5cdf9, 0xa9444018, 0x7f975691 }, + { 0x538d0000, 0xa9fc0000, 0x9ef70006, 0x56ff0000, + 0x0ae4004e, 0x92c5cdf9, 0xa9444018, 0x7f975691, + 0x01dd0000, 0x80a80000, 0xf4960048, 0xa6000000, + 0x90d57ea2, 0xd7e68c37, 0x6612cffd, 0x2c94459e }, + { 0xe6280000, 0x4c4b0000, 0xa8550000, 0xd3d002e0, + 0xd86130b8, 0x98a7b0da, 0x289506b4, 0xd75a4897, + 0xf0c50000, 0x59230000, 0x45820000, 0xe18d00c0, + 0x3b6d0631, 0xc2ed5699, 0xcbe0fe1c, 0x56a7b19f }, + { 0xf0c50000, 0x59230000, 0x45820000, 0xe18d00c0, + 0x3b6d0631, 0xc2ed5699, 0xcbe0fe1c, 0x56a7b19f, + 0x16ed0000, 0x15680000, 0xedd70000, 0x325d0220, + 0xe30c3689, 0x5a4ae643, 0xe375f8a8, 0x81fdf908 }, + { 0xb4310000, 0x77330000, 0xb15d0000, 0x7fd004e0, + 0x78a26138, 0xd116c35d, 0xd256d489, 0x4e6f74de, + 0xe3060000, 0xbdc10000, 0x87130000, 0xbff20060, + 0x2eba0a1a, 0x8db53751, 0x73c5ab06, 0x5bd61539 }, + { 0xe3060000, 0xbdc10000, 0x87130000, 0xbff20060, + 0x2eba0a1a, 0x8db53751, 0x73c5ab06, 0x5bd61539, + 0x57370000, 0xcaf20000, 0x364e0000, 0xc0220480, + 0x56186b22, 0x5ca3f40c, 0xa1937f8f, 0x15b961e7 }, + { 0x02f20000, 0xa2810000, 0x873f0000, 0xe36c7800, + 0x1e1d74ef, 0x073d2bd6, 0xc4c23237, 0x7f32259e, + 0xbadd0000, 0x13ad0000, 0xb7e70000, 0xf7282800, + 0xdf45144d, 0x361ac33a, 0xea5a8d14, 0x2a2c18f0 }, + { 0xbadd0000, 0x13ad0000, 0xb7e70000, 0xf7282800, + 0xdf45144d, 0x361ac33a, 0xea5a8d14, 0x2a2c18f0, + 0xb82f0000, 0xb12c0000, 0x30d80000, 0x14445000, + 0xc15860a2, 0x3127e8ec, 0x2e98bf23, 0x551e3d6e }, + { 0x1e6c0000, 0xc4420000, 0x8a2e0000, 0xbcb6b800, + 0x2c4413b6, 0x8bfdd3da, 0x6a0c1bc8, 0xb99dc2eb, + 0x92560000, 0x1eda0000, 0xea510000, 0xe8b13000, + 0xa93556a5, 0xebfb6199, 0xb15c2254, 0x33c5244f }, + { 0x92560000, 0x1eda0000, 0xea510000, 0xe8b13000, + 0xa93556a5, 0xebfb6199, 0xb15c2254, 0x33c5244f, + 0x8c3a0000, 0xda980000, 0x607f0000, 0x54078800, + 0x85714513, 0x6006b243, 0xdb50399c, 
0x8a58e6a4 }, + { 0x033d0000, 0x08b30000, 0xf33a0000, 0x3ac20007, + 0x51298a50, 0x6b6e661f, 0x0ea5cfe3, 0xe6da7ffe, + 0xa8da0000, 0x96be0000, 0x5c1d0000, 0x07da0002, + 0x7d669583, 0x1f98708a, 0xbb668808, 0xda878000 }, + { 0xa8da0000, 0x96be0000, 0x5c1d0000, 0x07da0002, + 0x7d669583, 0x1f98708a, 0xbb668808, 0xda878000, + 0xabe70000, 0x9e0d0000, 0xaf270000, 0x3d180005, + 0x2c4f1fd3, 0x74f61695, 0xb5c347eb, 0x3c5dfffe }, + { 0x01930000, 0xe7820000, 0xedfb0000, 0xcf0c000b, + 0x8dd08d58, 0xbca3b42e, 0x063661e1, 0x536f9e7b, + 0x92280000, 0xdc850000, 0x57fa0000, 0x56dc0003, + 0xbae92316, 0x5aefa30c, 0x90cef752, 0x7b1675d7 }, + { 0x92280000, 0xdc850000, 0x57fa0000, 0x56dc0003, + 0xbae92316, 0x5aefa30c, 0x90cef752, 0x7b1675d7, + 0x93bb0000, 0x3b070000, 0xba010000, 0x99d00008, + 0x3739ae4e, 0xe64c1722, 0x96f896b3, 0x2879ebac }, + { 0x5fa80000, 0x56030000, 0x43ae0000, 0x64f30013, + 0x257e86bf, 0x1311944e, 0x541e95bf, 0x8ea4db69, + 0x00440000, 0x7f480000, 0xda7c0000, 0x2a230001, + 0x3badc9cc, 0xa9b69c87, 0x030a9e60, 0xbe0a679e }, + { 0x00440000, 0x7f480000, 0xda7c0000, 0x2a230001, + 0x3badc9cc, 0xa9b69c87, 0x030a9e60, 0xbe0a679e, + 0x5fec0000, 0x294b0000, 0x99d20000, 0x4ed00012, + 0x1ed34f73, 0xbaa708c9, 0x57140bdf, 0x30aebcf7 }, + { 0xee930000, 0xd6070000, 0x92c10000, 0x2b9801e0, + 0x9451287c, 0x3b6cfb57, 0x45312374, 0x201f6a64, + 0x7b280000, 0x57420000, 0xa9e50000, 0x634300a0, + 0x9edb442f, 0x6d9995bb, 0x27f83b03, 0xc7ff60f0 }, + { 0x7b280000, 0x57420000, 0xa9e50000, 0x634300a0, + 0x9edb442f, 0x6d9995bb, 0x27f83b03, 0xc7ff60f0, + 0x95bb0000, 0x81450000, 0x3b240000, 0x48db0140, + 0x0a8a6c53, 0x56f56eec, 0x62c91877, 0xe7e00a94 } }; #define s0 m0 @@ -545,6 +350,39 @@ static const sph_u32 T512[64][16] = { #define sE c7 #define sF m7 +#define S00 M0 +#define S01 M1 +#define S02 C0 +#define S03 C1 +#define S04 M2 +#define S05 M3 +#define S06 C2 +#define S07 C3 +#define S08 C4 +#define S09 C5 +#define S0A M4 +#define S0B M5 +#define S0C C6 +#define S0D C7 +#define S0E M6 +#define S0F M7 +#define S10 M8 +#define S11 M9 +#define S12 C8 +#define S13 C9 +#define S14 MA +#define S15 MB +#define S16 CA +#define S17 CB +#define S18 CC +#define S19 CD +#define S1A MC +#define S1B MD +#define S1C CE +#define S1D CF +#define S1E ME +#define S1F MF + #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) // Hamsi 8 way AVX512 @@ -562,14 +400,14 @@ do { \ for ( int u = 0; u < 64; u++ ) \ { \ const __mmask8 dm = _mm512_cmplt_epi64_mask( db, zero ); \ - m0 = _mm512_mask_xor_epi64( m0, dm, m0, _mm512_set1_epi64( tp[0] ) ); \ - m1 = _mm512_mask_xor_epi64( m1, dm, m1, _mm512_set1_epi64( tp[1] ) ); \ - m2 = _mm512_mask_xor_epi64( m2, dm, m2, _mm512_set1_epi64( tp[2] ) ); \ - m3 = _mm512_mask_xor_epi64( m3, dm, m3, _mm512_set1_epi64( tp[3] ) ); \ - m4 = _mm512_mask_xor_epi64( m4, dm, m4, _mm512_set1_epi64( tp[4] ) ); \ - m5 = _mm512_mask_xor_epi64( m5, dm, m5, _mm512_set1_epi64( tp[5] ) ); \ - m6 = _mm512_mask_xor_epi64( m6, dm, m6, _mm512_set1_epi64( tp[6] ) ); \ - m7 = _mm512_mask_xor_epi64( m7, dm, m7, _mm512_set1_epi64( tp[7] ) ); \ + m0 = _mm512_mask_xor_epi64( m0, dm, m0, v512_64( tp[0] ) ); \ + m1 = _mm512_mask_xor_epi64( m1, dm, m1, v512_64( tp[1] ) ); \ + m2 = _mm512_mask_xor_epi64( m2, dm, m2, v512_64( tp[2] ) ); \ + m3 = _mm512_mask_xor_epi64( m3, dm, m3, v512_64( tp[3] ) ); \ + m4 = _mm512_mask_xor_epi64( m4, dm, m4, v512_64( tp[4] ) ); \ + m5 = _mm512_mask_xor_epi64( m5, dm, m5, v512_64( tp[5] ) ); \ + m6 = _mm512_mask_xor_epi64( m6, dm, m6, v512_64( tp[6] ) ); \ + m7 
= _mm512_mask_xor_epi64( m7, dm, m7, v512_64( tp[7] ) ); \ db = _mm512_ror_epi64( db, 1 ); \ tp += 8; \ } \ @@ -656,7 +494,6 @@ do { \ SBOX8( s1, s5, s9, sD ); /* ( c0, m2, c4, m6 ) */ \ SBOX8( s2, s6, sA, sE ); /* ( m1, c3, m5, c7 ) */ \ SBOX8( s3, s7, sB, sF ); /* ( c1, m3, c5, m7 ) */ \ -\ s4 = mm512_swap64_32( s4 ); \ s5 = mm512_swap64_32( s5 ); \ sD = mm512_swap64_32( sD ); \ @@ -664,7 +501,6 @@ do { \ t0 = _mm512_mask_blend_epi32( 0xaaaa, s4, s5 ); \ t1 = _mm512_mask_blend_epi32( 0xaaaa, sD, sE ); \ L8( s0, t0, s9, t1 ); \ -\ s6 = mm512_swap64_32( s6 ); \ sF = mm512_swap64_32( sF ); \ t2 = _mm512_mask_blend_epi32( 0xaaaa, s5, s6 ); \ @@ -733,17 +569,17 @@ do { \ __m512i alpha[16]; \ const uint64_t A0 = ( (uint64_t*)alpha_n )[0]; \ for( int i = 0; i < 16; i++ ) \ - alpha[i] = _mm512_set1_epi64( ( (uint64_t*)alpha_n )[i] ); \ + alpha[i] = v512_64( ( (uint64_t*)alpha_n )[i] ); \ ROUND_BIG8( alpha ); \ - alpha[0] = _mm512_set1_epi64( (1ULL << 32) ^ A0 ); \ + alpha[0] = v512_64( (1ULL << 32) ^ A0 ); \ ROUND_BIG8( alpha ); \ - alpha[0] = _mm512_set1_epi64( (2ULL << 32) ^ A0 ); \ + alpha[0] = v512_64( (2ULL << 32) ^ A0 ); \ ROUND_BIG8( alpha ); \ - alpha[0] = _mm512_set1_epi64( (3ULL << 32) ^ A0 ); \ + alpha[0] = v512_64( (3ULL << 32) ^ A0 ); \ ROUND_BIG8( alpha ); \ - alpha[0] = _mm512_set1_epi64( (4ULL << 32) ^ A0 ); \ + alpha[0] = v512_64( (4ULL << 32) ^ A0 ); \ ROUND_BIG8( alpha ); \ - alpha[0] = _mm512_set1_epi64( (5ULL << 32) ^ A0 ); \ + alpha[0] = v512_64( (5ULL << 32) ^ A0 ); \ ROUND_BIG8( alpha ); \ } while (0) @@ -752,29 +588,29 @@ do { \ __m512i alpha[16]; \ const uint64_t A0 = ( (uint64_t*)alpha_f )[0]; \ for( int i = 0; i < 16; i++ ) \ - alpha[i] = _mm512_set1_epi64( ( (uint64_t*)alpha_f )[i] ); \ + alpha[i] = v512_64( ( (uint64_t*)alpha_f )[i] ); \ ROUND_BIG8( alpha ); \ - alpha[0] = _mm512_set1_epi64( ( 1ULL << 32) ^ A0 ); \ + alpha[0] = v512_64( ( 1ULL << 32) ^ A0 ); \ ROUND_BIG8( alpha ); \ - alpha[0] = _mm512_set1_epi64( ( 2ULL << 32) ^ A0 ); \ + alpha[0] = v512_64( ( 2ULL << 32) ^ A0 ); \ ROUND_BIG8( alpha ); \ - alpha[0] = _mm512_set1_epi64( ( 3ULL << 32) ^ A0 ); \ + alpha[0] = v512_64( ( 3ULL << 32) ^ A0 ); \ ROUND_BIG8( alpha ); \ - alpha[0] = _mm512_set1_epi64( ( 4ULL << 32) ^ A0 ); \ + alpha[0] = v512_64( ( 4ULL << 32) ^ A0 ); \ ROUND_BIG8( alpha ); \ - alpha[0] = _mm512_set1_epi64( ( 5ULL << 32) ^ A0 ); \ + alpha[0] = v512_64( ( 5ULL << 32) ^ A0 ); \ ROUND_BIG8( alpha ); \ - alpha[0] = _mm512_set1_epi64( ( 6ULL << 32) ^ A0 ); \ + alpha[0] = v512_64( ( 6ULL << 32) ^ A0 ); \ ROUND_BIG8( alpha ); \ - alpha[0] = _mm512_set1_epi64( ( 7ULL << 32) ^ A0 ); \ + alpha[0] = v512_64( ( 7ULL << 32) ^ A0 ); \ ROUND_BIG8( alpha ); \ - alpha[0] = _mm512_set1_epi64( ( 8ULL << 32) ^ A0 ); \ + alpha[0] = v512_64( ( 8ULL << 32) ^ A0 ); \ ROUND_BIG8( alpha ); \ - alpha[0] = _mm512_set1_epi64( ( 9ULL << 32) ^ A0 ); \ + alpha[0] = v512_64( ( 9ULL << 32) ^ A0 ); \ ROUND_BIG8( alpha ); \ - alpha[0] = _mm512_set1_epi64( (10ULL << 32) ^ A0 ); \ + alpha[0] = v512_64( (10ULL << 32) ^ A0 ); \ ROUND_BIG8( alpha ); \ - alpha[0] = _mm512_set1_epi64( (11ULL << 32) ^ A0 ); \ + alpha[0] = v512_64( (11ULL << 32) ^ A0 ); \ ROUND_BIG8( alpha ); \ } while (0) @@ -790,13 +626,335 @@ do { /* order is important */ \ c0 = sc->h[ 0 ] = _mm512_xor_si512( sc->h[ 0 ], s0 ); /* m0 */ \ } while (0) +/////////////////////// +// +// Experimental + +// Hamsi 16 way 32 bit. 
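+// For reference, a minimal scalar sketch of the expansion that the
+// INPUT_16X32 macro below vectorises across 16 lanes: each of the 64
+// message bits selects one 16-word row of T512 and the selected rows
+// are XORed together. Illustration only, kept out of the build; the
+// bit order shown assumes it matches the ror/cmplt walk of the vector
+// code.
+#if 0
+static void hamsi512_expand_ref( uint32_t m[16], const uint32_t msg[2] )
+{
+   const uint32_t *tp = (const uint32_t*)T512;
+   for ( int i = 0; i < 16; i++ ) m[i] = 0;
+   for ( int v = 0; v < 2; v++ )        // two 32-bit message words
+   for ( int u = 0; u < 32; u++ )       // one T512 row per message bit
+   {
+      if ( ( msg[v] >> u ) & 1 )
+         for ( int i = 0; i < 16; i++ ) m[i] ^= tp[i];
+      tp += 16;
+   }
+}
+#endif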
+ +#define DECL_STATE_16X32 \ + __m512i C0, C1, C2, C3, C4, C5, C6, C7, C8, C9, CA, CB, CC, CD, CE, CF; \ + +#define READ_STATE_16X32(sc) \ + C0 = sc->h[ 0]; \ + C1 = sc->h[ 1]; \ + C2 = sc->h[ 2]; \ + C3 = sc->h[ 3]; \ + C4 = sc->h[ 4]; \ + C5 = sc->h[ 5]; \ + C6 = sc->h[ 6]; \ + C7 = sc->h[ 7]; \ + C8 = sc->h[ 8]; \ + C9 = sc->h[ 9]; \ + CA = sc->h[10]; \ + CB = sc->h[11]; \ + CC = sc->h[12]; \ + CD = sc->h[13]; \ + CE = sc->h[14]; \ + CF = sc->h[15]; + +#define WRITE_STATE_16X32(sc) \ + sc->h[ 0] = C0; \ + sc->h[ 1] = C1; \ + sc->h[ 2] = C2; \ + sc->h[ 3] = C3; \ + sc->h[ 4] = C4; \ + sc->h[ 5] = C5; \ + sc->h[ 6] = C6; \ + sc->h[ 7] = C7; \ + sc->h[ 8] = C8; \ + sc->h[ 9] = C9; \ + sc->h[10] = CA; \ + sc->h[11] = CB; \ + sc->h[12] = CC; \ + sc->h[13] = CD; \ + sc->h[14] = CE; \ + sc->h[15] = CF; + + +#define INPUT_16X32 \ +{ \ + const __m512i zero = _mm512_setzero_si512(); \ + const uint32_t *tp = (const uint32_t*)T512; \ + M0 = M1 = M2 = M3 = M4 = M5 = M6 = M7 = \ + M8 = M9 = MA = MB = MC = MD = ME = MF = zero; \ + for ( int v = 0; v < 2; v++ ) \ + { \ + __m512i db = _mm512_ror_epi32( buf[v], 1 ); \ + for ( int u = 0; u < 32; u++ ) \ + { \ + __mmask16 dm = _mm512_cmplt_epi32_mask( db, zero ); \ + M0 = _mm512_mask_xor_epi32( M0, dm, M0, v512_32( tp[ 0] ) ); \ + M1 = _mm512_mask_xor_epi32( M1, dm, M1, v512_32( tp[ 1] ) ); \ + M2 = _mm512_mask_xor_epi32( M2, dm, M2, v512_32( tp[ 2] ) ); \ + M3 = _mm512_mask_xor_epi32( M3, dm, M3, v512_32( tp[ 3] ) ); \ + M4 = _mm512_mask_xor_epi32( M4, dm, M4, v512_32( tp[ 4] ) ); \ + M5 = _mm512_mask_xor_epi32( M5, dm, M5, v512_32( tp[ 5] ) ); \ + M6 = _mm512_mask_xor_epi32( M6, dm, M6, v512_32( tp[ 6] ) ); \ + M7 = _mm512_mask_xor_epi32( M7, dm, M7, v512_32( tp[ 7] ) ); \ + M8 = _mm512_mask_xor_epi32( M8, dm, M8, v512_32( tp[ 8] ) ); \ + M9 = _mm512_mask_xor_epi32( M9, dm, M9, v512_32( tp[ 9] ) ); \ + MA = _mm512_mask_xor_epi32( MA, dm, MA, v512_32( tp[10] ) ); \ + MB = _mm512_mask_xor_epi32( MB, dm, MB, v512_32( tp[11] ) ); \ + MC = _mm512_mask_xor_epi32( MC, dm, MC, v512_32( tp[12] ) ); \ + MD = _mm512_mask_xor_epi32( MD, dm, MD, v512_32( tp[13] ) ); \ + ME = _mm512_mask_xor_epi32( ME, dm, ME, v512_32( tp[14] ) ); \ + MF = _mm512_mask_xor_epi32( MF, dm, MF, v512_32( tp[15] ) ); \ + db = _mm512_ror_epi32( db, 1 ); \ + tp += 16; \ + } \ + } \ +} + +#define SBOX_16X32 SBOX8 +#define L_16X32 L8 + +#define ROUND_16X32( rc, alpha ) \ +{ \ + S00 = _mm512_xor_si512( S00, v512_32( alpha[ 0] ) ); \ + S01 = _mm512_xor_si512( S01, v512_32( alpha[ 1] ^ rc ) ); \ + S02 = _mm512_xor_si512( S02, v512_32( alpha[ 2] ) ); \ + S03 = _mm512_xor_si512( S03, v512_32( alpha[ 3] ) ); \ + S04 = _mm512_xor_si512( S04, v512_32( alpha[ 4] ) ); \ + S05 = _mm512_xor_si512( S05, v512_32( alpha[ 5] ) ); \ + S06 = _mm512_xor_si512( S06, v512_32( alpha[ 6] ) ); \ + S07 = _mm512_xor_si512( S07, v512_32( alpha[ 7] ) ); \ + S08 = _mm512_xor_si512( S08, v512_32( alpha[ 8] ) ); \ + S09 = _mm512_xor_si512( S09, v512_32( alpha[ 9] ) ); \ + S0A = _mm512_xor_si512( S0A, v512_32( alpha[10] ) ); \ + S0B = _mm512_xor_si512( S0B, v512_32( alpha[11] ) ); \ + S0C = _mm512_xor_si512( S0C, v512_32( alpha[12] ) ); \ + S0D = _mm512_xor_si512( S0D, v512_32( alpha[13] ) ); \ + S0E = _mm512_xor_si512( S0E, v512_32( alpha[14] ) ); \ + S0F = _mm512_xor_si512( S0F, v512_32( alpha[15] ) ); \ + S10 = _mm512_xor_si512( S10, v512_32( alpha[16] ) ); \ + S11 = _mm512_xor_si512( S11, v512_32( alpha[17] ) ); \ + S12 = _mm512_xor_si512( S12, v512_32( alpha[18] ) ); \ + S13 = _mm512_xor_si512( S13, v512_32( alpha[19] ) 
); \ + S14 = _mm512_xor_si512( S14, v512_32( alpha[20] ) ); \ + S15 = _mm512_xor_si512( S15, v512_32( alpha[21] ) ); \ + S16 = _mm512_xor_si512( S16, v512_32( alpha[22] ) ); \ + S17 = _mm512_xor_si512( S17, v512_32( alpha[23] ) ); \ + S18 = _mm512_xor_si512( S18, v512_32( alpha[24] ) ); \ + S19 = _mm512_xor_si512( S19, v512_32( alpha[25] ) ); \ + S1A = _mm512_xor_si512( S1A, v512_32( alpha[26] ) ); \ + S1B = _mm512_xor_si512( S1B, v512_32( alpha[27] ) ); \ + S1C = _mm512_xor_si512( S1C, v512_32( alpha[28] ) ); \ + S1D = _mm512_xor_si512( S1D, v512_32( alpha[29] ) ); \ + S1E = _mm512_xor_si512( S1E, v512_32( alpha[30] ) ); \ + S1F = _mm512_xor_si512( S1F, v512_32( alpha[31] ) ); \ + SBOX_16X32( S00, S08, S10, S18 ); \ + SBOX_16X32( S01, S09, S11, S19 ); \ + SBOX_16X32( S02, S0A, S12, S1A ); \ + SBOX_16X32( S03, S0B, S13, S1B ); \ + SBOX_16X32( S04, S0C, S14, S1C ); \ + SBOX_16X32( S05, S0D, S15, S1D ); \ + SBOX_16X32( S06, S0E, S16, S1E ); \ + SBOX_16X32( S07, S0F, S17, S1F ); \ + L_16X32( S00, S09, S12, S1B ); \ + L_16X32( S01, S0A, S13, S1C ); \ + L_16X32( S02, S0B, S14, S1D ); \ + L_16X32( S03, S0C, S15, S1E ); \ + L_16X32( S04, S0D, S16, S1F ); \ + L_16X32( S05, S0E, S17, S18 ); \ + L_16X32( S06, S0F, S10, S19 ); \ + L_16X32( S07, S08, S11, S1A ); \ + L_16X32( S00, S02, S05, S07 ); \ + L_16X32( S10, S13, S15, S16 ); \ + L_16X32( S09, S0B, S0C, S0E ); \ + L_16X32( S19, S1A, S1C, S1F ); \ +} + +#define P_16X32 \ + ROUND_16X32( 0, alpha_n ); \ + ROUND_16X32( 1, alpha_n ); \ + ROUND_16X32( 2, alpha_n ); \ + ROUND_16X32( 3, alpha_n ); \ + ROUND_16X32( 4, alpha_n ); \ + ROUND_16X32( 5, alpha_n ); + +#define PF_16X32 \ + ROUND_16X32( 0, alpha_f ); \ + ROUND_16X32( 1, alpha_f ); \ + ROUND_16X32( 2, alpha_f ); \ + ROUND_16X32( 3, alpha_f ); \ + ROUND_16X32( 4, alpha_f ); \ + ROUND_16X32( 5, alpha_f ); \ + ROUND_16X32( 6, alpha_f ); \ + ROUND_16X32( 7, alpha_f ); \ + ROUND_16X32( 8, alpha_f ); \ + ROUND_16X32( 9, alpha_f ); \ + ROUND_16X32( 10, alpha_f ); \ + ROUND_16X32( 11, alpha_f ); + +#define T_16X32 \ + /* order is important */ \ + CF = sc->h[15] = _mm512_xor_si512( sc->h[15], S17 ); \ + CE = sc->h[14] = _mm512_xor_si512( sc->h[14], S16 ); \ + CD = sc->h[13] = _mm512_xor_si512( sc->h[13], S15 ); \ + CC = sc->h[12] = _mm512_xor_si512( sc->h[12], S14 ); \ + CB = sc->h[11] = _mm512_xor_si512( sc->h[11], S13 ); \ + CA = sc->h[10] = _mm512_xor_si512( sc->h[10], S12 ); \ + C9 = sc->h[ 9] = _mm512_xor_si512( sc->h[ 9], S11 ); \ + C8 = sc->h[ 8] = _mm512_xor_si512( sc->h[ 8], S10 ); \ + C7 = sc->h[ 7] = _mm512_xor_si512( sc->h[ 7], S07 ); \ + C6 = sc->h[ 6] = _mm512_xor_si512( sc->h[ 6], S06 ); \ + C5 = sc->h[ 5] = _mm512_xor_si512( sc->h[ 5], S05 ); \ + C4 = sc->h[ 4] = _mm512_xor_si512( sc->h[ 4], S04 ); \ + C3 = sc->h[ 3] = _mm512_xor_si512( sc->h[ 3], S03 ); \ + C2 = sc->h[ 2] = _mm512_xor_si512( sc->h[ 2], S02 ); \ + C1 = sc->h[ 1] = _mm512_xor_si512( sc->h[ 1], S01 ); \ + C0 = sc->h[ 0] = _mm512_xor_si512( sc->h[ 0], S00 ); + +void hamsi_16x32_big( hamsi_16x32_big_context *sc, __m512i *buf, size_t num ) +{ + DECL_STATE_16X32 + uint32_t tmp = num << 6; + + sc->count_low = sc->count_low + tmp; + sc->count_high += (uint32_t)( (num >> 13) >> 13 ); + if ( sc->count_low < tmp ) + sc->count_high++; + + READ_STATE_16X32( sc ); + while ( num-- > 0 ) + { + __m512i M0, M1, M2, M3, M4, M5, M6, M7; + __m512i M8, M9, MA, MB, MC, MD, ME, MF; + INPUT_16X32; + P_16X32; + T_16X32; + buf += 2; + } + WRITE_STATE_16X32( sc ); +} + +void hamsi_16x32_big_final( hamsi_16x32_big_context *sc, __m512i *buf ) +{ + 
DECL_STATE_16X32 + READ_STATE_16X32( sc ); + __m512i M0, M1, M2, M3, M4, M5, M6, M7; + __m512i M8, M9, MA, MB, MC, MD, ME, MF; + INPUT_16X32; + PF_16X32; + T_16X32; + WRITE_STATE_16X32( sc ); +} + +void hamsi512_16x32_init( hamsi_16x32_big_context *sc ) +{ + sc->partial_len = 0; + sc->count_high = sc->count_low = 0; + sc->h[ 0] = v512_32( HAMSI_IV512[ 0] ); + sc->h[ 1] = v512_32( HAMSI_IV512[ 1] ); + sc->h[ 2] = v512_32( HAMSI_IV512[ 2] ); + sc->h[ 3] = v512_32( HAMSI_IV512[ 3] ); + sc->h[ 4] = v512_32( HAMSI_IV512[ 4] ); + sc->h[ 5] = v512_32( HAMSI_IV512[ 5] ); + sc->h[ 6] = v512_32( HAMSI_IV512[ 6] ); + sc->h[ 7] = v512_32( HAMSI_IV512[ 7] ); + sc->h[ 8] = v512_32( HAMSI_IV512[ 8] ); + sc->h[ 9] = v512_32( HAMSI_IV512[ 9] ); + sc->h[10] = v512_32( HAMSI_IV512[10] ); + sc->h[11] = v512_32( HAMSI_IV512[11] ); + sc->h[12] = v512_32( HAMSI_IV512[12] ); + sc->h[13] = v512_32( HAMSI_IV512[13] ); + sc->h[14] = v512_32( HAMSI_IV512[14] ); + sc->h[15] = v512_32( HAMSI_IV512[15] ); +} + +void hamsi512_16x32_update( hamsi_16x32_big_context *sc, const void *data, + size_t len ) +{ + __m512i *vdata = (__m512i*)data; + + hamsi_16x32_big( sc, vdata, len>>3 ); + vdata += ( (len & ~(size_t)7) >> 3 ); + len &= (size_t)7; + memcpy_512( sc->buf, vdata, len>>3 ); + sc->partial_len = len; +} + +void hamsi512_16x32_close( hamsi_16x32_big_context *sc, void *dst ) +{ + __m512i pad[2]; + uint32_t ch, cl; + + ch = bswap_32( sc->count_high ); + cl = bswap_32( sc->count_low + ( sc->partial_len << 3 ) ); + pad[0] = v512_32( ch ); + pad[1] = v512_32( cl ); + sc->buf[0] = v512_32( 0x80 ); + sc->buf[1] = _mm512_setzero_si512(); + hamsi_16x32_big( sc, sc->buf, 1 ); + hamsi_16x32_big_final( sc, pad ); + + mm512_block_bswap_32( (__m512i*)dst, sc->h ); + mm512_block_bswap_32( (__m512i*)dst + 8, sc->h + 8 ); +} + +void hamsi512_16x32_full( hamsi_16x32_big_context *sc, void *dst, + const void *data, size_t len ) +{ + // init + sc->partial_len = 0; + sc->count_high = sc->count_low = 0; + sc->h[ 0] = v512_32( HAMSI_IV512[ 0] ); + sc->h[ 1] = v512_32( HAMSI_IV512[ 1] ); + sc->h[ 2] = v512_32( HAMSI_IV512[ 2] ); + sc->h[ 3] = v512_32( HAMSI_IV512[ 3] ); + sc->h[ 4] = v512_32( HAMSI_IV512[ 4] ); + sc->h[ 5] = v512_32( HAMSI_IV512[ 5] ); + sc->h[ 6] = v512_32( HAMSI_IV512[ 6] ); + sc->h[ 7] = v512_32( HAMSI_IV512[ 7] ); + sc->h[ 8] = v512_32( HAMSI_IV512[ 8] ); + sc->h[ 9] = v512_32( HAMSI_IV512[ 9] ); + sc->h[10] = v512_32( HAMSI_IV512[10] ); + sc->h[11] = v512_32( HAMSI_IV512[11] ); + sc->h[12] = v512_32( HAMSI_IV512[12] ); + sc->h[13] = v512_32( HAMSI_IV512[13] ); + sc->h[14] = v512_32( HAMSI_IV512[14] ); + sc->h[15] = v512_32( HAMSI_IV512[15] ); + + // update + __m512i *vdata = (__m512i*)data; + + hamsi_16x32_big( sc, vdata, len>>3 ); + vdata += ( (len & ~(size_t)7) >> 3 ); + len &= (size_t)7; + memcpy_512( sc->buf, vdata, len>>3 ); + sc->partial_len = len; + + // close + __m512i pad[2]; + uint32_t ch, cl; + + ch = bswap_32( sc->count_high ); + cl = bswap_32( sc->count_low + ( sc->partial_len << 3 ) ); + pad[0] = v512_32( ch ); + pad[1] = v512_32( cl ); + sc->buf[0] = v512_32( 0x80 ); + sc->buf[1] = _mm512_setzero_si512(); + hamsi_16x32_big( sc, sc->buf, 1 ); + hamsi_16x32_big_final( sc, pad ); + + mm512_block_bswap_32( (__m512i*)dst, sc->h ); + mm512_block_bswap_32( (__m512i*)dst + 8, sc->h + 8 ); +} + +// +// +// +///////////////////////////////// + + void hamsi_8way_big( hamsi_8way_big_context *sc, __m512i *buf, size_t num ) { DECL_STATE_BIG8 uint32_t tmp = num << 6; - sc->count_low = SPH_T32( sc->count_low + tmp ); - 
sc->count_high += (sph_u32)( (num >> 13) >> 13 ); + sc->count_low = sc->count_low + tmp; + sc->count_high += (uint32_t)( (num >> 13) >> 13 ); if ( sc->count_low < tmp ) sc->count_high++; @@ -804,7 +962,6 @@ void hamsi_8way_big( hamsi_8way_big_context *sc, __m512i *buf, size_t num ) while ( num-- > 0 ) { __m512i m0, m1, m2, m3, m4, m5, m6, m7; - INPUT_BIG8; P_BIG8; T_BIG8; @@ -816,6 +973,7 @@ void hamsi_8way_big( hamsi_8way_big_context *sc, __m512i *buf, size_t num ) void hamsi_8way_big_final( hamsi_8way_big_context *sc, __m512i *buf ) { __m512i m0, m1, m2, m3, m4, m5, m6, m7; + DECL_STATE_BIG8 READ_STATE_BIG8( sc ); INPUT_BIG8; @@ -828,16 +986,27 @@ void hamsi512_8way_init( hamsi_8way_big_context *sc ) { sc->partial_len = 0; sc->count_high = sc->count_low = 0; - - sc->h[0] = _mm512_set1_epi64( 0x6c70617273746565 ); - sc->h[1] = _mm512_set1_epi64( 0x656e62656b204172 ); - sc->h[2] = _mm512_set1_epi64( 0x302c206272672031 ); - sc->h[3] = _mm512_set1_epi64( 0x3434362c75732032 ); - sc->h[4] = _mm512_set1_epi64( 0x3030312020422d33 ); - sc->h[5] = _mm512_set1_epi64( 0x656e2d484c657576 ); - sc->h[6] = _mm512_set1_epi64( 0x6c65652c65766572 ); - sc->h[7] = _mm512_set1_epi64( 0x6769756d2042656c ); -} + const uint64_t *iv = (const uint64_t*)HAMSI_IV512; + + sc->h[0] = v512_64( iv[0] ); + sc->h[1] = v512_64( iv[1] ); + sc->h[2] = v512_64( iv[2] ); + sc->h[3] = v512_64( iv[3] ); + sc->h[4] = v512_64( iv[4] ); + sc->h[5] = v512_64( iv[5] ); + sc->h[6] = v512_64( iv[6] ); + sc->h[7] = v512_64( iv[7] ); +/* + sc->h[0] = v512_64( 0x6c70617273746565 ); + sc->h[1] = v512_64( 0x656e62656b204172 ); + sc->h[2] = v512_64( 0x302c206272672031 ); + sc->h[3] = v512_64( 0x3434362c75732032 ); + sc->h[4] = v512_64( 0x3030312020422d33 ); + sc->h[5] = v512_64( 0x656e2d484c657576 ); + sc->h[6] = v512_64( 0x6c65652c65766572 ); + sc->h[7] = v512_64( 0x6769756d2042656c ); +*/ +} void hamsi512_8way_update( hamsi_8way_big_context *sc, const void *data, size_t len ) @@ -855,11 +1024,11 @@ void hamsi512_8way_close( hamsi_8way_big_context *sc, void *dst ) { __m512i pad[1]; uint32_t ch, cl; - - sph_enc32be( &ch, sc->count_high ); - sph_enc32be( &cl, sc->count_low + ( sc->partial_len << 3 ) ); - pad[0] = _mm512_set1_epi64( ((uint64_t)cl << 32 ) | (uint64_t)ch ); - sc->buf[0] = _mm512_set1_epi64( 0x80 ); + + ch = bswap_32( sc->count_high ); + cl = bswap_32( sc->count_low + ( sc->partial_len << 3 ) ); + pad[0] = v512_64( ((uint64_t)cl << 32 ) | (uint64_t)ch ); + sc->buf[0] = v512_64( 0x80 ); hamsi_8way_big( sc, sc->buf, 1 ); hamsi_8way_big_final( sc, pad ); @@ -881,14 +1050,14 @@ do { \ for ( int u = 0; u < 64; u++ ) \ { \ const __mmask8 dm = _mm256_cmplt_epi64_mask( db, zero ); \ - m0 = _mm256_mask_xor_epi64( m0, dm, m0, _mm256_set1_epi64x( tp[0] ) ); \ - m1 = _mm256_mask_xor_epi64( m1, dm, m1, _mm256_set1_epi64x( tp[1] ) ); \ - m2 = _mm256_mask_xor_epi64( m2, dm, m2, _mm256_set1_epi64x( tp[2] ) ); \ - m3 = _mm256_mask_xor_epi64( m3, dm, m3, _mm256_set1_epi64x( tp[3] ) ); \ - m4 = _mm256_mask_xor_epi64( m4, dm, m4, _mm256_set1_epi64x( tp[4] ) ); \ - m5 = _mm256_mask_xor_epi64( m5, dm, m5, _mm256_set1_epi64x( tp[5] ) ); \ - m6 = _mm256_mask_xor_epi64( m6, dm, m6, _mm256_set1_epi64x( tp[6] ) ); \ - m7 = _mm256_mask_xor_epi64( m7, dm, m7, _mm256_set1_epi64x( tp[7] ) ); \ + m0 = _mm256_mask_xor_epi64( m0, dm, m0, v256_64( tp[0] ) ); \ + m1 = _mm256_mask_xor_epi64( m1, dm, m1, v256_64( tp[1] ) ); \ + m2 = _mm256_mask_xor_epi64( m2, dm, m2, v256_64( tp[2] ) ); \ + m3 = _mm256_mask_xor_epi64( m3, dm, m3, v256_64( tp[3] ) ); \ + m4 = 
_mm256_mask_xor_epi64( m4, dm, m4, v256_64( tp[4] ) ); \ + m5 = _mm256_mask_xor_epi64( m5, dm, m5, v256_64( tp[5] ) ); \ + m6 = _mm256_mask_xor_epi64( m6, dm, m6, v256_64( tp[6] ) ); \ + m7 = _mm256_mask_xor_epi64( m7, dm, m7, v256_64( tp[7] ) ); \ db = _mm256_ror_epi64( db, 1 ); \ tp += 8; \ } \ @@ -906,21 +1075,21 @@ do { \ { \ __m256i dm = _mm256_cmpgt_epi64( zero, _mm256_slli_epi64( db, u ) ); \ m0 = _mm256_xor_si256( m0, _mm256_and_si256( dm, \ - _mm256_set1_epi64x( tp[0] ) ) ); \ + v256_64( tp[0] ) ) ); \ m1 = _mm256_xor_si256( m1, _mm256_and_si256( dm, \ - _mm256_set1_epi64x( tp[1] ) ) ); \ + v256_64( tp[1] ) ) ); \ m2 = _mm256_xor_si256( m2, _mm256_and_si256( dm, \ - _mm256_set1_epi64x( tp[2] ) ) ); \ + v256_64( tp[2] ) ) ); \ m3 = _mm256_xor_si256( m3, _mm256_and_si256( dm, \ - _mm256_set1_epi64x( tp[3] ) ) ); \ + v256_64( tp[3] ) ) ); \ m4 = _mm256_xor_si256( m4, _mm256_and_si256( dm, \ - _mm256_set1_epi64x( tp[4] ) ) ); \ + v256_64( tp[4] ) ) ); \ m5 = _mm256_xor_si256( m5, _mm256_and_si256( dm, \ - _mm256_set1_epi64x( tp[5] ) ) ); \ + v256_64( tp[5] ) ) ); \ m6 = _mm256_xor_si256( m6, _mm256_and_si256( dm, \ - _mm256_set1_epi64x( tp[6] ) ) ); \ + v256_64( tp[6] ) ) ); \ m7 = _mm256_xor_si256( m7, _mm256_and_si256( dm, \ - _mm256_set1_epi64x( tp[7] ) ) ); \ + v256_64( tp[7] ) ) ); \ tp += 8; \ } \ } while (0) @@ -1126,17 +1295,17 @@ do { \ __m256i alpha[16]; \ const uint64_t A0 = ( (uint64_t*)alpha_n )[0]; \ for( int i = 0; i < 16; i++ ) \ - alpha[i] = _mm256_set1_epi64x( ( (uint64_t*)alpha_n )[i] ); \ + alpha[i] = v256_64( ( (uint64_t*)alpha_n )[i] ); \ ROUND_BIG( alpha ); \ - alpha[0] = _mm256_set1_epi64x( (1ULL << 32) ^ A0 ); \ + alpha[0] = v256_64( (1ULL << 32) ^ A0 ); \ ROUND_BIG( alpha ); \ - alpha[0] = _mm256_set1_epi64x( (2ULL << 32) ^ A0 ); \ + alpha[0] = v256_64( (2ULL << 32) ^ A0 ); \ ROUND_BIG( alpha ); \ - alpha[0] = _mm256_set1_epi64x( (3ULL << 32) ^ A0 ); \ + alpha[0] = v256_64( (3ULL << 32) ^ A0 ); \ ROUND_BIG( alpha ); \ - alpha[0] = _mm256_set1_epi64x( (4ULL << 32) ^ A0 ); \ + alpha[0] = v256_64( (4ULL << 32) ^ A0 ); \ ROUND_BIG( alpha ); \ - alpha[0] = _mm256_set1_epi64x( (5ULL << 32) ^ A0 ); \ + alpha[0] = v256_64( (5ULL << 32) ^ A0 ); \ ROUND_BIG( alpha ); \ } while (0) @@ -1145,29 +1314,29 @@ do { \ __m256i alpha[16]; \ const uint64_t A0 = ( (uint64_t*)alpha_f )[0]; \ for( int i = 0; i < 16; i++ ) \ - alpha[i] = _mm256_set1_epi64x( ( (uint64_t*)alpha_f )[i] ); \ + alpha[i] = v256_64( ( (uint64_t*)alpha_f )[i] ); \ ROUND_BIG( alpha ); \ - alpha[0] = _mm256_set1_epi64x( ( 1ULL << 32) ^ A0 ); \ + alpha[0] = v256_64( ( 1ULL << 32) ^ A0 ); \ ROUND_BIG( alpha ); \ - alpha[0] = _mm256_set1_epi64x( ( 2ULL << 32) ^ A0 ); \ + alpha[0] = v256_64( ( 2ULL << 32) ^ A0 ); \ ROUND_BIG( alpha ); \ - alpha[0] = _mm256_set1_epi64x( ( 3ULL << 32) ^ A0 ); \ + alpha[0] = v256_64( ( 3ULL << 32) ^ A0 ); \ ROUND_BIG( alpha ); \ - alpha[0] = _mm256_set1_epi64x( ( 4ULL << 32) ^ A0 ); \ + alpha[0] = v256_64( ( 4ULL << 32) ^ A0 ); \ ROUND_BIG( alpha ); \ - alpha[0] = _mm256_set1_epi64x( ( 5ULL << 32) ^ A0 ); \ + alpha[0] = v256_64( ( 5ULL << 32) ^ A0 ); \ ROUND_BIG( alpha ); \ - alpha[0] = _mm256_set1_epi64x( ( 6ULL << 32) ^ A0 ); \ + alpha[0] = v256_64( ( 6ULL << 32) ^ A0 ); \ ROUND_BIG( alpha ); \ - alpha[0] = _mm256_set1_epi64x( ( 7ULL << 32) ^ A0 ); \ + alpha[0] = v256_64( ( 7ULL << 32) ^ A0 ); \ ROUND_BIG( alpha ); \ - alpha[0] = _mm256_set1_epi64x( ( 8ULL << 32) ^ A0 ); \ + alpha[0] = v256_64( ( 8ULL << 32) ^ A0 ); \ ROUND_BIG( alpha ); \ - alpha[0] = _mm256_set1_epi64x( ( 9ULL 
<< 32) ^ A0 ); \ + alpha[0] = v256_64( ( 9ULL << 32) ^ A0 ); \ ROUND_BIG( alpha ); \ - alpha[0] = _mm256_set1_epi64x( (10ULL << 32) ^ A0 ); \ + alpha[0] = v256_64( (10ULL << 32) ^ A0 ); \ ROUND_BIG( alpha ); \ - alpha[0] = _mm256_set1_epi64x( (11ULL << 32) ^ A0 ); \ + alpha[0] = v256_64( (11ULL << 32) ^ A0 ); \ ROUND_BIG( alpha ); \ } while (0) @@ -1183,14 +1352,376 @@ do { /* order is important */ \ c0 = sc->h[ 0 ] = _mm256_xor_si256( sc->h[ 0 ], s0 ); \ } while (0) + +// Hamsi-512 8x32 + +// Experimental untested + + +#define DECL_STATE_8X32 \ + __m256i C0, C1, C2, C3, C4, C5, C6, C7, C8, C9, CA, CB, CC, CD, CE, CF; \ + +#define READ_STATE_8X32(sc) \ + C0 = sc->h[ 0]; \ + C1 = sc->h[ 1]; \ + C2 = sc->h[ 2]; \ + C3 = sc->h[ 3]; \ + C4 = sc->h[ 4]; \ + C5 = sc->h[ 5]; \ + C6 = sc->h[ 6]; \ + C7 = sc->h[ 7]; \ + C8 = sc->h[ 8]; \ + C9 = sc->h[ 9]; \ + CA = sc->h[10]; \ + CB = sc->h[11]; \ + CC = sc->h[12]; \ + CD = sc->h[13]; \ + CE = sc->h[14]; \ + CF = sc->h[15]; + +#define WRITE_STATE_8X32(sc) \ + sc->h[ 0] = C0; \ + sc->h[ 1] = C1; \ + sc->h[ 2] = C2; \ + sc->h[ 3] = C3; \ + sc->h[ 4] = C4; \ + sc->h[ 5] = C5; \ + sc->h[ 6] = C6; \ + sc->h[ 7] = C7; \ + sc->h[ 8] = C8; \ + sc->h[ 9] = C9; \ + sc->h[10] = CA; \ + sc->h[11] = CB; \ + sc->h[12] = CC; \ + sc->h[13] = CD; \ + sc->h[14] = CE; \ + sc->h[15] = CF; + +#if defined(__AVX512VL__) + +#define INPUT_8X32 \ +{ \ + const __m256i zero = _mm256_setzero_si256(); \ + const uint32_t *tp = (const uint32_t*)T512; \ + M0 = M1 = M2 = M3 = M4 = M5 = M6 = M7 = \ + M8 = M9 = MA = MB = MC = MD = ME = MF = zero; \ + for ( int v = 0; v < 2; v++ ) \ + { \ + __m256i db = _mm256_ror_epi32( buf[v], 1 ); \ + for ( int u = 0; u < 32; u++ ) \ + { \ + __mmask8 dm = _mm256_cmplt_epi32_mask( db, zero ); \ + M0 = _mm256_mask_xor_epi32( M0, dm, M0, v256_32( tp[ 0] ) ); \ + M1 = _mm256_mask_xor_epi32( M1, dm, M1, v256_32( tp[ 1] ) ); \ + M2 = _mm256_mask_xor_epi32( M2, dm, M2, v256_32( tp[ 2] ) ); \ + M3 = _mm256_mask_xor_epi32( M3, dm, M3, v256_32( tp[ 3] ) ); \ + M4 = _mm256_mask_xor_epi32( M4, dm, M4, v256_32( tp[ 4] ) ); \ + M5 = _mm256_mask_xor_epi32( M5, dm, M5, v256_32( tp[ 5] ) ); \ + M6 = _mm256_mask_xor_epi32( M6, dm, M6, v256_32( tp[ 6] ) ); \ + M7 = _mm256_mask_xor_epi32( M7, dm, M7, v256_32( tp[ 7] ) ); \ + M8 = _mm256_mask_xor_epi32( M8, dm, M8, v256_32( tp[ 8] ) ); \ + M9 = _mm256_mask_xor_epi32( M9, dm, M9, v256_32( tp[ 9] ) ); \ + MA = _mm256_mask_xor_epi32( MA, dm, MA, v256_32( tp[10] ) ); \ + MB = _mm256_mask_xor_epi32( MB, dm, MB, v256_32( tp[11] ) ); \ + MC = _mm256_mask_xor_epi32( MC, dm, MC, v256_32( tp[12] ) ); \ + MD = _mm256_mask_xor_epi32( MD, dm, MD, v256_32( tp[13] ) ); \ + ME = _mm256_mask_xor_epi32( ME, dm, ME, v256_32( tp[14] ) ); \ + MF = _mm256_mask_xor_epi32( MF, dm, MF, v256_32( tp[15] ) ); \ + db = _mm256_ror_epi32( db, 1 ); \ + tp += 16; \ + } \ + } \ +} + +#else + +#define INPUT_8X32 \ +{ \ + const __m256i zero = _mm256_setzero_si256(); \ + const uint32_t *tp = (const uint32_t*)T512; \ + M0 = M1 = M2 = M3 = M4 = M5 = M6 = M7 = \ + M8 = M9 = MA = MB = MC = MD = ME = MF = zero; \ + for ( int v = 0; v < 2; v++ ) \ + { \ + __m256i db = buf[v]; \ + for ( int u = 31; u >= 0; u-- ) \ + { \ + __m256i dm = _mm256_cmpgt_epi32( zero, _mm256_slli_epi32( db, u ) ); \ + M0 = _mm256_xor_si256( M0, _mm256_and_si256( dm, v256_32( tp[ 0] ) ) ); \ + M1 = _mm256_xor_si256( M1, _mm256_and_si256( dm, v256_32( tp[ 1] ) ) ); \ + M2 = _mm256_xor_si256( M2, _mm256_and_si256( dm, v256_32( tp[ 2] ) ) ); \ + M3 = _mm256_xor_si256( M3, _mm256_and_si256( 
dm, v256_32( tp[ 3] ) ) ); \ + M4 = _mm256_xor_si256( M4, _mm256_and_si256( dm, v256_32( tp[ 4] ) ) ); \ + M5 = _mm256_xor_si256( M5, _mm256_and_si256( dm, v256_32( tp[ 5] ) ) ); \ + M6 = _mm256_xor_si256( M6, _mm256_and_si256( dm, v256_32( tp[ 6] ) ) ); \ + M7 = _mm256_xor_si256( M7, _mm256_and_si256( dm, v256_32( tp[ 7] ) ) ); \ + M8 = _mm256_xor_si256( M8, _mm256_and_si256( dm, v256_32( tp[ 8] ) ) ); \ + M9 = _mm256_xor_si256( M9, _mm256_and_si256( dm, v256_32( tp[ 9] ) ) ); \ + MA = _mm256_xor_si256( MA, _mm256_and_si256( dm, v256_32( tp[10] ) ) ); \ + MB = _mm256_xor_si256( MB, _mm256_and_si256( dm, v256_32( tp[11] ) ) ); \ + MC = _mm256_xor_si256( MC, _mm256_and_si256( dm, v256_32( tp[12] ) ) ); \ + MD = _mm256_xor_si256( MD, _mm256_and_si256( dm, v256_32( tp[13] ) ) ); \ + ME = _mm256_xor_si256( ME, _mm256_and_si256( dm, v256_32( tp[14] ) ) ); \ + MF = _mm256_xor_si256( MF, _mm256_and_si256( dm, v256_32( tp[15] ) ) ); \ + tp += 16; \ + } \ + } \ +} + +#endif + +#define SBOX_8X32 SBOX +#define L_8X32 L + +#define ROUND_8X32( rc, alpha ) \ +{ \ + S00 = _mm256_xor_si256( S00, v256_32( alpha[ 0] ) ); \ + S01 = _mm256_xor_si256( S01, v256_32( (alpha[ 1]) ^ (rc) ) ); \ + S02 = _mm256_xor_si256( S02, v256_32( alpha[ 2] ) ); \ + S03 = _mm256_xor_si256( S03, v256_32( alpha[ 3] ) ); \ + S04 = _mm256_xor_si256( S04, v256_32( alpha[ 4] ) ); \ + S05 = _mm256_xor_si256( S05, v256_32( alpha[ 5] ) ); \ + S06 = _mm256_xor_si256( S06, v256_32( alpha[ 6] ) ); \ + S07 = _mm256_xor_si256( S07, v256_32( alpha[ 7] ) ); \ + S08 = _mm256_xor_si256( S08, v256_32( alpha[ 8] ) ); \ + S09 = _mm256_xor_si256( S09, v256_32( alpha[ 9] ) ); \ + S0A = _mm256_xor_si256( S0A, v256_32( alpha[10] ) ); \ + S0B = _mm256_xor_si256( S0B, v256_32( alpha[11] ) ); \ + S0C = _mm256_xor_si256( S0C, v256_32( alpha[12] ) ); \ + S0D = _mm256_xor_si256( S0D, v256_32( alpha[13] ) ); \ + S0E = _mm256_xor_si256( S0E, v256_32( alpha[14] ) ); \ + S0F = _mm256_xor_si256( S0F, v256_32( alpha[15] ) ); \ + S10 = _mm256_xor_si256( S10, v256_32( alpha[16] ) ); \ + S11 = _mm256_xor_si256( S11, v256_32( alpha[17] ) ); \ + S12 = _mm256_xor_si256( S12, v256_32( alpha[18] ) ); \ + S13 = _mm256_xor_si256( S13, v256_32( alpha[19] ) ); \ + S14 = _mm256_xor_si256( S14, v256_32( alpha[20] ) ); \ + S15 = _mm256_xor_si256( S15, v256_32( alpha[21] ) ); \ + S16 = _mm256_xor_si256( S16, v256_32( alpha[22] ) ); \ + S17 = _mm256_xor_si256( S17, v256_32( alpha[23] ) ); \ + S18 = _mm256_xor_si256( S18, v256_32( alpha[24] ) ); \ + S19 = _mm256_xor_si256( S19, v256_32( alpha[25] ) ); \ + S1A = _mm256_xor_si256( S1A, v256_32( alpha[26] ) ); \ + S1B = _mm256_xor_si256( S1B, v256_32( alpha[27] ) ); \ + S1C = _mm256_xor_si256( S1C, v256_32( alpha[28] ) ); \ + S1D = _mm256_xor_si256( S1D, v256_32( alpha[29] ) ); \ + S1E = _mm256_xor_si256( S1E, v256_32( alpha[30] ) ); \ + S1F = _mm256_xor_si256( S1F, v256_32( alpha[31] ) ); \ + SBOX_8X32( S00, S08, S10, S18 ); \ + SBOX_8X32( S01, S09, S11, S19 ); \ + SBOX_8X32( S02, S0A, S12, S1A ); \ + SBOX_8X32( S03, S0B, S13, S1B ); \ + SBOX_8X32( S04, S0C, S14, S1C ); \ + SBOX_8X32( S05, S0D, S15, S1D ); \ + SBOX_8X32( S06, S0E, S16, S1E ); \ + SBOX_8X32( S07, S0F, S17, S1F ); \ + L_8X32( S00, S09, S12, S1B ); \ + L_8X32( S01, S0A, S13, S1C ); \ + L_8X32( S02, S0B, S14, S1D ); \ + L_8X32( S03, S0C, S15, S1E ); \ + L_8X32( S04, S0D, S16, S1F ); \ + L_8X32( S05, S0E, S17, S18 ); \ + L_8X32( S06, S0F, S10, S19 ); \ + L_8X32( S07, S08, S11, S1A ); \ + L_8X32( S00, S02, S05, S07 ); \ + L_8X32( S10, S13, S15, S16 ); \ + L_8X32( S09, S0B, 
S0C, S0E ); \ + L_8X32( S19, S1A, S1C, S1F ); \ +} + +#define P_8X32 \ + ROUND_8X32( 0, alpha_n ); \ + ROUND_8X32( 1, alpha_n ); \ + ROUND_8X32( 2, alpha_n ); \ + ROUND_8X32( 3, alpha_n ); \ + ROUND_8X32( 4, alpha_n ); \ + ROUND_8X32( 5, alpha_n ); + +#define PF_8X32 \ + ROUND_8X32( 0, alpha_f ); \ + ROUND_8X32( 1, alpha_f ); \ + ROUND_8X32( 2, alpha_f ); \ + ROUND_8X32( 3, alpha_f ); \ + ROUND_8X32( 4, alpha_f ); \ + ROUND_8X32( 5, alpha_f ); \ + ROUND_8X32( 6, alpha_f ); \ + ROUND_8X32( 7, alpha_f ); \ + ROUND_8X32( 8, alpha_f ); \ + ROUND_8X32( 9, alpha_f ); \ + ROUND_8X32( 10, alpha_f ); \ + ROUND_8X32( 11, alpha_f ); + +#define T_8X32 \ + /* order is important */ \ + CF = sc->h[15] = _mm256_xor_si256( sc->h[15], S17 ); \ + CE = sc->h[14] = _mm256_xor_si256( sc->h[14], S16 ); \ + CD = sc->h[13] = _mm256_xor_si256( sc->h[13], S15 ); \ + CC = sc->h[12] = _mm256_xor_si256( sc->h[12], S14 ); \ + CB = sc->h[11] = _mm256_xor_si256( sc->h[11], S13 ); \ + CA = sc->h[10] = _mm256_xor_si256( sc->h[10], S12 ); \ + C9 = sc->h[ 9] = _mm256_xor_si256( sc->h[ 9], S11 ); \ + C8 = sc->h[ 8] = _mm256_xor_si256( sc->h[ 8], S10 ); \ + C7 = sc->h[ 7] = _mm256_xor_si256( sc->h[ 7], S07 ); \ + C6 = sc->h[ 6] = _mm256_xor_si256( sc->h[ 6], S06 ); \ + C5 = sc->h[ 5] = _mm256_xor_si256( sc->h[ 5], S05 ); \ + C4 = sc->h[ 4] = _mm256_xor_si256( sc->h[ 4], S04 ); \ + C3 = sc->h[ 3] = _mm256_xor_si256( sc->h[ 3], S03 ); \ + C2 = sc->h[ 2] = _mm256_xor_si256( sc->h[ 2], S02 ); \ + C1 = sc->h[ 1] = _mm256_xor_si256( sc->h[ 1], S01 ); \ + C0 = sc->h[ 0] = _mm256_xor_si256( sc->h[ 0], S00 ); + + +void hamsi_8x32_big( hamsi_8x32_big_context *sc, __m256i *buf, size_t num ) +{ + DECL_STATE_8X32 + uint32_t tmp; + + tmp = (uint32_t)num << 6; + sc->count_low = sc->count_low + tmp; + sc->count_high += (uint32_t)( (num >> 13) >> 13 ); + if ( sc->count_low < tmp ) + sc->count_high++; + + READ_STATE_8X32( sc ); + while ( num-- > 0 ) + { + __m256i M0, M1, M2, M3, M4, M5, M6, M7; + __m256i M8, M9, MA, MB, MC, MD, ME, MF; + INPUT_8X32; + P_8X32; + T_8X32; + buf += 2; + } + WRITE_STATE_8X32( sc ); +} + +void hamsi_8x32_big_final( hamsi_8x32_big_context *sc, __m256i *buf ) +{ + __m256i M0, M1, M2, M3, M4, M5, M6, M7; + __m256i M8, M9, MA, MB, MC, MD, ME, MF; + + DECL_STATE_8X32 + READ_STATE_8X32( sc ); + INPUT_8X32; + PF_8X32; + T_8X32; + WRITE_STATE_8X32( sc ); +} + +void hamsi512_8x32_init( hamsi512_8x32_context *sc ) +{ + sc->partial_len = 0; + sc->count_high = sc->count_low = 0; + + sc->h[ 0] = v256_32( HAMSI_IV512[ 0] ); + sc->h[ 1] = v256_32( HAMSI_IV512[ 1] ); + sc->h[ 2] = v256_32( HAMSI_IV512[ 2] ); + sc->h[ 3] = v256_32( HAMSI_IV512[ 3] ); + sc->h[ 4] = v256_32( HAMSI_IV512[ 4] ); + sc->h[ 5] = v256_32( HAMSI_IV512[ 5] ); + sc->h[ 6] = v256_32( HAMSI_IV512[ 6] ); + sc->h[ 7] = v256_32( HAMSI_IV512[ 7] ); + sc->h[ 8] = v256_32( HAMSI_IV512[ 8] ); + sc->h[ 9] = v256_32( HAMSI_IV512[ 9] ); + sc->h[10] = v256_32( HAMSI_IV512[10] ); + sc->h[11] = v256_32( HAMSI_IV512[11] ); + sc->h[12] = v256_32( HAMSI_IV512[12] ); + sc->h[13] = v256_32( HAMSI_IV512[13] ); + sc->h[14] = v256_32( HAMSI_IV512[14] ); + sc->h[15] = v256_32( HAMSI_IV512[15] ); +} + +void hamsi512_8x32_update( hamsi512_8x32_context *sc, const void *data, + size_t len ) +{ + __m256i *vdata = (__m256i*)data; + + hamsi_8x32_big( sc, vdata, len >> 3 ); + vdata += ( (len & ~(size_t)7) >> 3 ); + len &= (size_t)7; + memcpy_256( sc->buf, vdata, len>> 3 ); + sc->partial_len = len; +} + +void hamsi512_8x32_close( hamsi512_8x32_context *sc, void *dst ) +{ + __m256i pad[2]; + 
uint32_t ch, cl; + + ch = bswap_32( sc->count_high ); + cl = bswap_32( sc->count_low + ( sc->partial_len << 3 ) ); + pad[0] = v256_32( ch ); + pad[1] = v256_32( cl ); + sc->buf[0] = v256_32( 0x80 ); + sc->buf[1] = _mm256_setzero_si256(); + hamsi_8x32_big( sc, sc->buf, 1 ); + hamsi_8x32_big_final( sc, pad ); + + mm256_block_bswap_32( (__m256i*)dst, sc->h ); + mm256_block_bswap_32( (__m256i*)dst + 8, sc->h + 8 ); +} + +void hamsi512_8x32_full( hamsi512_8x32_context *sc, void * dst, + const void *data, size_t len ) +{ + // init + sc->partial_len = 0; + sc->count_high = sc->count_low = 0; + + sc->h[ 0] = v256_32( HAMSI_IV512[ 0] ); + sc->h[ 1] = v256_32( HAMSI_IV512[ 1] ); + sc->h[ 2] = v256_32( HAMSI_IV512[ 2] ); + sc->h[ 3] = v256_32( HAMSI_IV512[ 3] ); + sc->h[ 4] = v256_32( HAMSI_IV512[ 4] ); + sc->h[ 5] = v256_32( HAMSI_IV512[ 5] ); + sc->h[ 6] = v256_32( HAMSI_IV512[ 6] ); + sc->h[ 7] = v256_32( HAMSI_IV512[ 7] ); + sc->h[ 8] = v256_32( HAMSI_IV512[ 8] ); + sc->h[ 9] = v256_32( HAMSI_IV512[ 9] ); + sc->h[10] = v256_32( HAMSI_IV512[10] ); + sc->h[11] = v256_32( HAMSI_IV512[11] ); + sc->h[12] = v256_32( HAMSI_IV512[12] ); + sc->h[13] = v256_32( HAMSI_IV512[13] ); + sc->h[14] = v256_32( HAMSI_IV512[14] ); + sc->h[15] = v256_32( HAMSI_IV512[15] ); + + //update + __m256i *vdata = (__m256i*)data; + + hamsi_8x32_big( sc, vdata, len >> 3 ); + vdata += ( (len & ~(size_t)7) >> 3 ); + len &= (size_t)7; + memcpy_256( sc->buf, vdata, len>> 3 ); + sc->partial_len = len; + + // close + __m256i pad[2]; + uint32_t ch, cl; + + ch = bswap_32( sc->count_high ); + cl = bswap_32( sc->count_low + ( sc->partial_len << 3 ) ); + pad[0] = v256_32( ch ); + pad[1] = v256_32( cl ); + sc->buf[0] = v256_32( 0x80 ); + sc->buf[1] = _mm256_setzero_si256(); + hamsi_8x32_big( sc, sc->buf, 1 ); + hamsi_8x32_big_final( sc, pad ); + + mm256_block_bswap_32( (__m256i*)dst, sc->h ); + mm256_block_bswap_32( (__m256i*)dst + 8, sc->h + 8 ); +} + + +//////////// + void hamsi_big( hamsi_4way_big_context *sc, __m256i *buf, size_t num ) { DECL_STATE_BIG - sph_u32 tmp; + uint32_t tmp; - tmp = SPH_T32( (sph_u32)num << 6 ); - sc->count_low = SPH_T32( sc->count_low + tmp ); - sc->count_high += (sph_u32)( (num >> 13) >> 13 ); + tmp = (uint32_t)num << 6; + sc->count_low = sc->count_low + tmp; + sc->count_high += (uint32_t)( (num >> 13) >> 13 ); if ( sc->count_low < tmp ) sc->count_high++; @@ -1223,14 +1754,14 @@ void hamsi512_4way_init( hamsi_4way_big_context *sc ) sc->partial_len = 0; sc->count_high = sc->count_low = 0; - sc->h[0] = _mm256_set1_epi64x( 0x6c70617273746565 ); - sc->h[1] = _mm256_set1_epi64x( 0x656e62656b204172 ); - sc->h[2] = _mm256_set1_epi64x( 0x302c206272672031 ); - sc->h[3] = _mm256_set1_epi64x( 0x3434362c75732032 ); - sc->h[4] = _mm256_set1_epi64x( 0x3030312020422d33 ); - sc->h[5] = _mm256_set1_epi64x( 0x656e2d484c657576 ); - sc->h[6] = _mm256_set1_epi64x( 0x6c65652c65766572 ); - sc->h[7] = _mm256_set1_epi64x( 0x6769756d2042656c ); + sc->h[0] = v256_64( 0x6c70617273746565 ); + sc->h[1] = v256_64( 0x656e62656b204172 ); + sc->h[2] = v256_64( 0x302c206272672031 ); + sc->h[3] = v256_64( 0x3434362c75732032 ); + sc->h[4] = v256_64( 0x3030312020422d33 ); + sc->h[5] = v256_64( 0x656e2d484c657576 ); + sc->h[6] = v256_64( 0x6c65652c65766572 ); + sc->h[7] = v256_64( 0x6769756d2042656c ); } void hamsi512_4way_update( hamsi_4way_big_context *sc, const void *data, @@ -1250,17 +1781,14 @@ void hamsi512_4way_close( hamsi_4way_big_context *sc, void *dst ) __m256i pad[1]; uint32_t ch, cl; - sph_enc32be( &ch, sc->count_high ); - 
sph_enc32be( &cl, sc->count_low + ( sc->partial_len << 3 ) ); - pad[0] = _mm256_set1_epi64x( ((uint64_t)cl << 32 ) | (uint64_t)ch ); - sc->buf[0] = _mm256_set1_epi64x( 0x80 ); + ch = bswap_32( sc->count_high ); + cl = bswap_32( sc->count_low + ( sc->partial_len << 3 ) ); + pad[0] = v256_64( ((uint64_t)cl << 32 ) | (uint64_t)ch ); + sc->buf[0] = v256_64( 0x80 ); hamsi_big( sc, sc->buf, 1 ); hamsi_big_final( sc, pad ); mm256_block_bswap_32( (__m256i*)dst, sc->h ); } -#ifdef __cplusplus -} -#endif #endif diff --git a/algo/hamsi/hamsi-hash-4way.h b/algo/hamsi/hamsi-hash-4way.h index 60e33b24..8e21219a 100644 --- a/algo/hamsi/hamsi-hash-4way.h +++ b/algo/hamsi/hamsi-hash-4way.h @@ -36,44 +36,64 @@ #define HAMSI_4WAY_H__ #include -#include "algo/sha/sph_types.h" #if defined (__AVX2__) #include "simd-utils.h" -#ifdef __cplusplus -extern "C"{ -#endif - -#define SPH_SIZE_hamsi512 512 +// Hamsi-512 4x64 // Partial is only scalar but needs pointer ref for hamsi-helper // deprecate partial_len -typedef struct { +typedef struct +{ __m256i h[8]; __m256i buf[1]; size_t partial_len; - sph_u32 count_high, count_low; + uint32_t count_high, count_low; } hamsi_4way_big_context; - typedef hamsi_4way_big_context hamsi512_4way_context; void hamsi512_4way_init( hamsi512_4way_context *sc ); void hamsi512_4way_update( hamsi512_4way_context *sc, const void *data, size_t len ); -//#define hamsi512_4way hamsi512_4way_update void hamsi512_4way_close( hamsi512_4way_context *sc, void *dst ); +#define hamsi512_4x64_context hamsi512_4way_context +#define hamsi512_4x64_init hamsi512_4way_init +#define hamsi512_4x64_update hamsi512_4way_update +#define hamsi512_4x64_close hamsi512_4way_close + +// Hamsi-512 8x32 + +typedef struct +{ + __m256i h[16]; + __m256i buf[2]; + size_t partial_len; + uint32_t count_high, count_low; +} hamsi_8x32_big_context; +typedef hamsi_8x32_big_context hamsi512_8x32_context; + +void hamsi512_8x32_init( hamsi512_8x32_context *sc ); +void hamsi512_8x32_update( hamsi512_8x32_context *sc, const void *data, + size_t len ); +void hamsi512_8x32_close( hamsi512_8x32_context *sc, void *dst ); +void hamsi512_8x32_full( hamsi512_8x32_context *sc, void *dst, const void *data, + size_t len ); + +#endif + #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +// Hamsi-512 8x64 + typedef struct { __m512i h[8]; __m512i buf[1]; size_t partial_len; - sph_u32 count_high, count_low; + uint32_t count_high, count_low; } hamsi_8way_big_context; - typedef hamsi_8way_big_context hamsi512_8way_context; void hamsi512_8way_init( hamsi512_8way_context *sc ); @@ -81,15 +101,29 @@ void hamsi512_8way_update( hamsi512_8way_context *sc, const void *data, size_t len ); void hamsi512_8way_close( hamsi512_8way_context *sc, void *dst ); +#define hamsi512_8x64_context hamsi512_8way_context +#define hamsi512_8x64_init hamsi512_8way_init +#define hamsi512_8x64_update hamsi512_8way_update +#define hamsi512_8x64_close hamsi512_8way_close +// Hamsi-512 16x32 -#endif - +typedef struct +{ + __m512i h[16]; + __m512i buf[2]; + size_t partial_len; + uint32_t count_high, count_low; +} hamsi_16x32_big_context; +typedef hamsi_16x32_big_context hamsi512_16x32_context; -#ifdef __cplusplus -} -#endif +void hamsi512_16x32_init( hamsi512_16x32_context *sc ); +void hamsi512_16x32_update( hamsi512_16x32_context *sc, const void *data, + size_t len ); +void hamsi512_16x32_close( hamsi512_16x32_context *sc, void *dst ); +void hamsi512_16x32_full( hamsi512_16x32_context *sc, void *dst, + const void *data, size_t 
len ); -#endif +#endif // AVX512 #endif diff --git a/algo/hamsi/sph_hamsi.h b/algo/hamsi/sph_hamsi.h index b0cb20c0..0d55ccfe 100644 --- a/algo/hamsi/sph_hamsi.h +++ b/algo/hamsi/sph_hamsi.h @@ -36,7 +36,7 @@ #define SPH_HAMSI_H__ #include -#include "algo/sha/sph_types.h" +#include "compat/sph_types.h" #ifdef __cplusplus extern "C"{ diff --git a/algo/haval/haval-4way-helper.c b/algo/haval/haval-4way-helper.c index 313b23fa..e3ae285a 100644 --- a/algo/haval/haval-4way-helper.c +++ b/algo/haval/haval-4way-helper.c @@ -48,7 +48,7 @@ SPH_XCAT(SPH_XCAT(haval, PASSES), _4way_update) while ( len > 0 ) { unsigned clen; - sph_u32 clow, clow2; + uint32_t clow, clow2; clen = 128U - current; if ( clen > len ) @@ -67,7 +67,7 @@ SPH_XCAT(SPH_XCAT(haval, PASSES), _4way_update) current = 0; } clow = sc->count_low; - clow2 = SPH_T32(clow + clen); + clow2 = clow + clen; sc->count_low = clow2; if ( clow2 < clow ) sc->count_high ++; diff --git a/algo/haval/haval-hash-4way.c b/algo/haval/haval-hash-4way.c index f3981c1c..39bbb756 100644 --- a/algo/haval/haval-hash-4way.c +++ b/algo/haval/haval-hash-4way.c @@ -292,7 +292,9 @@ static const unsigned MP5[32] = { 2, 23, 16, 22, 4, 1, 25, 15 }; -static const sph_u32 RK2[32] = { +#define SPH_C32(x) (x) + +static const uint32_t RK2[32] = { SPH_C32(0x452821E6), SPH_C32(0x38D01377), SPH_C32(0xBE5466CF), SPH_C32(0x34E90C6C), SPH_C32(0xC0AC29B7), SPH_C32(0xC97C50DD), @@ -311,7 +313,7 @@ static const sph_u32 RK2[32] = { SPH_C32(0x7B54A41D), SPH_C32(0xC25A59B5) }; -static const sph_u32 RK3[32] = { +static const uint32_t RK3[32] = { SPH_C32(0x9C30D539), SPH_C32(0x2AF26013), SPH_C32(0xC5D1B023), SPH_C32(0x286085F0), SPH_C32(0xCA417918), SPH_C32(0xB8DB38EF), @@ -330,7 +332,7 @@ static const sph_u32 RK3[32] = { SPH_C32(0xAFD6BA33), SPH_C32(0x6C24CF5C) }; -static const sph_u32 RK4[32] = { +static const uint32_t RK4[32] = { SPH_C32(0x7A325381), SPH_C32(0x28958677), SPH_C32(0x3B8F4898), SPH_C32(0x6B4BB9AF), SPH_C32(0xC4BFE81B), SPH_C32(0x66282193), @@ -349,7 +351,7 @@ static const sph_u32 RK4[32] = { SPH_C32(0x6EEF0B6C), SPH_C32(0x137A3BE4) }; -static const sph_u32 RK5[32] = { +static const uint32_t RK5[32] = { SPH_C32(0xBA3BF050), SPH_C32(0x7EFB2A98), SPH_C32(0xA1F1651D), SPH_C32(0x39AF0176), SPH_C32(0x66CA593E), SPH_C32(0x82430E88), diff --git a/algo/haval/haval-hash-4way.h b/algo/haval/haval-hash-4way.h index 9164d2fd..d032e1bc 100644 --- a/algo/haval/haval-hash-4way.h +++ b/algo/haval/haval-hash-4way.h @@ -68,7 +68,6 @@ extern "C"{ #endif #include -#include "algo/sha/sph_types.h" #include "simd-utils.h" #define SPH_SIZE_haval256_5 256 @@ -77,7 +76,7 @@ typedef struct { __m128i buf[32]; __m128i s0, s1, s2, s3, s4, s5, s6, s7; unsigned olen, passes; - sph_u32 count_high, count_low; + uint32_t count_high, count_low; } haval_4way_context; typedef haval_4way_context haval256_5_4way_context; diff --git a/algo/haval/sph-haval.h b/algo/haval/sph-haval.h index 9ec57721..710393a4 100644 --- a/algo/haval/sph-haval.h +++ b/algo/haval/sph-haval.h @@ -66,7 +66,7 @@ extern "C"{ #endif #include -#include "algo/sha/sph_types.h" +#include "compat/sph_types.h" /** * Output size (in bits) for HAVAL-128/3. diff --git a/algo/jh/sph_jh.h b/algo/jh/sph_jh.h index 77a0fdb4..a5c37695 100644 --- a/algo/jh/sph_jh.h +++ b/algo/jh/sph_jh.h @@ -41,7 +41,7 @@ extern "C"{ #endif #include -#include "algo/sha/sph_types.h" +#include "compat/sph_types.h" /** * Output size (in bits) for JH-224. 
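A usage sketch for the 16-lane Hamsi-512 interface declared in hamsi-hash-4way.h above. This is an illustration, not code from the patch: it assumes the input is already interleaved 32 bits per lane across 16 lanes, and that len counts bytes per lane (a multiple of the 8-byte block size in practice), as the update and close code implies.

#include "algo/hamsi/hamsi-hash-4way.h"

// Hash 16 independent messages in one pass (AVX512 build).
// data16: 32-bit interleaved input, 16 lanes per __m512i.
// dst: receives 16 interleaved 64-byte digests in the same layout.
static void hamsi512_16lanes_sketch( void *dst, const void *data16,
                                     size_t lane_len )
{
   hamsi512_16x32_context ctx;
   hamsi512_16x32_init( &ctx );
   hamsi512_16x32_update( &ctx, data16, lane_len );
   hamsi512_16x32_close( &ctx, dst );
}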
diff --git a/algo/keccak/keccak-4way.c b/algo/keccak/keccak-4way.c index f8b0cd1a..95f437e0 100644 --- a/algo/keccak/keccak-4way.c +++ b/algo/keccak/keccak-4way.c @@ -2,7 +2,6 @@ #include #include #include -#include "sph_keccak.h" #include "keccak-hash-4way.h" #if defined(KECCAK_8WAY) diff --git a/algo/keccak/keccak-gate.c b/algo/keccak/keccak-gate.c index c710836b..b2f0a212 100644 --- a/algo/keccak/keccak-gate.c +++ b/algo/keccak/keccak-gate.c @@ -9,7 +9,7 @@ int hard_coded_eb = 1; bool register_keccak_algo( algo_gate_t* gate ) { gate->optimizations = AVX2_OPT | AVX512_OPT; - gate->gen_merkle_root = (void*)&SHA256_gen_merkle_root; + gate->gen_merkle_root = (void*)&sha256_gen_merkle_root; opt_target_factor = 128.0; #if defined (KECCAK_8WAY) gate->scanhash = (void*)&scanhash_keccak_8way; diff --git a/algo/keccak/keccak-hash-4way.h b/algo/keccak/keccak-hash-4way.h index 5b91bcfe..20554091 100644 --- a/algo/keccak/keccak-hash-4way.h +++ b/algo/keccak/keccak-hash-4way.h @@ -1,45 +1,6 @@ -/* $Id: sph_keccak.h 216 2010-06-08 09:46:57Z tp $ */ -/** - * Keccak interface. This is the interface for Keccak with the - * recommended parameters for SHA-3, with output lengths 224, 256, - * 384 and 512 bits. - * - * ==========================(LICENSE BEGIN)============================ - * - * Copyright (c) 2007-2010 Projet RNRT SAPHIR - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - * ===========================(LICENSE END)============================= - * - * @file sph_keccak.h - * @author Thomas Pornin - */ - #ifndef KECCAK_HASH_4WAY_H__ #define KECCAK_HASH_4WAY_H__ -#ifdef __cplusplus -extern "C"{ -#endif - #ifdef __AVX2__ #include @@ -100,8 +61,4 @@ void keccak512_4way_addbits_and_close( #endif -#ifdef __cplusplus -} -#endif - #endif diff --git a/algo/keccak/sha3d-4way.c b/algo/keccak/sha3d-4way.c index ffb4056d..ca5ab726 100644 --- a/algo/keccak/sha3d-4way.c +++ b/algo/keccak/sha3d-4way.c @@ -2,7 +2,6 @@ #include #include #include -#include "sph_keccak.h" #include "keccak-hash-4way.h" #if defined(KECCAK_8WAY) diff --git a/algo/keccak/sph_keccak.h b/algo/keccak/sph_keccak.h index ec2dbfc7..b075f150 100644 --- a/algo/keccak/sph_keccak.h +++ b/algo/keccak/sph_keccak.h @@ -41,7 +41,7 @@ extern "C"{ #endif #include -#include "algo/sha/sph_types.h" +#include "compat/sph_types.h" /** * Output size (in bits) for Keccak-224. 
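The gen_merkle_root and scanhash assignments in keccak-gate.c above follow the repository's usual algo-gate dispatch pattern: pick the widest compiled SIMD path at registration time. A generic sketch with a hypothetical algo name (the field names and OPT flags come from the hunk above; the bool return convention is assumed):

#include "algo-gate-api.h"

bool register_myalgo_algo( algo_gate_t *gate )
{
   gate->optimizations = AVX2_OPT | AVX512_OPT;
#if defined(MYALGO_8WAY)
   gate->scanhash = (void*)&scanhash_myalgo_8way;   // widest SIMD path
#else
   gate->scanhash = (void*)&scanhash_myalgo_4way;   // narrower fallback
#endif
   return true;
}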
diff --git a/algo/lanehash/lane.h b/algo/lanehash/lane.h index 4a02e643..a0e85f60 100644 --- a/algo/lanehash/lane.h +++ b/algo/lanehash/lane.h @@ -23,7 +23,6 @@ #define LANE_H #include -//#include "algo/sha/sha3-defs.h" #include typedef unsigned char BitSequence; diff --git a/algo/luffa/luffa-hash-2way.c b/algo/luffa/luffa-hash-2way.c index 64d14069..f9b049b9 100644 --- a/algo/luffa/luffa-hash-2way.c +++ b/algo/luffa/luffa-hash-2way.c @@ -7,8 +7,10 @@ #include "simd-utils.h" +#define uint32 uint32_t + /* initial values of chaining variables */ -static const uint32 IV[40] __attribute((aligned(64))) = { +static const uint32_t IV[40] __attribute((aligned(64))) = { 0xdbf78465,0x4eaa6fb4,0x44b051e0,0x6d251e69, 0xdef610bb,0xee058139,0x90152df4,0x6e292011, 0xde099fa3,0x70eee9a0,0xd9d2f256,0xc3b44b95, @@ -22,7 +24,7 @@ static const uint32 IV[40] __attribute((aligned(64))) = { }; /* Round Constants */ -static const uint32 CNS_INIT[128] __attribute((aligned(64))) = { +static const uint32_t CNS_INIT[128] __attribute((aligned(64))) = { 0xb213afa5,0xfc20d9d2,0xb6de10ed,0x303994a6, 0xe028c9bf,0xe25e72c1,0x01685f3d,0xe0337818, 0xc84ebe95,0x34552e25,0x70f47aae,0xc0e65299, diff --git a/algo/luffa/luffa-hash-2way.h b/algo/luffa/luffa-hash-2way.h index ba9bc147..a274995f 100644 --- a/algo/luffa/luffa-hash-2way.h +++ b/algo/luffa/luffa-hash-2way.h @@ -23,7 +23,7 @@ #if defined(__AVX2__) #include -#include "algo/sha/sha3-defs.h" +//#include "algo/sha/sha3-defs.h" #include "simd-utils.h" /* The length of digests*/ @@ -54,7 +54,7 @@ #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) typedef struct { - uint32 buffer[8*4]; + uint32_t buffer[8*4]; __m512i chainv[10]; /* Chaining values */ int hashbitlen; int rembytes; @@ -82,7 +82,7 @@ int luffa512_4way_update_close( luffa_4way_context *state, void *output, #endif typedef struct { - uint32 buffer[8*2]; + uint32_t buffer[8*2]; __m256i chainv[10]; /* Chaining values */ int hashbitlen; int rembytes; diff --git a/algo/luffa/luffa_for_sse2.h b/algo/luffa/luffa_for_sse2.h index f20a400c..aaa066e6 100644 --- a/algo/luffa/luffa_for_sse2.h +++ b/algo/luffa/luffa_for_sse2.h @@ -22,7 +22,7 @@ */ #include -#include "algo/sha/sha3-defs.h" +#include "compat/sha3-defs.h" /* The length of digests*/ #define DIGEST_BIT_LEN_224 224 #define DIGEST_BIT_LEN_256 256 diff --git a/algo/luffa/sph_luffa.h b/algo/luffa/sph_luffa.h index 5201d2fc..5cc24b7d 100644 --- a/algo/luffa/sph_luffa.h +++ b/algo/luffa/sph_luffa.h @@ -41,7 +41,7 @@ extern "C"{ #endif #include -#include "algo/sha/sph_types.h" +#include "compat/sph_types.h" /** * Output size (in bits) for Luffa-224. 
diff --git a/algo/lyra2/lyra2.h b/algo/lyra2/lyra2.h index 5ab0b813..71c0a3f9 100644 --- a/algo/lyra2/lyra2.h +++ b/algo/lyra2/lyra2.h @@ -21,9 +21,8 @@ #define LYRA2_H_ #include -#include "algo/sha/sha3-defs.h" -//typedef unsigned char byte; +typedef unsigned char byte; //Block length required so Blake2's Initialization Vector (IV) is not overwritten (THIS SHOULD NOT BE MODIFIED) #define BLOCK_LEN_BLAKE2_SAFE_INT64 8 //512 bits (=64 bytes, =8 uint64_t) diff --git a/algo/lyra2/lyra2rev2.c b/algo/lyra2/lyra2rev2.c index 69155182..8e052971 100644 --- a/algo/lyra2/lyra2rev2.c +++ b/algo/lyra2/lyra2rev2.c @@ -4,7 +4,6 @@ #include #include "algo/blake/sph_blake.h" -#include "algo/cubehash/sph_cubehash.h" #include "algo/keccak/sph_keccak.h" #include "algo/skein/sph_skein.h" #include "algo/bmw/sph_bmw.h" diff --git a/algo/lyra2/lyra2rev3.c b/algo/lyra2/lyra2rev3.c index e72ec88c..d1e5b518 100644 --- a/algo/lyra2/lyra2rev3.c +++ b/algo/lyra2/lyra2rev3.c @@ -4,7 +4,6 @@ #include #include "algo/blake/sph_blake.h" -#include "algo/cubehash/sph_cubehash.h" #include "algo/bmw/sph_bmw.h" #include "algo/cubehash/cubehash_sse2.h" //#include "lyra2.h" diff --git a/algo/panama/sph_panama.h b/algo/panama/sph_panama.h index 6f9d3e8a..638e4313 100644 --- a/algo/panama/sph_panama.h +++ b/algo/panama/sph_panama.h @@ -58,7 +58,7 @@ #define SPH_PANAMA_H__ #include -#include "algo/sha/sph_types.h" +#include "compat/sph_types.h" /** * Output size (in bits) for PANAMA. diff --git a/algo/quark/hmq1725-4way.c b/algo/quark/hmq1725-4way.c index 45b10115..96c6d0f8 100644 --- a/algo/quark/hmq1725-4way.c +++ b/algo/quark/hmq1725-4way.c @@ -21,7 +21,7 @@ #include "algo/shabal/shabal-hash-4way.h" #include "algo/whirlpool/sph_whirlpool.h" #include "algo/haval/haval-hash-4way.h" -#include "algo/sha/sha-hash-4way.h" +#include "algo/sha/sha512-hash.h" #if defined(__VAES__) #include "algo/groestl/groestl512-hash-4way.h" #include "algo/shavite/shavite-hash-4way.h" diff --git a/algo/ripemd/lbry-4way.c b/algo/ripemd/lbry-4way.c index 386e2452..990a4af3 100644 --- a/algo/ripemd/lbry-4way.c +++ b/algo/ripemd/lbry-4way.c @@ -3,7 +3,8 @@ #include #include #include -#include "algo/sha/sha-hash-4way.h" +#include "algo/sha/sha256-hash.h" +#include "algo/sha/sha512-hash.h" #include "ripemd-hash-4way.h" #define LBRY_INPUT_SIZE 112 diff --git a/algo/ripemd/ripemd-hash-4way.h b/algo/ripemd/ripemd-hash-4way.h index 71fb3d73..c0c87db4 100644 --- a/algo/ripemd/ripemd-hash-4way.h +++ b/algo/ripemd/ripemd-hash-4way.h @@ -2,7 +2,6 @@ #define RIPEMD_HASH_4WAY_H__ #include -#include "algo/sha/sph_types.h" #if defined(__SSE4_2__) diff --git a/algo/ripemd/sph_ripemd.h b/algo/ripemd/sph_ripemd.h index b677bd54..c0019f9c 100644 --- a/algo/ripemd/sph_ripemd.h +++ b/algo/ripemd/sph_ripemd.h @@ -57,7 +57,7 @@ #define SPH_RIPEMD_H__ #include -#include "algo/sha/sph_types.h" +#include "compat/sph_types.h" /** * Output size (in bits) for RIPEMD. 
diff --git a/algo/scrypt/scrypt.c b/algo/scrypt/scrypt.c index c36411bd..b60a5ba8 100644 --- a/algo/scrypt/scrypt.c +++ b/algo/scrypt/scrypt.c @@ -31,7 +31,6 @@ #include #include #include -#include "algo/sha/sha-hash-4way.h" #include "algo/sha/sha256-hash.h" #include #include "malloc-huge.h" diff --git a/algo/sha/hmac-sha256-hash-4way.h b/algo/sha/hmac-sha256-hash-4way.h index f33fa23a..31d51cd9 100644 --- a/algo/sha/hmac-sha256-hash-4way.h +++ b/algo/sha/hmac-sha256-hash-4way.h @@ -36,7 +36,7 @@ #include #include #include "simd-utils.h" -#include "sha-hash-4way.h" +#include "sha256-hash.h" typedef struct _hmac_sha256_4way_context { diff --git a/algo/sha/sha-hash-4way.h b/algo/sha/sha-hash-4way.h deleted file mode 100644 index 2e95c7f0..00000000 --- a/algo/sha/sha-hash-4way.h +++ /dev/null @@ -1,168 +0,0 @@ -/* $Id: sph_sha2.h 216 2010-06-08 09:46:57Z tp $ */ -/** - * SHA-224, SHA-256, SHA-384 and SHA-512 interface. - * - * SHA-256 has been published in FIPS 180-2, now amended with a change - * notice to include SHA-224 as well (which is a simple variation on - * SHA-256). SHA-384 and SHA-512 are also defined in FIPS 180-2. FIPS - * standards can be found at: - * http://csrc.nist.gov/publications/fips/ - * - * ==========================(LICENSE BEGIN)============================ - * - * Copyright (c) 2007-2010 Projet RNRT SAPHIR - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- * - * ===========================(LICENSE END)============================= - * - * @file sph_sha2.h - * @author Thomas Pornin - */ - -#ifndef SHA2_HASH_4WAY_H__ -#define SHA2_HASH_4WAY_H__ 1 - -#include -#include "simd-utils.h" - -#if defined(__SSE2__) - -// SHA-256 4 way - -typedef struct { - __m128i buf[64>>2]; - __m128i val[8]; - uint32_t count_high, count_low; -} sha256_4way_context __attribute__ ((aligned (64))); - -void sha256_4way_init( sha256_4way_context *sc ); -void sha256_4way_update( sha256_4way_context *sc, const void *data, - size_t len ); -void sha256_4way_close( sha256_4way_context *sc, void *dst ); -void sha256_4way_full( void *dst, const void *data, size_t len ); -void sha256_4way_transform_le( __m128i *state_out, const __m128i *data, - const __m128i *state_in ); -void sha256_4way_transform_be( __m128i *state_out, const __m128i *data, - const __m128i *state_in ); -void sha256_4way_prehash_3rounds( __m128i *state_mid, __m128i *X, - const __m128i *W, const __m128i *state_in ); -void sha256_4way_final_rounds( __m128i *state_out, const __m128i *data, - const __m128i *state_in, const __m128i *state_mid, const __m128i *X ); -int sha256_4way_transform_le_short( __m128i *state_out, const __m128i *data, - const __m128i *state_in, const uint32_t *target ); - -#endif // SSE2 - -#if defined (__AVX2__) - -// SHA-256 8 way - -typedef struct { - __m256i buf[64>>2]; - __m256i val[8]; - uint32_t count_high, count_low; -} sha256_8way_context __attribute__ ((aligned (128))); - -void sha256_8way_init( sha256_8way_context *sc ); -void sha256_8way_update( sha256_8way_context *sc, const void *data, size_t len ); -void sha256_8way_close( sha256_8way_context *sc, void *dst ); -void sha256_8way_full( void *dst, const void *data, size_t len ); -void sha256_8way_transform_le( __m256i *state_out, const __m256i *data, - const __m256i *state_in ); -void sha256_8way_transform_be( __m256i *state_out, const __m256i *data, - const __m256i *state_in ); - -void sha256_8way_prehash_3rounds( __m256i *state_mid, __m256i *X, - const __m256i *W, const __m256i *state_in ); -void sha256_8way_final_rounds( __m256i *state_out, const __m256i *data, - const __m256i *state_in, const __m256i *state_mid, const __m256i *X ); -int sha256_8way_transform_le_short( __m256i *state_out, const __m256i *data, - const __m256i *state_in, const uint32_t *target ); - -#endif // AVX2 - -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) - -// SHA-256 16 way - -typedef struct { - __m512i buf[64>>2]; - __m512i val[8]; - uint32_t count_high, count_low; -} sha256_16way_context __attribute__ ((aligned (128))); - -void sha256_16way_init( sha256_16way_context *sc ); -void sha256_16way_update( sha256_16way_context *sc, const void *data, size_t len ); -void sha256_16way_close( sha256_16way_context *sc, void *dst ); -void sha256_16way_full( void *dst, const void *data, size_t len ); -void sha256_16way_transform_le( __m512i *state_out, const __m512i *data, - const __m512i *state_in ); -void sha256_16way_transform_be( __m512i *state_out, const __m512i *data, - const __m512i *state_in ); -void sha256_16way_prehash_3rounds( __m512i *state_mid, __m512i *X, - const __m512i *W, const __m512i *state_in ); -void sha256_16way_final_rounds( __m512i *state_out, const __m512i *data, - const __m512i *state_in, const __m512i *state_mid, const __m512i *X ); - -int sha256_16way_transform_le_short( __m512i *state_out, const __m512i *data, - const __m512i *state_in, const uint32_t *target ); - -#endif // AVX512 - 
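Everything this header declared survives its deletion: the SHA-256 interface moves into sha256-hash.h and the SHA-512 interface below moves into the new sha512-hash.h. All of the N-way functions operate on vertically interleaved lanes: 32-bit element k of vector word i is word i of message k. A minimal usage sketch of the 4-way variant under that layout; interleave_4x32 and sha256_4way_hash4 are illustrative helpers, not part of the source:

#include <stdint.h>
#include <string.h>
#include <emmintrin.h>
#include "algo/sha/sha256-hash.h"   // home of sha256_4way_full after this patch

// Pack four independent messages into 4-way form: vector word i holds
// word i of messages 0..3, one message per 32-bit lane.
static void interleave_4x32( __m128i *dst, const uint32_t *m0,
                             const uint32_t *m1, const uint32_t *m2,
                             const uint32_t *m3, int nwords )
{
   for ( int i = 0; i < nwords; i++ )
      dst[i] = _mm_set_epi32( m3[i], m2[i], m1[i], m0[i] );
}

// Hash four 64-byte messages in one pass. The digests come back in the
// same interleaved layout and are scattered to per-message buffers.
static void sha256_4way_hash4( uint32_t hash[4][8], const uint32_t msg[4][16] )
{
   __m128i vmsg[16] __attribute__ ((aligned (64)));
   __m128i vhash[8] __attribute__ ((aligned (64)));
   uint32_t lane[4];

   interleave_4x32( vmsg, msg[0], msg[1], msg[2], msg[3], 16 );
   sha256_4way_full( vhash, vmsg, 64 );

   for ( int i = 0; i < 8; i++ )
   {
      memcpy( lane, &vhash[i], sizeof(lane) );
      hash[0][i] = lane[0];
      hash[1][i] = lane[1];
      hash[2][i] = lane[2];
      hash[3][i] = lane[3];
   }
}

The same layout scales to the 8-way (__m256i) and 16-way (__m512i) variants; only the lane count and the required ISA change.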
-#if defined (__AVX2__) - -// SHA-512 4 way - -typedef struct { - __m256i buf[128>>3]; - __m256i val[8]; - uint64_t count; - bool initialized; -} sha512_4way_context __attribute__ ((aligned (128))); - -void sha512_4way_init( sha512_4way_context *sc); -void sha512_4way_update( sha512_4way_context *sc, const void *data, - size_t len ); -void sha512_4way_close( sha512_4way_context *sc, void *dst ); -void sha512_4way_full( void *dst, const void *data, size_t len ); - -#endif // AVX2 - -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) - -// SHA-512 8 way - -typedef struct { - __m512i buf[128>>3]; - __m512i val[8]; - uint64_t count; - bool initialized; -} sha512_8way_context __attribute__ ((aligned (128))); - -void sha512_8way_init( sha512_8way_context *sc); -void sha512_8way_update( sha512_8way_context *sc, const void *data, - size_t len ); -void sha512_8way_close( sha512_8way_context *sc, void *dst ); -void sha512_8way_full( void *dst, const void *data, size_t len ); - -#endif // AVX512 - -#endif // SHA256_4WAY_H__ diff --git a/algo/sha/sha256-hash-2way-ni.c b/algo/sha/sha256-hash-2way-ni.c deleted file mode 100644 index 7fc64ca3..00000000 --- a/algo/sha/sha256-hash-2way-ni.c +++ /dev/null @@ -1,689 +0,0 @@ -/* Intel SHA extensions using C intrinsics */ -/* Written and place in public domain by Jeffrey Walton */ -/* Based on code from Intel, and by Sean Gulley for */ -/* the miTLS project. */ - -// A stripped down version with byte swapping removed. - -#if defined(__SHA__) - -#include "sha256-hash.h" - -void sha256_ni2way_transform_le( uint32_t *out_X, uint32_t*out_Y, - const void *msg_X, const void *msg_Y, - const uint32_t *in_X, const uint32_t *in_Y ) -{ - __m128i STATE0_X, STATE1_X, STATE0_Y, STATE1_Y; - __m128i MSG_X, MSG_Y, TMP_X, TMP_Y; - __m128i TMSG0_X, TMSG1_X, TMSG2_X, TMSG3_X; - __m128i TMSG0_Y, TMSG1_Y, TMSG2_Y, TMSG3_Y; - __m128i ABEF_SAVE_X, CDGH_SAVE_X,ABEF_SAVE_Y, CDGH_SAVE_Y; - - // Load initial values - TMP_X = _mm_load_si128((__m128i*) &in_X[0]); - STATE1_X = _mm_load_si128((__m128i*) &in_X[4]); - TMP_Y = _mm_load_si128((__m128i*) &in_Y[0]); - STATE1_Y = _mm_load_si128((__m128i*) &in_Y[4]); - - TMP_X = _mm_shuffle_epi32(TMP_X, 0xB1); // CDAB - TMP_Y = _mm_shuffle_epi32(TMP_Y, 0xB1); // CDAB - STATE1_X = _mm_shuffle_epi32(STATE1_X, 0x1B); // EFGH - STATE1_Y = _mm_shuffle_epi32(STATE1_Y, 0x1B); // EFGH - STATE0_X = _mm_alignr_epi8(TMP_X, STATE1_X, 8); // ABEF - STATE0_Y = _mm_alignr_epi8(TMP_Y, STATE1_Y, 8); // ABEF - STATE1_X = _mm_blend_epi16(STATE1_X, TMP_X, 0xF0); // CDGH - STATE1_Y = _mm_blend_epi16(STATE1_Y, TMP_Y, 0xF0); // CDGH - - // Save current hash - ABEF_SAVE_X = STATE0_X; - ABEF_SAVE_Y = STATE0_Y; - CDGH_SAVE_X = STATE1_X; - CDGH_SAVE_Y = STATE1_Y; - - // Rounds 0-3 - TMSG0_X = _mm_load_si128((const __m128i*) (msg_X)); - TMSG0_Y = _mm_load_si128((const __m128i*) (msg_Y)); - TMP_X = _mm_set_epi64x(0xE9B5DBA5B5C0FBCFULL, 0x71374491428A2F98ULL); - MSG_X = _mm_add_epi32( TMSG0_X, TMP_X ); - MSG_Y = _mm_add_epi32( TMSG0_Y, TMP_X ); - STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); - STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); - MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); - MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); - STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); - STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); - - // Rounds 4-7 - TMSG1_X = _mm_load_si128((const __m128i*) (msg_X+16)); - TMSG1_Y = _mm_load_si128((const __m128i*) (msg_Y+16)); - TMP_X = 
_mm_set_epi64x(0xAB1C5ED5923F82A4ULL, 0x59F111F13956C25BULL); - MSG_X = _mm_add_epi32(TMSG1_X, TMP_X ); - MSG_Y = _mm_add_epi32(TMSG1_Y, TMP_X ); - STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); - STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); - MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); - MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); - STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); - STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); - TMSG0_X = _mm_sha256msg1_epu32(TMSG0_X, TMSG1_X); - TMSG0_Y = _mm_sha256msg1_epu32(TMSG0_Y, TMSG1_Y); - - // Rounds 8-11 - TMSG2_X = _mm_load_si128((const __m128i*) (msg_X+32)); - TMSG2_Y = _mm_load_si128((const __m128i*) (msg_Y+32)); - TMP_X = _mm_set_epi64x(0x550C7DC3243185BEULL, 0x12835B01D807AA98ULL); - MSG_X = _mm_add_epi32(TMSG2_X, TMP_X ); - MSG_Y = _mm_add_epi32(TMSG2_Y, TMP_X ); - STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); - STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); - MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); - MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); - STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); - STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); - TMSG1_X = _mm_sha256msg1_epu32(TMSG1_X, TMSG2_X); - TMSG1_Y = _mm_sha256msg1_epu32(TMSG1_Y, TMSG2_Y); - - // Rounds 12-15 - TMSG3_X = _mm_load_si128((const __m128i*) (msg_X+48)); - TMSG3_Y = _mm_load_si128((const __m128i*) (msg_Y+48)); - TMP_X = _mm_set_epi64x(0xC19BF1749BDC06A7ULL, 0x80DEB1FE72BE5D74ULL); - MSG_X = _mm_add_epi32(TMSG3_X, TMP_X ); - MSG_Y = _mm_add_epi32(TMSG3_Y, TMP_X ); - STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); - STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); - TMP_X = _mm_alignr_epi8(TMSG3_X, TMSG2_X, 4); - TMP_Y = _mm_alignr_epi8(TMSG3_Y, TMSG2_Y, 4); - TMSG0_X = _mm_add_epi32(TMSG0_X, TMP_X); - TMSG0_Y = _mm_add_epi32(TMSG0_Y, TMP_Y); - TMSG0_X = _mm_sha256msg2_epu32(TMSG0_X, TMSG3_X); - TMSG0_Y = _mm_sha256msg2_epu32(TMSG0_Y, TMSG3_Y); - MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); - MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); - STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); - STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); - TMSG2_X = _mm_sha256msg1_epu32(TMSG2_X, TMSG3_X); - TMSG2_Y = _mm_sha256msg1_epu32(TMSG2_Y, TMSG3_Y); - - // Rounds 16-19 - TMP_X = _mm_set_epi64x(0x240CA1CC0FC19DC6ULL, 0xEFBE4786E49B69C1ULL); - MSG_X = _mm_add_epi32(TMSG0_X, TMP_X ); - MSG_Y = _mm_add_epi32(TMSG0_Y, TMP_X ); - STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); - STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); - TMP_X = _mm_alignr_epi8(TMSG0_X, TMSG3_X, 4); - TMP_Y = _mm_alignr_epi8(TMSG0_Y, TMSG3_Y, 4); - TMSG1_X = _mm_add_epi32(TMSG1_X, TMP_X); - TMSG1_Y = _mm_add_epi32(TMSG1_Y, TMP_Y); - TMSG1_X = _mm_sha256msg2_epu32(TMSG1_X, TMSG0_X); - TMSG1_Y = _mm_sha256msg2_epu32(TMSG1_Y, TMSG0_Y); - MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); - MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); - STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); - STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); - TMSG3_X = _mm_sha256msg1_epu32(TMSG3_X, TMSG0_X); - TMSG3_Y = _mm_sha256msg1_epu32(TMSG3_Y, TMSG0_Y); - - // Rounds 20-23 - TMP_X = _mm_set_epi64x(0x76F988DA5CB0A9DCULL, 0x4A7484AA2DE92C6FULL); - MSG_X = _mm_add_epi32(TMSG1_X, TMP_X ); - MSG_Y = _mm_add_epi32(TMSG1_Y, TMP_X ); - STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); - STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); - TMP_X = 
_mm_alignr_epi8(TMSG1_X, TMSG0_X, 4); - TMP_Y = _mm_alignr_epi8(TMSG1_Y, TMSG0_Y, 4); - TMSG2_X = _mm_add_epi32(TMSG2_X, TMP_X); - TMSG2_Y = _mm_add_epi32(TMSG2_Y, TMP_Y); - TMSG2_X = _mm_sha256msg2_epu32(TMSG2_X, TMSG1_X); - TMSG2_Y = _mm_sha256msg2_epu32(TMSG2_Y, TMSG1_Y); - MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); - MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); - STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); - STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); - TMSG0_X = _mm_sha256msg1_epu32(TMSG0_X, TMSG1_X); - TMSG0_Y = _mm_sha256msg1_epu32(TMSG0_Y, TMSG1_Y); - - // Rounds 24-27 - TMP_X = _mm_set_epi64x(0xBF597FC7B00327C8ULL, 0xA831C66D983E5152ULL); - MSG_X = _mm_add_epi32(TMSG2_X, TMP_X ); - MSG_Y = _mm_add_epi32(TMSG2_Y, TMP_X ); - STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); - STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); - TMP_X = _mm_alignr_epi8(TMSG2_X, TMSG1_X, 4); - TMP_Y = _mm_alignr_epi8(TMSG2_Y, TMSG1_Y, 4); - TMSG3_X = _mm_add_epi32(TMSG3_X, TMP_X); - TMSG3_Y = _mm_add_epi32(TMSG3_Y, TMP_Y); - TMSG3_X = _mm_sha256msg2_epu32(TMSG3_X, TMSG2_X); - TMSG3_Y = _mm_sha256msg2_epu32(TMSG3_Y, TMSG2_Y); - MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); - MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); - STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); - STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); - TMSG1_X = _mm_sha256msg1_epu32(TMSG1_X, TMSG2_X); - TMSG1_Y = _mm_sha256msg1_epu32(TMSG1_Y, TMSG2_Y); - - // Rounds 28-31 - TMP_X = _mm_set_epi64x(0x1429296706CA6351ULL, 0xD5A79147C6E00BF3ULL); - MSG_X = _mm_add_epi32(TMSG3_X, TMP_X ); - MSG_Y = _mm_add_epi32(TMSG3_Y, TMP_X ); - STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); - STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); - TMP_X = _mm_alignr_epi8(TMSG3_X, TMSG2_X, 4); - TMP_Y = _mm_alignr_epi8(TMSG3_Y, TMSG2_Y, 4); - TMSG0_X = _mm_add_epi32(TMSG0_X, TMP_X); - TMSG0_Y = _mm_add_epi32(TMSG0_Y, TMP_Y); - TMSG0_X = _mm_sha256msg2_epu32(TMSG0_X, TMSG3_X); - TMSG0_Y = _mm_sha256msg2_epu32(TMSG0_Y, TMSG3_Y); - MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); - MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); - STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); - STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); - TMSG2_X = _mm_sha256msg1_epu32(TMSG2_X, TMSG3_X); - TMSG2_Y = _mm_sha256msg1_epu32(TMSG2_Y, TMSG3_Y); - - // Rounds 32-35 - TMP_X = _mm_set_epi64x(0x53380D134D2C6DFCULL, 0x2E1B213827B70A85ULL); - MSG_X = _mm_add_epi32(TMSG0_X, TMP_X ); - MSG_Y = _mm_add_epi32(TMSG0_Y, TMP_X ); - STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); - STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); - TMP_X = _mm_alignr_epi8(TMSG0_X, TMSG3_X, 4); - TMP_Y = _mm_alignr_epi8(TMSG0_Y, TMSG3_Y, 4); - TMSG1_X = _mm_add_epi32(TMSG1_X, TMP_X); - TMSG1_Y = _mm_add_epi32(TMSG1_Y, TMP_Y); - TMSG1_X = _mm_sha256msg2_epu32(TMSG1_X, TMSG0_X); - TMSG1_Y = _mm_sha256msg2_epu32(TMSG1_Y, TMSG0_Y); - MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); - MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); - STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); - STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); - TMSG3_X = _mm_sha256msg1_epu32(TMSG3_X, TMSG0_X); - TMSG3_Y = _mm_sha256msg1_epu32(TMSG3_Y, TMSG0_Y); - - // Rounds 36-39 - TMP_X = _mm_set_epi64x(0x92722C8581C2C92EULL, 0x766A0ABB650A7354ULL); - MSG_X = _mm_add_epi32(TMSG1_X, TMP_X); - MSG_Y = _mm_add_epi32(TMSG1_Y, TMP_X); - STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); - STATE1_Y = 
_mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); - TMP_X = _mm_alignr_epi8(TMSG1_X, TMSG0_X, 4); - TMP_Y = _mm_alignr_epi8(TMSG1_Y, TMSG0_Y, 4); - TMSG2_X = _mm_add_epi32(TMSG2_X, TMP_X); - TMSG2_Y = _mm_add_epi32(TMSG2_Y, TMP_Y); - TMSG2_X = _mm_sha256msg2_epu32(TMSG2_X, TMSG1_X); - TMSG2_Y = _mm_sha256msg2_epu32(TMSG2_Y, TMSG1_Y); - MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); - MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); - STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); - STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); - TMSG0_X = _mm_sha256msg1_epu32(TMSG0_X, TMSG1_X); - TMSG0_Y = _mm_sha256msg1_epu32(TMSG0_Y, TMSG1_Y); - - // Rounds 40-43 - TMP_X = _mm_set_epi64x(0xC76C51A3C24B8B70ULL, 0xA81A664BA2BFE8A1ULL); - MSG_X = _mm_add_epi32(TMSG2_X, TMP_X); - MSG_Y = _mm_add_epi32(TMSG2_Y, TMP_X); - STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); - STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); - TMP_X = _mm_alignr_epi8(TMSG2_X, TMSG1_X, 4); - TMP_Y = _mm_alignr_epi8(TMSG2_Y, TMSG1_Y, 4); - TMSG3_X = _mm_add_epi32(TMSG3_X, TMP_X); - TMSG3_Y = _mm_add_epi32(TMSG3_Y, TMP_Y); - TMSG3_X = _mm_sha256msg2_epu32(TMSG3_X, TMSG2_X); - TMSG3_Y = _mm_sha256msg2_epu32(TMSG3_Y, TMSG2_Y); - MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); - MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); - STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); - STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); - TMSG1_X = _mm_sha256msg1_epu32(TMSG1_X, TMSG2_X); - TMSG1_Y = _mm_sha256msg1_epu32(TMSG1_Y, TMSG2_Y); - - // Rounds 44-47 - TMP_X = _mm_set_epi64x(0x106AA070F40E3585ULL, 0xD6990624D192E819ULL); - MSG_X = _mm_add_epi32(TMSG3_X, TMP_X); - MSG_Y = _mm_add_epi32(TMSG3_Y, TMP_X); - STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); - STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); - TMP_X = _mm_alignr_epi8(TMSG3_X, TMSG2_X, 4); - TMP_Y = _mm_alignr_epi8(TMSG3_Y, TMSG2_Y, 4); - TMSG0_X = _mm_add_epi32(TMSG0_X, TMP_X); - TMSG0_Y = _mm_add_epi32(TMSG0_Y, TMP_Y); - TMSG0_X = _mm_sha256msg2_epu32(TMSG0_X, TMSG3_X); - TMSG0_Y = _mm_sha256msg2_epu32(TMSG0_Y, TMSG3_Y); - MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); - MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); - STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); - STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); - TMSG2_X = _mm_sha256msg1_epu32(TMSG2_X, TMSG3_X); - TMSG2_Y = _mm_sha256msg1_epu32(TMSG2_Y, TMSG3_Y); - - // Rounds 48-51 - TMP_X = _mm_set_epi64x(0x34B0BCB52748774CULL, 0x1E376C0819A4C116ULL); - MSG_X = _mm_add_epi32(TMSG0_X, TMP_X ); - MSG_Y = _mm_add_epi32(TMSG0_Y, TMP_X ); - STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); - STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); - TMP_X = _mm_alignr_epi8(TMSG0_X, TMSG3_X, 4); - TMP_Y = _mm_alignr_epi8(TMSG0_Y, TMSG3_Y, 4); - TMSG1_X = _mm_add_epi32(TMSG1_X, TMP_X); - TMSG1_Y = _mm_add_epi32(TMSG1_Y, TMP_Y); - TMSG1_X = _mm_sha256msg2_epu32(TMSG1_X, TMSG0_X); - TMSG1_Y = _mm_sha256msg2_epu32(TMSG1_Y, TMSG0_Y); - MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); - MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); - STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); - STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); - TMSG3_X = _mm_sha256msg1_epu32(TMSG3_X, TMSG0_X); - TMSG3_Y = _mm_sha256msg1_epu32(TMSG3_Y, TMSG0_Y); - - // Rounds 52-55 - TMP_X = _mm_set_epi64x(0x682E6FF35B9CCA4FULL, 0x4ED8AA4A391C0CB3ULL); - MSG_X = _mm_add_epi32(TMSG1_X, TMP_X ); - MSG_Y = _mm_add_epi32(TMSG1_Y, TMP_X ); - STATE1_X = 
_mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); - STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); - TMP_X = _mm_alignr_epi8(TMSG1_X, TMSG0_X, 4); - TMP_Y = _mm_alignr_epi8(TMSG1_Y, TMSG0_Y, 4); - TMSG2_X = _mm_add_epi32(TMSG2_X, TMP_X); - TMSG2_Y = _mm_add_epi32(TMSG2_Y, TMP_Y); - TMSG2_X = _mm_sha256msg2_epu32(TMSG2_X, TMSG1_X); - TMSG2_Y = _mm_sha256msg2_epu32(TMSG2_Y, TMSG1_Y); - MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); - MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); - STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); - STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); - - // Rounds 56-59 - TMP_X = _mm_set_epi64x(0x8CC7020884C87814ULL, 0x78A5636F748F82EEULL); - MSG_X = _mm_add_epi32(TMSG2_X, TMP_X); - MSG_Y = _mm_add_epi32(TMSG2_Y, TMP_X); - STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); - STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); - TMP_X = _mm_alignr_epi8(TMSG2_X, TMSG1_X, 4); - TMP_Y = _mm_alignr_epi8(TMSG2_Y, TMSG1_Y, 4); - TMSG3_X = _mm_add_epi32(TMSG3_X, TMP_X); - TMSG3_Y = _mm_add_epi32(TMSG3_Y, TMP_Y); - TMSG3_X = _mm_sha256msg2_epu32(TMSG3_X, TMSG2_X); - TMSG3_Y = _mm_sha256msg2_epu32(TMSG3_Y, TMSG2_Y); - MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); - MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); - STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); - STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); - - // Rounds 60-63 - TMP_X = _mm_set_epi64x(0xC67178F2BEF9A3F7ULL, 0xA4506CEB90BEFFFAULL); - MSG_X = _mm_add_epi32(TMSG3_X, TMP_X); - MSG_Y = _mm_add_epi32(TMSG3_Y, TMP_X); - STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); - STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); - MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); - MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); - STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); - STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); - - // Add values back to state - STATE0_X = _mm_add_epi32(STATE0_X, ABEF_SAVE_X); - STATE1_X = _mm_add_epi32(STATE1_X, CDGH_SAVE_X); - STATE0_Y = _mm_add_epi32(STATE0_Y, ABEF_SAVE_Y); - STATE1_Y = _mm_add_epi32(STATE1_Y, CDGH_SAVE_Y); - - TMP_X = _mm_shuffle_epi32(STATE0_X, 0x1B); // FEBA - TMP_Y = _mm_shuffle_epi32(STATE0_Y, 0x1B); // FEBA - STATE1_X = _mm_shuffle_epi32(STATE1_X, 0xB1); // DCHG - STATE1_Y = _mm_shuffle_epi32(STATE1_Y, 0xB1); // DCHG - STATE0_X = _mm_blend_epi16(TMP_X, STATE1_X, 0xF0); // DCBA - STATE0_Y = _mm_blend_epi16(TMP_Y, STATE1_Y, 0xF0); // DCBA - STATE1_X = _mm_alignr_epi8(STATE1_X, TMP_X, 8); // ABEF - STATE1_Y = _mm_alignr_epi8(STATE1_Y, TMP_Y, 8); // ABEF - - // Save state - _mm_store_si128((__m128i*) &out_X[0], STATE0_X); - _mm_store_si128((__m128i*) &out_X[4], STATE1_X); - _mm_store_si128((__m128i*) &out_Y[0], STATE0_Y); - _mm_store_si128((__m128i*) &out_Y[4], STATE1_Y); -} - -void sha256_ni2way_transform_be( uint32_t *out_X, uint32_t*out_Y, - const void *msg_X, const void *msg_Y, - const uint32_t *in_X, const uint32_t *in_Y ) -{ - __m128i STATE0_X, STATE1_X, STATE0_Y, STATE1_Y; - __m128i MSG_X, MSG_Y, TMP_X, TMP_Y, MASK; - __m128i TMSG0_X, TMSG1_X, TMSG2_X, TMSG3_X; - __m128i TMSG0_Y, TMSG1_Y, TMSG2_Y, TMSG3_Y; - __m128i ABEF_SAVE_X, CDGH_SAVE_X, ABEF_SAVE_Y, CDGH_SAVE_Y; - - // Load initial values - TMP_X = _mm_load_si128((__m128i*) &in_X[0]); - STATE1_X = _mm_load_si128((__m128i*) &in_X[4]); - TMP_Y = _mm_load_si128((__m128i*) &in_Y[0]); - STATE1_Y = _mm_load_si128((__m128i*) &in_Y[4]); - MASK = _mm_set_epi64x(0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL); - - TMP_X = 
_mm_shuffle_epi32(TMP_X, 0xB1); // CDAB - TMP_Y = _mm_shuffle_epi32(TMP_Y, 0xB1); // CDAB - STATE1_X = _mm_shuffle_epi32(STATE1_X, 0x1B); // EFGH - STATE1_Y = _mm_shuffle_epi32(STATE1_Y, 0x1B); // EFGH - STATE0_X = _mm_alignr_epi8(TMP_X, STATE1_X, 8); // ABEF - STATE0_Y = _mm_alignr_epi8(TMP_Y, STATE1_Y, 8); // ABEF - STATE1_X = _mm_blend_epi16(STATE1_X, TMP_X, 0xF0); // CDGH - STATE1_Y = _mm_blend_epi16(STATE1_Y, TMP_Y, 0xF0); // CDGH - - // Save current hash - ABEF_SAVE_X = STATE0_X; - ABEF_SAVE_Y = STATE0_Y; - CDGH_SAVE_X = STATE1_X; - CDGH_SAVE_Y = STATE1_Y; - - // Rounds 0-3 - TMSG0_X = _mm_load_si128((const __m128i*) (msg_X)); - TMSG0_Y = _mm_load_si128((const __m128i*) (msg_Y)); - TMP_X = _mm_set_epi64x(0xE9B5DBA5B5C0FBCFULL, 0x71374491428A2F98ULL); - TMSG0_X = _mm_shuffle_epi8( TMSG0_X, MASK ); - TMSG0_Y = _mm_shuffle_epi8( TMSG0_Y, MASK ); - MSG_X = _mm_add_epi32( TMSG0_X, TMP_X ); - MSG_Y = _mm_add_epi32( TMSG0_Y, TMP_X ); - STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); - STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); - MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); - MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); - STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); - STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); - - // Rounds 4-7 - TMSG1_X = _mm_load_si128((const __m128i*) (msg_X+16)); - TMSG1_Y = _mm_load_si128((const __m128i*) (msg_Y+16)); - TMP_X = _mm_set_epi64x(0xAB1C5ED5923F82A4ULL, 0x59F111F13956C25BULL); - TMSG1_X = _mm_shuffle_epi8( TMSG1_X, MASK ); - TMSG1_Y = _mm_shuffle_epi8( TMSG1_Y, MASK ); - MSG_X = _mm_add_epi32(TMSG1_X, TMP_X ); - MSG_Y = _mm_add_epi32(TMSG1_Y, TMP_X ); - STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); - STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); - MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); - MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); - STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); - STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); - TMSG0_X = _mm_sha256msg1_epu32(TMSG0_X, TMSG1_X); - TMSG0_Y = _mm_sha256msg1_epu32(TMSG0_Y, TMSG1_Y); - - // Rounds 8-11 - TMSG2_X = _mm_load_si128((const __m128i*) (msg_X+32)); - TMSG2_Y = _mm_load_si128((const __m128i*) (msg_Y+32)); - TMP_X = _mm_set_epi64x(0x550C7DC3243185BEULL, 0x12835B01D807AA98ULL); - TMSG2_X = _mm_shuffle_epi8( TMSG2_X, MASK ); - TMSG2_Y = _mm_shuffle_epi8( TMSG2_Y, MASK ); - MSG_X = _mm_add_epi32(TMSG2_X, TMP_X ); - MSG_Y = _mm_add_epi32(TMSG2_Y, TMP_X ); - STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); - STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); - MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); - MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); - STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); - STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); - TMSG1_X = _mm_sha256msg1_epu32(TMSG1_X, TMSG2_X); - TMSG1_Y = _mm_sha256msg1_epu32(TMSG1_Y, TMSG2_Y); - - // Rounds 12-15 - TMSG3_X = _mm_load_si128((const __m128i*) (msg_X+48)); - TMSG3_Y = _mm_load_si128((const __m128i*) (msg_Y+48)); - TMP_X = _mm_set_epi64x(0xC19BF1749BDC06A7ULL, 0x80DEB1FE72BE5D74ULL); - TMSG3_X = _mm_shuffle_epi8( TMSG3_X, MASK ); - TMSG3_Y = _mm_shuffle_epi8( TMSG3_Y, MASK ); - MSG_X = _mm_add_epi32(TMSG3_X, TMP_X ); - MSG_Y = _mm_add_epi32(TMSG3_Y, TMP_X ); - STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); - STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); - TMP_X = _mm_alignr_epi8(TMSG3_X, TMSG2_X, 4); - TMP_Y = _mm_alignr_epi8(TMSG3_Y, TMSG2_Y, 4); - TMSG0_X = 
_mm_add_epi32(TMSG0_X, TMP_X); - TMSG0_Y = _mm_add_epi32(TMSG0_Y, TMP_Y); - TMSG0_X = _mm_sha256msg2_epu32(TMSG0_X, TMSG3_X); - TMSG0_Y = _mm_sha256msg2_epu32(TMSG0_Y, TMSG3_Y); - MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); - MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); - STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); - STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); - TMSG2_X = _mm_sha256msg1_epu32(TMSG2_X, TMSG3_X); - TMSG2_Y = _mm_sha256msg1_epu32(TMSG2_Y, TMSG3_Y); - - // Rounds 16-19 - TMP_X = _mm_set_epi64x(0x240CA1CC0FC19DC6ULL, 0xEFBE4786E49B69C1ULL); - MSG_X = _mm_add_epi32(TMSG0_X, TMP_X ); - MSG_Y = _mm_add_epi32(TMSG0_Y, TMP_X ); - STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); - STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); - TMP_X = _mm_alignr_epi8(TMSG0_X, TMSG3_X, 4); - TMP_Y = _mm_alignr_epi8(TMSG0_Y, TMSG3_Y, 4); - TMSG1_X = _mm_add_epi32(TMSG1_X, TMP_X); - TMSG1_Y = _mm_add_epi32(TMSG1_Y, TMP_Y); - TMSG1_X = _mm_sha256msg2_epu32(TMSG1_X, TMSG0_X); - TMSG1_Y = _mm_sha256msg2_epu32(TMSG1_Y, TMSG0_Y); - MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); - MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); - STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); - STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); - TMSG3_X = _mm_sha256msg1_epu32(TMSG3_X, TMSG0_X); - TMSG3_Y = _mm_sha256msg1_epu32(TMSG3_Y, TMSG0_Y); - - // Rounds 20-23 - TMP_X = _mm_set_epi64x(0x76F988DA5CB0A9DCULL, 0x4A7484AA2DE92C6FULL); - MSG_X = _mm_add_epi32(TMSG1_X, TMP_X ); - MSG_Y = _mm_add_epi32(TMSG1_Y, TMP_X ); - STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); - STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); - TMP_X = _mm_alignr_epi8(TMSG1_X, TMSG0_X, 4); - TMP_Y = _mm_alignr_epi8(TMSG1_Y, TMSG0_Y, 4); - TMSG2_X = _mm_add_epi32(TMSG2_X, TMP_X); - TMSG2_Y = _mm_add_epi32(TMSG2_Y, TMP_Y); - TMSG2_X = _mm_sha256msg2_epu32(TMSG2_X, TMSG1_X); - TMSG2_Y = _mm_sha256msg2_epu32(TMSG2_Y, TMSG1_Y); - MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); - MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); - STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); - STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); - TMSG0_X = _mm_sha256msg1_epu32(TMSG0_X, TMSG1_X); - TMSG0_Y = _mm_sha256msg1_epu32(TMSG0_Y, TMSG1_Y); - - // Rounds 24-27 - TMP_X = _mm_set_epi64x(0xBF597FC7B00327C8ULL, 0xA831C66D983E5152ULL); - MSG_X = _mm_add_epi32(TMSG2_X, TMP_X ); - MSG_Y = _mm_add_epi32(TMSG2_Y, TMP_X ); - STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); - STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); - TMP_X = _mm_alignr_epi8(TMSG2_X, TMSG1_X, 4); - TMP_Y = _mm_alignr_epi8(TMSG2_Y, TMSG1_Y, 4); - TMSG3_X = _mm_add_epi32(TMSG3_X, TMP_X); - TMSG3_Y = _mm_add_epi32(TMSG3_Y, TMP_Y); - TMSG3_X = _mm_sha256msg2_epu32(TMSG3_X, TMSG2_X); - TMSG3_Y = _mm_sha256msg2_epu32(TMSG3_Y, TMSG2_Y); - MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); - MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); - STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); - STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); - TMSG1_X = _mm_sha256msg1_epu32(TMSG1_X, TMSG2_X); - TMSG1_Y = _mm_sha256msg1_epu32(TMSG1_Y, TMSG2_Y); - - // Rounds 28-31 - TMP_X = _mm_set_epi64x(0x1429296706CA6351ULL, 0xD5A79147C6E00BF3ULL); - MSG_X = _mm_add_epi32(TMSG3_X, TMP_X ); - MSG_Y = _mm_add_epi32(TMSG3_Y, TMP_X ); - STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); - STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); - TMP_X = _mm_alignr_epi8(TMSG3_X, TMSG2_X, 4); - 
TMP_Y = _mm_alignr_epi8(TMSG3_Y, TMSG2_Y, 4); - TMSG0_X = _mm_add_epi32(TMSG0_X, TMP_X); - TMSG0_Y = _mm_add_epi32(TMSG0_Y, TMP_Y); - TMSG0_X = _mm_sha256msg2_epu32(TMSG0_X, TMSG3_X); - TMSG0_Y = _mm_sha256msg2_epu32(TMSG0_Y, TMSG3_Y); - MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); - MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); - STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); - STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); - TMSG2_X = _mm_sha256msg1_epu32(TMSG2_X, TMSG3_X); - TMSG2_Y = _mm_sha256msg1_epu32(TMSG2_Y, TMSG3_Y); - - // Rounds 32-35 - TMP_X = _mm_set_epi64x(0x53380D134D2C6DFCULL, 0x2E1B213827B70A85ULL); - MSG_X = _mm_add_epi32(TMSG0_X, TMP_X ); - MSG_Y = _mm_add_epi32(TMSG0_Y, TMP_X ); - STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); - STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); - TMP_X = _mm_alignr_epi8(TMSG0_X, TMSG3_X, 4); - TMP_Y = _mm_alignr_epi8(TMSG0_Y, TMSG3_Y, 4); - TMSG1_X = _mm_add_epi32(TMSG1_X, TMP_X); - TMSG1_Y = _mm_add_epi32(TMSG1_Y, TMP_Y); - TMSG1_X = _mm_sha256msg2_epu32(TMSG1_X, TMSG0_X); - TMSG1_Y = _mm_sha256msg2_epu32(TMSG1_Y, TMSG0_Y); - MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); - MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); - STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); - STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); - TMSG3_X = _mm_sha256msg1_epu32(TMSG3_X, TMSG0_X); - TMSG3_Y = _mm_sha256msg1_epu32(TMSG3_Y, TMSG0_Y); - - // Rounds 36-39 - TMP_X = _mm_set_epi64x(0x92722C8581C2C92EULL, 0x766A0ABB650A7354ULL); - MSG_X = _mm_add_epi32(TMSG1_X, TMP_X); - MSG_Y = _mm_add_epi32(TMSG1_Y, TMP_X); - STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); - STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); - TMP_X = _mm_alignr_epi8(TMSG1_X, TMSG0_X, 4); - TMP_Y = _mm_alignr_epi8(TMSG1_Y, TMSG0_Y, 4); - TMSG2_X = _mm_add_epi32(TMSG2_X, TMP_X); - TMSG2_Y = _mm_add_epi32(TMSG2_Y, TMP_Y); - TMSG2_X = _mm_sha256msg2_epu32(TMSG2_X, TMSG1_X); - TMSG2_Y = _mm_sha256msg2_epu32(TMSG2_Y, TMSG1_Y); - MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); - MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); - STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); - STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); - TMSG0_X = _mm_sha256msg1_epu32(TMSG0_X, TMSG1_X); - TMSG0_Y = _mm_sha256msg1_epu32(TMSG0_Y, TMSG1_Y); - - // Rounds 40-43 - TMP_X = _mm_set_epi64x(0xC76C51A3C24B8B70ULL, 0xA81A664BA2BFE8A1ULL); - MSG_X = _mm_add_epi32(TMSG2_X, TMP_X); - MSG_Y = _mm_add_epi32(TMSG2_Y, TMP_X); - STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); - STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); - TMP_X = _mm_alignr_epi8(TMSG2_X, TMSG1_X, 4); - TMP_Y = _mm_alignr_epi8(TMSG2_Y, TMSG1_Y, 4); - TMSG3_X = _mm_add_epi32(TMSG3_X, TMP_X); - TMSG3_Y = _mm_add_epi32(TMSG3_Y, TMP_Y); - TMSG3_X = _mm_sha256msg2_epu32(TMSG3_X, TMSG2_X); - TMSG3_Y = _mm_sha256msg2_epu32(TMSG3_Y, TMSG2_Y); - MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); - MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); - STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); - STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); - TMSG1_X = _mm_sha256msg1_epu32(TMSG1_X, TMSG2_X); - TMSG1_Y = _mm_sha256msg1_epu32(TMSG1_Y, TMSG2_Y); - - // Rounds 44-47 - TMP_X = _mm_set_epi64x(0x106AA070F40E3585ULL, 0xD6990624D192E819ULL); - MSG_X = _mm_add_epi32(TMSG3_X, TMP_X); - MSG_Y = _mm_add_epi32(TMSG3_Y, TMP_X); - STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); - STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); - 
TMP_X = _mm_alignr_epi8(TMSG3_X, TMSG2_X, 4); - TMP_Y = _mm_alignr_epi8(TMSG3_Y, TMSG2_Y, 4); - TMSG0_X = _mm_add_epi32(TMSG0_X, TMP_X); - TMSG0_Y = _mm_add_epi32(TMSG0_Y, TMP_Y); - TMSG0_X = _mm_sha256msg2_epu32(TMSG0_X, TMSG3_X); - TMSG0_Y = _mm_sha256msg2_epu32(TMSG0_Y, TMSG3_Y); - MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); - MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); - STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); - STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); - TMSG2_X = _mm_sha256msg1_epu32(TMSG2_X, TMSG3_X); - TMSG2_Y = _mm_sha256msg1_epu32(TMSG2_Y, TMSG3_Y); - - // Rounds 48-51 - TMP_X = _mm_set_epi64x(0x34B0BCB52748774CULL, 0x1E376C0819A4C116ULL); - MSG_X = _mm_add_epi32(TMSG0_X, TMP_X ); - MSG_Y = _mm_add_epi32(TMSG0_Y, TMP_X ); - STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); - STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); - TMP_X = _mm_alignr_epi8(TMSG0_X, TMSG3_X, 4); - TMP_Y = _mm_alignr_epi8(TMSG0_Y, TMSG3_Y, 4); - TMSG1_X = _mm_add_epi32(TMSG1_X, TMP_X); - TMSG1_Y = _mm_add_epi32(TMSG1_Y, TMP_Y); - TMSG1_X = _mm_sha256msg2_epu32(TMSG1_X, TMSG0_X); - TMSG1_Y = _mm_sha256msg2_epu32(TMSG1_Y, TMSG0_Y); - MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); - MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); - STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); - STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); - TMSG3_X = _mm_sha256msg1_epu32(TMSG3_X, TMSG0_X); - TMSG3_Y = _mm_sha256msg1_epu32(TMSG3_Y, TMSG0_Y); - - // Rounds 52-55 - TMP_X = _mm_set_epi64x(0x682E6FF35B9CCA4FULL, 0x4ED8AA4A391C0CB3ULL); - MSG_X = _mm_add_epi32(TMSG1_X, TMP_X ); - MSG_Y = _mm_add_epi32(TMSG1_Y, TMP_X ); - STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); - STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); - TMP_X = _mm_alignr_epi8(TMSG1_X, TMSG0_X, 4); - TMP_Y = _mm_alignr_epi8(TMSG1_Y, TMSG0_Y, 4); - TMSG2_X = _mm_add_epi32(TMSG2_X, TMP_X); - TMSG2_Y = _mm_add_epi32(TMSG2_Y, TMP_Y); - TMSG2_X = _mm_sha256msg2_epu32(TMSG2_X, TMSG1_X); - TMSG2_Y = _mm_sha256msg2_epu32(TMSG2_Y, TMSG1_Y); - MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); - MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); - STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); - STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); - - // Rounds 56-59 - TMP_X = _mm_set_epi64x(0x8CC7020884C87814ULL, 0x78A5636F748F82EEULL); - MSG_X = _mm_add_epi32(TMSG2_X, TMP_X); - MSG_Y = _mm_add_epi32(TMSG2_Y, TMP_X); - STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); - STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); - TMP_X = _mm_alignr_epi8(TMSG2_X, TMSG1_X, 4); - TMP_Y = _mm_alignr_epi8(TMSG2_Y, TMSG1_Y, 4); - TMSG3_X = _mm_add_epi32(TMSG3_X, TMP_X); - TMSG3_Y = _mm_add_epi32(TMSG3_Y, TMP_Y); - TMSG3_X = _mm_sha256msg2_epu32(TMSG3_X, TMSG2_X); - TMSG3_Y = _mm_sha256msg2_epu32(TMSG3_Y, TMSG2_Y); - MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); - MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); - STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); - STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); - - // Rounds 60-63 - TMP_X = _mm_set_epi64x(0xC67178F2BEF9A3F7ULL, 0xA4506CEB90BEFFFAULL); - MSG_X = _mm_add_epi32(TMSG3_X, TMP_X); - MSG_Y = _mm_add_epi32(TMSG3_Y, TMP_X); - STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); - STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); - MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); - MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); - STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); - STATE0_Y = 
_mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); - - // Add values back to state - STATE0_X = _mm_add_epi32(STATE0_X, ABEF_SAVE_X); - STATE1_X = _mm_add_epi32(STATE1_X, CDGH_SAVE_X); - STATE0_Y = _mm_add_epi32(STATE0_Y, ABEF_SAVE_Y); - STATE1_Y = _mm_add_epi32(STATE1_Y, CDGH_SAVE_Y); - - TMP_X = _mm_shuffle_epi32(STATE0_X, 0x1B); // FEBA - TMP_Y = _mm_shuffle_epi32(STATE0_Y, 0x1B); // FEBA - STATE1_X = _mm_shuffle_epi32(STATE1_X, 0xB1); // DCHG - STATE1_Y = _mm_shuffle_epi32(STATE1_Y, 0xB1); // DCHG - STATE0_X = _mm_blend_epi16(TMP_X, STATE1_X, 0xF0); // DCBA - STATE0_Y = _mm_blend_epi16(TMP_Y, STATE1_Y, 0xF0); // DCBA - STATE1_X = _mm_alignr_epi8(STATE1_X, TMP_X, 8); // ABEF - STATE1_Y = _mm_alignr_epi8(STATE1_Y, TMP_Y, 8); // ABEF - - // Save state - _mm_store_si128((__m128i*) &out_X[0], STATE0_X); - _mm_store_si128((__m128i*) &out_X[4], STATE1_X); - _mm_store_si128((__m128i*) &out_Y[0], STATE0_Y); - _mm_store_si128((__m128i*) &out_Y[4], STATE1_Y); -} - - -#endif - diff --git a/algo/sha/sha256-hash-4way.c b/algo/sha/sha256-hash-4way.c index 549b2b7c..1f2d4e9d 100644 --- a/algo/sha/sha256-hash-4way.c +++ b/algo/sha/sha256-hash-4way.c @@ -3,7 +3,7 @@ #include #include -#include "sha-hash-4way.h" +#include "sha256-hash.h" #include "compat.h" /* @@ -610,6 +610,16 @@ do { \ SHA256_8WAY_ROUND( C, D, E, F, G, H, A, B, 14, j ); \ SHA256_8WAY_ROUND( B, C, D, E, F, G, H, A, 15, j ); +// Not used with AVX512, needed to satisfy the compiler +#define SHA256_8WAY_ROUND_NOMSG( A, B, C, D, E, F, G, H, i, j ) \ +{ \ + __m256i T1 = mm256_add4_32( H, BSG2_1x(E), CHx(E, F, G), \ + _mm256_set1_epi32( K256[(i)+(j)] ) ); \ + __m256i T2 = _mm256_add_epi32( BSG2_0x(A), MAJx(A, B, C) ); \ + D = _mm256_add_epi32( D, T1 ); \ + H = _mm256_add_epi32( T1, T2 ); \ +} + #else // AVX2 #define CHx(X, Y, Z) \ @@ -621,6 +631,16 @@ do { \ _mm256_xor_si256( Y, _mm256_and_si256( X_xor_Y = _mm256_xor_si256( X, Y ), \ Y_xor_Z ) ) +#define SHA256_8WAY_ROUND_NOMSG( A, B, C, D, E, F, G, H, i, j ) \ +{ \ + __m256i T1 = mm256_add4_32( H, BSG2_1x(E), CHx(E, F, G), \ + _mm256_set1_epi32( K256[(i)+(j)] ) ); \ + __m256i T2 = _mm256_add_epi32( BSG2_0x(A), MAJx(A, B, C) ); \ + Y_xor_Z = X_xor_Y; \ + D = _mm256_add_epi32( D, T1 ); \ + H = _mm256_add_epi32( T1, T2 ); \ +} + #define SHA256_8WAY_ROUND( A, B, C, D, E, F, G, H, i, j ) \ do { \ __m256i T0 = _mm256_add_epi32( _mm256_set1_epi32( K256[(j)+(i)] ), W[i] ); \ @@ -635,7 +655,6 @@ do { \ H = _mm256_add_epi32( T1, T2 ); \ } while (0) - // read Y_xor_Z, update X_xor_Y #define MAJ_2step(X, Y, Z, X_xor_Y, Y_xor_Z ) \ _mm256_xor_si256( Y, _mm256_and_si256( X_xor_Y = _mm256_xor_si256( X, Y ), \ @@ -769,7 +788,7 @@ void sha256_8way_prehash_3rounds( __m256i *state_mid, __m256i *X, SHA256_8WAY_ROUND( H, A, B, C, D, E, F, G, 1, 0 ); SHA256_8WAY_ROUND( G, H, A, B, C, D, E, F, 2, 0 ); - // round 3 part 1, ignore nonces W[3] + // round 3 part 1, avoid nonces W[3] T1 = mm256_add4_32( E, BSG2_1x(B), CHx(B, C, D), _mm256_set1_epi32( K256[3] ) ); A = _mm256_add_epi32( A, T1 ); @@ -807,23 +826,22 @@ void sha256_8way_final_rounds( __m256i *state_out, const __m256i *data, __m256i X_xor_Y, Y_xor_Z = _mm256_xor_si256( F, G ); #endif - // round 3 part 2, inject nonces + // round 3 part 2, add nonces A = _mm256_add_epi32( A, W[3] ); E = _mm256_add_epi32( E, W[3] ); -// SHA256_8WAY_ROUND( F, G, H, A, B, C, D, E, 3, 0 ); - SHA256_8WAY_ROUND( E, F, G, H, A, B, C, D, 4, 0 ); - SHA256_8WAY_ROUND( D, E, F, G, H, A, B, C, 5, 0 ); - SHA256_8WAY_ROUND( C, D, E, F, G, H, A, B, 6, 0 ); - SHA256_8WAY_ROUND( B, C, D, E, F, 
G, H, A, 7, 0 ); - SHA256_8WAY_ROUND( A, B, C, D, E, F, G, H, 8, 0 ); - SHA256_8WAY_ROUND( H, A, B, C, D, E, F, G, 9, 0 ); - SHA256_8WAY_ROUND( G, H, A, B, C, D, E, F, 10, 0 ); - SHA256_8WAY_ROUND( F, G, H, A, B, C, D, E, 11, 0 ); - SHA256_8WAY_ROUND( E, F, G, H, A, B, C, D, 12, 0 ); - SHA256_8WAY_ROUND( D, E, F, G, H, A, B, C, 13, 0 ); - SHA256_8WAY_ROUND( C, D, E, F, G, H, A, B, 14, 0 ); - SHA256_8WAY_ROUND( B, C, D, E, F, G, H, A, 15, 0 ); + SHA256_8WAY_ROUND( E, F, G, H, A, B, C, D, 4, 0 ); + SHA256_8WAY_ROUND_NOMSG( D, E, F, G, H, A, B, C, 5, 0 ); + SHA256_8WAY_ROUND_NOMSG( C, D, E, F, G, H, A, B, 6, 0 ); + SHA256_8WAY_ROUND_NOMSG( B, C, D, E, F, G, H, A, 7, 0 ); + SHA256_8WAY_ROUND_NOMSG( A, B, C, D, E, F, G, H, 8, 0 ); + SHA256_8WAY_ROUND_NOMSG( H, A, B, C, D, E, F, G, 9, 0 ); + SHA256_8WAY_ROUND_NOMSG( G, H, A, B, C, D, E, F, 10, 0 ); + SHA256_8WAY_ROUND_NOMSG( F, G, H, A, B, C, D, E, 11, 0 ); + SHA256_8WAY_ROUND_NOMSG( E, F, G, H, A, B, C, D, 12, 0 ); + SHA256_8WAY_ROUND_NOMSG( D, E, F, G, H, A, B, C, 13, 0 ); + SHA256_8WAY_ROUND_NOMSG( C, D, E, F, G, H, A, B, 14, 0 ); + SHA256_8WAY_ROUND( B, C, D, E, F, G, H, A, 15, 0 ); W[ 0] = X[ 0]; W[ 1] = X[ 1]; @@ -865,6 +883,7 @@ void sha256_8way_final_rounds( __m256i *state_out, const __m256i *data, W[15] = SHA256_8WAY_MEXP( W[13], W[ 8], W[ 0], W[15] ); SHA256_8WAY_16ROUNDS( A, B, C, D, E, F, G, H, 32 ); + SHA256_8WAY_MEXP_16ROUNDS( W ); SHA256_8WAY_16ROUNDS( A, B, C, D, E, F, G, H, 48 ); @@ -887,8 +906,6 @@ void sha256_8way_final_rounds( __m256i *state_out, const __m256i *data, _mm256_store_si256( state_out + 7, H ); } - -// It's working with a high hit rate but performance is lower int sha256_8way_transform_le_short( __m256i *state_out, const __m256i *data, const __m256i *state_in, const uint32_t *target ) { @@ -912,14 +929,37 @@ int sha256_8way_transform_le_short( __m256i *state_out, const __m256i *data, const __m256i IV7 = H; const __m256i IV6 = G; - SHA256_8WAY_16ROUNDS( A, B, C, D, E, F, G, H, 0 ); +#if !defined(__AVX512VL__) + __m256i X_xor_Y, Y_xor_Z = _mm256_xor_si256( B, C ); +#endif - for ( int j = 16; j < 48; j += 16 ) - { - SHA256_8WAY_MEXP_16ROUNDS( W ); - SHA256_8WAY_16ROUNDS( A, B, C, D, E, F, G, H, j ); - } + // rounds 0 to 15, ignore zero padding W[9..14] + SHA256_8WAY_ROUND( A, B, C, D, E, F, G, H, 0, 0 ); + SHA256_8WAY_ROUND( H, A, B, C, D, E, F, G, 1, 0 ); + SHA256_8WAY_ROUND( G, H, A, B, C, D, E, F, 2, 0 ); + SHA256_8WAY_ROUND( F, G, H, A, B, C, D, E, 3, 0 ); + SHA256_8WAY_ROUND( E, F, G, H, A, B, C, D, 4, 0 ); + SHA256_8WAY_ROUND( D, E, F, G, H, A, B, C, 5, 0 ); + SHA256_8WAY_ROUND( C, D, E, F, G, H, A, B, 6, 0 ); + SHA256_8WAY_ROUND( B, C, D, E, F, G, H, A, 7, 0 ); + SHA256_8WAY_ROUND( A, B, C, D, E, F, G, H, 8, 0 ); + SHA256_8WAY_ROUND_NOMSG( H, A, B, C, D, E, F, G, 9, 0 ); + SHA256_8WAY_ROUND_NOMSG( G, H, A, B, C, D, E, F, 10, 0 ); + SHA256_8WAY_ROUND_NOMSG( F, G, H, A, B, C, D, E, 11, 0 ); + SHA256_8WAY_ROUND_NOMSG( E, F, G, H, A, B, C, D, 12, 0 ); + SHA256_8WAY_ROUND_NOMSG( D, E, F, G, H, A, B, C, 13, 0 ); + SHA256_8WAY_ROUND_NOMSG( C, D, E, F, G, H, A, B, 14, 0 ); + SHA256_8WAY_ROUND( B, C, D, E, F, G, H, A, 15, 0 ); + + // rounds 16 to 31 + SHA256_8WAY_MEXP_16ROUNDS( W ); + SHA256_8WAY_16ROUNDS( A, B, C, D, E, F, G, H, 16 ); + // rounds 32 to 47 + SHA256_8WAY_MEXP_16ROUNDS( W ); + SHA256_8WAY_16ROUNDS( A, B, C, D, E, F, G, H, 32 ); + + // rounds 48 to 60 mexp W[ 0] = SHA256_8WAY_MEXP( W[14], W[ 9], W[ 1], W[ 0] ); W[ 1] = SHA256_8WAY_MEXP( W[15], W[10], W[ 2], W[ 1] ); W[ 2] = SHA256_8WAY_MEXP( W[ 0], W[11],
W[ 3], W[ 2] ); @@ -935,9 +975,10 @@ int sha256_8way_transform_le_short( __m256i *state_out, const __m256i *data, W[12] = SHA256_8WAY_MEXP( W[10], W[ 5], W[13], W[12] ); #if !defined(__AVX512VL__) - __m256i X_xor_Y, Y_xor_Z = _mm256_xor_si256( B, C ); + Y_xor_Z = _mm256_xor_si256( B, C ); #endif + // rounds 48 to 57 SHA256_8WAY_ROUND( A, B, C, D, E, F, G, H, 0, 48 ); SHA256_8WAY_ROUND( H, A, B, C, D, E, F, G, 1, 48 ); SHA256_8WAY_ROUND( G, H, A, B, C, D, E, F, 2, 48 ); @@ -968,7 +1009,7 @@ int sha256_8way_transform_le_short( __m256i *state_out, const __m256i *data, flip = ( (int)target[7] < 0 ? -1 : 0 ) ^ mm256_movmask_32( hash ); if ( likely( 0xff == ( flip ^ mm256_movmask_32( _mm256_cmpgt_epi32( hash, targ ) ) ) )) - return 0; + return 0; t6_mask = mm256_movmask_32( vmask =_mm256_cmpeq_epi32( hash, targ ) ); @@ -983,28 +1024,29 @@ int sha256_8way_transform_le_short( __m256i *state_out, const __m256i *data, if ( t6_mask ) { - // Testing H inconclusive: hash7 == target7, need to test G + // Testing H was inconclusive: hash7 == target7, need to test G targ = _mm256_and_si256( vmask, _mm256_set1_epi32( target[6] ) ); hash = _mm256_shuffle_epi8( _mm256_add_epi32( G, IV6 ), bswap_shuf ); - if ( unlikely( 0 != ( t6_mask & mm256_movmask_32( + if ( likely( 0 == ( t6_mask & mm256_movmask_32( _mm256_cmpeq_epi32( hash, targ ) ) ) )) - return 0; - else { flip = ( (int)target[6] < 0 ? -1 : 0 ) ^ mm256_movmask_32( hash ); if ( likely( 0 != ( t6_mask & ( flip ^ mm256_movmask_32( _mm256_cmpgt_epi32( hash, targ ) ) ) ) )) return 0; - else if ( likely( target[6] == 0x80000000 )) - { - if ( 0 == ( t6_mask & mm256_movmask_32( _mm256_cmpgt_epi32( - hash, _mm256_xor_si256( hash, hash ) ) ) ) ) - return 0; - } + if ( likely( ( target[6] == 0x80000000 ) + && ( 0 == ( t6_mask & mm256_movmask_32( _mm256_cmpgt_epi32( hash, _mm256_xor_si256( hash, hash ) ) ) ) ) )) + return 0; } +// else inconclusive, testing targ5 isn't practical, finish hashing } +// At this point the hash is either good or the test was inconclusive. +// If the latter, the target difficulty is probably high and the hash is +// nearly equal to it, so it still has a good chance of being good.
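The rewritten short transform trades a handful of redundant rounds for an early out: once word 7 of the would-be hash is fixed, its byte-swapped value is tested against target[7], and only ties fall through to word 6. AVX2 has no unsigned 32-bit compare, so the code derives one from _mm256_cmpgt_epi32: the flip mask folds in the sign bits of hash and target, correcting the lanes where the signed compare disagrees with the unsigned one. A scalar model of both pieces; lane_may_be_valid and unsigned_gt are illustrative names, not in the source:

#include <stdbool.h>
#include <stdint.h>

// A signed compare gives the unsigned answer when the sign bits agree;
// when they differ, the operand with the sign bit set is the larger
// unsigned value, so the signed result must be flipped.
static bool unsigned_gt( uint32_t a, uint32_t b )
{
   bool signed_gt = (int32_t)a > (int32_t)b;
   bool flip = ( a ^ b ) >> 31;        // do the sign bits differ?
   return signed_gt ^ flip;
}

// Per-lane model of the early-exit decision: reject when big-endian
// hash word 7 is above target[7]; on a tie, decide on word 6. A tie
// on word 6 as well is left inconclusive and hashing just finishes.
static bool lane_may_be_valid( uint32_t h7, uint32_t t7,
                               uint32_t h6, uint32_t t6 )
{
   if ( h7 != t7 ) return !unsigned_gt( h7, t7 );
   return !unsigned_gt( h6, t6 );
}

The vector version returns 0 only when every lane is rejected; a single inconclusive lane means the block is finished and the final hash is checked normally.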
+ // rounds 59 to 61 part 2 E = _mm256_add_epi32( T1, _mm256_add_epi32( BSG2_0x( F ), MAJx( F, G, H ) ) ); @@ -1179,6 +1221,15 @@ do { \ H = _mm512_add_epi32( T1, T2 ); \ } while (0) +#define SHA256_16WAY_ROUND_NOMSG( A, B, C, D, E, F, G, H, i, j ) \ +{ \ + __m512i T1 = mm512_add4_32( H, BSG2_1x16(E), CHx16(E, F, G), \ + _mm512_set1_epi32( K256[(i)+(j)] ) ); \ + __m512i T2 = _mm512_add_epi32( BSG2_0x16(A), MAJx16(A, B, C) ); \ + D = _mm512_add_epi32( D, T1 ); \ + H = _mm512_add_epi32( T1, T2 ); \ +} + /* #define SHA256_16WAY_ROUND(A, B, C, D, E, F, G, H, i, j) \ do { \ @@ -1292,7 +1343,7 @@ void sha256_16way_prehash_3rounds( __m512i *state_mid, __m512i *X, SHA256_16WAY_ROUND( H, A, B, C, D, E, F, G, 1, 0 ); SHA256_16WAY_ROUND( G, H, A, B, C, D, E, F, 2, 0 ); - // round 3 part 1, ignore nonces W[3] + // round 3 part 1, avoid nonces W[3] T1 = mm512_add4_32( E, BSG2_1x16(B), CHx16(B, C, D), _mm512_set1_epi32( K256[3] ) ); A = _mm512_add_epi32( A, T1 ); @@ -1312,7 +1363,7 @@ void sha256_16way_prehash_3rounds( __m512i *state_mid, __m512i *X, void sha256_16way_final_rounds( __m512i *state_out, const __m512i *data, const __m512i *state_in, const __m512i *state_mid, const __m512i *X ) { - __m512i A, B, C, D, E, F, G, H, T1, T2; + __m512i A, B, C, D, E, F, G, H; __m512i W[16]; memcpy_512( W, data, 16 ); @@ -1326,87 +1377,25 @@ void sha256_16way_final_rounds( __m512i *state_out, const __m512i *data, G = _mm512_load_si512( state_mid + 6 ); H = _mm512_load_si512( state_mid + 7 ); - // round 3 part 2, inject nonces + // round 3 part 2, add nonces A = _mm512_add_epi32( A, W[3] ); E = _mm512_add_epi32( E, W[3] ); - // round 4 - SHA256_16WAY_ROUND( E, F, G, H, A, B, C, D, 4, 0 ); - - // round 5 - T1 = mm512_add4_32( C, BSG2_1x16(H), CHx16(H, A, B), - _mm512_set1_epi32( K256[5] ) ); - T2 = _mm512_add_epi32( BSG2_0x16(D), MAJx16(D, E, F) ); - G = _mm512_add_epi32( G, T1 ); - C = _mm512_add_epi32( T1, T2 ); - - // round 6 - T1 = mm512_add4_32( B, BSG2_1x16(G), CHx16(G, H, A), - _mm512_set1_epi32( K256[6] ) ); - T2 = _mm512_add_epi32( BSG2_0x16(C), MAJx16(C, D, E) ); - F = _mm512_add_epi32( F, T1 ); - B = _mm512_add_epi32( T1, T2 ); - - // round 7 - T1 = mm512_add4_32( A, BSG2_1x16(F), CHx16(F, G, H), - _mm512_set1_epi32( K256[7] ) ); - T2 = _mm512_add_epi32( BSG2_0x16(B), MAJx16(B, C, D) ); - E = _mm512_add_epi32( E, T1 ); - A = _mm512_add_epi32( T1, T2 ); - - // round 8 - T1 = mm512_add4_32( H, BSG2_1x16(E), CHx16(E, F, G), - _mm512_set1_epi32( K256[8] ) ); - T2 = _mm512_add_epi32( BSG2_0x16(A), MAJx16(A, B, C) ); - D = _mm512_add_epi32( D, T1 ); - H = _mm512_add_epi32( T1, T2 ); - - // round 9 - T1 = mm512_add4_32( G, BSG2_1x16(D), CHx16(D, E, F), - _mm512_set1_epi32( K256[9] ) ); - T2 = _mm512_add_epi32( BSG2_0x16(H), MAJx16(H, A, B) ); - C = _mm512_add_epi32( C, T1 ); - G = _mm512_add_epi32( T1, T2 ); - - // round 10 - T1 = mm512_add4_32( F, BSG2_1x16(C), CHx16(C, D, E), - _mm512_set1_epi32( K256[10] ) ); - T2 = _mm512_add_epi32( BSG2_0x16(G), MAJx16(G, H, A) ); - B = _mm512_add_epi32( B, T1 ); - F = _mm512_add_epi32( T1, T2 ); - - // round 11 - T1 = mm512_add4_32( E, BSG2_1x16(B), CHx16(B, C, D), - _mm512_set1_epi32( K256[11] ) ); - T2 = _mm512_add_epi32( BSG2_0x16(F), MAJx16(F, G, H) ); - A = _mm512_add_epi32( A, T1 ); - E = _mm512_add_epi32( T1, T2 ); - - // round 12 - T1 = mm512_add4_32( D, BSG2_1x16(A), CHx16(A, B, C), - _mm512_set1_epi32( K256[12] ) ); - T2 = _mm512_add_epi32( BSG2_0x16(E), MAJx16(E, F, G) ); - H = _mm512_add_epi32( H, T1 ); - D = _mm512_add_epi32( T1, T2 ); - - // round 13 - T1 = 
mm512_add4_32( C, BSG2_1x16(H), CHx16(H, A, B), - _mm512_set1_epi32( K256[13] ) ); - T2 = _mm512_add_epi32( BSG2_0x16(D), MAJx16(D, E, F) ); - G = _mm512_add_epi32( G, T1 ); - C = _mm512_add_epi32( T1, T2 ); - - // round 14 - T1 = mm512_add4_32( B, BSG2_1x16(G), CHx16(G, H, A), - _mm512_set1_epi32( K256[14] ) ); - T2 = _mm512_add_epi32( BSG2_0x16(C), MAJx16(C, D, E) ); - F = _mm512_add_epi32( F, T1 ); - B = _mm512_add_epi32( T1, T2 ); - - // round 15 - SHA256_16WAY_ROUND( B, C, D, E, F, G, H, A, 15, 0 ); - - // rounds 16 to 31 mexp part 2, inject nonces. + // rounds 4 to 15, ignore zero padding W[5..14] + SHA256_16WAY_ROUND ( E, F, G, H, A, B, C, D, 4, 0 ); + SHA256_16WAY_ROUND_NOMSG( D, E, F, G, H, A, B, C, 5, 0 ); + SHA256_16WAY_ROUND_NOMSG( C, D, E, F, G, H, A, B, 6, 0 ); + SHA256_16WAY_ROUND_NOMSG( B, C, D, E, F, G, H, A, 7, 0 ); + SHA256_16WAY_ROUND_NOMSG( A, B, C, D, E, F, G, H, 8, 0 ); + SHA256_16WAY_ROUND_NOMSG( H, A, B, C, D, E, F, G, 9, 0 ); + SHA256_16WAY_ROUND_NOMSG( G, H, A, B, C, D, E, F, 10, 0 ); + SHA256_16WAY_ROUND_NOMSG( F, G, H, A, B, C, D, E, 11, 0 ); + SHA256_16WAY_ROUND_NOMSG( E, F, G, H, A, B, C, D, 12, 0 ); + SHA256_16WAY_ROUND_NOMSG( D, E, F, G, H, A, B, C, 13, 0 ); + SHA256_16WAY_ROUND_NOMSG( C, D, E, F, G, H, A, B, 14, 0 ); + SHA256_16WAY_ROUND ( B, C, D, E, F, G, H, A, 15, 0 ); + + // rounds 16 to 31 mexp part 2, add nonces. W[ 0] = X[ 0]; W[ 1] = X[ 1]; W[ 2] = _mm512_add_epi32( X[ 2], SSG2_0x16( W[ 3] ) ); @@ -1428,6 +1417,7 @@ void sha256_16way_final_rounds( __m512i *state_out, const __m512i *data, SHA256_16WAY_16ROUNDS( A, B, C, D, E, F, G, H, 16 ); + // rounds 32 to 63 W[ 0] = _mm512_add_epi32( X[ 6], _mm512_add_epi32( SSG2_1x16( W[14] ), W[ 9] ) ); W[ 1] = SHA256_16WAY_MEXP( W[15], W[10], W[ 2], W[ 1] ); @@ -1505,41 +1495,12 @@ int sha256_16way_transform_le_short( __m512i *state_out, const __m512i *data, SHA256_16WAY_ROUND( A, B, C, D, E, F, G, H, 8, 0 ); // rounds 9 to 14, ignore zero padding - T1 = mm512_add4_32( G, BSG2_1x16(D), CHx16(D, E, F), - _mm512_set1_epi32( K256[9] ) ); - T2 = _mm512_add_epi32( BSG2_0x16(H), MAJx16(H, A, B) ); - C = _mm512_add_epi32( C, T1 ); - G = _mm512_add_epi32( T1, T2 ); - - T1 = mm512_add4_32( F, BSG2_1x16(C), CHx16(C, D, E), - _mm512_set1_epi32( K256[10] ) ); - T2 = _mm512_add_epi32( BSG2_0x16(G), MAJx16(G, H, A) ); - B = _mm512_add_epi32( B, T1 ); - F = _mm512_add_epi32( T1, T2 ); - - T1 = mm512_add4_32( E, BSG2_1x16(B), CHx16(B, C, D), - _mm512_set1_epi32( K256[11] ) ); - T2 = _mm512_add_epi32( BSG2_0x16(F), MAJx16(F, G, H) ); - A = _mm512_add_epi32( A, T1 ); - E = _mm512_add_epi32( T1, T2 ); - - T1 = mm512_add4_32( D, BSG2_1x16(A), CHx16(A, B, C), - _mm512_set1_epi32( K256[12] ) ); - T2 = _mm512_add_epi32( BSG2_0x16(E), MAJx16(E, F, G) ); - H = _mm512_add_epi32( H, T1 ); - D = _mm512_add_epi32( T1, T2 ); - - T1 = mm512_add4_32( C, BSG2_1x16(H), CHx16(H, A, B), - _mm512_set1_epi32( K256[13] ) ); - T2 = _mm512_add_epi32( BSG2_0x16(D), MAJx16(D, E, F) ); - G = _mm512_add_epi32( G, T1 ); - C = _mm512_add_epi32( T1, T2 ); - - T1 = mm512_add4_32( B, BSG2_1x16(G), CHx16(G, H, A), - _mm512_set1_epi32( K256[14] ) ); - T2 = _mm512_add_epi32( BSG2_0x16(C), MAJx16(C, D, E) ); - F = _mm512_add_epi32( F, T1 ); - B = _mm512_add_epi32( T1, T2 ); + SHA256_16WAY_ROUND_NOMSG( H, A, B, C, D, E, F, G, 9, 0 ); + SHA256_16WAY_ROUND_NOMSG( G, H, A, B, C, D, E, F, 10, 0 ); + SHA256_16WAY_ROUND_NOMSG( F, G, H, A, B, C, D, E, 11, 0 ); + SHA256_16WAY_ROUND_NOMSG( E, F, G, H, A, B, C, D, 12, 0 ); + SHA256_16WAY_ROUND_NOMSG( D, E, F, G, H, A, B, C, 
13, 0 ); + SHA256_16WAY_ROUND_NOMSG( C, D, E, F, G, H, A, B, 14, 0 ); // round 15 SHA256_16WAY_ROUND( B, C, D, E, F, G, H, A, 15, 0 ); @@ -1575,7 +1536,6 @@ int sha256_16way_transform_le_short( __m512i *state_out, const __m512i *data, // rounds 32 to 47 SHA256_MEXP_16WAY_16ROUNDS( W ); - SHA256_16WAY_16ROUNDS( A, B, C, D, E, F, G, H, 32 ); // rounds 48 to 60 mexp @@ -1640,8 +1600,7 @@ int sha256_16way_transform_le_short( __m512i *state_out, const __m512i *data, { hash = _mm512_shuffle_epi8( _mm512_add_epi32( G, IV6 ), bswap_shuf ); targ = _mm512_set1_epi32( target[6] ); - if ( likely( 0 == _mm512_mask_cmple_epu32_mask( t6_mask, - hash, targ ) )) + if ( likely( 0 == _mm512_mask_cmple_epu32_mask( t6_mask, hash, targ ) )) return 0; } diff --git a/algo/sha/sha256-hash-opt.c b/algo/sha/sha256-hash-opt.c deleted file mode 100644 index e08dd60b..00000000 --- a/algo/sha/sha256-hash-opt.c +++ /dev/null @@ -1,388 +0,0 @@ -/* Intel SHA extensions using C intrinsics */ -/* Written and place in public domain by Jeffrey Walton */ -/* Based on code from Intel, and by Sean Gulley for */ -/* the miTLS project. */ - -// A stripped down version with byte swapping removed. - -#if defined(__SHA__) - -#include "sha256-hash.h" - -void sha256_opt_transform_le( uint32_t *state_out, const void *input, - const uint32_t *state_in ) -{ - __m128i STATE0, STATE1; - __m128i MSG, TMP; - __m128i TMSG0, TMSG1, TMSG2, TMSG3; - __m128i ABEF_SAVE, CDGH_SAVE; - - // Load initial values - TMP = _mm_load_si128((__m128i*) &state_in[0]); - STATE1 = _mm_load_si128((__m128i*) &state_in[4]); -// MASK = _mm_set_epi64x(0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL); - - TMP = _mm_shuffle_epi32(TMP, 0xB1); // CDAB - STATE1 = _mm_shuffle_epi32(STATE1, 0x1B); // EFGH - STATE0 = _mm_alignr_epi8(TMP, STATE1, 8); // ABEF - STATE1 = _mm_blend_epi16(STATE1, TMP, 0xF0); // CDGH - - // Save current hash - ABEF_SAVE = STATE0; - CDGH_SAVE = STATE1; - - // Rounds 0-3 - TMSG0 = _mm_load_si128((const __m128i*) (input+0)); -// TMSG0 = _mm_shuffle_epi8(MSG, MASK); - MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(0xE9B5DBA5B5C0FBCFULL, 0x71374491428A2F98ULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - - // Rounds 4-7 - TMSG1 = _mm_load_si128((const __m128i*) (input+16)); -// TMSG1 = _mm_shuffle_epi8(TMSG1, MASK); - MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(0xAB1C5ED5923F82A4ULL, 0x59F111F13956C25BULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1); - - // Rounds 8-11 - TMSG2 = _mm_load_si128((const __m128i*) (input+32)); -// TMSG2 = _mm_shuffle_epi8(TMSG2, MASK); - MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(0x550C7DC3243185BEULL, 0x12835B01D807AA98ULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2); - - // Rounds 12-15 - TMSG3 = _mm_load_si128((const __m128i*) (input+48)); -// TMSG3 = _mm_shuffle_epi8(TMSG3, MASK); - MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(0xC19BF1749BDC06A7ULL, 0x80DEB1FE72BE5D74ULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4); - TMSG0 = _mm_add_epi32(TMSG0, TMP); - TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, 
STATE1, MSG); - TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3); - - // Rounds 16-19 - MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(0x240CA1CC0FC19DC6ULL, 0xEFBE4786E49B69C1ULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4); - TMSG1 = _mm_add_epi32(TMSG1, TMP); - TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0); - - // Rounds 20-23 - MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(0x76F988DA5CB0A9DCULL, 0x4A7484AA2DE92C6FULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4); - TMSG2 = _mm_add_epi32(TMSG2, TMP); - TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1); - - // Rounds 24-27 - MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(0xBF597FC7B00327C8ULL, 0xA831C66D983E5152ULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4); - TMSG3 = _mm_add_epi32(TMSG3, TMP); - TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2); - - // Rounds 28-31 - MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(0x1429296706CA6351ULL, 0xD5A79147C6E00BF3ULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4); - TMSG0 = _mm_add_epi32(TMSG0, TMP); - TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3); - - // Rounds 32-35 - MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(0x53380D134D2C6DFCULL, 0x2E1B213827B70A85ULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4); - TMSG1 = _mm_add_epi32(TMSG1, TMP); - TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0); - - // Rounds 36-39 - MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(0x92722C8581C2C92EULL, 0x766A0ABB650A7354ULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4); - TMSG2 = _mm_add_epi32(TMSG2, TMP); - TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1); - - // Rounds 40-43 - MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(0xC76C51A3C24B8B70ULL, 0xA81A664BA2BFE8A1ULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4); - TMSG3 = _mm_add_epi32(TMSG3, TMP); - TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2); - - // Rounds 44-47 - MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(0x106AA070F40E3585ULL, 0xD6990624D192E819ULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4); - TMSG0 = _mm_add_epi32(TMSG0, TMP); - TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3); - - // Rounds 48-51 - MSG = 
_mm_add_epi32(TMSG0, _mm_set_epi64x(0x34B0BCB52748774CULL, 0x1E376C0819A4C116ULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4); - TMSG1 = _mm_add_epi32(TMSG1, TMP); - TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0); - - // Rounds 52-55 - MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(0x682E6FF35B9CCA4FULL, 0x4ED8AA4A391C0CB3ULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4); - TMSG2 = _mm_add_epi32(TMSG2, TMP); - TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - - // Rounds 56-59 - MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(0x8CC7020884C87814ULL, 0x78A5636F748F82EEULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4); - TMSG3 = _mm_add_epi32(TMSG3, TMP); - TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - - // Rounds 60-63 - MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(0xC67178F2BEF9A3F7ULL, 0xA4506CEB90BEFFFAULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - - // Add values back to state - STATE0 = _mm_add_epi32(STATE0, ABEF_SAVE); - STATE1 = _mm_add_epi32(STATE1, CDGH_SAVE); - - TMP = _mm_shuffle_epi32(STATE0, 0x1B); // FEBA - STATE1 = _mm_shuffle_epi32(STATE1, 0xB1); // DCHG - STATE0 = _mm_blend_epi16(TMP, STATE1, 0xF0); // DCBA - STATE1 = _mm_alignr_epi8(STATE1, TMP, 8); // ABEF - - // Save state - _mm_store_si128((__m128i*) &state_out[0], STATE0); - _mm_store_si128((__m128i*) &state_out[4], STATE1); -} - - -void sha256_opt_transform_be( uint32_t *state_out, const void *input, - const uint32_t *state_in ) -{ - __m128i STATE0, STATE1; - __m128i MSG, TMP, MASK; - __m128i TMSG0, TMSG1, TMSG2, TMSG3; - __m128i ABEF_SAVE, CDGH_SAVE; - - // Load initial values - TMP = _mm_load_si128((__m128i*) &state_in[0]); - STATE1 = _mm_load_si128((__m128i*) &state_in[4]); - MASK = _mm_set_epi64x(0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL); - - TMP = _mm_shuffle_epi32(TMP, 0xB1); // CDAB - STATE1 = _mm_shuffle_epi32(STATE1, 0x1B); // EFGH - STATE0 = _mm_alignr_epi8(TMP, STATE1, 8); // ABEF - STATE1 = _mm_blend_epi16(STATE1, TMP, 0xF0); // CDGH - - // Save current hash - ABEF_SAVE = STATE0; - CDGH_SAVE = STATE1; - - // Rounds 0-3 - TMSG0 = _mm_load_si128((const __m128i*) (input+0)); - TMSG0 = _mm_shuffle_epi8( TMSG0, MASK ); - MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(0xE9B5DBA5B5C0FBCFULL, 0x71374491428A2F98ULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - - // Rounds 4-7 - TMSG1 = _mm_load_si128((const __m128i*) (input+16)); - TMSG1 = _mm_shuffle_epi8(TMSG1, MASK); - MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(0xAB1C5ED5923F82A4ULL, 0x59F111F13956C25BULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1); - // Rounds 8-11 - TMSG2 = _mm_load_si128((const __m128i*) (input+32)); - TMSG2 = _mm_shuffle_epi8(TMSG2, MASK); - MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(0x550C7DC3243185BEULL, 
0x12835B01D807AA98ULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2); - - // Rounds 12-15 - TMSG3 = _mm_load_si128((const __m128i*) (input+48)); - TMSG3 = _mm_shuffle_epi8(TMSG3, MASK); - MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(0xC19BF1749BDC06A7ULL, 0x80DEB1FE72BE5D74ULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4); - TMSG0 = _mm_add_epi32(TMSG0, TMP); - TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3); - - // Rounds 16-19 - MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(0x240CA1CC0FC19DC6ULL, 0xEFBE4786E49B69C1ULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4); - TMSG1 = _mm_add_epi32(TMSG1, TMP); - TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0); - - // Rounds 20-23 - MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(0x76F988DA5CB0A9DCULL, 0x4A7484AA2DE92C6FULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4); - TMSG2 = _mm_add_epi32(TMSG2, TMP); - TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1); - - // Rounds 24-27 - MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(0xBF597FC7B00327C8ULL, 0xA831C66D983E5152ULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4); - TMSG3 = _mm_add_epi32(TMSG3, TMP); - TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2); - - // Rounds 28-31 - MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(0x1429296706CA6351ULL, 0xD5A79147C6E00BF3ULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4); - TMSG0 = _mm_add_epi32(TMSG0, TMP); - TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3); - - // Rounds 32-35 - MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(0x53380D134D2C6DFCULL, 0x2E1B213827B70A85ULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4); - TMSG1 = _mm_add_epi32(TMSG1, TMP); - TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0); - - // Rounds 36-39 - MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(0x92722C8581C2C92EULL, 0x766A0ABB650A7354ULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4); - TMSG2 = _mm_add_epi32(TMSG2, TMP); - TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1); - - // Rounds 40-43 - MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(0xC76C51A3C24B8B70ULL, 0xA81A664BA2BFE8A1ULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4); - TMSG3 = 
_mm_add_epi32(TMSG3, TMP); - TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2); - - // Rounds 44-47 - MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(0x106AA070F40E3585ULL, 0xD6990624D192E819ULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4); - TMSG0 = _mm_add_epi32(TMSG0, TMP); - TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3); - - // Rounds 48-51 - MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(0x34B0BCB52748774CULL, 0x1E376C0819A4C116ULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4); - TMSG1 = _mm_add_epi32(TMSG1, TMP); - TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0); - - // Rounds 52-55 - MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(0x682E6FF35B9CCA4FULL, 0x4ED8AA4A391C0CB3ULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4); - TMSG2 = _mm_add_epi32(TMSG2, TMP); - TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - - // Rounds 56-59 - MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(0x8CC7020884C87814ULL, 0x78A5636F748F82EEULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4); - TMSG3 = _mm_add_epi32(TMSG3, TMP); - TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - - // Rounds 60-63 - MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(0xC67178F2BEF9A3F7ULL, 0xA4506CEB90BEFFFAULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - - // Add values back to state - STATE0 = _mm_add_epi32(STATE0, ABEF_SAVE); - STATE1 = _mm_add_epi32(STATE1, CDGH_SAVE); - - TMP = _mm_shuffle_epi32(STATE0, 0x1B); // FEBA - STATE1 = _mm_shuffle_epi32(STATE1, 0xB1); // DCHG - STATE0 = _mm_blend_epi16(TMP, STATE1, 0xF0); // DCBA - STATE1 = _mm_alignr_epi8(STATE1, TMP, 8); // ABEF - - // Save state - _mm_store_si128((__m128i*) &state_out[0], STATE0); - _mm_store_si128((__m128i*) &state_out[4], STATE1); -} - -#endif diff --git a/algo/sha/sha256-hash.c b/algo/sha/sha256-hash.c index 9d81b20e..823ee72f 100644 --- a/algo/sha/sha256-hash.c +++ b/algo/sha/sha256-hash.c @@ -6,15 +6,1385 @@ static const uint32_t SHA256_IV[8] = 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19 }; -/* -static const uint8_t SHA256_PAD[64] = +#if defined(__SHA__) + +void sha256_opt_transform_le( uint32_t *state_out, const void *input, + const uint32_t *state_in ) { - 0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 -}; -*/ + __m128i STATE0, STATE1; + __m128i MSG, TMP; + __m128i TMSG0, TMSG1, TMSG2, TMSG3; + __m128i ABEF_SAVE, CDGH_SAVE; + + // Load initial values + TMP = _mm_load_si128((__m128i*) &state_in[0]); + STATE1 = _mm_load_si128((__m128i*) &state_in[4]); + + TMP = _mm_shuffle_epi32(TMP, 0xB1); // CDAB + STATE1 = _mm_shuffle_epi32(STATE1, 0x1B); // EFGH 
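+ // sha256rnds2 operates on the state packed as ABEF & CDGH halves rather + // than the canonical ABCD & EFGH, hence these shuffles; the inverse + // shuffles restore the standard order before the state is stored.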
+ STATE0 = _mm_alignr_epi8(TMP, STATE1, 8); // ABEF + STATE1 = _mm_blend_epi16(STATE1, TMP, 0xF0); // CDGH + + // Save current hash + ABEF_SAVE = STATE0; + CDGH_SAVE = STATE1; + + // Rounds 0-3 + TMSG0 = _mm_load_si128((const __m128i*) (input+0)); + MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(0xE9B5DBA5B5C0FBCFULL, 0x71374491428A2F98ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + + // Rounds 4-7 + TMSG1 = _mm_load_si128((const __m128i*) (input+16)); + MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(0xAB1C5ED5923F82A4ULL, 0x59F111F13956C25BULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1); + + + // Rounds 8-11 + TMSG2 = _mm_load_si128((const __m128i*) (input+32)); + MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(0x550C7DC3243185BEULL, 0x12835B01D807AA98ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2); + + // Rounds 12-15 + TMSG3 = _mm_load_si128((const __m128i*) (input+48)); + MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(0xC19BF1749BDC06A7ULL, 0x80DEB1FE72BE5D74ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4); + TMSG0 = _mm_add_epi32(TMSG0, TMP); + TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3); + + // Rounds 16-19 + MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(0x240CA1CC0FC19DC6ULL, 0xEFBE4786E49B69C1ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4); + TMSG1 = _mm_add_epi32(TMSG1, TMP); + TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0); + + // Rounds 20-23 + MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(0x76F988DA5CB0A9DCULL, 0x4A7484AA2DE92C6FULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4); + TMSG2 = _mm_add_epi32(TMSG2, TMP); + TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1); + + // Rounds 24-27 + MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(0xBF597FC7B00327C8ULL, 0xA831C66D983E5152ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4); + TMSG3 = _mm_add_epi32(TMSG3, TMP); + TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2); + + // Rounds 28-31 + MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(0x1429296706CA6351ULL, 0xD5A79147C6E00BF3ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4); + TMSG0 = _mm_add_epi32(TMSG0, TMP); + TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3); + + // Rounds 32-35 + MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(0x53380D134D2C6DFCULL, 0x2E1B213827B70A85ULL)); + STATE1 = 
_mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4); + TMSG1 = _mm_add_epi32(TMSG1, TMP); + TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0); + + // Rounds 36-39 + MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(0x92722C8581C2C92EULL, 0x766A0ABB650A7354ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4); + TMSG2 = _mm_add_epi32(TMSG2, TMP); + TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1); + + // Rounds 40-43 + MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(0xC76C51A3C24B8B70ULL, 0xA81A664BA2BFE8A1ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4); + TMSG3 = _mm_add_epi32(TMSG3, TMP); + TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2); + + // Rounds 44-47 + MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(0x106AA070F40E3585ULL, 0xD6990624D192E819ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4); + TMSG0 = _mm_add_epi32(TMSG0, TMP); + TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3); + + // Rounds 48-51 + MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(0x34B0BCB52748774CULL, 0x1E376C0819A4C116ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4); + TMSG1 = _mm_add_epi32(TMSG1, TMP); + TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0); + + // Rounds 52-55 + MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(0x682E6FF35B9CCA4FULL, 0x4ED8AA4A391C0CB3ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4); + TMSG2 = _mm_add_epi32(TMSG2, TMP); + TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + + // Rounds 56-59 + MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(0x8CC7020884C87814ULL, 0x78A5636F748F82EEULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4); + TMSG3 = _mm_add_epi32(TMSG3, TMP); + TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + + // Rounds 60-63 + MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(0xC67178F2BEF9A3F7ULL, 0xA4506CEB90BEFFFAULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + + // Add values back to state + STATE0 = _mm_add_epi32(STATE0, ABEF_SAVE); + STATE1 = _mm_add_epi32(STATE1, CDGH_SAVE); + + TMP = _mm_shuffle_epi32(STATE0, 0x1B); // FEBA + STATE1 = _mm_shuffle_epi32(STATE1, 0xB1); // DCHG + STATE0 = _mm_blend_epi16(TMP, STATE1, 0xF0); // DCBA + STATE1 = _mm_alignr_epi8(STATE1, TMP, 8); // ABEF + + // Save state + _mm_store_si128((__m128i*) &state_out[0], STATE0); + _mm_store_si128((__m128i*) &state_out[4], STATE1); +} + + +void 
sha256_opt_transform_be( uint32_t *state_out, const void *input, + const uint32_t *state_in ) +{ + __m128i STATE0, STATE1; + __m128i MSG, TMP, MASK; + __m128i TMSG0, TMSG1, TMSG2, TMSG3; + __m128i ABEF_SAVE, CDGH_SAVE; + + // Load initial values + TMP = _mm_load_si128((__m128i*) &state_in[0]); + STATE1 = _mm_load_si128((__m128i*) &state_in[4]); + MASK = _mm_set_epi64x(0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL); + + TMP = _mm_shuffle_epi32(TMP, 0xB1); // CDAB + STATE1 = _mm_shuffle_epi32(STATE1, 0x1B); // EFGH + STATE0 = _mm_alignr_epi8(TMP, STATE1, 8); // ABEF + STATE1 = _mm_blend_epi16(STATE1, TMP, 0xF0); // CDGH + + // Save current hash + ABEF_SAVE = STATE0; + CDGH_SAVE = STATE1; + + // Rounds 0-3 + TMSG0 = _mm_load_si128((const __m128i*) (input+0)); + TMSG0 = _mm_shuffle_epi8( TMSG0, MASK ); + MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(0xE9B5DBA5B5C0FBCFULL, 0x71374491428A2F98ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + + // Rounds 4-7 + TMSG1 = _mm_load_si128((const __m128i*) (input+16)); + TMSG1 = _mm_shuffle_epi8(TMSG1, MASK); + MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(0xAB1C5ED5923F82A4ULL, 0x59F111F13956C25BULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1); + // Rounds 8-11 + TMSG2 = _mm_load_si128((const __m128i*) (input+32)); + TMSG2 = _mm_shuffle_epi8(TMSG2, MASK); + MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(0x550C7DC3243185BEULL, 0x12835B01D807AA98ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2); + + // Rounds 12-15 + TMSG3 = _mm_load_si128((const __m128i*) (input+48)); + TMSG3 = _mm_shuffle_epi8(TMSG3, MASK); + MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(0xC19BF1749BDC06A7ULL, 0x80DEB1FE72BE5D74ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4); + TMSG0 = _mm_add_epi32(TMSG0, TMP); + TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3); + + // Rounds 16-19 + MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(0x240CA1CC0FC19DC6ULL, 0xEFBE4786E49B69C1ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4); + TMSG1 = _mm_add_epi32(TMSG1, TMP); + TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0); + + // Rounds 20-23 + MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(0x76F988DA5CB0A9DCULL, 0x4A7484AA2DE92C6FULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4); + TMSG2 = _mm_add_epi32(TMSG2, TMP); + TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1); + + // Rounds 24-27 + MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(0xBF597FC7B00327C8ULL, 0xA831C66D983E5152ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4); + TMSG3 = _mm_add_epi32(TMSG3, TMP); + TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2); + MSG = 
_mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2); + + // Rounds 28-31 + MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(0x1429296706CA6351ULL, 0xD5A79147C6E00BF3ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4); + TMSG0 = _mm_add_epi32(TMSG0, TMP); + TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3); + + // Rounds 32-35 + MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(0x53380D134D2C6DFCULL, 0x2E1B213827B70A85ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4); + TMSG1 = _mm_add_epi32(TMSG1, TMP); + TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0); + + // Rounds 36-39 + MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(0x92722C8581C2C92EULL, 0x766A0ABB650A7354ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4); + TMSG2 = _mm_add_epi32(TMSG2, TMP); + TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1); + + // Rounds 40-43 + MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(0xC76C51A3C24B8B70ULL, 0xA81A664BA2BFE8A1ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4); + TMSG3 = _mm_add_epi32(TMSG3, TMP); + TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2); + + // Rounds 44-47 + MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(0x106AA070F40E3585ULL, 0xD6990624D192E819ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4); + TMSG0 = _mm_add_epi32(TMSG0, TMP); + TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3); + + // Rounds 48-51 + MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(0x34B0BCB52748774CULL, 0x1E376C0819A4C116ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4); + TMSG1 = _mm_add_epi32(TMSG1, TMP); + TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0); + + // Rounds 52-55 + MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(0x682E6FF35B9CCA4FULL, 0x4ED8AA4A391C0CB3ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4); + TMSG2 = _mm_add_epi32(TMSG2, TMP); + TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + + // Rounds 56-59 + MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(0x8CC7020884C87814ULL, 0x78A5636F748F82EEULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4); + TMSG3 = _mm_add_epi32(TMSG3, TMP); + TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + + // Rounds 60-63 + MSG = _mm_add_epi32(TMSG3, 
_mm_set_epi64x(0xC67178F2BEF9A3F7ULL, 0xA4506CEB90BEFFFAULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + + // Add values back to state + STATE0 = _mm_add_epi32(STATE0, ABEF_SAVE); + STATE1 = _mm_add_epi32(STATE1, CDGH_SAVE); + + TMP = _mm_shuffle_epi32(STATE0, 0x1B); // FEBA + STATE1 = _mm_shuffle_epi32(STATE1, 0xB1); // DCHG + STATE0 = _mm_blend_epi16(TMP, STATE1, 0xF0); // DCBA + STATE1 = _mm_alignr_epi8(STATE1, TMP, 8); // ABEF + + // Save state + _mm_store_si128((__m128i*) &state_out[0], STATE0); + _mm_store_si128((__m128i*) &state_out[4], STATE1); +} + +// 2 way double buffered + +void sha256_ni2way_transform_le( uint32_t *out_X, uint32_t *out_Y, + const void *msg_X, const void *msg_Y, + const uint32_t *in_X, const uint32_t *in_Y ) +{ + __m128i STATE0_X, STATE1_X, STATE0_Y, STATE1_Y; + __m128i MSG_X, MSG_Y, TMP_X, TMP_Y; + __m128i TMSG0_X, TMSG1_X, TMSG2_X, TMSG3_X; + __m128i TMSG0_Y, TMSG1_Y, TMSG2_Y, TMSG3_Y; + __m128i ABEF_SAVE_X, CDGH_SAVE_X, ABEF_SAVE_Y, CDGH_SAVE_Y; + + // Load initial values + TMP_X = _mm_load_si128((__m128i*) &in_X[0]); + STATE1_X = _mm_load_si128((__m128i*) &in_X[4]); + TMP_Y = _mm_load_si128((__m128i*) &in_Y[0]); + STATE1_Y = _mm_load_si128((__m128i*) &in_Y[4]); + + TMP_X = _mm_shuffle_epi32(TMP_X, 0xB1); // CDAB + TMP_Y = _mm_shuffle_epi32(TMP_Y, 0xB1); // CDAB + STATE1_X = _mm_shuffle_epi32(STATE1_X, 0x1B); // EFGH + STATE1_Y = _mm_shuffle_epi32(STATE1_Y, 0x1B); // EFGH + STATE0_X = _mm_alignr_epi8(TMP_X, STATE1_X, 8); // ABEF + STATE0_Y = _mm_alignr_epi8(TMP_Y, STATE1_Y, 8); // ABEF + STATE1_X = _mm_blend_epi16(STATE1_X, TMP_X, 0xF0); // CDGH + STATE1_Y = _mm_blend_epi16(STATE1_Y, TMP_Y, 0xF0); // CDGH + + // Save current hash + ABEF_SAVE_X = STATE0_X; + ABEF_SAVE_Y = STATE0_Y; + CDGH_SAVE_X = STATE1_X; + CDGH_SAVE_Y = STATE1_Y; + + // Rounds 0-3 + TMSG0_X = _mm_load_si128((const __m128i*) (msg_X)); + TMSG0_Y = _mm_load_si128((const __m128i*) (msg_Y)); + TMP_X = _mm_set_epi64x(0xE9B5DBA5B5C0FBCFULL, 0x71374491428A2F98ULL); + MSG_X = _mm_add_epi32( TMSG0_X, TMP_X ); + MSG_Y = _mm_add_epi32( TMSG0_Y, TMP_X ); + STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); + STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); + MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); + MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); + STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); + STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); + + // Rounds 4-7 + TMSG1_X = _mm_load_si128((const __m128i*) (msg_X+16)); + TMSG1_Y = _mm_load_si128((const __m128i*) (msg_Y+16)); + TMP_X = _mm_set_epi64x(0xAB1C5ED5923F82A4ULL, 0x59F111F13956C25BULL); + MSG_X = _mm_add_epi32(TMSG1_X, TMP_X ); + MSG_Y = _mm_add_epi32(TMSG1_Y, TMP_X ); + STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); + STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); + MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); + MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); + STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); + STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); + TMSG0_X = _mm_sha256msg1_epu32(TMSG0_X, TMSG1_X); + TMSG0_Y = _mm_sha256msg1_epu32(TMSG0_Y, TMSG1_Y); + + // Rounds 8-11 + TMSG2_X = _mm_load_si128((const __m128i*) (msg_X+32)); + TMSG2_Y = _mm_load_si128((const __m128i*) (msg_Y+32)); + TMP_X = _mm_set_epi64x(0x550C7DC3243185BEULL, 0x12835B01D807AA98ULL); + MSG_X = _mm_add_epi32(TMSG2_X, TMP_X ); + MSG_Y = _mm_add_epi32(TMSG2_Y, TMP_X ); + STATE1_X =
_mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); + STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); + MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); + MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); + STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); + STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); + TMSG1_X = _mm_sha256msg1_epu32(TMSG1_X, TMSG2_X); + TMSG1_Y = _mm_sha256msg1_epu32(TMSG1_Y, TMSG2_Y); + + // Rounds 12-15 + TMSG3_X = _mm_load_si128((const __m128i*) (msg_X+48)); + TMSG3_Y = _mm_load_si128((const __m128i*) (msg_Y+48)); + TMP_X = _mm_set_epi64x(0xC19BF1749BDC06A7ULL, 0x80DEB1FE72BE5D74ULL); + MSG_X = _mm_add_epi32(TMSG3_X, TMP_X ); + MSG_Y = _mm_add_epi32(TMSG3_Y, TMP_X ); + STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); + STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); + TMP_X = _mm_alignr_epi8(TMSG3_X, TMSG2_X, 4); + TMP_Y = _mm_alignr_epi8(TMSG3_Y, TMSG2_Y, 4); + TMSG0_X = _mm_add_epi32(TMSG0_X, TMP_X); + TMSG0_Y = _mm_add_epi32(TMSG0_Y, TMP_Y); + TMSG0_X = _mm_sha256msg2_epu32(TMSG0_X, TMSG3_X); + TMSG0_Y = _mm_sha256msg2_epu32(TMSG0_Y, TMSG3_Y); + MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); + MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); + STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); + STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); + TMSG2_X = _mm_sha256msg1_epu32(TMSG2_X, TMSG3_X); + TMSG2_Y = _mm_sha256msg1_epu32(TMSG2_Y, TMSG3_Y); + + // Rounds 16-19 + TMP_X = _mm_set_epi64x(0x240CA1CC0FC19DC6ULL, 0xEFBE4786E49B69C1ULL); + MSG_X = _mm_add_epi32(TMSG0_X, TMP_X ); + MSG_Y = _mm_add_epi32(TMSG0_Y, TMP_X ); + STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); + STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); + TMP_X = _mm_alignr_epi8(TMSG0_X, TMSG3_X, 4); + TMP_Y = _mm_alignr_epi8(TMSG0_Y, TMSG3_Y, 4); + TMSG1_X = _mm_add_epi32(TMSG1_X, TMP_X); + TMSG1_Y = _mm_add_epi32(TMSG1_Y, TMP_Y); + TMSG1_X = _mm_sha256msg2_epu32(TMSG1_X, TMSG0_X); + TMSG1_Y = _mm_sha256msg2_epu32(TMSG1_Y, TMSG0_Y); + MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); + MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); + STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); + STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); + TMSG3_X = _mm_sha256msg1_epu32(TMSG3_X, TMSG0_X); + TMSG3_Y = _mm_sha256msg1_epu32(TMSG3_Y, TMSG0_Y); + + // Rounds 20-23 + TMP_X = _mm_set_epi64x(0x76F988DA5CB0A9DCULL, 0x4A7484AA2DE92C6FULL); + MSG_X = _mm_add_epi32(TMSG1_X, TMP_X ); + MSG_Y = _mm_add_epi32(TMSG1_Y, TMP_X ); + STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); + STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); + TMP_X = _mm_alignr_epi8(TMSG1_X, TMSG0_X, 4); + TMP_Y = _mm_alignr_epi8(TMSG1_Y, TMSG0_Y, 4); + TMSG2_X = _mm_add_epi32(TMSG2_X, TMP_X); + TMSG2_Y = _mm_add_epi32(TMSG2_Y, TMP_Y); + TMSG2_X = _mm_sha256msg2_epu32(TMSG2_X, TMSG1_X); + TMSG2_Y = _mm_sha256msg2_epu32(TMSG2_Y, TMSG1_Y); + MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); + MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); + STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); + STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); + TMSG0_X = _mm_sha256msg1_epu32(TMSG0_X, TMSG1_X); + TMSG0_Y = _mm_sha256msg1_epu32(TMSG0_Y, TMSG1_Y); + + // Rounds 24-27 + TMP_X = _mm_set_epi64x(0xBF597FC7B00327C8ULL, 0xA831C66D983E5152ULL); + MSG_X = _mm_add_epi32(TMSG2_X, TMP_X ); + MSG_Y = _mm_add_epi32(TMSG2_Y, TMP_X ); + STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); + STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); + 
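// The alignr/add/sha256msg2 sequence below, together with the earlier + // sha256msg1, completes the next 4 message schedule words for both streams. +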
TMP_X = _mm_alignr_epi8(TMSG2_X, TMSG1_X, 4); + TMP_Y = _mm_alignr_epi8(TMSG2_Y, TMSG1_Y, 4); + TMSG3_X = _mm_add_epi32(TMSG3_X, TMP_X); + TMSG3_Y = _mm_add_epi32(TMSG3_Y, TMP_Y); + TMSG3_X = _mm_sha256msg2_epu32(TMSG3_X, TMSG2_X); + TMSG3_Y = _mm_sha256msg2_epu32(TMSG3_Y, TMSG2_Y); + MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); + MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); + STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); + STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); + TMSG1_X = _mm_sha256msg1_epu32(TMSG1_X, TMSG2_X); + TMSG1_Y = _mm_sha256msg1_epu32(TMSG1_Y, TMSG2_Y); + + // Rounds 28-31 + TMP_X = _mm_set_epi64x(0x1429296706CA6351ULL, 0xD5A79147C6E00BF3ULL); + MSG_X = _mm_add_epi32(TMSG3_X, TMP_X ); + MSG_Y = _mm_add_epi32(TMSG3_Y, TMP_X ); + STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); + STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); + TMP_X = _mm_alignr_epi8(TMSG3_X, TMSG2_X, 4); + TMP_Y = _mm_alignr_epi8(TMSG3_Y, TMSG2_Y, 4); + TMSG0_X = _mm_add_epi32(TMSG0_X, TMP_X); + TMSG0_Y = _mm_add_epi32(TMSG0_Y, TMP_Y); + TMSG0_X = _mm_sha256msg2_epu32(TMSG0_X, TMSG3_X); + TMSG0_Y = _mm_sha256msg2_epu32(TMSG0_Y, TMSG3_Y); + MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); + MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); + STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); + STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); + TMSG2_X = _mm_sha256msg1_epu32(TMSG2_X, TMSG3_X); + TMSG2_Y = _mm_sha256msg1_epu32(TMSG2_Y, TMSG3_Y); + + // Rounds 32-35 + TMP_X = _mm_set_epi64x(0x53380D134D2C6DFCULL, 0x2E1B213827B70A85ULL); + MSG_X = _mm_add_epi32(TMSG0_X, TMP_X ); + MSG_Y = _mm_add_epi32(TMSG0_Y, TMP_X ); + STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); + STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); + TMP_X = _mm_alignr_epi8(TMSG0_X, TMSG3_X, 4); + TMP_Y = _mm_alignr_epi8(TMSG0_Y, TMSG3_Y, 4); + TMSG1_X = _mm_add_epi32(TMSG1_X, TMP_X); + TMSG1_Y = _mm_add_epi32(TMSG1_Y, TMP_Y); + TMSG1_X = _mm_sha256msg2_epu32(TMSG1_X, TMSG0_X); + TMSG1_Y = _mm_sha256msg2_epu32(TMSG1_Y, TMSG0_Y); + MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); + MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); + STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); + STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); + TMSG3_X = _mm_sha256msg1_epu32(TMSG3_X, TMSG0_X); + TMSG3_Y = _mm_sha256msg1_epu32(TMSG3_Y, TMSG0_Y); + + // Rounds 36-39 + TMP_X = _mm_set_epi64x(0x92722C8581C2C92EULL, 0x766A0ABB650A7354ULL); + MSG_X = _mm_add_epi32(TMSG1_X, TMP_X); + MSG_Y = _mm_add_epi32(TMSG1_Y, TMP_X); + STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); + STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); + TMP_X = _mm_alignr_epi8(TMSG1_X, TMSG0_X, 4); + TMP_Y = _mm_alignr_epi8(TMSG1_Y, TMSG0_Y, 4); + TMSG2_X = _mm_add_epi32(TMSG2_X, TMP_X); + TMSG2_Y = _mm_add_epi32(TMSG2_Y, TMP_Y); + TMSG2_X = _mm_sha256msg2_epu32(TMSG2_X, TMSG1_X); + TMSG2_Y = _mm_sha256msg2_epu32(TMSG2_Y, TMSG1_Y); + MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); + MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); + STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); + STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); + TMSG0_X = _mm_sha256msg1_epu32(TMSG0_X, TMSG1_X); + TMSG0_Y = _mm_sha256msg1_epu32(TMSG0_Y, TMSG1_Y); + + // Rounds 40-43 + TMP_X = _mm_set_epi64x(0xC76C51A3C24B8B70ULL, 0xA81A664BA2BFE8A1ULL); + MSG_X = _mm_add_epi32(TMSG2_X, TMP_X); + MSG_Y = _mm_add_epi32(TMSG2_Y, TMP_X); + STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); + STATE1_Y = 
_mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); + TMP_X = _mm_alignr_epi8(TMSG2_X, TMSG1_X, 4); + TMP_Y = _mm_alignr_epi8(TMSG2_Y, TMSG1_Y, 4); + TMSG3_X = _mm_add_epi32(TMSG3_X, TMP_X); + TMSG3_Y = _mm_add_epi32(TMSG3_Y, TMP_Y); + TMSG3_X = _mm_sha256msg2_epu32(TMSG3_X, TMSG2_X); + TMSG3_Y = _mm_sha256msg2_epu32(TMSG3_Y, TMSG2_Y); + MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); + MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); + STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); + STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); + TMSG1_X = _mm_sha256msg1_epu32(TMSG1_X, TMSG2_X); + TMSG1_Y = _mm_sha256msg1_epu32(TMSG1_Y, TMSG2_Y); + + // Rounds 44-47 + TMP_X = _mm_set_epi64x(0x106AA070F40E3585ULL, 0xD6990624D192E819ULL); + MSG_X = _mm_add_epi32(TMSG3_X, TMP_X); + MSG_Y = _mm_add_epi32(TMSG3_Y, TMP_X); + STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); + STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); + TMP_X = _mm_alignr_epi8(TMSG3_X, TMSG2_X, 4); + TMP_Y = _mm_alignr_epi8(TMSG3_Y, TMSG2_Y, 4); + TMSG0_X = _mm_add_epi32(TMSG0_X, TMP_X); + TMSG0_Y = _mm_add_epi32(TMSG0_Y, TMP_Y); + TMSG0_X = _mm_sha256msg2_epu32(TMSG0_X, TMSG3_X); + TMSG0_Y = _mm_sha256msg2_epu32(TMSG0_Y, TMSG3_Y); + MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); + MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); + STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); + STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); + TMSG2_X = _mm_sha256msg1_epu32(TMSG2_X, TMSG3_X); + TMSG2_Y = _mm_sha256msg1_epu32(TMSG2_Y, TMSG3_Y); + + // Rounds 48-51 + TMP_X = _mm_set_epi64x(0x34B0BCB52748774CULL, 0x1E376C0819A4C116ULL); + MSG_X = _mm_add_epi32(TMSG0_X, TMP_X ); + MSG_Y = _mm_add_epi32(TMSG0_Y, TMP_X ); + STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); + STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); + TMP_X = _mm_alignr_epi8(TMSG0_X, TMSG3_X, 4); + TMP_Y = _mm_alignr_epi8(TMSG0_Y, TMSG3_Y, 4); + TMSG1_X = _mm_add_epi32(TMSG1_X, TMP_X); + TMSG1_Y = _mm_add_epi32(TMSG1_Y, TMP_Y); + TMSG1_X = _mm_sha256msg2_epu32(TMSG1_X, TMSG0_X); + TMSG1_Y = _mm_sha256msg2_epu32(TMSG1_Y, TMSG0_Y); + MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); + MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); + STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); + STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); + TMSG3_X = _mm_sha256msg1_epu32(TMSG3_X, TMSG0_X); + TMSG3_Y = _mm_sha256msg1_epu32(TMSG3_Y, TMSG0_Y); + + // Rounds 52-55 + TMP_X = _mm_set_epi64x(0x682E6FF35B9CCA4FULL, 0x4ED8AA4A391C0CB3ULL); + MSG_X = _mm_add_epi32(TMSG1_X, TMP_X ); + MSG_Y = _mm_add_epi32(TMSG1_Y, TMP_X ); + STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); + STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); + TMP_X = _mm_alignr_epi8(TMSG1_X, TMSG0_X, 4); + TMP_Y = _mm_alignr_epi8(TMSG1_Y, TMSG0_Y, 4); + TMSG2_X = _mm_add_epi32(TMSG2_X, TMP_X); + TMSG2_Y = _mm_add_epi32(TMSG2_Y, TMP_Y); + TMSG2_X = _mm_sha256msg2_epu32(TMSG2_X, TMSG1_X); + TMSG2_Y = _mm_sha256msg2_epu32(TMSG2_Y, TMSG1_Y); + MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); + MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); + STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); + STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); + + // Rounds 56-59 + TMP_X = _mm_set_epi64x(0x8CC7020884C87814ULL, 0x78A5636F748F82EEULL); + MSG_X = _mm_add_epi32(TMSG2_X, TMP_X); + MSG_Y = _mm_add_epi32(TMSG2_Y, TMP_X); + STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); + STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); + 
TMP_X = _mm_alignr_epi8(TMSG2_X, TMSG1_X, 4); + TMP_Y = _mm_alignr_epi8(TMSG2_Y, TMSG1_Y, 4); + TMSG3_X = _mm_add_epi32(TMSG3_X, TMP_X); + TMSG3_Y = _mm_add_epi32(TMSG3_Y, TMP_Y); + TMSG3_X = _mm_sha256msg2_epu32(TMSG3_X, TMSG2_X); + TMSG3_Y = _mm_sha256msg2_epu32(TMSG3_Y, TMSG2_Y); + MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); + MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); + STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); + STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); + + // Rounds 60-63 + TMP_X = _mm_set_epi64x(0xC67178F2BEF9A3F7ULL, 0xA4506CEB90BEFFFAULL); + MSG_X = _mm_add_epi32(TMSG3_X, TMP_X); + MSG_Y = _mm_add_epi32(TMSG3_Y, TMP_X); + STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); + STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); + MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); + MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); + STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); + STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); + + // Add values back to state + STATE0_X = _mm_add_epi32(STATE0_X, ABEF_SAVE_X); + STATE1_X = _mm_add_epi32(STATE1_X, CDGH_SAVE_X); + STATE0_Y = _mm_add_epi32(STATE0_Y, ABEF_SAVE_Y); + STATE1_Y = _mm_add_epi32(STATE1_Y, CDGH_SAVE_Y); + + TMP_X = _mm_shuffle_epi32(STATE0_X, 0x1B); // FEBA + TMP_Y = _mm_shuffle_epi32(STATE0_Y, 0x1B); // FEBA + STATE1_X = _mm_shuffle_epi32(STATE1_X, 0xB1); // DCHG + STATE1_Y = _mm_shuffle_epi32(STATE1_Y, 0xB1); // DCHG + STATE0_X = _mm_blend_epi16(TMP_X, STATE1_X, 0xF0); // DCBA + STATE0_Y = _mm_blend_epi16(TMP_Y, STATE1_Y, 0xF0); // DCBA + STATE1_X = _mm_alignr_epi8(STATE1_X, TMP_X, 8); // ABEF + STATE1_Y = _mm_alignr_epi8(STATE1_Y, TMP_Y, 8); // ABEF + + // Save state + _mm_store_si128((__m128i*) &out_X[0], STATE0_X); + _mm_store_si128((__m128i*) &out_X[4], STATE1_X); + _mm_store_si128((__m128i*) &out_Y[0], STATE0_Y); + _mm_store_si128((__m128i*) &out_Y[4], STATE1_Y); +} + +void sha256_ni2way_transform_be( uint32_t *out_X, uint32_t *out_Y, + const void *msg_X, const void *msg_Y, + const uint32_t *in_X, const uint32_t *in_Y ) +{ + __m128i STATE0_X, STATE1_X, STATE0_Y, STATE1_Y; + __m128i MSG_X, MSG_Y, TMP_X, TMP_Y, MASK; + __m128i TMSG0_X, TMSG1_X, TMSG2_X, TMSG3_X; + __m128i TMSG0_Y, TMSG1_Y, TMSG2_Y, TMSG3_Y; + __m128i ABEF_SAVE_X, CDGH_SAVE_X, ABEF_SAVE_Y, CDGH_SAVE_Y; + + // Load initial values + TMP_X = _mm_load_si128((__m128i*) &in_X[0]); + STATE1_X = _mm_load_si128((__m128i*) &in_X[4]); + TMP_Y = _mm_load_si128((__m128i*) &in_Y[0]); + STATE1_Y = _mm_load_si128((__m128i*) &in_Y[4]); + MASK = _mm_set_epi64x(0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL); + + TMP_X = _mm_shuffle_epi32(TMP_X, 0xB1); // CDAB + TMP_Y = _mm_shuffle_epi32(TMP_Y, 0xB1); // CDAB + STATE1_X = _mm_shuffle_epi32(STATE1_X, 0x1B); // EFGH + STATE1_Y = _mm_shuffle_epi32(STATE1_Y, 0x1B); // EFGH + STATE0_X = _mm_alignr_epi8(TMP_X, STATE1_X, 8); // ABEF + STATE0_Y = _mm_alignr_epi8(TMP_Y, STATE1_Y, 8); // ABEF + STATE1_X = _mm_blend_epi16(STATE1_X, TMP_X, 0xF0); // CDGH + STATE1_Y = _mm_blend_epi16(STATE1_Y, TMP_Y, 0xF0); // CDGH + + // Save current hash + ABEF_SAVE_X = STATE0_X; + ABEF_SAVE_Y = STATE0_Y; + CDGH_SAVE_X = STATE1_X; + CDGH_SAVE_Y = STATE1_Y; + + // Rounds 0-3 + TMSG0_X = _mm_load_si128((const __m128i*) (msg_X)); + TMSG0_Y = _mm_load_si128((const __m128i*) (msg_Y)); + TMP_X = _mm_set_epi64x(0xE9B5DBA5B5C0FBCFULL, 0x71374491428A2F98ULL); + TMSG0_X = _mm_shuffle_epi8( TMSG0_X, MASK ); + TMSG0_Y = _mm_shuffle_epi8( TMSG0_Y, MASK ); + MSG_X = _mm_add_epi32( TMSG0_X, TMP_X ); + MSG_Y =
_mm_add_epi32( TMSG0_Y, TMP_X ); + STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); + STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); + MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); + MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); + STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); + STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); + + // Rounds 4-7 + TMSG1_X = _mm_load_si128((const __m128i*) (msg_X+16)); + TMSG1_Y = _mm_load_si128((const __m128i*) (msg_Y+16)); + TMP_X = _mm_set_epi64x(0xAB1C5ED5923F82A4ULL, 0x59F111F13956C25BULL); + TMSG1_X = _mm_shuffle_epi8( TMSG1_X, MASK ); + TMSG1_Y = _mm_shuffle_epi8( TMSG1_Y, MASK ); + MSG_X = _mm_add_epi32(TMSG1_X, TMP_X ); + MSG_Y = _mm_add_epi32(TMSG1_Y, TMP_X ); + STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); + STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); + MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); + MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); + STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); + STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); + TMSG0_X = _mm_sha256msg1_epu32(TMSG0_X, TMSG1_X); + TMSG0_Y = _mm_sha256msg1_epu32(TMSG0_Y, TMSG1_Y); + + // Rounds 8-11 + TMSG2_X = _mm_load_si128((const __m128i*) (msg_X+32)); + TMSG2_Y = _mm_load_si128((const __m128i*) (msg_Y+32)); + TMP_X = _mm_set_epi64x(0x550C7DC3243185BEULL, 0x12835B01D807AA98ULL); + TMSG2_X = _mm_shuffle_epi8( TMSG2_X, MASK ); + TMSG2_Y = _mm_shuffle_epi8( TMSG2_Y, MASK ); + MSG_X = _mm_add_epi32(TMSG2_X, TMP_X ); + MSG_Y = _mm_add_epi32(TMSG2_Y, TMP_X ); + STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); + STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); + MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); + MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); + STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); + STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); + TMSG1_X = _mm_sha256msg1_epu32(TMSG1_X, TMSG2_X); + TMSG1_Y = _mm_sha256msg1_epu32(TMSG1_Y, TMSG2_Y); + + // Rounds 12-15 + TMSG3_X = _mm_load_si128((const __m128i*) (msg_X+48)); + TMSG3_Y = _mm_load_si128((const __m128i*) (msg_Y+48)); + TMP_X = _mm_set_epi64x(0xC19BF1749BDC06A7ULL, 0x80DEB1FE72BE5D74ULL); + TMSG3_X = _mm_shuffle_epi8( TMSG3_X, MASK ); + TMSG3_Y = _mm_shuffle_epi8( TMSG3_Y, MASK ); + MSG_X = _mm_add_epi32(TMSG3_X, TMP_X ); + MSG_Y = _mm_add_epi32(TMSG3_Y, TMP_X ); + STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); + STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); + TMP_X = _mm_alignr_epi8(TMSG3_X, TMSG2_X, 4); + TMP_Y = _mm_alignr_epi8(TMSG3_Y, TMSG2_Y, 4); + TMSG0_X = _mm_add_epi32(TMSG0_X, TMP_X); + TMSG0_Y = _mm_add_epi32(TMSG0_Y, TMP_Y); + TMSG0_X = _mm_sha256msg2_epu32(TMSG0_X, TMSG3_X); + TMSG0_Y = _mm_sha256msg2_epu32(TMSG0_Y, TMSG3_Y); + MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); + MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); + STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); + STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); + TMSG2_X = _mm_sha256msg1_epu32(TMSG2_X, TMSG3_X); + TMSG2_Y = _mm_sha256msg1_epu32(TMSG2_Y, TMSG3_Y); + + // Rounds 16-19 + TMP_X = _mm_set_epi64x(0x240CA1CC0FC19DC6ULL, 0xEFBE4786E49B69C1ULL); + MSG_X = _mm_add_epi32(TMSG0_X, TMP_X ); + MSG_Y = _mm_add_epi32(TMSG0_Y, TMP_X ); + STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); + STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); + TMP_X = _mm_alignr_epi8(TMSG0_X, TMSG3_X, 4); + TMP_Y = _mm_alignr_epi8(TMSG0_Y, TMSG3_Y, 4); + TMSG1_X = 
_mm_add_epi32(TMSG1_X, TMP_X); + TMSG1_Y = _mm_add_epi32(TMSG1_Y, TMP_Y); + TMSG1_X = _mm_sha256msg2_epu32(TMSG1_X, TMSG0_X); + TMSG1_Y = _mm_sha256msg2_epu32(TMSG1_Y, TMSG0_Y); + MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); + MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); + STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); + STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); + TMSG3_X = _mm_sha256msg1_epu32(TMSG3_X, TMSG0_X); + TMSG3_Y = _mm_sha256msg1_epu32(TMSG3_Y, TMSG0_Y); + + // Rounds 20-23 + TMP_X = _mm_set_epi64x(0x76F988DA5CB0A9DCULL, 0x4A7484AA2DE92C6FULL); + MSG_X = _mm_add_epi32(TMSG1_X, TMP_X ); + MSG_Y = _mm_add_epi32(TMSG1_Y, TMP_X ); + STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); + STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); + TMP_X = _mm_alignr_epi8(TMSG1_X, TMSG0_X, 4); + TMP_Y = _mm_alignr_epi8(TMSG1_Y, TMSG0_Y, 4); + TMSG2_X = _mm_add_epi32(TMSG2_X, TMP_X); + TMSG2_Y = _mm_add_epi32(TMSG2_Y, TMP_Y); + TMSG2_X = _mm_sha256msg2_epu32(TMSG2_X, TMSG1_X); + TMSG2_Y = _mm_sha256msg2_epu32(TMSG2_Y, TMSG1_Y); + MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); + MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); + STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); + STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); + TMSG0_X = _mm_sha256msg1_epu32(TMSG0_X, TMSG1_X); + TMSG0_Y = _mm_sha256msg1_epu32(TMSG0_Y, TMSG1_Y); + + // Rounds 24-27 + TMP_X = _mm_set_epi64x(0xBF597FC7B00327C8ULL, 0xA831C66D983E5152ULL); + MSG_X = _mm_add_epi32(TMSG2_X, TMP_X ); + MSG_Y = _mm_add_epi32(TMSG2_Y, TMP_X ); + STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); + STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); + TMP_X = _mm_alignr_epi8(TMSG2_X, TMSG1_X, 4); + TMP_Y = _mm_alignr_epi8(TMSG2_Y, TMSG1_Y, 4); + TMSG3_X = _mm_add_epi32(TMSG3_X, TMP_X); + TMSG3_Y = _mm_add_epi32(TMSG3_Y, TMP_Y); + TMSG3_X = _mm_sha256msg2_epu32(TMSG3_X, TMSG2_X); + TMSG3_Y = _mm_sha256msg2_epu32(TMSG3_Y, TMSG2_Y); + MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); + MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); + STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); + STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); + TMSG1_X = _mm_sha256msg1_epu32(TMSG1_X, TMSG2_X); + TMSG1_Y = _mm_sha256msg1_epu32(TMSG1_Y, TMSG2_Y); + + // Rounds 28-31 + TMP_X = _mm_set_epi64x(0x1429296706CA6351ULL, 0xD5A79147C6E00BF3ULL); + MSG_X = _mm_add_epi32(TMSG3_X, TMP_X ); + MSG_Y = _mm_add_epi32(TMSG3_Y, TMP_X ); + STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); + STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); + TMP_X = _mm_alignr_epi8(TMSG3_X, TMSG2_X, 4); + TMP_Y = _mm_alignr_epi8(TMSG3_Y, TMSG2_Y, 4); + TMSG0_X = _mm_add_epi32(TMSG0_X, TMP_X); + TMSG0_Y = _mm_add_epi32(TMSG0_Y, TMP_Y); + TMSG0_X = _mm_sha256msg2_epu32(TMSG0_X, TMSG3_X); + TMSG0_Y = _mm_sha256msg2_epu32(TMSG0_Y, TMSG3_Y); + MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); + MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); + STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); + STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); + TMSG2_X = _mm_sha256msg1_epu32(TMSG2_X, TMSG3_X); + TMSG2_Y = _mm_sha256msg1_epu32(TMSG2_Y, TMSG3_Y); + + // Rounds 32-35 + TMP_X = _mm_set_epi64x(0x53380D134D2C6DFCULL, 0x2E1B213827B70A85ULL); + MSG_X = _mm_add_epi32(TMSG0_X, TMP_X ); + MSG_Y = _mm_add_epi32(TMSG0_Y, TMP_X ); + STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); + STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); + TMP_X = _mm_alignr_epi8(TMSG0_X, TMSG3_X, 4); + 
TMP_Y = _mm_alignr_epi8(TMSG0_Y, TMSG3_Y, 4); + TMSG1_X = _mm_add_epi32(TMSG1_X, TMP_X); + TMSG1_Y = _mm_add_epi32(TMSG1_Y, TMP_Y); + TMSG1_X = _mm_sha256msg2_epu32(TMSG1_X, TMSG0_X); + TMSG1_Y = _mm_sha256msg2_epu32(TMSG1_Y, TMSG0_Y); + MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); + MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); + STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); + STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); + TMSG3_X = _mm_sha256msg1_epu32(TMSG3_X, TMSG0_X); + TMSG3_Y = _mm_sha256msg1_epu32(TMSG3_Y, TMSG0_Y); + + // Rounds 36-39 + TMP_X = _mm_set_epi64x(0x92722C8581C2C92EULL, 0x766A0ABB650A7354ULL); + MSG_X = _mm_add_epi32(TMSG1_X, TMP_X); + MSG_Y = _mm_add_epi32(TMSG1_Y, TMP_X); + STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); + STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); + TMP_X = _mm_alignr_epi8(TMSG1_X, TMSG0_X, 4); + TMP_Y = _mm_alignr_epi8(TMSG1_Y, TMSG0_Y, 4); + TMSG2_X = _mm_add_epi32(TMSG2_X, TMP_X); + TMSG2_Y = _mm_add_epi32(TMSG2_Y, TMP_Y); + TMSG2_X = _mm_sha256msg2_epu32(TMSG2_X, TMSG1_X); + TMSG2_Y = _mm_sha256msg2_epu32(TMSG2_Y, TMSG1_Y); + MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); + MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); + STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); + STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); + TMSG0_X = _mm_sha256msg1_epu32(TMSG0_X, TMSG1_X); + TMSG0_Y = _mm_sha256msg1_epu32(TMSG0_Y, TMSG1_Y); + + // Rounds 40-43 + TMP_X = _mm_set_epi64x(0xC76C51A3C24B8B70ULL, 0xA81A664BA2BFE8A1ULL); + MSG_X = _mm_add_epi32(TMSG2_X, TMP_X); + MSG_Y = _mm_add_epi32(TMSG2_Y, TMP_X); + STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); + STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); + TMP_X = _mm_alignr_epi8(TMSG2_X, TMSG1_X, 4); + TMP_Y = _mm_alignr_epi8(TMSG2_Y, TMSG1_Y, 4); + TMSG3_X = _mm_add_epi32(TMSG3_X, TMP_X); + TMSG3_Y = _mm_add_epi32(TMSG3_Y, TMP_Y); + TMSG3_X = _mm_sha256msg2_epu32(TMSG3_X, TMSG2_X); + TMSG3_Y = _mm_sha256msg2_epu32(TMSG3_Y, TMSG2_Y); + MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); + MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); + STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); + STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); + TMSG1_X = _mm_sha256msg1_epu32(TMSG1_X, TMSG2_X); + TMSG1_Y = _mm_sha256msg1_epu32(TMSG1_Y, TMSG2_Y); + + // Rounds 44-47 + TMP_X = _mm_set_epi64x(0x106AA070F40E3585ULL, 0xD6990624D192E819ULL); + MSG_X = _mm_add_epi32(TMSG3_X, TMP_X); + MSG_Y = _mm_add_epi32(TMSG3_Y, TMP_X); + STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); + STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); + TMP_X = _mm_alignr_epi8(TMSG3_X, TMSG2_X, 4); + TMP_Y = _mm_alignr_epi8(TMSG3_Y, TMSG2_Y, 4); + TMSG0_X = _mm_add_epi32(TMSG0_X, TMP_X); + TMSG0_Y = _mm_add_epi32(TMSG0_Y, TMP_Y); + TMSG0_X = _mm_sha256msg2_epu32(TMSG0_X, TMSG3_X); + TMSG0_Y = _mm_sha256msg2_epu32(TMSG0_Y, TMSG3_Y); + MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); + MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); + STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); + STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); + TMSG2_X = _mm_sha256msg1_epu32(TMSG2_X, TMSG3_X); + TMSG2_Y = _mm_sha256msg1_epu32(TMSG2_Y, TMSG3_Y); + + // Rounds 48-51 + TMP_X = _mm_set_epi64x(0x34B0BCB52748774CULL, 0x1E376C0819A4C116ULL); + MSG_X = _mm_add_epi32(TMSG0_X, TMP_X ); + MSG_Y = _mm_add_epi32(TMSG0_Y, TMP_X ); + STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); + STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); + 
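// sha256rnds2 computes 2 rounds per call; interleaving the independent X + // and Y streams helps hide its latency. +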
TMP_X = _mm_alignr_epi8(TMSG0_X, TMSG3_X, 4); + TMP_Y = _mm_alignr_epi8(TMSG0_Y, TMSG3_Y, 4); + TMSG1_X = _mm_add_epi32(TMSG1_X, TMP_X); + TMSG1_Y = _mm_add_epi32(TMSG1_Y, TMP_Y); + TMSG1_X = _mm_sha256msg2_epu32(TMSG1_X, TMSG0_X); + TMSG1_Y = _mm_sha256msg2_epu32(TMSG1_Y, TMSG0_Y); + MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); + MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); + STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); + STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); + TMSG3_X = _mm_sha256msg1_epu32(TMSG3_X, TMSG0_X); + TMSG3_Y = _mm_sha256msg1_epu32(TMSG3_Y, TMSG0_Y); + + // Rounds 52-55 + TMP_X = _mm_set_epi64x(0x682E6FF35B9CCA4FULL, 0x4ED8AA4A391C0CB3ULL); + MSG_X = _mm_add_epi32(TMSG1_X, TMP_X ); + MSG_Y = _mm_add_epi32(TMSG1_Y, TMP_X ); + STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); + STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); + TMP_X = _mm_alignr_epi8(TMSG1_X, TMSG0_X, 4); + TMP_Y = _mm_alignr_epi8(TMSG1_Y, TMSG0_Y, 4); + TMSG2_X = _mm_add_epi32(TMSG2_X, TMP_X); + TMSG2_Y = _mm_add_epi32(TMSG2_Y, TMP_Y); + TMSG2_X = _mm_sha256msg2_epu32(TMSG2_X, TMSG1_X); + TMSG2_Y = _mm_sha256msg2_epu32(TMSG2_Y, TMSG1_Y); + MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); + MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); + STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); + STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); + + // Rounds 56-59 + TMP_X = _mm_set_epi64x(0x8CC7020884C87814ULL, 0x78A5636F748F82EEULL); + MSG_X = _mm_add_epi32(TMSG2_X, TMP_X); + MSG_Y = _mm_add_epi32(TMSG2_Y, TMP_X); + STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); + STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); + TMP_X = _mm_alignr_epi8(TMSG2_X, TMSG1_X, 4); + TMP_Y = _mm_alignr_epi8(TMSG2_Y, TMSG1_Y, 4); + TMSG3_X = _mm_add_epi32(TMSG3_X, TMP_X); + TMSG3_Y = _mm_add_epi32(TMSG3_Y, TMP_Y); + TMSG3_X = _mm_sha256msg2_epu32(TMSG3_X, TMSG2_X); + TMSG3_Y = _mm_sha256msg2_epu32(TMSG3_Y, TMSG2_Y); + MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); + MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); + STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); + STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); + + // Rounds 60-63 + TMP_X = _mm_set_epi64x(0xC67178F2BEF9A3F7ULL, 0xA4506CEB90BEFFFAULL); + MSG_X = _mm_add_epi32(TMSG3_X, TMP_X); + MSG_Y = _mm_add_epi32(TMSG3_Y, TMP_X); + STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); + STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); + MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); + MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); + STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); + STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); + + // Add values back to state + STATE0_X = _mm_add_epi32(STATE0_X, ABEF_SAVE_X); + STATE1_X = _mm_add_epi32(STATE1_X, CDGH_SAVE_X); + STATE0_Y = _mm_add_epi32(STATE0_Y, ABEF_SAVE_Y); + STATE1_Y = _mm_add_epi32(STATE1_Y, CDGH_SAVE_Y); + + TMP_X = _mm_shuffle_epi32(STATE0_X, 0x1B); // FEBA + TMP_Y = _mm_shuffle_epi32(STATE0_Y, 0x1B); // FEBA + STATE1_X = _mm_shuffle_epi32(STATE1_X, 0xB1); // DCHG + STATE1_Y = _mm_shuffle_epi32(STATE1_Y, 0xB1); // DCHG + STATE0_X = _mm_blend_epi16(TMP_X, STATE1_X, 0xF0); // DCBA + STATE0_Y = _mm_blend_epi16(TMP_Y, STATE1_Y, 0xF0); // DCBA + STATE1_X = _mm_alignr_epi8(STATE1_X, TMP_X, 8); // ABEF + STATE1_Y = _mm_alignr_epi8(STATE1_Y, TMP_Y, 8); // ABEF + + // Save state + _mm_store_si128((__m128i*) &out_X[0], STATE0_X); + _mm_store_si128((__m128i*) &out_X[4], STATE1_X); + _mm_store_si128((__m128i*) 
&out_Y[0], STATE0_Y); + _mm_store_si128((__m128i*) &out_Y[4], STATE1_Y); +} + +// The next 2 functions work together to separate the low frequency data +// (outer loop) from the high frequency data containing the nonce (inner loop) +// when hashing the second block (tail) of the first sha256 hash. +// The goal is to avoid any redundant processing in the final rounds. Prehash +// covers almost 4 full rounds, missing only the final addition of the nonce. +// The nonce must be set to zero for prehash. +void sha256_ni_prehash_3rounds( uint32_t *ostate, const void *msg, + uint32_t *sstate, const uint32_t *istate ) +{ + __m128i STATE0, STATE1, MSG, TMP; + + // Load initial values + TMP = casti_m128i( istate, 0 ); + STATE1 = casti_m128i( istate, 1 ); + + TMP = _mm_shuffle_epi32( TMP, 0xB1 ); // CDAB + STATE1 = _mm_shuffle_epi32( STATE1, 0x1B ); // EFGH + STATE0 = _mm_alignr_epi8( TMP, STATE1, 8 ); // ABEF + STATE1 = _mm_blend_epi16( STATE1, TMP, 0xF0 ); // CDGH + + // Save current hash + casti_m128i( sstate, 0 ) = STATE0; + casti_m128i( sstate, 1 ) = STATE1; + + // Rounds 0 to 3 + MSG = casti_m128i( msg, 0 ); + TMP = _mm_set_epi64x( 0xE9B5DBA5B5C0FBCFULL, 0x71374491428A2F98ULL ); + MSG = _mm_add_epi32( MSG, TMP ); + STATE1 = _mm_sha256rnds2_epu32( STATE1, STATE0, MSG ); + MSG = _mm_shuffle_epi32( MSG, 0x0E ); + casti_m128i( ostate, 0 ) = _mm_sha256rnds2_epu32( STATE0, STATE1, MSG ); + casti_m128i( ostate, 1 ) = STATE1; +} + +void sha256_ni2way_final_rounds( uint32_t *out_X, uint32_t *out_Y, + const void *msg_X, const void *msg_Y, + const uint32_t *state_mid_X, const uint32_t *state_mid_Y, + const uint32_t *state_save_X, const uint32_t *state_save_Y ) +{ + __m128i STATE0_X, STATE1_X, STATE0_Y, STATE1_Y; + __m128i MSG_X, MSG_Y, TMP_X, TMP_Y; + __m128i TMSG0_X, TMSG1_X, TMSG2_X, TMSG3_X; + __m128i TMSG0_Y, TMSG1_Y, TMSG2_Y, TMSG3_Y; + + STATE0_X = casti_m128i( state_mid_X, 0 ); + STATE1_X = casti_m128i( state_mid_X, 1 ); + STATE0_Y = casti_m128i( state_mid_Y, 0 ); + STATE1_Y = casti_m128i( state_mid_Y, 1 ); + + // Add the nonces (msg[0] lane 3) to A & E (STATE0 lanes 1 & 3) + TMSG0_X = casti_m128i( msg_X, 0 ); + TMSG0_Y = casti_m128i( msg_Y, 0 ); + TMP_X = mm128_xim_32( TMSG0_X, TMSG0_X, 0xd5 ); + TMP_Y = mm128_xim_32( TMSG0_Y, TMSG0_Y, 0xd5 ); + STATE0_X = _mm_add_epi32( STATE0_X, TMP_X ); + STATE0_Y = _mm_add_epi32( STATE0_Y, TMP_Y ); + + // Rounds 4 to 7 + TMSG1_X = casti_m128i( msg_X, 1 ); + TMSG1_Y = casti_m128i( msg_Y, 1 ); + TMP_X = _mm_set_epi64x( 0xAB1C5ED5923F82A4ULL, 0x59F111F13956C25BULL ); + MSG_X = _mm_add_epi32( TMSG1_X, TMP_X ); + MSG_Y = _mm_add_epi32( TMSG1_Y, TMP_X ); + STATE1_X = _mm_sha256rnds2_epu32( STATE1_X, STATE0_X, MSG_X ); + STATE1_Y = _mm_sha256rnds2_epu32( STATE1_Y, STATE0_Y, MSG_Y ); + MSG_X = _mm_shuffle_epi32( MSG_X, 0x0E ); + MSG_Y = _mm_shuffle_epi32( MSG_Y, 0x0E ); + STATE0_X = _mm_sha256rnds2_epu32( STATE0_X, STATE1_X, MSG_X ); + STATE0_Y = _mm_sha256rnds2_epu32( STATE0_Y, STATE1_Y, MSG_Y ); + TMSG0_X = _mm_sha256msg1_epu32( TMSG0_X, TMSG1_X ); + TMSG0_Y = _mm_sha256msg1_epu32( TMSG0_Y, TMSG1_Y ); + + // Rounds 8 to 11: skip TMSG2, it's zero until round 22 + MSG_X = _mm_set_epi64x( 0x550C7DC3243185BEULL, 0x12835B01D807AA98ULL ); + STATE1_X = _mm_sha256rnds2_epu32( STATE1_X, STATE0_X, MSG_X ); + STATE1_Y = _mm_sha256rnds2_epu32( STATE1_Y, STATE0_Y, MSG_X ); + MSG_X = _mm_shuffle_epi32( MSG_X, 0x0E ); + STATE0_X = _mm_sha256rnds2_epu32( STATE0_X, STATE1_X, MSG_X ); + STATE0_Y = _mm_sha256rnds2_epu32( STATE0_Y, STATE1_Y, MSG_X ); + + // Rounds 12 to 15 + TMSG3_X = casti_m128i( msg_X,
3 ); + TMSG3_Y = casti_m128i( msg_Y, 3 ); + TMP_X = _mm_set_epi64x( 0xC19BF1749BDC06A7ULL, 0x80DEB1FE72BE5D74ULL ); + MSG_X = _mm_add_epi32( TMSG3_X, TMP_X ); + MSG_Y = _mm_add_epi32( TMSG3_Y, TMP_X ); + STATE1_X = _mm_sha256rnds2_epu32( STATE1_X, STATE0_X, MSG_X ); + STATE1_Y = _mm_sha256rnds2_epu32( STATE1_Y, STATE0_Y, MSG_Y ); + TMSG0_X = _mm_sha256msg2_epu32( TMSG0_X, TMSG3_X ); + TMSG0_Y = _mm_sha256msg2_epu32( TMSG0_Y, TMSG3_Y ); + MSG_X = _mm_shuffle_epi32( MSG_X, 0x0E ); + MSG_Y = _mm_shuffle_epi32( MSG_Y, 0x0E ); + STATE0_X = _mm_sha256rnds2_epu32( STATE0_X, STATE1_X, MSG_X ); + STATE0_Y = _mm_sha256rnds2_epu32( STATE0_Y, STATE1_Y, MSG_Y ); + + // Rounds 16 to 19 + TMP_X = _mm_set_epi64x( 0x240CA1CC0FC19DC6ULL, 0xEFBE4786E49B69C1ULL ); + MSG_X = _mm_add_epi32( TMSG0_X, TMP_X ); + MSG_Y = _mm_add_epi32( TMSG0_Y, TMP_X ); + STATE1_X = _mm_sha256rnds2_epu32( STATE1_X, STATE0_X, MSG_X ); + STATE1_Y = _mm_sha256rnds2_epu32( STATE1_Y, STATE0_Y, MSG_Y ); + TMP_X = _mm_alignr_epi8( TMSG0_X, TMSG3_X, 4 ); + TMP_Y = _mm_alignr_epi8( TMSG0_Y, TMSG3_Y, 4 ); + TMSG1_X = _mm_add_epi32( TMSG1_X, TMP_X ); + TMSG1_Y = _mm_add_epi32( TMSG1_Y, TMP_Y ); + TMSG1_X = _mm_sha256msg2_epu32( TMSG1_X, TMSG0_X ); + TMSG1_Y = _mm_sha256msg2_epu32( TMSG1_Y, TMSG0_Y ); + MSG_X = _mm_shuffle_epi32( MSG_X, 0x0E ); + MSG_Y = _mm_shuffle_epi32( MSG_Y, 0x0E ); + STATE0_X = _mm_sha256rnds2_epu32( STATE0_X, STATE1_X, MSG_X ); + STATE0_Y = _mm_sha256rnds2_epu32( STATE0_Y, STATE1_Y, MSG_Y ); + TMSG3_X = _mm_sha256msg1_epu32( TMSG3_X, TMSG0_X ); + TMSG3_Y = _mm_sha256msg1_epu32( TMSG3_Y, TMSG0_Y ); + + // Rounds 20 to 23 + TMP_X = _mm_set_epi64x( 0x76F988DA5CB0A9DCULL, 0x4A7484AA2DE92C6FULL ); + MSG_X = _mm_add_epi32( TMSG1_X, TMP_X ); + MSG_Y = _mm_add_epi32( TMSG1_Y, TMP_X ); + STATE1_X = _mm_sha256rnds2_epu32( STATE1_X, STATE0_X, MSG_X ); + STATE1_Y = _mm_sha256rnds2_epu32( STATE1_Y, STATE0_Y, MSG_Y ); + TMSG2_X = _mm_alignr_epi8( TMSG1_X, TMSG0_X, 4 ); + TMSG2_Y = _mm_alignr_epi8( TMSG1_Y, TMSG0_Y, 4 ); + TMSG2_X = _mm_sha256msg2_epu32( TMSG2_X, TMSG1_X ); + TMSG2_Y = _mm_sha256msg2_epu32( TMSG2_Y, TMSG1_Y ); + MSG_X = _mm_shuffle_epi32( MSG_X, 0x0E ); + MSG_Y = _mm_shuffle_epi32( MSG_Y, 0x0E ); + STATE0_X = _mm_sha256rnds2_epu32( STATE0_X, STATE1_X, MSG_X ); + STATE0_Y = _mm_sha256rnds2_epu32( STATE0_Y, STATE1_Y, MSG_Y ); + TMSG0_X = _mm_sha256msg1_epu32( TMSG0_X, TMSG1_X ); + TMSG0_Y = _mm_sha256msg1_epu32( TMSG0_Y, TMSG1_Y ); + + // Rounds 24 to 27 + TMP_X = _mm_set_epi64x( 0xBF597FC7B00327C8ULL, 0xA831C66D983E5152ULL ); + MSG_X = _mm_add_epi32( TMSG2_X, TMP_X ); + MSG_Y = _mm_add_epi32( TMSG2_Y, TMP_X ); + STATE1_X = _mm_sha256rnds2_epu32( STATE1_X, STATE0_X, MSG_X ); + STATE1_Y = _mm_sha256rnds2_epu32( STATE1_Y, STATE0_Y, MSG_Y ); + TMP_X = _mm_alignr_epi8( TMSG2_X, TMSG1_X, 4 ); + TMP_Y = _mm_alignr_epi8( TMSG2_Y, TMSG1_Y, 4 ); + TMSG3_X = _mm_add_epi32( TMSG3_X, TMP_X ); + TMSG3_Y = _mm_add_epi32( TMSG3_Y, TMP_Y ); + TMSG3_X = _mm_sha256msg2_epu32( TMSG3_X, TMSG2_X ); + TMSG3_Y = _mm_sha256msg2_epu32( TMSG3_Y, TMSG2_Y ); + MSG_X = _mm_shuffle_epi32( MSG_X, 0x0E ); + MSG_Y = _mm_shuffle_epi32( MSG_Y, 0x0E ); + STATE0_X = _mm_sha256rnds2_epu32( STATE0_X, STATE1_X, MSG_X ); + STATE0_Y = _mm_sha256rnds2_epu32( STATE0_Y, STATE1_Y, MSG_Y ); + TMSG1_X = _mm_sha256msg1_epu32( TMSG1_X, TMSG2_X ); + TMSG1_Y = _mm_sha256msg1_epu32( TMSG1_Y, TMSG2_Y ); + + // Rounds 28 to 31 + TMP_X = _mm_set_epi64x( 0x1429296706CA6351ULL, 0xD5A79147C6E00BF3ULL ); + MSG_X = _mm_add_epi32( TMSG3_X, TMP_X ); + MSG_Y = _mm_add_epi32( 
TMSG3_Y, TMP_X ); + STATE1_X = _mm_sha256rnds2_epu32( STATE1_X, STATE0_X, MSG_X ); + STATE1_Y = _mm_sha256rnds2_epu32( STATE1_Y, STATE0_Y, MSG_Y ); + TMP_X = _mm_alignr_epi8( TMSG3_X, TMSG2_X, 4 ); + TMP_Y = _mm_alignr_epi8( TMSG3_Y, TMSG2_Y, 4 ); + TMSG0_X = _mm_add_epi32( TMSG0_X, TMP_X ); + TMSG0_Y = _mm_add_epi32( TMSG0_Y, TMP_Y ); + TMSG0_X = _mm_sha256msg2_epu32( TMSG0_X, TMSG3_X ); + TMSG0_Y = _mm_sha256msg2_epu32( TMSG0_Y, TMSG3_Y ); + MSG_X = _mm_shuffle_epi32( MSG_X, 0x0E ); + MSG_Y = _mm_shuffle_epi32( MSG_Y, 0x0E ); + STATE0_X = _mm_sha256rnds2_epu32( STATE0_X, STATE1_X, MSG_X ); + STATE0_Y = _mm_sha256rnds2_epu32( STATE0_Y, STATE1_Y, MSG_Y ); + TMSG2_X = _mm_sha256msg1_epu32( TMSG2_X, TMSG3_X ); + TMSG2_Y = _mm_sha256msg1_epu32( TMSG2_Y, TMSG3_Y ); + + // Rounds 32 to 35 + TMP_X = _mm_set_epi64x( 0x53380D134D2C6DFCULL, 0x2E1B213827B70A85ULL ); + MSG_X = _mm_add_epi32( TMSG0_X, TMP_X ); + MSG_Y = _mm_add_epi32( TMSG0_Y, TMP_X ); + STATE1_X = _mm_sha256rnds2_epu32( STATE1_X, STATE0_X, MSG_X ); + STATE1_Y = _mm_sha256rnds2_epu32( STATE1_Y, STATE0_Y, MSG_Y ); + TMP_X = _mm_alignr_epi8( TMSG0_X, TMSG3_X, 4 ); + TMP_Y = _mm_alignr_epi8( TMSG0_Y, TMSG3_Y, 4 ); + TMSG1_X = _mm_add_epi32( TMSG1_X, TMP_X ); + TMSG1_Y = _mm_add_epi32( TMSG1_Y, TMP_Y ); + TMSG1_X = _mm_sha256msg2_epu32( TMSG1_X, TMSG0_X ); + TMSG1_Y = _mm_sha256msg2_epu32( TMSG1_Y, TMSG0_Y ); + MSG_X = _mm_shuffle_epi32( MSG_X, 0x0E ); + MSG_Y = _mm_shuffle_epi32( MSG_Y, 0x0E ); + STATE0_X = _mm_sha256rnds2_epu32( STATE0_X, STATE1_X, MSG_X ); + STATE0_Y = _mm_sha256rnds2_epu32( STATE0_Y, STATE1_Y, MSG_Y ); + TMSG3_X = _mm_sha256msg1_epu32( TMSG3_X, TMSG0_X ); + TMSG3_Y = _mm_sha256msg1_epu32( TMSG3_Y, TMSG0_Y ); + + // Rounds 36 to 39 + TMP_X = _mm_set_epi64x( 0x92722C8581C2C92EULL, 0x766A0ABB650A7354ULL ); + MSG_X = _mm_add_epi32( TMSG1_X, TMP_X ); + MSG_Y = _mm_add_epi32( TMSG1_Y, TMP_X ); + STATE1_X = _mm_sha256rnds2_epu32( STATE1_X, STATE0_X, MSG_X ); + STATE1_Y = _mm_sha256rnds2_epu32( STATE1_Y, STATE0_Y, MSG_Y ); + TMP_X = _mm_alignr_epi8( TMSG1_X, TMSG0_X, 4 ); + TMP_Y = _mm_alignr_epi8( TMSG1_Y, TMSG0_Y, 4 ); + TMSG2_X = _mm_add_epi32( TMSG2_X, TMP_X ); + TMSG2_Y = _mm_add_epi32( TMSG2_Y, TMP_Y ); + TMSG2_X = _mm_sha256msg2_epu32( TMSG2_X, TMSG1_X ); + TMSG2_Y = _mm_sha256msg2_epu32( TMSG2_Y, TMSG1_Y ); + MSG_X = _mm_shuffle_epi32( MSG_X, 0x0E ); + MSG_Y = _mm_shuffle_epi32( MSG_Y, 0x0E ); + STATE0_X = _mm_sha256rnds2_epu32( STATE0_X, STATE1_X, MSG_X ); + STATE0_Y = _mm_sha256rnds2_epu32( STATE0_Y, STATE1_Y, MSG_Y ); + TMSG0_X = _mm_sha256msg1_epu32( TMSG0_X, TMSG1_X ); + TMSG0_Y = _mm_sha256msg1_epu32( TMSG0_Y, TMSG1_Y ); + + // Rounds 40 to 43 + TMP_X = _mm_set_epi64x( 0xC76C51A3C24B8B70ULL, 0xA81A664BA2BFE8A1ULL ); + MSG_X = _mm_add_epi32( TMSG2_X, TMP_X ); + MSG_Y = _mm_add_epi32( TMSG2_Y, TMP_X ); + STATE1_X = _mm_sha256rnds2_epu32( STATE1_X, STATE0_X, MSG_X ); + STATE1_Y = _mm_sha256rnds2_epu32( STATE1_Y, STATE0_Y, MSG_Y ); + TMP_X = _mm_alignr_epi8( TMSG2_X, TMSG1_X, 4 ); + TMP_Y = _mm_alignr_epi8( TMSG2_Y, TMSG1_Y, 4 ); + TMSG3_X = _mm_add_epi32( TMSG3_X, TMP_X ); + TMSG3_Y = _mm_add_epi32( TMSG3_Y, TMP_Y ); + TMSG3_X = _mm_sha256msg2_epu32( TMSG3_X, TMSG2_X ); + TMSG3_Y = _mm_sha256msg2_epu32( TMSG3_Y, TMSG2_Y ); + MSG_X = _mm_shuffle_epi32( MSG_X, 0x0E ); + MSG_Y = _mm_shuffle_epi32( MSG_Y, 0x0E ); + STATE0_X = _mm_sha256rnds2_epu32( STATE0_X, STATE1_X, MSG_X ); + STATE0_Y = _mm_sha256rnds2_epu32( STATE0_Y, STATE1_Y, MSG_Y ); + TMSG1_X = _mm_sha256msg1_epu32( TMSG1_X, TMSG2_X ); + TMSG1_Y = 
_mm_sha256msg1_epu32( TMSG1_Y, TMSG2_Y ); + + // Rounds 44 to 47 + TMP_X = _mm_set_epi64x( 0x106AA070F40E3585ULL, 0xD6990624D192E819ULL ); + MSG_X = _mm_add_epi32( TMSG3_X, TMP_X ); + MSG_Y = _mm_add_epi32( TMSG3_Y, TMP_X ); + STATE1_X = _mm_sha256rnds2_epu32( STATE1_X, STATE0_X, MSG_X ); + STATE1_Y = _mm_sha256rnds2_epu32( STATE1_Y, STATE0_Y, MSG_Y ); + TMP_X = _mm_alignr_epi8( TMSG3_X, TMSG2_X, 4 ); + TMP_Y = _mm_alignr_epi8( TMSG3_Y, TMSG2_Y, 4 ); + TMSG0_X = _mm_add_epi32( TMSG0_X, TMP_X ); + TMSG0_Y = _mm_add_epi32( TMSG0_Y, TMP_Y ); + TMSG0_X = _mm_sha256msg2_epu32( TMSG0_X, TMSG3_X ); + TMSG0_Y = _mm_sha256msg2_epu32( TMSG0_Y, TMSG3_Y ); + MSG_X = _mm_shuffle_epi32( MSG_X, 0x0E ); + MSG_Y = _mm_shuffle_epi32( MSG_Y, 0x0E ); + STATE0_X = _mm_sha256rnds2_epu32( STATE0_X, STATE1_X, MSG_X ); + STATE0_Y = _mm_sha256rnds2_epu32( STATE0_Y, STATE1_Y, MSG_Y ); + TMSG2_X = _mm_sha256msg1_epu32( TMSG2_X, TMSG3_X ); + TMSG2_Y = _mm_sha256msg1_epu32( TMSG2_Y, TMSG3_Y ); + + // Rounds 48 to 51 + TMP_X = _mm_set_epi64x( 0x34B0BCB52748774CULL, 0x1E376C0819A4C116ULL ); + MSG_X = _mm_add_epi32( TMSG0_X, TMP_X ); + MSG_Y = _mm_add_epi32( TMSG0_Y, TMP_X ); + STATE1_X = _mm_sha256rnds2_epu32( STATE1_X, STATE0_X, MSG_X ); + STATE1_Y = _mm_sha256rnds2_epu32( STATE1_Y, STATE0_Y, MSG_Y ); + TMP_X = _mm_alignr_epi8( TMSG0_X, TMSG3_X, 4 ); + TMP_Y = _mm_alignr_epi8( TMSG0_Y, TMSG3_Y, 4 ); + TMSG1_X = _mm_add_epi32( TMSG1_X, TMP_X ); + TMSG1_Y = _mm_add_epi32( TMSG1_Y, TMP_Y ); + TMSG1_X = _mm_sha256msg2_epu32( TMSG1_X, TMSG0_X ); + TMSG1_Y = _mm_sha256msg2_epu32( TMSG1_Y, TMSG0_Y ); + MSG_X = _mm_shuffle_epi32( MSG_X, 0x0E ); + MSG_Y = _mm_shuffle_epi32( MSG_Y, 0x0E ); + STATE0_X = _mm_sha256rnds2_epu32( STATE0_X, STATE1_X, MSG_X ); + STATE0_Y = _mm_sha256rnds2_epu32( STATE0_Y, STATE1_Y, MSG_Y ); + TMSG3_X = _mm_sha256msg1_epu32( TMSG3_X, TMSG0_X ); + TMSG3_Y = _mm_sha256msg1_epu32( TMSG3_Y, TMSG0_Y ); + + // Rounds 52 to 55 + TMP_X = _mm_set_epi64x( 0x682E6FF35B9CCA4FULL, 0x4ED8AA4A391C0CB3ULL ); + MSG_X = _mm_add_epi32( TMSG1_X, TMP_X ); + MSG_Y = _mm_add_epi32( TMSG1_Y, TMP_X ); + STATE1_X = _mm_sha256rnds2_epu32( STATE1_X, STATE0_X, MSG_X ); + STATE1_Y = _mm_sha256rnds2_epu32( STATE1_Y, STATE0_Y, MSG_Y ); + TMP_X = _mm_alignr_epi8( TMSG1_X, TMSG0_X, 4 ); + TMP_Y = _mm_alignr_epi8( TMSG1_Y, TMSG0_Y, 4 ); + TMSG2_X = _mm_add_epi32( TMSG2_X, TMP_X ); + TMSG2_Y = _mm_add_epi32( TMSG2_Y, TMP_Y ); + TMSG2_X = _mm_sha256msg2_epu32( TMSG2_X, TMSG1_X ); + TMSG2_Y = _mm_sha256msg2_epu32( TMSG2_Y, TMSG1_Y ); + MSG_X = _mm_shuffle_epi32( MSG_X, 0x0E ); + MSG_Y = _mm_shuffle_epi32( MSG_Y, 0x0E ); + STATE0_X = _mm_sha256rnds2_epu32( STATE0_X, STATE1_X, MSG_X ); + STATE0_Y = _mm_sha256rnds2_epu32( STATE0_Y, STATE1_Y, MSG_Y ); + + // Rounds 56 to 59 + TMP_X = _mm_set_epi64x( 0x8CC7020884C87814ULL, 0x78A5636F748F82EEULL ); + MSG_X = _mm_add_epi32( TMSG2_X, TMP_X ); + MSG_Y = _mm_add_epi32( TMSG2_Y, TMP_X ); + STATE1_X = _mm_sha256rnds2_epu32( STATE1_X, STATE0_X, MSG_X ); + STATE1_Y = _mm_sha256rnds2_epu32( STATE1_Y, STATE0_Y, MSG_Y ); + TMP_X = _mm_alignr_epi8( TMSG2_X, TMSG1_X, 4 ); + TMP_Y = _mm_alignr_epi8( TMSG2_Y, TMSG1_Y, 4 ); + TMSG3_X = _mm_add_epi32( TMSG3_X, TMP_X ); + TMSG3_Y = _mm_add_epi32( TMSG3_Y, TMP_Y ); + TMSG3_X = _mm_sha256msg2_epu32( TMSG3_X, TMSG2_X ); + TMSG3_Y = _mm_sha256msg2_epu32( TMSG3_Y, TMSG2_Y ); + MSG_X = _mm_shuffle_epi32( MSG_X, 0x0E ); + MSG_Y = _mm_shuffle_epi32( MSG_Y, 0x0E ); + STATE0_X = _mm_sha256rnds2_epu32( STATE0_X, STATE1_X, MSG_X ); + STATE0_Y = _mm_sha256rnds2_epu32( STATE0_Y, 
STATE1_Y, MSG_Y ); + + // Rounds 60 to 63 + TMP_X = _mm_set_epi64x( 0xC67178F2BEF9A3F7ULL, 0xA4506CEB90BEFFFAULL ); + MSG_X = _mm_add_epi32( TMSG3_X, TMP_X ); + MSG_Y = _mm_add_epi32( TMSG3_Y, TMP_X ); + STATE1_X = _mm_sha256rnds2_epu32( STATE1_X, STATE0_X, MSG_X ); + STATE1_Y = _mm_sha256rnds2_epu32( STATE1_Y, STATE0_Y, MSG_Y ); + MSG_X = _mm_shuffle_epi32( MSG_X, 0x0E ); + MSG_Y = _mm_shuffle_epi32( MSG_Y, 0x0E ); + STATE0_X = _mm_sha256rnds2_epu32( STATE0_X, STATE1_X, MSG_X ); + STATE0_Y = _mm_sha256rnds2_epu32( STATE0_Y, STATE1_Y, MSG_Y ); + + // Add saved state to new state + STATE0_X = _mm_add_epi32( STATE0_X, casti_m128i( state_save_X, 0 ) ); + STATE1_X = _mm_add_epi32( STATE1_X, casti_m128i( state_save_X, 1 ) ); + STATE0_Y = _mm_add_epi32( STATE0_Y, casti_m128i( state_save_Y, 0 ) ); + STATE1_Y = _mm_add_epi32( STATE1_Y, casti_m128i( state_save_Y, 1 ) ); + + // Unshuffle & save state + TMP_X = _mm_shuffle_epi32( STATE0_X, 0x1B ); // FEBA + TMP_Y = _mm_shuffle_epi32( STATE0_Y, 0x1B ); + STATE1_X = _mm_shuffle_epi32( STATE1_X, 0xB1 ); // DCHG + STATE1_Y = _mm_shuffle_epi32( STATE1_Y, 0xB1 ); + casti_m128i( out_X, 0 ) = _mm_blend_epi16( TMP_X, STATE1_X, 0xF0 ); // DCBA + casti_m128i( out_Y, 0 ) = _mm_blend_epi16( TMP_Y, STATE1_Y, 0xF0 ); + casti_m128i( out_X, 1 ) = _mm_alignr_epi8( STATE1_X, TMP_X, 8 ); // ABEF + casti_m128i( out_Y, 1 ) = _mm_alignr_epi8( STATE1_Y, TMP_Y, 8 ); +} + +#endif + void sha256_ctx_init( sha256_context *ctx ) { diff --git a/algo/sha/sha256-hash.h b/algo/sha/sha256-hash.h index 410ca90f..763b405f 100644 --- a/algo/sha/sha256-hash.h +++ b/algo/sha/sha256-hash.h @@ -4,17 +4,18 @@ #include #include "simd-utils.h" #include "cpuminer-config.h" -#include "sph_sha2.h" - // generic interface -typedef struct { +typedef struct +{ unsigned char buf[64]; /* first field, for alignment */ uint32_t state[8]; uint64_t count; } sha256_context __attribute__((aligned(64))); +static const uint32_t SHA256_IV[8]; + void sha256_full( void *hash, const void *data, size_t len ); void sha256_update( sha256_context *ctx, const void *data, size_t len ); void sha256_final( sha256_context *ctx, void *hash ); @@ -41,20 +42,113 @@ void sha256_ni2way_transform_be( uint32_t *out_X, uint32_t*out_Y, const void *msg_X, const void *msg_Y, const uint32_t *in_X, const uint32_t *in_Y ); +void sha256_ni_prehash_3rounds( uint32_t *ostate, const void *msg, + uint32_t *sstate, const uint32_t *istate ); + +void sha256_ni2way_final_rounds( uint32_t *state_out_X, uint32_t *state_out_Y, + const void *msg_X, const void *msg_Y, + const uint32_t *state_mid_X, const uint32_t *state_mid_Y, + const uint32_t *state_save_X, const uint32_t *state_save_Y ); + // Select target // with SHA... #define sha256_transform_le sha256_opt_transform_le #define sha256_transform_be sha256_opt_transform_be #else - // without SHA... 
+#include "sph_sha2.h" + #define sha256_transform_le sph_sha256_transform_le #define sha256_transform_be sph_sha256_transform_be #endif -// SHA can't do only 3 rounds -#define sha256_prehash_3rounds sph_sha256_prehash_3rounds +#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) + +// SHA-256 16 way + +typedef struct +{ + __m512i buf[64>>2]; + __m512i val[8]; + uint32_t count_high, count_low; +} sha256_16way_context __attribute__ ((aligned (128))); + +void sha256_16way_init( sha256_16way_context *sc ); +void sha256_16way_update( sha256_16way_context *sc, const void *data, size_t len ); +void sha256_16way_close( sha256_16way_context *sc, void *dst ); +void sha256_16way_full( void *dst, const void *data, size_t len ); +void sha256_16way_transform_le( __m512i *state_out, const __m512i *data, + const __m512i *state_in ); +void sha256_16way_transform_be( __m512i *state_out, const __m512i *data, + const __m512i *state_in ); +void sha256_16way_prehash_3rounds( __m512i *state_mid, __m512i *X, + const __m512i *W, const __m512i *state_in ); +void sha256_16way_final_rounds( __m512i *state_out, const __m512i *data, + const __m512i *state_in, const __m512i *state_mid, const __m512i *X ); + +int sha256_16way_transform_le_short( __m512i *state_out, const __m512i *data, + const __m512i *state_in, const uint32_t *target ); + +#endif // AVX512 + +#if defined (__AVX2__) + +// SHA-256 8 way + +typedef struct +{ + __m256i buf[64>>2]; + __m256i val[8]; + uint32_t count_high, count_low; +} sha256_8way_context __attribute__ ((aligned (64))); + +void sha256_8way_init( sha256_8way_context *sc ); +void sha256_8way_update( sha256_8way_context *sc, const void *data, size_t len ); +void sha256_8way_close( sha256_8way_context *sc, void *dst ); +void sha256_8way_full( void *dst, const void *data, size_t len ); +void sha256_8way_transform_le( __m256i *state_out, const __m256i *data, + const __m256i *state_in ); +void sha256_8way_transform_be( __m256i *state_out, const __m256i *data, + const __m256i *state_in ); + +void sha256_8way_prehash_3rounds( __m256i *state_mid, __m256i *X, + const __m256i *W, const __m256i *state_in ); +void sha256_8way_final_rounds( __m256i *state_out, const __m256i *data, + const __m256i *state_in, const __m256i *state_mid, const __m256i *X ); +int sha256_8way_transform_le_short( __m256i *state_out, const __m256i *data, + const __m256i *state_in, const uint32_t *target ); + +#endif // AVX2 + +#if defined(__SSE2__) + +// SHA-256 4 way + +typedef struct +{ + __m128i buf[64>>2]; + __m128i val[8]; + uint32_t count_high, count_low; +} sha256_4way_context __attribute__ ((aligned (32))); + +void sha256_4way_init( sha256_4way_context *sc ); +void sha256_4way_update( sha256_4way_context *sc, const void *data, + size_t len ); +void sha256_4way_close( sha256_4way_context *sc, void *dst ); +void sha256_4way_full( void *dst, const void *data, size_t len ); +void sha256_4way_transform_le( __m128i *state_out, const __m128i *data, + const __m128i *state_in ); +void sha256_4way_transform_be( __m128i *state_out, const __m128i *data, + const __m128i *state_in ); +void sha256_4way_prehash_3rounds( __m128i *state_mid, __m128i *X, + const __m128i *W, const __m128i *state_in ); +void sha256_4way_final_rounds( __m128i *state_out, const __m128i *data, + const __m128i *state_in, const __m128i *state_mid, const __m128i *X ); +int sha256_4way_transform_le_short( __m128i *state_out, const __m128i *data, + const __m128i *state_in, const uint32_t *target ); + +#endif // SSE2 #endif 
diff --git a/algo/sha/sha256d-4way.c b/algo/sha/sha256d-4way.c index 24b588ec..b4a54c47 100644 --- a/algo/sha/sha256d-4way.c +++ b/algo/sha/sha256d-4way.c @@ -4,7 +4,6 @@ #include #include #include "sha256-hash.h" -#include "sha-hash-4way.h" static const uint32_t sha256_iv[8] __attribute__ ((aligned (32))) = { @@ -17,11 +16,15 @@ static const uint32_t sha256_iv[8] __attribute__ ((aligned (32))) = int scanhash_sha256d_sha( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ) { - uint32_t block0[16] __attribute__ ((aligned (64))); - uint32_t block1[16] __attribute__ ((aligned (64))); - uint32_t hash0[8] __attribute__ ((aligned (32))); - uint32_t hash1[8] __attribute__ ((aligned (32))); - uint32_t mstate[8] __attribute__ ((aligned (32))); + uint32_t block1a[16] __attribute__ ((aligned (64))); + uint32_t block1b[16] __attribute__ ((aligned (64))); + uint32_t block2a[16] __attribute__ ((aligned (64))); + uint32_t block2b[16] __attribute__ ((aligned (64))); + uint32_t hasha[8] __attribute__ ((aligned (32))); + uint32_t hashb[8] __attribute__ ((aligned (32))); + uint32_t mstatea[8] __attribute__ ((aligned (32))); + uint32_t mstateb[8] __attribute__ ((aligned (32))); + uint32_t sstate[8] __attribute__ ((aligned (32))); uint32_t *pdata = work->data; uint32_t *ptarget = work->target; const uint32_t first_nonce = pdata[19]; @@ -32,56 +35,60 @@ int scanhash_sha256d_sha( struct work *work, uint32_t max_nonce, const __m128i shuf_bswap32 = _mm_set_epi64x( 0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL ); - // hash first 64 bytes of data - sha256_opt_transform_le( mstate, pdata, sha256_iv ); + // hash first 64 byte block of data + sha256_opt_transform_le( mstatea, pdata, sha256_iv ); + + // fill & pad second block without nonce + memcpy( block1a, pdata + 16, 12 ); + memcpy( block1b, pdata + 16, 12 ); + block1a[ 3] = 0; + block1b[ 3] = 0; + block1a[ 4] = block1b[ 4] = 0x80000000; + memset( block1a + 5, 0, 40 ); + memset( block1b + 5, 0, 40 ); + block1a[15] = block1b[15] = 80*8; // bit count + + sha256_ni_prehash_3rounds( mstateb, block1a, sstate, mstatea ); + + // Pad third block + block2a[ 8] = block2b[ 8] = 0x80000000; + memset( block2a + 9, 0, 24 ); + memset( block2b + 9, 0, 24 ); + block2a[15] = block2b[15] = 32*8; // bit count do { - // 1. final 16 bytes of data, with padding - memcpy( block0, pdata + 16, 16 ); - memcpy( block1, pdata + 16, 16 ); - block0[ 3] = n; - block1[ 3] = n+1; - block0[ 4] = block1[ 4] = 0x80000000; - memset( block0 + 5, 0, 40 ); - memset( block1 + 5, 0, 40 ); - block0[15] = block1[15] = 80*8; // bit count - sha256_ni2way_transform_le( hash0, hash1, block0, block1, - mstate, mstate ); - - // 2. 32 byte hash from 1.
- memcpy( block0, hash0, 32 ); - memcpy( block1, hash1, 32 ); - block0[ 8] = block1[ 8] = 0x80000000; - memset( block0 + 9, 0, 24 ); - memset( block1 + 9, 0, 24 ); - block0[15] = block1[15] = 32*8; // bit count - sha256_ni2way_transform_le( hash0, hash1, block0, block1, + // Insert nonce for second block + block1a[3] = n; + block1b[3] = n+1; + sha256_ni2way_final_rounds( block2a, block2b, block1a, block1b, + mstateb, mstateb, sstate, sstate ); + + sha256_ni2way_transform_le( hasha, hashb, block2a, block2b, sha256_iv, sha256_iv ); - if ( unlikely( bswap_32( hash0[7] ) <= ptarget[7] ) ) + if ( unlikely( bswap_32( hasha[7] ) <= ptarget[7] ) ) { - casti_m128i( hash0, 0 ) = - _mm_shuffle_epi8( casti_m128i( hash0, 0 ), shuf_bswap32 ); - casti_m128i( hash0, 1 ) = - _mm_shuffle_epi8( casti_m128i( hash0, 1 ), shuf_bswap32 ); - if ( likely( valid_hash( hash0, ptarget ) && !bench ) ) + casti_m128i( hasha, 0 ) = + _mm_shuffle_epi8( casti_m128i( hasha, 0 ), shuf_bswap32 ); + casti_m128i( hasha, 1 ) = + _mm_shuffle_epi8( casti_m128i( hasha, 1 ), shuf_bswap32 ); + if ( likely( valid_hash( hasha, ptarget ) && !bench ) ) { pdata[19] = n; - submit_solution( work, hash0, mythr ); + submit_solution( work, hasha, mythr ); } } - - if ( unlikely( bswap_32( hash1[7] ) <= ptarget[7] ) ) + if ( unlikely( bswap_32( hashb[7] ) <= ptarget[7] ) ) { - casti_m128i( hash1, 0 ) = - _mm_shuffle_epi8( casti_m128i( hash1, 0 ), shuf_bswap32 ); - casti_m128i( hash1, 1 ) = - _mm_shuffle_epi8( casti_m128i( hash1, 1 ), shuf_bswap32 ); - if ( likely( valid_hash( hash1, ptarget ) && !bench ) ) + casti_m128i( hashb, 0 ) = + _mm_shuffle_epi8( casti_m128i( hashb, 0 ), shuf_bswap32 ); + casti_m128i( hashb, 1 ) = + _mm_shuffle_epi8( casti_m128i( hashb, 1 ), shuf_bswap32 ); + if ( likely( valid_hash( hashb, ptarget ) && !bench ) ) { pdata[19] = n+1; - submit_solution( work, hash1, mythr ); + submit_solution( work, hashb, mythr ); } } n += 2; @@ -99,18 +106,16 @@ int scanhash_sha256d_sha( struct work *work, uint32_t max_nonce, int scanhash_sha256d_16way( struct work *work, const uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ) { - __m512i hash32[8] __attribute__ ((aligned (128))); - __m512i block[16] __attribute__ ((aligned (64))); + __m512i block[16] __attribute__ ((aligned (128))); __m512i buf[16] __attribute__ ((aligned (64))); + __m512i hash32[8] __attribute__ ((aligned (64))); __m512i mstate1[8] __attribute__ ((aligned (64))); __m512i mstate2[8] __attribute__ ((aligned (64))); __m512i istate[8] __attribute__ ((aligned (64))); __m512i mexp_pre[8] __attribute__ ((aligned (64))); uint32_t phash[8] __attribute__ ((aligned (32))); uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; - uint32_t *hash32_d7 = (uint32_t*)&(hash32[7]); - const uint32_t targ32_d7 = ptarget[7]; + const uint32_t *ptarget = work->target; const uint32_t first_nonce = pdata[19]; const uint32_t last_nonce = max_nonce - 16; const __m512i last_byte = _mm512_set1_epi32( 0x80000000 ); @@ -134,7 +139,7 @@ int scanhash_sha256d_16way( struct work *work, const uint32_t max_nonce, mstate1[6] = _mm512_set1_epi32( phash[6] ); mstate1[7] = _mm512_set1_epi32( phash[7] ); - // second message block data, with nonce & padding + // second message block data, with nonce & padding buf[0] = _mm512_set1_epi32( pdata[16] ); buf[1] = _mm512_set1_epi32( pdata[17] ); buf[2] = _mm512_set1_epi32( pdata[18] ); @@ -142,12 +147,12 @@ int scanhash_sha256d_16way( struct work *work, const uint32_t max_nonce, n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n +1, n ); buf[4] = 
last_byte; memset_zero_512( buf+5, 10 ); - buf[15] = _mm512_set1_epi32( 80*8 ); // bit count + buf[15] = _mm512_set1_epi32( 80*8 ); // bit count // partially pre-expand & prehash second message block, avoiding the nonces sha256_16way_prehash_3rounds( mstate2, mexp_pre, buf, mstate1 ); - // vectorize IV for 2nd & 3rd sha256 + // vectorize IV for second hash istate[0] = _mm512_set1_epi32( sha256_iv[0] ); istate[1] = _mm512_set1_epi32( sha256_iv[1] ); istate[2] = _mm512_set1_epi32( sha256_iv[2] ); @@ -157,27 +162,26 @@ int scanhash_sha256d_16way( struct work *work, const uint32_t max_nonce, istate[6] = _mm512_set1_epi32( sha256_iv[6] ); istate[7] = _mm512_set1_epi32( sha256_iv[7] ); - // initialize padding for 2nd sha256 + // initialize padding for second hash block[ 8] = last_byte; - memset_zero_512( block + 9, 6 ); + memset_zero_512( block+9, 6 ); block[15] = _mm512_set1_epi32( 32*8 ); // bit count do { sha256_16way_final_rounds( block, buf, mstate1, mstate2, mexp_pre ); - - if ( sha256_16way_transform_le_short( hash32, block, istate, ptarget ) ) + if ( unlikely( sha256_16way_transform_le_short( + hash32, block, istate, ptarget ) ) ) { for ( int lane = 0; lane < 16; lane++ ) - if ( bswap_32( hash32_d7[ lane ] ) <= targ32_d7 ) { extr_lane_16x32( phash, hash32, lane, 256 ); casti_m256i( phash, 0 ) = - _mm256_shuffle_epi8( casti_m256i( phash, 0 ), bswap_shuf ); + _mm256_shuffle_epi8( casti_m256i( phash, 0 ), bswap_shuf ); if ( likely( valid_hash( phash, ptarget ) && !bench ) ) { - pdata[19] = n + lane; - submit_solution( work, phash, mythr ); + pdata[19] = n + lane; + submit_solution( work, phash, mythr ); } } } @@ -188,92 +192,7 @@ int scanhash_sha256d_16way( struct work *work, const uint32_t max_nonce, *hashes_done = n - first_nonce; return 0; } - - -/* -int scanhash_sha256d_16way( struct work *work, const uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ) -{ - __m512i vdata[32] __attribute__ ((aligned (128))); - __m512i block[16] __attribute__ ((aligned (64))); - __m512i hash32[8] __attribute__ ((aligned (64))); - __m512i initstate[8] __attribute__ ((aligned (64))); - __m512i midstate1[8] __attribute__ ((aligned (64))); - __m512i midstate2[8] __attribute__ ((aligned (64))); - __m512i mexp_pre[16] __attribute__ ((aligned (64))); - uint32_t lane_hash[8] __attribute__ ((aligned (64))); - uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] ); - uint32_t *pdata = work->data; - const uint32_t *ptarget = work->target; - const uint32_t targ32_d7 = ptarget[7]; - const uint32_t first_nonce = pdata[19]; - const uint32_t last_nonce = max_nonce - 16; - uint32_t n = first_nonce; - __m512i *noncev = vdata + 19; - const int thr_id = mythr->id; - const bool bench = opt_benchmark; - const __m512i last_byte = _mm512_set1_epi32( 0x80000000 ); - const __m512i sixteen = _mm512_set1_epi32( 16 ); - - for ( int i = 0; i < 19; i++ ) - vdata[i] = _mm512_set1_epi32( pdata[i] ); - - *noncev = _mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+9, n+8, - n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+1, n ); - - vdata[16+4] = last_byte; - memset_zero_512( vdata+16 + 5, 10 ); - vdata[16+15] = _mm512_set1_epi32( 80*8 ); // bit count - - block[ 8] = last_byte; - memset_zero_512( block + 9, 6 ); - block[15] = _mm512_set1_epi32( 32*8 ); // bit count - // initialize state - initstate[0] = _mm512_set1_epi64( 0x6A09E6676A09E667 ); - initstate[1] = _mm512_set1_epi64( 0xBB67AE85BB67AE85 ); - initstate[2] = _mm512_set1_epi64( 0x3C6EF3723C6EF372 ); - initstate[3] = _mm512_set1_epi64( 0xA54FF53AA54FF53A ); - initstate[4] = 
_mm512_set1_epi64( 0x510E527F510E527F ); - initstate[5] = _mm512_set1_epi64( 0x9B05688C9B05688C ); - initstate[6] = _mm512_set1_epi64( 0x1F83D9AB1F83D9AB ); - initstate[7] = _mm512_set1_epi64( 0x5BE0CD195BE0CD19 ); - - sha256_16way_transform_le( midstate1, vdata, initstate ); - - // Do 3 rounds on the first 12 bytes of the next block - sha256_16way_prehash_3rounds( midstate2, mexp_pre, vdata+16, midstate1 ); - - do - { - // 1. final 16 bytes of data, with padding - sha256_16way_final_rounds( block, vdata+16, midstate1, midstate2, - mexp_pre ); - - // 2. 32 byte hash from 1. - sha256_16way_transform_le( hash32, block, initstate ); - // byte swap final hash for testing - mm512_block_bswap_32( hash32, hash32 ); - - for ( int lane = 0; lane < 16; lane++ ) - if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) ) - { - extr_lane_16x32( lane_hash, hash32, lane, 256 ); - if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) ) - { - pdata[19] = n + lane; - submit_solution( work, lane_hash, mythr ); - } - } - *noncev = _mm512_add_epi32( *noncev, sixteen ); - n += 16; - } while ( (n < last_nonce) && !work_restart[thr_id].restart ); - pdata[19] = n; - *hashes_done = n - first_nonce; - return 0; -} -*/ - #endif #if defined(SHA256D_8WAY) @@ -284,15 +203,13 @@ int scanhash_sha256d_8way( struct work *work, const uint32_t max_nonce, __m256i vdata[32] __attribute__ ((aligned (64))); __m256i block[16] __attribute__ ((aligned (32))); __m256i hash32[8] __attribute__ ((aligned (32))); - __m256i initstate[8] __attribute__ ((aligned (32))); - __m256i midstate1[8] __attribute__ ((aligned (32))); - __m256i midstate2[8] __attribute__ ((aligned (32))); + __m256i istate[8] __attribute__ ((aligned (32))); + __m256i mstate1[8] __attribute__ ((aligned (32))); + __m256i mstate2[8] __attribute__ ((aligned (32))); __m256i mexp_pre[8] __attribute__ ((aligned (32))); uint32_t lane_hash[8] __attribute__ ((aligned (32))); - uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] ); uint32_t *pdata = work->data; const uint32_t *ptarget = work->target; - const uint32_t targ32_d7 = ptarget[7]; const uint32_t first_nonce = pdata[19]; const uint32_t last_nonce = max_nonce - 8; uint32_t n = first_nonce; @@ -301,6 +218,8 @@ int scanhash_sha256d_8way( struct work *work, const uint32_t max_nonce, const bool bench = opt_benchmark; const __m256i last_byte = _mm256_set1_epi32( 0x80000000 ); const __m256i eight = _mm256_set1_epi32( 8 ); + const __m256i bswap_shuf = mm256_bcast_m128( _mm_set_epi64x( + 0x0c0d0e0f08090a0b, 0x0405060700010203 ) ); for ( int i = 0; i < 19; i++ ) vdata[i] = _mm256_set1_epi32( pdata[i] ); @@ -309,50 +228,47 @@ int scanhash_sha256d_8way( struct work *work, const uint32_t max_nonce, vdata[16+4] = last_byte; memset_zero_256( vdata+16 + 5, 10 ); - vdata[16+15] = _mm256_set1_epi32( 80*8 ); // bit count + vdata[16+15] = _mm256_set1_epi32( 80*8 ); block[ 8] = last_byte; memset_zero_256( block + 9, 6 ); - block[15] = _mm256_set1_epi32( 32*8 ); // bit count - - // initialize state - initstate[0] = _mm256_set1_epi64x( 0x6A09E6676A09E667 ); - initstate[1] = _mm256_set1_epi64x( 0xBB67AE85BB67AE85 ); - initstate[2] = _mm256_set1_epi64x( 0x3C6EF3723C6EF372 ); - initstate[3] = _mm256_set1_epi64x( 0xA54FF53AA54FF53A ); - initstate[4] = _mm256_set1_epi64x( 0x510E527F510E527F ); - initstate[5] = _mm256_set1_epi64x( 0x9B05688C9B05688C ); - initstate[6] = _mm256_set1_epi64x( 0x1F83D9AB1F83D9AB ); - initstate[7] = _mm256_set1_epi64x( 0x5BE0CD195BE0CD19 ); - - sha256_8way_transform_le( midstate1, vdata, initstate ); + block[15] = 
_mm256_set1_epi32( 32*8 ); - // Do 3 rounds on the first 12 bytes of the next block - sha256_8way_prehash_3rounds( midstate2, mexp_pre, vdata + 16, midstate1 ); + // initialize state for second hash + istate[0] = _mm256_set1_epi32( sha256_iv[0] ); + istate[1] = _mm256_set1_epi32( sha256_iv[1] ); + istate[2] = _mm256_set1_epi32( sha256_iv[2] ); + istate[3] = _mm256_set1_epi32( sha256_iv[3] ); + istate[4] = _mm256_set1_epi32( sha256_iv[4] ); + istate[5] = _mm256_set1_epi32( sha256_iv[5] ); + istate[6] = _mm256_set1_epi32( sha256_iv[6] ); + istate[7] = _mm256_set1_epi32( sha256_iv[7] ); + + sha256_8way_transform_le( mstate1, vdata, istate ); + // Do 3 rounds on the first 12 bytes of the next block + sha256_8way_prehash_3rounds( mstate2, mexp_pre, vdata + 16, mstate1 ); + do { - // 1. final 16 bytes of data, with padding - sha256_8way_final_rounds( block, vdata+16, midstate1, midstate2, - mexp_pre ); - - // 2. 32 byte hash from 1. - sha256_8way_transform_le( hash32, block, initstate ); - // byte swap final hash for testing - mm256_block_bswap_32( hash32, hash32 ); - - for ( int lane = 0; lane < 8; lane++ ) - if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) ) + sha256_8way_final_rounds( block, vdata+16, mstate1, mstate2, mexp_pre ); + if ( unlikely( sha256_8way_transform_le_short( hash32, block, + istate, ptarget ) ) ) { - extr_lane_8x32( lane_hash, hash32, lane, 256 ); - if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) ) + for ( int lane = 0; lane < 8; lane++ ) { - pdata[19] = n + lane; - submit_solution( work, lane_hash, mythr ); + extr_lane_8x32( lane_hash, hash32, lane, 256 ); + casti_m256i( lane_hash, 0 ) = + _mm256_shuffle_epi8( casti_m256i( lane_hash, 0 ), bswap_shuf ); + if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) ) + { + pdata[19] = n + lane; + submit_solution( work, lane_hash, mythr ); + } } - } - *noncev = _mm256_add_epi32( *noncev, eight ); - n += 8; + } + *noncev = _mm256_add_epi32( *noncev, eight ); + n += 8; } while ( (n < last_nonce) && !work_restart[thr_id].restart ); pdata[19] = n; *hashes_done = n - first_nonce; @@ -366,12 +282,12 @@ int scanhash_sha256d_8way( struct work *work, const uint32_t max_nonce, int scanhash_sha256d_4way( struct work *work, const uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ) { - __m128i vdata[32] __attribute__ ((aligned (64))); - __m128i block[16] __attribute__ ((aligned (32))); - __m128i hash32[8] __attribute__ ((aligned (32))); - __m128i initstate[8] __attribute__ ((aligned (32))); - __m128i midstate1[8] __attribute__ ((aligned (32))); - uint32_t lane_hash[8] __attribute__ ((aligned (32))); + __m128i vdata[32] __attribute__ ((aligned (64))); + __m128i block[16] __attribute__ ((aligned (32))); + __m128i hash32[8] __attribute__ ((aligned (32))); + __m128i istate[8] __attribute__ ((aligned (32))); + __m128i mstate[8] __attribute__ ((aligned (32))); + uint32_t lane_hash[8] __attribute__ ((aligned (32))); uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] ); uint32_t *pdata = work->data; const uint32_t *ptarget = work->target; @@ -392,33 +308,30 @@ int scanhash_sha256d_4way( struct work *work, const uint32_t max_nonce, vdata[16+4] = last_byte; memset_zero_128( vdata+16 + 5, 10 ); - vdata[16+15] = _mm_set1_epi32( 80*8 ); // bit count + vdata[16+15] = _mm_set1_epi32( 80*8 ); block[ 8] = last_byte; memset_zero_128( block + 9, 6 ); - block[15] = _mm_set1_epi32( 32*8 ); // bit count - + block[15] = _mm_set1_epi32( 32*8 ); + // initialize state - initstate[0] = _mm_set1_epi64x( 0x6A09E6676A09E667 ); - initstate[1] = 
_mm_set1_epi64x( 0xBB67AE85BB67AE85 ); - initstate[2] = _mm_set1_epi64x( 0x3C6EF3723C6EF372 ); - initstate[3] = _mm_set1_epi64x( 0xA54FF53AA54FF53A ); - initstate[4] = _mm_set1_epi64x( 0x510E527F510E527F ); - initstate[5] = _mm_set1_epi64x( 0x9B05688C9B05688C ); - initstate[6] = _mm_set1_epi64x( 0x1F83D9AB1F83D9AB ); - initstate[7] = _mm_set1_epi64x( 0x5BE0CD195BE0CD19 ); + istate[0] = _mm_set1_epi32( sha256_iv[0] ); + istate[1] = _mm_set1_epi32( sha256_iv[1] ); + istate[2] = _mm_set1_epi32( sha256_iv[2] ); + istate[3] = _mm_set1_epi32( sha256_iv[3] ); + istate[4] = _mm_set1_epi32( sha256_iv[4] ); + istate[5] = _mm_set1_epi32( sha256_iv[5] ); + istate[6] = _mm_set1_epi32( sha256_iv[6] ); + istate[7] = _mm_set1_epi32( sha256_iv[7] ); // hash first 64 bytes of data - sha256_4way_transform_le( midstate1, vdata, initstate ); + sha256_4way_transform_le( mstate, vdata, istate ); do { - // 1. final 16 bytes of data, with padding - sha256_4way_transform_le( block, vdata+16, initstate ); + sha256_4way_transform_le( block, vdata+16, mstate ); + sha256_4way_transform_le( hash32, block, istate ); - // 2. 32 byte hash from 1. - sha256_4way_transform_le( hash32, block, initstate ); - // byte swap final hash for testing mm128_block_bswap_32( hash32, hash32 ); for ( int lane = 0; lane < 4; lane++ ) @@ -440,3 +353,5 @@ int scanhash_sha256d_4way( struct work *work, const uint32_t max_nonce, } #endif + + diff --git a/algo/sha/sha256dt.c b/algo/sha/sha256dt.c index ac339c86..e1703126 100644 --- a/algo/sha/sha256dt.c +++ b/algo/sha/sha256dt.c @@ -4,7 +4,6 @@ #include #include #include "sha256-hash.h" -#include "sha-hash-4way.h" #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) #define SHA256DT_16WAY 1 @@ -22,14 +21,104 @@ static const uint32_t sha256dt_iv[8] __attribute__ ((aligned (32))) = 0xaa3ff126, 0x475bbf30, 0x8fd52e5b, 0x9f75c9ad }; +#if defined(SHA256DT_SHA) + +int scanhash_sha256dt_sha( struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ) +{ + uint32_t block1a[16] __attribute__ ((aligned (64))); + uint32_t block1b[16] __attribute__ ((aligned (64))); + uint32_t block2a[16] __attribute__ ((aligned (64))); + uint32_t block2b[16] __attribute__ ((aligned (64))); + uint32_t hasha[8] __attribute__ ((aligned (32))); + uint32_t hashb[8] __attribute__ ((aligned (32))); + uint32_t mstatea[8] __attribute__ ((aligned (32))); + uint32_t mstateb[8] __attribute__ ((aligned (32))); + uint32_t sstate[8] __attribute__ ((aligned (32))); + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + const uint32_t first_nonce = pdata[19]; + const uint32_t last_nonce = max_nonce - 2; + uint32_t n = first_nonce; + const int thr_id = mythr->id; + const bool bench = opt_benchmark; + const __m128i shuf_bswap32 = + _mm_set_epi64x( 0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL ); + + // hash first 64 byte block of data + sha256_opt_transform_le( mstatea, pdata, sha256dt_iv ); + + // fill & pad second block without nonce + memcpy( block1a, pdata + 16, 12 ); + memcpy( block1b, pdata + 16, 12 ); + block1a[ 3] = 0; + block1b[ 3] = 0; + block1a[ 4] = block1b[ 4] = 0x80000000; + memset( block1a + 5, 0, 40 ); + memset( block1b + 5, 0, 40 ); + block1a[15] = block1b[15] = 0x480; // funky bit count + + sha256_ni_prehash_3rounds( mstateb, block1a, sstate, mstatea ); + + // Pad third block + block2a[ 8] = block2b[ 8] = 0x80000000; + memset( block2a + 9, 0, 24 ); + memset( block2b + 9, 0, 24 ); + block2a[15] = block2b[15] = 0x300; // bit count + + do +
{ + // Insert nonce for second block + block1a[3] = n; + block1b[3] = n+1; + sha256_ni2way_final_rounds( block2a, block2b, block1a, block1b, + mstateb, mstateb, sstate, sstate ); + + sha256_ni2way_transform_le( hasha, hashb, block2a, block2b, + sha256dt_iv, sha256dt_iv ); + + if ( unlikely( bswap_32( hasha[7] ) <= ptarget[7] ) ) + { + casti_m128i( hasha, 0 ) = + _mm_shuffle_epi8( casti_m128i( hasha, 0 ), shuf_bswap32 ); + casti_m128i( hasha, 1 ) = + _mm_shuffle_epi8( casti_m128i( hasha, 1 ), shuf_bswap32 ); + if ( likely( valid_hash( hasha, ptarget ) && !bench ) ) + { + pdata[19] = n; + submit_solution( work, hasha, mythr ); + } + } + if ( unlikely( bswap_32( hashb[7] ) <= ptarget[7] ) ) + { + casti_m128i( hashb, 0 ) = + _mm_shuffle_epi8( casti_m128i( hashb, 0 ), shuf_bswap32 ); + casti_m128i( hashb, 1 ) = + _mm_shuffle_epi8( casti_m128i( hashb, 1 ), shuf_bswap32 ); + if ( likely( valid_hash( hashb, ptarget ) && !bench ) ) + { + pdata[19] = n+1; + submit_solution( work, hashb, mythr ); + } + } + n += 2; + } while ( (n < last_nonce) && !work_restart[thr_id].restart ); + + pdata[19] = n; + *hashes_done = n - first_nonce; + return 0; +} + +#endif + #if defined(SHA256DT_16WAY) int scanhash_sha256dt_16way( struct work *work, const uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ) { - __m512i hash32[8] __attribute__ ((aligned (128))); - __m512i block[16] __attribute__ ((aligned (64))); + __m512i block[16] __attribute__ ((aligned (128))); __m512i buf[16] __attribute__ ((aligned (64))); + __m512i hash32[8] __attribute__ ((aligned (64))); __m512i mstate1[8] __attribute__ ((aligned (64))); __m512i mstate2[8] __attribute__ ((aligned (64))); __m512i istate[8] __attribute__ ((aligned (64))); @@ -37,8 +126,6 @@ int scanhash_sha256dt_16way( struct work *work, const uint32_t max_nonce, uint32_t phash[8] __attribute__ ((aligned (32))); uint32_t *pdata = work->data; const uint32_t *ptarget = work->target; -// uint32_t *hash32_d7 = (uint32_t*)&(hash32[7]); -// const uint32_t targ32_d7 = ptarget[7]; const uint32_t first_nonce = pdata[19]; const uint32_t last_nonce = max_nonce - 16; const __m512i last_byte = _mm512_set1_epi32( 0x80000000 ); @@ -75,7 +162,7 @@ int scanhash_sha256dt_16way( struct work *work, const uint32_t max_nonce, // partially pre-expand & prehash second message block, avoiding the nonces sha256_16way_prehash_3rounds( mstate2, mexp_pre, buf, mstate1 ); - // vectorize IV for 2nd sha256 + // vectorize IV for second hash istate[0] = _mm512_set1_epi32( sha256dt_iv[0] ); istate[1] = _mm512_set1_epi32( sha256dt_iv[1] ); istate[2] = _mm512_set1_epi32( sha256dt_iv[2] ); @@ -85,20 +172,18 @@ int scanhash_sha256dt_16way( struct work *work, const uint32_t max_nonce, istate[6] = _mm512_set1_epi32( sha256dt_iv[6] ); istate[7] = _mm512_set1_epi32( sha256dt_iv[7] ); - // initialize padding for 2nd sha256 + // initialize padding for second hash block[ 8] = last_byte; memset_zero_512( block+9, 6 ); block[15] = _mm512_set1_epi32( 0x300 ); // bit count do { - // finish second block with nonces sha256_16way_final_rounds( block, buf, mstate1, mstate2, mexp_pre ); if ( unlikely( sha256_16way_transform_le_short( hash32, block, istate, ptarget ) ) ) { for ( int lane = 0; lane < 16; lane++ ) -// if ( bswap_32( hash32_d7[ lane ] ) <= targ32_d7 ) { extr_lane_16x32( phash, hash32, lane, 256 ); casti_m256i( phash, 0 ) = @@ -118,86 +203,9 @@ int scanhash_sha256dt_16way( struct work *work, const uint32_t max_nonce, return 0; } -#elif defined(SHA256DT_SHA) - -int scanhash_sha256dt_sha( struct work *work, 
uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ) -{ - uint32_t block0[16] __attribute__ ((aligned (64))); - uint32_t block1[16] __attribute__ ((aligned (64))); - uint32_t hash0[8] __attribute__ ((aligned (32))); - uint32_t hash1[8] __attribute__ ((aligned (32))); - uint32_t mstate[8] __attribute__ ((aligned (32))); - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; - const uint32_t first_nonce = pdata[19]; - const uint32_t last_nonce = max_nonce - 2; - uint32_t n = first_nonce; - const int thr_id = mythr->id; - const bool bench = opt_benchmark; - const __m128i shuf_bswap32 = - _mm_set_epi64x( 0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL ); - - // hash first 64 bytes of data - sha256_opt_transform_le( mstate, pdata, sha256dt_iv ); - - do - { - // 1. final 16 bytes of data, with padding - memcpy( block0, pdata + 16, 16 ); - memcpy( block1, pdata + 16, 16 ); - block0[ 3] = n; - block1[ 3] = n+1; - block0[ 4] = block1[ 4] = 0x80000000; - memset( block0 + 5, 0, 40 ); - memset( block1 + 5, 0, 40 ); - block0[15] = block1[15] = 0x480; // funky bit count - sha256_ni2way_transform_le( hash0, hash1, block0, block1, - mstate, mstate ); - - // 2. 32 byte hash from 1. - memcpy( block0, hash0, 32 ); - memcpy( block1, hash1, 32 ); - block0[ 8] = block1[ 8] = 0x80000000; - memset( block0 + 9, 0, 24 ); - memset( block1 + 9, 0, 24 ); - block0[15] = block1[15] = 0x300; // bit count - sha256_ni2way_transform_le( hash0, hash1, block0, block1, - sha256dt_iv, sha256dt_iv ); - - if ( unlikely( bswap_32( hash0[7] ) <= ptarget[7] ) ) - { - casti_m128i( hash0, 0 ) = - _mm_shuffle_epi8( casti_m128i( hash0, 0 ), shuf_bswap32 ); - casti_m128i( hash0, 1 ) = - _mm_shuffle_epi8( casti_m128i( hash0, 1 ), shuf_bswap32 ); - if ( likely( valid_hash( hash0, ptarget ) && !bench ) ) - { - pdata[19] = n; - submit_solution( work, hash0, mythr ); - } - } - if ( unlikely( bswap_32( hash1[7] ) <= ptarget[7] ) ) - { - casti_m128i( hash1, 0 ) = - _mm_shuffle_epi8( casti_m128i( hash1, 0 ), shuf_bswap32 ); - casti_m128i( hash1, 1 ) = - _mm_shuffle_epi8( casti_m128i( hash1, 1 ), shuf_bswap32 ); - if ( likely( valid_hash( hash1, ptarget ) && !bench ) ) - { - pdata[19] = n+1; - submit_solution( work, hash1, mythr ); - } - } - n += 2; - } while ( (n < last_nonce) && !work_restart[thr_id].restart ); - - pdata[19] = n; - *hashes_done = n - first_nonce; - return 0; -} +#endif -#elif defined(SHA256DT_8WAY) +#if defined(SHA256DT_8WAY) int scanhash_sha256dt_8way( struct work *work, const uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ) @@ -236,7 +244,7 @@ int scanhash_sha256dt_8way( struct work *work, const uint32_t max_nonce, memset_zero_256( block + 9, 6 ); block[15] = _mm256_set1_epi32( 0x300 ); - // initialize state + // initialize state for second hash istate[0] = _mm256_set1_epi64x( 0xdfa9bf2cdfa9bf2c ); istate[1] = _mm256_set1_epi64x( 0xb72074d4b72074d4 ); istate[2] = _mm256_set1_epi64x( 0x6bb011226bb01122 ); @@ -253,11 +261,9 @@ int scanhash_sha256dt_8way( struct work *work, const uint32_t max_nonce, do { - sha256_8way_final_rounds( block, vdata+16, mstate1, mstate2, - mexp_pre ); - - if ( unlikely( sha256_8way_transform_le_short( - hash32, block, istate, ptarget ) ) ) + sha256_8way_final_rounds( block, vdata+16, mstate1, mstate2, mexp_pre ); + if ( unlikely( sha256_8way_transform_le_short( hash32, block, + istate, ptarget ) ) ) { for ( int lane = 0; lane < 8; lane++ ) { @@ -279,7 +285,9 @@ int scanhash_sha256dt_8way( struct work *work, const uint32_t max_nonce, return 0; } -#elif
defined(SHA256DT_4WAY) +#endif + +#if defined(SHA256DT_4WAY) int scanhash_sha256dt_4way( struct work *work, const uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ) diff --git a/algo/sha/sha256q-4way.c b/algo/sha/sha256q-4way.c index a57c80b3..0d07a396 100644 --- a/algo/sha/sha256q-4way.c +++ b/algo/sha/sha256q-4way.c @@ -3,7 +3,7 @@ #include #include #include -#include "sha-hash-4way.h" +#include "sha256-hash.h" #if defined(SHA256T_16WAY) diff --git a/algo/sha/sha256t-4way.c b/algo/sha/sha256t-4way.c index 6a54a116..411d6f58 100644 --- a/algo/sha/sha256t-4way.c +++ b/algo/sha/sha256t-4way.c @@ -4,7 +4,12 @@ #include #include #include "sha256-hash.h" -#include "sha-hash-4way.h" + + static const uint32_t sha256_iv[8] __attribute__ ((aligned (32))) = + { + 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A, + 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19 + }; #if defined(SHA256T_16WAY) @@ -19,11 +24,6 @@ int scanhash_sha256t_16way( struct work *work, const uint32_t max_nonce, __m512i istate[8] __attribute__ ((aligned (64))); __m512i mexp_pre[8] __attribute__ ((aligned (64))); uint32_t phash[8] __attribute__ ((aligned (32))); - static const uint32_t IV[8] __attribute__ ((aligned (32))) = - { - 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A, - 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19 - }; uint32_t *pdata = work->data; uint32_t *ptarget = work->target; uint32_t *hash32_d7 = (uint32_t*)&(hash32[7]); @@ -39,7 +39,7 @@ int scanhash_sha256t_16way( struct work *work, const uint32_t max_nonce, 0x0c0d0e0f08090a0b, 0x0405060700010203 ) ); // prehash first block directly from pdata - sha256_transform_le( phash, pdata, IV ); + sha256_transform_le( phash, pdata, sha256_iv ); // vectorize block 0 hash for second block mstate1[0] = _mm512_set1_epi32( phash[0] ); @@ -65,14 +65,14 @@ int scanhash_sha256t_16way( struct work *work, const uint32_t max_nonce, sha256_16way_prehash_3rounds( mstate2, mexp_pre, buf, mstate1 ); // vectorize IV for 2nd & 3rd sha256 - istate[0] = _mm512_set1_epi32( IV[0] ); - istate[1] = _mm512_set1_epi32( IV[1] ); - istate[2] = _mm512_set1_epi32( IV[2] ); - istate[3] = _mm512_set1_epi32( IV[3] ); - istate[4] = _mm512_set1_epi32( IV[4] ); - istate[5] = _mm512_set1_epi32( IV[5] ); - istate[6] = _mm512_set1_epi32( IV[6] ); - istate[7] = _mm512_set1_epi32( IV[7] ); + istate[0] = _mm512_set1_epi32( sha256_iv[0] ); + istate[1] = _mm512_set1_epi32( sha256_iv[1] ); + istate[2] = _mm512_set1_epi32( sha256_iv[2] ); + istate[3] = _mm512_set1_epi32( sha256_iv[3] ); + istate[4] = _mm512_set1_epi32( sha256_iv[4] ); + istate[5] = _mm512_set1_epi32( sha256_iv[5] ); + istate[6] = _mm512_set1_epi32( sha256_iv[6] ); + istate[7] = _mm512_set1_epi32( sha256_iv[7] ); // initialize padding for 2nd & 3rd sha256 block[ 8] = last_byte; @@ -110,6 +110,97 @@ int scanhash_sha256t_16way( struct work *work, const uint32_t max_nonce, #endif +#if defined(__SHA__) + +int scanhash_sha256t_sha( struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ) +{ + uint32_t block1a[16] __attribute__ ((aligned (64))); + uint32_t block1b[16] __attribute__ ((aligned (64))); + uint32_t block2a[16] __attribute__ ((aligned (64))); + uint32_t block2b[16] __attribute__ ((aligned (64))); + uint32_t hasha[8] __attribute__ ((aligned (32))); + uint32_t hashb[8] __attribute__ ((aligned (32))); + uint32_t mstatea[8] __attribute__ ((aligned (32))); + uint32_t mstateb[8] __attribute__ ((aligned (32))); + uint32_t sstate[8] __attribute__ ((aligned (32))); + uint32_t *pdata = work->data; + 
uint32_t *ptarget = work->target; + const uint32_t first_nonce = pdata[19]; + const uint32_t last_nonce = max_nonce - 2; + uint32_t n = first_nonce; + const int thr_id = mythr->id; + const bool bench = opt_benchmark; + const __m128i shuf_bswap32 = + _mm_set_epi64x( 0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL ); + + // hash first 64 byte block of data + sha256_opt_transform_le( mstatea, pdata, sha256_iv ); + + // fill & pad second block without nonce + memcpy( block1a, pdata + 16, 12 ); + memcpy( block1b, pdata + 16, 12 ); + block1a[ 3] = 0; + block1b[ 3] = 0; + block1a[ 4] = block1b[ 4] = 0x80000000; + memset( block1a + 5, 0, 40 ); + memset( block1b + 5, 0, 40 ); + block1a[15] = block1b[15] = 80*8; // bit count + + sha256_ni_prehash_3rounds( mstateb, block1a, sstate, mstatea ); + + // Pad third block + block2a[ 8] = block2b[ 8] = 0x80000000; + memset( block2a + 9, 0, 24 ); + memset( block2b + 9, 0, 24 ); + block2a[15] = block2b[15] = 32*8; // bit count + + do + { + // Insert nonce for second block + block1a[3] = n; + block1b[3] = n+1; + sha256_ni2way_final_rounds( block2a, block2b, block1a, block1b, + mstateb, mstateb, sstate, sstate ); + sha256_ni2way_transform_le( block2a, block2b, block2a, block2b, + sha256_iv, sha256_iv ); + sha256_ni2way_transform_le( hasha, hashb, block2a, block2b, + sha256_iv, sha256_iv ); + + if ( unlikely( bswap_32( hasha[7] ) <= ptarget[7] ) ) + { + casti_m128i( hasha, 0 ) = + _mm_shuffle_epi8( casti_m128i( hasha, 0 ), shuf_bswap32 ); + casti_m128i( hasha, 1 ) = + _mm_shuffle_epi8( casti_m128i( hasha, 1 ), shuf_bswap32 ); + if ( likely( valid_hash( hasha, ptarget ) && !bench ) ) + { + pdata[19] = n; + submit_solution( work, hasha, mythr ); + } + } + if ( unlikely( bswap_32( hashb[7] ) <= ptarget[7] ) ) + { + casti_m128i( hashb, 0 ) = + _mm_shuffle_epi8( casti_m128i( hashb, 0 ), shuf_bswap32 ); + casti_m128i( hashb, 1 ) = + _mm_shuffle_epi8( casti_m128i( hashb, 1 ), shuf_bswap32 ); + if ( likely( valid_hash( hashb, ptarget ) && !bench ) ) + { + pdata[19] = n+1; + submit_solution( work, hashb, mythr ); + } + } + n += 2; + } while ( (n < last_nonce) && !work_restart[thr_id].restart ); + + pdata[19] = n; + *hashes_done = n - first_nonce; + return 0; +} + +#endif + #if defined(SHA256T_8WAY) int scanhash_sha256t_8way( struct work *work, const uint32_t max_nonce, diff --git a/algo/sha/sha256t-gate.c b/algo/sha/sha256t-gate.c index e05c7060..e369f27b 100644 --- a/algo/sha/sha256t-gate.c +++ b/algo/sha/sha256t-gate.c @@ -5,9 +5,9 @@ bool register_sha256t_algo( algo_gate_t* gate ) gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT; #if defined(SHA256T_16WAY) gate->scanhash = (void*)&scanhash_sha256t_16way; -#elif defined(__SHA__) +#elif defined(SHA256T_SHA) gate->optimizations = SHA_OPT; - gate->scanhash = (void*)&scanhash_sha256t; + gate->scanhash = (void*)&scanhash_sha256t_sha; #elif defined(SHA256T_8WAY) gate->scanhash = (void*)&scanhash_sha256t_8way; #else @@ -22,7 +22,7 @@ bool register_sha256q_algo( algo_gate_t* gate ) #if defined(SHA256T_16WAY) gate->scanhash = (void*)&scanhash_sha256q_16way; gate->hash = (void*)&sha256q_16way_hash; -#elif defined(__SHA__) +#elif defined(SHA256T_SHA) gate->optimizations = SHA_OPT; gate->scanhash = (void*)&scanhash_sha256q; gate->hash = (void*)&sha256q_hash; diff --git a/algo/sha/sha256t-gate.h b/algo/sha/sha256t-gate.h index e74cfd1d..a20b3dd0 100644 --- a/algo/sha/sha256t-gate.h +++ b/algo/sha/sha256t-gate.h @@ -6,6 +6,8 @@ #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define SHA256T_16WAY 1 +#elif defined(__SHA__) + #define SHA256T_SHA 1 #elif defined(__AVX2__) #define SHA256T_8WAY 1 #else @@ -42,9 +44,9 @@ int scanhash_sha256q_4way( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ); #endif -#if defined(__SHA__) +#if defined(SHA256T_SHA) -int scanhash_sha256t( struct work *work, uint32_t max_nonce, +int scanhash_sha256t_sha( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ); #endif diff --git a/algo/sha/sha256t.c b/algo/sha/sha256t.c deleted file mode 100644 index 298b5f09..00000000 --- a/algo/sha/sha256t.c +++ /dev/null @@ -1,102 +0,0 @@ -#include "sha256t-gate.h" -#include -#include -#include -#include -//#include "algo/sha/sph_sha2.h" -#include "sha256-hash.h" - -#if defined(__SHA__) - -// Only used on CPUs with SHA - - -int scanhash_sha256t( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ) -{ - uint32_t block0[16] __attribute__ ((aligned (64))); - uint32_t block1[16] __attribute__ ((aligned (64))); - uint32_t hash0[8] __attribute__ ((aligned (32))); - uint32_t hash1[8] __attribute__ ((aligned (32))); - uint32_t initstate[8] __attribute__ ((aligned (32))); - uint32_t midstate[8] __attribute__ ((aligned (32))); - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; - const uint32_t first_nonce = pdata[19]; - const uint32_t last_nonce = max_nonce - 2; - uint32_t n = first_nonce; - const int thr_id = mythr->id; - const bool bench = opt_benchmark; - __m128i shuf_bswap32 = - _mm_set_epi64x( 0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL ); - - // initialize state - initstate[0] = 0x6A09E667; - initstate[1] = 0xBB67AE85; - initstate[2] = 0x3C6EF372; - initstate[3] = 0xA54FF53A; - initstate[4] = 0x510E527F; - initstate[5] = 0x9B05688C; - initstate[6] = 0x1F83D9AB; - initstate[7] = 0x5BE0CD19; - - // hash first 64 bytes of data - sha256_opt_transform_le( midstate, pdata, initstate ); - - do - { - // 1. final 16 bytes of data, with padding - memcpy( block0, pdata + 16, 16 ); - memcpy( block1, pdata + 16, 16 ); - block0[ 3] = n; - block1[ 3] = n+1; - block0[ 4] = block1[ 4] = 0x80000000; - memset( block0 + 5, 0, 40 ); - memset( block1 + 5, 0, 40 ); - block0[15] = block1[15] = 80*8; // bit count - sha256_ni2way_transform_le( hash0, hash1, block0, block1, midstate, midstate ); - - // 2. 32 byte hash from 1. - memcpy( block0, hash0, 32 ); - memcpy( block1, hash1, 32 ); - block0[ 8] = block1[ 8] = 0x80000000; - memset( block0 + 9, 0, 24 ); - memset( block1 + 9, 0, 24 ); - block0[15] = block1[15] = 32*8; // bit count - sha256_ni2way_transform_le( hash0, hash1, block0, block1, initstate, initstate ); - - // 3. 32 byte hash from 2. 
- memcpy( block0, hash0, 32 ); - memcpy( block1, hash1, 32 ); - sha256_ni2way_transform_le( hash0, hash1, block0, block1, initstate, initstate ); - - // byte swap final hash for testing - casti_m128i( hash0, 0 ) = - _mm_shuffle_epi8( casti_m128i( hash0, 0 ), shuf_bswap32 ); - casti_m128i( hash0, 1 ) = - _mm_shuffle_epi8( casti_m128i( hash0, 1 ), shuf_bswap32 ); - casti_m128i( hash1, 0 ) = - _mm_shuffle_epi8( casti_m128i( hash1, 0 ), shuf_bswap32 ); - casti_m128i( hash1, 1 ) = - _mm_shuffle_epi8( casti_m128i( hash1, 1 ), shuf_bswap32 ); - - if ( unlikely( valid_hash( hash0, ptarget ) && !bench ) ) - { - pdata[19] = n; - submit_solution( work, hash0, mythr ); - } - if ( unlikely( valid_hash( hash1, ptarget ) && !bench ) ) - { - pdata[19] = n+1; - submit_solution( work, hash1, mythr ); - } - n += 2; - } while ( (n < last_nonce) && !work_restart[thr_id].restart ); - - pdata[19] = n; - *hashes_done = n - first_nonce; - return 0; -} - -#endif - diff --git a/algo/sha/sha512-hash-4way.c b/algo/sha/sha512-hash-4way.c index 0cbd989c..cc4481bd 100644 --- a/algo/sha/sha512-hash-4way.c +++ b/algo/sha/sha512-hash-4way.c @@ -34,7 +34,7 @@ #include #include -#include "sha-hash-4way.h" +#include "sha512-hash.h" /* static const uit64_t H512[8] = diff --git a/algo/sha/sha512-hash.h b/algo/sha/sha512-hash.h new file mode 100644 index 00000000..58ef67c4 --- /dev/null +++ b/algo/sha/sha512-hash.h @@ -0,0 +1,46 @@ +#ifndef SHA512_HASH_H__ +#define SHA512_HASH_H__ 1 + +#include +#include "simd-utils.h" +#include "sph_sha2.h" + +#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) + +// SHA-512 8 way + +typedef struct { + __m512i buf[128>>3]; + __m512i val[8]; + uint64_t count; + bool initialized; +} sha512_8way_context __attribute__ ((aligned (128))); + +void sha512_8way_init( sha512_8way_context *sc); +void sha512_8way_update( sha512_8way_context *sc, const void *data, + size_t len ); +void sha512_8way_close( sha512_8way_context *sc, void *dst ); +void sha512_8way_full( void *dst, const void *data, size_t len ); + +#endif // AVX512 + +#if defined (__AVX2__) + +// SHA-512 4 way + +typedef struct { + __m256i buf[128>>3]; + __m256i val[8]; + uint64_t count; + bool initialized; +} sha512_4way_context __attribute__ ((aligned (64))); + +void sha512_4way_init( sha512_4way_context *sc); +void sha512_4way_update( sha512_4way_context *sc, const void *data, + size_t len ); +void sha512_4way_close( sha512_4way_context *sc, void *dst ); +void sha512_4way_full( void *dst, const void *data, size_t len ); + +#endif // AVX2 + +#endif diff --git a/algo/sha/sha512256d-4way.c b/algo/sha/sha512256d-4way.c index 68218c41..72129b07 100644 --- a/algo/sha/sha512256d-4way.c +++ b/algo/sha/sha512256d-4way.c @@ -1,5 +1,6 @@ #include "algo-gate-api.h" -#include "sha-hash-4way.h" +#include "sha256-hash.h" +#include "sha512-hash.h" #include #include diff --git a/algo/sha/sph_sha2.h b/algo/sha/sph_sha2.h index ab05423e..bd001960 100644 --- a/algo/sha/sph_sha2.h +++ b/algo/sha/sph_sha2.h @@ -41,7 +41,7 @@ #define SPH_SHA2_H__ #include -#include "sph_types.h" +#include "compat/sph_types.h" /** * Output size (in bits) for SHA-224. 
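
As a usage note for the new sha512-hash.h interface above: the 4-way and 8-way contexts follow the same init/update/close pattern as the other interleaved contexts in this codebase, with input and output in the interleaved lane layout (4x64 for the AVX2 variant). A minimal sketch, with hypothetical names and assuming AVX2:

#include <stddef.h>
#include "sha512-hash.h"

#if defined(__AVX2__)
// Hash four messages in parallel; 'data' holds the four inputs
// interleaved 64 bits at a time, 'len' is the per-lane length in bytes,
// and 'dst' receives the four interleaved 64-byte digests.
static void sha512_4way_sketch( void *dst, const void *data, size_t len )
{
   sha512_4way_context ctx;
   sha512_4way_init( &ctx );
   sha512_4way_update( &ctx, data, len );
   sha512_4way_close( &ctx, dst );
   // one-shot equivalent: sha512_4way_full( dst, data, len );
}
#endif
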
diff --git a/algo/shabal/shabal-hash-4way.c b/algo/shabal/shabal-hash-4way.c index 7cab8215..cd77fc6b 100644 --- a/algo/shabal/shabal-hash-4way.c +++ b/algo/shabal/shabal-hash-4way.c @@ -58,7 +58,7 @@ extern "C"{ M8, M9, MA, MB, MC, MD, ME, MF; \ const __m256i FIVE = _mm256_set1_epi32( 5 ); \ const __m256i THREE = _mm256_set1_epi32( 3 ); \ - sph_u32 Wlow, Whigh; + uint32_t Wlow, Whigh; #define READ_STATE8(state) do \ { \ @@ -653,7 +653,7 @@ shabal512_8way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) M8, M9, MA, MB, MC, MD, ME, MF; \ const __m128i FIVE = _mm_set1_epi32( 5 ); \ const __m128i THREE = _mm_set1_epi32( 3 ); \ - sph_u32 Wlow, Whigh; + uint32_t Wlow, Whigh; #define READ_STATE(state) do \ { \ diff --git a/algo/shabal/shabal-hash-4way.h b/algo/shabal/shabal-hash-4way.h index 550a3c6f..cec80fe7 100644 --- a/algo/shabal/shabal-hash-4way.h +++ b/algo/shabal/shabal-hash-4way.h @@ -1,51 +1,11 @@ -/* $Id: sph_shabal.h 175 2010-05-07 16:03:20Z tp $ */ -/** - * Shabal interface. Shabal is a family of functions which differ by - * their output size; this implementation defines Shabal for output - * sizes 192, 224, 256, 384 and 512 bits. - * - * ==========================(LICENSE BEGIN)============================ - * - * Copyright (c) 2007-2010 Projet RNRT SAPHIR - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- * - * ===========================(LICENSE END)============================= - * - * @file sph_shabal.h - * @author Thomas Pornin - */ - #ifndef SHABAL_HASH_4WAY_H__ #define SHABAL_HASH_4WAY_H__ 1 #ifdef __SSE4_1__ #include -#include "algo/sha/sph_types.h" #include "simd-utils.h" -#ifdef __cplusplus -extern "C"{ -#endif - #define SPH_SIZE_shabal256 256 #define SPH_SIZE_shabal512 512 @@ -55,7 +15,7 @@ extern "C"{ typedef struct { __m256i buf[16]; __m256i A[12], B[16], C[16]; - sph_u32 Whigh, Wlow; + uint32_t Whigh, Wlow; size_t ptr; bool state_loaded; } shabal_8way_context __attribute__ ((aligned (64))); @@ -80,7 +40,7 @@ void shabal512_8way_addbits_and_close( void *cc, unsigned ub, unsigned n, typedef struct { __m128i buf[16] __attribute__ ((aligned (64))); __m128i A[12], B[16], C[16]; - sph_u32 Whigh, Wlow; + uint32_t Whigh, Wlow; size_t ptr; bool state_loaded; } shabal_4way_context; @@ -100,10 +60,6 @@ void shabal512_4way_close( void *cc, void *dst ); void shabal512_4way_addbits_and_close( void *cc, unsigned ub, unsigned n, void *dst ); -#ifdef __cplusplus -} -#endif - #endif #endif diff --git a/algo/shabal/sph_shabal.h b/algo/shabal/sph_shabal.h index 4d230fb9..c743ee8b 100644 --- a/algo/shabal/sph_shabal.h +++ b/algo/shabal/sph_shabal.h @@ -37,7 +37,7 @@ #define SPH_SHABAL_H__ #include -#include "algo/sha/sph_types.h" +#include "compat/sph_types.h" #ifdef __cplusplus extern "C"{ #endif diff --git a/algo/shavite/shavite-hash-2way.c b/algo/shavite/shavite-hash-2way.c index b96a393a..26a9ab25 100644 --- a/algo/shavite/shavite-hash-2way.c +++ b/algo/shavite/shavite-hash-2way.c @@ -1,6 +1,4 @@ #include "shavite-hash-2way.h" -#include "algo/sha/sph_types.h" - #include // This is a fake, it actually does not do parallel AES, that requires VAES. diff --git a/algo/shavite/sph_shavite.c b/algo/shavite/sph_shavite.c index 3d7c8286..728a273a 100644 --- a/algo/shavite/sph_shavite.c +++ b/algo/shavite/sph_shavite.c @@ -64,7 +64,7 @@ extern "C"{ */ #define AES_BIG_ENDIAN 0 -#include "algo/sha/aes_helper.c" +#include "compat/aes_helper.c" static const sph_u32 IV224[] = { C32(0x6774F31C), C32(0x990AE210), C32(0xC87D4274), C32(0xC9546371), diff --git a/algo/shavite/sph_shavite.h b/algo/shavite/sph_shavite.h index f30f4dfb..c470e6db 100644 --- a/algo/shavite/sph_shavite.h +++ b/algo/shavite/sph_shavite.h @@ -39,7 +39,7 @@ #define SPH_SHAVITE_H__ #include -#include "algo/sha/sph_types.h" +#include "compat/sph_types.h" #ifdef __cplusplus extern "C"{ diff --git a/algo/simd/nist.h b/algo/simd/nist.h index b4737ffb..052bf71e 100644 --- a/algo/simd/nist.h +++ b/algo/simd/nist.h @@ -9,7 +9,7 @@ #endif #include "simd-compat.h" -#include "algo/sha/sha3-defs.h" +#include "compat/sha3-defs.h" /* * NIST API Specific types. */ diff --git a/algo/simd/simd-compat.h b/algo/simd/simd-compat.h index 721ab906..1c2b379a 100644 --- a/algo/simd/simd-compat.h +++ b/algo/simd/simd-compat.h @@ -24,7 +24,7 @@ */ #include -#include "algo/sha/brg_types.h" +#include "compat/brg_types.h" #define C32(x) ((u32)(x)) diff --git a/algo/simd/sph_simd.h b/algo/simd/sph_simd.h index 2c6b7bf1..3397c8c8 100644 --- a/algo/simd/sph_simd.h +++ b/algo/simd/sph_simd.h @@ -41,7 +41,7 @@ extern "C"{ #endif #include -#include "algo/sha/sph_types.h" +#include "compat/sph_types.h" /** * Output size (in bits) for SIMD-224. 
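The sph_u32 fields replaced above hold Shabal's block counter W, a 64-bit count
kept as two 32-bit halves; sph_u32 is an exact 32-bit unsigned type, so
uint32_t is a drop-in replacement and the carry between halves is still handled
explicitly. A hedged sketch of that logic (the helper name is illustrative, not
the macro actually used in the file):

#include <stdint.h>

// Shabal counts processed message blocks in a 64 bit counter split into
// two 32 bit words; the low word wraps and carries into the high word.
static inline void shabal_incr_w( uint32_t *Wlow, uint32_t *Whigh )
{
   *Wlow += 1;           // unsigned wraparound is well defined in C
   if ( *Wlow == 0 )     // carry out of the low half
      *Whigh += 1;
}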
diff --git a/algo/skein/skein-4way.c b/algo/skein/skein-4way.c index 4e781681..b66c6c14 100644 --- a/algo/skein/skein-4way.c +++ b/algo/skein/skein-4way.c @@ -2,7 +2,6 @@ #include #include #include "skein-hash-4way.h" -#include "algo/sha/sha-hash-4way.h" #include "algo/sha/sha256-hash.h" #if defined (SKEIN_8WAY) diff --git a/algo/skein/skein-gate.c b/algo/skein/skein-gate.c index 7adeac9f..191aa154 100644 --- a/algo/skein/skein-gate.c +++ b/algo/skein/skein-gate.c @@ -1,5 +1,4 @@ #include "skein-gate.h" -#include "sph_skein.h" #include "skein-hash-4way.h" bool register_skein_algo( algo_gate_t* gate ) diff --git a/algo/skein/sph_skein.h b/algo/skein/sph_skein.h index 2ba7e334..0b9ba5d0 100644 --- a/algo/skein/sph_skein.h +++ b/algo/skein/sph_skein.h @@ -46,7 +46,7 @@ extern "C"{ #endif #include -#include "algo/sha/sph_types.h" +#include "compat/sph_types.h" #if SPH_64 diff --git a/algo/tiger/sph_tiger.h b/algo/tiger/sph_tiger.h index 6461b475..8107c091 100644 --- a/algo/tiger/sph_tiger.h +++ b/algo/tiger/sph_tiger.h @@ -45,7 +45,7 @@ #define SPH_TIGER_H__ #include -#include "algo/sha/sph_types.h" +#include "compat/sph_types.h" #if SPH_64 diff --git a/algo/whirlpool/sph_whirlpool.h b/algo/whirlpool/sph_whirlpool.h index 801a9f92..10a21f36 100644 --- a/algo/whirlpool/sph_whirlpool.h +++ b/algo/whirlpool/sph_whirlpool.h @@ -49,7 +49,7 @@ #define SPH_WHIRLPOOL_H__ #include -#include "algo/sha/sph_types.h" +#include "compat/sph_types.h" #if SPH_64 diff --git a/algo/x11/x11-4way.c b/algo/x11/x11-4way.c index fada82f6..75e6c0db 100644 --- a/algo/x11/x11-4way.c +++ b/algo/x11/x11-4way.c @@ -65,6 +65,9 @@ void init_x11_8way_ctx() #endif } +static __thread __m512i x11_8way_midstate[16] __attribute__((aligned(64))); + + void x11_8way_hash( void *state, const void *input ) { uint64_t vhash[8*8] __attribute__ ((aligned (128))); @@ -80,8 +83,9 @@ void x11_8way_hash( void *state, const void *input ) uint64_t hash7[8] __attribute__ ((aligned (64))); x11_8way_ctx_holder ctx; memcpy( &ctx, &x11_8way_ctx, sizeof(x11_8way_ctx) ); - blake512_8way_update( &ctx.blake, input, 80 ); - blake512_8way_close( &ctx.blake, vhash ); + + blake512_8way_final_le( &ctx.blake, vhash, casti_m512i( input, 9 ), + x11_8way_midstate ); bmw512_8way_update( &ctx.bmw, vhash, 64 ); bmw512_8way_close( &ctx.bmw, vhash ); @@ -252,39 +256,45 @@ void x11_8way_hash( void *state, const void *input ) int scanhash_x11_8way( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ) { - uint32_t hash[8*8] __attribute__ ((aligned (128))); - uint32_t vdata[24*8] __attribute__ ((aligned (64))); - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; - uint32_t n = pdata[19]; - const uint32_t first_nonce = pdata[19]; - int thr_id = mythr->id; - __m512i *noncev = (__m512i*)vdata + 9; // aligned - const uint32_t Htarg = ptarget[7]; - - const uint32_t last_nonce = max_nonce -8; - mm512_bswap32_intrlv80_8x64( vdata, pdata ); - - do - { - *noncev = mm512_intrlv_blend_32( mm512_bswap_32( - _mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0, - n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev ); - - x11_8way_hash( hash, vdata ); - pdata[19] = n; - - for ( int i = 0; i < 8; i++ ) - if ( ( hash+(i<<3) )[7] <= Htarg - && fulltest( hash+(i<<3), ptarget ) && !opt_benchmark ) - { - pdata[19] = n+i; - submit_solution( work, hash+(i<<3), mythr ); - } - n += 8; - } while ( ( n < last_nonce ) && !work_restart[thr_id].restart ); - *hashes_done = n - first_nonce; - return 0; + uint32_t hash[8*8] __attribute__ ((aligned (128))); + uint32_t 
vdata[20*8] __attribute__ ((aligned (64))); + __m128i edata[5] __attribute__ ((aligned (64))); + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + uint32_t n = pdata[19]; + const uint32_t first_nonce = pdata[19]; + int thr_id = mythr->id; + __m512i *noncev = (__m512i*)vdata + 9; + const uint32_t last_nonce = max_nonce -8; + const __m512i eight = _mm512_set1_epi64( 8 ); + + // convert LE32 to LE64 + edata[0] = mm128_swap64_32( casti_m128i( pdata, 0 ) ); + edata[1] = mm128_swap64_32( casti_m128i( pdata, 1 ) ); + edata[2] = mm128_swap64_32( casti_m128i( pdata, 2 ) ); + edata[3] = mm128_swap64_32( casti_m128i( pdata, 3 ) ); + edata[4] = mm128_swap64_32( casti_m128i( pdata, 4 ) ); + + mm512_intrlv80_8x64( vdata, edata ); + *noncev = _mm512_add_epi32( *noncev, _mm512_set_epi32( + 0,7, 0,6, 0,5, 0,4, 0,3, 0,2, 0,1, 0,0 ) ); + blake512_8way_prehash_le( &x11_8way_ctx.blake, x11_8way_midstate, vdata ); + + do + { + x11_8way_hash( hash, vdata ); + + for ( int i = 0; i < 8; i++ ) + if ( unlikely( valid_hash( hash+(i<<3), ptarget ) && !opt_benchmark )) + { + pdata[19] = n+i; + submit_solution( work, hash+(i<<3), mythr ); + } + *noncev = _mm512_add_epi32( *noncev, eight ); + n += 8; + } while ( ( n < last_nonce ) && !work_restart[thr_id].restart ); + *hashes_done = n - first_nonce; + return 0; } diff --git a/algo/x16/x16r-gate.c b/algo/x16/x16r-gate.c index c498ff7a..19ba317f 100644 --- a/algo/x16/x16r-gate.c +++ b/algo/x16/x16r-gate.c @@ -263,7 +263,7 @@ bool register_hex_algo( algo_gate_t* gate ) gate->scanhash = (void*)&scanhash_hex; gate->hash = (void*)&x16r_hash; gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT; - gate->gen_merkle_root = (void*)&SHA256_gen_merkle_root; + gate->gen_merkle_root = (void*)&sha256_gen_merkle_root; opt_target_factor = 128.0; return true; }; diff --git a/algo/x16/x16r-gate.h b/algo/x16/x16r-gate.h index 76ca5e7e..be425c41 100644 --- a/algo/x16/x16r-gate.h +++ b/algo/x16/x16r-gate.h @@ -20,7 +20,7 @@ #include "algo/fugue/sph_fugue.h" #include "algo/shabal/sph_shabal.h" #include "algo/whirlpool/sph_whirlpool.h" -#include "algo/sha/sph_sha2.h" +#include "algo/sha/sha512-hash.h" #if defined(__AES__) #include "algo/echo/aes_ni/hash_api.h" @@ -42,7 +42,6 @@ #include "algo/echo/aes_ni/hash_api.h" #include "algo/hamsi/hamsi-hash-4way.h" #include "algo/shabal/shabal-hash-4way.h" -#include "algo/sha/sha-hash-4way.h" #if defined(__VAES__) #include "algo/groestl/groestl512-hash-4way.h" diff --git a/algo/x16/x21s-4way.c b/algo/x16/x21s-4way.c index 103bc636..74eddd52 100644 --- a/algo/x16/x21s-4way.c +++ b/algo/x16/x21s-4way.c @@ -12,9 +12,7 @@ #include "algo/tiger/sph_tiger.h" #include "algo/gost/sph_gost.h" #include "algo/lyra2/lyra2.h" -#if defined(__SHA__) - #include "algo/sha/sha256-hash.h" -#endif +#include "algo/sha/sha256-hash.h" #if defined (X21S_8WAY) diff --git a/algo/x17/sonoa-4way.c b/algo/x17/sonoa-4way.c index d29db77f..203cd2ac 100644 --- a/algo/x17/sonoa-4way.c +++ b/algo/x17/sonoa-4way.c @@ -20,7 +20,7 @@ #include "algo/shabal/shabal-hash-4way.h" #include "algo/whirlpool/sph_whirlpool.h" #include "algo/haval/haval-hash-4way.h" -#include "algo/sha/sha-hash-4way.h" +#include "algo/sha/sha512-hash.h" #if defined(__VAES__) #include "algo/groestl/groestl512-hash-4way.h" #include "algo/shavite/shavite-hash-4way.h" diff --git a/algo/x17/x17-4way.c b/algo/x17/x17-4way.c index e84bad42..5b6a7f73 100644 --- a/algo/x17/x17-4way.c +++ b/algo/x17/x17-4way.c @@ -25,7 +25,7 @@ #include "algo/shabal/shabal-hash-4way.h" #include 
"algo/whirlpool/sph_whirlpool.h" #include "algo/haval/haval-hash-4way.h" -#include "algo/sha/sha-hash-4way.h" +#include "algo/sha/sha512-hash.h" #if defined(X17_8WAY) @@ -37,7 +37,6 @@ union _x17_8way_context_overlay jh512_8way_context jh; keccak512_8way_context keccak; luffa_4way_context luffa; -// cube_4way_context cube; cube_4way_2buf_context cube; #if defined(__VAES__) groestl512_4way_context groestl; @@ -190,7 +189,6 @@ int x17_8way_hash( void *state, const void *input, int thr_id ) hamsi512_8way_init( &ctx.hamsi ); hamsi512_8way_update( &ctx.hamsi, vhash, 64 ); hamsi512_8way_close( &ctx.hamsi, vhash ); - dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, vhash ); diff --git a/algo/x17/xevan-4way.c b/algo/x17/xevan-4way.c index 54e3051c..3566e7fd 100644 --- a/algo/x17/xevan-4way.c +++ b/algo/x17/xevan-4way.c @@ -19,7 +19,7 @@ #include "algo/fugue/fugue-aesni.h" #include "algo/shabal/shabal-hash-4way.h" #include "algo/whirlpool/sph_whirlpool.h" -#include "algo/sha/sha-hash-4way.h" +#include "algo/sha/sha512-hash.h" #include "algo/haval/haval-hash-4way.h" #if defined(__VAES__) #include "algo/groestl/groestl512-hash-4way.h" diff --git a/algo/x22/x22i-4way.c b/algo/x22/x22i-4way.c index e94cb1c8..40e60217 100644 --- a/algo/x22/x22i-4way.c +++ b/algo/x22/x22i-4way.c @@ -16,7 +16,8 @@ #include "algo/fugue/fugue-aesni.h" #include "algo/shabal/shabal-hash-4way.h" #include "algo/whirlpool/sph_whirlpool.h" -#include "algo/sha/sha-hash-4way.h" +#include "algo/sha/sha512-hash.h" +#include "algo/sha/sha256-hash.h" #include "algo/haval/haval-hash-4way.h" #include "algo/tiger/sph_tiger.h" #include "algo/lyra2/lyra2.h" @@ -26,9 +27,6 @@ #include "algo/shavite/shavite-hash-4way.h" #include "algo/echo/echo-hash-4way.h" #endif -#if defined(__SHA__) - #include "algo/sha/sha256-hash.h" -#endif #if defined(X22I_8WAY) diff --git a/algo/x22/x25x-4way.c b/algo/x22/x25x-4way.c index 76191219..07445591 100644 --- a/algo/x22/x25x-4way.c +++ b/algo/x22/x25x-4way.c @@ -6,7 +6,8 @@ #include "algo/keccak/keccak-hash-4way.h" #include "algo/hamsi/hamsi-hash-4way.h" #include "algo/shabal/shabal-hash-4way.h" -#include "algo/sha/sha-hash-4way.h" +#include "algo/sha/sha512-hash.h" +#include "algo/sha/sha256-hash.h" #include "algo/haval/haval-hash-4way.h" #include "algo/blake/blake2s-hash-4way.h" #include "algo/echo/aes_ni/hash_api.h" @@ -31,9 +32,6 @@ #include "algo/shavite/shavite-hash-4way.h" #include "algo/echo/echo-hash-4way.h" #endif -#if defined(__SHA__) - #include "algo/sha/sha256-hash.h" -#endif void x25x_shuffle( void *hash ) { diff --git a/asm/aesb-x64.S b/asm/aesb-x64.S deleted file mode 100644 index 75c04200..00000000 --- a/asm/aesb-x64.S +++ /dev/null @@ -1,72 +0,0 @@ -#include - -#if defined(__linux__) && defined(__ELF__) - .section .note.GNU-stack,"",%progbits -#endif - - .text - .p2align 6 - .globl fast_aesb_single_round - .globl _fast_aesb_single_round -fast_aesb_single_round: -_fast_aesb_single_round: -#if defined(_WIN64) || defined(__CYGWIN__) - movdqa (%rcx), %xmm1 - aesenc (%r8), %xmm1 - movdqa %xmm1, (%rdx) -#else - movdqa (%rdi), %xmm1 - aesenc (%rdx), %xmm1 - movdqa %xmm1, (%rsi) -#endif - ret - - .text - .p2align 6 - .globl fast_aesb_pseudo_round_mut - .globl _fast_aesb_pseudo_round_mut -fast_aesb_pseudo_round_mut: -_fast_aesb_pseudo_round_mut: -#if defined(_WIN64) || defined(__CYGWIN__) - mov %rdx, %r9 - add $0xA0, %r9 - movdqa (%rcx), %xmm1 - - .LOOP: - aesenc (%rdx), %xmm1 - add $0x10, %rdx - cmp %r9, %rdx - jl .LOOP - - movdqa %xmm1, (%rcx) -#else - mov %rsi, %r9 - add 
$0xA0, %r9 - movdqa (%rdi), %xmm1 - - .LOOP: - aesenc (%rsi), %xmm1 - add $0x10, %rsi - cmp %r9, %rsi - jl .LOOP - - movdqa %xmm1, (%rdi) -#endif - ret - - .text - .globl mul128 - .globl _mul128 -mul128: -_mul128: -#if defined(_WIN64) || defined(__CYGWIN__) - mov %rcx, %rax - mul %rdx - mov %rdx, (%r8) -#else - mov %rdx, %r8 - mov %rdi, %rax - mul %rsi - mov %rdx, (%r8) -#endif - ret diff --git a/asm/aesb-x86.S b/asm/aesb-x86.S deleted file mode 100644 index ab3d1eab..00000000 --- a/asm/aesb-x86.S +++ /dev/null @@ -1,21 +0,0 @@ -#include - -#if defined(__linux__) && defined(__ELF__) - .section .note.GNU-stack,"",%progbits -#endif - - .text - .p2align 6 - .globl fast_aesb_single_round - .globl _fast_aesb_single_round -fast_aesb_single_round: -_fast_aesb_single_round: - ret - - .text - .p2align 6 - .globl fast_aesb_pseudo_round_mut - .globl _fast_aesb_pseudo_round_mut -fast_aesb_pseudo_round_mut: -_fast_aesb_pseudo_round_mut: - ret diff --git a/comp.log b/comp.log deleted file mode 100644 index 096dbf73..00000000 --- a/comp.log +++ /dev/null @@ -1,50 +0,0 @@ -make all-recursive -make[1]: Entering directory `/home/coin/devel/cpuminer/cpuminer/3.1.10/cpuminer-opt-3.1.10-dev' -Making all in compat -make[2]: Entering directory `/home/coin/devel/cpuminer/cpuminer/3.1.10/cpuminer-opt-3.1.10-dev/compat' -Making all in jansson -make[3]: Entering directory `/home/coin/devel/cpuminer/cpuminer/3.1.10/cpuminer-opt-3.1.10-dev/compat/jansson' -make[3]: Nothing to be done for `all'. -make[3]: Leaving directory `/home/coin/devel/cpuminer/cpuminer/3.1.10/cpuminer-opt-3.1.10-dev/compat/jansson' -make[3]: Entering directory `/home/coin/devel/cpuminer/cpuminer/3.1.10/cpuminer-opt-3.1.10-dev/compat' -make[3]: Nothing to be done for `all-am'. -make[3]: Leaving directory `/home/coin/devel/cpuminer/cpuminer/3.1.10/cpuminer-opt-3.1.10-dev/compat' -make[2]: Leaving directory `/home/coin/devel/cpuminer/cpuminer/3.1.10/cpuminer-opt-3.1.10-dev/compat' -make[2]: Entering directory `/home/coin/devel/cpuminer/cpuminer/3.1.10/cpuminer-opt-3.1.10-dev' -gcc -std=gnu99 -DHAVE_CONFIG_H -I. -Iyes/include -Iyes/include -fno-strict-aliasing -I./compat/jansson -I. -Iyes/include -Iyes/include -Wno-pointer-sign -Wno-pointer-to-int-cast -O3 -march=native -Iyes/include -Iyes/include -MT cpuminer-cpu-miner.o -MD -MP -MF .deps/cpuminer-cpu-miner.Tpo -c -o cpuminer-cpu-miner.o `test -f 'cpu-miner.c' || echo './'`cpu-miner.c -mv -f .deps/cpuminer-cpu-miner.Tpo .deps/cpuminer-cpu-miner.Po -gcc -std=gnu99 -DHAVE_CONFIG_H -I. -Iyes/include -Iyes/include -fno-strict-aliasing -I./compat/jansson -I. -Iyes/include -Iyes/include -Wno-pointer-sign -Wno-pointer-to-int-cast -O3 -march=native -Iyes/include -Iyes/include -MT cpuminer-util.o -MD -MP -MF .deps/cpuminer-util.Tpo -c -o cpuminer-util.o `test -f 'util.c' || echo './'`util.c -mv -f .deps/cpuminer-util.Tpo .deps/cpuminer-util.Po -gcc -std=gnu99 -DHAVE_CONFIG_H -I. -Iyes/include -Iyes/include -fno-strict-aliasing -I./compat/jansson -I. -Iyes/include -Iyes/include -Wno-pointer-sign -Wno-pointer-to-int-cast -O3 -march=native -Iyes/include -Iyes/include -MT cpuminer-algo-gate-api.o -MD -MP -MF .deps/cpuminer-algo-gate-api.Tpo -c -o cpuminer-algo-gate-api.o `test -f 'algo-gate-api.c' || echo './'`algo-gate-api.c -mv -f .deps/cpuminer-algo-gate-api.Tpo .deps/cpuminer-algo-gate-api.Po -gcc -std=gnu99 -DHAVE_CONFIG_H -I. -Iyes/include -Iyes/include -fno-strict-aliasing -I./compat/jansson -I. 
-Iyes/include -Iyes/include -Wno-pointer-sign -Wno-pointer-to-int-cast -O3 -march=native -Iyes/include -Iyes/include -MT algo/shavite/cpuminer-shavite.o -MD -MP -MF algo/shavite/.deps/cpuminer-shavite.Tpo -c -o algo/shavite/cpuminer-shavite.o `test -f 'algo/shavite/shavite.c' || echo './'`algo/shavite/shavite.c -mv -f algo/shavite/.deps/cpuminer-shavite.Tpo algo/shavite/.deps/cpuminer-shavite.Po -gcc -std=gnu99 -DHAVE_CONFIG_H -I. -Iyes/include -Iyes/include -fno-strict-aliasing -I./compat/jansson -I. -Iyes/include -Iyes/include -Wno-pointer-sign -Wno-pointer-to-int-cast -O3 -march=native -Iyes/include -Iyes/include -MT algo/keccak/cpuminer-keccak.o -MD -MP -MF algo/keccak/.deps/cpuminer-keccak.Tpo -c -o algo/keccak/cpuminer-keccak.o `test -f 'algo/keccak/keccak.c' || echo './'`algo/keccak/keccak.c -mv -f algo/keccak/.deps/cpuminer-keccak.Tpo algo/keccak/.deps/cpuminer-keccak.Po -gcc -std=gnu99 -DHAVE_CONFIG_H -I. -Iyes/include -Iyes/include -fno-strict-aliasing -I./compat/jansson -I. -Iyes/include -Iyes/include -Wno-pointer-sign -Wno-pointer-to-int-cast -O3 -march=native -Iyes/include -Iyes/include -MT algo/cpuminer-axiom.o -MD -MP -MF algo/.deps/cpuminer-axiom.Tpo -c -o algo/cpuminer-axiom.o `test -f 'algo/axiom.c' || echo './'`algo/axiom.c -mv -f algo/.deps/cpuminer-axiom.Tpo algo/.deps/cpuminer-axiom.Po -gcc -std=gnu99 -DHAVE_CONFIG_H -I. -Iyes/include -Iyes/include -fno-strict-aliasing -I./compat/jansson -I. -Iyes/include -Iyes/include -Wno-pointer-sign -Wno-pointer-to-int-cast -O3 -march=native -Iyes/include -Iyes/include -MT algo/blake/cpuminer-blake.o -MD -MP -MF algo/blake/.deps/cpuminer-blake.Tpo -c -o algo/blake/cpuminer-blake.o `test -f 'algo/blake/blake.c' || echo './'`algo/blake/blake.c -mv -f algo/blake/.deps/cpuminer-blake.Tpo algo/blake/.deps/cpuminer-blake.Po -gcc -std=gnu99 -DHAVE_CONFIG_H -I. -Iyes/include -Iyes/include -fno-strict-aliasing -I./compat/jansson -I. -Iyes/include -Iyes/include -Wno-pointer-sign -Wno-pointer-to-int-cast -O3 -march=native -Iyes/include -Iyes/include -MT algo/blake/cpuminer-blake2.o -MD -MP -MF algo/blake/.deps/cpuminer-blake2.Tpo -c -o algo/blake/cpuminer-blake2.o `test -f 'algo/blake/blake2.c' || echo './'`algo/blake/blake2.c -mv -f algo/blake/.deps/cpuminer-blake2.Tpo algo/blake/.deps/cpuminer-blake2.Po -gcc -std=gnu99 -DHAVE_CONFIG_H -I. -Iyes/include -Iyes/include -fno-strict-aliasing -I./compat/jansson -I. -Iyes/include -Iyes/include -Wno-pointer-sign -Wno-pointer-to-int-cast -O3 -march=native -Iyes/include -Iyes/include -MT algo/blake/cpuminer-blakecoin.o -MD -MP -MF algo/blake/.deps/cpuminer-blakecoin.Tpo -c -o algo/blake/cpuminer-blakecoin.o `test -f 'algo/blake/blakecoin.c' || echo './'`algo/blake/blakecoin.c -mv -f algo/blake/.deps/cpuminer-blakecoin.Tpo algo/blake/.deps/cpuminer-blakecoin.Po -gcc -std=gnu99 -DHAVE_CONFIG_H -I. -Iyes/include -Iyes/include -fno-strict-aliasing -I./compat/jansson -I. -Iyes/include -Iyes/include -Wno-pointer-sign -Wno-pointer-to-int-cast -O3 -march=native -Iyes/include -Iyes/include -MT algo/blake/cpuminer-decred.o -MD -MP -MF algo/blake/.deps/cpuminer-decred.Tpo -c -o algo/blake/cpuminer-decred.o `test -f 'algo/blake/decred.c' || echo './'`algo/blake/decred.c -mv -f algo/blake/.deps/cpuminer-decred.Tpo algo/blake/.deps/cpuminer-decred.Po -gcc -std=gnu99 -DHAVE_CONFIG_H -I. -Iyes/include -Iyes/include -fno-strict-aliasing -I./compat/jansson -I. 
-Iyes/include -Iyes/include -Wno-pointer-sign -Wno-pointer-to-int-cast -O3 -march=native -Iyes/include -Iyes/include -MT algo/blake/cpuminer-pentablake.o -MD -MP -MF algo/blake/.deps/cpuminer-pentablake.Tpo -c -o algo/blake/cpuminer-pentablake.o `test -f 'algo/blake/pentablake.c' || echo './'`algo/blake/pentablake.c -mv -f algo/blake/.deps/cpuminer-pentablake.Tpo algo/blake/.deps/cpuminer-pentablake.Po -gcc -std=gnu99 -DHAVE_CONFIG_H -I. -Iyes/include -Iyes/include -fno-strict-aliasing -I./compat/jansson -I. -Iyes/include -Iyes/include -Wno-pointer-sign -Wno-pointer-to-int-cast -O3 -march=native -Iyes/include -Iyes/include -MT algo/bmw/cpuminer-bmw256.o -MD -MP -MF algo/bmw/.deps/cpuminer-bmw256.Tpo -c -o algo/bmw/cpuminer-bmw256.o `test -f 'algo/bmw/bmw256.c' || echo './'`algo/bmw/bmw256.c -mv -f algo/bmw/.deps/cpuminer-bmw256.Tpo algo/bmw/.deps/cpuminer-bmw256.Po -gcc -std=gnu99 -DHAVE_CONFIG_H -I. -Iyes/include -Iyes/include -fno-strict-aliasing -I./compat/jansson -I. -Iyes/include -Iyes/include -Wno-pointer-sign -Wno-pointer-to-int-cast -O3 -march=native -Iyes/include -Iyes/include -MT algo/cpuminer-c11.o -MD -MP -MF algo/.deps/cpuminer-c11.Tpo -c -o algo/cpuminer-c11.o `test -f 'algo/c11.c' || echo './'`algo/c11.c -mv -f algo/.deps/cpuminer-c11.Tpo algo/.deps/cpuminer-c11.Po -gcc -std=gnu99 -DHAVE_CONFIG_H -I. -Iyes/include -Iyes/include -fno-strict-aliasing -I./compat/jansson -I. -Iyes/include -Iyes/include -Wno-pointer-sign -Wno-pointer-to-int-cast -O3 -march=native -Iyes/include -Iyes/include -MT algo/cpuminer-cryptolight.o -MD -MP -MF algo/.deps/cpuminer-cryptolight.Tpo -c -o algo/cpuminer-cryptolight.o `test -f 'algo/cryptolight.c' || echo './'`algo/cryptolight.c -mv -f algo/.deps/cpuminer-cryptolight.Tpo algo/.deps/cpuminer-cryptolight.Po -gcc -std=gnu99 -DHAVE_CONFIG_H -I. -Iyes/include -Iyes/include -fno-strict-aliasing -I./compat/jansson -I. -Iyes/include -Iyes/include -Wno-pointer-sign -Wno-pointer-to-int-cast -O3 -march=native -Iyes/include -Iyes/include -MT algo/cryptonight/cpuminer-cryptonight-common.o -MD -MP -MF algo/cryptonight/.deps/cpuminer-cryptonight-common.Tpo -c -o algo/cryptonight/cpuminer-cryptonight-common.o `test -f 'algo/cryptonight/cryptonight-common.c' || echo './'`algo/cryptonight/cryptonight-common.c -mv -f algo/cryptonight/.deps/cpuminer-cryptonight-common.Tpo algo/cryptonight/.deps/cpuminer-cryptonight-common.Po -gcc -std=gnu99 -DHAVE_CONFIG_H -I. -Iyes/include -Iyes/include -fno-strict-aliasing -I./compat/jansson -I. -Iyes/include -Iyes/include -Wno-pointer-sign -Wno-pointer-to-int-cast -O3 -march=native -Iyes/include -Iyes/include -MT algo/cpuminer-drop.o -MD -MP -MF algo/.deps/cpuminer-drop.Tpo -c -o algo/cpuminer-drop.o `test -f 'algo/drop.c' || echo './'`algo/drop.c -mv -f algo/.deps/cpuminer-drop.Tpo algo/.deps/cpuminer-drop.Po -gcc -std=gnu99 -DHAVE_CONFIG_H -I. -Iyes/include -Iyes/include -fno-strict-aliasing -I./compat/jansson -I. -Iyes/include -Iyes/include -Wno-pointer-sign -Wno-pointer-to-int-cast -O3 -march=native -Iyes/include -Iyes/include -MT algo/cpuminer-fresh.o -MD -MP -MF algo/.deps/cpuminer-fresh.Tpo -c -o algo/cpuminer-fresh.o `test -f 'algo/fresh.c' || echo './'`algo/fresh.c -mv -f algo/.deps/cpuminer-fresh.Tpo algo/.deps/cpuminer-fresh.Po -gcc -std=gnu99 -DHAVE_CONFIG_H -I. -Iyes/include -Iyes/include -fno-strict-aliasing -I./compat/jansson -I. 
-Iyes/include -Iyes/include -Wno-pointer-sign -Wno-pointer-to-int-cast -O3 -march=native -Iyes/include -Iyes/include -MT algo/groestl/cpuminer-groestl.o -MD -MP -MF algo/groestl/.deps/cpuminer-groestl.Tpo -c -o algo/groestl/cpuminer-groestl.o `test -f 'algo/groestl/groestl.c' || echo './'`algo/groestl/groestl.c -make[2]: Leaving directory `/home/coin/devel/cpuminer/cpuminer/3.1.10/cpuminer-opt-3.1.10-dev' -make[1]: Leaving directory `/home/coin/devel/cpuminer/cpuminer/3.1.10/cpuminer-opt-3.1.10-dev' diff --git a/algo/sha/aes_helper.c b/compat/aes_helper.c similarity index 98% rename from algo/sha/aes_helper.c rename to compat/aes_helper.c index 75b7cc69..30063440 100644 --- a/algo/sha/aes_helper.c +++ b/compat/aes_helper.c @@ -43,16 +43,15 @@ * @author Thomas Pornin */ -#include "sph_types.h" #ifdef __cplusplus extern "C"{ #endif #if AES_BIG_ENDIAN -#define AESx(x) ( ((SPH_C32(x) >> 24) & SPH_C32(0x000000FF)) \ - | ((SPH_C32(x) >> 8) & SPH_C32(0x0000FF00)) \ - | ((SPH_C32(x) << 8) & SPH_C32(0x00FF0000)) \ - | ((SPH_C32(x) << 24) & SPH_C32(0xFF000000))) +#define AESx(x) ( (((x) >> 24) & 0x000000FF) \ + | (((x) >> 8) & 0x0000FF00) \ + | (((x) << 8) & 0x00FF0000) \ + | (((x) << 24) & 0xFF000000)) #define AES0 AES0_BE #define AES1 AES1_BE @@ -83,7 +82,7 @@ extern "C"{ #else -#define AESx(x) SPH_C32(x) +#define AESx(x) (x) #define AES0 AES0_LE #define AES1 AES1_LE #define AES2 AES2_LE @@ -119,7 +118,7 @@ extern "C"{ * MixColumns for the column where that byte goes after ShiftRows. */ -static const sph_u32 AES0[256] = { +static const uint32_t AES0[256] = { AESx(0xA56363C6), AESx(0x847C7CF8), AESx(0x997777EE), AESx(0x8D7B7BF6), AESx(0x0DF2F2FF), AESx(0xBD6B6BD6), AESx(0xB16F6FDE), AESx(0x54C5C591), AESx(0x50303060), AESx(0x03010102), AESx(0xA96767CE), AESx(0x7D2B2B56), @@ -186,7 +185,7 @@ static const sph_u32 AES0[256] = { AESx(0xCBB0B07B), AESx(0xFC5454A8), AESx(0xD6BBBB6D), AESx(0x3A16162C) }; -static const sph_u32 AES1[256] = { +static const uint32_t AES1[256] = { AESx(0x6363C6A5), AESx(0x7C7CF884), AESx(0x7777EE99), AESx(0x7B7BF68D), AESx(0xF2F2FF0D), AESx(0x6B6BD6BD), AESx(0x6F6FDEB1), AESx(0xC5C59154), AESx(0x30306050), AESx(0x01010203), AESx(0x6767CEA9), AESx(0x2B2B567D), @@ -253,7 +252,7 @@ static const sph_u32 AES1[256] = { AESx(0xB0B07BCB), AESx(0x5454A8FC), AESx(0xBBBB6DD6), AESx(0x16162C3A) }; -static const sph_u32 AES2[256] = { +static const uint32_t AES2[256] = { AESx(0x63C6A563), AESx(0x7CF8847C), AESx(0x77EE9977), AESx(0x7BF68D7B), AESx(0xF2FF0DF2), AESx(0x6BD6BD6B), AESx(0x6FDEB16F), AESx(0xC59154C5), AESx(0x30605030), AESx(0x01020301), AESx(0x67CEA967), AESx(0x2B567D2B), @@ -320,7 +319,7 @@ static const sph_u32 AES2[256] = { AESx(0xB07BCBB0), AESx(0x54A8FC54), AESx(0xBB6DD6BB), AESx(0x162C3A16) }; -static const sph_u32 AES3[256] = { +static const uint32_t AES3[256] = { AESx(0xC6A56363), AESx(0xF8847C7C), AESx(0xEE997777), AESx(0xF68D7B7B), AESx(0xFF0DF2F2), AESx(0xD6BD6B6B), AESx(0xDEB16F6F), AESx(0x9154C5C5), AESx(0x60503030), AESx(0x02030101), AESx(0xCEA96767), AESx(0x567D2B2B), diff --git a/algo/sha/brg_types.h b/compat/brg_types.h similarity index 100% rename from algo/sha/brg_types.h rename to compat/brg_types.h diff --git a/algo/sha/sha3-defs.h b/compat/sha3-defs.h similarity index 100% rename from algo/sha/sha3-defs.h rename to compat/sha3-defs.h diff --git a/algo/sha/sha3_common.h b/compat/sha3_common.h similarity index 100% rename from algo/sha/sha3_common.h rename to compat/sha3_common.h diff --git a/algo/sha/sph_types.h b/compat/sph_types.h similarity index 100% 
rename from algo/sha/sph_types.h rename to compat/sph_types.h diff --git a/config-template.json b/config-template.json new file mode 100644 index 00000000..9f82214d --- /dev/null +++ b/config-template.json @@ -0,0 +1,22 @@ +{ + "_comment" : "Any long-format command line argument ", + "_comment" : "may be used in this JSON configuration file", + "_comment" : "Additional arguments may be added to the command line.", + "_comment" : "Usage: cpuminer -c myconfig.json [additional arguments]", + + "_comment" : "Required arguments, replace dummy values", + + "url" : "stratum+tcp://example.com:3333", + "user" : "read.pool.instructions", + "pass" : "x.often.works", + "algo" : "algo", + + "_comment" : "Often used optional arguments with default values selected.", + "_comment" : "Change values, add or delete arguments as desired.", + + "threads" : 0, + "cpu-affinity" : -1, + "api-bind" : "127.0.0.1:4048", + "benchmark" : false, + "quiet" : false +} diff --git a/configure b/configure index 2403ac96..2fd8e71b 100755 --- a/configure +++ b/configure @@ -1,6 +1,6 @@ #! /bin/sh # Guess values for system-dependent variables and create Makefiles. -# Generated by GNU Autoconf 2.71 for cpuminer-opt 3.23.1. +# Generated by GNU Autoconf 2.71 for cpuminer-opt 3.23.2. # # # Copyright (C) 1992-1996, 1998-2017, 2020-2021 Free Software Foundation, @@ -608,8 +608,8 @@ MAKEFLAGS= # Identity of this package. PACKAGE_NAME='cpuminer-opt' PACKAGE_TARNAME='cpuminer-opt' -PACKAGE_VERSION='3.23.1' -PACKAGE_STRING='cpuminer-opt 3.23.1' +PACKAGE_VERSION='3.23.2' +PACKAGE_STRING='cpuminer-opt 3.23.2' PACKAGE_BUGREPORT='' PACKAGE_URL='' @@ -1360,7 +1360,7 @@ if test "$ac_init_help" = "long"; then # Omit some internal or obsolete options to make the list less imposing. # This message is too long to be a string in the A/UX 3.1 sh. cat <<_ACEOF -\`configure' configures cpuminer-opt 3.23.1 to adapt to many kinds of systems. +\`configure' configures cpuminer-opt 3.23.2 to adapt to many kinds of systems. Usage: $0 [OPTION]... [VAR=VALUE]... @@ -1432,7 +1432,7 @@ fi if test -n "$ac_init_help"; then case $ac_init_help in - short | recursive ) echo "Configuration of cpuminer-opt 3.23.1:";; + short | recursive ) echo "Configuration of cpuminer-opt 3.23.2:";; esac cat <<\_ACEOF @@ -1538,7 +1538,7 @@ fi test -n "$ac_init_help" && exit $ac_status if $ac_init_version; then cat <<\_ACEOF -cpuminer-opt configure 3.23.1 +cpuminer-opt configure 3.23.2 generated by GNU Autoconf 2.71 Copyright (C) 2021 Free Software Foundation, Inc. @@ -1985,7 +1985,7 @@ cat >config.log <<_ACEOF This file contains any messages produced by compilers while running configure, to aid debugging if configure makes a mistake. -It was created by cpuminer-opt $as_me 3.23.1, which was +It was created by cpuminer-opt $as_me 3.23.2, which was generated by GNU Autoconf 2.71. Invocation command line was $ $0$ac_configure_args_raw @@ -3593,7 +3593,7 @@ fi # Define the identity of the package. PACKAGE='cpuminer-opt' - VERSION='3.23.1' + VERSION='3.23.2' printf "%s\n" "#define PACKAGE \"$PACKAGE\"" >>confdefs.h @@ -7508,7 +7508,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 # report actual input values of CONFIG_FILES etc. instead of their # values after options handling. ac_log=" -This file was extended by cpuminer-opt $as_me 3.23.1, which was +This file was extended by cpuminer-opt $as_me 3.23.2, which was generated by GNU Autoconf 2.71. 
Invocation command line was CONFIG_FILES = $CONFIG_FILES @@ -7576,7 +7576,7 @@ ac_cs_config_escaped=`printf "%s\n" "$ac_cs_config" | sed "s/^ //; s/'/'\\\\\\\\ cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 ac_cs_config='$ac_cs_config_escaped' ac_cs_version="\\ -cpuminer-opt config.status 3.23.1 +cpuminer-opt config.status 3.23.2 configured by $0, generated by GNU Autoconf 2.71, with options \\"\$ac_cs_config\\" diff --git a/configure.ac b/configure.ac index 8276943b..d6a28a5e 100644 --- a/configure.ac +++ b/configure.ac @@ -1,4 +1,4 @@ -AC_INIT([cpuminer-opt], [3.23.1]) +AC_INIT([cpuminer-opt], [3.23.2]) AC_PREREQ([2.59c]) AC_CANONICAL_SYSTEM diff --git a/configure~ b/configure~ index d69fd1f9..5e85cc30 100755 --- a/configure~ +++ b/configure~ @@ -1,6 +1,6 @@ #! /bin/sh # Guess values for system-dependent variables and create Makefiles. -# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.23.1. +# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.23.2. # # # Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc. @@ -577,8 +577,8 @@ MAKEFLAGS= # Identity of this package. PACKAGE_NAME='cpuminer-opt' PACKAGE_TARNAME='cpuminer-opt' -PACKAGE_VERSION='3.23.1' -PACKAGE_STRING='cpuminer-opt 3.23.1' +PACKAGE_VERSION='3.23.2' +PACKAGE_STRING='cpuminer-opt 3.23.2' PACKAGE_BUGREPORT='' PACKAGE_URL='' @@ -1332,7 +1332,7 @@ if test "$ac_init_help" = "long"; then # Omit some internal or obsolete options to make the list less imposing. # This message is too long to be a string in the A/UX 3.1 sh. cat <<_ACEOF -\`configure' configures cpuminer-opt 3.23.1 to adapt to many kinds of systems. +\`configure' configures cpuminer-opt 3.23.2 to adapt to many kinds of systems. Usage: $0 [OPTION]... [VAR=VALUE]... @@ -1404,7 +1404,7 @@ fi if test -n "$ac_init_help"; then case $ac_init_help in - short | recursive ) echo "Configuration of cpuminer-opt 3.23.1:";; + short | recursive ) echo "Configuration of cpuminer-opt 3.23.2:";; esac cat <<\_ACEOF @@ -1509,7 +1509,7 @@ fi test -n "$ac_init_help" && exit $ac_status if $ac_init_version; then cat <<\_ACEOF -cpuminer-opt configure 3.23.1 +cpuminer-opt configure 3.23.2 generated by GNU Autoconf 2.69 Copyright (C) 2012 Free Software Foundation, Inc. @@ -2012,7 +2012,7 @@ cat >config.log <<_ACEOF This file contains any messages produced by compilers while running configure, to aid debugging if configure makes a mistake. -It was created by cpuminer-opt $as_me 3.23.1, which was +It was created by cpuminer-opt $as_me 3.23.2, which was generated by GNU Autoconf 2.69. Invocation command line was $ $0 $@ @@ -2993,7 +2993,7 @@ fi # Define the identity of the package. PACKAGE='cpuminer-opt' - VERSION='3.23.1' + VERSION='3.23.2' cat >>confdefs.h <<_ACEOF @@ -6718,7 +6718,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 # report actual input values of CONFIG_FILES etc. instead of their # values after options handling. ac_log=" -This file was extended by cpuminer-opt $as_me 3.23.1, which was +This file was extended by cpuminer-opt $as_me 3.23.2, which was generated by GNU Autoconf 2.69. 
Invocation command line was CONFIG_FILES = $CONFIG_FILES @@ -6784,7 +6784,7 @@ _ACEOF cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`" ac_cs_version="\\ -cpuminer-opt config.status 3.23.1 +cpuminer-opt config.status 3.23.2 configured by $0, generated by GNU Autoconf 2.69, with options \\"\$ac_cs_config\\" diff --git a/cpu-miner.c b/cpu-miner.c index 37234ddb..e86e6a77 100644 --- a/cpu-miner.c +++ b/cpu-miner.c @@ -954,10 +954,10 @@ static inline void sprintf_et( char *str, long unsigned int seconds ) sprintf( str, "%lum%02lus", min, sec ); } -const long double exp32 = EXP32; // 2**32 -const long double exp48 = EXP32 * EXP16; // 2**48 -const long double exp64 = EXP32 * EXP32; // 2**64 -const long double exp96 = EXP32 * EXP32 * EXP32; // 2**96 +const long double exp32 = EXP32; // 2**32 +const long double exp48 = EXP32 * EXP16; // 2**48 +const long double exp64 = EXP32 * EXP32; // 2**64 +const long double exp96 = EXP32 * EXP32 * EXP32; // 2**96 const long double exp128 = EXP32 * EXP32 * EXP32 * EXP32; // 2**128 const long double exp160 = EXP32 * EXP32 * EXP32 * EXP32 * EXP16; // 2**160 @@ -1280,53 +1280,11 @@ static int share_result( int result, struct work *work, applog( LOG_INFO, "%d %s%s %s%s %s%s %s%s%s, %.3f sec (%dms)", my_stats.share_count, acol, ares, scol, sres, rcol, rres, bcol, bres, CL_N, share_time, latency ); - -/* - if ( unlikely( opt_debug || !result || solved ) ) - { - if ( have_stratum ) - applog2( LOG_INFO, "Diff %.5g, Block %d, Job %s", - my_stats.share_diff, my_stats.height, my_stats.job_id ); - else - applog2( LOG_INFO, "Diff %.5g, Block %d", - my_stats.share_diff, work ? work->height : last_block_height ); - } -*/ - if ( unlikely( !( opt_quiet || result || stale ) ) ) { -// uint32_t str[8]; -// uint32_t *targ; - - if ( reason ) applog2( LOG_MINR, "Reject reason: %s", reason ); - { - // The exact hash is not avaiable here, it's just an imprecise - // approximation calculated from the share difficulty. It's useless - // for anything other than low diff rejects. Until and unless a - // solution is implemented to make the hash and targets avaiable - // don't bother displaying them. In the meantime display the diff for - // low diff rejects. - - if ( strstr( reason, "difficulty" ) ) - applog2( LOG_MINR, "Share diff: %.5g, Target: %.5g", - my_stats.share_diff, my_stats.target_diff ); - -/* - diff_to_hash( str, my_stats.share_diff ); - applog2( LOG_INFO, "Hash: %08x%08x%08x%08x%08x%08x", str[7], str[6], - str[5], str[4], str[3],str[2], str[1], str[0] ); - - if ( work ) - targ = work->target; - else - { - diff_to_hash( str, my_stats.target_diff ); - targ = &str[0]; - } - applog2( LOG_INFO, "Target: %08x%08x%08x%08x%08x%08x", targ[7], targ[6], - targ[5], targ[4], targ[3], targ[2], targ[1], targ[0] ); -*/ - } + applog2( LOG_INFO, "Reject reason: %s", reason ? 
reason : "NULL" ); + applog2( LOG_INFO, "Share diff: %.5g, Target: %.5g", + my_stats.share_diff, my_stats.target_diff ); } return 1; } @@ -1986,6 +1944,7 @@ void sha256_gen_merkle_root( char* merkle_root, struct stratum_ctx* sctx ) sha256d( merkle_root, merkle_root, 64 ); } } +/* // OpenSSL single sha256, deprecated void SHA256_gen_merkle_root( char* merkle_root, struct stratum_ctx* sctx ) { @@ -1996,6 +1955,7 @@ void SHA256_gen_merkle_root( char* merkle_root, struct stratum_ctx* sctx ) sha256d( merkle_root, merkle_root, 64 ); } } +*/ // Default is do_nothing (assumed LE) void set_work_data_big_endian( struct work *work ) diff --git a/cpuminer-conf.json b/cpuminer-conf.json deleted file mode 100644 index d464f528..00000000 --- a/cpuminer-conf.json +++ /dev/null @@ -1,20 +0,0 @@ -{ - "_comment1" : "Any long-format command line argument ", - "_comment2" : "may be used in this JSON configuration file", - - "api-bind" : "127.0.0.1:4048", - - "url" : "stratum+tcp://mine.xpool.ca:1131", - "user" : "XeVrkPrWB7pDbdFLfKhF1Z3xpqhsx6wkH3", - "pass" : "cpuminer", - - "algo" : "x11", - "threads" : 0, - "cpu-priority" : 0, - "cpu-affinity" : -1, - - "benchmark" : false, - "debug" : false, - "protocol": false, - "quiet" : false -} diff --git a/simd-utils/simd-128.h b/simd-utils/simd-128.h index ad895256..bafcded5 100644 --- a/simd-utils/simd-128.h +++ b/simd-utils/simd-128.h @@ -43,6 +43,9 @@ typedef union } __attribute__ ((aligned (16))) m128_ovly; +#define v128_64(i) _mm_set1_epi64x(i) +#define v128_32(i) _mm_set1_epi32(i) + // Deprecated. AVX512 adds EVEX encoding (3rd operand) and other improvements // that make these functions either unnecessary or inefficient. // In cases where an explicit move betweeen GP & SIMD registers is still diff --git a/simd-utils/simd-256.h b/simd-utils/simd-256.h index d5425c8b..2f86a3f4 100644 --- a/simd-utils/simd-256.h +++ b/simd-utils/simd-256.h @@ -33,6 +33,10 @@ typedef union uint32_t u32[8]; } __attribute__ ((aligned (32))) m256_ovly; + +#define v256_64(i) _mm256_set1_epi64x(i) +#define v256_32(i) _mm256_set1_epi32(i) + // // Pointer casting diff --git a/simd-utils/simd-512.h b/simd-utils/simd-512.h index ebd7d764..7b902823 100644 --- a/simd-utils/simd-512.h +++ b/simd-utils/simd-512.h @@ -97,6 +97,9 @@ typedef union uint64_t u64[8]; } __attribute__ ((aligned (64))) m512_ovly; +#define v512_64(i) _mm512_set1_epi64(i) +#define v512_32(i) _mm512_set1_epi32(i) + // A simple 128 bit permute, using function instead of macro avoids // problems if the v arg passed as an expression. static inline __m512i mm512_perm_128( const __m512i v, const int c ) diff --git a/verthash-help.txt b/verthash-help.txt index f8e02db4..c7228950 100644 --- a/verthash-help.txt +++ b/verthash-help.txt @@ -64,8 +64,8 @@ then exit. --algo verthash --verify A data file will never be created if --data-file is specified. The miner -will exit with an error if the file is not found. This is to avoid accidentally -creating an unwanted data file due to a typo. +will exit with an error if the file is not found in the specified location. +This is to avoid accidentally creating an unwanted data file due to a typo. After creation the data file can moved to a more convenient location and referenced by --data-file, or left where it is and used by default without the