From 042d13d1e10395764c6607f9b038d588536d638a Mon Sep 17 00:00:00 2001
From: Jay D Dee
Date: Mon, 20 May 2024 23:08:50 -0400
Subject: [PATCH] v24.2

---
 RELEASE_NOTES | 6 +
 algo-gate-api.c | 2 +-
 algo-gate-api.h | 5 +-
 algo/argon2d/argon2d/opt.c | 4 +-
 algo/argon2d/blake2/blamka-round-opt.h | 2 +-
 algo/blake/blake256-hash.c | 4 +-
 algo/blake/blake256-hash.h | 2 +-
 algo/blake/blake2b-hash.c | 2 +-
 algo/blake/blake2b-hash.h | 2 +-
 algo/blake/blake2b.c | 2 +-
 algo/blake/blake2s-hash.c | 2 +-
 algo/blake/blake2s-hash.h | 34 +-
 algo/blake/blake2s.c | 2 +-
 algo/blake/blake512-hash.c | 50 +--
 algo/blake/blake512-hash.h | 2 +-
 algo/blake/blakecoin-gate.h | 2 +-
 algo/blake/sph_blake2b.c | 8 +-
 algo/bmw/bmw-hash-4way.h | 4 +-
 algo/bmw/bmw256-hash-4way.c | 2 +-
 algo/bmw/bmw512-gate.h | 2 +-
 algo/bmw/bmw512-hash-4way.c | 2 +-
 algo/cubehash/cube-hash-2way.c | 2 +-
 algo/cubehash/cube-hash-2way.h | 2 +-
 algo/cubehash/cubehash_sse2.c | 2 +-
 algo/echo/echo-hash-4way.c | 2 +-
 algo/echo/echo-hash-4way.h | 2 +-
 algo/gost/sph_gost.c | 2 +-
 algo/groestl/aes_ni/groestl-intr-aes.h | 2 +-
 algo/groestl/aes_ni/groestl256-intr-aes.h | 2 +-
 algo/groestl/groestl-gate.h | 2 +-
 algo/groestl/groestl256-hash-4way.c | 2 +-
 algo/groestl/groestl256-hash-4way.h | 2 +-
 algo/groestl/groestl256-intr-4way.h | 2 +-
 algo/groestl/groestl512-hash-4way.c | 2 +-
 algo/groestl/groestl512-hash-4way.h | 2 +-
 algo/groestl/groestl512-intr-4way.h | 2 +-
 algo/groestl/myrgr-gate.h | 2 +-
 algo/hamsi/hamsi-hash-4way.c | 6 +-
 algo/hamsi/hamsi-hash-4way.h | 2 +-
 algo/haval/haval-hash-4way.c | 6 +-
 algo/haval/haval-hash-4way.h | 2 +-
 algo/jh/jh-hash-4way.c | 7 +-
 algo/jh/jh-hash-4way.h | 2 +-
 algo/keccak/keccak-gate.h | 4 +-
 algo/keccak/keccak-hash-4way.c | 2 +-
 algo/keccak/keccak-hash-4way.h | 2 +-
 algo/luffa/luffa-hash-2way.c | 5 +-
 algo/luffa/luffa-hash-2way.h | 2 +-
 algo/luffa/luffa_for_sse2.c | 6 +-
 algo/luffa/luffa_for_sse2.h | 2 +-
 algo/lyra2/allium-4way.c | 2 +-
 algo/lyra2/lyra2-gate.h | 6 +-
 algo/lyra2/lyra2-hash-2way.c | 2 +-
 algo/lyra2/lyra2.h | 2 +-
 algo/lyra2/lyra2z-4way.c | 2 +-
 algo/lyra2/phi2-4way.c | 2 +-
 algo/lyra2/sponge-2way.c | 2 +-
 algo/lyra2/sponge.h | 12 +-
 algo/nist5/nist5-gate.h | 2 +-
 algo/panama/panama-hash-4way.c | 5 +-
 algo/quark/anime-gate.h | 2 +-
 algo/quark/hmq1725-gate.h | 2 +-
 algo/quark/quark-gate.h | 2 +-
 algo/qubit/qubit-gate.h | 2 +-
 algo/ripemd/lbry-gate.h | 2 +-
 algo/ripemd/ripemd-hash-4way.c | 10 +-
 algo/ripemd/ripemd-hash-4way.h | 2 +-
 algo/scrypt/scrypt-core-4way.c | 6 +-
 algo/scrypt/scrypt-core-4way.h | 2 +-
 algo/scrypt/scrypt.c | 6 +-
 algo/sha/hmac-sha256-hash-4way.c | 2 +-
 algo/sha/hmac-sha256-hash-4way.h | 2 +-
 algo/sha/sha256-hash-4way.c | 12 +-
 algo/sha/sha256-hash.c | 52 ++-
 algo/sha/sha256-hash.h | 2 +-
 algo/sha/sha256d-4way.h | 2 +-
 algo/sha/sha256d.h | 2 +-
 algo/sha/sha256dt.c | 2 +-
 algo/sha/sha256t-gate.h | 2 +-
 algo/sha/sha512-hash-4way.c | 383 ++++++++++++++--------
 algo/sha/sha512-hash.h | 2 +-
 algo/sha/sha512256d-4way.c | 2 +-
 algo/shabal/shabal-hash-4way.c | 2 +-
 algo/shabal/shabal-hash-4way.h | 2 +-
 algo/shavite/shavite-hash-2way.c | 3 +-
 algo/shavite/shavite-hash-4way.c | 2 +-
 algo/shavite/shavite-hash-4way.h | 4 +-
 algo/simd/simd-hash-2way.c | 5 +-
 algo/simd/simd-hash-2way.h | 2 +-
 algo/skein/skein-gate.h | 2 +-
 algo/skein/skein-hash-4way.c | 4 +-
 algo/skein/skein-hash-4way.h | 2 +-
 algo/swifftx/swifftx.c | 4 +-
 algo/verthash/tiny_sha3/sha3-4way.c | 2 +-
 algo/verthash/tiny_sha3/sha3-4way.h | 2 +-
 algo/x11/c11-gate.h | 2 +-
 algo/x11/tribus-gate.h | 2 +-
 algo/x11/x11-gate.h | 2 +-
algo/x11/x11gost-gate.h | 2 +- algo/x12/x12-gate.h | 2 +- algo/x13/phi1612-gate.h | 2 +- algo/x13/skunk-gate.h | 2 +- algo/x13/x13-gate.h | 2 +- algo/x13/x13sm3-gate.h | 2 +- algo/x14/x14-gate.h | 2 +- algo/x15/x15-gate.h | 2 +- algo/x16/x16r-gate.h | 8 +- algo/x16/x20r.c | 2 +- algo/x17/sonoa-gate.h | 2 +- algo/x17/x17-gate.h | 2 +- algo/x17/xevan-gate.h | 2 +- algo/x22/x22i-gate.c | 4 +- algo/x22/x22i-gate.h | 4 +- algo/yespower/yespower-blake2b.c | 2 +- algo/yespower/yespower-opt.c | 20 +- armbuild-all.sh | 35 +- build-allarch.sh | 31 +- clean-all.sh | 2 +- configure | 20 +- configure.ac | 2 +- configure~ | 20 +- cpu-miner.c | 51 ++- simd-utils.h | 37 +++ simd-utils/intrlv.h | 44 ++- simd-utils/simd-128.h | 34 +- simd-utils/simd-256.h | 41 ++- simd-utils/simd-512.h | 11 +- simd-utils/simd-neon.h | 153 +++++---- sysinfos.c | 29 +- 129 files changed, 835 insertions(+), 538 deletions(-) diff --git a/RELEASE_NOTES b/RELEASE_NOTES index 5927c158..feff95f0 100644 --- a/RELEASE_NOTES +++ b/RELEASE_NOTES @@ -75,6 +75,12 @@ If not what makes it happen or not happen? Change Log ---------- +v24.2 + +x86_64: Fixed blakes2s for AVX2 & AVX512, x25x for AVX512, broken in v3.23.4. +x86_64: Initial support for CPUs with AVX10, needs GCC-14. +ARM NEON: Various code optimisations. + v24.1 #414: fix bug in merkle error handling. diff --git a/algo-gate-api.c b/algo-gate-api.c index a456c873..2b057ac4 100644 --- a/algo-gate-api.c +++ b/algo-gate-api.c @@ -184,7 +184,7 @@ int scanhash_4way_64in_32out( struct work *work, uint32_t max_nonce, #endif -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) //int scanhash_8way_64_64( struct work *work, uint32_t max_nonce, // uint64_t *hashes_done, struct thr_info *mythr ) diff --git a/algo-gate-api.h b/algo-gate-api.h index e9ac10e0..3626a521 100644 --- a/algo-gate-api.h +++ b/algo-gate-api.h @@ -99,8 +99,11 @@ typedef uint32_t set_t; #define AES_OPT 1 << 7 // Intel Westmere, AArch64 #define VAES_OPT 1 << 8 // Icelake, Zen3 #define SHA_OPT 1 << 9 // Zen1, Icelake, AArch64 +#define SHA256_OPT 1 << 9 // Zen1, Icelake, AArch64 #define SHA512_OPT 1 << 10 // Intel Arrow Lake, AArch64 #define NEON_OPT 1 << 11 // AArch64 +#define AVX10_256 1 << 12 +#define AVX10_512 1 << 13 // AVX10 does not have explicit algo features: // AVX10_512 is compatible with AVX512 + VAES @@ -246,7 +249,7 @@ int scanhash_4way_64in_32out( struct work *work, uint32_t max_nonce, #endif -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) //int scanhash_8way_64in_64out( struct work *work, uint32_t max_nonce, // uint64_t *hashes_done, struct thr_info *mythr ); diff --git a/algo/argon2d/argon2d/opt.c b/algo/argon2d/argon2d/opt.c index a363a614..d32aebd0 100644 --- a/algo/argon2d/argon2d/opt.c +++ b/algo/argon2d/argon2d/opt.c @@ -35,7 +35,7 @@ * @pre all block pointers must be valid */ -#if defined(__AVX512F__) +#if defined(SIMD512) static inline __m512i blamka( __m512i x, __m512i y ) { @@ -237,7 +237,7 @@ void fill_segment(const argon2_instance_t *instance, uint64_t pseudo_rand, ref_index, ref_lane; uint32_t prev_offset, curr_offset; uint32_t starting_index, i; -#if defined(__AVX512F__) +#if defined(SIMD512) __m512i state[ARGON2_512BIT_WORDS_IN_BLOCK]; #elif defined(__AVX2__) __m256i state[ARGON2_HWORDS_IN_BLOCK]; diff --git a/algo/argon2d/blake2/blamka-round-opt.h b/algo/argon2d/blake2/blamka-round-opt.h index 3e0fc3c5..0b4cd786 100644 --- 
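The recurring change in the hunks above and below is the replacement of the long feature test `#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)` with a single `#if defined(SIMD512)`, alongside the new AVX10_256 / AVX10_512 algo-gate feature bits. The actual definition of SIMD512 lives in simd-utils.h, which this patch modifies but which is not shown in this excerpt. The following is only a sketch of what such a consolidation could look like; the AVX10 predefined-macro name follows GCC-14 conventions and is an assumption here.

/* Sketch only -- not the patch's simd-utils.h code. SIMD512 stands in for
 * "full 512-bit vector support", i.e. the old four-macro AVX512 test, and
 * (per the release note) could also be satisfied by an AVX10/512 target. */
#if ( defined(__AVX512F__) && defined(__AVX512VL__) && \
      defined(__AVX512DQ__) && defined(__AVX512BW__) ) \
    || defined(__AVX10_1_512__)   /* assumed GCC-14 macro for AVX10.1-512 */
  #define SIMD512 1
#endif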
a/algo/argon2d/blake2/blamka-round-opt.h +++ b/algo/argon2d/blake2/blamka-round-opt.h @@ -21,7 +21,7 @@ #include "blake2-impl.h" #include "simd-utils.h" -#if !defined(__AVX512F__) +#if !defined(SIMD512) #if !defined(__AVX2__) diff --git a/algo/blake/blake256-hash.c b/algo/blake/blake256-hash.c index e60376f5..066e8e7d 100644 --- a/algo/blake/blake256-hash.c +++ b/algo/blake/blake256-hash.c @@ -1611,7 +1611,7 @@ void blake256_8way_final_rounds_le( void *final_hash, const void *midstate, #endif -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) /////////////////////////////////////// // @@ -2617,7 +2617,7 @@ blake32_8way_close_le( blake_8way_small_context *sc, unsigned ub, unsigned n, #endif -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) //Blake-256 16 way AVX512 diff --git a/algo/blake/blake256-hash.h b/algo/blake/blake256-hash.h index 2408e540..6b53ef57 100644 --- a/algo/blake/blake256-hash.h +++ b/algo/blake/blake256-hash.h @@ -147,7 +147,7 @@ void blake256r8_8way_close(void *cc, void *dst); #define blake256r8_8x32_update blake256r14_8way_update #define blake256r8_8x32_close blake256r14_8way_close -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) /////////////////////////////////// // diff --git a/algo/blake/blake2b-hash.c b/algo/blake/blake2b-hash.c index eb4679a4..cf178375 100644 --- a/algo/blake/blake2b-hash.c +++ b/algo/blake/blake2b-hash.c @@ -226,7 +226,7 @@ static const uint8_t sigma[12][16] = #define Mx_(n) Mx__(n) #define Mx__(n) M ## n -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) #define B2B8W_G(a, b, c, d, x, y) \ { \ diff --git a/algo/blake/blake2b-hash.h b/algo/blake/blake2b-hash.h index 1256fb18..88f5b415 100644 --- a/algo/blake/blake2b-hash.h +++ b/algo/blake/blake2b-hash.h @@ -15,7 +15,7 @@ #endif -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) typedef struct ALIGN( 64 ) { __m512i b[16]; // input buffer diff --git a/algo/blake/blake2b.c b/algo/blake/blake2b.c index 7707366c..e2b62765 100644 --- a/algo/blake/blake2b.c +++ b/algo/blake/blake2b.c @@ -3,7 +3,7 @@ #include #include "blake2b-hash.h" -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) #define BLAKE2B_8WAY #elif defined(__AVX2__) #define BLAKE2B_4WAY diff --git a/algo/blake/blake2s-hash.c b/algo/blake/blake2s-hash.c index ae8030b0..ee895453 100644 --- a/algo/blake/blake2s-hash.c +++ b/algo/blake/blake2s-hash.c @@ -497,7 +497,7 @@ int blake2s_8way_full_blocks( blake2s_8way_state *S, void *out, #endif // __AVX2__ -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) // Blake2s-256 16 way diff --git a/algo/blake/blake2s-hash.h b/algo/blake/blake2s-hash.h index 2764a899..4e2d66c3 100644 --- a/algo/blake/blake2s-hash.h +++ b/algo/blake/blake2s-hash.h @@ -29,20 +29,20 @@ #define ALIGN(x) __attribute__((aligned(x))) #endif - typedef struct __blake2s_nway_param - { - uint8_t digest_length; // 1 - uint8_t key_length; // 2 - uint8_t fanout; // 3 - uint8_t depth; // 4 - uint32_t leaf_length; // 8 - uint8_t node_offset[6];// 14 - uint8_t node_depth; // 15 - uint8_t inner_length; // 16 - // uint8_t reserved[0]; - uint8_t 
salt[8]; // 24 - uint8_t personal[8]; // 32 - } blake2s_nway_param; +typedef struct __blake2s_nway_param +{ + uint8_t digest_length; // 1 + uint8_t key_length; // 2 + uint8_t fanout; // 3 + uint8_t depth; // 4 + uint32_t leaf_length; // 8 + uint8_t node_offset[6];// 14 + uint8_t node_depth; // 15 + uint8_t inner_length; // 16 + // uint8_t reserved[0]; + uint8_t salt[8]; // 24 + uint8_t personal[8]; // 32 +} blake2s_nway_param; typedef struct ALIGN( 64 ) __blake2s_4way_state { @@ -67,7 +67,7 @@ int blake2s_4way_full_blocks( blake2s_4way_state *S, void *out, typedef struct ALIGN( 64 ) __blake2s_8way_state { __m256i h[8]; - uint8_t buf[ 32 * 8 ]; + uint8_t buf[ 64 * 8 ]; uint32_t t[2]; uint32_t f[2]; size_t buflen; @@ -83,12 +83,12 @@ int blake2s_8way_full_blocks( blake2s_8way_state *S, void *out, #endif -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) typedef struct ALIGN( 64 ) __blake2s_16way_state { __m512i h[8]; - uint8_t buf[ 32 * 16 ]; + uint8_t buf[ 64 * 16 ]; uint32_t t[2]; uint32_t f[2]; size_t buflen; diff --git a/algo/blake/blake2s.c b/algo/blake/blake2s.c index dba7ffb6..011bad37 100644 --- a/algo/blake/blake2s.c +++ b/algo/blake/blake2s.c @@ -3,7 +3,7 @@ #include #include -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) #define BLAKE2S_16WAY #elif defined(__AVX2__) #define BLAKE2S_8WAY diff --git a/algo/blake/blake512-hash.c b/algo/blake/blake512-hash.c index 2ab7159e..73799e20 100644 --- a/algo/blake/blake512-hash.c +++ b/algo/blake/blake512-hash.c @@ -349,16 +349,16 @@ void blake512_transform( uint64_t *H, const uint64_t *buf, const uint64_t T0, Va = v128_add64( Va, v128_add64( Vb, \ v128_set64( CBx( r, Sd ) ^ Mx( r, Sc ), \ CBx( r, Sb ) ^ Mx( r, Sa ) ) ) ); \ - Vd = v128_ror64( v128_xor( Vd, Va ), 32 ); \ + Vd = v128_ror64xor( Vd, Va, 32 ); \ Vc = v128_add64( Vc, Vd ); \ - Vb = v128_ror64( v128_xor( Vb, Vc ), 25 ); \ + Vb = v128_ror64xor( Vb, Vc, 25 ); \ \ Va = v128_add64( Va, v128_add64( Vb, \ v128_set64( CBx( r, Sc ) ^ Mx( r, Sd ), \ CBx( r, Sa ) ^ Mx( r, Sb ) ) ) ); \ - Vd = v128_ror64( v128_xor( Vd, Va ), 16 ); \ + Vd = v128_ror64xor( Vd, Va, 16 ); \ Vc = v128_add64( Vc, Vd ); \ - Vb = v128_ror64( v128_xor( Vb, Vc ), 11 ); \ + Vb = v128_ror64xor( Vb, Vc, 11 ); \ } #define BLAKE512_ROUND( R ) \ @@ -559,7 +559,7 @@ void blake512_full( blake512_context *sc, void *dst, const void *data, #if defined(__AVX2__) -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) //////////////////////////////////// // @@ -1887,13 +1887,13 @@ blake512_4x64_close(void *cc, void *dst) #define GB_2X64( m0, m1, c0, c1, a, b, c, d ) \ { \ a = v128_add64( v128_add64( v128_xor( v128_64( c1 ), m0 ), b ), a ); \ - d = v128_ror64( v128_xor( d, a ), 32 ); \ + d = v128_ror64xor( d, a, 32 ); \ c = v128_add64( c, d ); \ - b = v128_ror64( v128_xor( b, c ), 25 ); \ + b = v128_ror64xor( b, c, 25 ); \ a = v128_add64( v128_add64( v128_xor( v128_64( c0 ), m1 ), b ), a ); \ - d = v128_ror64( v128_xor( d, a ), 16 ); \ + d = v128_ror64xor( d, a, 16 ); \ c = v128_add64( c, d ); \ - b = v128_ror64( v128_xor( b, c ), 11 ); \ + b = v128_ror64xor( b, c, 11 ); \ } #define ROUND_B_2X64(r) \ @@ -2054,9 +2054,9 @@ void blake512_2x64_prehash_part1_le( blake_2x64_big_context *sc, // G4 skip nonce V0 = v128_add64( v128_add64( v128_xor( v128_64( CB9 ), sc->buf[ 8] ), V5 ), V0 ); - VF = v128_ror64( v128_xor( VF, V0 ), 
32 ); + VF = v128_ror64xor( VF, V0, 32 ); VA = v128_add64( VA, VF ); - V5 = v128_ror64( v128_xor( V5, VA ), 25 ); + V5 = v128_ror64xor( V5, VA, 25 ); V0 = v128_add64( V0, V5 ); GB_2X64( sc->buf[10], sc->buf[11], CBA, CBB, V1, V6, VB, VC ); @@ -2137,9 +2137,9 @@ void blake512_2x64_prehash_part2_le( blake_2x64_big_context *sc, void *hash, // finish round 0, with the nonce now available V0 = v128_add64( V0, v128_xor( v128_64( CB8 ), M9 ) ); - VF = v128_ror64( v128_xor( VF, V0 ), 16 ); + VF = v128_ror64xor( VF, V0, 16 ); VA = v128_add64( VA, VF ); - V5 = v128_ror64( v128_xor( V5, VA ), 11 ); + V5 = v128_ror64xor( V5, VA, 11 ); // Round 1 // G0 @@ -2147,34 +2147,34 @@ void blake512_2x64_prehash_part2_le( blake_2x64_big_context *sc, void *hash, // G1 V1 = v128_add64( V1, V5 ); - VD = v128_ror64( v128_xor( VD, V1 ), 32 ); + VD = v128_ror64xor( VD, V1, 32 ); V9 = v128_add64( V9, VD ); - V5 = v128_ror64( v128_xor( V5, V9 ), 25 ); + V5 = v128_ror64xor( V5, V9, 25 ); V1 = v128_add64( V1, v128_add64( v128_xor( v128_64( CBx(1,2) ), Mx(1,3) ), V5 ) ); - VD = v128_ror64( v128_xor( VD, V1 ), 16 ); + VD = v128_ror64xor( VD, V1, 16 ); V9 = v128_add64( V9, VD ); - V5 = v128_ror64( v128_xor( V5, V9 ), 11 ); + V5 = v128_ror64xor( V5, V9, 11 ); // G2 V2 = v128_add64( V2, v128_xor( v128_64( CBF ), M9 ) ); - VE = v128_ror64( v128_xor( VE, V2 ), 32 ); + VE = v128_ror64xor( VE, V2, 32 ); VA = v128_add64( VA, VE ); - V6 = v128_ror64( v128_xor( V6, VA ), 25 ); + V6 = v128_ror64xor( V6, VA, 25 ); V2 = v128_add64( V2, v128_add64( v128_xor( v128_64( CB9 ), MF ), V6 ) ); - VE = v128_ror64( v128_xor( VE, V2 ), 16 ); + VE = v128_ror64xor( VE, V2, 16 ); VA = v128_add64( VA, VE ); - V6 = v128_ror64( v128_xor( V6, VA ), 11 ); + V6 = v128_ror64xor( V6, VA, 11 ); // G3 - VF = v128_ror64( v128_xor( VF, V3 ), 32 ); + VF = v128_ror64xor( VF, V3, 32 ); VB = v128_add64( VB, VF ); - V7 = v128_ror64( v128_xor( V7, VB ), 25 ); + V7 = v128_ror64xor( V7, VB, 25 ); V3 = v128_add64( V3, v128_add64( v128_xor( v128_64( CBx(1, 6) ), Mx(1, 7) ), V7 ) ); - VF = v128_ror64( v128_xor( VF, V3 ), 16 ); + VF = v128_ror64xor( VF, V3, 16 ); VB = v128_add64( VB, VF ); - V7 = v128_ror64( v128_xor( V7, VB ), 11 ); + V7 = v128_ror64xor( V7, VB, 11 ); // G4, G5, G6, G7 GB_2X64(Mx(1, 8), Mx(1, 9), CBx(1, 8), CBx(1, 9), V0, V5, VA, VF); diff --git a/algo/blake/blake512-hash.h b/algo/blake/blake512-hash.h index 32de52ae..12e401e4 100644 --- a/algo/blake/blake512-hash.h +++ b/algo/blake/blake512-hash.h @@ -92,7 +92,7 @@ void blake512_4x64_final_le( blake_4x64_big_context *sc, void *hash, #define blake512_4way_prehash_le blake512_4x64_prehash_le #define blake512_4way_final_le blake512_4x64_final_le -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) //////////////////////////// // diff --git a/algo/blake/blakecoin-gate.h b/algo/blake/blakecoin-gate.h index 1abeef19..73adfc0a 100644 --- a/algo/blake/blakecoin-gate.h +++ b/algo/blake/blakecoin-gate.h @@ -4,7 +4,7 @@ #include "algo-gate-api.h" #include -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) #define BLAKECOIN_16WAY #elif defined(__AVX2__) #define BLAKECOIN_8WAY diff --git a/algo/blake/sph_blake2b.c b/algo/blake/sph_blake2b.c index 40bcfa30..6ee4fe54 100644 --- a/algo/blake/sph_blake2b.c +++ b/algo/blake/sph_blake2b.c @@ -101,15 +101,15 @@ { \ Va = v128_add64( Va, v128_add64( Vb, \ v128_set64( m[ sigmaR[ Sc ] ], m[ sigmaR[ Sa ] ] ) ) ); \ - Vd = v128_ror64( 
v128_xor( Vd, Va ), 32 ); \ + Vd = v128_ror64xor( Vd, Va, 32 ); \ Vc = v128_add64( Vc, Vd ); \ - Vb = v128_ror64( v128_xor( Vb, Vc ), 24 ); \ + Vb = v128_ror64xor( Vb, Vc, 24 ); \ \ Va = v128_add64( Va, v128_add64( Vb, \ v128_set64( m[ sigmaR[ Sd ] ], m[ sigmaR[ Sb ] ] ) ) ); \ - Vd = v128_ror64( v128_xor( Vd, Va ), 16 ); \ + Vd = v128_ror64xor( Vd, Va, 16 ); \ Vc = v128_add64( Vc, Vd ); \ - Vb = v128_ror64( v128_xor( Vb, Vc ), 63 ); \ + Vb = v128_ror64xor( Vb, Vc, 63 ); \ } #define BLAKE2B_ROUND( R ) \ diff --git a/algo/bmw/bmw-hash-4way.h b/algo/bmw/bmw-hash-4way.h index ecba9e44..4ab3ab01 100644 --- a/algo/bmw/bmw-hash-4way.h +++ b/algo/bmw/bmw-hash-4way.h @@ -87,7 +87,7 @@ void bmw256_8way_close( bmw256_8way_context *ctx, void *dst ); #endif -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) // BMW-256 16 way 32 @@ -157,7 +157,7 @@ void bmw512_4way_addbits_and_close( #endif // __AVX2__ -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) // BMW-512 64 bit 8 way typedef struct diff --git a/algo/bmw/bmw256-hash-4way.c b/algo/bmw/bmw256-hash-4way.c index 38acd699..6392028b 100644 --- a/algo/bmw/bmw256-hash-4way.c +++ b/algo/bmw/bmw256-hash-4way.c @@ -1057,7 +1057,7 @@ void bmw256_8way_close( bmw256_8way_context *ctx, void *dst ) #endif // __AVX2__ -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) // BMW-256 16 way 32 diff --git a/algo/bmw/bmw512-gate.h b/algo/bmw/bmw512-gate.h index e7542cac..059ea595 100644 --- a/algo/bmw/bmw512-gate.h +++ b/algo/bmw/bmw512-gate.h @@ -4,7 +4,7 @@ #include "algo-gate-api.h" #include -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) #define BMW512_8WAY 1 #elif defined(__AVX2__) #define BMW512_4WAY 1 diff --git a/algo/bmw/bmw512-hash-4way.c b/algo/bmw/bmw512-hash-4way.c index 58e82387..e1645d38 100644 --- a/algo/bmw/bmw512-hash-4way.c +++ b/algo/bmw/bmw512-hash-4way.c @@ -950,7 +950,7 @@ bmw512_4way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) #endif // __AVX2__ -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) // BMW-512 8 WAY diff --git a/algo/cubehash/cube-hash-2way.c b/algo/cubehash/cube-hash-2way.c index 5888c2a4..4e12cdd6 100644 --- a/algo/cubehash/cube-hash-2way.c +++ b/algo/cubehash/cube-hash-2way.c @@ -26,7 +26,7 @@ static const uint64_t IV512[] = 0xA5A70E75D65C8A2B, 0xBC796576B1C62456, 0xE7989AF11921C8F7, 0xD43E3B447795D246 }; -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) // 4 way 128 is handy to avoid reinterleaving in many algos. 
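The Blake-512 and Blake2b G-function hunks above fold the former ror64( xor( a, b ), c ) pair into a single v128_ror64xor( a, b, c ) primitive. Its definition comes from the simd-utils headers this patch touches (simd-neon.h, simd-128.h) and is not shown here; the sketch below, under that assumption, shows why a dedicated primitive is worthwhile: AArch64 with the SHA3 extension has XAR, which performs the rotate of an XOR in one instruction, matching the "ARM NEON: Various code optimisations" line in the release notes.

/* Sketch only, assuming a NEON uint64x2_t vector type for the 128-bit lanes. */
#if defined(__ARM_NEON) && defined(__ARM_FEATURE_SHA3)
  /* XAR: ror64( a ^ b, c ) in a single instruction. */
  #define v128_ror64xor( a, b, c )   vxarq_u64( a, b, c )
#else
  /* Generic fallback, identical to the code being replaced above. */
  #define v128_ror64xor( a, b, c )   v128_ror64( v128_xor( a, b ), c )
#endif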
// If reinterleaving is necessary it may be more efficient to use diff --git a/algo/cubehash/cube-hash-2way.h b/algo/cubehash/cube-hash-2way.h index a31ffde0..3d2b0f0d 100644 --- a/algo/cubehash/cube-hash-2way.h +++ b/algo/cubehash/cube-hash-2way.h @@ -6,7 +6,7 @@ #if defined(__AVX2__) -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) struct _cube_4way_context { diff --git a/algo/cubehash/cubehash_sse2.c b/algo/cubehash/cubehash_sse2.c index 518a227d..6fab0ef2 100644 --- a/algo/cubehash/cubehash_sse2.c +++ b/algo/cubehash/cubehash_sse2.c @@ -13,7 +13,7 @@ static void transform( cubehashParam *sp ) int r; const int rounds = sp->rounds; -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) register __m512i x0, x1; diff --git a/algo/echo/echo-hash-4way.c b/algo/echo/echo-hash-4way.c index 7891ec52..6fa25500 100644 --- a/algo/echo/echo-hash-4way.c +++ b/algo/echo/echo-hash-4way.c @@ -11,7 +11,7 @@ static const unsigned int mul2ipt[] __attribute__ ((aligned (64))) = }; */ -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) #define ECHO_SUBBYTES4(state, j) \ state[0][j] = _mm512_aesenc_epi128( state[0][j], k1 ); \ diff --git a/algo/echo/echo-hash-4way.h b/algo/echo/echo-hash-4way.h index 58086859..e9e10919 100644 --- a/algo/echo/echo-hash-4way.h +++ b/algo/echo/echo-hash-4way.h @@ -5,7 +5,7 @@ #include "simd-utils.h" -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) typedef struct { diff --git a/algo/gost/sph_gost.c b/algo/gost/sph_gost.c index c3eb91d8..a67584bb 100644 --- a/algo/gost/sph_gost.c +++ b/algo/gost/sph_gost.c @@ -696,7 +696,7 @@ static void AddModulo512(const void *a,const void *b,void *c) static void AddXor512(const void *a,const void *b,void *c) { -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) casti_m512i( c, 0 ) = _mm512_xor_si512( casti_m512i( a, 0 ), casti_m512i( b, 0 ) ); #elif defined(__AVX2__) diff --git a/algo/groestl/aes_ni/groestl-intr-aes.h b/algo/groestl/aes_ni/groestl-intr-aes.h index 0c102ad8..963c56ef 100644 --- a/algo/groestl/aes_ni/groestl-intr-aes.h +++ b/algo/groestl/aes_ni/groestl-intr-aes.h @@ -103,7 +103,7 @@ static const v128u32_t gr_mask __attribute__ ((aligned (16))) = This implementation costs 7.7 c/b giving total speed on SNB: 10.7c/b. K. Matusiewicz, 2011/05/29 */ -#if defined(__AVX512VL__) +#if defined(VL256) #define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\ /* t_i = a_i + a_{i+1} */\ diff --git a/algo/groestl/aes_ni/groestl256-intr-aes.h b/algo/groestl/aes_ni/groestl256-intr-aes.h index 358ea104..cf5c41c3 100644 --- a/algo/groestl/aes_ni/groestl256-intr-aes.h +++ b/algo/groestl/aes_ni/groestl256-intr-aes.h @@ -95,7 +95,7 @@ static const v128u32_t gr_mask __attribute__ ((aligned (16))) = This implementation costs 7.7 c/b giving total speed on SNB: 10.7c/b. K. 
Matusiewicz, 2011/05/29 */ -#if defined(__AVX512VL__) +#if defined(VL256) #define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\ /* t_i = a_i + a_{i+1} */\ diff --git a/algo/groestl/groestl-gate.h b/algo/groestl/groestl-gate.h index 25551e60..0e00800e 100644 --- a/algo/groestl/groestl-gate.h +++ b/algo/groestl/groestl-gate.h @@ -4,7 +4,7 @@ #include "algo-gate-api.h" #include -#if defined(__VAES__) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(__VAES__) && defined(SIMD512) #define GROESTL_4WAY_VAES 1 #endif diff --git a/algo/groestl/groestl256-hash-4way.c b/algo/groestl/groestl256-hash-4way.c index 6d804cb4..30b55460 100644 --- a/algo/groestl/groestl256-hash-4way.c +++ b/algo/groestl/groestl256-hash-4way.c @@ -17,7 +17,7 @@ #if defined(__AVX2__) && defined(__VAES__) -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) int groestl256_4way_init( groestl256_4way_context* ctx, uint64_t hashlen ) diff --git a/algo/groestl/groestl256-hash-4way.h b/algo/groestl/groestl256-hash-4way.h index 1439ef18..7446d431 100644 --- a/algo/groestl/groestl256-hash-4way.h +++ b/algo/groestl/groestl256-hash-4way.h @@ -43,7 +43,7 @@ #define SIZE256 (SIZE_512/16) -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) typedef struct { __attribute__ ((aligned (128))) __m512i chaining[SIZE256]; diff --git a/algo/groestl/groestl256-intr-4way.h b/algo/groestl/groestl256-intr-4way.h index db724720..8e647ad1 100644 --- a/algo/groestl/groestl256-intr-4way.h +++ b/algo/groestl/groestl256-intr-4way.h @@ -42,7 +42,7 @@ static const __m128i round_const_l7[] __attribute__ ((aligned (64))) = { 0x0000000000000000, 0x8696a6b6c6d6e6f6 } }; -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) static const __m512i TRANSP_MASK = { 0x0d0509010c040800, 0x0f070b030e060a02, 0x1d1519111c141810, 0x1f171b131e161a12, diff --git a/algo/groestl/groestl512-hash-4way.c b/algo/groestl/groestl512-hash-4way.c index b52e02ca..5afd014a 100644 --- a/algo/groestl/groestl512-hash-4way.c +++ b/algo/groestl/groestl512-hash-4way.c @@ -17,7 +17,7 @@ #if defined(__AVX2__) && defined(__VAES__) -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) int groestl512_4way_init( groestl512_4way_context* ctx, uint64_t hashlen ) { diff --git a/algo/groestl/groestl512-hash-4way.h b/algo/groestl/groestl512-hash-4way.h index 99870c0c..68236fe8 100644 --- a/algo/groestl/groestl512-hash-4way.h +++ b/algo/groestl/groestl512-hash-4way.h @@ -33,7 +33,7 @@ #define SIZE512 (SIZE_1024/16) -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) typedef struct { __attribute__ ((aligned (128))) __m512i chaining[SIZE512]; diff --git a/algo/groestl/groestl512-intr-4way.h b/algo/groestl/groestl512-intr-4way.h index 33336a5c..99dfa895 100644 --- a/algo/groestl/groestl512-intr-4way.h +++ b/algo/groestl/groestl512-intr-4way.h @@ -50,7 +50,7 @@ static const __m128i round_const_q[] __attribute__ ((aligned (64))) = { 0x8292a2b2c2d2e2f2, 0x0212223242526272 } }; -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) static const __m512i TRANSP_MASK = { 0x0d0509010c040800, 0x0f070b030e060a02, 
0x1d1519111c141810, 0x1f171b131e161a12, diff --git a/algo/groestl/myrgr-gate.h b/algo/groestl/myrgr-gate.h index 80cc3fd8..477053b3 100644 --- a/algo/groestl/myrgr-gate.h +++ b/algo/groestl/myrgr-gate.h @@ -4,7 +4,7 @@ #include "algo-gate-api.h" #include -#if defined(__VAES__) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(__VAES__) && defined(SIMD512) #define MYRGR_8WAY 1 #elif defined(__AVX2__) && defined(__AES__) && !defined(__SHA__) #define MYRGR_4WAY 1 diff --git a/algo/hamsi/hamsi-hash-4way.c b/algo/hamsi/hamsi-hash-4way.c index b14b1281..f21f27bf 100644 --- a/algo/hamsi/hamsi-hash-4way.c +++ b/algo/hamsi/hamsi-hash-4way.c @@ -382,7 +382,7 @@ static const uint32_t T512[64][16] __attribute__ ((aligned (32))) = #define S1F MF -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) // Hamsi 8 way AVX512 @@ -1122,7 +1122,7 @@ void hamsi512_8way_close( hamsi_8way_big_context *sc, void *dst ) // Hamsi 4 way AVX2 -#if defined(__AVX512VL__) +#if defined(VL256) #define INPUT_BIG \ do { \ @@ -1501,7 +1501,7 @@ do { /* order is important */ \ sc->h[14] = CE; \ sc->h[15] = CF; -#if defined(__AVX512VL__) +#if defined(VL256) #define INPUT_8X32 \ { \ diff --git a/algo/hamsi/hamsi-hash-4way.h b/algo/hamsi/hamsi-hash-4way.h index 4850ca12..15d964ea 100644 --- a/algo/hamsi/hamsi-hash-4way.h +++ b/algo/hamsi/hamsi-hash-4way.h @@ -104,7 +104,7 @@ void hamsi512_8x32_full( hamsi512_8x32_context *sc, void *dst, const void *data, #endif -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) // Hamsi-512 8x64 diff --git a/algo/haval/haval-hash-4way.c b/algo/haval/haval-hash-4way.c index bf1fca37..651124ab 100644 --- a/algo/haval/haval-hash-4way.c +++ b/algo/haval/haval-hash-4way.c @@ -53,7 +53,7 @@ extern "C"{ #define SPH_SMALL_FOOTPRINT_HAVAL 1 //#endif -#if defined(__AVX512VL__) +#if defined(VL256) // ( ~( a ^ b ) ) & c #define v128_andnotxor( a, b, c ) \ @@ -583,7 +583,7 @@ do { \ // Haval-256 8 way 32 bit avx2 -#if defined (__AVX512VL__) +#if defined (VL256) // ( ~( a ^ b ) ) & c #define mm256_andnotxor( a, b, c ) \ @@ -882,7 +882,7 @@ do { \ #endif // AVX2 -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) // ( ~( a ^ b ) ) & c #define mm512_andnotxor( a, b, c ) \ diff --git a/algo/haval/haval-hash-4way.h b/algo/haval/haval-hash-4way.h index db14188d..0006d965 100644 --- a/algo/haval/haval-hash-4way.h +++ b/algo/haval/haval-hash-4way.h @@ -107,7 +107,7 @@ void haval256_5_8way_close( void *cc, void *dst ); #endif // AVX2 -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) typedef struct { __m512i buf[32]; diff --git a/algo/jh/jh-hash-4way.c b/algo/jh/jh-hash-4way.c index fbc30180..a468a90f 100644 --- a/algo/jh/jh-hash-4way.c +++ b/algo/jh/jh-hash-4way.c @@ -204,7 +204,7 @@ static const uint64_t IV512[] = (state)->H[15] = h7l; \ } while (0) -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) #define Sb_8W(x0, x1, x2, x3, c) \ { \ @@ -364,8 +364,7 @@ static const uint64_t IV512[] = #if defined(__AVX2__) -#if defined(__AVX512VL__) -//TODO enable for AVX10_256, not used with AVX512VL +#if defined(VL256) #define notxorandnot( a, b, c ) \ _mm256_ternarylogic_epi64( a, b, c, 0x2d ) @@ -522,7 +521,7 @@ 
static const uint64_t IV512[] = #endif // AVX2 -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) void jh256_8x64_init( jh_8x64_context *sc ) { diff --git a/algo/jh/jh-hash-4way.h b/algo/jh/jh-hash-4way.h index faf3bb87..4551e59a 100644 --- a/algo/jh/jh-hash-4way.h +++ b/algo/jh/jh-hash-4way.h @@ -55,7 +55,7 @@ * memcpy()). */ -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) typedef struct { diff --git a/algo/keccak/keccak-gate.h b/algo/keccak/keccak-gate.h index bd2b6a3f..be776eb4 100644 --- a/algo/keccak/keccak-gate.h +++ b/algo/keccak/keccak-gate.h @@ -4,7 +4,7 @@ #include "algo-gate-api.h" #include -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) #define KECCAK_8WAY 1 #elif defined(__AVX2__) #define KECCAK_4WAY 1 @@ -12,7 +12,7 @@ #define KECCAK_2WAY 1 #endif -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) #define SHA3D_8WAY 1 #elif defined(__AVX2__) #define SHA3D_4WAY 1 diff --git a/algo/keccak/keccak-hash-4way.c b/algo/keccak/keccak-hash-4way.c index 6d4a1a0d..bbffb732 100644 --- a/algo/keccak/keccak-hash-4way.c +++ b/algo/keccak/keccak-hash-4way.c @@ -57,7 +57,7 @@ static const uint64_t RC[] = { #define DO(x) x -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) #define INPUT_BUF(size) do { \ size_t j; \ diff --git a/algo/keccak/keccak-hash-4way.h b/algo/keccak/keccak-hash-4way.h index d387da03..c16465c7 100644 --- a/algo/keccak/keccak-hash-4way.h +++ b/algo/keccak/keccak-hash-4way.h @@ -4,7 +4,7 @@ #include #include "simd-utils.h" -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) typedef struct { diff --git a/algo/luffa/luffa-hash-2way.c b/algo/luffa/luffa-hash-2way.c index 45e27fa2..54df4887 100644 --- a/algo/luffa/luffa-hash-2way.c +++ b/algo/luffa/luffa-hash-2way.c @@ -59,7 +59,7 @@ static const uint32_t CNS_INIT[128] __attribute((aligned(64))) = { }; -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) #define cns4w(i) mm512_bcast_m128( ( (__m128i*)CNS_INIT)[i] ) @@ -524,8 +524,7 @@ int luffa_4way_update_close( luffa_4way_context *state, a = _mm256_xor_si256( a, c0 ); \ b = _mm256_xor_si256( b, c1 ); -//TODO Enable for AVX10_256, not used with AVX512 or AVX10_512 -#if defined(__AVX512VL__) +#if defined(VL256) #define MULT2( a0, a1 ) \ { \ diff --git a/algo/luffa/luffa-hash-2way.h b/algo/luffa/luffa-hash-2way.h index a274995f..24a2aae7 100644 --- a/algo/luffa/luffa-hash-2way.h +++ b/algo/luffa/luffa-hash-2way.h @@ -51,7 +51,7 @@ #define LIMIT_512 128 /*********************************/ -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) typedef struct { uint32_t buffer[8*4]; diff --git a/algo/luffa/luffa_for_sse2.c b/algo/luffa/luffa_for_sse2.c index ef06313a..90a3acdc 100644 --- a/algo/luffa/luffa_for_sse2.c +++ b/algo/luffa/luffa_for_sse2.c @@ -28,8 +28,7 @@ a = v128_xor( a, c0 ); \ b = v128_xor( b, c1 ); \ -#if defined(__AVX512VL__) -//TODO enable for AVX10_512 AVX10_256 +#if defined(VL256) #define MULT2( a0, a1 ) \ { \ @@ -69,8 +68,7 @@ #endif -#if defined(__AVX512VL__) -//TODO enable for AVX10_512 
AVX10_256 +#if defined(VL256) #define SUBCRUMB( a0, a1, a2, a3 ) \ { \ diff --git a/algo/luffa/luffa_for_sse2.h b/algo/luffa/luffa_for_sse2.h index bbad313f..1d404bf8 100644 --- a/algo/luffa/luffa_for_sse2.h +++ b/algo/luffa/luffa_for_sse2.h @@ -68,4 +68,4 @@ int update_and_final_luffa( hashState_luffa *state, void* output, int luffa_full( hashState_luffa *state, void* output, int hashbitlen, const void* data, size_t inlen ); -#endif // LUFFA_FOR_SSE2_H___ +#endif // LUFFA_FOR_SSE2_H__ diff --git a/algo/lyra2/allium-4way.c b/algo/lyra2/allium-4way.c index 0bf4d6c5..453ab126 100644 --- a/algo/lyra2/allium-4way.c +++ b/algo/lyra2/allium-4way.c @@ -15,7 +15,7 @@ #include "algo/groestl/sph_groestl.h" #endif -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) #define ALLIUM_16WAY 1 #elif defined(__AVX2__) #define ALLIUM_8WAY 1 diff --git a/algo/lyra2/lyra2-gate.h b/algo/lyra2/lyra2-gate.h index 6648da3e..196dd51f 100644 --- a/algo/lyra2/lyra2-gate.h +++ b/algo/lyra2/lyra2-gate.h @@ -5,7 +5,7 @@ #include #include "lyra2.h" -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) #define LYRA2REV3_16WAY 1 #elif defined(__AVX2__) #define LYRA2REV3_8WAY 1 @@ -49,7 +49,7 @@ bool init_lyra2rev3_ctx(); ////////////////////////////////// -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) #define LYRA2REV2_16WAY 1 #elif defined(__AVX2__) #define LYRA2REV2_8WAY 1 @@ -108,7 +108,7 @@ bool lyra2h_thread_init(); ///////////////////////////////////////// -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) #define PHI2_8WAY 1 #elif defined(__AVX2__) && defined(__AES__) #define PHI2_4WAY 1 diff --git a/algo/lyra2/lyra2-hash-2way.c b/algo/lyra2/lyra2-hash-2way.c index 80124bae..acd02677 100644 --- a/algo/lyra2/lyra2-hash-2way.c +++ b/algo/lyra2/lyra2-hash-2way.c @@ -41,7 +41,7 @@ // lyra2z330, lyra2h, -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) /** * Executes Lyra2 based on the G function from Blake2b. 
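A second recurring substitution, visible in the groestl, hamsi, haval, jh and luffa hunks above, replaces bare `#if defined(__AVX512VL__)` tests with `#if defined(VL256)` and drops the old "TODO enable for AVX10_256" comments at the same time. VL256 is defined in the simd-utils headers (not shown in this excerpt); judging by the removed TODOs it is meant to cover both AVX512VL and 256-bit-only AVX10 targets. A plausible sketch, with the GCC-14 AVX10 macro name treated as an assumption:

/* Sketch only: "VL256" = AVX-512VL style instructions usable on 256-bit and
 * 128-bit vectors, whether provided by AVX512VL or by a 256-bit AVX10 target. */
#if defined(__AVX512VL__) || defined(__AVX10_1_256__)   /* assumed macro name */
  #define VL256 1
#endif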
This version supports salts and passwords diff --git a/algo/lyra2/lyra2.h b/algo/lyra2/lyra2.h index 91447b6e..62a546eb 100644 --- a/algo/lyra2/lyra2.h +++ b/algo/lyra2/lyra2.h @@ -59,7 +59,7 @@ int LYRA2Z( uint64_t*, void *K, uint64_t kLen, const void *pwd, int LYRA2(void *K, int64_t kLen, const void *pwd, int32_t pwdlen, const void *salt, int32_t saltlen, int64_t timeCost, const int16_t nRows, const int16_t nCols); -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) int LYRA2RE_2WAY( void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen, uint64_t timeCost, uint64_t nRows, uint64_t nCols ); diff --git a/algo/lyra2/lyra2z-4way.c b/algo/lyra2/lyra2z-4way.c index 5c36e00e..203e2520 100644 --- a/algo/lyra2/lyra2z-4way.c +++ b/algo/lyra2/lyra2z-4way.c @@ -3,7 +3,7 @@ #include "lyra2.h" #include "algo/blake/blake256-hash.h" -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) #define LYRA2Z_16WAY 1 #elif defined(__AVX2__) #define LYRA2Z_8WAY 1 diff --git a/algo/lyra2/phi2-4way.c b/algo/lyra2/phi2-4way.c index 3d385c90..ec4e3856 100644 --- a/algo/lyra2/phi2-4way.c +++ b/algo/lyra2/phi2-4way.c @@ -4,7 +4,7 @@ #include "algo/gost/sph_gost.h" #include "algo/cubehash/cubehash_sse2.h" #include "lyra2.h" -#if defined(__VAES__) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(__VAES__) && defined(SIMD512) #include "algo/echo/echo-hash-4way.h" #elif defined(__AES__) #include "algo/echo/aes_ni/hash_api.h" diff --git a/algo/lyra2/sponge-2way.c b/algo/lyra2/sponge-2way.c index f798ef2e..1b8f3ebf 100644 --- a/algo/lyra2/sponge-2way.c +++ b/algo/lyra2/sponge-2way.c @@ -27,7 +27,7 @@ #include "lyra2.h" #include "simd-utils.h" -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) inline void squeeze_2way( uint64_t *State, byte *Out, unsigned int len ) { diff --git a/algo/lyra2/sponge.h b/algo/lyra2/sponge.h index e089ee94..7937981e 100644 --- a/algo/lyra2/sponge.h +++ b/algo/lyra2/sponge.h @@ -43,7 +43,7 @@ static const uint64_t blake2b_IV[8] = 0x1f83d9abfb41bd6bULL, 0x5be0cd19137e2179ULL }; -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) #define G2W_4X64(a,b,c,d) \ a = _mm512_add_epi64( a, b ); \ @@ -150,13 +150,13 @@ static const uint64_t blake2b_IV[8] = // returns void, all args updated #define G_2X64(a,b,c,d) \ a = v128_add64( a, b ); \ - d = v128_ror64( v128_xor( d, a), 32 ); \ + d = v128_ror64xor( d, a, 32 ); \ c = v128_add64( c, d ); \ - b = v128_ror64( v128_xor( b, c ), 24 ); \ + b = v128_ror64xor( b, c, 24 ); \ a = v128_add64( a, b ); \ - d = v128_ror64( v128_xor( d, a ), 16 ); \ + d = v128_ror64xor( d, a, 16 ); \ c = v128_add64( c, d ); \ - b = v128_ror64( v128_xor( b, c ), 63 ); + b = v128_ror64xor( b, c, 63 ); #define LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \ { \ @@ -222,7 +222,7 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){ G( r, 7, v[ 3], v[ 4], v[ 9], v[14] ); -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) union _ovly_512 { diff --git a/algo/nist5/nist5-gate.h b/algo/nist5/nist5-gate.h index 1846806d..af081b37 100644 --- a/algo/nist5/nist5-gate.h +++ b/algo/nist5/nist5-gate.h @@ -4,7 +4,7 @@ #include "algo-gate-api.h" #include -#if 
defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) #define NIST5_8WAY 1 #elif defined(__AVX2__) && defined(__AES__) #define NIST5_4WAY 1 diff --git a/algo/panama/panama-hash-4way.c b/algo/panama/panama-hash-4way.c index 98b57e13..f586cc57 100644 --- a/algo/panama/panama-hash-4way.c +++ b/algo/panama/panama-hash-4way.c @@ -71,8 +71,7 @@ do { \ } while (0) #define GAMMA_4W(n0, n1, n2, n4) \ - (g ## n0 = v128_xor( a ## n0, \ - v128_or( a ## n1, v128_not( a ## n2 ) ) ) ) + (g ## n0 = v128_xor( a ## n0, v128_ornot( a ## n2, a ## n1 ) ) ) #define PI_ALL_4W do { \ a0 = g0; \ @@ -312,7 +311,7 @@ do { \ BUPDATE1_8W( 7, 1 ); \ } while (0) -#if defined(__AVX512VL__) +#if defined(VL256) #define GAMMA_8W(n0, n1, n2, n4) \ ( g ## n0 = _mm256_ternarylogic_epi32( a ## n0, a ## n2, a ## n1, 0x4b ) ) diff --git a/algo/quark/anime-gate.h b/algo/quark/anime-gate.h index a7b08376..db80fe82 100644 --- a/algo/quark/anime-gate.h +++ b/algo/quark/anime-gate.h @@ -4,7 +4,7 @@ #include "algo-gate-api.h" #include -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) #define ANIME_8WAY 1 #elif defined(__AVX2__) && defined(__AES__) #define ANIME_4WAY 1 diff --git a/algo/quark/hmq1725-gate.h b/algo/quark/hmq1725-gate.h index bc0ff99b..fcaa9a8f 100644 --- a/algo/quark/hmq1725-gate.h +++ b/algo/quark/hmq1725-gate.h @@ -4,7 +4,7 @@ #include "algo-gate-api.h" #include -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) #define HMQ1725_8WAY 1 #elif defined(__AVX2__) && defined(__AES__) #define HMQ1725_4WAY 1 diff --git a/algo/quark/quark-gate.h b/algo/quark/quark-gate.h index 69ec5605..d39102c3 100644 --- a/algo/quark/quark-gate.h +++ b/algo/quark/quark-gate.h @@ -4,7 +4,7 @@ #include "algo-gate-api.h" #include -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) #define QUARK_8WAY 1 #elif defined(__AVX2__) && defined(__AES__) #define QUARK_4WAY 1 diff --git a/algo/qubit/qubit-gate.h b/algo/qubit/qubit-gate.h index 2a29e03d..de439842 100644 --- a/algo/qubit/qubit-gate.h +++ b/algo/qubit/qubit-gate.h @@ -5,7 +5,7 @@ #include -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) #define QUBIT_4WAY 1 #elif defined(__AVX2__) && defined(__AES__) #define QUBIT_2WAY 1 diff --git a/algo/ripemd/lbry-gate.h b/algo/ripemd/lbry-gate.h index 2aedd6b4..64eb05fc 100644 --- a/algo/ripemd/lbry-gate.h +++ b/algo/ripemd/lbry-gate.h @@ -5,7 +5,7 @@ #include -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) #define LBRY_16WAY 1 #elif defined(__AVX2__) #define LBRY_8WAY 1 diff --git a/algo/ripemd/ripemd-hash-4way.c b/algo/ripemd/ripemd-hash-4way.c index 292949c4..b60c333f 100644 --- a/algo/ripemd/ripemd-hash-4way.c +++ b/algo/ripemd/ripemd-hash-4way.c @@ -35,13 +35,13 @@ static const uint32_t IV[5] = _mm_xor_si128( _mm_and_si128( _mm_xor_si128( y, z ), x ), z ) #define F3(x, y, z) \ - _mm_xor_si128( _mm_or_si128( x, v128_not( y ) ), z ) + _mm_xor_si128( v128_ornot( y, x ), z ) #define F4(x, y, z) \ _mm_xor_si128( _mm_and_si128( _mm_xor_si128( x, y ), z ), y ) #define F5(x, y, z) \ - _mm_xor_si128( x, _mm_or_si128( y, v128_not( z ) ) ) + _mm_xor_si128( x, v128_ornot( z, y ) ) #define RR(a, b, c, d, e, f, s, r, k) \ do{ \ @@ -335,13 
+335,13 @@ void ripemd160_4way_close( ripemd160_4way_context *sc, void *dst ) _mm256_xor_si256( _mm256_and_si256( _mm256_xor_si256( y, z ), x ), z ) #define F8W_3(x, y, z) \ - _mm256_xor_si256( _mm256_or_si256( x, mm256_not( y ) ), z ) + _mm256_xor_si256( mm256_ornot( y, x ), z ) #define F8W_4(x, y, z) \ _mm256_xor_si256( _mm256_and_si256( _mm256_xor_si256( x, y ), z ), y ) #define F8W_5(x, y, z) \ - _mm256_xor_si256( x, _mm256_or_si256( y, mm256_not( z ) ) ) + _mm256_xor_si256( x, mm256_ornot( z, y ) ) #define RR_8W(a, b, c, d, e, f, s, r, k) \ do{ \ @@ -625,7 +625,7 @@ void ripemd160_8way_close( ripemd160_8way_context *sc, void *dst ) #endif // __AVX2__ -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) // RIPEMD-160 16 way diff --git a/algo/ripemd/ripemd-hash-4way.h b/algo/ripemd/ripemd-hash-4way.h index c0c87db4..2f0fceb5 100644 --- a/algo/ripemd/ripemd-hash-4way.h +++ b/algo/ripemd/ripemd-hash-4way.h @@ -33,7 +33,7 @@ void ripemd160_8way_update( ripemd160_8way_context *sc, const void *data, size_t len ); void ripemd160_8way_close( ripemd160_8way_context *sc, void *dst ); -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) typedef struct { diff --git a/algo/scrypt/scrypt-core-4way.c b/algo/scrypt/scrypt-core-4way.c index 3c83b983..633d77c0 100644 --- a/algo/scrypt/scrypt-core-4way.c +++ b/algo/scrypt/scrypt-core-4way.c @@ -745,7 +745,7 @@ do{ \ SALSA_2ROUNDS; SALSA_2ROUNDS; SALSA_2ROUNDS; SALSA_2ROUNDS; -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) // Tested OK but very slow // 16 way parallel, requires 16x32 interleaving @@ -2487,7 +2487,7 @@ static void salsa8_simd128_2buf( uint32_t * const ba, uint32_t * const bb, XA3 = BA[3] = v128_xor( BA[3], CA[3] ); XB3 = BB[3] = v128_xor( BB[3], CB[3] ); -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) SALSA_8ROUNDS_SIMD128_2BUF; @@ -2886,7 +2886,7 @@ static void salsa8_simd128_3buf( uint32_t *ba, uint32_t *bb, uint32_t *bc, XB3 = BB[3] = v128_xor( BB[3], CB[3] ); XC3 = BC[3] = v128_xor( BC[3], CC[3] ); -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) SALSA_8ROUNDS_SIMD128_3BUF; diff --git a/algo/scrypt/scrypt-core-4way.h b/algo/scrypt/scrypt-core-4way.h index 709ba674..7107bd3f 100644 --- a/algo/scrypt/scrypt-core-4way.h +++ b/algo/scrypt/scrypt-core-4way.h @@ -5,7 +5,7 @@ #include #include -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) void scrypt_core_16way( __m512i *X, __m512i *V, const uint32_t N ); diff --git a/algo/scrypt/scrypt.c b/algo/scrypt/scrypt.c index e0ab88be..13cd0aa4 100644 --- a/algo/scrypt/scrypt.c +++ b/algo/scrypt/scrypt.c @@ -35,7 +35,7 @@ //#include #include "malloc-huge.h" -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) #define SCRYPT_THROUGHPUT 16 #elif defined(__SHA__) || defined(__ARM_FEATURE_SHA2) #define SCRYPT_THROUGHPUT 2 @@ -592,7 +592,7 @@ static inline void PBKDF2_SHA256_128_32_8way( uint32_t *tstate, #endif /* HAVE_SHA256_8WAY */ -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) static inline void sha256_16way_init_state( void 
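The panama GAMMA and ripemd F3/F5 hunks above replace the explicit or( x, not( y ) ) pattern with v128_ornot / mm256_ornot helpers. From the call sites the convention appears to be ornot( a, b ) == b | ~a, i.e. the first operand is the complemented one. The real macros are in the simd-utils headers changed by this patch; the following is only a sketch of definitions consistent with that convention.

/* Sketch only: ornot( a, b ) == b | ~a. */
#if defined(__ARM_NEON)
  #define v128_ornot( a, b )   vornq_u32( b, a )                  /* ORN: b | ~a */
#elif defined(__AVX512VL__)
  #define v128_ornot( a, b )   _mm_ternarylogic_epi32( a, b, b, 0xCF )
#else
  #define v128_ornot( a, b )   _mm_or_si128( b, v128_not( a ) )
#endif
/* mm256_ornot would follow the same pattern with the 256-bit intrinsics. */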
*state ) { @@ -1494,7 +1494,7 @@ bool register_scrypt_algo( algo_gate_t* gate ) // scrypt_throughput defined at compile time and used to replace // MAX_WAYS to reduce memory usage. -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) // scrypt_throughput = 16; if ( opt_param_n > 0x4000 ) scratchbuf_size = opt_param_n * 3 * 128; // 3 buf diff --git a/algo/sha/hmac-sha256-hash-4way.c b/algo/sha/hmac-sha256-hash-4way.c index c039ac94..bc4d095e 100644 --- a/algo/sha/hmac-sha256-hash-4way.c +++ b/algo/sha/hmac-sha256-hash-4way.c @@ -306,7 +306,7 @@ pbkdf2_sha256_8way( uint8_t *buf, size_t dkLen, const uint8_t *passwd, } } -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) // HMAC 16-way AVX512 diff --git a/algo/sha/hmac-sha256-hash-4way.h b/algo/sha/hmac-sha256-hash-4way.h index c096b08f..320c27b4 100644 --- a/algo/sha/hmac-sha256-hash-4way.h +++ b/algo/sha/hmac-sha256-hash-4way.h @@ -84,7 +84,7 @@ void pbkdf2_sha256_8way( uint8_t *, size_t, const uint8_t *, size_t, #endif // AVX2 -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) typedef struct _hmac_sha256_16way_context { diff --git a/algo/sha/sha256-hash-4way.c b/algo/sha/sha256-hash-4way.c index 5e37c0b1..bfe4729f 100644 --- a/algo/sha/sha256-hash-4way.c +++ b/algo/sha/sha256-hash-4way.c @@ -580,7 +580,7 @@ void sha256_4x32_full( void *dst, const void *data, size_t len ) // to avoid recalculating it as Y^Z. This optimization is not applicable // when MAJ is optimized with ternary logic. -#if defined(__AVX512VL__) +#if defined(VL256) #define CHx(X, Y, Z) _mm256_ternarylogic_epi32( X, Y, Z, 0xca ) @@ -788,7 +788,7 @@ void sha256_8way_prehash_3rounds( __m256i *state_mid, __m256i *X, G = _mm256_load_si256( state_in + 6 ); H = _mm256_load_si256( state_in + 7 ); -#if !defined(__AVX512VL__) +#if !defined(VL256) __m256i X_xor_Y, Y_xor_Z = _mm256_xor_si256( B, C ); #endif @@ -830,7 +830,7 @@ void sha256_8way_final_rounds( __m256i *state_out, const __m256i *data, G = _mm256_load_si256( state_mid + 6 ); H = _mm256_load_si256( state_mid + 7 ); -#if !defined(__AVX512VL__) +#if !defined(VL256) __m256i X_xor_Y, Y_xor_Z = _mm256_xor_si256( F, G ); #endif @@ -936,7 +936,7 @@ int sha256_8way_transform_le_short( __m256i *state_out, const __m256i *data, const __m256i IV7 = H; const __m256i IV6 = G; -#if !defined(__AVX512VL__) +#if !defined(VL256) __m256i X_xor_Y, Y_xor_Z = _mm256_xor_si256( B, C ); #endif @@ -981,7 +981,7 @@ int sha256_8way_transform_le_short( __m256i *state_out, const __m256i *data, W[11] = SHA256_8WAY_MEXP( W[ 9], W[ 4], W[12], W[11] ); W[12] = SHA256_8WAY_MEXP( W[10], W[ 5], W[13], W[12] ); -#if !defined(__AVX512VL__) +#if !defined(VL256) Y_xor_Z = _mm256_xor_si256( B, C ); #endif @@ -1172,7 +1172,7 @@ void sha256_8way_full( void *dst, const void *data, size_t len ) sha256_8way_close( &ctx, dst ); } -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) // SHA-256 16 way diff --git a/algo/sha/sha256-hash.c b/algo/sha/sha256-hash.c index e2c10e94..5110faea 100644 --- a/algo/sha/sha256-hash.c +++ b/algo/sha/sha256-hash.c @@ -10,6 +10,28 @@ static const uint32_t SHA256_IV[8] = #if defined(__x86_64__) && defined(__SHA__) + +/* common code used for rounds 12 through 51 */ + +#define sha256_generic_qround( s0, s1, m, a, b, c ) \ + TMP = _mm_alignr_epi8( a, c, 4 ); \ + 
s1 = _mm_sha256rnds2_epu32( s1, s0, m ); \ + b = _mm_add_epi32( b, TMP ); \ + b = _mm_sha256msg2_epu32( b, a ); \ + m = _mm_shuffle_epi32( m, 0x0e ); \ + s0 = _mm_sha256rnds2_epu32( s0, s1, m ); \ + c = _mm_sha256msg1_epu32( c, a ); + +// r12-15 +// sha256_generic_qround( s0, s1, m, t3, t0, t2 ) +// r16-19 +// sha256_generic_qround( s0, s1, m, t0, t1, t3 ) +// r20-23 +// sha256_generic_qround( s0, s1, m, t1, t2, t0 ) +// r24-27 +// sha256_generic_qround( s0, s1, m, t2, t3, t1 ) ... + + #define sha256_opt_rounds( state_out, input, state_in ) \ { \ __m128i STATE0, STATE1; \ @@ -887,14 +909,14 @@ static const uint32_t K256[64] = #define sha256_neon_rounds( state_out, input, state_in ) \ { \ - uint32x4_t STATE0, STATE1, ABEF_SAVE, CDGH_SAVE; \ + uint32x4_t STATE0, STATE1, ABCD_SAVE, EFGH_SAVE; \ uint32x4_t MSG0, MSG1, MSG2, MSG3; \ uint32x4_t TMP0, TMP1, TMP2; \ \ STATE0 = vld1q_u32( state_in ); \ STATE1 = vld1q_u32( state_in+4 ); \ - ABEF_SAVE = STATE0; \ - CDGH_SAVE = STATE1; \ + ABCD_SAVE = STATE0; \ + EFGH_SAVE = STATE1; \ \ MSG0 = load_msg( input, 0 ); \ MSG1 = load_msg( input, 1 ); \ @@ -1004,8 +1026,8 @@ static const uint32_t K256[64] = TMP2 = STATE0; \ STATE0 = vsha256hq_u32( STATE0, STATE1, TMP1 ); \ STATE1 = vsha256h2q_u32( STATE1, TMP2, TMP1 ); \ - STATE0 = vaddq_u32( STATE0, ABEF_SAVE ); \ - STATE1 = vaddq_u32( STATE1, CDGH_SAVE ); \ + STATE0 = vaddq_u32( STATE0, ABCD_SAVE ); \ + STATE1 = vaddq_u32( STATE1, EFGH_SAVE ); \ vst1q_u32( state_out , STATE0 ); \ vst1q_u32( state_out+4, STATE1 ); \ } @@ -1029,8 +1051,8 @@ void sha256_neon_sha_transform_le( uint32_t *state_out, const void *input, #define sha256_neon_x2sha_rounds( state_out_X, state_out_Y, input_X, \ input_Y, state_in_X, state_in_Y ) \ { \ - uint32x4_t STATE0_X, STATE1_X, ABEF_SAVE_X, CDGH_SAVE_X; \ - uint32x4_t STATE0_Y, STATE1_Y, ABEF_SAVE_Y, CDGH_SAVE_Y; \ + uint32x4_t STATE0_X, STATE1_X, ABCD_SAVE_X, EFGH_SAVE_X; \ + uint32x4_t STATE0_Y, STATE1_Y, ABCD_SAVE_Y, EFGH_SAVE_Y; \ uint32x4_t MSG0_X, MSG1_X, MSG2_X, MSG3_X; \ uint32x4_t MSG0_Y, MSG1_Y, MSG2_Y, MSG3_Y; \ uint32x4_t TMP0_X, TMP1_X, TMP2_X; \ @@ -1040,10 +1062,10 @@ void sha256_neon_sha_transform_le( uint32_t *state_out, const void *input, STATE0_Y = vld1q_u32( state_in_Y ); \ STATE1_X = vld1q_u32( state_in_X+4 ); \ STATE1_Y = vld1q_u32( state_in_Y+4 ); \ - ABEF_SAVE_X = STATE0_X; \ - ABEF_SAVE_Y = STATE0_Y; \ - CDGH_SAVE_X = STATE1_X; \ - CDGH_SAVE_Y = STATE1_Y; \ + ABCD_SAVE_X = STATE0_X; \ + ABCD_SAVE_Y = STATE0_Y; \ + EFGH_SAVE_X = STATE1_X; \ + EFGH_SAVE_Y = STATE1_Y; \ \ MSG0_X = load_msg( input_X, 0 ); \ MSG0_Y = load_msg( input_Y, 0 ); \ @@ -1245,10 +1267,10 @@ void sha256_neon_sha_transform_le( uint32_t *state_out, const void *input, STATE0_Y = vsha256hq_u32( STATE0_Y, STATE1_Y, TMP1_Y ); \ STATE1_X = vsha256h2q_u32( STATE1_X, TMP2_X, TMP1_X ); \ STATE1_Y = vsha256h2q_u32( STATE1_Y, TMP2_Y, TMP1_Y ); \ - STATE0_X = vaddq_u32( STATE0_X, ABEF_SAVE_X ); \ - STATE0_Y = vaddq_u32( STATE0_Y, ABEF_SAVE_Y ); \ - STATE1_X = vaddq_u32( STATE1_X, CDGH_SAVE_X ); \ - STATE1_Y = vaddq_u32( STATE1_Y, CDGH_SAVE_Y ); \ + STATE0_X = vaddq_u32( STATE0_X, ABCD_SAVE_X ); \ + STATE0_Y = vaddq_u32( STATE0_Y, ABCD_SAVE_Y ); \ + STATE1_X = vaddq_u32( STATE1_X, EFGH_SAVE_X ); \ + STATE1_Y = vaddq_u32( STATE1_Y, EFGH_SAVE_Y ); \ vst1q_u32( state_out_X , STATE0_X ); \ vst1q_u32( state_out_Y , STATE0_Y ); \ vst1q_u32( state_out_X+4, STATE1_X ); \ diff --git a/algo/sha/sha256-hash.h b/algo/sha/sha256-hash.h index 4a201110..70c652a3 100644 --- a/algo/sha/sha256-hash.h +++ 
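The sha256_generic_qround macro introduced above packages the x86 SHA-NI quad-round body (two sha256rnds2 steps plus the sha256msg1/sha256msg2/alignr message-schedule work) so that rounds 12 through 51 can be written as ten calls with a rotating register triple, as the r12-15 ... r24-27 comments indicate. As an illustration only, with s0/s1, t0..t3, m and TMP assumed to be the __m128i locals of the surrounding transform and K256 the round-constant table defined in this file, the first two calls would look like this (the K add is done outside the macro, since the macro consumes m already prepared):

/* Rounds 12-15: newest block is t3, t0 is being expanded, t2 feeds msg1. */
m = _mm_add_epi32( t3, _mm_loadu_si128( (const __m128i*)&K256[12] ) );
sha256_generic_qround( s0, s1, m, t3, t0, t2 );
/* Rounds 16-19: the (a,b,c) triple rotates to t0, t1, t3. */
m = _mm_add_epi32( t0, _mm_loadu_si128( (const __m128i*)&K256[16] ) );
sha256_generic_qround( s0, s1, m, t0, t1, t3 );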
b/algo/sha/sha256-hash.h @@ -113,7 +113,7 @@ void sha256_neon_x2sha_final_rounds( uint32_t *state_out_X, #endif -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) // SHA-256 16 way x86_64 diff --git a/algo/sha/sha256d-4way.h b/algo/sha/sha256d-4way.h index ce459e9c..7ec1143e 100644 --- a/algo/sha/sha256d-4way.h +++ b/algo/sha/sha256d-4way.h @@ -4,7 +4,7 @@ #include #include "algo-gate-api.h" -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) #define SHA256D_16WAY 1 #elif defined(__SHA__) #define SHA256D_SHA 1 diff --git a/algo/sha/sha256d.h b/algo/sha/sha256d.h index 35a093d8..d20319a1 100644 --- a/algo/sha/sha256d.h +++ b/algo/sha/sha256d.h @@ -4,7 +4,7 @@ #include #include "algo-gate-api.h" -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) #define SHA256D_16WAY 1 #elif defined(__SHA__) #define SHA256D_SHA 1 diff --git a/algo/sha/sha256dt.c b/algo/sha/sha256dt.c index 24cbdb4d..9a906e00 100644 --- a/algo/sha/sha256dt.c +++ b/algo/sha/sha256dt.c @@ -6,7 +6,7 @@ #include "sha256-hash.h" #include "sph_sha2.h" -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) #define SHA256DT_16X32 1 #elif defined(__x86_64__) && defined(__SHA__) #define SHA256DT_X86_SHA256 1 diff --git a/algo/sha/sha256t-gate.h b/algo/sha/sha256t-gate.h index 5933555c..9e0bc82d 100644 --- a/algo/sha/sha256t-gate.h +++ b/algo/sha/sha256t-gate.h @@ -4,7 +4,7 @@ #include #include "algo-gate-api.h" -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) #define SHA256T_16WAY 1 #elif defined(__SHA__) #define SHA256T_SHA 1 diff --git a/algo/sha/sha512-hash-4way.c b/algo/sha/sha512-hash-4way.c index 2495c861..381219e9 100644 --- a/algo/sha/sha512-hash-4way.c +++ b/algo/sha/sha512-hash-4way.c @@ -73,29 +73,10 @@ static const uint64_t K512[80] = // Experimental. Not tested. Not reviewed. Compile tested only. -// Needs GCC-13 for compilation. -// Needs Intel Lunar lake or Arrow Lake CPU, or AMD Zen-{5,6}? for execution. +// Needs GCC-14 for compilation. +// Needs Intel Lunarlake or Arrowlake CPU, or AMD Zen-6? for execution. // Modelled after noloader sha256 implementation. -// It's not clear how SHA512 will be supported before AVX10 considering how -// dependant it is on _mm256_alignr_epi64 which is only available with AVX512VL -// until AVX10-256. 
- -#if defined(__AVX512VL__) - -#define mm256_alignr_1x64( v1, v0 ) _mm256_alignr_epi64( v1, v0, 1 ) - -#else -// Ugly workaround to make it work with AVX2 - -static const __m256i mask __attribute__ ((aligned (32))) - = { 0xffffffffffffffffull, 0ull, 0ull, 0ull }; - -#define mm256_alignr_1x64( v1, v0 ) \ - _mm256_or_si256( _mm256_and_si256( mm256_shuflr_64( v1 ), mask ), \ - _mm256_and_si256( mm256_shuflr_64( v0 ), mm256_not(mask) ) ); - -#endif void sha512_opt_transform_be( uint64_t *state_out, const void *input, const uint64_t *state_in ) @@ -109,7 +90,7 @@ void sha512_opt_transform_be( uint64_t *state_out, const void *input, TMP = _mm256_load_si256( (__m256i*) &state_in[0] ); STATE1 = _mm256_load_si256( (__m256i*) &state_in[4] ); BSWAP64 = mm256_bcast_m128( _mm_set_epi64x( 0x08090a0b0c0d0e0f, - 0x0001020304050607 ) ) + 0x0001020304050607 ) ); TMP = _mm256_permute4x64_epi64( TMP, 0xB1 ); // CDAB STATE1 = _mm256_permute4x64_epi64( STATE1, 0x1B ); // EFGH STATE0 = _mm256_permute2x128_si256( TMP, STATE1, 0x21 ); // ABEF @@ -123,153 +104,233 @@ void sha512_opt_transform_be( uint64_t *state_out, const void *input, TMSG0 = _mm256_load_si256( (const __m256i*) (input+0) ); TMSG0 = _mm256_shuffle_epi8( TMSG0, BSWAP64 ); MSG = _mm256_add_epi64( TMSG0, casti_m256i( K512, 0 ) ); - STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG ); + STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, + _mm256_castsi256_si128 (MSG ) ); MSG = _mm256_permute4x64_epi64( MSG, 0x0E ); - STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG ); + STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, + _mm256_castsi256_si128( MSG ) ); // Rounds 4-7 TMSG1 = _mm256_load_si256( (const __m256i*) (input+16) ); TMSG1 = _mm256_shuffle_epi8( TMSG1, BSWAP64 ); MSG = _mm256_add_epi64( TMSG1, casti_m256i( K512, 1 ) ); - STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG ); + STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, + _mm256_castsi256_si128( MSG ) ); MSG = _mm256_permute4x64_epi64( MSG, 0x0E ); - STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG ); - TMSG0 = _mm256_sha512msg1_epi64( TMSG0, TMSG1 ); + STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, + _mm256_castsi256_si128( MSG ) ); + TMSG0 = _mm256_sha512msg1_epi64( TMSG0, _mm256_castsi256_si128( TMSG1 ) ); // Rounds 8-11 TMSG2 = _mm256_load_si256( (const __m256i*) (input+32) ); TMSG2 = _mm256_shuffle_epi8( TMSG2, BSWAP64 ); MSG = _mm256_add_epi64( TMSG2, casti_m256i( K512, 2 ) ); - STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG ); + STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, + _mm256_castsi256_si128( MSG ) ); MSG = _mm256_permute4x64_epi64( MSG, 0x0E ); - STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG ); - TMSG1 = _mm256_sha512msg1_epi64( TMSG1, TMSG2 ); + STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, + _mm256_castsi256_si128( MSG ) ); + TMSG1 = _mm256_sha512msg1_epi64( TMSG1, _mm256_castsi256_si128( TMSG2 ) ); // Rounds 12-15 TMSG3 = _mm256_load_si256( (const __m256i*) (input+48) ); TMSG3 = _mm256_shuffle_epi8( TMSG3, BSWAP64 ); MSG = _mm256_add_epi64( TMSG3, casti_m256i( K512, 3 ) ); - STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG ); - TMP = _mm256_shuffle2_64( TMSG3, TMSG2, 1 ); + STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, + _mm256_castsi256_si128( MSG ) ); + TMP = mm256_shuffle2_64( TMSG3, TMSG2, 1 ); TMSG0 = _mm256_add_epi32( TMSG0, TMP ); TMSG0 = _mm256_sha512msg2_epi64( TMSG0, TMSG3 ); MSG = _mm256_permute4x64_epi64( MSG, 0x0E ); - STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG ); - TMSG2 = 
_mm256_sha512msg1_epi64( TMSG2, TMSG3 ); + STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, + _mm256_castsi256_si128( MSG ) ); + TMSG2 = _mm256_sha512msg1_epi64( TMSG2, _mm256_castsi256_si128( TMSG3 ) ); // Rounds 16-19 MSG = _mm256_add_epi64( TMSG0, casti_m256i( K512, 4 ) ); - STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG ); - TMP = mm256_alignr_1x64( TMSG0, TMSG3 ); + STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, + _mm256_castsi256_si128( MSG ) ); + TMP = mm256_alignr64( TMSG0, TMSG3, 1 ); TMSG1 = _mm256_add_epi64( TMSG1, TMP ); TMSG1 = _mm256_sha512msg2_epi64( TMSG1, TMSG0 ); MSG = _mm256_permute4x64_epi64( MSG, 0x0E ); - STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG ); - TMSG3 = _mm256_sha512msg1_epi64( TMSG3, TMSG0 ); + STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, + _mm256_castsi256_si128( MSG ) ); + TMSG3 = _mm256_sha512msg1_epi64( TMSG3, _mm256_castsi256_si128( TMSG0 ) ); // Rounds 20-23 MSG = _mm256_add_epi64( TMSG1, casti_m256i( K512, 5 ) ); - STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG ); - TMP = mm256_alignr_1x64( TMSG1, TMSG0 ); + STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, + _mm256_castsi256_si128( MSG ) ); + TMP = mm256_alignr64( TMSG1, TMSG0, 1 ); TMSG2 = _mm256_add_epi64( TMSG2, TMP ); TMSG2 = _mm256_sha512msg2_epi64( TMSG2, TMSG1 ); MSG = _mm256_permute4x64_epi64( MSG, 0x0E ); - STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG ); - TMSG0 = _mm256_sha512msg1_epi64( TMSG0, TMSG1 ); + STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, + _mm256_castsi256_si128( MSG ) ); + TMSG0 = _mm256_sha512msg1_epi64( TMSG0, _mm256_castsi256_si128( TMSG1 ) ); // Rounds 24-27 MSG = _mm256_add_epi64( TMSG2, casti_m256i( K512, 6 ) ); - STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG ); - TMP = mm256_alignr_1x64( TMSG2, TMSG1 ); + STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, + _mm256_castsi256_si128( MSG ) ); + TMP = mm256_alignr64( TMSG2, TMSG1, 1 ); TMSG3 = _mm256_add_epi32( TMSG3, TMP ); TMSG3 = _mm256_sha512msg2_epi64( TMSG3, TMSG2 ); MSG = _mm256_permute4x64_epi64( MSG, 0x0E ); - STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG ); - TMSG1 = _mm256_sha512msg1_epi64( TMSG1, TMSG2 ); + STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, + _mm256_castsi256_si128( MSG ) ); + TMSG1 = _mm256_sha512msg1_epi64( TMSG1, _mm256_castsi256_si128( TMSG2 ) ); // Rounds 28-31 MSG = _mm256_add_epi64( TMSG3, casti_m256i( K512, 7 ) ); - STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG) ; - TMP = mm256_alignr_1x64( TMSG3, TMSG2 ); + STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, + _mm256_castsi256_si128( MSG ) ); + TMP = mm256_alignr64( TMSG3, TMSG2, 1 ); TMSG0 = _mm256_add_epi64( TMSG0, TMP ); TMSG0 = _mm256_sha512msg2_epi64( TMSG0, TMSG3 ); MSG = _mm256_permute4x64_epi64( MSG, 0x0E ); - STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG ); - TMSG2 = _mm256_sha512msg1_epi64( TMSG2, TMSG3 ); + STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, + _mm256_castsi256_si128( MSG ) ); + TMSG2 = _mm256_sha512msg1_epi64( TMSG2, _mm256_castsi256_si128( TMSG3 ) ); // Rounds 32-35 MSG = _mm256_add_epi64( TMSG0, casti_m256i( K512, 8 ) ); - STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG ); - TMP = mm256_alignr_1x64( TMSG0, TMSG3 ); + STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, + _mm256_castsi256_si128( MSG ) ); + TMP = mm256_alignr64( TMSG0, TMSG3, 1 ); TMSG1 = _mm256_add_epi64( TMSG1, TMP ); TMSG1 = _mm256_sha512msg2_epi64( TMSG1, TMSG0 ); MSG = _mm256_permute4x64_epi64( MSG, 0x0E ); - STATE0 = _mm256_sha512rnds2_epi64( 
STATE0, STATE1, MSG ); - TMSG3 = _mm256_sha512msg1_epi64( TMSG3, TMSG0 ); + STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, + _mm256_castsi256_si128( MSG ) ); + TMSG3 = _mm256_sha512msg1_epi64( TMSG3, _mm256_castsi256_si128( TMSG0 ) ); // Rounds 36-39 MSG = _mm256_add_epi64( TMSG1, casti_m256i( K512, 9 ) ); - STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG ); - TMP = mm256_alignr_1x64( TMSG1, TMSG0 ); + STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, + _mm256_castsi256_si128( MSG ) ); + TMP = mm256_alignr64( TMSG1, TMSG0, 1 ); TMSG2 = _mm256_add_epi64( TMSG2, TMP ); TMSG2 = _mm256_sha512msg2_epi64( TMSG2, TMSG1 ); MSG = _mm256_permute4x64_epi64( MSG, 0x0E ); - STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG ); - TMSG0 = _mm256_sha512msg1_epi64( TMSG0, TMSG1 ); + STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, + _mm256_castsi256_si128( MSG ) ); + TMSG0 = _mm256_sha512msg1_epi64( TMSG0, _mm256_castsi256_si128( TMSG1 ) ); // Rounds 40-43 MSG = _mm256_add_epi64( TMSG2, casti_m256i( K512, 10 ) ); - STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG ); - TMP = mm256_alignr_1x64( TMSG2, TMSG1 ); + STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, + _mm256_castsi256_si128( MSG ) ); + TMP = mm256_alignr64( TMSG2, TMSG1, 1 ); TMSG3 = _mm256_add_epi64( TMSG3, TMP ); TMSG3 = _mm256_sha512msg2_epi64( TMSG3, TMSG2 ); MSG = _mm256_permute4x64_epi64( MSG, 0x0E ); - STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG ); - TMSG1 = _mm256_sha512msg1_epi64( TMSG1, TMSG2 ); + STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, + _mm256_castsi256_si128( MSG ) ); + TMSG1 = _mm256_sha512msg1_epi64( TMSG1, _mm256_castsi256_si128( TMSG2 ) ); // Rounds 44-47 MSG = _mm256_add_epi64( TMSG3, casti_m256i( K512, 11 ) ); - STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG ); - TMP = mm256_alignr_1x64( TMSG3, TMSG2 ); + STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, + _mm256_castsi256_si128( MSG ) ); + TMP = mm256_alignr64( TMSG3, TMSG2, 1 ); TMSG0 = _mm256_add_epi64( TMSG0, TMP ); TMSG0 = _mm256_sha512msg2_epi64( TMSG0, TMSG3 ); MSG = _mm256_permute4x64_epi64( MSG, 0x0E ); - STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG ); - TMSG2 = _mm256_sha512msg1_epi64( TMSG2, TMSG3 ); + STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, + _mm256_castsi256_si128( MSG ) ); + TMSG2 = _mm256_sha512msg1_epi64( TMSG2, _mm256_castsi256_si128( TMSG3 ) ); // Rounds 48-51 MSG = _mm256_add_epi64( TMSG0, casti_m256i( K512, 12 ) ); - STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG ); - TMP = mm256_alignr_1x64( TMSG0, TMSG3 ); + STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, + _mm256_castsi256_si128( MSG ) ); + TMP = mm256_alignr64( TMSG0, TMSG3, 1 ); TMSG1 = _mm256_add_epi64( TMSG1, TMP ); TMSG1 = _mm256_sha512msg2_epi64( TMSG1, TMSG0 ); MSG = _mm256_permute4x64_epi64( MSG, 0x0E ); - STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG ); - TMSG3 = _mm256_sha512msg1_epi64( TMSG3, TMSG0 ); + STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, + _mm256_castsi256_si128( MSG ) ); + TMSG3 = _mm256_sha512msg1_epi64( TMSG3, _mm256_castsi256_si128( TMSG0 ) ); // Rounds 52-55 MSG = _mm256_add_epi64( TMSG1, casti_m256i( K512, 13 ) ); - STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG ); - TMP = mm256_alignr_1x64( TMSG1, TMSG0 ); + STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, + _mm256_castsi256_si128( MSG ) ); + TMP = mm256_alignr64( TMSG1, TMSG0, 1 ); TMSG2 = _mm256_add_epi64( TMSG2, TMP ); TMSG2 = _mm256_sha512msg2_epi64( TMSG2, TMSG1 ); MSG = _mm256_permute4x64_epi64( MSG, 0x0E ); - 
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG ); + STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, + _mm256_castsi256_si128( MSG ) ); + TMSG0 = _mm256_sha512msg1_epi64( TMSG0, _mm256_castsi256_si128( TMSG1 ) ); // Rounds 56-59 MSG = _mm256_add_epi64( TMSG2, casti_m256i( K512, 14 ) ); - STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG ); - TMP = mm256_alignr_1x64( TMSG2, TMSG1 ) ; + STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, + _mm256_castsi256_si128( MSG ) ); + TMP = mm256_alignr64( TMSG2, TMSG1, 1 ); TMSG3 = _mm256_add_epi64( TMSG3, TMP ); TMSG3 = _mm256_sha512msg2_epi64( TMSG3, TMSG2 ); MSG = _mm256_permute4x64_epi64( MSG, 0x0E ); - STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG ); + STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, + _mm256_castsi256_si128( MSG ) ); + TMSG1 = _mm256_sha512msg1_epi64( TMSG1, _mm256_castsi256_si128( TMSG2 ) ); // Rounds 60-63 MSG = _mm256_add_epi64( TMSG3, casti_m256i( K512, 15 ) ); - STATE1 = _mm256_sha512nds2_epi64( STATE1, STATE0, MSG ); + STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, + _mm256_castsi256_si128( MSG ) ); + TMP = mm256_alignr64( TMSG3, TMSG2, 1 ); + TMSG0 = _mm256_add_epi64( TMSG0, TMP ); + TMSG0 = _mm256_sha512msg2_epi64( TMSG0, TMSG3 ); + MSG = _mm256_permute4x64_epi64( MSG, 0x0E ); + STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, + _mm256_castsi256_si128( MSG ) ); + TMSG2 = _mm256_sha512msg1_epi64( TMSG2, _mm256_castsi256_si128( TMSG3 ) ); + + // Rounds 64-67 + MSG = _mm256_add_epi64( TMSG0, casti_m256i( K512, 16 ) ); + STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, + _mm256_castsi256_si128( MSG ) ); + TMP = mm256_alignr64( TMSG0, TMSG3, 1 ); + TMSG1 = _mm256_add_epi64( TMSG1, TMP ); + TMSG1 = _mm256_sha512msg2_epi64( TMSG1, TMSG0 ); + MSG = _mm256_permute4x64_epi64( MSG, 0x0E ); + STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, + _mm256_castsi256_si128( MSG ) ); + TMSG3 = _mm256_sha512msg1_epi64( TMSG3, _mm256_castsi256_si128( TMSG0 ) ); + + // Rounds 68-71 + MSG = _mm256_add_epi64( TMSG1, casti_m256i( K512, 17 ) ); + STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, + _mm256_castsi256_si128( MSG ) ); + TMP = mm256_alignr64( TMSG1, TMSG0, 1 ); + TMSG2 = _mm256_add_epi64( TMSG2, TMP ); + TMSG2 = _mm256_sha512msg2_epi64( TMSG2, TMSG1 ); MSG = _mm256_permute4x64_epi64( MSG, 0x0E ); - STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG ); + STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, + _mm256_castsi256_si128( MSG ) ); + + // Rounds 72-75 + MSG = _mm256_add_epi64( TMSG2, casti_m256i( K512, 18 ) ); + STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, + _mm256_castsi256_si128( MSG ) ); + TMP = mm256_alignr64( TMSG2, TMSG1, 1 ); + TMSG3 = _mm256_add_epi64( TMSG3, TMP ); + TMSG3 = _mm256_sha512msg2_epi64( TMSG3, TMSG2 ); + MSG = _mm256_permute4x64_epi64( MSG, 0x0E ); + STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, + _mm256_castsi256_si128( MSG ) ); + + // Rounds 76-79 + MSG = _mm256_add_epi64( TMSG3, casti_m256i( K512, 19 ) ); + STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, + _mm256_castsi256_si128( MSG ) ); + MSG = _mm256_permute4x64_epi64( MSG, 0x0E ); + STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, + _mm256_castsi256_si128( MSG ) ); // Add initial state STATE0 = _mm256_add_epi64( STATE0, ABEF_SAVE ); @@ -289,7 +350,7 @@ void sha512_opt_transform_le( uint64_t *state_out, const void *input, const uint64_t *state_in ) { __m256i STATE0, STATE1; - __m256i MSG, TMP, BSWAP64; + __m256i MSG, TMP; __m256i TMSG0, TMSG1, TMSG2, TMSG3; __m256i ABEF_SAVE, CDGH_SAVE; @@ -308,141 
+369,190 @@ void sha512_opt_transform_le( uint64_t *state_out, const void *input, // Rounds 0-3 TMSG0 = _mm256_load_si256( (const __m256i*) (input+0) ); MSG = _mm256_add_epi64( TMSG0, casti_m256i( K512, 0 ) ); - STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG ); + STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, _mm256_castsi256_si128( MSG ) ); MSG = _mm256_permute4x64_epi64( MSG, 0x0E ); - STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG ); + STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, _mm256_castsi256_si128( MSG ) ); // Rounds 4-7 TMSG1 = _mm256_load_si256( (const __m256i*) (input+16) ); MSG = _mm256_add_epi64( TMSG1, casti_m256i( K512, 1 ) ); - STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG ); + STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, _mm256_castsi256_si128( MSG ) ); MSG = _mm256_permute4x64_epi64( MSG, 0x0E ); - STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG ); - TMSG0 = _mm256_sha512msg1_epi64( TMSG0, TMSG1 ); + STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, _mm256_castsi256_si128( MSG ) ); + TMSG0 = _mm256_sha512msg1_epi64( TMSG0, _mm256_castsi256_si128( TMSG1 ) ); // Rounds 8-11 TMSG2 = _mm256_load_si256( (const __m256i*) (input+32) ); MSG = _mm256_add_epi64( TMSG2, casti_m256i( K512, 2 ) ); - STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG ); + STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, _mm256_castsi256_si128( MSG ) ); MSG = _mm256_permute4x64_epi64( MSG, 0x0E ); - STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG ); - TMSG1 = _mm256_sha512msg1_epi64( TMSG1, TMSG2 ); + STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, _mm256_castsi256_si128( MSG ) ); + TMSG1 = _mm256_sha512msg1_epi64( TMSG1, _mm256_castsi256_si128( TMSG2 ) ); // Rounds 12-15 TMSG3 = _mm256_load_si256( (const __m256i*) (input+48) ); MSG = _mm256_add_epi64( TMSG3, casti_m256i( K512, 3 ) ); - STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG ); - TMP = _mm256_shuffle2_64( TMSG3, TMSG2, 1 ); + STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, _mm256_castsi256_si128( MSG ) ); + TMP = mm256_shuffle2_64( TMSG3, TMSG2, 1 ); TMSG0 = _mm256_add_epi32( TMSG0, TMP ); TMSG0 = _mm256_sha512msg2_epi64( TMSG0, TMSG3 ); MSG = _mm256_permute4x64_epi64( MSG, 0x0E ); - STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG ); - TMSG2 = _mm256_sha512msg1_epi64( TMSG2, TMSG3 ); + STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, _mm256_castsi256_si128( MSG ) ); + TMSG2 = _mm256_sha512msg1_epi64( TMSG2, _mm256_castsi256_si128( TMSG3 ) ); // Rounds 16-19 MSG = _mm256_add_epi64( TMSG0, casti_m256i( K512, 4 ) ); - STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG ); - TMP = mm256_alignr_1x64( TMSG0, TMSG3 ); + STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, _mm256_castsi256_si128( MSG ) ); + TMP = mm256_alignr64( TMSG0, TMSG3, 1 ); TMSG1 = _mm256_add_epi64( TMSG1, TMP ); TMSG1 = _mm256_sha512msg2_epi64( TMSG1, TMSG0 ); MSG = _mm256_permute4x64_epi64( MSG, 0x0E ); - STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG ); - TMSG3 = _mm256_sha512msg1_epi64( TMSG3, TMSG0 ); + STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, _mm256_castsi256_si128( MSG ) ); + TMSG3 = _mm256_sha512msg1_epi64( TMSG3, _mm256_castsi256_si128( TMSG0 ) ); // Rounds 20-23 MSG = _mm256_add_epi64( TMSG1, casti_m256i( K512, 5 ) ); - STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG ); - TMP = mm256_alignr_1x64( TMSG1, TMSG0 ); + STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, _mm256_castsi256_si128( MSG ) ); + TMP = mm256_alignr64( TMSG1, TMSG0, 1 ); TMSG2 = 
_mm256_add_epi64( TMSG2, TMP ); TMSG2 = _mm256_sha512msg2_epi64( TMSG2, TMSG1 ); MSG = _mm256_permute4x64_epi64( MSG, 0x0E ); - STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG ); - TMSG0 = _mm256_sha512msg1_epi64( TMSG0, TMSG1 ); + STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, _mm256_castsi256_si128( MSG ) ); + TMSG0 = _mm256_sha512msg1_epi64( TMSG0, _mm256_castsi256_si128( TMSG1 ) ); // Rounds 24-27 MSG = _mm256_add_epi64( TMSG2, casti_m256i( K512, 6 ) ); - STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG ); - TMP = mm256_alignr_1x64( TMSG2, TMSG1 ); + STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, _mm256_castsi256_si128( MSG ) ); + TMP = mm256_alignr64( TMSG2, TMSG1, 1 ); TMSG3 = _mm256_add_epi32( TMSG3, TMP ); TMSG3 = _mm256_sha512msg2_epi64( TMSG3, TMSG2 ); MSG = _mm256_permute4x64_epi64( MSG, 0x0E ); - STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG ); - TMSG1 = _mm256_sha512msg1_epi64( TMSG1, TMSG2 ); + STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, _mm256_castsi256_si128( MSG ) ); + TMSG1 = _mm256_sha512msg1_epi64( TMSG1, _mm256_castsi256_si128( TMSG2 ) ); // Rounds 28-31 MSG = _mm256_add_epi64( TMSG3, casti_m256i( K512, 7 ) ); - STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG) ; - TMP = mm256_alignr_1x64( TMSG3, TMSG2 ); + STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, _mm256_castsi256_si128( MSG ) ); + TMP = mm256_alignr64( TMSG3, TMSG2, 1 ); TMSG0 = _mm256_add_epi64( TMSG0, TMP ); TMSG0 = _mm256_sha512msg2_epi64( TMSG0, TMSG3 ); MSG = _mm256_permute4x64_epi64( MSG, 0x0E ); - STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG ); - TMSG2 = _mm256_sha512msg1_epi64( TMSG2, TMSG3 ); + STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, _mm256_castsi256_si128( MSG ) ); + TMSG2 = _mm256_sha512msg1_epi64( TMSG2, _mm256_castsi256_si128( TMSG3 ) ); // Rounds 32-35 MSG = _mm256_add_epi64( TMSG0, casti_m256i( K512, 8 ) ); - STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG ); - TMP = mm256_alignr_1x64( TMSG0, TMSG3 ); + STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, _mm256_castsi256_si128( MSG ) ); + TMP = mm256_alignr64( TMSG0, TMSG3, 1 ); TMSG1 = _mm256_add_epi64( TMSG1, TMP ); TMSG1 = _mm256_sha512msg2_epi64( TMSG1, TMSG0 ); MSG = _mm256_permute4x64_epi64( MSG, 0x0E ); - STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG ); - TMSG3 = _mm256_sha512msg1_epi64( TMSG3, TMSG0 ); + STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, _mm256_castsi256_si128( MSG ) ); + TMSG3 = _mm256_sha512msg1_epi64( TMSG3, _mm256_castsi256_si128( TMSG0 ) ); // Rounds 36-39 MSG = _mm256_add_epi64( TMSG1, casti_m256i( K512, 9 ) ); - STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG ); - TMP = mm256_alignr_1x64( TMSG1, TMSG0 ); + STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, _mm256_castsi256_si128( MSG ) ); + TMP = mm256_alignr64( TMSG1, TMSG0, 1 ); TMSG2 = _mm256_add_epi64( TMSG2, TMP ); TMSG2 = _mm256_sha512msg2_epi64( TMSG2, TMSG1 ); MSG = _mm256_permute4x64_epi64( MSG, 0x0E ); - STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG ); - TMSG0 = _mm256_sha512msg1_epi64( TMSG0, TMSG1 ); + STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, _mm256_castsi256_si128( MSG ) ); + TMSG0 = _mm256_sha512msg1_epi64( TMSG0, _mm256_castsi256_si128( TMSG1 ) ); // Rounds 40-43 MSG = _mm256_add_epi64( TMSG2, casti_m256i( K512, 10 ) ); - STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG ); - TMP = mm256_alignr_1x64( TMSG2, TMSG1 ); + STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, _mm256_castsi256_si128( MSG ) ); + TMP = mm256_alignr64( TMSG2, TMSG1, 1 
); TMSG3 = _mm256_add_epi64( TMSG3, TMP ); TMSG3 = _mm256_sha512msg2_epi64( TMSG3, TMSG2 ); MSG = _mm256_permute4x64_epi64( MSG, 0x0E ); - STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG ); - TMSG1 = _mm256_sha512msg1_epi64( TMSG1, TMSG2 ); + STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, _mm256_castsi256_si128( MSG ) ); + TMSG1 = _mm256_sha512msg1_epi64( TMSG1, _mm256_castsi256_si128( TMSG2 ) ); // Rounds 44-47 MSG = _mm256_add_epi64( TMSG3, casti_m256i( K512, 11 ) ); - STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG ); - TMP = mm256_alignr_1x64( TMSG3, TMSG2 ); + STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, _mm256_castsi256_si128( MSG ) ); + TMP = mm256_alignr64( TMSG3, TMSG2, 1 ); TMSG0 = _mm256_add_epi64( TMSG0, TMP ); TMSG0 = _mm256_sha512msg2_epi64( TMSG0, TMSG3 ); MSG = _mm256_permute4x64_epi64( MSG, 0x0E ); - STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG ); - TMSG2 = _mm256_sha512msg1_epi64( TMSG2, TMSG3 ); + STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, _mm256_castsi256_si128( MSG ) ); + TMSG2 = _mm256_sha512msg1_epi64( TMSG2, _mm256_castsi256_si128( TMSG3 ) ); // Rounds 48-51 MSG = _mm256_add_epi64( TMSG0, casti_m256i( K512, 12 ) ); - STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG ); - TMP = mm256_alignr_1x64( TMSG0, TMSG3 ); + STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, _mm256_castsi256_si128( MSG ) ); + TMP = mm256_alignr64( TMSG0, TMSG3, 1 ); TMSG1 = _mm256_add_epi64( TMSG1, TMP ); TMSG1 = _mm256_sha512msg2_epi64( TMSG1, TMSG0 ); MSG = _mm256_permute4x64_epi64( MSG, 0x0E ); - STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG ); - TMSG3 = _mm256_sha512msg1_epi64( TMSG3, TMSG0 ); + STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, _mm256_castsi256_si128( MSG ) ); + TMSG3 = _mm256_sha512msg1_epi64( TMSG3, _mm256_castsi256_si128( TMSG0 ) ); + + // Rounds 52-55 + MSG = _mm256_add_epi64( TMSG1, casti_m256i( K512, 13 ) ); + STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, _mm256_castsi256_si128( MSG ) ); + TMP = mm256_alignr64( TMSG1, TMSG0, 1 ); + TMSG2 = _mm256_add_epi64( TMSG2, TMP ); + TMSG2 = _mm256_sha512msg2_epi64( TMSG2, TMSG1 ); + MSG = _mm256_permute4x64_epi64( MSG, 0x0E ); + STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, _mm256_castsi256_si128( MSG ) ); + TMSG0 = _mm256_sha512msg1_epi64( TMSG0, _mm256_castsi256_si128( TMSG1 ) ); // Rounds 56-59 MSG = _mm256_add_epi64( TMSG2, casti_m256i( K512, 14 ) ); - STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG ); - TMP = mm256_alignr_1x64( TMSG2, TMSG1 ) ; + STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, _mm256_castsi256_si128( MSG ) ); + TMP = mm256_alignr64( TMSG2, TMSG1, 1 ); TMSG3 = _mm256_add_epi64( TMSG3, TMP ); TMSG3 = _mm256_sha512msg2_epi64( TMSG3, TMSG2 ); MSG = _mm256_permute4x64_epi64( MSG, 0x0E ); - STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG ); + STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, _mm256_castsi256_si128( MSG ) ); + TMSG1 = _mm256_sha512msg1_epi64( TMSG1, _mm256_castsi256_si128( TMSG2 ) ); // Rounds 60-63 MSG = _mm256_add_epi64( TMSG3, casti_m256i( K512, 15 ) ); - STATE1 = _mm256_sha512nds2_epi64( STATE1, STATE0, MSG ); + STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, _mm256_castsi256_si128( MSG ) ); + TMP = mm256_alignr64( TMSG3, TMSG2, 1 ); + TMSG0 = _mm256_add_epi64( TMSG0, TMP ); + TMSG0 = _mm256_sha512msg2_epi64( TMSG0, TMSG3 ); + MSG = _mm256_permute4x64_epi64( MSG, 0x0E ); + STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, _mm256_castsi256_si128( MSG ) ); + TMSG2 = _mm256_sha512msg1_epi64( TMSG2, 
_mm256_castsi256_si128( TMSG3 ) ); + + // Rounds 64-67 + MSG = _mm256_add_epi64( TMSG0, casti_m256i( K512, 16 ) ); + STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, _mm256_castsi256_si128( MSG ) ); + TMP = mm256_alignr64( TMSG0, TMSG3, 1 ); + TMSG1 = _mm256_add_epi64( TMSG1, TMP ); + TMSG1 = _mm256_sha512msg2_epi64( TMSG1, TMSG0 ); + MSG = _mm256_permute4x64_epi64( MSG, 0x0E ); + STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, _mm256_castsi256_si128( MSG ) ); + TMSG3 = _mm256_sha512msg1_epi64( TMSG3, _mm256_castsi256_si128( TMSG0 ) ); + + // Rounds 68-71 + MSG = _mm256_add_epi64( TMSG1, casti_m256i( K512, 17 ) ); + STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, _mm256_castsi256_si128( MSG ) ); + TMP = mm256_alignr64( TMSG1, TMSG0, 1 ); + TMSG2 = _mm256_add_epi64( TMSG2, TMP ); + TMSG2 = _mm256_sha512msg2_epi64( TMSG2, TMSG1 ); + MSG = _mm256_permute4x64_epi64( MSG, 0x0E ); + STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, _mm256_castsi256_si128( MSG ) ); + + // Rounds 72-75 + MSG = _mm256_add_epi64( TMSG2, casti_m256i( K512, 18 ) ); + STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, _mm256_castsi256_si128( MSG ) ); + TMP = mm256_alignr64( TMSG2, TMSG1, 1 ); + TMSG3 = _mm256_add_epi64( TMSG3, TMP ); + TMSG3 = _mm256_sha512msg2_epi64( TMSG3, TMSG2 ); + MSG = _mm256_permute4x64_epi64( MSG, 0x0E ); + STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, _mm256_castsi256_si128( MSG ) ); + + // Rounds 76-79 + MSG = _mm256_add_epi64( TMSG3, casti_m256i( K512, 19 ) ); + STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, _mm256_castsi256_si128( MSG ) ); MSG = _mm256_permute4x64_epi64( MSG, 0x0E ); - STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG ); + STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, _mm256_castsi256_si128( MSG ) ); // Add initial state STATE0 = _mm256_add_epi64( STATE0, ABEF_SAVE ); @@ -462,7 +572,7 @@ void sha512_opt_transform_le( uint64_t *state_out, const void *input, #endif -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) // SHA-512 8 way 64 bit @@ -664,8 +774,7 @@ void sha512_8x64_ctx( sha512_8x64_context *sc, void *dst, const void *data, mm256_ror_64( x, 61 ), \ _mm256_srli_epi64( x, 6 ) ) -#if defined(__AVX512VL__) -//TODO Enable for AVX10_256 +#if defined(VL256) // 4 way is not used whith AVX512 but will be whith AVX10_256 when it // becomes available. 
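Here and in several hunks below, ad hoc __AVX512VL__ tests are replaced by the VL256 gate this patch introduces (described in the simd-utils.h changes at the end of this patch), which marks AVX-512VL-class instructions applied to 256- and 128-bit vectors so the same paths can later be enabled for AVX10-256. A hedged sketch of the kind of construct these gates select, here a one-instruction three-way XOR with its plain AVX2 fallback, using the same 0x96 ternary-logic pattern the yespower hunks below rely on:

#include <immintrin.h>

// Sketch only, not taken from this file.
static inline __m256i xor3_256_sketch( __m256i a, __m256i b, __m256i c )
{
#if defined(VL256)   // AVX512VL today, AVX10-256 eventually
   return _mm256_ternarylogic_epi32( a, b, c, 0x96 );       // a ^ b ^ c in one instruction
#else                // plain AVX2
   return _mm256_xor_si256( a, _mm256_xor_si256( b, c ) );
#endif
}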
@@ -717,7 +826,7 @@ sha512_4x64_round( sha512_4x64_context *ctx, __m256i *in, __m256i r[8] ) int i; register __m256i A, B, C, D, E, F, G, H; -#if !defined(__AVX512VL__) +#if !defined(VL256) // Disable for AVX10_256 __m256i X_xor_Y, Y_xor_Z; #endif @@ -754,7 +863,7 @@ sha512_4x64_round( sha512_4x64_context *ctx, __m256i *in, __m256i r[8] ) H = v256_64( 0x5BE0CD19137E2179 ); } -#if !defined(__AVX512VL__) +#if !defined(VL256) // Disable for AVX10_256 Y_xor_Z = _mm256_xor_si256( B, C ); #endif diff --git a/algo/sha/sha512-hash.h b/algo/sha/sha512-hash.h index bb8d8665..0f8cda91 100644 --- a/algo/sha/sha512-hash.h +++ b/algo/sha/sha512-hash.h @@ -25,7 +25,7 @@ void sha512_opt_transform_le( uint64_t *state_out, const void *input, #endif -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) // SHA-512 8 way diff --git a/algo/sha/sha512256d-4way.c b/algo/sha/sha512256d-4way.c index 8d38da76..62edcac1 100644 --- a/algo/sha/sha512256d-4way.c +++ b/algo/sha/sha512256d-4way.c @@ -4,7 +4,7 @@ #include #include -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) #define SHA512256D_8WAY 1 #elif defined(__AVX2__) #define SHA512256D_4WAY 1 diff --git a/algo/shabal/shabal-hash-4way.c b/algo/shabal/shabal-hash-4way.c index d411389d..2bd4bddb 100644 --- a/algo/shabal/shabal-hash-4way.c +++ b/algo/shabal/shabal-hash-4way.c @@ -34,7 +34,7 @@ #include #include "shabal-hash-4way.h" -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) #define DECL_STATE16 \ __m512i A0, A1, A2, A3, A4, A5, A6, A7, \ diff --git a/algo/shabal/shabal-hash-4way.h b/algo/shabal/shabal-hash-4way.h index 81707021..fa51e3d1 100644 --- a/algo/shabal/shabal-hash-4way.h +++ b/algo/shabal/shabal-hash-4way.h @@ -8,7 +8,7 @@ #define SPH_SIZE_shabal512 512 -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) typedef struct { __m512i buf[16]; diff --git a/algo/shavite/shavite-hash-2way.c b/algo/shavite/shavite-hash-2way.c index 66288ed4..6f8a3db5 100644 --- a/algo/shavite/shavite-hash-2way.c +++ b/algo/shavite/shavite-hash-2way.c @@ -30,8 +30,7 @@ static const uint32_t IV512[] = #endif -#if defined (__AVX512VL__) -//TODO Enable for AVX10_256 +#if defined (VL256) #define DECL_m256i_count \ const __m256i count = \ diff --git a/algo/shavite/shavite-hash-4way.c b/algo/shavite/shavite-hash-4way.c index 9d3956db..cb8e7212 100644 --- a/algo/shavite/shavite-hash-4way.c +++ b/algo/shavite/shavite-hash-4way.c @@ -1,7 +1,7 @@ #include "shavite-hash-4way.h" #include -#if defined(__VAES__) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(__VAES__) && defined(SIMD512) static const uint32_t IV512[] = { diff --git a/algo/shavite/shavite-hash-4way.h b/algo/shavite/shavite-hash-4way.h index 10ff0957..1e55271b 100644 --- a/algo/shavite/shavite-hash-4way.h +++ b/algo/shavite/shavite-hash-4way.h @@ -1,10 +1,10 @@ #ifndef SHAVITE_HASH_4WAY_H__ #define SHAVITE_HASH_4WAY_H__ 1 -#if defined(__VAES__) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) - #include "simd-utils.h" +#if defined(__VAES__) && defined(SIMD512) + typedef struct { unsigned char buf[128<<2]; uint32_t h[16<<2]; diff --git a/algo/simd/simd-hash-2way.c b/algo/simd/simd-hash-2way.c index 5debceca..a4f9d4ff 100644 --- 
a/algo/simd/simd-hash-2way.c +++ b/algo/simd/simd-hash-2way.c @@ -803,8 +803,7 @@ static const m256_v16 FFT256_Twiddle[] = #define shufxor2w(x,s) XCAT(SHUFXOR_,s)(x) -#if defined(__AVX512VL__) -//TODO Enable for AVX10_256 +#if defined(VL256) #define REDUCE(x) \ _mm256_sub_epi16( _mm256_maskz_mov_epi8( 0x55555555, x ), \ @@ -1500,7 +1499,7 @@ int simd512_2way( void *hashval, const void *data, int datalen ) #endif -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) //////////////////////////////////// // diff --git a/algo/simd/simd-hash-2way.h b/algo/simd/simd-hash-2way.h index 2a94518c..9bb0709a 100644 --- a/algo/simd/simd-hash-2way.h +++ b/algo/simd/simd-hash-2way.h @@ -52,7 +52,7 @@ int simd512_2way( void *hashval, const void *data, int datalen ); #endif -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) typedef struct { diff --git a/algo/skein/skein-gate.h b/algo/skein/skein-gate.h index 1bdae7f4..96a15e85 100644 --- a/algo/skein/skein-gate.h +++ b/algo/skein/skein-gate.h @@ -3,7 +3,7 @@ #include #include "algo-gate-api.h" -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) #define SKEIN_8WAY 1 #elif defined(__AVX2__) #define SKEIN_4WAY 1 diff --git a/algo/skein/skein-hash-4way.c b/algo/skein/skein-hash-4way.c index 3a2327f8..33677ebd 100644 --- a/algo/skein/skein-hash-4way.c +++ b/algo/skein/skein-hash-4way.c @@ -298,7 +298,7 @@ static const uint64_t IV512[] = { sc->bcount = bcount; -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) #define TFBIG_KINIT_8WAY( k0, k1, k2, k3, k4, k5, k6, k7, k8, t0, t1, t2 ) \ k8 = mm512_xor3( mm512_xor3( k0, k1, k2 ), \ @@ -511,7 +511,7 @@ do { \ #endif // AVX2 -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) void skein256_8way_init( skein256_8way_context *sc ) { diff --git a/algo/skein/skein-hash-4way.h b/algo/skein/skein-hash-4way.h index cc1fb96a..f1f2d427 100644 --- a/algo/skein/skein-hash-4way.h +++ b/algo/skein/skein-hash-4way.h @@ -44,7 +44,7 @@ #include #include "simd-utils.h" -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) typedef struct { diff --git a/algo/swifftx/swifftx.c b/algo/swifftx/swifftx.c index 66d63886..7bd2d2a4 100644 --- a/algo/swifftx/swifftx.c +++ b/algo/swifftx/swifftx.c @@ -687,7 +687,7 @@ void FFT( const unsigned char input[EIGHTH_N], swift_int32_t *output ) #undef ADD_SUB -#if defined (__AVX512VL__) && defined(__AVX512BW__) +#if defined(VL256) #define Q_REDUCE( a ) \ _mm256_sub_epi32( _mm256_maskz_mov_epi8( 0x11111111, a ), \ @@ -1233,7 +1233,7 @@ void SWIFFTSum( const swift_int32_t *input, int m, unsigned char *output, swift_int32_t result[N] __attribute__ ((aligned (64))); register swift_int16_t carry = 0; -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) __m512i *res = (__m512i*)result; for ( j = 0; j < N/16; ++j ) diff --git a/algo/verthash/tiny_sha3/sha3-4way.c b/algo/verthash/tiny_sha3/sha3-4way.c index 1fada155..2aff1191 100644 --- a/algo/verthash/tiny_sha3/sha3-4way.c +++ b/algo/verthash/tiny_sha3/sha3-4way.c @@ -152,7 +152,7 @@ void *sha3_4way( const void *in, size_t inlen, void *md, int mdlen ) return md; } -#if 
defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) void sha3_8way_keccakf( __m512i st[25] ) { diff --git a/algo/verthash/tiny_sha3/sha3-4way.h b/algo/verthash/tiny_sha3/sha3-4way.h index 6723b73b..f1c3a92e 100644 --- a/algo/verthash/tiny_sha3/sha3-4way.h +++ b/algo/verthash/tiny_sha3/sha3-4way.h @@ -37,7 +37,7 @@ int sha3_4way_final( void *md, sha3_4way_ctx_t *c ); // digest goes to md void *sha3_4way( const void *in, size_t inlen, void *md, int mdlen ); -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) // state context typedef struct diff --git a/algo/x11/c11-gate.h b/algo/x11/c11-gate.h index 712e7873..81ecaf32 100644 --- a/algo/x11/c11-gate.h +++ b/algo/x11/c11-gate.h @@ -4,7 +4,7 @@ #include "algo-gate-api.h" #include -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) #define C11_8WAY 1 #elif defined(__AVX2__) && defined(__AES__) #define C11_4WAY 1 diff --git a/algo/x11/tribus-gate.h b/algo/x11/tribus-gate.h index fbcf61d2..01758628 100644 --- a/algo/x11/tribus-gate.h +++ b/algo/x11/tribus-gate.h @@ -4,7 +4,7 @@ #include "algo-gate-api.h" #include -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) #define TRIBUS_8WAY 1 #elif defined(__AVX2__) && defined(__AES__) #define TRIBUS_4WAY 1 diff --git a/algo/x11/x11-gate.h b/algo/x11/x11-gate.h index aed68370..c1afe3eb 100644 --- a/algo/x11/x11-gate.h +++ b/algo/x11/x11-gate.h @@ -4,7 +4,7 @@ #include "algo-gate-api.h" #include -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) #define X11_8WAY 1 #elif defined(__AVX2__) && defined(__AES__) #define X11_4WAY 1 diff --git a/algo/x11/x11gost-gate.h b/algo/x11/x11gost-gate.h index f24dbfc5..a6a820d5 100644 --- a/algo/x11/x11gost-gate.h +++ b/algo/x11/x11gost-gate.h @@ -4,7 +4,7 @@ #include "algo-gate-api.h" #include -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) #define X11GOST_8WAY 1 #elif defined(__AVX2__) && defined(__AES__) #define X11GOST_4WAY 1 diff --git a/algo/x12/x12-gate.h b/algo/x12/x12-gate.h index 998f09bb..5bdba0aa 100644 --- a/algo/x12/x12-gate.h +++ b/algo/x12/x12-gate.h @@ -4,7 +4,7 @@ #include "algo-gate-api.h" #include -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) #define X12_8WAY 1 #elif defined(__AVX2__) && defined(__AES__) #define X12_4WAY 1 diff --git a/algo/x13/phi1612-gate.h b/algo/x13/phi1612-gate.h index b4151919..11523e00 100644 --- a/algo/x13/phi1612-gate.h +++ b/algo/x13/phi1612-gate.h @@ -4,7 +4,7 @@ #include "algo-gate-api.h" #include -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) #define PHI1612_8WAY 1 #elif defined(__AVX2__) && defined(__AES__) #define PHI1612_4WAY 1 diff --git a/algo/x13/skunk-gate.h b/algo/x13/skunk-gate.h index e5ade93f..b1905c25 100644 --- a/algo/x13/skunk-gate.h +++ b/algo/x13/skunk-gate.h @@ -4,7 +4,7 @@ #include "algo-gate-api.h" #include -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) #define SKUNK_8WAY 1 #elif defined(__AVX2__) #define SKUNK_4WAY 1 diff --git a/algo/x13/x13-gate.h 
b/algo/x13/x13-gate.h index 6718eb37..f5fa4f6e 100644 --- a/algo/x13/x13-gate.h +++ b/algo/x13/x13-gate.h @@ -4,7 +4,7 @@ #include "algo-gate-api.h" #include -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) #define X13_8WAY 1 #elif defined(__AVX2__) && defined(__AES__) #define X13_4WAY 1 diff --git a/algo/x13/x13sm3-gate.h b/algo/x13/x13sm3-gate.h index fc6154ac..85d4cda7 100644 --- a/algo/x13/x13sm3-gate.h +++ b/algo/x13/x13sm3-gate.h @@ -26,7 +26,7 @@ void init_x13sm3_ctx(); #endif -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) #define X13BCD_8WAY 1 #elif defined(__AVX2__) && defined(__AES__) #define X13BCD_4WAY 1 diff --git a/algo/x14/x14-gate.h b/algo/x14/x14-gate.h index 97f4800d..09065c96 100644 --- a/algo/x14/x14-gate.h +++ b/algo/x14/x14-gate.h @@ -4,7 +4,7 @@ #include "algo-gate-api.h" #include -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) #define X14_8WAY 1 #elif defined(__AVX2__) && defined(__AES__) #define X14_4WAY 1 diff --git a/algo/x15/x15-gate.h b/algo/x15/x15-gate.h index 44568c28..e5e1174c 100644 --- a/algo/x15/x15-gate.h +++ b/algo/x15/x15-gate.h @@ -4,7 +4,7 @@ #include "algo-gate-api.h" #include -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) #define X15_8WAY 1 #elif defined(__AVX2__) && defined(__AES__) #define X15_4WAY 1 diff --git a/algo/x16/x16r-gate.h b/algo/x16/x16r-gate.h index 146338d1..b5bc1209 100644 --- a/algo/x16/x16r-gate.h +++ b/algo/x16/x16r-gate.h @@ -51,7 +51,7 @@ #endif // X16R, X16S -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) #define X16R_8WAY 1 #elif defined(__AVX2__) && defined(__AES__) #define X16R_4WAY 1 @@ -59,7 +59,7 @@ #define X16R_2WAY 1 #endif -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) #define X16RV2_8WAY 1 #elif defined(__AVX2__) && defined(__AES__) #define X16RV2_4WAY 1 @@ -68,7 +68,7 @@ #endif // X16RT, VEIL -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) #define X16RT_8WAY 1 #elif defined(__AVX2__) && defined(__AES__) #define X16RT_4WAY 1 @@ -76,7 +76,7 @@ #define X16RT_2WAY 1 #endif -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) #define X21S_8WAY 1 #elif defined(__AVX2__) && defined(__AES__) #define X21S_4WAY 1 diff --git a/algo/x16/x20r.c b/algo/x16/x20r.c index b6c63966..5d8b60c2 100644 --- a/algo/x16/x20r.c +++ b/algo/x16/x20r.c @@ -22,7 +22,7 @@ #include "algo/sha/sph_sha2.h" #include "x16r-gate.h" -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) #define X20R_8WAY 1 #elif defined(__AVX2__) && defined(__AES__) #define X20R_4WAY 1 diff --git a/algo/x17/sonoa-gate.h b/algo/x17/sonoa-gate.h index 997bff18..d431b2e1 100644 --- a/algo/x17/sonoa-gate.h +++ b/algo/x17/sonoa-gate.h @@ -4,7 +4,7 @@ #include "algo-gate-api.h" #include -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) #define SONOA_8WAY 1 #elif defined(__AVX2__) && defined(__AES__) #define SONOA_4WAY 1 diff --git 
a/algo/x17/x17-gate.h b/algo/x17/x17-gate.h index 4a5b035a..6bc030c6 100644 --- a/algo/x17/x17-gate.h +++ b/algo/x17/x17-gate.h @@ -4,7 +4,7 @@ #include "algo-gate-api.h" #include -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) #define X17_8WAY 1 // #define X17_16X32 1 #elif defined(__AVX2__) && defined(__AES__) diff --git a/algo/x17/xevan-gate.h b/algo/x17/xevan-gate.h index 8ef9a2e6..eccd433f 100644 --- a/algo/x17/xevan-gate.h +++ b/algo/x17/xevan-gate.h @@ -4,7 +4,7 @@ #include "algo-gate-api.h" #include -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) #define XEVAN_8WAY 1 #elif defined(__AVX2__) && defined(__AES__) #define XEVAN_4WAY 1 diff --git a/algo/x22/x22i-gate.c b/algo/x22/x22i-gate.c index b8e087f1..432cf949 100644 --- a/algo/x22/x22i-gate.c +++ b/algo/x22/x22i-gate.c @@ -32,7 +32,7 @@ bool register_x22i_algo( algo_gate_t* gate ) #endif gate->optimizations = SSE2_OPT | SSE42_OPT | AES_OPT | AVX2_OPT | SHA_OPT - | AVX512_OPT | VAES_OPT; + | AVX512_OPT | VAES_OPT | NEON_OPT; return true; }; @@ -49,7 +49,7 @@ bool register_x25x_algo( algo_gate_t* gate ) gate->hash = (void*)&x25x_hash; #endif gate->optimizations = SSE2_OPT | SSE42_OPT | AES_OPT | AVX2_OPT | SHA_OPT | - AVX512_OPT | VAES_OPT; + AVX512_OPT | VAES_OPT | NEON_OPT; InitializeSWIFFTX(); return true; }; diff --git a/algo/x22/x22i-gate.h b/algo/x22/x22i-gate.h index 4dc1bf24..17b7999d 100644 --- a/algo/x22/x22i-gate.h +++ b/algo/x22/x22i-gate.h @@ -7,7 +7,7 @@ #include #include "algo/swifftx/swifftx.h" -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) #define X22I_8WAY 1 #elif defined(__AVX2__) && defined(__AES__) #define X22I_4WAY 1 @@ -50,7 +50,7 @@ int scanhash_x22i( struct work *work, uint32_t max_nonce, #endif -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) #define X25X_8WAY 1 #elif defined(__AVX2__) && defined(__AES__) #define X25X_4WAY 1 diff --git a/algo/yespower/yespower-blake2b.c b/algo/yespower/yespower-blake2b.c index 41dec41b..a5b34928 100644 --- a/algo/yespower/yespower-blake2b.c +++ b/algo/yespower/yespower-blake2b.c @@ -259,7 +259,7 @@ static inline void salsa20_simd_unshuffle(const salsa20_blk_t *Bin, #define WRITE_X(out) \ (out).q[0] = X0; (out).q[1] = X1; (out).q[2] = X2; (out).q[3] = X3; -#if defined(__AVX512VL__) +#if defined(VL256) #define ARX(out, in1, in2, s) \ out = _mm_xor_si128(out, _mm_rol_epi32(_mm_add_epi32(in1, in2), s)); diff --git a/algo/yespower/yespower-opt.c b/algo/yespower/yespower-opt.c index 9315baa4..dcfbe0f2 100644 --- a/algo/yespower/yespower-opt.c +++ b/algo/yespower/yespower-opt.c @@ -93,12 +93,12 @@ typedef union #if defined(__AVX2__) __m256i m256[2]; #endif -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(YESPOWER_USE_AVX512) && defined(SIMD512) __m512i m512; #endif } salsa20_blk_t; -#if defined(YESPOWER_USE_AVX512) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(YESPOWER_USE_AVX512) && defined(SIMD512) // Slow static const __m512i simd_shuffle_index = @@ -114,7 +114,7 @@ static const __m512i simd_unshuffle_index = #elif defined(__AVX2__) -#if defined(__AVX512VL__) +#if defined(VL256) // alternative when not using 512 bit vectors static const __m256i 
simd_shuffle_index = @@ -138,13 +138,13 @@ static const __m256i simd_shuffle_index = static inline void salsa20_simd_shuffle(const salsa20_blk_t *Bin, salsa20_blk_t *Bout) { -#if defined(YESPOWER_USE_AVX512) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(YESPOWER_USE_AVX512) && defined(SIMD512) Bout->m512 = _mm512_permutexvar_epi32( simd_shuffle_index, Bin->m512 ); #elif defined(__AVX2__) -#if defined(__AVX512VL__) +#if defined(VL256) Bout->m256[0] = _mm256_permutex2var_epi32( Bin->m256[0], simd_shuffle_index, Bin->m256[1] ); @@ -193,13 +193,13 @@ static inline void salsa20_simd_shuffle(const salsa20_blk_t *Bin, static inline void salsa20_simd_unshuffle(const salsa20_blk_t *Bin, salsa20_blk_t *Bout) { -#if defined(YESPOWER_USE_AVX512) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(YESPOWER_USE_AVX512) && defined(SIMD512) Bout->m512 = _mm512_permutexvar_epi32( simd_unshuffle_index, Bin->m512 ); #elif defined(__AVX2__) -#if defined(__AVX512VL__) +#if defined(VL256) Bout->m256[0] = _mm256_permutex2var_epi32( Bin->m256[0], simd_unshuffle_index, Bin->m256[1] ); @@ -318,7 +318,7 @@ static inline void salsa20_simd_unshuffle(const salsa20_blk_t *Bin, // AVX512 ternary logic optimization -#if defined(__AVX512VL__) +#if defined(VL256) #define XOR_X_XOR_X( in1, in2 ) \ X0 = _mm_ternarylogic_epi32( X0, (in1).m128[0], (in2).m128[0], 0x96 ); \ @@ -335,7 +335,7 @@ static inline void salsa20_simd_unshuffle(const salsa20_blk_t *Bin, #endif // General vectored optimizations -#if defined(YESPOWER_USE_AVX512) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(YESPOWER_USE_AVX512) && defined(SIMD512) #define READ_X( in ) \ X.m512 = (in).m512; @@ -379,7 +379,7 @@ static inline void salsa20_simd_unshuffle(const salsa20_blk_t *Bin, X.m256[0] = (in).m256[0]; \ X.m256[1] = (in).m256[1]; -#if defined(__AVX512VL__) +#if defined(VL256) #define XOR_X_2_XOR_X( in1, in2, in3 ) \ X.m256[0] = _mm256_ternarylogic_epi32( (in1).m256[0], (in2).m256[0], \ diff --git a/armbuild-all.sh b/armbuild-all.sh index 328fd71c..61690aac 100755 --- a/armbuild-all.sh +++ b/armbuild-all.sh @@ -4,11 +4,37 @@ # during develpment. However the information contained may provide compilation # tips to users. 
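armbuild-all.sh now also builds armv9, SVE2 and plain armv8+crypto targets (armv9 needs gcc-13). A quick, hedged way to confirm what a given -march string actually enables at compile time is to test the same ACLE predefined macros that cpu-miner.c checks (or stubs out) later in this patch:

#include <stdio.h>

// Sketch: compile with the CFLAGS of interest and run. The SHA3 macro is
// what this miner uses to gate its AArch64 SHA-512 path.
int main( void )
{
#if defined(__ARM_NEON)
   puts( "NEON" );
#endif
#if defined(__ARM_FEATURE_AES)
   puts( "AES" );
#endif
#if defined(__ARM_FEATURE_SHA2)
   puts( "SHA2 (SHA-256)" );
#endif
#if defined(__ARM_FEATURE_SHA3)
   puts( "SHA3 (SHA-512 path)" );
#endif
#if defined(__ARM_FEATURE_SVE2)
   puts( "SVE2" );
#endif
   return 0;
}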
-rm cpuminer cpuminer-armv8 cpuminer-armv8-aes cpuminer-armv8-sha2 cpuminer-armv8-aes-sha2 cpuminer-avx512-sha-vaes cpuminer-avx512 cpuminer-avx2-sha cpuminer-avx2-sha-vaes cpuminer-avx2 cpuminer-avx cpuminer-aes-sse42 cpuminer-sse42 cpuminer-ssse3 cpuminer-sse2 cpuminer-zen cpuminer-zen3 cpuminer-zen4 cpuminer-alderlake cpuminer-x64 > /dev/null +rm cpuminer cpuminer-armv9-aes-sha3 cpuminer-armv9-aes-sha3-sve2 cpuminer-armv8.2-aes-sha3-sve2 cpuminer-armv8-aes-sha2-sve2 cpuminer-armv8 cpuminer-armv8-crypto cpuminer-armv8-aes cpuminer-armv8-sha2 cpuminer-armv8-aes-sha2 cpuminer-avx512-sha-vaes cpuminer-avx512 cpuminer-avx2-sha cpuminer-avx2-sha-vaes cpuminer-avx2 cpuminer-avx cpuminer-aes-sse42 cpuminer-sse42 cpuminer-ssse3 cpuminer-sse2 cpuminer-zen cpuminer-zen3 cpuminer-zen4 cpuminer-alderlake cpuminer-x64 > /dev/null + +# armv9 needs gcc-13 make distclean || echo clean rm -f config.status ./autogen.sh || echo done +CFLAGS="-O3 -march=armv9-a+crypto+sha3+aes -Wall -flax-vector-conversions" ./configure --with-curl +make -j $(nproc) +strip -s cpuminer +mv cpuminer cpuminer-armv9-aes-sha3 + +make clean || echo clean +CFLAGS="-O3 -march=armv9-a+crypto+sha3+aes+sve2 -Wall -flax-vector-conversions" ./configure --with-curl +make -j $(nproc) +strip -s cpuminer +mv cpuminer cpuminer-armv9-aes-sha3-sve2 + +make clean || echo clean +CFLAGS="-O3 -march=armv8.2-a+crypto+sha3+aes+sve2 -Wall -flax-vector-conversions" ./configure --with-curl +make -j $(nproc) +strip -s cpuminer +mv cpuminer cpuminer-armv8.2-aes-sha3-sve2 + +make clean || echo clean +CFLAGS="-O3 -march=armv8-a+crypto+sha2+aes+sve2 -Wall -flax-vector-conversions" ./configure --with-curl +make -j $(nproc) +strip -s cpuminer +mv cpuminer cpuminer-armv8-aes-sha2-sve2 + +make clean || echo clean CFLAGS="-O3 -march=armv8-a+crypto+sha2+aes -Wall -flax-vector-conversions" ./configure --with-curl make -j $(nproc) strip -s cpuminer @@ -28,6 +54,13 @@ make -j $(nproc) strip -s cpuminer mv cpuminer cpuminer-armv8-aes +make clean || echo clean +rm -f config.status +CFLAGS="-O3 -march=armv8-a+crypto -Wall -flax-vector-conversions" ./configure --with-curl +make -j $(nproc) +strip -s cpuminer +mv cpuminer cpuminer-armv8-crypto + make clean || echo clean rm -f config.status CFLAGS="-O3 -march=armv8-a -Wall -flax-vector-conversions" ./configure --with-curl diff --git a/build-allarch.sh b/build-allarch.sh index 7fbebc27..28f66ba9 100755 --- a/build-allarch.sh +++ b/build-allarch.sh @@ -4,7 +4,7 @@ # during develpment. However the information contained may provide compilation # tips to users. 
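The x86_64 build list that follows gains an Alderlake target plus commented-out Arrowlake and Zen5 recipes (both need gcc-14), and the Zen4 build keeps spelling out the AVX-512 sub-features explicitly because -march=znver4 needs gcc-12.3. Since the new SIMD512 gate (see the simd-utils.h changes at the end of this patch) requires AVX512F, VL, DQ and BW together, a build could guard against an incomplete flag set with something like this hedged sketch (WANT_SIMD512 is a hypothetical toggle, not part of this patch):

#if defined(WANT_SIMD512) && \
    !( defined(__AVX10_1_512__) || \
       ( defined(__AVX512F__) && defined(__AVX512VL__) && \
         defined(__AVX512DQ__) && defined(__AVX512BW__) ) )
   #error "CFLAGS do not enable AVX512 F/VL/DQ/BW (or AVX10.1-512)"
#endif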
-rm cpuminer-avx512-sha-vaes cpuminer-avx512 cpuminer-avx2 cpuminer-avx cpuminer-aes-sse42 cpuminer-sse42 cpuminer-ssse3 cpuminer-sse2 cpuminer-zen cpuminer-zen3 cpuminer-zen4 cpuminer-alderlake cpuminer-x64 cpuminer-armv8 cpuminer-armv8-aes cpuminer-armv8-sha2 cpuminer-armv8-aes-sha2 > /dev/null +rm cpuminer-avx512-sha-vaes cpuminer-avx512 cpuminer-avx2 cpuminer-avx cpuminer-aes-sse42 cpuminer-sse42 cpuminer-ssse3 cpuminer-sse2 cpuminer-zen cpuminer-zen3 cpuminer-zen4 cpuminer-zen5 cpuminer-alderlake cpuminer-x64 cpuminer-armv8 cpuminer-armv8-aes cpuminer-armv8-sha2 cpuminer-armv8-aes-sha2 > /dev/null # AVX512 SHA VAES: Intel Core Icelake, Rocketlake make distclean || echo clean @@ -17,20 +17,35 @@ make -j $(nproc) strip -s cpuminer mv cpuminer cpuminer-avx512-sha-vaes -# AVX256 SHA VAES: Intel Core Alderlake, needs gcc-12 +# Intel Core Alderlake: AVX2 SHA VAES, needs gcc-12 +make clean || echo clean +rm -f config.status +CFLAGS="-O3 -march=alderlake -Wall" ./configure --with-curl +make -j 8 +strip -s cpuminer +mv cpuminer cpuminer-alderlake + +# Intel Core Arrowlake: AVX2 SHA512 VAES, needs gcc-14 #make clean || echo clean #rm -f config.status -#./autogen.sh || echo done -#CFLAGS="-O3 -march=alderlake -Wall" ./configure --with-curl +#CFLAGS="-O3 -march=arrowlake-s -Wall" ./configure --with-curl #make -j 8 #strip -s cpuminer -#mv cpuminer cpuminer-alderlake +#mv cpuminer cpuminer-arrowlake + +# Zen5: AVX512 SHA VAES, requires gcc-14. +#make clean || echo clean +#rm -f config.status +#CFLAGS="-O3 -march=znver5" ./configure --with-curl +#make -j $(nproc) +#strip -s cpuminer +#mv cpuminer cpuminer-zen4 -# Zen4 AVX512 SHA VAES +# Zen4: AVX512 SHA VAES make clean || echo clean rm -f config.status # znver3 needs gcc-11, znver4 needs gcc-12.3. -#CFLAGS="-O3 -march=znver4" ./configure --with-curl +#CFLAGS="-O3 -march=znver4 -Wall" ./configure --with-curl # Inclomplete list of Zen4 AVX512 extensions but includes all extensions used by cpuminer. CFLAGS="-O3 -march=znver3 -mavx512f -mavx512cd -mavx512dq -mavx512bw -mavx512vl -mavx512vbmi -mavx512vbmi2 -mavx512bitalg -mavx512vpopcntdq -Wall" ./configure --with-curl #CFLAGS="-O3 -march=znver2 -mvaes -mavx512f -mavx512dq -mavx512bw -mavx512vl -mavx512vbmi -Wall" ./configure --with-curl @@ -55,7 +70,7 @@ make -j $(nproc) strip -s cpuminer mv cpuminer cpuminer-avx512 -# AVX2 SHA VAES: Intel Alderlake, AMD Zen3 +# AVX2 SHA VAES: generic make clean || echo done rm -f config.status # vaes doesn't include aes diff --git a/clean-all.sh b/clean-all.sh index 8364e408..98979c28 100755 --- a/clean-all.sh +++ b/clean-all.sh @@ -2,7 +2,7 @@ # # make clean and rm all the targetted executables. 
-rm cpuminer-avx512-sha-vaes cpuminer-avx512 cpuminer-avx2 cpuminer-avx cpuminer-aes-sse42 cpuminer-sse2 cpuminer-avx2-sha cpuminer-sse42 cpuminer-ssse3 cpuminer-avx2-sha-vaes cpuminer-zen3 cpuminer-zen4 cpuminer-x64 cpuminer-armv8 cpuminer-armv8-aes cpuminer-armv8-aes-sha2 cpuminer-armv8-sha2 > /dev/null +rm cpuminer-avx512-sha-vaes cpuminer-alderlake cpuminer-avx512 cpuminer-avx2 cpuminer-avx cpuminer-aes-sse42 cpuminer-sse2 cpuminer-avx2-sha cpuminer-sse42 cpuminer-ssse3 cpuminer-avx2-sha-vaes cpuminer-zen3 cpuminer-zen4 cpuminer-x64 cpuminer-armv9-aes-sha3 cpuminer-armv9-aes-sha3-sve2 cpuminer-armv8.2-aes-sha3-sve2 cpuminer-armv8-aes-sha3-sve2 cpuminer-armv8-aes-sha2-sve2 cpuminer-armv8-crypto cpuminer-armv8 cpuminer-armv8-aes cpuminer-armv8-aes-sha3 cpuminer-armv8-aes-sha2 cpuminer-armv8-sha2 > /dev/null rm cpuminer-avx512-sha-vaes.exe cpuminer-avx512-sha.exe cpuminer-avx512.exe cpuminer-avx2.exe cpuminer-avx.exe cpuminer-aes-sse42.exe cpuminer-sse2.exe cpuminer-avx2-sha.exe cpuminer-sse42.exe cpuminer-ssse3.exe cpuminer-avx2-sha-vaes.exe cpuminer-zen3.exe cpuminer-zen4.exe cpuminer-x64.exe > /dev/null diff --git a/configure b/configure index dd6ce7fc..b7f7c371 100755 --- a/configure +++ b/configure @@ -1,6 +1,6 @@ #! /bin/sh # Guess values for system-dependent variables and create Makefiles. -# Generated by GNU Autoconf 2.71 for cpuminer-opt 24.1. +# Generated by GNU Autoconf 2.71 for cpuminer-opt 24.2. # # # Copyright (C) 1992-1996, 1998-2017, 2020-2021 Free Software Foundation, @@ -608,8 +608,8 @@ MAKEFLAGS= # Identity of this package. PACKAGE_NAME='cpuminer-opt' PACKAGE_TARNAME='cpuminer-opt' -PACKAGE_VERSION='24.1' -PACKAGE_STRING='cpuminer-opt 24.1' +PACKAGE_VERSION='24.2' +PACKAGE_STRING='cpuminer-opt 24.2' PACKAGE_BUGREPORT='' PACKAGE_URL='' @@ -1360,7 +1360,7 @@ if test "$ac_init_help" = "long"; then # Omit some internal or obsolete options to make the list less imposing. # This message is too long to be a string in the A/UX 3.1 sh. cat <<_ACEOF -\`configure' configures cpuminer-opt 24.1 to adapt to many kinds of systems. +\`configure' configures cpuminer-opt 24.2 to adapt to many kinds of systems. Usage: $0 [OPTION]... [VAR=VALUE]... @@ -1432,7 +1432,7 @@ fi if test -n "$ac_init_help"; then case $ac_init_help in - short | recursive ) echo "Configuration of cpuminer-opt 24.1:";; + short | recursive ) echo "Configuration of cpuminer-opt 24.2:";; esac cat <<\_ACEOF @@ -1538,7 +1538,7 @@ fi test -n "$ac_init_help" && exit $ac_status if $ac_init_version; then cat <<\_ACEOF -cpuminer-opt configure 24.1 +cpuminer-opt configure 24.2 generated by GNU Autoconf 2.71 Copyright (C) 2021 Free Software Foundation, Inc. @@ -1985,7 +1985,7 @@ cat >config.log <<_ACEOF This file contains any messages produced by compilers while running configure, to aid debugging if configure makes a mistake. -It was created by cpuminer-opt $as_me 24.1, which was +It was created by cpuminer-opt $as_me 24.2, which was generated by GNU Autoconf 2.71. Invocation command line was $ $0$ac_configure_args_raw @@ -3593,7 +3593,7 @@ fi # Define the identity of the package. PACKAGE='cpuminer-opt' - VERSION='24.1' + VERSION='24.2' printf "%s\n" "#define PACKAGE \"$PACKAGE\"" >>confdefs.h @@ -7508,7 +7508,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 # report actual input values of CONFIG_FILES etc. instead of their # values after options handling. ac_log=" -This file was extended by cpuminer-opt $as_me 24.1, which was +This file was extended by cpuminer-opt $as_me 24.2, which was generated by GNU Autoconf 2.71. 
Invocation command line was CONFIG_FILES = $CONFIG_FILES @@ -7576,7 +7576,7 @@ ac_cs_config_escaped=`printf "%s\n" "$ac_cs_config" | sed "s/^ //; s/'/'\\\\\\\\ cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 ac_cs_config='$ac_cs_config_escaped' ac_cs_version="\\ -cpuminer-opt config.status 24.1 +cpuminer-opt config.status 24.2 configured by $0, generated by GNU Autoconf 2.71, with options \\"\$ac_cs_config\\" diff --git a/configure.ac b/configure.ac index 09fbbdb2..ff354444 100644 --- a/configure.ac +++ b/configure.ac @@ -1,4 +1,4 @@ -AC_INIT([cpuminer-opt], [24.1]) +AC_INIT([cpuminer-opt], [24.2]) AC_PREREQ([2.59c]) AC_CANONICAL_SYSTEM diff --git a/configure~ b/configure~ index dab87859..dd6ce7fc 100755 --- a/configure~ +++ b/configure~ @@ -1,6 +1,6 @@ #! /bin/sh # Guess values for system-dependent variables and create Makefiles. -# Generated by GNU Autoconf 2.71 for cpuminer-opt 23.16. +# Generated by GNU Autoconf 2.71 for cpuminer-opt 24.1. # # # Copyright (C) 1992-1996, 1998-2017, 2020-2021 Free Software Foundation, @@ -608,8 +608,8 @@ MAKEFLAGS= # Identity of this package. PACKAGE_NAME='cpuminer-opt' PACKAGE_TARNAME='cpuminer-opt' -PACKAGE_VERSION='23.16' -PACKAGE_STRING='cpuminer-opt 23.16' +PACKAGE_VERSION='24.1' +PACKAGE_STRING='cpuminer-opt 24.1' PACKAGE_BUGREPORT='' PACKAGE_URL='' @@ -1360,7 +1360,7 @@ if test "$ac_init_help" = "long"; then # Omit some internal or obsolete options to make the list less imposing. # This message is too long to be a string in the A/UX 3.1 sh. cat <<_ACEOF -\`configure' configures cpuminer-opt 23.16 to adapt to many kinds of systems. +\`configure' configures cpuminer-opt 24.1 to adapt to many kinds of systems. Usage: $0 [OPTION]... [VAR=VALUE]... @@ -1432,7 +1432,7 @@ fi if test -n "$ac_init_help"; then case $ac_init_help in - short | recursive ) echo "Configuration of cpuminer-opt 23.16:";; + short | recursive ) echo "Configuration of cpuminer-opt 24.1:";; esac cat <<\_ACEOF @@ -1538,7 +1538,7 @@ fi test -n "$ac_init_help" && exit $ac_status if $ac_init_version; then cat <<\_ACEOF -cpuminer-opt configure 23.16 +cpuminer-opt configure 24.1 generated by GNU Autoconf 2.71 Copyright (C) 2021 Free Software Foundation, Inc. @@ -1985,7 +1985,7 @@ cat >config.log <<_ACEOF This file contains any messages produced by compilers while running configure, to aid debugging if configure makes a mistake. -It was created by cpuminer-opt $as_me 23.16, which was +It was created by cpuminer-opt $as_me 24.1, which was generated by GNU Autoconf 2.71. Invocation command line was $ $0$ac_configure_args_raw @@ -3593,7 +3593,7 @@ fi # Define the identity of the package. PACKAGE='cpuminer-opt' - VERSION='23.16' + VERSION='24.1' printf "%s\n" "#define PACKAGE \"$PACKAGE\"" >>confdefs.h @@ -7508,7 +7508,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 # report actual input values of CONFIG_FILES etc. instead of their # values after options handling. ac_log=" -This file was extended by cpuminer-opt $as_me 23.16, which was +This file was extended by cpuminer-opt $as_me 24.1, which was generated by GNU Autoconf 2.71. 
Invocation command line was CONFIG_FILES = $CONFIG_FILES @@ -7576,7 +7576,7 @@ ac_cs_config_escaped=`printf "%s\n" "$ac_cs_config" | sed "s/^ //; s/'/'\\\\\\\\ cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 ac_cs_config='$ac_cs_config_escaped' ac_cs_version="\\ -cpuminer-opt config.status 23.16 +cpuminer-opt config.status 24.1 configured by $0, generated by GNU Autoconf 2.71, with options \\"\$ac_cs_config\\" diff --git a/cpu-miner.c b/cpu-miner.c index ad7cc03e..7ccbab0f 100644 --- a/cpu-miner.c +++ b/cpu-miner.c @@ -2852,12 +2852,14 @@ static bool cpu_capability( bool display_only ) bool cpu_has_avx10 = has_avx10(); bool cpu_has_aes = has_aes_ni(); // x86_64 or AArch64 AES bool cpu_has_vaes = has_vaes(); - bool cpu_has_sha = has_sha(); // x86_64 or AArch64 + bool cpu_has_sha256 = has_sha(); // x86_64 or AArch64 bool cpu_has_sha512 = has_sha512(); bool sw_has_x86_64 = false; bool sw_has_aarch64 = false; - int sw_arm_arch = 0; - bool sw_has_neon = false; + int sw_arm_arch = 0; // AArch64 + bool sw_has_neon = false; // AArch64 +// bool sw_has_sve = false; // AArch64 +// bool sw_has_sve2 = false; // AArch64 bool sw_has_sse2 = false; // x86_64 bool sw_has_ssse3 = false; // x86_64 bool sw_has_sse41 = false; // x86_64 @@ -2865,9 +2867,11 @@ static bool cpu_capability( bool display_only ) bool sw_has_avx = false; bool sw_has_avx2 = false; bool sw_has_avx512 = false; + bool sw_has_avx10_256 = false; + bool sw_has_avx10_512 = false; bool sw_has_aes = false; bool sw_has_vaes = false; - bool sw_has_sha = false; // x86_64 or AArch64 SHA2 + bool sw_has_sha256 = false; // x86_64 or AArch64 SHA2 bool sw_has_sha512 = false; // x86_64 or AArch64 SHA3 set_t algo_features = algo_gate.optimizations; bool algo_has_sse2 = set_incl( SSE2_OPT, algo_features ); @@ -2877,7 +2881,7 @@ static bool cpu_capability( bool display_only ) bool algo_has_avx512 = set_incl( AVX512_OPT, algo_features ); bool algo_has_aes = set_incl( AES_OPT, algo_features ); bool algo_has_vaes = set_incl( VAES_OPT, algo_features ); - bool algo_has_sha = set_incl( SHA_OPT, algo_features ); + bool algo_has_sha256 = set_incl( SHA_OPT, algo_features ); bool algo_has_sha512 = set_incl( SHA512_OPT, algo_features ); bool algo_has_neon = set_incl( NEON_OPT, algo_features ); bool use_sse2; @@ -2887,7 +2891,7 @@ static bool cpu_capability( bool display_only ) bool use_avx512; bool use_aes; bool use_vaes; - bool use_sha; + bool use_sha256; bool use_sha512; bool use_neon; bool use_none; @@ -2925,6 +2929,13 @@ static bool cpu_capability( bool display_only ) #if (defined(__AVX512F__) && defined(__AVX512DQ__) && defined(__AVX512BW__) && defined(__AVX512VL__)) sw_has_avx512 = true; #endif + #if defined(__AVX10_1_256__) + sw_has_avx10_256 = true; + #endif + #if defined(__AVX10_1_512__) + sw_has_avx10_512 = true; + #endif + #if defined(__AES__) || defined(__ARM_FEATURE_AES) sw_has_aes = true; #endif @@ -2932,16 +2943,21 @@ static bool cpu_capability( bool display_only ) sw_has_vaes = true; #endif #if defined(__SHA__) || defined(__ARM_FEATURE_SHA2) - sw_has_sha = true; + sw_has_sha256 = true; #endif - #if defined(__SHA512__) || defined(____ARM_FEATURE_SHA3) + #if defined(__SHA512__) || defined(__ARM_FEATURE_SHA3) sw_has_sha512 = true; #endif #if defined(__ARM_NEON) sw_has_neon = true; #endif +// #if defined(__ARM_FEATURE_SVE) +// sw_has_sve = true; +// #endif +// #if defined(__ARM_FEATURE_SVE2) +// sw_has_sve2 = true; +// #endif - cpu_brand_string( cpu_brand ); printf( "CPU: %s\n", cpu_brand ); @@ -2983,7 +2999,7 @@ static bool cpu_capability( bool 
display_only ) if ( cpu_has_vaes ) printf( " VAES" ); else if ( cpu_has_aes ) printf( " AES" ); if ( cpu_has_sha512 ) printf( " SHA512" ); - else if ( cpu_has_sha ) printf( " SHA256" ); + else if ( cpu_has_sha256 ) printf( " SHA256" ); if ( cpu_has_avx10 ) printf( " AVX10.%d-%d", avx10_version(), avx10_vector_length() ); @@ -2998,17 +3014,22 @@ static bool cpu_capability( bool display_only ) else if ( sw_has_sse41 ) printf( " SSE4.1" ); else if ( sw_has_ssse3 ) printf( " SSSE3 " ); else if ( sw_has_sse2 ) printf( " SSE2 " ); + if ( sw_has_avx10_512 ) printf( " AVX10-512" ); + else if ( sw_has_avx10_256 ) printf( " AVX10-256" ); } else if ( sw_has_aarch64 ) { printf( " AArch64" ); if ( sw_arm_arch ) printf( " armv%d", sw_arm_arch ); if ( sw_has_neon ) printf( " NEON" ); +// if ( sw_has_sve ) printf( " SVE" ); +// else if ( sw_has_sve2 ) printf( " SVE2" ); + } if ( sw_has_vaes ) printf( " VAES" ); else if ( sw_has_aes ) printf( " AES" ); if ( sw_has_sha512 ) printf( " SHA512" ); - else if ( sw_has_sha ) printf( " SHA256" ); + else if ( sw_has_sha256 ) printf( " SHA256" ); if ( !display_only ) { @@ -3024,7 +3045,7 @@ static bool cpu_capability( bool display_only ) if ( algo_has_vaes ) printf( " VAES" ); else if ( algo_has_aes ) printf( " AES" ); if ( algo_has_sha512 ) printf( " SHA512" ); - else if ( algo_has_sha ) printf( " SHA256" ); + else if ( algo_has_sha256 ) printf( " SHA256" ); } } printf("\n"); @@ -3068,11 +3089,11 @@ static bool cpu_capability( bool display_only ) use_avx512 = cpu_has_avx512 && sw_has_avx512 && algo_has_avx512; use_aes = cpu_has_aes && sw_has_aes && algo_has_aes; use_vaes = cpu_has_vaes && sw_has_vaes && algo_has_vaes; - use_sha = cpu_has_sha && sw_has_sha && algo_has_sha; + use_sha256 = cpu_has_sha256 && sw_has_sha256 && algo_has_sha256; use_sha512 = cpu_has_sha512 && sw_has_sha512 && algo_has_sha512; use_neon = sw_has_aarch64 && sw_has_neon && algo_has_neon; use_none = !( use_sse2 || use_sse42 || use_avx || use_aes || use_avx512 - || use_avx2 || use_sha || use_vaes || use_sha512 || use_neon ); + || use_avx2 || use_sha256 || use_vaes || use_sha512 || use_neon ); // Display best options applog_nl( "Enabled optimizations:" ); @@ -3090,7 +3111,7 @@ static bool cpu_capability( bool display_only ) if ( use_vaes ) printf( " VAES" ); else if ( use_aes ) printf( " AES" ); if ( use_sha512 ) printf( " SHA512" ); - else if ( use_sha ) printf( " SHA256" ); + else if ( use_sha256 ) printf( " SHA256" ); if ( use_neon ) printf( " NEON" ); } printf( "\n" ); diff --git a/simd-utils.h b/simd-utils.h index 98552aee..406ed82f 100644 --- a/simd-utils.h +++ b/simd-utils.h @@ -139,6 +139,43 @@ #include #include #include +#include + +// SIMD512: Use 512, 256 & 128 bit vectors, excludes AVX512VBMI +// VL256: Include AVX512VL instructions on 256 & 128 bit vectors +// VBMI: Include AVX512VBMI instructions on all vectors. + +// AVX10 can exist without support for 512 bit vectors. +#if defined(__AVX10_1_512__) + #define SIMD512 1 +#elif !defined(__AVX10_1__) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) + #define SIMD512 1 +#endif + +// AVX512VL instructions applied to 256 & 128 bit vectors is supported with +// either AVX512VL or any version of AVX10. 
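/* Illustrative sketch, not part of the patch: a minimal probe of the feature
 * gates introduced here (SIMD512 above, VL256 and VBMI just below). It assumes
 * simd-utils.h can be included on its own; the file name probe.c is only an
 * example.
 *
 *   gcc -mavx512f -mavx512vl -mavx512dq -mavx512bw probe.c  ->  SIMD512 VL256
 *   gcc-14 -mavx10.1-256 probe.c                            ->  VL256 VBMI
 *   gcc-14 -mavx10.1-512 probe.c                            ->  SIMD512 VL256 VBMI
 */
#include <stdio.h>
#include "simd-utils.h"

int main(void)
{
#if defined(SIMD512)
   puts( "SIMD512: 512, 256 & 128 bit vector paths enabled" );
#endif
#if defined(VL256)
   puts( "VL256:   AVX512VL-class instructions usable on 256 & 128 bit vectors" );
#endif
#if defined(VBMI)
   puts( "VBMI:    VBMI byte-permute paths enabled" );
#endif
   return 0;
}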
+#if defined(__AVX10_1__) + #define VL256 1 +#elif defined(__AVX512VL__) + #define VL256 1 +#endif + +// VBMI does not exist on early versions of AVX512 +#if defined(__AVX10_1__) || defined(__AVX512VBMI__) + #define VBMI 1 +#endif + +/* +#if defined(SIMD512) +#warning "SIMD512" +#endif +#if defined(VBMI) +#warning "VBMI" +#endif +#if defined(VL256) +#warning "VL256" +#endif +*/ #if defined(__x86_64__) diff --git a/simd-utils/intrlv.h b/simd-utils/intrlv.h index d5a87fd8..93687d8d 100644 --- a/simd-utils/intrlv.h +++ b/simd-utils/intrlv.h @@ -469,7 +469,7 @@ static inline void v128_bswap32_intrlv80_4x32( void *d, const void *src ) #if defined(__SSSE3__) const v128_t bswap_shuf = _mm_set_epi64x( 0x0c0d0e0f08090a0b, - 0x0405060700010203 ); + 0x0405060700010203 ); s0 = _mm_shuffle_epi8( s0, bswap_shuf ); s1 = _mm_shuffle_epi8( s1, bswap_shuf ); @@ -913,9 +913,7 @@ static inline void extr_lane_8x32( void *d, const void *s, #if defined(__AVX2__) -#if defined(__AVX512VL__) && defined(__AVX512VBMI__) - -//TODO Enable for AVX10_256 AVX10_512 +#if defined(VL256) && defined(VBMI) // Combine byte swap & broadcast in one permute static inline void mm256_bswap32_intrlv80_8x32( void *d, const void *src ) @@ -977,7 +975,7 @@ static inline void mm256_bswap32_intrlv80_8x32( void *d, const void *src ) static inline void mm256_bswap32_intrlv80_8x32( void *d, const void *src ) { const v128_t bswap_shuf = _mm_set_epi64x( 0x0c0d0e0f08090a0b, - 0x0405060700010203 ); + 0x0405060700010203 ); const __m256i c1 = v256_32( 1 ); const __m256i c2 = _mm256_add_epi32( c1, c1 ); const __m256i c3 = _mm256_add_epi32( c2, c1 ); @@ -1035,7 +1033,8 @@ static inline void mm256_bswap32_intrlv80_8x32( void *d, const void *src ) _mm256_castsi128_si256( s4 ), c3 ); } -#endif // AVX512VBMI else +#endif + #endif // AVX2 // 16x32 @@ -1417,11 +1416,9 @@ static inline void extr_lane_16x32( void *d, const void *s, ((uint32_t*)d)[15] = ((const uint32_t*)s)[ lane+240 ]; } -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) -#if defined(__AVX512VBMI__) - -// TODO Enable for AVX10_512 +#if defined(VBMI) // Combine byte swap & broadcast in one permute static inline void mm512_bswap32_intrlv80_16x32( void *d, const void *src ) @@ -1540,7 +1537,7 @@ static inline void mm512_bswap32_intrlv80_16x32( void *d, const void *src ) _mm512_castsi128_si512( s4 ) ); } -#endif // VBMI else +#endif #endif // AVX512 /////////////////////////// @@ -1983,9 +1980,9 @@ static inline void mm256_intrlv80_4x64( void *d, const void *src ) #endif -#if defined(__AVX512VL__) && defined(__AVX512VBMI__) +#if defined(__AVX2__) -//TODO Enable for AVX10_256 AVX10_512 +#if defined(VL256) && defined(VBMI) static inline void mm256_bswap32_intrlv80_4x64( void *d, const void *src ) { @@ -2019,7 +2016,7 @@ static inline void mm256_bswap32_intrlv80_4x64( void *d, const void *src ) _mm256_castsi128_si256( s4 ) ); } -#elif defined(__AVX2__) +#else static inline void mm256_bswap32_intrlv80_4x64( void *d, const void *src ) { @@ -2049,6 +2046,8 @@ static inline void mm256_bswap32_intrlv80_4x64( void *d, const void *src ) _mm256_castsi128_si256( s4 ), 0x55 ); } +#endif + #endif // AVX2 #endif // SSE2 @@ -2375,9 +2374,7 @@ static inline void extr_lane_8x64( void *dst, const void *src, const int lane, #endif // SSE2 -#if defined(__AVX512F__) && defined(__AVX512VL__) - -//TODO Enable for AVX10_512 +#if defined(SIMD512) // broadcast to all lanes static inline void mm512_intrlv80_8x64( void *dst, const void *src ) @@ 
-2399,7 +2396,7 @@ static inline void mm512_intrlv80_8x64( void *dst, const void *src ) // byte swap and broadcast to all lanes -#if defined(__AVX512VBMI__) +#if defined(VBMI) // Combine byte swap & broadcast in one permute static inline void mm512_bswap32_intrlv80_8x64( void *d, const void *src ) @@ -2626,10 +2623,9 @@ static inline void dintrlv_4x128_512( void *dst0, void *dst1, void *dst2, #endif // SSE2 -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(SIMD512) -#if defined(__AVX512VBMI__) -//TODO Enable for AVX10_512 +#if defined(VBMI) static inline void mm512_bswap32_intrlv80_4x128( void *d, const void *src ) { @@ -3532,9 +3528,7 @@ do { \ #endif // AVX2 -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) - -//TODO Enable for AVX10_512 +#if defined(SIMD512) /* #define mm512_intrlv_blend_128( hi, lo ) \ @@ -3559,7 +3553,7 @@ do { \ dst[7] = _mm512_mask_blend_epi64( mask, a[7], b[7] ); \ } while(0) -#endif // AVX512 +#endif // SIMD512 #undef ILEAVE_4x32 #undef LOAD_SRCE_4x32 diff --git a/simd-utils/simd-128.h b/simd-utils/simd-128.h index 2fa9898f..b963c24f 100644 --- a/simd-utils/simd-128.h +++ b/simd-utils/simd-128.h @@ -207,12 +207,12 @@ static inline __m128i mm128_mov32_128( const uint32_t n ) #endif -// broadcast (replicate) lane l to all lanes -#define v128_replane64( v, l ) \ +// Broadcast lane l to all lanes +#define v128_duplane64( v, l ) \ ( (l) == 0 ) ? _mm_shuffle_epi32( v, 0x44 ) \ : _mm_shuffle_epi32( v, 0xee ) -#define v128_replane32( v, l ) \ +#define v128_duplane32( v, l ) \ ( (l) == 0 ) ? _mm_shuffle_epi32( v, 0x00 ) \ : ( (l) == 1 ) ? _mm_shuffle_epi32( v, 0x55 ) \ : ( (l) == 2 ) ? _mm_shuffle_epi32( v, 0xaa ) \ @@ -347,8 +347,7 @@ static inline __m128i v128_neg1_fn() // Basic operations without equivalent SIMD intrinsic // Bitwise not (~v) -#if defined(__AVX512VL__) -//TODO Enable for AVX10_256 +#if defined(VL256) static inline __m128i v128_not( const __m128i v ) { return _mm_ternarylogic_epi64( v, v, v, 1 ); } @@ -402,8 +401,10 @@ static inline void v128_memcpy( v128_t *dst, const v128_t *src, const int n ) { for ( int i = 0; i < n; i ++ ) dst[i] = src[i]; } #define memcpy_128 v128_memcpy -#if defined(__AVX512VL__) -//TODO Enable for AVX10_256 +#if defined(VL256) + +// ~v1 | v0 +#define v128_ornot( v1, v0 ) _mm_ternarylogic_epi64( v1, v0, v0, 0xcf ) // a ^ b ^ c #define v128_xor3( a, b, c ) _mm_ternarylogic_epi64( a, b, c, 0x96 ) @@ -434,6 +435,8 @@ static inline void v128_memcpy( v128_t *dst, const v128_t *src, const int n ) #else +#define v128_ornot( v1, v0 ) _mm_or_si128( v1, v128_not( v0 ) ) + #define v128_xor3( a, b, c ) _mm_xor_si128( a, _mm_xor_si128( b, c ) ) #define v128_and3( a, b, c ) _mm_and_si128( a, _mm_and_si128( b, c ) ) @@ -454,7 +457,6 @@ static inline void v128_memcpy( v128_t *dst, const v128_t *src, const int n ) #endif -#define v128_ornot( a, b ) _mm_or_si128( a, v128_not( b ) ) // Mask making // Equivalent of AVX512 _mm_movepi64_mask & _mm_movepi32_mask. @@ -494,7 +496,7 @@ static inline void v128_memcpy( v128_t *dst, const v128_t *src, const int n ) #define v128_rol32_sse2( v, c ) \ _mm_or_si128( _mm_slli_epi32( v, c ), _mm_srli_epi32( v, 32-(c) ) ) -#if defined(__AVX512VL__) +#if defined(VL256) // AVX512 fastest for all rotations. 
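/* Illustrative sketch, not part of the patch: what the VL256 gate buys for the
 * rotate macros defined next. With AVX512VL (or any AVX10 level) a 64-bit
 * rotate is a single vprorq; otherwise the shift+shift+or pattern of the
 * *_sse2 fallbacks above is required. ror64_17 is just an example name. */
#include <immintrin.h>

static inline __m128i ror64_17( const __m128i v )
{
#if defined(VL256)
   return _mm_ror_epi64( v, 17 );                        // one instruction
#else
   return _mm_or_si128( _mm_srli_epi64( v, 17 ),         // bits shifted out the bottom...
                        _mm_slli_epi64( v, 64 - 17 ) );  // ...re-enter at the top
#endif
}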
#define v128_ror64 _mm_ror_epi64 @@ -609,13 +611,15 @@ static inline void v128_memcpy( v128_t *dst, const v128_t *src, const int n ) // deprecated #define mm128_rol_32 v128_rol32 +// ror( v1 ^ v0, n ) +#define v128_ror64xor( v1, v0, n ) v128_ror64( v128_xor( v1, v0 ), n ) + /* not used // x2 rotates elements in 2 individual vectors in a double buffered // optimization for SSE2, does nothing for AVX512 but is there for // transparency. -#if defined(__AVX512VL__) -//TODO Enable for AVX10_256 +#if defined(VL256) #define v128_2ror64( v1, v0, c ) \ _mm_ror_epi64( v0, c ); \ @@ -917,10 +921,8 @@ static inline void mm128_block_bswap32_512( __m128i *d, const __m128i *s ) #define v128_block_bswap32 mm128_block_bswap_32 #define v128_block_bswap64 mm128_block_bswap_64 - // alignr instruction for 32 & 64 bit elements is only available with AVX512 // but emulated here. Behaviour is consistent with Intel alignr intrinsics. - #if defined(__SSSE3__) #define v128_alignr8 _mm_alignr_epi8 @@ -929,6 +931,9 @@ static inline void mm128_block_bswap32_512( __m128i *d, const __m128i *s ) #else +#define v128_alignr8( hi, lo, c ) \ + _mm_or_si128( _mm_slli_si128( hi, c ), _mm_srli_si128( lo, c ) ) + #define v128_alignr64( hi, lo, c ) \ _mm_or_si128( _mm_slli_si128( hi, (c)*8 ), _mm_srli_si128( lo, (c)*8 ) ) @@ -937,12 +942,15 @@ static inline void mm128_block_bswap32_512( __m128i *d, const __m128i *s ) #endif +// blend using vector mask #if defined(__SSE4_1__) +// Bytewise using sign bit of each byte element of mask #define v128_blendv _mm_blendv_epi8 #else +// Bitwise #define v128_blendv( v1, v0, mask ) \ v128_or( v128_andnot( mask, v1 ), v128_and( mask, v0 ) ) diff --git a/simd-utils/simd-256.h b/simd-utils/simd-256.h index c568dafa..f48bfd5a 100644 --- a/simd-utils/simd-256.h +++ b/simd-utils/simd-256.h @@ -66,8 +66,7 @@ typedef union // Set either the low or high 64 bit elements in 128 bit lanes, other elements // are set to zero. 
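/* Illustrative sketch, not part of the patch: how the 8-bit immediates used by
 * the vpternlog-based macros in simd-128.h above and the 256/512-bit headers
 * that follow (not, ornot, xor3, ...) are derived. For
 * _mm_ternarylogic_epi64( a, b, c, imm ) each result bit is
 * bit [ (a<<2) | (b<<1) | c ] of imm, so the immediate is simply the truth
 * table of the desired expression:
 *
 *   a ^ b ^ c                       ->  bits 7..0 = 10010110  =  0x96  (xor3)
 *   ~a | b   (called with b == c)   ->  bits 7..0 = 11001111  =  0xcf  (ornot)
 *   ~a       (a==b==c, only rows 000 and 111 occur)          ->  0x01  (not)
 *
 * The scalar helper below rebuilds an immediate from any 3-input boolean
 * function; it is only a checking aid, not part of the miner. */
#include <stdint.h>

static inline uint8_t ternlog_imm( int (*f)( int, int, int ) )
{
   uint8_t imm = 0;
   for ( int i = 0; i < 8; i++ )
      if ( f( (i >> 2) & 1, (i >> 1) & 1, i & 1 ) )
         imm |= (uint8_t)( 1 << i );
   return imm;
}

static int xor3_fn ( int a, int b, int c ) { return a ^ b ^ c; }
static int ornot_fn( int a, int b, int c ) { (void)c; return (!a) | b; }

// ternlog_imm( xor3_fn )  == 0x96
// ternlog_imm( ornot_fn ) == 0xcf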
-#if defined(__AVX512VL__) -//TODO Enable for AVX10_256 +#if defined(VL256) #define mm256_bcast128lo_64( i64 ) _mm256_maskz_set1_epi64( 0x55, i64 ) #define mm256_bcast128hi_64( i64 ) _mm256_maskz_set1_epi64( 0xaa, i64 ) @@ -117,8 +116,7 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n ) // // Basic operations without SIMD equivalent -#if defined(__AVX512VL__) -//TODO Enable for AVX10_256 +#if defined(VL256) static inline __m256i mm256_not( const __m256i v ) { return _mm256_ternarylogic_epi64( v, v, v, 1 ); } @@ -137,8 +135,10 @@ static inline __m256i mm256_not( const __m256i v ) #define mm256_add4_32( a, b, c, d ) \ _mm256_add_epi32( _mm256_add_epi32( a, b ), _mm256_add_epi32( c, d ) ) -#if defined(__AVX512VL__) -//TODO Enable for AVX10_256 +#if defined(VL256) + +// ~v1 | v0 +#define mm256_ornot( v1, v0 ) _mm256_ternarylogic_epi64( v1, v0, v0, 0xcf ) // a ^ b ^ c #define mm256_xor3( a, b, c ) _mm256_ternarylogic_epi64( a, b, c, 0x96 ) @@ -172,6 +172,8 @@ static inline __m256i mm256_not( const __m256i v ) #else +#define mm256_ornot( v1, v0 ) _mm256_or_si256( v1, mm256_not( v0 ) ) + #define mm256_xor3( a, b, c ) \ _mm256_xor_si256( a, _mm256_xor_si256( b, c ) ) @@ -257,7 +259,7 @@ static inline __m256i mm256_not( const __m256i v ) _mm256_or_si256( _mm256_slli_epi32( v, c ), \ _mm256_srli_epi32( v, 32-(c) ) ) -#if defined(__AVX512VL__) +#if defined(VL256) #define mm256_ror_64 _mm256_ror_epi64 #define mm256_rol_64 _mm256_rol_epi64 @@ -343,8 +345,7 @@ static inline __m256i mm256_not( const __m256i v ) // optimization for AVX2, does nothing for AVX512 but is here for // transparency. -#if defined(__AVX512VL__) -//TODO Enable for AVX10_256 +#if defined(VL256) /* #define mm256_ror_64 _mm256_ror_epi64 #define mm256_rol_64 _mm256_rol_epi64 @@ -470,7 +471,7 @@ static inline __m256i mm256_not( const __m256i v ) /* Not used // Rotate 256 bit vector by one 32 bit element. -#if defined(__AVX512VL__) +#if defined(VL256) static inline __m256i mm256_shuflr_32( const __m256i v ) { return _mm256_alignr_epi32( v, v, 1 ); } static inline __m256i mm256_shufll_32( const __m256i v ) @@ -507,8 +508,8 @@ static inline __m256i mm256_shufll_32( const __m256i v ) #define mm256_shuflr128_32(v) _mm256_shuffle_epi32( v, 0x39 ) #define mm256_shufll128_32(v) _mm256_shuffle_epi32( v, 0x93 ) -#define mm256_shuflr128_16(v) _mm256_shuffle_epi16( v, 0x39 ) -#define mm256_shufll128_16(v) _mm256_shuffle_epi16( v, 0x93 ) +#define mm256_shuflr128_16(v) mm256_shuffle_16( v, 0x39 ) +#define mm256_shufll128_16(v) mm256_shuffle_16( v, 0x93 ) /* Not used static inline __m256i mm256_shuflr128_x8( const __m256i v, const int c ) @@ -606,6 +607,22 @@ static inline __m256i mm256_shuflr128_x8( const __m256i v, const int c ) casti_m256i( d,15 ) = _mm256_shuffle_epi8( casti_m256i( s,15 ), ctl ); \ } +#if defined(VL256) + +#define mm256_alignr64 _mm256_alignr_epi64 + +#else + +#define mm256_alignr64( v1, v0, c ) \ + ( ( (c) & 3 ) == 1 ) ? _mm256_blend_epi32( mm256_shuflr_64( v1 ), \ + mm256_shuflr_64( v0 ), 0x3f ) \ + : ( ( (c) & 3 ) == 2 ) ? _mm256_blend_epi32( mm256_rev_128( v1 ), \ + mm256_rev_128( v0 ), 0x0f ) \ + : ( ( (c) & 3 ) == 3 ) ? _mm256_blend_epi32( mm256_shufll_64( v1 ), \ + mm256_shufll_64( v0 ), 0x03 ) \ + : v0 + +#endif #endif // __AVX2__ #endif // SIMD_256_H__ diff --git a/simd-utils/simd-512.h b/simd-utils/simd-512.h index 66ea45cf..7675ada4 100644 --- a/simd-utils/simd-512.h +++ b/simd-utils/simd-512.h @@ -14,7 +14,13 @@ // vectors. 
It is therefore not technically required for any 512 bit vector // utilities defined below. -#if defined(__x86_64__) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +// if avx10 // avx512 is always set +// if evex512: yes +// else if avx512 : yes // avx512 is set but not avx10 +// else : no // avx512 not set or avx10.1 is set without evex512 + + +#if defined(SIMD512) // AVX512 intrinsics have a few changes from previous conventions. // @@ -180,6 +186,9 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n ) // Ternary logic uses 8 bit truth table to define any 3 input logical // expression using any number or combinations of AND, OR, XOR, NOT. +// ~v1 | v0 +#define mm512_ornot( v1, v0 ) _mm512_ternarylogic_epi64( v1, v0, v0, 0xcf ) + // a ^ b ^ c #define mm512_xor3( a, b, c ) _mm512_ternarylogic_epi64( a, b, c, 0x96 ) diff --git a/simd-utils/simd-neon.h b/simd-utils/simd-neon.h index b8a31da8..9c55b16c 100644 --- a/simd-utils/simd-neon.h +++ b/simd-utils/simd-neon.h @@ -4,22 +4,20 @@ #if defined(__aarch64__) && defined(__ARM_NEON) // Targeted functions supporting NEON SIMD 128 & 64 bit vectors. -// Size matters! +// Element size matters! // // Intel naming is generally used. // -// documented instructions that aren't defined on RPi 4. -// They seem to be all 3 op instructionsi. +// Some advanced logical operations that require SHA3. Prior to GCC-13 +// they also require armv8.2 // -// veor3q ie xor3 -// vxarq_u64( v1, v0, n ) ror( xor( v1, v0 ), n ) -// vraxlq_u64( v1, v0 ) xor( rol( v1, 1 ), rol( v0, 1 ) ) -// vbcaxq( v2, v1, v0 ) xor( v2, and( v1, not(v0) ) ) -// vsraq_n( v1, v0, n ) add( v1, sr( v0, n ) ) +// veor3q( v2, v1, v0 ) xor3 v2 ^ v1 ^ v0 +// vxarq_u64( v1, v0, n ) ror64xor ( v1 ^ v0 ) >>> n ) +// vbcaxq_u{64,32,16,8}( v2, v1, v0 ) xorandnot v2 ^ ( v1 & ~v0 ) // -// Doesn't work on RPi but works on OPi: -// -// vornq( v1, v0 ) or( v1, not( v0 ) ) +// not used anywhere yet +// vrax1q_u64( v1, v0 ) v1 ^ ( v0 <<< 1 ) +// vsraq_n_u{64,32,16,8}( v1, v0, n ) v1 + ( v0 >> n ) #define v128_t uint32x4_t // default, #define v128u64_t uint64x2_t @@ -87,15 +85,15 @@ static inline uint64x2_t v128_mulw32( uint32x4_t v1, uint32x4_t v0 ) // Not yet needed //#define v128_cmpeq1 // Signed -#define v128_cmpgt64( v1, v0 ) vcgtq_s64( (int64x2_t)v1, (int64x2_t)v0 ) -#define v128_cmpgt32( v1, v0 ) vcgtq_s32( (int32x4_t)v1, (int32x4_t)v0 ) -#define v128_cmpgt16( v1, v0 ) vcgtq_s16( (int16x8_t)v1, (int16x8_t)v0 ) -#define v128_cmpgt8( v1, v0 ) vcgtq_s8( (int8x16_t)v1, (int8x16_t)v0 ) +#define v128_cmpgt64( v1, v0 ) vcgtq_s64( (int64x2_t)v1, (int64x2_t)(v0) ) +#define v128_cmpgt32( v1, v0 ) vcgtq_s32( (int32x4_t)v1, (int32x4_t)(v0) ) +#define v128_cmpgt16( v1, v0 ) vcgtq_s16( (int16x8_t)v1, (int16x8_t)(v0) ) +#define v128_cmpgt8( v1, v0 ) vcgtq_s8( (int8x16_t)v1, (int8x16_t)(v0) ) -#define v128_cmplt64( v1, v0 ) vcltq_s64( (int64x2_t)v1, (int64x2_t)v0 ) -#define v128_cmplt32( v1, v0 ) vcltq_s32( (int32x4_t)v1, (int32x4_t)v0 ) -#define v128_cmplt16( v1, v0 ) vcltq_s16( (int16x8_t)v1, (int16x8_t)v0 ) -#define v128_cmplt8( v1, v0 ) vcltq_s8( (int8x16_t)v1, (int8x16_t)v0 ) +#define v128_cmplt64( v1, v0 ) vcltq_s64( (int64x2_t)v1, (int64x2_t)(v0) ) +#define v128_cmplt32( v1, v0 ) vcltq_s32( (int32x4_t)v1, (int32x4_t)(v0) ) +#define v128_cmplt16( v1, v0 ) vcltq_s16( (int16x8_t)v1, (int16x8_t)(v0) ) +#define v128_cmplt8( v1, v0 ) vcltq_s8( (int8x16_t)v1, (int8x16_t)(v0) ) // Logical bit shift #define v128_sl64 vshlq_n_u64 @@ -109,33 
+107,38 @@ static inline uint64x2_t v128_mulw32( uint32x4_t v1, uint32x4_t v0 ) #define v128_sr8 vshrq_n_u8 // Arithmetic shift. -#define v128_sra64( v, c ) vshrq_n_s64( (int64x2_t)v, c ) -#define v128_sra32( v, c ) vshrq_n_s32( (int32x4_t)v, c ) -#define v128_sra16( v, c ) vshrq_n_s16( (int16x8_t)v, c ) +#define v128_sra64( v, c ) vshrq_n_s64( (int64x2_t)(v), c ) +#define v128_sra32( v, c ) vshrq_n_s32( (int32x4_t)(v), c ) +#define v128_sra16( v, c ) vshrq_n_s16( (int16x8_t)(v), c ) // unary logic + #define v128_not vmvnq_u32 // binary logic + #define v128_or vorrq_u32 #define v128_and vandq_u32 #define v128_xor veorq_u32 // ~v1 & v0 -#define v128_andnot( v1, v0 ) vandq_u32( vmvnq_u32( v1 ), v0 ) +#define v128_andnot( v1, v0 ) vbicq_u32( v0, v1 ) // ~( a ^ b ), same as (~a) ^ b #define v128_xnor( v1, v0 ) v128_not( v128_xor( v1, v0 ) ) -// ~v1 | v0, x86_64 convention, first arg is not'ed -#define v128_ornot( v1, v0 ) vornq_u32( v0, v1 ) +// ~v1 | v0, args reversed for consistency with x86_64 +#define v128_ornot( v1, v0 ) vornq_u32( v0, v1 ) // ternary logic -// v2 ^ v1 ^ v0 -// veorq_u32 not defined -//#define v128_xor3 veor3q_u32 -#define v128_xor3( v2, v1, v0 ) veorq_u32( v2, veorq_u32( v1, v0 ) ) +// This will compile with GCC-11 on armv8.2 and above. At this time there is no +// known way to test arm minor version. +#if defined(__ARM_FEATURE_SHA3) + #define v128_xor3 veor3q_u32 +#else + #define v128_xor3( v2, v1, v0 ) veorq_u32( v2, veorq_u32( v1, v0 ) ) +#endif // v2 & v1 & v0 #define v128_and3( v2, v1, v0 ) v128_and( v2, v128_and( v1, v0 ) ) @@ -143,8 +146,12 @@ static inline uint64x2_t v128_mulw32( uint32x4_t v1, uint32x4_t v0 ) // v2 | v1 | v0 #define v128_or3( v2, v1, v0 ) v128_or( v2, v128_or( v1, v0 ) ) -// a ^ ( ~b & c ) -#define v128_xorandnot( v2, v1, v0 ) v128_xor( v2, v128_andnot( v1, v0 ) ) +// v2 ^ ( ~v1 & v0 ) +#if defined(__ARM_FEATURE_SHA3) + #define v128_xorandnot( v2, v1, v0 ) vbcaxq_u32( v2, v0, v1 ) +#else + #define v128_xorandnot( v2, v1, v0 ) v128_xor( v2, v128_andnot( v1, v0 ) ) +#endif // a ^ ( b & c ) #define v128_xorand( v2, v1, v0 ) v128_xor( v2, v128_and( v1, v0 ) ) @@ -158,12 +165,12 @@ static inline uint64x2_t v128_mulw32( uint32x4_t v1, uint32x4_t v0 ) // v2 | ( v1 & v0 ) #define v128_orand( v2, v1, v0 ) v128_or( v2, v128_and( v1, v0 ) ) -// shift 2 concatenated vectors right. +// shift 2 concatenated vectors right, args reversed for consistency with x86_64 #define v128_alignr64( v1, v0, c ) vextq_u64( v0, v1, c ) #define v128_alignr32( v1, v0, c ) vextq_u32( v0, v1, c ) #define v128_alignr8( v1, v0, c ) vextq_u8( v0, v1, c ) -// Intetleave high or low half of 2 vectors. +// Interleave high or low half of 2 vectors. 
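/* Illustrative sketch, not part of the patch: a quick self-check of the operand
 * order conventions noted above. v128_andnot( v1, v0 ) is ~v1 & v0 (x86_64
 * style) and maps to vbicq_u32( v0, v1 ) = v0 & ~v1; v128_alignr64( v1, v0, 1 )
 * maps to vextq_u64( v0, v1, 1 ) and yields { v0.hi, v1.lo }, matching the
 * SSE2/SSSE3 emulation in simd-128.h. */
#include <stdio.h>
#include <arm_neon.h>

int main(void)
{
   const uint64x2_t v0 = { 0x00, 0x11 };            // lane 0 = low, lane 1 = high
   const uint64x2_t v1 = { 0x22, 0x33 };

   uint64x2_t r = vextq_u64( v0, v1, 1 );           // v128_alignr64( v1, v0, 1 )
   printf( "alignr64: %llx %llx\n",                 // expected: 11 22
           (unsigned long long)vgetq_lane_u64( r, 0 ),
           (unsigned long long)vgetq_lane_u64( r, 1 ) );

   uint32x4_t a = vdupq_n_u32( 0x0f0f0f0f );
   uint32x4_t m = vdupq_n_u32( 0xff00ff00 );
   uint32x4_t n = vbicq_u32( a, m );                // v128_andnot( m, a ) = ~m & a
   printf( "andnot:   %08x\n", vgetq_lane_u32( n, 0 ) );   // expected: 000f000f
   return 0;
}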
#define v128_unpacklo64( v1, v0 ) vzip1q_u64( v1, v0 ) #define v128_unpackhi64( v1, v0 ) vzip2q_u64( v1, v0 ) #define v128_unpacklo32( v1, v0 ) vzip1q_u32( v1, v0 ) @@ -214,10 +221,10 @@ typedef union #define v128_bcast32(v) vdupq_laneq_u32( v, 0 ) #define v128_bcast16(v) vdupq_laneq_u16( v, 0 ) -// Replicate (broadcast) lane l to all lanes -#define v128_replane64( v, l ) vdupq_laneq_u64( v, l ) -#define v128_replane32( v, l ) vdupq_laneq_u32( v, l ) -#define v128_replane16( v, l ) vdupq_laneq_u16( v, l ) +// Broadcast lane l to all lanes +#define v128_duplane64( v, l ) vdupq_laneq_u64( v, l ) +#define v128_duplane32( v, l ) vdupq_laneq_u32( v, l ) +#define v128_duplane16( v, l ) vdupq_laneq_u16( v, l ) // pointer indexing #define casti_v128( p, i ) (((uint32x4_t*)(p))[i]) @@ -232,16 +239,6 @@ typedef union #define cast_v128u32( p ) (*((uint32x4_t*)(p))) #define castp_v128u32( p ) ((uint32x4_t*)(p)) -// use C cast, flexible source type -#define u32_to_u64 vreinterpretq_u64_u32 -#define u64_to_u32 vreinterpretq_u32_u64 - -#define u64_to_u8 vreinterpretq_u8_u64 -#define u8_to_u64 vreinterpretq_u64_u8 - -#define u32_to_u8 vreinterpretq_u8_u32 -#define u8_to_u32 vreinterpretq_u32_u8 - #define v128_zero v128_64( 0ull ) #define v128_cmpeq_zero vceqzq_u64 @@ -336,35 +333,56 @@ static inline void v128_memcpy( void *dst, const void *src, const int n ) #define v128_movmask64 // Bit rotation + #define v128_ror64( v, c ) \ - ( (c) == 32 ) ? (uint64x2_t)vrev64q_u32( ((uint32x4_t)v) ) \ - : vsriq_n_u64( vshlq_n_u64( ((uint64x2_t)v), 64-c ), ((uint64x2_t)v), c ) + ( (c) == 32 ) ? (uint64x2_t)vrev64q_u32( ((uint32x4_t)(v)) ) \ + : vsriq_n_u64( vshlq_n_u64( ((uint64x2_t)(v)), 64-(c) ), \ + ((uint64x2_t)(v)), c ) #define v128_rol64( v, c ) \ - ( (c) == 32 ) ? (uint64x2_t)vrev64q_u32( ((uint32x4_t)v) ) \ - : vsliq_n_u64( vshrq_n_u64( ((uint64x2_t)v), 64-c ), ((uint64x2_t)v), c ) + ( (c) == 32 ) ? (uint64x2_t)vrev64q_u32( ((uint32x4_t)(v)) ) \ + : vsliq_n_u64( vshrq_n_u64( ((uint64x2_t)(v)), 64-(c) ), \ + ((uint64x2_t)(v)), c ) #define v128_ror32( v, c ) \ - ( (c) == 16 ) ? (uint32x4_t)vrev32q_u16( ((uint16x8_t)v) ) \ - : vsriq_n_u32( vshlq_n_u32( ((uint32x4_t)v), 32-c ), ((uint32x4_t)v), c ) + ( (c) == 16 ) ? (uint32x4_t)vrev32q_u16( ((uint16x8_t)(v)) ) \ + : vsriq_n_u32( vshlq_n_u32( ((uint32x4_t)(v)), 32-(c) ), \ + ((uint32x4_t)(v)), c ) #define v128_rol32( v, c ) \ - ( (c) == 16 ) ? (uint32x4_t)vrev32q_u16( ((uint16x8_t)v) ) \ - : vsliq_n_u32( vshrq_n_u32( ((uint32x4_t)v), 32-c ), ((uint32x4_t)v), c ) + ( (c) == 16 ) ? (uint32x4_t)vrev32q_u16( ((uint16x8_t)(v)) ) \ + : vsliq_n_u32( vshrq_n_u32( ((uint32x4_t)(v)), 32-(c) ), \ + ((uint32x4_t)(v)), c ) #define v128_ror16( v, c ) \ - ( (c) == 8 ) ? (uint16x8_t)vrev16q_u8( ((uint8x16_t)v) ) \ - : vsriq_n_u16( vshlq_n_u16( ((uint16x8_t)v), 16-c ), ((uint16x8_t)v), c ) + ( (c) == 8 ) ? (uint16x8_t)vrev16q_u8( ((uint8x16_t)(v)) ) \ + : vsriq_n_u16( vshlq_n_u16( ((uint16x8_t)(v)), 16-(c) ), \ + ((uint16x8_t)(v)), c ) #define v128_rol16( v, c ) \ ( (c) == 8 ) ? 
(uint16x8_t)vrev16q_u8( ((uint8x16_t)v) ) \ - : vsliq_n_u16( vshrq_n_u16( ((uint16x8_t)v), 16-c ), ((uint16x8_t)v), c ) + : vsliq_n_u16( vshrq_n_u16( ((uint16x8_t)(v)), 16-(c) ), \ + ((uint16x8_t)(v)), c ) #define v128_ror8( v, c ) \ - vsriq_n_u8( vshlq_n_u8( ((uint8x16_t)v), 8-c ), ((uint8x16_t)v), c ) + vsriq_n_u8( vshlq_n_u8( ((uint8x16_t)(v)), 8-(c) ), \ + ((uint8x16_t)(v)), c ) #define v128_rol8( v, c ) \ - vsliq_n_u8( vshrq_n_u8( ((uint8x16_t)v), 8-c ), ((uint8x16_t)v), c ) + vsliq_n_u8( vshrq_n_u8( ((uint8x16_t)(v)), 8-(c) ), \ + ((uint8x16_t)(v)), c ) + + +// ror( v1 ^ v0, n ) +#if defined(__ARM_FEATURE_SHA3) + +#define v128_ror64xor( v1, v0, n ) vxarq_u64( v1, v0, n ) + +#else + +#define v128_ror64xor( v1, v0, n ) v128_ror64( v128_xor( v1, v0 ), n ) + +#endif #define v128_2ror64( v1, v0, c ) \ { \ @@ -416,7 +434,7 @@ static inline void v128_memcpy( void *dst, const void *src, const int n ) */ #define v128_shuffle8( v, vmask ) \ - vqtbl1q_u8( (uint8x16_t)v, (uint8x16_t)vmask ) + vqtbl1q_u8( (uint8x16_t)(v), (uint8x16_t)(vmask) ) // sub-vector shuffles sometimes mirror bit rotation. Shuffle is faster. // Bit rotation already promotes faster widths. Usage is context sensitive. @@ -465,7 +483,6 @@ static inline uint16x8_t v128_shufll16( uint16x8_t v ) #define v128_bswap32(v) (uint32x4_t)vrev32q_u8( (uint8x16_t)(v) ) #define v128_bswap64(v) (uint64x2_t)vrev64q_u8( (uint8x16_t)(v) ) #define v128_bswap128(v) (uint32x4_t)v128_swap64( v128_bswap64(v) ) -#define v128_bswap256(p) v128_bswap128( (p)[0], (p)[1] ) // Usefull for x86_64 but does nothing for ARM #define v128_block_bswap32( dst, src ) \ @@ -534,16 +551,8 @@ static inline uint16x8_t v128_shufll16( uint16x8_t v ) casti_v128u64( dst,15 ) = v128_bswap64( casti_v128u64( src,15 ) ); \ } -// Blendv -#define v128_blendv( v1, v0, mask ) \ - v128_or( v128_andnot( mask, v1 ), v128_and( mask, v0 ) ) - -/* -// vbcaxq not defined -#define v128_blendv( v1, v0, mask ) \ - vbcaxq_u32( v128_and( mask, v1 ), v0, mask ) -*/ +// Bitwise blend using vector mask +#define v128_blendv( v1, v0, mask ) vbslq_u32( mask, v1, v0 ) #endif // __ARM_NEON - #endif // SIMD_NEON_H__ diff --git a/sysinfos.c b/sysinfos.c index 18fb507f..a6c3c769 100644 --- a/sysinfos.c +++ b/sysinfos.c @@ -436,19 +436,40 @@ static inline void cpu_getmodelid(char *outbuf, size_t maxsz) #warning "__ARM_NEON" #endif #ifdef __ARM_FEATURE_CRYPTO -#warning "___ARM_FEATURE_CRYPTO" +#warning "__ARM_FEATURE_CRYPTO" #endif #ifdef __ARM_FEATURE_AES -#warning "___ARM_FEATURE_AES" +#warning "__ARM_FEATURE_AES" #endif #ifdef __ARM_FEATURE_SHA2 -#warning "___ARM_FEATURE_SHA2" +#warning "__ARM_FEATURE_SHA2" #endif #ifdef __ARM_FEATURE_SHA3 -#warning "___ARM_FEATURE_SHA3" +#warning "__ARM_FEATURE_SHA3" #endif */ +// GCC-14.1: the AVX512 macros are defined even when compiled with only +// -mavx10.1-256, causing compile errors in AVX512 code. Only with +// -mavx10.1-512 does it compile successfully. +// __EVEX512__ is set only when compiled with -mavx10.1-512. +// Adding -fno-evex512 doesn't help. +// Building with -mapxf fails to configure on a CPU without APX because it can +// run the test program. +/* +#ifdef __AVX10_1__ +#warning "__AVX10_1__" +#endif +#ifdef __AVX10_1_256__ +#warning "__AVX10_1_256__" +#endif +#ifdef __AVX10_1_512__ +#warning "__AVX10_1_512__" +#endif +#ifdef __EVEX512__ +#warning "__EVEX512__" +#endif +*/ // Typical display format: AVX10.[version]_[vectorlength], if vector length is