From 7c3d66ecd6936cfb9e2e4f9b58859d6b4ec6cc74 Mon Sep 17 00:00:00 2001 From: Sean Parkinson Date: Thu, 29 Aug 2024 12:30:16 +1000 Subject: [PATCH] RISC-V ASM: SHA-3 Add assembly implementations of SHA-3. Use VSRL_VX instead of two VSRL_VI operations as immediate is only 5 bits. --- src/include.am | 9 + wolfcrypt/src/port/riscv/riscv-64-poly1305.c | 8 +- wolfcrypt/src/port/riscv/riscv-64-sha3.c | 863 +++++++++++++++++++ wolfcrypt/src/sha3.c | 10 +- wolfssl/wolfcrypt/port/riscv/riscv-64-asm.h | 68 +- wolfssl/wolfcrypt/sha3.h | 4 +- 6 files changed, 950 insertions(+), 12 deletions(-) create mode 100644 wolfcrypt/src/port/riscv/riscv-64-sha3.c diff --git a/src/include.am b/src/include.am index 1bfc754f79..3bd53273f8 100644 --- a/src/include.am +++ b/src/include.am @@ -286,6 +286,9 @@ src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/armv8-32-sha3-asm src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/thumb2-sha3-asm.S endif !BUILD_ARMASM_INLINE endif BUILD_ARMASM +if BUILD_RISCV_ASM +src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/riscv/riscv-64-sha3.c +endif BUILD_RISCV_ASM if !BUILD_X86_ASM if BUILD_INTELASM src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/sha3_asm.S @@ -447,6 +450,9 @@ src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/armv8-32-sha3-asm src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/thumb2-sha3-asm.S endif !BUILD_ARMASM_INLINE endif BUILD_ARMASM +if BUILD_RISCV_ASM +src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/riscv/riscv-64-sha3.c +endif BUILD_RISCV_ASM if BUILD_INTELASM src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/sha3_asm.S endif @@ -800,6 +806,9 @@ src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/armv8-32-sha3-asm src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/thumb2-sha3-asm.S endif !BUILD_ARMASM_INLINE endif BUILD_ARMASM +if BUILD_RISCV_ASM +src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/riscv/riscv-64-sha3.c +endif BUILD_RISCV_ASM if !BUILD_X86_ASM if BUILD_INTELASM src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/sha3_asm.S diff --git a/wolfcrypt/src/port/riscv/riscv-64-poly1305.c b/wolfcrypt/src/port/riscv/riscv-64-poly1305.c index a0b880b8a1..22d6f408ed 100644 --- a/wolfcrypt/src/port/riscv/riscv-64-poly1305.c +++ b/wolfcrypt/src/port/riscv/riscv-64-poly1305.c @@ -252,10 +252,9 @@ static WC_INLINE void poly1305_blocks_riscv64_16(Poly1305* ctx, #ifdef WOLFSSL_RISCV_VECTOR #define MUL_RES_REDIS(l, h, t) \ - VSRL_VI(t, l, 26) \ - VAND_VX(l, l, REG_A6) \ - VSRL_VI(t, t, 26) \ + VSRL_VX(t, l, REG_A7) \ VSLL_VI(h, h, 12) \ + VAND_VX(l, l, REG_A6) \ VOR_VV(h, h, t) #endif @@ -273,6 +272,7 @@ void poly1305_blocks_riscv64(Poly1305* ctx, const unsigned char *m, "li a4, 0xffffffc000000\n\t" "li a5, 0x3ffffff\n\t" "li a6, 0xfffffffffffff\n\t" + "li a7, 52\n\t" /* Load r and r^2 */ "mv t0, %[r2]\n\t" @@ -430,7 +430,7 @@ void poly1305_blocks_riscv64(Poly1305* ctx, const unsigned char *m, : [bytes] "+r" (bytes), [m] "+r" (m) : [r2] "r" (ctx->r2), [h] "r" (ctx->h) : "memory", "t0", "t1", "t2", "t3", "t4", "t5", "t6", - "s3", "s4", "s5", "a4", "a5", "a6" + "s3", "s4", "s5", "a4", "a5", "a6", "a7" ); #endif poly1305_blocks_riscv64_16(ctx, m, bytes, 1); diff --git a/wolfcrypt/src/port/riscv/riscv-64-sha3.c b/wolfcrypt/src/port/riscv/riscv-64-sha3.c new file mode 100644 index 0000000000..45722269fd --- /dev/null +++ b/wolfcrypt/src/port/riscv/riscv-64-sha3.c @@ -0,0 +1,863 @@ +/* riscv-64-sha3.c + * + * Copyright (C) 2006-2024 wolfSSL Inc. 
+ *
+ * This file is part of wolfSSL.
+ *
+ * wolfSSL is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * wolfSSL is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
+ */
+
+
+#ifdef HAVE_CONFIG_H
+    #include <config.h>
+#endif
+
+#include <wolfssl/wolfcrypt/settings.h>
+#include <wolfssl/wolfcrypt/port/riscv/riscv-64-asm.h>
+
+#if defined(WOLFSSL_SHA3) && !defined(WOLFSSL_XILINX_CRYPT) && \
+    !defined(WOLFSSL_AFALG_XILINX_SHA3)
+
+#if FIPS_VERSION3_GE(2,0,0)
+    /* set NO_WRAPPERS before headers, use direct internal f()s not wrappers */
+    #define FIPS_NO_WRAPPERS
+
+    #ifdef USE_WINDOWS_API
+        #pragma code_seg(".fipsA$n")
+        #pragma const_seg(".fipsB$n")
+    #endif
+#endif
+
+#include <wolfssl/wolfcrypt/sha3.h>
+
+static const word64 hash_keccak_r[24] =
+{
+    0x0000000000000001UL, 0x0000000000008082UL,
+    0x800000000000808aUL, 0x8000000080008000UL,
+    0x000000000000808bUL, 0x0000000080000001UL,
+    0x8000000080008081UL, 0x8000000000008009UL,
+    0x000000000000008aUL, 0x0000000000000088UL,
+    0x0000000080008009UL, 0x000000008000000aUL,
+    0x000000008000808bUL, 0x800000000000008bUL,
+    0x8000000000008089UL, 0x8000000000008003UL,
+    0x8000000000008002UL, 0x8000000000000080UL,
+    0x000000000000800aUL, 0x800000008000000aUL,
+    0x8000000080008081UL, 0x8000000000008080UL,
+    0x0000000080000001UL, 0x8000000080008008UL
+};
+
+#ifndef WOLFSSL_RISCV_VECTOR
+
+#define S0_0 "a1"
+#define S0_1 "a2"
+#define S0_2 "a3"
+#define S0_3 "a4"
+#define S0_4 "a5"
+#define S1_0 "s1"
+#define S1_1 "s2"
+#define S1_2 "s3"
+#define S1_3 "s4"
+#define S1_4 "s5"
+#define S2_0 "s6"
+#define S2_1 "s7"
+#define S2_2 "s8"
+#define S2_3 "s9"
+#define S2_4 "s10"
+#define S3_0 "t0"
+#define S3_1 "t1"
+#define S3_2 "t2"
+#define S3_3 "t3"
+#define S3_4 "t4"
+
+#define T_0 "a6"
+#define T_1 "a7"
+#define T_2 "t5"
+#define T_3 "t6"
+#define T_4 "s11"
+
+#define SR0_0 REG_A1
+#define SR0_1 REG_A2
+#define SR0_2 REG_A3
+#define SR0_3 REG_A4
+#define SR0_4 REG_A5
+#define SR1_0 REG_S1
+#define SR1_1 REG_S2
+#define SR1_2 REG_S3
+#define SR1_3 REG_S4
+#define SR1_4 REG_S5
+#define SR2_0 REG_S6
+#define SR2_1 REG_S7
+#define SR2_2 REG_S8
+#define SR2_3 REG_S9
+#define SR2_4 REG_S10
+#define SR3_0 REG_T0
+#define SR3_1 REG_T1
+#define SR3_2 REG_T2
+#define SR3_3 REG_T3
+#define SR3_4 REG_T4
+
+#define TR_0 REG_A6
+#define TR_1 REG_A7
+#define TR_2 REG_T5
+#define TR_3 REG_T6
+#define TR_4 REG_S11
+
+#ifndef WOLFSSL_RISCV_BASE_BIT_MANIPULATION
+
+#define SWAP_ROTL(t0, tr0, t1, s, sr, rr, rl) \
+    "mv " t1 ", " s "\n\t" \
+    "srli " s ", " t0 ", " #rr "\n\t" \
+    "slli " t0 ", " t0 ", " #rl "\n\t" \
+    "or " s ", " s ", " t0 "\n\t"
+
+#define SWAP_ROTL_MEM(t0, tr0, t1, t2, s, rr, rl) \
+    "ld " t1 ", " #s "(%[s])\n\t" \
+    "srli " t2 ", " t0 ", " #rr "\n\t" \
+    "slli " t0 ", " t0 ", " #rl "\n\t" \
+    "or " t0 ", " t0 ", " t2 "\n\t" \
+    "sd " t0 ", " #s "(%[s])\n\t"
+
+#else
+
+#define SWAP_ROTL(t0, tr0, t1, s, sr, rr, rl) \
+    "mv " t1 ", " s "\n\t" \
+    RORI(sr, tr0, rr)
+
+#define SWAP_ROTL_MEM(t0, tr0, t1, t2, s, rr, rl) \
+    "ld " t1 ", " #s "(%[s])\n\t" \
+    RORI(tr0, tr0, rr) \
+    "sd " t0 
", " #s "(%[s])\n\t" + +#endif + +void BlockSha3(word64* s) +{ + const word64* r = hash_keccak_r; + + __asm__ __volatile__ ( + "addi sp, sp, -24\n\t" + "li " T_4 ", 24\n\t" + "ld " S0_0 ", 0(%[s])\n\t" + "ld " S0_1 ", 8(%[s])\n\t" + "ld " S0_2 ", 16(%[s])\n\t" + "ld " S0_3 ", 24(%[s])\n\t" + "ld " S0_4 ", 32(%[s])\n\t" + "ld " S1_0 ", 40(%[s])\n\t" + "ld " S1_1 ", 48(%[s])\n\t" + "ld " S1_2 ", 56(%[s])\n\t" + "ld " S1_3 ", 64(%[s])\n\t" + "ld " S1_4 ", 72(%[s])\n\t" + "ld " S2_0 ", 80(%[s])\n\t" + "ld " S2_1 ", 88(%[s])\n\t" + "ld " S2_2 ", 96(%[s])\n\t" + "ld " S2_3 ", 104(%[s])\n\t" + "ld " S2_4 ", 112(%[s])\n\t" + "ld " S3_0 ", 120(%[s])\n\t" + "ld " S3_1 ", 128(%[s])\n\t" + "ld " S3_2 ", 136(%[s])\n\t" + "ld " S3_3 ", 144(%[s])\n\t" + "ld " S3_4 ", 152(%[s])\n\t" + "ld " T_0 ", 160(%[s])\n\t" + "ld " T_1 ", 168(%[s])\n\t" + "ld " T_2 ", 176(%[s])\n\t" + "\n" + "L_riscv_64_block_sha3_loop:\n\t" + "sd " T_4 ", 16(sp)\n\t" + + /* COLUMN MIX */ + /* Calc b[0], b[1], b[2], b[3], b[4] */ + "ld " T_3 ", 184(%[s])\n\t" + "ld " T_4 ", 192(%[s])\n\t" + "xor " T_0 ", " T_0 ", " S0_0 "\n\t" + "xor " T_1 ", " T_1 ", " S0_1 "\n\t" + "xor " T_2 ", " T_2 ", " S0_2 "\n\t" + "xor " T_3 ", " T_3 ", " S0_3 "\n\t" + "xor " T_4 ", " T_4 ", " S0_4 "\n\t" + "xor " T_0 ", " T_0 ", " S1_0 "\n\t" + "xor " T_1 ", " T_1 ", " S1_1 "\n\t" + "xor " T_2 ", " T_2 ", " S1_2 "\n\t" + "xor " T_3 ", " T_3 ", " S1_3 "\n\t" + "xor " T_4 ", " T_4 ", " S1_4 "\n\t" + "xor " T_0 ", " T_0 ", " S2_0 "\n\t" + "xor " T_1 ", " T_1 ", " S2_1 "\n\t" + "xor " T_2 ", " T_2 ", " S2_2 "\n\t" + "xor " T_3 ", " T_3 ", " S2_3 "\n\t" + "xor " T_4 ", " T_4 ", " S2_4 "\n\t" + "xor " T_0 ", " T_0 ", " S3_0 "\n\t" + "xor " T_1 ", " T_1 ", " S3_1 "\n\t" + "xor " T_2 ", " T_2 ", " S3_2 "\n\t" + "xor " T_3 ", " T_3 ", " S3_3 "\n\t" + "xor " T_4 ", " T_4 ", " S3_4 "\n\t" + "sd " T_1 ", 0(sp)\n\t" + "sd " T_3 ", 8(sp)\n\t" + /* T_0, T_1, T_2, T_3, T_4 */ + + /* s[0],s[5],s[10],s[15],s[20] ^= b[4] ^ ROTL(b[1], 1) */ +#ifndef WOLFSSL_RISCV_BASE_BIT_MANIPULATION + "srli " T_3 ", " T_1 ", 63\n\t" + "slli " T_1 ", " T_1 ", 1\n\t" + "or " T_1 ", " T_1 ", " T_3 "\n\t" +#else + RORI(TR_1, TR_1, 63) +#endif + "ld " T_3 ", 160(%[s])\n\t" + "xor " T_1 ", " T_1 ", " T_4 "\n\t" + "xor " S0_0 ", " S0_0 ", " T_1 "\n\t" + "xor " S1_0 ", " S1_0 ", " T_1 "\n\t" + "xor " T_3 ", " T_3 ", " T_1 "\n\t" + "xor " S2_0 ", " S2_0 ", " T_1 "\n\t" + "xor " S3_0 ", " S3_0 ", " T_1 "\n\t" + "sd " T_3 ", 160(%[s])\n\t" + /* T_0, T_2, T_4 */ + + /* s[1],s[6],s[11],s[16],s[21] ^= b[0] ^ ROTL(b[2], 1)*/ +#ifndef WOLFSSL_RISCV_BASE_BIT_MANIPULATION + "srli " T_3 ", " T_2 ", 63\n\t" + "slli " T_1 ", " T_2 ", 1\n\t" + "or " T_1 ", " T_1 ", " T_3 "\n\t" +#else + RORI(TR_1, TR_2, 63) +#endif + "ld " T_3 ", 168(%[s])\n\t" + "xor " T_1 ", " T_1 ", " T_0 "\n\t" + "xor " S0_1 ", " S0_1 ", " T_1 "\n\t" + "xor " S1_1 ", " S1_1 ", " T_1 "\n\t" + "xor " T_3 ", " T_3 ", " T_1 "\n\t" + "xor " S2_1 ", " S2_1 ", " T_1 "\n\t" + "xor " S3_1 ", " S3_1 ", " T_1 "\n\t" + "sd " T_3 ", 168(%[s])\n\t" + /* T_0, T_2, T_4 */ + + /* s[3],s[8],s[13],s[18],s[23] ^= b[2] ^ ROTL(b[4], 1) */ +#ifndef WOLFSSL_RISCV_BASE_BIT_MANIPULATION + "srli " T_3 ", " T_4 ", 63\n\t" + "slli " T_4 ", " T_4 ", 1\n\t" + "or " T_4 ", " T_4 ", " T_3 "\n\t" +#else + RORI(TR_4, TR_4, 63) +#endif + "ld " T_3 ", 184(%[s])\n\t" + "xor " T_4 ", " T_4 ", " T_2 "\n\t" + "xor " S0_3 ", " S0_3 ", " T_4 "\n\t" + "xor " S1_3 ", " S1_3 ", " T_4 "\n\t" + "xor " T_3 ", " T_3 ", " T_4 "\n\t" + "xor " S2_3 ", " S2_3 ", " T_4 "\n\t" + "xor " S3_3 ", " 
S3_3 ", " T_4 "\n\t" + "sd " T_3 ", 184(%[s])\n\t" + /* T_0, T_2 */ + + "ld " T_3 ", 8(sp)\n\t" + /* s[4],s[9],s[14],s[19],s[24] ^= b[3] ^ ROTL(b[0], 1) */ +#ifndef WOLFSSL_RISCV_BASE_BIT_MANIPULATION + "srli " T_2 ", " T_0 ", 63\n\t" + "slli " T_0 ", " T_0 ", 1\n\t" + "or " T_0 ", " T_0 ", " T_2 "\n\t" +#else + RORI(TR_0, TR_0, 63) +#endif + "ld " T_4 ", 192(%[s])\n\t" + "xor " T_0 ", " T_0 ", " T_3 "\n\t" + "xor " S0_4 ", " S0_4 ", " T_0 "\n\t" + "xor " S1_4 ", " S1_4 ", " T_0 "\n\t" + "xor " T_4 ", " T_4 ", " T_0 "\n\t" + "xor " S2_4 ", " S2_4 ", " T_0 "\n\t" + "xor " S3_4 ", " S3_4 ", " T_0 "\n\t" + /* T_3 */ + + "ld " T_1 ", 0(sp)\n\t" + /* s[2],s[7],s[12],s[17],s[22] ^= b[1] ^ ROTL(b[3], 1) */ +#ifndef WOLFSSL_RISCV_BASE_BIT_MANIPULATION + "srli " T_2 ", " T_3 ", 63\n\t" + "slli " T_3 ", " T_3 ", 1\n\t" + "or " T_3 ", " T_3 ", " T_2 "\n\t" +#else + RORI(TR_3, TR_3, 63) +#endif + "ld " T_2 ", 176(%[s])\n\t" + "xor " T_3 ", " T_3 ", " T_1 "\n\t" + "xor " S0_2 ", " S0_2 ", " T_3 "\n\t" + "xor " S1_2 ", " S1_2 ", " T_3 "\n\t" + "xor " T_2 ", " T_2 ", " T_3 "\n\t" + "xor " S2_2 ", " S2_2 ", " T_3 "\n\t" + "xor " S3_2 ", " S3_2 ", " T_3 "\n\t" + + /* SWAP ROTL */ + /* t0 = s[10], s[10] = s[1] >>> 63 */ + "mv " T_0 ", " S2_0 "\n\t" +#ifndef WOLFSSL_RISCV_BASE_BIT_MANIPULATION + "srli " T_1 ", " S0_1 ", 63\n\t" + "slli " S2_0 ", " S0_1 ", 1\n\t" + "or " S2_0 ", " S2_0 ", " T_1 "\n\t" +#else + RORI(SR2_0, SR0_1, 63) +#endif + /* t1 = s[ 7], s[ 7] = t0 >>> 61 */ + SWAP_ROTL(T_0, TR_0, T_1, S1_2, SR1_2, 61, 3) + /* t0 = s[11], s[11] = t1 >>> 58 */ + SWAP_ROTL(T_1, TR_1, T_0, S2_1, SR2_1, 58, 6) + /* t1 = s[17], s[17] = t0 >>> 54 */ + SWAP_ROTL(T_0, TR_0, T_1, S3_2, SR3_2, 54, 10) + /* t0 = s[18], s[18] = t1 >>> 49 */ + SWAP_ROTL(T_1, TR_1, T_0, S3_3, SR3_3, 49, 15) + /* t1 = s[ 3], s[ 3] = t0 >>> 43 */ + SWAP_ROTL(T_0, TR_0, T_1, S0_3, SR0_3, 43, 21) + /* t0 = s[ 5], s[ 5] = t1 >>> 36 */ + SWAP_ROTL(T_1, TR_1, T_0, S1_0, SR1_0, 36, 28) + /* t1 = s[16], s[16] = t0 >>> 28 */ + SWAP_ROTL(T_0, TR_0, T_1, S3_1, SR3_1, 28, 36) + /* t0 = s[ 8], s[ 8] = t1 >>> 19 */ + SWAP_ROTL(T_1, TR_1, T_0, S1_3, SR1_3, 19, 45) + /* t1 = s[21], s[21] = t0 >>> 9 */ + SWAP_ROTL_MEM(T_0, TR_0, T_1, T_3, 168, 9, 55) + /* t0 = s[24], s[24] = t1 >>> 62 */ + SWAP_ROTL(T_1, TR_1, T_0, T_4, TR_4, 62, 2) + /* t1 = s[ 4], s[ 4] = t0 >>> 50 */ + SWAP_ROTL(T_0, TR_0, T_1, S0_4, SR0_4, 50, 14) + /* t0 = s[15], s[15] = t1 >>> 37 */ + SWAP_ROTL(T_1, TR_1, T_0, S3_0, SR3_0, 37, 27) + /* t1 = s[23], s[23] = t0 >>> 23 */ + SWAP_ROTL_MEM(T_0, TR_0, T_1, T_3, 184, 23, 41) + /* t0 = s[19], s[19] = t1 >>> 8 */ + SWAP_ROTL(T_1, TR_1, T_0, S3_4, SR3_4, 8, 56) + /* t1 = s[13], s[13] = t0 >>> 56 */ + SWAP_ROTL(T_0, TR_0, T_1, S2_3, SR2_3, 56, 8) + /* t0 = s[12], s[12] = t1 >>> 39 */ + SWAP_ROTL(T_1, TR_1, T_0, S2_2, SR2_2, 39, 25) + /* t1 = s[ 2], s[ 2] = t0 >>> 21 */ + SWAP_ROTL(T_0, TR_0, T_1, S0_2, SR0_2, 21, 43) + /* t0 = s[20], s[20] = t1 >>> 2 */ + SWAP_ROTL_MEM(T_1, TR_1, T_0, T_3, 160, 2, 62) + /* t1 = s[14], s[14] = t0 >>> 46 */ + SWAP_ROTL(T_0, TR_0, T_1, S2_4, SR2_4, 46, 18) + /* t0 = s[22], s[22] = t1 >>> 25 */ + SWAP_ROTL(T_1, TR_1, T_0, T_2, TR_2, 25, 39) + /* t1 = s[ 9], s[ 9] = t0 >>> 3 */ + SWAP_ROTL(T_0, TR_0, T_1, S1_4, SR1_4, 3, 61) + /* t0 = s[ 6], s[ 6] = t1 >>> 44 */ + SWAP_ROTL(T_1, TR_1, T_0, S1_1, SR1_1, 44, 20) + /* s[ 1] = t0 >>> 20 */ +#ifndef WOLFSSL_RISCV_BASE_BIT_MANIPULATION + "srli " S0_1 ", " T_0 ", 20\n\t" + "slli " T_0 ", " T_0 ", 44\n\t" + "or " S0_1 ", " S0_1 ", " T_0 "\n\t" +#else + RORI(SR0_1, TR_0, 
20) +#endif + + /* ROW MIX */ + /* s[0] */ + "mv " T_0 ", " S0_0 "\n\t" +#ifndef WOLFSSL_RISCV_BASE_BIT_MANIPULATION + "not " T_3 ", " S0_1 "\n\t" + "and " T_3 ", " T_3 ", " S0_2 "\n\t" +#else + ANDN(TR_3, SR0_2, SR0_1) +#endif + "xor " S0_0 ", " S0_0 ", " T_3 "\n\t" + /* s[1] */ + "mv " T_1 ", " S0_1 "\n\t" +#ifndef WOLFSSL_RISCV_BASE_BIT_MANIPULATION + "not " T_3 ", " S0_2 "\n\t" + "and " T_3 ", " T_3 ", " S0_3 "\n\t" +#else + ANDN(TR_3, SR0_3, SR0_2) +#endif + "xor " S0_1 ", " S0_1 ", " T_3 "\n\t" + /* s[2] */ +#ifndef WOLFSSL_RISCV_BASE_BIT_MANIPULATION + "not " T_3 ", " S0_3 "\n\t" + "and " T_3 ", " T_3 ", " S0_4 "\n\t" +#else + ANDN(TR_3, SR0_4, SR0_3) +#endif + "xor " S0_2 ", " S0_2 ", " T_3 "\n\t" + /* s[3] */ +#ifndef WOLFSSL_RISCV_BASE_BIT_MANIPULATION + "not " T_3 ", " S0_4 "\n\t" + "and " T_3 ", " T_3 ", " T_0 "\n\t" +#else + ANDN(TR_3, TR_0, SR0_4) +#endif + "xor " S0_3 ", " S0_3 ", " T_3 "\n\t" + /* s[4] */ +#ifndef WOLFSSL_RISCV_BASE_BIT_MANIPULATION + "not " T_3 ", " T_0 "\n\t" + "and " T_3 ", " T_3 ", " T_1 "\n\t" +#else + ANDN(TR_3, TR_1, TR_0) +#endif + "xor " S0_4 ", " S0_4 ", " T_3 "\n\t" + + /* s[5] */ + "mv " T_0 ", " S1_0 "\n\t" +#ifndef WOLFSSL_RISCV_BASE_BIT_MANIPULATION + "not " T_3 ", " S1_1 "\n\t" + "and " T_3 ", " T_3 ", " S1_2 "\n\t" +#else + ANDN(TR_3, SR1_2, SR1_1) +#endif + "xor " S1_0 ", " S1_0 ", " T_3 "\n\t" + /* s[6] */ + "mv " T_1 ", " S1_1 "\n\t" +#ifndef WOLFSSL_RISCV_BASE_BIT_MANIPULATION + "not " T_3 ", " S1_2 "\n\t" + "and " T_3 ", " T_3 ", " S1_3 "\n\t" +#else + ANDN(TR_3, SR1_3, SR1_2) +#endif + "xor " S1_1 ", " S1_1 ", " T_3 "\n\t" + /* s[7] */ +#ifndef WOLFSSL_RISCV_BASE_BIT_MANIPULATION + "not " T_3 ", " S1_3 "\n\t" + "and " T_3 ", " T_3 ", " S1_4 "\n\t" +#else + ANDN(TR_3, SR1_4, SR1_3) +#endif + "xor " S1_2 ", " S1_2 ", " T_3 "\n\t" + /* s[8] */ +#ifndef WOLFSSL_RISCV_BASE_BIT_MANIPULATION + "not " T_3 ", " S1_4 "\n\t" + "and " T_3 ", " T_3 ", " T_0 "\n\t" +#else + ANDN(TR_3, TR_0, SR1_4) +#endif + "xor " S1_3 ", " S1_3 ", " T_3 "\n\t" + /* s[9] */ +#ifndef WOLFSSL_RISCV_BASE_BIT_MANIPULATION + "not " T_3 ", " T_0 "\n\t" + "and " T_3 ", " T_3 ", " T_1 "\n\t" +#else + ANDN(TR_3, TR_1, TR_0) +#endif + "xor " S1_4 ", " S1_4 ", " T_3 "\n\t" + + /* s[10] */ + "mv " T_0 ", " S2_0 "\n\t" +#ifndef WOLFSSL_RISCV_BASE_BIT_MANIPULATION + "not " T_3 ", " S2_1 "\n\t" + "and " T_3 ", " T_3 ", " S2_2 "\n\t" +#else + ANDN(TR_3, SR2_2, SR2_1) +#endif + "xor " S2_0 ", " S2_0 ", " T_3 "\n\t" + /* s[11] */ + "mv " T_1 ", " S2_1 "\n\t" +#ifndef WOLFSSL_RISCV_BASE_BIT_MANIPULATION + "not " T_3 ", " S2_2 "\n\t" + "and " T_3 ", " T_3 ", " S2_3 "\n\t" +#else + ANDN(TR_3, SR2_3, SR2_2) +#endif + "xor " S2_1 ", " S2_1 ", " T_3 "\n\t" + /* s[12] */ +#ifndef WOLFSSL_RISCV_BASE_BIT_MANIPULATION + "not " T_3 ", " S2_3 "\n\t" + "and " T_3 ", " T_3 ", " S2_4 "\n\t" +#else + ANDN(TR_3, SR2_4, SR2_3) +#endif + "xor " S2_2 ", " S2_2 ", " T_3 "\n\t" + /* s[13] */ +#ifndef WOLFSSL_RISCV_BASE_BIT_MANIPULATION + "not " T_3 ", " S2_4 "\n\t" + "and " T_3 ", " T_3 ", " T_0 "\n\t" +#else + ANDN(TR_3, TR_0, SR2_4) +#endif + "xor " S2_3 ", " S2_3 ", " T_3 "\n\t" + /* s[14] */ +#ifndef WOLFSSL_RISCV_BASE_BIT_MANIPULATION + "not " T_3 ", " T_0 "\n\t" + "and " T_3 ", " T_3 ", " T_1 "\n\t" +#else + ANDN(TR_3, TR_1, TR_0) +#endif + "xor " S2_4 ", " S2_4 ", " T_3 "\n\t" + + /* s[15] */ + "mv " T_0 ", " S3_0 "\n\t" +#ifndef WOLFSSL_RISCV_BASE_BIT_MANIPULATION + "not " T_3 ", " S3_1 "\n\t" + "and " T_3 ", " T_3 ", " S3_2 "\n\t" +#else + ANDN(TR_3, SR3_2, SR3_1) +#endif + "xor " S3_0 ", " S3_0 
", " T_3 "\n\t" + /* s[16] */ + "mv " T_1 ", " S3_1 "\n\t" +#ifndef WOLFSSL_RISCV_BASE_BIT_MANIPULATION + "not " T_3 ", " S3_2 "\n\t" + "and " T_3 ", " T_3 ", " S3_3 "\n\t" +#else + ANDN(TR_3, SR3_3, SR3_2) +#endif + "xor " S3_1 ", " S3_1 ", " T_3 "\n\t" + /* s[17] */ +#ifndef WOLFSSL_RISCV_BASE_BIT_MANIPULATION + "not " T_3 ", " S3_3 "\n\t" + "and " T_3 ", " T_3 ", " S3_4 "\n\t" +#else + ANDN(TR_3, SR3_4, SR3_3) +#endif + "xor " S3_2 ", " S3_2 ", " T_3 "\n\t" + /* s[18] */ +#ifndef WOLFSSL_RISCV_BASE_BIT_MANIPULATION + "not " T_3 ", " S3_4 "\n\t" + "and " T_3 ", " T_3 ", " T_0 "\n\t" +#else + ANDN(TR_3, TR_0, SR3_4) +#endif + "xor " S3_3 ", " S3_3 ", " T_3 "\n\t" + /* s[19] */ +#ifndef WOLFSSL_RISCV_BASE_BIT_MANIPULATION + "not " T_3 ", " T_0 "\n\t" + "and " T_3 ", " T_3 ", " T_1 "\n\t" +#else + ANDN(TR_3, TR_1, TR_0) +#endif + "xor " S3_4 ", " S3_4 ", " T_3 "\n\t" + + "sd " S3_0 ", 120(%[s])\n\t" + "sd " S3_1 ", 128(%[s])\n\t" + "sd " S3_2 ", 136(%[s])\n\t" + "ld " T_0 ", 160(%[s])\n\t" + "ld " T_1 ", 168(%[s])\n\t" + "ld " T_3 ", 184(%[s])\n\t" + + /* s[20] */ + "mv " S3_0 ", " T_0 "\n\t" +#ifndef WOLFSSL_RISCV_BASE_BIT_MANIPULATION + "not " S3_2 ", " T_1 "\n\t" + "and " S3_2 ", " S3_2 ", " T_2 "\n\t" +#else + ANDN(SR3_2, TR_2, TR_1) +#endif + "xor " T_0 ", " T_0 ", " S3_2 "\n\t" + /* s[21] */ + "mv " S3_1 ", " T_1 "\n\t" +#ifndef WOLFSSL_RISCV_BASE_BIT_MANIPULATION + "not " S3_2 ", " T_2 "\n\t" + "and " S3_2 ", " S3_2 ", " T_3 "\n\t" +#else + ANDN(SR3_2, TR_3, TR_2) +#endif + "xor " T_1 ", " T_1 ", " S3_2 "\n\t" + /* s[22] */ +#ifndef WOLFSSL_RISCV_BASE_BIT_MANIPULATION + "not " S3_2 ", " T_3 "\n\t" + "and " S3_2 ", " S3_2 ", " T_4 "\n\t" +#else + ANDN(SR3_2, TR_4, TR_3) +#endif + "xor " T_2 ", " T_2 ", " S3_2 "\n\t" + /* s[23] */ +#ifndef WOLFSSL_RISCV_BASE_BIT_MANIPULATION + "not " S3_2 ", " T_4 "\n\t" + "and " S3_2 ", " S3_2 ", " S3_0 "\n\t" +#else + ANDN(SR3_2, SR3_0, TR_4) +#endif + "xor " T_3 ", " T_3 ", " S3_2 "\n\t" + /* s[24] */ +#ifndef WOLFSSL_RISCV_BASE_BIT_MANIPULATION + "not " S3_2 ", " S3_0 "\n\t" + "and " S3_2 ", " S3_2 ", " S3_1 "\n\t" +#else + ANDN(SR3_2, SR3_1, SR3_0) +#endif + "xor " T_4 ", " T_4 ", " S3_2 "\n\t" + + "ld " S3_0 ", 120(%[s])\n\t" + "ld " S3_1 ", 128(%[s])\n\t" + "ld " S3_2 ", 136(%[s])\n\t" + "sd " T_0 ", 160(%[s])\n\t" + "sd " T_1 ", 168(%[s])\n\t" + "sd " T_2 ", 176(%[s])\n\t" + "sd " T_3 ", 184(%[s])\n\t" + "sd " T_4 ", 192(%[s])\n\t" + + "ld " T_4 ", 16(sp)\n\t" + "ld " T_3 ", 0(%[r])\n\t" + "addi %[r], %[r], 8\n\t" + "addi " T_4 ", " T_4 ", -1\n\t" + "xor " S0_0 ", " S0_0 ", " T_3 "\n\t" + "bnez " T_4 ", L_riscv_64_block_sha3_loop\n\t" + + "sd " S0_0 ", 0(%[s])\n\t" + "sd " S0_1 ", 8(%[s])\n\t" + "sd " S0_2 ", 16(%[s])\n\t" + "sd " S0_3 ", 24(%[s])\n\t" + "sd " S0_4 ", 32(%[s])\n\t" + "sd " S1_0 ", 40(%[s])\n\t" + "sd " S1_1 ", 48(%[s])\n\t" + "sd " S1_2 ", 56(%[s])\n\t" + "sd " S1_3 ", 64(%[s])\n\t" + "sd " S1_4 ", 72(%[s])\n\t" + "sd " S2_0 ", 80(%[s])\n\t" + "sd " S2_1 ", 88(%[s])\n\t" + "sd " S2_2 ", 96(%[s])\n\t" + "sd " S2_3 ", 104(%[s])\n\t" + "sd " S2_4 ", 112(%[s])\n\t" + "sd " S3_0 ", 120(%[s])\n\t" + "sd " S3_1 ", 128(%[s])\n\t" + "sd " S3_2 ", 136(%[s])\n\t" + "sd " S3_3 ", 144(%[s])\n\t" + "sd " S3_4 ", 152(%[s])\n\t" + + "addi sp, sp, 24\n\t" + + : [r] "+r" (r) + : [s] "r" (s) + : "memory", "t0", "t1", "t2", "t3", "t4", "t5", "t6", + "a1", "a2", "a3", "a4", "a5", "a6", "a7", + "s1", "s2", "s3", "s4", "s5", "s6", "s7", "s8", "s9", "s10", "s11" + ); +} + +#else + +#ifndef WOLFSSL_RISCV_VECTOR_BASE_BIT_MANIPULATION + +#define 
COL_MIX(r, b1, b4) \ + VSLL_VI(REG_V31, b1, 1) \ + VSRL_VX(REG_V30, b1, REG_T1) \ + VXOR_VV(REG_V31, REG_V31, b4) \ + VXOR_VV(REG_V31, REG_V31, REG_V30) \ + VXOR_VV((r + 0), (r + 0), REG_V31) \ + VXOR_VV((r + 5), (r + 5), REG_V31) \ + VXOR_VV((r + 10), (r + 10), REG_V31) \ + VXOR_VV((r + 15), (r + 15), REG_V31) \ + VXOR_VV((r + 20), (r + 20), REG_V31) + +#define SWAP_ROTL_LO(vr, vt0, vt1, sl) \ + VMV_V_V(vt0, vr) \ + "li t1, 64 - " #sl "\n\t" \ + VSLL_VI(vr, vt1, sl) \ + VSRL_VX(vt1, vt1, REG_T1) \ + VOR_VV(vr, vr, vt1) + +#define SWAP_ROTL_HI(vr, vt0, vt1, sl) \ + VMV_V_V(vt0, vr) \ + "li t1, " #sl "\n\t" \ + VSRL_VI(vr, vt1, (64 - sl)) \ + VSLL_VX(vt1, vt1, REG_T1) \ + VOR_VV(vr, vr, vt1) + +#define ROW_MIX(r) \ + VMV_V_V(REG_V25, (r + 0)) \ + VMV_V_V(REG_V26, (r + 1)) \ + VNOT_V(REG_V30, (r + 1)) \ + VNOT_V(REG_V31, (r + 2)) \ + VAND_VV(REG_V30, REG_V30, (r + 2)) \ + VAND_VV(REG_V31, REG_V31, (r + 3)) \ + VXOR_VV((r + 0), REG_V30, (r + 0)) \ + VXOR_VV((r + 1), REG_V31, (r + 1)) \ + VNOT_V(REG_V30, (r + 3)) \ + VNOT_V(REG_V31, (r + 4)) \ + VAND_VV(REG_V30, REG_V30, (r + 4)) \ + VAND_VV(REG_V31, REG_V31, REG_V25) \ + VNOT_V(REG_V25, REG_V25) \ + VXOR_VV((r + 2), REG_V30, (r + 2)) \ + VAND_VV(REG_V25, REG_V25, REG_V26) \ + VXOR_VV((r + 3), REG_V31, (r + 3)) \ + VXOR_VV((r + 4), REG_V25, (r + 4)) + +#else + +#define COL_MIX(r, t) \ + VXOR_VV((r + 0), (r + 0), t) \ + VXOR_VV((r + 5), (r + 5), t) \ + VXOR_VV((r + 10), (r + 10), t) \ + VXOR_VV((r + 15), (r + 15), t) \ + VXOR_VV((r + 20), (r + 20), t) + +#define SWAP_ROTL(vr, vt0, vt1, sl) \ + VMV_V_V(vt0, vr) \ + VROR_VI(vr, (64 - sl), vt1) + +#define SWAP_ROTL_LO SWAP_ROTL +#define SWAP_ROTL_HI SWAP_ROTL + +#define ROW_MIX(r) \ + VMV_V_V(REG_V25, (r + 0)) \ + VMV_V_V(REG_V26, (r + 1)) \ + VANDN_VV(REG_V30, (r + 1), (r + 2)) \ + VANDN_VV(REG_V31, (r + 2), (r + 3)) \ + VXOR_VV((r + 0), REG_V30, (r + 0)) \ + VXOR_VV((r + 1), REG_V31, (r + 1)) \ + VANDN_VV(REG_V30, (r + 3), (r + 4)) \ + VANDN_VV(REG_V31, (r + 4), REG_V25) \ + VANDN_VV(REG_V25, REG_V25, REG_V26) \ + VXOR_VV((r + 2), REG_V30, (r + 2)) \ + VXOR_VV((r + 3), REG_V31, (r + 3)) \ + VXOR_VV((r + 4), REG_V25, (r + 4)) + +#endif + + +void BlockSha3(word64* s) +{ + __asm__ __volatile__ ( + /* 1 x 64-bit */ + VSETIVLI(REG_X0, 1, 0, 1, 0b011, 0b000) + + "li t2, 24\n\t" + "mv t0, %[r]\n\t" + "mv t1, %[s]\n\t" + VLSEG8E64_V(REG_V0, REG_T1) + "addi t1, %[s], 64\n\t" + VLSEG8E64_V(REG_V8, REG_T1) + "addi t1, %[s], 128\n\t" + VLSEG8E64_V(REG_V16, REG_T1) + "addi t1, %[s], 192\n\t" + VLSEG1E64_V(REG_V24, REG_T1) + + "\n" + "L_riscv_64_block_sha3_loop:\n\t" + + /* COLUMN MIX */ + VXOR_VV(REG_V25, REG_V0, REG_V5) + VXOR_VV(REG_V26, REG_V1, REG_V6) + VXOR_VV(REG_V27, REG_V2, REG_V7) + VXOR_VV(REG_V28, REG_V3, REG_V8) + VXOR_VV(REG_V29, REG_V4, REG_V9) + VXOR_VV(REG_V25, REG_V25, REG_V10) + VXOR_VV(REG_V26, REG_V26, REG_V11) + VXOR_VV(REG_V27, REG_V27, REG_V12) + VXOR_VV(REG_V28, REG_V28, REG_V13) + VXOR_VV(REG_V29, REG_V29, REG_V14) + VXOR_VV(REG_V25, REG_V25, REG_V15) + VXOR_VV(REG_V26, REG_V26, REG_V16) + VXOR_VV(REG_V27, REG_V27, REG_V17) + VXOR_VV(REG_V28, REG_V28, REG_V18) + VXOR_VV(REG_V29, REG_V29, REG_V19) + VXOR_VV(REG_V25, REG_V25, REG_V20) + VXOR_VV(REG_V26, REG_V26, REG_V21) + VXOR_VV(REG_V27, REG_V27, REG_V22) + VXOR_VV(REG_V28, REG_V28, REG_V23) + VXOR_VV(REG_V29, REG_V29, REG_V24) + +#ifndef WOLFSSL_RISCV_VECTOR_BASE_BIT_MANIPULATION + "li t1, 63\n\t" + COL_MIX(REG_V0, REG_V26, REG_V29) + COL_MIX(REG_V1, REG_V27, REG_V25) + COL_MIX(REG_V2, REG_V28, REG_V26) + COL_MIX(REG_V3, REG_V29, 
REG_V27) + COL_MIX(REG_V4, REG_V25, REG_V28) +#else + VROR_VI(REG_V30, 63, REG_V26) + VROR_VI(REG_V31, 63, REG_V27) + VXOR_VV(REG_V30, REG_V30, REG_V29) + VXOR_VV(REG_V31, REG_V31, REG_V25) + COL_MIX(REG_V0, REG_V30) + COL_MIX(REG_V1, REG_V31) + + VROR_VI(REG_V30, 63, REG_V28) + VROR_VI(REG_V31, 63, REG_V29) + VROR_VI(REG_V25, 63, REG_V25) + VXOR_VV(REG_V30, REG_V30, REG_V26) + VXOR_VV(REG_V31, REG_V31, REG_V27) + VXOR_VV(REG_V25, REG_V25, REG_V28) + COL_MIX(REG_V2, REG_V30) + COL_MIX(REG_V3, REG_V31) + COL_MIX(REG_V4, REG_V25) +#endif + /* SWAP ROTL */ + /* t1 = s[ 1] */ + VMV_V_V(REG_V26, REG_V1) + /* t0 = s[10], s[10] = t1 <<< 1 */ + SWAP_ROTL_LO(REG_V10, REG_V25, REG_V26, 1) + /* t1 = s[ 7], s[ 7] = t0 <<< 3 */ + SWAP_ROTL_LO(REG_V7 , REG_V26, REG_V25, 3) + /* t0 = s[11], s[11] = t1 <<< 6 */ + SWAP_ROTL_LO(REG_V11, REG_V25, REG_V26, 6) + /* t1 = s[17], s[17] = t0 <<< 10 */ + SWAP_ROTL_LO(REG_V17, REG_V26, REG_V25, 10) + /* t0 = s[18], s[18] = t1 <<< 15 */ + SWAP_ROTL_LO(REG_V18, REG_V25, REG_V26, 15) + /* t1 = s[ 3], s[ 3] = t0 <<< 21 */ + SWAP_ROTL_LO(REG_V3 , REG_V26, REG_V25, 21) + /* t0 = s[ 5], s[ 5] = t1 <<< 28 */ + SWAP_ROTL_LO(REG_V5 , REG_V25, REG_V26, 28) + /* t1 = s[16], s[16] = t0 <<< 36 */ + SWAP_ROTL_HI(REG_V16, REG_V26, REG_V25, 36) + /* t0 = s[ 8], s[ 8] = t1 <<< 45 */ + SWAP_ROTL_HI(REG_V8 , REG_V25, REG_V26, 45) + /* t1 = s[21], s[21] = t0 <<< 55 */ + SWAP_ROTL_HI(REG_V21, REG_V26, REG_V25, 55) + /* t0 = s[24], s[24] = t1 <<< 2 */ + SWAP_ROTL_LO(REG_V24, REG_V25, REG_V26, 2) + /* t1 = s[ 4], s[ 4] = t0 <<< 14 */ + SWAP_ROTL_LO(REG_V4 , REG_V26, REG_V25, 14) + /* t0 = s[15], s[15] = t1 <<< 27 */ + SWAP_ROTL_LO(REG_V15, REG_V25, REG_V26, 27) + /* t1 = s[23], s[23] = t0 <<< 41 */ + SWAP_ROTL_HI(REG_V23, REG_V26, REG_V25, 41) + /* t0 = s[19], s[19] = t1 <<< 56 */ + SWAP_ROTL_HI(REG_V19, REG_V25, REG_V26, 56) + /* t1 = s[13], s[13] = t0 <<< 8 */ + SWAP_ROTL_LO(REG_V13, REG_V26, REG_V25, 8) + /* t0 = s[12], s[12] = t1 <<< 25 */ + SWAP_ROTL_LO(REG_V12, REG_V25, REG_V26, 25) + /* t1 = s[ 2], s[ 2] = t0 <<< 43 */ + SWAP_ROTL_HI(REG_V2 , REG_V26, REG_V25, 43) + /* t0 = s[20], s[20] = t1 <<< 62 */ + SWAP_ROTL_HI(REG_V20, REG_V25, REG_V26, 62) + /* t1 = s[14], s[14] = t0 <<< 18 */ + SWAP_ROTL_LO(REG_V14, REG_V26, REG_V25, 18) + /* t0 = s[22], s[22] = t1 <<< 39 */ + SWAP_ROTL_HI(REG_V22, REG_V25, REG_V26, 39) + /* t1 = s[ 9], s[ 9] = t0 <<< 61 */ + SWAP_ROTL_HI(REG_V9 , REG_V26, REG_V25, 61) + /* t0 = s[ 6], s[ 6] = t1 <<< 20 */ + SWAP_ROTL_LO(REG_V6 , REG_V25, REG_V26, 20) + /* s[ 1] = t0 <<< 44 */ + "li t1, 44\n\t" + VSRL_VI(REG_V1, REG_V25, (64 - 44)) + VSLL_VX(REG_V25, REG_V25, REG_T1) + VOR_VV(REG_V1, REG_V1, REG_V25) + + /* ROW MIX */ + ROW_MIX(REG_V0) + ROW_MIX(REG_V5) + ROW_MIX(REG_V10) + ROW_MIX(REG_V15) + ROW_MIX(REG_V20) + + VL1RE64_V(REG_V25, REG_T0) + "addi t0, t0, 8\n\t" + "addi t2, t2, -1\n\t" + VXOR_VV(REG_V0, REG_V0, REG_V25) + "bnez t2, L_riscv_64_block_sha3_loop\n\t" + + "mv t1, %[s]\n\t" + VSSEG8E64_V(REG_V0, REG_T1) + "addi t1, %[s], 64\n\t" + VSSEG8E64_V(REG_V8, REG_T1) + "addi t1, %[s], 128\n\t" + VSSEG8E64_V(REG_V16, REG_T1) + "addi t1, %[s], 192\n\t" + VSSEG1E64_V(REG_V24, REG_T1) + + : + : [s] "r" (s), [r] "r" (hash_keccak_r) + : "memory", "t0", "t1", "t2" + ); +} + +#endif + +#endif + diff --git a/wolfcrypt/src/sha3.c b/wolfcrypt/src/sha3.c index 6346173199..2bba29bcef 100644 --- a/wolfcrypt/src/sha3.c +++ b/wolfcrypt/src/sha3.c @@ -62,8 +62,8 @@ } #endif -#if !defined(WOLFSSL_ARMASM) || (!defined(__arm__) && \ - !defined(WOLFSSL_ARMASM_CRYPTO_SHA3)) 
+#if (!defined(WOLFSSL_ARMASM) || (!defined(__arm__) && \ + !defined(WOLFSSL_ARMASM_CRYPTO_SHA3))) && !defined(WOLFSSL_RISCV_ASM) #ifdef USE_INTEL_SPEEDUP #include @@ -250,7 +250,7 @@ while (0) #ifndef USE_INTEL_SPEEDUP static #endif -void BlockSha3(word64 *s) +void BlockSha3(word64* s) { byte i, x, y; word64 t0, t1; @@ -541,7 +541,7 @@ while (0) #ifndef USE_INTEL_SPEEDUP static #endif -void BlockSha3(word64 *s) +void BlockSha3(word64* s) { word64 n[25]; word64 b[5]; @@ -563,7 +563,7 @@ void BlockSha3(word64 *s) } } #endif /* WOLFSSL_SHA3_SMALL */ -#endif /* !WOLFSSL_ARMASM */ +#endif /* !WOLFSSL_ARMASM && !WOLFSSL_RISCV_ASM */ static WC_INLINE word64 Load64Unaligned(const unsigned char *a) { diff --git a/wolfssl/wolfcrypt/port/riscv/riscv-64-asm.h b/wolfssl/wolfcrypt/port/riscv/riscv-64-asm.h index 89a84bf6d7..023448d5c1 100644 --- a/wolfssl/wolfcrypt/port/riscv/riscv-64-asm.h +++ b/wolfssl/wolfcrypt/port/riscv/riscv-64-asm.h @@ -137,11 +137,24 @@ (0b0010011 << 0) | \ (rs << 15) | (rd << 7)) +/* Rotate right 32-bit register 5-bit value. */ #define RORIW(rd, rs, imm) \ ASM_WORD((0b0110000 << 25) | (0b101 << 12) | \ (0b0011011 << 0) | \ (imm << 20) | (rs << 15) | (rd << 7)) +/* Rotate right 64-bit register 7-bit value. */ +#define RORI(rd, rs, imm) \ + ASM_WORD((0b01100 << 27) | (0b101 << 12) | \ + (0b0010011 << 0) | \ + ((imm) << 20) | ((rs) << 15) | ((rd) << 7)) + +/* rs1 and not rs2 into rd. */ +#define ANDN(rd, rs1, rs2) \ + ASM_WORD((0b0100000 << 25) | (0b111 << 12) | \ + (0b0110011 << 0) | \ + ((rs2) << 20) | ((rs1) << 15) | ((rd) << 7)) + /* rd = rs1[0..31] | rs2[0..31]. */ #define PACK(rd, rs1, rs2) \ @@ -168,6 +181,23 @@ /* 32-bit width when loading. */ #define WIDTH_32 0b110 + +#define VLSEG_V(vd, rs1, cnt, width) \ + ASM_WORD(0b0000111 | (width << 12) | (0b10101000 << 20) | \ + (0 << 28) | ((cnt - 1) << 29) | (vd << 7) | (rs1 << 15)) +/* Load 8 Vector registers' 64-bit element. */ +#define VLSEG8E64_V(vd, rs1) VLSEG_V(vd, rs1, 8, WIDTH_64) +/* Load 1 Vector register's 64-bit element. */ +#define VLSEG1E64_V(vd, rs1) VLSEG_V(vd, rs1, 1, WIDTH_64) + +#define VSSEG_V(vd, rs1, cnt, width) \ + ASM_WORD(0b0100111 | (width << 12) | (0b10101000 << 20) | \ + (0 << 28) | ((cnt - 1) << 29) | (vd << 7) | (rs1 << 15)) +/* Store 8 Vector registers' 64-bit element. */ +#define VSSEG8E64_V(vd, rs1) VSSEG_V(vd, rs1, 8, WIDTH_64) +/* Store 1 Vector register's 64-bit element. */ +#define VSSEG1E64_V(vd, rs1) VSSEG_V(vd, rs1, 1, WIDTH_64) + /* Load n Vector registers with width-bit components. 
*/ #define VLRE_V(vd, rs1, cnt, width) \ ASM_WORD(0b0000111 | (width << 12) | (0b00101000 << 20) | \ @@ -225,11 +255,21 @@ * Logic */ +/* vd = vs2 << rs1 */ +#define VSLL_VX(vd, vs2, rs1) \ + ASM_WORD((0b100101 << 26) | (0b1 << 25) | \ + (0b100 << 12) | (0b1010111 << 0) | \ + (vd << 7) | (rs1 << 15) | (vs2 << 20)) /* vd = vs2 << uimm */ #define VSLL_VI(vd, vs2, uimm) \ ASM_WORD((0b100101 << 26) | (0b1 << 25) | \ (0b011 << 12) | (0b1010111 << 0) | \ (vd << 7) | (uimm << 15) | (vs2 << 20)) +/* vd = vs2 >> rs1 */ +#define VSRL_VX(vd, vs2, rs1) \ + ASM_WORD((0b101000 << 26) | (0b1 << 25) | \ + (0b100 << 12) | (0b1010111 << 0) | \ + (vd << 7) | (rs1 << 15) | (vs2 << 20)) /* vd = vs2 >> uimm */ #define VSRL_VI(vd, vs2, uimm) \ ASM_WORD((0b101000 << 26) | (0b1 << 25) | \ @@ -257,6 +297,14 @@ ASM_WORD((0b001011 << 26) | (0b1 << 25) | \ (0b000 << 12) | (0b1010111 << 0) | \ (vd << 7) | (vs1 << 15) | (vs2 << 20)) +/* vd = imm ^ vs2 */ +#define VXOR_VI(vd, vs2, imm) \ + ASM_WORD((0b001011 << 26) | (0b1 << 25) | \ + (0b011 << 12) | (0b1010111 << 0) | \ + (vd << 7) | (imm << 15) | (vs2 << 20)) +/* vd = ~vs */ +#define VNOT_V(vd, vs) VXOR_VI(vd, vs, 0b11111) + /* vd = vs1 & vs2 */ #define VAND_VV(vd, vs1, vs2) \ ASM_WORD((0b001001 << 26) | (0b1 << 25) | \ @@ -286,6 +334,13 @@ (vs2 << 20) | (vs1 << 15) | (vd << 7)) +#define VMERGE_VVM(vd, vs2, vs1) \ + ASM_WORD((0b010111 << 26) | (0b0 << 25) | \ + (0b000 << 12) | (0b1010111 << 0) | \ + (vs2 << 20) | (vs1 << 15) | (vd << 7)) + + + /* * Permute */ @@ -354,12 +409,23 @@ (0b010 << 12) | (0b1010111 << 0) | \ (vs2 << 20) | (vd << 7)) -/* Reverse order of bytes in words of vector regsiter. */ +/* Rotate left bits of vector regsiter. */ +#define VROL_VX(vd, vs2, rs) \ + ASM_WORD((0b010101 << 26) | (0b1 << 25) | (0b100 << 12) | \ + (0b1010111 << 0) | \ + (vs2 << 20) | (rs << 15) | (vd << 7)) + +/* Rotate right bits of vector regsiter. */ #define VROR_VI(vd, imm, vs2) \ ASM_WORD((0b01010 << 27) | (0b1 << 25) | (0b011 << 12) | \ (0b1010111 << 0) | ((imm >> 5) << 26) | \ (vs2 << 20) | ((imm & 0x1f) << 15) | (vd << 7)) +/* Vector ANDN - vd = ~vs1 & vs2. */ +#define VANDN_VV(vd, vs1, vs2) \ + ASM_WORD((0b000001 << 26) | (0b1 << 25) | (0b000 << 12) | \ + (0b1010111 << 0) | \ + (vs2 << 20) | (vs1 << 15) | (vd << 7)) #endif /* WOLFSSL_RISCV_VECTOR_BASE_BIT_MANIPULATION || * WOLFSSL_RISCV_VECTOR_CRYPTO_ASM */ diff --git a/wolfssl/wolfcrypt/sha3.h b/wolfssl/wolfcrypt/sha3.h index 0931a95584..0120051508 100644 --- a/wolfssl/wolfcrypt/sha3.h +++ b/wolfssl/wolfcrypt/sha3.h @@ -220,8 +220,8 @@ WOLFSSL_LOCAL void sha3_block_bmi2(word64* s); WOLFSSL_LOCAL void sha3_block_avx2(word64* s); WOLFSSL_LOCAL void BlockSha3(word64 *s); #endif -#if defined(WOLFSSL_ARMASM) && (defined(__arm__) || \ - defined(WOLFSSL_ARMASM_CRYPTO_SHA3)) +#if (defined(WOLFSSL_ARMASM) && (defined(__arm__) || \ + defined(WOLFSSL_ARMASM_CRYPTO_SHA3))) || defined(WOLFSSL_RISCV_ASM) WOLFSSL_LOCAL void BlockSha3(word64 *s); #endif
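
Editor's note (not part of the patch): the commit message's point about using VSRL_VX instead of two VSRL_VI operations refers to the Poly1305 hunk above. MUL_RES_REDIS previously shifted a 64-bit limb right by 26 twice because vsrl.vi encodes its shift amount in a 5-bit unsigned immediate (maximum 31); with 52 loaded into a7, a single vsrl.vx does the same reduction shift. A minimal standalone C sketch of the equivalence, for illustration only:

```c
#include <assert.h>
#include <stdint.h>

int main(void)
{
    uint64_t l = 0x0123456789abcdefULL;

    /* Old approach: two immediate shifts of 26 (each fits in 5 bits). */
    uint64_t two_vsrl_vi = (l >> 26) >> 26;

    /* New approach: one shift of 52 taken from a register, since 52 cannot
     * be encoded in the 5-bit vsrl.vi immediate. */
    uint64_t one_vsrl_vx = l >> 52;

    assert(two_vsrl_vi == one_vsrl_vx);
    return 0;
}
```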
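Editor's note (not part of the patch): both the scalar ROW MIX sequences and the vector ROW_MIX macros in riscv-64-sha3.c implement the Keccak chi step, s[x] ^= ~s[x+1] & s[x+2] within each 5-lane row, saving the first two lanes so the wrap-around uses their original values. When the Zbb/Zvbb extensions are available, the "not" plus "and" pair collapses into a single ANDN / vandn.vv. A plain-C sketch of one row, under the assumption that the hypothetical helper name chi_row is used only for illustration:

```c
#include <stdint.h>
#include <stdio.h>

/* chi on one 5-lane row: s[x] ^= ~s[x + 1] & s[x + 2] (indices mod 5). */
static void chi_row(uint64_t s[5])
{
    uint64_t t0 = s[0];   /* saved originals, like the "mv" of T_0/T_1 */
    uint64_t t1 = s[1];

    s[0] ^= ~s[1] & s[2];
    s[1] ^= ~s[2] & s[3];
    s[2] ^= ~s[3] & s[4];
    s[3] ^= ~s[4] & t0;
    s[4] ^= ~t0   & t1;
}

int main(void)
{
    uint64_t row[5] = { 0x11, 0x22, 0x44, 0x88, 0xff };
    chi_row(row);
    printf("%016llx\n", (unsigned long long)row[0]);
    return 0;
}
```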