From de657787cfa93187b2bca3e7d4ecc00e82ecd2ce Mon Sep 17 00:00:00 2001 From: Sean Parkinson Date: Fri, 20 Sep 2024 11:21:56 +1000 Subject: [PATCH] Kyber Aarch64: assembly implementations of functions Aarch64 assembly implementation of Kyber functions. SHA-3 assembly implementations when not hardware crypto. --- configure.ac | 2 +- src/include.am | 7 + wolfcrypt/src/port/arm/armv8-curve25519.S | 84 +- wolfcrypt/src/port/arm/armv8-kyber-asm.S | 10079 +++++++++++++ wolfcrypt/src/port/arm/armv8-kyber-asm_c.c | 14303 +++++++++++++++++++ wolfcrypt/src/port/arm/armv8-sha3-asm.S | 267 +- wolfcrypt/src/port/arm/armv8-sha3-asm_c.c | 216 + wolfcrypt/src/port/arm/armv8-sha512-asm.S | 22 +- wolfcrypt/src/sha3.c | 3 +- wolfcrypt/src/wc_kyber.c | 7 +- wolfcrypt/src/wc_kyber_poly.c | 748 +- wolfssl/wolfcrypt/sha3.h | 3 +- wolfssl/wolfcrypt/wc_kyber.h | 24 +- 13 files changed, 25658 insertions(+), 107 deletions(-) create mode 100644 wolfcrypt/src/port/arm/armv8-kyber-asm.S create mode 100644 wolfcrypt/src/port/arm/armv8-kyber-asm_c.c diff --git a/configure.ac b/configure.ac index 0841cc5342..0d2ae428a5 100644 --- a/configure.ac +++ b/configure.ac @@ -2977,7 +2977,7 @@ then AM_CPPFLAGS="$AM_CPPFLAGS+sm4" fi else - AM_CPPFLAGS="$AM_CPPFLAGS -mcpu=generic+crypto" + AM_CPPFLAGS="$AM_CPPFLAGS -mcpu=generic+crypto -DWOLFSSL_AARCH64_NO_SQRMLSH" fi ;; esac diff --git a/src/include.am b/src/include.am index c3d8376a1d..881a6fe85f 100644 --- a/src/include.am +++ b/src/include.am @@ -1057,6 +1057,13 @@ if BUILD_INTELASM src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/wc_kyber_asm.S endif endif +if BUILD_ARMASM_NEON +if BUILD_ARMASM_INLINE +src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/armv8-kyber-asm_c.c +else +src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/armv8-kyber-asm.S +endif !BUILD_ARMASM_INLINE +endif BUILD_ARMASM_NEON endif if BUILD_DILITHIUM diff --git a/wolfcrypt/src/port/arm/armv8-curve25519.S b/wolfcrypt/src/port/arm/armv8-curve25519.S index cf20f60809..228fcf0068 100644 --- a/wolfcrypt/src/port/arm/armv8-curve25519.S +++ b/wolfcrypt/src/port/arm/armv8-curve25519.S @@ -337,8 +337,7 @@ _fe_cmov_table: #endif /* __APPLE__ */ stp x29, x30, [sp, #-128]! add x29, sp, #0 - str x17, [x29, #40] - str x19, [x29, #48] + stp x17, x19, [x29, #40] stp x20, x21, [x29, #56] stp x22, x23, [x29, #72] stp x24, x25, [x29, #88] @@ -546,8 +545,7 @@ _fe_cmov_table: stp x10, x11, [x0, #48] stp x12, x13, [x0, #64] stp x14, x15, [x0, #80] - ldr x17, [x29, #40] - ldr x19, [x29, #48] + ldp x17, x19, [x29, #40] ldp x20, x21, [x29, #56] ldp x22, x23, [x29, #72] ldp x24, x25, [x29, #88] @@ -573,8 +571,7 @@ _fe_mul: #endif /* __APPLE__ */ stp x29, x30, [sp, #-64]! add x29, sp, #0 - str x17, [x29, #24] - str x19, [x29, #32] + stp x17, x19, [x29, #24] stp x20, x21, [x29, #40] str x22, [x29, #56] # Multiply @@ -703,8 +700,7 @@ _fe_mul: # Store stp x6, x7, [x0] stp x8, x9, [x0, #16] - ldr x17, [x29, #24] - ldr x19, [x29, #32] + ldp x17, x19, [x29, #24] ldp x20, x21, [x29, #40] ldr x22, [x29, #56] ldp x29, x30, [sp], #0x40 @@ -835,8 +831,7 @@ _fe_invert: #endif /* __APPLE__ */ stp x29, x30, [sp, #-176]! 
add x29, sp, #0 - str x17, [x29, #160] - str x20, [x29, #168] + stp x17, x20, [x29, #160] # Invert str x0, [x29, #144] str x1, [x29, #152] @@ -1694,8 +1689,7 @@ L_fe_invert8: #else bl _fe_mul #endif /* __APPLE__ */ - ldr x17, [x29, #160] - ldr x20, [x29, #168] + ldp x17, x20, [x29, #160] ldp x29, x30, [sp], #0xb0 ret #ifndef __APPLE__ @@ -1715,8 +1709,7 @@ _curve25519: #endif /* __APPLE__ */ stp x29, x30, [sp, #-288]! add x29, sp, #0 - str x17, [x29, #200] - str x19, [x29, #208] + stp x17, x19, [x29, #200] stp x20, x21, [x29, #216] stp x22, x23, [x29, #232] stp x24, x25, [x29, #248] @@ -3801,8 +3794,7 @@ L_curve25519_inv_8: stp x14, x15, [x0] stp x16, x17, [x0, #16] mov x0, xzr - ldr x17, [x29, #200] - ldr x19, [x29, #208] + ldp x17, x19, [x29, #200] ldp x20, x21, [x29, #216] ldp x22, x23, [x29, #232] ldp x24, x25, [x29, #248] @@ -3828,8 +3820,7 @@ _fe_pow22523: #endif /* __APPLE__ */ stp x29, x30, [sp, #-144]! add x29, sp, #0 - str x17, [x29, #128] - str x23, [x29, #136] + stp x17, x23, [x29, #128] # pow22523 str x0, [x29, #112] str x1, [x29, #120] @@ -4619,8 +4610,7 @@ L_fe_pow22523_7: #else bl _fe_mul #endif /* __APPLE__ */ - ldr x17, [x29, #128] - ldr x23, [x29, #136] + ldp x17, x23, [x29, #128] ldp x29, x30, [sp], #0x90 ret #ifndef __APPLE__ @@ -4640,8 +4630,7 @@ _ge_p1p1_to_p2: #endif /* __APPLE__ */ stp x29, x30, [sp, #-80]! add x29, sp, #0 - str x17, [x29, #40] - str x19, [x29, #48] + stp x17, x19, [x29, #40] stp x20, x21, [x29, #56] str x22, [x29, #72] str x0, [x29, #16] @@ -5002,8 +4991,7 @@ _ge_p1p1_to_p2: # Store stp x14, x15, [x0] stp x16, x17, [x0, #16] - ldr x17, [x29, #40] - ldr x19, [x29, #48] + ldp x17, x19, [x29, #40] ldp x20, x21, [x29, #56] ldr x22, [x29, #72] ldp x29, x30, [sp], #0x50 @@ -5025,8 +5013,7 @@ _ge_p1p1_to_p3: #endif /* __APPLE__ */ stp x29, x30, [sp, #-112]! add x29, sp, #0 - str x17, [x29, #40] - str x19, [x29, #48] + stp x17, x19, [x29, #40] stp x20, x21, [x29, #56] stp x22, x23, [x29, #72] stp x24, x25, [x29, #88] @@ -5505,8 +5492,7 @@ _ge_p1p1_to_p3: # Store stp x14, x15, [x0] stp x16, x17, [x0, #16] - ldr x17, [x29, #40] - ldr x19, [x29, #48] + ldp x17, x19, [x29, #40] ldp x20, x21, [x29, #56] ldp x22, x23, [x29, #72] ldp x24, x25, [x29, #88] @@ -5530,8 +5516,7 @@ _ge_p2_dbl: #endif /* __APPLE__ */ stp x29, x30, [sp, #-128]! add x29, sp, #0 - str x17, [x29, #40] - str x19, [x29, #48] + stp x17, x19, [x29, #40] stp x20, x21, [x29, #56] stp x22, x23, [x29, #72] stp x24, x25, [x29, #88] @@ -5986,8 +5971,7 @@ _ge_p2_dbl: sbc x7, x7, xzr stp x4, x5, [x0] stp x6, x7, [x0, #16] - ldr x17, [x29, #40] - ldr x19, [x29, #48] + ldp x17, x19, [x29, #40] ldp x20, x21, [x29, #56] ldp x22, x23, [x29, #72] ldp x24, x25, [x29, #88] @@ -6012,8 +5996,7 @@ _ge_madd: #endif /* __APPLE__ */ stp x29, x30, [sp, #-144]! add x29, sp, #0 - str x17, [x29, #56] - str x19, [x29, #64] + stp x17, x19, [x29, #56] stp x20, x21, [x29, #72] stp x22, x23, [x29, #88] stp x24, x25, [x29, #104] @@ -6503,8 +6486,7 @@ _ge_madd: stp x10, x11, [x0, #16] stp x4, x5, [x1] stp x6, x7, [x1, #16] - ldr x17, [x29, #56] - ldr x19, [x29, #64] + ldp x17, x19, [x29, #56] ldp x20, x21, [x29, #72] ldp x22, x23, [x29, #88] ldp x24, x25, [x29, #104] @@ -6529,8 +6511,7 @@ _ge_msub: #endif /* __APPLE__ */ stp x29, x30, [sp, #-144]! 
add x29, sp, #0 - str x17, [x29, #56] - str x19, [x29, #64] + stp x17, x19, [x29, #56] stp x20, x21, [x29, #72] stp x22, x23, [x29, #88] stp x24, x25, [x29, #104] @@ -7020,8 +7001,7 @@ _ge_msub: stp x10, x11, [x0, #16] stp x4, x5, [x1] stp x6, x7, [x1, #16] - ldr x17, [x29, #56] - ldr x19, [x29, #64] + ldp x17, x19, [x29, #56] ldp x20, x21, [x29, #72] ldp x22, x23, [x29, #88] ldp x24, x25, [x29, #104] @@ -7046,8 +7026,7 @@ _ge_add: #endif /* __APPLE__ */ stp x29, x30, [sp, #-144]! add x29, sp, #0 - str x17, [x29, #56] - str x19, [x29, #64] + stp x17, x19, [x29, #56] stp x20, x21, [x29, #72] stp x22, x23, [x29, #88] stp x24, x25, [x29, #104] @@ -7663,8 +7642,7 @@ _ge_add: stp x23, x24, [x0, #16] stp x12, x13, [x1] stp x14, x15, [x1, #16] - ldr x17, [x29, #56] - ldr x19, [x29, #64] + ldp x17, x19, [x29, #56] ldp x20, x21, [x29, #72] ldp x22, x23, [x29, #88] ldp x24, x25, [x29, #104] @@ -7689,8 +7667,7 @@ _ge_sub: #endif /* __APPLE__ */ stp x29, x30, [sp, #-144]! add x29, sp, #0 - str x17, [x29, #56] - str x19, [x29, #64] + stp x17, x19, [x29, #56] stp x20, x21, [x29, #72] stp x22, x23, [x29, #88] stp x24, x25, [x29, #104] @@ -8321,8 +8298,7 @@ _ge_sub: stp x14, x15, [x0, #16] stp x21, x22, [x1] stp x23, x24, [x1, #16] - ldr x17, [x29, #56] - ldr x19, [x29, #64] + ldp x17, x19, [x29, #56] ldp x20, x21, [x29, #72] ldp x22, x23, [x29, #88] ldp x24, x25, [x29, #104] @@ -8347,8 +8323,7 @@ _sc_reduce: #endif /* __APPLE__ */ stp x29, x30, [sp, #-64]! add x29, sp, #0 - str x17, [x29, #16] - str x19, [x29, #24] + stp x17, x19, [x29, #16] stp x20, x21, [x29, #32] stp x22, x23, [x29, #48] ldp x2, x3, [x0] @@ -8525,8 +8500,7 @@ _sc_reduce: # Store result stp x2, x3, [x0] stp x4, x5, [x0, #16] - ldr x17, [x29, #16] - ldr x19, [x29, #24] + ldp x17, x19, [x29, #16] ldp x20, x21, [x29, #32] ldp x22, x23, [x29, #48] ldp x29, x30, [sp], #0x40 @@ -8548,8 +8522,7 @@ _sc_muladd: #endif /* __APPLE__ */ stp x29, x30, [sp, #-96]! add x29, sp, #0 - str x17, [x29, #24] - str x19, [x29, #32] + stp x17, x19, [x29, #24] stp x20, x21, [x29, #40] stp x22, x23, [x29, #56] stp x24, x25, [x29, #72] @@ -8824,8 +8797,7 @@ _sc_muladd: # Store result stp x4, x5, [x0] stp x6, x7, [x0, #16] - ldr x17, [x29, #24] - ldr x19, [x29, #32] + ldp x17, x19, [x29, #24] ldp x20, x21, [x29, #40] ldp x22, x23, [x29, #56] ldp x24, x25, [x29, #72] diff --git a/wolfcrypt/src/port/arm/armv8-kyber-asm.S b/wolfcrypt/src/port/arm/armv8-kyber-asm.S new file mode 100644 index 0000000000..e73adbcc18 --- /dev/null +++ b/wolfcrypt/src/port/arm/armv8-kyber-asm.S @@ -0,0 +1,10079 @@ +/* armv8-kyber-asm + * + * Copyright (C) 2006-2024 wolfSSL Inc. + * + * This file is part of wolfSSL. + * + * wolfSSL is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * wolfSSL is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA + */ + +#ifdef HAVE_CONFIG_H + #include <config.h> +#endif /* HAVE_CONFIG_H */ +#include <wolfssl/wolfcrypt/settings.h> + +/* Generated using (from wolfssl): + * cd ../scripts + * ruby ./kyber/kyber.rb arm64 ../wolfssl/wolfcrypt/src/port/arm/armv8-kyber-asm.S + */ +#ifdef WOLFSSL_ARMASM +#ifdef __aarch64__ +#ifndef WOLFSSL_ARMASM_INLINE +#ifndef __APPLE__ + .text + .type L_kyber_aarch64_q, %object + .section .rodata + .size L_kyber_aarch64_q, 16 +#else + .section __DATA,__data +#endif /* __APPLE__ */ +#ifndef __APPLE__ + .align 2 +#else + .p2align 2 +#endif /* __APPLE__ */ +L_kyber_aarch64_q: + .short 0x0d01,0x0d01,0x0d01,0x0d01,0x0d01,0x0d01,0x0d01,0x0d01 +#ifndef __APPLE__ + .text + .type L_kyber_aarch64_consts, %object + .section .rodata + .size L_kyber_aarch64_consts, 16 +#else + .section __DATA,__data +#endif /* __APPLE__ */ +#ifndef __APPLE__ + .align 2 +#else + .p2align 2 +#endif /* __APPLE__ */ +L_kyber_aarch64_consts: + .short 0x0d01,0xf301,0x4ebf,0x0549,0x5049,0x0000,0x0000,0x0000 +#ifndef __APPLE__ + .text + .type L_sha3_aarch64_r, %object + .section .rodata + .size L_sha3_aarch64_r, 192 +#else + .section __DATA,__data +#endif /* __APPLE__ */ +#ifndef __APPLE__ + .align 3 +#else + .p2align 3 +#endif /* __APPLE__ */ +L_sha3_aarch64_r: + .xword 0x0000000000000001 + .xword 0x0000000000008082 + .xword 0x800000000000808a + .xword 0x8000000080008000 + .xword 0x000000000000808b + .xword 0x0000000080000001 + .xword 0x8000000080008081 + .xword 0x8000000000008009 + .xword 0x000000000000008a + .xword 0x0000000000000088 + .xword 0x0000000080008009 + .xword 0x000000008000000a + .xword 0x000000008000808b + .xword 0x800000000000008b + .xword 0x8000000000008089 + .xword 0x8000000000008003 + .xword 0x8000000000008002 + .xword 0x8000000000000080 + .xword 0x000000000000800a + .xword 0x800000008000000a + .xword 0x8000000080008081 + .xword 0x8000000000008080 + .xword 0x0000000080000001 + .xword 0x8000000080008008 +#ifdef WOLFSSL_WC_KYBER +#ifndef __APPLE__ + .text + .type L_kyber_aarch64_zetas, %object + .section .rodata + .size L_kyber_aarch64_zetas, 576 +#else + .section __DATA,__data +#endif /* __APPLE__ */ +#ifndef __APPLE__ + .align 2 +#else + .p2align 2 +#endif /* __APPLE__ */ +L_kyber_aarch64_zetas: + .short 0x08ed,0x0a0b,0x0b9a,0x0714,0x05d5,0x058e,0x011f,0x00ca + .short 0x0c56,0x026e,0x0629,0x00b6,0x03c2,0x084f,0x073f,0x05bc + .short 0x023d,0x07d4,0x0108,0x017f,0x09c4,0x05b2,0x06bf,0x0c7f + .short 0x0a58,0x03f9,0x02dc,0x0260,0x06fb,0x019b,0x0c34,0x06de + .short 0x04c7,0x04c7,0x04c7,0x04c7,0x028c,0x028c,0x028c,0x028c + .short 0x0ad9,0x0ad9,0x0ad9,0x0ad9,0x03f7,0x03f7,0x03f7,0x03f7 + .short 0x07f4,0x07f4,0x07f4,0x07f4,0x05d3,0x05d3,0x05d3,0x05d3 + .short 0x0be7,0x0be7,0x0be7,0x0be7,0x06f9,0x06f9,0x06f9,0x06f9 + .short 0x0204,0x0204,0x0204,0x0204,0x0cf9,0x0cf9,0x0cf9,0x0cf9 + .short 0x0bc1,0x0bc1,0x0bc1,0x0bc1,0x0a67,0x0a67,0x0a67,0x0a67 + .short 0x06af,0x06af,0x06af,0x06af,0x0877,0x0877,0x0877,0x0877 + .short 0x007e,0x007e,0x007e,0x007e,0x05bd,0x05bd,0x05bd,0x05bd + .short 0x09ac,0x09ac,0x09ac,0x09ac,0x0ca7,0x0ca7,0x0ca7,0x0ca7 + .short 0x0bf2,0x0bf2,0x0bf2,0x0bf2,0x033e,0x033e,0x033e,0x033e + .short 0x006b,0x006b,0x006b,0x006b,0x0774,0x0774,0x0774,0x0774 + .short 0x0c0a,0x0c0a,0x0c0a,0x0c0a,0x094a,0x094a,0x094a,0x094a + .short 0x0b73,0x0b73,0x0b73,0x0b73,0x03c1,0x03c1,0x03c1,0x03c1 + .short 
0x071d,0x071d,0x071d,0x071d,0x0a2c,0x0a2c,0x0a2c,0x0a2c + .short 0x01c0,0x01c0,0x01c0,0x01c0,0x08d8,0x08d8,0x08d8,0x08d8 + .short 0x02a5,0x02a5,0x02a5,0x02a5,0x0806,0x0806,0x0806,0x0806 + .short 0x08b2,0x08b2,0x01ae,0x01ae,0x022b,0x022b,0x034b,0x034b + .short 0x081e,0x081e,0x0367,0x0367,0x060e,0x060e,0x0069,0x0069 + .short 0x01a6,0x01a6,0x024b,0x024b,0x00b1,0x00b1,0x0c16,0x0c16 + .short 0x0bde,0x0bde,0x0b35,0x0b35,0x0626,0x0626,0x0675,0x0675 + .short 0x0c0b,0x0c0b,0x030a,0x030a,0x0487,0x0487,0x0c6e,0x0c6e + .short 0x09f8,0x09f8,0x05cb,0x05cb,0x0aa7,0x0aa7,0x045f,0x045f + .short 0x06cb,0x06cb,0x0284,0x0284,0x0999,0x0999,0x015d,0x015d + .short 0x01a2,0x01a2,0x0149,0x0149,0x0c65,0x0c65,0x0cb6,0x0cb6 + .short 0x0331,0x0331,0x0449,0x0449,0x025b,0x025b,0x0262,0x0262 + .short 0x052a,0x052a,0x07fc,0x07fc,0x0748,0x0748,0x0180,0x0180 + .short 0x0842,0x0842,0x0c79,0x0c79,0x04c2,0x04c2,0x07ca,0x07ca + .short 0x0997,0x0997,0x00dc,0x00dc,0x085e,0x085e,0x0686,0x0686 + .short 0x0860,0x0860,0x0707,0x0707,0x0803,0x0803,0x031a,0x031a + .short 0x071b,0x071b,0x09ab,0x09ab,0x099b,0x099b,0x01de,0x01de + .short 0x0c95,0x0c95,0x0bcd,0x0bcd,0x03e4,0x03e4,0x03df,0x03df + .short 0x03be,0x03be,0x074d,0x074d,0x05f2,0x05f2,0x065c,0x065c +#ifndef __APPLE__ + .text + .type L_kyber_aarch64_zetas_qinv, %object + .section .rodata + .size L_kyber_aarch64_zetas_qinv, 576 +#else + .section __DATA,__data +#endif /* __APPLE__ */ +#ifndef __APPLE__ + .align 2 +#else + .p2align 2 +#endif /* __APPLE__ */ +L_kyber_aarch64_zetas_qinv: + .short 0xffed,0x7b0b,0x399a,0x0314,0x34d5,0xcf8e,0x6e1f,0xbeca + .short 0xae56,0x6c6e,0xf129,0xc2b6,0x29c2,0x054f,0xd43f,0x79bc + .short 0xe93d,0x43d4,0x9908,0x8e7f,0x15c4,0xfbb2,0x53bf,0x997f + .short 0x9258,0x5ef9,0xd6dc,0x2260,0x47fb,0x229b,0x6834,0xc0de + .short 0xe9c7,0xe9c7,0xe9c7,0xe9c7,0xe68c,0xe68c,0xe68c,0xe68c + .short 0x05d9,0x05d9,0x05d9,0x05d9,0x78f7,0x78f7,0x78f7,0x78f7 + .short 0xa3f4,0xa3f4,0xa3f4,0xa3f4,0x4ed3,0x4ed3,0x4ed3,0x4ed3 + .short 0x50e7,0x50e7,0x50e7,0x50e7,0x61f9,0x61f9,0x61f9,0x61f9 + .short 0xce04,0xce04,0xce04,0xce04,0x67f9,0x67f9,0x67f9,0x67f9 + .short 0x3ec1,0x3ec1,0x3ec1,0x3ec1,0xcf67,0xcf67,0xcf67,0xcf67 + .short 0x23af,0x23af,0x23af,0x23af,0xfd77,0xfd77,0xfd77,0xfd77 + .short 0x9a7e,0x9a7e,0x9a7e,0x9a7e,0x6cbd,0x6cbd,0x6cbd,0x6cbd + .short 0x4dac,0x4dac,0x4dac,0x4dac,0x91a7,0x91a7,0x91a7,0x91a7 + .short 0xc1f2,0xc1f2,0xc1f2,0xc1f2,0xdd3e,0xdd3e,0xdd3e,0xdd3e + .short 0x916b,0x916b,0x916b,0x916b,0x2374,0x2374,0x2374,0x2374 + .short 0x8a0a,0x8a0a,0x8a0a,0x8a0a,0x474a,0x474a,0x474a,0x474a + .short 0x3473,0x3473,0x3473,0x3473,0x36c1,0x36c1,0x36c1,0x36c1 + .short 0x8e1d,0x8e1d,0x8e1d,0x8e1d,0xce2c,0xce2c,0xce2c,0xce2c + .short 0x41c0,0x41c0,0x41c0,0x41c0,0x10d8,0x10d8,0x10d8,0x10d8 + .short 0xa1a5,0xa1a5,0xa1a5,0xa1a5,0xba06,0xba06,0xba06,0xba06 + .short 0xfeb2,0xfeb2,0x2bae,0x2bae,0xd32b,0xd32b,0x344b,0x344b + .short 0x821e,0x821e,0xc867,0xc867,0x500e,0x500e,0xab69,0xab69 + .short 0x93a6,0x93a6,0x334b,0x334b,0x03b1,0x03b1,0xee16,0xee16 + .short 0xc5de,0xc5de,0x5a35,0x5a35,0x1826,0x1826,0x1575,0x1575 + .short 0x7d0b,0x7d0b,0x810a,0x810a,0x2987,0x2987,0x766e,0x766e + .short 0x71f8,0x71f8,0xb6cb,0xb6cb,0x8fa7,0x8fa7,0x315f,0x315f + .short 0xb7cb,0xb7cb,0x4e84,0x4e84,0x4499,0x4499,0x485d,0x485d + .short 0xc7a2,0xc7a2,0x4c49,0x4c49,0xeb65,0xeb65,0xceb6,0xceb6 + .short 0x8631,0x8631,0x4f49,0x4f49,0x635b,0x635b,0x0862,0x0862 + .short 0xe32a,0xe32a,0x3bfc,0x3bfc,0x5f48,0x5f48,0x8180,0x8180 + .short 0xae42,0xae42,0xe779,0xe779,0x2ac2,0x2ac2,0xc5ca,0xc5ca + .short 
0x5e97,0x5e97,0xd4dc,0xd4dc,0x425e,0x425e,0x3886,0x3886 + .short 0x2860,0x2860,0xac07,0xac07,0xe103,0xe103,0xb11a,0xb11a + .short 0xa81b,0xa81b,0x5aab,0x5aab,0x2a9b,0x2a9b,0xbbde,0xbbde + .short 0x7b95,0x7b95,0xa2cd,0xa2cd,0x6fe4,0x6fe4,0xb0df,0xb0df + .short 0x5dbe,0x5dbe,0x1e4d,0x1e4d,0xbbf2,0xbbf2,0x5a5c,0x5a5c +#ifndef __APPLE__ +.text +.globl kyber_ntt +.type kyber_ntt,@function +.align 2 +kyber_ntt: +#else +.section __TEXT,__text +.globl _kyber_ntt +.p2align 2 +_kyber_ntt: +#endif /* __APPLE__ */ + stp x29, x30, [sp, #-80]! + add x29, sp, #0 + stp d8, d9, [x29, #16] + stp d10, d11, [x29, #32] + stp d12, d13, [x29, #48] + stp d14, d15, [x29, #64] +#ifndef __APPLE__ + adrp x2, L_kyber_aarch64_zetas + add x2, x2, :lo12:L_kyber_aarch64_zetas +#else + adrp x2, L_kyber_aarch64_zetas@PAGE + add x2, x2, :lo12:L_kyber_aarch64_zetas@PAGEOFF +#endif /* __APPLE__ */ +#ifndef __APPLE__ + adrp x3, L_kyber_aarch64_zetas_qinv + add x3, x3, :lo12:L_kyber_aarch64_zetas_qinv +#else + adrp x3, L_kyber_aarch64_zetas_qinv@PAGE + add x3, x3, :lo12:L_kyber_aarch64_zetas_qinv@PAGEOFF +#endif /* __APPLE__ */ +#ifndef __APPLE__ + adrp x4, L_kyber_aarch64_consts + add x4, x4, :lo12:L_kyber_aarch64_consts +#else + adrp x4, L_kyber_aarch64_consts@PAGE + add x4, x4, :lo12:L_kyber_aarch64_consts@PAGEOFF +#endif /* __APPLE__ */ + add x1, x0, #0x100 + ldr q4, [x4] + ldr q5, [x0] + ldr q6, [x0, #32] + ldr q7, [x0, #64] + ldr q8, [x0, #96] + ldr q9, [x0, #128] + ldr q10, [x0, #160] + ldr q11, [x0, #192] + ldr q12, [x0, #224] + ldr q13, [x1] + ldr q14, [x1, #32] + ldr q15, [x1, #64] + ldr q16, [x1, #96] + ldr q17, [x1, #128] + ldr q18, [x1, #160] + ldr q19, [x1, #192] + ldr q20, [x1, #224] + ldr q0, [x2] + ldr q1, [x3] + mul v29.8h, v13.8h, v1.h[1] + mul v30.8h, v14.8h, v1.h[1] + sqrdmulh v21.8h, v13.8h, v0.h[1] + sqrdmulh v22.8h, v14.8h, v0.h[1] +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + sqrdmlsh v21.8h, v29.8h, v4.h[0] + sqrdmlsh v22.8h, v30.8h, v4.h[0] +#else + sqrdmulh v29.8h, v29.8h, v4.h[0] + sqrdmulh v30.8h, v30.8h, v4.h[0] + sub v21.8h, v21.8h, v29.8h + sub v22.8h, v22.8h, v30.8h +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + sshr v21.8h, v21.8h, #1 + sshr v22.8h, v22.8h, #1 + mul v29.8h, v15.8h, v1.h[1] + mul v30.8h, v16.8h, v1.h[1] + sqrdmulh v23.8h, v15.8h, v0.h[1] + sqrdmulh v24.8h, v16.8h, v0.h[1] +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + sqrdmlsh v23.8h, v29.8h, v4.h[0] + sqrdmlsh v24.8h, v30.8h, v4.h[0] +#else + sqrdmulh v29.8h, v29.8h, v4.h[0] + sqrdmulh v30.8h, v30.8h, v4.h[0] + sub v23.8h, v23.8h, v29.8h + sub v24.8h, v24.8h, v30.8h +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + sshr v23.8h, v23.8h, #1 + sshr v24.8h, v24.8h, #1 + mul v29.8h, v17.8h, v1.h[1] + mul v30.8h, v18.8h, v1.h[1] + sqrdmulh v25.8h, v17.8h, v0.h[1] + sqrdmulh v26.8h, v18.8h, v0.h[1] +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + sqrdmlsh v25.8h, v29.8h, v4.h[0] + sqrdmlsh v26.8h, v30.8h, v4.h[0] +#else + sqrdmulh v29.8h, v29.8h, v4.h[0] + sqrdmulh v30.8h, v30.8h, v4.h[0] + sub v25.8h, v25.8h, v29.8h + sub v26.8h, v26.8h, v30.8h +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + sshr v25.8h, v25.8h, #1 + sshr v26.8h, v26.8h, #1 + mul v29.8h, v19.8h, v1.h[1] + mul v30.8h, v20.8h, v1.h[1] + sqrdmulh v27.8h, v19.8h, v0.h[1] + sqrdmulh v28.8h, v20.8h, v0.h[1] +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + sqrdmlsh v27.8h, v29.8h, v4.h[0] + sqrdmlsh v28.8h, v30.8h, v4.h[0] +#else + sqrdmulh v29.8h, v29.8h, v4.h[0] + sqrdmulh v30.8h, v30.8h, v4.h[0] + sub v27.8h, v27.8h, v29.8h + sub v28.8h, v28.8h, v30.8h +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + sshr v27.8h, v27.8h, #1 + 
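/* Editor's note on the pattern repeated throughout kyber_ntt: this is a Montgomery multiply by zeta. mul forms t = a*(zeta*qinv) mod 2^16 (v1 holds zetas*qinv), sqrdmulh gives the rounded high half of 2*a*zeta (v0 holds zetas), sqrdmlsh subtracts the rounded high half of 2*t*q (v4.h[0] = q = 3329), and sshr #1 removes the doubling, leaving a*zeta*2^-16 mod q. */ +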
sshr v28.8h, v28.8h, #1 + sub v13.8h, v5.8h, v21.8h + add v5.8h, v5.8h, v21.8h + sub v14.8h, v6.8h, v22.8h + add v6.8h, v6.8h, v22.8h + sub v15.8h, v7.8h, v23.8h + add v7.8h, v7.8h, v23.8h + sub v16.8h, v8.8h, v24.8h + add v8.8h, v8.8h, v24.8h + sub v17.8h, v9.8h, v25.8h + add v9.8h, v9.8h, v25.8h + sub v18.8h, v10.8h, v26.8h + add v10.8h, v10.8h, v26.8h + sub v19.8h, v11.8h, v27.8h + add v11.8h, v11.8h, v27.8h + sub v20.8h, v12.8h, v28.8h + add v12.8h, v12.8h, v28.8h + mul v29.8h, v9.8h, v1.h[2] + mul v30.8h, v10.8h, v1.h[2] + sqrdmulh v21.8h, v9.8h, v0.h[2] + sqrdmulh v22.8h, v10.8h, v0.h[2] +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + sqrdmlsh v21.8h, v29.8h, v4.h[0] + sqrdmlsh v22.8h, v30.8h, v4.h[0] +#else + sqrdmulh v29.8h, v29.8h, v4.h[0] + sqrdmulh v30.8h, v30.8h, v4.h[0] + sub v21.8h, v21.8h, v29.8h + sub v22.8h, v22.8h, v30.8h +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + sshr v21.8h, v21.8h, #1 + sshr v22.8h, v22.8h, #1 + mul v29.8h, v11.8h, v1.h[2] + mul v30.8h, v12.8h, v1.h[2] + sqrdmulh v23.8h, v11.8h, v0.h[2] + sqrdmulh v24.8h, v12.8h, v0.h[2] +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + sqrdmlsh v23.8h, v29.8h, v4.h[0] + sqrdmlsh v24.8h, v30.8h, v4.h[0] +#else + sqrdmulh v29.8h, v29.8h, v4.h[0] + sqrdmulh v30.8h, v30.8h, v4.h[0] + sub v23.8h, v23.8h, v29.8h + sub v24.8h, v24.8h, v30.8h +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + sshr v23.8h, v23.8h, #1 + sshr v24.8h, v24.8h, #1 + mul v29.8h, v17.8h, v1.h[3] + mul v30.8h, v18.8h, v1.h[3] + sqrdmulh v25.8h, v17.8h, v0.h[3] + sqrdmulh v26.8h, v18.8h, v0.h[3] +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + sqrdmlsh v25.8h, v29.8h, v4.h[0] + sqrdmlsh v26.8h, v30.8h, v4.h[0] +#else + sqrdmulh v29.8h, v29.8h, v4.h[0] + sqrdmulh v30.8h, v30.8h, v4.h[0] + sub v25.8h, v25.8h, v29.8h + sub v26.8h, v26.8h, v30.8h +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + sshr v25.8h, v25.8h, #1 + sshr v26.8h, v26.8h, #1 + mul v29.8h, v19.8h, v1.h[3] + mul v30.8h, v20.8h, v1.h[3] + sqrdmulh v27.8h, v19.8h, v0.h[3] + sqrdmulh v28.8h, v20.8h, v0.h[3] +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + sqrdmlsh v27.8h, v29.8h, v4.h[0] + sqrdmlsh v28.8h, v30.8h, v4.h[0] +#else + sqrdmulh v29.8h, v29.8h, v4.h[0] + sqrdmulh v30.8h, v30.8h, v4.h[0] + sub v27.8h, v27.8h, v29.8h + sub v28.8h, v28.8h, v30.8h +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + sshr v27.8h, v27.8h, #1 + sshr v28.8h, v28.8h, #1 + sub v9.8h, v5.8h, v21.8h + add v5.8h, v5.8h, v21.8h + sub v10.8h, v6.8h, v22.8h + add v6.8h, v6.8h, v22.8h + sub v11.8h, v7.8h, v23.8h + add v7.8h, v7.8h, v23.8h + sub v12.8h, v8.8h, v24.8h + add v8.8h, v8.8h, v24.8h + sub v17.8h, v13.8h, v25.8h + add v13.8h, v13.8h, v25.8h + sub v18.8h, v14.8h, v26.8h + add v14.8h, v14.8h, v26.8h + sub v19.8h, v15.8h, v27.8h + add v15.8h, v15.8h, v27.8h + sub v20.8h, v16.8h, v28.8h + add v16.8h, v16.8h, v28.8h + mul v29.8h, v7.8h, v1.h[4] + mul v30.8h, v8.8h, v1.h[4] + sqrdmulh v21.8h, v7.8h, v0.h[4] + sqrdmulh v22.8h, v8.8h, v0.h[4] +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + sqrdmlsh v21.8h, v29.8h, v4.h[0] + sqrdmlsh v22.8h, v30.8h, v4.h[0] +#else + sqrdmulh v29.8h, v29.8h, v4.h[0] + sqrdmulh v30.8h, v30.8h, v4.h[0] + sub v21.8h, v21.8h, v29.8h + sub v22.8h, v22.8h, v30.8h +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + sshr v21.8h, v21.8h, #1 + sshr v22.8h, v22.8h, #1 + mul v29.8h, v11.8h, v1.h[5] + mul v30.8h, v12.8h, v1.h[5] + sqrdmulh v23.8h, v11.8h, v0.h[5] + sqrdmulh v24.8h, v12.8h, v0.h[5] +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + sqrdmlsh v23.8h, v29.8h, v4.h[0] + sqrdmlsh v24.8h, v30.8h, v4.h[0] +#else + sqrdmulh v29.8h, v29.8h, v4.h[0] + sqrdmulh v30.8h, v30.8h, 
v4.h[0] + sub v23.8h, v23.8h, v29.8h + sub v24.8h, v24.8h, v30.8h +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + sshr v23.8h, v23.8h, #1 + sshr v24.8h, v24.8h, #1 + mul v29.8h, v15.8h, v1.h[6] + mul v30.8h, v16.8h, v1.h[6] + sqrdmulh v25.8h, v15.8h, v0.h[6] + sqrdmulh v26.8h, v16.8h, v0.h[6] +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + sqrdmlsh v25.8h, v29.8h, v4.h[0] + sqrdmlsh v26.8h, v30.8h, v4.h[0] +#else + sqrdmulh v29.8h, v29.8h, v4.h[0] + sqrdmulh v30.8h, v30.8h, v4.h[0] + sub v25.8h, v25.8h, v29.8h + sub v26.8h, v26.8h, v30.8h +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + sshr v25.8h, v25.8h, #1 + sshr v26.8h, v26.8h, #1 + mul v29.8h, v19.8h, v1.h[7] + mul v30.8h, v20.8h, v1.h[7] + sqrdmulh v27.8h, v19.8h, v0.h[7] + sqrdmulh v28.8h, v20.8h, v0.h[7] +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + sqrdmlsh v27.8h, v29.8h, v4.h[0] + sqrdmlsh v28.8h, v30.8h, v4.h[0] +#else + sqrdmulh v29.8h, v29.8h, v4.h[0] + sqrdmulh v30.8h, v30.8h, v4.h[0] + sub v27.8h, v27.8h, v29.8h + sub v28.8h, v28.8h, v30.8h +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + sshr v27.8h, v27.8h, #1 + sshr v28.8h, v28.8h, #1 + sub v7.8h, v5.8h, v21.8h + add v5.8h, v5.8h, v21.8h + sub v8.8h, v6.8h, v22.8h + add v6.8h, v6.8h, v22.8h + sub v11.8h, v9.8h, v23.8h + add v9.8h, v9.8h, v23.8h + sub v12.8h, v10.8h, v24.8h + add v10.8h, v10.8h, v24.8h + sub v15.8h, v13.8h, v25.8h + add v13.8h, v13.8h, v25.8h + sub v16.8h, v14.8h, v26.8h + add v14.8h, v14.8h, v26.8h + sub v19.8h, v17.8h, v27.8h + add v17.8h, v17.8h, v27.8h + sub v20.8h, v18.8h, v28.8h + add v18.8h, v18.8h, v28.8h + ldr q0, [x2, #16] + ldr q1, [x3, #16] + mul v29.8h, v6.8h, v1.h[0] + mul v30.8h, v8.8h, v1.h[1] + sqrdmulh v21.8h, v6.8h, v0.h[0] + sqrdmulh v22.8h, v8.8h, v0.h[1] +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + sqrdmlsh v21.8h, v29.8h, v4.h[0] + sqrdmlsh v22.8h, v30.8h, v4.h[0] +#else + sqrdmulh v29.8h, v29.8h, v4.h[0] + sqrdmulh v30.8h, v30.8h, v4.h[0] + sub v21.8h, v21.8h, v29.8h + sub v22.8h, v22.8h, v30.8h +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + sshr v21.8h, v21.8h, #1 + sshr v22.8h, v22.8h, #1 + mul v29.8h, v10.8h, v1.h[2] + mul v30.8h, v12.8h, v1.h[3] + sqrdmulh v23.8h, v10.8h, v0.h[2] + sqrdmulh v24.8h, v12.8h, v0.h[3] +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + sqrdmlsh v23.8h, v29.8h, v4.h[0] + sqrdmlsh v24.8h, v30.8h, v4.h[0] +#else + sqrdmulh v29.8h, v29.8h, v4.h[0] + sqrdmulh v30.8h, v30.8h, v4.h[0] + sub v23.8h, v23.8h, v29.8h + sub v24.8h, v24.8h, v30.8h +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + sshr v23.8h, v23.8h, #1 + sshr v24.8h, v24.8h, #1 + mul v29.8h, v14.8h, v1.h[4] + mul v30.8h, v16.8h, v1.h[5] + sqrdmulh v25.8h, v14.8h, v0.h[4] + sqrdmulh v26.8h, v16.8h, v0.h[5] +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + sqrdmlsh v25.8h, v29.8h, v4.h[0] + sqrdmlsh v26.8h, v30.8h, v4.h[0] +#else + sqrdmulh v29.8h, v29.8h, v4.h[0] + sqrdmulh v30.8h, v30.8h, v4.h[0] + sub v25.8h, v25.8h, v29.8h + sub v26.8h, v26.8h, v30.8h +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + sshr v25.8h, v25.8h, #1 + sshr v26.8h, v26.8h, #1 + mul v29.8h, v18.8h, v1.h[6] + mul v30.8h, v20.8h, v1.h[7] + sqrdmulh v27.8h, v18.8h, v0.h[6] + sqrdmulh v28.8h, v20.8h, v0.h[7] +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + sqrdmlsh v27.8h, v29.8h, v4.h[0] + sqrdmlsh v28.8h, v30.8h, v4.h[0] +#else + sqrdmulh v29.8h, v29.8h, v4.h[0] + sqrdmulh v30.8h, v30.8h, v4.h[0] + sub v27.8h, v27.8h, v29.8h + sub v28.8h, v28.8h, v30.8h +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + sshr v27.8h, v27.8h, #1 + sshr v28.8h, v28.8h, #1 + sub v6.8h, v5.8h, v21.8h + add v5.8h, v5.8h, v21.8h + sub v8.8h, v7.8h, v22.8h + add v7.8h, v7.8h, v22.8h + sub 
v10.8h, v9.8h, v23.8h + add v9.8h, v9.8h, v23.8h + sub v12.8h, v11.8h, v24.8h + add v11.8h, v11.8h, v24.8h + sub v14.8h, v13.8h, v25.8h + add v13.8h, v13.8h, v25.8h + sub v16.8h, v15.8h, v26.8h + add v15.8h, v15.8h, v26.8h + sub v18.8h, v17.8h, v27.8h + add v17.8h, v17.8h, v27.8h + sub v20.8h, v19.8h, v28.8h + add v19.8h, v19.8h, v28.8h + str q5, [x0] + str q6, [x0, #32] + str q7, [x0, #64] + str q8, [x0, #96] + str q9, [x0, #128] + str q10, [x0, #160] + str q11, [x0, #192] + str q12, [x0, #224] + str q13, [x1] + str q14, [x1, #32] + str q15, [x1, #64] + str q16, [x1, #96] + str q17, [x1, #128] + str q18, [x1, #160] + str q19, [x1, #192] + str q20, [x1, #224] + ldr q5, [x0, #16] + ldr q6, [x0, #48] + ldr q7, [x0, #80] + ldr q8, [x0, #112] + ldr q9, [x0, #144] + ldr q10, [x0, #176] + ldr q11, [x0, #208] + ldr q12, [x0, #240] + ldr q13, [x1, #16] + ldr q14, [x1, #48] + ldr q15, [x1, #80] + ldr q16, [x1, #112] + ldr q17, [x1, #144] + ldr q18, [x1, #176] + ldr q19, [x1, #208] + ldr q20, [x1, #240] + ldr q0, [x2] + ldr q1, [x3] + mul v29.8h, v13.8h, v1.h[1] + mul v30.8h, v14.8h, v1.h[1] + sqrdmulh v21.8h, v13.8h, v0.h[1] + sqrdmulh v22.8h, v14.8h, v0.h[1] +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + sqrdmlsh v21.8h, v29.8h, v4.h[0] + sqrdmlsh v22.8h, v30.8h, v4.h[0] +#else + sqrdmulh v29.8h, v29.8h, v4.h[0] + sqrdmulh v30.8h, v30.8h, v4.h[0] + sub v21.8h, v21.8h, v29.8h + sub v22.8h, v22.8h, v30.8h +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + sshr v21.8h, v21.8h, #1 + sshr v22.8h, v22.8h, #1 + mul v29.8h, v15.8h, v1.h[1] + mul v30.8h, v16.8h, v1.h[1] + sqrdmulh v23.8h, v15.8h, v0.h[1] + sqrdmulh v24.8h, v16.8h, v0.h[1] +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + sqrdmlsh v23.8h, v29.8h, v4.h[0] + sqrdmlsh v24.8h, v30.8h, v4.h[0] +#else + sqrdmulh v29.8h, v29.8h, v4.h[0] + sqrdmulh v30.8h, v30.8h, v4.h[0] + sub v23.8h, v23.8h, v29.8h + sub v24.8h, v24.8h, v30.8h +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + sshr v23.8h, v23.8h, #1 + sshr v24.8h, v24.8h, #1 + mul v29.8h, v17.8h, v1.h[1] + mul v30.8h, v18.8h, v1.h[1] + sqrdmulh v25.8h, v17.8h, v0.h[1] + sqrdmulh v26.8h, v18.8h, v0.h[1] +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + sqrdmlsh v25.8h, v29.8h, v4.h[0] + sqrdmlsh v26.8h, v30.8h, v4.h[0] +#else + sqrdmulh v29.8h, v29.8h, v4.h[0] + sqrdmulh v30.8h, v30.8h, v4.h[0] + sub v25.8h, v25.8h, v29.8h + sub v26.8h, v26.8h, v30.8h +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + sshr v25.8h, v25.8h, #1 + sshr v26.8h, v26.8h, #1 + mul v29.8h, v19.8h, v1.h[1] + mul v30.8h, v20.8h, v1.h[1] + sqrdmulh v27.8h, v19.8h, v0.h[1] + sqrdmulh v28.8h, v20.8h, v0.h[1] +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + sqrdmlsh v27.8h, v29.8h, v4.h[0] + sqrdmlsh v28.8h, v30.8h, v4.h[0] +#else + sqrdmulh v29.8h, v29.8h, v4.h[0] + sqrdmulh v30.8h, v30.8h, v4.h[0] + sub v27.8h, v27.8h, v29.8h + sub v28.8h, v28.8h, v30.8h +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + sshr v27.8h, v27.8h, #1 + sshr v28.8h, v28.8h, #1 + sub v13.8h, v5.8h, v21.8h + add v5.8h, v5.8h, v21.8h + sub v14.8h, v6.8h, v22.8h + add v6.8h, v6.8h, v22.8h + sub v15.8h, v7.8h, v23.8h + add v7.8h, v7.8h, v23.8h + sub v16.8h, v8.8h, v24.8h + add v8.8h, v8.8h, v24.8h + sub v17.8h, v9.8h, v25.8h + add v9.8h, v9.8h, v25.8h + sub v18.8h, v10.8h, v26.8h + add v10.8h, v10.8h, v26.8h + sub v19.8h, v11.8h, v27.8h + add v11.8h, v11.8h, v27.8h + sub v20.8h, v12.8h, v28.8h + add v12.8h, v12.8h, v28.8h + mul v29.8h, v9.8h, v1.h[2] + mul v30.8h, v10.8h, v1.h[2] + sqrdmulh v21.8h, v9.8h, v0.h[2] + sqrdmulh v22.8h, v10.8h, v0.h[2] +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + sqrdmlsh v21.8h, v29.8h, v4.h[0] + 
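/* Editor's note: sqrdmlsh is an ARMv8.1-A RDM instruction. Building with WOLFSSL_AARCH64_NO_SQRMLSH (added by this patch's configure.ac change for the generic -mcpu=generic+crypto case) selects the #else branches, which emulate it with sqrdmulh plus sub for cores lacking the extension. */ +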
sqrdmlsh v22.8h, v30.8h, v4.h[0] +#else + sqrdmulh v29.8h, v29.8h, v4.h[0] + sqrdmulh v30.8h, v30.8h, v4.h[0] + sub v21.8h, v21.8h, v29.8h + sub v22.8h, v22.8h, v30.8h +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + sshr v21.8h, v21.8h, #1 + sshr v22.8h, v22.8h, #1 + mul v29.8h, v11.8h, v1.h[2] + mul v30.8h, v12.8h, v1.h[2] + sqrdmulh v23.8h, v11.8h, v0.h[2] + sqrdmulh v24.8h, v12.8h, v0.h[2] +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + sqrdmlsh v23.8h, v29.8h, v4.h[0] + sqrdmlsh v24.8h, v30.8h, v4.h[0] +#else + sqrdmulh v29.8h, v29.8h, v4.h[0] + sqrdmulh v30.8h, v30.8h, v4.h[0] + sub v23.8h, v23.8h, v29.8h + sub v24.8h, v24.8h, v30.8h +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + sshr v23.8h, v23.8h, #1 + sshr v24.8h, v24.8h, #1 + mul v29.8h, v17.8h, v1.h[3] + mul v30.8h, v18.8h, v1.h[3] + sqrdmulh v25.8h, v17.8h, v0.h[3] + sqrdmulh v26.8h, v18.8h, v0.h[3] +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + sqrdmlsh v25.8h, v29.8h, v4.h[0] + sqrdmlsh v26.8h, v30.8h, v4.h[0] +#else + sqrdmulh v29.8h, v29.8h, v4.h[0] + sqrdmulh v30.8h, v30.8h, v4.h[0] + sub v25.8h, v25.8h, v29.8h + sub v26.8h, v26.8h, v30.8h +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + sshr v25.8h, v25.8h, #1 + sshr v26.8h, v26.8h, #1 + mul v29.8h, v19.8h, v1.h[3] + mul v30.8h, v20.8h, v1.h[3] + sqrdmulh v27.8h, v19.8h, v0.h[3] + sqrdmulh v28.8h, v20.8h, v0.h[3] +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + sqrdmlsh v27.8h, v29.8h, v4.h[0] + sqrdmlsh v28.8h, v30.8h, v4.h[0] +#else + sqrdmulh v29.8h, v29.8h, v4.h[0] + sqrdmulh v30.8h, v30.8h, v4.h[0] + sub v27.8h, v27.8h, v29.8h + sub v28.8h, v28.8h, v30.8h +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + sshr v27.8h, v27.8h, #1 + sshr v28.8h, v28.8h, #1 + sub v9.8h, v5.8h, v21.8h + add v5.8h, v5.8h, v21.8h + sub v10.8h, v6.8h, v22.8h + add v6.8h, v6.8h, v22.8h + sub v11.8h, v7.8h, v23.8h + add v7.8h, v7.8h, v23.8h + sub v12.8h, v8.8h, v24.8h + add v8.8h, v8.8h, v24.8h + sub v17.8h, v13.8h, v25.8h + add v13.8h, v13.8h, v25.8h + sub v18.8h, v14.8h, v26.8h + add v14.8h, v14.8h, v26.8h + sub v19.8h, v15.8h, v27.8h + add v15.8h, v15.8h, v27.8h + sub v20.8h, v16.8h, v28.8h + add v16.8h, v16.8h, v28.8h + mul v29.8h, v7.8h, v1.h[4] + mul v30.8h, v8.8h, v1.h[4] + sqrdmulh v21.8h, v7.8h, v0.h[4] + sqrdmulh v22.8h, v8.8h, v0.h[4] +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + sqrdmlsh v21.8h, v29.8h, v4.h[0] + sqrdmlsh v22.8h, v30.8h, v4.h[0] +#else + sqrdmulh v29.8h, v29.8h, v4.h[0] + sqrdmulh v30.8h, v30.8h, v4.h[0] + sub v21.8h, v21.8h, v29.8h + sub v22.8h, v22.8h, v30.8h +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + sshr v21.8h, v21.8h, #1 + sshr v22.8h, v22.8h, #1 + mul v29.8h, v11.8h, v1.h[5] + mul v30.8h, v12.8h, v1.h[5] + sqrdmulh v23.8h, v11.8h, v0.h[5] + sqrdmulh v24.8h, v12.8h, v0.h[5] +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + sqrdmlsh v23.8h, v29.8h, v4.h[0] + sqrdmlsh v24.8h, v30.8h, v4.h[0] +#else + sqrdmulh v29.8h, v29.8h, v4.h[0] + sqrdmulh v30.8h, v30.8h, v4.h[0] + sub v23.8h, v23.8h, v29.8h + sub v24.8h, v24.8h, v30.8h +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + sshr v23.8h, v23.8h, #1 + sshr v24.8h, v24.8h, #1 + mul v29.8h, v15.8h, v1.h[6] + mul v30.8h, v16.8h, v1.h[6] + sqrdmulh v25.8h, v15.8h, v0.h[6] + sqrdmulh v26.8h, v16.8h, v0.h[6] +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + sqrdmlsh v25.8h, v29.8h, v4.h[0] + sqrdmlsh v26.8h, v30.8h, v4.h[0] +#else + sqrdmulh v29.8h, v29.8h, v4.h[0] + sqrdmulh v30.8h, v30.8h, v4.h[0] + sub v25.8h, v25.8h, v29.8h + sub v26.8h, v26.8h, v30.8h +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + sshr v25.8h, v25.8h, #1 + sshr v26.8h, v26.8h, #1 + mul v29.8h, v19.8h, v1.h[7] + mul v30.8h, v20.8h, 
v1.h[7] + sqrdmulh v27.8h, v19.8h, v0.h[7] + sqrdmulh v28.8h, v20.8h, v0.h[7] +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + sqrdmlsh v27.8h, v29.8h, v4.h[0] + sqrdmlsh v28.8h, v30.8h, v4.h[0] +#else + sqrdmulh v29.8h, v29.8h, v4.h[0] + sqrdmulh v30.8h, v30.8h, v4.h[0] + sub v27.8h, v27.8h, v29.8h + sub v28.8h, v28.8h, v30.8h +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + sshr v27.8h, v27.8h, #1 + sshr v28.8h, v28.8h, #1 + sub v7.8h, v5.8h, v21.8h + add v5.8h, v5.8h, v21.8h + sub v8.8h, v6.8h, v22.8h + add v6.8h, v6.8h, v22.8h + sub v11.8h, v9.8h, v23.8h + add v9.8h, v9.8h, v23.8h + sub v12.8h, v10.8h, v24.8h + add v10.8h, v10.8h, v24.8h + sub v15.8h, v13.8h, v25.8h + add v13.8h, v13.8h, v25.8h + sub v16.8h, v14.8h, v26.8h + add v14.8h, v14.8h, v26.8h + sub v19.8h, v17.8h, v27.8h + add v17.8h, v17.8h, v27.8h + sub v20.8h, v18.8h, v28.8h + add v18.8h, v18.8h, v28.8h + ldr q0, [x2, #16] + ldr q1, [x3, #16] + mul v29.8h, v6.8h, v1.h[0] + mul v30.8h, v8.8h, v1.h[1] + sqrdmulh v21.8h, v6.8h, v0.h[0] + sqrdmulh v22.8h, v8.8h, v0.h[1] +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + sqrdmlsh v21.8h, v29.8h, v4.h[0] + sqrdmlsh v22.8h, v30.8h, v4.h[0] +#else + sqrdmulh v29.8h, v29.8h, v4.h[0] + sqrdmulh v30.8h, v30.8h, v4.h[0] + sub v21.8h, v21.8h, v29.8h + sub v22.8h, v22.8h, v30.8h +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + sshr v21.8h, v21.8h, #1 + sshr v22.8h, v22.8h, #1 + mul v29.8h, v10.8h, v1.h[2] + mul v30.8h, v12.8h, v1.h[3] + sqrdmulh v23.8h, v10.8h, v0.h[2] + sqrdmulh v24.8h, v12.8h, v0.h[3] +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + sqrdmlsh v23.8h, v29.8h, v4.h[0] + sqrdmlsh v24.8h, v30.8h, v4.h[0] +#else + sqrdmulh v29.8h, v29.8h, v4.h[0] + sqrdmulh v30.8h, v30.8h, v4.h[0] + sub v23.8h, v23.8h, v29.8h + sub v24.8h, v24.8h, v30.8h +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + sshr v23.8h, v23.8h, #1 + sshr v24.8h, v24.8h, #1 + mul v29.8h, v14.8h, v1.h[4] + mul v30.8h, v16.8h, v1.h[5] + sqrdmulh v25.8h, v14.8h, v0.h[4] + sqrdmulh v26.8h, v16.8h, v0.h[5] +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + sqrdmlsh v25.8h, v29.8h, v4.h[0] + sqrdmlsh v26.8h, v30.8h, v4.h[0] +#else + sqrdmulh v29.8h, v29.8h, v4.h[0] + sqrdmulh v30.8h, v30.8h, v4.h[0] + sub v25.8h, v25.8h, v29.8h + sub v26.8h, v26.8h, v30.8h +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + sshr v25.8h, v25.8h, #1 + sshr v26.8h, v26.8h, #1 + mul v29.8h, v18.8h, v1.h[6] + mul v30.8h, v20.8h, v1.h[7] + sqrdmulh v27.8h, v18.8h, v0.h[6] + sqrdmulh v28.8h, v20.8h, v0.h[7] +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + sqrdmlsh v27.8h, v29.8h, v4.h[0] + sqrdmlsh v28.8h, v30.8h, v4.h[0] +#else + sqrdmulh v29.8h, v29.8h, v4.h[0] + sqrdmulh v30.8h, v30.8h, v4.h[0] + sub v27.8h, v27.8h, v29.8h + sub v28.8h, v28.8h, v30.8h +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + sshr v27.8h, v27.8h, #1 + sshr v28.8h, v28.8h, #1 + sub v6.8h, v5.8h, v21.8h + add v5.8h, v5.8h, v21.8h + sub v8.8h, v7.8h, v22.8h + add v7.8h, v7.8h, v22.8h + sub v10.8h, v9.8h, v23.8h + add v9.8h, v9.8h, v23.8h + sub v12.8h, v11.8h, v24.8h + add v11.8h, v11.8h, v24.8h + sub v14.8h, v13.8h, v25.8h + add v13.8h, v13.8h, v25.8h + sub v16.8h, v15.8h, v26.8h + add v15.8h, v15.8h, v26.8h + sub v18.8h, v17.8h, v27.8h + add v17.8h, v17.8h, v27.8h + sub v20.8h, v19.8h, v28.8h + add v19.8h, v19.8h, v28.8h + str q5, [x0, #16] + str q6, [x0, #48] + str q7, [x0, #80] + str q8, [x0, #112] + str q9, [x0, #144] + str q10, [x0, #176] + str q11, [x0, #208] + str q12, [x0, #240] + str q13, [x1, #16] + str q14, [x1, #48] + str q15, [x1, #80] + str q16, [x1, #112] + str q17, [x1, #144] + str q18, [x1, #176] + str q19, [x1, #208] + str q20, [x1, 
#240] + ldp q5, q6, [x0] + ldp q7, q8, [x0, #32] + ldp q9, q10, [x0, #64] + ldp q11, q12, [x0, #96] + ldp q13, q14, [x0, #128] + ldp q15, q16, [x0, #160] + ldp q17, q18, [x0, #192] + ldp q19, q20, [x0, #224] + ldr q0, [x2, #32] + ldr q1, [x3, #32] + mul v29.8h, v6.8h, v1.h[0] + mul v30.8h, v8.8h, v1.h[1] + sqrdmulh v21.8h, v6.8h, v0.h[0] + sqrdmulh v22.8h, v8.8h, v0.h[1] +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + sqrdmlsh v21.8h, v29.8h, v4.h[0] + sqrdmlsh v22.8h, v30.8h, v4.h[0] +#else + sqrdmulh v29.8h, v29.8h, v4.h[0] + sqrdmulh v30.8h, v30.8h, v4.h[0] + sub v21.8h, v21.8h, v29.8h + sub v22.8h, v22.8h, v30.8h +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + sshr v21.8h, v21.8h, #1 + sshr v22.8h, v22.8h, #1 + mul v29.8h, v10.8h, v1.h[2] + mul v30.8h, v12.8h, v1.h[3] + sqrdmulh v23.8h, v10.8h, v0.h[2] + sqrdmulh v24.8h, v12.8h, v0.h[3] +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + sqrdmlsh v23.8h, v29.8h, v4.h[0] + sqrdmlsh v24.8h, v30.8h, v4.h[0] +#else + sqrdmulh v29.8h, v29.8h, v4.h[0] + sqrdmulh v30.8h, v30.8h, v4.h[0] + sub v23.8h, v23.8h, v29.8h + sub v24.8h, v24.8h, v30.8h +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + sshr v23.8h, v23.8h, #1 + sshr v24.8h, v24.8h, #1 + mul v29.8h, v14.8h, v1.h[4] + mul v30.8h, v16.8h, v1.h[5] + sqrdmulh v25.8h, v14.8h, v0.h[4] + sqrdmulh v26.8h, v16.8h, v0.h[5] +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + sqrdmlsh v25.8h, v29.8h, v4.h[0] + sqrdmlsh v26.8h, v30.8h, v4.h[0] +#else + sqrdmulh v29.8h, v29.8h, v4.h[0] + sqrdmulh v30.8h, v30.8h, v4.h[0] + sub v25.8h, v25.8h, v29.8h + sub v26.8h, v26.8h, v30.8h +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + sshr v25.8h, v25.8h, #1 + sshr v26.8h, v26.8h, #1 + mul v29.8h, v18.8h, v1.h[6] + mul v30.8h, v20.8h, v1.h[7] + sqrdmulh v27.8h, v18.8h, v0.h[6] + sqrdmulh v28.8h, v20.8h, v0.h[7] +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + sqrdmlsh v27.8h, v29.8h, v4.h[0] + sqrdmlsh v28.8h, v30.8h, v4.h[0] +#else + sqrdmulh v29.8h, v29.8h, v4.h[0] + sqrdmulh v30.8h, v30.8h, v4.h[0] + sub v27.8h, v27.8h, v29.8h + sub v28.8h, v28.8h, v30.8h +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + sshr v27.8h, v27.8h, #1 + sshr v28.8h, v28.8h, #1 + sub v6.8h, v5.8h, v21.8h + add v5.8h, v5.8h, v21.8h + sub v8.8h, v7.8h, v22.8h + add v7.8h, v7.8h, v22.8h + sub v10.8h, v9.8h, v23.8h + add v9.8h, v9.8h, v23.8h + sub v12.8h, v11.8h, v24.8h + add v11.8h, v11.8h, v24.8h + sub v14.8h, v13.8h, v25.8h + add v13.8h, v13.8h, v25.8h + sub v16.8h, v15.8h, v26.8h + add v15.8h, v15.8h, v26.8h + sub v18.8h, v17.8h, v27.8h + add v17.8h, v17.8h, v27.8h + sub v20.8h, v19.8h, v28.8h + add v19.8h, v19.8h, v28.8h + ldr q0, [x2, #64] + ldr q2, [x2, #80] + ldr q1, [x3, #64] + ldr q3, [x3, #80] + mov v29.16b, v5.16b + mov v30.16b, v7.16b + trn1 v5.2d, v5.2d, v6.2d + trn1 v7.2d, v7.2d, v8.2d + trn2 v6.2d, v29.2d, v6.2d + trn2 v8.2d, v30.2d, v8.2d + mul v29.8h, v6.8h, v1.8h + mul v30.8h, v8.8h, v3.8h + sqrdmulh v21.8h, v6.8h, v0.8h + sqrdmulh v22.8h, v8.8h, v2.8h +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + sqrdmlsh v21.8h, v29.8h, v4.h[0] + sqrdmlsh v22.8h, v30.8h, v4.h[0] +#else + sqrdmulh v29.8h, v29.8h, v4.h[0] + sqrdmulh v30.8h, v30.8h, v4.h[0] + sub v21.8h, v21.8h, v29.8h + sub v22.8h, v22.8h, v30.8h +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + sshr v21.8h, v21.8h, #1 + sshr v22.8h, v22.8h, #1 + ldr q0, [x2, #96] + ldr q2, [x2, #112] + ldr q1, [x3, #96] + ldr q3, [x3, #112] + mov v29.16b, v9.16b + mov v30.16b, v11.16b + trn1 v9.2d, v9.2d, v10.2d + trn1 v11.2d, v11.2d, v12.2d + trn2 v10.2d, v29.2d, v10.2d + trn2 v12.2d, v30.2d, v12.2d + mul v29.8h, v10.8h, v1.8h + mul v30.8h, v12.8h, v3.8h + 
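/* Editor's note: at these deeper NTT levels each 64-bit lane needs its own zeta, so trn1/trn2 on .2d (and later .4s) lanes transpose vector pairs and the zetas/zetas_qinv tables are applied as whole vectors (q0-q3) instead of broadcast v0.h[k]/v1.h[k] elements. */ +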
sqrdmulh v23.8h, v10.8h, v0.8h + sqrdmulh v24.8h, v12.8h, v2.8h +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + sqrdmlsh v23.8h, v29.8h, v4.h[0] + sqrdmlsh v24.8h, v30.8h, v4.h[0] +#else + sqrdmulh v29.8h, v29.8h, v4.h[0] + sqrdmulh v30.8h, v30.8h, v4.h[0] + sub v23.8h, v23.8h, v29.8h + sub v24.8h, v24.8h, v30.8h +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + sshr v23.8h, v23.8h, #1 + sshr v24.8h, v24.8h, #1 + ldr q0, [x2, #128] + ldr q2, [x2, #144] + ldr q1, [x3, #128] + ldr q3, [x3, #144] + mov v29.16b, v13.16b + mov v30.16b, v15.16b + trn1 v13.2d, v13.2d, v14.2d + trn1 v15.2d, v15.2d, v16.2d + trn2 v14.2d, v29.2d, v14.2d + trn2 v16.2d, v30.2d, v16.2d + mul v29.8h, v14.8h, v1.8h + mul v30.8h, v16.8h, v3.8h + sqrdmulh v25.8h, v14.8h, v0.8h + sqrdmulh v26.8h, v16.8h, v2.8h +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + sqrdmlsh v25.8h, v29.8h, v4.h[0] + sqrdmlsh v26.8h, v30.8h, v4.h[0] +#else + sqrdmulh v29.8h, v29.8h, v4.h[0] + sqrdmulh v30.8h, v30.8h, v4.h[0] + sub v25.8h, v25.8h, v29.8h + sub v26.8h, v26.8h, v30.8h +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + sshr v25.8h, v25.8h, #1 + sshr v26.8h, v26.8h, #1 + ldr q0, [x2, #160] + ldr q2, [x2, #176] + ldr q1, [x3, #160] + ldr q3, [x3, #176] + mov v29.16b, v17.16b + mov v30.16b, v19.16b + trn1 v17.2d, v17.2d, v18.2d + trn1 v19.2d, v19.2d, v20.2d + trn2 v18.2d, v29.2d, v18.2d + trn2 v20.2d, v30.2d, v20.2d + mul v29.8h, v18.8h, v1.8h + mul v30.8h, v20.8h, v3.8h + sqrdmulh v27.8h, v18.8h, v0.8h + sqrdmulh v28.8h, v20.8h, v2.8h +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + sqrdmlsh v27.8h, v29.8h, v4.h[0] + sqrdmlsh v28.8h, v30.8h, v4.h[0] +#else + sqrdmulh v29.8h, v29.8h, v4.h[0] + sqrdmulh v30.8h, v30.8h, v4.h[0] + sub v27.8h, v27.8h, v29.8h + sub v28.8h, v28.8h, v30.8h +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + sshr v27.8h, v27.8h, #1 + sshr v28.8h, v28.8h, #1 + sub v6.8h, v5.8h, v21.8h + add v5.8h, v5.8h, v21.8h + sub v8.8h, v7.8h, v22.8h + add v7.8h, v7.8h, v22.8h + sub v10.8h, v9.8h, v23.8h + add v9.8h, v9.8h, v23.8h + sub v12.8h, v11.8h, v24.8h + add v11.8h, v11.8h, v24.8h + sub v14.8h, v13.8h, v25.8h + add v13.8h, v13.8h, v25.8h + sub v16.8h, v15.8h, v26.8h + add v15.8h, v15.8h, v26.8h + sub v18.8h, v17.8h, v27.8h + add v17.8h, v17.8h, v27.8h + sub v20.8h, v19.8h, v28.8h + add v19.8h, v19.8h, v28.8h + ldr q0, [x2, #320] + ldr q2, [x2, #336] + ldr q1, [x3, #320] + ldr q3, [x3, #336] + mov v29.16b, v5.16b + mov v30.16b, v7.16b + trn1 v5.4s, v5.4s, v6.4s + trn1 v7.4s, v7.4s, v8.4s + trn2 v6.4s, v29.4s, v6.4s + trn2 v8.4s, v30.4s, v8.4s + mul v29.8h, v6.8h, v1.8h + mul v30.8h, v8.8h, v3.8h + sqrdmulh v21.8h, v6.8h, v0.8h + sqrdmulh v22.8h, v8.8h, v2.8h +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + sqrdmlsh v21.8h, v29.8h, v4.h[0] + sqrdmlsh v22.8h, v30.8h, v4.h[0] +#else + sqrdmulh v29.8h, v29.8h, v4.h[0] + sqrdmulh v30.8h, v30.8h, v4.h[0] + sub v21.8h, v21.8h, v29.8h + sub v22.8h, v22.8h, v30.8h +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + sshr v21.8h, v21.8h, #1 + sshr v22.8h, v22.8h, #1 + ldr q0, [x2, #352] + ldr q2, [x2, #368] + ldr q1, [x3, #352] + ldr q3, [x3, #368] + mov v29.16b, v9.16b + mov v30.16b, v11.16b + trn1 v9.4s, v9.4s, v10.4s + trn1 v11.4s, v11.4s, v12.4s + trn2 v10.4s, v29.4s, v10.4s + trn2 v12.4s, v30.4s, v12.4s + mul v29.8h, v10.8h, v1.8h + mul v30.8h, v12.8h, v3.8h + sqrdmulh v23.8h, v10.8h, v0.8h + sqrdmulh v24.8h, v12.8h, v2.8h +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + sqrdmlsh v23.8h, v29.8h, v4.h[0] + sqrdmlsh v24.8h, v30.8h, v4.h[0] +#else + sqrdmulh v29.8h, v29.8h, v4.h[0] + sqrdmulh v30.8h, v30.8h, v4.h[0] + sub v23.8h, v23.8h, v29.8h + sub v24.8h, 
v24.8h, v30.8h +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + sshr v23.8h, v23.8h, #1 + sshr v24.8h, v24.8h, #1 + ldr q0, [x2, #384] + ldr q2, [x2, #400] + ldr q1, [x3, #384] + ldr q3, [x3, #400] + mov v29.16b, v13.16b + mov v30.16b, v15.16b + trn1 v13.4s, v13.4s, v14.4s + trn1 v15.4s, v15.4s, v16.4s + trn2 v14.4s, v29.4s, v14.4s + trn2 v16.4s, v30.4s, v16.4s + mul v29.8h, v14.8h, v1.8h + mul v30.8h, v16.8h, v3.8h + sqrdmulh v25.8h, v14.8h, v0.8h + sqrdmulh v26.8h, v16.8h, v2.8h +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + sqrdmlsh v25.8h, v29.8h, v4.h[0] + sqrdmlsh v26.8h, v30.8h, v4.h[0] +#else + sqrdmulh v29.8h, v29.8h, v4.h[0] + sqrdmulh v30.8h, v30.8h, v4.h[0] + sub v25.8h, v25.8h, v29.8h + sub v26.8h, v26.8h, v30.8h +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + sshr v25.8h, v25.8h, #1 + sshr v26.8h, v26.8h, #1 + ldr q0, [x2, #416] + ldr q2, [x2, #432] + ldr q1, [x3, #416] + ldr q3, [x3, #432] + mov v29.16b, v17.16b + mov v30.16b, v19.16b + trn1 v17.4s, v17.4s, v18.4s + trn1 v19.4s, v19.4s, v20.4s + trn2 v18.4s, v29.4s, v18.4s + trn2 v20.4s, v30.4s, v20.4s + mul v29.8h, v18.8h, v1.8h + mul v30.8h, v20.8h, v3.8h + sqrdmulh v27.8h, v18.8h, v0.8h + sqrdmulh v28.8h, v20.8h, v2.8h +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + sqrdmlsh v27.8h, v29.8h, v4.h[0] + sqrdmlsh v28.8h, v30.8h, v4.h[0] +#else + sqrdmulh v29.8h, v29.8h, v4.h[0] + sqrdmulh v30.8h, v30.8h, v4.h[0] + sub v27.8h, v27.8h, v29.8h + sub v28.8h, v28.8h, v30.8h +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + sshr v27.8h, v27.8h, #1 + sshr v28.8h, v28.8h, #1 + sub v6.8h, v5.8h, v21.8h + add v5.8h, v5.8h, v21.8h + sub v8.8h, v7.8h, v22.8h + add v7.8h, v7.8h, v22.8h + sub v10.8h, v9.8h, v23.8h + add v9.8h, v9.8h, v23.8h + sub v12.8h, v11.8h, v24.8h + add v11.8h, v11.8h, v24.8h + sub v14.8h, v13.8h, v25.8h + add v13.8h, v13.8h, v25.8h + sub v16.8h, v15.8h, v26.8h + add v15.8h, v15.8h, v26.8h + sub v18.8h, v17.8h, v27.8h + add v17.8h, v17.8h, v27.8h + sub v20.8h, v19.8h, v28.8h + add v19.8h, v19.8h, v28.8h + sqdmulh v21.8h, v5.8h, v4.h[2] + sqdmulh v22.8h, v6.8h, v4.h[2] + sshr v21.8h, v21.8h, #11 + sshr v22.8h, v22.8h, #11 + mls v5.8h, v21.8h, v4.h[0] + mls v6.8h, v22.8h, v4.h[0] + sqdmulh v21.8h, v7.8h, v4.h[2] + sqdmulh v22.8h, v8.8h, v4.h[2] + sshr v21.8h, v21.8h, #11 + sshr v22.8h, v22.8h, #11 + mls v7.8h, v21.8h, v4.h[0] + mls v8.8h, v22.8h, v4.h[0] + sqdmulh v21.8h, v9.8h, v4.h[2] + sqdmulh v22.8h, v10.8h, v4.h[2] + sshr v21.8h, v21.8h, #11 + sshr v22.8h, v22.8h, #11 + mls v9.8h, v21.8h, v4.h[0] + mls v10.8h, v22.8h, v4.h[0] + sqdmulh v21.8h, v11.8h, v4.h[2] + sqdmulh v22.8h, v12.8h, v4.h[2] + sshr v21.8h, v21.8h, #11 + sshr v22.8h, v22.8h, #11 + mls v11.8h, v21.8h, v4.h[0] + mls v12.8h, v22.8h, v4.h[0] + sqdmulh v21.8h, v13.8h, v4.h[2] + sqdmulh v22.8h, v14.8h, v4.h[2] + sshr v21.8h, v21.8h, #11 + sshr v22.8h, v22.8h, #11 + mls v13.8h, v21.8h, v4.h[0] + mls v14.8h, v22.8h, v4.h[0] + sqdmulh v21.8h, v15.8h, v4.h[2] + sqdmulh v22.8h, v16.8h, v4.h[2] + sshr v21.8h, v21.8h, #11 + sshr v22.8h, v22.8h, #11 + mls v15.8h, v21.8h, v4.h[0] + mls v16.8h, v22.8h, v4.h[0] + sqdmulh v21.8h, v17.8h, v4.h[2] + sqdmulh v22.8h, v18.8h, v4.h[2] + sshr v21.8h, v21.8h, #11 + sshr v22.8h, v22.8h, #11 + mls v17.8h, v21.8h, v4.h[0] + mls v18.8h, v22.8h, v4.h[0] + sqdmulh v21.8h, v19.8h, v4.h[2] + sqdmulh v22.8h, v20.8h, v4.h[2] + sshr v21.8h, v21.8h, #11 + sshr v22.8h, v22.8h, #11 + mls v19.8h, v21.8h, v4.h[0] + mls v20.8h, v22.8h, v4.h[0] + mov v29.16b, v5.16b + trn1 v5.4s, v5.4s, v6.4s + trn2 v6.4s, v29.4s, v6.4s + mov v29.16b, v5.16b + trn1 v5.2d, v5.2d, v6.2d + trn2 
v6.2d, v29.2d, v6.2d + mov v29.16b, v7.16b + trn1 v7.4s, v7.4s, v8.4s + trn2 v8.4s, v29.4s, v8.4s + mov v29.16b, v7.16b + trn1 v7.2d, v7.2d, v8.2d + trn2 v8.2d, v29.2d, v8.2d + mov v29.16b, v9.16b + trn1 v9.4s, v9.4s, v10.4s + trn2 v10.4s, v29.4s, v10.4s + mov v29.16b, v9.16b + trn1 v9.2d, v9.2d, v10.2d + trn2 v10.2d, v29.2d, v10.2d + mov v29.16b, v11.16b + trn1 v11.4s, v11.4s, v12.4s + trn2 v12.4s, v29.4s, v12.4s + mov v29.16b, v11.16b + trn1 v11.2d, v11.2d, v12.2d + trn2 v12.2d, v29.2d, v12.2d + mov v29.16b, v13.16b + trn1 v13.4s, v13.4s, v14.4s + trn2 v14.4s, v29.4s, v14.4s + mov v29.16b, v13.16b + trn1 v13.2d, v13.2d, v14.2d + trn2 v14.2d, v29.2d, v14.2d + mov v29.16b, v15.16b + trn1 v15.4s, v15.4s, v16.4s + trn2 v16.4s, v29.4s, v16.4s + mov v29.16b, v15.16b + trn1 v15.2d, v15.2d, v16.2d + trn2 v16.2d, v29.2d, v16.2d + mov v29.16b, v17.16b + trn1 v17.4s, v17.4s, v18.4s + trn2 v18.4s, v29.4s, v18.4s + mov v29.16b, v17.16b + trn1 v17.2d, v17.2d, v18.2d + trn2 v18.2d, v29.2d, v18.2d + mov v29.16b, v19.16b + trn1 v19.4s, v19.4s, v20.4s + trn2 v20.4s, v29.4s, v20.4s + mov v29.16b, v19.16b + trn1 v19.2d, v19.2d, v20.2d + trn2 v20.2d, v29.2d, v20.2d + stp q5, q6, [x0] + stp q7, q8, [x0, #32] + stp q9, q10, [x0, #64] + stp q11, q12, [x0, #96] + stp q13, q14, [x0, #128] + stp q15, q16, [x0, #160] + stp q17, q18, [x0, #192] + stp q19, q20, [x0, #224] + ldp q5, q6, [x1] + ldp q7, q8, [x1, #32] + ldp q9, q10, [x1, #64] + ldp q11, q12, [x1, #96] + ldp q13, q14, [x1, #128] + ldp q15, q16, [x1, #160] + ldp q17, q18, [x1, #192] + ldp q19, q20, [x1, #224] + ldr q0, [x2, #48] + ldr q1, [x3, #48] + mul v29.8h, v6.8h, v1.h[0] + mul v30.8h, v8.8h, v1.h[1] + sqrdmulh v21.8h, v6.8h, v0.h[0] + sqrdmulh v22.8h, v8.8h, v0.h[1] +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + sqrdmlsh v21.8h, v29.8h, v4.h[0] + sqrdmlsh v22.8h, v30.8h, v4.h[0] +#else + sqrdmulh v29.8h, v29.8h, v4.h[0] + sqrdmulh v30.8h, v30.8h, v4.h[0] + sub v21.8h, v21.8h, v29.8h + sub v22.8h, v22.8h, v30.8h +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + sshr v21.8h, v21.8h, #1 + sshr v22.8h, v22.8h, #1 + mul v29.8h, v10.8h, v1.h[2] + mul v30.8h, v12.8h, v1.h[3] + sqrdmulh v23.8h, v10.8h, v0.h[2] + sqrdmulh v24.8h, v12.8h, v0.h[3] +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + sqrdmlsh v23.8h, v29.8h, v4.h[0] + sqrdmlsh v24.8h, v30.8h, v4.h[0] +#else + sqrdmulh v29.8h, v29.8h, v4.h[0] + sqrdmulh v30.8h, v30.8h, v4.h[0] + sub v23.8h, v23.8h, v29.8h + sub v24.8h, v24.8h, v30.8h +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + sshr v23.8h, v23.8h, #1 + sshr v24.8h, v24.8h, #1 + mul v29.8h, v14.8h, v1.h[4] + mul v30.8h, v16.8h, v1.h[5] + sqrdmulh v25.8h, v14.8h, v0.h[4] + sqrdmulh v26.8h, v16.8h, v0.h[5] +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + sqrdmlsh v25.8h, v29.8h, v4.h[0] + sqrdmlsh v26.8h, v30.8h, v4.h[0] +#else + sqrdmulh v29.8h, v29.8h, v4.h[0] + sqrdmulh v30.8h, v30.8h, v4.h[0] + sub v25.8h, v25.8h, v29.8h + sub v26.8h, v26.8h, v30.8h +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + sshr v25.8h, v25.8h, #1 + sshr v26.8h, v26.8h, #1 + mul v29.8h, v18.8h, v1.h[6] + mul v30.8h, v20.8h, v1.h[7] + sqrdmulh v27.8h, v18.8h, v0.h[6] + sqrdmulh v28.8h, v20.8h, v0.h[7] +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + sqrdmlsh v27.8h, v29.8h, v4.h[0] + sqrdmlsh v28.8h, v30.8h, v4.h[0] +#else + sqrdmulh v29.8h, v29.8h, v4.h[0] + sqrdmulh v30.8h, v30.8h, v4.h[0] + sub v27.8h, v27.8h, v29.8h + sub v28.8h, v28.8h, v30.8h +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + sshr v27.8h, v27.8h, #1 + sshr v28.8h, v28.8h, #1 + sub v6.8h, v5.8h, v21.8h + add v5.8h, v5.8h, v21.8h + sub v8.8h, v7.8h, v22.8h + add 
v7.8h, v7.8h, v22.8h + sub v10.8h, v9.8h, v23.8h + add v9.8h, v9.8h, v23.8h + sub v12.8h, v11.8h, v24.8h + add v11.8h, v11.8h, v24.8h + sub v14.8h, v13.8h, v25.8h + add v13.8h, v13.8h, v25.8h + sub v16.8h, v15.8h, v26.8h + add v15.8h, v15.8h, v26.8h + sub v18.8h, v17.8h, v27.8h + add v17.8h, v17.8h, v27.8h + sub v20.8h, v19.8h, v28.8h + add v19.8h, v19.8h, v28.8h + ldr q0, [x2, #192] + ldr q2, [x2, #208] + ldr q1, [x3, #192] + ldr q3, [x3, #208] + mov v29.16b, v5.16b + mov v30.16b, v7.16b + trn1 v5.2d, v5.2d, v6.2d + trn1 v7.2d, v7.2d, v8.2d + trn2 v6.2d, v29.2d, v6.2d + trn2 v8.2d, v30.2d, v8.2d + mul v29.8h, v6.8h, v1.8h + mul v30.8h, v8.8h, v3.8h + sqrdmulh v21.8h, v6.8h, v0.8h + sqrdmulh v22.8h, v8.8h, v2.8h +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + sqrdmlsh v21.8h, v29.8h, v4.h[0] + sqrdmlsh v22.8h, v30.8h, v4.h[0] +#else + sqrdmulh v29.8h, v29.8h, v4.h[0] + sqrdmulh v30.8h, v30.8h, v4.h[0] + sub v21.8h, v21.8h, v29.8h + sub v22.8h, v22.8h, v30.8h +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + sshr v21.8h, v21.8h, #1 + sshr v22.8h, v22.8h, #1 + ldr q0, [x2, #224] + ldr q2, [x2, #240] + ldr q1, [x3, #224] + ldr q3, [x3, #240] + mov v29.16b, v9.16b + mov v30.16b, v11.16b + trn1 v9.2d, v9.2d, v10.2d + trn1 v11.2d, v11.2d, v12.2d + trn2 v10.2d, v29.2d, v10.2d + trn2 v12.2d, v30.2d, v12.2d + mul v29.8h, v10.8h, v1.8h + mul v30.8h, v12.8h, v3.8h + sqrdmulh v23.8h, v10.8h, v0.8h + sqrdmulh v24.8h, v12.8h, v2.8h +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + sqrdmlsh v23.8h, v29.8h, v4.h[0] + sqrdmlsh v24.8h, v30.8h, v4.h[0] +#else + sqrdmulh v29.8h, v29.8h, v4.h[0] + sqrdmulh v30.8h, v30.8h, v4.h[0] + sub v23.8h, v23.8h, v29.8h + sub v24.8h, v24.8h, v30.8h +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + sshr v23.8h, v23.8h, #1 + sshr v24.8h, v24.8h, #1 + ldr q0, [x2, #256] + ldr q2, [x2, #272] + ldr q1, [x3, #256] + ldr q3, [x3, #272] + mov v29.16b, v13.16b + mov v30.16b, v15.16b + trn1 v13.2d, v13.2d, v14.2d + trn1 v15.2d, v15.2d, v16.2d + trn2 v14.2d, v29.2d, v14.2d + trn2 v16.2d, v30.2d, v16.2d + mul v29.8h, v14.8h, v1.8h + mul v30.8h, v16.8h, v3.8h + sqrdmulh v25.8h, v14.8h, v0.8h + sqrdmulh v26.8h, v16.8h, v2.8h +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + sqrdmlsh v25.8h, v29.8h, v4.h[0] + sqrdmlsh v26.8h, v30.8h, v4.h[0] +#else + sqrdmulh v29.8h, v29.8h, v4.h[0] + sqrdmulh v30.8h, v30.8h, v4.h[0] + sub v25.8h, v25.8h, v29.8h + sub v26.8h, v26.8h, v30.8h +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + sshr v25.8h, v25.8h, #1 + sshr v26.8h, v26.8h, #1 + ldr q0, [x2, #288] + ldr q2, [x2, #304] + ldr q1, [x3, #288] + ldr q3, [x3, #304] + mov v29.16b, v17.16b + mov v30.16b, v19.16b + trn1 v17.2d, v17.2d, v18.2d + trn1 v19.2d, v19.2d, v20.2d + trn2 v18.2d, v29.2d, v18.2d + trn2 v20.2d, v30.2d, v20.2d + mul v29.8h, v18.8h, v1.8h + mul v30.8h, v20.8h, v3.8h + sqrdmulh v27.8h, v18.8h, v0.8h + sqrdmulh v28.8h, v20.8h, v2.8h +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + sqrdmlsh v27.8h, v29.8h, v4.h[0] + sqrdmlsh v28.8h, v30.8h, v4.h[0] +#else + sqrdmulh v29.8h, v29.8h, v4.h[0] + sqrdmulh v30.8h, v30.8h, v4.h[0] + sub v27.8h, v27.8h, v29.8h + sub v28.8h, v28.8h, v30.8h +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + sshr v27.8h, v27.8h, #1 + sshr v28.8h, v28.8h, #1 + sub v6.8h, v5.8h, v21.8h + add v5.8h, v5.8h, v21.8h + sub v8.8h, v7.8h, v22.8h + add v7.8h, v7.8h, v22.8h + sub v10.8h, v9.8h, v23.8h + add v9.8h, v9.8h, v23.8h + sub v12.8h, v11.8h, v24.8h + add v11.8h, v11.8h, v24.8h + sub v14.8h, v13.8h, v25.8h + add v13.8h, v13.8h, v25.8h + sub v16.8h, v15.8h, v26.8h + add v15.8h, v15.8h, v26.8h + sub v18.8h, v17.8h, v27.8h + add v17.8h, 
v17.8h, v27.8h + sub v20.8h, v19.8h, v28.8h + add v19.8h, v19.8h, v28.8h + ldr q0, [x2, #448] + ldr q2, [x2, #464] + ldr q1, [x3, #448] + ldr q3, [x3, #464] + mov v29.16b, v5.16b + mov v30.16b, v7.16b + trn1 v5.4s, v5.4s, v6.4s + trn1 v7.4s, v7.4s, v8.4s + trn2 v6.4s, v29.4s, v6.4s + trn2 v8.4s, v30.4s, v8.4s + mul v29.8h, v6.8h, v1.8h + mul v30.8h, v8.8h, v3.8h + sqrdmulh v21.8h, v6.8h, v0.8h + sqrdmulh v22.8h, v8.8h, v2.8h +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + sqrdmlsh v21.8h, v29.8h, v4.h[0] + sqrdmlsh v22.8h, v30.8h, v4.h[0] +#else + sqrdmulh v29.8h, v29.8h, v4.h[0] + sqrdmulh v30.8h, v30.8h, v4.h[0] + sub v21.8h, v21.8h, v29.8h + sub v22.8h, v22.8h, v30.8h +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + sshr v21.8h, v21.8h, #1 + sshr v22.8h, v22.8h, #1 + ldr q0, [x2, #480] + ldr q2, [x2, #496] + ldr q1, [x3, #480] + ldr q3, [x3, #496] + mov v29.16b, v9.16b + mov v30.16b, v11.16b + trn1 v9.4s, v9.4s, v10.4s + trn1 v11.4s, v11.4s, v12.4s + trn2 v10.4s, v29.4s, v10.4s + trn2 v12.4s, v30.4s, v12.4s + mul v29.8h, v10.8h, v1.8h + mul v30.8h, v12.8h, v3.8h + sqrdmulh v23.8h, v10.8h, v0.8h + sqrdmulh v24.8h, v12.8h, v2.8h +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + sqrdmlsh v23.8h, v29.8h, v4.h[0] + sqrdmlsh v24.8h, v30.8h, v4.h[0] +#else + sqrdmulh v29.8h, v29.8h, v4.h[0] + sqrdmulh v30.8h, v30.8h, v4.h[0] + sub v23.8h, v23.8h, v29.8h + sub v24.8h, v24.8h, v30.8h +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + sshr v23.8h, v23.8h, #1 + sshr v24.8h, v24.8h, #1 + ldr q0, [x2, #512] + ldr q2, [x2, #528] + ldr q1, [x3, #512] + ldr q3, [x3, #528] + mov v29.16b, v13.16b + mov v30.16b, v15.16b + trn1 v13.4s, v13.4s, v14.4s + trn1 v15.4s, v15.4s, v16.4s + trn2 v14.4s, v29.4s, v14.4s + trn2 v16.4s, v30.4s, v16.4s + mul v29.8h, v14.8h, v1.8h + mul v30.8h, v16.8h, v3.8h + sqrdmulh v25.8h, v14.8h, v0.8h + sqrdmulh v26.8h, v16.8h, v2.8h +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + sqrdmlsh v25.8h, v29.8h, v4.h[0] + sqrdmlsh v26.8h, v30.8h, v4.h[0] +#else + sqrdmulh v29.8h, v29.8h, v4.h[0] + sqrdmulh v30.8h, v30.8h, v4.h[0] + sub v25.8h, v25.8h, v29.8h + sub v26.8h, v26.8h, v30.8h +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + sshr v25.8h, v25.8h, #1 + sshr v26.8h, v26.8h, #1 + ldr q0, [x2, #544] + ldr q2, [x2, #560] + ldr q1, [x3, #544] + ldr q3, [x3, #560] + mov v29.16b, v17.16b + mov v30.16b, v19.16b + trn1 v17.4s, v17.4s, v18.4s + trn1 v19.4s, v19.4s, v20.4s + trn2 v18.4s, v29.4s, v18.4s + trn2 v20.4s, v30.4s, v20.4s + mul v29.8h, v18.8h, v1.8h + mul v30.8h, v20.8h, v3.8h + sqrdmulh v27.8h, v18.8h, v0.8h + sqrdmulh v28.8h, v20.8h, v2.8h +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + sqrdmlsh v27.8h, v29.8h, v4.h[0] + sqrdmlsh v28.8h, v30.8h, v4.h[0] +#else + sqrdmulh v29.8h, v29.8h, v4.h[0] + sqrdmulh v30.8h, v30.8h, v4.h[0] + sub v27.8h, v27.8h, v29.8h + sub v28.8h, v28.8h, v30.8h +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + sshr v27.8h, v27.8h, #1 + sshr v28.8h, v28.8h, #1 + sub v6.8h, v5.8h, v21.8h + add v5.8h, v5.8h, v21.8h + sub v8.8h, v7.8h, v22.8h + add v7.8h, v7.8h, v22.8h + sub v10.8h, v9.8h, v23.8h + add v9.8h, v9.8h, v23.8h + sub v12.8h, v11.8h, v24.8h + add v11.8h, v11.8h, v24.8h + sub v14.8h, v13.8h, v25.8h + add v13.8h, v13.8h, v25.8h + sub v16.8h, v15.8h, v26.8h + add v15.8h, v15.8h, v26.8h + sub v18.8h, v17.8h, v27.8h + add v17.8h, v17.8h, v27.8h + sub v20.8h, v19.8h, v28.8h + add v19.8h, v19.8h, v28.8h + sqdmulh v21.8h, v5.8h, v4.h[2] + sqdmulh v22.8h, v6.8h, v4.h[2] + sshr v21.8h, v21.8h, #11 + sshr v22.8h, v22.8h, #11 + mls v5.8h, v21.8h, v4.h[0] + mls v6.8h, v22.8h, v4.h[0] + sqdmulh v21.8h, v7.8h, v4.h[2] + 
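/* Editor's note: the sqdmulh/sshr #11/mls sequence below is a Barrett reduction. v4.h[2] = 0x4ebf = 20159, approximately 2^26/q; sqdmulh followed by sshr #11 computes an approximate quotient a/q, and mls subtracts q times that quotient to bring coefficients back into range. */ +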
sqdmulh v22.8h, v8.8h, v4.h[2] + sshr v21.8h, v21.8h, #11 + sshr v22.8h, v22.8h, #11 + mls v7.8h, v21.8h, v4.h[0] + mls v8.8h, v22.8h, v4.h[0] + sqdmulh v21.8h, v9.8h, v4.h[2] + sqdmulh v22.8h, v10.8h, v4.h[2] + sshr v21.8h, v21.8h, #11 + sshr v22.8h, v22.8h, #11 + mls v9.8h, v21.8h, v4.h[0] + mls v10.8h, v22.8h, v4.h[0] + sqdmulh v21.8h, v11.8h, v4.h[2] + sqdmulh v22.8h, v12.8h, v4.h[2] + sshr v21.8h, v21.8h, #11 + sshr v22.8h, v22.8h, #11 + mls v11.8h, v21.8h, v4.h[0] + mls v12.8h, v22.8h, v4.h[0] + sqdmulh v21.8h, v13.8h, v4.h[2] + sqdmulh v22.8h, v14.8h, v4.h[2] + sshr v21.8h, v21.8h, #11 + sshr v22.8h, v22.8h, #11 + mls v13.8h, v21.8h, v4.h[0] + mls v14.8h, v22.8h, v4.h[0] + sqdmulh v21.8h, v15.8h, v4.h[2] + sqdmulh v22.8h, v16.8h, v4.h[2] + sshr v21.8h, v21.8h, #11 + sshr v22.8h, v22.8h, #11 + mls v15.8h, v21.8h, v4.h[0] + mls v16.8h, v22.8h, v4.h[0] + sqdmulh v21.8h, v17.8h, v4.h[2] + sqdmulh v22.8h, v18.8h, v4.h[2] + sshr v21.8h, v21.8h, #11 + sshr v22.8h, v22.8h, #11 + mls v17.8h, v21.8h, v4.h[0] + mls v18.8h, v22.8h, v4.h[0] + sqdmulh v21.8h, v19.8h, v4.h[2] + sqdmulh v22.8h, v20.8h, v4.h[2] + sshr v21.8h, v21.8h, #11 + sshr v22.8h, v22.8h, #11 + mls v19.8h, v21.8h, v4.h[0] + mls v20.8h, v22.8h, v4.h[0] + mov v29.16b, v5.16b + trn1 v5.4s, v5.4s, v6.4s + trn2 v6.4s, v29.4s, v6.4s + mov v29.16b, v5.16b + trn1 v5.2d, v5.2d, v6.2d + trn2 v6.2d, v29.2d, v6.2d + mov v29.16b, v7.16b + trn1 v7.4s, v7.4s, v8.4s + trn2 v8.4s, v29.4s, v8.4s + mov v29.16b, v7.16b + trn1 v7.2d, v7.2d, v8.2d + trn2 v8.2d, v29.2d, v8.2d + mov v29.16b, v9.16b + trn1 v9.4s, v9.4s, v10.4s + trn2 v10.4s, v29.4s, v10.4s + mov v29.16b, v9.16b + trn1 v9.2d, v9.2d, v10.2d + trn2 v10.2d, v29.2d, v10.2d + mov v29.16b, v11.16b + trn1 v11.4s, v11.4s, v12.4s + trn2 v12.4s, v29.4s, v12.4s + mov v29.16b, v11.16b + trn1 v11.2d, v11.2d, v12.2d + trn2 v12.2d, v29.2d, v12.2d + mov v29.16b, v13.16b + trn1 v13.4s, v13.4s, v14.4s + trn2 v14.4s, v29.4s, v14.4s + mov v29.16b, v13.16b + trn1 v13.2d, v13.2d, v14.2d + trn2 v14.2d, v29.2d, v14.2d + mov v29.16b, v15.16b + trn1 v15.4s, v15.4s, v16.4s + trn2 v16.4s, v29.4s, v16.4s + mov v29.16b, v15.16b + trn1 v15.2d, v15.2d, v16.2d + trn2 v16.2d, v29.2d, v16.2d + mov v29.16b, v17.16b + trn1 v17.4s, v17.4s, v18.4s + trn2 v18.4s, v29.4s, v18.4s + mov v29.16b, v17.16b + trn1 v17.2d, v17.2d, v18.2d + trn2 v18.2d, v29.2d, v18.2d + mov v29.16b, v19.16b + trn1 v19.4s, v19.4s, v20.4s + trn2 v20.4s, v29.4s, v20.4s + mov v29.16b, v19.16b + trn1 v19.2d, v19.2d, v20.2d + trn2 v20.2d, v29.2d, v20.2d + stp q5, q6, [x1] + stp q7, q8, [x1, #32] + stp q9, q10, [x1, #64] + stp q11, q12, [x1, #96] + stp q13, q14, [x1, #128] + stp q15, q16, [x1, #160] + stp q17, q18, [x1, #192] + stp q19, q20, [x1, #224] + ldp d8, d9, [x29, #16] + ldp d10, d11, [x29, #32] + ldp d12, d13, [x29, #48] + ldp d14, d15, [x29, #64] + ldp x29, x30, [sp], #0x50 + ret +#ifndef __APPLE__ + .size kyber_ntt,.-kyber_ntt +#endif /* __APPLE__ */ +#ifndef __APPLE__ + .text + .type L_kyber_aarch64_zetas_inv, %object + .section .rodata + .size L_kyber_aarch64_zetas_inv, 576 +#else + .section __DATA,__data +#endif /* __APPLE__ */ +#ifndef __APPLE__ + .align 2 +#else + .p2align 2 +#endif /* __APPLE__ */ +L_kyber_aarch64_zetas_inv: + .short 0x06a5,0x06a5,0x070f,0x070f,0x05b4,0x05b4,0x0943,0x0943 + .short 0x0922,0x0922,0x091d,0x091d,0x0134,0x0134,0x006c,0x006c + .short 0x0b23,0x0b23,0x0366,0x0366,0x0356,0x0356,0x05e6,0x05e6 + .short 0x09e7,0x09e7,0x04fe,0x04fe,0x05fa,0x05fa,0x04a1,0x04a1 + .short 
0x067b,0x067b,0x04a3,0x04a3,0x0c25,0x0c25,0x036a,0x036a + .short 0x0537,0x0537,0x083f,0x083f,0x0088,0x0088,0x04bf,0x04bf + .short 0x0b81,0x0b81,0x05b9,0x05b9,0x0505,0x0505,0x07d7,0x07d7 + .short 0x0a9f,0x0a9f,0x0aa6,0x0aa6,0x08b8,0x08b8,0x09d0,0x09d0 + .short 0x004b,0x004b,0x009c,0x009c,0x0bb8,0x0bb8,0x0b5f,0x0b5f + .short 0x0ba4,0x0ba4,0x0368,0x0368,0x0a7d,0x0a7d,0x0636,0x0636 + .short 0x08a2,0x08a2,0x025a,0x025a,0x0736,0x0736,0x0309,0x0309 + .short 0x0093,0x0093,0x087a,0x087a,0x09f7,0x09f7,0x00f6,0x00f6 + .short 0x068c,0x068c,0x06db,0x06db,0x01cc,0x01cc,0x0123,0x0123 + .short 0x00eb,0x00eb,0x0c50,0x0c50,0x0ab6,0x0ab6,0x0b5b,0x0b5b + .short 0x0c98,0x0c98,0x06f3,0x06f3,0x099a,0x099a,0x04e3,0x04e3 + .short 0x09b6,0x09b6,0x0ad6,0x0ad6,0x0b53,0x0b53,0x044f,0x044f + .short 0x04fb,0x04fb,0x04fb,0x04fb,0x0a5c,0x0a5c,0x0a5c,0x0a5c + .short 0x0429,0x0429,0x0429,0x0429,0x0b41,0x0b41,0x0b41,0x0b41 + .short 0x02d5,0x02d5,0x02d5,0x02d5,0x05e4,0x05e4,0x05e4,0x05e4 + .short 0x0940,0x0940,0x0940,0x0940,0x018e,0x018e,0x018e,0x018e + .short 0x03b7,0x03b7,0x03b7,0x03b7,0x00f7,0x00f7,0x00f7,0x00f7 + .short 0x058d,0x058d,0x058d,0x058d,0x0c96,0x0c96,0x0c96,0x0c96 + .short 0x09c3,0x09c3,0x09c3,0x09c3,0x010f,0x010f,0x010f,0x010f + .short 0x005a,0x005a,0x005a,0x005a,0x0355,0x0355,0x0355,0x0355 + .short 0x0744,0x0744,0x0744,0x0744,0x0c83,0x0c83,0x0c83,0x0c83 + .short 0x048a,0x048a,0x048a,0x048a,0x0652,0x0652,0x0652,0x0652 + .short 0x029a,0x029a,0x029a,0x029a,0x0140,0x0140,0x0140,0x0140 + .short 0x0008,0x0008,0x0008,0x0008,0x0afd,0x0afd,0x0afd,0x0afd + .short 0x0608,0x0608,0x0608,0x0608,0x011a,0x011a,0x011a,0x011a + .short 0x072e,0x072e,0x072e,0x072e,0x050d,0x050d,0x050d,0x050d + .short 0x090a,0x090a,0x090a,0x090a,0x0228,0x0228,0x0228,0x0228 + .short 0x0a75,0x0a75,0x0a75,0x0a75,0x083a,0x083a,0x083a,0x083a + .short 0x0623,0x00cd,0x0b66,0x0606,0x0aa1,0x0a25,0x0908,0x02a9 + .short 0x0082,0x0642,0x074f,0x033d,0x0b82,0x0bf9,0x052d,0x0ac4 + .short 0x0745,0x05c2,0x04b2,0x093f,0x0c4b,0x06d8,0x0a93,0x00ab + .short 0x0c37,0x0be2,0x0773,0x072c,0x05ed,0x0167,0x02f6,0x05a1 +#ifndef __APPLE__ + .text + .type L_kyber_aarch64_zetas_inv_qinv, %object + .section .rodata + .size L_kyber_aarch64_zetas_inv_qinv, 576 +#else + .section __DATA,__data +#endif /* __APPLE__ */ +#ifndef __APPLE__ + .align 2 +#else + .p2align 2 +#endif /* __APPLE__ */ +L_kyber_aarch64_zetas_inv_qinv: + .short 0xa5a5,0xa5a5,0x440f,0x440f,0xe1b4,0xe1b4,0xa243,0xa243 + .short 0x4f22,0x4f22,0x901d,0x901d,0x5d34,0x5d34,0x846c,0x846c + .short 0x4423,0x4423,0xd566,0xd566,0xa556,0xa556,0x57e6,0x57e6 + .short 0x4ee7,0x4ee7,0x1efe,0x1efe,0x53fa,0x53fa,0xd7a1,0xd7a1 + .short 0xc77b,0xc77b,0xbda3,0xbda3,0x2b25,0x2b25,0xa16a,0xa16a + .short 0x3a37,0x3a37,0xd53f,0xd53f,0x1888,0x1888,0x51bf,0x51bf + .short 0x7e81,0x7e81,0xa0b9,0xa0b9,0xc405,0xc405,0x1cd7,0x1cd7 + .short 0xf79f,0xf79f,0x9ca6,0x9ca6,0xb0b8,0xb0b8,0x79d0,0x79d0 + .short 0x314b,0x314b,0x149c,0x149c,0xb3b8,0xb3b8,0x385f,0x385f + .short 0xb7a4,0xb7a4,0xbb68,0xbb68,0xb17d,0xb17d,0x4836,0x4836 + .short 0xcea2,0xcea2,0x705a,0x705a,0x4936,0x4936,0x8e09,0x8e09 + .short 0x8993,0x8993,0xd67a,0xd67a,0x7ef7,0x7ef7,0x82f6,0x82f6 + .short 0xea8c,0xea8c,0xe7db,0xe7db,0xa5cc,0xa5cc,0x3a23,0x3a23 + .short 0x11eb,0x11eb,0xfc50,0xfc50,0xccb6,0xccb6,0x6c5b,0x6c5b + .short 0x5498,0x5498,0xaff3,0xaff3,0x379a,0x379a,0x7de3,0x7de3 + .short 0xcbb6,0xcbb6,0x2cd6,0x2cd6,0xd453,0xd453,0x014f,0x014f + .short 0x45fb,0x45fb,0x45fb,0x45fb,0x5e5c,0x5e5c,0x5e5c,0x5e5c + .short 0xef29,0xef29,0xef29,0xef29,0xbe41,0xbe41,0xbe41,0xbe41 + .short 
0x31d5,0x31d5,0x31d5,0x31d5,0x71e4,0x71e4,0x71e4,0x71e4 + .short 0xc940,0xc940,0xc940,0xc940,0xcb8e,0xcb8e,0xcb8e,0xcb8e + .short 0xb8b7,0xb8b7,0xb8b7,0xb8b7,0x75f7,0x75f7,0x75f7,0x75f7 + .short 0xdc8d,0xdc8d,0xdc8d,0xdc8d,0x6e96,0x6e96,0x6e96,0x6e96 + .short 0x22c3,0x22c3,0x22c3,0x22c3,0x3e0f,0x3e0f,0x3e0f,0x3e0f + .short 0x6e5a,0x6e5a,0x6e5a,0x6e5a,0xb255,0xb255,0xb255,0xb255 + .short 0x9344,0x9344,0x9344,0x9344,0x6583,0x6583,0x6583,0x6583 + .short 0x028a,0x028a,0x028a,0x028a,0xdc52,0xdc52,0xdc52,0xdc52 + .short 0x309a,0x309a,0x309a,0x309a,0xc140,0xc140,0xc140,0xc140 + .short 0x9808,0x9808,0x9808,0x9808,0x31fd,0x31fd,0x31fd,0x31fd + .short 0x9e08,0x9e08,0x9e08,0x9e08,0xaf1a,0xaf1a,0xaf1a,0xaf1a + .short 0xb12e,0xb12e,0xb12e,0xb12e,0x5c0d,0x5c0d,0x5c0d,0x5c0d + .short 0x870a,0x870a,0x870a,0x870a,0xfa28,0xfa28,0xfa28,0xfa28 + .short 0x1975,0x1975,0x1975,0x1975,0x163a,0x163a,0x163a,0x163a + .short 0x3f23,0x97cd,0xdd66,0xb806,0xdda1,0x2925,0xa108,0x6da9 + .short 0x6682,0xac42,0x044f,0xea3d,0x7182,0x66f9,0xbc2d,0x16c4 + .short 0x8645,0x2bc2,0xfab2,0xd63f,0x3d4b,0x0ed8,0x9393,0x51ab + .short 0x4137,0x91e2,0x3073,0xcb2c,0xfced,0xc667,0x84f6,0xd8a1 +#ifndef __APPLE__ +.text +.globl kyber_invntt +.type kyber_invntt,@function +.align 2 +kyber_invntt: +#else +.section __TEXT,__text +.globl _kyber_invntt +.p2align 2 +_kyber_invntt: +#endif /* __APPLE__ */ + stp x29, x30, [sp, #-80]! + add x29, sp, #0 + stp d8, d9, [x29, #16] + stp d10, d11, [x29, #32] + stp d12, d13, [x29, #48] + stp d14, d15, [x29, #64] +#ifndef __APPLE__ + adrp x2, L_kyber_aarch64_zetas_inv + add x2, x2, :lo12:L_kyber_aarch64_zetas_inv +#else + adrp x2, L_kyber_aarch64_zetas_inv@PAGE + add x2, x2, :lo12:L_kyber_aarch64_zetas_inv@PAGEOFF +#endif /* __APPLE__ */ +#ifndef __APPLE__ + adrp x3, L_kyber_aarch64_zetas_inv_qinv + add x3, x3, :lo12:L_kyber_aarch64_zetas_inv_qinv +#else + adrp x3, L_kyber_aarch64_zetas_inv_qinv@PAGE + add x3, x3, :lo12:L_kyber_aarch64_zetas_inv_qinv@PAGEOFF +#endif /* __APPLE__ */ +#ifndef __APPLE__ + adrp x4, L_kyber_aarch64_consts + add x4, x4, :lo12:L_kyber_aarch64_consts +#else + adrp x4, L_kyber_aarch64_consts@PAGE + add x4, x4, :lo12:L_kyber_aarch64_consts@PAGEOFF +#endif /* __APPLE__ */ + add x1, x0, #0x100 + ldr q8, [x4] + ldp q9, q10, [x0] + ldp q11, q12, [x0, #32] + ldp q13, q14, [x0, #64] + ldp q15, q16, [x0, #96] + ldp q17, q18, [x0, #128] + ldp q19, q20, [x0, #160] + ldp q21, q22, [x0, #192] + ldp q23, q24, [x0, #224] + mov v25.16b, v9.16b + trn1 v9.2d, v9.2d, v10.2d + trn2 v10.2d, v25.2d, v10.2d + mov v25.16b, v9.16b + trn1 v9.4s, v9.4s, v10.4s + trn2 v10.4s, v25.4s, v10.4s + mov v25.16b, v11.16b + trn1 v11.2d, v11.2d, v12.2d + trn2 v12.2d, v25.2d, v12.2d + mov v25.16b, v11.16b + trn1 v11.4s, v11.4s, v12.4s + trn2 v12.4s, v25.4s, v12.4s + mov v25.16b, v13.16b + trn1 v13.2d, v13.2d, v14.2d + trn2 v14.2d, v25.2d, v14.2d + mov v25.16b, v13.16b + trn1 v13.4s, v13.4s, v14.4s + trn2 v14.4s, v25.4s, v14.4s + mov v25.16b, v15.16b + trn1 v15.2d, v15.2d, v16.2d + trn2 v16.2d, v25.2d, v16.2d + mov v25.16b, v15.16b + trn1 v15.4s, v15.4s, v16.4s + trn2 v16.4s, v25.4s, v16.4s + mov v25.16b, v17.16b + trn1 v17.2d, v17.2d, v18.2d + trn2 v18.2d, v25.2d, v18.2d + mov v25.16b, v17.16b + trn1 v17.4s, v17.4s, v18.4s + trn2 v18.4s, v25.4s, v18.4s + mov v25.16b, v19.16b + trn1 v19.2d, v19.2d, v20.2d + trn2 v20.2d, v25.2d, v20.2d + mov v25.16b, v19.16b + trn1 v19.4s, v19.4s, v20.4s + trn2 v20.4s, v25.4s, v20.4s + mov v25.16b, v21.16b + trn1 v21.2d, v21.2d, v22.2d + trn2 v22.2d, v25.2d, v22.2d + mov v25.16b, v21.16b + 
trn1 v21.4s, v21.4s, v22.4s + trn2 v22.4s, v25.4s, v22.4s + mov v25.16b, v23.16b + trn1 v23.2d, v23.2d, v24.2d + trn2 v24.2d, v25.2d, v24.2d + mov v25.16b, v23.16b + trn1 v23.4s, v23.4s, v24.4s + trn2 v24.4s, v25.4s, v24.4s + ldr q0, [x2] + ldr q1, [x2, #16] + ldr q2, [x3] + ldr q3, [x3, #16] + sub v26.8h, v9.8h, v10.8h + sub v28.8h, v11.8h, v12.8h + add v9.8h, v9.8h, v10.8h + add v11.8h, v11.8h, v12.8h + mul v25.8h, v26.8h, v2.8h + mul v27.8h, v28.8h, v3.8h + sqrdmulh v10.8h, v26.8h, v0.8h + sqrdmulh v12.8h, v28.8h, v1.8h +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + sqrdmlsh v10.8h, v25.8h, v8.h[0] + sqrdmlsh v12.8h, v27.8h, v8.h[0] +#else + sqrdmulh v25.8h, v25.8h, v8.h[0] + sqrdmulh v27.8h, v27.8h, v8.h[0] + sub v10.8h, v10.8h, v25.8h + sub v12.8h, v12.8h, v27.8h +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + sshr v10.8h, v10.8h, #1 + sshr v12.8h, v12.8h, #1 + ldr q0, [x2, #32] + ldr q1, [x2, #48] + ldr q2, [x3, #32] + ldr q3, [x3, #48] + sub v26.8h, v13.8h, v14.8h + sub v28.8h, v15.8h, v16.8h + add v13.8h, v13.8h, v14.8h + add v15.8h, v15.8h, v16.8h + mul v25.8h, v26.8h, v2.8h + mul v27.8h, v28.8h, v3.8h + sqrdmulh v14.8h, v26.8h, v0.8h + sqrdmulh v16.8h, v28.8h, v1.8h +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + sqrdmlsh v14.8h, v25.8h, v8.h[0] + sqrdmlsh v16.8h, v27.8h, v8.h[0] +#else + sqrdmulh v25.8h, v25.8h, v8.h[0] + sqrdmulh v27.8h, v27.8h, v8.h[0] + sub v14.8h, v14.8h, v25.8h + sub v16.8h, v16.8h, v27.8h +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + sshr v14.8h, v14.8h, #1 + sshr v16.8h, v16.8h, #1 + ldr q0, [x2, #64] + ldr q1, [x2, #80] + ldr q2, [x3, #64] + ldr q3, [x3, #80] + sub v26.8h, v17.8h, v18.8h + sub v28.8h, v19.8h, v20.8h + add v17.8h, v17.8h, v18.8h + add v19.8h, v19.8h, v20.8h + mul v25.8h, v26.8h, v2.8h + mul v27.8h, v28.8h, v3.8h + sqrdmulh v18.8h, v26.8h, v0.8h + sqrdmulh v20.8h, v28.8h, v1.8h +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + sqrdmlsh v18.8h, v25.8h, v8.h[0] + sqrdmlsh v20.8h, v27.8h, v8.h[0] +#else + sqrdmulh v25.8h, v25.8h, v8.h[0] + sqrdmulh v27.8h, v27.8h, v8.h[0] + sub v18.8h, v18.8h, v25.8h + sub v20.8h, v20.8h, v27.8h +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + sshr v18.8h, v18.8h, #1 + sshr v20.8h, v20.8h, #1 + ldr q0, [x2, #96] + ldr q1, [x2, #112] + ldr q2, [x3, #96] + ldr q3, [x3, #112] + sub v26.8h, v21.8h, v22.8h + sub v28.8h, v23.8h, v24.8h + add v21.8h, v21.8h, v22.8h + add v23.8h, v23.8h, v24.8h + mul v25.8h, v26.8h, v2.8h + mul v27.8h, v28.8h, v3.8h + sqrdmulh v22.8h, v26.8h, v0.8h + sqrdmulh v24.8h, v28.8h, v1.8h +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + sqrdmlsh v22.8h, v25.8h, v8.h[0] + sqrdmlsh v24.8h, v27.8h, v8.h[0] +#else + sqrdmulh v25.8h, v25.8h, v8.h[0] + sqrdmulh v27.8h, v27.8h, v8.h[0] + sub v22.8h, v22.8h, v25.8h + sub v24.8h, v24.8h, v27.8h +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + sshr v22.8h, v22.8h, #1 + sshr v24.8h, v24.8h, #1 + ldr q0, [x2, #256] + ldr q1, [x2, #272] + ldr q2, [x3, #256] + ldr q3, [x3, #272] + mov v25.16b, v9.16b + mov v26.16b, v11.16b + trn1 v9.4s, v9.4s, v10.4s + trn1 v11.4s, v11.4s, v12.4s + trn2 v10.4s, v25.4s, v10.4s + trn2 v12.4s, v26.4s, v12.4s + sub v26.8h, v9.8h, v10.8h + sub v28.8h, v11.8h, v12.8h + add v9.8h, v9.8h, v10.8h + add v11.8h, v11.8h, v12.8h + mul v25.8h, v26.8h, v2.8h + mul v27.8h, v28.8h, v3.8h + sqrdmulh v10.8h, v26.8h, v0.8h + sqrdmulh v12.8h, v28.8h, v1.8h +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + sqrdmlsh v10.8h, v25.8h, v8.h[0] + sqrdmlsh v12.8h, v27.8h, v8.h[0] +#else + sqrdmulh v25.8h, v25.8h, v8.h[0] + sqrdmulh v27.8h, v27.8h, v8.h[0] + sub v10.8h, v10.8h, v25.8h + sub v12.8h, v12.8h, v27.8h 
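+/* Editorial note (comment only): each mul/sqrdmulh/sqrdmlsh group in this
+ * function multiplies eight 16-bit lanes by a zeta constant held in
+ * Montgomery form.  A scalar C model of one lane, following the
+ * WOLFSSL_AARCH64_NO_SQRMLSH fallback path compiled above (helper names are
+ * illustrative; q = 3329 and zqinv is the matching
+ * L_kyber_aarch64_zetas_inv_qinv entry, i.e. zeta * q^-1 mod 2^16):
+ *
+ *   static int16_t sqrdmulh16(int16_t a, int16_t b)
+ *   {
+ *       // (2*a*b + 2^15) >> 16, ignoring the saturation SQRDMULH applies
+ *       return (int16_t)(((int32_t)a * b + (1 << 14)) >> 15);
+ *   }
+ *
+ *   static int16_t mont_mul_zeta(int16_t a, int16_t zeta, int16_t zqinv)
+ *   {
+ *       int16_t lo = (int16_t)(a * zqinv);    // mul: low 16 bits
+ *       int16_t hi = sqrdmulh16(a, zeta);     // sqrdmulh with zeta
+ *       int16_t c  = sqrdmulh16(lo, 3329);    // sqrdmulh with q in v8.h[0]
+ *       return (int16_t)((hi - c) >> 1);      // sub + sshr #1
+ *   }
+ *
+ * The result is congruent to a * zeta * 2^-16 (mod q).  SQRDMLSH fuses the
+ * second rounding multiply and the subtract into one instruction, so the
+ * sqrdmulh+sub pair above is only used on cores without that instruction.
+ */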
+#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + sshr v10.8h, v10.8h, #1 + sshr v12.8h, v12.8h, #1 + ldr q0, [x2, #288] + ldr q1, [x2, #304] + ldr q2, [x3, #288] + ldr q3, [x3, #304] + mov v25.16b, v13.16b + mov v26.16b, v15.16b + trn1 v13.4s, v13.4s, v14.4s + trn1 v15.4s, v15.4s, v16.4s + trn2 v14.4s, v25.4s, v14.4s + trn2 v16.4s, v26.4s, v16.4s + sub v26.8h, v13.8h, v14.8h + sub v28.8h, v15.8h, v16.8h + add v13.8h, v13.8h, v14.8h + add v15.8h, v15.8h, v16.8h + mul v25.8h, v26.8h, v2.8h + mul v27.8h, v28.8h, v3.8h + sqrdmulh v14.8h, v26.8h, v0.8h + sqrdmulh v16.8h, v28.8h, v1.8h +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + sqrdmlsh v14.8h, v25.8h, v8.h[0] + sqrdmlsh v16.8h, v27.8h, v8.h[0] +#else + sqrdmulh v25.8h, v25.8h, v8.h[0] + sqrdmulh v27.8h, v27.8h, v8.h[0] + sub v14.8h, v14.8h, v25.8h + sub v16.8h, v16.8h, v27.8h +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + sshr v14.8h, v14.8h, #1 + sshr v16.8h, v16.8h, #1 + ldr q0, [x2, #320] + ldr q1, [x2, #336] + ldr q2, [x3, #320] + ldr q3, [x3, #336] + mov v25.16b, v17.16b + mov v26.16b, v19.16b + trn1 v17.4s, v17.4s, v18.4s + trn1 v19.4s, v19.4s, v20.4s + trn2 v18.4s, v25.4s, v18.4s + trn2 v20.4s, v26.4s, v20.4s + sub v26.8h, v17.8h, v18.8h + sub v28.8h, v19.8h, v20.8h + add v17.8h, v17.8h, v18.8h + add v19.8h, v19.8h, v20.8h + mul v25.8h, v26.8h, v2.8h + mul v27.8h, v28.8h, v3.8h + sqrdmulh v18.8h, v26.8h, v0.8h + sqrdmulh v20.8h, v28.8h, v1.8h +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + sqrdmlsh v18.8h, v25.8h, v8.h[0] + sqrdmlsh v20.8h, v27.8h, v8.h[0] +#else + sqrdmulh v25.8h, v25.8h, v8.h[0] + sqrdmulh v27.8h, v27.8h, v8.h[0] + sub v18.8h, v18.8h, v25.8h + sub v20.8h, v20.8h, v27.8h +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + sshr v18.8h, v18.8h, #1 + sshr v20.8h, v20.8h, #1 + ldr q0, [x2, #352] + ldr q1, [x2, #368] + ldr q2, [x3, #352] + ldr q3, [x3, #368] + mov v25.16b, v21.16b + mov v26.16b, v23.16b + trn1 v21.4s, v21.4s, v22.4s + trn1 v23.4s, v23.4s, v24.4s + trn2 v22.4s, v25.4s, v22.4s + trn2 v24.4s, v26.4s, v24.4s + sub v26.8h, v21.8h, v22.8h + sub v28.8h, v23.8h, v24.8h + add v21.8h, v21.8h, v22.8h + add v23.8h, v23.8h, v24.8h + mul v25.8h, v26.8h, v2.8h + mul v27.8h, v28.8h, v3.8h + sqrdmulh v22.8h, v26.8h, v0.8h + sqrdmulh v24.8h, v28.8h, v1.8h +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + sqrdmlsh v22.8h, v25.8h, v8.h[0] + sqrdmlsh v24.8h, v27.8h, v8.h[0] +#else + sqrdmulh v25.8h, v25.8h, v8.h[0] + sqrdmulh v27.8h, v27.8h, v8.h[0] + sub v22.8h, v22.8h, v25.8h + sub v24.8h, v24.8h, v27.8h +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + sshr v22.8h, v22.8h, #1 + sshr v24.8h, v24.8h, #1 + ldr q0, [x2, #512] + ldr q2, [x3, #512] + mov v25.16b, v9.16b + mov v26.16b, v11.16b + trn1 v9.2d, v9.2d, v10.2d + trn1 v11.2d, v11.2d, v12.2d + trn2 v10.2d, v25.2d, v10.2d + trn2 v12.2d, v26.2d, v12.2d + sub v26.8h, v9.8h, v10.8h + sub v28.8h, v11.8h, v12.8h + add v9.8h, v9.8h, v10.8h + add v11.8h, v11.8h, v12.8h + mul v25.8h, v26.8h, v2.h[0] + mul v27.8h, v28.8h, v2.h[1] + sqrdmulh v10.8h, v26.8h, v0.h[0] + sqrdmulh v12.8h, v28.8h, v0.h[1] +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + sqrdmlsh v10.8h, v25.8h, v8.h[0] + sqrdmlsh v12.8h, v27.8h, v8.h[0] +#else + sqrdmulh v25.8h, v25.8h, v8.h[0] + sqrdmulh v27.8h, v27.8h, v8.h[0] + sub v10.8h, v10.8h, v25.8h + sub v12.8h, v12.8h, v27.8h +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + sshr v10.8h, v10.8h, #1 + sshr v12.8h, v12.8h, #1 + mov v25.16b, v13.16b + mov v26.16b, v15.16b + trn1 v13.2d, v13.2d, v14.2d + trn1 v15.2d, v15.2d, v16.2d + trn2 v14.2d, v25.2d, v14.2d + trn2 v16.2d, v26.2d, v16.2d + sub v26.8h, v13.8h, v14.8h + sub v28.8h, 
v15.8h, v16.8h + add v13.8h, v13.8h, v14.8h + add v15.8h, v15.8h, v16.8h + mul v25.8h, v26.8h, v2.h[2] + mul v27.8h, v28.8h, v2.h[3] + sqrdmulh v14.8h, v26.8h, v0.h[2] + sqrdmulh v16.8h, v28.8h, v0.h[3] +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + sqrdmlsh v14.8h, v25.8h, v8.h[0] + sqrdmlsh v16.8h, v27.8h, v8.h[0] +#else + sqrdmulh v25.8h, v25.8h, v8.h[0] + sqrdmulh v27.8h, v27.8h, v8.h[0] + sub v14.8h, v14.8h, v25.8h + sub v16.8h, v16.8h, v27.8h +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + sshr v14.8h, v14.8h, #1 + sshr v16.8h, v16.8h, #1 + mov v25.16b, v17.16b + mov v26.16b, v19.16b + trn1 v17.2d, v17.2d, v18.2d + trn1 v19.2d, v19.2d, v20.2d + trn2 v18.2d, v25.2d, v18.2d + trn2 v20.2d, v26.2d, v20.2d + sub v26.8h, v17.8h, v18.8h + sub v28.8h, v19.8h, v20.8h + add v17.8h, v17.8h, v18.8h + add v19.8h, v19.8h, v20.8h + mul v25.8h, v26.8h, v2.h[4] + mul v27.8h, v28.8h, v2.h[5] + sqrdmulh v18.8h, v26.8h, v0.h[4] + sqrdmulh v20.8h, v28.8h, v0.h[5] +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + sqrdmlsh v18.8h, v25.8h, v8.h[0] + sqrdmlsh v20.8h, v27.8h, v8.h[0] +#else + sqrdmulh v25.8h, v25.8h, v8.h[0] + sqrdmulh v27.8h, v27.8h, v8.h[0] + sub v18.8h, v18.8h, v25.8h + sub v20.8h, v20.8h, v27.8h +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + sshr v18.8h, v18.8h, #1 + sshr v20.8h, v20.8h, #1 + mov v25.16b, v21.16b + mov v26.16b, v23.16b + trn1 v21.2d, v21.2d, v22.2d + trn1 v23.2d, v23.2d, v24.2d + trn2 v22.2d, v25.2d, v22.2d + trn2 v24.2d, v26.2d, v24.2d + sub v26.8h, v21.8h, v22.8h + sub v28.8h, v23.8h, v24.8h + add v21.8h, v21.8h, v22.8h + add v23.8h, v23.8h, v24.8h + mul v25.8h, v26.8h, v2.h[6] + mul v27.8h, v28.8h, v2.h[7] + sqrdmulh v22.8h, v26.8h, v0.h[6] + sqrdmulh v24.8h, v28.8h, v0.h[7] +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + sqrdmlsh v22.8h, v25.8h, v8.h[0] + sqrdmlsh v24.8h, v27.8h, v8.h[0] +#else + sqrdmulh v25.8h, v25.8h, v8.h[0] + sqrdmulh v27.8h, v27.8h, v8.h[0] + sub v22.8h, v22.8h, v25.8h + sub v24.8h, v24.8h, v27.8h +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + sshr v22.8h, v22.8h, #1 + sshr v24.8h, v24.8h, #1 + sqdmulh v25.8h, v9.8h, v8.h[2] + sqdmulh v26.8h, v11.8h, v8.h[2] + sshr v25.8h, v25.8h, #11 + sshr v26.8h, v26.8h, #11 + mls v9.8h, v25.8h, v8.h[0] + mls v11.8h, v26.8h, v8.h[0] + sqdmulh v25.8h, v13.8h, v8.h[2] + sqdmulh v26.8h, v15.8h, v8.h[2] + sshr v25.8h, v25.8h, #11 + sshr v26.8h, v26.8h, #11 + mls v13.8h, v25.8h, v8.h[0] + mls v15.8h, v26.8h, v8.h[0] + sqdmulh v25.8h, v17.8h, v8.h[2] + sqdmulh v26.8h, v19.8h, v8.h[2] + sshr v25.8h, v25.8h, #11 + sshr v26.8h, v26.8h, #11 + mls v17.8h, v25.8h, v8.h[0] + mls v19.8h, v26.8h, v8.h[0] + sqdmulh v25.8h, v21.8h, v8.h[2] + sqdmulh v26.8h, v23.8h, v8.h[2] + sshr v25.8h, v25.8h, #11 + sshr v26.8h, v26.8h, #11 + mls v21.8h, v25.8h, v8.h[0] + mls v23.8h, v26.8h, v8.h[0] + stp q9, q10, [x0] + stp q11, q12, [x0, #32] + stp q13, q14, [x0, #64] + stp q15, q16, [x0, #96] + stp q17, q18, [x0, #128] + stp q19, q20, [x0, #160] + stp q21, q22, [x0, #192] + stp q23, q24, [x0, #224] + ldp q9, q10, [x1] + ldp q11, q12, [x1, #32] + ldp q13, q14, [x1, #64] + ldp q15, q16, [x1, #96] + ldp q17, q18, [x1, #128] + ldp q19, q20, [x1, #160] + ldp q21, q22, [x1, #192] + ldp q23, q24, [x1, #224] + mov v25.16b, v9.16b + trn1 v9.2d, v9.2d, v10.2d + trn2 v10.2d, v25.2d, v10.2d + mov v25.16b, v9.16b + trn1 v9.4s, v9.4s, v10.4s + trn2 v10.4s, v25.4s, v10.4s + mov v25.16b, v11.16b + trn1 v11.2d, v11.2d, v12.2d + trn2 v12.2d, v25.2d, v12.2d + mov v25.16b, v11.16b + trn1 v11.4s, v11.4s, v12.4s + trn2 v12.4s, v25.4s, v12.4s + mov v25.16b, v13.16b + trn1 v13.2d, v13.2d, v14.2d + 
trn2 v14.2d, v25.2d, v14.2d + mov v25.16b, v13.16b + trn1 v13.4s, v13.4s, v14.4s + trn2 v14.4s, v25.4s, v14.4s + mov v25.16b, v15.16b + trn1 v15.2d, v15.2d, v16.2d + trn2 v16.2d, v25.2d, v16.2d + mov v25.16b, v15.16b + trn1 v15.4s, v15.4s, v16.4s + trn2 v16.4s, v25.4s, v16.4s + mov v25.16b, v17.16b + trn1 v17.2d, v17.2d, v18.2d + trn2 v18.2d, v25.2d, v18.2d + mov v25.16b, v17.16b + trn1 v17.4s, v17.4s, v18.4s + trn2 v18.4s, v25.4s, v18.4s + mov v25.16b, v19.16b + trn1 v19.2d, v19.2d, v20.2d + trn2 v20.2d, v25.2d, v20.2d + mov v25.16b, v19.16b + trn1 v19.4s, v19.4s, v20.4s + trn2 v20.4s, v25.4s, v20.4s + mov v25.16b, v21.16b + trn1 v21.2d, v21.2d, v22.2d + trn2 v22.2d, v25.2d, v22.2d + mov v25.16b, v21.16b + trn1 v21.4s, v21.4s, v22.4s + trn2 v22.4s, v25.4s, v22.4s + mov v25.16b, v23.16b + trn1 v23.2d, v23.2d, v24.2d + trn2 v24.2d, v25.2d, v24.2d + mov v25.16b, v23.16b + trn1 v23.4s, v23.4s, v24.4s + trn2 v24.4s, v25.4s, v24.4s + ldr q0, [x2, #128] + ldr q1, [x2, #144] + ldr q2, [x3, #128] + ldr q3, [x3, #144] + sub v26.8h, v9.8h, v10.8h + sub v28.8h, v11.8h, v12.8h + add v9.8h, v9.8h, v10.8h + add v11.8h, v11.8h, v12.8h + mul v25.8h, v26.8h, v2.8h + mul v27.8h, v28.8h, v3.8h + sqrdmulh v10.8h, v26.8h, v0.8h + sqrdmulh v12.8h, v28.8h, v1.8h +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + sqrdmlsh v10.8h, v25.8h, v8.h[0] + sqrdmlsh v12.8h, v27.8h, v8.h[0] +#else + sqrdmulh v25.8h, v25.8h, v8.h[0] + sqrdmulh v27.8h, v27.8h, v8.h[0] + sub v10.8h, v10.8h, v25.8h + sub v12.8h, v12.8h, v27.8h +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + sshr v10.8h, v10.8h, #1 + sshr v12.8h, v12.8h, #1 + ldr q0, [x2, #160] + ldr q1, [x2, #176] + ldr q2, [x3, #160] + ldr q3, [x3, #176] + sub v26.8h, v13.8h, v14.8h + sub v28.8h, v15.8h, v16.8h + add v13.8h, v13.8h, v14.8h + add v15.8h, v15.8h, v16.8h + mul v25.8h, v26.8h, v2.8h + mul v27.8h, v28.8h, v3.8h + sqrdmulh v14.8h, v26.8h, v0.8h + sqrdmulh v16.8h, v28.8h, v1.8h +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + sqrdmlsh v14.8h, v25.8h, v8.h[0] + sqrdmlsh v16.8h, v27.8h, v8.h[0] +#else + sqrdmulh v25.8h, v25.8h, v8.h[0] + sqrdmulh v27.8h, v27.8h, v8.h[0] + sub v14.8h, v14.8h, v25.8h + sub v16.8h, v16.8h, v27.8h +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + sshr v14.8h, v14.8h, #1 + sshr v16.8h, v16.8h, #1 + ldr q0, [x2, #192] + ldr q1, [x2, #208] + ldr q2, [x3, #192] + ldr q3, [x3, #208] + sub v26.8h, v17.8h, v18.8h + sub v28.8h, v19.8h, v20.8h + add v17.8h, v17.8h, v18.8h + add v19.8h, v19.8h, v20.8h + mul v25.8h, v26.8h, v2.8h + mul v27.8h, v28.8h, v3.8h + sqrdmulh v18.8h, v26.8h, v0.8h + sqrdmulh v20.8h, v28.8h, v1.8h +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + sqrdmlsh v18.8h, v25.8h, v8.h[0] + sqrdmlsh v20.8h, v27.8h, v8.h[0] +#else + sqrdmulh v25.8h, v25.8h, v8.h[0] + sqrdmulh v27.8h, v27.8h, v8.h[0] + sub v18.8h, v18.8h, v25.8h + sub v20.8h, v20.8h, v27.8h +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + sshr v18.8h, v18.8h, #1 + sshr v20.8h, v20.8h, #1 + ldr q0, [x2, #224] + ldr q1, [x2, #240] + ldr q2, [x3, #224] + ldr q3, [x3, #240] + sub v26.8h, v21.8h, v22.8h + sub v28.8h, v23.8h, v24.8h + add v21.8h, v21.8h, v22.8h + add v23.8h, v23.8h, v24.8h + mul v25.8h, v26.8h, v2.8h + mul v27.8h, v28.8h, v3.8h + sqrdmulh v22.8h, v26.8h, v0.8h + sqrdmulh v24.8h, v28.8h, v1.8h +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + sqrdmlsh v22.8h, v25.8h, v8.h[0] + sqrdmlsh v24.8h, v27.8h, v8.h[0] +#else + sqrdmulh v25.8h, v25.8h, v8.h[0] + sqrdmulh v27.8h, v27.8h, v8.h[0] + sub v22.8h, v22.8h, v25.8h + sub v24.8h, v24.8h, v27.8h +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + sshr v22.8h, v22.8h, #1 + sshr v24.8h, 
v24.8h, #1 + ldr q0, [x2, #384] + ldr q1, [x2, #400] + ldr q2, [x3, #384] + ldr q3, [x3, #400] + mov v25.16b, v9.16b + mov v26.16b, v11.16b + trn1 v9.4s, v9.4s, v10.4s + trn1 v11.4s, v11.4s, v12.4s + trn2 v10.4s, v25.4s, v10.4s + trn2 v12.4s, v26.4s, v12.4s + sub v26.8h, v9.8h, v10.8h + sub v28.8h, v11.8h, v12.8h + add v9.8h, v9.8h, v10.8h + add v11.8h, v11.8h, v12.8h + mul v25.8h, v26.8h, v2.8h + mul v27.8h, v28.8h, v3.8h + sqrdmulh v10.8h, v26.8h, v0.8h + sqrdmulh v12.8h, v28.8h, v1.8h +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + sqrdmlsh v10.8h, v25.8h, v8.h[0] + sqrdmlsh v12.8h, v27.8h, v8.h[0] +#else + sqrdmulh v25.8h, v25.8h, v8.h[0] + sqrdmulh v27.8h, v27.8h, v8.h[0] + sub v10.8h, v10.8h, v25.8h + sub v12.8h, v12.8h, v27.8h +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + sshr v10.8h, v10.8h, #1 + sshr v12.8h, v12.8h, #1 + ldr q0, [x2, #416] + ldr q1, [x2, #432] + ldr q2, [x3, #416] + ldr q3, [x3, #432] + mov v25.16b, v13.16b + mov v26.16b, v15.16b + trn1 v13.4s, v13.4s, v14.4s + trn1 v15.4s, v15.4s, v16.4s + trn2 v14.4s, v25.4s, v14.4s + trn2 v16.4s, v26.4s, v16.4s + sub v26.8h, v13.8h, v14.8h + sub v28.8h, v15.8h, v16.8h + add v13.8h, v13.8h, v14.8h + add v15.8h, v15.8h, v16.8h + mul v25.8h, v26.8h, v2.8h + mul v27.8h, v28.8h, v3.8h + sqrdmulh v14.8h, v26.8h, v0.8h + sqrdmulh v16.8h, v28.8h, v1.8h +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + sqrdmlsh v14.8h, v25.8h, v8.h[0] + sqrdmlsh v16.8h, v27.8h, v8.h[0] +#else + sqrdmulh v25.8h, v25.8h, v8.h[0] + sqrdmulh v27.8h, v27.8h, v8.h[0] + sub v14.8h, v14.8h, v25.8h + sub v16.8h, v16.8h, v27.8h +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + sshr v14.8h, v14.8h, #1 + sshr v16.8h, v16.8h, #1 + ldr q0, [x2, #448] + ldr q1, [x2, #464] + ldr q2, [x3, #448] + ldr q3, [x3, #464] + mov v25.16b, v17.16b + mov v26.16b, v19.16b + trn1 v17.4s, v17.4s, v18.4s + trn1 v19.4s, v19.4s, v20.4s + trn2 v18.4s, v25.4s, v18.4s + trn2 v20.4s, v26.4s, v20.4s + sub v26.8h, v17.8h, v18.8h + sub v28.8h, v19.8h, v20.8h + add v17.8h, v17.8h, v18.8h + add v19.8h, v19.8h, v20.8h + mul v25.8h, v26.8h, v2.8h + mul v27.8h, v28.8h, v3.8h + sqrdmulh v18.8h, v26.8h, v0.8h + sqrdmulh v20.8h, v28.8h, v1.8h +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + sqrdmlsh v18.8h, v25.8h, v8.h[0] + sqrdmlsh v20.8h, v27.8h, v8.h[0] +#else + sqrdmulh v25.8h, v25.8h, v8.h[0] + sqrdmulh v27.8h, v27.8h, v8.h[0] + sub v18.8h, v18.8h, v25.8h + sub v20.8h, v20.8h, v27.8h +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + sshr v18.8h, v18.8h, #1 + sshr v20.8h, v20.8h, #1 + ldr q0, [x2, #480] + ldr q1, [x2, #496] + ldr q2, [x3, #480] + ldr q3, [x3, #496] + mov v25.16b, v21.16b + mov v26.16b, v23.16b + trn1 v21.4s, v21.4s, v22.4s + trn1 v23.4s, v23.4s, v24.4s + trn2 v22.4s, v25.4s, v22.4s + trn2 v24.4s, v26.4s, v24.4s + sub v26.8h, v21.8h, v22.8h + sub v28.8h, v23.8h, v24.8h + add v21.8h, v21.8h, v22.8h + add v23.8h, v23.8h, v24.8h + mul v25.8h, v26.8h, v2.8h + mul v27.8h, v28.8h, v3.8h + sqrdmulh v22.8h, v26.8h, v0.8h + sqrdmulh v24.8h, v28.8h, v1.8h +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + sqrdmlsh v22.8h, v25.8h, v8.h[0] + sqrdmlsh v24.8h, v27.8h, v8.h[0] +#else + sqrdmulh v25.8h, v25.8h, v8.h[0] + sqrdmulh v27.8h, v27.8h, v8.h[0] + sub v22.8h, v22.8h, v25.8h + sub v24.8h, v24.8h, v27.8h +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + sshr v22.8h, v22.8h, #1 + sshr v24.8h, v24.8h, #1 + ldr q0, [x2, #528] + ldr q2, [x3, #528] + mov v25.16b, v9.16b + mov v26.16b, v11.16b + trn1 v9.2d, v9.2d, v10.2d + trn1 v11.2d, v11.2d, v12.2d + trn2 v10.2d, v25.2d, v10.2d + trn2 v12.2d, v26.2d, v12.2d + sub v26.8h, v9.8h, v10.8h + sub v28.8h, v11.8h, 
v12.8h + add v9.8h, v9.8h, v10.8h + add v11.8h, v11.8h, v12.8h + mul v25.8h, v26.8h, v2.h[0] + mul v27.8h, v28.8h, v2.h[1] + sqrdmulh v10.8h, v26.8h, v0.h[0] + sqrdmulh v12.8h, v28.8h, v0.h[1] +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + sqrdmlsh v10.8h, v25.8h, v8.h[0] + sqrdmlsh v12.8h, v27.8h, v8.h[0] +#else + sqrdmulh v25.8h, v25.8h, v8.h[0] + sqrdmulh v27.8h, v27.8h, v8.h[0] + sub v10.8h, v10.8h, v25.8h + sub v12.8h, v12.8h, v27.8h +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + sshr v10.8h, v10.8h, #1 + sshr v12.8h, v12.8h, #1 + mov v25.16b, v13.16b + mov v26.16b, v15.16b + trn1 v13.2d, v13.2d, v14.2d + trn1 v15.2d, v15.2d, v16.2d + trn2 v14.2d, v25.2d, v14.2d + trn2 v16.2d, v26.2d, v16.2d + sub v26.8h, v13.8h, v14.8h + sub v28.8h, v15.8h, v16.8h + add v13.8h, v13.8h, v14.8h + add v15.8h, v15.8h, v16.8h + mul v25.8h, v26.8h, v2.h[2] + mul v27.8h, v28.8h, v2.h[3] + sqrdmulh v14.8h, v26.8h, v0.h[2] + sqrdmulh v16.8h, v28.8h, v0.h[3] +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + sqrdmlsh v14.8h, v25.8h, v8.h[0] + sqrdmlsh v16.8h, v27.8h, v8.h[0] +#else + sqrdmulh v25.8h, v25.8h, v8.h[0] + sqrdmulh v27.8h, v27.8h, v8.h[0] + sub v14.8h, v14.8h, v25.8h + sub v16.8h, v16.8h, v27.8h +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + sshr v14.8h, v14.8h, #1 + sshr v16.8h, v16.8h, #1 + mov v25.16b, v17.16b + mov v26.16b, v19.16b + trn1 v17.2d, v17.2d, v18.2d + trn1 v19.2d, v19.2d, v20.2d + trn2 v18.2d, v25.2d, v18.2d + trn2 v20.2d, v26.2d, v20.2d + sub v26.8h, v17.8h, v18.8h + sub v28.8h, v19.8h, v20.8h + add v17.8h, v17.8h, v18.8h + add v19.8h, v19.8h, v20.8h + mul v25.8h, v26.8h, v2.h[4] + mul v27.8h, v28.8h, v2.h[5] + sqrdmulh v18.8h, v26.8h, v0.h[4] + sqrdmulh v20.8h, v28.8h, v0.h[5] +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + sqrdmlsh v18.8h, v25.8h, v8.h[0] + sqrdmlsh v20.8h, v27.8h, v8.h[0] +#else + sqrdmulh v25.8h, v25.8h, v8.h[0] + sqrdmulh v27.8h, v27.8h, v8.h[0] + sub v18.8h, v18.8h, v25.8h + sub v20.8h, v20.8h, v27.8h +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + sshr v18.8h, v18.8h, #1 + sshr v20.8h, v20.8h, #1 + mov v25.16b, v21.16b + mov v26.16b, v23.16b + trn1 v21.2d, v21.2d, v22.2d + trn1 v23.2d, v23.2d, v24.2d + trn2 v22.2d, v25.2d, v22.2d + trn2 v24.2d, v26.2d, v24.2d + sub v26.8h, v21.8h, v22.8h + sub v28.8h, v23.8h, v24.8h + add v21.8h, v21.8h, v22.8h + add v23.8h, v23.8h, v24.8h + mul v25.8h, v26.8h, v2.h[6] + mul v27.8h, v28.8h, v2.h[7] + sqrdmulh v22.8h, v26.8h, v0.h[6] + sqrdmulh v24.8h, v28.8h, v0.h[7] +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + sqrdmlsh v22.8h, v25.8h, v8.h[0] + sqrdmlsh v24.8h, v27.8h, v8.h[0] +#else + sqrdmulh v25.8h, v25.8h, v8.h[0] + sqrdmulh v27.8h, v27.8h, v8.h[0] + sub v22.8h, v22.8h, v25.8h + sub v24.8h, v24.8h, v27.8h +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + sshr v22.8h, v22.8h, #1 + sshr v24.8h, v24.8h, #1 + sqdmulh v25.8h, v9.8h, v8.h[2] + sqdmulh v26.8h, v11.8h, v8.h[2] + sshr v25.8h, v25.8h, #11 + sshr v26.8h, v26.8h, #11 + mls v9.8h, v25.8h, v8.h[0] + mls v11.8h, v26.8h, v8.h[0] + sqdmulh v25.8h, v13.8h, v8.h[2] + sqdmulh v26.8h, v15.8h, v8.h[2] + sshr v25.8h, v25.8h, #11 + sshr v26.8h, v26.8h, #11 + mls v13.8h, v25.8h, v8.h[0] + mls v15.8h, v26.8h, v8.h[0] + sqdmulh v25.8h, v17.8h, v8.h[2] + sqdmulh v26.8h, v19.8h, v8.h[2] + sshr v25.8h, v25.8h, #11 + sshr v26.8h, v26.8h, #11 + mls v17.8h, v25.8h, v8.h[0] + mls v19.8h, v26.8h, v8.h[0] + sqdmulh v25.8h, v21.8h, v8.h[2] + sqdmulh v26.8h, v23.8h, v8.h[2] + sshr v25.8h, v25.8h, #11 + sshr v26.8h, v26.8h, #11 + mls v21.8h, v25.8h, v8.h[0] + mls v23.8h, v26.8h, v8.h[0] + stp q9, q10, [x1] + stp q11, q12, [x1, #32] + stp q13, 
q14, [x1, #64] + stp q15, q16, [x1, #96] + stp q17, q18, [x1, #128] + stp q19, q20, [x1, #160] + stp q21, q22, [x1, #192] + stp q23, q24, [x1, #224] + ldr q4, [x2, #544] + ldr q5, [x2, #560] + ldr q6, [x3, #544] + ldr q7, [x3, #560] + ldr q9, [x0] + ldr q10, [x0, #32] + ldr q11, [x0, #64] + ldr q12, [x0, #96] + ldr q13, [x0, #128] + ldr q14, [x0, #160] + ldr q15, [x0, #192] + ldr q16, [x0, #224] + ldr q17, [x1] + ldr q18, [x1, #32] + ldr q19, [x1, #64] + ldr q20, [x1, #96] + ldr q21, [x1, #128] + ldr q22, [x1, #160] + ldr q23, [x1, #192] + ldr q24, [x1, #224] + sub v26.8h, v9.8h, v10.8h + sub v28.8h, v11.8h, v12.8h + add v9.8h, v9.8h, v10.8h + add v11.8h, v11.8h, v12.8h + mul v25.8h, v26.8h, v6.h[0] + mul v27.8h, v28.8h, v6.h[1] + sqrdmulh v10.8h, v26.8h, v4.h[0] + sqrdmulh v12.8h, v28.8h, v4.h[1] +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + sqrdmlsh v10.8h, v25.8h, v8.h[0] + sqrdmlsh v12.8h, v27.8h, v8.h[0] +#else + sqrdmulh v25.8h, v25.8h, v8.h[0] + sqrdmulh v27.8h, v27.8h, v8.h[0] + sub v10.8h, v10.8h, v25.8h + sub v12.8h, v12.8h, v27.8h +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + sshr v10.8h, v10.8h, #1 + sshr v12.8h, v12.8h, #1 + sub v26.8h, v13.8h, v14.8h + sub v28.8h, v15.8h, v16.8h + add v13.8h, v13.8h, v14.8h + add v15.8h, v15.8h, v16.8h + mul v25.8h, v26.8h, v6.h[2] + mul v27.8h, v28.8h, v6.h[3] + sqrdmulh v14.8h, v26.8h, v4.h[2] + sqrdmulh v16.8h, v28.8h, v4.h[3] +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + sqrdmlsh v14.8h, v25.8h, v8.h[0] + sqrdmlsh v16.8h, v27.8h, v8.h[0] +#else + sqrdmulh v25.8h, v25.8h, v8.h[0] + sqrdmulh v27.8h, v27.8h, v8.h[0] + sub v14.8h, v14.8h, v25.8h + sub v16.8h, v16.8h, v27.8h +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + sshr v14.8h, v14.8h, #1 + sshr v16.8h, v16.8h, #1 + sub v26.8h, v17.8h, v18.8h + sub v28.8h, v19.8h, v20.8h + add v17.8h, v17.8h, v18.8h + add v19.8h, v19.8h, v20.8h + mul v25.8h, v26.8h, v6.h[4] + mul v27.8h, v28.8h, v6.h[5] + sqrdmulh v18.8h, v26.8h, v4.h[4] + sqrdmulh v20.8h, v28.8h, v4.h[5] +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + sqrdmlsh v18.8h, v25.8h, v8.h[0] + sqrdmlsh v20.8h, v27.8h, v8.h[0] +#else + sqrdmulh v25.8h, v25.8h, v8.h[0] + sqrdmulh v27.8h, v27.8h, v8.h[0] + sub v18.8h, v18.8h, v25.8h + sub v20.8h, v20.8h, v27.8h +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + sshr v18.8h, v18.8h, #1 + sshr v20.8h, v20.8h, #1 + sub v26.8h, v21.8h, v22.8h + sub v28.8h, v23.8h, v24.8h + add v21.8h, v21.8h, v22.8h + add v23.8h, v23.8h, v24.8h + mul v25.8h, v26.8h, v6.h[6] + mul v27.8h, v28.8h, v6.h[7] + sqrdmulh v22.8h, v26.8h, v4.h[6] + sqrdmulh v24.8h, v28.8h, v4.h[7] +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + sqrdmlsh v22.8h, v25.8h, v8.h[0] + sqrdmlsh v24.8h, v27.8h, v8.h[0] +#else + sqrdmulh v25.8h, v25.8h, v8.h[0] + sqrdmulh v27.8h, v27.8h, v8.h[0] + sub v22.8h, v22.8h, v25.8h + sub v24.8h, v24.8h, v27.8h +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + sshr v22.8h, v22.8h, #1 + sshr v24.8h, v24.8h, #1 + sub v26.8h, v9.8h, v11.8h + sub v28.8h, v10.8h, v12.8h + add v9.8h, v9.8h, v11.8h + add v10.8h, v10.8h, v12.8h + mul v25.8h, v26.8h, v7.h[0] + mul v27.8h, v28.8h, v7.h[0] + sqrdmulh v11.8h, v26.8h, v5.h[0] + sqrdmulh v12.8h, v28.8h, v5.h[0] +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + sqrdmlsh v11.8h, v25.8h, v8.h[0] + sqrdmlsh v12.8h, v27.8h, v8.h[0] +#else + sqrdmulh v25.8h, v25.8h, v8.h[0] + sqrdmulh v27.8h, v27.8h, v8.h[0] + sub v11.8h, v11.8h, v25.8h + sub v12.8h, v12.8h, v27.8h +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + sshr v11.8h, v11.8h, #1 + sshr v12.8h, v12.8h, #1 + sub v26.8h, v13.8h, v15.8h + sub v28.8h, v14.8h, v16.8h + add v13.8h, v13.8h, v15.8h + 
add v14.8h, v14.8h, v16.8h + mul v25.8h, v26.8h, v7.h[1] + mul v27.8h, v28.8h, v7.h[1] + sqrdmulh v15.8h, v26.8h, v5.h[1] + sqrdmulh v16.8h, v28.8h, v5.h[1] +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + sqrdmlsh v15.8h, v25.8h, v8.h[0] + sqrdmlsh v16.8h, v27.8h, v8.h[0] +#else + sqrdmulh v25.8h, v25.8h, v8.h[0] + sqrdmulh v27.8h, v27.8h, v8.h[0] + sub v15.8h, v15.8h, v25.8h + sub v16.8h, v16.8h, v27.8h +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + sshr v15.8h, v15.8h, #1 + sshr v16.8h, v16.8h, #1 + sub v26.8h, v17.8h, v19.8h + sub v28.8h, v18.8h, v20.8h + add v17.8h, v17.8h, v19.8h + add v18.8h, v18.8h, v20.8h + mul v25.8h, v26.8h, v7.h[2] + mul v27.8h, v28.8h, v7.h[2] + sqrdmulh v19.8h, v26.8h, v5.h[2] + sqrdmulh v20.8h, v28.8h, v5.h[2] +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + sqrdmlsh v19.8h, v25.8h, v8.h[0] + sqrdmlsh v20.8h, v27.8h, v8.h[0] +#else + sqrdmulh v25.8h, v25.8h, v8.h[0] + sqrdmulh v27.8h, v27.8h, v8.h[0] + sub v19.8h, v19.8h, v25.8h + sub v20.8h, v20.8h, v27.8h +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + sshr v19.8h, v19.8h, #1 + sshr v20.8h, v20.8h, #1 + sub v26.8h, v21.8h, v23.8h + sub v28.8h, v22.8h, v24.8h + add v21.8h, v21.8h, v23.8h + add v22.8h, v22.8h, v24.8h + mul v25.8h, v26.8h, v7.h[3] + mul v27.8h, v28.8h, v7.h[3] + sqrdmulh v23.8h, v26.8h, v5.h[3] + sqrdmulh v24.8h, v28.8h, v5.h[3] +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + sqrdmlsh v23.8h, v25.8h, v8.h[0] + sqrdmlsh v24.8h, v27.8h, v8.h[0] +#else + sqrdmulh v25.8h, v25.8h, v8.h[0] + sqrdmulh v27.8h, v27.8h, v8.h[0] + sub v23.8h, v23.8h, v25.8h + sub v24.8h, v24.8h, v27.8h +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + sshr v23.8h, v23.8h, #1 + sshr v24.8h, v24.8h, #1 + sub v26.8h, v9.8h, v13.8h + sub v28.8h, v10.8h, v14.8h + add v9.8h, v9.8h, v13.8h + add v10.8h, v10.8h, v14.8h + mul v25.8h, v26.8h, v7.h[4] + mul v27.8h, v28.8h, v7.h[4] + sqrdmulh v13.8h, v26.8h, v5.h[4] + sqrdmulh v14.8h, v28.8h, v5.h[4] +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + sqrdmlsh v13.8h, v25.8h, v8.h[0] + sqrdmlsh v14.8h, v27.8h, v8.h[0] +#else + sqrdmulh v25.8h, v25.8h, v8.h[0] + sqrdmulh v27.8h, v27.8h, v8.h[0] + sub v13.8h, v13.8h, v25.8h + sub v14.8h, v14.8h, v27.8h +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + sshr v13.8h, v13.8h, #1 + sshr v14.8h, v14.8h, #1 + sub v26.8h, v11.8h, v15.8h + sub v28.8h, v12.8h, v16.8h + add v11.8h, v11.8h, v15.8h + add v12.8h, v12.8h, v16.8h + mul v25.8h, v26.8h, v7.h[4] + mul v27.8h, v28.8h, v7.h[4] + sqrdmulh v15.8h, v26.8h, v5.h[4] + sqrdmulh v16.8h, v28.8h, v5.h[4] +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + sqrdmlsh v15.8h, v25.8h, v8.h[0] + sqrdmlsh v16.8h, v27.8h, v8.h[0] +#else + sqrdmulh v25.8h, v25.8h, v8.h[0] + sqrdmulh v27.8h, v27.8h, v8.h[0] + sub v15.8h, v15.8h, v25.8h + sub v16.8h, v16.8h, v27.8h +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + sshr v15.8h, v15.8h, #1 + sshr v16.8h, v16.8h, #1 + sub v26.8h, v17.8h, v21.8h + sub v28.8h, v18.8h, v22.8h + add v17.8h, v17.8h, v21.8h + add v18.8h, v18.8h, v22.8h + mul v25.8h, v26.8h, v7.h[5] + mul v27.8h, v28.8h, v7.h[5] + sqrdmulh v21.8h, v26.8h, v5.h[5] + sqrdmulh v22.8h, v28.8h, v5.h[5] +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + sqrdmlsh v21.8h, v25.8h, v8.h[0] + sqrdmlsh v22.8h, v27.8h, v8.h[0] +#else + sqrdmulh v25.8h, v25.8h, v8.h[0] + sqrdmulh v27.8h, v27.8h, v8.h[0] + sub v21.8h, v21.8h, v25.8h + sub v22.8h, v22.8h, v27.8h +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + sshr v21.8h, v21.8h, #1 + sshr v22.8h, v22.8h, #1 + sub v26.8h, v19.8h, v23.8h + sub v28.8h, v20.8h, v24.8h + add v19.8h, v19.8h, v23.8h + add v20.8h, v20.8h, v24.8h + mul v25.8h, v26.8h, v7.h[5] + mul v27.8h, 
v28.8h, v7.h[5] + sqrdmulh v23.8h, v26.8h, v5.h[5] + sqrdmulh v24.8h, v28.8h, v5.h[5] +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + sqrdmlsh v23.8h, v25.8h, v8.h[0] + sqrdmlsh v24.8h, v27.8h, v8.h[0] +#else + sqrdmulh v25.8h, v25.8h, v8.h[0] + sqrdmulh v27.8h, v27.8h, v8.h[0] + sub v23.8h, v23.8h, v25.8h + sub v24.8h, v24.8h, v27.8h +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + sshr v23.8h, v23.8h, #1 + sshr v24.8h, v24.8h, #1 + sqdmulh v25.8h, v9.8h, v8.h[2] + sqdmulh v26.8h, v10.8h, v8.h[2] + sshr v25.8h, v25.8h, #11 + sshr v26.8h, v26.8h, #11 + mls v9.8h, v25.8h, v8.h[0] + mls v10.8h, v26.8h, v8.h[0] + sqdmulh v25.8h, v11.8h, v8.h[2] + sqdmulh v26.8h, v12.8h, v8.h[2] + sshr v25.8h, v25.8h, #11 + sshr v26.8h, v26.8h, #11 + mls v11.8h, v25.8h, v8.h[0] + mls v12.8h, v26.8h, v8.h[0] + sqdmulh v25.8h, v17.8h, v8.h[2] + sqdmulh v26.8h, v18.8h, v8.h[2] + sshr v25.8h, v25.8h, #11 + sshr v26.8h, v26.8h, #11 + mls v17.8h, v25.8h, v8.h[0] + mls v18.8h, v26.8h, v8.h[0] + sqdmulh v25.8h, v19.8h, v8.h[2] + sqdmulh v26.8h, v20.8h, v8.h[2] + sshr v25.8h, v25.8h, #11 + sshr v26.8h, v26.8h, #11 + mls v19.8h, v25.8h, v8.h[0] + mls v20.8h, v26.8h, v8.h[0] + sub v26.8h, v9.8h, v17.8h + sub v28.8h, v10.8h, v18.8h + add v9.8h, v9.8h, v17.8h + add v10.8h, v10.8h, v18.8h + mul v25.8h, v26.8h, v7.h[6] + mul v27.8h, v28.8h, v7.h[6] + sqrdmulh v17.8h, v26.8h, v5.h[6] + sqrdmulh v18.8h, v28.8h, v5.h[6] +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + sqrdmlsh v17.8h, v25.8h, v8.h[0] + sqrdmlsh v18.8h, v27.8h, v8.h[0] +#else + sqrdmulh v25.8h, v25.8h, v8.h[0] + sqrdmulh v27.8h, v27.8h, v8.h[0] + sub v17.8h, v17.8h, v25.8h + sub v18.8h, v18.8h, v27.8h +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + sshr v17.8h, v17.8h, #1 + sshr v18.8h, v18.8h, #1 + sub v26.8h, v11.8h, v19.8h + sub v28.8h, v12.8h, v20.8h + add v11.8h, v11.8h, v19.8h + add v12.8h, v12.8h, v20.8h + mul v25.8h, v26.8h, v7.h[6] + mul v27.8h, v28.8h, v7.h[6] + sqrdmulh v19.8h, v26.8h, v5.h[6] + sqrdmulh v20.8h, v28.8h, v5.h[6] +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + sqrdmlsh v19.8h, v25.8h, v8.h[0] + sqrdmlsh v20.8h, v27.8h, v8.h[0] +#else + sqrdmulh v25.8h, v25.8h, v8.h[0] + sqrdmulh v27.8h, v27.8h, v8.h[0] + sub v19.8h, v19.8h, v25.8h + sub v20.8h, v20.8h, v27.8h +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + sshr v19.8h, v19.8h, #1 + sshr v20.8h, v20.8h, #1 + sub v26.8h, v13.8h, v21.8h + sub v28.8h, v14.8h, v22.8h + add v13.8h, v13.8h, v21.8h + add v14.8h, v14.8h, v22.8h + mul v25.8h, v26.8h, v7.h[6] + mul v27.8h, v28.8h, v7.h[6] + sqrdmulh v21.8h, v26.8h, v5.h[6] + sqrdmulh v22.8h, v28.8h, v5.h[6] +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + sqrdmlsh v21.8h, v25.8h, v8.h[0] + sqrdmlsh v22.8h, v27.8h, v8.h[0] +#else + sqrdmulh v25.8h, v25.8h, v8.h[0] + sqrdmulh v27.8h, v27.8h, v8.h[0] + sub v21.8h, v21.8h, v25.8h + sub v22.8h, v22.8h, v27.8h +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + sshr v21.8h, v21.8h, #1 + sshr v22.8h, v22.8h, #1 + sub v26.8h, v15.8h, v23.8h + sub v28.8h, v16.8h, v24.8h + add v15.8h, v15.8h, v23.8h + add v16.8h, v16.8h, v24.8h + mul v25.8h, v26.8h, v7.h[6] + mul v27.8h, v28.8h, v7.h[6] + sqrdmulh v23.8h, v26.8h, v5.h[6] + sqrdmulh v24.8h, v28.8h, v5.h[6] +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + sqrdmlsh v23.8h, v25.8h, v8.h[0] + sqrdmlsh v24.8h, v27.8h, v8.h[0] +#else + sqrdmulh v25.8h, v25.8h, v8.h[0] + sqrdmulh v27.8h, v27.8h, v8.h[0] + sub v23.8h, v23.8h, v25.8h + sub v24.8h, v24.8h, v27.8h +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + sshr v23.8h, v23.8h, #1 + sshr v24.8h, v24.8h, #1 + mul v25.8h, v9.8h, v7.h[7] + mul v26.8h, v10.8h, v7.h[7] + sqrdmulh v9.8h, v9.8h, 
v5.h[7] + sqrdmulh v10.8h, v10.8h, v5.h[7] +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + sqrdmlsh v9.8h, v25.8h, v8.h[0] + sqrdmlsh v10.8h, v26.8h, v8.h[0] +#else + sqrdmulh v25.8h, v25.8h, v8.h[0] + sqrdmulh v26.8h, v26.8h, v8.h[0] + sub v9.8h, v9.8h, v25.8h + sub v10.8h, v10.8h, v26.8h +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + sshr v9.8h, v9.8h, #1 + sshr v10.8h, v10.8h, #1 + mul v25.8h, v11.8h, v7.h[7] + mul v26.8h, v12.8h, v7.h[7] + sqrdmulh v11.8h, v11.8h, v5.h[7] + sqrdmulh v12.8h, v12.8h, v5.h[7] +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + sqrdmlsh v11.8h, v25.8h, v8.h[0] + sqrdmlsh v12.8h, v26.8h, v8.h[0] +#else + sqrdmulh v25.8h, v25.8h, v8.h[0] + sqrdmulh v26.8h, v26.8h, v8.h[0] + sub v11.8h, v11.8h, v25.8h + sub v12.8h, v12.8h, v26.8h +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + sshr v11.8h, v11.8h, #1 + sshr v12.8h, v12.8h, #1 + mul v25.8h, v13.8h, v7.h[7] + mul v26.8h, v14.8h, v7.h[7] + sqrdmulh v13.8h, v13.8h, v5.h[7] + sqrdmulh v14.8h, v14.8h, v5.h[7] +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + sqrdmlsh v13.8h, v25.8h, v8.h[0] + sqrdmlsh v14.8h, v26.8h, v8.h[0] +#else + sqrdmulh v25.8h, v25.8h, v8.h[0] + sqrdmulh v26.8h, v26.8h, v8.h[0] + sub v13.8h, v13.8h, v25.8h + sub v14.8h, v14.8h, v26.8h +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + sshr v13.8h, v13.8h, #1 + sshr v14.8h, v14.8h, #1 + mul v25.8h, v15.8h, v7.h[7] + mul v26.8h, v16.8h, v7.h[7] + sqrdmulh v15.8h, v15.8h, v5.h[7] + sqrdmulh v16.8h, v16.8h, v5.h[7] +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + sqrdmlsh v15.8h, v25.8h, v8.h[0] + sqrdmlsh v16.8h, v26.8h, v8.h[0] +#else + sqrdmulh v25.8h, v25.8h, v8.h[0] + sqrdmulh v26.8h, v26.8h, v8.h[0] + sub v15.8h, v15.8h, v25.8h + sub v16.8h, v16.8h, v26.8h +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + sshr v15.8h, v15.8h, #1 + sshr v16.8h, v16.8h, #1 + mul v25.8h, v17.8h, v7.h[7] + mul v26.8h, v18.8h, v7.h[7] + sqrdmulh v17.8h, v17.8h, v5.h[7] + sqrdmulh v18.8h, v18.8h, v5.h[7] +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + sqrdmlsh v17.8h, v25.8h, v8.h[0] + sqrdmlsh v18.8h, v26.8h, v8.h[0] +#else + sqrdmulh v25.8h, v25.8h, v8.h[0] + sqrdmulh v26.8h, v26.8h, v8.h[0] + sub v17.8h, v17.8h, v25.8h + sub v18.8h, v18.8h, v26.8h +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + sshr v17.8h, v17.8h, #1 + sshr v18.8h, v18.8h, #1 + mul v25.8h, v19.8h, v7.h[7] + mul v26.8h, v20.8h, v7.h[7] + sqrdmulh v19.8h, v19.8h, v5.h[7] + sqrdmulh v20.8h, v20.8h, v5.h[7] +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + sqrdmlsh v19.8h, v25.8h, v8.h[0] + sqrdmlsh v20.8h, v26.8h, v8.h[0] +#else + sqrdmulh v25.8h, v25.8h, v8.h[0] + sqrdmulh v26.8h, v26.8h, v8.h[0] + sub v19.8h, v19.8h, v25.8h + sub v20.8h, v20.8h, v26.8h +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + sshr v19.8h, v19.8h, #1 + sshr v20.8h, v20.8h, #1 + mul v25.8h, v21.8h, v7.h[7] + mul v26.8h, v22.8h, v7.h[7] + sqrdmulh v21.8h, v21.8h, v5.h[7] + sqrdmulh v22.8h, v22.8h, v5.h[7] +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + sqrdmlsh v21.8h, v25.8h, v8.h[0] + sqrdmlsh v22.8h, v26.8h, v8.h[0] +#else + sqrdmulh v25.8h, v25.8h, v8.h[0] + sqrdmulh v26.8h, v26.8h, v8.h[0] + sub v21.8h, v21.8h, v25.8h + sub v22.8h, v22.8h, v26.8h +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + sshr v21.8h, v21.8h, #1 + sshr v22.8h, v22.8h, #1 + mul v25.8h, v23.8h, v7.h[7] + mul v26.8h, v24.8h, v7.h[7] + sqrdmulh v23.8h, v23.8h, v5.h[7] + sqrdmulh v24.8h, v24.8h, v5.h[7] +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + sqrdmlsh v23.8h, v25.8h, v8.h[0] + sqrdmlsh v24.8h, v26.8h, v8.h[0] +#else + sqrdmulh v25.8h, v25.8h, v8.h[0] + sqrdmulh v26.8h, v26.8h, v8.h[0] + sub v23.8h, v23.8h, v25.8h + sub v24.8h, v24.8h, v26.8h +#endif /* 
!WOLFSSL_AARCH64_NO_SQRMLSH */ + sshr v23.8h, v23.8h, #1 + sshr v24.8h, v24.8h, #1 + str q9, [x0] + str q10, [x0, #32] + str q11, [x0, #64] + str q12, [x0, #96] + str q13, [x0, #128] + str q14, [x0, #160] + str q15, [x0, #192] + str q16, [x0, #224] + str q17, [x1] + str q18, [x1, #32] + str q19, [x1, #64] + str q20, [x1, #96] + str q21, [x1, #128] + str q22, [x1, #160] + str q23, [x1, #192] + str q24, [x1, #224] + ldr q9, [x0, #16] + ldr q10, [x0, #48] + ldr q11, [x0, #80] + ldr q12, [x0, #112] + ldr q13, [x0, #144] + ldr q14, [x0, #176] + ldr q15, [x0, #208] + ldr q16, [x0, #240] + ldr q17, [x1, #16] + ldr q18, [x1, #48] + ldr q19, [x1, #80] + ldr q20, [x1, #112] + ldr q21, [x1, #144] + ldr q22, [x1, #176] + ldr q23, [x1, #208] + ldr q24, [x1, #240] + sub v26.8h, v9.8h, v10.8h + sub v28.8h, v11.8h, v12.8h + add v9.8h, v9.8h, v10.8h + add v11.8h, v11.8h, v12.8h + mul v25.8h, v26.8h, v6.h[0] + mul v27.8h, v28.8h, v6.h[1] + sqrdmulh v10.8h, v26.8h, v4.h[0] + sqrdmulh v12.8h, v28.8h, v4.h[1] +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + sqrdmlsh v10.8h, v25.8h, v8.h[0] + sqrdmlsh v12.8h, v27.8h, v8.h[0] +#else + sqrdmulh v25.8h, v25.8h, v8.h[0] + sqrdmulh v27.8h, v27.8h, v8.h[0] + sub v10.8h, v10.8h, v25.8h + sub v12.8h, v12.8h, v27.8h +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + sshr v10.8h, v10.8h, #1 + sshr v12.8h, v12.8h, #1 + sub v26.8h, v13.8h, v14.8h + sub v28.8h, v15.8h, v16.8h + add v13.8h, v13.8h, v14.8h + add v15.8h, v15.8h, v16.8h + mul v25.8h, v26.8h, v6.h[2] + mul v27.8h, v28.8h, v6.h[3] + sqrdmulh v14.8h, v26.8h, v4.h[2] + sqrdmulh v16.8h, v28.8h, v4.h[3] +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + sqrdmlsh v14.8h, v25.8h, v8.h[0] + sqrdmlsh v16.8h, v27.8h, v8.h[0] +#else + sqrdmulh v25.8h, v25.8h, v8.h[0] + sqrdmulh v27.8h, v27.8h, v8.h[0] + sub v14.8h, v14.8h, v25.8h + sub v16.8h, v16.8h, v27.8h +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + sshr v14.8h, v14.8h, #1 + sshr v16.8h, v16.8h, #1 + sub v26.8h, v17.8h, v18.8h + sub v28.8h, v19.8h, v20.8h + add v17.8h, v17.8h, v18.8h + add v19.8h, v19.8h, v20.8h + mul v25.8h, v26.8h, v6.h[4] + mul v27.8h, v28.8h, v6.h[5] + sqrdmulh v18.8h, v26.8h, v4.h[4] + sqrdmulh v20.8h, v28.8h, v4.h[5] +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + sqrdmlsh v18.8h, v25.8h, v8.h[0] + sqrdmlsh v20.8h, v27.8h, v8.h[0] +#else + sqrdmulh v25.8h, v25.8h, v8.h[0] + sqrdmulh v27.8h, v27.8h, v8.h[0] + sub v18.8h, v18.8h, v25.8h + sub v20.8h, v20.8h, v27.8h +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + sshr v18.8h, v18.8h, #1 + sshr v20.8h, v20.8h, #1 + sub v26.8h, v21.8h, v22.8h + sub v28.8h, v23.8h, v24.8h + add v21.8h, v21.8h, v22.8h + add v23.8h, v23.8h, v24.8h + mul v25.8h, v26.8h, v6.h[6] + mul v27.8h, v28.8h, v6.h[7] + sqrdmulh v22.8h, v26.8h, v4.h[6] + sqrdmulh v24.8h, v28.8h, v4.h[7] +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + sqrdmlsh v22.8h, v25.8h, v8.h[0] + sqrdmlsh v24.8h, v27.8h, v8.h[0] +#else + sqrdmulh v25.8h, v25.8h, v8.h[0] + sqrdmulh v27.8h, v27.8h, v8.h[0] + sub v22.8h, v22.8h, v25.8h + sub v24.8h, v24.8h, v27.8h +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + sshr v22.8h, v22.8h, #1 + sshr v24.8h, v24.8h, #1 + sub v26.8h, v9.8h, v11.8h + sub v28.8h, v10.8h, v12.8h + add v9.8h, v9.8h, v11.8h + add v10.8h, v10.8h, v12.8h + mul v25.8h, v26.8h, v7.h[0] + mul v27.8h, v28.8h, v7.h[0] + sqrdmulh v11.8h, v26.8h, v5.h[0] + sqrdmulh v12.8h, v28.8h, v5.h[0] +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + sqrdmlsh v11.8h, v25.8h, v8.h[0] + sqrdmlsh v12.8h, v27.8h, v8.h[0] +#else + sqrdmulh v25.8h, v25.8h, v8.h[0] + sqrdmulh v27.8h, v27.8h, v8.h[0] + sub v11.8h, v11.8h, v25.8h + sub v12.8h, 
v12.8h, v27.8h +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + sshr v11.8h, v11.8h, #1 + sshr v12.8h, v12.8h, #1 + sub v26.8h, v13.8h, v15.8h + sub v28.8h, v14.8h, v16.8h + add v13.8h, v13.8h, v15.8h + add v14.8h, v14.8h, v16.8h + mul v25.8h, v26.8h, v7.h[1] + mul v27.8h, v28.8h, v7.h[1] + sqrdmulh v15.8h, v26.8h, v5.h[1] + sqrdmulh v16.8h, v28.8h, v5.h[1] +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + sqrdmlsh v15.8h, v25.8h, v8.h[0] + sqrdmlsh v16.8h, v27.8h, v8.h[0] +#else + sqrdmulh v25.8h, v25.8h, v8.h[0] + sqrdmulh v27.8h, v27.8h, v8.h[0] + sub v15.8h, v15.8h, v25.8h + sub v16.8h, v16.8h, v27.8h +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + sshr v15.8h, v15.8h, #1 + sshr v16.8h, v16.8h, #1 + sub v26.8h, v17.8h, v19.8h + sub v28.8h, v18.8h, v20.8h + add v17.8h, v17.8h, v19.8h + add v18.8h, v18.8h, v20.8h + mul v25.8h, v26.8h, v7.h[2] + mul v27.8h, v28.8h, v7.h[2] + sqrdmulh v19.8h, v26.8h, v5.h[2] + sqrdmulh v20.8h, v28.8h, v5.h[2] +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + sqrdmlsh v19.8h, v25.8h, v8.h[0] + sqrdmlsh v20.8h, v27.8h, v8.h[0] +#else + sqrdmulh v25.8h, v25.8h, v8.h[0] + sqrdmulh v27.8h, v27.8h, v8.h[0] + sub v19.8h, v19.8h, v25.8h + sub v20.8h, v20.8h, v27.8h +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + sshr v19.8h, v19.8h, #1 + sshr v20.8h, v20.8h, #1 + sub v26.8h, v21.8h, v23.8h + sub v28.8h, v22.8h, v24.8h + add v21.8h, v21.8h, v23.8h + add v22.8h, v22.8h, v24.8h + mul v25.8h, v26.8h, v7.h[3] + mul v27.8h, v28.8h, v7.h[3] + sqrdmulh v23.8h, v26.8h, v5.h[3] + sqrdmulh v24.8h, v28.8h, v5.h[3] +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + sqrdmlsh v23.8h, v25.8h, v8.h[0] + sqrdmlsh v24.8h, v27.8h, v8.h[0] +#else + sqrdmulh v25.8h, v25.8h, v8.h[0] + sqrdmulh v27.8h, v27.8h, v8.h[0] + sub v23.8h, v23.8h, v25.8h + sub v24.8h, v24.8h, v27.8h +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + sshr v23.8h, v23.8h, #1 + sshr v24.8h, v24.8h, #1 + sub v26.8h, v9.8h, v13.8h + sub v28.8h, v10.8h, v14.8h + add v9.8h, v9.8h, v13.8h + add v10.8h, v10.8h, v14.8h + mul v25.8h, v26.8h, v7.h[4] + mul v27.8h, v28.8h, v7.h[4] + sqrdmulh v13.8h, v26.8h, v5.h[4] + sqrdmulh v14.8h, v28.8h, v5.h[4] +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + sqrdmlsh v13.8h, v25.8h, v8.h[0] + sqrdmlsh v14.8h, v27.8h, v8.h[0] +#else + sqrdmulh v25.8h, v25.8h, v8.h[0] + sqrdmulh v27.8h, v27.8h, v8.h[0] + sub v13.8h, v13.8h, v25.8h + sub v14.8h, v14.8h, v27.8h +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + sshr v13.8h, v13.8h, #1 + sshr v14.8h, v14.8h, #1 + sub v26.8h, v11.8h, v15.8h + sub v28.8h, v12.8h, v16.8h + add v11.8h, v11.8h, v15.8h + add v12.8h, v12.8h, v16.8h + mul v25.8h, v26.8h, v7.h[4] + mul v27.8h, v28.8h, v7.h[4] + sqrdmulh v15.8h, v26.8h, v5.h[4] + sqrdmulh v16.8h, v28.8h, v5.h[4] +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + sqrdmlsh v15.8h, v25.8h, v8.h[0] + sqrdmlsh v16.8h, v27.8h, v8.h[0] +#else + sqrdmulh v25.8h, v25.8h, v8.h[0] + sqrdmulh v27.8h, v27.8h, v8.h[0] + sub v15.8h, v15.8h, v25.8h + sub v16.8h, v16.8h, v27.8h +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + sshr v15.8h, v15.8h, #1 + sshr v16.8h, v16.8h, #1 + sub v26.8h, v17.8h, v21.8h + sub v28.8h, v18.8h, v22.8h + add v17.8h, v17.8h, v21.8h + add v18.8h, v18.8h, v22.8h + mul v25.8h, v26.8h, v7.h[5] + mul v27.8h, v28.8h, v7.h[5] + sqrdmulh v21.8h, v26.8h, v5.h[5] + sqrdmulh v22.8h, v28.8h, v5.h[5] +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + sqrdmlsh v21.8h, v25.8h, v8.h[0] + sqrdmlsh v22.8h, v27.8h, v8.h[0] +#else + sqrdmulh v25.8h, v25.8h, v8.h[0] + sqrdmulh v27.8h, v27.8h, v8.h[0] + sub v21.8h, v21.8h, v25.8h + sub v22.8h, v22.8h, v27.8h +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + sshr v21.8h, 
v21.8h, #1 + sshr v22.8h, v22.8h, #1 + sub v26.8h, v19.8h, v23.8h + sub v28.8h, v20.8h, v24.8h + add v19.8h, v19.8h, v23.8h + add v20.8h, v20.8h, v24.8h + mul v25.8h, v26.8h, v7.h[5] + mul v27.8h, v28.8h, v7.h[5] + sqrdmulh v23.8h, v26.8h, v5.h[5] + sqrdmulh v24.8h, v28.8h, v5.h[5] +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + sqrdmlsh v23.8h, v25.8h, v8.h[0] + sqrdmlsh v24.8h, v27.8h, v8.h[0] +#else + sqrdmulh v25.8h, v25.8h, v8.h[0] + sqrdmulh v27.8h, v27.8h, v8.h[0] + sub v23.8h, v23.8h, v25.8h + sub v24.8h, v24.8h, v27.8h +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + sshr v23.8h, v23.8h, #1 + sshr v24.8h, v24.8h, #1 + sqdmulh v25.8h, v9.8h, v8.h[2] + sqdmulh v26.8h, v10.8h, v8.h[2] + sshr v25.8h, v25.8h, #11 + sshr v26.8h, v26.8h, #11 + mls v9.8h, v25.8h, v8.h[0] + mls v10.8h, v26.8h, v8.h[0] + sqdmulh v25.8h, v11.8h, v8.h[2] + sqdmulh v26.8h, v12.8h, v8.h[2] + sshr v25.8h, v25.8h, #11 + sshr v26.8h, v26.8h, #11 + mls v11.8h, v25.8h, v8.h[0] + mls v12.8h, v26.8h, v8.h[0] + sqdmulh v25.8h, v17.8h, v8.h[2] + sqdmulh v26.8h, v18.8h, v8.h[2] + sshr v25.8h, v25.8h, #11 + sshr v26.8h, v26.8h, #11 + mls v17.8h, v25.8h, v8.h[0] + mls v18.8h, v26.8h, v8.h[0] + sqdmulh v25.8h, v19.8h, v8.h[2] + sqdmulh v26.8h, v20.8h, v8.h[2] + sshr v25.8h, v25.8h, #11 + sshr v26.8h, v26.8h, #11 + mls v19.8h, v25.8h, v8.h[0] + mls v20.8h, v26.8h, v8.h[0] + sub v26.8h, v9.8h, v17.8h + sub v28.8h, v10.8h, v18.8h + add v9.8h, v9.8h, v17.8h + add v10.8h, v10.8h, v18.8h + mul v25.8h, v26.8h, v7.h[6] + mul v27.8h, v28.8h, v7.h[6] + sqrdmulh v17.8h, v26.8h, v5.h[6] + sqrdmulh v18.8h, v28.8h, v5.h[6] +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + sqrdmlsh v17.8h, v25.8h, v8.h[0] + sqrdmlsh v18.8h, v27.8h, v8.h[0] +#else + sqrdmulh v25.8h, v25.8h, v8.h[0] + sqrdmulh v27.8h, v27.8h, v8.h[0] + sub v17.8h, v17.8h, v25.8h + sub v18.8h, v18.8h, v27.8h +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + sshr v17.8h, v17.8h, #1 + sshr v18.8h, v18.8h, #1 + sub v26.8h, v11.8h, v19.8h + sub v28.8h, v12.8h, v20.8h + add v11.8h, v11.8h, v19.8h + add v12.8h, v12.8h, v20.8h + mul v25.8h, v26.8h, v7.h[6] + mul v27.8h, v28.8h, v7.h[6] + sqrdmulh v19.8h, v26.8h, v5.h[6] + sqrdmulh v20.8h, v28.8h, v5.h[6] +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + sqrdmlsh v19.8h, v25.8h, v8.h[0] + sqrdmlsh v20.8h, v27.8h, v8.h[0] +#else + sqrdmulh v25.8h, v25.8h, v8.h[0] + sqrdmulh v27.8h, v27.8h, v8.h[0] + sub v19.8h, v19.8h, v25.8h + sub v20.8h, v20.8h, v27.8h +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + sshr v19.8h, v19.8h, #1 + sshr v20.8h, v20.8h, #1 + sub v26.8h, v13.8h, v21.8h + sub v28.8h, v14.8h, v22.8h + add v13.8h, v13.8h, v21.8h + add v14.8h, v14.8h, v22.8h + mul v25.8h, v26.8h, v7.h[6] + mul v27.8h, v28.8h, v7.h[6] + sqrdmulh v21.8h, v26.8h, v5.h[6] + sqrdmulh v22.8h, v28.8h, v5.h[6] +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + sqrdmlsh v21.8h, v25.8h, v8.h[0] + sqrdmlsh v22.8h, v27.8h, v8.h[0] +#else + sqrdmulh v25.8h, v25.8h, v8.h[0] + sqrdmulh v27.8h, v27.8h, v8.h[0] + sub v21.8h, v21.8h, v25.8h + sub v22.8h, v22.8h, v27.8h +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + sshr v21.8h, v21.8h, #1 + sshr v22.8h, v22.8h, #1 + sub v26.8h, v15.8h, v23.8h + sub v28.8h, v16.8h, v24.8h + add v15.8h, v15.8h, v23.8h + add v16.8h, v16.8h, v24.8h + mul v25.8h, v26.8h, v7.h[6] + mul v27.8h, v28.8h, v7.h[6] + sqrdmulh v23.8h, v26.8h, v5.h[6] + sqrdmulh v24.8h, v28.8h, v5.h[6] +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + sqrdmlsh v23.8h, v25.8h, v8.h[0] + sqrdmlsh v24.8h, v27.8h, v8.h[0] +#else + sqrdmulh v25.8h, v25.8h, v8.h[0] + sqrdmulh v27.8h, v27.8h, v8.h[0] + sub v23.8h, v23.8h, v25.8h + sub 
v24.8h, v24.8h, v27.8h +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + sshr v23.8h, v23.8h, #1 + sshr v24.8h, v24.8h, #1 + mul v25.8h, v9.8h, v7.h[7] + mul v26.8h, v10.8h, v7.h[7] + sqrdmulh v9.8h, v9.8h, v5.h[7] + sqrdmulh v10.8h, v10.8h, v5.h[7] +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + sqrdmlsh v9.8h, v25.8h, v8.h[0] + sqrdmlsh v10.8h, v26.8h, v8.h[0] +#else + sqrdmulh v25.8h, v25.8h, v8.h[0] + sqrdmulh v26.8h, v26.8h, v8.h[0] + sub v9.8h, v9.8h, v25.8h + sub v10.8h, v10.8h, v26.8h +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + sshr v9.8h, v9.8h, #1 + sshr v10.8h, v10.8h, #1 + mul v25.8h, v11.8h, v7.h[7] + mul v26.8h, v12.8h, v7.h[7] + sqrdmulh v11.8h, v11.8h, v5.h[7] + sqrdmulh v12.8h, v12.8h, v5.h[7] +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + sqrdmlsh v11.8h, v25.8h, v8.h[0] + sqrdmlsh v12.8h, v26.8h, v8.h[0] +#else + sqrdmulh v25.8h, v25.8h, v8.h[0] + sqrdmulh v26.8h, v26.8h, v8.h[0] + sub v11.8h, v11.8h, v25.8h + sub v12.8h, v12.8h, v26.8h +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + sshr v11.8h, v11.8h, #1 + sshr v12.8h, v12.8h, #1 + mul v25.8h, v13.8h, v7.h[7] + mul v26.8h, v14.8h, v7.h[7] + sqrdmulh v13.8h, v13.8h, v5.h[7] + sqrdmulh v14.8h, v14.8h, v5.h[7] +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + sqrdmlsh v13.8h, v25.8h, v8.h[0] + sqrdmlsh v14.8h, v26.8h, v8.h[0] +#else + sqrdmulh v25.8h, v25.8h, v8.h[0] + sqrdmulh v26.8h, v26.8h, v8.h[0] + sub v13.8h, v13.8h, v25.8h + sub v14.8h, v14.8h, v26.8h +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + sshr v13.8h, v13.8h, #1 + sshr v14.8h, v14.8h, #1 + mul v25.8h, v15.8h, v7.h[7] + mul v26.8h, v16.8h, v7.h[7] + sqrdmulh v15.8h, v15.8h, v5.h[7] + sqrdmulh v16.8h, v16.8h, v5.h[7] +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + sqrdmlsh v15.8h, v25.8h, v8.h[0] + sqrdmlsh v16.8h, v26.8h, v8.h[0] +#else + sqrdmulh v25.8h, v25.8h, v8.h[0] + sqrdmulh v26.8h, v26.8h, v8.h[0] + sub v15.8h, v15.8h, v25.8h + sub v16.8h, v16.8h, v26.8h +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + sshr v15.8h, v15.8h, #1 + sshr v16.8h, v16.8h, #1 + mul v25.8h, v17.8h, v7.h[7] + mul v26.8h, v18.8h, v7.h[7] + sqrdmulh v17.8h, v17.8h, v5.h[7] + sqrdmulh v18.8h, v18.8h, v5.h[7] +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + sqrdmlsh v17.8h, v25.8h, v8.h[0] + sqrdmlsh v18.8h, v26.8h, v8.h[0] +#else + sqrdmulh v25.8h, v25.8h, v8.h[0] + sqrdmulh v26.8h, v26.8h, v8.h[0] + sub v17.8h, v17.8h, v25.8h + sub v18.8h, v18.8h, v26.8h +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + sshr v17.8h, v17.8h, #1 + sshr v18.8h, v18.8h, #1 + mul v25.8h, v19.8h, v7.h[7] + mul v26.8h, v20.8h, v7.h[7] + sqrdmulh v19.8h, v19.8h, v5.h[7] + sqrdmulh v20.8h, v20.8h, v5.h[7] +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + sqrdmlsh v19.8h, v25.8h, v8.h[0] + sqrdmlsh v20.8h, v26.8h, v8.h[0] +#else + sqrdmulh v25.8h, v25.8h, v8.h[0] + sqrdmulh v26.8h, v26.8h, v8.h[0] + sub v19.8h, v19.8h, v25.8h + sub v20.8h, v20.8h, v26.8h +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + sshr v19.8h, v19.8h, #1 + sshr v20.8h, v20.8h, #1 + mul v25.8h, v21.8h, v7.h[7] + mul v26.8h, v22.8h, v7.h[7] + sqrdmulh v21.8h, v21.8h, v5.h[7] + sqrdmulh v22.8h, v22.8h, v5.h[7] +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + sqrdmlsh v21.8h, v25.8h, v8.h[0] + sqrdmlsh v22.8h, v26.8h, v8.h[0] +#else + sqrdmulh v25.8h, v25.8h, v8.h[0] + sqrdmulh v26.8h, v26.8h, v8.h[0] + sub v21.8h, v21.8h, v25.8h + sub v22.8h, v22.8h, v26.8h +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + sshr v21.8h, v21.8h, #1 + sshr v22.8h, v22.8h, #1 + mul v25.8h, v23.8h, v7.h[7] + mul v26.8h, v24.8h, v7.h[7] + sqrdmulh v23.8h, v23.8h, v5.h[7] + sqrdmulh v24.8h, v24.8h, v5.h[7] +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + sqrdmlsh v23.8h, 
v25.8h, v8.h[0] + sqrdmlsh v24.8h, v26.8h, v8.h[0] +#else + sqrdmulh v25.8h, v25.8h, v8.h[0] + sqrdmulh v26.8h, v26.8h, v8.h[0] + sub v23.8h, v23.8h, v25.8h + sub v24.8h, v24.8h, v26.8h +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + sshr v23.8h, v23.8h, #1 + sshr v24.8h, v24.8h, #1 + str q9, [x0, #16] + str q10, [x0, #48] + str q11, [x0, #80] + str q12, [x0, #112] + str q13, [x0, #144] + str q14, [x0, #176] + str q15, [x0, #208] + str q16, [x0, #240] + str q17, [x1, #16] + str q18, [x1, #48] + str q19, [x1, #80] + str q20, [x1, #112] + str q21, [x1, #144] + str q22, [x1, #176] + str q23, [x1, #208] + str q24, [x1, #240] + ldp d8, d9, [x29, #16] + ldp d10, d11, [x29, #32] + ldp d12, d13, [x29, #48] + ldp d14, d15, [x29, #64] + ldp x29, x30, [sp], #0x50 + ret +#ifndef __APPLE__ + .size kyber_invntt,.-kyber_invntt +#endif /* __APPLE__ */ +#ifndef __APPLE__ + .text + .type L_kyber_aarch64_zetas_mul, %object + .section .rodata + .size L_kyber_aarch64_zetas_mul, 256 +#else + .section __DATA,__data +#endif /* __APPLE__ */ +#ifndef __APPLE__ + .align 2 +#else + .p2align 2 +#endif /* __APPLE__ */ +L_kyber_aarch64_zetas_mul: + .short 0x08b2,0xf74e,0x01ae,0xfe52,0x022b,0xfdd5,0x034b,0xfcb5 + .short 0x081e,0xf7e2,0x0367,0xfc99,0x060e,0xf9f2,0x0069,0xff97 + .short 0x01a6,0xfe5a,0x024b,0xfdb5,0x00b1,0xff4f,0x0c16,0xf3ea + .short 0x0bde,0xf422,0x0b35,0xf4cb,0x0626,0xf9da,0x0675,0xf98b + .short 0x0c0b,0xf3f5,0x030a,0xfcf6,0x0487,0xfb79,0x0c6e,0xf392 + .short 0x09f8,0xf608,0x05cb,0xfa35,0x0aa7,0xf559,0x045f,0xfba1 + .short 0x06cb,0xf935,0x0284,0xfd7c,0x0999,0xf667,0x015d,0xfea3 + .short 0x01a2,0xfe5e,0x0149,0xfeb7,0x0c65,0xf39b,0x0cb6,0xf34a + .short 0x0331,0xfccf,0x0449,0xfbb7,0x025b,0xfda5,0x0262,0xfd9e + .short 0x052a,0xfad6,0x07fc,0xf804,0x0748,0xf8b8,0x0180,0xfe80 + .short 0x0842,0xf7be,0x0c79,0xf387,0x04c2,0xfb3e,0x07ca,0xf836 + .short 0x0997,0xf669,0x00dc,0xff24,0x085e,0xf7a2,0x0686,0xf97a + .short 0x0860,0xf7a0,0x0707,0xf8f9,0x0803,0xf7fd,0x031a,0xfce6 + .short 0x071b,0xf8e5,0x09ab,0xf655,0x099b,0xf665,0x01de,0xfe22 + .short 0x0c95,0xf36b,0x0bcd,0xf433,0x03e4,0xfc1c,0x03df,0xfc21 + .short 0x03be,0xfc42,0x074d,0xf8b3,0x05f2,0xfa0e,0x065c,0xf9a4 +#ifndef __APPLE__ +.text +.globl kyber_basemul_mont +.type kyber_basemul_mont,@function +.align 2 +kyber_basemul_mont: +#else +.section __TEXT,__text +.globl _kyber_basemul_mont +.p2align 2 +_kyber_basemul_mont: +#endif /* __APPLE__ */ + stp x29, x30, [sp, #-80]!
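+ # kyber_basemul_mont(r, a, b): base multiplication of two polynomials in
+ # the NTT domain. For each coefficient pair the unrolled blocks below
+ # compute r[2i] = a[2i]*b[2i] + zeta_i*a[2i+1]*b[2i+1] and
+ # r[2i+1] = a[2i]*b[2i+1] + a[2i+1]*b[2i], with every 32-bit product
+ # brought back to 16 bits by Montgomery reduction. uzp1/uzp2 split even
+ # and odd coefficients, zip1/zip2 re-interleave the results, and q0 holds
+ # eight zeta entries per block from L_kyber_aarch64_zetas_mul (stored as
+ # +z,-z pairs so the alternating sign is baked into the table).
+ # A C sketch of the xtn/mul/smlsl/shrn reduction sequence, assuming
+ # v1.h[0] = KYBER_Q = 3329 and v1.h[1] = q^-1 mod 2^16 in
+ # L_kyber_aarch64_consts (lane layout inferred from usage; helper name
+ # hypothetical):
+ #   int16_t mont_reduce(int32_t p) {
+ #       int16_t t = (int16_t)p * QINV;   /* multiply low half by q^-1 */
+ #       return (int16_t)((p - (int32_t)t * KYBER_Q) >> 16); /* p*2^-16 mod q */
+ #   }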
+ add x29, sp, #0 + stp d8, d9, [x29, #16] + stp d10, d11, [x29, #32] + stp d12, d13, [x29, #48] + stp d14, d15, [x29, #64] +#ifndef __APPLE__ + adrp x3, L_kyber_aarch64_zetas_mul + add x3, x3, :lo12:L_kyber_aarch64_zetas_mul +#else + adrp x3, L_kyber_aarch64_zetas_mul@PAGE + add x3, x3, :lo12:L_kyber_aarch64_zetas_mul@PAGEOFF +#endif /* __APPLE__ */ +#ifndef __APPLE__ + adrp x4, L_kyber_aarch64_consts + add x4, x4, :lo12:L_kyber_aarch64_consts +#else + adrp x4, L_kyber_aarch64_consts@PAGE + add x4, x4, :lo12:L_kyber_aarch64_consts@PAGEOFF +#endif /* __APPLE__ */ + ldr q1, [x4] + ldp q2, q3, [x1] + ldp q4, q5, [x1, #32] + ldp q6, q7, [x1, #64] + ldp q8, q9, [x1, #96] + ldp q10, q11, [x2] + ldp q12, q13, [x2, #32] + ldp q14, q15, [x2, #64] + ldp q16, q17, [x2, #96] + ldr q0, [x3] + uzp1 v18.8h, v2.8h, v3.8h + uzp2 v19.8h, v2.8h, v3.8h + uzp1 v20.8h, v10.8h, v11.8h + uzp2 v21.8h, v10.8h, v11.8h + smull v26.4s, v18.4h, v20.4h + smull2 v27.4s, v18.8h, v20.8h + smull v23.4s, v19.4h, v21.4h + smull2 v24.4s, v19.8h, v21.8h + xtn v25.4h, v23.4s + xtn2 v25.8h, v24.4s + mul v25.8h, v25.8h, v1.h[1] + smlsl v23.4s, v25.4h, v1.h[0] + smlsl2 v24.4s, v25.8h, v1.h[0] + shrn v22.4h, v23.4s, #16 + shrn2 v22.8h, v24.4s, #16 + smlal v26.4s, v22.4h, v0.4h + smlal2 v27.4s, v22.8h, v0.8h + xtn v24.4h, v26.4s + xtn2 v24.8h, v27.4s + mul v24.8h, v24.8h, v1.h[1] + smlsl v26.4s, v24.4h, v1.h[0] + smlsl2 v27.4s, v24.8h, v1.h[0] + shrn v22.4h, v26.4s, #16 + shrn2 v22.8h, v27.4s, #16 + smull v26.4s, v18.4h, v21.4h + smull2 v27.4s, v18.8h, v21.8h + smlal v26.4s, v19.4h, v20.4h + smlal2 v27.4s, v19.8h, v20.8h + xtn v24.4h, v26.4s + xtn2 v24.8h, v27.4s + mul v24.8h, v24.8h, v1.h[1] + smlsl v26.4s, v24.4h, v1.h[0] + smlsl2 v27.4s, v24.8h, v1.h[0] + shrn v23.4h, v26.4s, #16 + shrn2 v23.8h, v27.4s, #16 + zip1 v24.8h, v22.8h, v23.8h + zip2 v25.8h, v22.8h, v23.8h + stp q24, q25, [x0] + ldr q0, [x3, #16] + uzp1 v18.8h, v4.8h, v5.8h + uzp2 v19.8h, v4.8h, v5.8h + uzp1 v20.8h, v12.8h, v13.8h + uzp2 v21.8h, v12.8h, v13.8h + smull v26.4s, v18.4h, v20.4h + smull2 v27.4s, v18.8h, v20.8h + smull v23.4s, v19.4h, v21.4h + smull2 v24.4s, v19.8h, v21.8h + xtn v25.4h, v23.4s + xtn2 v25.8h, v24.4s + mul v25.8h, v25.8h, v1.h[1] + smlsl v23.4s, v25.4h, v1.h[0] + smlsl2 v24.4s, v25.8h, v1.h[0] + shrn v22.4h, v23.4s, #16 + shrn2 v22.8h, v24.4s, #16 + smlal v26.4s, v22.4h, v0.4h + smlal2 v27.4s, v22.8h, v0.8h + xtn v24.4h, v26.4s + xtn2 v24.8h, v27.4s + mul v24.8h, v24.8h, v1.h[1] + smlsl v26.4s, v24.4h, v1.h[0] + smlsl2 v27.4s, v24.8h, v1.h[0] + shrn v22.4h, v26.4s, #16 + shrn2 v22.8h, v27.4s, #16 + smull v26.4s, v18.4h, v21.4h + smull2 v27.4s, v18.8h, v21.8h + smlal v26.4s, v19.4h, v20.4h + smlal2 v27.4s, v19.8h, v20.8h + xtn v24.4h, v26.4s + xtn2 v24.8h, v27.4s + mul v24.8h, v24.8h, v1.h[1] + smlsl v26.4s, v24.4h, v1.h[0] + smlsl2 v27.4s, v24.8h, v1.h[0] + shrn v23.4h, v26.4s, #16 + shrn2 v23.8h, v27.4s, #16 + zip1 v24.8h, v22.8h, v23.8h + zip2 v25.8h, v22.8h, v23.8h + stp q24, q25, [x0, #32] + ldr q0, [x3, #32] + uzp1 v18.8h, v6.8h, v7.8h + uzp2 v19.8h, v6.8h, v7.8h + uzp1 v20.8h, v14.8h, v15.8h + uzp2 v21.8h, v14.8h, v15.8h + smull v26.4s, v18.4h, v20.4h + smull2 v27.4s, v18.8h, v20.8h + smull v23.4s, v19.4h, v21.4h + smull2 v24.4s, v19.8h, v21.8h + xtn v25.4h, v23.4s + xtn2 v25.8h, v24.4s + mul v25.8h, v25.8h, v1.h[1] + smlsl v23.4s, v25.4h, v1.h[0] + smlsl2 v24.4s, v25.8h, v1.h[0] + shrn v22.4h, v23.4s, #16 + shrn2 v22.8h, v24.4s, #16 + smlal v26.4s, v22.4h, v0.4h + smlal2 v27.4s, v22.8h, v0.8h + xtn v24.4h, v26.4s + xtn2 v24.8h, v27.4s + 
mul v24.8h, v24.8h, v1.h[1] + smlsl v26.4s, v24.4h, v1.h[0] + smlsl2 v27.4s, v24.8h, v1.h[0] + shrn v22.4h, v26.4s, #16 + shrn2 v22.8h, v27.4s, #16 + smull v26.4s, v18.4h, v21.4h + smull2 v27.4s, v18.8h, v21.8h + smlal v26.4s, v19.4h, v20.4h + smlal2 v27.4s, v19.8h, v20.8h + xtn v24.4h, v26.4s + xtn2 v24.8h, v27.4s + mul v24.8h, v24.8h, v1.h[1] + smlsl v26.4s, v24.4h, v1.h[0] + smlsl2 v27.4s, v24.8h, v1.h[0] + shrn v23.4h, v26.4s, #16 + shrn2 v23.8h, v27.4s, #16 + zip1 v24.8h, v22.8h, v23.8h + zip2 v25.8h, v22.8h, v23.8h + stp q24, q25, [x0, #64] + ldr q0, [x3, #48] + uzp1 v18.8h, v8.8h, v9.8h + uzp2 v19.8h, v8.8h, v9.8h + uzp1 v20.8h, v16.8h, v17.8h + uzp2 v21.8h, v16.8h, v17.8h + smull v26.4s, v18.4h, v20.4h + smull2 v27.4s, v18.8h, v20.8h + smull v23.4s, v19.4h, v21.4h + smull2 v24.4s, v19.8h, v21.8h + xtn v25.4h, v23.4s + xtn2 v25.8h, v24.4s + mul v25.8h, v25.8h, v1.h[1] + smlsl v23.4s, v25.4h, v1.h[0] + smlsl2 v24.4s, v25.8h, v1.h[0] + shrn v22.4h, v23.4s, #16 + shrn2 v22.8h, v24.4s, #16 + smlal v26.4s, v22.4h, v0.4h + smlal2 v27.4s, v22.8h, v0.8h + xtn v24.4h, v26.4s + xtn2 v24.8h, v27.4s + mul v24.8h, v24.8h, v1.h[1] + smlsl v26.4s, v24.4h, v1.h[0] + smlsl2 v27.4s, v24.8h, v1.h[0] + shrn v22.4h, v26.4s, #16 + shrn2 v22.8h, v27.4s, #16 + smull v26.4s, v18.4h, v21.4h + smull2 v27.4s, v18.8h, v21.8h + smlal v26.4s, v19.4h, v20.4h + smlal2 v27.4s, v19.8h, v20.8h + xtn v24.4h, v26.4s + xtn2 v24.8h, v27.4s + mul v24.8h, v24.8h, v1.h[1] + smlsl v26.4s, v24.4h, v1.h[0] + smlsl2 v27.4s, v24.8h, v1.h[0] + shrn v23.4h, v26.4s, #16 + shrn2 v23.8h, v27.4s, #16 + zip1 v24.8h, v22.8h, v23.8h + zip2 v25.8h, v22.8h, v23.8h + stp q24, q25, [x0, #96] + ldp q2, q3, [x1, #128] + ldp q4, q5, [x1, #160] + ldp q6, q7, [x1, #192] + ldp q8, q9, [x1, #224] + ldp q10, q11, [x2, #128] + ldp q12, q13, [x2, #160] + ldp q14, q15, [x2, #192] + ldp q16, q17, [x2, #224] + ldr q0, [x3, #64] + uzp1 v18.8h, v2.8h, v3.8h + uzp2 v19.8h, v2.8h, v3.8h + uzp1 v20.8h, v10.8h, v11.8h + uzp2 v21.8h, v10.8h, v11.8h + smull v26.4s, v18.4h, v20.4h + smull2 v27.4s, v18.8h, v20.8h + smull v23.4s, v19.4h, v21.4h + smull2 v24.4s, v19.8h, v21.8h + xtn v25.4h, v23.4s + xtn2 v25.8h, v24.4s + mul v25.8h, v25.8h, v1.h[1] + smlsl v23.4s, v25.4h, v1.h[0] + smlsl2 v24.4s, v25.8h, v1.h[0] + shrn v22.4h, v23.4s, #16 + shrn2 v22.8h, v24.4s, #16 + smlal v26.4s, v22.4h, v0.4h + smlal2 v27.4s, v22.8h, v0.8h + xtn v24.4h, v26.4s + xtn2 v24.8h, v27.4s + mul v24.8h, v24.8h, v1.h[1] + smlsl v26.4s, v24.4h, v1.h[0] + smlsl2 v27.4s, v24.8h, v1.h[0] + shrn v22.4h, v26.4s, #16 + shrn2 v22.8h, v27.4s, #16 + smull v26.4s, v18.4h, v21.4h + smull2 v27.4s, v18.8h, v21.8h + smlal v26.4s, v19.4h, v20.4h + smlal2 v27.4s, v19.8h, v20.8h + xtn v24.4h, v26.4s + xtn2 v24.8h, v27.4s + mul v24.8h, v24.8h, v1.h[1] + smlsl v26.4s, v24.4h, v1.h[0] + smlsl2 v27.4s, v24.8h, v1.h[0] + shrn v23.4h, v26.4s, #16 + shrn2 v23.8h, v27.4s, #16 + zip1 v24.8h, v22.8h, v23.8h + zip2 v25.8h, v22.8h, v23.8h + stp q24, q25, [x0, #128] + ldr q0, [x3, #80] + uzp1 v18.8h, v4.8h, v5.8h + uzp2 v19.8h, v4.8h, v5.8h + uzp1 v20.8h, v12.8h, v13.8h + uzp2 v21.8h, v12.8h, v13.8h + smull v26.4s, v18.4h, v20.4h + smull2 v27.4s, v18.8h, v20.8h + smull v23.4s, v19.4h, v21.4h + smull2 v24.4s, v19.8h, v21.8h + xtn v25.4h, v23.4s + xtn2 v25.8h, v24.4s + mul v25.8h, v25.8h, v1.h[1] + smlsl v23.4s, v25.4h, v1.h[0] + smlsl2 v24.4s, v25.8h, v1.h[0] + shrn v22.4h, v23.4s, #16 + shrn2 v22.8h, v24.4s, #16 + smlal v26.4s, v22.4h, v0.4h + smlal2 v27.4s, v22.8h, v0.8h + xtn v24.4h, v26.4s + xtn2 v24.8h, v27.4s + mul 
v24.8h, v24.8h, v1.h[1] + smlsl v26.4s, v24.4h, v1.h[0] + smlsl2 v27.4s, v24.8h, v1.h[0] + shrn v22.4h, v26.4s, #16 + shrn2 v22.8h, v27.4s, #16 + smull v26.4s, v18.4h, v21.4h + smull2 v27.4s, v18.8h, v21.8h + smlal v26.4s, v19.4h, v20.4h + smlal2 v27.4s, v19.8h, v20.8h + xtn v24.4h, v26.4s + xtn2 v24.8h, v27.4s + mul v24.8h, v24.8h, v1.h[1] + smlsl v26.4s, v24.4h, v1.h[0] + smlsl2 v27.4s, v24.8h, v1.h[0] + shrn v23.4h, v26.4s, #16 + shrn2 v23.8h, v27.4s, #16 + zip1 v24.8h, v22.8h, v23.8h + zip2 v25.8h, v22.8h, v23.8h + stp q24, q25, [x0, #160] + ldr q0, [x3, #96] + uzp1 v18.8h, v6.8h, v7.8h + uzp2 v19.8h, v6.8h, v7.8h + uzp1 v20.8h, v14.8h, v15.8h + uzp2 v21.8h, v14.8h, v15.8h + smull v26.4s, v18.4h, v20.4h + smull2 v27.4s, v18.8h, v20.8h + smull v23.4s, v19.4h, v21.4h + smull2 v24.4s, v19.8h, v21.8h + xtn v25.4h, v23.4s + xtn2 v25.8h, v24.4s + mul v25.8h, v25.8h, v1.h[1] + smlsl v23.4s, v25.4h, v1.h[0] + smlsl2 v24.4s, v25.8h, v1.h[0] + shrn v22.4h, v23.4s, #16 + shrn2 v22.8h, v24.4s, #16 + smlal v26.4s, v22.4h, v0.4h + smlal2 v27.4s, v22.8h, v0.8h + xtn v24.4h, v26.4s + xtn2 v24.8h, v27.4s + mul v24.8h, v24.8h, v1.h[1] + smlsl v26.4s, v24.4h, v1.h[0] + smlsl2 v27.4s, v24.8h, v1.h[0] + shrn v22.4h, v26.4s, #16 + shrn2 v22.8h, v27.4s, #16 + smull v26.4s, v18.4h, v21.4h + smull2 v27.4s, v18.8h, v21.8h + smlal v26.4s, v19.4h, v20.4h + smlal2 v27.4s, v19.8h, v20.8h + xtn v24.4h, v26.4s + xtn2 v24.8h, v27.4s + mul v24.8h, v24.8h, v1.h[1] + smlsl v26.4s, v24.4h, v1.h[0] + smlsl2 v27.4s, v24.8h, v1.h[0] + shrn v23.4h, v26.4s, #16 + shrn2 v23.8h, v27.4s, #16 + zip1 v24.8h, v22.8h, v23.8h + zip2 v25.8h, v22.8h, v23.8h + stp q24, q25, [x0, #192] + ldr q0, [x3, #112] + uzp1 v18.8h, v8.8h, v9.8h + uzp2 v19.8h, v8.8h, v9.8h + uzp1 v20.8h, v16.8h, v17.8h + uzp2 v21.8h, v16.8h, v17.8h + smull v26.4s, v18.4h, v20.4h + smull2 v27.4s, v18.8h, v20.8h + smull v23.4s, v19.4h, v21.4h + smull2 v24.4s, v19.8h, v21.8h + xtn v25.4h, v23.4s + xtn2 v25.8h, v24.4s + mul v25.8h, v25.8h, v1.h[1] + smlsl v23.4s, v25.4h, v1.h[0] + smlsl2 v24.4s, v25.8h, v1.h[0] + shrn v22.4h, v23.4s, #16 + shrn2 v22.8h, v24.4s, #16 + smlal v26.4s, v22.4h, v0.4h + smlal2 v27.4s, v22.8h, v0.8h + xtn v24.4h, v26.4s + xtn2 v24.8h, v27.4s + mul v24.8h, v24.8h, v1.h[1] + smlsl v26.4s, v24.4h, v1.h[0] + smlsl2 v27.4s, v24.8h, v1.h[0] + shrn v22.4h, v26.4s, #16 + shrn2 v22.8h, v27.4s, #16 + smull v26.4s, v18.4h, v21.4h + smull2 v27.4s, v18.8h, v21.8h + smlal v26.4s, v19.4h, v20.4h + smlal2 v27.4s, v19.8h, v20.8h + xtn v24.4h, v26.4s + xtn2 v24.8h, v27.4s + mul v24.8h, v24.8h, v1.h[1] + smlsl v26.4s, v24.4h, v1.h[0] + smlsl2 v27.4s, v24.8h, v1.h[0] + shrn v23.4h, v26.4s, #16 + shrn2 v23.8h, v27.4s, #16 + zip1 v24.8h, v22.8h, v23.8h + zip2 v25.8h, v22.8h, v23.8h + stp q24, q25, [x0, #224] + ldp q2, q3, [x1, #256] + ldp q4, q5, [x1, #288] + ldp q6, q7, [x1, #320] + ldp q8, q9, [x1, #352] + ldp q10, q11, [x2, #256] + ldp q12, q13, [x2, #288] + ldp q14, q15, [x2, #320] + ldp q16, q17, [x2, #352] + ldr q0, [x3, #128] + uzp1 v18.8h, v2.8h, v3.8h + uzp2 v19.8h, v2.8h, v3.8h + uzp1 v20.8h, v10.8h, v11.8h + uzp2 v21.8h, v10.8h, v11.8h + smull v26.4s, v18.4h, v20.4h + smull2 v27.4s, v18.8h, v20.8h + smull v23.4s, v19.4h, v21.4h + smull2 v24.4s, v19.8h, v21.8h + xtn v25.4h, v23.4s + xtn2 v25.8h, v24.4s + mul v25.8h, v25.8h, v1.h[1] + smlsl v23.4s, v25.4h, v1.h[0] + smlsl2 v24.4s, v25.8h, v1.h[0] + shrn v22.4h, v23.4s, #16 + shrn2 v22.8h, v24.4s, #16 + smlal v26.4s, v22.4h, v0.4h + smlal2 v27.4s, v22.8h, v0.8h + xtn v24.4h, v26.4s + xtn2 v24.8h, v27.4s + mul 
v24.8h, v24.8h, v1.h[1] + smlsl v26.4s, v24.4h, v1.h[0] + smlsl2 v27.4s, v24.8h, v1.h[0] + shrn v22.4h, v26.4s, #16 + shrn2 v22.8h, v27.4s, #16 + smull v26.4s, v18.4h, v21.4h + smull2 v27.4s, v18.8h, v21.8h + smlal v26.4s, v19.4h, v20.4h + smlal2 v27.4s, v19.8h, v20.8h + xtn v24.4h, v26.4s + xtn2 v24.8h, v27.4s + mul v24.8h, v24.8h, v1.h[1] + smlsl v26.4s, v24.4h, v1.h[0] + smlsl2 v27.4s, v24.8h, v1.h[0] + shrn v23.4h, v26.4s, #16 + shrn2 v23.8h, v27.4s, #16 + zip1 v24.8h, v22.8h, v23.8h + zip2 v25.8h, v22.8h, v23.8h + stp q24, q25, [x0, #256] + ldr q0, [x3, #144] + uzp1 v18.8h, v4.8h, v5.8h + uzp2 v19.8h, v4.8h, v5.8h + uzp1 v20.8h, v12.8h, v13.8h + uzp2 v21.8h, v12.8h, v13.8h + smull v26.4s, v18.4h, v20.4h + smull2 v27.4s, v18.8h, v20.8h + smull v23.4s, v19.4h, v21.4h + smull2 v24.4s, v19.8h, v21.8h + xtn v25.4h, v23.4s + xtn2 v25.8h, v24.4s + mul v25.8h, v25.8h, v1.h[1] + smlsl v23.4s, v25.4h, v1.h[0] + smlsl2 v24.4s, v25.8h, v1.h[0] + shrn v22.4h, v23.4s, #16 + shrn2 v22.8h, v24.4s, #16 + smlal v26.4s, v22.4h, v0.4h + smlal2 v27.4s, v22.8h, v0.8h + xtn v24.4h, v26.4s + xtn2 v24.8h, v27.4s + mul v24.8h, v24.8h, v1.h[1] + smlsl v26.4s, v24.4h, v1.h[0] + smlsl2 v27.4s, v24.8h, v1.h[0] + shrn v22.4h, v26.4s, #16 + shrn2 v22.8h, v27.4s, #16 + smull v26.4s, v18.4h, v21.4h + smull2 v27.4s, v18.8h, v21.8h + smlal v26.4s, v19.4h, v20.4h + smlal2 v27.4s, v19.8h, v20.8h + xtn v24.4h, v26.4s + xtn2 v24.8h, v27.4s + mul v24.8h, v24.8h, v1.h[1] + smlsl v26.4s, v24.4h, v1.h[0] + smlsl2 v27.4s, v24.8h, v1.h[0] + shrn v23.4h, v26.4s, #16 + shrn2 v23.8h, v27.4s, #16 + zip1 v24.8h, v22.8h, v23.8h + zip2 v25.8h, v22.8h, v23.8h + stp q24, q25, [x0, #288] + ldr q0, [x3, #160] + uzp1 v18.8h, v6.8h, v7.8h + uzp2 v19.8h, v6.8h, v7.8h + uzp1 v20.8h, v14.8h, v15.8h + uzp2 v21.8h, v14.8h, v15.8h + smull v26.4s, v18.4h, v20.4h + smull2 v27.4s, v18.8h, v20.8h + smull v23.4s, v19.4h, v21.4h + smull2 v24.4s, v19.8h, v21.8h + xtn v25.4h, v23.4s + xtn2 v25.8h, v24.4s + mul v25.8h, v25.8h, v1.h[1] + smlsl v23.4s, v25.4h, v1.h[0] + smlsl2 v24.4s, v25.8h, v1.h[0] + shrn v22.4h, v23.4s, #16 + shrn2 v22.8h, v24.4s, #16 + smlal v26.4s, v22.4h, v0.4h + smlal2 v27.4s, v22.8h, v0.8h + xtn v24.4h, v26.4s + xtn2 v24.8h, v27.4s + mul v24.8h, v24.8h, v1.h[1] + smlsl v26.4s, v24.4h, v1.h[0] + smlsl2 v27.4s, v24.8h, v1.h[0] + shrn v22.4h, v26.4s, #16 + shrn2 v22.8h, v27.4s, #16 + smull v26.4s, v18.4h, v21.4h + smull2 v27.4s, v18.8h, v21.8h + smlal v26.4s, v19.4h, v20.4h + smlal2 v27.4s, v19.8h, v20.8h + xtn v24.4h, v26.4s + xtn2 v24.8h, v27.4s + mul v24.8h, v24.8h, v1.h[1] + smlsl v26.4s, v24.4h, v1.h[0] + smlsl2 v27.4s, v24.8h, v1.h[0] + shrn v23.4h, v26.4s, #16 + shrn2 v23.8h, v27.4s, #16 + zip1 v24.8h, v22.8h, v23.8h + zip2 v25.8h, v22.8h, v23.8h + stp q24, q25, [x0, #320] + ldr q0, [x3, #176] + uzp1 v18.8h, v8.8h, v9.8h + uzp2 v19.8h, v8.8h, v9.8h + uzp1 v20.8h, v16.8h, v17.8h + uzp2 v21.8h, v16.8h, v17.8h + smull v26.4s, v18.4h, v20.4h + smull2 v27.4s, v18.8h, v20.8h + smull v23.4s, v19.4h, v21.4h + smull2 v24.4s, v19.8h, v21.8h + xtn v25.4h, v23.4s + xtn2 v25.8h, v24.4s + mul v25.8h, v25.8h, v1.h[1] + smlsl v23.4s, v25.4h, v1.h[0] + smlsl2 v24.4s, v25.8h, v1.h[0] + shrn v22.4h, v23.4s, #16 + shrn2 v22.8h, v24.4s, #16 + smlal v26.4s, v22.4h, v0.4h + smlal2 v27.4s, v22.8h, v0.8h + xtn v24.4h, v26.4s + xtn2 v24.8h, v27.4s + mul v24.8h, v24.8h, v1.h[1] + smlsl v26.4s, v24.4h, v1.h[0] + smlsl2 v27.4s, v24.8h, v1.h[0] + shrn v22.4h, v26.4s, #16 + shrn2 v22.8h, v27.4s, #16 + smull v26.4s, v18.4h, v21.4h + smull2 v27.4s, v18.8h, v21.8h 
+ smlal v26.4s, v19.4h, v20.4h + smlal2 v27.4s, v19.8h, v20.8h + xtn v24.4h, v26.4s + xtn2 v24.8h, v27.4s + mul v24.8h, v24.8h, v1.h[1] + smlsl v26.4s, v24.4h, v1.h[0] + smlsl2 v27.4s, v24.8h, v1.h[0] + shrn v23.4h, v26.4s, #16 + shrn2 v23.8h, v27.4s, #16 + zip1 v24.8h, v22.8h, v23.8h + zip2 v25.8h, v22.8h, v23.8h + stp q24, q25, [x0, #352] + ldp q2, q3, [x1, #384] + ldp q4, q5, [x1, #416] + ldp q6, q7, [x1, #448] + ldp q8, q9, [x1, #480] + ldp q10, q11, [x2, #384] + ldp q12, q13, [x2, #416] + ldp q14, q15, [x2, #448] + ldp q16, q17, [x2, #480] + ldr q0, [x3, #192] + uzp1 v18.8h, v2.8h, v3.8h + uzp2 v19.8h, v2.8h, v3.8h + uzp1 v20.8h, v10.8h, v11.8h + uzp2 v21.8h, v10.8h, v11.8h + smull v26.4s, v18.4h, v20.4h + smull2 v27.4s, v18.8h, v20.8h + smull v23.4s, v19.4h, v21.4h + smull2 v24.4s, v19.8h, v21.8h + xtn v25.4h, v23.4s + xtn2 v25.8h, v24.4s + mul v25.8h, v25.8h, v1.h[1] + smlsl v23.4s, v25.4h, v1.h[0] + smlsl2 v24.4s, v25.8h, v1.h[0] + shrn v22.4h, v23.4s, #16 + shrn2 v22.8h, v24.4s, #16 + smlal v26.4s, v22.4h, v0.4h + smlal2 v27.4s, v22.8h, v0.8h + xtn v24.4h, v26.4s + xtn2 v24.8h, v27.4s + mul v24.8h, v24.8h, v1.h[1] + smlsl v26.4s, v24.4h, v1.h[0] + smlsl2 v27.4s, v24.8h, v1.h[0] + shrn v22.4h, v26.4s, #16 + shrn2 v22.8h, v27.4s, #16 + smull v26.4s, v18.4h, v21.4h + smull2 v27.4s, v18.8h, v21.8h + smlal v26.4s, v19.4h, v20.4h + smlal2 v27.4s, v19.8h, v20.8h + xtn v24.4h, v26.4s + xtn2 v24.8h, v27.4s + mul v24.8h, v24.8h, v1.h[1] + smlsl v26.4s, v24.4h, v1.h[0] + smlsl2 v27.4s, v24.8h, v1.h[0] + shrn v23.4h, v26.4s, #16 + shrn2 v23.8h, v27.4s, #16 + zip1 v24.8h, v22.8h, v23.8h + zip2 v25.8h, v22.8h, v23.8h + stp q24, q25, [x0, #384] + ldr q0, [x3, #208] + uzp1 v18.8h, v4.8h, v5.8h + uzp2 v19.8h, v4.8h, v5.8h + uzp1 v20.8h, v12.8h, v13.8h + uzp2 v21.8h, v12.8h, v13.8h + smull v26.4s, v18.4h, v20.4h + smull2 v27.4s, v18.8h, v20.8h + smull v23.4s, v19.4h, v21.4h + smull2 v24.4s, v19.8h, v21.8h + xtn v25.4h, v23.4s + xtn2 v25.8h, v24.4s + mul v25.8h, v25.8h, v1.h[1] + smlsl v23.4s, v25.4h, v1.h[0] + smlsl2 v24.4s, v25.8h, v1.h[0] + shrn v22.4h, v23.4s, #16 + shrn2 v22.8h, v24.4s, #16 + smlal v26.4s, v22.4h, v0.4h + smlal2 v27.4s, v22.8h, v0.8h + xtn v24.4h, v26.4s + xtn2 v24.8h, v27.4s + mul v24.8h, v24.8h, v1.h[1] + smlsl v26.4s, v24.4h, v1.h[0] + smlsl2 v27.4s, v24.8h, v1.h[0] + shrn v22.4h, v26.4s, #16 + shrn2 v22.8h, v27.4s, #16 + smull v26.4s, v18.4h, v21.4h + smull2 v27.4s, v18.8h, v21.8h + smlal v26.4s, v19.4h, v20.4h + smlal2 v27.4s, v19.8h, v20.8h + xtn v24.4h, v26.4s + xtn2 v24.8h, v27.4s + mul v24.8h, v24.8h, v1.h[1] + smlsl v26.4s, v24.4h, v1.h[0] + smlsl2 v27.4s, v24.8h, v1.h[0] + shrn v23.4h, v26.4s, #16 + shrn2 v23.8h, v27.4s, #16 + zip1 v24.8h, v22.8h, v23.8h + zip2 v25.8h, v22.8h, v23.8h + stp q24, q25, [x0, #416] + ldr q0, [x3, #224] + uzp1 v18.8h, v6.8h, v7.8h + uzp2 v19.8h, v6.8h, v7.8h + uzp1 v20.8h, v14.8h, v15.8h + uzp2 v21.8h, v14.8h, v15.8h + smull v26.4s, v18.4h, v20.4h + smull2 v27.4s, v18.8h, v20.8h + smull v23.4s, v19.4h, v21.4h + smull2 v24.4s, v19.8h, v21.8h + xtn v25.4h, v23.4s + xtn2 v25.8h, v24.4s + mul v25.8h, v25.8h, v1.h[1] + smlsl v23.4s, v25.4h, v1.h[0] + smlsl2 v24.4s, v25.8h, v1.h[0] + shrn v22.4h, v23.4s, #16 + shrn2 v22.8h, v24.4s, #16 + smlal v26.4s, v22.4h, v0.4h + smlal2 v27.4s, v22.8h, v0.8h + xtn v24.4h, v26.4s + xtn2 v24.8h, v27.4s + mul v24.8h, v24.8h, v1.h[1] + smlsl v26.4s, v24.4h, v1.h[0] + smlsl2 v27.4s, v24.8h, v1.h[0] + shrn v22.4h, v26.4s, #16 + shrn2 v22.8h, v27.4s, #16 + smull v26.4s, v18.4h, v21.4h + smull2 v27.4s, v18.8h, 
v21.8h + smlal v26.4s, v19.4h, v20.4h + smlal2 v27.4s, v19.8h, v20.8h + xtn v24.4h, v26.4s + xtn2 v24.8h, v27.4s + mul v24.8h, v24.8h, v1.h[1] + smlsl v26.4s, v24.4h, v1.h[0] + smlsl2 v27.4s, v24.8h, v1.h[0] + shrn v23.4h, v26.4s, #16 + shrn2 v23.8h, v27.4s, #16 + zip1 v24.8h, v22.8h, v23.8h + zip2 v25.8h, v22.8h, v23.8h + stp q24, q25, [x0, #448] + ldr q0, [x3, #240] + uzp1 v18.8h, v8.8h, v9.8h + uzp2 v19.8h, v8.8h, v9.8h + uzp1 v20.8h, v16.8h, v17.8h + uzp2 v21.8h, v16.8h, v17.8h + smull v26.4s, v18.4h, v20.4h + smull2 v27.4s, v18.8h, v20.8h + smull v23.4s, v19.4h, v21.4h + smull2 v24.4s, v19.8h, v21.8h + xtn v25.4h, v23.4s + xtn2 v25.8h, v24.4s + mul v25.8h, v25.8h, v1.h[1] + smlsl v23.4s, v25.4h, v1.h[0] + smlsl2 v24.4s, v25.8h, v1.h[0] + shrn v22.4h, v23.4s, #16 + shrn2 v22.8h, v24.4s, #16 + smlal v26.4s, v22.4h, v0.4h + smlal2 v27.4s, v22.8h, v0.8h + xtn v24.4h, v26.4s + xtn2 v24.8h, v27.4s + mul v24.8h, v24.8h, v1.h[1] + smlsl v26.4s, v24.4h, v1.h[0] + smlsl2 v27.4s, v24.8h, v1.h[0] + shrn v22.4h, v26.4s, #16 + shrn2 v22.8h, v27.4s, #16 + smull v26.4s, v18.4h, v21.4h + smull2 v27.4s, v18.8h, v21.8h + smlal v26.4s, v19.4h, v20.4h + smlal2 v27.4s, v19.8h, v20.8h + xtn v24.4h, v26.4s + xtn2 v24.8h, v27.4s + mul v24.8h, v24.8h, v1.h[1] + smlsl v26.4s, v24.4h, v1.h[0] + smlsl2 v27.4s, v24.8h, v1.h[0] + shrn v23.4h, v26.4s, #16 + shrn2 v23.8h, v27.4s, #16 + zip1 v24.8h, v22.8h, v23.8h + zip2 v25.8h, v22.8h, v23.8h + stp q24, q25, [x0, #480] + ldp d8, d9, [x29, #16] + ldp d10, d11, [x29, #32] + ldp d12, d13, [x29, #48] + ldp d14, d15, [x29, #64] + ldp x29, x30, [sp], #0x50 + ret +#ifndef __APPLE__ + .size kyber_basemul_mont,.-kyber_basemul_mont +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.text +.globl kyber_basemul_mont_add +.type kyber_basemul_mont_add,@function +.align 2 +kyber_basemul_mont_add: +#else +.section __TEXT,__text +.globl _kyber_basemul_mont_add +.p2align 2 +_kyber_basemul_mont_add: +#endif /* __APPLE__ */ + stp x29, x30, [sp, #-80]!
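+ # kyber_basemul_mont_add(r, a, b): the same base multiplication as
+ # kyber_basemul_mont above, but accumulating. Each block of r is first
+ # loaded into q28/q29, the zipped Montgomery-reduced products are added
+ # on, and the sums are stored back, giving r += a*b in the NTT domain
+ # (as used when summing the terms of a matrix/vector product; the
+ # accumulated coefficients are reduced separately by the callers).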
+ add x29, sp, #0 + stp d8, d9, [x29, #16] + stp d10, d11, [x29, #32] + stp d12, d13, [x29, #48] + stp d14, d15, [x29, #64] +#ifndef __APPLE__ + adrp x3, L_kyber_aarch64_zetas_mul + add x3, x3, :lo12:L_kyber_aarch64_zetas_mul +#else + adrp x3, L_kyber_aarch64_zetas_mul@PAGE + add x3, x3, :lo12:L_kyber_aarch64_zetas_mul@PAGEOFF +#endif /* __APPLE__ */ +#ifndef __APPLE__ + adrp x4, L_kyber_aarch64_consts + add x4, x4, :lo12:L_kyber_aarch64_consts +#else + adrp x4, L_kyber_aarch64_consts@PAGE + add x4, x4, :lo12:L_kyber_aarch64_consts@PAGEOFF +#endif /* __APPLE__ */ + ldr q1, [x4] + ldp q2, q3, [x1] + ldp q4, q5, [x1, #32] + ldp q6, q7, [x1, #64] + ldp q8, q9, [x1, #96] + ldp q10, q11, [x2] + ldp q12, q13, [x2, #32] + ldp q14, q15, [x2, #64] + ldp q16, q17, [x2, #96] + ldp q28, q29, [x0] + ldr q0, [x3] + uzp1 v18.8h, v2.8h, v3.8h + uzp2 v19.8h, v2.8h, v3.8h + uzp1 v20.8h, v10.8h, v11.8h + uzp2 v21.8h, v10.8h, v11.8h + smull v26.4s, v18.4h, v20.4h + smull2 v27.4s, v18.8h, v20.8h + smull v23.4s, v19.4h, v21.4h + smull2 v24.4s, v19.8h, v21.8h + xtn v25.4h, v23.4s + xtn2 v25.8h, v24.4s + mul v25.8h, v25.8h, v1.h[1] + smlsl v23.4s, v25.4h, v1.h[0] + smlsl2 v24.4s, v25.8h, v1.h[0] + shrn v22.4h, v23.4s, #16 + shrn2 v22.8h, v24.4s, #16 + smlal v26.4s, v22.4h, v0.4h + smlal2 v27.4s, v22.8h, v0.8h + xtn v24.4h, v26.4s + xtn2 v24.8h, v27.4s + mul v24.8h, v24.8h, v1.h[1] + smlsl v26.4s, v24.4h, v1.h[0] + smlsl2 v27.4s, v24.8h, v1.h[0] + shrn v22.4h, v26.4s, #16 + shrn2 v22.8h, v27.4s, #16 + smull v26.4s, v18.4h, v21.4h + smull2 v27.4s, v18.8h, v21.8h + smlal v26.4s, v19.4h, v20.4h + smlal2 v27.4s, v19.8h, v20.8h + xtn v24.4h, v26.4s + xtn2 v24.8h, v27.4s + mul v24.8h, v24.8h, v1.h[1] + smlsl v26.4s, v24.4h, v1.h[0] + smlsl2 v27.4s, v24.8h, v1.h[0] + shrn v23.4h, v26.4s, #16 + shrn2 v23.8h, v27.4s, #16 + zip1 v24.8h, v22.8h, v23.8h + zip2 v25.8h, v22.8h, v23.8h + add v28.8h, v28.8h, v24.8h + add v29.8h, v29.8h, v25.8h + stp q28, q29, [x0] + ldp q28, q29, [x0, #32] + ldr q0, [x3, #16] + uzp1 v18.8h, v4.8h, v5.8h + uzp2 v19.8h, v4.8h, v5.8h + uzp1 v20.8h, v12.8h, v13.8h + uzp2 v21.8h, v12.8h, v13.8h + smull v26.4s, v18.4h, v20.4h + smull2 v27.4s, v18.8h, v20.8h + smull v23.4s, v19.4h, v21.4h + smull2 v24.4s, v19.8h, v21.8h + xtn v25.4h, v23.4s + xtn2 v25.8h, v24.4s + mul v25.8h, v25.8h, v1.h[1] + smlsl v23.4s, v25.4h, v1.h[0] + smlsl2 v24.4s, v25.8h, v1.h[0] + shrn v22.4h, v23.4s, #16 + shrn2 v22.8h, v24.4s, #16 + smlal v26.4s, v22.4h, v0.4h + smlal2 v27.4s, v22.8h, v0.8h + xtn v24.4h, v26.4s + xtn2 v24.8h, v27.4s + mul v24.8h, v24.8h, v1.h[1] + smlsl v26.4s, v24.4h, v1.h[0] + smlsl2 v27.4s, v24.8h, v1.h[0] + shrn v22.4h, v26.4s, #16 + shrn2 v22.8h, v27.4s, #16 + smull v26.4s, v18.4h, v21.4h + smull2 v27.4s, v18.8h, v21.8h + smlal v26.4s, v19.4h, v20.4h + smlal2 v27.4s, v19.8h, v20.8h + xtn v24.4h, v26.4s + xtn2 v24.8h, v27.4s + mul v24.8h, v24.8h, v1.h[1] + smlsl v26.4s, v24.4h, v1.h[0] + smlsl2 v27.4s, v24.8h, v1.h[0] + shrn v23.4h, v26.4s, #16 + shrn2 v23.8h, v27.4s, #16 + zip1 v24.8h, v22.8h, v23.8h + zip2 v25.8h, v22.8h, v23.8h + add v28.8h, v28.8h, v24.8h + add v29.8h, v29.8h, v25.8h + stp q28, q29, [x0, #32] + ldp q28, q29, [x0, #64] + ldr q0, [x3, #32] + uzp1 v18.8h, v6.8h, v7.8h + uzp2 v19.8h, v6.8h, v7.8h + uzp1 v20.8h, v14.8h, v15.8h + uzp2 v21.8h, v14.8h, v15.8h + smull v26.4s, v18.4h, v20.4h + smull2 v27.4s, v18.8h, v20.8h + smull v23.4s, v19.4h, v21.4h + smull2 v24.4s, v19.8h, v21.8h + xtn v25.4h, v23.4s + xtn2 v25.8h, v24.4s + mul v25.8h, v25.8h, v1.h[1] + smlsl v23.4s, v25.4h, v1.h[0] + 
smlsl2 v24.4s, v25.8h, v1.h[0] + shrn v22.4h, v23.4s, #16 + shrn2 v22.8h, v24.4s, #16 + smlal v26.4s, v22.4h, v0.4h + smlal2 v27.4s, v22.8h, v0.8h + xtn v24.4h, v26.4s + xtn2 v24.8h, v27.4s + mul v24.8h, v24.8h, v1.h[1] + smlsl v26.4s, v24.4h, v1.h[0] + smlsl2 v27.4s, v24.8h, v1.h[0] + shrn v22.4h, v26.4s, #16 + shrn2 v22.8h, v27.4s, #16 + smull v26.4s, v18.4h, v21.4h + smull2 v27.4s, v18.8h, v21.8h + smlal v26.4s, v19.4h, v20.4h + smlal2 v27.4s, v19.8h, v20.8h + xtn v24.4h, v26.4s + xtn2 v24.8h, v27.4s + mul v24.8h, v24.8h, v1.h[1] + smlsl v26.4s, v24.4h, v1.h[0] + smlsl2 v27.4s, v24.8h, v1.h[0] + shrn v23.4h, v26.4s, #16 + shrn2 v23.8h, v27.4s, #16 + zip1 v24.8h, v22.8h, v23.8h + zip2 v25.8h, v22.8h, v23.8h + add v28.8h, v28.8h, v24.8h + add v29.8h, v29.8h, v25.8h + stp q28, q29, [x0, #64] + ldp q28, q29, [x0, #96] + ldr q0, [x3, #48] + uzp1 v18.8h, v8.8h, v9.8h + uzp2 v19.8h, v8.8h, v9.8h + uzp1 v20.8h, v16.8h, v17.8h + uzp2 v21.8h, v16.8h, v17.8h + smull v26.4s, v18.4h, v20.4h + smull2 v27.4s, v18.8h, v20.8h + smull v23.4s, v19.4h, v21.4h + smull2 v24.4s, v19.8h, v21.8h + xtn v25.4h, v23.4s + xtn2 v25.8h, v24.4s + mul v25.8h, v25.8h, v1.h[1] + smlsl v23.4s, v25.4h, v1.h[0] + smlsl2 v24.4s, v25.8h, v1.h[0] + shrn v22.4h, v23.4s, #16 + shrn2 v22.8h, v24.4s, #16 + smlal v26.4s, v22.4h, v0.4h + smlal2 v27.4s, v22.8h, v0.8h + xtn v24.4h, v26.4s + xtn2 v24.8h, v27.4s + mul v24.8h, v24.8h, v1.h[1] + smlsl v26.4s, v24.4h, v1.h[0] + smlsl2 v27.4s, v24.8h, v1.h[0] + shrn v22.4h, v26.4s, #16 + shrn2 v22.8h, v27.4s, #16 + smull v26.4s, v18.4h, v21.4h + smull2 v27.4s, v18.8h, v21.8h + smlal v26.4s, v19.4h, v20.4h + smlal2 v27.4s, v19.8h, v20.8h + xtn v24.4h, v26.4s + xtn2 v24.8h, v27.4s + mul v24.8h, v24.8h, v1.h[1] + smlsl v26.4s, v24.4h, v1.h[0] + smlsl2 v27.4s, v24.8h, v1.h[0] + shrn v23.4h, v26.4s, #16 + shrn2 v23.8h, v27.4s, #16 + zip1 v24.8h, v22.8h, v23.8h + zip2 v25.8h, v22.8h, v23.8h + add v28.8h, v28.8h, v24.8h + add v29.8h, v29.8h, v25.8h + stp q28, q29, [x0, #96] + ldp q2, q3, [x1, #128] + ldp q4, q5, [x1, #160] + ldp q6, q7, [x1, #192] + ldp q8, q9, [x1, #224] + ldp q10, q11, [x2, #128] + ldp q12, q13, [x2, #160] + ldp q14, q15, [x2, #192] + ldp q16, q17, [x2, #224] + ldp q28, q29, [x0, #128] + ldr q0, [x3, #64] + uzp1 v18.8h, v2.8h, v3.8h + uzp2 v19.8h, v2.8h, v3.8h + uzp1 v20.8h, v10.8h, v11.8h + uzp2 v21.8h, v10.8h, v11.8h + smull v26.4s, v18.4h, v20.4h + smull2 v27.4s, v18.8h, v20.8h + smull v23.4s, v19.4h, v21.4h + smull2 v24.4s, v19.8h, v21.8h + xtn v25.4h, v23.4s + xtn2 v25.8h, v24.4s + mul v25.8h, v25.8h, v1.h[1] + smlsl v23.4s, v25.4h, v1.h[0] + smlsl2 v24.4s, v25.8h, v1.h[0] + shrn v22.4h, v23.4s, #16 + shrn2 v22.8h, v24.4s, #16 + smlal v26.4s, v22.4h, v0.4h + smlal2 v27.4s, v22.8h, v0.8h + xtn v24.4h, v26.4s + xtn2 v24.8h, v27.4s + mul v24.8h, v24.8h, v1.h[1] + smlsl v26.4s, v24.4h, v1.h[0] + smlsl2 v27.4s, v24.8h, v1.h[0] + shrn v22.4h, v26.4s, #16 + shrn2 v22.8h, v27.4s, #16 + smull v26.4s, v18.4h, v21.4h + smull2 v27.4s, v18.8h, v21.8h + smlal v26.4s, v19.4h, v20.4h + smlal2 v27.4s, v19.8h, v20.8h + xtn v24.4h, v26.4s + xtn2 v24.8h, v27.4s + mul v24.8h, v24.8h, v1.h[1] + smlsl v26.4s, v24.4h, v1.h[0] + smlsl2 v27.4s, v24.8h, v1.h[0] + shrn v23.4h, v26.4s, #16 + shrn2 v23.8h, v27.4s, #16 + zip1 v24.8h, v22.8h, v23.8h + zip2 v25.8h, v22.8h, v23.8h + add v28.8h, v28.8h, v24.8h + add v29.8h, v29.8h, v25.8h + stp q28, q29, [x0, #128] + ldp q28, q29, [x0, #160] + ldr q0, [x3, #80] + uzp1 v18.8h, v4.8h, v5.8h + uzp2 v19.8h, v4.8h, v5.8h + uzp1 v20.8h, v12.8h, v13.8h + uzp2 
v21.8h, v12.8h, v13.8h + smull v26.4s, v18.4h, v20.4h + smull2 v27.4s, v18.8h, v20.8h + smull v23.4s, v19.4h, v21.4h + smull2 v24.4s, v19.8h, v21.8h + xtn v25.4h, v23.4s + xtn2 v25.8h, v24.4s + mul v25.8h, v25.8h, v1.h[1] + smlsl v23.4s, v25.4h, v1.h[0] + smlsl2 v24.4s, v25.8h, v1.h[0] + shrn v22.4h, v23.4s, #16 + shrn2 v22.8h, v24.4s, #16 + smlal v26.4s, v22.4h, v0.4h + smlal2 v27.4s, v22.8h, v0.8h + xtn v24.4h, v26.4s + xtn2 v24.8h, v27.4s + mul v24.8h, v24.8h, v1.h[1] + smlsl v26.4s, v24.4h, v1.h[0] + smlsl2 v27.4s, v24.8h, v1.h[0] + shrn v22.4h, v26.4s, #16 + shrn2 v22.8h, v27.4s, #16 + smull v26.4s, v18.4h, v21.4h + smull2 v27.4s, v18.8h, v21.8h + smlal v26.4s, v19.4h, v20.4h + smlal2 v27.4s, v19.8h, v20.8h + xtn v24.4h, v26.4s + xtn2 v24.8h, v27.4s + mul v24.8h, v24.8h, v1.h[1] + smlsl v26.4s, v24.4h, v1.h[0] + smlsl2 v27.4s, v24.8h, v1.h[0] + shrn v23.4h, v26.4s, #16 + shrn2 v23.8h, v27.4s, #16 + zip1 v24.8h, v22.8h, v23.8h + zip2 v25.8h, v22.8h, v23.8h + add v28.8h, v28.8h, v24.8h + add v29.8h, v29.8h, v25.8h + stp q28, q29, [x0, #160] + ldp q28, q29, [x0, #192] + ldr q0, [x3, #96] + uzp1 v18.8h, v6.8h, v7.8h + uzp2 v19.8h, v6.8h, v7.8h + uzp1 v20.8h, v14.8h, v15.8h + uzp2 v21.8h, v14.8h, v15.8h + smull v26.4s, v18.4h, v20.4h + smull2 v27.4s, v18.8h, v20.8h + smull v23.4s, v19.4h, v21.4h + smull2 v24.4s, v19.8h, v21.8h + xtn v25.4h, v23.4s + xtn2 v25.8h, v24.4s + mul v25.8h, v25.8h, v1.h[1] + smlsl v23.4s, v25.4h, v1.h[0] + smlsl2 v24.4s, v25.8h, v1.h[0] + shrn v22.4h, v23.4s, #16 + shrn2 v22.8h, v24.4s, #16 + smlal v26.4s, v22.4h, v0.4h + smlal2 v27.4s, v22.8h, v0.8h + xtn v24.4h, v26.4s + xtn2 v24.8h, v27.4s + mul v24.8h, v24.8h, v1.h[1] + smlsl v26.4s, v24.4h, v1.h[0] + smlsl2 v27.4s, v24.8h, v1.h[0] + shrn v22.4h, v26.4s, #16 + shrn2 v22.8h, v27.4s, #16 + smull v26.4s, v18.4h, v21.4h + smull2 v27.4s, v18.8h, v21.8h + smlal v26.4s, v19.4h, v20.4h + smlal2 v27.4s, v19.8h, v20.8h + xtn v24.4h, v26.4s + xtn2 v24.8h, v27.4s + mul v24.8h, v24.8h, v1.h[1] + smlsl v26.4s, v24.4h, v1.h[0] + smlsl2 v27.4s, v24.8h, v1.h[0] + shrn v23.4h, v26.4s, #16 + shrn2 v23.8h, v27.4s, #16 + zip1 v24.8h, v22.8h, v23.8h + zip2 v25.8h, v22.8h, v23.8h + add v28.8h, v28.8h, v24.8h + add v29.8h, v29.8h, v25.8h + stp q28, q29, [x0, #192] + ldp q28, q29, [x0, #224] + ldr q0, [x3, #112] + uzp1 v18.8h, v8.8h, v9.8h + uzp2 v19.8h, v8.8h, v9.8h + uzp1 v20.8h, v16.8h, v17.8h + uzp2 v21.8h, v16.8h, v17.8h + smull v26.4s, v18.4h, v20.4h + smull2 v27.4s, v18.8h, v20.8h + smull v23.4s, v19.4h, v21.4h + smull2 v24.4s, v19.8h, v21.8h + xtn v25.4h, v23.4s + xtn2 v25.8h, v24.4s + mul v25.8h, v25.8h, v1.h[1] + smlsl v23.4s, v25.4h, v1.h[0] + smlsl2 v24.4s, v25.8h, v1.h[0] + shrn v22.4h, v23.4s, #16 + shrn2 v22.8h, v24.4s, #16 + smlal v26.4s, v22.4h, v0.4h + smlal2 v27.4s, v22.8h, v0.8h + xtn v24.4h, v26.4s + xtn2 v24.8h, v27.4s + mul v24.8h, v24.8h, v1.h[1] + smlsl v26.4s, v24.4h, v1.h[0] + smlsl2 v27.4s, v24.8h, v1.h[0] + shrn v22.4h, v26.4s, #16 + shrn2 v22.8h, v27.4s, #16 + smull v26.4s, v18.4h, v21.4h + smull2 v27.4s, v18.8h, v21.8h + smlal v26.4s, v19.4h, v20.4h + smlal2 v27.4s, v19.8h, v20.8h + xtn v24.4h, v26.4s + xtn2 v24.8h, v27.4s + mul v24.8h, v24.8h, v1.h[1] + smlsl v26.4s, v24.4h, v1.h[0] + smlsl2 v27.4s, v24.8h, v1.h[0] + shrn v23.4h, v26.4s, #16 + shrn2 v23.8h, v27.4s, #16 + zip1 v24.8h, v22.8h, v23.8h + zip2 v25.8h, v22.8h, v23.8h + add v28.8h, v28.8h, v24.8h + add v29.8h, v29.8h, v25.8h + stp q28, q29, [x0, #224] + ldp q2, q3, [x1, #256] + ldp q4, q5, [x1, #288] + ldp q6, q7, [x1, #320] + ldp q8, q9, [x1, 
#352] + ldp q10, q11, [x2, #256] + ldp q12, q13, [x2, #288] + ldp q14, q15, [x2, #320] + ldp q16, q17, [x2, #352] + ldp q28, q29, [x0, #256] + ldr q0, [x3, #128] + uzp1 v18.8h, v2.8h, v3.8h + uzp2 v19.8h, v2.8h, v3.8h + uzp1 v20.8h, v10.8h, v11.8h + uzp2 v21.8h, v10.8h, v11.8h + smull v26.4s, v18.4h, v20.4h + smull2 v27.4s, v18.8h, v20.8h + smull v23.4s, v19.4h, v21.4h + smull2 v24.4s, v19.8h, v21.8h + xtn v25.4h, v23.4s + xtn2 v25.8h, v24.4s + mul v25.8h, v25.8h, v1.h[1] + smlsl v23.4s, v25.4h, v1.h[0] + smlsl2 v24.4s, v25.8h, v1.h[0] + shrn v22.4h, v23.4s, #16 + shrn2 v22.8h, v24.4s, #16 + smlal v26.4s, v22.4h, v0.4h + smlal2 v27.4s, v22.8h, v0.8h + xtn v24.4h, v26.4s + xtn2 v24.8h, v27.4s + mul v24.8h, v24.8h, v1.h[1] + smlsl v26.4s, v24.4h, v1.h[0] + smlsl2 v27.4s, v24.8h, v1.h[0] + shrn v22.4h, v26.4s, #16 + shrn2 v22.8h, v27.4s, #16 + smull v26.4s, v18.4h, v21.4h + smull2 v27.4s, v18.8h, v21.8h + smlal v26.4s, v19.4h, v20.4h + smlal2 v27.4s, v19.8h, v20.8h + xtn v24.4h, v26.4s + xtn2 v24.8h, v27.4s + mul v24.8h, v24.8h, v1.h[1] + smlsl v26.4s, v24.4h, v1.h[0] + smlsl2 v27.4s, v24.8h, v1.h[0] + shrn v23.4h, v26.4s, #16 + shrn2 v23.8h, v27.4s, #16 + zip1 v24.8h, v22.8h, v23.8h + zip2 v25.8h, v22.8h, v23.8h + add v28.8h, v28.8h, v24.8h + add v29.8h, v29.8h, v25.8h + stp q28, q29, [x0, #256] + ldp q28, q29, [x0, #288] + ldr q0, [x3, #144] + uzp1 v18.8h, v4.8h, v5.8h + uzp2 v19.8h, v4.8h, v5.8h + uzp1 v20.8h, v12.8h, v13.8h + uzp2 v21.8h, v12.8h, v13.8h + smull v26.4s, v18.4h, v20.4h + smull2 v27.4s, v18.8h, v20.8h + smull v23.4s, v19.4h, v21.4h + smull2 v24.4s, v19.8h, v21.8h + xtn v25.4h, v23.4s + xtn2 v25.8h, v24.4s + mul v25.8h, v25.8h, v1.h[1] + smlsl v23.4s, v25.4h, v1.h[0] + smlsl2 v24.4s, v25.8h, v1.h[0] + shrn v22.4h, v23.4s, #16 + shrn2 v22.8h, v24.4s, #16 + smlal v26.4s, v22.4h, v0.4h + smlal2 v27.4s, v22.8h, v0.8h + xtn v24.4h, v26.4s + xtn2 v24.8h, v27.4s + mul v24.8h, v24.8h, v1.h[1] + smlsl v26.4s, v24.4h, v1.h[0] + smlsl2 v27.4s, v24.8h, v1.h[0] + shrn v22.4h, v26.4s, #16 + shrn2 v22.8h, v27.4s, #16 + smull v26.4s, v18.4h, v21.4h + smull2 v27.4s, v18.8h, v21.8h + smlal v26.4s, v19.4h, v20.4h + smlal2 v27.4s, v19.8h, v20.8h + xtn v24.4h, v26.4s + xtn2 v24.8h, v27.4s + mul v24.8h, v24.8h, v1.h[1] + smlsl v26.4s, v24.4h, v1.h[0] + smlsl2 v27.4s, v24.8h, v1.h[0] + shrn v23.4h, v26.4s, #16 + shrn2 v23.8h, v27.4s, #16 + zip1 v24.8h, v22.8h, v23.8h + zip2 v25.8h, v22.8h, v23.8h + add v28.8h, v28.8h, v24.8h + add v29.8h, v29.8h, v25.8h + stp q28, q29, [x0, #288] + ldp q28, q29, [x0, #320] + ldr q0, [x3, #160] + uzp1 v18.8h, v6.8h, v7.8h + uzp2 v19.8h, v6.8h, v7.8h + uzp1 v20.8h, v14.8h, v15.8h + uzp2 v21.8h, v14.8h, v15.8h + smull v26.4s, v18.4h, v20.4h + smull2 v27.4s, v18.8h, v20.8h + smull v23.4s, v19.4h, v21.4h + smull2 v24.4s, v19.8h, v21.8h + xtn v25.4h, v23.4s + xtn2 v25.8h, v24.4s + mul v25.8h, v25.8h, v1.h[1] + smlsl v23.4s, v25.4h, v1.h[0] + smlsl2 v24.4s, v25.8h, v1.h[0] + shrn v22.4h, v23.4s, #16 + shrn2 v22.8h, v24.4s, #16 + smlal v26.4s, v22.4h, v0.4h + smlal2 v27.4s, v22.8h, v0.8h + xtn v24.4h, v26.4s + xtn2 v24.8h, v27.4s + mul v24.8h, v24.8h, v1.h[1] + smlsl v26.4s, v24.4h, v1.h[0] + smlsl2 v27.4s, v24.8h, v1.h[0] + shrn v22.4h, v26.4s, #16 + shrn2 v22.8h, v27.4s, #16 + smull v26.4s, v18.4h, v21.4h + smull2 v27.4s, v18.8h, v21.8h + smlal v26.4s, v19.4h, v20.4h + smlal2 v27.4s, v19.8h, v20.8h + xtn v24.4h, v26.4s + xtn2 v24.8h, v27.4s + mul v24.8h, v24.8h, v1.h[1] + smlsl v26.4s, v24.4h, v1.h[0] + smlsl2 v27.4s, v24.8h, v1.h[0] + shrn v23.4h, v26.4s, #16 + shrn2 
v23.8h, v27.4s, #16 + zip1 v24.8h, v22.8h, v23.8h + zip2 v25.8h, v22.8h, v23.8h + add v28.8h, v28.8h, v24.8h + add v29.8h, v29.8h, v25.8h + stp q28, q29, [x0, #320] + ldp q28, q29, [x0, #352] + ldr q0, [x3, #176] + uzp1 v18.8h, v8.8h, v9.8h + uzp2 v19.8h, v8.8h, v9.8h + uzp1 v20.8h, v16.8h, v17.8h + uzp2 v21.8h, v16.8h, v17.8h + smull v26.4s, v18.4h, v20.4h + smull2 v27.4s, v18.8h, v20.8h + smull v23.4s, v19.4h, v21.4h + smull2 v24.4s, v19.8h, v21.8h + xtn v25.4h, v23.4s + xtn2 v25.8h, v24.4s + mul v25.8h, v25.8h, v1.h[1] + smlsl v23.4s, v25.4h, v1.h[0] + smlsl2 v24.4s, v25.8h, v1.h[0] + shrn v22.4h, v23.4s, #16 + shrn2 v22.8h, v24.4s, #16 + smlal v26.4s, v22.4h, v0.4h + smlal2 v27.4s, v22.8h, v0.8h + xtn v24.4h, v26.4s + xtn2 v24.8h, v27.4s + mul v24.8h, v24.8h, v1.h[1] + smlsl v26.4s, v24.4h, v1.h[0] + smlsl2 v27.4s, v24.8h, v1.h[0] + shrn v22.4h, v26.4s, #16 + shrn2 v22.8h, v27.4s, #16 + smull v26.4s, v18.4h, v21.4h + smull2 v27.4s, v18.8h, v21.8h + smlal v26.4s, v19.4h, v20.4h + smlal2 v27.4s, v19.8h, v20.8h + xtn v24.4h, v26.4s + xtn2 v24.8h, v27.4s + mul v24.8h, v24.8h, v1.h[1] + smlsl v26.4s, v24.4h, v1.h[0] + smlsl2 v27.4s, v24.8h, v1.h[0] + shrn v23.4h, v26.4s, #16 + shrn2 v23.8h, v27.4s, #16 + zip1 v24.8h, v22.8h, v23.8h + zip2 v25.8h, v22.8h, v23.8h + add v28.8h, v28.8h, v24.8h + add v29.8h, v29.8h, v25.8h + stp q28, q29, [x0, #352] + ldp q2, q3, [x1, #384] + ldp q4, q5, [x1, #416] + ldp q6, q7, [x1, #448] + ldp q8, q9, [x1, #480] + ldp q10, q11, [x2, #384] + ldp q12, q13, [x2, #416] + ldp q14, q15, [x2, #448] + ldp q16, q17, [x2, #480] + ldp q28, q29, [x0, #384] + ldr q0, [x3, #192] + uzp1 v18.8h, v2.8h, v3.8h + uzp2 v19.8h, v2.8h, v3.8h + uzp1 v20.8h, v10.8h, v11.8h + uzp2 v21.8h, v10.8h, v11.8h + smull v26.4s, v18.4h, v20.4h + smull2 v27.4s, v18.8h, v20.8h + smull v23.4s, v19.4h, v21.4h + smull2 v24.4s, v19.8h, v21.8h + xtn v25.4h, v23.4s + xtn2 v25.8h, v24.4s + mul v25.8h, v25.8h, v1.h[1] + smlsl v23.4s, v25.4h, v1.h[0] + smlsl2 v24.4s, v25.8h, v1.h[0] + shrn v22.4h, v23.4s, #16 + shrn2 v22.8h, v24.4s, #16 + smlal v26.4s, v22.4h, v0.4h + smlal2 v27.4s, v22.8h, v0.8h + xtn v24.4h, v26.4s + xtn2 v24.8h, v27.4s + mul v24.8h, v24.8h, v1.h[1] + smlsl v26.4s, v24.4h, v1.h[0] + smlsl2 v27.4s, v24.8h, v1.h[0] + shrn v22.4h, v26.4s, #16 + shrn2 v22.8h, v27.4s, #16 + smull v26.4s, v18.4h, v21.4h + smull2 v27.4s, v18.8h, v21.8h + smlal v26.4s, v19.4h, v20.4h + smlal2 v27.4s, v19.8h, v20.8h + xtn v24.4h, v26.4s + xtn2 v24.8h, v27.4s + mul v24.8h, v24.8h, v1.h[1] + smlsl v26.4s, v24.4h, v1.h[0] + smlsl2 v27.4s, v24.8h, v1.h[0] + shrn v23.4h, v26.4s, #16 + shrn2 v23.8h, v27.4s, #16 + zip1 v24.8h, v22.8h, v23.8h + zip2 v25.8h, v22.8h, v23.8h + add v28.8h, v28.8h, v24.8h + add v29.8h, v29.8h, v25.8h + stp q28, q29, [x0, #384] + ldp q28, q29, [x0, #416] + ldr q0, [x3, #208] + uzp1 v18.8h, v4.8h, v5.8h + uzp2 v19.8h, v4.8h, v5.8h + uzp1 v20.8h, v12.8h, v13.8h + uzp2 v21.8h, v12.8h, v13.8h + smull v26.4s, v18.4h, v20.4h + smull2 v27.4s, v18.8h, v20.8h + smull v23.4s, v19.4h, v21.4h + smull2 v24.4s, v19.8h, v21.8h + xtn v25.4h, v23.4s + xtn2 v25.8h, v24.4s + mul v25.8h, v25.8h, v1.h[1] + smlsl v23.4s, v25.4h, v1.h[0] + smlsl2 v24.4s, v25.8h, v1.h[0] + shrn v22.4h, v23.4s, #16 + shrn2 v22.8h, v24.4s, #16 + smlal v26.4s, v22.4h, v0.4h + smlal2 v27.4s, v22.8h, v0.8h + xtn v24.4h, v26.4s + xtn2 v24.8h, v27.4s + mul v24.8h, v24.8h, v1.h[1] + smlsl v26.4s, v24.4h, v1.h[0] + smlsl2 v27.4s, v24.8h, v1.h[0] + shrn v22.4h, v26.4s, #16 + shrn2 v22.8h, v27.4s, #16 + smull v26.4s, v18.4h, v21.4h + smull2 
v27.4s, v18.8h, v21.8h + smlal v26.4s, v19.4h, v20.4h + smlal2 v27.4s, v19.8h, v20.8h + xtn v24.4h, v26.4s + xtn2 v24.8h, v27.4s + mul v24.8h, v24.8h, v1.h[1] + smlsl v26.4s, v24.4h, v1.h[0] + smlsl2 v27.4s, v24.8h, v1.h[0] + shrn v23.4h, v26.4s, #16 + shrn2 v23.8h, v27.4s, #16 + zip1 v24.8h, v22.8h, v23.8h + zip2 v25.8h, v22.8h, v23.8h + add v28.8h, v28.8h, v24.8h + add v29.8h, v29.8h, v25.8h + stp q28, q29, [x0, #416] + ldp q28, q29, [x0, #448] + ldr q0, [x3, #224] + uzp1 v18.8h, v6.8h, v7.8h + uzp2 v19.8h, v6.8h, v7.8h + uzp1 v20.8h, v14.8h, v15.8h + uzp2 v21.8h, v14.8h, v15.8h + smull v26.4s, v18.4h, v20.4h + smull2 v27.4s, v18.8h, v20.8h + smull v23.4s, v19.4h, v21.4h + smull2 v24.4s, v19.8h, v21.8h + xtn v25.4h, v23.4s + xtn2 v25.8h, v24.4s + mul v25.8h, v25.8h, v1.h[1] + smlsl v23.4s, v25.4h, v1.h[0] + smlsl2 v24.4s, v25.8h, v1.h[0] + shrn v22.4h, v23.4s, #16 + shrn2 v22.8h, v24.4s, #16 + smlal v26.4s, v22.4h, v0.4h + smlal2 v27.4s, v22.8h, v0.8h + xtn v24.4h, v26.4s + xtn2 v24.8h, v27.4s + mul v24.8h, v24.8h, v1.h[1] + smlsl v26.4s, v24.4h, v1.h[0] + smlsl2 v27.4s, v24.8h, v1.h[0] + shrn v22.4h, v26.4s, #16 + shrn2 v22.8h, v27.4s, #16 + smull v26.4s, v18.4h, v21.4h + smull2 v27.4s, v18.8h, v21.8h + smlal v26.4s, v19.4h, v20.4h + smlal2 v27.4s, v19.8h, v20.8h + xtn v24.4h, v26.4s + xtn2 v24.8h, v27.4s + mul v24.8h, v24.8h, v1.h[1] + smlsl v26.4s, v24.4h, v1.h[0] + smlsl2 v27.4s, v24.8h, v1.h[0] + shrn v23.4h, v26.4s, #16 + shrn2 v23.8h, v27.4s, #16 + zip1 v24.8h, v22.8h, v23.8h + zip2 v25.8h, v22.8h, v23.8h + add v28.8h, v28.8h, v24.8h + add v29.8h, v29.8h, v25.8h + stp q28, q29, [x0, #448] + ldp q28, q29, [x0, #480] + ldr q0, [x3, #240] + uzp1 v18.8h, v8.8h, v9.8h + uzp2 v19.8h, v8.8h, v9.8h + uzp1 v20.8h, v16.8h, v17.8h + uzp2 v21.8h, v16.8h, v17.8h + smull v26.4s, v18.4h, v20.4h + smull2 v27.4s, v18.8h, v20.8h + smull v23.4s, v19.4h, v21.4h + smull2 v24.4s, v19.8h, v21.8h + xtn v25.4h, v23.4s + xtn2 v25.8h, v24.4s + mul v25.8h, v25.8h, v1.h[1] + smlsl v23.4s, v25.4h, v1.h[0] + smlsl2 v24.4s, v25.8h, v1.h[0] + shrn v22.4h, v23.4s, #16 + shrn2 v22.8h, v24.4s, #16 + smlal v26.4s, v22.4h, v0.4h + smlal2 v27.4s, v22.8h, v0.8h + xtn v24.4h, v26.4s + xtn2 v24.8h, v27.4s + mul v24.8h, v24.8h, v1.h[1] + smlsl v26.4s, v24.4h, v1.h[0] + smlsl2 v27.4s, v24.8h, v1.h[0] + shrn v22.4h, v26.4s, #16 + shrn2 v22.8h, v27.4s, #16 + smull v26.4s, v18.4h, v21.4h + smull2 v27.4s, v18.8h, v21.8h + smlal v26.4s, v19.4h, v20.4h + smlal2 v27.4s, v19.8h, v20.8h + xtn v24.4h, v26.4s + xtn2 v24.8h, v27.4s + mul v24.8h, v24.8h, v1.h[1] + smlsl v26.4s, v24.4h, v1.h[0] + smlsl2 v27.4s, v24.8h, v1.h[0] + shrn v23.4h, v26.4s, #16 + shrn2 v23.8h, v27.4s, #16 + zip1 v24.8h, v22.8h, v23.8h + zip2 v25.8h, v22.8h, v23.8h + add v28.8h, v28.8h, v24.8h + add v29.8h, v29.8h, v25.8h + stp q28, q29, [x0, #480] + ldp d8, d9, [x29, #16] + ldp d10, d11, [x29, #32] + ldp d12, d13, [x29, #48] + ldp d14, d15, [x29, #64] + ldp x29, x30, [sp], #0x50 + ret +#ifndef __APPLE__ + .size kyber_basemul_mont_add,.-kyber_basemul_mont_add +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.text +.globl kyber_csubq_neon +.type kyber_csubq_neon,@function +.align 2 +kyber_csubq_neon: +#else +.section __TEXT,__text +.globl _kyber_csubq_neon +.p2align 2 +_kyber_csubq_neon: +#endif /* __APPLE__ */ + stp x29, x30, [sp, #-80]!
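+ # kyber_csubq_neon(p): constant-time conditional subtract of q over all
+ # 256 coefficients, handled as two ld4/st4 passes of 128 lanes each.
+ # With q broadcast in v20 from L_kyber_aarch64_q, each lane computes
+ # v -= q and then v += (v >> 15) & q, adding q back exactly when the
+ # subtraction went negative, so results land in [0, q-1] with no
+ # data-dependent branch. A C sketch of one lane (helper name
+ # hypothetical):
+ #   int16_t csubq(int16_t v) {
+ #       v -= 3329;
+ #       return (int16_t)(v + ((v >> 15) & 3329)); /* mask set iff v < 0 */
+ #   }
+ # The kyber_add_reduce, kyber_add3_reduce and kyber_rsub_reduce routines
+ # further down pair their adds/subs with the Barrett step seen throughout
+ # this file: t = sqdmulh(v, v0.h[2]) >> 11; v -= t * v0.h[0]; that is,
+ # t ~= v/q, with v0.h[2] apparently 20159 ~= 2^26/q and v0.h[0] = q.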
+ add x29, sp, #0 + stp d8, d9, [x29, #16] + stp d10, d11, [x29, #32] + stp d12, d13, [x29, #48] + stp d14, d15, [x29, #64] +#ifndef __APPLE__ + adrp x1, L_kyber_aarch64_q + add x1, x1, :lo12:L_kyber_aarch64_q +#else + adrp x1, L_kyber_aarch64_q@PAGE + add x1, x1, :lo12:L_kyber_aarch64_q@PAGEOFF +#endif /* __APPLE__ */ + ldr q20, [x1] + ld4 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #0x40 + ld4 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #0x40 + ld4 {v8.8h, v9.8h, v10.8h, v11.8h}, [x0], #0x40 + ld4 {v12.8h, v13.8h, v14.8h, v15.8h}, [x0], #0x40 + sub x0, x0, #0x100 + sub v0.8h, v0.8h, v20.8h + sub v1.8h, v1.8h, v20.8h + sub v2.8h, v2.8h, v20.8h + sub v3.8h, v3.8h, v20.8h + sub v4.8h, v4.8h, v20.8h + sub v5.8h, v5.8h, v20.8h + sub v6.8h, v6.8h, v20.8h + sub v7.8h, v7.8h, v20.8h + sub v8.8h, v8.8h, v20.8h + sub v9.8h, v9.8h, v20.8h + sub v10.8h, v10.8h, v20.8h + sub v11.8h, v11.8h, v20.8h + sub v12.8h, v12.8h, v20.8h + sub v13.8h, v13.8h, v20.8h + sub v14.8h, v14.8h, v20.8h + sub v15.8h, v15.8h, v20.8h + sshr v16.8h, v0.8h, #15 + sshr v17.8h, v1.8h, #15 + sshr v18.8h, v2.8h, #15 + sshr v19.8h, v3.8h, #15 + and v16.16b, v16.16b, v20.16b + and v17.16b, v17.16b, v20.16b + and v18.16b, v18.16b, v20.16b + and v19.16b, v19.16b, v20.16b + add v0.8h, v0.8h, v16.8h + add v1.8h, v1.8h, v17.8h + add v2.8h, v2.8h, v18.8h + add v3.8h, v3.8h, v19.8h + sshr v16.8h, v4.8h, #15 + sshr v17.8h, v5.8h, #15 + sshr v18.8h, v6.8h, #15 + sshr v19.8h, v7.8h, #15 + and v16.16b, v16.16b, v20.16b + and v17.16b, v17.16b, v20.16b + and v18.16b, v18.16b, v20.16b + and v19.16b, v19.16b, v20.16b + add v4.8h, v4.8h, v16.8h + add v5.8h, v5.8h, v17.8h + add v6.8h, v6.8h, v18.8h + add v7.8h, v7.8h, v19.8h + sshr v16.8h, v8.8h, #15 + sshr v17.8h, v9.8h, #15 + sshr v18.8h, v10.8h, #15 + sshr v19.8h, v11.8h, #15 + and v16.16b, v16.16b, v20.16b + and v17.16b, v17.16b, v20.16b + and v18.16b, v18.16b, v20.16b + and v19.16b, v19.16b, v20.16b + add v8.8h, v8.8h, v16.8h + add v9.8h, v9.8h, v17.8h + add v10.8h, v10.8h, v18.8h + add v11.8h, v11.8h, v19.8h + sshr v16.8h, v12.8h, #15 + sshr v17.8h, v13.8h, #15 + sshr v18.8h, v14.8h, #15 + sshr v19.8h, v15.8h, #15 + and v16.16b, v16.16b, v20.16b + and v17.16b, v17.16b, v20.16b + and v18.16b, v18.16b, v20.16b + and v19.16b, v19.16b, v20.16b + add v12.8h, v12.8h, v16.8h + add v13.8h, v13.8h, v17.8h + add v14.8h, v14.8h, v18.8h + add v15.8h, v15.8h, v19.8h + st4 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #0x40 + st4 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #0x40 + st4 {v8.8h, v9.8h, v10.8h, v11.8h}, [x0], #0x40 + st4 {v12.8h, v13.8h, v14.8h, v15.8h}, [x0], #0x40 + ld4 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #0x40 + ld4 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #0x40 + ld4 {v8.8h, v9.8h, v10.8h, v11.8h}, [x0], #0x40 + ld4 {v12.8h, v13.8h, v14.8h, v15.8h}, [x0], #0x40 + sub x0, x0, #0x100 + sub v0.8h, v0.8h, v20.8h + sub v1.8h, v1.8h, v20.8h + sub v2.8h, v2.8h, v20.8h + sub v3.8h, v3.8h, v20.8h + sub v4.8h, v4.8h, v20.8h + sub v5.8h, v5.8h, v20.8h + sub v6.8h, v6.8h, v20.8h + sub v7.8h, v7.8h, v20.8h + sub v8.8h, v8.8h, v20.8h + sub v9.8h, v9.8h, v20.8h + sub v10.8h, v10.8h, v20.8h + sub v11.8h, v11.8h, v20.8h + sub v12.8h, v12.8h, v20.8h + sub v13.8h, v13.8h, v20.8h + sub v14.8h, v14.8h, v20.8h + sub v15.8h, v15.8h, v20.8h + sshr v16.8h, v0.8h, #15 + sshr v17.8h, v1.8h, #15 + sshr v18.8h, v2.8h, #15 + sshr v19.8h, v3.8h, #15 + and v16.16b, v16.16b, v20.16b + and v17.16b, v17.16b, v20.16b + and v18.16b, v18.16b, v20.16b + and v19.16b, v19.16b, v20.16b + add v0.8h, v0.8h, v16.8h + add v1.8h, v1.8h, v17.8h + add v2.8h, v2.8h, v18.8h + 
add v3.8h, v3.8h, v19.8h + sshr v16.8h, v4.8h, #15 + sshr v17.8h, v5.8h, #15 + sshr v18.8h, v6.8h, #15 + sshr v19.8h, v7.8h, #15 + and v16.16b, v16.16b, v20.16b + and v17.16b, v17.16b, v20.16b + and v18.16b, v18.16b, v20.16b + and v19.16b, v19.16b, v20.16b + add v4.8h, v4.8h, v16.8h + add v5.8h, v5.8h, v17.8h + add v6.8h, v6.8h, v18.8h + add v7.8h, v7.8h, v19.8h + sshr v16.8h, v8.8h, #15 + sshr v17.8h, v9.8h, #15 + sshr v18.8h, v10.8h, #15 + sshr v19.8h, v11.8h, #15 + and v16.16b, v16.16b, v20.16b + and v17.16b, v17.16b, v20.16b + and v18.16b, v18.16b, v20.16b + and v19.16b, v19.16b, v20.16b + add v8.8h, v8.8h, v16.8h + add v9.8h, v9.8h, v17.8h + add v10.8h, v10.8h, v18.8h + add v11.8h, v11.8h, v19.8h + sshr v16.8h, v12.8h, #15 + sshr v17.8h, v13.8h, #15 + sshr v18.8h, v14.8h, #15 + sshr v19.8h, v15.8h, #15 + and v16.16b, v16.16b, v20.16b + and v17.16b, v17.16b, v20.16b + and v18.16b, v18.16b, v20.16b + and v19.16b, v19.16b, v20.16b + add v12.8h, v12.8h, v16.8h + add v13.8h, v13.8h, v17.8h + add v14.8h, v14.8h, v18.8h + add v15.8h, v15.8h, v19.8h + st4 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #0x40 + st4 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #0x40 + st4 {v8.8h, v9.8h, v10.8h, v11.8h}, [x0], #0x40 + st4 {v12.8h, v13.8h, v14.8h, v15.8h}, [x0], #0x40 + ldp d8, d9, [x29, #16] + ldp d10, d11, [x29, #32] + ldp d12, d13, [x29, #48] + ldp d14, d15, [x29, #64] + ldp x29, x30, [sp], #0x50 + ret +#ifndef __APPLE__ + .size kyber_csubq_neon,.-kyber_csubq_neon +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.text +.globl kyber_add_reduce +.type kyber_add_reduce,@function +.align 2 +kyber_add_reduce: +#else +.section __TEXT,__text +.globl _kyber_add_reduce +.p2align 2 +_kyber_add_reduce: +#endif /* __APPLE__ */ + stp x29, x30, [sp, #-80]! + add x29, sp, #0 + stp d8, d9, [x29, #16] + stp d10, d11, [x29, #32] + stp d12, d13, [x29, #48] + stp d14, d15, [x29, #64] +#ifndef __APPLE__ + adrp x2, L_kyber_aarch64_consts + add x2, x2, :lo12:L_kyber_aarch64_consts +#else + adrp x2, L_kyber_aarch64_consts@PAGE + add x2, x2, :lo12:L_kyber_aarch64_consts@PAGEOFF +#endif /* __APPLE__ */ + ldr q0, [x2] + ld4 {v1.8h, v2.8h, v3.8h, v4.8h}, [x0], #0x40 + ld4 {v5.8h, v6.8h, v7.8h, v8.8h}, [x0], #0x40 + ld4 {v9.8h, v10.8h, v11.8h, v12.8h}, [x1], #0x40 + ld4 {v13.8h, v14.8h, v15.8h, v16.8h}, [x1], #0x40 + sub x0, x0, #0x80 + add v1.8h, v1.8h, v9.8h + add v2.8h, v2.8h, v10.8h + add v3.8h, v3.8h, v11.8h + add v4.8h, v4.8h, v12.8h + add v5.8h, v5.8h, v13.8h + add v6.8h, v6.8h, v14.8h + add v7.8h, v7.8h, v15.8h + add v8.8h, v8.8h, v16.8h + sqdmulh v17.8h, v1.8h, v0.h[2] + sqdmulh v18.8h, v2.8h, v0.h[2] + sshr v17.8h, v17.8h, #11 + sshr v18.8h, v18.8h, #11 + mls v1.8h, v17.8h, v0.h[0] + mls v2.8h, v18.8h, v0.h[0] + sqdmulh v17.8h, v3.8h, v0.h[2] + sqdmulh v18.8h, v4.8h, v0.h[2] + sshr v17.8h, v17.8h, #11 + sshr v18.8h, v18.8h, #11 + mls v3.8h, v17.8h, v0.h[0] + mls v4.8h, v18.8h, v0.h[0] + sqdmulh v17.8h, v5.8h, v0.h[2] + sqdmulh v18.8h, v6.8h, v0.h[2] + sshr v17.8h, v17.8h, #11 + sshr v18.8h, v18.8h, #11 + mls v5.8h, v17.8h, v0.h[0] + mls v6.8h, v18.8h, v0.h[0] + sqdmulh v17.8h, v7.8h, v0.h[2] + sqdmulh v18.8h, v8.8h, v0.h[2] + sshr v17.8h, v17.8h, #11 + sshr v18.8h, v18.8h, #11 + mls v7.8h, v17.8h, v0.h[0] + mls v8.8h, v18.8h, v0.h[0] + st4 {v1.8h, v2.8h, v3.8h, v4.8h}, [x0], #0x40 + st4 {v5.8h, v6.8h, v7.8h, v8.8h}, [x0], #0x40 + ld4 {v1.8h, v2.8h, v3.8h, v4.8h}, [x0], #0x40 + ld4 {v5.8h, v6.8h, v7.8h, v8.8h}, [x0], #0x40 + ld4 {v9.8h, v10.8h, v11.8h, v12.8h}, [x1], #0x40 + ld4 {v13.8h, v14.8h, v15.8h, v16.8h}, [x1], #0x40 + sub x0, x0, 
#0x80 + add v1.8h, v1.8h, v9.8h + add v2.8h, v2.8h, v10.8h + add v3.8h, v3.8h, v11.8h + add v4.8h, v4.8h, v12.8h + add v5.8h, v5.8h, v13.8h + add v6.8h, v6.8h, v14.8h + add v7.8h, v7.8h, v15.8h + add v8.8h, v8.8h, v16.8h + sqdmulh v17.8h, v1.8h, v0.h[2] + sqdmulh v18.8h, v2.8h, v0.h[2] + sshr v17.8h, v17.8h, #11 + sshr v18.8h, v18.8h, #11 + mls v1.8h, v17.8h, v0.h[0] + mls v2.8h, v18.8h, v0.h[0] + sqdmulh v17.8h, v3.8h, v0.h[2] + sqdmulh v18.8h, v4.8h, v0.h[2] + sshr v17.8h, v17.8h, #11 + sshr v18.8h, v18.8h, #11 + mls v3.8h, v17.8h, v0.h[0] + mls v4.8h, v18.8h, v0.h[0] + sqdmulh v17.8h, v5.8h, v0.h[2] + sqdmulh v18.8h, v6.8h, v0.h[2] + sshr v17.8h, v17.8h, #11 + sshr v18.8h, v18.8h, #11 + mls v5.8h, v17.8h, v0.h[0] + mls v6.8h, v18.8h, v0.h[0] + sqdmulh v17.8h, v7.8h, v0.h[2] + sqdmulh v18.8h, v8.8h, v0.h[2] + sshr v17.8h, v17.8h, #11 + sshr v18.8h, v18.8h, #11 + mls v7.8h, v17.8h, v0.h[0] + mls v8.8h, v18.8h, v0.h[0] + st4 {v1.8h, v2.8h, v3.8h, v4.8h}, [x0], #0x40 + st4 {v5.8h, v6.8h, v7.8h, v8.8h}, [x0], #0x40 + ld4 {v1.8h, v2.8h, v3.8h, v4.8h}, [x0], #0x40 + ld4 {v5.8h, v6.8h, v7.8h, v8.8h}, [x0], #0x40 + ld4 {v9.8h, v10.8h, v11.8h, v12.8h}, [x1], #0x40 + ld4 {v13.8h, v14.8h, v15.8h, v16.8h}, [x1], #0x40 + sub x0, x0, #0x80 + add v1.8h, v1.8h, v9.8h + add v2.8h, v2.8h, v10.8h + add v3.8h, v3.8h, v11.8h + add v4.8h, v4.8h, v12.8h + add v5.8h, v5.8h, v13.8h + add v6.8h, v6.8h, v14.8h + add v7.8h, v7.8h, v15.8h + add v8.8h, v8.8h, v16.8h + sqdmulh v17.8h, v1.8h, v0.h[2] + sqdmulh v18.8h, v2.8h, v0.h[2] + sshr v17.8h, v17.8h, #11 + sshr v18.8h, v18.8h, #11 + mls v1.8h, v17.8h, v0.h[0] + mls v2.8h, v18.8h, v0.h[0] + sqdmulh v17.8h, v3.8h, v0.h[2] + sqdmulh v18.8h, v4.8h, v0.h[2] + sshr v17.8h, v17.8h, #11 + sshr v18.8h, v18.8h, #11 + mls v3.8h, v17.8h, v0.h[0] + mls v4.8h, v18.8h, v0.h[0] + sqdmulh v17.8h, v5.8h, v0.h[2] + sqdmulh v18.8h, v6.8h, v0.h[2] + sshr v17.8h, v17.8h, #11 + sshr v18.8h, v18.8h, #11 + mls v5.8h, v17.8h, v0.h[0] + mls v6.8h, v18.8h, v0.h[0] + sqdmulh v17.8h, v7.8h, v0.h[2] + sqdmulh v18.8h, v8.8h, v0.h[2] + sshr v17.8h, v17.8h, #11 + sshr v18.8h, v18.8h, #11 + mls v7.8h, v17.8h, v0.h[0] + mls v8.8h, v18.8h, v0.h[0] + st4 {v1.8h, v2.8h, v3.8h, v4.8h}, [x0], #0x40 + st4 {v5.8h, v6.8h, v7.8h, v8.8h}, [x0], #0x40 + ld4 {v1.8h, v2.8h, v3.8h, v4.8h}, [x0], #0x40 + ld4 {v5.8h, v6.8h, v7.8h, v8.8h}, [x0], #0x40 + ld4 {v9.8h, v10.8h, v11.8h, v12.8h}, [x1], #0x40 + ld4 {v13.8h, v14.8h, v15.8h, v16.8h}, [x1], #0x40 + sub x0, x0, #0x80 + add v1.8h, v1.8h, v9.8h + add v2.8h, v2.8h, v10.8h + add v3.8h, v3.8h, v11.8h + add v4.8h, v4.8h, v12.8h + add v5.8h, v5.8h, v13.8h + add v6.8h, v6.8h, v14.8h + add v7.8h, v7.8h, v15.8h + add v8.8h, v8.8h, v16.8h + sqdmulh v17.8h, v1.8h, v0.h[2] + sqdmulh v18.8h, v2.8h, v0.h[2] + sshr v17.8h, v17.8h, #11 + sshr v18.8h, v18.8h, #11 + mls v1.8h, v17.8h, v0.h[0] + mls v2.8h, v18.8h, v0.h[0] + sqdmulh v17.8h, v3.8h, v0.h[2] + sqdmulh v18.8h, v4.8h, v0.h[2] + sshr v17.8h, v17.8h, #11 + sshr v18.8h, v18.8h, #11 + mls v3.8h, v17.8h, v0.h[0] + mls v4.8h, v18.8h, v0.h[0] + sqdmulh v17.8h, v5.8h, v0.h[2] + sqdmulh v18.8h, v6.8h, v0.h[2] + sshr v17.8h, v17.8h, #11 + sshr v18.8h, v18.8h, #11 + mls v5.8h, v17.8h, v0.h[0] + mls v6.8h, v18.8h, v0.h[0] + sqdmulh v17.8h, v7.8h, v0.h[2] + sqdmulh v18.8h, v8.8h, v0.h[2] + sshr v17.8h, v17.8h, #11 + sshr v18.8h, v18.8h, #11 + mls v7.8h, v17.8h, v0.h[0] + mls v8.8h, v18.8h, v0.h[0] + st4 {v1.8h, v2.8h, v3.8h, v4.8h}, [x0], #0x40 + st4 {v5.8h, v6.8h, v7.8h, v8.8h}, [x0], #0x40 + ldp d8, d9, [x29, #16] + ldp d10, d11, 
[x29, #32] + ldp d12, d13, [x29, #48] + ldp d14, d15, [x29, #64] + ldp x29, x30, [sp], #0x50 + ret +#ifndef __APPLE__ + .size kyber_add_reduce,.-kyber_add_reduce +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.text +.globl kyber_add3_reduce +.type kyber_add3_reduce,@function +.align 2 +kyber_add3_reduce: +#else +.section __TEXT,__text +.globl _kyber_add3_reduce +.p2align 2 +_kyber_add3_reduce: +#endif /* __APPLE__ */ + stp x29, x30, [sp, #-80]! + add x29, sp, #0 + stp d8, d9, [x29, #16] + stp d10, d11, [x29, #32] + stp d12, d13, [x29, #48] + stp d14, d15, [x29, #64] +#ifndef __APPLE__ + adrp x3, L_kyber_aarch64_consts + add x3, x3, :lo12:L_kyber_aarch64_consts +#else + adrp x3, L_kyber_aarch64_consts@PAGE + add x3, x3, :lo12:L_kyber_aarch64_consts@PAGEOFF +#endif /* __APPLE__ */ + ldr q0, [x3] + ld4 {v1.8h, v2.8h, v3.8h, v4.8h}, [x0], #0x40 + ld4 {v5.8h, v6.8h, v7.8h, v8.8h}, [x0], #0x40 + ld4 {v9.8h, v10.8h, v11.8h, v12.8h}, [x1], #0x40 + ld4 {v13.8h, v14.8h, v15.8h, v16.8h}, [x1], #0x40 + ld4 {v17.8h, v18.8h, v19.8h, v20.8h}, [x2], #0x40 + ld4 {v21.8h, v22.8h, v23.8h, v24.8h}, [x2], #0x40 + sub x0, x0, #0x80 + add v1.8h, v1.8h, v9.8h + add v2.8h, v2.8h, v10.8h + add v3.8h, v3.8h, v11.8h + add v4.8h, v4.8h, v12.8h + add v5.8h, v5.8h, v13.8h + add v6.8h, v6.8h, v14.8h + add v7.8h, v7.8h, v15.8h + add v8.8h, v8.8h, v16.8h + add v1.8h, v1.8h, v17.8h + add v2.8h, v2.8h, v18.8h + add v3.8h, v3.8h, v19.8h + add v4.8h, v4.8h, v20.8h + add v5.8h, v5.8h, v21.8h + add v6.8h, v6.8h, v22.8h + add v7.8h, v7.8h, v23.8h + add v8.8h, v8.8h, v24.8h + sqdmulh v25.8h, v1.8h, v0.h[2] + sqdmulh v26.8h, v2.8h, v0.h[2] + sshr v25.8h, v25.8h, #11 + sshr v26.8h, v26.8h, #11 + mls v1.8h, v25.8h, v0.h[0] + mls v2.8h, v26.8h, v0.h[0] + sqdmulh v25.8h, v3.8h, v0.h[2] + sqdmulh v26.8h, v4.8h, v0.h[2] + sshr v25.8h, v25.8h, #11 + sshr v26.8h, v26.8h, #11 + mls v3.8h, v25.8h, v0.h[0] + mls v4.8h, v26.8h, v0.h[0] + sqdmulh v25.8h, v5.8h, v0.h[2] + sqdmulh v26.8h, v6.8h, v0.h[2] + sshr v25.8h, v25.8h, #11 + sshr v26.8h, v26.8h, #11 + mls v5.8h, v25.8h, v0.h[0] + mls v6.8h, v26.8h, v0.h[0] + sqdmulh v25.8h, v7.8h, v0.h[2] + sqdmulh v26.8h, v8.8h, v0.h[2] + sshr v25.8h, v25.8h, #11 + sshr v26.8h, v26.8h, #11 + mls v7.8h, v25.8h, v0.h[0] + mls v8.8h, v26.8h, v0.h[0] + st4 {v1.8h, v2.8h, v3.8h, v4.8h}, [x0], #0x40 + st4 {v5.8h, v6.8h, v7.8h, v8.8h}, [x0], #0x40 + ld4 {v1.8h, v2.8h, v3.8h, v4.8h}, [x0], #0x40 + ld4 {v5.8h, v6.8h, v7.8h, v8.8h}, [x0], #0x40 + ld4 {v9.8h, v10.8h, v11.8h, v12.8h}, [x1], #0x40 + ld4 {v13.8h, v14.8h, v15.8h, v16.8h}, [x1], #0x40 + ld4 {v17.8h, v18.8h, v19.8h, v20.8h}, [x2], #0x40 + ld4 {v21.8h, v22.8h, v23.8h, v24.8h}, [x2], #0x40 + sub x0, x0, #0x80 + add v1.8h, v1.8h, v9.8h + add v2.8h, v2.8h, v10.8h + add v3.8h, v3.8h, v11.8h + add v4.8h, v4.8h, v12.8h + add v5.8h, v5.8h, v13.8h + add v6.8h, v6.8h, v14.8h + add v7.8h, v7.8h, v15.8h + add v8.8h, v8.8h, v16.8h + add v1.8h, v1.8h, v17.8h + add v2.8h, v2.8h, v18.8h + add v3.8h, v3.8h, v19.8h + add v4.8h, v4.8h, v20.8h + add v5.8h, v5.8h, v21.8h + add v6.8h, v6.8h, v22.8h + add v7.8h, v7.8h, v23.8h + add v8.8h, v8.8h, v24.8h + sqdmulh v25.8h, v1.8h, v0.h[2] + sqdmulh v26.8h, v2.8h, v0.h[2] + sshr v25.8h, v25.8h, #11 + sshr v26.8h, v26.8h, #11 + mls v1.8h, v25.8h, v0.h[0] + mls v2.8h, v26.8h, v0.h[0] + sqdmulh v25.8h, v3.8h, v0.h[2] + sqdmulh v26.8h, v4.8h, v0.h[2] + sshr v25.8h, v25.8h, #11 + sshr v26.8h, v26.8h, #11 + mls v3.8h, v25.8h, v0.h[0] + mls v4.8h, v26.8h, v0.h[0] + sqdmulh v25.8h, v5.8h, v0.h[2] + sqdmulh v26.8h, v6.8h, v0.h[2] + sshr v25.8h, 
v25.8h, #11 + sshr v26.8h, v26.8h, #11 + mls v5.8h, v25.8h, v0.h[0] + mls v6.8h, v26.8h, v0.h[0] + sqdmulh v25.8h, v7.8h, v0.h[2] + sqdmulh v26.8h, v8.8h, v0.h[2] + sshr v25.8h, v25.8h, #11 + sshr v26.8h, v26.8h, #11 + mls v7.8h, v25.8h, v0.h[0] + mls v8.8h, v26.8h, v0.h[0] + st4 {v1.8h, v2.8h, v3.8h, v4.8h}, [x0], #0x40 + st4 {v5.8h, v6.8h, v7.8h, v8.8h}, [x0], #0x40 + ld4 {v1.8h, v2.8h, v3.8h, v4.8h}, [x0], #0x40 + ld4 {v5.8h, v6.8h, v7.8h, v8.8h}, [x0], #0x40 + ld4 {v9.8h, v10.8h, v11.8h, v12.8h}, [x1], #0x40 + ld4 {v13.8h, v14.8h, v15.8h, v16.8h}, [x1], #0x40 + ld4 {v17.8h, v18.8h, v19.8h, v20.8h}, [x2], #0x40 + ld4 {v21.8h, v22.8h, v23.8h, v24.8h}, [x2], #0x40 + sub x0, x0, #0x80 + add v1.8h, v1.8h, v9.8h + add v2.8h, v2.8h, v10.8h + add v3.8h, v3.8h, v11.8h + add v4.8h, v4.8h, v12.8h + add v5.8h, v5.8h, v13.8h + add v6.8h, v6.8h, v14.8h + add v7.8h, v7.8h, v15.8h + add v8.8h, v8.8h, v16.8h + add v1.8h, v1.8h, v17.8h + add v2.8h, v2.8h, v18.8h + add v3.8h, v3.8h, v19.8h + add v4.8h, v4.8h, v20.8h + add v5.8h, v5.8h, v21.8h + add v6.8h, v6.8h, v22.8h + add v7.8h, v7.8h, v23.8h + add v8.8h, v8.8h, v24.8h + sqdmulh v25.8h, v1.8h, v0.h[2] + sqdmulh v26.8h, v2.8h, v0.h[2] + sshr v25.8h, v25.8h, #11 + sshr v26.8h, v26.8h, #11 + mls v1.8h, v25.8h, v0.h[0] + mls v2.8h, v26.8h, v0.h[0] + sqdmulh v25.8h, v3.8h, v0.h[2] + sqdmulh v26.8h, v4.8h, v0.h[2] + sshr v25.8h, v25.8h, #11 + sshr v26.8h, v26.8h, #11 + mls v3.8h, v25.8h, v0.h[0] + mls v4.8h, v26.8h, v0.h[0] + sqdmulh v25.8h, v5.8h, v0.h[2] + sqdmulh v26.8h, v6.8h, v0.h[2] + sshr v25.8h, v25.8h, #11 + sshr v26.8h, v26.8h, #11 + mls v5.8h, v25.8h, v0.h[0] + mls v6.8h, v26.8h, v0.h[0] + sqdmulh v25.8h, v7.8h, v0.h[2] + sqdmulh v26.8h, v8.8h, v0.h[2] + sshr v25.8h, v25.8h, #11 + sshr v26.8h, v26.8h, #11 + mls v7.8h, v25.8h, v0.h[0] + mls v8.8h, v26.8h, v0.h[0] + st4 {v1.8h, v2.8h, v3.8h, v4.8h}, [x0], #0x40 + st4 {v5.8h, v6.8h, v7.8h, v8.8h}, [x0], #0x40 + ld4 {v1.8h, v2.8h, v3.8h, v4.8h}, [x0], #0x40 + ld4 {v5.8h, v6.8h, v7.8h, v8.8h}, [x0], #0x40 + ld4 {v9.8h, v10.8h, v11.8h, v12.8h}, [x1], #0x40 + ld4 {v13.8h, v14.8h, v15.8h, v16.8h}, [x1], #0x40 + ld4 {v17.8h, v18.8h, v19.8h, v20.8h}, [x2], #0x40 + ld4 {v21.8h, v22.8h, v23.8h, v24.8h}, [x2], #0x40 + sub x0, x0, #0x80 + add v1.8h, v1.8h, v9.8h + add v2.8h, v2.8h, v10.8h + add v3.8h, v3.8h, v11.8h + add v4.8h, v4.8h, v12.8h + add v5.8h, v5.8h, v13.8h + add v6.8h, v6.8h, v14.8h + add v7.8h, v7.8h, v15.8h + add v8.8h, v8.8h, v16.8h + add v1.8h, v1.8h, v17.8h + add v2.8h, v2.8h, v18.8h + add v3.8h, v3.8h, v19.8h + add v4.8h, v4.8h, v20.8h + add v5.8h, v5.8h, v21.8h + add v6.8h, v6.8h, v22.8h + add v7.8h, v7.8h, v23.8h + add v8.8h, v8.8h, v24.8h + sqdmulh v25.8h, v1.8h, v0.h[2] + sqdmulh v26.8h, v2.8h, v0.h[2] + sshr v25.8h, v25.8h, #11 + sshr v26.8h, v26.8h, #11 + mls v1.8h, v25.8h, v0.h[0] + mls v2.8h, v26.8h, v0.h[0] + sqdmulh v25.8h, v3.8h, v0.h[2] + sqdmulh v26.8h, v4.8h, v0.h[2] + sshr v25.8h, v25.8h, #11 + sshr v26.8h, v26.8h, #11 + mls v3.8h, v25.8h, v0.h[0] + mls v4.8h, v26.8h, v0.h[0] + sqdmulh v25.8h, v5.8h, v0.h[2] + sqdmulh v26.8h, v6.8h, v0.h[2] + sshr v25.8h, v25.8h, #11 + sshr v26.8h, v26.8h, #11 + mls v5.8h, v25.8h, v0.h[0] + mls v6.8h, v26.8h, v0.h[0] + sqdmulh v25.8h, v7.8h, v0.h[2] + sqdmulh v26.8h, v8.8h, v0.h[2] + sshr v25.8h, v25.8h, #11 + sshr v26.8h, v26.8h, #11 + mls v7.8h, v25.8h, v0.h[0] + mls v8.8h, v26.8h, v0.h[0] + st4 {v1.8h, v2.8h, v3.8h, v4.8h}, [x0], #0x40 + st4 {v5.8h, v6.8h, v7.8h, v8.8h}, [x0], #0x40 + ldp d8, d9, [x29, #16] + ldp d10, d11, [x29, #32] + ldp 
d12, d13, [x29, #48] + ldp d14, d15, [x29, #64] + ldp x29, x30, [sp], #0x50 + ret +#ifndef __APPLE__ + .size kyber_add3_reduce,.-kyber_add3_reduce +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.text +.globl kyber_rsub_reduce +.type kyber_rsub_reduce,@function +.align 2 +kyber_rsub_reduce: +#else +.section __TEXT,__text +.globl _kyber_rsub_reduce +.p2align 2 +_kyber_rsub_reduce: +#endif /* __APPLE__ */ + stp x29, x30, [sp, #-80]! + add x29, sp, #0 + stp d8, d9, [x29, #16] + stp d10, d11, [x29, #32] + stp d12, d13, [x29, #48] + stp d14, d15, [x29, #64] +#ifndef __APPLE__ + adrp x2, L_kyber_aarch64_consts + add x2, x2, :lo12:L_kyber_aarch64_consts +#else + adrp x2, L_kyber_aarch64_consts@PAGE + add x2, x2, :lo12:L_kyber_aarch64_consts@PAGEOFF +#endif /* __APPLE__ */ + ldr q0, [x2] + ld4 {v1.8h, v2.8h, v3.8h, v4.8h}, [x0], #0x40 + ld4 {v5.8h, v6.8h, v7.8h, v8.8h}, [x0], #0x40 + ld4 {v9.8h, v10.8h, v11.8h, v12.8h}, [x1], #0x40 + ld4 {v13.8h, v14.8h, v15.8h, v16.8h}, [x1], #0x40 + sub x0, x0, #0x80 + sub v1.8h, v9.8h, v1.8h + sub v2.8h, v10.8h, v2.8h + sub v3.8h, v11.8h, v3.8h + sub v4.8h, v12.8h, v4.8h + sub v5.8h, v13.8h, v5.8h + sub v6.8h, v14.8h, v6.8h + sub v7.8h, v15.8h, v7.8h + sub v8.8h, v16.8h, v8.8h + sqdmulh v17.8h, v1.8h, v0.h[2] + sqdmulh v18.8h, v2.8h, v0.h[2] + sshr v17.8h, v17.8h, #11 + sshr v18.8h, v18.8h, #11 + mls v1.8h, v17.8h, v0.h[0] + mls v2.8h, v18.8h, v0.h[0] + sqdmulh v17.8h, v3.8h, v0.h[2] + sqdmulh v18.8h, v4.8h, v0.h[2] + sshr v17.8h, v17.8h, #11 + sshr v18.8h, v18.8h, #11 + mls v3.8h, v17.8h, v0.h[0] + mls v4.8h, v18.8h, v0.h[0] + sqdmulh v17.8h, v5.8h, v0.h[2] + sqdmulh v18.8h, v6.8h, v0.h[2] + sshr v17.8h, v17.8h, #11 + sshr v18.8h, v18.8h, #11 + mls v5.8h, v17.8h, v0.h[0] + mls v6.8h, v18.8h, v0.h[0] + sqdmulh v17.8h, v7.8h, v0.h[2] + sqdmulh v18.8h, v8.8h, v0.h[2] + sshr v17.8h, v17.8h, #11 + sshr v18.8h, v18.8h, #11 + mls v7.8h, v17.8h, v0.h[0] + mls v8.8h, v18.8h, v0.h[0] + st4 {v1.8h, v2.8h, v3.8h, v4.8h}, [x0], #0x40 + st4 {v5.8h, v6.8h, v7.8h, v8.8h}, [x0], #0x40 + ld4 {v1.8h, v2.8h, v3.8h, v4.8h}, [x0], #0x40 + ld4 {v5.8h, v6.8h, v7.8h, v8.8h}, [x0], #0x40 + ld4 {v9.8h, v10.8h, v11.8h, v12.8h}, [x1], #0x40 + ld4 {v13.8h, v14.8h, v15.8h, v16.8h}, [x1], #0x40 + sub x0, x0, #0x80 + sub v1.8h, v9.8h, v1.8h + sub v2.8h, v10.8h, v2.8h + sub v3.8h, v11.8h, v3.8h + sub v4.8h, v12.8h, v4.8h + sub v5.8h, v13.8h, v5.8h + sub v6.8h, v14.8h, v6.8h + sub v7.8h, v15.8h, v7.8h + sub v8.8h, v16.8h, v8.8h + sqdmulh v17.8h, v1.8h, v0.h[2] + sqdmulh v18.8h, v2.8h, v0.h[2] + sshr v17.8h, v17.8h, #11 + sshr v18.8h, v18.8h, #11 + mls v1.8h, v17.8h, v0.h[0] + mls v2.8h, v18.8h, v0.h[0] + sqdmulh v17.8h, v3.8h, v0.h[2] + sqdmulh v18.8h, v4.8h, v0.h[2] + sshr v17.8h, v17.8h, #11 + sshr v18.8h, v18.8h, #11 + mls v3.8h, v17.8h, v0.h[0] + mls v4.8h, v18.8h, v0.h[0] + sqdmulh v17.8h, v5.8h, v0.h[2] + sqdmulh v18.8h, v6.8h, v0.h[2] + sshr v17.8h, v17.8h, #11 + sshr v18.8h, v18.8h, #11 + mls v5.8h, v17.8h, v0.h[0] + mls v6.8h, v18.8h, v0.h[0] + sqdmulh v17.8h, v7.8h, v0.h[2] + sqdmulh v18.8h, v8.8h, v0.h[2] + sshr v17.8h, v17.8h, #11 + sshr v18.8h, v18.8h, #11 + mls v7.8h, v17.8h, v0.h[0] + mls v8.8h, v18.8h, v0.h[0] + st4 {v1.8h, v2.8h, v3.8h, v4.8h}, [x0], #0x40 + st4 {v5.8h, v6.8h, v7.8h, v8.8h}, [x0], #0x40 + ld4 {v1.8h, v2.8h, v3.8h, v4.8h}, [x0], #0x40 + ld4 {v5.8h, v6.8h, v7.8h, v8.8h}, [x0], #0x40 + ld4 {v9.8h, v10.8h, v11.8h, v12.8h}, [x1], #0x40 + ld4 {v13.8h, v14.8h, v15.8h, v16.8h}, [x1], #0x40 + sub x0, x0, #0x80 + sub v1.8h, v9.8h, v1.8h + sub v2.8h, v10.8h, v2.8h + sub 
v3.8h, v11.8h, v3.8h + sub v4.8h, v12.8h, v4.8h + sub v5.8h, v13.8h, v5.8h + sub v6.8h, v14.8h, v6.8h + sub v7.8h, v15.8h, v7.8h + sub v8.8h, v16.8h, v8.8h + sqdmulh v17.8h, v1.8h, v0.h[2] + sqdmulh v18.8h, v2.8h, v0.h[2] + sshr v17.8h, v17.8h, #11 + sshr v18.8h, v18.8h, #11 + mls v1.8h, v17.8h, v0.h[0] + mls v2.8h, v18.8h, v0.h[0] + sqdmulh v17.8h, v3.8h, v0.h[2] + sqdmulh v18.8h, v4.8h, v0.h[2] + sshr v17.8h, v17.8h, #11 + sshr v18.8h, v18.8h, #11 + mls v3.8h, v17.8h, v0.h[0] + mls v4.8h, v18.8h, v0.h[0] + sqdmulh v17.8h, v5.8h, v0.h[2] + sqdmulh v18.8h, v6.8h, v0.h[2] + sshr v17.8h, v17.8h, #11 + sshr v18.8h, v18.8h, #11 + mls v5.8h, v17.8h, v0.h[0] + mls v6.8h, v18.8h, v0.h[0] + sqdmulh v17.8h, v7.8h, v0.h[2] + sqdmulh v18.8h, v8.8h, v0.h[2] + sshr v17.8h, v17.8h, #11 + sshr v18.8h, v18.8h, #11 + mls v7.8h, v17.8h, v0.h[0] + mls v8.8h, v18.8h, v0.h[0] + st4 {v1.8h, v2.8h, v3.8h, v4.8h}, [x0], #0x40 + st4 {v5.8h, v6.8h, v7.8h, v8.8h}, [x0], #0x40 + ld4 {v1.8h, v2.8h, v3.8h, v4.8h}, [x0], #0x40 + ld4 {v5.8h, v6.8h, v7.8h, v8.8h}, [x0], #0x40 + ld4 {v9.8h, v10.8h, v11.8h, v12.8h}, [x1], #0x40 + ld4 {v13.8h, v14.8h, v15.8h, v16.8h}, [x1], #0x40 + sub x0, x0, #0x80 + sub v1.8h, v9.8h, v1.8h + sub v2.8h, v10.8h, v2.8h + sub v3.8h, v11.8h, v3.8h + sub v4.8h, v12.8h, v4.8h + sub v5.8h, v13.8h, v5.8h + sub v6.8h, v14.8h, v6.8h + sub v7.8h, v15.8h, v7.8h + sub v8.8h, v16.8h, v8.8h + sqdmulh v17.8h, v1.8h, v0.h[2] + sqdmulh v18.8h, v2.8h, v0.h[2] + sshr v17.8h, v17.8h, #11 + sshr v18.8h, v18.8h, #11 + mls v1.8h, v17.8h, v0.h[0] + mls v2.8h, v18.8h, v0.h[0] + sqdmulh v17.8h, v3.8h, v0.h[2] + sqdmulh v18.8h, v4.8h, v0.h[2] + sshr v17.8h, v17.8h, #11 + sshr v18.8h, v18.8h, #11 + mls v3.8h, v17.8h, v0.h[0] + mls v4.8h, v18.8h, v0.h[0] + sqdmulh v17.8h, v5.8h, v0.h[2] + sqdmulh v18.8h, v6.8h, v0.h[2] + sshr v17.8h, v17.8h, #11 + sshr v18.8h, v18.8h, #11 + mls v5.8h, v17.8h, v0.h[0] + mls v6.8h, v18.8h, v0.h[0] + sqdmulh v17.8h, v7.8h, v0.h[2] + sqdmulh v18.8h, v8.8h, v0.h[2] + sshr v17.8h, v17.8h, #11 + sshr v18.8h, v18.8h, #11 + mls v7.8h, v17.8h, v0.h[0] + mls v8.8h, v18.8h, v0.h[0] + st4 {v1.8h, v2.8h, v3.8h, v4.8h}, [x0], #0x40 + st4 {v5.8h, v6.8h, v7.8h, v8.8h}, [x0], #0x40 + ldp d8, d9, [x29, #16] + ldp d10, d11, [x29, #32] + ldp d12, d13, [x29, #48] + ldp d14, d15, [x29, #64] + ldp x29, x30, [sp], #0x50 + ret +#ifndef __APPLE__ + .size kyber_rsub_reduce,.-kyber_rsub_reduce +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.text +.globl kyber_to_mont +.type kyber_to_mont,@function +.align 2 +kyber_to_mont: +#else +.section __TEXT,__text +.globl _kyber_to_mont +.p2align 2 +_kyber_to_mont: +#endif /* __APPLE__ */ + stp x29, x30, [sp, #-80]! 
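+ /* kyber_to_mont: map every coefficient a of the polynomial at x0 to its
+  * Montgomery form a*R mod q, with q = 3329 and R = 2^16. The add/sub
+  * helpers above reduce with Barrett (sqdmulh by a reciprocal constant,
+  * sshr #11, then mls by q held in v0.h[0]); this routine instead does a
+  * Montgomery multiplication by a fixed constant, presumably R^2 mod q
+  * = 1353 split across the v0.h[3]/v0.h[4] lanes of
+  * L_kyber_aarch64_consts: mul forms the low half, sqrdmulh the rounded
+  * high half, sqrdmlsh subtracts the rounded high half of t*q, and
+  * sshr #1 drops the doubling the rounding instructions introduce.
+  * sqrdmlsh requires the ARMv8.1 RDM extension, hence the
+  * WOLFSSL_AARCH64_NO_SQRMLSH fallback built from sqrdmulh and sub. */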
+ add x29, sp, #0 + stp d8, d9, [x29, #16] + stp d10, d11, [x29, #32] + stp d12, d13, [x29, #48] + stp d14, d15, [x29, #64] +#ifndef __APPLE__ + adrp x1, L_kyber_aarch64_consts + add x1, x1, :lo12:L_kyber_aarch64_consts +#else + adrp x1, L_kyber_aarch64_consts@PAGE + add x1, x1, :lo12:L_kyber_aarch64_consts@PAGEOFF +#endif /* __APPLE__ */ + ldr q0, [x1] + ld4 {v1.8h, v2.8h, v3.8h, v4.8h}, [x0], #0x40 + ld4 {v5.8h, v6.8h, v7.8h, v8.8h}, [x0], #0x40 + ld4 {v9.8h, v10.8h, v11.8h, v12.8h}, [x0], #0x40 + ld4 {v13.8h, v14.8h, v15.8h, v16.8h}, [x0], #0x40 + sub x0, x0, #0x100 + mul v17.8h, v1.8h, v0.h[4] + mul v18.8h, v2.8h, v0.h[4] + sqrdmulh v1.8h, v1.8h, v0.h[3] + sqrdmulh v2.8h, v2.8h, v0.h[3] +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + sqrdmlsh v1.8h, v17.8h, v0.h[0] + sqrdmlsh v2.8h, v18.8h, v0.h[0] +#else + sqrdmulh v17.8h, v17.8h, v0.h[0] + sqrdmulh v18.8h, v18.8h, v0.h[0] + sub v1.8h, v1.8h, v17.8h + sub v2.8h, v2.8h, v18.8h +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + sshr v1.8h, v1.8h, #1 + sshr v2.8h, v2.8h, #1 + mul v17.8h, v3.8h, v0.h[4] + mul v18.8h, v4.8h, v0.h[4] + sqrdmulh v3.8h, v3.8h, v0.h[3] + sqrdmulh v4.8h, v4.8h, v0.h[3] +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + sqrdmlsh v3.8h, v17.8h, v0.h[0] + sqrdmlsh v4.8h, v18.8h, v0.h[0] +#else + sqrdmulh v17.8h, v17.8h, v0.h[0] + sqrdmulh v18.8h, v18.8h, v0.h[0] + sub v3.8h, v3.8h, v17.8h + sub v4.8h, v4.8h, v18.8h +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + sshr v3.8h, v3.8h, #1 + sshr v4.8h, v4.8h, #1 + mul v17.8h, v5.8h, v0.h[4] + mul v18.8h, v6.8h, v0.h[4] + sqrdmulh v5.8h, v5.8h, v0.h[3] + sqrdmulh v6.8h, v6.8h, v0.h[3] +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + sqrdmlsh v5.8h, v17.8h, v0.h[0] + sqrdmlsh v6.8h, v18.8h, v0.h[0] +#else + sqrdmulh v17.8h, v17.8h, v0.h[0] + sqrdmulh v18.8h, v18.8h, v0.h[0] + sub v5.8h, v5.8h, v17.8h + sub v6.8h, v6.8h, v18.8h +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + sshr v5.8h, v5.8h, #1 + sshr v6.8h, v6.8h, #1 + mul v17.8h, v7.8h, v0.h[4] + mul v18.8h, v8.8h, v0.h[4] + sqrdmulh v7.8h, v7.8h, v0.h[3] + sqrdmulh v8.8h, v8.8h, v0.h[3] +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + sqrdmlsh v7.8h, v17.8h, v0.h[0] + sqrdmlsh v8.8h, v18.8h, v0.h[0] +#else + sqrdmulh v17.8h, v17.8h, v0.h[0] + sqrdmulh v18.8h, v18.8h, v0.h[0] + sub v7.8h, v7.8h, v17.8h + sub v8.8h, v8.8h, v18.8h +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + sshr v7.8h, v7.8h, #1 + sshr v8.8h, v8.8h, #1 + mul v17.8h, v9.8h, v0.h[4] + mul v18.8h, v10.8h, v0.h[4] + sqrdmulh v9.8h, v9.8h, v0.h[3] + sqrdmulh v10.8h, v10.8h, v0.h[3] +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + sqrdmlsh v9.8h, v17.8h, v0.h[0] + sqrdmlsh v10.8h, v18.8h, v0.h[0] +#else + sqrdmulh v17.8h, v17.8h, v0.h[0] + sqrdmulh v18.8h, v18.8h, v0.h[0] + sub v9.8h, v9.8h, v17.8h + sub v10.8h, v10.8h, v18.8h +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + sshr v9.8h, v9.8h, #1 + sshr v10.8h, v10.8h, #1 + mul v17.8h, v11.8h, v0.h[4] + mul v18.8h, v12.8h, v0.h[4] + sqrdmulh v11.8h, v11.8h, v0.h[3] + sqrdmulh v12.8h, v12.8h, v0.h[3] +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + sqrdmlsh v11.8h, v17.8h, v0.h[0] + sqrdmlsh v12.8h, v18.8h, v0.h[0] +#else + sqrdmulh v17.8h, v17.8h, v0.h[0] + sqrdmulh v18.8h, v18.8h, v0.h[0] + sub v11.8h, v11.8h, v17.8h + sub v12.8h, v12.8h, v18.8h +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + sshr v11.8h, v11.8h, #1 + sshr v12.8h, v12.8h, #1 + mul v17.8h, v13.8h, v0.h[4] + mul v18.8h, v14.8h, v0.h[4] + sqrdmulh v13.8h, v13.8h, v0.h[3] + sqrdmulh v14.8h, v14.8h, v0.h[3] +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + sqrdmlsh v13.8h, v17.8h, v0.h[0] + sqrdmlsh v14.8h, v18.8h, v0.h[0] +#else + sqrdmulh 
v17.8h, v17.8h, v0.h[0] + sqrdmulh v18.8h, v18.8h, v0.h[0] + sub v13.8h, v13.8h, v17.8h + sub v14.8h, v14.8h, v18.8h +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + sshr v13.8h, v13.8h, #1 + sshr v14.8h, v14.8h, #1 + mul v17.8h, v15.8h, v0.h[4] + mul v18.8h, v16.8h, v0.h[4] + sqrdmulh v15.8h, v15.8h, v0.h[3] + sqrdmulh v16.8h, v16.8h, v0.h[3] +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + sqrdmlsh v15.8h, v17.8h, v0.h[0] + sqrdmlsh v16.8h, v18.8h, v0.h[0] +#else + sqrdmulh v17.8h, v17.8h, v0.h[0] + sqrdmulh v18.8h, v18.8h, v0.h[0] + sub v15.8h, v15.8h, v17.8h + sub v16.8h, v16.8h, v18.8h +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + sshr v15.8h, v15.8h, #1 + sshr v16.8h, v16.8h, #1 + st4 {v1.8h, v2.8h, v3.8h, v4.8h}, [x0], #0x40 + st4 {v5.8h, v6.8h, v7.8h, v8.8h}, [x0], #0x40 + st4 {v9.8h, v10.8h, v11.8h, v12.8h}, [x0], #0x40 + st4 {v13.8h, v14.8h, v15.8h, v16.8h}, [x0], #0x40 + ld4 {v1.8h, v2.8h, v3.8h, v4.8h}, [x0], #0x40 + ld4 {v5.8h, v6.8h, v7.8h, v8.8h}, [x0], #0x40 + ld4 {v9.8h, v10.8h, v11.8h, v12.8h}, [x0], #0x40 + ld4 {v13.8h, v14.8h, v15.8h, v16.8h}, [x0], #0x40 + sub x0, x0, #0x100 + mul v17.8h, v1.8h, v0.h[4] + mul v18.8h, v2.8h, v0.h[4] + sqrdmulh v1.8h, v1.8h, v0.h[3] + sqrdmulh v2.8h, v2.8h, v0.h[3] +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + sqrdmlsh v1.8h, v17.8h, v0.h[0] + sqrdmlsh v2.8h, v18.8h, v0.h[0] +#else + sqrdmulh v17.8h, v17.8h, v0.h[0] + sqrdmulh v18.8h, v18.8h, v0.h[0] + sub v1.8h, v1.8h, v17.8h + sub v2.8h, v2.8h, v18.8h +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + sshr v1.8h, v1.8h, #1 + sshr v2.8h, v2.8h, #1 + mul v17.8h, v3.8h, v0.h[4] + mul v18.8h, v4.8h, v0.h[4] + sqrdmulh v3.8h, v3.8h, v0.h[3] + sqrdmulh v4.8h, v4.8h, v0.h[3] +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + sqrdmlsh v3.8h, v17.8h, v0.h[0] + sqrdmlsh v4.8h, v18.8h, v0.h[0] +#else + sqrdmulh v17.8h, v17.8h, v0.h[0] + sqrdmulh v18.8h, v18.8h, v0.h[0] + sub v3.8h, v3.8h, v17.8h + sub v4.8h, v4.8h, v18.8h +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + sshr v3.8h, v3.8h, #1 + sshr v4.8h, v4.8h, #1 + mul v17.8h, v5.8h, v0.h[4] + mul v18.8h, v6.8h, v0.h[4] + sqrdmulh v5.8h, v5.8h, v0.h[3] + sqrdmulh v6.8h, v6.8h, v0.h[3] +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + sqrdmlsh v5.8h, v17.8h, v0.h[0] + sqrdmlsh v6.8h, v18.8h, v0.h[0] +#else + sqrdmulh v17.8h, v17.8h, v0.h[0] + sqrdmulh v18.8h, v18.8h, v0.h[0] + sub v5.8h, v5.8h, v17.8h + sub v6.8h, v6.8h, v18.8h +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + sshr v5.8h, v5.8h, #1 + sshr v6.8h, v6.8h, #1 + mul v17.8h, v7.8h, v0.h[4] + mul v18.8h, v8.8h, v0.h[4] + sqrdmulh v7.8h, v7.8h, v0.h[3] + sqrdmulh v8.8h, v8.8h, v0.h[3] +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + sqrdmlsh v7.8h, v17.8h, v0.h[0] + sqrdmlsh v8.8h, v18.8h, v0.h[0] +#else + sqrdmulh v17.8h, v17.8h, v0.h[0] + sqrdmulh v18.8h, v18.8h, v0.h[0] + sub v7.8h, v7.8h, v17.8h + sub v8.8h, v8.8h, v18.8h +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + sshr v7.8h, v7.8h, #1 + sshr v8.8h, v8.8h, #1 + mul v17.8h, v9.8h, v0.h[4] + mul v18.8h, v10.8h, v0.h[4] + sqrdmulh v9.8h, v9.8h, v0.h[3] + sqrdmulh v10.8h, v10.8h, v0.h[3] +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + sqrdmlsh v9.8h, v17.8h, v0.h[0] + sqrdmlsh v10.8h, v18.8h, v0.h[0] +#else + sqrdmulh v17.8h, v17.8h, v0.h[0] + sqrdmulh v18.8h, v18.8h, v0.h[0] + sub v9.8h, v9.8h, v17.8h + sub v10.8h, v10.8h, v18.8h +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + sshr v9.8h, v9.8h, #1 + sshr v10.8h, v10.8h, #1 + mul v17.8h, v11.8h, v0.h[4] + mul v18.8h, v12.8h, v0.h[4] + sqrdmulh v11.8h, v11.8h, v0.h[3] + sqrdmulh v12.8h, v12.8h, v0.h[3] +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + sqrdmlsh v11.8h, v17.8h, v0.h[0] + 
sqrdmlsh v12.8h, v18.8h, v0.h[0] +#else + sqrdmulh v17.8h, v17.8h, v0.h[0] + sqrdmulh v18.8h, v18.8h, v0.h[0] + sub v11.8h, v11.8h, v17.8h + sub v12.8h, v12.8h, v18.8h +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + sshr v11.8h, v11.8h, #1 + sshr v12.8h, v12.8h, #1 + mul v17.8h, v13.8h, v0.h[4] + mul v18.8h, v14.8h, v0.h[4] + sqrdmulh v13.8h, v13.8h, v0.h[3] + sqrdmulh v14.8h, v14.8h, v0.h[3] +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + sqrdmlsh v13.8h, v17.8h, v0.h[0] + sqrdmlsh v14.8h, v18.8h, v0.h[0] +#else + sqrdmulh v17.8h, v17.8h, v0.h[0] + sqrdmulh v18.8h, v18.8h, v0.h[0] + sub v13.8h, v13.8h, v17.8h + sub v14.8h, v14.8h, v18.8h +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + sshr v13.8h, v13.8h, #1 + sshr v14.8h, v14.8h, #1 + mul v17.8h, v15.8h, v0.h[4] + mul v18.8h, v16.8h, v0.h[4] + sqrdmulh v15.8h, v15.8h, v0.h[3] + sqrdmulh v16.8h, v16.8h, v0.h[3] +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + sqrdmlsh v15.8h, v17.8h, v0.h[0] + sqrdmlsh v16.8h, v18.8h, v0.h[0] +#else + sqrdmulh v17.8h, v17.8h, v0.h[0] + sqrdmulh v18.8h, v18.8h, v0.h[0] + sub v15.8h, v15.8h, v17.8h + sub v16.8h, v16.8h, v18.8h +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + sshr v15.8h, v15.8h, #1 + sshr v16.8h, v16.8h, #1 + st4 {v1.8h, v2.8h, v3.8h, v4.8h}, [x0], #0x40 + st4 {v5.8h, v6.8h, v7.8h, v8.8h}, [x0], #0x40 + st4 {v9.8h, v10.8h, v11.8h, v12.8h}, [x0], #0x40 + st4 {v13.8h, v14.8h, v15.8h, v16.8h}, [x0], #0x40 + ldp d8, d9, [x29, #16] + ldp d10, d11, [x29, #32] + ldp d12, d13, [x29, #48] + ldp d14, d15, [x29, #64] + ldp x29, x30, [sp], #0x50 + ret +#ifndef __APPLE__ + .size kyber_to_mont,.-kyber_to_mont +#endif /* __APPLE__ */ +#ifndef __APPLE__ + .text + .type L_kyber_aarch64_to_msg_neon_low, %object + .section .rodata + .size L_kyber_aarch64_to_msg_neon_low, 16 +#else + .section __DATA,__data +#endif /* __APPLE__ */ +#ifndef __APPLE__ + .align 2 +#else + .p2align 2 +#endif /* __APPLE__ */ +L_kyber_aarch64_to_msg_neon_low: + .short 0x0373,0x0373,0x0373,0x0373,0x0373,0x0373,0x0373,0x0373 +#ifndef __APPLE__ + .text + .type L_kyber_aarch64_to_msg_neon_high, %object + .section .rodata + .size L_kyber_aarch64_to_msg_neon_high, 16 +#else + .section __DATA,__data +#endif /* __APPLE__ */ +#ifndef __APPLE__ + .align 2 +#else + .p2align 2 +#endif /* __APPLE__ */ +L_kyber_aarch64_to_msg_neon_high: + .short 0x09c0,0x09c0,0x09c0,0x09c0,0x09c0,0x09c0,0x09c0,0x09c0 +#ifndef __APPLE__ + .text + .type L_kyber_aarch64_to_msg_neon_bits, %object + .section .rodata + .size L_kyber_aarch64_to_msg_neon_bits, 16 +#else + .section __DATA,__data +#endif /* __APPLE__ */ +#ifndef __APPLE__ + .align 2 +#else + .p2align 2 +#endif /* __APPLE__ */ +L_kyber_aarch64_to_msg_neon_bits: + .short 0x0001,0x0002,0x0004,0x0008,0x0010,0x0020,0x0040,0x0080 +#ifndef __APPLE__ +.text +.globl kyber_to_msg_neon +.type kyber_to_msg_neon,@function +.align 2 +kyber_to_msg_neon: +#else +.section __TEXT,__text +.globl _kyber_to_msg_neon +.p2align 2 +_kyber_to_msg_neon: +#endif /* __APPLE__ */ + stp x29, x30, [sp, #-80]! 
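+ /* kyber_to_msg_neon: 1-bit compression of the polynomial at x1 into the
+  * 32-byte message at x0. A coefficient compresses to a 1 bit exactly
+  * when it lies closer to q/2 than to 0; the pair of cmge compares tests
+  * each lane against the window bounds in _low and _high, and ANDing the
+  * two results gives an all-ones mask for in-window lanes. A further AND
+  * with the _bits vector (1,2,4,...,128) leaves one distinct bit per
+  * lane, so addv across the eight lanes assembles a message byte, and
+  * the ins instructions pack eight such bytes into v18 for each st1. */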
+ add x29, sp, #0 + stp d8, d9, [x29, #16] + stp d10, d11, [x29, #32] + stp d12, d13, [x29, #48] + stp d14, d15, [x29, #64] +#ifndef __APPLE__ + adrp x2, L_kyber_aarch64_to_msg_neon_low + add x2, x2, :lo12:L_kyber_aarch64_to_msg_neon_low +#else + adrp x2, L_kyber_aarch64_to_msg_neon_low@PAGE + add x2, x2, :lo12:L_kyber_aarch64_to_msg_neon_low@PAGEOFF +#endif /* __APPLE__ */ +#ifndef __APPLE__ + adrp x3, L_kyber_aarch64_to_msg_neon_high + add x3, x3, :lo12:L_kyber_aarch64_to_msg_neon_high +#else + adrp x3, L_kyber_aarch64_to_msg_neon_high@PAGE + add x3, x3, :lo12:L_kyber_aarch64_to_msg_neon_high@PAGEOFF +#endif /* __APPLE__ */ +#ifndef __APPLE__ + adrp x4, L_kyber_aarch64_to_msg_neon_bits + add x4, x4, :lo12:L_kyber_aarch64_to_msg_neon_bits +#else + adrp x4, L_kyber_aarch64_to_msg_neon_bits@PAGE + add x4, x4, :lo12:L_kyber_aarch64_to_msg_neon_bits@PAGEOFF +#endif /* __APPLE__ */ + ldr q0, [x2] + ldr q1, [x3] + ldr q26, [x4] + ld1 {v2.8h, v3.8h, v4.8h, v5.8h}, [x1], #0x40 + ld1 {v6.8h, v7.8h, v8.8h, v9.8h}, [x1], #0x40 + cmge v10.8h, v2.8h, v0.8h + cmge v18.8h, v1.8h, v2.8h + cmge v11.8h, v3.8h, v0.8h + cmge v19.8h, v1.8h, v3.8h + cmge v12.8h, v4.8h, v0.8h + cmge v20.8h, v1.8h, v4.8h + cmge v13.8h, v5.8h, v0.8h + cmge v21.8h, v1.8h, v5.8h + cmge v14.8h, v6.8h, v0.8h + cmge v22.8h, v1.8h, v6.8h + cmge v15.8h, v7.8h, v0.8h + cmge v23.8h, v1.8h, v7.8h + cmge v16.8h, v8.8h, v0.8h + cmge v24.8h, v1.8h, v8.8h + cmge v17.8h, v9.8h, v0.8h + cmge v25.8h, v1.8h, v9.8h + and v18.16b, v18.16b, v10.16b + and v19.16b, v19.16b, v11.16b + and v20.16b, v20.16b, v12.16b + and v21.16b, v21.16b, v13.16b + and v22.16b, v22.16b, v14.16b + and v23.16b, v23.16b, v15.16b + and v24.16b, v24.16b, v16.16b + and v25.16b, v25.16b, v17.16b + and v18.16b, v18.16b, v26.16b + and v19.16b, v19.16b, v26.16b + and v20.16b, v20.16b, v26.16b + and v21.16b, v21.16b, v26.16b + and v22.16b, v22.16b, v26.16b + and v23.16b, v23.16b, v26.16b + and v24.16b, v24.16b, v26.16b + and v25.16b, v25.16b, v26.16b + addv h18, v18.8h + addv h19, v19.8h + addv h20, v20.8h + addv h21, v21.8h + addv h22, v22.8h + addv h23, v23.8h + addv h24, v24.8h + addv h25, v25.8h + ins v18.b[1], v19.b[0] + ins v18.b[2], v20.b[0] + ins v18.b[3], v21.b[0] + ins v18.b[4], v22.b[0] + ins v18.b[5], v23.b[0] + ins v18.b[6], v24.b[0] + ins v18.b[7], v25.b[0] + st1 {v18.8b}, [x0], #8 + ld1 {v2.8h, v3.8h, v4.8h, v5.8h}, [x1], #0x40 + ld1 {v6.8h, v7.8h, v8.8h, v9.8h}, [x1], #0x40 + cmge v10.8h, v2.8h, v0.8h + cmge v18.8h, v1.8h, v2.8h + cmge v11.8h, v3.8h, v0.8h + cmge v19.8h, v1.8h, v3.8h + cmge v12.8h, v4.8h, v0.8h + cmge v20.8h, v1.8h, v4.8h + cmge v13.8h, v5.8h, v0.8h + cmge v21.8h, v1.8h, v5.8h + cmge v14.8h, v6.8h, v0.8h + cmge v22.8h, v1.8h, v6.8h + cmge v15.8h, v7.8h, v0.8h + cmge v23.8h, v1.8h, v7.8h + cmge v16.8h, v8.8h, v0.8h + cmge v24.8h, v1.8h, v8.8h + cmge v17.8h, v9.8h, v0.8h + cmge v25.8h, v1.8h, v9.8h + and v18.16b, v18.16b, v10.16b + and v19.16b, v19.16b, v11.16b + and v20.16b, v20.16b, v12.16b + and v21.16b, v21.16b, v13.16b + and v22.16b, v22.16b, v14.16b + and v23.16b, v23.16b, v15.16b + and v24.16b, v24.16b, v16.16b + and v25.16b, v25.16b, v17.16b + and v18.16b, v18.16b, v26.16b + and v19.16b, v19.16b, v26.16b + and v20.16b, v20.16b, v26.16b + and v21.16b, v21.16b, v26.16b + and v22.16b, v22.16b, v26.16b + and v23.16b, v23.16b, v26.16b + and v24.16b, v24.16b, v26.16b + and v25.16b, v25.16b, v26.16b + addv h18, v18.8h + addv h19, v19.8h + addv h20, v20.8h + addv h21, v21.8h + addv h22, v22.8h + addv h23, v23.8h + addv h24, v24.8h + addv h25, v25.8h + 
ins v18.b[1], v19.b[0] + ins v18.b[2], v20.b[0] + ins v18.b[3], v21.b[0] + ins v18.b[4], v22.b[0] + ins v18.b[5], v23.b[0] + ins v18.b[6], v24.b[0] + ins v18.b[7], v25.b[0] + st1 {v18.8b}, [x0], #8 + ld1 {v2.8h, v3.8h, v4.8h, v5.8h}, [x1], #0x40 + ld1 {v6.8h, v7.8h, v8.8h, v9.8h}, [x1], #0x40 + cmge v10.8h, v2.8h, v0.8h + cmge v18.8h, v1.8h, v2.8h + cmge v11.8h, v3.8h, v0.8h + cmge v19.8h, v1.8h, v3.8h + cmge v12.8h, v4.8h, v0.8h + cmge v20.8h, v1.8h, v4.8h + cmge v13.8h, v5.8h, v0.8h + cmge v21.8h, v1.8h, v5.8h + cmge v14.8h, v6.8h, v0.8h + cmge v22.8h, v1.8h, v6.8h + cmge v15.8h, v7.8h, v0.8h + cmge v23.8h, v1.8h, v7.8h + cmge v16.8h, v8.8h, v0.8h + cmge v24.8h, v1.8h, v8.8h + cmge v17.8h, v9.8h, v0.8h + cmge v25.8h, v1.8h, v9.8h + and v18.16b, v18.16b, v10.16b + and v19.16b, v19.16b, v11.16b + and v20.16b, v20.16b, v12.16b + and v21.16b, v21.16b, v13.16b + and v22.16b, v22.16b, v14.16b + and v23.16b, v23.16b, v15.16b + and v24.16b, v24.16b, v16.16b + and v25.16b, v25.16b, v17.16b + and v18.16b, v18.16b, v26.16b + and v19.16b, v19.16b, v26.16b + and v20.16b, v20.16b, v26.16b + and v21.16b, v21.16b, v26.16b + and v22.16b, v22.16b, v26.16b + and v23.16b, v23.16b, v26.16b + and v24.16b, v24.16b, v26.16b + and v25.16b, v25.16b, v26.16b + addv h18, v18.8h + addv h19, v19.8h + addv h20, v20.8h + addv h21, v21.8h + addv h22, v22.8h + addv h23, v23.8h + addv h24, v24.8h + addv h25, v25.8h + ins v18.b[1], v19.b[0] + ins v18.b[2], v20.b[0] + ins v18.b[3], v21.b[0] + ins v18.b[4], v22.b[0] + ins v18.b[5], v23.b[0] + ins v18.b[6], v24.b[0] + ins v18.b[7], v25.b[0] + st1 {v18.8b}, [x0], #8 + ld1 {v2.8h, v3.8h, v4.8h, v5.8h}, [x1], #0x40 + ld1 {v6.8h, v7.8h, v8.8h, v9.8h}, [x1], #0x40 + cmge v10.8h, v2.8h, v0.8h + cmge v18.8h, v1.8h, v2.8h + cmge v11.8h, v3.8h, v0.8h + cmge v19.8h, v1.8h, v3.8h + cmge v12.8h, v4.8h, v0.8h + cmge v20.8h, v1.8h, v4.8h + cmge v13.8h, v5.8h, v0.8h + cmge v21.8h, v1.8h, v5.8h + cmge v14.8h, v6.8h, v0.8h + cmge v22.8h, v1.8h, v6.8h + cmge v15.8h, v7.8h, v0.8h + cmge v23.8h, v1.8h, v7.8h + cmge v16.8h, v8.8h, v0.8h + cmge v24.8h, v1.8h, v8.8h + cmge v17.8h, v9.8h, v0.8h + cmge v25.8h, v1.8h, v9.8h + and v18.16b, v18.16b, v10.16b + and v19.16b, v19.16b, v11.16b + and v20.16b, v20.16b, v12.16b + and v21.16b, v21.16b, v13.16b + and v22.16b, v22.16b, v14.16b + and v23.16b, v23.16b, v15.16b + and v24.16b, v24.16b, v16.16b + and v25.16b, v25.16b, v17.16b + and v18.16b, v18.16b, v26.16b + and v19.16b, v19.16b, v26.16b + and v20.16b, v20.16b, v26.16b + and v21.16b, v21.16b, v26.16b + and v22.16b, v22.16b, v26.16b + and v23.16b, v23.16b, v26.16b + and v24.16b, v24.16b, v26.16b + and v25.16b, v25.16b, v26.16b + addv h18, v18.8h + addv h19, v19.8h + addv h20, v20.8h + addv h21, v21.8h + addv h22, v22.8h + addv h23, v23.8h + addv h24, v24.8h + addv h25, v25.8h + ins v18.b[1], v19.b[0] + ins v18.b[2], v20.b[0] + ins v18.b[3], v21.b[0] + ins v18.b[4], v22.b[0] + ins v18.b[5], v23.b[0] + ins v18.b[6], v24.b[0] + ins v18.b[7], v25.b[0] + st1 {v18.8b}, [x0], #8 + ldp d8, d9, [x29, #16] + ldp d10, d11, [x29, #32] + ldp d12, d13, [x29, #48] + ldp d14, d15, [x29, #64] + ldp x29, x30, [sp], #0x50 + ret +#ifndef __APPLE__ + .size kyber_to_msg_neon,.-kyber_to_msg_neon +#endif /* __APPLE__ */ +#ifndef __APPLE__ + .text + .type L_kyber_aarch64_from_msg_neon_q1half, %object + .section .rodata + .size L_kyber_aarch64_from_msg_neon_q1half, 16 +#else + .section __DATA,__data +#endif /* __APPLE__ */ +#ifndef __APPLE__ + .align 2 +#else + .p2align 2 +#endif /* __APPLE__ */ 
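+ /* The q1half constant below (0x0681 = 1665 = (q+1)/2) and the bit-mask
+  * table after it drive kyber_from_msg_neon, the matching 1-bit
+  * expansion: message bit b becomes the coefficient b * 1665. Each
+  * message byte is dup'ed across eight byte lanes, cmtst against the
+  * masks (1,2,...,128) turns bit i into an all-ones byte in lane i,
+  * zip1 doubles each byte into a 16-bit lane mask, and the final and
+  * selects either 0 or 1665 per coefficient. */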
+L_kyber_aarch64_from_msg_neon_q1half: + .short 0x0681,0x0681,0x0681,0x0681,0x0681,0x0681,0x0681,0x0681 +#ifndef __APPLE__ + .text + .type L_kyber_aarch64_from_msg_neon_bits, %object + .section .rodata + .size L_kyber_aarch64_from_msg_neon_bits, 16 +#else + .section __DATA,__data +#endif /* __APPLE__ */ +#ifndef __APPLE__ + .align 1 +#else + .p2align 1 +#endif /* __APPLE__ */ +L_kyber_aarch64_from_msg_neon_bits: + .byte 0x01,0x02,0x04,0x08,0x10,0x20,0x40,0x80 + .byte 0x01,0x02,0x04,0x08,0x10,0x20,0x40,0x80 +#ifndef __APPLE__ +.text +.globl kyber_from_msg_neon +.type kyber_from_msg_neon,@function +.align 2 +kyber_from_msg_neon: +#else +.section __TEXT,__text +.globl _kyber_from_msg_neon +.p2align 2 +_kyber_from_msg_neon: +#endif /* __APPLE__ */ + stp x29, x30, [sp, #-48]! + add x29, sp, #0 + stp d8, d9, [x29, #16] + stp d10, d11, [x29, #32] +#ifndef __APPLE__ + adrp x2, L_kyber_aarch64_from_msg_neon_q1half + add x2, x2, :lo12:L_kyber_aarch64_from_msg_neon_q1half +#else + adrp x2, L_kyber_aarch64_from_msg_neon_q1half@PAGE + add x2, x2, :lo12:L_kyber_aarch64_from_msg_neon_q1half@PAGEOFF +#endif /* __APPLE__ */ +#ifndef __APPLE__ + adrp x3, L_kyber_aarch64_from_msg_neon_bits + add x3, x3, :lo12:L_kyber_aarch64_from_msg_neon_bits +#else + adrp x3, L_kyber_aarch64_from_msg_neon_bits@PAGE + add x3, x3, :lo12:L_kyber_aarch64_from_msg_neon_bits@PAGEOFF +#endif /* __APPLE__ */ + ld1 {v2.16b, v3.16b}, [x1] + ldr q1, [x2] + ldr q0, [x3] + dup v4.8b, v2.b[0] + dup v5.8b, v2.b[1] + dup v6.8b, v2.b[2] + dup v7.8b, v2.b[3] + cmtst v4.8b, v4.8b, v0.8b + cmtst v5.8b, v5.8b, v0.8b + cmtst v6.8b, v6.8b, v0.8b + cmtst v7.8b, v7.8b, v0.8b + zip1 v4.16b, v4.16b, v4.16b + zip1 v5.16b, v5.16b, v5.16b + zip1 v6.16b, v6.16b, v6.16b + zip1 v7.16b, v7.16b, v7.16b + and v4.16b, v4.16b, v1.16b + and v5.16b, v5.16b, v1.16b + and v6.16b, v6.16b, v1.16b + and v7.16b, v7.16b, v1.16b + st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #0x40 + dup v4.8b, v2.b[4] + dup v5.8b, v2.b[5] + dup v6.8b, v2.b[6] + dup v7.8b, v2.b[7] + cmtst v4.8b, v4.8b, v0.8b + cmtst v5.8b, v5.8b, v0.8b + cmtst v6.8b, v6.8b, v0.8b + cmtst v7.8b, v7.8b, v0.8b + zip1 v4.16b, v4.16b, v4.16b + zip1 v5.16b, v5.16b, v5.16b + zip1 v6.16b, v6.16b, v6.16b + zip1 v7.16b, v7.16b, v7.16b + and v4.16b, v4.16b, v1.16b + and v5.16b, v5.16b, v1.16b + and v6.16b, v6.16b, v1.16b + and v7.16b, v7.16b, v1.16b + st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #0x40 + dup v4.8b, v2.b[8] + dup v5.8b, v2.b[9] + dup v6.8b, v2.b[10] + dup v7.8b, v2.b[11] + cmtst v4.8b, v4.8b, v0.8b + cmtst v5.8b, v5.8b, v0.8b + cmtst v6.8b, v6.8b, v0.8b + cmtst v7.8b, v7.8b, v0.8b + zip1 v4.16b, v4.16b, v4.16b + zip1 v5.16b, v5.16b, v5.16b + zip1 v6.16b, v6.16b, v6.16b + zip1 v7.16b, v7.16b, v7.16b + and v4.16b, v4.16b, v1.16b + and v5.16b, v5.16b, v1.16b + and v6.16b, v6.16b, v1.16b + and v7.16b, v7.16b, v1.16b + st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #0x40 + dup v4.8b, v2.b[12] + dup v5.8b, v2.b[13] + dup v6.8b, v2.b[14] + dup v7.8b, v2.b[15] + cmtst v4.8b, v4.8b, v0.8b + cmtst v5.8b, v5.8b, v0.8b + cmtst v6.8b, v6.8b, v0.8b + cmtst v7.8b, v7.8b, v0.8b + zip1 v4.16b, v4.16b, v4.16b + zip1 v5.16b, v5.16b, v5.16b + zip1 v6.16b, v6.16b, v6.16b + zip1 v7.16b, v7.16b, v7.16b + and v4.16b, v4.16b, v1.16b + and v5.16b, v5.16b, v1.16b + and v6.16b, v6.16b, v1.16b + and v7.16b, v7.16b, v1.16b + st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #0x40 + dup v4.8b, v3.b[0] + dup v5.8b, v3.b[1] + dup v6.8b, v3.b[2] + dup v7.8b, v3.b[3] + cmtst v4.8b, v4.8b, v0.8b + cmtst v5.8b, v5.8b, v0.8b + cmtst v6.8b, v6.8b, v0.8b + cmtst 
v7.8b, v7.8b, v0.8b + zip1 v4.16b, v4.16b, v4.16b + zip1 v5.16b, v5.16b, v5.16b + zip1 v6.16b, v6.16b, v6.16b + zip1 v7.16b, v7.16b, v7.16b + and v4.16b, v4.16b, v1.16b + and v5.16b, v5.16b, v1.16b + and v6.16b, v6.16b, v1.16b + and v7.16b, v7.16b, v1.16b + st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #0x40 + dup v4.8b, v3.b[4] + dup v5.8b, v3.b[5] + dup v6.8b, v3.b[6] + dup v7.8b, v3.b[7] + cmtst v4.8b, v4.8b, v0.8b + cmtst v5.8b, v5.8b, v0.8b + cmtst v6.8b, v6.8b, v0.8b + cmtst v7.8b, v7.8b, v0.8b + zip1 v4.16b, v4.16b, v4.16b + zip1 v5.16b, v5.16b, v5.16b + zip1 v6.16b, v6.16b, v6.16b + zip1 v7.16b, v7.16b, v7.16b + and v4.16b, v4.16b, v1.16b + and v5.16b, v5.16b, v1.16b + and v6.16b, v6.16b, v1.16b + and v7.16b, v7.16b, v1.16b + st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #0x40 + dup v4.8b, v3.b[8] + dup v5.8b, v3.b[9] + dup v6.8b, v3.b[10] + dup v7.8b, v3.b[11] + cmtst v4.8b, v4.8b, v0.8b + cmtst v5.8b, v5.8b, v0.8b + cmtst v6.8b, v6.8b, v0.8b + cmtst v7.8b, v7.8b, v0.8b + zip1 v4.16b, v4.16b, v4.16b + zip1 v5.16b, v5.16b, v5.16b + zip1 v6.16b, v6.16b, v6.16b + zip1 v7.16b, v7.16b, v7.16b + and v4.16b, v4.16b, v1.16b + and v5.16b, v5.16b, v1.16b + and v6.16b, v6.16b, v1.16b + and v7.16b, v7.16b, v1.16b + st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #0x40 + dup v4.8b, v3.b[12] + dup v5.8b, v3.b[13] + dup v6.8b, v3.b[14] + dup v7.8b, v3.b[15] + cmtst v4.8b, v4.8b, v0.8b + cmtst v5.8b, v5.8b, v0.8b + cmtst v6.8b, v6.8b, v0.8b + cmtst v7.8b, v7.8b, v0.8b + zip1 v4.16b, v4.16b, v4.16b + zip1 v5.16b, v5.16b, v5.16b + zip1 v6.16b, v6.16b, v6.16b + zip1 v7.16b, v7.16b, v7.16b + and v4.16b, v4.16b, v1.16b + and v5.16b, v5.16b, v1.16b + and v6.16b, v6.16b, v1.16b + and v7.16b, v7.16b, v1.16b + st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #0x40 + ldp d8, d9, [x29, #16] + ldp d10, d11, [x29, #32] + ldp x29, x30, [sp], #48 + ret +#ifndef __APPLE__ + .size kyber_from_msg_neon,.-kyber_from_msg_neon +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.text +.globl kyber_cmp_neon +.type kyber_cmp_neon,@function +.align 2 +kyber_cmp_neon: +#else +.section __TEXT,__text +.globl _kyber_cmp_neon +.p2align 2 +_kyber_cmp_neon: +#endif /* __APPLE__ */ + stp x29, x30, [sp, #-48]! 
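+ /* kyber_cmp_neon: constant-time comparison of the buffers at x0 and x1
+  * (the re-encryption check in decapsulation). Matching 64-byte blocks
+  * are XORed and the differences OR-accumulated into v8-v11; the
+  * subs/beq pairs on the length in w2 stop after 768 (0x300),
+  * 768+320 = 1088, or 1568 bytes, which appear to correspond to the
+  * three ML-KEM ciphertext sizes. The accumulators are then folded
+  * together and csetm converts the result to 0 when the buffers match
+  * and -1 otherwise, with no data-dependent branches on the contents. */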
+ add x29, sp, #0 + stp d8, d9, [x29, #16] + stp d10, d11, [x29, #32] + ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #0x40 + ld4 {v4.16b, v5.16b, v6.16b, v7.16b}, [x1], #0x40 + eor v8.16b, v0.16b, v4.16b + eor v9.16b, v1.16b, v5.16b + eor v10.16b, v2.16b, v6.16b + eor v11.16b, v3.16b, v7.16b + ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #0x40 + ld4 {v4.16b, v5.16b, v6.16b, v7.16b}, [x1], #0x40 + eor v0.16b, v0.16b, v4.16b + eor v1.16b, v1.16b, v5.16b + eor v2.16b, v2.16b, v6.16b + eor v3.16b, v3.16b, v7.16b + orr v8.16b, v8.16b, v0.16b + orr v9.16b, v9.16b, v1.16b + orr v10.16b, v10.16b, v2.16b + orr v11.16b, v11.16b, v3.16b + ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #0x40 + ld4 {v4.16b, v5.16b, v6.16b, v7.16b}, [x1], #0x40 + eor v0.16b, v0.16b, v4.16b + eor v1.16b, v1.16b, v5.16b + eor v2.16b, v2.16b, v6.16b + eor v3.16b, v3.16b, v7.16b + orr v8.16b, v8.16b, v0.16b + orr v9.16b, v9.16b, v1.16b + orr v10.16b, v10.16b, v2.16b + orr v11.16b, v11.16b, v3.16b + ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #0x40 + ld4 {v4.16b, v5.16b, v6.16b, v7.16b}, [x1], #0x40 + eor v0.16b, v0.16b, v4.16b + eor v1.16b, v1.16b, v5.16b + eor v2.16b, v2.16b, v6.16b + eor v3.16b, v3.16b, v7.16b + orr v8.16b, v8.16b, v0.16b + orr v9.16b, v9.16b, v1.16b + orr v10.16b, v10.16b, v2.16b + orr v11.16b, v11.16b, v3.16b + ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #0x40 + ld4 {v4.16b, v5.16b, v6.16b, v7.16b}, [x1], #0x40 + eor v0.16b, v0.16b, v4.16b + eor v1.16b, v1.16b, v5.16b + eor v2.16b, v2.16b, v6.16b + eor v3.16b, v3.16b, v7.16b + orr v8.16b, v8.16b, v0.16b + orr v9.16b, v9.16b, v1.16b + orr v10.16b, v10.16b, v2.16b + orr v11.16b, v11.16b, v3.16b + ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #0x40 + ld4 {v4.16b, v5.16b, v6.16b, v7.16b}, [x1], #0x40 + eor v0.16b, v0.16b, v4.16b + eor v1.16b, v1.16b, v5.16b + eor v2.16b, v2.16b, v6.16b + eor v3.16b, v3.16b, v7.16b + orr v8.16b, v8.16b, v0.16b + orr v9.16b, v9.16b, v1.16b + orr v10.16b, v10.16b, v2.16b + orr v11.16b, v11.16b, v3.16b + ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #0x40 + ld4 {v4.16b, v5.16b, v6.16b, v7.16b}, [x1], #0x40 + eor v0.16b, v0.16b, v4.16b + eor v1.16b, v1.16b, v5.16b + eor v2.16b, v2.16b, v6.16b + eor v3.16b, v3.16b, v7.16b + orr v8.16b, v8.16b, v0.16b + orr v9.16b, v9.16b, v1.16b + orr v10.16b, v10.16b, v2.16b + orr v11.16b, v11.16b, v3.16b + ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #0x40 + ld4 {v4.16b, v5.16b, v6.16b, v7.16b}, [x1], #0x40 + eor v0.16b, v0.16b, v4.16b + eor v1.16b, v1.16b, v5.16b + eor v2.16b, v2.16b, v6.16b + eor v3.16b, v3.16b, v7.16b + orr v8.16b, v8.16b, v0.16b + orr v9.16b, v9.16b, v1.16b + orr v10.16b, v10.16b, v2.16b + orr v11.16b, v11.16b, v3.16b + ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #0x40 + ld4 {v4.16b, v5.16b, v6.16b, v7.16b}, [x1], #0x40 + eor v0.16b, v0.16b, v4.16b + eor v1.16b, v1.16b, v5.16b + eor v2.16b, v2.16b, v6.16b + eor v3.16b, v3.16b, v7.16b + orr v8.16b, v8.16b, v0.16b + orr v9.16b, v9.16b, v1.16b + orr v10.16b, v10.16b, v2.16b + orr v11.16b, v11.16b, v3.16b + ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #0x40 + ld4 {v4.16b, v5.16b, v6.16b, v7.16b}, [x1], #0x40 + eor v0.16b, v0.16b, v4.16b + eor v1.16b, v1.16b, v5.16b + eor v2.16b, v2.16b, v6.16b + eor v3.16b, v3.16b, v7.16b + orr v8.16b, v8.16b, v0.16b + orr v9.16b, v9.16b, v1.16b + orr v10.16b, v10.16b, v2.16b + orr v11.16b, v11.16b, v3.16b + ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #0x40 + ld4 {v4.16b, v5.16b, v6.16b, v7.16b}, [x1], #0x40 + eor v0.16b, v0.16b, v4.16b + eor v1.16b, v1.16b, v5.16b + eor v2.16b, v2.16b, v6.16b + eor 
v3.16b, v3.16b, v7.16b + orr v8.16b, v8.16b, v0.16b + orr v9.16b, v9.16b, v1.16b + orr v10.16b, v10.16b, v2.16b + orr v11.16b, v11.16b, v3.16b + ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #0x40 + ld4 {v4.16b, v5.16b, v6.16b, v7.16b}, [x1], #0x40 + eor v0.16b, v0.16b, v4.16b + eor v1.16b, v1.16b, v5.16b + eor v2.16b, v2.16b, v6.16b + eor v3.16b, v3.16b, v7.16b + orr v8.16b, v8.16b, v0.16b + orr v9.16b, v9.16b, v1.16b + orr v10.16b, v10.16b, v2.16b + orr v11.16b, v11.16b, v3.16b + subs w2, w2, #0x300 + beq L_kyber_aarch64_cmp_neon_done + ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #0x40 + ld4 {v4.16b, v5.16b, v6.16b, v7.16b}, [x1], #0x40 + eor v0.16b, v0.16b, v4.16b + eor v1.16b, v1.16b, v5.16b + eor v2.16b, v2.16b, v6.16b + eor v3.16b, v3.16b, v7.16b + orr v8.16b, v8.16b, v0.16b + orr v9.16b, v9.16b, v1.16b + orr v10.16b, v10.16b, v2.16b + orr v11.16b, v11.16b, v3.16b + ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #0x40 + ld4 {v4.16b, v5.16b, v6.16b, v7.16b}, [x1], #0x40 + eor v0.16b, v0.16b, v4.16b + eor v1.16b, v1.16b, v5.16b + eor v2.16b, v2.16b, v6.16b + eor v3.16b, v3.16b, v7.16b + orr v8.16b, v8.16b, v0.16b + orr v9.16b, v9.16b, v1.16b + orr v10.16b, v10.16b, v2.16b + orr v11.16b, v11.16b, v3.16b + ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #0x40 + ld4 {v4.16b, v5.16b, v6.16b, v7.16b}, [x1], #0x40 + eor v0.16b, v0.16b, v4.16b + eor v1.16b, v1.16b, v5.16b + eor v2.16b, v2.16b, v6.16b + eor v3.16b, v3.16b, v7.16b + orr v8.16b, v8.16b, v0.16b + orr v9.16b, v9.16b, v1.16b + orr v10.16b, v10.16b, v2.16b + orr v11.16b, v11.16b, v3.16b + ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #0x40 + ld4 {v4.16b, v5.16b, v6.16b, v7.16b}, [x1], #0x40 + eor v0.16b, v0.16b, v4.16b + eor v1.16b, v1.16b, v5.16b + eor v2.16b, v2.16b, v6.16b + eor v3.16b, v3.16b, v7.16b + orr v8.16b, v8.16b, v0.16b + orr v9.16b, v9.16b, v1.16b + orr v10.16b, v10.16b, v2.16b + orr v11.16b, v11.16b, v3.16b + ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #0x40 + ld4 {v4.16b, v5.16b, v6.16b, v7.16b}, [x1], #0x40 + eor v0.16b, v0.16b, v4.16b + eor v1.16b, v1.16b, v5.16b + eor v2.16b, v2.16b, v6.16b + eor v3.16b, v3.16b, v7.16b + orr v8.16b, v8.16b, v0.16b + orr v9.16b, v9.16b, v1.16b + orr v10.16b, v10.16b, v2.16b + orr v11.16b, v11.16b, v3.16b + subs w2, w2, #0x140 + beq L_kyber_aarch64_cmp_neon_done + ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #0x40 + ld4 {v4.16b, v5.16b, v6.16b, v7.16b}, [x1], #0x40 + eor v0.16b, v0.16b, v4.16b + eor v1.16b, v1.16b, v5.16b + eor v2.16b, v2.16b, v6.16b + eor v3.16b, v3.16b, v7.16b + orr v8.16b, v8.16b, v0.16b + orr v9.16b, v9.16b, v1.16b + orr v10.16b, v10.16b, v2.16b + orr v11.16b, v11.16b, v3.16b + ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #0x40 + ld4 {v4.16b, v5.16b, v6.16b, v7.16b}, [x1], #0x40 + eor v0.16b, v0.16b, v4.16b + eor v1.16b, v1.16b, v5.16b + eor v2.16b, v2.16b, v6.16b + eor v3.16b, v3.16b, v7.16b + orr v8.16b, v8.16b, v0.16b + orr v9.16b, v9.16b, v1.16b + orr v10.16b, v10.16b, v2.16b + orr v11.16b, v11.16b, v3.16b + ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #0x40 + ld4 {v4.16b, v5.16b, v6.16b, v7.16b}, [x1], #0x40 + eor v0.16b, v0.16b, v4.16b + eor v1.16b, v1.16b, v5.16b + eor v2.16b, v2.16b, v6.16b + eor v3.16b, v3.16b, v7.16b + orr v8.16b, v8.16b, v0.16b + orr v9.16b, v9.16b, v1.16b + orr v10.16b, v10.16b, v2.16b + orr v11.16b, v11.16b, v3.16b + ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #0x40 + ld4 {v4.16b, v5.16b, v6.16b, v7.16b}, [x1], #0x40 + eor v0.16b, v0.16b, v4.16b + eor v1.16b, v1.16b, v5.16b + eor v2.16b, v2.16b, v6.16b + eor v3.16b, v3.16b, v7.16b + orr v8.16b, 
v8.16b, v0.16b + orr v9.16b, v9.16b, v1.16b + orr v10.16b, v10.16b, v2.16b + orr v11.16b, v11.16b, v3.16b + ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #0x40 + ld4 {v4.16b, v5.16b, v6.16b, v7.16b}, [x1], #0x40 + eor v0.16b, v0.16b, v4.16b + eor v1.16b, v1.16b, v5.16b + eor v2.16b, v2.16b, v6.16b + eor v3.16b, v3.16b, v7.16b + orr v8.16b, v8.16b, v0.16b + orr v9.16b, v9.16b, v1.16b + orr v10.16b, v10.16b, v2.16b + orr v11.16b, v11.16b, v3.16b + ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #0x40 + ld4 {v4.16b, v5.16b, v6.16b, v7.16b}, [x1], #0x40 + eor v0.16b, v0.16b, v4.16b + eor v1.16b, v1.16b, v5.16b + eor v2.16b, v2.16b, v6.16b + eor v3.16b, v3.16b, v7.16b + orr v8.16b, v8.16b, v0.16b + orr v9.16b, v9.16b, v1.16b + orr v10.16b, v10.16b, v2.16b + orr v11.16b, v11.16b, v3.16b + ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #0x40 + ld4 {v4.16b, v5.16b, v6.16b, v7.16b}, [x1], #0x40 + eor v0.16b, v0.16b, v4.16b + eor v1.16b, v1.16b, v5.16b + eor v2.16b, v2.16b, v6.16b + eor v3.16b, v3.16b, v7.16b + orr v8.16b, v8.16b, v0.16b + orr v9.16b, v9.16b, v1.16b + orr v10.16b, v10.16b, v2.16b + orr v11.16b, v11.16b, v3.16b + ld2 {v0.16b, v1.16b}, [x0] + ld2 {v4.16b, v5.16b}, [x1] + eor v0.16b, v0.16b, v4.16b + eor v1.16b, v1.16b, v5.16b + orr v8.16b, v8.16b, v0.16b + orr v9.16b, v9.16b, v1.16b +L_kyber_aarch64_cmp_neon_done: + orr v8.16b, v8.16b, v9.16b + orr v10.16b, v10.16b, v11.16b + orr v8.16b, v8.16b, v10.16b + ins v9.b[0], v8.b[1] + orr v8.16b, v8.16b, v9.16b + mov x0, v8.d[0] + subs x0, x0, xzr + csetm w0, ne + ldp d8, d9, [x29, #16] + ldp d10, d11, [x29, #32] + ldp x29, x30, [sp], #48 + ret +#ifndef __APPLE__ + .size kyber_cmp_neon,.-kyber_cmp_neon +#endif /* __APPLE__ */ +#ifndef __APPLE__ + .text + .type L_kyber_aarch64_rej_uniform_neon_mask, %object + .section .rodata + .size L_kyber_aarch64_rej_uniform_neon_mask, 16 +#else + .section __DATA,__data +#endif /* __APPLE__ */ +#ifndef __APPLE__ + .align 2 +#else + .p2align 2 +#endif /* __APPLE__ */ +L_kyber_aarch64_rej_uniform_neon_mask: + .short 0x0fff,0x0fff,0x0fff,0x0fff,0x0fff,0x0fff,0x0fff,0x0fff +#ifndef __APPLE__ + .text + .type L_kyber_aarch64_rej_uniform_neon_bits, %object + .section .rodata + .size L_kyber_aarch64_rej_uniform_neon_bits, 16 +#else + .section __DATA,__data +#endif /* __APPLE__ */ +#ifndef __APPLE__ + .align 2 +#else + .p2align 2 +#endif /* __APPLE__ */ +L_kyber_aarch64_rej_uniform_neon_bits: + .short 0x0001,0x0002,0x0004,0x0008,0x0010,0x0020,0x0040,0x0080 +#ifndef __APPLE__ + .text + .type L_kyber_aarch64_rej_uniform_neon_indeces, %object + .section .rodata + .size L_kyber_aarch64_rej_uniform_neon_indeces, 4096 +#else + .section __DATA,__data +#endif /* __APPLE__ */ +#ifndef __APPLE__ + .align 1 +#else + .p2align 1 +#endif /* __APPLE__ */ +L_kyber_aarch64_rej_uniform_neon_indeces: + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x00,0x01,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x02,0x03,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x00,0x01,0x02,0x03,0xff,0xff,0xff,0xff + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x04,0x05,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x00,0x01,0x04,0x05,0xff,0xff,0xff,0xff + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x02,0x03,0x04,0x05,0xff,0xff,0xff,0xff + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x00,0x01,0x02,0x03,0x04,0x05,0xff,0xff + .byte 
0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x06,0x07,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x00,0x01,0x06,0x07,0xff,0xff,0xff,0xff + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x02,0x03,0x06,0x07,0xff,0xff,0xff,0xff + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x00,0x01,0x02,0x03,0x06,0x07,0xff,0xff + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x04,0x05,0x06,0x07,0xff,0xff,0xff,0xff + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x00,0x01,0x04,0x05,0x06,0x07,0xff,0xff + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x02,0x03,0x04,0x05,0x06,0x07,0xff,0xff + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07 + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x08,0x09,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x00,0x01,0x08,0x09,0xff,0xff,0xff,0xff + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x02,0x03,0x08,0x09,0xff,0xff,0xff,0xff + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x00,0x01,0x02,0x03,0x08,0x09,0xff,0xff + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x04,0x05,0x08,0x09,0xff,0xff,0xff,0xff + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x00,0x01,0x04,0x05,0x08,0x09,0xff,0xff + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x02,0x03,0x04,0x05,0x08,0x09,0xff,0xff + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x00,0x01,0x02,0x03,0x04,0x05,0x08,0x09 + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x06,0x07,0x08,0x09,0xff,0xff,0xff,0xff + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x00,0x01,0x06,0x07,0x08,0x09,0xff,0xff + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x02,0x03,0x06,0x07,0x08,0x09,0xff,0xff + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x00,0x01,0x02,0x03,0x06,0x07,0x08,0x09 + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x04,0x05,0x06,0x07,0x08,0x09,0xff,0xff + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x00,0x01,0x04,0x05,0x06,0x07,0x08,0x09 + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09 + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07 + .byte 0x08,0x09,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x0a,0x0b,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x00,0x01,0x0a,0x0b,0xff,0xff,0xff,0xff + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x02,0x03,0x0a,0x0b,0xff,0xff,0xff,0xff + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x00,0x01,0x02,0x03,0x0a,0x0b,0xff,0xff + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x04,0x05,0x0a,0x0b,0xff,0xff,0xff,0xff + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x00,0x01,0x04,0x05,0x0a,0x0b,0xff,0xff + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x02,0x03,0x04,0x05,0x0a,0x0b,0xff,0xff + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x00,0x01,0x02,0x03,0x04,0x05,0x0a,0x0b + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x06,0x07,0x0a,0x0b,0xff,0xff,0xff,0xff + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x00,0x01,0x06,0x07,0x0a,0x0b,0xff,0xff + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x02,0x03,0x06,0x07,0x0a,0x0b,0xff,0xff + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x00,0x01,0x02,0x03,0x06,0x07,0x0a,0x0b + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x04,0x05,0x06,0x07,0x0a,0x0b,0xff,0xff + .byte 
0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x00,0x01,0x04,0x05,0x06,0x07,0x0a,0x0b + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x02,0x03,0x04,0x05,0x06,0x07,0x0a,0x0b + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07 + .byte 0x0a,0x0b,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x08,0x09,0x0a,0x0b,0xff,0xff,0xff,0xff + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x00,0x01,0x08,0x09,0x0a,0x0b,0xff,0xff + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x02,0x03,0x08,0x09,0x0a,0x0b,0xff,0xff + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x00,0x01,0x02,0x03,0x08,0x09,0x0a,0x0b + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x04,0x05,0x08,0x09,0x0a,0x0b,0xff,0xff + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x00,0x01,0x04,0x05,0x08,0x09,0x0a,0x0b + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x02,0x03,0x04,0x05,0x08,0x09,0x0a,0x0b + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x00,0x01,0x02,0x03,0x04,0x05,0x08,0x09 + .byte 0x0a,0x0b,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x06,0x07,0x08,0x09,0x0a,0x0b,0xff,0xff + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x00,0x01,0x06,0x07,0x08,0x09,0x0a,0x0b + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x02,0x03,0x06,0x07,0x08,0x09,0x0a,0x0b + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x00,0x01,0x02,0x03,0x06,0x07,0x08,0x09 + .byte 0x0a,0x0b,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x00,0x01,0x04,0x05,0x06,0x07,0x08,0x09 + .byte 0x0a,0x0b,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09 + .byte 0x0a,0x0b,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07 + .byte 0x08,0x09,0x0a,0x0b,0xff,0xff,0xff,0xff + .byte 0x0c,0x0d,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x00,0x01,0x0c,0x0d,0xff,0xff,0xff,0xff + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x02,0x03,0x0c,0x0d,0xff,0xff,0xff,0xff + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x00,0x01,0x02,0x03,0x0c,0x0d,0xff,0xff + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x04,0x05,0x0c,0x0d,0xff,0xff,0xff,0xff + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x00,0x01,0x04,0x05,0x0c,0x0d,0xff,0xff + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x02,0x03,0x04,0x05,0x0c,0x0d,0xff,0xff + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x00,0x01,0x02,0x03,0x04,0x05,0x0c,0x0d + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x06,0x07,0x0c,0x0d,0xff,0xff,0xff,0xff + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x00,0x01,0x06,0x07,0x0c,0x0d,0xff,0xff + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x02,0x03,0x06,0x07,0x0c,0x0d,0xff,0xff + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x00,0x01,0x02,0x03,0x06,0x07,0x0c,0x0d + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x04,0x05,0x06,0x07,0x0c,0x0d,0xff,0xff + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x00,0x01,0x04,0x05,0x06,0x07,0x0c,0x0d + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x02,0x03,0x04,0x05,0x06,0x07,0x0c,0x0d + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07 + .byte 0x0c,0x0d,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x08,0x09,0x0c,0x0d,0xff,0xff,0xff,0xff + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x00,0x01,0x08,0x09,0x0c,0x0d,0xff,0xff + .byte 
0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x02,0x03,0x08,0x09,0x0c,0x0d,0xff,0xff + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x00,0x01,0x02,0x03,0x08,0x09,0x0c,0x0d + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x04,0x05,0x08,0x09,0x0c,0x0d,0xff,0xff + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x00,0x01,0x04,0x05,0x08,0x09,0x0c,0x0d + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x02,0x03,0x04,0x05,0x08,0x09,0x0c,0x0d + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x00,0x01,0x02,0x03,0x04,0x05,0x08,0x09 + .byte 0x0c,0x0d,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x06,0x07,0x08,0x09,0x0c,0x0d,0xff,0xff + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x00,0x01,0x06,0x07,0x08,0x09,0x0c,0x0d + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x02,0x03,0x06,0x07,0x08,0x09,0x0c,0x0d + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x00,0x01,0x02,0x03,0x06,0x07,0x08,0x09 + .byte 0x0c,0x0d,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x04,0x05,0x06,0x07,0x08,0x09,0x0c,0x0d + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x00,0x01,0x04,0x05,0x06,0x07,0x08,0x09 + .byte 0x0c,0x0d,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09 + .byte 0x0c,0x0d,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07 + .byte 0x08,0x09,0x0c,0x0d,0xff,0xff,0xff,0xff + .byte 0x0a,0x0b,0x0c,0x0d,0xff,0xff,0xff,0xff + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x00,0x01,0x0a,0x0b,0x0c,0x0d,0xff,0xff + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x02,0x03,0x0a,0x0b,0x0c,0x0d,0xff,0xff + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x00,0x01,0x02,0x03,0x0a,0x0b,0x0c,0x0d + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x04,0x05,0x0a,0x0b,0x0c,0x0d,0xff,0xff + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x00,0x01,0x04,0x05,0x0a,0x0b,0x0c,0x0d + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x02,0x03,0x04,0x05,0x0a,0x0b,0x0c,0x0d + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x00,0x01,0x02,0x03,0x04,0x05,0x0a,0x0b + .byte 0x0c,0x0d,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x06,0x07,0x0a,0x0b,0x0c,0x0d,0xff,0xff + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x00,0x01,0x06,0x07,0x0a,0x0b,0x0c,0x0d + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x02,0x03,0x06,0x07,0x0a,0x0b,0x0c,0x0d + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x00,0x01,0x02,0x03,0x06,0x07,0x0a,0x0b + .byte 0x0c,0x0d,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x04,0x05,0x06,0x07,0x0a,0x0b,0x0c,0x0d + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x00,0x01,0x04,0x05,0x06,0x07,0x0a,0x0b + .byte 0x0c,0x0d,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x02,0x03,0x04,0x05,0x06,0x07,0x0a,0x0b + .byte 0x0c,0x0d,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07 + .byte 0x0a,0x0b,0x0c,0x0d,0xff,0xff,0xff,0xff + .byte 0x08,0x09,0x0a,0x0b,0x0c,0x0d,0xff,0xff + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x00,0x01,0x08,0x09,0x0a,0x0b,0x0c,0x0d + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x02,0x03,0x08,0x09,0x0a,0x0b,0x0c,0x0d + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x00,0x01,0x02,0x03,0x08,0x09,0x0a,0x0b + .byte 0x0c,0x0d,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x04,0x05,0x08,0x09,0x0a,0x0b,0x0c,0x0d + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x00,0x01,0x04,0x05,0x08,0x09,0x0a,0x0b + .byte 0x0c,0x0d,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x02,0x03,0x04,0x05,0x08,0x09,0x0a,0x0b + .byte 
0x0c,0x0d,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x00,0x01,0x02,0x03,0x04,0x05,0x08,0x09 + .byte 0x0a,0x0b,0x0c,0x0d,0xff,0xff,0xff,0xff + .byte 0x06,0x07,0x08,0x09,0x0a,0x0b,0x0c,0x0d + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x00,0x01,0x06,0x07,0x08,0x09,0x0a,0x0b + .byte 0x0c,0x0d,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x02,0x03,0x06,0x07,0x08,0x09,0x0a,0x0b + .byte 0x0c,0x0d,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x00,0x01,0x02,0x03,0x06,0x07,0x08,0x09 + .byte 0x0a,0x0b,0x0c,0x0d,0xff,0xff,0xff,0xff + .byte 0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b + .byte 0x0c,0x0d,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x00,0x01,0x04,0x05,0x06,0x07,0x08,0x09 + .byte 0x0a,0x0b,0x0c,0x0d,0xff,0xff,0xff,0xff + .byte 0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09 + .byte 0x0a,0x0b,0x0c,0x0d,0xff,0xff,0xff,0xff + .byte 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07 + .byte 0x08,0x09,0x0a,0x0b,0x0c,0x0d,0xff,0xff + .byte 0x0e,0x0f,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x00,0x01,0x0e,0x0f,0xff,0xff,0xff,0xff + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x02,0x03,0x0e,0x0f,0xff,0xff,0xff,0xff + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x00,0x01,0x02,0x03,0x0e,0x0f,0xff,0xff + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x04,0x05,0x0e,0x0f,0xff,0xff,0xff,0xff + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x00,0x01,0x04,0x05,0x0e,0x0f,0xff,0xff + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x02,0x03,0x04,0x05,0x0e,0x0f,0xff,0xff + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x00,0x01,0x02,0x03,0x04,0x05,0x0e,0x0f + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x06,0x07,0x0e,0x0f,0xff,0xff,0xff,0xff + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x00,0x01,0x06,0x07,0x0e,0x0f,0xff,0xff + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x02,0x03,0x06,0x07,0x0e,0x0f,0xff,0xff + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x00,0x01,0x02,0x03,0x06,0x07,0x0e,0x0f + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x04,0x05,0x06,0x07,0x0e,0x0f,0xff,0xff + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x00,0x01,0x04,0x05,0x06,0x07,0x0e,0x0f + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x02,0x03,0x04,0x05,0x06,0x07,0x0e,0x0f + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07 + .byte 0x0e,0x0f,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x08,0x09,0x0e,0x0f,0xff,0xff,0xff,0xff + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x00,0x01,0x08,0x09,0x0e,0x0f,0xff,0xff + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x02,0x03,0x08,0x09,0x0e,0x0f,0xff,0xff + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x00,0x01,0x02,0x03,0x08,0x09,0x0e,0x0f + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x04,0x05,0x08,0x09,0x0e,0x0f,0xff,0xff + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x00,0x01,0x04,0x05,0x08,0x09,0x0e,0x0f + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x02,0x03,0x04,0x05,0x08,0x09,0x0e,0x0f + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x00,0x01,0x02,0x03,0x04,0x05,0x08,0x09 + .byte 0x0e,0x0f,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x06,0x07,0x08,0x09,0x0e,0x0f,0xff,0xff + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x00,0x01,0x06,0x07,0x08,0x09,0x0e,0x0f + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x02,0x03,0x06,0x07,0x08,0x09,0x0e,0x0f + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x00,0x01,0x02,0x03,0x06,0x07,0x08,0x09 + .byte 
0x0e,0x0f,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x04,0x05,0x06,0x07,0x08,0x09,0x0e,0x0f + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x00,0x01,0x04,0x05,0x06,0x07,0x08,0x09 + .byte 0x0e,0x0f,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09 + .byte 0x0e,0x0f,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07 + .byte 0x08,0x09,0x0e,0x0f,0xff,0xff,0xff,0xff + .byte 0x0a,0x0b,0x0e,0x0f,0xff,0xff,0xff,0xff + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x00,0x01,0x0a,0x0b,0x0e,0x0f,0xff,0xff + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x02,0x03,0x0a,0x0b,0x0e,0x0f,0xff,0xff + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x00,0x01,0x02,0x03,0x0a,0x0b,0x0e,0x0f + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x04,0x05,0x0a,0x0b,0x0e,0x0f,0xff,0xff + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x00,0x01,0x04,0x05,0x0a,0x0b,0x0e,0x0f + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x02,0x03,0x04,0x05,0x0a,0x0b,0x0e,0x0f + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x00,0x01,0x02,0x03,0x04,0x05,0x0a,0x0b + .byte 0x0e,0x0f,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x06,0x07,0x0a,0x0b,0x0e,0x0f,0xff,0xff + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x00,0x01,0x06,0x07,0x0a,0x0b,0x0e,0x0f + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x02,0x03,0x06,0x07,0x0a,0x0b,0x0e,0x0f + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x00,0x01,0x02,0x03,0x06,0x07,0x0a,0x0b + .byte 0x0e,0x0f,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x04,0x05,0x06,0x07,0x0a,0x0b,0x0e,0x0f + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x00,0x01,0x04,0x05,0x06,0x07,0x0a,0x0b + .byte 0x0e,0x0f,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x02,0x03,0x04,0x05,0x06,0x07,0x0a,0x0b + .byte 0x0e,0x0f,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07 + .byte 0x0a,0x0b,0x0e,0x0f,0xff,0xff,0xff,0xff + .byte 0x08,0x09,0x0a,0x0b,0x0e,0x0f,0xff,0xff + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x00,0x01,0x08,0x09,0x0a,0x0b,0x0e,0x0f + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x02,0x03,0x08,0x09,0x0a,0x0b,0x0e,0x0f + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x00,0x01,0x02,0x03,0x08,0x09,0x0a,0x0b + .byte 0x0e,0x0f,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x04,0x05,0x08,0x09,0x0a,0x0b,0x0e,0x0f + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x00,0x01,0x04,0x05,0x08,0x09,0x0a,0x0b + .byte 0x0e,0x0f,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x02,0x03,0x04,0x05,0x08,0x09,0x0a,0x0b + .byte 0x0e,0x0f,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x00,0x01,0x02,0x03,0x04,0x05,0x08,0x09 + .byte 0x0a,0x0b,0x0e,0x0f,0xff,0xff,0xff,0xff + .byte 0x06,0x07,0x08,0x09,0x0a,0x0b,0x0e,0x0f + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x00,0x01,0x06,0x07,0x08,0x09,0x0a,0x0b + .byte 0x0e,0x0f,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x02,0x03,0x06,0x07,0x08,0x09,0x0a,0x0b + .byte 0x0e,0x0f,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x00,0x01,0x02,0x03,0x06,0x07,0x08,0x09 + .byte 0x0a,0x0b,0x0e,0x0f,0xff,0xff,0xff,0xff + .byte 0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b + .byte 0x0e,0x0f,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x00,0x01,0x04,0x05,0x06,0x07,0x08,0x09 + .byte 0x0a,0x0b,0x0e,0x0f,0xff,0xff,0xff,0xff + .byte 0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09 + .byte 0x0a,0x0b,0x0e,0x0f,0xff,0xff,0xff,0xff + .byte 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07 + .byte 0x08,0x09,0x0a,0x0b,0x0e,0x0f,0xff,0xff + .byte 0x0c,0x0d,0x0e,0x0f,0xff,0xff,0xff,0xff + .byte 
0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x00,0x01,0x0c,0x0d,0x0e,0x0f,0xff,0xff + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x02,0x03,0x0c,0x0d,0x0e,0x0f,0xff,0xff + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x00,0x01,0x02,0x03,0x0c,0x0d,0x0e,0x0f + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x04,0x05,0x0c,0x0d,0x0e,0x0f,0xff,0xff + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x00,0x01,0x04,0x05,0x0c,0x0d,0x0e,0x0f + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x02,0x03,0x04,0x05,0x0c,0x0d,0x0e,0x0f + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x00,0x01,0x02,0x03,0x04,0x05,0x0c,0x0d + .byte 0x0e,0x0f,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x06,0x07,0x0c,0x0d,0x0e,0x0f,0xff,0xff + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x00,0x01,0x06,0x07,0x0c,0x0d,0x0e,0x0f + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x02,0x03,0x06,0x07,0x0c,0x0d,0x0e,0x0f + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x00,0x01,0x02,0x03,0x06,0x07,0x0c,0x0d + .byte 0x0e,0x0f,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x04,0x05,0x06,0x07,0x0c,0x0d,0x0e,0x0f + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x00,0x01,0x04,0x05,0x06,0x07,0x0c,0x0d + .byte 0x0e,0x0f,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x02,0x03,0x04,0x05,0x06,0x07,0x0c,0x0d + .byte 0x0e,0x0f,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07 + .byte 0x0c,0x0d,0x0e,0x0f,0xff,0xff,0xff,0xff + .byte 0x08,0x09,0x0c,0x0d,0x0e,0x0f,0xff,0xff + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x00,0x01,0x08,0x09,0x0c,0x0d,0x0e,0x0f + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x02,0x03,0x08,0x09,0x0c,0x0d,0x0e,0x0f + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x00,0x01,0x02,0x03,0x08,0x09,0x0c,0x0d + .byte 0x0e,0x0f,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x04,0x05,0x08,0x09,0x0c,0x0d,0x0e,0x0f + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x00,0x01,0x04,0x05,0x08,0x09,0x0c,0x0d + .byte 0x0e,0x0f,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x02,0x03,0x04,0x05,0x08,0x09,0x0c,0x0d + .byte 0x0e,0x0f,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x00,0x01,0x02,0x03,0x04,0x05,0x08,0x09 + .byte 0x0c,0x0d,0x0e,0x0f,0xff,0xff,0xff,0xff + .byte 0x06,0x07,0x08,0x09,0x0c,0x0d,0x0e,0x0f + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x00,0x01,0x06,0x07,0x08,0x09,0x0c,0x0d + .byte 0x0e,0x0f,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x02,0x03,0x06,0x07,0x08,0x09,0x0c,0x0d + .byte 0x0e,0x0f,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x00,0x01,0x02,0x03,0x06,0x07,0x08,0x09 + .byte 0x0c,0x0d,0x0e,0x0f,0xff,0xff,0xff,0xff + .byte 0x04,0x05,0x06,0x07,0x08,0x09,0x0c,0x0d + .byte 0x0e,0x0f,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x00,0x01,0x04,0x05,0x06,0x07,0x08,0x09 + .byte 0x0c,0x0d,0x0e,0x0f,0xff,0xff,0xff,0xff + .byte 0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09 + .byte 0x0c,0x0d,0x0e,0x0f,0xff,0xff,0xff,0xff + .byte 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07 + .byte 0x08,0x09,0x0c,0x0d,0x0e,0x0f,0xff,0xff + .byte 0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff,0xff + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x00,0x01,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x02,0x03,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x00,0x01,0x02,0x03,0x0a,0x0b,0x0c,0x0d + .byte 0x0e,0x0f,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x04,0x05,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x00,0x01,0x04,0x05,0x0a,0x0b,0x0c,0x0d + .byte 
0x0e,0x0f,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x02,0x03,0x04,0x05,0x0a,0x0b,0x0c,0x0d + .byte 0x0e,0x0f,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x00,0x01,0x02,0x03,0x04,0x05,0x0a,0x0b + .byte 0x0c,0x0d,0x0e,0x0f,0xff,0xff,0xff,0xff + .byte 0x06,0x07,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x00,0x01,0x06,0x07,0x0a,0x0b,0x0c,0x0d + .byte 0x0e,0x0f,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x02,0x03,0x06,0x07,0x0a,0x0b,0x0c,0x0d + .byte 0x0e,0x0f,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x00,0x01,0x02,0x03,0x06,0x07,0x0a,0x0b + .byte 0x0c,0x0d,0x0e,0x0f,0xff,0xff,0xff,0xff + .byte 0x04,0x05,0x06,0x07,0x0a,0x0b,0x0c,0x0d + .byte 0x0e,0x0f,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x00,0x01,0x04,0x05,0x06,0x07,0x0a,0x0b + .byte 0x0c,0x0d,0x0e,0x0f,0xff,0xff,0xff,0xff + .byte 0x02,0x03,0x04,0x05,0x06,0x07,0x0a,0x0b + .byte 0x0c,0x0d,0x0e,0x0f,0xff,0xff,0xff,0xff + .byte 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07 + .byte 0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff,0xff + .byte 0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x00,0x01,0x08,0x09,0x0a,0x0b,0x0c,0x0d + .byte 0x0e,0x0f,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x02,0x03,0x08,0x09,0x0a,0x0b,0x0c,0x0d + .byte 0x0e,0x0f,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x00,0x01,0x02,0x03,0x08,0x09,0x0a,0x0b + .byte 0x0c,0x0d,0x0e,0x0f,0xff,0xff,0xff,0xff + .byte 0x04,0x05,0x08,0x09,0x0a,0x0b,0x0c,0x0d + .byte 0x0e,0x0f,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x00,0x01,0x04,0x05,0x08,0x09,0x0a,0x0b + .byte 0x0c,0x0d,0x0e,0x0f,0xff,0xff,0xff,0xff + .byte 0x02,0x03,0x04,0x05,0x08,0x09,0x0a,0x0b + .byte 0x0c,0x0d,0x0e,0x0f,0xff,0xff,0xff,0xff + .byte 0x00,0x01,0x02,0x03,0x04,0x05,0x08,0x09 + .byte 0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff,0xff + .byte 0x06,0x07,0x08,0x09,0x0a,0x0b,0x0c,0x0d + .byte 0x0e,0x0f,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x00,0x01,0x06,0x07,0x08,0x09,0x0a,0x0b + .byte 0x0c,0x0d,0x0e,0x0f,0xff,0xff,0xff,0xff + .byte 0x02,0x03,0x06,0x07,0x08,0x09,0x0a,0x0b + .byte 0x0c,0x0d,0x0e,0x0f,0xff,0xff,0xff,0xff + .byte 0x00,0x01,0x02,0x03,0x06,0x07,0x08,0x09 + .byte 0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff,0xff + .byte 0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b + .byte 0x0c,0x0d,0x0e,0x0f,0xff,0xff,0xff,0xff + .byte 0x00,0x01,0x04,0x05,0x06,0x07,0x08,0x09 + .byte 0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff,0xff + .byte 0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09 + .byte 0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff,0xff + .byte 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07 + .byte 0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f +#ifndef __APPLE__ +.text +.globl kyber_rej_uniform_neon +.type kyber_rej_uniform_neon,@function +.align 2 +kyber_rej_uniform_neon: +#else +.section __TEXT,__text +.globl _kyber_rej_uniform_neon +.p2align 2 +_kyber_rej_uniform_neon: +#endif /* __APPLE__ */ + stp x29, x30, [sp, #-64]! 
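
The `.byte` rows above form a 256-entry, 16-bytes-per-entry left-pack table for kyber_rej_uniform_neon: entry m (at byte offset m*16, hence the `lsl w8, w8, #4` in the loop below) lists, in ascending order, the byte offsets of the 16-bit candidate lanes whose accepted bit is set in the mask m, padded with 0xff so that TBL zero-fills the unused tail of the output vector. A minimal C sketch of a generator that would emit such a table (illustrative only, not part of the patch):

    #include <stdio.h>

    /* Emit the 256 x 16 left-pack table: entry m packs the byte pairs
     * (2*i, 2*i+1) for every set bit i of m, then pads with 0xff. */
    int main(void)
    {
        int m, i, j, k;
        for (m = 0; m < 256; m++) {
            unsigned char e[16];
            j = 0;
            for (i = 0; i < 8; i++) {
                if (m & (1 << i)) {
                    e[j++] = (unsigned char)(2 * i);
                    e[j++] = (unsigned char)(2 * i + 1);
                }
            }
            while (j < 16)
                e[j++] = 0xff;
            for (k = 0; k < 16; k += 8) {
                printf("    .byte ");
                for (i = k; i < k + 8; i++)
                    printf("0x%02x%s", e[i], (i < k + 7) ? "," : "\n");
            }
        }
        return 0;
    }
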
+ add x29, sp, #0 + stp d8, d9, [x29, #16] + stp d10, d11, [x29, #32] + stp d12, d13, [x29, #48] +#ifndef __APPLE__ + adrp x4, L_kyber_aarch64_rej_uniform_neon_mask + add x4, x4, :lo12:L_kyber_aarch64_rej_uniform_neon_mask +#else + adrp x4, L_kyber_aarch64_rej_uniform_neon_mask@PAGE + add x4, x4, :lo12:L_kyber_aarch64_rej_uniform_neon_mask@PAGEOFF +#endif /* __APPLE__ */ +#ifndef __APPLE__ + adrp x5, L_kyber_aarch64_q + add x5, x5, :lo12:L_kyber_aarch64_q +#else + adrp x5, L_kyber_aarch64_q@PAGE + add x5, x5, :lo12:L_kyber_aarch64_q@PAGEOFF +#endif /* __APPLE__ */ +#ifndef __APPLE__ + adrp x6, L_kyber_aarch64_rej_uniform_neon_bits + add x6, x6, :lo12:L_kyber_aarch64_rej_uniform_neon_bits +#else + adrp x6, L_kyber_aarch64_rej_uniform_neon_bits@PAGE + add x6, x6, :lo12:L_kyber_aarch64_rej_uniform_neon_bits@PAGEOFF +#endif /* __APPLE__ */ +#ifndef __APPLE__ + adrp x7, L_kyber_aarch64_rej_uniform_neon_indeces + add x7, x7, :lo12:L_kyber_aarch64_rej_uniform_neon_indeces +#else + adrp x7, L_kyber_aarch64_rej_uniform_neon_indeces@PAGE + add x7, x7, :lo12:L_kyber_aarch64_rej_uniform_neon_indeces@PAGEOFF +#endif /* __APPLE__ */ + eor v1.16b, v1.16b, v1.16b + eor v12.16b, v12.16b, v12.16b + eor v13.16b, v13.16b, v13.16b + eor x12, x12, x12 + eor v10.16b, v10.16b, v10.16b + eor v11.16b, v11.16b, v11.16b + mov x13, #0xd01 + ldr q0, [x4] + ldr q3, [x5] + ldr q2, [x6] + subs wzr, w1, #0 + beq L_kyber_aarch64_rej_uniform_neon_done + subs wzr, w1, #16 + blt L_kyber_aarch64_rej_uniform_neon_loop_4 +L_kyber_aarch64_rej_uniform_neon_loop_16: + ld3 {v4.8b, v5.8b, v6.8b}, [x2], #24 + zip1 v4.16b, v4.16b, v1.16b + zip1 v5.16b, v5.16b, v1.16b + zip1 v6.16b, v6.16b, v1.16b + shl v7.8h, v5.8h, #8 + ushr v8.8h, v5.8h, #4 + shl v6.8h, v6.8h, #4 + orr v4.16b, v4.16b, v7.16b + orr v5.16b, v8.16b, v6.16b + and v7.16b, v4.16b, v0.16b + and v8.16b, v5.16b, v0.16b + zip1 v4.8h, v7.8h, v8.8h + zip2 v5.8h, v7.8h, v8.8h + cmgt v7.8h, v3.8h, v4.8h + cmgt v8.8h, v3.8h, v5.8h + ushr v12.8h, v7.8h, #15 + ushr v13.8h, v8.8h, #15 + addv h12, v12.8h + addv h13, v13.8h + mov x10, v12.d[0] + mov x11, v13.d[0] + and v10.16b, v7.16b, v2.16b + and v11.16b, v8.16b, v2.16b + addv h10, v10.8h + addv h11, v11.8h + mov w8, v10.s[0] + mov w9, v11.s[0] + lsl w8, w8, #4 + lsl w9, w9, #4 + ldr q10, [x7, x8] + ldr q11, [x7, x9] + tbl v7.16b, {v4.16b}, v10.16b + tbl v8.16b, {v5.16b}, v11.16b + str q7, [x0] + add x0, x0, x10, lsl 1 + add x12, x12, x10 + str q8, [x0] + add x0, x0, x11, lsl 1 + add x12, x12, x11 + subs w3, w3, #24 + beq L_kyber_aarch64_rej_uniform_neon_done + sub w10, w1, w12 + subs x10, x10, #16 + blt L_kyber_aarch64_rej_uniform_neon_loop_4 + b L_kyber_aarch64_rej_uniform_neon_loop_16 +L_kyber_aarch64_rej_uniform_neon_loop_4: + subs w10, w1, w12 + beq L_kyber_aarch64_rej_uniform_neon_done + subs x10, x10, #4 + blt L_kyber_aarch64_rej_uniform_neon_loop_lt_4 + ldr x4, [x2], #6 + lsr x5, x4, #12 + lsr x6, x4, #24 + lsr x7, x4, #36 + and x4, x4, #0xfff + and x5, x5, #0xfff + and x6, x6, #0xfff + and x7, x7, #0xfff + strh w4, [x0] + subs xzr, x4, x13 + cinc x0, x0, lt + cinc x0, x0, lt + cinc x12, x12, lt + strh w5, [x0] + subs xzr, x5, x13 + cinc x0, x0, lt + cinc x0, x0, lt + cinc x12, x12, lt + strh w6, [x0] + subs xzr, x6, x13 + cinc x0, x0, lt + cinc x0, x0, lt + cinc x12, x12, lt + strh w7, [x0] + subs xzr, x7, x13 + cinc x0, x0, lt + cinc x0, x0, lt + cinc x12, x12, lt + subs w3, w3, #6 + beq L_kyber_aarch64_rej_uniform_neon_done + b L_kyber_aarch64_rej_uniform_neon_loop_4 +L_kyber_aarch64_rej_uniform_neon_loop_lt_4: + ldr x4, 
[x2], #6 + lsr x5, x4, #12 + lsr x6, x4, #24 + lsr x7, x4, #36 + and x4, x4, #0xfff + and x5, x5, #0xfff + and x6, x6, #0xfff + and x7, x7, #0xfff + strh w4, [x0] + subs xzr, x4, x13 + cinc x0, x0, lt + cinc x0, x0, lt + cinc x12, x12, lt + subs wzr, w1, w12 + beq L_kyber_aarch64_rej_uniform_neon_done + strh w5, [x0] + subs xzr, x5, x13 + cinc x0, x0, lt + cinc x0, x0, lt + cinc x12, x12, lt + subs wzr, w1, w12 + beq L_kyber_aarch64_rej_uniform_neon_done + strh w6, [x0] + subs xzr, x6, x13 + cinc x0, x0, lt + cinc x0, x0, lt + cinc x12, x12, lt + subs wzr, w1, w12 + beq L_kyber_aarch64_rej_uniform_neon_done + strh w7, [x0] + subs xzr, x7, x13 + cinc x0, x0, lt + cinc x0, x0, lt + cinc x12, x12, lt + subs wzr, w1, w12 + beq L_kyber_aarch64_rej_uniform_neon_done + subs w3, w3, #6 + beq L_kyber_aarch64_rej_uniform_neon_done + b L_kyber_aarch64_rej_uniform_neon_loop_lt_4 +L_kyber_aarch64_rej_uniform_neon_done: + mov x0, x12 + ldp d8, d9, [x29, #16] + ldp d10, d11, [x29, #32] + ldp d12, d13, [x29, #48] + ldp x29, x30, [sp], #0x40 + ret +#ifndef __APPLE__ + .size kyber_rej_uniform_neon,.-kyber_rej_uniform_neon +#endif /* __APPLE__ */ +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 +#ifndef __APPLE__ +.text +.globl kyber_sha3_blocksx3_neon +.type kyber_sha3_blocksx3_neon,@function +.align 2 +kyber_sha3_blocksx3_neon: +#else +.section __TEXT,__text +.globl _kyber_sha3_blocksx3_neon +.p2align 2 +_kyber_sha3_blocksx3_neon: +#endif /* __APPLE__ */ + stp x29, x30, [sp, #-224]! + add x29, sp, #0 + stp x17, x19, [x29, #72] + stp x20, x21, [x29, #88] + stp x22, x23, [x29, #104] + stp x24, x25, [x29, #120] + stp x26, x27, [x29, #136] + str x28, [x29, #152] + stp d8, d9, [x29, #160] + stp d10, d11, [x29, #176] + stp d12, d13, [x29, #192] + stp d14, d15, [x29, #208] +#ifndef __APPLE__ + adrp x27, L_sha3_aarch64_r + add x27, x27, :lo12:L_sha3_aarch64_r +#else + adrp x27, L_sha3_aarch64_r@PAGE + add x27, x27, :lo12:L_sha3_aarch64_r@PAGEOFF +#endif /* __APPLE__ */ + str x0, [x29, #40] + ld4 {v0.d, v1.d, v2.d, v3.d}[0], [x0], #32 + ld4 {v4.d, v5.d, v6.d, v7.d}[0], [x0], #32 + ld4 {v8.d, v9.d, v10.d, v11.d}[0], [x0], #32 + ld4 {v12.d, v13.d, v14.d, v15.d}[0], [x0], #32 + ld4 {v16.d, v17.d, v18.d, v19.d}[0], [x0], #32 + ld4 {v20.d, v21.d, v22.d, v23.d}[0], [x0], #32 + ld1 {v24.d}[0], [x0] + add x0, x0, #8 + ld4 {v0.d, v1.d, v2.d, v3.d}[1], [x0], #32 + ld4 {v4.d, v5.d, v6.d, v7.d}[1], [x0], #32 + ld4 {v8.d, v9.d, v10.d, v11.d}[1], [x0], #32 + ld4 {v12.d, v13.d, v14.d, v15.d}[1], [x0], #32 + ld4 {v16.d, v17.d, v18.d, v19.d}[1], [x0], #32 + ld4 {v20.d, v21.d, v22.d, v23.d}[1], [x0], #32 + ld1 {v24.d}[1], [x0] + add x0, x0, #8 + ldp x1, x2, [x0] + ldp x3, x4, [x0, #16] + ldp x5, x6, [x0, #32] + ldp x7, x8, [x0, #48] + ldp x9, x10, [x0, #64] + ldp x11, x12, [x0, #80] + ldp x13, x14, [x0, #96] + ldp x15, x16, [x0, #112] + ldp x17, x19, [x0, #128] + ldp x20, x21, [x0, #144] + ldp x22, x23, [x0, #160] + ldp x24, x25, [x0, #176] + ldr x26, [x0, #192] + mov x28, #24 + # Start of 24 rounds +L_SHA3_transform_blocksx3_neon_begin: + stp x27, x28, [x29, #48] + # Col Mix + eor3 v31.16b, v0.16b, v5.16b, v10.16b + eor x0, x5, x10 + eor3 v27.16b, v1.16b, v6.16b, v11.16b + eor x30, x1, x6 + eor3 v28.16b, v2.16b, v7.16b, v12.16b + eor x28, x3, x8 + eor3 v29.16b, v3.16b, v8.16b, v13.16b + eor x0, x0, x15 + eor3 v30.16b, v4.16b, v9.16b, v14.16b + eor x30, x30, x11 + eor3 v31.16b, v31.16b, v15.16b, v20.16b + eor x28, x28, x13 + eor3 v27.16b, v27.16b, v16.16b, v21.16b + eor x0, x0, x21 + eor3 v28.16b, v28.16b, v17.16b, v22.16b + eor x30, x30, x16 + 
eor3 v29.16b, v29.16b, v18.16b, v23.16b + eor x28, x28, x19 + eor3 v30.16b, v30.16b, v19.16b, v24.16b + eor x0, x0, x26 + rax1 v25.2d, v30.2d, v27.2d + eor x30, x30, x22 + rax1 v26.2d, v31.2d, v28.2d + eor x28, x28, x24 + rax1 v27.2d, v27.2d, v29.2d + str x0, [x29, #32] + rax1 v28.2d, v28.2d, v30.2d + str x28, [x29, #24] + rax1 v29.2d, v29.2d, v31.2d + eor x27, x2, x7 + eor v0.16b, v0.16b, v25.16b + xar v30.2d, v1.2d, v26.2d, #63 + eor x28, x4, x9 + xar v1.2d, v6.2d, v26.2d, #20 + eor x27, x27, x12 + xar v6.2d, v9.2d, v29.2d, #44 + eor x28, x28, x14 + xar v9.2d, v22.2d, v27.2d, #3 + eor x27, x27, x17 + xar v22.2d, v14.2d, v29.2d, #25 + eor x28, x28, x20 + xar v14.2d, v20.2d, v25.2d, #46 + eor x27, x27, x23 + xar v20.2d, v2.2d, v27.2d, #2 + eor x28, x28, x25 + xar v2.2d, v12.2d, v27.2d, #21 + eor x0, x0, x27, ror 63 + xar v12.2d, v13.2d, v28.2d, #39 + eor x27, x27, x28, ror 63 + xar v13.2d, v19.2d, v29.2d, #56 + eor x1, x1, x0 + xar v19.2d, v23.2d, v28.2d, #8 + eor x6, x6, x0 + xar v23.2d, v15.2d, v25.2d, #23 + eor x11, x11, x0 + xar v15.2d, v4.2d, v29.2d, #37 + eor x16, x16, x0 + xar v4.2d, v24.2d, v29.2d, #50 + eor x22, x22, x0 + xar v24.2d, v21.2d, v26.2d, #62 + eor x3, x3, x27 + xar v21.2d, v8.2d, v28.2d, #9 + eor x8, x8, x27 + xar v8.2d, v16.2d, v26.2d, #19 + eor x13, x13, x27 + xar v16.2d, v5.2d, v25.2d, #28 + eor x19, x19, x27 + xar v5.2d, v3.2d, v28.2d, #36 + eor x24, x24, x27 + xar v3.2d, v18.2d, v28.2d, #43 + ldr x0, [x29, #32] + xar v18.2d, v17.2d, v27.2d, #49 + ldr x27, [x29, #24] + xar v17.2d, v11.2d, v26.2d, #54 + eor x28, x28, x30, ror 63 + xar v11.2d, v7.2d, v27.2d, #58 + eor x30, x30, x27, ror 63 + xar v7.2d, v10.2d, v25.2d, #61 + eor x27, x27, x0, ror 63 + # Row Mix + mov v25.16b, v0.16b + eor x5, x5, x28 + mov v26.16b, v1.16b + eor x10, x10, x28 + bcax v0.16b, v25.16b, v2.16b, v26.16b + eor x15, x15, x28 + bcax v1.16b, v26.16b, v3.16b, v2.16b + eor x21, x21, x28 + bcax v2.16b, v2.16b, v4.16b, v3.16b + eor x26, x26, x28 + bcax v3.16b, v3.16b, v25.16b, v4.16b + eor x2, x2, x30 + bcax v4.16b, v4.16b, v26.16b, v25.16b + eor x7, x7, x30 + mov v25.16b, v5.16b + eor x12, x12, x30 + mov v26.16b, v6.16b + eor x17, x17, x30 + bcax v5.16b, v25.16b, v7.16b, v26.16b + eor x23, x23, x30 + bcax v6.16b, v26.16b, v8.16b, v7.16b + eor x4, x4, x27 + bcax v7.16b, v7.16b, v9.16b, v8.16b + eor x9, x9, x27 + bcax v8.16b, v8.16b, v25.16b, v9.16b + eor x14, x14, x27 + bcax v9.16b, v9.16b, v26.16b, v25.16b + eor x20, x20, x27 + mov v26.16b, v11.16b + eor x25, x25, x27 + # Swap Rotate Base + bcax v10.16b, v30.16b, v12.16b, v26.16b + ror x0, x2, #63 + bcax v11.16b, v26.16b, v13.16b, v12.16b + ror x2, x7, #20 + bcax v12.16b, v12.16b, v14.16b, v13.16b + ror x7, x10, #44 + bcax v13.16b, v13.16b, v30.16b, v14.16b + ror x10, x24, #3 + bcax v14.16b, v14.16b, v26.16b, v30.16b + ror x24, x15, #25 + mov v25.16b, v15.16b + ror x15, x22, #46 + mov v26.16b, v16.16b + ror x22, x3, #2 + bcax v15.16b, v25.16b, v17.16b, v26.16b + ror x3, x13, #21 + bcax v16.16b, v26.16b, v18.16b, v17.16b + ror x13, x14, #39 + bcax v17.16b, v17.16b, v19.16b, v18.16b + ror x14, x21, #56 + bcax v18.16b, v18.16b, v25.16b, v19.16b + ror x21, x25, #8 + bcax v19.16b, v19.16b, v26.16b, v25.16b + ror x25, x16, #23 + mov v25.16b, v20.16b + ror x16, x5, #37 + mov v26.16b, v21.16b + ror x5, x26, #50 + bcax v20.16b, v25.16b, v22.16b, v26.16b + ror x26, x23, #62 + bcax v21.16b, v26.16b, v23.16b, v22.16b + ror x23, x9, #9 + bcax v22.16b, v22.16b, v24.16b, v23.16b + ror x9, x17, #19 + bcax v23.16b, v23.16b, v25.16b, v24.16b + ror x17, x6, #28 + 
bcax v24.16b, v24.16b, v26.16b, v25.16b + ror x6, x4, #36 + ror x4, x20, #43 + ror x20, x19, #49 + ror x19, x12, #54 + ror x12, x8, #58 + ror x8, x11, #61 + # Row Mix Base + bic x11, x3, x2 + bic x27, x4, x3 + bic x28, x1, x5 + bic x30, x2, x1 + eor x1, x1, x11 + eor x2, x2, x27 + bic x11, x5, x4 + eor x4, x4, x28 + eor x3, x3, x11 + eor x5, x5, x30 + bic x11, x8, x7 + bic x27, x9, x8 + bic x28, x6, x10 + bic x30, x7, x6 + eor x6, x6, x11 + eor x7, x7, x27 + bic x11, x10, x9 + eor x9, x9, x28 + eor x8, x8, x11 + eor x10, x10, x30 + bic x11, x13, x12 + bic x27, x14, x13 + bic x28, x0, x15 + bic x30, x12, x0 + eor x11, x0, x11 + eor x12, x12, x27 + bic x0, x15, x14 + eor x14, x14, x28 + eor x13, x13, x0 + eor x15, x15, x30 + bic x0, x19, x17 + bic x27, x20, x19 + bic x28, x16, x21 + bic x30, x17, x16 + eor x16, x16, x0 + eor x17, x17, x27 + bic x0, x21, x20 + eor x20, x20, x28 + eor x19, x19, x0 + eor x21, x21, x30 + bic x0, x24, x23 + bic x27, x25, x24 + bic x28, x22, x26 + bic x30, x23, x22 + eor x22, x22, x0 + eor x23, x23, x27 + bic x0, x26, x25 + eor x25, x25, x28 + eor x24, x24, x0 + eor x26, x26, x30 + # Done transforming + ldp x27, x28, [x29, #48] + ldr x0, [x27], #8 + subs x28, x28, #1 + mov v30.d[0], x0 + mov v30.d[1], x0 + eor x1, x1, x0 + eor v0.16b, v0.16b, v30.16b + bne L_SHA3_transform_blocksx3_neon_begin + ldr x0, [x29, #40] + st4 {v0.d, v1.d, v2.d, v3.d}[0], [x0], #32 + st4 {v4.d, v5.d, v6.d, v7.d}[0], [x0], #32 + st4 {v8.d, v9.d, v10.d, v11.d}[0], [x0], #32 + st4 {v12.d, v13.d, v14.d, v15.d}[0], [x0], #32 + st4 {v16.d, v17.d, v18.d, v19.d}[0], [x0], #32 + st4 {v20.d, v21.d, v22.d, v23.d}[0], [x0], #32 + st1 {v24.d}[0], [x0] + add x0, x0, #8 + st4 {v0.d, v1.d, v2.d, v3.d}[1], [x0], #32 + st4 {v4.d, v5.d, v6.d, v7.d}[1], [x0], #32 + st4 {v8.d, v9.d, v10.d, v11.d}[1], [x0], #32 + st4 {v12.d, v13.d, v14.d, v15.d}[1], [x0], #32 + st4 {v16.d, v17.d, v18.d, v19.d}[1], [x0], #32 + st4 {v20.d, v21.d, v22.d, v23.d}[1], [x0], #32 + st1 {v24.d}[1], [x0] + add x0, x0, #8 + stp x1, x2, [x0] + stp x3, x4, [x0, #16] + stp x5, x6, [x0, #32] + stp x7, x8, [x0, #48] + stp x9, x10, [x0, #64] + stp x11, x12, [x0, #80] + stp x13, x14, [x0, #96] + stp x15, x16, [x0, #112] + stp x17, x19, [x0, #128] + stp x20, x21, [x0, #144] + stp x22, x23, [x0, #160] + stp x24, x25, [x0, #176] + str x26, [x0, #192] + ldp x17, x19, [x29, #72] + ldp x20, x21, [x29, #88] + ldp x22, x23, [x29, #104] + ldp x24, x25, [x29, #120] + ldp x26, x27, [x29, #136] + ldr x28, [x29, #152] + ldp d8, d9, [x29, #160] + ldp d10, d11, [x29, #176] + ldp d12, d13, [x29, #192] + ldp d14, d15, [x29, #208] + ldp x29, x30, [sp], #0xe0 + ret +#ifndef __APPLE__ + .size kyber_sha3_blocksx3_neon,.-kyber_sha3_blocksx3_neon +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.text +.globl kyber_shake128_blocksx3_seed_neon +.type kyber_shake128_blocksx3_seed_neon,@function +.align 2 +kyber_shake128_blocksx3_seed_neon: +#else +.section __TEXT,__text +.globl _kyber_shake128_blocksx3_seed_neon +.p2align 2 +_kyber_shake128_blocksx3_seed_neon: +#endif /* __APPLE__ */ + stp x29, x30, [sp, #-224]!
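
kyber_sha3_blocksx3_neon above runs three Keccak-f[1600] permutations at once: two states travel in the two 64-bit lanes of v0-v24 (using the SHA-3 extension's EOR3/RAX1/XAR/BCAX instructions), while the third is scheduled through x1-x17/x19-x26 in the interleaved scalar instructions (x18 is avoided as the platform register). As a reference point, the "Col Mix" step that the EOR3/RAX1 pairs compute is the standard theta function; a plain-C model per 64-bit lane:

    #include <stdint.h>

    static uint64_t rol64(uint64_t x, int n)
    {
        return (x << n) | (x >> (64 - n));
    }

    /* theta: EOR3 builds the five column parities c[x]; RAX1 forms
     * d[x] = c[(x+4) mod 5] ^ rol64(c[(x+1) mod 5], 1); the XARs that
     * follow fold d into the state while applying the rho rotates. */
    static void theta(uint64_t s[25])
    {
        uint64_t c[5], d[5];
        int x, i;
        for (x = 0; x < 5; x++)
            c[x] = s[x] ^ s[x + 5] ^ s[x + 10] ^ s[x + 15] ^ s[x + 20];
        for (x = 0; x < 5; x++)
            d[x] = c[(x + 4) % 5] ^ rol64(c[(x + 1) % 5], 1);
        for (i = 0; i < 25; i++)
            s[i] ^= d[i % 5];
    }
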
+ add x29, sp, #0 + stp x17, x19, [x29, #72] + stp x20, x21, [x29, #88] + stp x22, x23, [x29, #104] + stp x24, x25, [x29, #120] + stp x26, x27, [x29, #136] + str x28, [x29, #152] + stp d8, d9, [x29, #160] + stp d10, d11, [x29, #176] + stp d12, d13, [x29, #192] + stp d14, d15, [x29, #208] +#ifndef __APPLE__ + adrp x28, L_sha3_aarch64_r + add x28, x28, :lo12:L_sha3_aarch64_r +#else + adrp x28, L_sha3_aarch64_r@PAGE + add x28, x28, :lo12:L_sha3_aarch64_r@PAGEOFF +#endif /* __APPLE__ */ + str x0, [x29, #40] + add x0, x0, #32 + ld1 {v4.d}[0], [x0] + ldp x2, x3, [x1], #16 + add x0, x0, #0xc8 + ld1 {v4.d}[1], [x0] + ldp x4, x5, [x1], #16 + ldr x6, [x0, #200] + eor v5.16b, v5.16b, v5.16b + eor x7, x7, x7 + eor v6.16b, v6.16b, v6.16b + eor x8, x8, x8 + eor v7.16b, v7.16b, v7.16b + eor x9, x9, x9 + eor v8.16b, v8.16b, v8.16b + eor x10, x10, x10 + eor v9.16b, v9.16b, v9.16b + eor x11, x11, x11 + eor v10.16b, v10.16b, v10.16b + eor x12, x12, x12 + eor v11.16b, v11.16b, v11.16b + eor x13, x13, x13 + eor v12.16b, v12.16b, v12.16b + eor x14, x14, x14 + eor v13.16b, v13.16b, v13.16b + eor x15, x15, x15 + eor v14.16b, v14.16b, v14.16b + eor x16, x16, x16 + eor v15.16b, v15.16b, v15.16b + eor x17, x17, x17 + eor v16.16b, v16.16b, v16.16b + eor x19, x19, x19 + eor v17.16b, v17.16b, v17.16b + eor x20, x20, x20 + eor v18.16b, v18.16b, v18.16b + eor x21, x21, x21 + eor v19.16b, v19.16b, v19.16b + eor x22, x22, x22 + movz x23, #0x8000, lsl 48 + eor v21.16b, v21.16b, v21.16b + eor x24, x24, x24 + eor v22.16b, v22.16b, v22.16b + eor x25, x25, x25 + eor v23.16b, v23.16b, v23.16b + eor x26, x26, x26 + eor v24.16b, v24.16b, v24.16b + eor x27, x27, x27 + dup v0.2d, x2 + dup v1.2d, x3 + dup v2.2d, x4 + dup v3.2d, x5 + dup v20.2d, x23 + mov x1, #24 + # Start of 24 rounds +L_SHA3_shake128_blocksx3_seed_neon_begin: + stp x28, x1, [x29, #48] + # Col Mix + eor3 v31.16b, v0.16b, v5.16b, v10.16b + eor x0, x6, x11 + eor3 v27.16b, v1.16b, v6.16b, v11.16b + eor x30, x2, x7 + eor3 v28.16b, v2.16b, v7.16b, v12.16b + eor x28, x4, x9 + eor3 v29.16b, v3.16b, v8.16b, v13.16b + eor x0, x0, x16 + eor3 v30.16b, v4.16b, v9.16b, v14.16b + eor x30, x30, x12 + eor3 v31.16b, v31.16b, v15.16b, v20.16b + eor x28, x28, x14 + eor3 v27.16b, v27.16b, v16.16b, v21.16b + eor x0, x0, x22 + eor3 v28.16b, v28.16b, v17.16b, v22.16b + eor x30, x30, x17 + eor3 v29.16b, v29.16b, v18.16b, v23.16b + eor x28, x28, x20 + eor3 v30.16b, v30.16b, v19.16b, v24.16b + eor x0, x0, x27 + rax1 v25.2d, v30.2d, v27.2d + eor x30, x30, x23 + rax1 v26.2d, v31.2d, v28.2d + eor x28, x28, x25 + rax1 v27.2d, v27.2d, v29.2d + str x0, [x29, #32] + rax1 v28.2d, v28.2d, v30.2d + str x28, [x29, #24] + rax1 v29.2d, v29.2d, v31.2d + eor x1, x3, x8 + eor v0.16b, v0.16b, v25.16b + xar v30.2d, v1.2d, v26.2d, #63 + eor x28, x5, x10 + xar v1.2d, v6.2d, v26.2d, #20 + eor x1, x1, x13 + xar v6.2d, v9.2d, v29.2d, #44 + eor x28, x28, x15 + xar v9.2d, v22.2d, v27.2d, #3 + eor x1, x1, x19 + xar v22.2d, v14.2d, v29.2d, #25 + eor x28, x28, x21 + xar v14.2d, v20.2d, v25.2d, #46 + eor x1, x1, x24 + xar v20.2d, v2.2d, v27.2d, #2 + eor x28, x28, x26 + xar v2.2d, v12.2d, v27.2d, #21 + eor x0, x0, x1, ror 63 + xar v12.2d, v13.2d, v28.2d, #39 + eor x1, x1, x28, ror 63 + xar v13.2d, v19.2d, v29.2d, #56 + eor x2, x2, x0 + xar v19.2d, v23.2d, v28.2d, #8 + eor x7, x7, x0 + xar v23.2d, v15.2d, v25.2d, #23 + eor x12, x12, x0 + xar v15.2d, v4.2d, v29.2d, #37 + eor x17, x17, x0 + xar v4.2d, v24.2d, v29.2d, #50 + eor x23, x23, x0 + xar v24.2d, v21.2d, v26.2d, #62 + eor x4, x4, x1 + xar v21.2d, v8.2d, v28.2d, #9 + 
eor x9, x9, x1 + xar v8.2d, v16.2d, v26.2d, #19 + eor x14, x14, x1 + xar v16.2d, v5.2d, v25.2d, #28 + eor x20, x20, x1 + xar v5.2d, v3.2d, v28.2d, #36 + eor x25, x25, x1 + xar v3.2d, v18.2d, v28.2d, #43 + ldr x0, [x29, #32] + xar v18.2d, v17.2d, v27.2d, #49 + ldr x1, [x29, #24] + xar v17.2d, v11.2d, v26.2d, #54 + eor x28, x28, x30, ror 63 + xar v11.2d, v7.2d, v27.2d, #58 + eor x30, x30, x1, ror 63 + xar v7.2d, v10.2d, v25.2d, #61 + eor x1, x1, x0, ror 63 + # Row Mix + mov v25.16b, v0.16b + eor x6, x6, x28 + mov v26.16b, v1.16b + eor x11, x11, x28 + bcax v0.16b, v25.16b, v2.16b, v26.16b + eor x16, x16, x28 + bcax v1.16b, v26.16b, v3.16b, v2.16b + eor x22, x22, x28 + bcax v2.16b, v2.16b, v4.16b, v3.16b + eor x27, x27, x28 + bcax v3.16b, v3.16b, v25.16b, v4.16b + eor x3, x3, x30 + bcax v4.16b, v4.16b, v26.16b, v25.16b + eor x8, x8, x30 + mov v25.16b, v5.16b + eor x13, x13, x30 + mov v26.16b, v6.16b + eor x19, x19, x30 + bcax v5.16b, v25.16b, v7.16b, v26.16b + eor x24, x24, x30 + bcax v6.16b, v26.16b, v8.16b, v7.16b + eor x5, x5, x1 + bcax v7.16b, v7.16b, v9.16b, v8.16b + eor x10, x10, x1 + bcax v8.16b, v8.16b, v25.16b, v9.16b + eor x15, x15, x1 + bcax v9.16b, v9.16b, v26.16b, v25.16b + eor x21, x21, x1 + mov v26.16b, v11.16b + eor x26, x26, x1 + # Swap Rotate Base + bcax v10.16b, v30.16b, v12.16b, v26.16b + ror x0, x3, #63 + bcax v11.16b, v26.16b, v13.16b, v12.16b + ror x3, x8, #20 + bcax v12.16b, v12.16b, v14.16b, v13.16b + ror x8, x11, #44 + bcax v13.16b, v13.16b, v30.16b, v14.16b + ror x11, x25, #3 + bcax v14.16b, v14.16b, v26.16b, v30.16b + ror x25, x16, #25 + mov v25.16b, v15.16b + ror x16, x23, #46 + mov v26.16b, v16.16b + ror x23, x4, #2 + bcax v15.16b, v25.16b, v17.16b, v26.16b + ror x4, x14, #21 + bcax v16.16b, v26.16b, v18.16b, v17.16b + ror x14, x15, #39 + bcax v17.16b, v17.16b, v19.16b, v18.16b + ror x15, x22, #56 + bcax v18.16b, v18.16b, v25.16b, v19.16b + ror x22, x26, #8 + bcax v19.16b, v19.16b, v26.16b, v25.16b + ror x26, x17, #23 + mov v25.16b, v20.16b + ror x17, x6, #37 + mov v26.16b, v21.16b + ror x6, x27, #50 + bcax v20.16b, v25.16b, v22.16b, v26.16b + ror x27, x24, #62 + bcax v21.16b, v26.16b, v23.16b, v22.16b + ror x24, x10, #9 + bcax v22.16b, v22.16b, v24.16b, v23.16b + ror x10, x19, #19 + bcax v23.16b, v23.16b, v25.16b, v24.16b + ror x19, x7, #28 + bcax v24.16b, v24.16b, v26.16b, v25.16b + ror x7, x5, #36 + ror x5, x21, #43 + ror x21, x20, #49 + ror x20, x13, #54 + ror x13, x9, #58 + ror x9, x12, #61 + # Row Mix Base + bic x12, x4, x3 + bic x1, x5, x4 + bic x28, x2, x6 + bic x30, x3, x2 + eor x2, x2, x12 + eor x3, x3, x1 + bic x12, x6, x5 + eor x5, x5, x28 + eor x4, x4, x12 + eor x6, x6, x30 + bic x12, x9, x8 + bic x1, x10, x9 + bic x28, x7, x11 + bic x30, x8, x7 + eor x7, x7, x12 + eor x8, x8, x1 + bic x12, x11, x10 + eor x10, x10, x28 + eor x9, x9, x12 + eor x11, x11, x30 + bic x12, x14, x13 + bic x1, x15, x14 + bic x28, x0, x16 + bic x30, x13, x0 + eor x12, x0, x12 + eor x13, x13, x1 + bic x0, x16, x15 + eor x15, x15, x28 + eor x14, x14, x0 + eor x16, x16, x30 + bic x0, x20, x19 + bic x1, x21, x20 + bic x28, x17, x22 + bic x30, x19, x17 + eor x17, x17, x0 + eor x19, x19, x1 + bic x0, x22, x21 + eor x21, x21, x28 + eor x20, x20, x0 + eor x22, x22, x30 + bic x0, x25, x24 + bic x1, x26, x25 + bic x28, x23, x27 + bic x30, x24, x23 + eor x23, x23, x0 + eor x24, x24, x1 + bic x0, x27, x26 + eor x26, x26, x28 + eor x25, x25, x0 + eor x27, x27, x30 + # Done transforming + ldp x28, x1, [x29, #48] + ldr x0, [x28], #8 + subs x1, x1, #1 + mov v30.d[0], x0 + mov v30.d[1], x0 + 
eor x2, x2, x0 + eor v0.16b, v0.16b, v30.16b + bne L_SHA3_shake128_blocksx3_seed_neon_begin + ldr x0, [x29, #40] + st4 {v0.d, v1.d, v2.d, v3.d}[0], [x0], #32 + st4 {v4.d, v5.d, v6.d, v7.d}[0], [x0], #32 + st4 {v8.d, v9.d, v10.d, v11.d}[0], [x0], #32 + st4 {v12.d, v13.d, v14.d, v15.d}[0], [x0], #32 + st4 {v16.d, v17.d, v18.d, v19.d}[0], [x0], #32 + st4 {v20.d, v21.d, v22.d, v23.d}[0], [x0], #32 + st1 {v24.d}[0], [x0] + add x0, x0, #8 + st4 {v0.d, v1.d, v2.d, v3.d}[1], [x0], #32 + st4 {v4.d, v5.d, v6.d, v7.d}[1], [x0], #32 + st4 {v8.d, v9.d, v10.d, v11.d}[1], [x0], #32 + st4 {v12.d, v13.d, v14.d, v15.d}[1], [x0], #32 + st4 {v16.d, v17.d, v18.d, v19.d}[1], [x0], #32 + st4 {v20.d, v21.d, v22.d, v23.d}[1], [x0], #32 + st1 {v24.d}[1], [x0] + add x0, x0, #8 + stp x2, x3, [x0] + stp x4, x5, [x0, #16] + stp x6, x7, [x0, #32] + stp x8, x9, [x0, #48] + stp x10, x11, [x0, #64] + stp x12, x13, [x0, #80] + stp x14, x15, [x0, #96] + stp x16, x17, [x0, #112] + stp x19, x20, [x0, #128] + stp x21, x22, [x0, #144] + stp x23, x24, [x0, #160] + stp x25, x26, [x0, #176] + str x27, [x0, #192] + ldp x17, x19, [x29, #72] + ldp x20, x21, [x29, #88] + ldp x22, x23, [x29, #104] + ldp x24, x25, [x29, #120] + ldp x26, x27, [x29, #136] + ldr x28, [x29, #152] + ldp d8, d9, [x29, #160] + ldp d10, d11, [x29, #176] + ldp d12, d13, [x29, #192] + ldp d14, d15, [x29, #208] + ldp x29, x30, [sp], #0xe0 + ret +#ifndef __APPLE__ + .size kyber_shake128_blocksx3_seed_neon,.-kyber_shake128_blocksx3_seed_neon +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.text +.globl kyber_shake256_blocksx3_seed_neon +.type kyber_shake256_blocksx3_seed_neon,@function +.align 2 +kyber_shake256_blocksx3_seed_neon: +#else +.section __TEXT,__text +.globl _kyber_shake256_blocksx3_seed_neon +.p2align 2 +_kyber_shake256_blocksx3_seed_neon: +#endif /* __APPLE__ */ + stp x29, x30, [sp, #-224]! 
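
kyber_shake128_blocksx3_seed_neon above fuses absorption and permutation: for each of the three states it copies the 32-byte seed into lanes 0-3, keeps the caller-initialized lane 4 (the per-stream index and 0x1F domain-separation bytes), zeroes the rest, and sets the 0x80 padding bit in lane 20, the last lane of the 168-byte SHAKE128 rate block (the `movz x23, #0x8000, lsl 48` / `dup v20.2d, x23` pair). A hedged C outline of the per-state setup (keccak_f1600 stands in for the permutation):

    #include <stdint.h>

    static void shake128_absorb_seed(uint64_t s[25], const uint64_t seed[4])
    {
        int i;
        s[0] = seed[0];
        s[1] = seed[1];
        s[2] = seed[2];
        s[3] = seed[3];
        /* s[4] stays as the caller set it: stream index + 0x1F DS byte */
        for (i = 5; i < 25; i++)
            s[i] = 0;
        /* 0x80 pad bit in the last byte of the 168-byte rate block */
        s[20] = 0x8000000000000000ULL;
        /* keccak_f1600(s);  -- the 24 rounds performed above */
    }
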
+ add x29, sp, #0 + stp x17, x19, [x29, #72] + stp x20, x21, [x29, #88] + stp x22, x23, [x29, #104] + stp x24, x25, [x29, #120] + stp x26, x27, [x29, #136] + str x28, [x29, #152] + stp d8, d9, [x29, #160] + stp d10, d11, [x29, #176] + stp d12, d13, [x29, #192] + stp d14, d15, [x29, #208] +#ifndef __APPLE__ + adrp x28, L_sha3_aarch64_r + add x28, x28, :lo12:L_sha3_aarch64_r +#else + adrp x28, L_sha3_aarch64_r@PAGE + add x28, x28, :lo12:L_sha3_aarch64_r@PAGEOFF +#endif /* __APPLE__ */ + str x0, [x29, #40] + add x0, x0, #32 + ld1 {v4.d}[0], [x0] + ldp x2, x3, [x1], #16 + add x0, x0, #0xc8 + ld1 {v4.d}[1], [x0] + ldp x4, x5, [x1], #16 + ldr x6, [x0, #200] + eor v5.16b, v5.16b, v5.16b + eor x7, x7, x7 + eor v6.16b, v6.16b, v6.16b + eor x8, x8, x8 + eor v7.16b, v7.16b, v7.16b + eor x9, x9, x9 + eor v8.16b, v8.16b, v8.16b + eor x10, x10, x10 + eor v9.16b, v9.16b, v9.16b + eor x11, x11, x11 + eor v10.16b, v10.16b, v10.16b + eor x12, x12, x12 + eor v11.16b, v11.16b, v11.16b + eor x13, x13, x13 + eor v12.16b, v12.16b, v12.16b + eor x14, x14, x14 + eor v13.16b, v13.16b, v13.16b + eor x15, x15, x15 + eor v14.16b, v14.16b, v14.16b + eor x16, x16, x16 + eor v15.16b, v15.16b, v15.16b + eor x17, x17, x17 + movz x19, #0x8000, lsl 48 + eor v17.16b, v17.16b, v17.16b + eor x20, x20, x20 + eor v18.16b, v18.16b, v18.16b + eor x21, x21, x21 + eor v19.16b, v19.16b, v19.16b + eor x22, x22, x22 + eor v20.16b, v20.16b, v20.16b + eor x23, x23, x23 + eor v21.16b, v21.16b, v21.16b + eor x24, x24, x24 + eor v22.16b, v22.16b, v22.16b + eor x25, x25, x25 + eor v23.16b, v23.16b, v23.16b + eor x26, x26, x26 + eor v24.16b, v24.16b, v24.16b + eor x27, x27, x27 + dup v0.2d, x2 + dup v1.2d, x3 + dup v2.2d, x4 + dup v3.2d, x5 + dup v16.2d, x19 + mov x1, #24 + # Start of 24 rounds +L_SHA3_shake256_blocksx3_seed_neon_begin: + stp x28, x1, [x29, #48] + # Col Mix + eor3 v31.16b, v0.16b, v5.16b, v10.16b + eor x0, x6, x11 + eor3 v27.16b, v1.16b, v6.16b, v11.16b + eor x30, x2, x7 + eor3 v28.16b, v2.16b, v7.16b, v12.16b + eor x28, x4, x9 + eor3 v29.16b, v3.16b, v8.16b, v13.16b + eor x0, x0, x16 + eor3 v30.16b, v4.16b, v9.16b, v14.16b + eor x30, x30, x12 + eor3 v31.16b, v31.16b, v15.16b, v20.16b + eor x28, x28, x14 + eor3 v27.16b, v27.16b, v16.16b, v21.16b + eor x0, x0, x22 + eor3 v28.16b, v28.16b, v17.16b, v22.16b + eor x30, x30, x17 + eor3 v29.16b, v29.16b, v18.16b, v23.16b + eor x28, x28, x20 + eor3 v30.16b, v30.16b, v19.16b, v24.16b + eor x0, x0, x27 + rax1 v25.2d, v30.2d, v27.2d + eor x30, x30, x23 + rax1 v26.2d, v31.2d, v28.2d + eor x28, x28, x25 + rax1 v27.2d, v27.2d, v29.2d + str x0, [x29, #32] + rax1 v28.2d, v28.2d, v30.2d + str x28, [x29, #24] + rax1 v29.2d, v29.2d, v31.2d + eor x1, x3, x8 + eor v0.16b, v0.16b, v25.16b + xar v30.2d, v1.2d, v26.2d, #63 + eor x28, x5, x10 + xar v1.2d, v6.2d, v26.2d, #20 + eor x1, x1, x13 + xar v6.2d, v9.2d, v29.2d, #44 + eor x28, x28, x15 + xar v9.2d, v22.2d, v27.2d, #3 + eor x1, x1, x19 + xar v22.2d, v14.2d, v29.2d, #25 + eor x28, x28, x21 + xar v14.2d, v20.2d, v25.2d, #46 + eor x1, x1, x24 + xar v20.2d, v2.2d, v27.2d, #2 + eor x28, x28, x26 + xar v2.2d, v12.2d, v27.2d, #21 + eor x0, x0, x1, ror 63 + xar v12.2d, v13.2d, v28.2d, #39 + eor x1, x1, x28, ror 63 + xar v13.2d, v19.2d, v29.2d, #56 + eor x2, x2, x0 + xar v19.2d, v23.2d, v28.2d, #8 + eor x7, x7, x0 + xar v23.2d, v15.2d, v25.2d, #23 + eor x12, x12, x0 + xar v15.2d, v4.2d, v29.2d, #37 + eor x17, x17, x0 + xar v4.2d, v24.2d, v29.2d, #50 + eor x23, x23, x0 + xar v24.2d, v21.2d, v26.2d, #62 + eor x4, x4, x1 + xar v21.2d, v8.2d, v28.2d, #9 + 
eor x9, x9, x1 + xar v8.2d, v16.2d, v26.2d, #19 + eor x14, x14, x1 + xar v16.2d, v5.2d, v25.2d, #28 + eor x20, x20, x1 + xar v5.2d, v3.2d, v28.2d, #36 + eor x25, x25, x1 + xar v3.2d, v18.2d, v28.2d, #43 + ldr x0, [x29, #32] + xar v18.2d, v17.2d, v27.2d, #49 + ldr x1, [x29, #24] + xar v17.2d, v11.2d, v26.2d, #54 + eor x28, x28, x30, ror 63 + xar v11.2d, v7.2d, v27.2d, #58 + eor x30, x30, x1, ror 63 + xar v7.2d, v10.2d, v25.2d, #61 + eor x1, x1, x0, ror 63 + # Row Mix + mov v25.16b, v0.16b + eor x6, x6, x28 + mov v26.16b, v1.16b + eor x11, x11, x28 + bcax v0.16b, v25.16b, v2.16b, v26.16b + eor x16, x16, x28 + bcax v1.16b, v26.16b, v3.16b, v2.16b + eor x22, x22, x28 + bcax v2.16b, v2.16b, v4.16b, v3.16b + eor x27, x27, x28 + bcax v3.16b, v3.16b, v25.16b, v4.16b + eor x3, x3, x30 + bcax v4.16b, v4.16b, v26.16b, v25.16b + eor x8, x8, x30 + mov v25.16b, v5.16b + eor x13, x13, x30 + mov v26.16b, v6.16b + eor x19, x19, x30 + bcax v5.16b, v25.16b, v7.16b, v26.16b + eor x24, x24, x30 + bcax v6.16b, v26.16b, v8.16b, v7.16b + eor x5, x5, x1 + bcax v7.16b, v7.16b, v9.16b, v8.16b + eor x10, x10, x1 + bcax v8.16b, v8.16b, v25.16b, v9.16b + eor x15, x15, x1 + bcax v9.16b, v9.16b, v26.16b, v25.16b + eor x21, x21, x1 + mov v26.16b, v11.16b + eor x26, x26, x1 + # Swap Rotate Base + bcax v10.16b, v30.16b, v12.16b, v26.16b + ror x0, x3, #63 + bcax v11.16b, v26.16b, v13.16b, v12.16b + ror x3, x8, #20 + bcax v12.16b, v12.16b, v14.16b, v13.16b + ror x8, x11, #44 + bcax v13.16b, v13.16b, v30.16b, v14.16b + ror x11, x25, #3 + bcax v14.16b, v14.16b, v26.16b, v30.16b + ror x25, x16, #25 + mov v25.16b, v15.16b + ror x16, x23, #46 + mov v26.16b, v16.16b + ror x23, x4, #2 + bcax v15.16b, v25.16b, v17.16b, v26.16b + ror x4, x14, #21 + bcax v16.16b, v26.16b, v18.16b, v17.16b + ror x14, x15, #39 + bcax v17.16b, v17.16b, v19.16b, v18.16b + ror x15, x22, #56 + bcax v18.16b, v18.16b, v25.16b, v19.16b + ror x22, x26, #8 + bcax v19.16b, v19.16b, v26.16b, v25.16b + ror x26, x17, #23 + mov v25.16b, v20.16b + ror x17, x6, #37 + mov v26.16b, v21.16b + ror x6, x27, #50 + bcax v20.16b, v25.16b, v22.16b, v26.16b + ror x27, x24, #62 + bcax v21.16b, v26.16b, v23.16b, v22.16b + ror x24, x10, #9 + bcax v22.16b, v22.16b, v24.16b, v23.16b + ror x10, x19, #19 + bcax v23.16b, v23.16b, v25.16b, v24.16b + ror x19, x7, #28 + bcax v24.16b, v24.16b, v26.16b, v25.16b + ror x7, x5, #36 + ror x5, x21, #43 + ror x21, x20, #49 + ror x20, x13, #54 + ror x13, x9, #58 + ror x9, x12, #61 + # Row Mix Base + bic x12, x4, x3 + bic x1, x5, x4 + bic x28, x2, x6 + bic x30, x3, x2 + eor x2, x2, x12 + eor x3, x3, x1 + bic x12, x6, x5 + eor x5, x5, x28 + eor x4, x4, x12 + eor x6, x6, x30 + bic x12, x9, x8 + bic x1, x10, x9 + bic x28, x7, x11 + bic x30, x8, x7 + eor x7, x7, x12 + eor x8, x8, x1 + bic x12, x11, x10 + eor x10, x10, x28 + eor x9, x9, x12 + eor x11, x11, x30 + bic x12, x14, x13 + bic x1, x15, x14 + bic x28, x0, x16 + bic x30, x13, x0 + eor x12, x0, x12 + eor x13, x13, x1 + bic x0, x16, x15 + eor x15, x15, x28 + eor x14, x14, x0 + eor x16, x16, x30 + bic x0, x20, x19 + bic x1, x21, x20 + bic x28, x17, x22 + bic x30, x19, x17 + eor x17, x17, x0 + eor x19, x19, x1 + bic x0, x22, x21 + eor x21, x21, x28 + eor x20, x20, x0 + eor x22, x22, x30 + bic x0, x25, x24 + bic x1, x26, x25 + bic x28, x23, x27 + bic x30, x24, x23 + eor x23, x23, x0 + eor x24, x24, x1 + bic x0, x27, x26 + eor x26, x26, x28 + eor x25, x25, x0 + eor x27, x27, x30 + # Done transforming + ldp x28, x1, [x29, #48] + ldr x0, [x28], #8 + subs x1, x1, #1 + mov v30.d[0], x0 + mov v30.d[1], x0 + 
eor x2, x2, x0 + eor v0.16b, v0.16b, v30.16b + bne L_SHA3_shake256_blocksx3_seed_neon_begin + ldr x0, [x29, #40] + st4 {v0.d, v1.d, v2.d, v3.d}[0], [x0], #32 + st4 {v4.d, v5.d, v6.d, v7.d}[0], [x0], #32 + st4 {v8.d, v9.d, v10.d, v11.d}[0], [x0], #32 + st4 {v12.d, v13.d, v14.d, v15.d}[0], [x0], #32 + st4 {v16.d, v17.d, v18.d, v19.d}[0], [x0], #32 + st4 {v20.d, v21.d, v22.d, v23.d}[0], [x0], #32 + st1 {v24.d}[0], [x0] + add x0, x0, #8 + st4 {v0.d, v1.d, v2.d, v3.d}[1], [x0], #32 + st4 {v4.d, v5.d, v6.d, v7.d}[1], [x0], #32 + st4 {v8.d, v9.d, v10.d, v11.d}[1], [x0], #32 + st4 {v12.d, v13.d, v14.d, v15.d}[1], [x0], #32 + st4 {v16.d, v17.d, v18.d, v19.d}[1], [x0], #32 + st4 {v20.d, v21.d, v22.d, v23.d}[1], [x0], #32 + st1 {v24.d}[1], [x0] + add x0, x0, #8 + stp x2, x3, [x0] + stp x4, x5, [x0, #16] + stp x6, x7, [x0, #32] + stp x8, x9, [x0, #48] + stp x10, x11, [x0, #64] + stp x12, x13, [x0, #80] + stp x14, x15, [x0, #96] + stp x16, x17, [x0, #112] + stp x19, x20, [x0, #128] + stp x21, x22, [x0, #144] + stp x23, x24, [x0, #160] + stp x25, x26, [x0, #176] + str x27, [x0, #192] + ldp x17, x19, [x29, #72] + ldp x20, x21, [x29, #88] + ldp x22, x23, [x29, #104] + ldp x24, x25, [x29, #120] + ldp x26, x27, [x29, #136] + ldr x28, [x29, #152] + ldp d8, d9, [x29, #160] + ldp d10, d11, [x29, #176] + ldp d12, d13, [x29, #192] + ldp d14, d15, [x29, #208] + ldp x29, x30, [sp], #0xe0 + ret +#ifndef __APPLE__ + .size kyber_shake256_blocksx3_seed_neon,.-kyber_shake256_blocksx3_seed_neon +#endif /* __APPLE__ */ +#else +#ifndef __APPLE__ +.text +.globl kyber_sha3_blocksx3_neon +.type kyber_sha3_blocksx3_neon,@function +.align 2 +kyber_sha3_blocksx3_neon: +#else +.section __TEXT,__text +.globl _kyber_sha3_blocksx3_neon +.p2align 2 +_kyber_sha3_blocksx3_neon: +#endif /* __APPLE__ */ + stp x29, x30, [sp, #-224]! 
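
kyber_shake256_blocksx3_seed_neon above is the same seeded absorb at the 136-byte SHAKE256 rate, so the 0x80 padding bit lands in lane 16 (`movz x19, #0x8000, lsl 48` / `dup v16.2d, x19`) instead of lane 20. From here the three functions are repeated for builds without WOLFSSL_ARMASM_CRYPTO_SHA3: each XAR of the crypto-extension path becomes an EOR plus a USHR/SLI pair, RAX1 becomes a USHR #63/SLI #1 rotate plus EOR, and BCAX becomes BIC plus EOR. What each EOR + USHR/SLI triple computes, as an illustrative C model:

    #include <stdint.h>

    /* xar(a, b, n) = ror64(a ^ b, n): the EOR gives t, then
     * USHR d, t, #n and SLI d, t, #(64-n) assemble the rotate. */
    static uint64_t xar(uint64_t a, uint64_t b, unsigned n)
    {
        uint64_t t = a ^ b;
        return (t >> n) | (t << (64 - n));  /* valid for 0 < n < 64 */
    }
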
+ add x29, sp, #0 + stp x17, x19, [x29, #72] + stp x20, x21, [x29, #88] + stp x22, x23, [x29, #104] + stp x24, x25, [x29, #120] + stp x26, x27, [x29, #136] + str x28, [x29, #152] + stp d8, d9, [x29, #160] + stp d10, d11, [x29, #176] + stp d12, d13, [x29, #192] + stp d14, d15, [x29, #208] +#ifndef __APPLE__ + adrp x27, L_sha3_aarch64_r + add x27, x27, :lo12:L_sha3_aarch64_r +#else + adrp x27, L_sha3_aarch64_r@PAGE + add x27, x27, :lo12:L_sha3_aarch64_r@PAGEOFF +#endif /* __APPLE__ */ + str x0, [x29, #40] + ld4 {v0.d, v1.d, v2.d, v3.d}[0], [x0], #32 + ld4 {v4.d, v5.d, v6.d, v7.d}[0], [x0], #32 + ld4 {v8.d, v9.d, v10.d, v11.d}[0], [x0], #32 + ld4 {v12.d, v13.d, v14.d, v15.d}[0], [x0], #32 + ld4 {v16.d, v17.d, v18.d, v19.d}[0], [x0], #32 + ld4 {v20.d, v21.d, v22.d, v23.d}[0], [x0], #32 + ld1 {v24.d}[0], [x0] + add x0, x0, #8 + ld4 {v0.d, v1.d, v2.d, v3.d}[1], [x0], #32 + ld4 {v4.d, v5.d, v6.d, v7.d}[1], [x0], #32 + ld4 {v8.d, v9.d, v10.d, v11.d}[1], [x0], #32 + ld4 {v12.d, v13.d, v14.d, v15.d}[1], [x0], #32 + ld4 {v16.d, v17.d, v18.d, v19.d}[1], [x0], #32 + ld4 {v20.d, v21.d, v22.d, v23.d}[1], [x0], #32 + ld1 {v24.d}[1], [x0] + add x0, x0, #8 + ldp x1, x2, [x0] + ldp x3, x4, [x0, #16] + ldp x5, x6, [x0, #32] + ldp x7, x8, [x0, #48] + ldp x9, x10, [x0, #64] + ldp x11, x12, [x0, #80] + ldp x13, x14, [x0, #96] + ldp x15, x16, [x0, #112] + ldp x17, x19, [x0, #128] + ldp x20, x21, [x0, #144] + ldp x22, x23, [x0, #160] + ldp x24, x25, [x0, #176] + ldr x26, [x0, #192] + mov x28, #24 + # Start of 24 rounds +L_SHA3_transform_blocksx3_neon_begin: + stp x27, x28, [x29, #48] + # Col Mix NEON + eor v30.16b, v4.16b, v9.16b + eor x0, x5, x10 + eor v27.16b, v1.16b, v6.16b + eor x30, x1, x6 + eor v30.16b, v30.16b, v14.16b + eor x28, x3, x8 + eor v27.16b, v27.16b, v11.16b + eor x0, x0, x15 + eor v30.16b, v30.16b, v19.16b + eor x30, x30, x11 + eor v27.16b, v27.16b, v16.16b + eor x28, x28, x13 + eor v30.16b, v30.16b, v24.16b + eor x0, x0, x21 + eor v27.16b, v27.16b, v21.16b + eor x30, x30, x16 + ushr v25.2d, v27.2d, #63 + eor x28, x28, x19 + sli v25.2d, v27.2d, #1 + eor x0, x0, x26 + eor v25.16b, v25.16b, v30.16b + eor x30, x30, x22 + eor v31.16b, v0.16b, v5.16b + eor x28, x28, x24 + eor v28.16b, v2.16b, v7.16b + str x0, [x29, #32] + eor v31.16b, v31.16b, v10.16b + str x28, [x29, #24] + eor v28.16b, v28.16b, v12.16b + eor x27, x2, x7 + eor v31.16b, v31.16b, v15.16b + eor x28, x4, x9 + eor v28.16b, v28.16b, v17.16b + eor x27, x27, x12 + eor v31.16b, v31.16b, v20.16b + eor x28, x28, x14 + eor v28.16b, v28.16b, v22.16b + eor x27, x27, x17 + ushr v29.2d, v30.2d, #63 + eor x28, x28, x20 + ushr v26.2d, v28.2d, #63 + eor x27, x27, x23 + sli v29.2d, v30.2d, #1 + eor x28, x28, x25 + sli v26.2d, v28.2d, #1 + eor x0, x0, x27, ror 63 + eor v28.16b, v28.16b, v29.16b + eor x27, x27, x28, ror 63 + eor v29.16b, v3.16b, v8.16b + eor x1, x1, x0 + eor v26.16b, v26.16b, v31.16b + eor x6, x6, x0 + eor v29.16b, v29.16b, v13.16b + eor x11, x11, x0 + eor v29.16b, v29.16b, v18.16b + eor x16, x16, x0 + eor v29.16b, v29.16b, v23.16b + eor x22, x22, x0 + ushr v30.2d, v29.2d, #63 + eor x3, x3, x27 + sli v30.2d, v29.2d, #1 + eor x8, x8, x27 + eor v27.16b, v27.16b, v30.16b + eor x13, x13, x27 + ushr v30.2d, v31.2d, #63 + eor x19, x19, x27 + sli v30.2d, v31.2d, #1 + eor x24, x24, x27 + eor v29.16b, v29.16b, v30.16b + ldr x0, [x29, #32] + # Swap Rotate NEON + eor v0.16b, v0.16b, v25.16b + eor v31.16b, v1.16b, v26.16b + ldr x27, [x29, #24] + eor v6.16b, v6.16b, v26.16b + eor x28, x28, x30, ror 63 + ushr v30.2d, v31.2d, #63 + eor x30, x30, x27, 
ror 63 + ushr v1.2d, v6.2d, #20 + eor x27, x27, x0, ror 63 + sli v30.2d, v31.2d, #1 + eor x5, x5, x28 + sli v1.2d, v6.2d, #44 + eor x10, x10, x28 + eor v31.16b, v9.16b, v29.16b + eor x15, x15, x28 + eor v22.16b, v22.16b, v27.16b + eor x21, x21, x28 + ushr v6.2d, v31.2d, #44 + eor x26, x26, x28 + ushr v9.2d, v22.2d, #3 + eor x2, x2, x30 + sli v6.2d, v31.2d, #20 + eor x7, x7, x30 + sli v9.2d, v22.2d, #61 + eor x12, x12, x30 + eor v31.16b, v14.16b, v29.16b + eor x17, x17, x30 + eor v20.16b, v20.16b, v25.16b + eor x23, x23, x30 + ushr v22.2d, v31.2d, #25 + eor x4, x4, x27 + ushr v14.2d, v20.2d, #46 + eor x9, x9, x27 + sli v22.2d, v31.2d, #39 + eor x14, x14, x27 + sli v14.2d, v20.2d, #18 + eor x20, x20, x27 + eor v31.16b, v2.16b, v27.16b + eor x25, x25, x27 + # Swap Rotate Base + eor v12.16b, v12.16b, v27.16b + ror x0, x2, #63 + ushr v20.2d, v31.2d, #2 + ror x2, x7, #20 + ushr v2.2d, v12.2d, #21 + ror x7, x10, #44 + sli v20.2d, v31.2d, #62 + ror x10, x24, #3 + sli v2.2d, v12.2d, #43 + ror x24, x15, #25 + eor v31.16b, v13.16b, v28.16b + ror x15, x22, #46 + eor v19.16b, v19.16b, v29.16b + ror x22, x3, #2 + ushr v12.2d, v31.2d, #39 + ror x3, x13, #21 + ushr v13.2d, v19.2d, #56 + ror x13, x14, #39 + sli v12.2d, v31.2d, #25 + ror x14, x21, #56 + sli v13.2d, v19.2d, #8 + ror x21, x25, #8 + eor v31.16b, v23.16b, v28.16b + ror x25, x16, #23 + eor v15.16b, v15.16b, v25.16b + ror x16, x5, #37 + ushr v19.2d, v31.2d, #8 + ror x5, x26, #50 + ushr v23.2d, v15.2d, #23 + ror x26, x23, #62 + sli v19.2d, v31.2d, #56 + ror x23, x9, #9 + sli v23.2d, v15.2d, #41 + ror x9, x17, #19 + eor v31.16b, v4.16b, v29.16b + ror x17, x6, #28 + eor v24.16b, v24.16b, v29.16b + ror x6, x4, #36 + ushr v15.2d, v31.2d, #37 + ror x4, x20, #43 + ushr v4.2d, v24.2d, #50 + ror x20, x19, #49 + sli v15.2d, v31.2d, #27 + ror x19, x12, #54 + sli v4.2d, v24.2d, #14 + ror x12, x8, #58 + eor v31.16b, v21.16b, v26.16b + ror x8, x11, #61 + # Row Mix Base + eor v8.16b, v8.16b, v28.16b + bic x11, x3, x2 + ushr v24.2d, v31.2d, #62 + bic x27, x4, x3 + ushr v21.2d, v8.2d, #9 + bic x28, x1, x5 + sli v24.2d, v31.2d, #2 + bic x30, x2, x1 + sli v21.2d, v8.2d, #55 + eor x1, x1, x11 + eor v31.16b, v16.16b, v26.16b + eor x2, x2, x27 + eor v5.16b, v5.16b, v25.16b + bic x11, x5, x4 + ushr v8.2d, v31.2d, #19 + eor x4, x4, x28 + ushr v16.2d, v5.2d, #28 + eor x3, x3, x11 + sli v8.2d, v31.2d, #45 + eor x5, x5, x30 + sli v16.2d, v5.2d, #36 + bic x11, x8, x7 + eor v31.16b, v3.16b, v28.16b + bic x27, x9, x8 + eor v18.16b, v18.16b, v28.16b + bic x28, x6, x10 + ushr v5.2d, v31.2d, #36 + bic x30, x7, x6 + ushr v3.2d, v18.2d, #43 + eor x6, x6, x11 + sli v5.2d, v31.2d, #28 + eor x7, x7, x27 + sli v3.2d, v18.2d, #21 + bic x11, x10, x9 + eor v31.16b, v17.16b, v27.16b + eor x9, x9, x28 + eor v11.16b, v11.16b, v26.16b + eor x8, x8, x11 + ushr v18.2d, v31.2d, #49 + eor x10, x10, x30 + ushr v17.2d, v11.2d, #54 + bic x11, x13, x12 + sli v18.2d, v31.2d, #15 + bic x27, x14, x13 + sli v17.2d, v11.2d, #10 + bic x28, x0, x15 + eor v31.16b, v7.16b, v27.16b + bic x30, x12, x0 + eor v10.16b, v10.16b, v25.16b + eor x11, x0, x11 + ushr v11.2d, v31.2d, #58 + eor x12, x12, x27 + ushr v7.2d, v10.2d, #61 + bic x0, x15, x14 + sli v11.2d, v31.2d, #6 + eor x14, x14, x28 + sli v7.2d, v10.2d, #3 + eor x13, x13, x0 + # Row Mix NEON + bic v25.16b, v2.16b, v1.16b + eor x15, x15, x30 + bic v26.16b, v3.16b, v2.16b + bic x0, x19, x17 + bic v27.16b, v4.16b, v3.16b + bic x27, x20, x19 + bic v28.16b, v0.16b, v4.16b + bic x28, x16, x21 + bic v29.16b, v1.16b, v0.16b + bic x30, x17, x16 + eor v0.16b, v0.16b, 
v25.16b + eor x16, x16, x0 + eor v1.16b, v1.16b, v26.16b + eor x17, x17, x27 + eor v2.16b, v2.16b, v27.16b + bic x0, x21, x20 + eor v3.16b, v3.16b, v28.16b + eor x20, x20, x28 + eor v4.16b, v4.16b, v29.16b + eor x19, x19, x0 + bic v25.16b, v7.16b, v6.16b + eor x21, x21, x30 + bic v26.16b, v8.16b, v7.16b + bic x0, x24, x23 + bic v27.16b, v9.16b, v8.16b + bic x27, x25, x24 + bic v28.16b, v5.16b, v9.16b + bic x28, x22, x26 + bic v29.16b, v6.16b, v5.16b + bic x30, x23, x22 + eor v5.16b, v5.16b, v25.16b + eor x22, x22, x0 + eor v6.16b, v6.16b, v26.16b + eor x23, x23, x27 + eor v7.16b, v7.16b, v27.16b + bic x0, x26, x25 + eor v8.16b, v8.16b, v28.16b + eor x25, x25, x28 + eor v9.16b, v9.16b, v29.16b + eor x24, x24, x0 + bic v25.16b, v12.16b, v11.16b + eor x26, x26, x30 + bic v26.16b, v13.16b, v12.16b + bic v27.16b, v14.16b, v13.16b + bic v28.16b, v30.16b, v14.16b + bic v29.16b, v11.16b, v30.16b + eor v10.16b, v30.16b, v25.16b + eor v11.16b, v11.16b, v26.16b + eor v12.16b, v12.16b, v27.16b + eor v13.16b, v13.16b, v28.16b + eor v14.16b, v14.16b, v29.16b + bic v25.16b, v17.16b, v16.16b + bic v26.16b, v18.16b, v17.16b + bic v27.16b, v19.16b, v18.16b + bic v28.16b, v15.16b, v19.16b + bic v29.16b, v16.16b, v15.16b + eor v15.16b, v15.16b, v25.16b + eor v16.16b, v16.16b, v26.16b + eor v17.16b, v17.16b, v27.16b + eor v18.16b, v18.16b, v28.16b + eor v19.16b, v19.16b, v29.16b + bic v25.16b, v22.16b, v21.16b + bic v26.16b, v23.16b, v22.16b + bic v27.16b, v24.16b, v23.16b + bic v28.16b, v20.16b, v24.16b + bic v29.16b, v21.16b, v20.16b + eor v20.16b, v20.16b, v25.16b + eor v21.16b, v21.16b, v26.16b + eor v22.16b, v22.16b, v27.16b + eor v23.16b, v23.16b, v28.16b + eor v24.16b, v24.16b, v29.16b + # Done transforming + ldp x27, x28, [x29, #48] + ldr x0, [x27], #8 + subs x28, x28, #1 + mov v30.d[0], x0 + mov v30.d[1], x0 + eor x1, x1, x0 + eor v0.16b, v0.16b, v30.16b + bne L_SHA3_transform_blocksx3_neon_begin + ldr x0, [x29, #40] + st4 {v0.d, v1.d, v2.d, v3.d}[0], [x0], #32 + st4 {v4.d, v5.d, v6.d, v7.d}[0], [x0], #32 + st4 {v8.d, v9.d, v10.d, v11.d}[0], [x0], #32 + st4 {v12.d, v13.d, v14.d, v15.d}[0], [x0], #32 + st4 {v16.d, v17.d, v18.d, v19.d}[0], [x0], #32 + st4 {v20.d, v21.d, v22.d, v23.d}[0], [x0], #32 + st1 {v24.d}[0], [x0] + add x0, x0, #8 + st4 {v0.d, v1.d, v2.d, v3.d}[1], [x0], #32 + st4 {v4.d, v5.d, v6.d, v7.d}[1], [x0], #32 + st4 {v8.d, v9.d, v10.d, v11.d}[1], [x0], #32 + st4 {v12.d, v13.d, v14.d, v15.d}[1], [x0], #32 + st4 {v16.d, v17.d, v18.d, v19.d}[1], [x0], #32 + st4 {v20.d, v21.d, v22.d, v23.d}[1], [x0], #32 + st1 {v24.d}[1], [x0] + add x0, x0, #8 + stp x1, x2, [x0] + stp x3, x4, [x0, #16] + stp x5, x6, [x0, #32] + stp x7, x8, [x0, #48] + stp x9, x10, [x0, #64] + stp x11, x12, [x0, #80] + stp x13, x14, [x0, #96] + stp x15, x16, [x0, #112] + stp x17, x19, [x0, #128] + stp x20, x21, [x0, #144] + stp x22, x23, [x0, #160] + stp x24, x25, [x0, #176] + str x26, [x0, #192] + ldp x17, x19, [x29, #72] + ldp x20, x21, [x29, #88] + ldp x22, x23, [x29, #104] + ldp x24, x25, [x29, #120] + ldp x26, x27, [x29, #136] + ldr x28, [x29, #152] + ldp d8, d9, [x29, #160] + ldp d10, d11, [x29, #176] + ldp d12, d13, [x29, #192] + ldp d14, d15, [x29, #208] + ldp x29, x30, [sp], #0xe0 + ret +#ifndef __APPLE__ + .size kyber_sha3_blocksx3_neon,.-kyber_sha3_blocksx3_neon +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.text +.globl kyber_shake128_blocksx3_seed_neon +.type kyber_shake128_blocksx3_seed_neon,@function +.align 2 +kyber_shake128_blocksx3_seed_neon: +#else +.section __TEXT,__text +.globl 
_kyber_shake128_blocksx3_seed_neon +.p2align 2 +_kyber_shake128_blocksx3_seed_neon: +#endif /* __APPLE__ */ + stp x29, x30, [sp, #-224]! + add x29, sp, #0 + stp x17, x19, [x29, #72] + stp x20, x21, [x29, #88] + stp x22, x23, [x29, #104] + stp x24, x25, [x29, #120] + stp x26, x27, [x29, #136] + str x28, [x29, #152] + stp d8, d9, [x29, #160] + stp d10, d11, [x29, #176] + stp d12, d13, [x29, #192] + stp d14, d15, [x29, #208] +#ifndef __APPLE__ + adrp x28, L_sha3_aarch64_r + add x28, x28, :lo12:L_sha3_aarch64_r +#else + adrp x28, L_sha3_aarch64_r@PAGE + add x28, x28, :lo12:L_sha3_aarch64_r@PAGEOFF +#endif /* __APPLE__ */ + str x0, [x29, #40] + add x0, x0, #32 + ld1 {v4.d}[0], [x0] + ldp x2, x3, [x1], #16 + add x0, x0, #0xc8 + ld1 {v4.d}[1], [x0] + ldp x4, x5, [x1], #16 + ldr x6, [x0, #200] + eor v5.16b, v5.16b, v5.16b + eor x7, x7, x7 + eor v6.16b, v6.16b, v6.16b + eor x8, x8, x8 + eor v7.16b, v7.16b, v7.16b + eor x9, x9, x9 + eor v8.16b, v8.16b, v8.16b + eor x10, x10, x10 + eor v9.16b, v9.16b, v9.16b + eor x11, x11, x11 + eor v10.16b, v10.16b, v10.16b + eor x12, x12, x12 + eor v11.16b, v11.16b, v11.16b + eor x13, x13, x13 + eor v12.16b, v12.16b, v12.16b + eor x14, x14, x14 + eor v13.16b, v13.16b, v13.16b + eor x15, x15, x15 + eor v14.16b, v14.16b, v14.16b + eor x16, x16, x16 + eor v15.16b, v15.16b, v15.16b + eor x17, x17, x17 + eor v16.16b, v16.16b, v16.16b + eor x19, x19, x19 + eor v17.16b, v17.16b, v17.16b + eor x20, x20, x20 + eor v18.16b, v18.16b, v18.16b + eor x21, x21, x21 + eor v19.16b, v19.16b, v19.16b + eor x22, x22, x22 + movz x23, #0x8000, lsl 48 + eor v21.16b, v21.16b, v21.16b + eor x24, x24, x24 + eor v22.16b, v22.16b, v22.16b + eor x25, x25, x25 + eor v23.16b, v23.16b, v23.16b + eor x26, x26, x26 + eor v24.16b, v24.16b, v24.16b + eor x27, x27, x27 + dup v0.2d, x2 + dup v1.2d, x3 + dup v2.2d, x4 + dup v3.2d, x5 + dup v20.2d, x23 + mov x1, #24 + # Start of 24 rounds +L_SHA3_shake128_blocksx3_seed_neon_begin: + stp x28, x1, [x29, #48] + # Col Mix NEON + eor v30.16b, v4.16b, v9.16b + eor x0, x6, x11 + eor v27.16b, v1.16b, v6.16b + eor x30, x2, x7 + eor v30.16b, v30.16b, v14.16b + eor x28, x4, x9 + eor v27.16b, v27.16b, v11.16b + eor x0, x0, x16 + eor v30.16b, v30.16b, v19.16b + eor x30, x30, x12 + eor v27.16b, v27.16b, v16.16b + eor x28, x28, x14 + eor v30.16b, v30.16b, v24.16b + eor x0, x0, x22 + eor v27.16b, v27.16b, v21.16b + eor x30, x30, x17 + ushr v25.2d, v27.2d, #63 + eor x28, x28, x20 + sli v25.2d, v27.2d, #1 + eor x0, x0, x27 + eor v25.16b, v25.16b, v30.16b + eor x30, x30, x23 + eor v31.16b, v0.16b, v5.16b + eor x28, x28, x25 + eor v28.16b, v2.16b, v7.16b + str x0, [x29, #32] + eor v31.16b, v31.16b, v10.16b + str x28, [x29, #24] + eor v28.16b, v28.16b, v12.16b + eor x1, x3, x8 + eor v31.16b, v31.16b, v15.16b + eor x28, x5, x10 + eor v28.16b, v28.16b, v17.16b + eor x1, x1, x13 + eor v31.16b, v31.16b, v20.16b + eor x28, x28, x15 + eor v28.16b, v28.16b, v22.16b + eor x1, x1, x19 + ushr v29.2d, v30.2d, #63 + eor x28, x28, x21 + ushr v26.2d, v28.2d, #63 + eor x1, x1, x24 + sli v29.2d, v30.2d, #1 + eor x28, x28, x26 + sli v26.2d, v28.2d, #1 + eor x0, x0, x1, ror 63 + eor v28.16b, v28.16b, v29.16b + eor x1, x1, x28, ror 63 + eor v29.16b, v3.16b, v8.16b + eor x2, x2, x0 + eor v26.16b, v26.16b, v31.16b + eor x7, x7, x0 + eor v29.16b, v29.16b, v13.16b + eor x12, x12, x0 + eor v29.16b, v29.16b, v18.16b + eor x17, x17, x0 + eor v29.16b, v29.16b, v23.16b + eor x23, x23, x0 + ushr v30.2d, v29.2d, #63 + eor x4, x4, x1 + sli v30.2d, v29.2d, #1 + eor x9, x9, x1 + eor v27.16b, v27.16b, 
v30.16b + eor x14, x14, x1 + ushr v30.2d, v31.2d, #63 + eor x20, x20, x1 + sli v30.2d, v31.2d, #1 + eor x25, x25, x1 + eor v29.16b, v29.16b, v30.16b + ldr x0, [x29, #32] + # Swap Rotate NEON + eor v0.16b, v0.16b, v25.16b + eor v31.16b, v1.16b, v26.16b + ldr x1, [x29, #24] + eor v6.16b, v6.16b, v26.16b + eor x28, x28, x30, ror 63 + ushr v30.2d, v31.2d, #63 + eor x30, x30, x1, ror 63 + ushr v1.2d, v6.2d, #20 + eor x1, x1, x0, ror 63 + sli v30.2d, v31.2d, #1 + eor x6, x6, x28 + sli v1.2d, v6.2d, #44 + eor x11, x11, x28 + eor v31.16b, v9.16b, v29.16b + eor x16, x16, x28 + eor v22.16b, v22.16b, v27.16b + eor x22, x22, x28 + ushr v6.2d, v31.2d, #44 + eor x27, x27, x28 + ushr v9.2d, v22.2d, #3 + eor x3, x3, x30 + sli v6.2d, v31.2d, #20 + eor x8, x8, x30 + sli v9.2d, v22.2d, #61 + eor x13, x13, x30 + eor v31.16b, v14.16b, v29.16b + eor x19, x19, x30 + eor v20.16b, v20.16b, v25.16b + eor x24, x24, x30 + ushr v22.2d, v31.2d, #25 + eor x5, x5, x1 + ushr v14.2d, v20.2d, #46 + eor x10, x10, x1 + sli v22.2d, v31.2d, #39 + eor x15, x15, x1 + sli v14.2d, v20.2d, #18 + eor x21, x21, x1 + eor v31.16b, v2.16b, v27.16b + eor x26, x26, x1 + # Swap Rotate Base + eor v12.16b, v12.16b, v27.16b + ror x0, x3, #63 + ushr v20.2d, v31.2d, #2 + ror x3, x8, #20 + ushr v2.2d, v12.2d, #21 + ror x8, x11, #44 + sli v20.2d, v31.2d, #62 + ror x11, x25, #3 + sli v2.2d, v12.2d, #43 + ror x25, x16, #25 + eor v31.16b, v13.16b, v28.16b + ror x16, x23, #46 + eor v19.16b, v19.16b, v29.16b + ror x23, x4, #2 + ushr v12.2d, v31.2d, #39 + ror x4, x14, #21 + ushr v13.2d, v19.2d, #56 + ror x14, x15, #39 + sli v12.2d, v31.2d, #25 + ror x15, x22, #56 + sli v13.2d, v19.2d, #8 + ror x22, x26, #8 + eor v31.16b, v23.16b, v28.16b + ror x26, x17, #23 + eor v15.16b, v15.16b, v25.16b + ror x17, x6, #37 + ushr v19.2d, v31.2d, #8 + ror x6, x27, #50 + ushr v23.2d, v15.2d, #23 + ror x27, x24, #62 + sli v19.2d, v31.2d, #56 + ror x24, x10, #9 + sli v23.2d, v15.2d, #41 + ror x10, x19, #19 + eor v31.16b, v4.16b, v29.16b + ror x19, x7, #28 + eor v24.16b, v24.16b, v29.16b + ror x7, x5, #36 + ushr v15.2d, v31.2d, #37 + ror x5, x21, #43 + ushr v4.2d, v24.2d, #50 + ror x21, x20, #49 + sli v15.2d, v31.2d, #27 + ror x20, x13, #54 + sli v4.2d, v24.2d, #14 + ror x13, x9, #58 + eor v31.16b, v21.16b, v26.16b + ror x9, x12, #61 + # Row Mix Base + eor v8.16b, v8.16b, v28.16b + bic x12, x4, x3 + ushr v24.2d, v31.2d, #62 + bic x1, x5, x4 + ushr v21.2d, v8.2d, #9 + bic x28, x2, x6 + sli v24.2d, v31.2d, #2 + bic x30, x3, x2 + sli v21.2d, v8.2d, #55 + eor x2, x2, x12 + eor v31.16b, v16.16b, v26.16b + eor x3, x3, x1 + eor v5.16b, v5.16b, v25.16b + bic x12, x6, x5 + ushr v8.2d, v31.2d, #19 + eor x5, x5, x28 + ushr v16.2d, v5.2d, #28 + eor x4, x4, x12 + sli v8.2d, v31.2d, #45 + eor x6, x6, x30 + sli v16.2d, v5.2d, #36 + bic x12, x9, x8 + eor v31.16b, v3.16b, v28.16b + bic x1, x10, x9 + eor v18.16b, v18.16b, v28.16b + bic x28, x7, x11 + ushr v5.2d, v31.2d, #36 + bic x30, x8, x7 + ushr v3.2d, v18.2d, #43 + eor x7, x7, x12 + sli v5.2d, v31.2d, #28 + eor x8, x8, x1 + sli v3.2d, v18.2d, #21 + bic x12, x11, x10 + eor v31.16b, v17.16b, v27.16b + eor x10, x10, x28 + eor v11.16b, v11.16b, v26.16b + eor x9, x9, x12 + ushr v18.2d, v31.2d, #49 + eor x11, x11, x30 + ushr v17.2d, v11.2d, #54 + bic x12, x14, x13 + sli v18.2d, v31.2d, #15 + bic x1, x15, x14 + sli v17.2d, v11.2d, #10 + bic x28, x0, x16 + eor v31.16b, v7.16b, v27.16b + bic x30, x13, x0 + eor v10.16b, v10.16b, v25.16b + eor x12, x0, x12 + ushr v11.2d, v31.2d, #58 + eor x13, x13, x1 + ushr v7.2d, v10.2d, #61 + bic x0, x16, x15 + 
sli v11.2d, v31.2d, #6
+ eor x15, x15, x28
+ sli v7.2d, v10.2d, #3
+ eor x14, x14, x0
+ # Row Mix NEON
+ bic v25.16b, v2.16b, v1.16b
+ eor x16, x16, x30
+ bic v26.16b, v3.16b, v2.16b
+ bic x0, x20, x19
+ bic v27.16b, v4.16b, v3.16b
+ bic x1, x21, x20
+ bic v28.16b, v0.16b, v4.16b
+ bic x28, x17, x22
+ bic v29.16b, v1.16b, v0.16b
+ bic x30, x19, x17
+ eor v0.16b, v0.16b, v25.16b
+ eor x17, x17, x0
+ eor v1.16b, v1.16b, v26.16b
+ eor x19, x19, x1
+ eor v2.16b, v2.16b, v27.16b
+ bic x0, x22, x21
+ eor v3.16b, v3.16b, v28.16b
+ eor x21, x21, x28
+ eor v4.16b, v4.16b, v29.16b
+ eor x20, x20, x0
+ bic v25.16b, v7.16b, v6.16b
+ eor x22, x22, x30
+ bic v26.16b, v8.16b, v7.16b
+ bic x0, x25, x24
+ bic v27.16b, v9.16b, v8.16b
+ bic x1, x26, x25
+ bic v28.16b, v5.16b, v9.16b
+ bic x28, x23, x27
+ bic v29.16b, v6.16b, v5.16b
+ bic x30, x24, x23
+ eor v5.16b, v5.16b, v25.16b
+ eor x23, x23, x0
+ eor v6.16b, v6.16b, v26.16b
+ eor x24, x24, x1
+ eor v7.16b, v7.16b, v27.16b
+ bic x0, x27, x26
+ eor v8.16b, v8.16b, v28.16b
+ eor x26, x26, x28
+ eor v9.16b, v9.16b, v29.16b
+ eor x25, x25, x0
+ bic v25.16b, v12.16b, v11.16b
+ eor x27, x27, x30
+ bic v26.16b, v13.16b, v12.16b
+ bic v27.16b, v14.16b, v13.16b
+ bic v28.16b, v30.16b, v14.16b
+ bic v29.16b, v11.16b, v30.16b
+ eor v10.16b, v30.16b, v25.16b
+ eor v11.16b, v11.16b, v26.16b
+ eor v12.16b, v12.16b, v27.16b
+ eor v13.16b, v13.16b, v28.16b
+ eor v14.16b, v14.16b, v29.16b
+ bic v25.16b, v17.16b, v16.16b
+ bic v26.16b, v18.16b, v17.16b
+ bic v27.16b, v19.16b, v18.16b
+ bic v28.16b, v15.16b, v19.16b
+ bic v29.16b, v16.16b, v15.16b
+ eor v15.16b, v15.16b, v25.16b
+ eor v16.16b, v16.16b, v26.16b
+ eor v17.16b, v17.16b, v27.16b
+ eor v18.16b, v18.16b, v28.16b
+ eor v19.16b, v19.16b, v29.16b
+ bic v25.16b, v22.16b, v21.16b
+ bic v26.16b, v23.16b, v22.16b
+ bic v27.16b, v24.16b, v23.16b
+ bic v28.16b, v20.16b, v24.16b
+ bic v29.16b, v21.16b, v20.16b
+ eor v20.16b, v20.16b, v25.16b
+ eor v21.16b, v21.16b, v26.16b
+ eor v22.16b, v22.16b, v27.16b
+ eor v23.16b, v23.16b, v28.16b
+ eor v24.16b, v24.16b, v29.16b
+ # Done transforming
+ ldp x28, x1, [x29, #48]
+ ldr x0, [x28], #8
+ subs x1, x1, #1
+ mov v30.d[0], x0
+ mov v30.d[1], x0
+ eor x2, x2, x0
+ eor v0.16b, v0.16b, v30.16b
+ bne L_SHA3_shake128_blocksx3_seed_neon_begin
+ ldr x0, [x29, #40]
+ st4 {v0.d, v1.d, v2.d, v3.d}[0], [x0], #32
+ st4 {v4.d, v5.d, v6.d, v7.d}[0], [x0], #32
+ st4 {v8.d, v9.d, v10.d, v11.d}[0], [x0], #32
+ st4 {v12.d, v13.d, v14.d, v15.d}[0], [x0], #32
+ st4 {v16.d, v17.d, v18.d, v19.d}[0], [x0], #32
+ st4 {v20.d, v21.d, v22.d, v23.d}[0], [x0], #32
+ st1 {v24.d}[0], [x0]
+ add x0, x0, #8
+ st4 {v0.d, v1.d, v2.d, v3.d}[1], [x0], #32
+ st4 {v4.d, v5.d, v6.d, v7.d}[1], [x0], #32
+ st4 {v8.d, v9.d, v10.d, v11.d}[1], [x0], #32
+ st4 {v12.d, v13.d, v14.d, v15.d}[1], [x0], #32
+ st4 {v16.d, v17.d, v18.d, v19.d}[1], [x0], #32
+ st4 {v20.d, v21.d, v22.d, v23.d}[1], [x0], #32
+ st1 {v24.d}[1], [x0]
+ add x0, x0, #8
+ stp x2, x3, [x0]
+ stp x4, x5, [x0, #16]
+ stp x6, x7, [x0, #32]
+ stp x8, x9, [x0, #48]
+ stp x10, x11, [x0, #64]
+ stp x12, x13, [x0, #80]
+ stp x14, x15, [x0, #96]
+ stp x16, x17, [x0, #112]
+ stp x19, x20, [x0, #128]
+ stp x21, x22, [x0, #144]
+ stp x23, x24, [x0, #160]
+ stp x25, x26, [x0, #176]
+ str x27, [x0, #192]
+ ldp x17, x19, [x29, #72]
+ ldp x20, x21, [x29, #88]
+ ldp x22, x23, [x29, #104]
+ ldp x24, x25, [x29, #120]
+ ldp x26, x27, [x29, #136]
+ ldr x28, [x29, #152]
+ ldp d8, d9, [x29, #160]
+ ldp d10, d11, [x29, #176]
+ ldp d12, d13, [x29, #192]
+ ldp d14, d15, [x29, #208]
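+ # Callee-saved registers restored; free the stack frame and return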
+ ldp x29, x30, [sp], #0xe0 + ret +#ifndef __APPLE__ + .size kyber_shake128_blocksx3_seed_neon,.-kyber_shake128_blocksx3_seed_neon +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.text +.globl kyber_shake256_blocksx3_seed_neon +.type kyber_shake256_blocksx3_seed_neon,@function +.align 2 +kyber_shake256_blocksx3_seed_neon: +#else +.section __TEXT,__text +.globl _kyber_shake256_blocksx3_seed_neon +.p2align 2 +_kyber_shake256_blocksx3_seed_neon: +#endif /* __APPLE__ */ + stp x29, x30, [sp, #-224]! + add x29, sp, #0 + stp x17, x19, [x29, #72] + stp x20, x21, [x29, #88] + stp x22, x23, [x29, #104] + stp x24, x25, [x29, #120] + stp x26, x27, [x29, #136] + str x28, [x29, #152] + stp d8, d9, [x29, #160] + stp d10, d11, [x29, #176] + stp d12, d13, [x29, #192] + stp d14, d15, [x29, #208] +#ifndef __APPLE__ + adrp x28, L_sha3_aarch64_r + add x28, x28, :lo12:L_sha3_aarch64_r +#else + adrp x28, L_sha3_aarch64_r@PAGE + add x28, x28, :lo12:L_sha3_aarch64_r@PAGEOFF +#endif /* __APPLE__ */ + str x0, [x29, #40] + add x0, x0, #32 + ld1 {v4.d}[0], [x0] + ldp x2, x3, [x1], #16 + add x0, x0, #0xc8 + ld1 {v4.d}[1], [x0] + ldp x4, x5, [x1], #16 + ldr x6, [x0, #200] + eor v5.16b, v5.16b, v5.16b + eor x7, x7, x7 + eor v6.16b, v6.16b, v6.16b + eor x8, x8, x8 + eor v7.16b, v7.16b, v7.16b + eor x9, x9, x9 + eor v8.16b, v8.16b, v8.16b + eor x10, x10, x10 + eor v9.16b, v9.16b, v9.16b + eor x11, x11, x11 + eor v10.16b, v10.16b, v10.16b + eor x12, x12, x12 + eor v11.16b, v11.16b, v11.16b + eor x13, x13, x13 + eor v12.16b, v12.16b, v12.16b + eor x14, x14, x14 + eor v13.16b, v13.16b, v13.16b + eor x15, x15, x15 + eor v14.16b, v14.16b, v14.16b + eor x16, x16, x16 + eor v15.16b, v15.16b, v15.16b + eor x17, x17, x17 + movz x19, #0x8000, lsl 48 + eor v17.16b, v17.16b, v17.16b + eor x20, x20, x20 + eor v18.16b, v18.16b, v18.16b + eor x21, x21, x21 + eor v19.16b, v19.16b, v19.16b + eor x22, x22, x22 + eor v20.16b, v20.16b, v20.16b + eor x23, x23, x23 + eor v21.16b, v21.16b, v21.16b + eor x24, x24, x24 + eor v22.16b, v22.16b, v22.16b + eor x25, x25, x25 + eor v23.16b, v23.16b, v23.16b + eor x26, x26, x26 + eor v24.16b, v24.16b, v24.16b + eor x27, x27, x27 + dup v0.2d, x2 + dup v1.2d, x3 + dup v2.2d, x4 + dup v3.2d, x5 + dup v16.2d, x19 + mov x1, #24 + # Start of 24 rounds +L_SHA3_shake256_blocksx3_seed_neon_begin: + stp x28, x1, [x29, #48] + # Col Mix NEON + eor v30.16b, v4.16b, v9.16b + eor x0, x6, x11 + eor v27.16b, v1.16b, v6.16b + eor x30, x2, x7 + eor v30.16b, v30.16b, v14.16b + eor x28, x4, x9 + eor v27.16b, v27.16b, v11.16b + eor x0, x0, x16 + eor v30.16b, v30.16b, v19.16b + eor x30, x30, x12 + eor v27.16b, v27.16b, v16.16b + eor x28, x28, x14 + eor v30.16b, v30.16b, v24.16b + eor x0, x0, x22 + eor v27.16b, v27.16b, v21.16b + eor x30, x30, x17 + ushr v25.2d, v27.2d, #63 + eor x28, x28, x20 + sli v25.2d, v27.2d, #1 + eor x0, x0, x27 + eor v25.16b, v25.16b, v30.16b + eor x30, x30, x23 + eor v31.16b, v0.16b, v5.16b + eor x28, x28, x25 + eor v28.16b, v2.16b, v7.16b + str x0, [x29, #32] + eor v31.16b, v31.16b, v10.16b + str x28, [x29, #24] + eor v28.16b, v28.16b, v12.16b + eor x1, x3, x8 + eor v31.16b, v31.16b, v15.16b + eor x28, x5, x10 + eor v28.16b, v28.16b, v17.16b + eor x1, x1, x13 + eor v31.16b, v31.16b, v20.16b + eor x28, x28, x15 + eor v28.16b, v28.16b, v22.16b + eor x1, x1, x19 + ushr v29.2d, v30.2d, #63 + eor x28, x28, x21 + ushr v26.2d, v28.2d, #63 + eor x1, x1, x24 + sli v29.2d, v30.2d, #1 + eor x28, x28, x26 + sli v26.2d, v28.2d, #1 + eor x0, x0, x1, ror 63 + eor v28.16b, v28.16b, v29.16b + eor x1, x1, x28, ror 63 + eor 
v29.16b, v3.16b, v8.16b + eor x2, x2, x0 + eor v26.16b, v26.16b, v31.16b + eor x7, x7, x0 + eor v29.16b, v29.16b, v13.16b + eor x12, x12, x0 + eor v29.16b, v29.16b, v18.16b + eor x17, x17, x0 + eor v29.16b, v29.16b, v23.16b + eor x23, x23, x0 + ushr v30.2d, v29.2d, #63 + eor x4, x4, x1 + sli v30.2d, v29.2d, #1 + eor x9, x9, x1 + eor v27.16b, v27.16b, v30.16b + eor x14, x14, x1 + ushr v30.2d, v31.2d, #63 + eor x20, x20, x1 + sli v30.2d, v31.2d, #1 + eor x25, x25, x1 + eor v29.16b, v29.16b, v30.16b + ldr x0, [x29, #32] + # Swap Rotate NEON + eor v0.16b, v0.16b, v25.16b + eor v31.16b, v1.16b, v26.16b + ldr x1, [x29, #24] + eor v6.16b, v6.16b, v26.16b + eor x28, x28, x30, ror 63 + ushr v30.2d, v31.2d, #63 + eor x30, x30, x1, ror 63 + ushr v1.2d, v6.2d, #20 + eor x1, x1, x0, ror 63 + sli v30.2d, v31.2d, #1 + eor x6, x6, x28 + sli v1.2d, v6.2d, #44 + eor x11, x11, x28 + eor v31.16b, v9.16b, v29.16b + eor x16, x16, x28 + eor v22.16b, v22.16b, v27.16b + eor x22, x22, x28 + ushr v6.2d, v31.2d, #44 + eor x27, x27, x28 + ushr v9.2d, v22.2d, #3 + eor x3, x3, x30 + sli v6.2d, v31.2d, #20 + eor x8, x8, x30 + sli v9.2d, v22.2d, #61 + eor x13, x13, x30 + eor v31.16b, v14.16b, v29.16b + eor x19, x19, x30 + eor v20.16b, v20.16b, v25.16b + eor x24, x24, x30 + ushr v22.2d, v31.2d, #25 + eor x5, x5, x1 + ushr v14.2d, v20.2d, #46 + eor x10, x10, x1 + sli v22.2d, v31.2d, #39 + eor x15, x15, x1 + sli v14.2d, v20.2d, #18 + eor x21, x21, x1 + eor v31.16b, v2.16b, v27.16b + eor x26, x26, x1 + # Swap Rotate Base + eor v12.16b, v12.16b, v27.16b + ror x0, x3, #63 + ushr v20.2d, v31.2d, #2 + ror x3, x8, #20 + ushr v2.2d, v12.2d, #21 + ror x8, x11, #44 + sli v20.2d, v31.2d, #62 + ror x11, x25, #3 + sli v2.2d, v12.2d, #43 + ror x25, x16, #25 + eor v31.16b, v13.16b, v28.16b + ror x16, x23, #46 + eor v19.16b, v19.16b, v29.16b + ror x23, x4, #2 + ushr v12.2d, v31.2d, #39 + ror x4, x14, #21 + ushr v13.2d, v19.2d, #56 + ror x14, x15, #39 + sli v12.2d, v31.2d, #25 + ror x15, x22, #56 + sli v13.2d, v19.2d, #8 + ror x22, x26, #8 + eor v31.16b, v23.16b, v28.16b + ror x26, x17, #23 + eor v15.16b, v15.16b, v25.16b + ror x17, x6, #37 + ushr v19.2d, v31.2d, #8 + ror x6, x27, #50 + ushr v23.2d, v15.2d, #23 + ror x27, x24, #62 + sli v19.2d, v31.2d, #56 + ror x24, x10, #9 + sli v23.2d, v15.2d, #41 + ror x10, x19, #19 + eor v31.16b, v4.16b, v29.16b + ror x19, x7, #28 + eor v24.16b, v24.16b, v29.16b + ror x7, x5, #36 + ushr v15.2d, v31.2d, #37 + ror x5, x21, #43 + ushr v4.2d, v24.2d, #50 + ror x21, x20, #49 + sli v15.2d, v31.2d, #27 + ror x20, x13, #54 + sli v4.2d, v24.2d, #14 + ror x13, x9, #58 + eor v31.16b, v21.16b, v26.16b + ror x9, x12, #61 + # Row Mix Base + eor v8.16b, v8.16b, v28.16b + bic x12, x4, x3 + ushr v24.2d, v31.2d, #62 + bic x1, x5, x4 + ushr v21.2d, v8.2d, #9 + bic x28, x2, x6 + sli v24.2d, v31.2d, #2 + bic x30, x3, x2 + sli v21.2d, v8.2d, #55 + eor x2, x2, x12 + eor v31.16b, v16.16b, v26.16b + eor x3, x3, x1 + eor v5.16b, v5.16b, v25.16b + bic x12, x6, x5 + ushr v8.2d, v31.2d, #19 + eor x5, x5, x28 + ushr v16.2d, v5.2d, #28 + eor x4, x4, x12 + sli v8.2d, v31.2d, #45 + eor x6, x6, x30 + sli v16.2d, v5.2d, #36 + bic x12, x9, x8 + eor v31.16b, v3.16b, v28.16b + bic x1, x10, x9 + eor v18.16b, v18.16b, v28.16b + bic x28, x7, x11 + ushr v5.2d, v31.2d, #36 + bic x30, x8, x7 + ushr v3.2d, v18.2d, #43 + eor x7, x7, x12 + sli v5.2d, v31.2d, #28 + eor x8, x8, x1 + sli v3.2d, v18.2d, #21 + bic x12, x11, x10 + eor v31.16b, v17.16b, v27.16b + eor x10, x10, x28 + eor v11.16b, v11.16b, v26.16b + eor x9, x9, x12 + ushr v18.2d, v31.2d, 
#49
+ eor x11, x11, x30
+ ushr v17.2d, v11.2d, #54
+ bic x12, x14, x13
+ sli v18.2d, v31.2d, #15
+ bic x1, x15, x14
+ sli v17.2d, v11.2d, #10
+ bic x28, x0, x16
+ eor v31.16b, v7.16b, v27.16b
+ bic x30, x13, x0
+ eor v10.16b, v10.16b, v25.16b
+ eor x12, x0, x12
+ ushr v11.2d, v31.2d, #58
+ eor x13, x13, x1
+ ushr v7.2d, v10.2d, #61
+ bic x0, x16, x15
+ sli v11.2d, v31.2d, #6
+ eor x15, x15, x28
+ sli v7.2d, v10.2d, #3
+ eor x14, x14, x0
+ # Row Mix NEON
+ bic v25.16b, v2.16b, v1.16b
+ eor x16, x16, x30
+ bic v26.16b, v3.16b, v2.16b
+ bic x0, x20, x19
+ bic v27.16b, v4.16b, v3.16b
+ bic x1, x21, x20
+ bic v28.16b, v0.16b, v4.16b
+ bic x28, x17, x22
+ bic v29.16b, v1.16b, v0.16b
+ bic x30, x19, x17
+ eor v0.16b, v0.16b, v25.16b
+ eor x17, x17, x0
+ eor v1.16b, v1.16b, v26.16b
+ eor x19, x19, x1
+ eor v2.16b, v2.16b, v27.16b
+ bic x0, x22, x21
+ eor v3.16b, v3.16b, v28.16b
+ eor x21, x21, x28
+ eor v4.16b, v4.16b, v29.16b
+ eor x20, x20, x0
+ bic v25.16b, v7.16b, v6.16b
+ eor x22, x22, x30
+ bic v26.16b, v8.16b, v7.16b
+ bic x0, x25, x24
+ bic v27.16b, v9.16b, v8.16b
+ bic x1, x26, x25
+ bic v28.16b, v5.16b, v9.16b
+ bic x28, x23, x27
+ bic v29.16b, v6.16b, v5.16b
+ bic x30, x24, x23
+ eor v5.16b, v5.16b, v25.16b
+ eor x23, x23, x0
+ eor v6.16b, v6.16b, v26.16b
+ eor x24, x24, x1
+ eor v7.16b, v7.16b, v27.16b
+ bic x0, x27, x26
+ eor v8.16b, v8.16b, v28.16b
+ eor x26, x26, x28
+ eor v9.16b, v9.16b, v29.16b
+ eor x25, x25, x0
+ bic v25.16b, v12.16b, v11.16b
+ eor x27, x27, x30
+ bic v26.16b, v13.16b, v12.16b
+ bic v27.16b, v14.16b, v13.16b
+ bic v28.16b, v30.16b, v14.16b
+ bic v29.16b, v11.16b, v30.16b
+ eor v10.16b, v30.16b, v25.16b
+ eor v11.16b, v11.16b, v26.16b
+ eor v12.16b, v12.16b, v27.16b
+ eor v13.16b, v13.16b, v28.16b
+ eor v14.16b, v14.16b, v29.16b
+ bic v25.16b, v17.16b, v16.16b
+ bic v26.16b, v18.16b, v17.16b
+ bic v27.16b, v19.16b, v18.16b
+ bic v28.16b, v15.16b, v19.16b
+ bic v29.16b, v16.16b, v15.16b
+ eor v15.16b, v15.16b, v25.16b
+ eor v16.16b, v16.16b, v26.16b
+ eor v17.16b, v17.16b, v27.16b
+ eor v18.16b, v18.16b, v28.16b
+ eor v19.16b, v19.16b, v29.16b
+ bic v25.16b, v22.16b, v21.16b
+ bic v26.16b, v23.16b, v22.16b
+ bic v27.16b, v24.16b, v23.16b
+ bic v28.16b, v20.16b, v24.16b
+ bic v29.16b, v21.16b, v20.16b
+ eor v20.16b, v20.16b, v25.16b
+ eor v21.16b, v21.16b, v26.16b
+ eor v22.16b, v22.16b, v27.16b
+ eor v23.16b, v23.16b, v28.16b
+ eor v24.16b, v24.16b, v29.16b
+ # Done transforming
+ ldp x28, x1, [x29, #48]
+ ldr x0, [x28], #8
+ subs x1, x1, #1
+ mov v30.d[0], x0
+ mov v30.d[1], x0
+ eor x2, x2, x0
+ eor v0.16b, v0.16b, v30.16b
+ bne L_SHA3_shake256_blocksx3_seed_neon_begin
+ ldr x0, [x29, #40]
+ st4 {v0.d, v1.d, v2.d, v3.d}[0], [x0], #32
+ st4 {v4.d, v5.d, v6.d, v7.d}[0], [x0], #32
+ st4 {v8.d, v9.d, v10.d, v11.d}[0], [x0], #32
+ st4 {v12.d, v13.d, v14.d, v15.d}[0], [x0], #32
+ st4 {v16.d, v17.d, v18.d, v19.d}[0], [x0], #32
+ st4 {v20.d, v21.d, v22.d, v23.d}[0], [x0], #32
+ st1 {v24.d}[0], [x0]
+ add x0, x0, #8
+ st4 {v0.d, v1.d, v2.d, v3.d}[1], [x0], #32
+ st4 {v4.d, v5.d, v6.d, v7.d}[1], [x0], #32
+ st4 {v8.d, v9.d, v10.d, v11.d}[1], [x0], #32
+ st4 {v12.d, v13.d, v14.d, v15.d}[1], [x0], #32
+ st4 {v16.d, v17.d, v18.d, v19.d}[1], [x0], #32
+ st4 {v20.d, v21.d, v22.d, v23.d}[1], [x0], #32
+ st1 {v24.d}[1], [x0]
+ add x0, x0, #8
+ stp x2, x3, [x0]
+ stp x4, x5, [x0, #16]
+ stp x6, x7, [x0, #32]
+ stp x8, x9, [x0, #48]
+ stp x10, x11, [x0, #64]
+ stp x12, x13, [x0, #80]
+ stp x14, x15, [x0, #96]
+ stp x16, x17, [x0, #112]
+ stp x19, x20, [x0, #128]
+ stp x21, x22, [x0, #144]
+ stp x23, x24, [x0, #160]
+ stp x25, x26, [x0, #176]
+ str x27, [x0, #192]
+ ldp x17, x19, [x29, #72]
+ ldp x20, x21, [x29, #88]
+ ldp x22, x23, [x29, #104]
+ ldp x24, x25, [x29, #120]
+ ldp x26, x27, [x29, #136]
+ ldr x28, [x29, #152]
+ ldp d8, d9, [x29, #160]
+ ldp d10, d11, [x29, #176]
+ ldp d12, d13, [x29, #192]
+ ldp d14, d15, [x29, #208]
+ ldp x29, x30, [sp], #0xe0
+ ret
+#ifndef __APPLE__
+ .size kyber_shake256_blocksx3_seed_neon,.-kyber_shake256_blocksx3_seed_neon
+#endif /* __APPLE__ */
+#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */
+#endif /* WOLFSSL_WC_KYBER */
+#endif /* __aarch64__ */
+#endif /* WOLFSSL_ARMASM */
+
+#if defined(__linux__) && defined(__ELF__)
+.section .note.GNU-stack,"",%progbits
+#endif
+#endif /* !WOLFSSL_ARMASM_INLINE */
diff --git a/wolfcrypt/src/port/arm/armv8-kyber-asm_c.c b/wolfcrypt/src/port/arm/armv8-kyber-asm_c.c
new file mode 100644
index 0000000000..09f123b4c7
--- /dev/null
+++ b/wolfcrypt/src/port/arm/armv8-kyber-asm_c.c
@@ -0,0 +1,14303 @@
+/* armv8-kyber-asm
+ *
+ * Copyright (C) 2006-2024 wolfSSL Inc.
+ *
+ * This file is part of wolfSSL.
+ *
+ * wolfSSL is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * wolfSSL is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
+ */
+
+#ifdef HAVE_CONFIG_H
+ #include <config.h>
+#endif /* HAVE_CONFIG_H */
+#include <wolfssl/wolfcrypt/settings.h>
+#include <wolfssl/wolfcrypt/error-crypt.h>
+
+/* Generated using (from wolfssl):
+ * cd ../scripts
+ * ruby ./kyber/kyber.rb arm64 ../wolfssl/wolfcrypt/src/port/arm/armv8-kyber-asm.c
+ */
+#ifdef WOLFSSL_ARMASM
+#ifdef __aarch64__
+#ifdef WOLFSSL_ARMASM_INLINE
+static const uint16_t L_kyber_aarch64_q[] = {
+ 0xd01,
+ 0xd01,
+ 0xd01,
+ 0xd01,
+ 0xd01,
+ 0xd01,
+ 0xd01,
+ 0xd01,
+};
+
+static const uint16_t L_kyber_aarch64_consts[] = {
+ 0xd01,
+ 0xf301,
+ 0x4ebf,
+ 0x549,
+ 0x5049,
+ 0x0,
+ 0x0,
+ 0x0,
+};
+
+static const uint64_t L_sha3_aarch64_r[] = {
+ 0x1UL,
+ 0x8082UL,
+ 0x800000000000808aUL,
+ 0x8000000080008000UL,
+ 0x808bUL,
+ 0x80000001UL,
+ 0x8000000080008081UL,
+ 0x8000000000008009UL,
+ 0x8aUL,
+ 0x88UL,
+ 0x80008009UL,
+ 0x8000000aUL,
+ 0x8000808bUL,
+ 0x800000000000008bUL,
+ 0x8000000000008089UL,
+ 0x8000000000008003UL,
+ 0x8000000000008002UL,
+ 0x8000000000000080UL,
+ 0x800aUL,
+ 0x800000008000000aUL,
+ 0x8000000080008081UL,
+ 0x8000000000008080UL,
+ 0x80000001UL,
+ 0x8000000080008008UL,
+};
+
+#include <wolfssl/wolfcrypt/wc_kyber.h>
+
+#ifdef WOLFSSL_WC_KYBER
+static const uint16_t L_kyber_aarch64_zetas[] = {
+ 0x8ed,
+ 0xa0b,
+ 0xb9a,
+ 0x714,
+ 0x5d5,
+ 0x58e,
+ 0x11f,
+ 0xca,
+ 0xc56,
+ 0x26e,
+ 0x629,
+ 0xb6,
+ 0x3c2,
+ 0x84f,
+ 0x73f,
+ 0x5bc,
+ 0x23d,
+ 0x7d4,
+ 0x108,
+ 0x17f,
+ 0x9c4,
+ 0x5b2,
+ 0x6bf,
+ 0xc7f,
+ 0xa58,
+ 0x3f9,
+ 0x2dc,
+ 0x260,
+ 0x6fb,
+ 0x19b,
+ 0xc34,
+ 0x6de,
+ 0x4c7,
+ 0x4c7,
+ 0x4c7,
+ 0x4c7,
+ 0x28c,
+ 0x28c,
+ 0x28c,
+ 0x28c,
+ 0xad9,
+ 0xad9,
+ 0xad9,
+ 0xad9,
+ 0x3f7,
+ 0x3f7,
+ 0x3f7,
+ 0x3f7,
+ 0x7f4,
+ 0x7f4,
+ 0x7f4,
+ 0x7f4,
+ 0x5d3,
+ 0x5d3,
+ 0x5d3,
+ 0x5d3,
+ 0xbe7,
+ 0xbe7,
+ 0xbe7,
+ 0xbe7,
+ 0x6f9,
+ 0x6f9,
+ 0x6f9,
0x6f9, + 0x204, + 0x204, + 0x204, + 0x204, + 0xcf9, + 0xcf9, + 0xcf9, + 0xcf9, + 0xbc1, + 0xbc1, + 0xbc1, + 0xbc1, + 0xa67, + 0xa67, + 0xa67, + 0xa67, + 0x6af, + 0x6af, + 0x6af, + 0x6af, + 0x877, + 0x877, + 0x877, + 0x877, + 0x7e, + 0x7e, + 0x7e, + 0x7e, + 0x5bd, + 0x5bd, + 0x5bd, + 0x5bd, + 0x9ac, + 0x9ac, + 0x9ac, + 0x9ac, + 0xca7, + 0xca7, + 0xca7, + 0xca7, + 0xbf2, + 0xbf2, + 0xbf2, + 0xbf2, + 0x33e, + 0x33e, + 0x33e, + 0x33e, + 0x6b, + 0x6b, + 0x6b, + 0x6b, + 0x774, + 0x774, + 0x774, + 0x774, + 0xc0a, + 0xc0a, + 0xc0a, + 0xc0a, + 0x94a, + 0x94a, + 0x94a, + 0x94a, + 0xb73, + 0xb73, + 0xb73, + 0xb73, + 0x3c1, + 0x3c1, + 0x3c1, + 0x3c1, + 0x71d, + 0x71d, + 0x71d, + 0x71d, + 0xa2c, + 0xa2c, + 0xa2c, + 0xa2c, + 0x1c0, + 0x1c0, + 0x1c0, + 0x1c0, + 0x8d8, + 0x8d8, + 0x8d8, + 0x8d8, + 0x2a5, + 0x2a5, + 0x2a5, + 0x2a5, + 0x806, + 0x806, + 0x806, + 0x806, + 0x8b2, + 0x8b2, + 0x1ae, + 0x1ae, + 0x22b, + 0x22b, + 0x34b, + 0x34b, + 0x81e, + 0x81e, + 0x367, + 0x367, + 0x60e, + 0x60e, + 0x69, + 0x69, + 0x1a6, + 0x1a6, + 0x24b, + 0x24b, + 0xb1, + 0xb1, + 0xc16, + 0xc16, + 0xbde, + 0xbde, + 0xb35, + 0xb35, + 0x626, + 0x626, + 0x675, + 0x675, + 0xc0b, + 0xc0b, + 0x30a, + 0x30a, + 0x487, + 0x487, + 0xc6e, + 0xc6e, + 0x9f8, + 0x9f8, + 0x5cb, + 0x5cb, + 0xaa7, + 0xaa7, + 0x45f, + 0x45f, + 0x6cb, + 0x6cb, + 0x284, + 0x284, + 0x999, + 0x999, + 0x15d, + 0x15d, + 0x1a2, + 0x1a2, + 0x149, + 0x149, + 0xc65, + 0xc65, + 0xcb6, + 0xcb6, + 0x331, + 0x331, + 0x449, + 0x449, + 0x25b, + 0x25b, + 0x262, + 0x262, + 0x52a, + 0x52a, + 0x7fc, + 0x7fc, + 0x748, + 0x748, + 0x180, + 0x180, + 0x842, + 0x842, + 0xc79, + 0xc79, + 0x4c2, + 0x4c2, + 0x7ca, + 0x7ca, + 0x997, + 0x997, + 0xdc, + 0xdc, + 0x85e, + 0x85e, + 0x686, + 0x686, + 0x860, + 0x860, + 0x707, + 0x707, + 0x803, + 0x803, + 0x31a, + 0x31a, + 0x71b, + 0x71b, + 0x9ab, + 0x9ab, + 0x99b, + 0x99b, + 0x1de, + 0x1de, + 0xc95, + 0xc95, + 0xbcd, + 0xbcd, + 0x3e4, + 0x3e4, + 0x3df, + 0x3df, + 0x3be, + 0x3be, + 0x74d, + 0x74d, + 0x5f2, + 0x5f2, + 0x65c, + 0x65c, +}; + +static const uint16_t L_kyber_aarch64_zetas_qinv[] = { + 0xffed, + 0x7b0b, + 0x399a, + 0x314, + 0x34d5, + 0xcf8e, + 0x6e1f, + 0xbeca, + 0xae56, + 0x6c6e, + 0xf129, + 0xc2b6, + 0x29c2, + 0x54f, + 0xd43f, + 0x79bc, + 0xe93d, + 0x43d4, + 0x9908, + 0x8e7f, + 0x15c4, + 0xfbb2, + 0x53bf, + 0x997f, + 0x9258, + 0x5ef9, + 0xd6dc, + 0x2260, + 0x47fb, + 0x229b, + 0x6834, + 0xc0de, + 0xe9c7, + 0xe9c7, + 0xe9c7, + 0xe9c7, + 0xe68c, + 0xe68c, + 0xe68c, + 0xe68c, + 0x5d9, + 0x5d9, + 0x5d9, + 0x5d9, + 0x78f7, + 0x78f7, + 0x78f7, + 0x78f7, + 0xa3f4, + 0xa3f4, + 0xa3f4, + 0xa3f4, + 0x4ed3, + 0x4ed3, + 0x4ed3, + 0x4ed3, + 0x50e7, + 0x50e7, + 0x50e7, + 0x50e7, + 0x61f9, + 0x61f9, + 0x61f9, + 0x61f9, + 0xce04, + 0xce04, + 0xce04, + 0xce04, + 0x67f9, + 0x67f9, + 0x67f9, + 0x67f9, + 0x3ec1, + 0x3ec1, + 0x3ec1, + 0x3ec1, + 0xcf67, + 0xcf67, + 0xcf67, + 0xcf67, + 0x23af, + 0x23af, + 0x23af, + 0x23af, + 0xfd77, + 0xfd77, + 0xfd77, + 0xfd77, + 0x9a7e, + 0x9a7e, + 0x9a7e, + 0x9a7e, + 0x6cbd, + 0x6cbd, + 0x6cbd, + 0x6cbd, + 0x4dac, + 0x4dac, + 0x4dac, + 0x4dac, + 0x91a7, + 0x91a7, + 0x91a7, + 0x91a7, + 0xc1f2, + 0xc1f2, + 0xc1f2, + 0xc1f2, + 0xdd3e, + 0xdd3e, + 0xdd3e, + 0xdd3e, + 0x916b, + 0x916b, + 0x916b, + 0x916b, + 0x2374, + 0x2374, + 0x2374, + 0x2374, + 0x8a0a, + 0x8a0a, + 0x8a0a, + 0x8a0a, + 0x474a, + 0x474a, + 0x474a, + 0x474a, + 0x3473, + 0x3473, + 0x3473, + 0x3473, + 0x36c1, + 0x36c1, + 0x36c1, + 0x36c1, + 0x8e1d, + 0x8e1d, + 0x8e1d, + 0x8e1d, + 0xce2c, + 0xce2c, + 0xce2c, + 0xce2c, + 0x41c0, + 0x41c0, + 0x41c0, + 0x41c0, + 0x10d8, 
+ 0x10d8, + 0x10d8, + 0x10d8, + 0xa1a5, + 0xa1a5, + 0xa1a5, + 0xa1a5, + 0xba06, + 0xba06, + 0xba06, + 0xba06, + 0xfeb2, + 0xfeb2, + 0x2bae, + 0x2bae, + 0xd32b, + 0xd32b, + 0x344b, + 0x344b, + 0x821e, + 0x821e, + 0xc867, + 0xc867, + 0x500e, + 0x500e, + 0xab69, + 0xab69, + 0x93a6, + 0x93a6, + 0x334b, + 0x334b, + 0x3b1, + 0x3b1, + 0xee16, + 0xee16, + 0xc5de, + 0xc5de, + 0x5a35, + 0x5a35, + 0x1826, + 0x1826, + 0x1575, + 0x1575, + 0x7d0b, + 0x7d0b, + 0x810a, + 0x810a, + 0x2987, + 0x2987, + 0x766e, + 0x766e, + 0x71f8, + 0x71f8, + 0xb6cb, + 0xb6cb, + 0x8fa7, + 0x8fa7, + 0x315f, + 0x315f, + 0xb7cb, + 0xb7cb, + 0x4e84, + 0x4e84, + 0x4499, + 0x4499, + 0x485d, + 0x485d, + 0xc7a2, + 0xc7a2, + 0x4c49, + 0x4c49, + 0xeb65, + 0xeb65, + 0xceb6, + 0xceb6, + 0x8631, + 0x8631, + 0x4f49, + 0x4f49, + 0x635b, + 0x635b, + 0x862, + 0x862, + 0xe32a, + 0xe32a, + 0x3bfc, + 0x3bfc, + 0x5f48, + 0x5f48, + 0x8180, + 0x8180, + 0xae42, + 0xae42, + 0xe779, + 0xe779, + 0x2ac2, + 0x2ac2, + 0xc5ca, + 0xc5ca, + 0x5e97, + 0x5e97, + 0xd4dc, + 0xd4dc, + 0x425e, + 0x425e, + 0x3886, + 0x3886, + 0x2860, + 0x2860, + 0xac07, + 0xac07, + 0xe103, + 0xe103, + 0xb11a, + 0xb11a, + 0xa81b, + 0xa81b, + 0x5aab, + 0x5aab, + 0x2a9b, + 0x2a9b, + 0xbbde, + 0xbbde, + 0x7b95, + 0x7b95, + 0xa2cd, + 0xa2cd, + 0x6fe4, + 0x6fe4, + 0xb0df, + 0xb0df, + 0x5dbe, + 0x5dbe, + 0x1e4d, + 0x1e4d, + 0xbbf2, + 0xbbf2, + 0x5a5c, + 0x5a5c, +}; + +void kyber_ntt(sword16* r) +{ + __asm__ __volatile__ ( +#ifndef __APPLE__ + "adrp x2, %[L_kyber_aarch64_zetas]\n\t" + "add x2, x2, :lo12:%[L_kyber_aarch64_zetas]\n\t" +#else + "adrp x2, %[L_kyber_aarch64_zetas]@PAGE\n\t" + "add x2, x2, %[L_kyber_aarch64_zetas]@PAGEOFF\n\t" +#endif /* __APPLE__ */ +#ifndef __APPLE__ + "adrp x3, %[L_kyber_aarch64_zetas_qinv]\n\t" + "add x3, x3, :lo12:%[L_kyber_aarch64_zetas_qinv]\n\t" +#else + "adrp x3, %[L_kyber_aarch64_zetas_qinv]@PAGE\n\t" + "add x3, x3, %[L_kyber_aarch64_zetas_qinv]@PAGEOFF\n\t" +#endif /* __APPLE__ */ +#ifndef __APPLE__ + "adrp x4, %[L_kyber_aarch64_consts]\n\t" + "add x4, x4, :lo12:%[L_kyber_aarch64_consts]\n\t" +#else + "adrp x4, %[L_kyber_aarch64_consts]@PAGE\n\t" + "add x4, x4, %[L_kyber_aarch64_consts]@PAGEOFF\n\t" +#endif /* __APPLE__ */ + "add x1, %x[r], #0x100\n\t" + "ldr q4, [x4]\n\t" + "ldr q5, [%x[r]]\n\t" + "ldr q6, [%x[r], #32]\n\t" + "ldr q7, [%x[r], #64]\n\t" + "ldr q8, [%x[r], #96]\n\t" + "ldr q9, [%x[r], #128]\n\t" + "ldr q10, [%x[r], #160]\n\t" + "ldr q11, [%x[r], #192]\n\t" + "ldr q12, [%x[r], #224]\n\t" + "ldr q13, [x1]\n\t" + "ldr q14, [x1, #32]\n\t" + "ldr q15, [x1, #64]\n\t" + "ldr q16, [x1, #96]\n\t" + "ldr q17, [x1, #128]\n\t" + "ldr q18, [x1, #160]\n\t" + "ldr q19, [x1, #192]\n\t" + "ldr q20, [x1, #224]\n\t" + "ldr q0, [x2]\n\t" + "ldr q1, [x3]\n\t" + "mul v29.8h, v13.8h, v1.h[1]\n\t" + "mul v30.8h, v14.8h, v1.h[1]\n\t" + "sqrdmulh v21.8h, v13.8h, v0.h[1]\n\t" + "sqrdmulh v22.8h, v14.8h, v0.h[1]\n\t" +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + "sqrdmlsh v21.8h, v29.8h, v4.h[0]\n\t" + "sqrdmlsh v22.8h, v30.8h, v4.h[0]\n\t" +#else + "sqrdmulh v29.8h, v29.8h, v4.h[0]\n\t" + "sqrdmulh v30.8h, v30.8h, v4.h[0]\n\t" + "sub v21.8h, v21.8h, v29.8h\n\t" + "sub v22.8h, v22.8h, v30.8h\n\t" +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + "sshr v21.8h, v21.8h, #1\n\t" + "sshr v22.8h, v22.8h, #1\n\t" + "mul v29.8h, v15.8h, v1.h[1]\n\t" + "mul v30.8h, v16.8h, v1.h[1]\n\t" + "sqrdmulh v23.8h, v15.8h, v0.h[1]\n\t" + "sqrdmulh v24.8h, v16.8h, v0.h[1]\n\t" +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + "sqrdmlsh v23.8h, v29.8h, v4.h[0]\n\t" + "sqrdmlsh v24.8h, v30.8h, v4.h[0]\n\t" 
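+ /* Reading of this mul/sqrdmulh/sqrdmlsh sequence: a vectorised
+  * Montgomery multiplication by the zeta constant (q = 0xd01).
+  * mul gives t*QINV mod 2^16, sqrdmulh the doubled high half of the
+  * product, sqrdmlsh subtracts the doubled high half of m*q, and the
+  * sshr #1 after the #endif removes the doubling. The #else branch
+  * below emulates sqrdmlsh with sqrdmulh and sub when
+  * WOLFSSL_AARCH64_NO_SQRMLSH is defined (see configure.ac). */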
+#else + "sqrdmulh v29.8h, v29.8h, v4.h[0]\n\t" + "sqrdmulh v30.8h, v30.8h, v4.h[0]\n\t" + "sub v23.8h, v23.8h, v29.8h\n\t" + "sub v24.8h, v24.8h, v30.8h\n\t" +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + "sshr v23.8h, v23.8h, #1\n\t" + "sshr v24.8h, v24.8h, #1\n\t" + "mul v29.8h, v17.8h, v1.h[1]\n\t" + "mul v30.8h, v18.8h, v1.h[1]\n\t" + "sqrdmulh v25.8h, v17.8h, v0.h[1]\n\t" + "sqrdmulh v26.8h, v18.8h, v0.h[1]\n\t" +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + "sqrdmlsh v25.8h, v29.8h, v4.h[0]\n\t" + "sqrdmlsh v26.8h, v30.8h, v4.h[0]\n\t" +#else + "sqrdmulh v29.8h, v29.8h, v4.h[0]\n\t" + "sqrdmulh v30.8h, v30.8h, v4.h[0]\n\t" + "sub v25.8h, v25.8h, v29.8h\n\t" + "sub v26.8h, v26.8h, v30.8h\n\t" +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + "sshr v25.8h, v25.8h, #1\n\t" + "sshr v26.8h, v26.8h, #1\n\t" + "mul v29.8h, v19.8h, v1.h[1]\n\t" + "mul v30.8h, v20.8h, v1.h[1]\n\t" + "sqrdmulh v27.8h, v19.8h, v0.h[1]\n\t" + "sqrdmulh v28.8h, v20.8h, v0.h[1]\n\t" +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + "sqrdmlsh v27.8h, v29.8h, v4.h[0]\n\t" + "sqrdmlsh v28.8h, v30.8h, v4.h[0]\n\t" +#else + "sqrdmulh v29.8h, v29.8h, v4.h[0]\n\t" + "sqrdmulh v30.8h, v30.8h, v4.h[0]\n\t" + "sub v27.8h, v27.8h, v29.8h\n\t" + "sub v28.8h, v28.8h, v30.8h\n\t" +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + "sshr v27.8h, v27.8h, #1\n\t" + "sshr v28.8h, v28.8h, #1\n\t" + "sub v13.8h, v5.8h, v21.8h\n\t" + "add v5.8h, v5.8h, v21.8h\n\t" + "sub v14.8h, v6.8h, v22.8h\n\t" + "add v6.8h, v6.8h, v22.8h\n\t" + "sub v15.8h, v7.8h, v23.8h\n\t" + "add v7.8h, v7.8h, v23.8h\n\t" + "sub v16.8h, v8.8h, v24.8h\n\t" + "add v8.8h, v8.8h, v24.8h\n\t" + "sub v17.8h, v9.8h, v25.8h\n\t" + "add v9.8h, v9.8h, v25.8h\n\t" + "sub v18.8h, v10.8h, v26.8h\n\t" + "add v10.8h, v10.8h, v26.8h\n\t" + "sub v19.8h, v11.8h, v27.8h\n\t" + "add v11.8h, v11.8h, v27.8h\n\t" + "sub v20.8h, v12.8h, v28.8h\n\t" + "add v12.8h, v12.8h, v28.8h\n\t" + "mul v29.8h, v9.8h, v1.h[2]\n\t" + "mul v30.8h, v10.8h, v1.h[2]\n\t" + "sqrdmulh v21.8h, v9.8h, v0.h[2]\n\t" + "sqrdmulh v22.8h, v10.8h, v0.h[2]\n\t" +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + "sqrdmlsh v21.8h, v29.8h, v4.h[0]\n\t" + "sqrdmlsh v22.8h, v30.8h, v4.h[0]\n\t" +#else + "sqrdmulh v29.8h, v29.8h, v4.h[0]\n\t" + "sqrdmulh v30.8h, v30.8h, v4.h[0]\n\t" + "sub v21.8h, v21.8h, v29.8h\n\t" + "sub v22.8h, v22.8h, v30.8h\n\t" +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + "sshr v21.8h, v21.8h, #1\n\t" + "sshr v22.8h, v22.8h, #1\n\t" + "mul v29.8h, v11.8h, v1.h[2]\n\t" + "mul v30.8h, v12.8h, v1.h[2]\n\t" + "sqrdmulh v23.8h, v11.8h, v0.h[2]\n\t" + "sqrdmulh v24.8h, v12.8h, v0.h[2]\n\t" +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + "sqrdmlsh v23.8h, v29.8h, v4.h[0]\n\t" + "sqrdmlsh v24.8h, v30.8h, v4.h[0]\n\t" +#else + "sqrdmulh v29.8h, v29.8h, v4.h[0]\n\t" + "sqrdmulh v30.8h, v30.8h, v4.h[0]\n\t" + "sub v23.8h, v23.8h, v29.8h\n\t" + "sub v24.8h, v24.8h, v30.8h\n\t" +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + "sshr v23.8h, v23.8h, #1\n\t" + "sshr v24.8h, v24.8h, #1\n\t" + "mul v29.8h, v17.8h, v1.h[3]\n\t" + "mul v30.8h, v18.8h, v1.h[3]\n\t" + "sqrdmulh v25.8h, v17.8h, v0.h[3]\n\t" + "sqrdmulh v26.8h, v18.8h, v0.h[3]\n\t" +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + "sqrdmlsh v25.8h, v29.8h, v4.h[0]\n\t" + "sqrdmlsh v26.8h, v30.8h, v4.h[0]\n\t" +#else + "sqrdmulh v29.8h, v29.8h, v4.h[0]\n\t" + "sqrdmulh v30.8h, v30.8h, v4.h[0]\n\t" + "sub v25.8h, v25.8h, v29.8h\n\t" + "sub v26.8h, v26.8h, v30.8h\n\t" +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + "sshr v25.8h, v25.8h, #1\n\t" + "sshr v26.8h, v26.8h, #1\n\t" + "mul v29.8h, v19.8h, v1.h[3]\n\t" + "mul v30.8h, v20.8h, 
v1.h[3]\n\t" + "sqrdmulh v27.8h, v19.8h, v0.h[3]\n\t" + "sqrdmulh v28.8h, v20.8h, v0.h[3]\n\t" +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + "sqrdmlsh v27.8h, v29.8h, v4.h[0]\n\t" + "sqrdmlsh v28.8h, v30.8h, v4.h[0]\n\t" +#else + "sqrdmulh v29.8h, v29.8h, v4.h[0]\n\t" + "sqrdmulh v30.8h, v30.8h, v4.h[0]\n\t" + "sub v27.8h, v27.8h, v29.8h\n\t" + "sub v28.8h, v28.8h, v30.8h\n\t" +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + "sshr v27.8h, v27.8h, #1\n\t" + "sshr v28.8h, v28.8h, #1\n\t" + "sub v9.8h, v5.8h, v21.8h\n\t" + "add v5.8h, v5.8h, v21.8h\n\t" + "sub v10.8h, v6.8h, v22.8h\n\t" + "add v6.8h, v6.8h, v22.8h\n\t" + "sub v11.8h, v7.8h, v23.8h\n\t" + "add v7.8h, v7.8h, v23.8h\n\t" + "sub v12.8h, v8.8h, v24.8h\n\t" + "add v8.8h, v8.8h, v24.8h\n\t" + "sub v17.8h, v13.8h, v25.8h\n\t" + "add v13.8h, v13.8h, v25.8h\n\t" + "sub v18.8h, v14.8h, v26.8h\n\t" + "add v14.8h, v14.8h, v26.8h\n\t" + "sub v19.8h, v15.8h, v27.8h\n\t" + "add v15.8h, v15.8h, v27.8h\n\t" + "sub v20.8h, v16.8h, v28.8h\n\t" + "add v16.8h, v16.8h, v28.8h\n\t" + "mul v29.8h, v7.8h, v1.h[4]\n\t" + "mul v30.8h, v8.8h, v1.h[4]\n\t" + "sqrdmulh v21.8h, v7.8h, v0.h[4]\n\t" + "sqrdmulh v22.8h, v8.8h, v0.h[4]\n\t" +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + "sqrdmlsh v21.8h, v29.8h, v4.h[0]\n\t" + "sqrdmlsh v22.8h, v30.8h, v4.h[0]\n\t" +#else + "sqrdmulh v29.8h, v29.8h, v4.h[0]\n\t" + "sqrdmulh v30.8h, v30.8h, v4.h[0]\n\t" + "sub v21.8h, v21.8h, v29.8h\n\t" + "sub v22.8h, v22.8h, v30.8h\n\t" +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + "sshr v21.8h, v21.8h, #1\n\t" + "sshr v22.8h, v22.8h, #1\n\t" + "mul v29.8h, v11.8h, v1.h[5]\n\t" + "mul v30.8h, v12.8h, v1.h[5]\n\t" + "sqrdmulh v23.8h, v11.8h, v0.h[5]\n\t" + "sqrdmulh v24.8h, v12.8h, v0.h[5]\n\t" +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + "sqrdmlsh v23.8h, v29.8h, v4.h[0]\n\t" + "sqrdmlsh v24.8h, v30.8h, v4.h[0]\n\t" +#else + "sqrdmulh v29.8h, v29.8h, v4.h[0]\n\t" + "sqrdmulh v30.8h, v30.8h, v4.h[0]\n\t" + "sub v23.8h, v23.8h, v29.8h\n\t" + "sub v24.8h, v24.8h, v30.8h\n\t" +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + "sshr v23.8h, v23.8h, #1\n\t" + "sshr v24.8h, v24.8h, #1\n\t" + "mul v29.8h, v15.8h, v1.h[6]\n\t" + "mul v30.8h, v16.8h, v1.h[6]\n\t" + "sqrdmulh v25.8h, v15.8h, v0.h[6]\n\t" + "sqrdmulh v26.8h, v16.8h, v0.h[6]\n\t" +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + "sqrdmlsh v25.8h, v29.8h, v4.h[0]\n\t" + "sqrdmlsh v26.8h, v30.8h, v4.h[0]\n\t" +#else + "sqrdmulh v29.8h, v29.8h, v4.h[0]\n\t" + "sqrdmulh v30.8h, v30.8h, v4.h[0]\n\t" + "sub v25.8h, v25.8h, v29.8h\n\t" + "sub v26.8h, v26.8h, v30.8h\n\t" +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + "sshr v25.8h, v25.8h, #1\n\t" + "sshr v26.8h, v26.8h, #1\n\t" + "mul v29.8h, v19.8h, v1.h[7]\n\t" + "mul v30.8h, v20.8h, v1.h[7]\n\t" + "sqrdmulh v27.8h, v19.8h, v0.h[7]\n\t" + "sqrdmulh v28.8h, v20.8h, v0.h[7]\n\t" +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + "sqrdmlsh v27.8h, v29.8h, v4.h[0]\n\t" + "sqrdmlsh v28.8h, v30.8h, v4.h[0]\n\t" +#else + "sqrdmulh v29.8h, v29.8h, v4.h[0]\n\t" + "sqrdmulh v30.8h, v30.8h, v4.h[0]\n\t" + "sub v27.8h, v27.8h, v29.8h\n\t" + "sub v28.8h, v28.8h, v30.8h\n\t" +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + "sshr v27.8h, v27.8h, #1\n\t" + "sshr v28.8h, v28.8h, #1\n\t" + "sub v7.8h, v5.8h, v21.8h\n\t" + "add v5.8h, v5.8h, v21.8h\n\t" + "sub v8.8h, v6.8h, v22.8h\n\t" + "add v6.8h, v6.8h, v22.8h\n\t" + "sub v11.8h, v9.8h, v23.8h\n\t" + "add v9.8h, v9.8h, v23.8h\n\t" + "sub v12.8h, v10.8h, v24.8h\n\t" + "add v10.8h, v10.8h, v24.8h\n\t" + "sub v15.8h, v13.8h, v25.8h\n\t" + "add v13.8h, v13.8h, v25.8h\n\t" + "sub v16.8h, v14.8h, v26.8h\n\t" + "add 
v14.8h, v14.8h, v26.8h\n\t" + "sub v19.8h, v17.8h, v27.8h\n\t" + "add v17.8h, v17.8h, v27.8h\n\t" + "sub v20.8h, v18.8h, v28.8h\n\t" + "add v18.8h, v18.8h, v28.8h\n\t" + "ldr q0, [x2, #16]\n\t" + "ldr q1, [x3, #16]\n\t" + "mul v29.8h, v6.8h, v1.h[0]\n\t" + "mul v30.8h, v8.8h, v1.h[1]\n\t" + "sqrdmulh v21.8h, v6.8h, v0.h[0]\n\t" + "sqrdmulh v22.8h, v8.8h, v0.h[1]\n\t" +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + "sqrdmlsh v21.8h, v29.8h, v4.h[0]\n\t" + "sqrdmlsh v22.8h, v30.8h, v4.h[0]\n\t" +#else + "sqrdmulh v29.8h, v29.8h, v4.h[0]\n\t" + "sqrdmulh v30.8h, v30.8h, v4.h[0]\n\t" + "sub v21.8h, v21.8h, v29.8h\n\t" + "sub v22.8h, v22.8h, v30.8h\n\t" +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + "sshr v21.8h, v21.8h, #1\n\t" + "sshr v22.8h, v22.8h, #1\n\t" + "mul v29.8h, v10.8h, v1.h[2]\n\t" + "mul v30.8h, v12.8h, v1.h[3]\n\t" + "sqrdmulh v23.8h, v10.8h, v0.h[2]\n\t" + "sqrdmulh v24.8h, v12.8h, v0.h[3]\n\t" +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + "sqrdmlsh v23.8h, v29.8h, v4.h[0]\n\t" + "sqrdmlsh v24.8h, v30.8h, v4.h[0]\n\t" +#else + "sqrdmulh v29.8h, v29.8h, v4.h[0]\n\t" + "sqrdmulh v30.8h, v30.8h, v4.h[0]\n\t" + "sub v23.8h, v23.8h, v29.8h\n\t" + "sub v24.8h, v24.8h, v30.8h\n\t" +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + "sshr v23.8h, v23.8h, #1\n\t" + "sshr v24.8h, v24.8h, #1\n\t" + "mul v29.8h, v14.8h, v1.h[4]\n\t" + "mul v30.8h, v16.8h, v1.h[5]\n\t" + "sqrdmulh v25.8h, v14.8h, v0.h[4]\n\t" + "sqrdmulh v26.8h, v16.8h, v0.h[5]\n\t" +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + "sqrdmlsh v25.8h, v29.8h, v4.h[0]\n\t" + "sqrdmlsh v26.8h, v30.8h, v4.h[0]\n\t" +#else + "sqrdmulh v29.8h, v29.8h, v4.h[0]\n\t" + "sqrdmulh v30.8h, v30.8h, v4.h[0]\n\t" + "sub v25.8h, v25.8h, v29.8h\n\t" + "sub v26.8h, v26.8h, v30.8h\n\t" +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + "sshr v25.8h, v25.8h, #1\n\t" + "sshr v26.8h, v26.8h, #1\n\t" + "mul v29.8h, v18.8h, v1.h[6]\n\t" + "mul v30.8h, v20.8h, v1.h[7]\n\t" + "sqrdmulh v27.8h, v18.8h, v0.h[6]\n\t" + "sqrdmulh v28.8h, v20.8h, v0.h[7]\n\t" +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + "sqrdmlsh v27.8h, v29.8h, v4.h[0]\n\t" + "sqrdmlsh v28.8h, v30.8h, v4.h[0]\n\t" +#else + "sqrdmulh v29.8h, v29.8h, v4.h[0]\n\t" + "sqrdmulh v30.8h, v30.8h, v4.h[0]\n\t" + "sub v27.8h, v27.8h, v29.8h\n\t" + "sub v28.8h, v28.8h, v30.8h\n\t" +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + "sshr v27.8h, v27.8h, #1\n\t" + "sshr v28.8h, v28.8h, #1\n\t" + "sub v6.8h, v5.8h, v21.8h\n\t" + "add v5.8h, v5.8h, v21.8h\n\t" + "sub v8.8h, v7.8h, v22.8h\n\t" + "add v7.8h, v7.8h, v22.8h\n\t" + "sub v10.8h, v9.8h, v23.8h\n\t" + "add v9.8h, v9.8h, v23.8h\n\t" + "sub v12.8h, v11.8h, v24.8h\n\t" + "add v11.8h, v11.8h, v24.8h\n\t" + "sub v14.8h, v13.8h, v25.8h\n\t" + "add v13.8h, v13.8h, v25.8h\n\t" + "sub v16.8h, v15.8h, v26.8h\n\t" + "add v15.8h, v15.8h, v26.8h\n\t" + "sub v18.8h, v17.8h, v27.8h\n\t" + "add v17.8h, v17.8h, v27.8h\n\t" + "sub v20.8h, v19.8h, v28.8h\n\t" + "add v19.8h, v19.8h, v28.8h\n\t" + "str q5, [%x[r]]\n\t" + "str q6, [%x[r], #32]\n\t" + "str q7, [%x[r], #64]\n\t" + "str q8, [%x[r], #96]\n\t" + "str q9, [%x[r], #128]\n\t" + "str q10, [%x[r], #160]\n\t" + "str q11, [%x[r], #192]\n\t" + "str q12, [%x[r], #224]\n\t" + "str q13, [x1]\n\t" + "str q14, [x1, #32]\n\t" + "str q15, [x1, #64]\n\t" + "str q16, [x1, #96]\n\t" + "str q17, [x1, #128]\n\t" + "str q18, [x1, #160]\n\t" + "str q19, [x1, #192]\n\t" + "str q20, [x1, #224]\n\t" + "ldr q5, [%x[r], #16]\n\t" + "ldr q6, [%x[r], #48]\n\t" + "ldr q7, [%x[r], #80]\n\t" + "ldr q8, [%x[r], #112]\n\t" + "ldr q9, [%x[r], #144]\n\t" + "ldr q10, [%x[r], #176]\n\t" + "ldr q11, 
[%x[r], #208]\n\t" + "ldr q12, [%x[r], #240]\n\t" + "ldr q13, [x1, #16]\n\t" + "ldr q14, [x1, #48]\n\t" + "ldr q15, [x1, #80]\n\t" + "ldr q16, [x1, #112]\n\t" + "ldr q17, [x1, #144]\n\t" + "ldr q18, [x1, #176]\n\t" + "ldr q19, [x1, #208]\n\t" + "ldr q20, [x1, #240]\n\t" + "ldr q0, [x2]\n\t" + "ldr q1, [x3]\n\t" + "mul v29.8h, v13.8h, v1.h[1]\n\t" + "mul v30.8h, v14.8h, v1.h[1]\n\t" + "sqrdmulh v21.8h, v13.8h, v0.h[1]\n\t" + "sqrdmulh v22.8h, v14.8h, v0.h[1]\n\t" +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + "sqrdmlsh v21.8h, v29.8h, v4.h[0]\n\t" + "sqrdmlsh v22.8h, v30.8h, v4.h[0]\n\t" +#else + "sqrdmulh v29.8h, v29.8h, v4.h[0]\n\t" + "sqrdmulh v30.8h, v30.8h, v4.h[0]\n\t" + "sub v21.8h, v21.8h, v29.8h\n\t" + "sub v22.8h, v22.8h, v30.8h\n\t" +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + "sshr v21.8h, v21.8h, #1\n\t" + "sshr v22.8h, v22.8h, #1\n\t" + "mul v29.8h, v15.8h, v1.h[1]\n\t" + "mul v30.8h, v16.8h, v1.h[1]\n\t" + "sqrdmulh v23.8h, v15.8h, v0.h[1]\n\t" + "sqrdmulh v24.8h, v16.8h, v0.h[1]\n\t" +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + "sqrdmlsh v23.8h, v29.8h, v4.h[0]\n\t" + "sqrdmlsh v24.8h, v30.8h, v4.h[0]\n\t" +#else + "sqrdmulh v29.8h, v29.8h, v4.h[0]\n\t" + "sqrdmulh v30.8h, v30.8h, v4.h[0]\n\t" + "sub v23.8h, v23.8h, v29.8h\n\t" + "sub v24.8h, v24.8h, v30.8h\n\t" +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + "sshr v23.8h, v23.8h, #1\n\t" + "sshr v24.8h, v24.8h, #1\n\t" + "mul v29.8h, v17.8h, v1.h[1]\n\t" + "mul v30.8h, v18.8h, v1.h[1]\n\t" + "sqrdmulh v25.8h, v17.8h, v0.h[1]\n\t" + "sqrdmulh v26.8h, v18.8h, v0.h[1]\n\t" +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + "sqrdmlsh v25.8h, v29.8h, v4.h[0]\n\t" + "sqrdmlsh v26.8h, v30.8h, v4.h[0]\n\t" +#else + "sqrdmulh v29.8h, v29.8h, v4.h[0]\n\t" + "sqrdmulh v30.8h, v30.8h, v4.h[0]\n\t" + "sub v25.8h, v25.8h, v29.8h\n\t" + "sub v26.8h, v26.8h, v30.8h\n\t" +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + "sshr v25.8h, v25.8h, #1\n\t" + "sshr v26.8h, v26.8h, #1\n\t" + "mul v29.8h, v19.8h, v1.h[1]\n\t" + "mul v30.8h, v20.8h, v1.h[1]\n\t" + "sqrdmulh v27.8h, v19.8h, v0.h[1]\n\t" + "sqrdmulh v28.8h, v20.8h, v0.h[1]\n\t" +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + "sqrdmlsh v27.8h, v29.8h, v4.h[0]\n\t" + "sqrdmlsh v28.8h, v30.8h, v4.h[0]\n\t" +#else + "sqrdmulh v29.8h, v29.8h, v4.h[0]\n\t" + "sqrdmulh v30.8h, v30.8h, v4.h[0]\n\t" + "sub v27.8h, v27.8h, v29.8h\n\t" + "sub v28.8h, v28.8h, v30.8h\n\t" +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + "sshr v27.8h, v27.8h, #1\n\t" + "sshr v28.8h, v28.8h, #1\n\t" + "sub v13.8h, v5.8h, v21.8h\n\t" + "add v5.8h, v5.8h, v21.8h\n\t" + "sub v14.8h, v6.8h, v22.8h\n\t" + "add v6.8h, v6.8h, v22.8h\n\t" + "sub v15.8h, v7.8h, v23.8h\n\t" + "add v7.8h, v7.8h, v23.8h\n\t" + "sub v16.8h, v8.8h, v24.8h\n\t" + "add v8.8h, v8.8h, v24.8h\n\t" + "sub v17.8h, v9.8h, v25.8h\n\t" + "add v9.8h, v9.8h, v25.8h\n\t" + "sub v18.8h, v10.8h, v26.8h\n\t" + "add v10.8h, v10.8h, v26.8h\n\t" + "sub v19.8h, v11.8h, v27.8h\n\t" + "add v11.8h, v11.8h, v27.8h\n\t" + "sub v20.8h, v12.8h, v28.8h\n\t" + "add v12.8h, v12.8h, v28.8h\n\t" + "mul v29.8h, v9.8h, v1.h[2]\n\t" + "mul v30.8h, v10.8h, v1.h[2]\n\t" + "sqrdmulh v21.8h, v9.8h, v0.h[2]\n\t" + "sqrdmulh v22.8h, v10.8h, v0.h[2]\n\t" +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + "sqrdmlsh v21.8h, v29.8h, v4.h[0]\n\t" + "sqrdmlsh v22.8h, v30.8h, v4.h[0]\n\t" +#else + "sqrdmulh v29.8h, v29.8h, v4.h[0]\n\t" + "sqrdmulh v30.8h, v30.8h, v4.h[0]\n\t" + "sub v21.8h, v21.8h, v29.8h\n\t" + "sub v22.8h, v22.8h, v30.8h\n\t" +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + "sshr v21.8h, v21.8h, #1\n\t" + "sshr v22.8h, v22.8h, #1\n\t" + "mul 
v29.8h, v11.8h, v1.h[2]\n\t" + "mul v30.8h, v12.8h, v1.h[2]\n\t" + "sqrdmulh v23.8h, v11.8h, v0.h[2]\n\t" + "sqrdmulh v24.8h, v12.8h, v0.h[2]\n\t" +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + "sqrdmlsh v23.8h, v29.8h, v4.h[0]\n\t" + "sqrdmlsh v24.8h, v30.8h, v4.h[0]\n\t" +#else + "sqrdmulh v29.8h, v29.8h, v4.h[0]\n\t" + "sqrdmulh v30.8h, v30.8h, v4.h[0]\n\t" + "sub v23.8h, v23.8h, v29.8h\n\t" + "sub v24.8h, v24.8h, v30.8h\n\t" +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + "sshr v23.8h, v23.8h, #1\n\t" + "sshr v24.8h, v24.8h, #1\n\t" + "mul v29.8h, v17.8h, v1.h[3]\n\t" + "mul v30.8h, v18.8h, v1.h[3]\n\t" + "sqrdmulh v25.8h, v17.8h, v0.h[3]\n\t" + "sqrdmulh v26.8h, v18.8h, v0.h[3]\n\t" +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + "sqrdmlsh v25.8h, v29.8h, v4.h[0]\n\t" + "sqrdmlsh v26.8h, v30.8h, v4.h[0]\n\t" +#else + "sqrdmulh v29.8h, v29.8h, v4.h[0]\n\t" + "sqrdmulh v30.8h, v30.8h, v4.h[0]\n\t" + "sub v25.8h, v25.8h, v29.8h\n\t" + "sub v26.8h, v26.8h, v30.8h\n\t" +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + "sshr v25.8h, v25.8h, #1\n\t" + "sshr v26.8h, v26.8h, #1\n\t" + "mul v29.8h, v19.8h, v1.h[3]\n\t" + "mul v30.8h, v20.8h, v1.h[3]\n\t" + "sqrdmulh v27.8h, v19.8h, v0.h[3]\n\t" + "sqrdmulh v28.8h, v20.8h, v0.h[3]\n\t" +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + "sqrdmlsh v27.8h, v29.8h, v4.h[0]\n\t" + "sqrdmlsh v28.8h, v30.8h, v4.h[0]\n\t" +#else + "sqrdmulh v29.8h, v29.8h, v4.h[0]\n\t" + "sqrdmulh v30.8h, v30.8h, v4.h[0]\n\t" + "sub v27.8h, v27.8h, v29.8h\n\t" + "sub v28.8h, v28.8h, v30.8h\n\t" +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + "sshr v27.8h, v27.8h, #1\n\t" + "sshr v28.8h, v28.8h, #1\n\t" + "sub v9.8h, v5.8h, v21.8h\n\t" + "add v5.8h, v5.8h, v21.8h\n\t" + "sub v10.8h, v6.8h, v22.8h\n\t" + "add v6.8h, v6.8h, v22.8h\n\t" + "sub v11.8h, v7.8h, v23.8h\n\t" + "add v7.8h, v7.8h, v23.8h\n\t" + "sub v12.8h, v8.8h, v24.8h\n\t" + "add v8.8h, v8.8h, v24.8h\n\t" + "sub v17.8h, v13.8h, v25.8h\n\t" + "add v13.8h, v13.8h, v25.8h\n\t" + "sub v18.8h, v14.8h, v26.8h\n\t" + "add v14.8h, v14.8h, v26.8h\n\t" + "sub v19.8h, v15.8h, v27.8h\n\t" + "add v15.8h, v15.8h, v27.8h\n\t" + "sub v20.8h, v16.8h, v28.8h\n\t" + "add v16.8h, v16.8h, v28.8h\n\t" + "mul v29.8h, v7.8h, v1.h[4]\n\t" + "mul v30.8h, v8.8h, v1.h[4]\n\t" + "sqrdmulh v21.8h, v7.8h, v0.h[4]\n\t" + "sqrdmulh v22.8h, v8.8h, v0.h[4]\n\t" +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + "sqrdmlsh v21.8h, v29.8h, v4.h[0]\n\t" + "sqrdmlsh v22.8h, v30.8h, v4.h[0]\n\t" +#else + "sqrdmulh v29.8h, v29.8h, v4.h[0]\n\t" + "sqrdmulh v30.8h, v30.8h, v4.h[0]\n\t" + "sub v21.8h, v21.8h, v29.8h\n\t" + "sub v22.8h, v22.8h, v30.8h\n\t" +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + "sshr v21.8h, v21.8h, #1\n\t" + "sshr v22.8h, v22.8h, #1\n\t" + "mul v29.8h, v11.8h, v1.h[5]\n\t" + "mul v30.8h, v12.8h, v1.h[5]\n\t" + "sqrdmulh v23.8h, v11.8h, v0.h[5]\n\t" + "sqrdmulh v24.8h, v12.8h, v0.h[5]\n\t" +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + "sqrdmlsh v23.8h, v29.8h, v4.h[0]\n\t" + "sqrdmlsh v24.8h, v30.8h, v4.h[0]\n\t" +#else + "sqrdmulh v29.8h, v29.8h, v4.h[0]\n\t" + "sqrdmulh v30.8h, v30.8h, v4.h[0]\n\t" + "sub v23.8h, v23.8h, v29.8h\n\t" + "sub v24.8h, v24.8h, v30.8h\n\t" +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + "sshr v23.8h, v23.8h, #1\n\t" + "sshr v24.8h, v24.8h, #1\n\t" + "mul v29.8h, v15.8h, v1.h[6]\n\t" + "mul v30.8h, v16.8h, v1.h[6]\n\t" + "sqrdmulh v25.8h, v15.8h, v0.h[6]\n\t" + "sqrdmulh v26.8h, v16.8h, v0.h[6]\n\t" +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + "sqrdmlsh v25.8h, v29.8h, v4.h[0]\n\t" + "sqrdmlsh v26.8h, v30.8h, v4.h[0]\n\t" +#else + "sqrdmulh v29.8h, v29.8h, v4.h[0]\n\t" + "sqrdmulh 
v30.8h, v30.8h, v4.h[0]\n\t" + "sub v25.8h, v25.8h, v29.8h\n\t" + "sub v26.8h, v26.8h, v30.8h\n\t" +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + "sshr v25.8h, v25.8h, #1\n\t" + "sshr v26.8h, v26.8h, #1\n\t" + "mul v29.8h, v19.8h, v1.h[7]\n\t" + "mul v30.8h, v20.8h, v1.h[7]\n\t" + "sqrdmulh v27.8h, v19.8h, v0.h[7]\n\t" + "sqrdmulh v28.8h, v20.8h, v0.h[7]\n\t" +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + "sqrdmlsh v27.8h, v29.8h, v4.h[0]\n\t" + "sqrdmlsh v28.8h, v30.8h, v4.h[0]\n\t" +#else + "sqrdmulh v29.8h, v29.8h, v4.h[0]\n\t" + "sqrdmulh v30.8h, v30.8h, v4.h[0]\n\t" + "sub v27.8h, v27.8h, v29.8h\n\t" + "sub v28.8h, v28.8h, v30.8h\n\t" +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + "sshr v27.8h, v27.8h, #1\n\t" + "sshr v28.8h, v28.8h, #1\n\t" + "sub v7.8h, v5.8h, v21.8h\n\t" + "add v5.8h, v5.8h, v21.8h\n\t" + "sub v8.8h, v6.8h, v22.8h\n\t" + "add v6.8h, v6.8h, v22.8h\n\t" + "sub v11.8h, v9.8h, v23.8h\n\t" + "add v9.8h, v9.8h, v23.8h\n\t" + "sub v12.8h, v10.8h, v24.8h\n\t" + "add v10.8h, v10.8h, v24.8h\n\t" + "sub v15.8h, v13.8h, v25.8h\n\t" + "add v13.8h, v13.8h, v25.8h\n\t" + "sub v16.8h, v14.8h, v26.8h\n\t" + "add v14.8h, v14.8h, v26.8h\n\t" + "sub v19.8h, v17.8h, v27.8h\n\t" + "add v17.8h, v17.8h, v27.8h\n\t" + "sub v20.8h, v18.8h, v28.8h\n\t" + "add v18.8h, v18.8h, v28.8h\n\t" + "ldr q0, [x2, #16]\n\t" + "ldr q1, [x3, #16]\n\t" + "mul v29.8h, v6.8h, v1.h[0]\n\t" + "mul v30.8h, v8.8h, v1.h[1]\n\t" + "sqrdmulh v21.8h, v6.8h, v0.h[0]\n\t" + "sqrdmulh v22.8h, v8.8h, v0.h[1]\n\t" +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + "sqrdmlsh v21.8h, v29.8h, v4.h[0]\n\t" + "sqrdmlsh v22.8h, v30.8h, v4.h[0]\n\t" +#else + "sqrdmulh v29.8h, v29.8h, v4.h[0]\n\t" + "sqrdmulh v30.8h, v30.8h, v4.h[0]\n\t" + "sub v21.8h, v21.8h, v29.8h\n\t" + "sub v22.8h, v22.8h, v30.8h\n\t" +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + "sshr v21.8h, v21.8h, #1\n\t" + "sshr v22.8h, v22.8h, #1\n\t" + "mul v29.8h, v10.8h, v1.h[2]\n\t" + "mul v30.8h, v12.8h, v1.h[3]\n\t" + "sqrdmulh v23.8h, v10.8h, v0.h[2]\n\t" + "sqrdmulh v24.8h, v12.8h, v0.h[3]\n\t" +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + "sqrdmlsh v23.8h, v29.8h, v4.h[0]\n\t" + "sqrdmlsh v24.8h, v30.8h, v4.h[0]\n\t" +#else + "sqrdmulh v29.8h, v29.8h, v4.h[0]\n\t" + "sqrdmulh v30.8h, v30.8h, v4.h[0]\n\t" + "sub v23.8h, v23.8h, v29.8h\n\t" + "sub v24.8h, v24.8h, v30.8h\n\t" +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + "sshr v23.8h, v23.8h, #1\n\t" + "sshr v24.8h, v24.8h, #1\n\t" + "mul v29.8h, v14.8h, v1.h[4]\n\t" + "mul v30.8h, v16.8h, v1.h[5]\n\t" + "sqrdmulh v25.8h, v14.8h, v0.h[4]\n\t" + "sqrdmulh v26.8h, v16.8h, v0.h[5]\n\t" +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + "sqrdmlsh v25.8h, v29.8h, v4.h[0]\n\t" + "sqrdmlsh v26.8h, v30.8h, v4.h[0]\n\t" +#else + "sqrdmulh v29.8h, v29.8h, v4.h[0]\n\t" + "sqrdmulh v30.8h, v30.8h, v4.h[0]\n\t" + "sub v25.8h, v25.8h, v29.8h\n\t" + "sub v26.8h, v26.8h, v30.8h\n\t" +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + "sshr v25.8h, v25.8h, #1\n\t" + "sshr v26.8h, v26.8h, #1\n\t" + "mul v29.8h, v18.8h, v1.h[6]\n\t" + "mul v30.8h, v20.8h, v1.h[7]\n\t" + "sqrdmulh v27.8h, v18.8h, v0.h[6]\n\t" + "sqrdmulh v28.8h, v20.8h, v0.h[7]\n\t" +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + "sqrdmlsh v27.8h, v29.8h, v4.h[0]\n\t" + "sqrdmlsh v28.8h, v30.8h, v4.h[0]\n\t" +#else + "sqrdmulh v29.8h, v29.8h, v4.h[0]\n\t" + "sqrdmulh v30.8h, v30.8h, v4.h[0]\n\t" + "sub v27.8h, v27.8h, v29.8h\n\t" + "sub v28.8h, v28.8h, v30.8h\n\t" +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + "sshr v27.8h, v27.8h, #1\n\t" + "sshr v28.8h, v28.8h, #1\n\t" + "sub v6.8h, v5.8h, v21.8h\n\t" + "add v5.8h, v5.8h, v21.8h\n\t" + 
"sub v8.8h, v7.8h, v22.8h\n\t" + "add v7.8h, v7.8h, v22.8h\n\t" + "sub v10.8h, v9.8h, v23.8h\n\t" + "add v9.8h, v9.8h, v23.8h\n\t" + "sub v12.8h, v11.8h, v24.8h\n\t" + "add v11.8h, v11.8h, v24.8h\n\t" + "sub v14.8h, v13.8h, v25.8h\n\t" + "add v13.8h, v13.8h, v25.8h\n\t" + "sub v16.8h, v15.8h, v26.8h\n\t" + "add v15.8h, v15.8h, v26.8h\n\t" + "sub v18.8h, v17.8h, v27.8h\n\t" + "add v17.8h, v17.8h, v27.8h\n\t" + "sub v20.8h, v19.8h, v28.8h\n\t" + "add v19.8h, v19.8h, v28.8h\n\t" + "str q5, [%x[r], #16]\n\t" + "str q6, [%x[r], #48]\n\t" + "str q7, [%x[r], #80]\n\t" + "str q8, [%x[r], #112]\n\t" + "str q9, [%x[r], #144]\n\t" + "str q10, [%x[r], #176]\n\t" + "str q11, [%x[r], #208]\n\t" + "str q12, [%x[r], #240]\n\t" + "str q13, [x1, #16]\n\t" + "str q14, [x1, #48]\n\t" + "str q15, [x1, #80]\n\t" + "str q16, [x1, #112]\n\t" + "str q17, [x1, #144]\n\t" + "str q18, [x1, #176]\n\t" + "str q19, [x1, #208]\n\t" + "str q20, [x1, #240]\n\t" + "ldp q5, q6, [%x[r]]\n\t" + "ldp q7, q8, [%x[r], #32]\n\t" + "ldp q9, q10, [%x[r], #64]\n\t" + "ldp q11, q12, [%x[r], #96]\n\t" + "ldp q13, q14, [%x[r], #128]\n\t" + "ldp q15, q16, [%x[r], #160]\n\t" + "ldp q17, q18, [%x[r], #192]\n\t" + "ldp q19, q20, [%x[r], #224]\n\t" + "ldr q0, [x2, #32]\n\t" + "ldr q1, [x3, #32]\n\t" + "mul v29.8h, v6.8h, v1.h[0]\n\t" + "mul v30.8h, v8.8h, v1.h[1]\n\t" + "sqrdmulh v21.8h, v6.8h, v0.h[0]\n\t" + "sqrdmulh v22.8h, v8.8h, v0.h[1]\n\t" +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + "sqrdmlsh v21.8h, v29.8h, v4.h[0]\n\t" + "sqrdmlsh v22.8h, v30.8h, v4.h[0]\n\t" +#else + "sqrdmulh v29.8h, v29.8h, v4.h[0]\n\t" + "sqrdmulh v30.8h, v30.8h, v4.h[0]\n\t" + "sub v21.8h, v21.8h, v29.8h\n\t" + "sub v22.8h, v22.8h, v30.8h\n\t" +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + "sshr v21.8h, v21.8h, #1\n\t" + "sshr v22.8h, v22.8h, #1\n\t" + "mul v29.8h, v10.8h, v1.h[2]\n\t" + "mul v30.8h, v12.8h, v1.h[3]\n\t" + "sqrdmulh v23.8h, v10.8h, v0.h[2]\n\t" + "sqrdmulh v24.8h, v12.8h, v0.h[3]\n\t" +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + "sqrdmlsh v23.8h, v29.8h, v4.h[0]\n\t" + "sqrdmlsh v24.8h, v30.8h, v4.h[0]\n\t" +#else + "sqrdmulh v29.8h, v29.8h, v4.h[0]\n\t" + "sqrdmulh v30.8h, v30.8h, v4.h[0]\n\t" + "sub v23.8h, v23.8h, v29.8h\n\t" + "sub v24.8h, v24.8h, v30.8h\n\t" +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + "sshr v23.8h, v23.8h, #1\n\t" + "sshr v24.8h, v24.8h, #1\n\t" + "mul v29.8h, v14.8h, v1.h[4]\n\t" + "mul v30.8h, v16.8h, v1.h[5]\n\t" + "sqrdmulh v25.8h, v14.8h, v0.h[4]\n\t" + "sqrdmulh v26.8h, v16.8h, v0.h[5]\n\t" +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + "sqrdmlsh v25.8h, v29.8h, v4.h[0]\n\t" + "sqrdmlsh v26.8h, v30.8h, v4.h[0]\n\t" +#else + "sqrdmulh v29.8h, v29.8h, v4.h[0]\n\t" + "sqrdmulh v30.8h, v30.8h, v4.h[0]\n\t" + "sub v25.8h, v25.8h, v29.8h\n\t" + "sub v26.8h, v26.8h, v30.8h\n\t" +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + "sshr v25.8h, v25.8h, #1\n\t" + "sshr v26.8h, v26.8h, #1\n\t" + "mul v29.8h, v18.8h, v1.h[6]\n\t" + "mul v30.8h, v20.8h, v1.h[7]\n\t" + "sqrdmulh v27.8h, v18.8h, v0.h[6]\n\t" + "sqrdmulh v28.8h, v20.8h, v0.h[7]\n\t" +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + "sqrdmlsh v27.8h, v29.8h, v4.h[0]\n\t" + "sqrdmlsh v28.8h, v30.8h, v4.h[0]\n\t" +#else + "sqrdmulh v29.8h, v29.8h, v4.h[0]\n\t" + "sqrdmulh v30.8h, v30.8h, v4.h[0]\n\t" + "sub v27.8h, v27.8h, v29.8h\n\t" + "sub v28.8h, v28.8h, v30.8h\n\t" +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + "sshr v27.8h, v27.8h, #1\n\t" + "sshr v28.8h, v28.8h, #1\n\t" + "sub v6.8h, v5.8h, v21.8h\n\t" + "add v5.8h, v5.8h, v21.8h\n\t" + "sub v8.8h, v7.8h, v22.8h\n\t" + "add v7.8h, v7.8h, v22.8h\n\t" + "sub 
v10.8h, v9.8h, v23.8h\n\t" + "add v9.8h, v9.8h, v23.8h\n\t" + "sub v12.8h, v11.8h, v24.8h\n\t" + "add v11.8h, v11.8h, v24.8h\n\t" + "sub v14.8h, v13.8h, v25.8h\n\t" + "add v13.8h, v13.8h, v25.8h\n\t" + "sub v16.8h, v15.8h, v26.8h\n\t" + "add v15.8h, v15.8h, v26.8h\n\t" + "sub v18.8h, v17.8h, v27.8h\n\t" + "add v17.8h, v17.8h, v27.8h\n\t" + "sub v20.8h, v19.8h, v28.8h\n\t" + "add v19.8h, v19.8h, v28.8h\n\t" + "ldr q0, [x2, #64]\n\t" + "ldr q2, [x2, #80]\n\t" + "ldr q1, [x3, #64]\n\t" + "ldr q3, [x3, #80]\n\t" + "mov v29.16b, v5.16b\n\t" + "mov v30.16b, v7.16b\n\t" + "trn1 v5.2d, v5.2d, v6.2d\n\t" + "trn1 v7.2d, v7.2d, v8.2d\n\t" + "trn2 v6.2d, v29.2d, v6.2d\n\t" + "trn2 v8.2d, v30.2d, v8.2d\n\t" + "mul v29.8h, v6.8h, v1.8h\n\t" + "mul v30.8h, v8.8h, v3.8h\n\t" + "sqrdmulh v21.8h, v6.8h, v0.8h\n\t" + "sqrdmulh v22.8h, v8.8h, v2.8h\n\t" +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + "sqrdmlsh v21.8h, v29.8h, v4.h[0]\n\t" + "sqrdmlsh v22.8h, v30.8h, v4.h[0]\n\t" +#else + "sqrdmulh v29.8h, v29.8h, v4.h[0]\n\t" + "sqrdmulh v30.8h, v30.8h, v4.h[0]\n\t" + "sub v21.8h, v21.8h, v29.8h\n\t" + "sub v22.8h, v22.8h, v30.8h\n\t" +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + "sshr v21.8h, v21.8h, #1\n\t" + "sshr v22.8h, v22.8h, #1\n\t" + "ldr q0, [x2, #96]\n\t" + "ldr q2, [x2, #112]\n\t" + "ldr q1, [x3, #96]\n\t" + "ldr q3, [x3, #112]\n\t" + "mov v29.16b, v9.16b\n\t" + "mov v30.16b, v11.16b\n\t" + "trn1 v9.2d, v9.2d, v10.2d\n\t" + "trn1 v11.2d, v11.2d, v12.2d\n\t" + "trn2 v10.2d, v29.2d, v10.2d\n\t" + "trn2 v12.2d, v30.2d, v12.2d\n\t" + "mul v29.8h, v10.8h, v1.8h\n\t" + "mul v30.8h, v12.8h, v3.8h\n\t" + "sqrdmulh v23.8h, v10.8h, v0.8h\n\t" + "sqrdmulh v24.8h, v12.8h, v2.8h\n\t" +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + "sqrdmlsh v23.8h, v29.8h, v4.h[0]\n\t" + "sqrdmlsh v24.8h, v30.8h, v4.h[0]\n\t" +#else + "sqrdmulh v29.8h, v29.8h, v4.h[0]\n\t" + "sqrdmulh v30.8h, v30.8h, v4.h[0]\n\t" + "sub v23.8h, v23.8h, v29.8h\n\t" + "sub v24.8h, v24.8h, v30.8h\n\t" +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + "sshr v23.8h, v23.8h, #1\n\t" + "sshr v24.8h, v24.8h, #1\n\t" + "ldr q0, [x2, #128]\n\t" + "ldr q2, [x2, #144]\n\t" + "ldr q1, [x3, #128]\n\t" + "ldr q3, [x3, #144]\n\t" + "mov v29.16b, v13.16b\n\t" + "mov v30.16b, v15.16b\n\t" + "trn1 v13.2d, v13.2d, v14.2d\n\t" + "trn1 v15.2d, v15.2d, v16.2d\n\t" + "trn2 v14.2d, v29.2d, v14.2d\n\t" + "trn2 v16.2d, v30.2d, v16.2d\n\t" + "mul v29.8h, v14.8h, v1.8h\n\t" + "mul v30.8h, v16.8h, v3.8h\n\t" + "sqrdmulh v25.8h, v14.8h, v0.8h\n\t" + "sqrdmulh v26.8h, v16.8h, v2.8h\n\t" +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + "sqrdmlsh v25.8h, v29.8h, v4.h[0]\n\t" + "sqrdmlsh v26.8h, v30.8h, v4.h[0]\n\t" +#else + "sqrdmulh v29.8h, v29.8h, v4.h[0]\n\t" + "sqrdmulh v30.8h, v30.8h, v4.h[0]\n\t" + "sub v25.8h, v25.8h, v29.8h\n\t" + "sub v26.8h, v26.8h, v30.8h\n\t" +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + "sshr v25.8h, v25.8h, #1\n\t" + "sshr v26.8h, v26.8h, #1\n\t" + "ldr q0, [x2, #160]\n\t" + "ldr q2, [x2, #176]\n\t" + "ldr q1, [x3, #160]\n\t" + "ldr q3, [x3, #176]\n\t" + "mov v29.16b, v17.16b\n\t" + "mov v30.16b, v19.16b\n\t" + "trn1 v17.2d, v17.2d, v18.2d\n\t" + "trn1 v19.2d, v19.2d, v20.2d\n\t" + "trn2 v18.2d, v29.2d, v18.2d\n\t" + "trn2 v20.2d, v30.2d, v20.2d\n\t" + "mul v29.8h, v18.8h, v1.8h\n\t" + "mul v30.8h, v20.8h, v3.8h\n\t" + "sqrdmulh v27.8h, v18.8h, v0.8h\n\t" + "sqrdmulh v28.8h, v20.8h, v2.8h\n\t" +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + "sqrdmlsh v27.8h, v29.8h, v4.h[0]\n\t" + "sqrdmlsh v28.8h, v30.8h, v4.h[0]\n\t" +#else + "sqrdmulh v29.8h, v29.8h, v4.h[0]\n\t" + "sqrdmulh v30.8h, v30.8h, 
v4.h[0]\n\t" + "sub v27.8h, v27.8h, v29.8h\n\t" + "sub v28.8h, v28.8h, v30.8h\n\t" +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + "sshr v27.8h, v27.8h, #1\n\t" + "sshr v28.8h, v28.8h, #1\n\t" + "sub v6.8h, v5.8h, v21.8h\n\t" + "add v5.8h, v5.8h, v21.8h\n\t" + "sub v8.8h, v7.8h, v22.8h\n\t" + "add v7.8h, v7.8h, v22.8h\n\t" + "sub v10.8h, v9.8h, v23.8h\n\t" + "add v9.8h, v9.8h, v23.8h\n\t" + "sub v12.8h, v11.8h, v24.8h\n\t" + "add v11.8h, v11.8h, v24.8h\n\t" + "sub v14.8h, v13.8h, v25.8h\n\t" + "add v13.8h, v13.8h, v25.8h\n\t" + "sub v16.8h, v15.8h, v26.8h\n\t" + "add v15.8h, v15.8h, v26.8h\n\t" + "sub v18.8h, v17.8h, v27.8h\n\t" + "add v17.8h, v17.8h, v27.8h\n\t" + "sub v20.8h, v19.8h, v28.8h\n\t" + "add v19.8h, v19.8h, v28.8h\n\t" + "ldr q0, [x2, #320]\n\t" + "ldr q2, [x2, #336]\n\t" + "ldr q1, [x3, #320]\n\t" + "ldr q3, [x3, #336]\n\t" + "mov v29.16b, v5.16b\n\t" + "mov v30.16b, v7.16b\n\t" + "trn1 v5.4s, v5.4s, v6.4s\n\t" + "trn1 v7.4s, v7.4s, v8.4s\n\t" + "trn2 v6.4s, v29.4s, v6.4s\n\t" + "trn2 v8.4s, v30.4s, v8.4s\n\t" + "mul v29.8h, v6.8h, v1.8h\n\t" + "mul v30.8h, v8.8h, v3.8h\n\t" + "sqrdmulh v21.8h, v6.8h, v0.8h\n\t" + "sqrdmulh v22.8h, v8.8h, v2.8h\n\t" +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + "sqrdmlsh v21.8h, v29.8h, v4.h[0]\n\t" + "sqrdmlsh v22.8h, v30.8h, v4.h[0]\n\t" +#else + "sqrdmulh v29.8h, v29.8h, v4.h[0]\n\t" + "sqrdmulh v30.8h, v30.8h, v4.h[0]\n\t" + "sub v21.8h, v21.8h, v29.8h\n\t" + "sub v22.8h, v22.8h, v30.8h\n\t" +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + "sshr v21.8h, v21.8h, #1\n\t" + "sshr v22.8h, v22.8h, #1\n\t" + "ldr q0, [x2, #352]\n\t" + "ldr q2, [x2, #368]\n\t" + "ldr q1, [x3, #352]\n\t" + "ldr q3, [x3, #368]\n\t" + "mov v29.16b, v9.16b\n\t" + "mov v30.16b, v11.16b\n\t" + "trn1 v9.4s, v9.4s, v10.4s\n\t" + "trn1 v11.4s, v11.4s, v12.4s\n\t" + "trn2 v10.4s, v29.4s, v10.4s\n\t" + "trn2 v12.4s, v30.4s, v12.4s\n\t" + "mul v29.8h, v10.8h, v1.8h\n\t" + "mul v30.8h, v12.8h, v3.8h\n\t" + "sqrdmulh v23.8h, v10.8h, v0.8h\n\t" + "sqrdmulh v24.8h, v12.8h, v2.8h\n\t" +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + "sqrdmlsh v23.8h, v29.8h, v4.h[0]\n\t" + "sqrdmlsh v24.8h, v30.8h, v4.h[0]\n\t" +#else + "sqrdmulh v29.8h, v29.8h, v4.h[0]\n\t" + "sqrdmulh v30.8h, v30.8h, v4.h[0]\n\t" + "sub v23.8h, v23.8h, v29.8h\n\t" + "sub v24.8h, v24.8h, v30.8h\n\t" +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + "sshr v23.8h, v23.8h, #1\n\t" + "sshr v24.8h, v24.8h, #1\n\t" + "ldr q0, [x2, #384]\n\t" + "ldr q2, [x2, #400]\n\t" + "ldr q1, [x3, #384]\n\t" + "ldr q3, [x3, #400]\n\t" + "mov v29.16b, v13.16b\n\t" + "mov v30.16b, v15.16b\n\t" + "trn1 v13.4s, v13.4s, v14.4s\n\t" + "trn1 v15.4s, v15.4s, v16.4s\n\t" + "trn2 v14.4s, v29.4s, v14.4s\n\t" + "trn2 v16.4s, v30.4s, v16.4s\n\t" + "mul v29.8h, v14.8h, v1.8h\n\t" + "mul v30.8h, v16.8h, v3.8h\n\t" + "sqrdmulh v25.8h, v14.8h, v0.8h\n\t" + "sqrdmulh v26.8h, v16.8h, v2.8h\n\t" +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + "sqrdmlsh v25.8h, v29.8h, v4.h[0]\n\t" + "sqrdmlsh v26.8h, v30.8h, v4.h[0]\n\t" +#else + "sqrdmulh v29.8h, v29.8h, v4.h[0]\n\t" + "sqrdmulh v30.8h, v30.8h, v4.h[0]\n\t" + "sub v25.8h, v25.8h, v29.8h\n\t" + "sub v26.8h, v26.8h, v30.8h\n\t" +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + "sshr v25.8h, v25.8h, #1\n\t" + "sshr v26.8h, v26.8h, #1\n\t" + "ldr q0, [x2, #416]\n\t" + "ldr q2, [x2, #432]\n\t" + "ldr q1, [x3, #416]\n\t" + "ldr q3, [x3, #432]\n\t" + "mov v29.16b, v17.16b\n\t" + "mov v30.16b, v19.16b\n\t" + "trn1 v17.4s, v17.4s, v18.4s\n\t" + "trn1 v19.4s, v19.4s, v20.4s\n\t" + "trn2 v18.4s, v29.4s, v18.4s\n\t" + "trn2 v20.4s, v30.4s, v20.4s\n\t" + "mul 
v29.8h, v18.8h, v1.8h\n\t" + "mul v30.8h, v20.8h, v3.8h\n\t" + "sqrdmulh v27.8h, v18.8h, v0.8h\n\t" + "sqrdmulh v28.8h, v20.8h, v2.8h\n\t" +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + "sqrdmlsh v27.8h, v29.8h, v4.h[0]\n\t" + "sqrdmlsh v28.8h, v30.8h, v4.h[0]\n\t" +#else + "sqrdmulh v29.8h, v29.8h, v4.h[0]\n\t" + "sqrdmulh v30.8h, v30.8h, v4.h[0]\n\t" + "sub v27.8h, v27.8h, v29.8h\n\t" + "sub v28.8h, v28.8h, v30.8h\n\t" +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + "sshr v27.8h, v27.8h, #1\n\t" + "sshr v28.8h, v28.8h, #1\n\t" + "sub v6.8h, v5.8h, v21.8h\n\t" + "add v5.8h, v5.8h, v21.8h\n\t" + "sub v8.8h, v7.8h, v22.8h\n\t" + "add v7.8h, v7.8h, v22.8h\n\t" + "sub v10.8h, v9.8h, v23.8h\n\t" + "add v9.8h, v9.8h, v23.8h\n\t" + "sub v12.8h, v11.8h, v24.8h\n\t" + "add v11.8h, v11.8h, v24.8h\n\t" + "sub v14.8h, v13.8h, v25.8h\n\t" + "add v13.8h, v13.8h, v25.8h\n\t" + "sub v16.8h, v15.8h, v26.8h\n\t" + "add v15.8h, v15.8h, v26.8h\n\t" + "sub v18.8h, v17.8h, v27.8h\n\t" + "add v17.8h, v17.8h, v27.8h\n\t" + "sub v20.8h, v19.8h, v28.8h\n\t" + "add v19.8h, v19.8h, v28.8h\n\t" + "sqdmulh v21.8h, v5.8h, v4.h[2]\n\t" + "sqdmulh v22.8h, v6.8h, v4.h[2]\n\t" + "sshr v21.8h, v21.8h, #11\n\t" + "sshr v22.8h, v22.8h, #11\n\t" + "mls v5.8h, v21.8h, v4.h[0]\n\t" + "mls v6.8h, v22.8h, v4.h[0]\n\t" + "sqdmulh v21.8h, v7.8h, v4.h[2]\n\t" + "sqdmulh v22.8h, v8.8h, v4.h[2]\n\t" + "sshr v21.8h, v21.8h, #11\n\t" + "sshr v22.8h, v22.8h, #11\n\t" + "mls v7.8h, v21.8h, v4.h[0]\n\t" + "mls v8.8h, v22.8h, v4.h[0]\n\t" + "sqdmulh v21.8h, v9.8h, v4.h[2]\n\t" + "sqdmulh v22.8h, v10.8h, v4.h[2]\n\t" + "sshr v21.8h, v21.8h, #11\n\t" + "sshr v22.8h, v22.8h, #11\n\t" + "mls v9.8h, v21.8h, v4.h[0]\n\t" + "mls v10.8h, v22.8h, v4.h[0]\n\t" + "sqdmulh v21.8h, v11.8h, v4.h[2]\n\t" + "sqdmulh v22.8h, v12.8h, v4.h[2]\n\t" + "sshr v21.8h, v21.8h, #11\n\t" + "sshr v22.8h, v22.8h, #11\n\t" + "mls v11.8h, v21.8h, v4.h[0]\n\t" + "mls v12.8h, v22.8h, v4.h[0]\n\t" + "sqdmulh v21.8h, v13.8h, v4.h[2]\n\t" + "sqdmulh v22.8h, v14.8h, v4.h[2]\n\t" + "sshr v21.8h, v21.8h, #11\n\t" + "sshr v22.8h, v22.8h, #11\n\t" + "mls v13.8h, v21.8h, v4.h[0]\n\t" + "mls v14.8h, v22.8h, v4.h[0]\n\t" + "sqdmulh v21.8h, v15.8h, v4.h[2]\n\t" + "sqdmulh v22.8h, v16.8h, v4.h[2]\n\t" + "sshr v21.8h, v21.8h, #11\n\t" + "sshr v22.8h, v22.8h, #11\n\t" + "mls v15.8h, v21.8h, v4.h[0]\n\t" + "mls v16.8h, v22.8h, v4.h[0]\n\t" + "sqdmulh v21.8h, v17.8h, v4.h[2]\n\t" + "sqdmulh v22.8h, v18.8h, v4.h[2]\n\t" + "sshr v21.8h, v21.8h, #11\n\t" + "sshr v22.8h, v22.8h, #11\n\t" + "mls v17.8h, v21.8h, v4.h[0]\n\t" + "mls v18.8h, v22.8h, v4.h[0]\n\t" + "sqdmulh v21.8h, v19.8h, v4.h[2]\n\t" + "sqdmulh v22.8h, v20.8h, v4.h[2]\n\t" + "sshr v21.8h, v21.8h, #11\n\t" + "sshr v22.8h, v22.8h, #11\n\t" + "mls v19.8h, v21.8h, v4.h[0]\n\t" + "mls v20.8h, v22.8h, v4.h[0]\n\t" + "mov v29.16b, v5.16b\n\t" + "trn1 v5.4s, v5.4s, v6.4s\n\t" + "trn2 v6.4s, v29.4s, v6.4s\n\t" + "mov v29.16b, v5.16b\n\t" + "trn1 v5.2d, v5.2d, v6.2d\n\t" + "trn2 v6.2d, v29.2d, v6.2d\n\t" + "mov v29.16b, v7.16b\n\t" + "trn1 v7.4s, v7.4s, v8.4s\n\t" + "trn2 v8.4s, v29.4s, v8.4s\n\t" + "mov v29.16b, v7.16b\n\t" + "trn1 v7.2d, v7.2d, v8.2d\n\t" + "trn2 v8.2d, v29.2d, v8.2d\n\t" + "mov v29.16b, v9.16b\n\t" + "trn1 v9.4s, v9.4s, v10.4s\n\t" + "trn2 v10.4s, v29.4s, v10.4s\n\t" + "mov v29.16b, v9.16b\n\t" + "trn1 v9.2d, v9.2d, v10.2d\n\t" + "trn2 v10.2d, v29.2d, v10.2d\n\t" + "mov v29.16b, v11.16b\n\t" + "trn1 v11.4s, v11.4s, v12.4s\n\t" + "trn2 v12.4s, v29.4s, v12.4s\n\t" + "mov v29.16b, v11.16b\n\t" + "trn1 v11.2d, v11.2d, v12.2d\n\t" + "trn2 
v12.2d, v29.2d, v12.2d\n\t" + "mov v29.16b, v13.16b\n\t" + "trn1 v13.4s, v13.4s, v14.4s\n\t" + "trn2 v14.4s, v29.4s, v14.4s\n\t" + "mov v29.16b, v13.16b\n\t" + "trn1 v13.2d, v13.2d, v14.2d\n\t" + "trn2 v14.2d, v29.2d, v14.2d\n\t" + "mov v29.16b, v15.16b\n\t" + "trn1 v15.4s, v15.4s, v16.4s\n\t" + "trn2 v16.4s, v29.4s, v16.4s\n\t" + "mov v29.16b, v15.16b\n\t" + "trn1 v15.2d, v15.2d, v16.2d\n\t" + "trn2 v16.2d, v29.2d, v16.2d\n\t" + "mov v29.16b, v17.16b\n\t" + "trn1 v17.4s, v17.4s, v18.4s\n\t" + "trn2 v18.4s, v29.4s, v18.4s\n\t" + "mov v29.16b, v17.16b\n\t" + "trn1 v17.2d, v17.2d, v18.2d\n\t" + "trn2 v18.2d, v29.2d, v18.2d\n\t" + "mov v29.16b, v19.16b\n\t" + "trn1 v19.4s, v19.4s, v20.4s\n\t" + "trn2 v20.4s, v29.4s, v20.4s\n\t" + "mov v29.16b, v19.16b\n\t" + "trn1 v19.2d, v19.2d, v20.2d\n\t" + "trn2 v20.2d, v29.2d, v20.2d\n\t" + "stp q5, q6, [%x[r]]\n\t" + "stp q7, q8, [%x[r], #32]\n\t" + "stp q9, q10, [%x[r], #64]\n\t" + "stp q11, q12, [%x[r], #96]\n\t" + "stp q13, q14, [%x[r], #128]\n\t" + "stp q15, q16, [%x[r], #160]\n\t" + "stp q17, q18, [%x[r], #192]\n\t" + "stp q19, q20, [%x[r], #224]\n\t" + "ldp q5, q6, [x1]\n\t" + "ldp q7, q8, [x1, #32]\n\t" + "ldp q9, q10, [x1, #64]\n\t" + "ldp q11, q12, [x1, #96]\n\t" + "ldp q13, q14, [x1, #128]\n\t" + "ldp q15, q16, [x1, #160]\n\t" + "ldp q17, q18, [x1, #192]\n\t" + "ldp q19, q20, [x1, #224]\n\t" + "ldr q0, [x2, #48]\n\t" + "ldr q1, [x3, #48]\n\t" + "mul v29.8h, v6.8h, v1.h[0]\n\t" + "mul v30.8h, v8.8h, v1.h[1]\n\t" + "sqrdmulh v21.8h, v6.8h, v0.h[0]\n\t" + "sqrdmulh v22.8h, v8.8h, v0.h[1]\n\t" +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + "sqrdmlsh v21.8h, v29.8h, v4.h[0]\n\t" + "sqrdmlsh v22.8h, v30.8h, v4.h[0]\n\t" +#else + "sqrdmulh v29.8h, v29.8h, v4.h[0]\n\t" + "sqrdmulh v30.8h, v30.8h, v4.h[0]\n\t" + "sub v21.8h, v21.8h, v29.8h\n\t" + "sub v22.8h, v22.8h, v30.8h\n\t" +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + "sshr v21.8h, v21.8h, #1\n\t" + "sshr v22.8h, v22.8h, #1\n\t" + "mul v29.8h, v10.8h, v1.h[2]\n\t" + "mul v30.8h, v12.8h, v1.h[3]\n\t" + "sqrdmulh v23.8h, v10.8h, v0.h[2]\n\t" + "sqrdmulh v24.8h, v12.8h, v0.h[3]\n\t" +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + "sqrdmlsh v23.8h, v29.8h, v4.h[0]\n\t" + "sqrdmlsh v24.8h, v30.8h, v4.h[0]\n\t" +#else + "sqrdmulh v29.8h, v29.8h, v4.h[0]\n\t" + "sqrdmulh v30.8h, v30.8h, v4.h[0]\n\t" + "sub v23.8h, v23.8h, v29.8h\n\t" + "sub v24.8h, v24.8h, v30.8h\n\t" +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + "sshr v23.8h, v23.8h, #1\n\t" + "sshr v24.8h, v24.8h, #1\n\t" + "mul v29.8h, v14.8h, v1.h[4]\n\t" + "mul v30.8h, v16.8h, v1.h[5]\n\t" + "sqrdmulh v25.8h, v14.8h, v0.h[4]\n\t" + "sqrdmulh v26.8h, v16.8h, v0.h[5]\n\t" +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + "sqrdmlsh v25.8h, v29.8h, v4.h[0]\n\t" + "sqrdmlsh v26.8h, v30.8h, v4.h[0]\n\t" +#else + "sqrdmulh v29.8h, v29.8h, v4.h[0]\n\t" + "sqrdmulh v30.8h, v30.8h, v4.h[0]\n\t" + "sub v25.8h, v25.8h, v29.8h\n\t" + "sub v26.8h, v26.8h, v30.8h\n\t" +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + "sshr v25.8h, v25.8h, #1\n\t" + "sshr v26.8h, v26.8h, #1\n\t" + "mul v29.8h, v18.8h, v1.h[6]\n\t" + "mul v30.8h, v20.8h, v1.h[7]\n\t" + "sqrdmulh v27.8h, v18.8h, v0.h[6]\n\t" + "sqrdmulh v28.8h, v20.8h, v0.h[7]\n\t" +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + "sqrdmlsh v27.8h, v29.8h, v4.h[0]\n\t" + "sqrdmlsh v28.8h, v30.8h, v4.h[0]\n\t" +#else + "sqrdmulh v29.8h, v29.8h, v4.h[0]\n\t" + "sqrdmulh v30.8h, v30.8h, v4.h[0]\n\t" + "sub v27.8h, v27.8h, v29.8h\n\t" + "sub v28.8h, v28.8h, v30.8h\n\t" +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + "sshr v27.8h, v27.8h, #1\n\t" + "sshr v28.8h, v28.8h, 
#1\n\t" + "sub v6.8h, v5.8h, v21.8h\n\t" + "add v5.8h, v5.8h, v21.8h\n\t" + "sub v8.8h, v7.8h, v22.8h\n\t" + "add v7.8h, v7.8h, v22.8h\n\t" + "sub v10.8h, v9.8h, v23.8h\n\t" + "add v9.8h, v9.8h, v23.8h\n\t" + "sub v12.8h, v11.8h, v24.8h\n\t" + "add v11.8h, v11.8h, v24.8h\n\t" + "sub v14.8h, v13.8h, v25.8h\n\t" + "add v13.8h, v13.8h, v25.8h\n\t" + "sub v16.8h, v15.8h, v26.8h\n\t" + "add v15.8h, v15.8h, v26.8h\n\t" + "sub v18.8h, v17.8h, v27.8h\n\t" + "add v17.8h, v17.8h, v27.8h\n\t" + "sub v20.8h, v19.8h, v28.8h\n\t" + "add v19.8h, v19.8h, v28.8h\n\t" + "ldr q0, [x2, #192]\n\t" + "ldr q2, [x2, #208]\n\t" + "ldr q1, [x3, #192]\n\t" + "ldr q3, [x3, #208]\n\t" + "mov v29.16b, v5.16b\n\t" + "mov v30.16b, v7.16b\n\t" + "trn1 v5.2d, v5.2d, v6.2d\n\t" + "trn1 v7.2d, v7.2d, v8.2d\n\t" + "trn2 v6.2d, v29.2d, v6.2d\n\t" + "trn2 v8.2d, v30.2d, v8.2d\n\t" + "mul v29.8h, v6.8h, v1.8h\n\t" + "mul v30.8h, v8.8h, v3.8h\n\t" + "sqrdmulh v21.8h, v6.8h, v0.8h\n\t" + "sqrdmulh v22.8h, v8.8h, v2.8h\n\t" +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + "sqrdmlsh v21.8h, v29.8h, v4.h[0]\n\t" + "sqrdmlsh v22.8h, v30.8h, v4.h[0]\n\t" +#else + "sqrdmulh v29.8h, v29.8h, v4.h[0]\n\t" + "sqrdmulh v30.8h, v30.8h, v4.h[0]\n\t" + "sub v21.8h, v21.8h, v29.8h\n\t" + "sub v22.8h, v22.8h, v30.8h\n\t" +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + "sshr v21.8h, v21.8h, #1\n\t" + "sshr v22.8h, v22.8h, #1\n\t" + "ldr q0, [x2, #224]\n\t" + "ldr q2, [x2, #240]\n\t" + "ldr q1, [x3, #224]\n\t" + "ldr q3, [x3, #240]\n\t" + "mov v29.16b, v9.16b\n\t" + "mov v30.16b, v11.16b\n\t" + "trn1 v9.2d, v9.2d, v10.2d\n\t" + "trn1 v11.2d, v11.2d, v12.2d\n\t" + "trn2 v10.2d, v29.2d, v10.2d\n\t" + "trn2 v12.2d, v30.2d, v12.2d\n\t" + "mul v29.8h, v10.8h, v1.8h\n\t" + "mul v30.8h, v12.8h, v3.8h\n\t" + "sqrdmulh v23.8h, v10.8h, v0.8h\n\t" + "sqrdmulh v24.8h, v12.8h, v2.8h\n\t" +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + "sqrdmlsh v23.8h, v29.8h, v4.h[0]\n\t" + "sqrdmlsh v24.8h, v30.8h, v4.h[0]\n\t" +#else + "sqrdmulh v29.8h, v29.8h, v4.h[0]\n\t" + "sqrdmulh v30.8h, v30.8h, v4.h[0]\n\t" + "sub v23.8h, v23.8h, v29.8h\n\t" + "sub v24.8h, v24.8h, v30.8h\n\t" +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + "sshr v23.8h, v23.8h, #1\n\t" + "sshr v24.8h, v24.8h, #1\n\t" + "ldr q0, [x2, #256]\n\t" + "ldr q2, [x2, #272]\n\t" + "ldr q1, [x3, #256]\n\t" + "ldr q3, [x3, #272]\n\t" + "mov v29.16b, v13.16b\n\t" + "mov v30.16b, v15.16b\n\t" + "trn1 v13.2d, v13.2d, v14.2d\n\t" + "trn1 v15.2d, v15.2d, v16.2d\n\t" + "trn2 v14.2d, v29.2d, v14.2d\n\t" + "trn2 v16.2d, v30.2d, v16.2d\n\t" + "mul v29.8h, v14.8h, v1.8h\n\t" + "mul v30.8h, v16.8h, v3.8h\n\t" + "sqrdmulh v25.8h, v14.8h, v0.8h\n\t" + "sqrdmulh v26.8h, v16.8h, v2.8h\n\t" +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + "sqrdmlsh v25.8h, v29.8h, v4.h[0]\n\t" + "sqrdmlsh v26.8h, v30.8h, v4.h[0]\n\t" +#else + "sqrdmulh v29.8h, v29.8h, v4.h[0]\n\t" + "sqrdmulh v30.8h, v30.8h, v4.h[0]\n\t" + "sub v25.8h, v25.8h, v29.8h\n\t" + "sub v26.8h, v26.8h, v30.8h\n\t" +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + "sshr v25.8h, v25.8h, #1\n\t" + "sshr v26.8h, v26.8h, #1\n\t" + "ldr q0, [x2, #288]\n\t" + "ldr q2, [x2, #304]\n\t" + "ldr q1, [x3, #288]\n\t" + "ldr q3, [x3, #304]\n\t" + "mov v29.16b, v17.16b\n\t" + "mov v30.16b, v19.16b\n\t" + "trn1 v17.2d, v17.2d, v18.2d\n\t" + "trn1 v19.2d, v19.2d, v20.2d\n\t" + "trn2 v18.2d, v29.2d, v18.2d\n\t" + "trn2 v20.2d, v30.2d, v20.2d\n\t" + "mul v29.8h, v18.8h, v1.8h\n\t" + "mul v30.8h, v20.8h, v3.8h\n\t" + "sqrdmulh v27.8h, v18.8h, v0.8h\n\t" + "sqrdmulh v28.8h, v20.8h, v2.8h\n\t" +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + 
"sqrdmlsh v27.8h, v29.8h, v4.h[0]\n\t" + "sqrdmlsh v28.8h, v30.8h, v4.h[0]\n\t" +#else + "sqrdmulh v29.8h, v29.8h, v4.h[0]\n\t" + "sqrdmulh v30.8h, v30.8h, v4.h[0]\n\t" + "sub v27.8h, v27.8h, v29.8h\n\t" + "sub v28.8h, v28.8h, v30.8h\n\t" +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + "sshr v27.8h, v27.8h, #1\n\t" + "sshr v28.8h, v28.8h, #1\n\t" + "sub v6.8h, v5.8h, v21.8h\n\t" + "add v5.8h, v5.8h, v21.8h\n\t" + "sub v8.8h, v7.8h, v22.8h\n\t" + "add v7.8h, v7.8h, v22.8h\n\t" + "sub v10.8h, v9.8h, v23.8h\n\t" + "add v9.8h, v9.8h, v23.8h\n\t" + "sub v12.8h, v11.8h, v24.8h\n\t" + "add v11.8h, v11.8h, v24.8h\n\t" + "sub v14.8h, v13.8h, v25.8h\n\t" + "add v13.8h, v13.8h, v25.8h\n\t" + "sub v16.8h, v15.8h, v26.8h\n\t" + "add v15.8h, v15.8h, v26.8h\n\t" + "sub v18.8h, v17.8h, v27.8h\n\t" + "add v17.8h, v17.8h, v27.8h\n\t" + "sub v20.8h, v19.8h, v28.8h\n\t" + "add v19.8h, v19.8h, v28.8h\n\t" + "ldr q0, [x2, #448]\n\t" + "ldr q2, [x2, #464]\n\t" + "ldr q1, [x3, #448]\n\t" + "ldr q3, [x3, #464]\n\t" + "mov v29.16b, v5.16b\n\t" + "mov v30.16b, v7.16b\n\t" + "trn1 v5.4s, v5.4s, v6.4s\n\t" + "trn1 v7.4s, v7.4s, v8.4s\n\t" + "trn2 v6.4s, v29.4s, v6.4s\n\t" + "trn2 v8.4s, v30.4s, v8.4s\n\t" + "mul v29.8h, v6.8h, v1.8h\n\t" + "mul v30.8h, v8.8h, v3.8h\n\t" + "sqrdmulh v21.8h, v6.8h, v0.8h\n\t" + "sqrdmulh v22.8h, v8.8h, v2.8h\n\t" +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + "sqrdmlsh v21.8h, v29.8h, v4.h[0]\n\t" + "sqrdmlsh v22.8h, v30.8h, v4.h[0]\n\t" +#else + "sqrdmulh v29.8h, v29.8h, v4.h[0]\n\t" + "sqrdmulh v30.8h, v30.8h, v4.h[0]\n\t" + "sub v21.8h, v21.8h, v29.8h\n\t" + "sub v22.8h, v22.8h, v30.8h\n\t" +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + "sshr v21.8h, v21.8h, #1\n\t" + "sshr v22.8h, v22.8h, #1\n\t" + "ldr q0, [x2, #480]\n\t" + "ldr q2, [x2, #496]\n\t" + "ldr q1, [x3, #480]\n\t" + "ldr q3, [x3, #496]\n\t" + "mov v29.16b, v9.16b\n\t" + "mov v30.16b, v11.16b\n\t" + "trn1 v9.4s, v9.4s, v10.4s\n\t" + "trn1 v11.4s, v11.4s, v12.4s\n\t" + "trn2 v10.4s, v29.4s, v10.4s\n\t" + "trn2 v12.4s, v30.4s, v12.4s\n\t" + "mul v29.8h, v10.8h, v1.8h\n\t" + "mul v30.8h, v12.8h, v3.8h\n\t" + "sqrdmulh v23.8h, v10.8h, v0.8h\n\t" + "sqrdmulh v24.8h, v12.8h, v2.8h\n\t" +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + "sqrdmlsh v23.8h, v29.8h, v4.h[0]\n\t" + "sqrdmlsh v24.8h, v30.8h, v4.h[0]\n\t" +#else + "sqrdmulh v29.8h, v29.8h, v4.h[0]\n\t" + "sqrdmulh v30.8h, v30.8h, v4.h[0]\n\t" + "sub v23.8h, v23.8h, v29.8h\n\t" + "sub v24.8h, v24.8h, v30.8h\n\t" +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + "sshr v23.8h, v23.8h, #1\n\t" + "sshr v24.8h, v24.8h, #1\n\t" + "ldr q0, [x2, #512]\n\t" + "ldr q2, [x2, #528]\n\t" + "ldr q1, [x3, #512]\n\t" + "ldr q3, [x3, #528]\n\t" + "mov v29.16b, v13.16b\n\t" + "mov v30.16b, v15.16b\n\t" + "trn1 v13.4s, v13.4s, v14.4s\n\t" + "trn1 v15.4s, v15.4s, v16.4s\n\t" + "trn2 v14.4s, v29.4s, v14.4s\n\t" + "trn2 v16.4s, v30.4s, v16.4s\n\t" + "mul v29.8h, v14.8h, v1.8h\n\t" + "mul v30.8h, v16.8h, v3.8h\n\t" + "sqrdmulh v25.8h, v14.8h, v0.8h\n\t" + "sqrdmulh v26.8h, v16.8h, v2.8h\n\t" +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + "sqrdmlsh v25.8h, v29.8h, v4.h[0]\n\t" + "sqrdmlsh v26.8h, v30.8h, v4.h[0]\n\t" +#else + "sqrdmulh v29.8h, v29.8h, v4.h[0]\n\t" + "sqrdmulh v30.8h, v30.8h, v4.h[0]\n\t" + "sub v25.8h, v25.8h, v29.8h\n\t" + "sub v26.8h, v26.8h, v30.8h\n\t" +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + "sshr v25.8h, v25.8h, #1\n\t" + "sshr v26.8h, v26.8h, #1\n\t" + "ldr q0, [x2, #544]\n\t" + "ldr q2, [x2, #560]\n\t" + "ldr q1, [x3, #544]\n\t" + "ldr q3, [x3, #560]\n\t" + "mov v29.16b, v17.16b\n\t" + "mov v30.16b, 
v19.16b\n\t" + "trn1 v17.4s, v17.4s, v18.4s\n\t" + "trn1 v19.4s, v19.4s, v20.4s\n\t" + "trn2 v18.4s, v29.4s, v18.4s\n\t" + "trn2 v20.4s, v30.4s, v20.4s\n\t" + "mul v29.8h, v18.8h, v1.8h\n\t" + "mul v30.8h, v20.8h, v3.8h\n\t" + "sqrdmulh v27.8h, v18.8h, v0.8h\n\t" + "sqrdmulh v28.8h, v20.8h, v2.8h\n\t" +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + "sqrdmlsh v27.8h, v29.8h, v4.h[0]\n\t" + "sqrdmlsh v28.8h, v30.8h, v4.h[0]\n\t" +#else + "sqrdmulh v29.8h, v29.8h, v4.h[0]\n\t" + "sqrdmulh v30.8h, v30.8h, v4.h[0]\n\t" + "sub v27.8h, v27.8h, v29.8h\n\t" + "sub v28.8h, v28.8h, v30.8h\n\t" +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + "sshr v27.8h, v27.8h, #1\n\t" + "sshr v28.8h, v28.8h, #1\n\t" + "sub v6.8h, v5.8h, v21.8h\n\t" + "add v5.8h, v5.8h, v21.8h\n\t" + "sub v8.8h, v7.8h, v22.8h\n\t" + "add v7.8h, v7.8h, v22.8h\n\t" + "sub v10.8h, v9.8h, v23.8h\n\t" + "add v9.8h, v9.8h, v23.8h\n\t" + "sub v12.8h, v11.8h, v24.8h\n\t" + "add v11.8h, v11.8h, v24.8h\n\t" + "sub v14.8h, v13.8h, v25.8h\n\t" + "add v13.8h, v13.8h, v25.8h\n\t" + "sub v16.8h, v15.8h, v26.8h\n\t" + "add v15.8h, v15.8h, v26.8h\n\t" + "sub v18.8h, v17.8h, v27.8h\n\t" + "add v17.8h, v17.8h, v27.8h\n\t" + "sub v20.8h, v19.8h, v28.8h\n\t" + "add v19.8h, v19.8h, v28.8h\n\t" + "sqdmulh v21.8h, v5.8h, v4.h[2]\n\t" + "sqdmulh v22.8h, v6.8h, v4.h[2]\n\t" + "sshr v21.8h, v21.8h, #11\n\t" + "sshr v22.8h, v22.8h, #11\n\t" + "mls v5.8h, v21.8h, v4.h[0]\n\t" + "mls v6.8h, v22.8h, v4.h[0]\n\t" + "sqdmulh v21.8h, v7.8h, v4.h[2]\n\t" + "sqdmulh v22.8h, v8.8h, v4.h[2]\n\t" + "sshr v21.8h, v21.8h, #11\n\t" + "sshr v22.8h, v22.8h, #11\n\t" + "mls v7.8h, v21.8h, v4.h[0]\n\t" + "mls v8.8h, v22.8h, v4.h[0]\n\t" + "sqdmulh v21.8h, v9.8h, v4.h[2]\n\t" + "sqdmulh v22.8h, v10.8h, v4.h[2]\n\t" + "sshr v21.8h, v21.8h, #11\n\t" + "sshr v22.8h, v22.8h, #11\n\t" + "mls v9.8h, v21.8h, v4.h[0]\n\t" + "mls v10.8h, v22.8h, v4.h[0]\n\t" + "sqdmulh v21.8h, v11.8h, v4.h[2]\n\t" + "sqdmulh v22.8h, v12.8h, v4.h[2]\n\t" + "sshr v21.8h, v21.8h, #11\n\t" + "sshr v22.8h, v22.8h, #11\n\t" + "mls v11.8h, v21.8h, v4.h[0]\n\t" + "mls v12.8h, v22.8h, v4.h[0]\n\t" + "sqdmulh v21.8h, v13.8h, v4.h[2]\n\t" + "sqdmulh v22.8h, v14.8h, v4.h[2]\n\t" + "sshr v21.8h, v21.8h, #11\n\t" + "sshr v22.8h, v22.8h, #11\n\t" + "mls v13.8h, v21.8h, v4.h[0]\n\t" + "mls v14.8h, v22.8h, v4.h[0]\n\t" + "sqdmulh v21.8h, v15.8h, v4.h[2]\n\t" + "sqdmulh v22.8h, v16.8h, v4.h[2]\n\t" + "sshr v21.8h, v21.8h, #11\n\t" + "sshr v22.8h, v22.8h, #11\n\t" + "mls v15.8h, v21.8h, v4.h[0]\n\t" + "mls v16.8h, v22.8h, v4.h[0]\n\t" + "sqdmulh v21.8h, v17.8h, v4.h[2]\n\t" + "sqdmulh v22.8h, v18.8h, v4.h[2]\n\t" + "sshr v21.8h, v21.8h, #11\n\t" + "sshr v22.8h, v22.8h, #11\n\t" + "mls v17.8h, v21.8h, v4.h[0]\n\t" + "mls v18.8h, v22.8h, v4.h[0]\n\t" + "sqdmulh v21.8h, v19.8h, v4.h[2]\n\t" + "sqdmulh v22.8h, v20.8h, v4.h[2]\n\t" + "sshr v21.8h, v21.8h, #11\n\t" + "sshr v22.8h, v22.8h, #11\n\t" + "mls v19.8h, v21.8h, v4.h[0]\n\t" + "mls v20.8h, v22.8h, v4.h[0]\n\t" + "mov v29.16b, v5.16b\n\t" + "trn1 v5.4s, v5.4s, v6.4s\n\t" + "trn2 v6.4s, v29.4s, v6.4s\n\t" + "mov v29.16b, v5.16b\n\t" + "trn1 v5.2d, v5.2d, v6.2d\n\t" + "trn2 v6.2d, v29.2d, v6.2d\n\t" + "mov v29.16b, v7.16b\n\t" + "trn1 v7.4s, v7.4s, v8.4s\n\t" + "trn2 v8.4s, v29.4s, v8.4s\n\t" + "mov v29.16b, v7.16b\n\t" + "trn1 v7.2d, v7.2d, v8.2d\n\t" + "trn2 v8.2d, v29.2d, v8.2d\n\t" + "mov v29.16b, v9.16b\n\t" + "trn1 v9.4s, v9.4s, v10.4s\n\t" + "trn2 v10.4s, v29.4s, v10.4s\n\t" + "mov v29.16b, v9.16b\n\t" + "trn1 v9.2d, v9.2d, v10.2d\n\t" + "trn2 v10.2d, v29.2d, v10.2d\n\t" + "mov 
v29.16b, v11.16b\n\t"
+        "trn1 v11.4s, v11.4s, v12.4s\n\t"
+        "trn2 v12.4s, v29.4s, v12.4s\n\t"
+        "mov v29.16b, v11.16b\n\t"
+        "trn1 v11.2d, v11.2d, v12.2d\n\t"
+        "trn2 v12.2d, v29.2d, v12.2d\n\t"
+        "mov v29.16b, v13.16b\n\t"
+        "trn1 v13.4s, v13.4s, v14.4s\n\t"
+        "trn2 v14.4s, v29.4s, v14.4s\n\t"
+        "mov v29.16b, v13.16b\n\t"
+        "trn1 v13.2d, v13.2d, v14.2d\n\t"
+        "trn2 v14.2d, v29.2d, v14.2d\n\t"
+        "mov v29.16b, v15.16b\n\t"
+        "trn1 v15.4s, v15.4s, v16.4s\n\t"
+        "trn2 v16.4s, v29.4s, v16.4s\n\t"
+        "mov v29.16b, v15.16b\n\t"
+        "trn1 v15.2d, v15.2d, v16.2d\n\t"
+        "trn2 v16.2d, v29.2d, v16.2d\n\t"
+        "mov v29.16b, v17.16b\n\t"
+        "trn1 v17.4s, v17.4s, v18.4s\n\t"
+        "trn2 v18.4s, v29.4s, v18.4s\n\t"
+        "mov v29.16b, v17.16b\n\t"
+        "trn1 v17.2d, v17.2d, v18.2d\n\t"
+        "trn2 v18.2d, v29.2d, v18.2d\n\t"
+        "mov v29.16b, v19.16b\n\t"
+        "trn1 v19.4s, v19.4s, v20.4s\n\t"
+        "trn2 v20.4s, v29.4s, v20.4s\n\t"
+        "mov v29.16b, v19.16b\n\t"
+        "trn1 v19.2d, v19.2d, v20.2d\n\t"
+        "trn2 v20.2d, v29.2d, v20.2d\n\t"
+        "stp q5, q6, [x1]\n\t"
+        "stp q7, q8, [x1, #32]\n\t"
+        "stp q9, q10, [x1, #64]\n\t"
+        "stp q11, q12, [x1, #96]\n\t"
+        "stp q13, q14, [x1, #128]\n\t"
+        "stp q15, q16, [x1, #160]\n\t"
+        "stp q17, q18, [x1, #192]\n\t"
+        "stp q19, q20, [x1, #224]\n\t"
+        : [r] "+r" (r)
+        : [L_kyber_aarch64_q] "S" (L_kyber_aarch64_q), [L_kyber_aarch64_consts] "S" (L_kyber_aarch64_consts), [L_sha3_aarch64_r] "S" (L_sha3_aarch64_r), [L_kyber_aarch64_zetas] "S" (L_kyber_aarch64_zetas), [L_kyber_aarch64_zetas_qinv] "S" (L_kyber_aarch64_zetas_qinv)
+        : "memory", "x1", "x2", "x3", "x4", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "cc"
+    );
+}
+
+static const uint16_t L_kyber_aarch64_zetas_inv[] = {
+    0x6a5, 0x6a5, 0x70f, 0x70f, 0x5b4, 0x5b4, 0x943, 0x943,
+    0x922, 0x922, 0x91d, 0x91d, 0x134, 0x134, 0x6c, 0x6c,
+    0xb23, 0xb23, 0x366, 0x366, 0x356, 0x356, 0x5e6, 0x5e6,
+    0x9e7, 0x9e7, 0x4fe, 0x4fe, 0x5fa, 0x5fa, 0x4a1, 0x4a1,
+    0x67b, 0x67b, 0x4a3, 0x4a3, 0xc25, 0xc25, 0x36a, 0x36a,
+    0x537, 0x537, 0x83f, 0x83f, 0x88, 0x88, 0x4bf, 0x4bf,
+    0xb81, 0xb81, 0x5b9, 0x5b9, 0x505, 0x505, 0x7d7, 0x7d7,
+    0xa9f, 0xa9f, 0xaa6, 0xaa6, 0x8b8, 0x8b8, 0x9d0, 0x9d0,
+    0x4b, 0x4b, 0x9c, 0x9c, 0xbb8, 0xbb8, 0xb5f, 0xb5f,
+    0xba4, 0xba4, 0x368, 0x368, 0xa7d, 0xa7d, 0x636, 0x636,
+    0x8a2, 0x8a2, 0x25a, 0x25a, 0x736, 0x736, 0x309, 0x309,
+    0x93, 0x93, 0x87a, 0x87a, 0x9f7, 0x9f7, 0xf6, 0xf6,
+    0x68c, 0x68c, 0x6db, 0x6db, 0x1cc, 0x1cc, 0x123, 0x123,
+    0xeb, 0xeb, 0xc50, 0xc50, 0xab6, 0xab6, 0xb5b, 0xb5b,
+    0xc98, 0xc98, 0x6f3, 0x6f3, 0x99a, 0x99a, 0x4e3, 0x4e3,
+    0x9b6, 0x9b6, 0xad6, 0xad6, 0xb53, 0xb53, 0x44f, 0x44f,
+    0x4fb, 0x4fb, 0x4fb, 0x4fb, 0xa5c, 0xa5c, 0xa5c, 0xa5c,
+    0x429, 0x429, 0x429, 0x429, 0xb41, 0xb41, 0xb41, 0xb41,
+    0x2d5, 0x2d5, 0x2d5, 0x2d5, 0x5e4, 0x5e4, 0x5e4, 0x5e4,
+    0x940, 0x940, 0x940, 0x940, 0x18e, 0x18e, 0x18e, 0x18e,
+    0x3b7, 0x3b7, 0x3b7, 0x3b7, 0xf7, 0xf7, 0xf7, 0xf7,
+    0x58d, 0x58d, 0x58d, 0x58d, 0xc96, 0xc96, 0xc96, 0xc96,
+    0x9c3, 0x9c3, 0x9c3, 0x9c3, 0x10f, 0x10f, 0x10f, 0x10f,
+    0x5a, 0x5a, 0x5a, 0x5a, 0x355, 0x355, 0x355, 0x355,
+    0x744, 0x744, 0x744, 0x744, 0xc83, 0xc83, 0xc83, 0xc83,
+    0x48a, 0x48a, 0x48a, 0x48a, 0x652, 0x652, 0x652, 0x652,
+    0x29a, 0x29a, 0x29a, 0x29a, 0x140, 0x140, 0x140, 0x140,
+    0x8, 0x8, 0x8, 0x8, 0xafd, 0xafd, 0xafd, 0xafd,
+    0x608, 0x608, 0x608, 0x608, 0x11a, 0x11a, 0x11a, 0x11a,
+    0x72e, 0x72e, 0x72e, 0x72e, 0x50d, 0x50d, 0x50d, 0x50d,
+    0x90a, 0x90a, 0x90a, 0x90a, 0x228, 0x228, 0x228, 0x228,
+    0xa75, 0xa75, 0xa75, 0xa75, 0x83a, 0x83a, 0x83a, 0x83a,
+    0x623, 0xcd, 0xb66, 0x606, 0xaa1, 0xa25, 0x908, 0x2a9,
+    0x82, 0x642, 0x74f, 0x33d, 0xb82, 0xbf9, 0x52d, 0xac4,
+    0x745, 0x5c2, 0x4b2, 0x93f, 0xc4b, 0x6d8, 0xa93, 0xab,
+    0xc37, 0xbe2, 0x773, 0x72c, 0x5ed, 0x167, 0x2f6, 0x5a1,
+};
+
+static const uint16_t L_kyber_aarch64_zetas_inv_qinv[] = {
+    0xa5a5, 0xa5a5, 0x440f, 0x440f, 0xe1b4, 0xe1b4, 0xa243, 0xa243,
+    0x4f22, 0x4f22, 0x901d, 0x901d, 0x5d34, 0x5d34, 0x846c, 0x846c,
+    0x4423, 0x4423, 0xd566, 0xd566, 0xa556, 0xa556, 0x57e6, 0x57e6,
+    0x4ee7, 0x4ee7, 0x1efe, 0x1efe, 0x53fa, 0x53fa, 0xd7a1, 0xd7a1,
+    0xc77b, 0xc77b, 0xbda3, 0xbda3, 0x2b25, 0x2b25, 0xa16a, 0xa16a,
+    0x3a37, 0x3a37, 0xd53f, 0xd53f, 0x1888, 0x1888, 0x51bf, 0x51bf,
+    0x7e81, 0x7e81, 0xa0b9, 0xa0b9, 0xc405, 0xc405, 0x1cd7, 0x1cd7,
+    0xf79f, 0xf79f, 0x9ca6, 0x9ca6, 0xb0b8, 0xb0b8, 0x79d0, 0x79d0,
+    0x314b, 0x314b, 0x149c, 0x149c, 0xb3b8, 0xb3b8, 0x385f, 0x385f,
+    0xb7a4, 0xb7a4, 0xbb68, 0xbb68, 0xb17d, 0xb17d, 0x4836, 0x4836,
+    0xcea2, 0xcea2, 0x705a, 0x705a, 0x4936, 0x4936, 0x8e09, 0x8e09,
+    0x8993, 0x8993, 0xd67a, 0xd67a, 0x7ef7, 0x7ef7, 0x82f6, 0x82f6,
+    0xea8c, 0xea8c, 0xe7db, 0xe7db, 0xa5cc, 0xa5cc, 0x3a23, 0x3a23,
+    0x11eb, 0x11eb, 0xfc50, 0xfc50, 0xccb6, 0xccb6, 0x6c5b, 0x6c5b,
+    0x5498, 0x5498, 0xaff3, 0xaff3, 0x379a, 0x379a, 0x7de3, 0x7de3,
+    0xcbb6, 0xcbb6, 0x2cd6, 0x2cd6, 0xd453, 0xd453, 0x14f, 0x14f,
+    0x45fb, 0x45fb, 0x45fb, 0x45fb, 0x5e5c, 0x5e5c, 0x5e5c, 0x5e5c,
+    0xef29, 0xef29, 0xef29, 0xef29, 0xbe41, 0xbe41, 0xbe41, 0xbe41,
+    0x31d5, 0x31d5, 0x31d5, 0x31d5, 0x71e4, 0x71e4, 0x71e4, 0x71e4,
+    0xc940, 0xc940, 0xc940, 0xc940, 0xcb8e, 0xcb8e, 0xcb8e, 0xcb8e,
+    0xb8b7, 0xb8b7, 0xb8b7, 0xb8b7, 0x75f7, 0x75f7, 0x75f7, 0x75f7,
+    0xdc8d, 0xdc8d, 0xdc8d, 0xdc8d, 0x6e96, 0x6e96, 0x6e96, 0x6e96,
+    0x22c3, 0x22c3, 0x22c3, 0x22c3, 0x3e0f, 0x3e0f, 0x3e0f, 0x3e0f,
+    0x6e5a, 0x6e5a, 0x6e5a, 0x6e5a, 0xb255, 0xb255, 0xb255, 0xb255,
+    0x9344, 0x9344, 0x9344, 0x9344, 0x6583, 0x6583, 0x6583, 0x6583,
+    0x28a, 0x28a, 0x28a, 0x28a, 0xdc52, 0xdc52, 0xdc52, 0xdc52,
+    0x309a, 0x309a, 0x309a, 0x309a, 0xc140, 0xc140, 0xc140, 0xc140,
+    0x9808, 0x9808, 0x9808, 0x9808, 0x31fd, 0x31fd, 0x31fd, 0x31fd,
+    0x9e08, 0x9e08, 0x9e08, 0x9e08, 0xaf1a, 0xaf1a, 0xaf1a, 0xaf1a,
+    0xb12e, 0xb12e, 0xb12e, 0xb12e, 0x5c0d, 0x5c0d, 0x5c0d, 0x5c0d,
+    0x870a, 0x870a, 0x870a, 0x870a, 0xfa28, 0xfa28, 0xfa28, 0xfa28,
+    0x1975, 0x1975, 0x1975, 0x1975, 0x163a, 0x163a, 0x163a, 0x163a,
+    0x3f23, 0x97cd, 0xdd66, 0xb806, 0xdda1, 0x2925, 0xa108, 0x6da9,
+    0x6682, 0xac42, 0x44f, 0xea3d, 0x7182, 0x66f9, 0xbc2d, 0x16c4,
+    0x8645, 0x2bc2, 0xfab2, 0xd63f, 0x3d4b, 0xed8, 0x9393, 0x51ab,
+    0x4137, 0x91e2, 0x3073, 0xcb2c, 0xfced, 0xc667, 0x84f6, 0xd8a1,
+};
+
+void kyber_invntt(sword16* r)
+{
+    __asm__ __volatile__ (
+#ifndef __APPLE__
+        "adrp x2, %[L_kyber_aarch64_zetas_inv]\n\t"
+        "add x2, x2, :lo12:%[L_kyber_aarch64_zetas_inv]\n\t"
+#else
+        "adrp x2, %[L_kyber_aarch64_zetas_inv]@PAGE\n\t"
+        "add x2, x2, %[L_kyber_aarch64_zetas_inv]@PAGEOFF\n\t"
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+        "adrp x3, %[L_kyber_aarch64_zetas_inv_qinv]\n\t"
+        "add x3, x3, :lo12:%[L_kyber_aarch64_zetas_inv_qinv]\n\t"
+#else
+        "adrp x3, %[L_kyber_aarch64_zetas_inv_qinv]@PAGE\n\t"
+        "add x3, x3, %[L_kyber_aarch64_zetas_inv_qinv]@PAGEOFF\n\t"
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+        "adrp x4, %[L_kyber_aarch64_consts]\n\t"
+        "add x4, x4, :lo12:%[L_kyber_aarch64_consts]\n\t"
+#else
+        "adrp x4, %[L_kyber_aarch64_consts]@PAGE\n\t"
+        "add x4, x4, %[L_kyber_aarch64_consts]@PAGEOFF\n\t"
+#endif /* __APPLE__ */
+        "add x1, %x[r], #0x100\n\t"
+        "ldr q8, [x4]\n\t"
+        "ldp q9, q10, [%x[r]]\n\t"
+        "ldp q11, q12, [%x[r], #32]\n\t"
+        "ldp q13, q14, [%x[r], #64]\n\t"
+        "ldp q15, q16, [%x[r], #96]\n\t"
+        "ldp q17, q18, [%x[r], #128]\n\t"
+        "ldp q19, q20, [%x[r], #160]\n\t"
+        "ldp q21, q22, [%x[r], #192]\n\t"
+        "ldp q23, q24, [%x[r], #224]\n\t"
+        "mov v25.16b, v9.16b\n\t"
+        "trn1 v9.2d, v9.2d, v10.2d\n\t"
+        "trn2 v10.2d, v25.2d, v10.2d\n\t"
+        "mov v25.16b, v9.16b\n\t"
+        "trn1 v9.4s, v9.4s, v10.4s\n\t"
+        "trn2 v10.4s, v25.4s, v10.4s\n\t"
+        "mov v25.16b, v11.16b\n\t"
+        "trn1 v11.2d, v11.2d, v12.2d\n\t"
+        "trn2 v12.2d, v25.2d, v12.2d\n\t"
+        "mov v25.16b, v11.16b\n\t"
+        "trn1 v11.4s, v11.4s, v12.4s\n\t"
+        "trn2 v12.4s, v25.4s, v12.4s\n\t"
+        "mov v25.16b, v13.16b\n\t"
+        "trn1 v13.2d, v13.2d, v14.2d\n\t"
+        "trn2 v14.2d, v25.2d, v14.2d\n\t"
+        "mov v25.16b, v13.16b\n\t"
+        "trn1 v13.4s, v13.4s, v14.4s\n\t"
+        "trn2 v14.4s, v25.4s, v14.4s\n\t"
+        "mov v25.16b, v15.16b\n\t"
+        "trn1 v15.2d, v15.2d, v16.2d\n\t"
+        "trn2 v16.2d, v25.2d, v16.2d\n\t"
+        "mov v25.16b, v15.16b\n\t"
+        "trn1 v15.4s, v15.4s, v16.4s\n\t"
+        "trn2 v16.4s, v25.4s, v16.4s\n\t"
+        "mov v25.16b, v17.16b\n\t"
+        "trn1 v17.2d, v17.2d, v18.2d\n\t"
+        "trn2 v18.2d, v25.2d, v18.2d\n\t"
+        "mov v25.16b, v17.16b\n\t"
+        "trn1 v17.4s, v17.4s, v18.4s\n\t"
+        "trn2 v18.4s, v25.4s, v18.4s\n\t"
+        "mov v25.16b, v19.16b\n\t"
+        "trn1 v19.2d, v19.2d, v20.2d\n\t"
+        "trn2 v20.2d, v25.2d, v20.2d\n\t"
+        "mov v25.16b, v19.16b\n\t"
+        "trn1 v19.4s, v19.4s, v20.4s\n\t"
+        "trn2 v20.4s, v25.4s, v20.4s\n\t"
+        "mov v25.16b, v21.16b\n\t"
+        "trn1 v21.2d, v21.2d, v22.2d\n\t"
+        "trn2 v22.2d, v25.2d, v22.2d\n\t"
+        "mov v25.16b, v21.16b\n\t"
+        "trn1 v21.4s, v21.4s, v22.4s\n\t"
+        "trn2 v22.4s, v25.4s, v22.4s\n\t"
+        "mov v25.16b, v23.16b\n\t"
+        "trn1 v23.2d, v23.2d, v24.2d\n\t"
+        "trn2 v24.2d, v25.2d, v24.2d\n\t"
+        "mov v25.16b, v23.16b\n\t"
+        "trn1 v23.4s, v23.4s, v24.4s\n\t"
+        "trn2 v24.4s, v25.4s, v24.4s\n\t"
+        "ldr q0, [x2]\n\t"
+        "ldr q1, [x2, #16]\n\t"
+        "ldr q2, [x3]\n\t"
+        "ldr q3, [x3, #16]\n\t"
+        "sub v26.8h, v9.8h, v10.8h\n\t"
+        "sub v28.8h, v11.8h, v12.8h\n\t"
+        "add v9.8h, v9.8h, v10.8h\n\t"
+        "add v11.8h, v11.8h, v12.8h\n\t"
+        "mul v25.8h, v26.8h, v2.8h\n\t"
+        "mul v27.8h, v28.8h, v3.8h\n\t"
+        "sqrdmulh v10.8h, v26.8h, v0.8h\n\t"
+        "sqrdmulh v12.8h, v28.8h, v1.8h\n\t"
+#ifndef WOLFSSL_AARCH64_NO_SQRMLSH
+        "sqrdmlsh v10.8h, v25.8h, v8.h[0]\n\t"
+        "sqrdmlsh v12.8h, v27.8h, v8.h[0]\n\t"
+#else
+        "sqrdmulh v25.8h, v25.8h,
v8.h[0]\n\t" + "sqrdmulh v27.8h, v27.8h, v8.h[0]\n\t" + "sub v10.8h, v10.8h, v25.8h\n\t" + "sub v12.8h, v12.8h, v27.8h\n\t" +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + "sshr v10.8h, v10.8h, #1\n\t" + "sshr v12.8h, v12.8h, #1\n\t" + "ldr q0, [x2, #32]\n\t" + "ldr q1, [x2, #48]\n\t" + "ldr q2, [x3, #32]\n\t" + "ldr q3, [x3, #48]\n\t" + "sub v26.8h, v13.8h, v14.8h\n\t" + "sub v28.8h, v15.8h, v16.8h\n\t" + "add v13.8h, v13.8h, v14.8h\n\t" + "add v15.8h, v15.8h, v16.8h\n\t" + "mul v25.8h, v26.8h, v2.8h\n\t" + "mul v27.8h, v28.8h, v3.8h\n\t" + "sqrdmulh v14.8h, v26.8h, v0.8h\n\t" + "sqrdmulh v16.8h, v28.8h, v1.8h\n\t" +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + "sqrdmlsh v14.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v16.8h, v27.8h, v8.h[0]\n\t" +#else + "sqrdmulh v25.8h, v25.8h, v8.h[0]\n\t" + "sqrdmulh v27.8h, v27.8h, v8.h[0]\n\t" + "sub v14.8h, v14.8h, v25.8h\n\t" + "sub v16.8h, v16.8h, v27.8h\n\t" +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + "sshr v14.8h, v14.8h, #1\n\t" + "sshr v16.8h, v16.8h, #1\n\t" + "ldr q0, [x2, #64]\n\t" + "ldr q1, [x2, #80]\n\t" + "ldr q2, [x3, #64]\n\t" + "ldr q3, [x3, #80]\n\t" + "sub v26.8h, v17.8h, v18.8h\n\t" + "sub v28.8h, v19.8h, v20.8h\n\t" + "add v17.8h, v17.8h, v18.8h\n\t" + "add v19.8h, v19.8h, v20.8h\n\t" + "mul v25.8h, v26.8h, v2.8h\n\t" + "mul v27.8h, v28.8h, v3.8h\n\t" + "sqrdmulh v18.8h, v26.8h, v0.8h\n\t" + "sqrdmulh v20.8h, v28.8h, v1.8h\n\t" +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + "sqrdmlsh v18.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v20.8h, v27.8h, v8.h[0]\n\t" +#else + "sqrdmulh v25.8h, v25.8h, v8.h[0]\n\t" + "sqrdmulh v27.8h, v27.8h, v8.h[0]\n\t" + "sub v18.8h, v18.8h, v25.8h\n\t" + "sub v20.8h, v20.8h, v27.8h\n\t" +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + "sshr v18.8h, v18.8h, #1\n\t" + "sshr v20.8h, v20.8h, #1\n\t" + "ldr q0, [x2, #96]\n\t" + "ldr q1, [x2, #112]\n\t" + "ldr q2, [x3, #96]\n\t" + "ldr q3, [x3, #112]\n\t" + "sub v26.8h, v21.8h, v22.8h\n\t" + "sub v28.8h, v23.8h, v24.8h\n\t" + "add v21.8h, v21.8h, v22.8h\n\t" + "add v23.8h, v23.8h, v24.8h\n\t" + "mul v25.8h, v26.8h, v2.8h\n\t" + "mul v27.8h, v28.8h, v3.8h\n\t" + "sqrdmulh v22.8h, v26.8h, v0.8h\n\t" + "sqrdmulh v24.8h, v28.8h, v1.8h\n\t" +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + "sqrdmlsh v22.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v24.8h, v27.8h, v8.h[0]\n\t" +#else + "sqrdmulh v25.8h, v25.8h, v8.h[0]\n\t" + "sqrdmulh v27.8h, v27.8h, v8.h[0]\n\t" + "sub v22.8h, v22.8h, v25.8h\n\t" + "sub v24.8h, v24.8h, v27.8h\n\t" +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + "sshr v22.8h, v22.8h, #1\n\t" + "sshr v24.8h, v24.8h, #1\n\t" + "ldr q0, [x2, #256]\n\t" + "ldr q1, [x2, #272]\n\t" + "ldr q2, [x3, #256]\n\t" + "ldr q3, [x3, #272]\n\t" + "mov v25.16b, v9.16b\n\t" + "mov v26.16b, v11.16b\n\t" + "trn1 v9.4s, v9.4s, v10.4s\n\t" + "trn1 v11.4s, v11.4s, v12.4s\n\t" + "trn2 v10.4s, v25.4s, v10.4s\n\t" + "trn2 v12.4s, v26.4s, v12.4s\n\t" + "sub v26.8h, v9.8h, v10.8h\n\t" + "sub v28.8h, v11.8h, v12.8h\n\t" + "add v9.8h, v9.8h, v10.8h\n\t" + "add v11.8h, v11.8h, v12.8h\n\t" + "mul v25.8h, v26.8h, v2.8h\n\t" + "mul v27.8h, v28.8h, v3.8h\n\t" + "sqrdmulh v10.8h, v26.8h, v0.8h\n\t" + "sqrdmulh v12.8h, v28.8h, v1.8h\n\t" +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + "sqrdmlsh v10.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v12.8h, v27.8h, v8.h[0]\n\t" +#else + "sqrdmulh v25.8h, v25.8h, v8.h[0]\n\t" + "sqrdmulh v27.8h, v27.8h, v8.h[0]\n\t" + "sub v10.8h, v10.8h, v25.8h\n\t" + "sub v12.8h, v12.8h, v27.8h\n\t" +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + "sshr v10.8h, v10.8h, #1\n\t" + "sshr v12.8h, v12.8h, #1\n\t" + "ldr q0, [x2, #288]\n\t" + 
"ldr q1, [x2, #304]\n\t" + "ldr q2, [x3, #288]\n\t" + "ldr q3, [x3, #304]\n\t" + "mov v25.16b, v13.16b\n\t" + "mov v26.16b, v15.16b\n\t" + "trn1 v13.4s, v13.4s, v14.4s\n\t" + "trn1 v15.4s, v15.4s, v16.4s\n\t" + "trn2 v14.4s, v25.4s, v14.4s\n\t" + "trn2 v16.4s, v26.4s, v16.4s\n\t" + "sub v26.8h, v13.8h, v14.8h\n\t" + "sub v28.8h, v15.8h, v16.8h\n\t" + "add v13.8h, v13.8h, v14.8h\n\t" + "add v15.8h, v15.8h, v16.8h\n\t" + "mul v25.8h, v26.8h, v2.8h\n\t" + "mul v27.8h, v28.8h, v3.8h\n\t" + "sqrdmulh v14.8h, v26.8h, v0.8h\n\t" + "sqrdmulh v16.8h, v28.8h, v1.8h\n\t" +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + "sqrdmlsh v14.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v16.8h, v27.8h, v8.h[0]\n\t" +#else + "sqrdmulh v25.8h, v25.8h, v8.h[0]\n\t" + "sqrdmulh v27.8h, v27.8h, v8.h[0]\n\t" + "sub v14.8h, v14.8h, v25.8h\n\t" + "sub v16.8h, v16.8h, v27.8h\n\t" +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + "sshr v14.8h, v14.8h, #1\n\t" + "sshr v16.8h, v16.8h, #1\n\t" + "ldr q0, [x2, #320]\n\t" + "ldr q1, [x2, #336]\n\t" + "ldr q2, [x3, #320]\n\t" + "ldr q3, [x3, #336]\n\t" + "mov v25.16b, v17.16b\n\t" + "mov v26.16b, v19.16b\n\t" + "trn1 v17.4s, v17.4s, v18.4s\n\t" + "trn1 v19.4s, v19.4s, v20.4s\n\t" + "trn2 v18.4s, v25.4s, v18.4s\n\t" + "trn2 v20.4s, v26.4s, v20.4s\n\t" + "sub v26.8h, v17.8h, v18.8h\n\t" + "sub v28.8h, v19.8h, v20.8h\n\t" + "add v17.8h, v17.8h, v18.8h\n\t" + "add v19.8h, v19.8h, v20.8h\n\t" + "mul v25.8h, v26.8h, v2.8h\n\t" + "mul v27.8h, v28.8h, v3.8h\n\t" + "sqrdmulh v18.8h, v26.8h, v0.8h\n\t" + "sqrdmulh v20.8h, v28.8h, v1.8h\n\t" +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + "sqrdmlsh v18.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v20.8h, v27.8h, v8.h[0]\n\t" +#else + "sqrdmulh v25.8h, v25.8h, v8.h[0]\n\t" + "sqrdmulh v27.8h, v27.8h, v8.h[0]\n\t" + "sub v18.8h, v18.8h, v25.8h\n\t" + "sub v20.8h, v20.8h, v27.8h\n\t" +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + "sshr v18.8h, v18.8h, #1\n\t" + "sshr v20.8h, v20.8h, #1\n\t" + "ldr q0, [x2, #352]\n\t" + "ldr q1, [x2, #368]\n\t" + "ldr q2, [x3, #352]\n\t" + "ldr q3, [x3, #368]\n\t" + "mov v25.16b, v21.16b\n\t" + "mov v26.16b, v23.16b\n\t" + "trn1 v21.4s, v21.4s, v22.4s\n\t" + "trn1 v23.4s, v23.4s, v24.4s\n\t" + "trn2 v22.4s, v25.4s, v22.4s\n\t" + "trn2 v24.4s, v26.4s, v24.4s\n\t" + "sub v26.8h, v21.8h, v22.8h\n\t" + "sub v28.8h, v23.8h, v24.8h\n\t" + "add v21.8h, v21.8h, v22.8h\n\t" + "add v23.8h, v23.8h, v24.8h\n\t" + "mul v25.8h, v26.8h, v2.8h\n\t" + "mul v27.8h, v28.8h, v3.8h\n\t" + "sqrdmulh v22.8h, v26.8h, v0.8h\n\t" + "sqrdmulh v24.8h, v28.8h, v1.8h\n\t" +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + "sqrdmlsh v22.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v24.8h, v27.8h, v8.h[0]\n\t" +#else + "sqrdmulh v25.8h, v25.8h, v8.h[0]\n\t" + "sqrdmulh v27.8h, v27.8h, v8.h[0]\n\t" + "sub v22.8h, v22.8h, v25.8h\n\t" + "sub v24.8h, v24.8h, v27.8h\n\t" +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + "sshr v22.8h, v22.8h, #1\n\t" + "sshr v24.8h, v24.8h, #1\n\t" + "ldr q0, [x2, #512]\n\t" + "ldr q2, [x3, #512]\n\t" + "mov v25.16b, v9.16b\n\t" + "mov v26.16b, v11.16b\n\t" + "trn1 v9.2d, v9.2d, v10.2d\n\t" + "trn1 v11.2d, v11.2d, v12.2d\n\t" + "trn2 v10.2d, v25.2d, v10.2d\n\t" + "trn2 v12.2d, v26.2d, v12.2d\n\t" + "sub v26.8h, v9.8h, v10.8h\n\t" + "sub v28.8h, v11.8h, v12.8h\n\t" + "add v9.8h, v9.8h, v10.8h\n\t" + "add v11.8h, v11.8h, v12.8h\n\t" + "mul v25.8h, v26.8h, v2.h[0]\n\t" + "mul v27.8h, v28.8h, v2.h[1]\n\t" + "sqrdmulh v10.8h, v26.8h, v0.h[0]\n\t" + "sqrdmulh v12.8h, v28.8h, v0.h[1]\n\t" +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + "sqrdmlsh v10.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v12.8h, 
v27.8h, v8.h[0]\n\t" +#else + "sqrdmulh v25.8h, v25.8h, v8.h[0]\n\t" + "sqrdmulh v27.8h, v27.8h, v8.h[0]\n\t" + "sub v10.8h, v10.8h, v25.8h\n\t" + "sub v12.8h, v12.8h, v27.8h\n\t" +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + "sshr v10.8h, v10.8h, #1\n\t" + "sshr v12.8h, v12.8h, #1\n\t" + "mov v25.16b, v13.16b\n\t" + "mov v26.16b, v15.16b\n\t" + "trn1 v13.2d, v13.2d, v14.2d\n\t" + "trn1 v15.2d, v15.2d, v16.2d\n\t" + "trn2 v14.2d, v25.2d, v14.2d\n\t" + "trn2 v16.2d, v26.2d, v16.2d\n\t" + "sub v26.8h, v13.8h, v14.8h\n\t" + "sub v28.8h, v15.8h, v16.8h\n\t" + "add v13.8h, v13.8h, v14.8h\n\t" + "add v15.8h, v15.8h, v16.8h\n\t" + "mul v25.8h, v26.8h, v2.h[2]\n\t" + "mul v27.8h, v28.8h, v2.h[3]\n\t" + "sqrdmulh v14.8h, v26.8h, v0.h[2]\n\t" + "sqrdmulh v16.8h, v28.8h, v0.h[3]\n\t" +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + "sqrdmlsh v14.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v16.8h, v27.8h, v8.h[0]\n\t" +#else + "sqrdmulh v25.8h, v25.8h, v8.h[0]\n\t" + "sqrdmulh v27.8h, v27.8h, v8.h[0]\n\t" + "sub v14.8h, v14.8h, v25.8h\n\t" + "sub v16.8h, v16.8h, v27.8h\n\t" +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + "sshr v14.8h, v14.8h, #1\n\t" + "sshr v16.8h, v16.8h, #1\n\t" + "mov v25.16b, v17.16b\n\t" + "mov v26.16b, v19.16b\n\t" + "trn1 v17.2d, v17.2d, v18.2d\n\t" + "trn1 v19.2d, v19.2d, v20.2d\n\t" + "trn2 v18.2d, v25.2d, v18.2d\n\t" + "trn2 v20.2d, v26.2d, v20.2d\n\t" + "sub v26.8h, v17.8h, v18.8h\n\t" + "sub v28.8h, v19.8h, v20.8h\n\t" + "add v17.8h, v17.8h, v18.8h\n\t" + "add v19.8h, v19.8h, v20.8h\n\t" + "mul v25.8h, v26.8h, v2.h[4]\n\t" + "mul v27.8h, v28.8h, v2.h[5]\n\t" + "sqrdmulh v18.8h, v26.8h, v0.h[4]\n\t" + "sqrdmulh v20.8h, v28.8h, v0.h[5]\n\t" +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + "sqrdmlsh v18.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v20.8h, v27.8h, v8.h[0]\n\t" +#else + "sqrdmulh v25.8h, v25.8h, v8.h[0]\n\t" + "sqrdmulh v27.8h, v27.8h, v8.h[0]\n\t" + "sub v18.8h, v18.8h, v25.8h\n\t" + "sub v20.8h, v20.8h, v27.8h\n\t" +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + "sshr v18.8h, v18.8h, #1\n\t" + "sshr v20.8h, v20.8h, #1\n\t" + "mov v25.16b, v21.16b\n\t" + "mov v26.16b, v23.16b\n\t" + "trn1 v21.2d, v21.2d, v22.2d\n\t" + "trn1 v23.2d, v23.2d, v24.2d\n\t" + "trn2 v22.2d, v25.2d, v22.2d\n\t" + "trn2 v24.2d, v26.2d, v24.2d\n\t" + "sub v26.8h, v21.8h, v22.8h\n\t" + "sub v28.8h, v23.8h, v24.8h\n\t" + "add v21.8h, v21.8h, v22.8h\n\t" + "add v23.8h, v23.8h, v24.8h\n\t" + "mul v25.8h, v26.8h, v2.h[6]\n\t" + "mul v27.8h, v28.8h, v2.h[7]\n\t" + "sqrdmulh v22.8h, v26.8h, v0.h[6]\n\t" + "sqrdmulh v24.8h, v28.8h, v0.h[7]\n\t" +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + "sqrdmlsh v22.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v24.8h, v27.8h, v8.h[0]\n\t" +#else + "sqrdmulh v25.8h, v25.8h, v8.h[0]\n\t" + "sqrdmulh v27.8h, v27.8h, v8.h[0]\n\t" + "sub v22.8h, v22.8h, v25.8h\n\t" + "sub v24.8h, v24.8h, v27.8h\n\t" +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + "sshr v22.8h, v22.8h, #1\n\t" + "sshr v24.8h, v24.8h, #1\n\t" + "sqdmulh v25.8h, v9.8h, v8.h[2]\n\t" + "sqdmulh v26.8h, v11.8h, v8.h[2]\n\t" + "sshr v25.8h, v25.8h, #11\n\t" + "sshr v26.8h, v26.8h, #11\n\t" + "mls v9.8h, v25.8h, v8.h[0]\n\t" + "mls v11.8h, v26.8h, v8.h[0]\n\t" + "sqdmulh v25.8h, v13.8h, v8.h[2]\n\t" + "sqdmulh v26.8h, v15.8h, v8.h[2]\n\t" + "sshr v25.8h, v25.8h, #11\n\t" + "sshr v26.8h, v26.8h, #11\n\t" + "mls v13.8h, v25.8h, v8.h[0]\n\t" + "mls v15.8h, v26.8h, v8.h[0]\n\t" + "sqdmulh v25.8h, v17.8h, v8.h[2]\n\t" + "sqdmulh v26.8h, v19.8h, v8.h[2]\n\t" + "sshr v25.8h, v25.8h, #11\n\t" + "sshr v26.8h, v26.8h, #11\n\t" + "mls v17.8h, v25.8h, v8.h[0]\n\t" + "mls v19.8h, 
v26.8h, v8.h[0]\n\t" + "sqdmulh v25.8h, v21.8h, v8.h[2]\n\t" + "sqdmulh v26.8h, v23.8h, v8.h[2]\n\t" + "sshr v25.8h, v25.8h, #11\n\t" + "sshr v26.8h, v26.8h, #11\n\t" + "mls v21.8h, v25.8h, v8.h[0]\n\t" + "mls v23.8h, v26.8h, v8.h[0]\n\t" + "stp q9, q10, [%x[r]]\n\t" + "stp q11, q12, [%x[r], #32]\n\t" + "stp q13, q14, [%x[r], #64]\n\t" + "stp q15, q16, [%x[r], #96]\n\t" + "stp q17, q18, [%x[r], #128]\n\t" + "stp q19, q20, [%x[r], #160]\n\t" + "stp q21, q22, [%x[r], #192]\n\t" + "stp q23, q24, [%x[r], #224]\n\t" + "ldp q9, q10, [x1]\n\t" + "ldp q11, q12, [x1, #32]\n\t" + "ldp q13, q14, [x1, #64]\n\t" + "ldp q15, q16, [x1, #96]\n\t" + "ldp q17, q18, [x1, #128]\n\t" + "ldp q19, q20, [x1, #160]\n\t" + "ldp q21, q22, [x1, #192]\n\t" + "ldp q23, q24, [x1, #224]\n\t" + "mov v25.16b, v9.16b\n\t" + "trn1 v9.2d, v9.2d, v10.2d\n\t" + "trn2 v10.2d, v25.2d, v10.2d\n\t" + "mov v25.16b, v9.16b\n\t" + "trn1 v9.4s, v9.4s, v10.4s\n\t" + "trn2 v10.4s, v25.4s, v10.4s\n\t" + "mov v25.16b, v11.16b\n\t" + "trn1 v11.2d, v11.2d, v12.2d\n\t" + "trn2 v12.2d, v25.2d, v12.2d\n\t" + "mov v25.16b, v11.16b\n\t" + "trn1 v11.4s, v11.4s, v12.4s\n\t" + "trn2 v12.4s, v25.4s, v12.4s\n\t" + "mov v25.16b, v13.16b\n\t" + "trn1 v13.2d, v13.2d, v14.2d\n\t" + "trn2 v14.2d, v25.2d, v14.2d\n\t" + "mov v25.16b, v13.16b\n\t" + "trn1 v13.4s, v13.4s, v14.4s\n\t" + "trn2 v14.4s, v25.4s, v14.4s\n\t" + "mov v25.16b, v15.16b\n\t" + "trn1 v15.2d, v15.2d, v16.2d\n\t" + "trn2 v16.2d, v25.2d, v16.2d\n\t" + "mov v25.16b, v15.16b\n\t" + "trn1 v15.4s, v15.4s, v16.4s\n\t" + "trn2 v16.4s, v25.4s, v16.4s\n\t" + "mov v25.16b, v17.16b\n\t" + "trn1 v17.2d, v17.2d, v18.2d\n\t" + "trn2 v18.2d, v25.2d, v18.2d\n\t" + "mov v25.16b, v17.16b\n\t" + "trn1 v17.4s, v17.4s, v18.4s\n\t" + "trn2 v18.4s, v25.4s, v18.4s\n\t" + "mov v25.16b, v19.16b\n\t" + "trn1 v19.2d, v19.2d, v20.2d\n\t" + "trn2 v20.2d, v25.2d, v20.2d\n\t" + "mov v25.16b, v19.16b\n\t" + "trn1 v19.4s, v19.4s, v20.4s\n\t" + "trn2 v20.4s, v25.4s, v20.4s\n\t" + "mov v25.16b, v21.16b\n\t" + "trn1 v21.2d, v21.2d, v22.2d\n\t" + "trn2 v22.2d, v25.2d, v22.2d\n\t" + "mov v25.16b, v21.16b\n\t" + "trn1 v21.4s, v21.4s, v22.4s\n\t" + "trn2 v22.4s, v25.4s, v22.4s\n\t" + "mov v25.16b, v23.16b\n\t" + "trn1 v23.2d, v23.2d, v24.2d\n\t" + "trn2 v24.2d, v25.2d, v24.2d\n\t" + "mov v25.16b, v23.16b\n\t" + "trn1 v23.4s, v23.4s, v24.4s\n\t" + "trn2 v24.4s, v25.4s, v24.4s\n\t" + "ldr q0, [x2, #128]\n\t" + "ldr q1, [x2, #144]\n\t" + "ldr q2, [x3, #128]\n\t" + "ldr q3, [x3, #144]\n\t" + "sub v26.8h, v9.8h, v10.8h\n\t" + "sub v28.8h, v11.8h, v12.8h\n\t" + "add v9.8h, v9.8h, v10.8h\n\t" + "add v11.8h, v11.8h, v12.8h\n\t" + "mul v25.8h, v26.8h, v2.8h\n\t" + "mul v27.8h, v28.8h, v3.8h\n\t" + "sqrdmulh v10.8h, v26.8h, v0.8h\n\t" + "sqrdmulh v12.8h, v28.8h, v1.8h\n\t" +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + "sqrdmlsh v10.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v12.8h, v27.8h, v8.h[0]\n\t" +#else + "sqrdmulh v25.8h, v25.8h, v8.h[0]\n\t" + "sqrdmulh v27.8h, v27.8h, v8.h[0]\n\t" + "sub v10.8h, v10.8h, v25.8h\n\t" + "sub v12.8h, v12.8h, v27.8h\n\t" +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + "sshr v10.8h, v10.8h, #1\n\t" + "sshr v12.8h, v12.8h, #1\n\t" + "ldr q0, [x2, #160]\n\t" + "ldr q1, [x2, #176]\n\t" + "ldr q2, [x3, #160]\n\t" + "ldr q3, [x3, #176]\n\t" + "sub v26.8h, v13.8h, v14.8h\n\t" + "sub v28.8h, v15.8h, v16.8h\n\t" + "add v13.8h, v13.8h, v14.8h\n\t" + "add v15.8h, v15.8h, v16.8h\n\t" + "mul v25.8h, v26.8h, v2.8h\n\t" + "mul v27.8h, v28.8h, v3.8h\n\t" + "sqrdmulh v14.8h, v26.8h, v0.8h\n\t" + "sqrdmulh v16.8h, v28.8h, v1.8h\n\t" 
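+        /* Montgomery reduce the butterfly differences: v25/v27 hold the low
+         * halves (t * zeta * QINV mod 2^16) and v14/v16 the rounded high
+         * halves of t * zeta, with v8.h[0] holding Q.  SQRDMLSH folds in
+         * -high(low * Q) in a single instruction; builds without the
+         * ARMv8.1 RDM extension (WOLFSSL_AARCH64_NO_SQRMLSH) emulate it
+         * below with SQRDMULH and SUB. */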
+#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + "sqrdmlsh v14.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v16.8h, v27.8h, v8.h[0]\n\t" +#else + "sqrdmulh v25.8h, v25.8h, v8.h[0]\n\t" + "sqrdmulh v27.8h, v27.8h, v8.h[0]\n\t" + "sub v14.8h, v14.8h, v25.8h\n\t" + "sub v16.8h, v16.8h, v27.8h\n\t" +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + "sshr v14.8h, v14.8h, #1\n\t" + "sshr v16.8h, v16.8h, #1\n\t" + "ldr q0, [x2, #192]\n\t" + "ldr q1, [x2, #208]\n\t" + "ldr q2, [x3, #192]\n\t" + "ldr q3, [x3, #208]\n\t" + "sub v26.8h, v17.8h, v18.8h\n\t" + "sub v28.8h, v19.8h, v20.8h\n\t" + "add v17.8h, v17.8h, v18.8h\n\t" + "add v19.8h, v19.8h, v20.8h\n\t" + "mul v25.8h, v26.8h, v2.8h\n\t" + "mul v27.8h, v28.8h, v3.8h\n\t" + "sqrdmulh v18.8h, v26.8h, v0.8h\n\t" + "sqrdmulh v20.8h, v28.8h, v1.8h\n\t" +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + "sqrdmlsh v18.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v20.8h, v27.8h, v8.h[0]\n\t" +#else + "sqrdmulh v25.8h, v25.8h, v8.h[0]\n\t" + "sqrdmulh v27.8h, v27.8h, v8.h[0]\n\t" + "sub v18.8h, v18.8h, v25.8h\n\t" + "sub v20.8h, v20.8h, v27.8h\n\t" +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + "sshr v18.8h, v18.8h, #1\n\t" + "sshr v20.8h, v20.8h, #1\n\t" + "ldr q0, [x2, #224]\n\t" + "ldr q1, [x2, #240]\n\t" + "ldr q2, [x3, #224]\n\t" + "ldr q3, [x3, #240]\n\t" + "sub v26.8h, v21.8h, v22.8h\n\t" + "sub v28.8h, v23.8h, v24.8h\n\t" + "add v21.8h, v21.8h, v22.8h\n\t" + "add v23.8h, v23.8h, v24.8h\n\t" + "mul v25.8h, v26.8h, v2.8h\n\t" + "mul v27.8h, v28.8h, v3.8h\n\t" + "sqrdmulh v22.8h, v26.8h, v0.8h\n\t" + "sqrdmulh v24.8h, v28.8h, v1.8h\n\t" +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + "sqrdmlsh v22.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v24.8h, v27.8h, v8.h[0]\n\t" +#else + "sqrdmulh v25.8h, v25.8h, v8.h[0]\n\t" + "sqrdmulh v27.8h, v27.8h, v8.h[0]\n\t" + "sub v22.8h, v22.8h, v25.8h\n\t" + "sub v24.8h, v24.8h, v27.8h\n\t" +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + "sshr v22.8h, v22.8h, #1\n\t" + "sshr v24.8h, v24.8h, #1\n\t" + "ldr q0, [x2, #384]\n\t" + "ldr q1, [x2, #400]\n\t" + "ldr q2, [x3, #384]\n\t" + "ldr q3, [x3, #400]\n\t" + "mov v25.16b, v9.16b\n\t" + "mov v26.16b, v11.16b\n\t" + "trn1 v9.4s, v9.4s, v10.4s\n\t" + "trn1 v11.4s, v11.4s, v12.4s\n\t" + "trn2 v10.4s, v25.4s, v10.4s\n\t" + "trn2 v12.4s, v26.4s, v12.4s\n\t" + "sub v26.8h, v9.8h, v10.8h\n\t" + "sub v28.8h, v11.8h, v12.8h\n\t" + "add v9.8h, v9.8h, v10.8h\n\t" + "add v11.8h, v11.8h, v12.8h\n\t" + "mul v25.8h, v26.8h, v2.8h\n\t" + "mul v27.8h, v28.8h, v3.8h\n\t" + "sqrdmulh v10.8h, v26.8h, v0.8h\n\t" + "sqrdmulh v12.8h, v28.8h, v1.8h\n\t" +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + "sqrdmlsh v10.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v12.8h, v27.8h, v8.h[0]\n\t" +#else + "sqrdmulh v25.8h, v25.8h, v8.h[0]\n\t" + "sqrdmulh v27.8h, v27.8h, v8.h[0]\n\t" + "sub v10.8h, v10.8h, v25.8h\n\t" + "sub v12.8h, v12.8h, v27.8h\n\t" +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + "sshr v10.8h, v10.8h, #1\n\t" + "sshr v12.8h, v12.8h, #1\n\t" + "ldr q0, [x2, #416]\n\t" + "ldr q1, [x2, #432]\n\t" + "ldr q2, [x3, #416]\n\t" + "ldr q3, [x3, #432]\n\t" + "mov v25.16b, v13.16b\n\t" + "mov v26.16b, v15.16b\n\t" + "trn1 v13.4s, v13.4s, v14.4s\n\t" + "trn1 v15.4s, v15.4s, v16.4s\n\t" + "trn2 v14.4s, v25.4s, v14.4s\n\t" + "trn2 v16.4s, v26.4s, v16.4s\n\t" + "sub v26.8h, v13.8h, v14.8h\n\t" + "sub v28.8h, v15.8h, v16.8h\n\t" + "add v13.8h, v13.8h, v14.8h\n\t" + "add v15.8h, v15.8h, v16.8h\n\t" + "mul v25.8h, v26.8h, v2.8h\n\t" + "mul v27.8h, v28.8h, v3.8h\n\t" + "sqrdmulh v14.8h, v26.8h, v0.8h\n\t" + "sqrdmulh v16.8h, v28.8h, v1.8h\n\t" +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + "sqrdmlsh 
v14.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v16.8h, v27.8h, v8.h[0]\n\t" +#else + "sqrdmulh v25.8h, v25.8h, v8.h[0]\n\t" + "sqrdmulh v27.8h, v27.8h, v8.h[0]\n\t" + "sub v14.8h, v14.8h, v25.8h\n\t" + "sub v16.8h, v16.8h, v27.8h\n\t" +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + "sshr v14.8h, v14.8h, #1\n\t" + "sshr v16.8h, v16.8h, #1\n\t" + "ldr q0, [x2, #448]\n\t" + "ldr q1, [x2, #464]\n\t" + "ldr q2, [x3, #448]\n\t" + "ldr q3, [x3, #464]\n\t" + "mov v25.16b, v17.16b\n\t" + "mov v26.16b, v19.16b\n\t" + "trn1 v17.4s, v17.4s, v18.4s\n\t" + "trn1 v19.4s, v19.4s, v20.4s\n\t" + "trn2 v18.4s, v25.4s, v18.4s\n\t" + "trn2 v20.4s, v26.4s, v20.4s\n\t" + "sub v26.8h, v17.8h, v18.8h\n\t" + "sub v28.8h, v19.8h, v20.8h\n\t" + "add v17.8h, v17.8h, v18.8h\n\t" + "add v19.8h, v19.8h, v20.8h\n\t" + "mul v25.8h, v26.8h, v2.8h\n\t" + "mul v27.8h, v28.8h, v3.8h\n\t" + "sqrdmulh v18.8h, v26.8h, v0.8h\n\t" + "sqrdmulh v20.8h, v28.8h, v1.8h\n\t" +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + "sqrdmlsh v18.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v20.8h, v27.8h, v8.h[0]\n\t" +#else + "sqrdmulh v25.8h, v25.8h, v8.h[0]\n\t" + "sqrdmulh v27.8h, v27.8h, v8.h[0]\n\t" + "sub v18.8h, v18.8h, v25.8h\n\t" + "sub v20.8h, v20.8h, v27.8h\n\t" +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + "sshr v18.8h, v18.8h, #1\n\t" + "sshr v20.8h, v20.8h, #1\n\t" + "ldr q0, [x2, #480]\n\t" + "ldr q1, [x2, #496]\n\t" + "ldr q2, [x3, #480]\n\t" + "ldr q3, [x3, #496]\n\t" + "mov v25.16b, v21.16b\n\t" + "mov v26.16b, v23.16b\n\t" + "trn1 v21.4s, v21.4s, v22.4s\n\t" + "trn1 v23.4s, v23.4s, v24.4s\n\t" + "trn2 v22.4s, v25.4s, v22.4s\n\t" + "trn2 v24.4s, v26.4s, v24.4s\n\t" + "sub v26.8h, v21.8h, v22.8h\n\t" + "sub v28.8h, v23.8h, v24.8h\n\t" + "add v21.8h, v21.8h, v22.8h\n\t" + "add v23.8h, v23.8h, v24.8h\n\t" + "mul v25.8h, v26.8h, v2.8h\n\t" + "mul v27.8h, v28.8h, v3.8h\n\t" + "sqrdmulh v22.8h, v26.8h, v0.8h\n\t" + "sqrdmulh v24.8h, v28.8h, v1.8h\n\t" +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + "sqrdmlsh v22.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v24.8h, v27.8h, v8.h[0]\n\t" +#else + "sqrdmulh v25.8h, v25.8h, v8.h[0]\n\t" + "sqrdmulh v27.8h, v27.8h, v8.h[0]\n\t" + "sub v22.8h, v22.8h, v25.8h\n\t" + "sub v24.8h, v24.8h, v27.8h\n\t" +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + "sshr v22.8h, v22.8h, #1\n\t" + "sshr v24.8h, v24.8h, #1\n\t" + "ldr q0, [x2, #528]\n\t" + "ldr q2, [x3, #528]\n\t" + "mov v25.16b, v9.16b\n\t" + "mov v26.16b, v11.16b\n\t" + "trn1 v9.2d, v9.2d, v10.2d\n\t" + "trn1 v11.2d, v11.2d, v12.2d\n\t" + "trn2 v10.2d, v25.2d, v10.2d\n\t" + "trn2 v12.2d, v26.2d, v12.2d\n\t" + "sub v26.8h, v9.8h, v10.8h\n\t" + "sub v28.8h, v11.8h, v12.8h\n\t" + "add v9.8h, v9.8h, v10.8h\n\t" + "add v11.8h, v11.8h, v12.8h\n\t" + "mul v25.8h, v26.8h, v2.h[0]\n\t" + "mul v27.8h, v28.8h, v2.h[1]\n\t" + "sqrdmulh v10.8h, v26.8h, v0.h[0]\n\t" + "sqrdmulh v12.8h, v28.8h, v0.h[1]\n\t" +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + "sqrdmlsh v10.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v12.8h, v27.8h, v8.h[0]\n\t" +#else + "sqrdmulh v25.8h, v25.8h, v8.h[0]\n\t" + "sqrdmulh v27.8h, v27.8h, v8.h[0]\n\t" + "sub v10.8h, v10.8h, v25.8h\n\t" + "sub v12.8h, v12.8h, v27.8h\n\t" +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + "sshr v10.8h, v10.8h, #1\n\t" + "sshr v12.8h, v12.8h, #1\n\t" + "mov v25.16b, v13.16b\n\t" + "mov v26.16b, v15.16b\n\t" + "trn1 v13.2d, v13.2d, v14.2d\n\t" + "trn1 v15.2d, v15.2d, v16.2d\n\t" + "trn2 v14.2d, v25.2d, v14.2d\n\t" + "trn2 v16.2d, v26.2d, v16.2d\n\t" + "sub v26.8h, v13.8h, v14.8h\n\t" + "sub v28.8h, v15.8h, v16.8h\n\t" + "add v13.8h, v13.8h, v14.8h\n\t" + "add v15.8h, v15.8h, 
v16.8h\n\t" + "mul v25.8h, v26.8h, v2.h[2]\n\t" + "mul v27.8h, v28.8h, v2.h[3]\n\t" + "sqrdmulh v14.8h, v26.8h, v0.h[2]\n\t" + "sqrdmulh v16.8h, v28.8h, v0.h[3]\n\t" +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + "sqrdmlsh v14.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v16.8h, v27.8h, v8.h[0]\n\t" +#else + "sqrdmulh v25.8h, v25.8h, v8.h[0]\n\t" + "sqrdmulh v27.8h, v27.8h, v8.h[0]\n\t" + "sub v14.8h, v14.8h, v25.8h\n\t" + "sub v16.8h, v16.8h, v27.8h\n\t" +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + "sshr v14.8h, v14.8h, #1\n\t" + "sshr v16.8h, v16.8h, #1\n\t" + "mov v25.16b, v17.16b\n\t" + "mov v26.16b, v19.16b\n\t" + "trn1 v17.2d, v17.2d, v18.2d\n\t" + "trn1 v19.2d, v19.2d, v20.2d\n\t" + "trn2 v18.2d, v25.2d, v18.2d\n\t" + "trn2 v20.2d, v26.2d, v20.2d\n\t" + "sub v26.8h, v17.8h, v18.8h\n\t" + "sub v28.8h, v19.8h, v20.8h\n\t" + "add v17.8h, v17.8h, v18.8h\n\t" + "add v19.8h, v19.8h, v20.8h\n\t" + "mul v25.8h, v26.8h, v2.h[4]\n\t" + "mul v27.8h, v28.8h, v2.h[5]\n\t" + "sqrdmulh v18.8h, v26.8h, v0.h[4]\n\t" + "sqrdmulh v20.8h, v28.8h, v0.h[5]\n\t" +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + "sqrdmlsh v18.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v20.8h, v27.8h, v8.h[0]\n\t" +#else + "sqrdmulh v25.8h, v25.8h, v8.h[0]\n\t" + "sqrdmulh v27.8h, v27.8h, v8.h[0]\n\t" + "sub v18.8h, v18.8h, v25.8h\n\t" + "sub v20.8h, v20.8h, v27.8h\n\t" +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + "sshr v18.8h, v18.8h, #1\n\t" + "sshr v20.8h, v20.8h, #1\n\t" + "mov v25.16b, v21.16b\n\t" + "mov v26.16b, v23.16b\n\t" + "trn1 v21.2d, v21.2d, v22.2d\n\t" + "trn1 v23.2d, v23.2d, v24.2d\n\t" + "trn2 v22.2d, v25.2d, v22.2d\n\t" + "trn2 v24.2d, v26.2d, v24.2d\n\t" + "sub v26.8h, v21.8h, v22.8h\n\t" + "sub v28.8h, v23.8h, v24.8h\n\t" + "add v21.8h, v21.8h, v22.8h\n\t" + "add v23.8h, v23.8h, v24.8h\n\t" + "mul v25.8h, v26.8h, v2.h[6]\n\t" + "mul v27.8h, v28.8h, v2.h[7]\n\t" + "sqrdmulh v22.8h, v26.8h, v0.h[6]\n\t" + "sqrdmulh v24.8h, v28.8h, v0.h[7]\n\t" +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + "sqrdmlsh v22.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v24.8h, v27.8h, v8.h[0]\n\t" +#else + "sqrdmulh v25.8h, v25.8h, v8.h[0]\n\t" + "sqrdmulh v27.8h, v27.8h, v8.h[0]\n\t" + "sub v22.8h, v22.8h, v25.8h\n\t" + "sub v24.8h, v24.8h, v27.8h\n\t" +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + "sshr v22.8h, v22.8h, #1\n\t" + "sshr v24.8h, v24.8h, #1\n\t" + "sqdmulh v25.8h, v9.8h, v8.h[2]\n\t" + "sqdmulh v26.8h, v11.8h, v8.h[2]\n\t" + "sshr v25.8h, v25.8h, #11\n\t" + "sshr v26.8h, v26.8h, #11\n\t" + "mls v9.8h, v25.8h, v8.h[0]\n\t" + "mls v11.8h, v26.8h, v8.h[0]\n\t" + "sqdmulh v25.8h, v13.8h, v8.h[2]\n\t" + "sqdmulh v26.8h, v15.8h, v8.h[2]\n\t" + "sshr v25.8h, v25.8h, #11\n\t" + "sshr v26.8h, v26.8h, #11\n\t" + "mls v13.8h, v25.8h, v8.h[0]\n\t" + "mls v15.8h, v26.8h, v8.h[0]\n\t" + "sqdmulh v25.8h, v17.8h, v8.h[2]\n\t" + "sqdmulh v26.8h, v19.8h, v8.h[2]\n\t" + "sshr v25.8h, v25.8h, #11\n\t" + "sshr v26.8h, v26.8h, #11\n\t" + "mls v17.8h, v25.8h, v8.h[0]\n\t" + "mls v19.8h, v26.8h, v8.h[0]\n\t" + "sqdmulh v25.8h, v21.8h, v8.h[2]\n\t" + "sqdmulh v26.8h, v23.8h, v8.h[2]\n\t" + "sshr v25.8h, v25.8h, #11\n\t" + "sshr v26.8h, v26.8h, #11\n\t" + "mls v21.8h, v25.8h, v8.h[0]\n\t" + "mls v23.8h, v26.8h, v8.h[0]\n\t" + "stp q9, q10, [x1]\n\t" + "stp q11, q12, [x1, #32]\n\t" + "stp q13, q14, [x1, #64]\n\t" + "stp q15, q16, [x1, #96]\n\t" + "stp q17, q18, [x1, #128]\n\t" + "stp q19, q20, [x1, #160]\n\t" + "stp q21, q22, [x1, #192]\n\t" + "stp q23, q24, [x1, #224]\n\t" + "ldr q4, [x2, #544]\n\t" + "ldr q5, [x2, #560]\n\t" + "ldr q6, [x3, #544]\n\t" + "ldr q7, [x3, #560]\n\t" + "ldr q9, 
[%x[r]]\n\t" + "ldr q10, [%x[r], #32]\n\t" + "ldr q11, [%x[r], #64]\n\t" + "ldr q12, [%x[r], #96]\n\t" + "ldr q13, [%x[r], #128]\n\t" + "ldr q14, [%x[r], #160]\n\t" + "ldr q15, [%x[r], #192]\n\t" + "ldr q16, [%x[r], #224]\n\t" + "ldr q17, [x1]\n\t" + "ldr q18, [x1, #32]\n\t" + "ldr q19, [x1, #64]\n\t" + "ldr q20, [x1, #96]\n\t" + "ldr q21, [x1, #128]\n\t" + "ldr q22, [x1, #160]\n\t" + "ldr q23, [x1, #192]\n\t" + "ldr q24, [x1, #224]\n\t" + "sub v26.8h, v9.8h, v10.8h\n\t" + "sub v28.8h, v11.8h, v12.8h\n\t" + "add v9.8h, v9.8h, v10.8h\n\t" + "add v11.8h, v11.8h, v12.8h\n\t" + "mul v25.8h, v26.8h, v6.h[0]\n\t" + "mul v27.8h, v28.8h, v6.h[1]\n\t" + "sqrdmulh v10.8h, v26.8h, v4.h[0]\n\t" + "sqrdmulh v12.8h, v28.8h, v4.h[1]\n\t" +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + "sqrdmlsh v10.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v12.8h, v27.8h, v8.h[0]\n\t" +#else + "sqrdmulh v25.8h, v25.8h, v8.h[0]\n\t" + "sqrdmulh v27.8h, v27.8h, v8.h[0]\n\t" + "sub v10.8h, v10.8h, v25.8h\n\t" + "sub v12.8h, v12.8h, v27.8h\n\t" +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + "sshr v10.8h, v10.8h, #1\n\t" + "sshr v12.8h, v12.8h, #1\n\t" + "sub v26.8h, v13.8h, v14.8h\n\t" + "sub v28.8h, v15.8h, v16.8h\n\t" + "add v13.8h, v13.8h, v14.8h\n\t" + "add v15.8h, v15.8h, v16.8h\n\t" + "mul v25.8h, v26.8h, v6.h[2]\n\t" + "mul v27.8h, v28.8h, v6.h[3]\n\t" + "sqrdmulh v14.8h, v26.8h, v4.h[2]\n\t" + "sqrdmulh v16.8h, v28.8h, v4.h[3]\n\t" +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + "sqrdmlsh v14.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v16.8h, v27.8h, v8.h[0]\n\t" +#else + "sqrdmulh v25.8h, v25.8h, v8.h[0]\n\t" + "sqrdmulh v27.8h, v27.8h, v8.h[0]\n\t" + "sub v14.8h, v14.8h, v25.8h\n\t" + "sub v16.8h, v16.8h, v27.8h\n\t" +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + "sshr v14.8h, v14.8h, #1\n\t" + "sshr v16.8h, v16.8h, #1\n\t" + "sub v26.8h, v17.8h, v18.8h\n\t" + "sub v28.8h, v19.8h, v20.8h\n\t" + "add v17.8h, v17.8h, v18.8h\n\t" + "add v19.8h, v19.8h, v20.8h\n\t" + "mul v25.8h, v26.8h, v6.h[4]\n\t" + "mul v27.8h, v28.8h, v6.h[5]\n\t" + "sqrdmulh v18.8h, v26.8h, v4.h[4]\n\t" + "sqrdmulh v20.8h, v28.8h, v4.h[5]\n\t" +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + "sqrdmlsh v18.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v20.8h, v27.8h, v8.h[0]\n\t" +#else + "sqrdmulh v25.8h, v25.8h, v8.h[0]\n\t" + "sqrdmulh v27.8h, v27.8h, v8.h[0]\n\t" + "sub v18.8h, v18.8h, v25.8h\n\t" + "sub v20.8h, v20.8h, v27.8h\n\t" +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + "sshr v18.8h, v18.8h, #1\n\t" + "sshr v20.8h, v20.8h, #1\n\t" + "sub v26.8h, v21.8h, v22.8h\n\t" + "sub v28.8h, v23.8h, v24.8h\n\t" + "add v21.8h, v21.8h, v22.8h\n\t" + "add v23.8h, v23.8h, v24.8h\n\t" + "mul v25.8h, v26.8h, v6.h[6]\n\t" + "mul v27.8h, v28.8h, v6.h[7]\n\t" + "sqrdmulh v22.8h, v26.8h, v4.h[6]\n\t" + "sqrdmulh v24.8h, v28.8h, v4.h[7]\n\t" +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + "sqrdmlsh v22.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v24.8h, v27.8h, v8.h[0]\n\t" +#else + "sqrdmulh v25.8h, v25.8h, v8.h[0]\n\t" + "sqrdmulh v27.8h, v27.8h, v8.h[0]\n\t" + "sub v22.8h, v22.8h, v25.8h\n\t" + "sub v24.8h, v24.8h, v27.8h\n\t" +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + "sshr v22.8h, v22.8h, #1\n\t" + "sshr v24.8h, v24.8h, #1\n\t" + "sub v26.8h, v9.8h, v11.8h\n\t" + "sub v28.8h, v10.8h, v12.8h\n\t" + "add v9.8h, v9.8h, v11.8h\n\t" + "add v10.8h, v10.8h, v12.8h\n\t" + "mul v25.8h, v26.8h, v7.h[0]\n\t" + "mul v27.8h, v28.8h, v7.h[0]\n\t" + "sqrdmulh v11.8h, v26.8h, v5.h[0]\n\t" + "sqrdmulh v12.8h, v28.8h, v5.h[0]\n\t" +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + "sqrdmlsh v11.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v12.8h, v27.8h, 
v8.h[0]\n\t" +#else + "sqrdmulh v25.8h, v25.8h, v8.h[0]\n\t" + "sqrdmulh v27.8h, v27.8h, v8.h[0]\n\t" + "sub v11.8h, v11.8h, v25.8h\n\t" + "sub v12.8h, v12.8h, v27.8h\n\t" +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + "sshr v11.8h, v11.8h, #1\n\t" + "sshr v12.8h, v12.8h, #1\n\t" + "sub v26.8h, v13.8h, v15.8h\n\t" + "sub v28.8h, v14.8h, v16.8h\n\t" + "add v13.8h, v13.8h, v15.8h\n\t" + "add v14.8h, v14.8h, v16.8h\n\t" + "mul v25.8h, v26.8h, v7.h[1]\n\t" + "mul v27.8h, v28.8h, v7.h[1]\n\t" + "sqrdmulh v15.8h, v26.8h, v5.h[1]\n\t" + "sqrdmulh v16.8h, v28.8h, v5.h[1]\n\t" +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + "sqrdmlsh v15.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v16.8h, v27.8h, v8.h[0]\n\t" +#else + "sqrdmulh v25.8h, v25.8h, v8.h[0]\n\t" + "sqrdmulh v27.8h, v27.8h, v8.h[0]\n\t" + "sub v15.8h, v15.8h, v25.8h\n\t" + "sub v16.8h, v16.8h, v27.8h\n\t" +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + "sshr v15.8h, v15.8h, #1\n\t" + "sshr v16.8h, v16.8h, #1\n\t" + "sub v26.8h, v17.8h, v19.8h\n\t" + "sub v28.8h, v18.8h, v20.8h\n\t" + "add v17.8h, v17.8h, v19.8h\n\t" + "add v18.8h, v18.8h, v20.8h\n\t" + "mul v25.8h, v26.8h, v7.h[2]\n\t" + "mul v27.8h, v28.8h, v7.h[2]\n\t" + "sqrdmulh v19.8h, v26.8h, v5.h[2]\n\t" + "sqrdmulh v20.8h, v28.8h, v5.h[2]\n\t" +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + "sqrdmlsh v19.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v20.8h, v27.8h, v8.h[0]\n\t" +#else + "sqrdmulh v25.8h, v25.8h, v8.h[0]\n\t" + "sqrdmulh v27.8h, v27.8h, v8.h[0]\n\t" + "sub v19.8h, v19.8h, v25.8h\n\t" + "sub v20.8h, v20.8h, v27.8h\n\t" +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + "sshr v19.8h, v19.8h, #1\n\t" + "sshr v20.8h, v20.8h, #1\n\t" + "sub v26.8h, v21.8h, v23.8h\n\t" + "sub v28.8h, v22.8h, v24.8h\n\t" + "add v21.8h, v21.8h, v23.8h\n\t" + "add v22.8h, v22.8h, v24.8h\n\t" + "mul v25.8h, v26.8h, v7.h[3]\n\t" + "mul v27.8h, v28.8h, v7.h[3]\n\t" + "sqrdmulh v23.8h, v26.8h, v5.h[3]\n\t" + "sqrdmulh v24.8h, v28.8h, v5.h[3]\n\t" +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + "sqrdmlsh v23.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v24.8h, v27.8h, v8.h[0]\n\t" +#else + "sqrdmulh v25.8h, v25.8h, v8.h[0]\n\t" + "sqrdmulh v27.8h, v27.8h, v8.h[0]\n\t" + "sub v23.8h, v23.8h, v25.8h\n\t" + "sub v24.8h, v24.8h, v27.8h\n\t" +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + "sshr v23.8h, v23.8h, #1\n\t" + "sshr v24.8h, v24.8h, #1\n\t" + "sub v26.8h, v9.8h, v13.8h\n\t" + "sub v28.8h, v10.8h, v14.8h\n\t" + "add v9.8h, v9.8h, v13.8h\n\t" + "add v10.8h, v10.8h, v14.8h\n\t" + "mul v25.8h, v26.8h, v7.h[4]\n\t" + "mul v27.8h, v28.8h, v7.h[4]\n\t" + "sqrdmulh v13.8h, v26.8h, v5.h[4]\n\t" + "sqrdmulh v14.8h, v28.8h, v5.h[4]\n\t" +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + "sqrdmlsh v13.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v14.8h, v27.8h, v8.h[0]\n\t" +#else + "sqrdmulh v25.8h, v25.8h, v8.h[0]\n\t" + "sqrdmulh v27.8h, v27.8h, v8.h[0]\n\t" + "sub v13.8h, v13.8h, v25.8h\n\t" + "sub v14.8h, v14.8h, v27.8h\n\t" +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + "sshr v13.8h, v13.8h, #1\n\t" + "sshr v14.8h, v14.8h, #1\n\t" + "sub v26.8h, v11.8h, v15.8h\n\t" + "sub v28.8h, v12.8h, v16.8h\n\t" + "add v11.8h, v11.8h, v15.8h\n\t" + "add v12.8h, v12.8h, v16.8h\n\t" + "mul v25.8h, v26.8h, v7.h[4]\n\t" + "mul v27.8h, v28.8h, v7.h[4]\n\t" + "sqrdmulh v15.8h, v26.8h, v5.h[4]\n\t" + "sqrdmulh v16.8h, v28.8h, v5.h[4]\n\t" +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + "sqrdmlsh v15.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v16.8h, v27.8h, v8.h[0]\n\t" +#else + "sqrdmulh v25.8h, v25.8h, v8.h[0]\n\t" + "sqrdmulh v27.8h, v27.8h, v8.h[0]\n\t" + "sub v15.8h, v15.8h, v25.8h\n\t" + "sub v16.8h, v16.8h, v27.8h\n\t" 
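+        /* Both the SQRDMLSH path and this emulation leave twice the
+         * Montgomery product (the rounding-doubling multiplies carry a
+         * factor of 2); the SSHR #1 after the #endif halves it back. */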
+#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + "sshr v15.8h, v15.8h, #1\n\t" + "sshr v16.8h, v16.8h, #1\n\t" + "sub v26.8h, v17.8h, v21.8h\n\t" + "sub v28.8h, v18.8h, v22.8h\n\t" + "add v17.8h, v17.8h, v21.8h\n\t" + "add v18.8h, v18.8h, v22.8h\n\t" + "mul v25.8h, v26.8h, v7.h[5]\n\t" + "mul v27.8h, v28.8h, v7.h[5]\n\t" + "sqrdmulh v21.8h, v26.8h, v5.h[5]\n\t" + "sqrdmulh v22.8h, v28.8h, v5.h[5]\n\t" +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + "sqrdmlsh v21.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v22.8h, v27.8h, v8.h[0]\n\t" +#else + "sqrdmulh v25.8h, v25.8h, v8.h[0]\n\t" + "sqrdmulh v27.8h, v27.8h, v8.h[0]\n\t" + "sub v21.8h, v21.8h, v25.8h\n\t" + "sub v22.8h, v22.8h, v27.8h\n\t" +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + "sshr v21.8h, v21.8h, #1\n\t" + "sshr v22.8h, v22.8h, #1\n\t" + "sub v26.8h, v19.8h, v23.8h\n\t" + "sub v28.8h, v20.8h, v24.8h\n\t" + "add v19.8h, v19.8h, v23.8h\n\t" + "add v20.8h, v20.8h, v24.8h\n\t" + "mul v25.8h, v26.8h, v7.h[5]\n\t" + "mul v27.8h, v28.8h, v7.h[5]\n\t" + "sqrdmulh v23.8h, v26.8h, v5.h[5]\n\t" + "sqrdmulh v24.8h, v28.8h, v5.h[5]\n\t" +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + "sqrdmlsh v23.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v24.8h, v27.8h, v8.h[0]\n\t" +#else + "sqrdmulh v25.8h, v25.8h, v8.h[0]\n\t" + "sqrdmulh v27.8h, v27.8h, v8.h[0]\n\t" + "sub v23.8h, v23.8h, v25.8h\n\t" + "sub v24.8h, v24.8h, v27.8h\n\t" +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + "sshr v23.8h, v23.8h, #1\n\t" + "sshr v24.8h, v24.8h, #1\n\t" + "sqdmulh v25.8h, v9.8h, v8.h[2]\n\t" + "sqdmulh v26.8h, v10.8h, v8.h[2]\n\t" + "sshr v25.8h, v25.8h, #11\n\t" + "sshr v26.8h, v26.8h, #11\n\t" + "mls v9.8h, v25.8h, v8.h[0]\n\t" + "mls v10.8h, v26.8h, v8.h[0]\n\t" + "sqdmulh v25.8h, v11.8h, v8.h[2]\n\t" + "sqdmulh v26.8h, v12.8h, v8.h[2]\n\t" + "sshr v25.8h, v25.8h, #11\n\t" + "sshr v26.8h, v26.8h, #11\n\t" + "mls v11.8h, v25.8h, v8.h[0]\n\t" + "mls v12.8h, v26.8h, v8.h[0]\n\t" + "sqdmulh v25.8h, v17.8h, v8.h[2]\n\t" + "sqdmulh v26.8h, v18.8h, v8.h[2]\n\t" + "sshr v25.8h, v25.8h, #11\n\t" + "sshr v26.8h, v26.8h, #11\n\t" + "mls v17.8h, v25.8h, v8.h[0]\n\t" + "mls v18.8h, v26.8h, v8.h[0]\n\t" + "sqdmulh v25.8h, v19.8h, v8.h[2]\n\t" + "sqdmulh v26.8h, v20.8h, v8.h[2]\n\t" + "sshr v25.8h, v25.8h, #11\n\t" + "sshr v26.8h, v26.8h, #11\n\t" + "mls v19.8h, v25.8h, v8.h[0]\n\t" + "mls v20.8h, v26.8h, v8.h[0]\n\t" + "sub v26.8h, v9.8h, v17.8h\n\t" + "sub v28.8h, v10.8h, v18.8h\n\t" + "add v9.8h, v9.8h, v17.8h\n\t" + "add v10.8h, v10.8h, v18.8h\n\t" + "mul v25.8h, v26.8h, v7.h[6]\n\t" + "mul v27.8h, v28.8h, v7.h[6]\n\t" + "sqrdmulh v17.8h, v26.8h, v5.h[6]\n\t" + "sqrdmulh v18.8h, v28.8h, v5.h[6]\n\t" +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + "sqrdmlsh v17.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v18.8h, v27.8h, v8.h[0]\n\t" +#else + "sqrdmulh v25.8h, v25.8h, v8.h[0]\n\t" + "sqrdmulh v27.8h, v27.8h, v8.h[0]\n\t" + "sub v17.8h, v17.8h, v25.8h\n\t" + "sub v18.8h, v18.8h, v27.8h\n\t" +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + "sshr v17.8h, v17.8h, #1\n\t" + "sshr v18.8h, v18.8h, #1\n\t" + "sub v26.8h, v11.8h, v19.8h\n\t" + "sub v28.8h, v12.8h, v20.8h\n\t" + "add v11.8h, v11.8h, v19.8h\n\t" + "add v12.8h, v12.8h, v20.8h\n\t" + "mul v25.8h, v26.8h, v7.h[6]\n\t" + "mul v27.8h, v28.8h, v7.h[6]\n\t" + "sqrdmulh v19.8h, v26.8h, v5.h[6]\n\t" + "sqrdmulh v20.8h, v28.8h, v5.h[6]\n\t" +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + "sqrdmlsh v19.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v20.8h, v27.8h, v8.h[0]\n\t" +#else + "sqrdmulh v25.8h, v25.8h, v8.h[0]\n\t" + "sqrdmulh v27.8h, v27.8h, v8.h[0]\n\t" + "sub v19.8h, v19.8h, v25.8h\n\t" + "sub v20.8h, 
v20.8h, v27.8h\n\t" +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + "sshr v19.8h, v19.8h, #1\n\t" + "sshr v20.8h, v20.8h, #1\n\t" + "sub v26.8h, v13.8h, v21.8h\n\t" + "sub v28.8h, v14.8h, v22.8h\n\t" + "add v13.8h, v13.8h, v21.8h\n\t" + "add v14.8h, v14.8h, v22.8h\n\t" + "mul v25.8h, v26.8h, v7.h[6]\n\t" + "mul v27.8h, v28.8h, v7.h[6]\n\t" + "sqrdmulh v21.8h, v26.8h, v5.h[6]\n\t" + "sqrdmulh v22.8h, v28.8h, v5.h[6]\n\t" +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + "sqrdmlsh v21.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v22.8h, v27.8h, v8.h[0]\n\t" +#else + "sqrdmulh v25.8h, v25.8h, v8.h[0]\n\t" + "sqrdmulh v27.8h, v27.8h, v8.h[0]\n\t" + "sub v21.8h, v21.8h, v25.8h\n\t" + "sub v22.8h, v22.8h, v27.8h\n\t" +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + "sshr v21.8h, v21.8h, #1\n\t" + "sshr v22.8h, v22.8h, #1\n\t" + "sub v26.8h, v15.8h, v23.8h\n\t" + "sub v28.8h, v16.8h, v24.8h\n\t" + "add v15.8h, v15.8h, v23.8h\n\t" + "add v16.8h, v16.8h, v24.8h\n\t" + "mul v25.8h, v26.8h, v7.h[6]\n\t" + "mul v27.8h, v28.8h, v7.h[6]\n\t" + "sqrdmulh v23.8h, v26.8h, v5.h[6]\n\t" + "sqrdmulh v24.8h, v28.8h, v5.h[6]\n\t" +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + "sqrdmlsh v23.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v24.8h, v27.8h, v8.h[0]\n\t" +#else + "sqrdmulh v25.8h, v25.8h, v8.h[0]\n\t" + "sqrdmulh v27.8h, v27.8h, v8.h[0]\n\t" + "sub v23.8h, v23.8h, v25.8h\n\t" + "sub v24.8h, v24.8h, v27.8h\n\t" +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + "sshr v23.8h, v23.8h, #1\n\t" + "sshr v24.8h, v24.8h, #1\n\t" + "mul v25.8h, v9.8h, v7.h[7]\n\t" + "mul v26.8h, v10.8h, v7.h[7]\n\t" + "sqrdmulh v9.8h, v9.8h, v5.h[7]\n\t" + "sqrdmulh v10.8h, v10.8h, v5.h[7]\n\t" +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + "sqrdmlsh v9.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v10.8h, v26.8h, v8.h[0]\n\t" +#else + "sqrdmulh v25.8h, v25.8h, v8.h[0]\n\t" + "sqrdmulh v26.8h, v26.8h, v8.h[0]\n\t" + "sub v9.8h, v9.8h, v25.8h\n\t" + "sub v10.8h, v10.8h, v26.8h\n\t" +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + "sshr v9.8h, v9.8h, #1\n\t" + "sshr v10.8h, v10.8h, #1\n\t" + "mul v25.8h, v11.8h, v7.h[7]\n\t" + "mul v26.8h, v12.8h, v7.h[7]\n\t" + "sqrdmulh v11.8h, v11.8h, v5.h[7]\n\t" + "sqrdmulh v12.8h, v12.8h, v5.h[7]\n\t" +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + "sqrdmlsh v11.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v12.8h, v26.8h, v8.h[0]\n\t" +#else + "sqrdmulh v25.8h, v25.8h, v8.h[0]\n\t" + "sqrdmulh v26.8h, v26.8h, v8.h[0]\n\t" + "sub v11.8h, v11.8h, v25.8h\n\t" + "sub v12.8h, v12.8h, v26.8h\n\t" +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + "sshr v11.8h, v11.8h, #1\n\t" + "sshr v12.8h, v12.8h, #1\n\t" + "mul v25.8h, v13.8h, v7.h[7]\n\t" + "mul v26.8h, v14.8h, v7.h[7]\n\t" + "sqrdmulh v13.8h, v13.8h, v5.h[7]\n\t" + "sqrdmulh v14.8h, v14.8h, v5.h[7]\n\t" +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + "sqrdmlsh v13.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v14.8h, v26.8h, v8.h[0]\n\t" +#else + "sqrdmulh v25.8h, v25.8h, v8.h[0]\n\t" + "sqrdmulh v26.8h, v26.8h, v8.h[0]\n\t" + "sub v13.8h, v13.8h, v25.8h\n\t" + "sub v14.8h, v14.8h, v26.8h\n\t" +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + "sshr v13.8h, v13.8h, #1\n\t" + "sshr v14.8h, v14.8h, #1\n\t" + "mul v25.8h, v15.8h, v7.h[7]\n\t" + "mul v26.8h, v16.8h, v7.h[7]\n\t" + "sqrdmulh v15.8h, v15.8h, v5.h[7]\n\t" + "sqrdmulh v16.8h, v16.8h, v5.h[7]\n\t" +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + "sqrdmlsh v15.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v16.8h, v26.8h, v8.h[0]\n\t" +#else + "sqrdmulh v25.8h, v25.8h, v8.h[0]\n\t" + "sqrdmulh v26.8h, v26.8h, v8.h[0]\n\t" + "sub v15.8h, v15.8h, v25.8h\n\t" + "sub v16.8h, v16.8h, v26.8h\n\t" +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH 
*/ + "sshr v15.8h, v15.8h, #1\n\t" + "sshr v16.8h, v16.8h, #1\n\t" + "mul v25.8h, v17.8h, v7.h[7]\n\t" + "mul v26.8h, v18.8h, v7.h[7]\n\t" + "sqrdmulh v17.8h, v17.8h, v5.h[7]\n\t" + "sqrdmulh v18.8h, v18.8h, v5.h[7]\n\t" +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + "sqrdmlsh v17.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v18.8h, v26.8h, v8.h[0]\n\t" +#else + "sqrdmulh v25.8h, v25.8h, v8.h[0]\n\t" + "sqrdmulh v26.8h, v26.8h, v8.h[0]\n\t" + "sub v17.8h, v17.8h, v25.8h\n\t" + "sub v18.8h, v18.8h, v26.8h\n\t" +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + "sshr v17.8h, v17.8h, #1\n\t" + "sshr v18.8h, v18.8h, #1\n\t" + "mul v25.8h, v19.8h, v7.h[7]\n\t" + "mul v26.8h, v20.8h, v7.h[7]\n\t" + "sqrdmulh v19.8h, v19.8h, v5.h[7]\n\t" + "sqrdmulh v20.8h, v20.8h, v5.h[7]\n\t" +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + "sqrdmlsh v19.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v20.8h, v26.8h, v8.h[0]\n\t" +#else + "sqrdmulh v25.8h, v25.8h, v8.h[0]\n\t" + "sqrdmulh v26.8h, v26.8h, v8.h[0]\n\t" + "sub v19.8h, v19.8h, v25.8h\n\t" + "sub v20.8h, v20.8h, v26.8h\n\t" +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + "sshr v19.8h, v19.8h, #1\n\t" + "sshr v20.8h, v20.8h, #1\n\t" + "mul v25.8h, v21.8h, v7.h[7]\n\t" + "mul v26.8h, v22.8h, v7.h[7]\n\t" + "sqrdmulh v21.8h, v21.8h, v5.h[7]\n\t" + "sqrdmulh v22.8h, v22.8h, v5.h[7]\n\t" +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + "sqrdmlsh v21.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v22.8h, v26.8h, v8.h[0]\n\t" +#else + "sqrdmulh v25.8h, v25.8h, v8.h[0]\n\t" + "sqrdmulh v26.8h, v26.8h, v8.h[0]\n\t" + "sub v21.8h, v21.8h, v25.8h\n\t" + "sub v22.8h, v22.8h, v26.8h\n\t" +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + "sshr v21.8h, v21.8h, #1\n\t" + "sshr v22.8h, v22.8h, #1\n\t" + "mul v25.8h, v23.8h, v7.h[7]\n\t" + "mul v26.8h, v24.8h, v7.h[7]\n\t" + "sqrdmulh v23.8h, v23.8h, v5.h[7]\n\t" + "sqrdmulh v24.8h, v24.8h, v5.h[7]\n\t" +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + "sqrdmlsh v23.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v24.8h, v26.8h, v8.h[0]\n\t" +#else + "sqrdmulh v25.8h, v25.8h, v8.h[0]\n\t" + "sqrdmulh v26.8h, v26.8h, v8.h[0]\n\t" + "sub v23.8h, v23.8h, v25.8h\n\t" + "sub v24.8h, v24.8h, v26.8h\n\t" +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + "sshr v23.8h, v23.8h, #1\n\t" + "sshr v24.8h, v24.8h, #1\n\t" + "str q9, [%x[r]]\n\t" + "str q10, [%x[r], #32]\n\t" + "str q11, [%x[r], #64]\n\t" + "str q12, [%x[r], #96]\n\t" + "str q13, [%x[r], #128]\n\t" + "str q14, [%x[r], #160]\n\t" + "str q15, [%x[r], #192]\n\t" + "str q16, [%x[r], #224]\n\t" + "str q17, [x1]\n\t" + "str q18, [x1, #32]\n\t" + "str q19, [x1, #64]\n\t" + "str q20, [x1, #96]\n\t" + "str q21, [x1, #128]\n\t" + "str q22, [x1, #160]\n\t" + "str q23, [x1, #192]\n\t" + "str q24, [x1, #224]\n\t" + "ldr q9, [%x[r], #16]\n\t" + "ldr q10, [%x[r], #48]\n\t" + "ldr q11, [%x[r], #80]\n\t" + "ldr q12, [%x[r], #112]\n\t" + "ldr q13, [%x[r], #144]\n\t" + "ldr q14, [%x[r], #176]\n\t" + "ldr q15, [%x[r], #208]\n\t" + "ldr q16, [%x[r], #240]\n\t" + "ldr q17, [x1, #16]\n\t" + "ldr q18, [x1, #48]\n\t" + "ldr q19, [x1, #80]\n\t" + "ldr q20, [x1, #112]\n\t" + "ldr q21, [x1, #144]\n\t" + "ldr q22, [x1, #176]\n\t" + "ldr q23, [x1, #208]\n\t" + "ldr q24, [x1, #240]\n\t" + "sub v26.8h, v9.8h, v10.8h\n\t" + "sub v28.8h, v11.8h, v12.8h\n\t" + "add v9.8h, v9.8h, v10.8h\n\t" + "add v11.8h, v11.8h, v12.8h\n\t" + "mul v25.8h, v26.8h, v6.h[0]\n\t" + "mul v27.8h, v28.8h, v6.h[1]\n\t" + "sqrdmulh v10.8h, v26.8h, v4.h[0]\n\t" + "sqrdmulh v12.8h, v28.8h, v4.h[1]\n\t" +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + "sqrdmlsh v10.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v12.8h, v27.8h, v8.h[0]\n\t" +#else + 
"sqrdmulh v25.8h, v25.8h, v8.h[0]\n\t" + "sqrdmulh v27.8h, v27.8h, v8.h[0]\n\t" + "sub v10.8h, v10.8h, v25.8h\n\t" + "sub v12.8h, v12.8h, v27.8h\n\t" +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + "sshr v10.8h, v10.8h, #1\n\t" + "sshr v12.8h, v12.8h, #1\n\t" + "sub v26.8h, v13.8h, v14.8h\n\t" + "sub v28.8h, v15.8h, v16.8h\n\t" + "add v13.8h, v13.8h, v14.8h\n\t" + "add v15.8h, v15.8h, v16.8h\n\t" + "mul v25.8h, v26.8h, v6.h[2]\n\t" + "mul v27.8h, v28.8h, v6.h[3]\n\t" + "sqrdmulh v14.8h, v26.8h, v4.h[2]\n\t" + "sqrdmulh v16.8h, v28.8h, v4.h[3]\n\t" +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + "sqrdmlsh v14.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v16.8h, v27.8h, v8.h[0]\n\t" +#else + "sqrdmulh v25.8h, v25.8h, v8.h[0]\n\t" + "sqrdmulh v27.8h, v27.8h, v8.h[0]\n\t" + "sub v14.8h, v14.8h, v25.8h\n\t" + "sub v16.8h, v16.8h, v27.8h\n\t" +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + "sshr v14.8h, v14.8h, #1\n\t" + "sshr v16.8h, v16.8h, #1\n\t" + "sub v26.8h, v17.8h, v18.8h\n\t" + "sub v28.8h, v19.8h, v20.8h\n\t" + "add v17.8h, v17.8h, v18.8h\n\t" + "add v19.8h, v19.8h, v20.8h\n\t" + "mul v25.8h, v26.8h, v6.h[4]\n\t" + "mul v27.8h, v28.8h, v6.h[5]\n\t" + "sqrdmulh v18.8h, v26.8h, v4.h[4]\n\t" + "sqrdmulh v20.8h, v28.8h, v4.h[5]\n\t" +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + "sqrdmlsh v18.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v20.8h, v27.8h, v8.h[0]\n\t" +#else + "sqrdmulh v25.8h, v25.8h, v8.h[0]\n\t" + "sqrdmulh v27.8h, v27.8h, v8.h[0]\n\t" + "sub v18.8h, v18.8h, v25.8h\n\t" + "sub v20.8h, v20.8h, v27.8h\n\t" +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + "sshr v18.8h, v18.8h, #1\n\t" + "sshr v20.8h, v20.8h, #1\n\t" + "sub v26.8h, v21.8h, v22.8h\n\t" + "sub v28.8h, v23.8h, v24.8h\n\t" + "add v21.8h, v21.8h, v22.8h\n\t" + "add v23.8h, v23.8h, v24.8h\n\t" + "mul v25.8h, v26.8h, v6.h[6]\n\t" + "mul v27.8h, v28.8h, v6.h[7]\n\t" + "sqrdmulh v22.8h, v26.8h, v4.h[6]\n\t" + "sqrdmulh v24.8h, v28.8h, v4.h[7]\n\t" +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + "sqrdmlsh v22.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v24.8h, v27.8h, v8.h[0]\n\t" +#else + "sqrdmulh v25.8h, v25.8h, v8.h[0]\n\t" + "sqrdmulh v27.8h, v27.8h, v8.h[0]\n\t" + "sub v22.8h, v22.8h, v25.8h\n\t" + "sub v24.8h, v24.8h, v27.8h\n\t" +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + "sshr v22.8h, v22.8h, #1\n\t" + "sshr v24.8h, v24.8h, #1\n\t" + "sub v26.8h, v9.8h, v11.8h\n\t" + "sub v28.8h, v10.8h, v12.8h\n\t" + "add v9.8h, v9.8h, v11.8h\n\t" + "add v10.8h, v10.8h, v12.8h\n\t" + "mul v25.8h, v26.8h, v7.h[0]\n\t" + "mul v27.8h, v28.8h, v7.h[0]\n\t" + "sqrdmulh v11.8h, v26.8h, v5.h[0]\n\t" + "sqrdmulh v12.8h, v28.8h, v5.h[0]\n\t" +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + "sqrdmlsh v11.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v12.8h, v27.8h, v8.h[0]\n\t" +#else + "sqrdmulh v25.8h, v25.8h, v8.h[0]\n\t" + "sqrdmulh v27.8h, v27.8h, v8.h[0]\n\t" + "sub v11.8h, v11.8h, v25.8h\n\t" + "sub v12.8h, v12.8h, v27.8h\n\t" +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + "sshr v11.8h, v11.8h, #1\n\t" + "sshr v12.8h, v12.8h, #1\n\t" + "sub v26.8h, v13.8h, v15.8h\n\t" + "sub v28.8h, v14.8h, v16.8h\n\t" + "add v13.8h, v13.8h, v15.8h\n\t" + "add v14.8h, v14.8h, v16.8h\n\t" + "mul v25.8h, v26.8h, v7.h[1]\n\t" + "mul v27.8h, v28.8h, v7.h[1]\n\t" + "sqrdmulh v15.8h, v26.8h, v5.h[1]\n\t" + "sqrdmulh v16.8h, v28.8h, v5.h[1]\n\t" +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + "sqrdmlsh v15.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v16.8h, v27.8h, v8.h[0]\n\t" +#else + "sqrdmulh v25.8h, v25.8h, v8.h[0]\n\t" + "sqrdmulh v27.8h, v27.8h, v8.h[0]\n\t" + "sub v15.8h, v15.8h, v25.8h\n\t" + "sub v16.8h, v16.8h, v27.8h\n\t" +#endif /* 
!WOLFSSL_AARCH64_NO_SQRMLSH */ + "sshr v15.8h, v15.8h, #1\n\t" + "sshr v16.8h, v16.8h, #1\n\t" + "sub v26.8h, v17.8h, v19.8h\n\t" + "sub v28.8h, v18.8h, v20.8h\n\t" + "add v17.8h, v17.8h, v19.8h\n\t" + "add v18.8h, v18.8h, v20.8h\n\t" + "mul v25.8h, v26.8h, v7.h[2]\n\t" + "mul v27.8h, v28.8h, v7.h[2]\n\t" + "sqrdmulh v19.8h, v26.8h, v5.h[2]\n\t" + "sqrdmulh v20.8h, v28.8h, v5.h[2]\n\t" +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + "sqrdmlsh v19.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v20.8h, v27.8h, v8.h[0]\n\t" +#else + "sqrdmulh v25.8h, v25.8h, v8.h[0]\n\t" + "sqrdmulh v27.8h, v27.8h, v8.h[0]\n\t" + "sub v19.8h, v19.8h, v25.8h\n\t" + "sub v20.8h, v20.8h, v27.8h\n\t" +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + "sshr v19.8h, v19.8h, #1\n\t" + "sshr v20.8h, v20.8h, #1\n\t" + "sub v26.8h, v21.8h, v23.8h\n\t" + "sub v28.8h, v22.8h, v24.8h\n\t" + "add v21.8h, v21.8h, v23.8h\n\t" + "add v22.8h, v22.8h, v24.8h\n\t" + "mul v25.8h, v26.8h, v7.h[3]\n\t" + "mul v27.8h, v28.8h, v7.h[3]\n\t" + "sqrdmulh v23.8h, v26.8h, v5.h[3]\n\t" + "sqrdmulh v24.8h, v28.8h, v5.h[3]\n\t" +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + "sqrdmlsh v23.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v24.8h, v27.8h, v8.h[0]\n\t" +#else + "sqrdmulh v25.8h, v25.8h, v8.h[0]\n\t" + "sqrdmulh v27.8h, v27.8h, v8.h[0]\n\t" + "sub v23.8h, v23.8h, v25.8h\n\t" + "sub v24.8h, v24.8h, v27.8h\n\t" +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + "sshr v23.8h, v23.8h, #1\n\t" + "sshr v24.8h, v24.8h, #1\n\t" + "sub v26.8h, v9.8h, v13.8h\n\t" + "sub v28.8h, v10.8h, v14.8h\n\t" + "add v9.8h, v9.8h, v13.8h\n\t" + "add v10.8h, v10.8h, v14.8h\n\t" + "mul v25.8h, v26.8h, v7.h[4]\n\t" + "mul v27.8h, v28.8h, v7.h[4]\n\t" + "sqrdmulh v13.8h, v26.8h, v5.h[4]\n\t" + "sqrdmulh v14.8h, v28.8h, v5.h[4]\n\t" +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + "sqrdmlsh v13.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v14.8h, v27.8h, v8.h[0]\n\t" +#else + "sqrdmulh v25.8h, v25.8h, v8.h[0]\n\t" + "sqrdmulh v27.8h, v27.8h, v8.h[0]\n\t" + "sub v13.8h, v13.8h, v25.8h\n\t" + "sub v14.8h, v14.8h, v27.8h\n\t" +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + "sshr v13.8h, v13.8h, #1\n\t" + "sshr v14.8h, v14.8h, #1\n\t" + "sub v26.8h, v11.8h, v15.8h\n\t" + "sub v28.8h, v12.8h, v16.8h\n\t" + "add v11.8h, v11.8h, v15.8h\n\t" + "add v12.8h, v12.8h, v16.8h\n\t" + "mul v25.8h, v26.8h, v7.h[4]\n\t" + "mul v27.8h, v28.8h, v7.h[4]\n\t" + "sqrdmulh v15.8h, v26.8h, v5.h[4]\n\t" + "sqrdmulh v16.8h, v28.8h, v5.h[4]\n\t" +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + "sqrdmlsh v15.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v16.8h, v27.8h, v8.h[0]\n\t" +#else + "sqrdmulh v25.8h, v25.8h, v8.h[0]\n\t" + "sqrdmulh v27.8h, v27.8h, v8.h[0]\n\t" + "sub v15.8h, v15.8h, v25.8h\n\t" + "sub v16.8h, v16.8h, v27.8h\n\t" +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + "sshr v15.8h, v15.8h, #1\n\t" + "sshr v16.8h, v16.8h, #1\n\t" + "sub v26.8h, v17.8h, v21.8h\n\t" + "sub v28.8h, v18.8h, v22.8h\n\t" + "add v17.8h, v17.8h, v21.8h\n\t" + "add v18.8h, v18.8h, v22.8h\n\t" + "mul v25.8h, v26.8h, v7.h[5]\n\t" + "mul v27.8h, v28.8h, v7.h[5]\n\t" + "sqrdmulh v21.8h, v26.8h, v5.h[5]\n\t" + "sqrdmulh v22.8h, v28.8h, v5.h[5]\n\t" +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + "sqrdmlsh v21.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v22.8h, v27.8h, v8.h[0]\n\t" +#else + "sqrdmulh v25.8h, v25.8h, v8.h[0]\n\t" + "sqrdmulh v27.8h, v27.8h, v8.h[0]\n\t" + "sub v21.8h, v21.8h, v25.8h\n\t" + "sub v22.8h, v22.8h, v27.8h\n\t" +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + "sshr v21.8h, v21.8h, #1\n\t" + "sshr v22.8h, v22.8h, #1\n\t" + "sub v26.8h, v19.8h, v23.8h\n\t" + "sub v28.8h, v20.8h, v24.8h\n\t" + "add 
v19.8h, v19.8h, v23.8h\n\t" + "add v20.8h, v20.8h, v24.8h\n\t" + "mul v25.8h, v26.8h, v7.h[5]\n\t" + "mul v27.8h, v28.8h, v7.h[5]\n\t" + "sqrdmulh v23.8h, v26.8h, v5.h[5]\n\t" + "sqrdmulh v24.8h, v28.8h, v5.h[5]\n\t" +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + "sqrdmlsh v23.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v24.8h, v27.8h, v8.h[0]\n\t" +#else + "sqrdmulh v25.8h, v25.8h, v8.h[0]\n\t" + "sqrdmulh v27.8h, v27.8h, v8.h[0]\n\t" + "sub v23.8h, v23.8h, v25.8h\n\t" + "sub v24.8h, v24.8h, v27.8h\n\t" +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + "sshr v23.8h, v23.8h, #1\n\t" + "sshr v24.8h, v24.8h, #1\n\t" + "sqdmulh v25.8h, v9.8h, v8.h[2]\n\t" + "sqdmulh v26.8h, v10.8h, v8.h[2]\n\t" + "sshr v25.8h, v25.8h, #11\n\t" + "sshr v26.8h, v26.8h, #11\n\t" + "mls v9.8h, v25.8h, v8.h[0]\n\t" + "mls v10.8h, v26.8h, v8.h[0]\n\t" + "sqdmulh v25.8h, v11.8h, v8.h[2]\n\t" + "sqdmulh v26.8h, v12.8h, v8.h[2]\n\t" + "sshr v25.8h, v25.8h, #11\n\t" + "sshr v26.8h, v26.8h, #11\n\t" + "mls v11.8h, v25.8h, v8.h[0]\n\t" + "mls v12.8h, v26.8h, v8.h[0]\n\t" + "sqdmulh v25.8h, v17.8h, v8.h[2]\n\t" + "sqdmulh v26.8h, v18.8h, v8.h[2]\n\t" + "sshr v25.8h, v25.8h, #11\n\t" + "sshr v26.8h, v26.8h, #11\n\t" + "mls v17.8h, v25.8h, v8.h[0]\n\t" + "mls v18.8h, v26.8h, v8.h[0]\n\t" + "sqdmulh v25.8h, v19.8h, v8.h[2]\n\t" + "sqdmulh v26.8h, v20.8h, v8.h[2]\n\t" + "sshr v25.8h, v25.8h, #11\n\t" + "sshr v26.8h, v26.8h, #11\n\t" + "mls v19.8h, v25.8h, v8.h[0]\n\t" + "mls v20.8h, v26.8h, v8.h[0]\n\t" + "sub v26.8h, v9.8h, v17.8h\n\t" + "sub v28.8h, v10.8h, v18.8h\n\t" + "add v9.8h, v9.8h, v17.8h\n\t" + "add v10.8h, v10.8h, v18.8h\n\t" + "mul v25.8h, v26.8h, v7.h[6]\n\t" + "mul v27.8h, v28.8h, v7.h[6]\n\t" + "sqrdmulh v17.8h, v26.8h, v5.h[6]\n\t" + "sqrdmulh v18.8h, v28.8h, v5.h[6]\n\t" +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + "sqrdmlsh v17.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v18.8h, v27.8h, v8.h[0]\n\t" +#else + "sqrdmulh v25.8h, v25.8h, v8.h[0]\n\t" + "sqrdmulh v27.8h, v27.8h, v8.h[0]\n\t" + "sub v17.8h, v17.8h, v25.8h\n\t" + "sub v18.8h, v18.8h, v27.8h\n\t" +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + "sshr v17.8h, v17.8h, #1\n\t" + "sshr v18.8h, v18.8h, #1\n\t" + "sub v26.8h, v11.8h, v19.8h\n\t" + "sub v28.8h, v12.8h, v20.8h\n\t" + "add v11.8h, v11.8h, v19.8h\n\t" + "add v12.8h, v12.8h, v20.8h\n\t" + "mul v25.8h, v26.8h, v7.h[6]\n\t" + "mul v27.8h, v28.8h, v7.h[6]\n\t" + "sqrdmulh v19.8h, v26.8h, v5.h[6]\n\t" + "sqrdmulh v20.8h, v28.8h, v5.h[6]\n\t" +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + "sqrdmlsh v19.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v20.8h, v27.8h, v8.h[0]\n\t" +#else + "sqrdmulh v25.8h, v25.8h, v8.h[0]\n\t" + "sqrdmulh v27.8h, v27.8h, v8.h[0]\n\t" + "sub v19.8h, v19.8h, v25.8h\n\t" + "sub v20.8h, v20.8h, v27.8h\n\t" +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + "sshr v19.8h, v19.8h, #1\n\t" + "sshr v20.8h, v20.8h, #1\n\t" + "sub v26.8h, v13.8h, v21.8h\n\t" + "sub v28.8h, v14.8h, v22.8h\n\t" + "add v13.8h, v13.8h, v21.8h\n\t" + "add v14.8h, v14.8h, v22.8h\n\t" + "mul v25.8h, v26.8h, v7.h[6]\n\t" + "mul v27.8h, v28.8h, v7.h[6]\n\t" + "sqrdmulh v21.8h, v26.8h, v5.h[6]\n\t" + "sqrdmulh v22.8h, v28.8h, v5.h[6]\n\t" +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + "sqrdmlsh v21.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v22.8h, v27.8h, v8.h[0]\n\t" +#else + "sqrdmulh v25.8h, v25.8h, v8.h[0]\n\t" + "sqrdmulh v27.8h, v27.8h, v8.h[0]\n\t" + "sub v21.8h, v21.8h, v25.8h\n\t" + "sub v22.8h, v22.8h, v27.8h\n\t" +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + "sshr v21.8h, v21.8h, #1\n\t" + "sshr v22.8h, v22.8h, #1\n\t" + "sub v26.8h, v15.8h, v23.8h\n\t" + "sub v28.8h, 
v16.8h, v24.8h\n\t" + "add v15.8h, v15.8h, v23.8h\n\t" + "add v16.8h, v16.8h, v24.8h\n\t" + "mul v25.8h, v26.8h, v7.h[6]\n\t" + "mul v27.8h, v28.8h, v7.h[6]\n\t" + "sqrdmulh v23.8h, v26.8h, v5.h[6]\n\t" + "sqrdmulh v24.8h, v28.8h, v5.h[6]\n\t" +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + "sqrdmlsh v23.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v24.8h, v27.8h, v8.h[0]\n\t" +#else + "sqrdmulh v25.8h, v25.8h, v8.h[0]\n\t" + "sqrdmulh v27.8h, v27.8h, v8.h[0]\n\t" + "sub v23.8h, v23.8h, v25.8h\n\t" + "sub v24.8h, v24.8h, v27.8h\n\t" +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + "sshr v23.8h, v23.8h, #1\n\t" + "sshr v24.8h, v24.8h, #1\n\t" + "mul v25.8h, v9.8h, v7.h[7]\n\t" + "mul v26.8h, v10.8h, v7.h[7]\n\t" + "sqrdmulh v9.8h, v9.8h, v5.h[7]\n\t" + "sqrdmulh v10.8h, v10.8h, v5.h[7]\n\t" +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + "sqrdmlsh v9.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v10.8h, v26.8h, v8.h[0]\n\t" +#else + "sqrdmulh v25.8h, v25.8h, v8.h[0]\n\t" + "sqrdmulh v26.8h, v26.8h, v8.h[0]\n\t" + "sub v9.8h, v9.8h, v25.8h\n\t" + "sub v10.8h, v10.8h, v26.8h\n\t" +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + "sshr v9.8h, v9.8h, #1\n\t" + "sshr v10.8h, v10.8h, #1\n\t" + "mul v25.8h, v11.8h, v7.h[7]\n\t" + "mul v26.8h, v12.8h, v7.h[7]\n\t" + "sqrdmulh v11.8h, v11.8h, v5.h[7]\n\t" + "sqrdmulh v12.8h, v12.8h, v5.h[7]\n\t" +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + "sqrdmlsh v11.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v12.8h, v26.8h, v8.h[0]\n\t" +#else + "sqrdmulh v25.8h, v25.8h, v8.h[0]\n\t" + "sqrdmulh v26.8h, v26.8h, v8.h[0]\n\t" + "sub v11.8h, v11.8h, v25.8h\n\t" + "sub v12.8h, v12.8h, v26.8h\n\t" +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + "sshr v11.8h, v11.8h, #1\n\t" + "sshr v12.8h, v12.8h, #1\n\t" + "mul v25.8h, v13.8h, v7.h[7]\n\t" + "mul v26.8h, v14.8h, v7.h[7]\n\t" + "sqrdmulh v13.8h, v13.8h, v5.h[7]\n\t" + "sqrdmulh v14.8h, v14.8h, v5.h[7]\n\t" +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + "sqrdmlsh v13.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v14.8h, v26.8h, v8.h[0]\n\t" +#else + "sqrdmulh v25.8h, v25.8h, v8.h[0]\n\t" + "sqrdmulh v26.8h, v26.8h, v8.h[0]\n\t" + "sub v13.8h, v13.8h, v25.8h\n\t" + "sub v14.8h, v14.8h, v26.8h\n\t" +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + "sshr v13.8h, v13.8h, #1\n\t" + "sshr v14.8h, v14.8h, #1\n\t" + "mul v25.8h, v15.8h, v7.h[7]\n\t" + "mul v26.8h, v16.8h, v7.h[7]\n\t" + "sqrdmulh v15.8h, v15.8h, v5.h[7]\n\t" + "sqrdmulh v16.8h, v16.8h, v5.h[7]\n\t" +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + "sqrdmlsh v15.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v16.8h, v26.8h, v8.h[0]\n\t" +#else + "sqrdmulh v25.8h, v25.8h, v8.h[0]\n\t" + "sqrdmulh v26.8h, v26.8h, v8.h[0]\n\t" + "sub v15.8h, v15.8h, v25.8h\n\t" + "sub v16.8h, v16.8h, v26.8h\n\t" +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + "sshr v15.8h, v15.8h, #1\n\t" + "sshr v16.8h, v16.8h, #1\n\t" + "mul v25.8h, v17.8h, v7.h[7]\n\t" + "mul v26.8h, v18.8h, v7.h[7]\n\t" + "sqrdmulh v17.8h, v17.8h, v5.h[7]\n\t" + "sqrdmulh v18.8h, v18.8h, v5.h[7]\n\t" +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + "sqrdmlsh v17.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v18.8h, v26.8h, v8.h[0]\n\t" +#else + "sqrdmulh v25.8h, v25.8h, v8.h[0]\n\t" + "sqrdmulh v26.8h, v26.8h, v8.h[0]\n\t" + "sub v17.8h, v17.8h, v25.8h\n\t" + "sub v18.8h, v18.8h, v26.8h\n\t" +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + "sshr v17.8h, v17.8h, #1\n\t" + "sshr v18.8h, v18.8h, #1\n\t" + "mul v25.8h, v19.8h, v7.h[7]\n\t" + "mul v26.8h, v20.8h, v7.h[7]\n\t" + "sqrdmulh v19.8h, v19.8h, v5.h[7]\n\t" + "sqrdmulh v20.8h, v20.8h, v5.h[7]\n\t" +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + "sqrdmlsh v19.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v20.8h, 
v26.8h, v8.h[0]\n\t" +#else + "sqrdmulh v25.8h, v25.8h, v8.h[0]\n\t" + "sqrdmulh v26.8h, v26.8h, v8.h[0]\n\t" + "sub v19.8h, v19.8h, v25.8h\n\t" + "sub v20.8h, v20.8h, v26.8h\n\t" +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + "sshr v19.8h, v19.8h, #1\n\t" + "sshr v20.8h, v20.8h, #1\n\t" + "mul v25.8h, v21.8h, v7.h[7]\n\t" + "mul v26.8h, v22.8h, v7.h[7]\n\t" + "sqrdmulh v21.8h, v21.8h, v5.h[7]\n\t" + "sqrdmulh v22.8h, v22.8h, v5.h[7]\n\t" +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + "sqrdmlsh v21.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v22.8h, v26.8h, v8.h[0]\n\t" +#else + "sqrdmulh v25.8h, v25.8h, v8.h[0]\n\t" + "sqrdmulh v26.8h, v26.8h, v8.h[0]\n\t" + "sub v21.8h, v21.8h, v25.8h\n\t" + "sub v22.8h, v22.8h, v26.8h\n\t" +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + "sshr v21.8h, v21.8h, #1\n\t" + "sshr v22.8h, v22.8h, #1\n\t" + "mul v25.8h, v23.8h, v7.h[7]\n\t" + "mul v26.8h, v24.8h, v7.h[7]\n\t" + "sqrdmulh v23.8h, v23.8h, v5.h[7]\n\t" + "sqrdmulh v24.8h, v24.8h, v5.h[7]\n\t" +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + "sqrdmlsh v23.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v24.8h, v26.8h, v8.h[0]\n\t" +#else + "sqrdmulh v25.8h, v25.8h, v8.h[0]\n\t" + "sqrdmulh v26.8h, v26.8h, v8.h[0]\n\t" + "sub v23.8h, v23.8h, v25.8h\n\t" + "sub v24.8h, v24.8h, v26.8h\n\t" +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + "sshr v23.8h, v23.8h, #1\n\t" + "sshr v24.8h, v24.8h, #1\n\t" + "str q9, [%x[r], #16]\n\t" + "str q10, [%x[r], #48]\n\t" + "str q11, [%x[r], #80]\n\t" + "str q12, [%x[r], #112]\n\t" + "str q13, [%x[r], #144]\n\t" + "str q14, [%x[r], #176]\n\t" + "str q15, [%x[r], #208]\n\t" + "str q16, [%x[r], #240]\n\t" + "str q17, [x1, #16]\n\t" + "str q18, [x1, #48]\n\t" + "str q19, [x1, #80]\n\t" + "str q20, [x1, #112]\n\t" + "str q21, [x1, #144]\n\t" + "str q22, [x1, #176]\n\t" + "str q23, [x1, #208]\n\t" + "str q24, [x1, #240]\n\t" + : [r] "+r" (r) + : [L_kyber_aarch64_q] "S" (L_kyber_aarch64_q), [L_kyber_aarch64_consts] "S" (L_kyber_aarch64_consts), [L_sha3_aarch64_r] "S" (L_sha3_aarch64_r), [L_kyber_aarch64_zetas] "S" (L_kyber_aarch64_zetas), [L_kyber_aarch64_zetas_qinv] "S" (L_kyber_aarch64_zetas_qinv), [L_kyber_aarch64_zetas_inv] "S" (L_kyber_aarch64_zetas_inv), [L_kyber_aarch64_zetas_inv_qinv] "S" (L_kyber_aarch64_zetas_inv_qinv) + : "memory", "x1", "x2", "x3", "x4", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "cc" + ); +} + +static const uint16_t L_kyber_aarch64_zetas_mul[] = { + 0x8b2, + 0xf74e, + 0x1ae, + 0xfe52, + 0x22b, + 0xfdd5, + 0x34b, + 0xfcb5, + 0x81e, + 0xf7e2, + 0x367, + 0xfc99, + 0x60e, + 0xf9f2, + 0x69, + 0xff97, + 0x1a6, + 0xfe5a, + 0x24b, + 0xfdb5, + 0xb1, + 0xff4f, + 0xc16, + 0xf3ea, + 0xbde, + 0xf422, + 0xb35, + 0xf4cb, + 0x626, + 0xf9da, + 0x675, + 0xf98b, + 0xc0b, + 0xf3f5, + 0x30a, + 0xfcf6, + 0x487, + 0xfb79, + 0xc6e, + 0xf392, + 0x9f8, + 0xf608, + 0x5cb, + 0xfa35, + 0xaa7, + 0xf559, + 0x45f, + 0xfba1, + 0x6cb, + 0xf935, + 0x284, + 0xfd7c, + 0x999, + 0xf667, + 0x15d, + 0xfea3, + 0x1a2, + 0xfe5e, + 0x149, + 0xfeb7, + 0xc65, + 0xf39b, + 0xcb6, + 0xf34a, + 0x331, + 0xfccf, + 0x449, + 0xfbb7, + 0x25b, + 0xfda5, + 0x262, + 0xfd9e, + 0x52a, + 0xfad6, + 0x7fc, + 0xf804, + 0x748, + 0xf8b8, + 0x180, + 0xfe80, + 0x842, + 0xf7be, + 0xc79, + 0xf387, + 0x4c2, + 0xfb3e, + 0x7ca, + 0xf836, + 0x997, + 0xf669, + 0xdc, + 0xff24, + 0x85e, + 0xf7a2, + 0x686, + 0xf97a, + 0x860, + 0xf7a0, + 0x707, + 0xf8f9, + 0x803, + 0xf7fd, + 0x31a, + 0xfce6, + 0x71b, + 0xf8e5, + 0x9ab, + 
0xf655, + 0x99b, + 0xf665, + 0x1de, + 0xfe22, + 0xc95, + 0xf36b, + 0xbcd, + 0xf433, + 0x3e4, + 0xfc1c, + 0x3df, + 0xfc21, + 0x3be, + 0xfc42, + 0x74d, + 0xf8b3, + 0x5f2, + 0xfa0e, + 0x65c, + 0xf9a4, +}; + +void kyber_basemul_mont(sword16* r, const sword16* a, const sword16* b) +{ + __asm__ __volatile__ ( +#ifndef __APPLE__ + "adrp x3, %[L_kyber_aarch64_zetas_mul]\n\t" + "add x3, x3, :lo12:%[L_kyber_aarch64_zetas_mul]\n\t" +#else + "adrp x3, %[L_kyber_aarch64_zetas_mul]@PAGE\n\t" + "add x3, x3, %[L_kyber_aarch64_zetas_mul]@PAGEOFF\n\t" +#endif /* __APPLE__ */ +#ifndef __APPLE__ + "adrp x4, %[L_kyber_aarch64_consts]\n\t" + "add x4, x4, :lo12:%[L_kyber_aarch64_consts]\n\t" +#else + "adrp x4, %[L_kyber_aarch64_consts]@PAGE\n\t" + "add x4, x4, %[L_kyber_aarch64_consts]@PAGEOFF\n\t" +#endif /* __APPLE__ */ + "ldr q1, [x4]\n\t" + "ldp q2, q3, [%x[a]]\n\t" + "ldp q4, q5, [%x[a], #32]\n\t" + "ldp q6, q7, [%x[a], #64]\n\t" + "ldp q8, q9, [%x[a], #96]\n\t" + "ldp q10, q11, [%x[b]]\n\t" + "ldp q12, q13, [%x[b], #32]\n\t" + "ldp q14, q15, [%x[b], #64]\n\t" + "ldp q16, q17, [%x[b], #96]\n\t" + "ldr q0, [x3]\n\t" + "uzp1 v18.8h, v2.8h, v3.8h\n\t" + "uzp2 v19.8h, v2.8h, v3.8h\n\t" + "uzp1 v20.8h, v10.8h, v11.8h\n\t" + "uzp2 v21.8h, v10.8h, v11.8h\n\t" + "smull v26.4s, v18.4h, v20.4h\n\t" + "smull2 v27.4s, v18.8h, v20.8h\n\t" + "smull v23.4s, v19.4h, v21.4h\n\t" + "smull2 v24.4s, v19.8h, v21.8h\n\t" + "xtn v25.4h, v23.4s\n\t" + "xtn2 v25.8h, v24.4s\n\t" + "mul v25.8h, v25.8h, v1.h[1]\n\t" + "smlsl v23.4s, v25.4h, v1.h[0]\n\t" + "smlsl2 v24.4s, v25.8h, v1.h[0]\n\t" + "shrn v22.4h, v23.4s, #16\n\t" + "shrn2 v22.8h, v24.4s, #16\n\t" + "smlal v26.4s, v22.4h, v0.4h\n\t" + "smlal2 v27.4s, v22.8h, v0.8h\n\t" + "xtn v24.4h, v26.4s\n\t" + "xtn2 v24.8h, v27.4s\n\t" + "mul v24.8h, v24.8h, v1.h[1]\n\t" + "smlsl v26.4s, v24.4h, v1.h[0]\n\t" + "smlsl2 v27.4s, v24.8h, v1.h[0]\n\t" + "shrn v22.4h, v26.4s, #16\n\t" + "shrn2 v22.8h, v27.4s, #16\n\t" + "smull v26.4s, v18.4h, v21.4h\n\t" + "smull2 v27.4s, v18.8h, v21.8h\n\t" + "smlal v26.4s, v19.4h, v20.4h\n\t" + "smlal2 v27.4s, v19.8h, v20.8h\n\t" + "xtn v24.4h, v26.4s\n\t" + "xtn2 v24.8h, v27.4s\n\t" + "mul v24.8h, v24.8h, v1.h[1]\n\t" + "smlsl v26.4s, v24.4h, v1.h[0]\n\t" + "smlsl2 v27.4s, v24.8h, v1.h[0]\n\t" + "shrn v23.4h, v26.4s, #16\n\t" + "shrn2 v23.8h, v27.4s, #16\n\t" + "zip1 v24.8h, v22.8h, v23.8h\n\t" + "zip2 v25.8h, v22.8h, v23.8h\n\t" + "stp q24, q25, [%x[r]]\n\t" + "ldr q0, [x3, #16]\n\t" + "uzp1 v18.8h, v4.8h, v5.8h\n\t" + "uzp2 v19.8h, v4.8h, v5.8h\n\t" + "uzp1 v20.8h, v12.8h, v13.8h\n\t" + "uzp2 v21.8h, v12.8h, v13.8h\n\t" + "smull v26.4s, v18.4h, v20.4h\n\t" + "smull2 v27.4s, v18.8h, v20.8h\n\t" + "smull v23.4s, v19.4h, v21.4h\n\t" + "smull2 v24.4s, v19.8h, v21.8h\n\t" + "xtn v25.4h, v23.4s\n\t" + "xtn2 v25.8h, v24.4s\n\t" + "mul v25.8h, v25.8h, v1.h[1]\n\t" + "smlsl v23.4s, v25.4h, v1.h[0]\n\t" + "smlsl2 v24.4s, v25.8h, v1.h[0]\n\t" + "shrn v22.4h, v23.4s, #16\n\t" + "shrn2 v22.8h, v24.4s, #16\n\t" + "smlal v26.4s, v22.4h, v0.4h\n\t" + "smlal2 v27.4s, v22.8h, v0.8h\n\t" + "xtn v24.4h, v26.4s\n\t" + "xtn2 v24.8h, v27.4s\n\t" + "mul v24.8h, v24.8h, v1.h[1]\n\t" + "smlsl v26.4s, v24.4h, v1.h[0]\n\t" + "smlsl2 v27.4s, v24.8h, v1.h[0]\n\t" + "shrn v22.4h, v26.4s, #16\n\t" + "shrn2 v22.8h, v27.4s, #16\n\t" + "smull v26.4s, v18.4h, v21.4h\n\t" + "smull2 v27.4s, v18.8h, v21.8h\n\t" + "smlal v26.4s, v19.4h, v20.4h\n\t" + "smlal2 v27.4s, v19.8h, v20.8h\n\t" + "xtn v24.4h, v26.4s\n\t" + "xtn2 v24.8h, v27.4s\n\t" + "mul v24.8h, v24.8h, v1.h[1]\n\t" + "smlsl v26.4s, 
v24.4h, v1.h[0]\n\t" + "smlsl2 v27.4s, v24.8h, v1.h[0]\n\t" + "shrn v23.4h, v26.4s, #16\n\t" + "shrn2 v23.8h, v27.4s, #16\n\t" + "zip1 v24.8h, v22.8h, v23.8h\n\t" + "zip2 v25.8h, v22.8h, v23.8h\n\t" + "stp q24, q25, [%x[r], #32]\n\t" + "ldr q0, [x3, #32]\n\t" + "uzp1 v18.8h, v6.8h, v7.8h\n\t" + "uzp2 v19.8h, v6.8h, v7.8h\n\t" + "uzp1 v20.8h, v14.8h, v15.8h\n\t" + "uzp2 v21.8h, v14.8h, v15.8h\n\t" + "smull v26.4s, v18.4h, v20.4h\n\t" + "smull2 v27.4s, v18.8h, v20.8h\n\t" + "smull v23.4s, v19.4h, v21.4h\n\t" + "smull2 v24.4s, v19.8h, v21.8h\n\t" + "xtn v25.4h, v23.4s\n\t" + "xtn2 v25.8h, v24.4s\n\t" + "mul v25.8h, v25.8h, v1.h[1]\n\t" + "smlsl v23.4s, v25.4h, v1.h[0]\n\t" + "smlsl2 v24.4s, v25.8h, v1.h[0]\n\t" + "shrn v22.4h, v23.4s, #16\n\t" + "shrn2 v22.8h, v24.4s, #16\n\t" + "smlal v26.4s, v22.4h, v0.4h\n\t" + "smlal2 v27.4s, v22.8h, v0.8h\n\t" + "xtn v24.4h, v26.4s\n\t" + "xtn2 v24.8h, v27.4s\n\t" + "mul v24.8h, v24.8h, v1.h[1]\n\t" + "smlsl v26.4s, v24.4h, v1.h[0]\n\t" + "smlsl2 v27.4s, v24.8h, v1.h[0]\n\t" + "shrn v22.4h, v26.4s, #16\n\t" + "shrn2 v22.8h, v27.4s, #16\n\t" + "smull v26.4s, v18.4h, v21.4h\n\t" + "smull2 v27.4s, v18.8h, v21.8h\n\t" + "smlal v26.4s, v19.4h, v20.4h\n\t" + "smlal2 v27.4s, v19.8h, v20.8h\n\t" + "xtn v24.4h, v26.4s\n\t" + "xtn2 v24.8h, v27.4s\n\t" + "mul v24.8h, v24.8h, v1.h[1]\n\t" + "smlsl v26.4s, v24.4h, v1.h[0]\n\t" + "smlsl2 v27.4s, v24.8h, v1.h[0]\n\t" + "shrn v23.4h, v26.4s, #16\n\t" + "shrn2 v23.8h, v27.4s, #16\n\t" + "zip1 v24.8h, v22.8h, v23.8h\n\t" + "zip2 v25.8h, v22.8h, v23.8h\n\t" + "stp q24, q25, [%x[r], #64]\n\t" + "ldr q0, [x3, #48]\n\t" + "uzp1 v18.8h, v8.8h, v9.8h\n\t" + "uzp2 v19.8h, v8.8h, v9.8h\n\t" + "uzp1 v20.8h, v16.8h, v17.8h\n\t" + "uzp2 v21.8h, v16.8h, v17.8h\n\t" + "smull v26.4s, v18.4h, v20.4h\n\t" + "smull2 v27.4s, v18.8h, v20.8h\n\t" + "smull v23.4s, v19.4h, v21.4h\n\t" + "smull2 v24.4s, v19.8h, v21.8h\n\t" + "xtn v25.4h, v23.4s\n\t" + "xtn2 v25.8h, v24.4s\n\t" + "mul v25.8h, v25.8h, v1.h[1]\n\t" + "smlsl v23.4s, v25.4h, v1.h[0]\n\t" + "smlsl2 v24.4s, v25.8h, v1.h[0]\n\t" + "shrn v22.4h, v23.4s, #16\n\t" + "shrn2 v22.8h, v24.4s, #16\n\t" + "smlal v26.4s, v22.4h, v0.4h\n\t" + "smlal2 v27.4s, v22.8h, v0.8h\n\t" + "xtn v24.4h, v26.4s\n\t" + "xtn2 v24.8h, v27.4s\n\t" + "mul v24.8h, v24.8h, v1.h[1]\n\t" + "smlsl v26.4s, v24.4h, v1.h[0]\n\t" + "smlsl2 v27.4s, v24.8h, v1.h[0]\n\t" + "shrn v22.4h, v26.4s, #16\n\t" + "shrn2 v22.8h, v27.4s, #16\n\t" + "smull v26.4s, v18.4h, v21.4h\n\t" + "smull2 v27.4s, v18.8h, v21.8h\n\t" + "smlal v26.4s, v19.4h, v20.4h\n\t" + "smlal2 v27.4s, v19.8h, v20.8h\n\t" + "xtn v24.4h, v26.4s\n\t" + "xtn2 v24.8h, v27.4s\n\t" + "mul v24.8h, v24.8h, v1.h[1]\n\t" + "smlsl v26.4s, v24.4h, v1.h[0]\n\t" + "smlsl2 v27.4s, v24.8h, v1.h[0]\n\t" + "shrn v23.4h, v26.4s, #16\n\t" + "shrn2 v23.8h, v27.4s, #16\n\t" + "zip1 v24.8h, v22.8h, v23.8h\n\t" + "zip2 v25.8h, v22.8h, v23.8h\n\t" + "stp q24, q25, [%x[r], #96]\n\t" + "ldp q2, q3, [%x[a], #128]\n\t" + "ldp q4, q5, [%x[a], #160]\n\t" + "ldp q6, q7, [%x[a], #192]\n\t" + "ldp q8, q9, [%x[a], #224]\n\t" + "ldp q10, q11, [%x[b], #128]\n\t" + "ldp q12, q13, [%x[b], #160]\n\t" + "ldp q14, q15, [%x[b], #192]\n\t" + "ldp q16, q17, [%x[b], #224]\n\t" + "ldr q0, [x3, #64]\n\t" + "uzp1 v18.8h, v2.8h, v3.8h\n\t" + "uzp2 v19.8h, v2.8h, v3.8h\n\t" + "uzp1 v20.8h, v10.8h, v11.8h\n\t" + "uzp2 v21.8h, v10.8h, v11.8h\n\t" + "smull v26.4s, v18.4h, v20.4h\n\t" + "smull2 v27.4s, v18.8h, v20.8h\n\t" + "smull v23.4s, v19.4h, v21.4h\n\t" + "smull2 v24.4s, v19.8h, v21.8h\n\t" + "xtn v25.4h, 
v23.4s\n\t" + "xtn2 v25.8h, v24.4s\n\t" + "mul v25.8h, v25.8h, v1.h[1]\n\t" + "smlsl v23.4s, v25.4h, v1.h[0]\n\t" + "smlsl2 v24.4s, v25.8h, v1.h[0]\n\t" + "shrn v22.4h, v23.4s, #16\n\t" + "shrn2 v22.8h, v24.4s, #16\n\t" + "smlal v26.4s, v22.4h, v0.4h\n\t" + "smlal2 v27.4s, v22.8h, v0.8h\n\t" + "xtn v24.4h, v26.4s\n\t" + "xtn2 v24.8h, v27.4s\n\t" + "mul v24.8h, v24.8h, v1.h[1]\n\t" + "smlsl v26.4s, v24.4h, v1.h[0]\n\t" + "smlsl2 v27.4s, v24.8h, v1.h[0]\n\t" + "shrn v22.4h, v26.4s, #16\n\t" + "shrn2 v22.8h, v27.4s, #16\n\t" + "smull v26.4s, v18.4h, v21.4h\n\t" + "smull2 v27.4s, v18.8h, v21.8h\n\t" + "smlal v26.4s, v19.4h, v20.4h\n\t" + "smlal2 v27.4s, v19.8h, v20.8h\n\t" + "xtn v24.4h, v26.4s\n\t" + "xtn2 v24.8h, v27.4s\n\t" + "mul v24.8h, v24.8h, v1.h[1]\n\t" + "smlsl v26.4s, v24.4h, v1.h[0]\n\t" + "smlsl2 v27.4s, v24.8h, v1.h[0]\n\t" + "shrn v23.4h, v26.4s, #16\n\t" + "shrn2 v23.8h, v27.4s, #16\n\t" + "zip1 v24.8h, v22.8h, v23.8h\n\t" + "zip2 v25.8h, v22.8h, v23.8h\n\t" + "stp q24, q25, [%x[r], #128]\n\t" + "ldr q0, [x3, #80]\n\t" + "uzp1 v18.8h, v4.8h, v5.8h\n\t" + "uzp2 v19.8h, v4.8h, v5.8h\n\t" + "uzp1 v20.8h, v12.8h, v13.8h\n\t" + "uzp2 v21.8h, v12.8h, v13.8h\n\t" + "smull v26.4s, v18.4h, v20.4h\n\t" + "smull2 v27.4s, v18.8h, v20.8h\n\t" + "smull v23.4s, v19.4h, v21.4h\n\t" + "smull2 v24.4s, v19.8h, v21.8h\n\t" + "xtn v25.4h, v23.4s\n\t" + "xtn2 v25.8h, v24.4s\n\t" + "mul v25.8h, v25.8h, v1.h[1]\n\t" + "smlsl v23.4s, v25.4h, v1.h[0]\n\t" + "smlsl2 v24.4s, v25.8h, v1.h[0]\n\t" + "shrn v22.4h, v23.4s, #16\n\t" + "shrn2 v22.8h, v24.4s, #16\n\t" + "smlal v26.4s, v22.4h, v0.4h\n\t" + "smlal2 v27.4s, v22.8h, v0.8h\n\t" + "xtn v24.4h, v26.4s\n\t" + "xtn2 v24.8h, v27.4s\n\t" + "mul v24.8h, v24.8h, v1.h[1]\n\t" + "smlsl v26.4s, v24.4h, v1.h[0]\n\t" + "smlsl2 v27.4s, v24.8h, v1.h[0]\n\t" + "shrn v22.4h, v26.4s, #16\n\t" + "shrn2 v22.8h, v27.4s, #16\n\t" + "smull v26.4s, v18.4h, v21.4h\n\t" + "smull2 v27.4s, v18.8h, v21.8h\n\t" + "smlal v26.4s, v19.4h, v20.4h\n\t" + "smlal2 v27.4s, v19.8h, v20.8h\n\t" + "xtn v24.4h, v26.4s\n\t" + "xtn2 v24.8h, v27.4s\n\t" + "mul v24.8h, v24.8h, v1.h[1]\n\t" + "smlsl v26.4s, v24.4h, v1.h[0]\n\t" + "smlsl2 v27.4s, v24.8h, v1.h[0]\n\t" + "shrn v23.4h, v26.4s, #16\n\t" + "shrn2 v23.8h, v27.4s, #16\n\t" + "zip1 v24.8h, v22.8h, v23.8h\n\t" + "zip2 v25.8h, v22.8h, v23.8h\n\t" + "stp q24, q25, [%x[r], #160]\n\t" + "ldr q0, [x3, #96]\n\t" + "uzp1 v18.8h, v6.8h, v7.8h\n\t" + "uzp2 v19.8h, v6.8h, v7.8h\n\t" + "uzp1 v20.8h, v14.8h, v15.8h\n\t" + "uzp2 v21.8h, v14.8h, v15.8h\n\t" + "smull v26.4s, v18.4h, v20.4h\n\t" + "smull2 v27.4s, v18.8h, v20.8h\n\t" + "smull v23.4s, v19.4h, v21.4h\n\t" + "smull2 v24.4s, v19.8h, v21.8h\n\t" + "xtn v25.4h, v23.4s\n\t" + "xtn2 v25.8h, v24.4s\n\t" + "mul v25.8h, v25.8h, v1.h[1]\n\t" + "smlsl v23.4s, v25.4h, v1.h[0]\n\t" + "smlsl2 v24.4s, v25.8h, v1.h[0]\n\t" + "shrn v22.4h, v23.4s, #16\n\t" + "shrn2 v22.8h, v24.4s, #16\n\t" + "smlal v26.4s, v22.4h, v0.4h\n\t" + "smlal2 v27.4s, v22.8h, v0.8h\n\t" + "xtn v24.4h, v26.4s\n\t" + "xtn2 v24.8h, v27.4s\n\t" + "mul v24.8h, v24.8h, v1.h[1]\n\t" + "smlsl v26.4s, v24.4h, v1.h[0]\n\t" + "smlsl2 v27.4s, v24.8h, v1.h[0]\n\t" + "shrn v22.4h, v26.4s, #16\n\t" + "shrn2 v22.8h, v27.4s, #16\n\t" + "smull v26.4s, v18.4h, v21.4h\n\t" + "smull2 v27.4s, v18.8h, v21.8h\n\t" + "smlal v26.4s, v19.4h, v20.4h\n\t" + "smlal2 v27.4s, v19.8h, v20.8h\n\t" + "xtn v24.4h, v26.4s\n\t" + "xtn2 v24.8h, v27.4s\n\t" + "mul v24.8h, v24.8h, v1.h[1]\n\t" + "smlsl v26.4s, v24.4h, v1.h[0]\n\t" + "smlsl2 v27.4s, v24.8h, v1.h[0]\n\t" + 
"shrn v23.4h, v26.4s, #16\n\t" + "shrn2 v23.8h, v27.4s, #16\n\t" + "zip1 v24.8h, v22.8h, v23.8h\n\t" + "zip2 v25.8h, v22.8h, v23.8h\n\t" + "stp q24, q25, [%x[r], #192]\n\t" + "ldr q0, [x3, #112]\n\t" + "uzp1 v18.8h, v8.8h, v9.8h\n\t" + "uzp2 v19.8h, v8.8h, v9.8h\n\t" + "uzp1 v20.8h, v16.8h, v17.8h\n\t" + "uzp2 v21.8h, v16.8h, v17.8h\n\t" + "smull v26.4s, v18.4h, v20.4h\n\t" + "smull2 v27.4s, v18.8h, v20.8h\n\t" + "smull v23.4s, v19.4h, v21.4h\n\t" + "smull2 v24.4s, v19.8h, v21.8h\n\t" + "xtn v25.4h, v23.4s\n\t" + "xtn2 v25.8h, v24.4s\n\t" + "mul v25.8h, v25.8h, v1.h[1]\n\t" + "smlsl v23.4s, v25.4h, v1.h[0]\n\t" + "smlsl2 v24.4s, v25.8h, v1.h[0]\n\t" + "shrn v22.4h, v23.4s, #16\n\t" + "shrn2 v22.8h, v24.4s, #16\n\t" + "smlal v26.4s, v22.4h, v0.4h\n\t" + "smlal2 v27.4s, v22.8h, v0.8h\n\t" + "xtn v24.4h, v26.4s\n\t" + "xtn2 v24.8h, v27.4s\n\t" + "mul v24.8h, v24.8h, v1.h[1]\n\t" + "smlsl v26.4s, v24.4h, v1.h[0]\n\t" + "smlsl2 v27.4s, v24.8h, v1.h[0]\n\t" + "shrn v22.4h, v26.4s, #16\n\t" + "shrn2 v22.8h, v27.4s, #16\n\t" + "smull v26.4s, v18.4h, v21.4h\n\t" + "smull2 v27.4s, v18.8h, v21.8h\n\t" + "smlal v26.4s, v19.4h, v20.4h\n\t" + "smlal2 v27.4s, v19.8h, v20.8h\n\t" + "xtn v24.4h, v26.4s\n\t" + "xtn2 v24.8h, v27.4s\n\t" + "mul v24.8h, v24.8h, v1.h[1]\n\t" + "smlsl v26.4s, v24.4h, v1.h[0]\n\t" + "smlsl2 v27.4s, v24.8h, v1.h[0]\n\t" + "shrn v23.4h, v26.4s, #16\n\t" + "shrn2 v23.8h, v27.4s, #16\n\t" + "zip1 v24.8h, v22.8h, v23.8h\n\t" + "zip2 v25.8h, v22.8h, v23.8h\n\t" + "stp q24, q25, [%x[r], #224]\n\t" + "ldp q2, q3, [%x[a], #256]\n\t" + "ldp q4, q5, [%x[a], #288]\n\t" + "ldp q6, q7, [%x[a], #320]\n\t" + "ldp q8, q9, [%x[a], #352]\n\t" + "ldp q10, q11, [%x[b], #256]\n\t" + "ldp q12, q13, [%x[b], #288]\n\t" + "ldp q14, q15, [%x[b], #320]\n\t" + "ldp q16, q17, [%x[b], #352]\n\t" + "ldr q0, [x3, #128]\n\t" + "uzp1 v18.8h, v2.8h, v3.8h\n\t" + "uzp2 v19.8h, v2.8h, v3.8h\n\t" + "uzp1 v20.8h, v10.8h, v11.8h\n\t" + "uzp2 v21.8h, v10.8h, v11.8h\n\t" + "smull v26.4s, v18.4h, v20.4h\n\t" + "smull2 v27.4s, v18.8h, v20.8h\n\t" + "smull v23.4s, v19.4h, v21.4h\n\t" + "smull2 v24.4s, v19.8h, v21.8h\n\t" + "xtn v25.4h, v23.4s\n\t" + "xtn2 v25.8h, v24.4s\n\t" + "mul v25.8h, v25.8h, v1.h[1]\n\t" + "smlsl v23.4s, v25.4h, v1.h[0]\n\t" + "smlsl2 v24.4s, v25.8h, v1.h[0]\n\t" + "shrn v22.4h, v23.4s, #16\n\t" + "shrn2 v22.8h, v24.4s, #16\n\t" + "smlal v26.4s, v22.4h, v0.4h\n\t" + "smlal2 v27.4s, v22.8h, v0.8h\n\t" + "xtn v24.4h, v26.4s\n\t" + "xtn2 v24.8h, v27.4s\n\t" + "mul v24.8h, v24.8h, v1.h[1]\n\t" + "smlsl v26.4s, v24.4h, v1.h[0]\n\t" + "smlsl2 v27.4s, v24.8h, v1.h[0]\n\t" + "shrn v22.4h, v26.4s, #16\n\t" + "shrn2 v22.8h, v27.4s, #16\n\t" + "smull v26.4s, v18.4h, v21.4h\n\t" + "smull2 v27.4s, v18.8h, v21.8h\n\t" + "smlal v26.4s, v19.4h, v20.4h\n\t" + "smlal2 v27.4s, v19.8h, v20.8h\n\t" + "xtn v24.4h, v26.4s\n\t" + "xtn2 v24.8h, v27.4s\n\t" + "mul v24.8h, v24.8h, v1.h[1]\n\t" + "smlsl v26.4s, v24.4h, v1.h[0]\n\t" + "smlsl2 v27.4s, v24.8h, v1.h[0]\n\t" + "shrn v23.4h, v26.4s, #16\n\t" + "shrn2 v23.8h, v27.4s, #16\n\t" + "zip1 v24.8h, v22.8h, v23.8h\n\t" + "zip2 v25.8h, v22.8h, v23.8h\n\t" + "stp q24, q25, [%x[r], #256]\n\t" + "ldr q0, [x3, #144]\n\t" + "uzp1 v18.8h, v4.8h, v5.8h\n\t" + "uzp2 v19.8h, v4.8h, v5.8h\n\t" + "uzp1 v20.8h, v12.8h, v13.8h\n\t" + "uzp2 v21.8h, v12.8h, v13.8h\n\t" + "smull v26.4s, v18.4h, v20.4h\n\t" + "smull2 v27.4s, v18.8h, v20.8h\n\t" + "smull v23.4s, v19.4h, v21.4h\n\t" + "smull2 v24.4s, v19.8h, v21.8h\n\t" + "xtn v25.4h, v23.4s\n\t" + "xtn2 v25.8h, v24.4s\n\t" + "mul v25.8h, v25.8h, 
v1.h[1]\n\t" + "smlsl v23.4s, v25.4h, v1.h[0]\n\t" + "smlsl2 v24.4s, v25.8h, v1.h[0]\n\t" + "shrn v22.4h, v23.4s, #16\n\t" + "shrn2 v22.8h, v24.4s, #16\n\t" + "smlal v26.4s, v22.4h, v0.4h\n\t" + "smlal2 v27.4s, v22.8h, v0.8h\n\t" + "xtn v24.4h, v26.4s\n\t" + "xtn2 v24.8h, v27.4s\n\t" + "mul v24.8h, v24.8h, v1.h[1]\n\t" + "smlsl v26.4s, v24.4h, v1.h[0]\n\t" + "smlsl2 v27.4s, v24.8h, v1.h[0]\n\t" + "shrn v22.4h, v26.4s, #16\n\t" + "shrn2 v22.8h, v27.4s, #16\n\t" + "smull v26.4s, v18.4h, v21.4h\n\t" + "smull2 v27.4s, v18.8h, v21.8h\n\t" + "smlal v26.4s, v19.4h, v20.4h\n\t" + "smlal2 v27.4s, v19.8h, v20.8h\n\t" + "xtn v24.4h, v26.4s\n\t" + "xtn2 v24.8h, v27.4s\n\t" + "mul v24.8h, v24.8h, v1.h[1]\n\t" + "smlsl v26.4s, v24.4h, v1.h[0]\n\t" + "smlsl2 v27.4s, v24.8h, v1.h[0]\n\t" + "shrn v23.4h, v26.4s, #16\n\t" + "shrn2 v23.8h, v27.4s, #16\n\t" + "zip1 v24.8h, v22.8h, v23.8h\n\t" + "zip2 v25.8h, v22.8h, v23.8h\n\t" + "stp q24, q25, [%x[r], #288]\n\t" + "ldr q0, [x3, #160]\n\t" + "uzp1 v18.8h, v6.8h, v7.8h\n\t" + "uzp2 v19.8h, v6.8h, v7.8h\n\t" + "uzp1 v20.8h, v14.8h, v15.8h\n\t" + "uzp2 v21.8h, v14.8h, v15.8h\n\t" + "smull v26.4s, v18.4h, v20.4h\n\t" + "smull2 v27.4s, v18.8h, v20.8h\n\t" + "smull v23.4s, v19.4h, v21.4h\n\t" + "smull2 v24.4s, v19.8h, v21.8h\n\t" + "xtn v25.4h, v23.4s\n\t" + "xtn2 v25.8h, v24.4s\n\t" + "mul v25.8h, v25.8h, v1.h[1]\n\t" + "smlsl v23.4s, v25.4h, v1.h[0]\n\t" + "smlsl2 v24.4s, v25.8h, v1.h[0]\n\t" + "shrn v22.4h, v23.4s, #16\n\t" + "shrn2 v22.8h, v24.4s, #16\n\t" + "smlal v26.4s, v22.4h, v0.4h\n\t" + "smlal2 v27.4s, v22.8h, v0.8h\n\t" + "xtn v24.4h, v26.4s\n\t" + "xtn2 v24.8h, v27.4s\n\t" + "mul v24.8h, v24.8h, v1.h[1]\n\t" + "smlsl v26.4s, v24.4h, v1.h[0]\n\t" + "smlsl2 v27.4s, v24.8h, v1.h[0]\n\t" + "shrn v22.4h, v26.4s, #16\n\t" + "shrn2 v22.8h, v27.4s, #16\n\t" + "smull v26.4s, v18.4h, v21.4h\n\t" + "smull2 v27.4s, v18.8h, v21.8h\n\t" + "smlal v26.4s, v19.4h, v20.4h\n\t" + "smlal2 v27.4s, v19.8h, v20.8h\n\t" + "xtn v24.4h, v26.4s\n\t" + "xtn2 v24.8h, v27.4s\n\t" + "mul v24.8h, v24.8h, v1.h[1]\n\t" + "smlsl v26.4s, v24.4h, v1.h[0]\n\t" + "smlsl2 v27.4s, v24.8h, v1.h[0]\n\t" + "shrn v23.4h, v26.4s, #16\n\t" + "shrn2 v23.8h, v27.4s, #16\n\t" + "zip1 v24.8h, v22.8h, v23.8h\n\t" + "zip2 v25.8h, v22.8h, v23.8h\n\t" + "stp q24, q25, [%x[r], #320]\n\t" + "ldr q0, [x3, #176]\n\t" + "uzp1 v18.8h, v8.8h, v9.8h\n\t" + "uzp2 v19.8h, v8.8h, v9.8h\n\t" + "uzp1 v20.8h, v16.8h, v17.8h\n\t" + "uzp2 v21.8h, v16.8h, v17.8h\n\t" + "smull v26.4s, v18.4h, v20.4h\n\t" + "smull2 v27.4s, v18.8h, v20.8h\n\t" + "smull v23.4s, v19.4h, v21.4h\n\t" + "smull2 v24.4s, v19.8h, v21.8h\n\t" + "xtn v25.4h, v23.4s\n\t" + "xtn2 v25.8h, v24.4s\n\t" + "mul v25.8h, v25.8h, v1.h[1]\n\t" + "smlsl v23.4s, v25.4h, v1.h[0]\n\t" + "smlsl2 v24.4s, v25.8h, v1.h[0]\n\t" + "shrn v22.4h, v23.4s, #16\n\t" + "shrn2 v22.8h, v24.4s, #16\n\t" + "smlal v26.4s, v22.4h, v0.4h\n\t" + "smlal2 v27.4s, v22.8h, v0.8h\n\t" + "xtn v24.4h, v26.4s\n\t" + "xtn2 v24.8h, v27.4s\n\t" + "mul v24.8h, v24.8h, v1.h[1]\n\t" + "smlsl v26.4s, v24.4h, v1.h[0]\n\t" + "smlsl2 v27.4s, v24.8h, v1.h[0]\n\t" + "shrn v22.4h, v26.4s, #16\n\t" + "shrn2 v22.8h, v27.4s, #16\n\t" + "smull v26.4s, v18.4h, v21.4h\n\t" + "smull2 v27.4s, v18.8h, v21.8h\n\t" + "smlal v26.4s, v19.4h, v20.4h\n\t" + "smlal2 v27.4s, v19.8h, v20.8h\n\t" + "xtn v24.4h, v26.4s\n\t" + "xtn2 v24.8h, v27.4s\n\t" + "mul v24.8h, v24.8h, v1.h[1]\n\t" + "smlsl v26.4s, v24.4h, v1.h[0]\n\t" + "smlsl2 v27.4s, v24.8h, v1.h[0]\n\t" + "shrn v23.4h, v26.4s, #16\n\t" + "shrn2 v23.8h, v27.4s, 
#16\n\t" + "zip1 v24.8h, v22.8h, v23.8h\n\t" + "zip2 v25.8h, v22.8h, v23.8h\n\t" + "stp q24, q25, [%x[r], #352]\n\t" + "ldp q2, q3, [%x[a], #384]\n\t" + "ldp q4, q5, [%x[a], #416]\n\t" + "ldp q6, q7, [%x[a], #448]\n\t" + "ldp q8, q9, [%x[a], #480]\n\t" + "ldp q10, q11, [%x[b], #384]\n\t" + "ldp q12, q13, [%x[b], #416]\n\t" + "ldp q14, q15, [%x[b], #448]\n\t" + "ldp q16, q17, [%x[b], #480]\n\t" + "ldr q0, [x3, #192]\n\t" + "uzp1 v18.8h, v2.8h, v3.8h\n\t" + "uzp2 v19.8h, v2.8h, v3.8h\n\t" + "uzp1 v20.8h, v10.8h, v11.8h\n\t" + "uzp2 v21.8h, v10.8h, v11.8h\n\t" + "smull v26.4s, v18.4h, v20.4h\n\t" + "smull2 v27.4s, v18.8h, v20.8h\n\t" + "smull v23.4s, v19.4h, v21.4h\n\t" + "smull2 v24.4s, v19.8h, v21.8h\n\t" + "xtn v25.4h, v23.4s\n\t" + "xtn2 v25.8h, v24.4s\n\t" + "mul v25.8h, v25.8h, v1.h[1]\n\t" + "smlsl v23.4s, v25.4h, v1.h[0]\n\t" + "smlsl2 v24.4s, v25.8h, v1.h[0]\n\t" + "shrn v22.4h, v23.4s, #16\n\t" + "shrn2 v22.8h, v24.4s, #16\n\t" + "smlal v26.4s, v22.4h, v0.4h\n\t" + "smlal2 v27.4s, v22.8h, v0.8h\n\t" + "xtn v24.4h, v26.4s\n\t" + "xtn2 v24.8h, v27.4s\n\t" + "mul v24.8h, v24.8h, v1.h[1]\n\t" + "smlsl v26.4s, v24.4h, v1.h[0]\n\t" + "smlsl2 v27.4s, v24.8h, v1.h[0]\n\t" + "shrn v22.4h, v26.4s, #16\n\t" + "shrn2 v22.8h, v27.4s, #16\n\t" + "smull v26.4s, v18.4h, v21.4h\n\t" + "smull2 v27.4s, v18.8h, v21.8h\n\t" + "smlal v26.4s, v19.4h, v20.4h\n\t" + "smlal2 v27.4s, v19.8h, v20.8h\n\t" + "xtn v24.4h, v26.4s\n\t" + "xtn2 v24.8h, v27.4s\n\t" + "mul v24.8h, v24.8h, v1.h[1]\n\t" + "smlsl v26.4s, v24.4h, v1.h[0]\n\t" + "smlsl2 v27.4s, v24.8h, v1.h[0]\n\t" + "shrn v23.4h, v26.4s, #16\n\t" + "shrn2 v23.8h, v27.4s, #16\n\t" + "zip1 v24.8h, v22.8h, v23.8h\n\t" + "zip2 v25.8h, v22.8h, v23.8h\n\t" + "stp q24, q25, [%x[r], #384]\n\t" + "ldr q0, [x3, #208]\n\t" + "uzp1 v18.8h, v4.8h, v5.8h\n\t" + "uzp2 v19.8h, v4.8h, v5.8h\n\t" + "uzp1 v20.8h, v12.8h, v13.8h\n\t" + "uzp2 v21.8h, v12.8h, v13.8h\n\t" + "smull v26.4s, v18.4h, v20.4h\n\t" + "smull2 v27.4s, v18.8h, v20.8h\n\t" + "smull v23.4s, v19.4h, v21.4h\n\t" + "smull2 v24.4s, v19.8h, v21.8h\n\t" + "xtn v25.4h, v23.4s\n\t" + "xtn2 v25.8h, v24.4s\n\t" + "mul v25.8h, v25.8h, v1.h[1]\n\t" + "smlsl v23.4s, v25.4h, v1.h[0]\n\t" + "smlsl2 v24.4s, v25.8h, v1.h[0]\n\t" + "shrn v22.4h, v23.4s, #16\n\t" + "shrn2 v22.8h, v24.4s, #16\n\t" + "smlal v26.4s, v22.4h, v0.4h\n\t" + "smlal2 v27.4s, v22.8h, v0.8h\n\t" + "xtn v24.4h, v26.4s\n\t" + "xtn2 v24.8h, v27.4s\n\t" + "mul v24.8h, v24.8h, v1.h[1]\n\t" + "smlsl v26.4s, v24.4h, v1.h[0]\n\t" + "smlsl2 v27.4s, v24.8h, v1.h[0]\n\t" + "shrn v22.4h, v26.4s, #16\n\t" + "shrn2 v22.8h, v27.4s, #16\n\t" + "smull v26.4s, v18.4h, v21.4h\n\t" + "smull2 v27.4s, v18.8h, v21.8h\n\t" + "smlal v26.4s, v19.4h, v20.4h\n\t" + "smlal2 v27.4s, v19.8h, v20.8h\n\t" + "xtn v24.4h, v26.4s\n\t" + "xtn2 v24.8h, v27.4s\n\t" + "mul v24.8h, v24.8h, v1.h[1]\n\t" + "smlsl v26.4s, v24.4h, v1.h[0]\n\t" + "smlsl2 v27.4s, v24.8h, v1.h[0]\n\t" + "shrn v23.4h, v26.4s, #16\n\t" + "shrn2 v23.8h, v27.4s, #16\n\t" + "zip1 v24.8h, v22.8h, v23.8h\n\t" + "zip2 v25.8h, v22.8h, v23.8h\n\t" + "stp q24, q25, [%x[r], #416]\n\t" + "ldr q0, [x3, #224]\n\t" + "uzp1 v18.8h, v6.8h, v7.8h\n\t" + "uzp2 v19.8h, v6.8h, v7.8h\n\t" + "uzp1 v20.8h, v14.8h, v15.8h\n\t" + "uzp2 v21.8h, v14.8h, v15.8h\n\t" + "smull v26.4s, v18.4h, v20.4h\n\t" + "smull2 v27.4s, v18.8h, v20.8h\n\t" + "smull v23.4s, v19.4h, v21.4h\n\t" + "smull2 v24.4s, v19.8h, v21.8h\n\t" + "xtn v25.4h, v23.4s\n\t" + "xtn2 v25.8h, v24.4s\n\t" + "mul v25.8h, v25.8h, v1.h[1]\n\t" + "smlsl v23.4s, v25.4h, v1.h[0]\n\t" + 
"smlsl2 v24.4s, v25.8h, v1.h[0]\n\t" + "shrn v22.4h, v23.4s, #16\n\t" + "shrn2 v22.8h, v24.4s, #16\n\t" + "smlal v26.4s, v22.4h, v0.4h\n\t" + "smlal2 v27.4s, v22.8h, v0.8h\n\t" + "xtn v24.4h, v26.4s\n\t" + "xtn2 v24.8h, v27.4s\n\t" + "mul v24.8h, v24.8h, v1.h[1]\n\t" + "smlsl v26.4s, v24.4h, v1.h[0]\n\t" + "smlsl2 v27.4s, v24.8h, v1.h[0]\n\t" + "shrn v22.4h, v26.4s, #16\n\t" + "shrn2 v22.8h, v27.4s, #16\n\t" + "smull v26.4s, v18.4h, v21.4h\n\t" + "smull2 v27.4s, v18.8h, v21.8h\n\t" + "smlal v26.4s, v19.4h, v20.4h\n\t" + "smlal2 v27.4s, v19.8h, v20.8h\n\t" + "xtn v24.4h, v26.4s\n\t" + "xtn2 v24.8h, v27.4s\n\t" + "mul v24.8h, v24.8h, v1.h[1]\n\t" + "smlsl v26.4s, v24.4h, v1.h[0]\n\t" + "smlsl2 v27.4s, v24.8h, v1.h[0]\n\t" + "shrn v23.4h, v26.4s, #16\n\t" + "shrn2 v23.8h, v27.4s, #16\n\t" + "zip1 v24.8h, v22.8h, v23.8h\n\t" + "zip2 v25.8h, v22.8h, v23.8h\n\t" + "stp q24, q25, [%x[r], #448]\n\t" + "ldr q0, [x3, #240]\n\t" + "uzp1 v18.8h, v8.8h, v9.8h\n\t" + "uzp2 v19.8h, v8.8h, v9.8h\n\t" + "uzp1 v20.8h, v16.8h, v17.8h\n\t" + "uzp2 v21.8h, v16.8h, v17.8h\n\t" + "smull v26.4s, v18.4h, v20.4h\n\t" + "smull2 v27.4s, v18.8h, v20.8h\n\t" + "smull v23.4s, v19.4h, v21.4h\n\t" + "smull2 v24.4s, v19.8h, v21.8h\n\t" + "xtn v25.4h, v23.4s\n\t" + "xtn2 v25.8h, v24.4s\n\t" + "mul v25.8h, v25.8h, v1.h[1]\n\t" + "smlsl v23.4s, v25.4h, v1.h[0]\n\t" + "smlsl2 v24.4s, v25.8h, v1.h[0]\n\t" + "shrn v22.4h, v23.4s, #16\n\t" + "shrn2 v22.8h, v24.4s, #16\n\t" + "smlal v26.4s, v22.4h, v0.4h\n\t" + "smlal2 v27.4s, v22.8h, v0.8h\n\t" + "xtn v24.4h, v26.4s\n\t" + "xtn2 v24.8h, v27.4s\n\t" + "mul v24.8h, v24.8h, v1.h[1]\n\t" + "smlsl v26.4s, v24.4h, v1.h[0]\n\t" + "smlsl2 v27.4s, v24.8h, v1.h[0]\n\t" + "shrn v22.4h, v26.4s, #16\n\t" + "shrn2 v22.8h, v27.4s, #16\n\t" + "smull v26.4s, v18.4h, v21.4h\n\t" + "smull2 v27.4s, v18.8h, v21.8h\n\t" + "smlal v26.4s, v19.4h, v20.4h\n\t" + "smlal2 v27.4s, v19.8h, v20.8h\n\t" + "xtn v24.4h, v26.4s\n\t" + "xtn2 v24.8h, v27.4s\n\t" + "mul v24.8h, v24.8h, v1.h[1]\n\t" + "smlsl v26.4s, v24.4h, v1.h[0]\n\t" + "smlsl2 v27.4s, v24.8h, v1.h[0]\n\t" + "shrn v23.4h, v26.4s, #16\n\t" + "shrn2 v23.8h, v27.4s, #16\n\t" + "zip1 v24.8h, v22.8h, v23.8h\n\t" + "zip2 v25.8h, v22.8h, v23.8h\n\t" + "stp q24, q25, [%x[r], #480]\n\t" + : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) + : [L_kyber_aarch64_q] "S" (L_kyber_aarch64_q), [L_kyber_aarch64_consts] "S" (L_kyber_aarch64_consts), [L_sha3_aarch64_r] "S" (L_sha3_aarch64_r), [L_kyber_aarch64_zetas] "S" (L_kyber_aarch64_zetas), [L_kyber_aarch64_zetas_qinv] "S" (L_kyber_aarch64_zetas_qinv), [L_kyber_aarch64_zetas_inv] "S" (L_kyber_aarch64_zetas_inv), [L_kyber_aarch64_zetas_inv_qinv] "S" (L_kyber_aarch64_zetas_inv_qinv), [L_kyber_aarch64_zetas_mul] "S" (L_kyber_aarch64_zetas_mul) + : "memory", "x3", "x4", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "cc" + ); +} + +void kyber_basemul_mont_add(sword16* r, const sword16* a, const sword16* b) +{ + __asm__ __volatile__ ( +#ifndef __APPLE__ + "adrp x3, %[L_kyber_aarch64_zetas_mul]\n\t" + "add x3, x3, :lo12:%[L_kyber_aarch64_zetas_mul]\n\t" +#else + "adrp x3, %[L_kyber_aarch64_zetas_mul]@PAGE\n\t" + "add x3, x3, %[L_kyber_aarch64_zetas_mul]@PAGEOFF\n\t" +#endif /* __APPLE__ */ +#ifndef __APPLE__ + "adrp x4, %[L_kyber_aarch64_consts]\n\t" + "add x4, x4, :lo12:%[L_kyber_aarch64_consts]\n\t" +#else + "adrp x4, %[L_kyber_aarch64_consts]@PAGE\n\t" + "add x4, x4, 
%[L_kyber_aarch64_consts]@PAGEOFF\n\t" +#endif /* __APPLE__ */ + "ldr q1, [x4]\n\t" + "ldp q2, q3, [%x[a]]\n\t" + "ldp q4, q5, [%x[a], #32]\n\t" + "ldp q6, q7, [%x[a], #64]\n\t" + "ldp q8, q9, [%x[a], #96]\n\t" + "ldp q10, q11, [%x[b]]\n\t" + "ldp q12, q13, [%x[b], #32]\n\t" + "ldp q14, q15, [%x[b], #64]\n\t" + "ldp q16, q17, [%x[b], #96]\n\t" + "ldp q28, q29, [%x[r]]\n\t" + "ldr q0, [x3]\n\t" + "uzp1 v18.8h, v2.8h, v3.8h\n\t" + "uzp2 v19.8h, v2.8h, v3.8h\n\t" + "uzp1 v20.8h, v10.8h, v11.8h\n\t" + "uzp2 v21.8h, v10.8h, v11.8h\n\t" + "smull v26.4s, v18.4h, v20.4h\n\t" + "smull2 v27.4s, v18.8h, v20.8h\n\t" + "smull v23.4s, v19.4h, v21.4h\n\t" + "smull2 v24.4s, v19.8h, v21.8h\n\t" + "xtn v25.4h, v23.4s\n\t" + "xtn2 v25.8h, v24.4s\n\t" + "mul v25.8h, v25.8h, v1.h[1]\n\t" + "smlsl v23.4s, v25.4h, v1.h[0]\n\t" + "smlsl2 v24.4s, v25.8h, v1.h[0]\n\t" + "shrn v22.4h, v23.4s, #16\n\t" + "shrn2 v22.8h, v24.4s, #16\n\t" + "smlal v26.4s, v22.4h, v0.4h\n\t" + "smlal2 v27.4s, v22.8h, v0.8h\n\t" + "xtn v24.4h, v26.4s\n\t" + "xtn2 v24.8h, v27.4s\n\t" + "mul v24.8h, v24.8h, v1.h[1]\n\t" + "smlsl v26.4s, v24.4h, v1.h[0]\n\t" + "smlsl2 v27.4s, v24.8h, v1.h[0]\n\t" + "shrn v22.4h, v26.4s, #16\n\t" + "shrn2 v22.8h, v27.4s, #16\n\t" + "smull v26.4s, v18.4h, v21.4h\n\t" + "smull2 v27.4s, v18.8h, v21.8h\n\t" + "smlal v26.4s, v19.4h, v20.4h\n\t" + "smlal2 v27.4s, v19.8h, v20.8h\n\t" + "xtn v24.4h, v26.4s\n\t" + "xtn2 v24.8h, v27.4s\n\t" + "mul v24.8h, v24.8h, v1.h[1]\n\t" + "smlsl v26.4s, v24.4h, v1.h[0]\n\t" + "smlsl2 v27.4s, v24.8h, v1.h[0]\n\t" + "shrn v23.4h, v26.4s, #16\n\t" + "shrn2 v23.8h, v27.4s, #16\n\t" + "zip1 v24.8h, v22.8h, v23.8h\n\t" + "zip2 v25.8h, v22.8h, v23.8h\n\t" + "add v28.8h, v28.8h, v24.8h\n\t" + "add v29.8h, v29.8h, v25.8h\n\t" + "stp q28, q29, [%x[r]]\n\t" + "ldp q28, q29, [%x[r], #32]\n\t" + "ldr q0, [x3, #16]\n\t" + "uzp1 v18.8h, v4.8h, v5.8h\n\t" + "uzp2 v19.8h, v4.8h, v5.8h\n\t" + "uzp1 v20.8h, v12.8h, v13.8h\n\t" + "uzp2 v21.8h, v12.8h, v13.8h\n\t" + "smull v26.4s, v18.4h, v20.4h\n\t" + "smull2 v27.4s, v18.8h, v20.8h\n\t" + "smull v23.4s, v19.4h, v21.4h\n\t" + "smull2 v24.4s, v19.8h, v21.8h\n\t" + "xtn v25.4h, v23.4s\n\t" + "xtn2 v25.8h, v24.4s\n\t" + "mul v25.8h, v25.8h, v1.h[1]\n\t" + "smlsl v23.4s, v25.4h, v1.h[0]\n\t" + "smlsl2 v24.4s, v25.8h, v1.h[0]\n\t" + "shrn v22.4h, v23.4s, #16\n\t" + "shrn2 v22.8h, v24.4s, #16\n\t" + "smlal v26.4s, v22.4h, v0.4h\n\t" + "smlal2 v27.4s, v22.8h, v0.8h\n\t" + "xtn v24.4h, v26.4s\n\t" + "xtn2 v24.8h, v27.4s\n\t" + "mul v24.8h, v24.8h, v1.h[1]\n\t" + "smlsl v26.4s, v24.4h, v1.h[0]\n\t" + "smlsl2 v27.4s, v24.8h, v1.h[0]\n\t" + "shrn v22.4h, v26.4s, #16\n\t" + "shrn2 v22.8h, v27.4s, #16\n\t" + "smull v26.4s, v18.4h, v21.4h\n\t" + "smull2 v27.4s, v18.8h, v21.8h\n\t" + "smlal v26.4s, v19.4h, v20.4h\n\t" + "smlal2 v27.4s, v19.8h, v20.8h\n\t" + "xtn v24.4h, v26.4s\n\t" + "xtn2 v24.8h, v27.4s\n\t" + "mul v24.8h, v24.8h, v1.h[1]\n\t" + "smlsl v26.4s, v24.4h, v1.h[0]\n\t" + "smlsl2 v27.4s, v24.8h, v1.h[0]\n\t" + "shrn v23.4h, v26.4s, #16\n\t" + "shrn2 v23.8h, v27.4s, #16\n\t" + "zip1 v24.8h, v22.8h, v23.8h\n\t" + "zip2 v25.8h, v22.8h, v23.8h\n\t" + "add v28.8h, v28.8h, v24.8h\n\t" + "add v29.8h, v29.8h, v25.8h\n\t" + "stp q28, q29, [%x[r], #32]\n\t" + "ldp q28, q29, [%x[r], #64]\n\t" + "ldr q0, [x3, #32]\n\t" + "uzp1 v18.8h, v6.8h, v7.8h\n\t" + "uzp2 v19.8h, v6.8h, v7.8h\n\t" + "uzp1 v20.8h, v14.8h, v15.8h\n\t" + "uzp2 v21.8h, v14.8h, v15.8h\n\t" + "smull v26.4s, v18.4h, v20.4h\n\t" + "smull2 v27.4s, v18.8h, v20.8h\n\t" + "smull v23.4s, v19.4h, 
v21.4h\n\t" + "smull2 v24.4s, v19.8h, v21.8h\n\t" + "xtn v25.4h, v23.4s\n\t" + "xtn2 v25.8h, v24.4s\n\t" + "mul v25.8h, v25.8h, v1.h[1]\n\t" + "smlsl v23.4s, v25.4h, v1.h[0]\n\t" + "smlsl2 v24.4s, v25.8h, v1.h[0]\n\t" + "shrn v22.4h, v23.4s, #16\n\t" + "shrn2 v22.8h, v24.4s, #16\n\t" + "smlal v26.4s, v22.4h, v0.4h\n\t" + "smlal2 v27.4s, v22.8h, v0.8h\n\t" + "xtn v24.4h, v26.4s\n\t" + "xtn2 v24.8h, v27.4s\n\t" + "mul v24.8h, v24.8h, v1.h[1]\n\t" + "smlsl v26.4s, v24.4h, v1.h[0]\n\t" + "smlsl2 v27.4s, v24.8h, v1.h[0]\n\t" + "shrn v22.4h, v26.4s, #16\n\t" + "shrn2 v22.8h, v27.4s, #16\n\t" + "smull v26.4s, v18.4h, v21.4h\n\t" + "smull2 v27.4s, v18.8h, v21.8h\n\t" + "smlal v26.4s, v19.4h, v20.4h\n\t" + "smlal2 v27.4s, v19.8h, v20.8h\n\t" + "xtn v24.4h, v26.4s\n\t" + "xtn2 v24.8h, v27.4s\n\t" + "mul v24.8h, v24.8h, v1.h[1]\n\t" + "smlsl v26.4s, v24.4h, v1.h[0]\n\t" + "smlsl2 v27.4s, v24.8h, v1.h[0]\n\t" + "shrn v23.4h, v26.4s, #16\n\t" + "shrn2 v23.8h, v27.4s, #16\n\t" + "zip1 v24.8h, v22.8h, v23.8h\n\t" + "zip2 v25.8h, v22.8h, v23.8h\n\t" + "add v28.8h, v28.8h, v24.8h\n\t" + "add v29.8h, v29.8h, v25.8h\n\t" + "stp q28, q29, [%x[r], #64]\n\t" + "ldp q28, q29, [%x[r], #96]\n\t" + "ldr q0, [x3, #48]\n\t" + "uzp1 v18.8h, v8.8h, v9.8h\n\t" + "uzp2 v19.8h, v8.8h, v9.8h\n\t" + "uzp1 v20.8h, v16.8h, v17.8h\n\t" + "uzp2 v21.8h, v16.8h, v17.8h\n\t" + "smull v26.4s, v18.4h, v20.4h\n\t" + "smull2 v27.4s, v18.8h, v20.8h\n\t" + "smull v23.4s, v19.4h, v21.4h\n\t" + "smull2 v24.4s, v19.8h, v21.8h\n\t" + "xtn v25.4h, v23.4s\n\t" + "xtn2 v25.8h, v24.4s\n\t" + "mul v25.8h, v25.8h, v1.h[1]\n\t" + "smlsl v23.4s, v25.4h, v1.h[0]\n\t" + "smlsl2 v24.4s, v25.8h, v1.h[0]\n\t" + "shrn v22.4h, v23.4s, #16\n\t" + "shrn2 v22.8h, v24.4s, #16\n\t" + "smlal v26.4s, v22.4h, v0.4h\n\t" + "smlal2 v27.4s, v22.8h, v0.8h\n\t" + "xtn v24.4h, v26.4s\n\t" + "xtn2 v24.8h, v27.4s\n\t" + "mul v24.8h, v24.8h, v1.h[1]\n\t" + "smlsl v26.4s, v24.4h, v1.h[0]\n\t" + "smlsl2 v27.4s, v24.8h, v1.h[0]\n\t" + "shrn v22.4h, v26.4s, #16\n\t" + "shrn2 v22.8h, v27.4s, #16\n\t" + "smull v26.4s, v18.4h, v21.4h\n\t" + "smull2 v27.4s, v18.8h, v21.8h\n\t" + "smlal v26.4s, v19.4h, v20.4h\n\t" + "smlal2 v27.4s, v19.8h, v20.8h\n\t" + "xtn v24.4h, v26.4s\n\t" + "xtn2 v24.8h, v27.4s\n\t" + "mul v24.8h, v24.8h, v1.h[1]\n\t" + "smlsl v26.4s, v24.4h, v1.h[0]\n\t" + "smlsl2 v27.4s, v24.8h, v1.h[0]\n\t" + "shrn v23.4h, v26.4s, #16\n\t" + "shrn2 v23.8h, v27.4s, #16\n\t" + "zip1 v24.8h, v22.8h, v23.8h\n\t" + "zip2 v25.8h, v22.8h, v23.8h\n\t" + "add v28.8h, v28.8h, v24.8h\n\t" + "add v29.8h, v29.8h, v25.8h\n\t" + "stp q28, q29, [%x[r], #96]\n\t" + "ldp q2, q3, [%x[a], #128]\n\t" + "ldp q4, q5, [%x[a], #160]\n\t" + "ldp q6, q7, [%x[a], #192]\n\t" + "ldp q8, q9, [%x[a], #224]\n\t" + "ldp q10, q11, [%x[b], #128]\n\t" + "ldp q12, q13, [%x[b], #160]\n\t" + "ldp q14, q15, [%x[b], #192]\n\t" + "ldp q16, q17, [%x[b], #224]\n\t" + "ldp q28, q29, [%x[r], #128]\n\t" + "ldr q0, [x3, #64]\n\t" + "uzp1 v18.8h, v2.8h, v3.8h\n\t" + "uzp2 v19.8h, v2.8h, v3.8h\n\t" + "uzp1 v20.8h, v10.8h, v11.8h\n\t" + "uzp2 v21.8h, v10.8h, v11.8h\n\t" + "smull v26.4s, v18.4h, v20.4h\n\t" + "smull2 v27.4s, v18.8h, v20.8h\n\t" + "smull v23.4s, v19.4h, v21.4h\n\t" + "smull2 v24.4s, v19.8h, v21.8h\n\t" + "xtn v25.4h, v23.4s\n\t" + "xtn2 v25.8h, v24.4s\n\t" + "mul v25.8h, v25.8h, v1.h[1]\n\t" + "smlsl v23.4s, v25.4h, v1.h[0]\n\t" + "smlsl2 v24.4s, v25.8h, v1.h[0]\n\t" + "shrn v22.4h, v23.4s, #16\n\t" + "shrn2 v22.8h, v24.4s, #16\n\t" + "smlal v26.4s, v22.4h, v0.4h\n\t" + "smlal2 v27.4s, v22.8h, v0.8h\n\t" + 
"xtn v24.4h, v26.4s\n\t" + "xtn2 v24.8h, v27.4s\n\t" + "mul v24.8h, v24.8h, v1.h[1]\n\t" + "smlsl v26.4s, v24.4h, v1.h[0]\n\t" + "smlsl2 v27.4s, v24.8h, v1.h[0]\n\t" + "shrn v22.4h, v26.4s, #16\n\t" + "shrn2 v22.8h, v27.4s, #16\n\t" + "smull v26.4s, v18.4h, v21.4h\n\t" + "smull2 v27.4s, v18.8h, v21.8h\n\t" + "smlal v26.4s, v19.4h, v20.4h\n\t" + "smlal2 v27.4s, v19.8h, v20.8h\n\t" + "xtn v24.4h, v26.4s\n\t" + "xtn2 v24.8h, v27.4s\n\t" + "mul v24.8h, v24.8h, v1.h[1]\n\t" + "smlsl v26.4s, v24.4h, v1.h[0]\n\t" + "smlsl2 v27.4s, v24.8h, v1.h[0]\n\t" + "shrn v23.4h, v26.4s, #16\n\t" + "shrn2 v23.8h, v27.4s, #16\n\t" + "zip1 v24.8h, v22.8h, v23.8h\n\t" + "zip2 v25.8h, v22.8h, v23.8h\n\t" + "add v28.8h, v28.8h, v24.8h\n\t" + "add v29.8h, v29.8h, v25.8h\n\t" + "stp q28, q29, [%x[r], #128]\n\t" + "ldp q28, q29, [%x[r], #160]\n\t" + "ldr q0, [x3, #80]\n\t" + "uzp1 v18.8h, v4.8h, v5.8h\n\t" + "uzp2 v19.8h, v4.8h, v5.8h\n\t" + "uzp1 v20.8h, v12.8h, v13.8h\n\t" + "uzp2 v21.8h, v12.8h, v13.8h\n\t" + "smull v26.4s, v18.4h, v20.4h\n\t" + "smull2 v27.4s, v18.8h, v20.8h\n\t" + "smull v23.4s, v19.4h, v21.4h\n\t" + "smull2 v24.4s, v19.8h, v21.8h\n\t" + "xtn v25.4h, v23.4s\n\t" + "xtn2 v25.8h, v24.4s\n\t" + "mul v25.8h, v25.8h, v1.h[1]\n\t" + "smlsl v23.4s, v25.4h, v1.h[0]\n\t" + "smlsl2 v24.4s, v25.8h, v1.h[0]\n\t" + "shrn v22.4h, v23.4s, #16\n\t" + "shrn2 v22.8h, v24.4s, #16\n\t" + "smlal v26.4s, v22.4h, v0.4h\n\t" + "smlal2 v27.4s, v22.8h, v0.8h\n\t" + "xtn v24.4h, v26.4s\n\t" + "xtn2 v24.8h, v27.4s\n\t" + "mul v24.8h, v24.8h, v1.h[1]\n\t" + "smlsl v26.4s, v24.4h, v1.h[0]\n\t" + "smlsl2 v27.4s, v24.8h, v1.h[0]\n\t" + "shrn v22.4h, v26.4s, #16\n\t" + "shrn2 v22.8h, v27.4s, #16\n\t" + "smull v26.4s, v18.4h, v21.4h\n\t" + "smull2 v27.4s, v18.8h, v21.8h\n\t" + "smlal v26.4s, v19.4h, v20.4h\n\t" + "smlal2 v27.4s, v19.8h, v20.8h\n\t" + "xtn v24.4h, v26.4s\n\t" + "xtn2 v24.8h, v27.4s\n\t" + "mul v24.8h, v24.8h, v1.h[1]\n\t" + "smlsl v26.4s, v24.4h, v1.h[0]\n\t" + "smlsl2 v27.4s, v24.8h, v1.h[0]\n\t" + "shrn v23.4h, v26.4s, #16\n\t" + "shrn2 v23.8h, v27.4s, #16\n\t" + "zip1 v24.8h, v22.8h, v23.8h\n\t" + "zip2 v25.8h, v22.8h, v23.8h\n\t" + "add v28.8h, v28.8h, v24.8h\n\t" + "add v29.8h, v29.8h, v25.8h\n\t" + "stp q28, q29, [%x[r], #160]\n\t" + "ldp q28, q29, [%x[r], #192]\n\t" + "ldr q0, [x3, #96]\n\t" + "uzp1 v18.8h, v6.8h, v7.8h\n\t" + "uzp2 v19.8h, v6.8h, v7.8h\n\t" + "uzp1 v20.8h, v14.8h, v15.8h\n\t" + "uzp2 v21.8h, v14.8h, v15.8h\n\t" + "smull v26.4s, v18.4h, v20.4h\n\t" + "smull2 v27.4s, v18.8h, v20.8h\n\t" + "smull v23.4s, v19.4h, v21.4h\n\t" + "smull2 v24.4s, v19.8h, v21.8h\n\t" + "xtn v25.4h, v23.4s\n\t" + "xtn2 v25.8h, v24.4s\n\t" + "mul v25.8h, v25.8h, v1.h[1]\n\t" + "smlsl v23.4s, v25.4h, v1.h[0]\n\t" + "smlsl2 v24.4s, v25.8h, v1.h[0]\n\t" + "shrn v22.4h, v23.4s, #16\n\t" + "shrn2 v22.8h, v24.4s, #16\n\t" + "smlal v26.4s, v22.4h, v0.4h\n\t" + "smlal2 v27.4s, v22.8h, v0.8h\n\t" + "xtn v24.4h, v26.4s\n\t" + "xtn2 v24.8h, v27.4s\n\t" + "mul v24.8h, v24.8h, v1.h[1]\n\t" + "smlsl v26.4s, v24.4h, v1.h[0]\n\t" + "smlsl2 v27.4s, v24.8h, v1.h[0]\n\t" + "shrn v22.4h, v26.4s, #16\n\t" + "shrn2 v22.8h, v27.4s, #16\n\t" + "smull v26.4s, v18.4h, v21.4h\n\t" + "smull2 v27.4s, v18.8h, v21.8h\n\t" + "smlal v26.4s, v19.4h, v20.4h\n\t" + "smlal2 v27.4s, v19.8h, v20.8h\n\t" + "xtn v24.4h, v26.4s\n\t" + "xtn2 v24.8h, v27.4s\n\t" + "mul v24.8h, v24.8h, v1.h[1]\n\t" + "smlsl v26.4s, v24.4h, v1.h[0]\n\t" + "smlsl2 v27.4s, v24.8h, v1.h[0]\n\t" + "shrn v23.4h, v26.4s, #16\n\t" + "shrn2 v23.8h, v27.4s, #16\n\t" + "zip1 v24.8h, 
v22.8h, v23.8h\n\t" + "zip2 v25.8h, v22.8h, v23.8h\n\t" + "add v28.8h, v28.8h, v24.8h\n\t" + "add v29.8h, v29.8h, v25.8h\n\t" + "stp q28, q29, [%x[r], #192]\n\t" + "ldp q28, q29, [%x[r], #224]\n\t" + "ldr q0, [x3, #112]\n\t" + "uzp1 v18.8h, v8.8h, v9.8h\n\t" + "uzp2 v19.8h, v8.8h, v9.8h\n\t" + "uzp1 v20.8h, v16.8h, v17.8h\n\t" + "uzp2 v21.8h, v16.8h, v17.8h\n\t" + "smull v26.4s, v18.4h, v20.4h\n\t" + "smull2 v27.4s, v18.8h, v20.8h\n\t" + "smull v23.4s, v19.4h, v21.4h\n\t" + "smull2 v24.4s, v19.8h, v21.8h\n\t" + "xtn v25.4h, v23.4s\n\t" + "xtn2 v25.8h, v24.4s\n\t" + "mul v25.8h, v25.8h, v1.h[1]\n\t" + "smlsl v23.4s, v25.4h, v1.h[0]\n\t" + "smlsl2 v24.4s, v25.8h, v1.h[0]\n\t" + "shrn v22.4h, v23.4s, #16\n\t" + "shrn2 v22.8h, v24.4s, #16\n\t" + "smlal v26.4s, v22.4h, v0.4h\n\t" + "smlal2 v27.4s, v22.8h, v0.8h\n\t" + "xtn v24.4h, v26.4s\n\t" + "xtn2 v24.8h, v27.4s\n\t" + "mul v24.8h, v24.8h, v1.h[1]\n\t" + "smlsl v26.4s, v24.4h, v1.h[0]\n\t" + "smlsl2 v27.4s, v24.8h, v1.h[0]\n\t" + "shrn v22.4h, v26.4s, #16\n\t" + "shrn2 v22.8h, v27.4s, #16\n\t" + "smull v26.4s, v18.4h, v21.4h\n\t" + "smull2 v27.4s, v18.8h, v21.8h\n\t" + "smlal v26.4s, v19.4h, v20.4h\n\t" + "smlal2 v27.4s, v19.8h, v20.8h\n\t" + "xtn v24.4h, v26.4s\n\t" + "xtn2 v24.8h, v27.4s\n\t" + "mul v24.8h, v24.8h, v1.h[1]\n\t" + "smlsl v26.4s, v24.4h, v1.h[0]\n\t" + "smlsl2 v27.4s, v24.8h, v1.h[0]\n\t" + "shrn v23.4h, v26.4s, #16\n\t" + "shrn2 v23.8h, v27.4s, #16\n\t" + "zip1 v24.8h, v22.8h, v23.8h\n\t" + "zip2 v25.8h, v22.8h, v23.8h\n\t" + "add v28.8h, v28.8h, v24.8h\n\t" + "add v29.8h, v29.8h, v25.8h\n\t" + "stp q28, q29, [%x[r], #224]\n\t" + "ldp q2, q3, [%x[a], #256]\n\t" + "ldp q4, q5, [%x[a], #288]\n\t" + "ldp q6, q7, [%x[a], #320]\n\t" + "ldp q8, q9, [%x[a], #352]\n\t" + "ldp q10, q11, [%x[b], #256]\n\t" + "ldp q12, q13, [%x[b], #288]\n\t" + "ldp q14, q15, [%x[b], #320]\n\t" + "ldp q16, q17, [%x[b], #352]\n\t" + "ldp q28, q29, [%x[r], #256]\n\t" + "ldr q0, [x3, #128]\n\t" + "uzp1 v18.8h, v2.8h, v3.8h\n\t" + "uzp2 v19.8h, v2.8h, v3.8h\n\t" + "uzp1 v20.8h, v10.8h, v11.8h\n\t" + "uzp2 v21.8h, v10.8h, v11.8h\n\t" + "smull v26.4s, v18.4h, v20.4h\n\t" + "smull2 v27.4s, v18.8h, v20.8h\n\t" + "smull v23.4s, v19.4h, v21.4h\n\t" + "smull2 v24.4s, v19.8h, v21.8h\n\t" + "xtn v25.4h, v23.4s\n\t" + "xtn2 v25.8h, v24.4s\n\t" + "mul v25.8h, v25.8h, v1.h[1]\n\t" + "smlsl v23.4s, v25.4h, v1.h[0]\n\t" + "smlsl2 v24.4s, v25.8h, v1.h[0]\n\t" + "shrn v22.4h, v23.4s, #16\n\t" + "shrn2 v22.8h, v24.4s, #16\n\t" + "smlal v26.4s, v22.4h, v0.4h\n\t" + "smlal2 v27.4s, v22.8h, v0.8h\n\t" + "xtn v24.4h, v26.4s\n\t" + "xtn2 v24.8h, v27.4s\n\t" + "mul v24.8h, v24.8h, v1.h[1]\n\t" + "smlsl v26.4s, v24.4h, v1.h[0]\n\t" + "smlsl2 v27.4s, v24.8h, v1.h[0]\n\t" + "shrn v22.4h, v26.4s, #16\n\t" + "shrn2 v22.8h, v27.4s, #16\n\t" + "smull v26.4s, v18.4h, v21.4h\n\t" + "smull2 v27.4s, v18.8h, v21.8h\n\t" + "smlal v26.4s, v19.4h, v20.4h\n\t" + "smlal2 v27.4s, v19.8h, v20.8h\n\t" + "xtn v24.4h, v26.4s\n\t" + "xtn2 v24.8h, v27.4s\n\t" + "mul v24.8h, v24.8h, v1.h[1]\n\t" + "smlsl v26.4s, v24.4h, v1.h[0]\n\t" + "smlsl2 v27.4s, v24.8h, v1.h[0]\n\t" + "shrn v23.4h, v26.4s, #16\n\t" + "shrn2 v23.8h, v27.4s, #16\n\t" + "zip1 v24.8h, v22.8h, v23.8h\n\t" + "zip2 v25.8h, v22.8h, v23.8h\n\t" + "add v28.8h, v28.8h, v24.8h\n\t" + "add v29.8h, v29.8h, v25.8h\n\t" + "stp q28, q29, [%x[r], #256]\n\t" + "ldp q28, q29, [%x[r], #288]\n\t" + "ldr q0, [x3, #144]\n\t" + "uzp1 v18.8h, v4.8h, v5.8h\n\t" + "uzp2 v19.8h, v4.8h, v5.8h\n\t" + "uzp1 v20.8h, v12.8h, v13.8h\n\t" + "uzp2 v21.8h, v12.8h, 
v13.8h\n\t" + "smull v26.4s, v18.4h, v20.4h\n\t" + "smull2 v27.4s, v18.8h, v20.8h\n\t" + "smull v23.4s, v19.4h, v21.4h\n\t" + "smull2 v24.4s, v19.8h, v21.8h\n\t" + "xtn v25.4h, v23.4s\n\t" + "xtn2 v25.8h, v24.4s\n\t" + "mul v25.8h, v25.8h, v1.h[1]\n\t" + "smlsl v23.4s, v25.4h, v1.h[0]\n\t" + "smlsl2 v24.4s, v25.8h, v1.h[0]\n\t" + "shrn v22.4h, v23.4s, #16\n\t" + "shrn2 v22.8h, v24.4s, #16\n\t" + "smlal v26.4s, v22.4h, v0.4h\n\t" + "smlal2 v27.4s, v22.8h, v0.8h\n\t" + "xtn v24.4h, v26.4s\n\t" + "xtn2 v24.8h, v27.4s\n\t" + "mul v24.8h, v24.8h, v1.h[1]\n\t" + "smlsl v26.4s, v24.4h, v1.h[0]\n\t" + "smlsl2 v27.4s, v24.8h, v1.h[0]\n\t" + "shrn v22.4h, v26.4s, #16\n\t" + "shrn2 v22.8h, v27.4s, #16\n\t" + "smull v26.4s, v18.4h, v21.4h\n\t" + "smull2 v27.4s, v18.8h, v21.8h\n\t" + "smlal v26.4s, v19.4h, v20.4h\n\t" + "smlal2 v27.4s, v19.8h, v20.8h\n\t" + "xtn v24.4h, v26.4s\n\t" + "xtn2 v24.8h, v27.4s\n\t" + "mul v24.8h, v24.8h, v1.h[1]\n\t" + "smlsl v26.4s, v24.4h, v1.h[0]\n\t" + "smlsl2 v27.4s, v24.8h, v1.h[0]\n\t" + "shrn v23.4h, v26.4s, #16\n\t" + "shrn2 v23.8h, v27.4s, #16\n\t" + "zip1 v24.8h, v22.8h, v23.8h\n\t" + "zip2 v25.8h, v22.8h, v23.8h\n\t" + "add v28.8h, v28.8h, v24.8h\n\t" + "add v29.8h, v29.8h, v25.8h\n\t" + "stp q28, q29, [%x[r], #288]\n\t" + "ldp q28, q29, [%x[r], #320]\n\t" + "ldr q0, [x3, #160]\n\t" + "uzp1 v18.8h, v6.8h, v7.8h\n\t" + "uzp2 v19.8h, v6.8h, v7.8h\n\t" + "uzp1 v20.8h, v14.8h, v15.8h\n\t" + "uzp2 v21.8h, v14.8h, v15.8h\n\t" + "smull v26.4s, v18.4h, v20.4h\n\t" + "smull2 v27.4s, v18.8h, v20.8h\n\t" + "smull v23.4s, v19.4h, v21.4h\n\t" + "smull2 v24.4s, v19.8h, v21.8h\n\t" + "xtn v25.4h, v23.4s\n\t" + "xtn2 v25.8h, v24.4s\n\t" + "mul v25.8h, v25.8h, v1.h[1]\n\t" + "smlsl v23.4s, v25.4h, v1.h[0]\n\t" + "smlsl2 v24.4s, v25.8h, v1.h[0]\n\t" + "shrn v22.4h, v23.4s, #16\n\t" + "shrn2 v22.8h, v24.4s, #16\n\t" + "smlal v26.4s, v22.4h, v0.4h\n\t" + "smlal2 v27.4s, v22.8h, v0.8h\n\t" + "xtn v24.4h, v26.4s\n\t" + "xtn2 v24.8h, v27.4s\n\t" + "mul v24.8h, v24.8h, v1.h[1]\n\t" + "smlsl v26.4s, v24.4h, v1.h[0]\n\t" + "smlsl2 v27.4s, v24.8h, v1.h[0]\n\t" + "shrn v22.4h, v26.4s, #16\n\t" + "shrn2 v22.8h, v27.4s, #16\n\t" + "smull v26.4s, v18.4h, v21.4h\n\t" + "smull2 v27.4s, v18.8h, v21.8h\n\t" + "smlal v26.4s, v19.4h, v20.4h\n\t" + "smlal2 v27.4s, v19.8h, v20.8h\n\t" + "xtn v24.4h, v26.4s\n\t" + "xtn2 v24.8h, v27.4s\n\t" + "mul v24.8h, v24.8h, v1.h[1]\n\t" + "smlsl v26.4s, v24.4h, v1.h[0]\n\t" + "smlsl2 v27.4s, v24.8h, v1.h[0]\n\t" + "shrn v23.4h, v26.4s, #16\n\t" + "shrn2 v23.8h, v27.4s, #16\n\t" + "zip1 v24.8h, v22.8h, v23.8h\n\t" + "zip2 v25.8h, v22.8h, v23.8h\n\t" + "add v28.8h, v28.8h, v24.8h\n\t" + "add v29.8h, v29.8h, v25.8h\n\t" + "stp q28, q29, [%x[r], #320]\n\t" + "ldp q28, q29, [%x[r], #352]\n\t" + "ldr q0, [x3, #176]\n\t" + "uzp1 v18.8h, v8.8h, v9.8h\n\t" + "uzp2 v19.8h, v8.8h, v9.8h\n\t" + "uzp1 v20.8h, v16.8h, v17.8h\n\t" + "uzp2 v21.8h, v16.8h, v17.8h\n\t" + "smull v26.4s, v18.4h, v20.4h\n\t" + "smull2 v27.4s, v18.8h, v20.8h\n\t" + "smull v23.4s, v19.4h, v21.4h\n\t" + "smull2 v24.4s, v19.8h, v21.8h\n\t" + "xtn v25.4h, v23.4s\n\t" + "xtn2 v25.8h, v24.4s\n\t" + "mul v25.8h, v25.8h, v1.h[1]\n\t" + "smlsl v23.4s, v25.4h, v1.h[0]\n\t" + "smlsl2 v24.4s, v25.8h, v1.h[0]\n\t" + "shrn v22.4h, v23.4s, #16\n\t" + "shrn2 v22.8h, v24.4s, #16\n\t" + "smlal v26.4s, v22.4h, v0.4h\n\t" + "smlal2 v27.4s, v22.8h, v0.8h\n\t" + "xtn v24.4h, v26.4s\n\t" + "xtn2 v24.8h, v27.4s\n\t" + "mul v24.8h, v24.8h, v1.h[1]\n\t" + "smlsl v26.4s, v24.4h, v1.h[0]\n\t" + "smlsl2 v27.4s, v24.8h, 
v1.h[0]\n\t" + "shrn v22.4h, v26.4s, #16\n\t" + "shrn2 v22.8h, v27.4s, #16\n\t" + "smull v26.4s, v18.4h, v21.4h\n\t" + "smull2 v27.4s, v18.8h, v21.8h\n\t" + "smlal v26.4s, v19.4h, v20.4h\n\t" + "smlal2 v27.4s, v19.8h, v20.8h\n\t" + "xtn v24.4h, v26.4s\n\t" + "xtn2 v24.8h, v27.4s\n\t" + "mul v24.8h, v24.8h, v1.h[1]\n\t" + "smlsl v26.4s, v24.4h, v1.h[0]\n\t" + "smlsl2 v27.4s, v24.8h, v1.h[0]\n\t" + "shrn v23.4h, v26.4s, #16\n\t" + "shrn2 v23.8h, v27.4s, #16\n\t" + "zip1 v24.8h, v22.8h, v23.8h\n\t" + "zip2 v25.8h, v22.8h, v23.8h\n\t" + "add v28.8h, v28.8h, v24.8h\n\t" + "add v29.8h, v29.8h, v25.8h\n\t" + "stp q28, q29, [%x[r], #352]\n\t" + "ldp q2, q3, [%x[a], #384]\n\t" + "ldp q4, q5, [%x[a], #416]\n\t" + "ldp q6, q7, [%x[a], #448]\n\t" + "ldp q8, q9, [%x[a], #480]\n\t" + "ldp q10, q11, [%x[b], #384]\n\t" + "ldp q12, q13, [%x[b], #416]\n\t" + "ldp q14, q15, [%x[b], #448]\n\t" + "ldp q16, q17, [%x[b], #480]\n\t" + "ldp q28, q29, [%x[r], #384]\n\t" + "ldr q0, [x3, #192]\n\t" + "uzp1 v18.8h, v2.8h, v3.8h\n\t" + "uzp2 v19.8h, v2.8h, v3.8h\n\t" + "uzp1 v20.8h, v10.8h, v11.8h\n\t" + "uzp2 v21.8h, v10.8h, v11.8h\n\t" + "smull v26.4s, v18.4h, v20.4h\n\t" + "smull2 v27.4s, v18.8h, v20.8h\n\t" + "smull v23.4s, v19.4h, v21.4h\n\t" + "smull2 v24.4s, v19.8h, v21.8h\n\t" + "xtn v25.4h, v23.4s\n\t" + "xtn2 v25.8h, v24.4s\n\t" + "mul v25.8h, v25.8h, v1.h[1]\n\t" + "smlsl v23.4s, v25.4h, v1.h[0]\n\t" + "smlsl2 v24.4s, v25.8h, v1.h[0]\n\t" + "shrn v22.4h, v23.4s, #16\n\t" + "shrn2 v22.8h, v24.4s, #16\n\t" + "smlal v26.4s, v22.4h, v0.4h\n\t" + "smlal2 v27.4s, v22.8h, v0.8h\n\t" + "xtn v24.4h, v26.4s\n\t" + "xtn2 v24.8h, v27.4s\n\t" + "mul v24.8h, v24.8h, v1.h[1]\n\t" + "smlsl v26.4s, v24.4h, v1.h[0]\n\t" + "smlsl2 v27.4s, v24.8h, v1.h[0]\n\t" + "shrn v22.4h, v26.4s, #16\n\t" + "shrn2 v22.8h, v27.4s, #16\n\t" + "smull v26.4s, v18.4h, v21.4h\n\t" + "smull2 v27.4s, v18.8h, v21.8h\n\t" + "smlal v26.4s, v19.4h, v20.4h\n\t" + "smlal2 v27.4s, v19.8h, v20.8h\n\t" + "xtn v24.4h, v26.4s\n\t" + "xtn2 v24.8h, v27.4s\n\t" + "mul v24.8h, v24.8h, v1.h[1]\n\t" + "smlsl v26.4s, v24.4h, v1.h[0]\n\t" + "smlsl2 v27.4s, v24.8h, v1.h[0]\n\t" + "shrn v23.4h, v26.4s, #16\n\t" + "shrn2 v23.8h, v27.4s, #16\n\t" + "zip1 v24.8h, v22.8h, v23.8h\n\t" + "zip2 v25.8h, v22.8h, v23.8h\n\t" + "add v28.8h, v28.8h, v24.8h\n\t" + "add v29.8h, v29.8h, v25.8h\n\t" + "stp q28, q29, [%x[r], #384]\n\t" + "ldp q28, q29, [%x[r], #416]\n\t" + "ldr q0, [x3, #208]\n\t" + "uzp1 v18.8h, v4.8h, v5.8h\n\t" + "uzp2 v19.8h, v4.8h, v5.8h\n\t" + "uzp1 v20.8h, v12.8h, v13.8h\n\t" + "uzp2 v21.8h, v12.8h, v13.8h\n\t" + "smull v26.4s, v18.4h, v20.4h\n\t" + "smull2 v27.4s, v18.8h, v20.8h\n\t" + "smull v23.4s, v19.4h, v21.4h\n\t" + "smull2 v24.4s, v19.8h, v21.8h\n\t" + "xtn v25.4h, v23.4s\n\t" + "xtn2 v25.8h, v24.4s\n\t" + "mul v25.8h, v25.8h, v1.h[1]\n\t" + "smlsl v23.4s, v25.4h, v1.h[0]\n\t" + "smlsl2 v24.4s, v25.8h, v1.h[0]\n\t" + "shrn v22.4h, v23.4s, #16\n\t" + "shrn2 v22.8h, v24.4s, #16\n\t" + "smlal v26.4s, v22.4h, v0.4h\n\t" + "smlal2 v27.4s, v22.8h, v0.8h\n\t" + "xtn v24.4h, v26.4s\n\t" + "xtn2 v24.8h, v27.4s\n\t" + "mul v24.8h, v24.8h, v1.h[1]\n\t" + "smlsl v26.4s, v24.4h, v1.h[0]\n\t" + "smlsl2 v27.4s, v24.8h, v1.h[0]\n\t" + "shrn v22.4h, v26.4s, #16\n\t" + "shrn2 v22.8h, v27.4s, #16\n\t" + "smull v26.4s, v18.4h, v21.4h\n\t" + "smull2 v27.4s, v18.8h, v21.8h\n\t" + "smlal v26.4s, v19.4h, v20.4h\n\t" + "smlal2 v27.4s, v19.8h, v20.8h\n\t" + "xtn v24.4h, v26.4s\n\t" + "xtn2 v24.8h, v27.4s\n\t" + "mul v24.8h, v24.8h, v1.h[1]\n\t" + "smlsl v26.4s, v24.4h, 
v1.h[0]\n\t" + "smlsl2 v27.4s, v24.8h, v1.h[0]\n\t" + "shrn v23.4h, v26.4s, #16\n\t" + "shrn2 v23.8h, v27.4s, #16\n\t" + "zip1 v24.8h, v22.8h, v23.8h\n\t" + "zip2 v25.8h, v22.8h, v23.8h\n\t" + "add v28.8h, v28.8h, v24.8h\n\t" + "add v29.8h, v29.8h, v25.8h\n\t" + "stp q28, q29, [%x[r], #416]\n\t" + "ldp q28, q29, [%x[r], #448]\n\t" + "ldr q0, [x3, #224]\n\t" + "uzp1 v18.8h, v6.8h, v7.8h\n\t" + "uzp2 v19.8h, v6.8h, v7.8h\n\t" + "uzp1 v20.8h, v14.8h, v15.8h\n\t" + "uzp2 v21.8h, v14.8h, v15.8h\n\t" + "smull v26.4s, v18.4h, v20.4h\n\t" + "smull2 v27.4s, v18.8h, v20.8h\n\t" + "smull v23.4s, v19.4h, v21.4h\n\t" + "smull2 v24.4s, v19.8h, v21.8h\n\t" + "xtn v25.4h, v23.4s\n\t" + "xtn2 v25.8h, v24.4s\n\t" + "mul v25.8h, v25.8h, v1.h[1]\n\t" + "smlsl v23.4s, v25.4h, v1.h[0]\n\t" + "smlsl2 v24.4s, v25.8h, v1.h[0]\n\t" + "shrn v22.4h, v23.4s, #16\n\t" + "shrn2 v22.8h, v24.4s, #16\n\t" + "smlal v26.4s, v22.4h, v0.4h\n\t" + "smlal2 v27.4s, v22.8h, v0.8h\n\t" + "xtn v24.4h, v26.4s\n\t" + "xtn2 v24.8h, v27.4s\n\t" + "mul v24.8h, v24.8h, v1.h[1]\n\t" + "smlsl v26.4s, v24.4h, v1.h[0]\n\t" + "smlsl2 v27.4s, v24.8h, v1.h[0]\n\t" + "shrn v22.4h, v26.4s, #16\n\t" + "shrn2 v22.8h, v27.4s, #16\n\t" + "smull v26.4s, v18.4h, v21.4h\n\t" + "smull2 v27.4s, v18.8h, v21.8h\n\t" + "smlal v26.4s, v19.4h, v20.4h\n\t" + "smlal2 v27.4s, v19.8h, v20.8h\n\t" + "xtn v24.4h, v26.4s\n\t" + "xtn2 v24.8h, v27.4s\n\t" + "mul v24.8h, v24.8h, v1.h[1]\n\t" + "smlsl v26.4s, v24.4h, v1.h[0]\n\t" + "smlsl2 v27.4s, v24.8h, v1.h[0]\n\t" + "shrn v23.4h, v26.4s, #16\n\t" + "shrn2 v23.8h, v27.4s, #16\n\t" + "zip1 v24.8h, v22.8h, v23.8h\n\t" + "zip2 v25.8h, v22.8h, v23.8h\n\t" + "add v28.8h, v28.8h, v24.8h\n\t" + "add v29.8h, v29.8h, v25.8h\n\t" + "stp q28, q29, [%x[r], #448]\n\t" + "ldp q28, q29, [%x[r], #480]\n\t" + "ldr q0, [x3, #240]\n\t" + "uzp1 v18.8h, v8.8h, v9.8h\n\t" + "uzp2 v19.8h, v8.8h, v9.8h\n\t" + "uzp1 v20.8h, v16.8h, v17.8h\n\t" + "uzp2 v21.8h, v16.8h, v17.8h\n\t" + "smull v26.4s, v18.4h, v20.4h\n\t" + "smull2 v27.4s, v18.8h, v20.8h\n\t" + "smull v23.4s, v19.4h, v21.4h\n\t" + "smull2 v24.4s, v19.8h, v21.8h\n\t" + "xtn v25.4h, v23.4s\n\t" + "xtn2 v25.8h, v24.4s\n\t" + "mul v25.8h, v25.8h, v1.h[1]\n\t" + "smlsl v23.4s, v25.4h, v1.h[0]\n\t" + "smlsl2 v24.4s, v25.8h, v1.h[0]\n\t" + "shrn v22.4h, v23.4s, #16\n\t" + "shrn2 v22.8h, v24.4s, #16\n\t" + "smlal v26.4s, v22.4h, v0.4h\n\t" + "smlal2 v27.4s, v22.8h, v0.8h\n\t" + "xtn v24.4h, v26.4s\n\t" + "xtn2 v24.8h, v27.4s\n\t" + "mul v24.8h, v24.8h, v1.h[1]\n\t" + "smlsl v26.4s, v24.4h, v1.h[0]\n\t" + "smlsl2 v27.4s, v24.8h, v1.h[0]\n\t" + "shrn v22.4h, v26.4s, #16\n\t" + "shrn2 v22.8h, v27.4s, #16\n\t" + "smull v26.4s, v18.4h, v21.4h\n\t" + "smull2 v27.4s, v18.8h, v21.8h\n\t" + "smlal v26.4s, v19.4h, v20.4h\n\t" + "smlal2 v27.4s, v19.8h, v20.8h\n\t" + "xtn v24.4h, v26.4s\n\t" + "xtn2 v24.8h, v27.4s\n\t" + "mul v24.8h, v24.8h, v1.h[1]\n\t" + "smlsl v26.4s, v24.4h, v1.h[0]\n\t" + "smlsl2 v27.4s, v24.8h, v1.h[0]\n\t" + "shrn v23.4h, v26.4s, #16\n\t" + "shrn2 v23.8h, v27.4s, #16\n\t" + "zip1 v24.8h, v22.8h, v23.8h\n\t" + "zip2 v25.8h, v22.8h, v23.8h\n\t" + "add v28.8h, v28.8h, v24.8h\n\t" + "add v29.8h, v29.8h, v25.8h\n\t" + "stp q28, q29, [%x[r], #480]\n\t" + : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) + : [L_kyber_aarch64_q] "S" (L_kyber_aarch64_q), [L_kyber_aarch64_consts] "S" (L_kyber_aarch64_consts), [L_sha3_aarch64_r] "S" (L_sha3_aarch64_r), [L_kyber_aarch64_zetas] "S" (L_kyber_aarch64_zetas), [L_kyber_aarch64_zetas_qinv] "S" (L_kyber_aarch64_zetas_qinv), [L_kyber_aarch64_zetas_inv] "S" 
(L_kyber_aarch64_zetas_inv), [L_kyber_aarch64_zetas_inv_qinv] "S" (L_kyber_aarch64_zetas_inv_qinv), [L_kyber_aarch64_zetas_mul] "S" (L_kyber_aarch64_zetas_mul) + : "memory", "x3", "x4", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "cc" + ); +} + +void kyber_csubq_neon(sword16* p) +{ + __asm__ __volatile__ ( +#ifndef __APPLE__ + "adrp x1, %[L_kyber_aarch64_q]\n\t" + "add x1, x1, :lo12:%[L_kyber_aarch64_q]\n\t" +#else + "adrp x1, %[L_kyber_aarch64_q]@PAGE\n\t" + "add x1, x1, %[L_kyber_aarch64_q]@PAGEOFF\n\t" +#endif /* __APPLE__ */ + "ldr q20, [x1]\n\t" + "ld4 {v0.8h, v1.8h, v2.8h, v3.8h}, [%x[p]], #0x40\n\t" + "ld4 {v4.8h, v5.8h, v6.8h, v7.8h}, [%x[p]], #0x40\n\t" + "ld4 {v8.8h, v9.8h, v10.8h, v11.8h}, [%x[p]], #0x40\n\t" + "ld4 {v12.8h, v13.8h, v14.8h, v15.8h}, [%x[p]], #0x40\n\t" + "sub %x[p], %x[p], #0x100\n\t" + "sub v0.8h, v0.8h, v20.8h\n\t" + "sub v1.8h, v1.8h, v20.8h\n\t" + "sub v2.8h, v2.8h, v20.8h\n\t" + "sub v3.8h, v3.8h, v20.8h\n\t" + "sub v4.8h, v4.8h, v20.8h\n\t" + "sub v5.8h, v5.8h, v20.8h\n\t" + "sub v6.8h, v6.8h, v20.8h\n\t" + "sub v7.8h, v7.8h, v20.8h\n\t" + "sub v8.8h, v8.8h, v20.8h\n\t" + "sub v9.8h, v9.8h, v20.8h\n\t" + "sub v10.8h, v10.8h, v20.8h\n\t" + "sub v11.8h, v11.8h, v20.8h\n\t" + "sub v12.8h, v12.8h, v20.8h\n\t" + "sub v13.8h, v13.8h, v20.8h\n\t" + "sub v14.8h, v14.8h, v20.8h\n\t" + "sub v15.8h, v15.8h, v20.8h\n\t" + "sshr v16.8h, v0.8h, #15\n\t" + "sshr v17.8h, v1.8h, #15\n\t" + "sshr v18.8h, v2.8h, #15\n\t" + "sshr v19.8h, v3.8h, #15\n\t" + "and v16.16b, v16.16b, v20.16b\n\t" + "and v17.16b, v17.16b, v20.16b\n\t" + "and v18.16b, v18.16b, v20.16b\n\t" + "and v19.16b, v19.16b, v20.16b\n\t" + "add v0.8h, v0.8h, v16.8h\n\t" + "add v1.8h, v1.8h, v17.8h\n\t" + "add v2.8h, v2.8h, v18.8h\n\t" + "add v3.8h, v3.8h, v19.8h\n\t" + "sshr v16.8h, v4.8h, #15\n\t" + "sshr v17.8h, v5.8h, #15\n\t" + "sshr v18.8h, v6.8h, #15\n\t" + "sshr v19.8h, v7.8h, #15\n\t" + "and v16.16b, v16.16b, v20.16b\n\t" + "and v17.16b, v17.16b, v20.16b\n\t" + "and v18.16b, v18.16b, v20.16b\n\t" + "and v19.16b, v19.16b, v20.16b\n\t" + "add v4.8h, v4.8h, v16.8h\n\t" + "add v5.8h, v5.8h, v17.8h\n\t" + "add v6.8h, v6.8h, v18.8h\n\t" + "add v7.8h, v7.8h, v19.8h\n\t" + "sshr v16.8h, v8.8h, #15\n\t" + "sshr v17.8h, v9.8h, #15\n\t" + "sshr v18.8h, v10.8h, #15\n\t" + "sshr v19.8h, v11.8h, #15\n\t" + "and v16.16b, v16.16b, v20.16b\n\t" + "and v17.16b, v17.16b, v20.16b\n\t" + "and v18.16b, v18.16b, v20.16b\n\t" + "and v19.16b, v19.16b, v20.16b\n\t" + "add v8.8h, v8.8h, v16.8h\n\t" + "add v9.8h, v9.8h, v17.8h\n\t" + "add v10.8h, v10.8h, v18.8h\n\t" + "add v11.8h, v11.8h, v19.8h\n\t" + "sshr v16.8h, v12.8h, #15\n\t" + "sshr v17.8h, v13.8h, #15\n\t" + "sshr v18.8h, v14.8h, #15\n\t" + "sshr v19.8h, v15.8h, #15\n\t" + "and v16.16b, v16.16b, v20.16b\n\t" + "and v17.16b, v17.16b, v20.16b\n\t" + "and v18.16b, v18.16b, v20.16b\n\t" + "and v19.16b, v19.16b, v20.16b\n\t" + "add v12.8h, v12.8h, v16.8h\n\t" + "add v13.8h, v13.8h, v17.8h\n\t" + "add v14.8h, v14.8h, v18.8h\n\t" + "add v15.8h, v15.8h, v19.8h\n\t" + "st4 {v0.8h, v1.8h, v2.8h, v3.8h}, [%x[p]], #0x40\n\t" + "st4 {v4.8h, v5.8h, v6.8h, v7.8h}, [%x[p]], #0x40\n\t" + "st4 {v8.8h, v9.8h, v10.8h, v11.8h}, [%x[p]], #0x40\n\t" + "st4 {v12.8h, v13.8h, v14.8h, v15.8h}, [%x[p]], #0x40\n\t" + "ld4 {v0.8h, v1.8h, v2.8h, v3.8h}, [%x[p]], #0x40\n\t" + "ld4 {v4.8h, v5.8h, v6.8h, v7.8h}, [%x[p]], #0x40\n\t" + "ld4 {v8.8h, 
v9.8h, v10.8h, v11.8h}, [%x[p]], #0x40\n\t" + "ld4 {v12.8h, v13.8h, v14.8h, v15.8h}, [%x[p]], #0x40\n\t" + "sub %x[p], %x[p], #0x100\n\t" + "sub v0.8h, v0.8h, v20.8h\n\t" + "sub v1.8h, v1.8h, v20.8h\n\t" + "sub v2.8h, v2.8h, v20.8h\n\t" + "sub v3.8h, v3.8h, v20.8h\n\t" + "sub v4.8h, v4.8h, v20.8h\n\t" + "sub v5.8h, v5.8h, v20.8h\n\t" + "sub v6.8h, v6.8h, v20.8h\n\t" + "sub v7.8h, v7.8h, v20.8h\n\t" + "sub v8.8h, v8.8h, v20.8h\n\t" + "sub v9.8h, v9.8h, v20.8h\n\t" + "sub v10.8h, v10.8h, v20.8h\n\t" + "sub v11.8h, v11.8h, v20.8h\n\t" + "sub v12.8h, v12.8h, v20.8h\n\t" + "sub v13.8h, v13.8h, v20.8h\n\t" + "sub v14.8h, v14.8h, v20.8h\n\t" + "sub v15.8h, v15.8h, v20.8h\n\t" + "sshr v16.8h, v0.8h, #15\n\t" + "sshr v17.8h, v1.8h, #15\n\t" + "sshr v18.8h, v2.8h, #15\n\t" + "sshr v19.8h, v3.8h, #15\n\t" + "and v16.16b, v16.16b, v20.16b\n\t" + "and v17.16b, v17.16b, v20.16b\n\t" + "and v18.16b, v18.16b, v20.16b\n\t" + "and v19.16b, v19.16b, v20.16b\n\t" + "add v0.8h, v0.8h, v16.8h\n\t" + "add v1.8h, v1.8h, v17.8h\n\t" + "add v2.8h, v2.8h, v18.8h\n\t" + "add v3.8h, v3.8h, v19.8h\n\t" + "sshr v16.8h, v4.8h, #15\n\t" + "sshr v17.8h, v5.8h, #15\n\t" + "sshr v18.8h, v6.8h, #15\n\t" + "sshr v19.8h, v7.8h, #15\n\t" + "and v16.16b, v16.16b, v20.16b\n\t" + "and v17.16b, v17.16b, v20.16b\n\t" + "and v18.16b, v18.16b, v20.16b\n\t" + "and v19.16b, v19.16b, v20.16b\n\t" + "add v4.8h, v4.8h, v16.8h\n\t" + "add v5.8h, v5.8h, v17.8h\n\t" + "add v6.8h, v6.8h, v18.8h\n\t" + "add v7.8h, v7.8h, v19.8h\n\t" + "sshr v16.8h, v8.8h, #15\n\t" + "sshr v17.8h, v9.8h, #15\n\t" + "sshr v18.8h, v10.8h, #15\n\t" + "sshr v19.8h, v11.8h, #15\n\t" + "and v16.16b, v16.16b, v20.16b\n\t" + "and v17.16b, v17.16b, v20.16b\n\t" + "and v18.16b, v18.16b, v20.16b\n\t" + "and v19.16b, v19.16b, v20.16b\n\t" + "add v8.8h, v8.8h, v16.8h\n\t" + "add v9.8h, v9.8h, v17.8h\n\t" + "add v10.8h, v10.8h, v18.8h\n\t" + "add v11.8h, v11.8h, v19.8h\n\t" + "sshr v16.8h, v12.8h, #15\n\t" + "sshr v17.8h, v13.8h, #15\n\t" + "sshr v18.8h, v14.8h, #15\n\t" + "sshr v19.8h, v15.8h, #15\n\t" + "and v16.16b, v16.16b, v20.16b\n\t" + "and v17.16b, v17.16b, v20.16b\n\t" + "and v18.16b, v18.16b, v20.16b\n\t" + "and v19.16b, v19.16b, v20.16b\n\t" + "add v12.8h, v12.8h, v16.8h\n\t" + "add v13.8h, v13.8h, v17.8h\n\t" + "add v14.8h, v14.8h, v18.8h\n\t" + "add v15.8h, v15.8h, v19.8h\n\t" + "st4 {v0.8h, v1.8h, v2.8h, v3.8h}, [%x[p]], #0x40\n\t" + "st4 {v4.8h, v5.8h, v6.8h, v7.8h}, [%x[p]], #0x40\n\t" + "st4 {v8.8h, v9.8h, v10.8h, v11.8h}, [%x[p]], #0x40\n\t" + "st4 {v12.8h, v13.8h, v14.8h, v15.8h}, [%x[p]], #0x40\n\t" + : [p] "+r" (p) + : [L_kyber_aarch64_q] "S" (L_kyber_aarch64_q), [L_kyber_aarch64_consts] "S" (L_kyber_aarch64_consts), [L_sha3_aarch64_r] "S" (L_sha3_aarch64_r), [L_kyber_aarch64_zetas] "S" (L_kyber_aarch64_zetas), [L_kyber_aarch64_zetas_qinv] "S" (L_kyber_aarch64_zetas_qinv), [L_kyber_aarch64_zetas_inv] "S" (L_kyber_aarch64_zetas_inv), [L_kyber_aarch64_zetas_inv_qinv] "S" (L_kyber_aarch64_zetas_inv_qinv), [L_kyber_aarch64_zetas_mul] "S" (L_kyber_aarch64_zetas_mul) + : "memory", "x1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "cc" + ); +} + +void kyber_add_reduce(sword16* r, const sword16* a) +{ + __asm__ __volatile__ ( +#ifndef __APPLE__ + "adrp x2, %[L_kyber_aarch64_consts]\n\t" + "add x2, x2, :lo12:%[L_kyber_aarch64_consts]\n\t" +#else + "adrp x2, %[L_kyber_aarch64_consts]@PAGE\n\t" + "add x2, x2, %[L_kyber_aarch64_consts]@PAGEOFF\n\t" +#endif /* __APPLE__ */ + 
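+ /* kyber_add_reduce: r[i] = reduce(r[i] + a[i]) for all 256 coefficients,
+  * 64 per pass. A hedged scalar sketch of the Barrett reduction the
+  * sqdmulh/sshr/mls triples below implement (assuming v0.h[0] = q = 3329 and
+  * v0.h[2] is the Barrett constant, presumably 20159 = round(2^26 / q)):
+  *     sword16 t = (sword16)(((sword32)c * 20159 * 2) >> 16) >> 11;
+  *     c -= t * 3329;
+  */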
"ldr q0, [x2]\n\t" + "ld4 {v1.8h, v2.8h, v3.8h, v4.8h}, [%x[r]], #0x40\n\t" + "ld4 {v5.8h, v6.8h, v7.8h, v8.8h}, [%x[r]], #0x40\n\t" + "ld4 {v9.8h, v10.8h, v11.8h, v12.8h}, [%x[a]], #0x40\n\t" + "ld4 {v13.8h, v14.8h, v15.8h, v16.8h}, [%x[a]], #0x40\n\t" + "sub %x[r], %x[r], #0x80\n\t" + "add v1.8h, v1.8h, v9.8h\n\t" + "add v2.8h, v2.8h, v10.8h\n\t" + "add v3.8h, v3.8h, v11.8h\n\t" + "add v4.8h, v4.8h, v12.8h\n\t" + "add v5.8h, v5.8h, v13.8h\n\t" + "add v6.8h, v6.8h, v14.8h\n\t" + "add v7.8h, v7.8h, v15.8h\n\t" + "add v8.8h, v8.8h, v16.8h\n\t" + "sqdmulh v17.8h, v1.8h, v0.h[2]\n\t" + "sqdmulh v18.8h, v2.8h, v0.h[2]\n\t" + "sshr v17.8h, v17.8h, #11\n\t" + "sshr v18.8h, v18.8h, #11\n\t" + "mls v1.8h, v17.8h, v0.h[0]\n\t" + "mls v2.8h, v18.8h, v0.h[0]\n\t" + "sqdmulh v17.8h, v3.8h, v0.h[2]\n\t" + "sqdmulh v18.8h, v4.8h, v0.h[2]\n\t" + "sshr v17.8h, v17.8h, #11\n\t" + "sshr v18.8h, v18.8h, #11\n\t" + "mls v3.8h, v17.8h, v0.h[0]\n\t" + "mls v4.8h, v18.8h, v0.h[0]\n\t" + "sqdmulh v17.8h, v5.8h, v0.h[2]\n\t" + "sqdmulh v18.8h, v6.8h, v0.h[2]\n\t" + "sshr v17.8h, v17.8h, #11\n\t" + "sshr v18.8h, v18.8h, #11\n\t" + "mls v5.8h, v17.8h, v0.h[0]\n\t" + "mls v6.8h, v18.8h, v0.h[0]\n\t" + "sqdmulh v17.8h, v7.8h, v0.h[2]\n\t" + "sqdmulh v18.8h, v8.8h, v0.h[2]\n\t" + "sshr v17.8h, v17.8h, #11\n\t" + "sshr v18.8h, v18.8h, #11\n\t" + "mls v7.8h, v17.8h, v0.h[0]\n\t" + "mls v8.8h, v18.8h, v0.h[0]\n\t" + "st4 {v1.8h, v2.8h, v3.8h, v4.8h}, [%x[r]], #0x40\n\t" + "st4 {v5.8h, v6.8h, v7.8h, v8.8h}, [%x[r]], #0x40\n\t" + "ld4 {v1.8h, v2.8h, v3.8h, v4.8h}, [%x[r]], #0x40\n\t" + "ld4 {v5.8h, v6.8h, v7.8h, v8.8h}, [%x[r]], #0x40\n\t" + "ld4 {v9.8h, v10.8h, v11.8h, v12.8h}, [%x[a]], #0x40\n\t" + "ld4 {v13.8h, v14.8h, v15.8h, v16.8h}, [%x[a]], #0x40\n\t" + "sub %x[r], %x[r], #0x80\n\t" + "add v1.8h, v1.8h, v9.8h\n\t" + "add v2.8h, v2.8h, v10.8h\n\t" + "add v3.8h, v3.8h, v11.8h\n\t" + "add v4.8h, v4.8h, v12.8h\n\t" + "add v5.8h, v5.8h, v13.8h\n\t" + "add v6.8h, v6.8h, v14.8h\n\t" + "add v7.8h, v7.8h, v15.8h\n\t" + "add v8.8h, v8.8h, v16.8h\n\t" + "sqdmulh v17.8h, v1.8h, v0.h[2]\n\t" + "sqdmulh v18.8h, v2.8h, v0.h[2]\n\t" + "sshr v17.8h, v17.8h, #11\n\t" + "sshr v18.8h, v18.8h, #11\n\t" + "mls v1.8h, v17.8h, v0.h[0]\n\t" + "mls v2.8h, v18.8h, v0.h[0]\n\t" + "sqdmulh v17.8h, v3.8h, v0.h[2]\n\t" + "sqdmulh v18.8h, v4.8h, v0.h[2]\n\t" + "sshr v17.8h, v17.8h, #11\n\t" + "sshr v18.8h, v18.8h, #11\n\t" + "mls v3.8h, v17.8h, v0.h[0]\n\t" + "mls v4.8h, v18.8h, v0.h[0]\n\t" + "sqdmulh v17.8h, v5.8h, v0.h[2]\n\t" + "sqdmulh v18.8h, v6.8h, v0.h[2]\n\t" + "sshr v17.8h, v17.8h, #11\n\t" + "sshr v18.8h, v18.8h, #11\n\t" + "mls v5.8h, v17.8h, v0.h[0]\n\t" + "mls v6.8h, v18.8h, v0.h[0]\n\t" + "sqdmulh v17.8h, v7.8h, v0.h[2]\n\t" + "sqdmulh v18.8h, v8.8h, v0.h[2]\n\t" + "sshr v17.8h, v17.8h, #11\n\t" + "sshr v18.8h, v18.8h, #11\n\t" + "mls v7.8h, v17.8h, v0.h[0]\n\t" + "mls v8.8h, v18.8h, v0.h[0]\n\t" + "st4 {v1.8h, v2.8h, v3.8h, v4.8h}, [%x[r]], #0x40\n\t" + "st4 {v5.8h, v6.8h, v7.8h, v8.8h}, [%x[r]], #0x40\n\t" + "ld4 {v1.8h, v2.8h, v3.8h, v4.8h}, [%x[r]], #0x40\n\t" + "ld4 {v5.8h, v6.8h, v7.8h, v8.8h}, [%x[r]], #0x40\n\t" + "ld4 {v9.8h, v10.8h, v11.8h, v12.8h}, [%x[a]], #0x40\n\t" + "ld4 {v13.8h, v14.8h, v15.8h, v16.8h}, [%x[a]], #0x40\n\t" + "sub %x[r], %x[r], #0x80\n\t" + "add v1.8h, v1.8h, v9.8h\n\t" + "add v2.8h, v2.8h, v10.8h\n\t" + "add v3.8h, v3.8h, v11.8h\n\t" + "add v4.8h, v4.8h, v12.8h\n\t" + "add v5.8h, v5.8h, v13.8h\n\t" + "add v6.8h, v6.8h, v14.8h\n\t" + "add v7.8h, v7.8h, v15.8h\n\t" + "add v8.8h, v8.8h, v16.8h\n\t" + 
"sqdmulh v17.8h, v1.8h, v0.h[2]\n\t" + "sqdmulh v18.8h, v2.8h, v0.h[2]\n\t" + "sshr v17.8h, v17.8h, #11\n\t" + "sshr v18.8h, v18.8h, #11\n\t" + "mls v1.8h, v17.8h, v0.h[0]\n\t" + "mls v2.8h, v18.8h, v0.h[0]\n\t" + "sqdmulh v17.8h, v3.8h, v0.h[2]\n\t" + "sqdmulh v18.8h, v4.8h, v0.h[2]\n\t" + "sshr v17.8h, v17.8h, #11\n\t" + "sshr v18.8h, v18.8h, #11\n\t" + "mls v3.8h, v17.8h, v0.h[0]\n\t" + "mls v4.8h, v18.8h, v0.h[0]\n\t" + "sqdmulh v17.8h, v5.8h, v0.h[2]\n\t" + "sqdmulh v18.8h, v6.8h, v0.h[2]\n\t" + "sshr v17.8h, v17.8h, #11\n\t" + "sshr v18.8h, v18.8h, #11\n\t" + "mls v5.8h, v17.8h, v0.h[0]\n\t" + "mls v6.8h, v18.8h, v0.h[0]\n\t" + "sqdmulh v17.8h, v7.8h, v0.h[2]\n\t" + "sqdmulh v18.8h, v8.8h, v0.h[2]\n\t" + "sshr v17.8h, v17.8h, #11\n\t" + "sshr v18.8h, v18.8h, #11\n\t" + "mls v7.8h, v17.8h, v0.h[0]\n\t" + "mls v8.8h, v18.8h, v0.h[0]\n\t" + "st4 {v1.8h, v2.8h, v3.8h, v4.8h}, [%x[r]], #0x40\n\t" + "st4 {v5.8h, v6.8h, v7.8h, v8.8h}, [%x[r]], #0x40\n\t" + "ld4 {v1.8h, v2.8h, v3.8h, v4.8h}, [%x[r]], #0x40\n\t" + "ld4 {v5.8h, v6.8h, v7.8h, v8.8h}, [%x[r]], #0x40\n\t" + "ld4 {v9.8h, v10.8h, v11.8h, v12.8h}, [%x[a]], #0x40\n\t" + "ld4 {v13.8h, v14.8h, v15.8h, v16.8h}, [%x[a]], #0x40\n\t" + "sub %x[r], %x[r], #0x80\n\t" + "add v1.8h, v1.8h, v9.8h\n\t" + "add v2.8h, v2.8h, v10.8h\n\t" + "add v3.8h, v3.8h, v11.8h\n\t" + "add v4.8h, v4.8h, v12.8h\n\t" + "add v5.8h, v5.8h, v13.8h\n\t" + "add v6.8h, v6.8h, v14.8h\n\t" + "add v7.8h, v7.8h, v15.8h\n\t" + "add v8.8h, v8.8h, v16.8h\n\t" + "sqdmulh v17.8h, v1.8h, v0.h[2]\n\t" + "sqdmulh v18.8h, v2.8h, v0.h[2]\n\t" + "sshr v17.8h, v17.8h, #11\n\t" + "sshr v18.8h, v18.8h, #11\n\t" + "mls v1.8h, v17.8h, v0.h[0]\n\t" + "mls v2.8h, v18.8h, v0.h[0]\n\t" + "sqdmulh v17.8h, v3.8h, v0.h[2]\n\t" + "sqdmulh v18.8h, v4.8h, v0.h[2]\n\t" + "sshr v17.8h, v17.8h, #11\n\t" + "sshr v18.8h, v18.8h, #11\n\t" + "mls v3.8h, v17.8h, v0.h[0]\n\t" + "mls v4.8h, v18.8h, v0.h[0]\n\t" + "sqdmulh v17.8h, v5.8h, v0.h[2]\n\t" + "sqdmulh v18.8h, v6.8h, v0.h[2]\n\t" + "sshr v17.8h, v17.8h, #11\n\t" + "sshr v18.8h, v18.8h, #11\n\t" + "mls v5.8h, v17.8h, v0.h[0]\n\t" + "mls v6.8h, v18.8h, v0.h[0]\n\t" + "sqdmulh v17.8h, v7.8h, v0.h[2]\n\t" + "sqdmulh v18.8h, v8.8h, v0.h[2]\n\t" + "sshr v17.8h, v17.8h, #11\n\t" + "sshr v18.8h, v18.8h, #11\n\t" + "mls v7.8h, v17.8h, v0.h[0]\n\t" + "mls v8.8h, v18.8h, v0.h[0]\n\t" + "st4 {v1.8h, v2.8h, v3.8h, v4.8h}, [%x[r]], #0x40\n\t" + "st4 {v5.8h, v6.8h, v7.8h, v8.8h}, [%x[r]], #0x40\n\t" + : [r] "+r" (r), [a] "+r" (a) + : [L_kyber_aarch64_q] "S" (L_kyber_aarch64_q), [L_kyber_aarch64_consts] "S" (L_kyber_aarch64_consts), [L_sha3_aarch64_r] "S" (L_sha3_aarch64_r), [L_kyber_aarch64_zetas] "S" (L_kyber_aarch64_zetas), [L_kyber_aarch64_zetas_qinv] "S" (L_kyber_aarch64_zetas_qinv), [L_kyber_aarch64_zetas_inv] "S" (L_kyber_aarch64_zetas_inv), [L_kyber_aarch64_zetas_inv_qinv] "S" (L_kyber_aarch64_zetas_inv_qinv), [L_kyber_aarch64_zetas_mul] "S" (L_kyber_aarch64_zetas_mul) + : "memory", "x2", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "cc" + ); +} + +void kyber_add3_reduce(sword16* r, const sword16* a, const sword16* b) +{ + __asm__ __volatile__ ( +#ifndef __APPLE__ + "adrp x3, %[L_kyber_aarch64_consts]\n\t" + "add x3, x3, :lo12:%[L_kyber_aarch64_consts]\n\t" +#else + "adrp x3, %[L_kyber_aarch64_consts]@PAGE\n\t" + "add x3, x3, %[L_kyber_aarch64_consts]@PAGEOFF\n\t" +#endif /* __APPLE__ */ + "ldr q0, [x3]\n\t" + "ld4 {v1.8h, v2.8h, v3.8h, v4.8h}, [%x[r]], #0x40\n\t" + "ld4 {v5.8h, 
v6.8h, v7.8h, v8.8h}, [%x[r]], #0x40\n\t" + "ld4 {v9.8h, v10.8h, v11.8h, v12.8h}, [%x[a]], #0x40\n\t" + "ld4 {v13.8h, v14.8h, v15.8h, v16.8h}, [%x[a]], #0x40\n\t" + "ld4 {v17.8h, v18.8h, v19.8h, v20.8h}, [%x[b]], #0x40\n\t" + "ld4 {v21.8h, v22.8h, v23.8h, v24.8h}, [%x[b]], #0x40\n\t" + "sub %x[r], %x[r], #0x80\n\t" + "add v1.8h, v1.8h, v9.8h\n\t" + "add v2.8h, v2.8h, v10.8h\n\t" + "add v3.8h, v3.8h, v11.8h\n\t" + "add v4.8h, v4.8h, v12.8h\n\t" + "add v5.8h, v5.8h, v13.8h\n\t" + "add v6.8h, v6.8h, v14.8h\n\t" + "add v7.8h, v7.8h, v15.8h\n\t" + "add v8.8h, v8.8h, v16.8h\n\t" + "add v1.8h, v1.8h, v17.8h\n\t" + "add v2.8h, v2.8h, v18.8h\n\t" + "add v3.8h, v3.8h, v19.8h\n\t" + "add v4.8h, v4.8h, v20.8h\n\t" + "add v5.8h, v5.8h, v21.8h\n\t" + "add v6.8h, v6.8h, v22.8h\n\t" + "add v7.8h, v7.8h, v23.8h\n\t" + "add v8.8h, v8.8h, v24.8h\n\t" + "sqdmulh v25.8h, v1.8h, v0.h[2]\n\t" + "sqdmulh v26.8h, v2.8h, v0.h[2]\n\t" + "sshr v25.8h, v25.8h, #11\n\t" + "sshr v26.8h, v26.8h, #11\n\t" + "mls v1.8h, v25.8h, v0.h[0]\n\t" + "mls v2.8h, v26.8h, v0.h[0]\n\t" + "sqdmulh v25.8h, v3.8h, v0.h[2]\n\t" + "sqdmulh v26.8h, v4.8h, v0.h[2]\n\t" + "sshr v25.8h, v25.8h, #11\n\t" + "sshr v26.8h, v26.8h, #11\n\t" + "mls v3.8h, v25.8h, v0.h[0]\n\t" + "mls v4.8h, v26.8h, v0.h[0]\n\t" + "sqdmulh v25.8h, v5.8h, v0.h[2]\n\t" + "sqdmulh v26.8h, v6.8h, v0.h[2]\n\t" + "sshr v25.8h, v25.8h, #11\n\t" + "sshr v26.8h, v26.8h, #11\n\t" + "mls v5.8h, v25.8h, v0.h[0]\n\t" + "mls v6.8h, v26.8h, v0.h[0]\n\t" + "sqdmulh v25.8h, v7.8h, v0.h[2]\n\t" + "sqdmulh v26.8h, v8.8h, v0.h[2]\n\t" + "sshr v25.8h, v25.8h, #11\n\t" + "sshr v26.8h, v26.8h, #11\n\t" + "mls v7.8h, v25.8h, v0.h[0]\n\t" + "mls v8.8h, v26.8h, v0.h[0]\n\t" + "st4 {v1.8h, v2.8h, v3.8h, v4.8h}, [%x[r]], #0x40\n\t" + "st4 {v5.8h, v6.8h, v7.8h, v8.8h}, [%x[r]], #0x40\n\t" + "ld4 {v1.8h, v2.8h, v3.8h, v4.8h}, [%x[r]], #0x40\n\t" + "ld4 {v5.8h, v6.8h, v7.8h, v8.8h}, [%x[r]], #0x40\n\t" + "ld4 {v9.8h, v10.8h, v11.8h, v12.8h}, [%x[a]], #0x40\n\t" + "ld4 {v13.8h, v14.8h, v15.8h, v16.8h}, [%x[a]], #0x40\n\t" + "ld4 {v17.8h, v18.8h, v19.8h, v20.8h}, [%x[b]], #0x40\n\t" + "ld4 {v21.8h, v22.8h, v23.8h, v24.8h}, [%x[b]], #0x40\n\t" + "sub %x[r], %x[r], #0x80\n\t" + "add v1.8h, v1.8h, v9.8h\n\t" + "add v2.8h, v2.8h, v10.8h\n\t" + "add v3.8h, v3.8h, v11.8h\n\t" + "add v4.8h, v4.8h, v12.8h\n\t" + "add v5.8h, v5.8h, v13.8h\n\t" + "add v6.8h, v6.8h, v14.8h\n\t" + "add v7.8h, v7.8h, v15.8h\n\t" + "add v8.8h, v8.8h, v16.8h\n\t" + "add v1.8h, v1.8h, v17.8h\n\t" + "add v2.8h, v2.8h, v18.8h\n\t" + "add v3.8h, v3.8h, v19.8h\n\t" + "add v4.8h, v4.8h, v20.8h\n\t" + "add v5.8h, v5.8h, v21.8h\n\t" + "add v6.8h, v6.8h, v22.8h\n\t" + "add v7.8h, v7.8h, v23.8h\n\t" + "add v8.8h, v8.8h, v24.8h\n\t" + "sqdmulh v25.8h, v1.8h, v0.h[2]\n\t" + "sqdmulh v26.8h, v2.8h, v0.h[2]\n\t" + "sshr v25.8h, v25.8h, #11\n\t" + "sshr v26.8h, v26.8h, #11\n\t" + "mls v1.8h, v25.8h, v0.h[0]\n\t" + "mls v2.8h, v26.8h, v0.h[0]\n\t" + "sqdmulh v25.8h, v3.8h, v0.h[2]\n\t" + "sqdmulh v26.8h, v4.8h, v0.h[2]\n\t" + "sshr v25.8h, v25.8h, #11\n\t" + "sshr v26.8h, v26.8h, #11\n\t" + "mls v3.8h, v25.8h, v0.h[0]\n\t" + "mls v4.8h, v26.8h, v0.h[0]\n\t" + "sqdmulh v25.8h, v5.8h, v0.h[2]\n\t" + "sqdmulh v26.8h, v6.8h, v0.h[2]\n\t" + "sshr v25.8h, v25.8h, #11\n\t" + "sshr v26.8h, v26.8h, #11\n\t" + "mls v5.8h, v25.8h, v0.h[0]\n\t" + "mls v6.8h, v26.8h, v0.h[0]\n\t" + "sqdmulh v25.8h, v7.8h, v0.h[2]\n\t" + "sqdmulh v26.8h, v8.8h, v0.h[2]\n\t" + "sshr v25.8h, v25.8h, #11\n\t" + "sshr v26.8h, v26.8h, #11\n\t" + "mls v7.8h, v25.8h, v0.h[0]\n\t" + 
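+ /* kyber_add3_reduce folds two additions into a single reduction pass:
+  * r[i] = reduce(r[i] + a[i] + b[i]). Assuming each input coefficient is
+  * already reduced below q = 3329, the sum stays under 3q and fits
+  * comfortably in a signed 16-bit lane before the Barrett step. */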
"mls v8.8h, v26.8h, v0.h[0]\n\t" + "st4 {v1.8h, v2.8h, v3.8h, v4.8h}, [%x[r]], #0x40\n\t" + "st4 {v5.8h, v6.8h, v7.8h, v8.8h}, [%x[r]], #0x40\n\t" + "ld4 {v1.8h, v2.8h, v3.8h, v4.8h}, [%x[r]], #0x40\n\t" + "ld4 {v5.8h, v6.8h, v7.8h, v8.8h}, [%x[r]], #0x40\n\t" + "ld4 {v9.8h, v10.8h, v11.8h, v12.8h}, [%x[a]], #0x40\n\t" + "ld4 {v13.8h, v14.8h, v15.8h, v16.8h}, [%x[a]], #0x40\n\t" + "ld4 {v17.8h, v18.8h, v19.8h, v20.8h}, [%x[b]], #0x40\n\t" + "ld4 {v21.8h, v22.8h, v23.8h, v24.8h}, [%x[b]], #0x40\n\t" + "sub %x[r], %x[r], #0x80\n\t" + "add v1.8h, v1.8h, v9.8h\n\t" + "add v2.8h, v2.8h, v10.8h\n\t" + "add v3.8h, v3.8h, v11.8h\n\t" + "add v4.8h, v4.8h, v12.8h\n\t" + "add v5.8h, v5.8h, v13.8h\n\t" + "add v6.8h, v6.8h, v14.8h\n\t" + "add v7.8h, v7.8h, v15.8h\n\t" + "add v8.8h, v8.8h, v16.8h\n\t" + "add v1.8h, v1.8h, v17.8h\n\t" + "add v2.8h, v2.8h, v18.8h\n\t" + "add v3.8h, v3.8h, v19.8h\n\t" + "add v4.8h, v4.8h, v20.8h\n\t" + "add v5.8h, v5.8h, v21.8h\n\t" + "add v6.8h, v6.8h, v22.8h\n\t" + "add v7.8h, v7.8h, v23.8h\n\t" + "add v8.8h, v8.8h, v24.8h\n\t" + "sqdmulh v25.8h, v1.8h, v0.h[2]\n\t" + "sqdmulh v26.8h, v2.8h, v0.h[2]\n\t" + "sshr v25.8h, v25.8h, #11\n\t" + "sshr v26.8h, v26.8h, #11\n\t" + "mls v1.8h, v25.8h, v0.h[0]\n\t" + "mls v2.8h, v26.8h, v0.h[0]\n\t" + "sqdmulh v25.8h, v3.8h, v0.h[2]\n\t" + "sqdmulh v26.8h, v4.8h, v0.h[2]\n\t" + "sshr v25.8h, v25.8h, #11\n\t" + "sshr v26.8h, v26.8h, #11\n\t" + "mls v3.8h, v25.8h, v0.h[0]\n\t" + "mls v4.8h, v26.8h, v0.h[0]\n\t" + "sqdmulh v25.8h, v5.8h, v0.h[2]\n\t" + "sqdmulh v26.8h, v6.8h, v0.h[2]\n\t" + "sshr v25.8h, v25.8h, #11\n\t" + "sshr v26.8h, v26.8h, #11\n\t" + "mls v5.8h, v25.8h, v0.h[0]\n\t" + "mls v6.8h, v26.8h, v0.h[0]\n\t" + "sqdmulh v25.8h, v7.8h, v0.h[2]\n\t" + "sqdmulh v26.8h, v8.8h, v0.h[2]\n\t" + "sshr v25.8h, v25.8h, #11\n\t" + "sshr v26.8h, v26.8h, #11\n\t" + "mls v7.8h, v25.8h, v0.h[0]\n\t" + "mls v8.8h, v26.8h, v0.h[0]\n\t" + "st4 {v1.8h, v2.8h, v3.8h, v4.8h}, [%x[r]], #0x40\n\t" + "st4 {v5.8h, v6.8h, v7.8h, v8.8h}, [%x[r]], #0x40\n\t" + "ld4 {v1.8h, v2.8h, v3.8h, v4.8h}, [%x[r]], #0x40\n\t" + "ld4 {v5.8h, v6.8h, v7.8h, v8.8h}, [%x[r]], #0x40\n\t" + "ld4 {v9.8h, v10.8h, v11.8h, v12.8h}, [%x[a]], #0x40\n\t" + "ld4 {v13.8h, v14.8h, v15.8h, v16.8h}, [%x[a]], #0x40\n\t" + "ld4 {v17.8h, v18.8h, v19.8h, v20.8h}, [%x[b]], #0x40\n\t" + "ld4 {v21.8h, v22.8h, v23.8h, v24.8h}, [%x[b]], #0x40\n\t" + "sub %x[r], %x[r], #0x80\n\t" + "add v1.8h, v1.8h, v9.8h\n\t" + "add v2.8h, v2.8h, v10.8h\n\t" + "add v3.8h, v3.8h, v11.8h\n\t" + "add v4.8h, v4.8h, v12.8h\n\t" + "add v5.8h, v5.8h, v13.8h\n\t" + "add v6.8h, v6.8h, v14.8h\n\t" + "add v7.8h, v7.8h, v15.8h\n\t" + "add v8.8h, v8.8h, v16.8h\n\t" + "add v1.8h, v1.8h, v17.8h\n\t" + "add v2.8h, v2.8h, v18.8h\n\t" + "add v3.8h, v3.8h, v19.8h\n\t" + "add v4.8h, v4.8h, v20.8h\n\t" + "add v5.8h, v5.8h, v21.8h\n\t" + "add v6.8h, v6.8h, v22.8h\n\t" + "add v7.8h, v7.8h, v23.8h\n\t" + "add v8.8h, v8.8h, v24.8h\n\t" + "sqdmulh v25.8h, v1.8h, v0.h[2]\n\t" + "sqdmulh v26.8h, v2.8h, v0.h[2]\n\t" + "sshr v25.8h, v25.8h, #11\n\t" + "sshr v26.8h, v26.8h, #11\n\t" + "mls v1.8h, v25.8h, v0.h[0]\n\t" + "mls v2.8h, v26.8h, v0.h[0]\n\t" + "sqdmulh v25.8h, v3.8h, v0.h[2]\n\t" + "sqdmulh v26.8h, v4.8h, v0.h[2]\n\t" + "sshr v25.8h, v25.8h, #11\n\t" + "sshr v26.8h, v26.8h, #11\n\t" + "mls v3.8h, v25.8h, v0.h[0]\n\t" + "mls v4.8h, v26.8h, v0.h[0]\n\t" + "sqdmulh v25.8h, v5.8h, v0.h[2]\n\t" + "sqdmulh v26.8h, v6.8h, v0.h[2]\n\t" + "sshr v25.8h, v25.8h, #11\n\t" + "sshr v26.8h, v26.8h, #11\n\t" + "mls v5.8h, v25.8h, 
v0.h[0]\n\t" + "mls v6.8h, v26.8h, v0.h[0]\n\t" + "sqdmulh v25.8h, v7.8h, v0.h[2]\n\t" + "sqdmulh v26.8h, v8.8h, v0.h[2]\n\t" + "sshr v25.8h, v25.8h, #11\n\t" + "sshr v26.8h, v26.8h, #11\n\t" + "mls v7.8h, v25.8h, v0.h[0]\n\t" + "mls v8.8h, v26.8h, v0.h[0]\n\t" + "st4 {v1.8h, v2.8h, v3.8h, v4.8h}, [%x[r]], #0x40\n\t" + "st4 {v5.8h, v6.8h, v7.8h, v8.8h}, [%x[r]], #0x40\n\t" + : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) + : [L_kyber_aarch64_q] "S" (L_kyber_aarch64_q), [L_kyber_aarch64_consts] "S" (L_kyber_aarch64_consts), [L_sha3_aarch64_r] "S" (L_sha3_aarch64_r), [L_kyber_aarch64_zetas] "S" (L_kyber_aarch64_zetas), [L_kyber_aarch64_zetas_qinv] "S" (L_kyber_aarch64_zetas_qinv), [L_kyber_aarch64_zetas_inv] "S" (L_kyber_aarch64_zetas_inv), [L_kyber_aarch64_zetas_inv_qinv] "S" (L_kyber_aarch64_zetas_inv_qinv), [L_kyber_aarch64_zetas_mul] "S" (L_kyber_aarch64_zetas_mul) + : "memory", "x3", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "cc" + ); +} + +void kyber_rsub_reduce(sword16* r, const sword16* a) +{ + __asm__ __volatile__ ( +#ifndef __APPLE__ + "adrp x2, %[L_kyber_aarch64_consts]\n\t" + "add x2, x2, :lo12:%[L_kyber_aarch64_consts]\n\t" +#else + "adrp x2, %[L_kyber_aarch64_consts]@PAGE\n\t" + "add x2, x2, %[L_kyber_aarch64_consts]@PAGEOFF\n\t" +#endif /* __APPLE__ */ + "ldr q0, [x2]\n\t" + "ld4 {v1.8h, v2.8h, v3.8h, v4.8h}, [%x[r]], #0x40\n\t" + "ld4 {v5.8h, v6.8h, v7.8h, v8.8h}, [%x[r]], #0x40\n\t" + "ld4 {v9.8h, v10.8h, v11.8h, v12.8h}, [%x[a]], #0x40\n\t" + "ld4 {v13.8h, v14.8h, v15.8h, v16.8h}, [%x[a]], #0x40\n\t" + "sub %x[r], %x[r], #0x80\n\t" + "sub v1.8h, v9.8h, v1.8h\n\t" + "sub v2.8h, v10.8h, v2.8h\n\t" + "sub v3.8h, v11.8h, v3.8h\n\t" + "sub v4.8h, v12.8h, v4.8h\n\t" + "sub v5.8h, v13.8h, v5.8h\n\t" + "sub v6.8h, v14.8h, v6.8h\n\t" + "sub v7.8h, v15.8h, v7.8h\n\t" + "sub v8.8h, v16.8h, v8.8h\n\t" + "sqdmulh v17.8h, v1.8h, v0.h[2]\n\t" + "sqdmulh v18.8h, v2.8h, v0.h[2]\n\t" + "sshr v17.8h, v17.8h, #11\n\t" + "sshr v18.8h, v18.8h, #11\n\t" + "mls v1.8h, v17.8h, v0.h[0]\n\t" + "mls v2.8h, v18.8h, v0.h[0]\n\t" + "sqdmulh v17.8h, v3.8h, v0.h[2]\n\t" + "sqdmulh v18.8h, v4.8h, v0.h[2]\n\t" + "sshr v17.8h, v17.8h, #11\n\t" + "sshr v18.8h, v18.8h, #11\n\t" + "mls v3.8h, v17.8h, v0.h[0]\n\t" + "mls v4.8h, v18.8h, v0.h[0]\n\t" + "sqdmulh v17.8h, v5.8h, v0.h[2]\n\t" + "sqdmulh v18.8h, v6.8h, v0.h[2]\n\t" + "sshr v17.8h, v17.8h, #11\n\t" + "sshr v18.8h, v18.8h, #11\n\t" + "mls v5.8h, v17.8h, v0.h[0]\n\t" + "mls v6.8h, v18.8h, v0.h[0]\n\t" + "sqdmulh v17.8h, v7.8h, v0.h[2]\n\t" + "sqdmulh v18.8h, v8.8h, v0.h[2]\n\t" + "sshr v17.8h, v17.8h, #11\n\t" + "sshr v18.8h, v18.8h, #11\n\t" + "mls v7.8h, v17.8h, v0.h[0]\n\t" + "mls v8.8h, v18.8h, v0.h[0]\n\t" + "st4 {v1.8h, v2.8h, v3.8h, v4.8h}, [%x[r]], #0x40\n\t" + "st4 {v5.8h, v6.8h, v7.8h, v8.8h}, [%x[r]], #0x40\n\t" + "ld4 {v1.8h, v2.8h, v3.8h, v4.8h}, [%x[r]], #0x40\n\t" + "ld4 {v5.8h, v6.8h, v7.8h, v8.8h}, [%x[r]], #0x40\n\t" + "ld4 {v9.8h, v10.8h, v11.8h, v12.8h}, [%x[a]], #0x40\n\t" + "ld4 {v13.8h, v14.8h, v15.8h, v16.8h}, [%x[a]], #0x40\n\t" + "sub %x[r], %x[r], #0x80\n\t" + "sub v1.8h, v9.8h, v1.8h\n\t" + "sub v2.8h, v10.8h, v2.8h\n\t" + "sub v3.8h, v11.8h, v3.8h\n\t" + "sub v4.8h, v12.8h, v4.8h\n\t" + "sub v5.8h, v13.8h, v5.8h\n\t" + "sub v6.8h, v14.8h, v6.8h\n\t" + "sub v7.8h, v15.8h, v7.8h\n\t" + "sub v8.8h, v16.8h, v8.8h\n\t" + "sqdmulh v17.8h, v1.8h, v0.h[2]\n\t" + "sqdmulh v18.8h, v2.8h, v0.h[2]\n\t" + 
"sshr v17.8h, v17.8h, #11\n\t" + "sshr v18.8h, v18.8h, #11\n\t" + "mls v1.8h, v17.8h, v0.h[0]\n\t" + "mls v2.8h, v18.8h, v0.h[0]\n\t" + "sqdmulh v17.8h, v3.8h, v0.h[2]\n\t" + "sqdmulh v18.8h, v4.8h, v0.h[2]\n\t" + "sshr v17.8h, v17.8h, #11\n\t" + "sshr v18.8h, v18.8h, #11\n\t" + "mls v3.8h, v17.8h, v0.h[0]\n\t" + "mls v4.8h, v18.8h, v0.h[0]\n\t" + "sqdmulh v17.8h, v5.8h, v0.h[2]\n\t" + "sqdmulh v18.8h, v6.8h, v0.h[2]\n\t" + "sshr v17.8h, v17.8h, #11\n\t" + "sshr v18.8h, v18.8h, #11\n\t" + "mls v5.8h, v17.8h, v0.h[0]\n\t" + "mls v6.8h, v18.8h, v0.h[0]\n\t" + "sqdmulh v17.8h, v7.8h, v0.h[2]\n\t" + "sqdmulh v18.8h, v8.8h, v0.h[2]\n\t" + "sshr v17.8h, v17.8h, #11\n\t" + "sshr v18.8h, v18.8h, #11\n\t" + "mls v7.8h, v17.8h, v0.h[0]\n\t" + "mls v8.8h, v18.8h, v0.h[0]\n\t" + "st4 {v1.8h, v2.8h, v3.8h, v4.8h}, [%x[r]], #0x40\n\t" + "st4 {v5.8h, v6.8h, v7.8h, v8.8h}, [%x[r]], #0x40\n\t" + "ld4 {v1.8h, v2.8h, v3.8h, v4.8h}, [%x[r]], #0x40\n\t" + "ld4 {v5.8h, v6.8h, v7.8h, v8.8h}, [%x[r]], #0x40\n\t" + "ld4 {v9.8h, v10.8h, v11.8h, v12.8h}, [%x[a]], #0x40\n\t" + "ld4 {v13.8h, v14.8h, v15.8h, v16.8h}, [%x[a]], #0x40\n\t" + "sub %x[r], %x[r], #0x80\n\t" + "sub v1.8h, v9.8h, v1.8h\n\t" + "sub v2.8h, v10.8h, v2.8h\n\t" + "sub v3.8h, v11.8h, v3.8h\n\t" + "sub v4.8h, v12.8h, v4.8h\n\t" + "sub v5.8h, v13.8h, v5.8h\n\t" + "sub v6.8h, v14.8h, v6.8h\n\t" + "sub v7.8h, v15.8h, v7.8h\n\t" + "sub v8.8h, v16.8h, v8.8h\n\t" + "sqdmulh v17.8h, v1.8h, v0.h[2]\n\t" + "sqdmulh v18.8h, v2.8h, v0.h[2]\n\t" + "sshr v17.8h, v17.8h, #11\n\t" + "sshr v18.8h, v18.8h, #11\n\t" + "mls v1.8h, v17.8h, v0.h[0]\n\t" + "mls v2.8h, v18.8h, v0.h[0]\n\t" + "sqdmulh v17.8h, v3.8h, v0.h[2]\n\t" + "sqdmulh v18.8h, v4.8h, v0.h[2]\n\t" + "sshr v17.8h, v17.8h, #11\n\t" + "sshr v18.8h, v18.8h, #11\n\t" + "mls v3.8h, v17.8h, v0.h[0]\n\t" + "mls v4.8h, v18.8h, v0.h[0]\n\t" + "sqdmulh v17.8h, v5.8h, v0.h[2]\n\t" + "sqdmulh v18.8h, v6.8h, v0.h[2]\n\t" + "sshr v17.8h, v17.8h, #11\n\t" + "sshr v18.8h, v18.8h, #11\n\t" + "mls v5.8h, v17.8h, v0.h[0]\n\t" + "mls v6.8h, v18.8h, v0.h[0]\n\t" + "sqdmulh v17.8h, v7.8h, v0.h[2]\n\t" + "sqdmulh v18.8h, v8.8h, v0.h[2]\n\t" + "sshr v17.8h, v17.8h, #11\n\t" + "sshr v18.8h, v18.8h, #11\n\t" + "mls v7.8h, v17.8h, v0.h[0]\n\t" + "mls v8.8h, v18.8h, v0.h[0]\n\t" + "st4 {v1.8h, v2.8h, v3.8h, v4.8h}, [%x[r]], #0x40\n\t" + "st4 {v5.8h, v6.8h, v7.8h, v8.8h}, [%x[r]], #0x40\n\t" + "ld4 {v1.8h, v2.8h, v3.8h, v4.8h}, [%x[r]], #0x40\n\t" + "ld4 {v5.8h, v6.8h, v7.8h, v8.8h}, [%x[r]], #0x40\n\t" + "ld4 {v9.8h, v10.8h, v11.8h, v12.8h}, [%x[a]], #0x40\n\t" + "ld4 {v13.8h, v14.8h, v15.8h, v16.8h}, [%x[a]], #0x40\n\t" + "sub %x[r], %x[r], #0x80\n\t" + "sub v1.8h, v9.8h, v1.8h\n\t" + "sub v2.8h, v10.8h, v2.8h\n\t" + "sub v3.8h, v11.8h, v3.8h\n\t" + "sub v4.8h, v12.8h, v4.8h\n\t" + "sub v5.8h, v13.8h, v5.8h\n\t" + "sub v6.8h, v14.8h, v6.8h\n\t" + "sub v7.8h, v15.8h, v7.8h\n\t" + "sub v8.8h, v16.8h, v8.8h\n\t" + "sqdmulh v17.8h, v1.8h, v0.h[2]\n\t" + "sqdmulh v18.8h, v2.8h, v0.h[2]\n\t" + "sshr v17.8h, v17.8h, #11\n\t" + "sshr v18.8h, v18.8h, #11\n\t" + "mls v1.8h, v17.8h, v0.h[0]\n\t" + "mls v2.8h, v18.8h, v0.h[0]\n\t" + "sqdmulh v17.8h, v3.8h, v0.h[2]\n\t" + "sqdmulh v18.8h, v4.8h, v0.h[2]\n\t" + "sshr v17.8h, v17.8h, #11\n\t" + "sshr v18.8h, v18.8h, #11\n\t" + "mls v3.8h, v17.8h, v0.h[0]\n\t" + "mls v4.8h, v18.8h, v0.h[0]\n\t" + "sqdmulh v17.8h, v5.8h, v0.h[2]\n\t" + "sqdmulh v18.8h, v6.8h, v0.h[2]\n\t" + "sshr v17.8h, v17.8h, #11\n\t" + "sshr v18.8h, v18.8h, #11\n\t" + "mls v5.8h, v17.8h, v0.h[0]\n\t" + "mls v6.8h, v18.8h, 
v0.h[0]\n\t" + "sqdmulh v17.8h, v7.8h, v0.h[2]\n\t" + "sqdmulh v18.8h, v8.8h, v0.h[2]\n\t" + "sshr v17.8h, v17.8h, #11\n\t" + "sshr v18.8h, v18.8h, #11\n\t" + "mls v7.8h, v17.8h, v0.h[0]\n\t" + "mls v8.8h, v18.8h, v0.h[0]\n\t" + "st4 {v1.8h, v2.8h, v3.8h, v4.8h}, [%x[r]], #0x40\n\t" + "st4 {v5.8h, v6.8h, v7.8h, v8.8h}, [%x[r]], #0x40\n\t" + : [r] "+r" (r), [a] "+r" (a) + : [L_kyber_aarch64_q] "S" (L_kyber_aarch64_q), [L_kyber_aarch64_consts] "S" (L_kyber_aarch64_consts), [L_sha3_aarch64_r] "S" (L_sha3_aarch64_r), [L_kyber_aarch64_zetas] "S" (L_kyber_aarch64_zetas), [L_kyber_aarch64_zetas_qinv] "S" (L_kyber_aarch64_zetas_qinv), [L_kyber_aarch64_zetas_inv] "S" (L_kyber_aarch64_zetas_inv), [L_kyber_aarch64_zetas_inv_qinv] "S" (L_kyber_aarch64_zetas_inv_qinv), [L_kyber_aarch64_zetas_mul] "S" (L_kyber_aarch64_zetas_mul) + : "memory", "x2", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "cc" + ); +} + +void kyber_to_mont(sword16* p) +{ + __asm__ __volatile__ ( +#ifndef __APPLE__ + "adrp x1, %[L_kyber_aarch64_consts]\n\t" + "add x1, x1, :lo12:%[L_kyber_aarch64_consts]\n\t" +#else + "adrp x1, %[L_kyber_aarch64_consts]@PAGE\n\t" + "add x1, x1, %[L_kyber_aarch64_consts]@PAGEOFF\n\t" +#endif /* __APPLE__ */ + "ldr q0, [x1]\n\t" + "ld4 {v1.8h, v2.8h, v3.8h, v4.8h}, [%x[p]], #0x40\n\t" + "ld4 {v5.8h, v6.8h, v7.8h, v8.8h}, [%x[p]], #0x40\n\t" + "ld4 {v9.8h, v10.8h, v11.8h, v12.8h}, [%x[p]], #0x40\n\t" + "ld4 {v13.8h, v14.8h, v15.8h, v16.8h}, [%x[p]], #0x40\n\t" + "sub %x[p], %x[p], #0x100\n\t" + "mul v17.8h, v1.8h, v0.h[4]\n\t" + "mul v18.8h, v2.8h, v0.h[4]\n\t" + "sqrdmulh v1.8h, v1.8h, v0.h[3]\n\t" + "sqrdmulh v2.8h, v2.8h, v0.h[3]\n\t" +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + "sqrdmlsh v1.8h, v17.8h, v0.h[0]\n\t" + "sqrdmlsh v2.8h, v18.8h, v0.h[0]\n\t" +#else + "sqrdmulh v17.8h, v17.8h, v0.h[0]\n\t" + "sqrdmulh v18.8h, v18.8h, v0.h[0]\n\t" + "sub v1.8h, v1.8h, v17.8h\n\t" + "sub v2.8h, v2.8h, v18.8h\n\t" +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + "sshr v1.8h, v1.8h, #1\n\t" + "sshr v2.8h, v2.8h, #1\n\t" + "mul v17.8h, v3.8h, v0.h[4]\n\t" + "mul v18.8h, v4.8h, v0.h[4]\n\t" + "sqrdmulh v3.8h, v3.8h, v0.h[3]\n\t" + "sqrdmulh v4.8h, v4.8h, v0.h[3]\n\t" +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + "sqrdmlsh v3.8h, v17.8h, v0.h[0]\n\t" + "sqrdmlsh v4.8h, v18.8h, v0.h[0]\n\t" +#else + "sqrdmulh v17.8h, v17.8h, v0.h[0]\n\t" + "sqrdmulh v18.8h, v18.8h, v0.h[0]\n\t" + "sub v3.8h, v3.8h, v17.8h\n\t" + "sub v4.8h, v4.8h, v18.8h\n\t" +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + "sshr v3.8h, v3.8h, #1\n\t" + "sshr v4.8h, v4.8h, #1\n\t" + "mul v17.8h, v5.8h, v0.h[4]\n\t" + "mul v18.8h, v6.8h, v0.h[4]\n\t" + "sqrdmulh v5.8h, v5.8h, v0.h[3]\n\t" + "sqrdmulh v6.8h, v6.8h, v0.h[3]\n\t" +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + "sqrdmlsh v5.8h, v17.8h, v0.h[0]\n\t" + "sqrdmlsh v6.8h, v18.8h, v0.h[0]\n\t" +#else + "sqrdmulh v17.8h, v17.8h, v0.h[0]\n\t" + "sqrdmulh v18.8h, v18.8h, v0.h[0]\n\t" + "sub v5.8h, v5.8h, v17.8h\n\t" + "sub v6.8h, v6.8h, v18.8h\n\t" +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + "sshr v5.8h, v5.8h, #1\n\t" + "sshr v6.8h, v6.8h, #1\n\t" + "mul v17.8h, v7.8h, v0.h[4]\n\t" + "mul v18.8h, v8.8h, v0.h[4]\n\t" + "sqrdmulh v7.8h, v7.8h, v0.h[3]\n\t" + "sqrdmulh v8.8h, v8.8h, v0.h[3]\n\t" +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + "sqrdmlsh v7.8h, v17.8h, v0.h[0]\n\t" + "sqrdmlsh v8.8h, v18.8h, v0.h[0]\n\t" +#else + "sqrdmulh v17.8h, v17.8h, v0.h[0]\n\t" + "sqrdmulh v18.8h, v18.8h, v0.h[0]\n\t" + "sub v7.8h, v7.8h, v17.8h\n\t" + "sub v8.8h, v8.8h, 
v18.8h\n\t" +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + "sshr v7.8h, v7.8h, #1\n\t" + "sshr v8.8h, v8.8h, #1\n\t" + "mul v17.8h, v9.8h, v0.h[4]\n\t" + "mul v18.8h, v10.8h, v0.h[4]\n\t" + "sqrdmulh v9.8h, v9.8h, v0.h[3]\n\t" + "sqrdmulh v10.8h, v10.8h, v0.h[3]\n\t" +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + "sqrdmlsh v9.8h, v17.8h, v0.h[0]\n\t" + "sqrdmlsh v10.8h, v18.8h, v0.h[0]\n\t" +#else + "sqrdmulh v17.8h, v17.8h, v0.h[0]\n\t" + "sqrdmulh v18.8h, v18.8h, v0.h[0]\n\t" + "sub v9.8h, v9.8h, v17.8h\n\t" + "sub v10.8h, v10.8h, v18.8h\n\t" +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + "sshr v9.8h, v9.8h, #1\n\t" + "sshr v10.8h, v10.8h, #1\n\t" + "mul v17.8h, v11.8h, v0.h[4]\n\t" + "mul v18.8h, v12.8h, v0.h[4]\n\t" + "sqrdmulh v11.8h, v11.8h, v0.h[3]\n\t" + "sqrdmulh v12.8h, v12.8h, v0.h[3]\n\t" +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + "sqrdmlsh v11.8h, v17.8h, v0.h[0]\n\t" + "sqrdmlsh v12.8h, v18.8h, v0.h[0]\n\t" +#else + "sqrdmulh v17.8h, v17.8h, v0.h[0]\n\t" + "sqrdmulh v18.8h, v18.8h, v0.h[0]\n\t" + "sub v11.8h, v11.8h, v17.8h\n\t" + "sub v12.8h, v12.8h, v18.8h\n\t" +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + "sshr v11.8h, v11.8h, #1\n\t" + "sshr v12.8h, v12.8h, #1\n\t" + "mul v17.8h, v13.8h, v0.h[4]\n\t" + "mul v18.8h, v14.8h, v0.h[4]\n\t" + "sqrdmulh v13.8h, v13.8h, v0.h[3]\n\t" + "sqrdmulh v14.8h, v14.8h, v0.h[3]\n\t" +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + "sqrdmlsh v13.8h, v17.8h, v0.h[0]\n\t" + "sqrdmlsh v14.8h, v18.8h, v0.h[0]\n\t" +#else + "sqrdmulh v17.8h, v17.8h, v0.h[0]\n\t" + "sqrdmulh v18.8h, v18.8h, v0.h[0]\n\t" + "sub v13.8h, v13.8h, v17.8h\n\t" + "sub v14.8h, v14.8h, v18.8h\n\t" +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + "sshr v13.8h, v13.8h, #1\n\t" + "sshr v14.8h, v14.8h, #1\n\t" + "mul v17.8h, v15.8h, v0.h[4]\n\t" + "mul v18.8h, v16.8h, v0.h[4]\n\t" + "sqrdmulh v15.8h, v15.8h, v0.h[3]\n\t" + "sqrdmulh v16.8h, v16.8h, v0.h[3]\n\t" +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + "sqrdmlsh v15.8h, v17.8h, v0.h[0]\n\t" + "sqrdmlsh v16.8h, v18.8h, v0.h[0]\n\t" +#else + "sqrdmulh v17.8h, v17.8h, v0.h[0]\n\t" + "sqrdmulh v18.8h, v18.8h, v0.h[0]\n\t" + "sub v15.8h, v15.8h, v17.8h\n\t" + "sub v16.8h, v16.8h, v18.8h\n\t" +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + "sshr v15.8h, v15.8h, #1\n\t" + "sshr v16.8h, v16.8h, #1\n\t" + "st4 {v1.8h, v2.8h, v3.8h, v4.8h}, [%x[p]], #0x40\n\t" + "st4 {v5.8h, v6.8h, v7.8h, v8.8h}, [%x[p]], #0x40\n\t" + "st4 {v9.8h, v10.8h, v11.8h, v12.8h}, [%x[p]], #0x40\n\t" + "st4 {v13.8h, v14.8h, v15.8h, v16.8h}, [%x[p]], #0x40\n\t" + "ld4 {v1.8h, v2.8h, v3.8h, v4.8h}, [%x[p]], #0x40\n\t" + "ld4 {v5.8h, v6.8h, v7.8h, v8.8h}, [%x[p]], #0x40\n\t" + "ld4 {v9.8h, v10.8h, v11.8h, v12.8h}, [%x[p]], #0x40\n\t" + "ld4 {v13.8h, v14.8h, v15.8h, v16.8h}, [%x[p]], #0x40\n\t" + "sub %x[p], %x[p], #0x100\n\t" + "mul v17.8h, v1.8h, v0.h[4]\n\t" + "mul v18.8h, v2.8h, v0.h[4]\n\t" + "sqrdmulh v1.8h, v1.8h, v0.h[3]\n\t" + "sqrdmulh v2.8h, v2.8h, v0.h[3]\n\t" +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + "sqrdmlsh v1.8h, v17.8h, v0.h[0]\n\t" + "sqrdmlsh v2.8h, v18.8h, v0.h[0]\n\t" +#else + "sqrdmulh v17.8h, v17.8h, v0.h[0]\n\t" + "sqrdmulh v18.8h, v18.8h, v0.h[0]\n\t" + "sub v1.8h, v1.8h, v17.8h\n\t" + "sub v2.8h, v2.8h, v18.8h\n\t" +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + "sshr v1.8h, v1.8h, #1\n\t" + "sshr v2.8h, v2.8h, #1\n\t" + "mul v17.8h, v3.8h, v0.h[4]\n\t" + "mul v18.8h, v4.8h, v0.h[4]\n\t" + "sqrdmulh v3.8h, v3.8h, v0.h[3]\n\t" + "sqrdmulh v4.8h, v4.8h, v0.h[3]\n\t" +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + "sqrdmlsh v3.8h, v17.8h, v0.h[0]\n\t" + "sqrdmlsh v4.8h, v18.8h, v0.h[0]\n\t" +#else + 
"sqrdmulh v17.8h, v17.8h, v0.h[0]\n\t" + "sqrdmulh v18.8h, v18.8h, v0.h[0]\n\t" + "sub v3.8h, v3.8h, v17.8h\n\t" + "sub v4.8h, v4.8h, v18.8h\n\t" +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + "sshr v3.8h, v3.8h, #1\n\t" + "sshr v4.8h, v4.8h, #1\n\t" + "mul v17.8h, v5.8h, v0.h[4]\n\t" + "mul v18.8h, v6.8h, v0.h[4]\n\t" + "sqrdmulh v5.8h, v5.8h, v0.h[3]\n\t" + "sqrdmulh v6.8h, v6.8h, v0.h[3]\n\t" +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + "sqrdmlsh v5.8h, v17.8h, v0.h[0]\n\t" + "sqrdmlsh v6.8h, v18.8h, v0.h[0]\n\t" +#else + "sqrdmulh v17.8h, v17.8h, v0.h[0]\n\t" + "sqrdmulh v18.8h, v18.8h, v0.h[0]\n\t" + "sub v5.8h, v5.8h, v17.8h\n\t" + "sub v6.8h, v6.8h, v18.8h\n\t" +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + "sshr v5.8h, v5.8h, #1\n\t" + "sshr v6.8h, v6.8h, #1\n\t" + "mul v17.8h, v7.8h, v0.h[4]\n\t" + "mul v18.8h, v8.8h, v0.h[4]\n\t" + "sqrdmulh v7.8h, v7.8h, v0.h[3]\n\t" + "sqrdmulh v8.8h, v8.8h, v0.h[3]\n\t" +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + "sqrdmlsh v7.8h, v17.8h, v0.h[0]\n\t" + "sqrdmlsh v8.8h, v18.8h, v0.h[0]\n\t" +#else + "sqrdmulh v17.8h, v17.8h, v0.h[0]\n\t" + "sqrdmulh v18.8h, v18.8h, v0.h[0]\n\t" + "sub v7.8h, v7.8h, v17.8h\n\t" + "sub v8.8h, v8.8h, v18.8h\n\t" +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + "sshr v7.8h, v7.8h, #1\n\t" + "sshr v8.8h, v8.8h, #1\n\t" + "mul v17.8h, v9.8h, v0.h[4]\n\t" + "mul v18.8h, v10.8h, v0.h[4]\n\t" + "sqrdmulh v9.8h, v9.8h, v0.h[3]\n\t" + "sqrdmulh v10.8h, v10.8h, v0.h[3]\n\t" +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + "sqrdmlsh v9.8h, v17.8h, v0.h[0]\n\t" + "sqrdmlsh v10.8h, v18.8h, v0.h[0]\n\t" +#else + "sqrdmulh v17.8h, v17.8h, v0.h[0]\n\t" + "sqrdmulh v18.8h, v18.8h, v0.h[0]\n\t" + "sub v9.8h, v9.8h, v17.8h\n\t" + "sub v10.8h, v10.8h, v18.8h\n\t" +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + "sshr v9.8h, v9.8h, #1\n\t" + "sshr v10.8h, v10.8h, #1\n\t" + "mul v17.8h, v11.8h, v0.h[4]\n\t" + "mul v18.8h, v12.8h, v0.h[4]\n\t" + "sqrdmulh v11.8h, v11.8h, v0.h[3]\n\t" + "sqrdmulh v12.8h, v12.8h, v0.h[3]\n\t" +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + "sqrdmlsh v11.8h, v17.8h, v0.h[0]\n\t" + "sqrdmlsh v12.8h, v18.8h, v0.h[0]\n\t" +#else + "sqrdmulh v17.8h, v17.8h, v0.h[0]\n\t" + "sqrdmulh v18.8h, v18.8h, v0.h[0]\n\t" + "sub v11.8h, v11.8h, v17.8h\n\t" + "sub v12.8h, v12.8h, v18.8h\n\t" +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + "sshr v11.8h, v11.8h, #1\n\t" + "sshr v12.8h, v12.8h, #1\n\t" + "mul v17.8h, v13.8h, v0.h[4]\n\t" + "mul v18.8h, v14.8h, v0.h[4]\n\t" + "sqrdmulh v13.8h, v13.8h, v0.h[3]\n\t" + "sqrdmulh v14.8h, v14.8h, v0.h[3]\n\t" +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + "sqrdmlsh v13.8h, v17.8h, v0.h[0]\n\t" + "sqrdmlsh v14.8h, v18.8h, v0.h[0]\n\t" +#else + "sqrdmulh v17.8h, v17.8h, v0.h[0]\n\t" + "sqrdmulh v18.8h, v18.8h, v0.h[0]\n\t" + "sub v13.8h, v13.8h, v17.8h\n\t" + "sub v14.8h, v14.8h, v18.8h\n\t" +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + "sshr v13.8h, v13.8h, #1\n\t" + "sshr v14.8h, v14.8h, #1\n\t" + "mul v17.8h, v15.8h, v0.h[4]\n\t" + "mul v18.8h, v16.8h, v0.h[4]\n\t" + "sqrdmulh v15.8h, v15.8h, v0.h[3]\n\t" + "sqrdmulh v16.8h, v16.8h, v0.h[3]\n\t" +#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + "sqrdmlsh v15.8h, v17.8h, v0.h[0]\n\t" + "sqrdmlsh v16.8h, v18.8h, v0.h[0]\n\t" +#else + "sqrdmulh v17.8h, v17.8h, v0.h[0]\n\t" + "sqrdmulh v18.8h, v18.8h, v0.h[0]\n\t" + "sub v15.8h, v15.8h, v17.8h\n\t" + "sub v16.8h, v16.8h, v18.8h\n\t" +#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ + "sshr v15.8h, v15.8h, #1\n\t" + "sshr v16.8h, v16.8h, #1\n\t" + "st4 {v1.8h, v2.8h, v3.8h, v4.8h}, [%x[p]], #0x40\n\t" + "st4 {v5.8h, v6.8h, v7.8h, v8.8h}, [%x[p]], #0x40\n\t" + "st4 
{v9.8h, v10.8h, v11.8h, v12.8h}, [%x[p]], #0x40\n\t" + "st4 {v13.8h, v14.8h, v15.8h, v16.8h}, [%x[p]], #0x40\n\t" + : [p] "+r" (p) + : [L_kyber_aarch64_q] "S" (L_kyber_aarch64_q), [L_kyber_aarch64_consts] "S" (L_kyber_aarch64_consts), [L_sha3_aarch64_r] "S" (L_sha3_aarch64_r), [L_kyber_aarch64_zetas] "S" (L_kyber_aarch64_zetas), [L_kyber_aarch64_zetas_qinv] "S" (L_kyber_aarch64_zetas_qinv), [L_kyber_aarch64_zetas_inv] "S" (L_kyber_aarch64_zetas_inv), [L_kyber_aarch64_zetas_inv_qinv] "S" (L_kyber_aarch64_zetas_inv_qinv), [L_kyber_aarch64_zetas_mul] "S" (L_kyber_aarch64_zetas_mul) + : "memory", "x1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "cc" + ); +} + +static const uint16_t L_kyber_aarch64_to_msg_neon_low[] = { + 0x341, + 0x341, + 0x341, + 0x341, + 0x341, + 0x341, + 0x341, + 0x341, +}; + +static const uint16_t L_kyber_aarch64_to_msg_neon_high[] = { + 0x9c0, + 0x9c0, + 0x9c0, + 0x9c0, + 0x9c0, + 0x9c0, + 0x9c0, + 0x9c0, +}; + +static const uint16_t L_kyber_aarch64_to_msg_neon_bits[] = { + 0x1, + 0x2, + 0x4, + 0x8, + 0x10, + 0x20, + 0x40, + 0x80, +}; + +void kyber_to_msg_neon(byte* msg, sword16* p) +{ + __asm__ __volatile__ ( +#ifndef __APPLE__ + "adrp x2, %[L_kyber_aarch64_to_msg_neon_low]\n\t" + "add x2, x2, :lo12:%[L_kyber_aarch64_to_msg_neon_low]\n\t" +#else + "adrp x2, %[L_kyber_aarch64_to_msg_neon_low]@PAGE\n\t" + "add x2, x2, %[L_kyber_aarch64_to_msg_neon_low]@PAGEOFF\n\t" +#endif /* __APPLE__ */ +#ifndef __APPLE__ + "adrp x3, %[L_kyber_aarch64_to_msg_neon_high]\n\t" + "add x3, x3, :lo12:%[L_kyber_aarch64_to_msg_neon_high]\n\t" +#else + "adrp x3, %[L_kyber_aarch64_to_msg_neon_high]@PAGE\n\t" + "add x3, x3, %[L_kyber_aarch64_to_msg_neon_high]@PAGEOFF\n\t" +#endif /* __APPLE__ */ +#ifndef __APPLE__ + "adrp x4, %[L_kyber_aarch64_to_msg_neon_bits]\n\t" + "add x4, x4, :lo12:%[L_kyber_aarch64_to_msg_neon_bits]\n\t" +#else + "adrp x4, %[L_kyber_aarch64_to_msg_neon_bits]@PAGE\n\t" + "add x4, x4, %[L_kyber_aarch64_to_msg_neon_bits]@PAGEOFF\n\t" +#endif /* __APPLE__ */ + "ldr q0, [x2]\n\t" + "ldr q1, [x3]\n\t" + "ldr q26, [x4]\n\t" + "ld1 {v2.8h, v3.8h, v4.8h, v5.8h}, [%x[p]], #0x40\n\t" + "ld1 {v6.8h, v7.8h, v8.8h, v9.8h}, [%x[p]], #0x40\n\t" + "cmge v10.8h, v2.8h, v0.8h\n\t" + "cmge v18.8h, v1.8h, v2.8h\n\t" + "cmge v11.8h, v3.8h, v0.8h\n\t" + "cmge v19.8h, v1.8h, v3.8h\n\t" + "cmge v12.8h, v4.8h, v0.8h\n\t" + "cmge v20.8h, v1.8h, v4.8h\n\t" + "cmge v13.8h, v5.8h, v0.8h\n\t" + "cmge v21.8h, v1.8h, v5.8h\n\t" + "cmge v14.8h, v6.8h, v0.8h\n\t" + "cmge v22.8h, v1.8h, v6.8h\n\t" + "cmge v15.8h, v7.8h, v0.8h\n\t" + "cmge v23.8h, v1.8h, v7.8h\n\t" + "cmge v16.8h, v8.8h, v0.8h\n\t" + "cmge v24.8h, v1.8h, v8.8h\n\t" + "cmge v17.8h, v9.8h, v0.8h\n\t" + "cmge v25.8h, v1.8h, v9.8h\n\t" + "and v18.16b, v18.16b, v10.16b\n\t" + "and v19.16b, v19.16b, v11.16b\n\t" + "and v20.16b, v20.16b, v12.16b\n\t" + "and v21.16b, v21.16b, v13.16b\n\t" + "and v22.16b, v22.16b, v14.16b\n\t" + "and v23.16b, v23.16b, v15.16b\n\t" + "and v24.16b, v24.16b, v16.16b\n\t" + "and v25.16b, v25.16b, v17.16b\n\t" + "and v18.16b, v18.16b, v26.16b\n\t" + "and v19.16b, v19.16b, v26.16b\n\t" + "and v20.16b, v20.16b, v26.16b\n\t" + "and v21.16b, v21.16b, v26.16b\n\t" + "and v22.16b, v22.16b, v26.16b\n\t" + "and v23.16b, v23.16b, v26.16b\n\t" + "and v24.16b, v24.16b, v26.16b\n\t" + "and v25.16b, v25.16b, v26.16b\n\t" + "addv h18, v18.8h\n\t" + "addv h19, v19.8h\n\t" + "addv h20, v20.8h\n\t" + "addv h21, v21.8h\n\t" + "addv h22, v22.8h\n\t" + "addv h23, 
v23.8h\n\t" + "addv h24, v24.8h\n\t" + "addv h25, v25.8h\n\t" + "ins v18.b[1], v19.b[0]\n\t" + "ins v18.b[2], v20.b[0]\n\t" + "ins v18.b[3], v21.b[0]\n\t" + "ins v18.b[4], v22.b[0]\n\t" + "ins v18.b[5], v23.b[0]\n\t" + "ins v18.b[6], v24.b[0]\n\t" + "ins v18.b[7], v25.b[0]\n\t" + "st1 {v18.8b}, [%x[msg]], #8\n\t" + "ld1 {v2.8h, v3.8h, v4.8h, v5.8h}, [%x[p]], #0x40\n\t" + "ld1 {v6.8h, v7.8h, v8.8h, v9.8h}, [%x[p]], #0x40\n\t" + "cmge v10.8h, v2.8h, v0.8h\n\t" + "cmge v18.8h, v1.8h, v2.8h\n\t" + "cmge v11.8h, v3.8h, v0.8h\n\t" + "cmge v19.8h, v1.8h, v3.8h\n\t" + "cmge v12.8h, v4.8h, v0.8h\n\t" + "cmge v20.8h, v1.8h, v4.8h\n\t" + "cmge v13.8h, v5.8h, v0.8h\n\t" + "cmge v21.8h, v1.8h, v5.8h\n\t" + "cmge v14.8h, v6.8h, v0.8h\n\t" + "cmge v22.8h, v1.8h, v6.8h\n\t" + "cmge v15.8h, v7.8h, v0.8h\n\t" + "cmge v23.8h, v1.8h, v7.8h\n\t" + "cmge v16.8h, v8.8h, v0.8h\n\t" + "cmge v24.8h, v1.8h, v8.8h\n\t" + "cmge v17.8h, v9.8h, v0.8h\n\t" + "cmge v25.8h, v1.8h, v9.8h\n\t" + "and v18.16b, v18.16b, v10.16b\n\t" + "and v19.16b, v19.16b, v11.16b\n\t" + "and v20.16b, v20.16b, v12.16b\n\t" + "and v21.16b, v21.16b, v13.16b\n\t" + "and v22.16b, v22.16b, v14.16b\n\t" + "and v23.16b, v23.16b, v15.16b\n\t" + "and v24.16b, v24.16b, v16.16b\n\t" + "and v25.16b, v25.16b, v17.16b\n\t" + "and v18.16b, v18.16b, v26.16b\n\t" + "and v19.16b, v19.16b, v26.16b\n\t" + "and v20.16b, v20.16b, v26.16b\n\t" + "and v21.16b, v21.16b, v26.16b\n\t" + "and v22.16b, v22.16b, v26.16b\n\t" + "and v23.16b, v23.16b, v26.16b\n\t" + "and v24.16b, v24.16b, v26.16b\n\t" + "and v25.16b, v25.16b, v26.16b\n\t" + "addv h18, v18.8h\n\t" + "addv h19, v19.8h\n\t" + "addv h20, v20.8h\n\t" + "addv h21, v21.8h\n\t" + "addv h22, v22.8h\n\t" + "addv h23, v23.8h\n\t" + "addv h24, v24.8h\n\t" + "addv h25, v25.8h\n\t" + "ins v18.b[1], v19.b[0]\n\t" + "ins v18.b[2], v20.b[0]\n\t" + "ins v18.b[3], v21.b[0]\n\t" + "ins v18.b[4], v22.b[0]\n\t" + "ins v18.b[5], v23.b[0]\n\t" + "ins v18.b[6], v24.b[0]\n\t" + "ins v18.b[7], v25.b[0]\n\t" + "st1 {v18.8b}, [%x[msg]], #8\n\t" + "ld1 {v2.8h, v3.8h, v4.8h, v5.8h}, [%x[p]], #0x40\n\t" + "ld1 {v6.8h, v7.8h, v8.8h, v9.8h}, [%x[p]], #0x40\n\t" + "cmge v10.8h, v2.8h, v0.8h\n\t" + "cmge v18.8h, v1.8h, v2.8h\n\t" + "cmge v11.8h, v3.8h, v0.8h\n\t" + "cmge v19.8h, v1.8h, v3.8h\n\t" + "cmge v12.8h, v4.8h, v0.8h\n\t" + "cmge v20.8h, v1.8h, v4.8h\n\t" + "cmge v13.8h, v5.8h, v0.8h\n\t" + "cmge v21.8h, v1.8h, v5.8h\n\t" + "cmge v14.8h, v6.8h, v0.8h\n\t" + "cmge v22.8h, v1.8h, v6.8h\n\t" + "cmge v15.8h, v7.8h, v0.8h\n\t" + "cmge v23.8h, v1.8h, v7.8h\n\t" + "cmge v16.8h, v8.8h, v0.8h\n\t" + "cmge v24.8h, v1.8h, v8.8h\n\t" + "cmge v17.8h, v9.8h, v0.8h\n\t" + "cmge v25.8h, v1.8h, v9.8h\n\t" + "and v18.16b, v18.16b, v10.16b\n\t" + "and v19.16b, v19.16b, v11.16b\n\t" + "and v20.16b, v20.16b, v12.16b\n\t" + "and v21.16b, v21.16b, v13.16b\n\t" + "and v22.16b, v22.16b, v14.16b\n\t" + "and v23.16b, v23.16b, v15.16b\n\t" + "and v24.16b, v24.16b, v16.16b\n\t" + "and v25.16b, v25.16b, v17.16b\n\t" + "and v18.16b, v18.16b, v26.16b\n\t" + "and v19.16b, v19.16b, v26.16b\n\t" + "and v20.16b, v20.16b, v26.16b\n\t" + "and v21.16b, v21.16b, v26.16b\n\t" + "and v22.16b, v22.16b, v26.16b\n\t" + "and v23.16b, v23.16b, v26.16b\n\t" + "and v24.16b, v24.16b, v26.16b\n\t" + "and v25.16b, v25.16b, v26.16b\n\t" + "addv h18, v18.8h\n\t" + "addv h19, v19.8h\n\t" + "addv h20, v20.8h\n\t" + "addv h21, v21.8h\n\t" + "addv h22, v22.8h\n\t" + "addv h23, v23.8h\n\t" + "addv h24, v24.8h\n\t" + "addv h25, v25.8h\n\t" + "ins v18.b[1], v19.b[0]\n\t" + "ins v18.b[2], 
v20.b[0]\n\t" + "ins v18.b[3], v21.b[0]\n\t" + "ins v18.b[4], v22.b[0]\n\t" + "ins v18.b[5], v23.b[0]\n\t" + "ins v18.b[6], v24.b[0]\n\t" + "ins v18.b[7], v25.b[0]\n\t" + "st1 {v18.8b}, [%x[msg]], #8\n\t" + "ld1 {v2.8h, v3.8h, v4.8h, v5.8h}, [%x[p]], #0x40\n\t" + "ld1 {v6.8h, v7.8h, v8.8h, v9.8h}, [%x[p]], #0x40\n\t" + "cmge v10.8h, v2.8h, v0.8h\n\t" + "cmge v18.8h, v1.8h, v2.8h\n\t" + "cmge v11.8h, v3.8h, v0.8h\n\t" + "cmge v19.8h, v1.8h, v3.8h\n\t" + "cmge v12.8h, v4.8h, v0.8h\n\t" + "cmge v20.8h, v1.8h, v4.8h\n\t" + "cmge v13.8h, v5.8h, v0.8h\n\t" + "cmge v21.8h, v1.8h, v5.8h\n\t" + "cmge v14.8h, v6.8h, v0.8h\n\t" + "cmge v22.8h, v1.8h, v6.8h\n\t" + "cmge v15.8h, v7.8h, v0.8h\n\t" + "cmge v23.8h, v1.8h, v7.8h\n\t" + "cmge v16.8h, v8.8h, v0.8h\n\t" + "cmge v24.8h, v1.8h, v8.8h\n\t" + "cmge v17.8h, v9.8h, v0.8h\n\t" + "cmge v25.8h, v1.8h, v9.8h\n\t" + "and v18.16b, v18.16b, v10.16b\n\t" + "and v19.16b, v19.16b, v11.16b\n\t" + "and v20.16b, v20.16b, v12.16b\n\t" + "and v21.16b, v21.16b, v13.16b\n\t" + "and v22.16b, v22.16b, v14.16b\n\t" + "and v23.16b, v23.16b, v15.16b\n\t" + "and v24.16b, v24.16b, v16.16b\n\t" + "and v25.16b, v25.16b, v17.16b\n\t" + "and v18.16b, v18.16b, v26.16b\n\t" + "and v19.16b, v19.16b, v26.16b\n\t" + "and v20.16b, v20.16b, v26.16b\n\t" + "and v21.16b, v21.16b, v26.16b\n\t" + "and v22.16b, v22.16b, v26.16b\n\t" + "and v23.16b, v23.16b, v26.16b\n\t" + "and v24.16b, v24.16b, v26.16b\n\t" + "and v25.16b, v25.16b, v26.16b\n\t" + "addv h18, v18.8h\n\t" + "addv h19, v19.8h\n\t" + "addv h20, v20.8h\n\t" + "addv h21, v21.8h\n\t" + "addv h22, v22.8h\n\t" + "addv h23, v23.8h\n\t" + "addv h24, v24.8h\n\t" + "addv h25, v25.8h\n\t" + "ins v18.b[1], v19.b[0]\n\t" + "ins v18.b[2], v20.b[0]\n\t" + "ins v18.b[3], v21.b[0]\n\t" + "ins v18.b[4], v22.b[0]\n\t" + "ins v18.b[5], v23.b[0]\n\t" + "ins v18.b[6], v24.b[0]\n\t" + "ins v18.b[7], v25.b[0]\n\t" + "st1 {v18.8b}, [%x[msg]], #8\n\t" + : [msg] "+r" (msg), [p] "+r" (p) + : [L_kyber_aarch64_q] "S" (L_kyber_aarch64_q), [L_kyber_aarch64_consts] "S" (L_kyber_aarch64_consts), [L_sha3_aarch64_r] "S" (L_sha3_aarch64_r), [L_kyber_aarch64_zetas] "S" (L_kyber_aarch64_zetas), [L_kyber_aarch64_zetas_qinv] "S" (L_kyber_aarch64_zetas_qinv), [L_kyber_aarch64_zetas_inv] "S" (L_kyber_aarch64_zetas_inv), [L_kyber_aarch64_zetas_inv_qinv] "S" (L_kyber_aarch64_zetas_inv_qinv), [L_kyber_aarch64_zetas_mul] "S" (L_kyber_aarch64_zetas_mul), [L_kyber_aarch64_to_msg_neon_low] "S" (L_kyber_aarch64_to_msg_neon_low), [L_kyber_aarch64_to_msg_neon_high] "S" (L_kyber_aarch64_to_msg_neon_high), [L_kyber_aarch64_to_msg_neon_bits] "S" (L_kyber_aarch64_to_msg_neon_bits) + : "memory", "x2", "x3", "x4", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "cc" + ); +} + +static const uint16_t L_kyber_aarch64_from_msg_neon_q1half[] = { + 0x681, + 0x681, + 0x681, + 0x681, + 0x681, + 0x681, + 0x681, + 0x681, +}; + +static const uint8_t L_kyber_aarch64_from_msg_neon_bits[] = { + 0x1, + 0x2, + 0x4, + 0x8, + 0x10, + 0x20, + 0x40, + 0x80, + 0x1, + 0x2, + 0x4, + 0x8, + 0x10, + 0x20, + 0x40, + 0x80, +}; + +void kyber_from_msg_neon(sword16* p, const byte* msg) +{ + __asm__ __volatile__ ( +#ifndef __APPLE__ + "adrp x2, %[L_kyber_aarch64_from_msg_neon_q1half]\n\t" + "add x2, x2, :lo12:%[L_kyber_aarch64_from_msg_neon_q1half]\n\t" +#else + "adrp x2, %[L_kyber_aarch64_from_msg_neon_q1half]@PAGE\n\t" + "add x2, x2, 
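/* For reference: kyber_to_msg_neon above is Kyber's 1-bit coefficient
 * compression (Compress_q(x, 1)): a coefficient maps to message bit 1
 * exactly when rounding 2*x/q gives an odd result, which the NEON code
 * evaluates branch-free as a pair of cmge range checks against the
 * low/high constants and then gathers with the per-lane bit masks and
 * addv. A minimal scalar sketch, assuming coefficients normalized to
 * [0, q) with q = 3329; kyber_to_msg_ref and its dividing form are
 * illustrative only, not part of this patch. */
static void kyber_to_msg_ref(byte* msg, const sword16* p)
{
    int i;
    int j;
    for (i = 0; i < 32; i++) {
        msg[i] = 0;
        for (j = 0; j < 8; j++) {
            /* Round 2*x/q to the nearest integer; keep the low bit. */
            unsigned int t = ((((unsigned int)p[8 * i + j] << 1) +
                (3329 / 2)) / 3329) & 1;
            msg[i] |= (byte)(t << j);
        }
    }
}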
%[L_kyber_aarch64_from_msg_neon_q1half]@PAGEOFF\n\t" +#endif /* __APPLE__ */ +#ifndef __APPLE__ + "adrp x3, %[L_kyber_aarch64_from_msg_neon_bits]\n\t" + "add x3, x3, :lo12:%[L_kyber_aarch64_from_msg_neon_bits]\n\t" +#else + "adrp x3, %[L_kyber_aarch64_from_msg_neon_bits]@PAGE\n\t" + "add x3, x3, %[L_kyber_aarch64_from_msg_neon_bits]@PAGEOFF\n\t" +#endif /* __APPLE__ */ + "ld1 {v2.16b, v3.16b}, [%x[msg]]\n\t" + "ldr q1, [x2]\n\t" + "ldr q0, [x3]\n\t" + "dup v4.8b, v2.b[0]\n\t" + "dup v5.8b, v2.b[1]\n\t" + "dup v6.8b, v2.b[2]\n\t" + "dup v7.8b, v2.b[3]\n\t" + "cmtst v4.8b, v4.8b, v0.8b\n\t" + "cmtst v5.8b, v5.8b, v0.8b\n\t" + "cmtst v6.8b, v6.8b, v0.8b\n\t" + "cmtst v7.8b, v7.8b, v0.8b\n\t" + "zip1 v4.16b, v4.16b, v4.16b\n\t" + "zip1 v5.16b, v5.16b, v5.16b\n\t" + "zip1 v6.16b, v6.16b, v6.16b\n\t" + "zip1 v7.16b, v7.16b, v7.16b\n\t" + "and v4.16b, v4.16b, v1.16b\n\t" + "and v5.16b, v5.16b, v1.16b\n\t" + "and v6.16b, v6.16b, v1.16b\n\t" + "and v7.16b, v7.16b, v1.16b\n\t" + "st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [%x[p]], #0x40\n\t" + "dup v4.8b, v2.b[4]\n\t" + "dup v5.8b, v2.b[5]\n\t" + "dup v6.8b, v2.b[6]\n\t" + "dup v7.8b, v2.b[7]\n\t" + "cmtst v4.8b, v4.8b, v0.8b\n\t" + "cmtst v5.8b, v5.8b, v0.8b\n\t" + "cmtst v6.8b, v6.8b, v0.8b\n\t" + "cmtst v7.8b, v7.8b, v0.8b\n\t" + "zip1 v4.16b, v4.16b, v4.16b\n\t" + "zip1 v5.16b, v5.16b, v5.16b\n\t" + "zip1 v6.16b, v6.16b, v6.16b\n\t" + "zip1 v7.16b, v7.16b, v7.16b\n\t" + "and v4.16b, v4.16b, v1.16b\n\t" + "and v5.16b, v5.16b, v1.16b\n\t" + "and v6.16b, v6.16b, v1.16b\n\t" + "and v7.16b, v7.16b, v1.16b\n\t" + "st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [%x[p]], #0x40\n\t" + "dup v4.8b, v2.b[8]\n\t" + "dup v5.8b, v2.b[9]\n\t" + "dup v6.8b, v2.b[10]\n\t" + "dup v7.8b, v2.b[11]\n\t" + "cmtst v4.8b, v4.8b, v0.8b\n\t" + "cmtst v5.8b, v5.8b, v0.8b\n\t" + "cmtst v6.8b, v6.8b, v0.8b\n\t" + "cmtst v7.8b, v7.8b, v0.8b\n\t" + "zip1 v4.16b, v4.16b, v4.16b\n\t" + "zip1 v5.16b, v5.16b, v5.16b\n\t" + "zip1 v6.16b, v6.16b, v6.16b\n\t" + "zip1 v7.16b, v7.16b, v7.16b\n\t" + "and v4.16b, v4.16b, v1.16b\n\t" + "and v5.16b, v5.16b, v1.16b\n\t" + "and v6.16b, v6.16b, v1.16b\n\t" + "and v7.16b, v7.16b, v1.16b\n\t" + "st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [%x[p]], #0x40\n\t" + "dup v4.8b, v2.b[12]\n\t" + "dup v5.8b, v2.b[13]\n\t" + "dup v6.8b, v2.b[14]\n\t" + "dup v7.8b, v2.b[15]\n\t" + "cmtst v4.8b, v4.8b, v0.8b\n\t" + "cmtst v5.8b, v5.8b, v0.8b\n\t" + "cmtst v6.8b, v6.8b, v0.8b\n\t" + "cmtst v7.8b, v7.8b, v0.8b\n\t" + "zip1 v4.16b, v4.16b, v4.16b\n\t" + "zip1 v5.16b, v5.16b, v5.16b\n\t" + "zip1 v6.16b, v6.16b, v6.16b\n\t" + "zip1 v7.16b, v7.16b, v7.16b\n\t" + "and v4.16b, v4.16b, v1.16b\n\t" + "and v5.16b, v5.16b, v1.16b\n\t" + "and v6.16b, v6.16b, v1.16b\n\t" + "and v7.16b, v7.16b, v1.16b\n\t" + "st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [%x[p]], #0x40\n\t" + "dup v4.8b, v3.b[0]\n\t" + "dup v5.8b, v3.b[1]\n\t" + "dup v6.8b, v3.b[2]\n\t" + "dup v7.8b, v3.b[3]\n\t" + "cmtst v4.8b, v4.8b, v0.8b\n\t" + "cmtst v5.8b, v5.8b, v0.8b\n\t" + "cmtst v6.8b, v6.8b, v0.8b\n\t" + "cmtst v7.8b, v7.8b, v0.8b\n\t" + "zip1 v4.16b, v4.16b, v4.16b\n\t" + "zip1 v5.16b, v5.16b, v5.16b\n\t" + "zip1 v6.16b, v6.16b, v6.16b\n\t" + "zip1 v7.16b, v7.16b, v7.16b\n\t" + "and v4.16b, v4.16b, v1.16b\n\t" + "and v5.16b, v5.16b, v1.16b\n\t" + "and v6.16b, v6.16b, v1.16b\n\t" + "and v7.16b, v7.16b, v1.16b\n\t" + "st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [%x[p]], #0x40\n\t" + "dup v4.8b, v3.b[4]\n\t" + "dup v5.8b, v3.b[5]\n\t" + "dup v6.8b, v3.b[6]\n\t" + "dup v7.8b, v3.b[7]\n\t" + "cmtst v4.8b, v4.8b, v0.8b\n\t" + "cmtst v5.8b, v5.8b, 
v0.8b\n\t" + "cmtst v6.8b, v6.8b, v0.8b\n\t" + "cmtst v7.8b, v7.8b, v0.8b\n\t" + "zip1 v4.16b, v4.16b, v4.16b\n\t" + "zip1 v5.16b, v5.16b, v5.16b\n\t" + "zip1 v6.16b, v6.16b, v6.16b\n\t" + "zip1 v7.16b, v7.16b, v7.16b\n\t" + "and v4.16b, v4.16b, v1.16b\n\t" + "and v5.16b, v5.16b, v1.16b\n\t" + "and v6.16b, v6.16b, v1.16b\n\t" + "and v7.16b, v7.16b, v1.16b\n\t" + "st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [%x[p]], #0x40\n\t" + "dup v4.8b, v3.b[8]\n\t" + "dup v5.8b, v3.b[9]\n\t" + "dup v6.8b, v3.b[10]\n\t" + "dup v7.8b, v3.b[11]\n\t" + "cmtst v4.8b, v4.8b, v0.8b\n\t" + "cmtst v5.8b, v5.8b, v0.8b\n\t" + "cmtst v6.8b, v6.8b, v0.8b\n\t" + "cmtst v7.8b, v7.8b, v0.8b\n\t" + "zip1 v4.16b, v4.16b, v4.16b\n\t" + "zip1 v5.16b, v5.16b, v5.16b\n\t" + "zip1 v6.16b, v6.16b, v6.16b\n\t" + "zip1 v7.16b, v7.16b, v7.16b\n\t" + "and v4.16b, v4.16b, v1.16b\n\t" + "and v5.16b, v5.16b, v1.16b\n\t" + "and v6.16b, v6.16b, v1.16b\n\t" + "and v7.16b, v7.16b, v1.16b\n\t" + "st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [%x[p]], #0x40\n\t" + "dup v4.8b, v3.b[12]\n\t" + "dup v5.8b, v3.b[13]\n\t" + "dup v6.8b, v3.b[14]\n\t" + "dup v7.8b, v3.b[15]\n\t" + "cmtst v4.8b, v4.8b, v0.8b\n\t" + "cmtst v5.8b, v5.8b, v0.8b\n\t" + "cmtst v6.8b, v6.8b, v0.8b\n\t" + "cmtst v7.8b, v7.8b, v0.8b\n\t" + "zip1 v4.16b, v4.16b, v4.16b\n\t" + "zip1 v5.16b, v5.16b, v5.16b\n\t" + "zip1 v6.16b, v6.16b, v6.16b\n\t" + "zip1 v7.16b, v7.16b, v7.16b\n\t" + "and v4.16b, v4.16b, v1.16b\n\t" + "and v5.16b, v5.16b, v1.16b\n\t" + "and v6.16b, v6.16b, v1.16b\n\t" + "and v7.16b, v7.16b, v1.16b\n\t" + "st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [%x[p]], #0x40\n\t" + : [p] "+r" (p), [msg] "+r" (msg) + : [L_kyber_aarch64_q] "S" (L_kyber_aarch64_q), [L_kyber_aarch64_consts] "S" (L_kyber_aarch64_consts), [L_sha3_aarch64_r] "S" (L_sha3_aarch64_r), [L_kyber_aarch64_zetas] "S" (L_kyber_aarch64_zetas), [L_kyber_aarch64_zetas_qinv] "S" (L_kyber_aarch64_zetas_qinv), [L_kyber_aarch64_zetas_inv] "S" (L_kyber_aarch64_zetas_inv), [L_kyber_aarch64_zetas_inv_qinv] "S" (L_kyber_aarch64_zetas_inv_qinv), [L_kyber_aarch64_zetas_mul] "S" (L_kyber_aarch64_zetas_mul), [L_kyber_aarch64_to_msg_neon_low] "S" (L_kyber_aarch64_to_msg_neon_low), [L_kyber_aarch64_to_msg_neon_high] "S" (L_kyber_aarch64_to_msg_neon_high), [L_kyber_aarch64_to_msg_neon_bits] "S" (L_kyber_aarch64_to_msg_neon_bits), [L_kyber_aarch64_from_msg_neon_q1half] "S" (L_kyber_aarch64_from_msg_neon_q1half), [L_kyber_aarch64_from_msg_neon_bits] "S" (L_kyber_aarch64_from_msg_neon_bits) + : "memory", "x2", "x3", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "cc" + ); +} + +int kyber_cmp_neon(const byte* a, const byte* b, int sz) +{ + __asm__ __volatile__ ( + "ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [%x[a]], #0x40\n\t" + "ld4 {v4.16b, v5.16b, v6.16b, v7.16b}, [%x[b]], #0x40\n\t" + "eor v8.16b, v0.16b, v4.16b\n\t" + "eor v9.16b, v1.16b, v5.16b\n\t" + "eor v10.16b, v2.16b, v6.16b\n\t" + "eor v11.16b, v3.16b, v7.16b\n\t" + "ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [%x[a]], #0x40\n\t" + "ld4 {v4.16b, v5.16b, v6.16b, v7.16b}, [%x[b]], #0x40\n\t" + "eor v0.16b, v0.16b, v4.16b\n\t" + "eor v1.16b, v1.16b, v5.16b\n\t" + "eor v2.16b, v2.16b, v6.16b\n\t" + "eor v3.16b, v3.16b, v7.16b\n\t" + "orr v8.16b, v8.16b, v0.16b\n\t" + "orr v9.16b, v9.16b, v1.16b\n\t" + "orr v10.16b, v10.16b, v2.16b\n\t" + "orr v11.16b, v11.16b, v3.16b\n\t" + "ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [%x[a]], #0x40\n\t" + "ld4 {v4.16b, v5.16b, v6.16b, v7.16b}, [%x[b]], #0x40\n\t" + "eor v0.16b, v0.16b, v4.16b\n\t" + "eor v1.16b, v1.16b, v5.16b\n\t" + "eor v2.16b, 
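/* For reference: kyber_from_msg_neon above is the matching 1-bit
 * decompression (Decompress_q(b, 1)): a set message bit becomes
 * round(q/2) = 1665, the 0x681 value in the q1half table, and a clear
 * bit becomes 0. cmtst turns each bit into an all-ones or all-zero lane
 * mask that is then ANDed with q1half, so there is no data-dependent
 * branch. A minimal scalar sketch; kyber_from_msg_ref is illustrative
 * only, not part of this patch. */
static void kyber_from_msg_ref(sword16* p, const byte* msg)
{
    int i;
    int j;
    for (i = 0; i < 32; i++) {
        for (j = 0; j < 8; j++) {
            /* 0 - bit is an all-ones mask when the bit is set. */
            sword16 mask = (sword16)(0 - (sword16)((msg[i] >> j) & 1));
            p[8 * i + j] = (sword16)(mask & 1665);
        }
    }
}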
v2.16b, v6.16b\n\t" + "eor v3.16b, v3.16b, v7.16b\n\t" + "orr v8.16b, v8.16b, v0.16b\n\t" + "orr v9.16b, v9.16b, v1.16b\n\t" + "orr v10.16b, v10.16b, v2.16b\n\t" + "orr v11.16b, v11.16b, v3.16b\n\t" + "ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [%x[a]], #0x40\n\t" + "ld4 {v4.16b, v5.16b, v6.16b, v7.16b}, [%x[b]], #0x40\n\t" + "eor v0.16b, v0.16b, v4.16b\n\t" + "eor v1.16b, v1.16b, v5.16b\n\t" + "eor v2.16b, v2.16b, v6.16b\n\t" + "eor v3.16b, v3.16b, v7.16b\n\t" + "orr v8.16b, v8.16b, v0.16b\n\t" + "orr v9.16b, v9.16b, v1.16b\n\t" + "orr v10.16b, v10.16b, v2.16b\n\t" + "orr v11.16b, v11.16b, v3.16b\n\t" + "ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [%x[a]], #0x40\n\t" + "ld4 {v4.16b, v5.16b, v6.16b, v7.16b}, [%x[b]], #0x40\n\t" + "eor v0.16b, v0.16b, v4.16b\n\t" + "eor v1.16b, v1.16b, v5.16b\n\t" + "eor v2.16b, v2.16b, v6.16b\n\t" + "eor v3.16b, v3.16b, v7.16b\n\t" + "orr v8.16b, v8.16b, v0.16b\n\t" + "orr v9.16b, v9.16b, v1.16b\n\t" + "orr v10.16b, v10.16b, v2.16b\n\t" + "orr v11.16b, v11.16b, v3.16b\n\t" + "ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [%x[a]], #0x40\n\t" + "ld4 {v4.16b, v5.16b, v6.16b, v7.16b}, [%x[b]], #0x40\n\t" + "eor v0.16b, v0.16b, v4.16b\n\t" + "eor v1.16b, v1.16b, v5.16b\n\t" + "eor v2.16b, v2.16b, v6.16b\n\t" + "eor v3.16b, v3.16b, v7.16b\n\t" + "orr v8.16b, v8.16b, v0.16b\n\t" + "orr v9.16b, v9.16b, v1.16b\n\t" + "orr v10.16b, v10.16b, v2.16b\n\t" + "orr v11.16b, v11.16b, v3.16b\n\t" + "ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [%x[a]], #0x40\n\t" + "ld4 {v4.16b, v5.16b, v6.16b, v7.16b}, [%x[b]], #0x40\n\t" + "eor v0.16b, v0.16b, v4.16b\n\t" + "eor v1.16b, v1.16b, v5.16b\n\t" + "eor v2.16b, v2.16b, v6.16b\n\t" + "eor v3.16b, v3.16b, v7.16b\n\t" + "orr v8.16b, v8.16b, v0.16b\n\t" + "orr v9.16b, v9.16b, v1.16b\n\t" + "orr v10.16b, v10.16b, v2.16b\n\t" + "orr v11.16b, v11.16b, v3.16b\n\t" + "ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [%x[a]], #0x40\n\t" + "ld4 {v4.16b, v5.16b, v6.16b, v7.16b}, [%x[b]], #0x40\n\t" + "eor v0.16b, v0.16b, v4.16b\n\t" + "eor v1.16b, v1.16b, v5.16b\n\t" + "eor v2.16b, v2.16b, v6.16b\n\t" + "eor v3.16b, v3.16b, v7.16b\n\t" + "orr v8.16b, v8.16b, v0.16b\n\t" + "orr v9.16b, v9.16b, v1.16b\n\t" + "orr v10.16b, v10.16b, v2.16b\n\t" + "orr v11.16b, v11.16b, v3.16b\n\t" + "ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [%x[a]], #0x40\n\t" + "ld4 {v4.16b, v5.16b, v6.16b, v7.16b}, [%x[b]], #0x40\n\t" + "eor v0.16b, v0.16b, v4.16b\n\t" + "eor v1.16b, v1.16b, v5.16b\n\t" + "eor v2.16b, v2.16b, v6.16b\n\t" + "eor v3.16b, v3.16b, v7.16b\n\t" + "orr v8.16b, v8.16b, v0.16b\n\t" + "orr v9.16b, v9.16b, v1.16b\n\t" + "orr v10.16b, v10.16b, v2.16b\n\t" + "orr v11.16b, v11.16b, v3.16b\n\t" + "ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [%x[a]], #0x40\n\t" + "ld4 {v4.16b, v5.16b, v6.16b, v7.16b}, [%x[b]], #0x40\n\t" + "eor v0.16b, v0.16b, v4.16b\n\t" + "eor v1.16b, v1.16b, v5.16b\n\t" + "eor v2.16b, v2.16b, v6.16b\n\t" + "eor v3.16b, v3.16b, v7.16b\n\t" + "orr v8.16b, v8.16b, v0.16b\n\t" + "orr v9.16b, v9.16b, v1.16b\n\t" + "orr v10.16b, v10.16b, v2.16b\n\t" + "orr v11.16b, v11.16b, v3.16b\n\t" + "ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [%x[a]], #0x40\n\t" + "ld4 {v4.16b, v5.16b, v6.16b, v7.16b}, [%x[b]], #0x40\n\t" + "eor v0.16b, v0.16b, v4.16b\n\t" + "eor v1.16b, v1.16b, v5.16b\n\t" + "eor v2.16b, v2.16b, v6.16b\n\t" + "eor v3.16b, v3.16b, v7.16b\n\t" + "orr v8.16b, v8.16b, v0.16b\n\t" + "orr v9.16b, v9.16b, v1.16b\n\t" + "orr v10.16b, v10.16b, v2.16b\n\t" + "orr v11.16b, v11.16b, v3.16b\n\t" + "ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [%x[a]], #0x40\n\t" + "ld4 {v4.16b, v5.16b, v6.16b, v7.16b}, 
[%x[b]], #0x40\n\t" + "eor v0.16b, v0.16b, v4.16b\n\t" + "eor v1.16b, v1.16b, v5.16b\n\t" + "eor v2.16b, v2.16b, v6.16b\n\t" + "eor v3.16b, v3.16b, v7.16b\n\t" + "orr v8.16b, v8.16b, v0.16b\n\t" + "orr v9.16b, v9.16b, v1.16b\n\t" + "orr v10.16b, v10.16b, v2.16b\n\t" + "orr v11.16b, v11.16b, v3.16b\n\t" + "subs %w[sz], %w[sz], #0x300\n\t" + "beq L_kyber_aarch64_cmp_neon_done_%=\n\t" + "ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [%x[a]], #0x40\n\t" + "ld4 {v4.16b, v5.16b, v6.16b, v7.16b}, [%x[b]], #0x40\n\t" + "eor v0.16b, v0.16b, v4.16b\n\t" + "eor v1.16b, v1.16b, v5.16b\n\t" + "eor v2.16b, v2.16b, v6.16b\n\t" + "eor v3.16b, v3.16b, v7.16b\n\t" + "orr v8.16b, v8.16b, v0.16b\n\t" + "orr v9.16b, v9.16b, v1.16b\n\t" + "orr v10.16b, v10.16b, v2.16b\n\t" + "orr v11.16b, v11.16b, v3.16b\n\t" + "ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [%x[a]], #0x40\n\t" + "ld4 {v4.16b, v5.16b, v6.16b, v7.16b}, [%x[b]], #0x40\n\t" + "eor v0.16b, v0.16b, v4.16b\n\t" + "eor v1.16b, v1.16b, v5.16b\n\t" + "eor v2.16b, v2.16b, v6.16b\n\t" + "eor v3.16b, v3.16b, v7.16b\n\t" + "orr v8.16b, v8.16b, v0.16b\n\t" + "orr v9.16b, v9.16b, v1.16b\n\t" + "orr v10.16b, v10.16b, v2.16b\n\t" + "orr v11.16b, v11.16b, v3.16b\n\t" + "ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [%x[a]], #0x40\n\t" + "ld4 {v4.16b, v5.16b, v6.16b, v7.16b}, [%x[b]], #0x40\n\t" + "eor v0.16b, v0.16b, v4.16b\n\t" + "eor v1.16b, v1.16b, v5.16b\n\t" + "eor v2.16b, v2.16b, v6.16b\n\t" + "eor v3.16b, v3.16b, v7.16b\n\t" + "orr v8.16b, v8.16b, v0.16b\n\t" + "orr v9.16b, v9.16b, v1.16b\n\t" + "orr v10.16b, v10.16b, v2.16b\n\t" + "orr v11.16b, v11.16b, v3.16b\n\t" + "ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [%x[a]], #0x40\n\t" + "ld4 {v4.16b, v5.16b, v6.16b, v7.16b}, [%x[b]], #0x40\n\t" + "eor v0.16b, v0.16b, v4.16b\n\t" + "eor v1.16b, v1.16b, v5.16b\n\t" + "eor v2.16b, v2.16b, v6.16b\n\t" + "eor v3.16b, v3.16b, v7.16b\n\t" + "orr v8.16b, v8.16b, v0.16b\n\t" + "orr v9.16b, v9.16b, v1.16b\n\t" + "orr v10.16b, v10.16b, v2.16b\n\t" + "orr v11.16b, v11.16b, v3.16b\n\t" + "ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [%x[a]], #0x40\n\t" + "ld4 {v4.16b, v5.16b, v6.16b, v7.16b}, [%x[b]], #0x40\n\t" + "eor v0.16b, v0.16b, v4.16b\n\t" + "eor v1.16b, v1.16b, v5.16b\n\t" + "eor v2.16b, v2.16b, v6.16b\n\t" + "eor v3.16b, v3.16b, v7.16b\n\t" + "orr v8.16b, v8.16b, v0.16b\n\t" + "orr v9.16b, v9.16b, v1.16b\n\t" + "orr v10.16b, v10.16b, v2.16b\n\t" + "orr v11.16b, v11.16b, v3.16b\n\t" + "subs %w[sz], %w[sz], #0x140\n\t" + "beq L_kyber_aarch64_cmp_neon_done_%=\n\t" + "ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [%x[a]], #0x40\n\t" + "ld4 {v4.16b, v5.16b, v6.16b, v7.16b}, [%x[b]], #0x40\n\t" + "eor v0.16b, v0.16b, v4.16b\n\t" + "eor v1.16b, v1.16b, v5.16b\n\t" + "eor v2.16b, v2.16b, v6.16b\n\t" + "eor v3.16b, v3.16b, v7.16b\n\t" + "orr v8.16b, v8.16b, v0.16b\n\t" + "orr v9.16b, v9.16b, v1.16b\n\t" + "orr v10.16b, v10.16b, v2.16b\n\t" + "orr v11.16b, v11.16b, v3.16b\n\t" + "ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [%x[a]], #0x40\n\t" + "ld4 {v4.16b, v5.16b, v6.16b, v7.16b}, [%x[b]], #0x40\n\t" + "eor v0.16b, v0.16b, v4.16b\n\t" + "eor v1.16b, v1.16b, v5.16b\n\t" + "eor v2.16b, v2.16b, v6.16b\n\t" + "eor v3.16b, v3.16b, v7.16b\n\t" + "orr v8.16b, v8.16b, v0.16b\n\t" + "orr v9.16b, v9.16b, v1.16b\n\t" + "orr v10.16b, v10.16b, v2.16b\n\t" + "orr v11.16b, v11.16b, v3.16b\n\t" + "ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [%x[a]], #0x40\n\t" + "ld4 {v4.16b, v5.16b, v6.16b, v7.16b}, [%x[b]], #0x40\n\t" + "eor v0.16b, v0.16b, v4.16b\n\t" + "eor v1.16b, v1.16b, v5.16b\n\t" + "eor v2.16b, v2.16b, v6.16b\n\t" + "eor v3.16b, 
v3.16b, v7.16b\n\t" + "orr v8.16b, v8.16b, v0.16b\n\t" + "orr v9.16b, v9.16b, v1.16b\n\t" + "orr v10.16b, v10.16b, v2.16b\n\t" + "orr v11.16b, v11.16b, v3.16b\n\t" + "ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [%x[a]], #0x40\n\t" + "ld4 {v4.16b, v5.16b, v6.16b, v7.16b}, [%x[b]], #0x40\n\t" + "eor v0.16b, v0.16b, v4.16b\n\t" + "eor v1.16b, v1.16b, v5.16b\n\t" + "eor v2.16b, v2.16b, v6.16b\n\t" + "eor v3.16b, v3.16b, v7.16b\n\t" + "orr v8.16b, v8.16b, v0.16b\n\t" + "orr v9.16b, v9.16b, v1.16b\n\t" + "orr v10.16b, v10.16b, v2.16b\n\t" + "orr v11.16b, v11.16b, v3.16b\n\t" + "ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [%x[a]], #0x40\n\t" + "ld4 {v4.16b, v5.16b, v6.16b, v7.16b}, [%x[b]], #0x40\n\t" + "eor v0.16b, v0.16b, v4.16b\n\t" + "eor v1.16b, v1.16b, v5.16b\n\t" + "eor v2.16b, v2.16b, v6.16b\n\t" + "eor v3.16b, v3.16b, v7.16b\n\t" + "orr v8.16b, v8.16b, v0.16b\n\t" + "orr v9.16b, v9.16b, v1.16b\n\t" + "orr v10.16b, v10.16b, v2.16b\n\t" + "orr v11.16b, v11.16b, v3.16b\n\t" + "ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [%x[a]], #0x40\n\t" + "ld4 {v4.16b, v5.16b, v6.16b, v7.16b}, [%x[b]], #0x40\n\t" + "eor v0.16b, v0.16b, v4.16b\n\t" + "eor v1.16b, v1.16b, v5.16b\n\t" + "eor v2.16b, v2.16b, v6.16b\n\t" + "eor v3.16b, v3.16b, v7.16b\n\t" + "orr v8.16b, v8.16b, v0.16b\n\t" + "orr v9.16b, v9.16b, v1.16b\n\t" + "orr v10.16b, v10.16b, v2.16b\n\t" + "orr v11.16b, v11.16b, v3.16b\n\t" + "ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [%x[a]], #0x40\n\t" + "ld4 {v4.16b, v5.16b, v6.16b, v7.16b}, [%x[b]], #0x40\n\t" + "eor v0.16b, v0.16b, v4.16b\n\t" + "eor v1.16b, v1.16b, v5.16b\n\t" + "eor v2.16b, v2.16b, v6.16b\n\t" + "eor v3.16b, v3.16b, v7.16b\n\t" + "orr v8.16b, v8.16b, v0.16b\n\t" + "orr v9.16b, v9.16b, v1.16b\n\t" + "orr v10.16b, v10.16b, v2.16b\n\t" + "orr v11.16b, v11.16b, v3.16b\n\t" + "ld2 {v0.16b, v1.16b}, [%x[a]]\n\t" + "ld2 {v4.16b, v5.16b}, [%x[b]]\n\t" + "eor v0.16b, v0.16b, v4.16b\n\t" + "eor v1.16b, v1.16b, v5.16b\n\t" + "orr v8.16b, v8.16b, v0.16b\n\t" + "orr v9.16b, v9.16b, v1.16b\n\t" + "\n" + "L_kyber_aarch64_cmp_neon_done_%=: \n\t" + "orr v8.16b, v8.16b, v9.16b\n\t" + "orr v10.16b, v10.16b, v11.16b\n\t" + "orr v8.16b, v8.16b, v10.16b\n\t" + "ins v9.b[0], v8.b[1]\n\t" + "orr v8.16b, v8.16b, v9.16b\n\t" + "mov x0, v8.d[0]\n\t" + "subs x0, x0, xzr\n\t" + "csetm w0, ne\n\t" + : [a] "+r" (a), [b] "+r" (b), [sz] "+r" (sz) + : [L_kyber_aarch64_q] "S" (L_kyber_aarch64_q), [L_kyber_aarch64_consts] "S" (L_kyber_aarch64_consts), [L_sha3_aarch64_r] "S" (L_sha3_aarch64_r), [L_kyber_aarch64_zetas] "S" (L_kyber_aarch64_zetas), [L_kyber_aarch64_zetas_qinv] "S" (L_kyber_aarch64_zetas_qinv), [L_kyber_aarch64_zetas_inv] "S" (L_kyber_aarch64_zetas_inv), [L_kyber_aarch64_zetas_inv_qinv] "S" (L_kyber_aarch64_zetas_inv_qinv), [L_kyber_aarch64_zetas_mul] "S" (L_kyber_aarch64_zetas_mul), [L_kyber_aarch64_to_msg_neon_low] "S" (L_kyber_aarch64_to_msg_neon_low), [L_kyber_aarch64_to_msg_neon_high] "S" (L_kyber_aarch64_to_msg_neon_high), [L_kyber_aarch64_to_msg_neon_bits] "S" (L_kyber_aarch64_to_msg_neon_bits), [L_kyber_aarch64_from_msg_neon_q1half] "S" (L_kyber_aarch64_from_msg_neon_q1half), [L_kyber_aarch64_from_msg_neon_bits] "S" (L_kyber_aarch64_from_msg_neon_bits) + : "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "cc" + ); + return (uint32_t)(size_t)a; +} + +static const uint16_t L_kyber_aarch64_rej_uniform_neon_mask[] = { + 0xfff, + 0xfff, + 0xfff, + 0xfff, + 0xfff, + 0xfff, + 0xfff, + 0xfff, +}; + +static const uint16_t L_kyber_aarch64_rej_uniform_neon_bits[] = { + 0x1, + 0x2, + 0x4, + 
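/* For reference: kyber_cmp_neon above is a constant-time comparison
 * returning 0 when the buffers are equal and -1 (all ones, via csetm)
 * otherwise; the subs against 0x300 (768) and 0x140 (320) step through
 * exactly the Kyber ciphertext sizes 768, 1088 and 1568 bytes. A generic
 * scalar sketch with the same contract; kyber_cmp_ref is illustrative
 * only, not part of this patch. */
static int kyber_cmp_ref(const byte* a, const byte* b, int sz)
{
    int i;
    unsigned int acc = 0;
    /* Accumulate all byte differences; no data-dependent branches. */
    for (i = 0; i < sz; i++)
        acc |= (unsigned int)(a[i] ^ b[i]);
    /* Fold the accumulator to 0 / -1 without branching on its value:
     * acc == 0 gives (0xff >> 8) = 0, any nonzero acc gives 1. */
    return 0 - (int)((acc + 0xff) >> 8);
}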
0x8, + 0x10, + 0x20, + 0x40, + 0x80, +}; + +static const uint8_t L_kyber_aarch64_rej_uniform_neon_indeces[] = { + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x2, + 0x3, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x2, + 0x3, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x4, + 0x5, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x4, + 0x5, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x2, + 0x3, + 0x4, + 0x5, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x2, + 0x3, + 0x4, + 0x5, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x6, + 0x7, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x6, + 0x7, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x2, + 0x3, + 0x6, + 0x7, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x2, + 0x3, + 0x6, + 0x7, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x4, + 0x5, + 0x6, + 0x7, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x4, + 0x5, + 0x6, + 0x7, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x2, + 0x3, + 0x4, + 0x5, + 0x6, + 0x7, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x2, + 0x3, + 0x4, + 0x5, + 0x6, + 0x7, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x8, + 0x9, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x8, + 0x9, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x2, + 0x3, + 0x8, + 0x9, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x2, + 0x3, + 0x8, + 0x9, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x4, + 0x5, + 0x8, + 0x9, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x4, + 0x5, + 0x8, + 0x9, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x2, + 0x3, + 0x4, + 0x5, + 0x8, + 0x9, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x2, + 0x3, + 0x4, + 0x5, + 0x8, + 0x9, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x6, + 0x7, + 0x8, + 0x9, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x6, + 0x7, + 0x8, + 0x9, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x2, + 0x3, + 0x6, + 0x7, + 0x8, + 0x9, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x2, + 0x3, + 0x6, + 0x7, + 0x8, + 0x9, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 
0xff, + 0xff, + 0x4, + 0x5, + 0x6, + 0x7, + 0x8, + 0x9, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x4, + 0x5, + 0x6, + 0x7, + 0x8, + 0x9, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x2, + 0x3, + 0x4, + 0x5, + 0x6, + 0x7, + 0x8, + 0x9, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x2, + 0x3, + 0x4, + 0x5, + 0x6, + 0x7, + 0x8, + 0x9, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xa, + 0xb, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0xa, + 0xb, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x2, + 0x3, + 0xa, + 0xb, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x2, + 0x3, + 0xa, + 0xb, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x4, + 0x5, + 0xa, + 0xb, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x4, + 0x5, + 0xa, + 0xb, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x2, + 0x3, + 0x4, + 0x5, + 0xa, + 0xb, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x2, + 0x3, + 0x4, + 0x5, + 0xa, + 0xb, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x6, + 0x7, + 0xa, + 0xb, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x6, + 0x7, + 0xa, + 0xb, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x2, + 0x3, + 0x6, + 0x7, + 0xa, + 0xb, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x2, + 0x3, + 0x6, + 0x7, + 0xa, + 0xb, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x4, + 0x5, + 0x6, + 0x7, + 0xa, + 0xb, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x4, + 0x5, + 0x6, + 0x7, + 0xa, + 0xb, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x2, + 0x3, + 0x4, + 0x5, + 0x6, + 0x7, + 0xa, + 0xb, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x2, + 0x3, + 0x4, + 0x5, + 0x6, + 0x7, + 0xa, + 0xb, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x8, + 0x9, + 0xa, + 0xb, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x8, + 0x9, + 0xa, + 0xb, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x2, + 0x3, + 0x8, + 0x9, + 0xa, + 0xb, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x2, + 0x3, + 0x8, + 0x9, + 0xa, + 0xb, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x4, + 0x5, + 0x8, + 0x9, + 0xa, + 0xb, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x4, + 0x5, + 0x8, + 0x9, + 0xa, + 0xb, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x2, + 0x3, + 0x4, + 0x5, + 0x8, + 0x9, + 0xa, + 0xb, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x2, + 0x3, + 0x4, + 0x5, + 0x8, + 0x9, + 0xa, + 0xb, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x6, + 0x7, + 0x8, + 0x9, + 0xa, + 0xb, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 
0x6, + 0x7, + 0x8, + 0x9, + 0xa, + 0xb, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x2, + 0x3, + 0x6, + 0x7, + 0x8, + 0x9, + 0xa, + 0xb, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x2, + 0x3, + 0x6, + 0x7, + 0x8, + 0x9, + 0xa, + 0xb, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x4, + 0x5, + 0x6, + 0x7, + 0x8, + 0x9, + 0xa, + 0xb, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x4, + 0x5, + 0x6, + 0x7, + 0x8, + 0x9, + 0xa, + 0xb, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x2, + 0x3, + 0x4, + 0x5, + 0x6, + 0x7, + 0x8, + 0x9, + 0xa, + 0xb, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x2, + 0x3, + 0x4, + 0x5, + 0x6, + 0x7, + 0x8, + 0x9, + 0xa, + 0xb, + 0xff, + 0xff, + 0xff, + 0xff, + 0xc, + 0xd, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0xc, + 0xd, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x2, + 0x3, + 0xc, + 0xd, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x2, + 0x3, + 0xc, + 0xd, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x4, + 0x5, + 0xc, + 0xd, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x4, + 0x5, + 0xc, + 0xd, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x2, + 0x3, + 0x4, + 0x5, + 0xc, + 0xd, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x2, + 0x3, + 0x4, + 0x5, + 0xc, + 0xd, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x6, + 0x7, + 0xc, + 0xd, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x6, + 0x7, + 0xc, + 0xd, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x2, + 0x3, + 0x6, + 0x7, + 0xc, + 0xd, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x2, + 0x3, + 0x6, + 0x7, + 0xc, + 0xd, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x4, + 0x5, + 0x6, + 0x7, + 0xc, + 0xd, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x4, + 0x5, + 0x6, + 0x7, + 0xc, + 0xd, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x2, + 0x3, + 0x4, + 0x5, + 0x6, + 0x7, + 0xc, + 0xd, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x2, + 0x3, + 0x4, + 0x5, + 0x6, + 0x7, + 0xc, + 0xd, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x8, + 0x9, + 0xc, + 0xd, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x8, + 0x9, + 0xc, + 0xd, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x2, + 0x3, + 0x8, + 0x9, + 0xc, + 0xd, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x2, + 0x3, + 0x8, + 0x9, + 0xc, + 0xd, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x4, + 0x5, + 0x8, + 0x9, + 0xc, + 0xd, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x4, + 0x5, + 0x8, + 0x9, + 0xc, + 0xd, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x2, + 0x3, + 0x4, + 0x5, + 0x8, + 0x9, + 0xc, + 0xd, + 
0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x2, + 0x3, + 0x4, + 0x5, + 0x8, + 0x9, + 0xc, + 0xd, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x6, + 0x7, + 0x8, + 0x9, + 0xc, + 0xd, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x6, + 0x7, + 0x8, + 0x9, + 0xc, + 0xd, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x2, + 0x3, + 0x6, + 0x7, + 0x8, + 0x9, + 0xc, + 0xd, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x2, + 0x3, + 0x6, + 0x7, + 0x8, + 0x9, + 0xc, + 0xd, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x4, + 0x5, + 0x6, + 0x7, + 0x8, + 0x9, + 0xc, + 0xd, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x4, + 0x5, + 0x6, + 0x7, + 0x8, + 0x9, + 0xc, + 0xd, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x2, + 0x3, + 0x4, + 0x5, + 0x6, + 0x7, + 0x8, + 0x9, + 0xc, + 0xd, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x2, + 0x3, + 0x4, + 0x5, + 0x6, + 0x7, + 0x8, + 0x9, + 0xc, + 0xd, + 0xff, + 0xff, + 0xff, + 0xff, + 0xa, + 0xb, + 0xc, + 0xd, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0xa, + 0xb, + 0xc, + 0xd, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x2, + 0x3, + 0xa, + 0xb, + 0xc, + 0xd, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x2, + 0x3, + 0xa, + 0xb, + 0xc, + 0xd, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x4, + 0x5, + 0xa, + 0xb, + 0xc, + 0xd, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x4, + 0x5, + 0xa, + 0xb, + 0xc, + 0xd, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x2, + 0x3, + 0x4, + 0x5, + 0xa, + 0xb, + 0xc, + 0xd, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x2, + 0x3, + 0x4, + 0x5, + 0xa, + 0xb, + 0xc, + 0xd, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x6, + 0x7, + 0xa, + 0xb, + 0xc, + 0xd, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x6, + 0x7, + 0xa, + 0xb, + 0xc, + 0xd, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x2, + 0x3, + 0x6, + 0x7, + 0xa, + 0xb, + 0xc, + 0xd, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x2, + 0x3, + 0x6, + 0x7, + 0xa, + 0xb, + 0xc, + 0xd, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x4, + 0x5, + 0x6, + 0x7, + 0xa, + 0xb, + 0xc, + 0xd, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x4, + 0x5, + 0x6, + 0x7, + 0xa, + 0xb, + 0xc, + 0xd, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x2, + 0x3, + 0x4, + 0x5, + 0x6, + 0x7, + 0xa, + 0xb, + 0xc, + 0xd, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x2, + 0x3, + 0x4, + 0x5, + 0x6, + 0x7, + 0xa, + 0xb, + 0xc, + 0xd, + 0xff, + 0xff, + 0xff, + 0xff, + 0x8, + 0x9, + 0xa, + 0xb, + 0xc, + 0xd, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x8, + 0x9, + 0xa, + 0xb, + 0xc, + 0xd, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x2, + 0x3, + 0x8, + 0x9, + 0xa, + 0xb, + 0xc, + 0xd, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x2, + 0x3, + 0x8, + 0x9, + 0xa, + 0xb, + 0xc, + 0xd, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x4, + 0x5, + 0x8, + 
0x9, + 0xa, + 0xb, + 0xc, + 0xd, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x4, + 0x5, + 0x8, + 0x9, + 0xa, + 0xb, + 0xc, + 0xd, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x2, + 0x3, + 0x4, + 0x5, + 0x8, + 0x9, + 0xa, + 0xb, + 0xc, + 0xd, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x2, + 0x3, + 0x4, + 0x5, + 0x8, + 0x9, + 0xa, + 0xb, + 0xc, + 0xd, + 0xff, + 0xff, + 0xff, + 0xff, + 0x6, + 0x7, + 0x8, + 0x9, + 0xa, + 0xb, + 0xc, + 0xd, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x6, + 0x7, + 0x8, + 0x9, + 0xa, + 0xb, + 0xc, + 0xd, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x2, + 0x3, + 0x6, + 0x7, + 0x8, + 0x9, + 0xa, + 0xb, + 0xc, + 0xd, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x2, + 0x3, + 0x6, + 0x7, + 0x8, + 0x9, + 0xa, + 0xb, + 0xc, + 0xd, + 0xff, + 0xff, + 0xff, + 0xff, + 0x4, + 0x5, + 0x6, + 0x7, + 0x8, + 0x9, + 0xa, + 0xb, + 0xc, + 0xd, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x4, + 0x5, + 0x6, + 0x7, + 0x8, + 0x9, + 0xa, + 0xb, + 0xc, + 0xd, + 0xff, + 0xff, + 0xff, + 0xff, + 0x2, + 0x3, + 0x4, + 0x5, + 0x6, + 0x7, + 0x8, + 0x9, + 0xa, + 0xb, + 0xc, + 0xd, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x2, + 0x3, + 0x4, + 0x5, + 0x6, + 0x7, + 0x8, + 0x9, + 0xa, + 0xb, + 0xc, + 0xd, + 0xff, + 0xff, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x2, + 0x3, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x2, + 0x3, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x4, + 0x5, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x4, + 0x5, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x2, + 0x3, + 0x4, + 0x5, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x2, + 0x3, + 0x4, + 0x5, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x6, + 0x7, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x6, + 0x7, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x2, + 0x3, + 0x6, + 0x7, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x2, + 0x3, + 0x6, + 0x7, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x4, + 0x5, + 0x6, + 0x7, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x4, + 0x5, + 0x6, + 0x7, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x2, + 0x3, + 0x4, + 0x5, + 0x6, + 0x7, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x2, + 0x3, + 0x4, + 0x5, + 0x6, + 0x7, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x8, + 0x9, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x8, + 0x9, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 
0xff, + 0xff, + 0xff, + 0xff, + 0x2, + 0x3, + 0x8, + 0x9, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x2, + 0x3, + 0x8, + 0x9, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x4, + 0x5, + 0x8, + 0x9, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x4, + 0x5, + 0x8, + 0x9, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x2, + 0x3, + 0x4, + 0x5, + 0x8, + 0x9, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x2, + 0x3, + 0x4, + 0x5, + 0x8, + 0x9, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x6, + 0x7, + 0x8, + 0x9, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x6, + 0x7, + 0x8, + 0x9, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x2, + 0x3, + 0x6, + 0x7, + 0x8, + 0x9, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x2, + 0x3, + 0x6, + 0x7, + 0x8, + 0x9, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x4, + 0x5, + 0x6, + 0x7, + 0x8, + 0x9, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x4, + 0x5, + 0x6, + 0x7, + 0x8, + 0x9, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x2, + 0x3, + 0x4, + 0x5, + 0x6, + 0x7, + 0x8, + 0x9, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x2, + 0x3, + 0x4, + 0x5, + 0x6, + 0x7, + 0x8, + 0x9, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xa, + 0xb, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0xa, + 0xb, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x2, + 0x3, + 0xa, + 0xb, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x2, + 0x3, + 0xa, + 0xb, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x4, + 0x5, + 0xa, + 0xb, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x4, + 0x5, + 0xa, + 0xb, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x2, + 0x3, + 0x4, + 0x5, + 0xa, + 0xb, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x2, + 0x3, + 0x4, + 0x5, + 0xa, + 0xb, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x6, + 0x7, + 0xa, + 0xb, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x6, + 0x7, + 0xa, + 0xb, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x2, + 0x3, + 0x6, + 0x7, + 0xa, + 0xb, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x2, + 0x3, + 0x6, + 0x7, + 0xa, + 0xb, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x4, + 0x5, + 0x6, + 0x7, + 0xa, + 0xb, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x4, + 0x5, + 0x6, + 0x7, + 0xa, + 0xb, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x2, + 0x3, + 0x4, + 0x5, + 0x6, + 0x7, + 0xa, + 0xb, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x2, + 0x3, + 0x4, + 0x5, + 
0x6, + 0x7, + 0xa, + 0xb, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0x8, + 0x9, + 0xa, + 0xb, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x8, + 0x9, + 0xa, + 0xb, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x2, + 0x3, + 0x8, + 0x9, + 0xa, + 0xb, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x2, + 0x3, + 0x8, + 0x9, + 0xa, + 0xb, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x4, + 0x5, + 0x8, + 0x9, + 0xa, + 0xb, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x4, + 0x5, + 0x8, + 0x9, + 0xa, + 0xb, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x2, + 0x3, + 0x4, + 0x5, + 0x8, + 0x9, + 0xa, + 0xb, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x2, + 0x3, + 0x4, + 0x5, + 0x8, + 0x9, + 0xa, + 0xb, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0x6, + 0x7, + 0x8, + 0x9, + 0xa, + 0xb, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x6, + 0x7, + 0x8, + 0x9, + 0xa, + 0xb, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x2, + 0x3, + 0x6, + 0x7, + 0x8, + 0x9, + 0xa, + 0xb, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x2, + 0x3, + 0x6, + 0x7, + 0x8, + 0x9, + 0xa, + 0xb, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0x4, + 0x5, + 0x6, + 0x7, + 0x8, + 0x9, + 0xa, + 0xb, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x4, + 0x5, + 0x6, + 0x7, + 0x8, + 0x9, + 0xa, + 0xb, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0x2, + 0x3, + 0x4, + 0x5, + 0x6, + 0x7, + 0x8, + 0x9, + 0xa, + 0xb, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x2, + 0x3, + 0x4, + 0x5, + 0x6, + 0x7, + 0x8, + 0x9, + 0xa, + 0xb, + 0xe, + 0xf, + 0xff, + 0xff, + 0xc, + 0xd, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0xc, + 0xd, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x2, + 0x3, + 0xc, + 0xd, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x2, + 0x3, + 0xc, + 0xd, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x4, + 0x5, + 0xc, + 0xd, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x4, + 0x5, + 0xc, + 0xd, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x2, + 0x3, + 0x4, + 0x5, + 0xc, + 0xd, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x2, + 0x3, + 0x4, + 0x5, + 0xc, + 0xd, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x6, + 0x7, + 0xc, + 0xd, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x6, + 0x7, + 0xc, + 0xd, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x2, + 0x3, + 0x6, + 0x7, + 0xc, + 0xd, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x2, + 0x3, + 0x6, + 0x7, + 0xc, + 0xd, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x4, + 0x5, + 0x6, + 0x7, + 0xc, + 0xd, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x4, + 
0x5, + 0x6, + 0x7, + 0xc, + 0xd, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x2, + 0x3, + 0x4, + 0x5, + 0x6, + 0x7, + 0xc, + 0xd, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x2, + 0x3, + 0x4, + 0x5, + 0x6, + 0x7, + 0xc, + 0xd, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0x8, + 0x9, + 0xc, + 0xd, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x8, + 0x9, + 0xc, + 0xd, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x2, + 0x3, + 0x8, + 0x9, + 0xc, + 0xd, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x2, + 0x3, + 0x8, + 0x9, + 0xc, + 0xd, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x4, + 0x5, + 0x8, + 0x9, + 0xc, + 0xd, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x4, + 0x5, + 0x8, + 0x9, + 0xc, + 0xd, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x2, + 0x3, + 0x4, + 0x5, + 0x8, + 0x9, + 0xc, + 0xd, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x2, + 0x3, + 0x4, + 0x5, + 0x8, + 0x9, + 0xc, + 0xd, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0x6, + 0x7, + 0x8, + 0x9, + 0xc, + 0xd, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x6, + 0x7, + 0x8, + 0x9, + 0xc, + 0xd, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x2, + 0x3, + 0x6, + 0x7, + 0x8, + 0x9, + 0xc, + 0xd, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x2, + 0x3, + 0x6, + 0x7, + 0x8, + 0x9, + 0xc, + 0xd, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0x4, + 0x5, + 0x6, + 0x7, + 0x8, + 0x9, + 0xc, + 0xd, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x4, + 0x5, + 0x6, + 0x7, + 0x8, + 0x9, + 0xc, + 0xd, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0x2, + 0x3, + 0x4, + 0x5, + 0x6, + 0x7, + 0x8, + 0x9, + 0xc, + 0xd, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x2, + 0x3, + 0x4, + 0x5, + 0x6, + 0x7, + 0x8, + 0x9, + 0xc, + 0xd, + 0xe, + 0xf, + 0xff, + 0xff, + 0xa, + 0xb, + 0xc, + 0xd, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0xa, + 0xb, + 0xc, + 0xd, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x2, + 0x3, + 0xa, + 0xb, + 0xc, + 0xd, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x2, + 0x3, + 0xa, + 0xb, + 0xc, + 0xd, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x4, + 0x5, + 0xa, + 0xb, + 0xc, + 0xd, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x4, + 0x5, + 0xa, + 0xb, + 0xc, + 0xd, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x2, + 0x3, + 0x4, + 0x5, + 0xa, + 0xb, + 0xc, + 0xd, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x2, + 0x3, + 0x4, + 0x5, + 0xa, + 0xb, + 0xc, + 0xd, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0x6, + 0x7, + 0xa, + 0xb, + 0xc, + 0xd, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x6, + 0x7, + 0xa, + 0xb, + 0xc, + 0xd, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x2, + 0x3, + 0x6, + 0x7, + 0xa, + 0xb, + 0xc, + 0xd, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x2, + 0x3, + 
0x6, + 0x7, + 0xa, + 0xb, + 0xc, + 0xd, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0x4, + 0x5, + 0x6, + 0x7, + 0xa, + 0xb, + 0xc, + 0xd, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x4, + 0x5, + 0x6, + 0x7, + 0xa, + 0xb, + 0xc, + 0xd, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0x2, + 0x3, + 0x4, + 0x5, + 0x6, + 0x7, + 0xa, + 0xb, + 0xc, + 0xd, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x2, + 0x3, + 0x4, + 0x5, + 0x6, + 0x7, + 0xa, + 0xb, + 0xc, + 0xd, + 0xe, + 0xf, + 0xff, + 0xff, + 0x8, + 0x9, + 0xa, + 0xb, + 0xc, + 0xd, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x8, + 0x9, + 0xa, + 0xb, + 0xc, + 0xd, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x2, + 0x3, + 0x8, + 0x9, + 0xa, + 0xb, + 0xc, + 0xd, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x2, + 0x3, + 0x8, + 0x9, + 0xa, + 0xb, + 0xc, + 0xd, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0x4, + 0x5, + 0x8, + 0x9, + 0xa, + 0xb, + 0xc, + 0xd, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x4, + 0x5, + 0x8, + 0x9, + 0xa, + 0xb, + 0xc, + 0xd, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0x2, + 0x3, + 0x4, + 0x5, + 0x8, + 0x9, + 0xa, + 0xb, + 0xc, + 0xd, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x2, + 0x3, + 0x4, + 0x5, + 0x8, + 0x9, + 0xa, + 0xb, + 0xc, + 0xd, + 0xe, + 0xf, + 0xff, + 0xff, + 0x6, + 0x7, + 0x8, + 0x9, + 0xa, + 0xb, + 0xc, + 0xd, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x6, + 0x7, + 0x8, + 0x9, + 0xa, + 0xb, + 0xc, + 0xd, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0x2, + 0x3, + 0x6, + 0x7, + 0x8, + 0x9, + 0xa, + 0xb, + 0xc, + 0xd, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x2, + 0x3, + 0x6, + 0x7, + 0x8, + 0x9, + 0xa, + 0xb, + 0xc, + 0xd, + 0xe, + 0xf, + 0xff, + 0xff, + 0x4, + 0x5, + 0x6, + 0x7, + 0x8, + 0x9, + 0xa, + 0xb, + 0xc, + 0xd, + 0xe, + 0xf, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0, + 0x1, + 0x4, + 0x5, + 0x6, + 0x7, + 0x8, + 0x9, + 0xa, + 0xb, + 0xc, + 0xd, + 0xe, + 0xf, + 0xff, + 0xff, + 0x2, + 0x3, + 0x4, + 0x5, + 0x6, + 0x7, + 0x8, + 0x9, + 0xa, + 0xb, + 0xc, + 0xd, + 0xe, + 0xf, + 0xff, + 0xff, + 0x0, + 0x1, + 0x2, + 0x3, + 0x4, + 0x5, + 0x6, + 0x7, + 0x8, + 0x9, + 0xa, + 0xb, + 0xc, + 0xd, + 0xe, + 0xf, +}; + +unsigned int kyber_rej_uniform_neon(sword16* p, unsigned int len, const byte* r, unsigned int rLen) +{ + __asm__ __volatile__ ( +#ifndef __APPLE__ + "adrp x4, %[L_kyber_aarch64_rej_uniform_neon_mask]\n\t" + "add x4, x4, :lo12:%[L_kyber_aarch64_rej_uniform_neon_mask]\n\t" +#else + "adrp x4, %[L_kyber_aarch64_rej_uniform_neon_mask]@PAGE\n\t" + "add x4, x4, %[L_kyber_aarch64_rej_uniform_neon_mask]@PAGEOFF\n\t" +#endif /* __APPLE__ */ +#ifndef __APPLE__ + "adrp x5, %[L_kyber_aarch64_q]\n\t" + "add x5, x5, :lo12:%[L_kyber_aarch64_q]\n\t" +#else + "adrp x5, %[L_kyber_aarch64_q]@PAGE\n\t" + "add x5, x5, %[L_kyber_aarch64_q]@PAGEOFF\n\t" +#endif /* __APPLE__ */ +#ifndef __APPLE__ + "adrp x6, %[L_kyber_aarch64_rej_uniform_neon_bits]\n\t" + "add x6, x6, :lo12:%[L_kyber_aarch64_rej_uniform_neon_bits]\n\t" +#else + "adrp x6, %[L_kyber_aarch64_rej_uniform_neon_bits]@PAGE\n\t" + "add x6, x6, %[L_kyber_aarch64_rej_uniform_neon_bits]@PAGEOFF\n\t" +#endif /* __APPLE__ */ +#ifndef __APPLE__ + "adrp x7, %[L_kyber_aarch64_rej_uniform_neon_indeces]\n\t" + "add x7, x7, :lo12:%[L_kyber_aarch64_rej_uniform_neon_indeces]\n\t" +#else + "adrp x7, 
%[L_kyber_aarch64_rej_uniform_neon_indeces]@PAGE\n\t" + "add x7, x7, %[L_kyber_aarch64_rej_uniform_neon_indeces]@PAGEOFF\n\t" +#endif /* __APPLE__ */ + "eor v1.16b, v1.16b, v1.16b\n\t" + "eor v12.16b, v12.16b, v12.16b\n\t" + "eor v13.16b, v13.16b, v13.16b\n\t" + "eor x12, x12, x12\n\t" + "eor v10.16b, v10.16b, v10.16b\n\t" + "eor v11.16b, v11.16b, v11.16b\n\t" + "mov x13, #0xd01\n\t" + "ldr q0, [x4]\n\t" + "ldr q3, [x5]\n\t" + "ldr q2, [x6]\n\t" + "subs wzr, %w[len], #0\n\t" + "beq L_kyber_aarch64_rej_uniform_neon_done_%=\n\t" + "subs wzr, %w[len], #16\n\t" + "blt L_kyber_aarch64_rej_uniform_neon_loop_4_%=\n\t" + "\n" + "L_kyber_aarch64_rej_uniform_neon_loop_16_%=: \n\t" + "ld3 {v4.8b, v5.8b, v6.8b}, [%x[r]], #24\n\t" + "zip1 v4.16b, v4.16b, v1.16b\n\t" + "zip1 v5.16b, v5.16b, v1.16b\n\t" + "zip1 v6.16b, v6.16b, v1.16b\n\t" + "shl v7.8h, v5.8h, #8\n\t" + "ushr v8.8h, v5.8h, #4\n\t" + "shl v6.8h, v6.8h, #4\n\t" + "orr v4.16b, v4.16b, v7.16b\n\t" + "orr v5.16b, v8.16b, v6.16b\n\t" + "and v7.16b, v4.16b, v0.16b\n\t" + "and v8.16b, v5.16b, v0.16b\n\t" + "zip1 v4.8h, v7.8h, v8.8h\n\t" + "zip2 v5.8h, v7.8h, v8.8h\n\t" + "cmgt v7.8h, v3.8h, v4.8h\n\t" + "cmgt v8.8h, v3.8h, v5.8h\n\t" + "ushr v12.8h, v7.8h, #15\n\t" + "ushr v13.8h, v8.8h, #15\n\t" + "addv h12, v12.8h\n\t" + "addv h13, v13.8h\n\t" + "mov x10, v12.d[0]\n\t" + "mov x11, v13.d[0]\n\t" + "and v10.16b, v7.16b, v2.16b\n\t" + "and v11.16b, v8.16b, v2.16b\n\t" + "addv h10, v10.8h\n\t" + "addv h11, v11.8h\n\t" + "mov w8, v10.s[0]\n\t" + "mov w9, v11.s[0]\n\t" + "lsl w8, w8, #4\n\t" + "lsl w9, w9, #4\n\t" + "ldr q10, [x7, x8]\n\t" + "ldr q11, [x7, x9]\n\t" + "tbl v7.16b, {v4.16b}, v10.16b\n\t" + "tbl v8.16b, {v5.16b}, v11.16b\n\t" + "str q7, [%x[p]]\n\t" + "add %x[p], %x[p], x10, lsl 1\n\t" + "add x12, x12, x10\n\t" + "str q8, [%x[p]]\n\t" + "add %x[p], %x[p], x11, lsl 1\n\t" + "add x12, x12, x11\n\t" + "subs %w[rLen], %w[rLen], #24\n\t" + "beq L_kyber_aarch64_rej_uniform_neon_done_%=\n\t" + "sub w10, %w[len], w12\n\t" + "subs x10, x10, #16\n\t" + "blt L_kyber_aarch64_rej_uniform_neon_loop_4_%=\n\t" + "b L_kyber_aarch64_rej_uniform_neon_loop_16_%=\n\t" + "\n" + "L_kyber_aarch64_rej_uniform_neon_loop_4_%=: \n\t" + "subs w10, %w[len], w12\n\t" + "beq L_kyber_aarch64_rej_uniform_neon_done_%=\n\t" + "subs x10, x10, #4\n\t" + "blt L_kyber_aarch64_rej_uniform_neon_loop_lt_4_%=\n\t" + "ldr x4, [%x[r]], #6\n\t" + "lsr x5, x4, #12\n\t" + "lsr x6, x4, #24\n\t" + "lsr x7, x4, #36\n\t" + "and x4, x4, #0xfff\n\t" + "and x5, x5, #0xfff\n\t" + "and x6, x6, #0xfff\n\t" + "and x7, x7, #0xfff\n\t" + "strh w4, [%x[p]]\n\t" + "subs xzr, x4, x13\n\t" + "cinc %x[p], %x[p], lt\n\t" + "cinc %x[p], %x[p], lt\n\t" + "cinc x12, x12, lt\n\t" + "strh w5, [%x[p]]\n\t" + "subs xzr, x5, x13\n\t" + "cinc %x[p], %x[p], lt\n\t" + "cinc %x[p], %x[p], lt\n\t" + "cinc x12, x12, lt\n\t" + "strh w6, [%x[p]]\n\t" + "subs xzr, x6, x13\n\t" + "cinc %x[p], %x[p], lt\n\t" + "cinc %x[p], %x[p], lt\n\t" + "cinc x12, x12, lt\n\t" + "strh w7, [%x[p]]\n\t" + "subs xzr, x7, x13\n\t" + "cinc %x[p], %x[p], lt\n\t" + "cinc %x[p], %x[p], lt\n\t" + "cinc x12, x12, lt\n\t" + "subs %w[rLen], %w[rLen], #6\n\t" + "beq L_kyber_aarch64_rej_uniform_neon_done_%=\n\t" + "b L_kyber_aarch64_rej_uniform_neon_loop_4_%=\n\t" + "\n" + "L_kyber_aarch64_rej_uniform_neon_loop_lt_4_%=: \n\t" + "ldr x4, [%x[r]], #6\n\t" + "lsr x5, x4, #12\n\t" + "lsr x6, x4, #24\n\t" + "lsr x7, x4, #36\n\t" + "and x4, x4, #0xfff\n\t" + "and x5, x5, #0xfff\n\t" + "and x6, x6, #0xfff\n\t" + "and x7, x7, #0xfff\n\t" + "strh w4, 
[%x[p]]\n\t" + "subs xzr, x4, x13\n\t" + "cinc %x[p], %x[p], lt\n\t" + "cinc %x[p], %x[p], lt\n\t" + "cinc x12, x12, lt\n\t" + "subs wzr, %w[len], w12\n\t" + "beq L_kyber_aarch64_rej_uniform_neon_done_%=\n\t" + "strh w5, [%x[p]]\n\t" + "subs xzr, x5, x13\n\t" + "cinc %x[p], %x[p], lt\n\t" + "cinc %x[p], %x[p], lt\n\t" + "cinc x12, x12, lt\n\t" + "subs wzr, %w[len], w12\n\t" + "beq L_kyber_aarch64_rej_uniform_neon_done_%=\n\t" + "strh w6, [%x[p]]\n\t" + "subs xzr, x6, x13\n\t" + "cinc %x[p], %x[p], lt\n\t" + "cinc %x[p], %x[p], lt\n\t" + "cinc x12, x12, lt\n\t" + "subs wzr, %w[len], w12\n\t" + "beq L_kyber_aarch64_rej_uniform_neon_done_%=\n\t" + "strh w7, [%x[p]]\n\t" + "subs xzr, x7, x13\n\t" + "cinc %x[p], %x[p], lt\n\t" + "cinc %x[p], %x[p], lt\n\t" + "cinc x12, x12, lt\n\t" + "subs wzr, %w[len], w12\n\t" + "beq L_kyber_aarch64_rej_uniform_neon_done_%=\n\t" + "subs %w[rLen], %w[rLen], #6\n\t" + "beq L_kyber_aarch64_rej_uniform_neon_done_%=\n\t" + "b L_kyber_aarch64_rej_uniform_neon_loop_lt_4_%=\n\t" + "\n" + "L_kyber_aarch64_rej_uniform_neon_done_%=: \n\t" + "mov x0, x12\n\t" + : [p] "+r" (p), [len] "+r" (len), [r] "+r" (r), [rLen] "+r" (rLen) + : [L_kyber_aarch64_q] "S" (L_kyber_aarch64_q), [L_kyber_aarch64_consts] "S" (L_kyber_aarch64_consts), [L_sha3_aarch64_r] "S" (L_sha3_aarch64_r), [L_kyber_aarch64_zetas] "S" (L_kyber_aarch64_zetas), [L_kyber_aarch64_zetas_qinv] "S" (L_kyber_aarch64_zetas_qinv), [L_kyber_aarch64_zetas_inv] "S" (L_kyber_aarch64_zetas_inv), [L_kyber_aarch64_zetas_inv_qinv] "S" (L_kyber_aarch64_zetas_inv_qinv), [L_kyber_aarch64_zetas_mul] "S" (L_kyber_aarch64_zetas_mul), [L_kyber_aarch64_to_msg_neon_low] "S" (L_kyber_aarch64_to_msg_neon_low), [L_kyber_aarch64_to_msg_neon_high] "S" (L_kyber_aarch64_to_msg_neon_high), [L_kyber_aarch64_to_msg_neon_bits] "S" (L_kyber_aarch64_to_msg_neon_bits), [L_kyber_aarch64_from_msg_neon_q1half] "S" (L_kyber_aarch64_from_msg_neon_q1half), [L_kyber_aarch64_from_msg_neon_bits] "S" (L_kyber_aarch64_from_msg_neon_bits), [L_kyber_aarch64_rej_uniform_neon_mask] "S" (L_kyber_aarch64_rej_uniform_neon_mask), [L_kyber_aarch64_rej_uniform_neon_bits] "S" (L_kyber_aarch64_rej_uniform_neon_bits), [L_kyber_aarch64_rej_uniform_neon_indeces] "S" (L_kyber_aarch64_rej_uniform_neon_indeces) + : "memory", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "cc" + ); + return (uint32_t)(size_t)p; +} + +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 +void kyber_sha3_blocksx3_neon(word64* state) +{ + __asm__ __volatile__ ( + "stp x29, x30, [sp, #-64]!\n\t" + "add x29, sp, #0\n\t" +#ifndef __APPLE__ + "adrp x27, %[L_sha3_aarch64_r]\n\t" + "add x27, x27, :lo12:%[L_sha3_aarch64_r]\n\t" +#else + "adrp x27, %[L_sha3_aarch64_r]@PAGE\n\t" + "add x27, x27, %[L_sha3_aarch64_r]@PAGEOFF\n\t" +#endif /* __APPLE__ */ + "str %x[state], [x29, #40]\n\t" + "ld4 {v0.d, v1.d, v2.d, v3.d}[0], [%x[state]], #32\n\t" + "ld4 {v4.d, v5.d, v6.d, v7.d}[0], [%x[state]], #32\n\t" + "ld4 {v8.d, v9.d, v10.d, v11.d}[0], [%x[state]], #32\n\t" + "ld4 {v12.d, v13.d, v14.d, v15.d}[0], [%x[state]], #32\n\t" + "ld4 {v16.d, v17.d, v18.d, v19.d}[0], [%x[state]], #32\n\t" + "ld4 {v20.d, v21.d, v22.d, v23.d}[0], [%x[state]], #32\n\t" + "ld1 {v24.d}[0], [%x[state]]\n\t" + "add %x[state], %x[state], #8\n\t" + "ld4 {v0.d, v1.d, v2.d, v3.d}[1], [%x[state]], #32\n\t" + "ld4 {v4.d, v5.d, v6.d, v7.d}[1], [%x[state]], #32\n\t" + "ld4 {v8.d, v9.d, v10.d, v11.d}[1], [%x[state]], #32\n\t" + "ld4 {v12.d, v13.d, v14.d, 
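/* For reference: kyber_rej_uniform_neon above performs Kyber's rejection
 * sampling: every 3 bytes of hash output unpack into two 12-bit
 * candidates, and a candidate is accepted only when it is less than
 * q = 3329 (0xd01, loaded into x13). The vector path screens 16
 * candidates at once, turning the cmgt lane masks into a bit pattern that
 * indexes the 4096-entry shuffle table so tbl compacts the accepted lanes
 * into the output; the scalar tails then top up the last few coefficients
 * branch-free with cinc. A minimal scalar sketch of the same sampling;
 * kyber_rej_uniform_ref is illustrative only, not part of this patch. */
static unsigned int kyber_rej_uniform_ref(sword16* p, unsigned int len,
    const byte* r, unsigned int rLen)
{
    unsigned int i = 0;
    unsigned int j;
    for (j = 0; (i < len) && (j + 3 <= rLen); j += 3) {
        /* Unpack two 12-bit values from three bytes. */
        unsigned int v0 = (unsigned int)r[j] |
            (((unsigned int)r[j + 1] & 0x0f) << 8);
        unsigned int v1 = ((unsigned int)r[j + 1] >> 4) |
            ((unsigned int)r[j + 2] << 4);
        if (v0 < 3329)
            p[i++] = (sword16)v0;
        if ((i < len) && (v1 < 3329))
            p[i++] = (sword16)v1;
    }
    return i;
}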
v15.d}[1], [%x[state]], #32\n\t" + "ld4 {v16.d, v17.d, v18.d, v19.d}[1], [%x[state]], #32\n\t" + "ld4 {v20.d, v21.d, v22.d, v23.d}[1], [%x[state]], #32\n\t" + "ld1 {v24.d}[1], [%x[state]]\n\t" + "add %x[state], %x[state], #8\n\t" + "ldp x1, x2, [%x[state]]\n\t" + "ldp x3, x4, [%x[state], #16]\n\t" + "ldp x5, x6, [%x[state], #32]\n\t" + "ldp x7, x8, [%x[state], #48]\n\t" + "ldp x9, x10, [%x[state], #64]\n\t" + "ldp x11, x12, [%x[state], #80]\n\t" + "ldp x13, x14, [%x[state], #96]\n\t" + "ldp x15, x16, [%x[state], #112]\n\t" + "ldp x17, x19, [%x[state], #128]\n\t" + "ldp x20, x21, [%x[state], #144]\n\t" + "ldp x22, x23, [%x[state], #160]\n\t" + "ldp x24, x25, [%x[state], #176]\n\t" + "ldr x26, [%x[state], #192]\n\t" + "mov x28, #24\n\t" + /* Start of 24 rounds */ + "\n" + "L_SHA3_transform_blocksx3_neon_begin_%=: \n\t" + "stp x27, x28, [x29, #48]\n\t" + /* Col Mix */ + "eor3 v31.16b, v0.16b, v5.16b, v10.16b\n\t" + "eor %x[state], x5, x10\n\t" + "eor3 v27.16b, v1.16b, v6.16b, v11.16b\n\t" + "eor x30, x1, x6\n\t" + "eor3 v28.16b, v2.16b, v7.16b, v12.16b\n\t" + "eor x28, x3, x8\n\t" + "eor3 v29.16b, v3.16b, v8.16b, v13.16b\n\t" + "eor %x[state], %x[state], x15\n\t" + "eor3 v30.16b, v4.16b, v9.16b, v14.16b\n\t" + "eor x30, x30, x11\n\t" + "eor3 v31.16b, v31.16b, v15.16b, v20.16b\n\t" + "eor x28, x28, x13\n\t" + "eor3 v27.16b, v27.16b, v16.16b, v21.16b\n\t" + "eor %x[state], %x[state], x21\n\t" + "eor3 v28.16b, v28.16b, v17.16b, v22.16b\n\t" + "eor x30, x30, x16\n\t" + "eor3 v29.16b, v29.16b, v18.16b, v23.16b\n\t" + "eor x28, x28, x19\n\t" + "eor3 v30.16b, v30.16b, v19.16b, v24.16b\n\t" + "eor %x[state], %x[state], x26\n\t" + "rax1 v25.2d, v30.2d, v27.2d\n\t" + "eor x30, x30, x22\n\t" + "rax1 v26.2d, v31.2d, v28.2d\n\t" + "eor x28, x28, x24\n\t" + "rax1 v27.2d, v27.2d, v29.2d\n\t" + "str %x[state], [x29, #32]\n\t" + "rax1 v28.2d, v28.2d, v30.2d\n\t" + "str x28, [x29, #24]\n\t" + "rax1 v29.2d, v29.2d, v31.2d\n\t" + "eor x27, x2, x7\n\t" + "eor v0.16b, v0.16b, v25.16b\n\t" + "xar v30.2d, v1.2d, v26.2d, #63\n\t" + "eor x28, x4, x9\n\t" + "xar v1.2d, v6.2d, v26.2d, #20\n\t" + "eor x27, x27, x12\n\t" + "xar v6.2d, v9.2d, v29.2d, #44\n\t" + "eor x28, x28, x14\n\t" + "xar v9.2d, v22.2d, v27.2d, #3\n\t" + "eor x27, x27, x17\n\t" + "xar v22.2d, v14.2d, v29.2d, #25\n\t" + "eor x28, x28, x20\n\t" + "xar v14.2d, v20.2d, v25.2d, #46\n\t" + "eor x27, x27, x23\n\t" + "xar v20.2d, v2.2d, v27.2d, #2\n\t" + "eor x28, x28, x25\n\t" + "xar v2.2d, v12.2d, v27.2d, #21\n\t" + "eor %x[state], %x[state], x27, ror 63\n\t" + "xar v12.2d, v13.2d, v28.2d, #39\n\t" + "eor x27, x27, x28, ror 63\n\t" + "xar v13.2d, v19.2d, v29.2d, #56\n\t" + "eor x1, x1, %x[state]\n\t" + "xar v19.2d, v23.2d, v28.2d, #8\n\t" + "eor x6, x6, %x[state]\n\t" + "xar v23.2d, v15.2d, v25.2d, #23\n\t" + "eor x11, x11, %x[state]\n\t" + "xar v15.2d, v4.2d, v29.2d, #37\n\t" + "eor x16, x16, %x[state]\n\t" + "xar v4.2d, v24.2d, v29.2d, #50\n\t" + "eor x22, x22, %x[state]\n\t" + "xar v24.2d, v21.2d, v26.2d, #62\n\t" + "eor x3, x3, x27\n\t" + "xar v21.2d, v8.2d, v28.2d, #9\n\t" + "eor x8, x8, x27\n\t" + "xar v8.2d, v16.2d, v26.2d, #19\n\t" + "eor x13, x13, x27\n\t" + "xar v16.2d, v5.2d, v25.2d, #28\n\t" + "eor x19, x19, x27\n\t" + "xar v5.2d, v3.2d, v28.2d, #36\n\t" + "eor x24, x24, x27\n\t" + "xar v3.2d, v18.2d, v28.2d, #43\n\t" + "ldr %x[state], [x29, #32]\n\t" + "xar v18.2d, v17.2d, v27.2d, #49\n\t" + "ldr x27, [x29, #24]\n\t" + "xar v17.2d, v11.2d, v26.2d, #54\n\t" + "eor x28, x28, x30, ror 63\n\t" + "xar v11.2d, v7.2d, v27.2d, #58\n\t" + "eor x30, x30, 
x27, ror 63\n\t" + "xar v7.2d, v10.2d, v25.2d, #61\n\t" + "eor x27, x27, %x[state], ror 63\n\t" + /* Row Mix */ + "mov v25.16b, v0.16b\n\t" + "eor x5, x5, x28\n\t" + "mov v26.16b, v1.16b\n\t" + "eor x10, x10, x28\n\t" + "bcax v0.16b, v25.16b, v2.16b, v26.16b\n\t" + "eor x15, x15, x28\n\t" + "bcax v1.16b, v26.16b, v3.16b, v2.16b\n\t" + "eor x21, x21, x28\n\t" + "bcax v2.16b, v2.16b, v4.16b, v3.16b\n\t" + "eor x26, x26, x28\n\t" + "bcax v3.16b, v3.16b, v25.16b, v4.16b\n\t" + "eor x2, x2, x30\n\t" + "bcax v4.16b, v4.16b, v26.16b, v25.16b\n\t" + "eor x7, x7, x30\n\t" + "mov v25.16b, v5.16b\n\t" + "eor x12, x12, x30\n\t" + "mov v26.16b, v6.16b\n\t" + "eor x17, x17, x30\n\t" + "bcax v5.16b, v25.16b, v7.16b, v26.16b\n\t" + "eor x23, x23, x30\n\t" + "bcax v6.16b, v26.16b, v8.16b, v7.16b\n\t" + "eor x4, x4, x27\n\t" + "bcax v7.16b, v7.16b, v9.16b, v8.16b\n\t" + "eor x9, x9, x27\n\t" + "bcax v8.16b, v8.16b, v25.16b, v9.16b\n\t" + "eor x14, x14, x27\n\t" + "bcax v9.16b, v9.16b, v26.16b, v25.16b\n\t" + "eor x20, x20, x27\n\t" + "mov v26.16b, v11.16b\n\t" + "eor x25, x25, x27\n\t" + /* Swap Rotate Base */ + "bcax v10.16b, v30.16b, v12.16b, v26.16b\n\t" + "ror %x[state], x2, #63\n\t" + "bcax v11.16b, v26.16b, v13.16b, v12.16b\n\t" + "ror x2, x7, #20\n\t" + "bcax v12.16b, v12.16b, v14.16b, v13.16b\n\t" + "ror x7, x10, #44\n\t" + "bcax v13.16b, v13.16b, v30.16b, v14.16b\n\t" + "ror x10, x24, #3\n\t" + "bcax v14.16b, v14.16b, v26.16b, v30.16b\n\t" + "ror x24, x15, #25\n\t" + "mov v25.16b, v15.16b\n\t" + "ror x15, x22, #46\n\t" + "mov v26.16b, v16.16b\n\t" + "ror x22, x3, #2\n\t" + "bcax v15.16b, v25.16b, v17.16b, v26.16b\n\t" + "ror x3, x13, #21\n\t" + "bcax v16.16b, v26.16b, v18.16b, v17.16b\n\t" + "ror x13, x14, #39\n\t" + "bcax v17.16b, v17.16b, v19.16b, v18.16b\n\t" + "ror x14, x21, #56\n\t" + "bcax v18.16b, v18.16b, v25.16b, v19.16b\n\t" + "ror x21, x25, #8\n\t" + "bcax v19.16b, v19.16b, v26.16b, v25.16b\n\t" + "ror x25, x16, #23\n\t" + "mov v25.16b, v20.16b\n\t" + "ror x16, x5, #37\n\t" + "mov v26.16b, v21.16b\n\t" + "ror x5, x26, #50\n\t" + "bcax v20.16b, v25.16b, v22.16b, v26.16b\n\t" + "ror x26, x23, #62\n\t" + "bcax v21.16b, v26.16b, v23.16b, v22.16b\n\t" + "ror x23, x9, #9\n\t" + "bcax v22.16b, v22.16b, v24.16b, v23.16b\n\t" + "ror x9, x17, #19\n\t" + "bcax v23.16b, v23.16b, v25.16b, v24.16b\n\t" + "ror x17, x6, #28\n\t" + "bcax v24.16b, v24.16b, v26.16b, v25.16b\n\t" + "ror x6, x4, #36\n\t" + "ror x4, x20, #43\n\t" + "ror x20, x19, #49\n\t" + "ror x19, x12, #54\n\t" + "ror x12, x8, #58\n\t" + "ror x8, x11, #61\n\t" + /* Row Mix Base */ + "bic x11, x3, x2\n\t" + "bic x27, x4, x3\n\t" + "bic x28, x1, x5\n\t" + "bic x30, x2, x1\n\t" + "eor x1, x1, x11\n\t" + "eor x2, x2, x27\n\t" + "bic x11, x5, x4\n\t" + "eor x4, x4, x28\n\t" + "eor x3, x3, x11\n\t" + "eor x5, x5, x30\n\t" + "bic x11, x8, x7\n\t" + "bic x27, x9, x8\n\t" + "bic x28, x6, x10\n\t" + "bic x30, x7, x6\n\t" + "eor x6, x6, x11\n\t" + "eor x7, x7, x27\n\t" + "bic x11, x10, x9\n\t" + "eor x9, x9, x28\n\t" + "eor x8, x8, x11\n\t" + "eor x10, x10, x30\n\t" + "bic x11, x13, x12\n\t" + "bic x27, x14, x13\n\t" + "bic x28, %x[state], x15\n\t" + "bic x30, x12, %x[state]\n\t" + "eor x11, %x[state], x11\n\t" + "eor x12, x12, x27\n\t" + "bic %x[state], x15, x14\n\t" + "eor x14, x14, x28\n\t" + "eor x13, x13, %x[state]\n\t" + "eor x15, x15, x30\n\t" + "bic %x[state], x19, x17\n\t" + "bic x27, x20, x19\n\t" + "bic x28, x16, x21\n\t" + "bic x30, x17, x16\n\t" + "eor x16, x16, %x[state]\n\t" + "eor x17, x17, x27\n\t" + "bic %x[state], x21, x20\n\t" + 
"eor x20, x20, x28\n\t" + "eor x19, x19, %x[state]\n\t" + "eor x21, x21, x30\n\t" + "bic %x[state], x24, x23\n\t" + "bic x27, x25, x24\n\t" + "bic x28, x22, x26\n\t" + "bic x30, x23, x22\n\t" + "eor x22, x22, %x[state]\n\t" + "eor x23, x23, x27\n\t" + "bic %x[state], x26, x25\n\t" + "eor x25, x25, x28\n\t" + "eor x24, x24, %x[state]\n\t" + "eor x26, x26, x30\n\t" + /* Done tranforming */ + "ldp x27, x28, [x29, #48]\n\t" + "ldr %x[state], [x27], #8\n\t" + "subs x28, x28, #1\n\t" + "mov v30.d[0], %x[state]\n\t" + "mov v30.d[1], %x[state]\n\t" + "eor x1, x1, %x[state]\n\t" + "eor v0.16b, v0.16b, v30.16b\n\t" + "bne L_SHA3_transform_blocksx3_neon_begin_%=\n\t" + "ldr %x[state], [x29, #40]\n\t" + "st4 {v0.d, v1.d, v2.d, v3.d}[0], [%x[state]], #32\n\t" + "st4 {v4.d, v5.d, v6.d, v7.d}[0], [%x[state]], #32\n\t" + "st4 {v8.d, v9.d, v10.d, v11.d}[0], [%x[state]], #32\n\t" + "st4 {v12.d, v13.d, v14.d, v15.d}[0], [%x[state]], #32\n\t" + "st4 {v16.d, v17.d, v18.d, v19.d}[0], [%x[state]], #32\n\t" + "st4 {v20.d, v21.d, v22.d, v23.d}[0], [%x[state]], #32\n\t" + "st1 {v24.d}[0], [%x[state]]\n\t" + "add %x[state], %x[state], #8\n\t" + "st4 {v0.d, v1.d, v2.d, v3.d}[1], [%x[state]], #32\n\t" + "st4 {v4.d, v5.d, v6.d, v7.d}[1], [%x[state]], #32\n\t" + "st4 {v8.d, v9.d, v10.d, v11.d}[1], [%x[state]], #32\n\t" + "st4 {v12.d, v13.d, v14.d, v15.d}[1], [%x[state]], #32\n\t" + "st4 {v16.d, v17.d, v18.d, v19.d}[1], [%x[state]], #32\n\t" + "st4 {v20.d, v21.d, v22.d, v23.d}[1], [%x[state]], #32\n\t" + "st1 {v24.d}[1], [%x[state]]\n\t" + "add %x[state], %x[state], #8\n\t" + "stp x1, x2, [%x[state]]\n\t" + "stp x3, x4, [%x[state], #16]\n\t" + "stp x5, x6, [%x[state], #32]\n\t" + "stp x7, x8, [%x[state], #48]\n\t" + "stp x9, x10, [%x[state], #64]\n\t" + "stp x11, x12, [%x[state], #80]\n\t" + "stp x13, x14, [%x[state], #96]\n\t" + "stp x15, x16, [%x[state], #112]\n\t" + "stp x17, x19, [%x[state], #128]\n\t" + "stp x20, x21, [%x[state], #144]\n\t" + "stp x22, x23, [%x[state], #160]\n\t" + "stp x24, x25, [%x[state], #176]\n\t" + "str x26, [%x[state], #192]\n\t" + "ldp x29, x30, [sp], #0x40\n\t" + : [state] "+r" (state) + : [L_kyber_aarch64_q] "S" (L_kyber_aarch64_q), [L_kyber_aarch64_consts] "S" (L_kyber_aarch64_consts), [L_sha3_aarch64_r] "S" (L_sha3_aarch64_r), [L_kyber_aarch64_zetas] "S" (L_kyber_aarch64_zetas), [L_kyber_aarch64_zetas_qinv] "S" (L_kyber_aarch64_zetas_qinv), [L_kyber_aarch64_zetas_inv] "S" (L_kyber_aarch64_zetas_inv), [L_kyber_aarch64_zetas_inv_qinv] "S" (L_kyber_aarch64_zetas_inv_qinv), [L_kyber_aarch64_zetas_mul] "S" (L_kyber_aarch64_zetas_mul), [L_kyber_aarch64_to_msg_neon_low] "S" (L_kyber_aarch64_to_msg_neon_low), [L_kyber_aarch64_to_msg_neon_high] "S" (L_kyber_aarch64_to_msg_neon_high), [L_kyber_aarch64_to_msg_neon_bits] "S" (L_kyber_aarch64_to_msg_neon_bits), [L_kyber_aarch64_from_msg_neon_q1half] "S" (L_kyber_aarch64_from_msg_neon_q1half), [L_kyber_aarch64_from_msg_neon_bits] "S" (L_kyber_aarch64_from_msg_neon_bits), [L_kyber_aarch64_rej_uniform_neon_mask] "S" (L_kyber_aarch64_rej_uniform_neon_mask), [L_kyber_aarch64_rej_uniform_neon_bits] "S" (L_kyber_aarch64_rej_uniform_neon_bits), [L_kyber_aarch64_rej_uniform_neon_indeces] "S" (L_kyber_aarch64_rej_uniform_neon_indeces) + : "memory", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", 
"v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc" + ); +} + +void kyber_shake128_blocksx3_seed_neon(word64* state, byte* seed) +{ + __asm__ __volatile__ ( + "stp x29, x30, [sp, #-64]!\n\t" + "add x29, sp, #0\n\t" +#ifndef __APPLE__ + "adrp x28, %[L_sha3_aarch64_r]\n\t" + "add x28, x28, :lo12:%[L_sha3_aarch64_r]\n\t" +#else + "adrp x28, %[L_sha3_aarch64_r]@PAGE\n\t" + "add x28, x28, %[L_sha3_aarch64_r]@PAGEOFF\n\t" +#endif /* __APPLE__ */ + "str %x[state], [x29, #40]\n\t" + "add %x[state], %x[state], #32\n\t" + "ld1 {v4.d}[0], [%x[state]]\n\t" + "ldp x2, x3, [%x[seed]], #16\n\t" + "add %x[state], %x[state], #0xc8\n\t" + "ld1 {v4.d}[1], [%x[state]]\n\t" + "ldp x4, x5, [%x[seed]], #16\n\t" + "ldr x6, [%x[state], #200]\n\t" + "eor v5.16b, v5.16b, v5.16b\n\t" + "eor x7, x7, x7\n\t" + "eor v6.16b, v6.16b, v6.16b\n\t" + "eor x8, x8, x8\n\t" + "eor v7.16b, v7.16b, v7.16b\n\t" + "eor x9, x9, x9\n\t" + "eor v8.16b, v8.16b, v8.16b\n\t" + "eor x10, x10, x10\n\t" + "eor v9.16b, v9.16b, v9.16b\n\t" + "eor x11, x11, x11\n\t" + "eor v10.16b, v10.16b, v10.16b\n\t" + "eor x12, x12, x12\n\t" + "eor v11.16b, v11.16b, v11.16b\n\t" + "eor x13, x13, x13\n\t" + "eor v12.16b, v12.16b, v12.16b\n\t" + "eor x14, x14, x14\n\t" + "eor v13.16b, v13.16b, v13.16b\n\t" + "eor x15, x15, x15\n\t" + "eor v14.16b, v14.16b, v14.16b\n\t" + "eor x16, x16, x16\n\t" + "eor v15.16b, v15.16b, v15.16b\n\t" + "eor x17, x17, x17\n\t" + "eor v16.16b, v16.16b, v16.16b\n\t" + "eor x19, x19, x19\n\t" + "eor v17.16b, v17.16b, v17.16b\n\t" + "eor x20, x20, x20\n\t" + "eor v18.16b, v18.16b, v18.16b\n\t" + "eor x21, x21, x21\n\t" + "eor v19.16b, v19.16b, v19.16b\n\t" + "eor x22, x22, x22\n\t" + "movz x23, #0x8000, lsl 48\n\t" + "eor v21.16b, v21.16b, v21.16b\n\t" + "eor x24, x24, x24\n\t" + "eor v22.16b, v22.16b, v22.16b\n\t" + "eor x25, x25, x25\n\t" + "eor v23.16b, v23.16b, v23.16b\n\t" + "eor x26, x26, x26\n\t" + "eor v24.16b, v24.16b, v24.16b\n\t" + "eor x27, x27, x27\n\t" + "dup v0.2d, x2\n\t" + "dup v1.2d, x3\n\t" + "dup v2.2d, x4\n\t" + "dup v3.2d, x5\n\t" + "dup v20.2d, x23\n\t" + "mov %x[seed], #24\n\t" + /* Start of 24 rounds */ + "\n" + "L_SHA3_shake128_blocksx3_seed_neon_begin_%=: \n\t" + "stp x28, %x[seed], [x29, #48]\n\t" + /* Col Mix */ + "eor3 v31.16b, v0.16b, v5.16b, v10.16b\n\t" + "eor %x[state], x6, x11\n\t" + "eor3 v27.16b, v1.16b, v6.16b, v11.16b\n\t" + "eor x30, x2, x7\n\t" + "eor3 v28.16b, v2.16b, v7.16b, v12.16b\n\t" + "eor x28, x4, x9\n\t" + "eor3 v29.16b, v3.16b, v8.16b, v13.16b\n\t" + "eor %x[state], %x[state], x16\n\t" + "eor3 v30.16b, v4.16b, v9.16b, v14.16b\n\t" + "eor x30, x30, x12\n\t" + "eor3 v31.16b, v31.16b, v15.16b, v20.16b\n\t" + "eor x28, x28, x14\n\t" + "eor3 v27.16b, v27.16b, v16.16b, v21.16b\n\t" + "eor %x[state], %x[state], x22\n\t" + "eor3 v28.16b, v28.16b, v17.16b, v22.16b\n\t" + "eor x30, x30, x17\n\t" + "eor3 v29.16b, v29.16b, v18.16b, v23.16b\n\t" + "eor x28, x28, x20\n\t" + "eor3 v30.16b, v30.16b, v19.16b, v24.16b\n\t" + "eor %x[state], %x[state], x27\n\t" + "rax1 v25.2d, v30.2d, v27.2d\n\t" + "eor x30, x30, x23\n\t" + "rax1 v26.2d, v31.2d, v28.2d\n\t" + "eor x28, x28, x25\n\t" + "rax1 v27.2d, v27.2d, v29.2d\n\t" + "str %x[state], [x29, #32]\n\t" + "rax1 v28.2d, v28.2d, v30.2d\n\t" + "str x28, [x29, #24]\n\t" + "rax1 v29.2d, v29.2d, v31.2d\n\t" + "eor %x[seed], x3, x8\n\t" + "eor v0.16b, v0.16b, v25.16b\n\t" + "xar v30.2d, v1.2d, v26.2d, #63\n\t" + "eor x28, x5, x10\n\t" + "xar v1.2d, v6.2d, v26.2d, #20\n\t" + "eor %x[seed], %x[seed], x13\n\t" + "xar v6.2d, v9.2d, 
v29.2d, #44\n\t" + "eor x28, x28, x15\n\t" + "xar v9.2d, v22.2d, v27.2d, #3\n\t" + "eor %x[seed], %x[seed], x19\n\t" + "xar v22.2d, v14.2d, v29.2d, #25\n\t" + "eor x28, x28, x21\n\t" + "xar v14.2d, v20.2d, v25.2d, #46\n\t" + "eor %x[seed], %x[seed], x24\n\t" + "xar v20.2d, v2.2d, v27.2d, #2\n\t" + "eor x28, x28, x26\n\t" + "xar v2.2d, v12.2d, v27.2d, #21\n\t" + "eor %x[state], %x[state], %x[seed], ror 63\n\t" + "xar v12.2d, v13.2d, v28.2d, #39\n\t" + "eor %x[seed], %x[seed], x28, ror 63\n\t" + "xar v13.2d, v19.2d, v29.2d, #56\n\t" + "eor x2, x2, %x[state]\n\t" + "xar v19.2d, v23.2d, v28.2d, #8\n\t" + "eor x7, x7, %x[state]\n\t" + "xar v23.2d, v15.2d, v25.2d, #23\n\t" + "eor x12, x12, %x[state]\n\t" + "xar v15.2d, v4.2d, v29.2d, #37\n\t" + "eor x17, x17, %x[state]\n\t" + "xar v4.2d, v24.2d, v29.2d, #50\n\t" + "eor x23, x23, %x[state]\n\t" + "xar v24.2d, v21.2d, v26.2d, #62\n\t" + "eor x4, x4, %x[seed]\n\t" + "xar v21.2d, v8.2d, v28.2d, #9\n\t" + "eor x9, x9, %x[seed]\n\t" + "xar v8.2d, v16.2d, v26.2d, #19\n\t" + "eor x14, x14, %x[seed]\n\t" + "xar v16.2d, v5.2d, v25.2d, #28\n\t" + "eor x20, x20, %x[seed]\n\t" + "xar v5.2d, v3.2d, v28.2d, #36\n\t" + "eor x25, x25, %x[seed]\n\t" + "xar v3.2d, v18.2d, v28.2d, #43\n\t" + "ldr %x[state], [x29, #32]\n\t" + "xar v18.2d, v17.2d, v27.2d, #49\n\t" + "ldr %x[seed], [x29, #24]\n\t" + "xar v17.2d, v11.2d, v26.2d, #54\n\t" + "eor x28, x28, x30, ror 63\n\t" + "xar v11.2d, v7.2d, v27.2d, #58\n\t" + "eor x30, x30, %x[seed], ror 63\n\t" + "xar v7.2d, v10.2d, v25.2d, #61\n\t" + "eor %x[seed], %x[seed], %x[state], ror 63\n\t" + /* Row Mix */ + "mov v25.16b, v0.16b\n\t" + "eor x6, x6, x28\n\t" + "mov v26.16b, v1.16b\n\t" + "eor x11, x11, x28\n\t" + "bcax v0.16b, v25.16b, v2.16b, v26.16b\n\t" + "eor x16, x16, x28\n\t" + "bcax v1.16b, v26.16b, v3.16b, v2.16b\n\t" + "eor x22, x22, x28\n\t" + "bcax v2.16b, v2.16b, v4.16b, v3.16b\n\t" + "eor x27, x27, x28\n\t" + "bcax v3.16b, v3.16b, v25.16b, v4.16b\n\t" + "eor x3, x3, x30\n\t" + "bcax v4.16b, v4.16b, v26.16b, v25.16b\n\t" + "eor x8, x8, x30\n\t" + "mov v25.16b, v5.16b\n\t" + "eor x13, x13, x30\n\t" + "mov v26.16b, v6.16b\n\t" + "eor x19, x19, x30\n\t" + "bcax v5.16b, v25.16b, v7.16b, v26.16b\n\t" + "eor x24, x24, x30\n\t" + "bcax v6.16b, v26.16b, v8.16b, v7.16b\n\t" + "eor x5, x5, %x[seed]\n\t" + "bcax v7.16b, v7.16b, v9.16b, v8.16b\n\t" + "eor x10, x10, %x[seed]\n\t" + "bcax v8.16b, v8.16b, v25.16b, v9.16b\n\t" + "eor x15, x15, %x[seed]\n\t" + "bcax v9.16b, v9.16b, v26.16b, v25.16b\n\t" + "eor x21, x21, %x[seed]\n\t" + "mov v26.16b, v11.16b\n\t" + "eor x26, x26, %x[seed]\n\t" + /* Swap Rotate Base */ + "bcax v10.16b, v30.16b, v12.16b, v26.16b\n\t" + "ror %x[state], x3, #63\n\t" + "bcax v11.16b, v26.16b, v13.16b, v12.16b\n\t" + "ror x3, x8, #20\n\t" + "bcax v12.16b, v12.16b, v14.16b, v13.16b\n\t" + "ror x8, x11, #44\n\t" + "bcax v13.16b, v13.16b, v30.16b, v14.16b\n\t" + "ror x11, x25, #3\n\t" + "bcax v14.16b, v14.16b, v26.16b, v30.16b\n\t" + "ror x25, x16, #25\n\t" + "mov v25.16b, v15.16b\n\t" + "ror x16, x23, #46\n\t" + "mov v26.16b, v16.16b\n\t" + "ror x23, x4, #2\n\t" + "bcax v15.16b, v25.16b, v17.16b, v26.16b\n\t" + "ror x4, x14, #21\n\t" + "bcax v16.16b, v26.16b, v18.16b, v17.16b\n\t" + "ror x14, x15, #39\n\t" + "bcax v17.16b, v17.16b, v19.16b, v18.16b\n\t" + "ror x15, x22, #56\n\t" + "bcax v18.16b, v18.16b, v25.16b, v19.16b\n\t" + "ror x22, x26, #8\n\t" + "bcax v19.16b, v19.16b, v26.16b, v25.16b\n\t" + "ror x26, x17, #23\n\t" + "mov v25.16b, v20.16b\n\t" + "ror x17, x6, #37\n\t" + "mov v26.16b, v21.16b\n\t" + 
"ror x6, x27, #50\n\t" + "bcax v20.16b, v25.16b, v22.16b, v26.16b\n\t" + "ror x27, x24, #62\n\t" + "bcax v21.16b, v26.16b, v23.16b, v22.16b\n\t" + "ror x24, x10, #9\n\t" + "bcax v22.16b, v22.16b, v24.16b, v23.16b\n\t" + "ror x10, x19, #19\n\t" + "bcax v23.16b, v23.16b, v25.16b, v24.16b\n\t" + "ror x19, x7, #28\n\t" + "bcax v24.16b, v24.16b, v26.16b, v25.16b\n\t" + "ror x7, x5, #36\n\t" + "ror x5, x21, #43\n\t" + "ror x21, x20, #49\n\t" + "ror x20, x13, #54\n\t" + "ror x13, x9, #58\n\t" + "ror x9, x12, #61\n\t" + /* Row Mix Base */ + "bic x12, x4, x3\n\t" + "bic %x[seed], x5, x4\n\t" + "bic x28, x2, x6\n\t" + "bic x30, x3, x2\n\t" + "eor x2, x2, x12\n\t" + "eor x3, x3, %x[seed]\n\t" + "bic x12, x6, x5\n\t" + "eor x5, x5, x28\n\t" + "eor x4, x4, x12\n\t" + "eor x6, x6, x30\n\t" + "bic x12, x9, x8\n\t" + "bic %x[seed], x10, x9\n\t" + "bic x28, x7, x11\n\t" + "bic x30, x8, x7\n\t" + "eor x7, x7, x12\n\t" + "eor x8, x8, %x[seed]\n\t" + "bic x12, x11, x10\n\t" + "eor x10, x10, x28\n\t" + "eor x9, x9, x12\n\t" + "eor x11, x11, x30\n\t" + "bic x12, x14, x13\n\t" + "bic %x[seed], x15, x14\n\t" + "bic x28, %x[state], x16\n\t" + "bic x30, x13, %x[state]\n\t" + "eor x12, %x[state], x12\n\t" + "eor x13, x13, %x[seed]\n\t" + "bic %x[state], x16, x15\n\t" + "eor x15, x15, x28\n\t" + "eor x14, x14, %x[state]\n\t" + "eor x16, x16, x30\n\t" + "bic %x[state], x20, x19\n\t" + "bic %x[seed], x21, x20\n\t" + "bic x28, x17, x22\n\t" + "bic x30, x19, x17\n\t" + "eor x17, x17, %x[state]\n\t" + "eor x19, x19, %x[seed]\n\t" + "bic %x[state], x22, x21\n\t" + "eor x21, x21, x28\n\t" + "eor x20, x20, %x[state]\n\t" + "eor x22, x22, x30\n\t" + "bic %x[state], x25, x24\n\t" + "bic %x[seed], x26, x25\n\t" + "bic x28, x23, x27\n\t" + "bic x30, x24, x23\n\t" + "eor x23, x23, %x[state]\n\t" + "eor x24, x24, %x[seed]\n\t" + "bic %x[state], x27, x26\n\t" + "eor x26, x26, x28\n\t" + "eor x25, x25, %x[state]\n\t" + "eor x27, x27, x30\n\t" + /* Done tranforming */ + "ldp x28, %x[seed], [x29, #48]\n\t" + "ldr %x[state], [x28], #8\n\t" + "subs %x[seed], %x[seed], #1\n\t" + "mov v30.d[0], %x[state]\n\t" + "mov v30.d[1], %x[state]\n\t" + "eor x2, x2, %x[state]\n\t" + "eor v0.16b, v0.16b, v30.16b\n\t" + "bne L_SHA3_shake128_blocksx3_seed_neon_begin_%=\n\t" + "ldr %x[state], [x29, #40]\n\t" + "st4 {v0.d, v1.d, v2.d, v3.d}[0], [%x[state]], #32\n\t" + "st4 {v4.d, v5.d, v6.d, v7.d}[0], [%x[state]], #32\n\t" + "st4 {v8.d, v9.d, v10.d, v11.d}[0], [%x[state]], #32\n\t" + "st4 {v12.d, v13.d, v14.d, v15.d}[0], [%x[state]], #32\n\t" + "st4 {v16.d, v17.d, v18.d, v19.d}[0], [%x[state]], #32\n\t" + "st4 {v20.d, v21.d, v22.d, v23.d}[0], [%x[state]], #32\n\t" + "st1 {v24.d}[0], [%x[state]]\n\t" + "add %x[state], %x[state], #8\n\t" + "st4 {v0.d, v1.d, v2.d, v3.d}[1], [%x[state]], #32\n\t" + "st4 {v4.d, v5.d, v6.d, v7.d}[1], [%x[state]], #32\n\t" + "st4 {v8.d, v9.d, v10.d, v11.d}[1], [%x[state]], #32\n\t" + "st4 {v12.d, v13.d, v14.d, v15.d}[1], [%x[state]], #32\n\t" + "st4 {v16.d, v17.d, v18.d, v19.d}[1], [%x[state]], #32\n\t" + "st4 {v20.d, v21.d, v22.d, v23.d}[1], [%x[state]], #32\n\t" + "st1 {v24.d}[1], [%x[state]]\n\t" + "add %x[state], %x[state], #8\n\t" + "stp x2, x3, [%x[state]]\n\t" + "stp x4, x5, [%x[state], #16]\n\t" + "stp x6, x7, [%x[state], #32]\n\t" + "stp x8, x9, [%x[state], #48]\n\t" + "stp x10, x11, [%x[state], #64]\n\t" + "stp x12, x13, [%x[state], #80]\n\t" + "stp x14, x15, [%x[state], #96]\n\t" + "stp x16, x17, [%x[state], #112]\n\t" + "stp x19, x20, [%x[state], #128]\n\t" + "stp x21, x22, [%x[state], #144]\n\t" + "stp x23, x24, 
[%x[state], #160]\n\t" + "stp x25, x26, [%x[state], #176]\n\t" + "str x27, [%x[state], #192]\n\t" + "ldp x29, x30, [sp], #0x40\n\t" + : [state] "+r" (state), [seed] "+r" (seed) + : [L_kyber_aarch64_q] "S" (L_kyber_aarch64_q), [L_kyber_aarch64_consts] "S" (L_kyber_aarch64_consts), [L_sha3_aarch64_r] "S" (L_sha3_aarch64_r), [L_kyber_aarch64_zetas] "S" (L_kyber_aarch64_zetas), [L_kyber_aarch64_zetas_qinv] "S" (L_kyber_aarch64_zetas_qinv), [L_kyber_aarch64_zetas_inv] "S" (L_kyber_aarch64_zetas_inv), [L_kyber_aarch64_zetas_inv_qinv] "S" (L_kyber_aarch64_zetas_inv_qinv), [L_kyber_aarch64_zetas_mul] "S" (L_kyber_aarch64_zetas_mul), [L_kyber_aarch64_to_msg_neon_low] "S" (L_kyber_aarch64_to_msg_neon_low), [L_kyber_aarch64_to_msg_neon_high] "S" (L_kyber_aarch64_to_msg_neon_high), [L_kyber_aarch64_to_msg_neon_bits] "S" (L_kyber_aarch64_to_msg_neon_bits), [L_kyber_aarch64_from_msg_neon_q1half] "S" (L_kyber_aarch64_from_msg_neon_q1half), [L_kyber_aarch64_from_msg_neon_bits] "S" (L_kyber_aarch64_from_msg_neon_bits), [L_kyber_aarch64_rej_uniform_neon_mask] "S" (L_kyber_aarch64_rej_uniform_neon_mask), [L_kyber_aarch64_rej_uniform_neon_bits] "S" (L_kyber_aarch64_rej_uniform_neon_bits), [L_kyber_aarch64_rej_uniform_neon_indeces] "S" (L_kyber_aarch64_rej_uniform_neon_indeces) + : "memory", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc" + ); +} + +void kyber_shake256_blocksx3_seed_neon(word64* state, byte* seed) +{ + __asm__ __volatile__ ( + "stp x29, x30, [sp, #-64]!\n\t" + "add x29, sp, #0\n\t" +#ifndef __APPLE__ + "adrp x28, %[L_sha3_aarch64_r]\n\t" + "add x28, x28, :lo12:%[L_sha3_aarch64_r]\n\t" +#else + "adrp x28, %[L_sha3_aarch64_r]@PAGE\n\t" + "add x28, x28, %[L_sha3_aarch64_r]@PAGEOFF\n\t" +#endif /* __APPLE__ */ + "str %x[state], [x29, #40]\n\t" + "add %x[state], %x[state], #32\n\t" + "ld1 {v4.d}[0], [%x[state]]\n\t" + "ldp x2, x3, [%x[seed]], #16\n\t" + "add %x[state], %x[state], #0xc8\n\t" + "ld1 {v4.d}[1], [%x[state]]\n\t" + "ldp x4, x5, [%x[seed]], #16\n\t" + "ldr x6, [%x[state], #200]\n\t" + "eor v5.16b, v5.16b, v5.16b\n\t" + "eor x7, x7, x7\n\t" + "eor v6.16b, v6.16b, v6.16b\n\t" + "eor x8, x8, x8\n\t" + "eor v7.16b, v7.16b, v7.16b\n\t" + "eor x9, x9, x9\n\t" + "eor v8.16b, v8.16b, v8.16b\n\t" + "eor x10, x10, x10\n\t" + "eor v9.16b, v9.16b, v9.16b\n\t" + "eor x11, x11, x11\n\t" + "eor v10.16b, v10.16b, v10.16b\n\t" + "eor x12, x12, x12\n\t" + "eor v11.16b, v11.16b, v11.16b\n\t" + "eor x13, x13, x13\n\t" + "eor v12.16b, v12.16b, v12.16b\n\t" + "eor x14, x14, x14\n\t" + "eor v13.16b, v13.16b, v13.16b\n\t" + "eor x15, x15, x15\n\t" + "eor v14.16b, v14.16b, v14.16b\n\t" + "eor x16, x16, x16\n\t" + "eor v15.16b, v15.16b, v15.16b\n\t" + "eor x17, x17, x17\n\t" + "movz x19, #0x8000, lsl 48\n\t" + "eor v17.16b, v17.16b, v17.16b\n\t" + "eor x20, x20, x20\n\t" + "eor v18.16b, v18.16b, v18.16b\n\t" + "eor x21, x21, x21\n\t" + "eor v19.16b, v19.16b, v19.16b\n\t" + "eor x22, x22, x22\n\t" + "eor v20.16b, v20.16b, v20.16b\n\t" + "eor x23, x23, x23\n\t" + "eor v21.16b, v21.16b, v21.16b\n\t" + "eor x24, x24, x24\n\t" + "eor v22.16b, v22.16b, v22.16b\n\t" + "eor x25, x25, x25\n\t" + "eor v23.16b, v23.16b, v23.16b\n\t" + "eor x26, x26, x26\n\t" + "eor v24.16b, v24.16b, 
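/* Setup performed by the two seed-absorbing variants, in scalar form
 * for one of the three states.  Lanes 0-3 hold the 32-byte seed,
 * broadcast to all three states by the dup instructions; lane 4 is
 * per-state and read back from memory (the ld1 {v4.d}[...] loads),
 * so the caller is assumed to have pre-stored it -- for Kyber's
 * matrix XOF it would carry the per-block byte(s) plus the 0x1f
 * SHAKE padding byte.  The movz/dup pair plants the closing 0x80 pad
 * bit in the last lane of the rate: lane 20 for SHAKE128 (168-byte
 * rate, so rate_lanes = 21) and lane 16 for SHAKE256 (136-byte rate,
 * rate_lanes = 17, the dup v16 in the next function).  Illustrative
 * sketch only, not the wolfSSL API: */
#include <stdint.h>

static void shake_seed_state_ref(uint64_t s[25], const uint8_t seed[32],
                                 uint64_t lane4, unsigned int rate_lanes)
{
    unsigned int i, j;

    for (i = 0; i < 25; i++)
        s[i] = 0;
    for (i = 0; i < 4; i++) {            /* lanes 0..3: the seed     */
        uint64_t t = 0;
        for (j = 0; j < 8; j++)
            t |= (uint64_t)seed[8 * i + j] << (8 * j);
        s[i] = t;
    }
    s[4] = lane4;                            /* caller-provided      */
    s[rate_lanes - 1] = 0x8000000000000000ULL; /* pad10*1 final bit  */
}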
v24.16b\n\t" + "eor x27, x27, x27\n\t" + "dup v0.2d, x2\n\t" + "dup v1.2d, x3\n\t" + "dup v2.2d, x4\n\t" + "dup v3.2d, x5\n\t" + "dup v16.2d, x19\n\t" + "mov %x[seed], #24\n\t" + /* Start of 24 rounds */ + "\n" + "L_SHA3_shake256_blocksx3_seed_neon_begin_%=: \n\t" + "stp x28, %x[seed], [x29, #48]\n\t" + /* Col Mix */ + "eor3 v31.16b, v0.16b, v5.16b, v10.16b\n\t" + "eor %x[state], x6, x11\n\t" + "eor3 v27.16b, v1.16b, v6.16b, v11.16b\n\t" + "eor x30, x2, x7\n\t" + "eor3 v28.16b, v2.16b, v7.16b, v12.16b\n\t" + "eor x28, x4, x9\n\t" + "eor3 v29.16b, v3.16b, v8.16b, v13.16b\n\t" + "eor %x[state], %x[state], x16\n\t" + "eor3 v30.16b, v4.16b, v9.16b, v14.16b\n\t" + "eor x30, x30, x12\n\t" + "eor3 v31.16b, v31.16b, v15.16b, v20.16b\n\t" + "eor x28, x28, x14\n\t" + "eor3 v27.16b, v27.16b, v16.16b, v21.16b\n\t" + "eor %x[state], %x[state], x22\n\t" + "eor3 v28.16b, v28.16b, v17.16b, v22.16b\n\t" + "eor x30, x30, x17\n\t" + "eor3 v29.16b, v29.16b, v18.16b, v23.16b\n\t" + "eor x28, x28, x20\n\t" + "eor3 v30.16b, v30.16b, v19.16b, v24.16b\n\t" + "eor %x[state], %x[state], x27\n\t" + "rax1 v25.2d, v30.2d, v27.2d\n\t" + "eor x30, x30, x23\n\t" + "rax1 v26.2d, v31.2d, v28.2d\n\t" + "eor x28, x28, x25\n\t" + "rax1 v27.2d, v27.2d, v29.2d\n\t" + "str %x[state], [x29, #32]\n\t" + "rax1 v28.2d, v28.2d, v30.2d\n\t" + "str x28, [x29, #24]\n\t" + "rax1 v29.2d, v29.2d, v31.2d\n\t" + "eor %x[seed], x3, x8\n\t" + "eor v0.16b, v0.16b, v25.16b\n\t" + "xar v30.2d, v1.2d, v26.2d, #63\n\t" + "eor x28, x5, x10\n\t" + "xar v1.2d, v6.2d, v26.2d, #20\n\t" + "eor %x[seed], %x[seed], x13\n\t" + "xar v6.2d, v9.2d, v29.2d, #44\n\t" + "eor x28, x28, x15\n\t" + "xar v9.2d, v22.2d, v27.2d, #3\n\t" + "eor %x[seed], %x[seed], x19\n\t" + "xar v22.2d, v14.2d, v29.2d, #25\n\t" + "eor x28, x28, x21\n\t" + "xar v14.2d, v20.2d, v25.2d, #46\n\t" + "eor %x[seed], %x[seed], x24\n\t" + "xar v20.2d, v2.2d, v27.2d, #2\n\t" + "eor x28, x28, x26\n\t" + "xar v2.2d, v12.2d, v27.2d, #21\n\t" + "eor %x[state], %x[state], %x[seed], ror 63\n\t" + "xar v12.2d, v13.2d, v28.2d, #39\n\t" + "eor %x[seed], %x[seed], x28, ror 63\n\t" + "xar v13.2d, v19.2d, v29.2d, #56\n\t" + "eor x2, x2, %x[state]\n\t" + "xar v19.2d, v23.2d, v28.2d, #8\n\t" + "eor x7, x7, %x[state]\n\t" + "xar v23.2d, v15.2d, v25.2d, #23\n\t" + "eor x12, x12, %x[state]\n\t" + "xar v15.2d, v4.2d, v29.2d, #37\n\t" + "eor x17, x17, %x[state]\n\t" + "xar v4.2d, v24.2d, v29.2d, #50\n\t" + "eor x23, x23, %x[state]\n\t" + "xar v24.2d, v21.2d, v26.2d, #62\n\t" + "eor x4, x4, %x[seed]\n\t" + "xar v21.2d, v8.2d, v28.2d, #9\n\t" + "eor x9, x9, %x[seed]\n\t" + "xar v8.2d, v16.2d, v26.2d, #19\n\t" + "eor x14, x14, %x[seed]\n\t" + "xar v16.2d, v5.2d, v25.2d, #28\n\t" + "eor x20, x20, %x[seed]\n\t" + "xar v5.2d, v3.2d, v28.2d, #36\n\t" + "eor x25, x25, %x[seed]\n\t" + "xar v3.2d, v18.2d, v28.2d, #43\n\t" + "ldr %x[state], [x29, #32]\n\t" + "xar v18.2d, v17.2d, v27.2d, #49\n\t" + "ldr %x[seed], [x29, #24]\n\t" + "xar v17.2d, v11.2d, v26.2d, #54\n\t" + "eor x28, x28, x30, ror 63\n\t" + "xar v11.2d, v7.2d, v27.2d, #58\n\t" + "eor x30, x30, %x[seed], ror 63\n\t" + "xar v7.2d, v10.2d, v25.2d, #61\n\t" + "eor %x[seed], %x[seed], %x[state], ror 63\n\t" + /* Row Mix */ + "mov v25.16b, v0.16b\n\t" + "eor x6, x6, x28\n\t" + "mov v26.16b, v1.16b\n\t" + "eor x11, x11, x28\n\t" + "bcax v0.16b, v25.16b, v2.16b, v26.16b\n\t" + "eor x16, x16, x28\n\t" + "bcax v1.16b, v26.16b, v3.16b, v2.16b\n\t" + "eor x22, x22, x28\n\t" + "bcax v2.16b, v2.16b, v4.16b, v3.16b\n\t" + "eor x27, x27, x28\n\t" + "bcax v3.16b, v3.16b, v25.16b, 
v4.16b\n\t" + "eor x3, x3, x30\n\t" + "bcax v4.16b, v4.16b, v26.16b, v25.16b\n\t" + "eor x8, x8, x30\n\t" + "mov v25.16b, v5.16b\n\t" + "eor x13, x13, x30\n\t" + "mov v26.16b, v6.16b\n\t" + "eor x19, x19, x30\n\t" + "bcax v5.16b, v25.16b, v7.16b, v26.16b\n\t" + "eor x24, x24, x30\n\t" + "bcax v6.16b, v26.16b, v8.16b, v7.16b\n\t" + "eor x5, x5, %x[seed]\n\t" + "bcax v7.16b, v7.16b, v9.16b, v8.16b\n\t" + "eor x10, x10, %x[seed]\n\t" + "bcax v8.16b, v8.16b, v25.16b, v9.16b\n\t" + "eor x15, x15, %x[seed]\n\t" + "bcax v9.16b, v9.16b, v26.16b, v25.16b\n\t" + "eor x21, x21, %x[seed]\n\t" + "mov v26.16b, v11.16b\n\t" + "eor x26, x26, %x[seed]\n\t" + /* Swap Rotate Base */ + "bcax v10.16b, v30.16b, v12.16b, v26.16b\n\t" + "ror %x[state], x3, #63\n\t" + "bcax v11.16b, v26.16b, v13.16b, v12.16b\n\t" + "ror x3, x8, #20\n\t" + "bcax v12.16b, v12.16b, v14.16b, v13.16b\n\t" + "ror x8, x11, #44\n\t" + "bcax v13.16b, v13.16b, v30.16b, v14.16b\n\t" + "ror x11, x25, #3\n\t" + "bcax v14.16b, v14.16b, v26.16b, v30.16b\n\t" + "ror x25, x16, #25\n\t" + "mov v25.16b, v15.16b\n\t" + "ror x16, x23, #46\n\t" + "mov v26.16b, v16.16b\n\t" + "ror x23, x4, #2\n\t" + "bcax v15.16b, v25.16b, v17.16b, v26.16b\n\t" + "ror x4, x14, #21\n\t" + "bcax v16.16b, v26.16b, v18.16b, v17.16b\n\t" + "ror x14, x15, #39\n\t" + "bcax v17.16b, v17.16b, v19.16b, v18.16b\n\t" + "ror x15, x22, #56\n\t" + "bcax v18.16b, v18.16b, v25.16b, v19.16b\n\t" + "ror x22, x26, #8\n\t" + "bcax v19.16b, v19.16b, v26.16b, v25.16b\n\t" + "ror x26, x17, #23\n\t" + "mov v25.16b, v20.16b\n\t" + "ror x17, x6, #37\n\t" + "mov v26.16b, v21.16b\n\t" + "ror x6, x27, #50\n\t" + "bcax v20.16b, v25.16b, v22.16b, v26.16b\n\t" + "ror x27, x24, #62\n\t" + "bcax v21.16b, v26.16b, v23.16b, v22.16b\n\t" + "ror x24, x10, #9\n\t" + "bcax v22.16b, v22.16b, v24.16b, v23.16b\n\t" + "ror x10, x19, #19\n\t" + "bcax v23.16b, v23.16b, v25.16b, v24.16b\n\t" + "ror x19, x7, #28\n\t" + "bcax v24.16b, v24.16b, v26.16b, v25.16b\n\t" + "ror x7, x5, #36\n\t" + "ror x5, x21, #43\n\t" + "ror x21, x20, #49\n\t" + "ror x20, x13, #54\n\t" + "ror x13, x9, #58\n\t" + "ror x9, x12, #61\n\t" + /* Row Mix Base */ + "bic x12, x4, x3\n\t" + "bic %x[seed], x5, x4\n\t" + "bic x28, x2, x6\n\t" + "bic x30, x3, x2\n\t" + "eor x2, x2, x12\n\t" + "eor x3, x3, %x[seed]\n\t" + "bic x12, x6, x5\n\t" + "eor x5, x5, x28\n\t" + "eor x4, x4, x12\n\t" + "eor x6, x6, x30\n\t" + "bic x12, x9, x8\n\t" + "bic %x[seed], x10, x9\n\t" + "bic x28, x7, x11\n\t" + "bic x30, x8, x7\n\t" + "eor x7, x7, x12\n\t" + "eor x8, x8, %x[seed]\n\t" + "bic x12, x11, x10\n\t" + "eor x10, x10, x28\n\t" + "eor x9, x9, x12\n\t" + "eor x11, x11, x30\n\t" + "bic x12, x14, x13\n\t" + "bic %x[seed], x15, x14\n\t" + "bic x28, %x[state], x16\n\t" + "bic x30, x13, %x[state]\n\t" + "eor x12, %x[state], x12\n\t" + "eor x13, x13, %x[seed]\n\t" + "bic %x[state], x16, x15\n\t" + "eor x15, x15, x28\n\t" + "eor x14, x14, %x[state]\n\t" + "eor x16, x16, x30\n\t" + "bic %x[state], x20, x19\n\t" + "bic %x[seed], x21, x20\n\t" + "bic x28, x17, x22\n\t" + "bic x30, x19, x17\n\t" + "eor x17, x17, %x[state]\n\t" + "eor x19, x19, %x[seed]\n\t" + "bic %x[state], x22, x21\n\t" + "eor x21, x21, x28\n\t" + "eor x20, x20, %x[state]\n\t" + "eor x22, x22, x30\n\t" + "bic %x[state], x25, x24\n\t" + "bic %x[seed], x26, x25\n\t" + "bic x28, x23, x27\n\t" + "bic x30, x24, x23\n\t" + "eor x23, x23, %x[state]\n\t" + "eor x24, x24, %x[seed]\n\t" + "bic %x[state], x27, x26\n\t" + "eor x26, x26, x28\n\t" + "eor x25, x25, %x[state]\n\t" + "eor x27, x27, x30\n\t" + /* Done 
transforming */ + "ldp x28, %x[seed], [x29, #48]\n\t" + "ldr %x[state], [x28], #8\n\t" + "subs %x[seed], %x[seed], #1\n\t" + "mov v30.d[0], %x[state]\n\t" + "mov v30.d[1], %x[state]\n\t" + "eor x2, x2, %x[state]\n\t" + "eor v0.16b, v0.16b, v30.16b\n\t" + "bne L_SHA3_shake256_blocksx3_seed_neon_begin_%=\n\t" + "ldr %x[state], [x29, #40]\n\t" + "st4 {v0.d, v1.d, v2.d, v3.d}[0], [%x[state]], #32\n\t" + "st4 {v4.d, v5.d, v6.d, v7.d}[0], [%x[state]], #32\n\t" + "st4 {v8.d, v9.d, v10.d, v11.d}[0], [%x[state]], #32\n\t" + "st4 {v12.d, v13.d, v14.d, v15.d}[0], [%x[state]], #32\n\t" + "st4 {v16.d, v17.d, v18.d, v19.d}[0], [%x[state]], #32\n\t" + "st4 {v20.d, v21.d, v22.d, v23.d}[0], [%x[state]], #32\n\t" + "st1 {v24.d}[0], [%x[state]]\n\t" + "add %x[state], %x[state], #8\n\t" + "st4 {v0.d, v1.d, v2.d, v3.d}[1], [%x[state]], #32\n\t" + "st4 {v4.d, v5.d, v6.d, v7.d}[1], [%x[state]], #32\n\t" + "st4 {v8.d, v9.d, v10.d, v11.d}[1], [%x[state]], #32\n\t" + "st4 {v12.d, v13.d, v14.d, v15.d}[1], [%x[state]], #32\n\t" + "st4 {v16.d, v17.d, v18.d, v19.d}[1], [%x[state]], #32\n\t" + "st4 {v20.d, v21.d, v22.d, v23.d}[1], [%x[state]], #32\n\t" + "st1 {v24.d}[1], [%x[state]]\n\t" + "add %x[state], %x[state], #8\n\t" + "stp x2, x3, [%x[state]]\n\t" + "stp x4, x5, [%x[state], #16]\n\t" + "stp x6, x7, [%x[state], #32]\n\t" + "stp x8, x9, [%x[state], #48]\n\t" + "stp x10, x11, [%x[state], #64]\n\t" + "stp x12, x13, [%x[state], #80]\n\t" + "stp x14, x15, [%x[state], #96]\n\t" + "stp x16, x17, [%x[state], #112]\n\t" + "stp x19, x20, [%x[state], #128]\n\t" + "stp x21, x22, [%x[state], #144]\n\t" + "stp x23, x24, [%x[state], #160]\n\t" + "stp x25, x26, [%x[state], #176]\n\t" + "str x27, [%x[state], #192]\n\t" + "ldp x29, x30, [sp], #0x40\n\t" + : [state] "+r" (state), [seed] "+r" (seed) + : [L_kyber_aarch64_q] "S" (L_kyber_aarch64_q), [L_kyber_aarch64_consts] "S" (L_kyber_aarch64_consts), [L_sha3_aarch64_r] "S" (L_sha3_aarch64_r), [L_kyber_aarch64_zetas] "S" (L_kyber_aarch64_zetas), [L_kyber_aarch64_zetas_qinv] "S" (L_kyber_aarch64_zetas_qinv), [L_kyber_aarch64_zetas_inv] "S" (L_kyber_aarch64_zetas_inv), [L_kyber_aarch64_zetas_inv_qinv] "S" (L_kyber_aarch64_zetas_inv_qinv), [L_kyber_aarch64_zetas_mul] "S" (L_kyber_aarch64_zetas_mul), [L_kyber_aarch64_to_msg_neon_low] "S" (L_kyber_aarch64_to_msg_neon_low), [L_kyber_aarch64_to_msg_neon_high] "S" (L_kyber_aarch64_to_msg_neon_high), [L_kyber_aarch64_to_msg_neon_bits] "S" (L_kyber_aarch64_to_msg_neon_bits), [L_kyber_aarch64_from_msg_neon_q1half] "S" (L_kyber_aarch64_from_msg_neon_q1half), [L_kyber_aarch64_from_msg_neon_bits] "S" (L_kyber_aarch64_from_msg_neon_bits), [L_kyber_aarch64_rej_uniform_neon_mask] "S" (L_kyber_aarch64_rej_uniform_neon_mask), [L_kyber_aarch64_rej_uniform_neon_bits] "S" (L_kyber_aarch64_rej_uniform_neon_bits), [L_kyber_aarch64_rej_uniform_neon_indeces] "S" (L_kyber_aarch64_rej_uniform_neon_indeces) + : "memory", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc" + ); +} + +#else +void kyber_sha3_blocksx3_neon(word64* state) +{ + __asm__ __volatile__ ( + "stp x29, x30, [sp, #-64]!\n\t" + "add x29, sp, #0\n\t" +#ifndef __APPLE__ + "adrp x27, %[L_sha3_aarch64_r]\n\t" + "add x27, x27, :lo12:%[L_sha3_aarch64_r]\n\t" +#else 
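/* From here down is the fallback for cores without the Armv8.2 SHA-3
 * extensions: the same permutation, with eor3 replaced by two eor,
 * the rax1/xar rotates replaced by the ushr+sli pair (shift right,
 * then shift-left-and-insert, which together rotate a lane), and
 * bcax replaced by bic+eor.  The single-lane equivalences, as a
 * sketch: */
#include <stdint.h>

static inline uint64_t rotl64_ref(uint64_t x, unsigned int n)
{
    return (n == 0) ? x : ((x << n) | (x >> (64u - n)));
}

static inline uint64_t eor3_ref(uint64_t n, uint64_t m, uint64_t a)
{
    return n ^ m ^ a;               /* fallback: eor, eor            */
}

static inline uint64_t rax1_ref(uint64_t n, uint64_t m)
{
    return n ^ rotl64_ref(m, 1);    /* fallback: ushr #63, sli #1, eor */
}

static inline uint64_t xar_ref(uint64_t n, uint64_t m, unsigned int r)
{
    /* xar vd, vn, vm, #r == ror(vn ^ vm, r); the fallback is eor,
     * then ushr #r and sli #(64-r). */
    return rotl64_ref(n ^ m, (64u - r) & 63u);
}

static inline uint64_t bcax_ref(uint64_t n, uint64_t m, uint64_t a)
{
    return n ^ (m & ~a);            /* fallback: bic, eor            */
}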
+ "adrp x27, %[L_sha3_aarch64_r]@PAGE\n\t" + "add x27, x27, %[L_sha3_aarch64_r]@PAGEOFF\n\t" +#endif /* __APPLE__ */ + "str %x[state], [x29, #40]\n\t" + "ld4 {v0.d, v1.d, v2.d, v3.d}[0], [%x[state]], #32\n\t" + "ld4 {v4.d, v5.d, v6.d, v7.d}[0], [%x[state]], #32\n\t" + "ld4 {v8.d, v9.d, v10.d, v11.d}[0], [%x[state]], #32\n\t" + "ld4 {v12.d, v13.d, v14.d, v15.d}[0], [%x[state]], #32\n\t" + "ld4 {v16.d, v17.d, v18.d, v19.d}[0], [%x[state]], #32\n\t" + "ld4 {v20.d, v21.d, v22.d, v23.d}[0], [%x[state]], #32\n\t" + "ld1 {v24.d}[0], [%x[state]]\n\t" + "add %x[state], %x[state], #8\n\t" + "ld4 {v0.d, v1.d, v2.d, v3.d}[1], [%x[state]], #32\n\t" + "ld4 {v4.d, v5.d, v6.d, v7.d}[1], [%x[state]], #32\n\t" + "ld4 {v8.d, v9.d, v10.d, v11.d}[1], [%x[state]], #32\n\t" + "ld4 {v12.d, v13.d, v14.d, v15.d}[1], [%x[state]], #32\n\t" + "ld4 {v16.d, v17.d, v18.d, v19.d}[1], [%x[state]], #32\n\t" + "ld4 {v20.d, v21.d, v22.d, v23.d}[1], [%x[state]], #32\n\t" + "ld1 {v24.d}[1], [%x[state]]\n\t" + "add %x[state], %x[state], #8\n\t" + "ldp x1, x2, [%x[state]]\n\t" + "ldp x3, x4, [%x[state], #16]\n\t" + "ldp x5, x6, [%x[state], #32]\n\t" + "ldp x7, x8, [%x[state], #48]\n\t" + "ldp x9, x10, [%x[state], #64]\n\t" + "ldp x11, x12, [%x[state], #80]\n\t" + "ldp x13, x14, [%x[state], #96]\n\t" + "ldp x15, x16, [%x[state], #112]\n\t" + "ldp x17, x19, [%x[state], #128]\n\t" + "ldp x20, x21, [%x[state], #144]\n\t" + "ldp x22, x23, [%x[state], #160]\n\t" + "ldp x24, x25, [%x[state], #176]\n\t" + "ldr x26, [%x[state], #192]\n\t" + "mov x28, #24\n\t" + /* Start of 24 rounds */ + "\n" + "L_SHA3_transform_blocksx3_neon_begin_%=: \n\t" + "stp x27, x28, [x29, #48]\n\t" + /* Col Mix NEON */ + "eor v30.16b, v4.16b, v9.16b\n\t" + "eor %x[state], x5, x10\n\t" + "eor v27.16b, v1.16b, v6.16b\n\t" + "eor x30, x1, x6\n\t" + "eor v30.16b, v30.16b, v14.16b\n\t" + "eor x28, x3, x8\n\t" + "eor v27.16b, v27.16b, v11.16b\n\t" + "eor %x[state], %x[state], x15\n\t" + "eor v30.16b, v30.16b, v19.16b\n\t" + "eor x30, x30, x11\n\t" + "eor v27.16b, v27.16b, v16.16b\n\t" + "eor x28, x28, x13\n\t" + "eor v30.16b, v30.16b, v24.16b\n\t" + "eor %x[state], %x[state], x21\n\t" + "eor v27.16b, v27.16b, v21.16b\n\t" + "eor x30, x30, x16\n\t" + "ushr v25.2d, v27.2d, #63\n\t" + "eor x28, x28, x19\n\t" + "sli v25.2d, v27.2d, #1\n\t" + "eor %x[state], %x[state], x26\n\t" + "eor v25.16b, v25.16b, v30.16b\n\t" + "eor x30, x30, x22\n\t" + "eor v31.16b, v0.16b, v5.16b\n\t" + "eor x28, x28, x24\n\t" + "eor v28.16b, v2.16b, v7.16b\n\t" + "str %x[state], [x29, #32]\n\t" + "eor v31.16b, v31.16b, v10.16b\n\t" + "str x28, [x29, #24]\n\t" + "eor v28.16b, v28.16b, v12.16b\n\t" + "eor x27, x2, x7\n\t" + "eor v31.16b, v31.16b, v15.16b\n\t" + "eor x28, x4, x9\n\t" + "eor v28.16b, v28.16b, v17.16b\n\t" + "eor x27, x27, x12\n\t" + "eor v31.16b, v31.16b, v20.16b\n\t" + "eor x28, x28, x14\n\t" + "eor v28.16b, v28.16b, v22.16b\n\t" + "eor x27, x27, x17\n\t" + "ushr v29.2d, v30.2d, #63\n\t" + "eor x28, x28, x20\n\t" + "ushr v26.2d, v28.2d, #63\n\t" + "eor x27, x27, x23\n\t" + "sli v29.2d, v30.2d, #1\n\t" + "eor x28, x28, x25\n\t" + "sli v26.2d, v28.2d, #1\n\t" + "eor %x[state], %x[state], x27, ror 63\n\t" + "eor v28.16b, v28.16b, v29.16b\n\t" + "eor x27, x27, x28, ror 63\n\t" + "eor v29.16b, v3.16b, v8.16b\n\t" + "eor x1, x1, %x[state]\n\t" + "eor v26.16b, v26.16b, v31.16b\n\t" + "eor x6, x6, %x[state]\n\t" + "eor v29.16b, v29.16b, v13.16b\n\t" + "eor x11, x11, %x[state]\n\t" + "eor v29.16b, v29.16b, v18.16b\n\t" + "eor x16, x16, %x[state]\n\t" + "eor v29.16b, v29.16b, v23.16b\n\t" + 
"eor x22, x22, %x[state]\n\t" + "ushr v30.2d, v29.2d, #63\n\t" + "eor x3, x3, x27\n\t" + "sli v30.2d, v29.2d, #1\n\t" + "eor x8, x8, x27\n\t" + "eor v27.16b, v27.16b, v30.16b\n\t" + "eor x13, x13, x27\n\t" + "ushr v30.2d, v31.2d, #63\n\t" + "eor x19, x19, x27\n\t" + "sli v30.2d, v31.2d, #1\n\t" + "eor x24, x24, x27\n\t" + "eor v29.16b, v29.16b, v30.16b\n\t" + "ldr %x[state], [x29, #32]\n\t" + /* Swap Rotate NEON */ + "eor v0.16b, v0.16b, v25.16b\n\t" + "eor v31.16b, v1.16b, v26.16b\n\t" + "ldr x27, [x29, #24]\n\t" + "eor v6.16b, v6.16b, v26.16b\n\t" + "eor x28, x28, x30, ror 63\n\t" + "ushr v30.2d, v31.2d, #63\n\t" + "eor x30, x30, x27, ror 63\n\t" + "ushr v1.2d, v6.2d, #20\n\t" + "eor x27, x27, %x[state], ror 63\n\t" + "sli v30.2d, v31.2d, #1\n\t" + "eor x5, x5, x28\n\t" + "sli v1.2d, v6.2d, #44\n\t" + "eor x10, x10, x28\n\t" + "eor v31.16b, v9.16b, v29.16b\n\t" + "eor x15, x15, x28\n\t" + "eor v22.16b, v22.16b, v27.16b\n\t" + "eor x21, x21, x28\n\t" + "ushr v6.2d, v31.2d, #44\n\t" + "eor x26, x26, x28\n\t" + "ushr v9.2d, v22.2d, #3\n\t" + "eor x2, x2, x30\n\t" + "sli v6.2d, v31.2d, #20\n\t" + "eor x7, x7, x30\n\t" + "sli v9.2d, v22.2d, #61\n\t" + "eor x12, x12, x30\n\t" + "eor v31.16b, v14.16b, v29.16b\n\t" + "eor x17, x17, x30\n\t" + "eor v20.16b, v20.16b, v25.16b\n\t" + "eor x23, x23, x30\n\t" + "ushr v22.2d, v31.2d, #25\n\t" + "eor x4, x4, x27\n\t" + "ushr v14.2d, v20.2d, #46\n\t" + "eor x9, x9, x27\n\t" + "sli v22.2d, v31.2d, #39\n\t" + "eor x14, x14, x27\n\t" + "sli v14.2d, v20.2d, #18\n\t" + "eor x20, x20, x27\n\t" + "eor v31.16b, v2.16b, v27.16b\n\t" + "eor x25, x25, x27\n\t" + /* Swap Rotate Base */ + "eor v12.16b, v12.16b, v27.16b\n\t" + "ror %x[state], x2, #63\n\t" + "ushr v20.2d, v31.2d, #2\n\t" + "ror x2, x7, #20\n\t" + "ushr v2.2d, v12.2d, #21\n\t" + "ror x7, x10, #44\n\t" + "sli v20.2d, v31.2d, #62\n\t" + "ror x10, x24, #3\n\t" + "sli v2.2d, v12.2d, #43\n\t" + "ror x24, x15, #25\n\t" + "eor v31.16b, v13.16b, v28.16b\n\t" + "ror x15, x22, #46\n\t" + "eor v19.16b, v19.16b, v29.16b\n\t" + "ror x22, x3, #2\n\t" + "ushr v12.2d, v31.2d, #39\n\t" + "ror x3, x13, #21\n\t" + "ushr v13.2d, v19.2d, #56\n\t" + "ror x13, x14, #39\n\t" + "sli v12.2d, v31.2d, #25\n\t" + "ror x14, x21, #56\n\t" + "sli v13.2d, v19.2d, #8\n\t" + "ror x21, x25, #8\n\t" + "eor v31.16b, v23.16b, v28.16b\n\t" + "ror x25, x16, #23\n\t" + "eor v15.16b, v15.16b, v25.16b\n\t" + "ror x16, x5, #37\n\t" + "ushr v19.2d, v31.2d, #8\n\t" + "ror x5, x26, #50\n\t" + "ushr v23.2d, v15.2d, #23\n\t" + "ror x26, x23, #62\n\t" + "sli v19.2d, v31.2d, #56\n\t" + "ror x23, x9, #9\n\t" + "sli v23.2d, v15.2d, #41\n\t" + "ror x9, x17, #19\n\t" + "eor v31.16b, v4.16b, v29.16b\n\t" + "ror x17, x6, #28\n\t" + "eor v24.16b, v24.16b, v29.16b\n\t" + "ror x6, x4, #36\n\t" + "ushr v15.2d, v31.2d, #37\n\t" + "ror x4, x20, #43\n\t" + "ushr v4.2d, v24.2d, #50\n\t" + "ror x20, x19, #49\n\t" + "sli v15.2d, v31.2d, #27\n\t" + "ror x19, x12, #54\n\t" + "sli v4.2d, v24.2d, #14\n\t" + "ror x12, x8, #58\n\t" + "eor v31.16b, v21.16b, v26.16b\n\t" + "ror x8, x11, #61\n\t" + /* Row Mix Base */ + "eor v8.16b, v8.16b, v28.16b\n\t" + "bic x11, x3, x2\n\t" + "ushr v24.2d, v31.2d, #62\n\t" + "bic x27, x4, x3\n\t" + "ushr v21.2d, v8.2d, #9\n\t" + "bic x28, x1, x5\n\t" + "sli v24.2d, v31.2d, #2\n\t" + "bic x30, x2, x1\n\t" + "sli v21.2d, v8.2d, #55\n\t" + "eor x1, x1, x11\n\t" + "eor v31.16b, v16.16b, v26.16b\n\t" + "eor x2, x2, x27\n\t" + "eor v5.16b, v5.16b, v25.16b\n\t" + "bic x11, x5, x4\n\t" + "ushr v8.2d, v31.2d, #19\n\t" + "eor x4, x4, x28\n\t" + "ushr 
v16.2d, v5.2d, #28\n\t" + "eor x3, x3, x11\n\t" + "sli v8.2d, v31.2d, #45\n\t" + "eor x5, x5, x30\n\t" + "sli v16.2d, v5.2d, #36\n\t" + "bic x11, x8, x7\n\t" + "eor v31.16b, v3.16b, v28.16b\n\t" + "bic x27, x9, x8\n\t" + "eor v18.16b, v18.16b, v28.16b\n\t" + "bic x28, x6, x10\n\t" + "ushr v5.2d, v31.2d, #36\n\t" + "bic x30, x7, x6\n\t" + "ushr v3.2d, v18.2d, #43\n\t" + "eor x6, x6, x11\n\t" + "sli v5.2d, v31.2d, #28\n\t" + "eor x7, x7, x27\n\t" + "sli v3.2d, v18.2d, #21\n\t" + "bic x11, x10, x9\n\t" + "eor v31.16b, v17.16b, v27.16b\n\t" + "eor x9, x9, x28\n\t" + "eor v11.16b, v11.16b, v26.16b\n\t" + "eor x8, x8, x11\n\t" + "ushr v18.2d, v31.2d, #49\n\t" + "eor x10, x10, x30\n\t" + "ushr v17.2d, v11.2d, #54\n\t" + "bic x11, x13, x12\n\t" + "sli v18.2d, v31.2d, #15\n\t" + "bic x27, x14, x13\n\t" + "sli v17.2d, v11.2d, #10\n\t" + "bic x28, %x[state], x15\n\t" + "eor v31.16b, v7.16b, v27.16b\n\t" + "bic x30, x12, %x[state]\n\t" + "eor v10.16b, v10.16b, v25.16b\n\t" + "eor x11, %x[state], x11\n\t" + "ushr v11.2d, v31.2d, #58\n\t" + "eor x12, x12, x27\n\t" + "ushr v7.2d, v10.2d, #61\n\t" + "bic %x[state], x15, x14\n\t" + "sli v11.2d, v31.2d, #6\n\t" + "eor x14, x14, x28\n\t" + "sli v7.2d, v10.2d, #3\n\t" + "eor x13, x13, %x[state]\n\t" + /* Row Mix NEON */ + "bic v25.16b, v2.16b, v1.16b\n\t" + "eor x15, x15, x30\n\t" + "bic v26.16b, v3.16b, v2.16b\n\t" + "bic %x[state], x19, x17\n\t" + "bic v27.16b, v4.16b, v3.16b\n\t" + "bic x27, x20, x19\n\t" + "bic v28.16b, v0.16b, v4.16b\n\t" + "bic x28, x16, x21\n\t" + "bic v29.16b, v1.16b, v0.16b\n\t" + "bic x30, x17, x16\n\t" + "eor v0.16b, v0.16b, v25.16b\n\t" + "eor x16, x16, %x[state]\n\t" + "eor v1.16b, v1.16b, v26.16b\n\t" + "eor x17, x17, x27\n\t" + "eor v2.16b, v2.16b, v27.16b\n\t" + "bic %x[state], x21, x20\n\t" + "eor v3.16b, v3.16b, v28.16b\n\t" + "eor x20, x20, x28\n\t" + "eor v4.16b, v4.16b, v29.16b\n\t" + "eor x19, x19, %x[state]\n\t" + "bic v25.16b, v7.16b, v6.16b\n\t" + "eor x21, x21, x30\n\t" + "bic v26.16b, v8.16b, v7.16b\n\t" + "bic %x[state], x24, x23\n\t" + "bic v27.16b, v9.16b, v8.16b\n\t" + "bic x27, x25, x24\n\t" + "bic v28.16b, v5.16b, v9.16b\n\t" + "bic x28, x22, x26\n\t" + "bic v29.16b, v6.16b, v5.16b\n\t" + "bic x30, x23, x22\n\t" + "eor v5.16b, v5.16b, v25.16b\n\t" + "eor x22, x22, %x[state]\n\t" + "eor v6.16b, v6.16b, v26.16b\n\t" + "eor x23, x23, x27\n\t" + "eor v7.16b, v7.16b, v27.16b\n\t" + "bic %x[state], x26, x25\n\t" + "eor v8.16b, v8.16b, v28.16b\n\t" + "eor x25, x25, x28\n\t" + "eor v9.16b, v9.16b, v29.16b\n\t" + "eor x24, x24, %x[state]\n\t" + "bic v25.16b, v12.16b, v11.16b\n\t" + "eor x26, x26, x30\n\t" + "bic v26.16b, v13.16b, v12.16b\n\t" + "bic v27.16b, v14.16b, v13.16b\n\t" + "bic v28.16b, v30.16b, v14.16b\n\t" + "bic v29.16b, v11.16b, v30.16b\n\t" + "eor v10.16b, v30.16b, v25.16b\n\t" + "eor v11.16b, v11.16b, v26.16b\n\t" + "eor v12.16b, v12.16b, v27.16b\n\t" + "eor v13.16b, v13.16b, v28.16b\n\t" + "eor v14.16b, v14.16b, v29.16b\n\t" + "bic v25.16b, v17.16b, v16.16b\n\t" + "bic v26.16b, v18.16b, v17.16b\n\t" + "bic v27.16b, v19.16b, v18.16b\n\t" + "bic v28.16b, v15.16b, v19.16b\n\t" + "bic v29.16b, v16.16b, v15.16b\n\t" + "eor v15.16b, v15.16b, v25.16b\n\t" + "eor v16.16b, v16.16b, v26.16b\n\t" + "eor v17.16b, v17.16b, v27.16b\n\t" + "eor v18.16b, v18.16b, v28.16b\n\t" + "eor v19.16b, v19.16b, v29.16b\n\t" + "bic v25.16b, v22.16b, v21.16b\n\t" + "bic v26.16b, v23.16b, v22.16b\n\t" + "bic v27.16b, v24.16b, v23.16b\n\t" + "bic v28.16b, v20.16b, v24.16b\n\t" + "bic v29.16b, v21.16b, v20.16b\n\t" + "eor v20.16b, 
v20.16b, v25.16b\n\t" + "eor v21.16b, v21.16b, v26.16b\n\t" + "eor v22.16b, v22.16b, v27.16b\n\t" + "eor v23.16b, v23.16b, v28.16b\n\t" + "eor v24.16b, v24.16b, v29.16b\n\t" + /* Done tranforming */ + "ldp x27, x28, [x29, #48]\n\t" + "ldr %x[state], [x27], #8\n\t" + "subs x28, x28, #1\n\t" + "mov v30.d[0], %x[state]\n\t" + "mov v30.d[1], %x[state]\n\t" + "eor x1, x1, %x[state]\n\t" + "eor v0.16b, v0.16b, v30.16b\n\t" + "bne L_SHA3_transform_blocksx3_neon_begin_%=\n\t" + "ldr %x[state], [x29, #40]\n\t" + "st4 {v0.d, v1.d, v2.d, v3.d}[0], [%x[state]], #32\n\t" + "st4 {v4.d, v5.d, v6.d, v7.d}[0], [%x[state]], #32\n\t" + "st4 {v8.d, v9.d, v10.d, v11.d}[0], [%x[state]], #32\n\t" + "st4 {v12.d, v13.d, v14.d, v15.d}[0], [%x[state]], #32\n\t" + "st4 {v16.d, v17.d, v18.d, v19.d}[0], [%x[state]], #32\n\t" + "st4 {v20.d, v21.d, v22.d, v23.d}[0], [%x[state]], #32\n\t" + "st1 {v24.d}[0], [%x[state]]\n\t" + "add %x[state], %x[state], #8\n\t" + "st4 {v0.d, v1.d, v2.d, v3.d}[1], [%x[state]], #32\n\t" + "st4 {v4.d, v5.d, v6.d, v7.d}[1], [%x[state]], #32\n\t" + "st4 {v8.d, v9.d, v10.d, v11.d}[1], [%x[state]], #32\n\t" + "st4 {v12.d, v13.d, v14.d, v15.d}[1], [%x[state]], #32\n\t" + "st4 {v16.d, v17.d, v18.d, v19.d}[1], [%x[state]], #32\n\t" + "st4 {v20.d, v21.d, v22.d, v23.d}[1], [%x[state]], #32\n\t" + "st1 {v24.d}[1], [%x[state]]\n\t" + "add %x[state], %x[state], #8\n\t" + "stp x1, x2, [%x[state]]\n\t" + "stp x3, x4, [%x[state], #16]\n\t" + "stp x5, x6, [%x[state], #32]\n\t" + "stp x7, x8, [%x[state], #48]\n\t" + "stp x9, x10, [%x[state], #64]\n\t" + "stp x11, x12, [%x[state], #80]\n\t" + "stp x13, x14, [%x[state], #96]\n\t" + "stp x15, x16, [%x[state], #112]\n\t" + "stp x17, x19, [%x[state], #128]\n\t" + "stp x20, x21, [%x[state], #144]\n\t" + "stp x22, x23, [%x[state], #160]\n\t" + "stp x24, x25, [%x[state], #176]\n\t" + "str x26, [%x[state], #192]\n\t" + "ldp x29, x30, [sp], #0x40\n\t" + : [state] "+r" (state) + : [L_kyber_aarch64_q] "S" (L_kyber_aarch64_q), [L_kyber_aarch64_consts] "S" (L_kyber_aarch64_consts), [L_sha3_aarch64_r] "S" (L_sha3_aarch64_r), [L_kyber_aarch64_zetas] "S" (L_kyber_aarch64_zetas), [L_kyber_aarch64_zetas_qinv] "S" (L_kyber_aarch64_zetas_qinv), [L_kyber_aarch64_zetas_inv] "S" (L_kyber_aarch64_zetas_inv), [L_kyber_aarch64_zetas_inv_qinv] "S" (L_kyber_aarch64_zetas_inv_qinv), [L_kyber_aarch64_zetas_mul] "S" (L_kyber_aarch64_zetas_mul), [L_kyber_aarch64_to_msg_neon_low] "S" (L_kyber_aarch64_to_msg_neon_low), [L_kyber_aarch64_to_msg_neon_high] "S" (L_kyber_aarch64_to_msg_neon_high), [L_kyber_aarch64_to_msg_neon_bits] "S" (L_kyber_aarch64_to_msg_neon_bits), [L_kyber_aarch64_from_msg_neon_q1half] "S" (L_kyber_aarch64_from_msg_neon_q1half), [L_kyber_aarch64_from_msg_neon_bits] "S" (L_kyber_aarch64_from_msg_neon_bits), [L_kyber_aarch64_rej_uniform_neon_mask] "S" (L_kyber_aarch64_rej_uniform_neon_mask), [L_kyber_aarch64_rej_uniform_neon_bits] "S" (L_kyber_aarch64_rej_uniform_neon_bits), [L_kyber_aarch64_rej_uniform_neon_indeces] "S" (L_kyber_aarch64_rej_uniform_neon_indeces) + : "memory", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc" + ); +} + +void kyber_shake128_blocksx3_seed_neon(word64* state, byte* seed) +{ + __asm__ __volatile__ ( + 
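/* All of the blocksx3 functions share one layout: three complete
 * 25-lane (200-byte) Keccak states stored back to back.  States 0
 * and 1 travel in the two 64-bit lanes of v0-v24 (the ld4/st4 ...
 * [0]/[1] pairs), and state 2 lives in x1-x17/x19-x26 -- x18 is
 * skipped as the AArch64 platform register.  Behaviourally each call
 * is three independent permutations; a sketch assuming a single-state
 * permutation such as the keccak_f1600_ref above: */
#include <stdint.h>

void keccak_f1600(uint64_t s[25]); /* e.g. keccak_f1600_ref above */

static void sha3_blocksx3_ref(uint64_t state[3 * 25])
{
    int i;

    for (i = 0; i < 3; i++)
        keccak_f1600(&state[25 * i]);
}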
"stp x29, x30, [sp, #-64]!\n\t" + "add x29, sp, #0\n\t" +#ifndef __APPLE__ + "adrp x28, %[L_sha3_aarch64_r]\n\t" + "add x28, x28, :lo12:%[L_sha3_aarch64_r]\n\t" +#else + "adrp x28, %[L_sha3_aarch64_r]@PAGE\n\t" + "add x28, x28, %[L_sha3_aarch64_r]@PAGEOFF\n\t" +#endif /* __APPLE__ */ + "str %x[state], [x29, #40]\n\t" + "add %x[state], %x[state], #32\n\t" + "ld1 {v4.d}[0], [%x[state]]\n\t" + "ldp x2, x3, [%x[seed]], #16\n\t" + "add %x[state], %x[state], #0xc8\n\t" + "ld1 {v4.d}[1], [%x[state]]\n\t" + "ldp x4, x5, [%x[seed]], #16\n\t" + "ldr x6, [%x[state], #200]\n\t" + "eor v5.16b, v5.16b, v5.16b\n\t" + "eor x7, x7, x7\n\t" + "eor v6.16b, v6.16b, v6.16b\n\t" + "eor x8, x8, x8\n\t" + "eor v7.16b, v7.16b, v7.16b\n\t" + "eor x9, x9, x9\n\t" + "eor v8.16b, v8.16b, v8.16b\n\t" + "eor x10, x10, x10\n\t" + "eor v9.16b, v9.16b, v9.16b\n\t" + "eor x11, x11, x11\n\t" + "eor v10.16b, v10.16b, v10.16b\n\t" + "eor x12, x12, x12\n\t" + "eor v11.16b, v11.16b, v11.16b\n\t" + "eor x13, x13, x13\n\t" + "eor v12.16b, v12.16b, v12.16b\n\t" + "eor x14, x14, x14\n\t" + "eor v13.16b, v13.16b, v13.16b\n\t" + "eor x15, x15, x15\n\t" + "eor v14.16b, v14.16b, v14.16b\n\t" + "eor x16, x16, x16\n\t" + "eor v15.16b, v15.16b, v15.16b\n\t" + "eor x17, x17, x17\n\t" + "eor v16.16b, v16.16b, v16.16b\n\t" + "eor x19, x19, x19\n\t" + "eor v17.16b, v17.16b, v17.16b\n\t" + "eor x20, x20, x20\n\t" + "eor v18.16b, v18.16b, v18.16b\n\t" + "eor x21, x21, x21\n\t" + "eor v19.16b, v19.16b, v19.16b\n\t" + "eor x22, x22, x22\n\t" + "movz x23, #0x8000, lsl 48\n\t" + "eor v21.16b, v21.16b, v21.16b\n\t" + "eor x24, x24, x24\n\t" + "eor v22.16b, v22.16b, v22.16b\n\t" + "eor x25, x25, x25\n\t" + "eor v23.16b, v23.16b, v23.16b\n\t" + "eor x26, x26, x26\n\t" + "eor v24.16b, v24.16b, v24.16b\n\t" + "eor x27, x27, x27\n\t" + "dup v0.2d, x2\n\t" + "dup v1.2d, x3\n\t" + "dup v2.2d, x4\n\t" + "dup v3.2d, x5\n\t" + "dup v20.2d, x23\n\t" + "mov %x[seed], #24\n\t" + /* Start of 24 rounds */ + "\n" + "L_SHA3_shake128_blocksx3_seed_neon_begin_%=: \n\t" + "stp x28, %x[seed], [x29, #48]\n\t" + /* Col Mix NEON */ + "eor v30.16b, v4.16b, v9.16b\n\t" + "eor %x[state], x6, x11\n\t" + "eor v27.16b, v1.16b, v6.16b\n\t" + "eor x30, x2, x7\n\t" + "eor v30.16b, v30.16b, v14.16b\n\t" + "eor x28, x4, x9\n\t" + "eor v27.16b, v27.16b, v11.16b\n\t" + "eor %x[state], %x[state], x16\n\t" + "eor v30.16b, v30.16b, v19.16b\n\t" + "eor x30, x30, x12\n\t" + "eor v27.16b, v27.16b, v16.16b\n\t" + "eor x28, x28, x14\n\t" + "eor v30.16b, v30.16b, v24.16b\n\t" + "eor %x[state], %x[state], x22\n\t" + "eor v27.16b, v27.16b, v21.16b\n\t" + "eor x30, x30, x17\n\t" + "ushr v25.2d, v27.2d, #63\n\t" + "eor x28, x28, x20\n\t" + "sli v25.2d, v27.2d, #1\n\t" + "eor %x[state], %x[state], x27\n\t" + "eor v25.16b, v25.16b, v30.16b\n\t" + "eor x30, x30, x23\n\t" + "eor v31.16b, v0.16b, v5.16b\n\t" + "eor x28, x28, x25\n\t" + "eor v28.16b, v2.16b, v7.16b\n\t" + "str %x[state], [x29, #32]\n\t" + "eor v31.16b, v31.16b, v10.16b\n\t" + "str x28, [x29, #24]\n\t" + "eor v28.16b, v28.16b, v12.16b\n\t" + "eor %x[seed], x3, x8\n\t" + "eor v31.16b, v31.16b, v15.16b\n\t" + "eor x28, x5, x10\n\t" + "eor v28.16b, v28.16b, v17.16b\n\t" + "eor %x[seed], %x[seed], x13\n\t" + "eor v31.16b, v31.16b, v20.16b\n\t" + "eor x28, x28, x15\n\t" + "eor v28.16b, v28.16b, v22.16b\n\t" + "eor %x[seed], %x[seed], x19\n\t" + "ushr v29.2d, v30.2d, #63\n\t" + "eor x28, x28, x21\n\t" + "ushr v26.2d, v28.2d, #63\n\t" + "eor %x[seed], %x[seed], x24\n\t" + "sli v29.2d, v30.2d, #1\n\t" + "eor x28, x28, x26\n\t" + "sli v26.2d, 
v28.2d, #1\n\t" + "eor %x[state], %x[state], %x[seed], ror 63\n\t" + "eor v28.16b, v28.16b, v29.16b\n\t" + "eor %x[seed], %x[seed], x28, ror 63\n\t" + "eor v29.16b, v3.16b, v8.16b\n\t" + "eor x2, x2, %x[state]\n\t" + "eor v26.16b, v26.16b, v31.16b\n\t" + "eor x7, x7, %x[state]\n\t" + "eor v29.16b, v29.16b, v13.16b\n\t" + "eor x12, x12, %x[state]\n\t" + "eor v29.16b, v29.16b, v18.16b\n\t" + "eor x17, x17, %x[state]\n\t" + "eor v29.16b, v29.16b, v23.16b\n\t" + "eor x23, x23, %x[state]\n\t" + "ushr v30.2d, v29.2d, #63\n\t" + "eor x4, x4, %x[seed]\n\t" + "sli v30.2d, v29.2d, #1\n\t" + "eor x9, x9, %x[seed]\n\t" + "eor v27.16b, v27.16b, v30.16b\n\t" + "eor x14, x14, %x[seed]\n\t" + "ushr v30.2d, v31.2d, #63\n\t" + "eor x20, x20, %x[seed]\n\t" + "sli v30.2d, v31.2d, #1\n\t" + "eor x25, x25, %x[seed]\n\t" + "eor v29.16b, v29.16b, v30.16b\n\t" + "ldr %x[state], [x29, #32]\n\t" + /* Swap Rotate NEON */ + "eor v0.16b, v0.16b, v25.16b\n\t" + "eor v31.16b, v1.16b, v26.16b\n\t" + "ldr %x[seed], [x29, #24]\n\t" + "eor v6.16b, v6.16b, v26.16b\n\t" + "eor x28, x28, x30, ror 63\n\t" + "ushr v30.2d, v31.2d, #63\n\t" + "eor x30, x30, %x[seed], ror 63\n\t" + "ushr v1.2d, v6.2d, #20\n\t" + "eor %x[seed], %x[seed], %x[state], ror 63\n\t" + "sli v30.2d, v31.2d, #1\n\t" + "eor x6, x6, x28\n\t" + "sli v1.2d, v6.2d, #44\n\t" + "eor x11, x11, x28\n\t" + "eor v31.16b, v9.16b, v29.16b\n\t" + "eor x16, x16, x28\n\t" + "eor v22.16b, v22.16b, v27.16b\n\t" + "eor x22, x22, x28\n\t" + "ushr v6.2d, v31.2d, #44\n\t" + "eor x27, x27, x28\n\t" + "ushr v9.2d, v22.2d, #3\n\t" + "eor x3, x3, x30\n\t" + "sli v6.2d, v31.2d, #20\n\t" + "eor x8, x8, x30\n\t" + "sli v9.2d, v22.2d, #61\n\t" + "eor x13, x13, x30\n\t" + "eor v31.16b, v14.16b, v29.16b\n\t" + "eor x19, x19, x30\n\t" + "eor v20.16b, v20.16b, v25.16b\n\t" + "eor x24, x24, x30\n\t" + "ushr v22.2d, v31.2d, #25\n\t" + "eor x5, x5, %x[seed]\n\t" + "ushr v14.2d, v20.2d, #46\n\t" + "eor x10, x10, %x[seed]\n\t" + "sli v22.2d, v31.2d, #39\n\t" + "eor x15, x15, %x[seed]\n\t" + "sli v14.2d, v20.2d, #18\n\t" + "eor x21, x21, %x[seed]\n\t" + "eor v31.16b, v2.16b, v27.16b\n\t" + "eor x26, x26, %x[seed]\n\t" + /* Swap Rotate Base */ + "eor v12.16b, v12.16b, v27.16b\n\t" + "ror %x[state], x3, #63\n\t" + "ushr v20.2d, v31.2d, #2\n\t" + "ror x3, x8, #20\n\t" + "ushr v2.2d, v12.2d, #21\n\t" + "ror x8, x11, #44\n\t" + "sli v20.2d, v31.2d, #62\n\t" + "ror x11, x25, #3\n\t" + "sli v2.2d, v12.2d, #43\n\t" + "ror x25, x16, #25\n\t" + "eor v31.16b, v13.16b, v28.16b\n\t" + "ror x16, x23, #46\n\t" + "eor v19.16b, v19.16b, v29.16b\n\t" + "ror x23, x4, #2\n\t" + "ushr v12.2d, v31.2d, #39\n\t" + "ror x4, x14, #21\n\t" + "ushr v13.2d, v19.2d, #56\n\t" + "ror x14, x15, #39\n\t" + "sli v12.2d, v31.2d, #25\n\t" + "ror x15, x22, #56\n\t" + "sli v13.2d, v19.2d, #8\n\t" + "ror x22, x26, #8\n\t" + "eor v31.16b, v23.16b, v28.16b\n\t" + "ror x26, x17, #23\n\t" + "eor v15.16b, v15.16b, v25.16b\n\t" + "ror x17, x6, #37\n\t" + "ushr v19.2d, v31.2d, #8\n\t" + "ror x6, x27, #50\n\t" + "ushr v23.2d, v15.2d, #23\n\t" + "ror x27, x24, #62\n\t" + "sli v19.2d, v31.2d, #56\n\t" + "ror x24, x10, #9\n\t" + "sli v23.2d, v15.2d, #41\n\t" + "ror x10, x19, #19\n\t" + "eor v31.16b, v4.16b, v29.16b\n\t" + "ror x19, x7, #28\n\t" + "eor v24.16b, v24.16b, v29.16b\n\t" + "ror x7, x5, #36\n\t" + "ushr v15.2d, v31.2d, #37\n\t" + "ror x5, x21, #43\n\t" + "ushr v4.2d, v24.2d, #50\n\t" + "ror x21, x20, #49\n\t" + "sli v15.2d, v31.2d, #27\n\t" + "ror x20, x13, #54\n\t" + "sli v4.2d, v24.2d, #14\n\t" + "ror x13, x9, #58\n\t" + "eor v31.16b, 
v21.16b, v26.16b\n\t" + "ror x9, x12, #61\n\t" + /* Row Mix Base */ + "eor v8.16b, v8.16b, v28.16b\n\t" + "bic x12, x4, x3\n\t" + "ushr v24.2d, v31.2d, #62\n\t" + "bic %x[seed], x5, x4\n\t" + "ushr v21.2d, v8.2d, #9\n\t" + "bic x28, x2, x6\n\t" + "sli v24.2d, v31.2d, #2\n\t" + "bic x30, x3, x2\n\t" + "sli v21.2d, v8.2d, #55\n\t" + "eor x2, x2, x12\n\t" + "eor v31.16b, v16.16b, v26.16b\n\t" + "eor x3, x3, %x[seed]\n\t" + "eor v5.16b, v5.16b, v25.16b\n\t" + "bic x12, x6, x5\n\t" + "ushr v8.2d, v31.2d, #19\n\t" + "eor x5, x5, x28\n\t" + "ushr v16.2d, v5.2d, #28\n\t" + "eor x4, x4, x12\n\t" + "sli v8.2d, v31.2d, #45\n\t" + "eor x6, x6, x30\n\t" + "sli v16.2d, v5.2d, #36\n\t" + "bic x12, x9, x8\n\t" + "eor v31.16b, v3.16b, v28.16b\n\t" + "bic %x[seed], x10, x9\n\t" + "eor v18.16b, v18.16b, v28.16b\n\t" + "bic x28, x7, x11\n\t" + "ushr v5.2d, v31.2d, #36\n\t" + "bic x30, x8, x7\n\t" + "ushr v3.2d, v18.2d, #43\n\t" + "eor x7, x7, x12\n\t" + "sli v5.2d, v31.2d, #28\n\t" + "eor x8, x8, %x[seed]\n\t" + "sli v3.2d, v18.2d, #21\n\t" + "bic x12, x11, x10\n\t" + "eor v31.16b, v17.16b, v27.16b\n\t" + "eor x10, x10, x28\n\t" + "eor v11.16b, v11.16b, v26.16b\n\t" + "eor x9, x9, x12\n\t" + "ushr v18.2d, v31.2d, #49\n\t" + "eor x11, x11, x30\n\t" + "ushr v17.2d, v11.2d, #54\n\t" + "bic x12, x14, x13\n\t" + "sli v18.2d, v31.2d, #15\n\t" + "bic %x[seed], x15, x14\n\t" + "sli v17.2d, v11.2d, #10\n\t" + "bic x28, %x[state], x16\n\t" + "eor v31.16b, v7.16b, v27.16b\n\t" + "bic x30, x13, %x[state]\n\t" + "eor v10.16b, v10.16b, v25.16b\n\t" + "eor x12, %x[state], x12\n\t" + "ushr v11.2d, v31.2d, #58\n\t" + "eor x13, x13, %x[seed]\n\t" + "ushr v7.2d, v10.2d, #61\n\t" + "bic %x[state], x16, x15\n\t" + "sli v11.2d, v31.2d, #6\n\t" + "eor x15, x15, x28\n\t" + "sli v7.2d, v10.2d, #3\n\t" + "eor x14, x14, %x[state]\n\t" + /* Row Mix NEON */ + "bic v25.16b, v2.16b, v1.16b\n\t" + "eor x16, x16, x30\n\t" + "bic v26.16b, v3.16b, v2.16b\n\t" + "bic %x[state], x20, x19\n\t" + "bic v27.16b, v4.16b, v3.16b\n\t" + "bic %x[seed], x21, x20\n\t" + "bic v28.16b, v0.16b, v4.16b\n\t" + "bic x28, x17, x22\n\t" + "bic v29.16b, v1.16b, v0.16b\n\t" + "bic x30, x19, x17\n\t" + "eor v0.16b, v0.16b, v25.16b\n\t" + "eor x17, x17, %x[state]\n\t" + "eor v1.16b, v1.16b, v26.16b\n\t" + "eor x19, x19, %x[seed]\n\t" + "eor v2.16b, v2.16b, v27.16b\n\t" + "bic %x[state], x22, x21\n\t" + "eor v3.16b, v3.16b, v28.16b\n\t" + "eor x21, x21, x28\n\t" + "eor v4.16b, v4.16b, v29.16b\n\t" + "eor x20, x20, %x[state]\n\t" + "bic v25.16b, v7.16b, v6.16b\n\t" + "eor x22, x22, x30\n\t" + "bic v26.16b, v8.16b, v7.16b\n\t" + "bic %x[state], x25, x24\n\t" + "bic v27.16b, v9.16b, v8.16b\n\t" + "bic %x[seed], x26, x25\n\t" + "bic v28.16b, v5.16b, v9.16b\n\t" + "bic x28, x23, x27\n\t" + "bic v29.16b, v6.16b, v5.16b\n\t" + "bic x30, x24, x23\n\t" + "eor v5.16b, v5.16b, v25.16b\n\t" + "eor x23, x23, %x[state]\n\t" + "eor v6.16b, v6.16b, v26.16b\n\t" + "eor x24, x24, %x[seed]\n\t" + "eor v7.16b, v7.16b, v27.16b\n\t" + "bic %x[state], x27, x26\n\t" + "eor v8.16b, v8.16b, v28.16b\n\t" + "eor x26, x26, x28\n\t" + "eor v9.16b, v9.16b, v29.16b\n\t" + "eor x25, x25, %x[state]\n\t" + "bic v25.16b, v12.16b, v11.16b\n\t" + "eor x27, x27, x30\n\t" + "bic v26.16b, v13.16b, v12.16b\n\t" + "bic v27.16b, v14.16b, v13.16b\n\t" + "bic v28.16b, v30.16b, v14.16b\n\t" + "bic v29.16b, v11.16b, v30.16b\n\t" + "eor v10.16b, v30.16b, v25.16b\n\t" + "eor v11.16b, v11.16b, v26.16b\n\t" + "eor v12.16b, v12.16b, v27.16b\n\t" + "eor v13.16b, v13.16b, v28.16b\n\t" + "eor v14.16b, v14.16b, v29.16b\n\t" 
+ "bic v25.16b, v17.16b, v16.16b\n\t" + "bic v26.16b, v18.16b, v17.16b\n\t" + "bic v27.16b, v19.16b, v18.16b\n\t" + "bic v28.16b, v15.16b, v19.16b\n\t" + "bic v29.16b, v16.16b, v15.16b\n\t" + "eor v15.16b, v15.16b, v25.16b\n\t" + "eor v16.16b, v16.16b, v26.16b\n\t" + "eor v17.16b, v17.16b, v27.16b\n\t" + "eor v18.16b, v18.16b, v28.16b\n\t" + "eor v19.16b, v19.16b, v29.16b\n\t" + "bic v25.16b, v22.16b, v21.16b\n\t" + "bic v26.16b, v23.16b, v22.16b\n\t" + "bic v27.16b, v24.16b, v23.16b\n\t" + "bic v28.16b, v20.16b, v24.16b\n\t" + "bic v29.16b, v21.16b, v20.16b\n\t" + "eor v20.16b, v20.16b, v25.16b\n\t" + "eor v21.16b, v21.16b, v26.16b\n\t" + "eor v22.16b, v22.16b, v27.16b\n\t" + "eor v23.16b, v23.16b, v28.16b\n\t" + "eor v24.16b, v24.16b, v29.16b\n\t" + /* Done tranforming */ + "ldp x28, %x[seed], [x29, #48]\n\t" + "ldr %x[state], [x28], #8\n\t" + "subs %x[seed], %x[seed], #1\n\t" + "mov v30.d[0], %x[state]\n\t" + "mov v30.d[1], %x[state]\n\t" + "eor x2, x2, %x[state]\n\t" + "eor v0.16b, v0.16b, v30.16b\n\t" + "bne L_SHA3_shake128_blocksx3_seed_neon_begin_%=\n\t" + "ldr %x[state], [x29, #40]\n\t" + "st4 {v0.d, v1.d, v2.d, v3.d}[0], [%x[state]], #32\n\t" + "st4 {v4.d, v5.d, v6.d, v7.d}[0], [%x[state]], #32\n\t" + "st4 {v8.d, v9.d, v10.d, v11.d}[0], [%x[state]], #32\n\t" + "st4 {v12.d, v13.d, v14.d, v15.d}[0], [%x[state]], #32\n\t" + "st4 {v16.d, v17.d, v18.d, v19.d}[0], [%x[state]], #32\n\t" + "st4 {v20.d, v21.d, v22.d, v23.d}[0], [%x[state]], #32\n\t" + "st1 {v24.d}[0], [%x[state]]\n\t" + "add %x[state], %x[state], #8\n\t" + "st4 {v0.d, v1.d, v2.d, v3.d}[1], [%x[state]], #32\n\t" + "st4 {v4.d, v5.d, v6.d, v7.d}[1], [%x[state]], #32\n\t" + "st4 {v8.d, v9.d, v10.d, v11.d}[1], [%x[state]], #32\n\t" + "st4 {v12.d, v13.d, v14.d, v15.d}[1], [%x[state]], #32\n\t" + "st4 {v16.d, v17.d, v18.d, v19.d}[1], [%x[state]], #32\n\t" + "st4 {v20.d, v21.d, v22.d, v23.d}[1], [%x[state]], #32\n\t" + "st1 {v24.d}[1], [%x[state]]\n\t" + "add %x[state], %x[state], #8\n\t" + "stp x2, x3, [%x[state]]\n\t" + "stp x4, x5, [%x[state], #16]\n\t" + "stp x6, x7, [%x[state], #32]\n\t" + "stp x8, x9, [%x[state], #48]\n\t" + "stp x10, x11, [%x[state], #64]\n\t" + "stp x12, x13, [%x[state], #80]\n\t" + "stp x14, x15, [%x[state], #96]\n\t" + "stp x16, x17, [%x[state], #112]\n\t" + "stp x19, x20, [%x[state], #128]\n\t" + "stp x21, x22, [%x[state], #144]\n\t" + "stp x23, x24, [%x[state], #160]\n\t" + "stp x25, x26, [%x[state], #176]\n\t" + "str x27, [%x[state], #192]\n\t" + "ldp x29, x30, [sp], #0x40\n\t" + : [state] "+r" (state), [seed] "+r" (seed) + : [L_kyber_aarch64_q] "S" (L_kyber_aarch64_q), [L_kyber_aarch64_consts] "S" (L_kyber_aarch64_consts), [L_sha3_aarch64_r] "S" (L_sha3_aarch64_r), [L_kyber_aarch64_zetas] "S" (L_kyber_aarch64_zetas), [L_kyber_aarch64_zetas_qinv] "S" (L_kyber_aarch64_zetas_qinv), [L_kyber_aarch64_zetas_inv] "S" (L_kyber_aarch64_zetas_inv), [L_kyber_aarch64_zetas_inv_qinv] "S" (L_kyber_aarch64_zetas_inv_qinv), [L_kyber_aarch64_zetas_mul] "S" (L_kyber_aarch64_zetas_mul), [L_kyber_aarch64_to_msg_neon_low] "S" (L_kyber_aarch64_to_msg_neon_low), [L_kyber_aarch64_to_msg_neon_high] "S" (L_kyber_aarch64_to_msg_neon_high), [L_kyber_aarch64_to_msg_neon_bits] "S" (L_kyber_aarch64_to_msg_neon_bits), [L_kyber_aarch64_from_msg_neon_q1half] "S" (L_kyber_aarch64_from_msg_neon_q1half), [L_kyber_aarch64_from_msg_neon_bits] "S" (L_kyber_aarch64_from_msg_neon_bits), [L_kyber_aarch64_rej_uniform_neon_mask] "S" (L_kyber_aarch64_rej_uniform_neon_mask), [L_kyber_aarch64_rej_uniform_neon_bits] "S" 
(L_kyber_aarch64_rej_uniform_neon_bits), [L_kyber_aarch64_rej_uniform_neon_indeces] "S" (L_kyber_aarch64_rej_uniform_neon_indeces) + : "memory", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc" + ); +} + +void kyber_shake256_blocksx3_seed_neon(word64* state, byte* seed) +{ + __asm__ __volatile__ ( + "stp x29, x30, [sp, #-64]!\n\t" + "add x29, sp, #0\n\t" +#ifndef __APPLE__ + "adrp x28, %[L_sha3_aarch64_r]\n\t" + "add x28, x28, :lo12:%[L_sha3_aarch64_r]\n\t" +#else + "adrp x28, %[L_sha3_aarch64_r]@PAGE\n\t" + "add x28, x28, %[L_sha3_aarch64_r]@PAGEOFF\n\t" +#endif /* __APPLE__ */ + "str %x[state], [x29, #40]\n\t" + "add %x[state], %x[state], #32\n\t" + "ld1 {v4.d}[0], [%x[state]]\n\t" + "ldp x2, x3, [%x[seed]], #16\n\t" + "add %x[state], %x[state], #0xc8\n\t" + "ld1 {v4.d}[1], [%x[state]]\n\t" + "ldp x4, x5, [%x[seed]], #16\n\t" + "ldr x6, [%x[state], #200]\n\t" + "eor v5.16b, v5.16b, v5.16b\n\t" + "eor x7, x7, x7\n\t" + "eor v6.16b, v6.16b, v6.16b\n\t" + "eor x8, x8, x8\n\t" + "eor v7.16b, v7.16b, v7.16b\n\t" + "eor x9, x9, x9\n\t" + "eor v8.16b, v8.16b, v8.16b\n\t" + "eor x10, x10, x10\n\t" + "eor v9.16b, v9.16b, v9.16b\n\t" + "eor x11, x11, x11\n\t" + "eor v10.16b, v10.16b, v10.16b\n\t" + "eor x12, x12, x12\n\t" + "eor v11.16b, v11.16b, v11.16b\n\t" + "eor x13, x13, x13\n\t" + "eor v12.16b, v12.16b, v12.16b\n\t" + "eor x14, x14, x14\n\t" + "eor v13.16b, v13.16b, v13.16b\n\t" + "eor x15, x15, x15\n\t" + "eor v14.16b, v14.16b, v14.16b\n\t" + "eor x16, x16, x16\n\t" + "eor v15.16b, v15.16b, v15.16b\n\t" + "eor x17, x17, x17\n\t" + "movz x19, #0x8000, lsl 48\n\t" + "eor v17.16b, v17.16b, v17.16b\n\t" + "eor x20, x20, x20\n\t" + "eor v18.16b, v18.16b, v18.16b\n\t" + "eor x21, x21, x21\n\t" + "eor v19.16b, v19.16b, v19.16b\n\t" + "eor x22, x22, x22\n\t" + "eor v20.16b, v20.16b, v20.16b\n\t" + "eor x23, x23, x23\n\t" + "eor v21.16b, v21.16b, v21.16b\n\t" + "eor x24, x24, x24\n\t" + "eor v22.16b, v22.16b, v22.16b\n\t" + "eor x25, x25, x25\n\t" + "eor v23.16b, v23.16b, v23.16b\n\t" + "eor x26, x26, x26\n\t" + "eor v24.16b, v24.16b, v24.16b\n\t" + "eor x27, x27, x27\n\t" + "dup v0.2d, x2\n\t" + "dup v1.2d, x3\n\t" + "dup v2.2d, x4\n\t" + "dup v3.2d, x5\n\t" + "dup v16.2d, x19\n\t" + "mov %x[seed], #24\n\t" + /* Start of 24 rounds */ + "\n" + "L_SHA3_shake256_blocksx3_seed_neon_begin_%=: \n\t" + "stp x28, %x[seed], [x29, #48]\n\t" + /* Col Mix NEON */ + "eor v30.16b, v4.16b, v9.16b\n\t" + "eor %x[state], x6, x11\n\t" + "eor v27.16b, v1.16b, v6.16b\n\t" + "eor x30, x2, x7\n\t" + "eor v30.16b, v30.16b, v14.16b\n\t" + "eor x28, x4, x9\n\t" + "eor v27.16b, v27.16b, v11.16b\n\t" + "eor %x[state], %x[state], x16\n\t" + "eor v30.16b, v30.16b, v19.16b\n\t" + "eor x30, x30, x12\n\t" + "eor v27.16b, v27.16b, v16.16b\n\t" + "eor x28, x28, x14\n\t" + "eor v30.16b, v30.16b, v24.16b\n\t" + "eor %x[state], %x[state], x22\n\t" + "eor v27.16b, v27.16b, v21.16b\n\t" + "eor x30, x30, x17\n\t" + "ushr v25.2d, v27.2d, #63\n\t" + "eor x28, x28, x20\n\t" + "sli v25.2d, v27.2d, #1\n\t" + "eor %x[state], %x[state], x27\n\t" + "eor v25.16b, v25.16b, v30.16b\n\t" + "eor x30, x30, x23\n\t" + "eor v31.16b, v0.16b, v5.16b\n\t" + "eor x28, x28, x25\n\t" + "eor v28.16b, v2.16b, v7.16b\n\t" + 
"str %x[state], [x29, #32]\n\t" + "eor v31.16b, v31.16b, v10.16b\n\t" + "str x28, [x29, #24]\n\t" + "eor v28.16b, v28.16b, v12.16b\n\t" + "eor %x[seed], x3, x8\n\t" + "eor v31.16b, v31.16b, v15.16b\n\t" + "eor x28, x5, x10\n\t" + "eor v28.16b, v28.16b, v17.16b\n\t" + "eor %x[seed], %x[seed], x13\n\t" + "eor v31.16b, v31.16b, v20.16b\n\t" + "eor x28, x28, x15\n\t" + "eor v28.16b, v28.16b, v22.16b\n\t" + "eor %x[seed], %x[seed], x19\n\t" + "ushr v29.2d, v30.2d, #63\n\t" + "eor x28, x28, x21\n\t" + "ushr v26.2d, v28.2d, #63\n\t" + "eor %x[seed], %x[seed], x24\n\t" + "sli v29.2d, v30.2d, #1\n\t" + "eor x28, x28, x26\n\t" + "sli v26.2d, v28.2d, #1\n\t" + "eor %x[state], %x[state], %x[seed], ror 63\n\t" + "eor v28.16b, v28.16b, v29.16b\n\t" + "eor %x[seed], %x[seed], x28, ror 63\n\t" + "eor v29.16b, v3.16b, v8.16b\n\t" + "eor x2, x2, %x[state]\n\t" + "eor v26.16b, v26.16b, v31.16b\n\t" + "eor x7, x7, %x[state]\n\t" + "eor v29.16b, v29.16b, v13.16b\n\t" + "eor x12, x12, %x[state]\n\t" + "eor v29.16b, v29.16b, v18.16b\n\t" + "eor x17, x17, %x[state]\n\t" + "eor v29.16b, v29.16b, v23.16b\n\t" + "eor x23, x23, %x[state]\n\t" + "ushr v30.2d, v29.2d, #63\n\t" + "eor x4, x4, %x[seed]\n\t" + "sli v30.2d, v29.2d, #1\n\t" + "eor x9, x9, %x[seed]\n\t" + "eor v27.16b, v27.16b, v30.16b\n\t" + "eor x14, x14, %x[seed]\n\t" + "ushr v30.2d, v31.2d, #63\n\t" + "eor x20, x20, %x[seed]\n\t" + "sli v30.2d, v31.2d, #1\n\t" + "eor x25, x25, %x[seed]\n\t" + "eor v29.16b, v29.16b, v30.16b\n\t" + "ldr %x[state], [x29, #32]\n\t" + /* Swap Rotate NEON */ + "eor v0.16b, v0.16b, v25.16b\n\t" + "eor v31.16b, v1.16b, v26.16b\n\t" + "ldr %x[seed], [x29, #24]\n\t" + "eor v6.16b, v6.16b, v26.16b\n\t" + "eor x28, x28, x30, ror 63\n\t" + "ushr v30.2d, v31.2d, #63\n\t" + "eor x30, x30, %x[seed], ror 63\n\t" + "ushr v1.2d, v6.2d, #20\n\t" + "eor %x[seed], %x[seed], %x[state], ror 63\n\t" + "sli v30.2d, v31.2d, #1\n\t" + "eor x6, x6, x28\n\t" + "sli v1.2d, v6.2d, #44\n\t" + "eor x11, x11, x28\n\t" + "eor v31.16b, v9.16b, v29.16b\n\t" + "eor x16, x16, x28\n\t" + "eor v22.16b, v22.16b, v27.16b\n\t" + "eor x22, x22, x28\n\t" + "ushr v6.2d, v31.2d, #44\n\t" + "eor x27, x27, x28\n\t" + "ushr v9.2d, v22.2d, #3\n\t" + "eor x3, x3, x30\n\t" + "sli v6.2d, v31.2d, #20\n\t" + "eor x8, x8, x30\n\t" + "sli v9.2d, v22.2d, #61\n\t" + "eor x13, x13, x30\n\t" + "eor v31.16b, v14.16b, v29.16b\n\t" + "eor x19, x19, x30\n\t" + "eor v20.16b, v20.16b, v25.16b\n\t" + "eor x24, x24, x30\n\t" + "ushr v22.2d, v31.2d, #25\n\t" + "eor x5, x5, %x[seed]\n\t" + "ushr v14.2d, v20.2d, #46\n\t" + "eor x10, x10, %x[seed]\n\t" + "sli v22.2d, v31.2d, #39\n\t" + "eor x15, x15, %x[seed]\n\t" + "sli v14.2d, v20.2d, #18\n\t" + "eor x21, x21, %x[seed]\n\t" + "eor v31.16b, v2.16b, v27.16b\n\t" + "eor x26, x26, %x[seed]\n\t" + /* Swap Rotate Base */ + "eor v12.16b, v12.16b, v27.16b\n\t" + "ror %x[state], x3, #63\n\t" + "ushr v20.2d, v31.2d, #2\n\t" + "ror x3, x8, #20\n\t" + "ushr v2.2d, v12.2d, #21\n\t" + "ror x8, x11, #44\n\t" + "sli v20.2d, v31.2d, #62\n\t" + "ror x11, x25, #3\n\t" + "sli v2.2d, v12.2d, #43\n\t" + "ror x25, x16, #25\n\t" + "eor v31.16b, v13.16b, v28.16b\n\t" + "ror x16, x23, #46\n\t" + "eor v19.16b, v19.16b, v29.16b\n\t" + "ror x23, x4, #2\n\t" + "ushr v12.2d, v31.2d, #39\n\t" + "ror x4, x14, #21\n\t" + "ushr v13.2d, v19.2d, #56\n\t" + "ror x14, x15, #39\n\t" + "sli v12.2d, v31.2d, #25\n\t" + "ror x15, x22, #56\n\t" + "sli v13.2d, v19.2d, #8\n\t" + "ror x22, x26, #8\n\t" + "eor v31.16b, v23.16b, v28.16b\n\t" + "ror x26, x17, #23\n\t" + "eor v15.16b, v15.16b, 
v25.16b\n\t" + "ror x17, x6, #37\n\t" + "ushr v19.2d, v31.2d, #8\n\t" + "ror x6, x27, #50\n\t" + "ushr v23.2d, v15.2d, #23\n\t" + "ror x27, x24, #62\n\t" + "sli v19.2d, v31.2d, #56\n\t" + "ror x24, x10, #9\n\t" + "sli v23.2d, v15.2d, #41\n\t" + "ror x10, x19, #19\n\t" + "eor v31.16b, v4.16b, v29.16b\n\t" + "ror x19, x7, #28\n\t" + "eor v24.16b, v24.16b, v29.16b\n\t" + "ror x7, x5, #36\n\t" + "ushr v15.2d, v31.2d, #37\n\t" + "ror x5, x21, #43\n\t" + "ushr v4.2d, v24.2d, #50\n\t" + "ror x21, x20, #49\n\t" + "sli v15.2d, v31.2d, #27\n\t" + "ror x20, x13, #54\n\t" + "sli v4.2d, v24.2d, #14\n\t" + "ror x13, x9, #58\n\t" + "eor v31.16b, v21.16b, v26.16b\n\t" + "ror x9, x12, #61\n\t" + /* Row Mix Base */ + "eor v8.16b, v8.16b, v28.16b\n\t" + "bic x12, x4, x3\n\t" + "ushr v24.2d, v31.2d, #62\n\t" + "bic %x[seed], x5, x4\n\t" + "ushr v21.2d, v8.2d, #9\n\t" + "bic x28, x2, x6\n\t" + "sli v24.2d, v31.2d, #2\n\t" + "bic x30, x3, x2\n\t" + "sli v21.2d, v8.2d, #55\n\t" + "eor x2, x2, x12\n\t" + "eor v31.16b, v16.16b, v26.16b\n\t" + "eor x3, x3, %x[seed]\n\t" + "eor v5.16b, v5.16b, v25.16b\n\t" + "bic x12, x6, x5\n\t" + "ushr v8.2d, v31.2d, #19\n\t" + "eor x5, x5, x28\n\t" + "ushr v16.2d, v5.2d, #28\n\t" + "eor x4, x4, x12\n\t" + "sli v8.2d, v31.2d, #45\n\t" + "eor x6, x6, x30\n\t" + "sli v16.2d, v5.2d, #36\n\t" + "bic x12, x9, x8\n\t" + "eor v31.16b, v3.16b, v28.16b\n\t" + "bic %x[seed], x10, x9\n\t" + "eor v18.16b, v18.16b, v28.16b\n\t" + "bic x28, x7, x11\n\t" + "ushr v5.2d, v31.2d, #36\n\t" + "bic x30, x8, x7\n\t" + "ushr v3.2d, v18.2d, #43\n\t" + "eor x7, x7, x12\n\t" + "sli v5.2d, v31.2d, #28\n\t" + "eor x8, x8, %x[seed]\n\t" + "sli v3.2d, v18.2d, #21\n\t" + "bic x12, x11, x10\n\t" + "eor v31.16b, v17.16b, v27.16b\n\t" + "eor x10, x10, x28\n\t" + "eor v11.16b, v11.16b, v26.16b\n\t" + "eor x9, x9, x12\n\t" + "ushr v18.2d, v31.2d, #49\n\t" + "eor x11, x11, x30\n\t" + "ushr v17.2d, v11.2d, #54\n\t" + "bic x12, x14, x13\n\t" + "sli v18.2d, v31.2d, #15\n\t" + "bic %x[seed], x15, x14\n\t" + "sli v17.2d, v11.2d, #10\n\t" + "bic x28, %x[state], x16\n\t" + "eor v31.16b, v7.16b, v27.16b\n\t" + "bic x30, x13, %x[state]\n\t" + "eor v10.16b, v10.16b, v25.16b\n\t" + "eor x12, %x[state], x12\n\t" + "ushr v11.2d, v31.2d, #58\n\t" + "eor x13, x13, %x[seed]\n\t" + "ushr v7.2d, v10.2d, #61\n\t" + "bic %x[state], x16, x15\n\t" + "sli v11.2d, v31.2d, #6\n\t" + "eor x15, x15, x28\n\t" + "sli v7.2d, v10.2d, #3\n\t" + "eor x14, x14, %x[state]\n\t" + /* Row Mix NEON */ + "bic v25.16b, v2.16b, v1.16b\n\t" + "eor x16, x16, x30\n\t" + "bic v26.16b, v3.16b, v2.16b\n\t" + "bic %x[state], x20, x19\n\t" + "bic v27.16b, v4.16b, v3.16b\n\t" + "bic %x[seed], x21, x20\n\t" + "bic v28.16b, v0.16b, v4.16b\n\t" + "bic x28, x17, x22\n\t" + "bic v29.16b, v1.16b, v0.16b\n\t" + "bic x30, x19, x17\n\t" + "eor v0.16b, v0.16b, v25.16b\n\t" + "eor x17, x17, %x[state]\n\t" + "eor v1.16b, v1.16b, v26.16b\n\t" + "eor x19, x19, %x[seed]\n\t" + "eor v2.16b, v2.16b, v27.16b\n\t" + "bic %x[state], x22, x21\n\t" + "eor v3.16b, v3.16b, v28.16b\n\t" + "eor x21, x21, x28\n\t" + "eor v4.16b, v4.16b, v29.16b\n\t" + "eor x20, x20, %x[state]\n\t" + "bic v25.16b, v7.16b, v6.16b\n\t" + "eor x22, x22, x30\n\t" + "bic v26.16b, v8.16b, v7.16b\n\t" + "bic %x[state], x25, x24\n\t" + "bic v27.16b, v9.16b, v8.16b\n\t" + "bic %x[seed], x26, x25\n\t" + "bic v28.16b, v5.16b, v9.16b\n\t" + "bic x28, x23, x27\n\t" + "bic v29.16b, v6.16b, v5.16b\n\t" + "bic x30, x24, x23\n\t" + "eor v5.16b, v5.16b, v25.16b\n\t" + "eor x23, x23, %x[state]\n\t" + "eor v6.16b, v6.16b, 
v26.16b\n\t" + "eor x24, x24, %x[seed]\n\t" + "eor v7.16b, v7.16b, v27.16b\n\t" + "bic %x[state], x27, x26\n\t" + "eor v8.16b, v8.16b, v28.16b\n\t" + "eor x26, x26, x28\n\t" + "eor v9.16b, v9.16b, v29.16b\n\t" + "eor x25, x25, %x[state]\n\t" + "bic v25.16b, v12.16b, v11.16b\n\t" + "eor x27, x27, x30\n\t" + "bic v26.16b, v13.16b, v12.16b\n\t" + "bic v27.16b, v14.16b, v13.16b\n\t" + "bic v28.16b, v30.16b, v14.16b\n\t" + "bic v29.16b, v11.16b, v30.16b\n\t" + "eor v10.16b, v30.16b, v25.16b\n\t" + "eor v11.16b, v11.16b, v26.16b\n\t" + "eor v12.16b, v12.16b, v27.16b\n\t" + "eor v13.16b, v13.16b, v28.16b\n\t" + "eor v14.16b, v14.16b, v29.16b\n\t" + "bic v25.16b, v17.16b, v16.16b\n\t" + "bic v26.16b, v18.16b, v17.16b\n\t" + "bic v27.16b, v19.16b, v18.16b\n\t" + "bic v28.16b, v15.16b, v19.16b\n\t" + "bic v29.16b, v16.16b, v15.16b\n\t" + "eor v15.16b, v15.16b, v25.16b\n\t" + "eor v16.16b, v16.16b, v26.16b\n\t" + "eor v17.16b, v17.16b, v27.16b\n\t" + "eor v18.16b, v18.16b, v28.16b\n\t" + "eor v19.16b, v19.16b, v29.16b\n\t" + "bic v25.16b, v22.16b, v21.16b\n\t" + "bic v26.16b, v23.16b, v22.16b\n\t" + "bic v27.16b, v24.16b, v23.16b\n\t" + "bic v28.16b, v20.16b, v24.16b\n\t" + "bic v29.16b, v21.16b, v20.16b\n\t" + "eor v20.16b, v20.16b, v25.16b\n\t" + "eor v21.16b, v21.16b, v26.16b\n\t" + "eor v22.16b, v22.16b, v27.16b\n\t" + "eor v23.16b, v23.16b, v28.16b\n\t" + "eor v24.16b, v24.16b, v29.16b\n\t" + /* Done tranforming */ + "ldp x28, %x[seed], [x29, #48]\n\t" + "ldr %x[state], [x28], #8\n\t" + "subs %x[seed], %x[seed], #1\n\t" + "mov v30.d[0], %x[state]\n\t" + "mov v30.d[1], %x[state]\n\t" + "eor x2, x2, %x[state]\n\t" + "eor v0.16b, v0.16b, v30.16b\n\t" + "bne L_SHA3_shake256_blocksx3_seed_neon_begin_%=\n\t" + "ldr %x[state], [x29, #40]\n\t" + "st4 {v0.d, v1.d, v2.d, v3.d}[0], [%x[state]], #32\n\t" + "st4 {v4.d, v5.d, v6.d, v7.d}[0], [%x[state]], #32\n\t" + "st4 {v8.d, v9.d, v10.d, v11.d}[0], [%x[state]], #32\n\t" + "st4 {v12.d, v13.d, v14.d, v15.d}[0], [%x[state]], #32\n\t" + "st4 {v16.d, v17.d, v18.d, v19.d}[0], [%x[state]], #32\n\t" + "st4 {v20.d, v21.d, v22.d, v23.d}[0], [%x[state]], #32\n\t" + "st1 {v24.d}[0], [%x[state]]\n\t" + "add %x[state], %x[state], #8\n\t" + "st4 {v0.d, v1.d, v2.d, v3.d}[1], [%x[state]], #32\n\t" + "st4 {v4.d, v5.d, v6.d, v7.d}[1], [%x[state]], #32\n\t" + "st4 {v8.d, v9.d, v10.d, v11.d}[1], [%x[state]], #32\n\t" + "st4 {v12.d, v13.d, v14.d, v15.d}[1], [%x[state]], #32\n\t" + "st4 {v16.d, v17.d, v18.d, v19.d}[1], [%x[state]], #32\n\t" + "st4 {v20.d, v21.d, v22.d, v23.d}[1], [%x[state]], #32\n\t" + "st1 {v24.d}[1], [%x[state]]\n\t" + "add %x[state], %x[state], #8\n\t" + "stp x2, x3, [%x[state]]\n\t" + "stp x4, x5, [%x[state], #16]\n\t" + "stp x6, x7, [%x[state], #32]\n\t" + "stp x8, x9, [%x[state], #48]\n\t" + "stp x10, x11, [%x[state], #64]\n\t" + "stp x12, x13, [%x[state], #80]\n\t" + "stp x14, x15, [%x[state], #96]\n\t" + "stp x16, x17, [%x[state], #112]\n\t" + "stp x19, x20, [%x[state], #128]\n\t" + "stp x21, x22, [%x[state], #144]\n\t" + "stp x23, x24, [%x[state], #160]\n\t" + "stp x25, x26, [%x[state], #176]\n\t" + "str x27, [%x[state], #192]\n\t" + "ldp x29, x30, [sp], #0x40\n\t" + : [state] "+r" (state), [seed] "+r" (seed) + : [L_kyber_aarch64_q] "S" (L_kyber_aarch64_q), [L_kyber_aarch64_consts] "S" (L_kyber_aarch64_consts), [L_sha3_aarch64_r] "S" (L_sha3_aarch64_r), [L_kyber_aarch64_zetas] "S" (L_kyber_aarch64_zetas), [L_kyber_aarch64_zetas_qinv] "S" (L_kyber_aarch64_zetas_qinv), [L_kyber_aarch64_zetas_inv] "S" (L_kyber_aarch64_zetas_inv), 
[L_kyber_aarch64_zetas_inv_qinv] "S" (L_kyber_aarch64_zetas_inv_qinv), [L_kyber_aarch64_zetas_mul] "S" (L_kyber_aarch64_zetas_mul), [L_kyber_aarch64_to_msg_neon_low] "S" (L_kyber_aarch64_to_msg_neon_low), [L_kyber_aarch64_to_msg_neon_high] "S" (L_kyber_aarch64_to_msg_neon_high), [L_kyber_aarch64_to_msg_neon_bits] "S" (L_kyber_aarch64_to_msg_neon_bits), [L_kyber_aarch64_from_msg_neon_q1half] "S" (L_kyber_aarch64_from_msg_neon_q1half), [L_kyber_aarch64_from_msg_neon_bits] "S" (L_kyber_aarch64_from_msg_neon_bits), [L_kyber_aarch64_rej_uniform_neon_mask] "S" (L_kyber_aarch64_rej_uniform_neon_mask), [L_kyber_aarch64_rej_uniform_neon_bits] "S" (L_kyber_aarch64_rej_uniform_neon_bits), [L_kyber_aarch64_rej_uniform_neon_indeces] "S" (L_kyber_aarch64_rej_uniform_neon_indeces) + : "memory", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc" + ); +} + +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ +#endif /* WOLFSSL_WC_KYBER */ +#endif /* __aarch64__ */ +#endif /* WOLFSSL_ARMASM */ +#endif /* WOLFSSL_ARMASM_INLINE */ diff --git a/wolfcrypt/src/port/arm/armv8-sha3-asm.S b/wolfcrypt/src/port/arm/armv8-sha3-asm.S index 1652f41b4c..112e2d0d86 100644 --- a/wolfcrypt/src/port/arm/armv8-sha3-asm.S +++ b/wolfcrypt/src/port/arm/armv8-sha3-asm.S @@ -47,29 +47,29 @@ .p2align 3 #endif /* __APPLE__ */ L_SHA3_transform_crypto_r: - .xword 0x1 - .xword 0x8082 + .xword 0x0000000000000001 + .xword 0x0000000000008082 .xword 0x800000000000808a .xword 0x8000000080008000 - .xword 0x808b - .xword 0x80000001 + .xword 0x000000000000808b + .xword 0x0000000080000001 .xword 0x8000000080008081 .xword 0x8000000000008009 - .xword 0x8a - .xword 0x88 - .xword 0x80008009 - .xword 0x8000000a - .xword 0x8000808b + .xword 0x000000000000008a + .xword 0x0000000000000088 + .xword 0x0000000080008009 + .xword 0x000000008000000a + .xword 0x000000008000808b .xword 0x800000000000008b .xword 0x8000000000008089 .xword 0x8000000000008003 .xword 0x8000000000008002 .xword 0x8000000000000080 - .xword 0x800a + .xword 0x000000000000800a .xword 0x800000008000000a .xword 0x8000000080008081 .xword 0x8000000000008080 - .xword 0x80000001 + .xword 0x0000000080000001 .xword 0x8000000080008008 #ifndef __APPLE__ .text @@ -206,6 +206,251 @@ L_sha3_crypto_begin: #ifndef __APPLE__ .size BlockSha3,.-BlockSha3 #endif /* __APPLE__ */ +#else +#ifndef __APPLE__ + .text + .type L_SHA3_transform_base_r, %object + .section .rodata + .size L_SHA3_transform_base_r, 192 +#else + .section __DATA,__data +#endif /* __APPLE__ */ +#ifndef __APPLE__ + .align 3 +#else + .p2align 3 +#endif /* __APPLE__ */ +L_SHA3_transform_base_r: + .xword 0x0000000000000001 + .xword 0x0000000000008082 + .xword 0x800000000000808a + .xword 0x8000000080008000 + .xword 0x000000000000808b + .xword 0x0000000080000001 + .xword 0x8000000080008081 + .xword 0x8000000000008009 + .xword 0x000000000000008a + .xword 0x0000000000000088 + .xword 0x0000000080008009 + .xword 0x000000008000000a + .xword 0x000000008000808b + .xword 0x800000000000008b + .xword 0x8000000000008089 + .xword 0x8000000000008003 + .xword 0x8000000000008002 + .xword 0x8000000000000080 + .xword 0x000000000000800a + .xword 0x800000008000000a + .xword 0x8000000080008081 + .xword 0x8000000000008080 + .xword 
0x0000000080000001 + .xword 0x8000000080008008
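The 24 .xword entries above are the Keccak-f[1600] round constants, consumed one per round by the iota step; this base BlockSha3 is used when the ARMv8 SHA-3 crypto extensions are not available (WOLFSSL_ARMASM_CRYPTO_SHA3 unset) and keeps all 25 state lanes in general-purpose registers, unrolling theta (the column parities), rho/pi (the "Swap Rotate" block), chi (the "Row Mix" block) and iota each round. For orientation, a plain-C model of one round (an illustrative sketch, not patch code; keccak_round_ref and ROTL64 are hypothetical names, with the standard rho/pi tables written out):

#include <stdint.h>

#define ROTL64(x, n) (((x) << (n)) | ((x) >> (64 - (n))))

/* One Keccak-f[1600] round over 25 64-bit lanes; rc is one entry of the
 * round-constant table above. */
static void keccak_round_ref(uint64_t s[25], uint64_t rc)
{
    static const unsigned char rho[24] = {
        1, 3, 6, 10, 15, 21, 28, 36, 45, 55, 2, 14,
        27, 41, 56, 8, 25, 43, 62, 18, 39, 61, 20, 44
    };
    static const unsigned char pi[24] = {
        10, 7, 11, 17, 18, 3, 5, 16, 8, 21, 24, 4,
        15, 23, 19, 13, 12, 2, 20, 14, 22, 9, 6, 1
    };
    uint64_t c[5], d, t, u;
    int x, y;

    /* Theta: each lane absorbs the parity of two neighbouring columns. */
    for (x = 0; x < 5; x++)
        c[x] = s[x] ^ s[x + 5] ^ s[x + 10] ^ s[x + 15] ^ s[x + 20];
    for (x = 0; x < 5; x++) {
        d = c[(x + 4) % 5] ^ ROTL64(c[(x + 1) % 5], 1);
        for (y = 0; y < 25; y += 5)
            s[x + y] ^= d;
    }
    /* Rho and pi: rotate every lane and move it to its new position
     * (the "Swap Rotate" block in the assembly). */
    t = s[1];
    for (x = 0; x < 24; x++) {
        u = s[pi[x]];
        s[pi[x]] = ROTL64(t, rho[x]);
        t = u;
    }
    /* Chi: the non-linear step, s[x] ^= ~s[x+1] & s[x+2] per row
     * (the "Row Mix" block: bic then eor). */
    for (y = 0; y < 25; y += 5) {
        for (x = 0; x < 5; x++)
            c[x] = s[y + x];
        for (x = 0; x < 5; x++)
            s[y + x] = c[x] ^ ((~c[(x + 1) % 5]) & c[(x + 2) % 5]);
    }
    /* Iota: fold in the round constant. */
    s[0] ^= rc;
}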
+#ifndef __APPLE__ +.text +.globl BlockSha3 +.type BlockSha3,@function +.align 2 +BlockSha3: +#else +.section __TEXT,__text +.globl _BlockSha3 +.p2align 2 +_BlockSha3: +#endif /* __APPLE__ */ + stp x29, x30, [sp, #-160]! + add x29, sp, #0 + stp x17, x19, [x29, #72] + stp x20, x21, [x29, #88] + stp x22, x23, [x29, #104] + stp x24, x25, [x29, #120] + stp x26, x27, [x29, #136] + str x28, [x29, #152] +#ifndef __APPLE__ + adrp x27, L_SHA3_transform_base_r + add x27, x27, :lo12:L_SHA3_transform_base_r +#else + adrp x27, L_SHA3_transform_base_r@PAGE + add x27, x27, L_SHA3_transform_base_r@PAGEOFF +#endif /* __APPLE__ */ + ldp x1, x2, [x0] + ldp x3, x4, [x0, #16] + ldp x5, x6, [x0, #32] + ldp x7, x8, [x0, #48] + ldp x9, x10, [x0, #64] + ldp x11, x12, [x0, #80] + ldp x13, x14, [x0, #96] + ldp x15, x16, [x0, #112] + ldp x17, x19, [x0, #128] + ldp x20, x21, [x0, #144] + ldp x22, x23, [x0, #160] + ldp x24, x25, [x0, #176] + ldr x26, [x0, #192] + str x0, [x29, #40] + mov x28, #24 + # Start of 24 rounds +L_SHA3_transform_base_begin: + stp x27, x28, [x29, #48] + eor x0, x5, x10 + eor x30, x1, x6 + eor x28, x3, x8 + eor x0, x0, x15 + eor x30, x30, x11 + eor x28, x28, x13 + eor x0, x0, x21 + eor x30, x30, x16 + eor x28, x28, x19 + eor x0, x0, x26 + eor x30, x30, x22 + eor x28, x28, x24 + str x0, [x29, #32] + str x28, [x29, #24] + eor x27, x2, x7 + eor x28, x4, x9 + eor x27, x27, x12 + eor x28, x28, x14 + eor x27, x27, x17 + eor x28, x28, x20 + eor x27, x27, x23 + eor x28, x28, x25 + eor x0, x0, x27, ror 63 + eor x27, x27, x28, ror 63 + eor x1, x1, x0 + eor x6, x6, x0 + eor x11, x11, x0 + eor x16, x16, x0 + eor x22, x22, x0 + eor x3, x3, x27 + eor x8, x8, x27 + eor x13, x13, x27 + eor x19, x19, x27 + eor x24, x24, x27 + ldr x0, [x29, #32] + ldr x27, [x29, #24] + eor x28, x28, x30, ror 63 + eor x30, x30, x27, ror 63 + eor x27, x27, x0, ror 63 + eor x5, x5, x28 + eor x10, x10, x28 + eor x15, x15, x28 + eor x21, x21, x28 + eor x26, x26, x28 + eor x2, x2, x30 + eor x7, x7, x30 + eor x12, x12, x30 + eor x17, x17, x30 + eor x23, x23, x30 + eor x4, x4, x27 + eor x9, x9, x27 + eor x14, x14, x27 + eor x20, x20, x27 + eor x25, x25, x27 + # Swap Rotate + ror x0, x2, #63 + ror x2, x7, #20 + ror x7, x10, #44 + ror x10, x24, #3 + ror x24, x15, #25 + ror x15, x22, #46 + ror x22, x3, #2 + ror x3, x13, #21 + ror x13, x14, #39 + ror x14, x21, #56 + ror x21, x25, #8 + ror x25, x16, #23 + ror x16, x5, #37 + ror x5, x26, #50 + ror x26, x23, #62 + ror x23, x9, #9 + ror x9, x17, #19 + ror x17, x6, #28 + ror x6, x4, #36 + ror x4, x20, #43 + ror x20, x19, #49 + ror x19, x12, #54 + ror x12, x8, #58 + ror x8, x11, #61 + # Row Mix + bic x11, x3, x2 + bic x27, x4, x3 + bic x28, x1, x5 + bic x30, x2, x1 + eor x1, x1, x11 + eor x2, x2, x27 + bic x11, x5, x4 + eor x4, x4, x28 + eor x3, x3, x11 + eor x5, x5, x30 + bic x11, x8, x7 + bic x27, x9, x8 + bic x28, x6, x10 + bic x30, x7, x6 + eor x6, x6, x11 + eor x7, x7, x27 + bic x11, x10, x9 + eor x9, x9, x28 + eor x8, x8, x11 + eor x10, x10, x30 + bic x11, x13, x12 + bic x27, x14, x13 + bic x28, x0, x15 + bic x30, x12, x0 + eor x11, x0, x11 + eor x12, x12, x27 + bic x0, x15, x14 + eor x14, x14, x28 + eor x13, x13, x0 + eor x15, x15, x30 + bic x0, x19, x17 + bic x27, x20, x19 + bic x28, x16, x21 + bic x30, x17, x16 + eor x16, x16, x0 + eor x17, x17, x27 + bic x0, x21, x20 + eor x20, x20, x28 + eor x19, x19, x0 + eor x21, x21, x30 + bic x0, x24, x23 + bic x27, x25, x24 + bic x28, x22, x26 + bic x30, x23, x22 + eor x22, x22, x0 + eor x23, x23, x27 + bic x0, x26, x25 + eor x25, x25, x28 + eor x24, x24, x0 + eor x26, x26, x30 + # Done transforming + ldp x27, x28, [x29, #48] + ldr x0, [x27], #8 + subs x28, x28, #1 + eor x1, x1, x0 + bne L_SHA3_transform_base_begin + ldr x0, [x29, #40] + stp x1, x2, [x0] + stp x3, x4, [x0, #16] + stp x5, x6, [x0, #32] + stp x7, x8, [x0, #48] + stp x9, x10, [x0, #64] + stp x11, x12, [x0, #80] + stp x13, x14, [x0, #96] + stp x15, x16, [x0, #112] + stp x17, x19, [x0, #128] + stp x20, x21, [x0, #144] + stp x22, x23, [x0, #160] + stp x24, x25, [x0, #176] + str x26, [x0, #192] + ldp x17, x19, [x29, #72] + ldp x20, x21, [x29, #88] + ldp x22, x23, [x29, #104] + ldp x24, x25, [x29, #120] + ldp x26, x27, [x29, #136] + ldr x28, [x29, #152] + ldp x29, x30, [sp], #0xa0 + ret +#ifndef __APPLE__ + .size BlockSha3,.-BlockSha3 +#endif /* __APPLE__ */ #endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ #endif /* WOLFSSL_SHA3 */ #endif /* __aarch64__ */ 
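The armv8-sha3-asm_c.c change that follows carries the inline-assembly form of the same base BlockSha3 routine, for builds that compile the assembly through the C compiler. Either way, callers treat BlockSha3 as one Keccak-f[1600] permutation over a 25-lane state. A minimal sketch of the SHAKE-128 squeeze pattern the Kyber matrix generation later builds on it (illustrative only: shake128_prime and shake128_next_block are hypothetical names, the priming mirrors kyber_gen_matrix_k2_aarch64 further down, and SHAKE-128's 168-byte rate is assumed):

#include <stdint.h>
#include <string.h>

#define SHAKE128_RATE 168 /* bytes squeezed per permutation */

void BlockSha3(unsigned long* state); /* the routine added in this patch */

/* Prime a 25-lane state from a 32-byte seed and a 2-byte domain value. */
static void shake128_prime(uint64_t state[25], const uint8_t seed[32],
                           uint16_t domain)
{
    memcpy(state, seed, 32);                  /* lanes 0..3: the seed */
    state[4] = 0x1f0000UL | domain;           /* indices plus 0x1f padding */
    memset(state + 5, 0, sizeof(uint64_t) * (25 - 5));
    state[20] = 0x8000000000000000UL;         /* last lane of the rate */
}

/* Squeeze the next 168-byte XOF block. */
static void shake128_next_block(uint64_t state[25], uint8_t* out)
{
    BlockSha3((unsigned long*)state);         /* one Keccak-f[1600] */
    memcpy(out, state, SHAKE128_RATE);        /* rate lanes are the output */
}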
%x[state]\n\t" + "eor x16, x16, %x[state]\n\t" + "eor x22, x22, %x[state]\n\t" + "eor x3, x3, x27\n\t" + "eor x8, x8, x27\n\t" + "eor x13, x13, x27\n\t" + "eor x19, x19, x27\n\t" + "eor x24, x24, x27\n\t" + "ldr %x[state], [x29, #32]\n\t" + "ldr x27, [x29, #24]\n\t" + "eor x28, x28, x30, ror 63\n\t" + "eor x30, x30, x27, ror 63\n\t" + "eor x27, x27, %x[state], ror 63\n\t" + "eor x5, x5, x28\n\t" + "eor x10, x10, x28\n\t" + "eor x15, x15, x28\n\t" + "eor x21, x21, x28\n\t" + "eor x26, x26, x28\n\t" + "eor x2, x2, x30\n\t" + "eor x7, x7, x30\n\t" + "eor x12, x12, x30\n\t" + "eor x17, x17, x30\n\t" + "eor x23, x23, x30\n\t" + "eor x4, x4, x27\n\t" + "eor x9, x9, x27\n\t" + "eor x14, x14, x27\n\t" + "eor x20, x20, x27\n\t" + "eor x25, x25, x27\n\t" + /* Swap Rotate */ + "ror %x[state], x2, #63\n\t" + "ror x2, x7, #20\n\t" + "ror x7, x10, #44\n\t" + "ror x10, x24, #3\n\t" + "ror x24, x15, #25\n\t" + "ror x15, x22, #46\n\t" + "ror x22, x3, #2\n\t" + "ror x3, x13, #21\n\t" + "ror x13, x14, #39\n\t" + "ror x14, x21, #56\n\t" + "ror x21, x25, #8\n\t" + "ror x25, x16, #23\n\t" + "ror x16, x5, #37\n\t" + "ror x5, x26, #50\n\t" + "ror x26, x23, #62\n\t" + "ror x23, x9, #9\n\t" + "ror x9, x17, #19\n\t" + "ror x17, x6, #28\n\t" + "ror x6, x4, #36\n\t" + "ror x4, x20, #43\n\t" + "ror x20, x19, #49\n\t" + "ror x19, x12, #54\n\t" + "ror x12, x8, #58\n\t" + "ror x8, x11, #61\n\t" + /* Row Mix */ + "bic x11, x3, x2\n\t" + "bic x27, x4, x3\n\t" + "bic x28, x1, x5\n\t" + "bic x30, x2, x1\n\t" + "eor x1, x1, x11\n\t" + "eor x2, x2, x27\n\t" + "bic x11, x5, x4\n\t" + "eor x4, x4, x28\n\t" + "eor x3, x3, x11\n\t" + "eor x5, x5, x30\n\t" + "bic x11, x8, x7\n\t" + "bic x27, x9, x8\n\t" + "bic x28, x6, x10\n\t" + "bic x30, x7, x6\n\t" + "eor x6, x6, x11\n\t" + "eor x7, x7, x27\n\t" + "bic x11, x10, x9\n\t" + "eor x9, x9, x28\n\t" + "eor x8, x8, x11\n\t" + "eor x10, x10, x30\n\t" + "bic x11, x13, x12\n\t" + "bic x27, x14, x13\n\t" + "bic x28, %x[state], x15\n\t" + "bic x30, x12, %x[state]\n\t" + "eor x11, %x[state], x11\n\t" + "eor x12, x12, x27\n\t" + "bic %x[state], x15, x14\n\t" + "eor x14, x14, x28\n\t" + "eor x13, x13, %x[state]\n\t" + "eor x15, x15, x30\n\t" + "bic %x[state], x19, x17\n\t" + "bic x27, x20, x19\n\t" + "bic x28, x16, x21\n\t" + "bic x30, x17, x16\n\t" + "eor x16, x16, %x[state]\n\t" + "eor x17, x17, x27\n\t" + "bic %x[state], x21, x20\n\t" + "eor x20, x20, x28\n\t" + "eor x19, x19, %x[state]\n\t" + "eor x21, x21, x30\n\t" + "bic %x[state], x24, x23\n\t" + "bic x27, x25, x24\n\t" + "bic x28, x22, x26\n\t" + "bic x30, x23, x22\n\t" + "eor x22, x22, %x[state]\n\t" + "eor x23, x23, x27\n\t" + "bic %x[state], x26, x25\n\t" + "eor x25, x25, x28\n\t" + "eor x24, x24, %x[state]\n\t" + "eor x26, x26, x30\n\t" + /* Done tranforming */ + "ldp x27, x28, [x29, #48]\n\t" + "ldr %x[state], [x27], #8\n\t" + "subs x28, x28, #1\n\t" + "eor x1, x1, %x[state]\n\t" + "bne L_SHA3_transform_base_begin_%=\n\t" + "ldr %x[state], [x29, #40]\n\t" + "stp x1, x2, [%x[state]]\n\t" + "stp x3, x4, [%x[state], #16]\n\t" + "stp x5, x6, [%x[state], #32]\n\t" + "stp x7, x8, [%x[state], #48]\n\t" + "stp x9, x10, [%x[state], #64]\n\t" + "stp x11, x12, [%x[state], #80]\n\t" + "stp x13, x14, [%x[state], #96]\n\t" + "stp x15, x16, [%x[state], #112]\n\t" + "stp x17, x19, [%x[state], #128]\n\t" + "stp x20, x21, [%x[state], #144]\n\t" + "stp x22, x23, [%x[state], #160]\n\t" + "stp x24, x25, [%x[state], #176]\n\t" + "str x26, [%x[state], #192]\n\t" + "ldp x29, x30, [sp], #0x40\n\t" + : [state] "+r" (state) + : [L_SHA3_transform_base_r] "S" 
(L_SHA3_transform_base_r) + : "memory", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "cc" + ); +} + #endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ #endif /* WOLFSSL_SHA3 */ #endif /* __aarch64__ */ diff --git a/wolfcrypt/src/port/arm/armv8-sha512-asm.S b/wolfcrypt/src/port/arm/armv8-sha512-asm.S index 5ff72c37b7..139b3e42ff 100644 --- a/wolfcrypt/src/port/arm/armv8-sha512-asm.S +++ b/wolfcrypt/src/port/arm/armv8-sha512-asm.S @@ -65,7 +65,7 @@ L_SHA512_transform_neon_len_k: .xword 0xc19bf174cf692694 .xword 0xe49b69c19ef14ad2 .xword 0xefbe4786384f25e3 - .xword 0xfc19dc68b8cd5b5 + .xword 0x0fc19dc68b8cd5b5 .xword 0x240ca1cc77ac9c65 .xword 0x2de92c6f592b0275 .xword 0x4a7484aa6ea6e483 @@ -77,7 +77,7 @@ L_SHA512_transform_neon_len_k: .xword 0xbf597fc7beef0ee4 .xword 0xc6e00bf33da88fc2 .xword 0xd5a79147930aa725 - .xword 0x6ca6351e003826f + .xword 0x06ca6351e003826f .xword 0x142929670a0e6e70 .xword 0x27b70a8546d22ffc .xword 0x2e1b21385c26c926 @@ -115,8 +115,8 @@ L_SHA512_transform_neon_len_k: .xword 0xd186b8c721c0c207 .xword 0xeada7dd6cde0eb1e .xword 0xf57d4f7fee6ed178 - .xword 0x6f067aa72176fba - .xword 0xa637dc5a2c898a6 + .xword 0x06f067aa72176fba + .xword 0x0a637dc5a2c898a6 .xword 0x113f9804bef90dae .xword 0x1b710b35131c471b .xword 0x28db77f523047d84 @@ -156,8 +156,7 @@ _Transform_Sha512_Len_neon: #endif /* __APPLE__ */ stp x29, x30, [sp, #-128]! add x29, sp, #0 - str x17, [x29, #16] - str x19, [x29, #24] + stp x17, x19, [x29, #16] stp x20, x21, [x29, #32] stp x22, x23, [x29, #48] stp x24, x25, [x29, #64] @@ -1082,8 +1081,7 @@ L_sha512_len_neon_start: stp x6, x7, [x0, #16] stp x8, x9, [x0, #32] stp x10, x11, [x0, #48] - ldr x17, [x29, #16] - ldr x19, [x29, #24] + ldp x17, x19, [x29, #16] ldp x20, x21, [x29, #32] ldp x22, x23, [x29, #48] ldp x24, x25, [x29, #64] @@ -1128,7 +1126,7 @@ L_SHA512_transform_crypto_len_k: .xword 0xc19bf174cf692694 .xword 0xe49b69c19ef14ad2 .xword 0xefbe4786384f25e3 - .xword 0xfc19dc68b8cd5b5 + .xword 0x0fc19dc68b8cd5b5 .xword 0x240ca1cc77ac9c65 .xword 0x2de92c6f592b0275 .xword 0x4a7484aa6ea6e483 @@ -1140,7 +1138,7 @@ L_SHA512_transform_crypto_len_k: .xword 0xbf597fc7beef0ee4 .xword 0xc6e00bf33da88fc2 .xword 0xd5a79147930aa725 - .xword 0x6ca6351e003826f + .xword 0x06ca6351e003826f .xword 0x142929670a0e6e70 .xword 0x27b70a8546d22ffc .xword 0x2e1b21385c26c926 @@ -1178,8 +1176,8 @@ L_SHA512_transform_crypto_len_k: .xword 0xd186b8c721c0c207 .xword 0xeada7dd6cde0eb1e .xword 0xf57d4f7fee6ed178 - .xword 0x6f067aa72176fba - .xword 0xa637dc5a2c898a6 + .xword 0x06f067aa72176fba + .xword 0x0a637dc5a2c898a6 .xword 0x113f9804bef90dae .xword 0x1b710b35131c471b .xword 0x28db77f523047d84 diff --git a/wolfcrypt/src/sha3.c b/wolfcrypt/src/sha3.c index 2bba29bcef..1a3596a61a 100644 --- a/wolfcrypt/src/sha3.c +++ b/wolfcrypt/src/sha3.c @@ -62,8 +62,7 @@ } #endif -#if (!defined(WOLFSSL_ARMASM) || (!defined(__arm__) && \ - !defined(WOLFSSL_ARMASM_CRYPTO_SHA3))) && !defined(WOLFSSL_RISCV_ASM) +#if !defined(WOLFSSL_ARMASM) && !defined(WOLFSSL_RISCV_ASM) #ifdef USE_INTEL_SPEEDUP #include diff --git a/wolfcrypt/src/wc_kyber.c b/wolfcrypt/src/wc_kyber.c index 8e56bcc0e0..aa03a42b67 100644 --- a/wolfcrypt/src/wc_kyber.c +++ b/wolfcrypt/src/wc_kyber.c @@ -51,10 +51,11 @@ /* Use SHA3-512 to generate 64-bytes of hash. */ #define KYBER_HASH_G kyber_hash512 /* Use SHAKE-256 as a key derivation function (KDF). 
*/ -#ifdef USE_INTEL_SPEEDUP -#define KYBER_KDF kyber_kdf +#if defined(USE_INTEL_SPEEDUP) || \ + (defined(WOLFSSL_ARMASM) && defined(__aarch64__)) + #define KYBER_KDF kyber_kdf #else -#define KYBER_KDF wc_Shake256Hash + #define KYBER_KDF wc_Shake256Hash #endif /******************************************************************************/ diff --git a/wolfcrypt/src/wc_kyber_poly.c b/wolfcrypt/src/wc_kyber_poly.c index cf8a5b03e5..4321f1a051 100644 --- a/wolfcrypt/src/wc_kyber_poly.c +++ b/wolfcrypt/src/wc_kyber_poly.c @@ -167,6 +167,7 @@ const sword16 zetas_inv[KYBER_N / 2] = { }; +#if !(defined(__aarch64__) && defined(WOLFSSL_ARMASM)) /* Number-Theoretic Transform. * * @param [in, out] r Polynomial to transform. @@ -1045,6 +1046,7 @@ static void kyber_basemul_mont_add(sword16* r, const sword16* a, } #endif } +#endif /* Pointwise multiply elements of a and b, into r, and multiply by 2^-16. * @@ -1078,6 +1080,110 @@ void kyber_init(void) /******************************************************************************/ +#if defined(__aarch64__) && defined(WOLFSSL_ARMASM) + +/* Generate a public-private key pair from randomly generated data. + * + * @param [in, out] priv Private key vector of polynomials. + * @param [out] pub Public key vector of polynomials. + * @param [in] e Error values as a vector of polynomials. Modified. + * @param [in] a Random values in an array of vectors of polynomials. + * @param [in] kp Number of polynomials in vector. + */ +void kyber_keygen(sword16* priv, sword16* pub, sword16* e, const sword16* a, + int kp) +{ + int i; + + /* Transform private key. All of result used in public key calculation */ + for (i = 0; i < kp; ++i) { + kyber_ntt(priv + i * KYBER_N); + } + + /* For each polynomial in the vectors. */ + for (i = 0; i < kp; ++i) { + /* Multiply a by private into public polynomial. */ + kyber_pointwise_acc_mont(pub + i * KYBER_N, a + i * kp * KYBER_N, priv, + kp); + /* Convert public polynomial to Montgomery form. */ + kyber_to_mont(pub + i * KYBER_N); + /* Transform error values polynomial. */ + kyber_ntt(e + i * KYBER_N); + /* Add errors to public key and reduce. */ + kyber_add_reduce(pub + i * KYBER_N, e + i * KYBER_N); + } +} + +/* Encapsulate message. + * + * @param [in] pub Public key vector of polynomials. + * @param [out] bp Vector of polynomials. + * @param [out] v Polynomial. + * @param [in] at Array of vector of polynomials. + * @param [in] sp Vector of polynomials. + * @param [in] ep Error vector of polynomials. + * @param [in] epp Error polynomial. + * @param [in] m Message polynomial. + * @param [in] kp Number of polynomials in vector. + */ +void kyber_encapsulate(const sword16* pub, sword16* bp, sword16* v, + const sword16* at, sword16* sp, const sword16* ep, const sword16* epp, + const sword16* m, int kp) +{ + int i; + + /* Transform sp. All of result used in calculation of bp and v. */ + for (i = 0; i < kp; ++i) { + kyber_ntt(sp + i * KYBER_N); + } + + /* For each polynomial in the vectors. */ + for (i = 0; i < kp; ++i) { + /* Multiply at by sp into bp polynomial. */ + kyber_pointwise_acc_mont(bp + i * KYBER_N, at + i * kp * KYBER_N, sp, + kp); + /* Inverse transform bp polynomial. */ + kyber_invntt(bp + i * KYBER_N); + /* Add errors to bp and reduce. */ + kyber_add_reduce(bp + i * KYBER_N, ep + i * KYBER_N); + } + + /* Multiply public key by sp into v polynomial. */ + kyber_pointwise_acc_mont(v, pub, sp, kp); + /* Inverse transform v. */ + kyber_invntt(v); + /* Add errors and message to v and reduce. */ + kyber_add3_reduce(v, epp, m); +}
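Restating the algebra these routines implement (a reading aid, not patch content): writing hats for NTT-domain values and $\circ$ for the pointwise product, encapsulation computes

$$ bp_i = \mathrm{NTT}^{-1}\Big(\sum_j \widehat{at}_{ij} \circ \widehat{sp}_j\Big) + ep_i, \qquad v = \mathrm{NTT}^{-1}\big(\widehat{pub}^{T} \circ \widehat{sp}\big) + epp + m, $$

and decapsulation below recovers $mp = v - \mathrm{NTT}^{-1}\big(\widehat{priv}^{T} \circ \widehat{bp}\big)$, that is, the message plus small noise terms which the later to-message conversion rounds away.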
+ +/* Decapsulate message. + * + * @param [in] priv Private key vector of polynomials. + * @param [out] mp Message polynomial. + * @param [in] bp Vector of polynomials containing error. + * @param [in] v Encapsulated message polynomial. + * @param [in] kp Number of polynomials in vector. + */ +void kyber_decapsulate(const sword16* priv, sword16* mp, sword16* bp, + const sword16* v, int kp) +{ + int i; + + /* Transform bp. All of result used in calculation of mp. */ + for (i = 0; i < kp; ++i) { + kyber_ntt(bp + i * KYBER_N); + } + + /* Multiply private key by bp into mp polynomial. */ + kyber_pointwise_acc_mont(mp, priv, bp, kp); + /* Inverse transform mp. */ + kyber_invntt(mp); + /* Subtract errors (mp) out of v and reduce into mp. */ + kyber_rsub_reduce(mp, v); +} + +#else + /* Generate a public-private key pair from randomly generated data. * * @param [in, out] priv Private key vector of polynomials. @@ -1269,6 +1375,8 @@ void kyber_decapsulate(const sword16* priv, sword16* mp, sword16* bp, } } +#endif + /******************************************************************************/ #ifdef USE_INTEL_SPEEDUP @@ -1578,8 +1686,237 @@ static int kyber_gen_matrix_k4_avx2(sword16* a, byte* seed, int transposed) return 0; } #endif /* KYBER1024 */ +#elif defined(WOLFSSL_ARMASM) && defined(__aarch64__) +#ifdef WOLFSSL_KYBER512 +/* Deterministically generate a matrix (or transpose) of uniform integers mod q. + * + * Seed used with XOF to generate random bytes. + * + * @param [out] a Matrix of uniform integers. + * @param [in] seed Bytes to seed XOF generation. + * @param [in] transposed Whether A or A^T is generated. + * @return 0 on success. + */ +static int kyber_gen_matrix_k2_aarch64(sword16* a, byte* seed, int transposed) +{ + word64 state[3 * 25]; + word64* st = (word64*)state; + unsigned int ctr0; + unsigned int ctr1; + unsigned int ctr2; + byte* p; + + if (!transposed) { + state[0*25 + 4] = 0x1f0000 + (0 << 8) + 0; + state[1*25 + 4] = 0x1f0000 + (0 << 8) + 1; + state[2*25 + 4] = 0x1f0000 + (1 << 8) + 0; + } + else { + state[0*25 + 4] = 0x1f0000 + (0 << 8) + 0; + state[1*25 + 4] = 0x1f0000 + (1 << 8) + 0; + state[2*25 + 4] = 0x1f0000 + (0 << 8) + 1; + } + + kyber_shake128_blocksx3_seed_neon(state, seed); + /* Sample random bytes to create a polynomial. */ + p = (byte*)st; + ctr0 = kyber_rej_uniform_neon(a + 0 * KYBER_N, KYBER_N, p, XOF_BLOCK_SIZE); + p += 25 * 8; + ctr1 = kyber_rej_uniform_neon(a + 1 * KYBER_N, KYBER_N, p, XOF_BLOCK_SIZE); + p += 25 * 8; + ctr2 = kyber_rej_uniform_neon(a + 2 * KYBER_N, KYBER_N, p, XOF_BLOCK_SIZE); + while ((ctr0 < KYBER_N) || (ctr1 < KYBER_N) || (ctr2 < KYBER_N)) { + kyber_sha3_blocksx3_neon(st); + + p = (byte*)st; + ctr0 += kyber_rej_uniform_neon(a + 0 * KYBER_N + ctr0, KYBER_N - ctr0, + p, XOF_BLOCK_SIZE); + p += 25 * 8; + ctr1 += kyber_rej_uniform_neon(a + 1 * KYBER_N + ctr1, KYBER_N - ctr1, + p, XOF_BLOCK_SIZE); + p += 25 * 8; + ctr2 += kyber_rej_uniform_neon(a + 2 * KYBER_N + ctr2, KYBER_N - ctr2, + p, XOF_BLOCK_SIZE); + } + + a += 3 * KYBER_N; + + state[0] = ((word64*)seed)[0]; + state[1] = ((word64*)seed)[1]; + state[2] = ((word64*)seed)[2]; + state[3] = ((word64*)seed)[3]; + /* Transposed value same as not. 
*/ + state[4] = 0x1f0000 + (1 << 8) + 1; + XMEMSET(state + 5, 0, sizeof(*state) * (25 - 5)); + state[20] = 0x8000000000000000UL; + BlockSha3(state); + p = (byte*)state; + ctr0 = kyber_rej_uniform_neon(a, KYBER_N, p, XOF_BLOCK_SIZE); + while (ctr0 < KYBER_N) { + BlockSha3(state); + ctr0 += kyber_rej_uniform_neon(a + ctr0, KYBER_N - ctr0, p, + XOF_BLOCK_SIZE); + } + + return 0; +} +#endif + +#ifdef WOLFSSL_KYBER768 +/* Deterministically generate a matrix (or transpose) of uniform integers mod q. + * + * Seed used with XOF to generate random bytes. + * + * @param [out] a Matrix of uniform integers. + * @param [in] seed Bytes to seed XOF generation. + * @param [in] transposed Whether A or A^T is generated. + * @return 0 on success. + */ +static int kyber_gen_matrix_k3_aarch64(sword16* a, byte* seed, int transposed) +{ + int i; + int k; + word64 state[3 * 25]; + word64* st = (word64*)state; + unsigned int ctr0; + unsigned int ctr1; + unsigned int ctr2; + byte* p; + + for (k = 0; k < 3; k++) { + for (i = 0; i < 3; i++) { + if (!transposed) { + state[i*25 + 4] = 0x1f0000 + ((k << 8) + i); + } + else { + state[i*25 + 4] = 0x1f0000 + ((i << 8) + k); + } + } + + kyber_shake128_blocksx3_seed_neon(state, seed); + /* Sample random bytes to create a polynomial. */ + p = (byte*)st; + ctr0 = kyber_rej_uniform_neon(a + 0 * KYBER_N, KYBER_N, p, + XOF_BLOCK_SIZE); + p += 25 * 8; + ctr1 = kyber_rej_uniform_neon(a + 1 * KYBER_N, KYBER_N, p, + XOF_BLOCK_SIZE); + p += 25 * 8; + ctr2 = kyber_rej_uniform_neon(a + 2 * KYBER_N, KYBER_N, p, + XOF_BLOCK_SIZE); + /* Create more blocks if too many rejected. */ + while ((ctr0 < KYBER_N) || (ctr1 < KYBER_N) || (ctr2 < KYBER_N)) { + kyber_sha3_blocksx3_neon(st); + + p = (byte*)st; + ctr0 += kyber_rej_uniform_neon(a + 0 * KYBER_N + ctr0, + KYBER_N - ctr0, p, XOF_BLOCK_SIZE); + p += 25 * 8; + ctr1 += kyber_rej_uniform_neon(a + 1 * KYBER_N + ctr1, + KYBER_N - ctr1, p, XOF_BLOCK_SIZE); + p += 25 * 8; + ctr2 += kyber_rej_uniform_neon(a + 2 * KYBER_N + ctr2, + KYBER_N - ctr2, p, XOF_BLOCK_SIZE); + } + + a += 3 * KYBER_N; + } + + return 0; +} +#endif + +#ifdef WOLFSSL_KYBER1024 +/* Deterministically generate a matrix (or transpose) of uniform integers mod q. + * + * Seed used with XOF to generate random bytes. + * + * @param [out] a Matrix of uniform integers. + * @param [in] seed Bytes to seed XOF generation. + * @param [in] transposed Whether A or A^T is generated. + * @return 0 on success. + */ +static int kyber_gen_matrix_k4_aarch64(sword16* a, byte* seed, int transposed) +{ + int i; + int k; + word64 state[3 * 25]; + word64* st = (word64*)state; + unsigned int ctr0; + unsigned int ctr1; + unsigned int ctr2; + byte* p; + + for (k = 0; k < 5; k++) { + for (i = 0; i < 3; i++) { + byte bi = ((k * 3) + i) / 4; + byte bj = ((k * 3) + i) % 4; + if (!transposed) { + state[i*25 + 4] = 0x1f0000 + (bi << 8) + bj; + } + else { + state[i*25 + 4] = 0x1f0000 + (bj << 8) + bi; + } + } + + kyber_shake128_blocksx3_seed_neon(state, seed); + /* Sample random bytes to create a polynomial. 
*/ + p = (byte*)st; + ctr0 = kyber_rej_uniform_neon(a + 0 * KYBER_N, KYBER_N, p, + XOF_BLOCK_SIZE); + p += 25 * 8; + ctr1 = kyber_rej_uniform_neon(a + 1 * KYBER_N, KYBER_N, p, + XOF_BLOCK_SIZE); + p += 25 * 8; + ctr2 = kyber_rej_uniform_neon(a + 2 * KYBER_N, KYBER_N, p, + XOF_BLOCK_SIZE); + /* Create more blocks if too many rejected. */ + while ((ctr0 < KYBER_N) || (ctr1 < KYBER_N) || (ctr2 < KYBER_N)) { + kyber_sha3_blocksx3_neon(st); + + p = (byte*)st; + ctr0 += kyber_rej_uniform_neon(a + 0 * KYBER_N + ctr0, + KYBER_N - ctr0, p, XOF_BLOCK_SIZE); + p += 25 * 8; + ctr1 += kyber_rej_uniform_neon(a + 1 * KYBER_N + ctr1, + KYBER_N - ctr1, p, XOF_BLOCK_SIZE); + p += 25 * 8; + ctr2 += kyber_rej_uniform_neon(a + 2 * KYBER_N + ctr2, + KYBER_N - ctr2, p, XOF_BLOCK_SIZE); + } + + a += 3 * KYBER_N; + } + + state[0] = ((word64*)seed)[0]; + state[1] = ((word64*)seed)[1]; + state[2] = ((word64*)seed)[2]; + state[3] = ((word64*)seed)[3]; + /* Transposed value same as not. */ + state[4] = 0x1f0000 + (3 << 8) + 3; + XMEMSET(state + 5, 0, sizeof(*state) * (25 - 5)); + state[20] = 0x8000000000000000UL; + BlockSha3(state); + p = (byte*)state; + ctr0 = kyber_rej_uniform_neon(a, KYBER_N, p, XOF_BLOCK_SIZE); + while (ctr0 < KYBER_N) { + BlockSha3(state); + ctr0 += kyber_rej_uniform_neon(a + ctr0, KYBER_N - ctr0, p, + XOF_BLOCK_SIZE); + } + + return 0; +} +#endif #endif /* USE_INTEL_SPEEDUP */ +#if !(defined(WOLFSSL_ARMASM) && defined(__aarch64__)) /* Absorb the seed data for squeezing out pseudo-random data. * * @param [in, out] shake128 SHAKE-128 object. @@ -1610,6 +1947,7 @@ static int kyber_xof_squeezeblocks(wc_Shake* shake128, byte* out, int blocks) { return wc_Shake128_SqueezeBlocks(shake128, out, blocks); } +#endif /* New/Initialize SHA-3 object. * @@ -1690,6 +2028,7 @@ void kyber_prf_free(wc_Shake* prf) wc_Shake256_Free(prf); } +#if !(defined(WOLFSSL_ARMASM) && defined(__aarch64__)) /* Create pseudo-random data from the key using SHAKE-256. * * @param [in, out] shake256 SHAKE-256 object. @@ -1739,6 +2078,7 @@ static int kyber_prf(wc_Shake* shake256, byte* out, unsigned int outLen, return ret; #endif } +#endif #ifdef USE_INTEL_SPEEDUP /* Create pseudo-random key from the seed using SHAKE-256. @@ -1777,6 +2117,36 @@ int kyber_kdf(byte* seed, int seedLen, byte* out, int outLen) } #endif +#if defined(WOLFSSL_ARMASM) && defined(__aarch64__) +/* Create pseudo-random key from the seed using SHAKE-256. + * + * @param [in] seed Data to derive from. + * @param [in] seedLen Length of data to derive from in bytes. + * @param [out] out Buffer to write to. + * @param [in] outLen Number of bytes to derive. + * @return 0 on success always. + */ +int kyber_kdf(byte* seed, int seedLen, byte* out, int outLen) +{ + word64 state[25]; + int i; + int len64 = seedLen / 8; + + for (i = 0; i < len64; i++) { + state[i] = ((word64*)seed)[i]; + } + state[len64] = 0x1f; + XMEMSET(state + len64 + 1, 0, (25 - len64 - 1) * sizeof(word64)); + state[WC_SHA3_256_COUNT - 1] = 0x8000000000000000UL; + + BlockSha3(state); + XMEMCPY(out, state, outLen); + + return 0; +} +#endif + +#if !(defined(WOLFSSL_ARMASM) && defined(__aarch64__)) /* Rejection sampling on uniform random bytes to generate uniform random * integers mod q. * @@ -1792,6 +2162,7 @@ static unsigned int kyber_rej_uniform_c(sword16* p, unsigned int len, unsigned int i; unsigned int j; +#if defined(WOLFSSL_KYBER_SMALL) || !defined(WC_64BIT_CPU) /* Keep sampling until maximum number of integers reached or buffer used up. 
*/ for (i = 0, j = 0; (i < len) && (j <= rLen - 3); j += 3) { @@ -1812,10 +2183,90 @@ static unsigned int kyber_rej_uniform_c(sword16* p, unsigned int len, /* Move over used bytes. */ r += 3; } +#else + unsigned int minJ; + + minJ = len / 4 * 6; + if (minJ > rLen) + minJ = rLen; + i = 0; + for (j = 0; j < minJ; j += 6) { + /* Use 48 bits (6 bytes) as four 12-bit integers. */ + sword16 v0 = (*(word64*)r) & 0xfff; + sword16 v1 = ((*(word64*)r) >> 12) & 0xfff; + sword16 v2 = ((*(word64*)r) >> 24) & 0xfff; + sword16 v3 = ((*(word64*)r) >> 36) & 0xfff; + + p[i] = v0 & (0 - (v0 < KYBER_Q)); + i += v0 < KYBER_Q; + p[i] = v1 & (0 - (v1 < KYBER_Q)); + i += v1 < KYBER_Q; + p[i] = v2 & (0 - (v2 < KYBER_Q)); + i += v2 < KYBER_Q; + p[i] = v3 & (0 - (v3 < KYBER_Q)); + i += v3 < KYBER_Q; + + /* Move over used bytes. */ + r += 6; + } + if (j < rLen) { + for (; (i + 4 < len) && (j < rLen); j += 6) { + /* Use 48 bits (6 bytes) as four 12-bit integers. */ + sword16 v0 = (*(word64*)r) & 0xfff; + sword16 v1 = ((*(word64*)r) >> 12) & 0xfff; + sword16 v2 = ((*(word64*)r) >> 24) & 0xfff; + sword16 v3 = ((*(word64*)r) >> 36) & 0xfff; + + p[i] = v0; + i += v0 < KYBER_Q; + p[i] = v1; + i += v1 < KYBER_Q; + p[i] = v2; + i += v2 < KYBER_Q; + p[i] = v3; + i += v3 < KYBER_Q; + + /* Move over used bytes. */ + r += 6; + } + for (; (i < len) && (j < rLen); j += 6) { + /* Use 48 bits (6 bytes) as four 12-bit integers. */ + sword16 v0 = (*(word64*)r) & 0xfff; + sword16 v1 = ((*(word64*)r) >> 12) & 0xfff; + sword16 v2 = ((*(word64*)r) >> 24) & 0xfff; + sword16 v3 = ((*(word64*)r) >> 36) & 0xfff; + + /* Reject first 12-bit integer if greater than or equal to q. */ + if (v0 < KYBER_Q) { + p[i++] = v0; + } + /* Check second if we don't have enough integers yet. + * Reject second 12-bit integer if greater than or equal to q. */ + if ((i < len) && (v1 < KYBER_Q)) { + p[i++] = v1; + } + /* Check third if we don't have enough integers yet. + * Reject third 12-bit integer if greater than or equal to q. */ + if ((i < len) && (v2 < KYBER_Q)) { + p[i++] = v2; + } + /* Check fourth if we don't have enough integers yet. + * Reject fourth 12-bit integer if greater than or equal to q. */ + if ((i < len) && (v3 < KYBER_Q)) { + p[i++] = v3; + } + + /* Move over used bytes. */ + r += 6; + } + } +#endif return i; } +#endif +#if !(defined(WOLFSSL_ARMASM) && defined(__aarch64__)) /* Deterministically generate a matrix (or transpose) of uniform integers mod q. * * Seed used with XOF to generate random bytes. @@ -1851,6 +2302,12 @@ static int kyber_gen_matrix_c(KYBER_PRF_T* prf, sword16* a, int kp, byte* seed, } #endif +#if !defined(WOLFSSL_KYBER_SMALL) && defined(WC_64BIT_CPU) + /* Loading 64 bits, only using 48 bits. Loading 2 bytes more than used. */ + rand[GEN_MATRIX_SIZE+0] = 0xff; + rand[GEN_MATRIX_SIZE+1] = 0xff; +#endif + /* Generate each vector of polynomials. */ for (i = 0; (ret == 0) && (i < kp); i++, a += kp * KYBER_N) { int j; @@ -1871,35 +2328,17 @@ static int kyber_gen_matrix_c(KYBER_PRF_T* prf, sword16* a, int kp, byte* seed, ret = kyber_xof_squeezeblocks(prf, rand, GEN_MATRIX_NBLOCKS); } if (ret == 0) { - #if (GEN_MATRIX_SIZE % 3) != 0 - unsigned int randLen; - #endif unsigned int ctr; /* Sample random bytes to create a polynomial. */ ctr = kyber_rej_uniform_c(a + j * KYBER_N, KYBER_N, rand, GEN_MATRIX_SIZE); /* Create more blocks if too many rejected. 
*/ - #if (GEN_MATRIX_SIZE % 3) != 0 - randLen = GEN_MATRIX_SIZE; - while (ctr < KYBER_N) { - int off = randLen % 3; - int k; - for (k = 0; k < off; k++) { - rand[k] = rand[randLen - off + k]; - } - kyber_xof_squeezeblocks(prf, rand + off, 1); - randLen = off + XOF_BLOCK_SIZE; - ctr += kyber_rej_uniform_c(a + j * KYBER_N + ctr, - KYBER_N - ctr, rand, randLen); - } - #else while (ctr < KYBER_N) { kyber_xof_squeezeblocks(prf, rand, 1); ctr += kyber_rej_uniform_c(a + j * KYBER_N + ctr, KYBER_N - ctr, rand, XOF_BLOCK_SIZE); } - #endif } } } @@ -1911,6 +2350,7 @@ static int kyber_gen_matrix_c(KYBER_PRF_T* prf, sword16* a, int kp, byte* seed, return ret; } +#endif /* Deterministically generate a matrix (or transpose) of uniform integers mod q. * @@ -1932,6 +2372,9 @@ int kyber_gen_matrix(KYBER_PRF_T* prf, sword16* a, int kp, byte* seed, #ifdef WOLFSSL_KYBER512 if (kp == KYBER512_K) { +#if defined(WOLFSSL_ARMASM) && defined(__aarch64__) + ret = kyber_gen_matrix_k2_aarch64(a, seed, transposed); +#else #ifdef USE_INTEL_SPEEDUP if (IS_INTEL_AVX2(cpuid_flags)) { ret = kyber_gen_matrix_k2_avx2(a, seed, transposed); @@ -1941,11 +2384,15 @@ int kyber_gen_matrix(KYBER_PRF_T* prf, sword16* a, int kp, byte* seed, { ret = kyber_gen_matrix_c(prf, a, KYBER512_K, seed, transposed); } +#endif } else #endif #ifdef WOLFSSL_KYBER768 if (kp == KYBER768_K) { +#if defined(WOLFSSL_ARMASM) && defined(__aarch64__) + ret = kyber_gen_matrix_k3_aarch64(a, seed, transposed); +#else #ifdef USE_INTEL_SPEEDUP if (IS_INTEL_AVX2(cpuid_flags)) { ret = kyber_gen_matrix_k3_avx2(a, seed, transposed); @@ -1955,11 +2402,15 @@ int kyber_gen_matrix(KYBER_PRF_T* prf, sword16* a, int kp, byte* seed, { ret = kyber_gen_matrix_c(prf, a, KYBER768_K, seed, transposed); } +#endif } else #endif #ifdef WOLFSSL_KYBER1024 if (kp == KYBER1024_K) { +#if defined(WOLFSSL_ARMASM) && defined(__aarch64__) + ret = kyber_gen_matrix_k4_aarch64(a, seed, transposed); +#else #ifdef USE_INTEL_SPEEDUP if (IS_INTEL_AVX2(cpuid_flags)) { ret = kyber_gen_matrix_k4_avx2(a, seed, transposed); @@ -1969,6 +2420,7 @@ int kyber_gen_matrix(KYBER_PRF_T* prf, sword16* a, int kp, byte* seed, { ret = kyber_gen_matrix_c(prf, a, KYBER1024_K, seed, transposed); } +#endif } else #endif @@ -1976,6 +2428,8 @@ int kyber_gen_matrix(KYBER_PRF_T* prf, sword16* a, int kp, byte* seed, ret = BAD_STATE_E; } + (void)prf; + return ret; } @@ -2240,6 +2694,8 @@ static void kyber_cbd_eta3(sword16* p, const byte* r) } #endif +#if !(defined(__aarch64__) && defined(WOLFSSL_ARMASM)) + /* Get noise/error by calculating random bytes and sampling to a binomial * distribution. * @@ -2306,6 +2762,8 @@ static int kyber_get_noise_eta2_c(KYBER_PRF_T* prf, sword16* p, return ret; } +#endif + #ifdef USE_INTEL_SPEEDUP #define PRF_RAND_SZ (2 * SHA3_256_BYTES) @@ -2488,6 +2946,206 @@ static int kyber_get_noise_k4_avx2(KYBER_PRF_T* prf, sword16* vec1, #endif #endif /* USE_INTEL_SPEEDUP */ +#if defined(__aarch64__) && defined(WOLFSSL_ARMASM) + +#define PRF_RAND_SZ (2 * SHA3_256_BYTES) + +/* Get the noise/error by calculating random bytes. + * + * @param [out] rand Random number byte array. + * @param [in] seed Seed to generate random from. + * @param [in] o Offset of seed count. 
+ */ +static void kyber_get_noise_x3_eta2_aarch64(byte* rand, byte* seed, byte o) +{ + word64* state = (word64*)rand; + + state[0*25 + 4] = 0x1f00 + 0 + o; + state[1*25 + 4] = 0x1f00 + 1 + o; + state[2*25 + 4] = 0x1f00 + 2 + o; + + kyber_shake256_blocksx3_seed_neon(state, seed); +} + +#ifdef WOLFSSL_KYBER512 +/* Get the noise/error by calculating random bytes. + * + * @param [out] rand Random number byte array. + * @param [in] seed Seed to generate random from. + * @param [in] o Offset of seed count. + */ +static void kyber_get_noise_x3_eta3_aarch64(byte* rand, byte* seed, byte o) +{ + word64 state[3 * 25]; + + state[0*25 + 4] = 0x1f00 + 0 + o; + state[1*25 + 4] = 0x1f00 + 1 + o; + state[2*25 + 4] = 0x1f00 + 2 + o; + + kyber_shake256_blocksx3_seed_neon(state, seed); + XMEMCPY(rand + 0 * ETA3_RAND_SIZE, state + 0*25, SHA3_256_BYTES); + XMEMCPY(rand + 1 * ETA3_RAND_SIZE, state + 1*25, SHA3_256_BYTES); + XMEMCPY(rand + 2 * ETA3_RAND_SIZE, state + 2*25, SHA3_256_BYTES); + kyber_sha3_blocksx3_neon(state); + rand += SHA3_256_BYTES; + XMEMCPY(rand + 0 * ETA3_RAND_SIZE, state + 0*25, + ETA3_RAND_SIZE - SHA3_256_BYTES); + XMEMCPY(rand + 1 * ETA3_RAND_SIZE, state + 1*25, + ETA3_RAND_SIZE - SHA3_256_BYTES); + XMEMCPY(rand + 2 * ETA3_RAND_SIZE, state + 2*25, + ETA3_RAND_SIZE - SHA3_256_BYTES); +} + +/* Get the noise/error by calculating random bytes. + * + * @param [out] rand Random number byte array. + * @param [in] seed Seed to generate random from. + * @param [in] o Offset of seed count. + */ +static void kyber_get_noise_eta3_aarch64(byte* rand, byte* seed, byte o) +{ + word64 state[25]; + + state[0] = ((word64*)seed)[0]; + state[1] = ((word64*)seed)[1]; + state[2] = ((word64*)seed)[2]; + state[3] = ((word64*)seed)[3]; + state[4] = 0x1f00 + o; + XMEMSET(state + 5, 0, sizeof(*state) * (25 - 5)); + state[16] = 0x8000000000000000UL; + BlockSha3(state); + XMEMCPY(rand , state, SHA3_256_BYTES); + BlockSha3(state); + XMEMCPY(rand + SHA3_256_BYTES, state, ETA3_RAND_SIZE - SHA3_256_BYTES); +} + +/* Get the noise/error by calculating random bytes and sampling to a binomial + * distribution. + * + * @param [out] vec1 First vector of polynomials. + * @param [out] vec2 Second vector of polynomials. + * @param [out] poly Polynomial. + * @param [in] seed Seed to use when calculating random. + * @return 0 on success. + */ +static int kyber_get_noise_k2_aarch64(sword16* vec1, sword16* vec2, + sword16* poly, byte* seed) +{ + int ret = 0; + byte rand[3 * 25 * 8]; + + kyber_get_noise_x3_eta3_aarch64(rand, seed, 0); + kyber_cbd_eta3(vec1 , rand + 0 * ETA3_RAND_SIZE); + kyber_cbd_eta3(vec1 + KYBER_N, rand + 1 * ETA3_RAND_SIZE); + if (poly == NULL) { + kyber_cbd_eta3(vec2 , rand + 2 * ETA3_RAND_SIZE); + kyber_get_noise_eta3_aarch64(rand, seed, 3); + kyber_cbd_eta3(vec2 + KYBER_N, rand ); + } + else { + kyber_get_noise_x3_eta2_aarch64(rand, seed, 2); + kyber_cbd_eta2(vec2 , rand + 0 * 25 * 8); + kyber_cbd_eta2(vec2 + KYBER_N, rand + 1 * 25 * 8); + kyber_cbd_eta2(poly , rand + 2 * 25 * 8); + } + + return ret; +} +#endif + 
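The kyber_cbd_eta2()/kyber_cbd_eta3() samplers called by these helpers are the generic centered-binomial routines defined earlier in this file. As a reference for what they compute, a plain-C model of the eta = 2 case (a sketch from the standard Kyber definition; cbd_eta2_ref is a hypothetical name, not the optimized wolfSSL routine):

#include <stdint.h>

#define KYBER_N 256 /* coefficients per polynomial */

/* Each random byte yields two coefficients: a = sum of 2 bits,
 * b = sum of the next 2 bits, coefficient = a - b, in [-2, 2]. */
static void cbd_eta2_ref(int16_t* p, const uint8_t* r)
{
    unsigned int i;
    for (i = 0; i < KYBER_N / 2; i++) {
        uint8_t t = r[i];
        int16_t a0 = (t & 1) + ((t >> 1) & 1);
        int16_t b0 = ((t >> 2) & 1) + ((t >> 3) & 1);
        int16_t a1 = ((t >> 4) & 1) + ((t >> 5) & 1);
        int16_t b1 = ((t >> 6) & 1) + ((t >> 7) & 1);
        p[2 * i + 0] = (int16_t)(a0 - b0);
        p[2 * i + 1] = (int16_t)(a1 - b1);
    }
}

For eta = 3 each coefficient consumes 6 bits, so a polynomial needs 192 bytes (ETA3_RAND_SIZE); that is why the eta-3 path above stitches together one 136-byte SHAKE-256 rate block (SHA3_256_BYTES) plus 56 bytes of a second block.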
@@ -2531,6 +3189,8 @@ static int kyber_get_noise_c(KYBER_PRF_T* prf, int kp, sword16* vec1, int eta1,

     return ret;
 }
+#endif /* !(__aarch64__ && WOLFSSL_ARMASM) */
+
 /* Get the noise/error by calculating random bytes and sampling to a binomial
  * distribution.
  *
@@ -2549,6 +3209,9 @@ int kyber_get_noise(KYBER_PRF_T* prf, int kp, sword16* vec1,

 #ifdef WOLFSSL_KYBER512
     if (kp == KYBER512_K) {
+#if defined(WOLFSSL_ARMASM) && defined(__aarch64__)
+        ret = kyber_get_noise_k2_aarch64(vec1, vec2, poly, seed);
+#else
 #ifdef USE_INTEL_SPEEDUP
         if (IS_INTEL_AVX2(cpuid_flags)) {
             ret = kyber_get_noise_k2_avx2(prf, vec1, vec2, poly, seed);
@@ -2563,11 +3226,15 @@ int kyber_get_noise(KYBER_PRF_T* prf, int kp, sword16* vec1,
             ret = kyber_get_noise_c(prf, kp, vec1, KYBER_CBD_ETA3, vec2,
                 KYBER_CBD_ETA2, poly, seed);
         }
+#endif
     }
     else
 #endif
 #ifdef WOLFSSL_KYBER768
     if (kp == KYBER768_K) {
+#if defined(WOLFSSL_ARMASM) && defined(__aarch64__)
+        ret = kyber_get_noise_k3_aarch64(vec1, vec2, poly, seed);
+#else
 #ifdef USE_INTEL_SPEEDUP
         if (IS_INTEL_AVX2(cpuid_flags)) {
             ret = kyber_get_noise_k3_avx2(vec1, vec2, poly, seed);
@@ -2578,11 +3245,15 @@ int kyber_get_noise(KYBER_PRF_T* prf, int kp, sword16* vec1,
             ret = kyber_get_noise_c(prf, kp, vec1, KYBER_CBD_ETA2, vec2,
                 KYBER_CBD_ETA2, poly, seed);
         }
+#endif
     }
     else
 #endif
 #ifdef WOLFSSL_KYBER1024
     if (kp == KYBER1024_K) {
+#if defined(WOLFSSL_ARMASM) && defined(__aarch64__)
+        ret = kyber_get_noise_k4_aarch64(vec1, vec2, poly, seed);
+#else
 #ifdef USE_INTEL_SPEEDUP
         if (IS_INTEL_AVX2(cpuid_flags)) {
             ret = kyber_get_noise_k4_avx2(prf, vec1, vec2, poly, seed);
@@ -2593,6 +3264,7 @@ int kyber_get_noise(KYBER_PRF_T* prf, int kp, sword16* vec1,
             ret = kyber_get_noise_c(prf, kp, vec1, KYBER_CBD_ETA2, vec2,
                 KYBER_CBD_ETA2, poly, seed);
         }
+#endif
     }
     else
 #endif
@@ -2600,11 +3272,14 @@ int kyber_get_noise(KYBER_PRF_T* prf, int kp, sword16* vec1,
         ret = BAD_STATE_E;
     }

+    (void)prf;
+
     return ret;
 }

 /******************************************************************************/

+#if !(defined(__aarch64__) && defined(WOLFSSL_ARMASM))
 /* Compare two byte arrays of equal size.
  *
  * @param [in] a  First array to compare.
@@ -2624,6 +3299,7 @@ static int kyber_cmp_c(const byte* a, const byte* b, int sz)
     }
     return 0 - ((-(word32)r) >> 31);
 }
+#endif

 /* Compare two byte arrays of equal size.
  *
@@ -2635,6 +3311,9 @@ static int kyber_cmp_c(const byte* a, const byte* b, int sz)
  */
 int kyber_cmp(const byte* a, const byte* b, int sz)
 {
+#if defined(__aarch64__) && defined(WOLFSSL_ARMASM)
+    return kyber_cmp_neon(a, b, sz);
+#else
     int fail;

 #ifdef USE_INTEL_SPEEDUP
@@ -2648,10 +3327,13 @@ int kyber_cmp(const byte* a, const byte* b, int sz)
     }

     return fail;
+#endif
 }

 /******************************************************************************/

+#if !(defined(__aarch64__) && defined(WOLFSSL_ARMASM))
+
 /* Conditional subtraction of q to each coefficient of a polynomial.
  *
  * @param [in, out] p  Polynomial.
@@ -2667,6 +3349,12 @@ static KYBER_NOINLINE void kyber_csubq_c(sword16* p)
     }
 }

+#else
+
+#define kyber_csubq_c kyber_csubq_neon
+
+#endif
+
 /******************************************************************************/

 #if defined(CONV_WITH_DIV) || !defined(WORD64_AVAILABLE)
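The kyber_csubq_c to kyber_csubq_neon substitution above keeps the same
constant-time contract: subtract q once, then add it back exactly when the
result went negative, using the replicated sign bit as a mask instead of a
branch. A scalar sketch follows; csubq_sketch is an illustrative name, and an
arithmetic right shift of negative values is assumed, as wolfSSL does
elsewhere.

/* Illustrative sketch only - not part of the patch. */
static void csubq_sketch(sword16* p)
{
    unsigned int i;

    for (i = 0; i < KYBER_N; i++) {
        /* Becomes negative when the coefficient was less than q. */
        p[i] -= KYBER_Q;
        /* (p[i] >> 15) is all ones when negative, so q is added back. */
        p[i] += (p[i] >> 15) & KYBER_Q;
    }
}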
@@ -3511,6 +4199,7 @@ void kyber_decompress_5(sword16* p, const unsigned char* b)

 /******************************************************************************/

+#if !(defined(__aarch64__) && defined(WOLFSSL_ARMASM))
 /* Convert bit from byte to 0 or (KYBER_Q + 1) / 2.
  *
  * Constant time implementation.
@@ -3622,7 +4311,7 @@ static void kyber_to_msg_c(byte* msg, sword16* p)

     /* Reduce each coefficient to mod q. */
     kyber_csubq_c(p);
-    /* All values are now positive. */
+    /* All values are now in range. */
     for (i = 0; i < KYBER_N / 8; i++) {
 #ifdef WOLFSSL_KYBER_SMALL
@@ -3663,6 +4352,27 @@ void kyber_to_msg(byte* msg, sword16* p)
         kyber_to_msg_c(msg, p);
     }
 }
+#else
+/* Convert message to polynomial.
+ *
+ * @param [out] p    Polynomial.
+ * @param [in]  msg  Message as a byte array.
+ */
+void kyber_from_msg(sword16* p, const byte* msg)
+{
+    kyber_from_msg_neon(p, msg);
+}
+
+/* Convert polynomial to message.
+ *
+ * @param [out] msg  Message as a byte array.
+ * @param [in]  p    Polynomial.
+ */
+void kyber_to_msg(byte* msg, sword16* p)
+{
+    kyber_to_msg_neon(msg, p);
+}
+#endif

 /******************************************************************************/

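The guarded-out scalar conversions above are replaced by kyber_from_msg_neon
and kyber_to_msg_neon. Per bit, the mapping is
coefficient = bit ? (KYBER_Q + 1) / 2 : 0, computed with a mask so that no
data-dependent branch is taken. A scalar sketch of the from-message direction
(from_msg_sketch is an illustrative name, not part of this patch):

/* Illustrative sketch only - not part of the patch. */
static void from_msg_sketch(sword16* p, const byte* msg)
{
    int i;

    for (i = 0; i < KYBER_N; i++) {
        /* -(bit) is 0 or all ones; the mask selects 0 or (q + 1) / 2. */
        p[i] = (sword16)(-((msg[i >> 3] >> (i & 7)) & 1) &
                         ((KYBER_Q + 1) / 2));
    }
}

The to-message direction inverts this by testing, in constant time, whether
each coefficient is closer to (KYBER_Q + 1) / 2 than to 0 or q.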
diff --git a/wolfssl/wolfcrypt/sha3.h b/wolfssl/wolfcrypt/sha3.h
index 0120051508..f65c41d322 100644
--- a/wolfssl/wolfcrypt/sha3.h
+++ b/wolfssl/wolfcrypt/sha3.h
@@ -220,8 +220,7 @@ WOLFSSL_LOCAL void sha3_block_bmi2(word64* s);
 WOLFSSL_LOCAL void sha3_block_avx2(word64* s);
 WOLFSSL_LOCAL void BlockSha3(word64 *s);
 #endif
-#if (defined(WOLFSSL_ARMASM) && (defined(__arm__) || \
-    defined(WOLFSSL_ARMASM_CRYPTO_SHA3))) || defined(WOLFSSL_RISCV_ASM)
+#if defined(WOLFSSL_ARMASM) || defined(WOLFSSL_RISCV_ASM)
 WOLFSSL_LOCAL void BlockSha3(word64 *s);
 #endif

diff --git a/wolfssl/wolfcrypt/wc_kyber.h b/wolfssl/wolfcrypt/wc_kyber.h
index 34b3d64ed9..2b8ac8da22 100644
--- a/wolfssl/wolfcrypt/wc_kyber.h
+++ b/wolfssl/wolfcrypt/wc_kyber.h
@@ -163,7 +163,8 @@ WOLFSSL_LOCAL int kyber_get_noise(KYBER_PRF_T* prf, int kp, sword16* vec1,
     sword16* vec2, sword16* poly, byte* seed);

-#ifdef USE_INTEL_SPEEDUP
+#if defined(USE_INTEL_SPEEDUP) || \
+    (defined(WOLFSSL_ARMASM) && defined(__aarch64__))
 WOLFSSL_LOCAL int kyber_kdf(byte* seed, int seedLen, byte* out, int outLen);
 #endif

@@ -288,6 +289,27 @@ void kyber_decompress_5_avx2(sword16* p, const byte* r);

 WOLFSSL_LOCAL int kyber_cmp_avx2(const byte* a, const byte* b, int sz);

+#elif defined(__aarch64__) && defined(WOLFSSL_ARMASM)
+WOLFSSL_LOCAL void kyber_ntt(sword16* r);
+WOLFSSL_LOCAL void kyber_invntt(sword16* r);
+WOLFSSL_LOCAL void kyber_basemul_mont(sword16* r, const sword16* a,
+    const sword16* b);
+WOLFSSL_LOCAL void kyber_basemul_mont_add(sword16* r, const sword16* a,
+    const sword16* b);
+WOLFSSL_LOCAL void kyber_add_reduce(sword16* r, const sword16* a);
+WOLFSSL_LOCAL void kyber_add3_reduce(sword16* r, const sword16* a,
+    const sword16* b);
+WOLFSSL_LOCAL void kyber_rsub_reduce(sword16* r, const sword16* a);
+WOLFSSL_LOCAL void kyber_to_mont(sword16* p);
+WOLFSSL_LOCAL void kyber_sha3_blocksx3_neon(word64* state);
+WOLFSSL_LOCAL void kyber_shake128_blocksx3_seed_neon(word64* state, byte* seed);
+WOLFSSL_LOCAL void kyber_shake256_blocksx3_seed_neon(word64* state, byte* seed);
+WOLFSSL_LOCAL unsigned int kyber_rej_uniform_neon(sword16* p, unsigned int len,
+    const byte* r, unsigned int rLen);
+WOLFSSL_LOCAL int kyber_cmp_neon(const byte* a, const byte* b, int sz);
+WOLFSSL_LOCAL void kyber_csubq_neon(sword16* p);
+WOLFSSL_LOCAL void kyber_from_msg_neon(sword16* p, const byte* msg);
+WOLFSSL_LOCAL void kyber_to_msg_neon(byte* msg, sword16* p);
 #endif

 #ifdef __cplusplus